Merge 3.10-rc6 into staging-next
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 17 Jun 2013 18:57:00 +0000 (11:57 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Mon, 17 Jun 2013 18:57:00 +0000 (11:57 -0700)
We want the fixes in here as well.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
839 files changed:
Documentation/ABI/testing/sysfs-bus-iio
Documentation/devicetree/bindings/iio/dac/ad7303.txt [new file with mode: 0644]
Documentation/devicetree/bindings/iio/frequency/adf4350.txt [new file with mode: 0644]
Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt [new file with mode: 0644]
MAINTAINERS
drivers/iio/Kconfig
drivers/iio/Makefile
drivers/iio/accel/Kconfig
drivers/iio/accel/st_accel_core.c
drivers/iio/adc/Kconfig
drivers/iio/adc/Makefile
drivers/iio/adc/at91_adc.c
drivers/iio/adc/exynos_adc.c
drivers/iio/adc/max1363.c
drivers/iio/adc/mcp320x.c [new file with mode: 0644]
drivers/iio/common/st_sensors/st_sensors_buffer.c
drivers/iio/common/st_sensors/st_sensors_core.c
drivers/iio/dac/Kconfig
drivers/iio/dac/Makefile
drivers/iio/dac/ad7303.c [new file with mode: 0644]
drivers/iio/frequency/adf4350.c
drivers/iio/gyro/Kconfig
drivers/iio/gyro/Makefile
drivers/iio/gyro/adis16130.c [moved from drivers/staging/iio/gyro/adis16130_core.c with 77% similarity]
drivers/iio/gyro/st_gyro_core.c
drivers/iio/industrialio-buffer.c
drivers/iio/light/hid-sensor-als.c
drivers/iio/magnetometer/Kconfig
drivers/iio/magnetometer/ak8975.c
drivers/iio/magnetometer/st_magn_core.c
drivers/iio/pressure/Kconfig [new file with mode: 0644]
drivers/iio/pressure/Makefile [new file with mode: 0644]
drivers/iio/pressure/st_pressure.h [new file with mode: 0644]
drivers/iio/pressure/st_pressure_buffer.c [new file with mode: 0644]
drivers/iio/pressure/st_pressure_core.c [new file with mode: 0644]
drivers/iio/pressure/st_pressure_i2c.c [new file with mode: 0644]
drivers/iio/pressure/st_pressure_spi.c [new file with mode: 0644]
drivers/iio/trigger/Kconfig [new file with mode: 0644]
drivers/iio/trigger/Makefile [new file with mode: 0644]
drivers/iio/trigger/iio-trig-interrupt.c [new file with mode: 0644]
drivers/iio/trigger/iio-trig-sysfs.c [moved from drivers/staging/iio/trigger/iio-trig-sysfs.c with 98% similarity]
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/staging/android/ashmem.c
drivers/staging/android/sw_sync.c
drivers/staging/android/sync.c
drivers/staging/asus_oled/asus_oled.c
drivers/staging/bcm/Bcmchar.c
drivers/staging/bcm/InterfaceIdleMode.c
drivers/staging/bcm/Version.h
drivers/staging/bcm/vendorspecificextn.c
drivers/staging/btmtk_usb/Kconfig [new file with mode: 0644]
drivers/staging/btmtk_usb/Makefile [new file with mode: 0644]
drivers/staging/btmtk_usb/README [new file with mode: 0644]
drivers/staging/btmtk_usb/TODO [new file with mode: 0644]
drivers/staging/btmtk_usb/btmtk_usb.c [new file with mode: 0644]
drivers/staging/btmtk_usb/btmtk_usb.h [new file with mode: 0644]
drivers/staging/ced1401/ced_ioc.c
drivers/staging/ced1401/ced_ioctl.h
drivers/staging/ced1401/machine.h
drivers/staging/ced1401/usb1401.c
drivers/staging/ced1401/usb1401.h
drivers/staging/ced1401/use1401.h
drivers/staging/ced1401/use14_ioc.h
drivers/staging/ced1401/userspace/use1401.c
drivers/staging/comedi/Kconfig
drivers/staging/comedi/comedi.h
drivers/staging/comedi/comedi_buf.c
drivers/staging/comedi/comedi_compat32.c
drivers/staging/comedi/comedi_compat32.h
drivers/staging/comedi/comedi_fops.c
drivers/staging/comedi/comedi_pci.c
drivers/staging/comedi/comedi_pcmcia.c
drivers/staging/comedi/comedi_usb.c
drivers/staging/comedi/comedidev.h
drivers/staging/comedi/comedilib.h
drivers/staging/comedi/drivers.c
drivers/staging/comedi/drivers/8253.h
drivers/staging/comedi/drivers/8255.c
drivers/staging/comedi/drivers/8255.h
drivers/staging/comedi/drivers/8255_pci.c
drivers/staging/comedi/drivers/Makefile
drivers/staging/comedi/drivers/acl7225b.c [deleted file]
drivers/staging/comedi/drivers/addi-data/APCI1710_Chrono.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Dig_io.c
drivers/staging/comedi/drivers/addi-data/APCI1710_INCCPT.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Inp_cpt.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Pwm.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Ssi.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Tor.c
drivers/staging/comedi/drivers/addi-data/APCI1710_Ttl.c
drivers/staging/comedi/drivers/addi-data/addi_common.c
drivers/staging/comedi/drivers/addi-data/addi_eeprom.c
drivers/staging/comedi/drivers/addi-data/hwdrv_APCI1710.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci035.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci1500.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci1564.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci3120.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci3200.c
drivers/staging/comedi/drivers/addi-data/hwdrv_apci3xxx.c
drivers/staging/comedi/drivers/addi_apci_1032.c
drivers/staging/comedi/drivers/addi_apci_1516.c
drivers/staging/comedi/drivers/addi_apci_16xx.c
drivers/staging/comedi/drivers/addi_apci_2032.c
drivers/staging/comedi/drivers/addi_apci_2200.c
drivers/staging/comedi/drivers/addi_apci_3501.c
drivers/staging/comedi/drivers/addi_watchdog.c
drivers/staging/comedi/drivers/adl_pci6208.c
drivers/staging/comedi/drivers/adl_pci7x3x.c
drivers/staging/comedi/drivers/adl_pci8164.c
drivers/staging/comedi/drivers/adl_pci9111.c
drivers/staging/comedi/drivers/adq12b.c
drivers/staging/comedi/drivers/adv_pci1723.c
drivers/staging/comedi/drivers/adv_pci1724.c
drivers/staging/comedi/drivers/aio_aio12_8.c
drivers/staging/comedi/drivers/aio_iiro_16.c
drivers/staging/comedi/drivers/amplc_dio200.c
drivers/staging/comedi/drivers/amplc_dio200.h
drivers/staging/comedi/drivers/amplc_dio200_common.c
drivers/staging/comedi/drivers/amplc_dio200_pci.c
drivers/staging/comedi/drivers/amplc_pc236.c
drivers/staging/comedi/drivers/amplc_pc263.c
drivers/staging/comedi/drivers/amplc_pci224.c
drivers/staging/comedi/drivers/amplc_pci230.c
drivers/staging/comedi/drivers/amplc_pci263.c
drivers/staging/comedi/drivers/c6xdigio.c
drivers/staging/comedi/drivers/cb_das16_cs.c
drivers/staging/comedi/drivers/cb_pcidas.c
drivers/staging/comedi/drivers/cb_pcidas64.c
drivers/staging/comedi/drivers/cb_pcidda.c
drivers/staging/comedi/drivers/cb_pcimdas.c
drivers/staging/comedi/drivers/cb_pcimdda.c
drivers/staging/comedi/drivers/comedi_bond.c
drivers/staging/comedi/drivers/comedi_fc.c
drivers/staging/comedi/drivers/comedi_fc.h
drivers/staging/comedi/drivers/comedi_parport.c
drivers/staging/comedi/drivers/comedi_test.c
drivers/staging/comedi/drivers/contec_pci_dio.c
drivers/staging/comedi/drivers/daqboard2000.c
drivers/staging/comedi/drivers/das08.c
drivers/staging/comedi/drivers/das08.h
drivers/staging/comedi/drivers/das08_cs.c
drivers/staging/comedi/drivers/das08_isa.c
drivers/staging/comedi/drivers/das08_pci.c
drivers/staging/comedi/drivers/das16.c
drivers/staging/comedi/drivers/das16m1.c
drivers/staging/comedi/drivers/das1800.c
drivers/staging/comedi/drivers/das6402.c
drivers/staging/comedi/drivers/das800.c
drivers/staging/comedi/drivers/dmm32at.c
drivers/staging/comedi/drivers/dt2811.c
drivers/staging/comedi/drivers/dt2814.c
drivers/staging/comedi/drivers/dt2815.c
drivers/staging/comedi/drivers/dt2817.c
drivers/staging/comedi/drivers/dt282x.c
drivers/staging/comedi/drivers/dt3000.c
drivers/staging/comedi/drivers/dt9812.c
drivers/staging/comedi/drivers/dyna_pci10xx.c
drivers/staging/comedi/drivers/gsc_hpdi.c
drivers/staging/comedi/drivers/icp_multi.c
drivers/staging/comedi/drivers/jr3_pci.c
drivers/staging/comedi/drivers/ke_counter.c
drivers/staging/comedi/drivers/me4000.c
drivers/staging/comedi/drivers/me_daq.c
drivers/staging/comedi/drivers/mite.c
drivers/staging/comedi/drivers/mite.h
drivers/staging/comedi/drivers/mpc624.c
drivers/staging/comedi/drivers/multiq3.c
drivers/staging/comedi/drivers/ni_6527.c
drivers/staging/comedi/drivers/ni_65xx.c
drivers/staging/comedi/drivers/ni_660x.c
drivers/staging/comedi/drivers/ni_670x.c
drivers/staging/comedi/drivers/ni_at_a2150.c
drivers/staging/comedi/drivers/ni_at_ao.c
drivers/staging/comedi/drivers/ni_atmio.c
drivers/staging/comedi/drivers/ni_atmio16d.c
drivers/staging/comedi/drivers/ni_daq_700.c
drivers/staging/comedi/drivers/ni_daq_dio24.c
drivers/staging/comedi/drivers/ni_labpc.c
drivers/staging/comedi/drivers/ni_labpc.h
drivers/staging/comedi/drivers/ni_labpc_cs.c
drivers/staging/comedi/drivers/ni_labpc_pci.c
drivers/staging/comedi/drivers/ni_mio_common.c
drivers/staging/comedi/drivers/ni_mio_cs.c
drivers/staging/comedi/drivers/ni_pcidio.c
drivers/staging/comedi/drivers/ni_pcimio.c
drivers/staging/comedi/drivers/ni_stc.h
drivers/staging/comedi/drivers/ni_tio.c
drivers/staging/comedi/drivers/ni_tio.h
drivers/staging/comedi/drivers/ni_tio_internal.h
drivers/staging/comedi/drivers/ni_tiocmd.c
drivers/staging/comedi/drivers/pcl711.c
drivers/staging/comedi/drivers/pcl724.c
drivers/staging/comedi/drivers/pcl725.c [deleted file]
drivers/staging/comedi/drivers/pcl726.c
drivers/staging/comedi/drivers/pcl730.c
drivers/staging/comedi/drivers/pcm3730.c [deleted file]
drivers/staging/comedi/drivers/pcmad.c
drivers/staging/comedi/drivers/pcmda12.c
drivers/staging/comedi/drivers/pcmmio.c
drivers/staging/comedi/drivers/pcmuio.c
drivers/staging/comedi/drivers/plx9052.h
drivers/staging/comedi/drivers/poc.c
drivers/staging/comedi/drivers/rtd520.c
drivers/staging/comedi/drivers/rti800.c
drivers/staging/comedi/drivers/rti802.c
drivers/staging/comedi/drivers/s526.c
drivers/staging/comedi/drivers/s626.c
drivers/staging/comedi/drivers/s626.h
drivers/staging/comedi/drivers/serial2002.c
drivers/staging/comedi/drivers/skel.c
drivers/staging/comedi/drivers/ssv_dnp.c
drivers/staging/comedi/drivers/unioxx5.c
drivers/staging/comedi/drivers/usbdux.c
drivers/staging/comedi/drivers/usbduxfast.c
drivers/staging/comedi/drivers/usbduxsigma.c
drivers/staging/comedi/drivers/vmk80xx.c
drivers/staging/comedi/kcomedilib/kcomedilib_main.c
drivers/staging/comedi/proc.c
drivers/staging/comedi/range.c
drivers/staging/cptm1217/clearpad_tm1217.c
drivers/staging/crystalhd/bc_dts_glob_lnx.h
drivers/staging/crystalhd/crystalhd_cmds.c
drivers/staging/crystalhd/crystalhd_cmds.h
drivers/staging/crystalhd/crystalhd_fw_if.h
drivers/staging/crystalhd/crystalhd_hw.c
drivers/staging/crystalhd/crystalhd_hw.h
drivers/staging/crystalhd/crystalhd_lnx.c
drivers/staging/crystalhd/crystalhd_lnx.h
drivers/staging/crystalhd/crystalhd_misc.c
drivers/staging/crystalhd/crystalhd_misc.h
drivers/staging/csr/bh.c
drivers/staging/csr/csr_framework_ext.c
drivers/staging/csr/csr_framework_ext.h
drivers/staging/csr/csr_wifi_nme_ap_sef.c
drivers/staging/csr/drv.c
drivers/staging/csr/io.c
drivers/staging/csr/netdev.c
drivers/staging/csr/sdio_mmc.c
drivers/staging/csr/sme_blocking.c
drivers/staging/csr/sme_native.c
drivers/staging/csr/sme_sys.c
drivers/staging/csr/sme_userspace.c
drivers/staging/csr/sme_wext.c
drivers/staging/csr/ul_int.c
drivers/staging/csr/unifi_event.c
drivers/staging/csr/unifi_pdu_processing.c
drivers/staging/csr/unifi_priv.h
drivers/staging/csr/unifi_sme.c
drivers/staging/csr/unifi_sme.h
drivers/staging/cxt1e1/comet.c
drivers/staging/cxt1e1/functions.c
drivers/staging/cxt1e1/hwprobe.c
drivers/staging/cxt1e1/linux.c
drivers/staging/cxt1e1/musycc.c
drivers/staging/cxt1e1/pmcc4.h
drivers/staging/cxt1e1/pmcc4_drv.c
drivers/staging/cxt1e1/sbecom_inline_linux.h
drivers/staging/cxt1e1/sbeid.c
drivers/staging/cxt1e1/sbeproc.h
drivers/staging/dgrp/dgrp_dpa_ops.c
drivers/staging/dgrp/dgrp_net_ops.c
drivers/staging/dgrp/drp.h
drivers/staging/dwc2/core.c
drivers/staging/dwc2/core_intr.c
drivers/staging/dwc2/hcd.c
drivers/staging/dwc2/hcd.h
drivers/staging/dwc2/hcd_intr.c
drivers/staging/dwc2/pci.c
drivers/staging/echo/echo.c
drivers/staging/echo/echo.h
drivers/staging/frontier/alphatrack.c
drivers/staging/frontier/alphatrack.h
drivers/staging/frontier/tranzport.c
drivers/staging/ft1000/ft1000-pcmcia/ft1000_dnld.c
drivers/staging/ft1000/ft1000-usb/ft1000_debug.c
drivers/staging/ft1000/ft1000-usb/ft1000_ioctl.h
drivers/staging/ft1000/ft1000-usb/ft1000_usb.c
drivers/staging/fwserial/fwserial.c
drivers/staging/fwserial/fwserial.h
drivers/staging/gdm72xx/Kconfig
drivers/staging/gdm72xx/gdm_wimax.c
drivers/staging/goldfish/goldfish_audio.c
drivers/staging/goldfish/goldfish_nand.c
drivers/staging/goldfish/goldfish_nand_reg.h
drivers/staging/iio/adc/ad7192.c
drivers/staging/iio/adc/ad7280a.c
drivers/staging/iio/adc/ad7291.c
drivers/staging/iio/adc/ad7291.h [new file with mode: 0644]
drivers/staging/iio/adc/ad7606_core.c
drivers/staging/iio/adc/ad7606_par.c
drivers/staging/iio/adc/ad7816.c
drivers/staging/iio/adc/ad799x_core.c
drivers/staging/iio/adc/lpc32xx_adc.c
drivers/staging/iio/adc/mxs-lradc.c
drivers/staging/iio/adc/spear_adc.c
drivers/staging/iio/gyro/Kconfig
drivers/staging/iio/gyro/Makefile
drivers/staging/iio/trigger/Kconfig
drivers/staging/iio/trigger/Makefile
drivers/staging/iio/trigger/iio-trig-gpio.c [deleted file]
drivers/staging/imx-drm/imx-drm-core.c
drivers/staging/imx-drm/imx-tve.c
drivers/staging/imx-drm/ipu-v3/ipu-di.c
drivers/staging/imx-drm/parallel-display.c
drivers/staging/keucr/init.c
drivers/staging/keucr/scsiglue.c
drivers/staging/keucr/smil.h
drivers/staging/keucr/smilmain.c
drivers/staging/keucr/smilsub.c
drivers/staging/keucr/smscsi.c
drivers/staging/keucr/transport.c
drivers/staging/keucr/transport.h
drivers/staging/keucr/usb.c
drivers/staging/keucr/usb.h
drivers/staging/lustre/Kconfig [new file with mode: 0644]
drivers/staging/lustre/Makefile [new file with mode: 0644]
drivers/staging/lustre/TODO [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/bitmap.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/curproc.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_string.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_time.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/kp30.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/lucache.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/libcfs/params_tree.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/api-support.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/api.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lib-lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lib-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/api-support.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lib-types.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/linux/lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnet.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnetctl.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/lnetst.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/ptllnd.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/socklnd.h [new file with mode: 0644]
drivers/staging/lustre/include/linux/lnet/types.h [new file with mode: 0644]
drivers/staging/lustre/lnet/Kconfig [new file with mode: 0644]
drivers/staging/lustre/lnet/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c [new file with mode: 0644]
drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/acceptor.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/api-errno.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/api-ni.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/config.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-eq.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-md.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-me.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-move.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-msg.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lib-ptl.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/lo.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/module.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/peer.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/router.c [new file with mode: 0644]
drivers/staging/lustre/lnet/lnet/router_proc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/Makefile [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/brw_test.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conctl.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conrpc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/conrpc.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/console.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/console.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/framework.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/module.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/ping_test.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/rpc.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/rpc.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/selftest.h [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/timer.c [new file with mode: 0644]
drivers/staging/lustre/lnet/selftest/timer.h [new file with mode: 0644]
drivers/staging/lustre/lustre/Kconfig [new file with mode: 0644]
drivers/staging/lustre/lustre/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_handler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/fid_store.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fid/lproc_fid.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_handler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_index.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/fld_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/fld/lproc_fld.c [new file with mode: 0644]
drivers/staging/lustre/lustre/include/cl_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/dt_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/interval_tree.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/ioctl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lclient.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lprocfs_status.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_acl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_common.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_compat25.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_debug.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_dlm.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_handles.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_intent.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_lib.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_lite.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_log.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_net.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_quota.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lustre_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lvfs.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/lvfs_linux.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd_class.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/linux/obd_support.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lprocfs_status.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_ref.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lu_target.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/libiam.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/liblustreapi.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_idl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustre_user.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre/lustreapi.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_acl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_capa.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_cfg.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_debug.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_disk.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_dlm.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_eacl.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_export.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fid.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fld.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_fsfilt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_ha.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_handles.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_idmap.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_import.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_lib.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_linkea.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_lite.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_log.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mdc.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mds.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_mdt.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_net.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_param.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_quota.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_req_layout.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_sec.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_update.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lustre_ver.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/lvfs.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/md_object.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_cache.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_cksum.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_class.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_lov.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_ost.h [new file with mode: 0644]
drivers/staging/lustre/lustre/include/obd_support.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/glimpse.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/lcommon_cl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lclient/lcommon_misc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/interval_tree.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/l_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_extent.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_flock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_plain.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_pool.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ldlm/ldlm_resource.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/fail.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/hash.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/heap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_mem.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/libcfs_string.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/lwt.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/nidstrings.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/prng.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/tracefile.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/tracefile.h [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/upcall_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/watchdog.c [new file with mode: 0644]
drivers/staging/lustre/lustre/libcfs/workitem.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/dcache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/dir.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/file.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_capa.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_close.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_mmap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_nfs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/llite_rmtacl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/lloop.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/lproc_llite.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/namei.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/remote_perm.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/rw.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/rw26.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/statahead.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/super25.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/symlink.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/vvp_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/llite/xattr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_fld.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_intent.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lmv_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lmv/lproc_lmv.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_ea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_log.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_merge.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_offset.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_pack.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_pool.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lov_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lovsub_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lov/lproc_lov.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/fsfilt.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/lvfs_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/lvfs/lvfs_linux.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/lproc_mdc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_lib.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_locks.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_reint.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mdc/mdc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/libmgc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/lproc_mgc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/mgc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/mgc/mgc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/acl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/capa.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/cl_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/class_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/debug.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/dt_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/genops.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/idmap.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linkea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_cat.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_ioctl.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_lvfs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_obd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_osd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_swab.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/llog_test.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/local_storage.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/local_storage.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lprocfs_status.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_ref.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lu_ucred.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lustre_handles.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/lustre_peer.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/md_attrs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/mea.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obd_config.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obd_mount.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/obdo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/statfs_pack.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdclass/uuid.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo_client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/echo_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/obdecho/lproc_echo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/lproc_osc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_cache.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_cl_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_dev.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_io.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_lock.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_object.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_page.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_quota.c [new file with mode: 0644]
drivers/staging/lustre/lustre/osc/osc_request.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/connection.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/events.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/Makefile [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/import.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/layout.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_client.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_net.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/llog_server.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/niobuf.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pack_generic.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pers.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/pinger.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/recover.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_config.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_gc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_null.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/sec_plain.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/service.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/wirehdr.c [new file with mode: 0644]
drivers/staging/lustre/lustre/ptlrpc/wiretest.c [new file with mode: 0644]
drivers/staging/netlogic/xlr_net.c
drivers/staging/nvec/nvec_kbd.c
drivers/staging/octeon-usb/Kconfig [new file with mode: 0644]
drivers/staging/octeon-usb/Makefile [new file with mode: 0644]
drivers/staging/octeon-usb/TODO [new file with mode: 0644]
drivers/staging/octeon-usb/cvmx-usb.c [new file with mode: 0644]
drivers/staging/octeon-usb/cvmx-usb.h [new file with mode: 0644]
drivers/staging/octeon-usb/cvmx-usbcx-defs.h [new file with mode: 0644]
drivers/staging/octeon-usb/cvmx-usbnx-defs.h [new file with mode: 0644]
drivers/staging/octeon-usb/octeon-hcd.c [new file with mode: 0644]
drivers/staging/ozwpan/ozcdev.c
drivers/staging/ozwpan/ozhcd.c
drivers/staging/panel/panel.c
drivers/staging/rtl8192e/rtl8192e/r8192E_cmdpkt.c
drivers/staging/rtl8192e/rtl8192e/r8192E_dev.c
drivers/staging/rtl8192e/rtllib_rx.c
drivers/staging/rtl8192e/rtllib_softmac.c
drivers/staging/rtl8192u/ieee80211/dot11d.c
drivers/staging/rtl8192u/ieee80211/dot11d.h
drivers/staging/rtl8192u/ieee80211/ieee80211.h
drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.c
drivers/staging/rtl8192u/ieee80211/ieee80211_crypt.h
drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c
drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_tkip.c
drivers/staging/rtl8192u/ieee80211/ieee80211_crypt_wep.c
drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c
drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c
drivers/staging/rtl8192u/ieee80211/rtl819x_BAProc.c
drivers/staging/rtl8192u/ieee80211/rtl819x_HTProc.c
drivers/staging/rtl8192u/ieee80211/rtl819x_Qos.h
drivers/staging/rtl8192u/ieee80211/rtl819x_TSProc.c
drivers/staging/rtl8192u/r8180_93cx6.c
drivers/staging/rtl8192u/r8190_rtl8256.c
drivers/staging/rtl8192u/r8190_rtl8256.h
drivers/staging/rtl8192u/r8192U.h
drivers/staging/rtl8192u/r8192U_core.c
drivers/staging/rtl8192u/r8192U_dm.c
drivers/staging/rtl8192u/r8192U_hw.h
drivers/staging/rtl8192u/r8192U_wx.c
drivers/staging/rtl8192u/r819xU_cmdpkt.c
drivers/staging/rtl8192u/r819xU_cmdpkt.h
drivers/staging/rtl8192u/r819xU_firmware.c
drivers/staging/rtl8192u/r819xU_phy.c
drivers/staging/rtl8192u/r819xU_phy.h
drivers/staging/rts5139/rts51x_transport.c
drivers/staging/sb105x/sb_pci_mp.c
drivers/staging/sb105x/sb_pci_mp.h
drivers/staging/silicom/bpctl_mod.c
drivers/staging/silicom/bypasslib/bp_ioctl.h
drivers/staging/silicom/bypasslib/bplibk.h
drivers/staging/slicoss/slicoss.c
drivers/staging/speakup/Kconfig
drivers/staging/speakup/devsynth.c
drivers/staging/speakup/i18n.c
drivers/staging/speakup/kobjects.c
drivers/staging/speakup/main.c
drivers/staging/speakup/serialio.c
drivers/staging/speakup/speakup_acntpc.c
drivers/staging/speakup/speakup_apollo.c
drivers/staging/speakup/speakup_decext.c
drivers/staging/speakup/speakup_decpc.c
drivers/staging/speakup/speakup_dectlk.c
drivers/staging/speakup/speakup_dtlk.c
drivers/staging/speakup/speakup_keypc.c
drivers/staging/speakup/speakup_soft.c
drivers/staging/speakup/spk_priv.h
drivers/staging/speakup/synth.c
drivers/staging/speakup/thread.c
drivers/staging/speakup/varhandlers.c
drivers/staging/ti-soc-thermal/ti-thermal-common.c
drivers/staging/ti-soc-thermal/ti_soc_thermal.txt
drivers/staging/tidspbridge/core/_tiomap.h
drivers/staging/tidspbridge/core/_tiomap_pwr.h
drivers/staging/tidspbridge/core/tiomap3430_pwr.c
drivers/staging/tidspbridge/core/ue_deh.c
drivers/staging/tidspbridge/core/wdt.c
drivers/staging/tidspbridge/rmgr/drv_interface.c
drivers/staging/usbip/usbip_event.c
drivers/staging/vme/devices/vme_user.c
drivers/staging/vme/devices/vme_user.h
drivers/staging/vt6655/80211hdr.h
drivers/staging/vt6655/80211mgr.c
drivers/staging/vt6655/80211mgr.h
drivers/staging/vt6655/aes_ccmp.c
drivers/staging/vt6655/aes_ccmp.h
drivers/staging/vt6655/hostap.c
drivers/staging/vt6656/rf.c
drivers/staging/vt6656/tether.h
drivers/staging/vt6656/tmacro.h
drivers/staging/winbond/phy_calibration.c
drivers/staging/winbond/reg.c
drivers/staging/winbond/wb35reg.c
drivers/staging/winbond/wb35rx.c
drivers/staging/wlags49_h2/wl_cs.c
drivers/staging/wlags49_h2/wl_cs.h
drivers/staging/wlags49_h2/wl_main.c
drivers/staging/wlan-ng/prism2sta.c
drivers/staging/xgifb/vb_def.h
drivers/staging/xgifb/vb_init.c
drivers/staging/xgifb/vb_setmode.c
drivers/staging/xgifb/vb_setmode.h
drivers/staging/zram/zram_drv.c
drivers/staging/zram/zram_drv.h
drivers/staging/zram/zram_sysfs.c
drivers/staging/zsmalloc/zsmalloc-main.c
drivers/staging/zsmalloc/zsmalloc.h
include/linux/iio/common/st_sensors.h
include/linux/iio/frequency/adf4350.h
include/linux/platform_data/ad7303.h [new file with mode: 0644]

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 2e33dc6b23463046614586deba015d3f1c1ebb66..dda81ffae5cfb4b385eaf89c799f888268ce7148 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -690,45 +690,45 @@ Description:
                Actually start the buffer capture up.  Will start trigger
                if first device and appropriate.
 
-What:          /sys/bus/iio/devices/iio:deviceX/buffer/scan_elements
+What:          /sys/bus/iio/devices/iio:deviceX/scan_elements
 KernelVersion: 2.6.37
 Contact:       linux-iio@vger.kernel.org
 Description:
                Directory containing interfaces for elements that will be
                captured for a single triggered sample set in the buffer.
 
-What:          /sys/.../buffer/scan_elements/in_accel_x_en
-What:          /sys/.../buffer/scan_elements/in_accel_y_en
-What:          /sys/.../buffer/scan_elements/in_accel_z_en
-What:          /sys/.../buffer/scan_elements/in_anglvel_x_en
-What:          /sys/.../buffer/scan_elements/in_anglvel_y_en
-What:          /sys/.../buffer/scan_elements/in_anglvel_z_en
-What:          /sys/.../buffer/scan_elements/in_magn_x_en
-What:          /sys/.../buffer/scan_elements/in_magn_y_en
-What:          /sys/.../buffer/scan_elements/in_magn_z_en
-What:          /sys/.../buffer/scan_elements/in_timestamp_en
-What:          /sys/.../buffer/scan_elements/in_voltageY_supply_en
-What:          /sys/.../buffer/scan_elements/in_voltageY_en
-What:          /sys/.../buffer/scan_elements/in_voltageY-voltageZ_en
-What:          /sys/.../buffer/scan_elements/in_incli_x_en
-What:          /sys/.../buffer/scan_elements/in_incli_y_en
-What:          /sys/.../buffer/scan_elements/in_pressureY_en
-What:          /sys/.../buffer/scan_elements/in_pressure_en
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_x_en
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_y_en
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_z_en
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_x_en
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_y_en
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_z_en
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_x_en
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_y_en
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_z_en
+What:          /sys/.../iio:deviceX/scan_elements/in_timestamp_en
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_supply_en
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_en
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY-voltageZ_en
+What:          /sys/.../iio:deviceX/scan_elements/in_incli_x_en
+What:          /sys/.../iio:deviceX/scan_elements/in_incli_y_en
+What:          /sys/.../iio:deviceX/scan_elements/in_pressureY_en
+What:          /sys/.../iio:deviceX/scan_elements/in_pressure_en
 KernelVersion: 2.6.37
 Contact:       linux-iio@vger.kernel.org
 Description:
                Scan element control for triggered data capture.
 
-What:          /sys/.../buffer/scan_elements/in_accel_type
-What:          /sys/.../buffer/scan_elements/in_anglvel_type
-What:          /sys/.../buffer/scan_elements/in_magn_type
-What:          /sys/.../buffer/scan_elements/in_incli_type
-What:          /sys/.../buffer/scan_elements/in_voltageY_type
-What:          /sys/.../buffer/scan_elements/in_voltage_type
-What:          /sys/.../buffer/scan_elements/in_voltageY_supply_type
-What:          /sys/.../buffer/scan_elements/in_timestamp_type
-What:          /sys/.../buffer/scan_elements/in_pressureY_type
-What:          /sys/.../buffer/scan_elements/in_pressure_type
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_type
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_type
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_type
+What:          /sys/.../iio:deviceX/scan_elements/in_incli_type
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_type
+What:          /sys/.../iio:deviceX/scan_elements/in_voltage_type
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_supply_type
+What:          /sys/.../iio:deviceX/scan_elements/in_timestamp_type
+What:          /sys/.../iio:deviceX/scan_elements/in_pressureY_type
+What:          /sys/.../iio:deviceX/scan_elements/in_pressure_type
 KernelVersion: 2.6.37
 Contact:       linux-iio@vger.kernel.org
 Description:
@@ -752,29 +752,29 @@ Description:
                For other storage combinations this attribute will be extended
                appropriately.
 
-What:          /sys/.../buffer/scan_elements/in_accel_type_available
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_type_available
 KernelVersion: 2.6.37
 Contact:       linux-iio@vger.kernel.org
 Description:
                If the type parameter can take one of a small set of values,
                this attribute lists them.
 
-What:          /sys/.../buffer/scan_elements/in_voltageY_index
-What:          /sys/.../buffer/scan_elements/in_voltageY_supply_index
-What:          /sys/.../buffer/scan_elements/in_accel_x_index
-What:          /sys/.../buffer/scan_elements/in_accel_y_index
-What:          /sys/.../buffer/scan_elements/in_accel_z_index
-What:          /sys/.../buffer/scan_elements/in_anglvel_x_index
-What:          /sys/.../buffer/scan_elements/in_anglvel_y_index
-What:          /sys/.../buffer/scan_elements/in_anglvel_z_index
-What:          /sys/.../buffer/scan_elements/in_magn_x_index
-What:          /sys/.../buffer/scan_elements/in_magn_y_index
-What:          /sys/.../buffer/scan_elements/in_magn_z_index
-What:          /sys/.../buffer/scan_elements/in_incli_x_index
-What:          /sys/.../buffer/scan_elements/in_incli_y_index
-What:          /sys/.../buffer/scan_elements/in_timestamp_index
-What:          /sys/.../buffer/scan_elements/in_pressureY_index
-What:          /sys/.../buffer/scan_elements/in_pressure_index
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_index
+What:          /sys/.../iio:deviceX/scan_elements/in_voltageY_supply_index
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_x_index
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_y_index
+What:          /sys/.../iio:deviceX/scan_elements/in_accel_z_index
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_x_index
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_y_index
+What:          /sys/.../iio:deviceX/scan_elements/in_anglvel_z_index
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_x_index
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_y_index
+What:          /sys/.../iio:deviceX/scan_elements/in_magn_z_index
+What:          /sys/.../iio:deviceX/scan_elements/in_incli_x_index
+What:          /sys/.../iio:deviceX/scan_elements/in_incli_y_index
+What:          /sys/.../iio:deviceX/scan_elements/in_timestamp_index
+What:          /sys/.../iio:deviceX/scan_elements/in_pressureY_index
+What:          /sys/.../iio:deviceX/scan_elements/in_pressure_index
 KernelVersion: 2.6.37
 Contact:       linux-iio@vger.kernel.org
 Description:
diff --git a/Documentation/devicetree/bindings/iio/dac/ad7303.txt b/Documentation/devicetree/bindings/iio/dac/ad7303.txt
new file mode 100644 (file)
index 0000000..914610f
--- /dev/null
@@ -0,0 +1,23 @@
+Analog Devices AD7303 DAC device driver
+
+Required properties:
+       - compatible: Must be "adi,ad7303"
+       - reg: SPI chip select number for the device
+       - spi-max-frequency: Max SPI frequency to use (< 30000000)
+       - Vdd-supply: Phandle to the Vdd power supply
+
+Optional properties:
+       - REF-supply: Phandle to the external reference voltage supply. This should
+         only be set if there is an external reference voltage connected to the REF
+         pin. If the property is not set, Vdd/2 is used as the reference voltage.
+
+Example:
+
+               ad7303@4 {
+                       compatible = "adi,ad7303";
+                       reg = <4>;
+                       spi-max-frequency = <10000000>;
+                       Vdd-supply = <&vdd_supply>;
+                       REF-supply = <&vref_supply>;
+               };
diff --git a/Documentation/devicetree/bindings/iio/frequency/adf4350.txt b/Documentation/devicetree/bindings/iio/frequency/adf4350.txt
new file mode 100644 (file)
index 0000000..f8c181d
--- /dev/null
@@ -0,0 +1,86 @@
+Analog Devices ADF4350/ADF4351 device driver
+
+Required properties:
+       - compatible: Should be one of
+               * "adi,adf4350": When using the ADF4350 device
+               * "adi,adf4351": When using the ADF4351 device
+       - reg: SPI chip select number for the device
+       - spi-max-frequency: Max SPI frequency to use (< 20000000)
+       - clocks: From common clock binding. Clock is phandle to clock for
+               ADF435x Reference Clock (CLKIN).
+
+Optional properties:
+       - gpios:         GPIO Lock detect - If set with a valid phandle and GPIO number,
+                       PLL lock state is tested upon read.
+       - adi,channel-spacing: Channel spacing in Hz (influences MODULUS).
+       - adi,power-up-frequency:       If set, the PLL tunes to the desired
+                       frequency (in Hz) on probe.
+       - adi,reference-div-factor: If set, the driver skips dynamic calculation
+                       and uses this default value instead.
+       - adi,reference-doubler-enable: Enables reference doubler.
+       - adi,reference-div2-enable: Enables reference divider.
+       - adi,phase-detector-polarity-positive-enable: Enables positive phase
+                       detector polarity. Default = negative.
+       - adi,lock-detect-precision-6ns-enable: Enables 6ns lock detect precision.
+                       Default = 10ns.
+       - adi,lock-detect-function-integer-n-enable: Enables lock detect
+                       for integer-N mode. Default = fractional-N mode.
+       - adi,charge-pump-current: Charge pump current in uA.
+                       Default = 2500uA.
+       - adi,muxout-select: On-chip multiplexer output selection.
+                       Valid values for the multiplexer output are:
+                       0: Three-State Output (default)
+                       1: DVDD
+                       2: DGND
+                       3: R-Counter output
+                       4: N-Divider output
+                       5: Analog lock detect
+                       6: Digital lock detect
+       - adi,low-spur-mode-enable: Enables low spur mode.
+                       Default = Low noise mode.
+       - adi,cycle-slip-reduction-enable: Enables cycle slip reduction.
+       - adi,charge-cancellation-enable: Enables charge pump
+                       charge cancellation for integer-N modes.
+       - adi,anti-backlash-3ns-enable: Enables 3ns anti-backlash pulse width
+                        for integer-N modes.
+       - adi,band-select-clock-mode-high-enable: Enables faster band
+                       selection logic.
+       - adi,12bit-clk-divider: Clock divider value used when
+                       adi,clk-divider-mode != 0
+       - adi,clk-divider-mode:
+                       Valid values for the clkdiv mode are:
+                       0: Clock divider off (default)
+                       1: Fast lock enable
+                       2: Phase resync enable
+       - adi,aux-output-enable: Enables auxiliary RF output.
+       - adi,aux-output-fundamental-enable: Selects fundamental VCO output on
+                       the auxiliary RF output. Default = Output of RF dividers.
+       - adi,mute-till-lock-enable: Enables Mute-Till-Lock-Detect function.
+       - adi,output-power: Output power selection.
+                       Valid values for the power mode are:
+                       0: -4dBm (default)
+                       1: -1dBm
+                       2: +2dBm
+                       3: +5dBm
+       - adi,aux-output-power: Auxiliary output power selection.
+                       Valid values for the power mode are:
+                       0: -4dBm (default)
+                       1: -1dBm
+                       2: +2dBm
+                       3: +5dBm
+
+Example:
+               lo_pll0_rx_adf4351: adf4351-rx-lpc@4 {
+                       compatible = "adi,adf4351";
+                       reg = <4>;
+                       spi-max-frequency = <10000000>;
+                       clocks = <&clk0_ad9523 9>;
+                       clock-names = "clkin";
+                       adi,channel-spacing = <10000>;
+                       adi,power-up-frequency = <2400000000>;
+                       adi,phase-detector-polarity-positive-enable;
+                       adi,charge-pump-current = <2500>;
+                       adi,output-power = <3>;
+                       adi,mute-till-lock-enable;
+               };
diff --git a/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt b/Documentation/devicetree/bindings/iio/magnetometer/ak8975.txt
new file mode 100644 (file)
index 0000000..011679f
--- /dev/null
@@ -0,0 +1,18 @@
+* AsahiKASEI AK8975 magnetometer sensor
+
+Required properties:
+
+  - compatible : should be "asahi-kasei,ak8975"
+  - reg : the I2C address of the magnetometer
+
+Optional properties:
+
+  - gpios : should be the device tree identifier of the magnetometer DRDY pin
+
+Example:
+
+ak8975@0c {
+        compatible = "asahi-kasei,ak8975";
+        reg = <0x0c>;
+        gpios = <&gpj0 7 0>;
+};
index 5be702cc8449d3edb8107256bb03bac25d937238..93898597ab617aa08c6bfafb966c0a16aa3bc099 100644 (file)
@@ -7783,7 +7783,7 @@ F:        drivers/staging/media/solo6x10/
 STAGING - SPEAKUP CONSOLE SPEECH DRIVER
 M:     William Hubbs <w.d.hubbs@gmail.com>
 M:     Chris Brannon <chris@the-brannons.com>
-M:     Kirk Reiser <kirk@braille.uwo.ca>
+M:     Kirk Reiser <kirk@reisers.ca>
 M:     Samuel Thibault <samuel.thibault@ens-lyon.org>
 L:     speakup@braille.uwo.ca
 W:     http://www.linux-speakup.org/
index b2f963be39937d2f6fe0775f5c9510866e69f4ea..9af763a90d9330a42b7bebd7007b9d5fbe65b0d2 100644 (file)
@@ -70,5 +70,9 @@ source "drivers/iio/gyro/Kconfig"
 source "drivers/iio/imu/Kconfig"
 source "drivers/iio/light/Kconfig"
 source "drivers/iio/magnetometer/Kconfig"
+if IIO_TRIGGER
+   source "drivers/iio/trigger/Kconfig"
+endif #IIO_TRIGGER
+source "drivers/iio/pressure/Kconfig"
 
 endif # IIO
index a0e8cdd67e4ddef91083600d1ea80b7e1ea069dd..7a3866c2d2a140813c4d7560a19703cf0b62d826 100644 (file)
@@ -21,3 +21,5 @@ obj-y += frequency/
 obj-y += imu/
 obj-y += light/
 obj-y += magnetometer/
+obj-y += trigger/
+obj-y += pressure/
index bb594963f91e3d647e0993f0f60bb22a03ea1cdc..719d83fe51dd04620a3754529e64cf7a257349b2 100644 (file)
@@ -28,7 +28,6 @@ config IIO_ST_ACCEL_3AXIS
        select IIO_ST_ACCEL_I2C_3AXIS if (I2C)
        select IIO_ST_ACCEL_SPI_3AXIS if (SPI_MASTER)
        select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-       select IIO_ST_ACCEL_BUFFER if (IIO_TRIGGERED_BUFFER)
        help
          Say yes here to build support for STMicroelectronics accelerometers:
          LSM303DLH, LSM303DLHC, LIS3DH, LSM330D, LSM330DL, LSM330DLC,
index e0f5a3ceba5edee7ce96ed4736f0a4a5f94aa653..4aec121261d7bdd00993af55381c7371d8eb0ce1 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_accel.h"
 
+#define ST_ACCEL_NUMBER_DATA_CHANNELS          3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_ACCEL_DEFAULT_OUT_X_L_ADDR          0x28
 #define ST_ACCEL_DEFAULT_OUT_Y_L_ADDR          0x2a
 #define ST_ACCEL_3_MULTIREAD_BIT               false
 
 static const struct iio_chan_spec st_accel_12bit_channels[] = {
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-               ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-               ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-               ST_SENSORS_DEFAULT_12_REALBITS, ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 12, 16,
+                       ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 12, 16,
+                       ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 12, 16,
+                       ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
        IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_accel_16bit_channels[] = {
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+                       ST_ACCEL_DEFAULT_OUT_X_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+                       ST_ACCEL_DEFAULT_OUT_Y_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+                       ST_ACCEL_DEFAULT_OUT_Z_L_ADDR),
        IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -442,6 +456,7 @@ int st_accel_common_probe(struct iio_dev *indio_dev)
        if (err < 0)
                goto st_accel_common_probe_error;
 
+       adata->num_data_channels = ST_ACCEL_NUMBER_DATA_CHANNELS;
        adata->multiread_bit = adata->sensor->multi_read_bit;
        indio_dev->channels = adata->sensor->ch;
        indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;
index ab0767e6727ec5b5d04ac149b28e0247219c3a55..93129ec4b649b9df77fcb95f72af31c1e3417b42 100644 (file)
@@ -133,6 +133,16 @@ config MAX1363
          max11646, max11647) Provides direct access via sysfs and buffered
          data via the iio dev interface.
 
+config MCP320X
+       tristate "Microchip Technology MCP3204/08"
+       depends on SPI
+       help
+         Say yes here to build support for Microchip Technology's MCP3204 or
+         MCP3208 analog-to-digital converter.
+
+         This driver can also be built as a module. If so, the module will be
+         called mcp320x.
+
 config TI_ADC081C
        tristate "Texas Instruments ADC081C021/027"
        depends on I2C
index 0a825bed43f6a1fa56e9901c1cb87520ceaee947..8f475d31fe4d884bf2f451e80cf7cefca9617332 100644 (file)
@@ -14,6 +14,7 @@ obj-$(CONFIG_AT91_ADC) += at91_adc.o
 obj-$(CONFIG_EXYNOS_ADC) += exynos_adc.o
 obj-$(CONFIG_LP8788_ADC) += lp8788_adc.o
 obj-$(CONFIG_MAX1363) += max1363.o
+obj-$(CONFIG_MCP320X) += mcp320x.o
 obj-$(CONFIG_TI_ADC081C) += ti-adc081c.o
 obj-$(CONFIG_TI_AM335X_ADC) += ti_am335x_adc.o
 obj-$(CONFIG_VIPERBOARD_ADC) += viperboard_adc.o
index e5b88d5d3b59425cf07e79ba20937322fa8c50fc..b6db6a0e09cd8107a58ccc1834d02feb15de6836 100644 (file)
@@ -774,11 +774,13 @@ static int at91_adc_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id at91_adc_dt_ids[] = {
        { .compatible = "atmel,at91sam9260-adc" },
        {},
 };
 MODULE_DEVICE_TABLE(of, at91_adc_dt_ids);
+#endif
 
 static struct platform_driver at91_adc_driver = {
        .probe = at91_adc_probe,
index b3d03d335948a5a5757bd4d49650b1685cbfb78b..9809fc9a35d2f09b28a9b0b1e1839f7a3e5df1aa 100644 (file)
@@ -270,16 +270,16 @@ static int exynos_adc_probe(struct platform_device *pdev)
        info = iio_priv(indio_dev);
 
        mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       info->regs = devm_request_and_ioremap(&pdev->dev, mem);
-       if (!info->regs) {
-               ret = -ENOMEM;
+       info->regs = devm_ioremap_resource(&pdev->dev, mem);
+       if (IS_ERR(info->regs)) {
+               ret = PTR_ERR(info->regs);
                goto err_iio;
        }
 
        mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-       info->enable_reg = devm_request_and_ioremap(&pdev->dev, mem);
-       if (!info->enable_reg) {
-               ret = -ENOMEM;
+       info->enable_reg = devm_ioremap_resource(&pdev->dev, mem);
+       if (IS_ERR(info->enable_reg)) {
+               ret = PTR_ERR(info->enable_reg);
                goto err_iio;
        }
 
index 9e6da72ad82324d1a7b2d11294d9da76616408ba..f148d00b83f7dba9a4245f5069d3e0a8a4888dab 100644 (file)
@@ -660,7 +660,7 @@ static ssize_t max1363_monitor_store_freq(struct device *dev,
        unsigned long val;
        bool found = false;
 
-       ret = strict_strtoul(buf, 10, &val);
+       ret = kstrtoul(buf, 10, &val);
        if (ret)
                return -EINVAL;
        for (i = 0; i < ARRAY_SIZE(max1363_monitor_speeds); i++)
diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c
new file mode 100644 (file)
index 0000000..ebc0159
--- /dev/null
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2013 Oskar Andero <oskar.andero@gmail.com>
+ *
+ * Driver for Microchip Technology's MCP3204 and MCP3208 ADC chips.
+ * Datasheet can be found here:
+ * http://ww1.microchip.com/downloads/en/devicedoc/21298c.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/spi/spi.h>
+#include <linux/module.h>
+#include <linux/iio/iio.h>
+#include <linux/regulator/consumer.h>
+
+#define MCP_SINGLE_ENDED       (1 << 3)
+#define MCP_START_BIT          (1 << 4)
+
+enum {
+       mcp3204,
+       mcp3208,
+};
+
+struct mcp320x {
+       struct spi_device *spi;
+       struct spi_message msg;
+       struct spi_transfer transfer[2];
+
+       u8 tx_buf;
+       u8 rx_buf[2];
+
+       struct regulator *reg;
+       struct mutex lock;
+};
+
+static int mcp320x_adc_conversion(struct mcp320x *adc, u8 msg)
+{
+       int ret;
+
+       adc->tx_buf = msg;
+       ret = spi_sync(adc->spi, &adc->msg);
+       if (ret < 0)
+               return ret;
+
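+       /* 12-bit result: bits 11..6 arrive in the low 6 bits of rx_buf[0],
+        * bits 5..0 in the top 6 bits of rx_buf[1]
+        */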
+       return ((adc->rx_buf[0] & 0x3f) << 6)  |
+               (adc->rx_buf[1] >> 2);
+}
+
+static int mcp320x_read_raw(struct iio_dev *indio_dev,
+                           struct iio_chan_spec const *channel, int *val,
+                           int *val2, long mask)
+{
+       struct mcp320x *adc = iio_priv(indio_dev);
+       int ret = -EINVAL;
+
+       mutex_lock(&adc->lock);
+
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               if (channel->differential)
+                       ret = mcp320x_adc_conversion(adc,
+                               MCP_START_BIT | channel->address);
+               else
+                       ret = mcp320x_adc_conversion(adc,
+                               MCP_START_BIT | MCP_SINGLE_ENDED |
+                               channel->address);
+               if (ret < 0)
+                       goto out;
+
+               *val = ret;
+               ret = IIO_VAL_INT;
+               break;
+
+       case IIO_CHAN_INFO_SCALE:
+               /* Digital output code = (4096 * Vin) / Vref */
+               ret = regulator_get_voltage(adc->reg);
+               if (ret < 0)
+                       goto out;
+
+               *val = ret / 1000;
+               *val2 = 12;
+               ret = IIO_VAL_FRACTIONAL_LOG2;
+               break;
+
+       default:
+               break;
+       }
+
+out:
+       mutex_unlock(&adc->lock);
+
+       return ret;
+}
+
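+/*
+ * Single-ended channels select one input via the request address bits;
+ * differential channels pair adjacent inputs (CH0/CH1, CH2/CH3, ...) and
+ * are requested with MCP_SINGLE_ENDED cleared.
+ */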
+#define MCP320X_VOLTAGE_CHANNEL(num)                           \
+       {                                                       \
+               .type = IIO_VOLTAGE,                            \
+               .indexed = 1,                                   \
+               .channel = (num),                               \
+               .address = (num),                               \
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),   \
+               .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) \
+       }
+
+#define MCP320X_VOLTAGE_CHANNEL_DIFF(num)                      \
+       {                                                       \
+               .type = IIO_VOLTAGE,                            \
+               .indexed = 1,                                   \
+               .channel = (num * 2),                           \
+               .channel2 = (num * 2 + 1),                      \
+               .address = (num * 2),                           \
+               .differential = 1,                              \
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),   \
+               .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) \
+       }
+
+static const struct iio_chan_spec mcp3204_channels[] = {
+       MCP320X_VOLTAGE_CHANNEL(0),
+       MCP320X_VOLTAGE_CHANNEL(1),
+       MCP320X_VOLTAGE_CHANNEL(2),
+       MCP320X_VOLTAGE_CHANNEL(3),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(0),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(1),
+};
+
+static const struct iio_chan_spec mcp3208_channels[] = {
+       MCP320X_VOLTAGE_CHANNEL(0),
+       MCP320X_VOLTAGE_CHANNEL(1),
+       MCP320X_VOLTAGE_CHANNEL(2),
+       MCP320X_VOLTAGE_CHANNEL(3),
+       MCP320X_VOLTAGE_CHANNEL(4),
+       MCP320X_VOLTAGE_CHANNEL(5),
+       MCP320X_VOLTAGE_CHANNEL(6),
+       MCP320X_VOLTAGE_CHANNEL(7),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(0),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(1),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(2),
+       MCP320X_VOLTAGE_CHANNEL_DIFF(3),
+};
+
+static const struct iio_info mcp320x_info = {
+       .read_raw = mcp320x_read_raw,
+       .driver_module = THIS_MODULE,
+};
+
+struct mcp3208_chip_info {
+       const struct iio_chan_spec *channels;
+       unsigned int num_channels;
+};
+
+static const struct mcp3208_chip_info mcp3208_chip_infos[] = {
+       [mcp3204] = {
+               .channels = mcp3204_channels,
+               .num_channels = ARRAY_SIZE(mcp3204_channels)
+       },
+       [mcp3208] = {
+               .channels = mcp3208_channels,
+               .num_channels = ARRAY_SIZE(mcp3208_channels)
+       },
+};
+
+static int mcp320x_probe(struct spi_device *spi)
+{
+       struct iio_dev *indio_dev;
+       struct mcp320x *adc;
+       const struct mcp3208_chip_info *chip_info;
+       int ret;
+
+       indio_dev = iio_device_alloc(sizeof(*adc));
+       if (!indio_dev)
+               return -ENOMEM;
+
+       adc = iio_priv(indio_dev);
+       adc->spi = spi;
+
+       indio_dev->dev.parent = &spi->dev;
+       indio_dev->name = spi_get_device_id(spi)->name;
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->info = &mcp320x_info;
+
+       chip_info = &mcp3208_chip_infos[spi_get_device_id(spi)->driver_data];
+       indio_dev->channels = chip_info->channels;
+       indio_dev->num_channels = chip_info->num_channels;
+
+       adc->transfer[0].tx_buf = &adc->tx_buf;
+       adc->transfer[0].len = sizeof(adc->tx_buf);
+       adc->transfer[1].rx_buf = adc->rx_buf;
+       adc->transfer[1].len = sizeof(adc->rx_buf);
+
+       spi_message_init_with_transfers(&adc->msg, adc->transfer,
+                                       ARRAY_SIZE(adc->transfer));
+
+       adc->reg = regulator_get(&spi->dev, "vref");
+       if (IS_ERR(adc->reg)) {
+               ret = PTR_ERR(adc->reg);
+               goto iio_free;
+       }
+
+       ret = regulator_enable(adc->reg);
+       if (ret < 0)
+               goto reg_free;
+
+       mutex_init(&adc->lock);
+
+       ret = iio_device_register(indio_dev);
+       if (ret < 0)
+               goto reg_disable;
+
+       return 0;
+
+reg_disable:
+       regulator_disable(adc->reg);
+reg_free:
+       regulator_put(adc->reg);
+iio_free:
+       iio_device_free(indio_dev);
+
+       return ret;
+}
+
+static int mcp320x_remove(struct spi_device *spi)
+{
+       struct iio_dev *indio_dev = spi_get_drvdata(spi);
+       struct mcp320x *adc = iio_priv(indio_dev);
+
+       iio_device_unregister(indio_dev);
+       regulator_disable(adc->reg);
+       regulator_put(adc->reg);
+       iio_device_free(indio_dev);
+
+       return 0;
+}
+
+static const struct spi_device_id mcp320x_id[] = {
+       { "mcp3204", mcp3204 },
+       { "mcp3208", mcp3208 },
+       { }
+};
+MODULE_DEVICE_TABLE(spi, mcp320x_id);
+
+static struct spi_driver mcp320x_driver = {
+       .driver = {
+               .name = "mcp320x",
+               .owner = THIS_MODULE,
+       },
+       .probe = mcp320x_probe,
+       .remove = mcp320x_remove,
+       .id_table = mcp320x_id,
+};
+module_spi_driver(mcp320x_driver);
+
+MODULE_AUTHOR("Oskar Andero <oskar.andero@gmail.com>");
+MODULE_DESCRIPTION("Microchip Technology MCP3204/08");
+MODULE_LICENSE("GPL v2");
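
The patch adds no device-tree support for mcp320x, so instantiation goes
through the SPI core from board code; a minimal sketch, with bus number,
chip select and speed as illustrative assumptions:

	#include <linux/init.h>
	#include <linux/spi/spi.h>

	/* hypothetical wiring: MCP3204 on SPI bus 0, chip select 0 */
	static struct spi_board_info mcp320x_board_info __initdata = {
		.modalias	= "mcp3204",	/* matches mcp320x_id[] */
		.max_speed_hz	= 1000000,
		.bus_num	= 0,
		.chip_select	= 0,
	};

	/* from machine init code:
	 *	spi_register_board_info(&mcp320x_board_info, 1);
	 */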
index 09b236d6ee8943c62c9088691854a0dea187ebc2..71a2c5f63b9ced2361c5fb94f255cc3c6430ba23 100644 (file)
 
 int st_sensors_get_buffer_element(struct iio_dev *indio_dev, u8 *buf)
 {
+       u8 *addr;
        int i, n = 0, len;
-       u8 addr[ST_SENSORS_NUMBER_DATA_CHANNELS];
        struct st_sensor_data *sdata = iio_priv(indio_dev);
+       unsigned int num_data_channels = sdata->num_data_channels;
+       unsigned int byte_for_channel =
+                       indio_dev->channels[0].scan_type.storagebits >> 3;
 
-       for (i = 0; i < ST_SENSORS_NUMBER_DATA_CHANNELS; i++) {
+       addr = kmalloc(num_data_channels, GFP_KERNEL);
+       if (!addr) {
+               len = -ENOMEM;
+               goto st_sensors_get_buffer_element_error;
+       }
+
+       for (i = 0; i < num_data_channels; i++) {
                if (test_bit(i, indio_dev->active_scan_mask)) {
                        addr[n] = indio_dev->channels[i].address;
                        n++;
@@ -37,52 +46,58 @@ int st_sensors_get_buffer_element(struct iio_dev *indio_dev, u8 *buf)
        switch (n) {
        case 1:
                len = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-                       addr[0], ST_SENSORS_BYTE_FOR_CHANNEL, buf,
-                       sdata->multiread_bit);
+                       addr[0], byte_for_channel, buf, sdata->multiread_bit);
                break;
        case 2:
-               if ((addr[1] - addr[0]) == ST_SENSORS_BYTE_FOR_CHANNEL) {
+               if ((addr[1] - addr[0]) == byte_for_channel) {
                        len = sdata->tf->read_multiple_byte(&sdata->tb,
-                                       sdata->dev, addr[0],
-                                       ST_SENSORS_BYTE_FOR_CHANNEL*n,
-                                       buf, sdata->multiread_bit);
+                               sdata->dev, addr[0], byte_for_channel * n,
+                               buf, sdata->multiread_bit);
                } else {
-                       u8 rx_array[ST_SENSORS_BYTE_FOR_CHANNEL*
-                                   ST_SENSORS_NUMBER_DATA_CHANNELS];
+                       u8 *rx_array;
+                       rx_array = kmalloc(byte_for_channel * num_data_channels,
+                                          GFP_KERNEL);
+                       if (!rx_array) {
+                               len = -ENOMEM;
+                               goto st_sensors_free_memory;
+                       }
+
                        len = sdata->tf->read_multiple_byte(&sdata->tb,
                                sdata->dev, addr[0],
-                               ST_SENSORS_BYTE_FOR_CHANNEL*
-                               ST_SENSORS_NUMBER_DATA_CHANNELS,
+                               byte_for_channel * num_data_channels,
                                rx_array, sdata->multiread_bit);
-                       if (len < 0)
-                               goto read_data_channels_error;
+                       if (len < 0) {
+                               kfree(rx_array);
+                               goto st_sensors_free_memory;
+                       }
 
-                       for (i = 0; i < n * ST_SENSORS_NUMBER_DATA_CHANNELS;
-                                                                       i++) {
+                       for (i = 0; i < n * num_data_channels; i++) {
                                if (i < n)
                                        buf[i] = rx_array[i];
                                else
                                        buf[i] = rx_array[n + i];
                        }
-                       len = ST_SENSORS_BYTE_FOR_CHANNEL*n;
+                       kfree(rx_array);
+                       len = byte_for_channel * n;
                }
                break;
        case 3:
                len = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-                       addr[0], ST_SENSORS_BYTE_FOR_CHANNEL*
-                       ST_SENSORS_NUMBER_DATA_CHANNELS,
+                       addr[0], byte_for_channel * num_data_channels,
                        buf, sdata->multiread_bit);
                break;
        default:
                len = -EINVAL;
-               goto read_data_channels_error;
+               goto st_sensors_free_memory;
        }
-       if (len != ST_SENSORS_BYTE_FOR_CHANNEL*n) {
+       if (len != byte_for_channel * n) {
                len = -EIO;
-               goto read_data_channels_error;
+               goto st_sensors_free_memory;
        }
 
-read_data_channels_error:
+st_sensors_free_memory:
+       kfree(addr);
+st_sensors_get_buffer_element_error:
        return len;
 }
 EXPORT_SYMBOL(st_sensors_get_buffer_element);
index ed9bc8ae933030158c9d23b41acfc2bd2de7d61e..865b1781df6602db631dacdee3218c90308c6712 100644 (file)
 
 #define ST_SENSORS_WAI_ADDRESS         0x0f
 
+static inline u32 st_sensors_get_unaligned_le24(const u8 *p)
+{
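+       /* assemble the little-endian 24-bit word, then shift up and
+        * arithmetically back down to sign-extend it
+        */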
+       return ((s32)((p[0] | p[1] << 8 | p[2] << 16) << 8) >> 8);
+}
+
 static int st_sensors_write_data_with_mask(struct iio_dev *indio_dev,
                                                u8 reg_addr, u8 mask, u8 data)
 {
@@ -112,7 +117,8 @@ st_sensors_match_odr_error:
        return ret;
 }
 
-static int st_sensors_set_fullscale(struct iio_dev *indio_dev, unsigned int fs)
+static int st_sensors_set_fullscale(struct iio_dev *indio_dev,
+                                                       unsigned int fs)
 {
        int err, i = 0;
        struct st_sensor_data *sdata = iio_priv(indio_dev);
@@ -273,21 +279,33 @@ st_sensors_match_scale_error:
 EXPORT_SYMBOL(st_sensors_set_fullscale_by_gain);
 
 static int st_sensors_read_axis_data(struct iio_dev *indio_dev,
-                                                       u8 ch_addr, int *data)
+                               struct iio_chan_spec const *ch, int *data)
 {
        int err;
-       u8 outdata[ST_SENSORS_BYTE_FOR_CHANNEL];
+       u8 *outdata;
        struct st_sensor_data *sdata = iio_priv(indio_dev);
+       unsigned int byte_for_channel = ch->scan_type.storagebits >> 3;
+
+       outdata = kmalloc(byte_for_channel, GFP_KERNEL);
+       if (!outdata) {
+               err = -ENOMEM;
+               goto st_sensors_read_axis_data_error;
+       }
 
        err = sdata->tf->read_multiple_byte(&sdata->tb, sdata->dev,
-                               ch_addr, ST_SENSORS_BYTE_FOR_CHANNEL,
+                               ch->address, byte_for_channel,
                                outdata, sdata->multiread_bit);
        if (err < 0)
-               goto read_error;
+               goto st_sensors_free_memory;
 
-       *data = (s16)get_unaligned_le16(outdata);
+       if (byte_for_channel == 2)
+               *data = (s16)get_unaligned_le16(outdata);
+       else if (byte_for_channel == 3)
+               *data = (s32)st_sensors_get_unaligned_le24(outdata);
 
-read_error:
+st_sensors_free_memory:
+       kfree(outdata);
+st_sensors_read_axis_data_error:
        return err;
 }
 
@@ -307,7 +325,7 @@ int st_sensors_read_info_raw(struct iio_dev *indio_dev,
                        goto read_error;
 
                msleep((sdata->sensor->bootime * 1000) / sdata->odr);
-               err = st_sensors_read_axis_data(indio_dev, ch->address, val);
+               err = st_sensors_read_axis_data(indio_dev, ch, val);
                if (err < 0)
                        goto read_error;
 
index b61160bd935eafffd7e04207a0e0fb043f87450f..c9c33ce32d3a729fc4e325a96f388116f35ee63e 100644 (file)
@@ -130,6 +130,16 @@ config AD5686
          To compile this driver as a module, choose M here: the
          module will be called ad5686.
 
+config AD7303
+       tristate "Analog Devices AD7303 DAC driver"
+       depends on SPI
+       help
+         Say yes here to build support for the Analog Devices AD7303 Digital
+         to Analog Converter (DAC).
+
+         To compile this driver as a module, choose M here: the module will
+         be called ad7303.
+
 config MAX517
        tristate "Maxim MAX517/518/519 DAC driver"
        depends on I2C
index 5b528ebb3343c60d260506821510ea60316da26f..c8d7ab6bff01b8b97ddbd3a020811a588a4f08c8 100644 (file)
@@ -14,5 +14,6 @@ obj-$(CONFIG_AD5755) += ad5755.o
 obj-$(CONFIG_AD5764) += ad5764.o
 obj-$(CONFIG_AD5791) += ad5791.o
 obj-$(CONFIG_AD5686) += ad5686.o
+obj-$(CONFIG_AD7303) += ad7303.o
 obj-$(CONFIG_MAX517) += max517.o
 obj-$(CONFIG_MCP4725) += mcp4725.o
diff --git a/drivers/iio/dac/ad7303.c b/drivers/iio/dac/ad7303.c
new file mode 100644 (file)
index 0000000..85aeef6
--- /dev/null
@@ -0,0 +1,315 @@
+/*
+ * AD7303 Digital to analog converters driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/spi/spi.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/regulator/consumer.h>
+#include <linux/of.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+
+#include <linux/platform_data/ad7303.h>
+
+#define AD7303_CFG_EXTERNAL_VREF BIT(15)
+#define AD7303_CFG_POWER_DOWN(ch) BIT(11 + (ch))
+#define AD7303_CFG_ADDR_OFFSET 10
+
+#define AD7303_CMD_UPDATE_DAC  (0x3 << 8)
+
+/**
+ * struct ad7303_state - driver instance specific data
+ * @spi:               the device for this driver instance
+ * @config:            cached config register value
+ * @dac_cache:         current DAC raw value (chip does not support readback)
+ * @vdd_reg:           Vdd supply regulator
+ * @vref_reg:          external reference voltage regulator (optional)
+ * @data:              spi transfer buffer
+ */
+struct ad7303_state {
+       struct spi_device *spi;
+       uint16_t config;
+       uint8_t dac_cache[2];
+
+       struct regulator *vdd_reg;
+       struct regulator *vref_reg;
+
+       /*
+        * DMA (thus cache coherency maintenance) requires the
+        * transfer buffers to live in their own cache lines.
+        */
+       __be16 data ____cacheline_aligned;
+};
+
+static int ad7303_write(struct ad7303_state *st, unsigned int chan,
+       uint8_t val)
+{
+       st->data = cpu_to_be16(AD7303_CMD_UPDATE_DAC |
+               (chan << AD7303_CFG_ADDR_OFFSET) |
+               st->config | val);
+
+       return spi_write(st->spi, &st->data, sizeof(st->data));
+}
+
+static ssize_t ad7303_read_dac_powerdown(struct iio_dev *indio_dev,
+       uintptr_t private, const struct iio_chan_spec *chan, char *buf)
+{
+       struct ad7303_state *st = iio_priv(indio_dev);
+
+       return sprintf(buf, "%d\n", (bool)(st->config &
+               AD7303_CFG_POWER_DOWN(chan->channel)));
+}
+
+static ssize_t ad7303_write_dac_powerdown(struct iio_dev *indio_dev,
+        uintptr_t private, const struct iio_chan_spec *chan, const char *buf,
+        size_t len)
+{
+       struct ad7303_state *st = iio_priv(indio_dev);
+       bool pwr_down;
+       int ret;
+
+       ret = strtobool(buf, &pwr_down);
+       if (ret)
+               return ret;
+
+       mutex_lock(&indio_dev->mlock);
+
+       if (pwr_down)
+               st->config |= AD7303_CFG_POWER_DOWN(chan->channel);
+       else
+               st->config &= ~AD7303_CFG_POWER_DOWN(chan->channel);
+
+       /*
+        * There is no noop cmd which allows us to only update the powerdown
+        * mode, so just write one of the DAC channels again.
+        */
+       ret = ad7303_write(st, chan->channel, st->dac_cache[chan->channel]);
+
+       mutex_unlock(&indio_dev->mlock);
+       return ret ? ret : len;
+}
+
+static int ad7303_get_vref(struct ad7303_state *st,
+       struct iio_chan_spec const *chan)
+{
+       int ret;
+
+       if (st->config & AD7303_CFG_EXTERNAL_VREF)
+               return regulator_get_voltage(st->vref_reg);
+
+       ret = regulator_get_voltage(st->vdd_reg);
+       if (ret < 0)
+               return ret;
+       return ret / 2;
+}
+
+static int ad7303_read_raw(struct iio_dev *indio_dev,
+       struct iio_chan_spec const *chan, int *val, int *val2, long info)
+{
+       struct ad7303_state *st = iio_priv(indio_dev);
+       int vref_uv;
+
+       switch (info) {
+       case IIO_CHAN_INFO_RAW:
+               *val = st->dac_cache[chan->channel];
+               return IIO_VAL_INT;
+       case IIO_CHAN_INFO_SCALE:
+               vref_uv = ad7303_get_vref(st, chan);
+               if (vref_uv < 0)
+                       return vref_uv;
+
+               *val = 2 * vref_uv / 1000;
+               *val2 = chan->scan_type.realbits;
+
+               return IIO_VAL_FRACTIONAL_LOG2;
+       default:
+               break;
+       }
+       return -EINVAL;
+}
+
+static int ad7303_write_raw(struct iio_dev *indio_dev,
+       struct iio_chan_spec const *chan, int val, int val2, long mask)
+{
+       struct ad7303_state *st = iio_priv(indio_dev);
+       int ret;
+
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               if (val >= (1 << chan->scan_type.realbits) || val < 0)
+                       return -EINVAL;
+
+               mutex_lock(&indio_dev->mlock);
+               ret = ad7303_write(st, chan->address, val);
+               if (ret == 0)
+                       st->dac_cache[chan->channel] = val;
+               mutex_unlock(&indio_dev->mlock);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static const struct iio_info ad7303_info = {
+       .read_raw = ad7303_read_raw,
+       .write_raw = ad7303_write_raw,
+       .driver_module = THIS_MODULE,
+};
+
+static const struct iio_chan_spec_ext_info ad7303_ext_info[] = {
+       {
+               .name = "powerdown",
+               .read = ad7303_read_dac_powerdown,
+               .write = ad7303_write_dac_powerdown,
+       },
+       { },
+};
+
+#define AD7303_CHANNEL(chan) {                                 \
+       .type = IIO_VOLTAGE,                                    \
+       .indexed = 1,                                           \
+       .output = 1,                                            \
+       .channel = (chan),                                      \
+       .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),           \
+       .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE),   \
+       .address = (chan),                                      \
+       .scan_type = {                                          \
+               .sign = 'u',                                    \
+               .realbits = 8,                                  \
+               .storagebits = 8,                               \
+               .shift = 0,                                     \
+       },                                                      \
+       .ext_info = ad7303_ext_info,                            \
+}
+
+static const struct iio_chan_spec ad7303_channels[] = {
+       AD7303_CHANNEL(0),
+       AD7303_CHANNEL(1),
+};
+
+static int ad7303_probe(struct spi_device *spi)
+{
+       const struct spi_device_id *id = spi_get_device_id(spi);
+       struct iio_dev *indio_dev;
+       struct ad7303_state *st;
+       bool ext_ref;
+       int ret;
+
+       indio_dev = iio_device_alloc(sizeof(*st));
+       if (indio_dev == NULL)
+               return -ENOMEM;
+
+       st = iio_priv(indio_dev);
+       spi_set_drvdata(spi, indio_dev);
+
+       st->spi = spi;
+
+       st->vdd_reg = regulator_get(&spi->dev, "Vdd");
+       if (IS_ERR(st->vdd_reg)) {
+               ret = PTR_ERR(st->vdd_reg);
+               goto err_free;
+       }
+
+       ret = regulator_enable(st->vdd_reg);
+       if (ret)
+               goto err_put_vdd_reg;
+
+       if (spi->dev.of_node) {
+               ext_ref = of_property_read_bool(spi->dev.of_node,
+                               "REF-supply");
+       } else {
+               struct ad7303_platform_data *pdata = spi->dev.platform_data;
+               if (pdata && pdata->use_external_ref)
+                       ext_ref = true;
+               else
+                       ext_ref = false;
+       }
+
+       if (ext_ref) {
+               st->vref_reg = regulator_get(&spi->dev, "REF");
+               if (IS_ERR(st->vref_reg)) {
+                       ret = PTR_ERR(st->vref_reg);
+                       goto err_disable_vdd_reg;
+               }
+
+               ret = regulator_enable(st->vref_reg);
+               if (ret)
+                       goto err_put_vref_reg;
+
+               st->config |= AD7303_CFG_EXTERNAL_VREF;
+       }
+
+       indio_dev->dev.parent = &spi->dev;
+       indio_dev->name = id->name;
+       indio_dev->info = &ad7303_info;
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->channels = ad7303_channels;
+       indio_dev->num_channels = ARRAY_SIZE(ad7303_channels);
+
+       ret = iio_device_register(indio_dev);
+       if (ret)
+               goto err_disable_vref_reg;
+
+       return 0;
+
+err_disable_vref_reg:
+       if (st->vref_reg)
+               regulator_disable(st->vref_reg);
+err_put_vref_reg:
+       if (st->vref_reg)
+               regulator_put(st->vref_reg);
+err_disable_vdd_reg:
+       regulator_disable(st->vdd_reg);
+err_put_vdd_reg:
+       regulator_put(st->vdd_reg);
+err_free:
+       iio_device_free(indio_dev);
+
+       return ret;
+}
+
+static int ad7303_remove(struct spi_device *spi)
+{
+       struct iio_dev *indio_dev = spi_get_drvdata(spi);
+       struct ad7303_state *st = iio_priv(indio_dev);
+
+       iio_device_unregister(indio_dev);
+
+       if (st->vref_reg) {
+               regulator_disable(st->vref_reg);
+               regulator_put(st->vref_reg);
+       }
+       regulator_disable(st->vdd_reg);
+       regulator_put(st->vdd_reg);
+
+       iio_device_free(indio_dev);
+
+       return 0;
+}
+
+static const struct spi_device_id ad7303_spi_ids[] = {
+       { "ad7303", 0 },
+       {}
+};
+MODULE_DEVICE_TABLE(spi, ad7303_spi_ids);
+
+static struct spi_driver ad7303_driver = {
+       .driver = {
+               .name = "ad7303",
+               .owner = THIS_MODULE,
+       },
+       .probe = ad7303_probe,
+       .remove = ad7303_remove,
+       .id_table = ad7303_spi_ids,
+};
+module_spi_driver(ad7303_driver);
+
+MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
+MODULE_DESCRIPTION("Analog Devices AD7303 DAC driver");
+MODULE_LICENSE("GPL v2");
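
For reference, the SCALE path above reports 2 * Vref in millivolts with an
8-bit fractional divisor (IIO_VAL_FRACTIONAL_LOG2), which matches the AD7303
transfer function Vout = 2 x Vref x D / 256. A small sketch of that
arithmetic (helper name illustrative, not from the driver):

	/* sketch: DAC code to output voltage in millivolts,
	 * mirroring the IIO_CHAN_INFO_SCALE math above */
	static unsigned int ad7303_code_to_mv(unsigned int code,
					      unsigned int vref_mv)
	{
		return (code * 2 * vref_mv) >> 8;	/* 8-bit DAC */
	}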
index e76d4ace53ff76d73917933a191da08b907cdc2f..a4157cdb314d5c1a8975cd28216943b40dd38bd6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * ADF4350/ADF4351 SPI Wideband Synthesizer driver
  *
- * Copyright 2012 Analog Devices Inc.
+ * Copyright 2012-2013 Analog Devices Inc.
  *
  * Licensed under the GPL-2.
  */
@@ -17,6 +17,9 @@
 #include <linux/gcd.h>
 #include <linux/gpio.h>
 #include <asm/div64.h>
+#include <linux/clk.h>
+#include <linux/of.h>
+#include <linux/of_gpio.h>
 
 #include <linux/iio/iio.h>
 #include <linux/iio/sysfs.h>
@@ -33,6 +36,7 @@ struct adf4350_state {
        struct spi_device               *spi;
        struct regulator                *reg;
        struct adf4350_platform_data    *pdata;
+       struct clk                      *clk;
        unsigned long                   clkin;
        unsigned long                   chspc; /* Channel Spacing */
        unsigned long                   fpfd; /* Phase Frequency Detector */
@@ -43,7 +47,7 @@ struct adf4350_state {
        unsigned                        r4_rf_div_sel;
        unsigned long                   regs[6];
        unsigned long                   regs_hw[6];
-
+       unsigned long long              freq_req;
        /*
         * DMA (thus cache coherency maintenance) requires the
         * transfer buffers to live in their own cache lines.
@@ -52,7 +56,6 @@ struct adf4350_state {
 };
 
 static struct adf4350_platform_data default_pdata = {
-       .clkin = 122880000,
        .channel_spacing = 10000,
        .r2_user_settings = ADF4350_REG2_PD_POLARITY_POS |
                            ADF4350_REG2_CHARGE_PUMP_CURR_uA(2500),
@@ -235,6 +238,7 @@ static int adf4350_set_freq(struct adf4350_state *st, unsigned long long freq)
                ADF4350_REG4_MUTE_TILL_LOCK_EN));
 
        st->regs[ADF4350_REG5] = ADF4350_REG5_LD_PIN_MODE_DIGITAL;
+       st->freq_req = freq;
 
        return adf4350_sync_config(st);
 }
@@ -246,6 +250,7 @@ static ssize_t adf4350_write(struct iio_dev *indio_dev,
 {
        struct adf4350_state *st = iio_priv(indio_dev);
        unsigned long long readin;
+       unsigned long tmp;
        int ret;
 
        ret = kstrtoull(buf, 10, &readin);
@@ -258,10 +263,23 @@ static ssize_t adf4350_write(struct iio_dev *indio_dev,
                ret = adf4350_set_freq(st, readin);
                break;
        case ADF4350_FREQ_REFIN:
-               if (readin > ADF4350_MAX_FREQ_REFIN)
+               if (readin > ADF4350_MAX_FREQ_REFIN) {
                        ret = -EINVAL;
-               else
-                       st->clkin = readin;
+                       break;
+               }
+
+               if (st->clk) {
+                       tmp = clk_round_rate(st->clk, readin);
+                       if (tmp != readin) {
+                               ret = -EINVAL;
+                               break;
+                       }
+                       ret = clk_set_rate(st->clk, tmp);
+                       if (ret < 0)
+                               break;
+               }
+               st->clkin = readin;
+               ret = adf4350_set_freq(st, st->freq_req);
                break;
        case ADF4350_FREQ_RESOLUTION:
                if (readin == 0)
@@ -308,6 +326,9 @@ static ssize_t adf4350_read(struct iio_dev *indio_dev,
                        }
                break;
        case ADF4350_FREQ_REFIN:
+               if (st->clk)
+                       st->clkin = clk_get_rate(st->clk);
+
                val = st->clkin;
                break;
        case ADF4350_FREQ_RESOLUTION:
@@ -318,6 +339,7 @@ static ssize_t adf4350_read(struct iio_dev *indio_dev,
                break;
        default:
                ret = -EINVAL;
+               val = 0;
        }
        mutex_unlock(&indio_dev->mlock);
 
@@ -355,19 +377,153 @@ static const struct iio_info adf4350_info = {
        .driver_module = THIS_MODULE,
 };
 
+#ifdef CONFIG_OF
+static struct adf4350_platform_data *adf4350_parse_dt(struct device *dev)
+{
+       struct device_node *np = dev->of_node;
+       struct adf4350_platform_data *pdata;
+       unsigned int tmp;
+       int ret;
+
+       pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
+       if (!pdata) {
+               dev_err(dev, "could not allocate memory for platform data\n");
+               return NULL;
+       }
+
+       strncpy(&pdata->name[0], np->name, SPI_NAME_SIZE - 1);
+
+       tmp = 10000;
+       of_property_read_u32(np, "adi,channel-spacing", &tmp);
+       pdata->channel_spacing = tmp;
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,power-up-frequency", &tmp);
+       pdata->power_up_frequency = tmp;
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,reference-div-factor", &tmp);
+       pdata->ref_div_factor = tmp;
+
+       ret = of_get_gpio(np, 0);
+       if (ret < 0)
+               pdata->gpio_lock_detect = -1;
+       else
+               pdata->gpio_lock_detect = ret;
+
+       pdata->ref_doubler_en = of_property_read_bool(np,
+                       "adi,reference-doubler-enable");
+       pdata->ref_div2_en = of_property_read_bool(np,
+                       "adi,reference-div2-enable");
+
+       /* r2_user_settings */
+       pdata->r2_user_settings = of_property_read_bool(np,
+                       "adi,phase-detector-polarity-positive-enable") ?
+                       ADF4350_REG2_PD_POLARITY_POS : 0;
+       pdata->r2_user_settings |= of_property_read_bool(np,
+                       "adi,lock-detect-precision-6ns-enable") ?
+                       ADF4350_REG2_LDP_6ns : 0;
+       pdata->r2_user_settings |= of_property_read_bool(np,
+                       "adi,lock-detect-function-integer-n-enable") ?
+                       ADF4350_REG2_LDF_INT_N : 0;
+
+       tmp = 2500;
+       of_property_read_u32(np, "adi,charge-pump-current", &tmp);
+       pdata->r2_user_settings |= ADF4350_REG2_CHARGE_PUMP_CURR_uA(tmp);
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,muxout-select", &tmp);
+       pdata->r2_user_settings |= ADF4350_REG2_MUXOUT(tmp);
+
+       pdata->r2_user_settings |= of_property_read_bool(np,
+                       "adi,low-spur-mode-enable") ?
+                       ADF4350_REG2_NOISE_MODE(0x3) : 0;
+
+       /* r3_user_settings */
+
+       pdata->r3_user_settings = of_property_read_bool(np,
+                       "adi,cycle-slip-reduction-enable") ?
+                       ADF4350_REG3_12BIT_CSR_EN : 0;
+       pdata->r3_user_settings |= of_property_read_bool(np,
+                       "adi,charge-cancellation-enable") ?
+                       ADF4351_REG3_CHARGE_CANCELLATION_EN : 0;
+
+       pdata->r3_user_settings |= of_property_read_bool(np,
+                       "adi,anti-backlash-3ns-enable") ?
+                       ADF4351_REG3_ANTI_BACKLASH_3ns_EN : 0;
+       pdata->r3_user_settings |= of_property_read_bool(np,
+                       "adi,band-select-clock-mode-high-enable") ?
+                       ADF4351_REG3_BAND_SEL_CLOCK_MODE_HIGH : 0;
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,12bit-clk-divider", &tmp);
+       pdata->r3_user_settings |= ADF4350_REG3_12BIT_CLKDIV(tmp);
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,clk-divider-mode", &tmp);
+       pdata->r3_user_settings |= ADF4350_REG3_12BIT_CLKDIV_MODE(tmp);
+
+       /* r4_user_settings */
+
+       pdata->r4_user_settings = of_property_read_bool(np,
+                       "adi,aux-output-enable") ?
+                       ADF4350_REG4_AUX_OUTPUT_EN : 0;
+       pdata->r4_user_settings |= of_property_read_bool(np,
+                       "adi,aux-output-fundamental-enable") ?
+                       ADF4350_REG4_AUX_OUTPUT_FUND : 0;
+       pdata->r4_user_settings |= of_property_read_bool(np,
+                       "adi,mute-till-lock-enable") ?
+                       ADF4350_REG4_MUTE_TILL_LOCK_EN : 0;
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,output-power", &tmp);
+       pdata->r4_user_settings |= ADF4350_REG4_OUTPUT_PWR(tmp);
+
+       tmp = 0;
+       of_property_read_u32(np, "adi,aux-output-power", &tmp);
+       pdata->r4_user_settings |= ADF4350_REG4_AUX_OUTPUT_PWR(tmp);
+
+       return pdata;
+}
+#else
+static struct adf4350_platform_data *adf4350_parse_dt(struct device *dev)
+{
+       return NULL;
+}
+#endif
+
 static int adf4350_probe(struct spi_device *spi)
 {
-       struct adf4350_platform_data *pdata = spi->dev.platform_data;
+       struct adf4350_platform_data *pdata;
        struct iio_dev *indio_dev;
        struct adf4350_state *st;
+       struct clk *clk = NULL;
        int ret;
 
+       if (spi->dev.of_node) {
+               pdata = adf4350_parse_dt(&spi->dev);
+               if (pdata == NULL)
+                       return -EINVAL;
+       } else {
+               pdata = spi->dev.platform_data;
+       }
+
        if (!pdata) {
                dev_warn(&spi->dev, "no platform data? using default\n");
-
                pdata = &default_pdata;
        }
 
+       if (!pdata->clkin) {
+               clk = clk_get(&spi->dev, "clkin");
+               if (IS_ERR(clk))
+                       return -EPROBE_DEFER;
+
+               ret = clk_prepare_enable(clk);
+               if (ret < 0)
+                       return ret;
+       }
+
        indio_dev = iio_device_alloc(sizeof(*st));
        if (indio_dev == NULL)
                return -ENOMEM;
@@ -395,7 +551,12 @@ static int adf4350_probe(struct spi_device *spi)
        indio_dev->num_channels = 1;
 
        st->chspc = pdata->channel_spacing;
-       st->clkin = pdata->clkin;
+       if (clk) {
+               st->clk = clk;
+               st->clkin = clk_get_rate(clk);
+       } else {
+               st->clkin = pdata->clkin;
+       }
 
        st->min_out_freq = spi_get_device_id(spi)->driver_data == 4351 ?
                ADF4351_MIN_OUT_FREQ : ADF4350_MIN_OUT_FREQ;
@@ -435,6 +596,8 @@ error_put_reg:
        if (!IS_ERR(st->reg))
                regulator_put(st->reg);
 
+       if (clk)
+               clk_disable_unprepare(clk);
        iio_device_free(indio_dev);
 
        return ret;
@@ -451,6 +614,9 @@ static int adf4350_remove(struct spi_device *spi)
 
        iio_device_unregister(indio_dev);
 
+       if (st->clk)
+               clk_disable_unprepare(st->clk);
+
        if (!IS_ERR(reg)) {
                regulator_disable(reg);
                regulator_put(reg);
@@ -481,6 +647,6 @@ static struct spi_driver adf4350_driver = {
 };
 module_spi_driver(adf4350_driver);
 
-MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
+MODULE_AUTHOR("Michael Hennerich <michael.hennerich@analog.com>");
 MODULE_DESCRIPTION("Analog Devices ADF4350/ADF4351 PLL");
 MODULE_LICENSE("GPL v2");
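
The clkin/refin handling above feeds the usual ADF435x frequency plan:
f_PFD = REFIN x (1 + doubler) / (R x (1 + div2)) and
f_out = f_PFD x (INT + FRAC / MOD) / 2^rf_div_sel. A plain-C sketch of that
relation, assuming the datasheet formula (not code from this patch):

	/* sketch: ADF435x RF output frequency from its register fields */
	static unsigned long long adf4350_out_frequency(unsigned long refin,
			unsigned int r_cnt, int doubler, int div2,
			unsigned int intval, unsigned int frac,
			unsigned int mod, unsigned int rf_div_sel)
	{
		unsigned long long fpfd;

		fpfd = (unsigned long long)refin * (doubler ? 2 : 1) /
		       (r_cnt * (div2 ? 2 : 1));
		return (fpfd * intval + fpfd * frac / mod) >> rf_div_sel;
	}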
index 6be4628faffec0a781ea4e1cdbde9dde7f916589..8498e9dcda6859a4ff88f40ed46007bc37ae3ac9 100644 (file)
@@ -10,6 +10,13 @@ config ADIS16080
          Say yes here to build support for Analog Devices ADIS16080, ADIS16100 Yaw
          Rate Gyroscope with SPI.
 
+config ADIS16130
+       tristate "Analog Devices ADIS16130 High Precision Angular Rate Sensor driver"
+       depends on SPI
+       help
+         Say yes here to build support for the Analog Devices ADIS16130 High
+         Precision Angular Rate Sensor.
+
 config ADIS16136
        tristate "Analog devices ADIS16136 and similar gyroscopes driver"
        depends on SPI_MASTER
@@ -47,7 +54,6 @@ config IIO_ST_GYRO_3AXIS
        select IIO_ST_GYRO_I2C_3AXIS if (I2C)
        select IIO_ST_GYRO_SPI_3AXIS if (SPI_MASTER)
        select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-       select IIO_ST_GYRO_BUFFER if (IIO_TRIGGERED_BUFFER)
        help
          Say yes here to build support for STMicroelectronics gyroscopes:
          L3G4200D, LSM330DL, L3GD20, L3GD20H, LSM330DLC, L3G4IS, LSM330.
index 225d289082e6a81d36175041aaec9ed1c3fd85a6..e9dc034aa18bc2deb6907c68b54e22b0b1490d26 100644 (file)
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_ADIS16080) += adis16080.o
+obj-$(CONFIG_ADIS16130) += adis16130.o
 obj-$(CONFIG_ADIS16136) += adis16136.o
 obj-$(CONFIG_ADXRS450) += adxrs450.o
 
similarity index 77%
rename from drivers/staging/iio/gyro/adis16130_core.c
rename to drivers/iio/gyro/adis16130.c
index 531b803cb2ac4c79dd79b1681604a0802439a737..129acdf801a407e3d7cf1db7dbffdd9d6925b46c 100644 (file)
@@ -6,18 +6,12 @@
  * Licensed under the GPL-2 or later.
  */
 
-#include <linux/delay.h>
 #include <linux/mutex.h>
-#include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/spi/spi.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/list.h>
 #include <linux/module.h>
 
 #include <linux/iio/iio.h>
-#include <linux/iio/sysfs.h>
 
 #define ADIS16130_CON         0x0
 #define ADIS16130_CON_RD      (1 << 6)
@@ -68,7 +62,6 @@ static int adis16130_spi_read(struct iio_dev *indio_dev, u8 reg_addr, u32 *val)
        spi_message_init(&msg);
        spi_message_add_tail(&xfer, &msg);
        ret = spi_sync(st->us, &msg);
-       ret = spi_read(st->us, st->buf, 4);
 
        if (ret == 0)
                *val = (st->buf[1] << 16) | (st->buf[2] << 8) | st->buf[3];
@@ -85,14 +78,47 @@ static int adis16130_read_raw(struct iio_dev *indio_dev,
        int ret;
        u32 temp;
 
-       /* Take the iio_dev status lock */
-       mutex_lock(&indio_dev->mlock);
-       ret =  adis16130_spi_read(indio_dev, chan->address, &temp);
-       mutex_unlock(&indio_dev->mlock);
-       if (ret)
-               return ret;
-       *val = temp;
-       return IIO_VAL_INT;
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               /* Take the iio_dev status lock */
+               mutex_lock(&indio_dev->mlock);
+               ret = adis16130_spi_read(indio_dev, chan->address, &temp);
+               mutex_unlock(&indio_dev->mlock);
+               if (ret)
+                       return ret;
+               *val = temp;
+               return IIO_VAL_INT;
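+       /* userspace derives the processed value as (raw + offset) * scale */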
+       case IIO_CHAN_INFO_SCALE:
+               switch (chan->type) {
+               case IIO_ANGL_VEL:
+                       /* 0 degree = 8388608, 250 degree = 14260608 */
+                       *val = 250;
+                       *val2 = 336440817; /* RAD_TO_DEGREE(14260608 - 8388608) */
+                       return IIO_VAL_FRACTIONAL;
+               case IIO_TEMP:
+                       /* 0C = 8036283, 105C = 9516048 */
+                       *val = 105000;
+                       *val2 = 9516048 - 8036283;
+                       return IIO_VAL_FRACTIONAL;
+               default:
+                       return -EINVAL;
+               }
+               break;
+       case IIO_CHAN_INFO_OFFSET:
+               switch (chan->type) {
+               case IIO_ANGL_VEL:
+                       *val = -8388608;
+                       return IIO_VAL_INT;
+               case IIO_TEMP:
+                       *val = -8036283;
+                       return IIO_VAL_INT;
+               default:
+                       return -EINVAL;
+               }
+               break;
+       }
+
+       return -EINVAL;
 }
 
 static const struct iio_chan_spec adis16130_channels[] = {
@@ -100,13 +126,17 @@ static const struct iio_chan_spec adis16130_channels[] = {
                .type = IIO_ANGL_VEL,
                .modified = 1,
                .channel2 = IIO_MOD_Z,
-               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+                       BIT(IIO_CHAN_INFO_SCALE) |
+                       BIT(IIO_CHAN_INFO_OFFSET),
                .address = ADIS16130_RATEDATA,
        }, {
                .type = IIO_TEMP,
                .indexed = 1,
                .channel = 0,
-               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+                       BIT(IIO_CHAN_INFO_SCALE) |
+                       BIT(IIO_CHAN_INFO_OFFSET),
                .address = ADIS16130_TEMPDATA,
        }
 };
@@ -153,7 +183,6 @@ error_ret:
        return ret;
 }
 
-/* fixme, confirm ordering in this function */
 static int adis16130_remove(struct spi_device *spi)
 {
        iio_device_unregister(spi_get_drvdata(spi));
index fa9b242199870facf4157251774268456cb81509..f9ed3488c3145125b4c6a5709180fc4fa32dbbce 100644 (file)
@@ -27,6 +27,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_gyro.h"
 
+#define ST_GYRO_NUMBER_DATA_CHANNELS           3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_GYRO_DEFAULT_OUT_X_L_ADDR           0x28
 #define ST_GYRO_DEFAULT_OUT_Y_L_ADDR           0x2a
 #define ST_GYRO_2_MULTIREAD_BIT                        true
 
 static const struct iio_chan_spec st_gyro_16bit_channels[] = {
-       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_X,
-               IIO_MOD_X, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-                                               ST_GYRO_DEFAULT_OUT_X_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_Y,
-               IIO_MOD_Y, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-                                               ST_GYRO_DEFAULT_OUT_Y_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL, ST_SENSORS_SCAN_Z,
-               IIO_MOD_Z, IIO_LE, ST_SENSORS_DEFAULT_16_REALBITS,
-                                               ST_GYRO_DEFAULT_OUT_Z_L_ADDR),
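+       /*
+        * ST_SENSORS_LSM_CHANNELS() arguments: type, info mask, scan index,
+        * modified flag, modifier, sign, endianness, real/storage bits and
+        * the output register address.
+        */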
+       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+                       ST_GYRO_DEFAULT_OUT_X_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+                       ST_GYRO_DEFAULT_OUT_Y_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_ANGL_VEL,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+                       ST_GYRO_DEFAULT_OUT_Z_L_ADDR),
        IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -310,6 +315,7 @@ int st_gyro_common_probe(struct iio_dev *indio_dev)
        if (err < 0)
                goto st_gyro_common_probe_error;
 
+       gdata->num_data_channels = ST_GYRO_NUMBER_DATA_CHANNELS;
        gdata->multiread_bit = gdata->sensor->multi_read_bit;
        indio_dev->channels = gdata->sensor->ch;
        indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;
index aaadd32f9f0d4da915d96c37fb33b52c6ef0febe..e73033f3839a5e1eceba486cadf2825b30fb7027 100644 (file)
@@ -542,8 +542,7 @@ int iio_update_buffers(struct iio_dev *indio_dev,
                ret = indio_dev->setup_ops->preenable(indio_dev);
                if (ret) {
                        printk(KERN_ERR
-                              "Buffer not started:"
-                              "buffer preenable failed\n");
+                              "Buffer not started: buffer preenable failed (%d)\n", ret);
                        goto error_remove_inserted;
                }
        }
@@ -556,8 +555,7 @@ int iio_update_buffers(struct iio_dev *indio_dev,
                        ret = buffer->access->request_update(buffer);
                        if (ret) {
                                printk(KERN_INFO
-                                      "Buffer not started:"
-                                      "buffer parameter update failed\n");
+                                      "Buffer not started: buffer parameter update failed (%d)\n", ret);
                                goto error_run_postdisable;
                        }
                }
@@ -566,7 +564,7 @@ int iio_update_buffers(struct iio_dev *indio_dev,
                        ->update_scan_mode(indio_dev,
                                           indio_dev->active_scan_mask);
                if (ret < 0) {
-                       printk(KERN_INFO "update scan mode failed\n");
+                       printk(KERN_INFO "Buffer not started: update scan mode failed (%d)\n", ret);
                        goto error_run_postdisable;
                }
        }
@@ -590,7 +588,7 @@ int iio_update_buffers(struct iio_dev *indio_dev,
                ret = indio_dev->setup_ops->postenable(indio_dev);
                if (ret) {
                        printk(KERN_INFO
-                              "Buffer not started: postenable failed\n");
+                              "Buffer not started: postenable failed (%d)\n", ret);
                        indio_dev->currentmode = INDIO_DIRECT_MODE;
                        if (indio_dev->setup_ops->postdisable)
                                indio_dev->setup_ops->postdisable(indio_dev);
index 80d68ff02d296df8ff1350208a720ed9e89fa4e5..cdc2cad0f01bfac76aa6a808428de2c4f221876d 100644 (file)
@@ -31,7 +31,7 @@
 #include "../common/hid-sensors/hid-sensor-trigger.h"
 
 /*Format: HID-SENSOR-usage_id_in_hex*/
-/*Usage ID from spec for Accelerometer-3D: 0x200041*/
+/*Usage ID from spec for Ambient-Light: 0x200041*/
 #define DRIVER_NAME "HID-SENSOR-200041"
 
 #define CHANNEL_SCAN_INDEX_ILLUM 0
index bd1cfb6666952dc91b142a404da430a9ea6344f7..c332b0ae4a3b8b47ad940532d4b1f4994c2c3029 100644 (file)
@@ -32,7 +32,6 @@ config IIO_ST_MAGN_3AXIS
        select IIO_ST_MAGN_I2C_3AXIS if (I2C)
        select IIO_ST_MAGN_SPI_3AXIS if (SPI_MASTER)
        select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
-       select IIO_ST_MAGN_BUFFER if (IIO_TRIGGERED_BUFFER)
        help
          Say yes here to build support for STMicroelectronics magnetometers:
          LSM303DLHC, LSM303DLM, LIS3MDL.
index af6c320a534ee8a0a7a5e03c5ed628928610603b..7105f22d6cd7f36890148327acc83f9584ef28ab 100644 (file)
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
+#include <linux/interrupt.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
 #include <linux/delay.h>
-
+#include <linux/bitops.h>
 #include <linux/gpio.h>
+#include <linux/of_gpio.h>
 
 #include <linux/iio/iio.h>
 #include <linux/iio/sysfs.h>
@@ -82,6 +84,7 @@
  */
 #define AK8975_MAX_CONVERSION_TIMEOUT  500
 #define AK8975_CONVERSION_DONE_POLL_TIME 10
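+/* Data-ready wait limit: 100 ms expressed in jiffies */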
+#define AK8975_DATA_READY_TIMEOUT      ((100*HZ)/1000)
 
 /*
  * Per-instance context data for the device.
@@ -94,6 +97,9 @@ struct ak8975_data {
        long                    raw_to_gauss[3];
        u8                      reg_cache[AK8975_MAX_REGS];
        int                     eoc_gpio;
+       int                     eoc_irq;
+       wait_queue_head_t       data_ready_queue;
+       unsigned long           flags;
 };
 
 static const int ak8975_index_to_reg[] = {
@@ -122,6 +128,51 @@ static int ak8975_write_data(struct i2c_client *client,
        return 0;
 }
 
+/*
+ * Handle data ready irq
+ */
+static irqreturn_t ak8975_irq_handler(int irq, void *data)
+{
+       struct ak8975_data *ak8975 = data;
+
+       set_bit(0, &ak8975->flags);
+       wake_up(&ak8975->data_ready_queue);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Install data ready interrupt handler
+ */
+static int ak8975_setup_irq(struct ak8975_data *data)
+{
+       struct i2c_client *client = data->client;
+       int rc;
+       int irq;
+
+       if (client->irq)
+               irq = client->irq;
+       else
+               irq = gpio_to_irq(data->eoc_gpio);
+
+       rc = request_irq(irq, ak8975_irq_handler,
+                        IRQF_TRIGGER_RISING | IRQF_ONESHOT,
+                        dev_name(&client->dev), data);
+       if (rc < 0) {
+               dev_err(&client->dev,
+                       "irq %d request failed, (gpio %d): %d\n",
+                       irq, data->eoc_gpio, rc);
+               return rc;
+       }
+
+       init_waitqueue_head(&data->data_ready_queue);
+       clear_bit(0, &data->flags);
+       data->eoc_irq = irq;
+
+       return rc;
+}
+
 /*
  * Perform some start-of-day setup, including reading the asa calibration
  * values and caching them.
@@ -170,6 +221,16 @@ static int ak8975_setup(struct i2c_client *client)
                                AK8975_REG_CNTL_MODE_POWER_DOWN,
                                AK8975_REG_CNTL_MODE_MASK,
                                AK8975_REG_CNTL_MODE_SHIFT);
+
+       /* Only install the data ready IRQ once power-down setup succeeded */
+       if (!ret && (data->eoc_gpio > 0 || client->irq)) {
+               int irq_ret = ak8975_setup_irq(data);
+
+               if (irq_ret < 0) {
+                       dev_err(&client->dev,
+                               "Error setting data ready interrupt\n");
+                       return irq_ret;
+               }
+       }
+
        if (ret < 0) {
                dev_err(&client->dev, "Error in setting power-down mode\n");
                return ret;
@@ -266,9 +327,23 @@ static int wait_conversion_complete_polled(struct ak8975_data *data)
                dev_err(&client->dev, "Conversion timeout happened\n");
                return -EINVAL;
        }
+
        return read_status;
 }
 
+/* Returns 0 if the end-of-conversion interrupt occurred or -ETIME otherwise */
+static int wait_conversion_complete_interrupt(struct ak8975_data *data)
+{
+       int ret;
+
+       ret = wait_event_timeout(data->data_ready_queue,
+                                test_bit(0, &data->flags),
+                                AK8975_DATA_READY_TIMEOUT);
+       clear_bit(0, &data->flags);
+
+       return ret > 0 ? 0 : -ETIME;
+}
+
 /*
  * Emits the raw flux value for the x, y, or z axis.
  */
@@ -294,13 +369,16 @@ static int ak8975_read_axis(struct iio_dev *indio_dev, int index, int *val)
        }
 
        /* Wait for the conversion to complete. */
-       if (gpio_is_valid(data->eoc_gpio))
+       if (data->eoc_irq)
+               ret = wait_conversion_complete_interrupt(data);
+       else if (gpio_is_valid(data->eoc_gpio))
                ret = wait_conversion_complete_gpio(data);
        else
                ret = wait_conversion_complete_polled(data);
        if (ret < 0)
                goto exit;
 
+       /* Only evaluated for the non-interrupt based waiting paths */
        if (ret & AK8975_REG_ST1_DRDY_MASK) {
                ret = i2c_smbus_read_byte_data(client, AK8975_REG_ST2);
                if (ret < 0) {
@@ -384,10 +462,15 @@ static int ak8975_probe(struct i2c_client *client,
        int err;
 
        /* Grab and set up the supplied GPIO. */
-       if (client->dev.platform_data == NULL)
-               eoc_gpio = -1;
-       else
+       if (client->dev.platform_data)
                eoc_gpio = *(int *)(client->dev.platform_data);
+       else if (client->dev.of_node)
+               eoc_gpio = of_get_gpio(client->dev.of_node, 0);
+       else
+               eoc_gpio = -1;
+
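+       /* of_get_gpio() returns -EPROBE_DEFER when the GPIO controller is not ready yet */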
+       if (eoc_gpio == -EPROBE_DEFER)
+               return -EPROBE_DEFER;
 
        /* We may not have a GPIO based IRQ to scan, that is fine, we will
           poll if so */
@@ -409,6 +492,11 @@ static int ak8975_probe(struct i2c_client *client,
        }
        data = iio_priv(indio_dev);
        i2c_set_clientdata(client, indio_dev);
+
+       data->client = client;
+       data->eoc_gpio = eoc_gpio;
+       data->eoc_irq = 0;
+
        /* Perform some basic start-of-day setup of the device. */
        err = ak8975_setup(client);
        if (err < 0) {
@@ -433,6 +521,8 @@ static int ak8975_probe(struct i2c_client *client,
 
 exit_free_iio:
+       if (data->eoc_irq)
+               free_irq(data->eoc_irq, data);
        iio_device_free(indio_dev);
 exit_gpio:
        if (gpio_is_valid(eoc_gpio))
                gpio_free(eoc_gpio);
@@ -447,6 +537,9 @@ static int ak8975_remove(struct i2c_client *client)
 
        iio_device_unregister(indio_dev);
 
+       if (data->eoc_irq)
+               free_irq(data->eoc_irq, data);
+
        if (gpio_is_valid(data->eoc_gpio))
                gpio_free(data->eoc_gpio);
 
index 16f0d6df239f0be75e64c388b38b3760ace5d901..ebfe8f11a0c26e62df3457bce88168a8ebc9620a 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/iio/common/st_sensors.h>
 #include "st_magn.h"
 
+#define ST_MAGN_NUMBER_DATA_CHANNELS           3
+
 /* DEFAULT VALUE FOR SENSORS */
 #define ST_MAGN_DEFAULT_OUT_X_L_ADDR           0X04
 #define ST_MAGN_DEFAULT_OUT_Y_L_ADDR           0X08
 #define ST_MAGN_2_OUT_Z_L_ADDR                 0x2c
 
 static const struct iio_chan_spec st_magn_16bit_channels[] = {
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_X_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_Y_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_DEFAULT_OUT_Z_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+                       ST_MAGN_DEFAULT_OUT_X_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+                       ST_MAGN_DEFAULT_OUT_Y_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+                       ST_MAGN_DEFAULT_OUT_Z_L_ADDR),
        IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
 static const struct iio_chan_spec st_magn_2_16bit_channels[] = {
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_X, IIO_MOD_X, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_X_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Y, IIO_MOD_Y, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_Y_L_ADDR),
-       ST_SENSORS_LSM_CHANNELS(IIO_MAGN, ST_SENSORS_SCAN_Z, IIO_MOD_Z, IIO_LE,
-               ST_SENSORS_DEFAULT_16_REALBITS, ST_MAGN_2_OUT_Z_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 1, IIO_MOD_X, 's', IIO_LE, 16, 16,
+                       ST_MAGN_2_OUT_X_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Y, 1, IIO_MOD_Y, 's', IIO_LE, 16, 16,
+                       ST_MAGN_2_OUT_Y_L_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_MAGN,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_Z, 1, IIO_MOD_Z, 's', IIO_LE, 16, 16,
+                       ST_MAGN_2_OUT_Z_L_ADDR),
        IIO_CHAN_SOFT_TIMESTAMP(3)
 };
 
@@ -344,6 +358,7 @@ int st_magn_common_probe(struct iio_dev *indio_dev)
        if (err < 0)
                goto st_magn_common_probe_error;
 
+       mdata->num_data_channels = ST_MAGN_NUMBER_DATA_CHANNELS;
        mdata->multiread_bit = mdata->sensor->multi_read_bit;
        indio_dev->channels = mdata->sensor->ch;
        indio_dev->num_channels = ST_SENSORS_NUMBER_ALL_CHANNELS;
diff --git a/drivers/iio/pressure/Kconfig b/drivers/iio/pressure/Kconfig
new file mode 100644 (file)
index 0000000..9427f01
--- /dev/null
@@ -0,0 +1,35 @@
+#
+# Pressure drivers
+#
+menu "Pressure Sensors"
+
+config IIO_ST_PRESS
+       tristate "STMicroelectronics pressures Driver"
+       depends on (I2C || SPI_MASTER) && SYSFS
+       select IIO_ST_SENSORS_CORE
+       select IIO_ST_PRESS_I2C if (I2C)
+       select IIO_ST_PRESS_SPI if (SPI_MASTER)
+       select IIO_TRIGGERED_BUFFER if (IIO_BUFFER)
+       help
+         Say yes here to build support for STMicroelectronics pressure
+         sensors: LPS331AP.
+
+         This driver can also be built as a module. If so, these modules
+         will be created:
+         - st_pressure (core functions of the driver [mandatory]);
+         - st_pressure_i2c (support for I2C devices [optional*]);
+         - st_pressure_spi (support for SPI devices [optional*]);
+
+         (*) at least one of these is needed for the driver to be usable.
+
+config IIO_ST_PRESS_I2C
+       tristate
+       depends on IIO_ST_PRESS
+       depends on IIO_ST_SENSORS_I2C
+
+config IIO_ST_PRESS_SPI
+       tristate
+       depends on IIO_ST_PRESS
+       depends on IIO_ST_SENSORS_SPI
+
+endmenu
diff --git a/drivers/iio/pressure/Makefile b/drivers/iio/pressure/Makefile
new file mode 100644 (file)
index 0000000..d4bb33e
--- /dev/null
@@ -0,0 +1,10 @@
+#
+# Makefile for industrial I/O pressure drivers
+#
+
+obj-$(CONFIG_IIO_ST_PRESS) += st_pressure.o
+st_pressure-y := st_pressure_core.o
+st_pressure-$(CONFIG_IIO_BUFFER) += st_pressure_buffer.o
+
+obj-$(CONFIG_IIO_ST_PRESS_I2C) += st_pressure_i2c.o
+obj-$(CONFIG_IIO_ST_PRESS_SPI) += st_pressure_spi.o
diff --git a/drivers/iio/pressure/st_pressure.h b/drivers/iio/pressure/st_pressure.h
new file mode 100644 (file)
index 0000000..414e45a
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * STMicroelectronics pressure sensors driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ * v. 1.0.0
+ * Licensed under the GPL-2.
+ */
+
+#ifndef ST_PRESS_H
+#define ST_PRESS_H
+
+#include <linux/types.h>
+#include <linux/iio/common/st_sensors.h>
+
+#define LPS331AP_PRESS_DEV_NAME                "lps331ap"
+
+int st_press_common_probe(struct iio_dev *indio_dev);
+void st_press_common_remove(struct iio_dev *indio_dev);
+
+#ifdef CONFIG_IIO_BUFFER
+int st_press_allocate_ring(struct iio_dev *indio_dev);
+void st_press_deallocate_ring(struct iio_dev *indio_dev);
+int st_press_trig_set_state(struct iio_trigger *trig, bool state);
+#define ST_PRESS_TRIGGER_SET_STATE (&st_press_trig_set_state)
+#else /* CONFIG_IIO_BUFFER */
+static inline int st_press_allocate_ring(struct iio_dev *indio_dev)
+{
+       return 0;
+}
+
+static inline void st_press_deallocate_ring(struct iio_dev *indio_dev)
+{
+}
+#define ST_PRESS_TRIGGER_SET_STATE NULL
+#endif /* CONFIG_IIO_BUFFER */
+
+#endif /* ST_PRESS_H */
diff --git a/drivers/iio/pressure/st_pressure_buffer.c b/drivers/iio/pressure/st_pressure_buffer.c
new file mode 100644 (file)
index 0000000..f877ef8
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * STMicroelectronics pressure sensors driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/interrupt.h>
+#include <linux/i2c.h>
+#include <linux/delay.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/buffer.h>
+#include <linux/iio/trigger_consumer.h>
+#include <linux/iio/triggered_buffer.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include "st_pressure.h"
+
+int st_press_trig_set_state(struct iio_trigger *trig, bool state)
+{
+       struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig);
+
+       return st_sensors_set_dataready_irq(indio_dev, state);
+}
+
+static int st_press_buffer_preenable(struct iio_dev *indio_dev)
+{
+       int err;
+
+       err = st_sensors_set_enable(indio_dev, true);
+       if (err < 0)
+               goto st_press_set_enable_error;
+
+       err = iio_sw_buffer_preenable(indio_dev);
+
+st_press_set_enable_error:
+       return err;
+}
+
+static int st_press_buffer_postenable(struct iio_dev *indio_dev)
+{
+       int err;
+       struct st_sensor_data *pdata = iio_priv(indio_dev);
+
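+       /*
+        * Scratch buffer the trigger handler fills with one scan
+        * before pushing it to the IIO buffer.
+        */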
+       pdata->buffer_data = kmalloc(indio_dev->scan_bytes, GFP_KERNEL);
+       if (pdata->buffer_data == NULL) {
+               err = -ENOMEM;
+               goto allocate_memory_error;
+       }
+
+       err = iio_triggered_buffer_postenable(indio_dev);
+       if (err < 0)
+               goto st_press_buffer_postenable_error;
+
+       return err;
+
+st_press_buffer_postenable_error:
+       kfree(pdata->buffer_data);
+allocate_memory_error:
+       return err;
+}
+
+static int st_press_buffer_predisable(struct iio_dev *indio_dev)
+{
+       int err;
+       struct st_sensor_data *pdata = iio_priv(indio_dev);
+
+       err = iio_triggered_buffer_predisable(indio_dev);
+       if (err < 0)
+               goto st_press_buffer_predisable_error;
+
+       err = st_sensors_set_enable(indio_dev, false);
+
+st_press_buffer_predisable_error:
+       kfree(pdata->buffer_data);
+       return err;
+}
+
+static const struct iio_buffer_setup_ops st_press_buffer_setup_ops = {
+       .preenable = &st_press_buffer_preenable,
+       .postenable = &st_press_buffer_postenable,
+       .predisable = &st_press_buffer_predisable,
+};
+
+int st_press_allocate_ring(struct iio_dev *indio_dev)
+{
+       return iio_triggered_buffer_setup(indio_dev, &iio_pollfunc_store_time,
+               &st_sensors_trigger_handler, &st_press_buffer_setup_ops);
+}
+
+void st_press_deallocate_ring(struct iio_dev *indio_dev)
+{
+       iio_triggered_buffer_cleanup(indio_dev);
+}
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures buffer");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/pressure/st_pressure_core.c b/drivers/iio/pressure/st_pressure_core.c
new file mode 100644 (file)
index 0000000..9c343b4
--- /dev/null
@@ -0,0 +1,272 @@
+/*
+ * STMicroelectronics pressure sensors driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/sysfs.h>
+#include <linux/iio/trigger.h>
+#include <linux/iio/buffer.h>
+#include <asm/unaligned.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include "st_pressure.h"
+
+#define ST_PRESS_MBAR_TO_KPASCAL(x)            ((x) * 10)
+#define ST_PRESS_NUMBER_DATA_CHANNELS          1
+
+/* DEFAULT VALUE FOR SENSORS */
+#define ST_PRESS_DEFAULT_OUT_XL_ADDR           0x28
+#define ST_TEMP_DEFAULT_OUT_L_ADDR             0x2b
+
+/* FULLSCALE */
+#define ST_PRESS_FS_AVL_1260MB                 1260
+
+/* CUSTOM VALUES FOR SENSOR 1 */
+#define ST_PRESS_1_WAI_EXP                     0xbb
+#define ST_PRESS_1_ODR_ADDR                    0x20
+#define ST_PRESS_1_ODR_MASK                    0x70
+#define ST_PRESS_1_ODR_AVL_1HZ_VAL             0x01
+#define ST_PRESS_1_ODR_AVL_7HZ_VAL             0x05
+#define ST_PRESS_1_ODR_AVL_13HZ_VAL            0x06
+#define ST_PRESS_1_ODR_AVL_25HZ_VAL            0x07
+#define ST_PRESS_1_PW_ADDR                     0x20
+#define ST_PRESS_1_PW_MASK                     0x80
+#define ST_PRESS_1_FS_ADDR                     0x23
+#define ST_PRESS_1_FS_MASK                     0x30
+#define ST_PRESS_1_FS_AVL_1260_VAL             0x00
+#define ST_PRESS_1_FS_AVL_1260_GAIN            ST_PRESS_MBAR_TO_KPASCAL(244141)
+#define ST_PRESS_1_FS_AVL_TEMP_GAIN            2083000
+#define ST_PRESS_1_BDU_ADDR                    0x20
+#define ST_PRESS_1_BDU_MASK                    0x04
+#define ST_PRESS_1_DRDY_IRQ_ADDR               0x22
+#define ST_PRESS_1_DRDY_IRQ_MASK               0x04
+#define ST_PRESS_1_MULTIREAD_BIT               true
+#define ST_PRESS_1_TEMP_OFFSET                 42500
+
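+/*
+ * One unsigned 24-bit pressure channel and one signed 16-bit temperature
+ * channel, as described by the 'u'/'s' and bit-width arguments below.
+ */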
+static const struct iio_chan_spec st_press_channels[] = {
+       ST_SENSORS_LSM_CHANNELS(IIO_PRESSURE,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
+                       ST_SENSORS_SCAN_X, 0, IIO_NO_MOD, 'u', IIO_LE, 24, 24,
+                       ST_PRESS_DEFAULT_OUT_XL_ADDR),
+       ST_SENSORS_LSM_CHANNELS(IIO_TEMP,
+                       BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE) |
+                                               BIT(IIO_CHAN_INFO_OFFSET),
+                       -1, 0, IIO_NO_MOD, 's', IIO_LE, 16, 16,
+                       ST_TEMP_DEFAULT_OUT_L_ADDR),
+       IIO_CHAN_SOFT_TIMESTAMP(1)
+};
+
+static const struct st_sensors st_press_sensors[] = {
+       {
+               .wai = ST_PRESS_1_WAI_EXP,
+               .sensors_supported = {
+                       [0] = LPS331AP_PRESS_DEV_NAME,
+               },
+               .ch = (struct iio_chan_spec *)st_press_channels,
+               .odr = {
+                       .addr = ST_PRESS_1_ODR_ADDR,
+                       .mask = ST_PRESS_1_ODR_MASK,
+                       .odr_avl = {
+                               { 1, ST_PRESS_1_ODR_AVL_1HZ_VAL, },
+                               { 7, ST_PRESS_1_ODR_AVL_7HZ_VAL, },
+                               { 13, ST_PRESS_1_ODR_AVL_13HZ_VAL, },
+                               { 25, ST_PRESS_1_ODR_AVL_25HZ_VAL, },
+                       },
+               },
+               .pw = {
+                       .addr = ST_PRESS_1_PW_ADDR,
+                       .mask = ST_PRESS_1_PW_MASK,
+                       .value_on = ST_SENSORS_DEFAULT_POWER_ON_VALUE,
+                       .value_off = ST_SENSORS_DEFAULT_POWER_OFF_VALUE,
+               },
+               .fs = {
+                       .addr = ST_PRESS_1_FS_ADDR,
+                       .mask = ST_PRESS_1_FS_MASK,
+                       .fs_avl = {
+                               [0] = {
+                                       .num = ST_PRESS_FS_AVL_1260MB,
+                                       .value = ST_PRESS_1_FS_AVL_1260_VAL,
+                                       .gain = ST_PRESS_1_FS_AVL_1260_GAIN,
+                                       .gain2 = ST_PRESS_1_FS_AVL_TEMP_GAIN,
+                               },
+                       },
+               },
+               .bdu = {
+                       .addr = ST_PRESS_1_BDU_ADDR,
+                       .mask = ST_PRESS_1_BDU_MASK,
+               },
+               .drdy_irq = {
+                       .addr = ST_PRESS_1_DRDY_IRQ_ADDR,
+                       .mask = ST_PRESS_1_DRDY_IRQ_MASK,
+               },
+               .multi_read_bit = ST_PRESS_1_MULTIREAD_BIT,
+               .bootime = 2,
+       },
+};
+
+static int st_press_read_raw(struct iio_dev *indio_dev,
+                       struct iio_chan_spec const *ch, int *val,
+                                                       int *val2, long mask)
+{
+       int err;
+       struct st_sensor_data *pdata = iio_priv(indio_dev);
+
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               err = st_sensors_read_info_raw(indio_dev, ch, val);
+               if (err < 0)
+                       goto read_error;
+
+               return IIO_VAL_INT;
+       case IIO_CHAN_INFO_SCALE:
+               *val = 0;
+
+               switch (ch->type) {
+               case IIO_PRESSURE:
+                       *val2 = pdata->current_fullscale->gain;
+                       break;
+               case IIO_TEMP:
+                       *val2 = pdata->current_fullscale->gain2;
+                       break;
+               default:
+                       err = -EINVAL;
+                       goto read_error;
+               }
+
+               return IIO_VAL_INT_PLUS_NANO;
+       case IIO_CHAN_INFO_OFFSET:
+               switch (ch->type) {
+               case IIO_TEMP:
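+                       /* offset is 42.5 degrees Celsius, returned as the fraction 425/10 */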
+                       *val = 425;
+                       *val2 = 10;
+                       break;
+               default:
+                       err = -EINVAL;
+                       goto read_error;
+               }
+
+               return IIO_VAL_FRACTIONAL;
+       default:
+               return -EINVAL;
+       }
+
+read_error:
+       return err;
+}
+
+static ST_SENSOR_DEV_ATTR_SAMP_FREQ();
+static ST_SENSORS_DEV_ATTR_SAMP_FREQ_AVAIL();
+
+static struct attribute *st_press_attributes[] = {
+       &iio_dev_attr_sampling_frequency_available.dev_attr.attr,
+       &iio_dev_attr_sampling_frequency.dev_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group st_press_attribute_group = {
+       .attrs = st_press_attributes,
+};
+
+static const struct iio_info press_info = {
+       .driver_module = THIS_MODULE,
+       .attrs = &st_press_attribute_group,
+       .read_raw = &st_press_read_raw,
+};
+
+#ifdef CONFIG_IIO_TRIGGER
+static const struct iio_trigger_ops st_press_trigger_ops = {
+       .owner = THIS_MODULE,
+       .set_trigger_state = ST_PRESS_TRIGGER_SET_STATE,
+};
+#define ST_PRESS_TRIGGER_OPS (&st_press_trigger_ops)
+#else
+#define ST_PRESS_TRIGGER_OPS NULL
+#endif
+
+int st_press_common_probe(struct iio_dev *indio_dev)
+{
+       int err;
+       struct st_sensor_data *pdata = iio_priv(indio_dev);
+
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->info = &press_info;
+
+       err = st_sensors_check_device_support(indio_dev,
+                               ARRAY_SIZE(st_press_sensors), st_press_sensors);
+       if (err < 0)
+               goto st_press_common_probe_error;
+
+       pdata->num_data_channels = ST_PRESS_NUMBER_DATA_CHANNELS;
+       pdata->multiread_bit = pdata->sensor->multi_read_bit;
+       indio_dev->channels = pdata->sensor->ch;
+       indio_dev->num_channels = ARRAY_SIZE(st_press_channels);
+
+       pdata->current_fullscale = (struct st_sensor_fullscale_avl *)
+                                               &pdata->sensor->fs.fs_avl[0];
+       pdata->odr = pdata->sensor->odr.odr_avl[0].hz;
+
+       err = st_sensors_init_sensor(indio_dev);
+       if (err < 0)
+               goto st_press_common_probe_error;
+
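+       /*
+        * Buffered capture and the trigger are only set up when a
+        * data-ready IRQ line is available.
+        */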
+       if (pdata->get_irq_data_ready(indio_dev) > 0) {
+               err = st_press_allocate_ring(indio_dev);
+               if (err < 0)
+                       goto st_press_common_probe_error;
+
+               err = st_sensors_allocate_trigger(indio_dev,
+                                                       ST_PRESS_TRIGGER_OPS);
+               if (err < 0)
+                       goto st_press_probe_trigger_error;
+       }
+
+       err = iio_device_register(indio_dev);
+       if (err)
+               goto st_press_device_register_error;
+
+       return err;
+
+st_press_device_register_error:
+       if (pdata->get_irq_data_ready(indio_dev) > 0)
+               st_sensors_deallocate_trigger(indio_dev);
+st_press_probe_trigger_error:
+       if (pdata->get_irq_data_ready(indio_dev) > 0)
+               st_press_deallocate_ring(indio_dev);
+st_press_common_probe_error:
+       return err;
+}
+EXPORT_SYMBOL(st_press_common_probe);
+
+void st_press_common_remove(struct iio_dev *indio_dev)
+{
+       struct st_sensor_data *pdata = iio_priv(indio_dev);
+
+       iio_device_unregister(indio_dev);
+       if (pdata->get_irq_data_ready(indio_dev) > 0) {
+               st_sensors_deallocate_trigger(indio_dev);
+               st_press_deallocate_ring(indio_dev);
+       }
+       iio_device_free(indio_dev);
+}
+EXPORT_SYMBOL(st_press_common_remove);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/pressure/st_pressure_i2c.c b/drivers/iio/pressure/st_pressure_i2c.c
new file mode 100644 (file)
index 0000000..7cebcc7
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ * STMicroelectronics pressure sensors driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include <linux/iio/iio.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include <linux/iio/common/st_sensors_i2c.h>
+#include "st_pressure.h"
+
+static int st_press_i2c_probe(struct i2c_client *client,
+                                               const struct i2c_device_id *id)
+{
+       struct iio_dev *indio_dev;
+       struct st_sensor_data *pdata;
+       int err;
+
+       indio_dev = iio_device_alloc(sizeof(*pdata));
+       if (indio_dev == NULL) {
+               err = -ENOMEM;
+               goto iio_device_alloc_error;
+       }
+
+       pdata = iio_priv(indio_dev);
+       pdata->dev = &client->dev;
+
+       st_sensors_i2c_configure(indio_dev, client, pdata);
+
+       err = st_press_common_probe(indio_dev);
+       if (err < 0)
+               goto st_press_common_probe_error;
+
+       return 0;
+
+st_press_common_probe_error:
+       iio_device_free(indio_dev);
+iio_device_alloc_error:
+       return err;
+}
+
+static int st_press_i2c_remove(struct i2c_client *client)
+{
+       st_press_common_remove(i2c_get_clientdata(client));
+
+       return 0;
+}
+
+static const struct i2c_device_id st_press_id_table[] = {
+       { LPS331AP_PRESS_DEV_NAME },
+       {},
+};
+MODULE_DEVICE_TABLE(i2c, st_press_id_table);
+
+static struct i2c_driver st_press_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "st-press-i2c",
+       },
+       .probe = st_press_i2c_probe,
+       .remove = st_press_i2c_remove,
+       .id_table = st_press_id_table,
+};
+module_i2c_driver(st_press_driver);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures i2c driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/pressure/st_pressure_spi.c b/drivers/iio/pressure/st_pressure_spi.c
new file mode 100644 (file)
index 0000000..17a1490
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * STMicroelectronics pressure sensors driver
+ *
+ * Copyright 2013 STMicroelectronics Inc.
+ *
+ * Denis Ciocca <denis.ciocca@st.com>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spi/spi.h>
+#include <linux/iio/iio.h>
+
+#include <linux/iio/common/st_sensors.h>
+#include <linux/iio/common/st_sensors_spi.h>
+#include "st_pressure.h"
+
+static int st_press_spi_probe(struct spi_device *spi)
+{
+       struct iio_dev *indio_dev;
+       struct st_sensor_data *pdata;
+       int err;
+
+       indio_dev = iio_device_alloc(sizeof(*pdata));
+       if (indio_dev == NULL) {
+               err = -ENOMEM;
+               goto iio_device_alloc_error;
+       }
+
+       pdata = iio_priv(indio_dev);
+       pdata->dev = &spi->dev;
+
+       st_sensors_spi_configure(indio_dev, spi, pdata);
+
+       err = st_press_common_probe(indio_dev);
+       if (err < 0)
+               goto st_press_common_probe_error;
+
+       return 0;
+
+st_press_common_probe_error:
+       iio_device_free(indio_dev);
+iio_device_alloc_error:
+       return err;
+}
+
+static int st_press_spi_remove(struct spi_device *spi)
+{
+       st_press_common_remove(spi_get_drvdata(spi));
+
+       return 0;
+}
+
+static const struct spi_device_id st_press_id_table[] = {
+       { LPS331AP_PRESS_DEV_NAME },
+       {},
+};
+MODULE_DEVICE_TABLE(spi, st_press_id_table);
+
+static struct spi_driver st_press_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = "st-press-spi",
+       },
+       .probe = st_press_spi_probe,
+       .remove = st_press_spi_remove,
+       .id_table = st_press_id_table,
+};
+module_spi_driver(st_press_driver);
+
+MODULE_AUTHOR("Denis Ciocca <denis.ciocca@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics pressures spi driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig
new file mode 100644 (file)
index 0000000..360fd50
--- /dev/null
@@ -0,0 +1,26 @@
+#
+# Industrial I/O standalone triggers
+#
+menu "Triggers - standalone"
+
+config IIO_INTERRUPT_TRIGGER
+       tristate "Generic interrupt trigger"
+       help
+         Provides support for using an interrupt of any type as an IIO
+         trigger. Such an interrupt may be provided by a GPIO driver, for example.
+
+         To compile this driver as a module, choose M here: the
+         module will be called iio-trig-interrupt.
+
+config IIO_SYSFS_TRIGGER
+       tristate "SYSFS trigger"
+       depends on SYSFS
+       select IRQ_WORK
+       help
+         Provides support for using a sysfs entry as an IIO trigger.
+         If unsure, say N (but it's safe to say "Y").
+
+         To compile this driver as a module, choose M here: the
+         module will be called iio-trig-sysfs.
+
+endmenu
diff --git a/drivers/iio/trigger/Makefile b/drivers/iio/trigger/Makefile
new file mode 100644 (file)
index 0000000..ce319a5
--- /dev/null
@@ -0,0 +1,6 @@
+#
+# Makefile for triggers not associated with iio-devices
+#
+
+obj-$(CONFIG_IIO_INTERRUPT_TRIGGER) += iio-trig-interrupt.o
+obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o
diff --git a/drivers/iio/trigger/iio-trig-interrupt.c b/drivers/iio/trigger/iio-trig-interrupt.c
new file mode 100644 (file)
index 0000000..02577ec
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * Industrial I/O - generic interrupt based trigger support
+ *
+ * Copyright (c) 2008-2013 Jonathan Cameron
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/iio/iio.h>
+#include <linux/iio/trigger.h>
+
+
+struct iio_interrupt_trigger_info {
+       unsigned int irq;
+};
+
+static irqreturn_t iio_interrupt_trigger_poll(int irq, void *private)
+{
+       /* Timestamp not currently provided */
+       iio_trigger_poll(private, 0);
+       return IRQ_HANDLED;
+}
+
+static const struct iio_trigger_ops iio_interrupt_trigger_ops = {
+       .owner = THIS_MODULE,
+};
+
+static int iio_interrupt_trigger_probe(struct platform_device *pdev)
+{
+       struct iio_interrupt_trigger_info *trig_info;
+       struct iio_trigger *trig;
+       unsigned long irqflags;
+       struct resource *irq_res;
+       int irq, ret = 0;
+
+       irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+
+       if (irq_res == NULL)
+               return -ENODEV;
+
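+       /* Keep the trigger sense from the IRQ resource and allow the line to be shared */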
+       irqflags = (irq_res->flags & IRQF_TRIGGER_MASK) | IRQF_SHARED;
+
+       irq = irq_res->start;
+
+       trig = iio_trigger_alloc("irqtrig%d", irq);
+       if (!trig) {
+               ret = -ENOMEM;
+               goto error_ret;
+       }
+
+       trig_info = kzalloc(sizeof(*trig_info), GFP_KERNEL);
+       if (!trig_info) {
+               ret = -ENOMEM;
+               goto error_put_trigger;
+       }
+       iio_trigger_set_drvdata(trig, trig_info);
+       trig_info->irq = irq;
+       trig->ops = &iio_interrupt_trigger_ops;
+       ret = request_irq(irq, iio_interrupt_trigger_poll,
+                         irqflags, trig->name, trig);
+       if (ret) {
+               dev_err(&pdev->dev, "request IRQ-%d failed\n", irq);
+               goto error_free_trig_info;
+       }
+
+       ret = iio_trigger_register(trig);
+       if (ret)
+               goto error_release_irq;
+       platform_set_drvdata(pdev, trig);
+
+       return 0;
+
+/* First clean up the partly allocated trigger */
+error_release_irq:
+       free_irq(irq, trig);
+error_free_trig_info:
+       kfree(trig_info);
+error_put_trigger:
+       iio_trigger_put(trig);
+error_ret:
+       return ret;
+}
+
+static int iio_interrupt_trigger_remove(struct platform_device *pdev)
+{
+       struct iio_trigger *trig;
+       struct iio_interrupt_trigger_info *trig_info;
+
+       trig = platform_get_drvdata(pdev);
+       trig_info = iio_trigger_get_drvdata(trig);
+       iio_trigger_unregister(trig);
+       free_irq(trig_info->irq, trig);
+       kfree(trig_info);
+       iio_trigger_put(trig);
+
+       return 0;
+}
+
+static struct platform_driver iio_interrupt_trigger_driver = {
+       .probe = iio_interrupt_trigger_probe,
+       .remove = iio_interrupt_trigger_remove,
+       .driver = {
+               .name = "iio_interrupt_trigger",
+               .owner = THIS_MODULE,
+       },
+};
+
+module_platform_driver(iio_interrupt_trigger_driver);
+
+MODULE_AUTHOR("Jonathan Cameron <jic23@kernel.org>");
+MODULE_DESCRIPTION("Interrupt trigger for the iio subsystem");
+MODULE_LICENSE("GPL v2");
similarity index 98%
rename from drivers/staging/iio/trigger/iio-trig-sysfs.c
rename to drivers/iio/trigger/iio-trig-sysfs.c
index b727bde8b7fe09babfc8211be0d61d5c210e2d7c..effcd0ac98d84021c3ba439555342d660a025ef0 100644 (file)
@@ -34,7 +34,7 @@ static ssize_t iio_sysfs_trig_add(struct device *dev,
        int ret;
        unsigned long input;
 
-       ret = strict_strtoul(buf, 10, &input);
+       ret = kstrtoul(buf, 10, &input);
        if (ret)
                return ret;
        ret = iio_sysfs_trigger_probe(input);
@@ -53,7 +53,7 @@ static ssize_t iio_sysfs_trig_remove(struct device *dev,
        int ret;
        unsigned long input;
 
-       ret = strict_strtoul(buf, 10, &input);
+       ret = kstrtoul(buf, 10, &input);
        if (ret)
                return ret;
        ret = iio_sysfs_trigger_remove(input);
index aefe820a8005585934f9d80b2d62a69b44555b29..f64b662c74dbfca1831149642cfbcd861efa0de5 100644 (file)
@@ -62,6 +62,8 @@ source "drivers/staging/line6/Kconfig"
 
 source "drivers/staging/octeon/Kconfig"
 
+source "drivers/staging/octeon-usb/Kconfig"
+
 source "drivers/staging/serqt_usb2/Kconfig"
 
 source "drivers/staging/vt6655/Kconfig"
@@ -140,4 +142,8 @@ source "drivers/staging/netlogic/Kconfig"
 
 source "drivers/staging/dwc2/Kconfig"
 
+source "drivers/staging/lustre/Kconfig"
+
+source "drivers/staging/btmtk_usb/Kconfig"
+
 endif # STAGING
index 415772ea306dd160a9c19477b557ff16fd746fe6..1fb58a1562cb4e2d44571ea3baa592be0557e4bf 100644 (file)
@@ -25,6 +25,7 @@ obj-$(CONFIG_LINE6_USB)               += line6/
 obj-$(CONFIG_NETLOGIC_XLR_NET) += netlogic/
 obj-$(CONFIG_USB_SERIAL_QUATECH2)      += serqt_usb2/
 obj-$(CONFIG_OCTEON_ETHERNET)  += octeon/
+obj-$(CONFIG_OCTEON_USB)       += octeon-usb/
 obj-$(CONFIG_VT6655)           += vt6655/
 obj-$(CONFIG_VT6656)           += vt6656/
 obj-$(CONFIG_VME_BUS)          += vme/
@@ -62,3 +63,5 @@ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/
 obj-$(CONFIG_ZCACHE)           += zcache/
 obj-$(CONFIG_GOLDFISH)         += goldfish/
 obj-$(CONFIG_USB_DWC2)         += dwc2/
+obj-$(CONFIG_LUSTRE_FS)                += lustre/
+obj-$(CONFIG_USB_BTMTK)                += btmtk_usb/
index e681bdd9aa5f46fb31e2a1390c3559387b333bf2..21a3f7250531c6a4e7891843f08f1919a31f890e 100644 (file)
@@ -704,7 +704,8 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 /* support of 32bit userspace on 64bit platforms */
 #ifdef CONFIG_COMPAT
-static long compat_ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long compat_ashmem_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg)
 {
 
        switch (cmd) {
index 4928f93bdf3d6a0a8525b00e985a651586bd1ffb..765c757b120f2ae76839701cc3fcc6b9410966f2 100644 (file)
@@ -160,7 +160,8 @@ static int sw_sync_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static long sw_sync_ioctl_create_fence(struct sw_sync_timeline *obj, unsigned long arg)
+static long sw_sync_ioctl_create_fence(struct sw_sync_timeline *obj,
+                                      unsigned long arg)
 {
        int fd = get_unused_fd();
        int err;
@@ -218,7 +219,8 @@ static long sw_sync_ioctl_inc(struct sw_sync_timeline *obj, unsigned long arg)
        return 0;
 }
 
-static long sw_sync_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static long sw_sync_ioctl(struct file *file, unsigned int cmd,
+                         unsigned long arg)
 {
        struct sw_sync_timeline *obj = file->private_data;
 
index 3893a35747698603cce0fe62ee7dcc1d1c333352..2996077fedef1673d67725b802b749af80d025b8 100644 (file)
@@ -125,9 +125,9 @@ static void sync_timeline_remove_pt(struct sync_pt *pt)
        spin_unlock_irqrestore(&obj->active_list_lock, flags);
 
        spin_lock_irqsave(&obj->child_list_lock, flags);
-       if (!list_empty(&pt->child_list)) {
+       if (!list_empty(&pt->child_list))
                list_del_init(&pt->child_list);
-       }
+
        spin_unlock_irqrestore(&obj->child_list_lock, flags);
 }
 
@@ -876,11 +876,11 @@ static void sync_print_pt(struct seq_file *s, struct sync_pt *pt, bool fence)
                        seq_printf(s, " / %s", value);
                }
        } else if (pt->parent->ops->print_pt) {
-               seq_printf(s, ": ");
+               seq_puts(s, ": ");
                pt->parent->ops->print_pt(s, pt);
        }
 
-       seq_printf(s, "\n");
+       seq_puts(s, "\n");
 }
 
 static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj)
@@ -895,11 +895,11 @@ static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj)
                obj->ops->timeline_value_str(obj, value, sizeof(value));
                seq_printf(s, ": %s", value);
        } else if (obj->ops->print_obj) {
-               seq_printf(s, ": ");
+               seq_puts(s, ": ");
                obj->ops->print_obj(s, obj);
        }
 
-       seq_printf(s, "\n");
+       seq_puts(s, "\n");
 
        spin_lock_irqsave(&obj->child_list_lock, flags);
        list_for_each(pos, &obj->child_list_head) {
@@ -940,7 +940,7 @@ static int sync_debugfs_show(struct seq_file *s, void *unused)
        unsigned long flags;
        struct list_head *pos;
 
-       seq_printf(s, "objs:\n--------------\n");
+       seq_puts(s, "objs:\n--------------\n");
 
        spin_lock_irqsave(&sync_timeline_list_lock, flags);
        list_for_each(pos, &sync_timeline_list_head) {
@@ -949,11 +949,11 @@ static int sync_debugfs_show(struct seq_file *s, void *unused)
                                     sync_timeline_list);
 
                sync_print_obj(s, obj);
-               seq_printf(s, "\n");
+               seq_puts(s, "\n");
        }
        spin_unlock_irqrestore(&sync_timeline_list_lock, flags);
 
-       seq_printf(s, "fences:\n--------------\n");
+       seq_puts(s, "fences:\n--------------\n");
 
        spin_lock_irqsave(&sync_fence_list_lock, flags);
        list_for_each(pos, &sync_fence_list_head) {
@@ -961,7 +961,7 @@ static int sync_debugfs_show(struct seq_file *s, void *unused)
                        container_of(pos, struct sync_fence, sync_fence_list);
 
                sync_print_fence(s, fence);
-               seq_printf(s, "\n");
+               seq_puts(s, "\n");
        }
        spin_unlock_irqrestore(&sync_fence_list_lock, flags);
        return 0;
@@ -988,7 +988,7 @@ late_initcall(sync_debugfs_init);
 
 #define DUMP_CHUNK 256
 static char sync_dump_buf[64 * 1024];
-void sync_dump(void)
+static void sync_dump(void)
 {
        struct seq_file s = {
                .buf = sync_dump_buf,
index d0a5a28a8fe22598e760788a63b257f5f0cc93c9..3654dc32a0c6ca4d98262809ce64653688ee43da 100644 (file)
@@ -50,9 +50,9 @@
 #define ASUS_OLED_DISP_HEIGHT          32
 #define ASUS_OLED_PACKET_BUF_SIZE      256
 
-#define USB_VENDOR_ID_ASUS      0x0b05
-#define USB_DEVICE_ID_ASUS_LCM      0x1726
-#define USB_DEVICE_ID_ASUS_LCM2     0x175b
+#define USB_VENDOR_ID_ASUS             0x0b05
+#define USB_DEVICE_ID_ASUS_LCM         0x1726
+#define USB_DEVICE_ID_ASUS_LCM2                0x175b
 
 MODULE_AUTHOR("Jakub Schmidtke, sjakub@gmail.com");
 MODULE_DESCRIPTION("Asus OLED Driver");
@@ -324,9 +324,11 @@ static void send_data(struct asus_oled_dev *odev)
                return;
 
        if (odev->pack_mode == PACK_MODE_G1) {
-               /* When sending roll-mode data the display updated only
-                  first packet.  I have no idea why, but when static picture
-                  is sent just before rolling picture everything works fine. */
+               /*
+                * When sending roll-mode data the display updated only
+                * first packet.  I have no idea why, but when static picture
+                * is sent just before rolling picture everything works fine.
+                */
                if (odev->pic_mode == ASUS_OLED_ROLL)
                        send_packets(odev->udev, packet, odev->buf,
                                     ASUS_OLED_STATIC, 2);
@@ -363,9 +365,11 @@ static int append_values(struct asus_oled_dev *odev, uint8_t val, size_t count)
 
                switch (odev->pack_mode) {
                case PACK_MODE_G1:
-                       /* i = (x/128)*640 + 127 - x + (y/8)*128;
-                          This one for 128 is the same, but might be better
-                          for different widths? */
+                       /*
+                        * i = (x/128)*640 + 127 - x + (y/8)*128;
+                        * This one for 128 is the same, but might be better
+                        * for different widths?
+                        */
                        i = (x/odev->dev_width)*640 +
                                odev->dev_width - 1 - x +
                                (y/8)*odev->dev_width;
@@ -383,10 +387,8 @@ static int append_values(struct asus_oled_dev *odev, uint8_t val, size_t count)
                }
 
                if (i >= odev->buf_size) {
-                       dev_err(odev->dev, "Buffer overflow! Report a bug:"
-                              "offs: %d >= %d i: %d (x: %d y: %d)\n",
-                              (int) odev->buf_offs, (int) odev->buf_size,
-                              (int) i, (int) x, (int) y);
+                       dev_err(odev->dev, "Buffer overflow! Report a bug: offs: %zu >= %zu i: %zu (x: %zu y: %zu)\n",
+                              odev->buf_offs, odev->buf_size, i, x, y);
                        return -EIO;
                }
 
@@ -401,7 +403,7 @@ static int append_values(struct asus_oled_dev *odev, uint8_t val, size_t count)
 
                default:
                        /* cannot get here; stops gcc complaining*/
-                       ;
+                       break;
                }
 
                odev->buf_offs++;
@@ -566,9 +568,11 @@ static ssize_t odev_set_picture(struct asus_oled_dev *odev,
                        if (ret < 0)
                                return ret;
                } else if (buf[offs] == '\n') {
-                       /* New line detected. Lets assume, that all characters
-                          till the end of the line were equal to the last
-                          character in this line.*/
+                       /*
+                        * New line detected. Lets assume, that all characters
+                        * till the end of the line were equal to the last
+                        * character in this line.
+                        */
                        if (odev->buf_offs % odev->width != 0)
                                ret = append_values(odev, odev->last_val,
                                                    odev->width -
index 35641e529396759d5e7c860457155b28fde21f94..f67a22536cbf6dc7843b583db8a47295518496c4 100644 (file)
@@ -13,7 +13,7 @@
 * Returns        - Zero(Success)
 ****************************************************************/
 
-static int bcm_char_open(struct inode *inode, struct file * filp)
+static int bcm_char_open(struct inode *inode, struct file *filp)
 {
        struct bcm_mini_adapter *Adapter = NULL;
        struct bcm_tarang_data *pTarang = NULL;
index a1bf21579d3f40e0ee05cc123bce80275f688be8..5347828660429917b1946f31391d52552a935fd4 100644 (file)
@@ -42,107 +42,95 @@ send to f/w with in 200 ms after the Idle/Shutdown req issued
 */
 
 
-int InterfaceIdleModeRespond(struct bcm_mini_adapter *Adapter, unsigned intpuiBuffer)
+int InterfaceIdleModeRespond(struct bcm_mini_adapter *Adapter, unsigned int *puiBuffer)
 {
        int     status = STATUS_SUCCESS;
        unsigned int    uiRegRead = 0;
        int bytes;
 
-       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"SubType of Message :0x%X", ntohl(*puiBuffer));
+       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "SubType of Message :0x%X", ntohl(*puiBuffer));
 
-       if(ntohl(*puiBuffer) == GO_TO_IDLE_MODE_PAYLOAD)
-       {
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL," Got GO_TO_IDLE_MODE_PAYLOAD(210) Msg Subtype");
-               if(ntohl(*(puiBuffer+1)) == 0 )
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Got IDLE MODE WAKE UP Response From F/W");
+       if (ntohl(*puiBuffer) == GO_TO_IDLE_MODE_PAYLOAD) {
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, " Got GO_TO_IDLE_MODE_PAYLOAD(210) Msg Subtype");
+               if (ntohl(*(puiBuffer+1)) == 0) {
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got IDLE MODE WAKE UP Response From F/W");
 
-                       status = wrmalt (Adapter,SW_ABORT_IDLEMODE_LOC, &uiRegRead, sizeof(uiRegRead));
-                       if(status)
-                       {
-                               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg");
+                       status = wrmalt(Adapter, SW_ABORT_IDLEMODE_LOC, &uiRegRead, sizeof(uiRegRead));
+                       if (status) {
+                               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg");
                                return status;
                        }
 
-                       if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-                       {
+                       if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
                                uiRegRead = 0x00000000 ;
-                               status = wrmalt (Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegRead, sizeof(uiRegRead));
-                               if(status)
-                               {
-                                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode     Reg");
+                               status = wrmalt(Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegRead, sizeof(uiRegRead));
+                               if (status) {
+                                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode    Reg");
                                        return status;
                                }
                        }
-                       //Below Register should not br read in case of Manual and Protocol Idle mode.
-                       else if(Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)
-                       {
-                               //clear on read Register
+                       /* The register below must not be read in Manual or Protocol Idle mode */
+                       else if (Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE) {
+                               /* clear on read Register */
                                bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG0, &uiRegRead, sizeof(uiRegRead));
                                if (bytes < 0) {
                                        status = bytes;
-                                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort Reg0");
+                                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort Reg0");
                                        return status;
                                }
-                               //clear on read Register
+                               /* clear on read Register */
                                bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG1, &uiRegRead, sizeof(uiRegRead));
                                if (bytes < 0) {
                                        status = bytes;
-                                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort     Reg1");
+                                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "rdm failed while clearing H/W Abort    Reg1");
                                        return status;
                                }
                        }
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device Up from Idle Mode");
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device Up from Idle Mode");
 
-                       // Set Idle Mode Flag to False and Clear IdleMode reg.
+                       /* Set Idle Mode Flag to False and Clear IdleMode reg. */
                        Adapter->IdleMode = FALSE;
                        Adapter->bTriedToWakeUpFromlowPowerMode = FALSE;
 
                        wake_up(&Adapter->lowpower_mode_wait_queue);
 
-               }
-               else
-               {
-                       if(TRUE == Adapter->IdleMode)
+               } else {
-                       {
+                       if (TRUE == Adapter->IdleMode) {
-                               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Device is already in Idle mode....");
+                               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Device is already in Idle mode....");
                                return status ;
                        }
 
                        uiRegRead = 0;
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got Req from F/W to go in IDLE mode \n");
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Got Req from F/W to go in IDLE mode\n");
 
-                       if (Adapter->chip_id== BCS220_2 ||
+                       if (Adapter->chip_id == BCS220_2 ||
                                Adapter->chip_id == BCS220_2BC ||
-                                       Adapter->chip_id== BCS250_BC ||
-                                       Adapter->chip_id== BCS220_3)
-                       {
+                                       Adapter->chip_id == BCS250_BC ||
+                                       Adapter->chip_id == BCS220_3) {
 
                                bytes = rdmalt(Adapter, HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
                                if (bytes < 0) {
                                        status = bytes;
-                                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "rdm failed while Reading HPM_CONFIG_LDO145 Reg 0\n");
+                                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "rdm failed while Reading HPM_CONFIG_LDO145 Reg 0\n");
                                        return status;
                                }
 
 
                                uiRegRead |= (1<<17);
 
-                               status = wrmalt (Adapter,HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
-                               if(status)
-                               {
-                                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg\n");
+                               status = wrmalt (Adapter, HPM_CONFIG_MSW, &uiRegRead, sizeof(uiRegRead));
+                               if (status) {
+                                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "wrm failed while clearing Idle Mode Reg\n");
                                        return status;
                                }
 
                        }
                        SendIdleModeResponse(Adapter);
                }
-       }
-       else if(ntohl(*puiBuffer) == IDLE_MODE_SF_UPDATE_MSG)
-       {
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "OverRiding Service Flow Params");
-               OverrideServiceFlowParams(Adapter,puiBuffer);
+       } else if (ntohl(*puiBuffer) == IDLE_MODE_SF_UPDATE_MSG) {
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "OverRiding Service Flow Params");
+               OverrideServiceFlowParams(Adapter, puiBuffer);
        }
        return status;
 }
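
A note on the mixed error checks in the handler above: in this driver wrmalt() returns a status code (zero on success) while rdmalt() returns the number of bytes read, or a negative errno on failure, which is why writes are checked with "if (status)" and reads with "if (bytes < 0)". A minimal sketch of the calling convention (SOME_REG is illustrative, not a real register name):

        unsigned int val = 0;
        int status, bytes;

        /* write: status-style return, 0 on success */
        status = wrmalt(Adapter, SOME_REG, &val, sizeof(val));
        if (status)
                return status;

        /* read: byte-count return, negative errno on failure */
        bytes = rdmalt(Adapter, SOME_REG, &val, sizeof(val));
        if (bytes < 0)
                return bytes;
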
@@ -152,46 +140,40 @@ static int InterfaceAbortIdlemode(struct bcm_mini_adapter *Adapter, unsigned int
        int     status = STATUS_SUCCESS;
        unsigned int value;
        unsigned int chip_id ;
-       unsigned long timeout = 0 ,itr = 0;
+       unsigned long timeout = 0, itr = 0;
 
        int     lenwritten = 0;
-       unsigned char aucAbortPattern[8]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
+       unsigned char aucAbortPattern[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
        struct bcm_interface_adapter *psInterfaceAdapter = Adapter->pvInterfaceAdapter;
 
-       //Abort Bus suspend if its already suspended
-       if((TRUE == psInterfaceAdapter->bSuspended) && (TRUE == Adapter->bDoSuspend))
-       {
+       /* Abort Bus suspend if it's already suspended */
+       if ((TRUE == psInterfaceAdapter->bSuspended) && (TRUE == Adapter->bDoSuspend)) {
                status = usb_autopm_get_interface(psInterfaceAdapter->interface);
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Bus got wakeup..Aborting Idle mode... status:%d \n",status);
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Bus got wakeup.. Aborting Idle mode... status:%d\n", status);
 
        }
 
-       if((Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
+       if ((Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
                                                                        ||
-          (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE))
-       {
-               //write the SW abort pattern.
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing pattern<%d> to SW_ABORT_IDLEMODE_LOC\n", Pattern);
-               status = wrmalt(Adapter,SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(Pattern));
-               if(status)
-               {
-                               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
+          (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)) {
+               /* write the SW abort pattern. */
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing pattern<%d> to SW_ABORT_IDLEMODE_LOC\n", Pattern);
+               status = wrmalt(Adapter, SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(Pattern));
+               if (status) {
+                               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
                                return status;
                }
        }
 
-       if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-       {
+       if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
                value = 0x80000000;
-               status = wrmalt(Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &value, sizeof(value));
-               if(status)
+               status = wrmalt(Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &value, sizeof(value));
-               {
+               if (status) {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Register failed");
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Register failed");
                        return status;
                }
-       }
-       else if(Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE)
-       {
+       } else if (Adapter->ulPowerSaveMode != DEVICE_POWERSAVE_MODE_AS_PROTOCOL_IDLE_MODE) {
                /*
                 * Get a Interrupt Out URB and send 8 Bytes Down
                 * To be Done in Thread Context.
@@ -204,43 +186,32 @@ static int InterfaceAbortIdlemode(struct bcm_mini_adapter *Adapter, unsigned int
                        8,
                        &lenwritten,
                        5000);
-               if(status)
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Sending Abort pattern down fails with status:%d..\n",status);
+               if (status) {
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Sending Abort pattern down fails with status:%d..\n", status);
                        return status;
-               }
-               else
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "NOB Sent down :%d", lenwritten);
+               } else {
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "NOB Sent down :%d", lenwritten);
                }
 
-               //mdelay(25);
+               /* mdelay(25); */
 
-               timeout= jiffies +  msecs_to_jiffies(50) ;
-               while( timeout > jiffies )
-               {
+               timeout = jiffies + msecs_to_jiffies(50);
+               while (timeout > jiffies) {
                        itr++ ;
                        rdmalt(Adapter, CHIP_ID_REG, &chip_id, sizeof(UINT));
-                       if(0xbece3200==(chip_id&~(0xF0)))
-                       {
+                       if (0xbece3200 == (chip_id & ~(0xF0)))
                                chip_id = chip_id&~(0xF0);
-                       }
-                       if(chip_id == Adapter->chip_id)
+                       if (chip_id == Adapter->chip_id)
                                break;
                }
-               if(timeout < jiffies )
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Not able to read chip-id even after 25 msec");
-               }
+               if (timeout < jiffies)
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Not able to read chip-id even after 50 msec");
                else
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Number of completed iteration to read chip-id :%lu", itr);
-               }
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Number of completed iteration to read chip-id :%lu", itr);
 
-               status = wrmalt(Adapter,SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(status));
-               if(status)
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
+               status = wrmalt(Adapter, SW_ABORT_IDLEMODE_LOC, &Pattern, sizeof(status));
+               if (status) {
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "WRM to Register SW_ABORT_IDLEMODE_LOC failed..");
                        return status;
                }
        }
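
The chip-id polling loop above compares jiffies directly ("timeout > jiffies"), which misbehaves once the jiffies counter wraps. A hedged sketch of the wrap-safe idiom, assuming the same 50 ms budget and the time_before()/time_after_eq() macros from <linux/jiffies.h>:

        unsigned long timeout = jiffies + msecs_to_jiffies(50);

        while (time_before(jiffies, timeout)) { /* wrap-safe comparison */
                itr++;
                rdmalt(Adapter, CHIP_ID_REG, &chip_id, sizeof(chip_id));
                if ((chip_id & ~0xF0) == 0xbece3200)
                        chip_id &= ~0xF0;
                if (chip_id == Adapter->chip_id)
                        break;
        }
        if (time_after_eq(jiffies, timeout))
                BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,
                                "Not able to read chip-id within 50 msec");
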
@@ -249,13 +220,10 @@ static int InterfaceAbortIdlemode(struct bcm_mini_adapter *Adapter, unsigned int
 int InterfaceIdleModeWakeup(struct bcm_mini_adapter *Adapter)
 {
        ULONG   Status = 0;
-       if(Adapter->bTriedToWakeUpFromlowPowerMode)
-       {
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Wake up already attempted.. ignoring\n");
-       }
-       else
-       {
-               BCM_DEBUG_PRINT(Adapter,DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL,"Writing Low Power Mode Abort pattern to the Device\n");
+       if (Adapter->bTriedToWakeUpFromlowPowerMode) {
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Wake up already attempted.. ignoring\n");
+       } else {
+               BCM_DEBUG_PRINT(Adapter, DBG_TYPE_OTHERS, IDLE_MODE, DBG_LVL_ALL, "Writing Low Power Mode Abort pattern to the Device\n");
                Adapter->bTriedToWakeUpFromlowPowerMode = TRUE;
                InterfaceAbortIdlemode(Adapter, Adapter->usIdleModePattern);
 
@@ -269,33 +237,30 @@ void InterfaceHandleShutdownModeWakeup(struct bcm_mini_adapter *Adapter)
        INT Status = 0;
        int bytes;
 
-       if(Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING)
-       {
-               // clear idlemode interrupt.
+       if (Adapter->ulPowerSaveMode == DEVICE_POWERSAVE_MODE_AS_MANUAL_CLOCK_GATING) {
+               /* clear idlemode interrupt. */
                uiRegVal = 0;
-               Status =wrmalt(Adapter,DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegVal, sizeof(uiRegVal));
-               if(Status)
-               {
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Failed with err :%d", Status);
+               Status = wrmalt(Adapter, DEBUG_INTERRUPT_GENERATOR_REGISTOR, &uiRegVal, sizeof(uiRegVal));
+               if (Status) {
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "WRM to DEBUG_INTERRUPT_GENERATOR_REGISTOR Failed with err :%d", Status);
                        return;
                }
        }
 
-    else
-       {
+       else {
 
-        //clear Interrupt EP registers.
-               bytes = rdmalt(Adapter,DEVICE_INT_OUT_EP_REG0, &uiRegVal, sizeof(uiRegVal));
+               /* clear Interrupt EP registers. */
+               bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG0, &uiRegVal, sizeof(uiRegVal));
                if (bytes < 0) {
                        Status = bytes;
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"RDM of DEVICE_INT_OUT_EP_REG0 failed with Err :%d", Status);
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "RDM of DEVICE_INT_OUT_EP_REG0 failed with Err :%d", Status);
                        return;
                }
 
-               bytes = rdmalt(Adapter,DEVICE_INT_OUT_EP_REG1, &uiRegVal, sizeof(uiRegVal));
+               bytes = rdmalt(Adapter, DEVICE_INT_OUT_EP_REG1, &uiRegVal, sizeof(uiRegVal));
                if (bytes < 0) {
                        Status = bytes;
-                       BCM_DEBUG_PRINT(Adapter,DBG_TYPE_PRINTK, 0, 0,"RDM of DEVICE_INT_OUT_EP_REG1 failed with Err :%d", Status);
+                       BCM_DEBUG_PRINT(Adapter, DBG_TYPE_PRINTK, 0, 0, "RDM of DEVICE_INT_OUT_EP_REG1 failed with Err :%d", Status);
                        return;
                }
        }
index a07b956b9ff58fdbe21a1ff6cdc9480837d5ed7b..f1cb9de734a6a939f46c8de93c36bc7d764a1ca7 100644 (file)
@@ -1,4 +1,3 @@
-
 /*Copyright (c) 2005 Beceem Communications Inc.
 
 Module Name:
@@ -17,7 +16,6 @@ Abstract:
 #define VER_FILETYPE                VFT_DRV
 #define VER_FILESUBTYPE             VFT2_DRV_NETWORK
 
-
 #define VER_FILEVERSION             5.2.45
 #define VER_FILEVERSION_STR         "5.2.45"
 
@@ -28,8 +26,4 @@ Abstract:
 #define VER_PRODUCTVERSION_STR      VER_FILEVERSION_STR
 
 
-
-
-//#include "common.ver"
-
-#endif         //VERSION_H
+#endif /* VERSION_H */
index be1f91d955aa2716c69cb8b87d9c9dfd55de0d99..d38a06f762df19765d014498dfbe736e9fb46e77 100644 (file)
@@ -1,70 +1,70 @@
 #include "headers.h"
-//-----------------------------------------------------------------------------
-// Procedure:  vendorextnGetSectionInfo
-//
-// Description: Finds the type of NVM used.
-//
-// Arguments:
-//             Adapter    - ptr to Adapter object instance
-//             pNVMType   - ptr to NVM type.
-// Returns:
-//             STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:  vendorextnGetSectionInfo
+ *
+ * Description: Finds the type of NVM used.
+ *
+ * Arguments:
+ *             Adapter    - ptr to Adapter object instance
+ *             pNVMType   - ptr to NVM type.
+ * Returns:
+ *             STATUS_SUCCESS/STATUS_FAILURE
+ *
+ */
 INT vendorextnGetSectionInfo(PVOID  pContext, struct bcm_flash2x_vendor_info *pVendorInfo)
 {
        return STATUS_FAILURE;
 }
 
-//-----------------------------------------------------------------------------
-// Procedure:   vendorextnInit
-//
-// Description: Initializing the vendor extension NVM interface
-//
-// Arguments:
-//              Adapter   - Pointer to MINI Adapter Structure.
-
-// Returns:
-//              STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:   vendorextnInit
+ *
+ * Description: Initializing the vendor extension NVM interface
+ *
+ * Arguments:
+ *              Adapter   - Pointer to MINI Adapter Structure
+ *
+ * Returns:
+ *              STATUS_SUCCESS/STATUS_FAILURE
+ *
+ */
 INT vendorextnInit(struct bcm_mini_adapter *Adapter)
 {
        return STATUS_SUCCESS;
 }
 
-//-----------------------------------------------------------------------------
-// Procedure:   vendorextnExit
-//
-// Description: Free the resource associated with vendor extension NVM interface
-//
-// Arguments:
-//              Adapter   - Pointer to MINI Adapter Structure.
-
-// Returns:
-//              STATUS_SUCCESS/STATUS_FAILURE
-//
-//-----------------------------------------------------------------------------
+/*
+ * Procedure:   vendorextnExit
+ *
+ * Description: Free the resource associated with vendor extension NVM interface
+ *
+ * Arguments:
+ *              Adapter   - Pointer to MINI Adapter Structure
+ *
+ * Returns:
+ *              STATUS_SUCCESS/STATUS_FAILURE
+ *
+ */
 INT vendorextnExit(struct bcm_mini_adapter *Adapter)
 {
        return STATUS_SUCCESS;
 }
 
-//------------------------------------------------------------------------
-// Procedure:  vendorextnIoctl
-//
-// Description:        execute the vendor extension specific ioctl
-//
-//Arguments:
-//             Adapter -Beceem private Adapter Structure
-//             cmd     -vendor extension specific Ioctl commad
-//             arg             -input parameter sent by vendor
-//
-// Returns:
-//             CONTINUE_COMMON_PATH in case it is not meant to be processed by vendor ioctls
-//             STATUS_SUCCESS/STATUS_FAILURE as per the IOCTL return value
-//
-//--------------------------------------------------------------------------
+/*
+ * Procedure:  vendorextnIoctl
+ *
+ * Description: execute the vendor extension specific ioctl
+ *
+ * Arguments:
+ *             Adapter -Beceem private Adapter Structure
+ *             cmd     -vendor extension specific Ioctl command
+ *             arg     -input parameter sent by vendor
+ *
+ * Returns:
+ *             CONTINUE_COMMON_PATH in case it is not meant to be processed by vendor ioctls
+ *             STATUS_SUCCESS/STATUS_FAILURE as per the IOCTL return value
+ */
+
 INT vendorextnIoctl(struct bcm_mini_adapter *Adapter, UINT cmd, ULONG arg)
 {
        return CONTINUE_COMMON_PATH;
@@ -72,22 +72,21 @@ INT vendorextnIoctl(struct bcm_mini_adapter *Adapter, UINT cmd, ULONG arg)
 
 
 
-//------------------------------------------------------------------
-// Procedure:  vendorextnReadSection
-//
-// Description: Reads from a section of NVM
-//
-// Arguments:
-//             pContext - ptr to Adapter object instance
-//             pBuffer - Read the data from Vendor Area to this buffer
-//             SectionVal   - Value of type of Section
-//             Offset - Read from the Offset of the Vendor Section.
-//             numOfBytes - Read numOfBytes from the Vendor section to Buffer
-//
-// Returns:
-//             STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:  vendorextnReadSection
+ *
+ * Description: Reads from a section of NVM
+ *
+ * Arguments:
+ *             pContext - ptr to Adapter object instance
+ *             pBuffer - Read the data from Vendor Area to this buffer
+ *             SectionVal   - Value of type of Section
+ *             Offset - Read from the Offset of the Vendor Section.
+ *             numOfBytes - Read numOfBytes from the Vendor section to Buffer
+ *
+ * Returns:
+ *             STATUS_SUCCESS/STATUS_FAILURE
+ */
 
 INT vendorextnReadSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
                        UINT offset, UINT numOfBytes)
@@ -97,23 +96,22 @@ INT vendorextnReadSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_sect
 
 
 
-//------------------------------------------------------------------
-// Procedure:  vendorextnWriteSection
-//
-// Description: Write to a Section of NVM
-//
-// Arguments:
-//             pContext - ptr to Adapter object instance
-//             pBuffer - Write the data provided in the buffer
-//             SectionVal   - Value of type of Section
-//             Offset - Writes to the Offset of the Vendor Section.
-//             numOfBytes - Write num Bytes after reading from pBuffer.
-//             bVerify - the Buffer Written should be verified.
-//
-// Returns:
-//             STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:  vendorextnWriteSection
+ *
+ * Description: Write to a Section of NVM
+ *
+ * Arguments:
+ *             pContext - ptr to Adapter object instance
+ *             pBuffer - Write the data provided in the buffer
+ *             SectionVal   - Value of type of Section
+ *             Offset - Writes to the Offset of the Vendor Section.
+ *             numOfBytes - Write num Bytes after reading from pBuffer.
+ *             bVerify - the Buffer Written should be verified.
+ *
+ * Returns:
+ *             STATUS_SUCCESS/STATUS_FAILURE
+ */
 INT vendorextnWriteSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
                        UINT offset, UINT numOfBytes, BOOLEAN bVerify)
 {
@@ -122,25 +120,23 @@ INT vendorextnWriteSection(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_sec
 
 
 
-//------------------------------------------------------------------
-// Procedure:  vendorextnWriteSectionWithoutErase
-//
-// Description: Write to a Section of NVM without erasing the sector
-//
-// Arguments:
-//             pContext - ptr to Adapter object instance
-//             pBuffer - Write the data provided in the buffer
-//             SectionVal   - Value of type of Section
-//             Offset - Writes to the Offset of the Vendor Section.
-//             numOfBytes - Write num Bytes after reading from pBuffer.
-//
-// Returns:
-//             STATUS_SUCCESS/STATUS_FAILURE
-//
-//------------------------------------------------------------------
+/*
+ * Procedure:  vendorextnWriteSectionWithoutErase
+ *
+ * Description: Write to a Section of NVM without erasing the sector
+ *
+ * Arguments:
+ *             pContext - ptr to Adapter object instance
+ *             pBuffer - Write the data provided in the buffer
+ *             SectionVal   - Value of type of Section
+ *             Offset - Writes to the Offset of the Vendor Section.
+ *             numOfBytes - Write num Bytes after reading from pBuffer.
+ *
+ * Returns:
+ *             STATUS_SUCCESS/STATUS_FAILURE
+ */
 INT vendorextnWriteSectionWithoutErase(PVOID  pContext, PUCHAR pBuffer, enum bcm_flash2x_section_val SectionVal,
                        UINT offset, UINT numOfBytes)
 {
        return STATUS_FAILURE;
 }
-
diff --git a/drivers/staging/btmtk_usb/Kconfig b/drivers/staging/btmtk_usb/Kconfig
new file mode 100644 (file)
index 0000000..a425ebd
--- /dev/null
@@ -0,0 +1,11 @@
+config USB_BTMTK
+       tristate "Mediatek Bluetooth support"
+       depends on USB && BT && m
+       ---help---
+         Say Y here if you wish to control an MTK USB Bluetooth device.
+
+         This option depends on 'USB' support being enabled.
+
+         To compile this driver as a module, choose M here: the
+         module will be called btmtk_usb.
+
diff --git a/drivers/staging/btmtk_usb/Makefile b/drivers/staging/btmtk_usb/Makefile
new file mode 100644 (file)
index 0000000..4d6c9d7
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_USB_BTMTK)        += btmtk_usb.o
diff --git a/drivers/staging/btmtk_usb/README b/drivers/staging/btmtk_usb/README
new file mode 100644 (file)
index 0000000..c046c8e
--- /dev/null
@@ -0,0 +1,14 @@
+-build driver modules
+       make
+
+-install driver modules
+       make install
+
+-remove driver modules
+       make clean
+
+-dynamic debug message
+       turn on CONFIG_DYNAMIC_DEBUG compiler flag for current kernel
+       mount -t debugfs none /sys/kernel/debug/
+       echo "module module_name +p" > /sys/kernel/debug/dynamic_debug/control (turns on debug messages; module_name is e.g. btmtk_usb)
+       echo "module module_name -p" > /sys/kernel/debug/dynamic_debug/control (turns off debug messages; module_name is e.g. btmtk_usb)
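
The dynamic debug knobs above act on pr_debug()/dev_dbg() call sites compiled in with CONFIG_DYNAMIC_DEBUG. A minimal sketch of such a call site in driver code (the function name and message are illustrative only):

        #include <linux/printk.h>

        static void btmtk_example_dbg(int state)
        {
                /* silent by default; enabled per-module at runtime via
                 * /sys/kernel/debug/dynamic_debug/control */
                pr_debug("btmtk_usb: state changed to %d\n", state);
        }
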
diff --git a/drivers/staging/btmtk_usb/TODO b/drivers/staging/btmtk_usb/TODO
new file mode 100644 (file)
index 0000000..a71d129
--- /dev/null
@@ -0,0 +1,10 @@
+TODO:
+       - checkpatch.pl clean
+       - determine if the driver should not be using a duplicate
+         version of the usb-bluetooth interface code, but should
+         be merged into the drivers/bluetooth/ directory and
+         infrastructure instead.
+       - review by the bluetooth developer community
+
+Please send any patches for this driver to Yu-Chen, Cho <acho@suse.com> and
+jay.hung@mediatek.com
diff --git a/drivers/staging/btmtk_usb/btmtk_usb.c b/drivers/staging/btmtk_usb/btmtk_usb.c
new file mode 100644 (file)
index 0000000..0e783e8
--- /dev/null
@@ -0,0 +1,1784 @@
+/*
+ *  MediaTek Bluetooth USB Driver
+ *
+ *  Copyright (C) 2013, MediaTek co.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *  or on the worldwide web at http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/completion.h>
+#include <linux/firmware.h>
+#include <linux/usb.h>
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "btmtk_usb.h"
+
+#define VERSION "1.0.4"
+#define MT7650_FIRMWARE        "mt7650.bin"
+#define MT7662_FIRMWARE        "mt7662.bin"
+
+static struct usb_driver btmtk_usb_driver;
+
+static int btmtk_usb_load_rom_patch(struct btmtk_usb_data *);
+static int btmtk_usb_load_fw(struct btmtk_usb_data *);
+
+static void hex_dump(char *str, u8 *src_buf, u32 src_buf_len)
+{
+       unsigned char *pt;
+       int x;
+
+       pt = src_buf;
+
+       BT_DBG("%s: %p, len = %d\n", str, src_buf, src_buf_len);
+
+       for (x = 0; x < src_buf_len; x++) {
+               if (x % 16 == 0)
+                       BT_DBG("0x%04x : ", x);
+               BT_DBG("%02x ", ((unsigned char)pt[x]));
+               if (x % 16 == 15)
+                       BT_DBG("\n");
+       }
+
+       BT_DBG("\n");
+}
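
One caveat on hex_dump() above: each BT_DBG() expands to its own printk line, so the per-byte prints will not form a contiguous dump. The kernel's stock helper produces the intended layout; a hedged equivalent (the "btmtk: " prefix is illustrative, not taken from the driver):

        print_hex_dump(KERN_DEBUG, "btmtk: ", DUMP_PREFIX_OFFSET,
                       16, 1, src_buf, src_buf_len, true);
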
+
+static int btmtk_usb_reset(struct usb_device *udev)
+{
+       int ret;
+
+       BT_DBG("%s\n", __func__);
+
+       ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x01, DEVICE_VENDOR_REQUEST_OUT,
+                                                 0x01, 0x00, NULL, 0x00, CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0) {
+               BT_ERR("%s error(%d)\n", __func__, ret);
+               return ret;
+       }
+
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
+static int btmtk_usb_io_read32(struct btmtk_usb_data *data, u32 reg, u32 *val)
+{
+       u8 request = data->r_request;
+       struct usb_device *udev = data->udev;
+       int ret;
+
+       ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), request, DEVICE_VENDOR_REQUEST_IN,
+                                                 0x0, reg, data->io_buf, 4,
+                                                 CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0) {
+               *val = 0xffffffff;
+               BT_ERR("%s error(%d), reg=%x, value=%x\n", __func__, ret, reg, *val);
+               return ret;
+       }
+
+       memmove(val, data->io_buf, 4);
+
+       *val = le32_to_cpu(*val);
+
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
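
The read helper copies the 4-byte little-endian register value out of io_buf and then byte-swaps it in place. An equivalent and arguably clearer form, assuming <asm/unaligned.h> is available:

        #include <asm/unaligned.h>

        /* replaces the memmove() + le32_to_cpu() pair above */
        *val = get_unaligned_le32(data->io_buf);
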
+
+static int btmtk_usb_io_write32(struct btmtk_usb_data *data, u32 reg, u32 val)
+{
+       u16 value, index;
+       u8 request = data->w_request;
+       struct usb_device *udev = data->udev;
+       int ret;
+
+       index = (u16)reg;
+       value = val & 0x0000ffff;
+
+       ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), request, DEVICE_VENDOR_REQUEST_OUT,
+                                                 value, index, NULL, 0,
+                                                 CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0) {
+               BT_ERR("%s error(%d), reg=%x, value=%x\n", __func__, ret, reg, val);
+               return ret;
+       }
+
+       index = (u16)(reg + 2);
+       value = (val & 0xffff0000) >> 16;
+
+       ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
+                               request, DEVICE_VENDOR_REQUEST_OUT,
+                               value, index, NULL, 0, CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0) {
+               BT_ERR("%s error(%d), reg=%x, value=%x\n", __func__, ret, reg, val);
+               return ret;
+       }
+
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
+static int btmtk_usb_switch_iobase(struct btmtk_usb_data *data, int base)
+{
+       int ret = 0;
+
+       switch (base) {
+       case SYSCTL:
+               data->w_request = 0x42;
+               data->r_request = 0x47;
+               break;
+       case WLAN:
+               data->w_request = 0x02;
+               data->r_request = 0x07;
+               break;
+
+       default:
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+static void btmtk_usb_cap_init(struct btmtk_usb_data *data)
+{
+       const struct firmware *firmware = NULL; /* request_firmware() leaves this untouched on failure */
+       struct usb_device   *udev = data->udev;
+       int ret;
+
+       btmtk_usb_io_read32(data, 0x00, &data->chip_id);
+
+       BT_DBG("chip id = %x\n", data->chip_id);
+
+       if (is_mt7630(data) || is_mt7650(data)) {
+               data->need_load_fw = 1;
+               data->need_load_rom_patch = 0;
+               ret = request_firmware(&firmware, MT7650_FIRMWARE, &udev->dev);
+               if (ret < 0) {
+                       if (ret == -ENOENT) {
+                               BT_ERR("Firmware file \"%s\" not found\n", MT7650_FIRMWARE);
+                       } else {
+                               BT_ERR("Firmware file \"%s\" request failed (err=%d)\n",
+                                       MT7650_FIRMWARE, ret);
+                       }
+               } else {
+                       BT_DBG("Firmware file \"%s\" Found\n", MT7650_FIRMWARE);
+                       /* load firmware here */
+                       data->firmware = firmware;
+                       btmtk_usb_load_fw(data);
+               }
+               release_firmware(firmware);
+       } else if (is_mt7632(data) || is_mt7662(data)) {
+               data->need_load_fw = 0;
+               data->need_load_rom_patch = 1;
+               data->rom_patch_offset = 0x90000;
+               ret = request_firmware(&firmware, MT7662_FIRMWARE, &udev->dev);
+               if (ret < 0) {
+                       if (ret == -ENOENT) {
+                               BT_ERR("Firmware file \"%s\" not found\n", MT7662_FIRMWARE);
+                       } else {
+                               BT_ERR("Firmware file \"%s\" request failed (err=%d)\n",
+                                       MT7662_FIRMWARE, ret);
+                       }
+               } else {
+                       BT_DBG("Firmware file \"%s\" Found\n", MT7662_FIRMWARE);
+                       /* load rom patch here */
+                       data->firmware = firmware;
+                       data->rom_patch_len = firmware->size;
+                       btmtk_usb_load_rom_patch(data);
+               }
+               release_firmware(firmware);
+       } else {
+               BT_ERR("unknown chip(%x)\n", data->chip_id);
+       }
+}
+
+static u16 checksume16(u8 *pData, int len)
+{
+       int sum = 0;
+
+       while (len > 1) {
+               sum += *((u16 *)pData);
+
+               pData = pData + 2;
+
+               if (sum & 0x80000000)
+                       sum = (sum & 0xFFFF) + (sum >> 16);
+
+               len -= 2;
+       }
+
+       if (len)
+               sum += *((u8 *)pData);
+
+       while (sum >> 16)
+               sum = (sum & 0xFFFF) + (sum >> 16);
+
+       return ~sum;
+}
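
checksume16() is the classic ones'-complement (RFC 1071 style) checksum: 16-bit words are accumulated, carries are folded back in, and the result is inverted. A handy consequence is that a buffer with its own checksum appended re-sums to zero; a sketch (buffer contents and sizes are illustrative):

        u8  buf[66];                            /* 64-byte payload + 2-byte checksum */
        u16 csum = checksume16(buf, 64);        /* sum over the payload */

        memcpy(&buf[64], &csum, 2);             /* append in host memory order */

        /* an intact buffer now verifies to zero */
        if (checksume16(buf, 66) != 0)
                BT_ERR("checksum mismatch\n");
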
+
+static int btmtk_usb_chk_crc(struct btmtk_usb_data *data, u32 checksum_len)
+{
+       int ret = 0;
+       struct usb_device *udev = data->udev;
+
+       BT_DBG("%s\n", __func__);
+
+       memmove(data->io_buf, &data->rom_patch_offset, 4);
+       memmove(&data->io_buf[4], &checksum_len, 4);
+
+       ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x1, DEVICE_VENDOR_REQUEST_IN,
+                                                 0x20, 0x00, data->io_buf, 8,
+                                                 CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0)
+               BT_ERR("%s error(%d)\n", __func__, ret);
+
+       return ret;
+}
+
+static u16 btmtk_usb_get_crc(struct btmtk_usb_data *data)
+{
+       int ret = 0;
+       struct usb_device *udev = data->udev;
+       u16 crc, count = 0;
+
+       BT_DBG("%s\n", __func__);
+
+       while (1) {
+               ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
+                                       0x01, DEVICE_VENDOR_REQUEST_IN,
+                                       0x21, 0x00, data->io_buf, 2,
+                                       CONTROL_TIMEOUT_JIFFIES);
+
+               if (ret < 0) {
+                       crc = 0xFFFF;
+                       BT_ERR("%s error(%d)\n", __func__, ret);
+               }
+
+               memmove(&crc, data->io_buf, 2);
+
+               crc = le16_to_cpu(crc);
+
+               if (crc != 0xFFFF)
+                       break;
+
+               mdelay(100);
+
+               if (count++ > 100) {
+                       BT_ERR("Query CRC over %d times\n", count);
+                       break;
+               }
+       }
+
+       return crc;
+}
+
+static int btmtk_usb_reset_wmt(struct btmtk_usb_data *data)
+{
+       int ret = 0;
+
+       /* reset command */
+       u8 cmd[8] = {0x6F, 0xFC, 0x05, 0x01, 0x07, 0x01, 0x00, 0x04};
+
+       memmove(data->io_buf, cmd, 8);
+
+       BT_DBG("%s\n", __func__);
+
+       ret = usb_control_msg(data->udev, usb_sndctrlpipe(data->udev, 0), 0x01,
+                               DEVICE_CLASS_REQUEST_OUT, 0x12, 0x00, data->io_buf,
+                               8, CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret)
+               BT_ERR("%s:(%d)\n", __func__, ret);
+
+       return ret;
+}
+
+static void load_rom_patch_complete(struct urb *urb)
+{
+       struct completion *sent_to_mcu_done = (struct completion *)urb->context;
+
+       complete(sent_to_mcu_done);
+}
+
+static int btmtk_usb_load_rom_patch(struct btmtk_usb_data *data)
+{
+       u32 loop = 0;
+       u32 value;
+       s32 sent_len;
+       int ret = 0, total_checksum = 0;
+       struct urb *urb;
+       u32 patch_len = 0;
+       u32 cur_len = 0;
+       dma_addr_t data_dma;
+       struct completion sent_to_mcu_done;
+       int first_block = 1;
+       unsigned char phase;
+       void *buf;
+       char *pos;
+       unsigned int pipe = usb_sndbulkpipe(data->udev, data->bulk_tx_ep->bEndpointAddress);
+
+       if (!data->firmware) {
+               BT_ERR("%s:please assign a rom patch\n", __func__);
+               return -1;
+       }
+
+load_patch_protect:
+       btmtk_usb_switch_iobase(data, WLAN);
+       btmtk_usb_io_read32(data, SEMAPHORE_03, &value);
+       loop++;
+
+       if (((value & 0x01) == 0x00) && (loop < 600)) {
+               mdelay(1);
+               goto load_patch_protect;
+       }
+
+       btmtk_usb_io_write32(data, 0x1004, 0x2c);
+
+       btmtk_usb_switch_iobase(data, SYSCTL);
+
+       btmtk_usb_io_write32(data, 0x1c, 0x30);
+
+       /* Enable USB_DMA_CFG */
+       btmtk_usb_io_write32(data, 0x9018, 0x00c00020);
+
+       btmtk_usb_switch_iobase(data, WLAN);
+
+       /* check ROM patch if upgrade */
+       btmtk_usb_io_read32(data, COM_REG0, &value);
+
+       if ((value & 0x02) == 0x02)
+               goto error0;
+
+       urb = usb_alloc_urb(0, GFP_ATOMIC);
+
+       if (!urb) {
+               ret = -ENOMEM;
+               goto error0;
+       }
+
+       buf = usb_alloc_coherent(data->udev, UPLOAD_PATCH_UNIT, GFP_ATOMIC, &data_dma);
+
+       if (!buf) {
+               ret = -ENOMEM;
+               goto error1;
+       }
+
+       pos = buf;
+       BT_DBG("loading rom patch");
+
+       init_completion(&sent_to_mcu_done);
+
+       cur_len = 0x00;
+       patch_len = data->rom_patch_len - PATCH_INFO_SIZE;
+
+       /* loading rom patch */
+       while (1) {
+               s32 sent_len_max = UPLOAD_PATCH_UNIT - PATCH_HEADER_SIZE;
+               sent_len = (patch_len - cur_len) >= sent_len_max ? sent_len_max : (patch_len - cur_len);
+
+               BT_DBG("patch_len = %d\n", patch_len);
+               BT_DBG("cur_len = %d\n", cur_len);
+               BT_DBG("sent_len = %d\n", sent_len);
+
+               if (sent_len > 0) {
+                       if (first_block == 1) {
+                               if (sent_len < sent_len_max)
+                                       phase = PATCH_PHASE3;
+                               else
+                                       phase = PATCH_PHASE1;
+                               first_block = 0;
+                       } else if (sent_len == sent_len_max) {
+                               phase = PATCH_PHASE2;
+                       } else {
+                               phase = PATCH_PHASE3;
+                       }
+
+                       /* prepare HCI header */
+                       pos[0] = 0x6F;
+                       pos[1] = 0xFC;
+                       pos[2] = (sent_len + 5) & 0xFF;
+                       pos[3] = ((sent_len + 5) >> 8) & 0xFF;
+
+                       /* prepare WMT header */
+                       pos[4] = 0x01;
+                       pos[5] = 0x01;
+                       pos[6] = (sent_len + 1) & 0xFF;
+                       pos[7] = ((sent_len + 1) >> 8) & 0xFF;
+
+                       pos[8] = phase;
+
+                       memcpy(&pos[9], data->firmware->data + PATCH_INFO_SIZE + cur_len, sent_len);
+
+                       BT_DBG("sent_len + PATCH_HEADER_SIZE = %d, phase = %d\n",
+                                       sent_len + PATCH_HEADER_SIZE, phase);
+
+                       usb_fill_bulk_urb(urb,
+                                       data->udev,
+                                       pipe,
+                                       buf,
+                                       sent_len + PATCH_HEADER_SIZE,
+                                       load_rom_patch_complete,
+                                       &sent_to_mcu_done);
+
+                       urb->transfer_dma = data_dma;
+                       urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+                       ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+                       if (ret)
+                               goto error2;
+
+                       if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+                               usb_kill_urb(urb);
+                               BT_ERR("upload rom_patch timeout\n");
+                               goto error2;
+                       }
+
+                       BT_DBG(".");
+
+                       mdelay(200);
+
+                       cur_len += sent_len;
+
+               } else {
+                       break;
+               }
+       }
+
+       total_checksum = checksume16((u8 *)data->firmware->data + PATCH_INFO_SIZE, patch_len);
+
+       BT_DBG("Send checksum req..\n");
+
+       btmtk_usb_chk_crc(data, patch_len);
+
+       mdelay(20);
+
+       if (total_checksum != btmtk_usb_get_crc(data)) {
+               BT_ERR("checksum fail!, local(0x%x) <> fw(0x%x)\n",
+                               total_checksum, btmtk_usb_get_crc(data));
+               ret = -1;
+               goto error2;
+       }
+
+       mdelay(20);
+
+       ret = btmtk_usb_reset_wmt(data);
+
+       mdelay(20);
+
+error2:
+       usb_free_coherent(data->udev, UPLOAD_PATCH_UNIT, buf, data_dma);
+error1:
+       usb_free_urb(urb);
+error0:
+       btmtk_usb_io_write32(data, SEMAPHORE_03, 0x1);
+       return ret;
+}
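
For reference, a hedged reconstruction of the per-block layout assembled in the upload loop above (the struct and field names are ours, inferred from the byte writes; both lengths are little-endian):

        struct wmt_patch_block {        /* hypothetical name */
                u8     hci_hdr[2];      /* 0x6F 0xFC: vendor HCI command */
                __le16 hci_len;         /* sent_len + 5 (WMT header + phase) */
                u8     wmt_hdr[2];      /* 0x01 0x01 */
                __le16 wmt_len;         /* sent_len + 1 (phase byte) */
                u8     phase;           /* PATCH_PHASE1/2/3 */
                u8     payload[];       /* patch fragment */
        } __packed;
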
+
+
+static int load_fw_iv(struct btmtk_usb_data *data)
+{
+       int ret;
+       struct usb_device *udev = data->udev;
+       char *buf = kmalloc(64, GFP_ATOMIC);
+
+       if (!buf)
+               return -ENOMEM;
+
+       memmove(buf, data->firmware->data + 32, 64);
+
+       ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x01,
+                                                 DEVICE_VENDOR_REQUEST_OUT, 0x12, 0x0, buf, 64,
+                                                 CONTROL_TIMEOUT_JIFFIES);
+
+       if (ret < 0) {
+               BT_ERR("%s error(%d) step4\n", __func__, ret);
+               kfree(buf);
+               return ret;
+       }
+
+       if (ret > 0)
+               ret = 0;
+
+       kfree(buf);
+
+       return ret;
+}
+
+static void load_fw_complete(struct urb *urb)
+{
+       struct completion *sent_to_mcu_done = (struct completion *)urb->context;
+
+       complete(sent_to_mcu_done);
+}
+
+static int btmtk_usb_load_fw(struct btmtk_usb_data *data)
+{
+       struct usb_device *udev = data->udev;
+       struct urb *urb;
+       void *buf;
+       u32 cur_len = 0;
+       u32 packet_header = 0;
+       u32 value;
+       u32 ilm_len = 0, dlm_len = 0;
+       u16 fw_ver, build_ver;
+       u32 loop = 0;
+       dma_addr_t data_dma;
+       int ret = 0, sent_len;
+       struct completion sent_to_mcu_done;
+       unsigned int pipe = usb_sndbulkpipe(data->udev, data->bulk_tx_ep->bEndpointAddress);
+
+       if (!data->firmware) {
+               BT_ERR("%s:please assign a fw\n", __func__);
+               return -1;
+       }
+
+       BT_DBG("bulk_tx_ep = %x\n", data->bulk_tx_ep->bEndpointAddress);
+
+loadfw_protect:
+       btmtk_usb_switch_iobase(data, WLAN);
+       btmtk_usb_io_read32(data, SEMAPHORE_00, &value);
+       loop++;
+
+       if (((value & 0x1) == 0) && (loop < 10000))
+               goto loadfw_protect;
+
+       /* check MCU if ready */
+       btmtk_usb_io_read32(data, COM_REG0, &value);
+
+       if ((value & 0x01) == 0x01)
+               goto error0;
+
+       /* Enable MPDMA TX and EP2 load FW mode */
+       btmtk_usb_io_write32(data, 0x238, 0x1c000000);
+
+       btmtk_usb_reset(udev);
+       mdelay(100);
+
+       ilm_len = (*(data->firmware->data + 3) << 24)
+                       | (*(data->firmware->data + 2) << 16)
+                       | (*(data->firmware->data + 1) << 8)
+                       | (*data->firmware->data);
+
+       dlm_len = (*(data->firmware->data + 7) << 24)
+                       | (*(data->firmware->data + 6) << 16)
+                       | (*(data->firmware->data + 5) << 8)
+                       | (*(data->firmware->data + 4));
+
+       fw_ver = (*(data->firmware->data + 11) << 8) | (*(data->firmware->data + 10));
+
+       build_ver = (*(data->firmware->data + 9) << 8) | (*(data->firmware->data + 8));
+
+       BT_DBG("fw version:%d.%d.%02d ",
+                       (fw_ver & 0xf000) >> 12,
+                       (fw_ver & 0x0f00) >> 8,
+                       (fw_ver & 0x00ff));
+
+       BT_DBG("build:%x\n", build_ver);
+
+       BT_DBG("build Time =");
+
+       for (loop = 0; loop < 16; loop++)
+               BT_DBG("%c", *(data->firmware->data + 16 + loop));
+
+       BT_DBG("\n");
+
+       BT_DBG("ILM length = %d(bytes)\n", ilm_len);
+       BT_DBG("DLM length = %d(bytes)\n", dlm_len);
+
+       btmtk_usb_switch_iobase(data, SYSCTL);
+
+       /* U2M_PDMA rx_ring_base_ptr */
+       btmtk_usb_io_write32(data, 0x790, 0x400230);
+
+       /* U2M_PDMA rx_ring_max_cnt */
+       btmtk_usb_io_write32(data, 0x794, 0x1);
+
+       /* U2M_PDMA cpu_idx */
+       btmtk_usb_io_write32(data, 0x798, 0x1);
+
+       /* U2M_PDMA enable */
+       btmtk_usb_io_write32(data, 0x704, 0x44);
+
+       urb = usb_alloc_urb(0, GFP_ATOMIC);
+
+       if (!urb) {
+               ret = -ENOMEM;
+               goto error1;
+       }
+
+       buf = usb_alloc_coherent(udev, 14592, GFP_ATOMIC, &data_dma);
+
+       if (!buf) {
+               ret = -ENOMEM;
+               goto error2;
+       }
+
+       BT_DBG("loading fw");
+
+       init_completion(&sent_to_mcu_done);
+
+       btmtk_usb_switch_iobase(data, SYSCTL);
+
+       cur_len = 0x40;
+
+       /* Loading ILM */
+       while (1) {
+               sent_len = (ilm_len - cur_len) >= 14336 ? 14336 : (ilm_len - cur_len);
+
+               if (sent_len > 0) {
+                       packet_header &= ~(0xffffffff);
+                       packet_header |= (sent_len << 16);
+                       packet_header = cpu_to_le32(packet_header);
+
+                       memmove(buf, &packet_header, 4);
+                       memmove(buf + 4, data->firmware->data + 32 + cur_len, sent_len);
+
+                       /* U2M_PDMA descriptor */
+                       btmtk_usb_io_write32(data, 0x230, cur_len);
+
+                       while ((sent_len % 4) != 0)
+                               sent_len++;
+
+                       /* U2M_PDMA length */
+                       btmtk_usb_io_write32(data, 0x234, sent_len << 16);
+
+                       usb_fill_bulk_urb(urb,
+                                       udev,
+                                       pipe,
+                                       buf,
+                                       sent_len + 4,
+                                       load_fw_complete,
+                                       &sent_to_mcu_done);
+
+                       urb->transfer_dma = data_dma;
+                       urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+                       ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+                       if (ret)
+                               goto error3;
+
+                       if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+                               usb_kill_urb(urb);
+                               BT_ERR("upload ilm fw timeout\n");
+                               goto error3;
+                       }
+
+                       BT_DBG(".");
+
+                       mdelay(200);
+
+                       cur_len += sent_len;
+               } else {
+                       break;
+               }
+       }
+
+       init_completion(&sent_to_mcu_done);
+       cur_len = 0x00;
+
+       /* Loading DLM */
+       while (1) {
+               sent_len = (dlm_len - cur_len) >= 14336 ? 14336 : (dlm_len - cur_len);
+
+               if (sent_len > 0) {
+                       packet_header &= ~(0xffffffff);
+                       packet_header |= (sent_len << 16);
+                       packet_header = cpu_to_le32(packet_header);
+
+                       memmove(buf, &packet_header, 4);
+                       memmove(buf + 4, data->firmware->data + 32 + ilm_len + cur_len, sent_len);
+
+                       /* U2M_PDMA descriptor */
+                       btmtk_usb_io_write32(data, 0x230, 0x80000 + cur_len);
+
+                       while ((sent_len % 4) != 0) {
+                               BT_DBG("sent_len is not divided by 4\n");
+                               sent_len++;
+                       }
+
+                       /* U2M_PDMA length */
+                       btmtk_usb_io_write32(data, 0x234, sent_len << 16);
+
+                       usb_fill_bulk_urb(urb,
+                                       udev,
+                                       pipe,
+                                       buf,
+                                       sent_len + 4,
+                                       load_fw_complete,
+                                       &sent_to_mcu_done);
+
+                       urb->transfer_dma = data_dma;
+                       urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+                       ret = usb_submit_urb(urb, GFP_ATOMIC);
+
+                       if (ret)
+                               goto error3;
+
+                       if (!wait_for_completion_timeout(&sent_to_mcu_done, msecs_to_jiffies(1000))) {
+                               usb_kill_urb(urb);
+                               BT_ERR("upload dlm fw timeout\n");
+                               goto error3;
+                       }
+
+                       BT_DBG(".");
+
+                       mdelay(500);
+
+                       cur_len += sent_len;
+
+               } else {
+                       break;
+               }
+       }
+
+       /* upload 64bytes interrupt vector */
+       ret = load_fw_iv(data);
+       mdelay(100);
+
+       btmtk_usb_switch_iobase(data, WLAN);
+
+       /* check MCU if ready */
+       loop = 0;
+
+       do {
+               btmtk_usb_io_read32(data, COM_REG0, &value);
+
+               if (value == 0x01)
+                       break;
+
+               mdelay(10);
+               loop++;
+       } while (loop <= 100);
+
+       if (loop > 100) {
+               BT_ERR("MCU not ready after 100 polls\n");
+               ret = -ENODEV;
+       }
+
+error3:
+       usb_free_coherent(udev, 14592, buf, data_dma);
+error2:
+       usb_free_urb(urb);
+error1:
+       /* Disable load fw mode */
+       btmtk_usb_io_read32(data, 0x238, &value);
+       value = value & ~(0x10000000);
+       btmtk_usb_io_write32(data, 0x238, value);
+error0:
+       btmtk_usb_io_write32(data, SEMAPHORE_00, 0x1);
+       return ret;
+}
+
+static int inc_tx(struct btmtk_usb_data *data)
+{
+       unsigned long flags;
+       int rv;
+
+       spin_lock_irqsave(&data->txlock, flags);
+       rv = test_bit(BTUSB_SUSPENDING, &data->flags);
+       if (!rv)
+               data->tx_in_flight++;
+       spin_unlock_irqrestore(&data->txlock, flags);
+
+       return rv;
+}
+
+static void btmtk_usb_intr_complete(struct urb *urb)
+{
+       struct hci_dev *hdev = urb->context;
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       int err;
+
+       BT_DBG("%s: %s urb %p status %d count %d\n", __func__, hdev->name,
+                                       urb, urb->status, urb->actual_length);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               return;
+
+       if (urb->status == 0) {
+               hdev->stat.byte_rx += urb->actual_length;
+
+               hex_dump("hci event", urb->transfer_buffer, urb->actual_length);
+
+               if (hci_recv_fragment(hdev, HCI_EVENT_PKT,
+                                               urb->transfer_buffer,
+                                               urb->actual_length) < 0) {
+                       BT_ERR("%s corrupted event packet", hdev->name);
+                       hdev->stat.err_rx++;
+               }
+       }
+
+       if (!test_bit(BTUSB_INTR_RUNNING, &data->flags))
+               return;
+
+       usb_mark_last_busy(data->udev);
+       usb_anchor_urb(urb, &data->intr_anchor);
+
+       err = usb_submit_urb(urb, GFP_ATOMIC);
+
+       if (err < 0) {
+               /* -EPERM: urb is being killed;
+                * -ENODEV: device got disconnected */
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p failed to resubmit (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+}
+
+static int btmtk_usb_submit_intr_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       struct urb *urb;
+       unsigned char *buf;
+       unsigned int pipe;
+       int err, size;
+
+       BT_DBG("%s\n", __func__);
+
+       if (!data->intr_ep)
+               return -ENODEV;
+
+       urb = usb_alloc_urb(0, mem_flags);
+       if (!urb)
+               return -ENOMEM;
+
+       size = le16_to_cpu(data->intr_ep->wMaxPacketSize);
+
+       buf = kmalloc(size, mem_flags);
+       if (!buf) {
+               usb_free_urb(urb);
+               return -ENOMEM;
+       }
+
+       pipe = usb_rcvintpipe(data->udev, data->intr_ep->bEndpointAddress);
+
+       usb_fill_int_urb(urb, data->udev, pipe, buf, size,
+                                               btmtk_usb_intr_complete, hdev,
+                                               data->intr_ep->bInterval);
+
+       urb->transfer_flags |= URB_FREE_BUFFER;
+
+       usb_anchor_urb(urb, &data->intr_anchor);
+
+       err = usb_submit_urb(urb, mem_flags);
+       if (err < 0) {
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p submission failed (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+
+       usb_free_urb(urb);
+
+       return err;
+
+}
+
+static void btmtk_usb_bulk_in_complete(struct urb *urb)
+{
+       struct hci_dev *hdev = urb->context;
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       int err;
+
+       BT_DBG("%s:%s urb %p status %d count %d", __func__, hdev->name,
+                                       urb, urb->status, urb->actual_length);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               return;
+
+       if (urb->status == 0) {
+               hdev->stat.byte_rx += urb->actual_length;
+
+               if (hci_recv_fragment(hdev, HCI_ACLDATA_PKT,
+                                               urb->transfer_buffer,
+                                               urb->actual_length) < 0) {
+                       BT_ERR("%s corrupted ACL packet", hdev->name);
+                       hdev->stat.err_rx++;
+               }
+       }
+
+       if (!test_bit(BTUSB_BULK_RUNNING, &data->flags))
+               return;
+
+       usb_anchor_urb(urb, &data->bulk_anchor);
+       usb_mark_last_busy(data->udev);
+
+       err = usb_submit_urb(urb, GFP_ATOMIC);
+       if (err < 0) {
+               /* -EPERM: urb is being killed;
+                * -ENODEV: device got disconnected */
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p failed to resubmit (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+}
+
+static int btmtk_usb_submit_bulk_in_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       struct urb *urb;
+       unsigned char *buf;
+       unsigned int pipe;
+       int err, size = HCI_MAX_FRAME_SIZE;
+
+       BT_DBG("%s:%s\n", __func__, hdev->name);
+
+       if (!data->bulk_rx_ep)
+               return -ENODEV;
+
+       urb = usb_alloc_urb(0, mem_flags);
+       if (!urb)
+               return -ENOMEM;
+
+       buf = kmalloc(size, mem_flags);
+       if (!buf) {
+               usb_free_urb(urb);
+               return -ENOMEM;
+       }
+
+       pipe = usb_rcvbulkpipe(data->udev, data->bulk_rx_ep->bEndpointAddress);
+
+       usb_fill_bulk_urb(urb, data->udev, pipe,
+                                       buf, size, btmtk_usb_bulk_in_complete, hdev);
+
+       urb->transfer_flags |= URB_FREE_BUFFER;
+
+       usb_mark_last_busy(data->udev);
+       usb_anchor_urb(urb, &data->bulk_anchor);
+
+       err = usb_submit_urb(urb, mem_flags);
+       if (err < 0) {
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p submission failed (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+
+       usb_free_urb(urb);
+
+       return err;
+}
+
+static void btmtk_usb_isoc_in_complete(struct urb *urb)
+{
+       struct hci_dev *hdev = urb->context;
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       int i, err;
+
+       BT_DBG("%s: %s urb %p status %d count %d", __func__, hdev->name,
+                                       urb, urb->status, urb->actual_length);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               return;
+
+       if (urb->status == 0) {
+               for (i = 0; i < urb->number_of_packets; i++) {
+                       unsigned int offset = urb->iso_frame_desc[i].offset;
+                       unsigned int length = urb->iso_frame_desc[i].actual_length;
+
+                       if (urb->iso_frame_desc[i].status)
+                               continue;
+
+                       hdev->stat.byte_rx += length;
+
+                       if (hci_recv_fragment(hdev, HCI_SCODATA_PKT,
+                                               urb->transfer_buffer + offset,
+                                                               length) < 0) {
+                               BT_ERR("%s corrupted SCO packet", hdev->name);
+                               hdev->stat.err_rx++;
+                       }
+               }
+       }
+
+       if (!test_bit(BTUSB_ISOC_RUNNING, &data->flags))
+               return;
+
+       usb_anchor_urb(urb, &data->isoc_anchor);
+
+       err = usb_submit_urb(urb, GFP_ATOMIC);
+       if (err < 0) {
+               /* -EPERM: urb is being killed;
+                * -ENODEV: device got disconnected */
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p failed to resubmit (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+}
+
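+/* Split a transfer of len bytes into at most BTUSB_MAX_ISOC_FRAMES
+ * descriptors of up to mtu bytes each; a tail shorter than mtu gets
+ * one final, shorter descriptor.
+ */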
+static inline void __fill_isoc_descriptor(struct urb *urb, int len, int mtu)
+{
+       int i, offset = 0;
+
+       BT_DBG("len %d mtu %d", len, mtu);
+
+       for (i = 0; i < BTUSB_MAX_ISOC_FRAMES && len >= mtu;
+                                       i++, offset += mtu, len -= mtu) {
+               urb->iso_frame_desc[i].offset = offset;
+               urb->iso_frame_desc[i].length = mtu;
+       }
+
+       if (len && i < BTUSB_MAX_ISOC_FRAMES) {
+               urb->iso_frame_desc[i].offset = offset;
+               urb->iso_frame_desc[i].length = len;
+               i++;
+       }
+
+       urb->number_of_packets = i;
+}
+
+static int btmtk_usb_submit_isoc_in_urb(struct hci_dev *hdev, gfp_t mem_flags)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       struct urb *urb;
+       unsigned char *buf;
+       unsigned int pipe;
+       int err, size;
+
+       BT_DBG("%s\n", __func__);
+
+       if (!data->isoc_rx_ep)
+               return -ENODEV;
+
+       urb = usb_alloc_urb(BTUSB_MAX_ISOC_FRAMES, mem_flags);
+       if (!urb)
+               return -ENOMEM;
+
+       size = le16_to_cpu(data->isoc_rx_ep->wMaxPacketSize) *
+                                               BTUSB_MAX_ISOC_FRAMES;
+
+       buf = kmalloc(size, mem_flags);
+       if (!buf) {
+               usb_free_urb(urb);
+               return -ENOMEM;
+       }
+
+       pipe = usb_rcvisocpipe(data->udev, data->isoc_rx_ep->bEndpointAddress);
+
+       usb_fill_int_urb(urb, data->udev, pipe, buf, size,
+                               btmtk_usb_isoc_in_complete, hdev,
+                               data->isoc_rx_ep->bInterval);
+
+       urb->transfer_flags = URB_FREE_BUFFER | URB_ISO_ASAP;
+
+       __fill_isoc_descriptor(urb, size,
+                       le16_to_cpu(data->isoc_rx_ep->wMaxPacketSize));
+
+       usb_anchor_urb(urb, &data->isoc_anchor);
+
+       err = usb_submit_urb(urb, mem_flags);
+       if (err < 0) {
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p submission failed (%d)",
+                                               hdev->name, urb, -err);
+               usb_unanchor_urb(urb);
+       }
+
+       usb_free_urb(urb);
+
+       return err;
+}
+
+static int btmtk_usb_open(struct hci_dev *hdev)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       int err;
+
+       BT_DBG("%s\n", __func__);
+
+       err = usb_autopm_get_interface(data->intf);
+       if (err < 0)
+               return err;
+
+       data->intf->needs_remote_wakeup = 1;
+
+       if (test_and_set_bit(HCI_RUNNING, &hdev->flags))
+               goto done;
+
+       if (test_and_set_bit(BTUSB_INTR_RUNNING, &data->flags))
+               goto done;
+
+       err = btmtk_usb_submit_intr_urb(hdev, GFP_KERNEL);
+       if (err < 0)
+               goto failed;
+
+       err = btmtk_usb_submit_bulk_in_urb(hdev, GFP_KERNEL);
+       if (err < 0) {
+               usb_kill_anchored_urbs(&data->intr_anchor);
+               goto failed;
+       }
+
+       set_bit(BTUSB_BULK_RUNNING, &data->flags);
+       btmtk_usb_submit_bulk_in_urb(hdev, GFP_KERNEL);
+
+done:
+       usb_autopm_put_interface(data->intf);
+       return 0;
+
+failed:
+       clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+       clear_bit(HCI_RUNNING, &hdev->flags);
+       usb_autopm_put_interface(data->intf);
+       return err;
+}
+
+static void btmtk_usb_stop_traffic(struct btmtk_usb_data *data)
+{
+       BT_DBG("%s\n", __func__);
+
+       usb_kill_anchored_urbs(&data->intr_anchor);
+       usb_kill_anchored_urbs(&data->bulk_anchor);
+       usb_kill_anchored_urbs(&data->isoc_anchor);
+}
+
+static int btmtk_usb_close(struct hci_dev *hdev)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       int err;
+
+       BT_DBG("%s\n", __func__);
+
+       if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
+               return 0;
+
+       cancel_work_sync(&data->work);
+       cancel_work_sync(&data->waker);
+
+       clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+       clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+       clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+
+       btmtk_usb_stop_traffic(data);
+
+       err = usb_autopm_get_interface(data->intf);
+       if (err < 0)
+               goto failed;
+
+       data->intf->needs_remote_wakeup = 0;
+       usb_autopm_put_interface(data->intf);
+
+failed:
+       usb_scuttle_anchored_urbs(&data->deferred);
+       return 0;
+}
+
+static int btmtk_usb_flush(struct hci_dev *hdev)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+       BT_DBG("%s\n", __func__);
+
+       usb_kill_anchored_urbs(&data->tx_anchor);
+
+       return 0;
+}
+
+static void btmtk_usb_tx_complete(struct urb *urb)
+{
+       struct sk_buff *skb = urb->context;
+       struct hci_dev *hdev = (struct hci_dev *)skb->dev;
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+       BT_DBG("%s: %s urb %p status %d count %d\n", __func__, hdev->name,
+                                       urb, urb->status, urb->actual_length);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               goto done;
+
+       if (!urb->status)
+               hdev->stat.byte_tx += urb->transfer_buffer_length;
+       else
+               hdev->stat.err_tx++;
+
+done:
+       spin_lock(&data->txlock);
+       data->tx_in_flight--;
+       spin_unlock(&data->txlock);
+
+       kfree(urb->setup_packet);
+
+       kfree_skb(skb);
+}
+
+static void btmtk_usb_isoc_tx_complete(struct urb *urb)
+{
+       struct sk_buff *skb = urb->context;
+       struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+
+       BT_DBG("%s: %s urb %p status %d count %d", __func__, hdev->name,
+                                       urb, urb->status, urb->actual_length);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               goto done;
+
+       if (!urb->status)
+               hdev->stat.byte_tx += urb->transfer_buffer_length;
+       else
+               hdev->stat.err_tx++;
+
+done:
+       kfree(urb->setup_packet);
+
+       kfree_skb(skb);
+}
+
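+/* Map an outgoing HCI frame onto the matching USB transfer type:
+ * commands are sent as class control requests on endpoint 0, ACL data
+ * goes over the bulk-out endpoint, and SCO data is carried by
+ * isochronous transfers.
+ */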
+static int btmtk_usb_send_frame(struct sk_buff *skb)
+{
+       struct hci_dev *hdev = (struct hci_dev *)skb->dev;
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       struct usb_ctrlrequest *dr;
+       struct urb *urb;
+       unsigned int pipe;
+       int err;
+
+       BT_DBG("%s\n", __func__);
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               return -EBUSY;
+
+       switch (bt_cb(skb)->pkt_type) {
+       case HCI_COMMAND_PKT:
+               urb = usb_alloc_urb(0, GFP_ATOMIC);
+               if (!urb)
+                       return -ENOMEM;
+
+               dr = kmalloc(sizeof(*dr), GFP_ATOMIC);
+               if (!dr) {
+                       usb_free_urb(urb);
+                       return -ENOMEM;
+               }
+
+               dr->bRequestType = data->cmdreq_type;
+               dr->bRequest     = 0;
+               dr->wIndex       = 0;
+               dr->wValue       = 0;
+               dr->wLength      = __cpu_to_le16(skb->len);
+
+               pipe = usb_sndctrlpipe(data->udev, 0x00);
+
+               if (test_bit(HCI_RUNNING, &hdev->flags)) {
+                       u16 op_code;
+
+                       memcpy(&op_code, skb->data, 2);
+                       BT_DBG("ogf = %x", (op_code & 0xfc00) >> 10);
+                       BT_DBG("ocf = %x", op_code & 0x03ff);
+                       hex_dump("hci command", skb->data, skb->len);
+               }
+
+               usb_fill_control_urb(urb, data->udev, pipe, (void *) dr,
+                               skb->data, skb->len, btmtk_usb_tx_complete, skb);
+
+               hdev->stat.cmd_tx++;
+               break;
+
+       case HCI_ACLDATA_PKT:
+               if (!data->bulk_tx_ep)
+                       return -ENODEV;
+
+               urb = usb_alloc_urb(0, GFP_ATOMIC);
+               if (!urb)
+                       return -ENOMEM;
+
+               pipe = usb_sndbulkpipe(data->udev,
+                                       data->bulk_tx_ep->bEndpointAddress);
+
+               usb_fill_bulk_urb(urb, data->udev, pipe,
+                               skb->data, skb->len, btmtk_usb_tx_complete, skb);
+
+               hdev->stat.acl_tx++;
+               BT_DBG("HCI_ACLDATA_PKT:\n");
+               break;
+
+       case HCI_SCODATA_PKT:
+               if (!data->isoc_tx_ep || hdev->conn_hash.sco_num < 1)
+                       return -ENODEV;
+
+               urb = usb_alloc_urb(BTUSB_MAX_ISOC_FRAMES, GFP_ATOMIC);
+               if (!urb)
+                       return -ENOMEM;
+
+               pipe = usb_sndisocpipe(data->udev,
+                                       data->isoc_tx_ep->bEndpointAddress);
+
+               usb_fill_int_urb(urb, data->udev, pipe,
+                               skb->data, skb->len, btmtk_usb_isoc_tx_complete,
+                               skb, data->isoc_tx_ep->bInterval);
+
+               urb->transfer_flags = URB_ISO_ASAP;
+
+               __fill_isoc_descriptor(urb, skb->len,
+                               le16_to_cpu(data->isoc_tx_ep->wMaxPacketSize));
+
+               hdev->stat.sco_tx++;
+               BT_DBG("HCI_SCODATA_PKT:\n");
+               goto skip_waking;
+
+       default:
+               return -EILSEQ;
+       }
+
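+       /* If TX cannot run right now (e.g. a suspend is in progress),
+        * park the URB on the deferred anchor; the waker resumes the
+        * interface and btmtk_usb_resume() replays the queue.
+        */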
+       err = inc_tx(data);
+       if (err) {
+               usb_anchor_urb(urb, &data->deferred);
+               schedule_work(&data->waker);
+               err = 0;
+               goto done;
+       }
+
+skip_waking:
+       usb_anchor_urb(urb, &data->tx_anchor);
+
+       err = usb_submit_urb(urb, GFP_ATOMIC);
+       if (err < 0) {
+               if (err != -EPERM && err != -ENODEV)
+                       BT_ERR("%s urb %p submission failed (%d)",
+                                               hdev->name, urb, -err);
+               kfree(urb->setup_packet);
+               usb_unanchor_urb(urb);
+       } else {
+               usb_mark_last_busy(data->udev);
+       }
+
+done:
+       usb_free_urb(urb);
+       return err;
+}
+
+static void btmtk_usb_notify(struct hci_dev *hdev, unsigned int evt)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+
+       BT_DBG("%s evt %d", hdev->name, evt);
+
+       if (hdev->conn_hash.sco_num != data->sco_num) {
+               data->sco_num = hdev->conn_hash.sco_num;
+               schedule_work(&data->work);
+       }
+}
+
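+/* Pick an alternate setting on the isochronous interface (interface 1,
+ * per the hardcoded layout) and re-discover its SCO in/out endpoints,
+ * which differ between altsettings.
+ */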
+static inline int __set_isoc_interface(struct hci_dev *hdev, int altsetting)
+{
+       struct btmtk_usb_data *data = hci_get_drvdata(hdev);
+       struct usb_interface *intf = data->isoc;
+       struct usb_endpoint_descriptor *ep_desc;
+       int i, err;
+
+       if (!data->isoc)
+               return -ENODEV;
+
+       err = usb_set_interface(data->udev, 1, altsetting);
+       if (err < 0) {
+               BT_ERR("%s setting interface failed (%d)", hdev->name, -err);
+               return err;
+       }
+
+       data->isoc_altsetting = altsetting;
+
+       data->isoc_tx_ep = NULL;
+       data->isoc_rx_ep = NULL;
+
+       for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
+               ep_desc = &intf->cur_altsetting->endpoint[i].desc;
+
+               if (!data->isoc_tx_ep && usb_endpoint_is_isoc_out(ep_desc)) {
+                       data->isoc_tx_ep = ep_desc;
+                       continue;
+               }
+
+               if (!data->isoc_rx_ep && usb_endpoint_is_isoc_in(ep_desc)) {
+                       data->isoc_rx_ep = ep_desc;
+                       continue;
+               }
+       }
+
+       if (!data->isoc_tx_ep || !data->isoc_rx_ep) {
+               BT_ERR("%s invalid SCO descriptors", hdev->name);
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
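+/* Rework the isochronous interface whenever the SCO link count changes:
+ * with 8-bit voice samples the altsetting simply follows the number of
+ * links, while 16-bit samples (voice_setting bit 0x0020) need the
+ * higher-bandwidth alternates 2, 4 and 5. With no SCO links left, the
+ * interface falls back to altsetting 0.
+ */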
+static void btmtk_usb_work(struct work_struct *work)
+{
+       struct btmtk_usb_data *data = container_of(work, struct btmtk_usb_data,
+                                               work);
+       struct hci_dev *hdev = data->hdev;
+       int new_alts;
+       int err;
+
+       BT_DBG("%s\n", __func__);
+
+       if (hdev->conn_hash.sco_num > 0) {
+               if (!test_bit(BTUSB_DID_ISO_RESUME, &data->flags)) {
+                       err = usb_autopm_get_interface(data->isoc ? data->isoc : data->intf);
+                       if (err < 0) {
+                               clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+                               usb_kill_anchored_urbs(&data->isoc_anchor);
+                               return;
+                       }
+
+                       set_bit(BTUSB_DID_ISO_RESUME, &data->flags);
+               }
+
+               if (hdev->voice_setting & 0x0020) {
+                       static const int alts[3] = { 2, 4, 5 };
+                       new_alts = alts[hdev->conn_hash.sco_num - 1];
+               } else {
+                       new_alts = hdev->conn_hash.sco_num;
+               }
+
+               if (data->isoc_altsetting != new_alts) {
+                       clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+                       usb_kill_anchored_urbs(&data->isoc_anchor);
+
+                       if (__set_isoc_interface(hdev, new_alts) < 0)
+                               return;
+               }
+
+               if (!test_and_set_bit(BTUSB_ISOC_RUNNING, &data->flags)) {
+                       if (btmtk_usb_submit_isoc_in_urb(hdev, GFP_KERNEL) < 0)
+                               clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+                       else
+                               btmtk_usb_submit_isoc_in_urb(hdev, GFP_KERNEL);
+               }
+       } else {
+               clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+               usb_kill_anchored_urbs(&data->isoc_anchor);
+
+               __set_isoc_interface(hdev, 0);
+
+               if (test_and_clear_bit(BTUSB_DID_ISO_RESUME, &data->flags))
+                       usb_autopm_put_interface(data->isoc ? data->isoc : data->intf);
+       }
+}
+
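+/* The waker runs in process context purely to autoresume the device;
+ * a get/put pair on the interface is enough, and the resume handler
+ * then replays any deferred TX URBs.
+ */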
+static void btmtk_usb_waker(struct work_struct *work)
+{
+       struct btmtk_usb_data *data = container_of(work, struct btmtk_usb_data,
+                                               waker);
+       int err;
+
+       err = usb_autopm_get_interface(data->intf);
+       if (err < 0)
+               return;
+
+       usb_autopm_put_interface(data->intf);
+}
+
+static int btmtk_usb_probe(struct usb_interface *intf,
+                                       const struct usb_device_id *id)
+{
+       struct btmtk_usb_data *data;
+       struct usb_endpoint_descriptor *ep_desc;
+       int i, err;
+       struct hci_dev *hdev;
+
+       /* interface numbers are hardcoded in the spec */
+       if (intf->cur_altsetting->desc.bInterfaceNumber != 0)
+               return -ENODEV;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; i++) {
+               ep_desc = &intf->cur_altsetting->endpoint[i].desc;
+
+               if (!data->intr_ep && usb_endpoint_is_int_in(ep_desc)) {
+                       data->intr_ep = ep_desc;
+                       continue;
+               }
+
+               if (!data->bulk_tx_ep && usb_endpoint_is_bulk_out(ep_desc)) {
+                       data->bulk_tx_ep = ep_desc;
+                       continue;
+               }
+
+               if (!data->bulk_rx_ep && usb_endpoint_is_bulk_in(ep_desc)) {
+                       data->bulk_rx_ep = ep_desc;
+                       continue;
+               }
+       }
+
+       if (!data->intr_ep || !data->bulk_tx_ep || !data->bulk_rx_ep) {
+               kfree(data);
+               return -ENODEV;
+       }
+
+       data->cmdreq_type = USB_TYPE_CLASS;
+
+       data->udev = interface_to_usbdev(intf);
+       data->intf = intf;
+
+       spin_lock_init(&data->lock);
+       INIT_WORK(&data->work, btmtk_usb_work);
+       INIT_WORK(&data->waker, btmtk_usb_waker);
+       spin_lock_init(&data->txlock);
+
+       init_usb_anchor(&data->tx_anchor);
+       init_usb_anchor(&data->intr_anchor);
+       init_usb_anchor(&data->bulk_anchor);
+       init_usb_anchor(&data->isoc_anchor);
+       init_usb_anchor(&data->deferred);
+
+       hdev = hci_alloc_dev();
+       if (!hdev) {
+               kfree(data);
+               return -ENOMEM;
+       }
+
+       hdev->bus = HCI_USB;
+
+       hci_set_drvdata(hdev, data);
+
+       data->hdev = hdev;
+
+       SET_HCIDEV_DEV(hdev, &intf->dev);
+
+       hdev->open     = btmtk_usb_open;
+       hdev->close    = btmtk_usb_close;
+       hdev->flush    = btmtk_usb_flush;
+       hdev->send     = btmtk_usb_send_frame;
+       hdev->notify   = btmtk_usb_notify;
+
+       /* Interface numbers are hardcoded in the specification */
+       data->isoc = usb_ifnum_to_if(data->udev, 1);
+
+       if (data->isoc) {
+               err = usb_driver_claim_interface(&btmtk_usb_driver,
+                                                       data->isoc, data);
+               if (err < 0) {
+                       hci_free_dev(hdev);
+                       kfree(data);
+                       return err;
+               }
+       }
+
+       data->io_buf = kmalloc(256, GFP_KERNEL);
+       if (!data->io_buf) {
+               hci_free_dev(hdev);
+               kfree(data);
+               return -ENOMEM;
+       }
+
+       btmtk_usb_switch_iobase(data, WLAN);
+
+       btmtk_usb_cap_init(data);
+
+       err = hci_register_dev(hdev);
+       if (err < 0) {
+               hci_free_dev(hdev);
+               kfree(data);
+               return err;
+       }
+
+       usb_set_intfdata(intf, data);
+
+       return 0;
+}
+
+static void btmtk_usb_disconnect(struct usb_interface *intf)
+{
+       struct btmtk_usb_data *data = usb_get_intfdata(intf);
+       struct hci_dev *hdev;
+
+       BT_DBG("%s\n", __func__);
+
+       if (!data)
+               return;
+
+       hdev = data->hdev;
+       usb_set_intfdata(data->intf, NULL);
+
+       if (data->isoc)
+               usb_set_intfdata(data->isoc, NULL);
+
+       hci_unregister_dev(hdev);
+
+       if (intf == data->isoc)
+               usb_driver_release_interface(&btmtk_usb_driver, data->intf);
+       else if (data->isoc)
+               usb_driver_release_interface(&btmtk_usb_driver, data->isoc);
+
+       hci_free_dev(hdev);
+
+       kfree(data->io_buf);
+
+       kfree(data);
+}
+
+#ifdef CONFIG_PM
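+/* Refuse an autosuspend while command or ACL TX is still in flight;
+ * a system sleep (a non-auto PM message) always proceeds and kills the
+ * outstanding traffic instead.
+ */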
+static int btmtk_usb_suspend(struct usb_interface *intf, pm_message_t message)
+{
+       struct btmtk_usb_data *data = usb_get_intfdata(intf);
+
+       BT_DBG("%s\n", __func__);
+
+       if (data->suspend_count++)
+               return 0;
+
+       spin_lock_irq(&data->txlock);
+       if (!(PMSG_IS_AUTO(message) && data->tx_in_flight)) {
+               set_bit(BTUSB_SUSPENDING, &data->flags);
+               spin_unlock_irq(&data->txlock);
+       } else {
+               spin_unlock_irq(&data->txlock);
+               data->suspend_count--;
+               return -EBUSY;
+       }
+
+       cancel_work_sync(&data->work);
+
+       btmtk_usb_stop_traffic(data);
+       usb_kill_anchored_urbs(&data->tx_anchor);
+
+       return 0;
+}
+
+static void play_deferred(struct btmtk_usb_data *data)
+{
+       struct urb *urb;
+       int err;
+
+       while ((urb = usb_get_from_anchor(&data->deferred))) {
+               err = usb_submit_urb(urb, GFP_ATOMIC);
+               if (err < 0)
+                       break;
+
+               data->tx_in_flight++;
+       }
+
+       usb_scuttle_anchored_urbs(&data->deferred);
+}
+
+static int btmtk_usb_resume(struct usb_interface *intf)
+{
+       struct btmtk_usb_data *data = usb_get_intfdata(intf);
+       struct hci_dev *hdev = data->hdev;
+       int err = 0;
+
+       BT_DBG("%s\n", __func__);
+
+       if (--data->suspend_count)
+               return 0;
+
+       if (!test_bit(HCI_RUNNING, &hdev->flags))
+               goto done;
+
+       if (test_bit(BTUSB_INTR_RUNNING, &data->flags)) {
+               err = btmtk_usb_submit_intr_urb(hdev, GFP_NOIO);
+               if (err < 0) {
+                       clear_bit(BTUSB_INTR_RUNNING, &data->flags);
+                       goto failed;
+               }
+       }
+
+       if (test_bit(BTUSB_BULK_RUNNING, &data->flags)) {
+               err = btmtk_usb_submit_bulk_in_urb(hdev, GFP_NOIO);
+               if (err < 0) {
+                       clear_bit(BTUSB_BULK_RUNNING, &data->flags);
+                       goto failed;
+               }
+
+               btmtk_usb_submit_bulk_in_urb(hdev, GFP_NOIO);
+       }
+
+       if (test_bit(BTUSB_ISOC_RUNNING, &data->flags)) {
+               if (btmtk_usb_submit_isoc_in_urb(hdev, GFP_NOIO) < 0)
+                       clear_bit(BTUSB_ISOC_RUNNING, &data->flags);
+               else
+                       btmtk_usb_submit_isoc_in_urb(hdev, GFP_NOIO);
+       }
+
+       spin_lock_irq(&data->txlock);
+       play_deferred(data);
+       clear_bit(BTUSB_SUSPENDING, &data->flags);
+       spin_unlock_irq(&data->txlock);
+       schedule_work(&data->work);
+
+       return 0;
+
+failed:
+       usb_scuttle_anchored_urbs(&data->deferred);
+done:
+       spin_lock_irq(&data->txlock);
+       clear_bit(BTUSB_SUSPENDING, &data->flags);
+       spin_unlock_irq(&data->txlock);
+
+       return err;
+}
+#endif
+
+static const struct usb_device_id btmtk_usb_table[] = {
+       /* MediaTek MT7650 */
+       { USB_DEVICE(0x0e8d, 0x7650) },
+       { USB_DEVICE(0x0e8d, 0x7630) },
+       { USB_DEVICE(0x0e8d, 0x763e) },
+       /* MediaTek MT7662 */
+       { USB_DEVICE(0x0e8d, 0x7662) },
+       { USB_DEVICE(0x0e8d, 0x7632) },
+       { }     /* Terminating entry */
+};
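+
+/* Export the ID table so userspace can auto-load the module on hotplug. */
+MODULE_DEVICE_TABLE(usb, btmtk_usb_table);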
+
+static struct usb_driver btmtk_usb_driver = {
+       .name           = "btmtk_usb",
+       .probe          = btmtk_usb_probe,
+       .disconnect     = btmtk_usb_disconnect,
+#ifdef CONFIG_PM
+       .suspend        = btmtk_usb_suspend,
+       .resume         = btmtk_usb_resume,
+#endif
+       .id_table       = btmtk_usb_table,
+       .supports_autosuspend = 1,
+       .disable_hub_initiated_lpm = 1,
+};
+
+module_usb_driver(btmtk_usb_driver);
+
+MODULE_DESCRIPTION("Mediatek Bluetooth USB driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_FIRMWARE(MT7650_FIRMWARE);
+MODULE_FIRMWARE(MT7662_FIRMWARE);
diff --git a/drivers/staging/btmtk_usb/btmtk_usb.h b/drivers/staging/btmtk_usb/btmtk_usb.h
new file mode 100644 (file)
index 0000000..12f0d3b
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ *  MediaTek Bluetooth USB Driver
+ *
+ *  Copyright (C) 2013, MediaTek co.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *  or on the worldwide web at
+ *  http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ */
+
+#ifndef __BTMTK_USB_H__
+#define __BTMTK_USB_H__
+
+/* Memory map for MTK BT */
+
+/* SYS Control */
+#define SYSCTL 0x400000
+
+/* WLAN */
+#define WLAN           0x410000
+
+/* MCUCTL */
+#define INT_LEVEL              0x0718
+#define COM_REG0               0x0730
+#define SEMAPHORE_00   0x07B0
+#define SEMAPHORE_01   0x07B4
+#define SEMAPHORE_02   0x07B8
+#define SEMAPHORE_03   0x07BC
+
+/* Chip definition */
+
+#define CONTROL_TIMEOUT_JIFFIES ((300 * HZ) / 100)
+#define DEVICE_VENDOR_REQUEST_OUT      0x40
+#define DEVICE_VENDOR_REQUEST_IN       0xc0
+#define DEVICE_CLASS_REQUEST_OUT       0x20
+
+#define BTUSB_MAX_ISOC_FRAMES  10
+#define BTUSB_INTR_RUNNING     0
+#define BTUSB_BULK_RUNNING     1
+#define BTUSB_ISOC_RUNNING     2
+#define BTUSB_SUSPENDING       3
+#define BTUSB_DID_ISO_RESUME   4
+
+/* ROM Patch */
+#define PATCH_HCI_HEADER_SIZE 4
+#define PATCH_WMT_HEADER_SIZE 5
+#define PATCH_HEADER_SIZE (PATCH_HCI_HEADER_SIZE + PATCH_WMT_HEADER_SIZE)
+#define UPLOAD_PATCH_UNIT 2048
+#define PATCH_INFO_SIZE 30
+#define PATCH_PHASE1 1
+#define PATCH_PHASE2 2
+#define PATCH_PHASE3 3
+
+struct btmtk_usb_data {
+       struct hci_dev *hdev;
+       struct usb_device    *udev;
+       struct usb_interface *intf;
+       struct usb_interface *isoc;
+
+       spinlock_t lock;
+
+       unsigned long flags;
+       struct work_struct work;
+       struct work_struct waker;
+
+       struct usb_anchor tx_anchor;
+       struct usb_anchor intr_anchor;
+       struct usb_anchor bulk_anchor;
+       struct usb_anchor isoc_anchor;
+       struct usb_anchor deferred;
+       int tx_in_flight;
+       spinlock_t txlock;
+
+       struct usb_endpoint_descriptor *intr_ep;
+       struct usb_endpoint_descriptor *bulk_tx_ep;
+       struct usb_endpoint_descriptor *bulk_rx_ep;
+       struct usb_endpoint_descriptor *isoc_tx_ep;
+       struct usb_endpoint_descriptor *isoc_rx_ep;
+
+       __u8 cmdreq_type;
+
+       unsigned int sco_num;
+       int isoc_altsetting;
+       int suspend_count;
+
+       /* request for different io operation */
+       u8 w_request;
+       u8 r_request;
+
+       /* io buffer for usb control transfer */
+       char *io_buf;
+
+       struct semaphore fw_upload_sem;
+
+       /* unsigned char *fw_image; */
+       /* unsigned char *rom_patch; */
+       const struct firmware *firmware;
+       u32 chip_id;
+       u8 need_load_fw;
+       u8 need_load_rom_patch;
+       u32 rom_patch_offset;
+       u32 rom_patch_len;
+};
+
+static inline int is_mt7630(struct btmtk_usb_data *data)
+{
+       return ((data->chip_id & 0xffff0000) == 0x76300000);
+}
+
+static inline int is_mt7650(struct btmtk_usb_data *data)
+{
+       return ((data->chip_id & 0xffff0000) == 0x76500000);
+}
+
+static inline int is_mt7632(struct btmtk_usb_data *data)
+{
+       return ((data->chip_id & 0xffff0000) == 0x76320000);
+}
+
+static inline int is_mt7662(struct btmtk_usb_data *data)
+{
+       return ((data->chip_id & 0xffff0000) == 0x76620000);
+}
+
+#endif /* __BTMTK_USB_H__ */
diff --git a/drivers/staging/ced1401/ced_ioc.c b/drivers/staging/ced1401/ced_ioc.c
index 82a333f6433e3f4a852e4ed62f7936bfe4ad707b..2dbaf39e2fc296b0b5acfc6b32fd0f9bb67d80e3 100644 (file)
 **
 ** Empties the Output buffer and sets int lines. Used from user level only
 ****************************************************************************/
-void FlushOutBuff(DEVICE_EXTENSION * pdx)
+static void FlushOutBuff(DEVICE_EXTENSION *pdx)
 {
        dev_dbg(&pdx->interface->dev, "%s currentState=%d", __func__,
                pdx->sCurrentState);
        if (pdx->sCurrentState == U14ERR_TIME)  /* Do nothing if hardware in trouble */
                return;
-//    CharSend_Cancel(pdx);                   /* Kill off any pending I/O */
+       /* Kill off any pending I/O */
+       /* CharSend_Cancel(pdx);  */
        spin_lock_irq(&pdx->charOutLock);
        pdx->dwNumOutput = 0;
        pdx->dwOutBuffGet = 0;
@@ -57,13 +58,14 @@ void FlushOutBuff(DEVICE_EXTENSION * pdx)
 **
 ** Empties the input buffer and sets int lines
 ****************************************************************************/
-void FlushInBuff(DEVICE_EXTENSION * pdx)
+static void FlushInBuff(DEVICE_EXTENSION *pdx)
 {
        dev_dbg(&pdx->interface->dev, "%s currentState=%d", __func__,
                pdx->sCurrentState);
        if (pdx->sCurrentState == U14ERR_TIME)  /* Do nothing if hardware in trouble */
                return;
-//    CharRead_Cancel(pDevObject);            /* Kill off any pending I/O */
+       /* Kill off any pending I/O */
+       /*     CharRead_Cancel(pDevObject);  */
        spin_lock_irq(&pdx->charInLock);
        pdx->dwNumInput = 0;
        pdx->dwInBuffGet = 0;
@@ -77,11 +79,11 @@ void FlushInBuff(DEVICE_EXTENSION * pdx)
 ** Utility routine to copy chars into the output buffer and fire them off.
 ** called from user mode, holds charOutLock.
 ****************************************************************************/
-static int PutChars(DEVICE_EXTENSION * pdx, const char *pCh,
+static int PutChars(DEVICE_EXTENSION *pdx, const char *pCh,
                    unsigned int uCount)
 {
        int iReturn;
-       spin_lock_irq(&pdx->charOutLock);       // get the output spin lock
+       spin_lock_irq(&pdx->charOutLock);       /*  get the output spin lock */
        if ((OUTBUF_SZ - pdx->dwNumOutput) >= uCount) {
                unsigned int u;
                for (u = 0; u < uCount; u++) {
@@ -91,9 +93,9 @@ static int PutChars(DEVICE_EXTENSION * pdx, const char *pCh,
                }
                pdx->dwNumOutput += uCount;
                spin_unlock_irq(&pdx->charOutLock);
-               iReturn = SendChars(pdx);       // ...give a chance to transmit data
+               iReturn = SendChars(pdx);       /*  ...give a chance to transmit data */
        } else {
-               iReturn = U14ERR_NOOUT; // no room at the out (ha-ha)
+               iReturn = U14ERR_NOOUT; /*  no room at the out (ha-ha) */
                spin_unlock_irq(&pdx->charOutLock);
        }
        return iReturn;
@@ -104,26 +106,25 @@ static int PutChars(DEVICE_EXTENSION * pdx, const char *pCh,
 ** trigger an output transfer if this is appropriate. User mode.
 ** Holds the io_mutex
 *****************************************************************************/
-int SendString(DEVICE_EXTENSION * pdx, const char __user * pData,
+int SendString(DEVICE_EXTENSION *pdx, const char __user *pData,
               unsigned int n)
 {
-       int iReturn = U14ERR_NOERROR;   // assume all will be well
-       char buffer[OUTBUF_SZ + 1];     // space in our address space for characters
-       if (n > OUTBUF_SZ)      // check space in local buffer...
-               return U14ERR_NOOUT;    // ...too many characters
+       int iReturn = U14ERR_NOERROR;   /*  assume all will be well */
+       char buffer[OUTBUF_SZ + 1];     /*  space in our address space for characters */
+       if (n > OUTBUF_SZ)      /*  check space in local buffer... */
+               return U14ERR_NOOUT;    /*  ...too many characters */
        if (copy_from_user(buffer, pData, n))
                return -EFAULT;
-       buffer[n] = 0;          // terminate for debug purposes
+       buffer[n] = 0;          /*  terminate for debug purposes */
 
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
-       if (n > 0)              // do nothing if nowt to do!
-       {
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
+       if (n > 0) {            /*  do nothing if nowt to do! */
                dev_dbg(&pdx->interface->dev, "%s n=%d>%s<", __func__, n,
                        buffer);
                iReturn = PutChars(pdx, buffer, n);
        }
 
-       Allowi(pdx);            // make sure we have input int
+       Allowi(pdx);            /*  make sure we have input int */
        mutex_unlock(&pdx->io_mutex);
 
        return iReturn;
@@ -134,13 +135,13 @@ int SendString(DEVICE_EXTENSION * pdx, const char __user * pData,
 **
 ** Sends a single character to the 1401. User mode, holds io_mutex.
 ****************************************************************************/
-int SendChar(DEVICE_EXTENSION * pdx, char c)
+int SendChar(DEVICE_EXTENSION *pdx, char c)
 {
        int iReturn;
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
        iReturn = PutChars(pdx, &c, 1);
        dev_dbg(&pdx->interface->dev, "SendChar >%c< (0x%02x)", c, c);
-       Allowi(pdx);    // Make sure char reads are running
+       Allowi(pdx);    /*  Make sure char reads are running */
        mutex_unlock(&pdx->io_mutex);
        return iReturn;
 }
@@ -171,20 +172,20 @@ int SendChar(DEVICE_EXTENSION * pdx, char c)
 **
 ** return error code (U14ERR_NOERROR for OK)
 */
-int Get1401State(DEVICE_EXTENSION * pdx, __u32 * state, __u32 * error)
+int Get1401State(DEVICE_EXTENSION *pdx, __u32 *state, __u32 *error)
 {
        int nGot;
        dev_dbg(&pdx->interface->dev, "Get1401State() entry");
 
-       *state = 0xFFFFFFFF;    // Start off with invalid state
+       *state = 0xFFFFFFFF;    /*  Start off with invalid state */
        nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
                               GET_STATUS, (D_TO_H | VENDOR | DEVREQ), 0, 0,
                               pdx->statBuf, sizeof(pdx->statBuf), HZ);
        if (nGot != sizeof(pdx->statBuf)) {
                dev_err(&pdx->interface->dev,
                        "Get1401State() FAILED, return code %d", nGot);
-               pdx->sCurrentState = U14ERR_TIME;       // Indicate that things are very wrong indeed
-               *state = 0;     // Force status values to a known state
+               pdx->sCurrentState = U14ERR_TIME;       /*  Indicate that things are very wrong indeed */
+               *state = 0;     /*  Force status values to a known state */
                *error = 0;
        } else {
                int nDevice;
@@ -192,17 +193,16 @@ int Get1401State(DEVICE_EXTENSION * pdx, __u32 * state, __u32 * error)
                        "Get1401State() Success, state: 0x%x, 0x%x",
                        pdx->statBuf[0], pdx->statBuf[1]);
 
-               *state = pdx->statBuf[0];       // Return the state values to the calling code
+               *state = pdx->statBuf[0];       /*  Return the state values to the calling code */
                *error = pdx->statBuf[1];
 
-               nDevice = pdx->udev->descriptor.bcdDevice >> 8; // 1401 type code value
-               switch (nDevice)        // so we can clean up current state
-               {
+               nDevice = pdx->udev->descriptor.bcdDevice >> 8; /*  1401 type code value */
+               switch (nDevice) {      /*  so we can clean up current state */
                case 0:
                        pdx->sCurrentState = U14ERR_U1401;
                        break;
 
-               default:        // allow lots of device codes for future 1401s
+               default:        /*  allow lots of device codes for future 1401s */
                        if ((nDevice >= 1) && (nDevice <= 23))
                                pdx->sCurrentState = (short)(nDevice + 6);
                        else
@@ -219,7 +219,7 @@ int Get1401State(DEVICE_EXTENSION * pdx, __u32 * state, __u32 * error)
 **
 ** Kills off staged read\write request from the USB if one is pending.
 ****************************************************************************/
-int ReadWrite_Cancel(DEVICE_EXTENSION * pdx)
+int ReadWrite_Cancel(DEVICE_EXTENSION *pdx)
 {
        dev_dbg(&pdx->interface->dev, "ReadWrite_Cancel entry %d",
                pdx->bStagedUrbPending);
@@ -227,24 +227,23 @@ int ReadWrite_Cancel(DEVICE_EXTENSION * pdx)
        int ntStatus = STATUS_SUCCESS;
        bool bResult = false;
        unsigned int i;
-       // We can fill this in when we know how we will implement the staged transfer stuff
+       /*  We can fill this in when we know how we will implement the staged transfer stuff */
        spin_lock_irq(&pdx->stagedLock);
 
-       if (pdx->bStagedUrbPending)     // anything to be cancelled? May need more...
-       {
+       if (pdx->bStagedUrbPending) {   /*  anything to be cancelled? May need more... */
                dev_info(&pdx->interface - dev,
                         "ReadWrite_Cancel about to cancel Urb");
-
-               //       KeClearEvent(&pdx->StagingDoneEvent);   // Clear the staging done flag
+               /* Clear the staging done flag */
+               /* KeClearEvent(&pdx->StagingDoneEvent); */
                USB_ASSERT(pdx->pStagedIrp != NULL);
 
-               // Release the spinlock first otherwise the completion routine may hang
-               //  on the spinlock while this function hands waiting for the event.
+               /*  Release the spinlock first otherwise the completion routine may hang */
+                       /*   on the spinlock while this function hangs waiting for the event. */
                spin_unlock_irq(&pdx->stagedLock);
-               bResult = IoCancelIrp(pdx->pStagedIrp); // Actually do the cancel
+               bResult = IoCancelIrp(pdx->pStagedIrp); /*  Actually do the cancel */
                if (bResult) {
                        LARGE_INTEGER timeout;
-                       timeout.QuadPart = -10000000;   // Use a timeout of 1 second
+                       timeout.QuadPart = -10000000;   /*  Use a timeout of 1 second */
                        dev_info(&pdx->interface - dev,
                                 "ReadWrite_Cancel about to wait till done");
                        ntStatus =
@@ -274,14 +273,14 @@ int ReadWrite_Cancel(DEVICE_EXTENSION * pdx)
 ** InSelfTest - utility to check in self test. Return 1 for ST, 0 for not or
 ** a -ve error code if we failed for some reason.
 ***************************************************************************/
-static int InSelfTest(DEVICE_EXTENSION * pdx, unsigned int *pState)
+static int InSelfTest(DEVICE_EXTENSION *pdx, unsigned int *pState)
 {
        unsigned int state, error;
-       int iReturn = Get1401State(pdx, &state, &error);        // see if in self-test
-       if (iReturn == U14ERR_NOERROR)  // if all still OK
-               iReturn = (state == (unsigned int)-1) ||        // TX problem or...
-                   ((state & 0xff) == 0x80);   // ...self test
-       *pState = state;        // return actual state
+       int iReturn = Get1401State(pdx, &state, &error);        /*  see if in self-test */
+       if (iReturn == U14ERR_NOERROR)  /*  if all still OK */
+               iReturn = (state == (unsigned int)-1) ||        /*  TX problem or... */
+                   ((state & 0xff) == 0x80);   /*  ...self test */
+       *pState = state;        /*  return actual state */
        return iReturn;
 }
 
@@ -303,48 +302,45 @@ static int InSelfTest(DEVICE_EXTENSION * pdx, unsigned int *pState)
 **
 **  Returns TRUE if a 1401 detected and OK, else FALSE
 ****************************************************************************/
-bool Is1401(DEVICE_EXTENSION * pdx)
+bool Is1401(DEVICE_EXTENSION *pdx)
 {
        int iReturn;
        dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-       ced_draw_down(pdx);     // wait for, then kill outstanding Urbs
-       FlushInBuff(pdx);       // Clear out input buffer & pipe
-       FlushOutBuff(pdx);      // Clear output buffer & pipe
+       ced_draw_down(pdx);     /*  wait for, then kill outstanding Urbs */
+       FlushInBuff(pdx);       /*  Clear out input buffer & pipe */
+       FlushOutBuff(pdx);      /*  Clear output buffer & pipe */
 
-       // The next call returns 0 if OK, but has returned 1 in the past, meaning that
-       // usb_unlock_device() is needed... now it always is
+       /*  The next call returns 0 if OK, but has returned 1 in the past, meaning that */
+       /*  usb_unlock_device() is needed... now it always is */
        iReturn = usb_lock_device_for_reset(pdx->udev, pdx->interface);
 
-       // release the io_mutex because if we don't, we will deadlock due to system
-       // calls back into the driver.
-       mutex_unlock(&pdx->io_mutex);   // locked, so we will not get system calls
-       if (iReturn >= 0)       // if we failed
-       {
-               iReturn = usb_reset_device(pdx->udev);  // try to do the reset
-               usb_unlock_device(pdx->udev);   // undo the lock
+       /*  release the io_mutex because if we don't, we will deadlock due to system */
+       /*  calls back into the driver. */
+       mutex_unlock(&pdx->io_mutex);   /*  locked, so we will not get system calls */
+       if (iReturn >= 0) {     /*  if we failed */
+               iReturn = usb_reset_device(pdx->udev);  /*  try to do the reset */
+               usb_unlock_device(pdx->udev);   /*  undo the lock */
        }
 
-       mutex_lock(&pdx->io_mutex);     // hold stuff off while we wait
-       pdx->dwDMAFlag = MODE_CHAR;     // Clear DMA mode flag regardless!
-       if (iReturn == 0)       // if all is OK still
-       {
+       mutex_lock(&pdx->io_mutex);     /*  hold stuff off while we wait */
+       pdx->dwDMAFlag = MODE_CHAR;     /*  Clear DMA mode flag regardless! */
+       if (iReturn == 0) {     /*  if all is OK still */
                unsigned int state;
-               iReturn = InSelfTest(pdx, &state);      // see if likely in self test
-               if (iReturn > 0)        // do we need to wait for self-test?
-               {
-                       unsigned long ulTimeOut = jiffies + 30 * HZ;    // when to give up
+               iReturn = InSelfTest(pdx, &state);      /*  see if likely in self test */
+               if (iReturn > 0) {      /*  do we need to wait for self-test? */
+                       unsigned long ulTimeOut = jiffies + 30 * HZ;    /*  when to give up */
                        while ((iReturn > 0) && time_before(jiffies, ulTimeOut)) {
-                               schedule();     // let other stuff run
-                               iReturn = InSelfTest(pdx, &state);      // see if done yet
+                               schedule();     /*  let other stuff run */
+                               iReturn = InSelfTest(pdx, &state);      /*  see if done yet */
                        }
                }
 
-               if (iReturn == 0)       // if all is OK...
-                       iReturn = state == 0;   // then success is that the state is 0
+               if (iReturn == 0)       /*  if all is OK... */
+                       iReturn = state == 0;   /*  then success is that the state is 0 */
        } else
-               iReturn = 0;    // we failed
-       pdx->bForceReset = false;       // Clear forced reset flag now
+               iReturn = 0;    /*  we failed */
+       pdx->bForceReset = false;       /*  Clear forced reset flag now */
 
        return iReturn > 0;
 }
@@ -363,45 +359,42 @@ bool Is1401(DEVICE_EXTENSION * pdx)
 **
 ** The return value is TRUE if a useable 1401 is found, FALSE if not
 */
-bool QuickCheck(DEVICE_EXTENSION * pdx, bool bTestBuff, bool bCanReset)
+bool QuickCheck(DEVICE_EXTENSION *pdx, bool bTestBuff, bool bCanReset)
 {
-       bool bRet = false;      // assume it will fail and we will reset
+       bool bRet = false;      /*  assume it will fail and we will reset */
        bool bShortTest;
 
-       bShortTest = ((pdx->dwDMAFlag == MODE_CHAR) &&  // no DMA running
-                     (!pdx->bForceReset) &&    // Not had a real reset forced
-                     (pdx->sCurrentState >= U14ERR_STD));      // No 1401 errors stored
+       bShortTest = ((pdx->dwDMAFlag == MODE_CHAR) &&  /*  no DMA running */
+                     (!pdx->bForceReset) &&    /*  Not had a real reset forced */
+                     (pdx->sCurrentState >= U14ERR_STD));      /*  No 1401 errors stored */
 
        dev_dbg(&pdx->interface->dev,
                "%s DMAFlag:%d, state:%d, force:%d, testBuff:%d, short:%d",
                __func__, pdx->dwDMAFlag, pdx->sCurrentState, pdx->bForceReset,
                bTestBuff, bShortTest);
 
-       if ((bTestBuff) &&      // Buffer check requested, and...
-           (pdx->dwNumInput || pdx->dwNumOutput))      // ...characters were in the buffer?
-       {
-               bShortTest = false;     // Then do the full test
+       if ((bTestBuff) &&      /*  Buffer check requested, and... */
+           (pdx->dwNumInput || pdx->dwNumOutput)) {    /*  ...characters were in the buffer? */
+               bShortTest = false;     /*  Then do the full test */
                dev_dbg(&pdx->interface->dev,
                        "%s will reset as buffers not empty", __func__);
        }
 
-       if (bShortTest || !bCanReset)   // Still OK to try the short test?
-       {                       // Always test if no reset - we want state update
+       if (bShortTest || !bCanReset) { /*  Still OK to try the short test? */
+                               /*  Always test if no reset - we want state update */
                unsigned int state, error;
                dev_dbg(&pdx->interface->dev, "%s->Get1401State", __func__);
-               if (Get1401State(pdx, &state, &error) == U14ERR_NOERROR)        // Check on the 1401 state
-               {
-                       if ((state & 0xFF) == 0)        // If call worked, check the status value
-                               bRet = true;    // If that was zero, all is OK, no reset needed
+               if (Get1401State(pdx, &state, &error) == U14ERR_NOERROR) {      /*  Check on the 1401 state */
+                       if ((state & 0xFF) == 0)        /*  If call worked, check the status value */
+                               bRet = true;    /*  If that was zero, all is OK, no reset needed */
                }
        }
 
-       if (!bRet && bCanReset) // If all not OK, then
-       {
+       if (!bRet && bCanReset) { /*  If all not OK, then */
                dev_info(&pdx->interface->dev, "%s->Is1401 %d %d %d %d",
                         __func__, bShortTest, pdx->sCurrentState, bTestBuff,
                         pdx->bForceReset);
-               bRet = Is1401(pdx);     //  do full test
+               bRet = Is1401(pdx);     /*   do full test */
        }
 
        return bRet;
@@ -412,11 +405,11 @@ bool QuickCheck(DEVICE_EXTENSION * pdx, bool bTestBuff, bool bCanReset)
 **
 ** Resets the 1401 and empties the i/o buffers
 *****************************************************************************/
-int Reset1401(DEVICE_EXTENSION * pdx)
+int Reset1401(DEVICE_EXTENSION *pdx)
 {
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
        dev_dbg(&pdx->interface->dev, "ABout to call QuickCheck");
-       QuickCheck(pdx, true, true);    // Check 1401, reset if not OK
+       QuickCheck(pdx, true, true);    /*  Check 1401, reset if not OK */
        mutex_unlock(&pdx->io_mutex);
        return U14ERR_NOERROR;
 }
@@ -426,30 +419,29 @@ int Reset1401(DEVICE_EXTENSION * pdx)
 **
 ** Gets a single character from the 1401
 ****************************************************************************/
-int GetChar(DEVICE_EXTENSION * pdx)
+int GetChar(DEVICE_EXTENSION *pdx)
 {
-       int iReturn = U14ERR_NOIN;      // assume we will get  nothing
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
+       int iReturn = U14ERR_NOIN;      /*  assume we will get  nothing */
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
 
        dev_dbg(&pdx->interface->dev, "GetChar");
 
-       Allowi(pdx);    // Make sure char reads are running
-       SendChars(pdx); // and send any buffered chars
+       Allowi(pdx);    /*  Make sure char reads are running */
+       SendChars(pdx); /*  and send any buffered chars */
 
        spin_lock_irq(&pdx->charInLock);
-       if (pdx->dwNumInput > 0)        // worth looking
-       {
+       if (pdx->dwNumInput > 0) {      /*  worth looking */
                iReturn = pdx->inputBuffer[pdx->dwInBuffGet++];
                if (pdx->dwInBuffGet >= INBUF_SZ)
                        pdx->dwInBuffGet = 0;
                pdx->dwNumInput--;
        } else
-               iReturn = U14ERR_NOIN;  // no input data to read
+               iReturn = U14ERR_NOIN;  /*  no input data to read */
        spin_unlock_irq(&pdx->charInLock);
 
-       Allowi(pdx);    // Make sure char reads are running
+       Allowi(pdx);    /*  Make sure char reads are running */
 
-       mutex_unlock(&pdx->io_mutex);   // Protect disconnect from new i/o
+       mutex_unlock(&pdx->io_mutex);   /*  Protect disconnect from new i/o */
        return iReturn;
 }
 
@@ -464,46 +456,43 @@ int GetChar(DEVICE_EXTENSION * pdx)
 ** returns the count of characters (including the terminator, or 0 if none
 ** or a negative error code.
 ****************************************************************************/
-int GetString(DEVICE_EXTENSION * pdx, char __user * pUser, int n)
+int GetString(DEVICE_EXTENSION *pdx, char __user *pUser, int n)
 {
-       int nAvailable;         // character in the buffer
+       int nAvailable;         /*  character in the buffer */
        int iReturn = U14ERR_NOIN;
        if (n <= 0)
                return -ENOMEM;
 
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
-       Allowi(pdx);    // Make sure char reads are running
-       SendChars(pdx);         // and send any buffered chars
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
+       Allowi(pdx);    /*  Make sure char reads are running */
+       SendChars(pdx);         /*  and send any buffered chars */
 
        spin_lock_irq(&pdx->charInLock);
-       nAvailable = pdx->dwNumInput;   // characters available now
-       if (nAvailable > n)     // read max of space in pUser...
-               nAvailable = n; // ...or input characters
+       nAvailable = pdx->dwNumInput;   /*  characters available now */
+       if (nAvailable > n)     /*  read max of space in pUser... */
+               nAvailable = n; /*  ...or input characters */
 
-       if (nAvailable > 0)     // worth looking?
-       {
-               char buffer[INBUF_SZ + 1];      // space for a linear copy of data
+       if (nAvailable > 0) {   /*  worth looking? */
+               char buffer[INBUF_SZ + 1];      /*  space for a linear copy of data */
                int nGot = 0;
-               int nCopyToUser;        // number to copy to user
+               int nCopyToUser;        /*  number to copy to user */
                char cData;
                do {
                        cData = pdx->inputBuffer[pdx->dwInBuffGet++];
-                       if (cData == CR_CHAR)   // replace CR with zero
+                       if (cData == CR_CHAR)   /*  replace CR with zero */
                                cData = (char)0;
 
                        if (pdx->dwInBuffGet >= INBUF_SZ)
-                               pdx->dwInBuffGet = 0;   // wrap buffer pointer
+                               pdx->dwInBuffGet = 0;   /*  wrap buffer pointer */
 
-                       buffer[nGot++] = cData; // save the output
-               }
-               while ((nGot < nAvailable) && cData);
-
-               nCopyToUser = nGot;     // what to copy...
-               if (cData)      // do we need null
-               {
-                       buffer[nGot] = (char)0; // make it tidy
-                       if (nGot < n)   // if space in user buffer...
-                               ++nCopyToUser;  // ...copy the 0 as well.
+                       buffer[nGot++] = cData; /*  save the output */
+               } while ((nGot < nAvailable) && cData);
+
+               nCopyToUser = nGot;     /*  what to copy... */
+               if (cData) {    /*  do we need null */
+                       buffer[nGot] = (char)0; /*  make it tidy */
+                       if (nGot < n)   /*  if space in user buffer... */
+                               ++nCopyToUser;  /*  ...copy the 0 as well. */
                }
 
                pdx->dwNumInput -= nGot;
@@ -514,12 +503,12 @@ int GetString(DEVICE_EXTENSION * pdx, char __user * pUser, int n)
                if (copy_to_user(pUser, buffer, nCopyToUser))
                        iReturn = -EFAULT;
                else
-                       iReturn = nGot;         // report characters read
+                       iReturn = nGot;         /*  report characters read */
        } else
                spin_unlock_irq(&pdx->charInLock);
 
-       Allowi(pdx);    // Make sure char reads are running
-       mutex_unlock(&pdx->io_mutex);   // Protect disconnect from new i/o
+       Allowi(pdx);    /*  Make sure char reads are running */
+       mutex_unlock(&pdx->io_mutex);   /*  Protect disconnect from new i/o */
 
        return iReturn;
 }
@@ -527,14 +516,14 @@ int GetString(DEVICE_EXTENSION * pdx, char __user * pUser, int n)
 /*******************************************************************************
 ** Get count of characters in the inout buffer.
 *******************************************************************************/
-int Stat1401(DEVICE_EXTENSION * pdx)
+int Stat1401(DEVICE_EXTENSION *pdx)
 {
        int iReturn;
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
-       Allowi(pdx);            // make sure we allow pending chars
-       SendChars(pdx);         // in both directions
-       iReturn = pdx->dwNumInput;      // no lock as single read
-       mutex_unlock(&pdx->io_mutex);   // Protect disconnect from new i/o
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
+       Allowi(pdx);            /*  make sure we allow pending chars */
+       SendChars(pdx);         /*  in both directions */
+       iReturn = pdx->dwNumInput;      /*  no lock as single read */
+       mutex_unlock(&pdx->io_mutex);   /*  Protect disconnect from new i/o */
        return iReturn;
 }
 
@@ -545,32 +534,30 @@ int Stat1401(DEVICE_EXTENSION * pdx)
 ** any fancy interlocks as we only read the interrupt routine data, and the
 ** system is arranged so nothing can be destroyed.
 ****************************************************************************/
-int LineCount(DEVICE_EXTENSION * pdx)
+int LineCount(DEVICE_EXTENSION *pdx)
 {
-       int iReturn = 0;        // will be count of line ends
+       int iReturn = 0;        /*  will be count of line ends */
 
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
-       Allowi(pdx);            // Make sure char reads are running
-       SendChars(pdx);         // and send any buffered chars
-       spin_lock_irq(&pdx->charInLock);        // Get protection
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
+       Allowi(pdx);            /*  Make sure char reads are running */
+       SendChars(pdx);         /*  and send any buffered chars */
+       spin_lock_irq(&pdx->charInLock);        /*  Get protection */
 
-       if (pdx->dwNumInput > 0)        // worth looking?
-       {
-               unsigned int dwIndex = pdx->dwInBuffGet;        // start at first available
-               unsigned int dwEnd = pdx->dwInBuffPut;  // Position for search end
+       if (pdx->dwNumInput > 0) {      /*  worth looking? */
+               unsigned int dwIndex = pdx->dwInBuffGet;        /*  start at first available */
+               unsigned int dwEnd = pdx->dwInBuffPut;  /*  Position for search end */
                do {
                        if (pdx->inputBuffer[dwIndex++] == CR_CHAR)
-                               ++iReturn;      // inc count if CR
+                               ++iReturn;      /*  inc count if CR */
 
-                       if (dwIndex >= INBUF_SZ)        // see if we fall off buff
+                       if (dwIndex >= INBUF_SZ)        /*  see if we fall off buff */
                                dwIndex = 0;
-               }
-               while (dwIndex != dwEnd);       // go to last available
+               } while (dwIndex != dwEnd);     /*  go to last available */
        }
 
        spin_unlock_irq(&pdx->charInLock);
        dev_dbg(&pdx->interface->dev, "LineCount returned %d", iReturn);
-       mutex_unlock(&pdx->io_mutex);   // Protect disconnect from new i/o
+       mutex_unlock(&pdx->io_mutex);   /*  Protect disconnect from new i/o */
        return iReturn;
 }
 
@@ -579,14 +566,14 @@ int LineCount(DEVICE_EXTENSION * pdx)
 **
 ** Gets the space in the output buffer. Called from user code.
 *****************************************************************************/
-int GetOutBufSpace(DEVICE_EXTENSION * pdx)
+int GetOutBufSpace(DEVICE_EXTENSION *pdx)
 {
        int iReturn;
-       mutex_lock(&pdx->io_mutex);     // Protect disconnect from new i/o
-       SendChars(pdx);         // send any buffered chars
-       iReturn = (int)(OUTBUF_SZ - pdx->dwNumOutput);  // no lock needed for single read
+       mutex_lock(&pdx->io_mutex);     /*  Protect disconnect from new i/o */
+       SendChars(pdx);         /*  send any buffered chars */
+       iReturn = (int)(OUTBUF_SZ - pdx->dwNumOutput);  /*  no lock needed for single read */
        dev_dbg(&pdx->interface->dev, "OutBufSpace %d", iReturn);
-       mutex_unlock(&pdx->io_mutex);   // Protect disconnect from new i/o
+       mutex_unlock(&pdx->io_mutex);   /*  Protect disconnect from new i/o */
        return iReturn;
 }
 
@@ -597,7 +584,7 @@ int GetOutBufSpace(DEVICE_EXTENSION * pdx)
 ** Clears up a transfer area. This is always called in the context of a user
 ** request, never from a call-back.
 ****************************************************************************/
-int ClearArea(DEVICE_EXTENSION * pdx, int nArea)
+int ClearArea(DEVICE_EXTENSION *pdx, int nArea)
 {
        int iReturn = U14ERR_NOERROR;
 
@@ -606,14 +593,14 @@ int ClearArea(DEVICE_EXTENSION * pdx, int nArea)
                dev_err(&pdx->interface->dev, "%s Attempt to clear area %d",
                        __func__, nArea);
        } else {
-               TRANSAREA *pTA = &pdx->rTransDef[nArea];        // to save typing
-               if (!pTA->bUsed)        // if not used...
-                       iReturn = U14ERR_NOTSET;        // ...nothing to be done
+               TRANSAREA *pTA = &pdx->rTransDef[nArea];        /*  to save typing */
+               if (!pTA->bUsed)        /*  if not used... */
+                       iReturn = U14ERR_NOTSET;        /*  ...nothing to be done */
                else {
-                       // We must save the memory we return as we shouldn't mess with memory while
-                       // holding a spin lock.
-                       struct page **pPages = 0;       // save page address list
-                       int nPages = 0; // and number of pages
+                       /*  We must save the memory we return as we shouldn't mess with memory while */
+                       /*  holding a spin lock. */
+                       struct page **pPages = NULL; /*save page address list*/
+                       int nPages = 0; /*  and number of pages */
                        int np;
 
                        dev_dbg(&pdx->interface->dev, "%s area %d", __func__,
@@ -621,33 +608,32 @@ int ClearArea(DEVICE_EXTENSION * pdx, int nArea)
                        spin_lock_irq(&pdx->stagedLock);
                        if ((pdx->StagedId == nArea)
                            && (pdx->dwDMAFlag > MODE_CHAR)) {
-                               iReturn = U14ERR_UNLOCKFAIL;    // cannot delete as in use
+                               iReturn = U14ERR_UNLOCKFAIL;    /*  cannot delete as in use */
                                dev_err(&pdx->interface->dev,
                                        "%s call on area %d while active",
                                        __func__, nArea);
                        } else {
-                               pPages = pTA->pPages;   // save page address list
-                               nPages = pTA->nPages;   // and page count
-                               if (pTA->dwEventSz)     // if events flagging in use
-                                       wake_up_interruptible(&pTA->wqEvent);   // release anything that was waiting
+                               pPages = pTA->pPages;   /*  save page address list */
+                               nPages = pTA->nPages;   /*  and page count */
+                               if (pTA->dwEventSz)     /*  if events flagging in use */
+                                       wake_up_interruptible(&pTA->wqEvent);   /*  release anything that was waiting */
 
                                if (pdx->bXFerWaiting
                                    && (pdx->rDMAInfo.wIdent == nArea))
-                                       pdx->bXFerWaiting = false;      // Cannot have pending xfer if area cleared
+                                       pdx->bXFerWaiting = false;      /*  Cannot have pending xfer if area cleared */
 
-                               // Clean out the TRANSAREA except for the wait queue, which is at the end
-                               // This sets bUsed to false and dwEventSz to 0 to say area not used and no events.
+                               /*  Clean out the TRANSAREA except for the wait queue, which is at the end */
+                               /*  This sets bUsed to false and dwEventSz to 0 to say area not used and no events. */
                                memset(pTA, 0,
                                       sizeof(TRANSAREA) -
                                       sizeof(wait_queue_head_t));
                        }
                        spin_unlock_irq(&pdx->stagedLock);
 
-                       if (pPages)     // if we decided to release the memory
-                       {
-                               // Now we must undo the pinning down of the pages. We will assume the worst and mark
-                               // all the pages as dirty. Don't be tempted to move this up above as you must not be
-                               // holding a spin lock to do this stuff as it is not atomic.
+                       if (pPages) {   /*  if we decided to release the memory */
+                               /*  Now we must undo the pinning down of the pages. We will assume the worst and mark */
+                               /*  all the pages as dirty. Don't be tempted to move this up above as you must not be */
+                               /*  holding a spin lock to do this stuff as it is not atomic. */
                                dev_dbg(&pdx->interface->dev, "%s nPages=%d",
                                        __func__, nPages);
 
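The hunk is cut short here, but the comments describe the step that follows: the saved page list is released outside the spin lock, with every page marked dirty. A sketch of what such a release loop looks like against the 3.10-era page APIs (an assumed helper, not the driver's exact code):

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/slab.h>

    /* Release pages pinned with get_user_pages(). Must not be called
     * while holding a spin lock: SetPageDirty()/page_cache_release()
     * are not safe to call from atomic context here. */
    static void release_pinned_pages(struct page **pages, int n_pages)
    {
            int np;

            for (np = 0; np < n_pages; np++) {
                    if (pages[np]) {
                            SetPageDirty(pages[np]);        /* assume the worst: mark dirty */
                            page_cache_release(pages[np]);  /* drop the pin */
                    }
            }
            kfree(pages);   /* free the page pointer table itself */
    }
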
@@ -674,29 +660,29 @@ int ClearArea(DEVICE_EXTENSION * pdx, int nArea)
 ** Sets up a transfer area - the functional part. Called by both
 ** SetTransfer and SetCircular.
 ****************************************************************************/
-static int SetArea(DEVICE_EXTENSION * pdx, int nArea, char __user * puBuf,
+static int SetArea(DEVICE_EXTENSION *pdx, int nArea, char __user *puBuf,
                   unsigned int dwLength, bool bCircular, bool bCircToHost)
 {
-       // Start by working out the page aligned start of the area and the size
-       // of the area in pages, allowing for the start not being aligned and the
-       // end needing to be rounded up to a page boundary.
+       /*  Start by working out the page aligned start of the area and the size */
+       /*  of the area in pages, allowing for the start not being aligned and the */
+       /*  end needing to be rounded up to a page boundary. */
        unsigned long ulStart = ((unsigned long)puBuf) & PAGE_MASK;
        unsigned int ulOffset = ((unsigned long)puBuf) & (PAGE_SIZE - 1);
        int len = (dwLength + ulOffset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-       TRANSAREA *pTA = &pdx->rTransDef[nArea];        // to save typing
-       struct page **pPages = 0;       // space for page tables
-       int nPages = 0;         // and number of pages
+       TRANSAREA *pTA = &pdx->rTransDef[nArea];        /*  to save typing */
+       struct page **pPages = NULL;    /*  space for page tables */
+       int nPages = 0;         /*  and number of pages */
 
-       int iReturn = ClearArea(pdx, nArea);    // see if OK to use this area
-       if ((iReturn != U14ERR_NOTSET) &&       // if not area unused and...
-           (iReturn != U14ERR_NOERROR))        // ...not all OK, then...
-               return iReturn; // ...we cannot use this area
+       int iReturn = ClearArea(pdx, nArea);    /*  see if OK to use this area */
+       if ((iReturn != U14ERR_NOTSET) &&       /*  if not area unused and... */
+           (iReturn != U14ERR_NOERROR))        /*  ...not all OK, then... */
+               return iReturn; /*  ...we cannot use this area */
 
-       if (!access_ok(VERIFY_WRITE, puBuf, dwLength))  // if we cannot access the memory...
-               return -EFAULT; // ...then we are done
+       if (!access_ok(VERIFY_WRITE, puBuf, dwLength))  /*  if we cannot access the memory... */
+               return -EFAULT; /*  ...then we are done */
 
-       // Now allocate space to hold the page pointer and virtual address pointer tables
+       /*  Now allocate space to hold the page pointer and virtual address pointer tables */
        pPages = kmalloc(len * sizeof(struct page *), GFP_KERNEL);
        if (!pPages) {
                iReturn = U14ERR_NOMEMORY;
@@ -705,24 +691,23 @@ static int SetArea(DEVICE_EXTENSION * pdx, int nArea, char __user * puBuf,
        dev_dbg(&pdx->interface->dev, "%s %p, length=%06x, circular %d",
                __func__, puBuf, dwLength, bCircular);
 
-       // To pin down user pages we must first acquire the mapping semaphore.
-       down_read(&current->mm->mmap_sem);      // get memory map semaphore
-       nPages =
-           get_user_pages(current, current->mm, ulStart, len, 1, 0, pPages, 0);
-       up_read(&current->mm->mmap_sem);        // release the semaphore
+       /*  To pin down user pages we must first acquire the mapping semaphore. */
+       down_read(&current->mm->mmap_sem);      /*  get memory map semaphore */
+       nPages = get_user_pages(current, current->mm, ulStart, len, 1, 0,
+                               pPages, NULL);
+       up_read(&current->mm->mmap_sem);        /*  release the semaphore */
        dev_dbg(&pdx->interface->dev, "%s nPages = %d", __func__, nPages);
 
-       if (nPages > 0)         // if we succeeded
-       {
-               // If you are tempted to use page_address (form LDD3), forget it. You MUST use
-               // kmap() or kmap_atomic() to get a virtual address. page_address will give you
-               // (null) or at least it does in this context with an x86 machine.
+       if (nPages > 0) {               /*  if we succeeded */
+               /*  If you are tempted to use page_address (from LDD3), forget it. You MUST use */
+               /*  kmap() or kmap_atomic() to get a virtual address. page_address will give you */
+               /*  (null) or at least it does in this context with an x86 machine. */
                spin_lock_irq(&pdx->stagedLock);
-               pTA->lpvBuff = puBuf;   // keep start of region (user address)
-               pTA->dwBaseOffset = ulOffset;   // save offset in first page to start of xfer
-               pTA->dwLength = dwLength;       // Size if the region in bytes
-               pTA->pPages = pPages;   // list of pages that are used by buffer
-               pTA->nPages = nPages;   // number of pages
+               pTA->lpvBuff = puBuf;   /*  keep start of region (user address) */
+               pTA->dwBaseOffset = ulOffset;   /*  save offset in first page to start of xfer */
+               pTA->dwLength = dwLength;       /*  Size of the region in bytes */
+               pTA->pPages = pPages;   /*  list of pages that are used by buffer */
+               pTA->nPages = nPages;   /*  number of pages */
 
                pTA->bCircular = bCircular;
                pTA->bCircToHost = bCircToHost;
@@ -731,10 +716,10 @@ static int SetArea(DEVICE_EXTENSION * pdx, int nArea, char __user * puBuf,
                pTA->aBlocks[0].dwSize = 0;
                pTA->aBlocks[1].dwOffset = 0;
                pTA->aBlocks[1].dwSize = 0;
-               pTA->bUsed = true;      // This is now a used block
+               pTA->bUsed = true;      /*  This is now a used block */
 
                spin_unlock_irq(&pdx->stagedLock);
-               iReturn = U14ERR_NOERROR;       // say all was well
+               iReturn = U14ERR_NOERROR;       /*  say all was well */
        } else {
                iReturn = U14ERR_LOCKFAIL;
                goto error;
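
get_user_pages() may pin fewer pages than requested; the SetArea() code above treats any positive count as success, but a more defensive pattern releases a partial pin before failing. A sketch against the 3.10-era signature (task and mm passed explicitly; names are illustrative):

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/sched.h>

    /* Pin 'len' user pages starting at page-aligned 'start' for writing.
     * On a partial pin, release what we got and fail, so the caller
     * never sees a half-pinned area. Not the driver's exact code. */
    static int pin_user_area(unsigned long start, int len, struct page **pages)
    {
            int n, i;

            down_read(&current->mm->mmap_sem);      /* serialize with the VM */
            n = get_user_pages(current, current->mm, start, len,
                               1 /* write */, 0 /* force */, pages, NULL);
            up_read(&current->mm->mmap_sem);

            if (n == len)
                    return n;               /* pinned everything we asked for */

            for (i = 0; i < n; i++)         /* partial pin: undo and fail */
                    page_cache_release(pages[i]);
            return -EFAULT;
    }
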
@@ -754,7 +739,7 @@ error:
 ** unset it. Unsetting will fail if the area is booked and a transfer to that
 ** area is in progress. Otherwise, we will release the area and re-assign it.
 ****************************************************************************/
-int SetTransfer(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
+int SetTransfer(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD)
 {
        int iReturn;
        TRANSFERDESC td;
@@ -765,9 +750,9 @@ int SetTransfer(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
        mutex_lock(&pdx->io_mutex);
        dev_dbg(&pdx->interface->dev, "%s area:%d, size:%08x", __func__,
                td.wAreaNum, td.dwLength);
-       // The strange cast is done so that we don't get warnings in 32-bit linux about the size of the
-       // pointer. The pointer is always passed as a 64-bit object so that we don't have problems using
-       // a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system.
+       /*  The strange cast is done so that we don't get warnings in 32-bit linux about the size of the */
+       /*  pointer. The pointer is always passed as a 64-bit object so that we don't have problems using */
+       /*  a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system. */
        iReturn =
            SetArea(pdx, td.wAreaNum,
                    (char __user *)((unsigned long)td.lpvBuff), td.dwLength,
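
On the user-space side this convention means the buffer pointer is widened to 64 bits before the descriptor is handed to the driver, whatever the process's word size. A sketch of filling such a descriptor (only the field names come from the code above; the order and exact types here are illustrative):

    #include <stdint.h>

    struct transfer_desc {          /* mirrors TRANSFERDESC's pointer handling */
            uint64_t lpvBuff;       /* user address, always carried as 64 bits */
            uint32_t dwLength;      /* length of the area in bytes */
            uint16_t wAreaNum;      /* which transfer area to set */
            int16_t  eSize;         /* element size / tohost flag for circular */
    };

    static void fill_desc(struct transfer_desc *td, void *buf,
                          uint32_t len, uint16_t area)
    {
            /* Cast via uintptr_t so a 32-bit pointer widens cleanly to 64 bits. */
            td->lpvBuff = (uint64_t)(uintptr_t)buf;
            td->dwLength = len;
            td->wAreaNum = area;
            td->eSize = 0;
    }
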
@@ -780,7 +765,7 @@ int SetTransfer(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
 ** UnSetTransfer
 ** Erases a transfer area record
 ****************************************************************************/
-int UnsetTransfer(DEVICE_EXTENSION * pdx, int nArea)
+int UnsetTransfer(DEVICE_EXTENSION *pdx, int nArea)
 {
        int iReturn;
        mutex_lock(&pdx->io_mutex);
@@ -797,27 +782,26 @@ int UnsetTransfer(DEVICE_EXTENSION * pdx, int nArea)
 ** pretend that whatever the user asked for was achieved, so we return 1 if they
 ** try to create one, and 0 if they ask to remove (assuming all else was OK).
 ****************************************************************************/
-int SetEvent(DEVICE_EXTENSION * pdx, TRANSFEREVENT __user * pTE)
+int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user *pTE)
 {
        int iReturn = U14ERR_NOERROR;
        TRANSFEREVENT te;
 
-       // get a local copy of the data
+       /*  get a local copy of the data */
        if (copy_from_user(&te, pTE, sizeof(te)))
                return -EFAULT;
 
-       if (te.wAreaNum >= MAX_TRANSAREAS)      // the area must exist
+       if (te.wAreaNum >= MAX_TRANSAREAS)      /*  the area must exist */
                return U14ERR_BADAREA;
        else {
                TRANSAREA *pTA = &pdx->rTransDef[te.wAreaNum];
-               mutex_lock(&pdx->io_mutex);     // make sure we have no competitor
+               mutex_lock(&pdx->io_mutex);     /*  make sure we have no competitor */
                spin_lock_irq(&pdx->stagedLock);
-               if (pTA->bUsed) // area must be in use
-               {
-                       pTA->dwEventSt = te.dwStart;    // set area regions
-                       pTA->dwEventSz = te.dwLength;   // set size (0 cancels it)
-                       pTA->bEventToHost = te.wFlags & 1;      // set the direction
-                       pTA->iWakeUp = 0;       // zero the wake up count
+               if (pTA->bUsed) {       /*  area must be in use */
+                       pTA->dwEventSt = te.dwStart;    /*  set area regions */
+                       pTA->dwEventSz = te.dwLength;   /*  set size (0 cancels it) */
+                       pTA->bEventToHost = te.wFlags & 1;      /*  set the direction */
+                       pTA->iWakeUp = 0;       /*  zero the wake up count */
                } else
                        iReturn = U14ERR_NOTSET;
                spin_unlock_irq(&pdx->stagedLock);
@@ -833,7 +817,7 @@ int SetEvent(DEVICE_EXTENSION * pdx, TRANSFEREVENT __user * pTE)
 ** of times that a block met the event condition since we last cleared it or
 ** 0 if timed out, or -ve error (bad area or not set, or signal).
 ****************************************************************************/
-int WaitEvent(DEVICE_EXTENSION * pdx, int nArea, int msTimeOut)
+int WaitEvent(DEVICE_EXTENSION *pdx, int nArea, int msTimeOut)
 {
        int iReturn;
        if ((unsigned)nArea >= MAX_TRANSAREAS)
@@ -841,15 +825,15 @@ int WaitEvent(DEVICE_EXTENSION * pdx, int nArea, int msTimeOut)
        else {
                int iWait;
                TRANSAREA *pTA = &pdx->rTransDef[nArea];
-               msTimeOut = (msTimeOut * HZ + 999) / 1000;      // convert timeout to jiffies
-
-               // We cannot wait holding the mutex, but we check the flags while holding
-               // it. This may well be pointless as another thread could get in between
-               // releasing it and the wait call. However, this would have to clear the
-               // iWakeUp flag. However, the !pTA-bUsed may help us in this case.
-               mutex_lock(&pdx->io_mutex);     // make sure we have no competitor
-               if (!pTA->bUsed || !pTA->dwEventSz)     // check something to wait for...
-                       return U14ERR_NOTSET;   // ...else we do nothing
+               msTimeOut = (msTimeOut * HZ + 999) / 1000;      /*  convert timeout to jiffies */
+
+               /*  We cannot wait holding the mutex, but we check the flags while holding */
+               /*  it. This may well be pointless, as another thread could get in between */
+               /*  releasing it and the wait call; however, that thread would have to */
+               /*  clear the iWakeUp flag. The !pTA->bUsed check may help us in this case. */
+               mutex_lock(&pdx->io_mutex);     /*  make sure we have no competitor */
+               if (!pTA->bUsed || !pTA->dwEventSz) {   /*  check something to wait for... */
+                       mutex_unlock(&pdx->io_mutex);   /*  ...else unlock and do nothing */
+                       return U14ERR_NOTSET;
+               }
                mutex_unlock(&pdx->io_mutex);
 
                if (msTimeOut)
@@ -863,12 +847,12 @@ int WaitEvent(DEVICE_EXTENSION * pdx, int nArea, int msTimeOut)
                            wait_event_interruptible(pTA->wqEvent, pTA->iWakeUp
                                                     || !pTA->bUsed);
                if (iWait)
-                       iReturn = -ERESTARTSYS; // oops - we have had a SIGNAL
+                       iReturn = -ERESTARTSYS; /*  oops - we have had a SIGNAL */
                else
-                       iReturn = pTA->iWakeUp; // else the wakeup count
+                       iReturn = pTA->iWakeUp; /*  else the wakeup count */
 
                spin_lock_irq(&pdx->stagedLock);
-               pTA->iWakeUp = 0;       // clear the flag
+               pTA->iWakeUp = 0;       /*  clear the flag */
                spin_unlock_irq(&pdx->stagedLock);
        }
        return iReturn;
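
The open-coded (msTimeOut * HZ + 999) / 1000 rounds the timeout up to whole jiffies; the kernel's msecs_to_jiffies() performs the same round-up. A sketch of the wait step using it (illustrative names; the driver's separate no-timeout branch is omitted here):

    #include <linux/jiffies.h>
    #include <linux/wait.h>

    /* Wait up to ms_timeout milliseconds for 'wake_count' to become
     * non-zero, as WaitEvent() does above. Returns the wake count,
     * 0 on timeout, or -ERESTARTSYS if a signal interrupted us. */
    static int wait_for_event(wait_queue_head_t *wq, int *wake_count,
                              int ms_timeout)
    {
            long t = msecs_to_jiffies(ms_timeout);  /* rounds up like (*HZ+999)/1000 */
            long left = wait_event_interruptible_timeout(*wq, *wake_count, t);

            if (left < 0)
                    return -ERESTARTSYS;    /* interrupted by a signal */
            return *wake_count;             /* 0 here means we timed out */
    }
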
@@ -880,17 +864,17 @@ int WaitEvent(DEVICE_EXTENSION * pdx, int nArea, int msTimeOut)
 ** number of times a block completed since the last call, or 0 if none or a
 ** negative error.
 ****************************************************************************/
-int TestEvent(DEVICE_EXTENSION * pdx, int nArea)
+int TestEvent(DEVICE_EXTENSION *pdx, int nArea)
 {
        int iReturn;
        if ((unsigned)nArea >= MAX_TRANSAREAS)
                iReturn = U14ERR_BADAREA;
        else {
                TRANSAREA *pTA = &pdx->rTransDef[nArea];
-               mutex_lock(&pdx->io_mutex);     // make sure we have no competitor
+               mutex_lock(&pdx->io_mutex);     /*  make sure we have no competitor */
                spin_lock_irq(&pdx->stagedLock);
-               iReturn = pTA->iWakeUp; // get wakeup count since last call
-               pTA->iWakeUp = 0;       // clear the count
+               iReturn = pTA->iWakeUp; /*  get wakeup count since last call */
+               pTA->iWakeUp = 0;       /*  clear the count */
                spin_unlock_irq(&pdx->stagedLock);
                mutex_unlock(&pdx->io_mutex);
        }
@@ -901,17 +885,17 @@ int TestEvent(DEVICE_EXTENSION * pdx, int nArea)
 ** GetTransferInfo
 ** Puts the current state of the 1401 in a TGET_TX_BLOCK.
 *****************************************************************************/
-int GetTransfer(DEVICE_EXTENSION * pdx, TGET_TX_BLOCK __user * pTX)
+int GetTransfer(DEVICE_EXTENSION *pdx, TGET_TX_BLOCK __user *pTX)
 {
        int iReturn = U14ERR_NOERROR;
        unsigned int dwIdent;
 
        mutex_lock(&pdx->io_mutex);
-       dwIdent = pdx->StagedId;        // area ident for last xfer
+       dwIdent = pdx->StagedId;        /*  area ident for last xfer */
        if (dwIdent >= MAX_TRANSAREAS)
                iReturn = U14ERR_BADAREA;
        else {
-               // Return the best information we have - we don't have physical addresses
+               /*  Return the best information we have - we don't have physical addresses */
                TGET_TX_BLOCK *tx;
 
                tx = kzalloc(sizeof(*tx), GFP_KERNEL);
@@ -921,8 +905,8 @@ int GetTransfer(DEVICE_EXTENSION * pdx, TGET_TX_BLOCK __user * pTX)
                }
                tx->size = pdx->rTransDef[dwIdent].dwLength;
                tx->linear = (long long)((long)pdx->rTransDef[dwIdent].lpvBuff);
-               tx->avail = GET_TX_MAXENTRIES;  // how many blocks we could return
-               tx->used = 1;   // number we actually return
+               tx->avail = GET_TX_MAXENTRIES;  /*  how many blocks we could return */
+               tx->used = 1;   /*  number we actually return */
                tx->entries[0].physical =
                    (long long)(tx->linear + pdx->StagedOffset);
                tx->entries[0].size = tx->size;
@@ -940,7 +924,7 @@ int GetTransfer(DEVICE_EXTENSION * pdx, TGET_TX_BLOCK __user * pTX)
 **
 ** Empties the host i/o buffers
 ****************************************************************************/
-int KillIO1401(DEVICE_EXTENSION * pdx)
+int KillIO1401(DEVICE_EXTENSION *pdx)
 {
        dev_dbg(&pdx->interface->dev, "%s", __func__);
        mutex_lock(&pdx->io_mutex);
@@ -955,7 +939,7 @@ int KillIO1401(DEVICE_EXTENSION * pdx)
 ** Returns a 0 or a 1 for whether DMA is happening. No point holding a mutex
 ** for this as it only does one read.
 *****************************************************************************/
-int BlkTransState(DEVICE_EXTENSION * pdx)
+int BlkTransState(DEVICE_EXTENSION *pdx)
 {
        int iReturn = pdx->dwDMAFlag != MODE_CHAR;
        dev_dbg(&pdx->interface->dev, "%s = %d", __func__, iReturn);
@@ -967,12 +951,12 @@ int BlkTransState(DEVICE_EXTENSION * pdx)
 **
 ** Puts the current state of the 1401 in the Irp return buffer.
 *****************************************************************************/
-int StateOf1401(DEVICE_EXTENSION * pdx)
+int StateOf1401(DEVICE_EXTENSION *pdx)
 {
        int iReturn;
        mutex_lock(&pdx->io_mutex);
 
-       QuickCheck(pdx, false, false);  // get state up to date, no reset
+       QuickCheck(pdx, false, false);  /*  get state up to date, no reset */
        iReturn = pdx->sCurrentState;
 
        mutex_unlock(&pdx->io_mutex);
@@ -987,20 +971,23 @@ int StateOf1401(DEVICE_EXTENSION * pdx)
 ** Initiates a self-test cycle. The assumption is that we have no interrupts
 ** active, so we should make sure that this is the case.
 *****************************************************************************/
-int StartSelfTest(DEVICE_EXTENSION * pdx)
+int StartSelfTest(DEVICE_EXTENSION *pdx)
 {
        int nGot;
        mutex_lock(&pdx->io_mutex);
        dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-       ced_draw_down(pdx);     // wait for, then kill outstanding Urbs
-       FlushInBuff(pdx);       // Clear out input buffer & pipe
-       FlushOutBuff(pdx);      // Clear output buffer & pipe
-//    ReadWrite_Cancel(pDeviceObject);        /* so things stay tidy */
+       ced_draw_down(pdx);     /*  wait for, then kill outstanding Urbs */
+       FlushInBuff(pdx);       /*  Clear out input buffer & pipe */
+       FlushOutBuff(pdx);      /*  Clear output buffer & pipe */
+       /* so things stay tidy */
+       /* ReadWrite_Cancel(pDeviceObject); */
        pdx->dwDMAFlag = MODE_CHAR;     /* Clear DMA mode flags here */
 
-       nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0), DB_SELFTEST, (H_TO_D | VENDOR | DEVREQ), 0, 0, 0, 0, HZ);      // allow 1 second timeout
-       pdx->ulSelfTestTime = jiffies + HZ * 30;        // 30 seconds into the future
+       nGot = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
+                              DB_SELFTEST, (H_TO_D | VENDOR | DEVREQ),
+                              0, 0, NULL, 0, HZ); /* allow 1 second timeout */
+       pdx->ulSelfTestTime = jiffies + HZ * 30;        /*  30 seconds into the future */
 
        mutex_unlock(&pdx->io_mutex);
        if (nGot < 0)
@@ -1013,53 +1000,49 @@ int StartSelfTest(DEVICE_EXTENSION * pdx)
 **
 ** Check progress of a self-test cycle
 ****************************************************************************/
-int CheckSelfTest(DEVICE_EXTENSION * pdx, TGET_SELFTEST __user * pGST)
+int CheckSelfTest(DEVICE_EXTENSION *pdx, TGET_SELFTEST __user *pGST)
 {
        unsigned int state, error;
        int iReturn;
-       TGET_SELFTEST gst;      // local work space
-       memset(&gst, 0, sizeof(gst));   // clear out the space (sets code 0)
+       TGET_SELFTEST gst;      /*  local work space */
+       memset(&gst, 0, sizeof(gst));   /*  clear out the space (sets code 0) */
 
        mutex_lock(&pdx->io_mutex);
 
        dev_dbg(&pdx->interface->dev, "%s", __func__);
        iReturn = Get1401State(pdx, &state, &error);
-       if (iReturn == U14ERR_NOERROR)  // Only accept zero if it happens twice
+       if (iReturn == U14ERR_NOERROR)  /*  Only accept zero if it happens twice */
                iReturn = Get1401State(pdx, &state, &error);
 
-       if (iReturn != U14ERR_NOERROR)  // Self-test can cause comms errors
-       {                       // so we assume still testing
+       if (iReturn != U14ERR_NOERROR) {        /*  Self-test can cause comms errors */
+                               /*  so we assume still testing */
                dev_err(&pdx->interface->dev,
                        "%s Get1401State=%d, assuming still testing", __func__,
                        iReturn);
-               state = 0x80;   // Force still-testing, no error
+               state = 0x80;   /*  Force still-testing, no error */
                error = 0;
                iReturn = U14ERR_NOERROR;
        }
 
-       if ((state == -1) && (error == -1))     // If Get1401State had problems
-       {
+       if ((state == -1) && (error == -1)) {   /*  If Get1401State had problems */
                dev_err(&pdx->interface->dev,
                        "%s Get1401State failed, assuming still testing",
                        __func__);
-               state = 0x80;   // Force still-testing, no error
+               state = 0x80;   /*  Force still-testing, no error */
                error = 0;
        }
 
-       if ((state & 0xFF) == 0x80)     // If we are still in self-test
-       {
-               if (state & 0x00FF0000) // Have we got an error?
-               {
-                       gst.code = (state & 0x00FF0000) >> 16;  // read the error code
-                       gst.x = error & 0x0000FFFF;     // Error data X
-                       gst.y = (error & 0xFFFF0000) >> 16;     // and data Y
+       if ((state & 0xFF) == 0x80) {   /*  If we are still in self-test */
+               if (state & 0x00FF0000) { /*  Have we got an error? */
+                       gst.code = (state & 0x00FF0000) >> 16;  /*  read the error code */
+                       gst.x = error & 0x0000FFFF;     /*  Error data X */
+                       gst.y = (error & 0xFFFF0000) >> 16;     /*  and data Y */
                        dev_dbg(&pdx->interface->dev, "Self-test error code %d",
                                gst.code);
-               } else          // No error, check for timeout
-               {
-                       unsigned long ulNow = jiffies;  // get current time
+               } else {                /*  No error, check for timeout */
+                       unsigned long ulNow = jiffies;  /*  get current time */
                        if (time_after(ulNow, pdx->ulSelfTestTime)) {
-                               gst.code = -2;  // Flag the timeout
+                               gst.code = -2;  /*  Flag the timeout */
                                dev_dbg(&pdx->interface->dev,
                                        "Self-test timed-out");
                        } else
@@ -1067,16 +1050,16 @@ int CheckSelfTest(DEVICE_EXTENSION * pdx, TGET_SELFTEST __user * pGST)
                                        "Self-test on-going");
                }
        } else {
-               gst.code = -1;  // Flag the test is done
+               gst.code = -1;  /*  Flag the test is done */
                dev_dbg(&pdx->interface->dev, "Self-test done");
        }
 
-       if (gst.code < 0)       // If we have a problem or finished
-       {                       // If using the 2890 we should reset properly
+       if (gst.code < 0) {     /*  If we have a problem or finished */
+                               /*  If using the 2890 we should reset properly */
                if ((pdx->nPipes == 4) && (pdx->s1401Type <= TYPEPOWER))
-                       Is1401(pdx);    // Get 1401 reset and OK
+                       Is1401(pdx);    /*  Get 1401 reset and OK */
                else
-                       QuickCheck(pdx, true, true);    // Otherwise check without reset unless problems
+                       QuickCheck(pdx, true, true);    /*  Otherwise check without reset unless problems */
        }
        mutex_unlock(&pdx->io_mutex);
 
@@ -1091,7 +1074,7 @@ int CheckSelfTest(DEVICE_EXTENSION * pdx, TGET_SELFTEST __user * pGST)
 **
 ** Returns code for standard, plus, micro1401, power1401 or none
 ****************************************************************************/
-int TypeOf1401(DEVICE_EXTENSION * pdx)
+int TypeOf1401(DEVICE_EXTENSION *pdx)
 {
        int iReturn = TYPEUNKNOWN;
        mutex_lock(&pdx->io_mutex);
@@ -1100,7 +1083,7 @@ int TypeOf1401(DEVICE_EXTENSION * pdx)
        switch (pdx->s1401Type) {
        case TYPE1401:
                iReturn = U14ERR_STD;
-               break;          // Handle these types directly
+               break;          /*  Handle these types directly */
        case TYPEPLUS:
                iReturn = U14ERR_PLUS;
                break;
@@ -1109,9 +1092,9 @@ int TypeOf1401(DEVICE_EXTENSION * pdx)
                break;
        default:
                if ((pdx->s1401Type >= TYPEPOWER) && (pdx->s1401Type <= 25))
-                       iReturn = pdx->s1401Type + 4;   // We can calculate types
-               else            //  for up-coming 1401 designs
-                       iReturn = TYPEUNKNOWN;  // Don't know or not there
+                       iReturn = pdx->s1401Type + 4;   /*  We can calculate types */
+               else            /*   for upcoming 1401 designs */
+                       iReturn = TYPEUNKNOWN;  /*  Don't know or not there */
        }
        dev_dbg(&pdx->interface->dev, "%s %d", __func__, iReturn);
        mutex_unlock(&pdx->io_mutex);
@@ -1124,13 +1107,13 @@ int TypeOf1401(DEVICE_EXTENSION * pdx)
 **
 ** Returns flags on block transfer abilities
 ****************************************************************************/
-int TransferFlags(DEVICE_EXTENSION * pdx)
+int TransferFlags(DEVICE_EXTENSION *pdx)
 {
-       int iReturn = U14TF_MULTIA | U14TF_DIAG |       // we always have multiple DMA area
-           U14TF_NOTIFY | U14TF_CIRCTH;        // diagnostics, notify and circular
+       int iReturn = U14TF_MULTIA | U14TF_DIAG |       /*  we always have multiple DMA areas */
+           U14TF_NOTIFY | U14TF_CIRCTH;        /*  diagnostics, notify and circular */
        dev_dbg(&pdx->interface->dev, "%s", __func__);
        mutex_lock(&pdx->io_mutex);
-       if (pdx->bIsUSB2)       // Set flag for USB2 if appropriate
+       if (pdx->bIsUSB2)       /*  Set flag for USB2 if appropriate */
                iReturn |= U14TF_USB2;
        mutex_unlock(&pdx->io_mutex);
 
@@ -1142,12 +1125,16 @@ int TransferFlags(DEVICE_EXTENSION * pdx)
 ** Issues a debug\diagnostic command to the 1401 along with a 32-bit datum
 ** This is a utility command used for dbg operations.
 */
-static int DbgCmd1401(DEVICE_EXTENSION * pdx, unsigned char cmd,
+static int DbgCmd1401(DEVICE_EXTENSION *pdx, unsigned char cmd,
                      unsigned int data)
 {
        int iReturn;
        dev_dbg(&pdx->interface->dev, "%s entry", __func__);
-       iReturn = usb_control_msg(pdx->udev, usb_sndctrlpipe(pdx->udev, 0), cmd, (H_TO_D | VENDOR | DEVREQ), (unsigned short)data, (unsigned short)(data >> 16), 0, 0, HZ);     // allow 1 second timeout
+       iReturn = usb_control_msg(pdx->udev, usb_sndctrlpipe(pdx->udev, 0), cmd,
+                                 (H_TO_D | VENDOR | DEVREQ),
+                                 (unsigned short)data,
+                                 (unsigned short)(data >> 16), NULL, 0, HZ);
+                                               /* allow 1 second timeout */
        if (iReturn < 0)
                dev_err(&pdx->interface->dev, "%s fail code=%d", __func__,
                        iReturn);
@@ -1160,7 +1147,7 @@ static int DbgCmd1401(DEVICE_EXTENSION * pdx, unsigned char cmd,
 **
 ** Execute the diagnostic peek operation. Uses address, width and repeats.
 ****************************************************************************/
-int DbgPeek(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
        int iReturn;
        TDBGBLOCK db;
@@ -1189,7 +1176,7 @@ int DbgPeek(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
 ** Execute the diagnostic poke operation. Parameters are in the CSBLOCK struct
 ** in order address, size, repeats and value to poke.
 ****************************************************************************/
-int DbgPoke(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgPoke(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
        int iReturn;
        TDBGBLOCK db;
@@ -1218,7 +1205,7 @@ int DbgPoke(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
 ** Execute the diagnostic ramp data operation. Parameters are in the CSBLOCK struct
 ** in order address, default, enable mask, size and repeats.
 ****************************************************************************/
-int DbgRampData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgRampData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
        int iReturn;
        TDBGBLOCK db;
@@ -1250,7 +1237,7 @@ int DbgRampData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
 **
 ** Execute the diagnostic ramp address operation
 ****************************************************************************/
-int DbgRampAddr(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgRampAddr(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
        int iReturn;
        TDBGBLOCK db;
@@ -1280,16 +1267,16 @@ int DbgRampAddr(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
 **
 ** Retrieve the data resulting from the last debug Peek operation
 ****************************************************************************/
-int DbgGetData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
+int DbgGetData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB)
 {
        int iReturn;
        TDBGBLOCK db;
-       memset(&db, 0, sizeof(db));     // fill returned block with 0s
+       memset(&db, 0, sizeof(db));     /*  fill returned block with 0s */
 
        mutex_lock(&pdx->io_mutex);
        dev_dbg(&pdx->interface->dev, "%s", __func__);
 
-       // Read back the last peeked value from the 1401.
+       /*  Read back the last peeked value from the 1401. */
        iReturn = usb_control_msg(pdx->udev, usb_rcvctrlpipe(pdx->udev, 0),
                                  DB_DATA, (D_TO_H | VENDOR | DEVREQ), 0, 0,
                                  &db.iData, sizeof(db.iData), HZ);
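
The read-back is a device-to-host vendor request on the default control pipe. Two points worth noting: usb_control_msg() takes its timeout in milliseconds, so the HZ passed above means one second only when HZ=1000, and the API expects a DMA-capable buffer, which makes a kmalloc'd scratch variable the safe choice. A sketch of the same pattern (names are illustrative; the requesttype flags are assumed to match D_TO_H | VENDOR | DEVREQ):

    #include <linux/slab.h>
    #include <linux/usb.h>

    /* Read a 32-bit datum back from the device with a vendor control
     * request; 'req' would be DB_DATA in the driver above. */
    static int vendor_read_u32(struct usb_device *udev, u8 req, u32 *out)
    {
            int ret;
            u32 *buf = kmalloc(sizeof(*buf), GFP_KERNEL);   /* DMA-safe buffer */

            if (!buf)
                    return -ENOMEM;
            ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), req,
                                  USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
                                  0, 0, buf, sizeof(*buf), 1000 /* ms */);
            if (ret >= 0)
                    *out = *buf;
            kfree(buf);
            return ret < 0 ? ret : 0;
    }
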
@@ -1313,7 +1300,7 @@ int DbgGetData(DEVICE_EXTENSION * pdx, TDBGBLOCK __user * pDB)
 ** Stop any never-ending debug loop, we just call Get1401State for USB
 **
 ****************************************************************************/
-int DbgStopLoop(DEVICE_EXTENSION * pdx)
+int DbgStopLoop(DEVICE_EXTENSION *pdx)
 {
        int iReturn;
        unsigned int uState, uErr;
@@ -1334,7 +1321,7 @@ int DbgStopLoop(DEVICE_EXTENSION * pdx)
 ** booked and a transfer to that area is in progress. Otherwise, we will
 ** release the area and re-assign it.
 ****************************************************************************/
-int SetCircular(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
+int SetCircular(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD)
 {
        int iReturn;
        bool bToHost;
@@ -1346,11 +1333,11 @@ int SetCircular(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
        mutex_lock(&pdx->io_mutex);
        dev_dbg(&pdx->interface->dev, "%s area:%d, size:%08x", __func__,
                td.wAreaNum, td.dwLength);
-       bToHost = td.eSize != 0;        // this is used as the tohost flag
+       bToHost = td.eSize != 0;        /*  this is used as the tohost flag */
 
-       // The strange cast is done so that we don't get warnings in 32-bit linux about the size of the
-       // pointer. The pointer is always passed as a 64-bit object so that we don't have problems using
-       // a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system.
+       /*  The strange cast is done so that we don't get warnings in 32-bit linux about the size of the */
+       /*  pointer. The pointer is always passed as a 64-bit object so that we don't have problems using */
+       /*  a 32-bit program on a 64-bit system. unsigned long is 64-bits on a 64-bit system. */
        iReturn =
            SetArea(pdx, td.wAreaNum,
                    (char __user *)((unsigned long)td.lpvBuff), td.dwLength,
@@ -1364,7 +1351,7 @@ int SetCircular(DEVICE_EXTENSION * pdx, TRANSFERDESC __user * pTD)
 **
 ** Return the next available block of circularly-transferred data.
 ****************************************************************************/
-int GetCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
+int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB)
 {
        int iReturn = U14ERR_NOERROR;
        unsigned int nArea;
@@ -1377,20 +1364,17 @@ int GetCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
 
        mutex_lock(&pdx->io_mutex);
 
-       nArea = cb.nArea;       // Retrieve parameters first
-       cb.dwOffset = 0;        // set default result (nothing)
+       nArea = cb.nArea;       /*  Retrieve parameters first */
+       cb.dwOffset = 0;        /*  set default result (nothing) */
        cb.dwSize = 0;
 
-       if (nArea < MAX_TRANSAREAS)     // The area number must be OK
-       {
-               TRANSAREA *pArea = &pdx->rTransDef[nArea];      // Pointer to relevant info
-               spin_lock_irq(&pdx->stagedLock);        // Lock others out
+       if (nArea < MAX_TRANSAREAS) {   /*  The area number must be OK */
+               TRANSAREA *pArea = &pdx->rTransDef[nArea];      /*  Pointer to relevant info */
+               spin_lock_irq(&pdx->stagedLock);        /*  Lock others out */
 
-               if ((pArea->bUsed) && (pArea->bCircular) &&     // Must be circular area
-                   (pArea->bCircToHost))       // For now at least must be to host
-               {
-                       if (pArea->aBlocks[0].dwSize > 0)       // Got anything?
-                       {
+               if ((pArea->bUsed) && (pArea->bCircular) &&     /*  Must be circular area */
+                   (pArea->bCircToHost)) {     /*  For now at least must be to host */
+                       if (pArea->aBlocks[0].dwSize > 0) {     /*  Got anything? */
                                cb.dwOffset = pArea->aBlocks[0].dwOffset;
                                cb.dwSize = pArea->aBlocks[0].dwSize;
                                dev_dbg(&pdx->interface->dev,
@@ -1416,7 +1400,7 @@ int GetCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
 **
 ** Frees a block of circularly-transferred data and returns the next one.
 ****************************************************************************/
-int FreeCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
+int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB)
 {
        int iReturn = U14ERR_NOERROR;
        unsigned int nArea, uStart, uSize;
@@ -1429,33 +1413,28 @@ int FreeCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
 
        mutex_lock(&pdx->io_mutex);
 
-       nArea = cb.nArea;       // Retrieve parameters first
+       nArea = cb.nArea;       /*  Retrieve parameters first */
        uStart = cb.dwOffset;
        uSize = cb.dwSize;
-       cb.dwOffset = 0;        // then set default result (nothing)
+       cb.dwOffset = 0;        /*  then set default result (nothing) */
        cb.dwSize = 0;
 
-       if (nArea < MAX_TRANSAREAS)     // The area number must be OK
-       {
-               TRANSAREA *pArea = &pdx->rTransDef[nArea];      // Pointer to relevant info
-               spin_lock_irq(&pdx->stagedLock);        // Lock others out
+       if (nArea < MAX_TRANSAREAS) {   /*  The area number must be OK */
+               TRANSAREA *pArea = &pdx->rTransDef[nArea];      /*  Pointer to relevant info */
+               spin_lock_irq(&pdx->stagedLock);        /*  Lock others out */
 
-               if ((pArea->bUsed) && (pArea->bCircular) &&     // Must be circular area
-                   (pArea->bCircToHost))       // For now at least must be to host
-               {
+               if ((pArea->bUsed) && (pArea->bCircular) &&     /*  Must be circular area */
+                   (pArea->bCircToHost)) {     /*  For now at least must be to host */
                        bool bWaiting = false;
 
-                       if ((pArea->aBlocks[0].dwSize >= uSize) &&      // Got anything?
-                           (pArea->aBlocks[0].dwOffset == uStart))     // Must be legal data
-                       {
+                       if ((pArea->aBlocks[0].dwSize >= uSize) &&      /*  Got anything? */
+                           (pArea->aBlocks[0].dwOffset == uStart)) {   /*  Must be legal data */
                                pArea->aBlocks[0].dwSize -= uSize;
                                pArea->aBlocks[0].dwOffset += uSize;
-                               if (pArea->aBlocks[0].dwSize == 0)      // Have we emptied this block?
-                               {
-                                       if (pArea->aBlocks[1].dwSize)   // Is there a second block?
-                                       {
-                                               pArea->aBlocks[0] = pArea->aBlocks[1];  // Copy down block 2 data
-                                               pArea->aBlocks[1].dwSize = 0;   // and mark the second block as unused
+                               if (pArea->aBlocks[0].dwSize == 0) {    /*  Have we emptied this block? */
+                                       if (pArea->aBlocks[1].dwSize) { /*  Is there a second block? */
+                                               pArea->aBlocks[0] = pArea->aBlocks[1];  /*  Copy down block 2 data */
+                                               pArea->aBlocks[1].dwSize = 0;   /*  and mark the second block as unused */
                                                pArea->aBlocks[1].dwOffset = 0;
                                        } else
                                                pArea->aBlocks[0].dwOffset = 0;
@@ -1468,9 +1447,8 @@ int FreeCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
                                        pArea->aBlocks[0].dwOffset,
                                        pdx->bXFerWaiting);
 
-                               // Return the next available block of memory as well
-                               if (pArea->aBlocks[0].dwSize > 0)       // Got anything?
-                               {
+                               /*  Return the next available block of memory as well */
+                               if (pArea->aBlocks[0].dwSize > 0) {     /*  Got anything? */
                                        cb.dwOffset =
                                            pArea->aBlocks[0].dwOffset;
                                        cb.dwSize = pArea->aBlocks[0].dwSize;
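
FreeCircBlock() models the circular area as at most two contiguous regions: aBlocks[0] is the oldest valid data and aBlocks[1] holds data that wrapped past the end of the area. Freeing consumes from the front of block 0; when block 0 empties, block 1 (if any) is copied down. A standalone sketch of that bookkeeping (types and names are illustrative):

    struct circ_block {
            unsigned int offset;    /* start of valid data in the area */
            unsigned int size;      /* number of valid bytes */
    };

    /* Free 'size' bytes from the front of block 0, mirroring the
     * FreeCircBlock() logic above. Returns 0 on success, -1 if the
     * caller's offset/size do not match the oldest block. */
    static int circ_free(struct circ_block blk[2], unsigned int start,
                         unsigned int size)
    {
            if (blk[0].size < size || blk[0].offset != start)
                    return -1;              /* must free exactly from the front */

            blk[0].size -= size;
            blk[0].offset += size;
            if (blk[0].size == 0) {         /* emptied the oldest block? */
                    if (blk[1].size) {      /* wrapped data becomes block 0 */
                            blk[0] = blk[1];
                            blk[1].size = 0;
                            blk[1].offset = 0;
                    } else
                            blk[0].offset = 0;      /* buffer fully empty */
            }
            return 0;
    }
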
@@ -1492,9 +1470,8 @@ int FreeCircBlock(DEVICE_EXTENSION * pdx, TCIRCBLOCK __user * pCB)
                                iReturn = U14ERR_NOMEMORY;
                        }
 
-                       // If we have one, kick off pending transfer
-                       if (bWaiting)   // Got a block xfer waiting?
-                       {
+                       /*  If we have one, kick off pending transfer */
+                       if (bWaiting) { /*  Got a block xfer waiting? */
                                int RWMStat =
                                    ReadWriteMem(pdx, !pdx->rDMAInfo.bOutWard,
                                                 pdx->rDMAInfo.wIdent,
index 0895c9414b4fd6b1a2765bc7aadc886291fd9ae5..aa68878bd2510715054c7b3e44ff4b71992b6c22 100644 (file)
@@ -35,7 +35,7 @@ typedef struct TransferDesc {
        short eSize;            /* element size - is tohost flag for circular */
 } TRANSFERDESC;
 
-typedef TRANSFERDESC * LPTRANSFERDESC;
+typedef TRANSFERDESC *LPTRANSFERDESC;
 
 typedef struct TransferEvent {
        unsigned int dwStart;           /* offset into the area */
index af073790b942a57cfd0b5ec16f6d468cb0533143..dbd4036d9bdddda07d8d0d97a9c0559d4f8e022d 100644 (file)
 #endif
 
 #if defined(LINUX) || defined(MAXOSX)
-    #define FAR
+       #define FAR
 
-    typedef int BOOL;       // To match Windows
-    typedef char * LPSTR;
-    typedef const char * LPCSTR;
-    typedef unsigned short WORD;
-    typedef unsigned int  DWORD;
-    typedef unsigned char  BYTE;
-    typedef BYTE  BOOLEAN;
-    typedef unsigned char UCHAR;
-    #define __packed __attribute__((packed))
-    typedef BYTE * LPBYTE;
-    #define HIWORD(x) (WORD)(((x)>>16) & 0xffff)
-    #define LOWORD(x) (WORD)((x) & 0xffff)
+       typedef int BOOL;       /*  To match Windows */
+       typedef unsigned char  BYTE;
+       #define __packed __attribute__((packed))
+       #define HIWORD(x) (unsigned short)(((x)>>16) & 0xffff)
+       #define LOWORD(x) (unsigned short)((x) & 0xffff)
 #endif
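
For example, the HIWORD/LOWORD macros split a 32-bit value into its 16-bit halves:

    #include <stdio.h>

    #define HIWORD(x) (unsigned short)(((x)>>16) & 0xffff)
    #define LOWORD(x) (unsigned short)((x) & 0xffff)

    int main(void)
    {
            unsigned int v = 0x12345678;

            /* Prints "hi=1234 lo=5678" */
            printf("hi=%04x lo=%04x\n", HIWORD(v), LOWORD(v));
            return 0;
    }
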
 
 #ifdef _IS_WINDOWS_
 ** a synonym.
 */
 #ifdef GNUC
-    #define DllExport __attribute__((dllexport))
-    #define DllImport __attribute__((dllimport))
+       #define DllExport __attribute__((dllexport))
+       #define DllImport __attribute__((dllimport))
 #endif
 
 #ifndef DllExport
 #ifdef _IS_WINDOWS_
-    #define DllExport __declspec(dllexport)
-    #define DllImport __declspec(dllimport)
+       #define DllExport __declspec(dllexport)
+       #define DllImport __declspec(dllimport)
 #else
-    #define DllExport
-    #define DllImport
+       #define DllExport
+       #define DllImport
 #endif
 #endif /* _IS_WINDOWS_ */
 
-    
 #ifndef TRUE
    #define TRUE 1
    #define FALSE 0
index 254131d8be5ffea67d89cc669f6648d3e79f0173..97c55f9e5151e9277e8fc108d922f51a8ea0ebea 100644 (file)
@@ -126,18 +126,18 @@ static void ced_delete(struct kref *kref)
 {
        DEVICE_EXTENSION *pdx = to_DEVICE_EXTENSION(kref);
 
-       // Free up the output buffer, then free the output urb. Note that the interface member
-       // of pdx will probably be NULL, so cannot be used to get to dev.
+       /*  Free up the output buffer, then free the output urb. Note that the interface member */
+       /*  of pdx will probably be NULL, so cannot be used to get to dev. */
        usb_free_coherent(pdx->udev, OUTBUF_SZ, pdx->pCoherCharOut,
                          pdx->pUrbCharOut->transfer_dma);
        usb_free_urb(pdx->pUrbCharOut);
 
-       // Do the same for chan input
+       /*  Do the same for chan input */
        usb_free_coherent(pdx->udev, INBUF_SZ, pdx->pCoherCharIn,
                          pdx->pUrbCharIn->transfer_dma);
        usb_free_urb(pdx->pUrbCharIn);
 
-       // Do the same for the block transfers
+       /*  Do the same for the block transfers */
        usb_free_coherent(pdx->udev, STAGED_SZ, pdx->pCoherStagedIO,
                          pdx->pStagedUrb->transfer_dma);
        usb_free_urb(pdx->pStagedUrb);
@@ -146,7 +146,7 @@ static void ced_delete(struct kref *kref)
        kfree(pdx);
 }
 
-// This is the driver end of the open() call from user space.
+/*  This is the driver end of the open() call from user space. */
 static int ced_open(struct inode *inode, struct file *file)
 {
        DEVICE_EXTENSION *pdx;
@@ -184,7 +184,7 @@ static int ced_open(struct inode *inode, struct file *file)
                        kref_put(&pdx->kref, ced_delete);
                        goto exit;
                }
-       } else {                //uncomment this block if you want exclusive open
+       } else {                /* uncomment this block if you want exclusive open */
                dev_err(&interface->dev, "%s fail: already open", __func__);
                retval = -EBUSY;
                pdx->open_count--;
@@ -210,11 +210,11 @@ static int ced_release(struct inode *inode, struct file *file)
 
        dev_dbg(&pdx->interface->dev, "%s called", __func__);
        mutex_lock(&pdx->io_mutex);
-       if (!--pdx->open_count && pdx->interface)       // Allow autosuspend
+       if (!--pdx->open_count && pdx->interface)       /*  Allow autosuspend */
                usb_autopm_put_interface(pdx->interface);
        mutex_unlock(&pdx->io_mutex);
 
-       kref_put(&pdx->kref, ced_delete);       // decrement the count on our device
+       kref_put(&pdx->kref, ced_delete);       /*  decrement the count on our device */
        return 0;
 }
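
The open/release pair above brackets the device extension's lifetime with kref_get()/kref_put(), so ced_delete() runs only when the last holder lets go, whether that is a file handle or the disconnect path. The bare pattern, reduced to a sketch with illustrative names:

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct my_dev {
            struct kref kref;
            /* ... device state ... */
    };

    static void my_dev_delete(struct kref *kref)
    {
            struct my_dev *dev = container_of(kref, struct my_dev, kref);
            kfree(dev);     /* last reference gone: free everything */
    }

    /* Each opener takes a reference... */
    static void my_dev_get(struct my_dev *dev)
    {
            kref_get(&dev->kref);
    }

    /* ...and drops it on release; the delete callback runs at zero. */
    static void my_dev_put(struct my_dev *dev)
    {
            kref_put(&dev->kref, my_dev_delete);
    }
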
 
@@ -252,9 +252,9 @@ static int ced_flush(struct file *file, fl_owner_t id)
 ** not help with a device extension held by a file.
 ** return true if can accept new io requests, else false
 */
-static bool CanAcceptIoRequests(DEVICE_EXTENSION * pdx)
+static bool CanAcceptIoRequests(DEVICE_EXTENSION *pdx)
 {
-       return pdx && pdx->interface;   // Can we accept IO requests
+       return pdx && pdx->interface;   /*  Can we accept IO requests */
 }
 
 /****************************************************************************
@@ -264,9 +264,9 @@ static bool CanAcceptIoRequests(DEVICE_EXTENSION * pdx)
 static void ced_writechar_callback(struct urb *pUrb)
 {
        DEVICE_EXTENSION *pdx = pUrb->context;
-       int nGot = pUrb->actual_length; // what we transferred
+       int nGot = pUrb->actual_length; /*  what we transferred */
 
-       if (pUrb->status) {     // sync/async unlink faults aren't errors
+       if (pUrb->status) {     /*  sync/async unlink faults aren't errors */
                if (!
                    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
                     || pUrb->status == -ESHUTDOWN)) {
@@ -278,36 +278,35 @@ static void ced_writechar_callback(struct urb *pUrb)
                spin_lock(&pdx->err_lock);
                pdx->errors = pUrb->status;
                spin_unlock(&pdx->err_lock);
-               nGot = 0;       //  and tidy up again if so
+               nGot = 0;       /*   and tidy up again if so */
 
-               spin_lock(&pdx->charOutLock);   // already at irq level
-               pdx->dwOutBuffGet = 0;  // Reset the output buffer
+               spin_lock(&pdx->charOutLock);   /*  already at irq level */
+               pdx->dwOutBuffGet = 0;  /*  Reset the output buffer */
                pdx->dwOutBuffPut = 0;
-               pdx->dwNumOutput = 0;   // Clear the char count
-               pdx->bPipeError[0] = 1; // Flag an error for later
-               pdx->bSendCharsPending = false; // Allow other threads again
-               spin_unlock(&pdx->charOutLock); // already at irq level
+               pdx->dwNumOutput = 0;   /*  Clear the char count */
+               pdx->bPipeError[0] = 1; /*  Flag an error for later */
+               pdx->bSendCharsPending = false; /*  Allow other threads again */
+               spin_unlock(&pdx->charOutLock); /*  already at irq level */
                dev_dbg(&pdx->interface->dev,
                        "%s - char out done, 0 chars sent", __func__);
        } else {
                dev_dbg(&pdx->interface->dev,
                        "%s - char out done, %d chars sent", __func__, nGot);
-               spin_lock(&pdx->charOutLock);   // already at irq level
-               pdx->dwNumOutput -= nGot;       // Now adjust the char send buffer
-               pdx->dwOutBuffGet += nGot;      // to match what we did
-               if (pdx->dwOutBuffGet >= OUTBUF_SZ)     // Can't do this any earlier as data could be overwritten
+               spin_lock(&pdx->charOutLock);   /*  already at irq level */
+               pdx->dwNumOutput -= nGot;       /*  Now adjust the char send buffer */
+               pdx->dwOutBuffGet += nGot;      /*  to match what we did */
+               if (pdx->dwOutBuffGet >= OUTBUF_SZ)     /*  Can't do this any earlier as data could be overwritten */
                        pdx->dwOutBuffGet = 0;
 
-               if (pdx->dwNumOutput > 0)       // if more to be done...
-               {
-                       int nPipe = 0;  // The pipe number to use
+               if (pdx->dwNumOutput > 0) {     /*  if more to be done... */
+                       int nPipe = 0;  /*  The pipe number to use */
                        int iReturn;
                        char *pDat = &pdx->outputBuffer[pdx->dwOutBuffGet];
-                       unsigned int dwCount = pdx->dwNumOutput;        // maximum to send
-                       if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)  // does it cross buffer end?
+                       unsigned int dwCount = pdx->dwNumOutput;        /*  maximum to send */
+                       if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)  /*  does it cross buffer end? */
                                dwCount = OUTBUF_SZ - pdx->dwOutBuffGet;
-                       spin_unlock(&pdx->charOutLock); // we are done with stuff that changes
-                       memcpy(pdx->pCoherCharOut, pDat, dwCount);      // copy output data to the buffer
+                       spin_unlock(&pdx->charOutLock); /*  we are done with stuff that changes */
+                       memcpy(pdx->pCoherCharOut, pDat, dwCount);      /*  copy output data to the buffer */
                        usb_fill_bulk_urb(pdx->pUrbCharOut, pdx->udev,
                                          usb_sndbulkpipe(pdx->udev,
                                                          pdx->epAddr[0]),
@@ -315,22 +314,22 @@ static void ced_writechar_callback(struct urb *pUrb)
                                          ced_writechar_callback, pdx);
                        pdx->pUrbCharOut->transfer_flags |=
                            URB_NO_TRANSFER_DMA_MAP;
-                       usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);      // in case we need to kill it
+                       usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);      /*  in case we need to kill it */
                        iReturn = usb_submit_urb(pdx->pUrbCharOut, GFP_ATOMIC);
                        dev_dbg(&pdx->interface->dev, "%s n=%d>%s<", __func__,
                                dwCount, pDat);
-                       spin_lock(&pdx->charOutLock);   // grab lock for errors
+                       spin_lock(&pdx->charOutLock);   /*  grab lock for errors */
                        if (iReturn) {
-                               pdx->bPipeError[nPipe] = 1;     // Flag an error to be handled later
-                               pdx->bSendCharsPending = false; // Allow other threads again
+                               pdx->bPipeError[nPipe] = 1;     /*  Flag an error to be handled later */
+                               pdx->bSendCharsPending = false; /*  Allow other threads again */
                                usb_unanchor_urb(pdx->pUrbCharOut);
                                dev_err(&pdx->interface->dev,
                                        "%s usb_submit_urb() returned %d",
                                        __func__, iReturn);
                        }
                } else
-                       pdx->bSendCharsPending = false; // Allow other threads again
-               spin_unlock(&pdx->charOutLock); // already at irq level
+                       pdx->bSendCharsPending = false; /*  Allow other threads again */
+               spin_unlock(&pdx->charOutLock); /*  already at irq level */
        }
 }
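
The get-pointer handling at the top of this callback is a plain ring-buffer consume. Note that a simple reset, rather than a modulus, is enough for the wrap: each send is clamped so it never crosses the buffer end, so dwOutBuffGet can at most land exactly on OUTBUF_SZ. A minimal user-space sketch of the same bookkeeping (names are illustrative, not the driver's):

	/* Ring-buffer consume in the style of ced_writechar_callback(): advance
	 * the get pointer by what the URB actually sent; a reset suffices for
	 * the wrap because sends are clamped at the buffer end. */
	static void ring_consume(unsigned int *count, unsigned int *get,
				 unsigned int sent, unsigned int bufsz)
	{
		*count -= sent;		/* adjust the char count */
		*get += sent;		/* advance the get pointer */
		if (*get >= bufsz)	/* landed on the end? */
			*get = 0;	/* then wrap to the start */
	}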
 
@@ -339,44 +338,43 @@ static void ced_writechar_callback(struct urb *pUrb)
 ** Transmit the characters in the output buffer to the 1401. This may need
 ** breaking down into multiple transfers.
 ****************************************************************************/
-int SendChars(DEVICE_EXTENSION * pdx)
+int SendChars(DEVICE_EXTENSION *pdx)
 {
        int iReturn = U14ERR_NOERROR;
 
-       spin_lock_irq(&pdx->charOutLock);       // Protect ourselves
+       spin_lock_irq(&pdx->charOutLock);       /*  Protect ourselves */
 
-       if ((!pdx->bSendCharsPending) &&        // Not currently sending
-           (pdx->dwNumOutput > 0) &&   //  has characters to output
-           (CanAcceptIoRequests(pdx))) //  and current activity is OK
-       {
-               unsigned int dwCount = pdx->dwNumOutput;        // Get a copy of the character count
-               pdx->bSendCharsPending = true;  // Set flag to lock out other threads
+       if ((!pdx->bSendCharsPending) &&        /*  Not currently sending */
+           (pdx->dwNumOutput > 0) &&   /*   has characters to output */
+           (CanAcceptIoRequests(pdx))) { /*   and current activity is OK */
+               unsigned int dwCount = pdx->dwNumOutput;        /*  Get a copy of the character count */
+               pdx->bSendCharsPending = true;  /*  Set flag to lock out other threads */
 
                dev_dbg(&pdx->interface->dev,
                        "Send %d chars to 1401, EP0 flag %d\n", dwCount,
                        pdx->nPipes == 3);
-               // If we have only 3 end points we must send the characters to the 1401 using EP0.
+               /*  If we have only 3 end points we must send the characters to the 1401 using EP0. */
                if (pdx->nPipes == 3) {
-                       // For EP0 character transmissions to the 1401, we have to hang about until they
-                       // are gone, as otherwise without more character IO activity they will never go.
-                       unsigned int count = dwCount;   // Local char counter
-                       unsigned int index = 0; // The index into the char buffer
+                       /*  For EP0 character transmissions to the 1401, we have to hang about until they */
+                       /*  are gone, as otherwise without more character IO activity they will never go. */
+                       unsigned int count = dwCount;   /*  Local char counter */
+                       unsigned int index = 0; /*  The index into the char buffer */
 
-                       spin_unlock_irq(&pdx->charOutLock);     // Free spinlock as we call USBD
+                       spin_unlock_irq(&pdx->charOutLock);     /*  Free spinlock as we call USBD */
 
                        while ((count > 0) && (iReturn == U14ERR_NOERROR)) {
-                               // We have to break the transfer up into 64-byte chunks because of a 2270 problem
-                               int n = count > 64 ? 64 : count;        // Chars for this xfer, max of 64
+                               /*  We have to break the transfer up into 64-byte chunks because of a 2270 problem */
+                               int n = count > 64 ? 64 : count;        /*  Chars for this xfer, max of 64 */
                                int nSent = usb_control_msg(pdx->udev,
-                                                           usb_sndctrlpipe(pdx->udev, 0),      // use end point 0
-                                                           DB_CHARS,   // bRequest
-                                                           (H_TO_D | VENDOR | DEVREQ), // to the device, vendor request to the device
-                                                           0, 0,       // value and index are both 0
-                                                           &pdx->outputBuffer[index],  // where to send from
-                                                           n,  // how much to send
-                                                           1000);      // timeout in jiffies
+                                                           usb_sndctrlpipe(pdx->udev, 0),      /*  use end point 0 */
+                                                           DB_CHARS,   /*  bRequest */
+                                                           (H_TO_D | VENDOR | DEVREQ), /*  host-to-device vendor request */
+                                                           0, 0,       /*  value and index are both 0 */
+                                                           &pdx->outputBuffer[index],  /*  where to send from */
+                                                           n,  /*  how much to send */
+                                                           1000);      /*  timeout in milliseconds */
                                if (nSent <= 0) {
-                                       iReturn = nSent ? nSent : -ETIMEDOUT;   // if 0 chars says we timed out
+                                       iReturn = nSent ? nSent : -ETIMEDOUT;   /*  0 chars sent means we timed out */
                                        dev_err(&pdx->interface->dev,
                                                "Send %d chars by EP0 failed: %d",
                                                n, iReturn);
@@ -388,19 +386,19 @@ int SendChars(DEVICE_EXTENSION * pdx)
                                }
                        }
 
-                       spin_lock_irq(&pdx->charOutLock);       // Protect pdx changes, released by general code
-                       pdx->dwOutBuffGet = 0;  // so reset the output buffer
+                       spin_lock_irq(&pdx->charOutLock);       /*  Protect pdx changes, released by general code */
+                       pdx->dwOutBuffGet = 0;  /*  so reset the output buffer */
                        pdx->dwOutBuffPut = 0;
-                       pdx->dwNumOutput = 0;   // and clear the buffer count
-                       pdx->bSendCharsPending = false; // Allow other threads again
-               } else {        // Here for sending chars normally - we hold the spin lock
-                       int nPipe = 0;  // The pipe number to use
+                       pdx->dwNumOutput = 0;   /*  and clear the buffer count */
+                       pdx->bSendCharsPending = false; /*  Allow other threads again */
+               } else {        /*  Here for sending chars normally - we hold the spin lock */
+                       int nPipe = 0;  /*  The pipe number to use */
                        char *pDat = &pdx->outputBuffer[pdx->dwOutBuffGet];
 
-                       if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)  // does it cross buffer end?
+                       if ((pdx->dwOutBuffGet + dwCount) > OUTBUF_SZ)  /*  does it cross buffer end? */
                                dwCount = OUTBUF_SZ - pdx->dwOutBuffGet;
-                       spin_unlock_irq(&pdx->charOutLock);     // we are done with stuff that changes
-                       memcpy(pdx->pCoherCharOut, pDat, dwCount);      // copy output data to the buffer
+                       spin_unlock_irq(&pdx->charOutLock);     /*  we are done with stuff that changes */
+                       memcpy(pdx->pCoherCharOut, pDat, dwCount);      /*  copy output data to the buffer */
                        usb_fill_bulk_urb(pdx->pUrbCharOut, pdx->udev,
                                          usb_sndbulkpipe(pdx->udev,
                                                          pdx->epAddr[0]),
@@ -410,11 +408,11 @@ int SendChars(DEVICE_EXTENSION * pdx)
                            URB_NO_TRANSFER_DMA_MAP;
                        usb_anchor_urb(pdx->pUrbCharOut, &pdx->submitted);
                        iReturn = usb_submit_urb(pdx->pUrbCharOut, GFP_KERNEL);
-                       spin_lock_irq(&pdx->charOutLock);       // grab lock for errors
+                       spin_lock_irq(&pdx->charOutLock);       /*  grab lock for errors */
                        if (iReturn) {
-                               pdx->bPipeError[nPipe] = 1;     // Flag an error to be handled later
-                               pdx->bSendCharsPending = false; // Allow other threads again
-                               usb_unanchor_urb(pdx->pUrbCharOut);     // remove from list of active urbs
+                               pdx->bPipeError[nPipe] = 1;     /*  Flag an error to be handled later */
+                               pdx->bSendCharsPending = false; /*  Allow other threads again */
+                               usb_unanchor_urb(pdx->pUrbCharOut);     /*  remove from list of active urbs */
                        }
                }
        } else if (pdx->bSendCharsPending && (pdx->dwNumOutput > 0))
@@ -422,7 +420,7 @@ int SendChars(DEVICE_EXTENSION * pdx)
                        "SendChars bSendCharsPending:true");
 
        dev_dbg(&pdx->interface->dev, "SendChars exit code: %d", iReturn);
-       spin_unlock_irq(&pdx->charOutLock);     // Now let go of the spinlock
+       spin_unlock_irq(&pdx->charOutLock);     /*  Now let go of the spinlock */
        return iReturn;
 }
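
Stripped of locking, the EP0 path in SendChars() is a bounded-chunk send loop. A minimal sketch under the same 64-byte limit, with a hypothetical send_chunk() standing in for the usb_control_msg() call (returning bytes sent, 0 on timeout, negative on error):

	#define CHUNK_MAX 64	/* the 2270 limit noted in the comment above */

	/* Hypothetical transport, a stand-in for usb_control_msg(). */
	extern int send_chunk(const char *buf, int n);

	static int send_all(const char *buf, unsigned int count)
	{
		unsigned int index = 0;

		while (count > 0) {
			int n = count > CHUNK_MAX ? CHUNK_MAX : count;
			int sent = send_chunk(buf + index, n);

			if (sent <= 0)
				return sent ? sent : -1; /* 0 bytes: timed out */
			count -= sent;
			index += sent;
		}
		return 0;
	}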
 
@@ -440,14 +438,14 @@ int SendChars(DEVICE_EXTENSION * pdx)
 ** pdx  Is our device extension which holds all we know about the transfer.
 ** n    The number of bytes to move one way or the other.
 ***************************************************************************/
-static void CopyUserSpace(DEVICE_EXTENSION * pdx, int n)
+static void CopyUserSpace(DEVICE_EXTENSION *pdx, int n)
 {
        unsigned int nArea = pdx->StagedId;
        if (nArea < MAX_TRANSAREAS) {
-               TRANSAREA *pArea = &pdx->rTransDef[nArea];      // area to be used
+               TRANSAREA *pArea = &pdx->rTransDef[nArea];      /*  area to be used */
                unsigned int dwOffset =
                    pdx->StagedDone + pdx->StagedOffset + pArea->dwBaseOffset;
-               char *pCoherBuf = pdx->pCoherStagedIO;  // coherent buffer
+               char *pCoherBuf = pdx->pCoherStagedIO;  /*  coherent buffer */
                if (!pArea->bUsed) {
                        dev_err(&pdx->interface->dev, "%s area %d unused",
                                __func__, nArea);
@@ -455,15 +453,15 @@ static void CopyUserSpace(DEVICE_EXTENSION * pdx, int n)
                }
 
                while (n) {
-                       int nPage = dwOffset >> PAGE_SHIFT;     // page number in table
+                       int nPage = dwOffset >> PAGE_SHIFT;     /*  page number in table */
                        if (nPage < pArea->nPages) {
                                char *pvAddress =
                                    (char *)kmap_atomic(pArea->pPages[nPage]);
                                if (pvAddress) {
-                                       unsigned int uiPageOff = dwOffset & (PAGE_SIZE - 1);    // offset into the page
-                                       size_t uiXfer = PAGE_SIZE - uiPageOff;  // max to transfer on this page
-                                       if (uiXfer > n) // limit byte count if too much
-                                               uiXfer = n;     // for the page
+                                       unsigned int uiPageOff = dwOffset & (PAGE_SIZE - 1);    /*  offset into the page */
+                                       size_t uiXfer = PAGE_SIZE - uiPageOff;  /*  max to transfer on this page */
+                                       if (uiXfer > n) /*  limit byte count if too much */
+                                               uiXfer = n;     /*  for the page */
                                        if (pdx->StagedRead)
                                                memcpy(pvAddress + uiPageOff,
                                                       pCoherBuf, uiXfer);
@@ -494,8 +492,8 @@ static void CopyUserSpace(DEVICE_EXTENSION * pdx, int n)
                        nArea);
 }
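
CopyUserSpace() walks the pinned page list one page at a time, clamping every copy at a page boundary. The same arithmetic in a self-contained user-space sketch; plain buffers stand in for the pinned pages, so the kmap_atomic()/kunmap_atomic() pair is elided:

	#include <string.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	/* Copy n bytes from a flat source into an array of page-sized
	 * buffers, splitting at page boundaries as CopyUserSpace() does. */
	static void copy_to_pages(char **pages, unsigned int npages,
				  unsigned int offset, const char *src, size_t n)
	{
		while (n) {
			unsigned int page = offset >> PAGE_SHIFT;      /* page number in table */
			unsigned int pgoff = offset & (PAGE_SIZE - 1); /* offset into the page */
			size_t xfer = PAGE_SIZE - pgoff;               /* max on this page */

			if (page >= npages)
				break;		/* ran off the mapped area */
			if (xfer > n)
				xfer = n;	/* limit to what is left */
			memcpy(pages[page] + pgoff, src, xfer);
			src += xfer;
			offset += xfer;
			n -= xfer;
		}
	}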
 
-// Forward declarations for stuff used circularly
-static int StageChunk(DEVICE_EXTENSION * pdx);
+/*  Forward declarations for stuff used circularly */
+static int StageChunk(DEVICE_EXTENSION *pdx);
 /***************************************************************************
 ** ReadWrite_Complete
 **
@@ -504,14 +502,14 @@ static int StageChunk(DEVICE_EXTENSION * pdx);
 static void staged_callback(struct urb *pUrb)
 {
        DEVICE_EXTENSION *pdx = pUrb->context;
-       unsigned int nGot = pUrb->actual_length;        // what we transferred
+       unsigned int nGot = pUrb->actual_length;        /*  what we transferred */
        bool bCancel = false;
-       bool bRestartCharInput; // used at the end
+       bool bRestartCharInput; /*  used at the end */
 
-       spin_lock(&pdx->stagedLock);    // stop ReadWriteMem() action while this routine is running
-       pdx->bStagedUrbPending = false; // clear the flag for staged IRP pending
+       spin_lock(&pdx->stagedLock);    /*  stop ReadWriteMem() action while this routine is running */
+       pdx->bStagedUrbPending = false; /*  clear the flag for staged IRP pending */
 
-       if (pUrb->status) {     // sync/async unlink faults aren't errors
+       if (pUrb->status) {     /*  sync/async unlink faults aren't errors */
                if (!
                    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
                     || pUrb->status == -ESHUTDOWN)) {
@@ -525,40 +523,37 @@ static void staged_callback(struct urb *pUrb)
                spin_lock(&pdx->err_lock);
                pdx->errors = pUrb->status;
                spin_unlock(&pdx->err_lock);
-               nGot = 0;       //  and tidy up again if so
+               nGot = 0;       /*   and tidy up again if so */
                bCancel = true;
        } else {
                dev_dbg(&pdx->interface->dev, "%s %d chars xferred", __func__,
                        nGot);
-               if (pdx->StagedRead)    // if reading, save to user space
-                       CopyUserSpace(pdx, nGot);       // copy from buffer to user
+               if (pdx->StagedRead)    /*  if reading, save to user space */
+                       CopyUserSpace(pdx, nGot);       /*  copy from buffer to user */
                if (nGot == 0)
                        dev_dbg(&pdx->interface->dev, "%s ZLP", __func__);
        }
 
-       // Update the transfer length based on the TransferBufferLength value in the URB
+       /*  Update the transfer length based on the TransferBufferLength value in the URB */
        pdx->StagedDone += nGot;
 
        dev_dbg(&pdx->interface->dev, "%s, done %d bytes of %d", __func__,
                pdx->StagedDone, pdx->StagedLength);
 
-       if ((pdx->StagedDone == pdx->StagedLength) ||   // If no more to do
-           (bCancel))          // or this IRP was cancelled
-       {
-               TRANSAREA *pArea = &pdx->rTransDef[pdx->StagedId];      // Transfer area info
+       if ((pdx->StagedDone == pdx->StagedLength) ||   /*  If no more to do */
+           (bCancel)) {                /*  or this IRP was cancelled */
+               TRANSAREA *pArea = &pdx->rTransDef[pdx->StagedId];      /*  Transfer area info */
                dev_dbg(&pdx->interface->dev,
                        "%s transfer done, bytes %d, cancel %d", __func__,
                        pdx->StagedDone, bCancel);
 
-               // Here is where we sort out what to do with this transfer if using a circular buffer. We have
-               //  a completed transfer that can be assumed to fit into the transfer area. We should be able to
-               //  add this to the end of a growing block or to use it to start a new block unless the code
-               //  that calculates the offset to use (in ReadWriteMem) is totally duff.
-               if ((pArea->bCircular) && (pArea->bCircToHost) && (!bCancel) && // Time to sort out circular buffer info?
-                   (pdx->StagedRead))  // Only for tohost transfers for now
-               {
-                       if (pArea->aBlocks[1].dwSize > 0)       // If block 1 is in use we must append to it
-                       {
+               /*  Here is where we sort out what to do with this transfer if using a circular buffer. We have */
+               /*   a completed transfer that can be assumed to fit into the transfer area. We should be able to */
+               /*   add this to the end of a growing block or to use it to start a new block unless the code */
+               /*   that calculates the offset to use (in ReadWriteMem) is totally duff. */
+               if ((pArea->bCircular) && (pArea->bCircToHost) && (!bCancel) && /*  Time to sort out circular buffer info? */
+                   (pdx->StagedRead)) {        /*  Only for tohost transfers for now */
+                       if (pArea->aBlocks[1].dwSize > 0) {     /*  If block 1 is in use we must append to it */
                                if (pdx->StagedOffset ==
                                    (pArea->aBlocks[1].dwOffset +
                                     pArea->aBlocks[1].dwSize)) {
@@ -569,7 +564,7 @@ static void staged_callback(struct urb *pUrb)
                                                pArea->aBlocks[1].dwSize,
                                                pArea->aBlocks[1].dwOffset);
                                } else {
-                                       // Here things have gone very, very, wrong, but I cannot see how this can actually be achieved
+                                       /*  Here things have gone very, very wrong, but I cannot see how this can actually be achieved */
                                        pArea->aBlocks[1].dwOffset =
                                            pdx->StagedOffset;
                                        pArea->aBlocks[1].dwSize =
@@ -580,22 +575,20 @@ static void staged_callback(struct urb *pUrb)
                                                pArea->aBlocks[1].dwSize,
                                                pArea->aBlocks[1].dwOffset);
                                }
-                       } else  // If block 1 is not used, we try to add to block 0
-                       {
-                               if (pArea->aBlocks[0].dwSize > 0)       // Got stored block 0 information?
-                               {       // Must append onto the existing block 0
+                       } else {        /*  If block 1 is not used, we try to add to block 0 */
+                               if (pArea->aBlocks[0].dwSize > 0) {     /*  Got stored block 0 information? */
+                                       /*  Must append onto the existing block 0 */
                                        if (pdx->StagedOffset ==
                                            (pArea->aBlocks[0].dwOffset +
                                             pArea->aBlocks[0].dwSize)) {
-                                               pArea->aBlocks[0].dwSize += pdx->StagedLength;  // Just add this transfer in
+                                               pArea->aBlocks[0].dwSize += pdx->StagedLength;  /*  Just add this transfer in */
                                                dev_dbg(&pdx->interface->dev,
                                                        "RWM_Complete, circ block 0 now %d bytes at %d",
                                                        pArea->aBlocks[0].
                                                        dwSize,
                                                        pArea->aBlocks[0].
                                                        dwOffset);
-                                       } else  // If it doesn't append, put into new block 1
-                                       {
+                                       } else {        /*  If it doesn't append, put into new block 1 */
                                                pArea->aBlocks[1].dwOffset =
                                                    pdx->StagedOffset;
                                                pArea->aBlocks[1].dwSize =
@@ -607,8 +600,7 @@ static void staged_callback(struct urb *pUrb)
                                                        pArea->aBlocks[1].
                                                        dwOffset);
                                        }
-                               } else  // No info stored yet, just save in block 0
-                               {
+                               } else { /*  No info stored yet, just save in block 0 */
                                        pArea->aBlocks[0].dwOffset =
                                            pdx->StagedOffset;
                                        pArea->aBlocks[0].dwSize =
@@ -621,21 +613,19 @@ static void staged_callback(struct urb *pUrb)
                        }
                }
 
-               if (!bCancel)   // Don't generate an event if cancelled
-               {
+               if (!bCancel) { /*  Don't generate an event if cancelled */
                        dev_dbg(&pdx->interface->dev,
                                "RWM_Complete,  bCircular %d, bToHost %d, eStart %d, eSize %d",
                                pArea->bCircular, pArea->bEventToHost,
                                pArea->dwEventSt, pArea->dwEventSz);
-                       if ((pArea->dwEventSz) &&       // Set a user-mode event...
-                           (pdx->StagedRead == pArea->bEventToHost))   // ...on transfers in this direction?
-                       {
-                               int iWakeUp = 0;        // assume
-                               // If we have completed the right sort of DMA transfer then set the event to notify
-                               //   the user code to wake up anyone that is waiting.
-                               if ((pArea->bCircular) &&       // Circular areas use a simpler test
-                                   (pArea->bCircToHost))       // only in supported direction
-                               {       // Is total data waiting up to size limit?
+                       if ((pArea->dwEventSz) &&       /*  Set a user-mode event... */
+                           (pdx->StagedRead == pArea->bEventToHost)) { /*  ...on transfers in this direction? */
+                               int iWakeUp = 0;        /*  assume */
+                               /*  If we have completed the right sort of DMA transfer then set the event to notify */
+                               /*    the user code to wake up anyone that is waiting. */
+                               if ((pArea->bCircular) &&       /*  Circular areas use a simpler test */
+                                   (pArea->bCircToHost)) {     /*  only in supported direction */
+                                       /*  Is total data waiting up to size limit? */
                                        unsigned int dwTotal =
                                            pArea->aBlocks[0].dwSize +
                                            pArea->aBlocks[1].dwSize;
@@ -653,19 +643,17 @@ static void staged_callback(struct urb *pUrb)
                                if (iWakeUp) {
                                        dev_dbg(&pdx->interface->dev,
                                                "About to set event to notify app");
-                                       wake_up_interruptible(&pArea->wqEvent); // wake up waiting processes
-                                       ++pArea->iWakeUp;       // increment wakeup count
+                                       wake_up_interruptible(&pArea->wqEvent); /*  wake up waiting processes */
+                                       ++pArea->iWakeUp;       /*  increment wakeup count */
                                }
                        }
                }
 
-               pdx->dwDMAFlag = MODE_CHAR;     // Switch back to char mode before ReadWriteMem call
+               pdx->dwDMAFlag = MODE_CHAR;     /*  Switch back to char mode before ReadWriteMem call */
 
-               if (!bCancel)   // Don't look for waiting transfer if cancelled
-               {
-                       // If we have a transfer waiting, kick it off
-                       if (pdx->bXFerWaiting)  // Got a block xfer waiting?
-                       {
+               if (!bCancel) { /*  Don't look for waiting transfer if cancelled */
+                       /*  If we have a transfer waiting, kick it off */
+                       if (pdx->bXFerWaiting) {        /*  Got a block xfer waiting? */
                                int iReturn;
                                dev_info(&pdx->interface->dev,
                                         "*** RWM_Complete *** pending transfer will now be set up!!!");
@@ -682,22 +670,22 @@ static void staged_callback(struct urb *pUrb)
                        }
                }
 
-       } else                  // Here for more to do
-               StageChunk(pdx);        // fire off the next bit
+       } else                  /*  Here for more to do */
+               StageChunk(pdx);        /*  fire off the next bit */
 
-       // While we hold the stagedLock, see if we should reallow character input ints
-       // Don't allow if cancelled, or if a new block has started or if there is a waiting block.
-       // This feels wrong as we should ask which spin lock protects dwDMAFlag.
+       /*  While we hold the stagedLock, see if we should reallow character input ints */
+       /*  Don't allow if cancelled, or if a new block has started or if there is a waiting block. */
+       /*  This feels wrong as we should ask which spin lock protects dwDMAFlag. */
        bRestartCharInput = !bCancel && (pdx->dwDMAFlag == MODE_CHAR)
            && !pdx->bXFerWaiting;
 
-       spin_unlock(&pdx->stagedLock);  // Finally release the lock again
+       spin_unlock(&pdx->stagedLock);  /*  Finally release the lock again */
 
-       // This is not correct as dwDMAFlag is protected by the staged lock, but it is treated
-       // in Allowi as if it were protected by the char lock. In any case, most systems will
-       // not be upset by char input during DMA... sigh. Needs sorting out.
-       if (bRestartCharInput)  // may be out of date, but...
-               Allowi(pdx);    // ...Allowi tests a lock too.
+       /*  This is not correct as dwDMAFlag is protected by the staged lock, but it is treated */
+       /*  in Allowi as if it were protected by the char lock. In any case, most systems will */
+       /*  not be upset by char input during DMA... sigh. Needs sorting out. */
+       if (bRestartCharInput)  /*  may be out of date, but... */
+               Allowi(pdx);    /*  ...Allowi tests a lock too. */
        dev_dbg(&pdx->interface->dev, "%s done", __func__);
 }
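
The circular-buffer bookkeeping in this callback reduces to three cases: append to block 1 if it is already open, grow block 0 when the new data abuts its end, otherwise open block 1 (or start block 0 if nothing is stored yet). A condensed sketch of that decision tree; the struct and names are illustrative, not the driver's:

	struct blk { unsigned int offset, size; };

	/* Record a completed to-host transfer of len bytes landing at pos. */
	static void record_xfer(struct blk b[2], unsigned int pos, unsigned int len)
	{
		if (b[1].size > 0) {		/* block 1 open: must append to it */
			if (pos == b[1].offset + b[1].size)
				b[1].size += len;
			else {			/* "can't happen": restart block 1 */
				b[1].offset = pos;
				b[1].size = len;
			}
		} else if (b[0].size > 0 &&
			   pos == b[0].offset + b[0].size) {
			b[0].size += len;	/* abuts block 0: just grow it */
		} else if (b[0].size > 0) {	/* does not append: open block 1 */
			b[1].offset = pos;
			b[1].size = len;
		} else {			/* nothing stored yet: use block 0 */
			b[0].offset = pos;
			b[0].size = len;
		}
	}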
 
@@ -709,29 +697,28 @@ static void staged_callback(struct urb *pUrb)
 ** The calling code must have acquired the staging spinlock before calling
 **  this function, and is responsible for releasing it. We are at callback level.
 ****************************************************************************/
-static int StageChunk(DEVICE_EXTENSION * pdx)
+static int StageChunk(DEVICE_EXTENSION *pdx)
 {
        int iReturn = U14ERR_NOERROR;
        unsigned int ChunkSize;
-       int nPipe = pdx->StagedRead ? 3 : 2;    // The pipe number to use for reads or writes
+       int nPipe = pdx->StagedRead ? 3 : 2;    /*  The pipe number to use for reads or writes */
        if (pdx->nPipes == 3)
-               nPipe--;        // Adjust for the 3-pipe case
-       if (nPipe < 0)          // and trap case that should never happen
+               nPipe--;        /*  Adjust for the 3-pipe case */
+       if (nPipe < 0)          /*  and trap case that should never happen */
                return U14ERR_FAIL;
 
-       if (!CanAcceptIoRequests(pdx))  // got sudden remove?
-       {
+       if (!CanAcceptIoRequests(pdx)) {        /*  got sudden remove? */
                dev_info(&pdx->interface->dev, "%s sudden remove, giving up",
                         __func__);
-               return U14ERR_FAIL;     // could do with a better error
+               return U14ERR_FAIL;     /*  could do with a better error */
        }
 
-       ChunkSize = (pdx->StagedLength - pdx->StagedDone);      // transfer length remaining
-       if (ChunkSize > STAGED_SZ)      // make sure to keep legal
-               ChunkSize = STAGED_SZ;  //  limit to max allowed
+       ChunkSize = (pdx->StagedLength - pdx->StagedDone);      /*  transfer length remaining */
+       if (ChunkSize > STAGED_SZ)      /*  make sure to keep legal */
+               ChunkSize = STAGED_SZ;  /*   limit to max allowed */
 
-       if (!pdx->StagedRead)   // if writing...
-               CopyUserSpace(pdx, ChunkSize);  // ...copy data into the buffer
+       if (!pdx->StagedRead)   /*  if writing... */
+               CopyUserSpace(pdx, ChunkSize);  /*  ...copy data into the buffer */
 
        usb_fill_bulk_urb(pdx->pStagedUrb, pdx->udev,
                          pdx->StagedRead ? usb_rcvbulkpipe(pdx->udev,
@@ -740,15 +727,15 @@ static int StageChunk(DEVICE_EXTENSION * pdx)
                          usb_sndbulkpipe(pdx->udev, pdx->epAddr[nPipe]),
                          pdx->pCoherStagedIO, ChunkSize, staged_callback, pdx);
        pdx->pStagedUrb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
-       usb_anchor_urb(pdx->pStagedUrb, &pdx->submitted);       // in case we need to kill it
+       usb_anchor_urb(pdx->pStagedUrb, &pdx->submitted);       /*  in case we need to kill it */
        iReturn = usb_submit_urb(pdx->pStagedUrb, GFP_ATOMIC);
        if (iReturn) {
-               usb_unanchor_urb(pdx->pStagedUrb);      // kill it
-               pdx->bPipeError[nPipe] = 1;     // Flag an error to be handled later
+               usb_unanchor_urb(pdx->pStagedUrb);      /*  kill it */
+               pdx->bPipeError[nPipe] = 1;     /*  Flag an error to be handled later */
                dev_err(&pdx->interface->dev, "%s submit urb failed, code %d",
                        __func__, iReturn);
        } else
-               pdx->bStagedUrbPending = true;  // Set the flag for staged URB pending
+               pdx->bStagedUrbPending = true;  /*  Set the flag for staged URB pending */
        dev_dbg(&pdx->interface->dev, "%s done so far:%d, this size:%d",
                __func__, pdx->StagedDone, ChunkSize);
 
@@ -772,13 +759,12 @@ static int StageChunk(DEVICE_EXTENSION * pdx)
 **             transfer.
 **    dwLen - the number of bytes to transfer.
 */
-int ReadWriteMem(DEVICE_EXTENSION * pdx, bool Read, unsigned short wIdent,
+int ReadWriteMem(DEVICE_EXTENSION *pdx, bool Read, unsigned short wIdent,
                 unsigned int dwOffs, unsigned int dwLen)
 {
-       TRANSAREA *pArea = &pdx->rTransDef[wIdent];     // Transfer area info
+       TRANSAREA *pArea = &pdx->rTransDef[wIdent];     /*  Transfer area info */
 
-       if (!CanAcceptIoRequests(pdx))  // Are we in a state to accept new requests?
-       {
+       if (!CanAcceptIoRequests(pdx)) {        /*  Are we in a state to accept new requests? */
                dev_err(&pdx->interface->dev, "%s can't accept requests",
                        __func__);
                return U14ERR_FAIL;
@@ -788,56 +774,51 @@ int ReadWriteMem(DEVICE_EXTENSION * pdx, bool Read, unsigned short wIdent,
                "%s xfer %d bytes to %s, offset %d, area %d", __func__, dwLen,
                Read ? "host" : "1401", dwOffs, wIdent);
 
-       // Amazingly, we can get an escape sequence back before the current staged Urb is done, so we
-       //  have to check for this situation and, if so, wait until all is OK.
+       /*  Amazingly, we can get an escape sequence back before the current staged Urb is done, so we */
+       /*   have to check for this situation and, if so, wait until all is OK. */
        if (pdx->bStagedUrbPending) {
-               pdx->bXFerWaiting = true;       // Flag we are waiting
+               pdx->bXFerWaiting = true;       /*  Flag we are waiting */
                dev_info(&pdx->interface->dev,
                         "%s xfer is waiting, as previous staged pending",
                         __func__);
                return U14ERR_NOERROR;
        }
 
-       if (dwLen == 0)         // allow 0-len read or write; just return success
-       {
+       if (dwLen == 0) {               /*  allow 0-len read or write; just return success */
                dev_dbg(&pdx->interface->dev,
                        "%s OK; zero-len read/write request", __func__);
                return U14ERR_NOERROR;
        }
 
-       if ((pArea->bCircular) &&       // Circular transfer?
-           (pArea->bCircToHost) && (Read))     // In a supported direction
-       {                       // If so, we sort out offset ourself
-               bool bWait = false;     // Flag for transfer having to wait
+       if ((pArea->bCircular) &&       /*  Circular transfer? */
+           (pArea->bCircToHost) && (Read)) {   /*  In a supported direction */
+                               /*  If so, we sort out the offset ourselves */
+               bool bWait = false;     /*  Flag for transfer having to wait */
 
                dev_dbg(&pdx->interface->dev,
                        "Circular buffers are %d at %d and %d at %d",
                        pArea->aBlocks[0].dwSize, pArea->aBlocks[0].dwOffset,
                        pArea->aBlocks[1].dwSize, pArea->aBlocks[1].dwOffset);
-               if (pArea->aBlocks[1].dwSize > 0)       // Using the second block already?
-               {
-                       dwOffs = pArea->aBlocks[1].dwOffset + pArea->aBlocks[1].dwSize; // take offset from that
-                       bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;  // Wait if will overwrite block 0?
-                       bWait |= (dwOffs + dwLen) > pArea->dwLength;    // or if it overflows the buffer
-               } else          // Area 1 not in use, try to use area 0
-               {
-                       if (pArea->aBlocks[0].dwSize == 0)      // Reset block 0 if not in use
+               if (pArea->aBlocks[1].dwSize > 0) {     /*  Using the second block already? */
+                       dwOffs = pArea->aBlocks[1].dwOffset + pArea->aBlocks[1].dwSize; /*  take offset from that */
+                       bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;  /*  Wait if we would overwrite block 0 */
+                       bWait |= (dwOffs + dwLen) > pArea->dwLength;    /*  or if it overflows the buffer */
+               } else {                /*  Area 1 not in use, try to use area 0 */
+                       if (pArea->aBlocks[0].dwSize == 0)      /*  Reset block 0 if not in use */
                                pArea->aBlocks[0].dwOffset = 0;
                        dwOffs =
                            pArea->aBlocks[0].dwOffset +
                            pArea->aBlocks[0].dwSize;
-                       if ((dwOffs + dwLen) > pArea->dwLength) // Off the end of the buffer?
-                       {
-                               pArea->aBlocks[1].dwOffset = 0; // Set up to use second block
+                       if ((dwOffs + dwLen) > pArea->dwLength) {       /*  Off the end of the buffer? */
+                               pArea->aBlocks[1].dwOffset = 0; /*  Set up to use second block */
                                dwOffs = 0;
-                               bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;  // Wait if will overwrite block 0?
-                               bWait |= (dwOffs + dwLen) > pArea->dwLength;    // or if it overflows the buffer
+                               bWait = (dwOffs + dwLen) > pArea->aBlocks[0].dwOffset;  /*  Wait if we would overwrite block 0 */
+                               bWait |= (dwOffs + dwLen) > pArea->dwLength;    /*  or if it overflows the buffer */
                        }
                }
 
-               if (bWait)      // This transfer will have to wait?
-               {
-                       pdx->bXFerWaiting = true;       // Flag we are waiting
+               if (bWait) {    /*  This transfer will have to wait? */
+                       pdx->bXFerWaiting = true;       /*  Flag we are waiting */
                        dev_dbg(&pdx->interface->dev,
                                "%s xfer waiting for circular buffer space",
                                __func__);
@@ -848,17 +829,17 @@ int ReadWriteMem(DEVICE_EXTENSION * pdx, bool Read, unsigned short wIdent,
                        "%s circular xfer, %d bytes starting at %d", __func__,
                        dwLen, dwOffs);
        }
-       // Save the parameters for the read\write transfer
-       pdx->StagedRead = Read; // Save the parameters for this read
-       pdx->StagedId = wIdent; // ID allows us to get transfer area info
-       pdx->StagedOffset = dwOffs;     // The area within the transfer area
+       /*  Save the parameters for the read/write transfer */
+       pdx->StagedRead = Read; /*  Save the parameters for this read */
+       pdx->StagedId = wIdent; /*  ID allows us to get transfer area info */
+       pdx->StagedOffset = dwOffs;     /*  The area within the transfer area */
        pdx->StagedLength = dwLen;
-       pdx->StagedDone = 0;    // Initialise the byte count
-       pdx->dwDMAFlag = MODE_LINEAR;   // Set DMA mode flag at this point
-       pdx->bXFerWaiting = false;      // Clearly not a transfer waiting now
+       pdx->StagedDone = 0;    /*  Initialise the byte count */
+       pdx->dwDMAFlag = MODE_LINEAR;   /*  Set DMA mode flag at this point */
+       pdx->bXFerWaiting = false;      /*  Clearly not a transfer waiting now */
 
-//    KeClearEvent(&pdx->StagingDoneEvent);           // Clear the transfer done event
-       StageChunk(pdx);        // fire off the first chunk
+/*     KeClearEvent(&pdx->StagingDoneEvent);           // Clear the transfer done event */
+       StageChunk(pdx);        /*  fire off the first chunk */
 
        return U14ERR_NOERROR;
 }
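
On the submission side, ReadWriteMem() chooses where a circular to-host transfer should land and defers it when there is no room. The placement rules pulled out as a pure function; struct blk is redeclared so the sketch stands alone, and -1 signals that the transfer must wait:

	struct blk { unsigned int offset, size; };

	static int circ_place(struct blk b[2], unsigned int buflen, unsigned int len)
	{
		unsigned int offs;

		if (b[1].size > 0) {			/* already using block 1 */
			offs = b[1].offset + b[1].size;
			if (offs + len > b[0].offset ||	/* would overwrite block 0 */
			    offs + len > buflen)	/* or overflow the buffer */
				return -1;
			return (int)offs;
		}
		if (b[0].size == 0)
			b[0].offset = 0;		/* reset an idle block 0 */
		offs = b[0].offset + b[0].size;
		if (offs + len > buflen) {		/* off the end: wrap to 0 */
			b[1].offset = 0;
			offs = 0;
			if (len > b[0].offset ||	/* still hits block 0 */
			    len > buflen)		/* or cannot fit at all */
				return -1;
		}
		return (int)offs;
	}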
@@ -877,12 +858,11 @@ static bool ReadChar(unsigned char *pChar, char *pBuf, unsigned int *pdDone,
        bool bRead = false;
        unsigned int dDone = *pdDone;
 
-       if (dDone < dGot)       // If there is more data
-       {
-               *pChar = (unsigned char)pBuf[dDone];    // Extract the next char
-               dDone++;        // Increment the done count
+       if (dDone < dGot) {     /*  If there is more data */
+               *pChar = (unsigned char)pBuf[dDone];    /*  Extract the next char */
+               dDone++;        /*  Increment the done count */
                *pdDone = dDone;
-               bRead = true;   // and flag success
+               bRead = true;   /*  and flag success */
        }
 
        return bRead;
@@ -962,32 +942,32 @@ static bool ReadHuff(volatile unsigned int *pDWord, char *pBuf,
 **  we start handling the data at offset zero.
 **
 *****************************************************************************/
-static bool ReadDMAInfo(volatile DMADESC * pDmaDesc, DEVICE_EXTENSION * pdx,
+static bool ReadDMAInfo(volatile DMADESC *pDmaDesc, DEVICE_EXTENSION *pdx,
                        char *pBuf, unsigned int dwCount)
 {
-       bool bResult = false;   // assume we won't succeed
+       bool bResult = false;   /*  assume we won't succeed */
        unsigned char ucData;
-       unsigned int dDone = 0; // We haven't parsed anything so far
+       unsigned int dDone = 0; /*  We haven't parsed anything so far */
 
        dev_dbg(&pdx->interface->dev, "%s", __func__);
 
        if (ReadChar(&ucData, pBuf, &dDone, dwCount)) {
-               unsigned char ucTransCode = (ucData & 0x0F);    // get code for transfer type
-               unsigned short wIdent = ((ucData >> 4) & 0x07); // and area identifier
+               unsigned char ucTransCode = (ucData & 0x0F);    /*  get code for transfer type */
+               unsigned short wIdent = ((ucData >> 4) & 0x07); /*  and area identifier */
 
-               // fill in the structure we were given
-               pDmaDesc->wTransType = ucTransCode;     // type of transfer
-               pDmaDesc->wIdent = wIdent;      // area to use
-               pDmaDesc->dwSize = 0;   // initialise other bits
+               /*  fill in the structure we were given */
+               pDmaDesc->wTransType = ucTransCode;     /*  type of transfer */
+               pDmaDesc->wIdent = wIdent;      /*  area to use */
+               pDmaDesc->dwSize = 0;   /*  initialise other bits */
                pDmaDesc->dwOffset = 0;
 
                dev_dbg(&pdx->interface->dev, "%s type: %d ident: %d", __func__,
                        pDmaDesc->wTransType, pDmaDesc->wIdent);
 
-               pDmaDesc->bOutWard = (ucTransCode != TM_EXTTOHOST);     // set transfer direction
+               pDmaDesc->bOutWard = (ucTransCode != TM_EXTTOHOST);     /*  set transfer direction */
 
                switch (ucTransCode) {
-               case TM_EXTTOHOST:      // Extended linear transfer modes (the only ones!)
+               case TM_EXTTOHOST:      /*  Extended linear transfer modes (the only ones!) */
                case TM_EXTTO1401:
                        {
                                bResult =
@@ -1001,14 +981,14 @@ static bool ReadDMAInfo(volatile DMADESC * pDmaDesc, DEVICE_EXTENSION * pdx,
                                                __func__, pDmaDesc->dwOffset,
                                                pDmaDesc->dwSize);
 
-                                       if ((wIdent >= MAX_TRANSAREAS) ||       // Illegal area number, or...
-                                           (!pdx->rTransDef[wIdent].bUsed) ||  // area not set up, or...
-                                           (pDmaDesc->dwOffset > pdx->rTransDef[wIdent].dwLength) ||   // range/size
+                                       if ((wIdent >= MAX_TRANSAREAS) ||       /*  Illegal area number, or... */
+                                           (!pdx->rTransDef[wIdent].bUsed) ||  /*  area not set up, or... */
+                                           (pDmaDesc->dwOffset > pdx->rTransDef[wIdent].dwLength) ||   /*  range/size */
                                            ((pDmaDesc->dwOffset +
                                              pDmaDesc->dwSize) >
                                             (pdx->rTransDef[wIdent].
                                              dwLength))) {
-                                               bResult = false;        // bad parameter(s)
+                                               bResult = false;        /*  bad parameter(s) */
                                                dev_dbg(&pdx->interface->dev,
                                                        "%s bad param - id %d, bUsed %d, offset %d, size %d, area length %d",
                                                        __func__, wIdent,
@@ -1028,7 +1008,7 @@ static bool ReadDMAInfo(volatile DMADESC * pDmaDesc, DEVICE_EXTENSION * pdx,
        } else
                bResult = false;
 
-       if (!bResult)           // now check parameters for validity
+       if (!bResult)           /*  now check parameters for validity */
                dev_err(&pdx->interface->dev, "%s error reading Esc sequence",
                        __func__);
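
The first escape byte packs two fields: the low nibble is the transfer-type code and bits 4 to 6 are the transfer-area identifier. Decoding '?' (0x3f) this way gives code 15 and area 3, which is exactly the unused combination the comment in Handle1401Esc() below puzzles over. A worked example:

	#include <stdio.h>

	int main(void)
	{
		unsigned char data = 0x3f;		   /* '?' */
		unsigned char code = data & 0x0f;	   /* transfer type: 15 */
		unsigned short ident = (data >> 4) & 0x07; /* area identifier: 3 */

		printf("code %u, area %u\n", code, ident); /* code 15, area 3 */
		return 0;
	}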
 
@@ -1049,30 +1029,29 @@ static bool ReadDMAInfo(volatile DMADESC * pDmaDesc, DEVICE_EXTENSION * pdx,
 **           this is known to be at least 2 or we will not be called.
 **
 ****************************************************************************/
-static int Handle1401Esc(DEVICE_EXTENSION * pdx, char *pCh,
+static int Handle1401Esc(DEVICE_EXTENSION *pdx, char *pCh,
                         unsigned int dwCount)
 {
        int iReturn = U14ERR_FAIL;
 
-       // I have no idea what this next test is about. '?' is 0x3f, which is area 3, code
-       // 15. At the moment, this is not used, so it does no harm, but unless someone can
-       // tell me what this is for, it should be removed from this and the Windows driver.
-       if (pCh[0] == '?')      // Is this an information response
-       {                       // Parse and save the information
+       /*  I have no idea what this next test is about. '?' is 0x3f, which is area 3, code */
+       /*  15. At the moment, this is not used, so it does no harm, but unless someone can */
+       /*  tell me what this is for, it should be removed from this and the Windows driver. */
+       if (pCh[0] == '?') {    /*  Is this an information response */
+                               /*  Parse and save the information */
        } else {
-               spin_lock(&pdx->stagedLock);    // Lock others out
+               spin_lock(&pdx->stagedLock);    /*  Lock others out */
 
-               if (ReadDMAInfo(&pdx->rDMAInfo, pdx, pCh, dwCount))     // Get DMA parameters
-               {
-                       unsigned short wTransType = pdx->rDMAInfo.wTransType;   // check transfer type
+               if (ReadDMAInfo(&pdx->rDMAInfo, pdx, pCh, dwCount)) {   /*  Get DMA parameters */
+                       unsigned short wTransType = pdx->rDMAInfo.wTransType;   /*  check transfer type */
 
                        dev_dbg(&pdx->interface->dev,
                                "%s xfer to %s, offset %d, length %d", __func__,
                                pdx->rDMAInfo.bOutWard ? "1401" : "host",
                                pdx->rDMAInfo.dwOffset, pdx->rDMAInfo.dwSize);
 
-                       if (pdx->bXFerWaiting)  // Check here for badly out of kilter...
-                       {       // This can never happen, really
+                       if (pdx->bXFerWaiting) { /*  Check here for badly out of kilter... */
+                               /*  This can never happen, really */
                                dev_err(&pdx->interface->dev,
                                        "ERROR: DMA setup while transfer still waiting");
                                spin_unlock(&pdx->stagedLock);
@@ -1090,16 +1069,16 @@ static int Handle1401Esc(DEVICE_EXTENSION * pdx, char *pCh,
                                                dev_err(&pdx->interface->dev,
                                                        "%s ReadWriteMem() failed %d",
                                                        __func__, iReturn);
-                               } else  // This covers non-linear transfer setup
+                               } else  /*  This covers non-linear transfer setup */
                                        dev_err(&pdx->interface->dev,
                                                "%s Unknown block xfer type %d",
                                                __func__, wTransType);
                        }
-               } else          // Failed to read parameters
+               } else          /*  Failed to read parameters */
                        dev_err(&pdx->interface->dev, "%s ReadDMAInfo() fail",
                                __func__);
 
-               spin_unlock(&pdx->stagedLock);  // OK here
+               spin_unlock(&pdx->stagedLock);  /*  OK here */
        }
 
        dev_dbg(&pdx->interface->dev, "%s returns %d", __func__, iReturn);
@@ -1113,12 +1092,11 @@ static int Handle1401Esc(DEVICE_EXTENSION * pdx, char *pCh,
 static void ced_readchar_callback(struct urb *pUrb)
 {
        DEVICE_EXTENSION *pdx = pUrb->context;
-       int nGot = pUrb->actual_length; // what we transferred
+       int nGot = pUrb->actual_length; /*  what we transferred */
 
-       if (pUrb->status)       // Do we have a problem to handle?
-       {
-               int nPipe = pdx->nPipes == 4 ? 1 : 0;   // The pipe number to use for error
-               // sync/async unlink faults aren't errors... just saying device removed or stopped
+       if (pUrb->status) {     /*  Do we have a problem to handle? */
+               int nPipe = pdx->nPipes == 4 ? 1 : 0;   /*  The pipe number to use for error */
+               /*  sync/async unlink faults aren't errors... just saying device removed or stopped */
                if (!
                    (pUrb->status == -ENOENT || pUrb->status == -ECONNRESET
                     || pUrb->status == -ESHUTDOWN)) {
@@ -1133,27 +1111,26 @@ static void ced_readchar_callback(struct urb *pUrb)
                spin_lock(&pdx->err_lock);
                pdx->errors = pUrb->status;
                spin_unlock(&pdx->err_lock);
-               nGot = 0;       //  and tidy up again if so
+               nGot = 0;       /*   and tidy up again if so */
 
-               spin_lock(&pdx->charInLock);    // already at irq level
-               pdx->bPipeError[nPipe] = 1;     // Flag an error for later
+               spin_lock(&pdx->charInLock);    /*  already at irq level */
+               pdx->bPipeError[nPipe] = 1;     /*  Flag an error for later */
        } else {
-               if ((nGot > 1) && ((pdx->pCoherCharIn[0] & 0x7f) == 0x1b))      // Esc sequence?
-               {
-                       Handle1401Esc(pdx, &pdx->pCoherCharIn[1], nGot - 1);    // handle it
-                       spin_lock(&pdx->charInLock);    // already at irq level
+               if ((nGot > 1) && ((pdx->pCoherCharIn[0] & 0x7f) == 0x1b)) {    /*  Esc sequence? */
+                       Handle1401Esc(pdx, &pdx->pCoherCharIn[1], nGot - 1);    /*  handle it */
+                       spin_lock(&pdx->charInLock);    /*  already at irq level */
                } else {
-                       spin_lock(&pdx->charInLock);    // already at irq level
+                       spin_lock(&pdx->charInLock);    /*  already at irq level */
                        if (nGot > 0) {
                                unsigned int i;
                                if (nGot < INBUF_SZ) {
-                                       pdx->pCoherCharIn[nGot] = 0;    // tidy the string
+                                       pdx->pCoherCharIn[nGot] = 0;    /*  tidy the string */
                                        dev_dbg(&pdx->interface->dev,
                                                "%s got %d chars >%s<",
                                                __func__, nGot,
                                                pdx->pCoherCharIn);
                                }
-                               // We know that whatever we read must fit in the input buffer
+                               /*  We know that whatever we read must fit in the input buffer */
                                for (i = 0; i < nGot; i++) {
                                        pdx->inputBuffer[pdx->dwInBuffPut++] =
                                            pdx->pCoherCharIn[i] & 0x7F;
@@ -1162,17 +1139,17 @@ static void ced_readchar_callback(struct urb *pUrb)
                                }
 
                                if ((pdx->dwNumInput + nGot) <= INBUF_SZ)
-                                       pdx->dwNumInput += nGot;        // Adjust the buffer count accordingly
+                                       pdx->dwNumInput += nGot;        /*  Adjust the buffer count accordingly */
                        } else
                                dev_dbg(&pdx->interface->dev, "%s read ZLP",
                                        __func__);
                }
        }
 
-       pdx->bReadCharsPending = false; // No longer have a pending read
-       spin_unlock(&pdx->charInLock);  // already at irq level
+       pdx->bReadCharsPending = false; /*  No longer have a pending read */
+       spin_unlock(&pdx->charInLock);  /*  already at irq level */
 
-       Allowi(pdx);    // see if we can do the next one
+       Allowi(pdx);    /*  see if we can do the next one */
 }
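
Incoming 1401 characters are 7-bit, so the callback masks off the top bit both when testing for the escape lead-in and when copying into the input buffer. The test as a tiny predicate (names illustrative):

	#define ESC_CHAR 0x1b

	/* True if a completed character read opens a 1401 escape sequence. */
	static int is_esc_sequence(const unsigned char *buf, int got)
	{
		return got > 1 && (buf[0] & 0x7f) == ESC_CHAR;
	}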
 
 /****************************************************************************
@@ -1182,25 +1159,25 @@ static void ced_readchar_callback(struct urb *pUrb)
 ** we can pick up any inward transfers. This can be called in multiple contexts
 ** so we use the irqsave version of the spinlock.
 ****************************************************************************/
-int Allowi(DEVICE_EXTENSION * pdx)
+int Allowi(DEVICE_EXTENSION *pdx)
 {
        int iReturn = U14ERR_NOERROR;
        unsigned long flags;
-       spin_lock_irqsave(&pdx->charInLock, flags);     // can be called in multiple contexts
-
-       // We don't want char input running while DMA is in progress as we know that this
-       //  can cause sequencing problems for the 2270. So don't. It will also allow the
-       //  ERR response to get back to the host code too early on some PCs, even if there
-       //  is no actual driver failure, so we don't allow this at all.
-       if (!pdx->bInDrawDown &&        // stop input if
-           !pdx->bReadCharsPending &&  // If no read request outstanding
-           (pdx->dwNumInput < (INBUF_SZ / 2)) &&       //  and there is some space
-           (pdx->dwDMAFlag == MODE_CHAR) &&    //  not doing any DMA
-           (!pdx->bXFerWaiting) &&     //  no xfer waiting to start
-           (CanAcceptIoRequests(pdx))) //  and activity is generally OK
-       {                       //  then off we go
-               unsigned int nMax = INBUF_SZ - pdx->dwNumInput; // max we could read
-               int nPipe = pdx->nPipes == 4 ? 1 : 0;   // The pipe number to use
+       spin_lock_irqsave(&pdx->charInLock, flags);     /*  can be called in multiple contexts */
+
+       /*  We don't want char input running while DMA is in progress as we know that this */
+       /*   can cause sequencing problems for the 2270. So don't. It will also allow the */
+       /*   ERR response to get back to the host code too early on some PCs, even if there */
+       /*   is no actual driver failure, so we don't allow this at all. */
+       if (!pdx->bInDrawDown &&        /*  stop input if */
+           !pdx->bReadCharsPending &&  /*  If no read request outstanding */
+           (pdx->dwNumInput < (INBUF_SZ / 2)) &&       /*   and there is some space */
+           (pdx->dwDMAFlag == MODE_CHAR) &&    /*   not doing any DMA */
+           (!pdx->bXFerWaiting) &&     /*   no xfer waiting to start */
+           (CanAcceptIoRequests(pdx))) { /*   and activity is generally OK */
+                               /*   then off we go */
+               unsigned int nMax = INBUF_SZ - pdx->dwNumInput; /*  max we could read */
+               int nPipe = pdx->nPipes == 4 ? 1 : 0;   /*  The pipe number to use */
 
                dev_dbg(&pdx->interface->dev, "%s %d chars in input buffer",
                        __func__, pdx->dwNumInput);
@@ -1209,16 +1186,16 @@ int Allowi(DEVICE_EXTENSION * pdx)
                                 usb_rcvintpipe(pdx->udev, pdx->epAddr[nPipe]),
                                 pdx->pCoherCharIn, nMax, ced_readchar_callback,
                                 pdx, pdx->bInterval);
-               pdx->pUrbCharIn->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;     // short xfers are OK by default
-               usb_anchor_urb(pdx->pUrbCharIn, &pdx->submitted);       // in case we need to kill it
+               pdx->pUrbCharIn->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;     /*  short xfers are OK by default */
+               usb_anchor_urb(pdx->pUrbCharIn, &pdx->submitted);       /*  in case we need to kill it */
                iReturn = usb_submit_urb(pdx->pUrbCharIn, GFP_ATOMIC);
                if (iReturn) {
-                       usb_unanchor_urb(pdx->pUrbCharIn);      // remove from list of active Urbs
-                       pdx->bPipeError[nPipe] = 1;     // Flag an error to be handled later
+                       usb_unanchor_urb(pdx->pUrbCharIn);      /*  remove from list of active Urbs */
+                       pdx->bPipeError[nPipe] = 1;     /*  Flag an error to be handled later */
                        dev_err(&pdx->interface->dev,
                                "%s submit urb failed: %d", __func__, iReturn);
                } else
-                       pdx->bReadCharsPending = true;  // Flag that we are active here
+                       pdx->bReadCharsPending = true;  /*  Flag that we are active here */
        }
 
        spin_unlock_irqrestore(&pdx->charInLock, flags);
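
The guard in Allowi() is the heart of the routine: character input is deliberately held off during DMA because it can upset 2270 sequencing and let an ERR response reach the host too early. The whole gate as a single predicate, with the DEVICE_EXTENSION fields flattened into parameters for the sketch:

	/* Conditions under which a new character-input URB may be queued. */
	static int may_start_char_input(int in_drawdown, int read_pending,
					unsigned int num_input, unsigned int bufsz,
					int dma_is_char_mode, int xfer_waiting,
					int io_ok)
	{
		return !in_drawdown &&		/* not draining the device */
		       !read_pending &&		/* no read already outstanding */
		       num_input < bufsz / 2 &&	/* keep half the buffer free */
		       dma_is_char_mode &&	/* no DMA in progress */
		       !xfer_waiting &&		/* no transfer about to start */
		       io_ok;			/* device still accepting I/O */
	}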
@@ -1238,15 +1215,15 @@ static long ced_ioctl(struct file *file, unsigned int cmd, unsigned long ulArg)
 {
        int err = 0;
        DEVICE_EXTENSION *pdx = file->private_data;
-       if (!CanAcceptIoRequests(pdx))  // check we still exist
+       if (!CanAcceptIoRequests(pdx))  /*  check we still exist */
                return -ENODEV;
 
-       // Check that access is allowed, where is is needed. Anything that would have an indeterminate
-       // size will be checked by the specific command.
-       if (_IOC_DIR(cmd) & _IOC_READ)  // read from point of view of user...
-               err = !access_ok(VERIFY_WRITE, (void __user *)ulArg, _IOC_SIZE(cmd));   // is kernel write
-       else if (_IOC_DIR(cmd) & _IOC_WRITE)    // and write from point of view of user...
-               err = !access_ok(VERIFY_READ, (void __user *)ulArg, _IOC_SIZE(cmd));    // is kernel read
+       /*  Check that access is allowed, where it is needed. Anything that would have an indeterminate */
+       /*  size will be checked by the specific command. */
+       if (_IOC_DIR(cmd) & _IOC_READ)  /*  read from point of view of user... */
+               err = !access_ok(VERIFY_WRITE, (void __user *)ulArg, _IOC_SIZE(cmd));   /*  is kernel write */
+       else if (_IOC_DIR(cmd) & _IOC_WRITE)    /*  and write from point of view of user... */
+               err = !access_ok(VERIFY_READ, (void __user *)ulArg, _IOC_SIZE(cmd));    /*  is kernel read */
        if (err)
                return -EFAULT;
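The pre-check above uses the standard _IOC_DIR() convention: _IOC_READ means the ioctl copies data out to user space, so the kernel needs write access to the buffer, and _IOC_WRITE the reverse. The same check in stand-alone form (a sketch using the three-argument access_ok() of this kernel generation):

#include <linux/ioctl.h>
#include <linux/uaccess.h>

/* Return 0 if the user buffer encoded in cmd/arg is accessible in the
 * direction(s) the command requires, -EFAULT otherwise.
 */
static int check_ioctl_buffer(unsigned int cmd, unsigned long arg)
{
        if ((_IOC_DIR(cmd) & _IOC_READ) &&
            !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)))
                return -EFAULT;         /* kernel would write to user memory */
        if ((_IOC_DIR(cmd) & _IOC_WRITE) &&
            !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))
                return -EFAULT;         /* kernel would read from user memory */
        return 0;
}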
 
@@ -1289,7 +1266,7 @@ static long ced_ioctl(struct file *file, unsigned int cmd, unsigned long ulArg)
                return -1;
 
        case _IOC_NR(IOCTL_CED_GETDRIVERREVISION):
-               return (2 << 24) | (DRIVERMAJREV << 16) | DRIVERMINREV; // USB | MAJOR | MINOR
+               return (2 << 24) | (DRIVERMAJREV << 16) | DRIVERMINREV; /*  USB | MAJOR | MINOR */
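With DRIVERMAJREV == 2 and DRIVERMINREV == 0 (defined in usb1401.h below), the revision word returned above decodes as transport | major | minor:

/* (2 << 24) | (2 << 16) | 0  ==  0x02020000
 *  USB         major 2       minor 0
 */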
 
        case _IOC_NR(IOCTL_CED_GETTRANSFER):
                return GetTransfer(pdx, (TGET_TX_BLOCK __user *) ulArg);
@@ -1335,7 +1312,7 @@ static long ced_ioctl(struct file *file, unsigned int cmd, unsigned long ulArg)
                return DbgStopLoop(pdx);
 
        case _IOC_NR(IOCTL_CED_FULLRESET):
-               pdx->bForceReset = true;        // Set a flag for a full reset
+               pdx->bForceReset = true;        /*  Set a flag for a full reset */
                break;
 
        case _IOC_NR(IOCTL_CED_SETCIRCULAR):
@@ -1378,8 +1355,8 @@ static struct usb_class_driver ced_class = {
        .minor_base = USB_CED_MINOR_BASE,
 };
 
-// Check that the device that matches a 1401 vendor and product ID is OK to use and
-// initialise our DEVICE_EXTENSION.
+/*  Check that the device that matches a 1401 vendor and product ID is OK to use and */
+/*  initialise our DEVICE_EXTENSION. */
 static int ced_probe(struct usb_interface *interface,
                     const struct usb_device_id *id)
 {
@@ -1389,23 +1366,22 @@ static int ced_probe(struct usb_interface *interface,
        int i, bcdDevice;
        int retval = -ENOMEM;
 
-       // allocate memory for our device extension and initialize it
+       /*  allocate memory for our device extension and initialize it */
        pdx = kzalloc(sizeof(*pdx), GFP_KERNEL);
        if (!pdx)
                goto error;
 
-       for (i = 0; i < MAX_TRANSAREAS; ++i)    // Initialise the wait queues
-       {
+       for (i = 0; i < MAX_TRANSAREAS; ++i) {  /*  Initialise the wait queues */
                init_waitqueue_head(&pdx->rTransDef[i].wqEvent);
        }
 
-       // Put initialises for our stuff here. Note that all of *pdx is zero, so
-       // no need to explicitly zero it.
+       /*  Put initialises for our stuff here. Note that all of *pdx is zero, so */
+       /*  no need to explicitly zero it. */
        spin_lock_init(&pdx->charOutLock);
        spin_lock_init(&pdx->charInLock);
        spin_lock_init(&pdx->stagedLock);
 
-       // Initialises from the skeleton stuff
+       /*  Initialises from the skeleton stuff */
        kref_init(&pdx->kref);
        mutex_init(&pdx->io_mutex);
        spin_lock_init(&pdx->err_lock);
@@ -1414,7 +1390,7 @@ static int ced_probe(struct usb_interface *interface,
        pdx->udev = usb_get_dev(interface_to_usbdev(interface));
        pdx->interface = interface;
 
-       // Attempt to identify the device
+       /*  Attempt to identify the device */
        bcdDevice = pdx->udev->descriptor.bcdDevice;
        i = (bcdDevice >> 8);
        if (i == 0)
@@ -1426,8 +1402,8 @@ static int ced_probe(struct usb_interface *interface,
                        __func__, bcdDevice);
                goto error;
        }
-       // set up the endpoint information. We only care about the number of EP as
-       // we know that we are dealing with a 1401 device.
+       /*  set up the endpoint information. We only care about the number of EP as */
+       /*  we know that we are dealing with a 1401 device. */
        iface_desc = interface->cur_altsetting;
        pdx->nPipes = iface_desc->desc.bNumEndpoints;
        dev_info(&interface->dev, "1401Type=%d with %d End Points",
@@ -1435,10 +1411,10 @@ static int ced_probe(struct usb_interface *interface,
        if ((pdx->nPipes < 3) || (pdx->nPipes > 4))
                goto error;
 
-       // Allocate the URBs we hold for performing transfers
-       pdx->pUrbCharOut = usb_alloc_urb(0, GFP_KERNEL);        // character output URB
-       pdx->pUrbCharIn = usb_alloc_urb(0, GFP_KERNEL); // character input URB
-       pdx->pStagedUrb = usb_alloc_urb(0, GFP_KERNEL); // block transfer URB
+       /*  Allocate the URBs we hold for performing transfers */
+       pdx->pUrbCharOut = usb_alloc_urb(0, GFP_KERNEL);        /*  character output URB */
+       pdx->pUrbCharIn = usb_alloc_urb(0, GFP_KERNEL); /*  character input URB */
+       pdx->pStagedUrb = usb_alloc_urb(0, GFP_KERNEL); /*  block transfer URB */
        if (!pdx->pUrbCharOut || !pdx->pUrbCharIn || !pdx->pStagedUrb) {
                dev_err(&interface->dev, "%s URB alloc failed", __func__);
                goto error;
@@ -1464,15 +1440,14 @@ static int ced_probe(struct usb_interface *interface,
                pdx->epAddr[i] = endpoint->bEndpointAddress;
                dev_info(&interface->dev, "Pipe %d, ep address %02x", i,
                         pdx->epAddr[i]);
-               if (((pdx->nPipes == 3) && (i == 0)) || // if char input end point
+               if (((pdx->nPipes == 3) && (i == 0)) || /*  if char input end point */
                    ((pdx->nPipes == 4) && (i == 1))) {
-                       pdx->bInterval = endpoint->bInterval;   // save the endpoint interrupt interval
+                       pdx->bInterval = endpoint->bInterval;   /*  save the endpoint interrupt interval */
                        dev_info(&interface->dev, "Pipe %d, bInterval = %d", i,
                                 pdx->bInterval);
                }
-               // Detect USB2 by checking last ep size (64 if USB1)
-               if (i == pdx->nPipes - 1)       // if this is the last ep (bulk)
-               {
+               /*  Detect USB2 by checking last ep size (64 if USB1) */
+               if (i == pdx->nPipes - 1) {     /*  if this is the last ep (bulk) */
                        pdx->bIsUSB2 =
                            le16_to_cpu(endpoint->wMaxPacketSize) > 64;
                        dev_info(&pdx->interface->dev, "USB%d",
@@ -1501,7 +1476,7 @@ static int ced_probe(struct usb_interface *interface,
 
 error:
        if (pdx)
-               kref_put(&pdx->kref, ced_delete);       // frees allocated memory
+               kref_put(&pdx->kref, ced_delete);       /*  frees allocated memory */
        return retval;
 }
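The single error label in the probe above works because kref_init() has already taken the first reference: every failure path drops it through kref_put(), making the release callback the only place that frees resources. A minimal sketch of such a release callback (illustrative names, not the driver's ced_delete()):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/usb.h>

struct my_dev {                         /* illustrative */
        struct usb_device *udev;
        struct urb *urb_in;
        struct kref kref;
};

static void my_delete(struct kref *kref)
{
        struct my_dev *dev = container_of(kref, struct my_dev, kref);

        usb_free_urb(dev->urb_in);      /* usb_free_urb() is NULL-safe */
        usb_put_dev(dev->udev);         /* balance usb_get_dev() from probe */
        kfree(dev);
}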
 
@@ -1511,39 +1486,39 @@ static void ced_disconnect(struct usb_interface *interface)
        int minor = interface->minor;
        int i;
 
-       usb_set_intfdata(interface, NULL);      // remove the pdx from the interface
-       usb_deregister_dev(interface, &ced_class);      // give back our minor device number
+       usb_set_intfdata(interface, NULL);      /*  remove the pdx from the interface */
+       usb_deregister_dev(interface, &ced_class);      /*  give back our minor device number */
 
-       mutex_lock(&pdx->io_mutex);     // stop more I/O starting while...
-       ced_draw_down(pdx);     // ...wait for then kill any io
+       mutex_lock(&pdx->io_mutex);     /*  stop more I/O starting while... */
+       ced_draw_down(pdx);     /*  ...wait for then kill any io */
        for (i = 0; i < MAX_TRANSAREAS; ++i) {
-               int iErr = ClearArea(pdx, i);   // ...release any used memory
+               int iErr = ClearArea(pdx, i);   /*  ...release any used memory */
                if (iErr == U14ERR_UNLOCKFAIL)
                        dev_err(&pdx->interface->dev, "%s Area %d was in use",
                                __func__, i);
        }
-       pdx->interface = NULL;  // ...we kill off link to interface
+       pdx->interface = NULL;  /*  ...we kill off link to interface */
        mutex_unlock(&pdx->io_mutex);
 
        usb_kill_anchored_urbs(&pdx->submitted);
 
-       kref_put(&pdx->kref, ced_delete);       // decrement our usage count
+       kref_put(&pdx->kref, ced_delete);       /*  decrement our usage count */
 
        dev_info(&interface->dev, "USB cedusb #%d now disconnected", minor);
 }
 
-// Wait for all the urbs we know of to be done with, then kill off any that
-// are left. NBNB we will need to have a mechanism to stop circular xfers
-// from trying to fire off more urbs. We will wait up to 3 seconds for Urbs
-// to be done.
-void ced_draw_down(DEVICE_EXTENSION * pdx)
+/*  Wait for all the urbs we know of to be done with, then kill off any that */
+/*  are left. NBNB we will need to have a mechanism to stop circular xfers */
+/*  from trying to fire off more urbs. We will wait up to 3 seconds for Urbs */
+/*  to be done. */
+void ced_draw_down(DEVICE_EXTENSION *pdx)
 {
        int time;
        dev_dbg(&pdx->interface->dev, "%s called", __func__);
 
        pdx->bInDrawDown = true;
        time = usb_wait_anchor_empty_timeout(&pdx->submitted, 3000);
-       if (!time) {            // if we timed out we kill the urbs
+       if (!time) {            /*  if we timed out we kill the urbs */
                usb_kill_anchored_urbs(&pdx->submitted);
                dev_err(&pdx->interface->dev, "%s timed out", __func__);
        }
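Reduced to its essentials, the wait-then-kill idiom above is (a sketch; dev->submitted is the usb_anchor every URB was attached to at submit time):

static void my_draw_down(struct my_dev *dev)
{
        /* returns time left, or 0 if the 3000 ms timeout expired */
        if (!usb_wait_anchor_empty_timeout(&dev->submitted, 3000))
                usb_kill_anchored_urbs(&dev->submitted);
}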
index 8fc6958b6f08d32438f6bf751eec93a8daeb1558..f031e3a2c7cf209fbf82188cade39567502d846e 100644 (file)
 #define UINT unsigned int
 #endif
 
-/// Device type codes, but these don't need to be extended - a succession is assumed
-/// These are set for usb from the bcdDevice field (suitably mangled). Future devices
-/// will be added in order of device creation to the list, so the names here are just
-/// to help use remember which device is which. The U14ERR_... values follow the same
-/// pattern for modern devices.
-#define TYPEUNKNOWN        -1             // dont know
-#define TYPE1401           0              // standard 1401
-#define TYPEPLUS           1              // 1401 plus
-#define TYPEU1401          2              // u1401
-#define TYPEPOWER          3              // Power1401
-#define TYPEU14012         4              // u1401 mkII
-#define TYPEPOWER2         5              // Power1401 mk II
-#define TYPEMICRO3         6              // Micro1401-3
-#define TYPEPOWER3         7              // Power1401-3
-
-/// Some useful defines of constants. DONT FORGET to change the version in the
-/// resources whenever you change it here!.
-#define DRIVERMAJREV      2             // driver revision level major (match windows)
-#define DRIVERMINREV      0             // driver revision level minor
-
-/// Definitions of the various block transfer command codes
-#define TM_EXTTOHOST    8               // extended tohost
-#define TM_EXTTO1401    9               // extended to1401
-
-/// Definitions of values in usbReqtype. Used in sorting out setup actions
+/** Device type codes, but these don't need to be extended - a succession is assumed
+** These are set for usb from the bcdDevice field (suitably mangled). Future devices
+** will be added in order of device creation to the list, so the names here are just
+** to help us remember which device is which. The U14ERR_... values follow the same
+** pattern for modern devices.
+**/
+#define TYPEUNKNOWN        -1             /*  dont know */
+#define TYPE1401           0              /*  standard 1401 */
+#define TYPEPLUS           1              /*  1401 plus */
+#define TYPEU1401          2              /*  u1401 */
+#define TYPEPOWER          3              /*  Power1401 */
+#define TYPEU14012         4              /*  u1401 mkII */
+#define TYPEPOWER2         5              /*  Power1401 mk II */
+#define TYPEMICRO3         6              /*  Micro1401-3 */
+#define TYPEPOWER3         7              /*  Power1401-3 */
+
+/*  Some useful defines of constants. DONT FORGET to change the version in the */
+/*  resources whenever you change it here! */
+#define DRIVERMAJREV      2             /*  driver revision level major (match windows) */
+#define DRIVERMINREV      0             /*  driver revision level minor */
+
+/*  Definitions of the various block transfer command codes */
+#define TM_EXTTOHOST    8               /*  extended tohost */
+#define TM_EXTTO1401    9               /*  extended to1401 */
+
+/*  Definitions of values in usbReqtype. Used in sorting out setup actions */
 #define H_TO_D 0x00
 #define D_TO_H 0x80
 #define VENDOR 0x40
@@ -58,7 +59,7 @@
 #define INTREQ 0x01
 #define ENDREQ 0x02
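These values combine into the bmRequestType byte of a USB setup packet: direction in bit 7, type in bits 5-6, recipient in bits 0-4. For example (a sketch, not a line from the driver):

/* vendor-specific IN request addressed to the interface */
__u8 bmRequestType = D_TO_H | VENDOR | INTREQ;  /* 0x80 | 0x40 | 0x01 == 0xC1 */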
 
-/// Definition of values in usbRequest, again used to sort out setup
+/*  Definition of values in usbRequest, again used to sort out setup */
 #define GET_STATUS      0x00
 #define CLEAR_FEATURE   0x01
 #define SET_FEATURE     0x03
@@ -71,8 +72,8 @@
 #define SET_INTERFACE   0x0b
 #define SYNCH_FRAME     0x0c
 
-/// Definitions of the various debug command codes understood by the 1401. These
-/// are used in various vendor-specific commands to achieve the desired effect
+/*  Definitions of the various debug command codes understood by the 1401. These */
+/*  are used in various vendor-specific commands to achieve the desired effect */
 #define DB_GRAB         0x50            /* Grab is a NOP for USB */
 #define DB_FREE         0x51            /* Free is a NOP for the USB */
 #define DB_SETADD       0x52            /* Set debug address (double) */
 #define CR_CHAR          0x0D           /* The carriage return character */
 #define CR_CHAR_80       0x8d           /*  and with bit 7 set */
 
-/// A structure holding information about a block of memory for use in circular transfers
-typedef struct circBlk
-{
-    volatile UINT dwOffset;             /* Offset within area of block start */
-    volatile UINT dwSize;               /* Size of the block, in bytes (0 = unused) */
+/*  A structure holding information about a block of memory for use in circular transfers */
+typedef struct circBlk {
+       volatile UINT dwOffset;             /* Offset within area of block start */
+       volatile UINT dwSize;               /* Size of the block, in bytes (0 = unused) */
 } CIRCBLK;
 
-/// A structure holding all of the information about a transfer area - an area of
-///  memory set up for use either as a source or destination in DMA transfers.
-typedef struct transarea
-{
-    void*       lpvBuff;                // User address of xfer area saved for completeness
-    UINT        dwBaseOffset;           // offset to start of xfer area in first page
-    UINT        dwLength;               // Length of xfer area, in bytes
-    struct page **pPages;               // Points at array of locked down pages
-    int         nPages;                 // number of pages that are locked down
-    bool        bUsed;                  // Is this structure in use?
-    bool        bCircular;              // Is this area for circular transfers?
-    bool        bCircToHost;            // Flag for direction of circular transfer
-    bool        bEventToHost;           // Set event on transfer to host?
-    int         iWakeUp;                // Set 1 on event, cleared by TestEvent()
-    UINT        dwEventSt;              // Defines section within xfer area for...
-    UINT        dwEventSz;              // ...notification by the event SZ is 0 if unset
-    CIRCBLK     aBlocks[2];             // Info on a pair of circular blocks
-    wait_queue_head_t wqEvent;          // The wait queue for events in this area MUST BE LAST
+/*  A structure holding all of the information about a transfer area - an area of */
+/*   memory set up for use either as a source or destination in DMA transfers. */
+typedef struct transarea {
+       void    *lpvBuff;                /*  User address of xfer area saved for completeness */
+       UINT        dwBaseOffset;           /*  offset to start of xfer area in first page */
+       UINT        dwLength;               /*  Length of xfer area, in bytes */
+       struct page **pPages;               /*  Points at array of locked down pages */
+       int         nPages;                 /*  number of pages that are locked down */
+       bool        bUsed;                  /*  Is this structure in use? */
+       bool        bCircular;              /*  Is this area for circular transfers? */
+       bool        bCircToHost;            /*  Flag for direction of circular transfer */
+       bool        bEventToHost;           /*  Set event on transfer to host? */
+       int         iWakeUp;                /*  Set 1 on event, cleared by TestEvent() */
+       UINT        dwEventSt;              /*  Defines section within xfer area for... */
+       UINT        dwEventSz;              /*  ...notification by the event SZ is 0 if unset */
+       CIRCBLK     aBlocks[2];             /*  Info on a pair of circular blocks */
+       wait_queue_head_t wqEvent;          /*  The wait queue for events in this area MUST BE LAST */
 } TRANSAREA;
 
-/// The DMADESC structure is used to hold information on the transfer in progress. It
-/// is set up by ReadDMAInfo, using information sent by the 1401 in an escape sequence.
-typedef struct dmadesc
-{
-    unsigned short wTransType;          /* transfer type as TM_xxx above        */
-    unsigned short wIdent;              /* identifier word                      */
-    unsigned int   dwSize;              /* bytes to transfer                    */
-    unsigned int   dwOffset;            /* offset into transfer area for trans  */
-    bool           bOutWard;            /* true when data is going TO 1401      */
+/*  The DMADESC structure is used to hold information on the transfer in progress. It */
+/*  is set up by ReadDMAInfo, using information sent by the 1401 in an escape sequence. */
+typedef struct dmadesc {
+       unsigned short wTransType;          /* transfer type as TM_xxx above        */
+       unsigned short wIdent;              /* identifier word                      */
+       unsigned int   dwSize;              /* bytes to transfer                    */
+       unsigned int   dwOffset;            /* offset into transfer area for trans  */
+       bool           bOutWard;            /* true when data is going TO 1401      */
 } DMADESC;
 
 #define INBUF_SZ         256            /* input buffer size */
 #define OUTBUF_SZ        256            /* output buffer size */
-#define STAGED_SZ 0x10000               // size of coherent buffer for staged transfers
-
-/// Structure to hold all of our device specific stuff. We are making this as similar as we
-/// can to the Windows driver to help in our understanding of what is going on.
-typedef struct _DEVICE_EXTENSION
-{
-    char inputBuffer[INBUF_SZ];         /* The two buffers */
-    char outputBuffer[OUTBUF_SZ];       /* accessed by the host functions */
-    volatile unsigned int dwNumInput;   /* num of chars in input buffer   */
-    volatile unsigned int dwInBuffGet;  /* where to get from input buffer */
-    volatile unsigned int dwInBuffPut;  /* where to put into input buffer */
-    volatile unsigned int dwNumOutput;  /* num of chars in output buffer  */
-    volatile unsigned int dwOutBuffGet; /* where to get from output buffer*/
-    volatile unsigned int dwOutBuffPut; /* where to put into output buffer*/
-
-    volatile bool bSendCharsPending;    /* Flag to indicate sendchar active */
-    volatile bool bReadCharsPending;    /* Flag to indicate a read is primed */
-    char* pCoherCharOut;                /* special aligned buffer for chars to 1401 */
-    struct urb* pUrbCharOut;            /* urb used for chars to 1401 */
-    char* pCoherCharIn;                 /* special aligned buffer for chars to host */
-    struct urb* pUrbCharIn;             /* urb used for chars to host */
-
-    spinlock_t charOutLock;             /* to protect the outputBuffer and outputting */
-    spinlock_t charInLock;              /* to protect the inputBuffer and char reads */
-    __u8 bInterval;                     /* Interrupt end point interval */
-
-    volatile unsigned int dwDMAFlag;    /* state of DMA */
-    TRANSAREA rTransDef[MAX_TRANSAREAS];/* transfer area info */
-    volatile DMADESC rDMAInfo;          // info on current DMA transfer
-    volatile bool bXFerWaiting;         // Flag set if DMA transfer stalled
-    volatile bool bInDrawDown;          // Flag that we want to halt transfers
-
-    // Parameters relating to a block read\write that is in progress. Some of these values
-    //  are equivalent to values in rDMAInfo. The values here are those in use, while those
-    //  in rDMAInfo are those received from the 1401 via an escape sequence. If another
-    //  escape sequence arrives before the previous xfer ends, rDMAInfo values are updated while these
-    //  are used to finish off the current transfer.
-    volatile short StagedId;            // The transfer area id for this transfer
-    volatile bool StagedRead;           // Flag TRUE for read from 1401, FALSE for write
-    volatile unsigned int StagedLength; // Total length of this transfer
-    volatile unsigned int StagedOffset; // Offset within memory area for transfer start
-    volatile unsigned int StagedDone;   // Bytes transferred so far
-    volatile bool bStagedUrbPending;    // Flag to indicate active
-    char* pCoherStagedIO;               // buffer used for block transfers
-    struct urb* pStagedUrb;             // The URB to use
-    spinlock_t stagedLock;              // protects ReadWriteMem() and circular buffer stuff
-
-    short s1401Type;                    // type of 1401 attached
-    short sCurrentState;                // current error state
-    bool bIsUSB2;                       // type of the interface we connect to
-    bool bForceReset;                   // Flag to make sure we get a real reset
-    __u32 statBuf[2];                   // buffer for 1401 state info
-
-    unsigned long ulSelfTestTime;       // used to timeout self test
-
-    int nPipes;                         // Should be 3 or 4 depending on 1401 usb chip
-    int bPipeError[4];                  // set non-zero if an error on one of the pipe
-    __u8 epAddr[4];                     // addresses of the 3/4 end points
-
-    struct usb_device *udev;            // the usb device for this device
-    struct usb_interface *interface;    // the interface for this device, NULL if removed
-    struct usb_anchor submitted;        // in case we need to retract our submissions
-    struct mutex io_mutex;              // synchronize I/O with disconnect, one user-mode caller at a time
-
-    int    errors;                      // the last request tanked
-    int    open_count;                  // count the number of openers
-    spinlock_t err_lock;                // lock for errors
-    struct kref kref;
-}DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+#define STAGED_SZ 0x10000               /*  size of coherent buffer for staged transfers */
+
+/*  Structure to hold all of our device specific stuff. We are making this as similar as we */
+/*  can to the Windows driver to help in our understanding of what is going on. */
+typedef struct _DEVICE_EXTENSION {
+       char inputBuffer[INBUF_SZ];         /* The two buffers */
+       char outputBuffer[OUTBUF_SZ];       /* accessed by the host functions */
+       volatile unsigned int dwNumInput;   /* num of chars in input buffer   */
+       volatile unsigned int dwInBuffGet;  /* where to get from input buffer */
+       volatile unsigned int dwInBuffPut;  /* where to put into input buffer */
+       volatile unsigned int dwNumOutput;  /* num of chars in output buffer  */
+       volatile unsigned int dwOutBuffGet; /* where to get from output buffer*/
+       volatile unsigned int dwOutBuffPut; /* where to put into output buffer*/
+
+       volatile bool bSendCharsPending;    /* Flag to indicate sendchar active */
+       volatile bool bReadCharsPending;    /* Flag to indicate a read is primed */
+       char *pCoherCharOut;                /* special aligned buffer for chars to 1401 */
+       struct urb *pUrbCharOut;            /* urb used for chars to 1401 */
+       char *pCoherCharIn;                 /* special aligned buffer for chars to host */
+       struct urb *pUrbCharIn;             /* urb used for chars to host */
+
+       spinlock_t charOutLock;             /* to protect the outputBuffer and outputting */
+       spinlock_t charInLock;              /* to protect the inputBuffer and char reads */
+       __u8 bInterval;                     /* Interrupt end point interval */
+
+       volatile unsigned int dwDMAFlag;    /* state of DMA */
+       TRANSAREA rTransDef[MAX_TRANSAREAS];/* transfer area info */
+       volatile DMADESC rDMAInfo;          /*  info on current DMA transfer */
+       volatile bool bXFerWaiting;         /*  Flag set if DMA transfer stalled */
+       volatile bool bInDrawDown;          /*  Flag that we want to halt transfers */
+
+       /*  Parameters relating to a block read/write that is in progress. Some of these values */
+       /*   are equivalent to values in rDMAInfo. The values here are those in use, while those */
+       /*   in rDMAInfo are those received from the 1401 via an escape sequence. If another */
+       /*   escape sequence arrives before the previous xfer ends, rDMAInfo values are updated while these */
+       /*   are used to finish off the current transfer. */
+       volatile short StagedId;            /*  The transfer area id for this transfer */
+       volatile bool StagedRead;           /*  Flag TRUE for read from 1401, FALSE for write */
+       volatile unsigned int StagedLength; /*  Total length of this transfer */
+       volatile unsigned int StagedOffset; /*  Offset within memory area for transfer start */
+       volatile unsigned int StagedDone;   /*  Bytes transferred so far */
+       volatile bool bStagedUrbPending;    /*  Flag to indicate active */
+       char *pCoherStagedIO;               /*  buffer used for block transfers */
+       struct urb *pStagedUrb;             /*  The URB to use */
+       spinlock_t stagedLock;              /*  protects ReadWriteMem() and circular buffer stuff */
+
+       short s1401Type;                    /*  type of 1401 attached */
+       short sCurrentState;                /*  current error state */
+       bool bIsUSB2;                       /*  type of the interface we connect to */
+       bool bForceReset;                   /*  Flag to make sure we get a real reset */
+       __u32 statBuf[2];                   /*  buffer for 1401 state info */
+
+       unsigned long ulSelfTestTime;       /*  used to timeout self test */
+
+       int nPipes;                         /*  Should be 3 or 4 depending on 1401 usb chip */
+       int bPipeError[4];                  /*  set non-zero if an error on one of the pipe */
+       __u8 epAddr[4];                     /*  addresses of the 3/4 end points */
+
+       struct usb_device *udev;            /*  the usb device for this device */
+       struct usb_interface *interface;    /*  the interface for this device, NULL if removed */
+       struct usb_anchor submitted;        /*  in case we need to retract our submissions */
+       struct mutex io_mutex;              /*  synchronize I/O with disconnect, one user-mode caller at a time */
+
+       int    errors;                      /*  the last request tanked */
+       int    open_count;                  /*  count the number of openers */
+       spinlock_t err_lock;                /*  lock for errors */
+       struct kref kref;
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
 #define to_DEVICE_EXTENSION(d) container_of(d, DEVICE_EXTENSION, kref)
 
-/// Definitions of routimes used between compilation object files
-// in usb1401.c
-extern int Allowi(DEVICE_EXTENSION* pdx);
-extern int SendChars(DEVICE_EXTENSION* pdx);
+/*  Definitions of routines used between compilation object files */
+/*  in usb1401.c */
+extern int Allowi(DEVICE_EXTENSION *pdx);
+extern int SendChars(DEVICE_EXTENSION *pdx);
 extern void ced_draw_down(DEVICE_EXTENSION *pdx);
 extern int ReadWriteMem(DEVICE_EXTENSION *pdx, bool Read, unsigned short wIdent,
-                      unsigned int dwOffs, unsigned int dwLen);
+                               unsigned int dwOffs, unsigned int dwLen);
 
-// in ced_ioc.c
+/*  in ced_ioc.c */
 extern int ClearArea(DEVICE_EXTENSION *pdx, int nArea);
-extern int SendString(DEVICE_EXTENSION* pdx, const char __user* pData, unsigned int n);
+extern int SendString(DEVICE_EXTENSION *pdx, const char __user *pData, unsigned int n);
 extern int SendChar(DEVICE_EXTENSION *pdx, char c);
-extern int Get1401State(DEVICE_EXTENSION* pdx, __u32* state, __u32* error);
+extern int Get1401State(DEVICE_EXTENSION *pdx, __u32 *state, __u32 *error);
 extern int ReadWrite_Cancel(DEVICE_EXTENSION *pdx);
-extern bool Is1401(DEVICE_EXTENSION* pdx);
-extern bool QuickCheck(DEVICE_EXTENSION* pdx, bool bTestBuff, bool bCanReset);
+extern bool Is1401(DEVICE_EXTENSION *pdx);
+extern bool QuickCheck(DEVICE_EXTENSION *pdx, bool bTestBuff, bool bCanReset);
 extern int Reset1401(DEVICE_EXTENSION *pdx);
 extern int GetChar(DEVICE_EXTENSION *pdx);
-extern int GetString(DEVICE_EXTENSION *pdx, char __user* pUser, int n);
+extern int GetString(DEVICE_EXTENSION *pdx, char __user *pUser, int n);
 extern int SetTransfer(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD);
 extern int UnsetTransfer(DEVICE_EXTENSION *pdx, int nArea);
-extern int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user*pTE);
+extern int SetEvent(DEVICE_EXTENSION *pdx, TRANSFEREVENT __user *pTE);
 extern int Stat1401(DEVICE_EXTENSION *pdx);
 extern int LineCount(DEVICE_EXTENSION *pdx);
 extern int GetOutBufSpace(DEVICE_EXTENSION *pdx);
@@ -235,15 +232,15 @@ extern int StartSelfTest(DEVICE_EXTENSION *pdx);
 extern int CheckSelfTest(DEVICE_EXTENSION *pdx, TGET_SELFTEST __user *pGST);
 extern int TypeOf1401(DEVICE_EXTENSION *pdx);
 extern int TransferFlags(DEVICE_EXTENSION *pdx);
-extern int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user* pDB);
+extern int DbgPeek(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgPoke(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgRampData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgRampAddr(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgGetData(DEVICE_EXTENSION *pdx, TDBGBLOCK __user *pDB);
 extern int DbgStopLoop(DEVICE_EXTENSION *pdx);
 extern int SetCircular(DEVICE_EXTENSION *pdx, TRANSFERDESC __user *pTD);
-extern int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user* pCB);
-extern int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user* pCB);
+extern int GetCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB);
+extern int FreeCircBlock(DEVICE_EXTENSION *pdx, TCIRCBLOCK __user *pCB);
 extern int WaitEvent(DEVICE_EXTENSION *pdx, int nArea, int msTimeOut);
 extern int TestEvent(DEVICE_EXTENSION *pdx, int nArea);
 #endif
index 86294e21db0ca7d0fed26013f7d15fe4831e4d0a..b7997c9835c2b301a497efd7d9d00384b341afda 100644 (file)
 #define __USE1401_H__
 #include "machine.h"
 
-// Some definitions to make things compatible. If you want to use Use1401 directly
-//  from a Windows program you should define U14_NOT_DLL, in which case you also
-//  MUST make sure that your application startup code calls U14InitLib().
-// DLL_USE1401 is defined when you are building the Use1401 dll, not otherwise.
+/*  Some definitions to make things compatible. If you want to use Use1401 directly */
+/*   from a Windows program you should define U14_NOT_DLL, in which case you also */
+/*   MUST make sure that your application startup code calls U14InitLib(). */
+/*  DLL_USE1401 is defined when you are building the Use1401 dll, not otherwise. */
 #ifdef _IS_WINDOWS_
 #ifndef U14_NOT_DLL
 #ifdef DLL_USE1401
-#define U14API(retType) retType DllExport __stdcall
+#define U14API(retType) (retType DllExport __stdcall)
 #else
-#define U14API(retType) retType DllImport __stdcall
+#define U14API(retType) (retType DllImport __stdcall)
 #endif
 #endif
 
@@ -36,7 +36,7 @@
 #ifdef _QT
 #ifndef U14_NOT_DLL
 #undef U14API
-#define U14API(retType) retType __declspec(dllimport) __stdcall
+#define U14API(retType) (retType __declspec(dllimport) __stdcall)
 #endif
 #undef U14LONG
 #define U14LONG int
 #define U14LONG long
 #endif
 
-/// Error codes: We need them here as user space can see them.
-#define U14ERR_NOERROR        0             // no problems
+/* Error codes: We need them here as user space can see them. */
+#define U14ERR_NOERROR        0             /*  no problems */
 
-/// Device error codes, but these don't need to be extended - a succession is assumed
-#define U14ERR_STD            4              // standard 1401 connected
-#define U14ERR_U1401          5              // u1401 connected
-#define U14ERR_PLUS           6              // 1401 plus connected
-#define U14ERR_POWER          7              // Power1401 connected
-#define U14ERR_U14012         8              // u1401 mkII connected
+/* Device error codes, but these don't need to be extended - a succession is assumed */
+#define U14ERR_STD            4              /*  standard 1401 connected */
+#define U14ERR_U1401          5              /*  u1401 connected */
+#define U14ERR_PLUS           6              /*  1401 plus connected */
+#define U14ERR_POWER          7              /*  Power1401 connected */
+#define U14ERR_U14012         8              /*  u1401 mkII connected */
 #define U14ERR_POWER2         9
 #define U14ERR_U14013        10
 #define U14ERR_POWER3        11
 
-/// NBNB Error numbers need shifting as some linux error codes start at 512
+/* NBNB Error numbers need shifting as some linux error codes start at 512 */
 #define U14ERR(n)             (n+U14ERRBASE)
 #define U14ERR_OFF            U14ERR(0)      /* 1401 there but switched off    */
 #define U14ERR_NC             U14ERR(-1)     /* 1401 not connected             */
 #define U14ERR_DRIVCOMMS      U14ERR(-110)   /* failed talking to driver       */
 #define U14ERR_OUTOFMEMORY    U14ERR(-111)   /* needed memory and couldnt get it*/
 
-/// 1401 type codes.
+/* 1401 type codes. */
 #define U14TYPE1401           0           /* standard 1401                  */
 #define U14TYPEPLUS           1           /* 1401 plus                      */
 #define U14TYPEU1401          2           /* u1401                          */
 #define U14TYPEPOWER3         7           /* power1401-3                    */
 #define U14TYPEUNKNOWN        -1          /* dont know                      */
 
-/// Transfer flags to allow driver capabilities to be interrogated
+/* Transfer flags to allow driver capabilities to be interrogated */
 
-/// Constants for transfer flags
+/* Constants for transfer flags */
 #define U14TF_USEDMA          1           /* Transfer flag for use DMA      */
 #define U14TF_MULTIA          2           /* Transfer flag for multi areas  */
 #define U14TF_FIFO            4           /* for FIFO interface card        */
 #define U14TF_DIAG            256         /* Diagnostics/debug functions    */
 #define U14TF_CIRC14          512         /* Circular-mode to 1401          */
 
-/// Definitions of element sizes for DMA transfers - to allow byte-swapping
+/* Definitions of element sizes for DMA transfers - to allow byte-swapping */
 #define ESZBYTES              0           /* BYTE element size value        */
-#define ESZWORDS              1           /* WORD element size value        */
+#define ESZWORDS              1           /* unsigned short element size value        */
 #define ESZLONGS              2           /* long element size value        */
 #define ESZUNKNOWN            0           /* unknown element size value     */
 
-/// These define required access types for the debug/diagnostics function
+/* These define required access types for the debug/diagnostics function */
 #define BYTE_SIZE             1           /* 8-bit access                   */
 #define WORD_SIZE             2           /* 16-bit access                  */
 #define LONG_SIZE             3           /* 32-bit access                  */
 
-/// Stuff used by U14_GetTransfer
+/* Stuff used by U14_GetTransfer */
 #define GET_TX_MAXENTRIES  257          /* (max length / page size + 1) */
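The 257 above is consistent with the parenthesised formula if one assumes a 1 MiB maximum transfer and 4 KiB pages (an assumption; the header itself does not state the maximum):

/* 1048576 / 4096 + 1  ==  256 + 1  ==  257 */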
 
 #ifdef _IS_WINDOWS_
 
 typedef struct                          /* used for U14_GetTransfer results */
 {                                          /* Info on a single mapped block */
-   U14LONG physical;
-   U14LONG size;
+       U14LONG physical;
+       U14LONG size;
 } TXENTRY;
 
 typedef struct TGetTxBlock              /* used for U14_GetTransfer results */
 {                                               /* matches structure in VXD */
-   U14LONG size;
-   U14LONG linear;
-   short   seg;
-   short   reserved;
-   short   avail;                      /* number of available entries */
-   short   used;                       /* number of used entries */
-   TXENTRY entries[GET_TX_MAXENTRIES];       /* Array of mapped block info */
+       U14LONG size;
+       U14LONG linear;
+       short   seg;
+       short   reserved;
+       short   avail;                      /* number of available entries */
+       short   used;                       /* number of used entries */
+       TXENTRY entries[GET_TX_MAXENTRIES];       /* Array of mapped block info */
 } TGET_TX_BLOCK;
 
 typedef TGET_TX_BLOCK *LPGET_TX_BLOCK;
@@ -180,19 +180,19 @@ typedef TGET_TX_BLOCK *LPGET_TX_BLOCK;
 #ifdef LINUX
 typedef struct                          /* used for U14_GetTransfer results */
 {                                       /* Info on a single mapped block */
-   long long physical;
-   long     size;
+       long long physical;
+       long     size;
 } TXENTRY;
 
 typedef struct TGetTxBlock              /* used for U14_GetTransfer results */
 {                                       /* matches structure in VXD */
-   long long linear;                    /* linear address */
-   long     size;                       /* total size of the mapped area, holds id when called */
-   short    seg;                        /* segment of the address for Win16 */
-   short    reserved;
-   short    avail;                      /* number of available entries */
-   short    used;                       /* number of used entries */
-   TXENTRY  entries[GET_TX_MAXENTRIES]; /* Array of mapped block info */
+       long long linear;                    /* linear address */
+       long     size;                       /* total size of the mapped area, holds id when called */
+       short    seg;                        /* segment of the address for Win16 */
+       short    reserved;
+       short    avail;                      /* number of available entries */
+       short    used;                       /* number of used entries */
+       TXENTRY  entries[GET_TX_MAXENTRIES]; /* Array of mapped block info */
 } TGET_TX_BLOCK;
 #endif
 
@@ -200,84 +200,84 @@ typedef struct TGetTxBlock              /* used for U14_GetTransfer results */
 extern "C" {
 #endif
 
-U14API(int)   U14WhenToTimeOut(short hand);         // when to timeout in ms
-U14API(short) U14PassedTime(int iTime);             // non-zero if iTime passed
+U14API(int)   U14WhenToTimeOut(short hand);         /*  when to timeout in ms */
+U14API(short)  U14PassedTime(int iTime);             /*  non-zero if iTime passed */
 
-U14API(short) U14LastErrCode(short hand);
+U14API(short)  U14LastErrCode(short hand);
 
-U14API(short) U14Open1401(short n1401);
-U14API(short) U14Close1401(short hand);
-U14API(short) U14Reset1401(short hand);
-U14API(short) U14ForceReset(short hand);
-U14API(short) U14TypeOf1401(short hand);
-U14API(short) U14NameOf1401(short hand, char* pBuf, WORD wMax);
+U14API(short)  U14Open1401(short n1401);
+U14API(short)  U14Close1401(short hand);
+U14API(short)  U14Reset1401(short hand);
+U14API(short)  U14ForceReset(short hand);
+U14API(short)  U14TypeOf1401(short hand);
+U14API(short)  U14NameOf1401(short hand, char *pBuf, unsigned short wMax);
 
-U14API(short) U14Stat1401(short hand);
-U14API(short) U14CharCount(short hand);
-U14API(short) U14LineCount(short hand);
+U14API(short)  U14Stat1401(short hand);
+U14API(short)  U14CharCount(short hand);
+U14API(short)  U14LineCount(short hand);
 
-U14API(short) U14SendString(short hand, const char* pString);
-U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen);
-U14API(short) U14SendChar(short hand, char cChar);
-U14API(short) U14GetChar(short hand, char* pcChar);
+U14API(short)  U14SendString(short hand, const char *pString);
+U14API(short)  U14GetString(short hand, char *pBuffer, unsigned short wMaxLen);
+U14API(short)  U14SendChar(short hand, char cChar);
+U14API(short)  U14GetChar(short hand, char *pcChar);
 
-U14API(short) U14LdCmd(short hand, const char* command);
-U14API(DWORD) U14Ld(short hand, const char* vl, const char* str);
+U14API(short)  U14LdCmd(short hand, const char *command);
+U14API(unsigned int) U14Ld(short hand, const char *vl, const char *str);
 
-U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
-                                            DWORD dwLength, short eSz);
-U14API(short) U14UnSetTransfer(short hand, WORD wArea);
-U14API(short) U14SetTransferEvent(short hand, WORD wArea, BOOL bEvent,
-                                  BOOL bToHost, DWORD dwStart, DWORD dwLength);
-U14API(int)   U14TestTransferEvent(short hand, WORD wArea);
-U14API(int)   U14WaitTransferEvent(short hand, WORD wArea, int msTimeOut);
-U14API(short) U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock);
+U14API(short)  U14SetTransArea(short hand, unsigned short wArea, void *pvBuff,
+                                       unsigned int dwLength, short eSz);
+U14API(short)  U14UnSetTransfer(short hand, unsigned short wArea);
+U14API(short)  U14SetTransferEvent(short hand, unsigned short wArea, BOOL bEvent,
+                                       BOOL bToHost, unsigned int dwStart, unsigned int dwLength);
+U14API(int)   U14TestTransferEvent(short hand, unsigned short wArea);
+U14API(int)   U14WaitTransferEvent(short hand, unsigned short wArea, int msTimeOut);
+U14API(short)  U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock);
 
-U14API(short) U14ToHost(short hand, char* pAddrHost,DWORD dwSize,DWORD dw1401,
-                                                            short eSz);
-U14API(short) U14To1401(short hand, const char* pAddrHost,DWORD dwSize,DWORD dw1401,
-                                                            short eSz);
+U14API(short)  U14ToHost(short hand, char *pAddrHost, unsigned int dwSize, unsigned int dw1401,
+                                                               short eSz);
+U14API(short)  U14To1401(short hand, const char *pAddrHost, unsigned int dwSize, unsigned int dw1401,
+                                                               short eSz);
 
-U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost, void *pvBuff,
-                                         DWORD dwLength);
+U14API(short)  U14SetCircular(short hand, unsigned short wArea, BOOL bToHost, void *pvBuff,
+                                                       unsigned int dwLength);
 
-U14API(int)   U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs);
-U14API(int)   U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
-                                         DWORD *pdwOffs);
+U14API(int)   U14GetCircBlk(short hand, unsigned short wArea, unsigned int *pdwOffs);
+U14API(int)   U14FreeCircBlk(short hand, unsigned short wArea, unsigned int dwOffs, unsigned int dwSize,
+                                                       unsigned int *pdwOffs);
 
-U14API(short) U14StrToLongs(const char* pszBuff, U14LONG *palNums, short sMaxLongs);
-U14API(short) U14LongsFrom1401(short hand, U14LONG *palBuff, short sMaxLongs);
+U14API(short)  U14StrToLongs(const char *pszBuff, U14LONG *palNums, short sMaxLongs);
+U14API(short)  U14LongsFrom1401(short hand, U14LONG *palBuff, short sMaxLongs);
 
 U14API(void)  U14SetTimeout(short hand, int lTimeout);
 U14API(int)   U14GetTimeout(short hand);
-U14API(short) U14OutBufSpace(short hand);
+U14API(short)  U14OutBufSpace(short hand);
 U14API(int)   U14BaseAddr1401(short hand);
 U14API(int)   U14DriverVersion(short hand);
 U14API(int)   U14DriverType(short hand);
-U14API(short) U14DriverName(short hand, char* pBuf, WORD wMax);
-U14API(short) U14GetUserMemorySize(short hand, DWORD *pMemorySize);
-U14API(short) U14KillIO1401(short hand);
-
-U14API(short) U14BlkTransState(short hand);
-U14API(short) U14StateOf1401(short hand);
-
-U14API(short) U14Grab1401(short hand);
-U14API(short) U14Free1401(short hand);
-U14API(short) U14Peek1401(short hand, DWORD dwAddr, int nSize, int nRepeats);
-U14API(short) U14Poke1401(short hand, DWORD dwAddr, DWORD dwValue, int nSize, int nRepeats);
-U14API(short) U14Ramp1401(short hand, DWORD dwAddr, DWORD dwDef, DWORD dwEnable, int nSize, int nRepeats);
-U14API(short) U14RampAddr(short hand, DWORD dwDef, DWORD dwEnable, int nSize, int nRepeats);
-U14API(short) U14StopDebugLoop(short hand);
-U14API(short) U14GetDebugData(short hand, U14LONG *plValue);
-
-U14API(short) U14StartSelfTest(short hand);
-U14API(short) U14CheckSelfTest(short hand, U14LONG *pData);
-U14API(short) U14TransferFlags(short hand);
-U14API(void)  U14GetErrorString(short nErr, char* pStr, WORD wMax);
+U14API(short)  U14DriverName(short hand, char *pBuf, unsigned short wMax);
+U14API(short)  U14GetUserMemorySize(short hand, unsigned int *pMemorySize);
+U14API(short)  U14KillIO1401(short hand);
+
+U14API(short)  U14BlkTransState(short hand);
+U14API(short)  U14StateOf1401(short hand);
+
+U14API(short)  U14Grab1401(short hand);
+U14API(short)  U14Free1401(short hand);
+U14API(short)  U14Peek1401(short hand, unsigned int dwAddr, int nSize, int nRepeats);
+U14API(short)  U14Poke1401(short hand, unsigned int dwAddr, unsigned int dwValue, int nSize, int nRepeats);
+U14API(short)  U14Ramp1401(short hand, unsigned int dwAddr, unsigned int dwDef, unsigned int dwEnable, int nSize, int nRepeats);
+U14API(short)  U14RampAddr(short hand, unsigned int dwDef, unsigned int dwEnable, int nSize, int nRepeats);
+U14API(short)  U14StopDebugLoop(short hand);
+U14API(short)  U14GetDebugData(short hand, U14LONG *plValue);
+
+U14API(short)  U14StartSelfTest(short hand);
+U14API(short)  U14CheckSelfTest(short hand, U14LONG *pData);
+U14API(short)  U14TransferFlags(short hand);
+U14API(void)  U14GetErrorString(short nErr, char *pStr, unsigned short wMax);
 U14API(int)   U14MonitorRev(short hand);
 U14API(void)  U14CloseAll(void);
 
-U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb);
+U14API(short)  U14WorkingSet(unsigned int dwMinKb, unsigned int dwMaxKb);
 U14API(int)   U14InitLib(void);
 
 #ifdef __cplusplus
@@ -285,3 +285,4 @@ U14API(int)   U14InitLib(void);
 #endif
 
 #endif /* End of ifndef __USE1401_H__ */
+
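For orientation, a user-space program would drive the API declared in this header roughly as follows (a hypothetical sketch; only the open/type/close calls are taken from the declarations above):

#include <stdio.h>
#include "use1401.h"

int main(void)
{
        short hand = U14Open1401(0);    /* open 1401 number 0 */

        if (hand < 0) {
                printf("open failed: %d\n", hand);
                return 1;
        }
        printf("device type: %d\n", U14TypeOf1401(hand));
        U14Close1401(hand);
        return 0;
}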
index 15ca638883800a99827d90123b4f51e83dc34b12..97d7913840dc351e642214a7dc7262e709321262 100644 (file)
 ** The IOCTL function codes from 0x80 to 0xFF are for developer use.
 */
 #define  FILE_DEVICE_CED1401    0x8001
-#define  FNNUMBASE              0x800
-
-#define  U14_OPEN1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE,               \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_CLOSE1401           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+1,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SENDSTRING          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+2,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_RESET1401           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+3,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETCHAR             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+4,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SENDCHAR            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+5,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STAT1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+6,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_LINECOUNT           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+7,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETSTRING           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+8,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_REGCALLBACK         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+9,             \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETMONITORBUF       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+10,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SETTRANSFER         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+11,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_UNSETTRANSFER       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+12,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SETTRANSEVENT       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+13,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETOUTBUFSPACE      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+14,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETBASEADDRESS      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+15,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETDRIVERREVISION   CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+16,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETTRANSFER         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+17,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_KILLIO1401          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+18,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_BLKTRANSSTATE       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+19,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_BYTECOUNT           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+20,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_ZEROBLOCKCOUNT      CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+21,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STOPCIRCULAR        CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+22,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STATEOF1401         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+23,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_REGISTERS1401       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+24,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GRAB1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+25,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_FREE1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+26,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STEP1401            CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+27,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SET1401REGISTERS    CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+28,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STEPTILL1401        CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+29,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SETORIN             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+30,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_STARTSELFTEST       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+31,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_CHECKSELFTEST       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+32,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_TYPEOF1401          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+33,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_TRANSFERFLAGS       CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+34,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGPEEK             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+35,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGPOKE             CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+36,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGRAMPDATA         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+37,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGRAMPADDR         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+38,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGGETDATA          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+39,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_DBGSTOPLOOP         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+40,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_FULLRESET           CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+41,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_SETCIRCULAR         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+42,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_GETCIRCBLK          CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+43,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-#define  U14_FREECIRCBLK         CTL_CODE( FILE_DEVICE_CED1401,     \
-                                           FNNUMBASE+44,            \
-                                           METHOD_BUFFERED,         \
-                                           FILE_ANY_ACCESS)
-
-//--------------- Structures that are shared with the driver -------------
+#define  FNNUMBASE              0x800
+
+#define  U14_OPEN1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE,               \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_CLOSE1401           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+1,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SENDSTRING          CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+2,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_RESET1401           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+3,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETCHAR             CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+4,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SENDCHAR            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+5,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STAT1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+6,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_LINECOUNT           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+7,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETSTRING           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+8,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_REGCALLBACK         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+9,             \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETMONITORBUF       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+10,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SETTRANSFER         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+11,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_UNSETTRANSFER       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+12,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SETTRANSEVENT       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+13,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETOUTBUFSPACE      CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+14,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETBASEADDRESS      CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+15,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETDRIVERREVISION   CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+16,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETTRANSFER         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+17,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_KILLIO1401          CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+18,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_BLKTRANSSTATE       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+19,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_BYTECOUNT           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+20,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_ZEROBLOCKCOUNT      CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+21,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STOPCIRCULAR        CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+22,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STATEOF1401         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+23,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_REGISTERS1401       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+24,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GRAB1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+25,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_FREE1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+26,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STEP1401            CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+27,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SET1401REGISTERS    CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+28,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STEPTILL1401        CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+29,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SETORIN             CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+30,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_STARTSELFTEST       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+31,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_CHECKSELFTEST       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+32,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_TYPEOF1401          CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+33,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_TRANSFERFLAGS       CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+34,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGPEEK             CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+35,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGPOKE             CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+36,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGRAMPDATA         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+37,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGRAMPADDR         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+38,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGGETDATA          CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+39,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_DBGSTOPLOOP         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+40,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_FULLRESET           CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+41,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_SETCIRCULAR         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+42,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_GETCIRCBLK          CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+43,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+#define  U14_FREECIRCBLK         CTL_CODE(FILE_DEVICE_CED1401,     \
+                                               FNNUMBASE+44,            \
+                                               METHOD_BUFFERED,         \
+                                               FILE_ANY_ACCESS)
+
+/*--------------- Structures that are shared with the driver ------------- */
 #pragma pack(1)
 
 typedef struct                  /* used for get/set standard 1401 registers */
 {
-   short   sPC;
-   char    A;
-   char    X;
-   char    Y;
-   char    stat;
-   char    rubbish;
+       short   sPC;
+       char    A;
+       char    X;
+       char    Y;
+       char    stat;
+       char    rubbish;
 } T1401REGISTERS;
 
 typedef union     /* to communicate with 1401 driver status & control funcs */
 {
-   char           chrs[22];
-   short          ints[11];
-   long           longs[5];
-   T1401REGISTERS registers;
+       char           chrs[22];
+       short          ints[11];
+       long           longs[5];
+       T1401REGISTERS registers;
 } TCSBLOCK;
 
 typedef TCSBLOCK*  LPTCSBLOCK;
 
-typedef struct paramBlk
-{
-    short       sState;
-    TCSBLOCK    csBlock;
+typedef struct paramBlk {
+        short       sState;
+        TCSBLOCK    csBlock;
 } PARAMBLK;
 
 typedef PARAMBLK*   PPARAMBLK;
 
 typedef struct TransferDesc          /* Structure and type for SetTransArea */
 {
-   WORD        wArea;            /* number of transfer area to set up       */
-   void FAR *  lpvBuff;          /* address of transfer area                */
-   DWORD       dwLength;         /* length of area to set up                */
-   short       eSize;            /* size to move (for swapping on MAC)      */
+       unsigned short        wArea;            /* number of transfer area to set up       */
+       void FAR *lpvBuff;          /* address of transfer area                */
+       unsigned int       dwLength;         /* length of area to set up                */
+       short       eSize;            /* size to move (for swapping on MAC)      */
 } TRANSFERDESC;
 
-typedef TRANSFERDESC FAR *    LPTRANSFERDESC;
+typedef TRANSFERDESC FAR *LPTRANSFERDESC;
 
 /* This is the structure used to set up a transfer area */
 typedef struct VXTransferDesc    /* use1401.c and use1432x.x use only       */
 {
-   WORD        wArea;            /* number of transfer area to set up       */
-   WORD        wAddrSel;         /* 16 bit selector for area                */
-   DWORD       dwAddrOfs;        /* 32 bit offset for area start            */
-   DWORD       dwLength;         /* length of area to set up                */
+       unsigned short        wArea;            /* number of transfer area to set up       */
+       unsigned short        wAddrSel;         /* 16 bit selector for area                */
+       unsigned int       dwAddrOfs;        /* 32 bit offset for area start            */
+       unsigned int       dwLength;         /* length of area to set up                */
 } VXTRANSFERDESC;
 
 #pragma pack()
 
-#endif
\ No newline at end of file
+#endif
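
For reference: CTL_CODE is the standard Windows DDK macro (winioctl.h) that packs a device type, function number, buffering method and access mask into one 32-bit ioctl code. A sketch of its usual DDK form, for orientation only; FILE_DEVICE_CED1401 is defined elsewhere in this header set:

    #define CTL_CODE(DeviceType, Function, Method, Access)       \
        (((DeviceType) << 16) | ((Access) << 14) |               \
         ((Function) << 2) | (Method))

    /* e.g. U14_SENDSTRING packs function number FNNUMBASE+2 == 0x802:
     * (FILE_DEVICE_CED1401 << 16) | (FILE_ANY_ACCESS << 14)
     *                             | (0x802 << 2) | METHOD_BUFFERED  */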
index 38e7c1c82d43d7b1fc7de392c7fad47aef9e295e..c9bc2ebfef1a41478bc2686058bb039531d079d0 100644 (file)
@@ -36,7 +36,7 @@
 ** Under Windows 9x and NT, Use1401 uses DeviceIoControl to get access to
 ** the 1401 driver. This has parameters for the device handle, the function
 ** code, an input pointer and byte count, an output pointer and byte count
-** and a pointer to a DWORD to hold the output byte count. Note that input
+** and a pointer to an unsigned int to hold the output byte count. Note that input
 ** and output are from the point-of-view of the driver, so the output stuff
 ** is used to read values from the 1401, not send to the 1401. The use of
 ** these parameters varies with the function in use and the operating
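
A minimal sketch of the NT-style call pattern described above, modelled on U14Status1401 further down; QueryState is a hypothetical wrapper name, while PARAMBLK, U14ERR_DRIVCOMMS and aHand1401[] all come from this library:

    static short QueryState(short hand)
    {
        PARAMBLK rData;
        unsigned int dwBytes = 0;
        if (DeviceIoControl(aHand1401[hand], U14_STATEOF1401,
                            NULL, 0,                /* no input block */
                            &rData, sizeof(PARAMBLK),
                            &dwBytes, NULL)         /* output byte count */
            && (dwBytes >= sizeof(PARAMBLK)))
            return rData.sState;                    /* status from the driver */
        return U14ERR_DRIVCOMMS;                    /* driver comms failed */
    }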
@@ -250,7 +250,7 @@ static int iAttached = 0;                       // counts process attaches so ca
 static HANDLE aHand1401[MAX1401] = {0};         // handles for 1401s
 static HANDLE aXferEvent[MAX1401] = {0};        // transfer events for the 1401s
 static LPVOID apAreas[MAX1401][MAX_TRANSAREAS]; // Locked areas
-static DWORD  auAreas[MAX1401][MAX_TRANSAREAS]; // Size of locked areas
+static unsigned int  auAreas[MAX1401][MAX_TRANSAREAS]; // Size of locked areas
 static BOOL   bWindows9x = FALSE;               // if we are Windows 95 or better
 #ifdef _WIN64
 #define USE_NT_DIOC(ind) TRUE
@@ -276,8 +276,8 @@ static int aHand1401[MAX1401] = {0};    // handles for 1401s
 typedef struct CmdHead          // defines header block on command
 {                               // for PC commands
    char   acBasic[5];           // BASIC information - needed to align things
-   WORD   wBasicSz;             // size as seen by BASIC
-   WORD   wCmdSize;             // size of the following info
+   unsigned short   wBasicSz;             // size as seen by BASIC
+   unsigned short   wCmdSize;             // size of the following info
 } __packed CMDHEAD;
 #pragma pack()                  // back to normal
 
@@ -311,7 +311,7 @@ static short CheckHandle(short h)
 ****************************************************************************/
 static short U14Status1401(short sHand, LONG lCode, TCSBLOCK* pBlk)
 {
-    DWORD dwBytes = 0;
+    unsigned int dwBytes = 0;
 
     if ((sHand < 0) || (sHand >= MAX1401))  /* Check parameters */
         return U14ERR_BADHAND;
@@ -345,7 +345,7 @@ static short U14Status1401(short sHand, LONG lCode, TCSBLOCK* pBlk)
 ****************************************************************************/
 static short U14Control1401(short sHand, LONG lCode, TCSBLOCK* pBlk)
 {
-    DWORD dwBytes = 0;
+    unsigned int dwBytes = 0;
 
     if ((sHand < 0) || (sHand >= MAX1401))              /* Check parameters */
         return U14ERR_BADHAND;
@@ -455,7 +455,7 @@ static void TranslateString(char* pStr)
 ****************************************************************************/
 U14API(short) U14StrToLongs(const char* pszBuff, U14LONG *palNums, short sMaxLongs)
 {
-    WORD wChInd = 0;                // index into source
+    unsigned short wChInd = 0;                // index into source
     short sLgInd = 0;               // index into result longs
 
     while (pszBuff[wChInd] &&       // until we get to end of string...
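
Hedged usage note: a caller typically hands this a 1401 reply string and reads back the parsed fields; judging by the sLgInd index above, the return value is the number of longs filled in:

    U14LONG alVals[4];                      /* U14LONG comes from use1401.h */
    short n = U14StrToLongs("120,-3,7;", alVals, 4);
    /* expected: n == 3, alVals[0..2] == {120, -3, 7} */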
@@ -681,7 +681,7 @@ U14API(int) U14DriverType(short hand)
 ** U14DriverName
 ** Returns the driver type as a 3-character string (ISA, PCI, USB or HSS)
 ****************************************************************************/
-U14API(short) U14DriverName(short hand, char* pBuf, WORD wMax)
+U14API(short) U14DriverName(short hand, char* pBuf, unsigned short wMax)
 {
     char* pName;
     *pBuf = 0;                             // Start off with a blank string
@@ -779,7 +779,7 @@ U14API(short)  U14Free1401(short hand)
 ** is called. After the peek is done, use U14GetDebugData to retrieve
 ** the results of the peek.
 ****************************************************************************/
-U14API(short) U14Peek1401(short hand, DWORD dwAddr, int nSize, int nRepeats)
+U14API(short) U14Peek1401(short hand, unsigned int dwAddr, int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
     if (sErr == U14ERR_NOERROR)
@@ -813,7 +813,7 @@ U14API(short) U14Peek1401(short hand, DWORD dwAddr, int nSize, int nRepeats)
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop
 ** is called.
 ****************************************************************************/
-U14API(short) U14Poke1401(short hand, DWORD dwAddr, DWORD dwValue,
+U14API(short) U14Poke1401(short hand, unsigned int dwAddr, unsigned int dwValue,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -849,7 +849,7 @@ U14API(short) U14Poke1401(short hand, DWORD dwAddr, DWORD dwValue,
 ** DESCRIPTION  Cause the 1401 to loop, writing a ramp to a location.
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop.
 ****************************************************************************/
-U14API(short) U14Ramp1401(short hand, DWORD dwAddr, DWORD dwDef, DWORD dwEnable,
+U14API(short) U14Ramp1401(short hand, unsigned int dwAddr, unsigned int dwDef, unsigned int dwEnable,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -887,7 +887,7 @@ U14API(short) U14Ramp1401(short hand, DWORD dwAddr, DWORD dwDef, DWORD dwEnable,
 ** DESCRIPTION  Cause the 1401 to loop, reading from a ramping location.
 ** If lRepeats is zero, the loop will continue until U14StopDebugLoop
 ****************************************************************************/
-U14API(short) U14RampAddr(short hand, DWORD dwDef, DWORD dwEnable,
+U14API(short) U14RampAddr(short hand, unsigned int dwDef, unsigned int dwEnable,
                                       int nSize, int nRepeats)
 {
     short sErr = CheckHandle(hand);
@@ -1024,7 +1024,7 @@ U14API(short) U14CheckSelfTest(short hand, U14LONG *pData)
 /****************************************************************************
 ** U14GetUserMemorySize
 ****************************************************************************/
-U14API(short) U14GetUserMemorySize(short hand, DWORD *pMemorySize)
+U14API(short) U14GetUserMemorySize(short hand, unsigned int *pMemorySize)
 {
     // The original 1401 used a different command for getting the size
     short sErr = U14SendString(hand, (asType1401[hand] == U14TYPE1401) ? "MEMTOP;" : "MEMTOP,?;");
@@ -1061,7 +1061,7 @@ U14API(short) U14TypeOf1401(short hand)
 ** U14NameOf1401
 ** Returns the type of the 1401 as a string, blank if unknown
 ****************************************************************************/
-U14API(short) U14NameOf1401(short hand, char* pBuf, WORD wMax)
+U14API(short) U14NameOf1401(short hand, char* pBuf, unsigned short wMax)
 {
     short sErr = CheckHandle(hand);
     if (sErr == U14ERR_NOERROR)
@@ -1207,7 +1207,7 @@ static short U14TryToOpen(int n1401, long* plRetVal, short* psHandle)
 {
     short sErr = U14ERR_NOERROR;
     HANDLE hDevice = INVALID_HANDLE_VALUE;
-    DWORD dwErr = 0;
+    unsigned int dwErr = 0;
     int nFirst, nLast, nDev = 0;        /* Used for the search for a 1401 */
     BOOL bOldName = FALSE;               /* start by looking for a modern driver */
 
@@ -1262,7 +1262,7 @@ static short U14TryToOpen(int n1401, long* plRetVal, short* psHandle)
             }
             else
             {
-                DWORD dwe = GetLastError();     /* Get error code otherwise */
+                unsigned int dwe = GetLastError();     /* Get error code otherwise */
                 if ((dwe != ERROR_FILE_NOT_FOUND) || (dwErr == 0))
                     dwErr = dwe;                /* Ignore repeats of 'not found' */
             }
@@ -1454,7 +1454,7 @@ U14API(short) U14Close1401(short hand)
         U14Reset1401(hand);                     // in case an active transfer running
         for (j = 0; j < MAX_TRANSAREAS; ++j)    // Locate locked areas
             if (iAreaMask & (1 << j))           // And kill off any transfers
-                U14UnSetTransfer(hand, (WORD)j);
+                U14UnSetTransfer(hand, (unsigned short)j);
     }
 
 #ifdef _IS_WINDOWS_
@@ -1581,7 +1581,7 @@ U14API(short) U14SendString(short hand, const char* pString)
         if (bSpaceToSend)
         {
             PARAMBLK    rData;
-            DWORD       dwBytes;
+            unsigned int       dwBytes;
             char        tstr[MAXSTRLEN+5];          /* Buffer for chars */
 
             if ((hand < 0) || (hand >= MAX1401))
@@ -1592,18 +1592,18 @@ U14API(short) U14SendString(short hand, const char* pString)
 #ifndef _WIN64
                 if (!USE_NT_DIOC(hand))             /* Using WIN 95 driver access? */
                 {
-                    int iOK = DeviceIoControl(aHand1401[hand], (DWORD)U14_SENDSTRING,
+                    int iOK = DeviceIoControl(aHand1401[hand], (unsigned int)U14_SENDSTRING,
                                     NULL, 0, tstr, nChars,
                                     &dwBytes, NULL);
                     if (iOK)
-                        sErr = (dwBytes >= (DWORD)nChars) ? U14ERR_NOERROR : U14ERR_DRIVCOMMS;
+                        sErr = (dwBytes >= (unsigned int)nChars) ? U14ERR_NOERROR : U14ERR_DRIVCOMMS;
                     else
                         sErr = (short)GetLastError();
                 }
                 else
 #endif
                 {
-                    int iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_SENDSTRING,
+                    int iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_SENDSTRING,
                                     tstr, nChars,
                                     &rData,sizeof(PARAMBLK),&dwBytes,NULL);
                     if (iOK && (dwBytes >= sizeof(PARAMBLK)))
@@ -1697,7 +1697,7 @@ U14API(short) U14SendChar(short hand, char cChar)
 **          error code. Any error from the device causes us to set up for
 **          a full reset.
 ****************************************************************************/
-U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen)
+U14API(short) U14GetString(short hand, char* pBuffer, unsigned short wMaxLen)
 {
     short sErr = CheckHandle(hand);
     if (sErr != U14ERR_NOERROR)             // If an error...
@@ -1726,8 +1726,8 @@ U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen)
         {
             if (asLastRetCode[hand] == U14ERR_NOERROR)     /* all ok so far */
             {
-                DWORD       dwBytes = 0;
-                *((WORD *)pBuffer) = wMaxLen;       /* set up length */
+                unsigned int       dwBytes = 0;
+                *((unsigned short *)pBuffer) = wMaxLen;       /* set up length */
 #ifndef _WIN64
                 if (!USE_NT_DIOC(hand))             /* Win 95 DIOC here ? */
                 {
@@ -1737,9 +1737,9 @@ U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen)
                     if (wMaxLen > MAXSTRLEN)        /* Truncate length */
                         wMaxLen = MAXSTRLEN;    
 
-                    *((WORD *)tstr) = wMaxLen;      /* set len */
+                    *((unsigned short *)tstr) = wMaxLen;      /* set len */
 
-                    iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_GETSTRING,
+                    iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_GETSTRING,
                                     NULL, 0, tstr, wMaxLen+sizeof(short),
                                     &dwBytes, NULL);
                     if (iOK)                        /* Device IO control OK ? */
@@ -1768,7 +1768,7 @@ U14API(short) U14GetString(short hand, char* pBuffer, WORD wMaxLen)
                         char* pMem = (char*)GlobalLock(hMem);
                         if (pMem)
                         {
-                            int iOK = DeviceIoControl(aHand1401[hand],(DWORD)U14_GETSTRING,
+                            int iOK = DeviceIoControl(aHand1401[hand],(unsigned int)U14_GETSTRING,
                                             NULL, 0, pMem, wMaxLen+sizeof(short),
                                             &dwBytes, NULL);
                             if (iOK)                /* Device IO control OK ? */
@@ -1946,7 +1946,7 @@ U14API(short) U14LineCount(short hand)
 **       other functions after getting an error and before using
 **       this function.
 ****************************************************************************/
-U14API(void)  U14GetErrorString(short nErr, char* pStr, WORD wMax)
+U14API(void)  U14GetErrorString(short nErr, char* pStr, unsigned short wMax)
 {
     char    wstr[150];
 
@@ -2105,7 +2105,7 @@ U14API(void)  U14GetErrorString(short nErr, char* pStr, WORD wMax)
         break;
 
     }
-    if ((WORD)strlen(wstr) >= wMax-1)  /* Check for string being too long */
+    if ((unsigned short)strlen(wstr) >= wMax-1)  /* Check for string being too long */
         wstr[wMax-1] = 0;                          /* and truncate it if so */
     strcpy(pStr, wstr);                       /* Return the error string */
 }
@@ -2120,8 +2120,8 @@ U14API(short) U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock)
 #ifdef _IS_WINDOWS_
     if (sErr == U14ERR_NOERROR)
     { 
-        DWORD dwBytes = 0;
-        BOOL bOK = DeviceIoControl(aHand1401[hand], (DWORD)U14_GETTRANSFER, NULL, 0, pTransBlock,
+        unsigned int dwBytes = 0;
+        BOOL bOK = DeviceIoControl(aHand1401[hand], (unsigned int)U14_GETTRANSFER, NULL, 0, pTransBlock,
                               sizeof(TGET_TX_BLOCK), &dwBytes, NULL);
     
         if (bOK && (dwBytes >= sizeof(TGET_TX_BLOCK)))
@@ -2145,12 +2145,12 @@ U14API(short) U14GetTransfer(short hand, TGET_TX_BLOCK *pTransBlock)
 //     1 unable to access process (insufficient rights?)
 //     2 unable to read process working set
 //     3 unable to set process working set - bad parameters?
-U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb)
+U14API(short) U14WorkingSet(unsigned int dwMinKb, unsigned int dwMaxKb)
 {
 #ifdef _IS_WINDOWS_
     short sRetVal = 0;                      // 0 means all is OK
     HANDLE hProcess;
-    DWORD dwVer = GetVersion();
+    unsigned int dwVer = GetVersion();
        if (dwVer & 0x80000000)                 // is this not NT?
         return 0;                           // then give up right now
 
@@ -2164,8 +2164,8 @@ U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb)
         SIZE_T dwMinSize,dwMaxSize;
         if (GetProcessWorkingSetSize(hProcess, &dwMinSize, &dwMaxSize))
         {
-            DWORD dwMin = dwMinKb << 10;    // convert from kb to bytes
-            DWORD dwMax = dwMaxKb << 10;
+            unsigned int dwMin = dwMinKb << 10;    // convert from kb to bytes
+            unsigned int dwMax = dwMaxKb << 10;
 
             // if we get here, we have managed to read the current size
             if (dwMin > dwMinSize)          // need to change sizes?
@@ -2200,7 +2200,7 @@ U14API(short) U14WorkingSet(DWORD dwMinKb, DWORD dwMaxKb)
 ** U14UnSetTransfer  Cancels a transfer area
 ** wArea    The index of a block previously used by SetTransfer
 *****************************************************************************/
-U14API(short) U14UnSetTransfer(short hand, WORD wArea)
+U14API(short) U14UnSetTransfer(short hand, unsigned short wArea)
 {
     short sErr = CheckHandle(hand);
 #ifdef _IS_WINDOWS_
@@ -2223,13 +2223,13 @@ U14API(short) U14UnSetTransfer(short hand, WORD wArea)
 
 /****************************************************************************
 ** U14SetTransArea      Sets an area up to be used for transfers
-** WORD  wArea     The area number to set up
+** unsigned short  wArea     The area number to set up
 ** void *pvBuff    The address of the buffer for the data.
-** DWORD dwLength  The length of the buffer for the data
+** unsigned int dwLength  The length of the buffer for the data
 ** short eSz       The element size (used for byte swapping on the Mac)
 ****************************************************************************/
-U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
-                                          DWORD dwLength, short eSz)
+U14API(short) U14SetTransArea(short hand, unsigned short wArea, void *pvBuff,
+                                          unsigned int dwLength, short eSz)
 {
     TRANSFERDESC td;
     short sErr = CheckHandle(hand);
@@ -2254,7 +2254,7 @@ U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
 #ifndef _WIN64
     if (!USE_NT_DIOC(hand))                         /* Use Win 9x DIOC? */
     {
-        DWORD dwBytes;
+        unsigned int dwBytes;
         VXTRANSFERDESC vxDesc;                      /* Structure to pass to VXD */
         vxDesc.wArea = wArea;                       /* Copy across simple params */
         vxDesc.dwLength = dwLength;
@@ -2264,10 +2264,10 @@ U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
             sErr = U14ERR_DRIVTOOOLD;
         else
         {
-            vxDesc.dwAddrOfs = (DWORD)pvBuff;       /* 32 bit offset */
+            vxDesc.dwAddrOfs = (unsigned int)pvBuff;       /* 32 bit offset */
             vxDesc.wAddrSel  = 0;
 
-            if (DeviceIoControl(aHand1401[hand], (DWORD)U14_SETTRANSFER,
+            if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_SETTRANSFER,
                                 pvBuff,dwLength,    /* Will translate pointer */
                                 &vxDesc,sizeof(VXTRANSFERDESC),
                                 &dwBytes,NULL))
@@ -2285,13 +2285,13 @@ U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
 #endif
     {
         PARAMBLK rWork;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         td.wArea = wArea;     /* Pure NT - put data into struct */
         td.lpvBuff = pvBuff;
         td.dwLength = dwLength;
         td.eSize = 0;                // Dummy element size
 
-        if (DeviceIoControl(aHand1401[hand],(DWORD)U14_SETTRANSFER,
+        if (DeviceIoControl(aHand1401[hand],(unsigned int)U14_SETTRANSFER,
                             &td,sizeof(TRANSFERDESC),
                             &rWork,sizeof(PARAMBLK),&dwBytes,NULL))
         {
@@ -2344,8 +2344,8 @@ U14API(short) U14SetTransArea(short hand, WORD wArea, void *pvBuff,
 ** Returns 1 if an event handle exists, 0 if all OK and no event handle or
 ** a negative code for an error.
 ****************************************************************************/
-U14API(short) U14SetTransferEvent(short hand, WORD wArea, BOOL bEvent,
-                                  BOOL bToHost, DWORD dwStart, DWORD dwLength)
+U14API(short) U14SetTransferEvent(short hand, unsigned short wArea, BOOL bEvent,
+                                  BOOL bToHost, unsigned int dwStart, unsigned int dwLength)
 {
 #ifdef _IS_WINDOWS_
     TCSBLOCK csBlock;
@@ -2416,7 +2416,7 @@ U14API(short) U14SetTransferEvent(short hand, WORD wArea, BOOL bEvent,
 ** Would a U14WaitTransferEvent() call return immediately? return 1 if so,
 ** 0 if not or a negative code if a problem.
 ****************************************************************************/
-U14API(int) U14TestTransferEvent(short hand, WORD wArea)
+U14API(int) U14TestTransferEvent(short hand, unsigned short wArea)
 {
 #ifdef _IS_WINDOWS_
     int iErr = CheckHandle(hand);
@@ -2441,7 +2441,7 @@ U14API(int) U14TestTransferEvent(short hand, WORD wArea)
 ** Returns   If no event handle then return immediately. Else return 1 if
 **           timed out or 0=event, and a negative code if a problem.
 ****************************************************************************/
-U14API(int) U14WaitTransferEvent(short hand, WORD wArea, int msTimeOut)
+U14API(int) U14WaitTransferEvent(short hand, unsigned short wArea, int msTimeOut)
 {
 #ifdef _IS_WINDOWS_
     int iErr = CheckHandle(hand);
@@ -2466,13 +2466,13 @@ U14API(int) U14WaitTransferEvent(short hand, WORD wArea, int msTimeOut)
 
 /****************************************************************************
 ** U14SetCircular    Sets an area up for circular DMA transfers
-** WORD  wArea          The area number to set up
+** unsigned short  wArea          The area number to set up
 ** BOOL  bToHost        Sets the direction of data transfer
 ** void *pvBuff        The address of the buffer for the data
-** DWORD dwLength       The length of the buffer for the data
+** unsigned int dwLength       The length of the buffer for the data
 ****************************************************************************/
-U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost,
-                                                                       void *pvBuff, DWORD dwLength)
+U14API(short) U14SetCircular(short hand, unsigned short wArea, BOOL bToHost,
+                                                                       void *pvBuff, unsigned int dwLength)
 {
     short sErr = CheckHandle(hand);
     if (sErr != U14ERR_NOERROR)
@@ -2495,14 +2495,14 @@ U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost,
     else
     {
         PARAMBLK rWork;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         TRANSFERDESC txDesc;
         txDesc.wArea = wArea;             /* Pure NT - put data into struct */
         txDesc.lpvBuff = pvBuff;
         txDesc.dwLength = dwLength;
         txDesc.eSize = (short)bToHost;       /* Use this for direction flag */
    
-        if (DeviceIoControl(aHand1401[hand],(DWORD)U14_SETCIRCULAR,
+        if (DeviceIoControl(aHand1401[hand],(unsigned int)U14_SETCIRCULAR,
                            &txDesc, sizeof(TRANSFERDESC),
                            &rWork, sizeof(PARAMBLK),&dwBytes,NULL))
         {
@@ -2542,7 +2542,7 @@ U14API(short) U14SetCircular(short hand, WORD wArea, BOOL bToHost,
 ** Function  GetCircBlk returns the size (& start offset) of the next
 **           available block of circular data.
 ****************************************************************************/
-U14API(int) U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs)
+U14API(int) U14GetCircBlk(short hand, unsigned short wArea, unsigned int *pdwOffs)
 {
     int lErr = CheckHandle(hand);
     if (lErr != U14ERR_NOERROR)
@@ -2555,10 +2555,10 @@ U14API(int) U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs)
 #ifdef _IS_WINDOWS_
         PARAMBLK rWork;
         TCSBLOCK csBlock;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         csBlock.longs[0] = wArea;               // Area number into control block
         rWork.sState = U14ERR_DRIVCOMMS;
-        if (DeviceIoControl(aHand1401[hand], (DWORD)U14_GETCIRCBLK, &csBlock, sizeof(TCSBLOCK), &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
+        if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_GETCIRCBLK, &csBlock, sizeof(TCSBLOCK), &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
            (dwBytes >= sizeof(PARAMBLK)))
             lErr = rWork.sState;
         else
@@ -2591,8 +2591,8 @@ U14API(int) U14GetCircBlk(short hand, WORD wArea, DWORD *pdwOffs)
 **           reuse for circular transfers and returns the size (& start
 **           offset) of the next available block of circular data.
 ****************************************************************************/
-U14API(int) U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
-                                        DWORD *pdwOffs)
+U14API(int) U14FreeCircBlk(short hand, unsigned short wArea, unsigned int dwOffs, unsigned int dwSize,
+                                        unsigned int *pdwOffs)
 {
     int lErr = CheckHandle(hand);
     if (lErr != U14ERR_NOERROR)
@@ -2603,12 +2603,12 @@ U14API(int) U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
 #ifdef _IS_WINDOWS_
         PARAMBLK rWork;
         TCSBLOCK csBlock;
-        DWORD dwBytes;
+        unsigned int dwBytes;
         csBlock.longs[0] = wArea;               // Area number into control block
         csBlock.longs[1] = dwOffs;
         csBlock.longs[2] = dwSize;
         rWork.sState = U14ERR_DRIVCOMMS;
-        if (DeviceIoControl(aHand1401[hand], (DWORD)U14_FREECIRCBLK, &csBlock, sizeof(TCSBLOCK),
+        if (DeviceIoControl(aHand1401[hand], (unsigned int)U14_FREECIRCBLK, &csBlock, sizeof(TCSBLOCK),
                            &rWork, sizeof(PARAMBLK), &dwBytes, NULL) &&
            (dwBytes >= sizeof(PARAMBLK)))
            lErr = rWork.sState;
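
Taken together, U14GetCircBlk and U14FreeCircBlk form a consume loop. A sketch of the intended usage, where pBuff is the buffer previously handed to U14SetCircular and ProcessSamples is a hypothetical application handler:

    unsigned int dwOffs = 0;
    int nSize = U14GetCircBlk(hand, wArea, &dwOffs); /* <0 is an error code */
    while (nSize > 0)
    {
        ProcessSamples(pBuff + dwOffs, nSize);       /* application's work */
        nSize = U14FreeCircBlk(hand, wArea, dwOffs, nSize, &dwOffs);
    }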
@@ -2647,7 +2647,7 @@ U14API(int) U14FreeCircBlk(short hand, WORD wArea, DWORD dwOffs, DWORD dwSize,
 ** which it should be to get a pointer
 *****************************************************************************/
 static short Transfer(short hand, BOOL bTo1401, char* pData,
-                       DWORD dwSize, DWORD dw1401, short eSz)
+                       unsigned int dwSize, unsigned int dw1401, short eSz)
 {
     char strcopy[MAXSTRLEN+1];          // to hold copy of work string
     short sResult = U14SetTransArea(hand, 0, (void *)pData, dwSize, eSz);
@@ -2670,8 +2670,8 @@ static short Transfer(short hand, BOOL bTo1401, char* pData,
 /****************************************************************************
 ** Function  ToHost transfers data into the host from the 1401
 ****************************************************************************/
-U14API(short) U14ToHost(short hand, char* pAddrHost, DWORD dwSize,
-                                            DWORD dw1401, short eSz)
+U14API(short) U14ToHost(short hand, char* pAddrHost, unsigned int dwSize,
+                                            unsigned int dw1401, short eSz)
 {
     short sErr = CheckHandle(hand);
     if ((sErr == U14ERR_NOERROR) && dwSize) // TOHOST is a constant
@@ -2682,8 +2682,8 @@ U14API(short) U14ToHost(short hand, char* pAddrHost, DWORD dwSize,
 /****************************************************************************
 ** Function  To1401 transfers data into the 1401 from the host
 ****************************************************************************/
-U14API(short) U14To1401(short hand, const char* pAddrHost,DWORD dwSize,
-                                    DWORD dw1401, short eSz)
+U14API(short) U14To1401(short hand, const char* pAddrHost,unsigned int dwSize,
+                                    unsigned int dw1401, short eSz)
 {
     short sErr = CheckHandle(hand);
     if ((sErr == U14ERR_NOERROR) && dwSize) // TO1401 is a constant
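
A hedged example of the pair in use: pull a block of bytes from 1401 memory into a host buffer, reporting failures as text. ESZBYTES is the element-size constant this file already uses in U14LdCmd; the 1401 address 0 is illustrative only:

    char host[1024];
    char msg[150];
    short sErr = U14ToHost(hand, host, sizeof(host), 0, ESZBYTES);
    if (sErr != U14ERR_NOERROR)
        U14GetErrorString(sErr, msg, sizeof(msg));  /* readable error text */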
@@ -2707,7 +2707,7 @@ U14API(short) U14To1401(short hand, const char* pAddrHost,DWORD dwSize,
 #define file_close(h)   close(h)
 #define file_seek(h, pos) lseek(h, pos, SEEK_SET) 
 #define file_read(h, buffer, size) (read(h, buffer, size) == (ssize_t)size)
-static DWORD GetModuleFileName(void* dummy, char* buffer, int max)
+static unsigned int GetModuleFileName(void* dummy, char* buffer, int max)
 {
     // The following works for Linux systems with a /proc file system.
     char szProcPath[32];
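
One common way to get this on Linux, sketched here under the assumption of a mounted /proc (not necessarily the body that follows): resolve the /proc/self/exe symlink with readlink() from <unistd.h>:

    ssize_t n = readlink("/proc/self/exe", buffer, max - 1);
    if (n <= 0)
        return 0;                   /* no /proc, or link not readable */
    buffer[n] = '\0';               /* readlink() does not NUL-terminate */
    return (unsigned int)n;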
@@ -2766,7 +2766,7 @@ U14API(short) U14LdCmd(short hand, const char* command)
     // application was run from.
     if (!bGotIt)                            // Still not got it?
     {
-        DWORD dwLen = GetModuleFileName(NULL, filnam, FNSZ); // Get app path
+        unsigned int dwLen = GetModuleFileName(NULL, filnam, FNSZ); // Get app path
         if (dwLen > 0)                      // and use it as path if found
         {
             char* pStr = strrchr(filnam, PATHSEP);    // Point to last separator
@@ -2821,7 +2821,7 @@ U14API(short) U14LdCmd(short hand, const char* command)
                 file_seek(iFHandle, sizeof(CMDHEAD));
                 if (file_read(iFHandle, pMem, (UINT)nComSize))
                 {
-                    sErr = U14SetTransArea(hand, 0, (void *)pMem, (DWORD)nComSize, ESZBYTES);
+                    sErr = U14SetTransArea(hand, 0, (void *)pMem, (unsigned int)nComSize, ESZBYTES);
                     if (sErr == U14ERR_NOERROR)
                     {
                         sprintf(strcopy, "CLOAD,0,$%X;", (int)nComSize);
@@ -2858,9 +2858,9 @@ U14API(short) U14LdCmd(short hand, const char* command)
 ** Returns NOERROR code or a long with error in lo word and index of
 ** command that failed in high word
 ****************************************************************************/
-U14API(DWORD) U14Ld(short hand, const char* vl, const char* str)
+U14API(unsigned int) U14Ld(short hand, const char* vl, const char* str)
 {
-    DWORD dwIndex = 0;              // index to current command
+    unsigned int dwIndex = 0;              // index to current command
     long lErr = U14ERR_NOERROR;     // what the error was that went wrong
     char strcopy[MAXSTRLEN+1];      // stores unmodified str parameter
     char szFExt[8];                 // The command file extension
@@ -2939,7 +2939,7 @@ U14API(DWORD) U14Ld(short hand, const char* vl, const char* str)
         return lErr;
     }
     else
-        return ((dwIndex<<16) | ((DWORD)lErr & 0x0000FFFF));
+        return ((dwIndex<<16) | ((unsigned int)lErr & 0x0000FFFF));
 }
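
On the caller's side the packed value decodes as below (a sketch; vl and str as passed to U14Ld above):

    unsigned int dwRet = U14Ld(hand, vl, str);
    short sErr = (short)(dwRet & 0x0000FFFF);       /* error, sign restored */
    unsigned int iFailed = dwRet >> 16;             /* index of failed command */
    if (sErr != U14ERR_NOERROR)
        printf("command %u failed with %d\n", iFailed, sErr);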
 
 // Initialise the library (if not initialised) and return the library version
@@ -2951,7 +2951,7 @@ U14API(int) U14InitLib(void)
         int i;
 #ifdef _IS_WINDOWS_
         int j;
-        DWORD   dwVersion = GetVersion();
+        unsigned int   dwVersion = GetVersion();
         bWindows9x = FALSE;                  // Assume not Win9x
 
         if (dwVersion & 0x80000000)                 // if not windows NT
@@ -2993,12 +2993,12 @@ U14API(int) U14InitLib(void)
 #ifdef _IS_WINDOWS_
 #ifndef U14_NOT_DLL
 /****************************************************************************
-** FUNCTION: DllMain(HANDLE, DWORD, LPVOID)
+** FUNCTION: DllMain(HANDLE, unsigned int, LPVOID)
 ** LibMain is called by Windows when the DLL is initialized, Thread Attached,
 ** and other times. Refer to SDK documentation, as to the different ways this
 ** may be called.
 ****************************************************************************/
-INT APIENTRY DllMain(HANDLE hInst, DWORD ul_reason_being_called, LPVOID lpReserved)
+INT APIENTRY DllMain(HANDLE hInst, unsigned int ul_reason_being_called, LPVOID lpReserved)
 {
     int iRetVal = 1;
 
index 87e852a0ef49140a24f72ba37cb051ad793c89d4..8c8a551322572630543ec85ddae126c6de08e732 100644 (file)
@@ -110,15 +110,6 @@ menuconfig COMEDI_ISA_DRIVERS
 
 if COMEDI_ISA_DRIVERS
 
-config COMEDI_ACL7225B
-       tristate "ADlink NuDAQ ACL-7225b and compatibles support"
-       ---help---
-         Enable support for ADlink NuDAQ ACL-7225b and compatibles,
-         ADlink ACL-7225b (acl7225b), ICP P16R16DIO (p16r16dio)
-
-         To compile this driver as a module, choose M here: the module will be
-         called acl7225b.
-
 config COMEDI_PCL711
        tristate "Advantech PCL-711/711b and ADlink ACL-8112 ISA card support"
        ---help---
@@ -137,14 +128,6 @@ config COMEDI_PCL724
          To compile this driver as a module, choose M here: the module will be
          called pcl724.
 
-config COMEDI_PCL725
-       tristate "Advantech PCL-725 and compatible ISA card support"
-       ---help---
-         Enable support for Advantech PCL-725 and compatible ISA cards.
-
-         To compile this driver as a module, choose M here: the module will be
-         called pcl725.
-
 config COMEDI_PCL726
        tristate "Advantech PCL-726 and compatible ISA card support"
        ---help---
@@ -154,10 +137,21 @@ config COMEDI_PCL726
          called pcl726.
 
 config COMEDI_PCL730
-       tristate "Advantech PCL-730 and ADlink ACL-7130 ISA card support"
+       tristate "Simple Digital I/O board support (8-bit ports)"
        ---help---
-         Enable support for Advantech PCL-730, ICP ISO-730 and ADlink
-         ACL-7130 ISA cards
+         Enable support for various simple ISA or PC/104 Digital I/O boards.
+         These boards all use 8-bit I/O ports.
+
+         Advantech PCL-730   isolated - 16 in/16 out  ttl - 16 in/16 out
+         ICP ISO-730         isolated - 16 in/16 out  ttl - 16 in/16 out
+         ADlink ACL-7130     isolated - 16 in/16 out  ttl - 16 in/16 out
+         Advantech PCM-3730  isolated - 8 in/8 out    ttl - 16 in/16 out
+         Advantech PCL-725   isolated - 8 in/8 out
+         ICP P8R8-DIO        isolated - 8 in/8 out
+         ADlink ACL-7225b    isolated - 16 in/16 out
+         ICP P16R16-DIO      isolated - 16 in/16 out
+         Advantech PCL-733   isolated - 32 in
+         Advantech PCL-734   isolated - 32 out
 
          To compile this driver as a module, choose M here: the module will be
          called pcl730.
@@ -201,14 +195,6 @@ config COMEDI_PCM3724
          To compile this driver as a module, choose M here: the module will be
          called pcm3724.
 
-config COMEDI_PCM3730
-       tristate "Advantech PCM-3730 and clone PC/104 board support"
-       ---help---
-         Enable support for Advantech PCM-3730 and clone PC/104 boards
-
-         To compile this driver as a module, choose M here: the module will be
-         called pcm3730.
-
 config COMEDI_AMPLC_DIO200_ISA
        tristate "Amplicon PC212E/PC214E/PC215E/PC218E/PC272E"
        select COMEDI_AMPLC_DIO200
@@ -543,12 +529,19 @@ config COMEDI_POC
        tristate "Generic driver for very simple devices"
        ---help---
          Enable generic support for very simple / POC (Piece of Crap) boards,
-         Keithley Metrabyte DAC-02 (dac02), Advantech PCL-733 (pcl733) and
-         PCL-734 (pcl734)
+         Keithley Metrabyte DAC-02 (dac02).
 
          To compile this driver as a module, choose M here: the module will be
          called poc.
 
+config COMEDI_S526
+       tristate "Sensoray s526 support"
+       ---help---
+         Enable support for Sensoray s526
+
+         To compile this driver as a module, choose M here: the module will be
+         called s526.
+
 endif # COMEDI_ISA_DRIVERS
 
 menuconfig COMEDI_PCI_DRIVERS
@@ -1076,14 +1069,6 @@ config COMEDI_RTD520
          To compile this driver as a module, choose M here: the module will be
          called rtd520.
 
-config COMEDI_S526
-       tristate "Sensoray s526 support"
-       ---help---
-         Enable support for Sensoray s526
-
-         To compile this driver as a module, choose M here: the module will be
-         called s526.
-
 config COMEDI_S626
        tristate "Sensoray 626 support"
        select COMEDI_FC
index 4233605df30ace5057dc40f16df7ad24b6496605..6bbbe5b08954100e9ddedd9f5a19a94eeb9c4199 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_H
index d4be0e68509b7f6a0b54d6bf7c412105349acdc6..b4c001b6f88f904c58b6e8cc946ae5a31f441109 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include "comedidev.h"
index ad208cdd53d406f8ad8151038b91ae49ce700c87..2dfb06aedb153cfd91040e4c3df570c8e74cc6e5 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/uaccess.h>
index 60cf51c4a793c1781288a025a4df8b845be42559..28e3c3059037a066ec70e01c538ff0c9a12d4148 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_COMPAT32_H
index 924c54c9c31fad595d1cdf9331db17d8c40413f4..c561a0eda92a6c6edddd2d095d3f1f23e39607e0 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #undef DEBUG
@@ -665,7 +660,7 @@ static int do_bufconfig_ioctl(struct comedi_device *dev,
        if (copy_from_user(&bc, arg, sizeof(bc)))
                return -EFAULT;
 
-       if (bc.subdevice >= dev->n_subdevices || bc.subdevice < 0)
+       if (bc.subdevice >= dev->n_subdevices)
                return -EINVAL;
 
        s = &dev->subdevices[bc.subdevice];
@@ -918,7 +913,7 @@ static int do_bufinfo_ioctl(struct comedi_device *dev,
        if (copy_from_user(&bi, arg, sizeof(bi)))
                return -EFAULT;
 
-       if (bi.subdevice >= dev->n_subdevices || bi.subdevice < 0)
+       if (bi.subdevice >= dev->n_subdevices)
                return -EINVAL;
 
        s = &dev->subdevices[bi.subdevice];
index 5fad084cfbd4dcb0a0a9906d183bb214038c76a3..abbc0e4f5c51288b9eae4ec0d32b4e2c06d9579a 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/pci.h>
index 453ff3b28617ad2c5270124065e24249ada791ba..9d49d5d01ad979a628b68de81712916cddcafe16 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/kernel.h>
index 9d9716a248f15f7f0816d84e35582bba0ffa91ce..13f18bef60916ab07c5145269373ecf550aacc89 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include <linux/usb.h>
@@ -34,6 +30,18 @@ struct usb_interface *comedi_to_usb_interface(struct comedi_device *dev)
 }
 EXPORT_SYMBOL_GPL(comedi_to_usb_interface);
 
+/**
+ * comedi_to_usb_dev() - comedi_device pointer to usb_device pointer.
+ * @dev: comedi_device struct
+ */
+struct usb_device *comedi_to_usb_dev(struct comedi_device *dev)
+{
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+
+       return intf ? interface_to_usbdev(intf) : NULL;
+}
+EXPORT_SYMBOL_GPL(comedi_to_usb_dev);
+
 /**
  * comedi_usb_auto_config() - Configure/probe a comedi USB driver.
  * @intf: usb_interface struct
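
comedi_to_usb_dev() pairs with the existing comedi_to_usb_interface() so USB drivers can reach the underlying usb_device without open-coding interface_to_usbdev(). A hypothetical caller, illustrative only and not part of this patch:

/* Assumes "../comedidev.h" and <linux/usb.h>. */
static int example_usb_auto_attach(struct comedi_device *dev,
                                   unsigned long context)
{
        struct usb_device *usb = comedi_to_usb_dev(dev);

        if (!usb)
                return -ENODEV;

        dev_info(dev->class_dev, "bound to USB device %s\n",
                 dev_name(&usb->dev));
        return 0;
}
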
index cdd472094cee1c2cbeca8ccd5df9a1e082b94a06..57deabf77418dc2beac03eb03049f1b226369748 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDIDEV_H
@@ -312,6 +307,18 @@ struct comedi_lrange {
        struct comedi_krange range[GCC_ZERO_LENGTH_ARRAY];
 };
 
+static inline bool comedi_range_is_bipolar(struct comedi_subdevice *s,
+                                          unsigned int range)
+{
+       return s->range_table->range[range].min < 0;
+}
+
+static inline bool comedi_range_is_unipolar(struct comedi_subdevice *s,
+                                           unsigned int range)
+{
+       return s->range_table->range[range].min >= 0;
+}
+
 /* some silly little inline functions */
 
 static inline unsigned int bytes_per_sample(const struct comedi_subdevice *subd)
@@ -351,6 +358,13 @@ int comedi_alloc_subdevices(struct comedi_device *, int);
 
 void comedi_spriv_free(struct comedi_device *, int subdev_num);
 
+int comedi_load_firmware(struct comedi_device *, struct device *,
+                        const char *name,
+                        int (*cb)(struct comedi_device *,
+                                  const u8 *data, size_t size,
+                                  unsigned long context),
+                        unsigned long context);
+
 int __comedi_request_region(struct comedi_device *,
                            unsigned long start, unsigned long len);
 int comedi_request_region(struct comedi_device *,
@@ -489,6 +503,7 @@ struct usb_driver;
 struct usb_interface;
 
 struct usb_interface *comedi_to_usb_interface(struct comedi_device *);
+struct usb_device *comedi_to_usb_dev(struct comedi_device *);
 
 int comedi_usb_auto_config(struct usb_interface *, struct comedi_driver *,
                           unsigned long context);
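
The comedi_range_is_bipolar()/comedi_range_is_unipolar() helpers added above encode a single convention: a range is bipolar exactly when its minimum is negative. A sketch of the kind of caller they are meant for (the munging shown is illustrative, not lifted from a real driver):

/* Assumes "../comedidev.h". */
static int example_ai_insn_read(struct comedi_device *dev,
                                struct comedi_subdevice *s,
                                struct comedi_insn *insn,
                                unsigned int *data)
{
        unsigned int range = CR_RANGE(insn->chanspec);
        unsigned int val = 0;           /* would be read from the hardware */

        /* Convert two's complement to offset binary on bipolar ranges. */
        if (comedi_range_is_bipolar(s, range))
                val ^= (s->maxdata + 1) >> 1;

        data[0] = val;
        return insn->n;
}
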
index ca92c43fdb3860e135f54fd9f51c7fe570a7f044..1a78b15543c447b0cd0e120449ce32768b0d9a14 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _LINUX_COMEDILIB_H
index 06d190f8fd34a5c81e0cccf97419f3e50b12fef4..f3e57fd8b2fbe8a6fed6014626749970b28b0bf7 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/device.h>
@@ -38,6 +33,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/firmware.h>
 
 #include "comedidev.h"
 #include "comedi_internal.h"
@@ -351,6 +347,38 @@ static void comedi_report_boards(struct comedi_driver *driv)
                pr_info(" %s\n", driv->driver_name);
 }
 
+/**
+ * comedi_load_firmware() - Request and load firmware for a device.
+ * @dev: comedi_device struct
+ * @device: device struct for the comedi_device
+ * @name: the name of the firmware image
+ * @cb: callback to upload the firmware image
+ * @context: private context from the driver
+ */
+int comedi_load_firmware(struct comedi_device *dev,
+                        struct device *device,
+                        const char *name,
+                        int (*cb)(struct comedi_device *dev,
+                                  const u8 *data, size_t size,
+                                  unsigned long context),
+                        unsigned long context)
+{
+       const struct firmware *fw;
+       int ret;
+
+       if (!cb)
+               return -EINVAL;
+
+       ret = request_firmware(&fw, name, device);
+       if (ret == 0) {
+               ret = cb(dev, fw->data, fw->size, context);
+               release_firmware(fw);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(comedi_load_firmware);
+
 /**
  * __comedi_request_region() - Request an I/O region for a legacy driver.
  * @dev: comedi_device struct
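
comedi_load_firmware() centralizes the request_firmware()/release_firmware() bookkeeping; a driver now supplies only the upload callback. A minimal hypothetical user (driver names and firmware file are made up):

/* Assumes "../comedidev.h" and <linux/usb.h>. */
static int example_upload(struct comedi_device *dev,
                          const u8 *data, size_t size,
                          unsigned long context)
{
        /* ...push `size` bytes of `data` to the hardware... */
        return 0;
}

static int example_auto_attach(struct comedi_device *dev,
                               unsigned long context)
{
        struct usb_device *usb = comedi_to_usb_dev(dev);

        if (!usb)
                return -ENODEV;

        return comedi_load_firmware(dev, &usb->dev, "example-fw.bin",
                                    example_upload, 0);
}
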
index 429e0d60c0a3836bfb3770980257a6bfb1c31c31..3abedcd2527bbb2aade465b75f0fe23f4918afdf 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _8253_H
index 1d48aa602eceaf80476d5cde48e379f08877854a..1a1c2dae886b6a8ac371a4bef46a87ad0b5baf9f 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: 8255
index 0f6e7492b7db7cd32b3bbcfb0c24ab05efee7fca..4f16ea78f86ac505a5630d309a71c5b52d81ac7c 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _8255_H
index 76dec96aeb2ad696f1ad0e8a8a3187eb323d1a47..1117b61da3af1292a202b99efe69c3352cb196d4 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 57e984f0f462eefc23b50c22a31f66b74697eba7..dbb93e332487ed505884e0f3ddc1072544b881fb 100644 (file)
@@ -11,19 +11,16 @@ obj-$(CONFIG_COMEDI_SERIAL2002)             += serial2002.o
 obj-$(CONFIG_COMEDI_SKEL)              += skel.o
 
 # Comedi ISA drivers
-obj-$(CONFIG_COMEDI_ACL7225B)          += acl7225b.o
 obj-$(CONFIG_COMEDI_AMPLC_DIO200_ISA)  += amplc_dio200.o
 obj-$(CONFIG_COMEDI_AMPLC_PC263_ISA)   += amplc_pc263.o
 obj-$(CONFIG_COMEDI_PCL711)            += pcl711.o
 obj-$(CONFIG_COMEDI_PCL724)            += pcl724.o
-obj-$(CONFIG_COMEDI_PCL725)            += pcl725.o
 obj-$(CONFIG_COMEDI_PCL726)            += pcl726.o
 obj-$(CONFIG_COMEDI_PCL730)            += pcl730.o
 obj-$(CONFIG_COMEDI_PCL812)            += pcl812.o
 obj-$(CONFIG_COMEDI_PCL816)            += pcl816.o
 obj-$(CONFIG_COMEDI_PCL818)            += pcl818.o
 obj-$(CONFIG_COMEDI_PCM3724)           += pcm3724.o
-obj-$(CONFIG_COMEDI_PCM3730)           += pcm3730.o
 obj-$(CONFIG_COMEDI_RTI800)            += rti800.o
 obj-$(CONFIG_COMEDI_RTI802)            += rti802.o
 obj-$(CONFIG_COMEDI_DAS16M1)           += das16m1.o
@@ -55,6 +52,7 @@ obj-$(CONFIG_COMEDI_PCMMIO)           += pcmmio.o
 obj-$(CONFIG_COMEDI_PCMUIO)            += pcmuio.o
 obj-$(CONFIG_COMEDI_MULTIQ3)           += multiq3.o
 obj-$(CONFIG_COMEDI_POC)               += poc.o
+obj-$(CONFIG_COMEDI_S526)              += s526.o
 
 # Comedi PCI drivers
 obj-$(CONFIG_COMEDI_8255_PCI)          += 8255_pci.o
@@ -110,7 +108,6 @@ obj-$(CONFIG_COMEDI_NI_LABPC_PCI)   += ni_labpc_pci.o
 obj-$(CONFIG_COMEDI_NI_PCIDIO)         += ni_pcidio.o
 obj-$(CONFIG_COMEDI_NI_PCIMIO)         += ni_pcimio.o
 obj-$(CONFIG_COMEDI_RTD520)            += rtd520.o
-obj-$(CONFIG_COMEDI_S526)              += s526.o
 obj-$(CONFIG_COMEDI_S626)              += s626.o
 obj-$(CONFIG_COMEDI_SSV_DNP)           += ssv_dnp.o
 
diff --git a/drivers/staging/comedi/drivers/acl7225b.c b/drivers/staging/comedi/drivers/acl7225b.c
deleted file mode 100644 (file)
index 9e2c7ae..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * comedi/drivers/acl7225b.c
- * Driver for Adlink NuDAQ ACL-7225b and clones
- * José Luis Sánchez
- */
-/*
-Driver: acl7225b
-Description: Adlink NuDAQ ACL-7225b & compatibles
-Author: José Luis Sánchez (jsanchezv@teleline.es)
-Status: testing
-Devices: [Adlink] ACL-7225b (acl7225b), [ICP] P16R16DIO (p16r16dio)
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define ACL7225_RIO_LO 0       /* Relays input/output low byte (R0-R7) */
-#define ACL7225_RIO_HI 1       /* Relays input/output high byte (R8-R15) */
-#define ACL7225_DI_LO  2       /* Digital input low byte (DI0-DI7) */
-#define ACL7225_DI_HI  3       /* Digital input high byte (DI8-DI15) */
-
-struct acl7225b_boardinfo {
-       const char *name;
-       int io_range;
-};
-
-static const struct acl7225b_boardinfo acl7225b_boards[] = {
-       {
-               .name           = "acl7225b",
-               .io_range       = 8,            /* only 4 are used */
-       }, {
-               .name           = "p16r16dio",
-               .io_range       = 4,
-       },
-};
-
-static int acl7225b_do_insn_bits(struct comedi_device *dev,
-                                struct comedi_subdevice *s,
-                                struct comedi_insn *insn,
-                                unsigned int *data)
-{
-       unsigned long reg = (unsigned long)s->private;
-       unsigned int mask = data[0];
-       unsigned int bits = data[1];
-
-       if (mask) {
-               s->state &= ~mask;
-               s->state |= (bits & mask);
-
-               if (mask & 0x00ff)
-                       outb(s->state & 0xff, dev->iobase + reg);
-               if (mask & 0xff00)
-                       outb((s->state >> 8), dev->iobase + reg + 1);
-       }
-
-       data[1] = s->state;
-
-       return insn->n;
-}
-
-static int acl7225b_di_insn_bits(struct comedi_device *dev,
-                                struct comedi_subdevice *s,
-                                struct comedi_insn *insn,
-                                unsigned int *data)
-{
-       unsigned long reg = (unsigned long)s->private;
-
-       data[1] = inb(dev->iobase + reg) |
-                 (inb(dev->iobase + reg + 1) << 8);
-
-       return insn->n;
-}
-
-static int acl7225b_attach(struct comedi_device *dev,
-                          struct comedi_devconfig *it)
-{
-       const struct acl7225b_boardinfo *board = comedi_board(dev);
-       struct comedi_subdevice *s;
-       int ret;
-
-       ret = comedi_request_region(dev, it->options[0], board->io_range);
-       if (ret)
-               return ret;
-
-       ret = comedi_alloc_subdevices(dev, 3);
-       if (ret)
-               return ret;
-
-       s = &dev->subdevices[0];
-       /* Relays outputs */
-       s->type         = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata      = 1;
-       s->n_chan       = 16;
-       s->insn_bits    = acl7225b_do_insn_bits;
-       s->range_table  = &range_digital;
-       s->private      = (void *)ACL7225_RIO_LO;
-
-       s = &dev->subdevices[1];
-       /* Relays status */
-       s->type         = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata      = 1;
-       s->n_chan       = 16;
-       s->insn_bits    = acl7225b_di_insn_bits;
-       s->range_table  = &range_digital;
-       s->private      = (void *)ACL7225_RIO_LO;
-
-       s = &dev->subdevices[2];
-       /* Isolated digital inputs */
-       s->type         = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata      = 1;
-       s->n_chan       = 16;
-       s->insn_bits    = acl7225b_di_insn_bits;
-       s->range_table  = &range_digital;
-       s->private      = (void *)ACL7225_DI_LO;
-
-       return 0;
-}
-
-static struct comedi_driver acl7225b_driver = {
-       .driver_name    = "acl7225b",
-       .module         = THIS_MODULE,
-       .attach         = acl7225b_attach,
-       .detach         = comedi_legacy_detach,
-       .board_name     = &acl7225b_boards[0].name,
-       .num_names      = ARRAY_SIZE(acl7225b_boards),
-       .offset         = sizeof(struct acl7225b_boardinfo),
-};
-module_comedi_driver(acl7225b_driver);
-
-MODULE_DESCRIPTION("Comedi: NuDAQ ACL-7225B, 16 Relay & 16 Isolated DI Card");
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_LICENSE("GPL");
index 5bd7fe64637c16a06b3cbcc2d7d1c324b4251b14..d91f586fdd2643dc46dc866d4f7f6b9260423ad9 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 6b38ce7a275be2a3f3d81f92c5dc9052b578fdf5..27de18e79895680a6d8356f357efaefcc458d24c 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 70a7f953fa2f3f97282f94c08917b147a7736c44..c9db601da2c998a7f6dc430346b53078a797273c 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index be0c6adbdc94158fba17d98cb326cb3768afc33d..6bbcb06cc27941654a9ef58e407c8cfd253c8c30 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index a211e78dd3ba244b23784aaa87114a53b9c1c417..5c830337db85780c6adcb47f0c6c8288cb122aff 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 97e7eec343d776d2610fcaf71c52357ccb91ca64..6ef1d6a434d909ea9ae6817f617dfcc6c8bd1033 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 3bc9826ce40bbcacec42383c9b397422372da25d..0b79531ac24b337f96bff274dfa2f43bf899e007 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index c8238b8921cd64a691b4c6ca4c785588cb06a0e0..fb56360444ee7b4f7308854129db2a6806992768 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 0c3db57a50f4e0e5896230fef5b5b4990e116196..f0cc32bae766e7eb6287f990d944d240a649ffb3 100644 (file)
@@ -20,13 +20,6 @@ This program is distributed in the hope that it will be useful, but WITHOUT ANY
 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this
-source code.
-
 @endverbatim
 */
 /*
index 5124ac9f1818da33cdb20397e6ac6ec0cb30e442..a5fed63059ad1e2bd11f272556d1c957b38d51b8 100644 (file)
  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  * for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #define NVRAM_USER_DATA_START  0x100
index b05f8505c894fa69af48ae2955df3dcb1963021f..b1a7ec1035e1194eb472b8efebae1449872a6bc9 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 3d66e48e0cf7b4417f0381877e280cff538daf24..1128c22e7517b45944746844cf4d2a419b06e8a4 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 24c4c983db381a5c7b72ecc9c30d6b7dd8cc33bb..054910511e9e222942a1828729df2f2108b3f267 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index fc31c4b934073f2ef15945870e1e91fb54b14af6..e3cc429403c08c35df09f02286a1079df039f0c9 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 74065baa3c08c274b5376aa0e9f349044232dbaa..81eb51e0195a2891da88edf02a91379dda98d275 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index c7908730caa5c41699cdf5db8d481bc2324f50ff..32dce0329fd5a1ab10d8a0c64ff162c538858812 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index a45a2a26e0da8f11345bb556ef1a1747ccb62aab..f75ffd929cf3b3884b6b23902d1476e43f4917a8 100644 (file)
@@ -15,10 +15,6 @@ This program is free software; you can redistribute it and/or modify it under th
 
 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
-You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-You should also find the complete GPL in the COPYING file accompanying this source code.
-
 @endverbatim
 */
 /*
index 3d4878facc26994a5e31de3d52375acbe2358ad8..8a93542faedcaadbc6aa12fc11601a27df231466 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying this
- * source code.
  */
 
 #include <linux/pci.h>
index ed01c56630bb7264fa11d8818f4bc03068a19bb3..5bf6bb129cd47c6436c4830eaaf4b28b89b4d2de 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
index 4c6a9b5a06aeeb31c29f54031e163d261dfb82dd..1f7bed9a3f7f6386f18fbebb299466fc88d036e3 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
index b666637f61beed09448a24bc93171653d88190d4..c0d83d8709f420dbc9063d120290b3191d7f3cae 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
index 1cdc08d79792ddfe96acdd2a9b3380716d4f6079..060620e184d6960d772f4a44ff41cabc5055fabb 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
index a0cf6ecdef0ee68c2557bbabe2a53616954c4f12..f9b63689a12af2b35d34536fd5a36247acfeaf8a 100644 (file)
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * You should also find the complete GPL in the COPYING file accompanying
- * this source code.
  */
 
 #include <linux/pci.h>
index 1666b5f510d3243b8811123a0c349b51b54ea7f9..2c21a16255473b37e205c35b68c85272d902d092 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 #include "../comedidev.h"
index 8a438ff1bd45979f7ec391b0a9d88e5d58221f81..b5e4e53f737f76068536d0e060d4979bcff42bab 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: adl_pci6208
index e3960745f506ed671041b86ebbf48bb8353a7db3..0d9243a5f495bb142cf6fe466b03bd03ff27a83c 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index b3ec60afe3a040f9b55fa6667e2a870e5f0a0108..0b591b0b55013c13c2eee52efa0cdc9e4f952b54 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 6247fdcedcbf9d7da47f0947fff4c1cae965db1c..af51c746004845e02255f130d63b41ead49c40be 100644 (file)
@@ -17,10 +17,6 @@ This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 71142e36e7fb66445cd683f38e33ee85f0ea322e..d187a7bf0a55e1e04c34fa362aae05f9268f9f48 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: adq12b
index ccc114d6c08b4e183beb0c126606f7a75b7f2917..8430a27ec1b59d22fbbfb4b6b688d2c89c840dc2 100644 (file)
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/*
    comedi/drivers/pci1723.c
 
    COMEDI - Linux Control and Measurement Device Interface
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-*******************************************************************************/
+*/
 /*
 Driver: adv_pci1723
 Description: Advantech PCI-1723
index e60f12578d44c2e9d6f3ec0f584e8854c02fa8a2..da7462e01faa9f987092c4e9161364b0ef5d39bf 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
 
index e2dc08a058bc680d42ce67299d63abed2f3806e8..77c92cb23d474c4a438a4157e013471e33eb52b6 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 126854c0509322e67799f3bf9587010410c5ce87..029834d0ff1f88968f5c6266667363099a78cac3 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 297750bef0f73c0c44b07e7e83489a52d17c48a8..e2478105ac1a0d9439b63b34b91bd503a7a64414 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: amplc_dio200
index cf2e7261740e35927cc31f247210b3419b1741fe..43160b9944bbdf4b09e14cbf8dedc5f4dc7f44c0 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef AMPLC_DIO200_H_INCLUDED
index 3403e5ccfa93034acf6b1b19d5b49bffff2f0f8a..81d6ee4549cb08db145f46b3e2f6501cc36179a5 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/interrupt.h>
index 4be44e877373515ea3e01f87e2526ab53612fe22..d7d9f5cc3ab40fe81dc074f588c863871b34948a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: amplc_dio200_pci
index 115ecd51677e355d45b10a685a1bef5750b8e60a..179d25e00ace87ee7137c568ca399c2ae7b3973a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pc236
index 94a752d852bb061e5fd0885e559db65b00fdc919..6546095e7a45bec5ed4ec54a78bbd8db36d01f88 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pc263
index 4d7eab9b55658c0866a1b2be54d2213be0632a39..f1e36f08b103950e5ce4d7c39c44abe64ca3d935 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pci224
index 49200fbd60b994b6984ca42dff9f404f8a54ea4d..08ff12837e00150601a7de497884bab0b712c4b3 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   */
 /*
 Driver: amplc_pci230
index 8b57533bf4062483dded80783840e2257f297f30..4da900cc5845a146edda0e3221e42194af10b0f7 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: amplc_pci263
index 92376dc86dd8883695459d54c4f99dfc4dbc168d..929218a359754c9381ef383de6c80b7bcf1a3e87 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: c6xdigio
index f874fff445236bc627615dd82af30f059ae881b0..ae9a2082b5a44ee482ea0a3f1acc3bb7921ac8d2 100644 (file)
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
     PCMCIA support code for this driver is adapted from the dummy_cs.c
     driver of the Linux PCMCIA Card Services package.
 
index 53dd298d2b5443e41413676fa980ed7cce46c64c..172f459b0af2f920055184d7479866ba6700a30a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: cb_pcidas
index c3e5495b4f06aa43b45ec8932ec0d88bee99865a..11f8101d851a8a11ceffd4d3648f04ce22163ea1 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
  * Driver: cb_pcidas64
index f9b459888b8b8f3f1b18349a104698db2c947773..b74d4c38b5c3afa588c11197a182566c900d7402 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 29813c9d4a2a14783420b83021d312e016733b61..8b5c198862a1a44cf7806f70d85c6b57fb0c73cd 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: cb_pcimdas
index 88f03ae6f3e67b28e83627d2682c3ffd0ad4d688..d3a796eef883db0bc6285aed98993ec4a9a3f44a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: cb_pcimdda
index 1bb53816eca3ef80e67d12f73ca43ba01559f470..1a51866be6f7d8deeefbcb8d13d9c92ecbd68078 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: comedi_bond
index 37dc79637d2a0299b7990ba52d10dbdafea79f25..b3d89c82d08798697a654980686e085e7770038f 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 #include "../comedidev.h"
 
index 31afab79f39a250b96dbce524bedc0e0b9ed8c1a..a4dea7cb86bef30294574aa42299793b70eb0308 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 #ifndef _COMEDI_FC_H
 #define _COMEDI_FC_H
index 3e061cc9b48ef3fc22ebf2d9961c4044fdbccd50..772a8f5f0c1ceec33b6df1574ce3199c0fbd05c9 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: comedi_parport
index c1d8e86f53a20e73b99196c05610c767640aa604..907e7a3822f5da6a733d7dc6fab8d63b55a9cc57 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 /*
 Driver: comedi_test
 Description: generates fake waveforms
index f2230bfd4eb9c656ff0103085d1b5f460254e7bf..0fb9027dde2d71026403c78c8fa54b9b6ed6a622 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: contec_pci_dio
index b87f95c3e17dbd7944fb815d6da77d6d235e06ab..f5aa3860e3e7e8d40ea7eeda1774df094b834116 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: daqboard2000
@@ -110,7 +105,6 @@ Configuration options: not applicable, uses PCI auto config
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -524,7 +518,8 @@ static int daqboard2000_writeCPLD(struct comedi_device *dev, int data)
 }
 
 static int initialize_daqboard2000(struct comedi_device *dev,
-                                  const u8 *cpld_array, size_t len)
+                                  const u8 *cpld_array, size_t len,
+                                  unsigned long context)
 {
        struct daqboard2000_private *devpriv = dev->private;
        int result = -EIO;
@@ -565,22 +560,6 @@ static int initialize_daqboard2000(struct comedi_device *dev,
        return result;
 }
 
-static int daqboard2000_upload_firmware(struct comedi_device *dev)
-{
-       struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-       const struct firmware *fw;
-       int ret;
-
-       ret = request_firmware(&fw, DAQBOARD2000_FIRMWARE, &pcidev->dev);
-       if (ret)
-               return ret;
-
-       ret = initialize_daqboard2000(dev, fw->data, fw->size);
-       release_firmware(fw);
-
-       return ret;
-}
-
 static void daqboard2000_adcStopDmaTransfer(struct comedi_device *dev)
 {
 }
@@ -724,7 +703,9 @@ static int daqboard2000_auto_attach(struct comedi_device *dev,
 
        readl(devpriv->plx + 0x6c);
 
-       result = daqboard2000_upload_firmware(dev);
+       result = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+                                     DAQBOARD2000_FIRMWARE,
+                                     initialize_daqboard2000, 0);
        if (result < 0)
                return result;
 
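
The daqboard2000 hunk above is the conversion the new helper enables: the driver-local request/release wrapper disappears and initialize_daqboard2000() simply grows the context argument required by the callback type (unused here). The context word also lets a single upload routine serve several board variants, e.g. (hypothetical, not from this patch):

/* Assumes "../comedidev.h" and <linux/pci.h>. */
enum { FW_VARIANT_A, FW_VARIANT_B };

static int variant_upload(struct comedi_device *dev,
                          const u8 *data, size_t size,
                          unsigned long context)
{
        if (context == FW_VARIANT_B) {
                /* e.g. a different load address or handshake */
        }
        /* ...common upload path... */
        return 0;
}

static int variant_auto_attach(struct comedi_device *dev,
                               unsigned long context)
{
        struct pci_dev *pcidev = comedi_to_pci_dev(dev);

        return comedi_load_firmware(dev, &pcidev->dev, "variant-b.bin",
                                    variant_upload, FW_VARIANT_B);
}
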
index ba12c1d605fb956a2c963bf97367cba37158598f..64807eaa559eba5dd97fbeea7cf6f1b6ecc817d8 100644 (file)
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- *****************************************************************
  */
 
 /*
index 89bb8d6fdfc66fdd5816d4e357a2e3bcabc3fb4e..c312870ab691219aa529519fd853b15be5d2ff9e 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _DAS08_H
index d9f3e92317d3a2b5f1f318089b0f0d38a8d9e59a..3625b3eafe7c93005acfb85b2ff7ebbd04fc8157 100644 (file)
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
     PCMCIA support code for this driver is adapted from the dummy_cs.c
     driver of the Linux PCMCIA Card Services package.
 
     The initial developer of the original code is David A. Hinds
     <dahinds@users.sourceforge.net>.  Portions created by David A. Hinds
     are Copyright (C) 1999 David A. Hinds.  All Rights Reserved.
-
-*****************************************************************
-
 */
 /*
 Driver: das08_cs
index f09f6966ed6558fff3c51c6364eba4b75302d5ab..10e96e94515ca93e9c3575a9c66e5ed5ebdf96f1 100644 (file)
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 53fa943dd0b7038393c6d58bbe81754c32a892f5..351fbc6027f11bd93e5f977caec9381196457482 100644 (file)
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 762b5a6eac5acc1b94f8552230158f0f4a51a55c..dab7647ec2f5776f53312959e18224f9fa12a08e 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das16
index 9cb9c3b0479718eccc5468d6b47fc7e4792d4ed0..e7ae2ffe8c797b24375cd92ce7d62c4544883a13 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das16m1
index abf7638a9f71c20dba38a812ba953d41021bf25e..23b4a661eb1afbcad6b035f3adec15a525b46bf9 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das1800
index 11424fb5b4d46a65a165cf1c4f46e107d95802f7..f0530778bb3b376dd5b985c134eaf21fc1856854 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: das6402
index 9ce6cbcc7ee8c092e4d21f0037f18cda3a346356..091cd911b38a852bd6a1af6dd2d581d36d7d56df 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: das800
index 6c85dd2d549b8a827ad19947d62c722a2f0498fe..e29847d73b432897eecfe6e3362cd66b762c1c85 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dmm32at
index 8757b54ad4acfdae1022d703e4e5ea2fb8d6f800..5348cdae408a7e4045d8223a62d2ad7a0068a50d 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
 Driver: dt2811
index 7c95b3b68131af627acca6ada769057d410f1b9e..87e9749c4be770f6522dcbb33604c55b18cc8d4f 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt2814
index b24e87681fe3c25efef5e6ef4edc177bebd951a4..0fcd4fe7acdc4967e6f7f3d01eeece86f7a9ea5e 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: dt2815
index b5c8e8213faf71da172632604293997230db94f5..2f46be715f79f10a2042d104be652829fcac466e 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt2817
index 90f2de9bc402d614ea965a587f3ec287a636e511..c1950e3b19a2b69f5f18adba431ec2b311897778 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: dt282x
index 7e03929c9a14f3405fc711a6dad0104cf535af61..01a2f889d5b023e227a1abc4bae259cacbe62457 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: dt3000
index 81eb5ed6ec97dae546c3ea9dfcbb0bdd46cdf9c3..6c60949d919391b0255d7fc7c27827f1891a82d4 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  */
 
 /*
@@ -43,14 +38,11 @@ for my needs.
  *      says P1).
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/kref.h>
 #include <linux/uaccess.h>
 #include <linux/usb.h>
 
@@ -60,6 +52,9 @@ for my needs.
 #define DT9812_MAX_WRITE_CMD_PIPE_SIZE 32
 #define DT9812_MAX_READ_CMD_PIPE_SIZE  32
 
+/* usb_bulk_msg() timeout in milliseconds */
+#define DT9812_USB_TIMEOUT             1000
+
 /*
  * See Silicon Laboratories C8051F020/1/2/3 manual
  */
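
(The DT9812_USB_TIMEOUT constant added above replaces the HZ * 1 arguments in the
usb_bulk_msg() calls rewritten below. usb_bulk_msg() takes its timeout in
milliseconds, not jiffies, so HZ * 1 was one second only on CONFIG_HZ=1000
kernels. A sketch of the arithmetic, assuming common CONFIG_HZ values:

	/* HZ jiffies passed where milliseconds are expected:
	 *   CONFIG_HZ=1000 -> 1000 ms (one second, correct by accident)
	 *   CONFIG_HZ=250  ->  250 ms (four times shorter than intended)
	 * DT9812_USB_TIMEOUT = 1000 is explicit and HZ-independent.
	 */
)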
@@ -242,87 +237,25 @@ struct dt9812_usb_cmd {
                struct dt9812_write_multi write_multi_info;
                struct dt9812_rmw_multi rmw_multi_info;
        } u;
-#if 0
-       WRITE_BYTE_INFO WriteByteInfo;
-       READ_BYTE_INFO ReadByteInfo;
-       WRITE_MULTI_INFO WriteMultiInfo;
-       READ_MULTI_INFO ReadMultiInfo;
-       RMW_BYTE_INFO RMWByteInfo;
-       RMW_MULTI_INFO RMWMultiInfo;
-       DAC_THRESHOLD_INFO DacThresholdInfo;
-       INT_ON_CHANGE_MASK_INFO IntOnChangeMaskInfo;
-       CGL_INFO CglInfo;
-       SUBSYSTEM_INFO SubsystemInfo;
-       CAL_POT_CMD CalPotCmd;
-       WRITE_DEV_BYTE_INFO WriteDevByteInfo;
-       READ_DEV_BYTE_INFO ReadDevByteInfo;
-       WRITE_DEV_MULTI_INFO WriteDevMultiInfo;
-       READ_DEV_MULTI_INFO ReadDevMultiInfo;
-       READ_SINGLE_VALUE_INFO ReadSingleValueInfo;
-       WRITE_SINGLE_VALUE_INFO WriteSingleValueInfo;
-#endif
 };
 
-#define DT9812_NUM_SLOTS       16
-
-static DEFINE_SEMAPHORE(dt9812_mutex);
-
-static const struct usb_device_id dt9812_table[] = {
-       {USB_DEVICE(0x0867, 0x9812)},
-       {}                      /* Terminating entry */
-};
-
-MODULE_DEVICE_TABLE(usb, dt9812_table);
-
-struct usb_dt9812 {
-       struct slot_dt9812 *slot;
-       struct usb_device *udev;
-       struct usb_interface *interface;
-       u16 vendor;
-       u16 product;
-       u16 device;
-       u32 serial;
+struct dt9812_private {
+       struct semaphore sem;
        struct {
                __u8 addr;
                size_t size;
-       } message_pipe, command_write, command_read, write_stream, read_stream;
-       struct kref kref;
-       u16 analog_out_shadow[2];
-       u8 digital_out_shadow;
-};
-
-struct comedi_dt9812 {
-       struct slot_dt9812 *slot;
-       u32 serial;
-};
-
-struct slot_dt9812 {
-       struct semaphore mutex;
-       u32 serial;
-       struct usb_dt9812 *usb;
-       struct comedi_dt9812 *comedi;
+       } cmd_wr, cmd_rd;
+       u16 device;
+       u16 ao_shadow[2];
 };
 
-static struct slot_dt9812 dt9812[DT9812_NUM_SLOTS];
-
-static inline struct usb_dt9812 *to_dt9812_dev(struct kref *d)
-{
-       return container_of(d, struct usb_dt9812, kref);
-}
-
-static void dt9812_delete(struct kref *kref)
-{
-       struct usb_dt9812 *dev = to_dt9812_dev(kref);
-
-       usb_put_dev(dev->udev);
-       kfree(dev);
-}
-
-static int dt9812_read_info(struct usb_dt9812 *dev, int offset, void *buf,
-                           size_t buf_size)
+static int dt9812_read_info(struct comedi_device *dev,
+                           int offset, void *buf, size_t buf_size)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct dt9812_private *devpriv = dev->private;
        struct dt9812_usb_cmd cmd;
-       int count, retval;
+       int count, ret;
 
        cmd.cmd = cpu_to_le32(DT9812_R_FLASH_DATA);
        cmd.u.flash_data_info.address =
@@ -330,25 +263,23 @@ static int dt9812_read_info(struct usb_dt9812 *dev, int offset, void *buf,
        cmd.u.flash_data_info.numbytes = cpu_to_le16(buf_size);
 
        /* DT9812 only responds to 32 byte writes!! */
-       count = 32;
-       retval = usb_bulk_msg(dev->udev,
-                             usb_sndbulkpipe(dev->udev,
-                                             dev->command_write.addr),
-                             &cmd, 32, &count, HZ * 1);
-       if (retval)
-               return retval;
-       retval = usb_bulk_msg(dev->udev,
-                             usb_rcvbulkpipe(dev->udev,
-                                             dev->command_read.addr),
-                             buf, buf_size, &count, HZ * 1);
-       return retval;
+       ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+                          &cmd, 32, &count, DT9812_USB_TIMEOUT);
+       if (ret)
+               return ret;
+
+       return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr),
+                           buf, buf_size, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_read_multiple_registers(struct usb_dt9812 *dev, int reg_count,
-                                         u8 *address, u8 *value)
+static int dt9812_read_multiple_registers(struct comedi_device *dev,
+                                         int reg_count, u8 *address,
+                                         u8 *value)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct dt9812_private *devpriv = dev->private;
        struct dt9812_usb_cmd cmd;
-       int i, count, retval;
+       int i, count, ret;
 
        cmd.cmd = cpu_to_le32(DT9812_R_MULTI_BYTE_REG);
        cmd.u.read_multi_info.count = reg_count;
@@ -356,26 +287,23 @@ static int dt9812_read_multiple_registers(struct usb_dt9812 *dev, int reg_count,
                cmd.u.read_multi_info.address[i] = address[i];
 
        /* DT9812 only responds to 32 byte writes!! */
-       count = 32;
-       retval = usb_bulk_msg(dev->udev,
-                             usb_sndbulkpipe(dev->udev,
-                                             dev->command_write.addr),
-                             &cmd, 32, &count, HZ * 1);
-       if (retval)
-               return retval;
-       retval = usb_bulk_msg(dev->udev,
-                             usb_rcvbulkpipe(dev->udev,
-                                             dev->command_read.addr),
-                             value, reg_count, &count, HZ * 1);
-       return retval;
+       ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+                          &cmd, 32, &count, DT9812_USB_TIMEOUT);
+       if (ret)
+               return ret;
+
+       return usb_bulk_msg(usb, usb_rcvbulkpipe(usb, devpriv->cmd_rd.addr),
+                           value, reg_count, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_write_multiple_registers(struct usb_dt9812 *dev,
+static int dt9812_write_multiple_registers(struct comedi_device *dev,
                                           int reg_count, u8 *address,
                                           u8 *value)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct dt9812_private *devpriv = dev->private;
        struct dt9812_usb_cmd cmd;
-       int i, count, retval;
+       int i, count;
 
        cmd.cmd = cpu_to_le32(DT9812_W_MULTI_BYTE_REG);
        cmd.u.read_multi_info.count = reg_count;
@@ -383,19 +311,20 @@ static int dt9812_write_multiple_registers(struct usb_dt9812 *dev,
                cmd.u.write_multi_info.write[i].address = address[i];
                cmd.u.write_multi_info.write[i].value = value[i];
        }
+
        /* DT9812 only responds to 32 byte writes!! */
-       retval = usb_bulk_msg(dev->udev,
-                             usb_sndbulkpipe(dev->udev,
-                                             dev->command_write.addr),
-                             &cmd, 32, &count, HZ * 1);
-       return retval;
+       return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+                           &cmd, 32, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_rmw_multiple_registers(struct usb_dt9812 *dev, int reg_count,
+static int dt9812_rmw_multiple_registers(struct comedi_device *dev,
+                                        int reg_count,
                                         struct dt9812_rmw_byte *rmw)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct dt9812_private *devpriv = dev->private;
        struct dt9812_usb_cmd cmd;
-       int i, count, retval;
+       int i, count;
 
        cmd.cmd = cpu_to_le32(DT9812_RMW_MULTI_BYTE_REG);
        cmd.u.rmw_multi_info.count = reg_count;
@@ -403,76 +332,52 @@ static int dt9812_rmw_multiple_registers(struct usb_dt9812 *dev, int reg_count,
                cmd.u.rmw_multi_info.rmw[i] = rmw[i];
 
        /* DT9812 only responds to 32 byte writes!! */
-       retval = usb_bulk_msg(dev->udev,
-                             usb_sndbulkpipe(dev->udev,
-                                             dev->command_write.addr),
-                             &cmd, 32, &count, HZ * 1);
-       return retval;
+       return usb_bulk_msg(usb, usb_sndbulkpipe(usb, devpriv->cmd_wr.addr),
+                           &cmd, 32, &count, DT9812_USB_TIMEOUT);
 }
 
-static int dt9812_digital_in(struct slot_dt9812 *slot, u8 *bits)
+static int dt9812_digital_in(struct comedi_device *dev, u8 *bits)
 {
-       int result = -ENODEV;
-
-       down(&slot->mutex);
-       if (slot->usb) {
-               u8 reg[2] = { F020_SFR_P3, F020_SFR_P1 };
-               u8 value[2];
+       struct dt9812_private *devpriv = dev->private;
+       u8 reg[2] = { F020_SFR_P3, F020_SFR_P1 };
+       u8 value[2];
+       int ret;
 
-               result = dt9812_read_multiple_registers(slot->usb, 2, reg,
-                                                       value);
-               if (result == 0) {
-                       /*
-                        * bits 0-6 in F020_SFR_P3 are bits 0-6 in the digital
-                        * input port bit 3 in F020_SFR_P1 is bit 7 in the
-                        * digital input port
-                        */
-                       *bits = (value[0] & 0x7f) | ((value[1] & 0x08) << 4);
-                       /* printk("%2.2x, %2.2x -> %2.2x\n",
-                          value[0], value[1], *bits); */
-               }
+       down(&devpriv->sem);
+       ret = dt9812_read_multiple_registers(dev, 2, reg, value);
+       if (ret == 0) {
+               /*
+                * bits 0-6 in F020_SFR_P3 are bits 0-6 in the digital
+                * input port; bit 3 in F020_SFR_P1 is bit 7 in the
+                * digital input port
+                */
+               *bits = (value[0] & 0x7f) | ((value[1] & 0x08) << 4);
        }
-       up(&slot->mutex);
+       up(&devpriv->sem);
 
-       return result;
+       return ret;
 }
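
(The merge in dt9812_digital_in() packs the two SFR reads into one byte: the
low seven bits come from P3 and P1.3 supplies bit 7. A worked example with
hypothetical register readings:

	/* P3 = 0x55: value[0] & 0x7f        = 0x55 (bits 0-6)
	 * P1 = 0x08: (value[1] & 0x08) << 4 = 0x80 (P1.3 -> bit 7)
	 * bits = 0x55 | 0x80 = 0xd5
	 */
)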
 
-static int dt9812_digital_out(struct slot_dt9812 *slot, u8 bits)
+static int dt9812_digital_out(struct comedi_device *dev, u8 bits)
 {
-       int result = -ENODEV;
-
-       down(&slot->mutex);
-       if (slot->usb) {
-               u8 reg[1];
-               u8 value[1];
-
-               reg[0] = F020_SFR_P2;
-               value[0] = bits;
-               result = dt9812_write_multiple_registers(slot->usb, 1, reg,
-                                                        value);
-               slot->usb->digital_out_shadow = bits;
-       }
-       up(&slot->mutex);
-       return result;
-}
+       struct dt9812_private *devpriv = dev->private;
+       u8 reg[1] = { F020_SFR_P2 };
+       u8 value[1] = { bits };
+       int ret;
 
-static int dt9812_digital_out_shadow(struct slot_dt9812 *slot, u8 *bits)
-{
-       int result = -ENODEV;
+       down(&devpriv->sem);
+       ret = dt9812_write_multiple_registers(dev, 1, reg, value);
+       up(&devpriv->sem);
 
-       down(&slot->mutex);
-       if (slot->usb) {
-               *bits = slot->usb->digital_out_shadow;
-               result = 0;
-       }
-       up(&slot->mutex);
-       return result;
+       return ret;
 }
 
-static void dt9812_configure_mux(struct usb_dt9812 *dev,
+static void dt9812_configure_mux(struct comedi_device *dev,
                                 struct dt9812_rmw_byte *rmw, int channel)
 {
-       if (dev->device == DT9812_DEVID_DT9812_10) {
+       struct dt9812_private *devpriv = dev->private;
+
+       if (devpriv->device == DT9812_DEVID_DT9812_10) {
                /* In the DT9812/10V MUX is selected by P1.5-7 */
                rmw->address = F020_SFR_P1;
                rmw->and_mask = 0xe0;
@@ -485,18 +390,21 @@ static void dt9812_configure_mux(struct usb_dt9812 *dev,
        }
 }
 
-static void dt9812_configure_gain(struct usb_dt9812 *dev,
+static void dt9812_configure_gain(struct comedi_device *dev,
                                  struct dt9812_rmw_byte *rmw,
                                  enum dt9812_gain gain)
 {
-       if (dev->device == DT9812_DEVID_DT9812_10) {
-               /* In the DT9812/10V, there is an external gain of 0.5 */
+       struct dt9812_private *devpriv = dev->private;
+
+       /* In the DT9812/10V, there is an external gain of 0.5 */
+       if (devpriv->device == DT9812_DEVID_DT9812_10)
                gain <<= 1;
-       }
 
        rmw->address = F020_SFR_ADC0CF;
        rmw->and_mask = F020_MASK_ADC0CF_AMP0GN2 |
-           F020_MASK_ADC0CF_AMP0GN1 | F020_MASK_ADC0CF_AMP0GN0;
+                       F020_MASK_ADC0CF_AMP0GN1 |
+                       F020_MASK_ADC0CF_AMP0GN0;
+
        switch (gain) {
                /*
                 * 000 -> Gain =  1
@@ -508,8 +416,10 @@ static void dt9812_configure_gain(struct usb_dt9812 *dev,
                 */
        case DT9812_GAIN_0PT5:
                rmw->or_value = F020_MASK_ADC0CF_AMP0GN2 |
-                   F020_MASK_ADC0CF_AMP0GN1;
+                               F020_MASK_ADC0CF_AMP0GN1;
                break;
+       default:
+               /* this should never happen, just use a gain of 1 */
        case DT9812_GAIN_1:
                rmw->or_value = 0x00;
                break;
@@ -521,20 +431,18 @@ static void dt9812_configure_gain(struct usb_dt9812 *dev,
                break;
        case DT9812_GAIN_8:
                rmw->or_value = F020_MASK_ADC0CF_AMP0GN1 |
-                   F020_MASK_ADC0CF_AMP0GN0;
+                               F020_MASK_ADC0CF_AMP0GN0;
                break;
        case DT9812_GAIN_16:
                rmw->or_value = F020_MASK_ADC0CF_AMP0GN2;
                break;
-       default:
-               dev_err(&dev->interface->dev, "Illegal gain %d\n", gain);
-
        }
 }
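
(Placing default: immediately before case DT9812_GAIN_1: relies on C case
fall-through, so an out-of-range gain now programs unity gain instead of
leaving rmw->or_value uninitialized; the old code only logged an error.
A condensed sketch of the idiom:

	switch (gain) {
	default:		/* unexpected gain: treat as gain 1 */
	case DT9812_GAIN_1:
		rmw->or_value = 0x00;
		break;
	/* ... remaining gain cases as in the driver ... */
	}
)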
 
-static int dt9812_analog_in(struct slot_dt9812 *slot, int channel, u16 *value,
-                           enum dt9812_gain gain)
+static int dt9812_analog_in(struct comedi_device *dev,
+                           int channel, u16 *value, enum dt9812_gain gain)
 {
+       struct dt9812_private *devpriv = dev->private;
        struct dt9812_rmw_byte rmw[3];
        u8 reg[3] = {
                F020_SFR_ADC0CN,
@@ -542,31 +450,30 @@ static int dt9812_analog_in(struct slot_dt9812 *slot, int channel, u16 *value,
                F020_SFR_ADC0L
        };
        u8 val[3];
-       int result = -ENODEV;
+       int ret;
 
-       down(&slot->mutex);
-       if (!slot->usb)
-               goto exit;
+       down(&devpriv->sem);
 
        /* 1 select the gain */
-       dt9812_configure_gain(slot->usb, &rmw[0], gain);
+       dt9812_configure_gain(dev, &rmw[0], gain);
 
        /* 2 set the MUX to select the channel */
-       dt9812_configure_mux(slot->usb, &rmw[1], channel);
+       dt9812_configure_mux(dev, &rmw[1], channel);
 
        /* 3 start conversion */
        rmw[2].address = F020_SFR_ADC0CN;
        rmw[2].and_mask = 0xff;
        rmw[2].or_value = F020_MASK_ADC0CN_AD0EN | F020_MASK_ADC0CN_AD0BUSY;
 
-       result = dt9812_rmw_multiple_registers(slot->usb, 3, rmw);
-       if (result)
+       ret = dt9812_rmw_multiple_registers(dev, 3, rmw);
+       if (ret)
                goto exit;
 
        /* read the status and ADC */
-       result = dt9812_read_multiple_registers(slot->usb, 3, reg, val);
-       if (result)
+       ret = dt9812_read_multiple_registers(dev, 3, reg, val);
+       if (ret)
                goto exit;
+
        /*
         * An ADC conversion takes 16 SAR clock cycles, i.e. about 9us.
         * Therefore, between the instant that AD0BUSY was set via
@@ -578,7 +485,7 @@ static int dt9812_analog_in(struct slot_dt9812 *slot, int channel, u16 *value,
         */
        if ((val[0] & (F020_MASK_ADC0CN_AD0INT | F020_MASK_ADC0CN_AD0BUSY)) ==
            F020_MASK_ADC0CN_AD0INT) {
-               switch (slot->usb->device) {
+               switch (devpriv->device) {
                case DT9812_DEVID_DT9812_10:
                        /*
                         * For DT9812-10V the personality module set the
@@ -594,422 +501,284 @@ static int dt9812_analog_in(struct slot_dt9812 *slot, int channel, u16 *value,
        }
 
 exit:
-       up(&slot->mutex);
-       return result;
+       up(&devpriv->sem);
+
+       return ret;
 }
 
-static int dt9812_analog_out_shadow(struct slot_dt9812 *slot, int channel,
-                                   u16 *value)
+static int dt9812_analog_out(struct comedi_device *dev, int channel, u16 value)
 {
-       int result = -ENODEV;
+       struct dt9812_private *devpriv = dev->private;
+       struct dt9812_rmw_byte rmw[3];
+       int ret;
 
-       down(&slot->mutex);
-       if (slot->usb) {
-               *value = slot->usb->analog_out_shadow[channel];
-               result = 0;
+       down(&devpriv->sem);
+
+       switch (channel) {
+       case 0:
+               /* 1. Set DAC mode */
+               rmw[0].address = F020_SFR_DAC0CN;
+               rmw[0].and_mask = 0xff;
+               rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+
+               /* 2 load low byte of DAC value first */
+               rmw[1].address = F020_SFR_DAC0L;
+               rmw[1].and_mask = 0xff;
+               rmw[1].or_value = value & 0xff;
+
+               /* 3 load high byte of DAC value next, to latch the
+                       12-bit value */
+               rmw[2].address = F020_SFR_DAC0H;
+               rmw[2].and_mask = 0xff;
+               rmw[2].or_value = (value >> 8) & 0xf;
+               break;
+
+       case 1:
+               /* 1. Set DAC mode */
+               rmw[0].address = F020_SFR_DAC1CN;
+               rmw[0].and_mask = 0xff;
+               rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
+
+               /* 2 load low byte of DAC value first */
+               rmw[1].address = F020_SFR_DAC1L;
+               rmw[1].and_mask = 0xff;
+               rmw[1].or_value = value & 0xff;
+
+               /* 3 load high byte of DAC value next, to latch the
+                       12-bit value */
+               rmw[2].address = F020_SFR_DAC1H;
+               rmw[2].and_mask = 0xff;
+               rmw[2].or_value = (value >> 8) & 0xf;
+               break;
        }
-       up(&slot->mutex);
+       ret = dt9812_rmw_multiple_registers(dev, 3, rmw);
+       devpriv->ao_shadow[channel] = value;
 
-       return result;
+       up(&devpriv->sem);
+
+       return ret;
 }
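
(The three-step rmw sequence in dt9812_analog_out() is ordered deliberately:
per the comments above, the 12-bit DAC value is latched when the high byte is
written, so the low byte must be loaded first. A condensed sketch for channel
0; channel 1 is identical with the F020_SFR_DAC1* registers:

	rmw[0].address = F020_SFR_DAC0CN;	/* 1. enable the DAC */
	rmw[1].address = F020_SFR_DAC0L;	/* 2. low byte first */
	rmw[2].address = F020_SFR_DAC0H;	/* 3. high nibble last, latches all 12 bits */
)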
 
-static int dt9812_analog_out(struct slot_dt9812 *slot, int channel, u16 value)
+static int dt9812_di_insn_bits(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
 {
-       int result = -ENODEV;
+       u8 bits = 0;
+       int ret;
 
-       down(&slot->mutex);
-       if (slot->usb) {
-               struct dt9812_rmw_byte rmw[3];
+       ret = dt9812_digital_in(dev, &bits);
+       if (ret)
+               return ret;
 
-               switch (channel) {
-               case 0:
-                       /* 1. Set DAC mode */
-                       rmw[0].address = F020_SFR_DAC0CN;
-                       rmw[0].and_mask = 0xff;
-                       rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
-
-                       /* 2 load low byte of DAC value first */
-                       rmw[1].address = F020_SFR_DAC0L;
-                       rmw[1].and_mask = 0xff;
-                       rmw[1].or_value = value & 0xff;
-
-                       /* 3 load high byte of DAC value next to latch the
-                          12-bit value */
-                       rmw[2].address = F020_SFR_DAC0H;
-                       rmw[2].and_mask = 0xff;
-                       rmw[2].or_value = (value >> 8) & 0xf;
-                       break;
+       data[1] = bits;
 
-               case 1:
-                       /* 1. Set DAC mode */
-                       rmw[0].address = F020_SFR_DAC1CN;
-                       rmw[0].and_mask = 0xff;
-                       rmw[0].or_value = F020_MASK_DACxCN_DACxEN;
-
-                       /* 2 load low byte of DAC value first */
-                       rmw[1].address = F020_SFR_DAC1L;
-                       rmw[1].and_mask = 0xff;
-                       rmw[1].or_value = value & 0xff;
-
-                       /* 3 load high byte of DAC value next to latch the
-                          12-bit value */
-                       rmw[2].address = F020_SFR_DAC1H;
-                       rmw[2].and_mask = 0xff;
-                       rmw[2].or_value = (value >> 8) & 0xf;
-                       break;
-               }
-               result = dt9812_rmw_multiple_registers(slot->usb, 3, rmw);
-               slot->usb->analog_out_shadow[channel] = value;
+       return insn->n;
+}
+
+static int dt9812_do_insn_bits(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
+{
+       unsigned int mask = data[0];
+       unsigned int bits = data[1];
+
+       if (mask) {
+               s->state &= ~mask;
+               s->state |= (bits & mask);
+
+               dt9812_digital_out(dev, s->state);
        }
-       up(&slot->mutex);
 
-       return result;
+       data[1] = s->state;
+
+       return insn->n;
 }
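
(Replacing the per-channel insn_read/insn_write handlers with insn_bits
handlers follows the standard comedi bitfield convention: data[0] is a mask of
channels to update, data[1] carries the new values, and on return data[1]
reports the current state. A sketch of a call as seen from the core, with
hypothetical values:

	data[0] = 0x0f;	/* mask: touch channels 0-3 only      */
	data[1] = 0x05;	/* set ch0 and ch2, clear ch1 and ch3 */
	/* the handler updates s->state under the mask and returns
	 * the state of all eight output lines in data[1] */
)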
 
-/*
- * USB framework functions
- */
+static int dt9812_ai_insn_read(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
+{
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       u16 val = 0;
+       int ret;
+       int i;
 
-static int dt9812_probe(struct usb_interface *interface,
-                       const struct usb_device_id *id)
+       for (i = 0; i < insn->n; i++) {
+               ret = dt9812_analog_in(dev, chan, &val, DT9812_GAIN_1);
+               if (ret)
+                       return ret;
+               data[i] = val;
+       }
+
+       return insn->n;
+}
+
+static int dt9812_ao_insn_read(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
 {
-       int retval = -ENOMEM;
-       struct usb_dt9812 *dev = NULL;
-       struct usb_host_interface *iface_desc;
-       struct usb_endpoint_descriptor *endpoint;
+       struct dt9812_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
        int i;
-       u8 fw;
 
-       /* allocate memory for our device state and initialize it */
-       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-       if (dev == NULL)
-               goto error;
+       down(&devpriv->sem);
+       for (i = 0; i < insn->n; i++)
+               data[i] = devpriv->ao_shadow[chan];
+       up(&devpriv->sem);
+
+       return insn->n;
+}
+
+static int dt9812_ao_insn_write(struct comedi_device *dev,
+                               struct comedi_subdevice *s,
+                               struct comedi_insn *insn,
+                               unsigned int *data)
+{
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       int ret;
+       int i;
 
-       kref_init(&dev->kref);
+       for (i = 0; i < insn->n; i++) {
+               ret = dt9812_analog_out(dev, chan, data[i]);
+               if (ret)
+                       return ret;
+       }
 
-       dev->udev = usb_get_dev(interface_to_usbdev(interface));
-       dev->interface = interface;
+       return insn->n;
+}
 
-       /* Check endpoints */
-       iface_desc = interface->cur_altsetting;
+static int dt9812_find_endpoints(struct comedi_device *dev)
+{
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct usb_host_interface *host = intf->cur_altsetting;
+       struct dt9812_private *devpriv = dev->private;
+       struct usb_endpoint_descriptor *ep;
+       int i;
 
-       if (iface_desc->desc.bNumEndpoints != 5) {
-               dev_err(&interface->dev, "Wrong number of endpoints.\n");
-               retval = -ENODEV;
-               goto error;
+       if (host->desc.bNumEndpoints != 5) {
+               dev_err(dev->class_dev, "Wrong number of endpoints\n");
+               return -ENODEV;
        }
 
-       for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) {
-               int direction = -1;
-               endpoint = &iface_desc->endpoint[i].desc;
+       for (i = 0; i < host->desc.bNumEndpoints; ++i) {
+               int dir = -1;
+               ep = &host->endpoint[i].desc;
                switch (i) {
                case 0:
-                       direction = USB_DIR_IN;
-                       dev->message_pipe.addr = endpoint->bEndpointAddress;
-                       dev->message_pipe.size =
-                           le16_to_cpu(endpoint->wMaxPacketSize);
-
+                       /* unused message pipe */
+                       dir = USB_DIR_IN;
                        break;
                case 1:
-                       direction = USB_DIR_OUT;
-                       dev->command_write.addr = endpoint->bEndpointAddress;
-                       dev->command_write.size =
-                           le16_to_cpu(endpoint->wMaxPacketSize);
+                       dir = USB_DIR_OUT;
+                       devpriv->cmd_wr.addr = ep->bEndpointAddress;
+                       devpriv->cmd_wr.size = le16_to_cpu(ep->wMaxPacketSize);
                        break;
                case 2:
-                       direction = USB_DIR_IN;
-                       dev->command_read.addr = endpoint->bEndpointAddress;
-                       dev->command_read.size =
-                           le16_to_cpu(endpoint->wMaxPacketSize);
+                       dir = USB_DIR_IN;
+                       devpriv->cmd_rd.addr = ep->bEndpointAddress;
+                       devpriv->cmd_rd.size = le16_to_cpu(ep->wMaxPacketSize);
                        break;
                case 3:
-                       direction = USB_DIR_OUT;
-                       dev->write_stream.addr = endpoint->bEndpointAddress;
-                       dev->write_stream.size =
-                           le16_to_cpu(endpoint->wMaxPacketSize);
+                       /* unused write stream */
+                       dir = USB_DIR_OUT;
                        break;
                case 4:
-                       direction = USB_DIR_IN;
-                       dev->read_stream.addr = endpoint->bEndpointAddress;
-                       dev->read_stream.size =
-                           le16_to_cpu(endpoint->wMaxPacketSize);
+                       /* unused read stream */
+                       dir = USB_DIR_IN;
                        break;
                }
-               if ((endpoint->bEndpointAddress & USB_DIR_IN) != direction) {
-                       dev_err(&interface->dev,
-                               "Endpoint has wrong direction.\n");
-                       retval = -ENODEV;
-                       goto error;
+               if ((ep->bEndpointAddress & USB_DIR_IN) != dir) {
+                       dev_err(dev->class_dev,
+                               "Endpoint has wrong direction\n");
+                       return -ENODEV;
                }
        }
-       if (dt9812_read_info(dev, 0, &fw, sizeof(fw)) != 0) {
+       return 0;
+}
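
(dt9812_find_endpoints() insists on the device's fixed five-endpoint layout
and checks each endpoint's direction bit; only the command pair is recorded,
since the stream pipes are not used by this driver:

	/* ep0: message pipe  IN   (unused)
	 * ep1: command write OUT  -> devpriv->cmd_wr
	 * ep2: command read  IN   -> devpriv->cmd_rd
	 * ep3: write stream  OUT  (unused)
	 * ep4: read stream   IN   (unused)
	 */
)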
+
+static int dt9812_reset_device(struct comedi_device *dev)
+{
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct dt9812_private *devpriv = dev->private;
+       u32 serial;
+       u16 vendor;
+       u16 product;
+       u16 tmp16;
+       u8 tmp8;
+       int ret;
+       int i;
+
+       ret = dt9812_read_info(dev, 0, &tmp8, sizeof(tmp8));
+       if (ret) {
                /*
                 * Seems like a configuration reset is necessary if driver is
                 * reloaded while device is attached
                 */
-               usb_reset_configuration(dev->udev);
+               usb_reset_configuration(usb);
                for (i = 0; i < 10; i++) {
-                       retval = dt9812_read_info(dev, 1, &fw, sizeof(fw));
-                       if (retval == 0) {
-                               dev_info(&interface->dev,
-                                        "usb_reset_configuration succeeded "
-                                        "after %d iterations\n", i);
+                       ret = dt9812_read_info(dev, 1, &tmp8, sizeof(tmp8));
+                       if (ret == 0)
                                break;
-                       }
                }
-       }
-
-       if (dt9812_read_info(dev, 1, &dev->vendor, sizeof(dev->vendor)) != 0) {
-               dev_err(&interface->dev, "Failed to read vendor.\n");
-               retval = -ENODEV;
-               goto error;
-       }
-       if (dt9812_read_info(dev, 3, &dev->product, sizeof(dev->product)) != 0) {
-               dev_err(&interface->dev, "Failed to read product.\n");
-               retval = -ENODEV;
-               goto error;
-       }
-       if (dt9812_read_info(dev, 5, &dev->device, sizeof(dev->device)) != 0) {
-               dev_err(&interface->dev, "Failed to read device.\n");
-               retval = -ENODEV;
-               goto error;
-       }
-       if (dt9812_read_info(dev, 7, &dev->serial, sizeof(dev->serial)) != 0) {
-               dev_err(&interface->dev, "Failed to read serial.\n");
-               retval = -ENODEV;
-               goto error;
-       }
-
-       dev->vendor = le16_to_cpu(dev->vendor);
-       dev->product = le16_to_cpu(dev->product);
-       dev->device = le16_to_cpu(dev->device);
-       dev->serial = le32_to_cpu(dev->serial);
-       switch (dev->device) {
-       case DT9812_DEVID_DT9812_10:
-               dev->analog_out_shadow[0] = 0x0800;
-               dev->analog_out_shadow[1] = 0x800;
-               break;
-       case DT9812_DEVID_DT9812_2PT5:
-               dev->analog_out_shadow[0] = 0x0000;
-               dev->analog_out_shadow[1] = 0x0000;
-               break;
-       }
-       dev->digital_out_shadow = 0;
-
-       /* save our data pointer in this interface device */
-       usb_set_intfdata(interface, dev);
-
-       /* let the user know what node this device is now attached to */
-       dev_info(&interface->dev, "USB DT9812 (%4.4x.%4.4x.%4.4x) #0x%8.8x\n",
-                dev->vendor, dev->product, dev->device, dev->serial);
-
-       down(&dt9812_mutex);
-       {
-               /* Find a slot for the USB device */
-               struct slot_dt9812 *first = NULL;
-               struct slot_dt9812 *best = NULL;
-
-               for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-                       if (!first && !dt9812[i].usb && dt9812[i].serial == 0)
-                               first = &dt9812[i];
-                       if (!best && dt9812[i].serial == dev->serial)
-                               best = &dt9812[i];
-               }
-
-               if (!best)
-                       best = first;
-
-               if (best) {
-                       down(&best->mutex);
-                       best->usb = dev;
-                       dev->slot = best;
-                       up(&best->mutex);
+               if (ret) {
+                       dev_err(dev->class_dev,
+                               "unable to reset configuration\n");
+                       return ret;
                }
        }
-       up(&dt9812_mutex);
-
-       return 0;
 
-error:
-       if (dev)
-               kref_put(&dev->kref, dt9812_delete);
-       return retval;
-}
-
-static void dt9812_disconnect(struct usb_interface *interface)
-{
-       struct usb_dt9812 *dev;
-       int minor = interface->minor;
-
-       down(&dt9812_mutex);
-       dev = usb_get_intfdata(interface);
-       if (dev->slot) {
-               down(&dev->slot->mutex);
-               dev->slot->usb = NULL;
-               up(&dev->slot->mutex);
-               dev->slot = NULL;
+       ret = dt9812_read_info(dev, 1, &vendor, sizeof(vendor));
+       if (ret) {
+               dev_err(dev->class_dev, "failed to read vendor id\n");
+               return ret;
        }
-       usb_set_intfdata(interface, NULL);
-       up(&dt9812_mutex);
-
-       /* queue final destruction */
-       kref_put(&dev->kref, dt9812_delete);
-
-       dev_info(&interface->dev, "USB Dt9812 #%d now disconnected\n", minor);
-}
+       vendor = le16_to_cpu(vendor);
 
-static struct usb_driver dt9812_usb_driver = {
-       .name = "dt9812",
-       .probe = dt9812_probe,
-       .disconnect = dt9812_disconnect,
-       .id_table = dt9812_table,
-};
-
-/*
- * Comedi functions
- */
-
-static int dt9812_comedi_open(struct comedi_device *dev)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       int result = -ENODEV;
-
-       down(&devpriv->slot->mutex);
-       if (devpriv->slot->usb) {
-               /* We have an attached device, fill in current range info */
-               struct comedi_subdevice *s;
-
-               s = &dev->subdevices[0];
-               s->n_chan = 8;
-               s->maxdata = 1;
-
-               s = &dev->subdevices[1];
-               s->n_chan = 8;
-               s->maxdata = 1;
-
-               s = &dev->subdevices[2];
-               s->n_chan = 8;
-               switch (devpriv->slot->usb->device) {
-               case 0:{
-                               s->maxdata = 4095;
-                               s->range_table = &range_bipolar10;
-                       }
-                       break;
-               case 1:{
-                               s->maxdata = 4095;
-                               s->range_table = &range_unipolar2_5;
-                       }
-                       break;
-               }
-
-               s = &dev->subdevices[3];
-               s->n_chan = 2;
-               switch (devpriv->slot->usb->device) {
-               case 0:{
-                               s->maxdata = 4095;
-                               s->range_table = &range_bipolar10;
-                       }
-                       break;
-               case 1:{
-                               s->maxdata = 4095;
-                               s->range_table = &range_unipolar2_5;
-                       }
-                       break;
-               }
-               result = 0;
+       ret = dt9812_read_info(dev, 3, &product, sizeof(product));
+       if (ret) {
+               dev_err(dev->class_dev, "failed to read product id\n");
+               return ret;
        }
-       up(&devpriv->slot->mutex);
-       return result;
-}
-
-static int dt9812_di_rinsn(struct comedi_device *dev,
-                          struct comedi_subdevice *s, struct comedi_insn *insn,
-                          unsigned int *data)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       unsigned int channel = CR_CHAN(insn->chanspec);
-       int n;
-       u8 bits = 0;
-
-       dt9812_digital_in(devpriv->slot, &bits);
-       for (n = 0; n < insn->n; n++)
-               data[n] = ((1 << channel) & bits) != 0;
-       return n;
-}
+       product = le16_to_cpu(product);
 
-static int dt9812_do_winsn(struct comedi_device *dev,
-                          struct comedi_subdevice *s, struct comedi_insn *insn,
-                          unsigned int *data)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       unsigned int channel = CR_CHAN(insn->chanspec);
-       int n;
-       u8 bits = 0;
-
-       dt9812_digital_out_shadow(devpriv->slot, &bits);
-       for (n = 0; n < insn->n; n++) {
-               u8 mask = 1 << channel;
-
-               bits &= ~mask;
-               if (data[n])
-                       bits |= mask;
+       ret = dt9812_read_info(dev, 5, &tmp16, sizeof(tmp16));
+       if (ret) {
+               dev_err(dev->class_dev, "failed to read device id\n");
+               return ret;
        }
-       dt9812_digital_out(devpriv->slot, bits);
-       return n;
-}
+       devpriv->device = le16_to_cpu(tmp16);
 
-static int dt9812_ai_rinsn(struct comedi_device *dev,
-                          struct comedi_subdevice *s, struct comedi_insn *insn,
-                          unsigned int *data)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       unsigned int channel = CR_CHAN(insn->chanspec);
-       int n;
-
-       for (n = 0; n < insn->n; n++) {
-               u16 value = 0;
-
-               dt9812_analog_in(devpriv->slot, channel, &value, DT9812_GAIN_1);
-               data[n] = value;
+       ret = dt9812_read_info(dev, 7, &serial, sizeof(serial));
+       if (ret) {
+               dev_err(dev->class_dev, "failed to read serial number\n");
+               return ret;
        }
-       return n;
-}
+       serial = le32_to_cpu(serial);
 
-static int dt9812_ao_rinsn(struct comedi_device *dev,
-                          struct comedi_subdevice *s, struct comedi_insn *insn,
-                          unsigned int *data)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       unsigned int channel = CR_CHAN(insn->chanspec);
-       int n;
-       u16 value;
-
-       for (n = 0; n < insn->n; n++) {
-               value = 0;
-               dt9812_analog_out_shadow(devpriv->slot, channel, &value);
-               data[n] = value;
-       }
-       return n;
-}
+       /* let the user know what node this device is now attached to */
+       dev_info(dev->class_dev, "USB DT9812 (%4.4x.%4.4x.%4.4x) #0x%8.8x\n",
+                vendor, product, devpriv->device, serial);
 
-static int dt9812_ao_winsn(struct comedi_device *dev,
-                          struct comedi_subdevice *s, struct comedi_insn *insn,
-                          unsigned int *data)
-{
-       struct comedi_dt9812 *devpriv = dev->private;
-       unsigned int channel = CR_CHAN(insn->chanspec);
-       int n;
+       if (devpriv->device != DT9812_DEVID_DT9812_10 &&
+           devpriv->device != DT9812_DEVID_DT9812_2PT5) {
+               dev_err(dev->class_dev, "Unsupported device!\n");
+               return -EINVAL;
+       }
 
-       for (n = 0; n < insn->n; n++)
-               dt9812_analog_out(devpriv->slot, channel, data[n]);
-       return n;
+       return 0;
 }
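
(dt9812_reset_device() pulls the identification block out of the device flash
with dt9812_read_info(); the offsets and the le16/le32 conversions above imply
this layout, inferred from the reads rather than from a datasheet:

	/* offset 1: vendor id  (__le16)
	 * offset 3: product id (__le16)
	 * offset 5: device id  (__le16) -> DT9812_DEVID_DT9812_10 / _2PT5
	 * offset 7: serial     (__le32)
	 */
)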
 
-static int dt9812_attach(struct comedi_device *dev, struct comedi_devconfig *it)
+static int dt9812_auto_attach(struct comedi_device *dev,
+                             unsigned long context)
 {
-       struct comedi_dt9812 *devpriv;
-       int i;
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct dt9812_private *devpriv;
        struct comedi_subdevice *s;
+       bool is_unipolar;
        int ret;
 
        devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
@@ -1017,125 +786,107 @@ static int dt9812_attach(struct comedi_device *dev, struct comedi_devconfig *it)
                return -ENOMEM;
        dev->private = devpriv;
 
-       /*
-        * Special open routine, since USB unit may be unattached at
-        * comedi_config time, hence range can not be determined
-        */
-       dev->open = dt9812_comedi_open;
+       sema_init(&devpriv->sem, 1);
+       usb_set_intfdata(intf, devpriv);
 
-       devpriv->serial = it->options[0];
+       ret = dt9812_find_endpoints(dev);
+       if (ret)
+               return ret;
+
+       ret = dt9812_reset_device(dev);
+       if (ret)
+               return ret;
+
+       is_unipolar = (devpriv->device == DT9812_DEVID_DT9812_2PT5);
 
        ret = comedi_alloc_subdevices(dev, 4);
        if (ret)
                return ret;
 
-       /* digital input subdevice */
+       /* Digital Input subdevice */
        s = &dev->subdevices[0];
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->n_chan = 0;
-       s->maxdata = 1;
-       s->range_table = &range_digital;
-       s->insn_read = &dt9812_di_rinsn;
-
-       /* digital output subdevice */
+       s->type         = COMEDI_SUBD_DI;
+       s->subdev_flags = SDF_READABLE;
+       s->n_chan       = 8;
+       s->maxdata      = 1;
+       s->range_table  = &range_digital;
+       s->insn_bits    = dt9812_di_insn_bits;
+
+       /* Digital Output subdevice */
        s = &dev->subdevices[1];
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITEABLE;
-       s->n_chan = 0;
-       s->maxdata = 1;
-       s->range_table = &range_digital;
-       s->insn_write = &dt9812_do_winsn;
-
-       /* analog input subdevice */
+       s->type         = COMEDI_SUBD_DO;
+       s->subdev_flags = SDF_WRITEABLE;
+       s->n_chan       = 8;
+       s->maxdata      = 1;
+       s->range_table  = &range_digital;
+       s->insn_bits    = dt9812_do_insn_bits;
+
+       /* Analog Input subdevice */
        s = &dev->subdevices[2];
-       s->type = COMEDI_SUBD_AI;
-       s->subdev_flags = SDF_READABLE | SDF_GROUND;
-       s->n_chan = 0;
-       s->maxdata = 1;
-       s->range_table = NULL;
-       s->insn_read = &dt9812_ai_rinsn;
-
-       /* analog output subdevice */
+       s->type         = COMEDI_SUBD_AI;
+       s->subdev_flags = SDF_READABLE | SDF_GROUND;
+       s->n_chan       = 8;
+       s->maxdata      = 0x0fff;
+       s->range_table  = is_unipolar ? &range_unipolar2_5 : &range_bipolar10;
+       s->insn_read    = dt9812_ai_insn_read;
+
+       /* Analog Output subdevice */
        s = &dev->subdevices[3];
-       s->type = COMEDI_SUBD_AO;
-       s->subdev_flags = SDF_WRITEABLE;
-       s->n_chan = 0;
-       s->maxdata = 1;
-       s->range_table = NULL;
-       s->insn_write = &dt9812_ao_winsn;
-       s->insn_read = &dt9812_ao_rinsn;
-
-       dev_info(dev->class_dev, "successfully attached to dt9812.\n");
-
-       down(&dt9812_mutex);
-       /* Find a slot for the comedi device */
-       {
-               struct slot_dt9812 *first = NULL;
-               struct slot_dt9812 *best = NULL;
-               for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-                       if (!first && !dt9812[i].comedi) {
-                               /* First free slot from comedi side */
-                               first = &dt9812[i];
-                       }
-                       if (!best &&
-                           dt9812[i].usb &&
-                           dt9812[i].usb->serial == devpriv->serial) {
-                               /* We have an attaced device with matching ID */
-                               best = &dt9812[i];
-                       }
-               }
-               if (!best)
-                       best = first;
-               if (best) {
-                       down(&best->mutex);
-                       best->comedi = devpriv;
-                       best->serial = devpriv->serial;
-                       devpriv->slot = best;
-                       up(&best->mutex);
-               }
-       }
-       up(&dt9812_mutex);
+       s->type         = COMEDI_SUBD_AO;
+       s->subdev_flags = SDF_WRITEABLE;
+       s->n_chan       = 2;
+       s->maxdata      = 0x0fff;
+       s->range_table  = is_unipolar ? &range_unipolar2_5 : &range_bipolar10;
+       s->insn_write   = dt9812_ao_insn_write;
+       s->insn_read    = dt9812_ao_insn_read;
+
+       devpriv->ao_shadow[0] = is_unipolar ? 0x0000 : 0x0800;
+       devpriv->ao_shadow[1] = is_unipolar ? 0x0000 : 0x0800;
 
        return 0;
 }
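
(The ao_shadow start values correspond to 0 V on both board variants: 0x0800
is mid-scale of the 12-bit DAC, which is 0 V on the bipolar range, while the
unipolar boards start at code 0x0000:

	/* 12-bit DAC, 4096 codes:
	 * bipolar +/-10 V: 0x0800 = 2048 -> -10 V + (2048/4096) * 20 V = 0 V
	 * unipolar 2.5 V:  0x0000 -> 0 V
	 */
)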
 
 static void dt9812_detach(struct comedi_device *dev)
 {
-       /* Nothing to cleanup */
-}
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct dt9812_private *devpriv = dev->private;
 
-static struct comedi_driver dt9812_comedi_driver = {
-       .module = THIS_MODULE,
-       .driver_name = "dt9812",
-       .attach = dt9812_attach,
-       .detach = dt9812_detach,
-};
+       if (!devpriv)
+               return;
 
-static int __init usb_dt9812_init(void)
-{
-       int i;
+       down(&devpriv->sem);
 
-       /* Initialize all driver slots */
-       for (i = 0; i < DT9812_NUM_SLOTS; i++) {
-               sema_init(&dt9812[i].mutex, 1);
-               dt9812[i].serial = 0;
-               dt9812[i].usb = NULL;
-               dt9812[i].comedi = NULL;
-       }
-       dt9812[12].serial = 0x0;
+       usb_set_intfdata(intf, NULL);
 
-       return comedi_usb_driver_register(&dt9812_comedi_driver,
-                                               &dt9812_usb_driver);
+       up(&devpriv->sem);
 }
 
-static void __exit usb_dt9812_exit(void)
+static struct comedi_driver dt9812_driver = {
+       .driver_name    = "dt9812",
+       .module         = THIS_MODULE,
+       .auto_attach    = dt9812_auto_attach,
+       .detach         = dt9812_detach,
+};
+
+static int dt9812_usb_probe(struct usb_interface *intf,
+                           const struct usb_device_id *id)
 {
-       comedi_usb_driver_unregister(&dt9812_comedi_driver, &dt9812_usb_driver);
+       return comedi_usb_auto_config(intf, &dt9812_driver, id->driver_info);
 }
 
-module_init(usb_dt9812_init);
-module_exit(usb_dt9812_exit);
+static const struct usb_device_id dt9812_usb_table[] = {
+       { USB_DEVICE(0x0867, 0x9812) },
+       { }
+};
+MODULE_DEVICE_TABLE(usb, dt9812_usb_table);
+
+static struct usb_driver dt9812_usb_driver = {
+       .name           = "dt9812",
+       .id_table       = dt9812_usb_table,
+       .probe          = dt9812_usb_probe,
+       .disconnect     = comedi_usb_auto_unconfig,
+};
+module_comedi_usb_driver(dt9812_driver, dt9812_usb_driver);
 
 MODULE_AUTHOR("Anders Blomdell <anders.blomdell@control.lth.se>");
 MODULE_DESCRIPTION("Comedi DT9812 driver");
index 93ec8e492cccbfa0b6abf90754bd2226225adeab..e14dd3ae9ec6c5ccff9bffe9ccc476e8e17423b8 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 0c061df0978cc1fe1aa670bb1049db826444a0b6..2fceff93867bee453c9a1c61e6e7dc74e603dfca 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************/
+*/
 
 /*
  * Driver: gsc_hpdi
index 08ab9d6e71909317e2d50bfe89d5ca3bf075cd23..a11e015dc03d3207a28a36e9cca0a07dc6e1bec8 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 90b303ab23000a24d81dd0bed3ba6920e002f778..94609f4aa4c94fe11815f39bb0e95e29c2381bc7 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
  * Driver: jr3_pci
@@ -46,7 +41,6 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
-#include <linux/firmware.h>
 #include <linux/jiffies.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
@@ -97,37 +91,6 @@ struct jr3_pci_subdev_private {
        int retries;
 };
 
-/* Hotplug firmware loading stuff */
-static int comedi_load_firmware(struct comedi_device *dev, const char *name,
-                               int (*cb)(struct comedi_device *dev,
-                                       const u8 *data, size_t size))
-{
-       struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-       int result = 0;
-       const struct firmware *fw;
-       char *firmware_path;
-       static const char *prefix = "comedi/";
-
-       firmware_path = kmalloc(strlen(prefix) + strlen(name) + 1, GFP_KERNEL);
-       if (!firmware_path) {
-               result = -ENOMEM;
-       } else {
-               firmware_path[0] = '\0';
-               strcat(firmware_path, prefix);
-               strcat(firmware_path, name);
-               result = request_firmware(&fw, firmware_path, &pcidev->dev);
-               if (result == 0) {
-                       if (!cb)
-                               result = -EINVAL;
-                       else
-                               result = cb(dev, fw->data, fw->size);
-                       release_firmware(fw);
-               }
-               kfree(firmware_path);
-       }
-       return result;
-}
-
 static struct poll_delay_t poll_delay_min_max(int min, int max)
 {
        struct poll_delay_t result;
@@ -362,8 +325,9 @@ static int read_idm_word(const u8 *data, size_t size, int *pos,
        return result;
 }
 
-static int jr3_download_firmware(struct comedi_device *dev, const u8 *data,
-                                size_t size)
+static int jr3_download_firmware(struct comedi_device *dev,
+                                const u8 *data, size_t size,
+                                unsigned long context)
 {
        /*
         * IDM file format is:
@@ -768,7 +732,9 @@ static int jr3_pci_auto_attach(struct comedi_device *dev,
        /*  Reset DSP card */
        writel(0, &devpriv->iobase->channel[0].reset);
 
-       result = comedi_load_firmware(dev, "jr3pci.idm", jr3_download_firmware);
+       result = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+                                     "comedi/jr3pci.idm",
+                                     jr3_download_firmware, 0);
        dev_dbg(dev->class_dev, "Firmware load %d\n", result);
 
        if (result < 0)
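
(Both this driver and the me2600 change below drop their private
request_firmware() wrappers in favour of the core comedi_load_firmware()
helper, which no longer prepends the "comedi/" prefix for the caller; note the
full "comedi/jr3pci.idm" name passed above. The callback shape the helper
expects, as implemented by jr3_download_firmware():

	int (*cb)(struct comedi_device *dev,
		  const u8 *data, size_t size,
		  unsigned long context);
)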
@@ -778,8 +744,9 @@ static int jr3_pci_auto_attach(struct comedi_device *dev,
         * format:
         *     model serial Fx Fy Fz Mx My Mz\n
         *
-        *     comedi_load_firmware(dev, "jr3_offsets_table",
-        *                          jr3_download_firmware);
+        *     comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+        *                          "comedi/jr3_offsets_table",
+        *                          jr3_download_firmware, 1);
         */
 
        /*
index e0e64752e3103032e2724c6f9c541a7d1064ab74..f10cf10e5fe327cf155760a1fed71aaa1f730533 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ke_counter
index 641e693d5d0e9ffd68f5a43173450f7b2b0d1277..c2308fd24d6a26445012b5bd5b1ae77738832dbb 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: me4000
index 09f2a9feaf7ce28c7303c52392380467f5f273fa..7533ece3670e30aca9cf07d54ad6b6025a1ded4a 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -37,7 +33,6 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -391,7 +386,8 @@ static int me_ao_insn_read(struct comedi_device *dev,
 }
 
 static int me2600_xilinx_download(struct comedi_device *dev,
-                                 const u8 *data, size_t size)
+                                 const u8 *data, size_t size,
+                                 unsigned long context)
 {
        struct me_private_data *dev_private = dev->private;
        unsigned int value;
@@ -460,22 +456,6 @@ static int me2600_xilinx_download(struct comedi_device *dev,
        return 0;
 }
 
-static int me2600_upload_firmware(struct comedi_device *dev)
-{
-       struct pci_dev *pcidev = comedi_to_pci_dev(dev);
-       const struct firmware *fw;
-       int ret;
-
-       ret = request_firmware(&fw, ME2600_FIRMWARE, &pcidev->dev);
-       if (ret)
-               return ret;
-
-       ret = me2600_xilinx_download(dev, fw->data, fw->size);
-       release_firmware(fw);
-
-       return ret;
-}
-
 static int me_reset(struct comedi_device *dev)
 {
        struct me_private_data *dev_private = dev->private;
@@ -529,7 +509,9 @@ static int me_auto_attach(struct comedi_device *dev,
 
        /* Download firmware and reset card */
        if (board->needs_firmware) {
-               ret = me2600_upload_firmware(dev);
+               ret = comedi_load_firmware(dev, &comedi_to_pci_dev(dev)->dev,
+                                          ME2600_FIRMWARE,
+                                          me2600_xilinx_download, 0);
                if (ret < 0)
                        return ret;
        }
index 523c6564ffcaeaaa94b0d2e98fdae95b5226b0f1..12c34db61d637c0048fdb543cf9265e384b6f496 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 255b8ba9c917b72b37d66d1cad7c17fc218a719b..d4487e888e64fa6a7dfcc73e492f821696829034 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _MITE_H_
index 4717be4ad26829bff589fa73157ae73d8f14b52f..713842ad6ff67a9fc842e3c4f31d3a6e85eb37ed 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: mpc624
index 7a8292086e1818a67894c21c23349fde32fa51f1..5ecd1b1666fbe35a3070a66c2de974f5551c3462 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: multiq3
index d10f777b7f179f5da2e540dbc39d0008b3a0c024..903c2ef5dd9a670e84a3e50135ae373eaf5ccf34 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_6527
index 3f71f0f54d3ce257398b9eee038aea4f3f5f6fa1..6a89e5c166e3a0031d9b18064b404679c90cbdd4 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_65xx
index 5cdda7fe97a71fd463ff8d04aeaf5ca1ac826f56..a9e000461ec7f180bb8f8870d512cc52c1039f1c 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 42ab6dbf9d3940cb5fcf2af2ce2f2404eb65277c..1a185b9c529f1f7a3a6b8b1b294862f89aee3b7a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_670x
index 2d375168f36d41ed7b8f70537d9e65630ef910c7..7ea5aa32e9d2124e9a4ccc47eba9ae9ab39ce64a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_at_a2150
index 7e5783a4f4e707efc0e626aa7c23a5005a6ff6b2..e080053c697b8a98691115f2338209a4aa5afb34 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_at_ao
index 4ced7ba119b05463473d6898ccf5e8951b746171..713edd55a91b6fcf7344e048fce02e2ba59d832d 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: ni_atmio
index 6c97a0925aad06123ff5e8467aac51e1489e70f5..d280332d32d193edc58aa50971f5169d45a9aab2 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: ni_atmio16d
index d067ef70e194d3e736e28eb975d95502b4565392..3c50e31ecc6070518895114c2769bee0d73dc124 100644 (file)
  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *     GNU General Public License for more details.
- *
- *     You should have received a copy of the GNU General Public License
- *     along with this program; if not, write to the Free Software
- *     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
  */
 
 /*
@@ -40,7 +35,7 @@ port, bit 0; channel 8 corresponds to the input port, bit 0.
 Digital direction configuration: channels 0-7 output, 8-15 input (8225 device
 emu as port A output, port B input, port C N/A).
 
-Analog: The input  range is 0 to 4095 for -10 to +10 volts 
+Analog: The input  range is 0 to 4095 for -10 to +10 volts
 IRQ is assigned but not used.
 
 Version 0.1    Original DIO only driver
@@ -183,7 +178,7 @@ static int daq700_ai_rinsn(struct comedi_device *dev,
  */
 static void daq700_ai_config(struct comedi_device *dev,
                             struct comedi_subdevice *s)
-{                      
+{
        unsigned long iobase = dev->iobase;
 
        outb(0x80, iobase + CMD_R1);    /* disable scanning, ADC to chan 0 */
index 9b7805fda93252c7162f5c87b7005494ee470942..6ff1526bd063a97c082fe0b3622a3d0ed42f27bc 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_daq_dio24
index 77a7bb63258034ae0dd2b88bd692a0b6da9fe317..a918b7ffbe22d28d58ecd00d9c8a339398c80eca 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -88,7 +84,7 @@
 #define CMD1_REG               0x00    /* W: Command 1 reg */
 #define CMD1_MA(x)             (((x) & 0x7) << 0)
 #define CMD1_TWOSCMP           (1 << 3)
-#define CMD1_GAIN_MASK         (7 << 4)
+#define CMD1_GAIN(x)           (((x) & 0x7) << 4)
 #define CMD1_SCANEN            (1 << 7)
 #define CMD2_REG               0x01    /* W: Command 2 reg */
 #define CMD2_PRETRIG           (1 << 0)
@@ -153,11 +149,6 @@ enum scan_mode {
        MODE_MULT_CHAN_DOWN,
 };
 
-static const int labpc_plus_ai_gain_bits[] = {
-       0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-       0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-};
-
 static const struct comedi_lrange range_labpc_plus_ai = {
        16, {
                BIP_RANGE(5),
@@ -179,13 +170,7 @@ static const struct comedi_lrange range_labpc_plus_ai = {
        }
 };
 
-const int labpc_1200_ai_gain_bits[] = {
-       0x00, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-       0x00, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
-};
-EXPORT_SYMBOL_GPL(labpc_1200_ai_gain_bits);
-
-const struct comedi_lrange range_labpc_1200_ai = {
+static const struct comedi_lrange range_labpc_1200_ai = {
        14, {
                BIP_RANGE(5),
                BIP_RANGE(2.5),
@@ -203,7 +188,6 @@ const struct comedi_lrange range_labpc_1200_ai = {
                UNI_RANGE(0.1)
        }
 };
-EXPORT_SYMBOL_GPL(range_labpc_1200_ai);
 
 static const struct comedi_lrange range_labpc_ao = {
        2, {
@@ -239,25 +223,18 @@ static const struct labpc_boardinfo labpc_boards[] = {
        {
                .name                   = "lab-pc-1200",
                .ai_speed               = 10000,
-               .register_layout        = labpc_1200_layout,
-               .has_ao                 = 1,
-               .ai_range_table         = &range_labpc_1200_ai,
-               .ai_range_code          = labpc_1200_ai_gain_bits,
                .ai_scan_up             = 1,
+               .has_ao                 = 1,
+               .is_labpc1200           = 1,
        }, {
                .name                   = "lab-pc-1200ai",
                .ai_speed               = 10000,
-               .register_layout        = labpc_1200_layout,
-               .ai_range_table         = &range_labpc_1200_ai,
-               .ai_range_code          = labpc_1200_ai_gain_bits,
                .ai_scan_up             = 1,
+               .is_labpc1200           = 1,
        }, {
                .name                   = "lab-pc+",
                .ai_speed               = 12000,
-               .register_layout        = labpc_plus_layout,
                .has_ao                 = 1,
-               .ai_range_table         = &range_labpc_plus_ai,
-               .ai_range_code          = labpc_plus_ai_gain_bits,
        },
 };
 #endif
@@ -326,12 +303,21 @@ static void labpc_ai_set_chan_and_gain(struct comedi_device *dev,
        const struct labpc_boardinfo *board = comedi_board(dev);
        struct labpc_private *devpriv = dev->private;
 
+       if (board->is_labpc1200) {
+               /*
+                * The LabPC-1200 boards do not have a gain
+                * of '0x10'. Skip the range values that would
+                * result in this gain.
+                */
+               range += (range > 0) + (range > 7);
+       }
+
        /* munge channel bits for differential/scan disabled mode */
        if ((mode == MODE_SINGLE_CHAN || mode == MODE_SINGLE_CHAN_INTERVAL) &&
            aref == AREF_DIFF)
                chan *= 2;
        devpriv->cmd1 = CMD1_MA(chan);
-       devpriv->cmd1 |= board->ai_range_code[range];
+       devpriv->cmd1 |= CMD1_GAIN(range);
 
        devpriv->write_byte(devpriv->cmd1, dev->iobase + CMD1_REG);
 }
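
The new CMD1_GAIN() macro plus the range adjustment above replace the labpc_plus_ai_gain_bits[]/labpc_1200_ai_gain_bits[] lookup tables. A quick check of the skip arithmetic against the old labpc_1200_ai_gain_bits[] values (all taken from the hunks above): the 1200's 14-entry range table maps onto gain codes 0,2..7 in each of its bipolar and unipolar halves, code 1 (0x10) being unavailable:

    range 0: 0 + 0 + 0 = 0   -> CMD1_GAIN(0)  = 0x00  (old table: 0x00)
    range 1: 1 + 1 + 0 = 2   -> CMD1_GAIN(2)  = 0x20  (old table: 0x20)
    range 7: 7 + 1 + 0 = 8   -> CMD1_GAIN(8)  = 0x00  (old table: 0x00)
    range 8: 8 + 1 + 1 = 10  -> CMD1_GAIN(10) = 0x20  (old table: 0x20)

Because CMD1_GAIN() masks with 0x7 before shifting, adjusted indices 8..15 wrap around and yield the same gain codes for the unipolar half.
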
@@ -347,7 +333,7 @@ static void labpc_setup_cmd6_reg(struct comedi_device *dev,
        const struct labpc_boardinfo *board = comedi_board(dev);
        struct labpc_private *devpriv = dev->private;
 
-       if (board->register_layout != labpc_1200_layout)
+       if (!board->is_labpc1200)
                return;
 
        /* reference inputs to ground or common? */
@@ -759,7 +745,7 @@ static int labpc_ai_cmdtest(struct comedi_device *dev,
        err |= cfc_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT);
 
        stop_mask = TRIG_COUNT | TRIG_NONE;
-       if (board->register_layout == labpc_1200_layout)
+       if (board->is_labpc1200)
                stop_mask |= TRIG_EXT;
        err |= cfc_check_trigger_src(&cmd->stop_src, stop_mask);
 
@@ -895,7 +881,7 @@ static int labpc_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
                /* pc-plus has no fifo-half full interrupt */
        } else
 #endif
-       if (board->register_layout == labpc_1200_layout &&
+       if (board->is_labpc1200 &&
                   /*  wake-end-of-scan should interrupt on fifo not empty */
                   (cmd->flags & TRIG_WAKE_EOS) == 0 &&
                   /*  make sure we are taking more than just a few points */
@@ -1175,7 +1161,7 @@ static irqreturn_t labpc_interrupt(int irq, void *d)
 
        /* read board status */
        devpriv->stat1 = devpriv->read_byte(dev->iobase + STAT1_REG);
-       if (board->register_layout == labpc_1200_layout)
+       if (board->is_labpc1200)
                devpriv->stat2 = devpriv->read_byte(dev->iobase + STAT2_REG);
 
        if ((devpriv->stat1 & (STAT1_GATA0 | STAT1_CNTINT | STAT1_OVERFLOW |
@@ -1201,8 +1187,7 @@ static irqreturn_t labpc_interrupt(int irq, void *d)
                 * has occurred
                 */
                if (devpriv->stat1 & STAT1_GATA0 ||
-                   (board->register_layout == labpc_1200_layout
-                    && devpriv->stat2 & STAT2_OUTA1)) {
+                   (board->is_labpc1200 && devpriv->stat2 & STAT2_OUTA1)) {
                        handle_isa_dma(dev);
                }
        } else
@@ -1266,7 +1251,7 @@ static int labpc_ao_insn_write(struct comedi_device *dev,
        spin_unlock_irqrestore(&dev->spinlock, flags);
 
        /* set range */
-       if (board->register_layout == labpc_1200_layout) {
+       if (board->is_labpc1200) {
                range = CR_RANGE(insn->chanspec);
                if (labpc_range_is_unipolar(s, range))
                        devpriv->cmd6 |= CMD6_DACUNI(channel);
@@ -1603,7 +1588,7 @@ int labpc_common_attach(struct comedi_device *dev,
        devpriv->write_byte(devpriv->cmd2, dev->iobase + CMD2_REG);
        devpriv->write_byte(devpriv->cmd3, dev->iobase + CMD3_REG);
        devpriv->write_byte(devpriv->cmd4, dev->iobase + CMD4_REG);
-       if (board->register_layout == labpc_1200_layout) {
+       if (board->is_labpc1200) {
                devpriv->write_byte(devpriv->cmd5, dev->iobase + CMD5_REG);
                devpriv->write_byte(devpriv->cmd6, dev->iobase + CMD6_REG);
        }
@@ -1626,7 +1611,8 @@ int labpc_common_attach(struct comedi_device *dev,
        s->n_chan       = 8;
        s->len_chanlist = 8;
        s->maxdata      = 0x0fff;
-       s->range_table  = board->ai_range_table;
+       s->range_table  = board->is_labpc1200
+                               ? &range_labpc_1200_ai : &range_labpc_plus_ai;
        s->insn_read    = labpc_ai_insn_read;
        if (dev->irq) {
                dev->read_subdev = s;
@@ -1671,7 +1657,7 @@ int labpc_common_attach(struct comedi_device *dev,
 
        /*  calibration subdevices for boards that have one */
        s = &dev->subdevices[3];
-       if (board->register_layout == labpc_1200_layout) {
+       if (board->is_labpc1200) {
                s->type         = COMEDI_SUBD_CALIB;
                s->subdev_flags = SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL;
                s->n_chan       = 16;
@@ -1686,7 +1672,7 @@ int labpc_common_attach(struct comedi_device *dev,
 
        /* EEPROM */
        s = &dev->subdevices[4];
-       if (board->register_layout == labpc_1200_layout) {
+       if (board->is_labpc1200) {
                s->type         = COMEDI_SUBD_MEMORY;
                s->subdev_flags = SDF_READABLE | SDF_WRITABLE | SDF_INTERNAL;
                s->n_chan       = EEPROM_SIZE;
index 4b691f5a9965098cd6edebb5be04d940ea65af19..aa5c4d8bdf75f0c91012e21cb1ead935a508607d 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _NI_LABPC_H
 #define EEPROM_SIZE    256     /*  256 byte eeprom */
 #define NUM_AO_CHAN    2       /*  boards have two analog output channels */
 
-enum labpc_register_layout { labpc_plus_layout, labpc_1200_layout };
 enum transfer_type { fifo_not_empty_transfer, fifo_half_full_transfer,
        isa_dma_transfer
 };
 
 struct labpc_boardinfo {
        const char *name;
-       int device_id;          /*  device id for pci and pcmcia boards */
-       int ai_speed;           /*  maximum input speed in nanoseconds */
-
-       /*  1200 has extra registers compared to pc+ */
-       enum labpc_register_layout register_layout;
-       int has_ao;             /*  has analog output true/false */
-       const struct comedi_lrange *ai_range_table;
-       const int *ai_range_code;
-
-       /*  board can auto scan up in ai channels, not just down */
-       unsigned ai_scan_up:1;
-
-       /* uses memory mapped io instead of ioports */
-       unsigned has_mmio:1;
+       int ai_speed;                   /* maximum input speed in ns */
+       unsigned ai_scan_up:1;          /* can auto scan up in ai channels */
+       unsigned has_ao:1;              /* has analog outputs */
+       unsigned is_labpc1200:1;        /* has extra regs compared to pc+ */
+       unsigned has_mmio:1;            /* uses memory mapped io */
 };
 
 struct labpc_private {
@@ -103,7 +88,4 @@ int labpc_common_attach(struct comedi_device *dev,
                        unsigned int irq, unsigned long isr_flags);
 void labpc_common_detach(struct comedi_device *dev);
 
-extern const int labpc_1200_ai_gain_bits[];
-extern const struct comedi_lrange range_labpc_1200_ai;
-
 #endif /* _NI_LABPC_H */
index 9e3737c6918dc4e94d2b1d63ae2210ddc7d4a862..883581eb3dbe4b4123b86a5da5bc6a698df2eb9b 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-************************************************************************
 */
 /*
 Driver: ni_labpc_cs
@@ -76,12 +70,9 @@ NI manuals:
 static const struct labpc_boardinfo labpc_cs_boards[] = {
        {
                .name                   = "daqcard-1200",
-               .device_id              = 0x103,
                .ai_speed               = 10000,
-               .register_layout        = labpc_1200_layout,
                .has_ao                 = 1,
-               .ai_range_table         = &range_labpc_1200_ai,
-               .ai_range_code          = labpc_1200_ai_gain_bits,
+               .is_labpc1200           = 1,
        },
 };
 
index 8e916f86cceadd21877b994a101680eea6f525ec..1f80711bf3687813e424df600ef1bd6fef4937aa 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -50,11 +46,9 @@ static const struct labpc_boardinfo labpc_pci_boards[] = {
        [BOARD_NI_PCI1200] = {
                .name                   = "ni_pci-1200",
                .ai_speed               = 10000,
-               .register_layout        = labpc_1200_layout,
-               .has_ao                 = 1,
-               .ai_range_table         = &range_labpc_1200_ai,
-               .ai_range_code          = labpc_1200_ai_gain_bits,
                .ai_scan_up             = 1,
+               .has_ao                 = 1,
+               .is_labpc1200           = 1,
                .has_mmio               = 1,
        },
 };
index 8c5dee9b3b05188c85110ee171fb179d354bd858..1e78198a2253441e8defa9a48365e085c6b5c2c8 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 888be7b89d2dcff34aa6763ce7d66eed9b1fc4da..f813f5763671942072e62ce034d239e157e616bd 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_mio_cs
index b5f340c186ec74e839d894288e02b96803d8e1b9..5b2f72e102e10377bd62cd728dc6cb572e67c94b 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ni_pcidio
@@ -58,7 +53,6 @@ comedi_nonfree_firmware tarball available from http://www.comedi.org
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -971,11 +965,13 @@ static int ni_pcidio_change(struct comedi_device *dev,
        return 0;
 }
 
-static int pci_6534_load_fpga(struct comedi_device *dev, int fpga_index,
-                             const u8 *data, size_t data_len)
+static int pci_6534_load_fpga(struct comedi_device *dev,
+                             const u8 *data, size_t data_len,
+                             unsigned long context)
 {
        struct nidio96_private *devpriv = dev->private;
        static const int timeout = 1000;
+       int fpga_index = context;
        int i;
        size_t j;
 
@@ -1033,7 +1029,7 @@ static int pci_6534_load_fpga(struct comedi_device *dev, int fpga_index,
 
 static int pci_6534_reset_fpga(struct comedi_device *dev, int fpga_index)
 {
-       return pci_6534_load_fpga(dev, fpga_index, NULL, 0);
+       return pci_6534_load_fpga(dev, NULL, 0, fpga_index);
 }
 
 static int pci_6534_reset_fpgas(struct comedi_device *dev)
@@ -1067,13 +1063,12 @@ static void pci_6534_init_main_fpga(struct comedi_device *dev)
 static int pci_6534_upload_firmware(struct comedi_device *dev)
 {
        struct nidio96_private *devpriv = dev->private;
-       int ret;
-       const struct firmware *fw;
        static const char *const fw_file[3] = {
                FW_PCI_6534_SCARAB_DI,  /* loaded into scarab A for DI */
                FW_PCI_6534_SCARAB_DO,  /* loaded into scarab B for DO */
                FW_PCI_6534_MAIN,       /* loaded into main FPGA */
        };
+       int ret;
        int n;
 
        ret = pci_6534_reset_fpgas(dev);
@@ -1081,14 +1076,11 @@ static int pci_6534_upload_firmware(struct comedi_device *dev)
                return ret;
        /* load main FPGA first, then the two scarabs */
        for (n = 2; n >= 0; n--) {
-               ret = request_firmware(&fw, fw_file[n],
-                                      &devpriv->mite->pcidev->dev);
-               if (ret == 0) {
-                       ret = pci_6534_load_fpga(dev, n, fw->data, fw->size);
-                       if (ret == 0 && n == 2)
-                               pci_6534_init_main_fpga(dev);
-                       release_firmware(fw);
-               }
+               ret = comedi_load_firmware(dev, &devpriv->mite->pcidev->dev,
+                                          fw_file[n],
+                                          pci_6534_load_fpga, n);
+               if (ret == 0 && n == 2)
+                       pci_6534_init_main_fpga(dev);
                if (ret < 0)
                        break;
        }
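
In this driver the 'context' argument carries the FPGA index that used to be an explicit pci_6534_load_fpga() parameter, so one callback serves all three images, and the reset path reuses it with a NULL image. For example (identifiers are all from the hunks above; the standalone calls are illustrative):

    /* load scarab B (index 1) with the DO firmware */
    ret = comedi_load_firmware(dev, &devpriv->mite->pcidev->dev,
                               FW_PCI_6534_SCARAB_DO,
                               pci_6534_load_fpga, 1);

    /* reset the same FPGA: no image, just the index in 'context' */
    ret = pci_6534_load_fpga(dev, NULL, 0, 1);
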
index 634d02303aa046c1a6092ada8c3df6f52b3ba12b..35681ba1f369333d5eebf69f1ecfc2d6ccd90c3f 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: ni_pcimio
index 0a613c07760843b36a7eb612372a223a78158f56..11bf0aab82ea98528f26d436e4a4c6bf4472ec4e 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 225287769dc18f13cb50427fd789d91c94d0c55b..f2cf76d15d7838ec6f557ce83aa6afffc8218373 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 8572996539fa1c9d88ea22e718c83e7351a09964..7e13697b32544a25ed52095511d9a641dcbfb1aa 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_NI_TIO_H
index 5e00212aa022e5c55e7f901adec5ed3b52f6b8fe..b009876754a8a1427ceff2a027c46807c21e1896 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _COMEDI_NI_TIO_INTERNAL_H
index 13747f324936deebf5fde0fbcc7f6a9a45520872..cff50bc45bcd71ac962810d90ceff64d92604b6c 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 /*
index 8be2a4c503cc3cedef149e1078a9f348eebf5b45..7abf3f74144e78e530474683cfc185bc60b9139b 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: pcl711
index 4f033d88eecaa45fc00f3acfb95c12839e0835a8..1bcabb8901d167dc6aedd54ea9f83d2165fa3346 100644 (file)
@@ -1,42 +1,28 @@
 /*
-    comedi/drivers/pcl724.c
-
-    Michal Dobes <dobes@tesnet.cz>
-
-    hardware driver for Advantech cards:
-     card:   PCL-724, PCL-722, PCL-731
-     driver: pcl724,  pcl722,  pcl731
-    and ADLink cards:
-     card:   ACL-7122, ACL-7124, PET-48DIO
-     driver: acl7122,  acl7124,  pet48dio
-
-    Options for PCL-724, PCL-731, ACL-7124 and PET-48DIO:
-     [0] - IO Base
-
-    Options for PCL-722 and ACL-7122:
-     [0] - IO Base
-     [1] - IRQ (0=disable IRQ) IRQ isn't supported at this time!
-     [2] -number of DIO:
-             0, 144: 144 DIO configuration
-             1,  96:  96 DIO configuration
-*/
-/*
-Driver: pcl724
-Description: Advantech PCL-724, PCL-722, PCL-731 ADLink ACL-7122, ACL-7124,
-  PET-48DIO
-Author: Michal Dobes <dobes@tesnet.cz>
-Devices: [Advantech] PCL-724 (pcl724), PCL-722 (pcl722), PCL-731 (pcl731),
-  [ADLink] ACL-7122 (acl7122), ACL-7124 (acl7124), PET-48DIO (pet48dio)
-Status: untested
-
-This is driver for digital I/O boards PCL-722/724/731 with 144/24/48 DIO
-and for digital I/O boards ACL-7122/7124/PET-48DIO with 144/24/48 DIO.
-It need 8255.o for operations and only immediate mode is supported.
-See the source for configuration details.
-*/
+ * pcl724.c
+ * Comedi driver for 8255 based ISA DIO boards
+ *
+ * Michal Dobes <dobes@tesnet.cz>
+ */
+
 /*
- * check_driver overrides:
- *   struct comedi_insn
+ * Driver: pcl724
+ * Description: Comedi driver for 8255 based ISA DIO boards
+ * Devices: (Advantech) PCL-724 [pcl724]
+ *         (Advantech) PCL-722 [pcl722]
+ *         (Advantech) PCL-731 [pcl731]
+ *         (ADLink) ACL-7122 [acl7122]
+ *         (ADLink) ACL-7124 [acl7124]
+ *         (ADLink) PET-48DIO [pet48dio]
+ * Author: Michal Dobes <dobes@tesnet.cz>
+ * Status: untested
+ *
+ * Configuration options:
+ *   [0] - IO Base
+ *   [1] - IRQ (not supported)
+ *   [2] - number of DIO (pcl722 and acl7122 boards)
+ *        0, 144: 144 DIO configuration
+ *        1,  96:  96 DIO configuration
  */
 
 #include "../comedidev.h"
@@ -46,40 +32,48 @@ See the source for configuration details.
 
 #include "8255.h"
 
-#define PCL722_SIZE    32
-#define PCL722_96_SIZE 16
-#define PCL724_SIZE     4
-#define PCL731_SIZE     8
-#define PET48_SIZE      2
-
 #define SIZE_8255      4
 
-/* #define PCL724_IRQ   1  no IRQ support now */
-
 struct pcl724_board {
-
-       const char *name;       /*  board name */
-       int dio;                /*  num of DIO */
-       int numofports;         /*  num of 8255 subdevices */
-       unsigned int IRQbits;   /*  allowed interrupts */
-       unsigned int io_range;  /*  len of IO space */
-       char can_have96;
-       char is_pet48;
+       const char *name;
+       unsigned int io_range;
+       unsigned int can_have96:1;
+       unsigned int is_pet48:1;
+       int numofports;
 };
 
-static int subdev_8255_cb(int dir, int port, int data, unsigned long arg)
-{
-       unsigned long iobase = arg;
-
-       if (dir) {
-               outb(data, iobase + port);
-               return 0;
-       } else {
-               return inb(iobase + port);
-       }
-}
+static const struct pcl724_board boardtypes[] = {
+       {
+               .name           = "pcl724",
+               .io_range       = 0x04,
+               .numofports     = 1,    /* 24 DIO channels */
+       }, {
+               .name           = "pcl722",
+               .io_range       = 0x20,
+               .can_have96     = 1,
+               .numofports     = 6,    /* 144 (or 96) DIO channels */
+       }, {
+               .name           = "pcl731",
+               .io_range       = 0x08,
+               .numofports     = 2,    /* 48 DIO channels */
+       }, {
+               .name           = "acl7122",
+               .io_range       = 0x20,
+               .can_have96     = 1,
+               .numofports     = 6,    /* 144 (or 96) DIO channels */
+       }, {
+               .name           = "acl7124",
+               .io_range       = 0x04,
+               .numofports     = 1,    /* 24 DIO channels */
+       }, {
+               .name           = "pet48dio",
+               .io_range       = 0x02,
+               .is_pet48       = 1,
+               .numofports     = 2,    /* 48 DIO channels */
+       },
+};
 
-static int subdev_8255mapped_cb(int dir, int port, int data,
+static int pcl724_8255mapped_io(int dir, int port, int data,
                                unsigned long iobase)
 {
        int movport = SIZE_8255 * (iobase >> 12);
@@ -96,57 +90,30 @@ static int subdev_8255mapped_cb(int dir, int port, int data,
        }
 }
 
-static int pcl724_attach(struct comedi_device *dev, struct comedi_devconfig *it)
+static int pcl724_attach(struct comedi_device *dev,
+                        struct comedi_devconfig *it)
 {
        const struct pcl724_board *board = comedi_board(dev);
        struct comedi_subdevice *s;
+       unsigned long iobase;
        unsigned int iorange;
-       int ret, i, n_subdevices;
-#ifdef PCL724_IRQ
-       unsigned int irq;
-#endif
+       int n_subdevices;
+       int ret;
+       int i;
 
        iorange = board->io_range;
-       if ((board->can_have96) &&
-           ((it->options[1] == 1) || (it->options[1] == 96)))
-               iorange = PCL722_96_SIZE; /* PCL-724 in 96 DIO configuration */
-       ret = comedi_request_region(dev, it->options[0], iorange);
-       if (ret)
-               return ret;
+       n_subdevices = board->numofports;
 
-#ifdef PCL724_IRQ
-       irq = 0;
-       if (board->IRQbits != 0) {      /* board support IRQ */
-               irq = it->options[1];
-               if (irq) {      /* we want to use IRQ */
-                       if (((1 << irq) & board->IRQbits) == 0) {
-                               printk(KERN_WARNING
-                                      ", IRQ %u is out of allowed range, "
-                                      "DISABLING IT", irq);
-                               irq = 0;        /* Bad IRQ */
-                       } else {
-                               if (request_irq(irq, interrupt_pcl724, 0,
-                                               dev->board_name, dev)) {
-                                       printk(KERN_WARNING
-                                              ", unable to allocate IRQ %u, "
-                                              "DISABLING IT", irq);
-                                       irq = 0;        /* Can't use IRQ */
-                               } else {
-                                       printk(", irq=%u", irq);
-                               }
-                       }
-               }
+       /* Handle PCL-724 in 96 DIO configuration */
+       if (board->can_have96 &&
+           (it->options[2] == 1 || it->options[2] == 96)) {
+               iorange = 0x10;
+               n_subdevices = 4;
        }
 
-       dev->irq = irq;
-#endif
-
-       printk("\n");
-
-       n_subdevices = board->numofports;
-       if ((board->can_have96) && ((it->options[1] == 1)
-                                        || (it->options[1] == 96)))
-               n_subdevices = 4;       /*  PCL-724 in 96 DIO configuration */
+       ret = comedi_request_region(dev, it->options[0], iorange);
+       if (ret)
+               return ret;
 
        ret = comedi_alloc_subdevices(dev, n_subdevices);
        if (ret)
@@ -155,13 +122,15 @@ static int pcl724_attach(struct comedi_device *dev, struct comedi_devconfig *it)
        for (i = 0; i < dev->n_subdevices; i++) {
                s = &dev->subdevices[i];
                if (board->is_pet48) {
-                       subdev_8255_init(dev, s, subdev_8255mapped_cb,
-                                        (unsigned long)(dev->iobase +
-                                                        i * 0x1000));
-               } else
-                       subdev_8255_init(dev, s, subdev_8255_cb,
-                                        (unsigned long)(dev->iobase +
-                                                        SIZE_8255 * i));
+                       iobase = dev->iobase + (i * 0x1000);
+                       ret = subdev_8255_init(dev, s, pcl724_8255mapped_io,
+                                              iobase);
+               } else {
+                       iobase = dev->iobase + (i * SIZE_8255);
+                       ret = subdev_8255_init(dev, s, NULL, iobase);
+               }
+               if (ret)
+                       return ret;
        }
 
        return 0;
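
subdev_8255_init() is now called with an optional I/O callback: NULL selects the default port I/O at the given iobase, while boards with a non-linear register layout (the PET-48DIO here) supply their own callback, and the return value must be checked. A minimal sketch for a hypothetical board with two 8255 chips at consecutive 4-byte blocks:

    for (i = 0; i < 2; i++) {
            s = &dev->subdevices[i];
            ret = subdev_8255_init(dev, s, NULL,
                                   dev->iobase + i * SIZE_8255);
            if (ret)
                    return ret;
    }
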
@@ -176,15 +145,6 @@ static void pcl724_detach(struct comedi_device *dev)
        comedi_legacy_detach(dev);
 }
 
-static const struct pcl724_board boardtypes[] = {
-       { "pcl724", 24, 1, 0x00fc, PCL724_SIZE, 0, 0, },
-       { "pcl722", 144, 6, 0x00fc, PCL722_SIZE, 1, 0, },
-       { "pcl731", 48, 2, 0x9cfc, PCL731_SIZE, 0, 0, },
-       { "acl7122", 144, 6, 0x9ee8, PCL722_SIZE, 1, 0, },
-       { "acl7124", 24, 1, 0x00fc, PCL724_SIZE, 0, 0, },
-       { "pet48dio", 48, 2, 0x9eb8, PET48_SIZE, 0, 1, },
-};
-
 static struct comedi_driver pcl724_driver = {
        .driver_name    = "pcl724",
        .module         = THIS_MODULE,
@@ -197,5 +157,5 @@ static struct comedi_driver pcl724_driver = {
 module_comedi_driver(pcl724_driver);
 
 MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
+MODULE_DESCRIPTION("Comedi driver for 8255 based ISA DIO boards");
 MODULE_LICENSE("GPL");
diff --git a/drivers/staging/comedi/drivers/pcl725.c b/drivers/staging/comedi/drivers/pcl725.c
deleted file mode 100644 (file)
index 6b02f06..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * comedi/drivers/pcl725.c
- * Driver for PCL725 and clones
- * David A. Schleef
- */
-/*
-Driver: pcl725
-Description: Advantech PCL-725 (& compatibles)
-Author: ds
-Status: unknown
-Devices: [Advantech] PCL-725 (pcl725)
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define PCL725_SIZE 2
-
-#define PCL725_DO 0
-#define PCL725_DI 1
-
-static int pcl725_do_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-                         struct comedi_insn *insn, unsigned int *data)
-{
-       if (data[0]) {
-               s->state &= ~data[0];
-               s->state |= (data[0] & data[1]);
-               outb(s->state, dev->iobase + PCL725_DO);
-       }
-
-       data[1] = s->state;
-
-       return insn->n;
-}
-
-static int pcl725_di_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-                         struct comedi_insn *insn, unsigned int *data)
-{
-       data[1] = inb(dev->iobase + PCL725_DI);
-
-       return insn->n;
-}
-
-static int pcl725_attach(struct comedi_device *dev, struct comedi_devconfig *it)
-{
-       struct comedi_subdevice *s;
-       int ret;
-
-       ret = comedi_request_region(dev, it->options[0], PCL725_SIZE);
-       if (ret)
-               return ret;
-
-       ret = comedi_alloc_subdevices(dev, 2);
-       if (ret)
-               return ret;
-
-       s = &dev->subdevices[0];
-       /* do */
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcl725_do_insn;
-       s->range_table = &range_digital;
-
-       s = &dev->subdevices[1];
-       /* di */
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcl725_di_insn;
-       s->range_table = &range_digital;
-
-       printk(KERN_INFO "\n");
-
-       return 0;
-}
-
-static struct comedi_driver pcl725_driver = {
-       .driver_name    = "pcl725",
-       .module         = THIS_MODULE,
-       .attach         = pcl725_attach,
-       .detach         = comedi_legacy_detach,
-};
-module_comedi_driver(pcl725_driver);
-
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
-MODULE_LICENSE("GPL");
index 4aa994393fae492d024a2544ac36892fcd1f9851..893f012a1b7a5fc6b468a330b367a27528821238 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: pcl726
index 2879db75da3e9280433d6bed161c76120663fd43..862e75fd68fd8ceba699962a44c195ebad3faf22 100644 (file)
  * Driver for Advantech PCL-730 and clones
  * José Luis Sánchez
  */
-/*
-Driver: pcl730
-Description: Advantech PCL-730 (& compatibles)
-Author: José Luis Sánchez (jsanchezv@teleline.es)
-Status: untested
-Devices: [Advantech] PCL-730 (pcl730), [ICP] ISO-730 (iso730),
-                [Adlink] ACL-7130 (acl7130)
 
-Interrupts are not supported.
-The ACL-7130 card have an 8254 timer/counter not supported by this driver.
-*/
+/*
+ * Driver: pcl730
+ * Description: Advantech PCL-730 (& compatibles)
+ * Devices: (Advantech) PCL-730 [pcl730]
+ *         (ICP) ISO-730 [iso730]
+ *         (Adlink) ACL-7130 [acl7130]
+ *         (Advantech) PCM-3730 [pcm3730]
+ *         (Advantech) PCL-725 [pcl725]
+ *         (ICP) P8R8-DIO [p8r8dio]
+ *         (Adlink) ACL-7225b [acl7225b]
+ *         (ICP) P16R16-DIO [p16r16dio]
+ *         (Advantech) PCL-733 [pcl733]
+ *         (Advantech) PCL-734 [pcl734]
+ * Author: José Luis Sánchez (jsanchezv@teleline.es)
+ * Status: untested
+ *
+ * Configuration options:
+ *   [0] - I/O port base
+ *
+ * Interrupts are not supported.
+ * The ACL-7130 card has an 8254 timer/counter not supported by this driver.
+ */
 
 #include "../comedidev.h"
 
 #include <linux/ioport.h>
 
-#define PCL730_SIZE            4
-#define ACL7130_SIZE   8
-#define PCL730_IDIO_LO 0       /* Isolated Digital I/O low byte (ID0-ID7) */
-#define PCL730_IDIO_HI 1       /* Isolated Digital I/O high byte (ID8-ID15) */
-#define PCL730_DIO_LO  2       /* TTL Digital I/O low byte (D0-D7) */
-#define PCL730_DIO_HI  3       /* TTL Digital I/O high byte (D8-D15) */
+/*
+ * Register map
+ *
+ * The register map varies slightly depending on the board type but
+ * all registers are 8-bit.
+ *
+ * The boardinfo 'io_range' is used to allow comedi to request the
+ * proper range required by the board.
+ *
+ * The comedi_subdevice 'private' data is used to pass the register
+ * offset to the (*insn_bits) functions to read/write the correct
+ * registers.
+ *
+ * The basic register mapping looks like this:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) / inputs 0-7 (read)
+ *     BASE+1  Isolated outputs 8-15 (write) / inputs 8-15 (read)
+ *     BASE+2  TTL outputs 0-7 (write) / inputs 0-7 (read)
+ *     BASE+3  TTL outputs 8-15 (write) / inputs 8-15 (read)
+ *
+ * The pcm3730 board does not have register BASE+1.
+ *
+ * The pcl725 and p8r8dio only have registers BASE+0 and BASE+1:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) (read back on p8r8dio)
+ *     BASE+1  Isolated inputs 0-7 (read)
+ *
+ * The acl7225b and p16r16dio boards have this register mapping:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) (read back)
+ *     BASE+1  Isolated outputs 8-15 (write) (read back)
+ *     BASE+2  Isolated inputs 0-7 (read)
+ *     BASE+3  Isolated inputs 8-15 (read)
+ *
+ * The pcl733 and pcl734 boards have this register mapping:
+ *
+ *     BASE+0  Isolated outputs 0-7 (write) or inputs 0-7 (read)
+ *     BASE+1  Isolated outputs 8-15 (write) or inputs 8-15 (read)
+ *     BASE+2  Isolated outputs 16-23 (write) or inputs 16-23 (read)
+ *     BASE+3  Isolated outputs 24-31 (write) or inputs 24-31 (read)
+ */
 
 struct pcl730_board {
+       const char *name;
+       unsigned int io_range;
+       unsigned is_pcl725:1;
+       unsigned is_acl7225b:1;
+       unsigned has_readback:1;
+       unsigned has_ttl_io:1;
+       int n_subdevs;
+       int n_iso_out_chan;
+       int n_iso_in_chan;
+       int n_ttl_chan;
+};
 
-       const char *name;       /*  board name */
-       unsigned int io_range;  /*  len of I/O space */
+static const struct pcl730_board pcl730_boards[] = {
+       {
+               .name           = "pcl730",
+               .io_range       = 0x04,
+               .has_ttl_io     = 1,
+               .n_subdevs      = 4,
+               .n_iso_out_chan = 16,
+               .n_iso_in_chan  = 16,
+               .n_ttl_chan     = 16,
+       }, {
+               .name           = "iso730",
+               .io_range       = 0x04,
+               .n_subdevs      = 4,
+               .n_iso_out_chan = 16,
+               .n_iso_in_chan  = 16,
+               .n_ttl_chan     = 16,
+       }, {
+               .name           = "acl7130",
+               .io_range       = 0x08,
+               .has_ttl_io     = 1,
+               .n_subdevs      = 4,
+               .n_iso_out_chan = 16,
+               .n_iso_in_chan  = 16,
+               .n_ttl_chan     = 16,
+       }, {
+               .name           = "pcm3730",
+               .io_range       = 0x04,
+               .has_ttl_io     = 1,
+               .n_subdevs      = 4,
+               .n_iso_out_chan = 8,
+               .n_iso_in_chan  = 8,
+               .n_ttl_chan     = 16,
+       }, {
+               .name           = "pcl725",
+               .io_range       = 0x02,
+               .is_pcl725      = 1,
+               .n_subdevs      = 2,
+               .n_iso_out_chan = 8,
+               .n_iso_in_chan  = 8,
+       }, {
+               .name           = "p8r8dio",
+               .io_range       = 0x02,
+               .is_pcl725      = 1,
+               .has_readback   = 1,
+               .n_subdevs      = 2,
+               .n_iso_out_chan = 8,
+               .n_iso_in_chan  = 8,
+       }, {
+               .name           = "acl7225b",
+               .io_range       = 0x08,         /* only 4 are used */
+               .is_acl7225b    = 1,
+               .has_readback   = 1,
+               .n_subdevs      = 2,
+               .n_iso_out_chan = 16,
+               .n_iso_in_chan  = 16,
+       }, {
+               .name           = "p16r16dio",
+               .io_range       = 0x04,
+               .is_acl7225b    = 1,
+               .has_readback   = 1,
+               .n_subdevs      = 2,
+               .n_iso_out_chan = 16,
+               .n_iso_in_chan  = 16,
+       }, {
+               .name           = "pcl733",
+               .io_range       = 0x04,
+               .n_subdevs      = 1,
+               .n_iso_in_chan  = 32,
+       }, {
+               .name           = "pcl734",
+               .io_range       = 0x04,
+               .n_subdevs      = 1,
+               .n_iso_out_chan = 32,
+       },
 };
 
-static int pcl730_do_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-                         struct comedi_insn *insn, unsigned int *data)
+static int pcl730_do_insn_bits(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
 {
-       if (data[0]) {
-               s->state &= ~data[0];
-               s->state |= (data[0] & data[1]);
+       unsigned long reg = (unsigned long)s->private;
+       unsigned int mask = data[0];
+       unsigned int bits = data[1];
+
+       if (mask) {
+               s->state &= ~mask;
+               s->state |= (bits & mask);
+
+               if (mask & 0x00ff)
+                       outb(s->state & 0xff, dev->iobase + reg);
+               if ((mask & 0xff00) && (s->n_chan > 8))
+                       outb((s->state >> 8) & 0xff, dev->iobase + reg + 1);
+               if ((mask & 0xff0000) && (s->n_chan > 16))
+                       outb((s->state >> 16) & 0xff, dev->iobase + reg + 2);
+               if ((mask & 0xff000000) && (s->n_chan > 24))
+                       outb((s->state >> 24) & 0xff, dev->iobase + reg + 3);
        }
-       if (data[0] & 0x00ff)
-               outb(s->state & 0xff,
-                    dev->iobase + ((unsigned long)s->private));
-       if (data[0] & 0xff00)
-               outb((s->state >> 8),
-                    dev->iobase + ((unsigned long)s->private) + 1);
 
        data[1] = s->state;
 
        return insn->n;
 }
 
-static int pcl730_di_insn(struct comedi_device *dev, struct comedi_subdevice *s,
-                         struct comedi_insn *insn, unsigned int *data)
+static unsigned int pcl730_get_bits(struct comedi_device *dev,
+                                   struct comedi_subdevice *s)
 {
-       data[1] = inb(dev->iobase + ((unsigned long)s->private)) |
-           (inb(dev->iobase + ((unsigned long)s->private) + 1) << 8);
+       unsigned long reg = (unsigned long)s->private;
+       unsigned int val;
+
+       val = inb(dev->iobase + reg);
+       if (s->n_chan > 8)
+               val |= (inb(dev->iobase + reg + 1) << 8);
+       if (s->n_chan > 16)
+               val |= (inb(dev->iobase + reg + 2) << 16);
+       if (s->n_chan > 24)
+               val |= (inb(dev->iobase + reg + 3) << 24);
+
+       return val;
+}
+
+static int pcl730_di_insn_bits(struct comedi_device *dev,
+                              struct comedi_subdevice *s,
+                              struct comedi_insn *insn,
+                              unsigned int *data)
+{
+       data[1] = pcl730_get_bits(dev, s);
 
        return insn->n;
 }
 
-static int pcl730_attach(struct comedi_device *dev, struct comedi_devconfig *it)
+static int pcl730_attach(struct comedi_device *dev,
+                        struct comedi_devconfig *it)
 {
        const struct pcl730_board *board = comedi_board(dev);
        struct comedi_subdevice *s;
+       int subdev;
        int ret;
 
        ret = comedi_request_region(dev, it->options[0], board->io_range);
        if (ret)
                return ret;
 
-       ret = comedi_alloc_subdevices(dev, 4);
+       ret = comedi_alloc_subdevices(dev, board->n_subdevs);
        if (ret)
                return ret;
 
-       s = &dev->subdevices[0];
-       /* Isolated do */
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 16;
-       s->insn_bits = pcl730_do_insn;
-       s->range_table = &range_digital;
-       s->private = (void *)PCL730_IDIO_LO;
-
-       s = &dev->subdevices[1];
-       /* Isolated di */
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 16;
-       s->insn_bits = pcl730_di_insn;
-       s->range_table = &range_digital;
-       s->private = (void *)PCL730_IDIO_LO;
-
-       s = &dev->subdevices[2];
-       /* TTL do */
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 16;
-       s->insn_bits = pcl730_do_insn;
-       s->range_table = &range_digital;
-       s->private = (void *)PCL730_DIO_LO;
-
-       s = &dev->subdevices[3];
-       /* TTL di */
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 16;
-       s->insn_bits = pcl730_di_insn;
-       s->range_table = &range_digital;
-       s->private = (void *)PCL730_DIO_LO;
-
-       printk(KERN_INFO "\n");
+       subdev = 0;
+
+       if (board->n_iso_out_chan) {
+               /* Isolated Digital Outputs */
+               s = &dev->subdevices[subdev++];
+               s->type         = COMEDI_SUBD_DO;
+               s->subdev_flags = SDF_WRITABLE;
+               s->n_chan       = board->n_iso_out_chan;
+               s->maxdata      = 1;
+               s->range_table  = &range_digital;
+               s->insn_bits    = pcl730_do_insn_bits;
+               s->private      = (void *)0;
+
+               /* get the initial state if supported */
+               if (board->has_readback)
+                       s->state = pcl730_get_bits(dev, s);
+       }
+
+       if (board->n_iso_in_chan) {
+               /* Isolated Digital Inputs */
+               s = &dev->subdevices[subdev++];
+               s->type         = COMEDI_SUBD_DI;
+               s->subdev_flags = SDF_READABLE;
+               s->n_chan       = board->n_iso_in_chan;
+               s->maxdata      = 1;
+               s->range_table  = &range_digital;
+               s->insn_bits    = pcl730_di_insn_bits;
+               s->private      = board->is_acl7225b ? (void *)2 :
+                                 board->is_pcl725 ? (void *)1 : (void *)0;
+       }
+
+       if (board->has_ttl_io) {
+               /* TTL Digital Outputs */
+               s = &dev->subdevices[subdev++];
+               s->type         = COMEDI_SUBD_DO;
+               s->subdev_flags = SDF_WRITABLE;
+               s->n_chan       = board->n_ttl_chan;
+               s->maxdata      = 1;
+               s->range_table  = &range_digital;
+               s->insn_bits    = pcl730_do_insn_bits;
+               s->private      = (void *)2;
+
+               /* TTL Digital Inputs */
+               s = &dev->subdevices[subdev++];
+               s->type         = COMEDI_SUBD_DI;
+               s->subdev_flags = SDF_READABLE;
+               s->n_chan       = board->n_ttl_chan;
+               s->maxdata      = 1;
+               s->range_table  = &range_digital;
+               s->insn_bits    = pcl730_di_insn_bits;
+               s->private      = (void *)2;
+       }
 
        return 0;
 }
 
-static const struct pcl730_board boardtypes[] = {
-       { "pcl730", PCL730_SIZE, },
-       { "iso730", PCL730_SIZE, },
-       { "acl7130", ACL7130_SIZE, },
-};
-
 static struct comedi_driver pcl730_driver = {
        .driver_name    = "pcl730",
        .module         = THIS_MODULE,
        .attach         = pcl730_attach,
        .detach         = comedi_legacy_detach,
-       .board_name     = &boardtypes[0].name,
-       .num_names      = ARRAY_SIZE(boardtypes),
+       .board_name     = &pcl730_boards[0].name,
+       .num_names      = ARRAY_SIZE(pcl730_boards),
        .offset         = sizeof(struct pcl730_board),
 };
 module_comedi_driver(pcl730_driver);
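
The (*insn_bits) handlers above follow the usual comedi convention: data[0] is a mask of channels to update and data[1] carries the new values; the handler folds the masked bits into s->state, writes only the byte-wide registers the mask actually touches, and returns the current state in data[1]. A worked example with hypothetical values for a 16-channel isolated output subdevice:

    unsigned int mask = 0x0009;     /* update channels 0 and 3 */
    unsigned int bits = 0x0008;     /* channel 3 -> 1, channel 0 -> 0 */

    s->state = (s->state & ~mask) | (bits & mask);
    /*
     * mask & 0x00ff is non-zero, so BASE+reg is written;
     * mask & 0xff00 is zero, so BASE+reg+1 is left alone.
     */
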
diff --git a/drivers/staging/comedi/drivers/pcm3730.c b/drivers/staging/comedi/drivers/pcm3730.c
deleted file mode 100644 (file)
index 3a3ce2c..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * comedi/drivers/pcm3730.c
- * Driver for PCM3730 and clones
- * Blaine Lee
- * from pcl725 by David S.
- */
-/*
-Driver: pcm3730
-Description: PCM3730
-Author: Blaine Lee
-Devices: [Advantech] PCM-3730 (pcm3730)
-Status: unknown
-
-Configuration options:
-  [0] - I/O port base
-*/
-
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
-
-#define PCM3730_SIZE 4         /*  consecutive io port addresses */
-
-#define PCM3730_DOA 0          /*  offsets for each port */
-#define PCM3730_DOB 2
-#define PCM3730_DOC 3
-#define PCM3730_DIA 0
-#define PCM3730_DIB 2
-#define PCM3730_DIC 3
-
-static int pcm3730_do_insn_bits(struct comedi_device *dev,
-                               struct comedi_subdevice *s,
-                               struct comedi_insn *insn, unsigned int *data)
-{
-       if (data[0]) {
-               s->state &= ~data[0];
-               s->state |= (data[0] & data[1]);
-               outb(s->state, dev->iobase + (unsigned long)(s->private));
-       }
-       data[1] = s->state;
-
-       return insn->n;
-}
-
-static int pcm3730_di_insn_bits(struct comedi_device *dev,
-                               struct comedi_subdevice *s,
-                               struct comedi_insn *insn, unsigned int *data)
-{
-       data[1] = inb(dev->iobase + (unsigned long)(s->private));
-       return insn->n;
-}
-
-static int pcm3730_attach(struct comedi_device *dev,
-                         struct comedi_devconfig *it)
-{
-       struct comedi_subdevice *s;
-       int ret;
-
-       ret = comedi_request_region(dev, it->options[0], PCM3730_SIZE);
-       if (ret)
-               return ret;
-
-       ret = comedi_alloc_subdevices(dev, 6);
-       if (ret)
-               return ret;
-
-       s = &dev->subdevices[0];
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_do_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DOA;
-
-       s = &dev->subdevices[1];
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_do_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DOB;
-
-       s = &dev->subdevices[2];
-       s->type = COMEDI_SUBD_DO;
-       s->subdev_flags = SDF_WRITABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_do_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DOC;
-
-       s = &dev->subdevices[3];
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_di_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DIA;
-
-       s = &dev->subdevices[4];
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_di_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DIB;
-
-       s = &dev->subdevices[5];
-       s->type = COMEDI_SUBD_DI;
-       s->subdev_flags = SDF_READABLE;
-       s->maxdata = 1;
-       s->n_chan = 8;
-       s->insn_bits = pcm3730_di_insn_bits;
-       s->range_table = &range_digital;
-       s->private = (void *)PCM3730_DIC;
-
-       printk(KERN_INFO "\n");
-
-       return 0;
-}
-
-static struct comedi_driver pcm3730_driver = {
-       .driver_name    = "pcm3730",
-       .module         = THIS_MODULE,
-       .attach         = pcm3730_attach,
-       .detach         = comedi_legacy_detach,
-};
-module_comedi_driver(pcm3730_driver);
-
-MODULE_AUTHOR("Comedi http://www.comedi.org");
-MODULE_DESCRIPTION("Comedi low-level driver");
-MODULE_LICENSE("GPL");
index b7c932e152e0b5451f8b5e1b64d83c958541ede7..d5c728dc6192f82443e157d0d716abd585ac8834 100644 (file)
@@ -1,52 +1,44 @@
 /*
-    comedi/drivers/pcmad.c
-    Hardware driver for Winsystems PCM-A/D12 and PCM-A/D16
-
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2000,2001 David A. Schleef <ds@schleef.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * pcmad.c
+ * Hardware driver for Winsystems PCM-A/D12 and PCM-A/D16
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2000,2001 David A. Schleef <ds@schleef.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
 
-*/
 /*
-Driver: pcmad
-Description: Winsystems PCM-A/D12, PCM-A/D16
-Author: ds
-Devices: [Winsystems] PCM-A/D12 (pcmad12), PCM-A/D16 (pcmad16)
-Status: untested
-
-This driver was written on a bet that I couldn't write a driver
-in less than 2 hours.  I won the bet, but never got paid.  =(
-
-Configuration options:
-  [0] - I/O port base
-  [1] - unused
-  [2] - Analog input reference
-       0 = single ended
-       1 = differential
-  [3] - Analog input encoding (must match jumpers)
-       0 = straight binary
-       1 = two's complement
-*/
-
-#include <linux/interrupt.h>
-#include "../comedidev.h"
-
-#include <linux/ioport.h>
+ * Driver: pcmad
+ * Description: Winsystems PCM-A/D12, PCM-A/D16
+ * Devices: (Winsystems) PCM-A/D12 [pcmad12]
+ *         (Winsystems) PCM-A/D16 [pcmad16]
+ * Author: ds
+ * Status: untested
+ *
+ * This driver was written on a bet that I couldn't write a driver
+ * in less than 2 hours.  I won the bet, but never got paid.  =(
+ *
+ * Configuration options:
+ *   [0] - I/O port base
+ *   [1] - IRQ (unused)
+ *   [2] - Analog input reference (must match jumpers)
+ *        0 = single-ended (16 channels)
+ *        1 = differential (8 channels)
+ *   [3] - Analog input encoding (must match jumpers)
+ *        0 = straight binary (0-5V input range)
+ *        1 = two's complement (+-10V input range)
+ */
 
-#define PCMAD_SIZE             4
+#include "../comedidev.h"
 
 #define PCMAD_STATUS           0
 #define PCMAD_LSB              1
@@ -55,60 +47,82 @@ Configuration options:
 
 struct pcmad_board_struct {
        const char *name;
-       int n_ai_bits;
+       unsigned int ai_maxdata;
 };
 
-struct pcmad_priv_struct {
-       int differential;
-       int twos_comp;
+static const struct pcmad_board_struct pcmad_boards[] = {
+       {
+               .name           = "pcmad12",
+               .ai_maxdata     = 0x0fff,
+       }, {
+               .name           = "pcmad16",
+               .ai_maxdata     = 0xffff,
+       },
 };
 
 #define TIMEOUT        100
 
+static int pcmad_ai_wait_for_eoc(struct comedi_device *dev,
+                                int timeout)
+{
+       int i;
+
+       for (i = 0; i < timeout; i++) {
+               if ((inb(dev->iobase + PCMAD_STATUS) & 0x3) == 0x3)
+                       return 0;
+       }
+       return -ETIME;
+}
+
+static bool pcmad_range_is_bipolar(struct comedi_subdevice *s,
+                                  unsigned int range)
+{
+       return s->range_table->range[range].min < 0;
+}
+
 static int pcmad_ai_insn_read(struct comedi_device *dev,
                              struct comedi_subdevice *s,
-                             struct comedi_insn *insn, unsigned int *data)
+                             struct comedi_insn *insn,
+                             unsigned int *data)
 {
-       const struct pcmad_board_struct *board = comedi_board(dev);
-       struct pcmad_priv_struct *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       unsigned int range = CR_RANGE(insn->chanspec);
+       unsigned int val;
+       int ret;
        int i;
-       int chan;
-       int n;
 
-       chan = CR_CHAN(insn->chanspec);
-
-       for (n = 0; n < insn->n; n++) {
+       for (i = 0; i < insn->n; i++) {
                outb(chan, dev->iobase + PCMAD_CONVERT);
 
-               for (i = 0; i < TIMEOUT; i++) {
-                       if ((inb(dev->iobase + PCMAD_STATUS) & 0x3) == 0x3)
-                               break;
+               ret = pcmad_ai_wait_for_eoc(dev, TIMEOUT);
+               if (ret)
+                       return ret;
+
+               val = inb(dev->iobase + PCMAD_LSB) |
+                     (inb(dev->iobase + PCMAD_MSB) << 8);
+
+               /* data is shifted on the pcmad12, fix it */
+               if (s->maxdata == 0x0fff)
+                       val >>= 4;
+
+               if (pcmad_range_is_bipolar(s, range)) {
+                       /* munge the two's complement value */
+                       val ^= ((s->maxdata + 1) >> 1);
                }
-               data[n] = inb(dev->iobase + PCMAD_LSB);
-               data[n] |= (inb(dev->iobase + PCMAD_MSB) << 8);
 
-               if (devpriv->twos_comp)
-                       data[n] ^= (1 << (board->n_ai_bits - 1));
+               data[i] = val;
        }
 
-       return n;
+       return insn->n;
 }
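The bipolar munge above converts the converter's two's-complement result into comedi's unsigned offset-binary convention. Worked through for the pcmad12 (maxdata 0x0fff): (s->maxdata + 1) >> 1 is 0x800, so the XOR flips the sign bit and maps -2048..2047 onto 0..4095. The same arithmetic as a standalone sketch:

/*
 * two's complement -> offset binary
 * e.g. 12-bit: 0xfff (-1) -> 0x7ff (2047), 0x800 (-2048) -> 0x000
 */
static unsigned int munge_twos_complement(unsigned int val, unsigned int maxdata)
{
	return val ^ ((maxdata + 1) >> 1);
}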
 
-/*
- * options:
- * 0   i/o base
- * 1   unused
- * 2   0=single ended 1=differential
- * 3   0=straight binary 1=two's comp
- */
 static int pcmad_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
        const struct pcmad_board_struct *board = comedi_board(dev);
-       struct pcmad_priv_struct *devpriv;
        struct comedi_subdevice *s;
        int ret;
 
-       ret = comedi_request_region(dev, it->options[0], PCMAD_SIZE);
+       ret = comedi_request_region(dev, it->options[0], 0x04);
        if (ret)
                return ret;
 
@@ -116,32 +130,25 @@ static int pcmad_attach(struct comedi_device *dev, struct comedi_devconfig *it)
        if (ret)
                return ret;
 
-       devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
-       if (!devpriv)
-               return -ENOMEM;
-       dev->private = devpriv;
-
        s = &dev->subdevices[0];
-       s->type = COMEDI_SUBD_AI;
-       s->subdev_flags = SDF_READABLE | AREF_GROUND;
-       s->n_chan = 16;         /* XXX */
-       s->len_chanlist = 1;
-       s->insn_read = pcmad_ai_insn_read;
-       s->maxdata = (1 << board->n_ai_bits) - 1;
-       s->range_table = &range_unknown;
+       s->type         = COMEDI_SUBD_AI;
+       if (it->options[2]) {
+               /* 8 differential channels */
+               s->subdev_flags = SDF_READABLE | AREF_DIFF;
+               s->n_chan       = 8;
+       } else {
+               /* 16 single-ended channels */
+               s->subdev_flags = SDF_READABLE | AREF_GROUND;
+               s->n_chan       = 16;
+       }
+       s->len_chanlist = 1;
+       s->maxdata      = board->ai_maxdata;
+       s->range_table  = it->options[3] ? &range_bipolar10 : &range_unipolar5;
+       s->insn_read    = pcmad_ai_insn_read;
 
        return 0;
 }
 
-static const struct pcmad_board_struct pcmad_boards[] = {
-       {
-               .name           = "pcmad12",
-               .n_ai_bits      = 12,
-       }, {
-               .name           = "pcmad16",
-               .n_ai_bits      = 16,
-       },
-};
 static struct comedi_driver pcmad_driver = {
        .driver_name    = "pcmad",
        .module         = THIS_MODULE,
index 61e7fd14a1e804274863d64cbdedf42a41291d88..774a63dfe0402557674e3b274551cb0851173d39 100644 (file)
 /*
-    comedi/drivers/pcmda12.c
-    Driver for Winsystems PC-104 based PCM-D/A-12 8-channel AO board.
-
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
+ * pcmda12.c
+ * Driver for Winsystems PC-104 based PCM-D/A-12 8-channel AO board.
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
 /*
-Driver: pcmda12
-Description: A driver for the Winsystems PCM-D/A-12
-Devices: [Winsystems] PCM-D/A-12 (pcmda12)
-Author: Calin Culianu <calin@ajvar.org>
-Updated: Fri, 13 Jan 2006 12:01:01 -0500
-Status: works
-
-A driver for the relatively straightforward-to-program PCM-D/A-12.
-This board doesn't support commands, and the only way to set its
-analog output range is to jumper the board.  As such,
-comedi_data_write() ignores the range value specified.
-
-The board uses 16 consecutive I/O addresses starting at the I/O port
-base address.  Each address corresponds to the LSB then MSB of a
-particular channel from 0-7.
-
-Note that the board is not ISA-PNP capable and thus
-needs the I/O port comedi_config parameter.
-
-Note that passing a nonzero value as the second config option will
-enable "simultaneous xfer" mode for this board, in which AO writes
-will not take effect until a subsequent read of any AO channel.  This
-is so that one can speed up programming by preloading all AO registers
-with values before simultaneously setting them to take effect with one
-read command.
-
-Configuration Options:
-  [0] - I/O port base address
-  [1] - Do Simultaneous Xfer (see description)
-*/
+ * Driver: pcmda12
+ * Description: A driver for the Winsystems PCM-D/A-12
+ * Devices: (Winsystems) PCM-D/A-12 [pcmda12]
+ * Author: Calin Culianu <calin@ajvar.org>
+ * Updated: Fri, 13 Jan 2006 12:01:01 -0500
+ * Status: works
+ *
+ * A driver for the relatively straightforward-to-program PCM-D/A-12.
+ * This board doesn't support commands, and the only way to set its
+ * analog output range is to jumper the board. As such,
+ * comedi_data_write() ignores the range value specified.
+ *
+ * The board uses 16 consecutive I/O addresses starting at the I/O port
+ * base address. Each address corresponds to the LSB then MSB of a
+ * particular channel from 0-7.
+ *
+ * Note that the board is not ISA-PNP capable and thus needs the I/O
+ * port comedi_config parameter.
+ *
+ * Note that passing a nonzero value as the second config option will
+ * enable "simultaneous xfer" mode for this board, in which AO writes
+ * will not take effect until a subsequent read of any AO channel. This
+ * is so that one can speed up programming by preloading all AO registers
+ * with values before simultaneously setting them to take effect with one
+ * read command.
+ *
+ * Configuration Options:
+ *   [0] - I/O port base address
+ *   [1] - Do Simultaneous Xfer (see description)
+ */
 
 #include "../comedidev.h"
 
-#define CHANS 8
-#define IOSIZE 16
-#define LSB(x) ((unsigned char)((x) & 0xff))
-#define MSB(x) ((unsigned char)((((unsigned short)(x))>>8) & 0xff))
-#define LSB_PORT(chan) (dev->iobase + (chan)*2)
-#define MSB_PORT(chan) (LSB_PORT(chan)+1)
-#define BITS 12
-
-/* note these have no effect and are merely here for reference..
-   these are configured by jumpering the board! */
+/* AO range is not configurable, it's set by jumpers on the board */
 static const struct comedi_lrange pcmda12_ranges = {
-       3,
-       {
-        UNI_RANGE(5), UNI_RANGE(10), BIP_RANGE(5)
-        }
+       3, {
+               UNI_RANGE(5),
+               UNI_RANGE(10),
+               BIP_RANGE(5)
+       }
 };
 
 struct pcmda12_private {
-
-       unsigned int ao_readback[CHANS];
+       unsigned int ao_readback[8];
        int simultaneous_xfer_mode;
 };
 
-static void zero_chans(struct comedi_device *dev)
-{                              /* sets up an
-                                  ASIC chip to defaults */
-       int i;
-       for (i = 0; i < CHANS; ++i) {
-/*      /\* do this as one instruction?? *\/ */
-/*      outw(0, LSB_PORT(chan)); */
-               outb(0, LSB_PORT(i));
-               outb(0, MSB_PORT(i));
-       }
-       inb(LSB_PORT(0));       /* update chans. */
-}
-
-static int ao_winsn(struct comedi_device *dev, struct comedi_subdevice *s,
-                   struct comedi_insn *insn, unsigned int *data)
+static int pcmda12_ao_insn_write(struct comedi_device *dev,
+                                struct comedi_subdevice *s,
+                                struct comedi_insn *insn,
+                                unsigned int *data)
 {
        struct pcmda12_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       unsigned int val = devpriv->ao_readback[chan];
+       unsigned long ioreg = dev->iobase + (chan * 2);
        int i;
-       int chan = CR_CHAN(insn->chanspec);
 
-       /* Writing a list of values to an AO channel is probably not
-        * very useful, but that's how the interface is defined. */
        for (i = 0; i < insn->n; ++i) {
-
-/*      /\* do this as one instruction?? *\/ */
-/*      outw(data[i], LSB_PORT(chan)); */
-
-               /* Need to do this as two instructions due to 8-bit bus?? */
-               /*  first, load the low byte */
-               outb(LSB(data[i]), LSB_PORT(chan));
-               /*  next, write the high byte */
-               outb(MSB(data[i]), MSB_PORT(chan));
-
-               /* save shadow register */
-               devpriv->ao_readback[chan] = data[i];
-
+               val = data[i];
+               outb(val & 0xff, ioreg);
+               outb((val >> 8) & 0xff, ioreg + 1);
+
+               /*
+                * Initiate transfer if not in simultaneous xfer
+                * mode by reading one of the AO registers.
+                */
                if (!devpriv->simultaneous_xfer_mode)
-                       inb(LSB_PORT(chan));
+                       inb(ioreg);
        }
+       devpriv->ao_readback[chan] = val;
 
-       /* return the number of samples written */
-       return i;
+       return insn->n;
 }
 
-/* AO subdevices should have a read insn as well as a write insn.
-
-   Usually this means copying a value stored in devpriv->ao_readback.
-   However, since this driver supports simultaneous xfer then sometimes
-   this function actually accomplishes work.
-
-   Simultaneaous xfer mode is accomplished by loading ALL the values
-   you want for AO in all the channels, then READing off one of the AO
-   registers to initiate the instantaneous simultaneous update of all
-   DAC outputs, which makes all AO channels update simultaneously.
-   This is useful for some control applications, I would imagine.
-*/
-static int ao_rinsn(struct comedi_device *dev, struct comedi_subdevice *s,
-                   struct comedi_insn *insn, unsigned int *data)
+static int pcmda12_ao_insn_read(struct comedi_device *dev,
+                               struct comedi_subdevice *s,
+                               struct comedi_insn *insn,
+                               unsigned int *data)
 {
        struct pcmda12_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
        int i;
-       int chan = CR_CHAN(insn->chanspec);
 
-       for (i = 0; i < insn->n; i++) {
-               if (devpriv->simultaneous_xfer_mode)
-                       inb(LSB_PORT(chan));
-               /* read back shadow register */
+       /*
+        * Initiate simultaneous xfer mode by reading one of the
+        * AO registers. All analog outputs will then be updated.
+        */
+       if (devpriv->simultaneous_xfer_mode)
+               inb(dev->iobase);
+
+       for (i = 0; i < insn->n; i++)
                data[i] = devpriv->ao_readback[chan];
-       }
 
-       return i;
+       return insn->n;
+}
+
+static void pcmda12_ao_reset(struct comedi_device *dev,
+                            struct comedi_subdevice *s)
+{
+       int i;
+
+       for (i = 0; i < s->n_chan; ++i) {
+               outb(0, dev->iobase + (i * 2));
+               outb(0, dev->iobase + (i * 2) + 1);
+       }
+       /* Initiate transfer by reading one of the AO registers. */
+       inb(dev->iobase);
 }
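To make the simultaneous-xfer mode concrete: with config option [1] nonzero, each insn_write above only latches an LSB/MSB pair into the channel's registers, and nothing reaches the DAC outputs until some AO register is read. A register-level sketch of preloading all eight channels and firing them at once; the helper name is illustrative, not part of the driver:

static void pcmda12_preload_and_fire(struct comedi_device *dev,
				     const unsigned int *vals)
{
	int chan;

	/* latch all eight 12-bit values; the outputs do not change yet */
	for (chan = 0; chan < 8; chan++) {
		outb(vals[chan] & 0xff, dev->iobase + (chan * 2));
		outb((vals[chan] >> 8) & 0xff, dev->iobase + (chan * 2) + 1);
	}

	/* reading any AO register updates all DAC outputs simultaneously */
	inb(dev->iobase);
}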
 
 static int pcmda12_attach(struct comedi_device *dev,
@@ -156,7 +134,7 @@ static int pcmda12_attach(struct comedi_device *dev,
        struct comedi_subdevice *s;
        int ret;
 
-       ret = comedi_request_region(dev, it->options[0], IOSIZE);
+       ret = comedi_request_region(dev, it->options[0], 0x10);
        if (ret)
                return ret;
 
@@ -172,18 +150,17 @@ static int pcmda12_attach(struct comedi_device *dev,
                return ret;
 
        s = &dev->subdevices[0];
-       s->private = NULL;
-       s->maxdata = (0x1 << BITS) - 1;
-       s->range_table = &pcmda12_ranges;
-       s->type = COMEDI_SUBD_AO;
-       s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
-       s->n_chan = CHANS;
-       s->insn_write = &ao_winsn;
-       s->insn_read = &ao_rinsn;
-
-       zero_chans(dev);        /* clear out all the registers, basically */
-
-       return 1;
+       s->type         = COMEDI_SUBD_AO;
+       s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
+       s->n_chan       = 8;
+       s->maxdata      = 0x0fff;
+       s->range_table  = &pcmda12_ranges;
+       s->insn_write   = pcmda12_ao_insn_write;
+       s->insn_read    = pcmda12_ao_insn_read;
+
+       pcmda12_ao_reset(dev, s);
+
+       return 0;
 }
 
 static struct comedi_driver pcmda12_driver = {
index 5a236cd5b33d329dda66227a0a4644bcdfaf4898..9f76b1f59983564093f5beaaae4df332ef6bb9b2 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: pcmmio
index 0c98e26bbba111e9835124b9c281de2d0c660b7f..9a560a36f1f512e4cc50689e20ebe78fad634d7c 100644 (file)
@@ -1,79 +1,77 @@
 /*
-    comedi/drivers/pcmuio.c
-    Driver for Winsystems PC-104 based 48-channel and 96-channel DIO boards.
-
-    COMEDI - Linux Control and Measurement Device Interface
-    Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
+ * pcmuio.c
+ * Comedi driver for Winsystems PC-104 based 48/96-channel DIO boards.
+ *
+ * COMEDI - Linux Control and Measurement Device Interface
+ * Copyright (C) 2006 Calin A. Culianu <calin@ajvar.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
 /*
-Driver: pcmuio
-Description: A driver for the PCM-UIO48A and PCM-UIO96A boards from Winsystems.
-Devices: [Winsystems] PCM-UIO48A (pcmuio48), PCM-UIO96A (pcmuio96)
-Author: Calin Culianu <calin@ajvar.org>
-Updated: Fri, 13 Jan 2006 12:01:01 -0500
-Status: works
-
-A driver for the relatively straightforward-to-program PCM-UIO48A and
-PCM-UIO96A boards from Winsystems.  These boards use either one or two
-(in the 96-DIO version) WS16C48 ASIC HighDensity I/O Chips (HDIO).
-This chip is interesting in that each I/O line is individually
-programmable for INPUT or OUTPUT (thus comedi_dio_config can be done
-on a per-channel basis).  Also, each chip supports edge-triggered
-interrupts for the first 24 I/O lines.  Of course, since the
-96-channel version of the board has two ASICs, it can detect polarity
-changes on up to 48 I/O lines.  Since this is essentially an (non-PnP)
-ISA board, I/O Address and IRQ selection are done through jumpers on
-the board.  You need to pass that information to this driver as the
-first and second comedi_config option, respectively.  Note that the
-48-channel version uses 16 bytes of IO memory and the 96-channel
-version uses 32-bytes (in case you are worried about conflicts).  The
-48-channel board is split into two 24-channel comedi subdevices.
-The 96-channel board is split into 4 24-channel DIO subdevices.
-
-Note that IRQ support has been added, but it is untested.
-
-To use edge-detection IRQ support, pass the IRQs of both ASICS
-(for the 96 channel version) or just 1 ASIC (for 48-channel version).
-Then, use use comedi_commands with TRIG_NOW.
-Your callback will be called each time an edge is triggered, and the data
-values will be two sample_t's, which should be concatenated to form one
-32-bit unsigned int.  This value is the mask of channels that had
-edges detected from your channel list.  Note that the bits positions
-in the mask correspond to positions in your chanlist when you specified
-the command and *not* channel id's!
-
-To set the polarity of the edge-detection interrupts pass a nonzero value for
-either CR_RANGE or CR_AREF for edge-up polarity, or a zero value for both
-CR_RANGE and CR_AREF if you want edge-down polarity.
-
-In the 48-channel version:
-
-On subdev 0, the first 24 channels channels are edge-detect channels.
-
-In the 96-channel board you have the collowing channels that can do edge detection:
-
-subdev 0, channels 0-24  (first 24 channels of 1st ASIC)
-subdev 2, channels 0-24  (first 24 channels of 2nd ASIC)
-
-Configuration Options:
-  [0] - I/O port base address
-  [1] - IRQ (for first ASIC, or first 24 channels)
-  [2] - IRQ for second ASIC (pcmuio96 only - IRQ for chans 48-72 .. can be the same as first irq!)
-*/
+ * Driver: pcmuio
+ * Description: Winsystems PC-104 based 48/96-channel DIO boards.
+ * Devices: (Winsystems) PCM-UIO48A [pcmuio48]
+ *         (Winsystems) PCM-UIO96A [pcmuio96]
+ * Author: Calin Culianu <calin@ajvar.org>
+ * Updated: Fri, 13 Jan 2006 12:01:01 -0500
+ * Status: works
+ *
+ * A driver for the relatively straightforward-to-program PCM-UIO48A and
+ * PCM-UIO96A boards from Winsystems. These boards use either one or two
+ * (in the 96-DIO version) WS16C48 ASIC HighDensity I/O Chips (HDIO). This
+ * chip is interesting in that each I/O line is individually programmable
+ * for INPUT or OUTPUT (thus comedi_dio_config can be done on a per-channel
+ * basis). Also, each chip supports edge-triggered interrupts for the first
+ * 24 I/O lines. Of course, since the 96-channel version of the board has
+ * two ASICs, it can detect polarity changes on up to 48 I/O lines. Since
+ * this is essentially a (non-PnP) ISA board, I/O Address and IRQ selection
+ * are done through jumpers on the board. You need to pass that information
+ * to this driver as the first and second comedi_config option, respectively.
+ * Note that the 48-channel version uses 16 bytes of IO memory and the 96-
+ * channel version uses 32 bytes (in case you are worried about conflicts).
+ * The 48-channel board is split into two 24-channel comedi subdevices. The
+ * 96-channel board is split into 4 24-channel DIO subdevices.
+ *
+ * Note that IRQ support has been added, but it is untested.
+ *
+ * To use edge-detection IRQ support, pass the IRQs of both ASICS (for the
+ * 96 channel version) or just 1 ASIC (for 48-channel version). Then, use
+ * comedi_commands with TRIG_NOW. Your callback will be called each time an
+ * edge is triggered, and the data values will be two sample_t's, which
+ * should be concatenated to form one 32-bit unsigned int.  This value is
+ * the mask of channels that had edges detected from your channel list. Note
+ * that the bits positions in the mask correspond to positions in your
+ * chanlist when you specified the command and *not* channel id's!
+ *
+ * To set the polarity of the edge-detection interrupts pass a nonzero value
+ * for either CR_RANGE or CR_AREF for edge-up polarity, or a zero value for
+ * both CR_RANGE and CR_AREF if you want edge-down polarity.
+ *
+ * In the 48-channel version:
+ *
+ * On subdev 0, the first 24 channels are edge-detect channels.
+ *
+ * In the 96-channel board you have the following channels that can do edge
+ * detection:
+ *
+ * subdev 0, channels 0-23  (first 24 channels of 1st ASIC)
+ * subdev 2, channels 0-23  (first 24 channels of 2nd ASIC)
+ *
+ * Configuration Options:
+ *  [0] - I/O port base address
+ *  [1] - IRQ (for first ASIC, or first 24 channels)
+ *  [2] - IRQ (for second ASIC, pcmuio96 only - IRQ for chans 48-71
+ *             can be the same as first irq!)
+ */
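On the reading side, the two sample_t values mentioned above combine into one 32-bit edge mask. The interrupt code later in this patch pushes the low halfword of the mask first (on a little-endian host), so a userspace sketch, under those assumptions:

/* one scan read from the comedi buffer: two 16-bit samples */
unsigned short sample[2];
unsigned int mask;

mask = (unsigned int)sample[0] | ((unsigned int)sample[1] << 16);
/* bit n of mask set => chanlist entry n saw an edge */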
 
 #include <linux/interrupt.h>
 #include <linux/slab.h>
@@ -82,71 +80,71 @@ Configuration Options:
 
 #include "comedi_fc.h"
 
-#define CHANS_PER_PORT   8
-#define PORTS_PER_ASIC   6
-#define INTR_PORTS_PER_ASIC   3
-#define MAX_CHANS_PER_SUBDEV 24        /* number of channels per comedi subdevice */
-#define PORTS_PER_SUBDEV (MAX_CHANS_PER_SUBDEV/CHANS_PER_PORT)
-#define CHANS_PER_ASIC (CHANS_PER_PORT*PORTS_PER_ASIC)
-#define INTR_CHANS_PER_ASIC 24
-#define INTR_PORTS_PER_SUBDEV (INTR_CHANS_PER_ASIC/CHANS_PER_PORT)
-#define MAX_DIO_CHANS   (PORTS_PER_ASIC*2*CHANS_PER_PORT)
-#define MAX_ASICS       (MAX_DIO_CHANS/CHANS_PER_ASIC)
-#define CALC_N_SUBDEVS(nchans) ((nchans)/MAX_CHANS_PER_SUBDEV + (!!((nchans)%MAX_CHANS_PER_SUBDEV)) /*+ (nchans > INTR_CHANS_PER_ASIC ? 2 : 1)*/)
+#define CHANS_PER_PORT         8
+#define PORTS_PER_ASIC         6
+#define INTR_PORTS_PER_ASIC    3
+/* number of channels per comedi subdevice */
+#define MAX_CHANS_PER_SUBDEV   24
+#define PORTS_PER_SUBDEV       (MAX_CHANS_PER_SUBDEV / CHANS_PER_PORT)
+#define CHANS_PER_ASIC         (CHANS_PER_PORT * PORTS_PER_ASIC)
+#define INTR_CHANS_PER_ASIC    24
+#define INTR_PORTS_PER_SUBDEV  (INTR_CHANS_PER_ASIC / CHANS_PER_PORT)
+#define MAX_DIO_CHANS          (PORTS_PER_ASIC * 2 * CHANS_PER_PORT)
+#define MAX_ASICS              (MAX_DIO_CHANS / CHANS_PER_ASIC)
+
 /* IO Memory sizes */
-#define ASIC_IOSIZE (0x10)
-#define PCMUIO48_IOSIZE ASIC_IOSIZE
-#define PCMUIO96_IOSIZE (ASIC_IOSIZE*2)
-
-/* Some offsets - these are all in the 16byte IO memory offset from
-   the base address.  Note that there is a paging scheme to swap out
-   offsets 0x8-0xA using the PAGELOCK register.  See the table below.
-
-  Register(s)       Pages        R/W?        Description
-  --------------------------------------------------------------
-  REG_PORTx         All          R/W         Read/Write/Configure IO
-  REG_INT_PENDING   All          ReadOnly    Quickly see which INT_IDx has int.
-  REG_PAGELOCK      All          WriteOnly   Select a page
-  REG_POLx          Pg. 1 only   WriteOnly   Select edge-detection polarity
-  REG_ENABx         Pg. 2 only   WriteOnly   Enable/Disable edge-detect. int.
-  REG_INT_IDx       Pg. 3 only   R/W         See which ports/bits have ints.
- */
-#define REG_PORT0 0x0
-#define REG_PORT1 0x1
-#define REG_PORT2 0x2
-#define REG_PORT3 0x3
-#define REG_PORT4 0x4
-#define REG_PORT5 0x5
-#define REG_INT_PENDING 0x6
-#define REG_PAGELOCK 0x7       /* page selector register, upper 2 bits select a page
-                                  and bits 0-5 are used to 'lock down' a particular
-                                  port above to make it readonly.  */
-#define REG_POL0 0x8
-#define REG_POL1 0x9
-#define REG_POL2 0xA
-#define REG_ENAB0 0x8
-#define REG_ENAB1 0x9
-#define REG_ENAB2 0xA
-#define REG_INT_ID0 0x8
-#define REG_INT_ID1 0x9
-#define REG_INT_ID2 0xA
-
-#define NUM_PAGED_REGS 3
-#define NUM_PAGES 4
-#define FIRST_PAGED_REG 0x8
-#define REG_PAGE_BITOFFSET 6
-#define REG_LOCK_BITOFFSET 0
-#define REG_PAGE_MASK (~((0x1<<REG_PAGE_BITOFFSET)-1))
-#define REG_LOCK_MASK ~(REG_PAGE_MASK)
-#define PAGE_POL 1
-#define PAGE_ENAB 2
-#define PAGE_INT_ID 3
+#define ASIC_IOSIZE            0x10
+#define PCMUIO48_IOSIZE                ASIC_IOSIZE
+#define PCMUIO96_IOSIZE                (ASIC_IOSIZE * 2)
 
 /*
- * Board descriptions for two imaginary boards.  Describing the
- * boards in this way is optional, and completely driver-dependent.
- * Some drivers use arrays such as this, other do not.
+ * Some offsets - these are all in the 16byte IO memory offset from
+ * the base address.  Note that there is a paging scheme to swap out
+ * offsets 0x8-0xA using the PAGELOCK register.  See the table below.
+ *
+ * Register(s)       Pages        R/W?        Description
+ * --------------------------------------------------------------------------
+ * REG_PORTx         All          R/W         Read/Write/Configure IO
+ * REG_INT_PENDING   All          ReadOnly    Which INT_IDx has int.
+ * REG_PAGELOCK      All          WriteOnly   Select a page
+ * REG_POLx          Pg. 1 only   WriteOnly   Select edge-detection polarity
+ * REG_ENABx         Pg. 2 only   WriteOnly   Enable/Disable edge-detect int.
+ * REG_INT_IDx       Pg. 3 only   R/W         See which ports/bits have ints.
  */
+#define REG_PORT0              0x0
+#define REG_PORT1              0x1
+#define REG_PORT2              0x2
+#define REG_PORT3              0x3
+#define REG_PORT4              0x4
+#define REG_PORT5              0x5
+#define REG_INT_PENDING                0x6
+/*
+ * page selector register
+ * Upper 2 bits select a page and bits 0-5 are used to
+ * 'lock down' a particular port above to make it readonly.
+ */
+#define REG_PAGELOCK           0x7
+#define REG_POL0               0x8
+#define REG_POL1               0x9
+#define REG_POL2               0xa
+#define REG_ENAB0              0x8
+#define REG_ENAB1              0x9
+#define REG_ENAB2              0xa
+#define REG_INT_ID0            0x8
+#define REG_INT_ID1            0x9
+#define REG_INT_ID2            0xa
+
+#define NUM_PAGED_REGS         3
+#define NUM_PAGES              4
+#define FIRST_PAGED_REG                0x8
+#define REG_PAGE_BITOFFSET     6
+#define REG_LOCK_BITOFFSET     0
+#define REG_PAGE_MASK          (~((0x1 << REG_PAGE_BITOFFSET) - 1))
+#define REG_LOCK_MASK          ~(REG_PAGE_MASK)
+#define PAGE_POL               1
+#define PAGE_ENAB              2
+#define PAGE_INT_ID            3
+
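Given the REG_PAGELOCK layout above (page select in the top two bits, per-port lock bits in 0-5) and the pagelock shadow kept in pcmuio_private, the switch_page helper called throughout this driver (its body is outside this hunk) presumably reduces to a sketch like:

static void switch_page(struct comedi_device *dev, int asic, int page)
{
	struct pcmuio_private *devpriv = dev->private;

	/* keep the lock bits, replace the page-select bits */
	devpriv->asics[asic].pagelock &= REG_LOCK_MASK;
	devpriv->asics[asic].pagelock |= page << REG_PAGE_BITOFFSET;

	outb(devpriv->asics[asic].pagelock,
	     devpriv->asics[asic].iobase + REG_PAGELOCK);
}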
 struct pcmuio_board {
        const char *name;
        const int num_asics;
@@ -154,22 +152,43 @@ struct pcmuio_board {
        const int num_ports;
 };
 
-/* this structure is for data unique to this subdevice.  */
+static const struct pcmuio_board pcmuio_boards[] = {
+       {
+               .name           = "pcmuio48",
+               .num_asics      = 1,
+               .num_ports      = 6,
+       }, {
+               .name           = "pcmuio96",
+               .num_asics      = 2,
+               .num_ports      = 12,
+       },
+};
+
 struct pcmuio_subdev_private {
        /* mapping of halfwords (bytes) in port/chanarray to iobase */
        unsigned long iobases[PORTS_PER_SUBDEV];
 
        /* The below is only used for intr subdevices */
        struct {
-               int asic;       /* if non-negative, this subdev has an interrupt asic */
-               int first_chan; /* if nonnegative, the first channel id for
-                                  interrupts. */
-               int num_asic_chans;     /* the number of asic channels in this subdev
-                                          that have interrutps */
-               int asic_chan;  /* if nonnegative, the first channel id with
-                                  respect to the asic that has interrupts */
-               int enabled_mask;       /* subdev-relative channel mask for channels
-                                          we are interested in */
+               /* if non-negative, this subdev has an interrupt asic */
+               int asic;
+               /* if nonnegative, the first channel id for interrupts */
+               int first_chan;
+               /*
+                * the number of asic channels in this
+                * subdev that have interrupts
+                */
+               int num_asic_chans;
+               /*
+                * if nonnegative, the first channel id with
+                * respect to the asic that has interrupts
+                */
+               int asic_chan;
+               /*
+                * subdev-relative channel mask for channels
+                * we are interested in
+                */
+               int enabled_mask;
                int active;
                int stop_count;
                int continuous;
@@ -177,14 +196,14 @@ struct pcmuio_subdev_private {
        } intr;
 };
 
-/* this structure is for data unique to this hardware driver.  If
-   several hardware drivers keep similar information in this structure,
-   feel free to suggest moving the variable to the struct comedi_device struct.  */
 struct pcmuio_private {
        struct {
-               unsigned char pagelock; /* current page and lock */
-               unsigned char pol[NUM_PAGED_REGS];      /* shadow of POLx registers */
-               unsigned char enab[NUM_PAGED_REGS];     /* shadow of ENABx registers */
+               /* current page and lock */
+               unsigned char pagelock;
+               /* shadow of POLx registers */
+               unsigned char pol[NUM_PAGED_REGS];
+               /* shadow of ENABx registers */
+               unsigned char enab[NUM_PAGED_REGS];
                int num;
                unsigned long iobase;
                unsigned int irq;
@@ -193,17 +212,11 @@ struct pcmuio_private {
        struct pcmuio_subdev_private *sprivs;
 };
 
-#define subpriv ((struct pcmuio_subdev_private *)s->private)
-
-/* DIO devices are slightly special.  Although it is possible to
- * implement the insn_read/insn_write interface, it is much more
- * useful to applications if you implement the insn_bits interface.
- * This allows packed reading/writing of the DIO channels.  The
- * comedi core can convert between insn_bits and insn_read/write */
 static int pcmuio_dio_insn_bits(struct comedi_device *dev,
                                struct comedi_subdevice *s,
                                struct comedi_insn *insn, unsigned int *data)
 {
+       struct pcmuio_subdev_private *subpriv = s->private;
        int byte_no;
 
        /* NOTE:
@@ -217,12 +230,6 @@ static int pcmuio_dio_insn_bits(struct comedi_device *dev,
        /* The insn data is a mask in data[0] and the new data
         * in data[1], each channel corresponding to a bit. */
 
-#ifdef DAMMIT_ITS_BROKEN
-       /* DEBUG */
-       dev_dbg(dev->class_dev, "write mask: %08x  data: %08x\n", data[0],
-               data[1]);
-#endif
-
        s->state = 0;
 
        for (byte_no = 0; byte_no < s->n_chan / CHANS_PER_PORT; ++byte_no) {
@@ -239,25 +246,11 @@ static int pcmuio_dio_insn_bits(struct comedi_device *dev,
 
                byte = inb(ioaddr);     /* read all 8-bits for this port */
 
-#ifdef DAMMIT_ITS_BROKEN
-               /* DEBUG */
-               printk
-                   ("byte %d wmb %02x db %02x offset %02d io %04x, data_in %02x ",
-                    byte_no, (unsigned)write_mask_byte, (unsigned)data_byte,
-                    offset, ioaddr, (unsigned)byte);
-#endif
-
                if (write_mask_byte) {
-                       /* this byte has some write_bits -- so set the output lines */
-                       byte &= ~write_mask_byte;       /* clear bits for write mask */
-                       byte |= ~data_byte & write_mask_byte;   /* set to inverted data_byte */
-                       /* Write out the new digital output state */
+                       byte &= ~write_mask_byte;
+                       byte |= ~data_byte & write_mask_byte;
                        outb(byte, ioaddr);
                }
-#ifdef DAMMIT_ITS_BROKEN
-               /* DEBUG */
-               dev_dbg(dev->class_dev, "data_out_byte %02x\n", (unsigned)byte);
-#endif
                /* save the digital input lines for this byte.. */
                s->state |= ((unsigned int)byte) << offset;
        }
@@ -265,23 +258,14 @@ static int pcmuio_dio_insn_bits(struct comedi_device *dev,
        /* now return the DIO lines to data[1] - note they came inverted! */
        data[1] = ~s->state;
 
-#ifdef DAMMIT_ITS_BROKEN
-       /* DEBUG */
-       dev_dbg(dev->class_dev, "s->state %08x data_out %08x\n", s->state,
-               data[1]);
-#endif
-
        return insn->n;
 }
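For reference, comedi's insn_bits convention as used above: data[0] is the mask of channels to update and data[1] carries the new output values in and the sampled input state out. The hardware here is active-low, hence the inversion on both the write and the read path. A caller's-eye example with hypothetical values:

unsigned int data[2];

data[0] = 1 << 3;	/* write mask: touch only channel 3 */
data[1] = 1 << 3;	/* drive channel 3 high */
/* after the insn, data[1] holds all input lines, already re-inverted
   to active-high by the handler above */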
 
-/* The input or output configuration of each digital line is
- * configured by a special insn_config instruction.  chanspec
- * contains the channel to be changed, and data[0] contains the
- * value COMEDI_INPUT or COMEDI_OUTPUT. */
 static int pcmuio_dio_insn_config(struct comedi_device *dev,
                                  struct comedi_subdevice *s,
                                  struct comedi_insn *insn, unsigned int *data)
 {
+       struct pcmuio_subdev_private *subpriv = s->private;
        int chan = CR_CHAN(insn->chanspec), byte_no = chan / 8, bit_no =
            chan % 8;
        unsigned long ioaddr;
@@ -315,9 +299,12 @@ static int pcmuio_dio_insn_config(struct comedi_device *dev,
                byte &= ~(1 << bit_no);
                                /**< set input channel to '0' */
 
-               /* write out byte -- this is the only time we actually affect the
-                  hardware as all channels are implicitly output -- but input
-                  channels are set to float-high */
+               /*
+                * write out byte
+                * This is the only time we actually affect the hardware
+                * as all channels are implicitly output -- but input
+                * channels are set to float-high.
+                */
                outb(byte, ioaddr);
 
                /* save to io_bits */
@@ -390,8 +377,8 @@ static void init_asics(struct comedi_device *dev)
                   outb(0xff, baseaddr + REG_ENAB0); */
                /* END DEBUG */
 
-               switch_page(dev, asic, 0);      /* switch back to default page 0 */
-
+               /* switch back to default page 0 */
+               switch_page(dev, asic, 0);
        }
 }
 
@@ -431,8 +418,9 @@ static void unlock_port(struct comedi_device *dev, int asic, int port)
 static void pcmuio_stop_intr(struct comedi_device *dev,
                             struct comedi_subdevice *s)
 {
-       int nports, firstport, asic, port;
        struct pcmuio_private *devpriv = dev->private;
+       struct pcmuio_subdev_private *subpriv = s->private;
+       int nports, firstport, asic, port;
 
        asic = subpriv->intr.asic;
        if (asic < 0)
@@ -450,152 +438,131 @@ static void pcmuio_stop_intr(struct comedi_device *dev,
        }
 }
 
-static irqreturn_t interrupt_pcmuio(int irq, void *d)
+static void pcmuio_handle_intr_subdev(struct comedi_device *dev,
+                                     struct comedi_subdevice *s,
+                                     unsigned triggered)
+{
+       struct pcmuio_subdev_private *subpriv = s->private;
+       unsigned int len = s->async->cmd.chanlist_len;
+       unsigned oldevents = s->async->events;
+       unsigned int val = 0;
+       unsigned long flags;
+       unsigned mytrig;
+       unsigned int i;
+
+       spin_lock_irqsave(&subpriv->intr.spinlock, flags);
+
+       if (!subpriv->intr.active)
+               goto done;
+
+       mytrig = triggered >> subpriv->intr.asic_chan;
+       mytrig &= ((0x1 << subpriv->intr.num_asic_chans) - 1);
+       mytrig <<= subpriv->intr.first_chan;
+
+       if (!(mytrig & subpriv->intr.enabled_mask))
+               goto done;
+
+       for (i = 0; i < len; i++) {
+               unsigned int chan = CR_CHAN(s->async->cmd.chanlist[i]);
+               if (mytrig & (1U << chan))
+                       val |= (1U << i);
+       }
+
+       /* Write the scan to the buffer. */
+       if (comedi_buf_put(s->async, ((short *)&val)[0]) &&
+           comedi_buf_put(s->async, ((short *)&val)[1])) {
+               s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
+       } else {
+               /* Overflow! Stop acquisition!! */
+               /* TODO: STOP_ACQUISITION_CALL_HERE!! */
+               pcmuio_stop_intr(dev, s);
+       }
+
+       /* Check for end of acquisition. */
+       if (!subpriv->intr.continuous) {
+               /* stop_src == TRIG_COUNT */
+               if (subpriv->intr.stop_count > 0) {
+                       subpriv->intr.stop_count--;
+                       if (subpriv->intr.stop_count == 0) {
+                               s->async->events |= COMEDI_CB_EOA;
+                               /* TODO: STOP_ACQUISITION_CALL_HERE!! */
+                               pcmuio_stop_intr(dev, s);
+                       }
+               }
+       }
+
+done:
+       spin_unlock_irqrestore(&subpriv->intr.spinlock, flags);
+
+       if (oldevents != s->async->events)
+               comedi_event(dev, s);
+}
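The continuous/stop_count bookkeeping above implements comedi's stop_src semantics (TRIG_NONE vs TRIG_COUNT). A heavily hedged comedilib sketch of arming such an acquisition; the trigger sources this driver actually accepts are validated in pcmuio_cmdtest, outside this hunk, so the source values, and the device/chanlist variables, are illustrative assumptions:

comedi_cmd cmd;

memset(&cmd, 0, sizeof(cmd));
cmd.subdev         = 0;          /* an edge-detect capable subdevice */
cmd.start_src      = TRIG_NOW;   /* per the driver documentation above */
cmd.scan_begin_src = TRIG_EXT;   /* one scan per detected edge (assumed) */
cmd.convert_src    = TRIG_NOW;
cmd.scan_end_src   = TRIG_COUNT;
cmd.scan_end_arg   = n_chan;
cmd.stop_src       = TRIG_COUNT; /* presumably becomes intr.stop_count */
cmd.stop_arg       = 100;        /* stop after 100 scans */
cmd.chanlist       = chanlist;
cmd.chanlist_len   = n_chan;

if (comedi_command(device, &cmd) < 0)
	comedi_perror("comedi_command");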
+
+static int pcmuio_handle_asic_interrupt(struct comedi_device *dev, int asic)
 {
-       int asic, got1 = 0;
-       struct comedi_device *dev = (struct comedi_device *)d;
        struct pcmuio_private *devpriv = dev->private;
+       struct pcmuio_subdev_private *subpriv;
+       unsigned long iobase = devpriv->asics[asic].iobase;
+       unsigned triggered = 0;
+       int got1 = 0;
+       unsigned long flags;
+       unsigned char int_pend;
        int i;
 
-       for (asic = 0; asic < MAX_ASICS; ++asic) {
-               if (irq == devpriv->asics[asic].irq) {
-                       unsigned long flags;
-                       unsigned triggered = 0;
-                       unsigned long iobase = devpriv->asics[asic].iobase;
-                       /* it is an interrupt for ASIC #asic */
-                       unsigned char int_pend;
-
-                       spin_lock_irqsave(&devpriv->asics[asic].spinlock,
-                                         flags);
-
-                       int_pend = inb(iobase + REG_INT_PENDING) & 0x07;
-
-                       if (int_pend) {
-                               int port;
-                               for (port = 0; port < INTR_PORTS_PER_ASIC;
-                                    ++port) {
-                                       if (int_pend & (0x1 << port)) {
-                                               unsigned char
-                                                   io_lines_with_edges = 0;
-                                               switch_page(dev, asic,
-                                                           PAGE_INT_ID);
-                                               io_lines_with_edges =
-                                                   inb(iobase +
-                                                       REG_INT_ID0 + port);
-
-                                               if (io_lines_with_edges)
-                                                       /* clear pending interrupt */
-                                                       outb(0, iobase +
-                                                            REG_INT_ID0 +
-                                                            port);
-
-                                               triggered |=
-                                                   io_lines_with_edges <<
-                                                   port * 8;
-                                       }
-                               }
-
-                               ++got1;
+       spin_lock_irqsave(&devpriv->asics[asic].spinlock, flags);
+
+       int_pend = inb(iobase + REG_INT_PENDING) & 0x07;
+       if (int_pend) {
+               for (i = 0; i < INTR_PORTS_PER_ASIC; ++i) {
+                       if (int_pend & (0x1 << i)) {
+                               unsigned char val;
+
+                               switch_page(dev, asic, PAGE_INT_ID);
+                               val = inb(iobase + REG_INT_ID0 + i);
+                               if (val)
+                                       /* clear pending interrupt */
+                                       outb(0, iobase + REG_INT_ID0 + i);
+
+                               triggered |= (val << (i * 8));
                        }
+               }
 
-                       spin_unlock_irqrestore(&devpriv->asics[asic].spinlock,
-                                              flags);
-
-                       if (triggered) {
-                               struct comedi_subdevice *s;
-                               /* TODO here: dispatch io lines to subdevs with commands.. */
-                               printk
-                                   ("PCMUIO DEBUG: got edge detect interrupt %d asic %d which_chans: %06x\n",
-                                    irq, asic, triggered);
-                               for (i = 0; i < dev->n_subdevices; i++) {
-                                       s = &dev->subdevices[i];
-                                       if (subpriv->intr.asic == asic) {       /* this is an interrupt subdev, and it matches this asic! */
-                                               unsigned long flags;
-                                               unsigned oldevents;
-
-                                               spin_lock_irqsave(&subpriv->
-                                                                 intr.spinlock,
-                                                                 flags);
-
-                                               oldevents = s->async->events;
-
-                                               if (subpriv->intr.active) {
-                                                       unsigned mytrig =
-                                                           ((triggered >>
-                                                             subpriv->intr.asic_chan)
-                                                            &
-                                                            ((0x1 << subpriv->
-                                                              intr.
-                                                              num_asic_chans) -
-                                                             1)) << subpriv->
-                                                           intr.first_chan;
-                                                       if (mytrig &
-                                                           subpriv->intr.enabled_mask)
-                                                       {
-                                                               unsigned int val
-                                                                   = 0;
-                                                               unsigned int n,
-                                                                   ch, len;
-
-                                                               len =
-                                                                   s->
-                                                                   async->cmd.chanlist_len;
-                                                               for (n = 0;
-                                                                    n < len;
-                                                                    n++) {
-                                                                       ch = CR_CHAN(s->async->cmd.chanlist[n]);
-                                                                       if (mytrig & (1U << ch)) {
-                                                                               val |= (1U << n);
-                                                                       }
-                                                               }
-                                                               /* Write the scan to the buffer. */
-                                                               if (comedi_buf_put(s->async, ((short *)&val)[0])
-                                                                   &&
-                                                                   comedi_buf_put
-                                                                   (s->async,
-                                                                    ((short *)
-                                                                     &val)[1]))
-                                                               {
-                                                                       s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
-                                                               } else {
-                                                                       /* Overflow! Stop acquisition!! */
-                                                                       /* TODO: STOP_ACQUISITION_CALL_HERE!! */
-                                                                       pcmuio_stop_intr
-                                                                           (dev,
-                                                                            s);
-                                                               }
-
-                                                               /* Check for end of acquisition. */
-                                                               if (!subpriv->intr.continuous) {
-                                                                       /* stop_src == TRIG_COUNT */
-                                                                       if (subpriv->intr.stop_count > 0) {
-                                                                               subpriv->intr.stop_count--;
-                                                                               if (subpriv->intr.stop_count == 0) {
-                                                                                       s->async->events |= COMEDI_CB_EOA;
-                                                                                       /* TODO: STOP_ACQUISITION_CALL_HERE!! */
-                                                                                       pcmuio_stop_intr
-                                                                                           (dev,
-                                                                                            s);
-                                                                               }
-                                                                       }
-                                                               }
-                                                       }
-                                               }
-
-                                               spin_unlock_irqrestore
-                                                   (&subpriv->intr.spinlock,
-                                                    flags);
-
-                                               if (oldevents !=
-                                                   s->async->events) {
-                                                       comedi_event(dev, s);
-                                               }
-
-                                       }
-
-                               }
+               ++got1;
+       }
+
+       spin_unlock_irqrestore(&devpriv->asics[asic].spinlock, flags);
+
+       if (triggered) {
+               struct comedi_subdevice *s;
+               /* TODO here: dispatch io lines to subdevs with commands.. */
+               for (i = 0; i < dev->n_subdevices; i++) {
+                       s = &dev->subdevices[i];
+                       subpriv = s->private;
+                       if (subpriv->intr.asic == asic) {
+                               /*
+                                * This is an interrupt subdev, and it
+                                * matches this asic!
+                                */
+                               pcmuio_handle_intr_subdev(dev, s,
+                                                         triggered);
                        }
+               }
+       }
+       return got1;
+}
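
The rework above splits the old deeply nested interrupt handler in two:
pcmuio_handle_asic_interrupt() latches the triggered channels under the
per-ASIC spinlock and then hands each matching interrupt subdevice to
pcmuio_handle_intr_subdev(), so the scan-assembly logic no longer sits
many indentation levels deep.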
 
+static irqreturn_t interrupt_pcmuio(int irq, void *d)
+{
+       struct comedi_device *dev = d;
+       struct pcmuio_private *devpriv = dev->private;
+       int got1 = 0;
+       int asic;
+
+       for (asic = 0; asic < MAX_ASICS; ++asic) {
+               if (irq == devpriv->asics[asic].irq) {
+                       /* it is an interrupt for ASIC #asic */
+                       if (pcmuio_handle_asic_interrupt(dev, asic))
+                               got1++;
                }
        }
        if (!got1)
@@ -607,6 +574,7 @@ static int pcmuio_start_intr(struct comedi_device *dev,
                             struct comedi_subdevice *s)
 {
        struct pcmuio_private *devpriv = dev->private;
+       struct pcmuio_subdev_private *subpriv = s->private;
 
        if (!subpriv->intr.continuous && subpriv->intr.stop_count == 0) {
                /* An empty acquisition! */
@@ -660,6 +628,7 @@ static int pcmuio_start_intr(struct comedi_device *dev,
 
 static int pcmuio_cancel(struct comedi_device *dev, struct comedi_subdevice *s)
 {
+       struct pcmuio_subdev_private *subpriv = s->private;
        unsigned long flags;
 
        spin_lock_irqsave(&subpriv->intr.spinlock, flags);
@@ -677,6 +646,7 @@ static int
 pcmuio_inttrig_start_intr(struct comedi_device *dev, struct comedi_subdevice *s,
                          unsigned int trignum)
 {
+       struct pcmuio_subdev_private *subpriv = s->private;
        unsigned long flags;
        int event = 0;
 
@@ -701,6 +671,7 @@ pcmuio_inttrig_start_intr(struct comedi_device *dev, struct comedi_subdevice *s,
  */
 static int pcmuio_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
 {
+       struct pcmuio_subdev_private *subpriv = s->private;
        struct comedi_cmd *cmd = &s->async->cmd;
        unsigned long flags;
        int event = 0;
@@ -797,8 +768,9 @@ static int pcmuio_cmdtest(struct comedi_device *dev,
 static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
        const struct pcmuio_board *board = comedi_board(dev);
-       struct pcmuio_private *devpriv;
        struct comedi_subdevice *s;
+       struct pcmuio_private *devpriv;
+       struct pcmuio_subdev_private *subpriv;
        int sdev_no, chans_left, n_subdevs, port, asic, thisasic_chanct = 0;
        unsigned int irq[MAX_ASICS];
        int ret;
@@ -819,14 +791,12 @@ static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
        for (asic = 0; asic < MAX_ASICS; ++asic) {
                devpriv->asics[asic].num = asic;
                devpriv->asics[asic].iobase = dev->iobase + asic * ASIC_IOSIZE;
-               devpriv->asics[asic].irq = 0;   /* this gets actually set at the end of
-                                                  this function when we
-                                                  request_irqs */
                spin_lock_init(&devpriv->asics[asic].spinlock);
        }
 
        chans_left = CHANS_PER_ASIC * board->num_asics;
-       n_subdevs = CALC_N_SUBDEVS(chans_left);
+       n_subdevs = (chans_left / MAX_CHANS_PER_SUBDEV) +
+                   (!!(chans_left % MAX_CHANS_PER_SUBDEV));
        devpriv->sprivs = kcalloc(n_subdevs,
                                  sizeof(struct pcmuio_subdev_private),
                                  GFP_KERNEL);
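
The n_subdevs computation above is a plain ceiling division. A minimal
equivalent sketch (assuming MAX_CHANS_PER_SUBDEV is non-zero), using the
DIV_ROUND_UP() helper from <linux/kernel.h>:

	/* one subdevice per MAX_CHANS_PER_SUBDEV channels, rounded up */
	n_subdevs = DIV_ROUND_UP(chans_left, MAX_CHANS_PER_SUBDEV);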
@@ -843,7 +813,8 @@ static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
                int byte_no;
 
                s = &dev->subdevices[sdev_no];
-               s->private = &devpriv->sprivs[sdev_no];
+               subpriv = &devpriv->sprivs[sdev_no];
+               s->private = subpriv;
                s->maxdata = 1;
                s->range_table = &range_digital;
                s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
@@ -860,7 +831,8 @@ static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 
                /* save the ioport address for each 'port' of 8 channels in the
                   subdevice */
-               for (byte_no = 0; byte_no < PORTS_PER_SUBDEV; ++byte_no, ++port) {
+               for (byte_no = 0; byte_no < PORTS_PER_SUBDEV;
+                    ++byte_no, ++port) {
                        if (port >= PORTS_PER_ASIC) {
                                port = 0;
                                ++asic;
@@ -872,7 +844,7 @@ static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
                        if (thisasic_chanct <
                            CHANS_PER_PORT * INTR_PORTS_PER_ASIC
                            && subpriv->intr.asic < 0) {
-                               /* this is an interrupt subdevice, so setup the struct */
+                               /* setup the interrupt subdevice */
                                subpriv->intr.asic = asic;
                                subpriv->intr.active = 0;
                                subpriv->intr.stop_count = 0;
@@ -894,7 +866,8 @@ static int pcmuio_attach(struct comedi_device *dev, struct comedi_devconfig *it)
                chans_left -= s->n_chan;
 
                if (!chans_left) {
-                       asic = 0;       /* reset the asic to our first asic, to do intr subdevs */
+                       /* reset to our first asic, to do intr subdevs */
+                       asic = 0;
                        port = 0;
                }
 
@@ -944,18 +917,6 @@ static void pcmuio_detach(struct comedi_device *dev)
        comedi_legacy_detach(dev);
 }
 
-static const struct pcmuio_board pcmuio_boards[] = {
-       {
-               .name           = "pcmuio48",
-               .num_asics      = 1,
-               .num_ports      = 6,
-       }, {
-               .name           = "pcmuio96",
-               .num_asics      = 2,
-               .num_ports      = 12,
-       },
-};
-
 static struct comedi_driver pcmuio_driver = {
        .driver_name    = "pcmuio",
        .module         = THIS_MODULE,
index ff76fbb4b3efa0f0b1be5a6f2b316fb357a07719..fbcf2506980709236e8ab41a689f0b84f0689989 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #ifndef _PLX9052_H_
index b55a16baeb144fc92ed206293eb2118448cdef33..005fbefae295fc4f0968728a956f06a916a2bfc4 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 /*
 Driver: poc
 Description: Generic driver for very simple devices
 Author: ds
-Devices: [Keithley Metrabyte] DAC-02 (dac02), [Advantech] PCL-733 (pcl733),
-  PCL-734 (pcl734)
+Devices: [Keithley Metrabyte] DAC-02 (dac02)
 Updated: Sat, 16 Mar 2002 17:34:48 -0800
 Status: unknown
 
 This driver is intended to support very simple ISA-based devices,
 including:
   dac02 - Keithley DAC-02 analog output board
-  pcl733 - Advantech PCL-733
-  pcl734 - Advantech PCL-734
 
 Configuration options:
   [0] - I/O port base
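
As a usage note (the device node and port address below are illustrative,
not part of this patch), the remaining dac02 board is attached through the
legacy comedi_config tool, passing the I/O port base as option [0]:

	comedi_config /dev/comedi0 dac02 0x300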
@@ -101,39 +94,6 @@ static int dac02_ao_winsn(struct comedi_device *dev, struct comedi_subdevice *s,
        return 1;
 }
 
-static int pcl733_insn_bits(struct comedi_device *dev,
-                           struct comedi_subdevice *s,
-                           struct comedi_insn *insn, unsigned int *data)
-{
-       data[1] = inb(dev->iobase + 0);
-       data[1] |= (inb(dev->iobase + 1) << 8);
-       data[1] |= (inb(dev->iobase + 2) << 16);
-       data[1] |= (inb(dev->iobase + 3) << 24);
-
-       return insn->n;
-}
-
-static int pcl734_insn_bits(struct comedi_device *dev,
-                           struct comedi_subdevice *s,
-                           struct comedi_insn *insn, unsigned int *data)
-{
-       if (data[0]) {
-               s->state &= ~data[0];
-               s->state |= (data[0] & data[1]);
-               if ((data[0] >> 0) & 0xff)
-                       outb((s->state >> 0) & 0xff, dev->iobase + 0);
-               if ((data[0] >> 8) & 0xff)
-                       outb((s->state >> 8) & 0xff, dev->iobase + 1);
-               if ((data[0] >> 16) & 0xff)
-                       outb((s->state >> 16) & 0xff, dev->iobase + 2);
-               if ((data[0] >> 24) & 0xff)
-                       outb((s->state >> 24) & 0xff, dev->iobase + 3);
-       }
-       data[1] = s->state;
-
-       return insn->n;
-}
-
 static int poc_attach(struct comedi_device *dev, struct comedi_devconfig *it)
 {
        const struct boarddef_struct *board = comedi_board(dev);
@@ -180,22 +140,6 @@ static const struct boarddef_struct boards[] = {
                .winsn          = dac02_ao_winsn,
                .rinsn          = readback_insn,
                .range          = &range_unknown,
-       }, {
-               .name           = "pcl733",
-               .iosize         = 4,
-               .type           = COMEDI_SUBD_DI,
-               .n_chan         = 32,
-               .n_bits         = 1,
-               .insnbits       = pcl733_insn_bits,
-               .range          = &range_digital,
-       }, {
-               .name           = "pcl734",
-               .iosize         = 4,
-               .type           = COMEDI_SUBD_DO,
-               .n_chan         = 32,
-               .n_bits         = 1,
-               .insnbits       = pcl734_insn_bits,
-               .range          = &range_digital,
        },
 };
 
index 30a17284fac9ac48ad6ba3864850e1f9004cd9ac..9b93a1fc4a599856e9957ee3fd3727d90b9da7dd 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index f4163fd35a00b706968d03fc9aa5aba565d2851f..f698c7fc57261bbcccee66e7a84aae07e1059f93 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
index 46dbbe6cdd76593333079f4c4fb67885e799c7a8..9e7445055482a35e46fa661dff12375a080ce28c 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: rti802
index d240ce87bd68a75a2d470aaa2591e73cd40db914..e1587e58a732aa3e76fc22d57c3ad2aaf0f0ea07 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: s526
index 0cf4b3d1279af3b6331d00d67749d90b47cc8db2..48c4b70b736aad1163a6be02eceb7ba25cf1c688 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 99cd57b092ea7e40bd13bce47d9041f6416dc99f..d2756b83b62d13625d4a5ef775268fd0a228b330 100644 (file)
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 8900086374db8a581d9781f2499d8f2e6a5791ec..b4f5fe35b0fa18fb8ac5f633ee87d8904f233a7a 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index dbc8c54d6da7f9509d44b1cca3cf061d0d69ed1b..06aee302bbc23c696d85b53161c64b028d6dfb7c 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: skel
index a76df092a57b149bc131f5460d0fa30273097a4d..45c661cbdbb9f4c533f5fafc3d60f7cea2f9d24f 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: ssv_dnp
index 0c243477cbe5cb69674d59bf7aefcafba38a257a..d05d46d0a1f264fb685a8c1014e865e8256453dc 100644 (file)
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
  *  GNU General Public License for more details.                           *
  *                                                                         *
- *  You should have received a copy of the GNU General Public License      *
- *  along with this program; if not, write to the Free Software            *
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.              *
- *                                                                         *
  ***************************************************************************/
 /*
 
index 6f5da67e26cba8ab03fa39abf2532391ce5cb80a..279e5bd493fa8682cfa6a9d3e1095265546736cc 100644 (file)
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
  */
 /*
 Driver: usbdux
@@ -94,7 +89,6 @@ sampling rate. If you sample two channels you get 4kHz and so on.
 #include <linux/usb.h>
 #include <linux/fcntl.h>
 #include <linux/compiler.h>
-#include <linux/firmware.h>
 
 #include "../comedidev.h"
 
@@ -727,154 +721,82 @@ static void usbduxsub_ao_isoc_irq(struct urb *urb)
        }
 }
 
-static int usbduxsub_start(struct usbduxsub *usbduxsub)
-{
-       int errcode = 0;
-       uint8_t *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to zero */
-       *local_transfer_buffer = 0;
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 /* create a pipe for a control transfer */
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* bRequest, "Firmware" */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* Value */
-                                 USBDUXSUB_CPUCS,
-                                 /* Index */
-                                 0x0000,
-                                 /* address of the transfer buffer */
-                                 local_transfer_buffer,
-                                 /* Length */
-                                 1,
-                                 /* Timeout */
-                                 BULK_TIMEOUT);
-       if (errcode < 0)
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: control msg failed (start)\n");
-
-       kfree(local_transfer_buffer);
-       return errcode;
-}
-
-static int usbduxsub_stop(struct usbduxsub *usbduxsub)
-{
-       int errcode = 0;
-       uint8_t *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to one */
-       *local_transfer_buffer = 1;
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* bRequest, "Firmware" */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* Value */
-                                 USBDUXSUB_CPUCS,
-                                 /* Index */
-                                 0x0000, local_transfer_buffer,
-                                 /* Length */
-                                 1,
-                                 /* Timeout */
-                                 BULK_TIMEOUT);
-       if (errcode < 0)
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: control msg failed (stop)\n");
-
-       kfree(local_transfer_buffer);
-       return errcode;
-}
-
-static int usbduxsub_upload(struct usbduxsub *usbduxsub,
-                           uint8_t *local_transfer_buffer,
-                           unsigned int start_addr, unsigned int len)
-{
-       int errcode;
-
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* brequest, firmware */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* value */
-                                 start_addr,
-                                 /* index */
-                                 0x0000,
-                                 /* our local safe buffer */
-                                 local_transfer_buffer,
-                                 /* length */
-                                 len,
-                                 /* timeout */
-                                 BULK_TIMEOUT);
-       dev_dbg(&usbduxsub->interface->dev, "comedi_: result=%d\n", errcode);
-       if (errcode < 0) {
-               dev_err(&usbduxsub->interface->dev, "comedi_: upload failed\n");
-               return errcode;
-       }
-       return 0;
-}
-
 #define FIRMWARE_MAX_LEN 0x2000
 
-static int firmware_upload(struct usbduxsub *usbduxsub,
-                         const u8 *firmware_binary, int size_firmware)
+static int usbdux_firmware_upload(struct comedi_device *dev,
+                                 const u8 *data, size_t size,
+                                 unsigned long context)
 {
+       struct usbduxsub *usbduxsub = dev->private;
+       struct usb_device *usb = usbduxsub->usbdev;
+       uint8_t *buf;
+       uint8_t *tmp;
        int ret;
-       uint8_t *fw_buf;
 
-       if (!firmware_binary)
+       if (!data)
                return 0;
 
-       if (size_firmware > FIRMWARE_MAX_LEN) {
+       if (size > FIRMWARE_MAX_LEN) {
                dev_err(&usbduxsub->interface->dev,
                        "usbdux firmware binary it too large for FX2.\n");
                return -ENOMEM;
        }
 
        /* we generate a local buffer for the firmware */
-       fw_buf = kmemdup(firmware_binary, size_firmware, GFP_KERNEL);
-       if (!fw_buf) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: mem alloc for firmware failed\n");
+       buf = kmemdup(data, size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       /* we need a malloc'ed buffer for usb_control_msg() */
+       tmp = kmalloc(1, GFP_KERNEL);
+       if (!tmp) {
+               kfree(buf);
                return -ENOMEM;
        }
 
-       ret = usbduxsub_stop(usbduxsub);
+       /* stop the current firmware on the device */
+       *tmp = 1;       /* 7f92 to one */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             BULK_TIMEOUT);
        if (ret < 0) {
                dev_err(&usbduxsub->interface->dev,
                        "comedi_: can not stop firmware\n");
-               kfree(fw_buf);
-               return ret;
+               goto done;
        }
 
-       ret = usbduxsub_upload(usbduxsub, fw_buf, 0, size_firmware);
+       /* upload the new firmware to the device */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             0, 0x0000,
+                             buf, size,
+                             BULK_TIMEOUT);
        if (ret < 0) {
                dev_err(&usbduxsub->interface->dev,
                        "comedi_: firmware upload failed\n");
-               kfree(fw_buf);
-               return ret;
+               goto done;
        }
-       ret = usbduxsub_start(usbduxsub);
-       if (ret < 0) {
+
+       /* start the new firmware on the device */
+       *tmp = 0;       /* 7f92 to zero */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             BULK_TIMEOUT);
+       if (ret < 0)
                dev_err(&usbduxsub->interface->dev,
                        "comedi_: can not start firmware\n");
-               kfree(fw_buf);
-               return ret;
-       }
-       kfree(fw_buf);
-       return 0;
+
+done:
+       kfree(tmp);
+       kfree(buf);
+       return ret;
 }
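
The three usb_control_msg() calls above follow the usual Cypress FX2
download sequence: write 1 to the CPUCS register (USBDUXSUB_CPUCS) to hold
the 8051 core in reset, send the image to address 0, then write 0 to CPUCS
to release the reset. A minimal sketch of the step the first and last calls
share (the helper name is illustrative, not part of this patch):

	/* hold (on=1) or release (on=0) the FX2's 8051 core via CPUCS */
	static int fx2_set_reset(struct usb_device *usb, uint8_t *tmp, int on)
	{
		*tmp = on;	/* must be a kmalloc'ed buffer, not stack */
		return usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
				       USBDUXSUB_FIRMWARE, VENDOR_DIR_OUT,
				       USBDUXSUB_CPUCS, 0x0000,
				       tmp, 1, BULK_TIMEOUT);
	}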
 
 static int usbduxsub_submit_inurbs(struct usbduxsub *usbduxsub)
@@ -2328,13 +2250,21 @@ static int usbdux_auto_attach(struct comedi_device *dev,
                              unsigned long context_unused)
 {
        struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+       struct usbduxsub *this_usbduxsub = usb_get_intfdata(uinterf);
+       struct usb_device *usb = this_usbduxsub->usbdev;
        int ret;
-       struct usbduxsub *this_usbduxsub;
+
+       dev->private = this_usbduxsub;  /* This is temporary... */
+       ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+                                  usbdux_firmware_upload, 0);
+       if (ret < 0) {
+               dev->private = NULL;
+               return ret;
+       }
 
        dev->private = NULL;
 
        down(&start_stop_sem);
-       this_usbduxsub = usb_get_intfdata(uinterf);
        if (!this_usbduxsub || !this_usbduxsub->probed) {
                dev_err(dev->class_dev,
                        "usbdux: error: auto_attach failed, not connected\n");
@@ -2369,35 +2299,6 @@ static struct comedi_driver usbdux_driver = {
        .detach         = usbdux_detach,
 };
 
-static void usbdux_firmware_request_complete_handler(const struct firmware *fw,
-                                                    void *context)
-{
-       struct usbduxsub *usbduxsub_tmp = context;
-       struct usb_interface *uinterf = usbduxsub_tmp->interface;
-       int ret;
-
-       if (fw == NULL) {
-               dev_err(&uinterf->dev,
-                       "Firmware complete handler without firmware!\n");
-               return;
-       }
-
-       /*
-        * we need to upload the firmware here because fw will be
-        * freed once we've left this function
-        */
-       ret = firmware_upload(usbduxsub_tmp, fw->data, fw->size);
-
-       if (ret) {
-               dev_err(&uinterf->dev,
-                       "Could not upload firmware (err=%d)\n", ret);
-               goto out;
-       }
-       comedi_usb_auto_config(uinterf, &usbdux_driver, 0);
- out:
-       release_firmware(fw);
-}
-
 static int usbdux_usb_probe(struct usb_interface *uinterf,
                            const struct usb_device_id *id)
 {
@@ -2405,7 +2306,6 @@ static int usbdux_usb_probe(struct usb_interface *uinterf,
        struct device *dev = &uinterf->dev;
        int i;
        int index;
-       int ret;
 
        dev_dbg(dev, "comedi_: usbdux_: "
                "finding a free structure for the usb-device\n");
@@ -2622,23 +2522,7 @@ static int usbdux_usb_probe(struct usb_interface *uinterf,
        usbduxsub[index].probed = 1;
        up(&start_stop_sem);
 
-       ret = request_firmware_nowait(THIS_MODULE,
-                                     FW_ACTION_HOTPLUG,
-                                     FIRMWARE,
-                                     &udev->dev,
-                                     GFP_KERNEL,
-                                     usbduxsub + index,
-                                     usbdux_firmware_request_complete_handler);
-
-       if (ret) {
-               dev_err(dev, "Could not load firmware (err=%d)\n", ret);
-               return ret;
-       }
-
-       dev_info(dev, "comedi_: usbdux%d "
-                "has been successfully initialised.\n", index);
-       /* success */
-       return 0;
+       return comedi_usb_auto_config(uinterf, &usbdux_driver, 0);
 }
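
With this change the probe path no longer calls request_firmware_nowait();
comedi_usb_auto_config() leads into usbdux_auto_attach(), which now loads
and uploads the firmware synchronously through comedi_load_firmware()
before the comedi device becomes usable.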
 
 static void usbdux_usb_disconnect(struct usb_interface *intf)
index 7f95af33085d42e6010c0e9b8ddad3dc302ac210..27898c44e543f97ee200e89573b7a64089b9a014 100644 (file)
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
 /*
@@ -40,7 +36,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
-#include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -60,6 +55,7 @@
  * constants for "firmware" upload and download
  */
 #define FIRMWARE               "usbduxfast_firmware.bin"
+#define FIRMWARE_MAX_LEN       0x2000
 #define USBDUXFASTSUB_FIRMWARE 0xA0
 #define VENDOR_DIR_IN          0xC0
 #define VENDOR_DIR_OUT         0x40
 /*
  * size of the buffer for the dux commands in bytes
  */
-#define SIZEOFDUXBUFFER        256
+#define SIZEOFDUXBUF   256
 
 /*
  * number of in-URBs which receive the data: min=5
  */
 #define NUMOFINBUFFERSHIGH     10
 
-/*
- * total number of usbduxfast devices
- */
-#define NUMUSBDUXFAST  16
-
-/*
- * analogue in subdevice
- */
-#define SUBDEV_AD      0
-
 /*
  * min delay steps for more than one channel
  * basically when the mux gives up ;-)
@@ -161,143 +147,83 @@ static const struct comedi_lrange range_usbduxfast_ai_range = {
  * this is the structure which holds all the data of this driver
  * one sub device just now: A/D
  */
-struct usbduxfastsub_s {
-       int attached;           /* is attached? */
-       int probed;             /* is it associated with a subdevice? */
-       struct usb_device *usbdev;      /* pointer to the usb-device */
-       struct urb *urbIn;      /* BULK-transfer handling: urb */
-       int8_t *transfer_buffer;
-       int16_t *insnBuffer;    /* input buffer for single insn */
-       int ifnum;              /* interface number */
-       struct usb_interface *interface;        /* interface structure */
-       /* comedi device for the interrupt context */
-       struct comedi_device *comedidev;
+struct usbduxfast_private {
+       struct urb *urb;        /* BULK-transfer handling: urb */
+       uint8_t *duxbuf;
+       int8_t *inbuf;
        short int ai_cmd_running;       /* asynchronous command is running */
        short int ai_continous; /* continuous acquisition */
        long int ai_sample_count;       /* number of samples to acquire */
-       uint8_t *dux_commands;  /* commands */
        int ignore;             /* counter which ignores the first
                                   buffers */
        struct semaphore sem;
 };
 
-/*
- * The pointer to the private usb-data of the driver
- * is also the private data for the comedi-device.
- * This has to be global as the usb subsystem needs
- * global variables. The other reason is that this
- * structure must be there _before_ any comedi
- * command is issued. The usb subsystem must be
- * initialised before comedi can access it.
- */
-static struct usbduxfastsub_s usbduxfastsub[NUMUSBDUXFAST];
-
-static DEFINE_SEMAPHORE(start_stop_sem);
-
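
With the static usbduxfastsub[NUMUSBDUXFAST] table and start_stop_sem gone,
per-device state lives in the usbduxfast_private structure the driver puts
in dev->private, and serialization is handled by the semaphore embedded in
that structure.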
 /*
  * bulk transfers to usbduxfast
  */
 #define SENDADCOMMANDS            0
 #define SENDINITEP6               1
 
-static int send_dux_commands(struct usbduxfastsub_s *udfs, int cmd_type)
+static int usbduxfast_send_cmd(struct comedi_device *dev, int cmd_type)
 {
-       int tmp, nsent;
-
-       udfs->dux_commands[0] = cmd_type;
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast: dux_commands: ",
-              udfs->comedidev->minor);
-       for (tmp = 0; tmp < SIZEOFDUXBUFFER; tmp++)
-               printk(" %02x", udfs->dux_commands[tmp]);
-       printk("\n");
-#endif
-
-       tmp = usb_bulk_msg(udfs->usbdev,
-                          usb_sndbulkpipe(udfs->usbdev, CHANNELLISTEP),
-                          udfs->dux_commands, SIZEOFDUXBUFFER, &nsent, 10000);
-       if (tmp < 0)
-               dev_err(&udfs->interface->dev,
-                       "could not transmit dux_commands to the usb-device, err=%d\n",
-                       tmp);
-       return tmp;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxfast_private *devpriv = dev->private;
+       int nsent;
+       int ret;
+
+       devpriv->duxbuf[0] = cmd_type;
+
+       ret = usb_bulk_msg(usb, usb_sndbulkpipe(usb, CHANNELLISTEP),
+                          devpriv->duxbuf, SIZEOFDUXBUF,
+                          &nsent, 10000);
+       if (ret < 0)
+               dev_err(dev->class_dev,
+                       "could not transmit command to the usb-device, err=%d\n",
+                       ret);
+       return ret;
 }
 
-/*
- * Stops the data acquision.
- * It should be safe to call this function from any context.
- */
-static int usbduxfastsub_unlink_InURBs(struct usbduxfastsub_s *udfs)
+static void usbduxfast_cmd_data(struct comedi_device *dev, int index,
+                               uint8_t len, uint8_t op, uint8_t out,
+                               uint8_t log)
 {
-       int j = 0;
-       int err = 0;
+       struct usbduxfast_private *devpriv = dev->private;
 
-       if (udfs && udfs->urbIn) {
-               udfs->ai_cmd_running = 0;
-               /* waits until a running transfer is over */
-               usb_kill_urb(udfs->urbIn);
-               j = 0;
-       }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi: usbduxfast: unlinked InURB: res=%d\n", j);
-#endif
-       return err;
+       /* Set the GPIF bytes, the first byte is the command byte */
+       devpriv->duxbuf[1 + 0x00 + index] = len;
+       devpriv->duxbuf[1 + 0x08 + index] = op;
+       devpriv->duxbuf[1 + 0x10 + index] = out;
+       devpriv->duxbuf[1 + 0x18 + index] = log;
 }
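
As a usage sketch (values taken from the state-machine setup later in this
file), a call such as

	usbduxfast_cmd_data(dev, 0, 0x01, 0x01, rngmask, 0x00);

fills duxbuf[1], duxbuf[9], duxbuf[17] and duxbuf[25] with the GPIF length,
opcode, output and logic bytes for state 0, replacing the four separate
LENBASE/OPBASE/OUTBASE/LOGBASE stores; duxbuf[0] stays reserved for the
command type that usbduxfast_send_cmd() writes.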
 
-/*
- * This will stop a running acquisition operation.
- * Is called from within this driver from both the
- * interrupt context and from comedi.
- */
-static int usbduxfast_ai_stop(struct usbduxfastsub_s *udfs, int do_unlink)
+static int usbduxfast_ai_stop(struct comedi_device *dev, int do_unlink)
 {
-       int ret = 0;
+       struct usbduxfast_private *devpriv = dev->private;
 
-       if (!udfs) {
-               pr_err("%s: udfs=NULL!\n", __func__);
-               return -EFAULT;
-       }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi: usbduxfast_ai_stop\n");
-#endif
+       /* stop acquisition */
+       devpriv->ai_cmd_running = 0;
 
-       udfs->ai_cmd_running = 0;
-
-       if (do_unlink)
-               /* stop aquistion */
-               ret = usbduxfastsub_unlink_InURBs(udfs);
+       if (do_unlink && devpriv->urb) {
+               /* kill the running transfer */
+               usb_kill_urb(devpriv->urb);
+       }
 
-       return ret;
+       return 0;
 }
 
-/*
- * This will cancel a running acquisition operation.
- * This is called by comedi but never from inside the driver.
- */
 static int usbduxfast_ai_cancel(struct comedi_device *dev,
                                struct comedi_subdevice *s)
 {
-       struct usbduxfastsub_s *udfs;
+       struct usbduxfast_private *devpriv = dev->private;
        int ret;
 
-       /* force unlink of all urbs */
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi: usbduxfast_ai_cancel\n");
-#endif
-       udfs = dev->private;
-       if (!udfs) {
-               dev_err(dev->class_dev, "%s: udfs=NULL\n", __func__);
+       if (!devpriv)
                return -EFAULT;
-       }
-       down(&udfs->sem);
-       if (!udfs->probed) {
-               up(&udfs->sem);
-               return -ENODEV;
-       }
-       /* unlink */
-       ret = usbduxfast_ai_stop(udfs, 1);
-       up(&udfs->sem);
+
+       down(&devpriv->sem);
+       ret = usbduxfast_ai_stop(dev, 1);
+       up(&devpriv->sem);
 
        return ret;
 }
@@ -306,32 +232,17 @@ static int usbduxfast_ai_cancel(struct comedi_device *dev,
  * analogue IN
  * interrupt service routine
  */
-static void usbduxfastsub_ai_Irq(struct urb *urb)
+static void usbduxfast_ai_interrupt(struct urb *urb)
 {
+       struct comedi_device *dev = urb->context;
+       struct comedi_subdevice *s = dev->read_subdev;
+       struct comedi_async *async = s->async;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxfast_private *devpriv = dev->private;
        int n, err;
-       struct usbduxfastsub_s *udfs;
-       struct comedi_device *this_comedidev;
-       struct comedi_subdevice *s;
 
-       /* sanity checks - is the urb there? */
-       if (!urb) {
-               pr_err("ao int-handler called with urb=NULL!\n");
-               return;
-       }
-       /* the context variable points to the subdevice */
-       this_comedidev = urb->context;
-       if (!this_comedidev) {
-               pr_err("urb context is a NULL pointer!\n");
-               return;
-       }
-       /* the private structure of the subdevice is usbduxfastsub_s */
-       udfs = this_comedidev->private;
-       if (!udfs) {
-               pr_err("private of comedi subdev is a NULL pointer!\n");
-               return;
-       }
        /* are we running a command? */
-       if (unlikely(!udfs->ai_cmd_running)) {
+       if (unlikely(!devpriv->ai_cmd_running)) {
                /*
                 * not running a command
                 * do not continue execution if no asynchronous command
@@ -340,13 +251,6 @@ static void usbduxfastsub_ai_Irq(struct urb *urb)
                return;
        }
 
-       if (unlikely(!udfs->attached)) {
-               /* no comedi device there */
-               return;
-       }
-       /* subdevice which is the AD converter */
-       s = &this_comedidev->subdevices[SUBDEV_AD];
-
        /* first we test if something unusual has just happened */
        switch (urb->status) {
        case 0:
@@ -361,189 +265,93 @@ static void usbduxfastsub_ai_Irq(struct urb *urb)
        case -ESHUTDOWN:
        case -ECONNABORTED:
                /* tell this comedi */
-               s->async->events |= COMEDI_CB_EOA;
-               s->async->events |= COMEDI_CB_ERROR;
-               comedi_event(udfs->comedidev, s);
+               async->events |= COMEDI_CB_EOA;
+               async->events |= COMEDI_CB_ERROR;
+               comedi_event(dev, s);
                /* stop the transfer w/o unlink */
-               usbduxfast_ai_stop(udfs, 0);
+               usbduxfast_ai_stop(dev, 0);
                return;
 
        default:
                pr_err("non-zero urb status received in ai intr context: %d\n",
                       urb->status);
-               s->async->events |= COMEDI_CB_EOA;
-               s->async->events |= COMEDI_CB_ERROR;
-               comedi_event(udfs->comedidev, s);
-               usbduxfast_ai_stop(udfs, 0);
+               async->events |= COMEDI_CB_EOA;
+               async->events |= COMEDI_CB_ERROR;
+               comedi_event(dev, s);
+               usbduxfast_ai_stop(dev, 0);
                return;
        }
 
-       if (!udfs->ignore) {
-               if (!udfs->ai_continous) {
+       if (!devpriv->ignore) {
+               if (!devpriv->ai_continous) {
                        /* not continuous, fixed number of samples */
                        n = urb->actual_length / sizeof(uint16_t);
-                       if (unlikely(udfs->ai_sample_count < n)) {
-                               /*
-                                * we have send only a fraction of the bytes
-                                * received
-                                */
+                       if (unlikely(devpriv->ai_sample_count < n)) {
+                               unsigned int num_bytes;
+
+                               /* partial sample received */
+                               num_bytes = devpriv->ai_sample_count *
+                                           sizeof(uint16_t);
                                cfc_write_array_to_buffer(s,
                                                          urb->transfer_buffer,
-                                                         udfs->ai_sample_count
-                                                         * sizeof(uint16_t));
-                               usbduxfast_ai_stop(udfs, 0);
+                                                         num_bytes);
+                               usbduxfast_ai_stop(dev, 0);
                                /* tell comedi that the acquisition is over */
-                               s->async->events |= COMEDI_CB_EOA;
-                               comedi_event(udfs->comedidev, s);
+                               async->events |= COMEDI_CB_EOA;
+                               comedi_event(dev, s);
                                return;
                        }
-                       udfs->ai_sample_count -= n;
+                       devpriv->ai_sample_count -= n;
                }
                /* write the full buffer to comedi */
                err = cfc_write_array_to_buffer(s, urb->transfer_buffer,
                                                urb->actual_length);
                if (unlikely(err == 0)) {
                        /* buffer overflow */
-                       usbduxfast_ai_stop(udfs, 0);
+                       usbduxfast_ai_stop(dev, 0);
                        return;
                }
 
                /* tell comedi that data is there */
-               comedi_event(udfs->comedidev, s);
-
+               comedi_event(dev, s);
        } else {
                /* ignore this packet */
-               udfs->ignore--;
+               devpriv->ignore--;
        }
 
        /*
         * command is still running
         * resubmit urb for BULK transfer
         */
-       urb->dev = udfs->usbdev;
+       urb->dev = usb;
        urb->status = 0;
        err = usb_submit_urb(urb, GFP_ATOMIC);
        if (err < 0) {
-               dev_err(&urb->dev->dev,
+               dev_err(dev->class_dev,
                        "urb resubm failed: %d", err);
-               s->async->events |= COMEDI_CB_EOA;
-               s->async->events |= COMEDI_CB_ERROR;
-               comedi_event(udfs->comedidev, s);
-               usbduxfast_ai_stop(udfs, 0);
-       }
-}
-
-static int usbduxfastsub_start(struct usbduxfastsub_s *udfs)
-{
-       int ret;
-       unsigned char *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to zero */
-       *local_transfer_buffer = 0;
-       /* bRequest, "Firmware" */
-       ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-                             USBDUXFASTSUB_FIRMWARE,
-                             VENDOR_DIR_OUT,     /* bmRequestType */
-                             USBDUXFASTSUB_CPUCS,    /* Value */
-                             0x0000,   /* Index */
-                             /* address of the transfer buffer */
-                             local_transfer_buffer,
-                             1,      /* Length */
-                             EZTIMEOUT);    /* Timeout */
-       if (ret < 0)
-               dev_err(&udfs->interface->dev,
-                       "control msg failed (start)\n");
-
-       kfree(local_transfer_buffer);
-       return ret;
-}
-
-static int usbduxfastsub_stop(struct usbduxfastsub_s *udfs)
-{
-       int ret;
-       unsigned char *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(1, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to one */
-       *local_transfer_buffer = 1;
-       /* bRequest, "Firmware" */
-       ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-                             USBDUXFASTSUB_FIRMWARE,
-                             VENDOR_DIR_OUT,   /* bmRequestType */
-                             USBDUXFASTSUB_CPUCS,      /* Value */
-                             0x0000,   /* Index */
-                             local_transfer_buffer, 1, /* Length */
-                             EZTIMEOUT);       /* Timeout */
-       if (ret < 0)
-               dev_err(&udfs->interface->dev,
-                       "control msg failed (stop)\n");
-
-       kfree(local_transfer_buffer);
-       return ret;
-}
-
-static int usbduxfastsub_upload(struct usbduxfastsub_s *udfs,
-                               unsigned char *local_transfer_buffer,
-                               unsigned int startAddr, unsigned int len)
-{
-       int ret;
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi: usbduxfast: uploading %d bytes", len);
-       printk(KERN_DEBUG " to addr %d, first byte=%d.\n",
-              startAddr, local_transfer_buffer[0]);
-#endif
-       /* brequest, firmware */
-       ret = usb_control_msg(udfs->usbdev, usb_sndctrlpipe(udfs->usbdev, 0),
-                             USBDUXFASTSUB_FIRMWARE,
-                             VENDOR_DIR_OUT,   /* bmRequestType */
-                             startAddr,        /* value */
-                             0x0000,    /* index */
-                             /* our local safe buffer */
-                             local_transfer_buffer,
-                             len,      /* length */
-                             EZTIMEOUT);      /* timeout */
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast: result=%d\n", ret);
-#endif
-
-       if (ret < 0) {
-               dev_err(&udfs->interface->dev, "uppload failed\n");
-               return ret;
+               async->events |= COMEDI_CB_EOA;
+               async->events |= COMEDI_CB_ERROR;
+               comedi_event(dev, s);
+               usbduxfast_ai_stop(dev, 0);
        }
-
-       return 0;
 }
 
-static int usbduxfastsub_submit_InURBs(struct usbduxfastsub_s *udfs)
+static int usbduxfast_submit_urb(struct comedi_device *dev)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxfast_private *devpriv = dev->private;
        int ret;
 
-       if (!udfs)
+       if (!devpriv)
                return -EFAULT;
 
-       usb_fill_bulk_urb(udfs->urbIn, udfs->usbdev,
-                         usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-                         udfs->transfer_buffer,
-                         SIZEINBUF, usbduxfastsub_ai_Irq, udfs->comedidev);
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast: submitting in-urb: "
-              "0x%p,0x%p\n", udfs->comedidev->minor, udfs->urbIn->context,
-              udfs->urbIn->dev);
-#endif
-       ret = usb_submit_urb(udfs->urbIn, GFP_ATOMIC);
+       usb_fill_bulk_urb(devpriv->urb, usb, usb_rcvbulkpipe(usb, BULKINEP),
+                         devpriv->inbuf, SIZEINBUF,
+                         usbduxfast_ai_interrupt, dev);
+
+       ret = usb_submit_urb(devpriv->urb, GFP_ATOMIC);
        if (ret) {
-               dev_err(&udfs->interface->dev,
-                       "ai: usb_submit_urb error %d\n", ret);
+               dev_err(dev->class_dev, "usb_submit_urb error %d\n", ret);
                return ret;
        }
        return 0;
@@ -553,13 +361,9 @@ static int usbduxfast_ai_cmdtest(struct comedi_device *dev,
                                 struct comedi_subdevice *s,
                                 struct comedi_cmd *cmd)
 {
-       struct usbduxfastsub_s *udfs = dev->private;
        int err = 0;
        long int steps, tmp;
-       int minSamplPer;
-
-       if (!udfs->probed)
-               return -ENODEV;
+       int min_sample_period;
 
        /* Step 1 : check if triggers are trivially valid */
 
@@ -601,14 +405,14 @@ static int usbduxfast_ai_cmdtest(struct comedi_device *dev,
        err |= cfc_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len);
 
        if (cmd->chanlist_len == 1)
-               minSamplPer = 1;
+               min_sample_period = 1;
        else
-               minSamplPer = MIN_SAMPLING_PERIOD;
+               min_sample_period = MIN_SAMPLING_PERIOD;
 
        if (cmd->convert_src == TRIG_TIMER) {
                steps = cmd->convert_arg * 30;
-               if (steps < (minSamplPer * 1000))
-                       steps = minSamplPer * 1000;
+               if (steps < (min_sample_period * 1000))
+                       steps = min_sample_period * 1000;
 
                if (steps > (MAX_SAMPLING_PERIOD * 1000))
                        steps = MAX_SAMPLING_PERIOD * 1000;
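
A worked example of the clamping above (assuming the *30 factor reflects
the FX2's 30 MHz GPIF clock, so steps holds the period in thousandths of a
clock tick): convert_arg = 1000 ns gives steps = 30000, i.e. 30 ticks of
about 33.3 ns, one sample per microsecond; the two comparisons then bound
the period to between min_sample_period and MAX_SAMPLING_PERIOD ticks.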
@@ -650,80 +454,53 @@ static int usbduxfast_ai_inttrig(struct comedi_device *dev,
                                 struct comedi_subdevice *s,
                                 unsigned int trignum)
 {
+       struct usbduxfast_private *devpriv = dev->private;
        int ret;
-       struct usbduxfastsub_s *udfs = dev->private;
 
-       if (!udfs)
+       if (!devpriv)
                return -EFAULT;
 
-       down(&udfs->sem);
-       if (!udfs->probed) {
-               up(&udfs->sem);
-               return -ENODEV;
-       }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast_ai_inttrig\n", dev->minor);
-#endif
+       down(&devpriv->sem);
 
        if (trignum != 0) {
-               dev_err(dev->class_dev, "%s: invalid trignum\n", __func__);
-               up(&udfs->sem);
+               dev_err(dev->class_dev, "invalid trignum\n");
+               up(&devpriv->sem);
                return -EINVAL;
        }
-       if (!udfs->ai_cmd_running) {
-               udfs->ai_cmd_running = 1;
-               ret = usbduxfastsub_submit_InURBs(udfs);
+       if (!devpriv->ai_cmd_running) {
+               devpriv->ai_cmd_running = 1;
+               ret = usbduxfast_submit_urb(dev);
                if (ret < 0) {
-                       dev_err(dev->class_dev,
-                               "%s: urbSubmit: err=%d\n", __func__, ret);
-                       udfs->ai_cmd_running = 0;
-                       up(&udfs->sem);
+                       dev_err(dev->class_dev, "urbSubmit: err=%d\n", ret);
+                       devpriv->ai_cmd_running = 0;
+                       up(&devpriv->sem);
                        return ret;
                }
                s->async->inttrig = NULL;
        } else {
-               dev_err(dev->class_dev,
-                       "ai_inttrig but acqu is already running\n");
+               dev_err(dev->class_dev, "ai is already running\n");
        }
-       up(&udfs->sem);
+       up(&devpriv->sem);
        return 1;
 }
 
-/*
- * offsets for the GPIF bytes
- * the first byte is the command byte
- */
-#define LENBASE        (1+0x00)
-#define OPBASE (1+0x08)
-#define OUTBASE        (1+0x10)
-#define LOGBASE        (1+0x18)
-
 static int usbduxfast_ai_cmd(struct comedi_device *dev,
                             struct comedi_subdevice *s)
 {
+       struct usbduxfast_private *devpriv = dev->private;
        struct comedi_cmd *cmd = &s->async->cmd;
        unsigned int chan, gain, rngmask = 0xff;
        int i, j, ret;
-       struct usbduxfastsub_s *udfs;
        int result;
        long steps, steps_tmp;
 
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast_ai_cmd\n", dev->minor);
-#endif
-       udfs = dev->private;
-       if (!udfs)
+       if (!devpriv)
                return -EFAULT;
 
-       down(&udfs->sem);
-       if (!udfs->probed) {
-               up(&udfs->sem);
-               return -ENODEV;
-       }
-       if (udfs->ai_cmd_running) {
-               dev_err(dev->class_dev,
-                       "ai_cmd not possible. Another ai_cmd is running.\n");
-               up(&udfs->sem);
+       down(&devpriv->sem);
+       if (devpriv->ai_cmd_running) {
+               dev_err(dev->class_dev, "ai_cmd not possible\n");
+               up(&devpriv->sem);
                return -EBUSY;
        }
        /* set current channel of the running acquisition to zero */
@@ -733,7 +510,7 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
         * ignore the first buffers from the device if there
         * is an error condition
         */
-       udfs->ignore = PACKETS_TO_IGNORE;
+       devpriv->ignore = PACKETS_TO_IGNORE;
 
        if (cmd->chanlist_len > 0) {
                gain = CR_RANGE(cmd->chanlist[0]);
@@ -741,20 +518,19 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                        chan = CR_CHAN(cmd->chanlist[i]);
                        if (chan != i) {
                                dev_err(dev->class_dev,
-                                       "cmd is accepting only consecutive channels.\n");
-                               up(&udfs->sem);
+                                       "channels are not consecutive\n");
+                               up(&devpriv->sem);
                                return -EINVAL;
                        }
                        if ((gain != CR_RANGE(cmd->chanlist[i]))
                            && (cmd->chanlist_len > 3)) {
                                dev_err(dev->class_dev,
-                                       "the gain must be the same for all channels.\n");
-                               up(&udfs->sem);
+                                       "gain must be the same for all channels\n");
+                               up(&devpriv->sem);
                                return -EINVAL;
                        }
                        if (i >= NUMCHANNELS) {
-                               dev_err(dev->class_dev,
-                                       "channel list too long\n");
+                               dev_err(dev->class_dev, "chanlist too long\n");
                                break;
                        }
                }
@@ -762,8 +538,8 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
        steps = 0;
        if (cmd->scan_begin_src == TRIG_TIMER) {
                dev_err(dev->class_dev,
-                       "scan_begin_src==TRIG_TIMER not valid.\n");
-               up(&udfs->sem);
+                       "scan_begin_src==TRIG_TIMER not valid\n");
+               up(&devpriv->sem);
                return -EINVAL;
        }
        if (cmd->convert_src == TRIG_TIMER)
@@ -771,27 +547,23 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
 
        if ((steps < MIN_SAMPLING_PERIOD) && (cmd->chanlist_len != 1)) {
                dev_err(dev->class_dev,
-                       "ai_cmd: steps=%ld, scan_begin_arg=%d. Not properly tested by cmdtest?\n",
+                       "steps=%ld, scan_begin_arg=%d. Not properly tested by cmdtest?\n",
                        steps, cmd->scan_begin_arg);
-               up(&udfs->sem);
+               up(&devpriv->sem);
                return -EINVAL;
        }
        if (steps > MAX_SAMPLING_PERIOD) {
-               dev_err(dev->class_dev, "ai_cmd: sampling rate too low.\n");
-               up(&udfs->sem);
+               dev_err(dev->class_dev, "sampling rate too low\n");
+               up(&devpriv->sem);
                return -EINVAL;
        }
        if ((cmd->start_src == TRIG_EXT) && (cmd->chanlist_len != 1)
            && (cmd->chanlist_len != 16)) {
                dev_err(dev->class_dev,
-                       "ai_cmd: TRIG_EXT only with 1 or 16 channels possible.\n");
-               up(&udfs->sem);
+                       "TRIG_EXT only with 1 or 16 channels possible\n");
+               up(&devpriv->sem);
                return -EINVAL;
        }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast: steps=%ld, convert_arg=%u\n",
-              dev->minor, steps, cmd->convert_arg);
-#endif
 
        switch (cmd->chanlist_len) {
        case 1:
@@ -812,17 +584,11 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                /* we loop here until ready has been set */
                if (cmd->start_src == TRIG_EXT) {
                        /* branch back to state 0 */
-                       udfs->dux_commands[LENBASE + 0] = 0x01;
                        /* decision state w/o data */
-                       udfs->dux_commands[OPBASE + 0] = 0x01;
-                       udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
                        /* RDY0 = 0 */
-                       udfs->dux_commands[LOGBASE + 0] = 0x00;
+                       usbduxfast_cmd_data(dev, 0, 0x01, 0x01, rngmask, 0x00);
                } else {        /* we just proceed to state 1 */
-                       udfs->dux_commands[LENBASE + 0] = 1;
-                       udfs->dux_commands[OPBASE + 0] = 0;
-                       udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-                       udfs->dux_commands[LOGBASE + 0] = 0;
+                       usbduxfast_cmd_data(dev, 0, 0x01, 0x00, rngmask, 0x00);
                }
 
                if (steps < MIN_SAMPLING_PERIOD) {
@@ -835,33 +601,25 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                                 */
 
                                /* branch back to state 1 */
-                               udfs->dux_commands[LENBASE + 1] = 0x89;
                                /* decision state with data */
-                               udfs->dux_commands[OPBASE + 1] = 0x03;
-                               udfs->dux_commands[OUTBASE + 1] =
-                                   0xFF & rngmask;
                                /* doesn't matter */
-                               udfs->dux_commands[LOGBASE + 1] = 0xFF;
+                               usbduxfast_cmd_data(dev, 1,
+                                                   0x89, 0x03, rngmask, 0xff);
                        } else {
                                /*
                                 * we loop through two states: data and delay
                                 * max rate is 15MHz
                                 */
-                               udfs->dux_commands[LENBASE + 1] = steps - 1;
                                /* data */
-                               udfs->dux_commands[OPBASE + 1] = 0x02;
-                               udfs->dux_commands[OUTBASE + 1] =
-                                   0xFF & rngmask;
                                /* doesn't matter */
-                               udfs->dux_commands[LOGBASE + 1] = 0;
+                               usbduxfast_cmd_data(dev, 1, steps - 1,
+                                                   0x02, rngmask, 0x00);
+
                                /* branch back to state 1 */
-                               udfs->dux_commands[LENBASE + 2] = 0x09;
                                /* decision state w/o data */
-                               udfs->dux_commands[OPBASE + 2] = 0x01;
-                               udfs->dux_commands[OUTBASE + 2] =
-                                   0xFF & rngmask;
                                /* doesn't matter */
-                               udfs->dux_commands[LOGBASE + 2] = 0xFF;
+                               usbduxfast_cmd_data(dev, 2,
+                                                   0x09, 0x01, rngmask, 0xff);
                        }
                } else {
                        /*
@@ -873,26 +631,20 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                        steps = steps - 1;
 
                        /* do the first part of the delay */
-                       udfs->dux_commands[LENBASE + 1] = steps / 2;
-                       udfs->dux_commands[OPBASE + 1] = 0;
-                       udfs->dux_commands[OUTBASE + 1] = 0xFF & rngmask;
-                       udfs->dux_commands[LOGBASE + 1] = 0;
+                       usbduxfast_cmd_data(dev, 1,
+                                           steps / 2, 0x00, rngmask, 0x00);
 
                        /* and the second part */
-                       udfs->dux_commands[LENBASE + 2] = steps - steps / 2;
-                       udfs->dux_commands[OPBASE + 2] = 0;
-                       udfs->dux_commands[OUTBASE + 2] = 0xFF & rngmask;
-                       udfs->dux_commands[LOGBASE + 2] = 0;
+                       usbduxfast_cmd_data(dev, 2, steps - steps / 2,
+                                           0x00, rngmask, 0x00);
 
                        /* get the data and branch back */
 
                        /* branch back to state 1 */
-                       udfs->dux_commands[LENBASE + 3] = 0x09;
                        /* decision state w/ data */
-                       udfs->dux_commands[OPBASE + 3] = 0x03;
-                       udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
                        /* doesn't matter */
-                       udfs->dux_commands[LOGBASE + 3] = 0xFF;
+                       usbduxfast_cmd_data(dev, 3,
+                                           0x09, 0x03, rngmask, 0xff);
                }
                break;
 
@@ -907,11 +659,8 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                else
                        rngmask = 0xff;
 
-               udfs->dux_commands[LENBASE + 0] = 1;
                /* data */
-               udfs->dux_commands[OPBASE + 0] = 0x02;
-               udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 0] = 0;
+               usbduxfast_cmd_data(dev, 0, 0x01, 0x02, rngmask, 0x00);
 
                /* we have 1 state with duration 1: state 0 */
                steps_tmp = steps - 1;
@@ -922,23 +671,16 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                        rngmask = 0xff;
 
                /* do the first part of the delay */
-               udfs->dux_commands[LENBASE + 1] = steps_tmp / 2;
-               udfs->dux_commands[OPBASE + 1] = 0;
                /* count */
-               udfs->dux_commands[OUTBASE + 1] = 0xFE & rngmask;
-               udfs->dux_commands[LOGBASE + 1] = 0;
+               usbduxfast_cmd_data(dev, 1, steps_tmp / 2,
+                                   0x00, 0xfe & rngmask, 0x00);
 
                /* and the second part */
-               udfs->dux_commands[LENBASE + 2] = steps_tmp - steps_tmp / 2;
-               udfs->dux_commands[OPBASE + 2] = 0;
-               udfs->dux_commands[OUTBASE + 2] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 2] = 0;
+               usbduxfast_cmd_data(dev, 2, steps_tmp - steps_tmp / 2,
+                                   0x00, rngmask, 0x00);
 
-               udfs->dux_commands[LENBASE + 3] = 1;
                /* data */
-               udfs->dux_commands[OPBASE + 3] = 0x02;
-               udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 3] = 0;
+               usbduxfast_cmd_data(dev, 3, 0x01, 0x02, rngmask, 0x00);
 
                /*
                 * we have 2 states with duration 1: step 6 and
@@ -952,22 +694,15 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                        rngmask = 0xff;
 
                /* do the first part of the delay */
-               udfs->dux_commands[LENBASE + 4] = steps_tmp / 2;
-               udfs->dux_commands[OPBASE + 4] = 0;
                /* reset */
-               udfs->dux_commands[OUTBASE + 4] = (0xFF - 0x02) & rngmask;
-               udfs->dux_commands[LOGBASE + 4] = 0;
+               usbduxfast_cmd_data(dev, 4, steps_tmp / 2,
+                                   0x00, (0xff - 0x02) & rngmask, 0x00);
 
                /* and the second part */
-               udfs->dux_commands[LENBASE + 5] = steps_tmp - steps_tmp / 2;
-               udfs->dux_commands[OPBASE + 5] = 0;
-               udfs->dux_commands[OUTBASE + 5] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 5] = 0;
-
-               udfs->dux_commands[LENBASE + 6] = 1;
-               udfs->dux_commands[OPBASE + 6] = 0;
-               udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 6] = 0;
+               usbduxfast_cmd_data(dev, 5, steps_tmp - steps_tmp / 2,
+                                   0x00, rngmask, 0x00);
+
+               usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
                break;
 
        case 3:
@@ -975,6 +710,8 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                 * three channels
                 */
                for (j = 0; j < 1; j++) {
+                       int index = j * 2;
+
                        if (CR_RANGE(cmd->chanlist[j]) > 0)
                                rngmask = 0xff - 0x04;
                        else
@@ -983,12 +720,10 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                         * commit data to the FIFO and do the first part
                         * of the delay
                         */
-                       udfs->dux_commands[LENBASE + j * 2] = steps / 2;
                        /* data */
-                       udfs->dux_commands[OPBASE + j * 2] = 0x02;
                        /* no change */
-                       udfs->dux_commands[OUTBASE + j * 2] = 0xFF & rngmask;
-                       udfs->dux_commands[LOGBASE + j * 2] = 0;
+                       usbduxfast_cmd_data(dev, index, steps / 2,
+                                           0x02, rngmask, 0x00);
 
                        if (CR_RANGE(cmd->chanlist[j + 1]) > 0)
                                rngmask = 0xff - 0x04;
@@ -996,25 +731,19 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                                rngmask = 0xff;
 
                        /* do the second part of the delay */
-                       udfs->dux_commands[LENBASE + j * 2 + 1] =
-                           steps - steps / 2;
                        /* no data */
-                       udfs->dux_commands[OPBASE + j * 2 + 1] = 0;
                        /* count */
-                       udfs->dux_commands[OUTBASE + j * 2 + 1] =
-                           0xFE & rngmask;
-                       udfs->dux_commands[LOGBASE + j * 2 + 1] = 0;
+                       usbduxfast_cmd_data(dev, index + 1, steps - steps / 2,
+                                           0x00, 0xfe & rngmask, 0x00);
                }
 
                /* 2 steps with duration 1: the idle step and step 6: */
                steps_tmp = steps - 2;
 
                /* commit data to the FIFO and do the first part of the delay */
-               udfs->dux_commands[LENBASE + 4] = steps_tmp / 2;
                /* data */
-               udfs->dux_commands[OPBASE + 4] = 0x02;
-               udfs->dux_commands[OUTBASE + 4] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 4] = 0;
+               usbduxfast_cmd_data(dev, 4, steps_tmp / 2,
+                                   0x02, rngmask, 0x00);
 
                if (CR_RANGE(cmd->chanlist[0]) > 0)
                        rngmask = 0xff - 0x04;
@@ -1022,17 +751,12 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                        rngmask = 0xff;
 
                /* do the second part of the delay */
-               udfs->dux_commands[LENBASE + 5] = steps_tmp - steps_tmp / 2;
                /* no data */
-               udfs->dux_commands[OPBASE + 5] = 0;
                /* reset */
-               udfs->dux_commands[OUTBASE + 5] = (0xFF - 0x02) & rngmask;
-               udfs->dux_commands[LOGBASE + 5] = 0;
+               usbduxfast_cmd_data(dev, 5, steps_tmp - steps_tmp / 2,
+                                   0x00, (0xff - 0x02) & rngmask, 0x00);
 
-               udfs->dux_commands[LENBASE + 6] = 1;
-               udfs->dux_commands[OPBASE + 6] = 0;
-               udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 6] = 0;
+               usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
+               break;
 
        case 16:
                if (CR_RANGE(cmd->chanlist[0]) > 0)
@@ -1046,101 +770,79 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                         */
 
                        /* branch back to state 0 */
-                       udfs->dux_commands[LENBASE + 0] = 0x01;
                        /* decision state w/o data */
-                       udfs->dux_commands[OPBASE + 0] = 0x01;
                        /* reset */
-                       udfs->dux_commands[OUTBASE + 0] =
-                           (0xFF - 0x02) & rngmask;
                        /* RDY0 = 0 */
-                       udfs->dux_commands[LOGBASE + 0] = 0x00;
+                       usbduxfast_cmd_data(dev, 0, 0x01, 0x01,
+                                           (0xff - 0x02) & rngmask, 0x00);
                } else {
                        /*
                         * we just proceed to state 1
                         */
 
                        /* 30us reset pulse */
-                       udfs->dux_commands[LENBASE + 0] = 255;
-                       udfs->dux_commands[OPBASE + 0] = 0;
                        /* reset */
-                       udfs->dux_commands[OUTBASE + 0] =
-                           (0xFF - 0x02) & rngmask;
-                       udfs->dux_commands[LOGBASE + 0] = 0;
+                       usbduxfast_cmd_data(dev, 0, 0xff, 0x00,
+                                           (0xff - 0x02) & rngmask, 0x00);
                }
 
                /* commit data to the FIFO */
-               udfs->dux_commands[LENBASE + 1] = 1;
                /* data */
-               udfs->dux_commands[OPBASE + 1] = 0x02;
-               udfs->dux_commands[OUTBASE + 1] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 1] = 0;
+               usbduxfast_cmd_data(dev, 1, 0x01, 0x02, rngmask, 0x00);
 
                /* we have 2 states with duration 1 */
                steps = steps - 2;
 
                /* do the first part of the delay */
-               udfs->dux_commands[LENBASE + 2] = steps / 2;
-               udfs->dux_commands[OPBASE + 2] = 0;
-               udfs->dux_commands[OUTBASE + 2] = 0xFE & rngmask;
-               udfs->dux_commands[LOGBASE + 2] = 0;
+               usbduxfast_cmd_data(dev, 2, steps / 2,
+                                   0x00, 0xfe & rngmask, 0x00);
 
                /* and the second part */
-               udfs->dux_commands[LENBASE + 3] = steps - steps / 2;
-               udfs->dux_commands[OPBASE + 3] = 0;
-               udfs->dux_commands[OUTBASE + 3] = 0xFF & rngmask;
-               udfs->dux_commands[LOGBASE + 3] = 0;
+               usbduxfast_cmd_data(dev, 3, steps - steps / 2,
+                                   0x00, rngmask, 0x00);
 
                /* branch back to state 1 */
-               udfs->dux_commands[LENBASE + 4] = 0x09;
                /* decision state w/o data */
-               udfs->dux_commands[OPBASE + 4] = 0x01;
-               udfs->dux_commands[OUTBASE + 4] = 0xFF & rngmask;
                /* doesn't matter */
-               udfs->dux_commands[LOGBASE + 4] = 0xFF;
+               usbduxfast_cmd_data(dev, 4, 0x09, 0x01, rngmask, 0xff);
 
                break;
 
        default:
                dev_err(dev->class_dev, "unsupported combination of channels\n");
-               up(&udfs->sem);
+               up(&devpriv->sem);
                return -EFAULT;
        }
 
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi %d: sending commands to the usb device\n",
-              dev->minor);
-#endif
        /* 0 means that the AD commands are sent */
-       result = send_dux_commands(udfs, SENDADCOMMANDS);
+       result = usbduxfast_send_cmd(dev, SENDADCOMMANDS);
        if (result < 0) {
-               dev_err(dev->class_dev,
-                       "adc command could not be submitted. Aborting...\n");
-               up(&udfs->sem);
+               up(&devpriv->sem);
                return result;
        }
        if (cmd->stop_src == TRIG_COUNT) {
-               udfs->ai_sample_count = cmd->stop_arg * cmd->scan_end_arg;
-               if (udfs->ai_sample_count < 1) {
+               devpriv->ai_sample_count = cmd->stop_arg * cmd->scan_end_arg;
+               if (devpriv->ai_sample_count < 1) {
                        dev_err(dev->class_dev,
-                               "(cmd->stop_arg)*(cmd->scan_end_arg)<1, aborting.\n");
-                       up(&udfs->sem);
+                               "(cmd->stop_arg)*(cmd->scan_end_arg)<1, aborting\n");
+                       up(&devpriv->sem);
                        return -EFAULT;
                }
-               udfs->ai_continous = 0;
+               devpriv->ai_continous = 0;
        } else {
                /* continuous acquisition */
-               udfs->ai_continous = 1;
-               udfs->ai_sample_count = 0;
+               devpriv->ai_continous = 1;
+               devpriv->ai_sample_count = 0;
        }
 
        if ((cmd->start_src == TRIG_NOW) || (cmd->start_src == TRIG_EXT)) {
                /* enable this acquisition operation */
-               udfs->ai_cmd_running = 1;
-               ret = usbduxfastsub_submit_InURBs(udfs);
+               devpriv->ai_cmd_running = 1;
+               ret = usbduxfast_submit_urb(dev);
                if (ret < 0) {
-                       udfs->ai_cmd_running = 0;
+                       devpriv->ai_cmd_running = 0;
                        /* fixme: unlink here?? */
-                       up(&udfs->sem);
+                       up(&devpriv->sem);
                        return ret;
                }
                s->async->inttrig = NULL;
@@ -1152,7 +854,7 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
                 */
                s->async->inttrig = usbduxfast_ai_inttrig;
        }
-       up(&udfs->sem);
+       up(&devpriv->sem);
 
        return 0;
 }
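
Two notes on the command setup above. First, the TRIG_COUNT arithmetic: the sample budget is stop_arg scans times scan_end_arg samples per scan, so a hypothetical command with stop_arg = 100 and scan_end_arg = 16 arms the driver for ai_sample_count = 1600 samples, while TRIG_NONE leaves ai_continous set and the count at zero. Second, the start sources: TRIG_NOW and TRIG_EXT submit the URBs immediately, whereas any other start_src only arms the subdevice and installs usbduxfast_ai_inttrig, comedi's usual pattern for kicking off the acquisition later through a trigger instruction.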
@@ -1162,490 +864,283 @@ static int usbduxfast_ai_cmd(struct comedi_device *dev,
  */
 static int usbduxfast_ai_insn_read(struct comedi_device *dev,
                                   struct comedi_subdevice *s,
-                                  struct comedi_insn *insn, unsigned int *data)
+                                  struct comedi_insn *insn,
+                                  unsigned int *data)
 {
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxfast_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       unsigned int range = CR_RANGE(insn->chanspec);
+       uint8_t rngmask = range ? (0xff - 0x04) : 0xff;
        int i, j, n, actual_length;
-       int chan, range, rngmask;
-       int err;
-       struct usbduxfastsub_s *udfs;
+       int ret;
 
-       udfs = dev->private;
-       if (!udfs) {
-               dev_err(dev->class_dev, "%s: no usb dev.\n", __func__);
-               return -ENODEV;
-       }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: ai_insn_read, insn->n=%d, "
-              "insn->subdev=%d\n", dev->minor, insn->n, insn->subdev);
-#endif
-       down(&udfs->sem);
-       if (!udfs->probed) {
-               up(&udfs->sem);
-               return -ENODEV;
-       }
-       if (udfs->ai_cmd_running) {
+       down(&devpriv->sem);
+
+       if (devpriv->ai_cmd_running) {
                dev_err(dev->class_dev,
-                       "ai_insn_read not possible. Async Command is running.\n");
-               up(&udfs->sem);
+                       "ai_insn_read not possible, async cmd is running\n");
+               up(&devpriv->sem);
                return -EBUSY;
        }
-       /* sample one channel */
-       chan = CR_CHAN(insn->chanspec);
-       range = CR_RANGE(insn->chanspec);
-       /* set command for the first channel */
 
-       if (range > 0)
-               rngmask = 0xff - 0x04;
-       else
-               rngmask = 0xff;
+       /* set command for the first channel */
 
        /* commit data to the FIFO */
-       udfs->dux_commands[LENBASE + 0] = 1;
        /* data */
-       udfs->dux_commands[OPBASE + 0] = 0x02;
-       udfs->dux_commands[OUTBASE + 0] = 0xFF & rngmask;
-       udfs->dux_commands[LOGBASE + 0] = 0;
+       usbduxfast_cmd_data(dev, 0, 0x01, 0x02, rngmask, 0x00);
 
        /* do the first part of the delay */
-       udfs->dux_commands[LENBASE + 1] = 12;
-       udfs->dux_commands[OPBASE + 1] = 0;
-       udfs->dux_commands[OUTBASE + 1] = 0xFE & rngmask;
-       udfs->dux_commands[LOGBASE + 1] = 0;
-
-       udfs->dux_commands[LENBASE + 2] = 1;
-       udfs->dux_commands[OPBASE + 2] = 0;
-       udfs->dux_commands[OUTBASE + 2] = 0xFE & rngmask;
-       udfs->dux_commands[LOGBASE + 2] = 0;
-
-       udfs->dux_commands[LENBASE + 3] = 1;
-       udfs->dux_commands[OPBASE + 3] = 0;
-       udfs->dux_commands[OUTBASE + 3] = 0xFE & rngmask;
-       udfs->dux_commands[LOGBASE + 3] = 0;
-
-       udfs->dux_commands[LENBASE + 4] = 1;
-       udfs->dux_commands[OPBASE + 4] = 0;
-       udfs->dux_commands[OUTBASE + 4] = 0xFE & rngmask;
-       udfs->dux_commands[LOGBASE + 4] = 0;
+       usbduxfast_cmd_data(dev, 1, 0x0c, 0x00, 0xfe & rngmask, 0x00);
+       usbduxfast_cmd_data(dev, 2, 0x01, 0x00, 0xfe & rngmask, 0x00);
+       usbduxfast_cmd_data(dev, 3, 0x01, 0x00, 0xfe & rngmask, 0x00);
+       usbduxfast_cmd_data(dev, 4, 0x01, 0x00, 0xfe & rngmask, 0x00);
 
        /* second part */
-       udfs->dux_commands[LENBASE + 5] = 12;
-       udfs->dux_commands[OPBASE + 5] = 0;
-       udfs->dux_commands[OUTBASE + 5] = 0xFF & rngmask;
-       udfs->dux_commands[LOGBASE + 5] = 0;
-
-       udfs->dux_commands[LENBASE + 6] = 1;
-       udfs->dux_commands[OPBASE + 6] = 0;
-       udfs->dux_commands[OUTBASE + 6] = 0xFF & rngmask;
-       udfs->dux_commands[LOGBASE + 0] = 0;
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi %d: sending commands to the usb device\n",
-              dev->minor);
-#endif
-       /* 0 means that the AD commands are sent */
-       err = send_dux_commands(udfs, SENDADCOMMANDS);
-       if (err < 0) {
-               dev_err(dev->class_dev,
-                       "adc command could not be submitted. Aborting...\n");
-               up(&udfs->sem);
-               return err;
+       usbduxfast_cmd_data(dev, 5, 0x0c, 0x00, rngmask, 0x00);
+       usbduxfast_cmd_data(dev, 6, 0x01, 0x00, rngmask, 0x00);
+
+       ret = usbduxfast_send_cmd(dev, SENDADCOMMANDS);
+       if (ret < 0) {
+               up(&devpriv->sem);
+               return ret;
        }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi%d: usbduxfast: submitting in-urb: "
-              "0x%p,0x%p\n", udfs->comedidev->minor, udfs->urbIn->context,
-              udfs->urbIn->dev);
-#endif
+
        for (i = 0; i < PACKETS_TO_IGNORE; i++) {
-               err = usb_bulk_msg(udfs->usbdev,
-                                  usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-                                  udfs->transfer_buffer, SIZEINBUF,
+               ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, BULKINEP),
+                                  devpriv->inbuf, SIZEINBUF,
                                   &actual_length, 10000);
-               if (err < 0) {
-                       dev_err(dev->class_dev, "insn timeout. No data.\n");
-                       up(&udfs->sem);
-                       return err;
+               if (ret < 0) {
+                       dev_err(dev->class_dev, "insn timeout, no data\n");
+                       up(&devpriv->sem);
+                       return ret;
                }
        }
-       /* data points */
+
        for (i = 0; i < insn->n;) {
-               err = usb_bulk_msg(udfs->usbdev,
-                                  usb_rcvbulkpipe(udfs->usbdev, BULKINEP),
-                                  udfs->transfer_buffer, SIZEINBUF,
+               ret = usb_bulk_msg(usb, usb_rcvbulkpipe(usb, BULKINEP),
+                                  devpriv->inbuf, SIZEINBUF,
                                   &actual_length, 10000);
-               if (err < 0) {
-                       dev_err(dev->class_dev, "insn data error: %d\n", err);
-                       up(&udfs->sem);
-                       return err;
+               if (ret < 0) {
+                       dev_err(dev->class_dev, "insn data error: %d\n", ret);
+                       up(&devpriv->sem);
+                       return ret;
                }
                n = actual_length / sizeof(uint16_t);
                if ((n % 16) != 0) {
-                       dev_err(dev->class_dev, "insn data packet corrupted.\n");
-                       up(&udfs->sem);
+                       dev_err(dev->class_dev, "insn data packet corrupted\n");
+                       up(&devpriv->sem);
                        return -EINVAL;
                }
                for (j = chan; (j < n) && (i < insn->n); j = j + 16) {
-                       data[i] = ((uint16_t *) (udfs->transfer_buffer))[j];
+                       data[i] = ((uint16_t *) (devpriv->inbuf))[j];
                        i++;
                }
        }
-       up(&udfs->sem);
-       return i;
-}
-
-#define FIRMWARE_MAX_LEN 0x2000
-
-static int firmwareUpload(struct usbduxfastsub_s *usbduxfastsub,
-                         const u8 *firmwareBinary, int sizeFirmware)
-{
-       int ret;
-       uint8_t *fwBuf;
-
-       if (!firmwareBinary)
-               return 0;
-
-       if (sizeFirmware > FIRMWARE_MAX_LEN) {
-               dev_err(&usbduxfastsub->interface->dev,
-                       "comedi_: usbduxfast firmware binary it too large for FX2.\n");
-               return -ENOMEM;
-       }
-
-       /* we generate a local buffer for the firmware */
-       fwBuf = kmemdup(firmwareBinary, sizeFirmware, GFP_KERNEL);
-       if (!fwBuf) {
-               dev_err(&usbduxfastsub->interface->dev,
-                       "comedi_: mem alloc for firmware failed\n");
-               return -ENOMEM;
-       }
-
-       ret = usbduxfastsub_stop(usbduxfastsub);
-       if (ret < 0) {
-               dev_err(&usbduxfastsub->interface->dev,
-                       "comedi_: can not stop firmware\n");
-               kfree(fwBuf);
-               return ret;
-       }
-
-       ret = usbduxfastsub_upload(usbduxfastsub, fwBuf, 0, sizeFirmware);
-       if (ret < 0) {
-               dev_err(&usbduxfastsub->interface->dev,
-                       "comedi_: firmware upload failed\n");
-               kfree(fwBuf);
-               return ret;
-       }
-       ret = usbduxfastsub_start(usbduxfastsub);
-       if (ret < 0) {
-               dev_err(&usbduxfastsub->interface->dev,
-                       "comedi_: can not start firmware\n");
-               kfree(fwBuf);
-               return ret;
-       }
-       kfree(fwBuf);
-       return 0;
-}
-
-static void tidy_up(struct usbduxfastsub_s *udfs)
-{
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast: tiding up\n");
-#endif
-
-       if (!udfs)
-               return;
 
-       /* shows the usb subsystem that the driver is down */
-       if (udfs->interface)
-               usb_set_intfdata(udfs->interface, NULL);
+       up(&devpriv->sem);
 
-       udfs->probed = 0;
-
-       if (udfs->urbIn) {
-               /* waits until a running transfer is over */
-               usb_kill_urb(udfs->urbIn);
-
-               kfree(udfs->transfer_buffer);
-               udfs->transfer_buffer = NULL;
-
-               usb_free_urb(udfs->urbIn);
-               udfs->urbIn = NULL;
-       }
-
-       kfree(udfs->insnBuffer);
-       udfs->insnBuffer = NULL;
-
-       kfree(udfs->dux_commands);
-       udfs->dux_commands = NULL;
-
-       udfs->ai_cmd_running = 0;
+       return insn->n;
 }
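
The read loop above de-interleaves a single channel out of the fixed 16-channel frames the device streams; a small stand-alone sketch of that indexing, using made-up sample values rather than real driver buffers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t inbuf[32];		/* two 16-channel frames = one bulk packet */
	unsigned int chan = 3;		/* channel selected via CR_CHAN() */
	int n = 32, i = 0, j;

	for (j = 0; j < n; j++)
		inbuf[j] = j;		/* fake samples: value == word index */

	/* one sample per frame: inbuf[3], inbuf[19], ... */
	for (j = chan; j < n; j += 16)
		printf("sample %d of chan %u = %u\n",
		       i++, chan, (unsigned int)inbuf[j]);

	return 0;
}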
 
-static int usbduxfast_attach_common(struct comedi_device *dev,
-                                   struct usbduxfastsub_s *udfs)
+static int usbduxfast_attach_common(struct comedi_device *dev)
 {
-       int ret;
+       struct usbduxfast_private *devpriv = dev->private;
        struct comedi_subdevice *s;
+       int ret;
 
-       down(&udfs->sem);
-       /* pointer back to the corresponding comedi device */
-       udfs->comedidev = dev;
+       down(&devpriv->sem);
 
        ret = comedi_alloc_subdevices(dev, 1);
        if (ret) {
-               up(&udfs->sem);
+               up(&devpriv->sem);
                return ret;
        }
-       /* private structure is also simply the usb-structure */
-       dev->private = udfs;
-       /* the first subdevice is the A/D converter */
-       s = &dev->subdevices[SUBDEV_AD];
-       /*
-        * the URBs get the comedi subdevice which is responsible for reading
-        * this is the subdevice which reads data
-        */
+
+       /* Analog Input subdevice */
+       s = &dev->subdevices[0];
        dev->read_subdev = s;
-       /* the subdevice receives as private structure the usb-structure */
-       s->private = NULL;
-       /* analog input */
-       s->type = COMEDI_SUBD_AI;
-       /* readable and ref is to ground */
-       s->subdev_flags = SDF_READABLE | SDF_GROUND | SDF_CMD_READ;
-       /* 16 channels */
-       s->n_chan = 16;
-       /* length of the channellist */
-       s->len_chanlist = 16;
-       /* callback functions */
-       s->insn_read = usbduxfast_ai_insn_read;
-       s->do_cmdtest = usbduxfast_ai_cmdtest;
-       s->do_cmd = usbduxfast_ai_cmd;
-       s->cancel = usbduxfast_ai_cancel;
-       /* max value from the A/D converter (12bit+1 bit for overflow) */
-       s->maxdata = 0x1000;
-       /* range table to convert to physical units */
-       s->range_table = &range_usbduxfast_ai_range;
-       /* finally decide that it's attached */
-       udfs->attached = 1;
-       up(&udfs->sem);
-       dev_info(dev->class_dev, "successfully attached to usbduxfast.\n");
+       s->type         = COMEDI_SUBD_AI;
+       s->subdev_flags = SDF_READABLE | SDF_GROUND | SDF_CMD_READ;
+       s->n_chan       = 16;
+       s->len_chanlist = 16;
+       s->insn_read    = usbduxfast_ai_insn_read;
+       s->do_cmdtest   = usbduxfast_ai_cmdtest;
+       s->do_cmd       = usbduxfast_ai_cmd;
+       s->cancel       = usbduxfast_ai_cancel;
+       s->maxdata      = 0x1000;
+       s->range_table  = &range_usbduxfast_ai_range;
+
+       up(&devpriv->sem);
+
        return 0;
 }
 
-static int usbduxfast_auto_attach(struct comedi_device *dev,
-                                 unsigned long context_unused)
+static int usbduxfast_upload_firmware(struct comedi_device *dev,
+                                     const u8 *data, size_t size,
+                                     unsigned long context)
 {
-       struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       uint8_t *buf;
+       unsigned char *tmp;
        int ret;
-       struct usbduxfastsub_s *udfs;
 
-       dev->private = NULL;
-       down(&start_stop_sem);
-       udfs = usb_get_intfdata(uinterf);
-       if (!udfs || !udfs->probed) {
-               dev_err(dev->class_dev,
-                       "usbduxfast: error: auto_attach failed, not connected\n");
-               ret = -ENODEV;
-       } else if (udfs->attached) {
-               dev_err(dev->class_dev,
-                      "usbduxfast: error: auto_attach failed, already attached\n");
-               ret = -ENODEV;
-       } else
-               ret = usbduxfast_attach_common(dev, udfs);
-       up(&start_stop_sem);
-       return ret;
-}
+       if (!data)
+               return 0;
 
-static void usbduxfast_detach(struct comedi_device *dev)
-{
-       struct usbduxfastsub_s *usb = dev->private;
-
-       if (usb) {
-               down(&usb->sem);
-               down(&start_stop_sem);
-               dev->private = NULL;
-               usb->attached = 0;
-               usb->comedidev = NULL;
-               up(&start_stop_sem);
-               up(&usb->sem);
+       if (size > FIRMWARE_MAX_LEN) {
+               dev_err(dev->class_dev, "firmware binary too large for FX2\n");
+               return -ENOMEM;
        }
-}
-
-static struct comedi_driver usbduxfast_driver = {
-       .driver_name    = "usbduxfast",
-       .module         = THIS_MODULE,
-       .auto_attach    = usbduxfast_auto_attach,
-       .detach         = usbduxfast_detach,
-};
 
-static void usbduxfast_firmware_request_complete_handler(const struct firmware
-                                                        *fw, void *context)
-{
-       struct usbduxfastsub_s *usbduxfastsub_tmp = context;
-       struct usb_interface *uinterf = usbduxfastsub_tmp->interface;
-       int ret;
+       /* we generate a local buffer for the firmware */
+       buf = kmemdup(data, size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
 
-       if (fw == NULL)
-               return;
+       /* we need a malloc'ed buffer for usb_control_msg() */
+       tmp = kmalloc(1, GFP_KERNEL);
+       if (!tmp) {
+               kfree(buf);
+               return -ENOMEM;
+       }
 
-       /*
-        * we need to upload the firmware here because fw will be
-        * freed once we've left this function
-        */
-       ret = firmwareUpload(usbduxfastsub_tmp, fw->data, fw->size);
+       /* stop the current firmware on the device */
+       *tmp = 1;       /* 7f92 to one */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXFASTSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXFASTSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             EZTIMEOUT);
+       if (ret < 0) {
+               dev_err(dev->class_dev, "can not stop firmware\n");
+               goto done;
+       }
 
-       if (ret) {
-               dev_err(&uinterf->dev,
-                       "Could not upload firmware (err=%d)\n", ret);
-               goto out;
+       /* upload the new firmware to the device */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXFASTSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             0, 0x0000,
+                             buf, size,
+                             EZTIMEOUT);
+       if (ret < 0) {
+               dev_err(dev->class_dev, "firmware upload failed\n");
+               goto done;
        }
 
-       comedi_usb_auto_config(uinterf, &usbduxfast_driver, 0);
- out:
-       release_firmware(fw);
+       /* start the new firmware on the device */
+       *tmp = 0;       /* 7f92 to zero */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXFASTSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXFASTSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             EZTIMEOUT);
+       if (ret < 0)
+               dev_err(dev->class_dev, "can not start firmware\n");
+
+done:
+       kfree(tmp);
+       kfree(buf);
+       return ret;
 }
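
A note on the three control transfers above: on the Cypress EZ-USB family the CPUCS register gates the 8051 core, so a firmware load is hold-in-reset (write 1), write the image into code RAM, then release reset (write 0) to boot it. The "7f92 to one/zero" comments appear to be carried over from the older AN21xx parts, whose CPUCS sat at 0x7F92; on the FX2 the register lives at 0xE600 (compare the USBDUXSUB_CPUCS define in the usbduxsigma diff further down).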
 
-static int usbduxfast_usb_probe(struct usb_interface *uinterf,
-                               const struct usb_device_id *id)
+static int usbduxfast_auto_attach(struct comedi_device *dev,
+                                 unsigned long context_unused)
 {
-       struct usb_device *udev = interface_to_usbdev(uinterf);
-       int i;
-       int index;
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxfast_private *devpriv;
        int ret;
 
-       if (udev->speed != USB_SPEED_HIGH) {
-               dev_err(&uinterf->dev,
+       if (usb->speed != USB_SPEED_HIGH) {
+               dev_err(dev->class_dev,
                        "This driver needs USB 2.0 to operate. Aborting...\n");
                return -ENODEV;
        }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast_: finding a free structure for "
-              "the usb-device\n");
-#endif
-       down(&start_stop_sem);
-       /* look for a free place in the usbduxfast array */
-       index = -1;
-       for (i = 0; i < NUMUSBDUXFAST; i++) {
-               if (!usbduxfastsub[i].probed) {
-                       index = i;
-                       break;
-               }
-       }
 
-       /* no more space */
-       if (index == -1) {
-               dev_err(&uinterf->dev,
-                       "Too many usbduxfast-devices connected.\n");
-               up(&start_stop_sem);
-               return -EMFILE;
-       }
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast: usbduxfastsub[%d] is ready to "
-              "connect to comedi.\n", index);
-#endif
-
-       sema_init(&(usbduxfastsub[index].sem), 1);
-       /* save a pointer to the usb device */
-       usbduxfastsub[index].usbdev = udev;
-
-       /* save the interface itself */
-       usbduxfastsub[index].interface = uinterf;
-       /* get the interface number from the interface */
-       usbduxfastsub[index].ifnum = uinterf->altsetting->desc.bInterfaceNumber;
-       /*
-        * hand the private data over to the usb subsystem
-        * will be needed for disconnect
-        */
-       usb_set_intfdata(uinterf, &(usbduxfastsub[index]));
-
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast: ifnum=%d\n",
-              usbduxfastsub[index].ifnum);
-#endif
-       /* create space for the commands going to the usb device */
-       usbduxfastsub[index].dux_commands = kmalloc(SIZEOFDUXBUFFER,
-                                                   GFP_KERNEL);
-       if (!usbduxfastsub[index].dux_commands) {
-               tidy_up(&(usbduxfastsub[index]));
-               up(&start_stop_sem);
+       devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
+       if (!devpriv)
                return -ENOMEM;
-       }
-       /* create space of the instruction buffer */
-       usbduxfastsub[index].insnBuffer = kmalloc(SIZEINSNBUF, GFP_KERNEL);
-       if (!usbduxfastsub[index].insnBuffer) {
-               tidy_up(&(usbduxfastsub[index]));
-               up(&start_stop_sem);
+       dev->private = devpriv;
+
+       sema_init(&devpriv->sem, 1);
+       usb_set_intfdata(intf, devpriv);
+
+       devpriv->duxbuf = kmalloc(SIZEOFDUXBUF, GFP_KERNEL);
+       if (!devpriv->duxbuf)
                return -ENOMEM;
-       }
-       /* setting to alternate setting 1: enabling bulk ep */
-       i = usb_set_interface(usbduxfastsub[index].usbdev,
-                             usbduxfastsub[index].ifnum, 1);
-       if (i < 0) {
-               dev_err(&uinterf->dev,
-                       "usbduxfast%d: could not switch to alternate setting 1.\n",
-                       index);
-               tidy_up(&(usbduxfastsub[index]));
-               up(&start_stop_sem);
+
+       ret = usb_set_interface(usb,
+                               intf->altsetting->desc.bInterfaceNumber, 1);
+       if (ret < 0) {
+               dev_err(dev->class_dev,
+                       "could not switch to alternate setting 1\n");
                return -ENODEV;
        }
-       usbduxfastsub[index].urbIn = usb_alloc_urb(0, GFP_KERNEL);
-       if (!usbduxfastsub[index].urbIn) {
-               dev_err(&uinterf->dev,
-                       "usbduxfast%d: Could not alloc. urb\n", index);
-               tidy_up(&(usbduxfastsub[index]));
-               up(&start_stop_sem);
+
+       devpriv->urb = usb_alloc_urb(0, GFP_KERNEL);
+       if (!devpriv->urb) {
+               dev_err(dev->class_dev, "Could not alloc. urb\n");
                return -ENOMEM;
        }
-       usbduxfastsub[index].transfer_buffer = kmalloc(SIZEINBUF, GFP_KERNEL);
-       if (!usbduxfastsub[index].transfer_buffer) {
-               tidy_up(&(usbduxfastsub[index]));
-               up(&start_stop_sem);
+
+       devpriv->inbuf = kmalloc(SIZEINBUF, GFP_KERNEL);
+       if (!devpriv->inbuf)
                return -ENOMEM;
-       }
-       /* we've reached the bottom of the function */
-       usbduxfastsub[index].probed = 1;
-       up(&start_stop_sem);
-
-       ret = request_firmware_nowait(THIS_MODULE,
-                                     FW_ACTION_HOTPLUG,
-                                     FIRMWARE,
-                                     &udev->dev,
-                                     GFP_KERNEL,
-                                     usbduxfastsub + index,
-                                     usbduxfast_firmware_request_complete_handler);
 
-       if (ret) {
-               dev_err(&uinterf->dev, "could not load firmware (err=%d)\n", ret);
+       ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+                                  usbduxfast_upload_firmware, 0);
+       if (ret)
                return ret;
-       }
 
-       dev_info(&uinterf->dev,
-                "usbduxfast%d has been successfully initialized.\n", index);
-       /* success */
-       return 0;
+       return usbduxfast_attach_common(dev);
 }
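
Unlike the request_firmware_nowait() flow it replaces, comedi_load_firmware() here fetches the FIRMWARE blob synchronously and hands it to the usbduxfast_upload_firmware() callback, so a failed upload can abort the attach directly through its return value.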
 
-static void usbduxfast_usb_disconnect(struct usb_interface *intf)
+static void usbduxfast_detach(struct comedi_device *dev)
 {
-       struct usbduxfastsub_s *udfs = usb_get_intfdata(intf);
-       struct usb_device *udev = interface_to_usbdev(intf);
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct usbduxfast_private *devpriv = dev->private;
 
-       if (!udfs) {
-               dev_err(&intf->dev, "disconnect called with null pointer.\n");
-               return;
-       }
-       if (udfs->usbdev != udev) {
-               dev_err(&intf->dev, "BUG! called with wrong ptr!!!\n");
+       if (!devpriv)
                return;
+
+       down(&devpriv->sem);
+
+       usb_set_intfdata(intf, NULL);
+
+       if (devpriv->urb) {
+               /* waits until a running transfer is over */
+               usb_kill_urb(devpriv->urb);
+
+               kfree(devpriv->inbuf);
+               devpriv->inbuf = NULL;
+
+               usb_free_urb(devpriv->urb);
+               devpriv->urb = NULL;
        }
 
-       comedi_usb_auto_unconfig(intf);
+       kfree(devpriv->duxbuf);
+       devpriv->duxbuf = NULL;
 
-       down(&start_stop_sem);
-       down(&udfs->sem);
-       tidy_up(udfs);
-       up(&udfs->sem);
-       up(&start_stop_sem);
+       devpriv->ai_cmd_running = 0;
 
-#ifdef CONFIG_COMEDI_DEBUG
-       printk(KERN_DEBUG "comedi_: usbduxfast: disconnected from the usb\n");
-#endif
+       up(&devpriv->sem);
+}
+
+static struct comedi_driver usbduxfast_driver = {
+       .driver_name    = "usbduxfast",
+       .module         = THIS_MODULE,
+       .auto_attach    = usbduxfast_auto_attach,
+       .detach         = usbduxfast_detach,
+};
+
+static int usbduxfast_usb_probe(struct usb_interface *intf,
+                               const struct usb_device_id *id)
+{
+       return comedi_usb_auto_config(intf, &usbduxfast_driver, 0);
 }
 
 static const struct usb_device_id usbduxfast_usb_table[] = {
@@ -1657,12 +1152,9 @@ static const struct usb_device_id usbduxfast_usb_table[] = {
 MODULE_DEVICE_TABLE(usb, usbduxfast_usb_table);
 
 static struct usb_driver usbduxfast_usb_driver = {
-#ifdef COMEDI_HAVE_USB_DRIVER_OWNER
-       .owner          = THIS_MODULE,
-#endif
        .name           = "usbduxfast",
        .probe          = usbduxfast_usb_probe,
-       .disconnect     = usbduxfast_usb_disconnect,
+       .disconnect     = comedi_usb_auto_unconfig,
        .id_table       = usbduxfast_usb_table,
 };
 module_comedi_usb_driver(usbduxfast_driver, usbduxfast_usb_driver);
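
With the static device array gone, the USB glue shrinks to two delegations into the comedi core: probe calls comedi_usb_auto_config() to run the driver's auto_attach, and comedi_usb_auto_unconfig() is plugged in directly as the .disconnect handler, which in turn lands in usbduxfast_detach() above.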
index d3bc1b9910a7d3cea18b57364b102f3e21bc0359..898c3c4504066c31af6a44ec87f37b7bfc484aa0 100644 (file)
@@ -1,30 +1,27 @@
 /*
-   comedi/drivers/usbdux.c
-   Copyright (C) 2011 Bernd Porr, Bernd.Porr@f2s.com
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
+ * usbduxsigma.c
+ * Copyright (C) 2011 Bernd Porr, Bernd.Porr@f2s.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  */
+
 /*
-Driver: usbduxsigma
-Description: University of Stirling USB DAQ & INCITE Technology Limited
-Devices: [ITL] USB-DUX (usbduxsigma.o)
-Author: Bernd Porr <BerndPorr@f2s.com>
-Updated: 8 Nov 2011
-Status: testing
-*/
+ * Driver: usbduxsigma
+ * Description: University of Stirling USB DAQ & INCITE Technology Limited
+ * Devices: (ITL) USB-DUX [usbduxsigma]
+ * Author: Bernd Porr <BerndPorr@f2s.com>
+ * Updated: 8 Nov 2011
+ * Status: testing
+ */
+
 /*
  * I must give credit here to Chris Baugher who
  * wrote the driver for AT-MIO-16d. I used some parts of this
@@ -44,9 +41,6 @@ Status: testing
  *   0.6: corrected wrong input range
  */
 
-/* generates loads of debug info */
-/* #define NOISY_DUX_DEBUGBUG */
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -55,7 +49,7 @@ Status: testing
 #include <linux/usb.h>
 #include <linux/fcntl.h>
 #include <linux/compiler.h>
-#include <linux/firmware.h>
+
 #include "comedi_fc.h"
 #include "../comedidev.h"
 
@@ -63,38 +57,21 @@ Status: testing
 #define BULK_TIMEOUT 1000
 
 /* constants for "firmware" upload and download */
-#define FIRMWARE "usbduxsigma_firmware.bin"
-#define USBDUXSUB_FIRMWARE 0xA0
-#define VENDOR_DIR_IN  0xC0
-#define VENDOR_DIR_OUT 0x40
+#define FIRMWARE               "usbduxsigma_firmware.bin"
+#define FIRMWARE_MAX_LEN       0x4000
+#define USBDUXSUB_FIRMWARE     0xa0
+#define VENDOR_DIR_IN          0xc0
+#define VENDOR_DIR_OUT         0x40
 
 /* internal addresses of the 8051 processor */
 #define USBDUXSUB_CPUCS 0xE600
 
-/*
- * the minor device number, major is 180 only for debugging purposes and to
- * upload special firmware (programming the eeprom etc) which is not
- * compatible with the comedi framwork
- */
-#define USBDUXSUB_MINOR 32
-
-/* max lenghth of the transfer-buffer for software upload */
-#define TB_LEN 0x2000
-
-/* Input endpoint number: ISO/IRQ */
-#define ISOINEP           6
-
-/* Output endpoint number: ISO/IRQ */
-#define ISOOUTEP          2
-
-/* This EP sends DUX commands to USBDUX */
-#define COMMAND_OUT_EP     1
-
-/* This EP receives the DUX commands from USBDUX */
-#define COMMAND_IN_EP        8
-
-/* Output endpoint for PWM */
-#define PWM_EP         4
+/* USB endpoints */
+#define USBDUXSIGMA_CMD_OUT_EP         1       /* command output */
+#define USBDUXSIGMA_ISO_OUT_EP         2       /* analog output ISO/IRQ */
+#define USBDUXSIGMA_PWM_OUT_EP         4       /* pwm output */
+#define USBDUXSIGMA_ISO_IN_EP          6       /* analog input ISO/IRQ */
+#define USBDUXSIGMA_CMD_IN_EP          8       /* command input */
 
 /* 300Hz max frequency under PWM */
 #define MIN_PWM_PERIOD  ((long)(1E9/300))
@@ -105,6 +82,8 @@ Status: testing
 /* Number of channels (16 AD and offset)*/
 #define NUMCHANNELS 16
 
+#define USBDUXSIGMA_NUM_AO_CHAN                4
+
 /* Size of one A/D value */
 #define SIZEADIN          ((sizeof(int32_t)))
 
@@ -150,84 +129,54 @@ Status: testing
 /* must have more buffers due to buggy USB ctr */
 #define NUMOFOUTBUFFERSHIGH    10
 
-/* Total number of usbdux devices */
-#define NUMUSBDUX             16
-
-/* Analogue in subdevice */
-#define SUBDEV_AD             0
-
-/* Analogue out subdevice */
-#define SUBDEV_DA             1
-
-/* Digital I/O */
-#define SUBDEV_DIO            2
-
-/* timer aka pwm output */
-#define SUBDEV_PWM            3
-
 /* number of retries to get the right dux command */
 #define RETRIES 10
 
-/**************************************************/
-/* comedi constants */
-static const struct comedi_lrange range_usbdux_ai_range = { 1, {
-                                                               BIP_RANGE
-                                                               (2.65/2.0)
-                                                               }
-};
+/* bulk transfer commands to usbduxsigma */
+#define USBBUXSIGMA_AD_CMD             0
+#define USBDUXSIGMA_DA_CMD             1
+#define USBDUXSIGMA_DIO_CFG_CMD                2
+#define USBDUXSIGMA_DIO_BITS_CMD       3
+#define USBDUXSIGMA_SINGLE_AD_CMD      4
+#define USBDUXSIGMA_PWM_ON_CMD         7
+#define USBDUXSIGMA_PWM_OFF_CMD                8
 
-/*
- * private structure of one subdevice
- */
+static const struct comedi_lrange usbduxsigma_ai_range = {
+       1, {
+               BIP_RANGE(2.65 / 2.0)
+       }
+};
 
-/*
- * This is the structure which holds all the data of
- * this driver one sub device just now: A/D
- */
-struct usbduxsub {
-       /* attached? */
-       int attached;
-       /* is it associated with a subdevice? */
-       int probed;
-       /* pointer to the usb-device */
-       struct usb_device *usbdev;
+struct usbduxsigma_private {
        /* actual number of in-buffers */
-       int numOfInBuffers;
+       int n_ai_urbs;
        /* actual number of out-buffers */
-       int numOfOutBuffers;
+       int n_ao_urbs;
        /* ISO-transfer handling: buffers */
-       struct urb **urbIn;
-       struct urb **urbOut;
+       struct urb **ai_urbs;
+       struct urb **ao_urbs;
        /* pwm-transfer handling */
-       struct urb *urbPwm;
+       struct urb *pwm_urb;
        /* PWM period */
-       unsigned int pwmPeriod;
+       unsigned int pwm_period;
        /* PWM internal delay for the GPIF in the FX2 */
-       uint8_t pwmDelay;
+       uint8_t pwm_delay;
        /* size of the PWM buffer which holds the bit pattern */
-       int sizePwmBuf;
+       int pwm_buf_sz;
        /* input buffer for the ISO-transfer */
-       int32_t *inBuffer;
+       int32_t *in_buf;
        /* input buffer for single insn */
-       int8_t *insnBuffer;
-       /* output buffer for single DA outputs */
-       int16_t *outBuffer;
-       /* interface number */
-       int ifnum;
-       /* interface structure in 2.6 */
-       struct usb_interface *interface;
-       /* comedi device for the interrupt context */
-       struct comedi_device *comedidev;
-       /* is it USB_SPEED_HIGH or not? */
-       short int high_speed;
-       /* asynchronous command is running */
-       short int ai_cmd_running;
-       short int ao_cmd_running;
-       /* pwm is running */
-       short int pwm_cmd_running;
-       /* continuous acquisition */
-       short int ai_continuous;
-       short int ao_continuous;
+       int8_t *insn_buf;
+
+       unsigned int ao_readback[USBDUXSIGMA_NUM_AO_CHAN];
+
+       unsigned high_speed:1;
+       unsigned ai_cmd_running:1;
+       unsigned ai_continuous:1;
+       unsigned ao_cmd_running:1;
+       unsigned ao_continuous:1;
+       unsigned pwm_cmd_running:1;
+
        /* number of samples to acquire */
        int ai_sample_count;
        int ao_sample_count;
@@ -246,126 +195,58 @@ struct usbduxsub {
        struct semaphore sem;
 };
 
-/*
- * The pointer to the private usb-data of the driver is also the private data
- * for the comedi-device.  This has to be global as the usb subsystem needs
- * global variables. The other reason is that this structure must be there
- * _before_ any comedi command is issued. The usb subsystem must be initialised
- * before comedi can access it.
- */
-static struct usbduxsub usbduxsub[NUMUSBDUX];
-
-static DEFINE_SEMAPHORE(start_stop_sem);
-
-/*
- * Stops the data acquision
- * It should be safe to call this function from any context
- */
-static int usbduxsub_unlink_InURBs(struct usbduxsub *usbduxsub_tmp)
-{
-       int i = 0;
-       int err = 0;
-
-       if (usbduxsub_tmp && usbduxsub_tmp->urbIn) {
-               for (i = 0; i < usbduxsub_tmp->numOfInBuffers; i++) {
-                       if (usbduxsub_tmp->urbIn[i]) {
-                               /* We wait here until all transfers have been
-                                * cancelled. */
-                               usb_kill_urb(usbduxsub_tmp->urbIn[i]);
-                       }
-                       dev_dbg(&usbduxsub_tmp->interface->dev,
-                               "comedi: usbdux: unlinked InURB %d, err=%d\n",
-                               i, err);
-               }
-       }
-       return err;
-}
-
-/*
- * This will stop a running acquisition operation
- * Is called from within this driver from both the
- * interrupt context and from comedi
- */
-static int usbdux_ai_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
+static void usbduxsigma_ai_stop(struct comedi_device *dev, int do_unlink)
 {
-       int ret = 0;
-
-       if (!this_usbduxsub) {
-               pr_err("comedi?: usbdux_ai_stop: this_usbduxsub=NULL!\n");
-               return -EFAULT;
-       }
-       dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ai_stop\n");
+       struct usbduxsigma_private *devpriv = dev->private;
 
        if (do_unlink) {
-               /* stop aquistion */
-               ret = usbduxsub_unlink_InURBs(this_usbduxsub);
-       }
+               int i;
 
-       this_usbduxsub->ai_cmd_running = 0;
+               for (i = 0; i < devpriv->n_ai_urbs; i++) {
+                       if (devpriv->ai_urbs[i])
+                               usb_kill_urb(devpriv->ai_urbs[i]);
+               }
+       }
 
-       return ret;
+       devpriv->ai_cmd_running = 0;
 }
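
Note that the stop helper relies on usb_kill_urb(), which not only cancels the transfer but also blocks until the URB's completion handler has returned, so clearing ai_cmd_running afterwards cannot race with a still-running callback.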
 
-/*
- * This will cancel a running acquisition operation.
- * This is called by comedi but never from inside the driver.
- */
-static int usbdux_ai_cancel(struct comedi_device *dev,
-                           struct comedi_subdevice *s)
+static int usbduxsigma_ai_cancel(struct comedi_device *dev,
+                                struct comedi_subdevice *s)
 {
-       struct usbduxsub *this_usbduxsub;
-       int res = 0;
-
-       /* force unlink of all urbs */
-       this_usbduxsub = dev->private;
-       if (!this_usbduxsub)
-               return -EFAULT;
+       struct usbduxsigma_private *devpriv = dev->private;
 
-       dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ai_cancel\n");
+       down(&devpriv->sem);
+       /* unlink only if it is really running */
+       usbduxsigma_ai_stop(dev, devpriv->ai_cmd_running);
+       up(&devpriv->sem);
 
-       /* prevent other CPUs from submitting new commands just now */
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       /* unlink only if the urb really has been submitted */
-       res = usbdux_ai_stop(this_usbduxsub, this_usbduxsub->ai_cmd_running);
-       up(&this_usbduxsub->sem);
-       return res;
+       return 0;
 }
 
-/* analogue IN - interrupt service routine */
-static void usbduxsub_ai_IsocIrq(struct urb *urb)
+static void usbduxsigma_ai_urb_complete(struct urb *urb)
 {
-       int i, err, n;
-       struct usbduxsub *this_usbduxsub;
-       struct comedi_device *this_comedidev;
-       struct comedi_subdevice *s;
-       int32_t v;
+       struct comedi_device *dev = urb->context;
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct comedi_subdevice *s = dev->read_subdev;
        unsigned int dio_state;
-
-       /* the context variable points to the comedi device */
-       this_comedidev = urb->context;
-       /* the private structure of the subdevice is struct usbduxsub */
-       this_usbduxsub = this_comedidev->private;
-       /* subdevice which is the AD converter */
-       s = &this_comedidev->subdevices[SUBDEV_AD];
+       int32_t val;
+       int ret;
+       int i;
 
        /* first we test if something unusual has just happened */
        switch (urb->status) {
        case 0:
                /* copy the result in the transfer buffer */
-               memcpy(this_usbduxsub->inBuffer,
-                      urb->transfer_buffer, SIZEINBUF);
+               memcpy(devpriv->in_buf, urb->transfer_buffer, SIZEINBUF);
                break;
        case -EILSEQ:
-               /* error in the ISOchronous data */
-               /* we don't copy the data into the transfer buffer */
-               /* and recycle the last data byte */
-               dev_dbg(&urb->dev->dev,
-                       "comedi%d: usbdux: CRC error in ISO IN stream.\n",
-                       this_usbduxsub->comedidev->minor);
+               /*
+                * error in the isochronous data
+                * we don't copy the data into the transfer buffer
+                * and recycle the last data byte
+                */
+               dev_dbg(dev->class_dev, "CRC error in ISO IN stream\n");
 
                break;
 
@@ -374,185 +255,127 @@ static void usbduxsub_ai_IsocIrq(struct urb *urb)
        case -ESHUTDOWN:
        case -ECONNABORTED:
                /* happens after an unlink command */
-               if (this_usbduxsub->ai_cmd_running) {
-                       /* we are still running a command */
-                       /* tell this comedi */
-                       s->async->events |= COMEDI_CB_EOA;
-                       s->async->events |= COMEDI_CB_ERROR;
-                       comedi_event(this_usbduxsub->comedidev, s);
-                       /* stop the transfer w/o unlink */
-                       usbdux_ai_stop(this_usbduxsub, 0);
+               if (devpriv->ai_cmd_running) {
+                       usbduxsigma_ai_stop(dev, 0);    /* w/o unlink */
+                       /* we are still running a command, tell comedi */
+                       s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+                       comedi_event(dev, s);
                }
                return;
 
        default:
-               /* a real error on the bus */
-               /* pass error to comedi if we are really running a command */
-               if (this_usbduxsub->ai_cmd_running) {
-                       dev_err(&urb->dev->dev,
-                               "Non-zero urb status received in ai intr "
-                               "context: %d\n", urb->status);
-                       s->async->events |= COMEDI_CB_EOA;
-                       s->async->events |= COMEDI_CB_ERROR;
-                       comedi_event(this_usbduxsub->comedidev, s);
-                       /* don't do an unlink here */
-                       usbdux_ai_stop(this_usbduxsub, 0);
+               /*
+                * a real error on the bus
+                * pass error to comedi if we are really running a command
+                */
+               if (devpriv->ai_cmd_running) {
+                       dev_err(dev->class_dev,
+                               "%s: non-zero urb status (%d)\n",
+                               __func__, urb->status);
+                       usbduxsigma_ai_stop(dev, 0);    /* w/o unlink */
+                       s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+                       comedi_event(dev, s);
                }
                return;
        }
 
-       /*
-        * at this point we are reasonably sure that nothing dodgy has happened
-        * are we running a command?
-        */
-       if (unlikely((!(this_usbduxsub->ai_cmd_running)))) {
-               /*
-                * not running a command, do not continue execution if no
-                * asynchronous command is running in particular not resubmit
-                */
+       if (unlikely(!devpriv->ai_cmd_running))
                return;
-       }
 
-       urb->dev = this_usbduxsub->usbdev;
-
-       /* resubmit the urb */
-       err = usb_submit_urb(urb, GFP_ATOMIC);
-       if (unlikely(err < 0)) {
-               dev_err(&urb->dev->dev,
-                       "comedi_: urb resubmit failed in int-context!"
-                       "err=%d\n",
-                       err);
-               if (err == -EL2NSYNC)
-                       dev_err(&urb->dev->dev,
-                               "buggy USB host controller or bug in IRQ "
-                               "handler!\n");
-               s->async->events |= COMEDI_CB_EOA;
-               s->async->events |= COMEDI_CB_ERROR;
-               comedi_event(this_usbduxsub->comedidev, s);
-               /* don't do an unlink here */
-               usbdux_ai_stop(this_usbduxsub, 0);
+       urb->dev = comedi_to_usb_dev(dev);
+
+       ret = usb_submit_urb(urb, GFP_ATOMIC);
+       if (unlikely(ret < 0)) {
+               dev_err(dev->class_dev, "%s: urb resubmit failed (%d)\n",
+                       __func__, ret);
+               if (ret == -EL2NSYNC)
+                       dev_err(dev->class_dev,
+                               "buggy USB host controller or bug in IRQ handler\n");
+               usbduxsigma_ai_stop(dev, 0);    /* w/o unlink */
+               s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+               comedi_event(dev, s);
                return;
        }
 
        /* get the state of the dio pins to allow external trigger */
-       dio_state = be32_to_cpu(this_usbduxsub->inBuffer[0]);
+       dio_state = be32_to_cpu(devpriv->in_buf[0]);
 
-       this_usbduxsub->ai_counter--;
-       if (likely(this_usbduxsub->ai_counter > 0))
+       devpriv->ai_counter--;
+       if (likely(devpriv->ai_counter > 0))
                return;
 
        /* timer zero, transfer measurements to comedi */
-       this_usbduxsub->ai_counter = this_usbduxsub->ai_timer;
+       devpriv->ai_counter = devpriv->ai_timer;
 
-       /* test, if we transmit only a fixed number of samples */
-       if (!(this_usbduxsub->ai_continuous)) {
+       if (!devpriv->ai_continuous) {
                /* not continuous, fixed number of samples */
-               this_usbduxsub->ai_sample_count--;
-               /* all samples received? */
-               if (this_usbduxsub->ai_sample_count < 0) {
-                       /* prevent a resubmit next time */
-                       usbdux_ai_stop(this_usbduxsub, 0);
-                       /* say comedi that the acquistion is over */
+               devpriv->ai_sample_count--;
+               if (devpriv->ai_sample_count < 0) {
+                       usbduxsigma_ai_stop(dev, 0);    /* w/o unlink */
+                       /* acquisition is over, tell comedi */
                        s->async->events |= COMEDI_CB_EOA;
-                       comedi_event(this_usbduxsub->comedidev, s);
+                       comedi_event(dev, s);
                        return;
                }
        }
+
        /* get the data from the USB bus and hand it over to comedi */
-       n = s->async->cmd.chanlist_len;
-       for (i = 0; i < n; i++) {
+       for (i = 0; i < s->async->cmd.chanlist_len; i++) {
                /* transfer data, note the first word is the DIO state */
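+               /*
+                * Samples are 24-bit two's complement; flipping the MSB
+                * below maps them onto the unsigned range 0..0xffffff.
+                */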
-               v = be32_to_cpu(this_usbduxsub->inBuffer[i+1]);
-               /* strip status byte */
-               v = v & 0x00ffffff;
-               /* convert to unsigned */
-               v = v ^ 0x00800000;
-               /* write the byte to the buffer */
-               err = cfc_write_array_to_buffer(s, &v, sizeof(uint32_t));
-               if (unlikely(err == 0)) {
+               val = be32_to_cpu(devpriv->in_buf[i+1]);
+               val &= 0x00ffffff;      /* strip status byte */
+               val ^= 0x00800000;      /* convert to unsigned */
+
+               ret = cfc_write_array_to_buffer(s, &val, sizeof(uint32_t));
+               if (unlikely(ret == 0)) {
                        /* buffer overflow */
-                       usbdux_ai_stop(this_usbduxsub, 0);
+                       usbduxsigma_ai_stop(dev, 0);    /* w/o unlink */
                        return;
                }
        }
        /* tell comedi that data is there */
-       s->async->events |= COMEDI_CB_BLOCK | COMEDI_CB_EOS;
-       comedi_event(this_usbduxsub->comedidev, s);
+       s->async->events |= (COMEDI_CB_BLOCK | COMEDI_CB_EOS);
+       comedi_event(dev, s);
 }
 
-static int usbduxsub_unlink_OutURBs(struct usbduxsub *usbduxsub_tmp)
+static void usbduxsigma_ao_stop(struct comedi_device *dev, int do_unlink)
 {
-       int i = 0;
-       int err = 0;
+       struct usbduxsigma_private *devpriv = dev->private;
 
-       if (usbduxsub_tmp && usbduxsub_tmp->urbOut) {
-               for (i = 0; i < usbduxsub_tmp->numOfOutBuffers; i++) {
-                       if (usbduxsub_tmp->urbOut[i])
-                               usb_kill_urb(usbduxsub_tmp->urbOut[i]);
+       if (do_unlink) {
+               int i;
 
-                       dev_dbg(&usbduxsub_tmp->interface->dev,
-                               "comedi: usbdux: unlinked OutURB %d: res=%d\n",
-                               i, err);
+               for (i = 0; i < devpriv->n_ao_urbs; i++) {
+                       if (devpriv->ao_urbs[i])
+                               usb_kill_urb(devpriv->ao_urbs[i]);
                }
        }
-       return err;
-}
-
-/* This will cancel a running acquisition operation
- * in any context.
- */
-static int usbdux_ao_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
-{
-       int ret = 0;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
-       dev_dbg(&this_usbduxsub->interface->dev, "comedi: usbdux_ao_cancel\n");
 
-       if (do_unlink)
-               ret = usbduxsub_unlink_OutURBs(this_usbduxsub);
-
-       this_usbduxsub->ao_cmd_running = 0;
-
-       return ret;
+       devpriv->ao_cmd_running = 0;
 }
 
-/* force unlink, is called by comedi */
-static int usbdux_ao_cancel(struct comedi_device *dev,
-                           struct comedi_subdevice *s)
+static int usbduxsigma_ao_cancel(struct comedi_device *dev,
+                                struct comedi_subdevice *s)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int res = 0;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
+       struct usbduxsigma_private *devpriv = dev->private;
 
-       /* prevent other CPUs from submitting a command just now */
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
+       down(&devpriv->sem);
        /* unlink only if it is really running */
-       res = usbdux_ao_stop(this_usbduxsub, this_usbduxsub->ao_cmd_running);
-       up(&this_usbduxsub->sem);
-       return res;
+       usbduxsigma_ao_stop(dev, devpriv->ao_cmd_running);
+       up(&devpriv->sem);
+
+       return 0;
 }
 
-static void usbduxsub_ao_IsocIrq(struct urb *urb)
+static void usbduxsigma_ao_urb_complete(struct urb *urb)
 {
-       int i, ret;
+       struct comedi_device *dev = urb->context;
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct comedi_subdevice *s = dev->write_subdev;
        uint8_t *datap;
-       struct usbduxsub *this_usbduxsub;
-       struct comedi_device *this_comedidev;
-       struct comedi_subdevice *s;
-
-       /* the context variable points to the subdevice */
-       this_comedidev = urb->context;
-       /* the private structure of the subdevice is struct usbduxsub */
-       this_usbduxsub = this_comedidev->private;
-
-       s = &this_comedidev->subdevices[SUBDEV_DA];
+       int len;
+       int ret;
+       int i;
 
        switch (urb->status) {
        case 0:
@@ -563,347 +386,141 @@ static void usbduxsub_ao_IsocIrq(struct urb *urb)
        case -ENOENT:
        case -ESHUTDOWN:
        case -ECONNABORTED:
-               /* after an unlink command, unplug, ... etc */
-               /* no unlink needed here. Already shutting down. */
-               if (this_usbduxsub->ao_cmd_running) {
+               /* happens after an unlink command */
+               if (devpriv->ao_cmd_running) {
+                       usbduxsigma_ao_stop(dev, 0);    /* w/o unlink */
                        s->async->events |= COMEDI_CB_EOA;
-                       comedi_event(this_usbduxsub->comedidev, s);
-                       usbdux_ao_stop(this_usbduxsub, 0);
+                       comedi_event(dev, s);
                }
                return;
 
        default:
                /* a real error */
-               if (this_usbduxsub->ao_cmd_running) {
-                       dev_err(&urb->dev->dev,
-                               "comedi_: Non-zero urb status received in ao "
-                               "intr context: %d\n", urb->status);
-                       s->async->events |= COMEDI_CB_ERROR;
-                       s->async->events |= COMEDI_CB_EOA;
-                       comedi_event(this_usbduxsub->comedidev, s);
-                       /* we do an unlink if we are in the high speed mode */
-                       usbdux_ao_stop(this_usbduxsub, 0);
+               if (devpriv->ao_cmd_running) {
+                       dev_err(dev->class_dev,
+                               "%s: non-zero urb status (%d)\n",
+                               __func__, urb->status);
+                       usbduxsigma_ao_stop(dev, 0);    /* w/o unlink */
+                       s->async->events |= (COMEDI_CB_ERROR | COMEDI_CB_EOA);
+                       comedi_event(dev, s);
                }
                return;
        }
 
-       /* are we actually running? */
-       if (!(this_usbduxsub->ao_cmd_running))
+       if (!devpriv->ao_cmd_running)
                return;
 
-       /* normal operation: executing a command in this subdevice */
-       this_usbduxsub->ao_counter--;
-       if ((int)this_usbduxsub->ao_counter <= 0) {
-               /* timer zero */
-               this_usbduxsub->ao_counter = this_usbduxsub->ao_timer;
-
-               /* handle non continuous acquisition */
-               if (!(this_usbduxsub->ao_continuous)) {
-                       /* fixed number of samples */
-                       this_usbduxsub->ao_sample_count--;
-                       if (this_usbduxsub->ao_sample_count < 0) {
-                               /* all samples transmitted */
-                               usbdux_ao_stop(this_usbduxsub, 0);
+       devpriv->ao_counter--;
+       if ((int)devpriv->ao_counter <= 0) {
+               /* timer zero, transfer from comedi */
+               devpriv->ao_counter = devpriv->ao_timer;
+
+               if (!devpriv->ao_continuous) {
+                       /* not continuous, fixed number of samples */
+                       devpriv->ao_sample_count--;
+                       if (devpriv->ao_sample_count < 0) {
+                               usbduxsigma_ao_stop(dev, 0);    /* w/o unlink */
+                               /* acquisition is over, tell comedi */
                                s->async->events |= COMEDI_CB_EOA;
-                               comedi_event(this_usbduxsub->comedidev, s);
-                               /* no resubmit of the urb */
+                               comedi_event(dev, s);
                                return;
                        }
                }
+
                /* transmit data to the USB bus */
-               ((uint8_t *) (urb->transfer_buffer))[0] =
-                   s->async->cmd.chanlist_len;
-               for (i = 0; i < s->async->cmd.chanlist_len; i++) {
-                       short temp;
-                       if (i >= NUMOUTCHANNELS)
-                               break;
-
-                       /* pointer to the DA */
-                       datap =
-                           (&(((uint8_t *) urb->transfer_buffer)[i * 2 + 1]));
-                       /* get the data from comedi */
-                       ret = comedi_buf_get(s->async, &temp);
-                       datap[0] = temp;
-                       datap[1] = this_usbduxsub->dac_commands[i];
-                       /* printk("data[0]=%x, data[1]=%x, data[2]=%x\n", */
-                       /* datap[0],datap[1],datap[2]); */
+               datap = urb->transfer_buffer;
+               len = s->async->cmd.chanlist_len;
+               *datap++ = len;
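+               /* packet layout: [nchans][val 0][chan 0][val 1][chan 1]... */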
+               for (i = 0; i < len; i++) {
+                       unsigned int chan = devpriv->dac_commands[i];
+                       short val;
+
+                       ret = comedi_buf_get(s->async, &val);
                        if (ret < 0) {
-                               dev_err(&urb->dev->dev,
-                                       "comedi: buffer underflow\n");
-                               s->async->events |= COMEDI_CB_EOA;
-                               s->async->events |= COMEDI_CB_OVERFLOW;
+                               dev_err(dev->class_dev, "buffer underflow\n");
+                               s->async->events |= (COMEDI_CB_EOA |
+                                                    COMEDI_CB_OVERFLOW);
                        }
-                       /* transmit data to comedi */
+                       *datap++ = val;
+                       *datap++ = chan;
+                       devpriv->ao_readback[chan] = val;
+
                        s->async->events |= COMEDI_CB_BLOCK;
-                       comedi_event(this_usbduxsub->comedidev, s);
+                       comedi_event(dev, s);
                }
        }
+
        urb->transfer_buffer_length = SIZEOUTBUF;
-       urb->dev = this_usbduxsub->usbdev;
+       urb->dev = comedi_to_usb_dev(dev);
        urb->status = 0;
-       if (this_usbduxsub->ao_cmd_running) {
-               if (this_usbduxsub->high_speed) {
-                       /* uframes */
-                       urb->interval = 8;
-               } else {
-                       /* frames */
-                       urb->interval = 1;
-               }
-               urb->number_of_packets = 1;
-               urb->iso_frame_desc[0].offset = 0;
-               urb->iso_frame_desc[0].length = SIZEOUTBUF;
-               urb->iso_frame_desc[0].status = 0;
-               ret = usb_submit_urb(urb, GFP_ATOMIC);
-               if (ret < 0) {
-                       dev_err(&urb->dev->dev,
-                               "comedi_: ao urb resubm failed in int-cont. "
-                               "ret=%d", ret);
-                       if (ret == EL2NSYNC)
-                               dev_err(&urb->dev->dev,
-                                       "buggy USB host controller or bug in "
-                                       "IRQ handling!\n");
-
-                       s->async->events |= COMEDI_CB_EOA;
-                       s->async->events |= COMEDI_CB_ERROR;
-                       comedi_event(this_usbduxsub->comedidev, s);
-                       /* don't do an unlink here */
-                       usbdux_ao_stop(this_usbduxsub, 0);
-               }
-       }
-}
-
-static int usbduxsub_start(struct usbduxsub *usbduxsub)
-{
-       int errcode = 0;
-       uint8_t *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(16, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to zero */
-       local_transfer_buffer[0] = 0;
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 /* create a pipe for a control transfer */
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* bRequest, "Firmware" */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* Value */
-                                 USBDUXSUB_CPUCS,
-                                 /* Index */
-                                 0x0000,
-                                 /* address of the transfer buffer */
-                                 local_transfer_buffer,
-                                 /* Length */
-                                 1,
-                                 /* Timeout */
-                                 BULK_TIMEOUT);
-       if (errcode < 0)
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: control msg failed (start)\n");
-
-       kfree(local_transfer_buffer);
-       return errcode;
-}
-
-static int usbduxsub_stop(struct usbduxsub *usbduxsub)
-{
-       int errcode = 0;
-       uint8_t *local_transfer_buffer;
-
-       local_transfer_buffer = kmalloc(16, GFP_KERNEL);
-       if (!local_transfer_buffer)
-               return -ENOMEM;
-
-       /* 7f92 to one */
-       local_transfer_buffer[0] = 1;
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* bRequest, "Firmware" */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* Value */
-                                 USBDUXSUB_CPUCS,
-                                 /* Index */
-                                 0x0000, local_transfer_buffer,
-                                 /* Length */
-                                 1,
-                                 /* Timeout */
-                                 BULK_TIMEOUT);
-       if (errcode < 0)
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: control msg failed (stop)\n");
-
-       kfree(local_transfer_buffer);
-       return errcode;
-}
-
-static int usbduxsub_upload(struct usbduxsub *usbduxsub,
-                           uint8_t *local_transfer_buffer,
-                           unsigned int startAddr, unsigned int len)
-{
-       int errcode;
-
-       errcode = usb_control_msg(usbduxsub->usbdev,
-                                 usb_sndctrlpipe(usbduxsub->usbdev, 0),
-                                 /* brequest, firmware */
-                                 USBDUXSUB_FIRMWARE,
-                                 /* bmRequestType */
-                                 VENDOR_DIR_OUT,
-                                 /* value */
-                                 startAddr,
-                                 /* index */
-                                 0x0000,
-                                 /* our local safe buffer */
-                                 local_transfer_buffer,
-                                 /* length */
-                                 len,
-                                 /* timeout */
-                                 BULK_TIMEOUT);
-       dev_dbg(&usbduxsub->interface->dev, "comedi_: result=%d\n", errcode);
-       if (errcode < 0) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: upload failed\n");
-               return errcode;
-       }
-       return 0;
-}
-
-/* the FX2LP has twice as much as the standard FX2 */
-#define FIRMWARE_MAX_LEN 0x4000
-
-static int firmwareUpload(struct usbduxsub *usbduxsub,
-                         const u8 *firmwareBinary, int sizeFirmware)
-{
-       int ret;
-       uint8_t *fwBuf;
-
-       if (!firmwareBinary)
-               return 0;
-
-       if (sizeFirmware > FIRMWARE_MAX_LEN) {
-               dev_err(&usbduxsub->interface->dev,
-                       "usbduxsigma firmware binary it too large for FX2.\n");
-               return -ENOMEM;
-       }
-
-       /* we generate a local buffer for the firmware */
-       fwBuf = kmemdup(firmwareBinary, sizeFirmware, GFP_KERNEL);
-       if (!fwBuf) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: mem alloc for firmware failed\n");
-               return -ENOMEM;
-       }
-
-       ret = usbduxsub_stop(usbduxsub);
-       if (ret < 0) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: can not stop firmware\n");
-               kfree(fwBuf);
-               return ret;
-       }
-
-       ret = usbduxsub_upload(usbduxsub, fwBuf, 0, sizeFirmware);
-       if (ret < 0) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: firmware upload failed\n");
-               kfree(fwBuf);
-               return ret;
-       }
-       ret = usbduxsub_start(usbduxsub);
+       if (devpriv->high_speed)
+               urb->interval = 8;      /* uframes */
+       else
+               urb->interval = 1;      /* frames */
+       urb->number_of_packets = 1;
+       urb->iso_frame_desc[0].offset = 0;
+       urb->iso_frame_desc[0].length = SIZEOUTBUF;
+       urb->iso_frame_desc[0].status = 0;
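+       /* resubmit: one isochronous packet of SIZEOUTBUF bytes per urb */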
+       ret = usb_submit_urb(urb, GFP_ATOMIC);
        if (ret < 0) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: can not start firmware\n");
-               kfree(fwBuf);
-               return ret;
+               dev_err(dev->class_dev,
+                       "%s: urb resubmit failed (%d)\n",
+                       __func__, ret);
+               if (ret == -EL2NSYNC)
+                       dev_err(dev->class_dev,
+                               "buggy USB host controller or bug in IRQ handler\n");
+               usbduxsigma_ao_stop(dev, 0);    /* w/o unlink */
+               s->async->events |= (COMEDI_CB_EOA | COMEDI_CB_ERROR);
+               comedi_event(dev, s);
        }
-       kfree(fwBuf);
-       return 0;
 }
 
-static int usbduxsub_submit_InURBs(struct usbduxsub *usbduxsub)
+static int usbduxsigma_submit_urbs(struct comedi_device *dev,
+                                  struct urb **urbs, int num_urbs,
+                                  int input_urb)
 {
-       int i, errFlag;
-
-       if (!usbduxsub)
-               return -EFAULT;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct urb *urb;
+       int ret;
+       int i;
 
        /* Submit all URBs and start the transfer on the bus */
-       for (i = 0; i < usbduxsub->numOfInBuffers; i++) {
-               /* in case of a resubmission after an unlink... */
-               usbduxsub->urbIn[i]->interval = usbduxsub->ai_interval;
-               usbduxsub->urbIn[i]->context = usbduxsub->comedidev;
-               usbduxsub->urbIn[i]->dev = usbduxsub->usbdev;
-               usbduxsub->urbIn[i]->status = 0;
-               usbduxsub->urbIn[i]->transfer_flags = URB_ISO_ASAP;
-               dev_dbg(&usbduxsub->interface->dev,
-                       "comedi%d: submitting in-urb[%d]: %p,%p intv=%d\n",
-                       usbduxsub->comedidev->minor, i,
-                       (usbduxsub->urbIn[i]->context),
-                       (usbduxsub->urbIn[i]->dev),
-                       (usbduxsub->urbIn[i]->interval));
-               errFlag = usb_submit_urb(usbduxsub->urbIn[i], GFP_ATOMIC);
-               if (errFlag) {
-                       dev_err(&usbduxsub->interface->dev,
-                               "comedi_: ai: usb_submit_urb(%d) error %d\n",
-                               i, errFlag);
-                       return errFlag;
-               }
-       }
-       return 0;
-}
-
-static int usbduxsub_submit_OutURBs(struct usbduxsub *usbduxsub)
-{
-       int i, errFlag;
-
-       if (!usbduxsub)
-               return -EFAULT;
+       for (i = 0; i < num_urbs; i++) {
+               urb = urbs[i];
 
-       for (i = 0; i < usbduxsub->numOfOutBuffers; i++) {
-               dev_dbg(&usbduxsub->interface->dev,
-                       "comedi_: submitting out-urb[%d]\n", i);
                /* in case of a resubmission after an unlink... */
-               usbduxsub->urbOut[i]->context = usbduxsub->comedidev;
-               usbduxsub->urbOut[i]->dev = usbduxsub->usbdev;
-               usbduxsub->urbOut[i]->status = 0;
-               usbduxsub->urbOut[i]->transfer_flags = URB_ISO_ASAP;
-               errFlag = usb_submit_urb(usbduxsub->urbOut[i], GFP_ATOMIC);
-               if (errFlag) {
-                       dev_err(&usbduxsub->interface->dev,
-                               "comedi_: ao: usb_submit_urb(%d) error %d\n",
-                               i, errFlag);
-                       return errFlag;
-               }
+               if (input_urb)
+                       urb->interval = devpriv->ai_interval;
+               urb->context = dev;
+               urb->dev = usb;
+               urb->status = 0;
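+               /* URB_ISO_ASAP: start on the next available (micro)frame */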
+               urb->transfer_flags = URB_ISO_ASAP;
+
+               ret = usb_submit_urb(urb, GFP_ATOMIC);
+               if (ret)
+                       return ret;
        }
        return 0;
 }
 
-static int chanToInterval(int nChannels)
+static int usbduxsigma_chans_to_interval(int num_chan)
 {
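+       /*
+        * One high-speed microframe lasts 125us and can sample roughly
+        * two channels, so the isochronous interval (in microframes)
+        * grows with the channel count.
+        */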
-       if (nChannels <= 2)
-               /* 4kHz */
-               return 2;
-       if (nChannels <= 8)
-               /* 2kHz */
-               return 4;
-       /* 1kHz */
-       return 8;
+       if (num_chan <= 2)
+               return 2;       /* 4kHz */
+       if (num_chan <= 8)
+               return 4;       /* 2kHz */
+       return 8;               /* 1kHz */
 }
 
-static int usbdux_ai_cmdtest(struct comedi_device *dev,
-                            struct comedi_subdevice *s,
-                            struct comedi_cmd *cmd)
+static int usbduxsigma_ai_cmdtest(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 struct comedi_cmd *cmd)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int err = 0, i;
-       unsigned int tmpTimer;
-
-       if (!(this_usbduxsub->probed))
-               return -ENODEV;
+       struct usbduxsigma_private *devpriv = dev->private;
+       int high_speed = devpriv->high_speed;
+       int interval = usbduxsigma_chans_to_interval(cmd->chanlist_len);
+       int err = 0;
 
        /* Step 1 : check if triggers are trivially valid */
 
@@ -934,34 +551,28 @@ static int usbdux_ai_cmdtest(struct comedi_device *dev,
                err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg, 0);
 
        if (cmd->scan_begin_src == TRIG_TIMER) {
-               if (this_usbduxsub->high_speed) {
+               unsigned int tmp;
+
+               if (high_speed) {
                        /*
                         * In high speed mode microframes are possible.
                         * However, during one microframe we can roughly
                         * sample two channels. Thus, the more channels
                         * are in the channel list the more time we need.
                         */
-                       i = chanToInterval(cmd->chanlist_len);
                        err |= cfc_check_trigger_arg_min(&cmd->scan_begin_arg,
-                                                        (1000000 / 8 * i));
-                       /* now calc the real sampling rate with all the
-                        * rounding errors */
-                       tmpTimer =
-                           ((unsigned int)(cmd->scan_begin_arg / 125000)) *
-                           125000;
+                                               (1000000 / 8 * interval));
+
+                       tmp = (cmd->scan_begin_arg / 125000) * 125000;
                } else {
                        /* full speed */
                        /* 1kHz scans every USB frame */
                        err |= cfc_check_trigger_arg_min(&cmd->scan_begin_arg,
                                                         1000000);
-                       /*
-                        * calc the real sampling rate with the rounding errors
-                        */
-                       tmpTimer = ((unsigned int)(cmd->scan_begin_arg /
-                                                  1000000)) * 1000000;
+
+                       tmp = (cmd->scan_begin_arg / 1000000) * 1000000;
                }
-               err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg,
-                                               tmpTimer);
+               err |= cfc_check_trigger_arg_is(&cmd->scan_begin_arg, tmp);
        }
 
        err |= cfc_check_trigger_arg_is(&cmd->scan_end_arg, cmd->chanlist_len);
@@ -976,6 +587,37 @@ static int usbdux_ai_cmdtest(struct comedi_device *dev,
        if (err)
                return 3;
 
+       /* Step 4: fix up any arguments */
+
+       if (high_speed) {
+               /*
+                * every 2 channels get a time window of 125us. Thus, if we
+                * sample all 16 channels we need 1ms. If we sample only one
+                * channel we need only 125us
+                */
+               devpriv->ai_interval = interval;
+               devpriv->ai_timer = cmd->scan_begin_arg / (125000 * interval);
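+               /* e.g. 16 chans: interval = 8, so a 1ms scan gives ai_timer = 1 */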
+       } else {
+               /* interval always 1ms */
+               devpriv->ai_interval = 1;
+               devpriv->ai_timer = cmd->scan_begin_arg / 1000000;
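+               /* scan_begin_arg is in ns: 1000000 = one 1kHz USB frame */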
+       }
+       if (devpriv->ai_timer < 1)
+               err |= -EINVAL;
+
+       if (cmd->stop_src == TRIG_COUNT) {
+               /* data arrives as one packet */
+               devpriv->ai_sample_count = cmd->stop_arg;
+               devpriv->ai_continuous = 0;
+       } else {
+               /* continuous acquisition */
+               devpriv->ai_continuous = 1;
+               devpriv->ai_sample_count = 0;
+       }
+
+       if (err)
+               return 4;
+
        return 0;
 }
 
@@ -993,536 +635,278 @@ static void create_adc_command(unsigned int chan,
                (*muxsg1) = (*muxsg1) | (1 << (chan-8));
 }
 
+static int usbbuxsigma_send_cmd(struct comedi_device *dev, int cmd_type)
+{
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
+       int nsent;
 
-/* bulk transfers to usbdux */
-
-#define SENDADCOMMANDS            0
-#define SENDDACOMMANDS            1
-#define SENDDIOCONFIGCOMMAND      2
-#define SENDDIOBITSCOMMAND        3
-#define SENDSINGLEAD              4
-#define SENDPWMON                 7
-#define SENDPWMOFF                8
+       devpriv->dux_commands[0] = cmd_type;
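+       /*
+        * The caller filled in the payload; dux_commands[0] selects the
+        * firmware command and the whole buffer is sent as one bulk
+        * transfer.
+        */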
 
-static int send_dux_commands(struct usbduxsub *this_usbduxsub, int cmd_type)
-{
-       int result, nsent;
-
-       this_usbduxsub->dux_commands[0] = cmd_type;
-#ifdef NOISY_DUX_DEBUGBUG
-       printk(KERN_DEBUG "comedi%d: usbdux: dux_commands: ",
-              this_usbduxsub->comedidev->minor);
-       for (result = 0; result < SIZEOFDUXBUFFER; result++)
-               printk(" %02x", this_usbduxsub->dux_commands[result]);
-       printk("\n");
-#endif
-       result = usb_bulk_msg(this_usbduxsub->usbdev,
-                             usb_sndbulkpipe(this_usbduxsub->usbdev,
-                                             COMMAND_OUT_EP),
-                             this_usbduxsub->dux_commands, SIZEOFDUXBUFFER,
-                             &nsent, BULK_TIMEOUT);
-       if (result < 0)
-               dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-                       "could not transmit dux_command to the usb-device, "
-                       "err=%d\n", this_usbduxsub->comedidev->minor, result);
-
-       return result;
+       return usb_bulk_msg(usb, usb_sndbulkpipe(usb, USBDUXSIGMA_CMD_OUT_EP),
+                           devpriv->dux_commands, SIZEOFDUXBUFFER,
+                           &nsent, BULK_TIMEOUT);
 }
 
-static int receive_dux_commands(struct usbduxsub *this_usbduxsub, int command)
+static int usbduxsigma_receive_cmd(struct comedi_device *dev, int command)
 {
-       int result = (-EFAULT);
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
        int nrec;
+       int ret;
        int i;
 
        for (i = 0; i < RETRIES; i++) {
-               result = usb_bulk_msg(this_usbduxsub->usbdev,
-                                     usb_rcvbulkpipe(this_usbduxsub->usbdev,
-                                                     COMMAND_IN_EP),
-                                     this_usbduxsub->insnBuffer, SIZEINSNBUF,
-                                     &nrec, BULK_TIMEOUT);
-               if (result < 0) {
-                       dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-                               "insn: USB error %d "
-                               "while receiving DUX command"
-                               "\n", this_usbduxsub->comedidev->minor,
-                               result);
-                       return result;
-               }
-               if (this_usbduxsub->insnBuffer[0] == command)
-                       return result;
+               ret = usb_bulk_msg(usb,
+                                  usb_rcvbulkpipe(usb, USBDUXSIGMA_CMD_IN_EP),
+                                  devpriv->insn_buf, SIZEINSNBUF,
+                                  &nrec, BULK_TIMEOUT);
+               if (ret < 0)
+                       return ret;
+
+               if (devpriv->insn_buf[0] == command)
+                       return 0;
        }
-       /* this is only reached if the data has been requested a couple of
-        * times */
-       dev_err(&this_usbduxsub->interface->dev, "comedi%d: insn: "
-               "wrong data returned from firmware: want %d, got %d.\n",
-               this_usbduxsub->comedidev->minor, command,
-               this_usbduxsub->insnBuffer[0]);
+       /*
+        * This is only reached if the data has been requested a
+        * couple of times and the command was not received.
+        */
        return -EFAULT;
 }
 
-static int usbdux_ai_inttrig(struct comedi_device *dev,
-                            struct comedi_subdevice *s, unsigned int trignum)
+static int usbduxsigma_ai_inttrig(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 unsigned int trignum)
 {
+       struct usbduxsigma_private *devpriv = dev->private;
        int ret;
-       struct usbduxsub *this_usbduxsub = dev->private;
-       if (!this_usbduxsub)
-               return -EFAULT;
 
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: usbdux_ai_inttrig\n", dev->minor);
-
-       if (trignum != 0) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: usbdux_ai_inttrig: invalid trignum\n",
-                       dev->minor);
-               up(&this_usbduxsub->sem);
+       if (trignum != 0)
                return -EINVAL;
-       }
-       if (!(this_usbduxsub->ai_cmd_running)) {
-               this_usbduxsub->ai_cmd_running = 1;
-               ret = usbduxsub_submit_InURBs(this_usbduxsub);
+
+       down(&devpriv->sem);
+       if (!devpriv->ai_cmd_running) {
+               ret = usbduxsigma_submit_urbs(dev, devpriv->ai_urbs,
+                                             devpriv->n_ai_urbs, 1);
                if (ret < 0) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: usbdux_ai_inttrig: "
-                               "urbSubmit: err=%d\n", dev->minor, ret);
-                       this_usbduxsub->ai_cmd_running = 0;
-                       up(&this_usbduxsub->sem);
+                       up(&devpriv->sem);
                        return ret;
                }
+               devpriv->ai_cmd_running = 1;
                s->async->inttrig = NULL;
-       } else {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: ai_inttrig but acqu is already running\n",
-                       dev->minor);
        }
-       up(&this_usbduxsub->sem);
+       up(&devpriv->sem);
+
        return 1;
 }
 
-static int usbdux_ai_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
+static int usbduxsigma_ai_cmd(struct comedi_device *dev,
+                             struct comedi_subdevice *s)
 {
+       struct usbduxsigma_private *devpriv = dev->private;
        struct comedi_cmd *cmd = &s->async->cmd;
-       unsigned int chan;
-       int i, ret;
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int result;
+       unsigned int len = cmd->chanlist_len;
        uint8_t muxsg0 = 0;
        uint8_t muxsg1 = 0;
        uint8_t sysred = 0;
+       int ret;
+       int i;
 
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: usbdux_ai_cmd\n", dev->minor);
-
-       /* block other CPUs from starting an ai_cmd */
-       down(&this_usbduxsub->sem);
+       down(&devpriv->sem);
 
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       if (this_usbduxsub->ai_cmd_running) {
-               dev_err(&this_usbduxsub->interface->dev, "comedi%d: "
-                       "ai_cmd not possible. Another ai_cmd is running.\n",
-                       dev->minor);
-               up(&this_usbduxsub->sem);
-               return -EBUSY;
-       }
        /* set current channel of the running acquisition to zero */
        s->async->cur_chan = 0;
+       for (i = 0; i < len; i++) {
+               unsigned int chan = CR_CHAN(cmd->chanlist[i]);
 
-       /* first the number of channels per time step */
-       this_usbduxsub->dux_commands[1] = cmd->chanlist_len;
-
-       /* CONFIG0 */
-       this_usbduxsub->dux_commands[2] = 0x12;
-
-       /* CONFIG1: 23kHz sampling rate, delay = 0us,  */
-       this_usbduxsub->dux_commands[3] = 0x03;
-
-       /* CONFIG3: differential channels off */
-       this_usbduxsub->dux_commands[4] = 0x00;
-
-       for (i = 0; i < cmd->chanlist_len; i++) {
-               chan = CR_CHAN(cmd->chanlist[i]);
                create_adc_command(chan, &muxsg0, &muxsg1);
-               if (i >= NUMCHANNELS) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: channel list too long\n",
-                               dev->minor);
-                       break;
-               }
-       }
-       this_usbduxsub->dux_commands[5] = muxsg0;
-       this_usbduxsub->dux_commands[6] = muxsg1;
-       this_usbduxsub->dux_commands[7] = sysred;
-
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi %d: sending commands to the usb device: size=%u\n",
-               dev->minor, NUMCHANNELS);
-
-       result = send_dux_commands(this_usbduxsub, SENDADCOMMANDS);
-       if (result < 0) {
-               up(&this_usbduxsub->sem);
-               return result;
        }
 
-       if (this_usbduxsub->high_speed) {
-               /*
-                * every 2 channels get a time window of 125us. Thus, if we
-                * sample all 16 channels we need 1ms. If we sample only one
-                * channel we need only 125us
-                */
-               this_usbduxsub->ai_interval =
-                       chanToInterval(cmd->chanlist_len);
-               this_usbduxsub->ai_timer = cmd->scan_begin_arg / (125000 *
-                                                         (this_usbduxsub->
-                                                          ai_interval));
-       } else {
-               /* interval always 1ms */
-               this_usbduxsub->ai_interval = 1;
-               this_usbduxsub->ai_timer = cmd->scan_begin_arg / 1000000;
-       }
-       if (this_usbduxsub->ai_timer < 1) {
-               dev_err(&this_usbduxsub->interface->dev, "comedi%d: ai_cmd: "
-                       "timer=%d, scan_begin_arg=%d. "
-                       "Not properly tested by cmdtest?\n", dev->minor,
-                       this_usbduxsub->ai_timer, cmd->scan_begin_arg);
-               up(&this_usbduxsub->sem);
-               return -EINVAL;
-       }
-       this_usbduxsub->ai_counter = this_usbduxsub->ai_timer;
+       devpriv->dux_commands[1] = len;  /* num channels per time step */
+       devpriv->dux_commands[2] = 0x12; /* CONFIG0 */
+       devpriv->dux_commands[3] = 0x03; /* CONFIG1: 23kHz sample, delay 0us */
+       devpriv->dux_commands[4] = 0x00; /* CONFIG3: diff. channels off */
+       devpriv->dux_commands[5] = muxsg0;
+       devpriv->dux_commands[6] = muxsg1;
+       devpriv->dux_commands[7] = sysred;
 
-       if (cmd->stop_src == TRIG_COUNT) {
-               /* data arrives as one packet */
-               this_usbduxsub->ai_sample_count = cmd->stop_arg;
-               this_usbduxsub->ai_continuous = 0;
-       } else {
-               /* continuous acquisition */
-               this_usbduxsub->ai_continuous = 1;
-               this_usbduxsub->ai_sample_count = 0;
+       ret = usbbuxsigma_send_cmd(dev, USBBUXSIGMA_AD_CMD);
+       if (ret < 0) {
+               up(&devpriv->sem);
+               return ret;
        }
 
+       devpriv->ai_counter = devpriv->ai_timer;
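+       /* ai_timer was computed in Step 4 of the cmdtest */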
+
        if (cmd->start_src == TRIG_NOW) {
                /* enable this acquisition operation */
-               this_usbduxsub->ai_cmd_running = 1;
-               ret = usbduxsub_submit_InURBs(this_usbduxsub);
+               ret = usbduxsigma_submit_urbs(dev, devpriv->ai_urbs,
+                                             devpriv->n_ai_urbs, 1);
                if (ret < 0) {
-                       this_usbduxsub->ai_cmd_running = 0;
-                       /* fixme: unlink here?? */
-                       up(&this_usbduxsub->sem);
+                       up(&devpriv->sem);
                        return ret;
                }
                s->async->inttrig = NULL;
-       } else {
-               /* TRIG_INT */
-               /* don't enable the acquision operation */
-               /* wait for an internal signal */
-               s->async->inttrig = usbdux_ai_inttrig;
+               devpriv->ai_cmd_running = 1;
+       } else {        /* TRIG_INT */
+               /* wait for an internal signal and submit the urbs later */
+               s->async->inttrig = usbduxsigma_ai_inttrig;
        }
-       up(&this_usbduxsub->sem);
-       return 0;
+
+       up(&devpriv->sem);
+
+       return 0;
 }
 
-/* Mode 0 is used to get a single conversion on demand */
-static int usbdux_ai_insn_read(struct comedi_device *dev,
-                              struct comedi_subdevice *s,
-                              struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_ai_insn_read(struct comedi_device *dev,
+                                   struct comedi_subdevice *s,
+                                   struct comedi_insn *insn,
+                                   unsigned int *data)
 {
-       int i;
-       int32_t one = 0;
-       int chan;
-       int err;
-       struct usbduxsub *this_usbduxsub = dev->private;
+       struct usbduxsigma_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
        uint8_t muxsg0 = 0;
        uint8_t muxsg1 = 0;
        uint8_t sysred = 0;
+       int ret;
+       int i;
 
-       if (!this_usbduxsub)
-               return 0;
-
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: ai_insn_read, insn->n=%d, insn->subdev=%d\n",
-               dev->minor, insn->n, insn->subdev);
-
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       if (this_usbduxsub->ai_cmd_running) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: ai_insn_read not possible. "
-                       "Async Command is running.\n", dev->minor);
-               up(&this_usbduxsub->sem);
-               return 0;
+       down(&devpriv->sem);
+       if (devpriv->ai_cmd_running) {
+               up(&devpriv->sem);
+               return -EBUSY;
        }
 
-       /* sample one channel */
-       /* CONFIG0: chopper on */
-       this_usbduxsub->dux_commands[1] = 0x16;
-
-       /* CONFIG1: 2kHz sampling rate */
-       this_usbduxsub->dux_commands[2] = 0x80;
-
-       /* CONFIG3: differential channels off */
-       this_usbduxsub->dux_commands[3] = 0x00;
-
-       chan = CR_CHAN(insn->chanspec);
        create_adc_command(chan, &muxsg0, &muxsg1);
 
-       this_usbduxsub->dux_commands[4] = muxsg0;
-       this_usbduxsub->dux_commands[5] = muxsg1;
-       this_usbduxsub->dux_commands[6] = sysred;
+       /* Mode 0 is used to get a single conversion on demand */
+       devpriv->dux_commands[1] = 0x16; /* CONFIG0: chopper on */
+       devpriv->dux_commands[2] = 0x80; /* CONFIG1: 2kHz sampling rate */
+       devpriv->dux_commands[3] = 0x00; /* CONFIG3: diff. channels off */
+       devpriv->dux_commands[4] = muxsg0;
+       devpriv->dux_commands[5] = muxsg1;
+       devpriv->dux_commands[6] = sysred;
 
        /* adc commands */
-       err = send_dux_commands(this_usbduxsub, SENDSINGLEAD);
-       if (err < 0) {
-               up(&this_usbduxsub->sem);
-               return err;
+       ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+       if (ret < 0) {
+               up(&devpriv->sem);
+               return ret;
        }
 
        for (i = 0; i < insn->n; i++) {
-               err = receive_dux_commands(this_usbduxsub, SENDSINGLEAD);
-               if (err < 0) {
-                       up(&this_usbduxsub->sem);
-                       return 0;
-               }
-               /* 32 bits big endian from the A/D converter */
-               one = be32_to_cpu(*((int32_t *)
-                                   ((this_usbduxsub->insnBuffer)+1)));
-               /* mask out the status byte */
-               one = one & 0x00ffffff;
-               /* turn it into an unsigned integer */
-               one = one ^ 0x00800000;
-               data[i] = one;
-       }
-       up(&this_usbduxsub->sem);
-       return i;
-}
-
-
+               int32_t val;
 
+               ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+               if (ret < 0) {
+                       up(&devpriv->sem);
+                       return ret;
+               }
 
-static int usbdux_getstatusinfo(struct comedi_device *dev, int chan)
-{
-       struct usbduxsub *this_usbduxsub = dev->private;
-       uint8_t sysred = 0;
-       uint32_t one;
-       int err;
-
-       if (!this_usbduxsub)
-               return 0;
-
-       if (this_usbduxsub->ai_cmd_running) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: status read not possible. "
-                       "Async Command is running.\n", dev->minor);
-               return 0;
-       }
+               /* 32 bits big endian from the A/D converter */
+               val = be32_to_cpu(*((int32_t *)((devpriv->insn_buf) + 1)));
+               val &= 0x00ffffff;      /* strip status byte */
+               val ^= 0x00800000;      /* convert to unsigned */
 
-       /* CONFIG0 */
-       this_usbduxsub->dux_commands[1] = 0x12;
-
-       /* CONFIG1: 2kHz sampling rate */
-       this_usbduxsub->dux_commands[2] = 0x80;
-
-       /* CONFIG3: differential channels off */
-       this_usbduxsub->dux_commands[3] = 0x00;
-
-       if (chan == 1) {
-               /* ADC offset */
-               sysred = sysred | 1;
-       } else if (chan == 2) {
-               /* VCC */
-               sysred = sysred | 4;
-       } else if (chan == 3) {
-               /* temperature */
-               sysred = sysred | 8;
-       } else if (chan == 4) {
-               /* gain */
-               sysred = sysred | 16;
-       } else if (chan == 5) {
-               /* ref */
-               sysred = sysred | 32;
+               data[i] = val;
        }
+       up(&devpriv->sem);
 
-       this_usbduxsub->dux_commands[4] = 0;
-       this_usbduxsub->dux_commands[5] = 0;
-       this_usbduxsub->dux_commands[6] = sysred;
-
-       /* adc commands */
-       err = send_dux_commands(this_usbduxsub, SENDSINGLEAD);
-       if (err < 0)
-               return err;
-
-       err = receive_dux_commands(this_usbduxsub, SENDSINGLEAD);
-       if (err < 0)
-               return err;
-
-       /* 32 bits big endian from the A/D converter */
-       one = be32_to_cpu(*((int32_t *)((this_usbduxsub->insnBuffer)+1)));
-       /* mask out the status byte */
-       one = one & 0x00ffffff;
-       one = one ^ 0x00800000;
-
-       return (int)one;
+       return insn->n;
 }
 
-
-
-
-
-
-/************************************/
-/* analog out */
-
-static int usbdux_ao_insn_read(struct comedi_device *dev,
-                              struct comedi_subdevice *s,
-                              struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_ao_insn_read(struct comedi_device *dev,
+                                   struct comedi_subdevice *s,
+                                   struct comedi_insn *insn,
+                                   unsigned int *data)
 {
+       struct usbduxsigma_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
        int i;
-       int chan = CR_CHAN(insn->chanspec);
-       struct usbduxsub *this_usbduxsub = dev->private;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
 
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
+       down(&devpriv->sem);
        for (i = 0; i < insn->n; i++)
-               data[i] = this_usbduxsub->outBuffer[chan];
+               data[i] = devpriv->ao_readback[chan];
+       up(&devpriv->sem);
 
-       up(&this_usbduxsub->sem);
-       return i;
+       return insn->n;
 }
 
-static int usbdux_ao_insn_write(struct comedi_device *dev,
-                               struct comedi_subdevice *s,
-                               struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_ao_insn_write(struct comedi_device *dev,
+                                    struct comedi_subdevice *s,
+                                    struct comedi_insn *insn,
+                                    unsigned int *data)
 {
-       int i, err;
-       int chan = CR_CHAN(insn->chanspec);
-       struct usbduxsub *this_usbduxsub = dev->private;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: ao_insn_write\n", dev->minor);
+       struct usbduxsigma_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       int ret;
+       int i;
 
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       if (this_usbduxsub->ao_cmd_running) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: ao_insn_write: "
-                       "ERROR: asynchronous ao_cmd is running\n", dev->minor);
-               up(&this_usbduxsub->sem);
-               return 0;
+       down(&devpriv->sem);
+       if (devpriv->ao_cmd_running) {
+               up(&devpriv->sem);
+               return -EBUSY;
        }
 
        for (i = 0; i < insn->n; i++) {
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: ao_insn_write: data[chan=%d,i=%d]=%d\n",
-                       dev->minor, chan, i, data[i]);
-
-               /* number of channels: 1 */
-               this_usbduxsub->dux_commands[1] = 1;
-               /* channel number */
-               this_usbduxsub->dux_commands[2] = data[i];
-               this_usbduxsub->outBuffer[chan] = data[i];
-               this_usbduxsub->dux_commands[3] = chan;
-               err = send_dux_commands(this_usbduxsub, SENDDACOMMANDS);
-               if (err < 0) {
-                       up(&this_usbduxsub->sem);
-                       return err;
+               devpriv->dux_commands[1] = 1;           /* num channels */
+               devpriv->dux_commands[2] = data[i];     /* value */
+               devpriv->dux_commands[3] = chan;        /* channel number */
+               ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_DA_CMD);
+               if (ret < 0) {
+                       up(&devpriv->sem);
+                       return ret;
                }
+               devpriv->ao_readback[chan] = data[i];
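+               /* cache the written value for usbduxsigma_ao_insn_read() */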
        }
-       up(&this_usbduxsub->sem);
+       up(&devpriv->sem);
 
-       return i;
+       return insn->n;
 }
 
-static int usbdux_ao_inttrig(struct comedi_device *dev,
-                            struct comedi_subdevice *s, unsigned int trignum)
+static int usbduxsigma_ao_inttrig(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 unsigned int trignum)
 {
+       struct usbduxsigma_private *devpriv = dev->private;
        int ret;
-       struct usbduxsub *this_usbduxsub = dev->private;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
 
-       down(&this_usbduxsub->sem);
+       if (trignum != 0)
+               return -EINVAL;
 
-       if (!(this_usbduxsub->probed)) {
-               ret = -ENODEV;
-               goto out;
-       }
-       if (trignum != 0) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: usbdux_ao_inttrig: invalid trignum\n",
-                       dev->minor);
-               ret = -EINVAL;
-               goto out;
-       }
-       if (!(this_usbduxsub->ao_cmd_running)) {
-               this_usbduxsub->ao_cmd_running = 1;
-               ret = usbduxsub_submit_OutURBs(this_usbduxsub);
+       down(&devpriv->sem);
+       if (!devpriv->ao_cmd_running) {
+               ret = usbduxsigma_submit_urbs(dev, devpriv->ao_urbs,
+                                             devpriv->n_ao_urbs, 0);
                if (ret < 0) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: usbdux_ao_inttrig: submitURB: "
-                               "err=%d\n", dev->minor, ret);
-                       this_usbduxsub->ao_cmd_running = 0;
-                       goto out;
+                       up(&devpriv->sem);
+                       return ret;
                }
+               devpriv->ao_cmd_running = 1;
                s->async->inttrig = NULL;
-       } else {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: ao_inttrig but acqu is already running.\n",
-                       dev->minor);
        }
-       ret = 1;
-out:
-       up(&this_usbduxsub->sem);
-       return ret;
+       up(&devpriv->sem);
+
+       return 1;
 }
 
-static int usbdux_ao_cmdtest(struct comedi_device *dev,
-                            struct comedi_subdevice *s,
-                            struct comedi_cmd *cmd)
+static int usbduxsigma_ao_cmdtest(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 struct comedi_cmd *cmd)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
+       struct usbduxsigma_private *devpriv = dev->private;
        int err = 0;
+       int high_speed;
        unsigned int flags;
 
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       if (!(this_usbduxsub->probed))
-               return -ENODEV;
-
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: usbdux_ao_cmdtest\n", dev->minor);
+       /* high speed conversions are not used yet */
+       high_speed = 0;         /* (devpriv->high_speed) */
 
        /* Step 1 : check if triggers are trivially valid */
 
        err |= cfc_check_trigger_src(&cmd->start_src, TRIG_NOW | TRIG_INT);
 
-       if (0) {                /* (this_usbduxsub->high_speed) */
+       if (high_speed) {
                /*
                 * start immediately a new scan
                 * the sampling rate is set by the conversion rate
@@ -1538,8 +922,10 @@ static int usbdux_ao_cmdtest(struct comedi_device *dev,
        err |= cfc_check_trigger_src(&cmd->scan_end_src, TRIG_COUNT);
        err |= cfc_check_trigger_src(&cmd->stop_src, TRIG_COUNT | TRIG_NONE);
 
        if (err)
                return 1;
 
        /* Step 2a : make sure trigger sources are unique */
 
@@ -1578,272 +964,186 @@ static int usbdux_ao_cmdtest(struct comedi_device *dev,
        if (err)
                return 3;
 
-       return 0;
-}
+       /* Step 4: fix up any arguments */
 
-static int usbdux_ao_cmd(struct comedi_device *dev, struct comedi_subdevice *s)
-{
-       struct comedi_cmd *cmd = &s->async->cmd;
-       unsigned int chan, gain;
-       int i, ret;
-       struct usbduxsub *this_usbduxsub = dev->private;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       down(&this_usbduxsub->sem);
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi%d: %s\n", dev->minor, __func__);
-
-       /* set current channel of the running acquisition to zero */
-       s->async->cur_chan = 0;
-       for (i = 0; i < cmd->chanlist_len; ++i) {
-               chan = CR_CHAN(cmd->chanlist[i]);
-               gain = CR_RANGE(cmd->chanlist[i]);
-               if (i >= NUMOUTCHANNELS) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: %s: channel list too long\n",
-                               dev->minor, __func__);
-                       break;
-               }
-               this_usbduxsub->dac_commands[i] = chan;
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: dac command for ch %d is %x\n",
-                       dev->minor, i, this_usbduxsub->dac_commands[i]);
-       }
-
-       /* we count in steps of 1ms (125us) */
-       /* 125us mode not used yet */
-       if (0) {                /* (this_usbduxsub->high_speed) */
-               /* 125us */
+       /* we count in timer steps */
+       if (high_speed) {
                /* timing of the conversion itself: every 125 us */
-               this_usbduxsub->ao_timer = cmd->convert_arg / 125000;
+               devpriv->ao_timer = cmd->convert_arg / 125000;
        } else {
-               /* 1ms */
-               /* timing of the scan: we get all channels at once */
-               this_usbduxsub->ao_timer = cmd->scan_begin_arg / 1000000;
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: scan_begin_src=%d, scan_begin_arg=%d, "
-                       "convert_src=%d, convert_arg=%d\n", dev->minor,
-                       cmd->scan_begin_src, cmd->scan_begin_arg,
-                       cmd->convert_src, cmd->convert_arg);
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: ao_timer=%d (ms)\n",
-                       dev->minor, this_usbduxsub->ao_timer);
-               if (this_usbduxsub->ao_timer < 1) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: usbdux: ao_timer=%d, "
-                               "scan_begin_arg=%d. "
-                               "Not properly tested by cmdtest?\n",
-                               dev->minor, this_usbduxsub->ao_timer,
-                               cmd->scan_begin_arg);
-                       up(&this_usbduxsub->sem);
-                       return -EINVAL;
-               }
+               /*
+                * timing of the scan: every 1ms
+                * we get all channels at once
+                */
+               devpriv->ao_timer = cmd->scan_begin_arg / 1000000;
        }
-       this_usbduxsub->ao_counter = this_usbduxsub->ao_timer;
+       if (devpriv->ao_timer < 1)
+               err |= -EINVAL;
 
        if (cmd->stop_src == TRIG_COUNT) {
-               /* not continuous */
-               /* counter */
-               /* high speed also scans everything at once */
-               if (0) {        /* (this_usbduxsub->high_speed) */
-                       this_usbduxsub->ao_sample_count =
-                           (cmd->stop_arg) * (cmd->scan_end_arg);
+               /* not continuous, use counter */
+               if (high_speed) {
+                       /* high speed also scans everything at once */
+                       devpriv->ao_sample_count = cmd->stop_arg *
+                                                  cmd->scan_end_arg;
                } else {
-                       /* there's no scan as the scan has been */
-                       /* perf inside the FX2 */
-                       /* data arrives as one packet */
-                       this_usbduxsub->ao_sample_count = cmd->stop_arg;
+                       /*
+                        * There's no scan as the scan has been
+                        * handled inside the FX2. Data arrives as
+                        * one packet.
+                        */
+                       devpriv->ao_sample_count = cmd->stop_arg;
                }
-               this_usbduxsub->ao_continuous = 0;
+               devpriv->ao_continuous = 0;
        } else {
                /* continuous acquisition */
-               this_usbduxsub->ao_continuous = 1;
-               this_usbduxsub->ao_sample_count = 0;
+               devpriv->ao_continuous = 1;
+               devpriv->ao_sample_count = 0;
        }
 
+       if (err)
+               return 4;
+
+       return 0;
+}
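
A standalone sketch of the timer arithmetic above: comedi command
arguments are in nanoseconds, and full-speed USB delivers one
isochronous frame per millisecond, so ao_timer is simply the number of
1 ms frames per scan. The period value here is illustrative, not taken
from the driver:

#include <stdio.h>

int main(void)
{
        unsigned int scan_begin_arg = 4000000;  /* 4 ms scan period (250 Hz) */
        unsigned int ao_timer = scan_begin_arg / 1000000;

        printf("one DAC update every %u USB frames\n", ao_timer); /* -> 4 */
        return 0;
}
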
+
+static int usbduxsigma_ao_cmd(struct comedi_device *dev,
+                             struct comedi_subdevice *s)
+{
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct comedi_cmd *cmd = &s->async->cmd;
+       int ret;
+       int i;
+
+       down(&devpriv->sem);
+
+       /* set current channel of the running acquisition to zero */
+       s->async->cur_chan = 0;
+       for (i = 0; i < cmd->chanlist_len; ++i)
+               devpriv->dac_commands[i] = CR_CHAN(cmd->chanlist[i]);
+
+       devpriv->ao_counter = devpriv->ao_timer;
+
        if (cmd->start_src == TRIG_NOW) {
                /* enable this acquisition operation */
-               this_usbduxsub->ao_cmd_running = 1;
-               ret = usbduxsub_submit_OutURBs(this_usbduxsub);
+               ret = usbduxsigma_submit_urbs(dev, devpriv->ao_urbs,
+                                             devpriv->n_ao_urbs, 0);
                if (ret < 0) {
-                       this_usbduxsub->ao_cmd_running = 0;
-                       /* fixme: unlink here?? */
-                       up(&this_usbduxsub->sem);
+                       up(&devpriv->sem);
                        return ret;
                }
                s->async->inttrig = NULL;
-       } else {
-               /* TRIG_INT */
-               /* submit the urbs later */
-               /* wait for an internal signal */
-               s->async->inttrig = usbdux_ao_inttrig;
+               devpriv->ao_cmd_running = 1;
+       } else {        /* TRIG_INT */
+               /* wait for an internal signal and submit the urbs later */
+               s->async->inttrig = usbduxsigma_ao_inttrig;
        }
 
-       up(&this_usbduxsub->sem);
+       up(&devpriv->sem);
+
        return 0;
 }
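
For context, the TRIG_INT start source handled by
usbduxsigma_ao_inttrig() above is fired from user space. A usage
sketch assuming comedilib; the device path and AO subdevice number are
hypothetical:

#include <comedilib.h>

int main(void)
{
        comedi_t *it = comedi_open("/dev/comedi0");

        if (!it)
                return 1;
        /* ...send the AO command with start_src = TRIG_INT here... */

        /* trignum 0 is the only value the driver accepts */
        return comedi_internal_trigger(it, 1 /* AO subdev */, 0) < 0;
}
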
 
-static int usbdux_dio_insn_config(struct comedi_device *dev,
-                                 struct comedi_subdevice *s,
-                                 struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_dio_insn_config(struct comedi_device *dev,
+                                      struct comedi_subdevice *s,
+                                      struct comedi_insn *insn,
+                                      unsigned int *data)
 {
-       int chan = CR_CHAN(insn->chanspec);
-
-       /* The input or output configuration of each digital line is
-        * configured by a special insn_config instruction.  chanspec
-        * contains the channel to be changed, and data[0] contains the
-        * value COMEDI_INPUT or COMEDI_OUTPUT. */
+       unsigned int chan = CR_CHAN(insn->chanspec);
+       unsigned int mask = 1 << chan;
 
        switch (data[0]) {
        case INSN_CONFIG_DIO_OUTPUT:
-               s->io_bits |= 1 << chan;        /* 1 means Out */
+               s->io_bits |= mask;
                break;
        case INSN_CONFIG_DIO_INPUT:
-               s->io_bits &= ~(1 << chan);
+               s->io_bits &= ~mask;
                break;
        case INSN_CONFIG_DIO_QUERY:
-               data[1] =
-                   (s->io_bits & (1 << chan)) ? COMEDI_OUTPUT : COMEDI_INPUT;
+               data[1] = (s->io_bits & mask) ? COMEDI_OUTPUT : COMEDI_INPUT;
                break;
        default:
                return -EINVAL;
        }
-       /* we don't tell the firmware here as it would take 8 frames */
-       /* to submit the information. We do it in the insn_bits. */
+
+       /*
+        * We don't tell the firmware here as it would take 8 frames
+        * to submit the information. We do it in the (*insn_bits).
+        */
        return insn->n;
 }
 
-static int usbdux_dio_insn_bits(struct comedi_device *dev,
-                               struct comedi_subdevice *s,
-                               struct comedi_insn *insn,
-                               unsigned int *data)
+static int usbduxsigma_dio_insn_bits(struct comedi_device *dev,
+                                    struct comedi_subdevice *s,
+                                    struct comedi_insn *insn,
+                                    unsigned int *data)
 {
+       struct usbduxsigma_private *devpriv = dev->private;
+       unsigned int mask = data[0];
+       unsigned int bits = data[1];
+       int ret;
 
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int err;
+       down(&devpriv->sem);
 
-       if (!this_usbduxsub)
-               return -EFAULT;
+       s->state &= ~mask;
+       s->state |= (bits & mask);
 
-       down(&this_usbduxsub->sem);
+       devpriv->dux_commands[1] = s->io_bits & 0xff;
+       devpriv->dux_commands[4] = s->state & 0xff;
+       devpriv->dux_commands[2] = (s->io_bits >> 8) & 0xff;
+       devpriv->dux_commands[5] = (s->state >> 8) & 0xff;
+       devpriv->dux_commands[3] = (s->io_bits >> 16) & 0xff;
+       devpriv->dux_commands[6] = (s->state >> 16) & 0xff;
 
-       if (!(this_usbduxsub->probed)) {
-               up(&this_usbduxsub->sem);
-               return -ENODEV;
-       }
+       ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_DIO_BITS_CMD);
+       if (ret < 0)
+               goto done;
+       ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_DIO_BITS_CMD);
+       if (ret < 0)
+               goto done;
 
-       /* The insn data is a mask in data[0] and the new data
-        * in data[1], each channel cooresponding to a bit. */
-       s->state &= ~data[0];
-       s->state |= data[0] & data[1];
-       /* The commands are 8 bits wide */
-       this_usbduxsub->dux_commands[1] = (s->io_bits) & 0x000000FF;
-       this_usbduxsub->dux_commands[4] = (s->state) & 0x000000FF;
-       this_usbduxsub->dux_commands[2] = ((s->io_bits) & 0x0000FF00) >> 8;
-       this_usbduxsub->dux_commands[5] = ((s->state) & 0x0000FF00) >> 8;
-       this_usbduxsub->dux_commands[3] = ((s->io_bits) & 0x00FF0000) >> 16;
-       this_usbduxsub->dux_commands[6] = ((s->state) & 0x00FF0000) >> 16;
-
-       /* This command also tells the firmware to return */
-       /* the digital input lines */
-       err = send_dux_commands(this_usbduxsub, SENDDIOBITSCOMMAND);
-       if (err < 0) {
-               up(&this_usbduxsub->sem);
-               return err;
-       }
-       err = receive_dux_commands(this_usbduxsub, SENDDIOBITSCOMMAND);
-       if (err < 0) {
-               up(&this_usbduxsub->sem);
-               return err;
-       }
+       s->state = devpriv->insn_buf[1] |
+                  (devpriv->insn_buf[2] << 8) |
+                  (devpriv->insn_buf[3] << 16);
 
-       data[1] = (((unsigned int)(this_usbduxsub->insnBuffer[1]))&0xff) |
-               ((((unsigned int)(this_usbduxsub->insnBuffer[2]))&0xff) << 8) |
-               ((((unsigned int)(this_usbduxsub->insnBuffer[3]))&0xff) << 16);
+       data[1] = s->state;
+       ret = insn->n;
 
-       s->state = data[1];
+done:
+       up(&devpriv->sem);
 
-       up(&this_usbduxsub->sem);
-       return insn->n;
+       return ret;
 }
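
A minimal sketch of the 24-bit packing used above, assuming nothing
beyond what the function shows: the direction mask and output state
each occupy three command bytes (low, middle, high byte), and the
firmware returns the input lines in insn_buf[1..3]. The test values
are made up:

#include <stdint.h>
#include <assert.h>

int main(void)
{
        uint32_t io_bits = 0x00a5f00f;  /* only the low 24 bits are used */
        uint8_t cmd[7] = { 0 };

        cmd[1] = io_bits & 0xff;
        cmd[2] = (io_bits >> 8) & 0xff;
        cmd[3] = (io_bits >> 16) & 0xff;

        /* pretend the firmware echoed the same lines back */
        uint8_t insn_buf[4] = { 0, cmd[1], cmd[2], cmd[3] };
        uint32_t state = insn_buf[1] |
                         (insn_buf[2] << 8) |
                         (insn_buf[3] << 16);

        assert(state == (io_bits & 0x00ffffff));
        return 0;
}
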
 
-/***********************************/
-/* PWM */
-
-static int usbduxsub_unlink_PwmURBs(struct usbduxsub *usbduxsub_tmp)
+static void usbduxsigma_pwm_stop(struct comedi_device *dev, int do_unlink)
 {
-       int err = 0;
+       struct usbduxsigma_private *devpriv = dev->private;
 
-       if (usbduxsub_tmp && usbduxsub_tmp->urbPwm) {
-               if (usbduxsub_tmp->urbPwm)
-                       usb_kill_urb(usbduxsub_tmp->urbPwm);
-               dev_dbg(&usbduxsub_tmp->interface->dev,
-                       "comedi: unlinked PwmURB: res=%d\n", err);
+       if (do_unlink) {
+               if (devpriv->pwm_urb)
+                       usb_kill_urb(devpriv->pwm_urb);
        }
-       return err;
-}
 
-/* This cancels a running acquisition operation
- * in any context.
- */
-static int usbdux_pwm_stop(struct usbduxsub *this_usbduxsub, int do_unlink)
-{
-       int ret = 0;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       dev_dbg(&this_usbduxsub->interface->dev, "comedi: %s\n", __func__);
-       if (do_unlink)
-               ret = usbduxsub_unlink_PwmURBs(this_usbduxsub);
-
-       this_usbduxsub->pwm_cmd_running = 0;
-
-       return ret;
+       devpriv->pwm_cmd_running = 0;
 }
 
-/* force unlink - is called by comedi */
-static int usbdux_pwm_cancel(struct comedi_device *dev,
-                            struct comedi_subdevice *s)
+static int usbduxsigma_pwm_cancel(struct comedi_device *dev,
+                                 struct comedi_subdevice *s)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int res = 0;
+       struct usbduxsigma_private *devpriv = dev->private;
 
        /* unlink only if it is really running */
-       res = usbdux_pwm_stop(this_usbduxsub, this_usbduxsub->pwm_cmd_running);
+       usbduxsigma_pwm_stop(dev, devpriv->pwm_cmd_running);
 
-       dev_dbg(&this_usbduxsub->interface->dev,
-               "comedi %d: sending pwm off command to the usb device.\n",
-               dev->minor);
-       res = send_dux_commands(this_usbduxsub, SENDPWMOFF);
-       if (res < 0)
-               return res;
-
-       return res;
+       return usbbuxsigma_send_cmd(dev, USBDUXSIGMA_PWM_OFF_CMD);
 }
 
-static void usbduxsub_pwm_irq(struct urb *urb)
+static void usbduxsigma_pwm_urb_complete(struct urb *urb)
 {
+       struct comedi_device *dev = urb->context;
+       struct usbduxsigma_private *devpriv = dev->private;
        int ret;
-       struct usbduxsub *this_usbduxsub;
-       struct comedi_device *this_comedidev;
-       struct comedi_subdevice *s;
-
-       /* printk(KERN_DEBUG "PWM: IRQ\n"); */
-
-       /* the context variable points to the subdevice */
-       this_comedidev = urb->context;
-       /* the private structure of the subdevice is struct usbduxsub */
-       this_usbduxsub = this_comedidev->private;
-
-       s = &this_comedidev->subdevices[SUBDEV_DA];
 
        switch (urb->status) {
        case 0:
@@ -1854,260 +1154,180 @@ static void usbduxsub_pwm_irq(struct urb *urb)
        case -ENOENT:
        case -ESHUTDOWN:
        case -ECONNABORTED:
-               /*
-                * after an unlink command, unplug, ... etc
-                * no unlink needed here. Already shutting down.
-                */
-               if (this_usbduxsub->pwm_cmd_running)
-                       usbdux_pwm_stop(this_usbduxsub, 0);
-
+               /* happens after an unlink command */
+               if (devpriv->pwm_cmd_running)
+                       usbduxsigma_pwm_stop(dev, 0);   /* w/o unlink */
                return;
 
        default:
                /* a real error */
-               if (this_usbduxsub->pwm_cmd_running) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi_: Non-zero urb status received in "
-                               "pwm intr context: %d\n", urb->status);
-                       usbdux_pwm_stop(this_usbduxsub, 0);
+               if (devpriv->pwm_cmd_running) {
+                       dev_err(dev->class_dev,
+                               "%s: non-zero urb status (%d)\n",
+                               __func__, urb->status);
+                       usbduxsigma_pwm_stop(dev, 0);   /* w/o unlink */
                }
                return;
        }
 
-       /* are we actually running? */
-       if (!(this_usbduxsub->pwm_cmd_running))
+       if (!devpriv->pwm_cmd_running)
                return;
 
-       urb->transfer_buffer_length = this_usbduxsub->sizePwmBuf;
-       urb->dev = this_usbduxsub->usbdev;
+       urb->transfer_buffer_length = devpriv->pwm_buf_sz;
+       urb->dev = comedi_to_usb_dev(dev);
        urb->status = 0;
-       if (this_usbduxsub->pwm_cmd_running) {
-               ret = usb_submit_urb(urb, GFP_ATOMIC);
-               if (ret < 0) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi_: pwm urb resubm failed in int-cont. "
-                               "ret=%d", ret);
-                       if (ret == EL2NSYNC)
-                               dev_err(&this_usbduxsub->interface->dev,
-                                       "buggy USB host controller or bug in "
-                                       "IRQ handling!\n");
-
-                       /* don't do an unlink here */
-                       usbdux_pwm_stop(this_usbduxsub, 0);
-               }
+       ret = usb_submit_urb(urb, GFP_ATOMIC);
+       if (ret < 0) {
+               dev_err(dev->class_dev, "%s: urb resubmit failed (%d)\n",
+                       __func__, ret);
+               if (ret == -EL2NSYNC)
+                       dev_err(dev->class_dev,
+                               "buggy USB host controller or bug in IRQ handler\n");
+               usbduxsigma_pwm_stop(dev, 0);   /* w/o unlink */
        }
 }
 
-static int usbduxsub_submit_PwmURBs(struct usbduxsub *usbduxsub)
+static int usbduxsigma_submit_pwm_urb(struct comedi_device *dev)
 {
-       int errFlag;
-
-       if (!usbduxsub)
-               return -EFAULT;
-
-       dev_dbg(&usbduxsub->interface->dev, "comedi_: submitting pwm-urb\n");
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct urb *urb = devpriv->pwm_urb;
 
        /* in case of a resubmission after an unlink... */
-       usb_fill_bulk_urb(usbduxsub->urbPwm,
-                         usbduxsub->usbdev,
-                         usb_sndbulkpipe(usbduxsub->usbdev, PWM_EP),
-                         usbduxsub->urbPwm->transfer_buffer,
-                         usbduxsub->sizePwmBuf, usbduxsub_pwm_irq,
-                         usbduxsub->comedidev);
-
-       errFlag = usb_submit_urb(usbduxsub->urbPwm, GFP_ATOMIC);
-       if (errFlag) {
-               dev_err(&usbduxsub->interface->dev,
-                       "comedi_: usbduxsigma: pwm: usb_submit_urb error %d\n",
-                       errFlag);
-               return errFlag;
-       }
-       return 0;
+       usb_fill_bulk_urb(urb,
+                         usb, usb_sndbulkpipe(usb, USBDUXSIGMA_PWM_OUT_EP),
+                         urb->transfer_buffer, devpriv->pwm_buf_sz,
+                         usbduxsigma_pwm_urb_complete, dev);
+
+       return usb_submit_urb(urb, GFP_ATOMIC);
 }
 
-static int usbdux_pwm_period(struct comedi_device *dev,
-                            struct comedi_subdevice *s, unsigned int period)
+static int usbduxsigma_pwm_period(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 unsigned int period)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
+       struct usbduxsigma_private *devpriv = dev->private;
        int fx2delay = 255;
 
        if (period < MIN_PWM_PERIOD) {
-               dev_err(&this_usbduxsub->interface->dev,
-                       "comedi%d: illegal period setting for pwm.\n",
-                       dev->minor);
                return -EAGAIN;
        } else {
-               fx2delay = period / ((int)(6 * 512 * (1.0 / 0.033))) - 6;
-               if (fx2delay > 255) {
-                       dev_err(&this_usbduxsub->interface->dev,
-                               "comedi%d: period %d for pwm is too low.\n",
-                               dev->minor, period);
+               fx2delay = (period / (6 * 512 * 1000 / 33)) - 6;
+               if (fx2delay > 255)
                        return -EAGAIN;
-               }
        }
-       this_usbduxsub->pwmDelay = fx2delay;
-       this_usbduxsub->pwmPeriod = period;
-       dev_dbg(&this_usbduxsub->interface->dev, "%s: frequ=%d, period=%d\n",
-               __func__, period, fx2delay);
+       devpriv->pwm_delay = fx2delay;
+       devpriv->pwm_period = period;
        return 0;
 }
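
A worked example of the integer delay calculation above: one delay
step corresponds to 6 * 512 * 1000 / 33 ns (about 93 us), which is the
integer form of the 1.0/0.033 floating-point factor the old code used.
The requested period below is illustrative:

#include <stdio.h>

int main(void)
{
        unsigned int period = 5000000;           /* requested period in ns */
        unsigned int step = 6 * 512 * 1000 / 33; /* ns per delay step: 93090 */
        int fx2delay = (period / step) - 6;

        printf("fx2delay = %d\n", fx2delay);    /* -> 47, fits in a byte */
        return 0;
}
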
 
-/* is called from insn so there's no need to do all the sanity checks */
-static int usbdux_pwm_start(struct comedi_device *dev,
-                           struct comedi_subdevice *s)
+static int usbduxsigma_pwm_start(struct comedi_device *dev,
+                                struct comedi_subdevice *s)
 {
-       int ret, i;
-       struct usbduxsub *this_usbduxsub = dev->private;
-
-       dev_dbg(&this_usbduxsub->interface->dev, "comedi%d: %s\n",
-               dev->minor, __func__);
+       struct usbduxsigma_private *devpriv = dev->private;
+       int ret;
 
-       if (this_usbduxsub->pwm_cmd_running) {
-               /* already running */
+       if (devpriv->pwm_cmd_running)
                return 0;
-       }
 
-       this_usbduxsub->dux_commands[1] = ((uint8_t) this_usbduxsub->pwmDelay);
-       ret = send_dux_commands(this_usbduxsub, SENDPWMON);
+       devpriv->dux_commands[1] = devpriv->pwm_delay;
+       ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_PWM_ON_CMD);
        if (ret < 0)
                return ret;
 
-       /* initialise the buffer */
-       for (i = 0; i < this_usbduxsub->sizePwmBuf; i++)
-               ((char *)(this_usbduxsub->urbPwm->transfer_buffer))[i] = 0;
+       memset(devpriv->pwm_urb->transfer_buffer, 0, devpriv->pwm_buf_sz);
 
-       this_usbduxsub->pwm_cmd_running = 1;
-       ret = usbduxsub_submit_PwmURBs(this_usbduxsub);
-       if (ret < 0) {
-               this_usbduxsub->pwm_cmd_running = 0;
+       ret = usbduxsigma_submit_pwm_urb(dev);
+       if (ret < 0)
                return ret;
-       }
+       devpriv->pwm_cmd_running = 1;
+
        return 0;
 }
 
-/* generates the bit pattern for PWM with the optional sign bit */
-static int usbdux_pwm_pattern(struct comedi_device *dev,
-                             struct comedi_subdevice *s, int channel,
-                             unsigned int value, unsigned int sign)
+static int usbduxsigma_pwm_pattern(struct comedi_device *dev,
+                                  struct comedi_subdevice *s,
+                                  unsigned int chan,
+                                  unsigned int value,
+                                  unsigned int sign)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
-       int i, szbuf;
-       char *pBuf;
-       char pwm_mask;
-       char sgn_mask;
-       char c;
-
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       /* this is the DIO bit which carries the PWM data */
-       pwm_mask = (1 << channel);
-       /* this is the DIO bit which carries the optional direction bit */
-       sgn_mask = (16 << channel);
-       /* this is the buffer which will be filled with the with bit */
-       /* pattern for one period */
-       szbuf = this_usbduxsub->sizePwmBuf;
-       pBuf = (char *)(this_usbduxsub->urbPwm->transfer_buffer);
+       struct usbduxsigma_private *devpriv = dev->private;
+       char pwm_mask = (1 << chan);    /* DIO bit for the PWM data */
+       char sgn_mask = (16 << chan);   /* DIO bit for the sign */
+       char *buf = (char *)(devpriv->pwm_urb->transfer_buffer);
+       int szbuf = devpriv->pwm_buf_sz;
+       int i;
+
        for (i = 0; i < szbuf; i++) {
-               c = *pBuf;
-               /* reset bits */
-               c = c & (~pwm_mask);
-               /* set the bit as long as the index is lower than the value */
+               char c = *buf;
+
+               c &= ~pwm_mask;
                if (i < value)
-                       c = c | pwm_mask;
-               /* set the optional sign bit for a relay */
-               if (!sign) {
-                       /* positive value */
-                       c = c & (~sgn_mask);
-               } else {
-                       /* negative value */
-                       c = c | sgn_mask;
-               }
-               *(pBuf++) = c;
+                       c |= pwm_mask;
+               if (!sign)
+                       c &= ~sgn_mask;
+               else
+                       c |= sgn_mask;
+               *buf++ = c;
        }
        return 1;
 }
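
The duty cycle that results from the pattern above, as a standalone
sketch: each byte of the PWM buffer is one output sample, bit 'chan'
carries the waveform, bit 'chan + 4' (16 << chan) the optional
H-bridge sign, so the duty cycle is value/pwm_buf_sz. The 512-byte
buffer matches the high-speed bulk endpoint size set up later in the
probe path:

#include <stdio.h>

int main(void)
{
        unsigned int szbuf = 512;       /* pwm_buf_sz in high-speed mode */
        unsigned int value = 128;       /* samples driven high per period */

        printf("duty cycle = %u%%\n", value * 100 / szbuf);     /* -> 25% */
        return 0;
}
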
 
-static int usbdux_pwm_write(struct comedi_device *dev,
-                           struct comedi_subdevice *s,
-                           struct comedi_insn *insn, unsigned int *data)
+static int usbduxsigma_pwm_write(struct comedi_device *dev,
+                                struct comedi_subdevice *s,
+                                struct comedi_insn *insn,
+                                unsigned int *data)
 {
-       struct usbduxsub *this_usbduxsub = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
 
-       if (!this_usbduxsub)
-               return -EFAULT;
-
-       if ((insn->n) != 1) {
-               /*
-                * doesn't make sense to have more than one value here because
-                * it would just overwrite the PWM buffer a couple of times
-                */
+       /*
+        * It doesn't make sense to support more than one value here
+        * because it would just overwrite the PWM buffer.
+        */
+       if (insn->n != 1)
                return -EINVAL;
-       }
 
        /*
-        * the sign is set via a special INSN only, this gives us 8 bits for
-        * normal operation
-        * relay sign 0 by default
+        * The sign is set via a special INSN only; this gives us 8 bits
+        * for normal operation. The sign is 0 by default.
         */
-       return usbdux_pwm_pattern(dev, s, CR_CHAN(insn->chanspec), data[0], 0);
+       return usbduxsigma_pwm_pattern(dev, s, chan, data[0], 0);
 }
 
-static int usbdux_pwm_read(struct comedi_device *x1,
-                          struct comedi_subdevice *x2, struct comedi_insn *x3,
-                          unsigned int *x4)
+static int usbduxsigma_pwm_config(struct comedi_device *dev,
+                                 struct comedi_subdevice *s,
+                                 struct comedi_insn *insn,
+                                 unsigned int *data)
 {
-       /* not needed */
-       return -EINVAL;
-};
+       struct usbduxsigma_private *devpriv = dev->private;
+       unsigned int chan = CR_CHAN(insn->chanspec);
 
-/* switches on/off PWM */
-static int usbdux_pwm_config(struct comedi_device *dev,
-                            struct comedi_subdevice *s,
-                            struct comedi_insn *insn, unsigned int *data)
-{
-       struct usbduxsub *this_usbduxsub = dev->private;
        switch (data[0]) {
        case INSN_CONFIG_ARM:
-               /* switch it on */
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: %s: pwm on\n", dev->minor, __func__);
                /*
                 * if not zero the PWM is limited to a certain time which is
                 * not supported here
                 */
                if (data[1] != 0)
                        return -EINVAL;
-               return usbdux_pwm_start(dev, s);
+               return usbduxsigma_pwm_start(dev, s);
        case INSN_CONFIG_DISARM:
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: %s: pwm off\n", dev->minor, __func__);
-               return usbdux_pwm_cancel(dev, s);
+               return usbduxsigma_pwm_cancel(dev, s);
        case INSN_CONFIG_GET_PWM_STATUS:
-               /*
-                * to check if the USB transmission has failed or in case PWM
-                * was limited to n cycles to check if it has terminated
-                */
-               data[1] = this_usbduxsub->pwm_cmd_running;
+               data[1] = devpriv->pwm_cmd_running;
                return 0;
        case INSN_CONFIG_PWM_SET_PERIOD:
-               dev_dbg(&this_usbduxsub->interface->dev,
-                       "comedi%d: %s: setting period\n", dev->minor,
-                       __func__);
-               return usbdux_pwm_period(dev, s, data[1]);
+               return usbduxsigma_pwm_period(dev, s, data[1]);
        case INSN_CONFIG_PWM_GET_PERIOD:
-               data[1] = this_usbduxsub->pwmPeriod;
+               data[1] = devpriv->pwm_period;
                return 0;
        case INSN_CONFIG_PWM_SET_H_BRIDGE:
-               /* value in the first byte and the sign in the second for a
-                  relay */
-               return usbdux_pwm_pattern(dev, s,
-                                         /* the channel number */
-                                         CR_CHAN(insn->chanspec),
-                                         /* actual PWM data */
-                                         data[1],
-                                         /* just a sign */
-                                         (data[2] != 0));
+               /*
+                * data[1] = value
+                * data[2] = sign (for a relay)
+                */
+               return usbduxsigma_pwm_pattern(dev, s, chan,
+                                              data[1], (data[2] != 0));
        case INSN_CONFIG_PWM_GET_H_BRIDGE:
                /* values are not kept in this driver, nothing to return */
                return -EINVAL;
@@ -2115,542 +1335,412 @@ static int usbdux_pwm_config(struct comedi_device *dev,
        return -EINVAL;
 }
 
-/* end of PWM */
-/*****************************************************************/
-
-static void tidy_up(struct usbduxsub *usbduxsub_tmp)
+static int usbduxsigma_getstatusinfo(struct comedi_device *dev, int chan)
 {
-       int i;
+       struct usbduxsigma_private *devpriv = dev->private;
+       uint8_t sysred;
+       uint32_t val;
+       int ret;
 
-       if (!usbduxsub_tmp)
-               return;
-       dev_dbg(&usbduxsub_tmp->interface->dev, "comedi_: tiding up\n");
+       switch (chan) {
+       default:
+       case 0:
+               sysred = 0;             /* ADC zero */
+               break;
+       case 1:
+               sysred = 1;             /* ADC offset */
+               break;
+       case 2:
+               sysred = 4;             /* VCC */
+               break;
+       case 3:
+               sysred = 8;             /* temperature */
+               break;
+       case 4:
+               sysred = 16;            /* gain */
+               break;
+       case 5:
+               sysred = 32;            /* ref */
+               break;
+       }
 
-       /* shows the usb subsystem that the driver is down */
-       if (usbduxsub_tmp->interface)
-               usb_set_intfdata(usbduxsub_tmp->interface, NULL);
+       devpriv->dux_commands[1] = 0x12; /* CONFIG0 */
+       devpriv->dux_commands[2] = 0x80; /* CONFIG1: 2kHz sampling rate */
+       devpriv->dux_commands[3] = 0x00; /* CONFIG3: diff. channels off */
+       devpriv->dux_commands[4] = 0;
+       devpriv->dux_commands[5] = 0;
+       devpriv->dux_commands[6] = sysred;
+       ret = usbbuxsigma_send_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+       if (ret < 0)
+               return ret;
 
-       usbduxsub_tmp->probed = 0;
+       ret = usbduxsigma_receive_cmd(dev, USBDUXSIGMA_SINGLE_AD_CMD);
+       if (ret < 0)
+               return ret;
 
-       if (usbduxsub_tmp->urbIn) {
-               if (usbduxsub_tmp->ai_cmd_running) {
-                       usbduxsub_tmp->ai_cmd_running = 0;
-                       usbduxsub_unlink_InURBs(usbduxsub_tmp);
-               }
-               for (i = 0; i < usbduxsub_tmp->numOfInBuffers; i++) {
-                       kfree(usbduxsub_tmp->urbIn[i]->transfer_buffer);
-                       usbduxsub_tmp->urbIn[i]->transfer_buffer = NULL;
-                       usb_kill_urb(usbduxsub_tmp->urbIn[i]);
-                       usb_free_urb(usbduxsub_tmp->urbIn[i]);
-                       usbduxsub_tmp->urbIn[i] = NULL;
-               }
-               kfree(usbduxsub_tmp->urbIn);
-               usbduxsub_tmp->urbIn = NULL;
-       }
-       if (usbduxsub_tmp->urbOut) {
-               if (usbduxsub_tmp->ao_cmd_running) {
-                       usbduxsub_tmp->ao_cmd_running = 0;
-                       usbduxsub_unlink_OutURBs(usbduxsub_tmp);
-               }
-               for (i = 0; i < usbduxsub_tmp->numOfOutBuffers; i++) {
-                       if (usbduxsub_tmp->urbOut[i]->transfer_buffer) {
-                               kfree(usbduxsub_tmp->
-                                     urbOut[i]->transfer_buffer);
-                               usbduxsub_tmp->urbOut[i]->transfer_buffer =
-                                   NULL;
-                       }
-                       if (usbduxsub_tmp->urbOut[i]) {
-                               usb_kill_urb(usbduxsub_tmp->urbOut[i]);
-                               usb_free_urb(usbduxsub_tmp->urbOut[i]);
-                               usbduxsub_tmp->urbOut[i] = NULL;
-                       }
-               }
-               kfree(usbduxsub_tmp->urbOut);
-               usbduxsub_tmp->urbOut = NULL;
-       }
-       if (usbduxsub_tmp->urbPwm) {
-               if (usbduxsub_tmp->pwm_cmd_running) {
-                       usbduxsub_tmp->pwm_cmd_running = 0;
-                       usbduxsub_unlink_PwmURBs(usbduxsub_tmp);
-               }
-               kfree(usbduxsub_tmp->urbPwm->transfer_buffer);
-               usbduxsub_tmp->urbPwm->transfer_buffer = NULL;
-               usb_kill_urb(usbduxsub_tmp->urbPwm);
-               usb_free_urb(usbduxsub_tmp->urbPwm);
-               usbduxsub_tmp->urbPwm = NULL;
-       }
-       kfree(usbduxsub_tmp->inBuffer);
-       usbduxsub_tmp->inBuffer = NULL;
-       kfree(usbduxsub_tmp->insnBuffer);
-       usbduxsub_tmp->insnBuffer = NULL;
-       kfree(usbduxsub_tmp->outBuffer);
-       usbduxsub_tmp->outBuffer = NULL;
-       kfree(usbduxsub_tmp->dac_commands);
-       usbduxsub_tmp->dac_commands = NULL;
-       kfree(usbduxsub_tmp->dux_commands);
-       usbduxsub_tmp->dux_commands = NULL;
-       usbduxsub_tmp->ai_cmd_running = 0;
-       usbduxsub_tmp->ao_cmd_running = 0;
-       usbduxsub_tmp->pwm_cmd_running = 0;
+       /* 32 bits big endian from the A/D converter */
+       val = be32_to_cpu(*((int32_t *)((devpriv->insn_buf)+1)));
+       val &= 0x00ffffff;      /* strip status byte */
+       val ^= 0x00800000;      /* convert to unsigned */
+
+       return (int)val;
 }
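
The conversion above turns the converter's big-endian word into the
unsigned offset-binary range comedi expects: the top byte is status,
the low 24 bits are a two's-complement sample, and flipping the sign
bit re-bases it so that 0x800000 is mid-scale. A standalone sketch
with a made-up byte stream:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* status byte, then a 24-bit two's-complement sample of -1 */
        uint8_t buf[4] = { 0xa0, 0xff, 0xff, 0xff };
        uint32_t val = ((uint32_t)buf[0] << 24) | (buf[1] << 16) |
                       (buf[2] << 8) | buf[3];  /* be32_to_cpu by hand */

        val &= 0x00ffffff;      /* strip status byte */
        val ^= 0x00800000;      /* two's complement -> offset binary */

        printf("0x%06x\n", (unsigned int)val);  /* -1 -> 0x7fffff */
        return 0;
}
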
 
-static int usbduxsigma_attach_common(struct comedi_device *dev,
-                                    struct usbduxsub *uds)
+static int usbduxsigma_attach_common(struct comedi_device *dev)
 {
-       int ret;
+       struct usbduxsigma_private *devpriv = dev->private;
        struct comedi_subdevice *s;
        int n_subdevs;
        int offset;
+       int ret;
 
-       down(&uds->sem);
-       /* pointer back to the corresponding comedi device */
-       uds->comedidev = dev;
+       down(&devpriv->sem);
 
-       /* set number of subdevices */
-       if (uds->high_speed)
+       if (devpriv->high_speed)
                n_subdevs = 4;  /* with pwm */
        else
                n_subdevs = 3;  /* without pwm */
        ret = comedi_alloc_subdevices(dev, n_subdevs);
        if (ret) {
-               up(&uds->sem);
+               up(&devpriv->sem);
                return ret;
        }
-       /* private structure is also simply the usb-structure */
-       dev->private = uds;
-       /* the first subdevice is the A/D converter */
-       s = &dev->subdevices[SUBDEV_AD];
-       /* the URBs get the comedi subdevice */
-       /* which is responsible for reading */
-       /* this is the subdevice which reads data */
+
+       /* Analog Input subdevice */
+       s = &dev->subdevices[0];
        dev->read_subdev = s;
-       /* the subdevice receives as private structure the */
-       /* usb-structure */
-       s->private = NULL;
-       /* analog input */
-       s->type = COMEDI_SUBD_AI;
-       /* readable and ref is to ground, 32 bit wide data! */
-       s->subdev_flags = SDF_READABLE | SDF_GROUND |
-               SDF_CMD_READ | SDF_LSAMPL;
-       /* 16 A/D channels */
-       s->n_chan = NUMCHANNELS;
-       /* length of the channellist */
-       s->len_chanlist = NUMCHANNELS;
-       /* callback functions */
-       s->insn_read = usbdux_ai_insn_read;
-       s->do_cmdtest = usbdux_ai_cmdtest;
-       s->do_cmd = usbdux_ai_cmd;
-       s->cancel = usbdux_ai_cancel;
-       /* max value from the A/D converter (24bit) */
-       s->maxdata = 0x00FFFFFF;
-       /* range table to convert to physical units */
-       s->range_table = (&range_usbdux_ai_range);
-       /* analog output subdevice */
-       s = &dev->subdevices[SUBDEV_DA];
-       /* analog out */
-       s->type = COMEDI_SUBD_AO;
-       /* backward pointer */
+       s->type         = COMEDI_SUBD_AI;
+       s->subdev_flags = SDF_READABLE | SDF_GROUND | SDF_CMD_READ | SDF_LSAMPL;
+       s->n_chan       = NUMCHANNELS;
+       s->len_chanlist = NUMCHANNELS;
+       s->maxdata      = 0x00ffffff;
+       s->range_table  = &usbduxsigma_ai_range;
+       s->insn_read    = usbduxsigma_ai_insn_read;
+       s->do_cmdtest   = usbduxsigma_ai_cmdtest;
+       s->do_cmd       = usbduxsigma_ai_cmd;
+       s->cancel       = usbduxsigma_ai_cancel;
+
+       /* Analog Output subdevice */
+       s = &dev->subdevices[1];
        dev->write_subdev = s;
-       /* the subdevice receives as private structure the */
-       /* usb-structure */
-       s->private = NULL;
-       /* are writable */
-       s->subdev_flags = SDF_WRITABLE | SDF_GROUND | SDF_CMD_WRITE;
-       /* 4 channels */
-       s->n_chan = 4;
-       /* length of the channellist */
-       s->len_chanlist = 4;
-       /* 8 bit resolution */
-       s->maxdata = 0x00ff;
-       /* unipolar range */
-       s->range_table = &range_unipolar2_5;
-       /* callback */
-       s->do_cmdtest = usbdux_ao_cmdtest;
-       s->do_cmd = usbdux_ao_cmd;
-       s->cancel = usbdux_ao_cancel;
-       s->insn_read = usbdux_ao_insn_read;
-       s->insn_write = usbdux_ao_insn_write;
-       /* digital I/O subdevice */
-       s = &dev->subdevices[SUBDEV_DIO];
-       s->type = COMEDI_SUBD_DIO;
-       s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
-       /* 8 external and 16 internal channels */
-       s->n_chan = 24;
-       s->maxdata = 1;
-       s->range_table = (&range_digital);
-       s->insn_bits = usbdux_dio_insn_bits;
-       s->insn_config = usbdux_dio_insn_config;
-       /* we don't use it */
-       s->private = NULL;
-       if (uds->high_speed) {
-               /* timer / pwm subdevice */
-               s = &dev->subdevices[SUBDEV_PWM];
-               s->type = COMEDI_SUBD_PWM;
-               s->subdev_flags = SDF_WRITABLE | SDF_PWM_HBRIDGE;
-               s->n_chan = 8;
-               /* this defines the max duty cycle resolution */
-               s->maxdata = uds->sizePwmBuf;
-               s->insn_write = usbdux_pwm_write;
-               s->insn_read = usbdux_pwm_read;
-               s->insn_config = usbdux_pwm_config;
-               usbdux_pwm_period(dev, s, PWM_DEFAULT_PERIOD);
-       }
-       /* finally decide that it's attached */
-       uds->attached = 1;
-       up(&uds->sem);
-       offset = usbdux_getstatusinfo(dev, 0);
+       s->type         = COMEDI_SUBD_AO;
+       s->subdev_flags = SDF_WRITABLE | SDF_GROUND | SDF_CMD_WRITE;
+       s->n_chan       = USBDUXSIGMA_NUM_AO_CHAN;
+       s->len_chanlist = s->n_chan;
+       s->maxdata      = 0x00ff;
+       s->range_table  = &range_unipolar2_5;
+       s->insn_write   = usbduxsigma_ao_insn_write;
+       s->insn_read    = usbduxsigma_ao_insn_read;
+       s->do_cmdtest   = usbduxsigma_ao_cmdtest;
+       s->do_cmd       = usbduxsigma_ao_cmd;
+       s->cancel       = usbduxsigma_ao_cancel;
+
+       /* Digital I/O subdevice */
+       s = &dev->subdevices[2];
+       s->type         = COMEDI_SUBD_DIO;
+       s->subdev_flags = SDF_READABLE | SDF_WRITABLE;
+       s->n_chan       = 24;
+       s->maxdata      = 1;
+       s->range_table  = &range_digital;
+       s->insn_bits    = usbduxsigma_dio_insn_bits;
+       s->insn_config  = usbduxsigma_dio_insn_config;
+
+       if (devpriv->high_speed) {
+               /* Timer / pwm subdevice */
+               s = &dev->subdevices[3];
+               s->type         = COMEDI_SUBD_PWM;
+               s->subdev_flags = SDF_WRITABLE | SDF_PWM_HBRIDGE;
+               s->n_chan       = 8;
+               s->maxdata      = devpriv->pwm_buf_sz;
+               s->insn_write   = usbduxsigma_pwm_write;
+               s->insn_config  = usbduxsigma_pwm_config;
+
+               usbduxsigma_pwm_period(dev, s, PWM_DEFAULT_PERIOD);
+       }
+
+       up(&devpriv->sem);
+
+       offset = usbduxsigma_getstatusinfo(dev, 0);
        if (offset < 0)
-               dev_err(&uds->interface->dev,
-                       "Communication to USBDUXSIGMA failed! Check firmware and cabling.");
-       dev_info(&uds->interface->dev,
-                "comedi%d: attached, ADC_zero = %x\n", dev->minor, offset);
+               dev_err(dev->class_dev,
+                       "Communication to USBDUXSIGMA failed! Check firmware and cabling\n");
+
+       dev_info(dev->class_dev, "attached, ADC_zero = %x\n", offset);
+
        return 0;
 }
 
-static int usbduxsigma_auto_attach(struct comedi_device *dev,
-                                  unsigned long context_unused)
+static int usbduxsigma_firmware_upload(struct comedi_device *dev,
+                                      const u8 *data, size_t size,
+                                      unsigned long context)
 {
-       struct usb_interface *uinterf = comedi_to_usb_interface(dev);
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       uint8_t *buf;
+       uint8_t *tmp;
        int ret;
-       struct usbduxsub *uds;
 
-       dev->private = NULL;
-       down(&start_stop_sem);
-       uds = usb_get_intfdata(uinterf);
-       if (!uds || !uds->probed) {
-               dev_err(dev->class_dev,
-                       "usbduxsigma: error: auto_attach failed, not connected\n");
-               ret = -ENODEV;
-       } else if (uds->attached) {
-               dev_err(dev->class_dev,
-                      "usbduxsigma: error: auto_attach failed, already attached\n");
-               ret = -ENODEV;
-       } else
-               ret = usbduxsigma_attach_common(dev, uds);
-       up(&start_stop_sem);
-       return ret;
-}
+       if (!data)
+               return 0;
 
-static void usbduxsigma_detach(struct comedi_device *dev)
-{
-       struct usbduxsub *usb = dev->private;
-
-       if (usb) {
-               down(&usb->sem);
-               dev->private = NULL;
-               usb->attached = 0;
-               usb->comedidev = NULL;
-               up(&usb->sem);
+       if (size > FIRMWARE_MAX_LEN) {
+               dev_err(dev->class_dev, "firmware binary too large for FX2\n");
+               return -ENOMEM;
        }
-}
 
-static struct comedi_driver usbduxsigma_driver = {
-       .driver_name    = "usbduxsigma",
-       .module         = THIS_MODULE,
-       .auto_attach    = usbduxsigma_auto_attach,
-       .detach         = usbduxsigma_detach,
-};
+       /* we generate a local buffer for the firmware */
+       buf = kmemdup(data, size, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
 
-static void usbdux_firmware_request_complete_handler(const struct firmware *fw,
-                                                    void *context)
-{
-       struct usbduxsub *usbduxsub_tmp = context;
-       struct usb_interface *uinterf = usbduxsub_tmp->interface;
-       int ret;
+       /* we need a malloc'ed buffer for usb_control_msg() */
+       tmp = kmalloc(1, GFP_KERNEL);
+       if (!tmp) {
+               kfree(buf);
+               return -ENOMEM;
+       }
 
-       if (fw == NULL) {
-               dev_err(&uinterf->dev,
-                       "Firmware complete handler without firmware!\n");
-               return;
+       /* stop the current firmware on the device */
+       *tmp = 1;       /* CPUCS: hold the 8051 in reset */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             BULK_TIMEOUT);
+       if (ret < 0) {
+               dev_err(dev->class_dev, "can not stop firmware\n");
+               goto done;
        }
 
-       /*
-        * we need to upload the firmware here because fw will be
-        * freed once we've left this function
-        */
-       ret = firmwareUpload(usbduxsub_tmp, fw->data, fw->size);
+       /* upload the new firmware to the device */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             0, 0x0000,
+                             buf, size,
+                             BULK_TIMEOUT);
+       if (ret < 0) {
+               dev_err(dev->class_dev, "firmware upload failed\n");
+               goto done;
+       }
+
+       /* start the new firmware on the device */
+       *tmp = 0;       /* CPUCS: release the 8051 from reset */
+       ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0),
+                             USBDUXSUB_FIRMWARE,
+                             VENDOR_DIR_OUT,
+                             USBDUXSUB_CPUCS, 0x0000,
+                             tmp, 1,
+                             BULK_TIMEOUT);
+       if (ret < 0)
+               dev_err(dev->class_dev, "can not start firmware\n");
 
-       if (ret) {
-               dev_err(&uinterf->dev,
-                       "Could not upload firmware (err=%d)\n", ret);
-               goto out;
-       }
-       comedi_usb_auto_config(uinterf, &usbduxsigma_driver, 0);
-out:
-       release_firmware(fw);
+done:
+       kfree(tmp);
+       kfree(buf);
+       return ret;
 }
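
The three control transfers above implement the standard Cypress FX2
load sequence: hold the 8051 in reset via CPUCS, write the image into
on-chip RAM, then release reset. For reference, the same step can be
driven from user space with libusb; this sketch assumes the usual FX2
values (vendor request 0xa0, CPUCS at 0xe600) in place of the driver's
USBDUXSUB_FIRMWARE/USBDUXSUB_CPUCS constants, which are defined
earlier in the driver:

#include <libusb-1.0/libusb.h>

static int fx2_cpucs_write(libusb_device_handle *h, uint8_t hold_reset)
{
        /* bmRequestType 0x40: vendor request, host to device */
        return libusb_control_transfer(h, 0x40, 0xa0 /* firmware load */,
                                       0xe600 /* CPUCS */, 0,
                                       &hold_reset, 1, 1000);
}
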
 
-static int usbduxsigma_usb_probe(struct usb_interface *uinterf,
-                                const struct usb_device_id *id)
+static int usbduxsigma_alloc_usb_buffers(struct comedi_device *dev)
 {
-       struct usb_device *udev = interface_to_usbdev(uinterf);
-       struct device *dev = &uinterf->dev;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct urb *urb;
        int i;
-       int index;
-       int ret;
-
-       dev_dbg(dev, "comedi_: usbdux_: "
-               "finding a free structure for the usb-device\n");
-
-       down(&start_stop_sem);
-       /* look for a free place in the usbdux array */
-       index = -1;
-       for (i = 0; i < NUMUSBDUX; i++) {
-               if (!(usbduxsub[i].probed)) {
-                       index = i;
-                       break;
-               }
-       }
 
-       /* no more space */
-       if (index == -1) {
-               dev_err(dev, "Too many usbduxsigma-devices connected.\n");
-               up(&start_stop_sem);
-               return -EMFILE;
-       }
-       dev_dbg(dev, "comedi_: usbdux: "
-               "usbduxsub[%d] is ready to connect to comedi.\n", index);
-
-       sema_init(&(usbduxsub[index].sem), 1);
-       /* save a pointer to the usb device */
-       usbduxsub[index].usbdev = udev;
-
-       /* save the interface itself */
-       usbduxsub[index].interface = uinterf;
-       /* get the interface number from the interface */
-       usbduxsub[index].ifnum = uinterf->altsetting->desc.bInterfaceNumber;
-       /* hand the private data over to the usb subsystem */
-       /* will be needed for disconnect */
-       usb_set_intfdata(uinterf, &(usbduxsub[index]));
-
-       dev_dbg(dev, "comedi_: usbdux: ifnum=%d\n", usbduxsub[index].ifnum);
-
-       /* test if it is high speed (USB 2.0) */
-       usbduxsub[index].high_speed =
-           (usbduxsub[index].usbdev->speed == USB_SPEED_HIGH);
-
-       /* create space for the commands of the DA converter */
-       usbduxsub[index].dac_commands = kzalloc(NUMOUTCHANNELS, GFP_KERNEL);
-       if (!usbduxsub[index].dac_commands) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       /* create space for the commands going to the usb device */
-       usbduxsub[index].dux_commands = kzalloc(SIZEOFDUXBUFFER, GFP_KERNEL);
-       if (!usbduxsub[index].dux_commands) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       /* create space for the in buffer and set it to zero */
-       usbduxsub[index].inBuffer = kzalloc(SIZEINBUF, GFP_KERNEL);
-       if (!(usbduxsub[index].inBuffer)) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       /* create space of the instruction buffer */
-       usbduxsub[index].insnBuffer = kzalloc(SIZEINSNBUF, GFP_KERNEL);
-       if (!(usbduxsub[index].insnBuffer)) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       /* create space for the outbuffer */
-       usbduxsub[index].outBuffer = kzalloc(SIZEOUTBUF, GFP_KERNEL);
-       if (!(usbduxsub[index].outBuffer)) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
+       devpriv->dac_commands = kzalloc(NUMOUTCHANNELS, GFP_KERNEL);
+       devpriv->dux_commands = kzalloc(SIZEOFDUXBUFFER, GFP_KERNEL);
+       devpriv->in_buf = kzalloc(SIZEINBUF, GFP_KERNEL);
+       devpriv->insn_buf = kzalloc(SIZEINSNBUF, GFP_KERNEL);
+       devpriv->ai_urbs = kcalloc(devpriv->n_ai_urbs, sizeof(*urb),
+                                  GFP_KERNEL);
+       devpriv->ao_urbs = kcalloc(devpriv->n_ao_urbs, sizeof(*urb),
+                                  GFP_KERNEL);
+       if (!devpriv->dac_commands || !devpriv->dux_commands ||
+           !devpriv->in_buf || !devpriv->insn_buf ||
+           !devpriv->ai_urbs || !devpriv->ao_urbs)
                return -ENOMEM;
-       }
-       /* setting to alternate setting 3: enabling iso ep and bulk ep. */
-       i = usb_set_interface(usbduxsub[index].usbdev,
-                             usbduxsub[index].ifnum, 3);
-       if (i < 0) {
-               dev_err(dev, "comedi_: usbduxsigma%d: "
-                       "could not set alternate setting 3 in high speed.\n",
-                       index);
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENODEV;
-       }
-       if (usbduxsub[index].high_speed)
-               usbduxsub[index].numOfInBuffers = NUMOFINBUFFERSHIGH;
-       else
-               usbduxsub[index].numOfInBuffers = NUMOFINBUFFERSFULL;
-
-       usbduxsub[index].urbIn = kcalloc(usbduxsub[index].numOfInBuffers,
-                                        sizeof(struct urb *),
-                                        GFP_KERNEL);
-       if (!(usbduxsub[index].urbIn)) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       for (i = 0; i < usbduxsub[index].numOfInBuffers; i++) {
+
+       for (i = 0; i < devpriv->n_ai_urbs; i++) {
                /* one frame: 1ms */
-               usbduxsub[index].urbIn[i] = usb_alloc_urb(1, GFP_KERNEL);
-               if (usbduxsub[index].urbIn[i] == NULL) {
-                       dev_err(dev, "comedi_: usbduxsigma%d: "
-                               "Could not alloc. urb(%d)\n", index, i);
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               urb = usb_alloc_urb(1, GFP_KERNEL);
+               if (!urb)
                        return -ENOMEM;
-               }
-               usbduxsub[index].urbIn[i]->dev = usbduxsub[index].usbdev;
+               devpriv->ai_urbs[i] = urb;
+               urb->dev = usb;
                /* will be filled later with a pointer to the comedi-device */
                /* and ONLY then the urb should be submitted */
-               usbduxsub[index].urbIn[i]->context = NULL;
-               usbduxsub[index].urbIn[i]->pipe =
-                   usb_rcvisocpipe(usbduxsub[index].usbdev, ISOINEP);
-               usbduxsub[index].urbIn[i]->transfer_flags = URB_ISO_ASAP;
-               usbduxsub[index].urbIn[i]->transfer_buffer =
-                   kzalloc(SIZEINBUF, GFP_KERNEL);
-               if (!(usbduxsub[index].urbIn[i]->transfer_buffer)) {
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               urb->context = NULL;
+               urb->pipe = usb_rcvisocpipe(usb, USBDUXSIGMA_ISO_IN_EP);
+               urb->transfer_flags = URB_ISO_ASAP;
+               urb->transfer_buffer = kzalloc(SIZEINBUF, GFP_KERNEL);
+               if (!urb->transfer_buffer)
                        return -ENOMEM;
-               }
-               usbduxsub[index].urbIn[i]->complete = usbduxsub_ai_IsocIrq;
-               usbduxsub[index].urbIn[i]->number_of_packets = 1;
-               usbduxsub[index].urbIn[i]->transfer_buffer_length = SIZEINBUF;
-               usbduxsub[index].urbIn[i]->iso_frame_desc[0].offset = 0;
-               usbduxsub[index].urbIn[i]->iso_frame_desc[0].length =
-                       SIZEINBUF;
+               urb->complete = usbduxsigma_ai_urb_complete;
+               urb->number_of_packets = 1;
+               urb->transfer_buffer_length = SIZEINBUF;
+               urb->iso_frame_desc[0].offset = 0;
+               urb->iso_frame_desc[0].length = SIZEINBUF;
        }
 
-       /* out */
-       if (usbduxsub[index].high_speed)
-               usbduxsub[index].numOfOutBuffers = NUMOFOUTBUFFERSHIGH;
-       else
-               usbduxsub[index].numOfOutBuffers = NUMOFOUTBUFFERSFULL;
-
-       usbduxsub[index].urbOut = kcalloc(usbduxsub[index].numOfOutBuffers,
-                                         sizeof(struct urb *), GFP_KERNEL);
-       if (!(usbduxsub[index].urbOut)) {
-               tidy_up(&(usbduxsub[index]));
-               up(&start_stop_sem);
-               return -ENOMEM;
-       }
-       for (i = 0; i < usbduxsub[index].numOfOutBuffers; i++) {
+       for (i = 0; i < devpriv->n_ao_urbs; i++) {
                /* one frame: 1ms */
-               usbduxsub[index].urbOut[i] = usb_alloc_urb(1, GFP_KERNEL);
-               if (usbduxsub[index].urbOut[i] == NULL) {
-                       dev_err(dev, "comedi_: usbduxsigma%d: "
-                               "Could not alloc. urb(%d)\n", index, i);
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               urb = usb_alloc_urb(1, GFP_KERNEL);
+               if (!urb)
                        return -ENOMEM;
-               }
-               usbduxsub[index].urbOut[i]->dev = usbduxsub[index].usbdev;
+               devpriv->ao_urbs[i] = urb;
+               urb->dev = usb;
                /* will be filled later with a pointer to the comedi device;
                 * only then should the urb be submitted */
-               usbduxsub[index].urbOut[i]->context = NULL;
-               usbduxsub[index].urbOut[i]->pipe =
-                   usb_sndisocpipe(usbduxsub[index].usbdev, ISOOUTEP);
-               usbduxsub[index].urbOut[i]->transfer_flags = URB_ISO_ASAP;
-               usbduxsub[index].urbOut[i]->transfer_buffer =
-                   kzalloc(SIZEOUTBUF, GFP_KERNEL);
-               if (!(usbduxsub[index].urbOut[i]->transfer_buffer)) {
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               urb->context = NULL;
+               urb->pipe = usb_sndisocpipe(usb, USBDUXSIGMA_ISO_OUT_EP);
+               urb->transfer_flags = URB_ISO_ASAP;
+               urb->transfer_buffer = kzalloc(SIZEOUTBUF, GFP_KERNEL);
+               if (!urb->transfer_buffer)
                        return -ENOMEM;
-               }
-               usbduxsub[index].urbOut[i]->complete = usbduxsub_ao_IsocIrq;
-               usbduxsub[index].urbOut[i]->number_of_packets = 1;
-               usbduxsub[index].urbOut[i]->transfer_buffer_length =
-                       SIZEOUTBUF;
-               usbduxsub[index].urbOut[i]->iso_frame_desc[0].offset = 0;
-               usbduxsub[index].urbOut[i]->iso_frame_desc[0].length =
-                   SIZEOUTBUF;
-               if (usbduxsub[index].high_speed) {
-                       /* uframes */
-                       usbduxsub[index].urbOut[i]->interval = 8;
-               } else {
-                       /* frames */
-                       usbduxsub[index].urbOut[i]->interval = 1;
-               }
+               urb->complete = usbduxsigma_ao_urb_complete;
+               urb->number_of_packets = 1;
+               urb->transfer_buffer_length = SIZEOUTBUF;
+               urb->iso_frame_desc[0].offset = 0;
+               urb->iso_frame_desc[0].length = SIZEOUTBUF;
+               if (devpriv->high_speed)
+                       urb->interval = 8;      /* uframes */
+               else
+                       urb->interval = 1;      /* frames */
        }
 
-       /* pwm */
-       if (usbduxsub[index].high_speed) {
+       if (devpriv->high_speed) {
                /* max bulk ep size in high speed */
-               usbduxsub[index].sizePwmBuf = 512;
-               usbduxsub[index].urbPwm = usb_alloc_urb(0, GFP_KERNEL);
-               if (usbduxsub[index].urbPwm == NULL) {
-                       dev_err(dev, "comedi_: usbduxsigma%d: "
-                               "Could not alloc. pwm urb\n", index);
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               devpriv->pwm_buf_sz = 512;
+               urb = usb_alloc_urb(0, GFP_KERNEL);
+               if (!urb)
                        return -ENOMEM;
-               }
-               usbduxsub[index].urbPwm->transfer_buffer =
-                   kzalloc(usbduxsub[index].sizePwmBuf, GFP_KERNEL);
-               if (!(usbduxsub[index].urbPwm->transfer_buffer)) {
-                       tidy_up(&(usbduxsub[index]));
-                       up(&start_stop_sem);
+               devpriv->pwm_urb = urb;
+               urb->transfer_buffer = kzalloc(devpriv->pwm_buf_sz, GFP_KERNEL);
+               if (!urb->transfer_buffer)
                        return -ENOMEM;
-               }
        } else {
-               usbduxsub[index].urbPwm = NULL;
-               usbduxsub[index].sizePwmBuf = 0;
+               devpriv->pwm_urb = NULL;
+               devpriv->pwm_buf_sz = 0;
        }
 
-       usbduxsub[index].ai_cmd_running = 0;
-       usbduxsub[index].ao_cmd_running = 0;
-       usbduxsub[index].pwm_cmd_running = 0;
+       return 0;
+}
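
Reviewer note: the allocator above builds one-packet isochronous urbs for both
directions. As a standalone illustration (the helper name and parameters below
are ours, not part of the patch), the same 1 ms setup looks like this; note
that urb->interval counts microframes (125 us) on a high-speed link and frames
(1 ms) on a full-speed link, so 8 and 1 both describe a 1 ms cadence:

/* sketch only; assumes <linux/usb.h> and <linux/slab.h> */
static struct urb *example_alloc_iso_urb(struct usb_device *usb,
					 unsigned int ep, size_t len,
					 usb_complete_t complete_fn)
{
	struct urb *urb = usb_alloc_urb(1, GFP_KERNEL);

	if (!urb)
		return NULL;
	urb->dev = usb;
	urb->pipe = usb_rcvisocpipe(usb, ep);
	urb->transfer_flags = URB_ISO_ASAP;
	urb->transfer_buffer = kzalloc(len, GFP_KERNEL);
	if (!urb->transfer_buffer) {
		usb_free_urb(urb);
		return NULL;
	}
	urb->complete = complete_fn;
	urb->number_of_packets = 1;
	urb->transfer_buffer_length = len;
	urb->iso_frame_desc[0].offset = 0;
	urb->iso_frame_desc[0].length = len;
	/* 8 uframes == 1 frame == 1 ms */
	urb->interval = (usb->speed == USB_SPEED_HIGH) ? 8 : 1;
	return urb;
}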
 
-       /* we've reached the bottom of the function */
-       usbduxsub[index].probed = 1;
-       up(&start_stop_sem);
+static void usbduxsigma_free_usb_buffers(struct comedi_device *dev)
+{
+       struct usbduxsigma_private *devpriv = dev->private;
+       struct urb *urb;
+       int i;
 
-       ret = request_firmware_nowait(THIS_MODULE,
-                                     FW_ACTION_HOTPLUG,
-                                     FIRMWARE,
-                                     &udev->dev,
-                                     GFP_KERNEL,
-                                     usbduxsub + index,
-                                     usbdux_firmware_request_complete_handler
-                                     );
+       /* force unlink all urbs */
+       usbduxsigma_ai_stop(dev, 1);
+       usbduxsigma_ao_stop(dev, 1);
+       usbduxsigma_pwm_stop(dev, 1);
+
+       urb = devpriv->pwm_urb;
+       if (urb) {
+               kfree(urb->transfer_buffer);
+               usb_free_urb(urb);
+       }
+       if (devpriv->ao_urbs) {
+               for (i = 0; i < devpriv->n_ao_urbs; i++) {
+                       urb = devpriv->ao_urbs[i];
+                       if (urb) {
+                               kfree(urb->transfer_buffer);
+                               usb_free_urb(urb);
+                       }
+               }
+               kfree(devpriv->ao_urbs);
+       }
+       if (devpriv->ai_urbs) {
+               for (i = 0; i < devpriv->n_ai_urbs; i++) {
+                       urb = devpriv->ai_urbs[i];
+                       if (urb) {
+                               kfree(urb->transfer_buffer);
+                               usb_free_urb(urb);
+                       }
+               }
+               kfree(devpriv->ai_urbs);
+       }
+       kfree(devpriv->insn_buf);
+       kfree(devpriv->in_buf);
+       kfree(devpriv->dux_commands);
+       kfree(devpriv->dac_commands);
+}
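
Reviewer note: the bare -ENOMEM returns in usbduxsigma_alloc_usb_buffers()
lean on this routine; on a failed auto-attach the comedi core is expected to
invoke the driver's detach handler, which in turn calls
usbduxsigma_free_usb_buffers() (an assumption about the core's behaviour; the
call path is outside this hunk). The teardown is safe after a partial
allocation because kfree(NULL) and usb_free_urb(NULL) are defined no-ops. A
minimal sketch of the idiom, with illustrative names:

static void example_free_urbs(struct urb **urbs, int n)
{
	int i;

	if (!urbs)
		return;
	for (i = 0; i < n; i++) {
		if (urbs[i]) {
			/* both calls are NULL-safe; the check just
			 * mirrors the style used in the patch */
			kfree(urbs[i]->transfer_buffer);
			usb_free_urb(urbs[i]);
		}
	}
	kfree(urbs);
}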
 
-       if (ret) {
-               dev_err(dev, "Could not load firmware (err=%d)\n", ret);
-               return ret;
+static int usbduxsigma_auto_attach(struct comedi_device *dev,
+                                  unsigned long context_unused)
+{
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct usb_device *usb = comedi_to_usb_dev(dev);
+       struct usbduxsigma_private *devpriv;
+       int ret;
+
+       devpriv = kzalloc(sizeof(*devpriv), GFP_KERNEL);
+       if (!devpriv)
+               return -ENOMEM;
+       dev->private = devpriv;
+
+       sema_init(&devpriv->sem, 1);
+       usb_set_intfdata(intf, devpriv);
+
+       ret = usb_set_interface(usb,
+                               intf->altsetting->desc.bInterfaceNumber, 3);
+       if (ret < 0) {
+               dev_err(dev->class_dev,
+                       "could not set alternate setting 3 in high speed\n");
+               return -ENODEV;
        }
 
-       dev_info(dev, "comedi_: successfully initialised.\n");
-       /* success */
-       return 0;
+       /* test if it is high speed (USB 2.0) */
+       devpriv->high_speed = (usb->speed == USB_SPEED_HIGH);
+       if (devpriv->high_speed) {
+               devpriv->n_ai_urbs = NUMOFINBUFFERSHIGH;
+               devpriv->n_ao_urbs = NUMOFOUTBUFFERSHIGH;
+       } else {
+               devpriv->n_ai_urbs = NUMOFINBUFFERSFULL;
+               devpriv->n_ao_urbs = NUMOFOUTBUFFERSFULL;
+       }
+
+       ret = usbduxsigma_alloc_usb_buffers(dev);
+       if (ret)
+               return ret;
+
+       ret = comedi_load_firmware(dev, &usb->dev, FIRMWARE,
+                                  usbduxsigma_firmware_upload, 0);
+       if (ret)
+               return ret;
+
+       return usbduxsigma_attach_common(dev);
 }
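
Reviewer note: comedi_load_firmware() replaces the old
request_firmware_nowait() dance deleted above; it fetches FIRMWARE
synchronously and hands the image to the callback. A hedged sketch of what
such a callback can look like follows; the real usbduxsigma_firmware_upload()
body is outside this hunk, and the vendor request used below (0xa0, the
Cypress FX2 loader request) is illustrative rather than this driver's actual
upload sequence:

static int example_firmware_upload(struct comedi_device *dev,
				   const u8 *data, size_t size,
				   unsigned long context)
{
	struct usb_device *usb = comedi_to_usb_dev(dev);
	u8 *buf;
	int ret;

	if (!data || !size)
		return 0;

	/* usb_control_msg() should not be fed the caller's const image */
	buf = kmemdup(data, size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* hypothetical single-shot upload; real drivers usually stage
	 * the load (hold CPU in reset, write, release reset) */
	ret = usb_control_msg(usb, usb_sndctrlpipe(usb, 0), 0xa0,
			      USB_DIR_OUT | USB_TYPE_VENDOR, 0, 0,
			      buf, size, 1000);
	kfree(buf);
	return ret < 0 ? ret : 0;
}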
 
-static void usbduxsigma_usb_disconnect(struct usb_interface *intf)
+static void usbduxsigma_detach(struct comedi_device *dev)
 {
-       struct usbduxsub *usbduxsub_tmp = usb_get_intfdata(intf);
-       struct usb_device *udev = interface_to_usbdev(intf);
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
+       struct usbduxsigma_private *devpriv = dev->private;
 
-       if (!usbduxsub_tmp) {
-               dev_err(&intf->dev,
-                       "comedi_: disconnect called with null pointer.\n");
+       if (!devpriv)
                return;
-       }
-       if (usbduxsub_tmp->usbdev != udev) {
-               dev_err(&intf->dev, "comedi_: BUG! wrong ptr!\n");
-               return;
-       }
-       if (usbduxsub_tmp->ai_cmd_running)
-               /* we are still running a command */
-               usbdux_ai_stop(usbduxsub_tmp, 1);
-       if (usbduxsub_tmp->ao_cmd_running)
-               /* we are still running a command */
-               usbdux_ao_stop(usbduxsub_tmp, 1);
-       comedi_usb_auto_unconfig(intf);
-       down(&start_stop_sem);
-       down(&usbduxsub_tmp->sem);
-       tidy_up(usbduxsub_tmp);
-       up(&usbduxsub_tmp->sem);
-       up(&start_stop_sem);
-       dev_info(&intf->dev, "comedi_: disconnected from the usb\n");
+
+       usb_set_intfdata(intf, NULL);
+
+       down(&devpriv->sem);
+       usbduxsigma_free_usb_buffers(dev);
+       up(&devpriv->sem);
+}
+
+static struct comedi_driver usbduxsigma_driver = {
+       .driver_name    = "usbduxsigma",
+       .module         = THIS_MODULE,
+       .auto_attach    = usbduxsigma_auto_attach,
+       .detach         = usbduxsigma_detach,
+};
+
+static int usbduxsigma_usb_probe(struct usb_interface *intf,
+                                const struct usb_device_id *id)
+{
+       return comedi_usb_auto_config(intf, &usbduxsigma_driver, 0);
 }
 
 static const struct usb_device_id usbduxsigma_usb_table[] = {
@@ -2664,7 +1754,7 @@ MODULE_DEVICE_TABLE(usb, usbduxsigma_usb_table);
 static struct usb_driver usbduxsigma_usb_driver = {
        .name           = "usbduxsigma",
        .probe          = usbduxsigma_usb_probe,
-       .disconnect     = usbduxsigma_usb_disconnect,
+       .disconnect     = comedi_usb_auto_unconfig,
        .id_table       = usbduxsigma_usb_table,
 };
 module_comedi_usb_driver(usbduxsigma_driver, usbduxsigma_usb_driver);
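
Reviewer note: with comedi_usb_auto_config()/comedi_usb_auto_unconfig() doing
the binding, the module glue collapses to the one-liner above. A sketch of the
presumed expansion of module_comedi_usb_driver(), simplified:

static int __init example_init(void)
{
	return comedi_usb_driver_register(&usbduxsigma_driver,
					  &usbduxsigma_usb_driver);
}

static void __exit example_exit(void)
{
	comedi_usb_driver_unregister(&usbduxsigma_driver,
				     &usbduxsigma_usb_driver);
}

module_init(example_init);
module_exit(example_exit);

Note also that comedi_usb_auto_unconfig() can be wired in directly as the
.disconnect handler because its signature matches
void (*)(struct usb_interface *), which is why the bespoke disconnect
function could be deleted wholesale.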
index 2be5087414f62478f3701b37ed7a1508e6ef4690..0ab04c0dd4102a3513302b848f6ca9f0b64aaa18 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 /*
 Driver: vmk80xx
@@ -159,8 +154,6 @@ static const struct vmk80xx_board vmk80xx_boardinfo[] = {
 };
 
 struct vmk80xx_private {
-       struct usb_device *usb;
-       struct usb_interface *intf;
        struct usb_endpoint_descriptor *ep_rx;
        struct usb_endpoint_descriptor *ep_tx;
        struct firmware_version fw;
@@ -170,9 +163,10 @@ struct vmk80xx_private {
        enum vmk80xx_model model;
 };
 
-static int vmk80xx_check_data_link(struct vmk80xx_private *devpriv)
+static int vmk80xx_check_data_link(struct comedi_device *dev)
 {
-       struct usb_device *usb = devpriv->usb;
+       struct vmk80xx_private *devpriv = dev->private;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
        unsigned int tx_pipe;
        unsigned int rx_pipe;
        unsigned char tx[1];
@@ -194,9 +188,10 @@ static int vmk80xx_check_data_link(struct vmk80xx_private *devpriv)
        return (int)rx[1];
 }
 
-static void vmk80xx_read_eeprom(struct vmk80xx_private *devpriv, int flag)
+static void vmk80xx_read_eeprom(struct comedi_device *dev, int flag)
 {
-       struct usb_device *usb = devpriv->usb;
+       struct vmk80xx_private *devpriv = dev->private;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
        unsigned int tx_pipe;
        unsigned int rx_pipe;
        unsigned char tx[1];
@@ -223,9 +218,10 @@ static void vmk80xx_read_eeprom(struct vmk80xx_private *devpriv, int flag)
                strncpy(devpriv->fw.ic6_vers, rx + 25, 24);
 }
 
-static void vmk80xx_do_bulk_msg(struct vmk80xx_private *devpriv)
+static void vmk80xx_do_bulk_msg(struct comedi_device *dev)
 {
-       struct usb_device *usb = devpriv->usb;
+       struct vmk80xx_private *devpriv = dev->private;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
        __u8 tx_addr;
        __u8 rx_addr;
        unsigned int tx_pipe;
@@ -248,21 +244,18 @@ static void vmk80xx_do_bulk_msg(struct vmk80xx_private *devpriv)
        usb_bulk_msg(usb, rx_pipe, devpriv->usb_rx_buf, size, NULL, HZ * 10);
 }
 
-static int vmk80xx_read_packet(struct vmk80xx_private *devpriv)
+static int vmk80xx_read_packet(struct comedi_device *dev)
 {
-       struct usb_device *usb;
+       struct vmk80xx_private *devpriv = dev->private;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
        struct usb_endpoint_descriptor *ep;
        unsigned int pipe;
 
-       if (!devpriv->intf)
-               return -ENODEV;
-
        if (devpriv->model == VMK8061_MODEL) {
-               vmk80xx_do_bulk_msg(devpriv);
+               vmk80xx_do_bulk_msg(dev);
                return 0;
        }
 
-       usb = devpriv->usb;
        ep = devpriv->ep_rx;
        pipe = usb_rcvintpipe(usb, ep->bEndpointAddress);
        return usb_interrupt_msg(usb, pipe, devpriv->usb_rx_buf,
@@ -270,23 +263,20 @@ static int vmk80xx_read_packet(struct vmk80xx_private *devpriv)
                                 HZ * 10);
 }
 
-static int vmk80xx_write_packet(struct vmk80xx_private *devpriv, int cmd)
+static int vmk80xx_write_packet(struct comedi_device *dev, int cmd)
 {
-       struct usb_device *usb;
+       struct vmk80xx_private *devpriv = dev->private;
+       struct usb_device *usb = comedi_to_usb_dev(dev);
        struct usb_endpoint_descriptor *ep;
        unsigned int pipe;
 
-       if (!devpriv->intf)
-               return -ENODEV;
-
        devpriv->usb_tx_buf[0] = cmd;
 
        if (devpriv->model == VMK8061_MODEL) {
-               vmk80xx_do_bulk_msg(devpriv);
+               vmk80xx_do_bulk_msg(dev);
                return 0;
        }
 
-       usb = devpriv->usb;
        ep = devpriv->ep_tx;
        pipe = usb_sndintpipe(usb, ep->bEndpointAddress);
        return usb_interrupt_msg(usb, pipe, devpriv->usb_tx_buf,
@@ -294,18 +284,19 @@ static int vmk80xx_write_packet(struct vmk80xx_private *devpriv, int cmd)
                                 HZ * 10);
 }
 
-static int vmk80xx_reset_device(struct vmk80xx_private *devpriv)
+static int vmk80xx_reset_device(struct comedi_device *dev)
 {
+       struct vmk80xx_private *devpriv = dev->private;
        size_t size;
        int retval;
 
        size = le16_to_cpu(devpriv->ep_tx->wMaxPacketSize);
        memset(devpriv->usb_tx_buf, 0, size);
-       retval = vmk80xx_write_packet(devpriv, VMK8055_CMD_RST);
+       retval = vmk80xx_write_packet(dev, VMK8055_CMD_RST);
        if (retval)
                return retval;
        /* set outputs to known state as we cannot read them */
-       return vmk80xx_write_packet(devpriv, VMK8055_CMD_WRT_AD);
+       return vmk80xx_write_packet(dev, VMK8055_CMD_WRT_AD);
 }
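
Reviewer note: every helper in this file now takes the comedi_device and
derives the private data and USB handles on entry, which is what lets the
usb/intf fields be dropped from vmk80xx_private below. A sketch of what the
two accessors are assumed to boil down to (simplified; the real definitions
live in comedi's USB glue):

static inline struct usb_interface *
example_to_usb_interface(struct comedi_device *dev)
{
	/* comedi stores the bound hardware device in dev->hw_dev */
	return to_usb_interface(dev->hw_dev);
}

static inline struct usb_device *
example_to_usb_dev(struct comedi_device *dev)
{
	return interface_to_usbdev(example_to_usb_interface(dev));
}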
 
 static int vmk80xx_ai_insn_read(struct comedi_device *dev,
@@ -338,7 +329,7 @@ static int vmk80xx_ai_insn_read(struct comedi_device *dev,
        }
 
        for (n = 0; n < insn->n; n++) {
-               if (vmk80xx_read_packet(devpriv))
+               if (vmk80xx_read_packet(dev))
                        break;
 
                if (devpriv->model == VMK8055_MODEL) {
@@ -388,7 +379,7 @@ static int vmk80xx_ao_insn_write(struct comedi_device *dev,
        for (n = 0; n < insn->n; n++) {
                devpriv->usb_tx_buf[reg] = data[n];
 
-               if (vmk80xx_write_packet(devpriv, cmd))
+               if (vmk80xx_write_packet(dev, cmd))
                        break;
        }
 
@@ -415,7 +406,7 @@ static int vmk80xx_ao_insn_read(struct comedi_device *dev,
        devpriv->usb_tx_buf[0] = VMK8061_CMD_RD_AO;
 
        for (n = 0; n < insn->n; n++) {
-               if (vmk80xx_read_packet(devpriv))
+               if (vmk80xx_read_packet(dev))
                        break;
 
                data[n] = devpriv->usb_rx_buf[reg + chan];
@@ -447,7 +438,7 @@ static int vmk80xx_di_insn_bits(struct comedi_device *dev,
                reg = VMK8055_DI_REG;
        }
 
-       retval = vmk80xx_read_packet(devpriv);
+       retval = vmk80xx_read_packet(dev);
 
        if (!retval) {
                if (devpriv->model == VMK8055_MODEL)
@@ -492,7 +483,7 @@ static int vmk80xx_do_insn_bits(struct comedi_device *dev,
                tx_buf[reg] &= ~data[0];
                tx_buf[reg] |= (data[0] & data[1]);
 
-               retval = vmk80xx_write_packet(devpriv, cmd);
+               retval = vmk80xx_write_packet(dev, cmd);
 
                if (retval)
                        goto out;
@@ -501,7 +492,7 @@ static int vmk80xx_do_insn_bits(struct comedi_device *dev,
        if (devpriv->model == VMK8061_MODEL) {
                tx_buf[0] = VMK8061_CMD_RD_DO;
 
-               retval = vmk80xx_read_packet(devpriv);
+               retval = vmk80xx_read_packet(dev);
 
                if (!retval) {
                        data[1] = rx_buf[reg];
@@ -547,7 +538,7 @@ static int vmk80xx_cnt_insn_read(struct comedi_device *dev,
        }
 
        for (n = 0; n < insn->n; n++) {
-               if (vmk80xx_read_packet(devpriv))
+               if (vmk80xx_read_packet(dev))
                        break;
 
                if (devpriv->model == VMK8055_MODEL)
@@ -597,7 +588,7 @@ static int vmk80xx_cnt_insn_config(struct comedi_device *dev,
        }
 
        for (n = 0; n < insn->n; n++)
-               if (vmk80xx_write_packet(devpriv, cmd))
+               if (vmk80xx_write_packet(dev, cmd))
                        break;
 
        up(&devpriv->limit_sem);
@@ -640,7 +631,7 @@ static int vmk80xx_cnt_insn_write(struct comedi_device *dev,
 
                devpriv->usb_tx_buf[6 + chan] = val;
 
-               if (vmk80xx_write_packet(devpriv, cmd))
+               if (vmk80xx_write_packet(dev, cmd))
                        break;
        }
 
@@ -671,7 +662,7 @@ static int vmk80xx_pwm_insn_read(struct comedi_device *dev,
        tx_buf[0] = VMK8061_CMD_RD_PWM;
 
        for (n = 0; n < insn->n; n++) {
-               if (vmk80xx_read_packet(devpriv))
+               if (vmk80xx_read_packet(dev))
                        break;
 
                data[n] = rx_buf[reg[0]] + 4 * rx_buf[reg[1]];
@@ -719,7 +710,7 @@ static int vmk80xx_pwm_insn_write(struct comedi_device *dev,
                tx_buf[reg[0]] = (unsigned char)(data[n] & 0x03);
                tx_buf[reg[1]] = (unsigned char)(data[n] >> 2) & 0xff;
 
-               if (vmk80xx_write_packet(devpriv, cmd))
+               if (vmk80xx_write_packet(dev, cmd))
                        break;
        }
 
@@ -731,7 +722,7 @@ static int vmk80xx_pwm_insn_write(struct comedi_device *dev,
 static int vmk80xx_find_usb_endpoints(struct comedi_device *dev)
 {
        struct vmk80xx_private *devpriv = dev->private;
-       struct usb_interface *intf = devpriv->intf;
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
        struct usb_host_interface *iface_desc = intf->cur_altsetting;
        struct usb_endpoint_descriptor *ep_desc;
        int i;
@@ -889,8 +880,6 @@ static int vmk80xx_auto_attach(struct comedi_device *dev,
                return -ENOMEM;
        dev->private = devpriv;
 
-       devpriv->usb = interface_to_usbdev(intf);
-       devpriv->intf = intf;
        devpriv->model = boardinfo->model;
 
        ret = vmk80xx_find_usb_endpoints(dev);
@@ -906,23 +895,24 @@ static int vmk80xx_auto_attach(struct comedi_device *dev,
        usb_set_intfdata(intf, devpriv);
 
        if (devpriv->model == VMK8061_MODEL) {
-               vmk80xx_read_eeprom(devpriv, IC3_VERSION);
+               vmk80xx_read_eeprom(dev, IC3_VERSION);
                dev_info(&intf->dev, "%s\n", devpriv->fw.ic3_vers);
 
-               if (vmk80xx_check_data_link(devpriv)) {
-                       vmk80xx_read_eeprom(devpriv, IC6_VERSION);
+               if (vmk80xx_check_data_link(dev)) {
+                       vmk80xx_read_eeprom(dev, IC6_VERSION);
                        dev_info(&intf->dev, "%s\n", devpriv->fw.ic6_vers);
                }
        }
 
        if (devpriv->model == VMK8055_MODEL)
-               vmk80xx_reset_device(devpriv);
+               vmk80xx_reset_device(dev);
 
        return vmk80xx_init_subdevices(dev);
 }
 
 static void vmk80xx_detach(struct comedi_device *dev)
 {
+       struct usb_interface *intf = comedi_to_usb_interface(dev);
        struct vmk80xx_private *devpriv = dev->private;
 
        if (!devpriv)
@@ -930,7 +920,7 @@ static void vmk80xx_detach(struct comedi_device *dev)
 
        down(&devpriv->limit_sem);
 
-       usb_set_intfdata(devpriv->intf, NULL);
+       usb_set_intfdata(intf, NULL);
 
        kfree(devpriv->usb_rx_buf);
        kfree(devpriv->usb_tx_buf);
index 3231a483f561c5239fdd5a01cb5f07f6b3062801..da8988c6bf503c8bc9390cd4b838aad1b95ed597 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/module.h>
index 886c202de9ab4784e6af2a80c9516cf20f65298b..8ee94424bc8fef6b0dc94ec31774152ca0932d68 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 /*
index 1dc391b76447f38040178c8a068c14fdb6183a51..1f20332cc45921c54fdc4b4ebf61e64119d1b6d8 100644 (file)
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
 */
 
 #include <linux/uaccess.h>
index e96eee3ca89821988008254aea2c9633ec0677de..42a5f5c8d3d181e224be7d2b6b2d14ca24be9592 100644 (file)
@@ -547,10 +547,8 @@ fail_gpio:
 fail:
        /* Clean up before returning failure */
        for (i = 0; i < TOUCH_SUPPORTED; i++) {
-               if (ts->cp_input_info[i].input) {
+               if (ts->cp_input_info[i].input)
                        input_unregister_device(ts->cp_input_info[i].input);
-                       input_free_device(ts->cp_input_info[i].input);
-               }
        }
        kfree(ts);
        return retval;
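
Reviewer note: the deleted input_free_device() call was a genuine bug, not
just noise; input_unregister_device() already drops the registered device's
reference, so freeing it again is a use-after-free. The rule of thumb,
sketched below with illustrative names, is to free only a device that was
never successfully registered:

static int example_register(struct input_dev *input)
{
	int err = input_register_device(input);

	if (err)
		input_free_device(input);	/* never registered: free it */
	return err;
}

static void example_unregister(struct input_dev *input)
{
	/* unregister drops the reference; no input_free_device() after */
	input_unregister_device(input);
}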
index fd1a6e680c8ae4da4111b0a0e29e49595fda842b..981708f3ee39141fa94612d00ca380fce938499a 100644 (file)
  * between the driver and the application.
  */
 enum BC_DTS_GLOBALS {
-       BC_MAX_FW_CMD_BUFF_SZ   = 0x40,         /* FW passthrough cmd/rsp buffer size */
+       BC_MAX_FW_CMD_BUFF_SZ = 0x40, /* FW passthrough cmd/rsp buffer size */
        PCI_CFG_SIZE            = 256,          /* PCI config size buffer */
        BC_IOCTL_DATA_POOL_SIZE = 8,            /* BC_IOCTL_DATA Pool size */
-       BC_LINK_MAX_OPENS       = 3,            /* Maximum simultaneous opens*/
-       BC_LINK_MAX_SGLS        = 1024,         /* Maximum SG elements 4M/4K */
+       BC_LINK_MAX_OPENS       = 3,    /* Maximum simultaneous opens */
+       BC_LINK_MAX_SGLS        = 1024, /* Maximum SG elements 4M/4K */
        BC_TX_LIST_CNT          = 2,            /* Max Tx DMA Rings */
        BC_RX_LIST_CNT          = 8,            /* Max Rx DMA Rings */
        BC_PROC_OUTPUT_TIMEOUT  = 3000,         /* Milliseconds */
@@ -240,11 +240,14 @@ enum BC_DRV_CMD {
        DRV_CMD_ADD_RXBUFFS,    /* Add Rx side buffers to driver pool */
        DRV_CMD_FETCH_RXBUFF,   /* Get Rx DMAed buffer */
        DRV_CMD_START_RX_CAP,   /* Start Rx Buffer Capture */
-       DRV_CMD_FLUSH_RX_CAP,   /* Stop the capture for now...we will enhance this later*/
+       DRV_CMD_FLUSH_RX_CAP,   /* Stop the capture for now...
+                                * we will enhance this later */
        DRV_CMD_GET_DRV_STAT,   /* Get Driver Internal Statistics */
        DRV_CMD_RST_DRV_STAT,   /* Reset Driver Internal Statistics */
-       DRV_CMD_NOTIFY_MODE,    /* Notify the Mode to driver in which the application is Operating*/
-       DRV_CMD_CHANGE_CLOCK,   /* Change the core clock to either save power or improve performance */
+       DRV_CMD_NOTIFY_MODE,    /* Notify the driver of the mode in
+                                * which the application is operating */
+       DRV_CMD_CHANGE_CLOCK,   /* Change the core clock to either save
+                                * power or improve performance */
 
        /* MUST be the last one.. */
        DRV_CMD_END,                    /* End of the List.. */
@@ -283,8 +286,8 @@ struct crystalhd_ioctl_data {
        struct BC_IOCTL_DATA    udata;          /* IOCTL from App..*/
        uint32_t                u_id;           /* Driver specific user ID */
        uint32_t                cmd;            /* Cmd ID for driver's use. */
-       void                    *add_cdata;     /* Additional command specific data..*/
-       uint32_t                add_cdata_sz;   /* Additional command specific data size */
+       void     *add_cdata;    /* Additional command specific data..*/
+       uint32_t add_cdata_sz;  /* Additional command specific data size */
        struct crystalhd_ioctl_data *next;      /* List/Fifo management */
 };
 
index ed99daa6ef468627d3ff7832e407b4944f94d5db..3ab502b8c3be7b9acb634ea0af7f037001e45421 100644 (file)
@@ -472,8 +472,8 @@ static enum BC_STATUS bc_cproc_hw_txdma(struct crystalhd_cmd *ctx,
 }
 
 /* Helper function to check on user buffers */
-static enum BC_STATUS bc_cproc_check_inbuffs(bool pin, void *ubuff, uint32_t ub_sz,
-                                       uint32_t uv_off, bool en_422)
+static enum BC_STATUS bc_cproc_check_inbuffs(bool pin, void *ubuff,
+                                uint32_t ub_sz, uint32_t uv_off, bool en_422)
 {
        if (!ubuff || !ub_sz) {
                BCMLOG_ERR("%s->Invalid Arg %p %x\n",
@@ -483,8 +483,9 @@ static enum BC_STATUS bc_cproc_check_inbuffs(bool pin, void *ubuff, uint32_t ub_
 
        /* Check for alignment */
        if (((uintptr_t)ubuff) & 0x03) {
-               BCMLOG_ERR("%s-->Un-aligned address not implemented yet.. %p\n",
-                               ((pin) ? "TX" : "RX"), ubuff);
+               BCMLOG_ERR(
+                       "%s-->Un-aligned address not implemented yet.. %p\n",
+                       ((pin) ? "TX" : "RX"), ubuff);
                return BC_STS_NOT_IMPL;
        }
        if (pin)
@@ -572,7 +573,8 @@ static enum BC_STATUS bc_cproc_add_cap_buff(struct crystalhd_cmd *ctx,
        if (!dio_hnd)
                return BC_STS_ERROR;
 
-       sts = crystalhd_hw_add_cap_buffer(&ctx->hw_ctx, dio_hnd, (ctx->state == BC_LINK_READY));
+       sts = crystalhd_hw_add_cap_buffer(&ctx->hw_ctx, dio_hnd,
+                                        (ctx->state == BC_LINK_READY));
        if ((sts != BC_STS_SUCCESS) && (sts != BC_STS_BUSY)) {
                crystalhd_unmap_dio(ctx->adp, dio_hnd);
                return sts;
@@ -618,7 +620,8 @@ static enum BC_STATUS bc_cproc_fetch_frame(struct crystalhd_cmd *ctx,
 
        sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx, &frame->PibInfo, &dio);
        if (sts != BC_STS_SUCCESS)
-               return (ctx->state & BC_LINK_SUSPEND) ? BC_STS_IO_USER_ABORT : sts;
+               return (ctx->state & BC_LINK_SUSPEND) ?
+                                        BC_STS_IO_USER_ABORT : sts;
 
        frame->Flags = dio->uinfo.comp_flags;
 
@@ -673,7 +676,8 @@ static enum BC_STATUS bc_cproc_flush_cap_buffs(struct crystalhd_cmd *ctx,
        frame = &idata->udata.u.DecOutData;
        for (count = 0; count < BC_RX_LIST_CNT; count++) {
 
-               sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx, &frame->PibInfo, &dio);
+               sts = crystalhd_hw_get_cap_buffer(&ctx->hw_ctx,
+                                        &frame->PibInfo, &dio);
                if (sts != BC_STS_SUCCESS)
                        break;
 
@@ -916,7 +920,8 @@ enum BC_STATUS crystalhd_user_open(struct crystalhd_cmd *ctx,
  * Closer application handle and release app specific
  * resources.
  */
-enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx, struct crystalhd_user *uc)
+enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx,
+                                        struct crystalhd_user *uc)
 {
        uint32_t mode = uc->mode;
 
@@ -1008,8 +1013,8 @@ enum BC_STATUS crystalhd_delete_cmd_context(struct crystalhd_cmd *ctx)
  * mode of operation and returns the function pointer
  * from the cproc table.
  */
-crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx, uint32_t cmd,
-                                     struct crystalhd_user *uc)
+crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx,
+                                uint32_t cmd, struct crystalhd_user *uc)
 {
        crystalhd_cmd_proc cproc = NULL;
        unsigned int i, tbl_sz;
@@ -1024,7 +1029,8 @@ crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx, uint32_t cm
                return NULL;
        }
 
-       tbl_sz = sizeof(g_crystalhd_cproc_tbl) / sizeof(struct crystalhd_cmd_tbl);
+       tbl_sz = sizeof(g_crystalhd_cproc_tbl) /
+                                sizeof(struct crystalhd_cmd_tbl);
        for (i = 0; i < tbl_sz; i++) {
                if (g_crystalhd_cproc_tbl[i].cmd_id == cmd) {
                        if ((uc->mode == DTS_MONITOR_MODE) &&
index 4066ba393a17277cb0eacfad208ffbd5148212a7..377cd9d68b08e2e352667dc2ee7a871d17c6135f 100644 (file)
@@ -66,7 +66,8 @@ struct crystalhd_cmd {
        struct crystalhd_hw     hw_ctx;
 };
 
-typedef enum BC_STATUS(*crystalhd_cmd_proc)(struct crystalhd_cmd *, struct crystalhd_ioctl_data *);
+typedef enum BC_STATUS(*crystalhd_cmd_proc)(struct crystalhd_cmd *,
+                                        struct crystalhd_ioctl_data *);
 
 struct crystalhd_cmd_tbl {
        uint32_t                cmd_id;
@@ -74,13 +75,17 @@ struct crystalhd_cmd_tbl {
        uint32_t                block_mon;
 };
 
-enum BC_STATUS crystalhd_suspend(struct crystalhd_cmd *ctx, struct crystalhd_ioctl_data *idata);
+enum BC_STATUS crystalhd_suspend(struct crystalhd_cmd *ctx,
+                                struct crystalhd_ioctl_data *idata);
 enum BC_STATUS crystalhd_resume(struct crystalhd_cmd *ctx);
-crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx, uint32_t cmd,
-                                     struct crystalhd_user *uc);
-enum BC_STATUS crystalhd_user_open(struct crystalhd_cmd *ctx, struct crystalhd_user **user_ctx);
-enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx, struct crystalhd_user *uc);
-enum BC_STATUS crystalhd_setup_cmd_context(struct crystalhd_cmd *ctx, struct crystalhd_adp *adp);
+crystalhd_cmd_proc crystalhd_get_cmd_proc(struct crystalhd_cmd *ctx,
+                                uint32_t cmd, struct crystalhd_user *uc);
+enum BC_STATUS crystalhd_user_open(struct crystalhd_cmd *ctx,
+                                struct crystalhd_user **user_ctx);
+enum BC_STATUS crystalhd_user_close(struct crystalhd_cmd *ctx,
+                                struct crystalhd_user *uc);
+enum BC_STATUS crystalhd_setup_cmd_context(struct crystalhd_cmd *ctx,
+                                struct crystalhd_adp *adp);
 enum BC_STATUS crystalhd_delete_cmd_context(struct crystalhd_cmd *ctx);
 bool crystalhd_cmd_interrupt(struct crystalhd_cmd *ctx);
 
index 9e2831e68bbad6f47be2ae510242de011482e75e..4b363a5069d76fa208428bc0ca059aff1397d2df 100644 (file)
@@ -106,7 +106,8 @@ struct ppb_vc1 {
 
 struct fgt_sei {
        struct fgt_sei *next;
-       unsigned char model_values[3][MAX_FGT_VALUE_INTERVAL][MAX_FGT_MODEL_VALUE];
+       unsigned char
+                model_values[3][MAX_FGT_VALUE_INTERVAL][MAX_FGT_MODEL_VALUE];
        unsigned char upper_bound[3][MAX_FGT_VALUE_INTERVAL];
        unsigned char lower_bound[3][MAX_FGT_VALUE_INTERVAL];
 
@@ -125,10 +126,12 @@ struct fgt_sei {
 
        unsigned char blending_mode_id; /* Blending mode. */
        unsigned char log2_scale_factor;        /* Log2 scale factor (2-7). */
-       unsigned char comp_flag[3];             /* Components [0,2] parameters present flag. */
-       unsigned char num_intervals_minus1[3]; /* Number of intensity level intervals. */
+       unsigned char comp_flag[3];     /* Components [0,2]
+                                        * parameters present flag. */
+       unsigned char num_intervals_minus1[3]; /* Number of intensity
+                                        * level intervals. */
        unsigned char num_model_values[3];      /* Number of model values. */
-       uint16_t      repetition_period;        /* Repetition period (0-16384) */
+       uint16_t      repetition_period; /* Repetition period (0-16384) */
 
 };
 
@@ -266,40 +269,40 @@ enum  c011_ts_cmd {
 
        /* Decoding commands */
        eCMD_C011_DEC_CHAN_OPEN                 = eCMD_C011_CMD_BASE + 0x100,
-       eCMD_C011_DEC_CHAN_CLOSE                        = eCMD_C011_CMD_BASE + 0x101,
-       eCMD_C011_DEC_CHAN_ACTIVATE                     = eCMD_C011_CMD_BASE + 0x102,
-       eCMD_C011_DEC_CHAN_STATUS                       = eCMD_C011_CMD_BASE + 0x103,
-       eCMD_C011_DEC_CHAN_FLUSH                        = eCMD_C011_CMD_BASE + 0x104,
+       eCMD_C011_DEC_CHAN_CLOSE                = eCMD_C011_CMD_BASE + 0x101,
+       eCMD_C011_DEC_CHAN_ACTIVATE             = eCMD_C011_CMD_BASE + 0x102,
+       eCMD_C011_DEC_CHAN_STATUS               = eCMD_C011_CMD_BASE + 0x103,
+       eCMD_C011_DEC_CHAN_FLUSH                = eCMD_C011_CMD_BASE + 0x104,
        eCMD_C011_DEC_CHAN_TRICK_PLAY           = eCMD_C011_CMD_BASE + 0x105,
-       eCMD_C011_DEC_CHAN_TS_PIDS                      = eCMD_C011_CMD_BASE + 0x106,
+       eCMD_C011_DEC_CHAN_TS_PIDS              = eCMD_C011_CMD_BASE + 0x106,
        eCMD_C011_DEC_CHAN_PS_STREAM_ID         = eCMD_C011_CMD_BASE + 0x107,
        eCMD_C011_DEC_CHAN_INPUT_PARAMS         = eCMD_C011_CMD_BASE + 0x108,
        eCMD_C011_DEC_CHAN_VIDEO_OUTPUT         = eCMD_C011_CMD_BASE + 0x109,
-       eCMD_C011_DEC_CHAN_OUTPUT_FORMAT                = eCMD_C011_CMD_BASE + 0x10A,
-       eCMD_C011_DEC_CHAN_SCALING_FILTERS              = eCMD_C011_CMD_BASE + 0x10B,
-       eCMD_C011_DEC_CHAN_OSD_MODE                     = eCMD_C011_CMD_BASE + 0x10D,
+       eCMD_C011_DEC_CHAN_OUTPUT_FORMAT        = eCMD_C011_CMD_BASE + 0x10A,
+       eCMD_C011_DEC_CHAN_SCALING_FILTERS      = eCMD_C011_CMD_BASE + 0x10B,
+       eCMD_C011_DEC_CHAN_OSD_MODE             = eCMD_C011_CMD_BASE + 0x10D,
        eCMD_C011_DEC_CHAN_DROP                 = eCMD_C011_CMD_BASE + 0x10E,
-       eCMD_C011_DEC_CHAN_RELEASE                      = eCMD_C011_CMD_BASE + 0x10F,
-       eCMD_C011_DEC_CHAN_STREAM_SETTINGS              = eCMD_C011_CMD_BASE + 0x110,
+       eCMD_C011_DEC_CHAN_RELEASE              = eCMD_C011_CMD_BASE + 0x10F,
+       eCMD_C011_DEC_CHAN_STREAM_SETTINGS      = eCMD_C011_CMD_BASE + 0x110,
        eCMD_C011_DEC_CHAN_PAUSE_OUTPUT         = eCMD_C011_CMD_BASE + 0x111,
-       eCMD_C011_DEC_CHAN_CHANGE                       = eCMD_C011_CMD_BASE + 0x112,
-       eCMD_C011_DEC_CHAN_SET_STC                      = eCMD_C011_CMD_BASE + 0x113,
-       eCMD_C011_DEC_CHAN_SET_PTS                      = eCMD_C011_CMD_BASE + 0x114,
-       eCMD_C011_DEC_CHAN_CC_MODE                      = eCMD_C011_CMD_BASE + 0x115,
-       eCMD_C011_DEC_CREATE_AUDIO_CONTEXT              = eCMD_C011_CMD_BASE + 0x116,
-       eCMD_C011_DEC_COPY_AUDIO_CONTEXT                = eCMD_C011_CMD_BASE + 0x117,
-       eCMD_C011_DEC_DELETE_AUDIO_CONTEXT              = eCMD_C011_CMD_BASE + 0x118,
-       eCMD_C011_DEC_CHAN_SET_DECYPTION                = eCMD_C011_CMD_BASE + 0x119,
+       eCMD_C011_DEC_CHAN_CHANGE               = eCMD_C011_CMD_BASE + 0x112,
+       eCMD_C011_DEC_CHAN_SET_STC              = eCMD_C011_CMD_BASE + 0x113,
+       eCMD_C011_DEC_CHAN_SET_PTS              = eCMD_C011_CMD_BASE + 0x114,
+       eCMD_C011_DEC_CHAN_CC_MODE              = eCMD_C011_CMD_BASE + 0x115,
+       eCMD_C011_DEC_CREATE_AUDIO_CONTEXT      = eCMD_C011_CMD_BASE + 0x116,
+       eCMD_C011_DEC_COPY_AUDIO_CONTEXT        = eCMD_C011_CMD_BASE + 0x117,
+       eCMD_C011_DEC_DELETE_AUDIO_CONTEXT      = eCMD_C011_CMD_BASE + 0x118,
+       eCMD_C011_DEC_CHAN_SET_DECYPTION        = eCMD_C011_CMD_BASE + 0x119,
        eCMD_C011_DEC_CHAN_START_VIDEO          = eCMD_C011_CMD_BASE + 0x11A,
        eCMD_C011_DEC_CHAN_STOP_VIDEO           = eCMD_C011_CMD_BASE + 0x11B,
        eCMD_C011_DEC_CHAN_PIC_CAPTURE          = eCMD_C011_CMD_BASE + 0x11C,
-       eCMD_C011_DEC_CHAN_PAUSE                        = eCMD_C011_CMD_BASE + 0x11D,
+       eCMD_C011_DEC_CHAN_PAUSE                = eCMD_C011_CMD_BASE + 0x11D,
        eCMD_C011_DEC_CHAN_PAUSE_STATE          = eCMD_C011_CMD_BASE + 0x11E,
-       eCMD_C011_DEC_CHAN_SET_SLOWM_RATE               = eCMD_C011_CMD_BASE + 0x11F,
-       eCMD_C011_DEC_CHAN_GET_SLOWM_RATE               = eCMD_C011_CMD_BASE + 0x120,
+       eCMD_C011_DEC_CHAN_SET_SLOWM_RATE       = eCMD_C011_CMD_BASE + 0x11F,
+       eCMD_C011_DEC_CHAN_GET_SLOWM_RATE       = eCMD_C011_CMD_BASE + 0x120,
        eCMD_C011_DEC_CHAN_SET_FF_RATE          = eCMD_C011_CMD_BASE + 0x121,
        eCMD_C011_DEC_CHAN_GET_FF_RATE          = eCMD_C011_CMD_BASE + 0x122,
-       eCMD_C011_DEC_CHAN_FRAME_ADVANCE                = eCMD_C011_CMD_BASE + 0x123,
+       eCMD_C011_DEC_CHAN_FRAME_ADVANCE        = eCMD_C011_CMD_BASE + 0x123,
        eCMD_C011_DEC_CHAN_SET_SKIP_PIC_MODE    = eCMD_C011_CMD_BASE + 0x124,
        eCMD_C011_DEC_CHAN_GET_SKIP_PIC_MODE    = eCMD_C011_CMD_BASE + 0x125,
        eCMD_C011_DEC_CHAN_FILL_PIC_BUF         = eCMD_C011_CMD_BASE + 0x126,
@@ -308,15 +311,16 @@ enum  c011_ts_cmd {
        eCMD_C011_DEC_CHAN_SET_BRCM_TRICK_MODE  = eCMD_C011_CMD_BASE + 0x129,
        eCMD_C011_DEC_CHAN_GET_BRCM_TRICK_MODE  = eCMD_C011_CMD_BASE + 0x12A,
        eCMD_C011_DEC_CHAN_REVERSE_FIELD_STATUS = eCMD_C011_CMD_BASE + 0x12B,
-       eCMD_C011_DEC_CHAN_I_PICTURE_FOUND              = eCMD_C011_CMD_BASE + 0x12C,
-       eCMD_C011_DEC_CHAN_SET_PARAMETER                = eCMD_C011_CMD_BASE + 0x12D,
+       eCMD_C011_DEC_CHAN_I_PICTURE_FOUND      = eCMD_C011_CMD_BASE + 0x12C,
+       eCMD_C011_DEC_CHAN_SET_PARAMETER        = eCMD_C011_CMD_BASE + 0x12D,
        eCMD_C011_DEC_CHAN_SET_USER_DATA_MODE   = eCMD_C011_CMD_BASE + 0x12E,
-       eCMD_C011_DEC_CHAN_SET_PAUSE_DISPLAY_MODE       = eCMD_C011_CMD_BASE + 0x12F,
-       eCMD_C011_DEC_CHAN_SET_SLOW_DISPLAY_MODE        = eCMD_C011_CMD_BASE + 0x130,
+       eCMD_C011_DEC_CHAN_SET_PAUSE_DISPLAY_MODE = eCMD_C011_CMD_BASE + 0x12F,
+       eCMD_C011_DEC_CHAN_SET_SLOW_DISPLAY_MODE = eCMD_C011_CMD_BASE + 0x130,
        eCMD_C011_DEC_CHAN_SET_FF_DISPLAY_MODE  = eCMD_C011_CMD_BASE + 0x131,
-       eCMD_C011_DEC_CHAN_SET_DISPLAY_TIMING_MODE      = eCMD_C011_CMD_BASE + 0x132,
-       eCMD_C011_DEC_CHAN_SET_DISPLAY_MODE             = eCMD_C011_CMD_BASE + 0x133,
-       eCMD_C011_DEC_CHAN_GET_DISPLAY_MODE             = eCMD_C011_CMD_BASE + 0x134,
+       eCMD_C011_DEC_CHAN_SET_DISPLAY_TIMING_MODE = eCMD_C011_CMD_BASE +
+                                                                0x132,
+       eCMD_C011_DEC_CHAN_SET_DISPLAY_MODE     = eCMD_C011_CMD_BASE + 0x133,
+       eCMD_C011_DEC_CHAN_GET_DISPLAY_MODE     = eCMD_C011_CMD_BASE + 0x134,
        eCMD_C011_DEC_CHAN_SET_REVERSE_FIELD    = eCMD_C011_CMD_BASE + 0x135,
        eCMD_C011_DEC_CHAN_STREAM_OPEN          = eCMD_C011_CMD_BASE + 0x136,
        eCMD_C011_DEC_CHAN_SET_PCR_PID          = eCMD_C011_CMD_BASE + 0x137,
@@ -328,19 +332,22 @@ enum  c011_ts_cmd {
        eCMD_C011_DEC_CHAN_GET_DISPLAY_ORDER    = eCMD_C011_CMD_BASE + 0x143,
        eCMD_C011_DEC_CHAN_SET_HOST_TRICK_MODE  = eCMD_C011_CMD_BASE + 0x144,
        eCMD_C011_DEC_CHAN_SET_OPERATION_MODE   = eCMD_C011_CMD_BASE + 0x145,
-       eCMD_C011_DEC_CHAN_DISPLAY_PAUSE_UNTO_PTS       = eCMD_C011_CMD_BASE + 0x146,
-       eCMD_C011_DEC_CHAN_SET_PTS_STC_DIFF_THRESHOLD = eCMD_C011_CMD_BASE + 0x147,
+       eCMD_C011_DEC_CHAN_DISPLAY_PAUSE_UNTO_PTS = eCMD_C011_CMD_BASE + 0x146,
+       eCMD_C011_DEC_CHAN_SET_PTS_STC_DIFF_THRESHOLD = eCMD_C011_CMD_BASE +
+                                                                0x147,
        eCMD_C011_DEC_CHAN_SEND_COMPRESSED_BUF  = eCMD_C011_CMD_BASE + 0x148,
        eCMD_C011_DEC_CHAN_SET_CLIPPING         = eCMD_C011_CMD_BASE + 0x149,
        eCMD_C011_DEC_CHAN_SET_PARAMETERS_FOR_HARD_RESET_INTERRUPT_TO_HOST
                = eCMD_C011_CMD_BASE + 0x150,
 
        /* Decoder RevD commands */
-       eCMD_C011_DEC_CHAN_SET_CSC      = eCMD_C011_CMD_BASE + 0x180, /* color space conversion */
+       /* color space conversion */
+       eCMD_C011_DEC_CHAN_SET_CSC      = eCMD_C011_CMD_BASE + 0x180,
        eCMD_C011_DEC_CHAN_SET_RANGE_REMAP      = eCMD_C011_CMD_BASE + 0x181,
        eCMD_C011_DEC_CHAN_SET_FGT              = eCMD_C011_CMD_BASE + 0x182,
        /* Note: 0x183 not implemented yet in Rev D main */
-       eCMD_C011_DEC_CHAN_SET_LASTPICTURE_PADDING = eCMD_C011_CMD_BASE + 0x183,
+       eCMD_C011_DEC_CHAN_SET_LASTPICTURE_PADDING = eCMD_C011_CMD_BASE +
+                                                                0x183,
 
        /* Decoder 7412 commands (7412-only) */
        eCMD_C011_DEC_CHAN_SET_CONTENT_KEY      = eCMD_C011_CMD_BASE + 0x190,
index e617d2fcbb1f7c85baadff62514b0db849aaad85..0c8cb329420f2120cf43d04bb334d874a8f2bdba 100644 (file)
@@ -94,15 +94,19 @@ static bool crystalhd_bring_out_of_rst(struct crystalhd_adp *adp)
         * Enable clocks while 7412 reset is asserted, delay
         * De-assert 7412 reset
         */
-       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+                                        MISC_PERST_DECODER_CTRL);
        rst_deco_cntrl.stop_bcm_7412_clk = 0;
        rst_deco_cntrl.bcm7412_rst = 1;
-       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+                                        rst_deco_cntrl.whole_reg);
        msleep_interruptible(10);
 
-       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+                                        MISC_PERST_DECODER_CTRL);
        rst_deco_cntrl.bcm7412_rst = 0;
-       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+                                        rst_deco_cntrl.whole_reg);
        msleep_interruptible(50);
 
        /* Disable OTP_CONTENT_MISC to 0 to disable all secure modes */
@@ -132,9 +136,11 @@ static bool crystalhd_put_in_reset(struct crystalhd_adp *adp)
         * Assert 7412 reset, delay
         * Assert 7412 stop clock
         */
-       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp, MISC_PERST_DECODER_CTRL);
+       rst_deco_cntrl.whole_reg = crystalhd_reg_rd(adp,
+                                        MISC_PERST_DECODER_CTRL);
        rst_deco_cntrl.stop_bcm_7412_clk = 1;
-       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL, rst_deco_cntrl.whole_reg);
+       crystalhd_reg_wr(adp, MISC_PERST_DECODER_CTRL,
+                                        rst_deco_cntrl.whole_reg);
        msleep_interruptible(50);
 
        /* Bus Arbiter Timeout: GISB_ARBITER_TIMER
@@ -213,7 +219,8 @@ static void crystalhd_clear_errors(struct crystalhd_adp *adp)
 {
        uint32_t reg;
 
-       /* FIXME: jarod: wouldn't we want to write a 0 to the reg? Or does the write clear the bits specified? */
+       /* FIXME: jarod: wouldn't we want to write a 0 to the reg?
+        * Or does the write clear the bits specified? */
        reg = crystalhd_reg_rd(adp, MISC1_Y_RX_ERROR_STATUS);
        if (reg)
                crystalhd_reg_wr(adp, MISC1_Y_RX_ERROR_STATUS, reg);
@@ -263,10 +270,12 @@ static bool crystalhd_load_firmware_config(struct crystalhd_adp *adp)
        crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (BC_DRAM_FW_CFG_ADDR >> 19));
 
        crystalhd_reg_wr(adp, AES_CMD, 0);
-       crystalhd_reg_wr(adp, AES_CONFIG_INFO, (BC_DRAM_FW_CFG_ADDR & 0x7FFFF));
+       crystalhd_reg_wr(adp, AES_CONFIG_INFO,
+                (BC_DRAM_FW_CFG_ADDR & 0x7FFFF));
        crystalhd_reg_wr(adp, AES_CMD, 0x1);
 
-       /* FIXME: jarod: I've seen this fail, and introducing extra delays helps... */
+       /* FIXME: jarod: I've seen this fail, and introducing
+        * extra delays helps... */
        for (i = 0; i < 100; ++i) {
                reg = crystalhd_reg_rd(adp, AES_STATUS);
                if (reg & 0x1)
@@ -349,7 +358,8 @@ static bool crystalhd_stop_device(struct crystalhd_adp *adp)
        return true;
 }
 
-static struct crystalhd_rx_dma_pkt *crystalhd_hw_alloc_rx_pkt(struct crystalhd_hw *hw)
+static struct crystalhd_rx_dma_pkt *crystalhd_hw_alloc_rx_pkt(
+                                       struct crystalhd_hw *hw)
 {
        unsigned long flags = 0;
        struct crystalhd_rx_dma_pkt *temp = NULL;
@@ -484,8 +494,8 @@ hw_create_ioq_err:
 }
 
 
-static bool crystalhd_code_in_full(struct crystalhd_adp *adp, uint32_t needed_sz,
-                                bool b_188_byte_pkts,  uint8_t flags)
+static bool crystalhd_code_in_full(struct crystalhd_adp *adp,
+                uint32_t needed_sz, bool b_188_byte_pkts,  uint8_t flags)
 {
        uint32_t base, end, writep, readp;
        uint32_t cpbSize, cpbFullness, fifoSize;
@@ -525,7 +535,7 @@ static bool crystalhd_code_in_full(struct crystalhd_adp *adp, uint32_t needed_sz
 }
 
 static enum BC_STATUS crystalhd_hw_tx_req_complete(struct crystalhd_hw *hw,
-                                           uint32_t list_id, enum BC_STATUS cs)
+                                        uint32_t list_id, enum BC_STATUS cs)
 {
        struct tx_dma_pkt *tx_req;
 
@@ -536,7 +546,8 @@ static enum BC_STATUS crystalhd_hw_tx_req_complete(struct crystalhd_hw *hw,
 
        hw->pwr_lock--;
 
-       tx_req = (struct tx_dma_pkt *)crystalhd_dioq_find_and_fetch(hw->tx_actq, list_id);
+       tx_req = (struct tx_dma_pkt *)crystalhd_dioq_find_and_fetch(
+                                       hw->tx_actq, list_id);
        if (!tx_req) {
                if (cs != BC_STS_IO_USER_ABORT)
                        BCMLOG_ERR("Find and Fetch Did not find req\n");
@@ -559,7 +570,8 @@ static enum BC_STATUS crystalhd_hw_tx_req_complete(struct crystalhd_hw *hw,
        return crystalhd_dioq_add(hw->tx_freeq, tx_req, false, 0);
 }
 
-static bool crystalhd_tx_list0_handler(struct crystalhd_hw *hw, uint32_t err_sts)
+static bool crystalhd_tx_list0_handler(struct crystalhd_hw *hw,
+                                        uint32_t err_sts)
 {
        uint32_t err_mask, tmp;
        unsigned long flags = 0;
@@ -591,7 +603,8 @@ static bool crystalhd_tx_list0_handler(struct crystalhd_hw *hw, uint32_t err_sts
        return true;
 }
 
-static bool crystalhd_tx_list1_handler(struct crystalhd_hw *hw, uint32_t err_sts)
+static bool crystalhd_tx_list1_handler(struct crystalhd_hw *hw,
+                                        uint32_t err_sts)
 {
        uint32_t err_mask, tmp;
        unsigned long flags = 0;
@@ -663,14 +676,15 @@ static void crystalhd_hw_dump_desc(struct dma_descriptor *p_dma_desc,
        if (!p_dma_desc || !cnt)
                return;
 
-       /* FIXME: jarod: perhaps a modparam desc_debug to enable this, rather than
-        * setting ll (log level, I presume) to non-zero? */
+       /* FIXME: jarod: perhaps a modparam desc_debug to enable this,
+        * rather than setting ll (log level, I presume) to non-zero? */
        if (!ll)
                return;
 
        for (ix = ul_desc_index; ix < (ul_desc_index + cnt); ix++) {
-               BCMLOG(ll, "%s[%d] Buff[%x:%x] Next:[%x:%x] XferSz:%x Intr:%x,Last:%x\n",
-                      ((p_dma_desc[ul_desc_index].dma_dir) ? "TDesc" : "RDesc"),
+               BCMLOG(ll,
+                "%s[%d] Buff[%x:%x] Next:[%x:%x] XferSz:%x Intr:%x,Last:%x\n",
+                ((p_dma_desc[ul_desc_index].dma_dir) ? "TDesc" : "RDesc"),
                       ul_desc_index,
                       p_dma_desc[ul_desc_index].buff_addr_high,
                       p_dma_desc[ul_desc_index].buff_addr_low,
@@ -707,7 +721,8 @@ static enum BC_STATUS crystalhd_hw_fill_desc(struct crystalhd_dio_req *ioreq,
                /* Get SGLE length */
                len = crystalhd_get_sgle_len(ioreq, sg_ix);
                if (len % 4) {
-                       BCMLOG_ERR(" len in sg %d %d %d\n", len, sg_ix, sg_cnt);
+                       BCMLOG_ERR(" len in sg %d %d %d\n", len, sg_ix,
+                                sg_cnt);
                        return BC_STS_NOT_IMPL;
                }
                /* Setup DMA desc with Phy addr & Length at current index. */
@@ -722,7 +737,8 @@ static enum BC_STATUS crystalhd_hw_fill_desc(struct crystalhd_dio_req *ioreq,
                desc[ix].dma_dir        = ioreq->uinfo.dir_tx;
 
                /* Chain DMA descriptor.  */
-               addr_temp.full_addr = desc_phy_addr + sizeof(struct dma_descriptor);
+               addr_temp.full_addr = desc_phy_addr +
+                                        sizeof(struct dma_descriptor);
                desc[ix].next_desc_addr_low = addr_temp.low_part;
                desc[ix].next_desc_addr_high = addr_temp.high_part;
 
@@ -731,8 +747,9 @@ static enum BC_STATUS crystalhd_hw_fill_desc(struct crystalhd_dio_req *ioreq,
 
                /* Debug.. */
                if ((!len) || (len > crystalhd_get_sgle_len(ioreq, sg_ix))) {
-                       BCMLOG_ERR("inv-len(%x) Ix(%d) count:%x xfr_sz:%x sg_cnt:%d\n",
-                                  len, ix, count, xfr_sz, sg_cnt);
+                       BCMLOG_ERR(
+                        "inv-len(%x) Ix(%d) count:%x xfr_sz:%x sg_cnt:%d\n",
+                        len, ix, count, xfr_sz, sg_cnt);
                        return BC_STS_ERROR;
                }
                /* Length expects Multiple of 4 */
@@ -774,7 +791,8 @@ static enum BC_STATUS crystalhd_hw_fill_desc(struct crystalhd_dio_req *ioreq,
        return BC_STS_SUCCESS;
 }
 
-static enum BC_STATUS crystalhd_xlat_sgl_to_dma_desc(struct crystalhd_dio_req *ioreq,
+static enum BC_STATUS crystalhd_xlat_sgl_to_dma_desc(
+                                             struct crystalhd_dio_req *ioreq,
                                              struct dma_desc_mem *pdesc_mem,
                                              uint32_t *uv_desc_index)
 {
@@ -887,12 +905,14 @@ static enum BC_STATUS crystalhd_stop_tx_dma_engine(struct crystalhd_hw *hw)
        while ((l1 || l2) && cnt) {
 
                if (l1) {
-                       l1 = crystalhd_reg_rd(hw->adp, MISC1_TX_FIRST_DESC_L_ADDR_LIST0);
+                       l1 = crystalhd_reg_rd(hw->adp,
+                                MISC1_TX_FIRST_DESC_L_ADDR_LIST0);
                        l1 &= DMA_START_BIT;
                }
 
                if (l2) {
-                       l2 = crystalhd_reg_rd(hw->adp, MISC1_TX_FIRST_DESC_L_ADDR_LIST1);
+                       l2 = crystalhd_reg_rd(hw->adp,
+                                MISC1_TX_FIRST_DESC_L_ADDR_LIST1);
                        l2 &= DMA_START_BIT;
                }
 
@@ -986,7 +1006,8 @@ static uint32_t crystalhd_get_addr_from_pib_Q(struct crystalhd_hw *hw)
        return addr_entry;
 }
 
-static bool crystalhd_rel_addr_to_pib_Q(struct crystalhd_hw *hw, uint32_t addr_to_rel)
+static bool crystalhd_rel_addr_to_pib_Q(struct crystalhd_hw *hw,
+                                        uint32_t addr_to_rel)
 {
        uint32_t Q_addr;
        uint32_t r_offset, w_offset, n_offset;
@@ -1021,7 +1042,8 @@ static bool crystalhd_rel_addr_to_pib_Q(struct crystalhd_hw *hw, uint32_t addr_t
        return true;
 }
 
-static void cpy_pib_to_app(struct c011_pib *src_pib, struct BC_PIC_INFO_BLOCK *dst_pib)
+static void cpy_pib_to_app(struct c011_pib *src_pib,
+                                        struct BC_PIC_INFO_BLOCK *dst_pib)
 {
        if (!src_pib || !dst_pib) {
                BCMLOG_ERR("Invalid Arguments\n");
@@ -1063,11 +1085,13 @@ static void crystalhd_hw_proc_pib(struct crystalhd_hw *hw)
                               (uint32_t *)&src_pib);
 
                if (src_pib.bFormatChange) {
-                       rx_pkt = (struct crystalhd_rx_dma_pkt *)crystalhd_dioq_fetch(hw->rx_freeq);
+                       rx_pkt = (struct crystalhd_rx_dma_pkt *)
+                                       crystalhd_dioq_fetch(hw->rx_freeq);
                        if (!rx_pkt)
                                return;
                        rx_pkt->flags = 0;
-                       rx_pkt->flags |= COMP_FLAG_PIB_VALID | COMP_FLAG_FMT_CHANGE;
+                       rx_pkt->flags |= COMP_FLAG_PIB_VALID |
+                                        COMP_FLAG_FMT_CHANGE;
                        AppPib = &rx_pkt->pib;
                        cpy_pib_to_app(&src_pib, AppPib);
 
@@ -1084,7 +1108,8 @@ static void crystalhd_hw_proc_pib(struct crystalhd_hw *hw)
                               rx_pkt->pib.pulldown,
                               rx_pkt->pib.ycom);
 
-                       crystalhd_dioq_add(hw->rx_rdyq, (void *)rx_pkt, true, rx_pkt->pkt_tag);
+                       crystalhd_dioq_add(hw->rx_rdyq, (void *)rx_pkt, true,
+                                        rx_pkt->pkt_tag);
 
                }
 
@@ -1096,16 +1121,20 @@ static void crystalhd_start_rx_dma_engine(struct crystalhd_hw *hw)
 {
        uint32_t        dma_cntrl;
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
        if (!(dma_cntrl & DMA_START_BIT)) {
                dma_cntrl |= DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
        if (!(dma_cntrl & DMA_START_BIT)) {
                dma_cntrl |= DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
 
        return;
@@ -1116,44 +1145,52 @@ static void crystalhd_stop_rx_dma_engine(struct crystalhd_hw *hw)
        uint32_t dma_cntrl = 0, count = 30;
        uint32_t l0y = 1, l0uv = 1, l1y = 1, l1uv = 1;
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
        if ((dma_cntrl & DMA_START_BIT)) {
                dma_cntrl &= ~DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
        if ((dma_cntrl & DMA_START_BIT)) {
                dma_cntrl &= ~DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
 
        /* Poll for 3 seconds (30 * 100ms) on both the lists.. */
        while ((l0y || l0uv || l1y || l1uv) && count) {
 
                if (l0y) {
-                       l0y = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST0);
+                       l0y = crystalhd_reg_rd(hw->adp,
+                                MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST0);
                        l0y &= DMA_START_BIT;
                        if (!l0y)
                                hw->rx_list_sts[0] &= ~rx_waiting_y_intr;
                }
 
                if (l1y) {
-                       l1y = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST1);
+                       l1y = crystalhd_reg_rd(hw->adp,
+                                MISC1_Y_RX_FIRST_DESC_L_ADDR_LIST1);
                        l1y &= DMA_START_BIT;
                        if (!l1y)
                                hw->rx_list_sts[1] &= ~rx_waiting_y_intr;
                }
 
                if (l0uv) {
-                       l0uv = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST0);
+                       l0uv = crystalhd_reg_rd(hw->adp,
+                                MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST0);
                        l0uv &= DMA_START_BIT;
                        if (!l0uv)
                                hw->rx_list_sts[0] &= ~rx_waiting_uv_intr;
                }
 
                if (l1uv) {
-                       l1uv = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST1);
+                       l1uv = crystalhd_reg_rd(hw->adp,
+                                MISC1_UV_RX_FIRST_DESC_L_ADDR_LIST1);
                        l1uv &= DMA_START_BIT;
                        if (!l1uv)
                                hw->rx_list_sts[1] &= ~rx_waiting_uv_intr;
@@ -1168,7 +1205,8 @@ static void crystalhd_stop_rx_dma_engine(struct crystalhd_hw *hw)
               count, hw->rx_list_sts[0], hw->rx_list_sts[1]);
 }
 
-static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw, struct crystalhd_rx_dma_pkt *rx_pkt)
+static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw,
+                                        struct crystalhd_rx_dma_pkt *rx_pkt)
 {
        uint32_t y_low_addr_reg, y_high_addr_reg;
        uint32_t uv_low_addr_reg, uv_high_addr_reg;
@@ -1186,7 +1224,8 @@ static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw, struct cr
        }
 
        spin_lock_irqsave(&hw->rx_lock, flags);
-       /* FIXME: jarod: sts_free is an enum for 0, in crystalhd_hw.h... yuk... */
+       /* FIXME: jarod: sts_free is an enum for 0,
+        * in crystalhd_hw.h... yuk... */
        if (sts_free != hw->rx_list_sts[hw->rx_list_post_index]) {
                spin_unlock_irqrestore(&hw->rx_lock, flags);
                return BC_STS_BUSY;
@@ -1210,7 +1249,8 @@ static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw, struct cr
        hw->rx_list_post_index = (hw->rx_list_post_index + 1) % DMA_ENGINE_CNT;
        spin_unlock_irqrestore(&hw->rx_lock, flags);
 
-       crystalhd_dioq_add(hw->rx_actq, (void *)rx_pkt, false, rx_pkt->pkt_tag);
+       crystalhd_dioq_add(hw->rx_actq, (void *)rx_pkt, false,
+                        rx_pkt->pkt_tag);
 
        crystalhd_start_rx_dma_engine(hw);
        /* Program the Y descriptor */
@@ -1221,8 +1261,10 @@ static enum BC_STATUS crystalhd_hw_prog_rxdma(struct crystalhd_hw *hw, struct cr
        if (rx_pkt->uv_phy_addr) {
                /* Program the UV descriptor */
                desc_addr.full_addr = rx_pkt->uv_phy_addr;
-               crystalhd_reg_wr(hw->adp, uv_high_addr_reg, desc_addr.high_part);
-               crystalhd_reg_wr(hw->adp, uv_low_addr_reg, desc_addr.low_part | 0x01);
+               crystalhd_reg_wr(hw->adp, uv_high_addr_reg,
+                        desc_addr.high_part);
+               crystalhd_reg_wr(hw->adp, uv_low_addr_reg,
+                        desc_addr.low_part | 0x01);
        }
 
        return BC_STS_SUCCESS;
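
A note on the descriptor writes above: both the Y and UV paths program the
high half of the 64-bit descriptor address first and the low half last, with
bit 0 of the low word acting as the valid/go flag, so the engine never latches
a half-written address. A minimal sketch of the pattern; the register names
and pkt_phy_addr are placeholders, while full_addr/high_part/low_part come
from the driver's own address union:

        desc_addr.full_addr = pkt_phy_addr;     /* assumed: physical descriptor base */
        crystalhd_reg_wr(hw->adp, DESC_U_ADDR /* assumed */, desc_addr.high_part);
        /* Low word goes last; bit 0 marks the descriptor valid. */
        crystalhd_reg_wr(hw->adp, DESC_L_ADDR /* assumed */, desc_addr.low_part | 0x01);
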
@@ -1268,16 +1310,20 @@ static void crystalhd_hw_finalize_pause(struct crystalhd_hw *hw)
 
        hw->stop_pending = 0;
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS);
        if (dma_cntrl & DMA_START_BIT) {
                dma_cntrl &= ~DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_Y_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
 
-       dma_cntrl = crystalhd_reg_rd(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
+       dma_cntrl = crystalhd_reg_rd(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS);
        if (dma_cntrl & DMA_START_BIT) {
                dma_cntrl &= ~DMA_START_BIT;
-               crystalhd_reg_wr(hw->adp, MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
+               crystalhd_reg_wr(hw->adp,
+                        MISC1_UV_RX_SW_DESC_LIST_CTRL_STS, dma_cntrl);
        }
        hw->rx_list_post_index = 0;
 
@@ -1287,8 +1333,8 @@ static void crystalhd_hw_finalize_pause(struct crystalhd_hw *hw)
        crystalhd_reg_wr(hw->adp, PCIE_DLL_DATA_LINK_CONTROL, aspm);
 }
 
-static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw, uint32_t list_index,
-                                    enum BC_STATUS comp_sts)
+static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw,
+                        uint32_t list_index, enum BC_STATUS comp_sts)
 {
        struct crystalhd_rx_dma_pkt *rx_pkt = NULL;
        uint32_t y_dw_dnsz, uv_dw_dnsz;
@@ -1302,7 +1348,8 @@ static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw, uint32_t li
        rx_pkt = crystalhd_dioq_find_and_fetch(hw->rx_actq,
                                             hw->rx_pkt_tag_seed + list_index);
        if (!rx_pkt) {
-               BCMLOG_ERR("Act-Q:PostIx:%x L0Sts:%x L1Sts:%x current L:%x tag:%x comp:%x\n",
+               BCMLOG_ERR(
+               "Act-Q:PostIx:%x L0Sts:%x L1Sts:%x current L:%x tag:%x comp:%x\n",
                           hw->rx_list_post_index, hw->rx_list_sts[0],
                           hw->rx_list_sts[1], list_index,
                           hw->rx_pkt_tag_seed + list_index, comp_sts);
@@ -1324,8 +1371,8 @@ static enum BC_STATUS crystalhd_rx_pkt_done(struct crystalhd_hw *hw, uint32_t li
        return crystalhd_hw_post_cap_buff(hw, rx_pkt);
 }
 
-static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw, uint32_t int_sts,
-                                    uint32_t y_err_sts, uint32_t uv_err_sts)
+static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw,
+                uint32_t int_sts, uint32_t y_err_sts, uint32_t uv_err_sts)
 {
        uint32_t tmp;
        enum list_sts tmp_lsts;
@@ -1367,7 +1414,8 @@ static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw, uint32_t int_sts
                tmp &= ~MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK;
        }
 
-       if (uv_err_sts & MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK) {
+       if (uv_err_sts &
+        MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK) {
                hw->rx_list_sts[0] &= ~rx_uv_mask;
                hw->rx_list_sts[0] |= rx_uv_error;
                tmp &= ~MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK;
@@ -1392,8 +1440,8 @@ static bool crystalhd_rx_list0_handler(struct crystalhd_hw *hw, uint32_t int_sts
        return (tmp_lsts != hw->rx_list_sts[0]);
 }
 
-static bool crystalhd_rx_list1_handler(struct crystalhd_hw *hw, uint32_t int_sts,
-                                    uint32_t y_err_sts, uint32_t uv_err_sts)
+static bool crystalhd_rx_list1_handler(struct crystalhd_hw *hw,
+                uint32_t int_sts, uint32_t y_err_sts, uint32_t uv_err_sts)
 {
        uint32_t tmp;
        enum list_sts tmp_lsts;
@@ -1486,9 +1534,11 @@ static void crystalhd_rx_isr(struct crystalhd_hw *hw, uint32_t intr_sts)
                /* Update States..*/
                spin_lock_irqsave(&hw->rx_lock, flags);
                if (i == 0)
-                       ret = crystalhd_rx_list0_handler(hw, intr_sts, y_err_sts, uv_err_sts);
+                       ret = crystalhd_rx_list0_handler(hw, intr_sts,
+                                        y_err_sts, uv_err_sts);
                else
-                       ret = crystalhd_rx_list1_handler(hw, intr_sts, y_err_sts, uv_err_sts);
+                       ret = crystalhd_rx_list1_handler(hw, intr_sts,
+                                        y_err_sts, uv_err_sts);
                if (ret) {
                        switch (hw->rx_list_sts[i]) {
                        case sts_free:
@@ -1501,11 +1551,13 @@ static void crystalhd_rx_isr(struct crystalhd_hw *hw, uint32_t intr_sts)
                                /* We got error on both or Y or uv. */
                                hw->stats.rx_errors++;
                                crystalhd_get_dnsz(hw, i, &y_dn_sz, &uv_dn_sz);
-                               /* FIXME: jarod: this is where my mini pci-e card is tripping up */
+                               /* FIXME: jarod: this is where
+                                * my mini pci-e card is tripping up */
                                BCMLOG(BCMLOG_DBG, "list_index:%x rx[%d] Y:%x "
                                       "UV:%x Int:%x YDnSz:%x UVDnSz:%x\n",
                                       i, hw->stats.rx_errors, y_err_sts,
-                                      uv_err_sts, intr_sts, y_dn_sz, uv_dn_sz);
+                                      uv_err_sts, intr_sts, y_dn_sz,
+                                                uv_dn_sz);
                                hw->rx_list_sts[i] = sts_free;
                                comp_sts = BC_STS_ERROR;
                                break;
@@ -1567,14 +1619,17 @@ static enum BC_STATUS crystalhd_put_ddr2sleep(struct crystalhd_hw *hw)
        union link_misc_perst_decoder_ctrl rst_cntrl_reg;
 
        /* Pulse reset pin of 7412 (MISC_PERST_DECODER_CTRL) */
-       rst_cntrl_reg.whole_reg = crystalhd_reg_rd(hw->adp, MISC_PERST_DECODER_CTRL);
+       rst_cntrl_reg.whole_reg = crystalhd_reg_rd(hw->adp,
+                                        MISC_PERST_DECODER_CTRL);
 
        rst_cntrl_reg.bcm_7412_rst = 1;
-       crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL, rst_cntrl_reg.whole_reg);
+       crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL,
+                                        rst_cntrl_reg.whole_reg);
        msleep_interruptible(50);
 
        rst_cntrl_reg.bcm_7412_rst = 0;
-       crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL, rst_cntrl_reg.whole_reg);
+       crystalhd_reg_wr(hw->adp, MISC_PERST_DECODER_CTRL,
+                                        rst_cntrl_reg.whole_reg);
 
        /* Close all banks, put DDR in idle */
        bc_dec_reg_wr(hw->adp, SDRAM_PRECHARGE, 0);
@@ -1622,7 +1677,8 @@ static enum BC_STATUS crystalhd_put_ddr2sleep(struct crystalhd_hw *hw)
 **
 *************************************************/
 
-enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp, void *buffer, uint32_t sz)
+enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp, void *buffer,
+                                        uint32_t sz)
 {
        uint32_t reg_data, cnt, *temp_buff;
        uint32_t fw_sig_len = 36;
@@ -1828,7 +1884,8 @@ bool crystalhd_hw_interrupt(struct crystalhd_adp *adp, struct crystalhd_hw *hw)
                        crystalhd_hw_proc_pib(hw);
 
                bc_dec_reg_wr(adp, Stream2Host_Intr_Sts, deco_intr);
-               /* FIXME: jarod: No udelay? might this be the real reason mini pci-e cards were stalling out? */
+               /* FIXME: jarod: No udelay? might this be
+                * the real reason mini pci-e cards were stalling out? */
                bc_dec_reg_wr(adp, Stream2Host_Intr_Sts, 0);
                rc = 1;
        }
@@ -1852,7 +1909,8 @@ bool crystalhd_hw_interrupt(struct crystalhd_adp *adp, struct crystalhd_hw *hw)
        return rc;
 }
 
-enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *hw, struct crystalhd_adp *adp)
+enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *hw,
+                        struct crystalhd_adp *adp)
 {
        if (!hw || !adp) {
                BCMLOG_ERR("Invalid Arguments\n");
@@ -1967,7 +2025,8 @@ enum BC_STATUS crystalhd_hw_setup_dma_rings(struct crystalhd_hw *hw)
                }
                rpkt->desc_mem.pdma_desc_start = mem;
                rpkt->desc_mem.phy_addr = phy_addr;
-               rpkt->desc_mem.sz  = BC_LINK_MAX_SGLS * sizeof(struct dma_descriptor);
+               rpkt->desc_mem.sz  = BC_LINK_MAX_SGLS *
+                                        sizeof(struct dma_descriptor);
                rpkt->pkt_tag = hw->rx_pkt_tag_seed + i;
                crystalhd_hw_free_rx_pkt(hw, rpkt);
        }
@@ -2013,7 +2072,8 @@ enum BC_STATUS crystalhd_hw_free_dma_rings(struct crystalhd_hw *hw)
        return BC_STS_SUCCESS;
 }
 
-enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_dio_req *ioreq,
+enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw,
+                            struct crystalhd_dio_req *ioreq,
                             hw_comp_callback call_back,
                             wait_queue_head_t *cb_event, uint32_t *list_id,
                             uint8_t data_flags)
@@ -2047,7 +2107,8 @@ enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_di
        }
 
        /* Get a list from TxFreeQ */
-       tx_dma_packet = (struct tx_dma_pkt *)crystalhd_dioq_fetch(hw->tx_freeq);
+       tx_dma_packet = (struct tx_dma_pkt *)crystalhd_dioq_fetch(
+                                               hw->tx_freeq);
        if (!tx_dma_packet) {
                BCMLOG_ERR("No empty elements..\n");
                return BC_STS_ERR_USAGE;
@@ -2105,7 +2166,8 @@ enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_di
        crystalhd_start_tx_dma_engine(hw);
        crystalhd_reg_wr(hw->adp, first_desc_u_addr, desc_addr.high_part);
 
-       crystalhd_reg_wr(hw->adp, first_desc_l_addr, desc_addr.low_part | 0x01);
+       crystalhd_reg_wr(hw->adp, first_desc_l_addr, desc_addr.low_part |
+                                        0x01);
                                        /* Be sure we set the valid bit ^^^^ */
 
        return BC_STS_SUCCESS;
@@ -2120,7 +2182,8 @@ enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_di
  *
  * FIX_ME: Not Tested the actual condition..
  */
-enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw, uint32_t list_id)
+enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw,
+                                        uint32_t list_id)
 {
        if (!hw || !list_id) {
                BCMLOG_ERR("Invalid Arguments\n");
@@ -2134,7 +2197,7 @@ enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw, uint32_t list_id)
 }
 
 enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
-                                   struct crystalhd_dio_req *ioreq, bool en_post)
+                                struct crystalhd_dio_req *ioreq, bool en_post)
 {
        struct crystalhd_rx_dma_pkt *rpkt;
        uint32_t tag, uv_desc_ix = 0;
@@ -2154,7 +2217,8 @@ enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
        rpkt->dio_req = ioreq;
        tag = rpkt->pkt_tag;
 
-       sts = crystalhd_xlat_sgl_to_dma_desc(ioreq, &rpkt->desc_mem, &uv_desc_ix);
+       sts = crystalhd_xlat_sgl_to_dma_desc(ioreq, &rpkt->desc_mem,
+                                        &uv_desc_ix);
        if (sts != BC_STS_SUCCESS)
                return sts;
 
@@ -2163,7 +2227,7 @@ enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
        /* Store the address of UV in the rx packet for post*/
        if (uv_desc_ix)
                rpkt->uv_phy_addr = rpkt->desc_mem.phy_addr +
-                                   (sizeof(struct dma_descriptor) * (uv_desc_ix + 1));
+                        (sizeof(struct dma_descriptor) * (uv_desc_ix + 1));
 
        if (en_post)
                sts = crystalhd_hw_post_cap_buff(hw, rpkt);
@@ -2190,7 +2254,8 @@ enum BC_STATUS crystalhd_hw_get_cap_buffer(struct crystalhd_hw *hw,
        rpkt = crystalhd_dioq_fetch_wait(hw->rx_rdyq, timeout, &sig_pending);
        if (!rpkt) {
                if (sig_pending) {
-                       BCMLOG(BCMLOG_INFO, "wait on frame time out %d\n", sig_pending);
+                       BCMLOG(BCMLOG_INFO, "wait on frame time out %d\n",
+                                        sig_pending);
                        return BC_STS_IO_USER_ABORT;
                } else {
                        return BC_STS_TIMEOUT;
@@ -2305,7 +2370,8 @@ enum BC_STATUS crystalhd_hw_suspend(struct crystalhd_hw *hw)
        return BC_STS_SUCCESS;
 }
 
-void crystalhd_hw_stats(struct crystalhd_hw *hw, struct crystalhd_hw_stats *stats)
+void crystalhd_hw_stats(struct crystalhd_hw *hw,
+                struct crystalhd_hw_stats *stats)
 {
        if (!hw) {
                BCMLOG_ERR("Invalid Arguments\n");
@@ -2378,7 +2444,8 @@ enum BC_STATUS crystalhd_hw_set_core_clock(struct crystalhd_hw *hw)
 
                if (reg & 0x00020000) {
                        hw->prev_n = n;
-                       /* FIXME: jarod: outputting a random "C" is... confusing... */
+                       /* FIXME: jarod: outputting
+                        * a random "C" is... confusing... */
                        BCMLOG(BCMLOG_INFO, "C");
                        return BC_STS_SUCCESS;
                } else {
index 2d0e6c6005e54859cf1b551c45136077cd743573..37809442c5539b8a1e5ecde29657f67a61830c89 100644 (file)
@@ -46,7 +46,7 @@
 #define Cpu2HstMbx1            0x00100F04
 #define MbxStat1               0x00100F08
 #define Stream2Host_Intr_Sts   0x00100F24
-#define C011_RET_SUCCESS       0x0     /* Reutrn status of firmware command. */
+#define C011_RET_SUCCESS       0x0 /* Return status of firmware command. */
 
 /* TS input status register */
 #define TS_StreamAFIFOStatus   0x0010044C
 #define BC_FWIMG_ST_ADDR       0x00000000
 /* FIXME: jarod: there's a kernel function that'll do this for us... */
 #define rotr32_1(x, n)         (((x) >> n) | ((x) << (32 - n)))
-#define bswap_32_1(x)          ((rotr32_1((x), 24) & 0x00ff00ff) | (rotr32_1((x), 8) & 0xff00ff00))
+#define bswap_32_1(x) ((rotr32_1((x), 24) & 0x00ff00ff) | (rotr32_1((x), 8) & 0xff00ff00))
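
The FIXME above is right that mainline already provides these helpers:
rotr32_1() is ror32() from <linux/bitops.h>, and bswap_32_1() is swab32()
from <linux/swab.h>. A sketch of the drop-in equivalents (the wrapper name
is illustrative):

        #include <linux/bitops.h>       /* ror32()  */
        #include <linux/swab.h>         /* swab32() */

        /* bswap_32_1() builds a byte swap out of two rotates and masks;
         * swab32() does the same thing in one call. */
        static inline u32 fw_word_swap(u32 x)
        {
                return swab32(x);       /* == bswap_32_1(x) */
        }
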
 
 #define DecHt_HostSwReset      0x340000
 #define BC_DRAM_FW_CFG_ADDR    0x001c2000
@@ -136,9 +136,11 @@ union intr_mask_reg {
 
 union link_misc_perst_deco_ctrl {
        struct {
-               uint32_t        bcm7412_rst:1;          /* 1 -> BCM7412 is held in reset. Reset value 1.*/
+               uint32_t        bcm7412_rst:1;  /* 1 -> BCM7412 is held
+                                               * in reset. Reset value 1. */
                uint32_t        reserved0:3;            /* Reserved. No Effect */
-               uint32_t        stop_bcm_7412_clk:1;    /* 1 ->Stops branch of 27MHz clk used to clk BCM7412*/
+               uint32_t        stop_bcm_7412_clk:1;    /* 1 -> Stops branch of
+                                               * 27MHz clk used to clk BCM7412 */
                uint32_t        reserved1:27;           /* Reserved. No Effect */
        };
 
@@ -148,13 +150,18 @@ union link_misc_perst_deco_ctrl {
 
 union link_misc_perst_clk_ctrl {
        struct {
-               uint32_t        sel_alt_clk:1;    /* When set, selects a 6.75MHz clock as the source of core_clk */
-               uint32_t        stop_core_clk:1;  /* When set, stops the branch of core_clk that is not needed for low power operation */
-               uint32_t        pll_pwr_dn:1;     /* When set, powers down the main PLL. The alternate clock bit should be set
-                                                    to select an alternate clock before setting this bit.*/
+               uint32_t        sel_alt_clk:1;    /* When set, selects a
+                                * 6.75MHz clock as the source of core_clk */
+               uint32_t        stop_core_clk:1;  /* When set, stops the branch
+                * of core_clk that is not needed for low power operation */
+               uint32_t        pll_pwr_dn:1;     /* When set, powers down the
+                        * main PLL. The alternate clock bit should be set to
+                        * select an alternate clock before setting this bit. */
                uint32_t        reserved0:5;      /* Reserved */
-               uint32_t        pll_mult:8;       /* This setting controls the multiplier for the PLL. */
-               uint32_t        pll_div:4;        /* This setting controls the divider for the PLL. */
+               uint32_t        pll_mult:8;       /* This setting controls
+                                                * the multiplier for the PLL. */
+               uint32_t        pll_div:4;        /* This setting controls
+                                                * the divider for the PLL. */
                uint32_t        reserved1:12;     /* Reserved */
        };
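
The field comments above encode an ordering constraint: select the alternate
6.75MHz clock before powering down the PLL. A hedged sketch of that
read-modify-write sequence, assuming this union exposes a whole_reg view like
its decoder-control sibling below, and using an illustrative register offset
name:

        union link_misc_perst_clk_ctrl clk;

        clk.whole_reg = crystalhd_reg_rd(hw->adp, MISC_PERST_CLK_CTRL /* assumed */);
        clk.sel_alt_clk = 1;            /* switch to the 6.75MHz source first */
        crystalhd_reg_wr(hw->adp, MISC_PERST_CLK_CTRL /* assumed */, clk.whole_reg);

        clk.pll_pwr_dn = 1;             /* only now is it safe to drop the PLL */
        crystalhd_reg_wr(hw->adp, MISC_PERST_CLK_CTRL /* assumed */, clk.whole_reg);
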
 
@@ -164,9 +171,11 @@ union link_misc_perst_clk_ctrl {
 
 union link_misc_perst_decoder_ctrl {
        struct {
-               uint32_t        bcm_7412_rst:1; /* 1 -> BCM7412 is held in reset. Reset value 1.*/
+               uint32_t        bcm_7412_rst:1; /* 1 -> BCM7412 is held
+                                                * in reset. Reset value 1. */
                uint32_t        res0:3; /* Reserved. No Effect */
-               uint32_t        stop_7412_clk:1; /* 1 ->Stops branch of 27MHz clk used to clk BCM7412*/
+               uint32_t        stop_7412_clk:1; /* 1 -> Stops branch of 27MHz
+                                                * clk used to clk BCM7412 */
                uint32_t        res1:27; /* Reserved. No Effect */
        };
 
@@ -225,10 +234,12 @@ struct dma_descriptor {   /* 8 32-bit values */
  * The  virtual address will determine what should be freed.
  */
 struct dma_desc_mem {
-       struct dma_descriptor   *pdma_desc_start; /* 32-bytes for dma descriptor. should be first element */
-       dma_addr_t              phy_addr;       /* physical address of each DMA desc */
+       struct dma_descriptor   *pdma_desc_start; /* 32-bytes for dma
+                                * descriptor. should be first element */
+       dma_addr_t              phy_addr;       /* physical address
+                                                * of each DMA desc */
        uint32_t                sz;
-       struct _dma_desc_mem_   *Next;          /* points to Next Descriptor in chain */
+       struct _dma_desc_mem_   *Next; /* points to Next Descriptor in chain */
 
 };
 
@@ -323,50 +334,54 @@ struct crystalhd_hw {
 #define CLOCK_PRESET 175
 
 /* DMA engine register BIT mask wrappers.. */
-#define DMA_START_BIT          MISC1_TX_SW_DESC_LIST_CTRL_STS_TX_DMA_RUN_STOP_MASK
-
-#define GET_RX_INTR_MASK (INTR_INTR_STATUS_L1_UV_RX_DMA_ERR_INTR_MASK |                \
-                         INTR_INTR_STATUS_L1_UV_RX_DMA_DONE_INTR_MASK |        \
-                         INTR_INTR_STATUS_L1_Y_RX_DMA_ERR_INTR_MASK |          \
-                         INTR_INTR_STATUS_L1_Y_RX_DMA_DONE_INTR_MASK |         \
-                         INTR_INTR_STATUS_L0_UV_RX_DMA_ERR_INTR_MASK |         \
-                         INTR_INTR_STATUS_L0_UV_RX_DMA_DONE_INTR_MASK |        \
-                         INTR_INTR_STATUS_L0_Y_RX_DMA_ERR_INTR_MASK |          \
-                         INTR_INTR_STATUS_L0_Y_RX_DMA_DONE_INTR_MASK)
-
-#define GET_Y0_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK |             \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |             \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |       \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
-
-#define GET_UV0_ERR_MSK (MISC1_UV_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK |           \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |           \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |     \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
-
-#define GET_Y1_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK |             \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |             \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |       \
-                       MISC1_Y_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
-
-#define GET_UV1_ERR_MSK        (MISC1_UV_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK |            \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |           \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |     \
-                        MISC1_UV_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
+#define DMA_START_BIT  MISC1_TX_SW_DESC_LIST_CTRL_STS_TX_DMA_RUN_STOP_MASK
+
+#define GET_RX_INTR_MASK (INTR_INTR_STATUS_L1_UV_RX_DMA_ERR_INTR_MASK |        \
+       INTR_INTR_STATUS_L1_UV_RX_DMA_DONE_INTR_MASK |  \
+       INTR_INTR_STATUS_L1_Y_RX_DMA_ERR_INTR_MASK |            \
+       INTR_INTR_STATUS_L1_Y_RX_DMA_DONE_INTR_MASK |           \
+       INTR_INTR_STATUS_L0_UV_RX_DMA_ERR_INTR_MASK |           \
+       INTR_INTR_STATUS_L0_UV_RX_DMA_DONE_INTR_MASK |  \
+       INTR_INTR_STATUS_L0_Y_RX_DMA_ERR_INTR_MASK |            \
+       INTR_INTR_STATUS_L0_Y_RX_DMA_DONE_INTR_MASK)
+
+#define GET_Y0_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK | \
+       MISC1_Y_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |             \
+       MISC1_Y_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |       \
+       MISC1_Y_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
+
+#define GET_UV0_ERR_MSK (MISC1_UV_RX_ERROR_STATUS_RX_L0_OVERRUN_ERROR_MASK | \
+       MISC1_UV_RX_ERROR_STATUS_RX_L0_UNDERRUN_ERROR_MASK |            \
+       MISC1_UV_RX_ERROR_STATUS_RX_L0_DESC_TX_ABORT_ERRORS_MASK |      \
+       MISC1_UV_RX_ERROR_STATUS_RX_L0_FIFO_FULL_ERRORS_MASK)
+
+#define GET_Y1_ERR_MSK (MISC1_Y_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK | \
+       MISC1_Y_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |             \
+       MISC1_Y_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |       \
+       MISC1_Y_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
+
+#define GET_UV1_ERR_MSK        (MISC1_UV_RX_ERROR_STATUS_RX_L1_OVERRUN_ERROR_MASK | \
+       MISC1_UV_RX_ERROR_STATUS_RX_L1_UNDERRUN_ERROR_MASK |            \
+       MISC1_UV_RX_ERROR_STATUS_RX_L1_DESC_TX_ABORT_ERRORS_MASK |      \
+       MISC1_UV_RX_ERROR_STATUS_RX_L1_FIFO_FULL_ERRORS_MASK)
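
These composite masks let the ISR test a single status-register read against
a whole class of events at once. A sketch of the consuming pattern; the
status register name is an assumption, while crystalhd_rx_isr() is the real
handler from the .c file above:

        uint32_t int_sts = crystalhd_reg_rd(hw->adp, INTR_STATUS /* assumed */);

        if (int_sts & GET_RX_INTR_MASK)         /* any of the eight Rx DMA events */
                crystalhd_rx_isr(hw, int_sts);
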
 
 
 /**** API Exposed to the other layers ****/
 enum BC_STATUS crystalhd_download_fw(struct crystalhd_adp *adp,
                              void *buffer, uint32_t sz);
-enum BC_STATUS crystalhd_do_fw_cmd(struct crystalhd_hw *hw, struct BC_FW_CMD *fw_cmd);
-bool crystalhd_hw_interrupt(struct crystalhd_adp *adp, struct crystalhd_hw *hw);
-enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *, struct crystalhd_adp *);
+enum BC_STATUS crystalhd_do_fw_cmd(struct crystalhd_hw *hw,
+                                struct BC_FW_CMD *fw_cmd);
+bool crystalhd_hw_interrupt(struct crystalhd_adp *adp,
+                                struct crystalhd_hw *hw);
+enum BC_STATUS crystalhd_hw_open(struct crystalhd_hw *,
+                                struct crystalhd_adp *);
 enum BC_STATUS crystalhd_hw_close(struct crystalhd_hw *);
 enum BC_STATUS crystalhd_hw_setup_dma_rings(struct crystalhd_hw *);
 enum BC_STATUS crystalhd_hw_free_dma_rings(struct crystalhd_hw *);
 
 
-enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_dio_req *ioreq,
+enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw,
+                            struct crystalhd_dio_req *ioreq,
                             hw_comp_callback call_back,
                             wait_queue_head_t *cb_event,
                             uint32_t *list_id, uint8_t data_flags);
@@ -374,15 +389,17 @@ enum BC_STATUS crystalhd_hw_post_tx(struct crystalhd_hw *hw, struct crystalhd_di
 enum BC_STATUS crystalhd_hw_pause(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_unpause(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_suspend(struct crystalhd_hw *hw);
-enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw, uint32_t list_id);
+enum BC_STATUS crystalhd_hw_cancel_tx(struct crystalhd_hw *hw,
+                                uint32_t list_id);
 enum BC_STATUS crystalhd_hw_add_cap_buffer(struct crystalhd_hw *hw,
-                                   struct crystalhd_dio_req *ioreq, bool en_post);
+                        struct crystalhd_dio_req *ioreq, bool en_post);
 enum BC_STATUS crystalhd_hw_get_cap_buffer(struct crystalhd_hw *hw,
                                    struct BC_PIC_INFO_BLOCK *pib,
                                    struct crystalhd_dio_req **ioreq);
 enum BC_STATUS crystalhd_hw_stop_capture(struct crystalhd_hw *hw);
 enum BC_STATUS crystalhd_hw_start_capture(struct crystalhd_hw *hw);
-void crystalhd_hw_stats(struct crystalhd_hw *hw, struct crystalhd_hw_stats *stats);
+void crystalhd_hw_stats(struct crystalhd_hw *hw,
+                        struct crystalhd_hw_stats *stats);
 
 /* API to program the core clock on the decoder */
 enum BC_STATUS crystalhd_hw_set_core_clock(struct crystalhd_hw *);
index 85f51fb18425b9b6bc7a0ccd43a8ae91be6fb945..c1f6163cdeb81b545bdbe592b8c871b9c5310ce1 100644 (file)
@@ -75,7 +75,8 @@ static int chd_dec_disable_int(struct crystalhd_adp *adp)
        return 0;
 }
 
-struct crystalhd_ioctl_data *chd_dec_alloc_iodata(struct crystalhd_adp *adp, bool isr)
+struct crystalhd_ioctl_data *chd_dec_alloc_iodata(struct crystalhd_adp *adp,
+                                        bool isr)
 {
        unsigned long flags = 0;
        struct crystalhd_ioctl_data *temp;
@@ -95,8 +96,8 @@ struct crystalhd_ioctl_data *chd_dec_alloc_iodata(struct crystalhd_adp *adp, boo
        return temp;
 }
 
-void chd_dec_free_iodata(struct crystalhd_adp *adp, struct crystalhd_ioctl_data *iodata,
-                        bool isr)
+void chd_dec_free_iodata(struct crystalhd_adp *adp,
+                        struct crystalhd_ioctl_data *iodata, bool isr)
 {
        unsigned long flags = 0;
 
@@ -109,7 +110,8 @@ void chd_dec_free_iodata(struct crystalhd_adp *adp, struct crystalhd_ioctl_data
        spin_unlock_irqrestore(&adp->lock, flags);
 }
 
-static inline int crystalhd_user_data(unsigned long ud, void *dr, int size, int set)
+static inline int crystalhd_user_data(unsigned long ud, void *dr,
+                        int size, int set)
 {
        int rc;
 
@@ -131,8 +133,8 @@ static inline int crystalhd_user_data(unsigned long ud, void *dr, int size, int
        return rc;
 }
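
This hunk only re-wraps crystalhd_user_data()'s signature; its body is
elided. From the signature and the "push add_cdata" caller below, it is
presumably a thin dispatcher over the user-copy primitives, keyed on set.
A guess at that shape, not a copy of the actual body:

        /* set != 0: push kernel data out to userspace; else pull it in. */
        if (set)
                rc = copy_to_user((void __user *)ud, dr, size);
        else
                rc = copy_from_user(dr, (void __user *)ud, size);
        if (rc)
                rc = -ENODATA;  /* mirrors the callers' error convention */
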
 
-static int chd_dec_fetch_cdata(struct crystalhd_adp *adp, struct crystalhd_ioctl_data *io,
-                              uint32_t m_sz, unsigned long ua)
+static int chd_dec_fetch_cdata(struct crystalhd_adp *adp,
+        struct crystalhd_ioctl_data *io, uint32_t m_sz, unsigned long ua)
 {
        unsigned long ua_off;
        int rc = 0;
@@ -163,7 +165,7 @@ static int chd_dec_fetch_cdata(struct crystalhd_adp *adp, struct crystalhd_ioctl
 }
 
 static int chd_dec_release_cdata(struct crystalhd_adp *adp,
-                                struct crystalhd_ioctl_data *io, unsigned long ua)
+                        struct crystalhd_ioctl_data *io, unsigned long ua)
 {
        unsigned long ua_off;
        int rc;
@@ -178,8 +180,9 @@ static int chd_dec_release_cdata(struct crystalhd_adp *adp,
                rc = crystalhd_user_data(ua_off, io->add_cdata,
                                        io->add_cdata_sz, 1);
                if (rc) {
-                       BCMLOG_ERR("failed to push add_cdata sz:%x ua_off:%x\n",
-                                  io->add_cdata_sz, (unsigned int)ua_off);
+                       BCMLOG_ERR(
+                               "failed to push add_cdata sz:%x ua_off:%x\n",
+                               io->add_cdata_sz, (unsigned int)ua_off);
                        return -ENODATA;
                }
        }
@@ -252,10 +255,7 @@ static int chd_dec_api_cmd(struct crystalhd_adp *adp, unsigned long ua,
                rc = chd_dec_proc_user_data(adp, temp, ua, 1);
        }
 
-       if (temp) {
-               chd_dec_free_iodata(adp, temp, 0);
-               temp = NULL;
-       }
+       chd_dec_free_iodata(adp, temp, 0);
 
        return rc;
 }
@@ -378,8 +378,8 @@ static int chd_dec_init_chdev(struct crystalhd_adp *adp)
                goto class_create_fail;
        }
 
-       dev = device_create(crystalhd_class, NULL, MKDEV(adp->chd_dec_major, 0),
-                           NULL, "crystalhd");
+       dev = device_create(crystalhd_class, NULL,
+                        MKDEV(adp->chd_dec_major, 0), NULL, "crystalhd");
        if (IS_ERR(dev)) {
                rc = PTR_ERR(dev);
                BCMLOG_ERR("failed to create device\n");
@@ -394,7 +394,8 @@ static int chd_dec_init_chdev(struct crystalhd_adp *adp)
 
        /* Allocate general purpose ioctl pool. */
        for (i = 0; i < CHD_IODATA_POOL_SZ; i++) {
-               temp = kzalloc(sizeof(struct crystalhd_ioctl_data), GFP_KERNEL);
+               temp = kzalloc(sizeof(struct crystalhd_ioctl_data),
+                                        GFP_KERNEL);
                if (!temp) {
                        BCMLOG_ERR("ioctl data pool kzalloc failed\n");
                        rc = -ENOMEM;
index a9e36336d097b5d8ae8850d58ec199c43f7b4dbf..bac572a8bc2e164553cf2046421d5681905b07e1 100644 (file)
@@ -77,8 +77,8 @@ struct crystalhd_adp {
        int             chd_dec_major;
        unsigned int            cfg_users;
 
-       struct crystalhd_ioctl_data     *idata_free_head;       /* ioctl data pool */
-       struct crystalhd_elem           *elem_pool_head;        /* Queue element pool */
+       struct crystalhd_ioctl_data     *idata_free_head; /* ioctl data pool */
+       struct crystalhd_elem   *elem_pool_head; /* Queue element pool */
 
        struct crystalhd_cmd    cmds;
 
index a5f109c632dc58c5248a93c905e04a40d5a47604..51f698052aff3576ea20f1fc56dd7557d617d291 100644 (file)
 
 uint32_t g_linklog_level;
 
-static inline uint32_t crystalhd_dram_rd(struct crystalhd_adp *adp, uint32_t mem_off)
+static inline uint32_t crystalhd_dram_rd(struct crystalhd_adp *adp,
+                                        uint32_t mem_off)
 {
        crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (mem_off >> 19));
        return bc_dec_reg_rd(adp, (0x00380000 | (mem_off & 0x0007FFFF)));
 }
 
-static inline void crystalhd_dram_wr(struct crystalhd_adp *adp, uint32_t mem_off, uint32_t val)
+static inline void crystalhd_dram_wr(struct crystalhd_adp *adp,
+                                        uint32_t mem_off, uint32_t val)
 {
        crystalhd_reg_wr(adp, DCI_DRAM_BASE_ADDR, (mem_off >> 19));
        bc_dec_reg_wr(adp, (0x00380000 | (mem_off & 0x0007FFFF)), val);
 }
 
-static inline enum BC_STATUS bc_chk_dram_range(struct crystalhd_adp *adp, uint32_t start_off, uint32_t cnt)
+static inline enum BC_STATUS bc_chk_dram_range(struct crystalhd_adp *adp,
+                                        uint32_t start_off, uint32_t cnt)
 {
        return BC_STS_SUCCESS;
 }
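
For reference, the two DRAM accessors above use a banked-window scheme:
DCI_DRAM_BASE_ADDR latches which 512KiB page of decoder DRAM is visible, and
the low 19 bits of the offset then index a fixed aperture at 0x00380000. A
worked example of the split for mem_off = 0x000C1234:

        uint32_t page = 0x000C1234 >> 19;       /* = 1, the second 512KiB page */
        uint32_t addr = 0x00380000 | (0x000C1234 & 0x0007FFFF);
                                                /* = 0x003C1234 in the window  */
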
@@ -66,7 +69,8 @@ static struct crystalhd_dio_req *crystalhd_alloc_dio(struct crystalhd_adp *adp)
        return temp;
 }
 
-static void crystalhd_free_dio(struct crystalhd_adp *adp, struct crystalhd_dio_req *dio)
+static void crystalhd_free_dio(struct crystalhd_adp *adp,
+                                        struct crystalhd_dio_req *dio)
 {
        unsigned long flags = 0;
 
@@ -99,7 +103,8 @@ static struct crystalhd_elem *crystalhd_alloc_elem(struct crystalhd_adp *adp)
 
        return temp;
 }
-static void crystalhd_free_elem(struct crystalhd_adp *adp, struct crystalhd_elem *elem)
+static void crystalhd_free_elem(struct crystalhd_adp *adp,
+                                        struct crystalhd_elem *elem)
 {
        unsigned long flags = 0;
 
@@ -120,7 +125,8 @@ static inline void crystalhd_set_sg(struct scatterlist *sg, struct page *page,
 #endif
 }
 
-static inline void crystalhd_init_sg(struct scatterlist *sg, unsigned int entries)
+static inline void crystalhd_init_sg(struct scatterlist *sg,
+                                        unsigned int entries)
 {
        /* http://lkml.org/lkml/2007/11/27/68 */
        sg_init_table(sg, entries);
@@ -208,7 +214,8 @@ uint32_t crystalhd_reg_rd(struct crystalhd_adp *adp, uint32_t reg_off)
  * configuration space.
  *
  */
-void crystalhd_reg_wr(struct crystalhd_adp *adp, uint32_t reg_off, uint32_t val)
+void crystalhd_reg_wr(struct crystalhd_adp *adp, uint32_t reg_off,
+                                        uint32_t val)
 {
        if (!adp || (reg_off > adp->pci_i2o_len)) {
                BCMLOG_ERR("link_wr_reg_off outof range: 0x%08x\n", reg_off);
@@ -469,7 +476,8 @@ enum BC_STATUS crystalhd_create_dioq(struct crystalhd_adp *adp,
  * by calling the call back provided during creation.
  *
  */
-void crystalhd_delete_dioq(struct crystalhd_adp *adp, struct crystalhd_dioq *dioq)
+void crystalhd_delete_dioq(struct crystalhd_adp *adp,
+                        struct crystalhd_dioq *dioq)
 {
        void *temp;
 
@@ -639,7 +647,8 @@ void *crystalhd_dioq_fetch_wait(struct crystalhd_dioq *ioq, uint32_t to_secs,
        while ((ioq->count == 0) && count) {
                spin_unlock_irqrestore(&ioq->lock, flags);
 
-               crystalhd_wait_on_event(&ioq->event, (ioq->count > 0), 1000, rc, 0);
+               crystalhd_wait_on_event(&ioq->event,
+                                (ioq->count > 0), 1000, rc, 0);
                if (rc == 0) {
                        goto out;
                } else if (rc == -EINTR) {
@@ -678,7 +687,8 @@ enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *adp, void *ubuff,
                          struct crystalhd_dio_req **dio_hnd)
 {
        struct crystalhd_dio_req        *dio;
-       /* FIXME: jarod: should some of these unsigned longs be uint32_t or uintptr_t? */
+       /* FIXME: jarod: should some of these
+        * unsigned longs be uint32_t or uintptr_t? */
        unsigned long start = 0, end = 0, uaddr = 0, count = 0;
        unsigned long spsz = 0, uv_start = 0;
        int i = 0, rw = 0, res = 0, nr_pages = 0, skip_fb_sg = 0;
@@ -723,7 +733,8 @@ enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *adp, void *ubuff,
        if (uv_offset) {
                uv_start = (uaddr + (unsigned long)uv_offset)  >> PAGE_SHIFT;
                dio->uinfo.uv_sg_ix = uv_start - start;
-               dio->uinfo.uv_sg_off = ((uaddr + (unsigned long)uv_offset) & ~PAGE_MASK);
+               dio->uinfo.uv_sg_off = ((uaddr + (unsigned long)uv_offset) &
+                                        ~PAGE_MASK);
        }
 
        dio->fb_size = ubuff_sz & 0x03;
@@ -819,7 +830,8 @@ enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *adp, void *ubuff,
  *
  * This routine is to unmap the user buffer pages.
  */
-enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *adp, struct crystalhd_dio_req *dio)
+enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *adp,
+                                struct crystalhd_dio_req *dio)
 {
        struct page *page = NULL;
        int j = 0;
@@ -841,7 +853,8 @@ enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *adp, struct crystalhd_d
                }
        }
        if (dio->sig == crystalhd_dio_sg_mapped)
-               pci_unmap_sg(adp->pdev, dio->sg, dio->page_cnt, dio->direction);
+               pci_unmap_sg(adp->pdev, dio->sg, dio->page_cnt,
+                        dio->direction);
 
        crystalhd_free_dio(adp, dio);
 
index 8cdaa7a3481460834ab541126d657ce9814318bc..4dae3a797e956380d4e0b43aaa1852f65c264ffd 100644 (file)
@@ -127,12 +127,16 @@ uint32_t crystalhd_reg_rd(struct crystalhd_adp *, uint32_t);
 void crystalhd_reg_wr(struct crystalhd_adp *, uint32_t, uint32_t);
 
 /*========= Decoder (7412) memory access routines..=================*/
-enum BC_STATUS crystalhd_mem_rd(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
-enum BC_STATUS crystalhd_mem_wr(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_mem_rd(struct crystalhd_adp *,
+                        uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_mem_wr(struct crystalhd_adp *,
+                        uint32_t, uint32_t, uint32_t *);
 
 /*==========Link (70012) PCIe Config access routines.================*/
-enum BC_STATUS crystalhd_pci_cfg_rd(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t *);
-enum BC_STATUS crystalhd_pci_cfg_wr(struct crystalhd_adp *, uint32_t, uint32_t, uint32_t);
+enum BC_STATUS crystalhd_pci_cfg_rd(struct crystalhd_adp *,
+                        uint32_t, uint32_t, uint32_t *);
+enum BC_STATUS crystalhd_pci_cfg_wr(struct crystalhd_adp *,
+                        uint32_t, uint32_t, uint32_t);
 
 /*========= Linux Kernel Interface routines. ======================= */
 void *bc_kern_dma_alloc(struct crystalhd_adp *, uint32_t, dma_addr_t *);
@@ -168,20 +172,26 @@ do {                                                                      \
 /*================ Direct IO mapping routines ==================*/
 extern int crystalhd_create_dio_pool(struct crystalhd_adp *, uint32_t);
 extern void crystalhd_destroy_dio_pool(struct crystalhd_adp *);
-extern enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *, void *, uint32_t,
-                                  uint32_t, bool, bool, struct crystalhd_dio_req**);
+extern enum BC_STATUS crystalhd_map_dio(struct crystalhd_adp *, void *,
+                uint32_t, uint32_t, bool, bool, struct crystalhd_dio_req**);
 
-extern enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *, struct crystalhd_dio_req*);
+extern enum BC_STATUS crystalhd_unmap_dio(struct crystalhd_adp *,
+                                        struct crystalhd_dio_req*);
 #define crystalhd_get_sgle_paddr(_dio, _ix) (cpu_to_le64(sg_dma_address(&_dio->sg[_ix])))
 #define crystalhd_get_sgle_len(_dio, _ix) (cpu_to_le32(sg_dma_len(&_dio->sg[_ix])))
 
 /*================ General Purpose Queues ==================*/
-extern enum BC_STATUS crystalhd_create_dioq(struct crystalhd_adp *, struct crystalhd_dioq **, crystalhd_data_free_cb , void *);
-extern void crystalhd_delete_dioq(struct crystalhd_adp *, struct crystalhd_dioq *);
-extern enum BC_STATUS crystalhd_dioq_add(struct crystalhd_dioq *ioq, void *data, bool wake, uint32_t tag);
+extern enum BC_STATUS crystalhd_create_dioq(struct crystalhd_adp *,
+                struct crystalhd_dioq **, crystalhd_data_free_cb , void *);
+extern void crystalhd_delete_dioq(struct crystalhd_adp *,
+                struct crystalhd_dioq *);
+extern enum BC_STATUS crystalhd_dioq_add(struct crystalhd_dioq *ioq,
+                void *data, bool wake, uint32_t tag);
 extern void *crystalhd_dioq_fetch(struct crystalhd_dioq *ioq);
-extern void *crystalhd_dioq_find_and_fetch(struct crystalhd_dioq *ioq, uint32_t tag);
-extern void *crystalhd_dioq_fetch_wait(struct crystalhd_dioq *ioq, uint32_t to_secs, uint32_t *sig_pend);
+extern void *crystalhd_dioq_find_and_fetch(struct crystalhd_dioq *ioq,
+                uint32_t tag);
+extern void *crystalhd_dioq_fetch_wait(struct crystalhd_dioq *ioq,
+                uint32_t to_secs, uint32_t *sig_pend);
 
 #define crystalhd_dioq_count(_ioq)     ((_ioq) ? _ioq->count : 0)
 
@@ -190,7 +200,8 @@ extern void crystalhd_delete_elem_pool(struct crystalhd_adp *);
 
 
 /*================ Debug routines/macros .. ================================*/
-extern void crystalhd_show_buffer(uint32_t off, uint8_t *buff, uint32_t dwcount);
+extern void crystalhd_show_buffer(uint32_t off, uint8_t *buff,
+                uint32_t dwcount);
 
 enum _chd_log_levels {
        BCMLOG_ERROR            = 0x80000000,   /* Don't disable this option */
index b53a9e29a97c455da865070f0db114d29d364910..d795852ccb1c0ccf31b9907857824a8b991fcd6f 100644 (file)
@@ -241,67 +241,72 @@ static int bh_thread_function(void *arg)
        this_thread = &priv->bh_thread;
 
        t = timeout = 0;
-    while (!kthread_should_stop()) {
-        /* wait until an error occurs, or we need to process something. */
-        unifi_trace(priv, UDBG3, "bh_thread goes to sleep.\n");
-
-        if (timeout > 0) {
-            /* Convert t in ms to jiffies */
-            t = msecs_to_jiffies(timeout);
-            ret = wait_event_interruptible_timeout(this_thread->wakeup_q,
-                    (this_thread->wakeup_flag && !this_thread->block_thread) ||
-                    kthread_should_stop(),
-                    t);
-            timeout = (ret > 0) ? jiffies_to_msecs(ret) : 0;
-        } else {
-            ret = wait_event_interruptible(this_thread->wakeup_q,
-                    (this_thread->wakeup_flag && !this_thread->block_thread) ||
-                    kthread_should_stop());
-        }
-
-        if (kthread_should_stop()) {
-            unifi_trace(priv, UDBG2, "bh_thread: signalled to exit\n");
-            break;
-        }
-
-        if (ret < 0) {
-            unifi_notice(priv,
-                    "bh_thread: wait_event returned %d, thread will exit\n",
-                    ret);
-            uf_wait_for_thread_to_stop(priv, this_thread);
-            break;
-        }
-
-        this_thread->wakeup_flag = 0;
-
-        unifi_trace(priv, UDBG3, "bh_thread calls unifi_bh().\n");
-
-        CsrSdioClaim(priv->sdio);
-        csrResult = unifi_bh(priv->card, &timeout);
-        if(csrResult != CSR_RESULT_SUCCESS) {
-            if (csrResult == CSR_WIFI_HIP_RESULT_NO_DEVICE) {
-                CsrSdioRelease(priv->sdio);
-                uf_wait_for_thread_to_stop(priv, this_thread);
-                break;
-            }
-            /* Errors must be delivered to the error task */
-            handle_bh_error(priv);
-        }
-        CsrSdioRelease(priv->sdio);
-    }
-
-    /*
-     * I would normally try to call csr_sdio_remove_irq() here to make sure
-     * that we do not get any interrupts while this thread is not running.
-     * However, the MMC/SDIO driver tries to kill its' interrupt thread.
-     * The kernel threads implementation does not allow to kill threads
-     * from a signalled to stop thread.
-     * So, instead call csr_sdio_linux_remove_irq() always after calling
-     * uf_stop_thread() to kill this thread.
-     */
-
-    unifi_trace(priv, UDBG2, "bh_thread exiting....\n");
-    return 0;
+       while (!kthread_should_stop()) {
+               /*
+                * wait until an error occurs,
+                * or we need to process something.
+                */
+               unifi_trace(priv, UDBG3, "bh_thread goes to sleep.\n");
+
+               if (timeout > 0) {
+                       /* Convert t in ms to jiffies */
+                       t = msecs_to_jiffies(timeout);
+                       ret = wait_event_interruptible_timeout(
+                               this_thread->wakeup_q,
+                               (this_thread->wakeup_flag && !this_thread->block_thread) ||
+                               kthread_should_stop(),
+                               t);
+                       timeout = (ret > 0) ? jiffies_to_msecs(ret) : 0;
+               } else {
+                       ret = wait_event_interruptible(this_thread->wakeup_q,
+                               (this_thread->wakeup_flag && !this_thread->block_thread) ||
+                               kthread_should_stop());
+               }
+
+               if (kthread_should_stop()) {
+                       unifi_trace(priv, UDBG2,
+                               "bh_thread: signalled to exit\n");
+                       break;
+               }
+
+               if (ret < 0) {
+                       unifi_notice(priv,
+                               "bh_thread: wait_event returned %d, thread will exit\n",
+                               ret);
+                       uf_wait_for_thread_to_stop(priv, this_thread);
+                       break;
+               }
+
+               this_thread->wakeup_flag = 0;
+
+               unifi_trace(priv, UDBG3, "bh_thread calls unifi_bh().\n");
+
+               CsrSdioClaim(priv->sdio);
+               csrResult = unifi_bh(priv->card, &timeout);
+               if (csrResult != CSR_RESULT_SUCCESS) {
+                       if (csrResult == CSR_WIFI_HIP_RESULT_NO_DEVICE) {
+                               CsrSdioRelease(priv->sdio);
+                               uf_wait_for_thread_to_stop(priv, this_thread);
+                               break;
+                       }
+                       /* Errors must be delivered to the error task */
+                       handle_bh_error(priv);
+               }
+               CsrSdioRelease(priv->sdio);
+       }
+
+       /*
+        * I would normally try to call csr_sdio_remove_irq() here to make sure
+        * that we do not get any interrupts while this thread is not running.
+        * However, the MMC/SDIO driver tries to kill its interrupt thread.
+        * The kernel threads implementation does not allow killing a thread
+        * from a thread that has itself been signalled to stop.
+        * So, instead call csr_sdio_linux_remove_irq() always after calling
+        * uf_stop_thread() to kill this thread.
+        */
+
+       unifi_trace(priv, UDBG2, "bh_thread exiting....\n");
+       return 0;
 } /* bh_thread_function() */
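
The reindented loop above carries an unused millisecond budget across
iterations: unifi_bh() hands back a timeout, the wait converts it to jiffies,
and any unexpired remainder is converted back to milliseconds for the next
pass. A minimal standalone sketch of that carry pattern, assuming only the
standard kernel wait/jiffies APIs (every other name is illustrative):

        #include <linux/jiffies.h>
        #include <linux/wait.h>

        static u32 sleep_with_carry(wait_queue_head_t *wq, int *flag, u32 budget_ms)
        {
                long rem;

                if (!budget_ms) {
                        /* No budget: block until explicitly woken. */
                        wait_event_interruptible(*wq, *flag);
                        return 0;
                }

                rem = wait_event_interruptible_timeout(*wq, *flag,
                                                msecs_to_jiffies(budget_ms));
                /* rem > 0 means woken early: hand back the leftover in ms. */
                return rem > 0 ? jiffies_to_msecs(rem) : 0;
        }
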
 
 
@@ -319,33 +324,33 @@ static int bh_thread_function(void *arg)
  *      0 on success or else a Linux error code.
  * ---------------------------------------------------------------------------
  */
-    int
+int
 uf_init_bh(unifi_priv_t *priv)
 {
-    int r;
+       int r;
 
-    /* Enable mlme interface. */
-    priv->io_aborted = 0;
+       /* Enable mlme interface. */
+       priv->io_aborted = 0;
 
 
-    /* Start the BH thread */
-    r = uf_start_thread(priv, &priv->bh_thread, bh_thread_function);
-    if (r) {
-        unifi_error(priv,
-                "uf_init_bh: failed to start the BH thread.\n");
-        return r;
-    }
+       /* Start the BH thread */
+       r = uf_start_thread(priv, &priv->bh_thread, bh_thread_function);
+       if (r) {
+               unifi_error(priv,
+                       "uf_init_bh: failed to start the BH thread.\n");
+               return r;
+       }
 
-    /* Allow interrupts */
-    r = csr_sdio_linux_install_irq(priv->sdio);
-    if (r) {
-        unifi_error(priv,
-                "uf_init_bh: failed to install the IRQ.\n");
+       /* Allow interrupts */
+       r = csr_sdio_linux_install_irq(priv->sdio);
+       if (r) {
+               unifi_error(priv,
+                       "uf_init_bh: failed to install the IRQ.\n");
 
-        uf_stop_thread(priv, &priv->bh_thread);
-    }
+               uf_stop_thread(priv, &priv->bh_thread);
+       }
 
-    return r;
+       return r;
 } /* uf_init_bh() */
 
 
@@ -370,28 +375,30 @@ uf_init_bh(unifi_priv_t *priv)
  */
 CsrResult unifi_run_bh(void *ospriv)
 {
-    unifi_priv_t *priv = ospriv;
-
-    /*
-     * If an error has occurred, we discard silently all messages from the bh
-     * until the error has been processed and the unifi has been reinitialised.
-     */
-    if (priv->bh_thread.block_thread == 1) {
-        unifi_trace(priv, UDBG3, "unifi_run_bh: discard message.\n");
-        /*
-         * Do not try to acknowledge a pending interrupt here.
-         * This function is called by unifi_send_signal() which in turn can be
-         * running in an atomic or 'disabled irq' level if a signal is sent
-         * from a workqueue task (i.e multicass addresses set).
-         * We can not hold the SDIO lock because it might sleep.
-         */
-        return CSR_RESULT_FAILURE;
-    }
-
-    priv->bh_thread.wakeup_flag = 1;
-    /* wake up I/O thread */
-    wake_up_interruptible(&priv->bh_thread.wakeup_q);
-
-    return CSR_RESULT_SUCCESS;
+       unifi_priv_t *priv = ospriv;
+
+       /*
+        * If an error has occurred, we silently discard all messages from the
+        * bh until the error has been processed and the unifi has been
+        * reinitialised.
+        */
+       if (priv->bh_thread.block_thread == 1) {
+               unifi_trace(priv, UDBG3, "unifi_run_bh: discard message.\n");
+               /*
+                * Do not try to acknowledge a pending interrupt here.
+                * This function is called by unifi_send_signal()
+                * which in turn can be running in an atomic or 'disabled irq'
+                * level if a signal is sent from a workqueue task
+                * (i.e. multicast addresses set). We cannot hold the SDIO lock
+                * because it might sleep.
+                */
+               return CSR_RESULT_FAILURE;
+       }
+
+       priv->bh_thread.wakeup_flag = 1;
+       /* wake up I/O thread */
+       wake_up_interruptible(&priv->bh_thread.wakeup_q);
+
+       return CSR_RESULT_SUCCESS;
 } /* unifi_run_bh() */
 
index 2aabb6c6b0af0fc6e9fe17cc47a9bf3601ee6e05..98122bce1427d2ed40f6602d5e40e20460c6dedd 100644 (file)
@@ -1,10 +1,10 @@
 /*****************************************************************************
 
-            (c) Cambridge Silicon Radio Limited 2010
-            All rights reserved and confidential information of CSR
+               (c) Cambridge Silicon Radio Limited 2010
+               All rights reserved and confidential information of CSR
 
-            Refer to LICENSE.txt included with this source for details
-            on the license terms.
+               Refer to LICENSE.txt included with this source for details
+               on the license terms.
 
 *****************************************************************************/
 
  *----------------------------------------------------------------------------*/
 void CsrThreadSleep(u16 sleepTimeInMs)
 {
-    unsigned long t;
+       unsigned long t;
 
-    /* Convert t in ms to jiffies and round up */
-    t = ((sleepTimeInMs * HZ) + 999) / 1000;
-    schedule_timeout_uninterruptible(t);
+       /* Convert t in ms to jiffies and round up */
+       t = ((sleepTimeInMs * HZ) + 999) / 1000;
+       schedule_timeout_uninterruptible(t);
 }
 EXPORT_SYMBOL_GPL(CsrThreadSleep);
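
The open-coded ((ms * HZ) + 999) / 1000 above is exactly the round-up that
msecs_to_jiffies() performs, so the body could likely shrink to a single
call; a sketch assuming only <linux/jiffies.h> and <linux/sched.h>:

        void CsrThreadSleep(u16 sleepTimeInMs)
        {
                /* msecs_to_jiffies() rounds up, matching the manual math. */
                schedule_timeout_uninterruptible(msecs_to_jiffies(sleepTimeInMs));
        }
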
index e8ae490c09d6d6def6f1d381d7425889f34d8f1d..6d26ac6173b01e3921f02e58c9e18006cd140ad4 100644 (file)
@@ -2,11 +2,11 @@
 #define CSR_FRAMEWORK_EXT_H__
 /*****************************************************************************
 
-            (c) Cambridge Silicon Radio Limited 2010
-            All rights reserved and confidential information of CSR
+               (c) Cambridge Silicon Radio Limited 2010
+               All rights reserved and confidential information of CSR
 
-            Refer to LICENSE.txt included with this source for details
-            on the license terms.
+               Refer to LICENSE.txt included with this source for details
+               on the license terms.
 
 *****************************************************************************/
 
index e048848883d54f7ca1f8d418df4effa25f355f31..bfebb1529779d17c9bf23a4dce63633b06e02141 100644 (file)
@@ -21,10 +21,10 @@ void CsrWifiNmeApUpstreamStateHandlers(void* drvpriv, CsrWifiFsmEvent* msg)
             CsrWifiNmeApStopCfmHandler(drvpriv, msg);
             break;
         case CSR_WIFI_NME_AP_CONFIG_SET_CFM:
-            CsrWifiNmeApConfigSetCfmHandler(drvpriv,msg);
+            CsrWifiNmeApConfigSetCfmHandler(drvpriv, msg);
             break;
         default:
-           unifi_error(drvpriv, "CsrWifiNmeApUpstreamStateHandlers: unhandled NME_AP message type 0x%.4X\n",msg->type);
+           unifi_error(drvpriv, "CsrWifiNmeApUpstreamStateHandlers: unhandled NME_AP message type 0x%.4X\n", msg->type);
             break;
     }
 }
index bdc25236ab008e8b21e43e071bf09a616bf212eb..92898de921f550019637e8a74ad363f3ce3d7d68 100644 (file)
@@ -1159,13 +1159,13 @@ unifi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
             break;
 #ifdef CSR_SUPPORT_SME
           case UNIFI_CFG_CORE_DUMP:
-            CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
+            CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
             unifi_trace(priv, UDBG2, "UNIFI_CFG_CORE_DUMP: sent wifi off indication\n");
             break;
 #endif
 #ifdef CSR_SUPPORT_WEXT_AP
           case UNIFI_CFG_SET_AP_CONFIG:
-            r= unifi_cfg_set_ap_config(priv,(unsigned char*)arg);
+            r= unifi_cfg_set_ap_config(priv, (unsigned char*)arg);
             break;
 #endif
           default:
@@ -1275,7 +1275,7 @@ unifi_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
             /* Attach the network device to the stack */
             if (!interfacePriv->netdev_registered)
             {
-                r = uf_register_netdev(priv,interfaceTag);
+                r = uf_register_netdev(priv, interfaceTag);
                 if (r) {
                     unifi_error(priv, "Failed to register the network device.\n");
                     goto out;
index fe4a7ba2acc95fec9d520053476366d99a654ab0..f903022b40792cf96c55d769dff7b72ede6f6daf 100644 (file)
@@ -117,7 +117,7 @@ static CsrResult signal_buffer_init(unifi_priv_t * priv, int size)
          if (priv->rxSignalBuffer.rx_buff[i].bufptr == NULL)
          {
              int j;
-             unifi_error(priv,"signal_buffer_init:Failed to Allocate shared memory for T-H signals \n");
+             unifi_error(priv, "signal_buffer_init:Failed to Allocate shared memory for T-H signals \n");
              for(j=0;j<i;j++)
              {
                  priv->rxSignalBuffer.rx_buff[j].sig_len=0;
@@ -360,13 +360,13 @@ register_unifi_sdio(CsrSdioFunction *sdio_dev, int bus_id, struct device *dev)
 
         for(i=1;i<CSR_WIFI_NUM_INTERFACES;i++)
         {
-            if( !uf_alloc_netdevice_for_other_interfaces(priv,i) )
+            if( !uf_alloc_netdevice_for_other_interfaces(priv, i) )
             {
                 /* error occured while allocating the net_device for interface[i]. The net_device are
                  * allocated for the interfaces with id<i. Dont worry, all the allocated net_device will
                  * be releasing chen the control goes to the label failed0.
                  */
-                unifi_error(priv, "Failed to allocate driver private for interface[%d]\n",i);
+                unifi_error(priv, "Failed to allocate driver private for interface[%d]\n", i);
                 goto failed0;
             }
             else
@@ -391,12 +391,12 @@ register_unifi_sdio(CsrSdioFunction *sdio_dev, int bus_id, struct device *dev)
 #ifdef CSR_WIFI_RX_PATH_SPLIT
     if (signal_buffer_init(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE))
     {
-        unifi_error(priv,"Failed to allocate shared memory for T-H signals\n");
+        unifi_error(priv, "Failed to allocate shared memory for T-H signals\n");
         goto failed2;
     }
     priv->rx_workqueue = create_singlethread_workqueue("rx_workq");
     if (priv->rx_workqueue == NULL) {
-        unifi_error(priv,"create_singlethread_workqueue failed \n");
+        unifi_error(priv, "create_singlethread_workqueue failed \n");
         goto failed3;
     }
     INIT_WORK(&priv->rx_work_struct, rx_wq_handler);
@@ -442,7 +442,7 @@ if (log_hip_signals)
     flush_workqueue(priv->rx_workqueue);
     destroy_workqueue(priv->rx_workqueue);
 failed3:
-    signal_buffer_free(priv,CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
+    signal_buffer_free(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
 failed2:
 #endif
     /* Remove the device nodes */
@@ -558,8 +558,8 @@ cleanup_unifi_sdio(unifi_priv_t *priv)
     /* Free any packets left in the Rx queues */
     for(i=0;i<CSR_WIFI_NUM_INTERFACES;i++)
     {
-        uf_free_pending_rx_packets(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address,i);
-        uf_free_pending_rx_packets(priv, UF_CONTROLLED_PORT_Q, broadcast_address,i);
+        uf_free_pending_rx_packets(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, i);
+        uf_free_pending_rx_packets(priv, UF_CONTROLLED_PORT_Q, broadcast_address, i);
     }
     /*
      * We need to free the resources held by the core, which include tx skbs,
@@ -595,7 +595,7 @@ cleanup_unifi_sdio(unifi_priv_t *priv)
 #ifdef CSR_WIFI_RX_PATH_SPLIT
     flush_workqueue(priv->rx_workqueue);
     destroy_workqueue(priv->rx_workqueue);
-    signal_buffer_free(priv,CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
+    signal_buffer_free(priv, CSR_WIFI_RX_SIGNAL_BUFFER_SIZE);
 #endif
 
     /* Priv is freed as part of the net_device */
index a0177d998978fe25524a333fc557bdada13d2c42..5ead2d40411588d91d24883db487902dc26c8175 100644 (file)
@@ -754,7 +754,7 @@ get_packet_priority(unifi_priv_t *priv, struct sk_buff *skb, const struct ethhdr
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
             {
                 CsrWifiRouterCtrlStaInfo_t * dstStaInfo =
-                    CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,ehdr->h_dest, interfacePriv->InterfaceTag);
+                    CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, ehdr->h_dest, interfacePriv->InterfaceTag);
                 unifi_trace(priv, UDBG4, "mode is AP \n");
                 if (!(ehdr->h_dest[0] & 0x01) && dstStaInfo && dstStaInfo->wmmOrQosEnabled) {
                     /* If packet is not Broadcast/multicast */
@@ -1011,7 +1011,7 @@ skb_80211_to_ether(unifi_priv_t *priv, struct sk_buff *skb,
 #endif
 
     if(skb== NULL || daddr == NULL || saddr == NULL){
-        unifi_error(priv,"skb_80211_to_ether: PBC fail\n");
+        unifi_error(priv, "skb_80211_to_ether: PBC fail\n");
         return 1;
     }
 
@@ -1198,7 +1198,7 @@ int prepare_and_add_macheader(unifi_priv_t *priv, struct sk_buff *skb, struct sk
     u8 bQosNull = false;
 
     if (skb == NULL) {
-        unifi_error(priv,"prepare_and_add_macheader: Invalid SKB reference\n");
+        unifi_error(priv, "prepare_and_add_macheader: Invalid SKB reference\n");
         return -1;
     }
 
@@ -1383,7 +1383,7 @@ int prepare_and_add_macheader(unifi_priv_t *priv, struct sk_buff *skb, struct sk
             macHeaderLengthInBytes -= ETH_ALEN;
             break;
         default:
-            unifi_error(priv,"Unknown direction =%d : Not handled now\n",direction);
+            unifi_error(priv, "Unknown direction =%d : Not handled now\n", direction);
             return -1;
     }
     /* 2 bytes of frame control field, appended by firmware */
@@ -1569,8 +1569,8 @@ send_ma_pkt_request(unifi_priv_t *priv, struct sk_buff *skb, const struct ethhdr
     memcpy(peerAddress.a, ((u8 *) bulkdata.d[0].os_data_ptr) + 4, ETH_ALEN);
 
     unifi_trace(priv, UDBG5, "RA[0]=%x, RA[1]=%x, RA[2]=%x, RA[3]=%x, RA[4]=%x, RA[5]=%x\n",
-                peerAddress.a[0],peerAddress.a[1], peerAddress.a[2], peerAddress.a[3],
-                peerAddress.a[4],peerAddress.a[5]);
+                peerAddress.a[0], peerAddress.a[1], peerAddress.a[2], peerAddress.a[3],
+                peerAddress.a[4], peerAddress.a[5]);
 
 
     if ((proto == ETH_P_PAE)
@@ -1865,10 +1865,10 @@ unifi_pause_xmit(void *ospriv, unifi_TrafficQueue queue)
 
 #ifdef CSR_SUPPORT_SME
     if(queue<=3) {
-        routerStartBuffering(priv,queue);
-        unifi_trace(priv,UDBG2,"Start buffering %d\n", queue);
+        routerStartBuffering(priv, queue);
+        unifi_trace(priv, UDBG2, "Start buffering %d\n", queue);
      } else {
-        routerStartBuffering(priv,0);
+        routerStartBuffering(priv, 0);
         unifi_error(priv, "Start buffering %d defaulting to 0\n", queue);
      }
 #endif
@@ -1893,11 +1893,11 @@ unifi_restart_xmit(void *ospriv, unifi_TrafficQueue queue)
 
 #ifdef CSR_SUPPORT_SME
     if(queue <=3) {
-        routerStopBuffering(priv,queue);
-        uf_send_buffered_frames(priv,queue);
+        routerStopBuffering(priv, queue);
+        uf_send_buffered_frames(priv, queue);
     } else {
-        routerStopBuffering(priv,0);
-        uf_send_buffered_frames(priv,0);
+        routerStopBuffering(priv, 0);
+        uf_send_buffered_frames(priv, 0);
     }
 #endif
 } /* unifi_restart_xmit() */
@@ -2102,14 +2102,14 @@ uf_resume_data_plane(unifi_priv_t *priv, int queue,
             netif_tx_schedule_all(priv->netdev[interfaceTag]);
         }
 #endif
-        uf_process_rx_pending_queue(priv, queue, peer_address, 1,interfaceTag);
+        uf_process_rx_pending_queue(priv, queue, peer_address, 1, interfaceTag);
     }
 } /* uf_resume_data_plane() */
 
 
-void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue, CsrWifiMacAddress peer_address,u16 interfaceTag)
+void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue, CsrWifiMacAddress peer_address, u16 interfaceTag)
 {
-    uf_process_rx_pending_queue(priv, queue, peer_address, 0,interfaceTag);
+    uf_process_rx_pending_queue(priv, queue, peer_address, 0, interfaceTag);
 
 } /* uf_free_pending_rx_packets() */
 
@@ -2153,7 +2153,7 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES)
     {
         unifi_error(priv, "%s: MA-PACKET indication with bad interfaceTag %d\n", __FUNCTION__, interfaceTag);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2167,7 +2167,7 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
 
     if (bulkdata->d[0].data_length == 0) {
         unifi_warning(priv, "%s: MA-PACKET indication with zero bulk data\n", __FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2179,8 +2179,8 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
     toDs = (skb->data[1] & 0x01) ? 1 : 0;
     fromDs = (skb->data[1] & 0x02) ? 1 : 0;
 
-    memcpy(da,(skb->data+4+toDs*12),ETH_ALEN);/* Address1 or 3 */
-    memcpy(sa,(skb->data+10+fromDs*(6+toDs*8)),ETH_ALEN); /* Address2, 3 or 4 */
+    memcpy(da, (skb->data+4+toDs*12), ETH_ALEN);/* Address1 or 3 */
+    memcpy(sa, (skb->data+10+fromDs*(6+toDs*8)), ETH_ALEN); /* Address2, 3 or 4 */
 
 
     pData = &bulkdata->d[0];
@@ -2189,7 +2189,7 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
 
     dataFrameType =((frameControl & 0x00f0) >> 4);
     unifi_trace(priv, UDBG6,
-                "%s: Receive Data Frame Type %d \n", __FUNCTION__,dataFrameType);
+                "%s: Receive Data Frame Type %d \n", __FUNCTION__, dataFrameType);
 
     switch(dataFrameType)
     {
@@ -2276,7 +2276,7 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
 
         /* AP/P2PGO specific handling here */
         CsrWifiRouterCtrlStaInfo_t * srcStaInfo =
-            CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,sa,interfaceTag);
+            CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
 
         /* Defensive check only; Source address is already checked in
         process_ma_packet_ind and we should have a valid source address here */
@@ -2284,10 +2284,10 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
          if(srcStaInfo == NULL) {
             CsrWifiMacAddress peerMacAddress;
             /* Unknown data PDU */
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG1, "%s: Unexpected frame from peer = %x:%x:%x:%x:%x:%x\n", __FUNCTION__,
-            sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             unifi_net_data_free(priv, &bulkdata->d[0]);
             return;
         }
@@ -2296,11 +2296,11 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
         if (port_action != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN) {
             /* Drop the packet and return */
             CsrWifiMacAddress peerMacAddress;
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG3, "%s: Port is not open: unexpected frame from peer = %x:%x:%x:%x:%x:%x\n",
-                        __FUNCTION__, sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
+                        __FUNCTION__, sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
 
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             interfacePriv->stats.rx_dropped++;
             unifi_net_data_free(priv, &bulkdata->d[0]);
             unifi_notice(priv, "%s: Dropping packet, proto=0x%04x, %s port\n", __FUNCTION__,
@@ -2328,7 +2328,7 @@ unifi_rx(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_data_param_t *bulkdata)
         {
             return;
         }
-        unifi_trace(priv, UDBG5, "unifi_rx: no specific AP handling process as normal frame, MAC Header len %d\n",macHeaderLengthInBytes);
+        unifi_trace(priv, UDBG5, "unifi_rx: no specific AP handling process as normal frame, MAC Header len %d\n", macHeaderLengthInBytes);
         /* Remove the MAC header for subsequent conversion */
         skb_pull(skb, macHeaderLengthInBytes);
         pData->os_data_ptr = skb->data;
@@ -2422,7 +2422,7 @@ static void process_ma_packet_cfm(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     if(interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
        interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
 
-        uf_process_ma_pkt_cfm_for_ap(priv,interfaceTag,pkt_cfm);
+        uf_process_ma_pkt_cfm_for_ap(priv, interfaceTag, pkt_cfm);
     } else if (interfacePriv->m4_sent && (pkt_cfm->HostTag == interfacePriv->m4_hostTag)) {
         /* Check if this is a confirm for EAPOL M4 frame and we need to send transmistted ind*/
         CsrResult result = pkt_cfm->TransmissionStatus == CSR_TX_SUCCESSFUL?CSR_RESULT_SUCCESS:CSR_RESULT_FAILURE;
@@ -2486,7 +2486,7 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES)
     {
         unifi_error(priv, "%s: MA-PACKET indication with bad interfaceTag %d\n", __FUNCTION__, interfaceTag);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2500,7 +2500,7 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
 
     if (bulkdata->d[0].data_length == 0) {
         unifi_warning(priv, "%s: MA-PACKET indication with zero bulk data\n", __FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
     /* For monitor mode we need to pass this indication to the registered application
@@ -2508,8 +2508,8 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     /* MIC failure is already taken care of so no need to send the PDUs which are not successfully received in non-monitor mode*/
     if(pkt_ind->ReceptionStatus != CSR_RX_SUCCESS)
     {
-        unifi_warning(priv, "%s: MA-PACKET indication with status = %d\n",__FUNCTION__, pkt_ind->ReceptionStatus);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_warning(priv, "%s: MA-PACKET indication with status = %d\n", __FUNCTION__, pkt_ind->ReceptionStatus);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2521,8 +2521,8 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     toDs = (skb->data[1] & 0x01) ? 1 : 0;
     fromDs = (skb->data[1] & 0x02) ? 1 : 0;
 
-    memcpy(da,(skb->data+4+toDs*12),ETH_ALEN);/* Address1 or 3 */
-    memcpy(sa,(skb->data+10+fromDs*(6+toDs*8)),ETH_ALEN); /* Address2, 3 or 4 */
+    memcpy(da, (skb->data+4+toDs*12), ETH_ALEN);/* Address1 or 3 */
+    memcpy(sa, (skb->data+10+fromDs*(6+toDs*8)), ETH_ALEN); /* Address2, 3 or 4 */
 
     /* Find the BSSID, which will be used to match the BA session */
     if (toDs && fromDs)
@@ -2539,7 +2539,7 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     frameControl = CSR_GET_UINT16_FROM_LITTLE_ENDIAN(pData->os_data_ptr);
     frameType = ((frameControl & 0x000C) >> 2);
 
-    unifi_trace(priv, UDBG3, "Rx Frame Type: %d sn: %d\n",frameType,
+    unifi_trace(priv, UDBG3, "Rx Frame Type: %d sn: %d\n", frameType,
          (le16_to_cpu(*((u16*)(bulkdata->d[0].os_data_ptr + IEEE802_11_SEQUENCE_CONTROL_OFFSET))) >> 4) & 0xfff);
     if(frameType == IEEE802_11_FRAMETYPE_CONTROL){
 #ifdef CSR_SUPPORT_SME
@@ -2550,18 +2550,18 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
             u8 pmBit = (frameControl & 0x1000)?0x01:0x00;
             unifi_trace(priv, UDBG6, "%s: Received PS-POLL Frame\n", __FUNCTION__);
 
-            uf_process_ps_poll(priv,sa,da,pmBit,interfaceTag);
+            uf_process_ps_poll(priv, sa, da, pmBit, interfaceTag);
         }
         else {
             unifi_warning(priv, "%s: Non PS-POLL control frame is received\n", __FUNCTION__);
         }
 #endif
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
     if(frameType != IEEE802_11_FRAMETYPE_DATA) {
-        unifi_warning(priv, "%s: Non control Non Data frame is received\n",__FUNCTION__);
-        unifi_net_data_free(priv,&bulkdata->d[0]);
+        unifi_warning(priv, "%s: Non control Non Data frame is received\n", __FUNCTION__);
+        unifi_net_data_free(priv, &bulkdata->d[0]);
         return;
     }
 
@@ -2569,15 +2569,15 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
     if((interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP) ||
        (interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO)){
 
-        srcStaInfo = CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,sa,interfaceTag);
+        srcStaInfo = CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
 
         if(srcStaInfo == NULL) {
             CsrWifiMacAddress peerMacAddress;
             /* Unknown data PDU */
-            memcpy(peerMacAddress.a,sa,ETH_ALEN);
+            memcpy(peerMacAddress.a, sa, ETH_ALEN);
             unifi_trace(priv, UDBG1, "%s: Unexpected frame from peer = %x:%x:%x:%x:%x:%x\n", __FUNCTION__,
-            sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+            sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+            CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
             unifi_net_data_free(priv, &bulkdata->d[0]);
             return;
         }
@@ -2591,7 +2591,7 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
         */
 
         pmBit = (frameControl & 0x1000)?0x01:0x00;
-        powerSaveChanged = uf_process_pm_bit_for_peer(priv,srcStaInfo,pmBit,interfaceTag);
+        powerSaveChanged = uf_process_pm_bit_for_peer(priv, srcStaInfo, pmBit, interfaceTag);
 
         /* Update station last activity time */
         srcStaInfo->activity_flag = TRUE;
@@ -2616,8 +2616,8 @@ static void process_ma_packet_ind(unifi_priv_t *priv, CSR_SIGNAL *signal, bulk_d
                 else{
                     qosControl = CSR_GET_UINT16_FROM_LITTLE_ENDIAN(pData->os_data_ptr + 24);
                 }
-                unifi_trace(priv, UDBG5, "%s: Check if U-APSD operations are triggered for qosControl: 0x%x\n",__FUNCTION__,qosControl);
-                uf_process_wmm_deliver_ac_uapsd(priv,srcStaInfo,qosControl,interfaceTag);
+                unifi_trace(priv, UDBG5, "%s: Check if U-APSD operations are triggered for qosControl: 0x%x\n", __FUNCTION__, qosControl);
+                uf_process_wmm_deliver_ac_uapsd(priv, srcStaInfo, qosControl, interfaceTag);
             }
         }
     }
@@ -2918,8 +2918,8 @@ uf_netdev_event(struct notifier_block *notif, unsigned long event, void* ptr) {
             interfacePriv->connected = UnifiConnected;
             interfacePriv->wait_netdev_change = FALSE;
             /* Note: passing the broadcast address here will allow anyone to attempt to join our adhoc network */
-            uf_process_rx_pending_queue(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, 1,interfacePriv->InterfaceTag);
-            uf_process_rx_pending_queue(priv, UF_CONTROLLED_PORT_Q, broadcast_address, 1,interfacePriv->InterfaceTag);
+            uf_process_rx_pending_queue(priv, UF_UNCONTROLLED_PORT_Q, broadcast_address, 1, interfacePriv->InterfaceTag);
+            uf_process_rx_pending_queue(priv, UF_CONTROLLED_PORT_Q, broadcast_address, 1, interfacePriv->InterfaceTag);
         }
         break;
 
index 30271d35af55e41862d0dffb4c541b43d742d599..2b503c23efae1385205111620736decf0fecdbc4 100644 (file)
@@ -1135,8 +1135,8 @@ uf_glue_sdio_remove(struct sdio_func *func)
  * them from the list passed in csr_sdio_register_driver().
  */
 static const struct sdio_device_id unifi_ids[] = {
-    { SDIO_DEVICE(SDIO_MANF_ID_CSR,SDIO_CARD_ID_UNIFI_3) },
-    { SDIO_DEVICE(SDIO_MANF_ID_CSR,SDIO_CARD_ID_UNIFI_4) },
+    { SDIO_DEVICE(SDIO_MANF_ID_CSR, SDIO_CARD_ID_UNIFI_3) },
+    { SDIO_DEVICE(SDIO_MANF_ID_CSR, SDIO_CARD_ID_UNIFI_4) },
     { /* end: all zeroes */                            },
 };
 
index d88ccd5bd42861405844e350069ed56cdecf16a6..0c6e21636e7f4bf451e6eb1ea61e5ffac7a1c838 100644 (file)
@@ -1280,7 +1280,7 @@ int sme_sys_suspend(unifi_priv_t *priv)
         return -EIO;
 
     /* Suspend the SME, which MAY cause it to power down UniFi */
-    CsrWifiRouterCtrlSuspendIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, 0, priv->wol_suspend);
+    CsrWifiRouterCtrlSuspendIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, 0, priv->wol_suspend);
     r = sme_wait_for_reply(priv, UNIFI_SME_SYS_LONG_TIMEOUT);
     if (r) {
         /* No reply - forcibly power down in case the request wasn't processed */
@@ -1366,7 +1366,7 @@ int sme_sys_resume(unifi_priv_t *priv)
     if (r)
         return -EIO;
 
-    CsrWifiRouterCtrlResumeIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, priv->wol_suspend);
+    CsrWifiRouterCtrlResumeIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, priv->wol_suspend);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_SYS_LONG_TIMEOUT);
     if (r)
@@ -1377,7 +1377,7 @@ int sme_sys_resume(unifi_priv_t *priv)
 }
 
 #ifdef CSR_SUPPORT_WEXT_AP
-int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag)
+int sme_ap_stop(unifi_priv_t *priv, u16 interface_tag)
 {
     int r;
 
@@ -1390,7 +1390,7 @@ int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag)
     if (r)
         return -EIO;
 
-    CsrWifiNmeApStopReqSend(0,interface_tag);
+    CsrWifiNmeApStopReqSend(0, interface_tag);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);
     if (r)
@@ -1403,12 +1403,12 @@ int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag)
 
 }
 
-int sme_ap_start(unifi_priv_t *priv,u16 interface_tag,
+int sme_ap_start(unifi_priv_t *priv, u16 interface_tag,
                  CsrWifiSmeApConfig_t * ap_config)
 {
     int r;
     CsrWifiSmeApP2pGoConfig p2p_go_param;
-    memset(&p2p_go_param,0,sizeof(CsrWifiSmeApP2pGoConfig));
+    memset(&p2p_go_param, 0, sizeof(CsrWifiSmeApP2pGoConfig));
 
     if (priv->smepriv == NULL) {
         unifi_error(priv, "sme_ap_start: invalid smepriv\n");
@@ -1419,10 +1419,10 @@ int sme_ap_start(unifi_priv_t *priv,u16 interface_tag,
     if (r)
         return -EIO;
 
-    CsrWifiNmeApStartReqSend(0,interface_tag,CSR_WIFI_AP_TYPE_LEGACY,FALSE,
-                             ap_config->ssid,1,ap_config->channel,
-                             ap_config->credentials,ap_config->max_connections,
-                             p2p_go_param,FALSE);
+    CsrWifiNmeApStartReqSend(0, interface_tag, CSR_WIFI_AP_TYPE_LEGACY, FALSE,
+                             ap_config->ssid, 1, ap_config->channel,
+                             ap_config->credentials, ap_config->max_connections,
+                             p2p_go_param, FALSE);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);
     if (r)
@@ -1440,7 +1440,7 @@ int sme_ap_config(unifi_priv_t *priv,
 {
     int r;
     CsrWifiSmeApP2pGoConfig p2p_go_param;
-    memset(&p2p_go_param,0,sizeof(CsrWifiSmeApP2pGoConfig));
+    memset(&p2p_go_param, 0, sizeof(CsrWifiSmeApP2pGoConfig));
 
     if (priv->smepriv == NULL) {
         unifi_error(priv, "sme_ap_config: invalid smepriv\n");
@@ -1451,7 +1451,7 @@ int sme_ap_config(unifi_priv_t *priv,
     if (r)
         return -EIO;
 
-    CsrWifiNmeApConfigSetReqSend(0,*group_security_config,
+    CsrWifiNmeApConfigSetReqSend(0, *group_security_config,
                                  *ap_mac_config);
 
     r = sme_wait_for_reply(priv, UNIFI_SME_MGT_SHORT_TIMEOUT);
index ca55249bde3e7e29cd49cc7690192f3c4ab93719..d0b9be31e12c44bf4b29f17f770aee3516b68b6c 100644 (file)
@@ -55,7 +55,7 @@ uf_sme_deinit(unifi_priv_t *priv)
 
 int sme_mgt_wifi_on(unifi_priv_t *priv)
 {
-    int r,i;
+    int r, i;
     s32 csrResult;
 
     if (priv == NULL) {
index b1151a28d8e33785ca7156a2698d33237b850680..b5258d71d2503abb0bc1d46d334d32d2eacae48c 100644 (file)
@@ -158,7 +158,7 @@ void CsrWifiRouterCtrlMediaStatusReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
         unifi_error(priv, "CsrWifiRouterCtrlMediaStatusReqHandler: invalid interfaceTag\n");
         return;
     }
-    unifi_trace(priv, UDBG3, "CsrWifiRouterCtrlMediaStatusReqHandler: Mode = %d req->mediaStatus = %d\n",interfacePriv->interfaceMode,req->mediaStatus);
+    unifi_trace(priv, UDBG3, "CsrWifiRouterCtrlMediaStatusReqHandler: Mode = %d req->mediaStatus = %d\n", interfacePriv->interfaceMode, req->mediaStatus);
     if (interfacePriv->interfaceMode != CSR_WIFI_ROUTER_CTRL_MODE_AMP) {
         bulk_data_desc_t bulk_data;
 
@@ -389,7 +389,7 @@ void CsrWifiRouterCtrlHipReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
         unifi_error(priv,
                     "CsrWifiRouterCtrlHipReqHandler: Failed to send signal (0x%.4X - %u)\n",
                     *((u16*)signal_ptr), r);
-        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
+        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, CSR_WIFI_SME_CONTROL_INDICATION_ERROR);
     }
 
     unifi_trace(priv, UDBG4, "CsrWifiRouterCtrlHipReqHandler: <----\n");
@@ -474,7 +474,7 @@ uf_send_gratuitous_arp(unifi_priv_t *priv, u16 interfaceTag)
     r = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if (r)
     {
-        unifi_error(priv, "CsrWifiSmeRoamCompleteIndHandler: failed to send QOS data null packet result: %d\n",r);
+        unifi_error(priv, "CsrWifiSmeRoamCompleteIndHandler: failed to send QOS data null packet result: %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
         return;
     }
@@ -574,7 +574,7 @@ configure_data_port(unifi_priv_t *priv,
 
             /* If port is closed, discard all the pending Rx packets */
             if (port_action == CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD) {
-                uf_free_pending_rx_packets(priv, queue, *macAddress,interfaceTag);
+                uf_free_pending_rx_packets(priv, queue, *macAddress, interfaceTag);
             }
         }
     } else {
@@ -645,7 +645,7 @@ configure_data_port(unifi_priv_t *priv,
          * coming from the peer station.
          */
         if (port_action == CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD) {
-            uf_free_pending_rx_packets(priv, queue, *macAddress,interfaceTag);
+            uf_free_pending_rx_packets(priv, queue, *macAddress, interfaceTag);
         }
 
        unifi_trace(priv, UDBG2,
@@ -712,7 +712,7 @@ void CsrWifiRouterCtrlPortConfigureReqHandler(void* drvpriv, CsrWifiFsmEvent* ms
     configure_data_port(priv, req->controlledPortAction, (const CsrWifiMacAddress *)&req->macAddress,
                         UF_CONTROLLED_PORT_Q, req->interfaceTag);
 
-    CsrWifiRouterCtrlPortConfigureCfmSend(msg->source,req->clientData,req->interfaceTag,
+    CsrWifiRouterCtrlPortConfigureCfmSend(msg->source, req->clientData, req->interfaceTag,
                                       CSR_RESULT_SUCCESS, req->macAddress);
     unifi_trace(priv, UDBG3, "leaving CsrWifiRouterCtrlPortConfigureReqHandler\n");
 }
@@ -723,7 +723,7 @@ void CsrWifiRouterCtrlWifiOnReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
     CsrWifiRouterCtrlVersions versions;
     CsrWifiRouterCtrlWifiOnReq* req = (CsrWifiRouterCtrlWifiOnReq*)msg;
-    int r,i;
+    int r, i;
     CsrResult csrResult;
 
     if (priv == NULL) {
@@ -963,7 +963,7 @@ void CsrWifiRouterCtrlWifiOffReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
     }
     wifi_off(priv);
 
-    CsrWifiRouterCtrlWifiOffCfmSend(msg->source,req->clientData);
+    CsrWifiRouterCtrlWifiOffCfmSend(msg->source, req->clientData);
 
     /* If this is called in response to closing the character device, the
      * caller must use uf_sme_cancel_request() to terminate any pending SME
@@ -1239,7 +1239,7 @@ void CsrWifiRouterMaPacketSubscribeReqHandler(void* drvpriv, CsrWifiFsmEvent* ms
     unifi_trace(priv, UDBG1,
                 "subscribe_req: encap=%d, handle=%d, result=%d\n",
                 req->encapsulation, i, result);
-    CsrWifiRouterMaPacketSubscribeCfmSend(msg->source,req->interfaceTag, i, result, 0);
+    CsrWifiRouterMaPacketSubscribeCfmSend(msg->source, req->interfaceTag, i, result, 0);
 }
 
 
@@ -1268,7 +1268,7 @@ void CsrWifiRouterMaPacketUnsubscribeReqHandler(void* drvpriv, CsrWifiFsmEvent*
     unifi_trace(priv, UDBG1,
                 "unsubscribe_req: handle=%d, result=%d\n",
                 req->subscriptionHandle, result);
-    CsrWifiRouterMaPacketUnsubscribeCfmSend(msg->source,req->interfaceTag, result);
+    CsrWifiRouterMaPacketUnsubscribeCfmSend(msg->source, req->interfaceTag, result);
 }
 
 
@@ -1282,7 +1282,7 @@ void CsrWifiRouterCtrlCapabilitiesReqHandler(void* drvpriv, CsrWifiFsmEvent* msg
         return;
     }
 
-    CsrWifiRouterCtrlCapabilitiesCfmSend(msg->source,req->clientData,
+    CsrWifiRouterCtrlCapabilitiesCfmSend(msg->source, req->clientData,
             UNIFI_SOFT_COMMAND_Q_LENGTH - 1,
             UNIFI_SOFT_TRAFFIC_Q_LENGTH - 1);
 }
@@ -1404,7 +1404,7 @@ _sys_packet_req(unifi_priv_t *priv, const CSR_SIGNAL *signal,
         if (r) {
             unifi_error(priv,
                         "_sys_packet_req: failed to translate eth frame.\n");
-            unifi_net_data_free(priv,&bulkdata.d[0]);
+            unifi_net_data_free(priv, &bulkdata.d[0]);
             return r;
         }
 
@@ -1439,7 +1439,7 @@ _sys_packet_req(unifi_priv_t *priv, const CSR_SIGNAL *signal,
 #ifdef CSR_SUPPORT_SME
     if ((protection = uf_get_protection_bit_from_interfacemode(priv, interfaceTag, peerMacAddress.a)) < 0) {
         unifi_error(priv, "unicast address, but destination not in station record database\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return -1;
     }
 #else
@@ -1453,7 +1453,7 @@ _sys_packet_req(unifi_priv_t *priv, const CSR_SIGNAL *signal,
     /* add Mac header */
     if (prepare_and_add_macheader(priv, skb, newSkb, req.Priority, &bulkdata, interfaceTag, frame, frame + ETH_ALEN, protection)) {
         unifi_error(priv, "failed to create MAC header\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return -1;
     }
 
@@ -1479,7 +1479,7 @@ _sys_packet_req(unifi_priv_t *priv, const CSR_SIGNAL *signal,
     if (r) {
         unifi_error(priv,
                     "_sys_packet_req: failed to send signal.\n");
-        unifi_net_data_free(priv,&bulkdata.d[0]);
+        unifi_net_data_free(priv, &bulkdata.d[0]);
         return r;
     }
     /* The final CsrWifiRouterMaPacketCfmSend() will called when the actual MA-PACKET.cfm is received from the chip */
@@ -1558,7 +1558,7 @@ void CsrWifiRouterMaPacketReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
     memcpy(req->Ra.x, daddr, ETH_ALEN);
     req->Priority = mareq->priority;
     req->TransmitRate = 0; /* Let firmware select the rate*/
-    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     req->HostTag = mareq->hostTag;
 
     if(mareq->cfmRequested)
@@ -1571,7 +1571,7 @@ void CsrWifiRouterMaPacketReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 
     if (r && mareq->cfmRequested)
     {
-        CsrWifiRouterMaPacketCfmSend(msg->source,interfaceTag,
+        CsrWifiRouterMaPacketCfmSend(msg->source, interfaceTag,
                                      CSR_RESULT_FAILURE,
                                      mareq->hostTag, 0);
     }
@@ -1637,7 +1637,7 @@ void CsrWifiRouterCtrlM4TransmitReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 /* reset the station records when the mode is set as CSR_WIFI_ROUTER_CTRL_MODE_NONE */
 static void CsrWifiRouterCtrlResetStationRecordList(unifi_priv_t *priv, u16 interfaceTag)
 {
-    u8 i,j;
+    u8 i, j;
     CsrWifiRouterCtrlStaInfo_t *staInfo=NULL;
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
     unsigned long lock_flags;
@@ -1658,15 +1658,15 @@ static void CsrWifiRouterCtrlResetStationRecordList(unifi_priv_t *priv, u16 inte
             uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                  &send_cfm_list,
                                                  &(staInfo->mgtFrames));
-            uf_flush_list(priv,&(staInfo->mgtFrames));
+            uf_flush_list(priv, &(staInfo->mgtFrames));
             for(j=0;j<MAX_ACCESS_CATOGORY;j++){
                 uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                      &send_cfm_list,
                                                      &(staInfo->dataPdu[j]));
-                uf_flush_list(priv,&(staInfo->dataPdu[j]));
+                uf_flush_list(priv, &(staInfo->dataPdu[j]));
             }
 
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             /* Removing station record information from port config array */
             memset(staInfo->peerControlledPort, 0, sizeof(unifi_port_cfg_t));
             staInfo->peerControlledPort->port_action = CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_CLOSED_DISCARD;
@@ -1680,7 +1680,7 @@ static void CsrWifiRouterCtrlResetStationRecordList(unifi_priv_t *priv, u16 inte
 
             kfree(interfacePriv->staInfo[i]);
             interfacePriv->staInfo[i] = NULL;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     /* after the critical region process the list of frames that requested cfm
@@ -1697,9 +1697,9 @@ static void CsrWifiRouterCtrlResetStationRecordList(unifi_priv_t *priv, u16 inte
         case CSR_WIFI_ROUTER_CTRL_MODE_NONE:
             if (priv->noOfPktQueuedInDriver) {
                 unifi_warning(priv, "After reset the noOfPktQueuedInDriver = %x\n", priv->noOfPktQueuedInDriver);
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 priv->noOfPktQueuedInDriver = 0;
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             }
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
@@ -1745,18 +1745,18 @@ void CsrWifiRouterCtrlInterfaceReset(unifi_priv_t *priv, u16 interfaceTag)
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMgtFrames));
-    uf_flush_list(priv,&(interfacePriv->genericMgtFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMgtFrames));
 
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMulticastOrBroadCastMgtFrames));
-    uf_flush_list(priv,&(interfacePriv->genericMulticastOrBroadCastMgtFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMulticastOrBroadCastMgtFrames));
 
     uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                              &send_cfm_list,
                                              &(interfacePriv->genericMulticastOrBroadCastFrames));
 
-    uf_flush_list(priv,&(interfacePriv->genericMulticastOrBroadCastFrames));
+    uf_flush_list(priv, &(interfacePriv->genericMulticastOrBroadCastFrames));
 
     /*  process the list of frames that requested cfm
     and send cfm to requestor one by one */
@@ -1772,7 +1772,7 @@ void CsrWifiRouterCtrlInterfaceReset(unifi_priv_t *priv, u16 interfaceTag)
             /* station records not available in these modes */
             break;
         default:
-            CsrWifiRouterCtrlResetStationRecordList(priv,interfaceTag);
+            CsrWifiRouterCtrlResetStationRecordList(priv, interfaceTag);
     }
 
     interfacePriv->num_stations_joined = 0;
@@ -1880,7 +1880,7 @@ void CsrWifiRouterCtrlModeSetReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
          * other then CSR_WIFI_TIM_SET or CSR_WIFI_TIM_RESET value
          */
         interfacePriv->bcTimSetReqQueued =0xFF;
-        CsrWifiRouterCtrlInterfaceReset(priv,req->interfaceTag);
+        CsrWifiRouterCtrlInterfaceReset(priv, req->interfaceTag);
 
         if(req->mode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
            req->mode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
@@ -1900,7 +1900,7 @@ void CsrWifiRouterCtrlModeSetReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
         }
     }
     else {
-        unifi_error(priv, "CsrWifiRouterCtrlModeSetReqHandler: invalid interfaceTag :%d\n",req->interfaceTag);
+        unifi_error(priv, "CsrWifiRouterCtrlModeSetReqHandler: invalid interfaceTag :%d\n", req->interfaceTag);
     }
 }
 
@@ -1941,15 +1941,15 @@ static int peer_delete_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerDelReq *r
                                                  &send_cfm_list,
                                                  &(staInfo->mgtFrames));
 
-        uf_flush_list(priv,&(staInfo->mgtFrames));
+        uf_flush_list(priv, &(staInfo->mgtFrames));
         for(j=0;j<MAX_ACCESS_CATOGORY;j++){
             uf_prepare_send_cfm_list_for_queued_pkts(priv,
                                                      &send_cfm_list,
                                                      &(staInfo->dataPdu[j]));
-            uf_flush_list(priv,&(staInfo->dataPdu[j]));
+            uf_flush_list(priv, &(staInfo->dataPdu[j]));
         }
 
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         /* clear the port configure array info, for the corresponding peer entry */
         controlledPort = &interfacePriv->controlled_data_port;
         unControlledPort = &interfacePriv->uncontrolled_data_port;
@@ -1975,12 +1975,12 @@ static int peer_delete_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerDelReq *r
             unifi_warning(priv, "number of uncontrolled port entries is zero, trying to decrement: debug\n");
         }
 
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         /* update the TIM with zero */
         if (interfacePriv->interfaceMode != CSR_WIFI_ROUTER_CTRL_MODE_IBSS &&
                 staInfo->timSet == CSR_WIFI_TIM_SET) {
             unifi_trace(priv, UDBG3, "peer is deleted so TIM updated to 0, in firmware\n");
-            update_tim(priv,staInfo->aid,0,req->interfaceTag, req->peerRecordHandle);
+            update_tim(priv, staInfo->aid, 0, req->interfaceTag, req->peerRecordHandle);
         }
 
 
@@ -2021,7 +2021,7 @@ static int peer_delete_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerDelReq *r
         cancel_work_sync(&staInfo->send_disconnected_ind_task);
 #endif
 
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 #ifdef CSR_SUPPORT_SME
         interfacePriv->num_stations_joined--;
 
@@ -2039,7 +2039,7 @@ static int peer_delete_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerDelReq *r
         /* Free the station record for corresponding peer */
         kfree(interfacePriv->staInfo[req->peerRecordHandle]);
         interfacePriv->staInfo[req->peerRecordHandle] = NULL;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
         /* after the critical region process the list of frames that requested cfm
         and send cfm to requestor one by one */
@@ -2092,12 +2092,12 @@ void CsrWifiRouterCtrlPeerDelReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
             break;
     }
 
-    CsrWifiRouterCtrlPeerDelCfmSend(msg->source,req->clientData,req->interfaceTag,status);
+    CsrWifiRouterCtrlPeerDelCfmSend(msg->source, req->clientData, req->interfaceTag, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerDelReqHandler \n");
 }
 
 /* Add the new station to the station record data base */
-static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *req,u32 *handle)
+static int peer_add_new_record(unifi_priv_t *priv, CsrWifiRouterCtrlPeerAddReq *req, u32 *handle)
 {
     u8 i, powerModeTemp = 0;
     u8 freeSlotFound = FALSE;
@@ -2135,11 +2135,11 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
                         req->staInfo.listenIntervalInTus);
 
             /* disable the preemption until station record updated */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 
             interfacePriv->staInfo[i] = newRecord;
             /* Initialize the record*/
-            memset(newRecord,0,sizeof(CsrWifiRouterCtrlStaInfo_t));
+            memset(newRecord, 0, sizeof(CsrWifiRouterCtrlStaInfo_t));
             /* update the station record */
             memcpy(newRecord->peerMacAddress.a, req->peerMacAddress.a, ETH_ALEN);
             newRecord->wmmOrQosEnabled = req->staInfo.wmmOrQosEnabled;
@@ -2182,11 +2182,11 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
                 u8 k;
                 for(k=0; k< MAX_ACCESS_CATOGORY ;k++)
                     unifi_trace(priv, UDBG2, "peer_add_new_record: WMM : %d ,AC %d, powersaveMode %x \n",
-                            req->staInfo.wmmOrQosEnabled,k,newRecord->powersaveMode[k]);
+                            req->staInfo.wmmOrQosEnabled, k, newRecord->powersaveMode[k]);
             }
 
             unifi_trace(priv, UDBG3, "newRecord->wmmOrQosEnabled : %d , MAX SP : %d\n",
-                    newRecord->wmmOrQosEnabled,newRecord->maxSpLength);
+                    newRecord->wmmOrQosEnabled, newRecord->maxSpLength);
 
             /* Initialize the mgtFrames & data Pdu list */
             {
@@ -2201,7 +2201,7 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
             newRecord->activity_flag = TRUE;
 
             /* enable the preemption as station record updated */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
             /* First time port actions are set for the peer with below information */
             configure_data_port(priv, CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN, &newRecord->peerMacAddress,
@@ -2216,7 +2216,7 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
             }
 
 
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             /* Port status must be already set before calling the Add Peer request */
             newRecord->peerControlledPort = uf_sme_port_config_handle(priv, newRecord->peerMacAddress.a,
                                                                       UF_CONTROLLED_PORT_Q, req->interfaceTag);
@@ -2228,7 +2228,7 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
                 unifi_warning(priv, "Un/ControlledPort record not found in port configuration array index = %d\n", i);
                 kfree(interfacePriv->staInfo[i]);
                 interfacePriv->staInfo[i] = NULL;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 return CSR_RESULT_FAILURE;
             }
 
@@ -2279,7 +2279,7 @@ static int peer_add_new_record(unifi_priv_t *priv,CsrWifiRouterCtrlPeerAddReq *r
 
             }
 #endif
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             break;
         }
     }
@@ -2446,7 +2446,7 @@ void uf_send_disconnected_ind_wq(struct work_struct *work)
                                                  &send_cfm_list,
                                                  &(staInfo->dataPdu[j]));
 
-        uf_flush_list(priv,&(staInfo->dataPdu[j]));
+        uf_flush_list(priv, &(staInfo->dataPdu[j]));
     }
 
     send_auto_ma_packet_confirm(priv, staInfo->interfacePriv, &send_cfm_list);
@@ -2471,7 +2471,7 @@ void uf_send_disconnected_ind_wq(struct work_struct *work)
 
 
 #endif
-void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
+void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 {
     CsrWifiRouterCtrlPeerAddReq* req = (CsrWifiRouterCtrlPeerAddReq*)msg;
     CsrResult status = CSR_RESULT_SUCCESS;
@@ -2500,7 +2500,7 @@ void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
         case CSR_WIFI_ROUTER_CTRL_MODE_IBSS:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PGO:
             /* Add station record */
-            status = peer_add_new_record(priv,req,&handle);
+            status = peer_add_new_record(priv, req, &handle);
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_STA:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PCLI:
@@ -2509,11 +2509,11 @@ void CsrWifiRouterCtrlPeerAddReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
             break;
     }
 
-    CsrWifiRouterCtrlPeerAddCfmSend(msg->source,req->clientData,req->interfaceTag,req->peerMacAddress,handle,status);
+    CsrWifiRouterCtrlPeerAddCfmSend(msg->source, req->clientData, req->interfaceTag, req->peerMacAddress, handle, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerAddReqHandler \n");
 }
 
-void CsrWifiRouterCtrlPeerUpdateReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
+void CsrWifiRouterCtrlPeerUpdateReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 {
     CsrWifiRouterCtrlPeerUpdateReq* req = (CsrWifiRouterCtrlPeerUpdateReq*)msg;
     CsrResult status = CSR_RESULT_SUCCESS;
@@ -2526,7 +2526,7 @@ void CsrWifiRouterCtrlPeerUpdateReqHandler(void* drvpriv,CsrWifiFsmEvent* msg)
         return;
     }
 
-    CsrWifiRouterCtrlPeerUpdateCfmSend(msg->source,req->clientData,req->interfaceTag,status);
+    CsrWifiRouterCtrlPeerUpdateCfmSend(msg->source, req->clientData, req->interfaceTag, status);
     unifi_trace(priv, UDBG2, "leaving CsrWifiRouterCtrlPeerUpdateReqHandler \n");
 }
 
@@ -2986,13 +2986,13 @@ void CsrWifiRouterCtrlWapiMulticastFilterReqHandler(void* drvpriv, CsrWifiFsmEve
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiMulticastFilterReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiMulticastFilterReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3022,13 +3022,13 @@ void CsrWifiRouterCtrlWapiUnicastFilterReqHandler(void* drvpriv, CsrWifiFsmEvent
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-        unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+        unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiUnicastFilterReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastFilterReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3064,13 +3064,13 @@ void CsrWifiRouterCtrlWapiRxPktReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 
 
         if (req->dataLength == 0 || req->data == NULL) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: invalid request\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: invalid request\n", __FUNCTION__);
              return;
         }
 
         res = unifi_net_data_malloc(priv, &bulkdata.d[0], req->dataLength);
         if (res != CSR_RESULT_SUCCESS) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: Could not allocate net data\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReq: Could not allocate net data\n", __FUNCTION__);
              return;
         }
 
@@ -3078,15 +3078,15 @@ void CsrWifiRouterCtrlWapiRxPktReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
          * So reset the reception status to rx_success */
         res = read_unpack_signal(req->signal, &signal);
         if (res) {
-                 unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: Received unknown or corrupted signal.\n");
+                 unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: Received unknown or corrupted signal.\n");
                  return;
         }
         pkt_ind = (CSR_MA_PACKET_INDICATION*) (&((&signal)->u).MaPacketIndication);
         if (pkt_ind->ReceptionStatus != CSR_MICHAEL_MIC_ERROR) {
-                 unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: Unknown signal with reception status = %d\n",pkt_ind->ReceptionStatus);
+                 unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: Unknown signal with reception status = %d\n", pkt_ind->ReceptionStatus);
                  return;
         } else {
-                 unifi_trace(priv, UDBG4,"CsrWifiRouterCtrlWapiRxPktReqHandler: MIC verified , RX_SUCCESS \n",__FUNCTION__);
+                 unifi_trace(priv, UDBG4, "CsrWifiRouterCtrlWapiRxPktReqHandler: MIC verified , RX_SUCCESS \n", __FUNCTION__);
                  pkt_ind->ReceptionStatus = CSR_RX_SUCCESS;
                  write_pack(&signal, req->signal, &(req->signalLength));
         }
@@ -3113,12 +3113,12 @@ void CsrWifiRouterCtrlWapiRxPktReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
-       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiRxPktReqHandler: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiRxPktReqHandler: called when WAPI isn't enabled\n");
 #endif
 }
 
@@ -3142,15 +3142,15 @@ void CsrWifiRouterCtrlWapiUnicastTxPktReqHandler(void* drvpriv, CsrWifiFsmEvent*
         unifi_trace(priv, UDBG6, ">>%s\n", __FUNCTION__);
 
         if (priv == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid priv\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid priv\n", __FUNCTION__);
             return;
         }
         if (priv->smepriv == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid sme priv\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler : invalid sme priv\n", __FUNCTION__);
             return;
         }
         if (req->data == NULL) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid request\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid request\n", __FUNCTION__);
             return;
         } else {
             /* If it is QoS data (type = data subtype = QoS), frame header contains QoS control field */
@@ -3159,7 +3159,7 @@ void CsrWifiRouterCtrlWapiUnicastTxPktReqHandler(void* drvpriv, CsrWifiFsmEvent*
             }
         }
         if ( !(req->dataLength>(macHeaderLengthInBytes+appendedCryptoFields)) ) {
-            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid dataLength\n",__FUNCTION__);
+            unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: invalid dataLength\n", __FUNCTION__);
             return;
         }
 
@@ -3174,7 +3174,7 @@ void CsrWifiRouterCtrlWapiUnicastTxPktReqHandler(void* drvpriv, CsrWifiFsmEvent*
          */
         result = unifi_net_data_malloc(priv, &bulkdata.d[0], req->dataLength);
         if (result != CSR_RESULT_SUCCESS) {
-             unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: Could not allocate net data\n",__FUNCTION__);
+             unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: Could not allocate net data\n", __FUNCTION__);
              return;
         }
         memcpy((void*)bulkdata.d[0].os_data_ptr, req->data, req->dataLength);
@@ -3217,13 +3217,13 @@ void CsrWifiRouterCtrlWapiUnicastTxPktReqHandler(void* drvpriv, CsrWifiFsmEvent*
 
     } else {
 
-       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: called when WAPI SW ENCRYPTION isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiUnicastTxPktReqHandler: called when WAPI SW ENCRYPTION isn't enabled\n");
 #endif
 }
 
@@ -3240,14 +3240,14 @@ void CsrWifiRouterCtrlWapiFilterReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 
         unifi_trace(priv, UDBG6, ">>%s\n", __FUNCTION__);
 
-        unifi_trace(priv, UDBG1, "CsrWifiRouterCtrlWapiFilterReq: req->isWapiConnected [0/1] = %d \n",req->isWapiConnected);
+        unifi_trace(priv, UDBG1, "CsrWifiRouterCtrlWapiFilterReq: req->isWapiConnected [0/1] = %d \n", req->isWapiConnected);
 
         priv->isWapiConnection = req->isWapiConnected;
 
         unifi_trace(priv, UDBG6, "<<%s\n", __FUNCTION__);
     } else {
 
-       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__,interfacePriv->interfaceMode);
+       unifi_warning(priv, "%s is NOT applicable for interface mode - %d\n", __FUNCTION__, interfacePriv->interfaceMode);
 
     }
 #endif
@@ -3255,6 +3255,6 @@ void CsrWifiRouterCtrlWapiFilterReqHandler(void* drvpriv, CsrWifiFsmEvent* msg)
 #elif defined(UNIFI_DEBUG)
     /*WAPI Disabled*/
     unifi_priv_t *priv = (unifi_priv_t*)drvpriv;
-    unifi_error(priv,"CsrWifiRouterCtrlWapiFilterReq: called when WAPI isn't enabled\n");
+    unifi_error(priv, "CsrWifiRouterCtrlWapiFilterReq: called when WAPI isn't enabled\n");
 #endif
 }
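
Note: the handlers in this file report through unifi_error()/unifi_trace(), which take a printf-style format after the priv pointer; when a message should name its caller, the driver passes __FUNCTION__ against a "%s" conversion (see the ">>%s" trace above). A minimal sketch of the pattern, with a hypothetical handler name and the unifi_error() signature assumed from the calls above:

    static void example_req_handler(void *drvpriv, CsrWifiFsmEvent *msg)
    {
        unifi_priv_t *priv = (unifi_priv_t *)drvpriv;

        if (priv == NULL) {
            /* "%s" consumes __FUNCTION__; without a matching
             * conversion the extra argument is silently ignored. */
            unifi_error(priv, "%s: invalid sme priv\n", __FUNCTION__);
            return;
        }
        unifi_trace(priv, UDBG6, ">>%s\n", __FUNCTION__);
    }
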
index abcb446fb8c00f7c5a3f354236de3a51ff091b5c..b919b001ef7c92df4442dd88b1f3c41dfa9d9338 100644 (file)
@@ -118,7 +118,7 @@ uf_sme_init(unifi_priv_t *priv)
 void
 uf_sme_deinit(unifi_priv_t *priv)
 {
-    int i,j;
+    int i, j;
     u8 ba_session_idx;
     ba_session_rx_struct *ba_session_rx = NULL;
     ba_session_tx_struct *ba_session_tx = NULL;
@@ -224,7 +224,7 @@ unifi_ta_indicate_protocol(void *ospriv,
     if (CSR_WIFI_ROUTER_CTRL_PROTOCOL_DIRECTION_RX == direction)
     {
         u16 interfaceTag = 0;
-        CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+        CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
                 interfaceTag,
                 packet_type,
                 direction,
index 4129a6436b7635470cb70d77c613a7193f278b2d..84f11cb5359646b6c877c5cfd60e7d76fe9e6224 100644 (file)
@@ -120,7 +120,7 @@ channel_to_mhz(int ch, int dot11a)
 #ifdef CSR_SUPPORT_WEXT_AP
 void uf_sme_wext_ap_set_defaults(unifi_priv_t *priv)
 {
-    memcpy(priv->ap_config.ssid.ssid,"defaultssid",sizeof("defaultssid"));
+    memcpy(priv->ap_config.ssid.ssid, "defaultssid", sizeof("defaultssid"));
 
     priv->ap_config.ssid.length = 8;
     priv->ap_config.channel = 6;
@@ -202,7 +202,7 @@ void uf_sme_wext_ap_set_defaults(unifi_priv_t *priv)
                                                     to enable different types of
                                                     devices to join us */
     priv->ap_mac_config.supportedRatesCount =
-           uf_configure_supported_rates(priv->ap_mac_config.supportedRates,priv->ap_mac_config.phySupportedBitmap);
+           uf_configure_supported_rates(priv->ap_mac_config.supportedRates, priv->ap_mac_config.phySupportedBitmap);
 }
 #endif
 /*
@@ -459,7 +459,7 @@ static int decode_parameter_from_string(unifi_priv_t* priv, char **str_ptr,
 {
     u8 int_str[7] = "0";
     u32 param_str_len;
-    u8  *param_str_begin,*param_str_end;
+    u8  *param_str_begin, *param_str_end;
     u8  *orig_str = *str_ptr;
 
     if (!strncmp(*str_ptr, token, strlen(token))) {
@@ -472,41 +472,41 @@ static int decode_parameter_from_string(unifi_priv_t* priv, char **str_ptr,
             param_str_end = *str_ptr-1;
             param_str_len = param_str_end - param_str_begin;
         }
-        unifi_trace(priv,UDBG2,"'token:%s', len:%d, ", token, param_str_len);
+        unifi_trace(priv, UDBG2, "'token:%s', len:%d, ", token, param_str_len);
         if (param_str_len > param_max_len) {
-            unifi_notice(priv,"extracted param len:%d is > MAX:%d\n",param_str_len, param_max_len);
+            unifi_notice(priv, "extracted param len:%d is > MAX:%d\n", param_str_len, param_max_len);
             param_str_len = param_max_len;
         }
         switch (param_type) {
             case PARAM_TYPE_INT:
             {
-                u32 *pdst_int = dst,num =0;
-                int i,j=0;
+                u32 *pdst_int = dst, num = 0;
+                int i, j = 0;
                 if (param_str_len > sizeof(int_str)) {
                     param_str_len = sizeof(int_str);
                 }
                 memcpy(int_str, param_str_begin, param_str_len);
                 for(i = param_str_len; i>0;i--) {
                     if(int_str[i-1] >= '0' && int_str[i-1] <='9') {
-                        num += ((int_str[i-1]-'0')*power(10,j));
+                        num += ((int_str[i-1]-'0')*power(10, j));
                         j++;
                     } else {
-                        unifi_error(priv,"decode_parameter_from_string:not a number %c\n",(int_str[i-1]));
+                        unifi_error(priv, "decode_parameter_from_string:not a number %c\n", (int_str[i-1]));
                         return -1;
                     }
                 }
                 *pdst_int = num;
-                unifi_trace(priv,UDBG2,"decode_parameter_from_string:decoded int = %d\n",*pdst_int);
+                unifi_trace(priv, UDBG2, "decode_parameter_from_string:decoded int = %d\n", *pdst_int);
             }
             break;
             default:
                 memcpy(dst, param_str_begin, param_str_len);
                 *((char *)dst + param_str_len) = 0;
-                unifi_trace(priv,UDBG2,"decode_parameter_from_string:decoded string = %s\n",(char *)dst);
+                unifi_trace(priv, UDBG2, "decode_parameter_from_string:decoded string = %s\n", (char *)dst);
             break;
         }
     } else {
-        unifi_error(priv,"decode_parameter_from_string: Token:%s not found in %s \n",token,orig_str);
+        unifi_error(priv, "decode_parameter_from_string: Token:%s not found in %s \n", token, orig_str);
         return -1;
     }
     return 0;
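
Note: the PARAM_TYPE_INT branch above decodes the extracted substring by walking int_str from its last character and scaling each digit with the driver's power(10, j) helper. A standalone sketch of an equivalent conversion in the usual most-significant-digit-first form (u8/u32 are the kernel types; this is an illustration under those assumptions, not the driver's code):

    static int parse_decimal(const u8 *str, u32 len, u32 *out)
    {
        u32 num = 0;
        u32 i;

        for (i = 0; i < len; i++) {
            if (str[i] < '0' || str[i] > '9')
                return -1;      /* reject non-digits, as the loop above does */
            num = num * 10 + (str[i] - '0');
        }
        *out = num;
        return 0;
    }
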
@@ -514,7 +514,7 @@ static int decode_parameter_from_string(unifi_priv_t* priv, char **str_ptr,
 static int store_ap_advanced_config_from_string(unifi_priv_t *priv, char *param_str)
 {
     char * str_ptr=param_str;
-    int ret = 0,tmp_var;
+    int ret = 0, tmp_var;
     char phy_mode[6];
     CsrWifiSmeApMacConfig * ap_mac_config = &priv->ap_mac_config;
 
@@ -522,36 +522,36 @@ static int store_ap_advanced_config_from_string(unifi_priv_t *priv, char *param_
     ret = decode_parameter_from_string(priv, &str_ptr, "BI=",
                                        PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: BI not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: BI not found\n");
         return -1;
     }
     ap_mac_config->beaconInterval = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "DTIM_PER=",
                                         PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: DTIM_PER not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: DTIM_PER not found\n");
         return -1;
     }
     ap_mac_config->dtimPeriod = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "WMM=",
                                         PARAM_TYPE_INT, &tmp_var, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: WMM not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: WMM not found\n");
         return -1;
     }
     ap_mac_config->wmmEnabled = tmp_var;
     ret = decode_parameter_from_string(priv, &str_ptr, "PHY=",
                                         PARAM_TYPE_STRING, phy_mode, 5);
     if(ret) {
-        unifi_error(priv,"store_ap_advanced_config_from_string: PHY not found\n");
+        unifi_error(priv, "store_ap_advanced_config_from_string: PHY not found\n");
     } else {
-       if(strstr(phy_mode,"b")){
+       if(strstr(phy_mode, "b")){
            ap_mac_config->phySupportedBitmap = CSR_WIFI_SME_AP_PHY_SUPPORT_B;
        }
-       if(strstr(phy_mode,"g")) {
+       if(strstr(phy_mode, "g")) {
            ap_mac_config->phySupportedBitmap |= CSR_WIFI_SME_AP_PHY_SUPPORT_G;
        }
-       if(strstr(phy_mode,"n")) {
+       if(strstr(phy_mode, "n")) {
            ap_mac_config->phySupportedBitmap |= CSR_WIFI_SME_AP_PHY_SUPPORT_N;
        }
        ap_mac_config->supportedRatesCount =
@@ -560,39 +560,39 @@ static int store_ap_advanced_config_from_string(unifi_priv_t *priv, char *param_
     return ret;
 }
 
-static int store_ap_config_from_string( unifi_priv_t * priv,char *param_str)
+static int store_ap_config_from_string(unifi_priv_t *priv, char *param_str)
 
 {
     char *str_ptr = param_str;
     char sub_cmd[16];
     char sec[CSR_WIFI_MAX_SEC_LEN];
     char key[CSR_WIFI_MAX_KEY_LEN];
-    int ret = 0,tmp_var;
+    int ret = 0, tmp_var;
     CsrWifiSmeApConfig_t *ap_config = &priv->ap_config;
     CsrWifiSmeApMacConfig * ap_mac_config = &priv->ap_mac_config;
     memset(sub_cmd, 0, sizeof(sub_cmd));
-    if(!strstr(param_str,"END")) {
-        unifi_error(priv,"store_ap_config_from_string:Invalid config string:%s\n",param_str);
+    if(!strstr(param_str, "END")) {
+        unifi_error(priv, "store_ap_config_from_string:Invalid config string:%s\n", param_str);
         return -1;
     }
-    if (decode_parameter_from_string(priv,&str_ptr, "ASCII_CMD=",
+    if (decode_parameter_from_string(priv, &str_ptr, "ASCII_CMD=",
         PARAM_TYPE_STRING, sub_cmd, 6) != 0) {
          return -1;
     }
     if (strncmp(sub_cmd, "AP_CFG", 6)) {
 
-        if(!strncmp(sub_cmd ,"ADVCFG", 6)) {
+        if(!strncmp(sub_cmd, "ADVCFG", 6)) {
            return store_ap_advanced_config_from_string(priv, str_ptr);
         }
-        unifi_error(priv,"store_ap_config_from_string: sub_cmd:%s != 'AP_CFG or ADVCFG'!\n", sub_cmd);
+        unifi_error(priv, "store_ap_config_from_string: sub_cmd:%s != 'AP_CFG or ADVCFG'!\n", sub_cmd);
         return -1;
     }
     memset(ap_config, 0, sizeof(CsrWifiSmeApConfig_t));
-    ret = decode_parameter_from_string(priv,&str_ptr, "SSID=",
+    ret = decode_parameter_from_string(priv, &str_ptr, "SSID=",
                                        PARAM_TYPE_STRING, ap_config->ssid.ssid,
                                        CSR_WIFI_MAX_SSID_LEN);
     if(ret) {
-        unifi_error(priv,"store_ap_config_from_string: SSID not found\n");
+        unifi_error(priv, "store_ap_config_from_string: SSID not found\n");
         return -1;
     }
     ap_config->ssid.length = strlen(ap_config->ssid.ssid);
@@ -600,27 +600,27 @@ static int store_ap_config_from_string( unifi_priv_t * priv,char *param_str)
     ret = decode_parameter_from_string(priv, &str_ptr, "SEC=",
                                        PARAM_TYPE_STRING, sec, CSR_WIFI_MAX_SEC_LEN);
     if(ret) {
-        unifi_error(priv,"store_ap_config_from_string: SEC not found\n");
+        unifi_error(priv, "store_ap_config_from_string: SEC not found\n");
         return -1;
     }
-    ret = decode_parameter_from_string(priv,&str_ptr, "KEY=",
-                         PARAM_TYPE_STRING,  key, CSR_WIFI_MAX_KEY_LEN);
-    if(!strcasecmp(sec,"open")) {
-        unifi_trace(priv,UDBG2,"store_ap_config_from_string: security open");
+    ret = decode_parameter_from_string(priv, &str_ptr, "KEY=",
+                         PARAM_TYPE_STRING, key, CSR_WIFI_MAX_KEY_LEN);
+    if(!strcasecmp(sec, "open")) {
+        unifi_trace(priv, UDBG2, "store_ap_config_from_string: security open");
         ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_OPEN_SYSTEM;
         if(ret) {
-            unifi_notice(priv,"store_ap_config_from_string: KEY not found:fine with Open\n");
+            unifi_notice(priv, "store_ap_config_from_string: KEY not found:fine with Open\n");
         }
     }
-    else if(!strcasecmp(sec,"wpa2-psk")) {
-        int i,j=0;
+    else if(!strcasecmp(sec, "wpa2-psk")) {
+        int i, j = 0;
         CsrWifiNmeApAuthPers *pers =
                             ((CsrWifiNmeApAuthPers *)&(ap_config->credentials.nmeAuthType.authTypePersonal));
         u8 *psk = pers->authPers_credentials.psk.psk;
 
-        unifi_trace(priv,UDBG2,"store_ap_config_from_string: security WPA2");
+        unifi_trace(priv, UDBG2, "store_ap_config_from_string: security WPA2");
         if(ret) {
-            unifi_error(priv,"store_ap_config_from_string: KEY not found for WPA2\n");
+            unifi_error(priv, "store_ap_config_from_string: KEY not found for WPA2\n");
             return -1;
         }
         ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_PERSONAL;
@@ -636,21 +636,21 @@ static int store_ap_config_from_string( unifi_priv_t * priv,char *param_str)
         }
 
     } else {
-       unifi_notice(priv,"store_ap_config_from_string: Unknown security: Assuming Open");
+       unifi_notice(priv, "store_ap_config_from_string: Unknown security: Assuming Open");
        ap_config->credentials.authType = CSR_WIFI_SME_AP_AUTH_TYPE_OPEN_SYSTEM;
        return -1;
     }
    /* Get the decoded value in a temp int variable to ensure that other fields within the struct
       which are of type other than int are not over written */
-    ret = decode_parameter_from_string(priv,&str_ptr, "CHANNEL=", PARAM_TYPE_INT, &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "CHANNEL=", PARAM_TYPE_INT, &tmp_var, 5);
     if(ret)
         return -1;
     ap_config->channel = tmp_var;
-    ret = decode_parameter_from_string(priv,&str_ptr, "PREAMBLE=", PARAM_TYPE_INT, &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "PREAMBLE=", PARAM_TYPE_INT, &tmp_var, 5);
     if(ret)
         return -1;
     ap_mac_config->preamble = tmp_var;
-    ret = decode_parameter_from_string(priv,&str_ptr, "MAX_SCB=", PARAM_TYPE_INT,  &tmp_var, 5);
+    ret = decode_parameter_from_string(priv, &str_ptr, "MAX_SCB=", PARAM_TYPE_INT, &tmp_var, 5);
     ap_config->max_connections = tmp_var;
     return ret;
 }
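
Note: store_ap_config_from_string() refuses any string without an END marker, extracts ASCII_CMD= first, and then pulls the token-prefixed fields seen above (SSID=, SEC=, KEY=, CHANNEL=, PREAMBLE=, MAX_SCB=); an ASCII_CMD of ADVCFG is handed to store_ap_advanced_config_from_string() for BI=, DTIM_PER=, WMM= and PHY=. Hypothetical examples of the two forms, with illustrative values and comma separators assumed from the token-scanning code:

    ASCII_CMD=AP_CFG,SSID=testap,SEC=wpa2-psk,KEY=12345678,CHANNEL=6,PREAMBLE=0,MAX_SCB=8,END
    ASCII_CMD=ADVCFG,BI=100,DTIM_PER=2,WMM=1,PHY=bgn,END
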
@@ -664,9 +664,9 @@ iwprivsapstart(struct net_device *dev, struct iw_request_info *info,
     int r;
 
     unifi_trace(priv, UDBG1, "iwprivsapstart\n" );
-    r = sme_ap_start(priv,interfacePriv->InterfaceTag,&priv->ap_config);
+    r = sme_ap_start(priv, interfacePriv->InterfaceTag, &priv->ap_config);
     if(r) {
-        unifi_error(priv,"iwprivsapstart AP START failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstart AP START failed : %d\n", -r);
     }
     return r;
 }
@@ -692,28 +692,28 @@ iwprivsapconfig(struct net_device *dev, struct iw_request_info *info,
             return -EFAULT;
         }
         cfg_str[wrqu->data.length] = 0;
-        unifi_trace(priv,UDBG2,"length:%d\n",wrqu->data.length);
-        unifi_trace(priv,UDBG2,"AP configuration string:%s\n",cfg_str);
+        unifi_trace(priv, UDBG2, "length:%d\n", wrqu->data.length);
+        unifi_trace(priv, UDBG2, "AP configuration string:%s\n", cfg_str);
         str = cfg_str;
-       if ((r = store_ap_config_from_string(priv,str))) {
-           unifi_error(priv, "iwprivsapconfig:Failed  to decode the string %d\n",r);
+       if ((r = store_ap_config_from_string(priv, str))) {
+           unifi_error(priv, "iwprivsapconfig:Failed  to decode the string %d\n", r);
            kfree(cfg_str);
            return -EIO;
 
        }
     } else {
-        unifi_error(priv,"iwprivsapconfig argument length = 0 \n");
+        unifi_error(priv, "iwprivsapconfig argument length = 0 \n");
         return -EIO;
     }
     r = sme_ap_config(priv, &priv->ap_mac_config, &priv->group_sec_config);
     if(r) {
-        unifi_error(priv,"iwprivsapstop AP Config failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstop AP Config failed : %d\n", -r);
     } else if(interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_AP ||
         interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
         unifi_trace(priv, UDBG1, "iwprivsapconfig: Starting the AP");
-        r = sme_ap_start(priv,interfacePriv->InterfaceTag,&priv->ap_config);
+        r = sme_ap_start(priv, interfacePriv->InterfaceTag, &priv->ap_config);
         if(r) {
-            unifi_error(priv,"iwprivsapstart AP START failed : %d\n",-r);
+            unifi_error(priv, "iwprivsapstart AP START failed : %d\n", -r);
         }
     }
     kfree(cfg_str);
@@ -730,9 +730,9 @@ iwprivsapstop(struct net_device *dev, struct iw_request_info *info,
     u16 interface_tag = interfacePriv->InterfaceTag;
 
     unifi_trace(priv, UDBG1, "iwprivsapstop\n" );
-    r = sme_ap_stop(priv,interface_tag);
+    r = sme_ap_stop(priv, interface_tag);
     if(r) {
-        unifi_error(priv,"iwprivsapstop AP STOP failed : %d\n",-r);
+        unifi_error(priv, "iwprivsapstop AP STOP failed : %d\n", -r);
     }
     return r;
 }
@@ -778,14 +778,14 @@ iwprivsstackstop(struct net_device *dev, struct iw_request_info *info,
             break;
         case CSR_WIFI_ROUTER_CTRL_MODE_AP:
         case CSR_WIFI_ROUTER_CTRL_MODE_P2PGO:
-            r = sme_ap_stop(priv,interface_tag);
+            r = sme_ap_stop(priv, interface_tag);
             break;
         default :
             break;
     }
 
     if(r) {
-        unifi_error(priv,"iwprivsstackstop Stack stop failed : %d\n",-r);
+        unifi_error(priv, "iwprivsstackstop Stack stop failed : %d\n", -r);
     }
     return 0;
 }
@@ -3167,7 +3167,7 @@ static const struct iw_priv_args unifi_private_args[] = {
 #endif
 #ifdef CSR_SUPPORT_WEXT_AP
     { SIOCIWSAPCFGPRIV, IW_PRIV_TYPE_CHAR | 256, IW_PRIV_TYPE_NONE, "AP_SET_CFG" },
-    { SIOCIWSAPSTARTPRIV, 0,IW_PRIV_TYPE_CHAR | IW_PRIV_SIZE_FIXED|IWPRIV_SME_MAX_STRING,"AP_BSS_START" },
+    { SIOCIWSAPSTARTPRIV, 0, IW_PRIV_TYPE_CHAR | IW_PRIV_SIZE_FIXED | IWPRIV_SME_MAX_STRING, "AP_BSS_START" },
     { SIOCIWSAPSTOPPRIV, IW_PRIV_TYPE_CHAR |IW_PRIV_SIZE_FIXED|0,
       IW_PRIV_TYPE_CHAR |IW_PRIV_SIZE_FIXED|0, "AP_BSS_STOP" },
 #ifdef ANDROID_BUILD
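
Note: each iw_priv_args entry above packs the private ioctl number, a description of the set and get arguments, and the name user space hands to iwpriv. An annotated copy of the AP_SET_CFG entry (the comments describe the standard wireless-extensions flag encoding and are explanatory, not part of the driver):

    { SIOCIWSAPCFGPRIV,          /* private ioctl number                   */
      IW_PRIV_TYPE_CHAR | 256,   /* set argument: char buffer, up to 256 B */
      IW_PRIV_TYPE_NONE,         /* get argument: nothing returned         */
      "AP_SET_CFG" },            /* sub-command name used with iwpriv      */
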
index 0fae6f48f79b9e81345fcd835237909662ff6f8d..eb286e5f7467ee7fa99d15cfc6c5925a18903dfd 100644 (file)
@@ -258,7 +258,7 @@ ul_log_config_ind(unifi_priv_t *priv, u8 *conf_param, int len)
         unifi_notice(priv, "ul_log_config_ind: wifi on in progress, suppress error\n");
     } else {
         /* wifi_off_ind (error or exit) */
-        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, (CsrWifiRouterCtrlControlIndication)(*conf_param));
+        CsrWifiRouterCtrlWifiOffIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, (CsrWifiRouterCtrlControlIndication)(*conf_param));
     }
 #ifdef CSR_WIFI_HIP_DEBUG_OFFLINE
     unifi_debug_buf_dump();
index e81a99878272b671e7d25ce42f30705383e0b0b4..71fdb2180e3d74ba4eb274348a132543266b40fc 100644 (file)
@@ -105,7 +105,7 @@ static u8 check_routing_pkt_data_ind(unifi_priv_t *priv,
     u8 isDataFrameSubTypeNoData = FALSE;
 
 #ifdef CSR_WIFI_SECURITY_WAPI_ENABLE
-    static const u8 wapiProtocolIdSNAPHeader[] = {0x88,0xb4};
+    static const u8 wapiProtocolIdSNAPHeader[] = {0x88, 0xb4};
     static const u8 wapiProtocolIdSNAPHeaderOffset = 6;
     u8 *destAddr;
     u8 *srcAddr;
@@ -206,7 +206,7 @@ static u8 check_routing_pkt_data_ind(unifi_priv_t *priv,
                 unifi_trace(priv, UDBG4, "Discarding the contents of the frame with MIC failure \n");
 
                 if (isWapiUnicastPkt &&
-                    ((uf_sme_port_state(priv,srcAddr,UF_CONTROLLED_PORT_Q,interfaceTag) != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN)||
+                    ((uf_sme_port_state(priv, srcAddr, UF_CONTROLLED_PORT_Q, interfaceTag) != CSR_WIFI_ROUTER_CTRL_PORT_ACTION_8021X_PORT_OPEN)||
 #ifndef CSR_WIFI_SECURITY_WAPI_SW_ENCRYPTION
                     (priv->wapi_unicast_filter) ||
 #endif
@@ -231,7 +231,7 @@ static u8 check_routing_pkt_data_ind(unifi_priv_t *priv,
                 unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind - MIC FAILURE : Dest Addr %x:%x:%x:%x:%x:%x\n",
                             destAddr[0], destAddr[1], destAddr[2], destAddr[3], destAddr[4], destAddr[5]);
                 unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind - MIC FAILURE : Control Port State - 0x%.4X \n",
-                            uf_sme_port_state(priv,srcAddr,UF_CONTROLLED_PORT_Q,interfaceTag));
+                            uf_sme_port_state(priv, srcAddr, UF_CONTROLLED_PORT_Q, interfaceTag));
 
                 unifi_error(priv, "MIC failure in %s\n", __FUNCTION__);
 
@@ -285,9 +285,9 @@ static u8 check_routing_pkt_data_ind(unifi_priv_t *priv,
 
         if (llcSnapHeaderOffset > 0) {
                /* QoS data or Data */
-            unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): SNAP header found & its offset %d\n",llcSnapHeaderOffset);
+            unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): SNAP header found & its offset %d\n", llcSnapHeaderOffset);
             if (memcmp((u8 *)(bulkdata->d[0].os_data_ptr+llcSnapHeaderOffset+wapiProtocolIdSNAPHeaderOffset),
-                       wapiProtocolIdSNAPHeader,sizeof(wapiProtocolIdSNAPHeader))) {
+                       wapiProtocolIdSNAPHeader, sizeof(wapiProtocolIdSNAPHeader))) {
 
                unifi_trace(priv, UDBG6, "check_routing_pkt_data_ind(): This is a data & NOT a WAI protocol packet\n");
                 /* On the first unicast data pkt that is decrypted successfully after re-keying, reset the filter */
@@ -584,14 +584,14 @@ void unifi_rx_queue_flush(void *ospriv)
     unifi_priv_t *priv = (unifi_priv_t*)ospriv;
 
     unifi_trace(priv, UDBG4, "rx_wq_handler: RdPtr = %d WritePtr =  %d\n",
-                priv->rxSignalBuffer.readPointer,priv->rxSignalBuffer.writePointer);
+                priv->rxSignalBuffer.readPointer, priv->rxSignalBuffer.writePointer);
     if(priv != NULL) {
         u8 readPointer = priv->rxSignalBuffer.readPointer;
         while (readPointer != priv->rxSignalBuffer.writePointer)
         {
              rx_buff_struct_t *buf = &priv->rxSignalBuffer.rx_buff[readPointer];
              unifi_trace(priv, UDBG6, "rx_wq_handler: RdPtr = %d WritePtr =  %d\n",
-                         readPointer,priv->rxSignalBuffer.writePointer);
+                         readPointer, priv->rxSignalBuffer.writePointer);
              unifi_process_receive_event(priv, buf->bufptr, buf->sig_len, &buf->data_ptrs);
              readPointer ++;
              if(readPointer >= priv->rxSignalBuffer.size) {
@@ -661,7 +661,7 @@ unifi_receive_event(void *ospriv,
             CSR_GET_UINT16_FROM_LITTLE_ENDIAN((sigdata) + sizeof(s16)*6) & 0xFFFF,
             CSR_GET_UINT16_FROM_LITTLE_ENDIAN((sigdata) + sizeof(s16)*7) & 0xFFFF, siglen);
     if(signal_buffer_is_full(priv)) {
-        unifi_error(priv,"TO HOST signal queue FULL dropping the PDU\n");
+        unifi_error(priv, "TO HOST signal queue FULL dropping the PDU\n");
         for (i = 0; i < UNIFI_MAX_DATA_REFERENCES; i++) {
             if (bulkdata->d[i].data_length != 0) {
                 unifi_net_data_free(priv, (void *)&bulkdata->d[i]);
@@ -671,14 +671,14 @@ unifi_receive_event(void *ospriv,
     }
     writePointer = priv->rxSignalBuffer.writePointer;
     rx_buff = &priv->rxSignalBuffer.rx_buff[writePointer];
-    memcpy(rx_buff->bufptr,sigdata,siglen);
+    memcpy(rx_buff->bufptr, sigdata, siglen);
     rx_buff->sig_len = siglen;
     rx_buff->data_ptrs = *bulkdata;
     writePointer++;
     if(writePointer >= priv->rxSignalBuffer.size) {
         writePointer =0;
     }
-    unifi_trace(priv, UDBG4, "unifi_receive_event:writePtr = %d\n",priv->rxSignalBuffer.writePointer);
+    unifi_trace(priv, UDBG4, "unifi_receive_event:writePtr = %d\n", priv->rxSignalBuffer.writePointer);
     priv->rxSignalBuffer.writePointer = writePointer;
 
 #ifndef CSR_WIFI_RX_PATH_SPLIT_DONT_USE_WQ
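
Note: unifi_receive_event() above copies each signal into rxSignalBuffer.rx_buff[writePointer] and wraps the write pointer at rxSignalBuffer.size, while unifi_rx_queue_flush() drains from readPointer until it catches up. A ring like this usually reports "full" one slot early, so that readPointer == writePointer can unambiguously mean "empty"; a sketch along the lines of signal_buffer_is_full(), whose body is not shown here and whose logic is therefore an assumption:

    /* Assumed logic: one slot is kept free so that equal
     * read and write pointers always mean an empty ring. */
    static int ring_is_full(u8 readPointer, u8 writePointer, u8 size)
    {
        return ((writePointer + 1) % size) == readPointer;
    }
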
index f9b421b5aa3592111e25cebdabb4d3f7564ed3ff..b81a572920201b18420c779c74b4cef4a33102e0 100644 (file)
@@ -38,7 +38,7 @@ static void _update_buffered_pkt_params_after_alignment(unifi_priv_t *priv, bulk
     skb = (struct sk_buff*)bulkdata->d[0].os_net_buf_ptr;
     align_offset = (u32)(long)(bulkdata->d[0].os_data_ptr) & (CSR_WIFI_ALIGN_BYTES-1);
     if(align_offset){
-        skb_pull(skb,align_offset);
+        skb_pull(skb, align_offset);
     }
 
     buffered_pkt->bulkdata.os_data_ptr = bulkdata->d[0].os_data_ptr;
@@ -86,7 +86,7 @@ unifi_frame_ma_packet_req(unifi_priv_t *priv, CSR_PRIORITY priority,
      */
     req->TransmissionControl = transmissionControl;
     req->VirtualInterfaceIdentifier =
-           uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+           uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     memcpy(req->Ra.x, peerMacAddress, ETH_ALEN);
 
     if (hostTag == 0xffffffff) {
@@ -124,8 +124,8 @@ unifi_frame_ma_packet_req(unifi_priv_t *priv, CSR_PRIORITY priority,
 #define TRANSMISSION_CONTROL_EOSP_MASK 0x0002
 
 static
-int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered_pkt,
-            CsrWifiRouterCtrlStaInfo_t *staRecord,u8 moreData , u8 eosp)
+int frame_and_send_queued_pdu(unifi_priv_t* priv, tx_buffered_packets_t* buffered_pkt,
+            CsrWifiRouterCtrlStaInfo_t *staRecord, u8 moreData, u8 eosp)
 {
 
     CSR_SIGNAL signal;
@@ -135,7 +135,7 @@ int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered
     u8 *qc;
     u16 *fc = (u16*)(buffered_pkt->bulkdata.os_data_ptr);
     unsigned long lock_flags;
-    unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu with moreData: %d , EOSP: %d\n",moreData,eosp);
+    unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu with moreData: %d , EOSP: %d\n", moreData, eosp);
     unifi_frame_ma_packet_req(priv, buffered_pkt->priority, buffered_pkt->rate, buffered_pkt->hostTag,
                buffered_pkt->interfaceTag, buffered_pkt->transmissionControl,
                buffered_pkt->leSenderProcessId, buffered_pkt->peerMacAddress.a, &signal);
@@ -156,7 +156,7 @@ int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered
 
     if((staRecord != NULL)&& (staRecord->wmmOrQosEnabled == TRUE))
     {
-        unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu WMM Enabled: %d \n",staRecord->wmmOrQosEnabled);
+        unifi_trace(priv, UDBG3, "frame_and_send_queued_pdu WMM Enabled: %d \n", staRecord->wmmOrQosEnabled);
 
         toDs = (*fc & cpu_to_le16(IEEE802_11_FC_TO_DS_MASK))?1 : 0;
         fromDs = (*fc & cpu_to_le16(IEEE802_11_FC_FROM_DS_MASK))? 1: 0;
@@ -190,7 +190,7 @@ int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered
     }
     result = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if(result){
-        _update_buffered_pkt_params_after_alignment(priv, &bulkdata,buffered_pkt);
+        _update_buffered_pkt_params_after_alignment(priv, &bulkdata, buffered_pkt);
     }
 
  /* Decrement the packet counts queued in driver */
@@ -199,13 +199,13 @@ int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered
         if (!priv->noOfPktQueuedInDriver) {
             unifi_error(priv, "packets queued in driver 0 still decrementing\n");
         } else {
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             priv->noOfPktQueuedInDriver--;
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
         }
         /* Sta Record is available for all unicast (except genericMgt Frames) & in other case its NULL */
         if (staRecord) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             if (!staRecord->noOfPktQueued) {
                 unifi_error(priv, "packets queued in driver per station is 0 still decrementing\n");
             } else {
@@ -217,7 +217,7 @@ int frame_and_send_queued_pdu(unifi_priv_t* priv,tx_buffered_packets_t* buffered
                     staRecord->nullDataHostTag = INVALID_HOST_TAG;
                 }
             }
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
 
     }
@@ -243,24 +243,24 @@ void set_eosp_transmit_ctrl(unifi_priv_t *priv, struct list_head *txList)
 
     /* return the last node , and modify it. */
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     list_for_each_prev_safe(listHead, placeHolder, txList) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
         tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
         tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
         unifi_trace(priv, UDBG1,
-                "set_eosp_transmit_ctrl Transmission Control = 0x%x hostTag = 0x%x \n",tx_q_item->transmissionControl,tx_q_item->hostTag);
-        unifi_trace(priv,UDBG3,"in set_eosp_transmit_ctrl no.of buffered frames %d\n",priv->noOfPktQueuedInDriver);
+                "set_eosp_transmit_ctrl Transmission Control = 0x%x hostTag = 0x%x \n", tx_q_item->transmissionControl, tx_q_item->hostTag);
+        unifi_trace(priv, UDBG3, "in set_eosp_transmit_ctrl no.of buffered frames %d\n", priv->noOfPktQueuedInDriver);
         break;
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
-    unifi_trace(priv, UDBG1,"List Empty %d\n",list_empty(txList));
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
+    unifi_trace(priv, UDBG1, "List Empty %d\n", list_empty(txList));
     unifi_trace(priv, UDBG5, "leaving set_eosp_transmit_ctrl\n");
     return;
 }
 
 static
-void send_vif_availibility_rsp(unifi_priv_t *priv,CSR_VIF_IDENTIFIER vif,CSR_RESULT_CODE resultCode)
+void send_vif_availibility_rsp(unifi_priv_t *priv, CSR_VIF_IDENTIFIER vif, CSR_RESULT_CODE resultCode)
 {
     CSR_SIGNAL signal;
     CSR_MA_VIF_AVAILABILITY_RESPONSE *rsp;
@@ -269,7 +269,7 @@ void send_vif_availibility_rsp(unifi_priv_t *priv,CSR_VIF_IDENTIFIER vif,CSR_RES
 
     unifi_trace(priv, UDBG3, "send_vif_availibility_rsp : invoked with resultCode = %d \n", resultCode);
 
-    memset(&signal,0,sizeof(CSR_SIGNAL));
+    memset(&signal, 0, sizeof(CSR_SIGNAL));
     rsp = &signal.u.MaVifAvailabilityResponse;
     rsp->VirtualInterfaceIdentifier = vif;
     rsp->ResultCode = resultCode;
@@ -280,7 +280,7 @@ void send_vif_availibility_rsp(unifi_priv_t *priv,CSR_VIF_IDENTIFIER vif,CSR_RES
     /* Send the signal to UniFi */
     r = ul_send_signal_unpacked(priv, &signal, bulkdata);
     if(r) {
-        unifi_error(priv,"Availibility response sending failed %x status %d\n",vif,r);
+        unifi_error(priv, "Availibility response sending failed %x status %d\n", vif, r);
     }
     else {
         unifi_trace(priv, UDBG3, "send_vif_availibility_rsp : status = %d \n", r);
@@ -295,7 +295,7 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
     unsigned long lock_flags;
     struct list_head *listHead, *list;
     struct list_head *placeHolder;
-    u8 i, j,eospFramedeleted=0;
+    u8 i, j, eospFramedeleted = 0;
     u8 thresholdExcedeDueToBroadcast = TRUE;
     /* it will be made it interface Specific in the future when multi interfaces are supported ,
     right now interface 0 is considered */
@@ -311,10 +311,10 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
              * packets for station record crossed the threshold limit (64 for AP supporting
              * 8 peers)
              */
-            unifi_trace(priv,UDBG3,"number of station pkts queued=  %d for sta id = %d\n", staInfo->noOfPktQueued, staInfo->aid);
+            unifi_trace(priv, UDBG3, "number of station pkts queued=  %d for sta id = %d\n", staInfo->noOfPktQueued, staInfo->aid);
             for(j = 0; j < MAX_ACCESS_CATOGORY; j++) {
                 list = &staInfo->dataPdu[j];
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_for_each_safe(listHead, placeHolder, list) {
                     tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
                     list_del(listHead);
@@ -339,7 +339,7 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
                     }
                     break;
                 }
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             }
         }
     }
@@ -347,13 +347,13 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
         /* Remove the packets from genericMulticastOrBroadCastFrames queue
          * (the max packets in driver is reached due to broadcast/multicast frames)
          */
-        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
         list_for_each_safe(listHead, placeHolder, &interfacePriv->genericMulticastOrBroadCastFrames) {
             tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
             if(eospFramedeleted){
                 tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
                 tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
-                unifi_trace(priv, UDBG1,"updating eosp for next packet hostTag:= 0x%x ",tx_q_item->hostTag);
+                unifi_trace(priv, UDBG1, "updating eosp for next packet hostTag:= 0x%x ", tx_q_item->hostTag);
                 eospFramedeleted =0;
                 break;
             }
@@ -361,7 +361,7 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
             if(tx_q_item->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK ){
                eospFramedeleted = 1;
             }
-            unifi_trace(priv,UDBG1, "freeing of multicast packets ToC = 0x%x hostTag = 0x%x \n",tx_q_item->transmissionControl,tx_q_item->hostTag);
+            unifi_trace(priv, UDBG1, "freeing of multicast packets ToC = 0x%x hostTag = 0x%x \n", tx_q_item->transmissionControl, tx_q_item->hostTag);
             list_del(listHead);
             unifi_net_data_free(priv, &tx_q_item->bulkdata);
             kfree(tx_q_item);
@@ -373,7 +373,7 @@ void verify_and_accomodate_tx_packet(unifi_priv_t *priv)
                 break;
             }
         }
-        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     }
     unifi_trace(priv, UDBG3, "leaving verify_and_accomodate_tx_packet\n");
 }
@@ -391,13 +391,13 @@ CsrResult enque_tx_data_pdu(unifi_priv_t *priv, bulk_data_param_t *bulkdata,
 
     unifi_trace(priv, UDBG5, "entering enque_tx_data_pdu\n");
     if(!list) {
-       unifi_error(priv,"List is not specified\n");
+       unifi_error(priv, "List is not specified\n");
        return CSR_RESULT_FAILURE;
     }
 
     /* Removes aged packets & adds the incoming packet */
     if (priv->noOfPktQueuedInDriver >= CSR_WIFI_DRIVER_SUPPORT_FOR_MAX_PKT_QUEUEING) {
-        unifi_trace(priv,UDBG3,"number of pkts queued=  %d \n", priv->noOfPktQueuedInDriver);
+        unifi_trace(priv, UDBG3, "number of pkts queued=  %d \n", priv->noOfPktQueuedInDriver);
         verify_and_accomodate_tx_packet(priv);
     }
 
@@ -412,7 +412,7 @@ CsrResult enque_tx_data_pdu(unifi_priv_t *priv, bulk_data_param_t *bulkdata,
     }
 
     /* disable the preemption */
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     INIT_LIST_HEAD(&tx_q_item->q);
     /* fill the tx_q structure members */
     tx_q_item->bulkdata.os_data_ptr = bulkdata->d[0].os_data_ptr;
@@ -437,7 +437,7 @@ CsrResult enque_tx_data_pdu(unifi_priv_t *priv, bulk_data_param_t *bulkdata,
 
     /* Count of packet queued in driver */
     priv->noOfPktQueuedInDriver++;
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     unifi_trace(priv, UDBG5, "leaving enque_tx_data_pdu\n");
     return CSR_RESULT_SUCCESS;
 }
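
Note: enque_tx_data_pdu() above takes priv->tx_q_lock with spin_lock_irqsave() around every list update, so the tx queues and the queued-packet counters stay consistent with contexts that cannot sleep. A minimal sketch of the pairing (field and list names are taken from the fragments above; the tail-insert call itself is assumed, since the enqueue statement falls outside the visible hunks):

    unsigned long lock_flags;

    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
    list_add_tail(&tx_q_item->q, list);   /* append the buffered packet */
    priv->noOfPktQueuedInDriver++;        /* count under the same lock  */
    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
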
@@ -655,13 +655,13 @@ void uf_handle_tim_cfm(unifi_priv_t *priv, CSR_MLME_SET_TIM_CONFIRM *cfm, u16 re
     }
 
     if (handle != CSR_WIFI_BROADCAST_OR_MULTICAST_HANDLE) {
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         if ((staRecord = ((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[handle]))) == NULL) {
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             unifi_warning(priv, "uf_handle_tim_cfm: station record is NULL  handle = %x\n", handle);
             return;
         }
-       spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+       spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
     }
     switch(timSetStatus)
     {
@@ -909,13 +909,13 @@ void update_tim(unifi_priv_t * priv, u16 aid, u8 setTim, u16 interfaceTag, u32 h
                    (u8*)&signal.SignalPrimitiveHeader.SenderProcessId);
 
     /* set The virtual interfaceIdentifier, aid, tim value */
-    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag);
+    req->VirtualInterfaceIdentifier = uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag);
     req->AssociationId = aid;
     req->TimValue = setTim;
 
 
     unifi_trace(priv, UDBG2, "update_tim:AID %x,senderIdLsb = 0x%x, handle = 0x%x, timSetStatus = %x, sender proceesID = %x \n",
-                aid,senderIdLsb, handle, timSetStatus, signal.SignalPrimitiveHeader.SenderProcessId);
+                aid, senderIdLsb, handle, timSetStatus, signal.SignalPrimitiveHeader.SenderProcessId);
 
     /* Send the signal to UniFi */
     r = ul_send_signal_unpacked(priv, &signal, bulkdata);
@@ -953,17 +953,17 @@ void process_peer_active_transition(unifi_priv_t * priv,
                                     CsrWifiRouterCtrlStaInfo_t *staRecord,
                                     u16 interfaceTag)
 {
-    int r,i;
-    u8 spaceAvail[4] = {TRUE,TRUE,TRUE,TRUE};
+    int r, i;
+    u8 spaceAvail[4] = {TRUE, TRUE, TRUE, TRUE};
     tx_buffered_packets_t * buffered_pkt = NULL;
     unsigned long lock_flags;
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
 
     unifi_trace(priv, UDBG5, "entering process_peer_active_transition\n");
 
-    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
         /* giving more priority to multicast packets so delaying unicast packets*/
-        unifi_trace(priv,UDBG2, "Multicast transmission is going on so resume unicast transmission after DTIM over\n");
+        unifi_trace(priv, UDBG2, "Multicast transmission is going on so resume unicast transmission after DTIM over\n");
 
         /* As station is active now, even though AP is not able to send frames to it
          * because of DTIM, it needs to reset the TIM here
@@ -987,12 +987,12 @@ void process_peer_active_transition(unifi_priv_t * priv,
     while((buffered_pkt=dequeue_tx_data_pdu(priv, &staRecord->mgtFrames))) {
         buffered_pkt->transmissionControl &=
                      ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,0,FALSE)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, 0, FALSE)) == -ENOSPC) {
             unifi_trace(priv, UDBG2, "p_p_a_t:(ENOSPC) Mgt Frame queueing \n");
             /* Enqueue at the head of the queue */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
             spaceAvail[3] = FALSE;
             break;
@@ -1008,7 +1008,7 @@ void process_peer_active_transition(unifi_priv_t * priv,
     if (!staRecord->timRequestPendingFlag) {
         if (staRecord->txSuspend) {
             if(staRecord->timSet == CSR_WIFI_TIM_SET) {
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             return;
         }
@@ -1025,16 +1025,16 @@ void process_peer_active_transition(unifi_priv_t * priv,
     for(i=3;i>=0;i--) {
         if(!spaceAvail[i])
             continue;
-        unifi_trace(priv, UDBG6, "p_p_a_t:data pkt sending for AC %d \n",i);
+        unifi_trace(priv, UDBG6, "p_p_a_t:data pkt sending for AC %d \n", i);
         while((buffered_pkt=dequeue_tx_data_pdu(priv, &staRecord->dataPdu[i]))) {
            buffered_pkt->transmissionControl &=
                       ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-           if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,0,FALSE)) == -ENOSPC) {
+           if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, 0, FALSE)) == -ENOSPC) {
                /* Clear the trigger bit transmission control*/
                /* Enqueue at the head of the queue */
-               spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+               spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                list_add(&buffered_pkt->q, &staRecord->dataPdu[i]);
-               spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+               spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                priv->pausedStaHandle[i]=(u8)(staRecord->assignedHandle);
                break;
            } else {
@@ -1050,7 +1050,7 @@ void process_peer_active_transition(unifi_priv_t * priv,
     if (!staRecord->timRequestPendingFlag){
         if((staRecord->timSet  == CSR_WIFI_TIM_SET) || (staRecord->timSet  == CSR_WIFI_TIM_SETTING)) {
             unifi_trace(priv, UDBG3, "p_p_a_t:resetting tim .....\n");
-            update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+            update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
         }
     }
     else
@@ -1067,7 +1067,7 @@ void process_peer_active_transition(unifi_priv_t * priv,
 
 
 
-void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm)
+void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv, u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm)
 {
     netInterface_priv_t *interfacePriv;
     u8 i;
@@ -1076,16 +1076,16 @@ void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR
 
 
     if(pkt_cfm->HostTag == interfacePriv->multicastPduHostTag) {
-         unifi_trace(priv,UDBG2,"CFM for marked Multicast Tag = %x\n",interfacePriv->multicastPduHostTag);
+         unifi_trace(priv, UDBG2, "CFM for marked Multicast Tag = %x\n", interfacePriv->multicastPduHostTag);
          interfacePriv->multicastPduHostTag = 0xffffffff;
-         resume_suspended_uapsd(priv,interfaceTag);
-         resume_unicast_buffered_frames(priv,interfaceTag);
+         resume_suspended_uapsd(priv, interfaceTag);
+         resume_unicast_buffered_frames(priv, interfaceTag);
          if(list_empty(&interfacePriv->genericMulticastOrBroadCastMgtFrames) &&
               list_empty(&interfacePriv->genericMulticastOrBroadCastFrames)) {
-            unifi_trace(priv,UDBG1,"Resetting multicastTIM");
+            unifi_trace(priv, UDBG1, "Resetting multicastTIM");
             if (!interfacePriv->bcTimSetReqPendingFlag)
             {
-                update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0xFFFFFFFF);
+                update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0xFFFFFFFF);
             }
             else
             {
@@ -1164,7 +1164,7 @@ void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR
                                                                  &send_cfm_list,
                                                                  &(staRecord->dataPdu[j]));
 
-                        uf_flush_list(priv,&(staRecord->dataPdu[j]));
+                        uf_flush_list(priv, &(staRecord->dataPdu[j]));
                     }
 
                     send_auto_ma_packet_confirm(priv, staRecord->interfacePriv, &send_cfm_list);
@@ -1469,7 +1469,7 @@ static int update_macheader(unifi_priv_t *priv, struct sk_buff *skb,
     }
 
     /* prepare the complete skb, by pushing the MAC header to the beginning of the skb->data */
-    unifi_trace(priv, UDBG5, "updated Mac Header: %d \n",macHeaderLengthInBytes);
+    unifi_trace(priv, UDBG5, "updated Mac Header: %d \n", macHeaderLengthInBytes);
     memcpy(bufPtr, macHeaderBuf, macHeaderLengthInBytes);
 
     unifi_trace(priv, UDBG5, "leaving the update_macheader function\n");
@@ -1515,7 +1515,7 @@ uf_ap_process_data_pdu(unifi_priv_t *priv, struct sk_buff *skb,
     CsrWifiRouterCtrlStaInfo_t *dstStaInfo = NULL;
     netInterface_priv_t *interfacePriv;
 
-    unifi_trace(priv, UDBG5, "entering  uf_ap_process_data_pdu %d\n",macHeaderLengthInBytes);
+    unifi_trace(priv, UDBG5, "entering  uf_ap_process_data_pdu %d\n", macHeaderLengthInBytes);
     /* InterfaceTag validation from MA_PACKET.indication */
     if (interfaceTag >= CSR_WIFI_NUM_INTERFACES) {
         unifi_trace(priv, UDBG1, "Interface Tag is Invalid in uf_ap_process_data_pdu\n");
@@ -1608,7 +1608,7 @@ uf_ap_process_data_pdu(unifi_priv_t *priv, struct sk_buff *skb,
     unifi_trace(priv, UDBG3, "Mac Header updated...calling uf_process_ma_packet_req \n");
 
     /* Packet is ready to send to unifi ,transmissionControl = 0x0004, confirmation is not needed for data packets */
-    if (uf_process_ma_packet_req(priv,  ehdr->h_dest, 0xffffffff, interfaceTag, CSR_NO_CONFIRM_REQUIRED, (CSR_RATE)0,priority, priv->netdev_client->sender_id, bulkdata)) {
+    if (uf_process_ma_packet_req(priv, ehdr->h_dest, 0xffffffff, interfaceTag, CSR_NO_CONFIRM_REQUIRED, (CSR_RATE)0, priority, priv->netdev_client->sender_id, bulkdata)) {
         if (sendToNetdev) {
             unifi_trace(priv, UDBG1, "In uf_ap_process_data_pdu, (Packet Drop) uf_process_ma_packet_req failed. freeing skb_copy data (original data sent to Netdev)\n");
             /*  Free's the skb_copy(skbPtr) data since packet processing failed */
@@ -1750,7 +1750,7 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                         /* push the packet to the unifi if list is empty (if packet lost how to re-enque) */
                         if (list_empty(&interfacePriv->genericMgtFrames)) {
 #ifdef CSR_SUPPORT_SME
-                            if(!(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+                            if(!(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
 #endif
 
                             unifi_trace(priv, UDBG3, "genericMgtFrames list is empty uf_process_ma_packet_req \n");
@@ -1765,8 +1765,8 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
 #ifdef CSR_SUPPORT_SME
                             }else{
                                 list = &interfacePriv->genericMgtFrames;
-                                unifi_trace(priv, UDBG3, "genericMgtFrames queue empty and dtim started\n hosttag is 0x%x,\n",signal.u.MaPacketRequest.HostTag);
-                                update_eosp_to_head_of_broadcast_list_head(priv,interfaceTag);
+                                unifi_trace(priv, UDBG3, "genericMgtFrames queue empty and dtim started\n hosttag is 0x%x,\n", signal.u.MaPacketRequest.HostTag);
+                                update_eosp_to_head_of_broadcast_list_head(priv, interfaceTag);
                            }
 #endif
                         } else {
@@ -1776,15 +1776,15 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                         }
                     } else {
                         /* check peer power state */
-                        if (queuePacketDozing || !list_empty(&staRecord->mgtFrames) || IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+                        if (queuePacketDozing || !list_empty(&staRecord->mgtFrames) || IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
                             /* peer is in dozing mode, so queue packet in mgt frame list of station record */
                            /*if multicast traffic is going on, buffer the unicast packets*/
                             list = &staRecord->mgtFrames;
 
                             unifi_trace(priv, UDBG1, "staRecord->MgtFrames list empty? = %s, handle = %d, queuePacketDozing = %d\n",
                                         (list_empty(&staRecord->mgtFrames))? "YES": "NO", staRecord->assignedHandle, queuePacketDozing);
-                            if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)){
-                                update_eosp_to_head_of_broadcast_list_head(priv,interfaceTag);
+                            if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)){
+                                update_eosp_to_head_of_broadcast_list_head(priv, interfaceTag);
                             }
 
                         } else {
@@ -1794,7 +1794,7 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                                 /* requeue the failed packet to staRecord->mgtFrames with same position */
                                 list = &staRecord->mgtFrames;
                                 requeueOnSamePos = TRUE;
-                                unifi_trace(priv, UDBG1, "(ENOSPC) Sending MgtFrames Failed handle = %d so buffering\n",staRecord->assignedHandle);
+                                unifi_trace(priv, UDBG1, "(ENOSPC) Sending MgtFrames Failed handle = %d so buffering\n", staRecord->assignedHandle);
                                 priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                             } else if (result) {
                                 status = CSR_RESULT_FAILURE;
@@ -1827,11 +1827,11 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                     if(!staRecord) {
                         unifi_error(priv, "In %s unicast but staRecord = NULL\n", __FUNCTION__);
                         return CSR_RESULT_FAILURE;
-                    } else if (queuePacketDozing || isRouterBufferEnabled(priv,priority_q)|| !list_empty(&staRecord->dataPdu[priority_q]) || IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+                    } else if (queuePacketDozing || isRouterBufferEnabled(priv, priority_q)|| !list_empty(&staRecord->dataPdu[priority_q]) || IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
                         /* peer is in dozing mode, so queue packet in mgt frame list of station record */
                         /* if multicast traffic is going on, buffet the unicast packets */
                         unifi_trace(priv, UDBG2, "Enqueued to staRecord->dataPdu[%d] queuePacketDozing=%d,\
-                                Buffering enabled = %d \n", priority_q,queuePacketDozing,isRouterBufferEnabled(priv,priority_q));
+                                Buffering enabled = %d \n", priority_q, queuePacketDozing, isRouterBufferEnabled(priv, priority_q));
                         list = &staRecord->dataPdu[priority_q];
                     } else {
                         unifi_trace(priv, UDBG5, "staRecord->dataPdu[%d] list is empty uf_process_ma_packet_req \n", priority_q);
@@ -1839,12 +1839,12 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                         result = ul_send_signal_unpacked(priv, &signal, bulkdata);
                         if(result == -ENOSPC) {
                             /* requeue the failed packet to staRecord->dataPdu[priority_q] with same position */
-                            unifi_trace(priv, UDBG1, "(ENOSPC) Sending Unicast DataPDU to queue %d Failed so buffering\n",priority_q);
+                            unifi_trace(priv, UDBG1, "(ENOSPC) Sending Unicast DataPDU to queue %d Failed so buffering\n", priority_q);
                             requeueOnSamePos = TRUE;
                             list = &staRecord->dataPdu[priority_q];
                             priv->pausedStaHandle[priority_q]=(u8)(staRecord->assignedHandle);
-                            if(!isRouterBufferEnabled(priv,priority_q)) {
-                                unifi_error(priv,"Buffering Not enabled for queue %d \n",priority_q);
+                            if(!isRouterBufferEnabled(priv, priority_q)) {
+                                unifi_error(priv, "Buffering Not enabled for queue %d \n", priority_q);
                             }
                         } else if (result) {
                             status = CSR_RESULT_FAILURE;
@@ -1869,19 +1869,19 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
             unifi_error(priv, "unrecognized frame type\n");
     }
     if(list) {
-        status = enque_tx_data_pdu(priv, bulkdata,list, &signal,requeueOnSamePos);
+        status = enque_tx_data_pdu(priv, bulkdata, list, &signal, requeueOnSamePos);
         /* Record no. of packet queued for each peer */
         if (staRecord && (pktType == CSR_WIFI_UNICAST_PDU) && (!status)) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staRecord->noOfPktQueued++;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
         else if ((pktType == CSR_WIFI_MULTICAST_PDU) && (!status))
         {
             /* If broadcast Tim is set && queuing is successful, then only update TIM */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             interfacePriv->noOfbroadcastPktQueued++;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     /* If broadcast Tim is set && queuing is successful, then only update TIM */
@@ -1889,7 +1889,7 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
         unifi_trace(priv, UDBG3, "tim set due to broadcast pkt\n");
         if (!interfacePriv->bcTimSetReqPendingFlag)
         {
-            update_tim(priv,0,CSR_WIFI_TIM_SET,interfaceTag, handle);
+            update_tim(priv, 0, CSR_WIFI_TIM_SET, interfaceTag, handle);
         }
         else
         {
@@ -1909,7 +1909,7 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                    !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION])) {
                     unifi_trace(priv, UDBG3, "tim set due to unicast pkt & peer in powersave\n");
                     if (!staRecord->timRequestPendingFlag){
-                        update_tim(priv,staRecord->aid,1,interfaceTag, handle);
+                        update_tim(priv, staRecord->aid, 1, interfaceTag, handle);
                     }
                     else
                     {
@@ -1929,7 +1929,7 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
                 if (uf_is_more_data_for_non_delivery_ac(staRecord) || (allDeliveryEnabled && dataAvailable)
                     || (!list_empty(&staRecord->mgtFrames))) {
                     if (!staRecord->timRequestPendingFlag) {
-                        update_tim(priv,staRecord->aid,1,interfaceTag, handle);
+                        update_tim(priv, staRecord->aid, 1, interfaceTag, handle);
                     }
                     else
                     {
@@ -1945,8 +1945,8 @@ CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,
         }
     }
 
-    if((list) && (pktType == CSR_WIFI_UNICAST_PDU && !queuePacketDozing) && !(isRouterBufferEnabled(priv,priority_q)) && !(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
-        unifi_trace(priv, UDBG2, "buffering cleared for queue = %d So resending buffered frames\n",priority_q);
+    if((list) && (pktType == CSR_WIFI_UNICAST_PDU && !queuePacketDozing) && !(isRouterBufferEnabled(priv, priority_q)) && !(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
+        unifi_trace(priv, UDBG2, "buffering cleared for queue = %d So resending buffered frames\n", priority_q);
         uf_send_buffered_frames(priv, priority_q);
     }
     unifi_trace(priv, UDBG5, "leaving uf_process_ma_packet_req \n");
@@ -2022,23 +2022,23 @@ u8 send_multicast_frames(unifi_priv_t *priv, u16 interfaceTag)
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
     u32 hostTag = 0xffffffff;
 
-    if(!isRouterBufferEnabled(priv,UNIFI_TRAFFIC_Q_VO)) {
-        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMulticastOrBroadCastMgtFrames))) {
+    if(!isRouterBufferEnabled(priv, UNIFI_TRAFFIC_Q_VO)) {
+        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMulticastOrBroadCastMgtFrames))) {
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK);
             moreData = (buffered_pkt->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK)?FALSE:TRUE;
 
 
-            unifi_trace(priv,UDBG2,"DTIM Occurred for interface:sending Mgt packet %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for interface:sending Mgt packet %d\n", interfaceTag);
 
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,moreData,FALSE)) == -ENOSPC) {
-               unifi_trace(priv,UDBG1,"frame_and_send_queued_pdu failed with ENOSPC for host tag = %x\n", buffered_pkt->hostTag);
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, moreData, FALSE)) == -ENOSPC) {
+               unifi_trace(priv, UDBG1, "frame_and_send_queued_pdu failed with ENOSPC for host tag = %x\n", buffered_pkt->hostTag);
                /* Enqueue at the head of the queue */
-               spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+               spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                list_add(&buffered_pkt->q, &interfacePriv->genericMulticastOrBroadCastMgtFrames);
-               spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+               spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                break;
             } else {
-                unifi_trace(priv,UDBG1,"send_multicast_frames: Send genericMulticastOrBroadCastMgtFrames (%x, %x)\n",
+                unifi_trace(priv, UDBG1, "send_multicast_frames: Send genericMulticastOrBroadCastMgtFrames (%x, %x)\n",
                                         buffered_pkt->hostTag,
                                         r);
                 if(r) {
@@ -2051,35 +2051,35 @@ u8 send_multicast_frames(unifi_priv_t *priv, u16 interfaceTag)
                         hostTag = buffered_pkt->hostTag;
                         pduSent++;
                     } else {
-                        send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_UNSPECIFIED_FAILURE);
+                        send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_UNSPECIFIED_FAILURE);
                     }
                 }
                 /* Buffered frame sent successfully */
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 interfacePriv->noOfbroadcastPktQueued--;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 kfree(buffered_pkt);
            }
 
         }
     }
-    if(!isRouterBufferEnabled(priv,UNIFI_TRAFFIC_Q_CONTENTION)) {
-        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMulticastOrBroadCastFrames))) {
+    if(!isRouterBufferEnabled(priv, UNIFI_TRAFFIC_Q_CONTENTION)) {
+        while((interfacePriv->dtimActive)&& (buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMulticastOrBroadCastFrames))) {
             buffered_pkt->transmissionControl |= TRANSMISSION_CONTROL_TRIGGER_MASK;
             moreData = (buffered_pkt->transmissionControl & TRANSMISSION_CONTROL_EOSP_MASK)?FALSE:TRUE;
 
 
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &interfacePriv->genericMulticastOrBroadCastFrames);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 break;
             } else {
                 if(r) {
-                    unifi_trace(priv,UDBG1,"send_multicast_frames: Send genericMulticastOrBroadCastFrame failed (%x, %x)\n",
+                    unifi_trace(priv, UDBG1, "send_multicast_frames: Send genericMulticastOrBroadCastFrame failed (%x, %x)\n",
                                             buffered_pkt->hostTag,
                                             r);
                     unifi_net_data_free(priv, &buffered_pkt->bulkdata);
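The loop above is one instance of a pattern this file repeats for every queue: dequeue one buffered packet, try to hand it to the HIP layer, and on -ENOSPC push it back at the head of the same list under tx_q_lock so frame ordering is preserved, then stop draining until buffering is re-enabled. A compressed sketch of that shape; try_send() and the dequeue callback are hypothetical stand-ins for frame_and_send_queued_pdu() and dequeue_tx_data_pdu():

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/errno.h>

    struct pkt { struct list_head q; };

    int try_send(struct pkt *p);   /* hypothetical; may return -ENOSPC */

    static void drain_queue(spinlock_t *lock, struct list_head *txq,
                            struct pkt *(*dequeue)(struct list_head *))
    {
        struct pkt *p;
        unsigned long flags;

        while ((p = dequeue(txq)) != NULL) {
            if (try_send(p) == -ENOSPC) {
                /* No HIP slot free: requeue at the HEAD so ordering
                 * is kept, and stop until buffering is re-enabled. */
                spin_lock_irqsave(lock, flags);
                list_add(&p->q, txq);
                spin_unlock_irqrestore(lock, flags);
                break;
            }
            /* sent (or failed fatally): packet now owned elsewhere */
        }
    }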
@@ -2090,26 +2090,26 @@ u8 send_multicast_frames(unifi_priv_t *priv, u16 interfaceTag)
                         pduSent ++;
                         hostTag = buffered_pkt->hostTag;
                     } else {
-                        send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_UNSPECIFIED_FAILURE);
+                        send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_UNSPECIFIED_FAILURE);
                     }
                 }
                 /* Buffered frame sent successfully */
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 interfacePriv->noOfbroadcastPktQueued--;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 kfree(buffered_pkt);
             }
         }
     }
     if((interfacePriv->dtimActive == FALSE)) {
         /* Record the host Tag*/
-        unifi_trace(priv,UDBG2,"send_multicast_frames: Recorded hostTag of EOSP packet: = 0x%x\n",hostTag);
+        unifi_trace(priv, UDBG2, "send_multicast_frames: Recorded hostTag of EOSP packet = 0x%x\n", hostTag);
         interfacePriv->multicastPduHostTag = hostTag;
     }
     return pduSent;
 }
 #endif
-void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata,
+void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv, u8 *sigdata,
                                         u32 siglen)
 {
 #ifdef CSR_SUPPORT_SME
@@ -2148,15 +2148,15 @@ void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata,
             /* This condition can occur because of a potential race where the
                TIM is not yet reset as host is waiting for confirm but it is sent
                by firmware and DTIM occurs*/
-            unifi_notice(priv,"ma_vif_availibility_ind recevied for multicast but queues are empty%d\n",interfaceTag);
-            send_vif_availibility_rsp(priv,ind->VirtualInterfaceIdentifier,CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
+            unifi_notice(priv, "ma_vif_availibility_ind received for multicast but queues are empty, interface %d\n", interfaceTag);
+            send_vif_availibility_rsp(priv, ind->VirtualInterfaceIdentifier, CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
             interfacePriv->dtimActive = FALSE;
             if(interfacePriv->multicastPduHostTag == 0xffffffff) {
-                unifi_notice(priv,"ma_vif_availibility_ind recevied for multicast but queues are empty%d\n",interfaceTag);
+                unifi_notice(priv, "ma_vif_availibility_ind received for multicast but queues are empty, interface %d\n", interfaceTag);
                 /* This may be an extra request in very rare race conditions but it is fine as it would atleast remove the potential lock up */
                 if (!interfacePriv->bcTimSetReqPendingFlag)
                 {
-                    update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0xFFFFFFFF);
+                    update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0xFFFFFFFF);
                 }
                 else
                 {
@@ -2171,23 +2171,23 @@ void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata,
             return;
         }
         if(interfacePriv->dtimActive) {
-            unifi_trace(priv,UDBG2,"DTIM Occurred for already active DTIM interface %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for already active DTIM interface %d\n", interfaceTag);
             return;
         } else {
-            unifi_trace(priv,UDBG2,"DTIM Occurred for interface %d\n",interfaceTag);
+            unifi_trace(priv, UDBG2, "DTIM Occurred for interface %d\n", interfaceTag);
             if(list_empty(&interfacePriv->genericMulticastOrBroadCastFrames)) {
-                set_eosp_transmit_ctrl(priv,&interfacePriv->genericMulticastOrBroadCastMgtFrames);
+                set_eosp_transmit_ctrl(priv, &interfacePriv->genericMulticastOrBroadCastMgtFrames);
             } else {
-                set_eosp_transmit_ctrl(priv,&interfacePriv->genericMulticastOrBroadCastFrames);
+                set_eosp_transmit_ctrl(priv, &interfacePriv->genericMulticastOrBroadCastFrames);
             }
         }
         interfacePriv->dtimActive = TRUE;
-        pduSent = send_multicast_frames(priv,interfaceTag);
+        pduSent = send_multicast_frames(priv, interfaceTag);
     }
     else {
-        unifi_error(priv,"Interface switching is not supported %d\n",interfaceTag);
+        unifi_error(priv, "Interface switching is not supported %d\n", interfaceTag);
         resultCode = CSR_RC_NOT_SUPPORTED;
-        send_vif_availibility_rsp(priv,ind->VirtualInterfaceIdentifier,CSR_RC_NOT_SUPPORTED);
+        send_vif_availibility_rsp(priv, ind->VirtualInterfaceIdentifier, CSR_RC_NOT_SUPPORTED);
     }
 #endif
 }
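Summarizing the DTIM handling above: when the VIF-availability indication arrives for the multicast VIF, the driver either (a) answers immediately with CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES and resets the broadcast TIM when both broadcast queues are empty, (b) ignores the indication when a DTIM service is already in progress, or (c) marks EOSP on the queued tail frame and starts transmitting. IS_DTIM_ACTIVE() then keeps reporting "active" until the recorded hostTag of the EOSP frame is confirmed. A small standalone model of the three-way decision (names hypothetical):

    #include <stdbool.h>

    enum dtim_action {
        DTIM_REPLY_EMPTY,   /* rsp NO_BUFFERED_..._FRAMES, reset TIM */
        DTIM_IGNORE,        /* a DTIM service period is already running */
        DTIM_START,         /* set EOSP on the tail frame, start sending */
    };

    static enum dtim_action classify_dtim_ind(bool queues_empty,
                                              bool dtim_active)
    {
        if (queues_empty)
            return DTIM_REPLY_EMPTY;
        if (dtim_active)
            return DTIM_IGNORE;
        return DTIM_START;
    }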
@@ -2204,12 +2204,12 @@ static u8 uf_is_more_data_for_delivery_ac(unifi_priv_t *priv, CsrWifiRouterCtrlS
         if(((staRecord->powersaveMode[i]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)
              ||(staRecord->powersaveMode[i]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))
              &&(!list_empty(&staRecord->dataPdu[i]))) {
-            unifi_trace(priv,UDBG2,"uf_is_more_data_for_delivery_ac: Data Available AC = %d\n", i);
+            unifi_trace(priv, UDBG2, "uf_is_more_data_for_delivery_ac: Data Available AC = %d\n", i);
             return TRUE;
         }
     }
 
-    unifi_trace(priv,UDBG2,"uf_is_more_data_for_delivery_ac: Data NOT Available \n");
+    unifi_trace(priv, UDBG2, "uf_is_more_data_for_delivery_ac: Data NOT Available \n");
     return FALSE;
 }
 
@@ -2222,12 +2222,12 @@ static u8 uf_is_more_data_for_usp_delivery(unifi_priv_t *priv, CsrWifiRouterCtrl
         if(((staRecord->powersaveMode[i]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)
              ||(staRecord->powersaveMode[i]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))
              &&(!list_empty(&staRecord->dataPdu[i]))) {
-            unifi_trace(priv,UDBG2,"uf_is_more_data_for_usp_delivery: Data Available AC = %d\n", i);
+            unifi_trace(priv, UDBG2, "uf_is_more_data_for_usp_delivery: Data Available AC = %d\n", i);
             return TRUE;
         }
     }
 
-    unifi_trace(priv,UDBG2,"uf_is_more_data_for_usp_delivery: Data NOT Available \n");
+    unifi_trace(priv, UDBG2, "uf_is_more_data_for_usp_delivery: Data NOT Available \n");
     return FALSE;
 }
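Both helpers above apply the same rule: an access category contributes "more data" only if it is delivery-enabled (CSR_WIFI_AC_DELIVERY_ONLY_ENABLE or CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED) and its dataPdu list is non-empty. A standalone restatement, with hypothetical enum values mirroring the two CSR constants actually tested:

    #include <stdbool.h>

    enum ps_mode {
        AC_LEGACY,
        AC_DELIVERY_ONLY,           /* CSR_WIFI_AC_DELIVERY_ONLY_ENABLE */
        AC_TRIGGER_ONLY,
        AC_TRIGGER_AND_DELIVERY,    /* CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED */
    };
    #define NUM_AC 4

    static bool more_data_for_delivery(const enum ps_mode mode[NUM_AC],
                                       const bool queue_empty[NUM_AC])
    {
        int i;

        for (i = 0; i < NUM_AC; i++) {
            if ((mode[i] == AC_DELIVERY_ONLY ||
                 mode[i] == AC_TRIGGER_AND_DELIVERY) && !queue_empty[i])
                return true;
        }
        return false;
    }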
 
@@ -2272,18 +2272,18 @@ void uf_send_buffered_data_from_delivery_ac(unifi_priv_t *priv,
         return;
     }
     while((buffered_pkt=dequeue_tx_data_pdu(priv, txList))) {
-        if((IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+        if((IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
             unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: DTIM Active, suspend UAPSD, staId: 0x%x\n",
                         staInfo->aid);
 
             /* Once resume called, the U-APSD delivery operation will resume */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uspSuspend = TRUE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             /* re-queueing the packet as DTIM started */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-            list_add(&buffered_pkt->q,txList);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+            list_add(&buffered_pkt->q, txList);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             break;
         }
 
@@ -2315,20 +2315,20 @@ void uf_send_buffered_data_from_delivery_ac(unifi_priv_t *priv,
             unifi_warning(priv, "uf_send_buffered_data_from_delivery_ac: non U-APSD !!! \n");
         }
 
-        unifi_trace(priv,UDBG2,"uf_send_buffered_data_from_delivery_ac : MoreData:%d, EOSP:%d\n",moreData,eosp);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac : MoreData:%d, EOSP:%d\n", moreData, eosp);
 
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,moreData,eosp)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, moreData, eosp)) == -ENOSPC) {
 
             unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: UASPD suspended, ENOSPC in hipQ=%x\n", queue);
 
             /* Once resume called, the U-APSD delivery operation will resume */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uspSuspend = TRUE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-            list_add(&buffered_pkt->q,txList);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+            list_add(&buffered_pkt->q, txList);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             priv->pausedStaHandle[queue]=(u8)(staInfo->assignedHandle);
             break;
         } else {
@@ -2337,17 +2337,17 @@ void uf_send_buffered_data_from_delivery_ac(unifi_priv_t *priv,
                 unifi_net_data_free(priv, &buffered_pkt->bulkdata);
             }
             kfree(buffered_pkt);
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->noOfSpFramesSent++;
             if((!moreData) || (staInfo->noOfSpFramesSent == staInfo->maxSpLength)) {
                 unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_delivery_ac: Terminating USP\n");
                 staInfo->uapsdActive = FALSE;
                 staInfo->uspSuspend = FALSE;
                 staInfo->noOfSpFramesSent = 0;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 break;
             }
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }
     }
     unifi_trace(priv, UDBG2, "--uf_send_buffered_data_from_delivery_ac, active=%x\n", staInfo->uapsdActive);
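The service-period bookkeeping above follows 802.11 U-APSD: each frame delivered inside the unscheduled service period bumps noOfSpFramesSent, and the period terminates (EOSP) as soon as the queue runs dry or the peer's negotiated Max SP Length is reached. A standalone model of just the termination rule:

    #include <stdbool.h>

    struct usp_state {
        unsigned int sent;     /* noOfSpFramesSent */
        unsigned int max_sp;   /* maxSpLength negotiated by the peer */
        bool active;           /* uapsdActive */
    };

    /* Returns true when this delivery terminates the service period. */
    static bool usp_frame_delivered(struct usp_state *u, bool more_data)
    {
        u->sent++;
        if (!more_data || u->sent == u->max_sp) {
            u->active = false;   /* EOSP */
            u->sent = 0;
            return true;
        }
        return false;
    }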
@@ -2364,25 +2364,25 @@ void uf_send_buffered_data_from_ac(unifi_priv_t *priv,
     u8 moreData = FALSE;
     s8 r =0;
 
-    unifi_trace(priv,UDBG2,"uf_send_buffered_data_from_ac :\n");
+    unifi_trace(priv, UDBG2, "uf_send_buffered_data_from_ac :\n");
 
-    while(!isRouterBufferEnabled(priv,queue) &&
+    while(!isRouterBufferEnabled(priv, queue) &&
                     ((buffered_pkt=dequeue_tx_data_pdu(priv, txList))!=NULL)){
 
         buffered_pkt->transmissionControl &=
                  ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
 
-        unifi_trace(priv,UDBG3,"uf_send_buffered_data_from_ac : MoreData:%d, EOSP:%d\n",moreData,eosp);
+        unifi_trace(priv, UDBG3, "uf_send_buffered_data_from_ac : MoreData:%d, EOSP:%d\n", moreData, eosp);
 
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,moreData,eosp)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, moreData, eosp)) == -ENOSPC) {
            /* Enqueue at the head of the queue */
-           spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
-           list_add(&buffered_pkt->q,txList);
-           spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+           spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
+           list_add(&buffered_pkt->q, txList);
+           spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
            if(staInfo != NULL){
               priv->pausedStaHandle[queue]=(u8)(staInfo->assignedHandle);
            }
-           unifi_trace(priv,UDBG3," uf_send_buffered_data_from_ac: PDU sending failed .. no space for queue %d \n",queue);
+           unifi_trace(priv, UDBG3, " uf_send_buffered_data_from_ac: PDU sending failed .. no space for queue %d \n", queue);
            } else {
             if(r){
                 /* the PDU failed where we can't do any thing so free the storage */
@@ -2394,10 +2394,10 @@ void uf_send_buffered_data_from_ac(unifi_priv_t *priv,
 
 }
 
-void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
+void uf_send_buffered_frames(unifi_priv_t *priv, unifi_TrafficQueue q)
 {
     u16 interfaceTag = GET_ACTIVE_INTERFACE_TAG(priv);
-    u32 startIndex=0,endIndex=0;
+    u32 startIndex=0, endIndex=0;
     CsrWifiRouterCtrlStaInfo_t * staInfo = NULL;
     u8 queue;
     u8 moreData = FALSE;
@@ -2412,14 +2412,14 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
 
     if(interfacePriv->dtimActive) {
         /* this function updates dtimActive*/
-        send_multicast_frames(priv,interfaceTag);
+        send_multicast_frames(priv, interfaceTag);
         if(!interfacePriv->dtimActive) {
             moreData = (!list_empty(&interfacePriv->genericMulticastOrBroadCastMgtFrames) ||
              !list_empty(&interfacePriv->genericMulticastOrBroadCastFrames));
             if(!moreData) {
                 if (!interfacePriv->bcTimSetReqPendingFlag)
                 {
-                    update_tim(priv,0,CSR_WIFI_TIM_RESET,interfaceTag, 0XFFFFFFFF);
+                    update_tim(priv, 0, CSR_WIFI_TIM_RESET, interfaceTag, 0XFFFFFFFF);
                 }
                 else
                 {
@@ -2436,8 +2436,8 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
                         !list_empty(&interfacePriv->genericMulticastOrBroadCastFrames));
            if(!moreData) {
                /* This should never happen but if it happens, we need a way out */
-               unifi_error(priv,"ERROR: No More Data but DTIM is active sending Response\n");
-               send_vif_availibility_rsp(priv,uf_get_vif_identifier(interfacePriv->interfaceMode,interfaceTag),CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
+               unifi_error(priv, "ERROR: no more data but DTIM is active, sending response\n");
+               send_vif_availibility_rsp(priv, uf_get_vif_identifier(interfacePriv->interfaceMode, interfaceTag), CSR_RC_NO_BUFFERED_BROADCAST_MULTICAST_FRAMES);
                interfacePriv->dtimActive = FALSE;
            }
         }
@@ -2450,9 +2450,9 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
     if(queue == UNIFI_TRAFFIC_Q_VO) {
 
 
-        unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying mgt from queue=%d\n",queue);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying mgt from queue=%d\n", queue);
         for(startIndex= 0; startIndex < UNIFI_MAX_CONNECTIONS;startIndex++) {
-            staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+            staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
             if(!staInfo ) {
                 continue;
             } else if((staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE)
@@ -2464,31 +2464,31 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
                                &&(staInfo->uapsdActive == FALSE)){
                             /*Non-UAPSD case push the management frames out*/
                if(!list_empty(&staInfo->mgtFrames)){
-                    uf_send_buffered_data_from_ac(priv,staInfo, UNIFI_TRAFFIC_Q_VO, &staInfo->mgtFrames);
+                    uf_send_buffered_data_from_ac(priv, staInfo, UNIFI_TRAFFIC_Q_VO, &staInfo->mgtFrames);
                 }
             }
 
-            if(isRouterBufferEnabled(priv,queue)) {
-                unifi_notice(priv,"uf_send_buffered_frames : No space Left for queue = %d\n",queue);
+            if(isRouterBufferEnabled(priv, queue)) {
+                unifi_notice(priv, "uf_send_buffered_frames : No space Left for queue = %d\n", queue);
                 break;
             }
         }
         /*push generic management frames out*/
         if(!list_empty(&interfacePriv->genericMgtFrames)) {
-            unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying generic mgt from queue=%d\n",queue);
-            uf_send_buffered_data_from_ac(priv,staInfo, UNIFI_TRAFFIC_Q_VO, &interfacePriv->genericMgtFrames);
+            unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying generic mgt from queue=%d\n", queue);
+            uf_send_buffered_data_from_ac(priv, staInfo, UNIFI_TRAFFIC_Q_VO, &interfacePriv->genericMgtFrames);
         }
     }
 
 
-    unifi_trace(priv,UDBG2,"uf_send_buffered_frames : Resume called for Queue=%d\n",queue);
-    unifi_trace(priv,UDBG2,"uf_send_buffered_frames : start=%d end=%d\n",startIndex,endIndex);
+    unifi_trace(priv, UDBG2, "uf_send_buffered_frames : Resume called for Queue=%d\n", queue);
+    unifi_trace(priv, UDBG2, "uf_send_buffered_frames : start=%d end=%d\n", startIndex, endIndex);
 
     startIndex = priv->pausedStaHandle[queue];
     endIndex = (startIndex + UNIFI_MAX_CONNECTIONS -1) % UNIFI_MAX_CONNECTIONS;
 
     while(startIndex != endIndex) {
-        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
         if(!staInfo) {
             startIndex ++;
             if(startIndex >= UNIFI_MAX_CONNECTIONS) {
@@ -2504,7 +2504,7 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
             continue;
         }
         /* Peer is active or U-APSD is active so send PDUs to the peer */
-        unifi_trace(priv,UDBG2,"uf_send_buffered_frames : trying data from queue=%d\n",queue);
+        unifi_trace(priv, UDBG2, "uf_send_buffered_frames : trying data from queue=%d\n", queue);
 
 
         if((staInfo != NULL)&&(staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE)
@@ -2520,7 +2520,7 @@ void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue q)
            startIndex = 0;
         }
     }
-    if(isRouterBufferEnabled(priv,queue)) {
+    if(isRouterBufferEnabled(priv, queue)) {
         priv->pausedStaHandle[queue] = endIndex;
     } else {
         priv->pausedStaHandle[queue] = 0;
@@ -2561,7 +2561,7 @@ u8 uf_is_more_data_for_non_delivery_ac(CsrWifiRouterCtrlStaInfo_t *staRecord)
 }
 
 
-int uf_process_station_records_for_sending_data(unifi_priv_t *priv,u16 interfaceTag,
+int uf_process_station_records_for_sending_data(unifi_priv_t *priv, u16 interfaceTag,
                                                  CsrWifiRouterCtrlStaInfo_t *srcStaInfo,
                                                  CsrWifiRouterCtrlStaInfo_t *dstStaInfo)
 {
@@ -2647,10 +2647,10 @@ static void uf_handle_uspframes_delivery(unifi_priv_t * priv, CsrWifiRouterCtrlS
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     staInfo->uapsdActive = TRUE;
     staInfo->uspSuspend = FALSE;
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
     if(((staInfo->powersaveMode[UNIFI_TRAFFIC_Q_VO]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED)||
         (staInfo->powersaveMode[UNIFI_TRAFFIC_Q_VO]==CSR_WIFI_AC_DELIVERY_ONLY_ENABLE))
@@ -2666,9 +2666,9 @@ static void uf_handle_uspframes_delivery(unifi_priv_t * priv, CsrWifiRouterCtrlS
          * NOTE: If we have sent Mgt frame also, we must send QNULL followed to terminate USP
          */
         if (!staInfo->uspSuspend) {
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             staInfo->uapsdActive = FALSE;
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
             unifi_trace(priv, UDBG2, "uf_handle_uspframes_delivery: sending QNull for trigger\n");
             uf_send_qos_null(priv, interfaceTag, staInfo->peerMacAddress.a, (CSR_PRIORITY) staInfo->triggerFramePriority, staInfo);
@@ -2687,12 +2687,12 @@ static void uf_handle_uspframes_delivery(unifi_priv_t * priv, CsrWifiRouterCtrlS
             }
 
             if ((!staInfo->uapsdActive) ||
-                    (staInfo->uspSuspend && IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag))) {
+                    (staInfo->uspSuspend && IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag))) {
                 /* If DTIM active found on one AC, No need to parse the remaining AC's
                  * as USP suspended. Break out of loop
                  */
                 unifi_trace(priv, UDBG2, "uf_handle_uspframes_delivery: suspend=%x,  DTIM=%x, USP terminated=%s\n",
-                           staInfo->uspSuspend, IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag),
+                           staInfo->uspSuspend, IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag),
                            staInfo->uapsdActive?"NO":"YES");
                 break;
             }
@@ -2734,9 +2734,9 @@ void uf_process_wmm_deliver_ac_uapsd(unifi_priv_t * priv,
 
     if((srcStaInfo->powersaveMode[priority_q]==CSR_WIFI_AC_TRIGGER_ONLY_ENABLED)
         ||(srcStaInfo->powersaveMode[priority_q]==CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED)) {
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         srcStaInfo->triggerFramePriority = priority;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         unifi_trace(priv, UDBG2, "uf_process_wmm_deliver_ac_uapsd: trigger frame, Begin U-APSD, triggerQ=%x\n", priority_q);
         uf_handle_uspframes_delivery(priv, srcStaInfo, interfaceTag);
     }
@@ -2744,7 +2744,7 @@ void uf_process_wmm_deliver_ac_uapsd(unifi_priv_t * priv,
 }
 
 
-void uf_send_qos_null(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
+void uf_send_qos_null(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
 {
     bulk_data_param_t bulkdata;
     CsrResult csrResult;
@@ -2806,14 +2806,14 @@ void uf_send_qos_null(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRI
 
     r = ul_send_signal_unpacked(priv, &signal, &bulkdata);
     if(r) {
-        unifi_error(priv, "failed to send QOS data null packet result: %d\n",r);
+        unifi_error(priv, "failed to send QOS data null packet result: %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
     }
 
     return;
 
 }
-void uf_send_nulldata(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
+void uf_send_nulldata(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo)
 {
     bulk_data_param_t bulkdata;
     CsrResult csrResult;
@@ -2882,14 +2882,14 @@ void uf_send_nulldata(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRI
     if(r == -ENOSPC) {
         unifi_trace(priv, UDBG1, "uf_send_nulldata: ENOSPC Requeue the Null frame\n");
         enque_tx_data_pdu(priv, &bulkdata, &srcStaInfo->dataPdu[priority_q], &signal, 1);
-        spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+        spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
         srcStaInfo->noOfPktQueued++;
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
 
     }
     if(r && r != -ENOSPC){
-        unifi_error(priv, "uf_send_nulldata: Failed to send Null frame Error = %d\n",r);
+        unifi_error(priv, "uf_send_nulldata: Failed to send Null frame Error = %d\n", r);
         unifi_net_data_free(priv, &bulkdata.d[0]);
         srcStaInfo->nullDataHostTag = INVALID_HOST_TAG;
     }
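The error disposition above separates the retryable case from the fatal one: -ENOSPC means the HIP layer is momentarily full, so the Null frame is re-queued (and counted in noOfPktQueued); any other non-zero result frees the payload and invalidates nullDataHostTag. A compact restatement of that split, with hypothetical helper names standing in for enque_tx_data_pdu() and unifi_net_data_free():

    #include <errno.h>

    void requeue_for_retry(void *frame);   /* hypothetical */
    void free_frame(void *frame);          /* hypothetical */

    static void dispose_send_result(int r, void *frame)
    {
        if (r == -ENOSPC)
            requeue_for_retry(frame);   /* transient: try again later */
        else if (r)
            free_frame(frame);          /* fatal for this frame */
        /* r == 0: ownership passed to the lower layer */
    }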
@@ -2939,7 +2939,7 @@ u8 uf_check_broadcast_bssid(unifi_priv_t *priv, const bulk_data_param_t *bulkdat
 
 
 u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,
-                                u8 pmBit,u16 interfaceTag)
+                                u8 pmBit, u16 interfaceTag)
 {
     u8 moreData = FALSE;
     u8 powerSaveChanged = FALSE;
@@ -2955,22 +2955,22 @@ u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t *
         if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE) {
 
             /* disable the preemption */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             srcStaInfo->currentPeerState =CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE;
             powerSaveChanged = TRUE;
             /* enable the preemption */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         } else {
             return powerSaveChanged;
         }
     } else {
         if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_POWER_SAVE) {
             /* disable the preemption */
-            spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+            spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
             srcStaInfo->currentPeerState = CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE;
             powerSaveChanged = TRUE;
             /* enable the preemption */
-            spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
         }else {
             return powerSaveChanged;
         }
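The PM-bit handling above is a two-state machine guarded by staRecord_lock (the comments say "disable the preemption", though spin_lock_irqsave() also masks local interrupts): only ACTIVE to POWER_SAVE on PM=1, and POWER_SAVE to ACTIVE on PM=0, change anything, and powerSaveChanged reports whether a transition happened. A standalone restatement:

    #include <stdbool.h>

    enum peer_state { PEER_ACTIVE, PEER_POWER_SAVE };

    /* Returns true iff the PM bit caused a state transition. */
    static bool apply_pm_bit(enum peer_state *st, bool pm_bit)
    {
        if (pm_bit && *st == PEER_ACTIVE) {
            *st = PEER_POWER_SAVE;   /* station entered power save */
            return true;
        }
        if (!pm_bit && *st == PEER_POWER_SAVE) {
            *st = PEER_ACTIVE;       /* station woke up */
            return true;
        }
        return false;
    }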
@@ -2978,10 +2978,10 @@ u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t *
 
 
     if(srcStaInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE) {
-        unifi_trace(priv,UDBG3, "Peer with AID = %d is active now\n",srcStaInfo->aid);
-        process_peer_active_transition(priv,srcStaInfo,interfaceTag);
+        unifi_trace(priv, UDBG3, "Peer with AID = %d is active now\n", srcStaInfo->aid);
+        process_peer_active_transition(priv, srcStaInfo, interfaceTag);
     } else {
-        unifi_trace(priv,UDBG3, "Peer with AID = %d is in PS Now\n",srcStaInfo->aid);
+        unifi_trace(priv, UDBG3, "Peer with AID = %d is in PS Now\n", srcStaInfo->aid);
         /* Set TIM if needed */
         if(!srcStaInfo->wmmOrQosEnabled) {
             moreData = (!list_empty(&srcStaInfo->mgtFrames) ||
@@ -2990,7 +2990,7 @@ u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t *
             if(moreData && (srcStaInfo->timSet == CSR_WIFI_TIM_RESET)) {
                 unifi_trace(priv, UDBG3, "This condition should not occur\n");
                 if (!srcStaInfo->timRequestPendingFlag){
-                    update_tim(priv,srcStaInfo->aid,1,interfaceTag, srcStaInfo->assignedHandle);
+                    update_tim(priv, srcStaInfo->aid, 1, interfaceTag, srcStaInfo->assignedHandle);
                 }
                 else
                 {
@@ -3013,7 +3013,7 @@ u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t *
 
             if(moreData && (srcStaInfo->timSet == CSR_WIFI_TIM_RESET)) {
                 if (!srcStaInfo->timRequestPendingFlag){
-                    update_tim(priv,srcStaInfo->aid,1,interfaceTag, srcStaInfo->assignedHandle);
+                    update_tim(priv, srcStaInfo->aid, 1, interfaceTag, srcStaInfo->assignedHandle);
                 }
                 else
                 {
@@ -3033,7 +3033,7 @@ u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t *
 
 
 
-void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceTag)
+void uf_process_ps_poll(unifi_priv_t *priv, u8* sa, u8* da, u8 pmBit, u16 interfaceTag)
 {
     CsrWifiRouterCtrlStaInfo_t *staRecord =
     CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, sa, interfaceTag);
@@ -3046,27 +3046,27 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
 
     unifi_trace(priv, UDBG3, "entering uf_process_ps_poll\n");
     if(!staRecord) {
-        memcpy(peerMacAddress.a,sa,ETH_ALEN);
+        memcpy(peerMacAddress.a, sa, ETH_ALEN);
         unifi_trace(priv, UDBG3, "In uf_process_ps_poll, sta record not found:unexpected frame addr = %x:%x:%x:%x:%x:%x\n",
-                sa[0], sa[1],sa[2], sa[3], sa[4],sa[5]);
-        CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,interfaceTag,peerMacAddress);
+                sa[0], sa[1], sa[2], sa[3], sa[4], sa[5]);
+        CsrWifiRouterCtrlUnexpectedFrameIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, peerMacAddress);
         return;
     }
 
-    uf_process_pm_bit_for_peer(priv,staRecord,pmBit,interfaceTag);
+    uf_process_pm_bit_for_peer(priv, staRecord, pmBit, interfaceTag);
 
     /* Update station last activity time */
     staRecord->activity_flag = TRUE;
 
     /* This should not change the PM bit as PS-POLL has PM bit always set */
     if(!pmBit) {
-        unifi_notice (priv," PM bit reset in PS-POLL\n");
+        unifi_notice(priv, "PM bit reset in PS-POLL\n");
         return;
     }
 
-    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive,interfacePriv->multicastPduHostTag)) {
+    if(IS_DTIM_ACTIVE(interfacePriv->dtimActive, interfacePriv->multicastPduHostTag)) {
         /* giving more priority to multicast packets so dropping ps-poll*/
-        unifi_notice (priv," multicast transmission is going on so don't take action on PS-POLL\n");
+        unifi_notice(priv, "multicast transmission is going on, so don't take action on PS-POLL\n");
         return;
     }
 
@@ -3078,13 +3078,13 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
                         !list_empty(&staRecord->mgtFrames));
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                 priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
             } else {
@@ -3101,13 +3101,13 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
                         !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_VO]));
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->dataPdu[UNIFI_TRAFFIC_Q_VO]);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 priv->pausedStaHandle[3]=(u8)(staRecord->assignedHandle);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
             } else {
@@ -3123,13 +3123,13 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
             moreData = !list_empty(&staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION]);
 
             buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
-            if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+            if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                 /* Clear the trigger bit transmission control*/
                 buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                 /* Enqueue at the head of the queue */
-                spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                 list_add(&buffered_pkt->q, &staRecord->dataPdu[UNIFI_TRAFFIC_Q_CONTENTION]);
-                spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                 priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                 unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
             } else {
@@ -3150,7 +3150,7 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
         if(!moreData && (staRecord->timSet == CSR_WIFI_TIM_SET)) {
             unifi_trace(priv, UDBG3, "more data = NULL, set tim to 0 in uf_process_ps_poll\n");
             if (!staRecord->timRequestPendingFlag){
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             else
             {
@@ -3165,7 +3165,7 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
     } else {
 
         u8 allDeliveryEnabled = 0, dataAvailable = 0;
-        unifi_trace(priv, UDBG3,"Qos Support station.Processing PS-Poll\n");
+        unifi_trace(priv, UDBG3, "QoS support station. Processing PS-Poll\n");
 
         /*Send Data From Management Frames*/
         /* Priority orders for delivering the buffered packets are
@@ -3179,7 +3179,7 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
 
         if (allDeliveryEnabled) {
             unifi_trace(priv, UDBG3, "uf_process_ps_poll: All ACs are delivery enable so Sending QOS Null in response of Ps-poll\n");
-            uf_send_qos_null(priv,interfaceTag,sa,CSR_QOS_UP0,staRecord);
+            uf_send_qos_null(priv, interfaceTag, sa, CSR_QOS_UP0, staRecord);
             return;
         }
 
@@ -3192,13 +3192,13 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
                     buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
 
                     /* Last parameter is EOSP & its false always for PS-POLL processing */
-                    if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+                    if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                         /* Clear the trigger bit transmission control*/
                         buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                         /* Enqueue at the head of the queue */
-                        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                         list_add(&buffered_pkt->q, &staRecord->mgtFrames);
-                        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                         priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                         unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                     } else {
@@ -3227,13 +3227,13 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
                         buffered_pkt->transmissionControl |= (TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
 
                         /* Last parameter is EOSP & its false always for PS-POLL processing */
-                        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staRecord,moreData,FALSE)) == -ENOSPC) {
+                        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staRecord, moreData, FALSE)) == -ENOSPC) {
                             /* Clear the trigger bit transmission control*/
                             buffered_pkt->transmissionControl &= ~(TRANSMISSION_CONTROL_TRIGGER_MASK | TRANSMISSION_CONTROL_EOSP_MASK);
                             /* Enqueue at the head of the queue */
-                            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                             list_add(&buffered_pkt->q, &staRecord->dataPdu[i]);
-                            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                             priv->pausedStaHandle[0]=(u8)(staRecord->assignedHandle);
                             unifi_trace(priv, UDBG1, "(ENOSPC) PS-POLL received : PDU sending failed \n");
                         } else {
@@ -3256,7 +3256,7 @@ void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceT
         if(!moreData && (staRecord->timSet == CSR_WIFI_TIM_SET)) {
             unifi_trace(priv, UDBG3, "more data = NULL, set tim to 0 in uf_process_ps_poll\n");
             if (!staRecord->timRequestPendingFlag){
-                update_tim(priv,staRecord->aid,0,interfaceTag, staRecord->assignedHandle);
+                update_tim(priv, staRecord->aid, 0, interfaceTag, staRecord->assignedHandle);
             }
             else
             {
@@ -3311,7 +3311,7 @@ void uf_prepare_send_cfm_list_for_queued_pkts(unifi_priv_t * priv,
     struct list_head *placeHolder;
     unsigned long lock_flags;
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
 
     /* Search through the list and if confirmation required for any frames,
     add it to the send_cfm list */
@@ -3337,7 +3337,7 @@ void uf_prepare_send_cfm_list_for_queued_pkts(unifi_priv_t * priv,
         }
     }
 
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 
 }
 
@@ -3352,7 +3352,7 @@ void uf_flush_list(unifi_priv_t * priv, struct list_head * list)
 
     unifi_trace(priv, UDBG5, "entering the uf_flush_list \n");
 
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     /* go through list, delete & free memory */
     list_for_each_safe(listHead, placeHolder, list) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
@@ -3378,7 +3378,7 @@ void uf_flush_list(unifi_priv_t * priv, struct list_head * list)
             priv->noOfPktQueuedInDriver--;
         }
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 }
 
 tx_buffered_packets_t *dequeue_tx_data_pdu(unifi_priv_t *priv, struct list_head *txList)
@@ -3403,13 +3403,13 @@ tx_buffered_packets_t *dequeue_tx_data_pdu(unifi_priv_t *priv, struct list_head
     }
 
     /* return first node after header, & delete from the list  && atleast one item exist */
-    spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+    spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
     list_for_each_safe(listHead, placeHolder, txList) {
         tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
         list_del(listHead);
         break;
     }
-    spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
 
     if (tx_q_item) {
         unifi_trace(priv, UDBG5,
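dequeue_tx_data_pdu() takes the first list node by iterating with list_for_each_safe() and breaking after one element. On kernels that provide list_first_entry_or_null() (present as of 3.10), the same pop can be written more directly; a sketch reusing this driver's types, covering only the pop itself and not the surrounding bookkeeping:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    static tx_buffered_packets_t *dequeue_head(unifi_priv_t *priv,
                                               struct list_head *txList)
    {
        tx_buffered_packets_t *item;
        unsigned long flags;

        spin_lock_irqsave(&priv->tx_q_lock, flags);
        item = list_first_entry_or_null(txList, tx_buffered_packets_t, q);
        if (item)
            list_del(&item->q);
        spin_unlock_irqrestore(&priv->tx_q_lock, flags);
        return item;
    }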
@@ -3440,20 +3440,20 @@ CsrWifiRouterCtrlStaInfo_t *CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(
     interfacePriv = priv->interfacePriv[interfaceTag];
 
     /* disable the preemption until station record is fetched */
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
 
     for (i = 0; i < UNIFI_MAX_CONNECTIONS; i++) {
         if (interfacePriv->staInfo[i]!= NULL) {
             if (!memcmp(((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[i]))->peerMacAddress.a, peerMacAddress, ETH_ALEN)) {
                 /* enable the preemption as station record is fetched */
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
                 unifi_trace(priv, UDBG5, "peer entry found in station record\n");
                 return ((CsrWifiRouterCtrlStaInfo_t *) (interfacePriv->staInfo[i]));
             }
         }
     }
     /* enable the preemption as station record is fetched */
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
     unifi_trace(priv, UDBG5, "peer entry not found in station record\n");
     return NULL;
 }
@@ -3487,7 +3487,7 @@ void uf_check_inactivity(unifi_priv_t *priv, u16 interfaceTag, u32 currentTime)
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     /* Go through the list of stations to check for inactivity */
     for(i = 0; i < UNIFI_MAX_CONNECTIONS; i++) {
         staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, i, interfaceTag);
@@ -3502,7 +3502,7 @@ void uf_check_inactivity(unifi_priv_t *priv, u16 interfaceTag, u32 currentTime)
         elapsedTime = (currentTime >= staInfo->lastActivity)?
                 (currentTime - staInfo->lastActivity):
                 (~((u32)0) - staInfo->lastActivity + currentTime);
-        spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
         if (elapsedTime > MAX_INACTIVITY_INTERVAL) {
             memcpy((u8*)&peerMacAddress, (u8*)&staInfo->peerMacAddress, sizeof(CsrWifiMacAddress));
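A note on the elapsed-time expression above: since the timestamps are u32, plain modular subtraction currentTime - lastActivity already handles wraparound, and the explicit ~0-based branch undercounts by one tick when the timer has wrapped, because ~0 - last + current == current - last - 1 (mod 2^32). A standalone demonstration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t last = 0xFFFFFFF0u, now = 0x10u;      /* timer wrapped */
        uint32_t branch = ~((uint32_t)0) - last + now; /* driver's branch */
        uint32_t plain = now - last;                   /* modular subtraction */

        printf("branch=%u plain=%u\n", branch, plain); /* prints 31 and 32 */
        return 0;
    }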
@@ -3545,7 +3545,7 @@ void uf_update_sta_activity(unifi_priv_t *priv, u16 interfaceTag, const u8 *peer
         return;
     }
 
-    spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+    spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
     /* Update activity */
     staInfo->lastActivity = currentTime;
 
@@ -3558,7 +3558,7 @@ void uf_update_sta_activity(unifi_priv_t *priv, u16 interfaceTag, const u8 *peer
                     (currentTime - interfacePriv->last_inactivity_check):
                     (~((u32)0) - interfacePriv->last_inactivity_check + currentTime);
 
-    spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+    spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
 
     /* Check if it is time to run the inactivity handler */
     if (elapsedTime > INACTIVITY_CHECK_INTERVAL) {
@@ -3572,19 +3572,19 @@ void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag)
    u8 i;
    int j;
    tx_buffered_packets_t * buffered_pkt = NULL;
-   u8 hipslotFree[4] = {TRUE,TRUE,TRUE,TRUE};
+   u8 hipslotFree[4] = {TRUE, TRUE, TRUE, TRUE};
    int r;
    unsigned long lock_flags;
 
-   while(!isRouterBufferEnabled(priv,3) &&
-                            ((buffered_pkt=dequeue_tx_data_pdu(priv,&interfacePriv->genericMgtFrames))!=NULL)) {
+   while(!isRouterBufferEnabled(priv, 3) &&
+                            ((buffered_pkt=dequeue_tx_data_pdu(priv, &interfacePriv->genericMgtFrames))!=NULL)) {
         buffered_pkt->transmissionControl &=
                      ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-        if((r=frame_and_send_queued_pdu(priv,buffered_pkt,NULL,0,FALSE)) == -ENOSPC) {
+        if((r=frame_and_send_queued_pdu(priv, buffered_pkt, NULL, 0, FALSE)) == -ENOSPC) {
             /* Enqueue at the head of the queue */
-            spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+            spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
             list_add(&buffered_pkt->q, &interfacePriv->genericMgtFrames);
-            spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+            spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
             hipslotFree[3]=FALSE;
             break;
         }else {
@@ -3606,12 +3606,12 @@ void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag)
           while((( TRUE == hipslotFree[3] ) && (buffered_pkt=dequeue_tx_data_pdu(priv, &staInfo->mgtFrames)))) {
               buffered_pkt->transmissionControl &=
                            ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-              if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,0,FALSE)) == -ENOSPC) {
+              if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, 0, FALSE)) == -ENOSPC) {
                   unifi_trace(priv, UDBG3, "(ENOSPC) in resume_unicast_buffered_frames:: hip slots are full for voice queue\n");
                   /* Enqueue at the head of the queue */
-                  spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                  spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                   list_add(&buffered_pkt->q, &staInfo->mgtFrames);
-                  spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                  spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                   priv->pausedStaHandle[3]=(u8)(staInfo->assignedHandle);
                   hipslotFree[3] = FALSE;
                   break;
@@ -3632,11 +3632,11 @@ void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag)
               while((buffered_pkt=dequeue_tx_data_pdu(priv, &staInfo->dataPdu[j]))) {
                  buffered_pkt->transmissionControl &=
                             ~(TRANSMISSION_CONTROL_TRIGGER_MASK|TRANSMISSION_CONTROL_EOSP_MASK);
-                 if((r=frame_and_send_queued_pdu(priv,buffered_pkt,staInfo,0,FALSE)) == -ENOSPC) {
+                 if((r=frame_and_send_queued_pdu(priv, buffered_pkt, staInfo, 0, FALSE)) == -ENOSPC) {
                      /* Enqueue at the head of the queue */
-                     spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+                     spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
                      list_add(&buffered_pkt->q, &staInfo->dataPdu[j]);
-                     spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+                     spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
                      priv->pausedStaHandle[j]=(u8)(staInfo->assignedHandle);
                      hipslotFree[j]=FALSE;
                      break;
@@ -3653,7 +3653,7 @@ void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag)
        }
     }
 }
-void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interfaceTag)
+void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv, u16 interfaceTag)
 {
 
     netInterface_priv_t *interfacePriv = priv->interfacePriv[interfaceTag];
@@ -3668,15 +3668,15 @@ void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interface
          * because we have received any mgmt packet so it should not hold for long time
          * peer may time out.
          */
-        spin_lock_irqsave(&priv->tx_q_lock,lock_flags);
+        spin_lock_irqsave(&priv->tx_q_lock, lock_flags);
         list_for_each_safe(listHead, placeHolder, &interfacePriv->genericMulticastOrBroadCastFrames) {
             tx_q_item = list_entry(listHead, tx_buffered_packets_t, q);
             tx_q_item->transmissionControl |= TRANSMISSION_CONTROL_EOSP_MASK;
             tx_q_item->transmissionControl = (tx_q_item->transmissionControl & ~(CSR_NO_CONFIRM_REQUIRED));
-            unifi_trace(priv, UDBG1,"updating eosp for list Head hostTag:= 0x%x ",tx_q_item->hostTag);
+            unifi_trace(priv, UDBG1, "updating eosp for list Head hostTag:= 0x%x ", tx_q_item->hostTag);
             break;
         }
-        spin_unlock_irqrestore(&priv->tx_q_lock,lock_flags);
+        spin_unlock_irqrestore(&priv->tx_q_lock, lock_flags);
     }
 }
 
@@ -3692,7 +3692,7 @@ void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interface
  *      interfaceTag    For which resume should happen
  * ---------------------------------------------------------------------------
  */
-void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag)
+void resume_suspended_uapsd(unifi_priv_t* priv, u16 interfaceTag)
 {
 
    u8 startIndex;
@@ -3701,7 +3701,7 @@ void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag)
 
     unifi_trace(priv, UDBG2, "++resume_suspended_uapsd: \n");
     for(startIndex= 0; startIndex < UNIFI_MAX_CONNECTIONS;startIndex++) {
-        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv,startIndex,interfaceTag);
+        staInfo =  CsrWifiRouterCtrlGetStationRecordFromHandle(priv, startIndex, interfaceTag);
 
         if(!staInfo || !staInfo->wmmOrQosEnabled) {
             continue;
@@ -3716,10 +3716,10 @@ void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag)
                         staInfo->currentPeerState, staInfo->uapsdActive, staInfo->uspSuspend);
             if (staInfo->currentPeerState == CSR_WIFI_ROUTER_CTRL_PEER_CONNECTED_ACTIVE)
             {
-                spin_lock_irqsave(&priv->staRecord_lock,lock_flags);
+                spin_lock_irqsave(&priv->staRecord_lock, lock_flags);
                 staInfo->uapsdActive = FALSE;
                 staInfo->uspSuspend = FALSE;
-                spin_unlock_irqrestore(&priv->staRecord_lock,lock_flags);
+                spin_unlock_irqrestore(&priv->staRecord_lock, lock_flags);
             }
         }
     }
index d20d74ce56cbb42f28e5401f78fa1c53ee0efd3a..37302f3c2f6c7cadf7a8d3e68fb590fcea1b6f3f 100644 (file)
@@ -259,7 +259,7 @@ typedef u8 CsrWifiAcPowersaveMode;
 
 #define IS_DELIVERY_ENABLED(mode) (mode & CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)? 1: 0
 #define IS_DELIVERY_AND_TRIGGER_ENABLED(mode) ((mode & CSR_WIFI_AC_DELIVERY_ONLY_ENABLE)||(mode & CSR_WIFI_AC_TRIGGER_AND_DELIVERY_ENABLED))? 1: 0
-#define IS_DTIM_ACTIVE(flag,hostTag) ((flag == TRUE || hostTag != INVALID_HOST_TAG))
+#define IS_DTIM_ACTIVE(flag, hostTag) ((flag == TRUE || hostTag != INVALID_HOST_TAG))
 #define INVALID_HOST_TAG 0xFFFFFFFF
 #define UNIFI_TRAFFIC_Q_CONTENTION UNIFI_TRAFFIC_Q_BE
 
@@ -767,9 +767,9 @@ typedef struct netInterface_priv
 } netInterface_priv_t;
 
 #ifdef CSR_SUPPORT_SME
-#define routerStartBuffering(priv,queue) priv->routerBufferEnable[(queue)] = TRUE;
-#define routerStopBuffering(priv,queue) priv->routerBufferEnable[(queue)]  = FALSE;
-#define isRouterBufferEnabled(priv,queue) priv->routerBufferEnable[(queue)]
+#define routerStartBuffering(priv, queue) priv->routerBufferEnable[(queue)] = TRUE;
+#define routerStopBuffering(priv, queue) priv->routerBufferEnable[(queue)]  = FALSE;
+#define isRouterBufferEnabled(priv, queue) priv->routerBufferEnable[(queue)]
 #endif
 
 #ifdef USE_DRIVER_LOCK
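Two hardening notes on the macros above, unchanged in behavior by this patch: IS_DTIM_ACTIVE() and the routerBufferEnable helpers do not parenthesize their arguments, and routerStartBuffering()/routerStopBuffering() carry a trailing semicolon in the expansion, which breaks brace-less if/else callers. The conventional kernel-safe shape would be the following sketch, against the same routerBufferEnable field:

    #define routerStartBuffering(priv, queue) \
        do { (priv)->routerBufferEnable[(queue)] = TRUE; } while (0)
    #define routerStopBuffering(priv, queue) \
        do { (priv)->routerBufferEnable[(queue)] = FALSE; } while (0)
    #define isRouterBufferEnabled(priv, queue) \
        ((priv)->routerBufferEnable[(queue)])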
@@ -919,8 +919,8 @@ int uf_verify_m4(unifi_priv_t *priv, const unsigned char *packet,
 
 #ifdef CSR_SUPPORT_SME
 u8 uf_check_broadcast_bssid(unifi_priv_t *priv, const bulk_data_param_t *bulkdata);
-u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,u8 pmBit,u16 interfaceTag);
-void uf_process_ps_poll(unifi_priv_t *priv,u8* sa,u8* da,u8 pmBit,u16 interfaceTag);
+u8 uf_process_pm_bit_for_peer(unifi_priv_t * priv, CsrWifiRouterCtrlStaInfo_t * srcStaInfo, u8 pmBit, u16 interfaceTag);
+void uf_process_ps_poll(unifi_priv_t *priv, u8* sa, u8* da, u8 pmBit, u16 interfaceTag);
 int uf_ap_process_data_pdu(unifi_priv_t *priv, struct sk_buff *skb,
                    struct ethhdr *ehdr, CsrWifiRouterCtrlStaInfo_t * srcStaInfo,
                    const CSR_SIGNAL *signal,
@@ -936,17 +936,17 @@ void uf_send_buffered_data_from_ac(unifi_priv_t *priv, CsrWifiRouterCtrlStaInfo_
 void uf_send_buffered_data_from_delivery_ac(unifi_priv_t *priv, CsrWifiRouterCtrlStaInfo_t * staInfo, u8 queue, struct list_head *txList);
 
 void uf_continue_uapsd(unifi_priv_t *priv, CsrWifiRouterCtrlStaInfo_t * staInfo);
-void uf_send_qos_null(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
-void uf_send_nulldata(unifi_priv_t * priv,u16 interfaceTag, const u8 *da,CSR_PRIORITY priority,CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
+void uf_send_qos_null(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
+void uf_send_nulldata(unifi_priv_t * priv, u16 interfaceTag, const u8 *da, CSR_PRIORITY priority, CsrWifiRouterCtrlStaInfo_t * srcStaInfo);
 
 
 
 #endif
 CsrResult uf_process_ma_packet_req(unifi_priv_t *priv,  u8 *peerMacAddress, CSR_CLIENT_TAG hostTag, u16 interfaceTag, CSR_TRANSMISSION_CONTROL transmissionControl, CSR_RATE TransmitRate, CSR_PRIORITY priority, CSR_PROCESS_ID senderId, bulk_data_param_t *bulkdata);
-void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv,u8 *sigdata, u32 siglen);
+void uf_process_ma_vif_availibility_ind(unifi_priv_t *priv, u8 *sigdata, u32 siglen);
 #ifdef CSR_SUPPORT_SME
-void uf_send_buffered_frames(unifi_priv_t *priv,unifi_TrafficQueue queue);
-int uf_process_station_records_for_sending_data(unifi_priv_t *priv,u16 interfaceTag,
+void uf_send_buffered_frames(unifi_priv_t *priv, unifi_TrafficQueue queue);
+int uf_process_station_records_for_sending_data(unifi_priv_t *priv, u16 interfaceTag,
                                                  CsrWifiRouterCtrlStaInfo_t *srcStaInfo,
                                                  CsrWifiRouterCtrlStaInfo_t *dstStaInfo);
 void uf_prepare_send_cfm_list_for_queued_pkts(unifi_priv_t * priv,
@@ -958,8 +958,8 @@ void send_auto_ma_packet_confirm(unifi_priv_t *priv,
 void uf_flush_list(unifi_priv_t * priv, struct list_head * list);
 tx_buffered_packets_t *dequeue_tx_data_pdu(unifi_priv_t *priv, struct list_head *txList);
 void resume_unicast_buffered_frames(unifi_priv_t *priv, u16 interfaceTag);
-void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv,u16 interfaceTag);
-void resume_suspended_uapsd(unifi_priv_t* priv,u16 interfaceTag);
+void update_eosp_to_head_of_broadcast_list_head(unifi_priv_t *priv, u16 interfaceTag);
+void resume_suspended_uapsd(unifi_priv_t* priv, u16 interfaceTag);
 #endif
 /*
  *      netdev.c
@@ -1048,14 +1048,14 @@ CsrWifiRouterCtrlStaInfo_t * CsrWifiRouterCtrlGetStationRecordFromHandle(unifi_p
                                                                  u16 interfaceTag);
 
 void uf_update_sta_activity(unifi_priv_t *priv, u16 interfaceTag, const u8 *peerMacAddress);
-void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv,u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm);
+void uf_process_ma_pkt_cfm_for_ap(unifi_priv_t *priv, u16 interfaceTag, const CSR_MA_PACKET_CONFIRM *pkt_cfm);
 #endif
 
 void uf_resume_data_plane(unifi_priv_t *priv, int queue,
                           CsrWifiMacAddress peer_address,
                           u16 interfaceTag);
 void uf_free_pending_rx_packets(unifi_priv_t *priv, int queue,
-        CsrWifiMacAddress peer_address,u16 interfaceTag);
+        CsrWifiMacAddress peer_address, u16 interfaceTag);
 
 int uf_register_netdev(unifi_priv_t *priv, int numOfInterface);
 void uf_unregister_netdev(unifi_priv_t *priv);
index 90295035621f6392329eab1400a0f8d0996ac64b..50908822b3c819157b59aa84e6c6b23d3a2f480d 100644 (file)
@@ -133,7 +133,7 @@ sme_log_event(ul_client_t *pcli,
                     unicastPdu = FALSE;
 
                 CsrWifiRouterCtrlMicFailureIndSend (priv->CSR_WIFI_SME_IFACEQUEUE, 0,
-                        (ind->VirtualInterfaceIdentifier & 0xff),peerMacAddress,
+                        (ind->VirtualInterfaceIdentifier & 0xff), peerMacAddress,
                         unicastPdu);
                 return;
             }
@@ -143,10 +143,10 @@ sme_log_event(ul_client_t *pcli,
                 {
                     u8 pmBit = (frmCtrl & 0x1000)?0x01:0x00;
                     u16 interfaceTag = (ind->VirtualInterfaceIdentifier & 0xff);
-                    CsrWifiRouterCtrlStaInfo_t *srcStaInfo =  CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv,taddr,interfaceTag);
+                    CsrWifiRouterCtrlStaInfo_t *srcStaInfo = CsrWifiRouterCtrlGetStationRecordFromPeerMacAddress(priv, taddr, interfaceTag);
                     if((srcStaInfo != NULL) && (uf_check_broadcast_bssid(priv, bulkdata)== FALSE))
                     {
-                        uf_process_pm_bit_for_peer(priv,srcStaInfo,pmBit,interfaceTag);
+                        uf_process_pm_bit_for_peer(priv, srcStaInfo, pmBit, interfaceTag);
 
                         /* Update station last activity flag */
                         srcStaInfo->activity_flag = TRUE;
@@ -169,7 +169,7 @@ sme_log_event(ul_client_t *pcli,
                 return;
             }
 
-            unifi_trace(priv,UDBG1,"MA-PACKET Confirm (%x, %x)\n", cfm->HostTag, cfm->TransmissionStatus);
+            unifi_trace(priv, UDBG1, "MA-PACKET Confirm (%x, %x)\n", cfm->HostTag, cfm->TransmissionStatus);
 
             interfacePriv = priv->interfacePriv[interfaceTag];
 #ifdef CSR_SUPPORT_SME
@@ -177,7 +177,7 @@ sme_log_event(ul_client_t *pcli,
                  interfacePriv->interfaceMode == CSR_WIFI_ROUTER_CTRL_MODE_P2PGO) {
 
                 if(cfm->HostTag == interfacePriv->multicastPduHostTag){
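+                    /* this confirm matches the queued multicast/broadcast PDU */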
-                    uf_process_ma_pkt_cfm_for_ap(priv ,interfaceTag, cfm);
+                    uf_process_ma_pkt_cfm_for_ap(priv, interfaceTag, cfm);
                 }
             }
 #endif
@@ -395,7 +395,7 @@ uf_multicast_list_wq(struct work_struct *work)
             interfacePriv->mc_list_count);
 
     /* Flush the current list */
-    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, interfaceTag, CSR_WIFI_SME_LIST_ACTION_FLUSH, 0, NULL);
+    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, CSR_WIFI_SME_LIST_ACTION_FLUSH, 0, NULL);
 
     mc_count = interfacePriv->mc_list_count;
     mc_list = interfacePriv->mc_list;
@@ -419,7 +419,7 @@ uf_multicast_list_wq(struct work_struct *work)
         return;
     }
 
-    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+    CsrWifiRouterCtrlMulticastAddressIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
             interfaceTag,
             CSR_WIFI_SME_LIST_ACTION_ADD,
             mc_count, multicast_address_list);
@@ -950,7 +950,7 @@ int
     }
     return i;
 }
-int unifi_cfg_set_ap_config(unifi_priv_t * priv,unsigned char* arg)
+int unifi_cfg_set_ap_config(unifi_priv_t * priv, unsigned char* arg)
 {
     uf_cfg_ap_config_t cfg_ap_config;
     char *buffer;
@@ -981,7 +981,7 @@ int unifi_cfg_set_ap_config(unifi_priv_t * priv,unsigned char* arg)
     priv->ap_mac_config.phySupportedBitmap = cfg_ap_config.phySupportedBitmap;
     priv->ap_mac_config.maxListenInterval=cfg_ap_config.maxListenInterval;
 
-    priv->ap_mac_config.supportedRatesCount=     uf_configure_supported_rates(priv->ap_mac_config.supportedRates,priv->ap_mac_config.phySupportedBitmap);
+    priv->ap_mac_config.supportedRatesCount = uf_configure_supported_rates(priv->ap_mac_config.supportedRates, priv->ap_mac_config.phySupportedBitmap);
 
     return 0;
 }
@@ -1051,7 +1051,7 @@ uf_ta_ind_wq(struct work_struct *work)
     u16 interfaceTag = 0;
 
 
-    CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0,
+    CsrWifiRouterCtrlTrafficProtocolIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0,
             interfaceTag,
             ind->packet_type,
             ind->direction,
@@ -1119,7 +1119,7 @@ uf_ta_sample_ind_wq(struct work_struct *work)
         }
     }
 
-    CsrWifiRouterCtrlTrafficSampleIndSend(priv->CSR_WIFI_SME_IFACEQUEUE,0, interfaceTag, ind->stats);
+    CsrWifiRouterCtrlTrafficSampleIndSend(priv->CSR_WIFI_SME_IFACEQUEUE, 0, interfaceTag, ind->stats);
 
     ind->in_use = 0;
 
@@ -1219,7 +1219,7 @@ void uf_send_pkt_to_encrypt(struct work_struct *work)
         kfree(pktBulkData); /* Would have been copied over by the SME Handler */
 
     } else {
-           unifi_warning(priv, "uf_send_pkt_to_encrypt() is NOT applicable for interface mode - %d\n",interfacePriv->interfaceMode);
+           unifi_warning(priv, "uf_send_pkt_to_encrypt() is NOT applicable for interface mode - %d\n", interfacePriv->interfaceMode);
     }
 }/* uf_send_pkt_to_encrypt() */
 #endif
index b689cfe2b1000570d1aef6b0725f9135551a6139..aff9aa1781244617b354a02f54016cd70ee96e07 100644 (file)
@@ -210,9 +210,9 @@ int sme_mgt_mib_get(unifi_priv_t *priv,
 int sme_mgt_mib_set(unifi_priv_t *priv,
                     unsigned char *varbind, int length);
 #ifdef CSR_SUPPORT_WEXT_AP
-int sme_ap_start(unifi_priv_t *priv,u16 interface_tag,CsrWifiSmeApConfig_t *ap_config);
-int sme_ap_stop(unifi_priv_t *priv,u16 interface_tag);
-int sme_ap_config(unifi_priv_t *priv,CsrWifiSmeApMacConfig *ap_mac_config, CsrWifiNmeApConfig *group_security_config);
+int sme_ap_start(unifi_priv_t *priv, u16 interface_tag, CsrWifiSmeApConfig_t *ap_config);
+int sme_ap_stop(unifi_priv_t *priv, u16 interface_tag);
+int sme_ap_config(unifi_priv_t *priv, CsrWifiSmeApMacConfig *ap_mac_config, CsrWifiNmeApConfig *group_security_config);
 int uf_configure_supported_rates(u8 * supportedRates, u8 phySupportedBitmap);
 #endif
 int unifi_translate_scan(struct net_device *dev,
@@ -234,7 +234,7 @@ int unifi_cfg_get_info(unifi_priv_t *priv, unsigned char *arg);
 int unifi_cfg_strict_draft_n(unifi_priv_t *priv, unsigned char *arg);
 int unifi_cfg_enable_okc(unifi_priv_t *priv, unsigned char *arg);
 #ifdef CSR_SUPPORT_WEXT_AP
-int unifi_cfg_set_ap_config(unifi_priv_t * priv,unsigned char* arg);
+int unifi_cfg_set_ap_config(unifi_priv_t * priv, unsigned char* arg);
 #endif
 
 
index 52224cdc967d1ac39c269b9d931d65fc81ac6a52..fabfd779c668148c545dfe96f9d45b6d4545603a 100644 (file)
@@ -13,7 +13,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <asm/io.h>
+#include <linux/io.h>
 #include <linux/hdlc.h>
 #include "pmcc4_sysdep.h"
 #include "sbecom_inline_linux.h"
@@ -35,235 +35,253 @@ extern int  cxt1e1_log_level;
 #define COMET_NUM_UNITS     5   /* Number of points per entry in table */
 
 /* forward references */
-STATIC void SetPwrLevel (comet_t * comet);
-STATIC void WrtRcvEqualizerTbl (ci_t * ci, comet_t * comet, u_int32_t *table);
-STATIC void WrtXmtWaveformTbl (ci_t * ci, comet_t * comet, u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS]);
+STATIC void SetPwrLevel(comet_t *comet);
+STATIC void WrtRcvEqualizerTbl(ci_t *ci, comet_t *comet, u_int32_t *table);
+STATIC void WrtXmtWaveformTbl(ci_t *ci, comet_t *comet, u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS]);
 
 
 void       *TWV_table[12] = {
-    TWVLongHaul0DB, TWVLongHaul7_5DB, TWVLongHaul15DB, TWVLongHaul22_5DB,
-    TWVShortHaul0, TWVShortHaul1, TWVShortHaul2, TWVShortHaul3, TWVShortHaul4,
-    TWVShortHaul5,
-    TWV_E1_75Ohm,    /** PORT POINT - 75 Ohm not supported **/
-    TWV_E1_120Ohm
+       TWVLongHaul0DB, TWVLongHaul7_5DB, TWVLongHaul15DB, TWVLongHaul22_5DB,
+       TWVShortHaul0, TWVShortHaul1, TWVShortHaul2, TWVShortHaul3,
+       TWVShortHaul4, TWVShortHaul5,
+       /** PORT POINT - 75 Ohm not supported **/
+       TWV_E1_75Ohm,
+       TWV_E1_120Ohm
 };
 
 
 static int
-lbo_tbl_lkup (int t1, int lbo)
-{
-    if ((lbo < CFG_LBO_LH0) || (lbo > CFG_LBO_E120))    /* error switches to
-                                                         * default */
-    {
-        if (t1)
-            lbo = CFG_LBO_LH0;  /* default T1 waveform table */
-        else
-            lbo = CFG_LBO_E120;     /* default E1 waveform table */
-    }
-    return (lbo - 1);               /* make index ZERO relative */
+lbo_tbl_lkup(int t1, int lbo)
+{
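+       /* the returned index selects the matching entry in TWV_table[] */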
+       /* error switches to default */
+       if ((lbo < CFG_LBO_LH0) || (lbo > CFG_LBO_E120)) {
+               if (t1)
+                       /* default T1 waveform table */
+                       lbo = CFG_LBO_LH0;
+               else
+                       /* default E1 waveform table */
+                       lbo = CFG_LBO_E120;
+       }
+       /* make index ZERO relative */
+       return lbo - 1;
 }
 
-
-void
-init_comet (void *ci, comet_t * comet, u_int32_t port_mode, int clockmaster,
-            u_int8_t moreParams)
+void init_comet(void *ci, comet_t *comet, u_int32_t port_mode, int clockmaster,
+               u_int8_t moreParams)
 {
-    u_int8_t isT1mode;
-    u_int8_t    tix = CFG_LBO_LH0;      /* T1 default */
-
-    isT1mode = IS_FRAME_ANY_T1 (port_mode);
-    /* T1 or E1 */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->gbl_cfg, 0xa0);     /* Select T1 Mode & PIO
-                                                                 * output enabled */
-        tix = lbo_tbl_lkup (isT1mode, CFG_LBO_LH0);     /* default T1 waveform
-                                                         * table */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->gbl_cfg, 0x81);     /* Select E1 Mode & PIO
-                                                                 * output enabled */
-        tix = lbo_tbl_lkup (isT1mode, CFG_LBO_E120);    /* default E1 waveform
-                                                         * table */
-    }
-
-    if (moreParams & CFG_LBO_MASK)
-        tix = lbo_tbl_lkup (isT1mode, moreParams & CFG_LBO_MASK);       /* dial-in requested
-                                                                         * waveform table */
-
-    /* Tx line Intfc cfg     ** Set for analog & no special patterns */
-    pci_write_32 ((u_int32_t *) &comet->tx_line_cfg, 0x00);     /* Transmit Line
-                                                                 * Interface Config. */
-
-    /* master test    ** Ignore Test settings for now */
-    pci_write_32 ((u_int32_t *) &comet->mtest, 0x00);   /* making sure it's
-                                                         * Default value */
-
-    /* Turn on Center (CENT) and everything else off */
-    pci_write_32 ((u_int32_t *) &comet->rjat_cfg, 0x10);        /* RJAT cfg */
-    /* Set Jitter Attenuation to recommend T1 values */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->rjat_n1clk, 0x2F);  /* RJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->rjat_n2clk, 0x2F);  /* RJAT Divider N2
-                                                                 * Control */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->rjat_n1clk, 0xFF);  /* RJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->rjat_n2clk, 0xFF);  /* RJAT Divider N2
-                                                                 * Control */
-    }
-
-    /* Turn on Center (CENT) and everything else off */
-    pci_write_32 ((u_int32_t *) &comet->tjat_cfg, 0x10);        /* TJAT Config. */
-
-    /* Do not bypass jitter attenuation and bypass elastic store */
-    pci_write_32 ((u_int32_t *) &comet->rx_opt, 0x00);  /* rx opts */
-
-    /* TJAT ctrl & TJAT divider ctrl */
-    /* Set Jitter Attenuation to recommended T1 values */
-    if (isT1mode)
-    {
-        pci_write_32 ((u_int32_t *) &comet->tjat_n1clk, 0x2F);  /* TJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->tjat_n2clk, 0x2F);  /* TJAT Divider N2
-                                                                 * Control */
-    } else
-    {
-        pci_write_32 ((u_int32_t *) &comet->tjat_n1clk, 0xFF);  /* TJAT Divider N1
-                                                                 * Control */
-        pci_write_32 ((u_int32_t *) &comet->tjat_n2clk, 0xFF);  /* TJAT Divider N2
-                                                                 * Control */
-    }
-
-    /* 1c: rx ELST cfg   20: tx ELST cfg  28&38: rx&tx data link ctrl */
-    if (isT1mode)
-    {                               /* Select 193-bit frame format */
-        pci_write_32 ((u_int32_t *) &comet->rx_elst_cfg, 0x00);
-        pci_write_32 ((u_int32_t *) &comet->tx_elst_cfg, 0x00);
-    } else
-    {                               /* Select 256-bit frame format */
-        pci_write_32 ((u_int32_t *) &comet->rx_elst_cfg, 0x03);
-        pci_write_32 ((u_int32_t *) &comet->tx_elst_cfg, 0x03);
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x00);   /* disable T1 data link
-                                                                 * receive */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x00);   /* disable T1 data link
-                                                                 * transmit */
-    }
+       u_int8_t isT1mode;
+       /* T1 default */
+       u_int8_t tix = CFG_LBO_LH0;
+
+       isT1mode = IS_FRAME_ANY_T1(port_mode);
+       /* T1 or E1 */
+       if (isT1mode) {
+               /* Select T1 Mode & PIO output enabled */
+               pci_write_32((u_int32_t *) &comet->gbl_cfg, 0xa0);
+               /* default T1 waveform table */
+               tix = lbo_tbl_lkup(isT1mode, CFG_LBO_LH0);
+       } else {
+               /* Select E1 Mode & PIO output enabled */
+               pci_write_32((u_int32_t *) &comet->gbl_cfg, 0x81);
+               /* default E1 waveform table */
+               tix = lbo_tbl_lkup(isT1mode, CFG_LBO_E120);
+       }
+
+       if (moreParams & CFG_LBO_MASK)
+               /* dial-in requested waveform table */
+               tix = lbo_tbl_lkup(isT1mode, moreParams & CFG_LBO_MASK);
+       /* Tx line intfc cfg: set for analog & no special patterns */
+       /* Transmit Line Interface Config. */
+       pci_write_32((u_int32_t *) &comet->tx_line_cfg, 0x00);
+       /* master test: ignore test settings for now */
+       /* making sure it's the default value */
+       pci_write_32((u_int32_t *) &comet->mtest, 0x00);
+       /* Turn on Center (CENT) and everything else off */
+       /* RJAT cfg */
+       pci_write_32((u_int32_t *) &comet->rjat_cfg, 0x10);
+       /* Set Jitter Attenuation to recommend T1 values */
+       if (isT1mode) {
+               /* RJAT Divider N1 Control */
+               pci_write_32((u_int32_t *) &comet->rjat_n1clk, 0x2F);
+               /* RJAT Divider N2 Control */
+               pci_write_32((u_int32_t *) &comet->rjat_n2clk, 0x2F);
+       } else {
+               /* RJAT Divider N1 Control */
+               pci_write_32((u_int32_t *) &comet->rjat_n1clk, 0xFF);
+               /* RJAT Divider N2 Control */
+               pci_write_32((u_int32_t *) &comet->rjat_n2clk, 0xFF);
+       }
+
+       /* Turn on Center (CENT) and everything else off */
+       /* TJAT Config. */
+       pci_write_32((u_int32_t *) &comet->tjat_cfg, 0x10);
+
+       /* Do not bypass jitter attenuation and bypass elastic store */
+       /* rx opts */
+       pci_write_32((u_int32_t *) &comet->rx_opt, 0x00);
+
+       /* TJAT ctrl & TJAT divider ctrl */
+       /* Set Jitter Attenuation to recommended T1 values */
+       if (isT1mode) {
+               /* TJAT Divider N1 Control */
+               pci_write_32((u_int32_t *) &comet->tjat_n1clk, 0x2F);
+               /* TJAT Divider N2  Control */
+               pci_write_32((u_int32_t *) &comet->tjat_n2clk, 0x2F);
+       } else {
+               /* TJAT Divider N1 Control */
+               pci_write_32((u_int32_t *) &comet->tjat_n1clk, 0xFF);
+               /* TJAT Divider N2 Control */
+               pci_write_32((u_int32_t *) &comet->tjat_n2clk, 0xFF);
+       }
+
+       /* 1c: rx ELST cfg   20: tx ELST cfg  28&38: rx&tx data link ctrl */
+
+       /* Select 193-bit frame format */
+       if (isT1mode) {
+               pci_write_32((u_int32_t *) &comet->rx_elst_cfg, 0x00);
+               pci_write_32((u_int32_t *) &comet->tx_elst_cfg, 0x00);
+       } else {
+               /* Select 256-bit frame format */
+               pci_write_32((u_int32_t *) &comet->rx_elst_cfg, 0x03);
+               pci_write_32((u_int32_t *) &comet->tx_elst_cfg, 0x03);
+               /* disable T1 data link receive */
+               pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x00);
+               /* disable T1 data link transmit */
+               pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x00);
+       }
 
     /* the following is a default value */
     /* Enable 8 out of 10 validation */
-    pci_write_32 ((u_int32_t *) &comet->t1_rboc_ena, 0x00);     /* t1RBOC
-                                                                 * enable(BOC:BitOriented
-                                                                 * Code) */
-    if (isT1mode)
-    {
-
-        /* IBCD cfg: aka Inband Code Detection ** loopback code length set to */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_cfg, 0x04);    /* 6 bit down, 5 bit up
-                                                                 * (assert)  */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_act, 0x08);    /* line loopback
-                                                                 * activate pattern */
-        pci_write_32 ((u_int32_t *) &comet->ibcd_deact, 0x24);  /* deactivate code
-                                                                 * pattern (i.e.001) */
-    }
+       /* t1RBOC enable (BOC: Bit Oriented Code) */
+       pci_write_32((u_int32_t *) &comet->t1_rboc_ena, 0x00);
+       if (isT1mode) {
+               /* IBCD cfg: aka Inband Code Detection ** loopback code length set to */
+               /* 6 bit down, 5 bit up (assert) */
+               pci_write_32((u_int32_t *) &comet->ibcd_cfg, 0x04);
+               /* line loopback activate pattern */
+               pci_write_32((u_int32_t *) &comet->ibcd_act, 0x08);
+               /* deactivate code pattern (i.e. 001) */
+               pci_write_32((u_int32_t *) &comet->ibcd_deact, 0x24);
+       }
     /* 10: CDRC cfg 28&38: rx&tx data link 1 ctrl 48: t1 frmr cfg  */
     /* 50: SIGX cfg, COSS (change of signaling state) 54: XBAS cfg  */
     /* 60: t1 ALMI cfg */
     /* Configure Line Coding */
 
-    switch (port_mode)
-    {
-    case CFG_FRAME_SF:              /* 1 - T1 B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x20); /* 5:B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0);
-        break;
-    case CFG_FRAME_ESF:     /* 2 - T1 B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x20);   /* Bit 5: T1 DataLink
-                                                                 * Enable */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0x30); /* 4:ESF  5:ESFFA */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0x04);    /* 2:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x30); /* 4:ESF  5:B8ZS */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0x10); /* 4:ESF */
-        break;
-    case CFG_FRAME_E1PLAIN:         /* 3 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
-        break;
-    case CFG_FRAME_E1CAS:           /* 4 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x60);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0);
-        break;
-    case CFG_FRAME_E1CRC:           /* 5 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x10);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
-        break;
-    case CFG_FRAME_E1CRC_CAS:       /* 6 - HDB3 */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x70);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
-        break;
-    case CFG_FRAME_SF_AMI:          /* 7 - T1 AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        break;
-    case CFG_FRAME_ESF_AMI:         /* 8 - T1 AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->rxce1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->txci1_ctl, 0x20);   /* 5: T1 DataLink Enable */
-        pci_write_32 ((u_int32_t *) &comet->t1_frmr_cfg, 0x30); /* Bit 4:ESF  5:ESFFA */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0x04);    /* 2:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_xbas_cfg, 0x10); /* 4:ESF */
-        pci_write_32 ((u_int32_t *) &comet->t1_almi_cfg, 0x10); /* 4:ESF */
-        break;
-    case CFG_FRAME_E1PLAIN_AMI:       /* 9 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x80);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
-        break;
-    case CFG_FRAME_E1CAS_AMI:       /* 10 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0xe0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0);
-        break;
-    case CFG_FRAME_E1CRC_AMI:       /* 11 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0x90);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
-        break;
-    case CFG_FRAME_E1CRC_CAS_AMI:   /* 12 - AMI */
-        pci_write_32 ((u_int32_t *) &comet->cdrc_cfg, 0x80);    /* Enable AMI Line
-                                                                 * Decoding */
-        pci_write_32 ((u_int32_t *) &comet->sigx_cfg, 0);
-        pci_write_32 ((u_int32_t *) &comet->e1_tran_cfg, 0xf0);
-        pci_write_32 ((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
-        break;
-    }                               /* end switch */
+       switch (port_mode) {
+       /* 1 - T1 B8ZS */
+       case CFG_FRAME_SF:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               /* 5:B8ZS */
+               pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x20);
+               pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0);
+               break;
+       /* 2 - T1 B8ZS */
+       case CFG_FRAME_ESF:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               /* Bit 5: T1 DataLink Enable */
+               pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x20);
+               /* 5: T1 DataLink Enable */
+               pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x20);
+               /* 4:ESF  5:ESFFA */
+               pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0x30);
+               /* 2:ESF */
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0x04);
+               /* 4:ESF  5:B8ZS */
+               pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x30);
+               /* 4:ESF */
+               pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0x10);
+               break;
+       /* 3 - HDB3 */
+       case CFG_FRAME_E1PLAIN:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
+               break;
+       /* 4 - HDB3 */
+       case CFG_FRAME_E1CAS:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x60);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0);
+               break;
+       /* 5 - HDB3 */
+       case CFG_FRAME_E1CRC:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x10);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
+               break;
+       /* 6 - HDB3 */
+       case CFG_FRAME_E1CRC_CAS:
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x70);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
+               break;
+       /* 7 - T1 AMI */
+       case CFG_FRAME_SF_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               break;
+       /* 8 - T1 AMI */
+       case CFG_FRAME_ESF_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               /* 5: T1 DataLink Enable */
+               pci_write_32((u_int32_t *) &comet->rxce1_ctl, 0x20);
+               /* 5: T1 DataLink Enable */
+               pci_write_32((u_int32_t *) &comet->txci1_ctl, 0x20);
+               /* Bit 4:ESF  5:ESFFA */
+               pci_write_32((u_int32_t *) &comet->t1_frmr_cfg, 0x30);
+               /* 2:ESF */
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0x04);
+               /* 4:ESF */
+               pci_write_32((u_int32_t *) &comet->t1_xbas_cfg, 0x10);
+               /* 4:ESF */
+               pci_write_32((u_int32_t *) &comet->t1_almi_cfg, 0x10);
+               break;
+       /* 9 - AMI */
+       case CFG_FRAME_E1PLAIN_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x40);
+               break;
+       /* 10 - AMI */
+       case CFG_FRAME_E1CAS_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0xe0);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0);
+               break;
+       /* 11 - AMI */
+       case CFG_FRAME_E1CRC_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0x90);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0xc2);
+               break;
+       /* 12 - AMI */
+       case CFG_FRAME_E1CRC_CAS_AMI:
+               /* Enable AMI Line Decoding */
+               pci_write_32((u_int32_t *) &comet->cdrc_cfg, 0x80);
+               pci_write_32((u_int32_t *) &comet->sigx_cfg, 0);
+               pci_write_32((u_int32_t *) &comet->e1_tran_cfg, 0xf0);
+               pci_write_32((u_int32_t *) &comet->e1_frmr_aopts, 0x82);
+               break;
+       }       /* end switch */
 
     /***
      * Set Full Frame mode (NXDSO[1] = 0, NXDSO[0] = 0)
@@ -277,101 +295,109 @@ init_comet (void *ci, comet_t * comet, u_int32_t port_mode, int clockmaster,
 
     /* 0x30: "BRIF cfg"; 0x20 is 'CMODE', 0x03 is (bit) rate */
     /* note "rate bits can only be set once after reset" */
-    if (clockmaster)
-    {                               /* CMODE == clockMode, 0=clock master (so
-                                     * all 3 others should be slave) */
-        if (isT1mode)               /* rate = 1.544 Mb/s */
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x00);        /* Comet 0 Master
-                                                                         * Mode(CMODE=0) */
-        else                        /* rate = 2.048 Mb/s */
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x01);        /* Comet 0 Master
-                                                                         * Mode(CMODE=0) */
-
-        /* 31: BRIF frame pulse cfg  06: tx timing options */
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, 0x00);  /* Master Mode
-                                                                 * i.e.FPMODE=0 (@0x20) */
-        if ((moreParams & CFG_CLK_PORT_MASK) == CFG_CLK_PORT_INTERNAL)
-        {
-            if (cxt1e1_log_level >= LOG_SBEBUG12)
-                pr_info(">> %s: clockmaster internal clock\n", __func__);
-            pci_write_32 ((u_int32_t *) &comet->tx_time, 0x0d); /* internal oscillator */
-        } else                      /* external clock source */
-        {
-            if (cxt1e1_log_level >= LOG_SBEBUG12)
-                pr_info(">> %s: clockmaster external clock\n", __func__);
-            pci_write_32 ((u_int32_t *) &comet->tx_time, 0x09); /* loop timing
-                                                                 * (external) */
-        }
-
-    } else                          /* slave */
-    {
-        if (isT1mode)
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x20);        /* Slave Mode(CMODE=1,
-                                                                         * see above) */
-        else
-            pci_write_32 ((u_int32_t *) &comet->brif_cfg, 0x21);        /* Slave Mode (CMODE=1) */
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, 0x20);  /* Slave Mode i.e.
-                                                                 * FPMODE=1 (@0x20) */
-        if (cxt1e1_log_level >= LOG_SBEBUG12)
-            pr_info(">> %s: clockslave internal clock\n", __func__);
-        pci_write_32 ((u_int32_t *) &comet->tx_time, 0x0d);     /* oscillator timing */
-    }
-
-    /* 32: BRIF parity F-bit cfg */
-    /* Totem-pole operation */
-    pci_write_32 ((u_int32_t *) &comet->brif_pfcfg, 0x01);      /* Receive Backplane
-                                                                 * Parity/F-bit */
+       if (clockmaster) {
+               /* CMODE == clockMode, 0 = clock master (so all three
+                * others should be slave) */
+               if (isT1mode)
+                       /* rate = 1.544 Mb/s, Comet 0 Master Mode (CMODE=0) */
+                       pci_write_32((u_int32_t *) &comet->brif_cfg, 0x00);
+               else
+                       /* rate = 2.048 Mb/s, Comet 0 Master Mode (CMODE=0) */
+                       pci_write_32((u_int32_t *) &comet->brif_cfg, 0x01);
+
+               /* 31: BRIF frame pulse cfg  06: tx timing options */
+
+               /* Master Mode i.e. FPMODE=0 (@0x20) */
+               pci_write_32((u_int32_t *) &comet->brif_fpcfg, 0x00);
+               if ((moreParams & CFG_CLK_PORT_MASK) == CFG_CLK_PORT_INTERNAL) {
+                       if (cxt1e1_log_level >= LOG_SBEBUG12)
+                               pr_info(">> %s: clockmaster internal clock\n", __func__);
+                       /* internal oscillator */
+                       pci_write_32((u_int32_t *) &comet->tx_time, 0x0d);
+               } else {
+                       /* external clock source */
+                       if (cxt1e1_log_level >= LOG_SBEBUG12)
+                               pr_info(">> %s: clockmaster external clock\n", __func__);
+                       /* loop timing (external) */
+                       pci_write_32((u_int32_t *) &comet->tx_time, 0x09);
+               }
+       } else {
+               /* slave */
+               if (isT1mode)
+                       /* Slave Mode (CMODE=1, see above) */
+                       pci_write_32((u_int32_t *) &comet->brif_cfg, 0x20);
+               else
+                       /* Slave Mode (CMODE=1) */
+                       pci_write_32((u_int32_t *) &comet->brif_cfg, 0x21);
+               /* Slave Mode i.e. FPMODE=1 (@0x20) */
+               pci_write_32((u_int32_t *) &comet->brif_fpcfg, 0x20);
+               if (cxt1e1_log_level >= LOG_SBEBUG12)
+                       pr_info(">> %s: clockslave internal clock\n", __func__);
+               /* oscillator timing */
+               pci_write_32((u_int32_t *) &comet->tx_time, 0x0d);
+       }
+
+       /* 32: BRIF parity F-bit cfg */
+       /* Totem-pole operation */
+       /* Receive Backplane Parity/F-bit */
+       pci_write_32((u_int32_t *) &comet->brif_pfcfg, 0x01);
 
     /* dc: RLPS equalizer V ref */
     /* Configuration */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->rlps_eqvr, 0x2c);   /* RLPS Equalizer
-                                                                 * Voltage  */
-    else
-        pci_write_32 ((u_int32_t *) &comet->rlps_eqvr, 0x34);   /* RLPS Equalizer
-                                                                 * Voltage  */
+       if (isT1mode)
+               /* RLPS Equalizer Voltage  */
+               pci_write_32((u_int32_t *) &comet->rlps_eqvr, 0x2c);
+       else
+               /* RLPS Equalizer Voltage  */
+               pci_write_32((u_int32_t *) &comet->rlps_eqvr, 0x34);
 
     /* Reserved bit set and SQUELCH enabled */
     /* f8: RLPS cfg & status  f9: RLPS ALOS detect/clear threshold */
-    pci_write_32 ((u_int32_t *) &comet->rlps_cfgsts, 0x11);     /* RLPS Configuration
-                                                                 * Status */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->rlps_alos_thresh, 0x55);    /* ? */
-    else
-        pci_write_32 ((u_int32_t *) &comet->rlps_alos_thresh, 0x22);    /* ? */
+       /* RLPS Configuration Status */
+       pci_write_32((u_int32_t *) &comet->rlps_cfgsts, 0x11);
+       if (isT1mode)
+               /* ? */
+               pci_write_32((u_int32_t *) &comet->rlps_alos_thresh, 0x55);
+       else
+               /* ? */
+               pci_write_32((u_int32_t *) &comet->rlps_alos_thresh, 0x22);
 
 
     /* Set Full Frame mode (NXDSO[1] = 0, NXDSO[0] = 0) */
     /* CMODE=0: Clock slave mode with BTCLK as an input, DE=1: Use rising */
     /* edge of BTCLK for data, FE=1: Use rising edge of BTCLK for frame, */
     /* CMS=0: Use backplane freq, RATE[1:0]=0,0: T1 */
-/***    Transmit side is always an Input, Slave Clock*/
-    /* 40: BTIF cfg  41: BTIF frame pulse cfg */
-    if (isT1mode)
-        pci_write_32 ((u_int32_t *) &comet->btif_cfg, 0x38);    /* BTIF Configuration
-                                                                 * Reg. */
-    else
-        pci_write_32 ((u_int32_t *) &comet->btif_cfg, 0x39);    /* BTIF Configuration
-                                                                 * Reg. */
-
-    pci_write_32 ((u_int32_t *) &comet->btif_fpcfg, 0x01);      /* BTIF Frame Pulse
-                                                                 * Config. */
+       /***    Transmit side is always an Input, Slave Clock */
+       /* 40: BTIF cfg  41: BTIF frame pulse cfg */
+       if (isT1mode)
+               /* BTIF Configuration  Reg. */
+               pci_write_32((u_int32_t *) &comet->btif_cfg, 0x38);
+       else
+               /* BTIF Configuration  Reg. */
+               pci_write_32((u_int32_t *) &comet->btif_cfg, 0x39);
+       /* BTIF Frame Pulse Config. */
+       pci_write_32((u_int32_t *) &comet->btif_fpcfg, 0x01);
 
     /* 0a: master diag  06: tx timing options */
     /* if set Comet to loop back */
 
     /* Comets set to normal */
-    pci_write_32 ((u_int32_t *) &comet->mdiag, 0x00);
+       pci_write_32((u_int32_t *) &comet->mdiag, 0x00);
 
     /* BTCLK driven by TCLKI internally (crystal driven) and Xmt Elasted  */
     /* Store is enabled. */
 
-    WrtXmtWaveformTbl (ci, comet, TWV_table[tix]);
-    if (isT1mode)
-        WrtRcvEqualizerTbl ((ci_t *) ci, comet, &T1_Equalizer[0]);
-    else
-        WrtRcvEqualizerTbl ((ci_t *) ci, comet, &E1_Equalizer[0]);
-    SetPwrLevel (comet);
+       WrtXmtWaveformTbl(ci, comet, TWV_table[tix]);
+       if (isT1mode)
+               WrtRcvEqualizerTbl((ci_t *) ci, comet, &T1_Equalizer[0]);
+       else
+               WrtRcvEqualizerTbl((ci_t *) ci, comet, &E1_Equalizer[0]);
+       SetPwrLevel(comet);
 }
 
 /*
@@ -382,15 +408,15 @@ init_comet (void *ci, comet_t * comet, u_int32_t port_mode, int clockmaster,
 ** Returns:     Nothing
 */
 STATIC void
-WrtXmtWaveform (ci_t * ci, comet_t * comet, u_int32_t sample, u_int32_t unit, u_int8_t data)
+WrtXmtWaveform(ci_t *ci, comet_t *comet, u_int32_t sample, u_int32_t unit, u_int8_t data)
 {
-    u_int8_t    WaveformAddr;
+       u_int8_t    WaveformAddr;
 
-    WaveformAddr = (sample << 3) + (unit & 7);
-    pci_write_32 ((u_int32_t *) &comet->xlpg_pwave_addr, WaveformAddr);
-    pci_flush_write (ci);           /* for write order preservation when
-                                     * Optimizing driver */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_pwave_data, 0x7F & data);
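+       /* the RAM address packs sample into bits [7:3] and unit into bits
+        * [2:0]; the data write below keeps only the low 7 bits */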
+       WaveformAddr = (sample << 3) + (unit & 7);
+       pci_write_32((u_int32_t *) &comet->xlpg_pwave_addr, WaveformAddr);
+       /* for write order preservation when Optimizing driver */
+       pci_flush_write(ci);
+       pci_write_32((u_int32_t *) &comet->xlpg_pwave_data, 0x7F & data);
 }
 
 /*
@@ -400,19 +426,19 @@ WrtXmtWaveform (ci_t * ci, comet_t * comet, u_int32_t sample, u_int32_t unit, u_
 ** Returns:     Nothing
 */
 STATIC void
-WrtXmtWaveformTbl (ci_t * ci, comet_t * comet,
-                   u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS])
+WrtXmtWaveformTbl(ci_t *ci, comet_t *comet,
+                 u_int8_t table[COMET_NUM_SAMPLES][COMET_NUM_UNITS])
 {
-    u_int32_t sample, unit;
+       u_int32_t sample, unit;
 
-    for (sample = 0; sample < COMET_NUM_SAMPLES; sample++)
-    {
-        for (unit = 0; unit < COMET_NUM_UNITS; unit++)
-            WrtXmtWaveform (ci, comet, sample, unit, table[sample][unit]);
-    }
+       for (sample = 0; sample < COMET_NUM_SAMPLES; sample++) {
+               for (unit = 0; unit < COMET_NUM_UNITS; unit++)
+                       WrtXmtWaveform(ci, comet, sample, unit, table[sample][unit]);
+       }
 
     /* Enable transmitter and set output amplitude */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_cfg, table[COMET_NUM_SAMPLES][0]);
+       pci_write_32((u_int32_t *) &comet->xlpg_cfg, table[COMET_NUM_SAMPLES][0]);
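+       /* table[COMET_NUM_SAMPLES][0] reads one row past the declared bound;
+        * the backing waveform tables presumably carry this extra config row */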
 }
 
 
@@ -427,60 +453,60 @@ WrtXmtWaveformTbl (ci_t * ci, comet_t * comet,
 */
 
 STATIC void
-WrtRcvEqualizerTbl (ci_t * ci, comet_t * comet, u_int32_t *table)
+WrtRcvEqualizerTbl(ci_t *ci, comet_t *comet, u_int32_t *table)
 {
-    u_int32_t   ramaddr;
-    volatile u_int32_t value;
-
-    for (ramaddr = 0; ramaddr < 256; ramaddr++)
-    {
-        /*** the following lines are per Errata 7, 2.5 ***/
-        {
-            pci_write_32 ((u_int32_t *) &comet->rlps_eq_rwsel, 0x80);   /* Set up for a read
-                                                                         * operation */
-            pci_flush_write (ci);   /* for write order preservation when
-                                     * Optimizing driver */
-            pci_write_32 ((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr); /* write the addr,
-                                                                                  * initiate a read */
-            pci_flush_write (ci);   /* for write order preservation when
-                                     * Optimizing driver */
-            /*
-             * wait 3 line rate clock cycles to ensure address bits are
-             * captured by T1/E1 clock
-             */
-            OS_uwait (4, "wret");   /* 683ns * 3 = 1366 ns, approx 2us (but
-                                     * use 4us) */
-        }
-
-        value = *table++;
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata3, (u_int8_t) (value >> 24));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata2, (u_int8_t) (value >> 16));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata1, (u_int8_t) (value >> 8));
-        pci_write_32 ((u_int32_t *) &comet->rlps_idata0, (u_int8_t) value);
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
-
-        /* Storing RAM address, causes RAM to be updated */
-
-        pci_write_32 ((u_int32_t *) &comet->rlps_eq_rwsel, 0);  /* Set up for a write
-                                                                 * operation */
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
-        pci_write_32 ((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr); /* write the addr,
-                                                                                 * initiate a read */
-        pci_flush_write (ci);       /* for write order preservation when
-                                     * Optimizing driver */
-        /*
-         * wait 3 line rate clock cycles to ensure address bits are captured
-         * by T1/E1 clock
-         */
-        OS_uwait (4, "wret");       /* 683ns * 3 = 1366 ns, approx 2us (but
-                                     * use 4us) */
-    }
-
-    pci_write_32 ((u_int32_t *) &comet->rlps_eq_cfg, 0xCB);     /* Enable Equalizer &
-                                                                 * set it to use 256
-                                                                 * periods */
+       u_int32_t   ramaddr;
+       volatile u_int32_t value;
+
+       for (ramaddr = 0; ramaddr < 256; ramaddr++) {
+               /*** the following lines are per Errata 7, 2.5 ***/
+               {
+                       /* Set up for a read operation */
+                       pci_write_32((u_int32_t *) &comet->rlps_eq_rwsel, 0x80);
+                       /* for write order preservation when Optimizing driver */
+                       pci_flush_write(ci);
+                       /* write the addr, initiate a read */
+                       pci_write_32((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr);
+                       /* for write order preservation when Optimizing driver */
+                       pci_flush_write(ci);
+                       /*
+                        * wait 3 line rate clock cycles to ensure address
+                        * bits are captured by T1/E1 clock
+                        */
+                       /* 683ns * 3 = 1366 ns, approx 2us (but use 4us) */
+                       OS_uwait(4, "wret");
+               }
+
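+               /* each 32-bit equalizer entry is loaded a byte at a time,
+                * MSB first, into the four indirect-data registers */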
+               value = *table++;
+               pci_write_32((u_int32_t *) &comet->rlps_idata3, (u_int8_t) (value >> 24));
+               pci_write_32((u_int32_t *) &comet->rlps_idata2, (u_int8_t) (value >> 16));
+               pci_write_32((u_int32_t *) &comet->rlps_idata1, (u_int8_t) (value >> 8));
+               pci_write_32((u_int32_t *) &comet->rlps_idata0, (u_int8_t) value);
+               /* for write order preservation when Optimizing driver */
+               pci_flush_write(ci);
+
+               /* Storing RAM address causes RAM to be updated */
+
+               /* Set up for a write operation */
+               pci_write_32((u_int32_t *) &comet->rlps_eq_rwsel, 0);
+               /* for write order preservation when optimizing driver */
+               pci_flush_write(ci);
+               /* write the addr, initiate the write */
+               pci_write_32((u_int32_t *) &comet->rlps_eq_iaddr, (u_int8_t) ramaddr);
+               /* for write order preservation when optimizing driver */
+               pci_flush_write(ci);
+
+               /*
+                * wait 3 line rate clock cycles to ensure address bits are
+                * captured by T1/E1 clock
+                */
+               /* 683ns * 3 = 1366 ns, approx 2us (but use 4us) */
+               OS_uwait(4, "wret");
+       }
+
+       /* Enable Equalizer & set it to use 256 periods */
+       pci_write_32((u_int32_t *) &comet->rlps_eq_cfg, 0xCB);
 }
 
 
@@ -491,9 +517,9 @@ WrtRcvEqualizerTbl (ci_t * ci, comet_t * comet, u_int32_t *table)
 */
 
 STATIC void
-SetPwrLevel (comet_t * comet)
+SetPwrLevel(comet_t *comet)
 {
-    volatile u_int32_t temp;
+       volatile u_int32_t temp;
 
 /*
 **    Algorithm to Balance the Power Distribution of Ttip Tring
@@ -507,22 +533,20 @@ SetPwrLevel (comet_t * comet)
 **    Repeat these steps for register F5
 **    Write 0x01 to register F6
 */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_fdata_sel, 0x00);  /* XLPG Fuse Data Select */
-
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, 0x01); /* XLPG Analog Test
-                                                                 * Positive control */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
-
-    temp = pci_read_32 ((u_int32_t *) &comet->xlpg_atest_pctl) & 0xfe;
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_pctl, temp);
-
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, 0x01); /* XLPG Analog Test
-                                                                 * Negative control */
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
-
-    temp = pci_read_32 ((u_int32_t *) &comet->xlpg_atest_nctl) & 0xfe;
-    pci_write_32 ((u_int32_t *) &comet->xlpg_atest_nctl, temp);
-    pci_write_32 ((u_int32_t *) &comet->xlpg_fdata_sel, 0x01);  /* XLPG */
+       /* XLPG Fuse Data Select */
+       pci_write_32((u_int32_t *) &comet->xlpg_fdata_sel, 0x00);
+       /* XLPG Analog Test Positive control */
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, 0x01);
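+       /* read back, clear bit 0, and rewrite, per the "AND with 0xfe"
+        * step of the balancing sequence above */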
+       temp = pci_read_32((u_int32_t *) &comet->xlpg_atest_pctl) & 0xfe;
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_pctl, temp);
+       /* XLPG Analog Test Negative control */
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, 0x01);
+       temp = pci_read_32((u_int32_t *) &comet->xlpg_atest_nctl) & 0xfe;
+       pci_write_32((u_int32_t *) &comet->xlpg_atest_nctl, temp);
+       /* XLPG Fuse Data Select */
+       pci_write_32((u_int32_t *) &comet->xlpg_fdata_sel, 0x01);
 }
 
 
@@ -535,33 +559,30 @@ SetPwrLevel (comet_t * comet)
 */
 #if 0
 STATIC void
-SetCometOps (comet_t * comet)
+SetCometOps(comet_t *comet)
 {
-    volatile u_int8_t rd_value;
-
-    if (comet == mConfig.C4Func1Base + (COMET0_OFFSET >> 2))
-    {
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_cfg);     /* read the BRIF
-                                                                                 * Configuration */
-        rd_value &= ~0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
-
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_fpcfg);   /* read the BRIF Frame
-                                                                                 * Pulse Configuration */
-        rd_value &= ~0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
-    } else
-    {
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_cfg);     /* read the BRIF
-                                                                                 * Configuration */
-        rd_value |= 0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
-
-        rd_value = (u_int8_t) pci_read_32 ((u_int32_t *) &comet->brif_fpcfg);   /* read the BRIF Frame
-                                                                                 * Pulse Configuration */
-        rd_value |= 0x20;
-        pci_write_32 ((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
-    }
+       volatile u_int8_t rd_value;
+
+       if (comet == mConfig.C4Func1Base + (COMET0_OFFSET >> 2)) {
+               /* read the BRIF Configuration */
+               rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_cfg);
+               rd_value &= ~0x20;
+               pci_write_32((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
+               /* read the BRIF Frame Pulse Configuration */
+               rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_fpcfg);
+               rd_value &= ~0x20;
+               pci_write_32((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
+       } else {
+               /* read the BRIF Configuration */
+               rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_cfg);
+               rd_value |= 0x20;
+               pci_write_32((u_int32_t *) &comet->brif_cfg, (u_int32_t) rd_value);
+               /* read the BRIF Frame Pulse Configuration */
+               rd_value = (u_int8_t) pci_read_32((u_int32_t *) &comet->brif_fpcfg);
+               rd_value |= 0x20;
+               pci_write_32((u_int32_t *) &comet->brif_fpcfg, (u_int8_t) rd_value);
+       }
 }
 #endif
 
index d9a9aa3571d9a1837f74ba469f620654d7c33cc4..6167dc5745778f1a584daf6105d843bcfb15f1e9 100644 (file)
@@ -14,7 +14,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/byteorder.h>
 #include <linux/netdevice.h>
 #include <linux/delay.h>
@@ -97,7 +97,7 @@ pci_write_32 (u_int32_t *p, u_int32_t v)
 
 
 void
-pci_flush_write (ci_t * ci)
+pci_flush_write (ci_t *ci)
 {
     volatile u_int32_t v;
 
@@ -202,7 +202,7 @@ sd_line_is_ok (void *user)
 {
     struct net_device *ndev = (struct net_device *) user;
 
-    return (netif_carrier_ok (ndev));
+    return netif_carrier_ok (ndev);
 }
 
 void
@@ -246,7 +246,7 @@ sd_queue_stopped (void *user)
 {
     struct net_device *ndev = (struct net_device *) user;
 
-    return (netif_queue_stopped (ndev));
+    return netif_queue_stopped (ndev);
 }
 
 void sd_recv_consume(void *token, size_t len, void *user)
@@ -279,7 +279,7 @@ VMETRO_TRACE (void *x)
 
 
 void
-VMETRO_TRIGGER (ci_t * ci, int x)
+VMETRO_TRIGGER (ci_t *ci, int x)
 {
     comet_t    *comet;
     volatile u_int32_t data;
index de8ac0bc24fb705a4d6dcae363ad0b6e3790b37e..110c252d38d7f85f175d194e6777e3f3e7d51b81 100644 (file)
@@ -50,7 +50,7 @@ struct s_hdw_info hdw_info[MAX_BOARDS];
 
 
 void        __init
-show_two (hdw_info_t * hi, int brdno)
+show_two (hdw_info_t *hi, int brdno)
 {
     ci_t       *ci;
     struct pci_dev *pdev;
@@ -102,7 +102,7 @@ show_two (hdw_info_t * hi, int brdno)
 
 
 void        __init
-hdw_sn_get (hdw_info_t * hi, int brdno)
+hdw_sn_get (hdw_info_t *hi, int brdno)
 {
     /* obtain hardware EEPROM information */
     long        addr;
@@ -222,7 +222,7 @@ cleanup_devs (void)
 
 
 STATIC int  __init
-c4_hdw_init (struct pci_dev * pdev, int found)
+c4_hdw_init (struct pci_dev *pdev, int found)
 {
     hdw_info_t *hi;
     int         i;
index a829b6231a6622bd72124ce4317f3c64bb243d83..e5889ef190a216ae4ec98a288efd69e9d9cc5dcf 100644 (file)
@@ -144,7 +144,7 @@ getuserbychan (int channum)
 
 
 char       *
-get_hdlc_name (hdlc_device * hdlc)
+get_hdlc_name (hdlc_device *hdlc)
 {
     struct c4_priv *priv = hdlc->priv;
     struct net_device *dev = getuserbychan (priv->channum);
@@ -185,7 +185,7 @@ mkret (int bsd)
  * within a port's group.
  */
 void
-c4_wk_chan_restart (mch_t * ch)
+c4_wk_chan_restart (mch_t *ch)
 {
     mpi_t      *pi = ch->up;
 
@@ -203,7 +203,7 @@ c4_wk_chan_restart (mch_t * ch)
 }
 
 status_t
-c4_wk_chan_init (mpi_t * pi, mch_t * ch)
+c4_wk_chan_init (mpi_t *pi, mch_t *ch)
 {
     /*
      * this will be used to restart a stopped channel
@@ -218,7 +218,7 @@ c4_wk_chan_init (mpi_t * pi, mch_t * ch)
 }
 
 status_t
-c4_wq_port_init (mpi_t * pi)
+c4_wq_port_init (mpi_t *pi)
 {
 
     char        name[16], *np;  /* NOTE: name of the queue limited by system
@@ -241,7 +241,7 @@ c4_wq_port_init (mpi_t * pi)
 }
 
 void
-c4_wq_port_cleanup (mpi_t * pi)
+c4_wq_port_cleanup (mpi_t *pi)
 {
     /*
      * PORT POINT: cannot call this if WQ is statically allocated w/in
@@ -278,7 +278,7 @@ c4_ebus_interrupt (int irq, void *dev_instance)
 
 
 static int
-void_open (struct net_device * ndev)
+void_open (struct net_device *ndev)
 {
     pr_info("%s: trying to open master device !\n", ndev->name);
     return -1;
@@ -286,7 +286,7 @@ void_open (struct net_device * ndev)
 
 
 STATIC int
-chan_open (struct net_device * ndev)
+chan_open (struct net_device *ndev)
 {
     hdlc_device *hdlc = dev_to_hdlc (ndev);
     const struct c4_priv *priv = hdlc->priv;
@@ -306,7 +306,7 @@ chan_open (struct net_device * ndev)
 
 
 STATIC int
-chan_close (struct net_device * ndev)
+chan_close (struct net_device *ndev)
 {
     hdlc_device *hdlc = dev_to_hdlc (ndev);
     const struct c4_priv *priv = hdlc->priv;
@@ -320,14 +320,14 @@ chan_close (struct net_device * ndev)
 
 
 STATIC int
-chan_dev_ioctl (struct net_device * dev, struct ifreq * ifr, int cmd)
+chan_dev_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 {
     return hdlc_ioctl (dev, ifr, cmd);
 }
 
 
 STATIC int
-chan_attach_noop (struct net_device * ndev, unsigned short foo_1, unsigned short foo_2)
+chan_attach_noop (struct net_device *ndev, unsigned short foo_1, unsigned short foo_2)
 {
     return 0;                   /* our driver has nothing to do here, show's
                                  * over, go home */
@@ -335,7 +335,7 @@ chan_attach_noop (struct net_device * ndev, unsigned short foo_1, unsigned short
 
 
 STATIC struct net_device_stats *
-chan_get_stats (struct net_device * ndev)
+chan_get_stats (struct net_device *ndev)
 {
     mch_t      *ch;
     struct net_device_stats *nstats;
@@ -388,14 +388,14 @@ chan_get_stats (struct net_device * ndev)
 
 
 static ci_t *
-get_ci_by_dev (struct net_device * ndev)
+get_ci_by_dev (struct net_device *ndev)
 {
     return (ci_t *)(netdev_priv(ndev));
 }
 
 
 STATIC int
-c4_linux_xmit (struct sk_buff * skb, struct net_device * ndev)
+c4_linux_xmit (struct sk_buff *skb, struct net_device *ndev)
 {
     const struct c4_priv *priv;
     int         rval;
@@ -417,8 +417,8 @@ static const struct net_device_ops chan_ops = {
 };
 
 STATIC struct net_device *
-create_chan (struct net_device * ndev, ci_t * ci,
-             struct sbecom_chan_param * cp)
+create_chan (struct net_device *ndev, ci_t *ci,
+             struct sbecom_chan_param *cp)
 {
     hdlc_device *hdlc;
     struct net_device *dev;
@@ -510,7 +510,7 @@ create_chan (struct net_device * ndev, ci_t * ci,
 
 /* the idea here is to get port information and pass it back (using pointer) */
 STATIC      status_t
-do_get_port (struct net_device * ndev, void *data)
+do_get_port (struct net_device *ndev, void *data)
 {
     int         ret;
     ci_t       *ci;             /* ci stands for card information */
@@ -535,7 +535,7 @@ do_get_port (struct net_device * ndev, void *data)
 
 /* this function copys the user data and then calls the real action function */
 STATIC      status_t
-do_set_port (struct net_device * ndev, void *data)
+do_set_port (struct net_device *ndev, void *data)
 {
     ci_t       *ci;             /* ci stands for card information */
     struct sbecom_port_param pp;/* copy data to kernel land */
@@ -557,7 +557,7 @@ do_set_port (struct net_device * ndev, void *data)
 
 /* work the port loopback mode as per directed */
 STATIC      status_t
-do_port_loop (struct net_device * ndev, void *data)
+do_port_loop (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -572,7 +572,7 @@ do_port_loop (struct net_device * ndev, void *data)
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_framer_rw (struct net_device * ndev, void *data)
+do_framer_rw (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -593,7 +593,7 @@ do_framer_rw (struct net_device * ndev, void *data)
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_pld_rw (struct net_device * ndev, void *data)
+do_pld_rw (struct net_device *ndev, void *data)
 {
     struct sbecom_port_param pp;
     ci_t       *ci;
@@ -614,7 +614,7 @@ do_pld_rw (struct net_device * ndev, void *data)
 
 /* set the specified register with the given value / or just read it */
 STATIC      status_t
-do_musycc_rw (struct net_device * ndev, void *data)
+do_musycc_rw (struct net_device *ndev, void *data)
 {
     struct c4_musycc_param mp;
     ci_t       *ci;
@@ -634,7 +634,7 @@ do_musycc_rw (struct net_device * ndev, void *data)
 }
 
 STATIC      status_t
-do_get_chan (struct net_device * ndev, void *data)
+do_get_chan (struct net_device *ndev, void *data)
 {
     struct sbecom_chan_param cp;
     int         ret;
@@ -652,7 +652,7 @@ do_get_chan (struct net_device * ndev, void *data)
 }
 
 STATIC      status_t
-do_set_chan (struct net_device * ndev, void *data)
+do_set_chan (struct net_device *ndev, void *data)
 {
     struct sbecom_chan_param cp;
     int         ret;
@@ -673,7 +673,7 @@ do_set_chan (struct net_device * ndev, void *data)
 }
 
 STATIC      status_t
-do_create_chan (struct net_device * ndev, void *data)
+do_create_chan (struct net_device *ndev, void *data)
 {
     ci_t       *ci;
     struct net_device *dev;
@@ -700,7 +700,7 @@ do_create_chan (struct net_device * ndev, void *data)
 }
 
 STATIC      status_t
-do_get_chan_stats (struct net_device * ndev, void *data)
+do_get_chan_stats (struct net_device *ndev, void *data)
 {
     struct c4_chan_stats_wrap ccs;
     int         ret;
@@ -721,7 +721,7 @@ do_get_chan_stats (struct net_device * ndev, void *data)
     return 0;
 }
 STATIC      status_t
-do_set_loglevel (struct net_device * ndev, void *data)
+do_set_loglevel (struct net_device *ndev, void *data)
 {
     unsigned int cxt1e1_log_level;
 
@@ -732,7 +732,7 @@ do_set_loglevel (struct net_device * ndev, void *data)
 }
 
 STATIC      status_t
-do_deluser (struct net_device * ndev, int lockit)
+do_deluser (struct net_device *ndev, int lockit)
 {
     if (ndev->flags & IFF_UP)
         return -EBUSY;
@@ -763,7 +763,7 @@ do_deluser (struct net_device * ndev, int lockit)
 }
 
 int
-do_del_chan (struct net_device * musycc_dev, void *data)
+do_del_chan (struct net_device *musycc_dev, void *data)
 {
     struct sbecom_chan_param cp;
     char        buf[sizeof (CHANNAME) + 3];
@@ -787,7 +787,7 @@ do_del_chan (struct net_device * musycc_dev, void *data)
 int         c4_reset_board (void *);
 
 int
-do_reset (struct net_device * musycc_dev, void *data)
+do_reset (struct net_device *musycc_dev, void *data)
 {
     const struct c4_priv *priv;
     int         i;
@@ -816,7 +816,7 @@ do_reset (struct net_device * musycc_dev, void *data)
 }
 
 int
-do_reset_chan_stats (struct net_device * musycc_dev, void *data)
+do_reset_chan_stats (struct net_device *musycc_dev, void *data)
 {
     struct sbecom_chan_param cp;
 
@@ -827,7 +827,7 @@ do_reset_chan_stats (struct net_device * musycc_dev, void *data)
 }
 
 STATIC      status_t
-c4_ioctl (struct net_device * ndev, struct ifreq * ifr, int cmd)
+c4_ioctl (struct net_device *ndev, struct ifreq *ifr, int cmd)
 {
     ci_t       *ci;
     void       *data;
@@ -954,7 +954,7 @@ static void c4_setup(struct net_device *dev)
 }
 
 struct net_device *__init
-c4_add_dev (hdw_info_t * hi, int brdno, unsigned long f0, unsigned long f1,
+c4_add_dev (hdw_info_t *hi, int brdno, unsigned long f0, unsigned long f1,
             int irq0, int irq1)
 {
     struct net_device *ndev;
index b2cc68a1fe879ab155106b271e2671fb564cab09..1037086d00a70ce6a74ed0f4ace41175974e2750 100644 (file)
@@ -74,7 +74,7 @@ void        musycc_update_timeslots(mpi_t *);
 
 #if 1
 STATIC int
-musycc_dump_rxbuffer_ring(mch_t * ch, int lockit)
+musycc_dump_rxbuffer_ring(mch_t *ch, int lockit)
 {
     struct mdesc *m;
     unsigned long flags = 0;
@@ -140,7 +140,7 @@ musycc_dump_rxbuffer_ring(mch_t * ch, int lockit)
 
 #if 1
 STATIC int
-musycc_dump_txbuffer_ring(mch_t * ch, int lockit)
+musycc_dump_txbuffer_ring(mch_t *ch, int lockit)
 {
     struct mdesc *m;
     unsigned long flags = 0;
@@ -205,7 +205,7 @@ musycc_dump_txbuffer_ring(mch_t * ch, int lockit)
  */
 
 status_t
-musycc_dump_ring(ci_t * ci, unsigned int chan)
+musycc_dump_ring(ci_t *ci, unsigned int chan)
 {
     mch_t      *ch;
 
@@ -248,7 +248,7 @@ musycc_dump_ring(ci_t * ci, unsigned int chan)
 
 
 status_t
-musycc_dump_rings(ci_t * ci, unsigned int start_chan)
+musycc_dump_rings(ci_t *ci, unsigned int start_chan)
 {
     unsigned int chan;
 
@@ -264,7 +264,7 @@ musycc_dump_rings(ci_t * ci, unsigned int start_chan)
  */
 
 void
-musycc_init_mdt(mpi_t * pi)
+musycc_init_mdt(mpi_t *pi)
 {
     u_int32_t  *addr, cfg;
     int         i;
@@ -288,7 +288,7 @@ musycc_init_mdt(mpi_t * pi)
 /* Set TX thp to the next unprocessed md */
 
 void
-musycc_update_tx_thp(mch_t * ch)
+musycc_update_tx_thp(mch_t *ch)
 {
     struct mdesc *md;
     unsigned long flags;
@@ -443,7 +443,7 @@ musycc_wq_chan_restart(void *arg)      /* channel private structure */
   */
 
 void
-musycc_chan_restart(mch_t * ch)
+musycc_chan_restart(mch_t *ch)
 {
 #ifdef RLD_RESTART_DEBUG
     pr_info("++ musycc_chan_restart[%d]: txd_irq_srv @ %p = sts %x\n",
@@ -461,7 +461,7 @@ musycc_chan_restart(mch_t * ch)
 
 
 void
-rld_put_led(mpi_t * pi, u_int32_t ledval)
+rld_put_led(mpi_t *pi, u_int32_t ledval)
 {
     static u_int32_t led = 0;
 
@@ -477,7 +477,7 @@ rld_put_led(mpi_t * pi, u_int32_t ledval)
 #define MUSYCC_SR_RETRY_CNT  9
 
 void
-musycc_serv_req(mpi_t * pi, u_int32_t req)
+musycc_serv_req(mpi_t *pi, u_int32_t req)
 {
     volatile u_int32_t r;
     int         rcnt;
@@ -578,7 +578,7 @@ rewrite:
 
 #ifdef  SBE_PMCC4_ENABLE
 void
-musycc_update_timeslots(mpi_t * pi)
+musycc_update_timeslots(mpi_t *pi)
 {
     int         i, ch;
     char        e1mode = IS_FRAME_ANY_E1(pi->p.port_mode);
@@ -640,7 +640,7 @@ musycc_update_timeslots(mpi_t * pi)
 
 #ifdef SBE_WAN256T3_ENABLE
 void
-musycc_update_timeslots(mpi_t * pi)
+musycc_update_timeslots(mpi_t *pi)
 {
     mch_t      *ch;
 
@@ -703,7 +703,7 @@ musycc_chan_proto(int proto)
 
 #ifdef SBE_WAN256T3_ENABLE
 STATIC void __init
-musycc_init_port(mpi_t * pi)
+musycc_init_port(mpi_t *pi)
 {
     pci_write_32((u_int32_t *) &pi->reg->gbp, OS_vtophys(pi->regram));
 
@@ -737,7 +737,7 @@ musycc_init_port(mpi_t * pi)
 
 
 status_t    __init
-musycc_init(ci_t * ci)
+musycc_init(ci_t *ci)
 {
     char       *regaddr;        /* temp for address boundary calculations */
     int         i, gchan;
@@ -832,7 +832,7 @@ musycc_init(ci_t * ci)
 
 
 void
-musycc_bh_tx_eom(mpi_t * pi, int gchan)
+musycc_bh_tx_eom(mpi_t *pi, int gchan)
 {
     mch_t      *ch;
     struct mdesc *md;
@@ -1010,7 +1010,7 @@ musycc_bh_tx_eom(mpi_t * pi, int gchan)
 
 
 STATIC void
-musycc_bh_rx_eom(mpi_t * pi, int gchan)
+musycc_bh_rx_eom(mpi_t *pi, int gchan)
 {
     mch_t      *ch;
     void       *m, *m2;
@@ -1229,7 +1229,7 @@ unsigned long
 #else
 void
 #endif
-musycc_intr_bh_tasklet(ci_t * ci)
+musycc_intr_bh_tasklet(ci_t *ci)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1517,7 +1517,7 @@ musycc_intr_bh_tasklet(ci_t * ci)
 
 #if 0
 int         __init
-musycc_new_chan(ci_t * ci, int channum, void *user)
+musycc_new_chan(ci_t *ci, int channum, void *user)
 {
     mch_t      *ch;
 
@@ -1546,7 +1546,7 @@ musycc_new_chan(ci_t * ci, int channum, void *user)
 
 #ifdef SBE_PMCC4_ENABLE
 status_t
-musycc_chan_down(ci_t * dummy, int channum)
+musycc_chan_down(ci_t *dummy, int channum)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1597,7 +1597,7 @@ musycc_chan_down(ci_t * dummy, int channum)
 
 
 int
-musycc_del_chan(ci_t * ci, int channum)
+musycc_del_chan(ci_t *ci, int channum)
 {
     mch_t      *ch;
 
@@ -1613,7 +1613,7 @@ musycc_del_chan(ci_t * ci, int channum)
 
 
 int
-musycc_del_chan_stats(ci_t * ci, int channum)
+musycc_del_chan_stats(ci_t *ci, int channum)
 {
     mch_t      *ch;
 
@@ -1628,7 +1628,7 @@ musycc_del_chan_stats(ci_t * ci, int channum)
 
 
 int
-musycc_start_xmit(ci_t * ci, int channum, void *mem_token)
+musycc_start_xmit(ci_t *ci, int channum, void *mem_token)
 {
     mch_t      *ch;
     struct mdesc *md;
index b0ed4ad13011608d508464451db086ae6414335a..003eb8690190d5dac0afc1324d4a405954b20703 100644 (file)
@@ -85,15 +85,15 @@ void        c4_cleanup (void);
 status_t    c4_chan_up (ci_t *, int channum);
 status_t    c4_del_chan_stats (int channum);
 status_t    c4_del_chan (int channum);
-status_t    c4_get_iidinfo (ci_t * ci, struct sbe_iid_info * iip);
+status_t    c4_get_iidinfo (ci_t *ci, struct sbe_iid_info *iip);
 int         c4_is_chan_up (int channum);
 
 void       *getuserbychan (int channum);
-void        pci_flush_write (ci_t * ci);
+void        pci_flush_write (ci_t *ci);
 void        sbecom_set_loglevel (int debuglevel);
-char       *sbeid_get_bdname (ci_t * ci);
-void        sbeid_set_bdtype (ci_t * ci);
-void        sbeid_set_hdwbid (ci_t * ci);
+char       *sbeid_get_bdname (ci_t *ci);
+void        sbeid_set_bdtype (ci_t *ci);
+void        sbeid_set_hdwbid (ci_t *ci);
 u_int32_t   sbeCrc (u_int8_t *, u_int32_t, u_int32_t, u_int32_t *);
 
 void        VMETRO_TRACE (void *);       /* put data into 8 LEDs */
index 8d8a22be5b2ec3c7329a799ac23ad5d2b389dcd4..32d7a216a41922fc43ae83c4d1f0c37ac0b0b339 100644 (file)
@@ -28,7 +28,7 @@
 #include <linux/sched.h>        /* include for timer */
 #include <linux/timer.h>        /* include for timer */
 #include <linux/hdlc.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include "sbecom_inline_linux.h"
 #include "libsbew.h"
@@ -123,7 +123,7 @@ c4_find_chan (int channum)
                 {
                     if ((ch->state != UNASSIGNED) &&
                         (ch->channum == channum))
-                        return (ch);
+                        return ch;
                 }
             }
     return 0;
@@ -193,7 +193,7 @@ c4_new (void *hi)
 #define COMET_LBCMD_READ  0x80  /* read only (do not set, return read value) */
 
 void
-checkPorts (ci_t * ci)
+checkPorts (ci_t *ci)
 {
 #ifndef CONFIG_SBE_PMCC4_NCOMM
     /*
@@ -459,7 +459,7 @@ checkPorts (ci_t * ci)
 
 
 STATIC void
-c4_watchdog (ci_t * ci)
+c4_watchdog (ci_t *ci)
 {
     if (drvr_state != SBE_DRVR_AVAILABLE)
     {
@@ -512,7 +512,7 @@ c4_cleanup (void)
  */
 
 int
-c4_get_portcfg (ci_t * ci)
+c4_get_portcfg (ci_t *ci)
 {
     comet_t    *comet;
     int         portnum, mask;
@@ -536,7 +536,7 @@ c4_get_portcfg (ci_t * ci)
 /* nothing herein should generate interrupts */
 
 status_t    __init
-c4_init (ci_t * ci, u_char *func0, u_char *func1)
+c4_init (ci_t *ci, u_char *func0, u_char *func1)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -670,7 +670,7 @@ c4_init (ci_t * ci, u_char *func0, u_char *func1)
 /* better be fully setup to handle interrupts when you call this */
 
 status_t    __init
-c4_init2 (ci_t * ci)
+c4_init2 (ci_t *ci)
 {
     status_t    ret;
 
@@ -698,7 +698,7 @@ c4_init2 (ci_t * ci)
 /* This function sets the loopback mode (or clears it, as the case may be). */
 
 int
-c4_loop_port (ci_t * ci, int portnum, u_int8_t cmd)
+c4_loop_port (ci_t *ci, int portnum, u_int8_t cmd)
 {
     comet_t    *comet;
     volatile u_int32_t loopValue;
@@ -757,7 +757,7 @@ c4_loop_port (ci_t * ci, int portnum, u_int8_t cmd)
  */
 
 status_t
-c4_frame_rw (ci_t * ci, struct sbecom_port_param * pp)
+c4_frame_rw (ci_t *ci, struct sbecom_port_param *pp)
 {
     comet_t    *comet;
     volatile u_int32_t data;
@@ -796,7 +796,7 @@ c4_frame_rw (ci_t * ci, struct sbecom_port_param * pp)
  */
 
 status_t
-c4_pld_rw (ci_t * ci, struct sbecom_port_param * pp)
+c4_pld_rw (ci_t *ci, struct sbecom_port_param *pp)
 {
     volatile u_int32_t *regaddr;
     volatile u_int32_t data;
@@ -834,7 +834,7 @@ c4_pld_rw (ci_t * ci, struct sbecom_port_param * pp)
  */
 
 status_t
-c4_musycc_rw (ci_t * ci, struct c4_musycc_param * mcp)
+c4_musycc_rw (ci_t *ci, struct c4_musycc_param *mcp)
 {
     mpi_t      *pi;
     volatile u_int32_t *dph;    /* hardware implemented register */
@@ -898,7 +898,7 @@ c4_musycc_rw (ci_t * ci, struct c4_musycc_param * mcp)
 }
 
 status_t
-c4_get_port (ci_t * ci, int portnum)
+c4_get_port (ci_t *ci, int portnum)
 {
     if (portnum >= ci->max_port)    /* sanity check */
         return ENXIO;
@@ -913,7 +913,7 @@ c4_get_port (ci_t * ci, int portnum)
 }
 
 status_t
-c4_set_port (ci_t * ci, int portnum)
+c4_set_port (ci_t *ci, int portnum)
 {
     mpi_t      *pi;
     struct sbecom_port_param *pp;
@@ -942,7 +942,7 @@ c4_set_port (ci_t * ci, int portnum)
 
         if ((ret = c4_wq_port_init (pi)))       /* create/init
                                                  * workqueue_struct */
-            return (ret);
+            return ret;
     }
 
     init_comet (ci, pi->cometbase, pp->port_mode, 1 /* clockmaster == true */ , pp->portP);
@@ -1018,7 +1018,7 @@ c4_set_port (ci_t * ci, int portnum)
 unsigned int max_int = 0;
 
 status_t
-c4_new_chan (ci_t * ci, int portnum, int channum, void *user)
+c4_new_chan (ci_t *ci, int portnum, int channum, void *user)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1111,7 +1111,7 @@ c4_del_chan_stats (int channum)
 
 
 status_t
-c4_set_chan (int channum, struct sbecom_chan_param * p)
+c4_set_chan (int channum, struct sbecom_chan_param *p)
 {
     mch_t      *ch;
     int         i, x = 0;
@@ -1162,7 +1162,7 @@ c4_set_chan (int channum, struct sbecom_chan_param * p)
 
 
 status_t
-c4_get_chan (int channum, struct sbecom_chan_param * p)
+c4_get_chan (int channum, struct sbecom_chan_param *p)
 {
     mch_t      *ch;
 
@@ -1173,7 +1173,7 @@ c4_get_chan (int channum, struct sbecom_chan_param * p)
 }
 
 status_t
-c4_get_chan_stats (int channum, struct sbecom_chan_stats * p)
+c4_get_chan_stats (int channum, struct sbecom_chan_stats *p)
 {
     mch_t      *ch;
 
@@ -1185,7 +1185,7 @@ c4_get_chan_stats (int channum, struct sbecom_chan_stats * p)
 }
 
 STATIC int
-c4_fifo_alloc (mpi_t * pi, int chan, int *len)
+c4_fifo_alloc (mpi_t *pi, int chan, int *len)
 {
     int         i, l = 0, start = 0, max = 0, maxstart = 0;
 
@@ -1222,7 +1222,7 @@ c4_fifo_alloc (mpi_t * pi, int chan, int *len)
 }
 
 void
-c4_fifo_free (mpi_t * pi, int chan)
+c4_fifo_free (mpi_t *pi, int chan)
 {
     int         i;
 
@@ -1236,7 +1236,7 @@ c4_fifo_free (mpi_t * pi, int chan)
 
 
 status_t
-c4_chan_up (ci_t * ci, int channum)
+c4_chan_up (ci_t *ci, int channum)
 {
     mpi_t      *pi;
     mch_t      *ch;
@@ -1467,7 +1467,7 @@ errfree:
 /* stop the hardware from servicing & interrupting */
 
 void
-c4_stopwd (ci_t * ci)
+c4_stopwd (ci_t *ci)
 {
     OS_stop_watchdog (&ci->wd);
     SD_SEM_TAKE (&ci->sem_wdbusy, "_stop_");    /* ensure WD not running */
@@ -1476,7 +1476,7 @@ c4_stopwd (ci_t * ci)
 
 
 void
-sbecom_get_brdinfo (ci_t * ci, struct sbe_brd_info * bip, u_int8_t *bsn)
+sbecom_get_brdinfo (ci_t *ci, struct sbe_brd_info *bip, u_int8_t *bsn)
 {
     char       *np;
     u_int32_t   sn = 0;
@@ -1485,7 +1485,7 @@ sbecom_get_brdinfo (ci_t * ci, struct sbe_brd_info * bip, u_int8_t *bsn)
     bip->brdno = ci->brdno;         /* our board number */
     bip->brd_id = ci->brd_id;
     bip->brd_hdw_id = ci->hdw_bid;
-    bip->brd_chan_cnt = MUSYCC_NCHANS * ci->max_port;   /* number of channels
+    bip->brd_chan_cnt = MUSYCC_NCHANS * ci->max_port;   /* number of channels
                                                          * being used */
     bip->brd_port_cnt = ci->max_port;   /* number of ports being used */
     bip->brd_pci_speed = BINFO_PCI_SPEED_unk;   /* PCI speed not yet
@@ -1535,7 +1535,7 @@ sbecom_get_brdinfo (ci_t * ci, struct sbe_brd_info * bip, u_int8_t *bsn)
 
 
 status_t
-c4_get_iidinfo (ci_t * ci, struct sbe_iid_info * iip)
+c4_get_iidinfo (ci_t *ci, struct sbe_iid_info *iip)
 {
     struct net_device *dev;
     char       *np;
@@ -1624,7 +1624,7 @@ wanpmcC4T1E1_getBaseAddress (int cardID, int deviceID)
         }
         ci = ci->next;              /* next board, if any */
     }
-    return (base);
+    return base;
 }
 
 #endif                          /*** CONFIG_SBE_PMCC4_NCOMM ***/
index 68ed445ab0cb39f5a8527ccec2af11fc324b2dd8..3c6d1c0fc6d69050c893a320781facd7419114d8 100644 (file)
@@ -177,7 +177,7 @@ struct watchdog
 
 
 static inline int
-OS_start_watchdog (struct watchdog * wd)
+OS_start_watchdog (struct watchdog *wd)
 {
     wd->h.expires = jiffies + wd->ticks;
     add_timer (&wd->h);
@@ -186,7 +186,7 @@ OS_start_watchdog (struct watchdog * wd)
 
 
 static inline int
-OS_stop_watchdog (struct watchdog * wd)
+OS_stop_watchdog (struct watchdog *wd)
 {
     del_timer_sync (&wd->h);
     return 0;
@@ -194,7 +194,7 @@ OS_stop_watchdog (struct watchdog * wd)
 
 
 static inline int
-OS_free_watchdog (struct watchdog * wd)
+OS_free_watchdog (struct watchdog *wd)
 {
     OS_stop_watchdog (wd);
     OS_kfree (wd);
index a2243b10ef0589fb38e8eec48692c71842c7a187..0f9bd5f8136c178a8d1884492e7a008b48e5259d 100644 (file)
@@ -27,7 +27,7 @@
 
 
 char       *
-sbeid_get_bdname (ci_t * ci)
+sbeid_get_bdname (ci_t *ci)
 {
     char       *np = 0;
 
@@ -73,7 +73,7 @@ sbeid_get_bdname (ci_t * ci)
 /* given the presetting of brd_id, set the corresponding hdw_id */
 
 void
-sbeid_set_hdwbid (ci_t * ci)
+sbeid_set_hdwbid (ci_t *ci)
 {
     /*
      * set SBE's unique hardware identification (for legacy boards might not
@@ -170,7 +170,7 @@ sbeid_set_hdwbid (ci_t * ci)
 /* given the presetting of hdw_bid, set the corresponding brd_id */
 
 void
-sbeid_set_bdtype (ci_t * ci)
+sbeid_set_bdtype (ci_t *ci)
 {
     /* set SBE's unique PCI VENDOR/DEVID */
     switch (ci->hdw_bid)
index e5c072cf1952d1f505517c784e21a0043e15a8b7..37285df359c161f1733ebbd3a7bbf0c2e9ba32d0 100644 (file)
@@ -28,11 +28,11 @@ int __init  sbecom_proc_brd_init (ci_t *);
 
 #else
 
-static inline void sbecom_proc_brd_cleanup(ci_t * ci)
+static inline void sbecom_proc_brd_cleanup(ci_t *ci)
 {
 }
 
-static inline int __init sbecom_proc_brd_init(ci_t * ci)
+static inline int __init sbecom_proc_brd_init(ci_t *ci)
 {
        return 0;
 }
index 114799cddd85659899c092117cf95050e005f652..69bfe309376d36ea2eca26404a56be5b0660f171 100644 (file)
@@ -392,7 +392,7 @@ static long dgrp_dpa_ioctl(struct file *file, unsigned int cmd,
                getnode.nd_rx_byte = nd->nd_rx_byte;
 
                memset(&getnode.nd_ps_desc, 0, MAX_DESC_LEN);
-               strncpy(getnode.nd_ps_desc, nd->nd_ps_desc, MAX_DESC_LEN);
+               strlcpy(getnode.nd_ps_desc, nd->nd_ps_desc, MAX_DESC_LEN);
 
                if (copy_to_user(uarg, &getnode, sizeof(struct digi_node)))
                        return -EFAULT;
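
The strncpy()-to-strlcpy() switch above matters because strncpy() leaves the destination without a NUL terminator whenever the source fills the buffer, while strlcpy() always terminates; the companion hunks below shrink the copy bound to MAX_DESC_LEN - 1 so the terminator is guaranteed to fit. A self-contained demonstration, with a local strlcpy() stand-in since older C libraries do not ship one:

    #include <stdio.h>
    #include <string.h>

    /* Minimal strlcpy(): copy at most size-1 bytes, always terminate,
     * and return the length of src so truncation can be detected. */
    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
            size_t len = strlen(src);

            if (size) {
                    size_t n = len < size - 1 ? len : size - 1;

                    memcpy(dst, src, n);
                    dst[n] = '\0';
            }
            return len;
    }

    int main(void)
    {
            char buf[4];

            strncpy(buf, "abcdef", sizeof(buf));
            /* buf now holds 'a' 'b' 'c' 'd' with no terminating NUL */
            my_strlcpy(buf, "abcdef", sizeof(buf));
            printf("%s\n", buf);            /* "abc", safely terminated */
            return 0;
    }
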
index 5b7833f593ff7a8909ca74a7a10a43d638626004..33ac7fb88cbd3b98b24b1eb50209e4a0b5c6100e 100644 (file)
@@ -278,7 +278,7 @@ static void parity_scan(struct ch_struct *ch, unsigned char *cbuf,
                switch (ch->ch_pscan_state) {
                default:
                        /* reset to sanity and fall through */
-                       ch->ch_pscan_state = 0 ;
+                       ch->ch_pscan_state = 0;
 
                case 0:
                        /* No FF seen yet */
@@ -1607,7 +1607,7 @@ static int dgrp_send(struct nd_struct *nd, long tmax)
                                        if ((ch->ch_pun.un_flag & UN_LOW) != 0 ?
                                            (n <= TBUF_LOW) :
                                            (ch->ch_pun.un_flag & UN_TIME) != 0 ?
-                                           ((jiffies - ch->ch_waketime) >= 0) :
+                                           time_is_before_jiffies(ch->ch_waketime) :
                                            (n == 0 && ch->ch_s_tpos == ch->ch_s_tin) &&
                                            ((ch->ch_pun.un_flag & UN_EMPTY) != 0 ||
                                            ((ch->ch_tun.un_open_count &&
@@ -3083,7 +3083,7 @@ check_query:
                                                nd->nd_hw_ver = (b[8] << 8) | b[9];
                                                nd->nd_sw_ver = (b[10] << 8) | b[11];
                                                nd->nd_hw_id = b[6];
-                                               desclen = ((plen - 12) > MAX_DESC_LEN) ? MAX_DESC_LEN :
+                                               desclen = (plen - 12 > MAX_DESC_LEN - 1) ? MAX_DESC_LEN - 1 :
                                                        plen - 12;
 
                                                if (desclen <= 0) {
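
The time_is_before_jiffies() conversion above fixes a real bug rather than just style: jiffies is an unsigned long, so the old expression (jiffies - ch->ch_waketime) >= 0 could never be false. The jiffies helpers compare the difference as a signed value instead, which both restores the intended test and tolerates counter wraparound. A userspace model of the idea (an assumed simplification of the real kernel macros):

    #include <stdio.h>

    /* Model of time_is_before_jiffies(t): true when t is in the past.
     * Casting the unsigned difference to signed makes the comparison
     * meaningful and wraparound-safe. */
    static int time_is_before(unsigned long now, unsigned long t)
    {
            return (long)(t - now) < 0;
    }

    int main(void)
    {
            printf("%d\n", time_is_before(100, 50));   /* 1: past */
            printf("%d\n", time_is_before(100, 150));  /* 0: future */
            /* wrapped counter: now restarted at 5, t set 25 ticks ago */
            printf("%d\n", time_is_before(5, (unsigned long)-20)); /* 1 */
            return 0;
    }
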
index 84a1e7be4899b64240231c08f7e21b120f44e114..4024b488eba9071284ce07ab24a7ae8cd88bb5c4 100644 (file)
@@ -674,7 +674,7 @@ struct nd_struct {
        ushort       nd_hw_ver;           /* HW version returned from PS   */
        ushort       nd_sw_ver;           /* SW version returned from PS   */
        uint         nd_hw_id;            /* HW ID returned from PS        */
-       u8        nd_ps_desc[MAX_DESC_LEN+1];  /* Description from PS   */
+       u8        nd_ps_desc[MAX_DESC_LEN];  /* Description from PS     */
        uint         nd_vpd_len;                /* VPD len, if any */
        u8           nd_vpd[VPDSIZE];           /* VPD, if any */
 
index 3177db2380bfca5943a38670efaaf8a3cf546e4a..ac8ed15064dca6542768fa703f1c050ddd50a905 100644 (file)
@@ -506,8 +506,7 @@ static void dwc2_config_fifos(struct dwc2_hsotg *hsotg)
        struct dwc2_core_params *params = hsotg->core_params;
        u32 rxfsiz, nptxfsiz, ptxfsiz, hptxfsiz, dfifocfg;
 
-       if (!(hsotg->hwcfg2 & GHWCFG2_DYNAMIC_FIFO) ||
-           !params->enable_dynamic_fifo)
+       if (!params->enable_dynamic_fifo)
                return;
 
        dev_dbg(hsotg->dev, "Total FIFO Size=%d\n", hsotg->total_fifo_size);
@@ -1146,16 +1145,10 @@ void dwc2_hc_cleanup(struct dwc2_hsotg *hsotg, struct dwc2_host_chan *chan)
 static void dwc2_hc_set_even_odd_frame(struct dwc2_hsotg *hsotg,
                                       struct dwc2_host_chan *chan, u32 *hcchar)
 {
-       u32 hfnum, frnum;
-
        if (chan->ep_type == USB_ENDPOINT_XFER_INT ||
            chan->ep_type == USB_ENDPOINT_XFER_ISOC) {
-               hfnum = readl(hsotg->regs + HFNUM);
-               frnum = hfnum >> HFNUM_FRNUM_SHIFT &
-                       HFNUM_FRNUM_MASK >> HFNUM_FRNUM_SHIFT;
-
                /* 1 if _next_ frame is odd, 0 if it's even */
-               if (frnum & 0x1)
+               if (dwc2_hcd_get_frame_number(hsotg) & 0x1)
                        *hcchar |= HCCHAR_ODDFRM;
        }
 }
@@ -1696,7 +1689,7 @@ u32 dwc2_calc_frame_interval(struct dwc2_hsotg *hsotg)
            GHWCFG2_FS_PHY_TYPE_DEDICATED)
                clock = 48;
 
-       if ((hprt0 & HPRT0_SPD_MASK) == 0)
+       if ((hprt0 & HPRT0_SPD_MASK) == HPRT0_SPD_HIGH_SPEED)
                /* High speed case */
                return 125 * clock;
        else
@@ -1815,8 +1808,6 @@ void dwc2_dump_global_registers(struct dwc2_hsotg *hsotg)
 {
 #ifdef DEBUG
        u32 __iomem *addr;
-       int i, ep_num;
-       char *txfsiz;
 
        dev_dbg(hsotg->dev, "Core Global Registers\n");
        addr = hsotg->regs + GOTGCTL;
@@ -1892,23 +1883,6 @@ void dwc2_dump_global_registers(struct dwc2_hsotg *hsotg)
        dev_dbg(hsotg->dev, "HPTXFSIZ    @0x%08lX : 0x%08X\n",
                (unsigned long)addr, readl(addr));
 
-       if (hsotg->core_params->en_multiple_tx_fifo <= 0) {
-               ep_num = hsotg->hwcfg4 >> GHWCFG4_NUM_DEV_PERIO_IN_EP_SHIFT &
-                        GHWCFG4_NUM_DEV_PERIO_IN_EP_MASK >>
-                                        GHWCFG4_NUM_DEV_PERIO_IN_EP_SHIFT;
-               txfsiz = "DPTXFSIZ";
-       } else {
-               ep_num = hsotg->hwcfg4 >> GHWCFG4_NUM_IN_EPS_SHIFT &
-                        GHWCFG4_NUM_IN_EPS_MASK >> GHWCFG4_NUM_IN_EPS_SHIFT;
-               txfsiz = "DIENPTXF";
-       }
-
-       for (i = 0; i < ep_num; i++) {
-               addr = hsotg->regs + DPTXFSIZN(i + 1);
-               dev_dbg(hsotg->dev, "%s[%d] @0x%08lX : 0x%08X\n", txfsiz, i + 1,
-                       (unsigned long)addr, readl(addr));
-       }
-
        addr = hsotg->regs + PCGCTL;
        dev_dbg(hsotg->dev, "PCGCTL      @0x%08lX : 0x%08X\n",
                (unsigned long)addr, readl(addr));
@@ -2298,7 +2272,7 @@ int dwc2_set_param_phy_type(struct dwc2_hsotg *hsotg, int val)
 #ifndef NO_FS_PHY_HW_CHECKS
                valid = 0;
 #else
-               val = 0;
+               val = DWC2_PHY_TYPE_PARAM_FS;
                dev_dbg(hsotg->dev, "Setting phy_type to %d\n", val);
                retval = -EINVAL;
 #endif
@@ -2325,7 +2299,7 @@ int dwc2_set_param_phy_type(struct dwc2_hsotg *hsotg, int val)
                        dev_err(hsotg->dev,
                                "%d invalid for phy_type. Check HW configuration.\n",
                                val);
-               val = 0;
+               val = DWC2_PHY_TYPE_PARAM_FS;
                if (hs_phy_type != GHWCFG2_HS_PHY_TYPE_NOT_SUPPORTED) {
                        if (hs_phy_type == GHWCFG2_HS_PHY_TYPE_UTMI ||
                            hs_phy_type == GHWCFG2_HS_PHY_TYPE_UTMI_ULPI)
@@ -2360,8 +2334,8 @@ int dwc2_set_param_speed(struct dwc2_hsotg *hsotg, int val)
                valid = 0;
        }
 
-       if (val == 0 && dwc2_get_param_phy_type(hsotg) ==
-                                       DWC2_PHY_TYPE_PARAM_FS)
+       if (val == DWC2_SPEED_PARAM_HIGH &&
+           dwc2_get_param_phy_type(hsotg) == DWC2_PHY_TYPE_PARAM_FS)
                valid = 0;
 
        if (!valid) {
@@ -2370,7 +2344,7 @@ int dwc2_set_param_speed(struct dwc2_hsotg *hsotg, int val)
                                "%d invalid for speed parameter. Check HW configuration.\n",
                                val);
                val = dwc2_get_param_phy_type(hsotg) == DWC2_PHY_TYPE_PARAM_FS ?
-                               1 : 0;
+                               DWC2_SPEED_PARAM_FULL : DWC2_SPEED_PARAM_HIGH;
                dev_dbg(hsotg->dev, "Setting speed to %d\n", val);
                retval = -EINVAL;
        }
@@ -2668,7 +2642,7 @@ int dwc2_set_param_otg_ver(struct dwc2_hsotg *hsotg, int val)
  * for the DWC_otg core. It returns non-0 if any parameters are invalid.
  */
 int dwc2_set_parameters(struct dwc2_hsotg *hsotg,
-                       struct dwc2_core_params *params)
+                       const struct dwc2_core_params *params)
 {
        int retval = 0;
 
index 4c9ad14e90ecdc42af8c611869ba5b14ac576192..98c51bba6622ea5762a0b056737b3a142a78bd1b 100644 (file)
@@ -403,8 +403,7 @@ static void dwc2_handle_usb_suspend_intr(struct dwc2_hsotg *hsotg)
 #define GINTMSK_COMMON (GINTSTS_WKUPINT | GINTSTS_SESSREQINT |         \
                         GINTSTS_CONIDSTSCHNG | GINTSTS_OTGINT |        \
                         GINTSTS_MODEMIS | GINTSTS_DISCONNINT |         \
-                        GINTSTS_USBSUSP | GINTSTS_RESTOREDONE |        \
-                        GINTSTS_PRTINT)
+                        GINTSTS_USBSUSP | GINTSTS_PRTINT)
 
 /*
  * This function returns the Core Interrupt register
@@ -450,7 +449,7 @@ irqreturn_t dwc2_handle_common_intr(int irq, void *dev)
 {
        struct dwc2_hsotg *hsotg = dev;
        u32 gintsts;
-       int retval = 0;
+       irqreturn_t retval = IRQ_NONE;
 
        if (dwc2_check_core_status(hsotg) < 0) {
                dev_warn(hsotg->dev, "Controller is disconnected\n");
@@ -461,7 +460,7 @@ irqreturn_t dwc2_handle_common_intr(int irq, void *dev)
 
        gintsts = dwc2_read_common_intr(hsotg);
        if (gintsts & ~GINTSTS_PRTINT)
-               retval = 1;
+               retval = IRQ_HANDLED;
 
        if (gintsts & GINTSTS_MODEMIS)
                dwc2_handle_mode_mismatch_intr(hsotg);
@@ -478,12 +477,6 @@ irqreturn_t dwc2_handle_common_intr(int irq, void *dev)
        if (gintsts & GINTSTS_USBSUSP)
                dwc2_handle_usb_suspend_intr(hsotg);
 
-       if (gintsts & GINTSTS_RESTOREDONE) {
-               gintsts = GINTSTS_RESTOREDONE;
-               writel(gintsts, hsotg->regs + GINTSTS);
-               dev_dbg(hsotg->dev, " --Restore done interrupt received--\n");
-       }
-
        if (gintsts & GINTSTS_PRTINT) {
                /*
                 * The port interrupt occurs while in device mode with HPRT0
@@ -500,6 +493,6 @@ irqreturn_t dwc2_handle_common_intr(int irq, void *dev)
 
        spin_unlock(&hsotg->lock);
 out:
-       return IRQ_RETVAL(retval);
+       return retval;
 }
 EXPORT_SYMBOL_GPL(dwc2_handle_common_intr);
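
The hunks above replace an int flag wrapped by IRQ_RETVAL() with an irqreturn_t built directly, which is the idiomatic shape for a shared-IRQ handler. A minimal sketch of that shape (hypothetical device struct and status register, not dwc2's real layout):

    #include <linux/interrupt.h>
    #include <linux/io.h>

    struct mydev {
            void __iomem *regs;     /* hypothetical: one status register */
    };

    static irqreturn_t mydev_irq(int irq, void *dev_id)
    {
            struct mydev *dev = dev_id;
            u32 status = readl(dev->regs);

            if (!status)
                    return IRQ_NONE;        /* not ours; let other sharers run */

            writel(status, dev->regs);      /* assumed write-1-to-clear ack */
            return IRQ_HANDLED;
    }
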
index 8551ccedf0376284c4d81648db0c2cbddde6d946..2ed54b172a3b11308fb15e834d6742f13a150b90 100644 (file)
@@ -1563,9 +1563,9 @@ static int dwc2_hcd_hub_control(struct dwc2_hsotg *hsotg, u16 typereq,
                break;
 
        case GetPortStatus:
-               dev_dbg(hsotg->dev,
-                       "GetPortStatus wIndex=0x%04x flags=0x%08x\n", windex,
-                       hsotg->flags.d32);
+               dev_vdbg(hsotg->dev,
+                        "GetPortStatus wIndex=0x%04x flags=0x%08x\n", windex,
+                        hsotg->flags.d32);
                if (!windex || windex > 1)
                        goto error;
 
@@ -1598,7 +1598,7 @@ static int dwc2_hcd_hub_control(struct dwc2_hsotg *hsotg, u16 typereq,
                }
 
                hprt0 = readl(hsotg->regs + HPRT0);
-               dev_dbg(hsotg->dev, "  HPRT0: 0x%08x\n", hprt0);
+               dev_vdbg(hsotg->dev, "  HPRT0: 0x%08x\n", hprt0);
 
                if (hprt0 & HPRT0_CONNSTS)
                        port_status |= USB_PORT_STAT_CONNECTION;
@@ -1623,7 +1623,7 @@ static int dwc2_hcd_hub_control(struct dwc2_hsotg *hsotg, u16 typereq,
                        port_status |= USB_PORT_STAT_TEST;
                /* USB_PORT_FEAT_INDICATOR unsupported always 0 */
 
-               dev_dbg(hsotg->dev, "port_status=%08x\n", port_status);
+               dev_vdbg(hsotg->dev, "port_status=%08x\n", port_status);
                *(__le32 *)buf = cpu_to_le32(port_status);
                break;
 
@@ -2533,9 +2533,8 @@ static void _dwc2_hcd_endpoint_reset(struct usb_hcd *hcd,
 static irqreturn_t _dwc2_hcd_irq(struct usb_hcd *hcd)
 {
        struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
-       int retval = dwc2_hcd_intr(hsotg);
 
-       return IRQ_RETVAL(retval);
+       return dwc2_handle_hcd_intr(hsotg);
 }
 
 /*
@@ -2702,7 +2701,7 @@ EXPORT_SYMBOL_GPL(dwc2_set_all_params);
  * a negative error on failure.
  */
 int dwc2_hcd_init(struct dwc2_hsotg *hsotg, int irq,
-                 struct dwc2_core_params *params)
+                 const struct dwc2_core_params *params)
 {
        struct usb_hcd *hcd;
        struct dwc2_host_chan *channel;
@@ -2919,7 +2918,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg, int irq,
         * allocates the DMA buffer pool, registers the USB bus, requests the
         * IRQ line, and calls hcd_start method.
         */
-       retval = usb_add_hcd(hcd, irq, IRQF_SHARED | IRQF_DISABLED);
+       retval = usb_add_hcd(hcd, irq, IRQF_SHARED);
        if (retval < 0)
                goto error3;
 
index d071f1a05df154e12c2213e9c6a524d9f90718b5..cf6c055aec8d1384842c628543d143e18f974d7d 100644 (file)
@@ -448,10 +448,10 @@ static inline u8 dwc2_hcd_is_pipe_out(struct dwc2_hcd_pipe_info *pipe)
 }
 
 extern int dwc2_hcd_init(struct dwc2_hsotg *hsotg, int irq,
-                        struct dwc2_core_params *params);
+                        const struct dwc2_core_params *params);
 extern void dwc2_hcd_remove(struct dwc2_hsotg *hsotg);
 extern int dwc2_set_parameters(struct dwc2_hsotg *hsotg,
-                              struct dwc2_core_params *params);
+                              const struct dwc2_core_params *params);
 extern void dwc2_set_all_params(struct dwc2_core_params *params, int value);
 
 /* Transaction Execution Functions */
@@ -646,14 +646,14 @@ extern void dwc2_hcd_save_data_toggle(struct dwc2_hsotg *hsotg,
 /* HCD Core API */
 
 /**
- * dwc2_hcd_intr() - Called on every hardware interrupt
+ * dwc2_handle_hcd_intr() - Called on every hardware interrupt
  *
  * @hsotg: The DWC2 HCD
  *
- * Returns non zero if interrupt is handled
- * Return 0 if interrupt is not handled
+ * Returns IRQ_HANDLED if interrupt is handled
+ * Return IRQ_NONE if interrupt is not handled
  */
-extern int dwc2_hcd_intr(struct dwc2_hsotg *hsotg);
+extern irqreturn_t dwc2_handle_hcd_intr(struct dwc2_hsotg *hsotg);
 
 /**
  * dwc2_hcd_stop() - Halts the DWC_otg host mode operation
index e24062f0a49ebd87ec2d30b06f914309d21173c0..e75dccb3b80b5148f0e8883152d783d773988d00 100644 (file)
@@ -115,16 +115,13 @@ static void dwc2_sof_intr(struct dwc2_hsotg *hsotg)
 {
        struct list_head *qh_entry;
        struct dwc2_qh *qh;
-       u32 hfnum;
        enum dwc2_transaction_type tr_type;
 
 #ifdef DEBUG_SOF
        dev_vdbg(hsotg->dev, "--Start of Frame Interrupt--\n");
 #endif
 
-       hfnum = readl(hsotg->regs + HFNUM);
-       hsotg->frame_number = hfnum >> HFNUM_FRNUM_SHIFT &
-                           HFNUM_FRNUM_MASK >> HFNUM_FRNUM_SHIFT;
+       hsotg->frame_number = dwc2_hcd_get_frame_number(hsotg);
 
        dwc2_track_missed_sofs(hsotg);
 
@@ -244,6 +241,7 @@ static void dwc2_hprt0_enable(struct dwc2_hsotg *hsotg, u32 hprt0,
        u32 usbcfg;
        u32 prtspd;
        u32 hcfg;
+       u32 fslspclksel;
        u32 hfir;
 
        dev_vdbg(hsotg->dev, "%s(%p)\n", __func__, hsotg);
@@ -275,6 +273,7 @@ static void dwc2_hprt0_enable(struct dwc2_hsotg *hsotg, u32 hprt0,
                }
 
                hcfg = readl(hsotg->regs + HCFG);
+               fslspclksel = hcfg & HCFG_FSLSPCLKSEL_MASK;
 
                if (prtspd == HPRT0_SPD_LOW_SPEED &&
                    params->host_ls_low_power_phy_clk ==
@@ -282,8 +281,7 @@ static void dwc2_hprt0_enable(struct dwc2_hsotg *hsotg, u32 hprt0,
                        /* 6 MHZ */
                        dev_vdbg(hsotg->dev,
                                 "FS_PHY programming HCFG to 6 MHz\n");
-                       if ((hcfg & HCFG_FSLSPCLKSEL_MASK) !=
-                           HCFG_FSLSPCLKSEL_6_MHZ) {
+                       if (fslspclksel != HCFG_FSLSPCLKSEL_6_MHZ) {
                                hcfg &= ~HCFG_FSLSPCLKSEL_MASK;
                                hcfg |= HCFG_FSLSPCLKSEL_6_MHZ;
                                writel(hcfg, hsotg->regs + HCFG);
@@ -293,8 +291,7 @@ static void dwc2_hprt0_enable(struct dwc2_hsotg *hsotg, u32 hprt0,
                        /* 48 MHZ */
                        dev_vdbg(hsotg->dev,
                                 "FS_PHY programming HCFG to 48 MHz\n");
-                       if ((hcfg & HCFG_FSLSPCLKSEL_MASK) !=
-                           HCFG_FSLSPCLKSEL_48_MHZ) {
+                       if (fslspclksel != HCFG_FSLSPCLKSEL_48_MHZ) {
                                hcfg &= ~HCFG_FSLSPCLKSEL_MASK;
                                hcfg |= HCFG_FSLSPCLKSEL_48_MHZ;
                                writel(hcfg, hsotg->regs + HCFG);
@@ -2060,14 +2057,14 @@ static void dwc2_hc_intr(struct dwc2_hsotg *hsotg)
 }
 
 /* This function handles interrupts for the HCD */
-int dwc2_hcd_intr(struct dwc2_hsotg *hsotg)
+irqreturn_t dwc2_handle_hcd_intr(struct dwc2_hsotg *hsotg)
 {
        u32 gintsts, dbg_gintsts;
-       int retval = 0;
+       irqreturn_t retval = IRQ_NONE;
 
        if (dwc2_check_core_status(hsotg) < 0) {
                dev_warn(hsotg->dev, "Controller is disconnected\n");
-               return 0;
+               return retval;
        }
 
        spin_lock(&hsotg->lock);
@@ -2077,10 +2074,10 @@ int dwc2_hcd_intr(struct dwc2_hsotg *hsotg)
                gintsts = dwc2_read_core_intr(hsotg);
                if (!gintsts) {
                        spin_unlock(&hsotg->lock);
-                       return 0;
+                       return retval;
                }
 
-               retval = 1;
+               retval = IRQ_HANDLED;
 
                dbg_gintsts = gintsts;
 #ifndef DEBUG_SOF
@@ -2102,9 +2099,6 @@ int dwc2_hcd_intr(struct dwc2_hsotg *hsotg)
                        dwc2_rx_fifo_level_intr(hsotg);
                if (gintsts & GINTSTS_NPTXFEMP)
                        dwc2_np_tx_fifo_empty_intr(hsotg);
-               if (gintsts & GINTSTS_I2CINT)
-                       /* Todo: Implement i2cintr handler */
-                       writel(GINTSTS_I2CINT, hsotg->regs + GINTSTS);
                if (gintsts & GINTSTS_PRTINT)
                        dwc2_port_intr(hsotg);
                if (gintsts & GINTSTS_HCHINT)
index 69c65eb8683fb01db49992a2d27936287570e062..3ca54d6782fdc4a69af5a44e025c4cb71d915db8 100644 (file)
@@ -59,7 +59,7 @@
 
 static const char dwc2_driver_name[] = "dwc2";
 
-static struct dwc2_core_params dwc2_module_params = {
+static const struct dwc2_core_params dwc2_module_params = {
        .otg_cap                        = -1,
        .otg_ver                        = -1,
        .dma_enable                     = -1,
@@ -101,8 +101,6 @@ static void dwc2_driver_remove(struct pci_dev *dev)
 {
        struct dwc2_hsotg *hsotg = pci_get_drvdata(dev);
 
-       dev_dbg(&dev->dev, "%s(%p)\n", __func__, dev);
-
        dwc2_hcd_remove(hsotg);
        pci_disable_device(dev);
 }
@@ -125,18 +123,14 @@ static int dwc2_driver_probe(struct pci_dev *dev,
        struct dwc2_hsotg *hsotg;
        int retval;
 
-       dev_dbg(&dev->dev, "%s(%p)\n", __func__, dev);
-
        hsotg = devm_kzalloc(&dev->dev, sizeof(*hsotg), GFP_KERNEL);
        if (!hsotg)
                return -ENOMEM;
 
-       pci_set_power_state(dev, PCI_D0);
-
        hsotg->dev = &dev->dev;
-       hsotg->regs = devm_request_and_ioremap(&dev->dev, &dev->resource[0]);
-       if (!hsotg->regs)
-               return -ENOMEM;
+       hsotg->regs = devm_ioremap_resource(&dev->dev, &dev->resource[0]);
+       if (IS_ERR(hsotg->regs))
+               return PTR_ERR(hsotg->regs);
 
        dev_dbg(&dev->dev, "mapped PA %08lx to VA %p\n",
                (unsigned long)pci_resource_start(dev, 0), hsotg->regs);
@@ -153,7 +147,6 @@ static int dwc2_driver_probe(struct pci_dev *dev,
        }
 
        pci_set_drvdata(dev, hsotg);
-       dev_dbg(&dev->dev, "hsotg=%p\n", hsotg);
 
        return retval;
 }
@@ -162,6 +155,10 @@ static DEFINE_PCI_DEVICE_TABLE(dwc2_pci_ids) = {
        {
                PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, PCI_PRODUCT_ID_HAPS_HSOTG),
        },
+       {
+               PCI_DEVICE(PCI_VENDOR_ID_STMICRO,
+                          PCI_DEVICE_ID_STMICRO_USB_OTG),
+       },
        { /* end: all zeroes */ }
 };
 MODULE_DEVICE_TABLE(pci, dwc2_pci_ids);
index 5882139d49afc3d204f175d00c81e97c0794220f..9597e9523cac4d7459e1869f69a49f1c80a73b24 100644 (file)
@@ -267,13 +267,13 @@ struct oslec_state *oslec_create(int len, int adaption_mode)
                goto error_snap;
 
        ec->cond_met = 0;
-       ec->Pstates = 0;
-       ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
-       ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
+       ec->pstates = 0;
+       ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
+       ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
        ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
-       ec->Lbgn = ec->Lbgn_acc = 0;
-       ec->Lbgn_upper = 200;
-       ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
+       ec->lbgn = ec->lbgn_acc = 0;
+       ec->lbgn_upper = 200;
+       ec->lbgn_upper_acc = ec->lbgn_upper << 13;
 
        return ec;
 
@@ -314,13 +314,13 @@ void oslec_flush(struct oslec_state *ec)
 {
        int i;
 
-       ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
-       ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
+       ec->ltxacc = ec->lrxacc = ec->lcleanacc = ec->lclean_bgacc = 0;
+       ec->ltx = ec->lrx = ec->lclean = ec->lclean_bg = 0;
        ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
 
-       ec->Lbgn = ec->Lbgn_acc = 0;
-       ec->Lbgn_upper = 200;
-       ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
+       ec->lbgn = ec->lbgn_acc = 0;
+       ec->lbgn_upper = 200;
+       ec->lbgn_upper_acc = ec->lbgn_upper << 13;
 
        ec->nonupdate_dwell = 0;
 
@@ -332,7 +332,7 @@ void oslec_flush(struct oslec_state *ec)
                memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
 
        ec->curr_pos = ec->taps - 1;
-       ec->Pstates = 0;
+       ec->pstates = 0;
 }
 EXPORT_SYMBOL_GPL(oslec_flush);
 
@@ -418,33 +418,33 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
                new = (int)tx * (int)tx;
                old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
                    (int)ec->fir_state.history[ec->fir_state.curr_pos];
-               ec->Pstates +=
+               ec->pstates +=
                    ((new - old) + (1 << (ec->log2taps - 1))) >> ec->log2taps;
-               if (ec->Pstates < 0)
-                       ec->Pstates = 0;
+               if (ec->pstates < 0)
+                       ec->pstates = 0;
        }
 
        /* Calculate short term average levels using simple single pole IIRs */
 
-       ec->Ltxacc += abs(tx) - ec->Ltx;
-       ec->Ltx = (ec->Ltxacc + (1 << 4)) >> 5;
-       ec->Lrxacc += abs(rx) - ec->Lrx;
-       ec->Lrx = (ec->Lrxacc + (1 << 4)) >> 5;
+       ec->ltxacc += abs(tx) - ec->ltx;
+       ec->ltx = (ec->ltxacc + (1 << 4)) >> 5;
+       ec->lrxacc += abs(rx) - ec->lrx;
+       ec->lrx = (ec->lrxacc + (1 << 4)) >> 5;
 
        /* Foreground filter */
 
        ec->fir_state.coeffs = ec->fir_taps16[0];
        echo_value = fir16(&ec->fir_state, tx);
        ec->clean = rx - echo_value;
-       ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
-       ec->Lclean = (ec->Lcleanacc + (1 << 4)) >> 5;
+       ec->lcleanacc += abs(ec->clean) - ec->lclean;
+       ec->lclean = (ec->lcleanacc + (1 << 4)) >> 5;
 
        /* Background filter */
 
        echo_value = fir16(&ec->fir_state_bg, tx);
        clean_bg = rx - echo_value;
-       ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
-       ec->Lclean_bg = (ec->Lclean_bgacc + (1 << 4)) >> 5;
+       ec->lclean_bgacc += abs(clean_bg) - ec->lclean_bg;
+       ec->lclean_bg = (ec->lclean_bgacc + (1 << 4)) >> 5;
 
        /* Background Filter adaption */
 
@@ -455,7 +455,7 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
        ec->factor = 0;
        ec->shift = 0;
        if ((ec->nonupdate_dwell == 0)) {
-               int P, logP, shift;
+               int p, logp, shift;
 
                /* Determine:
 
@@ -490,9 +490,9 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
                   for a divide versus a top_bit() implementation.
                 */
 
-               P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
-               logP = top_bit(P) + ec->log2taps;
-               shift = 30 - 2 - logP;
+               p = MIN_TX_POWER_FOR_ADAPTION + ec->pstates;
+               logp = top_bit(p) + ec->log2taps;
+               shift = 30 - 2 - logp;
                ec->shift = shift;
 
                lms_adapt_bg(ec, clean_bg, shift);
@@ -502,7 +502,7 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
           near end speech */
 
        ec->adapt = 0;
-       if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
+       if ((ec->lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->lrx > ec->ltx))
                ec->nonupdate_dwell = DTD_HANGOVER;
        if (ec->nonupdate_dwell)
                ec->nonupdate_dwell--;
@@ -515,9 +515,9 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
        if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
            (ec->nonupdate_dwell == 0) &&
            /* (ec->Lclean_bg < 0.875*ec->Lclean) */
-           (8 * ec->Lclean_bg < 7 * ec->Lclean) &&
+           (8 * ec->lclean_bg < 7 * ec->lclean) &&
            /* (ec->Lclean_bg < 0.125*ec->Ltx) */
-           (8 * ec->Lclean_bg < ec->Ltx)) {
+           (8 * ec->lclean_bg < ec->ltx)) {
                if (ec->cond_met == 6) {
                        /*
                         * BG filter has had better results for 6 consecutive
@@ -541,14 +541,14 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
                 * non-linearity in the channel.".
                 */
 
-               if ((16 * ec->Lclean < ec->Ltx)) {
+               if ((16 * ec->lclean < ec->ltx)) {
                        /*
                         * Our e/c has improved echo by at least 24 dB (each
                         * factor of 2 is 6dB, so 2*2*2*2=16 is the same as
                         * 6+6+6+6=24dB)
                         */
                        if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
-                               ec->cng_level = ec->Lbgn;
+                               ec->cng_level = ec->lbgn;
 
                                /*
                                 * Very elementary comfort noise generation.
@@ -571,10 +571,10 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
 
                        } else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
                                /* This sounds much better than CNG */
-                               if (ec->clean_nlp > ec->Lbgn)
-                                       ec->clean_nlp = ec->Lbgn;
-                               if (ec->clean_nlp < -ec->Lbgn)
-                                       ec->clean_nlp = -ec->Lbgn;
+                               if (ec->clean_nlp > ec->lbgn)
+                                       ec->clean_nlp = ec->lbgn;
+                               if (ec->clean_nlp < -ec->lbgn)
+                                       ec->clean_nlp = -ec->lbgn;
                        } else {
                                /*
                                 * just mute the residual, doesn't sound very
@@ -593,9 +593,9 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
                         * level signals like near end speech.  When combined
                         * with CNG or especially CLIP seems to work OK.
                         */
-                       if (ec->Lclean < 40) {
-                               ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
-                               ec->Lbgn = (ec->Lbgn_acc + (1 << 11)) >> 12;
+                       if (ec->lclean < 40) {
+                               ec->lbgn_acc += abs(ec->clean) - ec->lbgn;
+                               ec->lbgn = (ec->lbgn_acc + (1 << 11)) >> 12;
                        }
                }
        }
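
The renamed ltx/lrx/lclean trackers above are fixed-point single-pole IIR filters: the accumulator absorbs |x| minus the current level each sample, and the level is the accumulator divided by 32 with rounding, so it converges on the short-term mean magnitude with a time constant of roughly 32 samples. A standalone sketch of the same arithmetic:

    #include <stdio.h>
    #include <stdlib.h>

    /* One-pole IIR level tracker, matching the ltx/lrx update above:
     * acc += |x| - level;  level = round(acc / 32). */
    struct level {
            int acc;
            int level;
    };

    static void level_update(struct level *l, int x)
    {
            l->acc += abs(x) - l->level;
            l->level = (l->acc + (1 << 4)) >> 5;
    }

    int main(void)
    {
            struct level l = { 0, 0 };
            int i;

            for (i = 0; i < 200; i++)
                    level_update(&l, 1000);
            printf("level ~= %d\n", l.level);  /* close to 1000 */
            return 0;
    }
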
index 32ca9dedeca4b60828867dc88d470de259f5a40b..9b08c63e63696d4d9f7fa2de0e04a3db9b712fa0 100644 (file)
@@ -139,24 +139,24 @@ struct oslec_state {
        int adaption_mode;
 
        int cond_met;
-       int32_t Pstates;
+       int32_t pstates;
        int16_t adapt;
        int32_t factor;
        int16_t shift;
 
        /* Average levels and averaging filter states */
-       int Ltxacc;
-       int Lrxacc;
-       int Lcleanacc;
-       int Lclean_bgacc;
-       int Ltx;
-       int Lrx;
-       int Lclean;
-       int Lclean_bg;
-       int Lbgn;
-       int Lbgn_acc;
-       int Lbgn_upper;
-       int Lbgn_upper_acc;
+       int ltxacc;
+       int lrxacc;
+       int lcleanacc;
+       int lclean_bgacc;
+       int ltx;
+       int lrx;
+       int lclean;
+       int lclean_bg;
+       int lbgn;
+       int lbgn_acc;
+       int lbgn_upper;
+       int lbgn_upper_acc;
 
        /* foreground and background filter states */
        struct fir16_state_t fir_state;
index ea9362d7e589b14c2b236bbe10ef688409ff965c..0957eb08cdb524d64acd2f2edee828bd28bc387a 100644 (file)
  * raw interrupt reports.
  */
 
-/* Note: this currently uses a dumb ringbuffer for reads and writes.
+/*
+ * Note: this currently uses a dumb ringbuffer for reads and writes.
  * A more optimal driver would cache and kill off outstanding urbs that are
  * now invalid, and ignore ones that already were in the queue but valid
  * as we only have 30 commands for the alphatrack. In particular this is
  * key for getting lights to flash in time as otherwise many commands
  * can be buffered up before the light change makes it to the interface.
-*/
+ */
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -100,7 +101,8 @@ static int debug = ALPHATRACK_DEBUG;
 module_param(debug, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(debug, "Debug enabled or not");
 
-/* All interrupt in transfers are collected in a ring buffer to
+/*
+ * All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
  */
 
@@ -109,8 +111,7 @@ static int ring_buffer_size = RING_BUFFER_SIZE;
 module_param(ring_buffer_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size");
 
-/* The write_buffer can one day contain more than one interrupt out transfer.
- */
+/* The write_buffer can one day contain more than one interrupt out transfer.*/
 
 static int write_buffer_size = WRITE_BUFFER_SIZE;
 module_param(write_buffer_size, int, S_IRUGO);
@@ -199,9 +200,7 @@ static void usb_alphatrack_abort_transfers(struct usb_alphatrack *dev)
                        usb_kill_urb(dev->interrupt_out_urb);
 }
 
-/**
- *     usb_alphatrack_delete
- */
+/** usb_alphatrack_delete */
 static void usb_alphatrack_delete(struct usb_alphatrack *dev)
 {
        usb_alphatrack_abort_transfers(dev);
@@ -213,9 +212,7 @@ static void usb_alphatrack_delete(struct usb_alphatrack *dev)
        kfree(dev);             /* fixme oldi_buffer */
 }
 
-/**
- *     usb_alphatrack_interrupt_in_callback
- */
+/** usb_alphatrack_interrupt_in_callback */
 
 static void usb_alphatrack_interrupt_in_callback(struct urb *urb)
 {
@@ -296,9 +293,7 @@ exit:
        wake_up_interruptible(&dev->read_wait);
 }
 
-/**
- *     usb_alphatrack_interrupt_out_callback
- */
+/** usb_alphatrack_interrupt_out_callback */
 static void usb_alphatrack_interrupt_out_callback(struct urb *urb)
 {
        struct usb_alphatrack *dev = urb->context;
@@ -315,9 +310,7 @@ static void usb_alphatrack_interrupt_out_callback(struct urb *urb)
        wake_up_interruptible(&dev->write_wait);
 }
 
-/**
- *     usb_alphatrack_open
- */
+/** usb_alphatrack_open */
 static int usb_alphatrack_open(struct inode *inode, struct file *file)
 {
        struct usb_alphatrack *dev;
@@ -398,9 +391,7 @@ unlock_disconnect_exit:
        return retval;
 }
 
-/**
- *     usb_alphatrack_release
- */
+/** usb_alphatrack_release */
 static int usb_alphatrack_release(struct inode *inode, struct file *file)
 {
        struct usb_alphatrack *dev;
@@ -447,9 +438,7 @@ exit:
        return retval;
 }
 
-/**
- *     usb_alphatrack_poll
- */
+/** usb_alphatrack_poll */
 static unsigned int usb_alphatrack_poll(struct file *file, poll_table *wait)
 {
        struct usb_alphatrack *dev;
@@ -468,9 +457,7 @@ static unsigned int usb_alphatrack_poll(struct file *file, poll_table *wait)
        return mask;
 }
 
-/**
- *     usb_alphatrack_read
- */
+/** usb_alphatrack_read */
 static ssize_t usb_alphatrack_read(struct file *file, char __user *buffer,
                                   size_t count, loff_t *ppos)
 {
@@ -539,9 +526,7 @@ exit:
        return retval;
 }
 
-/**
- *     usb_alphatrack_write
- */
+/** usb_alphatrack_write */
 static ssize_t usb_alphatrack_write(struct file *file,
                                    const char __user *buffer, size_t count,
                                    loff_t *ppos)
@@ -718,8 +703,10 @@ static int usb_alphatrack_probe(struct usb_interface *intf,
 
        true_size = min(ring_buffer_size, RING_BUFFER_SIZE);
 
-       /* FIXME - there are more usb_alloc routines for dma correctness.
-          Needed? */
+       /*
+        * FIXME - there are more usb_alloc routines for dma correctness.
+        * Needed?
+        */
        dev->ring_buffer = kmalloc_array(true_size,
                                         sizeof(struct alphatrack_icmd),
                                         GFP_KERNEL);
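
A note on the allocation above: kmalloc_array() is the overflow-safe form of
kmalloc(n * size, ...). Here true_size is already clamped by min(), but the
helper still documents the intent. A minimal sketch of the difference
(illustrative, not part of this patch):

	/* open-coded: n * size can wrap on 32-bit and under-allocate */
	buf = kmalloc(n * sizeof(struct alphatrack_icmd), GFP_KERNEL);

	/* helper: returns NULL instead of wrapping on overflow */
	buf = kmalloc_array(n, sizeof(struct alphatrack_icmd), GFP_KERNEL);
	if (!buf)
		return -ENOMEM;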
index 10a797263594fb4c0ad87a7116004595f3b6c091..418c6053c027ace6c357b41266f0288696d01ce7 100644 (file)
@@ -6,7 +6,8 @@ struct alphatrack_ocmd {
        unsigned char cmd[8];
 };
 
-/* These are unused by the present driver but provide documentation for the
+/*
+ * These are unused by the present driver but provide documentation for the
  * userspace API.
  */
 enum LightID {
@@ -58,7 +59,8 @@ enum LightID {
 #define BUTTONMASK_PRESS2      0x00008010
 #define BUTTONMASK_PRESS3      0x00002020
 
-/* last 3 bytes are the slider position
+/*
+ * last 3 bytes are the slider position
  * 40 is the actual slider moving, the most sig bits, and 3 lsb
  */
 
index 04b5e66d986162250d377de896d2202b9fbeb49f..374dd1211df283dbaa2b97e16f98f862b19b530f 100644 (file)
@@ -86,7 +86,8 @@ static int debug = TRANZPORT_DEBUG;
 module_param(debug, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(debug, "Debug enabled or not");
 
-/* All interrupt in transfers are collected in a ring buffer to
+/*
+ * All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
  */
 
@@ -95,7 +96,8 @@ static int ring_buffer_size = RING_BUFFER_SIZE;
 module_param(ring_buffer_size, int, S_IRUGO);
 MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size in reports");
 
-/* The write_buffer can one day contain more than one interrupt out transfer.
+/*
+ * The write_buffer can one day contain more than one interrupt out transfer.
  */
 static int write_buffer_size = WRITE_BUFFER_SIZE;
 module_param(write_buffer_size, int, S_IRUGO);
@@ -565,9 +567,9 @@ static ssize_t usb_tranzport_read(struct file *file, char __user *buffer,
                        newwheel = (*dev->ring_buffer)[next_tail].cmd[6];
                        oldwheel = (*dev->ring_buffer)[dev->ring_tail].cmd[6];
                        /* if both are wheel events, and
-                          no buttons have changes (FIXME, do I have to check?),
-                          and we are the same sign, we can compress +- 7F
-                       */
+                        * no buttons have changed (FIXME, do I have to check?),
+                        * and we are the same sign, we can compress +- 7F
+                        */
                        dbg_info(&dev->intf->dev,
                                "%s: trying to compress: "
                                "%02x%02x%02x%02x%02x%02x%02x%02x\n",
@@ -842,8 +844,10 @@ static int usb_tranzport_probe(struct usb_interface *intf,
                ring_buffer_size = RING_BUFFER_SIZE;
        true_size = min(ring_buffer_size, RING_BUFFER_SIZE);
 
-       /* FIXME - there are more usb_alloc routines for dma correctness.
-          Needed? */
+       /*
+        * FIXME - there are more usb_alloc routines for dma correctness.
+        * Needed?
+        */
 
        dev->ring_buffer =
            kmalloc((true_size * sizeof(struct tranzport_cmd)) + 8, GFP_KERNEL);
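
The reflowed comment in the read path above describes jog-wheel event
compression. A sketch of the invariant it relies on, assuming cmd[6] holds a
signed 8-bit wheel delta (an assumption suggested, but not proven, by the
newwheel/oldwheel code):

	signed char prev = oldwheel;	/* delta already queued */
	signed char next = newwheel;	/* delta just received  */

	/* same sign and the merged delta still fits in [-0x7F, 0x7F] */
	if ((prev < 0) == (next < 0) && abs(prev + next) <= 0x7F)
		prev += next;	/* one report now carries both movements */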
index 47cc365c630b2dab8d327c4ef9873a73d839d53e..6311b2ff58161e0433e7ff1065c9b63bd1ca533f 100644 (file)
@@ -132,16 +132,16 @@ void card_bootload(struct net_device *dev)
        pdata = (u32 *) bootimage;
        size = sizeof(bootimage);
 
-       // check for odd word
-       if (size & 0x0003) {
+       /* check for odd word */
+       if (size & 0x0003)
                size += 4;
-       }
-       // Provide mutual exclusive access while reading ASIC registers.
+
+       /* Provide mutually exclusive access while reading ASIC registers. */
        spin_lock_irqsave(&info->dpram_lock, flags);
 
-       // need to set i/o base address initially and hardware will autoincrement
+       /* need to set i/o base address initially and hardware will autoincrement */
        ft1000_write_reg(dev, FT1000_REG_DPRAM_ADDR, FT1000_DPRAM_BASE);
-       // write bytes
+       /* write bytes */
        for (i = 0; i < (size >> 2); i++) {
                templong = *pdata++;
                outl(templong, dev->base_addr + FT1000_REG_MAG_DPDATA);
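
The rounding above works because the loop copies size >> 2 whole 32-bit
words: for an unaligned size such as 6, size += 4 yields 10, and 10 >> 2 is
still 2 words (8 bytes), the same coverage the kernel helper would give. An
equivalent, clearer form (a sketch, not part of this patch):

	size = ALIGN(size, 4);	/* ALIGN(6, 4) == 8; 8 >> 2 == 2 words */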
@@ -345,11 +345,10 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
 
                        handshake = get_handshake(dev, HANDSHAKE_DSP_BL_READY);
 
-                       if (handshake == HANDSHAKE_DSP_BL_READY) {
+                       if (handshake == HANDSHAKE_DSP_BL_READY)
                                put_handshake(dev, HANDSHAKE_DRIVER_READY);
-                       } else {
+                       else
                                Status = FAILURE;
-                       }
 
                        uiState = STATE_BOOT_DWNLD;
 
@@ -391,7 +390,7 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                                Status = FAILURE;
                                                break;
                                        }
-                                       // Provide mutual exclusive access while reading ASIC registers.
+                                       /* Provide mutually exclusive access while reading ASIC registers. */
                                        spin_lock_irqsave(&info->dpram_lock,
                                                          flags);
                                        /*
@@ -505,15 +504,15 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                        break;
 
                                case REQUEST_MAILBOX_DATA:
-                                       // Convert length from byte count to word count. Make sure we round up.
+                                       /* Convert length from byte count to word count. Make sure we round up. */
                                        word_length =
                                                (long)(info->DSPInfoBlklen + 1) / 2;
                                        put_request_value(dev, word_length);
                                        pMailBoxData =
-                                               (struct drv_msg *) & info->DSPInfoBlk[0];
+                                               (struct drv_msg *) &info->DSPInfoBlk[0];
                                        pUsData =
-                                               (u16 *) & pMailBoxData->data[0];
-                                       // Provide mutual exclusive access while reading ASIC registers.
+                                               (u16 *) &pMailBoxData->data[0];
+                                       /* Provide mutually exclusive access while reading ASIC registers. */
                                        spin_lock_irqsave(&info->dpram_lock,
                                                          flags);
                                        if (file_version == 5) {
@@ -538,9 +537,9 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                                outw(DWNLD_MAG_PS_HDR_LOC,
                                                         dev->base_addr +
                                                         FT1000_REG_DPRAM_ADDR);
-                                               if (word_length & 0x01) {
+                                               if (word_length & 0x01)
                                                        word_length++;
-                                               }
+
                                                word_length = word_length / 2;
 
                                                for (; word_length > 0; word_length--) {        /* In words */
@@ -565,7 +564,7 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                                (u16 *) ((long)pFileStart +
                                                        pFileHdr5->
                                                        version_data_offset);
-                                       // Provide mutual exclusive access while reading ASIC registers.
+                                       /* Provide mutually exclusive access while reading ASIC registers. */
                                        spin_lock_irqsave(&info->dpram_lock,
                                                          flags);
                                        /*
@@ -692,7 +691,7 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
 
                        if (pHdr->portdest == 0x80      /* DspOAM */
                                && (pHdr->portsrc == 0x00       /* Driver */
-                               || pHdr->portsrc == 0x10 /* FMM */ )) {
+                               || pHdr->portsrc == 0x10 /* FMM */)) {
                                uiState = STATE_SECTION_PROV;
                        } else {
                                DEBUG(1,
@@ -711,13 +710,13 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                        pHdr = (struct pseudo_hdr *) pUcFile;
 
                        if (pHdr->checksum == hdr_checksum(pHdr)) {
-                               if (pHdr->portdest != 0x80 /* Dsp OAM */ ) {
+                               if (pHdr->portdest != 0x80 /* Dsp OAM */) {
                                        uiState = STATE_DONE_PROV;
                                        break;
                                }
                                usHdrLength = ntohs(pHdr->length);      /* Byte length for PROV records */
 
-                               // Get buffer for provisioning data
+                               /* Get buffer for provisioning data */
                                pbuffer =
                                        kmalloc((usHdrLength + sizeof(struct pseudo_hdr)),
                                                GFP_ATOMIC);
@@ -725,7 +724,7 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                        memcpy(pbuffer, (void *)pUcFile,
                                                   (u32) (usHdrLength +
                                                           sizeof(struct pseudo_hdr)));
-                                       // link provisioning data
+                                       /* link provisioning data */
                                        pprov_record =
                                                kmalloc(sizeof(struct prov_record),
                                                        GFP_ATOMIC);
@@ -735,7 +734,7 @@ int card_download(struct net_device *dev, const u8 *pFileStart,
                                                list_add_tail(&pprov_record->
                                                                  list,
                                                                  &info->prov_list);
-                                               // Move to next entry if available
+                                               /* Move to next entry if available */
                                                pUcFile =
                                                        (u8 *) ((unsigned long) pUcFile +
                                                                   (unsigned long) ((usHdrLength + 1) & 0xFFFFFFFE) + sizeof(struct pseudo_hdr));
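
The pointer arithmetic closing this hunk advances to the next record:
(usHdrLength + 1) & 0xFFFFFFFE rounds an odd provisioning-record length up to
the next even byte, keeping each pseudo header 16-bit aligned. An equivalent
form using the kernel helper (a sketch, not part of this patch):

	/* usHdrLength = 5 -> 6, 8 -> 8 */
	pUcFile += ALIGN(usHdrLength, 2) + sizeof(struct pseudo_hdr);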
index 3251d2e073b5ebc3f5cbac27469fff23a99fbd59..68a55ce692008dd34d25765cb7b2071a9b6c17fb 100644 (file)
@@ -1,29 +1,31 @@
-//---------------------------------------------------------------------------
-// FT1000 driver for Flarion Flash OFDM NIC Device
-//
-// Copyright (C) 2006 Flarion Technologies, All rights reserved.
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 2 of the License, or (at your option) any
-// later version. This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-// more details. You should have received a copy of the GNU General Public
-// License along with this program; if not, write to the
-// Free Software Foundation, Inc., 59 Temple Place -
-// Suite 330, Boston, MA 02111-1307, USA.
-//---------------------------------------------------------------------------
-//
-// File:         ft1000_chdev.c
-//
-// Description:  Custom character device dispatch routines.
-//
-// History:
-// 8/29/02    Whc                Ported to Linux.
-// 6/05/06    Whc                Porting to Linux 2.6.9
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* FT1000 driver for Flarion Flash OFDM NIC Device
+*
+* Copyright (C) 2006 Flarion Technologies, All rights reserved.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License as published by the Free
+* Software Foundation; either version 2 of the License, or (at your option) any
+* later version. This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+* or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+* more details. You should have received a copy of the GNU General Public
+* License along with this program; if not, write to the
+* Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+*---------------------------------------------------------------------------
+*
+* File:         ft1000_chdev.c
+*
+* Description:  Custom character device dispatch routines.
+*
+* History:
+* 8/29/02    Whc                Ported to Linux.
+* 6/05/06    Whc                Porting to Linux 2.6.9
+*
+*---------------------------------------------------------------------------
+*/
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 
 static int ft1000_flarion_cnt = 0;
 
-static int ft1000_open (struct inode *inode, struct file *file);
+static int ft1000_open(struct inode *inode, struct file *file);
 static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait);
 static long ft1000_ioctl(struct file *file, unsigned int command,
                            unsigned long argument);
-static int ft1000_release (struct inode *inode, struct file *file);
+static int ft1000_release(struct inode *inode, struct file *file);
 
-// List to free receive command buffer pool
+/* List to free receive command buffer pool */
 struct list_head freercvpool;
 
-// lock to arbitrate free buffer list for receive command data
+/* lock to arbitrate free buffer list for receive command data */
 spinlock_t free_buff_lock;
 
 int numofmsgbuf = 0;
 
-//
-// Table of entry-point routines for char device
-//
-static const struct file_operations ft1000fops =
-{
+/*
+* Table of entry-point routines for char device
+*/
+static const struct file_operations ft1000fops = {
        .unlocked_ioctl = ft1000_ioctl,
        .poll           = ft1000_poll_dev,
        .open           = ft1000_open,
@@ -64,34 +65,35 @@ static const struct file_operations ft1000fops =
        .llseek         = no_llseek,
 };
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_get_buffer
-//
-// Parameters:
-//
-// Returns:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_get_buffer
+*
+* Parameters:
+*
+* Returns:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 struct dpram_blk *ft1000_get_buffer(struct list_head *bufflist)
 {
     unsigned long flags;
        struct dpram_blk *ptr;
 
     spin_lock_irqsave(&free_buff_lock, flags);
-    // Check if buffer is available
-    if ( list_empty(bufflist) ) {
+    /* Check if buffer is available */
+    if (list_empty(bufflist)) {
         DEBUG("ft1000_get_buffer:  No more buffer - %d\n", numofmsgbuf);
         ptr = NULL;
-    }
-    else {
+    } else {
         numofmsgbuf--;
        ptr = list_entry(bufflist->next, struct dpram_blk, list);
         list_del(&ptr->list);
-        //DEBUG("ft1000_get_buffer: number of free msg buffers = %d\n", numofmsgbuf);
+        /* DEBUG("ft1000_get_buffer: number of free msg buffers = %d\n", numofmsgbuf); */
     }
     spin_unlock_irqrestore(&free_buff_lock, flags);
 
@@ -101,42 +103,46 @@ struct dpram_blk *ft1000_get_buffer(struct list_head *bufflist)
 
 
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_free_buffer
-//
-// Parameters:
-//
-// Returns:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_free_buffer
+*
+* Parameters:
+*
+* Returns:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 void ft1000_free_buffer(struct dpram_blk *pdpram_blk, struct list_head *plist)
 {
     unsigned long flags;
 
     spin_lock_irqsave(&free_buff_lock, flags);
-    // Put memory back to list
+    /* Put memory back to list */
     list_add_tail(&pdpram_blk->list, plist);
     numofmsgbuf++;
-    //DEBUG("ft1000_free_buffer: number of free msg buffers = %d\n", numofmsgbuf);
+    /* DEBUG("ft1000_free_buffer: number of free msg buffers = %d\n", numofmsgbuf); */
     spin_unlock_irqrestore(&free_buff_lock, flags);
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_CreateDevice
-//
-// Parameters:  dev - pointer to adapter object
-//
-// Returns:     0 if successful
-//
-// Description: Creates a private char device.
-//
-// Notes:       Only called by init_module().
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_CreateDevice
+*
+* Parameters:  dev - pointer to adapter object
+*
+* Returns:     0 if successful
+*
+* Description: Creates a private char device.
+*
+* Notes:       Only called by init_module().
+*
+*---------------------------------------------------------------------------
+*/
 int ft1000_create_dev(struct ft1000_usb *dev)
 {
     int result;
@@ -144,20 +150,19 @@ int ft1000_create_dev(struct ft1000_usb *dev)
        struct dentry *dir, *file;
        struct ft1000_debug_dirs *tmp;
 
-    // make a new device name
+    /* make a new device name */
     sprintf(dev->DeviceName, "%s%d", "FT1000_", dev->CardNumber);
 
     DEBUG("%s: number of instance = %d\n", __func__, ft1000_flarion_cnt);
     DEBUG("DeviceCreated = %x\n", dev->DeviceCreated);
 
-    if (dev->DeviceCreated)
-    {
+    if (dev->DeviceCreated) {
        DEBUG("%s: \"%s\" already registered\n", __func__, dev->DeviceName);
        return -EIO;
     }
 
 
-    // register the device
+    /* register the device */
     DEBUG("%s: \"%s\" debugfs device registration\n", __func__, dev->DeviceName);
 
        tmp = kmalloc(sizeof(struct ft1000_debug_dirs), GFP_KERNEL);
@@ -186,7 +191,7 @@ int ft1000_create_dev(struct ft1000_usb *dev)
 
     DEBUG("%s: registered debugfs directory \"%s\"\n", __func__, dev->DeviceName);
 
-    // initialize application information
+    /* initialize application information */
     dev->appcnt = 0;
     for (i=0; i<MAX_NUM_APP; i++) {
         dev->app_info[i].nTxMsg = 0;
@@ -198,7 +203,7 @@ int ft1000_create_dev(struct ft1000_usb *dev)
         dev->app_info[i].DspBCMsgFlag = 0;
         dev->app_info[i].NumOfMsg = 0;
         init_waitqueue_head(&dev->app_info[i].wait_dpram_msg);
-        INIT_LIST_HEAD (&dev->app_info[i].app_sqlist);
+        INIT_LIST_HEAD(&dev->app_info[i].app_sqlist);
     }
 
     dev->DeviceCreated = TRUE;
@@ -214,16 +219,18 @@ fail:
        return result;
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_DestroyDeviceDEBUG
-//
-// Parameters:  dev - pointer to adapter object
-//
-// Description: Destroys a private char device.
-//
-// Notes:       Only called by cleanup_module().
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_DestroyDeviceDEBUG
+*
+* Parameters:  dev - pointer to adapter object
+*
+* Description: Destroys a private char device.
+*
+* Notes:       Only called by cleanup_module().
+*
+*---------------------------------------------------------------------------
+*/
 void ft1000_destroy_dev(struct net_device *netdev)
 {
        struct ft1000_info *info = netdev_priv(netdev);
@@ -238,8 +245,7 @@ void ft1000_destroy_dev(struct net_device *netdev)
 
 
 
-    if (dev->DeviceCreated)
-       {
+    if (dev->DeviceCreated) {
         ft1000_flarion_cnt--;
                list_for_each_safe(pos, q, &dev->nodes.list) {
                        dir = list_entry(pos, struct ft1000_debug_dirs, list);
@@ -253,7 +259,7 @@ void ft1000_destroy_dev(struct net_device *netdev)
                DEBUG("%s: unregistered device \"%s\"\n", __func__,
                                           dev->DeviceName);
 
-        // Make sure we free any memory reserve for slow Queue
+        /* Make sure we free any memory reserve for slow Queue */
         for (i=0; i<MAX_NUM_APP; i++) {
             while (list_empty(&dev->app_info[i].app_sqlist) == 0) {
                 pdpram_blk = list_entry(dev->app_info[i].app_sqlist.next, struct dpram_blk, list);
@@ -264,7 +270,7 @@ void ft1000_destroy_dev(struct net_device *netdev)
             wake_up_interruptible(&dev->app_info[i].wait_dpram_msg);
         }
 
-        // Remove buffer allocated for receive command data
+        /* Remove buffer allocated for receive command data */
         if (ft1000_flarion_cnt == 0) {
             while (list_empty(&freercvpool) == 0) {
                ptr = list_entry(freercvpool.next, struct dpram_blk, list);
@@ -279,17 +285,19 @@ void ft1000_destroy_dev(struct net_device *netdev)
 
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_open
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static int ft1000_open (struct inode *inode, struct file *file)
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_open
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static int ft1000_open(struct inode *inode, struct file *file)
 {
        struct ft1000_info *info;
        struct ft1000_usb *dev = (struct ft1000_usb *)inode->i_private;
@@ -301,22 +309,22 @@ static int ft1000_open (struct inode *inode, struct file *file)
 
        info = file->private_data = netdev_priv(dev->net);
 
-    DEBUG("f_owner = %p number of application = %d\n", (&file->f_owner), dev->appcnt );
+    DEBUG("f_owner = %p number of application = %d\n", (&file->f_owner), dev->appcnt);
 
-    // Check if maximum number of application exceeded
+    /* Check if maximum number of application exceeded */
     if (dev->appcnt > MAX_NUM_APP) {
         DEBUG("Maximum number of application exceeded\n");
         return -EACCES;
     }
 
-    // Search for available application info block
+    /* Search for available application info block */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( (dev->app_info[i].fileobject == NULL) ) {
+        if (dev->app_info[i].fileobject == NULL) {
             break;
         }
     }
 
-    // Fail due to lack of application info block
+    /* Fail due to lack of application info block */
     if (i == MAX_NUM_APP) {
         DEBUG("Could not find an application info block\n");
         return -EACCES;
@@ -334,16 +342,18 @@ static int ft1000_open (struct inode *inode, struct file *file)
 }
 
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_poll_dev
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_poll_dev
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
 
 static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait)
 {
@@ -352,24 +362,24 @@ static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait)
        struct ft1000_usb *dev = info->priv;
     int i;
 
-    //DEBUG("ft1000_poll_dev called\n");
+    /* DEBUG("ft1000_poll_dev called\n"); */
     if (ft1000_flarion_cnt == 0) {
         DEBUG("FT1000:ft1000_poll_dev called when ft1000_flarion_cnt is zero\n");
         return (-EBADF);
     }
 
-    // Search for matching file object
+    /* Search for matching file object */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( dev->app_info[i].fileobject == &file->f_owner) {
-            //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", dev->app_info[i].app_id);
+        if (dev->app_info[i].fileobject == &file->f_owner) {
+            /* DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", dev->app_info[i].app_id); */
             break;
         }
     }
 
-    // Could not find application info block
+    /* Could not find application info block */
     if (i == MAX_NUM_APP) {
         DEBUG("FT1000:ft1000_ioctl:Could not find application info block\n");
-        return ( -EACCES );
+        return (-EACCES);
     }
 
     if (list_empty(&dev->app_info[i].app_sqlist) == 0) {
@@ -377,23 +387,25 @@ static unsigned int ft1000_poll_dev(struct file *file, poll_table *wait)
         return(POLLIN | POLLRDNORM | POLLPRI);
     }
 
-    poll_wait (file, &dev->app_info[i].wait_dpram_msg, wait);
-    //DEBUG("FT1000:ft1000_poll_dev:Polling for data from DSP\n");
+    poll_wait(file, &dev->app_info[i].wait_dpram_msg, wait);
+    /* DEBUG("FT1000:ft1000_poll_dev:Polling for data from DSP\n"); */
 
     return (0);
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_ioctl
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static long ft1000_ioctl (struct file *file, unsigned int command,
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_ioctl
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static long ft1000_ioctl(struct file *file, unsigned int command,
                            unsigned long argument)
 {
     void __user *argp = (void __user *)argument;
@@ -417,21 +429,21 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
     unsigned short ledStat=0;
     unsigned short conStat=0;
 
-    //DEBUG("ft1000_ioctl called\n");
+    /* DEBUG("ft1000_ioctl called\n"); */
 
     if (ft1000_flarion_cnt == 0) {
         DEBUG("FT1000:ft1000_ioctl called when ft1000_flarion_cnt is zero\n");
         return (-EBADF);
     }
 
-    //DEBUG("FT1000:ft1000_ioctl:command = 0x%x argument = 0x%8x\n", command, (u32)argument);
+    /* DEBUG("FT1000:ft1000_ioctl:command = 0x%x argument = 0x%8x\n", command, (u32)argument); */
 
        info = file->private_data;
        ft1000dev = info->priv;
     cmd = _IOC_NR(command);
-    //DEBUG("FT1000:ft1000_ioctl:cmd = 0x%x\n", cmd);
+    /* DEBUG("FT1000:ft1000_ioctl:cmd = 0x%x\n", cmd); */
 
-    // process the command
+    /* process the command */
     switch (cmd) {
     case IOCTL_REGISTER_CMD:
             DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_REGISTER called\n");
@@ -441,7 +453,7 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                 break;
             }
             if (tempword == DSPBCMSGID) {
-                // Search for matching file object
+                /* Search for matching file object */
                 for (i=0; i<MAX_NUM_APP; i++) {
                     if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
                         ft1000dev->app_info[i].DspBCMsgFlag = 1;
@@ -457,7 +469,7 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
 
         get_ver_data.drv_ver = FT1000_DRV_VER;
 
-        if (copy_to_user(argp, &get_ver_data, sizeof(get_ver_data)) ) {
+        if (copy_to_user(argp, &get_ver_data, sizeof(get_ver_data))) {
             DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
             result = -EFAULT;
             break;
@@ -467,20 +479,20 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
 
         break;
     case IOCTL_CONNECT:
-        // Connect Message
+        /* Connect Message */
         DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_CONNECT\n");
         ConnectionMsg[79] = 0xfc;
                           card_send_command(ft1000dev, (unsigned short *)ConnectionMsg, 0x4c);
 
         break;
     case IOCTL_DISCONNECT:
-        // Disconnect Message
+        /* Disconnect Message */
         DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_DISCONNECT\n");
         ConnectionMsg[79] = 0xfd;
                           card_send_command(ft1000dev, (unsigned short *)ConnectionMsg, 0x4c);
         break;
     case IOCTL_GET_DSP_STAT_CMD:
-        //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DSP_STAT called\n");
+        /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DSP_STAT called\n"); */
        memset(&get_stat_data, 0, sizeof(get_stat_data));
         memcpy(get_stat_data.DspVer, info->DspVer, DSPVERSZ);
         memcpy(get_stat_data.HwSerNum, info->HwSerNum, HWSERNUMSZ);
@@ -494,8 +506,7 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                 ft1000_read_dpram16(ft1000dev, FT1000_MAG_DSP_CON_STATE, (u8 *)&conStat, FT1000_MAG_DSP_CON_STATE_INDX);
                 get_stat_data.ConStat = ntohs(conStat);
                 DEBUG("FT1000:ft1000_ioctl: ConStat = 0x%x\n", get_stat_data.ConStat);
-            }
-            else {
+            } else {
                 get_stat_data.ConStat = 0x0f;
             }
 
@@ -504,10 +515,10 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
         get_stat_data.nRxPkts = info->stats.rx_packets;
         get_stat_data.nTxBytes = info->stats.tx_bytes;
         get_stat_data.nRxBytes = info->stats.rx_bytes;
-        do_gettimeofday ( &tv );
+        do_gettimeofday(&tv);
         get_stat_data.ConTm = (u32)(tv.tv_sec - info->ConTm);
         DEBUG("Connection Time = %d\n", (int)get_stat_data.ConTm);
-        if (copy_to_user(argp, &get_stat_data, sizeof(get_stat_data)) ) {
+        if (copy_to_user(argp, &get_stat_data, sizeof(get_stat_data))) {
             DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
             result = -EFAULT;
             break;
@@ -517,7 +528,7 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
     case IOCTL_SET_DPRAM_CMD:
         {
             IOCTL_DPRAM_BLK *dpram_data = NULL;
-            //IOCTL_DPRAM_COMMAND dpram_command;
+            /* IOCTL_DPRAM_COMMAND dpram_command; */
             u16 qtype;
             u16 msgsz;
                struct pseudo_hdr *ppseudo_hdr;
@@ -526,7 +537,7 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
             u16 app_index;
             u16 status;
 
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_SET_DPRAM called\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_SET_DPRAM called\n"); */
 
 
             if (ft1000_flarion_cnt == 0) {
@@ -545,12 +556,12 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
 
             if (info->CardReady) {
 
-               //DEBUG("FT1000:ft1000_ioctl: try to SET_DPRAM \n");
+               /* DEBUG("FT1000:ft1000_ioctl: try to SET_DPRAM \n"); */
 
-                // Get the length field to see how many bytes to copy
+                /* Get the length field to see how many bytes to copy */
                 result = get_user(msgsz, (__u16 __user *)argp);
-                msgsz = ntohs (msgsz);
-                //DEBUG("FT1000:ft1000_ioctl: length of message = %d\n", msgsz);
+                msgsz = ntohs(msgsz);
+                /* DEBUG("FT1000:ft1000_ioctl: length of message = %d\n", msgsz); */
 
                 if (msgsz > MAX_CMD_SQSIZE) {
                     DEBUG("FT1000:ft1000_ioctl: bad message length = %d\n", msgsz);
@@ -563,12 +574,11 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                if (!dpram_data)
                        break;
 
-                if ( copy_from_user(dpram_data, argp, msgsz+2) ) {
+                if (copy_from_user(dpram_data, argp, msgsz+2)) {
                     DEBUG("FT1000:ft1000_ChIoctl: copy fault occurred\n");
                     result = -EFAULT;
-                }
-                else {
-                    // Check if this message came from a registered application
+                } else {
+                    /* Check if this message came from a registered application */
                     for (i=0; i<MAX_NUM_APP; i++) {
                         if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
                             break;
@@ -582,28 +592,27 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                     }
                     app_index = i;
 
-                    // Check message qtype type which is the lower byte within qos_class
+                    /* Check message qtype type which is the lower byte within qos_class */
                     qtype = ntohs(dpram_data->pseudohdr.qos_class) & 0xff;
-                    //DEBUG("FT1000_ft1000_ioctl: qtype = %d\n", qtype);
+                    /* DEBUG("FT1000_ft1000_ioctl: qtype = %d\n", qtype); */
                     if (qtype) {
-                    }
-                    else {
-                        // Put message into Slow Queue
-                        // Only put a message into the DPRAM if msg doorbell is available
+                    } else {
+                        /* Put message into Slow Queue */
+                        /* Only put a message into the DPRAM if msg doorbell is available */
                         status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
-                        //DEBUG("FT1000_ft1000_ioctl: READ REGISTER tempword=%x\n", tempword);
+                        /* DEBUG("FT1000_ft1000_ioctl: READ REGISTER tempword=%x\n", tempword); */
                         if (tempword & FT1000_DB_DPRAM_TX) {
-                            // Suspend for 2ms and try again due to DSP doorbell busy
+                            /* Suspend for 2ms and try again due to DSP doorbell busy */
                             mdelay(2);
                             status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                             if (tempword & FT1000_DB_DPRAM_TX) {
-                                // Suspend for 1ms and try again due to DSP doorbell busy
+                                /* Suspend for 1ms and try again due to DSP doorbell busy */
                                 mdelay(1);
                                 status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                 if (tempword & FT1000_DB_DPRAM_TX) {
                                     status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                     if (tempword & FT1000_DB_DPRAM_TX) {
-                                        // Suspend for 3ms and try again due to DSP doorbell busy
+                                        /* Suspend for 3ms and try again due to DSP doorbell busy */
                                         mdelay(3);
                                         status = ft1000_read_register(ft1000dev, &tempword, FT1000_REG_DOORBELL);
                                         if (tempword & FT1000_DB_DPRAM_TX) {
@@ -617,11 +626,11 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                             }
                         }
 
-                        //DEBUG("FT1000_ft1000_ioctl: finished reading register\n");
+                        /* DEBUG("FT1000_ft1000_ioctl: finished reading register\n"); */
 
-                        // Make sure we are within the limits of the slow queue memory limitation
-                        if ( (msgsz < MAX_CMD_SQSIZE) && (msgsz > PSEUDOSZ) ) {
-                            // Need to put sequence number plus new checksum for message
+                        /* Make sure we stay within the slow queue memory limit */
+                        if ((msgsz < MAX_CMD_SQSIZE) && (msgsz > PSEUDOSZ)) {
+                            /* Need to put sequence number plus new checksum for message */
                             pmsg = (u16 *)&dpram_data->pseudohdr;
                                ppseudo_hdr = (struct pseudo_hdr *)pmsg;
                             total_len = msgsz+2;
@@ -629,15 +638,15 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
                                 total_len++;
                             }
 
-                            // Insert slow queue sequence number
+                            /* Insert slow queue sequence number */
                             ppseudo_hdr->seq_num = info->squeseqnum++;
                             ppseudo_hdr->portsrc = ft1000dev->app_info[app_index].app_id;
-                            // Calculate new checksum
+                            /* Calculate new checksum */
                             ppseudo_hdr->checksum = *pmsg++;
-                            //DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum);
+                            /* DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum); */
                             for (i=1; i<7; i++) {
                                 ppseudo_hdr->checksum ^= *pmsg++;
-                                //DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum);
+                                /* DEBUG("checksum = 0x%x\n", ppseudo_hdr->checksum); */
                             }
                             pmsg++;
                                ppseudo_hdr = (struct pseudo_hdr *)pmsg;
@@ -645,14 +654,12 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
 
 
                             ft1000dev->app_info[app_index].nTxMsg++;
-                        }
-                        else {
+                        } else {
                             result = -EINVAL;
                         }
                     }
                 }
-            }
-            else {
+            } else {
                 DEBUG("FT1000:ft1000_ioctl: Card not ready take messages\n");
                 result = -EACCES;
             }
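
The doorbell handling in this hunk is a hand-unrolled retry ladder: re-read
FT1000_REG_DOORBELL after 2 ms, 1 ms, immediately, then 3 ms while
FT1000_DB_DPRAM_TX stays set. A loop-based sketch of the same polling
sequence (illustrative only; delays copied from the code above):

	static const unsigned int delay_ms[] = { 2, 1, 0, 3 };
	int i;

	status = ft1000_read_register(ft1000dev, &tempword,
				      FT1000_REG_DOORBELL);
	for (i = 0; i < ARRAY_SIZE(delay_ms) &&
		    (tempword & FT1000_DB_DPRAM_TX); i++) {
		if (delay_ms[i])
			mdelay(delay_ms[i]);
		status = ft1000_read_register(ft1000dev, &tempword,
					      FT1000_REG_DOORBELL);
	}
	/* if the doorbell is still busy here, give up as the code above does */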
@@ -666,21 +673,21 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
             IOCTL_DPRAM_BLK __user *pioctl_dpram;
             int msglen;
 
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM called\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM called\n"); */
 
             if (ft1000_flarion_cnt == 0) {
                 return (-EBADF);
             }
 
-            // Search for matching file object
+            /* Search for matching file object */
             for (i=0; i<MAX_NUM_APP; i++) {
                 if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
-                    //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id);
+                    /* DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id); */
                     break;
                 }
             }
 
-            // Could not find application info block
+            /* Could not find application info block */
             if (i == MAX_NUM_APP) {
                 DEBUG("FT1000:ft1000_ioctl:Could not find application info block\n");
                 result = -EBADF;
@@ -690,30 +697,29 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
             result = 0;
             pioctl_dpram = argp;
             if (list_empty(&ft1000dev->app_info[i].app_sqlist) == 0) {
-                //DEBUG("FT1000:ft1000_ioctl:Message detected in slow queue\n");
+                /* DEBUG("FT1000:ft1000_ioctl:Message detected in slow queue\n"); */
                 spin_lock_irqsave(&free_buff_lock, flags);
                 pdpram_blk = list_entry(ft1000dev->app_info[i].app_sqlist.next, struct dpram_blk, list);
                 list_del(&pdpram_blk->list);
                 ft1000dev->app_info[i].NumOfMsg--;
-                //DEBUG("FT1000:ft1000_ioctl:NumOfMsg for app %d = %d\n", i, ft1000dev->app_info[i].NumOfMsg);
+                /* DEBUG("FT1000:ft1000_ioctl:NumOfMsg for app %d = %d\n", i, ft1000dev->app_info[i].NumOfMsg); */
                 spin_unlock_irqrestore(&free_buff_lock, flags);
                 msglen = ntohs(*(u16 *)pdpram_blk->pbuffer) + PSEUDOSZ;
                 result = get_user(msglen, &pioctl_dpram->total_len);
                if (result)
                        break;
                msglen = htons(msglen);
-                //DEBUG("FT1000:ft1000_ioctl:msg length = %x\n", msglen);
-                if(copy_to_user (&pioctl_dpram->pseudohdr, pdpram_blk->pbuffer, msglen))
-                               {
+                /* DEBUG("FT1000:ft1000_ioctl:msg length = %x\n", msglen); */
+                if (copy_to_user(&pioctl_dpram->pseudohdr, pdpram_blk->pbuffer, msglen)) {
                                        DEBUG("FT1000:ft1000_ioctl: copy fault occurred\n");
-                       result = -EFAULT;
-                       break;
+                       result = -EFAULT;
+                       break;
                                }
 
                 ft1000_free_buffer(pdpram_blk, &freercvpool);
                 result = msglen;
             }
-            //DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM no message\n");
+            /* DEBUG("FT1000:ft1000_ioctl: IOCTL_FT1000_GET_DPRAM no message\n"); */
         }
         break;
 
@@ -726,17 +732,19 @@ static long ft1000_ioctl (struct file *file, unsigned int command,
     return result;
 }
 
-//---------------------------------------------------------------------------
-// Function:    ft1000_release
-//
-// Parameters:
-//
-// Description:
-//
-// Notes:
-//
-//---------------------------------------------------------------------------
-static int ft1000_release (struct inode *inode, struct file *file)
+/*
+*---------------------------------------------------------------------------
+* Function:    ft1000_release
+*
+* Parameters:
+*
+* Description:
+*
+* Notes:
+*
+*---------------------------------------------------------------------------
+*/
+static int ft1000_release(struct inode *inode, struct file *file)
 {
        struct ft1000_info *info;
     struct net_device *dev;
@@ -755,10 +763,10 @@ static int ft1000_release (struct inode *inode, struct file *file)
         return (-EBADF);
     }
 
-    // Search for matching file object
+    /* Search for matching file object */
     for (i=0; i<MAX_NUM_APP; i++) {
-        if ( ft1000dev->app_info[i].fileobject == &file->f_owner) {
-            //DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id);
+        if (ft1000dev->app_info[i].fileobject == &file->f_owner) {
+            /* DEBUG("FT1000:ft1000_ioctl: Message is for AppId = %d\n", ft1000dev->app_info[i].app_id); */
             break;
         }
     }
@@ -773,11 +781,10 @@ static int ft1000_release (struct inode *inode, struct file *file)
         ft1000_free_buffer(pdpram_blk, &freercvpool);
     }
 
-    // initialize application information
+    /* initialize application information */
     ft1000dev->appcnt--;
     DEBUG("ft1000_chdev:%s:appcnt = %d\n", __FUNCTION__, ft1000dev->appcnt);
     ft1000dev->app_info[i].fileobject = NULL;
 
     return 0;
 }
-
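
For context, the ft1000_get_buffer()/ft1000_free_buffer() pair cleaned up
above implements a spinlock-protected free pool for slow-queue messages. A
usage sketch (illustrative only; the real callers sit in the receive path and
in ft1000_ioctl above):

	struct dpram_blk *blk;

	blk = ft1000_get_buffer(&freercvpool);	/* NULL when pool is empty */
	if (blk) {
		/* ... consume or fill blk->pbuffer ... */
		ft1000_free_buffer(blk, &freercvpool);	/* back to the pool */
	}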
index 3f4207fd1597c1843b661ada1136ddf5ae47a235..24b8d77a132cc49b048a5ff1ddf9ad84c5324077 100644 (file)
@@ -1,91 +1,89 @@
-//---------------------------------------------------------------------------
-// FT1000 driver for Flarion Flash OFDM NIC Device
-//
-// Copyright (C) 2002 Flarion Technologies, All rights reserved.
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by the Free
-// Software Foundation; either version 2 of the License, or (at your option) any
-// later version. This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-// more details. You should have received a copy of the GNU General Public
-// License along with this program; if not, write to the
-// Free Software Foundation, Inc., 59 Temple Place -
-// Suite 330, Boston, MA 02111-1307, USA.
-//---------------------------------------------------------------------------
-//
-// File:         ft1000_ioctl.h
-//
-// Description:    Common structures and defines relating to IOCTL
-//
-// History:
-// 11/5/02    Whc                Created.
-//
-//---------------------------------------------------------------------------//---------------------------------------------------------------------------
+/*
+*---------------------------------------------------------------------------
+* FT1000 driver for Flarion Flash OFDM NIC Device
+*
+* Copyright (C) 2002 Flarion Technologies, All rights reserved.
+*
+* This program is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License as published by the Free
+* Software Foundation; either version 2 of the License, or (at your option) any
+* later version. This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+* or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+* more details. You should have received a copy of the GNU General Public
+* License along with this program; if not, write to the
+* Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+*---------------------------------------------------------------------------
+*
+* File:         ft1000_ioctl.h
+*
+* Description:    Common structures and defines relating to IOCTL
+*
+* History:
+* 11/5/02    Whc                Created.
+*
+*---------------------------------------------------------------------------
+*/
 #ifndef _FT1000IOCTLH_
 #define _FT1000IOCTLH_
 
-typedef struct _IOCTL_GET_VER
-{
+typedef struct _IOCTL_GET_VER {
     unsigned long drv_ver;
 } __attribute__ ((packed)) IOCTL_GET_VER, *PIOCTL_GET_VER;
 
-//Data structure for Dsp statistics
-typedef struct _IOCTL_GET_DSP_STAT
-{
-    unsigned char DspVer[DSPVERSZ];        // DSP version number
-    unsigned char HwSerNum[HWSERNUMSZ];    // Hardware Serial Number
-    unsigned char Sku[SKUSZ];              // SKU
-    unsigned char eui64[EUISZ];            // EUI64
-    unsigned short ConStat;                // Connection Status
-                                //    Bits 0-3 = Connection Status Field
-                                //               0000=Idle (Disconnect)
-                                //               0001=Searching
-                                //               0010=Active (Connected)
-                                //               0011=Waiting for L2 down
-                                //               0100=Sleep
-    unsigned short LedStat;                // Led Status
-                                //    Bits 0-3   = Signal Strength Field
-                                //                 0000 = -105dBm to -92dBm
-                                //                 0001 = -92dBm to -85dBm
-                                //                 0011 = -85dBm to -75dBm
-                                //                 0111 = -75dBm to -50dBm
-                                //                 1111 = -50dBm to 0dBm
-                                //    Bits 4-7   = Reserved
-                                //    Bits 8-11  = SNR Field
-                                //                 0000 = <2dB
-                                //                 0001 = 2dB to 8dB
-                                //                 0011 = 8dB to 15dB
-                                //                 0111 = 15dB to 22dB
-                                //                 1111 = >22dB
-                                //    Bits 12-15 = Reserved
-    unsigned long nTxPkts;                // Number of packets transmitted from host to dsp
-    unsigned long nRxPkts;                // Number of packets received from dsp to host
-    unsigned long nTxBytes;               // Number of bytes transmitted from host to dsp
-    unsigned long nRxBytes;               // Number of bytes received from dsp to host
-    unsigned long ConTm;                  // Current session connection time in seconds
-    unsigned char CalVer[CALVERSZ];       // Proprietary Calibration Version
-    unsigned char CalDate[CALDATESZ];     // Proprietary Calibration Date
+/* Data structure for Dsp statistics */
+typedef struct _IOCTL_GET_DSP_STAT {
+    unsigned char DspVer[DSPVERSZ];        /* DSP version number */
+    unsigned char HwSerNum[HWSERNUMSZ];    /* Hardware Serial Number */
+    unsigned char Sku[SKUSZ];              /* SKU */
+    unsigned char eui64[EUISZ];            /* EUI64 */
+    unsigned short ConStat;                /* Connection Status */
+                                /*    Bits 0-3 = Connection Status Field */
+                                /*               0000=Idle (Disconnect) */
+                                /*               0001=Searching */
+                                /*               0010=Active (Connected) */
+                                /*               0011=Waiting for L2 down */
+                                /*               0100=Sleep */
+    unsigned short LedStat;                /* Led Status */
+                                /*    Bits 0-3   = Signal Strength Field */
+                                /*                 0000 = -105dBm to -92dBm */
+                                /*                 0001 = -92dBm to -85dBm */
+                                /*                 0011 = -85dBm to -75dBm */
+                                /*                 0111 = -75dBm to -50dBm */
+                                /*                 1111 = -50dBm to 0dBm */
+                                /*    Bits 4-7   = Reserved */
+                                /*    Bits 8-11  = SNR Field */
+                                /*                 0000 = <2dB */
+                                /*                 0001 = 2dB to 8dB */
+                                /*                 0011 = 8dB to 15dB */
+                                /*                 0111 = 15dB to 22dB */
+                                /*                 1111 = >22dB */
+                                /*    Bits 12-15 = Reserved */
+    unsigned long nTxPkts;                /* Number of packets transmitted from host to dsp */
+    unsigned long nRxPkts;                /* Number of packets received from dsp to host */
+    unsigned long nTxBytes;               /* Number of bytes transmitted from host to dsp */
+    unsigned long nRxBytes;               /* Number of bytes received from dsp to host */
+    unsigned long ConTm;                  /* Current session connection time in seconds */
+    unsigned char CalVer[CALVERSZ];       /* Proprietary Calibration Version */
+    unsigned char CalDate[CALDATESZ];     /* Proprietary Calibration Date */
 } __attribute__ ((packed)) IOCTL_GET_DSP_STAT, *PIOCTL_GET_DSP_STAT;
 
-//Data structure for Dual Ported RAM messaging between Host and Dsp
-typedef struct _IOCTL_DPRAM_BLK
-{
+/* Data structure for Dual Ported RAM messaging between Host and Dsp */
+typedef struct _IOCTL_DPRAM_BLK {
     unsigned short total_len;
        struct pseudo_hdr pseudohdr;
     unsigned char buffer[1780];
 } __attribute__ ((packed)) IOCTL_DPRAM_BLK, *PIOCTL_DPRAM_BLK;
 
-typedef struct _IOCTL_DPRAM_COMMAND
-{
+typedef struct _IOCTL_DPRAM_COMMAND {
     unsigned short extra;
     IOCTL_DPRAM_BLK dpram_blk;
 } __attribute__ ((packed)) IOCTL_DPRAM_COMMAND, *PIOCTL_DPRAM_COMMAND;
 
-//
-// Custom IOCTL command codes
-//
+/*
+* Custom IOCTL command codes
+*/
 #define FT1000_MAGIC_CODE      'F'
 
 #define IOCTL_REGISTER_CMD                                     0
@@ -96,12 +94,12 @@ typedef struct _IOCTL_DPRAM_COMMAND
 #define IOCTL_CONNECT               10
 #define IOCTL_DISCONNECT            11
 
-#define IOCTL_FT1000_GET_DSP_STAT _IOR (FT1000_MAGIC_CODE, IOCTL_GET_DSP_STAT_CMD, sizeof(IOCTL_GET_DSP_STAT) )
-#define IOCTL_FT1000_GET_VER _IOR (FT1000_MAGIC_CODE, IOCTL_GET_VER_CMD, sizeof(IOCTL_GET_VER) )
-#define IOCTL_FT1000_CONNECT _IOW (FT1000_MAGIC_CODE, IOCTL_CONNECT, 0 )
-#define IOCTL_FT1000_DISCONNECT _IOW (FT1000_MAGIC_CODE, IOCTL_DISCONNECT, 0 )
-#define IOCTL_FT1000_SET_DPRAM _IOW (FT1000_MAGIC_CODE, IOCTL_SET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK) )
-#define IOCTL_FT1000_GET_DPRAM _IOR (FT1000_MAGIC_CODE, IOCTL_GET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK) )
-#define IOCTL_FT1000_REGISTER  _IOW (FT1000_MAGIC_CODE, IOCTL_REGISTER_CMD, sizeof(unsigned short *) )
-#endif // _FT1000IOCTLH_
+#define IOCTL_FT1000_GET_DSP_STAT _IOR(FT1000_MAGIC_CODE, IOCTL_GET_DSP_STAT_CMD, sizeof(IOCTL_GET_DSP_STAT))
+#define IOCTL_FT1000_GET_VER _IOR(FT1000_MAGIC_CODE, IOCTL_GET_VER_CMD, sizeof(IOCTL_GET_VER))
+#define IOCTL_FT1000_CONNECT _IOW(FT1000_MAGIC_CODE, IOCTL_CONNECT, 0)
+#define IOCTL_FT1000_DISCONNECT _IOW(FT1000_MAGIC_CODE, IOCTL_DISCONNECT, 0)
+#define IOCTL_FT1000_SET_DPRAM _IOW(FT1000_MAGIC_CODE, IOCTL_SET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK))
+#define IOCTL_FT1000_GET_DPRAM _IOR(FT1000_MAGIC_CODE, IOCTL_GET_DPRAM_CMD, sizeof(IOCTL_DPRAM_BLK))
+#define IOCTL_FT1000_REGISTER  _IOW(FT1000_MAGIC_CODE, IOCTL_REGISTER_CMD, sizeof(unsigned short *))
+#endif /* _FT1000IOCTLH_ */
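
For reference, the request codes above are consumed from userspace via plain
ioctl(). A hedged example (the open() path is hypothetical; the driver
registers per-card debugfs entries named FT1000_<n>, so the real node
location depends on where debugfs is mounted):

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	/* plus this header for IOCTL_GET_VER / IOCTL_FT1000_GET_VER */

	int main(void)
	{
		IOCTL_GET_VER ver;
		int fd = open("/sys/kernel/debug/FT1000_0", O_RDWR); /* hypothetical */

		if (fd < 0 || ioctl(fd, IOCTL_FT1000_GET_VER, &ver) < 0) {
			perror("ft1000");
			return 1;
		}
		printf("driver version: %lu\n", ver.drv_ver);
		return 0;
	}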
 
index 614db55a817116156d4137f74ad3ec354262dcb8..29a7cd23845d4a6b578f977c6a8fc169b19f7c31 100644 (file)
@@ -79,8 +79,12 @@ static int ft1000_probe(struct usb_interface *interface,
        ft1000dev->dev = dev;
        ft1000dev->status = 0;
        ft1000dev->net = NULL;
-       ft1000dev->tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
-       ft1000dev->rx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+       ft1000dev->tx_urb = usb_alloc_urb(0, GFP_KERNEL);
+       ft1000dev->rx_urb = usb_alloc_urb(0, GFP_KERNEL);
+       if (!ft1000dev->tx_urb || !ft1000dev->rx_urb) {
+               ret = -ENOMEM;
+               goto err_fw;
+       }
 
        DEBUG("ft1000_probe is called\n");
        numaltsetting = interface->num_altsetting;
@@ -209,6 +213,8 @@ err_thread:
 err_load:
        kfree(pFileStart);
 err_fw:
+       usb_free_urb(ft1000dev->rx_urb);
+       usb_free_urb(ft1000dev->tx_urb);
        kfree(ft1000dev);
        return ret;
 }
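
Two details make the new error path above safe: probe() runs in process
context, so GFP_KERNEL (which may sleep) is the right allocation class, and
usb_free_urb() is documented to be a no-op for a NULL argument, so err_fw can
free both urbs unconditionally even when only one allocation succeeded:

	usb_free_urb(NULL);	/* no-op by contract, like kfree(NULL) */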
index e5818a1c22620914fb5afe279e32c657414715b3..4e1cd5e9ea37f68f785f79471a8e652626de0657 100644 (file)
@@ -18,6 +18,8 @@
  * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/device.h>
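
Defining pr_fmt() before the first include changes how every pr_*() call in
this file expands. A sketch of the effect (the message text is made up for
illustration):

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	pr_err("tx fifo overflow\n");
	/* expands to printk(KERN_ERR KBUILD_MODNAME ": " "tx fifo overflow\n"),
	 * so every message gains the module-name prefix automatically. */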
@@ -101,13 +103,16 @@ struct fwtty_transaction {
 };
 
 #define to_device(a, b)                        (a->b)
-#define fwtty_err(p, s, v...)          dev_err(to_device(p, device), s, ##v)
-#define fwtty_info(p, s, v...)         dev_info(to_device(p, device), s, ##v)
-#define fwtty_notice(p, s, v...)       dev_notice(to_device(p, device), s, ##v)
-#define fwtty_dbg(p, s, v...)          \
-               dev_dbg(to_device(p, device), "%s: " s, __func__, ##v)
-#define fwtty_err_ratelimited(p, s, v...) \
-               dev_err_ratelimited(to_device(p, device), s, ##v)
+#define fwtty_err(p, fmt, ...)                                         \
+       dev_err(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_info(p, fmt, ...)                                                \
+       dev_info(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_notice(p, fmt, ...)                                      \
+       dev_notice(to_device(p, device), fmt, ##__VA_ARGS__)
+#define fwtty_dbg(p, fmt, ...)                                         \
+       dev_dbg(to_device(p, device), "%s: " fmt, __func__, ##__VA_ARGS__)
+#define fwtty_err_ratelimited(p, fmt, ...)                             \
+       dev_err_ratelimited(to_device(p, device), fmt, ##__VA_ARGS__)
 
 #ifdef DEBUG
 static inline void debug_short_write(struct fwtty_port *port, int c, int n)
@@ -118,7 +123,7 @@ static inline void debug_short_write(struct fwtty_port *port, int c, int n)
                spin_lock_bh(&port->lock);
                avail = dma_fifo_avail(&port->tx_fifo);
                spin_unlock_bh(&port->lock);
-               fwtty_dbg(port, "short write: avail:%d req:%d wrote:%d",
+               fwtty_dbg(port, "short write: avail:%d req:%d wrote:%d\n",
                          avail, c, n);
        }
 }
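
The rewritten logging macros above move from the GNU named-variadic form
(v...) to C99 __VA_ARGS__; the ## pasting still swallows the trailing comma
when no arguments follow the format string. A small sketch of the pattern:

	#include <linux/device.h>

	#define my_dev_dbg(dev, fmt, ...) \
		dev_dbg(dev, "%s: " fmt, __func__, ##__VA_ARGS__)

	/*
	 * Both expansions are valid:
	 *   my_dev_dbg(dev, "done\n");          - comma swallowed
	 *   my_dev_dbg(dev, "rcode: %d\n", rc); - arguments forwarded
	 */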
@@ -197,22 +202,22 @@ static void fwtty_log_tx_error(struct fwtty_port *port, int rcode)
 {
        switch (rcode) {
        case RCODE_SEND_ERROR:
-               fwtty_err_ratelimited(port, "card busy");
+               fwtty_err_ratelimited(port, "card busy\n");
                break;
        case RCODE_ADDRESS_ERROR:
-               fwtty_err_ratelimited(port, "bad unit addr or write length");
+               fwtty_err_ratelimited(port, "bad unit addr or write length\n");
                break;
        case RCODE_DATA_ERROR:
-               fwtty_err_ratelimited(port, "failed rx");
+               fwtty_err_ratelimited(port, "failed rx\n");
                break;
        case RCODE_NO_ACK:
-               fwtty_err_ratelimited(port, "missing ack");
+               fwtty_err_ratelimited(port, "missing ack\n");
                break;
        case RCODE_BUSY:
-               fwtty_err_ratelimited(port, "remote busy");
+               fwtty_err_ratelimited(port, "remote busy\n");
                break;
        default:
-               fwtty_err_ratelimited(port, "failed tx: %d", rcode);
+               fwtty_err_ratelimited(port, "failed tx: %d\n", rcode);
        }
 }
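
Most of the remaining hunks in this file append a terminating "\n" to log
messages. The newline marks the end of a printk record; without it the next
message can be treated as a continuation (KERN_CONT semantics) and flushing
may be delayed. A two-line sketch of the difference:

	#include <linux/printk.h>

	static void newline_example(void)
	{
		pr_err("card busy");	/* open record, may be continued */
		pr_err("card busy\n");	/* complete, self-terminated record */
	}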
 
@@ -287,7 +292,7 @@ static void __fwtty_restart_tx(struct fwtty_port *port)
                schedule_delayed_work(&port->drain, 0);
        avail = dma_fifo_avail(&port->tx_fifo);
 
-       fwtty_dbg(port, "fifo len: %d avail: %d", len, avail);
+       fwtty_dbg(port, "fifo len: %d avail: %d\n", len, avail);
 }
 
 static void fwtty_restart_tx(struct fwtty_port *port)
@@ -323,7 +328,7 @@ static void fwtty_update_port_status(struct fwtty_port *port, unsigned status)
        if (delta & TIOCM_CTS)
                ++port->icount.cts;
 
-       fwtty_dbg(port, "status: %x delta: %x", status, delta);
+       fwtty_dbg(port, "status: %x delta: %x\n", status, delta);
 
        if (delta & TIOCM_CAR) {
                tty = tty_port_tty_get(&port->port);
@@ -509,7 +514,7 @@ static void fwtty_emit_breaks(struct work_struct *work)
        n = (elapsed * port->cps) / HZ + 1;
        port->break_last = now;
 
-       fwtty_dbg(port, "sending %d brks", n);
+       fwtty_dbg(port, "sending %d brks\n", n);
 
        while (n) {
                t = min(n, 16);
@@ -570,7 +575,7 @@ static int fwtty_buffer_rx(struct fwtty_port *port, unsigned char *d, size_t n)
        size_t size = (n + sizeof(struct buffered_rx) + 0xFF) & ~0xFF;
 
        if (port->buffered + n > HIGH_WATERMARK) {
-               fwtty_err_ratelimited(port, "overflowed rx buffer: buffered: %d new: %zu wtrmk: %d",
+               fwtty_err_ratelimited(port, "overflowed rx buffer: buffered: %d new: %zu wtrmk: %d\n",
                                      port->buffered, n, HIGH_WATERMARK);
                return 0;
        }
@@ -599,7 +604,7 @@ static int fwtty_rx(struct fwtty_port *port, unsigned char *data, size_t len)
        unsigned lsr;
        int err = 0;
 
-       fwtty_dbg(port, "%d", n);
+       fwtty_dbg(port, "%d\n", n);
        profile_size_distrib(port->stats.reads, n);
 
        if (port->write_only) {
@@ -689,7 +694,7 @@ static void fwtty_port_handler(struct fw_card *card,
        rcu_read_unlock();
        if (!peer || peer != rcu_access_pointer(port->peer)) {
                rcode = RCODE_ADDRESS_ERROR;
-               fwtty_err_ratelimited(port, "ignoring unauthenticated data");
+               fwtty_err_ratelimited(port, "ignoring unauthenticated data\n");
                goto respond;
        }
 
@@ -746,7 +751,7 @@ static void fwtty_tx_complete(struct fw_card *card, int rcode,
        struct fwtty_port *port = txn->port;
        int len;
 
-       fwtty_dbg(port, "rcode: %d", rcode);
+       fwtty_dbg(port, "rcode: %d\n", rcode);
 
        switch (rcode) {
        case RCODE_COMPLETE:
@@ -809,7 +814,7 @@ static int fwtty_tx(struct fwtty_port *port, bool drain)
                n = dma_fifo_out_pend(&port->tx_fifo, &txn->dma_pended);
                spin_unlock_bh(&port->lock);
 
-               fwtty_dbg(port, "out: %u rem: %d", txn->dma_pended.len, n);
+               fwtty_dbg(port, "out: %u rem: %d\n", txn->dma_pended.len, n);
 
                if (n < 0) {
                        kmem_cache_free(fwtty_txn_cache, txn);
@@ -819,7 +824,8 @@ static int fwtty_tx(struct fwtty_port *port, bool drain)
                                profile_size_distrib(port->stats.txns, 0);
                        else {
                                ++port->stats.fifo_errs;
-                               fwtty_err_ratelimited(port, "fifo err: %d", n);
+                               fwtty_err_ratelimited(port, "fifo err: %d\n",
+                                                     n);
                        }
                        break;
                }
@@ -877,7 +883,7 @@ static void fwtty_write_xchar(struct fwtty_port *port, char ch)
 
        ++port->stats.xchars;
 
-       fwtty_dbg(port, "%02x", ch);
+       fwtty_dbg(port, "%02x\n", ch);
 
        rcu_read_lock();
        peer = rcu_dereference(port->peer);
@@ -964,7 +970,7 @@ static void fwtty_port_dtr_rts(struct tty_port *tty_port, int on)
 {
        struct fwtty_port *port = to_port(tty_port, port);
 
-       fwtty_dbg(port, "on/off: %d", on);
+       fwtty_dbg(port, "on/off: %d\n", on);
 
        spin_lock_bh(&port->lock);
        /* Don't change carrier state if this is a console */
@@ -992,7 +998,7 @@ static int fwtty_port_carrier_raised(struct tty_port *tty_port)
 
        rc = (port->mstatus & TIOCM_CAR);
 
-       fwtty_dbg(port, "%d", rc);
+       fwtty_dbg(port, "%d\n", rc);
 
        return rc;
 }
@@ -1177,7 +1183,7 @@ static int fwtty_write(struct tty_struct *tty, const unsigned char *buf, int c)
        struct fwtty_port *port = tty->driver_data;
        int n, len;
 
-       fwtty_dbg(port, "%d", c);
+       fwtty_dbg(port, "%d\n", c);
        profile_size_distrib(port->stats.writes, c);
 
        spin_lock_bh(&port->lock);
@@ -1204,7 +1210,7 @@ static int fwtty_write_room(struct tty_struct *tty)
        n = dma_fifo_avail(&port->tx_fifo);
        spin_unlock_bh(&port->lock);
 
-       fwtty_dbg(port, "%d", n);
+       fwtty_dbg(port, "%d\n", n);
 
        return n;
 }
@@ -1218,7 +1224,7 @@ static int fwtty_chars_in_buffer(struct tty_struct *tty)
        n = dma_fifo_level(&port->tx_fifo);
        spin_unlock_bh(&port->lock);
 
-       fwtty_dbg(port, "%d", n);
+       fwtty_dbg(port, "%d\n", n);
 
        return n;
 }
@@ -1227,7 +1233,7 @@ static void fwtty_send_xchar(struct tty_struct *tty, char ch)
 {
        struct fwtty_port *port = tty->driver_data;
 
-       fwtty_dbg(port, "%02x", ch);
+       fwtty_dbg(port, "%02x\n", ch);
 
        fwtty_write_xchar(port, ch);
 }
@@ -1254,7 +1260,7 @@ static void fwtty_unthrottle(struct tty_struct *tty)
 {
        struct fwtty_port *port = tty->driver_data;
 
-       fwtty_dbg(port, "CRTSCTS: %d", (C_CRTSCTS(tty) != 0));
+       fwtty_dbg(port, "CRTSCTS: %d\n", (C_CRTSCTS(tty) != 0));
 
        profile_fifo_avail(port, port->stats.unthrottle);
 
@@ -1409,7 +1415,7 @@ static int fwtty_break_ctl(struct tty_struct *tty, int state)
        struct fwtty_port *port = tty->driver_data;
        long ret;
 
-       fwtty_dbg(port, "%d", state);
+       fwtty_dbg(port, "%d\n", state);
 
        if (state == -1) {
                set_bit(STOP_TX, &port->flags);
@@ -1446,7 +1452,7 @@ static int fwtty_tiocmget(struct tty_struct *tty)
        tiocm = (port->mctrl & MCTRL_MASK) | (port->mstatus & ~MCTRL_MASK);
        spin_unlock_bh(&port->lock);
 
-       fwtty_dbg(port, "%x", tiocm);
+       fwtty_dbg(port, "%x\n", tiocm);
 
        return tiocm;
 }
@@ -1455,7 +1461,7 @@ static int fwtty_tiocmset(struct tty_struct *tty, unsigned set, unsigned clear)
 {
        struct fwtty_port *port = tty->driver_data;
 
-       fwtty_dbg(port, "set: %x clear: %x", set, clear);
+       fwtty_dbg(port, "set: %x clear: %x\n", set, clear);
 
        /* TODO: simulate loopback if TIOCM_LOOP set */
 
@@ -1775,7 +1781,7 @@ static void fwserial_virt_plug_complete(struct fwtty_peer *peer,
        if (port->port.console && port->fwcon_ops->notify != NULL)
                (*port->fwcon_ops->notify)(FWCON_NOTIFY_ATTACH, port->con_data);
 
-       fwtty_info(&peer->unit, "peer (guid:%016llx) connected on %s",
+       fwtty_info(&peer->unit, "peer (guid:%016llx) connected on %s\n",
                   (unsigned long long)peer->guid, dev_name(port->device));
 }
 
@@ -1797,7 +1803,7 @@ static inline int fwserial_send_mgmt_sync(struct fwtty_peer *peer,
                                           pkt, be16_to_cpu(pkt->hdr.len));
                if (rcode == RCODE_BUSY || rcode == RCODE_SEND_ERROR ||
                    rcode == RCODE_GENERATION) {
-                       fwtty_dbg(&peer->unit, "mgmt write error: %d", rcode);
+                       fwtty_dbg(&peer->unit, "mgmt write error: %d\n", rcode);
                        continue;
                } else
                        break;
@@ -1918,7 +1924,7 @@ static int fwserial_connect_peer(struct fwtty_peer *peer)
 
        port = fwserial_find_port(peer);
        if (!port) {
-               fwtty_err(&peer->unit, "avail ports in use");
+               fwtty_err(&peer->unit, "avail ports in use\n");
                err = -EBUSY;
                goto free_pkt;
        }
@@ -2056,7 +2062,7 @@ static struct fwtty_peer *__fwserial_peer_by_node_id(struct fw_card *card,
                 * has created its remote unit device before this driver has
                 * been probed for any unit devices...
                 */
-               fwtty_err(card, "unknown card (guid %016llx)",
+               fwtty_err(card, "unknown card (guid %016llx)\n",
                          (unsigned long long) card->guid);
                return NULL;
        }
@@ -2084,8 +2090,8 @@ static void __dump_peer_list(struct fw_card *card)
        list_for_each_entry_rcu(peer, &serial->peer_list, list) {
                int g = peer->generation;
                smp_rmb();
-               fwtty_dbg(card, "peer(%d:%x) guid: %016llx\n", g,
-                         peer->node_id, (unsigned long long) peer->guid);
+               fwtty_dbg(card, "peer(%d:%x) guid: %016llx\n",
+                         g, peer->node_id, (unsigned long long) peer->guid);
        }
 }
 #else
@@ -2173,7 +2179,7 @@ static int fwserial_add_peer(struct fw_serial *serial, struct fw_unit *unit)
        peer->serial = serial;
        list_add_rcu(&peer->list, &serial->peer_list);
 
-       fwtty_info(&peer->unit, "peer added (guid:%016llx)",
+       fwtty_info(&peer->unit, "peer added (guid:%016llx)\n",
                   (unsigned long long)peer->guid);
 
        /* identify the local unit & virt cable to loopback port */
@@ -2236,7 +2242,7 @@ static void fwserial_remove_peer(struct fwtty_peer *peer)
 
        list_del_rcu(&peer->list);
 
-       fwtty_info(&peer->unit, "peer removed (guid:%016llx)",
+       fwtty_info(&peer->unit, "peer removed (guid:%016llx)\n",
                   (unsigned long long)peer->guid);
 
        spin_unlock_bh(&peer->lock);
@@ -2324,7 +2330,7 @@ static int fwserial_create(struct fw_unit *unit)
 
        err = fwtty_ports_add(serial);
        if (err) {
-               fwtty_err(&unit, "no space in port table");
+               fwtty_err(&unit, "no space in port table\n");
                goto free_ports;
        }
 
@@ -2335,7 +2341,8 @@ static int fwserial_create(struct fw_unit *unit)
                                                   card->device);
                if (IS_ERR(tty_dev)) {
                        err = PTR_ERR(tty_dev);
-                       fwtty_err(&unit, "register tty device error (%d)", err);
+                       fwtty_err(&unit, "register tty device error (%d)\n",
+                                 err);
                        goto unregister_ttys;
                }
 
@@ -2352,7 +2359,8 @@ static int fwserial_create(struct fw_unit *unit)
                                                    card->device);
                if (IS_ERR(loop_dev)) {
                        err = PTR_ERR(loop_dev);
-                       fwtty_err(&unit, "create loop device failed (%d)", err);
+                       fwtty_err(&unit, "create loop device failed (%d)\n",
+                                 err);
                        goto unregister_ttys;
                }
                serial->ports[j]->device = loop_dev;
@@ -2372,14 +2380,14 @@ static int fwserial_create(struct fw_unit *unit)
 
        list_add_rcu(&serial->list, &fwserial_list);
 
-       fwtty_notice(&unit, "TTY over FireWire on device %s (guid %016llx)",
+       fwtty_notice(&unit, "TTY over FireWire on device %s (guid %016llx)\n",
                     dev_name(card->device), (unsigned long long) card->guid);
 
        err = fwserial_add_peer(serial, unit);
        if (!err)
                return 0;
 
-       fwtty_err(&unit, "unable to add peer unit device (%d)", err);
+       fwtty_err(&unit, "unable to add peer unit device (%d)\n", err);
 
        /* fall-through to error processing */
        debugfs_remove_recursive(serial->debugfs);
@@ -2621,7 +2629,7 @@ static void fwserial_handle_plug_req(struct work_struct *work)
        switch (peer->state) {
        case FWPS_NOT_ATTACHED:
                if (!port) {
-                       fwtty_err(&peer->unit, "no more ports avail");
+                       fwtty_err(&peer->unit, "no more ports avail\n");
                        fill_plug_rsp_nack(pkt);
                } else {
                        peer->port = port;
@@ -2663,7 +2671,7 @@ static void fwserial_handle_plug_req(struct work_struct *work)
                        fwtty_write_port_status(tmp);
                        spin_lock_bh(&peer->lock);
                } else {
-                       fwtty_err(&peer->unit, "PLUG_RSP error (%d)", rcode);
+                       fwtty_err(&peer->unit, "PLUG_RSP error (%d)\n", rcode);
                        port = peer_revert_state(peer);
                }
        }
@@ -2715,7 +2723,8 @@ static void fwserial_handle_unplug_req(struct work_struct *work)
        spin_lock_bh(&peer->lock);
        if (peer->state == FWPS_UNPLUG_RESPONDING) {
                if (rcode != RCODE_COMPLETE)
-                       fwtty_err(&peer->unit, "UNPLUG_RSP error (%d)", rcode);
+                       fwtty_err(&peer->unit, "UNPLUG_RSP error (%d)\n",
+                                 rcode);
                port = peer_revert_state(peer);
        }
 cleanup:
@@ -2750,19 +2759,19 @@ static int fwserial_parse_mgmt_write(struct fwtty_peer *peer,
                 * already removed from the bus -- and the removal was
                 * processed before we rec'd this transaction
                 */
-               fwtty_err(&peer->unit, "peer already removed");
+               fwtty_err(&peer->unit, "peer already removed\n");
                spin_unlock_bh(&peer->lock);
                return RCODE_ADDRESS_ERROR;
        }
 
        rcode = RCODE_COMPLETE;
 
-       fwtty_dbg(&peer->unit, "mgmt: hdr.code: %04hx", pkt->hdr.code);
+       fwtty_dbg(&peer->unit, "mgmt: hdr.code: %04hx\n", pkt->hdr.code);
 
        switch (be16_to_cpu(pkt->hdr.code) & FWSC_CODE_MASK) {
        case FWSC_VIRT_CABLE_PLUG:
                if (work_pending(&peer->work)) {
-                       fwtty_err(&peer->unit, "plug req: busy");
+                       fwtty_err(&peer->unit, "plug req: busy\n");
                        rcode = RCODE_CONFLICT_ERROR;
 
                } else {
@@ -2777,7 +2786,7 @@ static int fwserial_parse_mgmt_write(struct fwtty_peer *peer,
                        rcode = RCODE_CONFLICT_ERROR;
 
                } else if (be16_to_cpu(pkt->hdr.code) & FWSC_RSP_NACK) {
-                       fwtty_notice(&peer->unit, "NACK plug rsp");
+                       fwtty_notice(&peer->unit, "NACK plug rsp\n");
                        port = peer_revert_state(peer);
 
                } else {
@@ -2793,7 +2802,7 @@ static int fwserial_parse_mgmt_write(struct fwtty_peer *peer,
 
        case FWSC_VIRT_CABLE_UNPLUG:
                if (work_pending(&peer->work)) {
-                       fwtty_err(&peer->unit, "unplug req: busy");
+                       fwtty_err(&peer->unit, "unplug req: busy\n");
                        rcode = RCODE_CONFLICT_ERROR;
                } else {
                        PREPARE_WORK(&peer->work, fwserial_handle_unplug_req);
@@ -2806,14 +2815,14 @@ static int fwserial_parse_mgmt_write(struct fwtty_peer *peer,
                        rcode = RCODE_CONFLICT_ERROR;
                else {
                        if (be16_to_cpu(pkt->hdr.code) & FWSC_RSP_NACK)
-                               fwtty_notice(&peer->unit, "NACK unplug?");
+                               fwtty_notice(&peer->unit, "NACK unplug?\n");
                        port = peer_revert_state(peer);
                        reset = true;
                }
                break;
 
        default:
-               fwtty_err(&peer->unit, "unknown mgmt code %d",
+               fwtty_err(&peer->unit, "unknown mgmt code %d\n",
                          be16_to_cpu(pkt->hdr.code));
                rcode = RCODE_DATA_ERROR;
        }
@@ -2847,7 +2856,7 @@ static void fwserial_mgmt_handler(struct fw_card *card,
        rcu_read_lock();
        peer = __fwserial_peer_by_node_id(card, generation, source);
        if (!peer) {
-               fwtty_dbg(card, "peer(%d:%x) not found", generation, source);
+               fwtty_dbg(card, "peer(%d:%x) not found\n", generation, source);
                __dump_peer_list(card);
                rcode = RCODE_CONFLICT_ERROR;
 
@@ -2897,7 +2906,7 @@ static int __init fwserial_init(void)
 
        err = tty_register_driver(fwtty_driver);
        if (err) {
-               driver_err("register tty driver failed (%d)", err);
+               pr_err("register tty driver failed (%d)\n", err);
                goto put_tty;
        }
 
@@ -2922,7 +2931,7 @@ static int __init fwserial_init(void)
 
                err = tty_register_driver(fwloop_driver);
                if (err) {
-                       driver_err("register loop driver failed (%d)", err);
+                       pr_err("register loop driver failed (%d)\n", err);
                        goto put_loop;
                }
        }
@@ -2948,7 +2957,7 @@ static int __init fwserial_init(void)
        err = fw_core_add_address_handler(&fwserial_mgmt_addr_handler,
                                          &fwserial_mgmt_addr_region);
        if (err) {
-               driver_err("add management handler failed (%d)", err);
+               pr_err("add management handler failed (%d)\n", err);
                goto destroy_cache;
        }
 
@@ -2956,13 +2965,13 @@ static int __init fwserial_init(void)
                FW_UNIT_ADDRESS(fwserial_mgmt_addr_handler.offset);
        err = fw_core_add_descriptor(&fwserial_unit_directory);
        if (err) {
-               driver_err("add unit descriptor failed (%d)", err);
+               pr_err("add unit descriptor failed (%d)\n", err);
                goto remove_handler;
        }
 
        err = driver_register(&fwserial_driver.driver);
        if (err) {
-               driver_err("register fwserial driver failed (%d)", err);
+               pr_err("register fwserial driver failed (%d)\n", err);
                goto remove_descriptor;
        }
 
index 514f57173259d799271516d3744f95d6b0c43bbf..24635014a2acdad232634248794e41eff2450ffa 100644 (file)
@@ -356,8 +356,6 @@ static const char loop_dev_name[] = "fwloop";
 
 extern struct tty_driver *fwtty_driver;
 
-#define driver_err(s, v...)    pr_err(KBUILD_MODNAME ": " s, ##v)
-
 struct fwtty_port *fwtty_port_get(unsigned index);
 void fwtty_port_put(struct fwtty_port *port);
 
index 69059138de4ab334b96f44b63599be2ddceed7e9..dd8a3913f6b9e6e7ccbebc4cf05e723f55035dc6 100644 (file)
@@ -4,7 +4,7 @@
 
 menuconfig WIMAX_GDM72XX
        tristate "GCT GDM72xx WiMAX support"
-       depends on NET
+       depends on NET && (USB || MMC)
        help
          Support for the GCT GDM72xx WiMAX chip
 
@@ -19,7 +19,7 @@ config WIMAX_GDM72XX_K_MODE
        default n
 
 config WIMAX_GDM72XX_WIMAX2
-       bool "Enable WIMAX2 support"
+       bool "Enable WiMAX2 support"
        default n
 
 choice
@@ -27,18 +27,18 @@ choice
 
 config WIMAX_GDM72XX_USB
        bool "USB interface"
-       depends on USB
+       depends on (USB = y || USB = WIMAX_GDM72XX)
 
 config WIMAX_GDM72XX_SDIO
        bool "SDIO interface"
-       depends on MMC
+       depends on (MMC = y || MMC = WIMAX_GDM72XX)
 
 endchoice
 
 if WIMAX_GDM72XX_USB
 
 config WIMAX_GDM72XX_USB_PM
-       bool "Enable power managerment support"
+       bool "Enable power management support"
        depends on PM_RUNTIME
 
 endif # WIMAX_GDM72XX_USB
index 41efbeeb62f1fc284e192747029b931b6a9c9b06..dd854975db7d6f43a8fd1fc19f6d9db79e06c41e 100644 (file)
@@ -939,8 +939,7 @@ int register_wimax_device(struct phy_dev *phy_dev, struct device *pdev)
        struct net_device *dev;
        int ret;
 
-       dev = (struct net_device *)alloc_netdev(sizeof(*nic),
-                                               "wm%d", ether_setup);
+       dev = alloc_netdev(sizeof(*nic), "wm%d", ether_setup);
 
        if (dev == NULL) {
                pr_err("alloc_etherdev failed\n");
index d3bed21f4072a34312cdd67124c736eb29d457e9..f96dcec740aedf433d5266132f6072ff9b5b9461 100644 (file)
@@ -1,4 +1,5 @@
-/* drivers/misc/goldfish_audio.c
+/*
+ * drivers/misc/goldfish_audio.c
  *
  * Copyright (C) 2007 Google, Inc.
  * Copyright (C) 2012 Intel, Inc.
@@ -47,10 +48,11 @@ struct goldfish_audio {
        int read_supported;         /* true if we have audio input support */
 };
 
-/* We will allocate two read buffers and two write buffers.
-   Having two read buffers facilitate stereo -> mono conversion.
-   Having two write buffers facilitate interleaved IO.
-*/
+/*
+ *  We will allocate two read buffers and two write buffers.
+ *  Having two read buffers facilitates stereo -> mono conversion.
+ *  Having two write buffers facilitates interleaved IO.
+ */
 #define READ_BUFFER_SIZE        16384
 #define WRITE_BUFFER_SIZE       16384
 #define COMBINED_BUFFER_SIZE    ((2 * READ_BUFFER_SIZE) + \
@@ -59,8 +61,10 @@ struct goldfish_audio {
 #define AUDIO_READ(data, addr)         (readl(data->reg_base + addr))
 #define AUDIO_WRITE(data, addr, x)     (writel(x, data->reg_base + addr))
 
-/* temporary variable used between goldfish_audio_probe() and
-   goldfish_audio_open() */
+/*
+ *  temporary variable used between goldfish_audio_probe() and
+ *  goldfish_audio_open()
+ */
 static struct goldfish_audio *audio_data;
 
 enum {
@@ -161,8 +165,10 @@ static ssize_t goldfish_audio_write(struct file *fp, const char __user *buf,
                }
 
                spin_lock_irqsave(&data->lock, irq_flags);
-               /* clear the buffer empty flag, and signal the emulator
-                * to start writing the buffer */
+               /*
+                *  clear the buffer empty flag, and signal the emulator
+                *  to start writing the buffer
+                */
                if (kbuf == data->write_buffer1) {
                        data->buffer_status &= ~AUDIO_INT_WRITE_BUFFER_1_EMPTY;
                        AUDIO_WRITE(data, AUDIO_WRITE_BUFFER_1, copy);
@@ -225,8 +231,10 @@ static irqreturn_t goldfish_audio_interrupt(int irq, void *dev_id)
        /* read buffer status flags */
        status = AUDIO_READ(data, AUDIO_INT_STATUS);
        status &= AUDIO_INT_MASK;
-       /* if buffers are newly empty, wake up blocked
-          goldfish_audio_write() call */
+       /*
+        *  if buffers are newly empty, wake up blocked
+        *  goldfish_audio_write() call
+        */
        if (status) {
                data->buffer_status = status;
                wake_up(&data->wait);
index ab1f01952b485d7879e505e2f5747165a483a6d2..81e2ad4038feead8271e3bd12c4130209bb9aab1 100644 (file)
@@ -326,9 +326,10 @@ static int goldfish_nand_init_device(struct platform_device *pdev,
                        (mtd->writesize + mtd->oobsize) * mtd->writesize;
        do_div(mtd->size, mtd->writesize + mtd->oobsize);
        mtd->size *= mtd->writesize;
-       dev_dbg(&pdev->dev, 
+       dev_dbg(&pdev->dev,
                "goldfish nand dev%d: size %llx, page %d, extra %d, erase %d\n",
-                      id, mtd->size, mtd->writesize, mtd->oobsize, mtd->erasesize);
+                      id, mtd->size, mtd->writesize,
+                      mtd->oobsize, mtd->erasesize);
        spin_unlock_irqrestore(&nand->lock, irq_flags);
 
        mtd->priv = nand;
@@ -340,7 +341,7 @@ static int goldfish_nand_init_device(struct platform_device *pdev,
        result = goldfish_nand_cmd(mtd, NAND_CMD_GET_DEV_NAME, 0, name_len,
                                                                        name);
        if (result != name_len) {
-               dev_err(&pdev->dev, 
+               dev_err(&pdev->dev,
                        "goldfish_nand_init_device failed to get dev name %d != %d\n",
                               result, name_len);
                return -ENODEV;
@@ -391,7 +392,7 @@ static int goldfish_nand_probe(struct platform_device *pdev)
 
        version = readl(base + NAND_VERSION);
        if (version != NAND_VERSION_CURRENT) {
-               dev_err(&pdev->dev, 
+               dev_err(&pdev->dev,
                        "goldfish_nand_init: version mismatch, got %d, expected %d\n",
                                version, NAND_VERSION_CURRENT);
                return -ENODEV;
@@ -400,7 +401,7 @@ static int goldfish_nand_probe(struct platform_device *pdev)
        if (num_dev == 0)
                return -ENODEV;
 
-       nand = devm_kzalloc(&pdev->dev, sizeof(*nand) + 
+       nand = devm_kzalloc(&pdev->dev, sizeof(*nand) +
                                sizeof(struct mtd_info) * num_dev, GFP_KERNEL);
        if (nand == NULL)
                return -ENOMEM;
index 956c6c304b6e48021b78828a8b44fff0618a5f6a..ddfda71ab27aa791ba39dc7e6c43e2149104ab6c 100644 (file)
@@ -1,27 +1,30 @@
-/* drivers/mtd/devices/goldfish_nand_reg.h
-**
-** Copyright (C) 2007 Google, Inc.
-**
-** This software is licensed under the terms of the GNU General Public
-** License version 2, as published by the Free Software Foundation, and
-** may be copied, distributed, and modified under those terms.
-**
-** This program is distributed in the hope that it will be useful,
-** but WITHOUT ANY WARRANTY; without even the implied warranty of
-** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-** GNU General Public License for more details.
-**
-*/
+/*
+ * drivers/mtd/devices/goldfish_nand_reg.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
 
 #ifndef GOLDFISH_NAND_REG_H
 #define GOLDFISH_NAND_REG_H
 
 enum nand_cmd {
-       NAND_CMD_GET_DEV_NAME,  /* Write device name for NAND_DEV to NAND_DATA (vaddr) */
+       /* Write device name for NAND_DEV to NAND_DATA (vaddr) */
+       NAND_CMD_GET_DEV_NAME,
        NAND_CMD_READ,
        NAND_CMD_WRITE,
        NAND_CMD_ERASE,
-       NAND_CMD_BLOCK_BAD_GET, /* NAND_RESULT is 1 if block is bad, 0 if it is not */
+       /* NAND_RESULT is 1 if block is bad, 0 if it is not */
+       NAND_CMD_BLOCK_BAD_GET,
        NAND_CMD_BLOCK_BAD_SET,
        NAND_CMD_READ_WITH_PARAMS,
        NAND_CMD_WRITE_WITH_PARAMS,
index 504701940585e3f3f12cac3ced0cfa67858ccccd..3283e282953653eab17c098c8df6bcfe64f9d602 100644 (file)
@@ -326,7 +326,7 @@ static ssize_t ad7192_write_frequency(struct device *dev,
        unsigned long lval;
        int div, ret;
 
-       ret = strict_strtoul(buf, 10, &lval);
+       ret = kstrtoul(buf, 10, &lval);
        if (ret)
                return ret;
        if (lval == 0)
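
This is the first of several conversions below from the deprecated
strict_strtoul()/strict_strtol() to kstrtoul()/kstrtol(). The kstrto*()
helpers return a meaningful error (-EINVAL or -ERANGE) that a sysfs store
callback can pass straight back to user space. A minimal sketch with
hypothetical names:

	#include <linux/kernel.h>
	#include <linux/device.h>

	static ssize_t example_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf, size_t len)
	{
		unsigned long val;
		int ret;

		ret = kstrtoul(buf, 10, &val);	/* base 10 */
		if (ret)
			return ret;	/* -EINVAL or -ERANGE */

		/* ... apply val to the hardware ... */
		return len;
	}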
index 2fd6ee3c1902ad27bd63dbae1c8d711783256229..c19618bc37c4a004e21206e9ae55e5116ad4016e 100644 (file)
@@ -632,7 +632,7 @@ static ssize_t ad7280_write_channel_config(struct device *dev,
        long val;
        int ret;
 
-       ret = strict_strtol(buf, 10, &val);
+       ret = kstrtol(buf, 10, &val);
        if (ret)
                return ret;
 
index d088c662d5cd3a193c3b29bd4f4407aa55b76ba7..3fc79e5827500f97a8f843078fc3ff91cf6d9105 100644 (file)
@@ -21,6 +21,8 @@
 #include <linux/iio/sysfs.h>
 #include <linux/iio/events.h>
 
+#include "ad7291.h"
+
 /*
  * Simplified handling
  *
 #define AD7291_VOLTAGE                 0x01
 #define AD7291_T_SENSE                 0x02
 #define AD7291_T_AVERAGE               0x03
-#define AD7291_CH0_DATA_HIGH           0x04
-#define AD7291_CH0_DATA_LOW            0x05
-#define AD7291_CH0_HYST                        0x06
-#define AD7291_CH1_DATA_HIGH           0x07
-#define AD7291_CH1_DATA_LOW            0x08
-#define AD7291_CH1_HYST                        0x09
-#define AD7291_CH2_DATA_HIGH           0x0A
-#define AD7291_CH2_DATA_LOW            0x0B
-#define AD7291_CH2_HYST                        0x0C
-#define AD7291_CH3_DATA_HIGH           0x0D
-#define AD7291_CH3_DATA_LOW            0x0E
-#define AD7291_CH3_HYST                        0x0F
-#define AD7291_CH4_DATA_HIGH           0x10
-#define AD7291_CH4_DATA_LOW            0x11
-#define AD7291_CH4_HYST                        0x12
-#define AD7291_CH5_DATA_HIGH           0x13
-#define AD7291_CH5_DATA_LOW            0x14
-#define AD7291_CH5_HYST                        0x15
-#define AD7291_CH6_DATA_HIGH           0x16
-#define AD7291_CH6_DATA_LOW            0x17
-#define AD7291_CH6_HYST                        0x18
-#define AD7291_CH7_DATA_HIGH           0x19
-#define AD7291_CH7_DATA_LOW            0x1A
-#define AD7291_CH7_HYST                        0x2B
-#define AD7291_T_SENSE_HIGH            0x1C
-#define AD7291_T_SENSE_LOW             0x1D
-#define AD7291_T_SENSE_HYST            0x1E
+#define AD7291_DATA_HIGH(x)            ((x) * 3 + 0x4)
+#define AD7291_DATA_LOW(x)             ((x) * 3 + 0x5)
+#define AD7291_HYST(x)                 ((x) * 3 + 0x6)
 #define AD7291_VOLTAGE_ALERT_STATUS    0x1F
 #define AD7291_T_ALERT_STATUS          0x20
 
 struct ad7291_chip_info {
        struct i2c_client       *client;
        struct regulator        *reg;
-       u16                     int_vref_mv;
        u16                     command;
        u16                     c_mask; /* Active voltage channels for events */
        struct mutex            state_lock;
@@ -111,45 +88,22 @@ static int ad7291_i2c_read(struct ad7291_chip_info *chip, u8 reg, u16 *data)
        struct i2c_client *client = chip->client;
        int ret = 0;
 
-       ret = i2c_smbus_read_word_data(client, reg);
+       ret = i2c_smbus_read_word_swapped(client, reg);
        if (ret < 0) {
                dev_err(&client->dev, "I2C read error\n");
                return ret;
        }
 
-       *data = swab16((u16)ret);
+       *data = ret;
 
        return 0;
 }
 
 static int ad7291_i2c_write(struct ad7291_chip_info *chip, u8 reg, u16 data)
 {
-       return i2c_smbus_write_word_data(chip->client, reg, swab16(data));
-}
-
-static ssize_t ad7291_store_reset(struct device *dev,
-               struct device_attribute *attr,
-               const char *buf,
-               size_t len)
-{
-       struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-       struct ad7291_chip_info *chip = iio_priv(indio_dev);
-
-       return ad7291_i2c_write(chip, AD7291_COMMAND,
-                               chip->command | AD7291_RESET);
+       return i2c_smbus_write_word_swapped(chip->client, reg, data);
 }
 
-static IIO_DEVICE_ATTR(reset, S_IWUSR, NULL, ad7291_store_reset, 0);
-
-static struct attribute *ad7291_attributes[] = {
-       &iio_dev_attr_reset.dev_attr.attr,
-       NULL,
-};
-
-static const struct attribute_group ad7291_attribute_group = {
-       .attrs = ad7291_attributes,
-};
-
 static irqreturn_t ad7291_event_handler(int irq, void *private)
 {
        struct iio_dev *indio_dev = private;
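
The AD7291 returns registers as big-endian words while SMBus word transfers
are little-endian, so the driver previously byte-swapped every result by
hand. The i2c_smbus_read_word_swapped()/..._write_word_swapped() helpers
fold that swap into the transfer. A sketch of the equivalence:

	#include <linux/i2c.h>
	#include <linux/swab.h>

	static int read_be16_old(struct i2c_client *client, u8 reg)
	{
		int ret = i2c_smbus_read_word_data(client, reg);

		return ret < 0 ? ret : swab16((u16)ret);
	}

	static int read_be16_new(struct i2c_client *client, u8 reg)
	{
		/* identical result; the helper swaps internally */
		return i2c_smbus_read_word_swapped(client, reg);
	}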
@@ -255,31 +209,31 @@ static inline ssize_t ad7291_set_hyst(struct device *dev,
 static IIO_DEVICE_ATTR(in_temp0_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
                       ad7291_show_hyst, ad7291_set_hyst,
-                      AD7291_T_SENSE_HYST);
+                      AD7291_HYST(8));
 static IIO_DEVICE_ATTR(in_voltage0_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH0_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(0));
 static IIO_DEVICE_ATTR(in_voltage1_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH1_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(1));
 static IIO_DEVICE_ATTR(in_voltage2_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH2_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(2));
 static IIO_DEVICE_ATTR(in_voltage3_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH3_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(3));
 static IIO_DEVICE_ATTR(in_voltage4_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH4_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(4));
 static IIO_DEVICE_ATTR(in_voltage5_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH5_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(5));
 static IIO_DEVICE_ATTR(in_voltage6_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH6_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(6));
 static IIO_DEVICE_ATTR(in_voltage7_thresh_both_hyst_raw,
                       S_IRUGO | S_IWUSR,
-                      ad7291_show_hyst, ad7291_set_hyst, AD7291_CH7_HYST);
+                      ad7291_show_hyst, ad7291_set_hyst, AD7291_HYST(7));
 
 static struct attribute *ad7291_event_attributes[] = {
        &iio_dev_attr_in_temp0_thresh_both_hyst_raw.dev_attr.attr,
@@ -294,53 +248,45 @@ static struct attribute *ad7291_event_attributes[] = {
        NULL,
 };
 
-/* high / low */
-static u8 ad7291_limit_regs[9][2] = {
-       { AD7291_CH0_DATA_HIGH, AD7291_CH0_DATA_LOW },
-       { AD7291_CH1_DATA_HIGH, AD7291_CH1_DATA_LOW },
-       { AD7291_CH2_DATA_HIGH, AD7291_CH2_DATA_LOW },
-       { AD7291_CH3_DATA_HIGH, AD7291_CH3_DATA_LOW }, /* FIXME: ? */
-       { AD7291_CH4_DATA_HIGH, AD7291_CH4_DATA_LOW },
-       { AD7291_CH5_DATA_HIGH, AD7291_CH5_DATA_LOW },
-       { AD7291_CH6_DATA_HIGH, AD7291_CH6_DATA_LOW },
-       { AD7291_CH7_DATA_HIGH, AD7291_CH7_DATA_LOW },
-       /* temp */
-       { AD7291_T_SENSE_HIGH, AD7291_T_SENSE_LOW },
-};
+static unsigned int ad7291_threshold_reg(u64 event_code)
+{
+       unsigned int offset;
+
+       switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
+       case IIO_VOLTAGE:
+               offset = IIO_EVENT_CODE_EXTRACT_CHAN(event_code);
+               break;
+       case IIO_TEMP:
+               offset = 8;
+               break;
+       default:
+               return 0;
+       }
+
+       if (IIO_EVENT_CODE_EXTRACT_DIR(event_code) == IIO_EV_DIR_FALLING)
+               return AD7291_DATA_LOW(offset);
+       else
+               return AD7291_DATA_HIGH(offset);
+}
 
 static int ad7291_read_event_value(struct iio_dev *indio_dev,
                                   u64 event_code,
                                   int *val)
 {
        struct ad7291_chip_info *chip = iio_priv(indio_dev);
-
        int ret;
-       u8 reg;
        u16 uval;
-       s16 signval;
+
+       ret = ad7291_i2c_read(chip, ad7291_threshold_reg(event_code), &uval);
+       if (ret < 0)
+               return ret;
 
        switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
        case IIO_VOLTAGE:
-               reg = ad7291_limit_regs[IIO_EVENT_CODE_EXTRACT_CHAN(event_code)]
-                       [!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-                          IIO_EV_DIR_RISING)];
-
-               ret = ad7291_i2c_read(chip, reg, &uval);
-               if (ret < 0)
-                       return ret;
                *val = uval & AD7291_VALUE_MASK;
                return 0;
-
        case IIO_TEMP:
-               reg = ad7291_limit_regs[8]
-                       [!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-                          IIO_EV_DIR_RISING)];
-
-               ret = ad7291_i2c_read(chip, reg, &signval);
-               if (ret < 0)
-                       return ret;
-               signval = (s16)((signval & AD7291_VALUE_MASK) << 4) >> 4;
-               *val = signval;
+               *val = sign_extend32(uval, 11);
                return 0;
        default:
                return -EINVAL;
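
sign_extend32() replaces the open-coded shift pair used to widen the 12-bit
two's-complement temperature field. A sketch showing the two forms agree
(bit 11 is the sign bit; bits above it are discarded by the helper):

	#include <linux/bitops.h>

	static int widen_12bit(u16 raw)
	{
		s16 old_v = (s16)((raw & 0xfff) << 4) >> 4;
		int new_v = sign_extend32(raw, 11);

		return old_v == new_v ? new_v : -1;	/* always equal */
	}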
@@ -352,28 +298,21 @@ static int ad7291_write_event_value(struct iio_dev *indio_dev,
                                    int val)
 {
        struct ad7291_chip_info *chip = iio_priv(indio_dev);
-       u8 reg;
-       s16 signval;
 
        switch (IIO_EVENT_CODE_EXTRACT_CHAN_TYPE(event_code)) {
        case IIO_VOLTAGE:
                if (val > AD7291_VALUE_MASK || val < 0)
                        return -EINVAL;
-               reg = ad7291_limit_regs[IIO_EVENT_CODE_EXTRACT_CHAN(event_code)]
-                       [!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-                          IIO_EV_DIR_RISING)];
-               return ad7291_i2c_write(chip, reg, val);
+               break;
        case IIO_TEMP:
                if (val > 2047 || val < -2048)
                        return -EINVAL;
-               reg = ad7291_limit_regs[8]
-                       [!(IIO_EVENT_CODE_EXTRACT_DIR(event_code) ==
-                          IIO_EV_DIR_RISING)];
-               signval = val;
-               return ad7291_i2c_write(chip, reg, *(u16 *)&signval);
+               break;
        default:
                return -EINVAL;
-       };
+       }
+
+       return ad7291_i2c_write(chip, ad7291_threshold_reg(event_code), val);
 }
 
 static int ad7291_read_event_config(struct iio_dev *indio_dev,
@@ -456,9 +395,7 @@ static int ad7291_read_raw(struct iio_dev *indio_dev,
 {
        int ret;
        struct ad7291_chip_info *chip = iio_priv(indio_dev);
-       unsigned int scale_uv;
        u16 regval;
-       s16 signval;
 
        switch (mask) {
        case IIO_CHAN_INFO_RAW:
@@ -479,44 +416,47 @@ static int ad7291_read_raw(struct iio_dev *indio_dev,
                                return ret;
                        }
                        /* Read voltage */
-                       ret = i2c_smbus_read_word_data(chip->client,
+                       ret = i2c_smbus_read_word_swapped(chip->client,
                                                       AD7291_VOLTAGE);
                        if (ret < 0) {
                                mutex_unlock(&chip->state_lock);
                                return ret;
                        }
-                       *val = swab16((u16)ret) & AD7291_VALUE_MASK;
+                       *val = ret & AD7291_VALUE_MASK;
                        mutex_unlock(&chip->state_lock);
                        return IIO_VAL_INT;
                case IIO_TEMP:
                        /* Assumes tsense bit of command register always set */
-                       ret = i2c_smbus_read_word_data(chip->client,
+                       ret = i2c_smbus_read_word_swapped(chip->client,
                                                       AD7291_T_SENSE);
                        if (ret < 0)
                                return ret;
-                       signval = (s16)((swab16((u16)ret) &
-                               AD7291_VALUE_MASK) << 4) >> 4;
-                       *val = signval;
+                       *val = sign_extend32(ret, 11);
                        return IIO_VAL_INT;
                default:
                        return -EINVAL;
                }
        case IIO_CHAN_INFO_AVERAGE_RAW:
-               ret = i2c_smbus_read_word_data(chip->client,
+               ret = i2c_smbus_read_word_swapped(chip->client,
                                               AD7291_T_AVERAGE);
                        if (ret < 0)
                                return ret;
-                       signval = (s16)((swab16((u16)ret) &
-                               AD7291_VALUE_MASK) << 4) >> 4;
-                       *val = signval;
+                       *val = sign_extend32(ret, 11);
                        return IIO_VAL_INT;
        case IIO_CHAN_INFO_SCALE:
                switch (chan->type) {
                case IIO_VOLTAGE:
-                       scale_uv = (chip->int_vref_mv * 1000) >> AD7291_BITS;
-                       *val =  scale_uv / 1000;
-                       *val2 = (scale_uv % 1000) * 1000;
-                       return IIO_VAL_INT_PLUS_MICRO;
+                       if (chip->reg) {
+                               int vref;
+                               vref = regulator_get_voltage(chip->reg);
+                               if (vref < 0)
+                                       return vref;
+                               *val = vref / 1000;
+                       } else {
+                               *val = 2500;
+                       }
+                       *val2 = AD7291_BITS;
+                       return IIO_VAL_FRACTIONAL_LOG2;
                case IIO_TEMP:
                        /*
                         * One LSB of the ADC corresponds to 0.25 deg C.
@@ -571,7 +511,6 @@ static struct attribute_group ad7291_event_attribute_group = {
 };
 
 static const struct iio_info ad7291_info = {
-       .attrs = &ad7291_attribute_group,
        .read_raw = &ad7291_read_raw,
        .read_event_config = &ad7291_read_event_config,
        .write_event_config = &ad7291_write_event_config,
@@ -583,9 +522,10 @@ static const struct iio_info ad7291_info = {
 static int ad7291_probe(struct i2c_client *client,
                const struct i2c_device_id *id)
 {
+       struct ad7291_platform_data *pdata = client->dev.platform_data;
        struct ad7291_chip_info *chip;
        struct iio_dev *indio_dev;
-       int ret = 0, voltage_uv = 0;
+       int ret = 0;
 
        indio_dev = iio_device_alloc(sizeof(*chip));
        if (indio_dev == NULL) {
@@ -594,12 +534,14 @@ static int ad7291_probe(struct i2c_client *client,
        }
        chip = iio_priv(indio_dev);
 
-       chip->reg = regulator_get(&client->dev, "vcc");
-       if (!IS_ERR(chip->reg)) {
+       if (pdata && pdata->use_external_ref) {
+               chip->reg = regulator_get(&client->dev, "vref");
+               if (IS_ERR(chip->reg))
+                       goto error_free;
+
                ret = regulator_enable(chip->reg);
                if (ret)
                        goto error_put_reg;
-               voltage_uv = regulator_get_voltage(chip->reg);
        }
 
        mutex_init(&chip->state_lock);
@@ -612,12 +554,8 @@ static int ad7291_probe(struct i2c_client *client,
                        AD7291_T_SENSE_MASK | /* Tsense always enabled */
                        AD7291_ALERT_POLARITY; /* set irq polarity low level */
 
-       if (voltage_uv) {
-               chip->int_vref_mv = voltage_uv / 1000;
+       if (pdata && pdata->use_external_ref)
                chip->command |= AD7291_EXT_REF;
-       } else {
-               chip->int_vref_mv = 2500; /* Build-in ref */
-       }
 
        indio_dev->name = id->name;
        indio_dev->channels = ad7291_channels;
@@ -654,21 +592,18 @@ static int ad7291_probe(struct i2c_client *client,
        if (ret)
                goto error_unreg_irq;
 
-       dev_info(&client->dev, "%s ADC registered.\n",
-                        id->name);
-
        return 0;
 
 error_unreg_irq:
        if (client->irq)
                free_irq(client->irq, indio_dev);
 error_disable_reg:
-       if (!IS_ERR(chip->reg))
+       if (chip->reg)
                regulator_disable(chip->reg);
 error_put_reg:
-       if (!IS_ERR(chip->reg))
+       if (chip->reg)
                regulator_put(chip->reg);
-
+error_free:
        iio_device_free(indio_dev);
 error_ret:
        return ret;
@@ -684,7 +619,7 @@ static int ad7291_remove(struct i2c_client *client)
        if (client->irq)
                free_irq(client->irq, indio_dev);
 
-       if (!IS_ERR(chip->reg)) {
+       if (chip->reg) {
                regulator_disable(chip->reg);
                regulator_put(chip->reg);
        }
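
In the IIO_CHAN_INFO_SCALE hunk above, IIO_VAL_FRACTIONAL_LOG2 tells the
IIO core to report the scale as val / 2^val2, which avoids the manual
millivolt/microvolt split. A user-space sketch of the arithmetic, assuming
the 2.5 V internal reference and AD7291_BITS == 12 for this 12-bit
converter:

	#include <stdio.h>

	int main(void)
	{
		int val = 2500;	/* mV */
		int val2 = 12;	/* AD7291_BITS */

		/* scale in mV per LSB = val / 2^val2 */
		printf("%.6f mV/LSB\n", (double)val / (1 << val2));
		/* prints 0.610352 mV/LSB */
		return 0;
	}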
diff --git a/drivers/staging/iio/adc/ad7291.h b/drivers/staging/iio/adc/ad7291.h
new file mode 100644 (file)
index 0000000..bbd89fa
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef __IIO_AD7291_H__
+#define __IIO_AD7291_H__
+
+/**
+ * struct ad7291_platform_data - AD7291 platform data
+ * @use_external_ref: Whether to use an external or internal reference voltage
+ */
+struct ad7291_platform_data {
+       bool use_external_ref;
+};
+
+#endif
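
A sketch of how a board file might consume this new platform data; the bus
number, slave address, and device name are assumptions for illustration:

	#include <linux/i2c.h>
	#include <linux/init.h>
	#include "ad7291.h"

	static struct ad7291_platform_data ad7291_pdata = {
		.use_external_ref = true,	/* driver then requests "vref" */
	};

	static struct i2c_board_info board_i2c_devs[] __initdata = {
		{
			I2C_BOARD_INFO("ad7291", 0x20),	/* address assumed */
			.platform_data = &ad7291_pdata,
		},
	};

	/* registered early via:
	 * i2c_register_board_info(0, board_i2c_devs,
	 *			   ARRAY_SIZE(board_i2c_devs));
	 */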
index d104b43784241f7931003978f35ccdcb01e88277..72868ceda360cdd969cae70ddc6b1e50b2b33bb6 100644 (file)
@@ -125,9 +125,12 @@ static ssize_t ad7606_store_range(struct device *dev,
        struct iio_dev *indio_dev = dev_to_iio_dev(dev);
        struct ad7606_state *st = iio_priv(indio_dev);
        unsigned long lval;
+       int ret;
+
+       ret = kstrtoul(buf, 10, &lval);
+       if (ret)
+               return ret;
 
-       if (strict_strtoul(buf, 10, &lval))
-               return -EINVAL;
        if (!(lval == 5000 || lval == 10000)) {
                dev_err(dev, "range is not supported\n");
                return -EINVAL;
@@ -173,8 +176,9 @@ static ssize_t ad7606_store_oversampling_ratio(struct device *dev,
        unsigned long lval;
        int ret;
 
-       if (strict_strtoul(buf, 10, &lval))
-               return -EINVAL;
+       ret = kstrtoul(buf, 10, &lval);
+       if (ret)
+               return ret;
 
        ret = ad7606_oversampling_get_index(lval);
        if (ret < 0) {
index 58cfddea96379ad1dd12039f615f57958e39e768..8a48d18de788de8731d9d7201529049e469bd9f8 100644 (file)
@@ -112,8 +112,6 @@ static int ad7606_par_remove(struct platform_device *pdev)
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        release_mem_region(res->start, resource_size(res));
 
-       platform_set_drvdata(pdev, NULL);
-
        return 0;
 }
 
index 928477146c2fc17bd70e3434739fbf37cb6b496c..8470036a3378cba40d787f1e45fcb0410bd5d676 100644 (file)
@@ -175,9 +175,9 @@ static ssize_t ad7816_store_channel(struct device *dev,
        unsigned long data;
        int ret;
 
-       ret = strict_strtoul(buf, 10, &data);
+       ret = kstrtoul(buf, 10, &data);
        if (ret)
-               return -EINVAL;
+               return ret;
 
        if (data > AD7816_CS_MAX && data != AD7816_CS_MASK) {
                dev_err(&chip->spi_dev->dev, "Invalid channel id %lu for %s.\n",
@@ -290,7 +290,9 @@ static inline ssize_t ad7816_set_oti(struct device *dev,
        u8 data;
        int ret;
 
-       ret = strict_strtol(buf, 10, &value);
+       ret = kstrtol(buf, 10, &value);
+       if (ret)
+               return ret;
 
        if (chip->channel_id > AD7816_CS_MAX) {
                dev_err(dev, "Invalid oti channel id %d.\n", chip->channel_id);
index 8dc97b36e05a7ab5506da9b2be2ffaf0019b4e6f..2b2049c8bc6ba969896168f6b22b58ed7336afcc 100644 (file)
@@ -226,7 +226,7 @@ static ssize_t ad799x_write_frequency(struct device *dev,
        int ret, i;
        u8 t;
 
-       ret = strict_strtol(buf, 10, &val);
+       ret = kstrtol(buf, 10, &val);
        if (ret)
                return ret;
 
@@ -337,7 +337,7 @@ static ssize_t ad799x_write_channel_config(struct device *dev,
        long val;
        int ret;
 
-       ret = strict_strtol(buf, 10, &val);
+       ret = kstrtol(buf, 10, &val);
        if (ret)
                return ret;
 
index 2f2f7fdd06913b7bfbdafd7a077c83deb779b9f5..9a4bb0999b51244d3e621610aa626477f5f851f5 100644 (file)
@@ -215,7 +215,6 @@ static int lpc32xx_adc_remove(struct platform_device *pdev)
 
        iio_device_unregister(iodev);
        free_irq(irq, info);
-       platform_set_drvdata(pdev, NULL);
        clk_put(info->clk);
        iounmap(info->adc_base);
        iio_device_free(iodev);
index 163c638e4095f3b893668bb5d83e8e2110b76afb..d92c97a59d61f808abb5795c937dbd689f2d8d4d 100644 (file)
@@ -620,7 +620,7 @@ static irqreturn_t mxs_lradc_trigger_handler(int irq, void *p)
                ((LRADC_DELAY_TIMER_LOOP - 1) << LRADC_CH_NUM_SAMPLES_OFFSET);
        unsigned int i, j = 0;
 
-       for_each_set_bit(i, iio->active_scan_mask, iio->masklength) {
+       for_each_set_bit(i, iio->active_scan_mask, LRADC_MAX_TOTAL_CHANS) {
                lradc->buffer[j] = readl(lradc->base + LRADC_CH(j));
                writel(chan_value, lradc->base + LRADC_CH(j));
                lradc->buffer[j] &= LRADC_CH_VALUE_MASK;
@@ -774,8 +774,7 @@ static bool mxs_lradc_validate_scan_mask(struct iio_dev *iio,
                                        const unsigned long *mask)
 {
        struct mxs_lradc *lradc = iio_priv(iio);
-       const int len = iio->masklength;
-       const int map_chans = bitmap_weight(mask, len);
+       const int map_chans = bitmap_weight(mask, LRADC_MAX_TOTAL_CHANS);
        int rsvd_chans = 0;
        unsigned long rsvd_mask = 0;
 
@@ -792,7 +791,7 @@ static bool mxs_lradc_validate_scan_mask(struct iio_dev *iio,
                rsvd_chans++;
 
        /* Test for attempts to map channels with special mode of operation. */
-       if (bitmap_intersects(mask, &rsvd_mask, len))
+       if (bitmap_intersects(mask, &rsvd_mask, LRADC_MAX_TOTAL_CHANS))
                return false;
 
        /* Test for attempts to map more channels then available slots. */
@@ -968,6 +967,7 @@ static int mxs_lradc_probe(struct platform_device *pdev)
        iio->modes = INDIO_DIRECT_MODE;
        iio->channels = mxs_lradc_chan_spec;
        iio->num_channels = ARRAY_SIZE(mxs_lradc_chan_spec);
+       iio->masklength = LRADC_MAX_TOTAL_CHANS;
 
        ret = iio_triggered_buffer_setup(iio, &iio_pollfunc_store_time,
                                &mxs_lradc_trigger_handler,
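
With masklength pinned to the full channel count, the trigger handler's
for_each_set_bit() walk and the bitmap checks in validate_scan_mask() all
iterate over the same fixed-width mask. A sketch of the iteration pattern
(the channel count of 16 is assumed for illustration):

	#include <linux/bitops.h>

	#define EXAMPLE_MAX_CHANS 16	/* stands in for LRADC_MAX_TOTAL_CHANS */

	static unsigned int count_enabled(const unsigned long *mask)
	{
		unsigned int i, n = 0;

		for_each_set_bit(i, mask, EXAMPLE_MAX_CHANS)
			n++;	/* visits each enabled channel once */
		return n;
	}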
index f45da4266950aca0dcfae51aba7b49fea47c40a6..736219c30308f4c0c433f080a82ff1a00a09023b 100644 (file)
@@ -407,7 +407,6 @@ static int spear_adc_remove(struct platform_device *pdev)
        struct spear_adc_info *info = iio_priv(iodev);
 
        iio_device_unregister(iodev);
-       platform_set_drvdata(pdev, NULL);
        clk_disable_unprepare(info->clk);
        clk_put(info->clk);
        iounmap(info->adc_base_spear6xx);
@@ -416,11 +415,13 @@ static int spear_adc_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id spear_adc_dt_ids[] = {
        { .compatible = "st,spear600-adc", },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, spear_adc_dt_ids);
+#endif
 
 static struct platform_driver spear_adc_driver = {
        .probe          = spear_adc_probe,
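
Guarding the OF match table with CONFIG_OF avoids a "defined but unused"
warning on non-devicetree builds; the usual companion (assumed here, since
the corresponding driver-struct hunk is not shown) is of_match_ptr(), which
evaluates to NULL when CONFIG_OF is off:

	#include <linux/of.h>
	#include <linux/platform_device.h>

	static struct platform_driver example_driver = {
		.driver = {
			.name		= "spear-adc",	/* name assumed */
			.of_match_table	= of_match_ptr(spear_adc_dt_ids),
		},
	};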
index 836066287192d3ef5e8ead829928032be74e0971..b4333715536e2d82d3a8d90f12785416118abd68 100644 (file)
@@ -10,13 +10,6 @@ config ADIS16060
          Say yes here to build support for Analog Devices adis16060 wide bandwidth
          yaw rate gyroscope with SPI.
 
-config ADIS16130
-       tristate "Analog Devices ADIS16130 High Precision Angular Rate Sensor driver"
-       depends on SPI
-       help
-         Say yes here to build support for Analog Devices ADIS16130 High Precision
-         Angular Rate Sensor driver.
-
 config ADIS16260
        tristate "Analog Devices ADIS16260 Digital Gyroscope Sensor SPI driver"
        depends on SPI
index 98e650061a3a8cb5e4f7bab9029fc9a1a024c5ef..975f95b141da3576fd9982d977a7d65dba229110 100644 (file)
@@ -5,8 +5,5 @@
 adis16060-y             := adis16060_core.o
 obj-$(CONFIG_ADIS16060) += adis16060.o
 
-adis16130-y             := adis16130_core.o
-obj-$(CONFIG_ADIS16130) += adis16130.o
-
 adis16260-y             := adis16260_core.o
 obj-$(CONFIG_ADIS16260) += adis16260.o
index 1a051da62505c498289340aea1ee498589b0eab1..2fd18c60323debb72b5f19e2342ca3c7991671c6 100644 (file)
@@ -12,23 +12,6 @@ config IIO_PERIODIC_RTC_TRIGGER
          Provides support for using periodic capable real time
          clocks as IIO triggers.
 
-config IIO_GPIO_TRIGGER
-       tristate "GPIO trigger"
-       depends on GPIOLIB
-       help
-         Provides support for using GPIO pins as IIO triggers.
-
-config IIO_SYSFS_TRIGGER
-       tristate "SYSFS trigger"
-       depends on SYSFS
-       select IRQ_WORK
-       help
-         Provides support for using SYSFS entry as IIO triggers.
-         If unsure, say N (but it's safe to say "Y").
-
-         To compile this driver as a module, choose M here: the
-         module will be called iio-trig-sysfs.
-
 config IIO_BFIN_TMR_TRIGGER
        tristate "Blackfin TIMER trigger"
        depends on BLACKFIN
index b088b57da335b576b59f16213150ab3be229fdb1..238481b78e7230b1c02797845987a1d5b8d0736b 100644 (file)
@@ -3,6 +3,4 @@
 #
 
 obj-$(CONFIG_IIO_PERIODIC_RTC_TRIGGER) += iio-trig-periodic-rtc.o
-obj-$(CONFIG_IIO_GPIO_TRIGGER) += iio-trig-gpio.o
-obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o
 obj-$(CONFIG_IIO_BFIN_TMR_TRIGGER) += iio-trig-bfin-timer.o
diff --git a/drivers/staging/iio/trigger/iio-trig-gpio.c b/drivers/staging/iio/trigger/iio-trig-gpio.c
deleted file mode 100644 (file)
index 7c593d1..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Industrial I/O - gpio based trigger support
- *
- * Copyright (c) 2008 Jonathan Cameron
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
- *
- * Currently this is more of a functioning proof of concept than a full
- * fledged trigger driver.
- *
- * TODO:
- *
- * Add board config elements to allow specification of startup settings.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/gpio.h>
-#include <linux/slab.h>
-
-#include <linux/iio/iio.h>
-#include <linux/iio/trigger.h>
-
-static LIST_HEAD(iio_gpio_trigger_list);
-static DEFINE_MUTEX(iio_gpio_trigger_list_lock);
-
-struct iio_gpio_trigger_info {
-       struct mutex in_use;
-       unsigned int irq;
-};
-/*
- * Need to reference count these triggers and only enable gpio interrupts
- * as appropriate.
- */
-
-/* So what functionality do we want in here?... */
-/* set high / low as interrupt type? */
-
-static irqreturn_t iio_gpio_trigger_poll(int irq, void *private)
-{
-       /* Timestamp not currently provided */
-       iio_trigger_poll(private, 0);
-       return IRQ_HANDLED;
-}
-
-static const struct iio_trigger_ops iio_gpio_trigger_ops = {
-       .owner = THIS_MODULE,
-};
-
-static int iio_gpio_trigger_probe(struct platform_device *pdev)
-{
-       struct iio_gpio_trigger_info *trig_info;
-       struct iio_trigger *trig, *trig2;
-       unsigned long irqflags;
-       struct resource *irq_res;
-       int irq, ret = 0, irq_res_cnt = 0;
-
-       do {
-               irq_res = platform_get_resource(pdev,
-                               IORESOURCE_IRQ, irq_res_cnt);
-
-               if (irq_res == NULL) {
-                       if (irq_res_cnt == 0)
-                               dev_err(&pdev->dev, "No GPIO IRQs specified");
-                       break;
-               }
-               irqflags = (irq_res->flags & IRQF_TRIGGER_MASK) | IRQF_SHARED;
-
-               for (irq = irq_res->start; irq <= irq_res->end; irq++) {
-
-                       trig = iio_trigger_alloc("irqtrig%d", irq);
-                       if (!trig) {
-                               ret = -ENOMEM;
-                               goto error_free_completed_registrations;
-                       }
-
-                       trig_info = kzalloc(sizeof(*trig_info), GFP_KERNEL);
-                       if (!trig_info) {
-                               ret = -ENOMEM;
-                               goto error_put_trigger;
-                       }
-                       iio_trigger_set_drvdata(trig, trig_info);
-                       trig_info->irq = irq;
-                       trig->ops = &iio_gpio_trigger_ops;
-                       ret = request_irq(irq, iio_gpio_trigger_poll,
-                                         irqflags, trig->name, trig);
-                       if (ret) {
-                               dev_err(&pdev->dev,
-                                       "request IRQ-%d failed", irq);
-                               goto error_free_trig_info;
-                       }
-
-                       ret = iio_trigger_register(trig);
-                       if (ret)
-                               goto error_release_irq;
-
-                       list_add_tail(&trig->alloc_list,
-                                       &iio_gpio_trigger_list);
-               }
-
-               irq_res_cnt++;
-       } while (irq_res != NULL);
-
-
-       return 0;
-
-/* First clean up the partly allocated trigger */
-error_release_irq:
-       free_irq(irq, trig);
-error_free_trig_info:
-       kfree(trig_info);
-error_put_trigger:
-       iio_trigger_put(trig);
-error_free_completed_registrations:
-       /* The rest should have been added to the iio_gpio_trigger_list */
-       list_for_each_entry_safe(trig,
-                                trig2,
-                                &iio_gpio_trigger_list,
-                                alloc_list) {
-               trig_info = iio_trigger_get_drvdata(trig);
-               free_irq(gpio_to_irq(trig_info->irq), trig);
-               kfree(trig_info);
-               iio_trigger_unregister(trig);
-       }
-
-       return ret;
-}
-
-static int iio_gpio_trigger_remove(struct platform_device *pdev)
-{
-       struct iio_trigger *trig, *trig2;
-       struct iio_gpio_trigger_info *trig_info;
-
-       mutex_lock(&iio_gpio_trigger_list_lock);
-       list_for_each_entry_safe(trig,
-                                trig2,
-                                &iio_gpio_trigger_list,
-                                alloc_list) {
-               trig_info = iio_trigger_get_drvdata(trig);
-               iio_trigger_unregister(trig);
-               free_irq(trig_info->irq, trig);
-               kfree(trig_info);
-               iio_trigger_put(trig);
-       }
-       mutex_unlock(&iio_gpio_trigger_list_lock);
-
-       return 0;
-}
-
-static struct platform_driver iio_gpio_trigger_driver = {
-       .probe = iio_gpio_trigger_probe,
-       .remove = iio_gpio_trigger_remove,
-       .driver = {
-               .name = "iio_gpio_trigger",
-               .owner = THIS_MODULE,
-       },
-};
-
-module_platform_driver(iio_gpio_trigger_driver);
-
-MODULE_AUTHOR("Jonathan Cameron <jic23@kernel.org>");
-MODULE_DESCRIPTION("Example gpio trigger for the iio subsystem");
-MODULE_LICENSE("GPL v2");
index 64553058b67ee8c81229028e3e64bdc233100df2..8c433fc167c6932c10e63a966f99699d74b94d85 100644 (file)
@@ -144,7 +144,7 @@ int imx_drm_crtc_panel_format(struct drm_crtc *crtc, u32 encoder_type,
                u32 interface_pix_fmt)
 {
        return imx_drm_crtc_panel_format_pins(crtc, encoder_type,
-                                             interface_pix_fmt, 0, 0);
+                                             interface_pix_fmt, 2, 3);
 }
 EXPORT_SYMBOL_GPL(imx_drm_crtc_panel_format);
 
index 03892de9bd7e1103665c2144212722436b97b8be..a56797d88edcdc85056dfed0b83f31811ed24cf4 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/clk-provider.h>
 #include <linux/module.h>
 #include <linux/of_i2c.h>
-#include <linux/pinctrl/consumer.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
 #include <linux/spinlock.h>
@@ -610,15 +609,6 @@ static int imx_tve_probe(struct platform_device *pdev)
        }
 
        if (tve->mode == TVE_MODE_VGA) {
-               struct pinctrl *pinctrl;
-
-               pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-               if (IS_ERR(pinctrl)) {
-                       ret = PTR_ERR(pinctrl);
-                       dev_warn(&pdev->dev, "failed to setup pinctrl: %d", ret);
-                       return ret;
-               }
-
                ret = of_property_read_u32(np, "fsl,hsync-pin", &tve->hsync_pin);
                if (ret < 0) {
                        dev_err(&pdev->dev, "failed to get vsync pin\n");
@@ -638,11 +628,9 @@ static int imx_tve_probe(struct platform_device *pdev)
                return -ENOENT;
        }
 
-       base = devm_request_and_ioremap(&pdev->dev, res);
-       if (!base) {
-               dev_err(&pdev->dev, "failed to remap memory region\n");
-               return -ENOENT;
-       }
+       base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(base))
+               return PTR_ERR(base);
 
        tve_regmap_config.lock_arg = tve;
        tve->regmap = devm_regmap_init_mmio_clk(&pdev->dev, "tve", base,
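
This hunk moves from the deprecated devm_request_and_ioremap(), which signalled failure by returning NULL, to devm_ioremap_resource(), which folds the resource check, region request, remap and error message into one call and reports failure through ERR_PTR(). A standalone sketch of the resulting pattern:

	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);	/* copes with res == NULL */
	if (IS_ERR(base))
		return PTR_ERR(base);	/* -EINVAL, -EBUSY or -ENOMEM */
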
index 19d777e39d0b94480a5b1251ffd69deb299056ff..0b6806e2069c3705afe13da8315871c407757ffb 100644 (file)
@@ -603,7 +603,12 @@ int ipu_di_init_sync_panel(struct ipu_di *di, struct ipu_di_signal_cfg *sig)
 
                vsync_cnt = 3;
                if (di->id == 1)
-                       vsync_cnt = 6;
+                       /*
+                        * TODO: change only for TVEv2, parallel display
+                        * uses pin 2 / 3
+                        */
+                       if (!(sig->hsync_pin == 2 && sig->vsync_pin == 3))
+                               vsync_cnt = 6;
 
                if (sig->Hsync_pol) {
                        if (sig->hsync_pin == 2)
@@ -614,11 +619,11 @@ int ipu_di_init_sync_panel(struct ipu_di *di, struct ipu_di_signal_cfg *sig)
                                di_gen |= DI_GEN_POLARITY_7;
                }
                if (sig->Vsync_pol) {
-                       if (sig->hsync_pin == 3)
+                       if (sig->vsync_pin == 3)
                                di_gen |= DI_GEN_POLARITY_3;
-                       else if (sig->hsync_pin == 6)
+                       else if (sig->vsync_pin == 6)
                                di_gen |= DI_GEN_POLARITY_6;
-                       else if (sig->hsync_pin == 8)
+                       else if (sig->vsync_pin == 8)
                                di_gen |= DI_GEN_POLARITY_8;
                }
        }
index e7fba62c10e939548d4394837f47b58f801514af..cea9f14fff4a159ef58aed465fb6df57d15fcbda 100644 (file)
@@ -23,7 +23,6 @@
 #include <drm/drm_fb_helper.h>
 #include <drm/drm_crtc_helper.h>
 #include <linux/videodev2.h>
-#include <linux/pinctrl/consumer.h>
 
 #include "imx-drm.h"
 
@@ -206,20 +205,11 @@ static int imx_pd_probe(struct platform_device *pdev)
        struct imx_parallel_display *imxpd;
        int ret;
        const char *fmt;
-       struct pinctrl *pinctrl;
 
        imxpd = devm_kzalloc(&pdev->dev, sizeof(*imxpd), GFP_KERNEL);
        if (!imxpd)
                return -ENOMEM;
 
-       pinctrl = devm_pinctrl_get_select_default(&pdev->dev);
-       if (IS_ERR(pinctrl)) {
-               ret = PTR_ERR(pinctrl);
-               dev_warn(&pdev->dev, "pinctrl_get_select_default failed with %d",
-                               ret);
-               return ret;
-       }
-
        edidp = of_get_property(np, "edid", &imxpd->edid_len);
        if (edidp)
                imxpd->edid = kmemdup(edidp, imxpd->edid_len, GFP_KERNEL);
@@ -265,6 +255,7 @@ static const struct of_device_id imx_pd_dt_ids[] = {
        { .compatible = "fsl,imx-parallel-display", },
        { /* sentinel */ }
 };
+MODULE_DEVICE_TABLE(of, imx_pd_dt_ids);
 
 static struct platform_driver imx_pd_driver = {
        .probe          = imx_pd_probe,
index 231611dc0f7472f1d444b0ef89090a9d5e23306a..f5d41e0348ce52d98586311a738f05ec386fb59c 100644 (file)
@@ -19,13 +19,13 @@ int ENE_InitMedia(struct us_data *us)
        int     result;
        BYTE    MiscReg03 = 0;
 
-       printk(KERN_INFO "--- Init Media ---\n");
-       result = ENE_Read_BYTE(us, REG_CARD_STATUS, &MiscReg03);
+       dev_info(&us->pusb_dev->dev, "--- Init Media ---\n");
+       result = ene_read_byte(us, REG_CARD_STATUS, &MiscReg03);
        if (result != USB_STOR_XFER_GOOD) {
-               printk(KERN_ERR "Read register fail !!\n");
+               dev_err(&us->pusb_dev->dev, "Failed to read register\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
-       printk(KERN_INFO "MiscReg03 = %x\n", MiscReg03);
+       dev_info(&us->pusb_dev->dev, "MiscReg03 = %x\n", MiscReg03);
 
        if (MiscReg03 & 0x02) {
                if (!us->SM_Status.Ready && !us->MS_Status.Ready) {
@@ -39,9 +39,9 @@ int ENE_InitMedia(struct us_data *us)
 }
 
 /*
- * ENE_Read_BYTE() :
+ * ene_read_byte() :
  */
-int ENE_Read_BYTE(struct us_data *us, WORD index, void *buf)
+int ene_read_byte(struct us_data *us, WORD index, void *buf)
 {
        struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
        int result;
@@ -67,11 +67,13 @@ int ENE_SMInit(struct us_data *us)
        int     result;
        BYTE    buf[0x200];
 
-       printk(KERN_INFO "transport --- ENE_SMInit\n");
+       dev_dbg(&us->pusb_dev->dev, "transport --- ENE_SMInit\n");
 
        result = ENE_LoadBinCode(us, SM_INIT_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk(KERN_INFO "Load SM Init Code Fail !!\n");
+               dev_info(&us->pusb_dev->dev,
+                        "Failed to load SmartMedia init code: result = %x\n",
+                        result);
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -84,26 +86,33 @@ int ENE_SMInit(struct us_data *us)
 
        result = ENE_SendScsiCmd(us, FDIR_READ, &buf, 0);
        if (result != USB_STOR_XFER_GOOD) {
-               printk(KERN_ERR
-                      "Execution SM Init Code Fail !! result = %x\n", result);
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to execute SmartMedia init code: result = %x\n",
+                       result);
                return USB_STOR_TRANSPORT_ERROR;
        }
 
-       us->SM_Status = *(PSM_STATUS)&buf[0];
+       us->SM_Status = *(struct keucr_sm_status *)&buf[0];
 
        us->SM_DeviceID = buf[1];
        us->SM_CardID   = buf[2];
 
        if (us->SM_Status.Insert && us->SM_Status.Ready) {
-               printk(KERN_INFO "Insert     = %x\n", us->SM_Status.Insert);
-               printk(KERN_INFO "Ready      = %x\n", us->SM_Status.Ready);
-               printk(KERN_INFO "WtP        = %x\n", us->SM_Status.WtP);
-               printk(KERN_INFO "DeviceID   = %x\n", us->SM_DeviceID);
-               printk(KERN_INFO "CardID     = %x\n", us->SM_CardID);
+               dev_info(&us->pusb_dev->dev, "Insert     = %x\n",
+                                            us->SM_Status.Insert);
+               dev_info(&us->pusb_dev->dev, "Ready      = %x\n",
+                                            us->SM_Status.Ready);
+               dev_info(&us->pusb_dev->dev, "WtP        = %x\n",
+                                            us->SM_Status.WtP);
+               dev_info(&us->pusb_dev->dev, "DeviceID   = %x\n",
+                                            us->SM_DeviceID);
+               dev_info(&us->pusb_dev->dev, "CardID     = %x\n",
+                                            us->SM_CardID);
                MediaChange = 1;
                Check_D_MediaFmt(us);
        } else {
-               printk(KERN_ERR "SM Card Not Ready --- %x\n", buf[0]);
+               dev_err(&us->pusb_dev->dev,
+                       "SmartMedia Card Not Ready --- %x\n", buf[0]);
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -120,7 +129,7 @@ int ENE_LoadBinCode(struct us_data *us, BYTE flag)
        /* void *buf; */
        PBYTE buf;
 
-       /* printk(KERN_INFO "transport --- ENE_LoadBinCode\n"); */
+       /* dev_info(&us->pusb_dev->dev, "transport --- ENE_LoadBinCode\n"); */
        if (us->BIN_FLAG == flag)
                return USB_STOR_TRANSPORT_GOOD;
 
@@ -130,11 +139,11 @@ int ENE_LoadBinCode(struct us_data *us, BYTE flag)
        switch (flag) {
        /* For SS */
        case SM_INIT_PATTERN:
-               printk(KERN_INFO "SM_INIT_PATTERN\n");
+               dev_dbg(&us->pusb_dev->dev, "SM_INIT_PATTERN\n");
                memcpy(buf, SM_Init, 0x800);
                break;
        case SM_RW_PATTERN:
-               printk(KERN_INFO "SM_RW_PATTERN\n");
+               dev_dbg(&us->pusb_dev->dev, "SM_RW_PATTERN\n");
                memcpy(buf, SM_Rdwr, 0x800);
                break;
        }
@@ -165,12 +174,13 @@ int ENE_SendScsiCmd(struct us_data *us, BYTE fDir, void *buf, int use_sg)
                     cswlen = 0, partial = 0;
        unsigned int residue;
 
-       /* printk(KERN_INFO "transport --- ENE_SendScsiCmd\n"); */
+       /* dev_dbg(&us->pusb_dev->dev, "transport --- ENE_SendScsiCmd\n"); */
        /* send cmd to out endpoint */
        result = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe,
                                            bcb, US_BULK_CB_WRAP_LEN, NULL);
        if (result != USB_STOR_XFER_GOOD) {
-               printk(KERN_ERR "send cmd to out endpoint fail ---\n");
+               dev_err(&us->pusb_dev->dev,
+                               "send cmd to out endpoint fail ---\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -189,7 +199,7 @@ int ENE_SendScsiCmd(struct us_data *us, BYTE fDir, void *buf, int use_sg)
                        result = usb_stor_bulk_transfer_sg(us, pipe, buf,
                                                transfer_length, 0, &partial);
                if (result != USB_STOR_XFER_GOOD) {
-                       printk(KERN_ERR "data transfer fail ---\n");
+                       dev_err(&us->pusb_dev->dev, "data transfer fail ---\n");
                        return USB_STOR_TRANSPORT_ERROR;
                }
        }
@@ -199,14 +209,16 @@ int ENE_SendScsiCmd(struct us_data *us, BYTE fDir, void *buf, int use_sg)
                                                US_BULK_CS_WRAP_LEN, &cswlen);
 
        if (result == USB_STOR_XFER_SHORT && cswlen == 0) {
-               printk(KERN_WARNING "Received 0-length CSW; retrying...\n");
+               dev_warn(&us->pusb_dev->dev,
+                               "Received 0-length CSW; retrying...\n");
                result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
                                        bcs, US_BULK_CS_WRAP_LEN, &cswlen);
        }
 
        if (result == USB_STOR_XFER_STALLED) {
                /* get the status again */
-               printk(KERN_WARNING "Attempting to get CSW (2nd try)...\n");
+               dev_warn(&us->pusb_dev->dev,
+                               "Attempting to get CSW (2nd try)...\n");
                result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
                                                bcs, US_BULK_CS_WRAP_LEN, NULL);
        }
@@ -243,7 +255,7 @@ int ENE_Read_Data(struct us_data *us, void *buf, unsigned int length)
        struct bulk_cs_wrap *bcs = (struct bulk_cs_wrap *) us->iobuf;
        int result;
 
-       /* printk(KERN_INFO "transport --- ENE_Read_Data\n"); */
+       /* dev_dbg(&us->pusb_dev->dev, "transport --- ENE_Read_Data\n"); */
        /* set up the command wrapper */
        memset(bcb, 0, sizeof(struct bulk_cb_wrap));
        bcb->Signature = cpu_to_le32(US_BULK_CB_SIGN);
@@ -318,55 +330,3 @@ int ENE_Write_Data(struct us_data *us, void *buf, unsigned int length)
        return USB_STOR_TRANSPORT_GOOD;
 }
 
-/*
- * usb_stor_print_cmd():
- */
-void usb_stor_print_cmd(struct scsi_cmnd *srb)
-{
-       PBYTE   Cdb = srb->cmnd;
-       DWORD   cmd = Cdb[0];
-       DWORD   bn  =   ((Cdb[2] << 24) & 0xff000000) |
-                       ((Cdb[3] << 16) & 0x00ff0000) |
-                       ((Cdb[4] << 8) & 0x0000ff00) |
-                       ((Cdb[5] << 0) & 0x000000ff);
-       WORD    blen = ((Cdb[7] << 8) & 0xff00) | ((Cdb[8] << 0) & 0x00ff);
-
-       switch (cmd) {
-       case TEST_UNIT_READY:
-               /* printk(KERN_INFO
-                        "scsi cmd %X --- SCSIOP_TEST_UNIT_READY\n", cmd); */
-               break;
-       case INQUIRY:
-               printk(KERN_INFO "scsi cmd %X --- SCSIOP_INQUIRY\n", cmd);
-               break;
-       case MODE_SENSE:
-               printk(KERN_INFO "scsi cmd %X --- SCSIOP_MODE_SENSE\n", cmd);
-               break;
-       case START_STOP:
-               printk(KERN_INFO "scsi cmd %X --- SCSIOP_START_STOP\n", cmd);
-               break;
-       case READ_CAPACITY:
-               printk(KERN_INFO "scsi cmd %X --- SCSIOP_READ_CAPACITY\n", cmd);
-               break;
-       case READ_10:
-               /*  printk(KERN_INFO
-                          "scsi cmd %X --- SCSIOP_READ,bn = %X, blen = %X\n"
-                          ,cmd, bn, blen); */
-               break;
-       case WRITE_10:
-               /* printk(KERN_INFO
-                         "scsi cmd %X --- SCSIOP_WRITE,
-                         bn = %X, blen = %X\n" , cmd, bn, blen); */
-               break;
-       case ALLOW_MEDIUM_REMOVAL:
-               printk(KERN_INFO
-                       "scsi cmd %X --- SCSIOP_ALLOW_MEDIUM_REMOVAL\n", cmd);
-               break;
-       default:
-               printk(KERN_INFO "scsi cmd %X --- Other cmd\n", cmd);
-               break;
-       }
-       bn = 0;
-       blen = 0;
-}
-
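
The bulk of the keucr changes above are a mechanical conversion from bare printk(KERN_*) to the dev_*() helpers, which prefix each message with the driver and device name and tie logging to a struct device. With &us->pusb_dev->dev as the device, as this driver uses, the mapping looks like:

	/* before: no hint of which device produced the message */
	printk(KERN_ERR "Read register fail !!\n");

	/* after: attributed to the USB device */
	dev_err(&us->pusb_dev->dev, "Failed to read register\n");

	/* chatty trace lines become dev_dbg(), which compiles away
	 * unless DEBUG or dynamic debug is enabled */
	dev_dbg(&us->pusb_dev->dev, "transport --- ENE_SMInit\n");
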
index 48e1005349dac8e40a0066f942c5738c6ff8466c..afb00d84679dfa8d12d51cd8f8804696752c3185 100644 (file)
@@ -73,7 +73,8 @@ static int slave_configure(struct scsi_device *sdev)
                if (us->fflags & US_FL_CAPACITY_HEURISTICS)
                        sdev->guess_capacity = 1;
                if (sdev->scsi_level > SCSI_2)
-                       sdev->sdev_target->scsi_level = sdev->scsi_level = SCSI_2;
+                       sdev->sdev_target->scsi_level = sdev->scsi_level
+                                                               = SCSI_2;
                sdev->retry_hwerror = 1;
                sdev->allow_restart = 1;
                sdev->last_sector_bug = 1;
@@ -144,7 +145,7 @@ static int command_abort(struct scsi_cmnd *srb)
        scsi_lock(us_to_host(us));
        if (us->srb != srb) {
                scsi_unlock(us_to_host(us));
-               printk("-- nothing to abort\n");
+               dev_info(&us->pusb_dev->dev, "-- nothing to abort\n");
                return FAILED;
        }
 
@@ -319,8 +320,11 @@ static ssize_t store_max_sectors(struct device *dev,
        return -EINVAL;
 }
 
-static DEVICE_ATTR(max_sectors, S_IRUGO | S_IWUSR, show_max_sectors, store_max_sectors);
-static struct device_attribute *sysfs_device_attr_list[] = {&dev_attr_max_sectors, NULL, };
+static DEVICE_ATTR(max_sectors, S_IRUGO | S_IWUSR, show_max_sectors,
+                                                       store_max_sectors);
+static struct device_attribute *sysfs_device_attr_list[] = {
+       &dev_attr_max_sectors, NULL,
+};
 
 /* this defines our host template, with which we'll allocate hosts */
 
@@ -393,8 +397,9 @@ unsigned char usb_stor_sense_invalidCDB[18] = {
 /*
  * usb_stor_access_xfer_buf()
  */
-unsigned int usb_stor_access_xfer_buf(struct us_data *us, unsigned char *buffer,
-       unsigned int buflen, struct scsi_cmnd *srb, struct scatterlist **sgptr,
+unsigned int usb_stor_access_xfer_buf(struct us_data *us,
+       unsigned char *buffer, unsigned int buflen,
+       struct scsi_cmnd *srb, struct scatterlist **sgptr,
        unsigned int *offset, enum xfer_buf_dir dir)
 {
        unsigned int cnt;
@@ -424,7 +429,7 @@ unsigned int usb_stor_access_xfer_buf(struct us_data *us, unsigned char *buffer,
 
                while (sglen > 0) {
                        unsigned int plen = min(sglen,
-                                               (unsigned int)PAGE_SIZE - poff);
+                                       (unsigned int)PAGE_SIZE - poff);
                        unsigned char *ptr = kmap(page);
 
                        if (dir == TO_XFER_BUF)
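
The loop being re-wrapped here walks a scatterlist entry page by page, clamping each chunk so it never crosses a page boundary. A sketch of the copy-in direction only, reusing the function's variable names (the real code also handles FROM_XFER_BUF and steps across s/g entries):

	while (sglen > 0) {
		/* never copy past the end of the current page */
		unsigned int plen = min(sglen,
					(unsigned int)PAGE_SIZE - poff);
		unsigned char *ptr = kmap(page);

		memcpy(ptr + poff, buffer + cnt, plen);	/* dir == TO_XFER_BUF */
		kunmap(page);

		page++;			/* later pages of this entry...   */
		poff = 0;		/* ...start at offset zero        */
		sglen -= plen;
		cnt += plen;
	}
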
index 24a636a4aa1c0f26c4a32f05e2b83f8368c32029..1538d7bd600f10d7202d9b2bd9efe8441f526672 100644 (file)
@@ -168,7 +168,7 @@ SmartMedia Model & Attribute
 /***************************************************************************
 Struct Definition
 ***************************************************************************/
-struct SSFDCTYPE {
+struct keucr_media_info {
        BYTE Model;
        BYTE Attribute;
        BYTE MaxZones;
@@ -177,30 +177,14 @@ struct SSFDCTYPE {
        WORD MaxLogBlocks;
 };
 
-typedef struct SSFDCTYPE_T {
-       BYTE Model;
-       BYTE Attribute;
-       BYTE MaxZones;
-       BYTE MaxSectors;
-       WORD MaxBlocks;
-       WORD MaxLogBlocks;
-} *SSFDCTYPE_T;
-
-struct ADDRESS {
+struct keucr_media_address {
        BYTE Zone;      /* Zone Number */
        BYTE Sector;    /* Sector(512byte) Number on Block */
        WORD PhyBlock;  /* Physical Block Number on Zone */
        WORD LogBlock;  /* Logical Block Number of Zone */
 };
 
-typedef struct ADDRESS_T {
-       BYTE Zone;      /* Zone Number */
-       BYTE Sector;    /* Sector(512byte) Number on Block */
-       WORD PhyBlock;  /* Physical Block Number on Zone */
-       WORD LogBlock;  /* Logical Block Number of Zone */
-} *ADDRESS_T;
-
-struct CIS_AREA {
+struct keucr_media_area {
        BYTE Sector;    /* Sector(512byte) Number on Block */
        WORD PhyBlock;  /* Physical Block Number on Zone 0 */
 };
@@ -215,9 +199,9 @@ extern WORD ReadBlock;
 extern WORD    WriteBlock;
 extern DWORD   MediaChange;
 
-extern struct SSFDCTYPE  Ssfdc;
-extern struct ADDRESS    Media;
-extern struct CIS_AREA   CisArea;
+extern struct keucr_media_info    Ssfdc;
+extern struct keucr_media_address Media;
+extern struct keucr_media_area    CisArea;
 
 /*
  * SMILMain.c
index cc49038e55d62011856dd3dc74c8a7c9c7b3509c..2786808fde9f017ff680084c8bd70a9dd6ab06cc 100644 (file)
 #include "smcommon.h"
 #include "smil.h"
 
-int         Check_D_LogCHS              (WORD *,BYTE *,BYTE *);
-void        Initialize_D_Media          (void);
-void        PowerOff_D_Media            (void);
-int         Check_D_MediaPower          (void);
-int         Check_D_MediaExist          (void);
-int         Check_D_MediaWP             (void);
-int         Check_D_MediaFmt            (struct us_data *);
-int         Check_D_MediaFmtForEraseAll (struct us_data *);
-int         Conv_D_MediaAddr            (struct us_data *, DWORD);
-int         Inc_D_MediaAddr             (struct us_data *);
-int         Check_D_FirstSect           (void);
-int         Check_D_LastSect            (void);
-int         Media_D_ReadOneSect         (struct us_data *, WORD, BYTE *);
-int         Media_D_WriteOneSect        (struct us_data *, WORD, BYTE *);
-int         Media_D_CopyBlockHead       (struct us_data *);
-int         Media_D_CopyBlockTail       (struct us_data *);
-int         Media_D_EraseOneBlock       (void);
-int         Media_D_EraseAllBlock       (void);
-
-int  Copy_D_BlockAll             (struct us_data *, DWORD);
-int  Copy_D_BlockHead            (struct us_data *);
-int  Copy_D_BlockTail            (struct us_data *);
-int  Reassign_D_BlockHead        (struct us_data *);
-
-int  Assign_D_WriteBlock         (void);
-int  Release_D_ReadBlock         (struct us_data *);
-int  Release_D_WriteBlock        (struct us_data *);
-int  Release_D_CopySector        (struct us_data *);
-
-int  Copy_D_PhyOneSect           (struct us_data *);
-int  Read_D_PhyOneSect           (struct us_data *, WORD, BYTE *);
-int  Write_D_PhyOneSect          (struct us_data *, WORD, BYTE *);
-int  Erase_D_PhyOneBlock         (struct us_data *);
-
-int  Set_D_PhyFmtValue           (struct us_data *);
-int  Search_D_CIS                (struct us_data *);
-int  Make_D_LogTable             (struct us_data *);
-void Check_D_BlockIsFull         (void);
-
-int  MarkFail_D_PhyOneBlock      (struct us_data *);
+int         Check_D_LogCHS(WORD *, BYTE *, BYTE *);
+void        Initialize_D_Media(void);
+void        PowerOff_D_Media(void);
+int         Check_D_MediaPower(void);
+int         Check_D_MediaExist(void);
+int         Check_D_MediaWP(void);
+int         Check_D_MediaFmt(struct us_data *);
+int         Check_D_MediaFmtForEraseAll(struct us_data *);
+int         Conv_D_MediaAddr(struct us_data *, DWORD);
+int         Inc_D_MediaAddr(struct us_data *);
+int         Check_D_FirstSect(void);
+int         Check_D_LastSect(void);
+int         Media_D_ReadOneSect(struct us_data *, WORD, BYTE *);
+int         Media_D_WriteOneSect(struct us_data *, WORD, BYTE *);
+int         Media_D_CopyBlockHead(struct us_data *);
+int         Media_D_CopyBlockTail(struct us_data *);
+int         Media_D_EraseOneBlock(void);
+int         Media_D_EraseAllBlock(void);
+
+int  Copy_D_BlockAll(struct us_data *, DWORD);
+int  Copy_D_BlockHead(struct us_data *);
+int  Copy_D_BlockTail(struct us_data *);
+int  Reassign_D_BlockHead(struct us_data *);
+
+int  Assign_D_WriteBlock(void);
+int  Release_D_ReadBlock(struct us_data *);
+int  Release_D_WriteBlock(struct us_data *);
+int  Release_D_CopySector(struct us_data *);
+
+int  Copy_D_PhyOneSect(struct us_data *);
+int  Read_D_PhyOneSect(struct us_data *, WORD, BYTE *);
+int  Write_D_PhyOneSect(struct us_data *, WORD, BYTE *);
+int  Erase_D_PhyOneBlock(struct us_data *);
+
+int  Set_D_PhyFmtValue(struct us_data *);
+int  Search_D_CIS(struct us_data *);
+int  Make_D_LogTable(struct us_data *);
+void Check_D_BlockIsFull(void);
+
+int  MarkFail_D_PhyOneBlock(struct us_data *);
 
 DWORD ErrXDCode;
 DWORD ErrCode;
-//BYTE  SectBuf[SECTSIZE];
 static BYTE  WorkBuf[SECTSIZE];
 static BYTE  Redundant[REDTSIZE];
 static BYTE  WorkRedund[REDTSIZE];
-//WORD  Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK];
-static WORD  *Log2Phy[MAX_ZONENUM];                 // 128 x 1000,   Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK];
-static BYTE  Assign[MAX_ZONENUM][MAX_BLOCKNUM/8];
+/* 128 x 1000, Log2Phy[MAX_ZONENUM][MAX_LOGBLOCK]; */
+static WORD  *Log2Phy[MAX_ZONENUM];
+static BYTE  Assign[MAX_ZONENUM][MAX_BLOCKNUM / 8];
 static WORD  AssignStart[MAX_ZONENUM];
 WORD  ReadBlock;
 WORD  WriteBlock;
 DWORD MediaChange;
 static DWORD SectCopyMode;
 
-//BIT Control Macro
-static BYTE BitData[] = { 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80 } ;
-#define Set_D_Bit(a,b)    (a[(BYTE)((b)/8)]|= BitData[(b)%8])
-#define Clr_D_Bit(a,b)    (a[(BYTE)((b)/8)]&=~BitData[(b)%8])
-#define Chk_D_Bit(a,b)    (a[(BYTE)((b)/8)] & BitData[(b)%8])
+/* BIT Control Macro */
+static BYTE BitData[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+#define Set_D_Bit(a, b)    (a[(BYTE)((b) / 8)] |= BitData[(b) % 8])
+#define Clr_D_Bit(a, b)    (a[(BYTE)((b) / 8)] &= ~BitData[(b) % 8])
+#define Chk_D_Bit(a, b)    (a[(BYTE)((b) / 8)] & BitData[(b) % 8])
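
These macros treat their first argument as a packed bit array: bit b lives in byte b / 8 and is selected within it by the mask BitData[b % 8]. Worked through for physical block 42 (42 / 8 == 5, 42 % 8 == 2, BitData[2] == 0x04), with zone standing in for a zone index:

	Set_D_Bit(Assign[zone], 42);	/* Assign[zone][5] |= 0x04;  mark used */
	Clr_D_Bit(Assign[zone], 42);	/* Assign[zone][5] &= ~0x04; mark free */
	if (Chk_D_Bit(Assign[zone], 42))	/* Assign[zone][5] & 0x04 */
		/* block 42 of this zone is currently assigned */;
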
 
-//extern PBYTE    SMHostAddr;
 BYTE     IsSSFDCCompliance;
 BYTE     IsXDCompliance;
 
 
-//
-////Power Control & Media Exist Check Function
-////----- Init_D_SmartMedia() --------------------------------------------
-//int Init_D_SmartMedia(void)
-//{
-//    int     i;
-//
-//    EMCR_Print("Init_D_SmartMedia start\n");
-//    for (i=0; i<MAX_ZONENUM; i++)
-//    {
-//        if (Log2Phy[i]!=NULL)
-//        {
-//            EMCR_Print("ExFreePool Zone = %x, Addr = %x\n", i, Log2Phy[i]);
-//            ExFreePool(Log2Phy[i]);
-//            Log2Phy[i] = NULL;
-//        }
-//    }
-//
-//    Initialize_D_Media();
-//    return(NO_ERROR);
-//}
-
-//----- SM_FreeMem() -------------------------------------------------
+/* ----- SM_FreeMem() ------------------------------------------------- */
 int SM_FreeMem(void)
 {
        int     i;
 
        pr_info("SM_FreeMem start\n");
-       for (i=0; i<MAX_ZONENUM; i++)
-       {
-               if (Log2Phy[i]!=NULL)
-               {
+       for (i = 0; i < MAX_ZONENUM; i++) {
+               if (Log2Phy[i] != NULL) {
                        pr_info("Free Zone = %x, Addr = %p\n", i, Log2Phy[i]);
                        kfree(Log2Phy[i]);
                        Log2Phy[i] = NULL;
                }
        }
-       return(NO_ERROR);
+       return NO_ERROR;
 }
 
-////----- Pwoff_D_SmartMedia() -------------------------------------------
-//int Pwoff_D_SmartMedia(void)
-//{
-//    PowerOff_D_Media();
-//    return(NO_ERROR);
-//}
-//
-////----- Check_D_SmartMedia() -------------------------------------------
-//int Check_D_SmartMedia(void)
-//{
-//    if (Check_D_MediaExist())
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-//
-////----- Check_D_Parameter() --------------------------------------------
-//int Check_D_Parameter(PFDO_DEVICE_EXTENSION fdoExt,WORD *pcyl,BYTE *phead,BYTE *psect)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmt(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_LogCHS(pcyl,phead,psect))
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-
-//SmartMedia Read/Write/Erase Function
-//----- Media_D_ReadSector() -------------------------------------------
-int Media_D_ReadSector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
+/* SmartMedia Read/Write/Erase Function */
+/* ----- Media_D_ReadSector() ------------------------------------------- */
+int Media_D_ReadSector(struct us_data *us, DWORD start, WORD count, BYTE *buf)
 {
        WORD len, bn;
 
-       //if (Check_D_MediaPower())        ; in 6250 don't care
-       //    return(ErrCode);
-       //if (Check_D_MediaFmt(fdoExt))    ;
-       //    return(ErrCode);
        if (Conv_D_MediaAddr(us, start))
-               return(ErrCode);
+               return ErrCode;
 
-       while(1)
-       {
+       while (1) {
                len = Ssfdc.MaxSectors - Media.Sector;
                if (count > len)
                        bn = len;
                else
                        bn = count;
-               //if (Media_D_ReadOneSect(fdoExt, SectBuf))
-               //if (Media_D_ReadOneSect(fdoExt, count, buf))
-               if (Media_D_ReadOneSect(us, bn, buf))
-               {
+
+               if (Media_D_ReadOneSect(us, bn, buf)) {
                        ErrCode = ERR_EccReadErr;
-                       return(ErrCode);
+                       return ErrCode;
                }
 
                Media.Sector += bn;
                count -= bn;
 
-               if (count<=0)
+               if (count <= 0)
                        break;
 
                buf += bn * SECTSIZE;
 
                if (Inc_D_MediaAddr(us))
-                       return(ErrCode);
+                       return ErrCode;
        }
 
-       return(NO_ERROR);
+       return NO_ERROR;
 }
-// here
-//----- Media_D_CopySector() ------------------------------------------
-int Media_D_CopySector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
+/* here */
+/* ----- Media_D_CopySector() ------------------------------------------ */
+int Media_D_CopySector(struct us_data *us, DWORD start, WORD count, BYTE *buf)
 {
-       //DWORD mode;
-       //int i;
        WORD len, bn;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
        /* pr_info("Media_D_CopySector !!!\n"); */
        if (Conv_D_MediaAddr(us, start))
-               return(ErrCode);
+               return ErrCode;
 
-       while(1)
-       {
+       while (1) {
                if (Assign_D_WriteBlock())
-                       return(ERROR);
+                       return ERROR;
 
                len = Ssfdc.MaxSectors - Media.Sector;
                if (count > len)
@@ -209,607 +140,137 @@ int Media_D_CopySector(struct us_data *us, DWORD start,WORD count,BYTE *buf)
                else
                bn = count;
 
-               //if (Ssfdc_D_CopyBlock(fdoExt,count,buf,Redundant))
-               if (Ssfdc_D_CopyBlock(us,bn,buf,Redundant))
-               {
+               if (Ssfdc_D_CopyBlock(us, bn, buf, Redundant)) {
                        ErrCode = ERR_WriteFault;
-                       return(ErrCode);
+                       return ErrCode;
                }
 
                Media.Sector = 0x1F;
-               //if (Release_D_ReadBlock(fdoExt))
-               if (Release_D_CopySector(us))
-               {
-                       if (ErrCode==ERR_HwError)
-                       {
+               if (Release_D_CopySector(us)) {
+                       if (ErrCode == ERR_HwError) {
                                ErrCode = ERR_WriteFault;
-                               return(ErrCode);
+                               return ErrCode;
                        }
                }
                count -= bn;
 
-               if (count<=0)
+               if (count <= 0)
                        break;
 
                buf += bn * SECTSIZE;
 
                if (Inc_D_MediaAddr(us))
-                       return(ErrCode);
+                       return ErrCode;
 
        }
-       return(NO_ERROR);
+       return NO_ERROR;
 }
 
-//----- Release_D_CopySector() ------------------------------------------
+/* ----- Release_D_CopySector() ------------------------------------------ */
 int Release_D_CopySector(struct us_data *us)
 {
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-       Log2Phy[Media.Zone][Media.LogBlock]=WriteBlock;
-       Media.PhyBlock=ReadBlock;
+       Log2Phy[Media.Zone][Media.LogBlock] = WriteBlock;
+       Media.PhyBlock = ReadBlock;
 
-       if (Media.PhyBlock==NO_ASSIGN)
-       {
-               Media.PhyBlock=WriteBlock;
-               return(SMSUCCESS);
+       if (Media.PhyBlock == NO_ASSIGN) {
+               Media.PhyBlock = WriteBlock;
+               return SMSUCCESS;
        }
 
-       Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-       Media.PhyBlock=WriteBlock;
+       Clr_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+       Media.PhyBlock = WriteBlock;
 
-       return(SMSUCCESS);
-}
-/*
-//----- Media_D_WriteSector() ------------------------------------------
-int Media_D_WriteSector(PFDO_DEVICE_EXTENSION fdoExt, DWORD start,WORD count,BYTE *buf)
-{
-    int i;
-    WORD len, bn;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    //if (Check_D_MediaPower())
-    //    return(ErrCode);
-    //
-    //if (Check_D_MediaFmt(fdoExt))
-    //    return(ErrCode);
-    //
-    //if (Check_D_MediaWP())
-    //    return(ErrCode);
-
-    if (Conv_D_MediaAddr(fdoExt, start))
-        return(ErrCode);
-
-    //ENE_Print("Media_D_WriteSector --- Sector = %x\n", Media.Sector);
-    if (Check_D_FirstSect())
-    {
-        if (Media_D_CopyBlockHead(fdoExt))
-        {
-            ErrCode = ERR_WriteFault;
-            return(ErrCode);
-        }
-    }
-
-    while(1)
-    {
-        if (!Check_D_FirstSect())
-        {
-            if (Assign_D_WriteBlock())
-                return(ErrCode);
-        }
-
-        len = Ssfdc.MaxSectors - Media.Sector;
-        if (count > len)
-           bn = len;
-        else
-           bn = count;
-        //for(i=0;i<SECTSIZE;i++)
-        //    SectBuf[i]=*buf++;
-
-        //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-        if (Media_D_WriteOneSect(fdoExt, bn, buf))
-        {
-            ErrCode = ERR_WriteFault;
-            return(ErrCode);
-        }
-
-        Media.Sector += bn - 1;
-
-        if (!Check_D_LastSect())
-        {
-            if (Release_D_ReadBlock(fdoExt))
-
-            {    if (ErrCode==ERR_HwError)
-                {
-                    ErrCode = ERR_WriteFault;
-                    return(ErrCode);
-                }
-            }
-        }
-
-        count -= bn;
-
-        if (count<=0)
-            break;
-
-        buf += bn * SECTSIZE;
-
-        //if (--count<=0)
-        //    break;
-
-        if (Inc_D_MediaAddr(fdoExt))
-            return(ErrCode);
-    }
-
-    if (!Check_D_LastSect())
-        return(NO_ERROR);
-
-    if (Inc_D_MediaAddr(fdoExt))
-        return(ErrCode);
-
-    if (Media_D_CopyBlockTail(fdoExt))
-    {
-        ErrCode = ERR_WriteFault;
-        return(ErrCode);
-    }
-
-    return(NO_ERROR);
+       return SMSUCCESS;
 }
-//
-////----- Media_D_EraseBlock() -------------------------------------------
-//int Media_D_EraseBlock(PFDO_DEVICE_EXTENSION fdoExt, DWORD start,WORD count)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmt(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    if (Conv_D_MediaAddr(start))
-//        return(ErrCode);
-//
-//    while(Check_D_FirstSect()) {
-//        if (Inc_D_MediaAddr(fdoExt))
-//            return(ErrCode);
-//
-//        if (--count<=0)
-//            return(NO_ERROR);
-//    }
-//
-//    while(1) {
-//        if (!Check_D_LastSect())
-//            if (Media_D_EraseOneBlock())
-//                if (ErrCode==ERR_HwError)
-//                {
-//                    ErrCode = ERR_WriteFault;
-//                    return(ErrCode);
-//                }
-//
-//        if (Inc_D_MediaAddr(fdoExt))
-//            return(ErrCode);
-//
-//        if (--count<=0)
-//            return(NO_ERROR);
-//    }
-//}
-//
-////----- Media_D_EraseAll() ---------------------------------------------
-//int Media_D_EraseAll(PFDO_DEVICE_EXTENSION fdoExt)
-//{
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaFmtForEraseAll(fdoExt))
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    if (Media_D_EraseAllBlock())
-//        return(ErrCode);
-//
-//    return(NO_ERROR);
-//}
-
-//SmartMedia Write Function for One Sector Write Mode
-//----- Media_D_OneSectWriteStart() ------------------------------------
-int Media_D_OneSectWriteStart(PFDO_DEVICE_EXTENSION fdoExt,DWORD start,BYTE *buf)
-{
-//  int i;
-//  SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//  ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//  //if (Check_D_MediaPower())
-//  //    return(ErrCode);
-//  //if (Check_D_MediaFmt(fdoExt))
-//  //    return(ErrCode);
-//  //if (Check_D_MediaWP())
-//  //    return(ErrCode);
-//  if (Conv_D_MediaAddr(fdoExt, start))
-//      return(ErrCode);
-//
-//  if (Check_D_FirstSect())
-//      if (Media_D_CopyBlockHead(fdoExt))
-//      {
-//          ErrCode = ERR_WriteFault;
-//          return(ErrCode);
-//      }
-//
-//  if (!Check_D_FirstSect())
-//      if (Assign_D_WriteBlock())
-//          return(ErrCode);
-//
-//  //for(i=0;i<SECTSIZE;i++)
-//  //    SectBuf[i]=*buf++;
-//
-//  //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-//  if (Media_D_WriteOneSect(fdoExt, buf))
-//  {
-//      ErrCode = ERR_WriteFault;
-//      return(ErrCode);
-//  }
-//
-//  if (!Check_D_LastSect())
-//  {
-//      if (Release_D_ReadBlock(fdoExt))
-//          if (ErrCode==ERR_HwError)
-//          {
-//              ErrCode = ERR_WriteFault;
-//              return(ErrCode);
-//          }
-//  }
-
-    return(NO_ERROR);
-}
-
-//----- Media_D_OneSectWriteNext() -------------------------------------
-int Media_D_OneSectWriteNext(PFDO_DEVICE_EXTENSION fdoExt, BYTE *buf)
-{
-//  int i;
-//  SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//  ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//  if (Inc_D_MediaAddr(fdoExt))
-//      return(ErrCode);
-//
-//  if (!Check_D_FirstSect())
-//    if (Assign_D_WriteBlock())
-//      return(ErrCode);
-//
-//  //for(i=0;i<SECTSIZE;i++)
-//  //    SectBuf[i]=*buf++;
-//
-//  //if (Media_D_WriteOneSect(fdoExt, SectBuf))
-//  if (Media_D_WriteOneSect(fdoExt, buf))
-//  {
-//      ErrCode = ERR_WriteFault;
-//      return(ErrCode);
-//  }
-//
-//  if (!Check_D_LastSect())
-//  {
-//      if (Release_D_ReadBlock(fdoExt))
-//          if (ErrCode==ERR_HwError)
-//          {
-//              ErrCode = ERR_WriteFault;
-//              return(ErrCode);
-//          }
-//  }
-
-    return(NO_ERROR);
-}
-
-//----- Media_D_OneSectWriteFlush() ------------------------------------
-int Media_D_OneSectWriteFlush(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    if (!Check_D_LastSect())
-        return(NO_ERROR);
-
-    if (Inc_D_MediaAddr(fdoExt))
-        return(ErrCode);
-
-    if (Media_D_CopyBlockTail(fdoExt))
-    {
-        ErrCode = ERR_WriteFault;
-        return(ErrCode);
-    }
 
-    return(NO_ERROR);
-}
-//
-////LED Tern On/Off Subroutine
-////----- SM_EnableLED() -----------------------------------------------
-//void SM_EnableLED(PFDO_DEVICE_EXTENSION fdoExt, BOOLEAN enable)
-//{
-//    if (fdoExt->Drive_IsSWLED)
-//    {
-//        if (enable)
-//           Led_D_TernOn();
-//        else
-//           Led_D_TernOff();
-//    }
-//}
-//
-////----- Led_D_TernOn() -------------------------------------------------
-//void Led_D_TernOn(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange=ERROR;
-//
-//    Cnt_D_LedOn();
-//}
-//
-////----- Led_D_TernOff() ------------------------------------------------
-//void Led_D_TernOff(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange=ERROR;
-//
-//    Cnt_D_LedOff();
-//}
-//
-////SmartMedia Logical Format Subroutine
-////----- Check_D_LogCHS() -----------------------------------------------
-//int Check_D_LogCHS(WORD *c,BYTE *h,BYTE *s)
-//{
-//    switch(Ssfdc.Model) {
-//        case SSFDC1MB:   *c=125; *h= 4; *s= 4; break;
-//        case SSFDC2MB:   *c=125; *h= 4; *s= 8; break;
-//        case SSFDC4MB:   *c=250; *h= 4; *s= 8; break;
-//        case SSFDC8MB:   *c=250; *h= 4; *s=16; break;
-//        case SSFDC16MB:  *c=500; *h= 4; *s=16; break;
-//        case SSFDC32MB:  *c=500; *h= 8; *s=16; break;
-//        case SSFDC64MB:  *c=500; *h= 8; *s=32; break;
-//        case SSFDC128MB: *c=500; *h=16; *s=32; break;
-//        default:         *c= 0;  *h= 0; *s= 0; ErrCode = ERR_NoSmartMedia;    return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-//
-////Power Control & Media Exist Check Subroutine
-////----- Initialize_D_Media() -------------------------------------------
-//void Initialize_D_Media(void)
-//{
-//    ErrCode      = NO_ERROR;
-//    MediaChange  = ERROR;
-//    SectCopyMode = COMPLETED;
-//    Cnt_D_Reset();
-//}
-//
-////----- PowerOff_D_Media() ---------------------------------------------
-//void PowerOff_D_Media(void)
-//{
-//    Cnt_D_PowerOff();
-//}
-//
-////----- Check_D_MediaPower() -------------------------------------------
-//int Check_D_MediaPower(void)
-//{
-//    //usleep(56*1024);
-//    if (Check_D_CardStsChg())
-//        MediaChange = ERROR;
-//    //usleep(56*1024);
-//    if ((!Check_D_CntPower())&&(!MediaChange))  // has power & media not changed, then return success
-//        return(SMSUCCESS);
-//    //usleep(56*1024);
-//
-//    if (Check_D_CardExist())                    // Check if card is not exist, return err
-//    {
-//        ErrCode        = ERR_NoSmartMedia;
-//        MediaChange = ERROR;
-//        return(ERROR);
-//    }
-//    //usleep(56*1024);
-//    if (Cnt_D_PowerOn())
-//    {
-//        ErrCode        = ERR_NoSmartMedia;
-//        MediaChange = ERROR;
-//        return(ERROR);
-//    }
-//    //usleep(56*1024);
-//    Ssfdc_D_Reset(fdoExt);
-//    //usleep(56*1024);
-//    return(SMSUCCESS);
-//}
-//
-////-----Check_D_MediaExist() --------------------------------------------
-//int Check_D_MediaExist(void)
-//{
-//    if (Check_D_CardStsChg())
-//        MediaChange = ERROR;
-//
-//    if (!Check_D_CardExist())
-//    {
-//        if (!MediaChange)
-//            return(SMSUCCESS);
-//
-//        ErrCode = ERR_ChangedMedia;
-//        return(ERROR);
-//    }
-//
-//    ErrCode = ERR_NoSmartMedia;
-//
-//    return(ERROR);
-//}
-//
-////----- Check_D_MediaWP() ----------------------------------------------
-//int Check_D_MediaWP(void)
-//{
-//    if (Ssfdc.Attribute &MWP)
-//    {
-//        ErrCode = ERR_WrtProtect;
-//        return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Format Test Subroutine
-//----- Check_D_MediaFmt() ---------------------------------------------
+/* SmartMedia Physical Format Test Subroutine */
+/* ----- Check_D_MediaFmt() --------------------------------------------- */
 int Check_D_MediaFmt(struct us_data *us)
 {
        pr_info("Check_D_MediaFmt\n");
-       //ULONG i,j, result=FALSE, zone,block;
 
-       //usleep(56*1024);
        if (!MediaChange)
-               return(SMSUCCESS);
+               return SMSUCCESS;
 
        MediaChange  = ERROR;
        SectCopyMode = COMPLETED;
 
-       //usleep(56*1024);
-       if (Set_D_PhyFmtValue(us))
-       {
+       if (Set_D_PhyFmtValue(us)) {
                ErrCode = ERR_UnknownMedia;
-               return(ERROR);
+               return ERROR;
        }
-       
-       //usleep(56*1024);
-       if (Search_D_CIS(us))
-       {
+
+       if (Search_D_CIS(us)) {
                ErrCode = ERR_IllegalFmt;
-               return(ERROR);
+               return ERROR;
        }
 
-
-    MediaChange = SMSUCCESS;
-    return(SMSUCCESS);
+       MediaChange = SMSUCCESS;
+       return SMSUCCESS;
 }
-/*
-////----- Check_D_BlockIsFull() ----------------------------------
-//void Check_D_BlockIsFull()
-//{
-//    ULONG i, block;
-//
-//    if (IsXDCompliance || IsSSFDCCompliance)
-//    {
-//       // If the blocks are full then return write-protect.
-//       block = Ssfdc.MaxBlocks/8;
-//       for (Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-//       {
-//           if (Log2Phy[Media.Zone]==NULL)
-//           {
-//               if (Make_D_LogTable())
-//               {
-//                   ErrCode = ERR_IllegalFmt;
-//                   return;
-//               }
-//           }
-//
-//           for (i=0; i<block; i++)
-//           {
-//               if (Assign[Media.Zone][i] != 0xFF)
-//                  return;
-//           }
-//       }
-//       Ssfdc.Attribute |= WP;
-//    }
-//}
-//
-//
-////----- Check_D_MediaFmtForEraseAll() ----------------------------------
-//int Check_D_MediaFmtForEraseAll(PFDO_DEVICE_EXTENSION fdoExt)
-//{
-//    MediaChange  = ERROR;
-//    SectCopyMode = COMPLETED;
-//
-//    if (Set_D_PhyFmtValue(fdoExt))
-//    {
-//        ErrCode = ERR_UnknownMedia;
-//        return(ERROR);
-//    }
-//
-//    if (Search_D_CIS(fdoExt))
-//    {
-//        ErrCode = ERR_IllegalFmt;
-//        return(ERROR);
-//    }
-//
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Address Control Subroutine
-//----- Conv_D_MediaAddr() ---------------------------------------------
+
+/* SmartMedia Physical Address Control Subroutine */
+/* ----- Conv_D_MediaAddr() --------------------------------------------- */
 int Conv_D_MediaAddr(struct us_data *us, DWORD addr)
 {
        DWORD temp;
-       //ULONG  zz;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-       temp           = addr/Ssfdc.MaxSectors;
-       Media.Zone     = (BYTE) (temp/Ssfdc.MaxLogBlocks);
+       temp           = addr / Ssfdc.MaxSectors;
+       Media.Zone     = (BYTE) (temp / Ssfdc.MaxLogBlocks);
 
-       if (Log2Phy[Media.Zone]==NULL)
-       {
-               if (Make_D_LogTable(us))
-               {
+       if (Log2Phy[Media.Zone] == NULL) {
+               if (Make_D_LogTable(us)) {
                        ErrCode = ERR_IllegalFmt;
-                       return(ERROR);
+                       return ERROR;
                }
        }
 
-       Media.Sector   = (BYTE) (addr%Ssfdc.MaxSectors);
-       Media.LogBlock = (WORD) (temp%Ssfdc.MaxLogBlocks);
+       Media.Sector   = (BYTE) (addr % Ssfdc.MaxSectors);
+       Media.LogBlock = (WORD) (temp % Ssfdc.MaxLogBlocks);
 
-       if (Media.Zone<Ssfdc.MaxZones)
-       {
+       if (Media.Zone < Ssfdc.MaxZones) {
                Clr_D_RedundantData(Redundant);
                Set_D_LogBlockAddr(Redundant);
                Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
-               return(SMSUCCESS);
+               return SMSUCCESS;
        }
 
        ErrCode = ERR_OutOfLBA;
-       return(ERROR);
+       return ERROR;
 }
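
The decomposition in Conv_D_MediaAddr() is ordinary mixed-radix arithmetic: sectors vary fastest, then logical blocks, then zones. Worked through for a hypothetical geometry of 16 sectors per block and 500 logical blocks per zone, with addr = 40023:

	/* hypothetical: Ssfdc.MaxSectors == 16, Ssfdc.MaxLogBlocks == 500 */
	DWORD addr = 40023;
	DWORD temp = addr / 16;		/* 2501 */

	Media.Sector   = addr % 16;	/* 7: sector within the block      */
	Media.LogBlock = temp % 500;	/* 1: logical block within zone    */
	Media.Zone     = temp / 500;	/* 5: zone holding the Log2Phy map */
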
 
-//----- Inc_D_MediaAddr() ----------------------------------------------
+/* ----- Inc_D_MediaAddr() ---------------------------------------------- */
 int Inc_D_MediaAddr(struct us_data *us)
 {
        WORD        LogBlock = Media.LogBlock;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-       if (++Media.Sector<Ssfdc.MaxSectors)
-               return(SMSUCCESS);
+       if (++Media.Sector < Ssfdc.MaxSectors)
+               return SMSUCCESS;
 
-       if (Log2Phy[Media.Zone]==NULL)
-       {
-               if (Make_D_LogTable(us))
-               {
+       if (Log2Phy[Media.Zone] == NULL) {
+               if (Make_D_LogTable(us)) {
                        ErrCode = ERR_IllegalFmt;
-                       return(ERROR);
+                       return ERROR;
                }
        }
 
-       Media.Sector=0;
+       Media.Sector = 0;
        Media.LogBlock = LogBlock;
 
-       if (++Media.LogBlock<Ssfdc.MaxLogBlocks)
-       {
+       if (++Media.LogBlock < Ssfdc.MaxLogBlocks) {
                Clr_D_RedundantData(Redundant);
                Set_D_LogBlockAddr(Redundant);
-               Media.PhyBlock=Log2Phy[Media.Zone][Media.LogBlock];
-               return(SMSUCCESS);
+               Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
+               return SMSUCCESS;
        }
 
-       Media.LogBlock=0;
+       Media.LogBlock = 0;
 
-       if (++Media.Zone<Ssfdc.MaxZones)
-       {
-               if (Log2Phy[Media.Zone]==NULL)
-               {
-                       if (Make_D_LogTable(us))
-                       {
+       if (++Media.Zone < Ssfdc.MaxZones) {
+               if (Log2Phy[Media.Zone] == NULL) {
+                       if (Make_D_LogTable(us)) {
                                ErrCode = ERR_IllegalFmt;
-                               return(ERROR);
+                               return ERROR;
                        }
                }
 
@@ -817,1034 +278,508 @@ int Inc_D_MediaAddr(struct us_data *us)
 
                Clr_D_RedundantData(Redundant);
                Set_D_LogBlockAddr(Redundant);
-               Media.PhyBlock=Log2Phy[Media.Zone][Media.LogBlock];
-               return(SMSUCCESS);
+               Media.PhyBlock = Log2Phy[Media.Zone][Media.LogBlock];
+               return SMSUCCESS;
        }
 
-       Media.Zone=0;
+       Media.Zone = 0;
        ErrCode = ERR_OutOfLBA;
 
-       return(ERROR);
+       return ERROR;
 }
-/*
-//----- Check_D_FirstSect() --------------------------------------------
-int Check_D_FirstSect(void)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    if (!Media.Sector)
-        return(SMSUCCESS);
-
-    return(ERROR);
-}
-
-//----- Check_D_LastSect() ---------------------------------------------
-int Check_D_LastSect(void)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-    if (Media.Sector<(Ssfdc.MaxSectors-1))
-        return(ERROR);
-
-    return(SMSUCCESS);
-}
-*/
-//SmartMedia Read/Write Subroutine with Retry
-//----- Media_D_ReadOneSect() ------------------------------------------
+/* SmartMedia Read/Write Subroutine with Retry */
+/* ----- Media_D_ReadOneSect() ------------------------------------------ */
 int Media_D_ReadOneSect(struct us_data *us, WORD count, BYTE *buf)
 {
        DWORD err, retry;
 
        if (!Read_D_PhyOneSect(us, count, buf))
-               return(SMSUCCESS);
-       if (ErrCode==ERR_HwError)
-               return(ERROR);
-       if (ErrCode==ERR_DataStatus)
-               return(ERROR);
+               return SMSUCCESS;
+       if (ErrCode == ERR_HwError)
+               return ERROR;
+       if (ErrCode == ERR_DataStatus)
+               return ERROR;
 
 #ifdef RDERR_REASSIGN
-       if (Ssfdc.Attribute &MWP)
-       {
-               if (ErrCode==ERR_CorReadErr)
-                       return(SMSUCCESS);
-               return(ERROR);
+       if (Ssfdc.Attribute & MWP) {
+               if (ErrCode == ERR_CorReadErr)
+                       return SMSUCCESS;
+               return ERROR;
        }
 
-       err=ErrCode;
-       for(retry=0; retry<2; retry++)
-       {
-               if (Copy_D_BlockAll(us, (err==ERR_EccReadErr)?REQ_FAIL:REQ_ERASE))
-               {
-                       if (ErrCode==ERR_HwError)
-                               return(ERROR);
+       err = ErrCode;
+       for (retry = 0; retry < 2; retry++) {
+               if (Copy_D_BlockAll(us,
+                       (err == ERR_EccReadErr) ? REQ_FAIL : REQ_ERASE)) {
+                       if (ErrCode == ERR_HwError)
+                               return ERROR;
                        continue;
                }
 
                ErrCode = err;
-               if (ErrCode==ERR_CorReadErr)
-                       return(SMSUCCESS);
-               return(ERROR);
+               if (ErrCode == ERR_CorReadErr)
+                       return SMSUCCESS;
+               return ERROR;
        }
 
        MediaChange = ERROR;
 #else
-       if (ErrCode==ERR_CorReadErr) return(SMSUCCESS);
+       if (ErrCode == ERR_CorReadErr)
+               return SMSUCCESS;
 #endif
 
-       return(ERROR);
-}
-/*
-//----- Media_D_WriteOneSect() -----------------------------------------
-int Media_D_WriteOneSect(PFDO_DEVICE_EXTENSION fdoExt, WORD count, BYTE *buf)
-{
-    DWORD retry;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    if (!Write_D_PhyOneSect(fdoExt, count, buf))
-        return(SMSUCCESS);
-    if (ErrCode==ERR_HwError)
-        return(ERROR);
-
-    for(retry=1; retry<2; retry++)
-    {
-        if (Reassign_D_BlockHead(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            continue;
-        }
-
-        if (!Write_D_PhyOneSect(fdoExt, count, buf))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    if (Release_D_WriteBlock(fdoExt))
-        return(ERROR);
-
-    ErrCode        = ERR_WriteFault;
-    MediaChange = ERROR;
-    return(ERROR);
-}
-
-//SmartMedia Data Copy Subroutine with Retry
-//----- Media_D_CopyBlockHead() ----------------------------------------
-int Media_D_CopyBlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD retry;
-
-    for(retry=0; retry<2; retry++)
-    {
-        if (!Copy_D_BlockHead(fdoExt))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    MediaChange = ERROR;
-    return(ERROR);
+       return ERROR;
 }
 
-//----- Media_D_CopyBlockTail() ----------------------------------------
-int Media_D_CopyBlockTail(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD retry;
-
-    if (!Copy_D_BlockTail(fdoExt))
-        return(SMSUCCESS);
-    if (ErrCode==ERR_HwError)
-        return(ERROR);
-
-    for(retry=1; retry<2; retry++)
-    {
-        if (Reassign_D_BlockHead(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            continue;
-        }
-
-        if (!Copy_D_BlockTail(fdoExt))
-            return(SMSUCCESS);
-        if (ErrCode==ERR_HwError)
-            return(ERROR);
-    }
-
-    if (Release_D_WriteBlock(fdoExt))
-        return(ERROR);
-
-    ErrCode        = ERR_WriteFault;
-    MediaChange = ERROR;
-    return(ERROR);
-}
-//
-////----- Media_D_EraseOneBlock() ----------------------------------------
-//int Media_D_EraseOneBlock(void)
-//{
-//    WORD        LogBlock = Media.LogBlock;
-//    WORD        PhyBlock = Media.PhyBlock;
-//    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//    ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//    if (Media.PhyBlock==NO_ASSIGN)
-//        return(SMSUCCESS);
-//
-//    if (Log2Phy[Media.Zone]==NULL)
-//    {
-//        if (Make_D_LogTable())
-//        {
-//            ErrCode = ERR_IllegalFmt;
-//            return(ERROR);
-//        }
-//    }
-//    Media.LogBlock = LogBlock;
-//    Media.PhyBlock = PhyBlock;
-//
-//    Log2Phy[Media.Zone][Media.LogBlock]=NO_ASSIGN;
-//
-//    if (Erase_D_PhyOneBlock(fdoExt))
-//    {
-//        if (ErrCode==ERR_HwError)
-//            return(ERROR);
-//        if (MarkFail_D_PhyOneBlock())
-//            return(ERROR);
-//
-//        ErrCode = ERR_WriteFault;
-//        return(ERROR);
-//    }
-//
-//    Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-//    Media.PhyBlock=NO_ASSIGN;
-//    return(SMSUCCESS);
-//}
-//
-////SmartMedia Erase Subroutine
-////----- Media_D_EraseAllBlock() ----------------------------------------
-//int Media_D_EraseAllBlock(void)
-//{
-//    WORD cis=0;
-//
-//    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-//    ADDRESS_T   bb = (ADDRESS_T) &Media;
-//
-//    MediaChange = ERROR;
-//    Media.Sector   = 0;
-//
-//    for(Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-//        for(Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++) {
-//            if (Ssfdc_D_ReadRedtData(Redundant))
-//            {
-//                Ssfdc_D_Reset(fdoExt);
-//                return(ERROR);
-//            }
-//
-//            Ssfdc_D_Reset(fdoExt);
-//            if (!Check_D_FailBlock(Redundant))
-//            {
-//                if (cis)
-//                {
-//                    if (Ssfdc_D_EraseBlock(fdoExt))
-//                    {
-//                        ErrCode = ERR_HwError;
-//                        return(ERROR);
-//                    }
-//
-//                    if (Ssfdc_D_CheckStatus())
-//                    {
-//                        if (MarkFail_D_PhyOneBlock())
-//                            return(ERROR);
-//                    }
-//
-//                    continue;
-//                }
-//
-//                if (Media.PhyBlock!=CisArea.PhyBlock)
-//                {
-//                    ErrCode = ERR_IllegalFmt;
-//                    return(ERROR);
-//                }
-//
-//                cis++;
-//            }
-//
-//        }
-//    return(SMSUCCESS);
-//}
-*/
-//SmartMedia Physical Sector Data Copy Subroutine
-//----- Copy_D_BlockAll() ----------------------------------------------
+/* SmartMedia Physical Sector Data Copy Subroutine */
+/* ----- Copy_D_BlockAll() ---------------------------------------------- */
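+/*
+ * Copy every sector of the current block from ReadBlock into a newly
+ * assigned WriteBlock; Release_D_ReadBlock() then either erases or
+ * retires the source block according to SectCopyMode.
+ */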
 int Copy_D_BlockAll(struct us_data *us, DWORD mode)
 {
        BYTE sect;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-       sect=Media.Sector;
+       sect = Media.Sector;
 
        if (Assign_D_WriteBlock())
-               return(ERROR);
-       if (mode==REQ_FAIL)
-               SectCopyMode=REQ_FAIL;
-
-       for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-       {
-               if (Copy_D_PhyOneSect(us))
-               {
-                       if (ErrCode==ERR_HwError)
-                               return(ERROR);
+               return ERROR;
+       if (mode == REQ_FAIL)
+               SectCopyMode = REQ_FAIL;
+
+       for (Media.Sector = 0; Media.Sector < Ssfdc.MaxSectors;
+                                                       Media.Sector++) {
+               if (Copy_D_PhyOneSect(us)) {
+                       if (ErrCode == ERR_HwError)
+                               return ERROR;
                        if (Release_D_WriteBlock(us))
-                               return(ERROR);
+                               return ERROR;
 
                        ErrCode = ERR_WriteFault;
-                       Media.PhyBlock=ReadBlock;
-                       Media.Sector=sect;
+                       Media.PhyBlock = ReadBlock;
+                       Media.Sector = sect;
 
-                       return(ERROR);
+                       return ERROR;
                }
        }
 
        if (Release_D_ReadBlock(us))
-               return(ERROR);
-
-       Media.PhyBlock=WriteBlock;
-       Media.Sector=sect;
-       return(SMSUCCESS);
-}
-/*
-//----- Copy_D_BlockHead() ---------------------------------------------
-int Copy_D_BlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    BYTE sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    sect=Media.Sector;
-    if (Assign_D_WriteBlock())
-        return(ERROR);
-
-    for(Media.Sector=0; Media.Sector<sect; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            if (Release_D_WriteBlock(fdoExt))
-                return(ERROR);
-
-            ErrCode = ERR_WriteFault;
-            Media.PhyBlock=ReadBlock;
-            Media.Sector=sect;
-
-            return(ERROR);
-        }
-    }
-
-    Media.PhyBlock=WriteBlock;
-    Media.Sector=sect;
-    return(SMSUCCESS);
-}
+               return ERROR;
 
-//----- Copy_D_BlockTail() ---------------------------------------------
-int Copy_D_BlockTail(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    BYTE sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    for(sect=Media.Sector; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-
-            Media.PhyBlock=WriteBlock;
-            Media.Sector=sect;
-
-            return(ERROR);
-        }
-    }
-
-    if (Release_D_ReadBlock(fdoExt))
-        return(ERROR);
-
-    Media.PhyBlock=WriteBlock;
-    Media.Sector=sect;
-    return(SMSUCCESS);
+       Media.PhyBlock = WriteBlock;
+       Media.Sector = sect;
+       return SMSUCCESS;
 }
 
-//----- Reassign_D_BlockHead() -----------------------------------------
-int Reassign_D_BlockHead(PFDO_DEVICE_EXTENSION fdoExt)
-{
-    DWORD  mode;
-    WORD   block;
-    BYTE   sect;
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    mode=SectCopyMode;
-    block=ReadBlock;
-    sect=Media.Sector;
-
-    if (Assign_D_WriteBlock())
-        return(ERROR);
-
-    SectCopyMode=REQ_FAIL;
-
-    for(Media.Sector=0; Media.Sector<sect; Media.Sector++)
-    {
-        if (Copy_D_PhyOneSect(fdoExt))
-        {
-            if (ErrCode==ERR_HwError)
-                return(ERROR);
-            if (Release_D_WriteBlock(fdoExt))
-                return(ERROR);
-
-            ErrCode = ERR_WriteFault;
-            SectCopyMode=mode;
-            WriteBlock=ReadBlock;
-            ReadBlock=block;
-            Media.Sector=sect;
-            Media.PhyBlock=WriteBlock;
-
-            return(ERROR);
-        }
-    }
-
-    if (Release_D_ReadBlock(fdoExt))
-        return(ERROR);
-
-    SectCopyMode=mode;
-    ReadBlock=block;
-    Media.Sector=sect;
-    Media.PhyBlock=WriteBlock;
-    return(SMSUCCESS);
-}
-*/
-//SmartMedia Physical Block Assign/Release Subroutine
-//----- Assign_D_WriteBlock() ------------------------------------------
+/* SmartMedia Physical Block Assign/Release Subroutine */
+/* ----- Assign_D_WriteBlock() ------------------------------------------ */
 int Assign_D_WriteBlock(void)
 {
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-       ReadBlock=Media.PhyBlock;
-
-       for(WriteBlock=AssignStart[Media.Zone]; WriteBlock<Ssfdc.MaxBlocks; WriteBlock++)
-       {
-               if (!Chk_D_Bit(Assign[Media.Zone],WriteBlock))
-               {
-                       Set_D_Bit(Assign[Media.Zone],WriteBlock);
-                       AssignStart[Media.Zone]=WriteBlock+1;
-                       Media.PhyBlock=WriteBlock;
-                       SectCopyMode=REQ_ERASE;
-                       //ErrXDCode = NO_ERROR;
-                       return(SMSUCCESS);
+       ReadBlock = Media.PhyBlock;
+
+       for (WriteBlock = AssignStart[Media.Zone];
+                       WriteBlock < Ssfdc.MaxBlocks; WriteBlock++) {
+               if (!Chk_D_Bit(Assign[Media.Zone], WriteBlock)) {
+                       Set_D_Bit(Assign[Media.Zone], WriteBlock);
+                       AssignStart[Media.Zone] = WriteBlock + 1;
+                       Media.PhyBlock = WriteBlock;
+                       SectCopyMode = REQ_ERASE;
+                       return SMSUCCESS;
                }
        }
 
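+       /*
+        * Nothing free between AssignStart and the top of the zone;
+        * wrap around and scan again from physical block 0.
+        */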
-       for(WriteBlock=0; WriteBlock<AssignStart[Media.Zone]; WriteBlock++)
-       {
-               if (!Chk_D_Bit(Assign[Media.Zone],WriteBlock))
-               {
-                       Set_D_Bit(Assign[Media.Zone],WriteBlock);
-                       AssignStart[Media.Zone]=WriteBlock+1;
-                       Media.PhyBlock=WriteBlock;
-                       SectCopyMode=REQ_ERASE;
-                       //ErrXDCode = NO_ERROR;
-                       return(SMSUCCESS);
+       for (WriteBlock = 0;
+                       WriteBlock < AssignStart[Media.Zone]; WriteBlock++) {
+               if (!Chk_D_Bit(Assign[Media.Zone], WriteBlock)) {
+                       Set_D_Bit(Assign[Media.Zone], WriteBlock);
+                       AssignStart[Media.Zone] = WriteBlock + 1;
+                       Media.PhyBlock = WriteBlock;
+                       SectCopyMode = REQ_ERASE;
+                       return SMSUCCESS;
                }
        }
 
-       WriteBlock=NO_ASSIGN;
+       WriteBlock = NO_ASSIGN;
        ErrCode = ERR_WriteFault;
-       // For xD test
-       //Ssfdc.Attribute |= WP;
-       //ErrXDCode = ERR_WrtProtect;
-       return(ERROR);
+
+       return ERROR;
 }
 
-//----- Release_D_ReadBlock() ------------------------------------------
+/* ----- Release_D_ReadBlock() ------------------------------------------ */
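+/*
+ * Commit a finished copy: point the logical block at WriteBlock, then
+ * dispose of the old physical block -- erased for reuse after a
+ * REQ_ERASE copy, marked failed otherwise.
+ */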
 int Release_D_ReadBlock(struct us_data *us)
 {
        DWORD mode;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-       mode=SectCopyMode;
-       SectCopyMode=COMPLETED;
+       mode = SectCopyMode;
+       SectCopyMode = COMPLETED;
 
-       if (mode==COMPLETED)
-               return(SMSUCCESS);
+       if (mode == COMPLETED)
+               return SMSUCCESS;
 
-       Log2Phy[Media.Zone][Media.LogBlock]=WriteBlock;
-       Media.PhyBlock=ReadBlock;
+       Log2Phy[Media.Zone][Media.LogBlock] = WriteBlock;
+       Media.PhyBlock = ReadBlock;
 
-       if (Media.PhyBlock==NO_ASSIGN)
-       {
-               Media.PhyBlock=WriteBlock;
-               return(SMSUCCESS);
+       if (Media.PhyBlock == NO_ASSIGN) {
+               Media.PhyBlock = WriteBlock;
+               return SMSUCCESS;
        }
 
-       if (mode==REQ_ERASE)
-       {
-               if (Erase_D_PhyOneBlock(us))
-               {
-                       if (ErrCode==ERR_HwError) return(ERROR);
-                       if (MarkFail_D_PhyOneBlock(us)) return(ERROR);
-               }
-               else
-                       Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-       }
-       else if (MarkFail_D_PhyOneBlock(us))
-               return(ERROR);
+       if (mode == REQ_ERASE) {
+               if (Erase_D_PhyOneBlock(us)) {
+                       if (ErrCode == ERR_HwError)
+                               return ERROR;
+                       if (MarkFail_D_PhyOneBlock(us))
+                               return ERROR;
+               } else
+                       Clr_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+       } else if (MarkFail_D_PhyOneBlock(us))
+               return ERROR;
 
-       Media.PhyBlock=WriteBlock;
-       return(SMSUCCESS);
+       Media.PhyBlock = WriteBlock;
+       return SMSUCCESS;
 }
 
-//----- Release_D_WriteBlock() -----------------------------------------
+/* ----- Release_D_WriteBlock() ----------------------------------------- */
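+/*
+ * Abort an in-progress copy: the half-written WriteBlock is marked
+ * failed and ReadBlock stays the active physical block.
+ */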
 int Release_D_WriteBlock(struct us_data *us)
 {
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-       SectCopyMode=COMPLETED;
-       Media.PhyBlock=WriteBlock;
+       SectCopyMode = COMPLETED;
+       Media.PhyBlock = WriteBlock;
 
        if (MarkFail_D_PhyOneBlock(us))
-               return(ERROR);
+               return ERROR;
 
-       Media.PhyBlock=ReadBlock;
-       return(SMSUCCESS);
+       Media.PhyBlock = ReadBlock;
+       return SMSUCCESS;
 }
 
-//SmartMedia Physical Sector Data Copy Subroutine
-//----- Copy_D_PhyOneSect() --------------------------------------------
+/* SmartMedia Physical Sector Data Copy Subroutine */
+/* ----- Copy_D_PhyOneSect() -------------------------------------------- */
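+/*
+ * Copy one sector from ReadBlock to WriteBlock via WorkBuf.  A retry
+ * re-reads through the CIS sector as a sanity check; a NO_ASSIGN
+ * source is synthesised as DUMMY_DATA with a clean redundant area.
+ */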
 int Copy_D_PhyOneSect(struct us_data *us)
 {
        int           i;
        DWORD  err, retry;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
        /* pr_info("Copy_D_PhyOneSect --- Sector = %x\n", Media.Sector); */
-       if (ReadBlock!=NO_ASSIGN)
-       {
-               Media.PhyBlock=ReadBlock;
-               for(retry=0; retry<2; retry++)
-               {
-                       if (retry!=0)
-                       {
+       if (ReadBlock != NO_ASSIGN) {
+               Media.PhyBlock = ReadBlock;
+               for (retry = 0; retry < 2; retry++) {
+                       if (retry != 0) {
                                Ssfdc_D_Reset(us);
-                               if (Ssfdc_D_ReadCisSect(us,WorkBuf,WorkRedund))
-                               { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+                               if (Ssfdc_D_ReadCisSect(us, WorkBuf,
+                                                               WorkRedund)) {
+                                       ErrCode = ERR_HwError;
+                                       MediaChange = ERROR;
+                                       return ERROR;
+                               }
 
-                               if (Check_D_CISdata(WorkBuf,WorkRedund))
-                               { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+                               if (Check_D_CISdata(WorkBuf, WorkRedund)) {
+                                       ErrCode = ERR_HwError;
+                                       MediaChange = ERROR;
+                                       return ERROR;
+                               }
+                       }
+
+                       if (Ssfdc_D_ReadSect(us, WorkBuf, WorkRedund)) {
+                               ErrCode = ERR_HwError;
+                               MediaChange = ERROR;
+                               return ERROR;
+                       }
+                       if (Check_D_DataStatus(WorkRedund)) {
+                               err = ERROR;
+                               break;
+                       }
+                       if (!Check_D_ReadError(WorkRedund)) {
+                               err = SMSUCCESS;
+                               break;
+                       }
+                       if (!Check_D_Correct(WorkBuf, WorkRedund)) {
+                               err = SMSUCCESS;
+                               break;
                        }
 
-                       if (Ssfdc_D_ReadSect(us,WorkBuf,WorkRedund))
-                       { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-                       if (Check_D_DataStatus(WorkRedund))
-                       { err=ERROR; break; }
-                       if (!Check_D_ReadError(WorkRedund))
-                       { err=SMSUCCESS; break; }
-                       if (!Check_D_Correct(WorkBuf,WorkRedund))
-                       { err=SMSUCCESS; break; }
-
-                       err=ERROR;
-                       SectCopyMode=REQ_FAIL;
+                       err = ERROR;
+                       SectCopyMode = REQ_FAIL;
                }
-       }
-       else
-       {
-               err=SMSUCCESS;
-               for(i=0; i<SECTSIZE; i++)
-                       WorkBuf[i]=DUMMY_DATA;
+       } else {
+               err = SMSUCCESS;
+               for (i = 0; i < SECTSIZE; i++)
+                       WorkBuf[i] = DUMMY_DATA;
                Clr_D_RedundantData(WorkRedund);
        }
 
        Set_D_LogBlockAddr(WorkRedund);
-       if (err==ERROR)
-       {
+       if (err == ERROR) {
                Set_D_RightECC(WorkRedund);
                Set_D_DataStaus(WorkRedund);
        }
 
-       Media.PhyBlock=WriteBlock;
+       Media.PhyBlock = WriteBlock;
 
-       if (Ssfdc_D_WriteSectForCopy(us, WorkBuf, WorkRedund))
-       { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-       if (Ssfdc_D_CheckStatus())
-       { ErrCode = ERR_WriteFault; return(ERROR); }
+       if (Ssfdc_D_WriteSectForCopy(us, WorkBuf, WorkRedund)) {
+               ErrCode = ERR_HwError;
+               MediaChange = ERROR;
+               return ERROR;
+       }
+       if (Ssfdc_D_CheckStatus()) {
+               ErrCode = ERR_WriteFault;
+               return ERROR;
+       }
 
-       Media.PhyBlock=ReadBlock;
-       return(SMSUCCESS);
+       Media.PhyBlock = ReadBlock;
+       return SMSUCCESS;
 }
 
-//SmartMedia Physical Sector Read/Write/Erase Subroutine
-//----- Read_D_PhyOneSect() --------------------------------------------
+/* SmartMedia Physical Sector Read/Write/Erase Subroutine */
+/* ----- Read_D_PhyOneSect() -------------------------------------------- */
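+/*
+ * Read "count" sectors starting at the current physical sector, with
+ * one reset-and-retry pass; a NO_ASSIGN block reads back as
+ * DUMMY_DATA, and persistent ECC failures end in ERR_EccReadErr.
+ */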
 int Read_D_PhyOneSect(struct us_data *us, WORD count, BYTE *buf)
 {
        int           i;
        DWORD  retry;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-       if (Media.PhyBlock==NO_ASSIGN)
-       {
-               for(i=0; i<SECTSIZE; i++)
-                       *buf++=DUMMY_DATA;
-               return(SMSUCCESS);
+
+       if (Media.PhyBlock == NO_ASSIGN) {
+               for (i = 0; i < SECTSIZE; i++)
+                       *buf++ = DUMMY_DATA;
+               return SMSUCCESS;
        }
 
-       for(retry=0; retry<2; retry++)
-       {
-               if (retry!=0)
-               {
+       for (retry = 0; retry < 2; retry++) {
+               if (retry != 0) {
                        Ssfdc_D_Reset(us);
 
-                       if (Ssfdc_D_ReadCisSect(us,WorkBuf,WorkRedund))
-                       { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-                       if (Check_D_CISdata(WorkBuf,WorkRedund))
-                       { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
+                       if (Ssfdc_D_ReadCisSect(us, WorkBuf, WorkRedund)) {
+                               ErrCode = ERR_HwError;
+                               MediaChange = ERROR;
+                               return ERROR;
+                       }
+                       if (Check_D_CISdata(WorkBuf, WorkRedund)) {
+                               ErrCode = ERR_HwError;
+                               MediaChange = ERROR;
+                               return ERROR;
+                       }
                }
 
-               //if (Ssfdc_D_ReadSect(fdoExt,buf,Redundant))
-               if (Ssfdc_D_ReadBlock(us,count,buf,Redundant))
-               { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-               if (Check_D_DataStatus(Redundant))
-               { ErrCode = ERR_DataStatus; return(ERROR); }
+               if (Ssfdc_D_ReadBlock(us, count, buf, Redundant)) {
+                       ErrCode = ERR_HwError;
+                       MediaChange = ERROR;
+                       return ERROR;
+               }
+               if (Check_D_DataStatus(Redundant)) {
+                       ErrCode = ERR_DataStatus;
+                       return ERROR;
+               }
 
                if (!Check_D_ReadError(Redundant))
-                       return(SMSUCCESS);
+                       return SMSUCCESS;
 
-               if (!Check_D_Correct(buf,Redundant))
-               { ErrCode = ERR_CorReadErr; return(ERROR); }
+               if (!Check_D_Correct(buf, Redundant)) {
+                       ErrCode = ERR_CorReadErr;
+                       return ERROR;
+               }
        }
 
        ErrCode = ERR_EccReadErr;
-       return(ERROR);
+       return ERROR;
 }
-/*
-//----- Write_D_PhyOneSect() -------------------------------------------
-int Write_D_PhyOneSect(PFDO_DEVICE_EXTENSION fdoExt, WORD count, BYTE *buf)
-{
-    SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-    ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-    //if (Ssfdc_D_WriteSect(fdoExt,buf,Redundant))
-    if (Ssfdc_D_WriteBlock(fdoExt,count,buf,Redundant))
-    { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-    if (Ssfdc_D_CheckStatus())
-    { ErrCode = ERR_WriteFault; return(ERROR); }
 
-    return(SMSUCCESS);
-}
-*/
-//----- Erase_D_PhyOneBlock() ------------------------------------------
+/* ----- Erase_D_PhyOneBlock() ------------------------------------------ */
 int Erase_D_PhyOneBlock(struct us_data *us)
 {
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-       if (Ssfdc_D_EraseBlock(us))
-       { ErrCode = ERR_HwError; MediaChange=ERROR; return(ERROR); }
-       if (Ssfdc_D_CheckStatus())
-       { ErrCode = ERR_WriteFault; return(ERROR); }
+       if (Ssfdc_D_EraseBlock(us)) {
+               ErrCode = ERR_HwError;
+               MediaChange = ERROR;
+               return ERROR;
+       }
+       if (Ssfdc_D_CheckStatus()) {
+               ErrCode = ERR_WriteFault;
+               return ERROR;
+       }
 
-       return(SMSUCCESS);
+       return SMSUCCESS;
 }
 
-//SmartMedia Physical Format Check Local Subroutine
-//----- Set_D_PhyFmtValue() --------------------------------------------
+/* SmartMedia Physical Format Check Local Subroutine */
+/* ----- Set_D_PhyFmtValue() -------------------------------------------- */
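+/*
+ * Derive the SSFDC model parameters from the device ID the reader
+ * captured at init time (us->SM_DeviceID).
+ */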
 int Set_D_PhyFmtValue(struct us_data *us)
 {
-//    PPDO_DEVICE_EXTENSION   pdoExt;
-//    BYTE      idcode[4];
-//    DWORD     UserDefData_1, UserDefData_2, Data, mask;
-//
-//    //if (!fdoExt->ChildDeviceObject)       return(ERROR);
-//    //pdoExt = fdoExt->ChildDeviceObject->DeviceExtension;
-//
-//    Ssfdc_D_ReadID(idcode, READ_ID_1);
-//
-    //if (Set_D_SsfdcModel(idcode[1]))
-    if (Set_D_SsfdcModel(us->SM_DeviceID))
-        return(ERROR);
-
-//    //Use Multi-function pin to differentiate SM and xD.
-//    UserDefData_1 = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, fdoExt->FuncID, PCI_REG_USER_DEF) & 0x80;
-//    if (UserDefData_1)
-//    {
-//       if ( READ_PORT_BYTE(SM_REG_INT_STATUS) & 0x80 )      fdoExt->DiskType = DISKTYPE_XD;
-//       if ( READ_PORT_BYTE(SM_REG_INT_STATUS) & 0x40 )      fdoExt->DiskType = DISKTYPE_SM;
-//
-//       if ( IsXDCompliance && (fdoExt->DiskType == DISKTYPE_XD) )
-//       {
-//          Ssfdc_D_ReadID(idcode, READ_ID_3);
-//          if (idcode[2] != 0xB5)
-//             return(ERROR);
-//       }
-//    }
-//
-//    //Use GPIO to differentiate SM and xD.
-//    UserDefData_2 = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, fdoExt->FuncID, PCI_REG_USER_DEF) >> 8;
-//    if ( UserDefData_2 )
-//    {
-//       Data = ReadPCIReg(fdoExt->BusID, fdoExt->DevID, 0, 0xAC);
-//
-//       mask = 1 << (UserDefData_2-1);
-//       // 1 : xD , 0 : SM
-//       if ( Data & mask)
-//          fdoExt->DiskType = DISKTYPE_XD;
-//       else
-//          fdoExt->DiskType = DISKTYPE_SM;
-//
-//       if ( IsXDCompliance && (fdoExt->DiskType == DISKTYPE_XD) )
-//       {
-//          Ssfdc_D_ReadID(idcode, READ_ID_3);
-//          if (idcode[2] != 0xB5)
-//             return(ERROR);
-//       }
-//    }
-//
-//    if ( !(UserDefData_1 | UserDefData_2) )
-//    {
-//      // Use UserDefine Register to differentiate SM and xD.
-//      Ssfdc_D_ReadID(idcode, READ_ID_3);
-//
-//      if (idcode[2] == 0xB5)
-//         fdoExt->DiskType = DISKTYPE_XD;
-//      else
-//      {
-//          if (!IsXDCompliance)
-//             fdoExt->DiskType = DISKTYPE_SM;
-//          else
-//             return(ERROR);
-//      }
-//
-//      if (fdoExt->UserDef_DiskType == 0x04)  fdoExt->DiskType = DISKTYPE_XD;
-//      if (fdoExt->UserDef_DiskType == 0x08)  fdoExt->DiskType = DISKTYPE_SM;
-//    }
-//
-//    if (!fdoExt->UserDef_DisableWP)
-//    {
-//       if (fdoExt->DiskType == DISKTYPE_SM)
-//       {
-//           if (Check_D_SsfdcWP())
-//              Ssfdc.Attribute|=WP;
-//       }
-//    }
-
-    return(SMSUCCESS);
+       if (Set_D_SsfdcModel(us->SM_DeviceID))
+               return ERROR;
+
+       return SMSUCCESS;
 }
 
-//----- Search_D_CIS() -------------------------------------------------
+/* ----- Search_D_CIS() ------------------------------------------------- */
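+/*
+ * Locate the CIS: skip leading failed blocks, then scan the first
+ * CIS_SEARCH_SECT sectors of the first good block for valid CIS
+ * data, recording the hit in CisArea.
+ */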
 int Search_D_CIS(struct us_data *us)
 {
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
-
-       Media.Zone=0; Media.Sector=0;
+       Media.Zone = 0;
+       Media.Sector = 0;
 
-       for (Media.PhyBlock=0; Media.PhyBlock<(Ssfdc.MaxBlocks-Ssfdc.MaxLogBlocks-1); Media.PhyBlock++)
-       {
-               if (Ssfdc_D_ReadRedtData(us, Redundant))
-               {
+       for (Media.PhyBlock = 0;
+               Media.PhyBlock < (Ssfdc.MaxBlocks - Ssfdc.MaxLogBlocks - 1);
+               Media.PhyBlock++) {
+               if (Ssfdc_D_ReadRedtData(us, Redundant)) {
                        Ssfdc_D_Reset(us);
-                       return(ERROR);
+                       return ERROR;
                }
 
                if (!Check_D_FailBlock(Redundant))
                        break;
        }
 
-       if (Media.PhyBlock==(Ssfdc.MaxBlocks-Ssfdc.MaxLogBlocks-1))
-       {
+       if (Media.PhyBlock == (Ssfdc.MaxBlocks - Ssfdc.MaxLogBlocks - 1)) {
                Ssfdc_D_Reset(us);
-               return(ERROR);
+               return ERROR;
        }
 
-       while (Media.Sector<CIS_SEARCH_SECT)
-       {
-               if (Media.Sector)
-               {
-                       if (Ssfdc_D_ReadRedtData(us, Redundant))
-                       {
+       while (Media.Sector < CIS_SEARCH_SECT) {
+               if (Media.Sector) {
+                       if (Ssfdc_D_ReadRedtData(us, Redundant)) {
                                Ssfdc_D_Reset(us);
-                               return(ERROR);
+                               return ERROR;
                        }
                }
-               if (!Check_D_DataStatus(Redundant))
-               {
-                       if (Ssfdc_D_ReadSect(us,WorkBuf,Redundant))
-                       {
+               if (!Check_D_DataStatus(Redundant)) {
+                       if (Ssfdc_D_ReadSect(us, WorkBuf, Redundant)) {
                                Ssfdc_D_Reset(us);
-                               return(ERROR);
+                               return ERROR;
                        }
 
-                       if (Check_D_CISdata(WorkBuf,Redundant))
-                       {
+                       if (Check_D_CISdata(WorkBuf, Redundant)) {
                                Ssfdc_D_Reset(us);
-                               return(ERROR);
+                               return ERROR;
                        }
 
-                       CisArea.PhyBlock=Media.PhyBlock;
-                       CisArea.Sector=Media.Sector;
+                       CisArea.PhyBlock = Media.PhyBlock;
+                       CisArea.Sector = Media.Sector;
                        Ssfdc_D_Reset(us);
-                       return(SMSUCCESS);
+                       return SMSUCCESS;
                }
 
                Media.Sector++;
        }
 
        Ssfdc_D_Reset(us);
-       return(ERROR);
+       return ERROR;
 }
 
-//----- Make_D_LogTable() ----------------------------------------------
+/* ----- Make_D_LogTable() ---------------------------------------------- */
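+/*
+ * Build the logical-to-physical map and the used-block bitmap for the
+ * current zone; when two physical blocks claim one logical address,
+ * the tie is broken by re-reading the address stored in each block's
+ * last sector.
+ */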
 int Make_D_LogTable(struct us_data *us)
 {
-       WORD  phyblock,logblock;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
+       WORD  phyblock, logblock;
 
-       if (Log2Phy[Media.Zone]==NULL)
-       {
-               Log2Phy[Media.Zone] = kmalloc(MAX_LOGBLOCK*sizeof(WORD), GFP_KERNEL);
+       if (Log2Phy[Media.Zone] == NULL) {
+               Log2Phy[Media.Zone] = kmalloc(MAX_LOGBLOCK * sizeof(WORD),
+                                                               GFP_KERNEL);
                /* pr_info("ExAllocatePool Zone = %x, Addr = %x\n",
                                Media.Zone, Log2Phy[Media.Zone]); */
-               if (Log2Phy[Media.Zone]==NULL)
-                       return(ERROR);
+               if (Log2Phy[Media.Zone] == NULL)
+                       return ERROR;
        }
 
-       Media.Sector=0;
-
-       //for(Media.Zone=0; Media.Zone<MAX_ZONENUM; Media.Zone++)
-       //for(Media.Zone=0; Media.Zone<Ssfdc.MaxZones; Media.Zone++)
-       {
-               /* pr_info("Make_D_LogTable --- MediaZone = 0x%x\n",
-                                                       Media.Zone); */
-               for(Media.LogBlock=0; Media.LogBlock<Ssfdc.MaxLogBlocks; Media.LogBlock++)
-                       Log2Phy[Media.Zone][Media.LogBlock]=NO_ASSIGN;
-
-               for(Media.PhyBlock=0; Media.PhyBlock<(MAX_BLOCKNUM/8); Media.PhyBlock++)
-                       Assign[Media.Zone][Media.PhyBlock]=0x00;
-
-               for(Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++)
-               {
-                       if ((!Media.Zone) && (Media.PhyBlock<=CisArea.PhyBlock))
-                       {
-                               Set_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-                               continue;
-                       }
+       Media.Sector = 0;
+
+       /* pr_info("Make_D_LogTable --- MediaZone = 0x%x\n",
+                                               Media.Zone); */
+       for (Media.LogBlock = 0; Media.LogBlock < Ssfdc.MaxLogBlocks;
+                                               Media.LogBlock++)
+               Log2Phy[Media.Zone][Media.LogBlock] = NO_ASSIGN;
+
+       for (Media.PhyBlock = 0; Media.PhyBlock < (MAX_BLOCKNUM / 8);
+                                               Media.PhyBlock++)
+               Assign[Media.Zone][Media.PhyBlock] = 0x00;
+
+       for (Media.PhyBlock = 0; Media.PhyBlock < Ssfdc.MaxBlocks;
+                                               Media.PhyBlock++) {
+               if ((!Media.Zone) && (Media.PhyBlock <= CisArea.PhyBlock)) {
+                       Set_D_Bit(Assign[Media.Zone], Media.PhyBlock);
+                       continue;
+               }
+
+               if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+                       Ssfdc_D_Reset(us);
+                       return ERROR;
+               }
+
+               if (!Check_D_DataBlank(Redundant))
+                       continue;
+
+               Set_D_Bit(Assign[Media.Zone], Media.PhyBlock);
 
-                       if (Ssfdc_D_ReadRedtData(us, Redundant))
-                       { Ssfdc_D_Reset(us); return(ERROR); }
+               if (Check_D_FailBlock(Redundant))
+                       continue;
 
-                       if (!Check_D_DataBlank(Redundant))
-                               continue;
+               if (Load_D_LogBlockAddr(Redundant))
+                       continue;
 
-                       Set_D_Bit(Assign[Media.Zone],Media.PhyBlock);
+               if (Media.LogBlock >= Ssfdc.MaxLogBlocks)
+                       continue;
 
-                       if (Check_D_FailBlock(Redundant))
-                               continue;
+               if (Log2Phy[Media.Zone][Media.LogBlock] == NO_ASSIGN) {
+                       Log2Phy[Media.Zone][Media.LogBlock] = Media.PhyBlock;
+                       continue;
+               }
 
-                       //if (Check_D_DataStatus(Redundant))
-                       //    continue;
+               phyblock     = Media.PhyBlock;
+               logblock     = Media.LogBlock;
+               Media.Sector = (BYTE)(Ssfdc.MaxSectors - 1);
 
-                       if (Load_D_LogBlockAddr(Redundant))
-                               continue;
+               if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+                       Ssfdc_D_Reset(us);
+                       return ERROR;
+               }
 
-                       if (Media.LogBlock>=Ssfdc.MaxLogBlocks)
-                               continue;
+               if (!Load_D_LogBlockAddr(Redundant) &&
+                               (Media.LogBlock == logblock)) {
+                       Media.PhyBlock = Log2Phy[Media.Zone][logblock];
 
-                       if (Log2Phy[Media.Zone][Media.LogBlock]==NO_ASSIGN)
-                       {
-                               Log2Phy[Media.Zone][Media.LogBlock]=Media.PhyBlock;
-                               continue;
+                       if (Ssfdc_D_ReadRedtData(us, Redundant)) {
+                               Ssfdc_D_Reset(us);
+                               return ERROR;
                        }
 
-                       phyblock     = Media.PhyBlock;
-                       logblock     = Media.LogBlock;
-                       Media.Sector = (BYTE)(Ssfdc.MaxSectors-1);
-
-                       if (Ssfdc_D_ReadRedtData(us, Redundant))
-                       { Ssfdc_D_Reset(us); return(ERROR); }
-
-                       if (!Load_D_LogBlockAddr(Redundant))
-                       {
-                               if (Media.LogBlock==logblock)
-                               {
-                                       Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-
-                                       if (Ssfdc_D_ReadRedtData(us, Redundant))
-                                       { Ssfdc_D_Reset(us); return(ERROR); }
-
-                                       Media.PhyBlock=phyblock;
-
-                                       if (!Load_D_LogBlockAddr(Redundant))
-                                       {
-                                               if (Media.LogBlock!=logblock)
-                                               {
-                                                       Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-                                                       Log2Phy[Media.Zone][logblock]=phyblock;
-                                               }
-                                       }
-                                       else
-                                       {
-                                               Media.PhyBlock=Log2Phy[Media.Zone][logblock];
-                                               Log2Phy[Media.Zone][logblock]=phyblock;
-                                       }
+                       Media.PhyBlock = phyblock;
+
+                       if (!Load_D_LogBlockAddr(Redundant)) {
+                               if (Media.LogBlock != logblock) {
+                                       Media.PhyBlock =
+                                               Log2Phy[Media.Zone][logblock];
+                                       Log2Phy[Media.Zone][logblock] =
+                                                               phyblock;
                                }
+                       } else {
+                               Media.PhyBlock = Log2Phy[Media.Zone][logblock];
+                               Log2Phy[Media.Zone][logblock] = phyblock;
                        }
+               }
+
+               Media.Sector = 0;
+               Media.PhyBlock = phyblock;
 
-                       Media.Sector=0;
-
-// here Not yet
-//#ifdef L2P_ERR_ERASE
-//                     if (!(Ssfdc.Attribute &MWP))
-//                     {
-//                             Ssfdc_D_Reset(fdoExt);
-//                             if (Ssfdc_D_EraseBlock(fdoExt))
-//                                     return(ERROR);
-//
-//                             if (Ssfdc_D_CheckStatus())
-//                             {
-//                                     if (MarkFail_D_PhyOneBlock())
-//                                             return(ERROR);
-//                             }
-//                             else
-//                                     Clr_D_Bit(Assign[Media.Zone],Media.PhyBlock);
-//                     }
-//#else
-//                     Ssfdc.Attribute|=MWP;
-//#endif
-                       Media.PhyBlock=phyblock;
-
-               } // End for (Media.PhyBlock<Ssfdc.MaxBlocks)
-
-               AssignStart[Media.Zone]=0;
-
-       } // End for (Media.Zone<MAX_ZONENUM)
+       } /* End for (Media.PhyBlock < Ssfdc.MaxBlocks) */
+
+       AssignStart[Media.Zone] = 0;
 
        Ssfdc_D_Reset(us);
-       return(SMSUCCESS);
+       return SMSUCCESS;
 }
 
-//----- MarkFail_D_PhyOneBlock() ---------------------------------------
+/* ----- MarkFail_D_PhyOneBlock() --------------------------------------- */
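+/*
+ * Stamp the failed-block mark into the redundant area of every sector
+ * of the current block; write status is deliberately not checked.
+ */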
 int MarkFail_D_PhyOneBlock(struct us_data *us)
 {
        BYTE sect;
-       //SSFDCTYPE_T aa = (SSFDCTYPE_T ) &Ssfdc;
-       //ADDRESS_T   bb = (ADDRESS_T) &Media;
 
-       sect=Media.Sector;
+       sect = Media.Sector;
        Set_D_FailBlock(WorkRedund);
-       //Ssfdc_D_WriteRedtMode();
 
-       for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-       {
-               if (Ssfdc_D_WriteRedtData(us, WorkRedund))
-               {
+       for (Media.Sector = 0; Media.Sector < Ssfdc.MaxSectors;
+                                                       Media.Sector++) {
+               if (Ssfdc_D_WriteRedtData(us, WorkRedund)) {
                        Ssfdc_D_Reset(us);
                        Media.Sector   = sect;
                        ErrCode        = ERR_HwError;
                        MediaChange = ERROR;
-                       return(ERROR);
-               } // NO Status Check
+                       return ERROR;
+               } /* NO Status Check */
        }
 
        Ssfdc_D_Reset(us);
-       Media.Sector=sect;
-       return(SMSUCCESS);
+       Media.Sector = sect;
+       return SMSUCCESS;
 }
-/*
-//
-////----- SM_Init() ----------------------------------------------------
-//void SM_Init(void)
-//{
-//    _Hw_D_ClrIntCardChg();
-//    _Hw_D_SetIntMask();
-//    // For DMA Interrupt
-//    _Hw_D_ClrDMAIntCardChg();
-//    _Hw_D_SetDMAIntMask();
-//}
-//
-////----- Media_D_EraseAllRedtData() -----------------------------------
-//int Media_D_EraseAllRedtData(DWORD Index, BOOLEAN CheckBlock)
-//{
-//    BYTE    i;
-//
-//    if (Check_D_MediaPower())
-//        return(ErrCode);
-//
-//    if (Check_D_MediaWP())
-//        return(ErrCode);
-//
-//    for (i=0; i<REDTSIZE; i++)
-//        WorkRedund[i] = 0xFF;
-//
-//    Media.Zone = (BYTE)Index;
-//    for (Media.PhyBlock=0; Media.PhyBlock<Ssfdc.MaxBlocks; Media.PhyBlock++)
-//    {
-//        if ((!Media.Zone) && (Media.PhyBlock<=CisArea.PhyBlock))
-//            continue;
-//
-//        if (Ssfdc_D_EraseBlock(fdoExt))
-//        {
-//            ErrCode = ERR_HwError;
-//            return(ERROR);
-//        }
-//
-//        for(Media.Sector=0; Media.Sector<Ssfdc.MaxSectors; Media.Sector++)
-//        {
-//            Ssfdc_D_WriteRedtMode();
-//
-//            if (Ssfdc_D_WriteRedtData(WorkRedund))
-//            {
-//                Ssfdc_D_Reset(fdoExt);
-//                ErrCode        = ERR_HwError;
-//                MediaChange    = ERROR;
-//                return(ERROR);
-//            } // NO Status Check
-//        }
-//
-//        Ssfdc_D_Reset(fdoExt);
-//    }
-//
-//    Ssfdc_D_Reset(fdoExt);
-//
-//    return(SMSUCCESS);
-//}
-//
-////----- Media_D_GetMediaInfo() ---------------------------------------
-//DWORD Media_D_GetMediaInfo(PFDO_DEVICE_EXTENSION fdoExt, PIOCTL_MEDIA_INFO_IN pParamIn, PIOCTL_MEDIA_INFO_OUT pParamOut)
-//{
-//    pParamOut->ErrCode = STATUS_CMD_FAIL;
-//
-//    Init_D_SmartMedia();
-//
-//    if (Check_D_MediaPower())
-//        return (ErrCode==ERR_NoSmartMedia) ? STATUS_CMD_NO_MEDIA : STATUS_CMD_FAIL;
-//
-//    if (Set_D_PhyFmtValue(fdoExt))
-//        return STATUS_CMD_FAIL;
-//
-//    //usleep(56*1024);
-//    if (Search_D_CIS(fdoExt))
-//        return STATUS_CMD_FAIL;
-//
-//    if (Check_D_MediaWP())
-//        return STATUS_CMD_MEDIA_WP;
-//
-//    pParamOut->PageSize  = Ssfdc.MaxSectors;
-//    pParamOut->BlockSize = Ssfdc.MaxBlocks;
-//    pParamOut->ZoneSize  = Ssfdc.MaxZones;
-//
-//    return STATUS_CMD_SUCCESS;
-//}*/
index d4dd5ed516ce80ebd8de093759bb700161bd7ac1..346c5702f41116ad549dcf5bf3c9caf7fb5920e8 100644 (file)
@@ -33,9 +33,9 @@ void   _Set_D_ECCdata(BYTE, BYTE *);
 void   _Calc_D_ECCdata(BYTE *);
 
 
-struct SSFDCTYPE                Ssfdc;
-struct ADDRESS                  Media;
-struct CIS_AREA                 CisArea;
+struct keucr_media_info         Ssfdc;
+struct keucr_media_address      Media;
+struct keucr_media_area         CisArea;
 
 static BYTE                            EccBuf[6];
 extern PBYTE                    SMHostAddr;
@@ -103,8 +103,10 @@ int Load_D_LogBlockAddr(BYTE *redundant)
 {
        WORD addr1, addr2;
 
-       addr1 = (WORD)*(redundant + REDT_ADDR1H)*0x0100 + (WORD)*(redundant + REDT_ADDR1L);
-       addr2 = (WORD)*(redundant + REDT_ADDR2H)*0x0100 + (WORD)*(redundant + REDT_ADDR2L);
+       addr1 = (WORD)*(redundant + REDT_ADDR1H)*0x0100 +
+                                       (WORD)*(redundant + REDT_ADDR1L);
+       addr2 = (WORD)*(redundant + REDT_ADDR2H)*0x0100 +
+                                       (WORD)*(redundant + REDT_ADDR2L);
 
        if (addr1 == addr2)
                if ((addr1 & 0xF000) == 0x1000) {
@@ -151,7 +153,8 @@ void Set_D_LogBlockAddr(BYTE *redundant)
        if ((hweight16(addr) % 2))
                addr++;
 
-       *(redundant + REDT_ADDR1H) = *(redundant + REDT_ADDR2H) = (BYTE)(addr / 0x0100);
+       *(redundant + REDT_ADDR1H) = *(redundant + REDT_ADDR2H) =
+                                                       (BYTE)(addr / 0x0100);
        *(redundant + REDT_ADDR1L) = *(redundant + REDT_ADDR2L) = (BYTE)addr;
 }
 
@@ -191,7 +194,9 @@ int Ssfdc_D_ReadCisSect(struct us_data *us, BYTE *buf, BYTE *redundant)
        Media.Sector = CisArea.Sector;
 
        if (Ssfdc_D_ReadSect(us, buf, redundant)) {
-               Media.Zone = zone; Media.PhyBlock = block; Media.Sector = sector;
+               Media.Zone = zone;
+               Media.PhyBlock = block;
+               Media.Sector = sector;
                return ERROR;
        }
 
@@ -209,7 +214,8 @@ int Ssfdc_D_ReadSect(struct us_data *us, BYTE *buf, BYTE *redundant)
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -252,7 +258,8 @@ int Ssfdc_D_ReadSect(struct us_data *us, BYTE *buf, BYTE *redundant)
 }
 
 /* ----- Ssfdc_D_ReadBlock() --------------------------------------------- */
-int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant)
+int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf,
+                                                       BYTE *redundant)
 {
        struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
        int     result;
@@ -260,7 +267,8 @@ int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -304,7 +312,8 @@ int Ssfdc_D_ReadBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant
 
 
 /* ----- Ssfdc_D_CopyBlock() -------------------------------------------- */
-int Ssfdc_D_CopyBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant)
+int Ssfdc_D_CopyBlock(struct us_data *us, WORD count, BYTE *buf,
+                                                       BYTE *redundant)
 {
        struct bulk_cb_wrap *bcb = (struct bulk_cb_wrap *) us->iobuf;
        int     result;
@@ -312,7 +321,8 @@ int Ssfdc_D_CopyBlock(struct us_data *us, WORD count, BYTE *buf, BYTE *redundant
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -358,7 +368,8 @@ int Ssfdc_D_WriteSectForCopy(struct us_data *us, BYTE *buf, BYTE *redundant)
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -396,7 +407,8 @@ int Ssfdc_D_EraseBlock(struct us_data *us)
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -431,7 +443,8 @@ int Ssfdc_D_ReadRedtData(struct us_data *us, BYTE *redundant)
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -470,7 +483,8 @@ int Ssfdc_D_WriteRedtData(struct us_data *us, BYTE *redundant)
 
        result = ENE_LoadBinCode(us, SM_RW_PATTERN);
        if (result != USB_STOR_XFER_GOOD) {
-               printk("Load SM RW Code Fail !!\n");
+               dev_err(&us->pusb_dev->dev,
+                       "Failed to load SmartMedia read/write code\n");
                return USB_STOR_TRANSPORT_ERROR;
        }
 
@@ -611,7 +625,7 @@ int Set_D_SsfdcModel(BYTE dcode)
                return ERROR;
        }
 
-    return SMSUCCESS;
+       return SMSUCCESS;
 }
 
 /* ----- _Check_D_DevCode() --------------------------------------------- */
@@ -686,8 +700,8 @@ int Check_D_CISdata(BYTE *buf, BYTE *redundant)
 /* ----- Set_D_RightECC() ---------------------------------------------- */
 void Set_D_RightECC(BYTE *redundant)
 {
-    /* Driver ECC Check */
-    return;
+       /* Driver ECC Check */
+       return;
 }
 
 
index 58b55557118533be486391e568b6769b44646cec..572d6489b66b0e618df083e874fb5f0f5e14faf5 100644 (file)
@@ -56,7 +56,7 @@ int SM_SCSIIrp(struct us_data *us, struct scsi_cmnd *srb)
        return result;
 }
 
-/* ----- SM_SCSI_Test_Unit_Ready() -------------------------------------------------- */
+/* ----- SM_SCSI_Test_Unit_Ready() ------------------------------------- */
 int SM_SCSI_Test_Unit_Ready(struct us_data *us, struct scsi_cmnd *srb)
 {
        if (us->SM_Status.Insert && us->SM_Status.Ready)
@@ -69,21 +69,27 @@ int SM_SCSI_Test_Unit_Ready(struct us_data *us, struct scsi_cmnd *srb)
        return USB_STOR_TRANSPORT_GOOD;
 }
 
-/* ----- SM_SCSI_Inquiry() -------------------------------------------------- */
+/* ----- SM_SCSI_Inquiry() --------------------------------------------- */
 int SM_SCSI_Inquiry(struct us_data *us, struct scsi_cmnd *srb)
 {
-       BYTE data_ptr[36] = {0x00, 0x80, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x55, 0x53, 0x42, 0x32, 0x2E, 0x30, 0x20, 0x20, 0x43, 0x61, 0x72, 0x64, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, 0x31, 0x30, 0x30};
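+       /* Canned 36-byte INQUIRY response: vendor "USB2.0", product
+        * "CardReader", revision "0100" */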
+       BYTE data_ptr[36] = {0x00, 0x80, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00,
+                                0x55, 0x53, 0x42, 0x32, 0x2E, 0x30, 0x20,
+                                0x20, 0x43, 0x61, 0x72, 0x64, 0x52, 0x65,
+                                0x61, 0x64, 0x65, 0x72, 0x20, 0x20, 0x20,
+                                0x20, 0x20, 0x20, 0x30, 0x31, 0x30, 0x30};
 
        usb_stor_set_xfer_buf(us, data_ptr, 36, srb, TO_XFER_BUF);
        return USB_STOR_TRANSPORT_GOOD;
 }
 
 
-/* ----- SM_SCSI_Mode_Sense() -------------------------------------------------- */
+/* ----- SM_SCSI_Mode_Sense() ------------------------------------------ */
 int SM_SCSI_Mode_Sense(struct us_data *us, struct scsi_cmnd *srb)
 {
-       BYTE    mediaNoWP[12] = {0x0b, 0x00, 0x00, 0x08, 0x00, 0x00, 0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
-       BYTE    mediaWP[12]   = {0x0b, 0x00, 0x80, 0x08, 0x00, 0x00, 0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
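+       /* Identical mode-sense data except byte 2, where 0x80 flags
+        * the medium as write-protected */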
+       BYTE    mediaNoWP[12] = {0x0b, 0x00, 0x00, 0x08, 0x00, 0x00,
+                               0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
+       BYTE    mediaWP[12]   = {0x0b, 0x00, 0x80, 0x08, 0x00, 0x00,
+                               0x71, 0xc0, 0x00, 0x00, 0x02, 0x00};
 
        if (us->SM_Status.WtP)
                usb_stor_set_xfer_buf(us, mediaWP, 12, srb, TO_XFER_BUF);
@@ -94,7 +100,7 @@ int SM_SCSI_Mode_Sense(struct us_data *us, struct scsi_cmnd *srb)
        return USB_STOR_TRANSPORT_GOOD;
 }
 
-/* ----- SM_SCSI_Read_Capacity() -------------------------------------------------- */
+/* ----- SM_SCSI_Read_Capacity() --------------------------------------- */
 int SM_SCSI_Read_Capacity(struct us_data *us, struct scsi_cmnd *srb)
 {
        unsigned int offset = 0;
@@ -103,14 +109,14 @@ int SM_SCSI_Read_Capacity(struct us_data *us, struct scsi_cmnd *srb)
        WORD    bl_len;
        BYTE    buf[8];
 
-       printk("SM_SCSI_Read_Capacity\n");
+       dev_dbg(&us->pusb_dev->dev, "SM_SCSI_Read_Capacity\n");
 
        bl_len = 0x200;
        bl_num = Ssfdc.MaxLogBlocks * Ssfdc.MaxSectors * Ssfdc.MaxZones - 1;
 
        us->bl_num = bl_num;
-       printk("bl_len = %x\n", bl_len);
-       printk("bl_num = %x\n", bl_num);
+       dev_dbg(&us->pusb_dev->dev, "bl_len = %x\n", bl_len);
+       dev_dbg(&us->pusb_dev->dev, "bl_num = %x\n", bl_num);
 
        buf[0] = (bl_num >> 24) & 0xff;
        buf[1] = (bl_num >> 16) & 0xff;
@@ -131,8 +137,10 @@ int SM_SCSI_Read(struct us_data *us, struct scsi_cmnd *srb)
 {
        int result = 0;
        PBYTE   Cdb = srb->cmnd;
-       DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) | ((Cdb[3] << 16) & 0x00ff0000) |
-               ((Cdb[4] << 8) & 0x0000ff00) | ((Cdb[5] << 0) & 0x000000ff);
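+       /*
+        * READ(10): CDB bytes 2-5 carry the big-endian LBA and bytes
+        * 7-8 the transfer length; get_unaligned_be32(&Cdb[2]) would
+        * express the same decode more compactly.
+        */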
+       DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) |
+                       ((Cdb[3] << 16) & 0x00ff0000) |
+                       ((Cdb[4] << 8) & 0x0000ff00) |
+                       ((Cdb[5] << 0) & 0x000000ff);
        WORD  blen = ((Cdb[7] << 8) & 0xff00)     | ((Cdb[8] << 0) & 0x00ff);
        DWORD   blenByte = blen * 0x200;
        void    *buf;
@@ -161,8 +169,10 @@ int SM_SCSI_Write(struct us_data *us, struct scsi_cmnd *srb)
 {
        int result = 0;
        PBYTE   Cdb = srb->cmnd;
-       DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) | ((Cdb[3] << 16) & 0x00ff0000) |
-               ((Cdb[4] << 8) & 0x0000ff00) | ((Cdb[5] << 0) & 0x000000ff);
+       DWORD bn  =  ((Cdb[2] << 24) & 0xff000000) |
+                       ((Cdb[3] << 16) & 0x00ff0000) |
+                       ((Cdb[4] << 8) & 0x0000ff00) |
+                       ((Cdb[5] << 0) & 0x000000ff);
        WORD  blen = ((Cdb[7] << 8) & 0xff00)     | ((Cdb[8] << 0) & 0x00ff);
        DWORD   blenByte = blen * 0x200;
        void    *buf;
index 1a8837df0766e326f8d81eea33252250db20a733..1f9ea58b339666cc3e02b94d2c145f0014bce18f 100644 (file)
@@ -78,6 +78,61 @@ static int usb_stor_msg_common(struct us_data *us, int timeout)
        return us->current_urb->status;
 }
 
+/*
+ * usb_stor_print_cmd(): decode the CDB and log the SCSI opcode via
+ * dev_dbg(); the noisy per-I/O READ/WRITE prints are left commented out.
+ */
+static void usb_stor_print_cmd(struct us_data *us, struct scsi_cmnd *srb)
+{
+       PBYTE   Cdb = srb->cmnd;
+       DWORD   cmd = Cdb[0];
+       DWORD   bn  =   ((Cdb[2] << 24) & 0xff000000) |
+                       ((Cdb[3] << 16) & 0x00ff0000) |
+                       ((Cdb[4] << 8) & 0x0000ff00) |
+                       ((Cdb[5] << 0) & 0x000000ff);
+       WORD    blen = ((Cdb[7] << 8) & 0xff00) | ((Cdb[8] << 0) & 0x00ff);
+
+       switch (cmd) {
+       case TEST_UNIT_READY:
+               /* dev_dbg(&us->pusb_dev->dev,
+                       "scsi cmd %X --- SCSIOP_TEST_UNIT_READY\n", cmd); */
+               break;
+       case INQUIRY:
+               dev_dbg(&us->pusb_dev->dev,
+                               "scsi cmd %X --- SCSIOP_INQUIRY\n", cmd);
+               break;
+       case MODE_SENSE:
+               dev_dbg(&us->pusb_dev->dev,
+                               "scsi cmd %X --- SCSIOP_MODE_SENSE\n", cmd);
+               break;
+       case START_STOP:
+               dev_dbg(&us->pusb_dev->dev,
+                               "scsi cmd %X --- SCSIOP_START_STOP\n", cmd);
+               break;
+       case READ_CAPACITY:
+               dev_dbg(&us->pusb_dev->dev,
+                               "scsi cmd %X --- SCSIOP_READ_CAPACITY\n", cmd);
+               break;
+       case READ_10:
+               /*  dev_dbg(&us->pusb_dev->dev,
+                       "scsi cmd %X --- SCSIOP_READ, bn = %X, blen = %X\n",
+                       cmd, bn, blen); */
+               break;
+       case WRITE_10:
+               /* dev_dbg(&us->pusb_dev->dev,
+                       "scsi cmd %X --- SCSIOP_WRITE, bn = %X, blen = %X\n",
+                       cmd, bn, blen); */
+               break;
+       case ALLOW_MEDIUM_REMOVAL:
+               dev_dbg(&us->pusb_dev->dev,
+                       "scsi cmd %X --- SCSIOP_ALLOW_MEDIUM_REMOVAL\n", cmd);
+               break;
+       default:
+               dev_dbg(&us->pusb_dev->dev, "scsi cmd %X --- Other cmd\n", cmd);
+               break;
+       }
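+       /* bn is only consumed by the commented-out READ/WRITE prints
+        * above; the dead store presumably just placates compiler
+        * warnings */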
+       bn = 0;
+}
+
 /*
  * usb_stor_control_msg()
  */
@@ -303,7 +358,7 @@ void usb_stor_invoke_transport(struct scsi_cmnd *srb, struct us_data *us)
        int result;
 
        /* pr_info("transport --- usb_stor_invoke_transport\n"); */
-       usb_stor_print_cmd(srb);
+       usb_stor_print_cmd(us, srb);
        /* send the command to the transport layer */
        scsi_set_resid(srb, 0);
        result = us->transport(srb, us); /* usb_stor_Bulk_transport; */
@@ -429,7 +484,7 @@ void ENE_stor_invoke_transport(struct scsi_cmnd *srb, struct us_data *us)
        int result = 0;
 
        /* pr_info("transport --- ENE_stor_invoke_transport\n"); */
-       usb_stor_print_cmd(srb);
+       usb_stor_print_cmd(us, srb);
        /* send the command to the transport layer */
        scsi_set_resid(srb, 0);
        if (!(us->SM_Status.Ready))
@@ -708,8 +763,8 @@ int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us)
 
                } else {
                        residue = min(residue, transfer_length);
-                       scsi_set_resid(srb, max(scsi_get_resid(srb),
-                                                       (int) residue));
+                       scsi_set_resid(srb, max_t(int, scsi_get_resid(srb),
+                                                       residue));
                }
        }
 
index 2a11a98375d7ca879f6b0936eac3ddefee8f9483..df34474ae568b9ec1089891b9e9d1d5184388ee9 100644 (file)
@@ -29,7 +29,6 @@
 extern int usb_stor_Bulk_transport(struct scsi_cmnd *, struct us_data*);
 extern int usb_stor_Bulk_max_lun(struct us_data *);
 extern int usb_stor_Bulk_reset(struct us_data *);
-extern void usb_stor_print_cmd(struct scsi_cmnd *);
 extern void usb_stor_invoke_transport(struct scsi_cmnd *, struct us_data*);
 extern void usb_stor_stop_transport(struct us_data *);
 extern int usb_stor_control_msg(struct us_data *us, unsigned int pipe,
@@ -61,7 +60,7 @@ extern int ENE_InitMedia(struct us_data *);
 extern int ENE_SMInit(struct us_data *);
 extern int ENE_SendScsiCmd(struct us_data*, BYTE, void*, int);
 extern int ENE_LoadBinCode(struct us_data*, BYTE);
-extern int ENE_Read_BYTE(struct us_data*, WORD index, void *buf);
+extern int ene_read_byte(struct us_data*, WORD index, void *buf);
 extern int ENE_Read_Data(struct us_data*, void *buf, unsigned int length);
 extern int ENE_Write_Data(struct us_data*, void *buf, unsigned int length);
 extern void BuildSenseBuffer(struct scsi_cmnd *, int);
index f656f8aeeda31117003eace68b9eb6f9564d37d5..ddd2e7390b4611b9df23d282874210e3b915f312 100644 (file)
@@ -24,13 +24,13 @@ MODULE_LICENSE("GPL");
 
 static unsigned int delay_use = 1;
 
-static struct usb_device_id eucr_usb_ids [] = {
+static struct usb_device_id eucr_usb_ids[] = {
        { USB_DEVICE(0x058f, 0x6366) },
        { USB_DEVICE(0x0cf2, 0x6230) },
        { USB_DEVICE(0x0cf2, 0x6250) },
        { }                                            /* Terminating entry */
 };
-MODULE_DEVICE_TABLE (usb, eucr_usb_ids);
+MODULE_DEVICE_TABLE(usb, eucr_usb_ids);
 
 
 #ifdef CONFIG_PM
@@ -65,7 +65,7 @@ static int eucr_resume(struct usb_interface *iface)
 
        us->Power_IsResum = true;
 
-       us->SM_Status = *(PSM_STATUS)&tmp;
+       us->SM_Status = *(struct keucr_sm_status *)&tmp;
 
        return 0;
 }
@@ -85,9 +85,9 @@ static int eucr_reset_resume(struct usb_interface *iface)
         * the device
         */
 
-       us->Power_IsResum = true;
+       us->Power_IsResum = true;
 
-       us->SM_Status = *(PSM_STATUS)&tmp;
+       us->SM_Status = *(struct keucr_sm_status *)&tmp;
 
        return 0;
 }
@@ -124,16 +124,18 @@ static int eucr_post_reset(struct usb_interface *iface)
        return 0;
 }
 
-void fill_inquiry_response(struct us_data *us, unsigned char *data, unsigned int data_len)
+void fill_inquiry_response(struct us_data *us, unsigned char *data,
+                                                       unsigned int data_len)
 {
        pr_info("usb --- fill_inquiry_response\n");
        if (data_len < 36) /* You lose. */
                return;
 
        if (data[0]&0x20) {
-               memset(data+8,0,28);
+               memset(data+8, 0, 28);
        } else {
-               u16 bcdDevice = le16_to_cpu(us->pusb_dev->descriptor.bcdDevice);
+               u16 bcdDevice =
+                       le16_to_cpu(us->pusb_dev->descriptor.bcdDevice);
                memcpy(data+8, us->unusual_dev->vendorName,
                        strlen(us->unusual_dev->vendorName) > 8 ? 8 :
                        strlen(us->unusual_dev->vendorName));
@@ -148,7 +150,7 @@ void fill_inquiry_response(struct us_data *us, unsigned char *data, unsigned int
        usb_stor_set_xfer_buf(us, data, data_len, us->srb, TO_XFER_BUF);
 }
 
-static int usb_stor_control_thread(void * __us)
+static int usb_stor_control_thread(void *__us)
 {
        struct us_data *us = (struct us_data *)__us;
        struct Scsi_Host *host = us_to_host(us);
@@ -194,7 +196,8 @@ static int usb_stor_control_thread(void * __us)
                        us->srb->result = DID_BAD_TARGET << 16;
                } else if ((us->srb->cmnd[0] == INQUIRY)
                           && (us->fflags & US_FL_FIX_INQUIRY)) {
-                       unsigned char data_ptr[36] = {0x00, 0x80, 0x02, 0x02, 0x1F, 0x00, 0x00, 0x00};
+                       unsigned char data_ptr[36] = {0x00, 0x80, 0x02, 0x02,
+                                               0x1F, 0x00, 0x00, 0x00};
 
                        fill_inquiry_response(us, data_ptr, 36);
                        us->srb->result = SAM_STAT_GOOD;
@@ -253,13 +256,15 @@ static int associate_dev(struct us_data *us, struct usb_interface *intf)
        usb_set_intfdata(intf, us);
 
        /* Allocate the device-related DMA-mapped buffers */
-       us->cr = usb_alloc_coherent(us->pusb_dev, sizeof(*us->cr), GFP_KERNEL, &us->cr_dma);
+       us->cr = usb_alloc_coherent(us->pusb_dev, sizeof(*us->cr), GFP_KERNEL,
+                                                       &us->cr_dma);
        if (!us->cr) {
                pr_info("usb_ctrlrequest allocation failed\n");
                return -ENOMEM;
        }
 
-       us->iobuf = usb_alloc_coherent(us->pusb_dev, US_IOBUF_SIZE, GFP_KERNEL, &us->iobuf_dma);
+       us->iobuf = usb_alloc_coherent(us->pusb_dev, US_IOBUF_SIZE, GFP_KERNEL,
+                                                       &us->iobuf_dma);
        if (!us->iobuf) {
                pr_info("I/O buffer allocation failed\n");
                return -ENOMEM;
@@ -275,7 +280,8 @@ static int associate_dev(struct us_data *us, struct usb_interface *intf)
 static int get_device_info(struct us_data *us, const struct usb_device_id *id)
 {
        struct usb_device *dev = us->pusb_dev;
-       struct usb_interface_descriptor *idesc = &us->pusb_intf->cur_altsetting->desc;
+       struct usb_interface_descriptor *idesc =
+                                       &us->pusb_intf->cur_altsetting->desc;
 
        pr_info("usb --- get_device_info\n");
 
@@ -374,10 +380,13 @@ static int get_pipes(struct us_data *us)
        /* Calculate and store the pipe values */
        us->send_ctrl_pipe = usb_sndctrlpipe(us->pusb_dev, 0);
        us->recv_ctrl_pipe = usb_rcvctrlpipe(us->pusb_dev, 0);
-       us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev, ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
-       us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev, ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+       us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev,
+                       ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+       us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev,
+                       ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
        if (ep_int) {
-               us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev, ep_int->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
+               us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev,
+                       ep_int->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
                us->ep_bInterval = ep_int->bInterval;
        }
        return 0;
@@ -433,10 +442,9 @@ static void dissociate_dev(struct us_data *us)
        kfree(us->sensebuf);
 
        /* Free the device-related DMA-mapped buffers */
-       if (us->cr)
-               usb_free_coherent(us->pusb_dev, sizeof(*us->cr), us->cr, us->cr_dma);
-       if (us->iobuf)
-               usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf, us->iobuf_dma);
+       usb_free_coherent(us->pusb_dev, sizeof(*us->cr), us->cr, us->cr_dma);
+       usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf,
+                         us->iobuf_dma);
 
        /* Remove our private data from the interface */
        usb_set_intfdata(us->pusb_intf, NULL);
@@ -485,7 +493,7 @@ static void release_everything(struct us_data *us)
        scsi_host_put(us_to_host(us));
 }
 
-static int usb_stor_scan_thread(void * __us)
+static int usb_stor_scan_thread(void *__us)
 {
        struct us_data *us = (struct us_data *)__us;
 
@@ -515,7 +523,8 @@ static int usb_stor_scan_thread(void * __us)
        complete_and_exit(&us->scanning_done, 0);
 }
 
-static int eucr_probe(struct usb_interface *intf, const struct usb_device_id *id)
+static int eucr_probe(struct usb_interface *intf,
+                                       const struct usb_device_id *id)
 {
        struct Scsi_Host *host;
        struct us_data *us;
@@ -525,7 +534,7 @@ static int eucr_probe(struct usb_interface *intf, const struct usb_device_id *id
 
        pr_info("usb --- eucr_probe\n");
 
-      host = scsi_host_alloc(&usb_stor_host_template, sizeof(*us));
+       host = scsi_host_alloc(&usb_stor_host_template, sizeof(*us));
        if (!host) {
                pr_info("Unable to allocate the scsi host\n");
                return -ENOMEM;
@@ -585,7 +594,7 @@ static int eucr_probe(struct usb_interface *intf, const struct usb_device_id *id
        wake_up_process(th);
 
        /* probe card type */
-       result = ENE_Read_BYTE(us, REG_CARD_STATUS, &MiscReg03);
+       result = ene_read_byte(us, REG_CARD_STATUS, &MiscReg03);
        if (result != USB_STOR_XFER_GOOD) {
                result = USB_STOR_TRANSPORT_ERROR;
                quiesce_and_remove_host(us);
@@ -595,9 +604,9 @@ static int eucr_probe(struct usb_interface *intf, const struct usb_device_id *id
        if (!(MiscReg03 & 0x02)) {
                result = -ENODEV;
                quiesce_and_remove_host(us);
-               pr_info("keucr: The driver only supports SM/MS card.\
-                       To use SD card, \
-                       please build driver/usb/storage/ums-eneub6250.ko\n");
+               pr_info("keucr: The driver only supports SM/MS card. "
+                       "To use SD card, "
+                       "please build driver/usb/storage/ums-eneub6250.ko\n");
                goto BadDevice;
        }
 
@@ -623,9 +632,9 @@ static void eucr_disconnect(struct usb_interface *intf)
 static struct usb_driver usb_storage_driver = {
        .name =         "eucr",
        .probe =                eucr_probe,
-       .suspend =          eucr_suspend,
+       .suspend =          eucr_suspend,
        .resume =           eucr_resume,
-       .reset_resume = eucr_reset_resume,
+       .reset_resume = eucr_reset_resume,
        .disconnect =   eucr_disconnect,
        .pre_reset =    eucr_pre_reset,
        .post_reset =   eucr_post_reset,
index a5f7a16c11c973e981fd9b16a59f8a3e6127bd79..d665af177b96c2450e5e1257ae9d1ba7786c4ecb 100644 (file)
@@ -1,4 +1,4 @@
-// Driver for USB Mass Storage compliant devices
+/* Driver for USB Mass Storage compliant devices */
 
 #ifndef _USB_H_
 #define _USB_H_
@@ -19,26 +19,26 @@ struct scsi_cmnd;
  */
 
 struct us_unusual_dev {
-       const char* vendorName;
-       const char* productName;
+       const char *vendorName;
+       const char *productName;
        __u8  useProtocol;
        __u8  useTransport;
        int (*initFunction)(struct us_data *);
 };
 
-//EnE HW Register
+/* EnE HW Register */
 #define REG_CARD_STATUS     0xFF83
 #define REG_HW_TRAP1        0xFF89
 
-// SRB Status. Refers /usr/include/wine/wine/wnaspi32.h & SCSI sense key
-#define SS_SUCCESS                  0x00      // No Sense
+/* SRB Status. Refers /usr/include/wine/wine/wnaspi32.h & SCSI sense key */
+#define SS_SUCCESS                  0x00      /* No Sense */
 #define SS_NOT_READY                0x02
 #define SS_MEDIUM_ERR               0x03
 #define SS_HW_ERR                   0x04
 #define SS_ILLEGAL_REQUEST          0x05
 #define SS_UNIT_ATTENTION           0x06
 
-//ENE Load FW Pattern
+/* ENE Load FW Pattern */
 #define SD_INIT1_PATTERN   1
 #define SD_INIT2_PATTERN   2
 #define SD_RW_PATTERN      3
@@ -51,39 +51,40 @@ struct us_unusual_dev {
 #define FDIR_WRITE        0
 #define FDIR_READ         1
 
-typedef struct _SD_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    IsMMC:1;
-    BYTE    HiCapacity:1;
-    BYTE    HiSpeed:1;
-    BYTE    WtP:1;
-    BYTE    Reserved:1;
-} SD_STATUS, *PSD_STATUS;
-
-typedef struct _MS_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    IsMSPro:1;
-    BYTE    IsMSPHG:1;
-    BYTE    Reserved1:1;
-    BYTE    WtP:1;
-    BYTE    Reserved2:1;
-} MS_STATUS, *PMS_STATUS;
-
-typedef struct _SM_STATUS {
-    BYTE    Insert:1;
-    BYTE    Ready:1;
-    BYTE    MediaChange:1;
-    BYTE    Reserved:3;
-    BYTE    WtP:1;
-    BYTE    IsMS:1;
-} SM_STATUS, *PSM_STATUS;
-
-// SD Block Length
-#define SD_BLOCK_LEN                            9       // 2^9 = 512 Bytes, The HW maximum read/write data length
+struct keucr_sd_status {
+       BYTE    Insert:1;
+       BYTE    Ready:1;
+       BYTE    MediaChange:1;
+       BYTE    IsMMC:1;
+       BYTE    HiCapacity:1;
+       BYTE    HiSpeed:1;
+       BYTE    WtP:1;
+       BYTE    Reserved:1;
+};
+
+struct keucr_ms_status {
+       BYTE    Insert:1;
+       BYTE    Ready:1;
+       BYTE    MediaChange:1;
+       BYTE    IsMSPro:1;
+       BYTE    IsMSPHG:1;
+       BYTE    Reserved1:1;
+       BYTE    WtP:1;
+       BYTE    Reserved2:1;
+};
+
+struct keucr_sm_status {
+       BYTE    Insert:1;
+       BYTE    Ready:1;
+       BYTE    MediaChange:1;
+       BYTE    Reserved:3;
+       BYTE    WtP:1;
+       BYTE    IsMS:1;
+};
+
+/* SD Block Length */
+#define SD_BLOCK_LEN           9       /* 2^9 = 512 bytes, the HW
+                                        * maximum read/write data length */
 
 /* Dynamic bitflag definitions (us->dflags): used in set_bit() etc. */
 #define US_FLIDX_URB_ACTIVE    0       /* current_urb is in use    */
@@ -107,9 +108,9 @@ typedef struct _SM_STATUS {
 #define US_IOBUF_SIZE          64      /* Size of the DMA-mapped I/O buffer */
 #define US_SENSE_SIZE          18      /* Size of the autosense data buffer */
 
-typedef int (*trans_cmnd)(struct scsi_cmnd *, struct us_data*);
-typedef int (*trans_reset)(struct us_data*);
-typedef void (*proto_cmnd)(struct scsi_cmnd*, struct us_data*);
+typedef int (*trans_cmnd)(struct scsi_cmnd *, struct us_data *);
+typedef int (*trans_reset)(struct us_data *);
+typedef void (*proto_cmnd)(struct scsi_cmnd *, struct us_data *);
 typedef void (*extra_data_destructor)(void *); /* extra data destructor */
 typedef void (*pm_hook)(struct us_data *, int);        /* power management hook */
 
@@ -176,19 +177,19 @@ struct us_data {
 #ifdef CONFIG_PM
        pm_hook                 suspend_resume_hook;
 #endif
-       // for 6250 code
-       SD_STATUS   SD_Status;
-       MS_STATUS   MS_Status;
-       SM_STATUS   SM_Status;
+       /* for 6250 code */
+       struct keucr_sd_status   SD_Status;
+       struct keucr_ms_status   MS_Status;
+       struct keucr_sm_status   SM_Status;
 
-       //----- SD Control Data ----------------
-       //SD_REGISTER SD_Regs;
+       /* ----- SD Control Data ---------------- */
+       /* SD_REGISTER SD_Regs; */
        WORD        SD_Block_Mult;
        BYTE        SD_READ_BL_LEN;
        WORD        SD_C_SIZE;
        BYTE        SD_C_SIZE_MULT;
 
-       // SD/MMC New spec.
+       /* SD/MMC New spec. */
        BYTE        SD_SPEC_VER;
        BYTE        SD_CSD_VER;
        BYTE        SD20_HIGH_CAPACITY;
@@ -196,15 +197,15 @@ struct us_data {
        BYTE        MMC_SPEC_VER;
        BYTE        MMC_BusWidth;
        BYTE        MMC_HIGH_CAPACITY;
-       
-       //----- MS Control Data ----------------
+
+       /* ----- MS Control Data ---------------- */
        BOOLEAN             MS_SWWP;
        DWORD               MSP_TotalBlock;
        /* MS_LibControl       MS_Lib; */
        BOOLEAN             MS_IsRWPage;
        WORD                MS_Model;
 
-       //----- SM Control Data ----------------
+       /* ----- SM Control Data ---------------- */
        BYTE            SM_DeviceID;
        BYTE            SM_CardID;
 
@@ -212,16 +213,18 @@ struct us_data {
        BYTE            BIN_FLAG;
        DWORD           bl_num;
        int             SrbStatus;
-       
-       //------Power Managerment ---------------
-       BOOLEAN         Power_IsResum;  
+
+       /* ------ Power Management --------------- */
+       BOOLEAN         Power_IsResum;
 };
 
 /* Convert between us_data and the corresponding Scsi_Host */
-static inline struct Scsi_Host *us_to_host(struct us_data *us) {
+static inline struct Scsi_Host *us_to_host(struct us_data *us)
+{
        return container_of((void *) us, struct Scsi_Host, hostdata);
 }
-static inline struct us_data *host_to_us(struct Scsi_Host *host) {
+static inline struct us_data *host_to_us(struct Scsi_Host *host)
+{
        return (struct us_data *) host->hostdata;
 }
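
For context on the hunks above: the struct keucr_*_status types replace the old PSM_STATUS-style typedefs, and callers such as eucr_resume() reinterpret a raw status byte through them. A hedged sketch of that pattern (tmp stands in for the byte read back from the device; pr_info calls are illustrative):

	BYTE tmp = 0;                       /* byte read back from the device */
	struct keucr_sm_status status;

	/* reinterpret the raw byte through the bitfield layout */
	status = *(struct keucr_sm_status *)&tmp;

	if (status.Insert && status.Ready)
		pr_info("SM card present and ready\n");
	if (status.WtP)
		pr_info("SM card is write-protected\n");

Note that this overlay assumes the compiler packs the bitfields in the device's bit order, which is part of why such register typedefs are discouraged in mainline.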
 
diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig
new file mode 100644 (file)
index 0000000..a224d88
--- /dev/null
@@ -0,0 +1,3 @@
+source "drivers/staging/lustre/lustre/Kconfig"
+
+source "drivers/staging/lustre/lnet/Kconfig"
diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile
new file mode 100644 (file)
index 0000000..2616289
--- /dev/null
@@ -0,0 +1,4 @@
+subdir-ccflags-y := -I$(src)/include/
+
+obj-$(CONFIG_LUSTRE_FS)                += lustre/
+obj-$(CONFIG_LNET)             += lnet/
diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO
new file mode 100644 (file)
index 0000000..22742d6
--- /dev/null
@@ -0,0 +1,13 @@
+* Fix any remaining coding style issues.
+* Remove dead code.
+* Separate client/server functionality. Functions only used by the server can
+  be removed from the client.
+* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely.
+* Clean up CLIO layer. Lustre client readahead/writeback control needs to
+  better fit what the kernel provides.
+* Add documentation under Documentation/.
+* Other minor misc cleanups...
+
+Please send any patches to Greg Kroah-Hartman <greg@kroah.com>, Andreas Dilger
+<andreas.dilger@intel.com> and Peng Tao <tao.peng@emc.com>. CCing
+hpdd-discuss <hpdd-discuss@lists.01.org> would be great too.
diff --git a/drivers/staging/lustre/include/linux/libcfs/bitmap.h b/drivers/staging/lustre/include/linux/libcfs/bitmap.h
new file mode 100644 (file)
index 0000000..3f1c37b
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LIBCFS_BITMAP_H_
+#define _LIBCFS_BITMAP_H_
+
+
+typedef struct {
+       int          size;
+       unsigned long   data[0];
+} cfs_bitmap_t;
+
+#define CFS_BITMAP_SIZE(nbits) \
+     (((nbits/BITS_PER_LONG)+1)*sizeof(long)+sizeof(cfs_bitmap_t))
+
+static inline
+cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size)
+{
+       cfs_bitmap_t *ptr;
+
+       OBD_ALLOC(ptr, CFS_BITMAP_SIZE(size));
+       if (ptr == NULL)
+               RETURN(ptr);
+
+       ptr->size = size;
+
+       RETURN (ptr);
+}
+
+#define CFS_FREE_BITMAP(ptr)   OBD_FREE(ptr, CFS_BITMAP_SIZE(ptr->size))
+
+static inline
+void cfs_bitmap_set(cfs_bitmap_t *bitmap, int nbit)
+{
+       set_bit(nbit, bitmap->data);
+}
+
+static inline
+void cfs_bitmap_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+       test_and_clear_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_check(cfs_bitmap_t *bitmap, int nbit)
+{
+       return test_bit(nbit, bitmap->data);
+}
+
+static inline
+int cfs_bitmap_test_and_clear(cfs_bitmap_t *bitmap, int nbit)
+{
+       return test_and_clear_bit(nbit, bitmap->data);
+}
+
+/* return non-zero if the bitmap has no bits set */
+static inline
+int cfs_bitmap_check_empty(cfs_bitmap_t *bitmap)
+{
+       return find_first_bit(bitmap->data, bitmap->size) == bitmap->size;
+}
+
+static inline
+void cfs_bitmap_copy(cfs_bitmap_t *new, cfs_bitmap_t *old)
+{
+       int newsize;
+
+       LASSERT(new->size >= old->size);
+       newsize = new->size;
+       memcpy(new, old, CFS_BITMAP_SIZE(old->size));
+       new->size = newsize;
+}
+
+#define cfs_foreach_bit(bitmap, pos)                                   \
+       for ((pos) = find_first_bit((bitmap)->data, (bitmap)->size);    \
+            (pos) < (bitmap)->size;                                    \
+            (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1))
+
+#endif
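
A short usage sketch of the bitmap API above, assuming the OBD_ALLOC/RETURN machinery from other Lustre headers is in scope; pr_info stands in for whatever logging the caller uses:

	cfs_bitmap_t *bm = CFS_ALLOCATE_BITMAP(128);
	int pos;

	if (bm == NULL)
		return -ENOMEM;

	cfs_bitmap_set(bm, 3);              /* mark bit 3 */
	cfs_bitmap_set(bm, 64);             /* lands in data[1] on 64-bit */

	cfs_foreach_bit(bm, pos)            /* visits 3, then 64 */
		pr_info("bit %d is set\n", pos);

	if (cfs_bitmap_test_and_clear(bm, 3))
		pr_info("bit 3 was set, now cleared\n");

	CFS_FREE_BITMAP(bm);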
diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h
new file mode 100644 (file)
index 0000000..90d7ce6
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/curproc.h
+ *
+ * Lustre curproc API declaration
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_CURPROC_H__
+#define __LIBCFS_CURPROC_H__
+
+/*
+ * Portable API to access common characteristics of "current" UNIX process.
+ *
+ * Implemented in portals/include/libcfs/<os>/
+ */
+int    cfs_curproc_groups_nr(void);
+int    current_is_in_group(gid_t group);
+void   cfs_curproc_groups_dump(gid_t *array, int size);
+
+/*
+ * Plus, platform-specific constant
+ *
+ * CFS_CURPROC_COMM_MAX,
+ *
+ * and opaque scalar type
+ *
+ * kernel_cap_t
+ */
+
+/* Check if the task is running in compat mode. */
+int current_is_32bit(void);
+#define current_pid()          (current->pid)
+#define current_comm()         (current->comm)
+int cfs_get_environ(const char *key, char *value, int *val_len);
+
+typedef __u32 cfs_cap_t;
+
+#define CFS_CAP_CHOWN             0
+#define CFS_CAP_DAC_OVERRIDE       1
+#define CFS_CAP_DAC_READ_SEARCH         2
+#define CFS_CAP_FOWNER           3
+#define CFS_CAP_FSETID           4
+#define CFS_CAP_LINUX_IMMUTABLE         9
+#define CFS_CAP_SYS_ADMIN            21
+#define CFS_CAP_SYS_BOOT              23
+#define CFS_CAP_SYS_RESOURCE      24
+
+#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) |                 \
+                        (1 << CFS_CAP_DAC_OVERRIDE) |    \
+                        (1 << CFS_CAP_DAC_READ_SEARCH) |       \
+                        (1 << CFS_CAP_FOWNER) |                \
+                        (1 << CFS_CAP_FSETID) |                \
+                        (1 << CFS_CAP_LINUX_IMMUTABLE) |       \
+                        (1 << CFS_CAP_SYS_ADMIN) |          \
+                        (1 << CFS_CAP_SYS_BOOT) |            \
+                        (1 << CFS_CAP_SYS_RESOURCE))
+
+void cfs_cap_raise(cfs_cap_t cap);
+void cfs_cap_lower(cfs_cap_t cap);
+int cfs_cap_raised(cfs_cap_t cap);
+cfs_cap_t cfs_curproc_cap_pack(void);
+void cfs_curproc_cap_unpack(cfs_cap_t cap);
+int cfs_capable(cfs_cap_t cap);
+
+/* __LIBCFS_CURPROC_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
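
A brief sketch of how the capability helpers above are typically used; the privileged-operation framing is hypothetical:

	cfs_cap_t cap;

	/* refuse a privileged operation unless the caller has CAP_SYS_ADMIN */
	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		return -EPERM;

	/* pack the current capabilities, e.g. to carry in an RPC credential */
	cap = cfs_curproc_cap_pack();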
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h
new file mode 100644 (file)
index 0000000..1ab1f2b
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LIBCFS_H__
+#define __LIBCFS_LIBCFS_H__
+
+#if !__GNUC__
+#define __attribute__(x)
+#endif
+
+#include <linux/libcfs/linux/libcfs.h>
+
+#include "curproc.h"
+
+#ifndef offsetof
+# define offsetof(typ,memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb)))
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) ((sizeof (a)) / (sizeof ((a)[0])))
+#endif
+
+#if !defined(swap)
+#define swap(x,y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#if !defined(container_of)
+/* given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type. */
+#define container_of(ptr, type, member) \
+       ((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+#endif
+
+static inline int __is_po2(unsigned long long val)
+{
+       return !(val & (val - 1));
+}
+
+#define IS_PO2(val) __is_po2((unsigned long long)(val))
+
+#define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
+
+/*
+ * Lustre Error Checksum: calculates checksum
+ * of Hex number by XORing each bit.
+ */
+#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \
+                          ((hexnum) >> 8 & 0xf))
+
+
+/*
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
+ */
+#if defined(NULL)
+#undef NULL
+#endif
+
+#define NULL ((void *)0)
+
+#define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
+
+
+#include <linux/list.h>
+
+#ifndef cfs_for_each_possible_cpu
+#  error cfs_for_each_possible_cpu is not supported by kernel!
+#endif
+
+/* libcfs tcpip */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(socket_t **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(socket_t **newsockp, socket_t *sock);
+void libcfs_sock_abort_accept(socket_t *sock);
+int libcfs_sock_connect(socket_t **sockp, int *fatal,
+                       __u32 local_ip, int local_port,
+                       __u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(socket_t *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(socket_t *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_getaddr(socket_t *socket, int remote, __u32 *ip, int *port);
+int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(socket_t *sock);
+
+/* libcfs watchdogs */
+struct lc_watchdog;
+
+/* Add a watchdog which fires after "time" milliseconds of delay.  You have to
+ * touch it once to enable it. */
+struct lc_watchdog *lc_watchdog_add(int time,
+                                   void (*cb)(pid_t pid, void *),
+                                   void *data);
+
+/* Enables a watchdog and resets its timer. */
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout);
+#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout,             \
+                         AT_OFF ? 0 : at_get(&svc->srv_at_estimate)) * \
+                         svc->srv_watchdog_factor)
+
+/* Disable a watchdog; touch it to restart it. */
+void lc_watchdog_disable(struct lc_watchdog *lcw);
+
+/* Clean up the watchdog */
+void lc_watchdog_delete(struct lc_watchdog *lcw);
+
+/* Dump a debug log */
+void lc_watchdog_dumplog(pid_t pid, void *data);
+
+
+/* need both kernel and user-land acceptor */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT    512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT    1023
+
+/*
+ * libcfs pseudo device operations
+ *
+ * struct psdev_t and
+ * misc_register() and
+ * misc_deregister() are declared in
+ * libcfs/<os>/<os>-prim.h
+ *
+ * This is just a draft for now.
+ */
+
+struct cfs_psdev_file {
+       unsigned long   off;
+       void        *private_data;
+       unsigned long   reserved1;
+       unsigned long   reserved2;
+};
+
+struct cfs_psdev_ops {
+       int (*p_open)(unsigned long, void *);
+       int (*p_close)(unsigned long, void *);
+       int (*p_read)(struct cfs_psdev_file *, char *, unsigned long);
+       int (*p_write)(struct cfs_psdev_file *, char *, unsigned long);
+       int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *);
+};
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+int unshare_fs_struct(void);
+sigset_t cfs_get_blocked_sigs(void);
+sigset_t cfs_block_allsigs(void);
+sigset_t cfs_block_sigs(unsigned long sigs);
+sigset_t cfs_block_sigsinv(unsigned long sigs);
+void cfs_restore_sigs(sigset_t);
+int cfs_signal_pending(void);
+void cfs_clear_sigpending(void);
+
+/*
+ * Random number handling
+ */
+
+/* returns a random 32-bit integer */
+unsigned int cfs_rand(void);
+/* seed the generator */
+void cfs_srand(unsigned int, unsigned int);
+void cfs_get_random_bytes(void *buf, int size);
+
+#include <linux/libcfs/libcfs_debug.h>
+#include <linux/libcfs/libcfs_cpu.h>
+#include <linux/libcfs/libcfs_private.h>
+#include <linux/libcfs/libcfs_ioctl.h>
+#include <linux/libcfs/libcfs_prim.h>
+#include <linux/libcfs/libcfs_time.h>
+#include <linux/libcfs/libcfs_string.h>
+#include <linux/libcfs/libcfs_kernelcomm.h>
+#include <linux/libcfs/libcfs_workitem.h>
+#include <linux/libcfs/libcfs_hash.h>
+#include <linux/libcfs/libcfs_heap.h>
+#include <linux/libcfs/libcfs_fail.h>
+#include <linux/libcfs/params_tree.h>
+#include <linux/libcfs/libcfs_crypto.h>
+
+/* container_of depends on "likely" which is defined in libcfs_private.h */
+static inline void *__container_of(void *ptr, unsigned long shift)
+{
+       if (unlikely(IS_ERR(ptr) || ptr == NULL))
+               return ptr;
+       else
+               return (char *)ptr - shift;
+}
+
+#define container_of0(ptr, type, member) \
+       ((type *)__container_of((void *)(ptr), offsetof(type, member)))
+
+#define SET_BUT_UNUSED(a) do { } while(sizeof(a) - sizeof(a))
+
+#define _LIBCFS_H
+
+#endif /* _LIBCFS_H */
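
A hedged sketch of the lc_watchdog interface declared above; the callback body and the 30000 ms value are illustrative, following the header's comment that the delay is in milliseconds:

	static void my_wd_cb(pid_t pid, void *data)
	{
		/* fires if the watched thread stalls; dump a debug log */
		lc_watchdog_dumplog(pid, data);
	}

	/* in the service thread: */
	struct lc_watchdog *lcw;

	lcw = lc_watchdog_add(30000, my_wd_cb, NULL);
	lc_watchdog_touch(lcw, 30000);      /* enable and reset the timer */
	/* ... do one unit of work ... */
	lc_watchdog_disable(lcw);           /* quiesce; touch to restart */
	lc_watchdog_delete(lcw);            /* clean up */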
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h
new file mode 100644 (file)
index 0000000..6ae7415
--- /dev/null
@@ -0,0 +1,214 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_cpu.h
+ *
+ * CPU partition
+ *   . a CPU partition is a virtual processing unit
+ *
+ *   . CPU partition can present 1-N cores, or 1-N NUMA nodes,
+ *     in other words, CPU partition is a processors pool.
+ *
+ * CPU Partition Table (CPT)
+ *   . a set of CPU partitions
+ *
+ *   . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP
+ *
+ *   . Users can specify the total number of CPU partitions while creating a
+ *     CPT; CPU partition IDs always start from 0.
+ *
+ *     Example: if there are 8 cores on the system, while creating a CPT
+ *     with cpu_npartitions=4:
+ *           core[0, 1] = partition[0], core[2, 3] = partition[1]
+ *           core[4, 5] = partition[2], core[6, 7] = partition[3]
+ *
+ *       cpu_npartitions=1:
+ *           core[0, 1, ... 7] = partition[0]
+ *
+ *   . User can also specify CPU partitions by string pattern
+ *
+ *     Examples: cpu_partitions="0[0,1], 1[2,3]"
+ *            cpu_partitions="N 0[0-3], 1[4-8]"
+ *
+ *     The first character "N" means the following numbers are NUMA IDs
+ *
+ *   . NUMA allocators, CPU affinity threads are built over CPU partitions,
+ *     instead of HW CPUs or HW nodes.
+ *
+ *   . By default, Lustre modules should refer to the global cfs_cpt_table,
+ *     instead of accessing HW CPUs directly, so concurrency of Lustre can be
+ *     configured by cpu_npartitions of the global cfs_cpt_table
+ *
+ *   . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the
+ *     same way as 2.2 or earlier versions
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_CPU_H__
+#define __LIBCFS_CPU_H__
+
+#ifndef HAVE_LIBCFS_CPT
+
+typedef unsigned long          cpumask_t;
+typedef unsigned long          nodemask_t;
+
+struct cfs_cpt_table {
+       /* # of CPU partitions */
+       int                     ctb_nparts;
+       /* cpu mask */
+       cpumask_t               ctb_mask;
+       /* node mask */
+       nodemask_t              ctb_nodemask;
+       /* version */
+       __u64                   ctb_version;
+};
+
+#endif /* !HAVE_LIBCFS_CPT */
+
+/* any CPU partition */
+#define CFS_CPT_ANY            (-1)
+
+extern struct cfs_cpt_table    *cfs_cpt_table;
+
+/**
+ * destroy a CPU partition table
+ */
+void cfs_cpt_table_free(struct cfs_cpt_table *cptab);
+/**
+ * create a cfs_cpt_table with \a ncpt number of partitions
+ */
+struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt);
+/**
+ * print string information of cpt-table
+ */
+int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len);
+/**
+ * return total number of CPU partitions in \a cptab
+ */
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab);
+/**
+ * return number of HW cores or hyper-threads in a CPU partition \a cpt
+ */
+int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * is there any online CPU in CPU partition \a cpt
+ */
+int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return cpumask of CPU partition \a cpt
+ */
+cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * return nodemask of CPU partition \a cpt
+ */
+nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * shadow current HW processor ID to CPU-partition ID of \a cptab
+ */
+int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap);
+/**
+ * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab
+ */
+int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu);
+/**
+ * bind current thread on a CPU-partition \a cpt of \a cptab
+ */
+int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * add \a cpu to CPU partition \a cpt of \a cptab, return 1 for success,
+ * otherwise 0 is returned
+ */
+int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * remove \a cpu from CPU partition \a cpt of \a cptab
+ */
+void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu);
+/**
+ * add all cpus in \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab,
+                       int cpt, cpumask_t *mask);
+/**
+ * remove all cpus in \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab,
+                          int cpt, cpumask_t *mask);
+/**
+ * add all cpus in NUMA node \a node to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node);
+/**
+ * remove all cpus in NUMA node \a node from CPU partition \a cpt
+ */
+void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node);
+
+/**
+ * add all cpus in node mask \a mask to CPU partition \a cpt
+ * return 1 if successfully set all CPUs, otherwise return 0
+ */
+int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab,
+                        int cpt, nodemask_t *mask);
+/**
+ * remove all cpus in node mask \a mask from CPU partition \a cpt
+ */
+void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab,
+                           int cpt, nodemask_t *mask);
+/**
+ * unset all cpus for CPU partition \a cpt
+ */
+void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt);
+/**
+ * convert partition id \a cpt to NUMA node id; if there is more than one
+ * node in this partition, it might return a different node id each time.
+ */
+int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt);
+
+/**
+ * iterate over all CPU partitions in \a cptab
+ */
+#define cfs_cpt_for_each(i, cptab)     \
+       for (i = 0; i < cfs_cpt_number(cptab); i++)
+
+#ifndef __read_mostly
+# define __read_mostly
+#endif
+
+#ifndef ____cacheline_aligned
+#define ____cacheline_aligned
+#endif
+
+int  cfs_cpu_init(void);
+void cfs_cpu_fini(void);
+
+#endif /* __LIBCFS_CPU_H__ */
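
A small usage sketch of the CPT API above; CDEBUG assumes DEBUG_SUBSYSTEM is defined by the including file, and the partition count is illustrative:

	struct cfs_cpt_table *cptab;
	int cpt;

	cptab = cfs_cpt_table_alloc(4);     /* four CPU partitions */
	if (cptab == NULL)
		return -ENOMEM;

	cfs_cpt_for_each(cpt, cptab)        /* iterate partitions 0..3 */
		CDEBUG(D_INFO, "partition %d: %d CPUs\n",
		       cpt, cfs_cpt_weight(cptab, cpt));

	cfs_cpt_bind(cptab, 0);             /* bind this thread to partition 0 */
	cfs_cpt_table_free(cptab);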
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h
new file mode 100644 (file)
index 0000000..64ca62f
--- /dev/null
@@ -0,0 +1,201 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+#ifndef _LIBCFS_CRYPTO_H
+#define _LIBCFS_CRYPTO_H
+
+struct cfs_crypto_hash_type {
+       char            *cht_name;      /**< hash algorithm name, equal to
+                                        * format name for crypto api */
+       unsigned int    cht_key;        /**< initial key by default (valid for
+                                        * 4-byte contexts like crc32, adler32) */
+       unsigned int    cht_size;       /**< hash digest size */
+};
+
+enum cfs_crypto_hash_alg {
+       CFS_HASH_ALG_NULL       = 0,
+       CFS_HASH_ALG_ADLER32,
+       CFS_HASH_ALG_CRC32,
+       CFS_HASH_ALG_MD5,
+       CFS_HASH_ALG_SHA1,
+       CFS_HASH_ALG_SHA256,
+       CFS_HASH_ALG_SHA384,
+       CFS_HASH_ALG_SHA512,
+       CFS_HASH_ALG_CRC32C,
+       CFS_HASH_ALG_MAX
+};
+
+static struct cfs_crypto_hash_type hash_types[] = {
+       [CFS_HASH_ALG_NULL]    = { "null",     0,      0 },
+       [CFS_HASH_ALG_ADLER32] = { "adler32",  1,      4 },
+       [CFS_HASH_ALG_CRC32]   = { "crc32",   ~0,      4 },
+       [CFS_HASH_ALG_CRC32C]  = { "crc32c",  ~0,      4 },
+       [CFS_HASH_ALG_MD5]     = { "md5",      0,     16 },
+       [CFS_HASH_ALG_SHA1]    = { "sha1",     0,     20 },
+       [CFS_HASH_ALG_SHA256]  = { "sha256",   0,     32 },
+       [CFS_HASH_ALG_SHA384]  = { "sha384",   0,     48 },
+       [CFS_HASH_ALG_SHA512]  = { "sha512",   0,     64 },
+};
+
+/**    Return pointer to type of hash for valid hash algorithm identifier */
+static inline const struct cfs_crypto_hash_type *
+                   cfs_crypto_hash_type(unsigned char hash_alg)
+{
+       struct cfs_crypto_hash_type *ht;
+
+       if (hash_alg < CFS_HASH_ALG_MAX) {
+               ht = &hash_types[hash_alg];
+               if (ht->cht_name)
+                       return ht;
+       }
+       return NULL;
+}
+
+/**     Return hash name for valid hash algorithm identifier or "unknown" */
+static inline const char *cfs_crypto_hash_name(unsigned char hash_alg)
+{
+       const struct cfs_crypto_hash_type *ht;
+
+       ht = cfs_crypto_hash_type(hash_alg);
+       if (ht)
+               return ht->cht_name;
+       else
+               return "unknown";
+}
+
+/**     Return digest size for valid algorithm identifier or 0 */
+static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg)
+{
+       const struct cfs_crypto_hash_type *ht;
+
+       ht = cfs_crypto_hash_type(hash_alg);
+       if (ht)
+               return ht->cht_size;
+       else
+               return 0;
+}
+
+/**     Return hash identifier for valid hash algorithm name or 0xFF */
+static inline unsigned char cfs_crypto_hash_alg(const char *algname)
+{
+       unsigned char   i;
+
+       for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+               if (!strcmp(hash_types[i].cht_name, algname))
+                       break;
+       return (i == CFS_HASH_ALG_MAX ? 0xFF : i);
+}
+
+/**    Calculate hash digest for buffer.
+ *     @param alg          id of hash algorithm
+ *     @param buf          buffer of data
+ *     @param buf_len      buffer length
+ *     @param key          initial value for algorithm; if it is NULL,
+ *                         the default initial value is used
+ *     @param key_len      length of initial value
+ *     @param hash         [out] pointer to hash; if it is NULL, hash_len
+ *                         is set to the valid digest size in bytes and
+ *                         -ENOSPC is returned
+ *     @param hash_len     [in,out] size of hash buffer
+ *     @returns            status of operation
+ *     @retval -EINVAL     if buf, buf_len, hash_len or alg_id is invalid
+ *     @retval -ENODEV     if this algorithm is unsupported
+ *     @retval -ENOSPC     if pointer to hash is NULL, or hash_len is less
+ *                         than the digest size
+ *     @retval 0           for success
+ *     @retval < 0         other errors from lower layers
+ */
+int cfs_crypto_hash_digest(unsigned char alg,
+                          const void *buf, unsigned int buf_len,
+                          unsigned char *key, unsigned int key_len,
+                          unsigned char *hash, unsigned int *hash_len);
+
+/* cfs crypto hash descriptor */
+struct cfs_crypto_hash_desc;
+
+/**    Allocate and initialize descriptor for hash algorithm.
+ *     @param alg          algorithm id
+ *     @param key          initial value for algorithm; if it is NULL,
+ *                         the default initial value is used
+ *     @param key_len      length of initial value
+ *     @returns            pointer to descriptor of hash instance
+ *     @retval ERR_PTR(error) when errors occurred
+ */
+struct cfs_crypto_hash_desc*
+       cfs_crypto_hash_init(unsigned char alg,
+                            unsigned char *key, unsigned int key_len);
+
+/**    Update digest by part of data.
+ *     @param desc         hash descriptor
+ *     @param page         data page
+ *     @param offset       data offset
+ *     @param len          data length
+ *     @returns            status of operation
+ *     @retval 0           for success
+ */
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc,
+                               struct page *page, unsigned int offset,
+                               unsigned int len);
+
+/**    Update digest by part of data.
+ *     @param desc         hash descriptor
+ *     @param buf          pointer to data buffer
+ *     @param buf_len      size of data at buffer
+ *     @returns            status of operation
+ *     @retval 0           for success
+ */
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf,
+                          unsigned int buf_len);
+
+/**    Finalize hash calculation, copy hash digest to buffer, destroy hash
+ *     descriptor.
+ *     @param desc         hash descriptor
+ *     @param hash         buffer pointer to store hash digest
+ *     @param hash_len     pointer to hash buffer size; if NULL, only
+ *                         destroy the hash descriptor
+ *     @returns            status of operation
+ *     @retval -ENOSPC     if hash is NULL, or *hash_len is less than
+ *                         the digest size
+ *     @retval 0           for success
+ *     @retval < 0         other errors from lower layers
+ */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc,
+                         unsigned char *hash, unsigned int *hash_len);
+/**
+ *      Register crypto hash algorithms
+ */
+int cfs_crypto_register(void);
+
+/**
+ *      Unregister
+ */
+void cfs_crypto_unregister(void);
+
+/**    Return hash speed in Mbytes per second for a valid hash algorithm
+ *     identifier. If the test was unsuccessful, -1 is returned.
+ */
+int cfs_crypto_hash_speed(unsigned char hash_alg);
+#endif
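
A hedged sketch of the two ways to drive the hash API above; buf and buf_len are assumed to be caller-provided:

	struct cfs_crypto_hash_desc *desc;
	unsigned char hash[64];
	unsigned int hash_len = sizeof(hash);
	int rc;

	/* one-shot digest; a NULL key selects the algorithm's default seed */
	rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, buf, buf_len,
				    NULL, 0, hash, &hash_len);

	/* incremental variant for data that arrives in pieces */
	desc = cfs_crypto_hash_init(CFS_HASH_ALG_SHA256, NULL, 0);
	if (!IS_ERR(desc)) {
		cfs_crypto_hash_update(desc, buf, buf_len);
		rc = cfs_crypto_hash_final(desc, hash, &hash_len);
	}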
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h
new file mode 100644 (file)
index 0000000..dd8ac2f
--- /dev/null
@@ -0,0 +1,350 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_debug.h
+ *
+ * Debug messages and assertions
+ *
+ */
+
+#ifndef __LIBCFS_DEBUG_H__
+#define __LIBCFS_DEBUG_H__
+
+/*
+ *  Debugging
+ */
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_stack;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_watchdog_ratelimit;
+extern unsigned int libcfs_console_max_delay;
+extern unsigned int libcfs_console_min_delay;
+extern unsigned int libcfs_console_backoff;
+extern unsigned int libcfs_debug_binary;
+extern char libcfs_debug_file_path_arr[PATH_MAX];
+
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys);
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys);
+
+/* Has there been an LBUG? */
+extern unsigned int libcfs_catastrophe;
+extern unsigned int libcfs_panic_on_lbug;
+
+/**
+ * Format for debug message headers
+ */
+struct ptldebug_header {
+       __u32 ph_len;
+       __u32 ph_flags;
+       __u32 ph_subsys;
+       __u32 ph_mask;
+       __u16 ph_cpu_id;
+       __u16 ph_type;
+       __u32 ph_sec;
+       __u64 ph_usec;
+       __u32 ph_stack;
+       __u32 ph_pid;
+       __u32 ph_extern_pid;
+       __u32 ph_line_num;
+} __attribute__((packed));
+
+
+#define PH_FLAG_FIRST_RECORD 1
+
+/* Debugging subsystems (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define S_UNDEFINED   0x00000001
+#define S_MDC   0x00000002
+#define S_MDS   0x00000004
+#define S_OSC   0x00000008
+#define S_OST   0x00000010
+#define S_CLASS       0x00000020
+#define S_LOG   0x00000040
+#define S_LLITE       0x00000080
+#define S_RPC   0x00000100
+#define S_MGMT 0x00000200
+#define S_LNET 0x00000400
+#define S_LND   0x00000800 /* ALL LNDs */
+#define S_PINGER      0x00001000
+#define S_FILTER      0x00002000
+/* unused */
+#define S_ECHO 0x00008000
+#define S_LDLM 0x00010000
+#define S_LOV   0x00020000
+#define S_LQUOTA      0x00040000
+#define S_OSD          0x00080000
+/* unused */
+/* unused */
+/* unused */
+#define S_LMV   0x00800000 /* b_new_cmd */
+/* unused */
+#define S_SEC   0x02000000 /* upcall cache */
+#define S_GSS   0x04000000 /* b_new_cmd */
+/* unused */
+#define S_MGC   0x10000000
+#define S_MGS   0x20000000
+#define S_FID   0x40000000 /* b_new_cmd */
+#define S_FLD   0x80000000 /* b_new_cmd */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+
+/* Debugging masks (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
+#define D_TRACE       0x00000001 /* ENTRY/EXIT markers */
+#define D_INODE       0x00000002
+#define D_SUPER       0x00000004
+#define D_EXT2 0x00000008 /* anything from ext2_debug */
+#define D_MALLOC      0x00000010 /* print malloc, free information */
+#define D_CACHE       0x00000020 /* cache-related items */
+#define D_INFO 0x00000040 /* general information */
+#define D_IOCTL       0x00000080 /* ioctl related information */
+#define D_NETERROR    0x00000100 /* network errors */
+#define D_NET   0x00000200 /* network communications */
+#define D_WARNING     0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */
+#define D_BUFFS       0x00000800
+#define D_OTHER       0x00001000
+#define D_DENTRY      0x00002000
+#define D_NETTRACE    0x00004000
+#define D_PAGE 0x00008000 /* bulk page handling */
+#define D_DLMTRACE    0x00010000
+#define D_ERROR       0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG       0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA     0x00080000 /* recovery and failover */
+#define D_RPCTRACE    0x00100000 /* for distributed debugging */
+#define D_VFSTRACE    0x00200000
+#define D_READA       0x00400000 /* read-ahead */
+#define D_MMAP 0x00800000
+#define D_CONFIG      0x01000000
+#define D_CONSOLE     0x02000000
+#define D_QUOTA       0x04000000
+#define D_SEC   0x08000000
+#define D_LFSCK              0x10000000 /* For both OI scrub and LFSCK */
+/* keep these in sync with lnet/{utils,libcfs}/debug.c */
+
+#define D_HSM   D_TRACE
+
+#define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600))        /* jiffies */
+#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */
+#define CDEBUG_DEFAULT_BACKOFF   2
+typedef struct {
+       cfs_time_t      cdls_next;
+       unsigned int    cdls_delay;
+       int          cdls_count;
+} cfs_debug_limit_state_t;
+
+struct libcfs_debug_msg_data {
+       const char             *msg_file;
+       const char             *msg_fn;
+       int                   msg_subsys;
+       int                   msg_line;
+       int                   msg_mask;
+       cfs_debug_limit_state_t  *msg_cdls;
+};
+
+#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls)   \
+do {                                                   \
+       (data)->msg_subsys = DEBUG_SUBSYSTEM;          \
+       (data)->msg_file   = __FILE__;                \
+       (data)->msg_fn     = __FUNCTION__;                \
+       (data)->msg_line   = __LINE__;                \
+       (data)->msg_cdls   = (cdls);                    \
+       (data)->msg_mask   = (mask);                    \
+} while (0)
+
+#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls)    \
+       static struct libcfs_debug_msg_data dataname = {    \
+              .msg_subsys = DEBUG_SUBSYSTEM,          \
+              .msg_file   = __FILE__,                \
+              .msg_fn     = __FUNCTION__,                \
+              .msg_line   = __LINE__,                \
+              .msg_cdls   = (cdls)      };           \
+       dataname.msg_mask   = (mask);
+
+
+
+/**
+ * Filters out logging messages based on mask and subsystem.
+ */
+static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem)
+{
+       return mask & D_CANTMASK ||
+               ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem));
+}
+
+#define __CDEBUG(cdls, mask, format, ...)                             \
+do {                                                               \
+       static struct libcfs_debug_msg_data msgdata;                \
+                                                                       \
+       CFS_CHECK_STACK(&msgdata, mask, cdls);                    \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls);       \
+               libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__);     \
+       }                                                              \
+} while (0)
+
+#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__)
+
+#define CDEBUG_LIMIT(mask, format, ...)         \
+do {                                       \
+       static cfs_debug_limit_state_t cdls;    \
+                                               \
+       __CDEBUG(&cdls, mask, format, ## __VA_ARGS__);\
+} while (0)
+
+
+
+
+#define CWARN(format, ...)       CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__)
+#define CERROR(format, ...)     CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__)
+#define CNETERR(format, a...)       CDEBUG_LIMIT(D_NETERROR, format, ## a)
+#define CEMERG(format, ...)     CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__)
+
+#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__)
+#define LCONSOLE_INFO(format, ...)  CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__)
+#define LCONSOLE_WARN(format, ...)  CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__)
+#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \
+                          "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__)
+#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__)
+
+#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__)
+
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *, const char *, long_ptr_t);
+#define GOTO(label, rc)                                                 \
+do {                                                               \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               libcfs_log_goto(&msgdata, #label, (long_ptr_t)(rc));    \
+       } else {                                                        \
+               (void)(rc);                                          \
+       }                                                              \
+       goto label;                                                  \
+} while (0)
+
+
+/*
+ * if rc == NULL, we need to code as RETURN((void *)NULL), otherwise
+ * there will be a warning in osx.
+ */
+#if defined(__GNUC__)
+
+long libcfs_log_return(struct libcfs_debug_msg_data *, long rc);
+#if BITS_PER_LONG > 32
+#define RETURN(rc)                                                     \
+do {                                                                   \
+       EXIT_NESTING;                                                   \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               return (typeof(rc))libcfs_log_return(&msgdata,          \
+                                                    (long)(rc));       \
+       }                                                               \
+                                                                       \
+       return (rc);                                                    \
+} while (0)
+#else /* BITS_PER_LONG == 32 */
+/* We need an on-stack variable, because we cannot cast a 32-bit pointer
+ * directly to (long long) without generating a compiler warning/error, yet
+ * casting directly to (long) will truncate 64-bit return values. The log
+ * values will print as 32-bit values, but they always have been. LU-1436
+ */
+#define RETURN(rc)                                                     \
+do {                                                                   \
+       EXIT_NESTING;                                                   \
+       if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) {                \
+               typeof(rc) __rc = (rc);                                 \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL);     \
+               libcfs_log_return(&msgdata, (long_ptr_t)__rc);          \
+               return __rc;                                            \
+       }                                                               \
+                                                                       \
+       return (rc);                                                    \
+} while (0)
+#endif /* BITS_PER_LONG > 32 */
+
+#elif defined(_MSC_VER)
+#define RETURN(rc)                                                   \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process leaving.\n");                    \
+       EXIT_NESTING;                                              \
+       return (rc);                                                \
+} while (0)
+#else
+# error "Unkown compiler"
+#endif /* __GNUC__ */
+
+#define ENTRY                                                     \
+ENTRY_NESTING;                                                   \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process entered\n");                      \
+} while (0)
+
+#define EXIT                                                       \
+do {                                                               \
+       CDEBUG(D_TRACE, "Process leaving\n");                      \
+       EXIT_NESTING;                                              \
+} while (0)
+
+#define RETURN_EXIT                                                    \
+do {                                                                   \
+       EXIT;                                                           \
+       return;                                                         \
+} while (0)
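+
+/*
+ * Illustrative tracing sketch (hypothetical function): ENTRY/RETURN bracket
+ * a function so D_TRACE records entry and exit along with the return value.
+ */
+#if 0
+static int example_traced_fn(int arg)
+{
+       ENTRY;
+       if (arg < 0)
+               RETURN(-EINVAL); /* the exit value is logged under D_TRACE */
+       RETURN(0);
+}
+#endif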
+
+extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+                           const char *format1, ...)
+       __attribute__ ((format (printf, 2, 3)));
+
+extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+                             const char *format1,
+                             va_list args, const char *format2, ...)
+       __attribute__ ((format (printf, 4, 5)));
+
+/* other external symbols that tracefile provides: */
+extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                                  const char *usr_buffer, int usr_buffer_nob);
+extern int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                                   const char *knl_buffer, char *append);
+
+#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log"
+
+#endif /* __LIBCFS_DEBUG_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h
new file mode 100644 (file)
index 0000000..8393c27
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#ifndef _LIBCFS_FAIL_H
+#define _LIBCFS_FAIL_H
+
+extern unsigned long cfs_fail_loc;
+extern unsigned int cfs_fail_val;
+
+extern wait_queue_head_t cfs_race_waitq;
+extern int cfs_race_state;
+
+int __cfs_fail_check_set(__u32 id, __u32 value, int set);
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set);
+
+enum {
+       CFS_FAIL_LOC_NOSET      = 0,
+       CFS_FAIL_LOC_ORSET      = 1,
+       CFS_FAIL_LOC_RESET      = 2,
+       CFS_FAIL_LOC_VALUE      = 3
+};
+
+/* Failure injection control */
+#define CFS_FAIL_MASK_SYS    0x0000FF00
+#define CFS_FAIL_MASK_LOC   (0x000000FF | CFS_FAIL_MASK_SYS)
+
+#define CFS_FAILED_BIT       30
+/* CFS_FAILED is 0x40000000 */
+#define CFS_FAILED       (1 << CFS_FAILED_BIT)
+
+#define CFS_FAIL_ONCE_BIT    31
+/* CFS_FAIL_ONCE is 0x80000000 */
+#define CFS_FAIL_ONCE       (1 << CFS_FAIL_ONCE_BIT)
+
+/* The following flags are not meant to be combined */
+#define CFS_FAIL_SKIP  0x20000000 /* skip N times then fail */
+#define CFS_FAIL_SOME  0x10000000 /* only fail N times */
+#define CFS_FAIL_RAND  0x08000000 /* fail 1/N of the times */
+#define CFS_FAIL_USR1  0x04000000 /* user flag */
+
+#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc &&                         \
+                             (cfs_fail_loc & CFS_FAIL_MASK_LOC) ==        \
+                             ((id) & CFS_FAIL_MASK_LOC))
+
+static inline int cfs_fail_check_set(__u32 id, __u32 value,
+                                    int set, int quiet)
+{
+       int ret = 0;
+
+       if (unlikely(CFS_FAIL_PRECHECK(id) &&
+                    (ret = __cfs_fail_check_set(id, value, set)))) {
+               if (quiet) {
+                       CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n",
+                              id, value);
+               } else {
+                       LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n",
+                                     id, value);
+               }
+       }
+
+       return ret;
+}
+
+/* If id hits cfs_fail_loc, return 1; otherwise return 0 */
+#define CFS_FAIL_CHECK(id) \
+       cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0)
+#define CFS_FAIL_CHECK_QUIET(id) \
+       cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1)
+
+/* If id hits cfs_fail_loc and cfs_fail_val == (-1 or value), return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_VALUE(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0)
+#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_ORSET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0)
+#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc = value and return 1;
+ * otherwise return 0 */
+#define CFS_FAIL_CHECK_RESET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0)
+#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \
+       cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1)
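+
+/*
+ * Illustrative fault-injection sketch (the fail-loc id below is made up;
+ * real ids are defined by the Lustre subsystems): setting cfs_fail_loc to
+ * the id makes the check fire on the next pass through this code.
+ */
+#if 0
+#define EXAMPLE_FAIL_READ 0x1234 /* hypothetical fail-loc id */
+
+static int example_read(void)
+{
+       if (CFS_FAIL_CHECK(EXAMPLE_FAIL_READ))
+               return -EIO; /* simulate a read failure under test */
+       return 0;
+}
+#endif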
+
+static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+       if (unlikely(CFS_FAIL_PRECHECK(id)))
+               return __cfs_fail_timeout_set(id, value, ms, set);
+       else
+               return 0;
+}
+
+/* If id hits cfs_fail_loc, sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT(id, secs) \
+       cfs_fail_timeout_set(id, 0, secs * 1000, CFS_FAIL_LOC_NOSET)
+
+#define CFS_FAIL_TIMEOUT_MS(id, ms) \
+       cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET)
+
+/* If id hits cfs_fail_loc, cfs_fail_loc |= value and
+ * sleep for seconds or milliseconds */
+#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \
+       cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_ORSET)
+
+#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \
+       cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET)
+
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues. */
+static inline void cfs_race(__u32 id)
+{
+       if (CFS_FAIL_PRECHECK(id)) {
+               if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) {
+                       int rc;
+
+                       cfs_race_state = 0;
+                       CERROR("cfs_race id %x sleeping\n", id);
+                       cfs_wait_event_interruptible(cfs_race_waitq,
+                                                    cfs_race_state != 0, rc);
+                       CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc);
+               } else {
+                       CERROR("cfs_fail_race id %x waking\n", id);
+                       cfs_race_state = 1;
+                       wake_up(&cfs_race_waitq);
+               }
+       }
+}
+#define CFS_RACE(id) cfs_race(id)
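+
+/*
+ * Illustrative race sketch (hypothetical fail-loc id): with cfs_fail_loc
+ * set to the id, the first thread through CFS_RACE() sleeps on
+ * cfs_race_waitq and the second wakes it, forcing both into the critical
+ * window at the same time.
+ */
+#if 0
+       /* executed by both racing threads: */
+       CFS_RACE(0x1401);       /* hypothetical id */
+       /* ... code whose concurrency is under test ... */
+#endif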
+
+#endif /* _LIBCFS_FAIL_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h
new file mode 100644 (file)
index 0000000..f6361b3
--- /dev/null
@@ -0,0 +1,851 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_hash.h
+ *
+ * Hashing routines
+ *
+ */
+
+#ifndef __LIBCFS_HASH_H__
+#define __LIBCFS_HASH_H__
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is, operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL
+
+/*
+ * Ideally we would use HAVE_HASH_LONG for this, but on Linux we configure
+ * the Linux kernel and user space at the same time, so we need to
+ * differentiate between them explicitly. If this is not needed on other
+ * architectures, then we'll need to move the functions to arch-specific
+ * headers.
+ */
+
+#include <linux/hash.h>
+
+#define cfs_hash_long(val, bits)    hash_long(val, bits)
+
+/** disable debug */
+#define CFS_HASH_DEBUG_NONE     0
+/** record hash depth and output to console when it's too deep,
+ *  computing overhead is low but it consumes more memory */
+#define CFS_HASH_DEBUG_1           1
+/** expensive, check key validation */
+#define CFS_HASH_DEBUG_2           2
+
+#define CFS_HASH_DEBUG_LEVEL   CFS_HASH_DEBUG_NONE
+
+struct cfs_hash_ops;
+struct cfs_hash_lock_ops;
+struct cfs_hash_hlist_ops;
+
+typedef union {
+       rwlock_t                rw;             /**< rwlock */
+       spinlock_t              spin;           /**< spinlock */
+} cfs_hash_lock_t;
+
+/**
+ * cfs_hash_bucket is a container of:
+ * - lock, counter ...
+ * - array of hash-head starting from hsb_head[0], hash-head can be one of
+ *   . cfs_hash_head_t
+ *   . cfs_hash_head_dep_t
+ *   . cfs_hash_dhead_t
+ *   . cfs_hash_dhead_dep_t
+ *   depending on the user's requirements
+ * - some extra bytes (the caller can request them when creating the hash)
+ */
+typedef struct cfs_hash_bucket {
+       cfs_hash_lock_t         hsb_lock;       /**< bucket lock */
+       __u32                   hsb_count;      /**< current entries */
+       __u32                   hsb_version;    /**< change version */
+       unsigned int            hsb_index;      /**< index of bucket */
+       int                     hsb_depmax;     /**< max depth on bucket */
+       long                    hsb_head[0];    /**< hash-head array */
+} cfs_hash_bucket_t;
+
+/**
+ * cfs_hash bucket descriptor; it normally lives on the caller's stack
+ */
+typedef struct cfs_hash_bd {
+       cfs_hash_bucket_t         *bd_bucket;      /**< address of bucket */
+       unsigned int            bd_offset;      /**< offset in bucket */
+} cfs_hash_bd_t;
+
+#define CFS_HASH_NAME_LEN         16      /**< default name length */
+#define CFS_HASH_BIGNAME_LEN   64      /**< bigname for param tree */
+
+#define CFS_HASH_BKT_BITS         3       /**< default bits of bucket */
+#define CFS_HASH_BITS_MAX         30      /**< max bits of bucket */
+#define CFS_HASH_BITS_MIN         CFS_HASH_BKT_BITS
+
+/**
+ * common hash attributes.
+ */
+enum cfs_hash_tag {
+       /**
+        * no lock needed; the caller will protect operations with its
+        * own lock. With this flag:
+        *  . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK
+        *    will be ignored.
+        *  . Some functions will be disabled with this flag, i.e:
+        *    cfs_hash_for_each_empty, cfs_hash_rehash
+        */
+       CFS_HASH_NO_LOCK        = 1 << 0,
+       /** no bucket lock, use one spinlock to protect the whole hash */
+       CFS_HASH_NO_BKTLOCK     = 1 << 1,
+       /** rwlock to protect bucket */
+       CFS_HASH_RW_BKTLOCK     = 1 << 2,
+       /** spinlock to protect bucket */
+       CFS_HASH_SPIN_BKTLOCK   = 1 << 3,
+       /** always add new item to tail */
+       CFS_HASH_ADD_TAIL       = 1 << 4,
+       /** hash-table doesn't have refcount on item */
+       CFS_HASH_NO_ITEMREF     = 1 << 5,
+       /** big name for param-tree */
+       CFS_HASH_BIGNAME        = 1 << 6,
+       /** track global count */
+       CFS_HASH_COUNTER        = 1 << 7,
+       /** rehash item by new key */
+       CFS_HASH_REHASH_KEY     = 1 << 8,
+       /** Enable dynamic hash resizing */
+       CFS_HASH_REHASH  = 1 << 9,
+       /** can shrink hash-size */
+       CFS_HASH_SHRINK  = 1 << 10,
+       /** assert hash is empty on exit */
+       CFS_HASH_ASSERT_EMPTY   = 1 << 11,
+       /** record hlist depth */
+       CFS_HASH_DEPTH    = 1 << 12,
+       /**
+        * rehash is always scheduled in a different thread, so current
+        * change on hash table is non-blocking
+        */
+       CFS_HASH_NBLK_CHANGE    = 1 << 13,
+       /** NB: hs_flags is typed as __u16; please change it
+        * if you need more than 16 flags */
+};
+
+/** most used attributes */
+#define CFS_HASH_DEFAULT       (CFS_HASH_RW_BKTLOCK | \
+                               CFS_HASH_COUNTER | CFS_HASH_REHASH)
+
+/**
+ * cfs_hash is a hash-table implementation for general purpose, it can support:
+ *    . two refcount modes
+ *      hash-table with & without refcount
+ *    . four lock modes
+ *      nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock
+ *    . general operations
+ *      lookup, add(add_tail or add_head), delete
+ *    . rehash
+ *      grows or shrink
+ *    . iteration
+ *      locked iteration and unlocked iteration
+ *    . bigname
+ *      support long name hash
+ *    . debug
+ *      trace max searching depth
+ *
+ * Rehash:
+ * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker)
+ * is spawned to handle the rehash in the background, it's possible that other
+ * processes can concurrently perform additions, deletions, and lookups
+ * without being blocked on rehash completion, because rehash will release
+ * the global wrlock for each bucket.
+ *
+ * Rehash and iteration can't run at the same time because it's too tricky
+ * to keep both of them safe and correct.
+ * As they are relatively rare operations:
+ *   . if iteration is in progress while we try to launch a rehash, the
+ *     rehash just gives up, and the iterator will launch it at the end.
+ *   . if a rehash is in progress while we try to iterate the hash table,
+ *     then we just wait (it shouldn't take very long); anyway, nobody
+ *     should expect iteration of a whole hash-table to be non-blocking.
+ *
+ * During rehashing, a (key,object) pair may be in one of two buckets,
+ * depending on whether the worker task has yet to transfer the object
+ * to its new location in the table. Lookups and deletions need to search both
+ * locations; additions must take care to only insert into the new bucket.
+ */
+
+typedef struct cfs_hash {
+       /** serialize with rehash, or serialize all operations if
+        * the hash-table has CFS_HASH_NO_BKTLOCK */
+       cfs_hash_lock_t      hs_lock;
+       /** hash operations */
+       struct cfs_hash_ops     *hs_ops;
+       /** hash lock operations */
+       struct cfs_hash_lock_ops   *hs_lops;
+       /** hash list operations */
+       struct cfs_hash_hlist_ops  *hs_hops;
+       /** hash buckets-table */
+       cfs_hash_bucket_t        **hs_buckets;
+       /** total number of items on this hash-table */
+       atomic_t                hs_count;
+       /** hash flags, see cfs_hash_tag for detail */
+       __u16                  hs_flags;
+       /** # of extra-bytes for bucket, for user saving extended attributes */
+       __u16                  hs_extra_bytes;
+       /** wants to iterate */
+       __u8                    hs_iterating;
+       /** hash-table is dying */
+       __u8                    hs_exiting;
+       /** current hash bits */
+       __u8                    hs_cur_bits;
+       /** min hash bits */
+       __u8                    hs_min_bits;
+       /** max hash bits */
+       __u8                    hs_max_bits;
+       /** bits for rehash */
+       __u8                    hs_rehash_bits;
+       /** bits for each bucket */
+       __u8                    hs_bkt_bits;
+       /** resize min threshold */
+       __u16                  hs_min_theta;
+       /** resize max threshold */
+       __u16                  hs_max_theta;
+       /** resize count */
+       __u32                  hs_rehash_count;
+       /** # of iterators (caller of cfs_hash_for_each_*) */
+       __u32                  hs_iterators;
+       /** rehash workitem */
+       cfs_workitem_t        hs_rehash_wi;
+       /** refcount on this hash table */
+       atomic_t                hs_refcount;
+       /** rehash buckets-table */
+       cfs_hash_bucket_t        **hs_rehash_buckets;
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+       /** serialize debug members */
+       spinlock_t                      hs_dep_lock;
+       /** max depth */
+       unsigned int            hs_dep_max;
+       /** id of the deepest bucket */
+       unsigned int            hs_dep_bkt;
+       /** offset in the deepest bucket */
+       unsigned int            hs_dep_off;
+       /** bits when we found the max depth */
+       unsigned int            hs_dep_bits;
+       /** workitem to output max depth */
+       cfs_workitem_t        hs_dep_wi;
+#endif
+       /** name of htable */
+       char                    hs_name[0];
+} cfs_hash_t;
+
+typedef struct cfs_hash_lock_ops {
+       /** lock the hash table */
+       void    (*hs_lock)(cfs_hash_lock_t *lock, int exclusive);
+       /** unlock the hash table */
+       void    (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive);
+       /** lock the hash bucket */
+       void    (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive);
+       /** unlock the hash bucket */
+       void    (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive);
+} cfs_hash_lock_ops_t;
+
+typedef struct cfs_hash_hlist_ops {
+       /** return hlist_head of hash-head of @bd */
+       struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd);
+       /** return hash-head size */
+       int (*hop_hhead_size)(cfs_hash_t *hs);
+       /** add @hnode to hash-head of @bd */
+       int (*hop_hnode_add)(cfs_hash_t *hs,
+                            cfs_hash_bd_t *bd, struct hlist_node *hnode);
+       /** remove @hnode from hash-head of @bd */
+       int (*hop_hnode_del)(cfs_hash_t *hs,
+                            cfs_hash_bd_t *bd, struct hlist_node *hnode);
+} cfs_hash_hlist_ops_t;
+
+typedef struct cfs_hash_ops {
+       /** return hashed value from @key */
+       unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask);
+       /** return key address of @hnode */
+       void *   (*hs_key)(struct hlist_node *hnode);
+       /** copy key from @hnode to @key */
+       void     (*hs_keycpy)(struct hlist_node *hnode, void *key);
+       /**
+        *  compare @key with key of @hnode
+        *  returns 1 on a match
+        */
+       int      (*hs_keycmp)(const void *key, struct hlist_node *hnode);
+       /** return object address of @hnode, i.e: container_of(...hnode) */
+       void *   (*hs_object)(struct hlist_node *hnode);
+       /** get refcount of item, always called with holding bucket-lock */
+       void     (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item */
+       void     (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** release refcount of item, always called with holding bucket-lock */
+       void     (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode);
+       /** it's called before removing of @hnode */
+       void     (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode);
+} cfs_hash_ops_t;
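+
+/*
+ * Minimal cfs_hash_ops_t sketch (hypothetical object type): an object
+ * embeds a struct hlist_node and is keyed by a __u64 id.  Refcounting
+ * callbacks are omitted for brevity; a real user supplies hs_get/hs_put.
+ */
+#if 0
+struct example_obj {
+       __u64                   eo_id;          /* hash key */
+       struct hlist_node       eo_hnode;       /* linkage used by cfs_hash */
+};
+
+static unsigned example_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u64_hash(*(__u64 *)key, mask);
+}
+
+static void *example_key(struct hlist_node *hnode)
+{
+       return &container_of(hnode, struct example_obj, eo_hnode)->eo_id;
+}
+
+static int example_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return *(__u64 *)key ==
+              container_of(hnode, struct example_obj, eo_hnode)->eo_id;
+}
+
+static void *example_object(struct hlist_node *hnode)
+{
+       return container_of(hnode, struct example_obj, eo_hnode);
+}
+#endif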
+
+/** total number of buckets in @hs */
+#define CFS_HASH_NBKT(hs)       \
+       (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits))
+
+/** total number of buckets in @hs while rehashing */
+#define CFS_HASH_RH_NBKT(hs)    \
+       (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits))
+
+/** number of hlists in a bucket */
+#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits)
+
+/** total number of hlist in @hs */
+#define CFS_HASH_NHLIST(hs)     (1U << (hs)->hs_cur_bits)
+
+/** total number of hlist in @hs while rehashing */
+#define CFS_HASH_RH_NHLIST(hs)  (1U << (hs)->hs_rehash_bits)
+
+static inline int
+cfs_hash_with_no_lock(cfs_hash_t *hs)
+{
+       /* caller will serialize all operations for this hash-table */
+       return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_no_bktlock(cfs_hash_t *hs)
+{
+       /* no bucket lock, one single lock to protect the hash-table */
+       return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_rw_bktlock(cfs_hash_t *hs)
+{
+       /* rwlock to protect hash bucket */
+       return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_spin_bktlock(cfs_hash_t *hs)
+{
+       /* spinlock to protect hash bucket */
+       return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0;
+}
+
+static inline int
+cfs_hash_with_add_tail(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0;
+}
+
+static inline int
+cfs_hash_with_no_itemref(cfs_hash_t *hs)
+{
+       /* the hash-table doesn't keep a refcount on items, so an item
+        * can't be removed from the hash unless its refcount is zero */
+       return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0;
+}
+
+static inline int
+cfs_hash_with_bigname(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_BIGNAME) != 0;
+}
+
+static inline int
+cfs_hash_with_counter(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_COUNTER) != 0;
+}
+
+static inline int
+cfs_hash_with_rehash(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_REHASH) != 0;
+}
+
+static inline int
+cfs_hash_with_rehash_key(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0;
+}
+
+static inline int
+cfs_hash_with_shrink(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_SHRINK) != 0;
+}
+
+static inline int
+cfs_hash_with_assert_empty(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0;
+}
+
+static inline int
+cfs_hash_with_depth(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_DEPTH) != 0;
+}
+
+static inline int
+cfs_hash_with_nblk_change(cfs_hash_t *hs)
+{
+       return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0;
+}
+
+static inline int
+cfs_hash_is_exiting(cfs_hash_t *hs)
+{       /* cfs_hash_destroy is called */
+       return hs->hs_exiting;
+}
+
+static inline int
+cfs_hash_is_rehashing(cfs_hash_t *hs)
+{       /* rehash is launched */
+       return hs->hs_rehash_bits != 0;
+}
+
+static inline int
+cfs_hash_is_iterating(cfs_hash_t *hs)
+{       /* someone is calling cfs_hash_for_each_* */
+       return hs->hs_iterating || hs->hs_iterators != 0;
+}
+
+static inline int
+cfs_hash_bkt_size(cfs_hash_t *hs)
+{
+       return offsetof(cfs_hash_bucket_t, hsb_head[0]) +
+              hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) +
+              hs->hs_extra_bytes;
+}
+
+#define CFS_HOP(hs, op)           (hs)->hs_ops->hs_ ## op
+
+static inline unsigned
+cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return CFS_HOP(hs, hash)(hs, key, mask);
+}
+
+static inline void *
+cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, key)(hnode);
+}
+
+static inline void
+cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key)
+{
+       if (CFS_HOP(hs, keycpy) != NULL)
+               CFS_HOP(hs, keycpy)(hnode, key);
+}
+
+/**
+ * Returns 1 on a match.
+ */
+static inline int
+cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, keycmp)(key, hnode);
+}
+
+static inline void *
+cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, object)(hnode);
+}
+
+static inline void
+cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       return CFS_HOP(hs, get)(hs, hnode);
+}
+
+static inline void
+cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LASSERT(CFS_HOP(hs, put_locked) != NULL);
+
+       return CFS_HOP(hs, put_locked)(hs, hnode);
+}
+
+static inline void
+cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LASSERT(CFS_HOP(hs, put) != NULL);
+
+       return CFS_HOP(hs, put)(hs, hnode);
+}
+
+static inline void
+cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       if (CFS_HOP(hs, exit))
+               CFS_HOP(hs, exit)(hs, hnode);
+}
+
+static inline void cfs_hash_lock(cfs_hash_t *hs, int excl)
+{
+       hs->hs_lops->hs_lock(&hs->hs_lock, excl);
+}
+
+static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl)
+{
+       hs->hs_lops->hs_unlock(&hs->hs_lock, excl);
+}
+
+static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs,
+                                       atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_no_bktlock(hs));
+       return atomic_dec_and_lock(condition, &hs->hs_lock.spin);
+}
+
+static inline void cfs_hash_bd_lock(cfs_hash_t *hs,
+                                   cfs_hash_bd_t *bd, int excl)
+{
+       hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+static inline void cfs_hash_bd_unlock(cfs_hash_t *hs,
+                                     cfs_hash_bd_t *bd, int excl)
+{
+       hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl);
+}
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are normally for hash-table without rehash
+ */
+void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd);
+
+static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                           cfs_hash_bd_t *bd, int excl)
+{
+       cfs_hash_bd_get(hs, key, bd);
+       cfs_hash_bd_lock(hs, bd, excl);
+}
+
+static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits);
+}
+
+static inline void cfs_hash_bd_index_set(cfs_hash_t *hs,
+                                        unsigned index, cfs_hash_bd_t *bd)
+{
+       bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits];
+       bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U);
+}
+
+static inline void *
+cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       return (void *)bd->bd_bucket +
+              cfs_hash_bkt_size(hs) - hs->hs_extra_bytes;
+}
+
+static inline __u32
+cfs_hash_bd_version_get(cfs_hash_bd_t *bd)
+{
+       /* caller must hold cfs_hash_bd_lock */
+       return bd->bd_bucket->hsb_version;
+}
+
+static inline __u32
+cfs_hash_bd_count_get(cfs_hash_bd_t *bd)
+{
+       /* caller must hold cfs_hash_bd_lock */
+       return bd->bd_bucket->hsb_count;
+}
+
+static inline int
+cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd)
+{
+       return bd->bd_bucket->hsb_depmax;
+}
+
+static inline int
+cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+       if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index)
+               return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index;
+
+       if (bd1->bd_offset != bd2->bd_offset)
+               return bd1->bd_offset - bd2->bd_offset;
+
+       return 0;
+}
+
+void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                           struct hlist_node *hnode);
+void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+                            cfs_hash_bd_t *bd_new, struct hlist_node *hnode);
+
+static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                          atomic_t *condition)
+{
+       LASSERT(cfs_hash_with_spin_bktlock(hs));
+       return atomic_dec_and_lock(condition,
+                                      &bd->bd_bucket->hsb_lock.spin);
+}
+
+static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bd)
+{
+       return hs->hs_hops->hop_hhead(hs, bd);
+}
+
+struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs,
+                                           cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs,
+                                         cfs_hash_bd_t *bd, const void *key);
+struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs,
+                                            cfs_hash_bd_t *bd, const void *key,
+                                            struct hlist_node *hnode,
+                                            int insist_add);
+struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs,
+                                            cfs_hash_bd_t *bd, const void *key,
+                                            struct hlist_node *hnode);
+
+/**
+ * operations on cfs_hash bucket (bd: bucket descriptor),
+ * they are safe for hash-table with rehash
+ */
+void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds);
+void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl);
+
+static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key,
+                                                cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_dual_bd_get(hs, key, bds);
+       cfs_hash_dual_bd_lock(hs, bds, excl);
+}
+
+struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs,
+                                                cfs_hash_bd_t *bds,
+                                                const void *key);
+struct hlist_node *cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bds,
+                                                 const void *key,
+                                                 struct hlist_node *hnode,
+                                                 int insist_add);
+struct hlist_node *cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs,
+                                                 cfs_hash_bd_t *bds,
+                                                 const void *key,
+                                                 struct hlist_node *hnode);
+
+/* Hash init/cleanup functions */
+cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+                           unsigned bkt_bits, unsigned extra_bytes,
+                           unsigned min_theta, unsigned max_theta,
+                           cfs_hash_ops_t *ops, unsigned flags);
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs);
+void cfs_hash_putref(cfs_hash_t *hs);
+
+/* Hash addition functions */
+void cfs_hash_add(cfs_hash_t *hs, const void *key,
+                 struct hlist_node *hnode);
+int cfs_hash_add_unique(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode);
+void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+                             struct hlist_node *hnode);
+
+/* Hash deletion functions */
+void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode);
+void *cfs_hash_del_key(cfs_hash_t *hs, const void *key);
+
+/* Hash lookup/for_each functions */
+#define CFS_HASH_LOOP_HOG       1024
+
+typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                     struct hlist_node *node, void *data);
+void *cfs_hash_lookup(cfs_hash_t *hs, const void *key);
+void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_nolock(cfs_hash_t *hs,
+                             cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_for_each_empty(cfs_hash_t *hs,
+                            cfs_hash_for_each_cb_t, void *data);
+void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+                          cfs_hash_for_each_cb_t, void *data);
+typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data);
+void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data);
+
+void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+                            cfs_hash_for_each_cb_t, void *data);
+int  cfs_hash_is_empty(cfs_hash_t *hs);
+__u64 cfs_hash_size_get(cfs_hash_t *hs);
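+
+/*
+ * Illustrative life-cycle sketch (example_ops and the variables here are
+ * hypothetical; see the cfs_hash_ops_t sketch above): create a table,
+ * insert an object, look it up, then drop the final reference.
+ */
+#if 0
+       cfs_hash_t *hs;
+       struct example_obj *obj = alloc_example_obj(); /* hypothetical */
+
+       hs = cfs_hash_create("example", 5, 10, CFS_HASH_BKT_BITS, 0,
+                            CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+                            &example_ops, CFS_HASH_DEFAULT);
+       cfs_hash_add(hs, &obj->eo_id, &obj->eo_hnode);
+       obj = cfs_hash_lookup(hs, &obj->eo_id); /* takes a ref via hs_get */
+       cfs_hash_putref(hs);    /* destroyed when refcount drops to zero */
+#endif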
+
+/*
+ * Rehash - Theta is calculated to be the average chained
+ * hash depth assuming a perfectly uniform hash function.
+ */
+void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs);
+void cfs_hash_rehash_cancel(cfs_hash_t *hs);
+int  cfs_hash_rehash(cfs_hash_t *hs, int do_rehash);
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+                        void *new_key, struct hlist_node *hnode);
+
+#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1
+/* Validate hnode references the correct key */
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                     struct hlist_node *hnode)
+{
+       LASSERT(cfs_hash_keycmp(hs, key, hnode));
+}
+
+/* Validate hnode is in the correct bucket */
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bds[2];
+
+       cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds);
+       LASSERT(bds[0].bd_bucket == bd->bd_bucket ||
+               bds[1].bd_bucket == bd->bd_bucket);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */
+
+static inline void
+cfs_hash_key_validate(cfs_hash_t *hs, const void *key,
+                     struct hlist_node *hnode) {}
+
+static inline void
+cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL */
+
+#define CFS_HASH_THETA_BITS  10
+#define CFS_HASH_MIN_THETA  (1U << (CFS_HASH_THETA_BITS - 1))
+#define CFS_HASH_MAX_THETA  (1U << (CFS_HASH_THETA_BITS + 1))
+
+/* Return integer component of theta */
+static inline int __cfs_hash_theta_int(int theta)
+{
+       return (theta >> CFS_HASH_THETA_BITS);
+}
+
+/* Return a fractional value between 0 and 999 */
+static inline int __cfs_hash_theta_frac(int theta)
+{
+       return ((theta * 1000) >> CFS_HASH_THETA_BITS) -
+              (__cfs_hash_theta_int(theta) * 1000);
+}
+
+static inline int __cfs_hash_theta(cfs_hash_t *hs)
+{
+       return (atomic_read(&hs->hs_count) <<
+               CFS_HASH_THETA_BITS) >> hs->hs_cur_bits;
+}
+
+static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max)
+{
+       LASSERT(min < max);
+       hs->hs_min_theta = (__u16)min;
+       hs->hs_max_theta = (__u16)max;
+}
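+
+/*
+ * Worked example (illustrative): theta is a 10-bit fixed-point average
+ * chain depth.  With 4096 items and hs_cur_bits = 11 (2048 hlists),
+ * __cfs_hash_theta() = (4096 << 10) >> 11 = 2048, so the integer part is
+ * 2048 >> 10 = 2 and the fraction is 0, i.e. an average depth of 2.000.
+ */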
+
+/* Generic debug formatting routines mainly for proc handler */
+struct seq_file;
+int cfs_hash_debug_header(struct seq_file *m);
+int cfs_hash_debug_str(cfs_hash_t *hs, struct seq_file *m);
+
+/*
+ * Generic djb2 hash algorithm for character arrays.
+ */
+static inline unsigned
+cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask)
+{
+       unsigned i, hash = 5381;
+
+       LASSERT(key != NULL);
+
+       for (i = 0; i < size; i++)
+               hash = hash * 33 + ((char *)key)[i];
+
+       return (hash & mask);
+}
+
+/*
+ * Generic u32 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u32_hash(const __u32 key, unsigned mask)
+{
+       return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask);
+}
+
+/*
+ * Generic u64 hash algorithm.
+ */
+static inline unsigned
+cfs_hash_u64_hash(const __u64 key, unsigned mask)
+{
+       return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask);
+}
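+
+/*
+ * Illustrative sketch (hs and the key values are hypothetical): the mask
+ * is the hlist count minus one, so the helpers above map a key directly
+ * to an hlist index.
+ */
+#if 0
+       unsigned mask = CFS_HASH_NHLIST(hs) - 1;
+       unsigned idx1 = cfs_hash_u64_hash(0x1234ULL, mask);
+       unsigned idx2 = cfs_hash_djb2_hash("lustre", 6, mask);
+#endif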
+
+/** iterate over all buckets in @bds (array of cfs_hash_bd_t) */
+#define cfs_hash_for_each_bd(bds, n, i) \
+       for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++)
+
+/** iterate over all buckets of @hs */
+#define cfs_hash_for_each_bucket(hs, bd, pos)             \
+       for (pos = 0;                                      \
+            pos < CFS_HASH_NBKT(hs) &&                  \
+            ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++)
+
+/** iterate over all hlist of bucket @bd */
+#define cfs_hash_bd_for_each_hlist(hs, bd, hlist)             \
+       for ((bd)->bd_offset = 0;                              \
+            (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) &&       \
+            (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL;       \
+            (bd)->bd_offset++)
+
+#endif /* __LIBCFS_HASH_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h
new file mode 100644 (file)
index 0000000..bfa6d7b
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/include/libcfs/heap.h
+ *
+ * Author: Eric Barton <eeb@whamcloud.com>
+ *        Liang Zhen   <liang@whamcloud.com>
+ */
+
+#ifndef __LIBCFS_HEAP_H__
+#define __LIBCFS_HEAP_H__
+
+/** \defgroup heap Binary heap
+ *
+ * The binary heap is a scalable data structure built on a binary tree. It
+ * can maintain large sets of elements sorted, usually by one or more element
+ * properties, but really by anything that can be used as a binary predicate
+ * to determine the relative ordering of any two nodes in the set. There is
+ * no search operation; rather, the intention is for users to remove the
+ * lowest-priority element, which (as this is an implementation of a
+ * min-heap) will always be at the root of the tree, for consumption.
+ *
+ * Users of the heap should embed a \e cfs_binheap_node_t object instance on
+ * every object of the set that they wish the binary heap instance to handle,
+ * and (at a minimum) provide a cfs_binheap_ops_t::hop_compare() implementation
+ * which is used by the heap as the binary predicate during its internal sorting
+ * operations.
+ *
+ * The current implementation enforces no locking scheme, and so assumes the
+ * user caters for locking between calls to insert, delete and lookup
+ * operations. Since the only consumers of the data structure at this point
+ * are NRS policies, and these operate on a per-CPT basis, binary heap instances
+ * are tied to a specific CPT.
+ * @{
+ */
+
+/**
+ * Binary heap node.
+ *
+ * Objects of this type are embedded into objects of the ordered set that is to
+ * be maintained by a \e cfs_binheap_t instance.
+ */
+typedef struct {
+       /** Index into the binary tree */
+       unsigned int    chn_index;
+} cfs_binheap_node_t;
+
+#define CBH_SHIFT      9
+#define CBH_SIZE       (1 << CBH_SHIFT)                    /* # ptrs per level */
+#define CBH_MASK       (CBH_SIZE - 1)
+#define CBH_NOB        (CBH_SIZE * sizeof(cfs_binheap_node_t *))
+
+#define CBH_POISON     0xdeadbeef
+
+/**
+ * Binary heap flags.
+ */
+enum {
+       CBH_FLAG_ATOMIC_GROW    = 1,
+};
+
+struct cfs_binheap;
+
+/**
+ * Binary heap operations.
+ */
+typedef struct {
+       /**
+        * Called right before inserting a node into the binary heap.
+        *
+        * Implementing this operation is optional.
+        *
+        * \param[in] h The heap
+        * \param[in] e The node
+        *
+        * \retval 0 success
+        * \retval != 0 error
+        */
+       int             (*hop_enter)(struct cfs_binheap *h,
+                                    cfs_binheap_node_t *e);
+       /**
+        * Called right after removing a node from the binary heap.
+        *
+        * Implementing this operation is optional.
+        *
+        * \param[in] h The heap
+        * \param[in] e The node
+        */
+       void            (*hop_exit)(struct cfs_binheap *h,
+                                   cfs_binheap_node_t *e);
+       /**
+        * A binary predicate which is called during internal heap sorting
+        * operations, and used in order to determine the relevant ordering of
+        * two heap nodes.
+        *
+        * Implementing this operation is mandatory.
+        *
+        * \param[in] a The first heap node
+        * \param[in] b The second heap node
+        *
+        * \retval 0 Node a > node b
+        * \retval 1 Node a < node b
+        *
+        * \see cfs_binheap_bubble()
+        * \see cfs_binheap_sink()
+        */
+       int             (*hop_compare)(cfs_binheap_node_t *a,
+                                      cfs_binheap_node_t *b);
+} cfs_binheap_ops_t;
+
+/**
+ * Binary heap object.
+ *
+ * Sorts elements of type \e cfs_binheap_node_t
+ */
+typedef struct cfs_binheap {
+       /** Triple indirect */
+       cfs_binheap_node_t  ****cbh_elements3;
+       /** double indirect */
+       cfs_binheap_node_t   ***cbh_elements2;
+       /** single indirect */
+       cfs_binheap_node_t    **cbh_elements1;
+       /** # elements referenced */
+       unsigned int            cbh_nelements;
+       /** high water mark */
+       unsigned int            cbh_hwm;
+       /** user flags */
+       unsigned int            cbh_flags;
+       /** operations table */
+       cfs_binheap_ops_t      *cbh_ops;
+       /** private data */
+       void                   *cbh_private;
+       /** associated CPT table */
+       struct cfs_cpt_table   *cbh_cptab;
+       /** associated CPT id of this cfs_binheap_t::cbh_cptab */
+       int                     cbh_cptid;
+} cfs_binheap_t;
+
+void cfs_binheap_destroy(cfs_binheap_t *h);
+cfs_binheap_t *cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+                                 unsigned count, void *arg,
+                                 struct cfs_cpt_table *cptab, int cptid);
+cfs_binheap_node_t *cfs_binheap_find(cfs_binheap_t *h, unsigned int idx);
+int cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e);
+void cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e);
+
+static inline int
+cfs_binheap_size(cfs_binheap_t *h)
+{
+       return h->cbh_nelements;
+}
+
+static inline int
+cfs_binheap_is_empty(cfs_binheap_t *h)
+{
+       return h->cbh_nelements == 0;
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_root(cfs_binheap_t *h)
+{
+       return cfs_binheap_find(h, 0);
+}
+
+static inline cfs_binheap_node_t *
+cfs_binheap_remove_root(cfs_binheap_t *h)
+{
+       cfs_binheap_node_t *e = cfs_binheap_find(h, 0);
+
+       if (e != NULL)
+               cfs_binheap_remove(h, e);
+       return e;
+}
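+
+/*
+ * Minimal heap-usage sketch (hypothetical request type): the object embeds
+ * a cfs_binheap_node_t and hop_compare() orders by deadline, so the
+ * earliest deadline is always at the root.
+ */
+#if 0
+struct example_req {
+       __u64                   er_deadline;
+       cfs_binheap_node_t      er_node;
+};
+
+static int example_req_compare(cfs_binheap_node_t *a, cfs_binheap_node_t *b)
+{
+       struct example_req *ra = container_of(a, struct example_req, er_node);
+       struct example_req *rb = container_of(b, struct example_req, er_node);
+
+       return ra->er_deadline < rb->er_deadline; /* 1: a sorts before b */
+}
+
+static cfs_binheap_ops_t example_req_ops = {
+       .hop_enter   = NULL,
+       .hop_exit    = NULL,
+       .hop_compare = example_req_compare,
+};
+#endif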
+
+/** @} heap */
+
+#endif /* __LIBCFS_HEAP_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h
new file mode 100644 (file)
index 0000000..5be3679
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_ioctl.h
+ *
+ * Low-level ioctl data structures. Kernel ioctl functions are declared here,
+ * and user space functions are in libcfsutil_ioctl.h.
+ *
+ */
+
+#ifndef __LIBCFS_IOCTL_H__
+#define __LIBCFS_IOCTL_H__
+
+
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+
+struct libcfs_ioctl_data {
+       __u32 ioc_len;
+       __u32 ioc_version;
+
+       __u64 ioc_nid;
+       __u64 ioc_u64[1];
+
+       __u32 ioc_flags;
+       __u32 ioc_count;
+       __u32 ioc_net;
+       __u32 ioc_u32[7];
+
+       __u32 ioc_inllen1;
+       char *ioc_inlbuf1;
+       __u32 ioc_inllen2;
+       char *ioc_inlbuf2;
+
+       __u32 ioc_plen1; /* buffers in userspace */
+       char *ioc_pbuf1;
+       __u32 ioc_plen2; /* buffers in userspace */
+       char *ioc_pbuf2;
+
+       char ioc_bulk[0];
+};
+
+
+struct libcfs_ioctl_hdr {
+       __u32 ioc_len;
+       __u32 ioc_version;
+};
+
+struct libcfs_debug_ioctl_data {
+       struct libcfs_ioctl_hdr hdr;
+       unsigned int subs;
+       unsigned int debug;
+};
+
+#define LIBCFS_IOC_INIT(data)                     \
+do {                                               \
+       memset(&data, 0, sizeof(data));          \
+       data.ioc_version = LIBCFS_IOCTL_VERSION;        \
+       data.ioc_len = sizeof(data);                \
+} while (0)
+
+
+struct libcfs_ioctl_handler {
+       struct list_head item;
+       int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data);
+};
+
+#define DECLARE_IOCTL_HANDLER(ident, func)                   \
+       struct libcfs_ioctl_handler ident = {              \
+               /* .item = */ LIST_HEAD_INIT(ident.item),   \
+               /* .handle_ioctl = */ func                    \
+       }
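+
+/*
+ * Illustrative registration sketch (the handler below is hypothetical): a
+ * subsystem declares a handler and registers it so that libcfs dispatches
+ * matching ioctls to it.
+ */
+#if 0
+static int example_handle_ioctl(unsigned int cmd,
+                               struct libcfs_ioctl_data *data)
+{
+       /* dispatch on cmd; reject commands this handler doesn't own */
+       return -EINVAL;
+}
+
+static DECLARE_IOCTL_HANDLER(example_ioctl_handler, example_handle_ioctl);
+/* at module init: libcfs_register_ioctl(&example_ioctl_handler); */
+#endif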
+
+
+/* FIXME check conflict with lustre_lib.h */
+#define LIBCFS_IOC_DEBUG_MASK       _IOWR('f', 250, long)
+
+
+/* ioctls for manipulating snapshots 30- */
+#define IOC_LIBCFS_TYPE                   'e'
+#define IOC_LIBCFS_MIN_NR               30
+/* libcfs ioctls */
+#define IOC_LIBCFS_PANIC                  _IOWR('e', 30, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLEAR_DEBUG      _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG        _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_CONTROL      _IOWR('e', 33, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_SNAPSHOT            _IOWR('e', 34, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_LOOKUP_STRING       _IOWR('e', 35, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MEMHOG                _IOWR('e', 36, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_TEST          _IOWR('e', 37, IOCTL_LIBCFS_TYPE)
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI                _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID            _IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_ROUTE          _IOWR('e', 52, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_ROUTE          _IOWR('e', 53, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_ROUTE          _IOWR('e', 54, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER          _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE      _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PORTALS_COMPATIBILITY   _IOWR('e', 57, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_DIST          _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE          _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT      _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING                    _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEBUG_PEER        _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNETST                _IOWR('e', 63, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID        _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION    _IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION      _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN            _IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER            _IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER            _IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER            _IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_TXDESC        _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE          _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE          _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE          _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+
+#define IOC_LIBCFS_MAX_NR                           80
+
+static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data)
+{
+       int len = sizeof(*data);
+
+       len += cfs_size_round(data->ioc_inllen1);
+       len += cfs_size_round(data->ioc_inllen2);
+       return len;
+}
+
+static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
+{
+       if (data->ioc_len > (1<<30)) {
+               CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 > (1<<30)) {
+               CERROR("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 > (1<<30)) {
+               CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+               CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+               CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf1 && !data->ioc_plen1) {
+               CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf2 && !data->ioc_plen2) {
+               CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_plen1 && !data->ioc_pbuf1) {
+               CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+               return 1;
+       }
+       if (data->ioc_plen2 && !data->ioc_pbuf2) {
+               CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+               return 1;
+       }
+       if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len) {
+               CERROR("LIBCFS ioctl: packlen != ioc_len\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 &&
+           data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+               CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 &&
+           data->ioc_bulk[cfs_size_round(data->ioc_inllen1) +
+                          data->ioc_inllen2 - 1] != '\0') {
+               CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n");
+               return 1;
+       }
+       return 0;
+}
+
+
+extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
+extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_popdata(void *arg, void *buf, int size);
+
+
+#endif /* __LIBCFS_IOCTL_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h
new file mode 100644 (file)
index 0000000..596a15f
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * libcfs/include/libcfs/libcfs_kernelcomm.h
+ *
+ * Kernel <-> userspace communication routines.
+ * The definitions below are used in the kernel and userspace.
+ *
+ */
+
+#ifndef __LIBCFS_KERNELCOMM_H__
+#define __LIBCFS_KERNELCOMM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* KUC message header.
+ * All current and future KUC messages should use this header.
+ * To avoid having to include Lustre headers from libcfs, define this here.
+ */
+struct kuc_hdr {
+       __u16 kuc_magic;
+       __u8  kuc_transport;  /* Each new Lustre feature should use a different
+                                transport */
+       __u8  kuc_flags;
+       __u16 kuc_msgtype;    /* Message type or opcode, transport-specific */
+       __u16 kuc_msglen;     /* Including header */
+} __attribute__((aligned(sizeof(__u64))));
+
+#define KUC_MAGIC  0x191C /* Lustre9etLinC */
+#define KUC_FL_BLOCK 0x01   /* Wait for send */
+
+/* kuc_msgtype values are defined in each transport */
+enum kuc_transport_type {
+       KUC_TRANSPORT_GENERIC   = 1,
+       KUC_TRANSPORT_HSM       = 2,
+       KUC_TRANSPORT_CHANGELOG = 3,
+};
+
+enum kuc_generic_message_type {
+       KUC_MSG_SHUTDOWN = 1,
+};
+
+/* prototype for callback function on kuc groups */
+typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg);
+
+/* KUC Broadcast Groups. This determines which userspace process hears which
+ * messages.  Multiple transports may be used within a group, or multiple
+ * groups may use the same transport.  Broadcast groups need not be used
+ * if e.g. a UID is specified instead; use group 0 to signify unicast.
+ */
+#define KUC_GRP_HSM       0x02
+#define KUC_GRP_MAX       KUC_GRP_HSM
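+
+/*
+ * Illustrative sketch (editor's addition, not from the original patch):
+ * a minimal generic shutdown message would be filled in as
+ *
+ *	struct kuc_hdr hdr;
+ *
+ *	hdr.kuc_magic     = KUC_MAGIC;
+ *	hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
+ *	hdr.kuc_flags     = 0;
+ *	hdr.kuc_msgtype   = KUC_MSG_SHUTDOWN;
+ *	hdr.kuc_msglen    = sizeof(hdr);	(msglen includes the header)
+ *
+ * and broadcast with libcfs_kkuc_group_put(group, &hdr), declared below.
+ */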
+
+/* Kernel methods */
+extern int libcfs_kkuc_msg_put(struct file *fp, void *payload);
+extern int libcfs_kkuc_group_put(int group, void *payload);
+extern int libcfs_kkuc_group_add(struct file *fp, int uid, int group,
+                                __u32 data);
+extern int libcfs_kkuc_group_rem(int uid, int group);
+extern int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+                                    void *cb_arg);
+
+#define LK_FLG_STOP 0x01
+
+/* kernelcomm control structure, passed from userspace to kernel */
+typedef struct lustre_kernelcomm {
+       __u32 lk_wfd;
+       __u32 lk_rfd;
+       __u32 lk_uid;
+       __u32 lk_group;
+       __u32 lk_data;
+       __u32 lk_flags;
+} __attribute__((packed)) lustre_kernelcomm;
+
+/* Userspace methods */
+extern int libcfs_ukuc_start(lustre_kernelcomm *l, int groups);
+extern int libcfs_ukuc_stop(lustre_kernelcomm *l);
+extern int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize,
+                              int transport);
+
+#endif /* __LIBCFS_KERNELCOMM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h
new file mode 100644 (file)
index 0000000..9c40ed9
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_prim.h
+ *
+ * General primitives.
+ *
+ */
+
+#ifndef __LIBCFS_PRIM_H__
+#define __LIBCFS_PRIM_H__
+
+#ifndef EXPORT_SYMBOL
+# define EXPORT_SYMBOL(s)
+#endif
+
+/*
+ * Schedule
+ */
+void cfs_pause(cfs_duration_t ticks);
+
+/*
+ * Timer
+ */
+typedef  void (cfs_timer_func_t)(ulong_ptr_t);
+void schedule_timeout_and_set_state(cfs_task_state_t, int64_t);
+
+void init_waitqueue_entry_current(wait_queue_t *link);
+int64_t waitq_timedwait(wait_queue_t *, cfs_task_state_t, int64_t);
+void waitq_wait(wait_queue_t *, cfs_task_state_t);
+void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *);
+
+void cfs_init_timer(timer_list_t *t);
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg);
+void cfs_timer_done(timer_list_t *t);
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(timer_list_t *t);
+int  cfs_timer_is_armed(timer_list_t *t);
+cfs_time_t cfs_timer_deadline(timer_list_t *t);
+
+/*
+ * Memory
+ */
+#ifndef memory_pressure_get
+#define memory_pressure_get() (0)
+#endif
+#ifndef memory_pressure_set
+#define memory_pressure_set() do {} while (0)
+#endif
+#ifndef memory_pressure_clr
+#define memory_pressure_clr() do {} while (0)
+#endif
+
+static inline int cfs_memory_pressure_get_and_set(void)
+{
+       int old = memory_pressure_get();
+
+       if (!old)
+               memory_pressure_set();
+       return old;
+}
+
+static inline void cfs_memory_pressure_restore(int old)
+{
+       if (old)
+               memory_pressure_set();
+       else
+               memory_pressure_clr();
+       return;
+}
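+
+/*
+ * Illustrative usage (editor's sketch): the save/restore pair nests safely
+ * because the old state is handed back to the caller:
+ *
+ *	int old = cfs_memory_pressure_get_and_set();
+ *
+ *	... allocate while signalling memory pressure ...
+ *
+ *	cfs_memory_pressure_restore(old);
+ */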
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
new file mode 100644 (file)
index 0000000..056caa4
--- /dev/null
@@ -0,0 +1,577 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_private.h
+ *
+ * Various defines for libcfs.
+ *
+ */
+
+#ifndef __LIBCFS_PRIVATE_H__
+#define __LIBCFS_PRIVATE_H__
+
+/* XXX this layering violation is for nidstrings */
+#include <linux/lnet/types.h>
+
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
+
+
+
+/*
+ * When this is on, the LASSERT macro includes a check for assignment used
+ * instead of an equality check, but doesn't have unlikely(). Turn this on
+ * from time to time for test builds. It shouldn't be on for a production
+ * release.
+ */
+#define LASSERT_CHECKED (0)
+
+
+#define LASSERTF(cond, fmt, ...)                                       \
+do {                                                                   \
+       if (unlikely(!(cond))) {                                        \
+               LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL);  \
+               libcfs_debug_msg(&__msg_data,                           \
+                                "ASSERTION( %s ) failed: " fmt, #cond, \
+                                ## __VA_ARGS__);                       \
+               lbug_with_loc(&__msg_data);                             \
+       }                                                               \
+} while (0)
+
+#define LASSERT(cond) LASSERTF(cond, "\n")
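+
+/*
+ * Illustrative usage (editor's sketch): LASSERTF adds printf-style context
+ * to the failure report, e.g.
+ *
+ *	LASSERTF(count >= 0, "bad count %d\n", count);
+ *
+ * while LASSERT(count >= 0) reports only the failed condition itself.
+ */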
+
+#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK
+/**
+ * This is for more expensive checks that one doesn't want enabled all the
+ * time. LINVRNT() has to be explicitly enabled by the
+ * CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK option.
+ */
+# define LINVRNT(exp) LASSERT(exp)
+#else
+# define LINVRNT(exp) ((void)sizeof(!!(exp)))
+#endif
+
+#define KLASSERT(e) LASSERT(e)
+
+void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn));
+
+#define LBUG()                                                   \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);          \
+       lbug_with_loc(&msgdata);                                        \
+} while (0)
+
+extern atomic_t libcfs_kmemory;
+/*
+ * Memory
+ */
+
+# define libcfs_kmem_inc(ptr, size)            \
+do {                                           \
+       atomic_add(size, &libcfs_kmemory);      \
+} while (0)
+
+# define libcfs_kmem_dec(ptr, size)            \
+do {                                           \
+       atomic_sub(size, &libcfs_kmemory);      \
+} while (0)
+
+# define libcfs_kmem_read()                    \
+       atomic_read(&libcfs_kmemory)
+
+
+#ifndef LIBCFS_VMALLOC_SIZE
+#define LIBCFS_VMALLOC_SIZE    (2 << PAGE_CACHE_SHIFT) /* 2 pages */
+#endif
+
+#define LIBCFS_ALLOC_PRE(size, mask)                                       \
+do {                                                                       \
+       LASSERT(!in_interrupt() ||                                          \
+               ((size) <= LIBCFS_VMALLOC_SIZE &&                           \
+                ((mask) & GFP_ATOMIC)) != 0);                      \
+} while (0)
+
+#define LIBCFS_ALLOC_POST(ptr, size)                                       \
+do {                                                                       \
+       if (unlikely((ptr) == NULL)) {                                      \
+               CERROR("LNET: out of memory at %s:%d (tried to alloc '"     \
+                      #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));  \
+               CERROR("LNET: %d total bytes allocated by lnet\n",          \
+                      libcfs_kmem_read());                                 \
+       } else {                                                            \
+               memset((ptr), 0, (size));                                   \
+               libcfs_kmem_inc((ptr), (size));                             \
+               CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n",  \
+                      (int)(size), (ptr), libcfs_kmem_read());             \
+       }                                                                  \
+} while (0)
+
+/**
+ * allocate memory with GFP flags @mask
+ */
+#define LIBCFS_ALLOC_GFP(ptr, size, mask)                                  \
+do {                                                                       \
+       LIBCFS_ALLOC_PRE((size), (mask));                                   \
+       (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?                             \
+               kmalloc((size), (mask)) : vmalloc(size);            \
+       LIBCFS_ALLOC_POST((ptr), (size));                                   \
+} while (0)
+
+/**
+ * default allocator
+ */
+#define LIBCFS_ALLOC(ptr, size) \
+       LIBCFS_ALLOC_GFP(ptr, size, __GFP_IO)
+
+/**
+ * non-sleeping allocator
+ */
+#define LIBCFS_ALLOC_ATOMIC(ptr, size) \
+       LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC)
+
+/**
+ * allocate memory for specified CPU partition
+ *   \a cptab != NULL, \a cpt is CPU partition id of \a cptab
+ *   \a cptab == NULL, \a cpt is HW NUMA node id
+ */
+#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask)                  \
+do {                                                                       \
+       LIBCFS_ALLOC_PRE((size), (mask));                                   \
+       (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ?                             \
+               kmalloc_node((size), (mask), cfs_cpt_spread_node(cptab, cpt)) :\
+               vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt));        \
+       LIBCFS_ALLOC_POST((ptr), (size));                                   \
+} while (0)
+
+/** default numa allocator */
+#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size)                                    \
+       LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define LIBCFS_FREE(ptr, size)                                   \
+do {                                                               \
+       int s = (size);                                          \
+       if (unlikely((ptr) == NULL)) {                            \
+               CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
+                      "%s:%d\n", s, __FILE__, __LINE__);              \
+               break;                                            \
+       }                                                              \
+       libcfs_kmem_dec((ptr), s);                                    \
+       CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",     \
+              s, (ptr), libcfs_kmem_read());                           \
+       if (unlikely(s > LIBCFS_VMALLOC_SIZE))                    \
+               vfree(ptr);                                 \
+       else                                                        \
+               kfree(ptr);                                       \
+} while (0)
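+
+/*
+ * Illustrative usage (editor's sketch; struct foo is hypothetical): the
+ * macros take the pointer as an lvalue and zero the memory on success:
+ *
+ *	struct foo *fp;
+ *
+ *	LIBCFS_ALLOC(fp, sizeof(*fp));
+ *	if (fp == NULL)
+ *		return -ENOMEM;
+ *	...
+ *	LIBCFS_FREE(fp, sizeof(*fp));
+ *
+ * The same size must be passed to LIBCFS_FREE so that the kmalloc/vmalloc
+ * choice and the libcfs_kmemory accounting stay consistent.
+ */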
+
+/******************************************************************************/
+
+/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#define ___htonl(x) __cpu_to_be32(x)
+#define ___htons(x) __cpu_to_be16(x)
+#define ___ntohl(x) __be32_to_cpu(x)
+#define ___ntohs(x) __be16_to_cpu(x)
+#define htonl(x) ___htonl(x)
+#define ntohl(x) ___ntohl(x)
+#define htons(x) ___htons(x)
+#define ntohs(x) ___ntohs(x)
+#endif
+
+void libcfs_debug_dumpstack(task_t *tsk);
+void libcfs_run_upcall(char **argv);
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *);
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+int libcfs_debug_clear_buffer(void);
+int libcfs_debug_mark_buffer(const char *text);
+
+void libcfs_debug_set_level(unsigned int debug_level);
+
+
+/*
+ * allocate per-cpu-partition data, returned value is an array of pointers,
+ * variable can be indexed by CPU ID.
+ *     cptable != NULL: size of array is number of CPU partitions
+ *     cptable == NULL: size of array is number of HW cores
+ */
+void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size);
+/*
+ * destory per-cpu-partition variable
+ */
+void  cfs_percpt_free(void *vars);
+int   cfs_percpt_number(void *vars);
+void *cfs_percpt_current(void *vars);
+void *cfs_percpt_index(void *vars, int idx);
+
+#define cfs_percpt_for_each(var, i, vars)              \
+       for (i = 0; i < cfs_percpt_number(vars) &&      \
+                   ((var) = (vars)[i]) != NULL; i++)
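+
+/*
+ * Illustrative usage (editor's sketch; struct my_counter and its value
+ * field are hypothetical):
+ *
+ *	struct my_counter **cnts;
+ *	struct my_counter *c;
+ *	int i;
+ *
+ *	cnts = cfs_percpt_alloc(cptab, sizeof(**cnts));
+ *	cfs_percpt_for_each(c, i, cnts)
+ *		c->value = 0;
+ *	...
+ *	cfs_percpt_free(cnts);
+ */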
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by count.
+ */
+void *cfs_array_alloc(int count, unsigned int size);
+void  cfs_array_free(void *vars);
+
+#define LASSERT_ATOMIC_ENABLED   (1)
+
+#if LASSERT_ATOMIC_ENABLED
+
+/** assert value of @a is equal to @v */
+#define LASSERT_ATOMIC_EQ(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) == v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is unequal to @v */
+#define LASSERT_ATOMIC_NE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) != v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is little than @v */
+#define LASSERT_ATOMIC_LT(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) < v,                    \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is little/equal to @v */
+#define LASSERT_ATOMIC_LE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) <= v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is great than @v */
+#define LASSERT_ATOMIC_GT(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) > v,                    \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is great/equal to @v */
+#define LASSERT_ATOMIC_GE(a, v)                                 \
+do {                                                       \
+       LASSERTF(atomic_read(a) >= v,                  \
+                "value: %d\n", atomic_read((a)));        \
+} while (0)
+
+/** assert value of @a is great than @v1 and little than @v2 */
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v);     \
+} while (0)
+
+/** assert value of @a is great than @v1 and little/equal to @v2 */
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v);    \
+} while (0)
+
+/** assert value of @a is great/equal to @v1 and little than @v2 */
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v);    \
+} while (0)
+
+/** assert value of @a is great/equal to @v1 and little/equal to @v2 */
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)                         \
+do {                                                       \
+       int __v = atomic_read(a);                          \
+       LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v);   \
+} while (0)
+
+#else /* !LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_EQ(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_NE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_LT(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_LE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GT(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GE(a, v)                 do {} while (0)
+#define LASSERT_ATOMIC_GT_LT(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GT_LE(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GE_LT(a, v1, v2)         do {} while (0)
+#define LASSERT_ATOMIC_GE_LE(a, v1, v2)         do {} while (0)
+
+#endif /* LASSERT_ATOMIC_ENABLED */
+
+#define LASSERT_ATOMIC_ZERO(a)           LASSERT_ATOMIC_EQ(a, 0)
+#define LASSERT_ATOMIC_POS(a)             LASSERT_ATOMIC_GT(a, 0)
+
+#define CFS_ALLOC_PTR(ptr)      LIBCFS_ALLOC(ptr, sizeof(*(ptr)))
+#define CFS_FREE_PTR(ptr)       LIBCFS_FREE(ptr, sizeof(*(ptr)))
+
+/*
+ * percpu partition lock
+ *
+ * There are some use-cases like this in Lustre:
+ * . each CPU partition has its own private data which is frequently changed,
+ *   and mostly by the local CPU partition.
+ * . all CPU partitions share some global data; these data are rarely changed.
+ *
+ * LNet is a typical example.
+ * The CPU partition lock is designed for this kind of use-case:
+ * . each CPU partition has its own private lock
+ * . a change to private data just needs to take the private lock
+ * . a read of shared data just needs to take _any_ of the private locks
+ * . a change to shared data needs to take _all_ private locks,
+ *   which is slow and should be really rare.
+ */
+
+enum {
+       CFS_PERCPT_LOCK_EX      = -1, /* negative */
+};
+
+
+struct cfs_percpt_lock {
+       /* cpu-partition-table for this lock */
+       struct cfs_cpt_table    *pcl_cptab;
+       /* exclusively locked */
+       unsigned int            pcl_locked;
+       /* private lock table */
+       spinlock_t              **pcl_locks;
+};
+
+/* return number of private locks */
+static inline int
+cfs_percpt_lock_num(struct cfs_percpt_lock *pcl)
+{
+       return cfs_cpt_number(pcl->pcl_cptab);
+}
+
+
+/*
+ * create a cpu-partition lock based on CPU partition table \a cptab;
+ * one private lock is created for each CPU partition in \a cptab
+ */
+struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab);
+/* destroy a cpu-partition lock */
+void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl);
+
+/* lock private lock \a index of \a pcl */
+void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index);
+/* unlock private lock \a index of \a pcl */
+void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index);
+/* create percpt (atomic) refcount based on @cptab */
+atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val);
+/* destroy percpt refcount */
+void cfs_percpt_atomic_free(atomic_t **refs);
+/* return sum of all percpu refs */
+int cfs_percpt_atomic_summary(atomic_t **refs);
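+
+/*
+ * Illustrative usage (editor's sketch, assuming CFS_PERCPT_LOCK_EX selects
+ * the exclusive all-locks mode):
+ *
+ *	cfs_percpt_lock(pcl, cpt);		(private update, fast path)
+ *	... modify this partition's private data ...
+ *	cfs_percpt_unlock(pcl, cpt);
+ *
+ *	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);	(shared update, slow)
+ *	... modify the shared data ...
+ *	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
+ */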
+
+
+/** Compile-time assertion.
+ *
+ * Check an invariant described by a constant expression at compile time by
+ * forcing a compiler error if it does not hold.  \a cond must be a constant
+ * expression as defined by the ISO C Standard:
+ *
+ *       6.8.4.2  The switch statement
+ *       ....
+ *       [#3] The expression of each case label shall be  an  integer
+ *       constant   expression  and  no  two  of  the  case  constant
+ *       expressions in the same switch statement shall have the same
+ *       value  after  conversion...
+ *
+ */
+#define CLASSERT(cond) do { switch (42) { case (cond): case 0: break; } } while (0)
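+
+/*
+ * Illustrative usage (editor's sketch): a false condition duplicates the
+ * "case 0" label and the build fails, so
+ *
+ *	CLASSERT(sizeof(__u64) == 8);
+ *
+ * compiles only if the invariant holds.
+ */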
+
+/* support decl needed both by kernel and liblustre */
+int     libcfs_isknown_lnd(int type);
+char       *libcfs_lnd2modname(int type);
+char       *libcfs_lnd2str(int type);
+int     libcfs_str2lnd(const char *str);
+char       *libcfs_net2str(__u32 net);
+char       *libcfs_nid2str(lnet_nid_t nid);
+__u32       libcfs_str2net(const char *str);
+lnet_nid_t  libcfs_str2nid(const char *str);
+int     libcfs_str2anynid(lnet_nid_t *nid, const char *str);
+char       *libcfs_id2str(lnet_process_id_t id);
+void   cfs_free_nidlist(struct list_head *list);
+int     cfs_parse_nidlist(char *str, int len, struct list_head *list);
+int     cfs_match_nid(lnet_nid_t nid, struct list_head *list);
+
+/** \addtogroup lnet_addr
+ * @{ */
+/* how an LNET NID encodes net:address */
+/** extract the address part of an lnet_nid_t */
+#define LNET_NIDADDR(nid)      ((__u32)((nid) & 0xffffffff))
+/** extract the network part of an lnet_nid_t */
+#define LNET_NIDNET(nid)       ((__u32)(((nid) >> 32)) & 0xffffffff)
+/** make an lnet_nid_t from a network part and an address part */
+#define LNET_MKNID(net,addr)   ((((__u64)(net))<<32)|((__u64)(addr)))
+/* how net encodes type:number */
+#define LNET_NETNUM(net)       ((net) & 0xffff)
+#define LNET_NETTYP(net)       (((net) >> 16) & 0xffff)
+#define LNET_MKNET(typ,num)    ((((__u32)(typ))<<16)|((__u32)(num)))
+/** @} lnet_addr */
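+
+/*
+ * Worked example (editor's sketch): for host address 0x0a000001 (10.0.0.1)
+ * on the first socklnd network,
+ *
+ *	__u32      net = LNET_MKNET(SOCKLND, 0);
+ *	lnet_nid_t nid = LNET_MKNID(net, 0x0a000001);
+ *
+ * gives LNET_NIDADDR(nid) == 0x0a000001 and LNET_NIDNET(nid) == net.
+ * (SOCKLND is defined in the LND type enum below.)
+ */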
+
+/* max value for numeric network address */
+#define MAX_NUMERIC_VALUE 0xffffffff
+
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
+
+#ifndef CFS_CURRENT_TIME
+# define CFS_CURRENT_TIME time(0)
+#endif
+
+/* --------------------------------------------------------------------
+ * Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect.
+ * All of the lwt machinery lives in arch/kp30.h
+ * -------------------------------------------------------------------- */
+
+struct libcfs_device_userstate {
+       int        ldu_memhog_pages;
+       struct page   *ldu_memhog_root_page;
+};
+
+/* what used to be in portals_lib.h */
+#ifndef MIN
+# define MIN(a,b) (((a)<(b)) ? (a): (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) (((a)>(b)) ? (a): (b))
+#endif
+
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")
+
+static inline int cfs_size_round4(int val)
+{
+       return (val + 3) & (~0x3);
+}
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round(int val)
+{
+       return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+static inline int cfs_size_round16(int val)
+{
+       return (val + 0xf) & (~0xf);
+}
+
+static inline int cfs_size_round32(int val)
+{
+       return (val + 0x1f) & (~0x1f);
+}
+
+static inline int cfs_size_round0(int val)
+{
+       if (!val)
+               return 0;
+       return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t cfs_round_strlen(char *fset)
+{
+       return (size_t)cfs_size_round((int)strlen(fset) + 1);
+}
+
+/* round up \a val to the next power of 2 */
+static inline unsigned int cfs_power2_roundup(unsigned int val)
+{
+       if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */
+               do {
+                       val &= ~LOWEST_BIT_SET(val);
+               } while (val != LOWEST_BIT_SET(val));
+               /* ...and round up */
+               val <<= 1;
+       }
+       return val;
+}
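+
+/*
+ * Worked examples (editor's addition): cfs_size_round(13) == 16 (8-byte
+ * rounding), cfs_size_round0(1) == 8 (one extra byte is reserved for a
+ * terminator before rounding), and cfs_power2_roundup(40) == 64.
+ */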
+
+#define LOGL(var,len,ptr)                                     \
+do {                                                       \
+       if (var)                                                \
+               memcpy((char *)ptr, (const char *)var, len);    \
+       ptr += cfs_size_round(len);                          \
+} while (0)
+
+#define LOGU(var,len,ptr)                                     \
+do {                                                       \
+       if (var)                                                \
+               memcpy((char *)var, (const char *)ptr, len);    \
+       ptr += cfs_size_round(len);                          \
+} while (0)
+
+#define LOGL0(var,len,ptr)                           \
+do {                                               \
+       if (!len)                                      \
+               break;                            \
+       memcpy((char *)ptr, (const char *)var, len);    \
+       *((char *)(ptr) + len) = 0;                  \
+       ptr += cfs_size_round(len + 1);          \
+} while (0)
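+
+/*
+ * Illustrative usage (editor's sketch; p and name are hypothetical): LOGL
+ * packs a buffer into a byte stream and advances the cursor by the rounded
+ * length; LOGU unpacks in the same order:
+ *
+ *	char *p = msg_buf;
+ *
+ *	LOGL(name, name_len, p);	(p += cfs_size_round(name_len))
+ */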
+
+/**
+ *  Lustre Network Driver types.
+ */
+enum {
+       /* Only add to these values (i.e. don't ever change or redefine them):
+        * network addresses depend on them... */
+       QSWLND    = 1,
+       SOCKLND   = 2,
+       GMLND     = 3, /* obsolete, keep it so that libcfs_nid2str works */
+       PTLLND    = 4,
+       O2IBLND   = 5,
+       CIBLND    = 6,
+       OPENIBLND = 7,
+       IIBLND    = 8,
+       LOLND     = 9,
+       RALND     = 10,
+       VIBLND    = 11,
+       MXLND     = 12,
+       GNILND    = 13,
+};
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
new file mode 100644 (file)
index 0000000..a6bac9c
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_string.h
+ *
+ * Generic string manipulation functions.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#ifndef __LIBCFS_STRING_H__
+#define __LIBCFS_STRING_H__
+
+/* libcfs_string.c */
+/* string comparison ignoring case */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n);
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+                int *oldmask, int minmask, int allmask);
+
+/* Allocate space for and copy an existing string.
+ * The result must be freed with kfree().
+ */
+char *cfs_strdup(const char *str, u_int32_t flags);
+
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...);
+
+/* trim leading and trailing space characters */
+char *cfs_firststr(char *str, size_t size);
+
+/**
+ * Structure to represent strings that need not be NUL-terminated
+ * (a pointer plus an explicit length).
+ */
+struct cfs_lstr {
+       char            *ls_str;
+       int             ls_len;
+};
+
+/*
+ * Structure to represent a \<range_expr\> token of the syntax.
+ */
+struct cfs_range_expr {
+       /*
+        * Link to cfs_expr_list::el_exprs.
+        */
+       struct list_head        re_link;
+       __u32           re_lo;
+       __u32           re_hi;
+       __u32           re_stride;
+};
+
+struct cfs_expr_list {
+       struct list_head        el_link;
+       struct list_head        el_exprs;
+};
+
+static inline int
+cfs_iswhite(char c)
+{
+       switch (c) {
+       case ' ':
+       case '\t':
+       case '\n':
+       case '\r':
+               return 1;
+       default:
+               break;
+       }
+       return 0;
+}
+
+char *cfs_trimwhite(char *str);
+int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
+int cfs_str2num_check(char *str, int nob, unsigned *num,
+                     unsigned min, unsigned max);
+int cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+                        int single_tok, struct cfs_range_expr **expr);
+int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list);
+int cfs_expr_list_values(struct cfs_expr_list *expr_list,
+                        int max, __u32 **values);
+static inline void
+cfs_expr_list_values_free(__u32 *values, int num)
+{
+       /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed
+        * by OBD_FREE() if the caller is a module other than libcfs or LNet;
+        * otherwise we would see a spurious memory leak. */
+       LIBCFS_FREE(values, num * sizeof(values[0]));
+}
+
+void cfs_expr_list_free(struct cfs_expr_list *expr_list);
+void cfs_expr_list_print(struct cfs_expr_list *expr_list);
+int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+                       struct cfs_expr_list **elpp);
+void cfs_expr_list_free_list(struct list_head *list);
+int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
+int cfs_ip_addr_match(__u32 addr, struct list_head *list);
+void cfs_ip_addr_free(struct list_head *list);
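+
+/*
+ * Illustrative usage (editor's sketch, assuming cfs_expr_list_parse()
+ * returns 0 on success): parse a range expression such as "0-7/2" and
+ * test membership:
+ *
+ *	struct cfs_expr_list *el;
+ *
+ *	if (cfs_expr_list_parse(str, strlen(str), 0, 255, &el) == 0) {
+ *		if (cfs_expr_list_match(4, el))
+ *			... 4 is in the set ...
+ *		cfs_expr_list_free(el);
+ *	}
+ */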
+
+#define strtoul(str, endp, base)       simple_strtoul(str, endp, base)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h
new file mode 100644 (file)
index 0000000..4bdd771
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_time.h
+ *
+ * Time functions.
+ *
+ */
+
+#ifndef __LIBCFS_TIME_H__
+#define __LIBCFS_TIME_H__
+/*
+ * generic time manipulation functions.
+ */
+
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+       return (cfs_time_t)(t + d);
+}
+
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+       return (cfs_duration_t)(t1 - t2);
+}
+
+static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2)
+{
+       return cfs_time_before(t2, t1);
+}
+
+static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2)
+{
+       return cfs_time_beforeq(t2, t1);
+}
+
+
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+       return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+}
+
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+                                  struct timeval *result)
+{
+       long r = (long) (
+               (large->tv_sec - small->tv_sec) * ONE_MILLION +
+               (large->tv_usec - small->tv_usec));
+       if (result != NULL) {
+               result->tv_usec = r % ONE_MILLION;
+               result->tv_sec = r / ONE_MILLION;
+       }
+       return r;
+}
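+
+/*
+ * Worked example (editor's addition): with large = {5, 250000} and
+ * small = {4, 750000} the difference is 500000 usec, so the return value
+ * is 500000 and *result becomes { .tv_sec = 0, .tv_usec = 500000 }.
+ */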
+
+static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
+{
+       if (cfs_time_after(cfs_time_current(),
+                          cfs_time_add(now, cfs_time_seconds(seconds))))
+               CERROR("slow %s "CFS_TIME_T" sec\n", msg,
+                      cfs_duration_sec(cfs_time_sub(cfs_time_current(), now)));
+}
+
+#define CFS_RATELIMIT(seconds)                           \
+({                                                           \
+       /*                                                    \
+        * XXX nikita: non-portable initializer          \
+        */                                                  \
+       static time_t __next_message = 0;                      \
+       int result;                                          \
+                                                               \
+       if (cfs_time_after(cfs_time_current(), __next_message)) {      \
+               __next_message = cfs_time_shift(seconds);               \
+               result = 1;                                             \
+       } else {                                                        \
+               result = 0;                                             \
+       }                                                               \
+       result;                                          \
+})
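+
+/*
+ * Illustrative usage (editor's sketch): limit a noisy message to at most
+ * one report per interval:
+ *
+ *	if (CFS_RATELIMIT(30))
+ *		CERROR("still waiting for a reply\n");
+ */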
+
+/*
+ * a helper function similar to the Linux kernel's do_gettimeofday()
+ */
+static inline void cfs_fs_timeval(struct timeval *tv)
+{
+       cfs_fs_time_t time;
+
+       cfs_fs_time_current(&time);
+       cfs_fs_time_usec(&time, tv);
+}
+
+/*
+ * return a valid time-out based on the user-supplied one. Currently we only
+ * check that the time-out is not shorter than the minimum allowed.
+ */
+static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
+{
+       if (timeout < CFS_TICK)
+               timeout = CFS_TICK;
+       return timeout;
+}
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h
new file mode 100644 (file)
index 0000000..5cc64f3
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/libcfs_workitem.h
+ *
+ * Author: Isaac Huang  <he.h.huang@oracle.com>
+ *      Liang Zhen   <zhen.liang@sun.com>
+ *
+ * A workitem is deferred work with these semantics:
+ * - a workitem always runs in thread context.
+ * - a workitem can be concurrent with other workitems but is strictly
+ *   serialized with respect to itself.
+ * - no CPU affinity, a workitem does not necessarily run on the same CPU
+ *   that schedules it. However, this might change in the future.
+ * - if a workitem is scheduled again before it has a chance to run, it
+ *   runs only once.
+ * - if a workitem is scheduled while it runs, it runs again after it
+ *   completes; this ensures that events occurring while other events are
+ *   being processed receive due attention. This behavior also allows a
+ *   workitem to reschedule itself.
+ *
+ * Usage notes:
+ * - a workitem can sleep but it should be aware of how that sleep might
+ *   affect others.
+ * - a workitem runs inside a kernel thread so there's no user space to access.
+ * - do not use a workitem if the scheduling latency can't be tolerated.
+ *
+ * When wi_action returns non-zero, it means the workitem has either been
+ * freed or reused and the workitem scheduler won't touch it any more.
+ */
+
+#ifndef __LIBCFS_WORKITEM_H__
+#define __LIBCFS_WORKITEM_H__
+
+struct cfs_wi_sched;
+
+void cfs_wi_sched_destroy(struct cfs_wi_sched *);
+int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt,
+                       int nthrs, struct cfs_wi_sched **);
+
+struct cfs_workitem;
+
+typedef int (*cfs_wi_action_t) (struct cfs_workitem *);
+typedef struct cfs_workitem {
+       /** chain on runq or rerunq */
+       struct list_head       wi_list;
+       /** working function */
+       cfs_wi_action_t  wi_action;
+       /** arg for working function */
+       void        *wi_data;
+       /** in running */
+       unsigned short   wi_running:1;
+       /** scheduled */
+       unsigned short   wi_scheduled:1;
+} cfs_workitem_t;
+
+static inline void
+cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action)
+{
+       INIT_LIST_HEAD(&wi->wi_list);
+
+       wi->wi_running   = 0;
+       wi->wi_scheduled = 0;
+       wi->wi_data      = data;
+       wi->wi_action    = action;
+}
+
+void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+int  cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
+void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi);
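+
+/*
+ * Illustrative usage (editor's sketch; my_action, my_obj and obj->wi are
+ * hypothetical): a workitem is initialized once and then scheduled;
+ * returning non-zero from the action tells the scheduler the item may
+ * already have been freed:
+ *
+ *	static int my_action(cfs_workitem_t *wi)
+ *	{
+ *		struct my_obj *obj = wi->wi_data;
+ *
+ *		... do the deferred work ...
+ *		return 0;	(keep the workitem alive)
+ *	}
+ *
+ *	cfs_wi_init(&obj->wi, obj, my_action);
+ *	cfs_wi_schedule(sched, &obj->wi);
+ */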
+
+int  cfs_wi_startup(void);
+void cfs_wi_shutdown(void);
+
+/** # workitem scheduler loops before reschedule */
+#define CFS_WI_RESCHED    128
+
+#endif /* __LIBCFS_WORKITEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h
new file mode 100644 (file)
index 0000000..4b7ae1c
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_KP30_H__
+#define __LIBCFS_LINUX_KP30_H__
+
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/smp.h>
+#include <linux/ctype.h>
+#include <linux/compiler.h>
+#ifdef HAVE_MM_INLINE
+# include <linux/mm_inline.h>
+#endif
+#include <linux/kallsyms.h>
+#include <linux/moduleparam.h>
+#include <linux/scatterlist.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+
+#define prepare_work(wq,cb,cbdata)                                         \
+do {                                                                     \
+       INIT_WORK((wq), (void *)(cb));                                  \
+} while (0)
+
+#define cfs_get_work_data(type,field,data) container_of(data,type,field)
+
+
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+#define work_struct_t      struct work_struct
+
+
+
+#define SEM_COUNT(sem)   ((sem)->count)
+
+
+/* ------------------------------------------------------------------- */
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+
+
+
+/******************************************************************************/
+/* Module parameter support */
+#define CFS_MODULE_PARM(name, t, type, perm, desc) \
+       module_param(name, type, perm);\
+       MODULE_PARM_DESC(name, desc)
+
+#define CFS_SYSFS_MODULE_PARM  1 /* module parameters accessible via sysfs */
+
+/******************************************************************************/
+
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+
+#endif /* __GNUC__ */
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+# define cfs_num_present_cpus()  num_present_cpus()
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT  0
+
+#define LWT_MEMORY   (16<<20)
+
+#ifndef KLWT_SUPPORT
+#  if !defined(BITS_PER_LONG)
+#   error "BITS_PER_LONG not defined"
+#  endif
+
+/* kernel hasn't defined this? */
+typedef struct {
+       long long   lwte_when;
+       char       *lwte_where;
+       void       *lwte_task;
+       long    lwte_p1;
+       long    lwte_p2;
+       long    lwte_p3;
+       long    lwte_p4;
+# if BITS_PER_LONG > 32
+       long    lwte_pad;
+# endif
+} lwt_event_t;
+#endif /* !KLWT_SUPPORT */
+
+#if LWT_SUPPORT
+#  if !KLWT_SUPPORT
+
+typedef struct _lwt_page {
+       struct list_head               lwtp_list;
+       struct page          *lwtp_page;
+       lwt_event_t          *lwtp_events;
+} lwt_page_t;
+
+typedef struct {
+       int             lwtc_current_index;
+       lwt_page_t      *lwtc_current_page;
+} lwt_cpu_t;
+
+extern int       lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+#define LWTSTR(n)       #n
+#define LWTWHERE(f,l)   f ":" LWTSTR(l)
+#define LWT_EVENTS_PER_PAGE (PAGE_CACHE_SIZE / sizeof (lwt_event_t))
+
+#define LWT_EVENT(p1, p2, p3, p4)                                     \
+do {                                                               \
+       unsigned long    flags;                                  \
+       lwt_cpu_t       *cpu;                                      \
+       lwt_page_t      *p;                                          \
+       lwt_event_t     *e;                                          \
+                                                                       \
+       if (lwt_enabled) {                                            \
+               local_irq_save (flags);                          \
+                                                                       \
+               cpu = &lwt_cpus[smp_processor_id()];                \
+               p = cpu->lwtc_current_page;                          \
+               e = &p->lwtp_events[cpu->lwtc_current_index++];  \
+                                                                       \
+               if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) {   \
+                       cpu->lwtc_current_page =                        \
+                               list_entry (p->lwtp_list.next,      \
+                                               lwt_page_t, lwtp_list); \
+                       cpu->lwtc_current_index = 0;                \
+               }                                                      \
+                                                                       \
+               e->lwte_when  = get_cycles();                      \
+               e->lwte_where = LWTWHERE(__FILE__,__LINE__);        \
+               e->lwte_task  = current;                                \
+               e->lwte_p1    = (long)(p1);                          \
+               e->lwte_p2    = (long)(p2);                          \
+               e->lwte_p3    = (long)(p3);                          \
+               e->lwte_p4    = (long)(p4);                          \
+                                                                       \
+               local_irq_restore (flags);                            \
+       }                                                              \
+} while (0)
+
+#endif /* !KLWT_SUPPORT */
+
+extern int  lwt_init (void);
+extern void lwt_fini (void);
+extern int  lwt_lookup_string (int *size, char *knlptr,
+                              char *usrptr, int usrsize);
+extern int  lwt_control (int enable, int clear);
+extern int  lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+                         void *user_ptr, int user_size);
+#endif /* LWT_SUPPORT */
+
+/* ------------------------------------------------------------------ */
+
+#define IOCTL_LIBCFS_TYPE long
+
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#   define BITS_PER_LONG 64
+# endif
+#endif
+
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+
+/* this is a bit chunky */
+
+#define _LWORDSIZE BITS_PER_LONG
+
+# define LPU64 "%llu"
+# define LPD64 "%lld"
+# define LPX64 "%#llx"
+# define LPX64i "%llx"
+# define LPO64 "%#llo"
+# define LPF64 "L"
+
+/*
+ * long_ptr_t & ulong_ptr_t, same as "long" for gcc
+ */
+# define LPLU "%lu"
+# define LPLD "%ld"
+# define LPLX "%#lx"
+
+/*
+ * pid_t
+ */
+# define LPPID "%d"
+
+
+#undef _LWORDSIZE
+
+/* compatibility macros */
+
+
+#ifndef get_cpu
+# ifdef CONFIG_PREEMPT
+#  define get_cpu()  ({ preempt_disable(); smp_processor_id(); })
+#  define put_cpu()  preempt_enable()
+# else
+#  define get_cpu()  smp_processor_id()
+#  define put_cpu()
+# endif
+#endif /* get_cpu & put_cpu */
+
+#define INIT_CTL_NAME(a)
+#define INIT_STRATEGY(a)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h
new file mode 100644 (file)
index 0000000..292a3ba
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_LIBCFS_H__
+#define __LIBCFS_LINUX_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+
+#include <stdarg.h>
+#include <linux/libcfs/linux/linux-cpu.h>
+#include <linux/libcfs/linux/linux-time.h>
+#include <linux/libcfs/linux/linux-mem.h>
+#include <linux/libcfs/linux/linux-prim.h>
+#include <linux/libcfs/linux/linux-lock.h>
+#include <linux/libcfs/linux/linux-fs.h>
+#include <linux/libcfs/linux/linux-tcpip.h>
+#include <linux/libcfs/linux/linux-bitops.h>
+#include <linux/libcfs/linux/linux-types.h>
+#include <linux/libcfs/linux/kp30.h>
+
+#include <asm/types.h>
+#include <linux/types.h>
+#include <asm/timex.h>
+#include <linux/sched.h> /* THREAD_SIZE */
+#include <linux/rbtree.h>
+
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+#if !defined(__x86_64__)
+# ifdef  __ia64__
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                         ((unsigned long)__builtin_dwarf_cfa() &       \
+                          (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                         ((unsigned long)__builtin_frame_address(0) &  \
+                          (THREAD_SIZE - 1)))
+# endif /* __ia64__ */
+
+#define __CHECK_STACK(msgdata, mask, cdls)                           \
+do {                                                               \
+       if (unlikely(CDEBUG_STACK() > libcfs_stack)) {            \
+               LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL);   \
+               libcfs_stack = CDEBUG_STACK();                    \
+               libcfs_debug_msg(msgdata,                              \
+                                "maximum lustre stack %lu\n",    \
+                                CDEBUG_STACK());                      \
+               (msgdata)->msg_mask = mask;                          \
+               (msgdata)->msg_cdls = cdls;                          \
+               dump_stack();                                      \
+             /*panic("LBUG");*/                                        \
+       }                                                              \
+} while (0)
+#define CFS_CHECK_STACK(msgdata, mask, cdls)  __CHECK_STACK(msgdata, mask, cdls)
+#else /* __x86_64__ */
+#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0)
+#define CDEBUG_STACK() (0L)
+#endif /* __x86_64__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID          12345
+
+#define ENTRY_NESTING_SUPPORT (1)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+/**
+ * Platform specific declarations for cfs_curproc API (libcfs/curproc.h)
+ *
+ * Implementation is in linux-curproc.c
+ */
+#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm)
+
+#include <linux/capability.h>
+
+/* long integer with size equal to pointer */
+typedef unsigned long ulong_ptr_t;
+typedef long long_ptr_t;
+
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+
+
+
+#endif /* __LIBCFS_LINUX_LIBCFS_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h
new file mode 100644 (file)
index 0000000..43936e3
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-bitops.h
+ */
+#include <linux/bitops.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h
new file mode 100644 (file)
index 0000000..224371c
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-cpu.h
+ *
+ * Basic library routines.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef __LIBCFS_LINUX_CPU_H__
+#define __LIBCFS_LINUX_CPU_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/topology.h>
+#include <linux/version.h>
+
+
+#ifdef CONFIG_SMP
+
+#define HAVE_LIBCFS_CPT
+
+/** virtual processing unit */
+struct cfs_cpu_partition {
+       /* CPUs mask for this partition */
+       cpumask_t                       *cpt_cpumask;
+       /* nodes mask for this partition */
+       nodemask_t                      *cpt_nodemask;
+       /* spread rotor for NUMA allocator */
+       unsigned                        cpt_spread_rotor;
+};
+
+/** descriptor for CPU partitions */
+struct cfs_cpt_table {
+       /* version, reserved for hotplug */
+       unsigned                        ctb_version;
+       /* spread rotor for NUMA allocator */
+       unsigned                        ctb_spread_rotor;
+       /* # of CPU partitions */
+       unsigned                        ctb_nparts;
+       /* partitions tables */
+       struct cfs_cpu_partition        *ctb_parts;
+       /* shadow HW CPU to CPU partition ID */
+       int                             *ctb_cpu2cpt;
+       /* all cpus in this partition table */
+       cpumask_t                       *ctb_cpumask;
+       /* all nodes in this partition table */
+       nodemask_t                      *ctb_nodemask;
+};
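+
+/*
+ * Usage sketch (illustrative; assumes a populated table "cptab"):
+ * the shadow map gives the partition that owns a HW CPU, e.g.
+ *
+ *     int cpt = cptab->ctb_cpu2cpt[cpu];
+ *     cpumask_t *mask = cptab->ctb_parts[cpt].cpt_cpumask;
+ */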
+
+void cfs_cpu_core_siblings(int cpu, cpumask_t *mask);
+void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask);
+void cfs_node_to_cpumask(int node, cpumask_t *mask);
+int cfs_cpu_core_nsiblings(int cpu);
+int cfs_cpu_ht_nsiblings(int cpu);
+
+/**
+ * commented-out definitions kept for the compatibility layer:
+ * #define CFS_CPU_NR                    NR_CPUS
+ *
+ * typedef cpumask_t                      cfs_cpumask_t;
+ *
+ * #define cfs_cpu_current()              smp_processor_id()
+ * #define cfs_cpu_online(i)              cpu_online(i)
+ * #define cfs_cpu_online_num()                num_online_cpus()
+ * #define cfs_cpu_online_for_each(i)    for_each_online_cpu(i)
+ * #define cfs_cpu_possible_num()            num_possible_cpus()
+ * #define cfs_cpu_possible_for_each(i)        for_each_possible_cpu(i)
+ *
+ * #ifdef CONFIG_CPUMASK_SIZE
+ * #define cfs_cpu_mask_size()          cpumask_size()
+ * #else
+ * #define cfs_cpu_mask_size()          sizeof(cfs_cpumask_t)
+ * #endif
+ *
+ * #define cfs_cpu_mask_set(i, mask)      cpu_set(i, mask)
+ * #define cfs_cpu_mask_unset(i, mask)  cpu_clear(i, mask)
+ * #define cfs_cpu_mask_isset(i, mask)  cpu_isset(i, mask)
+ * #define cfs_cpu_mask_clear(mask)        cpus_clear(mask)
+ * #define cfs_cpu_mask_empty(mask)        cpus_empty(mask)
+ * #define cfs_cpu_mask_weight(mask)      cpus_weight(mask)
+ * #define cfs_cpu_mask_first(mask)        first_cpu(mask)
+ * #define cfs_cpu_mask_any_online(mask)      (any_online_cpu(mask) != NR_CPUS)
+ * #define cfs_cpu_mask_for_each(i, mask)      for_each_cpu_mask(i, mask)
+ * #define cfs_cpu_mask_bind(t, mask)    set_cpus_allowed(t, mask)
+ *
+ * #ifdef HAVE_CPUMASK_COPY
+ * #define cfs_cpu_mask_copy(dst, src)  cpumask_copy(dst, src)
+ * #else
+ * #define cfs_cpu_mask_copy(dst, src)  memcpy(dst, src, sizeof(*src))
+ * #endif
+ *
+ * static inline void
+ * cfs_cpu_mask_of_online(cfs_cpumask_t *mask)
+ * {
+ * cfs_cpu_mask_copy(mask, &cpu_online_map);
+ * }
+ *
+ * #ifdef CONFIG_NUMA
+ *
+ * #define CFS_NODE_NR                  MAX_NUMNODES
+ *
+ * typedef nodemask_t                    cfs_node_mask_t;
+ *
+ * #define cfs_node_of_cpu(cpu)                cpu_to_node(cpu)
+ * #define cfs_node_online(i)            node_online(i)
+ * #define cfs_node_online_num()              num_online_nodes()
+ * #define cfs_node_online_for_each(i)  for_each_online_node(i)
+ * #define cfs_node_possible_num()          num_possible_nodes()
+ * #define cfs_node_possible_for_each(i)       for_each_node(i)
+ *
+ * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask)
+ * {
+ * #if defined(HAVE_NODE_TO_CPUMASK)
+ *      *mask = node_to_cpumask(node);
+ * #elif defined(HAVE_CPUMASK_OF_NODE)
+ *      cfs_cpu_mask_copy(mask, cpumask_of_node(node));
+ * #else
+ * # error "Needs node_to_cpumask or cpumask_of_node"
+ * #endif
+ * }
+ *
+ * #define cfs_node_mask_set(i, mask)    node_set(i, mask)
+ * #define cfs_node_mask_unset(i, mask)        node_clear(i, mask)
+ * #define cfs_node_mask_isset(i, mask)        node_isset(i, mask)
+ * #define cfs_node_mask_clear(mask)      nodes_reset(mask)
+ * #define cfs_node_mask_empty(mask)      nodes_empty(mask)
+ * #define cfs_node_mask_weight(mask)    nodes_weight(mask)
+ * #define cfs_node_mask_for_each(i, mask)     for_each_node_mask(i, mask)
+ * #define cfs_node_mask_copy(dst, src)        memcpy(dst, src, sizeof(*src))
+ *
+ * static inline void
+ * cfs_node_mask_of_online(cfs_node_mask_t *mask)
+ * {
+ *       cfs_node_mask_copy(mask, &node_online_map);
+ * }
+ *
+ * #endif
+ */
+
+#endif /* CONFIG_SMP */
+#endif /* __LIBCFS_LINUX_CPU_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h
new file mode 100644 (file)
index 0000000..97c771c
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/**
+ * Linux crypto hash specific functions.
+ */
+
+/**
+ * Functions for start/stop shash CRC32 algorithm.
+ */
+int cfs_crypto_crc32_register(void);
+void cfs_crypto_crc32_unregister(void);
+
+/**
+ * Functions for start/stop shash adler32 algorithm.
+ */
+int cfs_crypto_adler32_register(void);
+void cfs_crypto_adler32_unregister(void);
+
+/**
+ * Functions for start/stop shash crc32 pclmulqdq algorithm.
+ */
+int cfs_crypto_crc32_pclmul_register(void);
+void cfs_crypto_crc32_pclmul_unregister(void);
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h
new file mode 100644 (file)
index 0000000..eebf138
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-fs.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_FS_H__
+#define __LIBCFS_LINUX_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/backing-dev.h>
+#include <linux/posix_acl_xattr.h>
+
+#define filp_size(f)                                   \
+       (i_size_read((f)->f_dentry->d_inode))
+#define filp_poff(f)                                   \
+       (&(f)->f_pos)
+
+# define do_fsync(fp, flag)                            \
+       ((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag))
+
+#define filp_read(fp, buf, size, pos)                  \
+       ((fp)->f_op->read((fp), (buf), (size), pos))
+
+#define filp_write(fp, buf, size, pos)                 \
+       ((fp)->f_op->write((fp), (buf), (size), pos))
+
+#define filp_fsync(fp)                                 \
+       do_fsync(fp, 1)
+
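+/*
+ * Illustrative use of the file wrappers above (a sketch; fp is an
+ * open struct file, buf/count are caller-supplied):
+ *
+ *     loff_t size = filp_size(fp);
+ *
+ *     filp_write(fp, buf, count, filp_poff(fp));
+ *     filp_fsync(fp);
+ */
+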
+#define flock_type(fl)                 ((fl)->fl_type)
+#define flock_set_type(fl, type)       do { (fl)->fl_type = (type); } while (0)
+#define flock_pid(fl)                  ((fl)->fl_pid)
+#define flock_set_pid(fl, pid)         do { (fl)->fl_pid = (pid); } while (0)
+#define flock_start(fl)                        ((fl)->fl_start)
+#define flock_set_start(fl, st)                do { (fl)->fl_start = (st); } while (0)
+#define flock_end(fl)                  ((fl)->fl_end)
+#define flock_set_end(fl, end)         do { (fl)->fl_end = (end); } while (0)
+
+#ifndef IFSHIFT
+#define IFSHIFT                        12
+#endif
+
+#ifndef IFTODT
+#define IFTODT(type)           (((type) & S_IFMT) >> IFSHIFT)
+#endif
+#ifndef DTTOIF
+#define DTTOIF(dirtype)                ((dirtype) << IFSHIFT)
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h
new file mode 100644 (file)
index 0000000..6fbcbf3
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-lock.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_LOCK_H__
+#define __LIBCFS_LINUX_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mutex.h>
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are not guaranteed to be initialized, although
+ * some of them are initialized in Linux. All locks declared by
+ * CFS_DECL_* should be initialized explicitly.
+ */
+
+/*
+ * spin_lock "implementation" (use Linux kernel's primitives)
+ *
+ * - spin_lock_init(x)
+ * - spin_lock(x)
+ * - spin_lock_bh(x)
+ * - spin_lock_bh_init(x)
+ * - spin_unlock(x)
+ * - spin_unlock_bh(x)
+ * - spin_trylock(x)
+ * - spin_is_locked(x)
+ *
+ * - spin_lock_irq(x)
+ * - spin_lock_irqsave(x, f)
+ * - spin_unlock_irqrestore(x, f)
+ * - read_lock_irqsave(lock, f)
+ * - write_lock_irqsave(lock, f)
+ * - write_unlock_irqrestore(lock, f)
+ */
+
+/*
+ * spinlock "implementation"
+ */
+
+
+
+
+/*
+ * rw_semaphore "implementation" (use Linux kernel's primitives)
+ *
+ * - sema_init(x)
+ * - init_rwsem(x)
+ * - down_read(x)
+ * - up_read(x)
+ * - down_write(x)
+ * - up_write(x)
+ */
+
+
+#define fini_rwsem(s)          do {} while (0)
+
+
+/*
+ * rwlock_t "implementation" (use Linux kernel's primitives)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ * - write_lock_bh(x)
+ * - write_unlock_bh(x)
+ *
+ * - RW_LOCK_UNLOCKED
+ */
+
+
+#ifndef DEFINE_RWLOCK
+#define DEFINE_RWLOCK(lock)    rwlock_t lock = __RW_LOCK_UNLOCKED(lock)
+#endif
+
+/*
+ * completion "implementation" (use Linux kernel's primitives)
+ *
+ * - DECLARE_COMPLETION(work)
+ * - INIT_COMPLETION(c)
+ * - COMPLETION_INITIALIZER(work)
+ * - init_completion(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ * - wait_for_completion_interruptible(c)
+ * - fini_completion(c)
+ */
+#define fini_completion(c) do { } while (0)
+
+/*
+ * semaphore "implementation" (use Linux kernel's primitives)
+ * - DEFINE_SEMAPHORE(name)
+ * - sema_init(sem, val)
+ * - up(sem)
+ * - down(sem)
+ * - down_interruptible(sem)
+ * - down_trylock(sem)
+ */
+
+/*
+ * mutex "implementation" (use Linux kernel's primitives)
+ *
+ * - DEFINE_MUTEX(name)
+ * - mutex_init(x)
+ * - mutex_lock(x)
+ * - mutex_unlock(x)
+ * - mutex_trylock(x)
+ * - mutex_is_locked(x)
+ * - mutex_destroy(x)
+ */
+
+#ifndef lockdep_set_class
+
+/**************************************************************************
+ *
+ * Lockdep "implementation". Also see liblustre.h
+ *
+ **************************************************************************/
+
+struct lock_class_key {
+       ;
+};
+
+#define lockdep_set_class(lock, key) \
+       do { (void)sizeof(lock); (void)sizeof(key); } while (0)
+/* This has to be a macro, so that `subclass' can be undefined in kernels
+ * that do not support lockdep. */
+
+
+static inline void lockdep_off(void)
+{
+}
+
+static inline void lockdep_on(void)
+{
+}
+#else
+
+#endif /* lockdep_set_class */
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef mutex_lock_nested
+#define mutex_lock_nested(mutex, subclass) mutex_lock(mutex)
+#endif
+
+#ifndef spin_lock_nested
+#define spin_lock_nested(lock, subclass) spin_lock(lock)
+#endif
+
+#ifndef down_read_nested
+#define down_read_nested(lock, subclass) down_read(lock)
+#endif
+
+#ifndef down_write_nested
+#define down_write_nested(lock, subclass) down_write(lock)
+#endif
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+
+#endif /* __LIBCFS_LINUX_CFS_LOCK_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
new file mode 100644 (file)
index 0000000..042a2bc
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-mem.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_MEM_H__
+#define __LIBCFS_LINUX_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+
+#define CFS_PAGE_MASK             (~((__u64)PAGE_CACHE_SIZE-1))
+#define page_index(p)       ((p)->index)
+
+#define memory_pressure_get() (current->flags & PF_MEMALLOC)
+#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0)
+#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0)
+
+#if BITS_PER_LONG == 32
+/* limit to lowmem on 32-bit systems: cap at 3/4 of 1 GiB of pages */
+#define NUM_CACHEPAGES \
+       min(num_physpages, (1UL << (30 - PAGE_CACHE_SHIFT)) * 3 / 4)
+#else
+#define NUM_CACHEPAGES num_physpages
+#endif
+
+/*
+ * In Linux there is no way to determine whether current execution context is
+ * blockable.
+ */
+#define ALLOC_ATOMIC_TRY   GFP_ATOMIC
+
+#define DECL_MMSPACE           mm_segment_t __oldfs
+#define MMSPACE_OPEN \
+       do { __oldfs = get_fs(); set_fs(get_ds()); } while (0)
+#define MMSPACE_CLOSE         set_fs(__oldfs)
+
+/*
+ * Shrinker
+ */
+
+# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)  \
+                      struct shrinker *shrinker, \
+                      struct shrink_control *sc
+# define shrink_param(sc, var) ((sc)->var)
+
+typedef int (*shrinker_t)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask));
+
+static inline
+struct shrinker *set_shrinker(int seek, shrinker_t func)
+{
+       struct shrinker *s;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (s == NULL)
+               return (NULL);
+
+       s->shrink = func;
+       s->seeks = seek;
+
+       register_shrinker(s);
+
+       return s;
+}
+
+static inline
+void remove_shrinker(struct shrinker *shrinker)
+{
+       if (shrinker == NULL)
+               return;
+
+       unregister_shrinker(shrinker);
+       kfree(shrinker);
+}
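+
+/*
+ * Usage sketch (my_shrink is a hypothetical callback with the
+ * SHRINKER_ARGS signature above; DEFAULT_SEEKS comes from the kernel
+ * shrinker API):
+ *
+ *     struct shrinker *s = set_shrinker(DEFAULT_SEEKS, my_shrink);
+ *
+ *     if (s != NULL) {
+ *             ...
+ *             remove_shrinker(s);
+ *     }
+ *
+ * set_shrinker() returns NULL on allocation failure.
+ */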
+
+#endif /* __LIBCFS_LINUX_CFS_MEM_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h
new file mode 100644 (file)
index 0000000..a4963a8
--- /dev/null
@@ -0,0 +1,241 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-prim.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_PRIM_H__
+#define __LIBCFS_LINUX_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+
+#include <linux/miscdevice.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/linux-time.h>
+
+
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+/*
+ * cache
+ */
+
+/*
+ * IRQs
+ */
+
+
+/*
+ * Pseudo device register
+ */
+typedef struct miscdevice              psdev_t;
+
+/*
+ * Sysctl register
+ */
+typedef struct ctl_table               ctl_table_t;
+typedef struct ctl_table_header                ctl_table_header_t;
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#define DECLARE_PROC_HANDLER(name)                   \
+static int                                           \
+LL_PROC_PROTO(name)                                 \
+{                                                     \
+       DECLARE_LL_PROC_PPOS_DECL;                    \
+                                                       \
+       return proc_call_handler(table->data, write,    \
+                                ppos, buffer, lenp,    \
+                                __##name);          \
+}
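+
+/*
+ * Sketch (hypothetical names): given a worker
+ *
+ *     static int __my_param(void *data, int write, loff_t pos,
+ *                           void *buffer, int len);
+ *
+ * DECLARE_PROC_HANDLER(my_param) emits the sysctl entry point
+ * my_param(), which forwards to proc_call_handler() as declared in
+ * portals_compat25.h.
+ */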
+
+/*
+ * Symbol register
+ */
+#define cfs_symbol_register(s, p)       do {} while (0)
+#define cfs_symbol_unregister(s)       do {} while (0)
+#define cfs_symbol_get(s)             symbol_get(s)
+#define cfs_symbol_put(s)             symbol_put(s)
+
+typedef struct module module_t;
+
+/*
+ * Proc file system APIs
+ */
+typedef struct proc_dir_entry     proc_dir_entry_t;
+
+/*
+ * Wait Queue
+ */
+
+
+typedef long                       cfs_task_state_t;
+
+#define CFS_DECL_WAITQ(wq)             DECLARE_WAIT_QUEUE_HEAD(wq)
+
+/*
+ * Task struct
+ */
+typedef struct task_struct           task_t;
+#define DECL_JOURNAL_DATA         void *journal_info
+#define PUSH_JOURNAL           do {    \
+       journal_info = current->journal_info;   \
+       current->journal_info = NULL;      \
+       } while (0)
+#define POP_JOURNAL             do {    \
+       current->journal_info = journal_info;   \
+       } while (0)
+
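+/*
+ * The journal macros expect DECL_JOURNAL_DATA in the enclosing scope,
+ * e.g. (sketch):
+ *
+ *     DECL_JOURNAL_DATA;
+ *
+ *     PUSH_JOURNAL;
+ *     ... code that must not run under the caller's journal handle ...
+ *     POP_JOURNAL;
+ */
+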
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+       module_init(init);                  \
+       module_exit(fini)
+
+/*
+ * Signal
+ */
+
+/*
+ * Timer
+ */
+typedef struct timer_list timer_list_t;
+
+
+#ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */
+#define __wait_event_timeout(wq, condition, timeout, ret)      \
+do {                                                        \
+       int __ret = 0;                                     \
+       if (!(condition)) {                                   \
+               wait_queue_t __wait;                         \
+               unsigned long expire;                       \
+                                                                \
+               init_waitqueue_entry(&__wait, current);   \
+               expire = timeout + jiffies;                   \
+               add_wait_queue(&wq, &__wait);               \
+               for (;;) {                                     \
+                       set_current_state(TASK_UNINTERRUPTIBLE); \
+                       if (condition)                     \
+                               break;                     \
+                       if (jiffies > expire) {           \
+                               ret = jiffies - expire;   \
+                               break;                     \
+                       }                                       \
+                       schedule_timeout(timeout);             \
+               }                                               \
+               current->state = TASK_RUNNING;             \
+               remove_wait_queue(&wq, &__wait);                 \
+       }                                                       \
+} while (0)
+/*
+   retval == 0; condition met; we're good.
+   retval > 0; timed out.
+*/
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+do {                                                            \
+       ret = 0;                                                     \
+       if (!(condition))                                           \
+               __wait_event_timeout(wq, condition, timeout, ret);   \
+} while (0)
+#else
+#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret)    \
+       ret = wait_event_timeout(wq, condition, timeout)
+#endif
+
+#define cfs_waitq_wait_event_interruptible_timeout(wq, c, timeout, ret) \
+       ret = wait_event_interruptible_timeout(wq, c, timeout)
+
+/*
+ * atomic
+ */
+
+
+#define cfs_atomic_add_unless(atom, a, u)    atomic_add_unless(atom, a, u)
+#define cfs_atomic_cmpxchg(atom, old, nv)    atomic_cmpxchg(atom, old, nv)
+
+/*
+ * membar
+ */
+
+
+/*
+ * interrupt
+ */
+
+
+/*
+ * might_sleep
+ */
+
+/*
+ * group_info
+ */
+typedef struct group_info group_info_t;
+
+
+/*
+ * Random bytes
+ */
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h
new file mode 100644 (file)
index 0000000..687f33f
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-tcpip.h
+ *
+ * Basic library routines.
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+#include <net/sock.h>
+
+#ifndef HIPQUAD
+/* XXX Should just kill all users */
+#if defined(__LITTLE_ENDIAN)
+#define HIPQUAD(addr) \
+       ((unsigned char *)&addr)[3], \
+       ((unsigned char *)&addr)[2], \
+       ((unsigned char *)&addr)[1], \
+       ((unsigned char *)&addr)[0]
+#elif defined(__BIG_ENDIAN)
+#define HIPQUAD NIPQUAD
+#else
+#error "Please fix asm/byteorder.h"
+#endif /* __LITTLE_ENDIAN */
+#endif
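+
+/*
+ * HIPQUAD expands to the four bytes of a host-order IPv4 address,
+ * most significant first, for printing, e.g. (sketch):
+ *
+ *     CDEBUG(D_NET, "peer %u.%u.%u.%u\n", HIPQUAD(ip));
+ */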
+
+typedef struct socket   socket_t;
+
+#define SOCK_SNDBUF(so)         ((so)->sk->sk_sndbuf)
+#define SOCK_TEST_NOSPACE(so)   test_bit(SOCK_NOSPACE, &(so)->flags)
+
+static inline int
+cfs_sock_error(struct socket *sock)
+{
+       return sock->sk->sk_err;
+}
+
+static inline int
+cfs_sock_wmem_queued(struct socket *sock)
+{
+       return sock->sk->sk_wmem_queued;
+}
+
+#define cfs_sk_sleep(sk)       sk_sleep(sk)
+
+#define DEFAULT_NET    (&init_net)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h
new file mode 100644 (file)
index 0000000..4a48b91
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-time.h
+ *
+ * Implementation of portable time API for Linux (kernel and user-level).
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef __LIBCFS_LINUX_LINUX_TIME_H__
+#define __LIBCFS_LINUX_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <linux/libcfs/libcfs.h> instead
+#endif
+
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t     represents a point in time. This is internal kernel
+ *                 time rather than "wall clock". This time bears no
+ *                 relation to gettimeofday().
+ *
+ *  cfs_duration_t represents a time interval with the resolution of
+ *                 the internal platform clock
+ *
+ *  cfs_fs_time_t  represents an instant in world-visible time. This
+ *                 is used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_before (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_beforeq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t      cfs_duration_sec (cfs_duration_t);
+ *  void          cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void          cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void          cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t      cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void          cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void          cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  CFS_TIME_T
+ *  CFS_DURATION_T
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION 1000000
+
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/time.h>
+#include <asm/div64.h>
+
+#include <linux/libcfs/linux/portals_compat25.h>
+
+/*
+ * post 2.5 kernels.
+ */
+
+#include <linux/jiffies.h>
+
+typedef struct timespec cfs_fs_time_t;
+
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+       v->tv_sec  = t->tv_sec;
+       v->tv_usec = t->tv_nsec / 1000;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+       *s = *t;
+}
+
+/*
+ * internal helper function used by cfs_fs_time_before*()
+ */
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+       return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec;
+}
+
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef unsigned long cfs_time_t;      /* jiffies */
+typedef long cfs_duration_t;
+typedef cycles_t cfs_cycles_t;
+
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+       return time_before(t1, t2);
+}
+
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+       return time_before_eq(t1, t2);
+}
+
+static inline cfs_time_t cfs_time_current(void)
+{
+       return jiffies;
+}
+
+static inline time_t cfs_time_current_sec(void)
+{
+       return get_seconds();
+}
+
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+       *t = CURRENT_TIME;
+}
+
+static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+       return t->tv_sec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+       return __cfs_fs_time_flat(t1) <  __cfs_fs_time_flat(t2);
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+       return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2);
+}
+
+#if 0
+static inline cfs_duration_t cfs_duration_build(int64_t nano)
+{
+#if (BITS_PER_LONG == 32)
+       /* We cannot use do_div(t, ONE_BILLION), do_div can only process
+        * 64 bits n and 32 bits base */
+       int64_t  t = nano * HZ;
+       do_div(t, 1000);
+       do_div(t, 1000000);
+       return (cfs_duration_t)t;
+#else
+       return (nano * HZ / ONE_BILLION);
+#endif
+}
+#endif
+
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+       return ((cfs_duration_t)seconds) * HZ;
+}
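+
+/*
+ * Example (sketch): build a 5-second deadline in jiffies and test it:
+ *
+ *     cfs_time_t deadline = cfs_time_current() + cfs_time_seconds(5);
+ *
+ *     if (cfs_time_before(cfs_time_current(), deadline))
+ *             ... not expired yet ...
+ */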
+
+static inline time_t cfs_duration_sec(cfs_duration_t d)
+{
+       return d / HZ;
+}
+
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+#if (BITS_PER_LONG == 32) && (HZ > 4096)
+       __u64 t;
+
+       s->tv_sec = d / HZ;
+       t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION;
+       do_div(t, HZ);
+       s->tv_usec = t;
+#else
+       s->tv_sec = d / HZ;
+       s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * \
+               ONE_MILLION) / HZ;
+#endif
+}
+
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+#if (BITS_PER_LONG == 32)
+       __u64 t;
+
+       s->tv_sec = d / HZ;
+       t = (d - s->tv_sec * HZ) * ONE_BILLION;
+       do_div(t, HZ);
+       s->tv_nsec = t;
+#else
+       s->tv_sec = d / HZ;
+       s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ;
+#endif
+}
+
+#define cfs_time_current_64 get_jiffies_64
+
+static inline __u64 cfs_time_add_64(__u64 t, __u64 d)
+{
+       return t + d;
+}
+
+static inline __u64 cfs_time_shift_64(int seconds)
+{
+       return cfs_time_add_64(cfs_time_current_64(),
+                              cfs_time_seconds(seconds));
+}
+
+static inline int cfs_time_before_64(__u64 t1, __u64 t2)
+{
+       return (__s64)t2 - (__s64)t1 > 0;
+}
+
+static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2)
+{
+       return (__s64)t2 - (__s64)t1 >= 0;
+}
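+
+/*
+ * Note: the signed subtraction in the two helpers above mirrors the
+ * kernel's time_before() idiom, so the comparison stays correct across
+ * counter wraparound as long as the values are less than 2^63 apart.
+ */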
+
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK               (1)
+
+#define CFS_TIME_T           "%lu"
+#define CFS_DURATION_T   "%ld"
+
+
+#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h
new file mode 100644 (file)
index 0000000..1423949
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/include/libcfs/linux/linux-types.h
+ */
+#include <linux/types.h>
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h
new file mode 100644 (file)
index 0000000..132a4be
--- /dev/null
@@ -0,0 +1,114 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_PORTALS_COMPAT_H__
+#define __LIBCFS_LINUX_PORTALS_COMPAT_H__
+
+/* XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved */
+#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG
+#  define SIGNAL_MASK_ASSERT() \
+   LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC)
+#else
+# define SIGNAL_MASK_ASSERT()
+#endif
+/* XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved */
+
+#define SIGNAL_MASK_LOCK(task, flags)                            \
+       spin_lock_irqsave(&task->sighand->siglock, flags)
+#define SIGNAL_MASK_UNLOCK(task, flags)                                \
+       spin_unlock_irqrestore(&task->sighand->siglock, flags)
+#define USERMODEHELPER(path, argv, envp)                              \
+       call_usermodehelper(path, argv, envp, 1)
+#define clear_tsk_thread_flag(current, TIF_SIGPENDING)   clear_tsk_thread_flag(current,       \
+                                                       TIF_SIGPENDING)
+# define smp_num_cpus        num_online_cpus()
+
+#define cfs_wait_event_interruptible(wq, condition, ret)              \
+       ret = wait_event_interruptible(wq, condition)
+#define cfs_wait_event_interruptible_exclusive(wq, condition, ret)     \
+       ret = wait_event_interruptible_exclusive(wq, condition)
+
+#define THREAD_NAME(comm, len, fmt, a...)                            \
+       snprintf(comm, len, fmt, ## a)
+
+/* 2.6 alloc_page users can use page->lru */
+#define PAGE_LIST_ENTRY lru
+#define PAGE_LIST(page) ((page)->lru)
+
+#ifndef __user
+#define __user
+#endif
+
+#ifndef __fls
+#define __cfs_fls fls
+#else
+#define __cfs_fls __fls
+#endif
+
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)       \
+       proc_dointvec(table, write, buffer, lenp, ppos);
+
+#define ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos)      \
+       proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)       \
+       proc_dostring(table, write, buffer, lenp, ppos);
+#define LL_PROC_PROTO(name)                                         \
+       name(ctl_table_t *table, int write,                   \
+            void __user *buffer, size_t *lenp, loff_t *ppos)
+#define DECLARE_LL_PROC_PPOS_DECL
+
+/* helper for sysctl handlers */
+int proc_call_handler(void *data, int write,
+                     loff_t *ppos, void *buffer, size_t *lenp,
+                     int (*handler)(void *data, int write,
+                                    loff_t pos, void *buffer, int len));
+/*
+ * CPU
+ */
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#ifdef NR_CPUS
+#else
+#define NR_CPUS     1
+#endif
+
+#define cfs_register_sysctl_table(t, a) register_sysctl_table(t)
+
+#endif /* __LIBCFS_LINUX_PORTALS_COMPAT_H__ */
diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h
new file mode 100644 (file)
index 0000000..7ae36fc
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUCACHE_H
+#define _LUCACHE_H
+
+#include <linux/libcfs/libcfs.h>
+
+/** \defgroup ucache ucache
+ *
+ * @{
+ */
+
+#define UC_CACHE_NEW       0x01
+#define UC_CACHE_ACQUIRING      0x02
+#define UC_CACHE_INVALID       0x04
+#define UC_CACHE_EXPIRED       0x08
+
+#define UC_CACHE_IS_NEW(i)       ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i)      ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i)    ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i)      ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i)   ((i)->ue_flags == 0)
+
+#define UC_CACHE_SET_NEW(i)     (i)->ue_flags |= UC_CACHE_NEW
+#define UC_CACHE_SET_INVALID(i)     (i)->ue_flags |= UC_CACHE_INVALID
+#define UC_CACHE_SET_ACQUIRING(i)   (i)->ue_flags |= UC_CACHE_ACQUIRING
+#define UC_CACHE_SET_EXPIRED(i)     (i)->ue_flags |= UC_CACHE_EXPIRED
+#define UC_CACHE_SET_VALID(i)       (i)->ue_flags = 0
+
+#define UC_CACHE_CLEAR_NEW(i)       (i)->ue_flags &= ~UC_CACHE_NEW
+#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING
+#define UC_CACHE_CLEAR_INVALID(i)   (i)->ue_flags &= ~UC_CACHE_INVALID
+#define UC_CACHE_CLEAR_EXPIRED(i)   (i)->ue_flags &= ~UC_CACHE_EXPIRED
+
+struct upcall_cache_entry;
+
+struct md_perm {
+       lnet_nid_t      mp_nid;
+       __u32      mp_perm;
+};
+
+struct md_identity {
+       struct upcall_cache_entry *mi_uc_entry;
+       uid_t                 mi_uid;
+       gid_t                 mi_gid;
+       group_info_t      *mi_ginfo;
+       int                     mi_nperms;
+       struct md_perm      *mi_perms;
+};
+
+struct upcall_cache_entry {
+       struct list_head              ue_hash;
+       __u64              ue_key;
+       atomic_t            ue_refcount;
+       int                  ue_flags;
+       wait_queue_head_t            ue_waitq;
+       cfs_time_t            ue_acquire_expire;
+       cfs_time_t            ue_expire;
+       union {
+               struct md_identity     identity;
+       } u;
+};
+
+#define UC_CACHE_HASH_SIZE     (128)
+#define UC_CACHE_HASH_INDEX(id)   ((id) & (UC_CACHE_HASH_SIZE - 1))
+#define UC_CACHE_UPCALL_MAXPATH   (1024UL)
+
+struct upcall_cache;
+
+struct upcall_cache_ops {
+       void        (*init_entry)(struct upcall_cache_entry *, void *args);
+       void        (*free_entry)(struct upcall_cache *,
+                                     struct upcall_cache_entry *);
+       int          (*upcall_compare)(struct upcall_cache *,
+                                         struct upcall_cache_entry *,
+                                         __u64 key, void *args);
+       int          (*downcall_compare)(struct upcall_cache *,
+                                           struct upcall_cache_entry *,
+                                           __u64 key, void *args);
+       int          (*do_upcall)(struct upcall_cache *,
+                                    struct upcall_cache_entry *);
+       int          (*parse_downcall)(struct upcall_cache *,
+                                         struct upcall_cache_entry *, void *);
+};
+
+struct upcall_cache {
+       struct list_head                uc_hashtable[UC_CACHE_HASH_SIZE];
+       spinlock_t              uc_lock;
+       rwlock_t                uc_upcall_rwlock;
+
+       char                    uc_name[40];            /* for upcall */
+       char                    uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+       int                     uc_acquire_expire;      /* seconds */
+       int                     uc_entry_expire;        /* seconds */
+       struct upcall_cache_ops *uc_ops;
+};
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+                                                 __u64 key, void *args);
+void upcall_cache_put_entry(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+                         void *args);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args);
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+                                      struct upcall_cache_ops *ops);
+void upcall_cache_cleanup(struct upcall_cache *cache);
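+
+/*
+ * Typical life cycle (a sketch; the upcall path and ops are
+ * caller-supplied, "identity" is illustrative):
+ *
+ *     cache = upcall_cache_init("identity", upcall_path, &my_ops);
+ *     entry = upcall_cache_get_entry(cache, key, args);
+ *     ...
+ *     upcall_cache_put_entry(cache, entry);
+ *     upcall_cache_cleanup(cache);
+ */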
+
+#if 0
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash,
+                                                 __u64 key, __u32 primary,
+                                                 __u32 ngroups, __u32 *groups);
+void upcall_cache_put_entry(struct upcall_cache *hash,
+                           struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key,
+                         __u32 primary, __u32 ngroups, __u32 *groups);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+struct upcall_cache *upcall_cache_init(const char *name);
+void upcall_cache_cleanup(struct upcall_cache *hash);
+
+#endif
+
+/** @} ucache */
+
+#endif /* _LUCACHE_H */
diff --git a/drivers/staging/lustre/include/linux/libcfs/params_tree.h b/drivers/staging/lustre/include/linux/libcfs/params_tree.h
new file mode 100644 (file)
index 0000000..3f18a44
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * API and structure definitions for params_tree.
+ *
+ * Author: LiuYing <emoly.liu@oracle.com>
+ */
+#ifndef __PARAMS_TREE_H__
+#define __PARAMS_TREE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#undef LPROCFS
+#if  defined(CONFIG_PROC_FS)
+# define LPROCFS
+#endif
+
+#ifdef LPROCFS
+typedef struct file                         cfs_param_file_t;
+typedef struct inode                       cfs_inode_t;
+typedef struct proc_inode                     cfs_proc_inode_t;
+typedef struct seq_file                         cfs_seq_file_t;
+typedef struct seq_operations             cfs_seq_ops_t;
+typedef struct file_operations           cfs_param_file_ops_t;
+typedef module_t                          *cfs_param_module_t;
+typedef struct proc_dir_entry             cfs_param_dentry_t;
+typedef struct poll_table_struct               cfs_poll_table_t;
+#define CFS_PARAM_MODULE                       THIS_MODULE
+#define cfs_file_private(file)           (file->private_data)
+#define cfs_dentry_data(dentry)                 (dentry->data)
+#define cfs_proc_inode_pde(proc_inode)   (proc_inode->pde)
+#define cfs_proc_inode(proc_inode)           (proc_inode->vfs_inode)
+#define cfs_seq_read_common                 seq_read
+#define cfs_seq_lseek_common               seq_lseek
+#define cfs_seq_private(seq)               (seq->private)
+#define cfs_seq_printf(seq, format, ...)       seq_printf(seq, format,  \
+                                                          ## __VA_ARGS__)
+#define cfs_seq_release(inode, file)       seq_release(inode, file)
+#define cfs_seq_puts(seq, s)               seq_puts(seq, s)
+#define cfs_seq_putc(seq, s)               seq_putc(seq, s)
+#define cfs_seq_read(file, buf, count, ppos, rc) (rc = seq_read(file, buf, \
+                                                           count, ppos))
+#define cfs_seq_open(file, ops, rc)         (rc = seq_open(file, ops))
+
+#else /* !LPROCFS */
+
+typedef struct cfs_params_file {
+       void       *param_private;
+       loff_t    param_pos;
+       unsigned int    param_flags;
+} cfs_param_file_t;
+
+typedef struct cfs_param_inode {
+       void    *param_private;
+} cfs_inode_t;
+
+typedef struct cfs_param_dentry {
+       void *param_data;
+} cfs_param_dentry_t;
+
+typedef struct cfs_proc_inode {
+       cfs_param_dentry_t *param_pde;
+       cfs_inode_t      param_inode;
+} cfs_proc_inode_t;
+
+struct cfs_seq_operations;
+typedef struct cfs_seq_file {
+       char                  *buf;
+       size_t               size;
+       size_t               from;
+       size_t               count;
+       loff_t               index;
+       loff_t               version;
+       struct mutex                    lock;
+       struct cfs_seq_operations *op;
+       void                  *private;
+} cfs_seq_file_t;
+
+typedef struct cfs_seq_operations {
+       void *(*start) (cfs_seq_file_t *m, loff_t *pos);
+       void  (*stop) (cfs_seq_file_t *m, void *v);
+       void *(*next) (cfs_seq_file_t *m, void *v, loff_t *pos);
+       int   (*show) (cfs_seq_file_t *m, void *v);
+} cfs_seq_ops_t;
+
+typedef void *cfs_param_module_t;
+typedef void *cfs_poll_table_t;
+
+typedef struct cfs_param_file_ops {
+       cfs_param_module_t owner;
+       int (*open) (cfs_inode_t *, struct file *);
+       loff_t (*llseek)(struct file *, loff_t, int);
+       int (*release) (cfs_inode_t *, cfs_param_file_t *);
+       unsigned int (*poll) (struct file *, cfs_poll_table_t *);
+       ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+       ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+} cfs_param_file_ops_t;
+typedef cfs_param_file_ops_t *cfs_lproc_filep_t;
+
+static inline cfs_proc_inode_t *FAKE_PROC_I(const cfs_inode_t *inode)
+{
+       return container_of(inode, cfs_proc_inode_t, param_inode);
+}
+
+#define CFS_PARAM_MODULE                NULL
+#define cfs_file_private(file)          (file->param_private)
+#define cfs_dentry_data(dentry)         (dentry->param_data)
+#define cfs_proc_inode(proc_inode)      (proc_inode->param_inode)
+#define cfs_proc_inode_pde(proc_inode)  (proc_inode->param_pde)
+#define cfs_seq_read_common             NULL
+#define cfs_seq_lseek_common            NULL
+#define cfs_seq_private(seq)            (seq->private)
+#define cfs_seq_read(file, buf, count, ppos, rc) do {} while (0)
+#define cfs_seq_open(file, ops, rc)                    \
+do {                                                   \
+       cfs_seq_file_t *p = cfs_file_private(file);     \
+       if (!p) {                                       \
+               LIBCFS_ALLOC(p, sizeof(*p));            \
+               if (!p) {                               \
+                       rc = -ENOMEM;                   \
+                       break;                          \
+               }                                       \
+               cfs_file_private(file) = p;             \
+       }                                               \
+       memset(p, 0, sizeof(*p));                       \
+       p->op = ops;                                    \
+       rc = 0;                                         \
+} while (0)
+
+#endif /* LPROCFS */
+
+/* XXX: params_tree APIs */
+
+#endif  /* __PARAMS_TREE_H__ */
diff --git a/drivers/staging/lustre/include/linux/lnet/api-support.h b/drivers/staging/lustre/include/linux/lnet/api-support.h
new file mode 100644 (file)
index 0000000..a8d91db
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_SUPPORT_H__
+#define __LNET_API_SUPPORT_H__
+
+#include <linux/lnet/linux/api-support.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h
new file mode 100644 (file)
index 0000000..e8642e3
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_API_H__
+#define __LNET_API_H__
+
+/** \defgroup lnet LNet
+ *
+ * The Lustre Networking subsystem.
+ *
+ * LNet is an asynchronous message-passing API that provides an unreliable,
+ * connectionless service with no ordering guarantees. It supports OFA IB,
+ * TCP/IP, and Cray Portals, and routes between heterogeneous networks.
+ *
+ * LNet can run both in OS kernel space and in userspace as a library.
+ * @{
+ */
+
+#include <linux/lnet/types.h>
+
+/** \defgroup lnet_init_fini Initialization and cleanup
+ * LNet must be properly initialized before any LNet calls can be made.
+ * @{ */
+int LNetInit(void);
+void LNetFini(void);
+
+int LNetNIInit(lnet_pid_t requested_pid);
+int LNetNIFini(void);
+/** @} lnet_init_fini */
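+
+/*
+ * Illustrative bring-up/tear-down sequence, not part of the API above;
+ * requested_pid is a caller-chosen value and error handling is elided:
+ *
+ *     rc = LNetInit();
+ *     if (rc == 0) {
+ *             rc = LNetNIInit(requested_pid);
+ *             ...
+ *             LNetNIFini();
+ *             LNetFini();
+ *     }
+ */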
+
+/** \defgroup lnet_addr LNet addressing and basic types
+ *
+ * Addressing scheme and basic data types of LNet.
+ *
+ * The LNet API is memory-oriented, so LNet must be able to address not only
+ * end-points but also memory regions within a process address space.
+ * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process
+ * in a node. A portal represents an opening in the address space of a
+ * process. Match bits are the criteria used to identify a region of memory
+ * inside a portal, and the offset specifies a location within that region.
+ *
+ * LNet creates a table of portals for each process during initialization.
+ * This table has MAX_PORTALS entries and its size can't be dynamically
+ * changed. A portal stays empty until the owning process starts to add
+ * memory regions to it. A portal is sometimes called an index because
+ * it's an entry in the portals table of a process.
+ *
+ * \see LNetMEAttach
+ * @{ */
+int LNetGetId(unsigned int index, lnet_process_id_t *id);
+int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
+void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle);
+
+/** @} lnet_addr */
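+
+/*
+ * Illustrative sketch only; assumes interface index 0 exists and that
+ * CDEBUG()/libcfs_id2str() are available from libcfs:
+ *
+ *     lnet_process_id_t id;
+ *
+ *     if (LNetGetId(0, &id) == 0)
+ *             CDEBUG(D_NET, "local id %s\n", libcfs_id2str(id));
+ */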
+
+
+/** \defgroup lnet_me Match entries
+ *
+ * A match entry (abbreviated as ME) describes a set of criteria to accept
+ * incoming requests.
+ *
+ * A portal is essentially a match list plus a set of attributes. A match
+ * list is a chain of MEs. Each ME includes a pointer to a memory descriptor
+ * and a set of match criteria. The match criteria can be used to reject
+ * incoming requests based on process ID or the match bits provided in the
+ * request. MEs can be dynamically inserted into a match list by LNetMEAttach()
+ * and LNetMEInsert(), and removed from the list by LNetMEUnlink().
+ * @{ */
+int LNetMEAttach(unsigned int      portal,
+                lnet_process_id_t match_id_in,
+                __u64       match_bits_in,
+                __u64       ignore_bits_in,
+                lnet_unlink_t     unlink_in,
+                lnet_ins_pos_t    pos_in,
+                lnet_handle_me_t *handle_out);
+
+int LNetMEInsert(lnet_handle_me_t  current_in,
+                lnet_process_id_t match_id_in,
+                __u64       match_bits_in,
+                __u64       ignore_bits_in,
+                lnet_unlink_t     unlink_in,
+                lnet_ins_pos_t    position_in,
+                lnet_handle_me_t *handle_out);
+
+int LNetMEUnlink(lnet_handle_me_t current_in);
+/** @} lnet_me */
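+
+/*
+ * Illustrative sketch only; portal, match_id and match_bits are
+ * caller-supplied values. Attach an ME that matches match_bits exactly
+ * (ignore_bits == 0) at the tail of the portal's match list:
+ *
+ *     lnet_handle_me_t meh;
+ *
+ *     rc = LNetMEAttach(portal, match_id, match_bits, 0,
+ *                       LNET_UNLINK, LNET_INS_AFTER, &meh);
+ */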
+
+/** \defgroup lnet_md Memory descriptors
+ *
+ * A memory descriptor contains information about a region of a user's
+ * memory (either in kernel or user space) and optionally points to an
+ * event queue where information about the operations performed on the
+ * memory descriptor is recorded. A memory descriptor is abbreviated as
+ * MD, and the term is used interchangeably with the memory region it
+ * describes.
+ *
+ * The LNet API provides two operations to create MDs: LNetMDAttach()
+ * and LNetMDBind(), and one operation to unlink and release the resources
+ * associated with an MD: LNetMDUnlink().
+ * @{ */
+int LNetMDAttach(lnet_handle_me_t  current_in,
+                lnet_md_t       md_in,
+                lnet_unlink_t     unlink_in,
+                lnet_handle_md_t *handle_out);
+
+int LNetMDBind(lnet_md_t        md_in,
+              lnet_unlink_t     unlink_in,
+              lnet_handle_md_t *handle_out);
+
+int LNetMDUnlink(lnet_handle_md_t md_in);
+/** @} lnet_md */
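+
+/*
+ * Illustrative sketch only; buffer, buflen, eqh and meh are caller-supplied
+ * and the lnet_md_t field names are as declared in <linux/lnet/types.h>.
+ * A threshold of 1 retires the MD after a single operation and
+ * LNET_MD_OP_PUT accepts incoming PUTs. Describe a receive buffer and
+ * attach it to an existing ME:
+ *
+ *     lnet_md_t md = { 0 };
+ *     lnet_handle_md_t mdh;
+ *
+ *     md.start     = buffer;
+ *     md.length    = buflen;
+ *     md.threshold = 1;
+ *     md.options   = LNET_MD_OP_PUT;
+ *     md.eq_handle = eqh;
+ *
+ *     rc = LNetMDAttach(meh, md, LNET_UNLINK, &mdh);
+ */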
+
+/** \defgroup lnet_eq Events and event queues
+ *
+ * Event queues (abbreviated as EQ) are used to log operations performed on
+ * local MDs. In particular, they signal the completion of a data transmission
+ * into or out of an MD. They can also be used to hold acknowledgments for
+ * completed PUT operations and indicate when an MD has been unlinked. Multiple
+ * MDs can share a single EQ. An EQ may have an optional event handler
+ * associated with it. If an event handler exists, it will be run for each
+ * event that is deposited into the EQ.
+ *
+ * In addition to the lnet_handle_eq_t, the LNet API defines two types
+ * associated with events: The ::lnet_event_kind_t defines the kinds of events
+ * that can be stored in an EQ. The lnet_event_t defines a structure that
+ * holds the information about an event.
+ *
+ * There are five functions for dealing with EQs: LNetEQAlloc() is used to
+ * create an EQ and allocate the resources needed, while LNetEQFree()
+ * releases these resources and frees the EQ. LNetEQGet() retrieves the next
+ * event from an EQ, and LNetEQWait() can be used to block a process until
+ * an EQ has at least one event. LNetEQPoll() can be used to test or wait
+ * on multiple EQs.
+ * @{ */
+int LNetEQAlloc(unsigned int       count_in,
+               lnet_eq_handler_t  handler,
+               lnet_handle_eq_t  *handle_out);
+
+int LNetEQFree(lnet_handle_eq_t eventq_in);
+
+int LNetEQGet(lnet_handle_eq_t  eventq_in,
+             lnet_event_t     *event_out);
+
+
+int LNetEQWait(lnet_handle_eq_t  eventq_in,
+              lnet_event_t     *event_out);
+
+int LNetEQPoll(lnet_handle_eq_t *eventqs_in,
+              int             neq_in,
+              int             timeout_ms,
+              lnet_event_t     *event_out,
+              int            *which_eq_out);
+/** @} lnet_eq */
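+
+/*
+ * Illustrative sketch only; timeout_ms is caller-chosen and
+ * LNET_EQ_HANDLER_NONE requests an EQ without a handler. Create a
+ * 64-slot EQ and poll it for a single event:
+ *
+ *     lnet_handle_eq_t eqh;
+ *     lnet_event_t ev;
+ *     int which;
+ *
+ *     rc = LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh);
+ *     ...
+ *     rc = LNetEQPoll(&eqh, 1, timeout_ms, &ev, &which);
+ *     ...
+ *     rc = LNetEQFree(eqh);
+ */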
+
+/** \defgroup lnet_data Data movement operations
+ *
+ * The LNet API provides two data movement operations: LNetPut()
+ * and LNetGet().
+ * @{ */
+int LNetPut(lnet_nid_t self,
+           lnet_handle_md_t  md_in,
+           lnet_ack_req_t    ack_req_in,
+           lnet_process_id_t target_in,
+           unsigned int      portal_in,
+           __u64            match_bits_in,
+           unsigned int      offset_in,
+           __u64            hdr_data_in);
+
+int LNetGet(lnet_nid_t self,
+           lnet_handle_md_t  md_in,
+           lnet_process_id_t target_in,
+           unsigned int      portal_in,
+           __u64            match_bits_in,
+           unsigned int      offset_in);
+/** @} lnet_data */
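+
+/*
+ * Illustrative sketch only; mdh, target, portal and match_bits are
+ * caller-supplied, and passing LNET_NID_ANY as 'self' is assumed to let
+ * LNet pick the source NI. Send the MD's contents and request an ACK:
+ *
+ *     rc = LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ,
+ *                  target, portal, match_bits, 0, 0);
+ */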
+
+
+/** \defgroup lnet_misc Miscellaneous operations.
+ * Miscellaneous operations.
+ * @{ */
+
+int LNetSetLazyPortal(int portal);
+int LNetClearLazyPortal(int portal);
+int LNetCtl(unsigned int cmd, void *arg);
+int LNetSetAsync(lnet_process_id_t id, int nasync);
+
+/** @} lnet_misc */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
new file mode 100644 (file)
index 0000000..59bff0b
--- /dev/null
@@ -0,0 +1,874 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-lnet.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef __LNET_LIB_LNET_H__
+#define __LNET_LIB_LNET_H__
+
+#include <linux/lnet/linux/lib-lnet.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+extern lnet_t  the_lnet;                       /* THE network */
+
+#if  defined(LNET_USE_LIB_FREELIST)
+/* 1 CPT, simplify implementation... */
+# define LNET_CPT_MAX_BITS      0
+
+#else /* KERNEL and no freelist */
+
+# if (BITS_PER_LONG == 32)
+/* 2 CPTs; allowing more CPTs might put us under memory pressure */
+#  define LNET_CPT_MAX_BITS     1
+
+# else /* 64-bit system */
+/*
+ * 256 CPTs for thousands of CPUs; allowing more CPTs might put us at
+ * risk of consuming all lh_cookie values.
+ */
+#  define LNET_CPT_MAX_BITS     8
+# endif /* BITS_PER_LONG == 32 */
+#endif
+
+/* max allowed CPT number */
+#define LNET_CPT_MAX       (1 << LNET_CPT_MAX_BITS)
+
+#define LNET_CPT_NUMBER         (the_lnet.ln_cpt_number)
+#define LNET_CPT_BITS     (the_lnet.ln_cpt_bits)
+#define LNET_CPT_MASK     ((1ULL << LNET_CPT_BITS) - 1)
+
+/** exclusive lock */
+#define LNET_LOCK_EX       CFS_PERCPT_LOCK_EX
+
+static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh)
+{
+       return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE &&
+               wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE);
+}
+
+static inline int lnet_md_exhausted (lnet_libmd_t *md)
+{
+       return (md->md_threshold == 0 ||
+               ((md->md_options & LNET_MD_MAX_SIZE) != 0 &&
+                md->md_offset + md->md_max_size > md->md_length));
+}
+
+static inline int lnet_md_unlinkable (lnet_libmd_t *md)
+{
+       /* Should unlink md when its refcount is 0 and either:
+        *  - md has been flagged for deletion (by auto unlink or
+        *    LNetM[DE]Unlink; in the latter case md may not be exhausted).
+        *  - auto unlink is on and md is exhausted.
+        */
+       if (md->md_refcount != 0)
+               return 0;
+
+       if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0)
+               return 1;
+
+       return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 &&
+               lnet_md_exhausted(md));
+}
+
+#define lnet_cpt_table()       (the_lnet.ln_cpt_table)
+#define lnet_cpt_current()     cfs_cpt_current(the_lnet.ln_cpt_table, 1)
+
+static inline int
+lnet_cpt_of_cookie(__u64 cookie)
+{
+       unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK;
+
+       /* LNET_CPT_NUMBER doesn't have to be a power of 2, which means we
+        * can get an illegal cpt from an invalid cookie */
+       return cpt < LNET_CPT_NUMBER ? cpt : cpt % LNET_CPT_NUMBER;
+}
+
+static inline void
+lnet_res_lock(int cpt)
+{
+       cfs_percpt_lock(the_lnet.ln_res_lock, cpt);
+}
+
+static inline void
+lnet_res_unlock(int cpt)
+{
+       cfs_percpt_unlock(the_lnet.ln_res_lock, cpt);
+}
+
+static inline int
+lnet_res_lock_current(void)
+{
+       int cpt = lnet_cpt_current();
+
+       lnet_res_lock(cpt);
+       return cpt;
+}
+
+static inline void
+lnet_net_lock(int cpt)
+{
+       cfs_percpt_lock(the_lnet.ln_net_lock, cpt);
+}
+
+static inline void
+lnet_net_unlock(int cpt)
+{
+       cfs_percpt_unlock(the_lnet.ln_net_lock, cpt);
+}
+
+static inline int
+lnet_net_lock_current(void)
+{
+       int cpt = lnet_cpt_current();
+
+       lnet_net_lock(cpt);
+       return cpt;
+}
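+
+/*
+ * Illustrative locking pattern, not taken from this file: serialize
+ * against the network state of the current CPT:
+ *
+ *     int cpt = lnet_net_lock_current();
+ *     ... operate on per-CPT state of the_lnet ...
+ *     lnet_net_unlock(cpt);
+ */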
+
+#define LNET_LOCK()            lnet_net_lock(LNET_LOCK_EX)
+#define LNET_UNLOCK()          lnet_net_unlock(LNET_LOCK_EX)
+
+
+#define lnet_ptl_lock(ptl)     spin_lock(&(ptl)->ptl_lock)
+#define lnet_ptl_unlock(ptl)   spin_unlock(&(ptl)->ptl_lock)
+#define lnet_eq_wait_lock()    spin_lock(&the_lnet.ln_eq_wait_lock)
+#define lnet_eq_wait_unlock()  spin_unlock(&the_lnet.ln_eq_wait_lock)
+#define lnet_ni_lock(ni)       spin_lock(&(ni)->ni_lock)
+#define lnet_ni_unlock(ni)     spin_unlock(&(ni)->ni_lock)
+#define LNET_MUTEX_LOCK(m)     mutex_lock(m)
+#define LNET_MUTEX_UNLOCK(m)   mutex_unlock(m)
+
+
+#define MAX_PORTALS     64
+
+/* these are only used by code built with LNET_USE_LIB_FREELIST, but we still
+ * export them to !LNET_USE_LIB_FREELIST builds for ease of implementation */
+#define LNET_FL_MAX_MES                2048
+#define LNET_FL_MAX_MDS                2048
+#define LNET_FL_MAX_EQS                512
+#define LNET_FL_MAX_MSGS       2048    /* Outstanding messages */
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int lnet_freelist_init(lnet_freelist_t *fl, int n, int size);
+void lnet_freelist_fini(lnet_freelist_t *fl);
+
+static inline void *
+lnet_freelist_alloc (lnet_freelist_t *fl)
+{
+       /* ALWAYS called with liblock held */
+       lnet_freeobj_t *o;
+
+       if (list_empty (&fl->fl_list))
+               return (NULL);
+
+       o = list_entry (fl->fl_list.next, lnet_freeobj_t, fo_list);
+       list_del (&o->fo_list);
+       return ((void *)&o->fo_contents);
+}
+
+static inline void
+lnet_freelist_free (lnet_freelist_t *fl, void *obj)
+{
+       /* ALWAYS called with liblock held */
+       lnet_freeobj_t *o = list_entry (obj, lnet_freeobj_t, fo_contents);
+
+       list_add (&o->fo_list, &fl->fl_list);
+}
+
+
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = &the_lnet.ln_eq_container;
+       lnet_eq_t                 *eq;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       eq = (lnet_eq_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       return eq;
+}
+
+static inline void
+lnet_eq_free_locked(lnet_eq_t *eq)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = &the_lnet.ln_eq_container;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, eq);
+}
+
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+       lnet_res_lock(0);
+       lnet_eq_free_locked(eq);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_md_containers[0];
+       lnet_libmd_t              *md;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       md = (lnet_libmd_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       if (md != NULL)
+               INIT_LIST_HEAD(&md->md_list);
+
+       return md;
+}
+
+static inline void
+lnet_md_free_locked(lnet_libmd_t *md)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_md_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, md);
+}
+
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+       lnet_res_lock(0);
+       lnet_md_free_locked(md);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_me_t *
+lnet_me_alloc(void)
+{
+       /* NEVER called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_me_containers[0];
+       lnet_me_t                 *me;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_res_lock(0);
+       me = (lnet_me_t *)lnet_freelist_alloc(&rec->rec_freelist);
+       lnet_res_unlock(0);
+
+       return me;
+}
+
+static inline void
+lnet_me_free_locked(lnet_me_t *me)
+{
+       /* ALWAYS called with resource lock held */
+       struct lnet_res_container *rec = the_lnet.ln_me_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       lnet_freelist_free(&rec->rec_freelist, me);
+}
+
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+       lnet_res_lock(0);
+       lnet_me_free_locked(me);
+       lnet_res_unlock(0);
+}
+
+static inline lnet_msg_t *
+lnet_msg_alloc (void)
+{
+       /* NEVER called with network lock held */
+       struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0];
+       lnet_msg_t                *msg;
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_net_lock(0);
+       msg = (lnet_msg_t *)lnet_freelist_alloc(&msc->msc_freelist);
+       lnet_net_unlock(0);
+
+       if (msg != NULL) {
+               /* NULL pointers, clear flags etc */
+               memset(msg, 0, sizeof(*msg));
+       }
+       return msg;
+}
+
+static inline void
+lnet_msg_free_locked(lnet_msg_t *msg)
+{
+       /* ALWAYS called with network lock held */
+       struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0];
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+       LASSERT(!msg->msg_onactivelist);
+       lnet_freelist_free(&msc->msc_freelist, msg);
+}
+
+static inline void
+lnet_msg_free (lnet_msg_t *msg)
+{
+       lnet_net_lock(0);
+       lnet_msg_free_locked(msg);
+       lnet_net_unlock(0);
+}
+
+#else /* !LNET_USE_LIB_FREELIST */
+
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
+{
+       /* NEVER called with liblock held */
+       lnet_eq_t *eq;
+
+       LIBCFS_ALLOC(eq, sizeof(*eq));
+       return (eq);
+}
+
+static inline void
+lnet_eq_free(lnet_eq_t *eq)
+{
+       /* ALWAYS called with resource lock held */
+       LIBCFS_FREE(eq, sizeof(*eq));
+}
+
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
+{
+       /* NEVER called with liblock held */
+       lnet_libmd_t *md;
+       unsigned int  size;
+       unsigned int  niov;
+
+       if ((umd->options & LNET_MD_KIOV) != 0) {
+               niov = umd->length;
+               size = offsetof(lnet_libmd_t, md_iov.kiov[niov]);
+       } else {
+               niov = ((umd->options & LNET_MD_IOVEC) != 0) ?
+                      umd->length : 1;
+               size = offsetof(lnet_libmd_t, md_iov.iov[niov]);
+       }
+
+       LIBCFS_ALLOC(md, size);
+
+       if (md != NULL) {
+               /* Set here in case of early free */
+               md->md_options = umd->options;
+               md->md_niov = niov;
+               INIT_LIST_HEAD(&md->md_list);
+       }
+
+       return (md);
+}
+
+static inline void
+lnet_md_free(lnet_libmd_t *md)
+{
+       /* ALWAYS called with resource lock held */
+       unsigned int  size;
+
+       if ((md->md_options & LNET_MD_KIOV) != 0)
+               size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]);
+       else
+               size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]);
+
+       LIBCFS_FREE(md, size);
+}
+
+static inline lnet_me_t *
+lnet_me_alloc (void)
+{
+       /* NEVER called with liblock held */
+       lnet_me_t *me;
+
+       LIBCFS_ALLOC(me, sizeof(*me));
+       return (me);
+}
+
+static inline void
+lnet_me_free(lnet_me_t *me)
+{
+       /* ALWAYS called with resource lock held */
+       LIBCFS_FREE(me, sizeof(*me));
+}
+
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
+{
+       /* NEVER called with liblock held */
+       lnet_msg_t *msg;
+
+       LIBCFS_ALLOC(msg, sizeof(*msg));
+
+       /* no need to zero, LIBCFS_ALLOC does it for us */
+       return (msg);
+}
+
+static inline void
+lnet_msg_free(lnet_msg_t *msg)
+{
+       /* ALWAYS called with network lock held */
+       LASSERT(!msg->msg_onactivelist);
+       LIBCFS_FREE(msg, sizeof(*msg));
+}
+
+#define lnet_eq_free_locked(eq)                lnet_eq_free(eq)
+#define lnet_md_free_locked(md)                lnet_md_free(md)
+#define lnet_me_free_locked(me)                lnet_me_free(me)
+#define lnet_msg_free_locked(msg)      lnet_msg_free(msg)
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec,
+                                    __u64 cookie);
+void lnet_res_lh_initialize(struct lnet_res_container *rec,
+                           lnet_libhandle_t *lh);
+static inline void
+lnet_res_lh_invalidate(lnet_libhandle_t *lh)
+{
+       /* ALWAYS called with resource lock held */
+       /* NB: cookie is still useful, don't reset it */
+       list_del(&lh->lh_hash_chain);
+}
+
+static inline void
+lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq)
+{
+       if (eq == NULL) {
+               LNetInvalidateHandle(handle);
+               return;
+       }
+
+       handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+static inline lnet_eq_t *
+lnet_handle2eq(lnet_handle_eq_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+
+       lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_eq_t, eq_lh);
+}
+
+static inline void
+lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md)
+{
+       handle->cookie = md->md_lh.lh_cookie;
+}
+
+static inline lnet_libmd_t *
+lnet_handle2md(lnet_handle_md_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       cpt = lnet_cpt_of_cookie(handle->cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+                               handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_libmd_t, md_lh);
+}
+
+static inline lnet_libmd_t *
+lnet_wire_handle2md(lnet_handle_wire_t *wh)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie)
+               return NULL;
+
+       cpt = lnet_cpt_of_cookie(wh->wh_object_cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt],
+                               wh->wh_object_cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_libmd_t, md_lh);
+}
+
+static inline void
+lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me)
+{
+       handle->cookie = me->me_lh.lh_cookie;
+}
+
+static inline lnet_me_t *
+lnet_handle2me(lnet_handle_me_t *handle)
+{
+       /* ALWAYS called with resource lock held */
+       lnet_libhandle_t *lh;
+       int              cpt;
+
+       cpt = lnet_cpt_of_cookie(handle->cookie);
+       lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt],
+                               handle->cookie);
+       if (lh == NULL)
+               return NULL;
+
+       return lh_entry(lh, lnet_me_t, me_lh);
+}
+
+static inline void
+lnet_peer_addref_locked(lnet_peer_t *lp)
+{
+       LASSERT (lp->lp_refcount > 0);
+       lp->lp_refcount++;
+}
+
+extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+
+static inline void
+lnet_peer_decref_locked(lnet_peer_t *lp)
+{
+       LASSERT (lp->lp_refcount > 0);
+       lp->lp_refcount--;
+       if (lp->lp_refcount == 0)
+               lnet_destroy_peer_locked(lp);
+}
+
+static inline int
+lnet_isrouter(lnet_peer_t *lp)
+{
+       return lp->lp_rtr_refcount != 0;
+}
+
+static inline void
+lnet_ni_addref_locked(lnet_ni_t *ni, int cpt)
+{
+       LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+       LASSERT(*ni->ni_refs[cpt] >= 0);
+
+       (*ni->ni_refs[cpt])++;
+}
+
+static inline void
+lnet_ni_addref(lnet_ni_t *ni)
+{
+       lnet_net_lock(0);
+       lnet_ni_addref_locked(ni, 0);
+       lnet_net_unlock(0);
+}
+
+static inline void
+lnet_ni_decref_locked(lnet_ni_t *ni, int cpt)
+{
+       LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER);
+       LASSERT(*ni->ni_refs[cpt] > 0);
+
+       (*ni->ni_refs[cpt])--;
+}
+
+static inline void
+lnet_ni_decref(lnet_ni_t *ni)
+{
+       lnet_net_lock(0);
+       lnet_ni_decref_locked(ni, 0);
+       lnet_net_unlock(0);
+}
+
+void lnet_ni_free(lnet_ni_t *ni);
+
+static inline int
+lnet_nid2peerhash(lnet_nid_t nid)
+{
+       return cfs_hash_long(nid, LNET_PEER_HASH_BITS);
+}
+
+static inline struct list_head *
+lnet_net2rnethash(__u32 net)
+{
+       return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) +
+               LNET_NETTYP(net)) &
+               ((1U << the_lnet.ln_remote_nets_hbits) - 1)];
+}
+
+extern lnd_t the_lolnd;
+
+
+extern int lnet_cpt_of_nid_locked(lnet_nid_t nid);
+extern int lnet_cpt_of_nid(lnet_nid_t nid);
+extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt);
+extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
+extern lnet_ni_t *lnet_net2ni(__u32 net);
+
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when);
+void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when);
+int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
+int lnet_check_routes(void);
+int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_destroy_routes(void);
+int lnet_get_route(int idx, __u32 *net, __u32 *hops,
+                  lnet_nid_t *gateway, __u32 *alive);
+void lnet_proc_init(void);
+void lnet_proc_fini(void);
+int  lnet_rtrpools_alloc(int im_a_router);
+void lnet_rtrpools_free(void);
+lnet_remotenet_t *lnet_find_net_locked (__u32 net);
+
+int lnet_islocalnid(lnet_nid_t nid);
+int lnet_islocalnet(__u32 net);
+
+void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+                       unsigned int offset, unsigned int mlen);
+void lnet_msg_detach_md(lnet_msg_t *msg, int status);
+void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev);
+void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type);
+void lnet_msg_commit(lnet_msg_t *msg, int cpt);
+void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status);
+
+void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev);
+void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+                   unsigned int offset, unsigned int len);
+int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid);
+void lnet_return_tx_credits_locked(lnet_msg_t *msg);
+void lnet_return_rx_credits_locked(lnet_msg_t *msg);
+
+/* portals functions */
+/* portals attributes */
+static inline int
+lnet_ptl_is_lazy(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_LAZY);
+}
+
+static inline int
+lnet_ptl_is_unique(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE);
+}
+
+static inline int
+lnet_ptl_is_wildcard(lnet_portal_t *ptl)
+{
+       return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD);
+}
+
+static inline void
+lnet_ptl_setopt(lnet_portal_t *ptl, int opt)
+{
+       ptl->ptl_options |= opt;
+}
+
+static inline void
+lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt)
+{
+       ptl->ptl_options &= ~opt;
+}
+
+/* match-table functions */
+struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable,
+                              lnet_process_id_t id, __u64 mbits);
+struct lnet_match_table *lnet_mt_of_attach(unsigned int index,
+                                          lnet_process_id_t id, __u64 mbits,
+                                          __u64 ignore_bits,
+                                          lnet_ins_pos_t pos);
+int lnet_mt_match_md(struct lnet_match_table *mtable,
+                    struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* portals match/attach functions */
+void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+                       struct list_head *matches, struct list_head *drops);
+void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md);
+int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg);
+
+/* initialized and finalize portals */
+int lnet_portals_create(void);
+void lnet_portals_destroy(void);
+
+/* message functions */
+int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr,
+               lnet_nid_t fromnid, void *private, int rdma_req);
+void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+              unsigned int offset, unsigned int mlen, unsigned int rlen);
+lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg);
+void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len);
+void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc);
+void lnet_drop_delayed_msg_list(struct list_head *head, char *reason);
+void lnet_recv_delayed_msg_list(struct list_head *head);
+
+int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt);
+void lnet_msg_container_cleanup(struct lnet_msg_container *container);
+void lnet_msg_containers_destroy(void);
+int lnet_msg_containers_create(void);
+
+char *lnet_msgtyp2str (int type);
+void lnet_print_hdr (lnet_hdr_t * hdr);
+int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold);
+
+void lnet_counters_get(lnet_counters_t *counters);
+void lnet_counters_reset(void);
+
+unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov);
+int lnet_extract_iov (int dst_niov, struct iovec *dst,
+                     int src_niov, struct iovec *src,
+                     unsigned int offset, unsigned int len);
+
+unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov);
+int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                     int src_niov, lnet_kiov_t *src,
+                     unsigned int offset, unsigned int len);
+
+void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov,
+                       unsigned int doffset,
+                       unsigned int nsiov, struct iovec *siov,
+                       unsigned int soffset, unsigned int nob);
+void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov,
+                        unsigned int iovoffset,
+                        unsigned int nkiov, lnet_kiov_t *kiov,
+                        unsigned int kiovoffset, unsigned int nob);
+void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov,
+                        unsigned int kiovoffset,
+                        unsigned int niov, struct iovec *iov,
+                        unsigned int iovoffset, unsigned int nob);
+void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov,
+                         unsigned int doffset,
+                         unsigned int nskiov, lnet_kiov_t *skiov,
+                         unsigned int soffset, unsigned int nob);
+
+static inline void
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
+                  unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                  unsigned int nob)
+{
+       struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen};
+
+       lnet_copy_iov2iov(1, &diov, doffset,
+                         nsiov, siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
+                   unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
+                   unsigned int nob)
+{
+       struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen};
+
+       lnet_copy_kiov2iov(1, &diov, doffset,
+                          nsiov, skiov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                  int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+       struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen};
+       lnet_copy_iov2iov(ndiov, diov, doffset,
+                         1, &siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
+                   int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+       struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen};
+       lnet_copy_iov2kiov(ndiov, dkiov, doffset,
+                          1, &siov, soffset, nob);
+}
+
+void lnet_me_unlink(lnet_me_t *me);
+
+void lnet_md_unlink(lnet_libmd_t *md);
+void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+
+void lnet_register_lnd(lnd_t *lnd);
+void lnet_unregister_lnd(lnd_t *lnd);
+int lnet_set_ip_niaddr (lnet_ni_t *ni);
+
+int lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+                __u32 local_ip, __u32 peer_ip, int peer_port);
+void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+                               __u32 peer_ip, int port);
+int lnet_count_acceptor_nis(void);
+int lnet_acceptor_timeout(void);
+int lnet_acceptor_port(void);
+
+int lnet_acceptor_start(void);
+void lnet_acceptor_stop(void);
+
+void lnet_get_tunables(void);
+int lnet_peers_start_down(void);
+int lnet_peer_buffer_credits(lnet_ni_t *ni);
+
+int lnet_router_checker_start(void);
+void lnet_router_checker_stop(void);
+void lnet_swap_pinginfo(lnet_ping_info_t *info);
+
+int lnet_ping_target_init(void);
+void lnet_ping_target_fini(void);
+int lnet_ping(lnet_process_id_t id, int timeout_ms,
+             lnet_process_id_t *ids, int n_ids);
+
+int lnet_parse_ip2nets (char **networksp, char *ip2nets);
+int lnet_parse_routes (char *route_str, int *im_a_router);
+int lnet_parse_networks (struct list_head *nilist, char *networks);
+
+int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt);
+lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable,
+                                  lnet_nid_t nid);
+void lnet_peer_tables_cleanup(void);
+void lnet_peer_tables_destroy(void);
+int lnet_peer_tables_create(void);
+void lnet_debug_peer(lnet_nid_t nid);
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h
new file mode 100644 (file)
index 0000000..86428d4
--- /dev/null
@@ -0,0 +1,765 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef __LNET_LIB_TYPES_H__
+#define __LNET_LIB_TYPES_H__
+
+#include <linux/lnet/linux/lib-types.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/list.h>
+#include <linux/lnet/types.h>
+
+#define WIRE_ATTR       __attribute__((packed))
+
+/* Packed version of lnet_process_id_t to transfer via network */
+typedef struct {
+       lnet_nid_t nid;
+       lnet_pid_t pid;   /* node id / process id */
+} WIRE_ATTR lnet_process_id_packed_t;
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+       __u64 wh_interface_cookie;
+       __u64 wh_object_cookie;
+} WIRE_ATTR lnet_handle_wire_t;
+
+typedef enum {
+       LNET_MSG_ACK = 0,
+       LNET_MSG_PUT,
+       LNET_MSG_GET,
+       LNET_MSG_REPLY,
+       LNET_MSG_HELLO,
+} lnet_msg_type_t;
+
+/* The variant fields of the portals message header are aligned on an 8
+ * byte boundary in the message header.  Note that all types used in these
+ * wire structs MUST be fixed size and the smaller types are placed at the
+ * end. */
+typedef struct lnet_ack {
+       lnet_handle_wire_t  dst_wmd;
+       __u64          match_bits;
+       __u32          mlength;
+} WIRE_ATTR lnet_ack_t;
+
+typedef struct lnet_put {
+       lnet_handle_wire_t  ack_wmd;
+       __u64          match_bits;
+       __u64          hdr_data;
+       __u32          ptl_index;
+       __u32          offset;
+} WIRE_ATTR lnet_put_t;
+
+typedef struct lnet_get {
+       lnet_handle_wire_t  return_wmd;
+       __u64          match_bits;
+       __u32          ptl_index;
+       __u32          src_offset;
+       __u32          sink_length;
+} WIRE_ATTR lnet_get_t;
+
+typedef struct lnet_reply {
+       lnet_handle_wire_t  dst_wmd;
+} WIRE_ATTR lnet_reply_t;
+
+typedef struct lnet_hello {
+       __u64         incarnation;
+       __u32         type;
+} WIRE_ATTR lnet_hello_t;
+
+typedef struct {
+       lnet_nid_t        dest_nid;
+       lnet_nid_t        src_nid;
+       lnet_pid_t        dest_pid;
+       lnet_pid_t        src_pid;
+       __u32          type;           /* lnet_msg_type_t */
+       __u32          payload_length;     /* payload data to follow */
+       /*<------__u64 aligned------->*/
+       union {
+               lnet_ack_t   ack;
+               lnet_put_t   put;
+               lnet_get_t   get;
+               lnet_reply_t reply;
+               lnet_hello_t hello;
+       } msg;
+} WIRE_ATTR lnet_hdr_t;
+
+/* A HELLO message contains a magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * LNET_MSG_HELLO in the type field.  All other common fields are zero
+ * (including payload_length; i.e. no payload).
+ * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID. These LNDs should
+ * exchange HELLO messages when a connection is first established.  Individual
+ * LNDs can put whatever else they fancy in lnet_hdr_t::msg.
+ */
+typedef struct {
+       __u32   magic;                    /* LNET_PROTO_TCP_MAGIC */
+       __u16   version_major;            /* increment on incompatible change */
+       __u16   version_minor;            /* increment on compatible change */
+} WIRE_ATTR lnet_magicversion_t;
+
+/* PROTO MAGIC for LNDs */
+#define LNET_PROTO_IB_MAGIC             0x0be91b91
+#define LNET_PROTO_RA_MAGIC             0x0be91b92
+#define LNET_PROTO_QSW_MAGIC           0x0be91b93
+#define LNET_PROTO_GNI_MAGIC           0xb00fbabe /* ask Kim */
+#define LNET_PROTO_TCP_MAGIC           0xeebc0ded
+#define LNET_PROTO_PTL_MAGIC           0x50746C4E /* 'PtlN' unique magic */
+#define LNET_PROTO_MX_MAGIC             0x4d583130 /* 'MX10'! */
+#define LNET_PROTO_ACCEPTOR_MAGIC         0xacce7100
+#define LNET_PROTO_PING_MAGIC         0x70696E67 /* 'ping' */
+
+/* Placeholder for a future "unified" protocol across all LNDs */
+/* Current LNDs that receive a request with this magic will respond with a
+ * "stub" reply using their current protocol */
+#define LNET_PROTO_MAGIC                   0x45726963 /* ! */
+
+
+#define LNET_PROTO_TCP_VERSION_MAJOR   1
+#define LNET_PROTO_TCP_VERSION_MINOR   0
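+
+/*
+ * Illustrative sketch only; assumes 'hello' has already been read off the
+ * wire by the LND and that byte-order handling is done elsewhere. Validate
+ * a byte-stream handshake against the magic and major version above:
+ *
+ *     lnet_magicversion_t hello;
+ *     ...
+ *     if (hello.magic != LNET_PROTO_TCP_MAGIC ||
+ *         hello.version_major != LNET_PROTO_TCP_VERSION_MAJOR)
+ *             return -EPROTO;
+ */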
+
+/* Acceptor connection request */
+typedef struct {
+       __u32       acr_magic;            /* PTL_ACCEPTOR_PROTO_MAGIC */
+       __u32       acr_version;                /* protocol version */
+       __u64       acr_nid;                /* target NID */
+} WIRE_ATTR lnet_acceptor_connreq_t;
+
+#define LNET_PROTO_ACCEPTOR_VERSION       1
+
+/* forward refs */
+struct lnet_libmd;
+
+typedef struct lnet_msg {
+       struct list_head            msg_activelist;
+       struct list_head            msg_list;      /* Q for credits/MD */
+
+       lnet_process_id_t     msg_target;
+       /* where it is from; only used for building the event */
+       lnet_nid_t              msg_from;
+       __u32                   msg_type;
+
+       /* committed for sending */
+       unsigned int            msg_tx_committed:1;
+       /* CPT # this message committed for sending */
+       unsigned int            msg_tx_cpt:15;
+       /* committed for receiving */
+       unsigned int            msg_rx_committed:1;
+       /* CPT # this message committed for receiving */
+       unsigned int            msg_rx_cpt:15;
+       /* queued for tx credit */
+       unsigned int            msg_tx_delayed:1;
+       /* queued for RX buffer */
+       unsigned int            msg_rx_delayed:1;
+       /* ready to be queued on the RX delay list */
+       unsigned int            msg_rx_ready_delay:1;
+
+       unsigned int      msg_vmflush:1;      /* VM trying to free memory */
+       unsigned int      msg_target_is_router:1; /* sending to a router */
+       unsigned int      msg_routing:1;      /* being forwarded */
+       unsigned int      msg_ack:1;      /* ack on finalize (PUT) */
+       unsigned int      msg_sending:1;      /* outgoing message */
+       unsigned int      msg_receiving:1;    /* being received */
+       unsigned int      msg_txcredit:1;     /* taken an NI send credit */
+       unsigned int      msg_peertxcredit:1; /* taken a peer send credit */
+       unsigned int      msg_rtrcredit:1;    /* taken a global router credit */
+       unsigned int      msg_peerrtrcredit:1; /* taken a peer router credit */
+       unsigned int      msg_onactivelist:1; /* on the activelist */
+
+       struct lnet_peer     *msg_txpeer;        /* peer I'm sending to */
+       struct lnet_peer     *msg_rxpeer;        /* peer I received from */
+
+       void             *msg_private;
+       struct lnet_libmd    *msg_md;
+
+       unsigned int      msg_len;
+       unsigned int      msg_wanted;
+       unsigned int      msg_offset;
+       unsigned int      msg_niov;
+       struct iovec     *msg_iov;
+       lnet_kiov_t       *msg_kiov;
+
+       lnet_event_t      msg_ev;
+       lnet_hdr_t          msg_hdr;
+} lnet_msg_t;
+
+
+typedef struct lnet_libhandle {
+       struct list_head            lh_hash_chain;
+       __u64            lh_cookie;
+} lnet_libhandle_t;
+
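+/* Open-coded container_of(): given a pointer to 'member' embedded in
+ * 'type', return a pointer to the enclosing 'type' object. */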
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(char *)(&((type *)0)->member)))
+
+typedef struct lnet_eq {
+       struct list_head                eq_list;
+       lnet_libhandle_t        eq_lh;
+       lnet_seq_t              eq_enq_seq;
+       lnet_seq_t              eq_deq_seq;
+       unsigned int            eq_size;
+       lnet_eq_handler_t       eq_callback;
+       lnet_event_t            *eq_events;
+       int                     **eq_refs;      /* percpt refcount for EQ */
+} lnet_eq_t;
+
+typedef struct lnet_me {
+       struct list_head             me_list;
+       lnet_libhandle_t       me_lh;
+       lnet_process_id_t      me_match_id;
+       unsigned int       me_portal;
+       unsigned int       me_pos;              /* hash offset in mt_hash */
+       __u64             me_match_bits;
+       __u64             me_ignore_bits;
+       lnet_unlink_t     me_unlink;
+       struct lnet_libmd     *me_md;
+} lnet_me_t;
+
+typedef struct lnet_libmd {
+       struct list_head            md_list;
+       lnet_libhandle_t      md_lh;
+       lnet_me_t           *md_me;
+       char             *md_start;
+       unsigned int      md_offset;
+       unsigned int      md_length;
+       unsigned int      md_max_size;
+       int                md_threshold;
+       int                md_refcount;
+       unsigned int      md_options;
+       unsigned int      md_flags;
+       void             *md_user_ptr;
+       lnet_eq_t           *md_eq;
+       unsigned int      md_niov;              /* # frags */
+       union {
+               struct iovec  iov[LNET_MAX_IOV];
+               lnet_kiov_t   kiov[LNET_MAX_IOV];
+       } md_iov;
+} lnet_libmd_t;
+
+#define LNET_MD_FLAG_ZOMBIE       (1 << 0)
+#define LNET_MD_FLAG_AUTO_UNLINK      (1 << 1)
+
+#ifdef LNET_USE_LIB_FREELIST
+typedef struct
+{
+       void              *fl_objs;       /* single contiguous array of objects */
+       int                 fl_nobjs;    /* the number of them */
+       int                 fl_objsize;       /* the size (including overhead) of each of them */
+       struct list_head             fl_list;     /* where they are enqueued */
+} lnet_freelist_t;
+
+typedef struct
+{
+       struct list_head             fo_list;        /* enqueue on fl_list */
+       void              *fo_contents;  /* aligned contents */
+} lnet_freeobj_t;
+#endif
+
+typedef struct {
+       /* info about peers we are trying to fail */
+       struct list_head             tp_list;        /* ln_test_peers */
+       lnet_nid_t           tp_nid;          /* matching nid */
+       unsigned int       tp_threshold;        /* # failures to simulate */
+} lnet_test_peer_t;
+
+#define LNET_COOKIE_TYPE_MD    1
+#define LNET_COOKIE_TYPE_ME    2
+#define LNET_COOKIE_TYPE_EQ    3
+#define LNET_COOKIE_TYPE_BITS  2
+#define LNET_COOKIE_MASK       ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
+
+struct lnet_ni;                                  /* forward ref */
+
+typedef struct lnet_lnd
+{
+       /* fields managed by portals */
+       struct list_head            lnd_list;        /* stash in the LND table */
+       int                lnd_refcount;         /* # active instances */
+
+       /* fields initialised by the LND */
+       unsigned int      lnd_type;
+
+       int  (*lnd_startup) (struct lnet_ni *ni);
+       void (*lnd_shutdown) (struct lnet_ni *ni);
+       int  (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+       /* In data movement APIs below, payload buffers are described as a set
+        * of 'niov' fragments which are...
+        * EITHER
+        *    in virtual memory (struct iovec *iov != NULL)
+        * OR
+        *    in pages (kernel only: lnet_kiov_t *kiov != NULL).
+        * The LND may NOT overwrite these fragment descriptors.
+        * An 'offset' may specify a byte offset within the set of
+        * fragments to start from.
+        */
+
+       /* Start sending a preformatted message.  'private' is NULL for PUT and
+        * GET messages; otherwise this is a response to an incoming message
+        * and 'private' is the 'private' passed to lnet_parse().  Return
+        * non-zero for immediate failure, otherwise complete later with
+        * lnet_finalize() */
+       int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg);
+
+       /* Start receiving 'mlen' bytes of payload data, skipping the following
+        * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+        * lnet_parse().  Return non-zero for immediate failure, otherwise
+        * complete later with lnet_finalize().  This also gives back a receive
+        * credit if the LND does flow control. */
+       int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                       int delayed, unsigned int niov,
+                       struct iovec *iov, lnet_kiov_t *kiov,
+                       unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+       /* lnet_parse() has had to delay processing of this message
+        * (e.g. waiting for a forwarding buffer or send credits).  Give the
+        * LND a chance to free urgently needed resources.  If called, return 0
+        * for success and do NOT give back a receive credit; that has to wait
+        * until lnd_recv() gets called.  On failure return < 0 and
+        * release resources; lnd_recv() will not be called. */
+       int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                             void **new_privatep);
+
+       /* notification of peer health */
+       void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+
+       /* query of peer aliveness */
+       void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when);
+
+       /* accept a new connection */
+       int (*lnd_accept)(struct lnet_ni *ni, socket_t *sock);
+
+} lnd_t;
+
+#define LNET_NI_STATUS_UP      0x15aac0de
+#define LNET_NI_STATUS_DOWN    0xdeadface
+#define LNET_NI_STATUS_INVALID 0x00000000
+typedef struct {
+       lnet_nid_t ns_nid;
+       __u32      ns_status;
+       __u32      ns_unused;
+} WIRE_ATTR lnet_ni_status_t;
+
+struct lnet_tx_queue {
+       int                     tq_credits;     /* # tx credits free */
+       int                     tq_credits_min; /* lowest it's been */
+       int                     tq_credits_max; /* total # tx credits */
+       struct list_head                tq_delayed;     /* delayed TXs */
+};
+
+#define LNET_MAX_INTERFACES   16
+
+typedef struct lnet_ni {
+       spinlock_t              ni_lock;
+       struct list_head                ni_list;        /* chain on ln_nis */
+       struct list_head                ni_cptlist;     /* chain on ln_nis_cpt */
+       int                     ni_maxtxcredits; /* # tx credits  */
+       /* # per-peer send credits */
+       int                     ni_peertxcredits;
+       /* # per-peer router buffer credits */
+       int                     ni_peerrtrcredits;
+       /* seconds to consider peer dead */
+       int                     ni_peertimeout;
+       int                     ni_ncpts;       /* number of CPTs */
+       __u32                   *ni_cpts;       /* bond NI on some CPTs */
+       lnet_nid_t              ni_nid;         /* interface's NID */
+       void                    *ni_data;       /* instance-specific data */
+       lnd_t                   *ni_lnd;        /* procedural interface */
+       struct lnet_tx_queue    **ni_tx_queues; /* percpt TX queues */
+       int                     **ni_refs;      /* percpt reference count */
+       long                    ni_last_alive;  /* when I was last alive */
+       lnet_ni_status_t        *ni_status;     /* my health status */
+       /* equivalent interfaces to use */
+       char                    *ni_interfaces[LNET_MAX_INTERFACES];
+} lnet_ni_t;
+
+#define LNET_PROTO_PING_MATCHBITS      0x8000000000000000LL
+
+/* NB: the values of these features are equal to LNET_PROTO_PING_VERSION_x
+ * of old LNet, so there shouldn't be any compatibility issues */
+#define LNET_PING_FEAT_INVAL           (0)             /* no feature */
+#define LNET_PING_FEAT_BASE            (1 << 0)        /* just a ping */
+#define LNET_PING_FEAT_NI_STATUS       (1 << 1)        /* return NI status */
+
+#define LNET_PING_FEAT_MASK            (LNET_PING_FEAT_BASE | \
+                                        LNET_PING_FEAT_NI_STATUS)
+
+typedef struct {
+       __u32                   pi_magic;
+       __u32                   pi_features;
+       lnet_pid_t              pi_pid;
+       __u32                   pi_nnis;
+       lnet_ni_status_t        pi_ni[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* router checker data, per router */
+#define LNET_MAX_RTR_NIS   16
+#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS])
+typedef struct {
+       /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */
+       struct list_head                rcd_list;
+       lnet_handle_md_t        rcd_mdh;        /* ping buffer MD */
+       struct lnet_peer        *rcd_gateway;   /* reference to gateway */
+       lnet_ping_info_t        *rcd_pinginfo;  /* ping buffer */
+} lnet_rc_data_t;
+
+typedef struct lnet_peer {
+       struct list_head        lp_hashlist;      /* chain on peer hash */
+       struct list_head        lp_txq;        /* messages blocking for tx credits */
+       struct list_head        lp_rtrq;              /* messages blocking for router credits */
+       struct list_head        lp_rtr_list;      /* chain on router list */
+       int            lp_txcredits;     /* # tx credits available */
+       int            lp_mintxcredits;      /* low water mark */
+       int            lp_rtrcredits;   /* # router credits */
+       int            lp_minrtrcredits;     /* low water mark */
+       unsigned int      lp_alive:1;      /* alive/dead? */
+       unsigned int      lp_notify:1;    /* notification outstanding? */
+       unsigned int      lp_notifylnd:1;       /* outstanding notification for LND? */
+       unsigned int      lp_notifying:1;       /* some thread is handling notification */
+       unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
+       int            lp_alive_count;       /* # times router went dead<->alive */
+       long          lp_txqnob;            /* bytes queued for sending */
+       cfs_time_t      lp_timestamp;    /* time of last aliveness news */
+       cfs_time_t      lp_ping_timestamp;    /* time of last ping attempt */
+       cfs_time_t      lp_ping_deadline;     /* != 0 if ping reply expected */
+       cfs_time_t      lp_last_alive;  /* when I was last alive */
+       cfs_time_t      lp_last_query;  /* when lp_ni was queried last time */
+       lnet_ni_t       *lp_ni;         /* interface peer is on */
+       lnet_nid_t      lp_nid;        /* peer's NID */
+       int            lp_refcount;       /* # refs */
+       int                     lp_cpt;         /* CPT this peer attached on */
+       /* # refs from lnet_route_t::lr_gateway */
+       int                     lp_rtr_refcount;
+       /* returned RC ping features */
+       unsigned int            lp_ping_feats;
+       struct list_head                lp_routes;      /* routers on this peer */
+       lnet_rc_data_t          *lp_rcd;        /* router checker state */
+} lnet_peer_t;
+
+
+/* peer hash size */
+#define LNET_PEER_HASH_BITS     9
+#define LNET_PEER_HASH_SIZE     (1 << LNET_PEER_HASH_BITS)
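
This gives the peer table 1 << 9 = 512 buckets. The hash function itself lives elsewhere in the series; purely for illustration, one plausible way a 64-bit NID could be folded into a bucket index:

#include <stdint.h>

/* hypothetical bucketing (the real hash is not in this hunk);
 * Fibonacci hashing keeps the top bits well mixed */
static unsigned int peer_hash_bucket(uint64_t nid)
{
	return (unsigned int)((nid * 0x9e3779b97f4a7c15ULL) >>
			      (64 - 9 /* LNET_PEER_HASH_BITS */));
}
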
+
+/* peer hash table */
+struct lnet_peer_table {
+       int                     pt_version;     /* /proc validity stamp */
+       int                     pt_number;      /* # peers extant */
+       struct list_head                pt_deathrow;    /* zombie peers */
+       struct list_head                *pt_hash;       /* NID->peer hash */
+};
+
+/* peer aliveness is enabled only on routers, for peers in a network where
+ * lnet_ni_t::ni_peertimeout has been set to a positive value */
+#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \
+                                        (lp)->lp_ni->ni_peertimeout > 0)
+
+typedef struct {
+       struct list_head                lr_list;        /* chain on net */
+       struct list_head                lr_gwlist;      /* chain on gateway */
+       lnet_peer_t             *lr_gateway;    /* router node */
+       __u32                   lr_net;         /* remote network number */
+       int                     lr_seq;         /* sequence for round-robin */
+       unsigned int            lr_downis;      /* number of down NIs */
+       unsigned int            lr_hops;        /* how far I am */
+} lnet_route_t;
+
+#define LNET_REMOTE_NETS_HASH_DEFAULT  (1U << 7)
+#define LNET_REMOTE_NETS_HASH_MAX      (1U << 16)
+#define LNET_REMOTE_NETS_HASH_SIZE     (1 << the_lnet.ln_remote_nets_hbits)
+
+typedef struct {
+       struct list_head              lrn_list;       /* chain on ln_remote_nets_hash */
+       struct list_head              lrn_routes;     /* routes to me */
+       __u32              lrn_net;     /* my net number */
+} lnet_remotenet_t;
+
+typedef struct {
+       struct list_head rbp_bufs;           /* my free buffer pool */
+       struct list_head rbp_msgs;           /* messages blocking for a buffer */
+       int     rbp_npages;        /* # pages in each buffer */
+       int     rbp_nbuffers;    /* # buffers */
+       int     rbp_credits;      /* # free buffers / blocked messages */
+       int     rbp_mincredits;       /* low water mark */
+} lnet_rtrbufpool_t;
+
+typedef struct {
+       struct list_head             rb_list;        /* chain on rbp_bufs */
+       lnet_rtrbufpool_t     *rb_pool;      /* owning pool */
+       lnet_kiov_t         rb_kiov[0];   /* the buffer space */
+} lnet_rtrbuf_t;
+
+typedef struct {
+       __u32   msgs_alloc;
+       __u32   msgs_max;
+       __u32   errors;
+       __u32   send_count;
+       __u32   recv_count;
+       __u32   route_count;
+       __u32   drop_count;
+       __u64   send_length;
+       __u64   recv_length;
+       __u64   route_length;
+       __u64   drop_length;
+} WIRE_ATTR lnet_counters_t;
+
+#define LNET_PEER_HASHSIZE   503               /* prime! */
+
+#define LNET_NRBPOOLS   3               /* # different router buffer pools */
+
+enum {
+       /* Didn't match anything */
+       LNET_MATCHMD_NONE       = (1 << 0),
+       /* Matched OK */
+       LNET_MATCHMD_OK         = (1 << 1),
+       /* Must be discarded */
+       LNET_MATCHMD_DROP       = (1 << 2),
+       /* match and buffer is exhausted */
+       LNET_MATCHMD_EXHAUSTED  = (1 << 3),
+       /* match or drop */
+       LNET_MATCHMD_FINISH     = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP),
+};
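
LNET_MATCHMD_EXHAUSTED can be OR'd together with OK or DROP, while FINISH groups the two terminal outcomes. A small sketch of how a caller might branch on these flags (the function name is hypothetical):

/* hypothetical: decide whether matching for this message is complete */
static int match_is_finished(int rc)
{
	int finish = (1 << 1) | (1 << 2);       /* LNET_MATCHMD_FINISH */

	/* EXHAUSTED may ride along with OK or DROP; NONE means no match yet */
	return (rc & finish) != 0;
}
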
+
+/* Options for lnet_portal_t::ptl_options */
+#define LNET_PTL_LAZY         (1 << 0)
+#define LNET_PTL_MATCH_UNIQUE       (1 << 1)    /* unique match, for RDMA */
+#define LNET_PTL_MATCH_WILDCARD     (1 << 2)    /* wildcard match, request portal */
+
+/* parameter for matching operations (GET, PUT) */
+struct lnet_match_info {
+       __u64                   mi_mbits;
+       lnet_process_id_t       mi_id;
+       unsigned int            mi_opc;
+       unsigned int            mi_portal;
+       unsigned int            mi_rlength;
+       unsigned int            mi_roffset;
+};
+
+/* ME hash of RDMA portal */
+#define LNET_MT_HASH_BITS              8
+#define LNET_MT_HASH_SIZE              (1 << LNET_MT_HASH_BITS)
+#define LNET_MT_HASH_MASK              (LNET_MT_HASH_SIZE - 1)
+/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash,
+ * the last entry is reserved for MEs with ignore-bits */
+#define LNET_MT_HASH_IGNORE            LNET_MT_HASH_SIZE
+/* a __u64 has 2^6 bits, so we need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64),
+ * i.e. 4 __u64s, as the bit-map, plus an extra __u64 (of which only one bit
+ * is used) for the ME-list with ignore-bits, which is
+ * mtable::mt_hash[LNET_MT_HASH_IGNORE] */
+#define LNET_MT_BITS_U64               6       /* 2^6 bits */
+#define LNET_MT_EXHAUSTED_BITS         (LNET_MT_HASH_BITS - LNET_MT_BITS_U64)
+#define LNET_MT_EXHAUSTED_BMAP         ((1 << LNET_MT_EXHAUSTED_BITS) + 1)
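
Worked out: 2^8 hash buckets / 2^6 bits per __u64 = 4 words, plus one extra word whose single used bit covers the ignore-bits list at index LNET_MT_HASH_SIZE, hence a 5-word bitmap. A user-space sketch of the set/test arithmetic under those constants:

#include <stdint.h>

#define MT_HASH_BITS   8                /* LNET_MT_HASH_BITS */
#define MT_BITS_U64    6                /* LNET_MT_BITS_U64 */
#define MT_BMAP_WORDS  ((1 << (MT_HASH_BITS - MT_BITS_U64)) + 1)   /* 5 */

/* pos 0..255 are hash buckets; pos 256 is the ignore-bits list */
static void mt_set_exhausted(uint64_t bmap[MT_BMAP_WORDS], unsigned int pos)
{
	bmap[pos >> MT_BITS_U64] |= 1ULL << (pos & 63);
}

static int mt_test_exhausted(const uint64_t bmap[MT_BMAP_WORDS],
			     unsigned int pos)
{
	return (int)((bmap[pos >> MT_BITS_U64] >> (pos & 63)) & 1);
}
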
+
+/* portal match table */
+struct lnet_match_table {
+       /* reserved for upcoming patches, CPU partition ID */
+       unsigned int            mt_cpt;
+       unsigned int            mt_portal;      /* portal index */
+       /* match table is set as "enabled" if there's a non-exhausted MD
+        * attached on mt_mhash; it's only valid for wildcard portals */
+       unsigned int            mt_enabled;
+       /* bitmap to flag whether MEs on mt_hash are exhausted or not */
+       __u64                   mt_exhausted[LNET_MT_EXHAUSTED_BMAP];
+       struct list_head                *mt_mhash;      /* matching hash */
+};
+
+/* these are only useful for wildcard portal */
+/* Turn off message rotor for wildcard portals */
+#define        LNET_PTL_ROTOR_OFF      0
+/* round-robin dispatch all PUT messages for wildcard portals */
+#define        LNET_PTL_ROTOR_ON       1
+/* round-robin dispatch routed PUT message for wildcard portals */
+#define        LNET_PTL_ROTOR_RR_RT    2
+/* dispatch routed PUT message by hashing source NID for wildcard portals */
+#define        LNET_PTL_ROTOR_HASH_RT  3
+
+typedef struct lnet_portal {
+       spinlock_t              ptl_lock;
+       unsigned int            ptl_index;      /* portal ID, reserved */
+       /* flags on this portal: lazy, unique... */
+       unsigned int            ptl_options;
+       /* list of messages which are stealing buffers */
+       struct list_head                ptl_msg_stealing;
+       /* messages blocking for MD */
+       struct list_head                ptl_msg_delayed;
+       /* Match table for each CPT */
+       struct lnet_match_table **ptl_mtables;
+       /* spread rotor of incoming "PUT" */
+       int                     ptl_rotor;
+       /* # active entries for this portal */
+       int                  ptl_mt_nmaps;
+       /* array of active entries' cpu-partition-id */
+       int                  ptl_mt_maps[0];
+} lnet_portal_t;
+
+#define LNET_LH_HASH_BITS      12
+#define LNET_LH_HASH_SIZE      (1ULL << LNET_LH_HASH_BITS)
+#define LNET_LH_HASH_MASK      (LNET_LH_HASH_SIZE - 1)
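
Handles are looked up by cookie in a 4096-bucket hash. Illustratively (the real lookup is elsewhere in the series), masking the cookie with LNET_LH_HASH_MASK selects a bucket:

#include <stdint.h>

/* hypothetical: map a handle cookie to one of the 4096 buckets */
static unsigned int lh_hash_bucket(uint64_t lh_cookie)
{
	return (unsigned int)(lh_cookie & ((1ULL << 12) - 1));
}
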
+
+/* resource container (ME, MD, EQ) */
+struct lnet_res_container {
+       unsigned int            rec_type;       /* container type */
+       __u64                   rec_lh_cookie;  /* cookie generator */
+       struct list_head                rec_active;     /* active resource list */
+       struct list_head                *rec_lh_hash;   /* handle hash */
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_t         rec_freelist;   /* freelist for resources */
+#endif
+};
+
+/* message container */
+struct lnet_msg_container {
+       int                     msc_init;       /* initialized or not */
+       /* max # threads finalizing */
+       int                     msc_nfinalizers;
+       /* msgs waiting to complete finalizing */
+       struct list_head                msc_finalizing;
+       struct list_head                msc_active;     /* active message list */
+       /* threads doing finalization */
+       void                    **msc_finalizers;
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_t         msc_freelist;   /* freelist for messages */
+#endif
+};
+
+/* Router Checker states */
+#define LNET_RC_STATE_SHUTDOWN         0       /* not started */
+#define LNET_RC_STATE_RUNNING          1       /* started up OK */
+#define LNET_RC_STATE_STOPPING         2       /* telling thread to stop */
+
+typedef struct
+{
+       /* CPU partition table of LNet */
+       struct cfs_cpt_table            *ln_cpt_table;
+       /* number of CPTs in ln_cpt_table */
+       unsigned int                    ln_cpt_number;
+       unsigned int                    ln_cpt_bits;
+
+       /* protect LNet resources (ME/MD/EQ) */
+       struct cfs_percpt_lock          *ln_res_lock;
+       /* # portals */
+       int                             ln_nportals;
+       /* the vector of portals */
+       lnet_portal_t                   **ln_portals;
+       /* percpt ME containers */
+       struct lnet_res_container       **ln_me_containers;
+       /* percpt MD container */
+       struct lnet_res_container       **ln_md_containers;
+
+       /* Event Queue container */
+       struct lnet_res_container       ln_eq_container;
+       wait_queue_head_t                       ln_eq_waitq;
+       spinlock_t                      ln_eq_wait_lock;
+       unsigned int                    ln_remote_nets_hbits;
+
+       /* protect NI, peer table, credits, routers, rtrbuf... */
+       struct cfs_percpt_lock          *ln_net_lock;
+       /* percpt message containers for active/finalizing/freed message */
+       struct lnet_msg_container       **ln_msg_containers;
+       lnet_counters_t                 **ln_counters;
+       struct lnet_peer_table          **ln_peer_tables;
+       /* failure simulation */
+       struct list_head                        ln_test_peers;
+
+       struct list_head                        ln_nis;         /* LND instances */
+       /* NIs bond on specific CPT(s) */
+       struct list_head                        ln_nis_cpt;
+       /* dying LND instances */
+       struct list_head                        ln_nis_zombie;
+       lnet_ni_t                       *ln_loni;       /* the loopback NI */
+       /* NI to wait for events in */
+       lnet_ni_t                       *ln_eq_waitni;
+
+       /* remote networks with routes to them */
+       struct list_head                        *ln_remote_nets_hash;
+       /* validity stamp */
+       __u64                           ln_remote_nets_version;
+       /* list of all known routers */
+       struct list_head                        ln_routers;
+       /* validity stamp */
+       __u64                           ln_routers_version;
+       /* percpt router buffer pools */
+       lnet_rtrbufpool_t               **ln_rtrpools;
+
+       lnet_handle_md_t                ln_ping_target_md;
+       lnet_handle_eq_t                ln_ping_target_eq;
+       lnet_ping_info_t                *ln_ping_info;
+
+       /* router checker startup/shutdown state */
+       int                             ln_rc_state;
+       /* router checker's event queue */
+       lnet_handle_eq_t                ln_rc_eqh;
+       /* rcd still pending on net */
+       struct list_head                        ln_rcd_deathrow;
+       /* rcd ready for free */
+       struct list_head                        ln_rcd_zombie;
+       /* serialise startup/shutdown */
+       struct semaphore                ln_rc_signal;
+
+       struct mutex                    ln_api_mutex;
+       struct mutex                    ln_lnd_mutex;
+       int                             ln_init;        /* LNetInit() called? */
+       /* Have I called LNetNIInit myself? */
+       int                             ln_niinit_self;
+       /* LNetNIInit/LNetNIFini counter */
+       int                             ln_refcount;
+       /* shutdown in progress */
+       int                             ln_shutdown;
+
+       int                             ln_routing;     /* am I a router? */
+       lnet_pid_t                      ln_pid;         /* requested pid */
+       /* uniquely identifies this ni in this epoch */
+       __u64                           ln_interface_cookie;
+       /* registered LNDs */
+       struct list_head                        ln_lnds;
+
+       /* space for network names */
+       char                            *ln_network_tokens;
+       int                             ln_network_tokens_nob;
+       /* test protocol compatibility flags */
+       int                             ln_testprotocompat;
+
+} lnet_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h
new file mode 100644 (file)
index 0000000..ca78a0a
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h
new file mode 100644 (file)
index 0000000..d2c0a70
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_LNET_H__
+#define __LNET_LINUX_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-lnet.h> instead
+#endif
+
+# include <asm/page.h>
+# include <linux/string.h>
+# include <asm/io.h>
+# include <linux/libcfs/libcfs.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+       /* compiler optimizer will elide unused branches */
+
+       switch (sizeof(typeof(page_to_phys(p)))) {
+       case 4:
+               /* page_to_phys returns a 32 bit physical address.  This must
+                * be a 32 bit machine with <= 4G memory and we must ensure we
+                * don't sign extend when converting to 64 bits. */
+               return (unsigned long)page_to_phys(p);
+
+       case 8:
+               /* page_to_phys returns a 64 bit physical address :) */
+               return page_to_phys(p);
+
+       default:
+               LBUG();
+               return 0;
+       }
+}
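
The (unsigned long) cast in the 4-byte branch matters: on a 32-bit machine unsigned long is 32 bits wide, so the conversion zero-extends instead of sign-extending a high physical address. A hedged user-space demonstration of the hazard being avoided:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int32_t phys32 = (int32_t)0xfffff000;   /* a high 32-bit address */

	/* signed widening would smear the top bits... */
	assert((uint64_t)(int64_t)phys32 == 0xfffffffffffff000ULL);
	/* ...while an unsigned 32-bit intermediate zero-extends */
	assert((uint64_t)(uint32_t)phys32 == 0x00000000fffff000ULL);
	return 0;
}
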
+
+
+#define LNET_ROUTER
+
+#endif /* __LNET_LINUX_LIB_LNET_H__ */
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h
new file mode 100644 (file)
index 0000000..669e8c0
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LIB_TYPES_H__
+#define __LNET_LINUX_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <linux/lnet/lib-types.h> instead
+#endif
+
+# include <linux/uio.h>
+# include <linux/types.h>
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h
new file mode 100644 (file)
index 0000000..1e888f1
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <linux/lnet/lnet.h> instead
+#endif
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+
+#include <linux/uio.h>
+#include <linux/types.h>
+
+#define cfs_tcp_sendpage(sk, page, offset, size, flags) \
+       tcp_sendpage(sk, page, offset, size, flags)
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h
new file mode 100644 (file)
index 0000000..1bde44e
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_SYSCTL_H__
+#define __LNET_SYSCTL_H__
+
+#if defined(CONFIG_SYSCTL)
+
+
+#define CTL_KRANAL      201
+#define CTL_O2IBLND     205
+#define CTL_PTLLND      206
+#define CTL_QSWNAL      207
+#define CTL_SOCKLND     208
+#define CTL_GNILND      210
+
+
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnet.h b/drivers/staging/lustre/include/linux/lnet/lnet.h
new file mode 100644 (file)
index 0000000..c532b15
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_H__
+#define __LNET_H__
+
+/*
+ * lnet.h
+ *
+ * User application interface file
+ */
+#include <linux/lnet/linux/lnet.h>
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/api.h>
+
+#define LNET_NIDSTR_COUNT  1024    /* # of nidstrings */
+#define LNET_NIDSTR_SIZE   32      /* size of each one (see below for usage) */
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/linux/lnet/lnetctl.h
new file mode 100644 (file)
index 0000000..b22daa2
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+#define LNET_DEV_MAJOR 10
+#define LNET_DEV_MINOR 240
+#define OBD_DEV_ID 1
+#define OBD_DEV_NAME "obd"
+#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME
+#define OBD_DEV_MAJOR 10
+#define OBD_DEV_MINOR 241
+#define SMFS_DEV_ID  2
+#define SMFS_DEV_PATH "/dev/snapdev"
+#define SMFS_DEV_MAJOR 10
+#define SMFS_DEV_MINOR 242
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_list_nids(int argc, char **argv);
+int jt_ptl_which_nid(int argc, char **argv);
+int jt_ptl_print_interfaces(int argc, char **argv);
+int jt_ptl_add_interface(int argc, char **argv);
+int jt_ptl_del_interface(int argc, char **argv);
+int jt_ptl_print_peers (int argc, char **argv);
+int jt_ptl_add_peer (int argc, char **argv);
+int jt_ptl_del_peer (int argc, char **argv);
+int jt_ptl_print_connections (int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_print_active_txs(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
+int jt_ptl_testprotocompat(int argc, char **argv);
+int jt_ptl_memhog(int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lnetst.h b/drivers/staging/lustre/include/linux/lnet/lnetst.h
new file mode 100644 (file)
index 0000000..d90f94e
--- /dev/null
@@ -0,0 +1,491 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/lnetst.h
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LNET_ST_H__
+#define __LNET_ST_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+
+#define LST_FEAT_NONE          (0)
+#define LST_FEAT_BULK_LEN      (1 << 0)        /* enable variable page size */
+
+#define LST_FEATS_EMPTY                (LST_FEAT_NONE)
+#define LST_FEATS_MASK         (LST_FEAT_NONE | LST_FEAT_BULK_LEN)
+
+#define LST_NAME_SIZE     32         /* max name buffer length */
+
+#define LSTIO_DEBUG         0xC00         /* debug */
+#define LSTIO_SESSION_NEW       0xC01     /* create session */
+#define LSTIO_SESSION_END       0xC02     /* end session */
+#define LSTIO_SESSION_INFO      0xC03     /* query session */
+#define LSTIO_GROUP_ADD         0xC10     /* add group */
+#define LSTIO_GROUP_LIST       0xC11      /* list all groups in session */
+#define LSTIO_GROUP_INFO       0xC12      /* query default information of specified group */
+#define LSTIO_GROUP_DEL         0xC13     /* delete group */
+#define LSTIO_NODES_ADD         0xC14     /* add nodes to specified group */
+#define LSTIO_GROUP_UPDATE      0xC15     /* update group */
+#define LSTIO_BATCH_ADD         0xC20     /* add batch */
+#define LSTIO_BATCH_START       0xC21     /* start batch */
+#define LSTIO_BATCH_STOP       0xC22      /* stop batch */
+#define LSTIO_BATCH_DEL         0xC23     /* delete batch */
+#define LSTIO_BATCH_LIST       0xC24      /* show all batches in the session */
+#define LSTIO_BATCH_INFO       0xC25      /* show detail of specified batch */
+#define LSTIO_TEST_ADD   0xC26    /* add test (to batch) */
+#define LSTIO_BATCH_QUERY       0xC27     /* query batch status */
+#define LSTIO_STAT_QUERY       0xC30      /* get stats */
+
+typedef struct {
+       lnet_nid_t            ses_nid;          /* nid of console node */
+       __u64              ses_stamp;         /* time stamp */
+} lst_sid_t;                                       /*** session id */
+
+extern lst_sid_t LST_INVALID_SID;
+
+typedef struct {
+       __u64              bat_id;               /* unique id in session */
+} lst_bid_t;                                       /*** batch id (group of tests) */
+
+/* Status of test node */
+#define LST_NODE_ACTIVE         0x1                 /* node in this session */
+#define LST_NODE_BUSY     0x2               /* node is taken by another session */
+#define LST_NODE_DOWN     0x4               /* node is down */
+#define LST_NODE_UNKNOWN       0x8                  /* node not in session */
+
+typedef struct {
+       lnet_process_id_t       nde_id;          /* id of node */
+       int                  nde_state;       /* state of node */
+} lstcon_node_ent_t;                               /*** node entry, for list_group command */
+
+typedef struct {
+       int                  nle_nnode;       /* # of nodes */
+       int                  nle_nactive;           /* # of active nodes */
+       int                  nle_nbusy;       /* # of busy nodes */
+       int                  nle_ndown;       /* # of down nodes */
+       int                  nle_nunknown;         /* # of unknown nodes */
+} lstcon_ndlist_ent_t;                           /*** node_list entry, for list_batch command */
+
+typedef struct {
+       int                  tse_type;         /* test type */
+       int                  tse_loop;         /* loop count */
+       int                  tse_concur;             /* concurrency of test */
+} lstcon_test_ent_t;                               /*** test summary entry, for list_batch command */
+
+typedef struct {
+       int                  bae_state;       /* batch status */
+       int                  bae_timeout;           /* batch timeout */
+       int                  bae_ntest;       /* # of tests in the batch */
+} lstcon_batch_ent_t;                             /*** batch summary entry, for list_batch command */
+
+typedef struct {
+       lstcon_ndlist_ent_t     tbe_cli_nle;        /* client (group) node_list entry */
+       lstcon_ndlist_ent_t     tbe_srv_nle;        /* server (group) node_list entry */
+       union {
+               lstcon_test_ent_t  tbe_test;        /* test entry */
+               lstcon_batch_ent_t tbe_batch;      /* batch entry */
+       } u;
+} lstcon_test_batch_ent_t;                           /*** test/batch verbose information entry,
+                                                        *** for list_batch command */
+
+typedef struct {
+       struct list_head              rpe_link;        /* link chain */
+       lnet_process_id_t       rpe_peer;              /* peer's id */
+       struct timeval    rpe_stamp;          /* time stamp of RPC */
+       int                  rpe_state;       /* peer's state */
+       int                  rpe_rpc_errno;       /* RPC errno */
+
+       lst_sid_t              rpe_sid;         /* peer's session id */
+       int                  rpe_fwk_errno;       /* framework errno */
+       int                  rpe_priv[4];           /* private data */
+       char                rpe_payload[0];      /* private reply payload */
+} lstcon_rpc_ent_t;
+
+typedef struct {
+       int                  trs_rpc_stat[4];   /* RPC stats (0: total, 1: success, 2: failure, 3: reserved) */
+       int                  trs_rpc_errno;       /* RPC errno */
+       int                  trs_fwk_stat[8];   /* framework stat */
+       int                  trs_fwk_errno;       /* errno of the first remote error */
+       void               *trs_fwk_private;    /* private framework stat */
+} lstcon_trans_stat_t;
+
+static inline int
+lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0];
+}
+
+static inline int
+lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1];
+}
+
+static inline int
+lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2];
+}
+
+static inline int
+lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
+
+static inline int
+lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2];
+}
+
+static inline int
+lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0];
+}
+
+static inline int
+lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc)
+{
+       return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1];
+}
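
Each helper above doubles as incrementer and reader through the inc flag: nonzero pre-increments the counter, zero just returns it. A hypothetical caller using that convention:

/* hypothetical usage of the inc-or-read convention above */
static void record_one_rpc(lstcon_trans_stat_t *st, int ok)
{
	lstcon_rpc_stat_total(st, 1);           /* one more RPC issued */
	if (ok)
		lstcon_rpc_stat_success(st, 1);
	else
		lstcon_rpc_stat_failure(st, 1);

	(void)lstcon_rpc_stat_total(st, 0);     /* inc == 0 is a pure read */
}
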
+
+/* create a session */
+typedef struct {
+       int                  lstio_ses_key;       /* IN: local key */
+       int                  lstio_ses_timeout;      /* IN: session timeout */
+       int                  lstio_ses_force;   /* IN: force create ? */
+       /** IN: session features */
+       unsigned                lstio_ses_feats;
+       lst_sid_t             *lstio_ses_idp;     /* OUT: session id */
+       int                  lstio_ses_nmlen;   /* IN: name length */
+       char               *lstio_ses_namep;    /* IN: session name */
+} lstio_session_new_args_t;
+
+/* query current session */
+typedef struct {
+       lst_sid_t             *lstio_ses_idp;     /* OUT: session id */
+       int                 *lstio_ses_keyp;     /* OUT: local key */
+       /** OUT: session features */
+       unsigned               *lstio_ses_featp;
+       lstcon_ndlist_ent_t    *lstio_ses_ndinfo;       /* OUT: */
+       int                  lstio_ses_nmlen;   /* IN: name length */
+       char               *lstio_ses_namep;    /* OUT: session name */
+} lstio_session_info_args_t;
+
+/* delete a session */
+typedef struct {
+       int                  lstio_ses_key;       /* IN: session key */
+} lstio_session_end_args_t;
+
+#define LST_OPC_SESSION         1
+#define LST_OPC_GROUP     2
+#define LST_OPC_NODES     3
+#define LST_OPC_BATCHCLI       4
+#define LST_OPC_BATCHSRV       5
+
+typedef struct {
+       int                  lstio_dbg_key;       /* IN: session key */
+       int                  lstio_dbg_type;     /* IN: debug session|batch|group|nodes list */
+       int                  lstio_dbg_flags;   /* IN: reserved debug flags */
+       int                  lstio_dbg_timeout;      /* IN: timeout of debug */
+
+       int                  lstio_dbg_nmlen;   /* IN: len of name */
+       char               *lstio_dbg_namep;    /* IN: name of group|batch */
+       int                  lstio_dbg_count;   /* IN: # of test nodes to debug */
+       lnet_process_id_t      *lstio_dbg_idsp;  /* IN: id of test nodes */
+       struct list_head             *lstio_dbg_resultp;      /* OUT: list head of result buffer */
+} lstio_debug_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+} lstio_group_add_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+} lstio_group_del_args_t;
+
+#define LST_GROUP_CLEAN         1                     /* remove inactive nodes in the group */
+#define LST_GROUP_REFRESH       2                     /* refresh inactive nodes in the group */
+#define LST_GROUP_RMND   3                    /* delete nodes from the group */
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_opc;       /* IN: OPC */
+       int                  lstio_grp_args;     /* IN: arguments */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+       int                  lstio_grp_count;   /* IN: # of nodes id */
+       lnet_process_id_t      *lstio_grp_idsp;  /* IN: array of nodes */
+       struct list_head             *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_update_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name length */
+       char               *lstio_grp_namep;    /* IN: group name */
+       int                  lstio_grp_count;   /* IN: # of nodes */
+       /** OUT: session features */
+       unsigned               *lstio_grp_featp;
+       lnet_process_id_t      *lstio_grp_idsp;  /* IN: nodes */
+       struct list_head             *lstio_grp_resultp;      /* OUT: list head of result buffer */
+} lstio_group_nodes_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_idx;       /* IN: group idx */
+       int                  lstio_grp_nmlen;   /* IN: name len */
+       char               *lstio_grp_namep;    /* OUT: name */
+} lstio_group_list_args_t;
+
+typedef struct {
+       int                  lstio_grp_key;       /* IN: session key */
+       int                  lstio_grp_nmlen;   /* IN: name len */
+       char               *lstio_grp_namep;    /* IN: name */
+       lstcon_ndlist_ent_t    *lstio_grp_entp;  /* OUT: description of group */
+
+       int                 *lstio_grp_idxp;     /* IN/OUT: node index */
+       int                 *lstio_grp_ndentp;       /* IN/OUT: # of nodent */
+       lstcon_node_ent_t      *lstio_grp_dentsp;       /* OUT: nodent array */
+} lstio_group_info_args_t;
+
+#define LST_DEFAULT_BATCH       "batch"                 /* default batch name */
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_add_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_del_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_timeout;      /* IN: timeout for the batch */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_run_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_force;   /* IN: abort unfinished test RPC */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_stop_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_testidx;      /* IN: test index */
+       int                  lstio_bat_client;       /* IN: is test client? */
+       int                  lstio_bat_timeout;      /* IN: timeout for waiting */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+       struct list_head             *lstio_bat_resultp;      /* OUT: list head of result buffer */
+} lstio_batch_query_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_idx;       /* IN: index */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: batch name */
+} lstio_batch_list_args_t;
+
+typedef struct {
+       int                  lstio_bat_key;       /* IN: session key */
+       int                  lstio_bat_nmlen;   /* IN: name length */
+       char               *lstio_bat_namep;    /* IN: name */
+       int                  lstio_bat_server;       /* IN: query server or not */
+       int                  lstio_bat_testidx;      /* IN: test index */
+       lstcon_test_batch_ent_t *lstio_bat_entp;        /* OUT: batch ent */
+
+       int                 *lstio_bat_idxp;     /* IN/OUT: index of node */
+       int                 *lstio_bat_ndentp;       /* IN/OUT: # of nodent */
+       lstcon_node_ent_t      *lstio_bat_dentsp;       /* array of nodent */
+} lstio_batch_info_args_t;
+
+/* add stat in session */
+typedef struct {
+       int                  lstio_sta_key;       /* IN: session key */
+       int                  lstio_sta_timeout;      /* IN: timeout for stat request */
+       int                  lstio_sta_nmlen;   /* IN: group name length */
+       char               *lstio_sta_namep;    /* IN: group name */
+       int                  lstio_sta_count;   /* IN: # of pid */
+       lnet_process_id_t      *lstio_sta_idsp;  /* IN: pid */
+       struct list_head             *lstio_sta_resultp;      /* OUT: list head of result buffer */
+} lstio_stat_args_t;
+
+typedef enum {
+       LST_TEST_BULK   = 1,
+       LST_TEST_PING   = 2
+} lst_test_type_t;
+
+/* create a test in a batch */
+#define LST_MAX_CONCUR   1024              /* Max concurrency of test */
+
+typedef struct {
+       int                  lstio_tes_key;       /* IN: session key */
+       int                  lstio_tes_bat_nmlen;    /* IN: batch name len */
+       char               *lstio_tes_bat_name;     /* IN: batch name */
+       int                  lstio_tes_type;     /* IN: test type */
+       int                  lstio_tes_oneside;      /* IN: one sided test */
+       int                  lstio_tes_loop;     /* IN: loop count */
+       int                  lstio_tes_concur;       /* IN: concurrency */
+
+       int                  lstio_tes_dist;     /* IN: node distribution in destination groups */
+       int                  lstio_tes_span;     /* IN: node span in destination groups */
+       int                  lstio_tes_sgrp_nmlen;   /* IN: source group name length */
+       char               *lstio_tes_sgrp_name;    /* IN: group name */
+       int                  lstio_tes_dgrp_nmlen;   /* IN: destination group name length */
+       char               *lstio_tes_dgrp_name;    /* IN: group name */
+
+       int                  lstio_tes_param_len;    /* IN: param buffer len */
+       void               *lstio_tes_param;    /* IN: parameter for specified test:
+                                                              lstio_bulk_param_t,
+                                                              lstio_ping_param_t,
+                                                              ... more */
+       int                 *lstio_tes_retp;     /* OUT: private returned value */
+       struct list_head             *lstio_tes_resultp;      /* OUT: list head of result buffer */
+} lstio_test_args_t;
+
+typedef enum {
+       LST_BRW_READ    = 1,
+       LST_BRW_WRITE   = 2
+} lst_brw_type_t;
+
+typedef enum {
+       LST_BRW_CHECK_NONE   = 1,
+       LST_BRW_CHECK_SIMPLE = 2,
+       LST_BRW_CHECK_FULL   = 3
+} lst_brw_flags_t;
+
+typedef struct {
+       int                  blk_opc;           /* bulk operation code */
+       int                  blk_size;         /* size (bytes) */
+       int                  blk_time;         /* time of running the test */
+       int                  blk_flags;       /* reserved flags */
+} lst_test_bulk_param_t;
+
+typedef struct {
+       int                  png_size;         /* size of ping message */
+       int                  png_time;         /* time */
+       int                  png_loop;         /* loop */
+       int                  png_flags;       /* reserved flags */
+} lst_test_ping_param_t;
+
+/* more tests */
+typedef struct {
+       __u32 errors;
+       __u32 rpcs_sent;
+       __u32 rpcs_rcvd;
+       __u32 rpcs_dropped;
+       __u32 rpcs_expired;
+       __u64 bulk_get;
+       __u64 bulk_put;
+} WIRE_ATTR srpc_counters_t;
+
+typedef struct {
+       /** milliseconds since current session started */
+       __u32 running_ms;
+       __u32 active_batches;
+       __u32 zombie_sessions;
+       __u32 brw_errors;
+       __u32 ping_errors;
+} WIRE_ATTR sfw_counters_t;
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/drivers/staging/lustre/include/linux/lnet/ptllnd.h
new file mode 100644 (file)
index 0000000..fc1ce8e
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-lustre UNLINK semantics.
+ * However for now the two targets are Cray Portals
+ * on the XT3 and Lustre Portals (for testing) both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals, Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals header */
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * Cray Portals has no maximum number of IOVs.  The
+ * maximum is limited only by memory and the size of the
+ * int parameters (2^31-1).
+ * Lustre only really requires that the underlying
+ * implementation support at least LNET_MAX_IOV,
+ * so for Cray Portals we can safely just use that
+ * value here.
+ */
+#define PTL_MD_MAX_IOV   LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h
new file mode 100644 (file)
index 0000000..7d12b3a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/ptllnd_wire.h
+ *
+ * Author: PJ Kirner <pjkirner@clusterfs.com>
+ */
+
+/* Minimum buffer size that any peer will post to receive ptllnd messages */
+#define PTLLND_MIN_BUFFER_SIZE  256
+
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL     9      /* the same portal PTLRPC uses when talking to Cray Portals */
+#define PTLLND_PID           9   /* The Portals PID */
+#define PTLLND_PEERCREDITS      8        /* concurrent sends to 1 peer */
+
+/* Default buffer size for kernel ptllnds (guaranteed eager) */
+#define PTLLND_MAX_KLND_MSG_SIZE 512
+
+/* Default buffer size for catamount ptllnds (not guaranteed eager) - large
+ * enough to avoid RDMA for anything sent while control is not in liblustre */
+#define PTLLND_MAX_ULND_MSG_SIZE 512
+
+
+/************************************************************************
+ * Portals LND Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+#define PTL_RESERVED_MATCHBITS  0x100  /* below this value is reserved;
+                                        * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0      /* the value for the message channel */
+
+typedef struct
+{
+       lnet_hdr_t      kptlim_hdr;          /* portals header */
+       char          kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+       lnet_hdr_t      kptlrm_hdr;          /* portals header */
+       __u64        kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_rdma_msg_t;
+
+typedef struct
+{
+       __u64        kptlhm_matchbits;       /* matchbits */
+       __u32        kptlhm_max_msg_size;    /* max message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct
+{
+       /* First 2 fields fixed FOR ALL TIME */
+       __u32      ptlm_magic;     /* I'm a Portals LND message */
+       __u16      ptlm_version;   /* this is my version number */
+       __u8        ptlm_type;      /* the message type */
+       __u8        ptlm_credits;   /* returned credits */
+       __u32      ptlm_nob;       /* # bytes in whole message */
+       __u32      ptlm_cksum;     /* checksum (0 == no checksum) */
+       __u64      ptlm_srcnid;    /* sender's NID */
+       __u64      ptlm_srcstamp;  /* sender's incarnation */
+       __u64      ptlm_dstnid;    /* destination's NID */
+       __u64      ptlm_dststamp;  /* destination's incarnation */
+       __u32      ptlm_srcpid;    /* sender's PID */
+       __u32      ptlm_dstpid;    /* destination's PID */
+
+        union {
+               kptl_immediate_msg_t    immediate;
+               kptl_rdma_msg_t  rdma;
+               kptl_hello_msg_t        hello;
+       } WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
+/* kptl_msg_t::ptlm_credits is only a __u8 */
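+/* casting -1 to the credits field's type yields its all-ones maximum (255) */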
+#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t*) 0)->ptlm_credits)) -1)
+
+#define PTLLND_MSG_MAGIC               LNET_PROTO_PTL_MAGIC
+#define PTLLND_MSG_VERSION           0x04
+
+#define PTLLND_RDMA_OK           0x00
+#define PTLLND_RDMA_FAIL               0x01
+
+#define PTLLND_MSG_TYPE_INVALID         0x00
+#define PTLLND_MSG_TYPE_PUT         0x01
+#define PTLLND_MSG_TYPE_GET         0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP       0x04
+#define PTLLND_MSG_TYPE_HELLO     0x05
+#define PTLLND_MSG_TYPE_NAK         0x06
diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h
new file mode 100644 (file)
index 0000000..bacc749
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/include/lnet/socklnd.h
+ *
+ * #defines shared between socknal implementation and utilities
+ */
+#ifndef __LNET_LNET_SOCKLND_H__
+#define __LNET_LNET_SOCKLND_H__
+
+#include <linux/lnet/types.h>
+#include <linux/lnet/lib-types.h>
+
+#define SOCKLND_CONN_NONE     (-1)
+#define SOCKLND_CONN_ANY       0
+#define SOCKLND_CONN_CONTROL    1
+#define SOCKLND_CONN_BULK_IN    2
+#define SOCKLND_CONN_BULK_OUT   3
+#define SOCKLND_CONN_NTYPES     4
+
+#define SOCKLND_CONN_ACK       SOCKLND_CONN_BULK_IN
+
+typedef struct {
+       __u32              kshm_magic;     /* magic number of socklnd message */
+       __u32              kshm_version;   /* version of socklnd message */
+       lnet_nid_t            kshm_src_nid;   /* sender's nid */
+       lnet_nid_t            kshm_dst_nid;   /* destination nid */
+       lnet_pid_t            kshm_src_pid;   /* sender's pid */
+       lnet_pid_t            kshm_dst_pid;   /* destination pid */
+       __u64              kshm_src_incarnation; /* sender's incarnation */
+       __u64              kshm_dst_incarnation; /* destination's incarnation */
+       __u32              kshm_ctype;     /* connection type */
+       __u32              kshm_nips;      /* # IP addrs */
+       __u32              kshm_ips[0];    /* IP addrs */
+} WIRE_ATTR ksock_hello_msg_t;
+
+typedef struct {
+       lnet_hdr_t            ksnm_hdr;       /* lnet hdr */
+
+       /*
+        * ksnm_payload is removed because of the WinNT compiler's limitation:
+        * a zero-sized array can only be placed at the tail of [nested]
+        * structure definitions. The lnet payload will be stored just after
+        * the body of struct ksock_lnet_msg_t.
+        */
+} WIRE_ATTR ksock_lnet_msg_t;
+
+typedef struct {
+       __u32              ksm_type;       /* type of socklnd message */
+       __u32              ksm_csum;       /* checksum if != 0 */
+       __u64              ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */
+       union {
+               ksock_lnet_msg_t lnetmsg;       /* lnet message, it's empty if it's NOOP */
+       } WIRE_ATTR ksm_u;
+} WIRE_ATTR ksock_msg_t;
+
+static inline void
+socklnd_init_msg(ksock_msg_t *msg, int type)
+{
+       msg->ksm_csum      = 0;
+       msg->ksm_type      = type;
+       msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+#define KSOCK_MSG_NOOP   0xc0      /* ksm_u empty */
+#define KSOCK_MSG_LNET   0xc1      /* lnet msg */
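+
+/* Usage sketch (illustrative caller code, not part of this header): a
+ * NOOP carries no lnet message, so only the common fields need setting:
+ *
+ *     ksock_msg_t msg;
+ *
+ *     socklnd_init_msg(&msg, KSOCK_MSG_NOOP);
+ *
+ * A KSOCK_MSG_LNET message would additionally fill in ksm_u.lnetmsg and,
+ * for zero-copy sends, the ksm_zc_cookies. */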
+
+/* We need to know this number to parse hello msg from ksocklnd in
+ * other LND (usocklnd, for example) */
+#define KSOCK_PROTO_V2   2
+#define KSOCK_PROTO_V3   3
+
+#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h
new file mode 100644 (file)
index 0000000..4f63b7a
--- /dev/null
@@ -0,0 +1,503 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LNET_TYPES_H__
+#define __LNET_TYPES_H__
+
+/** \addtogroup lnet
+ * @{ */
+
+#include <linux/libcfs/libcfs.h>
+
+/** \addtogroup lnet_addr
+ * @{ */
+
+/** Portal reserved for LNet's own use.
+ * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments.
+ */
+#define LNET_RESERVED_PORTAL      0
+
+/**
+ * Address of an end-point in an LNet network.
+ *
+ * A node can have multiple end-points and hence multiple addresses.
+ * An LNet network can be a simple network (e.g. tcp0) or a network of
+ * LNet networks connected by LNet routers. Therefore an end-point address
+ * has two parts: network ID, and address within a network.
+ *
+ * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID.
+ */
+typedef __u64 lnet_nid_t;
+/**
+ * ID of a process in a node. Shortened as PID to distinguish from
+ * lnet_process_id_t, the global process ID.
+ */
+typedef __u32 lnet_pid_t;
+
+/** wildcard NID that matches any end-point address */
+#define LNET_NID_ANY      ((lnet_nid_t) -1)
+/** wildcard PID that matches any lnet_pid_t */
+#define LNET_PID_ANY      ((lnet_pid_t) -1)
+
+#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */
+#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */
+
+#define LNET_TIME_FOREVER    (-1)
+
+/**
+ * Objects maintained by the LNet are accessed through handles. Handle types
+ * have names of the form lnet_handle_xx_t, where xx is one of the two letter
+ * object type codes ('eq' for event queue, 'md' for memory descriptor, and
+ * 'me' for match entry).
+ * Each type of object is given a unique handle type to enhance type checking.
+ * The type lnet_handle_any_t can be used when a generic handle is needed.
+ * Every handle value can be converted into a value of type lnet_handle_any_t
+ * without loss of information.
+ */
+typedef struct {
+       __u64    cookie;
+} lnet_handle_any_t;
+
+typedef lnet_handle_any_t lnet_handle_eq_t;
+typedef lnet_handle_any_t lnet_handle_md_t;
+typedef lnet_handle_any_t lnet_handle_me_t;
+
+#define LNET_WIRE_HANDLE_COOKIE_NONE   (-1)
+
+/**
+ * Invalidate handle \a h.
+ */
+static inline void LNetInvalidateHandle(lnet_handle_any_t *h)
+{
+       h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;
+}
+
+/**
+ * Compare handles \a h1 and \a h2.
+ *
+ * \return 1 if handles are equal, 0 otherwise.
+ */
+static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2)
+{
+       return (h1.cookie == h2.cookie);
+}
+
+/**
+ * Check whether handle \a h is invalid.
+ *
+ * \return 1 if handle is invalid, 0 if valid.
+ */
+static inline int LNetHandleIsInvalid(lnet_handle_any_t h)
+{
+       return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie);
+}
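+
+/* Illustrative pattern (assumed caller code): initialise a handle as
+ * invalid so cleanup paths can test it safely:
+ *
+ *     lnet_handle_eq_t eqh;
+ *
+ *     LNetInvalidateHandle(&eqh);
+ *     ...
+ *     if (!LNetHandleIsInvalid(eqh))
+ *             ... tear down the EQ ...
+ */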
+
+/**
+ * Global process ID.
+ */
+typedef struct {
+       /** node id */
+       lnet_nid_t nid;
+       /** process id */
+       lnet_pid_t pid;
+} lnet_process_id_t;
+/** @} lnet_addr */
+
+/** \addtogroup lnet_me
+ * @{ */
+
+/**
+ * Specifies whether the match entry or memory descriptor should be unlinked
+ * automatically (LNET_UNLINK) or not (LNET_RETAIN).
+ */
+typedef enum {
+       LNET_RETAIN = 0,
+       LNET_UNLINK
+} lnet_unlink_t;
+
+/**
+ * Values of the type lnet_ins_pos_t are used to control where a new match
+ * entry is inserted. The value LNET_INS_BEFORE is used to insert the new
+ * entry before the current entry or before the head of the list. The value
+ * LNET_INS_AFTER is used to insert the new entry after the current entry
+ * or after the last item in the list.
+ */
+typedef enum {
+       /** insert ME before current position or head of the list */
+       LNET_INS_BEFORE,
+       /** insert ME after current position or tail of the list */
+       LNET_INS_AFTER,
+       /** attach ME at tail of local CPU partition ME list */
+       LNET_INS_LOCAL
+} lnet_ins_pos_t;
+
+/** @} lnet_me */
+
+/** \addtogroup lnet_md
+ * @{ */
+
+/**
+ * Defines the visible parts of a memory descriptor. Values of this type
+ * are used to initialize memory descriptors.
+ */
+typedef struct {
+       /**
+        * Specify the memory region associated with the memory descriptor.
+        * If the options field has:
+        * - LNET_MD_KIOV bit set: The start field points to the starting
+        * address of an array of lnet_kiov_t and the length field specifies
+        * the number of entries in the array. The length can't be bigger
+        * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based
+        * fragments that are not necessarily mapped in virtual memory.
+        * - LNET_MD_IOVEC bit set: The start field points to the starting
+        * address of an array of struct iovec and the length field specifies
+        * the number of entries in the array. The length can't be bigger
+        * than LNET_MAX_IOV. The struct iovec is used to describe fragments
+        * that have virtual addresses.
+        * - Otherwise: The memory region is contiguous. The start field
+        * specifies the starting address for the memory region and the
+        * length field specifies its length.
+        *
+        * When the memory region is fragmented, all fragments but the first
+        * one must start on a page boundary, and all but the last must end on
+        * a page boundary.
+        */
+       void        *start;
+       unsigned int     length;
+       /**
+        * Specifies the maximum number of operations that can be performed
+        * on the memory descriptor. An operation is any action that could
+        * possibly generate an event. In the usual case, the threshold value
+        * is decremented for each operation on the MD. When the threshold
+        * drops to zero, the MD becomes inactive and does not respond to
+        * operations. A threshold value of LNET_MD_THRESH_INF indicates that
+        * there is no bound on the number of operations that may be applied
+        * to a MD.
+        */
+       int           threshold;
+       /**
+        * Specifies the largest incoming request that the memory descriptor
+        * should respond to. When the unused portion of a MD (length -
+        * local offset) falls below this value, the MD becomes inactive and
+        * does not respond to further operations. This value is only used
+        * if the LNET_MD_MAX_SIZE option is set.
+        */
+       int           max_size;
+       /**
+        * Specifies the behavior of the memory descriptor. A bitwise OR
+        * of the following values can be used:
+        * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD.
+        * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD.
+        * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory
+        *   region is provided by the incoming request. By default, the
+        *   offset is maintained locally. When maintained locally, the
+        *   offset is incremented by the length of the request so that
+        *   the next operation (PUT or GET) will access the next part of
+        *   the memory region. Note that only one offset variable exists
+        *   per memory descriptor. If both PUT and GET operations are
+        *   performed on a memory descriptor, the offset is updated each time.
+        * - LNET_MD_TRUNCATE: The length provided in the incoming request can
+        *   be reduced to match the memory available in the region (determined
+        *   by subtracting the offset from the length of the memory region).
+        *   By default, if the length in the incoming operation is greater
+        *   than the amount of memory available, the operation is rejected.
+        * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for
+        *   incoming PUT operations, even if requested. By default,
+        *   acknowledgments are sent for PUT operations that request an
+        *   acknowledgment. Acknowledgments are never sent for GET operations.
+        *   The data sent in the REPLY serves as an implicit acknowledgment.
+        * - LNET_MD_KIOV: The start and length fields specify an array of
+        *   lnet_kiov_t.
+        * - LNET_MD_IOVEC: The start and length fields specify an array of
+        *   struct iovec.
+        * - LNET_MD_MAX_SIZE: The max_size field is valid.
+        *
+        * Note:
+        * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
+        *   capability for memory descriptors. They can't be both set.
+        * - When LNET_MD_MAX_SIZE is set, the total length of the memory
+        *   region (i.e. sum of all fragment lengths) must not be less than
+        *   \a max_size.
+        */
+       unsigned int     options;
+       /**
+        * A user-specified value that is associated with the memory
+        * descriptor. The value does not need to be a pointer, but must fit
+        * in the space used by a pointer. This value is recorded in events
+        * associated with operations on this MD.
+        */
+       void        *user_ptr;
+       /**
+        * A handle for the event queue used to log the operations performed on
+        * the memory region. If this argument is a NULL handle (i.e. nullified
+        * by LNetInvalidateHandle()), operations performed on this memory
+        * descriptor are not logged.
+        */
+       lnet_handle_eq_t eq_handle;
+} lnet_md_t;
+
+/* Max Transfer Unit (minimum supported everywhere).
+ * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
+ * these limits are system wide and not interface-local. */
+#define LNET_MTU_BITS  20
+#define LNET_MTU       (1 << LNET_MTU_BITS)
+
+/** limit on the number of fragments in discontiguous MDs */
+#define LNET_MAX_IOV    256
+
+/* Max payload size */
+# define LNET_MAX_PAYLOAD      CONFIG_LNET_MAX_PAYLOAD
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+#  error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
+# else
+#  if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/*  PAGE_SIZE is a constant: check with cpp! */
+#   error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
+#  endif
+# endif
+
+/**
+ * Options for the MD structure. See lnet_md_t::options.
+ */
+#define LNET_MD_OP_PUT        (1 << 0)
+/** See lnet_md_t::options. */
+#define LNET_MD_OP_GET        (1 << 1)
+/** See lnet_md_t::options. */
+#define LNET_MD_MANAGE_REMOTE  (1 << 2)
+/* unused                          (1 << 3) */
+/** See lnet_md_t::options. */
+#define LNET_MD_TRUNCATE            (1 << 4)
+/** See lnet_md_t::options. */
+#define LNET_MD_ACK_DISABLE      (1 << 5)
+/** See lnet_md_t::options. */
+#define LNET_MD_IOVEC          (1 << 6)
+/** See lnet_md_t::options. */
+#define LNET_MD_MAX_SIZE            (1 << 7)
+/** See lnet_md_t::options. */
+#define LNET_MD_KIOV            (1 << 8)
+
+/* For compatibility with Cray Portals */
+#define LNET_MD_PHYS                    0
+
+/** Infinite threshold on MD operations. See lnet_md_t::threshold */
+#define LNET_MD_THRESH_INF       (-1)
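+
+/* Illustrative sketch (assumed caller code, not part of this header):
+ * an MD describing one contiguous buffer that accepts a single PUT and
+ * is then unlinked when its threshold drops to zero:
+ *
+ *     static char buffer[4096];
+ *     lnet_md_t md = {
+ *             .start     = buffer,
+ *             .length    = sizeof(buffer),
+ *             .threshold = 1,
+ *             .options   = LNET_MD_OP_PUT,
+ *     };
+ *
+ * md.eq_handle should then name an EQ created by the caller, or be
+ * nullified with LNetInvalidateHandle() if no events are wanted. */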
+
+/* NB lustre portals uses struct iovec internally! */
+typedef struct iovec lnet_md_iovec_t;
+
+/**
+ * A page-based fragment of a MD.
+ */
+typedef struct {
+       /** Pointer to the page where the fragment resides */
+       struct page      *kiov_page;
+       /** Length in bytes of the fragment */
+       unsigned int     kiov_len;
+       /**
+        * Starting offset of the fragment within the page. Note that the
+        * end of the fragment must not pass the end of the page; i.e.,
+        * kiov_len + kiov_offset <= PAGE_CACHE_SIZE.
+        */
+       unsigned int     kiov_offset;
+} lnet_kiov_t;
+/** @} lnet_md */
+
+/** \addtogroup lnet_eq
+ * @{ */
+
+/**
+ * Six types of events can be logged in an event queue.
+ */
+typedef enum {
+       /** An incoming GET operation has completed on the MD. */
+       LNET_EVENT_GET          = 1,
+       /**
+        * An incoming PUT operation has completed on the MD. The
+        * underlying layers will not alter the memory (on behalf of this
+        * operation) once this event has been logged.
+        */
+       LNET_EVENT_PUT,
+       /**
+        * A REPLY operation has completed. This event is logged after the
+        * data (if any) from the REPLY has been written into the MD.
+        */
+       LNET_EVENT_REPLY,
+       /** An acknowledgment has been received. */
+       LNET_EVENT_ACK,
+       /**
+        * An outgoing send (PUT or GET) operation has completed. This event
+        * is logged after the entire buffer has been sent and it is safe for
+        * the caller to reuse the buffer.
+        *
+        * Note:
+        * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can
+        *   happen even when the message has not yet been put out on the wire.
+        * - It's unsafe to assume that in an outgoing GET operation
+        *   the LNET_EVENT_SEND event would happen before the
+        *   LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and
+        *   LNET_EVENT_ACK events in an outgoing PUT operation.
+        */
+       LNET_EVENT_SEND,
+       /**
+        * A MD has been unlinked. Note that LNetMDUnlink() does not
+        * necessarily trigger an LNET_EVENT_UNLINK event.
+        * \see LNetMDUnlink
+        */
+       LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE       long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b)       (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
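+/* NB the subtract-then-sign-test form keeps LNET_SEQ_GT correct across
+ * sequence-number wraparound */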
+
+/* XXX
+ * Cygwin needs the pragma line; it is not clear whether it's needed
+ * elsewhere - checking!
+ */
+#ifdef __CYGWIN__
+#pragma pack(push, 4)
+#endif
+
+/**
+ * Information about an event on a MD.
+ */
+typedef struct {
+       /** The identifier (nid, pid) of the target. */
+       lnet_process_id_t   target;
+       /** The identifier (nid, pid) of the initiator. */
+       lnet_process_id_t   initiator;
+       /**
+        * The NID of the immediate sender. If the request has been forwarded
+        * by routers, this is the NID of the last hop; otherwise it's the
+        * same as the initiator.
+        */
+       lnet_nid_t        sender;
+       /** Indicates the type of the event. */
+       lnet_event_kind_t   type;
+       /** The portal table index specified in the request */
+       unsigned int    pt_index;
+       /** A copy of the match bits specified in the request. */
+       __u64          match_bits;
+       /** The length (in bytes) specified in the request. */
+       unsigned int    rlength;
+       /**
+        * The length (in bytes) of the data that was manipulated by the
+        * operation. For truncated operations, the manipulated length will be
+        * the number of bytes specified by the MD (possibly with an offset,
+        * see lnet_md_t). For all other operations, the manipulated length
+        * will be the length of the requested operation, i.e. rlength.
+        */
+       unsigned int    mlength;
+       /**
+        * The handle to the MD associated with the event. The handle may be
+        * invalid if the MD has been unlinked.
+        */
+       lnet_handle_md_t    md_handle;
+       /**
+        * A snapshot of the state of the MD immediately after the event has
+        * been processed. In particular, the threshold field in md will
+        * reflect the value of the threshold after the operation occurred.
+        */
+       lnet_md_t          md;
+       /**
+        * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
+        * \see LNetPut
+        */
+       __u64          hdr_data;
+       /**
+        * Indicates the completion status of the operation. It's 0 for
+        * successful operations, otherwise it's an error code.
+        */
+       int              status;
+       /**
+        * Indicates whether the MD has been unlinked. Note that:
+        * - An event with unlinked set is the last event on the MD.
+        * - This field is also set for an explicit LNET_EVENT_UNLINK event.
+        * \see LNetMDUnlink
+        */
+       int              unlinked;
+       /**
+        * The displacement (in bytes) into the memory region that the
+        * operation used. The offset can be determined by the operation for
+        * a remote managed MD or by the local MD.
+        * \see lnet_md_t::options
+        */
+       unsigned int    offset;
+       /**
+        * The sequence number for this event. Sequence numbers are unique
+        * to each event.
+        */
+       volatile lnet_seq_t sequence;
+} lnet_event_t;
+#ifdef __CYGWIN__
+#pragma pack(pop)
+#endif
+
+/**
+ * Event queue handler function type.
+ *
+ * The EQ handler runs for each event that is deposited into the EQ. The
+ * handler is supplied with a pointer to the event that triggered the
+ * handler invocation.
+ *
+ * The handler must not block, must be reentrant, and must not call any LNet
+ * API functions. It should return as quickly as possible.
+ */
+typedef void (*lnet_eq_handler_t)(lnet_event_t *event);
+#define LNET_EQ_HANDLER_NONE NULL
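+
+/* Minimal handler sketch (hypothetical caller code); it only inspects
+ * the event and returns promptly, as the rules above require:
+ *
+ *     static atomic_t puts_seen;
+ *
+ *     static void my_eq_handler(lnet_event_t *ev)
+ *     {
+ *             if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+ *                     atomic_inc(&puts_seen);
+ *     }
+ */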
+/** @} lnet_eq */
+
+/** \addtogroup lnet_data
+ * @{ */
+
+/**
+ * Specify whether an acknowledgment should be sent by the target when the PUT
+ * operation completes (i.e., when the data has been written to a MD of the
+ * target process).
+ *
+ * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which
+ * acknowledgments can be disabled for a MD.
+ */
+typedef enum {
+       /** Request an acknowledgment */
+       LNET_ACK_REQ,
+       /** Request that no acknowledgment should be generated. */
+       LNET_NOACK_REQ
+} lnet_ack_req_t;
+/** @} lnet_data */
+
+/** @} lnet */
+#endif
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644 (file)
index 0000000..00850ee
--- /dev/null
@@ -0,0 +1,40 @@
+config LNET
+       tristate "Lustre networking subsystem"
+       depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+       int "Lustre lnet max transfer payload (default 1MB)"
+       depends on LUSTRE_FS
+       default "1048576"
+       help
+         This option defines the maximum size of payload in bytes that lnet
+         can put into its transport.
+
+         If unsure, use default.
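+
+         Note that include/linux/lnet/types.h enforces, at compile time,
+         LNET_MTU <= LNET_MAX_PAYLOAD <= PAGE_SIZE * LNET_MAX_IOV.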
+
+config LNET_SELFTEST
+       tristate "Lustre networking self testing"
+       depends on LNET
+       help
+         Choose Y here if you want to do lnet self testing. To compile
+         this as a kernel module, choose M here: the module will be
+         called lnet_selftest.
+
+         If unsure, say N.
+
+         See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+       tristate "LNET infiniband support"
+       depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+       default LNET && INFINIBAND
+       help
+         This option allows LNET users to use InfiniBand as an
+         RDMA-enabled transport.
+
+         To compile this as a kernel module, choose M here and it will be
+         called ko2iblnd.
+
+         If unsure, say N.
diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644 (file)
index 0000000..374212b
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644 (file)
index 0000000..c23e4f6
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/  socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644 (file)
index 0000000..71b7d84
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644 (file)
index 0000000..29a9794
--- /dev/null
@@ -0,0 +1,3259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+#include <asm/div64.h>
+
+lnd_t the_o2iblnd = {
+       .lnd_type       = O2IBLND,
+       .lnd_startup    = kiblnd_startup,
+       .lnd_shutdown   = kiblnd_shutdown,
+       .lnd_ctl        = kiblnd_ctl,
+       .lnd_query      = kiblnd_query,
+       .lnd_send       = kiblnd_send,
+       .lnd_recv       = kiblnd_recv,
+};
+
+kib_data_t           kiblnd_data;
+
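+/* simple rotate-and-add checksum over the whole message buffer */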
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+       char  *c  = ptr;
+       __u32  sum = 0;
+
+       while (nob-- > 0)
+               sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+       /* ensure I don't return 0 (== no checksum) */
+       return (sum == 0) ? 1 : sum;
+}
+
+static char *
+kiblnd_msgtype2str(int type)
+{
+       switch (type) {
+       case IBLND_MSG_CONNREQ:
+               return "CONNREQ";
+
+       case IBLND_MSG_CONNACK:
+               return "CONNACK";
+
+       case IBLND_MSG_NOOP:
+               return "NOOP";
+
+       case IBLND_MSG_IMMEDIATE:
+               return "IMMEDIATE";
+
+       case IBLND_MSG_PUT_REQ:
+               return "PUT_REQ";
+
+       case IBLND_MSG_PUT_NAK:
+               return "PUT_NAK";
+
+       case IBLND_MSG_PUT_ACK:
+               return "PUT_ACK";
+
+       case IBLND_MSG_PUT_DONE:
+               return "PUT_DONE";
+
+       case IBLND_MSG_GET_REQ:
+               return "GET_REQ";
+
+       case IBLND_MSG_GET_DONE:
+               return "GET_DONE";
+
+       default:
+               return "???";
+       }
+}
+
+static int
+kiblnd_msgtype2size(int type)
+{
+       const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+       switch (type) {
+       case IBLND_MSG_CONNREQ:
+       case IBLND_MSG_CONNACK:
+               return hdr_size + sizeof(kib_connparams_t);
+
+       case IBLND_MSG_NOOP:
+               return hdr_size;
+
+       case IBLND_MSG_IMMEDIATE:
+               return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+       case IBLND_MSG_PUT_REQ:
+               return hdr_size + sizeof(kib_putreq_msg_t);
+
+       case IBLND_MSG_PUT_ACK:
+               return hdr_size + sizeof(kib_putack_msg_t);
+
+       case IBLND_MSG_GET_REQ:
+               return hdr_size + sizeof(kib_get_msg_t);
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               return hdr_size + sizeof(kib_completion_msg_t);
+       default:
+               return -1;
+       }
+}
+
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+       kib_rdma_desc_t   *rd;
+       int             nob;
+       int             n;
+       int             i;
+
+       LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+                msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+       rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
+                             &msg->ibm_u.get.ibgm_rd :
+                             &msg->ibm_u.putack.ibpam_rd;
+
+       if (flip) {
+               __swab32s(&rd->rd_key);
+               __swab32s(&rd->rd_nfrags);
+       }
+
+       n = rd->rd_nfrags;
+
+       if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+               CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+                      n, IBLND_MAX_RDMA_FRAGS);
+               return 1;
+       }
+
+       nob = offsetof (kib_msg_t, ibm_u) +
+             kiblnd_rd_msg_size(rd, msg->ibm_type, n);
+
+       if (msg->ibm_nob < nob) {
+               CERROR("Short %s: %d(%d)\n",
+                      kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
+               return 1;
+       }
+
+       if (!flip)
+               return 0;
+
+       for (i = 0; i < n; i++) {
+               __swab32s(&rd->rd_frags[i].rf_nob);
+               __swab64s(&rd->rd_frags[i].rf_addr);
+       }
+
+       return 0;
+}
+
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+                int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+       kib_net_t *net = ni->ni_data;
+
+       /* CAVEAT EMPTOR! all message fields not set here should have been
+        * initialised previously. */
+       msg->ibm_magic    = IBLND_MSG_MAGIC;
+       msg->ibm_version  = version;
+       /*   ibm_type */
+       msg->ibm_credits  = credits;
+       /*   ibm_nob */
+       msg->ibm_cksum    = 0;
+       msg->ibm_srcnid   = ni->ni_nid;
+       msg->ibm_srcstamp = net->ibn_incarnation;
+       msg->ibm_dstnid   = dstnid;
+       msg->ibm_dststamp = dststamp;
+
+       if (*kiblnd_tunables.kib_cksum) {
+               /* NB ibm_cksum zero while computing cksum */
+               msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+       }
+}
+
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+       const int hdr_size = offsetof(kib_msg_t, ibm_u);
+       __u32     msg_cksum;
+       __u16     version;
+       int       msg_nob;
+       int       flip;
+
+       /* 6 bytes are enough to have received magic + version */
+       if (nob < 6) {
+               CERROR("Short message: %d\n", nob);
+               return -EPROTO;
+       }
+
+       if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+               flip = 0;
+       } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+               flip = 1;
+       } else {
+               CERROR("Bad magic: %08x\n", msg->ibm_magic);
+               return -EPROTO;
+       }
+
+       version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+       if (version != IBLND_MSG_VERSION &&
+           version != IBLND_MSG_VERSION_1) {
+               CERROR("Bad version: %x\n", version);
+               return -EPROTO;
+       }
+
+       if (nob < hdr_size) {
+               CERROR("Short message: %d\n", nob);
+               return -EPROTO;
+       }
+
+       msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+       if (msg_nob > nob) {
+               CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+               return -EPROTO;
+       }
+
+       /* checksum must be computed with ibm_cksum zero and BEFORE anything
+        * gets flipped */
+       msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+       msg->ibm_cksum = 0;
+       if (msg_cksum != 0 &&
+           msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+               CERROR("Bad checksum\n");
+               return -EPROTO;
+       }
+
+       msg->ibm_cksum = msg_cksum;
+
+       if (flip) {
+               /* leave magic unflipped as a clue to peer endianness */
+               msg->ibm_version = version;
+               CLASSERT (sizeof(msg->ibm_type) == 1);
+               CLASSERT (sizeof(msg->ibm_credits) == 1);
+               msg->ibm_nob     = msg_nob;
+               __swab64s(&msg->ibm_srcnid);
+               __swab64s(&msg->ibm_srcstamp);
+               __swab64s(&msg->ibm_dstnid);
+               __swab64s(&msg->ibm_dststamp);
+       }
+
+       if (msg->ibm_srcnid == LNET_NID_ANY) {
+               CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+               return -EPROTO;
+       }
+
+       if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+               CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+                      msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+               return -EPROTO;
+       }
+
+       switch (msg->ibm_type) {
+       default:
+               CERROR("Unknown message type %x\n", msg->ibm_type);
+               return -EPROTO;
+
+       case IBLND_MSG_NOOP:
+       case IBLND_MSG_IMMEDIATE:
+       case IBLND_MSG_PUT_REQ:
+               break;
+
+       case IBLND_MSG_PUT_ACK:
+       case IBLND_MSG_GET_REQ:
+               if (kiblnd_unpack_rd(msg, flip))
+                       return -EPROTO;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               if (flip)
+                       __swab32s(&msg->ibm_u.completion.ibcm_status);
+               break;
+
+       case IBLND_MSG_CONNREQ:
+       case IBLND_MSG_CONNACK:
+               if (flip) {
+                       __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                       __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+                       __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+               }
+               break;
+       }
+       return 0;
+}
+
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+       kib_peer_t      *peer;
+       kib_net_t       *net = ni->ni_data;
+       int             cpt = lnet_cpt_of_nid(nid);
+       unsigned long   flags;
+
+       LASSERT(net != NULL);
+       LASSERT(nid != LNET_NID_ANY);
+
+       LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+       if (peer == NULL) {
+               CERROR("Cannot allocate peer\n");
+               return -ENOMEM;
+       }
+
+       memset(peer, 0, sizeof(*peer));  /* zero flags etc */
+
+       peer->ibp_ni = ni;
+       peer->ibp_nid = nid;
+       peer->ibp_error = 0;
+       peer->ibp_last_alive = 0;
+       atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */
+
+       INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
+       INIT_LIST_HEAD(&peer->ibp_conns);
+       INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       /* always called with a ref on ni, which prevents ni being shutdown */
+       LASSERT (net->ibn_shutdown == 0);
+
+       /* npeers only grows with the global lock held */
+       atomic_inc(&net->ibn_npeers);
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+       kib_net_t *net = peer->ibp_ni->ni_data;
+
+       LASSERT (net != NULL);
+       LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+       LASSERT (!kiblnd_peer_active(peer));
+       LASSERT (peer->ibp_connecting == 0);
+       LASSERT (peer->ibp_accepting == 0);
+       LASSERT (list_empty(&peer->ibp_conns));
+       LASSERT (list_empty(&peer->ibp_tx_queue));
+
+       LIBCFS_FREE(peer, sizeof(*peer));
+
+       /* NB a peer's connections keep a reference on their peer until
+        * they are destroyed, so we can be assured that _all_ state to do
+        * with this peer has been cleaned up when its refcount drops to
+        * zero. */
+       atomic_dec(&net->ibn_npeers);
+}
+
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+       /* the caller is responsible for accounting the additional reference
+        * that this creates */
+       struct list_head       *peer_list = kiblnd_nid2peerlist(nid);
+       struct list_head       *tmp;
+       kib_peer_t       *peer;
+
+       list_for_each (tmp, peer_list) {
+
+               peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+               LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+                        peer->ibp_accepting > 0 ||
+                        !list_empty(&peer->ibp_conns));  /* active conn */
+
+               if (peer->ibp_nid != nid)
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+                      peer, libcfs_nid2str(nid),
+                      atomic_read(&peer->ibp_refcount),
+                      peer->ibp_version);
+               return peer;
+       }
+       return NULL;
+}
+
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+       LASSERT (list_empty(&peer->ibp_conns));
+
+       LASSERT (kiblnd_peer_active(peer));
+       list_del_init(&peer->ibp_list);
+       /* lose peerlist's ref */
+       kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+                     lnet_nid_t *nidp, int *count)
+{
+       kib_peer_t          *peer;
+       struct list_head            *ptmp;
+       int                 i;
+       unsigned long     flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+
+               list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (index-- > 0)
+                               continue;
+
+                       *nidp = peer->ibp_nid;
+                       *count = atomic_read(&peer->ibp_refcount);
+
+                       read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                              flags);
+                       return 0;
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+       return -ENOENT;
+}
+
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+       struct list_head           *ctmp;
+       struct list_head           *cnxt;
+       kib_conn_t         *conn;
+
+       if (list_empty(&peer->ibp_conns)) {
+               kiblnd_unlink_peer_locked(peer);
+       } else {
+               list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                       conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                       kiblnd_close_conn_locked(conn, 0);
+               }
+               /* NB closing peer's last conn unlinked it. */
+       }
+       /* NB peer now unlinked; might even be freed if the peer table had the
+        * last ref on it. */
+}
+
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+       LIST_HEAD        (zombies);
+       struct list_head            *ptmp;
+       struct list_head            *pnxt;
+       kib_peer_t          *peer;
+       int                 lo;
+       int                 hi;
+       int                 i;
+       unsigned long     flags;
+       int                 rc = -ENOENT;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (nid != LNET_NID_ANY) {
+               lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+       } else {
+               lo = 0;
+               hi = kiblnd_data.kib_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+                               continue;
+
+                       if (!list_empty(&peer->ibp_tx_queue)) {
+                               LASSERT (list_empty(&peer->ibp_conns));
+
+                               list_splice_init(&peer->ibp_tx_queue,
+                                                    &zombies);
+                       }
+
+                       kiblnd_del_peer_locked(peer);
+                       rc = 0;  /* matched something */
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_txlist_done(ni, &zombies, -EIO);
+
+       return rc;
+}
+
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+       kib_peer_t          *peer;
+       struct list_head            *ptmp;
+       kib_conn_t          *conn;
+       struct list_head            *ctmp;
+       int                 i;
+       unsigned long     flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+               list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       list_for_each (ctmp, &peer->ibp_conns) {
+                               if (index-- > 0)
+                                       continue;
+
+                               conn = list_entry(ctmp, kib_conn_t,
+                                                     ibc_list);
+                               kiblnd_conn_addref(conn);
+                               read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                                      flags);
+                               return conn;
+                       }
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+       return NULL;
+}
+
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+       CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+              rx, rx->rx_status, rx->rx_msg->ibm_type,
+              rx->rx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+       CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+              "cookie "LPX64" msg %s%s type %x cred %d\n",
+              tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+              tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+              tx->tx_lntmsg[0] == NULL ? "-" : "!",
+              tx->tx_lntmsg[1] == NULL ? "-" : "!",
+              tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+       struct list_head        *tmp;
+       int             i;
+
+       spin_lock(&conn->ibc_lock);
+
+       CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+              atomic_read(&conn->ibc_refcount), conn,
+              conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+       CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+              conn->ibc_state, conn->ibc_noops_posted,
+              conn->ibc_nsends_posted, conn->ibc_credits,
+              conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+       CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+       CDEBUG(D_CONSOLE, "   early_rxs:\n");
+       list_for_each(tmp, &conn->ibc_early_rxs)
+               kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_noops:\n");
+       list_for_each(tmp, &conn->ibc_tx_noops)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   tx_queue:\n");
+       list_for_each(tmp, &conn->ibc_tx_queue)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   active_txs:\n");
+       list_for_each(tmp, &conn->ibc_active_txs)
+               kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+       CDEBUG(D_CONSOLE, "   rxs:\n");
+       for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
+               kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+       spin_unlock(&conn->ibc_lock);
+}
+
+int
+kiblnd_translate_mtu(int value)
+{
+       switch (value) {
+       default:
+               return -1;
+       case 0:
+               return 0;
+       case 256:
+               return IB_MTU_256;
+       case 512:
+               return IB_MTU_512;
+       case 1024:
+               return IB_MTU_1024;
+       case 2048:
+               return IB_MTU_2048;
+       case 4096:
+               return IB_MTU_4096;
+       }
+}
+
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+       int        mtu;
+
+       /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+       if (cmid->route.path_rec == NULL)
+               return;
+
+       mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+       LASSERT (mtu >= 0);
+       if (mtu != 0)
+               cmid->route.path_rec->mtu = mtu;
+}
+
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+       cpumask_t       *mask;
+       int             vectors;
+       int             off;
+       int             i;
+       lnet_nid_t      nid = conn->ibc_peer->ibp_nid;
+
+       vectors = conn->ibc_cmid->device->num_comp_vectors;
+       if (vectors <= 1)
+               return 0;
+
+       mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+       /* hash NID to CPU id in this partition... */
+       off = do_div(nid, cpus_weight(*mask));
+       for_each_cpu_mask(i, *mask) {
+               if (off-- == 0)
+                       return i % vectors;
+       }
+
+       LBUG();
+       return 1;
+}
+
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+                  int state, int version)
+{
+       /* CAVEAT EMPTOR:
+        * If the new conn is created successfully it takes over the caller's
+        * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+        * is destroyed.  On failure, the caller's ref on 'peer' remains and
+        * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+        * to destroy 'cmid' here since I'm called from the CM which still has
+        * its ref on 'cmid'). */
+       rwlock_t                *glock = &kiblnd_data.kib_global_lock;
+       kib_net_t             *net = peer->ibp_ni->ni_data;
+       kib_dev_t             *dev;
+       struct ib_qp_init_attr *init_qp_attr;
+       struct kib_sched_info   *sched;
+       kib_conn_t              *conn;
+       struct ib_cq            *cq;
+       unsigned long           flags;
+       int                     cpt;
+       int                     rc;
+       int                     i;
+
+       LASSERT(net != NULL);
+       LASSERT(!in_interrupt());
+
+       dev = net->ibn_dev;
+
+       cpt = lnet_cpt_of_nid(peer->ibp_nid);
+       sched = kiblnd_data.kib_scheds[cpt];
+
+       LASSERT(sched->ibs_nthreads > 0);
+
+       LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+                        sizeof(*init_qp_attr));
+       if (init_qp_attr == NULL) {
+               CERROR("Can't allocate qp_attr for %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               goto failed_0;
+       }
+
+       LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+       if (conn == NULL) {
+               CERROR("Can't allocate connection for %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               goto failed_1;
+       }
+
+       conn->ibc_state = IBLND_CONN_INIT;
+       conn->ibc_version = version;
+       conn->ibc_peer = peer;            /* I take the caller's ref */
+       cmid->context = conn;              /* for future CM callbacks */
+       conn->ibc_cmid = cmid;
+
+       INIT_LIST_HEAD(&conn->ibc_early_rxs);
+       INIT_LIST_HEAD(&conn->ibc_tx_noops);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+       INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+       INIT_LIST_HEAD(&conn->ibc_active_txs);
+       spin_lock_init(&conn->ibc_lock);
+
+       LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+                        sizeof(*conn->ibc_connvars));
+       if (conn->ibc_connvars == NULL) {
+               CERROR("Can't allocate in-progress connection state\n");
+               goto failed_2;
+       }
+
+       write_lock_irqsave(glock, flags);
+       if (dev->ibd_failover) {
+               write_unlock_irqrestore(glock, flags);
+               CERROR("%s: failover in progress\n", dev->ibd_ifname);
+               goto failed_2;
+       }
+
+       if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+               /* wakeup failover thread and teardown connection */
+               if (kiblnd_dev_can_failover(dev)) {
+                       list_add_tail(&dev->ibd_fail_list,
+                                     &kiblnd_data.kib_failed_devs);
+                       wake_up(&kiblnd_data.kib_failover_waitq);
+               }
+
+               write_unlock_irqrestore(glock, flags);
+               CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+                      cmid->device->name, dev->ibd_ifname);
+               goto failed_2;
+       }
+
+       kiblnd_hdev_addref_locked(dev->ibd_hdev);
+       conn->ibc_hdev = dev->ibd_hdev;
+
+       kiblnd_setup_mtu_locked(cmid);
+
+       write_unlock_irqrestore(glock, flags);
+
+       LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+                        IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+       if (conn->ibc_rxs == NULL) {
+               CERROR("Cannot allocate RX buffers\n");
+               goto failed_2;
+       }
+
+       rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+                               IBLND_RX_MSG_PAGES(version));
+       if (rc != 0)
+               goto failed_2;
+
+       kiblnd_map_rx_descs(conn);
+
+       cq = ib_create_cq(cmid->device,
+                         kiblnd_cq_completion, kiblnd_cq_event, conn,
+                         IBLND_CQ_ENTRIES(version),
+                         kiblnd_get_completion_vector(conn, cpt));
+       if (IS_ERR(cq)) {
+               CERROR("Can't create CQ: %ld, cqe: %d\n",
+                      PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+               goto failed_2;
+       }
+
+       conn->ibc_cq = cq;
+
+       rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       if (rc != 0) {
+               CERROR("Can't request completion notification: %d\n", rc);
+               goto failed_2;
+       }
+
+       init_qp_attr->event_handler = kiblnd_qp_event;
+       init_qp_attr->qp_context = conn;
+       init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+       init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+       init_qp_attr->cap.max_send_sge = 1;
+       init_qp_attr->cap.max_recv_sge = 1;
+       init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       init_qp_attr->qp_type = IB_QPT_RC;
+       init_qp_attr->send_cq = cq;
+       init_qp_attr->recv_cq = cq;
+
+       conn->ibc_sched = sched;
+
+       rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+       if (rc != 0) {
+               CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+                      rc, init_qp_attr->cap.max_send_wr,
+                      init_qp_attr->cap.max_recv_wr);
+               goto failed_2;
+       }
+
+       LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+       /* 1 ref for caller and each rxmsg */
+       atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+       conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+       /* post receives */
+       for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+               rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+                                   IBLND_POSTRX_NO_CREDIT);
+               if (rc != 0) {
+                       CERROR("Can't post rxmsg: %d\n", rc);
+
+                       /* Make posted receives complete */
+                       kiblnd_abort_receives(conn);
+
+                       /* correct # of posted buffers
+                        * NB locking needed now I'm racing with completion */
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+                       conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+                       /* cmid will be destroyed by CM(ofed) after cm_callback
+                        * returns, so we can't refer to it any more
+                        * (i.e. from kiblnd_connd()->kiblnd_destroy_conn) */
+                       rdma_destroy_qp(conn->ibc_cmid);
+                       conn->ibc_cmid = NULL;
+
+                       /* Drop my own and unused rxbuffer refcounts */
+                       while (i++ <= IBLND_RX_MSGS(version))
+                               kiblnd_conn_decref(conn);
+
+                       return NULL;
+               }
+       }
+
+       /* Init successful! */
+       LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+                state == IBLND_CONN_PASSIVE_WAIT);
+       conn->ibc_state = state;
+
+       /* 1 more conn */
+       atomic_inc(&net->ibn_nconns);
+       return conn;
+
+ failed_2:
+       kiblnd_destroy_conn(conn);
+ failed_1:
+       LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+       return NULL;
+}
+
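+/* Final teardown of a connection, called when the last reference is
+ * dropped.  The conn must be idle (nothing queued, posted or in flight)
+ * and either fully disconnected, or still in IBLND_CONN_INIT because
+ * kiblnd_create_conn() failed part-way through. */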
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+       struct rdma_cm_id *cmid = conn->ibc_cmid;
+       kib_peer_t      *peer = conn->ibc_peer;
+       int             rc;
+
+       LASSERT (!in_interrupt());
+       LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+       LASSERT (list_empty(&conn->ibc_early_rxs));
+       LASSERT (list_empty(&conn->ibc_tx_noops));
+       LASSERT (list_empty(&conn->ibc_tx_queue));
+       LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+       LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+       LASSERT (list_empty(&conn->ibc_active_txs));
+       LASSERT (conn->ibc_noops_posted == 0);
+       LASSERT (conn->ibc_nsends_posted == 0);
+
+       switch (conn->ibc_state) {
+       default:
+               /* conn must be completely disengaged from the network */
+               LBUG();
+
+       case IBLND_CONN_DISCONNECTED:
+               /* connvars should have been freed already */
+               LASSERT (conn->ibc_connvars == NULL);
+               break;
+
+       case IBLND_CONN_INIT:
+               break;
+       }
+
+       /* conn->ibc_cmid might be destroyed by CM already */
+       if (cmid != NULL && cmid->qp != NULL)
+               rdma_destroy_qp(cmid);
+
+       if (conn->ibc_cq != NULL) {
+               rc = ib_destroy_cq(conn->ibc_cq);
+               if (rc != 0)
+                       CWARN("Error destroying CQ: %d\n", rc);
+       }
+
+       if (conn->ibc_rx_pages != NULL)
+               kiblnd_unmap_rx_descs(conn);
+
+       if (conn->ibc_rxs != NULL) {
+               LIBCFS_FREE(conn->ibc_rxs,
+                           IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+       }
+
+       if (conn->ibc_connvars != NULL)
+               LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+       if (conn->ibc_hdev != NULL)
+               kiblnd_hdev_decref(conn->ibc_hdev);
+
+       /* See CAVEAT EMPTOR above in kiblnd_create_conn */
+       if (conn->ibc_state != IBLND_CONN_INIT) {
+               kib_net_t *net = peer->ibp_ni->ni_data;
+
+               kiblnd_peer_decref(peer);
+               rdma_destroy_id(cmid);
+               atomic_dec(&net->ibn_nconns);
+       }
+
+       LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+       kib_conn_t       *conn;
+       struct list_head *ctmp;
+       struct list_head *cnxt;
+       int               count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+               conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+               CDEBUG(D_NET, "Closing conn -> %s, "
+                             "version: %x, reason: %d\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_version, why);
+
+               kiblnd_close_conn_locked(conn, why);
+               count++;
+       }
+
+       return count;
+}
+
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+                                int version, __u64 incarnation)
+{
+       kib_conn_t       *conn;
+       struct list_head *ctmp;
+       struct list_head *cnxt;
+       int               count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+               conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+               if (conn->ibc_version     == version &&
+                   conn->ibc_incarnation == incarnation)
+                       continue;
+
+               CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+                             "incarnation:"LPX64"(%x, "LPX64")\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_version, conn->ibc_incarnation,
+                      version, incarnation);
+
+               kiblnd_close_conn_locked(conn, -ESTALE);
+               count++;
+       }
+
+       return count;
+}
+
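+/* Close every connection on @ni matching @nid; LNET_NID_ANY matches all
+ * peers.  Returns 0 on success, or -ENOENT if a specific @nid had no
+ * connections to close; wildcards always succeed. */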
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+       kib_peer_t       *peer;
+       struct list_head *ptmp;
+       struct list_head *pnxt;
+       int               lo;
+       int               hi;
+       int               i;
+       unsigned long     flags;
+       int               count = 0;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (nid != LNET_NID_ANY)
+               lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+       else {
+               lo = 0;
+               hi = kiblnd_data.kib_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+
+                       peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                       LASSERT (peer->ibp_connecting > 0 ||
+                                peer->ibp_accepting > 0 ||
+                                !list_empty(&peer->ibp_conns));
+
+                       if (peer->ibp_ni != ni)
+                               continue;
+
+                       if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+                               continue;
+
+                       count += kiblnd_close_peer_conns_locked(peer, 0);
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* wildcards always succeed */
+       if (nid == LNET_NID_ANY)
+               return 0;
+
+       return (count == 0) ? -ENOENT : 0;
+}
+
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       int                       rc = -EINVAL;
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_PEER: {
+               lnet_nid_t   nid = 0;
+               int       count = 0;
+
+               rc = kiblnd_get_peer_info(ni, data->ioc_count,
+                                         &nid, &count);
+               data->ioc_nid    = nid;
+               data->ioc_count  = count;
+               break;
+       }
+
+       case IOC_LIBCFS_DEL_PEER: {
+               rc = kiblnd_del_peer(ni, data->ioc_nid);
+               break;
+       }
+       case IOC_LIBCFS_GET_CONN: {
+               kib_conn_t *conn;
+
+               rc = 0;
+               conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+               if (conn == NULL) {
+                       rc = -ENOENT;
+                       break;
+               }
+
+               LASSERT (conn->ibc_cmid != NULL);
+               data->ioc_nid = conn->ibc_peer->ibp_nid;
+               if (conn->ibc_cmid->route.path_rec == NULL)
+                       data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+               else
+                       data->ioc_u32[0] =
+                       ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+               kiblnd_conn_decref(conn);
+               break;
+       }
+       case IOC_LIBCFS_CLOSE_CONNECTION: {
+               rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+               break;
+       }
+
+       default:
+               break;
+       }
+
+       return rc;
+}
+
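+/* LNet query handler: report in *when the time @nid was last known
+ * alive.  If the peer isn't in the peer table yet, launching a NULL tx
+ * triggers peer creation and connection establishment. */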
+void
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       cfs_time_t      last_alive = 0;
+       cfs_time_t      now = cfs_time_current();
+       rwlock_t        *glock = &kiblnd_data.kib_global_lock;
+       kib_peer_t      *peer;
+       unsigned long   flags;
+
+       read_lock_irqsave(glock, flags);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+                        peer->ibp_accepting > 0 ||
+                        !list_empty(&peer->ibp_conns));  /* active conn */
+               last_alive = peer->ibp_last_alive;
+       }
+
+       read_unlock_irqrestore(glock, flags);
+
+       if (last_alive != 0)
+               *when = last_alive;
+
+       /* peer is not persistent in the hash, trigger peer creation
+        * and connection establishment with a NULL tx */
+       if (peer == NULL)
+               kiblnd_launch_tx(ni, NULL, nid);
+
+       CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+              libcfs_nid2str(nid), peer,
+              last_alive ? cfs_duration_sec(now - last_alive) : -1);
+       return;
+}
+
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+       int     npages = p->ibp_npages;
+       int     i;
+
+       for (i = 0; i < npages; i++) {
+               if (p->ibp_pages[i] != NULL)
+                       __free_page(p->ibp_pages[i]);
+       }
+
+       LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+       kib_pages_t     *p;
+       int             i;
+
+       LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+                        offsetof(kib_pages_t, ibp_pages[npages]));
+       if (p == NULL) {
+               CERROR("Can't allocate descriptor for %d pages\n", npages);
+               return -ENOMEM;
+       }
+
+       memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+       p->ibp_npages = npages;
+
+       for (i = 0; i < npages; i++) {
+               p->ibp_pages[i] = alloc_pages_node(
+                                   cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+                                   __GFP_IO, 0);
+               if (p->ibp_pages[i] == NULL) {
+                       CERROR("Can't allocate page %d of %d\n", i, npages);
+                       kiblnd_free_pages(p);
+                       return -ENOMEM;
+               }
+       }
+
+       *pp = p;
+       return 0;
+}
+
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+       kib_rx_t *rx;
+       int       i;
+
+       LASSERT (conn->ibc_rxs != NULL);
+       LASSERT (conn->ibc_hdev != NULL);
+
+       for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+               rx = &conn->ibc_rxs[i];
+
+               LASSERT (rx->rx_nob >= 0); /* not posted */
+
+               kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+                                       KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+                                                         rx->rx_msgaddr),
+                                       IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+       }
+
+       kiblnd_free_pages(conn->ibc_rx_pages);
+
+       conn->ibc_rx_pages = NULL;
+}
+
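+/* Pack the connection's rx buffers, IBLND_MSG_SIZE apiece, into its
+ * pre-allocated pages and DMA-map each one for the HCA.  This relies on
+ * IBLND_MSG_SIZE dividing PAGE_SIZE exactly (see the CLASSERTs in
+ * kiblnd_map_tx_pool() below). */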
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+       kib_rx_t       *rx;
+       struct page    *pg;
+       int          pg_off;
+       int          ipg;
+       int          i;
+
+       for (pg_off = ipg = i = 0;
+            i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+               pg = conn->ibc_rx_pages->ibp_pages[ipg];
+               rx = &conn->ibc_rxs[i];
+
+               rx->rx_conn = conn;
+               rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+               rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+                                                      rx->rx_msg, IBLND_MSG_SIZE,
+                                                      DMA_FROM_DEVICE);
+               LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+                                                  rx->rx_msgaddr));
+               KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+               CDEBUG(D_NET, "rx %d: %p "LPX64"("LPX64")\n",
+                      i, rx->rx_msg, rx->rx_msgaddr,
+                      lnet_page2phys(pg) + pg_off);
+
+               pg_off += IBLND_MSG_SIZE;
+               LASSERT (pg_off <= PAGE_SIZE);
+
+               if (pg_off == PAGE_SIZE) {
+                       pg_off = 0;
+                       ipg++;
+                       LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+               }
+       }
+}
+
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+       kib_hca_dev_t  *hdev = tpo->tpo_hdev;
+       kib_tx_t       *tx;
+       int          i;
+
+       LASSERT (tpo->tpo_pool.po_allocated == 0);
+
+       if (hdev == NULL)
+               return;
+
+       for (i = 0; i < tpo->tpo_pool.po_size; i++) {
+               tx = &tpo->tpo_tx_descs[i];
+               kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+                                       KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+                                                         tx->tx_msgaddr),
+                                       IBLND_MSG_SIZE, DMA_TO_DEVICE);
+       }
+
+       kiblnd_hdev_decref(hdev);
+       tpo->tpo_hdev = NULL;
+}
+
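+/* Return the device's current HCA descriptor with a reference held,
+ * polling (~10ms sleeps) until any failover in progress has finished. */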
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+       kib_hca_dev_t *hdev;
+       unsigned long  flags;
+       int         i = 0;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       while (dev->ibd_failover) {
+               read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+               if (i++ % 50 == 0)
+                       CDEBUG(D_NET, "%s: Wait for failover\n",
+                              dev->ibd_ifname);
+               schedule_timeout(cfs_time_seconds(1) / 100);
+
+               read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       }
+
+       kiblnd_hdev_addref_locked(dev->ibd_hdev);
+       hdev = dev->ibd_hdev;
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       return hdev;
+}
+
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+       kib_pages_t    *txpgs = tpo->tpo_tx_pages;
+       kib_pool_t     *pool  = &tpo->tpo_pool;
+       kib_net_t      *net   = pool->po_owner->ps_net;
+       kib_dev_t      *dev;
+       struct page    *page;
+       kib_tx_t       *tx;
+       int          page_offset;
+       int          ipage;
+       int          i;
+
+       LASSERT (net != NULL);
+
+       dev = net->ibn_dev;
+
+       /* pre-mapped messages are not bigger than 1 page */
+       CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+       /* No fancy arithmetic when we do the buffer calculations */
+       CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+       tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+       for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+               page = txpgs->ibp_pages[ipage];
+               tx = &tpo->tpo_tx_descs[i];
+
+               tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+                                          page_offset);
+
+               tx->tx_msgaddr = kiblnd_dma_map_single(
+                       tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+                       IBLND_MSG_SIZE, DMA_TO_DEVICE);
+               LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+                                                  tx->tx_msgaddr));
+               KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+               list_add(&tx->tx_list, &pool->po_free_list);
+
+               page_offset += IBLND_MSG_SIZE;
+               LASSERT (page_offset <= PAGE_SIZE);
+
+               if (page_offset == PAGE_SIZE) {
+                       page_offset = 0;
+                       ipage++;
+                       LASSERT (ipage <= txpgs->ibp_npages);
+               }
+       }
+}
+
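+/* Find the pre-registered MR covering [addr, addr + size).  With a
+ * single global MR it always matches; otherwise the MR array is indexed
+ * by (addr >> ibh_mr_shift) and the whole range must fit in one entry,
+ * else NULL is returned. */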
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+       __u64   index;
+
+       LASSERT (hdev->ibh_mrs[0] != NULL);
+
+       if (hdev->ibh_nmrs == 1)
+               return hdev->ibh_mrs[0];
+
+       index = addr >> hdev->ibh_mr_shift;
+
+       if (index <  hdev->ibh_nmrs &&
+           index == ((addr + size - 1) >> hdev->ibh_mr_shift))
+               return hdev->ibh_mrs[index];
+
+       return NULL;
+}
+
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+       struct ib_mr *prev_mr;
+       struct ib_mr *mr;
+       int        i;
+
+       LASSERT (hdev->ibh_mrs[0] != NULL);
+
+       if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+           *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+               return NULL;
+
+       if (hdev->ibh_nmrs == 1)
+               return hdev->ibh_mrs[0];
+
+       for (i = 0, mr = prev_mr = NULL;
+            i < rd->rd_nfrags; i++) {
+               mr = kiblnd_find_dma_mr(hdev,
+                                       rd->rd_frags[i].rf_addr,
+                                       rd->rd_frags[i].rf_nob);
+               if (prev_mr == NULL)
+                       prev_mr = mr;
+
+               if (mr == NULL || prev_mr != mr) {
+                       /* Can't be covered by a single MR */
+                       mr = NULL;
+                       break;
+               }
+       }
+
+       return mr;
+}
+
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+       LASSERT (pool->fpo_map_count == 0);
+
+       if (pool->fpo_fmr_pool != NULL)
+               ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+       if (pool->fpo_hdev != NULL)
+               kiblnd_hdev_decref(pool->fpo_hdev);
+
+       LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+       kib_fmr_pool_t *pool;
+
+       while (!list_empty(head)) {
+               pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+               list_del(&pool->fpo_list);
+               kiblnd_destroy_fmr_pool(pool);
+       }
+}
+
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+       return max(IBLND_FMR_POOL, size);
+}
+
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+       return max(IBLND_FMR_POOL_FLUSH, size);
+}
+
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+       /* FMR pool for RDMA */
+       kib_dev_t              *dev = fps->fps_net->ibn_dev;
+       kib_fmr_pool_t    *fpo;
+       struct ib_fmr_pool_param param = {
+               .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
+               .page_shift        = PAGE_SHIFT,
+               .access            = (IB_ACCESS_LOCAL_WRITE |
+                                     IB_ACCESS_REMOTE_WRITE),
+               .pool_size         = fps->fps_pool_size,
+               .dirty_watermark   = fps->fps_flush_trigger,
+               .flush_function    = NULL,
+               .flush_arg         = NULL,
+               .cache             = !!*kiblnd_tunables.kib_fmr_cache};
+       int rc;
+
+       LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+       if (fpo == NULL)
+               return -ENOMEM;
+
+       fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+       fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+       if (IS_ERR(fpo->fpo_fmr_pool)) {
+               rc = PTR_ERR(fpo->fpo_fmr_pool);
+               CERROR("Failed to create FMR pool: %d\n", rc);
+
+               kiblnd_hdev_decref(fpo->fpo_hdev);
+               LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+               return rc;
+       }
+
+       fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+       fpo->fpo_owner    = fps;
+       *pp_fpo = fpo;
+
+       return 0;
+}
+
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+       if (fps->fps_net == NULL) /* initialized? */
+               return;
+
+       spin_lock(&fps->fps_lock);
+
+       while (!list_empty(&fps->fps_pool_list)) {
+               kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
+                                                kib_fmr_pool_t, fpo_list);
+               fpo->fpo_failed = 1;
+               list_del(&fpo->fpo_list);
+               if (fpo->fpo_map_count == 0)
+                       list_add(&fpo->fpo_list, zombies);
+               else
+                       list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+       }
+
+       spin_unlock(&fps->fps_lock);
+}
+
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+       if (fps->fps_net != NULL) { /* initialized? */
+               kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+               kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+       }
+}
+
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+                       int pool_size, int flush_trigger)
+{
+       kib_fmr_pool_t *fpo;
+       int          rc;
+
+       memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+       fps->fps_net = net;
+       fps->fps_cpt = cpt;
+       fps->fps_pool_size = pool_size;
+       fps->fps_flush_trigger = flush_trigger;
+       spin_lock_init(&fps->fps_lock);
+       INIT_LIST_HEAD(&fps->fps_pool_list);
+       INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+       rc = kiblnd_create_fmr_pool(fps, &fpo);
+       if (rc == 0)
+               list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+
+       return rc;
+}
+
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+       if (fpo->fpo_map_count != 0) /* still in use */
+               return 0;
+       if (fpo->fpo_failed)
+               return 1;
+       return cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+       LIST_HEAD(zombies);
+       kib_fmr_pool_t    *fpo = fmr->fmr_pool;
+       kib_fmr_poolset_t *fps = fpo->fpo_owner;
+       cfs_time_t       now = cfs_time_current();
+       kib_fmr_pool_t    *tmp;
+       int             rc;
+
+       rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+       LASSERT (rc == 0);
+
+       if (status != 0) {
+               rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+               LASSERT (rc == 0);
+       }
+
+       fmr->fmr_pool = NULL;
+       fmr->fmr_pfmr = NULL;
+
+       spin_lock(&fps->fps_lock);
+       fpo->fpo_map_count--; /* decref the pool */
+
+       list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+               /* the first pool is persistent */
+               if (fps->fps_pool_list.next == &fpo->fpo_list)
+                       continue;
+
+               if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+                       list_move(&fpo->fpo_list, &zombies);
+                       fps->fps_version++;
+               }
+       }
+       spin_unlock(&fps->fps_lock);
+
+       if (!list_empty(&zombies))
+               kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
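+/* Map @npages pages through an FMR from @fps.  Scan the pool list for a
+ * free FMR; on -EAGAIN restart the scan if another thread has changed
+ * the list (fps_version), wait if someone else is already growing the
+ * poolset, fail fast if a recent pool creation failed, otherwise grow
+ * the poolset ourselves and retry. */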
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+                   __u64 iov, kib_fmr_t *fmr)
+{
+       struct ib_pool_fmr *pfmr;
+       kib_fmr_pool_t     *fpo;
+       __u64          version;
+       int              rc;
+
+ again:
+       spin_lock(&fps->fps_lock);
+       version = fps->fps_version;
+       list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+               fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+               fpo->fpo_map_count++;
+               spin_unlock(&fps->fps_lock);
+
+               pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+                                           pages, npages, iov);
+               if (likely(!IS_ERR(pfmr))) {
+                       fmr->fmr_pool = fpo;
+                       fmr->fmr_pfmr = pfmr;
+                       return 0;
+               }
+
+               spin_lock(&fps->fps_lock);
+               fpo->fpo_map_count--;
+               if (PTR_ERR(pfmr) != -EAGAIN) {
+                       spin_unlock(&fps->fps_lock);
+                       return PTR_ERR(pfmr);
+               }
+
+               /* -EAGAIN: this pool is exhausted; restart the scan if
+                * another thread has changed the pool list meanwhile */
+               if (version != fps->fps_version) {
+                       spin_unlock(&fps->fps_lock);
+                       goto again;
+               }
+       }
+
+       if (fps->fps_increasing) {
+               spin_unlock(&fps->fps_lock);
+               CDEBUG(D_NET, "Another thread is allocating new "
+                      "FMR pool, waiting for it to complete\n");
+               schedule();
+               goto again;
+
+       }
+
+       if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+               /* someone failed recently */
+               spin_unlock(&fps->fps_lock);
+               return -EAGAIN;
+       }
+
+       fps->fps_increasing = 1;
+       spin_unlock(&fps->fps_lock);
+
+       CDEBUG(D_NET, "Allocating new FMR pool\n");
+       rc = kiblnd_create_fmr_pool(fps, &fpo);
+       spin_lock(&fps->fps_lock);
+       fps->fps_increasing = 0;
+       if (rc == 0) {
+               fps->fps_version++;
+               list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+       } else {
+               fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+       }
+       spin_unlock(&fps->fps_lock);
+
+       goto again;
+}
+
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+       LASSERT (list_empty(&pool->po_free_list));
+       LASSERT (pool->po_allocated == 0);
+
+       CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+       CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+       memset(pool, 0, sizeof(kib_pool_t));
+       INIT_LIST_HEAD(&pool->po_free_list);
+       pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+       pool->po_owner    = ps;
+       pool->po_size     = size;
+}
+
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+       kib_pool_t *pool;
+
+       while (!list_empty(head)) {
+               pool = list_entry(head->next, kib_pool_t, po_list);
+               list_del(&pool->po_list);
+
+               LASSERT (pool->po_owner != NULL);
+               pool->po_owner->ps_pool_destroy(pool);
+       }
+}
+
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+       if (ps->ps_net == NULL) /* initialized? */
+               return;
+
+       spin_lock(&ps->ps_lock);
+       while (!list_empty(&ps->ps_pool_list)) {
+               kib_pool_t *po = list_entry(ps->ps_pool_list.next,
+                                           kib_pool_t, po_list);
+               po->po_failed = 1;
+               list_del(&po->po_list);
+               if (po->po_allocated == 0)
+                       list_add(&po->po_list, zombies);
+               else
+                       list_add(&po->po_list, &ps->ps_failed_pool_list);
+       }
+       spin_unlock(&ps->ps_lock);
+}
+
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+       if (ps->ps_net != NULL) { /* initialized? */
+               kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+               kiblnd_destroy_pool_list(&ps->ps_pool_list);
+       }
+}
+
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+                   kib_net_t *net, char *name, int size,
+                   kib_ps_pool_create_t po_create,
+                   kib_ps_pool_destroy_t po_destroy,
+                   kib_ps_node_init_t nd_init,
+                   kib_ps_node_fini_t nd_fini)
+{
+       kib_pool_t      *pool;
+       int             rc;
+
+       memset(ps, 0, sizeof(kib_poolset_t));
+
+       ps->ps_cpt          = cpt;
+       ps->ps_net          = net;
+       ps->ps_pool_create  = po_create;
+       ps->ps_pool_destroy = po_destroy;
+       ps->ps_node_init    = nd_init;
+       ps->ps_node_fini    = nd_fini;
+       ps->ps_pool_size    = size;
+       if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+           >= sizeof(ps->ps_name))
+               return -E2BIG;
+       spin_lock_init(&ps->ps_lock);
+       INIT_LIST_HEAD(&ps->ps_pool_list);
+       INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+       rc = ps->ps_pool_create(ps, size, &pool);
+       if (rc == 0)
+               list_add(&pool->po_list, &ps->ps_pool_list);
+       else
+               CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+       return rc;
+}
+
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+       if (pool->po_allocated != 0) /* still in use */
+               return 0;
+       if (pool->po_failed)
+               return 1;
+       return cfs_time_aftereq(now, pool->po_deadline);
+}
+
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+       LIST_HEAD(zombies);
+       kib_poolset_t  *ps = pool->po_owner;
+       kib_pool_t     *tmp;
+       cfs_time_t      now = cfs_time_current();
+
+       spin_lock(&ps->ps_lock);
+
+       if (ps->ps_node_fini != NULL)
+               ps->ps_node_fini(pool, node);
+
+       LASSERT (pool->po_allocated > 0);
+       list_add(node, &pool->po_free_list);
+       pool->po_allocated--;
+
+       list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+               /* the first pool is persistent */
+               if (ps->ps_pool_list.next == &pool->po_list)
+                       continue;
+
+               if (kiblnd_pool_is_idle(pool, now))
+                       list_move(&pool->po_list, &zombies);
+       }
+       spin_unlock(&ps->ps_lock);
+
+       if (!list_empty(&zombies))
+               kiblnd_destroy_pool_list(&zombies);
+}
+
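+/* Allocate a node from @ps, growing the poolset on demand: take a free
+ * node from any pool that has one, otherwise wait for a concurrent
+ * grower, give up if a pool creation failed recently, or create a new
+ * pool and retry.  This is the generic analogue of
+ * kiblnd_fmr_pool_map(). */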
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+       struct list_head *node;
+       kib_pool_t       *pool;
+       int               rc;
+
+ again:
+       spin_lock(&ps->ps_lock);
+       list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+               if (list_empty(&pool->po_free_list))
+                       continue;
+
+               pool->po_allocated++;
+               pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+               node = pool->po_free_list.next;
+               list_del(node);
+
+               if (ps->ps_node_init != NULL) {
+                       /* still hold the lock */
+                       ps->ps_node_init(pool, node);
+               }
+               spin_unlock(&ps->ps_lock);
+               return node;
+       }
+
+       /* no free node in any pool and ... */
+       if (ps->ps_increasing) {
+               /* another thread is allocating a new pool */
+               spin_unlock(&ps->ps_lock);
+               CDEBUG(D_NET, "Another thread is allocating new "
+                      "%s pool, waiting for it to complete\n",
+                      ps->ps_name);
+               schedule();
+               goto again;
+       }
+
+       if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+               /* someone failed recently */
+               spin_unlock(&ps->ps_lock);
+               return NULL;
+       }
+
+       ps->ps_increasing = 1;
+       spin_unlock(&ps->ps_lock);
+
+       CDEBUG(D_NET, "%s pool exhausted, allocating new pool\n", ps->ps_name);
+
+       rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+       spin_lock(&ps->ps_lock);
+       ps->ps_increasing = 0;
+       if (rc == 0) {
+               list_add_tail(&pool->po_list, &ps->ps_pool_list);
+       } else {
+               ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+               CERROR("Can't allocate new %s pool: out of memory\n",
+                      ps->ps_name);
+       }
+       spin_unlock(&ps->ps_lock);
+
+       goto again;
+}
+
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+       kib_pmr_pool_t      *ppo = pmr->pmr_pool;
+       struct ib_mr    *mr  = pmr->pmr_mr;
+
+       pmr->pmr_mr = NULL;
+       kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+       if (mr != NULL)
+               ib_dereg_mr(mr);
+}
+
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+                   kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+       kib_phys_mr_t    *pmr;
+       struct list_head *node;
+       int               rc;
+       int               i;
+
+       node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+       if (node == NULL) {
+               CERROR("Failed to allocate PMR descriptor\n");
+               return -ENOMEM;
+       }
+
+       pmr = container_of(node, kib_phys_mr_t, pmr_list);
+       if (pmr->pmr_pool->ppo_hdev != hdev) {
+               kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+               return -EAGAIN;
+       }
+
+       for (i = 0; i < rd->rd_nfrags; i++) {
+               pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+               pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+       }
+
+       pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+                                    pmr->pmr_ipb, rd->rd_nfrags,
+                                    IB_ACCESS_LOCAL_WRITE |
+                                    IB_ACCESS_REMOTE_WRITE,
+                                    iova);
+       if (!IS_ERR(pmr->pmr_mr)) {
+               pmr->pmr_iova = *iova;
+               *pp_pmr = pmr;
+               return 0;
+       }
+
+       rc = PTR_ERR(pmr->pmr_mr);
+       CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+       pmr->pmr_mr = NULL;
+       kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+       return rc;
+}
+
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+       kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+       kib_phys_mr_t  *pmr;
+
+       LASSERT (pool->po_allocated == 0);
+
+       while (!list_empty(&pool->po_free_list)) {
+               pmr = list_entry(pool->po_free_list.next,
+                                    kib_phys_mr_t, pmr_list);
+
+               LASSERT (pmr->pmr_mr == NULL);
+               list_del(&pmr->pmr_list);
+
+               if (pmr->pmr_ipb != NULL) {
+                       LIBCFS_FREE(pmr->pmr_ipb,
+                                   IBLND_MAX_RDMA_FRAGS *
+                                   sizeof(struct ib_phys_buf));
+               }
+
+               LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+       }
+
+       kiblnd_fini_pool(pool);
+       if (ppo->ppo_hdev != NULL)
+               kiblnd_hdev_decref(ppo->ppo_hdev);
+
+       LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+       int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+       return max(IBLND_PMR_POOL, size);
+}
+
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+       struct kib_pmr_pool     *ppo;
+       struct kib_pool         *pool;
+       kib_phys_mr_t           *pmr;
+       int                     i;
+
+       LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+                        ps->ps_cpt, sizeof(kib_pmr_pool_t));
+       if (ppo == NULL) {
+               CERROR("Failed to allocate PMR pool\n");
+               return -ENOMEM;
+       }
+
+       pool = &ppo->ppo_pool;
+       kiblnd_init_pool(ps, pool, size);
+
+       for (i = 0; i < size; i++) {
+               LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+                                ps->ps_cpt, sizeof(kib_phys_mr_t));
+               if (pmr == NULL)
+                       break;
+
+               pmr->pmr_pool = ppo;
+               LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+                                IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+               if (pmr->pmr_ipb == NULL)
+                       break;
+
+               list_add(&pmr->pmr_list, &pool->po_free_list);
+       }
+
+       if (i < size) {
+               ps->ps_pool_destroy(pool);
+               return -ENOMEM;
+       }
+
+       ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+       *pp_po = pool;
+       return 0;
+}
+
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+       kib_tx_pool_t  *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+       int          i;
+
+       LASSERT (pool->po_allocated == 0);
+
+       if (tpo->tpo_tx_pages != NULL) {
+               kiblnd_unmap_tx_pool(tpo);
+               kiblnd_free_pages(tpo->tpo_tx_pages);
+       }
+
+       if (tpo->tpo_tx_descs == NULL)
+               goto out;
+
+       for (i = 0; i < pool->po_size; i++) {
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+               list_del(&tx->tx_list);
+               if (tx->tx_pages != NULL)
+                       LIBCFS_FREE(tx->tx_pages,
+                                   LNET_MAX_IOV *
+                                   sizeof(*tx->tx_pages));
+               if (tx->tx_frags != NULL)
+                       LIBCFS_FREE(tx->tx_frags,
+                                   IBLND_MAX_RDMA_FRAGS *
+                                           sizeof(*tx->tx_frags));
+               if (tx->tx_wrq != NULL)
+                       LIBCFS_FREE(tx->tx_wrq,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) *
+                                   sizeof(*tx->tx_wrq));
+               if (tx->tx_sge != NULL)
+                       LIBCFS_FREE(tx->tx_sge,
+                                   (1 + IBLND_MAX_RDMA_FRAGS) *
+                                   sizeof(*tx->tx_sge));
+               if (tx->tx_rd != NULL)
+                       LIBCFS_FREE(tx->tx_rd,
+                                   offsetof(kib_rdma_desc_t,
+                                            rd_frags[IBLND_MAX_RDMA_FRAGS]));
+       }
+
+       LIBCFS_FREE(tpo->tpo_tx_descs,
+                   pool->po_size * sizeof(kib_tx_t));
+out:
+       kiblnd_fini_pool(pool);
+       LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+static int kiblnd_tx_pool_size(int ncpts)
+{
+       int ntx = *kiblnd_tunables.kib_ntx / ncpts;
+
+       return max(IBLND_TX_POOL, ntx);
+}
+
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+       int         i;
+       int         npg;
+       kib_pool_t    *pool;
+       kib_tx_pool_t *tpo;
+
+       LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+       if (tpo == NULL) {
+               CERROR("Failed to allocate TX pool\n");
+               return -ENOMEM;
+       }
+
+       pool = &tpo->tpo_pool;
+       kiblnd_init_pool(ps, pool, size);
+       tpo->tpo_tx_descs = NULL;
+       tpo->tpo_tx_pages = NULL;
+
+       npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+       if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+               CERROR("Can't allocate tx pages: %d\n", npg);
+               LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+               return -ENOMEM;
+       }
+
+       LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+                        size * sizeof(kib_tx_t));
+       if (tpo->tpo_tx_descs == NULL) {
+               CERROR("Can't allocate %d tx descriptors\n", size);
+               ps->ps_pool_destroy(pool);
+               return -ENOMEM;
+       }
+
+       memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+       for (i = 0; i < size; i++) {
+               kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+               tx->tx_pool = tpo;
+               if (ps->ps_net->ibn_fmr_ps != NULL) {
+                       LIBCFS_CPT_ALLOC(tx->tx_pages,
+                                        lnet_cpt_table(), ps->ps_cpt,
+                                        LNET_MAX_IOV * sizeof(*tx->tx_pages));
+                       if (tx->tx_pages == NULL)
+                               break;
+               }
+
+               LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+                                IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+               if (tx->tx_frags == NULL)
+                       break;
+
+               sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+               LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                sizeof(*tx->tx_wrq));
+               if (tx->tx_wrq == NULL)
+                       break;
+
+               LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+                                (1 + IBLND_MAX_RDMA_FRAGS) *
+                                sizeof(*tx->tx_sge));
+               if (tx->tx_sge == NULL)
+                       break;
+
+               LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+                                offsetof(kib_rdma_desc_t,
+                                         rd_frags[IBLND_MAX_RDMA_FRAGS]));
+               if (tx->tx_rd == NULL)
+                       break;
+       }
+
+       if (i == size) {
+               kiblnd_map_tx_pool(tpo);
+               *pp_po = pool;
+               return 0;
+       }
+
+       ps->ps_pool_destroy(pool);
+       return -ENOMEM;
+}
+
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+       kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+                                            tps_poolset);
+       kib_tx_t         *tx  = list_entry(node, kib_tx_t, tx_list);
+
+       tx->tx_cookie = tps->tps_next_tx_cookie++;
+}
+
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+       int     i;
+
+       cfs_cpt_for_each(i, lnet_cpt_table()) {
+               kib_tx_poolset_t        *tps;
+               kib_fmr_poolset_t       *fps;
+               kib_pmr_poolset_t       *pps;
+
+               if (net->ibn_tx_ps != NULL) {
+                       tps = net->ibn_tx_ps[i];
+                       kiblnd_fini_poolset(&tps->tps_poolset);
+               }
+
+               if (net->ibn_fmr_ps != NULL) {
+                       fps = net->ibn_fmr_ps[i];
+                       kiblnd_fini_fmr_poolset(fps);
+               }
+
+               if (net->ibn_pmr_ps != NULL) {
+                       pps = net->ibn_pmr_ps[i];
+                       kiblnd_fini_poolset(&pps->pps_poolset);
+               }
+       }
+
+       if (net->ibn_tx_ps != NULL) {
+               cfs_percpt_free(net->ibn_tx_ps);
+               net->ibn_tx_ps = NULL;
+       }
+
+       if (net->ibn_fmr_ps != NULL) {
+               cfs_percpt_free(net->ibn_fmr_ps);
+               net->ibn_fmr_ps = NULL;
+       }
+
+       if (net->ibn_pmr_ps != NULL) {
+               cfs_percpt_free(net->ibn_pmr_ps);
+               net->ibn_pmr_ps = NULL;
+       }
+}
+
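+/* Set up per-CPT buffer pools for @net.  With map-on-demand disabled
+ * and a single global MR, only TX pools are needed; otherwise create
+ * FMR pools first, falling back to PMR pools when the device lacks FMR
+ * support (kiblnd_init_fmr_poolset() returns -ENOSYS). */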
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+       unsigned long   flags;
+       int             cpt;
+       int             rc;
+       int             i;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+           net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+               read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                          flags);
+               goto create_tx_pool;
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (*kiblnd_tunables.kib_fmr_pool_size <
+           *kiblnd_tunables.kib_ntx / 4) {
+               CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+                      *kiblnd_tunables.kib_fmr_pool_size,
+                      *kiblnd_tunables.kib_ntx / 4);
+               rc = -EINVAL;
+               goto failed;
+       }
+
+       /* the TX pool must be created after the FMR/PMR pools,
+        * see LU-2268 for details */
+       LASSERT(net->ibn_tx_ps == NULL);
+
+       /* premapping can fail if ibd_nmr > 1, so we always create an
+        * FMR/PMR pool and map on demand when premapping fails */
+
+       net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                          sizeof(kib_fmr_poolset_t));
+       if (net->ibn_fmr_ps == NULL) {
+               CERROR("Failed to allocate FMR pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+                                            kiblnd_fmr_pool_size(ncpts),
+                                            kiblnd_fmr_flush_trigger(ncpts));
+               if (rc == -ENOSYS && i == 0) /* no FMR */
+                       break; /* create PMR pool */
+
+               if (rc != 0) { /* a real error */
+                       CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+       if (i > 0) {
+               LASSERT(i == ncpts);
+               goto create_tx_pool;
+       }
+
+       cfs_percpt_free(net->ibn_fmr_ps);
+       net->ibn_fmr_ps = NULL;
+
+       CWARN("Device does not support FMR, falling back to PMR\n");
+
+       if (*kiblnd_tunables.kib_pmr_pool_size <
+           *kiblnd_tunables.kib_ntx / 4) {
+               CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+                      *kiblnd_tunables.kib_pmr_pool_size,
+                      *kiblnd_tunables.kib_ntx / 4);
+               rc = -EINVAL;
+               goto failed;
+       }
+
+       net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                          sizeof(kib_pmr_poolset_t));
+       if (net->ibn_pmr_ps == NULL) {
+               CERROR("Failed to allocate PMR pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+                                        cpt, net, "PMR",
+                                        kiblnd_pmr_pool_size(ncpts),
+                                        kiblnd_create_pmr_pool,
+                                        kiblnd_destroy_pmr_pool, NULL, NULL);
+               if (rc != 0) {
+                       CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+ create_tx_pool:
+       net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+                                         sizeof(kib_tx_poolset_t));
+       if (net->ibn_tx_ps == NULL) {
+               CERROR("Failed to allocate tx pool array\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       for (i = 0; i < ncpts; i++) {
+               cpt = (cpts == NULL) ? i : cpts[i];
+               rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+                                        cpt, net, "TX",
+                                        kiblnd_tx_pool_size(ncpts),
+                                        kiblnd_create_tx_pool,
+                                        kiblnd_destroy_tx_pool,
+                                        kiblnd_tx_init, NULL);
+               if (rc != 0) {
+                       CERROR("Can't initialize TX pool for CPT %d: %d\n",
+                              cpt, rc);
+                       goto failed;
+               }
+       }
+
+       return 0;
+ failed:
+       kiblnd_net_fini_pools(net);
+       LASSERT(rc != 0);
+       return rc;
+}
+
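+/* Query the HCA and derive ibh_mr_shift, the log2 of the largest MR the
+ * device supports.  max_mr_size is expected to be 2^n or 2^n - 1;
+ * anything else is rejected. */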
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+       struct ib_device_attr *attr;
+       int                 rc;
+
+       /* It's safe to assume an HCA can handle a page size
+        * matching that of the native system */
+       hdev->ibh_page_shift = PAGE_SHIFT;
+       hdev->ibh_page_size  = 1 << PAGE_SHIFT;
+       hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
+
+       LIBCFS_ALLOC(attr, sizeof(*attr));
+       if (attr == NULL) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       rc = ib_query_device(hdev->ibh_ibdev, attr);
+       if (rc == 0)
+               hdev->ibh_mr_size = attr->max_mr_size;
+
+       LIBCFS_FREE(attr, sizeof(*attr));
+
+       if (rc != 0) {
+               CERROR("Failed to query IB device: %d\n", rc);
+               return rc;
+       }
+
+       if (hdev->ibh_mr_size == ~0ULL) {
+               hdev->ibh_mr_shift = 64;
+               return 0;
+       }
+
+       for (hdev->ibh_mr_shift = 0;
+            hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) {
+               if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+                   hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+                       return 0;
+       }
+
+       CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+       return -EINVAL;
+}
+
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+       int     i;
+
+       if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+               return;
+
+       for (i = 0; i < hdev->ibh_nmrs; i++) {
+               if (hdev->ibh_mrs[i] == NULL)
+                       break;
+
+               ib_dereg_mr(hdev->ibh_mrs[i]);
+       }
+
+       LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+       hdev->ibh_mrs  = NULL;
+       hdev->ibh_nmrs = 0;
+}
+
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+       kiblnd_hdev_cleanup_mrs(hdev);
+
+       if (hdev->ibh_pd != NULL)
+               ib_dealloc_pd(hdev->ibh_pd);
+
+       if (hdev->ibh_cmid != NULL)
+               rdma_destroy_id(hdev->ibh_cmid);
+
+       LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
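+/* Register the global MR(s) used for pre-mapped buffers.  If one MR can
+ * span all of memory (ibh_mr_shift == 64), a single DMA MR suffices;
+ * otherwise register an array of physical MRs, each covering a 2^shift
+ * slice of the kernel's direct mapping. */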
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+       struct ib_mr *mr;
+       int        i;
+       int        rc;
+       __u64    mm_size;
+       __u64    mr_size;
+       int        acflags = IB_ACCESS_LOCAL_WRITE |
+                               IB_ACCESS_REMOTE_WRITE;
+
+       rc = kiblnd_hdev_get_attr(hdev);
+       if (rc != 0)
+               return rc;
+
+       if (hdev->ibh_mr_shift == 64) {
+               LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+               if (hdev->ibh_mrs == NULL) {
+                       CERROR("Failed to allocate MR table\n");
+                       return -ENOMEM;
+               }
+
+               hdev->ibh_mrs[0] = NULL;
+               hdev->ibh_nmrs   = 1;
+
+               mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+               if (IS_ERR(mr)) {
+                       CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+                       kiblnd_hdev_cleanup_mrs(hdev);
+                       return PTR_ERR(mr);
+               }
+
+               hdev->ibh_mrs[0] = mr;
+
+               goto out;
+       }
+
+       mr_size = (1ULL << hdev->ibh_mr_shift);
+       mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+       hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+       if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+               /* that's over 4TB of memory; assume we will re-code by then */
+               CERROR("Can't support memory size: x"LPX64
+                      " with MR size: x"LPX64"\n", mm_size, mr_size);
+               return -EINVAL;
+       }
+
+       /* create an array of MRs to cover all memory */
+       LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+       if (hdev->ibh_mrs == NULL) {
+               CERROR("Failed to allocate MR table\n");
+               return -ENOMEM;
+       }
+
+       memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+       for (i = 0; i < hdev->ibh_nmrs; i++) {
+               struct ib_phys_buf ipb;
+               __u64         iova;
+
+               ipb.size = hdev->ibh_mr_size;
+               ipb.addr = i * mr_size;
+               iova     = ipb.addr;
+
+               mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+               if (IS_ERR(mr)) {
+                       CERROR("Failed ib_reg_phys_mr addr "LPX64
+                              " size "LPX64" : %ld\n",
+                              ipb.addr, ipb.size, PTR_ERR(mr));
+                       kiblnd_hdev_cleanup_mrs(hdev);
+                       return PTR_ERR(mr);
+               }
+
+               LASSERT (iova == ipb.addr);
+
+               hdev->ibh_mrs[i] = mr;
+       }
+
+out:
+       if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+               LCONSOLE_INFO("Register global MR array, MR size: "
+                             LPX64", array size: %d\n",
+                             hdev->ibh_mr_size, hdev->ibh_nmrs);
+       return 0;
+}
+
+static int
+kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{       /* DUMMY */
+       return 0;
+}
+
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+       struct rdma_cm_id  *cmid;
+       struct sockaddr_in  srcaddr;
+       struct sockaddr_in  dstaddr;
+       int              rc;
+
+       if (dev->ibd_hdev == NULL || /* initializing */
+           dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+           *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+               return 1;
+
+       /* XXX: it's UGLY, but I don't have a better way to detect
+        * ib-bonding HCA failover because:
+        *
+        * a. no reliable CM event for HCA failover...
+        * b. no OFED API to get the ib_device for the current net_device...
+        *
+        * We have only two choices at this point:
+        *
+        * a. rdma_bind_addr(), which would conflict with the listener cmid
+        * b. rdma_resolve_addr() to the zero address */
+       cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+       if (IS_ERR(cmid)) {
+               rc = PTR_ERR(cmid);
+               CERROR("Failed to create cmid for failover: %d\n", rc);
+               return rc;
+       }
+
+       memset(&srcaddr, 0, sizeof(srcaddr));
+       srcaddr.sin_family      = AF_INET;
+       srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+       memset(&dstaddr, 0, sizeof(dstaddr));
+       dstaddr.sin_family = AF_INET;
+       rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+                              (struct sockaddr *)&dstaddr, 1);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
+               rdma_destroy_id(cmid);
+               return rc;
+       }
+
+       if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
+               /* don't need device failover */
+               rdma_destroy_id(cmid);
+               return 0;
+       }
+
+       return 1;
+}
+
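+/* Fail over @dev to its current HCA: tear down the old listener, bind a
+ * new cmid (and thereby the new ib_device) to the interface IP, set up
+ * a PD and global MRs on it, swap the new hdev in under the global
+ * lock, then fail all per-net pools so buffers get remapped on the new
+ * HCA. */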
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+       LIST_HEAD(zombie_tpo);
+       LIST_HEAD(zombie_ppo);
+       LIST_HEAD(zombie_fpo);
+       struct rdma_cm_id  *cmid  = NULL;
+       kib_hca_dev_t      *hdev  = NULL;
+       kib_hca_dev_t      *old;
+       struct ib_pd       *pd;
+       kib_net_t         *net;
+       struct sockaddr_in  addr;
+       unsigned long       flags;
+       int              rc = 0;
+       int                 i;
+
+       LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+                dev->ibd_can_failover ||
+                dev->ibd_hdev == NULL);
+
+       rc = kiblnd_dev_need_failover(dev);
+       if (rc <= 0)
+               goto out;
+
+       if (dev->ibd_hdev != NULL &&
+           dev->ibd_hdev->ibh_cmid != NULL) {
+               /* XXX it's not good to close the old listener here,
+                * because creating the new listener can still fail.
+                * But we have to close it now, otherwise rdma_bind_addr()
+                * will return EADDRINUSE... What a pain! */
+               write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+               cmid = dev->ibd_hdev->ibh_cmid;
+               /* make the next call to kiblnd_dev_need_failover()
+                * return 1 for me */
+               dev->ibd_hdev->ibh_cmid  = NULL;
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               rdma_destroy_id(cmid);
+       }
+
+       cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+       if (IS_ERR(cmid)) {
+               rc = PTR_ERR(cmid);
+               CERROR("Failed to create cmid for failover: %d\n", rc);
+               goto out;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sin_family      = AF_INET;
+       addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+       addr.sin_port   = htons(*kiblnd_tunables.kib_service);
+
+       /* Bind to failover device or port */
+       rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+       if (rc != 0 || cmid->device == NULL) {
+               CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+                      dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+                      cmid->device, rc);
+               rdma_destroy_id(cmid);
+               goto out;
+       }
+
+       LIBCFS_ALLOC(hdev, sizeof(*hdev));
+       if (hdev == NULL) {
+               CERROR("Failed to allocate kib_hca_dev\n");
+               rdma_destroy_id(cmid);
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       atomic_set(&hdev->ibh_ref, 1);
+       hdev->ibh_dev   = dev;
+       hdev->ibh_cmid  = cmid;
+       hdev->ibh_ibdev = cmid->device;
+
+       pd = ib_alloc_pd(cmid->device);
+       if (IS_ERR(pd)) {
+               rc = PTR_ERR(pd);
+               CERROR("Can't allocate PD: %d\n", rc);
+               goto out;
+       }
+
+       hdev->ibh_pd = pd;
+
+       rc = rdma_listen(cmid, 0);
+       if (rc != 0) {
+               CERROR("Can't start new listener: %d\n", rc);
+               goto out;
+       }
+
+       rc = kiblnd_hdev_setup_mrs(hdev);
+       if (rc != 0) {
+               CERROR("Can't setup device: %d\n", rc);
+               goto out;
+       }
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       old = dev->ibd_hdev;
+       dev->ibd_hdev = hdev; /* take over the refcount */
+       hdev = old;
+
+       list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+               cfs_cpt_for_each(i, lnet_cpt_table()) {
+                       kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+                                           &zombie_tpo);
+
+                       if (net->ibn_fmr_ps != NULL) {
+                               kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+                                                       &zombie_fpo);
+
+                       } else if (net->ibn_pmr_ps != NULL) {
+                               kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+                                                   pps_poolset, &zombie_ppo);
+                       }
+               }
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+       if (!list_empty(&zombie_tpo))
+               kiblnd_destroy_pool_list(&zombie_tpo);
+       if (!list_empty(&zombie_ppo))
+               kiblnd_destroy_pool_list(&zombie_ppo);
+       if (!list_empty(&zombie_fpo))
+               kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+       if (hdev != NULL)
+               kiblnd_hdev_decref(hdev);
+
+       if (rc != 0)
+               dev->ibd_failed_failover++;
+       else
+               dev->ibd_failed_failover = 0;
+
+       return rc;
+}
+
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+       LASSERT (dev->ibd_nnets == 0);
+       LASSERT (list_empty(&dev->ibd_nets));
+
+       list_del(&dev->ibd_fail_list);
+       list_del(&dev->ibd_list);
+
+       if (dev->ibd_hdev != NULL)
+               kiblnd_hdev_decref(dev->ibd_hdev);
+
+       LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+kib_dev_t *
+kiblnd_create_dev(char *ifname)
+{
+       struct net_device *netdev;
+       kib_dev_t         *dev;
+       __u32              netmask;
+       __u32              ip;
+       int                up;
+       int                rc;
+
+       rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+       if (rc != 0) {
+               CERROR("Can't query IPoIB interface %s: %d\n",
+                      ifname, rc);
+               return NULL;
+       }
+
+       if (!up) {
+               CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(dev, sizeof(*dev));
+       if (dev == NULL)
+               return NULL;
+
+       memset(dev, 0, sizeof(*dev));
+       netdev = dev_get_by_name(&init_net, ifname);
+       if (netdev == NULL) {
+               dev->ibd_can_failover = 0;
+       } else {
+               dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+               dev_put(netdev);
+       }
+
+       INIT_LIST_HEAD(&dev->ibd_nets);
+       INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+       INIT_LIST_HEAD(&dev->ibd_fail_list);
+       dev->ibd_ifip = ip;
+       strcpy(&dev->ibd_ifname[0], ifname);
+
+       /* initialize the device */
+       rc = kiblnd_dev_failover(dev);
+       if (rc != 0) {
+               CERROR("Can't initialize device: %d\n", rc);
+               LIBCFS_FREE(dev, sizeof(*dev));
+               return NULL;
+       }
+
+       list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
+       return dev;
+}
+
+void
+kiblnd_base_shutdown(void)
+{
+       struct kib_sched_info   *sched;
+       int                     i;
+
+       LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+       CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       switch (kiblnd_data.kib_init) {
+       default:
+               LBUG();
+
+       case IBLND_INIT_ALL:
+       case IBLND_INIT_DATA:
+               LASSERT (kiblnd_data.kib_peers != NULL);
+               for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+                       LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+               }
+               LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+               LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+               /* flag threads to terminate; wake and wait for them to die */
+               kiblnd_data.kib_shutdown = 1;
+
+               /* NB: we really want to stop scheduler threads net by net
+                * instead of for the whole module; this should be improved
+                * once LNet supports dynamic configuration */
+               cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+                       wake_up_all(&sched->ibs_waitq);
+
+               wake_up_all(&kiblnd_data.kib_connd_waitq);
+               wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+               i = 2;
+               while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                              "Waiting for %d threads to terminate\n",
+                              atomic_read(&kiblnd_data.kib_nthreads));
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               /* fall through */
+
+       case IBLND_INIT_NOTHING:
+               break;
+       }
+
+       if (kiblnd_data.kib_peers != NULL) {
+               LIBCFS_FREE(kiblnd_data.kib_peers,
+                           sizeof(struct list_head) *
+                           kiblnd_data.kib_peer_hash_size);
+       }
+
+       if (kiblnd_data.kib_scheds != NULL)
+               cfs_percpt_free(kiblnd_data.kib_scheds);
+
+       CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+       module_put(THIS_MODULE);
+}
+
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+       kib_net_t     *net    = ni->ni_data;
+       rwlock_t      *g_lock = &kiblnd_data.kib_global_lock;
+       int            i;
+       unsigned long  flags;
+
+       LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+       if (net == NULL)
+               goto out;
+
+       CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       write_lock_irqsave(g_lock, flags);
+       net->ibn_shutdown = 1;
+       write_unlock_irqrestore(g_lock, flags);
+
+       switch (net->ibn_init) {
+       default:
+               LBUG();
+
+       case IBLND_INIT_ALL:
+               /* nuke all existing peers within this net */
+               kiblnd_del_peer(ni, LNET_NID_ANY);
+
+               /* Wait for all peer state to clean up */
+               i = 2;
+               while (atomic_read(&net->ibn_npeers) != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+                              "%s: waiting for %d peers to disconnect\n",
+                              libcfs_nid2str(ni->ni_nid),
+                              atomic_read(&net->ibn_npeers));
+                       cfs_pause(cfs_time_seconds(1));
+               }
+
+               kiblnd_net_fini_pools(net);
+
+               write_lock_irqsave(g_lock, flags);
+               LASSERT(net->ibn_dev->ibd_nnets > 0);
+               net->ibn_dev->ibd_nnets--;
+               list_del(&net->ibn_list);
+               write_unlock_irqrestore(g_lock, flags);
+
+               /* fall through */
+
+       case IBLND_INIT_NOTHING:
+               LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+               if (net->ibn_dev != NULL &&
+                   net->ibn_dev->ibd_nnets == 0)
+                       kiblnd_destroy_dev(net->ibn_dev);
+
+               break;
+       }
+
+       CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       net->ibn_init = IBLND_INIT_NOTHING;
+       ni->ni_data = NULL;
+
+       LIBCFS_FREE(net, sizeof(*net));
+
+out:
+       if (list_empty(&kiblnd_data.kib_devs))
+               kiblnd_base_shutdown();
+       return;
+}
+
+int
+kiblnd_base_startup(void)
+{
+       struct kib_sched_info   *sched;
+       int                     rc;
+       int                     i;
+
+       LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+       try_module_get(THIS_MODULE);
+       memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+       rwlock_init(&kiblnd_data.kib_global_lock);
+
+       INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+       INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+       kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+       LIBCFS_ALLOC(kiblnd_data.kib_peers,
+                    sizeof(struct list_head) *
+                    kiblnd_data.kib_peer_hash_size);
+       if (kiblnd_data.kib_peers == NULL)
+               goto failed;
+       for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+               INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+       spin_lock_init(&kiblnd_data.kib_connd_lock);
+       INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+       INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+       init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+       init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+       kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+                                                 sizeof(*sched));
+       if (kiblnd_data.kib_scheds == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+               int     nthrs;
+
+               spin_lock_init(&sched->ibs_lock);
+               INIT_LIST_HEAD(&sched->ibs_conns);
+               init_waitqueue_head(&sched->ibs_waitq);
+
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*kiblnd_tunables.kib_nscheds > 0) {
+                       nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+               } else {
+                       /* cap at half of the CPUs; the other half is
+                        * reserved for upper-layer modules */
+                       nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
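+                       /* e.g. an 8-CPU CPT gives min(max(2, 4), 8) == 4
+                        * threads, a 1-CPU CPT gives min(max(2, 0), 1) == 1
+                        * (with IBLND_N_SCHED == 2) */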
+               }
+
+               sched->ibs_nthreads_max = nthrs;
+               sched->ibs_cpt = i;
+       }
+
+       kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+       /* lists/ptrs/locks initialised */
+       kiblnd_data.kib_init = IBLND_INIT_DATA;
+       /*****************************************************/
+
+       rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+       if (rc != 0) {
+               CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+               goto failed;
+       }
+
+       if (*kiblnd_tunables.kib_dev_failover != 0)
+               rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+                                        "kiblnd_failover");
+
+       if (rc != 0) {
+               CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+               goto failed;
+       }
+
+       /* flag everything initialised */
+       kiblnd_data.kib_init = IBLND_INIT_ALL;
+       /*****************************************************/
+
+       return 0;
+
+ failed:
+       kiblnd_base_shutdown();
+       return -ENETDOWN;
+}
+
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+       int     rc = 0;
+       int     nthrs;
+       int     i;
+
+       if (sched->ibs_nthreads == 0) {
+               if (*kiblnd_tunables.kib_nscheds > 0) {
+                       nthrs = sched->ibs_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              sched->ibs_cpt);
+                       nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+                       nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+               }
+       } else {
+               LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+               /* add one thread if there is a new interface */
+               nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long    id;
+               char    name[20];
+
+               id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+               snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+                        KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+               rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      sched->ibs_cpt, sched->ibs_nthreads + i, rc);
+               break;
+       }
+
+       sched->ibs_nthreads += i;
+       return rc;
+}
+
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+       int     cpt;
+       int     rc;
+       int     i;
+
+       for (i = 0; i < ncpts; i++) {
+               struct kib_sched_info *sched;
+
+               cpt = (cpts == NULL) ? i : cpts[i];
+               sched = kiblnd_data.kib_scheds[cpt];
+
+               if (!newdev && sched->ibs_nthreads > 0)
+                       continue;
+
+               rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
+               if (rc != 0) {
+                       CERROR("Failed to start scheduler threads for %s\n",
+                              dev->ibd_ifname);
+                       return rc;
+               }
+       }
+       return 0;
+}
+
+kib_dev_t *
+kiblnd_dev_search(char *ifname)
+{
+       kib_dev_t       *alias = NULL;
+       kib_dev_t       *dev;
+       char            *colon;
+       char            *colon2;
+
+       colon = strchr(ifname, ':');
+       list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+                       return dev;
+
+               if (alias != NULL)
+                       continue;
+
+               colon2 = strchr(dev->ibd_ifname, ':');
+               if (colon != NULL)
+                       *colon = 0;
+               if (colon2 != NULL)
+                       *colon2 = 0;
+
+               if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+                       alias = dev;
+
+               if (colon != NULL)
+                       *colon = ':';
+               if (colon2 != NULL)
+                       *colon2 = ':';
+       }
+       return alias;
+}
+
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+       char              *ifname;
+       kib_dev_t         *ibdev = NULL;
+       kib_net_t         *net;
+       struct timeval     tv;
+       unsigned long      flags;
+       int                rc;
+       int                newdev;
+
+       LASSERT (ni->ni_lnd == &the_o2iblnd);
+
+       if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+               rc = kiblnd_base_startup();
+               if (rc != 0)
+                       return rc;
+       }
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       ni->ni_data = net;
+       if (net == NULL)
+               goto failed;
+
+       memset(net, 0, sizeof(*net));
+
+       do_gettimeofday(&tv);
+       net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+       ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
+       ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
+       ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
+       ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+       if (ni->ni_interfaces[0] != NULL) {
+               /* Use the IPoIB interface specified in 'networks=' */
+
+               CLASSERT (LNET_MAX_INTERFACES > 1);
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Multiple interfaces not supported\n");
+                       goto failed;
+               }
+
+               ifname = ni->ni_interfaces[0];
+       } else {
+               ifname = *kiblnd_tunables.kib_default_ipif;
+       }
+
+       if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+               CERROR("IPoIB interface name too long: %s\n", ifname);
+               goto failed;
+       }
+
+       ibdev = kiblnd_dev_search(ifname);
+
+       newdev = ibdev == NULL;
+       /* NB: create a kib_dev even for an alias */
+       if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+               ibdev = kiblnd_create_dev(ifname);
+
+       if (ibdev == NULL)
+               goto failed;
+
+       net->ibn_dev = ibdev;
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+       rc = kiblnd_dev_start_threads(ibdev, newdev,
+                                     ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto failed;
+
+       rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0) {
+               CERROR("Failed to initialize NI pools: %d\n", rc);
+               goto failed;
+       }
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       ibdev->ibd_nnets++;
+       list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       net->ibn_init = IBLND_INIT_ALL;
+
+       return 0;
+
+failed:
+       if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
+               kiblnd_destroy_dev(ibdev);
+
+       kiblnd_shutdown(ni);
+
+       CDEBUG(D_NET, "kiblnd_startup failed\n");
+       return -ENETDOWN;
+}
+
+void __exit
+kiblnd_module_fini (void)
+{
+       lnet_unregister_lnd(&the_o2iblnd);
+       kiblnd_tunables_fini();
+}
+
+int __init
+kiblnd_module_init (void)
+{
+       int    rc;
+
+       CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+       CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                 <= IBLND_MSG_SIZE);
+       CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                 <= IBLND_MSG_SIZE);
+
+       rc = kiblnd_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       lnet_register_lnd(&the_o2iblnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644 (file)
index 0000000..e4626bf
--- /dev/null
@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE           101     /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED                  100
+
+#define IBLND_N_SCHED                  2
+#define IBLND_N_SCHED_HIGH             4
+
+typedef struct
+{
+       int          *kib_dev_failover;      /* HCA failover */
+       unsigned int *kib_service;           /* IB service number */
+       int          *kib_min_reconnect_interval; /* first failed connection retry... */
+       int          *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+       int          *kib_cksum;             /* checksum kib_msg_t? */
+       int          *kib_timeout;           /* comms timeout (seconds) */
+       int          *kib_keepalive;         /* keepalive timeout (seconds) */
+       int          *kib_ntx;               /* # tx descs */
+       int          *kib_credits;           /* # concurrent sends */
+       int          *kib_peertxcredits;     /* # concurrent sends to 1 peer */
+       int          *kib_peerrtrcredits;    /* # per-peer router buffer credits */
+       int          *kib_peercredits_hiw;   /* # when to eagerly return credits */
+       int          *kib_peertimeout;       /* seconds to consider peer dead */
+       char        **kib_default_ipif;      /* default IPoIB interface */
+       int          *kib_retry_count;
+       int          *kib_rnr_retry_count;
+       int          *kib_concurrent_sends;  /* send work queue sizing */
+       int          *kib_ib_mtu;            /* IB MTU */
+       int          *kib_map_on_demand;     /* map-on-demand if RD has more
+                                             * fragments than this value;
+                                             * 0 disables map-on-demand */
+       int          *kib_pmr_pool_size;     /* # physical MRs in pool */
+       int          *kib_fmr_pool_size;     /* # FMRs in pool */
+       int          *kib_fmr_flush_trigger; /* when to trigger FMR flush */
+       int          *kib_fmr_cache;         /* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ctl_table_header_t *kib_sysctl;      /* sysctl interface */
+#endif
+       int          *kib_require_priv_port; /* accept only privileged ports */
+       int          *kib_use_priv_port;     /* use privileged port for active connect */
+       int          *kib_nscheds;           /* # threads on each CPT */
+} kib_tunables_t;
+
+extern kib_tunables_t  kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8   /* V1 only: # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7   /* V1 only: when to eagerly return credits */
+
+#define IBLND_CREDITS_DEFAULT  8         /* default # of peer credits */
+#define IBLND_CREDITS_MAX        ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+                                    IBLND_MSG_QUEUE_SIZE_V1 :   \
+                                    *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+                                    IBLND_CREDIT_HIGHWATER_V1 : \
+                                    *kiblnd_tunables.kib_peercredits_hiw) /* when to eagerly return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+       if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+               return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+               return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+       return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+                                    kiblnd_concurrent_sends_v1() : \
+                                    *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB messages suffice: 1 for a keepalive and 1 for returning credits */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)         (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE         (4<<10)         /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS   LNET_MAX_IOV    /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS   (*kiblnd_tunables.kib_map_on_demand != 0 ?     \
+                                *kiblnd_tunables.kib_map_on_demand :          \
+                                IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)    ((v) == IBLND_MSG_VERSION_1 ?                  \
+                                IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so don't need give a very large value */
+#define IBLND_TX_POOL                  256
+#define IBLND_PMR_POOL                 256
+#define IBLND_FMR_POOL                 256
+#define IBLND_FMR_POOL_FLUSH           192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()            (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v)           (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v)          IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v)        ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v)     (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
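+
+/* Worked example (tunable-dependent; assume IBLND_MSG_VERSION_2 with
+ * *kib_peertxcredits == 8 and 4K pages): IBLND_RX_MSGS = 8 * 2 + 2 = 18,
+ * IBLND_RX_MSG_BYTES = 18 * 4096 = 73728 and IBLND_RX_MSG_PAGES = 18. */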
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over an aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE              IFALIASZ
+#else
+#define KIB_IFNAME_SIZE              256
+#endif
+
+typedef struct
+{
+       struct list_head     ibd_list;          /* chain on kib_devs */
+       struct list_head     ibd_fail_list;     /* chain on kib_failed_devs */
+       __u32                ibd_ifip;          /* IPoIB interface IP */
+       /** IPoIB interface name */
+       char                 ibd_ifname[KIB_IFNAME_SIZE];
+       int                  ibd_nnets;         /* # nets extant */
+
+       cfs_time_t           ibd_next_failover;
+       int                  ibd_failed_failover; /* # failover failures */
+       unsigned int         ibd_failover;      /* failover in progress */
+       unsigned int         ibd_can_failover;  /* IPoIB interface is a bonding master */
+       struct list_head     ibd_nets;
+       struct kib_hca_dev  *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+       struct rdma_cm_id  *ibh_cmid;           /* listener cmid */
+       struct ib_device   *ibh_ibdev;          /* IB device */
+       int                 ibh_page_shift;     /* page shift of current HCA */
+       int                 ibh_page_size;      /* page size of current HCA */
+       __u64               ibh_page_mask;      /* page mask of current HCA */
+       int                 ibh_mr_shift;       /* bit shift of max MR size */
+       __u64               ibh_mr_size;        /* size of MR */
+       int                 ibh_nmrs;           /* # of global MRs */
+       struct ib_mr      **ibh_mrs;            /* global MRs */
+       struct ib_pd       *ibh_pd;             /* PD */
+       kib_dev_t          *ibh_dev;            /* owner */
+       atomic_t            ibh_ref;            /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY       1
+
+typedef struct
+{
+       int                  ibp_npages;             /* # pages */
+       struct page         *ibp_pages[0];         /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+       struct list_head              pmr_list;        /* chain node */
+       struct ib_phys_buf     *pmr_ipb;                /* physical buffer */
+       struct ib_mr       *pmr_mr;              /* IB MR */
+       struct kib_pmr_pool    *pmr_pool;              /* owner of this MR */
+       __u64              pmr_iova;           /* Virtual I/O address */
+       int                  pmr_refcount;         /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+                                    int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN     32
+
+typedef struct kib_poolset
+{
+       spinlock_t              ps_lock;                /* serialize */
+       struct kib_net   *ps_net;                /* network it belongs to */
+       char                ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+       struct list_head              ps_pool_list;        /* list of pools */
+       struct list_head              ps_failed_pool_list;    /* failed pool list */
+       cfs_time_t            ps_next_retry;      /* time stamp for retry if failed to allocate */
+       int                  ps_increasing;       /* is allocating new pool */
+       int                  ps_pool_size;         /* new pool size */
+       int                     ps_cpt;                 /* CPT id */
+
+       kib_ps_pool_create_t    ps_pool_create;  /* create a new pool */
+       kib_ps_pool_destroy_t   ps_pool_destroy;        /* destroy a pool */
+       kib_ps_node_init_t      ps_node_init;      /* initialize new allocated node */
+       kib_ps_node_fini_t      ps_node_fini;      /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+       struct list_head              po_list;          /* chain on pool list */
+       struct list_head              po_free_list;        /* pre-allocated node */
+       kib_poolset_t     *po_owner;           /* pool_set of this pool */
+       cfs_time_t            po_deadline;          /* deadline of this pool */
+       int                  po_allocated;         /* # of elements in use */
+       int                  po_failed;       /* pool is created on failed HCA */
+       int                  po_size;           /* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+       kib_poolset_t      tps_poolset;     /* pool-set */
+       __u64              tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+       kib_pool_t            tpo_pool;        /* pool */
+       struct kib_hca_dev     *tpo_hdev;              /* device for this pool */
+       struct kib_tx     *tpo_tx_descs;           /* all the tx descriptors */
+       kib_pages_t         *tpo_tx_pages;         /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+       kib_poolset_t      pps_poolset;     /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+       struct kib_hca_dev     *ppo_hdev;              /* device for this pool */
+       kib_pool_t            ppo_pool;        /* pool */
+} kib_pmr_pool_t;
+
+typedef struct
+{
+       spinlock_t              fps_lock;               /* serialize */
+       struct kib_net   *fps_net;              /* IB network */
+       struct list_head              fps_pool_list;      /* FMR pool list */
+       struct list_head              fps_failed_pool_list;   /* FMR pool list */
+       __u64              fps_version;     /* validity stamp */
+       int                     fps_cpt;                /* CPT id */
+       int                     fps_pool_size;
+       int                     fps_flush_trigger;
+       /* is allocating new pool */
+       int                     fps_increasing;
+       /* time stamp for retry if failed to allocate */
+       cfs_time_t              fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct
+{
+       struct list_head              fpo_list;        /* chain on pool list */
+       struct kib_hca_dev     *fpo_hdev;              /* device for this pool */
+       kib_fmr_poolset_t      *fpo_owner;            /* owner of this pool */
+       struct ib_fmr_pool     *fpo_fmr_pool;      /* IB FMR pool */
+       cfs_time_t            fpo_deadline;        /* deadline of this pool */
+       int                  fpo_failed;             /* fmr pool is failed */
+       int                  fpo_map_count;       /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+       struct ib_pool_fmr     *fmr_pfmr;              /* IB pool fmr */
+       kib_fmr_pool_t   *fmr_pool;            /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+       struct list_head     ibn_list;          /* chain on kib_dev_t::ibd_nets */
+       __u64                ibn_incarnation;   /* my epoch */
+       int                  ibn_init;          /* initialisation state */
+       int                  ibn_shutdown;      /* shutting down? */
+
+       atomic_t             ibn_npeers;        /* # peers extant */
+       atomic_t             ibn_nconns;        /* # connections extant */
+
+       kib_tx_poolset_t   **ibn_tx_ps;         /* tx pool-set */
+       kib_fmr_poolset_t  **ibn_fmr_ps;        /* fmr pool-set */
+       kib_pmr_poolset_t  **ibn_pmr_ps;        /* pmr pool-set */
+
+       kib_dev_t           *ibn_dev;           /* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT               16
+#define KIB_THREAD_ID(cpt, tid)                ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)             ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)             ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
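+
+/* e.g. KIB_THREAD_ID(3, 5) == (3 << 16) | 5 == 0x30005, from which
+ * KIB_THREAD_CPT() and KIB_THREAD_TID() recover 3 and 5 respectively */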
+
+struct kib_sched_info {
+       /* serialise */
+       spinlock_t              ibs_lock;
+       /* schedulers sleep here */
+       wait_queue_head_t               ibs_waitq;
+       /* conns to check for rx completions */
+       struct list_head                ibs_conns;
+       /* number of scheduler threads */
+       int                     ibs_nthreads;
+       /* max allowed scheduler threads */
+       int                     ibs_nthreads_max;
+       int                     ibs_cpt;        /* CPT id */
+};
+
+typedef struct
+{
+       int                     kib_init;       /* initialisation state */
+       int                     kib_shutdown;   /* shut down? */
+       struct list_head                kib_devs;       /* IB devices extant */
+       /* list head of failed devices */
+       struct list_head                kib_failed_devs;
+       /* failover thread sleeps here */
+       wait_queue_head_t               kib_failover_waitq;
+       atomic_t                kib_nthreads;   /* # live threads */
+       /* stabilize net/dev/peer/conn ops */
+       rwlock_t                kib_global_lock;
+       /* hash table of all my known peers */
+       struct list_head                *kib_peers;
+       /* size of kib_peers */
+       int                     kib_peer_hash_size;
+       /* the connd task (serialisation assertions) */
+       void                    *kib_connd;
+       /* connections to setup/teardown */
+       struct list_head                kib_connd_conns;
+       /* connections with zero refcount */
+       struct list_head                kib_connd_zombies;
+       /* connection daemon sleeps here */
+       wait_queue_head_t               kib_connd_waitq;
+       spinlock_t              kib_connd_lock; /* serialise */
+       struct ib_qp_attr       kib_error_qpa;  /* QP->ERROR */
+       /* percpt data for schedulers */
+       struct kib_sched_info   **kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING      0
+#define IBLND_INIT_DATA         1
+#define IBLND_INIT_ALL          2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
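+
+/* NB: the receiver is expected to detect a byte-flipped peer from ibm_magic
+ * (which then arrives as the byte-swapped IBLND_MSG_MAGIC) and swab the
+ * message accordingly. */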
+
+typedef struct kib_connparams
+{
+       __u16        ibcp_queue_depth;
+       __u16        ibcp_max_frags;
+       __u32        ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+       lnet_hdr_t      ibim_hdr;            /* portals header */
+       char          ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+       __u32        rf_nob;           /* # bytes this frag */
+       __u64        rf_addr;         /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+       __u32        rd_key;           /* local/remote key */
+       __u32        rd_nfrags;     /* # fragments */
+       kib_rdma_frag_t   rd_frags[0];    /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct
+{
+       lnet_hdr_t      ibprm_hdr;          /* portals header */
+       __u64        ibprm_cookie;       /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+       __u64        ibpam_src_cookie;     /* reflected completion cookie */
+       __u64        ibpam_dst_cookie;     /* opaque completion cookie */
+       kib_rdma_desc_t   ibpam_rd;          /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+       lnet_hdr_t      ibgm_hdr;            /* portals header */
+       __u64        ibgm_cookie;         /* opaque completion cookie */
+       kib_rdma_desc_t   ibgm_rd;            /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+       __u64        ibcm_cookie;         /* opaque completion cookie */
+       __s32        ibcm_status;         /* < 0 failure; >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+       /* First 2 fields fixed FOR ALL TIME */
+       __u32        ibm_magic;     /* I'm an ibnal message */
+       __u16        ibm_version;         /* this is my version number */
+
+       __u8          ibm_type;      /* msg type */
+       __u8          ibm_credits;        /* returned credits */
+       __u32        ibm_nob;         /* # bytes in whole message */
+       __u32        ibm_cksum;     /* checksum (0 == no checksum) */
+       __u64        ibm_srcnid;           /* sender's NID */
+       __u64        ibm_srcstamp;       /* sender's incarnation */
+       __u64        ibm_dstnid;           /* destination's NID */
+       __u64        ibm_dststamp;       /* destination's incarnation */
+
+       union {
+               kib_connparams_t      connparams;
+               kib_immediate_msg_t   immediate;
+               kib_putreq_msg_t      putreq;
+               kib_putack_msg_t      putack;
+               kib_get_msg_t    get;
+               kib_completion_msg_t  completion;
+       } WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC    /* unique magic */
+
+#define IBLND_MSG_VERSION_1    0x11
+#define IBLND_MSG_VERSION_2    0x12
+#define IBLND_MSG_VERSION      IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ      0xc0    /* connection request */
+#define IBLND_MSG_CONNACK      0xc1    /* connection acknowledge */
+#define IBLND_MSG_NOOP         0xd0    /* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE    0xd1    /* immediate */
+#define IBLND_MSG_PUT_REQ      0xd2    /* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK      0xd3    /* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK      0xd4    /* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE     0xd5    /* completion (src->sink) */
+#define IBLND_MSG_GET_REQ      0xd6    /* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE     0xd7    /* completion (src->sink: all OK) */
+
+typedef struct {
+       __u32       ibr_magic;       /* sender's magic */
+       __u16       ibr_version;           /* sender's version */
+       __u8         ibr_why;          /* reject reason */
+       __u8         ibr_padding;          /* padding */
+       __u64       ibr_incarnation;       /* incarnation of peer */
+       kib_connparams_t ibr_cp;                /* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE       1   /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2   /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL           3   /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT   4   /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE      5   /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS      6   /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7   /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx                     /* receive message */
+{
+       struct list_head                rx_list;      /* queue for attention */
+       struct kib_conn   *rx_conn;      /* owning conn */
+       int                    rx_nob;       /* # bytes received (-1 while posted) */
+       enum ib_wc_status        rx_status;    /* completion status */
+       kib_msg_t               *rx_msg;       /* message buffer (host vaddr) */
+       __u64                rx_msgaddr;   /* message buffer (I/O addr) */
+       DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+       struct ib_recv_wr        rx_wrq;       /* receive work item... */
+       struct ib_sge        rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0         /* don't post */
+#define IBLND_POSTRX_NO_CREDIT    1         /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2         /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3         /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx                     /* transmit message */
+{
+       struct list_head                tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+       kib_tx_pool_t       *tx_pool;      /* pool I'm from */
+       struct kib_conn   *tx_conn;      /* owning conn */
+       short                tx_sending;   /* # tx callbacks outstanding */
+       short                tx_queued;    /* queued for sending */
+       short                tx_waiting;   /* waiting for peer */
+       int                    tx_status;    /* LNET completion status */
+       unsigned long        tx_deadline;  /* completion deadline */
+       __u64                tx_cookie;    /* completion cookie */
+       lnet_msg_t             *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+       kib_msg_t               *tx_msg;       /* message buffer (host vaddr) */
+       __u64                tx_msgaddr;   /* message buffer (I/O addr) */
+       DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+       int                    tx_nwrq;      /* # send work items */
+       struct ib_send_wr       *tx_wrq;       /* send work items... */
+       struct ib_sge       *tx_sge;       /* ...and their memory */
+       kib_rdma_desc_t   *tx_rd;       /* rdma descriptor */
+       int                    tx_nfrags;    /* # entries in... */
+       struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+       __u64               *tx_pages;     /* rdma phys page addrs */
+       union {
+               kib_phys_mr_t      *pmr;        /* MR for physical buffer */
+               kib_fmr_t          fmr; /* FMR */
+       }                        tx_u;
+       int                    tx_dmadir;    /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+       /* connection-in-progress variables */
+       kib_msg_t                cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+       struct kib_sched_info *ibc_sched;       /* scheduler information */
+       struct kib_peer     *ibc_peer;    /* owning peer */
+       kib_hca_dev_t       *ibc_hdev;    /* HCA bound on */
+       struct list_head           ibc_list;      /* stash on peer's conn list */
+       struct list_head           ibc_sched_list;    /* schedule for attention */
+       __u16           ibc_version;       /* version of connection */
+       __u64           ibc_incarnation;   /* which instance of the peer */
+       atomic_t         ibc_refcount;      /* # users */
+       int               ibc_state;     /* what's happening */
+       int               ibc_nsends_posted; /* # uncompleted sends */
+       int               ibc_noops_posted;  /* # uncompleted NOOPs */
+       int               ibc_credits;       /* # credits I have */
+       int               ibc_outstanding_credits; /* # credits to return */
+       int               ibc_reserved_credits;/* # ACK/DONE msg credits */
+       int               ibc_comms_error;   /* set on comms error */
+       unsigned int         ibc_nrx:16;        /* receive buffers owned */
+       unsigned int         ibc_scheduled:1;   /* scheduled for attention */
+       unsigned int         ibc_ready:1;       /* CQ callback fired */
+       /* time of last send */
+       unsigned long   ibc_last_send;
+       /** link chain for kiblnd_check_conns only */
+       struct list_head           ibc_connd_list;
+       /** rxs completed before ESTABLISHED */
+       struct list_head           ibc_early_rxs;
+       /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+       struct list_head           ibc_tx_noops;
+       struct list_head           ibc_tx_queue;       /* sends that need a credit */
+       struct list_head           ibc_tx_queue_nocred;/* sends that don't need a credit */
+       struct list_head           ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+       struct list_head           ibc_active_txs;     /* active tx awaiting completion */
+       spinlock_t           ibc_lock;           /* serialise */
+       kib_rx_t            *ibc_rxs;       /* the rx descs */
+       kib_pages_t      *ibc_rx_pages;       /* premapped rx msg pages */
+
+       struct rdma_cm_id   *ibc_cmid;     /* CM id */
+       struct ib_cq    *ibc_cq;             /* completion queue */
+
+       kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT               0         /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2         /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED        3         /* connection established */
+#define IBLND_CONN_CLOSING            4         /* being closed */
+#define IBLND_CONN_DISCONNECTED       5         /* disconnected */
+
+typedef struct kib_peer
+{
+       struct list_head           ibp_list;       /* stash on global peer list */
+       lnet_nid_t         ibp_nid;         /* who's on the other end(s) */
+       lnet_ni_t          *ibp_ni;          /* LNet interface */
+       atomic_t         ibp_refcount;       /* # users */
+       struct list_head           ibp_conns;     /* all active connections */
+       struct list_head           ibp_tx_queue;       /* msgs waiting for a conn */
+       __u16           ibp_version;    /* version of peer */
+       __u64           ibp_incarnation;    /* incarnation of peer */
+       int               ibp_connecting;     /* current active connection attempts */
+       int               ibp_accepting;      /* current passive connection attempts */
+       int               ibp_error;      /* errno on closing this peer */
+       cfs_time_t         ibp_last_alive;     /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t      kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+       LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+       atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+       LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+       if (atomic_dec_and_test(&hdev->ibh_ref))
+               kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+       if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+               return 0;
+
+       if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+               return 0;
+
+       if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+               return 1;
+
+       return dev->ibd_can_failover;
+}
+
+#define kiblnd_conn_addref(conn)                                       \
+do {                                                                   \
+       CDEBUG(D_NET, "conn[%p] (%d)++\n",                              \
+              (conn), atomic_read(&(conn)->ibc_refcount));             \
+       atomic_inc(&(conn)->ibc_refcount);                              \
+} while (0)
+
+#define kiblnd_conn_decref(conn)                                       \
+do {                                                                   \
+       unsigned long flags;                                            \
+                                                                       \
+       CDEBUG(D_NET, "conn[%p] (%d)--\n",                              \
+              (conn), atomic_read(&(conn)->ibc_refcount));             \
+       LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);                      \
+       if (atomic_dec_and_test(&(conn)->ibc_refcount)) {               \
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);  \
+               list_add_tail(&(conn)->ibc_list,                        \
+                                 &kiblnd_data.kib_connd_zombies);      \
+               wake_up(&kiblnd_data.kib_connd_waitq);          \
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+       }                                                               \
+} while (0)
+
+#define kiblnd_peer_addref(peer)                                       \
+do {                                                                   \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                        \
+              (peer), libcfs_nid2str((peer)->ibp_nid),                 \
+              atomic_read(&(peer)->ibp_refcount));                     \
+       atomic_inc(&(peer)->ibp_refcount);                              \
+} while (0)
+
+#define kiblnd_peer_decref(peer)                                       \
+do {                                                                   \
+       CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                        \
+              (peer), libcfs_nid2str((peer)->ibp_nid),                 \
+              atomic_read(&(peer)->ibp_refcount));                     \
+       LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);                      \
+       if (atomic_dec_and_test(&(peer)->ibp_refcount))                 \
+               kiblnd_destroy_peer(peer);                              \
+} while (0)
+
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+       unsigned int hash =
+               ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+
+       return &kiblnd_data.kib_peers[hash];
+}
+
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+       /* Am I in the peer hash table? */
+       return (!list_empty(&peer->ibp_list));
+}
+
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+       LASSERT (!list_empty(&peer->ibp_conns));
+
+       /* just return the first connection */
+       return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+}
+
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+       return (*kiblnd_tunables.kib_keepalive > 0) &&
+               cfs_time_after(jiffies, conn->ibc_last_send +
+                              *kiblnd_tunables.kib_keepalive * HZ);
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       if (conn->ibc_outstanding_credits <
+           IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+           !kiblnd_send_keepalive(conn))
+               return 0; /* No need to send NOOP */
+
+       if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+               if (!list_empty(&conn->ibc_tx_queue_nocred))
+                       return 0; /* NOOP can be piggybacked */
+
+               /* No tx to piggyback NOOP onto or no credit to send a tx */
+               return (list_empty(&conn->ibc_tx_queue) ||
+                       conn->ibc_credits == 0);
+       }
+
+       if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+           !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+           conn->ibc_credits == 0)                 /* no credit */
+               return 0;
+
+       if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+           conn->ibc_outstanding_credits == 0) /* giving back credits */
+               return 0;
+
+       /* No tx to piggyback NOOP onto or no credit to send a tx */
+       return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+       ib_modify_qp(conn->ibc_cmid->qp,
+                    &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+       if (q == &conn->ibc_tx_queue)
+               return "tx_queue";
+
+       if (q == &conn->ibc_tx_queue_rsrvd)
+               return "tx_queue_rsrvd";
+
+       if (q == &conn->ibc_tx_queue_nocred)
+               return "tx_queue_nocred";
+
+       if (q == &conn->ibc_active_txs)
+               return "active_txs";
+
+       LBUG();
+       return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
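+
+/* e.g. a 4-byte-aligned tx descriptor at 0x1000 posted as an RDMA work item
+ * gets wreqid 0x1000 | IBLND_WID_RDMA == 0x1001; kiblnd_wreqid2ptr() and
+ * kiblnd_wreqid2type() below recover the pointer and the type. */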
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+       unsigned long lptr = (unsigned long)ptr;
+
+       LASSERT ((lptr & IBLND_WID_MASK) == 0);
+       LASSERT ((type & ~IBLND_WID_MASK) == 0);
+       return (__u64)(lptr | type);
+}
+
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+       return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
+}
+
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+       return (wreqid & IBLND_WID_MASK);
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+       conn->ibc_state = state;
+       mb();
+}
+
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+       msg->ibm_type = type;
+       msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+       int   i;
+       int   size;
+
+       for (i = size = 0; i < rd->rd_nfrags; i++)
+               size += rd->rd_frags[i].rf_nob;
+
+       return size;
+}
+
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+       return rd->rd_key;
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+       if (nob < rd->rd_frags[index].rf_nob) {
+               rd->rd_frags[index].rf_addr += nob;
+               rd->rd_frags[index].rf_nob  -= nob;
+       } else {
+               index++;
+       }
+
+       return index;
+}
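+
+/* Illustrative: if frag 'index' holds rf_nob == 4096 bytes, consuming
+ * nob == 1024 trims it to its final 3072 bytes and returns the same
+ * index; consuming the full 4096 leaves it alone and returns index + 1. */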
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+       LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+                msgtype == IBLND_MSG_PUT_ACK);
+
+       return msgtype == IBLND_MSG_GET_REQ ?
+              offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+              offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+                                         void *msg, size_t size,
+                                         enum dma_data_direction direction)
+{
+       return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+                                          __u64 addr, size_t size,
+                                         enum dma_data_direction direction)
+{
+       ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a)      (a)
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+                                   struct scatterlist *sg, int nents,
+                                   enum dma_data_direction direction)
+{
+       return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+                                      struct scatterlist *sg, int nents,
+                                      enum dma_data_direction direction)
+{
+       ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+                                         struct scatterlist *sg)
+{
+       return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+                                            struct scatterlist *sg)
+{
+       return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as a writable buffer.  That's not
+ * strictly right, since OFED 1.2 declares it const, so we have to cast
+ * away the const with (void *) to use it. */
+
+#define KIBLND_CONN_PARAM(e)       ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)       ((e)->param.conn.private_data_len)
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+                                   kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+                                __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+                 kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+                        int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+                        kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
+
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+                       struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
+
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+                                     int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+                               int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+                      int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+                        int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+                     int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+                unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644 (file)
index 0000000..cc62321
--- /dev/null
@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+       lnet_msg_t *lntmsg[2];
+       kib_net_t  *net = ni->ni_data;
+       int      rc;
+       int      i;
+
+       LASSERT (net != NULL);
+       LASSERT (!in_interrupt());
+       LASSERT (!tx->tx_queued);              /* mustn't be queued for sending */
+       LASSERT (tx->tx_sending == 0);    /* mustn't be awaiting sent callback */
+       LASSERT (!tx->tx_waiting);            /* mustn't be awaiting peer response */
+       LASSERT (tx->tx_pool != NULL);
+
+       kiblnd_unmap_tx(ni, tx);
+
+       /* tx may have up to 2 lnet msgs to finalise */
+       lntmsg[0] = tx->tx_lntmsg[0];
+       tx->tx_lntmsg[0] = NULL;
+       lntmsg[1] = tx->tx_lntmsg[1];
+       tx->tx_lntmsg[1] = NULL;
+       rc = tx->tx_status;
+
+       if (tx->tx_conn != NULL) {
+               LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+               kiblnd_conn_decref(tx->tx_conn);
+               tx->tx_conn = NULL;
+       }
+
+       tx->tx_nwrq = 0;
+       tx->tx_status = 0;
+
+       kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+       /* delay finalize until my descs have been freed */
+       for (i = 0; i < 2; i++) {
+               if (lntmsg[i] == NULL)
+                       continue;
+
+               lnet_finalize(ni, lntmsg[i], rc);
+       }
+}
+
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+       kib_tx_t *tx;
+
+       while (!list_empty (txlist)) {
+               tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+               list_del(&tx->tx_list);
+               /* complete now */
+               tx->tx_waiting = 0;
+               tx->tx_status = status;
+               kiblnd_tx_done(ni, tx);
+       }
+}
+
+kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+       kib_net_t               *net = (kib_net_t *)ni->ni_data;
+       struct list_head                *node;
+       kib_tx_t                *tx;
+       kib_tx_poolset_t        *tps;
+
+       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+       node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+       if (node == NULL)
+               return NULL;
+       tx = container_of(node, kib_tx_t, tx_list);
+
+       LASSERT (tx->tx_nwrq == 0);
+       LASSERT (!tx->tx_queued);
+       LASSERT (tx->tx_sending == 0);
+       LASSERT (!tx->tx_waiting);
+       LASSERT (tx->tx_status == 0);
+       LASSERT (tx->tx_conn == NULL);
+       LASSERT (tx->tx_lntmsg[0] == NULL);
+       LASSERT (tx->tx_lntmsg[1] == NULL);
+       LASSERT (tx->tx_u.pmr == NULL);
+       LASSERT (tx->tx_nfrags == 0);
+
+       return tx;
+}
+
+void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+       kib_conn_t              *conn   = rx->rx_conn;
+       struct kib_sched_info   *sched  = conn->ibc_sched;
+       unsigned long           flags;
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+       LASSERT(conn->ibc_nrx > 0);
+       conn->ibc_nrx--;
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+       kiblnd_conn_decref(conn);
+}
+
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+       kib_conn_t       *conn = rx->rx_conn;
+       kib_net_t         *net = conn->ibc_peer->ibp_ni->ni_data;
+       struct ib_recv_wr  *bad_wrq = NULL;
+       struct ib_mr       *mr;
+       int              rc;
+
+       LASSERT (net != NULL);
+       LASSERT (!in_interrupt());
+       LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+                credit == IBLND_POSTRX_PEER_CREDIT ||
+                credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+       mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+       LASSERT (mr != NULL);
+
+       rx->rx_sge.lkey   = mr->lkey;
+       rx->rx_sge.addr   = rx->rx_msgaddr;
+       rx->rx_sge.length = IBLND_MSG_SIZE;
+
+       rx->rx_wrq.next = NULL;
+       rx->rx_wrq.sg_list = &rx->rx_sge;
+       rx->rx_wrq.num_sge = 1;
+       rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+       LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+       LASSERT (rx->rx_nob >= 0);            /* not posted */
+
+       if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+               kiblnd_drop_rx(rx);          /* No more posts for this rx */
+               return 0;
+       }
+
+       rx->rx_nob = -1;                        /* flag posted */
+
+       rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+       if (rc != 0) {
+               CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+               rx->rx_nob = 0;
+       }
+
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+               return rc;
+
+       if (rc != 0) {
+               kiblnd_close_conn(conn, rc);
+               kiblnd_drop_rx(rx);          /* No more posts for this rx */
+               return rc;
+       }
+
+       if (credit == IBLND_POSTRX_NO_CREDIT)
+               return 0;
+
+       spin_lock(&conn->ibc_lock);
+       if (credit == IBLND_POSTRX_PEER_CREDIT)
+               conn->ibc_outstanding_credits++;
+       else
+               conn->ibc_reserved_credits++;
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+       return 0;
+}
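+
+/* NB: reposting a receive buffer optionally returns a credit to the
+ * peer: IBLND_POSTRX_PEER_CREDIT bumps ibc_outstanding_credits (to be
+ * piggybacked on the next send), IBLND_POSTRX_RSRVD_CREDIT gives the
+ * credit back to the reserved pool, and IBLND_POSTRX_NO_CREDIT returns
+ * nothing. */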
+
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+       struct list_head   *tmp;
+
+       list_for_each(tmp, &conn->ibc_active_txs) {
+               kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+               LASSERT (!tx->tx_queued);
+               LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+               if (tx->tx_cookie != cookie)
+                       continue;
+
+               if (tx->tx_waiting &&
+                   tx->tx_msg->ibm_type == txtype)
+                       return tx;
+
+               CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                     tx->tx_waiting ? "" : "NOT ",
+                     tx->tx_msg->ibm_type, txtype);
+       }
+       return NULL;
+}
+
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+       kib_tx_t    *tx;
+       lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+       int       idle;
+
+       spin_lock(&conn->ibc_lock);
+
+       tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+       if (tx == NULL) {
+               spin_unlock(&conn->ibc_lock);
+
+               CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                     txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               kiblnd_close_conn(conn, -EPROTO);
+               return;
+       }
+
+       if (tx->tx_status == 0) {              /* success so far */
+               if (status < 0) {              /* failed? */
+                       tx->tx_status = status;
+               } else if (txtype == IBLND_MSG_GET_REQ) {
+                       lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+               }
+       }
+
+       tx->tx_waiting = 0;
+
+       idle = !tx->tx_queued && (tx->tx_sending == 0);
+       if (idle)
+               list_del(&tx->tx_list);
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (idle)
+               kiblnd_tx_done(ni, tx);
+}
+
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+       lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+       kib_tx_t    *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+
+       if (tx == NULL) {
+               CERROR("Can't get tx for completion %x for %s\n",
+                      type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+       }
+
+       tx->tx_msg->ibm_u.completion.ibcm_status = status;
+       tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+       kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+       kiblnd_queue_tx(tx, conn);
+}
+
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+       kib_msg_t    *msg = rx->rx_msg;
+       kib_conn_t   *conn = rx->rx_conn;
+       lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+       int        credits = msg->ibm_credits;
+       kib_tx_t     *tx;
+       int        rc = 0;
+       int        rc2;
+       int        post_credit;
+
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       CDEBUG (D_NET, "Received %x[%d] from %s\n",
+               msg->ibm_type, credits,
+               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+       if (credits != 0) {
+               /* Have I received credits that will let me send? */
+               spin_lock(&conn->ibc_lock);
+
+               if (conn->ibc_credits + credits >
+                   IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+                       rc2 = conn->ibc_credits;
+                       spin_unlock(&conn->ibc_lock);
+
+                       CERROR("Bad credits from %s: %d + %d > %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              rc2, credits,
+                              IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+                       kiblnd_close_conn(conn, -EPROTO);
+                       kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+                       return;
+               }
+
+               conn->ibc_credits += credits;
+
+               /* This ensures the credit taken by NOOP can be returned */
+               if (msg->ibm_type == IBLND_MSG_NOOP &&
+                   !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+                       conn->ibc_outstanding_credits++;
+
+               spin_unlock(&conn->ibc_lock);
+               kiblnd_check_sends(conn);
+       }
+
+       switch (msg->ibm_type) {
+       default:
+               CERROR("Bad IBLND message type %x from %s\n",
+                      msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               post_credit = IBLND_POSTRX_NO_CREDIT;
+               rc = -EPROTO;
+               break;
+
+       case IBLND_MSG_NOOP:
+               if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+                       post_credit = IBLND_POSTRX_NO_CREDIT;
+                       break;
+               }
+
+               if (credits != 0) /* credit already posted */
+                       post_credit = IBLND_POSTRX_NO_CREDIT;
+               else          /* a keepalive NOOP */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_IMMEDIATE:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+                               msg->ibm_srcnid, rx, 0);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_PUT_REQ:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+                               msg->ibm_srcnid, rx, 1);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+               CWARN ("PUT_NACK from %s\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+
+       case IBLND_MSG_PUT_ACK:
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+               spin_lock(&conn->ibc_lock);
+               tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+                                       msg->ibm_u.putack.ibpam_src_cookie);
+               if (tx != NULL)
+                       list_del(&tx->tx_list);
+               spin_unlock(&conn->ibc_lock);
+
+               if (tx == NULL) {
+                       CERROR("Unmatched PUT_ACK from %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       rc = -EPROTO;
+                       break;
+               }
+
+               LASSERT (tx->tx_waiting);
+               /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                * (a) I can overwrite tx_msg since my peer has received it!
+                * (b) tx_waiting set tells tx_complete() it's not done. */
+
+               tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+               rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+                                      kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                      &msg->ibm_u.putack.ibpam_rd,
+                                      msg->ibm_u.putack.ibpam_dst_cookie);
+               if (rc2 < 0)
+                       CERROR("Can't setup rdma for PUT to %s: %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+               spin_lock(&conn->ibc_lock);
+               tx->tx_waiting = 0;     /* clear waiting and queue atomically */
+               kiblnd_queue_tx_locked(tx, conn);
+               spin_unlock(&conn->ibc_lock);
+               break;
+
+       case IBLND_MSG_PUT_DONE:
+               post_credit = IBLND_POSTRX_PEER_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+
+       case IBLND_MSG_GET_REQ:
+               post_credit = IBLND_POSTRX_DONT_POST;
+               rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+                               msg->ibm_srcnid, rx, 1);
+               if (rc < 0)                  /* repost on error */
+                       post_credit = IBLND_POSTRX_PEER_CREDIT;
+               break;
+
+       case IBLND_MSG_GET_DONE:
+               post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+               kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+                                        msg->ibm_u.completion.ibcm_status,
+                                        msg->ibm_u.completion.ibcm_cookie);
+               break;
+       }
+
+       if (rc < 0)                          /* protocol error */
+               kiblnd_close_conn(conn, rc);
+
+       if (post_credit != IBLND_POSTRX_DONT_POST)
+               kiblnd_post_rx(rx, post_credit);
+}
+
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+       kib_msg_t    *msg = rx->rx_msg;
+       kib_conn_t   *conn = rx->rx_conn;
+       lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+       kib_net_t    *net = ni->ni_data;
+       int        rc;
+       int        err = -EIO;
+
+       LASSERT (net != NULL);
+       LASSERT (rx->rx_nob < 0);              /* was posted */
+       rx->rx_nob = 0;                  /* isn't now */
+
+       if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+               goto ignore;
+
+       if (status != IB_WC_SUCCESS) {
+               CNETERR("Rx from %s failed: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+               goto failed;
+       }
+
+       LASSERT (nob >= 0);
+       rx->rx_nob = nob;
+
+       rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+       if (rc != 0) {
+               CERROR ("Error %d unpacking rx from %s\n",
+                       rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               goto failed;
+       }
+
+       if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+           msg->ibm_dstnid != ni->ni_nid ||
+           msg->ibm_srcstamp != conn->ibc_incarnation ||
+           msg->ibm_dststamp != net->ibn_incarnation) {
+               CERROR ("Stale rx from %s\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               err = -ESTALE;
+               goto failed;
+       }
+
+       /* set time last known alive */
+       kiblnd_peer_alive(conn->ibc_peer);
+
+       /* racing with connection establishment/teardown! */
+
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+               rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
+               unsigned long  flags;
+
+               write_lock_irqsave(g_lock, flags);
+               /* must re-check while holding the global lock to
+                * eliminate the race */
+               if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                       list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                       write_unlock_irqrestore(g_lock, flags);
+                       return;
+               }
+               write_unlock_irqrestore(g_lock, flags);
+       }
+       kiblnd_handle_rx(rx);
+       return;
+
+ failed:
+       CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+       kiblnd_close_conn(conn, err);
+ ignore:
+       kiblnd_drop_rx(rx);                  /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+       struct page *page;
+
+       if (vaddr >= VMALLOC_START &&
+           vaddr < VMALLOC_END) {
+               page = vmalloc_to_page ((void *)vaddr);
+               LASSERT (page != NULL);
+               return page;
+       }
+#ifdef CONFIG_HIGHMEM
+       if (vaddr >= PKMAP_BASE &&
+           vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+               /* Highmem pages are only used for bulk (kiov) I/O, so
+                * we should never be asked to translate a highmem
+                * kernel virtual address */
+               CERROR("Can't find page for highmem address %#lx\n",
+                      vaddr);
+               LBUG();
+       }
+#endif
+       page = virt_to_page (vaddr);
+       LASSERT (page != NULL);
+       return page;
+}
+
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+       kib_hca_dev_t           *hdev;
+       __u64                   *pages = tx->tx_pages;
+       kib_fmr_poolset_t       *fps;
+       int                     npages;
+       int                     size;
+       int                     cpt;
+       int                     rc;
+       int                     i;
+
+       LASSERT(tx->tx_pool != NULL);
+       LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+       hdev  = tx->tx_pool->tpo_hdev;
+
+       for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+               for (size = 0; size < rd->rd_frags[i].rf_nob;
+                              size += hdev->ibh_page_size) {
+                       pages[npages++] = (rd->rd_frags[i].rf_addr &
+                                          hdev->ibh_page_mask) + size;
+               }
+       }
+
+       cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+       fps = net->ibn_fmr_ps[cpt];
+       rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+       if (rc != 0) {
+               CERROR ("Can't map %d pages: %d\n", npages, rc);
+               return rc;
+       }
+
+       /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+        * the rkey */
+       rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+                                        tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+       rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+       rd->rd_frags[0].rf_nob   = nob;
+       rd->rd_nfrags = 1;
+
+       return 0;
+}
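+
+/* NB kiblnd_fmr_map_tx() collapses the whole fragment list into one
+ * virtually-contiguous FMR mapping: rd is left with a single frag
+ * whose rf_addr is just the offset into the first page. */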
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+       kib_hca_dev_t           *hdev;
+       kib_pmr_poolset_t       *pps;
+       __u64                   iova;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT(tx->tx_pool != NULL);
+       LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+       hdev = tx->tx_pool->tpo_hdev;
+
+       iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+       cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+       pps = net->ibn_pmr_ps[cpt];
+       rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+       if (rc != 0) {
+               CERROR("Failed to create MR by phybuf: %d\n", rc);
+               return rc;
+       }
+
+       /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+        * the rkey */
+       rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+                                        tx->tx_u.pmr->pmr_mr->lkey;
+       rd->rd_nfrags = 1;
+       rd->rd_frags[0].rf_addr = iova;
+       rd->rd_frags[0].rf_nob  = nob;
+
+       return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+       kib_net_t  *net = ni->ni_data;
+
+       LASSERT(net != NULL);
+
+       if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+               kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+               tx->tx_u.fmr.fmr_pfmr = NULL;
+
+       } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+               kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+               tx->tx_u.pmr = NULL;
+       }
+
+       if (tx->tx_nfrags != 0) {
+               kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+               tx->tx_nfrags = 0;
+       }
+}
+
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+             kib_rdma_desc_t *rd, int nfrags)
+{
+       kib_hca_dev_t      *hdev  = tx->tx_pool->tpo_hdev;
+       kib_net_t         *net   = ni->ni_data;
+       struct ib_mr       *mr    = NULL;
+       __u32          nob;
+       int              i;
+
+       /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+        * RDMA sink */
+       tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+       tx->tx_nfrags = nfrags;
+
+       rd->rd_nfrags =
+               kiblnd_dma_map_sg(hdev->ibh_ibdev,
+                                 tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+
+       for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+               rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+                       hdev->ibh_ibdev, &tx->tx_frags[i]);
+               rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+                       hdev->ibh_ibdev, &tx->tx_frags[i]);
+               nob += rd->rd_frags[i].rf_nob;
+       }
+
+       /* look for a pre-registered DMA MR covering the whole descriptor */
+       mr = kiblnd_find_rd_dma_mr(hdev, rd);
+       if (mr != NULL) {
+               /* found one: no extra mapping needed */
+               rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+               return 0;
+       }
+
+       if (net->ibn_fmr_ps != NULL)
+               return kiblnd_fmr_map_tx(net, tx, rd, nob);
+       else if (net->ibn_pmr_ps != NULL)
+               return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+       return -EINVAL;
+}
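+
+/* NB mapping strategy above: prefer a pre-registered DMA MR covering
+ * the mapped fragments; otherwise fall back to the FMR pool if
+ * configured, then the physical MR (PMR) pool, else fail -EINVAL. */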
+
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                   unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+       kib_net_t         *net = ni->ni_data;
+       struct page     *page;
+       struct scatterlist *sg;
+       unsigned long       vaddr;
+       int              fragnob;
+       int              page_offset;
+
+       LASSERT (nob > 0);
+       LASSERT (niov > 0);
+       LASSERT (net != NULL);
+
+       while (offset >= iov->iov_len) {
+               offset -= iov->iov_len;
+               niov--;
+               iov++;
+               LASSERT (niov > 0);
+       }
+
+       sg = tx->tx_frags;
+       do {
+               LASSERT (niov > 0);
+
+               vaddr = ((unsigned long)iov->iov_base) + offset;
+               page_offset = vaddr & (PAGE_SIZE - 1);
+               page = kiblnd_kvaddr_to_page(vaddr);
+               if (page == NULL) {
+                       CERROR ("Can't find page\n");
+                       return -EFAULT;
+               }
+
+               fragnob = min((int)(iov->iov_len - offset), nob);
+               fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+               sg_set_page(sg, page, fragnob, page_offset);
+               sg++;
+
+               if (offset + fragnob < iov->iov_len) {
+                       offset += fragnob;
+               } else {
+                       offset = 0;
+                       iov++;
+                       niov--;
+               }
+               nob -= fragnob;
+       } while (nob > 0);
+
+       return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                     int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+       kib_net_t         *net = ni->ni_data;
+       struct scatterlist *sg;
+       int              fragnob;
+
+       CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+       LASSERT (nob > 0);
+       LASSERT (nkiov > 0);
+       LASSERT (net != NULL);
+
+       while (offset >= kiov->kiov_len) {
+               offset -= kiov->kiov_len;
+               nkiov--;
+               kiov++;
+               LASSERT (nkiov > 0);
+       }
+
+       sg = tx->tx_frags;
+       do {
+               LASSERT (nkiov > 0);
+
+               fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+               sg_set_page(sg, kiov->kiov_page, fragnob,
+                           kiov->kiov_offset + offset);
+               sg++;
+
+               offset = 0;
+               kiov++;
+               nkiov--;
+               nob -= fragnob;
+       } while (nob > 0);
+
+       return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
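+/* Attempt to post 'tx'.  Called and returns with ibc_lock held, though
+ * the lock may be dropped and retaken internally.  Returns 0 when the
+ * tx was posted (or dropped as a redundant NOOP), -EAGAIN when posting
+ * must wait for credits or completions, and -EIO when the post failed
+ * and the connection has been closed; kiblnd_check_sends() below stops
+ * draining its queues on any non-zero return. */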
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+       kib_msg_t        *msg = tx->tx_msg;
+       kib_peer_t      *peer = conn->ibc_peer;
+       int             ver = conn->ibc_version;
+       int             rc;
+       int             done;
+       struct ib_send_wr *bad_wrq;
+
+       LASSERT (tx->tx_queued);
+       /* We rely on this for QP sizing */
+       LASSERT (tx->tx_nwrq > 0);
+       LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+       LASSERT (credit == 0 || credit == 1);
+       LASSERT (conn->ibc_outstanding_credits >= 0);
+       LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+       LASSERT (conn->ibc_credits >= 0);
+       LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+       if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+               /* tx completions outstanding... */
+               CDEBUG(D_NET, "%s: posted enough\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
+               CDEBUG(D_NET, "%s: no credits\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+           conn->ibc_credits == 1 &&   /* last credit reserved */
+           msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
+               CDEBUG(D_NET, "%s: not using last credit\n",
+                      libcfs_nid2str(peer->ibp_nid));
+               return -EAGAIN;
+       }
+
+       /* NB don't drop ibc_lock before bumping tx_sending */
+       list_del(&tx->tx_list);
+       tx->tx_queued = 0;
+
+       if (msg->ibm_type == IBLND_MSG_NOOP &&
+           (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
+            (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+             conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+               /* OK to drop when posted enough NOOPs, since
+                * kiblnd_check_sends will queue NOOP again when
+                * posted NOOPs complete */
+               spin_unlock(&conn->ibc_lock);
+               kiblnd_tx_done(peer->ibp_ni, tx);
+               spin_lock(&conn->ibc_lock);
+               CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      conn->ibc_noops_posted);
+               return 0;
+       }
+
+       kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+                       peer->ibp_nid, conn->ibc_incarnation);
+
+       conn->ibc_credits -= credit;
+       conn->ibc_outstanding_credits = 0;
+       conn->ibc_nsends_posted++;
+       if (msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted++;
+
+       /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+        * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+        * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+        * and then re-queued here.  It's (just) possible that
+        * tx_sending is non-zero if we've not done the tx_complete()
+        * from the first send; hence the ++ rather than = below. */
+       tx->tx_sending++;
+       list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+       /* I'm still holding ibc_lock! */
+       if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+               rc = -ECONNABORTED;
+       } else if (tx->tx_pool->tpo_pool.po_failed ||
+                conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+               /* close_conn will launch failover */
+               rc = -ENETDOWN;
+       } else {
+               rc = ib_post_send(conn->ibc_cmid->qp,
+                                 tx->tx_wrq, &bad_wrq);
+       }
+
+       conn->ibc_last_send = jiffies;
+
+       if (rc == 0)
+               return 0;
+
+       /* NB credits are transferred in the actual
+        * message, which can only be the last work item */
+       conn->ibc_credits += credit;
+       conn->ibc_outstanding_credits += msg->ibm_credits;
+       conn->ibc_nsends_posted--;
+       if (msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted--;
+
+       tx->tx_status = rc;
+       tx->tx_waiting = 0;
+       tx->tx_sending--;
+
+       done = (tx->tx_sending == 0);
+       if (done)
+               list_del(&tx->tx_list);
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+               CERROR("Error %d posting transmit to %s\n",
+                      rc, libcfs_nid2str(peer->ibp_nid));
+       else
+               CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+                      rc, libcfs_nid2str(peer->ibp_nid));
+
+       kiblnd_close_conn(conn, rc);
+
+       if (done)
+               kiblnd_tx_done(peer->ibp_ni, tx);
+
+       spin_lock(&conn->ibc_lock);
+
+       return -EIO;
+}
+
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+       int     ver = conn->ibc_version;
+       lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+       kib_tx_t  *tx;
+
+       /* Don't send anything until after the connection is established */
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+               CDEBUG(D_NET, "%s too soon\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+       }
+
+       spin_lock(&conn->ibc_lock);
+
+       LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+       LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+                conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+       LASSERT (conn->ibc_reserved_credits >= 0);
+
+       while (conn->ibc_reserved_credits > 0 &&
+              !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+               tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                   kib_tx_t, tx_list);
+               list_del(&tx->tx_list);
+               list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+               conn->ibc_reserved_credits--;
+       }
+
+       if (kiblnd_need_noop(conn)) {
+               spin_unlock(&conn->ibc_lock);
+
+               tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+               if (tx != NULL)
+                       kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+               spin_lock(&conn->ibc_lock);
+               if (tx != NULL)
+                       kiblnd_queue_tx_locked(tx, conn);
+       }
+
+       kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+       for (;;) {
+               int credit;
+
+               if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                       credit = 0;
+                       tx = list_entry(conn->ibc_tx_queue_nocred.next,
+                                           kib_tx_t, tx_list);
+               } else if (!list_empty(&conn->ibc_tx_noops)) {
+                       LASSERT (!IBLND_OOB_CAPABLE(ver));
+                       credit = 1;
+                       tx = list_entry(conn->ibc_tx_noops.next,
+                                       kib_tx_t, tx_list);
+               } else if (!list_empty(&conn->ibc_tx_queue)) {
+                       credit = 1;
+                       tx = list_entry(conn->ibc_tx_queue.next,
+                                           kib_tx_t, tx_list);
+               } else
+                       break;
+
+               if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+                       break;
+       }
+
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+       int        failed = (status != IB_WC_SUCCESS);
+       kib_conn_t   *conn = tx->tx_conn;
+       int        idle;
+
+       LASSERT (tx->tx_sending > 0);
+
+       if (failed) {
+               if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+                       CNETERR("Tx -> %s cookie "LPX64
+                               " sending %d waiting %d: failed %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+                               status);
+
+               kiblnd_close_conn(conn, -EIO);
+       } else {
+               kiblnd_peer_alive(conn->ibc_peer);
+       }
+
+       spin_lock(&conn->ibc_lock);
+
+       /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+        * gets to free it, which also drops its ref on 'conn'. */
+
+       tx->tx_sending--;
+       conn->ibc_nsends_posted--;
+       if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+               conn->ibc_noops_posted--;
+
+       if (failed) {
+               tx->tx_waiting = 0;          /* don't wait for peer */
+               tx->tx_status = -EIO;
+       }
+
+       idle = (tx->tx_sending == 0) &&  /* This is the final callback */
+              !tx->tx_waiting &&              /* Not waiting for peer */
+              !tx->tx_queued;            /* Not re-queued (PUT_DONE) */
+       if (idle)
+               list_del(&tx->tx_list);
+
+       kiblnd_conn_addref(conn);              /* 1 ref for me.... */
+
+       spin_unlock(&conn->ibc_lock);
+
+       if (idle)
+               kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+       kiblnd_check_sends(conn);
+
+       kiblnd_conn_decref(conn);              /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+       kib_hca_dev_t     *hdev = tx->tx_pool->tpo_hdev;
+       struct ib_sge     *sge = &tx->tx_sge[tx->tx_nwrq];
+       struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+       int             nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+       struct ib_mr      *mr;
+
+       LASSERT (tx->tx_nwrq >= 0);
+       LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+       LASSERT (nob <= IBLND_MSG_SIZE);
+
+       kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+       mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+       LASSERT (mr != NULL);
+
+       sge->lkey   = mr->lkey;
+       sge->addr   = tx->tx_msgaddr;
+       sge->length = nob;
+
+       memset(wrq, 0, sizeof(*wrq));
+
+       wrq->next       = NULL;
+       wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+       wrq->sg_list    = sge;
+       wrq->num_sge    = 1;
+       wrq->opcode     = IB_WR_SEND;
+       wrq->send_flags = IB_SEND_SIGNALED;
+
+       tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+                 int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+       kib_msg_t        *ibmsg = tx->tx_msg;
+       kib_rdma_desc_t   *srcrd = tx->tx_rd;
+       struct ib_sge     *sge = &tx->tx_sge[0];
+       struct ib_send_wr *wrq = &tx->tx_wrq[0];
+       int             rc  = resid;
+       int             srcidx;
+       int             dstidx;
+       int             wrknob;
+
+       LASSERT (!in_interrupt());
+       LASSERT (tx->tx_nwrq == 0);
+       LASSERT (type == IBLND_MSG_GET_DONE ||
+                type == IBLND_MSG_PUT_DONE);
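+
+       /* NB both GET_DONE and PUT_DONE move the payload with RDMA_WRITE
+        * work requests posted by the data source; 'dstrd' describes the
+        * peer's sink buffer */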
+
+       srcidx = dstidx = 0;
+
+       while (resid > 0) {
+               if (srcidx >= srcrd->rd_nfrags) {
+                       CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (dstidx == dstrd->rd_nfrags) {
+                       CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+                       CERROR("RDMA too fragmented for %s (%d): "
+                              "%d/%d src %d/%d dst frags\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                              IBLND_RDMA_FRAGS(conn->ibc_version),
+                              srcidx, srcrd->rd_nfrags,
+                              dstidx, dstrd->rd_nfrags);
+                       rc = -EMSGSIZE;
+                       break;
+               }
+
+               wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+                                kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+               sge = &tx->tx_sge[tx->tx_nwrq];
+               sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+               sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+               sge->length = wrknob;
+
+               wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+               wrq->next       = wrq + 1;
+               wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+               wrq->sg_list    = sge;
+               wrq->num_sge    = 1;
+               wrq->opcode     = IB_WR_RDMA_WRITE;
+               wrq->send_flags = 0;
+
+               wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+               wrq->wr.rdma.rkey       = kiblnd_rd_frag_key(dstrd, dstidx);
+
+               srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+               dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+               resid -= wrknob;
+
+               tx->tx_nwrq++;
+               wrq++;
+               sge++;
+       }
+
+       if (rc < 0)                          /* no RDMA if completing with failure */
+               tx->tx_nwrq = 0;
+
+       ibmsg->ibm_u.completion.ibcm_status = rc;
+       ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+       kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+                          type, sizeof (kib_completion_msg_t));
+
+       return rc;
+}
+
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+       struct list_head   *q;
+
+       LASSERT (tx->tx_nwrq > 0);            /* work items set up */
+       LASSERT (!tx->tx_queued);              /* not queued for sending already */
+       LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       tx->tx_queued = 1;
+       tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+       if (tx->tx_conn == NULL) {
+               kiblnd_conn_addref(conn);
+               tx->tx_conn = conn;
+               LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+       } else {
+               /* PUT_DONE first attached to conn as a PUT_REQ */
+               LASSERT (tx->tx_conn == conn);
+               LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+       }
+
+       switch (tx->tx_msg->ibm_type) {
+       default:
+               LBUG();
+
+       case IBLND_MSG_PUT_REQ:
+       case IBLND_MSG_GET_REQ:
+               q = &conn->ibc_tx_queue_rsrvd;
+               break;
+
+       case IBLND_MSG_PUT_NAK:
+       case IBLND_MSG_PUT_ACK:
+       case IBLND_MSG_PUT_DONE:
+       case IBLND_MSG_GET_DONE:
+               q = &conn->ibc_tx_queue_nocred;
+               break;
+
+       case IBLND_MSG_NOOP:
+               if (IBLND_OOB_CAPABLE(conn->ibc_version))
+                       q = &conn->ibc_tx_queue_nocred;
+               else
+                       q = &conn->ibc_tx_noops;
+               break;
+
+       case IBLND_MSG_IMMEDIATE:
+               q = &conn->ibc_tx_queue;
+               break;
+       }
+
+       list_add_tail(&tx->tx_list, q);
+}
+
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+       spin_lock(&conn->ibc_lock);
+       kiblnd_queue_tx_locked(tx, conn);
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+                              struct sockaddr_in *srcaddr,
+                              struct sockaddr_in *dstaddr,
+                              int timeout_ms)
+{
+       unsigned short port;
+       int rc;
+
+       /* allow the port to be reused */
+       rc = rdma_set_reuseaddr(cmid, 1);
+       if (rc != 0) {
+               CERROR("Unable to set reuse on cmid: %d\n", rc);
+               return rc;
+       }
+
+       /* look for a free privileged port */
+       for (port = PROT_SOCK-1; port > 0; port--) {
+               srcaddr->sin_port = htons(port);
+               rc = rdma_resolve_addr(cmid,
+                                      (struct sockaddr *)srcaddr,
+                                      (struct sockaddr *)dstaddr,
+                                      timeout_ms);
+               if (rc == 0) {
+                       CDEBUG(D_NET, "bound to port %hu\n", port);
+                       return 0;
+               } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+                       CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+                              port, rc);
+               } else {
+                       return rc;
+               }
+       }
+
+       CERROR("Failed to bind to a free privileged port\n");
+       return rc;
+}
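+
+/* NB PROT_SOCK is the first non-privileged port (1024), so the scan in
+ * kiblnd_resolve_addr() walks ports 1023 down to 1 looking for one
+ * that binds. */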
+
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+       struct rdma_cm_id *cmid;
+       kib_dev_t        *dev;
+       kib_net_t        *net = peer->ibp_ni->ni_data;
+       struct sockaddr_in srcaddr;
+       struct sockaddr_in dstaddr;
+       int             rc;
+
+       LASSERT (net != NULL);
+       LASSERT (peer->ibp_connecting > 0);
+
+       cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+                                    IB_QPT_RC);
+
+       if (IS_ERR(cmid)) {
+               CERROR("Can't create CMID for %s: %ld\n",
+                      libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+               rc = PTR_ERR(cmid);
+               goto failed;
+       }
+
+       dev = net->ibn_dev;
+       memset(&srcaddr, 0, sizeof(srcaddr));
+       srcaddr.sin_family = AF_INET;
+       srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+       memset(&dstaddr, 0, sizeof(dstaddr));
+       dstaddr.sin_family = AF_INET;
+       dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+       dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+       kiblnd_peer_addref(peer);              /* cmid's ref */
+
+       if (*kiblnd_tunables.kib_use_priv_port) {
+               rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+                                        *kiblnd_tunables.kib_timeout * 1000);
+       } else {
+               rc = rdma_resolve_addr(cmid,
+                                      (struct sockaddr *)&srcaddr,
+                                      (struct sockaddr *)&dstaddr,
+                                      *kiblnd_tunables.kib_timeout * 1000);
+       }
+       if (rc != 0) {
+               /* Can't initiate address resolution */
+               CERROR("Can't resolve addr for %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               goto failed2;
+       }
+
+       LASSERT (cmid->device != NULL);
+       CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+              libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+              HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+       return;
+
+ failed2:
+       kiblnd_peer_decref(peer);              /* cmid's ref */
+       rdma_destroy_id(cmid);
+ failed:
+       kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
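+/* Attach 'tx' (which may be NULL when only a connection attempt is
+ * wanted) to a connection to 'nid'.  Fast path: find an established
+ * conn under the read lock.  Slow path: retake as a write lock and
+ * re-check, since the lock was dropped in between; a brand new peer is
+ * allocated unlocked and checked against the table once more before
+ * being inserted. */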
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+       kib_peer_t      *peer;
+       kib_peer_t      *peer2;
+       kib_conn_t      *conn;
+       rwlock_t        *g_lock = &kiblnd_data.kib_global_lock;
+       unsigned long      flags;
+       int             rc;
+
+       /* If I get here, I've committed to send, so I complete the tx with
+        * failure on any problems */
+
+       LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+       LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
+
+       /* First time, just use a read lock since I expect to find my peer
+        * connected */
+       read_lock_irqsave(g_lock, flags);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+               /* Found a peer with an established connection */
+               conn = kiblnd_get_conn_locked(peer);
+               kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+               read_unlock_irqrestore(g_lock, flags);
+
+               if (tx != NULL)
+                       kiblnd_queue_tx(tx, conn);
+               kiblnd_conn_decref(conn); /* ...to here */
+               return;
+       }
+
+       read_unlock(g_lock);
+       /* Retry with a write lock; NB interrupts stay disabled across
+        * the swap and 'flags' is restored by the matching
+        * write_unlock_irqrestore() below */
+       write_lock(g_lock);
+
+       peer = kiblnd_find_peer_locked(nid);
+       if (peer != NULL) {
+               if (list_empty(&peer->ibp_conns)) {
+                       /* found a peer, but it's still connecting... */
+                       LASSERT (peer->ibp_connecting != 0 ||
+                                peer->ibp_accepting != 0);
+                       if (tx != NULL)
+                               list_add_tail(&tx->tx_list,
+                                                 &peer->ibp_tx_queue);
+                       write_unlock_irqrestore(g_lock, flags);
+               } else {
+                       conn = kiblnd_get_conn_locked(peer);
+                       kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       if (tx != NULL)
+                               kiblnd_queue_tx(tx, conn);
+                       kiblnd_conn_decref(conn); /* ...to here */
+               }
+               return;
+       }
+
+       write_unlock_irqrestore(g_lock, flags);
+
+       /* Allocate a peer ready to add to the peer table and retry */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+               if (tx != NULL) {
+                       tx->tx_status = -EHOSTUNREACH;
+                       tx->tx_waiting = 0;
+                       kiblnd_tx_done(ni, tx);
+               }
+               return;
+       }
+
+       write_lock_irqsave(g_lock, flags);
+
+       peer2 = kiblnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               if (list_empty(&peer2->ibp_conns)) {
+                       /* found a peer, but it's still connecting... */
+                       LASSERT (peer2->ibp_connecting != 0 ||
+                                peer2->ibp_accepting != 0);
+                       if (tx != NULL)
+                               list_add_tail(&tx->tx_list,
+                                             &peer2->ibp_tx_queue);
+                       write_unlock_irqrestore(g_lock, flags);
+               } else {
+                       conn = kiblnd_get_conn_locked(peer2);
+                       kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       if (tx != NULL)
+                               kiblnd_queue_tx(tx, conn);
+                       kiblnd_conn_decref(conn); /* ...to here */
+               }
+
+               kiblnd_peer_decref(peer);
+               return;
+       }
+
+       /* Brand new peer */
+       LASSERT (peer->ibp_connecting == 0);
+       peer->ibp_connecting = 1;
+
+       /* always called with a ref on ni, which prevents ni being shutdown */
+       LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+       if (tx != NULL)
+               list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+       kiblnd_peer_addref(peer);
+       list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+       write_unlock_irqrestore(g_lock, flags);
+
+       kiblnd_connect_peer(peer);
+       kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       lnet_hdr_t        *hdr = &lntmsg->msg_hdr;
+       int                type = lntmsg->msg_type;
+       lnet_process_id_t  target = lntmsg->msg_target;
+       int                target_is_router = lntmsg->msg_target_is_router;
+       int                routing = lntmsg->msg_routing;
+       unsigned int       payload_niov = lntmsg->msg_niov;
+       struct iovec      *payload_iov = lntmsg->msg_iov;
+       lnet_kiov_t       *payload_kiov = lntmsg->msg_kiov;
+       unsigned int       payload_offset = lntmsg->msg_offset;
+       unsigned int       payload_nob = lntmsg->msg_len;
+       kib_msg_t         *ibmsg;
+       kib_tx_t          *tx;
+       int                nob;
+       int                rc;
+
+       /* NB 'private' is different depending on what we're sending.... */
+
+       CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+              payload_nob, payload_niov, libcfs_id2str(target));
+
+       LASSERT (payload_nob == 0 || payload_niov > 0);
+       LASSERT (payload_niov <= LNET_MAX_IOV);
+
+       /* Thread context */
+       LASSERT (!in_interrupt());
+       /* payload is either all vaddrs or all pages */
+       LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
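+       /* Payloads that fit in a single message buffer are sent inline as
+        * IBLND_MSG_IMMEDIATE; larger GET/PUT payloads are described by an
+        * RDMA descriptor instead (the nob <= IBLND_MSG_SIZE checks below). */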
+       switch (type) {
+       default:
+               LBUG();
+               return (-EIO);
+
+       case LNET_MSG_ACK:
+               LASSERT (payload_nob == 0);
+               break;
+
+       case LNET_MSG_GET:
+               if (routing || target_is_router)
+                       break;            /* send IMMEDIATE */
+
+               /* is the REPLY message too small for RDMA? */
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+               if (nob <= IBLND_MSG_SIZE)
+                       break;            /* send IMMEDIATE */
+
+               tx = kiblnd_get_idle_tx(ni, target.nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate txd for GET to %s\n",
+                              libcfs_nid2str(target.nid));
+                       return -ENOMEM;
+               }
+
+               ibmsg = tx->tx_msg;
+
+               if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                       rc = kiblnd_setup_rd_iov(ni, tx,
+                                                &ibmsg->ibm_u.get.ibgm_rd,
+                                                lntmsg->msg_md->md_niov,
+                                                lntmsg->msg_md->md_iov.iov,
+                                                0, lntmsg->msg_md->md_length);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                 &ibmsg->ibm_u.get.ibgm_rd,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.kiov,
+                                                 0, lntmsg->msg_md->md_length);
+               if (rc != 0) {
+                       CERROR("Can't setup GET sink for %s: %d\n",
+                              libcfs_nid2str(target.nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+               ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+               ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+               tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+               if (tx->tx_lntmsg[1] == NULL) {
+                       CERROR("Can't create reply for GET -> %s\n",
+                              libcfs_nid2str(target.nid));
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+               tx->tx_waiting = 1;          /* waiting for GET_DONE */
+               kiblnd_launch_tx(ni, tx, target.nid);
+               return 0;
+
+       case LNET_MSG_REPLY:
+       case LNET_MSG_PUT:
+               /* Is the payload small enough not to need RDMA? */
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+               if (nob <= IBLND_MSG_SIZE)
+                       break;            /* send IMMEDIATE */
+
+               tx = kiblnd_get_idle_tx(ni, target.nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate %s txd for %s\n",
+                              type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                              libcfs_nid2str(target.nid));
+                       return -ENOMEM;
+               }
+
+               if (payload_kiov == NULL)
+                       rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                                payload_niov, payload_iov,
+                                                payload_offset, payload_nob);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                                 payload_niov, payload_kiov,
+                                                 payload_offset, payload_nob);
+               if (rc != 0) {
+                       CERROR("Can't setup PUT src for %s: %d\n",
+                              libcfs_nid2str(target.nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       return -EIO;
+               }
+
+               ibmsg = tx->tx_msg;
+               ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+               ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+               tx->tx_waiting = 1;          /* waiting for PUT_{ACK,NAK} */
+               kiblnd_launch_tx(ni, tx, target.nid);
+               return 0;
+       }
+
+       /* send IMMEDIATE */
+
+       LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                <= IBLND_MSG_SIZE);
+
+       tx = kiblnd_get_idle_tx(ni, target.nid);
+       if (tx == NULL) {
+               CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                       type, libcfs_nid2str(target.nid));
+               return -ENOMEM;
+       }
+
+       ibmsg = tx->tx_msg;
+       ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+       if (payload_kiov != NULL)
+               lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_kiov,
+                                   payload_offset, payload_nob);
+       else
+               lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+                                  offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                  payload_niov, payload_iov,
+                                  payload_offset, payload_nob);
+
+       nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+       kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+       tx->tx_lntmsg[0] = lntmsg;            /* finalise lntmsg on completion */
+       kiblnd_launch_tx(ni, tx, target.nid);
+       return 0;
+}
+
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+       lnet_process_id_t  target = lntmsg->msg_target;
+       unsigned int       niov = lntmsg->msg_niov;
+       struct iovec      *iov = lntmsg->msg_iov;
+       lnet_kiov_t       *kiov = lntmsg->msg_kiov;
+       unsigned int       offset = lntmsg->msg_offset;
+       unsigned int       nob = lntmsg->msg_len;
+       kib_tx_t          *tx;
+       int                rc;
+
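+       /* Respond to a GET_REQ by RDMAing the reply payload straight into
+        * the sink described in the peer's request, completing with
+        * GET_DONE */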
+       tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+       if (tx == NULL) {
+               CERROR("Can't get tx for REPLY to %s\n",
+                      libcfs_nid2str(target.nid));
+               goto failed_0;
+       }
+
+       if (nob == 0)
+               rc = 0;
+       else if (kiov == NULL)
+               rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                        niov, iov, offset, nob);
+       else
+               rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                         niov, kiov, offset, nob);
+
+       if (rc != 0) {
+               CERROR("Can't setup GET src for %s: %d\n",
+                      libcfs_nid2str(target.nid), rc);
+               goto failed_1;
+       }
+
+       rc = kiblnd_init_rdma(rx->rx_conn, tx,
+                             IBLND_MSG_GET_DONE, nob,
+                             &rx->rx_msg->ibm_u.get.ibgm_rd,
+                             rx->rx_msg->ibm_u.get.ibgm_cookie);
+       if (rc < 0) {
+               CERROR("Can't setup rdma for GET from %s: %d\n",
+                      libcfs_nid2str(target.nid), rc);
+               goto failed_1;
+       }
+
+       if (nob == 0) {
+               /* No RDMA: local completion may happen now! */
+               lnet_finalize(ni, lntmsg, 0);
+       } else {
+               /* RDMA: lnet_finalize(lntmsg) when it
+                * completes */
+               tx->tx_lntmsg[0] = lntmsg;
+       }
+
+       kiblnd_queue_tx(tx, rx->rx_conn);
+       return;
+
+ failed_1:
+       kiblnd_tx_done(ni, tx);
+ failed_0:
+       lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+            unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       kib_rx_t    *rx = private;
+       kib_msg_t   *rxmsg = rx->rx_msg;
+       kib_conn_t  *conn = rx->rx_conn;
+       kib_tx_t    *tx;
+       kib_msg_t   *txmsg;
+       int          nob;
+       int          post_credit = IBLND_POSTRX_PEER_CREDIT;
+       int          rc = 0;
+
+       LASSERT (mlen <= rlen);
+       LASSERT (!in_interrupt());
+       /* Either all pages or all vaddrs */
+       LASSERT (!(kiov != NULL && iov != NULL));
+
+       switch (rxmsg->ibm_type) {
+       default:
+               LBUG();
+
+       case IBLND_MSG_IMMEDIATE:
+               nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+               if (nob > rx->rx_nob) {
+                       CERROR ("Immediate message from %s too big: %d(%d)\n",
+                               libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                               nob, rx->rx_nob);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kiov != NULL)
+                       lnet_copy_flat2kiov(niov, kiov, offset,
+                                           IBLND_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+               else
+                       lnet_copy_flat2iov(niov, iov, offset,
+                                          IBLND_MSG_SIZE, rxmsg,
+                                          offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                          mlen);
+               lnet_finalize (ni, lntmsg, 0);
+               break;
+
+       case IBLND_MSG_PUT_REQ:
+               if (mlen == 0) {
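+                       /* no data expected: finalise the PUT now and NAK
+                        * the peer so it doesn't start the RDMA */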
+                       lnet_finalize(ni, lntmsg, 0);
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+                                              rxmsg->ibm_u.putreq.ibprm_cookie);
+                       break;
+               }
+
+               tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+               if (tx == NULL) {
+                       CERROR("Can't allocate tx for %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       /* Not replying will break the connection */
+                       rc = -ENOMEM;
+                       break;
+               }
+
+               txmsg = tx->tx_msg;
+               if (kiov == NULL)
+                       rc = kiblnd_setup_rd_iov(ni, tx,
+                                                &txmsg->ibm_u.putack.ibpam_rd,
+                                                niov, iov, offset, mlen);
+               else
+                       rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 niov, kiov, offset, mlen);
+               if (rc != 0) {
+                       CERROR("Can't setup PUT sink for %s: %d\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                       kiblnd_tx_done(ni, tx);
+                       /* tell peer it's over */
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+                                              rxmsg->ibm_u.putreq.ibprm_cookie);
+                       break;
+               }
+
+               nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+               txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+               txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+               kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+               tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+               tx->tx_waiting = 1;          /* waiting for PUT_DONE */
+               kiblnd_queue_tx(tx, conn);
+
+               /* reposted buffer reserved for PUT_DONE */
+               post_credit = IBLND_POSTRX_NO_CREDIT;
+               break;
+
+       case IBLND_MSG_GET_REQ:
+               if (lntmsg != NULL) {
+                       /* Optimized GET; RDMA lntmsg's payload */
+                       kiblnd_reply(ni, rx, lntmsg);
+               } else {
+                       /* GET didn't match anything */
+                       kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+                                              -ENODATA,
+                                              rxmsg->ibm_u.get.ibgm_cookie);
+               }
+               break;
+       }
+
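+       /* repost the rx descriptor; post_credit says whether a credit is
+        * returned to the peer (the PUT_REQ path keeps its buffer reserved
+        * for the expected PUT_DONE instead) */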
+       kiblnd_post_rx(rx, post_credit);
+       return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+       task_t *task = kthread_run(fn, arg, name);
+
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       atomic_inc(&kiblnd_data.kib_nthreads);
+       return 0;
+}
+
+void
+kiblnd_thread_fini (void)
+{
+       atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+       /* This is racy, but everyone's only writing cfs_time_current() */
+       peer->ibp_last_alive = cfs_time_current();
+       mb();
+}
+
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+       int           error = 0;
+       cfs_time_t    last_alive = 0;
+       unsigned long flags;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (list_empty(&peer->ibp_conns) &&
+           peer->ibp_accepting == 0 &&
+           peer->ibp_connecting == 0 &&
+           peer->ibp_error != 0) {
+               error = peer->ibp_error;
+               peer->ibp_error = 0;
+
+               last_alive = peer->ibp_last_alive;
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (error != 0)
+               lnet_notify(peer->ibp_ni,
+                           peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+       /* This just does the immediate housekeeping.  'error' is zero for a
+        * normal shutdown which can happen only after the connection has been
+        * established.  If the connection is established, schedule the
+        * connection to be finished off by the connd.  Otherwise the connd is
+        * already dealing with it (either to set it up or tear it down).
+        * Caller holds kib_global_lock exclusively in irq context */
+       kib_peer_t    *peer = conn->ibc_peer;
+       kib_dev_t     *dev;
+       unsigned long  flags;
+
+       LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+       if (error != 0 && conn->ibc_comms_error == 0)
+               conn->ibc_comms_error = error;
+
+       if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+               return; /* already being handled  */
+
+       if (error == 0 &&
+           list_empty(&conn->ibc_tx_noops) &&
+           list_empty(&conn->ibc_tx_queue) &&
+           list_empty(&conn->ibc_tx_queue_rsrvd) &&
+           list_empty(&conn->ibc_tx_queue_nocred) &&
+           list_empty(&conn->ibc_active_txs)) {
+               CDEBUG(D_NET, "closing conn to %s\n",
+                      libcfs_nid2str(peer->ibp_nid));
+       } else {
+               CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+                      libcfs_nid2str(peer->ibp_nid), error,
+                      list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                      list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+                      list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+                      list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+                      list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+       }
+
+       dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+       list_del(&conn->ibc_list);
+       /* connd (see below) takes over ibc_list's ref */
+
+       if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+           kiblnd_peer_active(peer)) {  /* still in peer table */
+               kiblnd_unlink_peer_locked(peer);
+
+               /* set/clear error on last conn */
+               peer->ibp_error = conn->ibc_comms_error;
+       }
+
+       kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+       if (error != 0 &&
+           kiblnd_dev_can_failover(dev)) {
+               list_add_tail(&dev->ibd_fail_list,
+                             &kiblnd_data.kib_failed_devs);
+               wake_up(&kiblnd_data.kib_failover_waitq);
+       }
+
+       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+       list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+       wake_up(&kiblnd_data.kib_connd_waitq);
+
+       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+       unsigned long flags;
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_close_conn_locked(conn, error);
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+       unsigned long    flags;
+       kib_rx_t        *rx;
+
+       LASSERT(!in_interrupt());
+       LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
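+       /* take each early rx off the list under the global lock, but
+        * handle it with the lock dropped, re-acquiring to re-check */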
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       while (!list_empty(&conn->ibc_early_rxs)) {
+               rx = list_entry(conn->ibc_early_rxs.next,
+                               kib_rx_t, rx_list);
+               list_del(&rx->rx_list);
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               kiblnd_handle_rx(rx);
+
+               write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       }
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+       LIST_HEAD(zombies);
+       struct list_head *tmp;
+       struct list_head *nxt;
+       kib_tx_t         *tx;
+
+       spin_lock(&conn->ibc_lock);
+
+       list_for_each_safe (tmp, nxt, txs) {
+               tx = list_entry (tmp, kib_tx_t, tx_list);
+
+               if (txs == &conn->ibc_active_txs) {
+                       LASSERT (!tx->tx_queued);
+                       LASSERT (tx->tx_waiting ||
+                                tx->tx_sending != 0);
+               } else {
+                       LASSERT (tx->tx_queued);
+               }
+
+               tx->tx_status = -ECONNABORTED;
+               tx->tx_waiting = 0;
+
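+               /* txs with sends still in flight are completed by the send
+                * completion path once tx_sending drops to zero; only idle
+                * txs can be reaped onto the zombie list here */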
+               if (tx->tx_sending == 0) {
+                       tx->tx_queued = 0;
+                       list_move(&tx->tx_list, &zombies);
+               }
+       }
+
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+       LASSERT (!in_interrupt());
+       LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+       kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+       /* abort_receives moves QP state to IB_QPS_ERR.  This is only required
+        * for connections that didn't get as far as being connected, because
+        * rdma_disconnect() does this for free. */
+       kiblnd_abort_receives(conn);
+
+       /* Complete all tx descs not waiting for sends to complete.
+        * NB we should be safe from RDMA now that the QP has changed state */
+
+       kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+       kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+       kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+       kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+       LIST_HEAD(zombies);
+       unsigned long flags;
+
+       LASSERT (error != 0);
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (active) {
+               LASSERT (peer->ibp_connecting > 0);
+               peer->ibp_connecting--;
+       } else {
+               LASSERT (peer->ibp_accepting > 0);
+               peer->ibp_accepting--;
+       }
+
+       if (peer->ibp_connecting != 0 ||
+           peer->ibp_accepting != 0) {
+               /* another connection attempt under way... */
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                           flags);
+               return;
+       }
+
+       if (list_empty(&peer->ibp_conns)) {
+               /* Take peer's blocked transmits to complete with error */
+               list_add(&zombies, &peer->ibp_tx_queue);
+               list_del_init(&peer->ibp_tx_queue);
+
+               if (kiblnd_peer_active(peer))
+                       kiblnd_unlink_peer_locked(peer);
+
+               peer->ibp_error = error;
+       } else {
+               /* Can't have blocked transmits if there are connections */
+               LASSERT (list_empty(&peer->ibp_tx_queue));
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       kiblnd_peer_notify(peer);
+
+       if (list_empty (&zombies))
+               return;
+
+       CNETERR("Deleting messages for %s: connection failed\n",
+               libcfs_nid2str(peer->ibp_nid));
+
+       kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+       kib_peer_t       *peer = conn->ibc_peer;
+       kib_tx_t         *tx;
+       struct list_head  txs;
+       unsigned long     flags;
+       int               active;
+
+       active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+       CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n",
+              libcfs_nid2str(peer->ibp_nid), active,
+              conn->ibc_version, status);
+
+       LASSERT (!in_interrupt());
+       LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+                 peer->ibp_connecting > 0) ||
+                (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+                 peer->ibp_accepting > 0));
+
+       LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+       conn->ibc_connvars = NULL;
+
+       if (status != 0) {
+               /* failed to establish connection */
+               kiblnd_peer_connect_failed(peer, active, status);
+               kiblnd_finalise_conn(conn);
+               return;
+       }
+
+       /* connection established */
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       conn->ibc_last_send = jiffies;
+       kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+       kiblnd_peer_alive(peer);
+
+       /* Add conn to peer's list and nuke any dangling conns from a different
+        * peer instance... */
+       kiblnd_conn_addref(conn);              /* +1 ref for ibc_list */
+       list_add(&conn->ibc_list, &peer->ibp_conns);
+       if (active)
+               peer->ibp_connecting--;
+       else
+               peer->ibp_accepting--;
+
+       if (peer->ibp_version == 0) {
+               peer->ibp_version     = conn->ibc_version;
+               peer->ibp_incarnation = conn->ibc_incarnation;
+       }
+
+       if (peer->ibp_version     != conn->ibc_version ||
+           peer->ibp_incarnation != conn->ibc_incarnation) {
+               kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+                                               conn->ibc_incarnation);
+               peer->ibp_version     = conn->ibc_version;
+               peer->ibp_incarnation = conn->ibc_incarnation;
+       }
+
+       /* grab pending txs while I have the lock */
+       list_add(&txs, &peer->ibp_tx_queue);
+       list_del_init(&peer->ibp_tx_queue);
+
+       if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
+           conn->ibc_comms_error != 0) {       /* error has happened already */
+               lnet_ni_t *ni = peer->ibp_ni;
+
+               /* start to shut down connection */
+               kiblnd_close_conn_locked(conn, -ECONNABORTED);
+               write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+               kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+               return;
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* Schedule blocked txs */
+       spin_lock(&conn->ibc_lock);
+       while (!list_empty(&txs)) {
+               tx = list_entry(txs.next, kib_tx_t, tx_list);
+               list_del(&tx->tx_list);
+
+               kiblnd_queue_tx_locked(tx, conn);
+       }
+       spin_unlock(&conn->ibc_lock);
+
+       kiblnd_check_sends(conn);
+
+       /* schedule blocked rxs */
+       kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+       int rc;
+
+       rc = rdma_reject(cmid, rej, sizeof(*rej));
+
+       if (rc != 0)
+               CWARN("Error %d sending reject\n", rc);
+}
+
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+       rwlock_t               *g_lock = &kiblnd_data.kib_global_lock;
+       kib_msg_t              *reqmsg = priv;
+       kib_msg_t              *ackmsg;
+       kib_dev_t              *ibdev;
+       kib_peer_t             *peer;
+       kib_peer_t             *peer2;
+       kib_conn_t             *conn;
+       lnet_ni_t              *ni  = NULL;
+       kib_net_t              *net = NULL;
+       lnet_nid_t              nid;
+       struct rdma_conn_param  cp;
+       kib_rej_t               rej;
+       int                     version = IBLND_MSG_VERSION;
+       unsigned long           flags;
+       int                     rc;
+       struct sockaddr_in     *peer_addr;
+
+       LASSERT (!in_interrupt());
+
+       /* cmid inherits 'context' from the corresponding listener id */
+       ibdev = (kib_dev_t *)cmid->context;
+       LASSERT (ibdev != NULL);
+
+       memset(&rej, 0, sizeof(rej));
+       rej.ibr_magic           = IBLND_MSG_MAGIC;
+       rej.ibr_why               = IBLND_REJECT_FATAL;
+       rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+       peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+       if (*kiblnd_tunables.kib_require_priv_port &&
+           ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+               __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+               CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+                      HIPQUAD(ip), ntohs(peer_addr->sin_port));
+               goto failed;
+       }
+
+       if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+               CERROR("Short connection request\n");
+               goto failed;
+       }
+
+       /* Future protocol version compatibility support!  If the
+        * o2iblnd-specific protocol changes, or when LNET unifies
+        * protocols over all LNDs, the initial connection will
+        * negotiate a protocol version.  I trap this here to avoid
+        * console errors; the reject tells the peer which protocol I
+        * speak. */
+       if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+           reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+               goto failed;
+       if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+               goto failed;
+       if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+               goto failed;
+
+       rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+       if (rc != 0) {
+               CERROR("Can't parse connection request: %d\n", rc);
+               goto failed;
+       }
+
+       nid = reqmsg->ibm_srcnid;
+       ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+       if (ni != NULL) {
+               net = (kib_net_t *)ni->ni_data;
+               rej.ibr_incarnation = net->ibn_incarnation;
+       }
+
+       if (ni == NULL ||                        /* no matching net */
+           ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+           net->ibn_dev != ibdev) {          /* wrong device */
+               CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+                      "bad dst nid %s\n", libcfs_nid2str(nid),
+                      ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+                      ibdev->ibd_ifname, ibdev->ibd_nnets,
+                      HIPQUAD(ibdev->ibd_ifip),
+                      libcfs_nid2str(reqmsg->ibm_dstnid));
+
+               goto failed;
+       }
+
+       /* check time stamp as soon as possible */
+       if (reqmsg->ibm_dststamp != 0 &&
+           reqmsg->ibm_dststamp != net->ibn_incarnation) {
+               CWARN("Stale connection request\n");
+               rej.ibr_why = IBLND_REJECT_CONN_STALE;
+               goto failed;
+       }
+
+       /* I can accept peer's version */
+       version = reqmsg->ibm_version;
+
+       if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+               CERROR("Unexpected connreq msg type: %x from %s\n",
+                      reqmsg->ibm_type, libcfs_nid2str(nid));
+               goto failed;
+       }
+
+       if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+           IBLND_MSG_QUEUE_SIZE(version)) {
+               CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+                      libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+                      IBLND_MSG_QUEUE_SIZE(version));
+
+               if (version == IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+               goto failed;
+       }
+
+       if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+           IBLND_RDMA_FRAGS(version)) {
+               CERROR("Can't accept %s(version %x): "
+                      "incompatible max_frags %d (%d wanted)\n",
+                      libcfs_nid2str(nid), version,
+                      reqmsg->ibm_u.connparams.ibcp_max_frags,
+                      IBLND_RDMA_FRAGS(version));
+
+               if (version == IBLND_MSG_VERSION)
+                       rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+               goto failed;
+
+       }
+
+       if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+               CERROR("Can't accept %s: message size %d too big (%d max)\n",
+                      libcfs_nid2str(nid),
+                      reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+                      IBLND_MSG_SIZE);
+               goto failed;
+       }
+
+       /* assume 'nid' is a new peer; create one */
+       rc = kiblnd_create_peer(ni, &peer, nid);
+       if (rc != 0) {
+               CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+               rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+               goto failed;
+       }
+
+       write_lock_irqsave(g_lock, flags);
+
+       peer2 = kiblnd_find_peer_locked(nid);
+       if (peer2 != NULL) {
+               if (peer2->ibp_version == 0) {
+                       peer2->ibp_version     = version;
+                       peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+               }
+
+               /* not the guy I've talked with */
+               if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+                   peer2->ibp_version     != version) {
+                       kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+                             libcfs_nid2str(nid), peer2->ibp_version, version);
+
+                       kiblnd_peer_decref(peer);
+                       rej.ibr_why = IBLND_REJECT_CONN_STALE;
+                       goto failed;
+               }
+
+               /* tie-break connection race in favour of the higher NID */
+               if (peer2->ibp_connecting != 0 &&
+                   nid < ni->ni_nid) {
+                       write_unlock_irqrestore(g_lock, flags);
+
+                       CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+                       kiblnd_peer_decref(peer);
+                       rej.ibr_why = IBLND_REJECT_CONN_RACE;
+                       goto failed;
+               }
+
+               peer2->ibp_accepting++;
+               kiblnd_peer_addref(peer2);
+
+               write_unlock_irqrestore(g_lock, flags);
+               kiblnd_peer_decref(peer);
+               peer = peer2;
+       } else {
+               /* Brand new peer */
+               LASSERT (peer->ibp_accepting == 0);
+               LASSERT (peer->ibp_version == 0 &&
+                        peer->ibp_incarnation == 0);
+
+               peer->ibp_accepting   = 1;
+               peer->ibp_version     = version;
+               peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+               /* I have a ref on ni that prevents it being shutdown */
+               LASSERT (net->ibn_shutdown == 0);
+
+               kiblnd_peer_addref(peer);
+               list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+               write_unlock_irqrestore(g_lock, flags);
+       }
+
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+       if (conn == NULL) {
+               kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+               kiblnd_peer_decref(peer);
+               rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+               goto failed;
+       }
+
+       /* conn now "owns" cmid, so I return success from here on to ensure the
+        * CM callback doesn't destroy cmid. */
+
+       conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+       conn->ibc_credits         = IBLND_MSG_QUEUE_SIZE(version);
+       conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+       LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+                <= IBLND_RX_MSGS(version));
+
+       ackmsg = &conn->ibc_connvars->cv_msg;
+       memset(ackmsg, 0, sizeof(*ackmsg));
+
+       kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+                       sizeof(ackmsg->ibm_u.connparams));
+       ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+       ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+       ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+
+       kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+       memset(&cp, 0, sizeof(cp));
+       cp.private_data = ackmsg;
+       cp.private_data_len    = ackmsg->ibm_nob;
+       cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
+       cp.initiator_depth     = 0;
+       cp.flow_control = 1;
+       cp.retry_count   = *kiblnd_tunables.kib_retry_count;
+       cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+       CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+       rc = rdma_accept(cmid, &cp);
+       if (rc != 0) {
+               CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+               rej.ibr_version = version;
+               rej.ibr_why     = IBLND_REJECT_FATAL;
+
+               kiblnd_reject(cmid, &rej);
+               kiblnd_connreq_done(conn, rc);
+               kiblnd_conn_decref(conn);
+       }
+
+       lnet_ni_decref(ni);
+       return 0;
+
+ failed:
+       if (ni != NULL)
+               lnet_ni_decref(ni);
+
+       rej.ibr_version = version;
+       rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+       rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
+       kiblnd_reject(cmid, &rej);
+
+       return -ECONNREFUSED;
+}
+
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+                 __u64 incarnation, int why, kib_connparams_t *cp)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+       char          *reason;
+       int            retry = 0;
+       unsigned long  flags;
+
+       LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+       LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       /* retry connection if it's still needed and no other connection
+        * attempts (active or passive) are in progress
+        * NB: reconnect is still needed even when ibp_tx_queue is
+        * empty if ibp_version != version because reconnect may be
+        * initiated by kiblnd_query() */
+       if ((!list_empty(&peer->ibp_tx_queue) ||
+            peer->ibp_version != version) &&
+           peer->ibp_connecting == 1 &&
+           peer->ibp_accepting == 0) {
+               retry = 1;
+               peer->ibp_connecting++;
+
+               peer->ibp_version     = version;
+               peer->ibp_incarnation = incarnation;
+       }
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (!retry)
+               return;
+
+       switch (why) {
+       default:
+               reason = "Unknown";
+               break;
+
+       case IBLND_REJECT_CONN_STALE:
+               reason = "stale";
+               break;
+
+       case IBLND_REJECT_CONN_RACE:
+               reason = "conn race";
+               break;
+
+       case IBLND_REJECT_CONN_UNCOMPAT:
+               reason = "version negotiation";
+               break;
+       }
+
+       CNETERR("%s: retrying (%s), %x, %x, "
+               "queue_dep: %d, max_frag: %d, msg_size: %d\n",
+               libcfs_nid2str(peer->ibp_nid),
+               reason, IBLND_MSG_VERSION, version,
+               cp != NULL ? cp->ibcp_queue_depth  : IBLND_MSG_QUEUE_SIZE(version),
+               cp != NULL ? cp->ibcp_max_frags    : IBLND_RDMA_FRAGS(version),
+               cp != NULL ? cp->ibcp_max_msg_size : IBLND_MSG_SIZE);
+
+       kiblnd_connect_peer(peer);
+}
+
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+
+       LASSERT (!in_interrupt());
+       LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+       switch (reason) {
+       case IB_CM_REJ_STALE_CONN:
+               kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+                                IBLND_REJECT_CONN_STALE, NULL);
+               break;
+
+       case IB_CM_REJ_INVALID_SERVICE_ID:
+               CNETERR("%s rejected: no listener at %d\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       *kiblnd_tunables.kib_service);
+               break;
+
+       case IB_CM_REJ_CONSUMER_DEFINED:
+               if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+                       kib_rej_t        *rej = priv;
+                       kib_connparams_t *cp  = NULL;
+                       int               flip = 0;
+                       __u64             incarnation = -1;
+
+                       /* NB: the default incarnation is -1 because:
+                        * a) V1 ignores the dst incarnation in the connreq.
+                        * b) V2 supplies an incarnation when rejecting me,
+                        *    which overwrites the -1.
+                        *
+                        * If I connect to a V1 peer with the V2 protocol and
+                        * it rejects me, then upgrades to V2, I know nothing
+                        * about the upgrade and reconnect with V1; the
+                        * upgraded V2 peer can then tell I'm talking the old
+                        * protocol and reject me (incarnation is -1).
+                        */
+
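+                       /* the private reject data carries the peer's
+                        * kib_rej_t; byte-swap it if the peer's endianness
+                        * differs from ours (detected via the magic) */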
+                       if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+                           rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+                               __swab32s(&rej->ibr_magic);
+                               __swab16s(&rej->ibr_version);
+                               flip = 1;
+                       }
+
+                       if (priv_nob >= sizeof(kib_rej_t) &&
+                           rej->ibr_version > IBLND_MSG_VERSION_1) {
+                               /* priv_nob is always 148 in the current
+                                * version of OFED (see the definition of
+                                * IB_CM_REJ_PRIVATE_DATA_SIZE), so we still
+                                * need to check the version. */
+                               cp = &rej->ibr_cp;
+
+                               if (flip) {
+                                       __swab64s(&rej->ibr_incarnation);
+                                       __swab16s(&cp->ibcp_queue_depth);
+                                       __swab16s(&cp->ibcp_max_frags);
+                                       __swab32s(&cp->ibcp_max_msg_size);
+                               }
+
+                               incarnation = rej->ibr_incarnation;
+                       }
+
+                       if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+                           rej->ibr_magic != LNET_PROTO_MAGIC) {
+                               CERROR("%s rejected: consumer defined fatal error\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+                       }
+
+                       if (rej->ibr_version != IBLND_MSG_VERSION &&
+                           rej->ibr_version != IBLND_MSG_VERSION_1) {
+                               CERROR("%s rejected: o2iblnd version %x error\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      rej->ibr_version);
+                               break;
+                       }
+
+                       if (rej->ibr_why     == IBLND_REJECT_FATAL &&
+                           rej->ibr_version == IBLND_MSG_VERSION_1) {
+                               CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+                                      libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+                               if (conn->ibc_version != IBLND_MSG_VERSION_1)
+                                       rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+                       }
+
+                       switch (rej->ibr_why) {
+                       case IBLND_REJECT_CONN_RACE:
+                       case IBLND_REJECT_CONN_STALE:
+                       case IBLND_REJECT_CONN_UNCOMPAT:
+                               kiblnd_reconnect(conn, rej->ibr_version,
+                                                incarnation, rej->ibr_why, cp);
+                               break;
+
+                       case IBLND_REJECT_MSG_QUEUE_SIZE:
+                               CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+                                      libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth,
+                                      IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+                               break;
+
+                       case IBLND_REJECT_RDMA_FRAGS:
+                               CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+                                      libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags,
+                                      IBLND_RDMA_FRAGS(conn->ibc_version));
+                               break;
+
+                       case IBLND_REJECT_NO_RESOURCES:
+                               CERROR("%s rejected: o2iblnd no resources\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+
+                       case IBLND_REJECT_FATAL:
+                               CERROR("%s rejected: o2iblnd fatal error\n",
+                                      libcfs_nid2str(peer->ibp_nid));
+                               break;
+
+                       default:
+                               CERROR("%s rejected: o2iblnd reason %d\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      rej->ibr_why);
+                               break;
+                       }
+                       break;
+               }
+               /* fall through */
+       default:
+               CNETERR("%s rejected: reason %d, size %d\n",
+                       libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+               break;
+       }
+
+       kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+       kib_peer_t    *peer = conn->ibc_peer;
+       lnet_ni_t     *ni   = peer->ibp_ni;
+       kib_net_t     *net  = ni->ni_data;
+       kib_msg_t     *msg  = priv;
+       int            ver  = conn->ibc_version;
+       int            rc   = kiblnd_unpack_msg(msg, priv_nob);
+       unsigned long  flags;
+
+       LASSERT (net != NULL);
+
+       if (rc != 0) {
+               CERROR("Can't unpack connack from %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               goto failed;
+       }
+
+       if (msg->ibm_type != IBLND_MSG_CONNACK) {
+               CERROR("Unexpected message %d from %s\n",
+                      msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (ver != msg->ibm_version) {
+               CERROR("%s replied with version %x, which differs from "
+                      "the requested version %x\n",
+                      libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_queue_depth !=
+           IBLND_MSG_QUEUE_SIZE(ver)) {
+               CERROR("%s has incompatible queue depth %d (%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_queue_depth,
+                      IBLND_MSG_QUEUE_SIZE(ver));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_max_frags !=
+           IBLND_RDMA_FRAGS(ver)) {
+               CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_max_frags,
+                      IBLND_RDMA_FRAGS(ver));
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+               CERROR("%s max message size %d too big (%d max)\n",
+                      libcfs_nid2str(peer->ibp_nid),
+                      msg->ibm_u.connparams.ibcp_max_msg_size,
+                      IBLND_MSG_SIZE);
+               rc = -EPROTO;
+               goto failed;
+       }
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       if (msg->ibm_dstnid == ni->ni_nid &&
+           msg->ibm_dststamp == net->ibn_incarnation)
+               rc = 0;
+       else
+               rc = -ESTALE;
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (rc != 0) {
+               CERROR("Bad connection reply from %s, rc = %d, "
+                      "version: %x, max_frags: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc,
+                      msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
+               goto failed;
+       }
+
+       conn->ibc_incarnation      = msg->ibm_srcstamp;
+       conn->ibc_credits         =
+       conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+       LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+                <= IBLND_RX_MSGS(ver));
+
+       kiblnd_connreq_done(conn, 0);
+       return;
+
+ failed:
+       /* NB My QP has already established itself, so I handle anything going
+        * wrong here by setting ibc_comms_error.
+        * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+        * immediately tears it down. */
+
+       LASSERT (rc != 0);
+       conn->ibc_comms_error = rc;
+       kiblnd_connreq_done(conn, 0);
+}
+
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+       kib_peer_t             *peer = (kib_peer_t *)cmid->context;
+       kib_conn_t             *conn;
+       kib_msg_t              *msg;
+       struct rdma_conn_param  cp;
+       int                     version;
+       __u64                   incarnation;
+       unsigned long           flags;
+       int                     rc;
+
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       incarnation = peer->ibp_incarnation;
+       version     = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
+                                                peer->ibp_version;
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+       if (conn == NULL) {
+               kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+               kiblnd_peer_decref(peer); /* lose cmid's ref */
+               return -ENOMEM;
+       }
+
+       /* conn "owns" cmid now, so I return success from here on to ensure the
+        * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+        * on peer */
+
+       msg = &conn->ibc_connvars->cv_msg;
+
+       memset(msg, 0, sizeof(*msg));
+       kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+       msg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+       msg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+       msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+       kiblnd_pack_msg(peer->ibp_ni, msg, version,
+                       0, peer->ibp_nid, incarnation);
+
+       memset(&cp, 0, sizeof(cp));
+       cp.private_data = msg;
+       cp.private_data_len    = msg->ibm_nob;
+       cp.responder_resources = 0;          /* No atomic ops or RDMA reads */
+       cp.initiator_depth     = 0;
+       cp.flow_control = 1;
+       cp.retry_count   = *kiblnd_tunables.kib_retry_count;
+       cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+       LASSERT(cmid->context == (void *)conn);
+       LASSERT(conn->ibc_cmid == cmid);
+
+       rc = rdma_connect(cmid, &cp);
+       if (rc != 0) {
+               CERROR("Can't connect to %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), rc);
+               kiblnd_connreq_done(conn, rc);
+               kiblnd_conn_decref(conn);
+       }
+
+       return 0;
+}
+
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+       kib_peer_t  *peer;
+       kib_conn_t  *conn;
+       int          rc;
+
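+       /* NB: a non-zero return from this callback tells the RDMA CM to
+        * destroy cmid -- see the "rc != 0 destroys cmid" notes below */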
+       switch (event->event) {
+       default:
+               CERROR("Unexpected event: %d, status: %d\n",
+                      event->event, event->status);
+               LBUG();
+
+       case RDMA_CM_EVENT_CONNECT_REQUEST:
+               /* destroy cmid on failure */
+               rc = kiblnd_passive_connect(cmid,
+                                           (void *)KIBLND_CONN_PARAM(event),
+                                           KIBLND_CONN_PARAM_LEN(event));
+               CDEBUG(D_NET, "connreq: %d\n", rc);
+               return rc;
+
+       case RDMA_CM_EVENT_ADDR_ERROR:
+               peer = (kib_peer_t *)cmid->context;
+               CNETERR("%s: ADDR ERROR %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+               kiblnd_peer_decref(peer);
+               return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ADDR_RESOLVED:
+               peer = (kib_peer_t *)cmid->context;
+
+               CDEBUG(D_NET,"%s Addr resolved: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+
+               if (event->status != 0) {
+                       CNETERR("Can't resolve address for %s: %d\n",
+                               libcfs_nid2str(peer->ibp_nid), event->status);
+                       rc = event->status;
+               } else {
+                       rc = rdma_resolve_route(
+                               cmid, *kiblnd_tunables.kib_timeout * 1000);
+                       if (rc == 0)
+                               return 0;
+                       /* Can't initiate route resolution */
+                       CERROR("Can't resolve route for %s: %d\n",
+                              libcfs_nid2str(peer->ibp_nid), rc);
+               }
+               kiblnd_peer_connect_failed(peer, 1, rc);
+               kiblnd_peer_decref(peer);
+               return rc;                    /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ROUTE_ERROR:
+               peer = (kib_peer_t *)cmid->context;
+               CNETERR("%s: ROUTE ERROR %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+               kiblnd_peer_decref(peer);
+               return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_ROUTE_RESOLVED:
+               peer = (kib_peer_t *)cmid->context;
+               CDEBUG(D_NET,"%s Route resolved: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+
+               if (event->status == 0)
+                       return kiblnd_active_connect(cmid);
+
+               CNETERR("Can't resolve route for %s: %d\n",
+                      libcfs_nid2str(peer->ibp_nid), event->status);
+               kiblnd_peer_connect_failed(peer, 1, event->status);
+               kiblnd_peer_decref(peer);
+               return event->status;      /* rc != 0 destroys cmid */
+
+       case RDMA_CM_EVENT_UNREACHABLE:
+               conn = (kib_conn_t *)cmid->context;
+               LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                       conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+               CNETERR("%s: UNREACHABLE %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+               kiblnd_connreq_done(conn, -ENETDOWN);
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_CONNECT_ERROR:
+               conn = (kib_conn_t *)cmid->context;
+               LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                       conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+               CNETERR("%s: CONNECT ERROR %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+               kiblnd_connreq_done(conn, -ENOTCONN);
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_REJECTED:
+               conn = (kib_conn_t *)cmid->context;
+               switch (conn->ibc_state) {
+               default:
+                       LBUG();
+
+               case IBLND_CONN_PASSIVE_WAIT:
+                       CERROR ("%s: REJECTED %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               event->status);
+                       kiblnd_connreq_done(conn, -ECONNRESET);
+                       break;
+
+               case IBLND_CONN_ACTIVE_CONNECT:
+                       kiblnd_rejected(conn, event->status,
+                                       (void *)KIBLND_CONN_PARAM(event),
+                                       KIBLND_CONN_PARAM_LEN(event));
+                       break;
+               }
+               kiblnd_conn_decref(conn);
+               return 0;
+
+       case RDMA_CM_EVENT_ESTABLISHED:
+               conn = (kib_conn_t *)cmid->context;
+               switch (conn->ibc_state) {
+               default:
+                       LBUG();
+
+               case IBLND_CONN_PASSIVE_WAIT:
+                       CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_connreq_done(conn, 0);
+                       break;
+
+               case IBLND_CONN_ACTIVE_CONNECT:
+                       CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_check_connreply(conn,
+                                              (void *)KIBLND_CONN_PARAM(event),
+                                              KIBLND_CONN_PARAM_LEN(event));
+                       break;
+               }
+               /* net keeps its ref on conn! */
+               return 0;
+
+       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+               CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+               return 0;
+       case RDMA_CM_EVENT_DISCONNECTED:
+               conn = (kib_conn_t *)cmid->context;
+               if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                       CERROR("%s DISCONNECTED\n",
+                              libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                       kiblnd_connreq_done(conn, -ECONNRESET);
+               } else {
+                       kiblnd_close_conn(conn, 0);
+               }
+               kiblnd_conn_decref(conn);
+               cmid->context = NULL;
+               return 0;
+
+       case RDMA_CM_EVENT_DEVICE_REMOVAL:
+               LCONSOLE_ERROR_MSG(0x131,
+                                  "Received notification of device removal\n"
+                                  "Please shutdown LNET to allow this to proceed\n");
+               /* Can't remove network from underneath LNET for now, so I have
+                * to ignore this */
+               return 0;
+
+       case RDMA_CM_EVENT_ADDR_CHANGE:
+               LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+               return 0;
+       }
+}
+
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+       kib_tx_t          *tx;
+       struct list_head        *ttmp;
+
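+       /* txs on the pending queues must still be flagged tx_queued; txs
+        * on the active list have been posted and are either still sending
+        * or awaiting a reply, as the assertions below check. */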
+       list_for_each (ttmp, txs) {
+               tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+               if (txs != &conn->ibc_active_txs) {
+                       LASSERT (tx->tx_queued);
+               } else {
+                       LASSERT (!tx->tx_queued);
+                       LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+               }
+
+               if (cfs_time_aftereq (jiffies, tx->tx_deadline)) {
+                       CERROR("Timed out tx: %s, %lu seconds\n",
+                              kiblnd_queue2str(conn, txs),
+                              cfs_duration_sec(jiffies - tx->tx_deadline));
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+       return  kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+               kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
+}
+
+void
+kiblnd_check_conns (int idx)
+{
+       LIST_HEAD (closes);
+       LIST_HEAD (checksends);
+       struct list_head    *peers = &kiblnd_data.kib_peers[idx];
+       struct list_head    *ptmp;
+       kib_peer_t    *peer;
+       kib_conn_t    *conn;
+       struct list_head    *ctmp;
+       unsigned long  flags;
+
+       /* NB. We expect to have a look at all the peers and not find any
+        * RDMAs to time out, so we just use a shared lock while we
+        * take a look... */
+       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       list_for_each (ptmp, peers) {
+               peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+               list_for_each (ctmp, &peer->ibp_conns) {
+                       int timedout;
+                       int sendnoop;
+
+                       conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                       LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+                       spin_lock(&conn->ibc_lock);
+
+                       sendnoop = kiblnd_need_noop(conn);
+                       timedout = kiblnd_conn_timed_out_locked(conn);
+                       if (!sendnoop && !timedout) {
+                               spin_unlock(&conn->ibc_lock);
+                               continue;
+                       }
+
+                       if (timedout) {
+                               CERROR("Timed out RDMA with %s (%lu): "
+                                      "c: %u, oc: %u, rc: %u\n",
+                                      libcfs_nid2str(peer->ibp_nid),
+                                      cfs_duration_sec(cfs_time_current() -
+                                                       peer->ibp_last_alive),
+                                      conn->ibc_credits,
+                                      conn->ibc_outstanding_credits,
+                                      conn->ibc_reserved_credits);
+                               list_add(&conn->ibc_connd_list, &closes);
+                       } else {
+                               list_add(&conn->ibc_connd_list,
+                                            &checksends);
+                       }
+                       /* +ref for 'closes' or 'checksends' */
+                       kiblnd_conn_addref(conn);
+
+                       spin_unlock(&conn->ibc_lock);
+               }
+       }
+
+       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       /* Handle timeout by closing the whole
+        * connection. We can only be sure RDMA activity
+        * has ceased once the QP has been modified. */
+       while (!list_empty(&closes)) {
+               conn = list_entry(closes.next,
+                                     kib_conn_t, ibc_connd_list);
+               list_del(&conn->ibc_connd_list);
+               kiblnd_close_conn(conn, -ETIMEDOUT);
+               kiblnd_conn_decref(conn);
+       }
+
+       /* In case we have enough credits to return via a
+        * NOOP, but there were no non-blocking tx descs
+        * free to do it last time... */
+       while (!list_empty(&checksends)) {
+               conn = list_entry(checksends.next,
+                                     kib_conn_t, ibc_connd_list);
+               list_del(&conn->ibc_connd_list);
+               kiblnd_check_sends(conn);
+               kiblnd_conn_decref(conn);
+       }
+}
+
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+       LASSERT (!in_interrupt());
+       LASSERT (current == kiblnd_data.kib_connd);
+       LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+       rdma_disconnect(conn->ibc_cmid);
+       kiblnd_finalise_conn(conn);
+
+       kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int
+kiblnd_connd (void *arg)
+{
+       wait_queue_t     wait;
+       unsigned long    flags;
+       kib_conn_t      *conn;
+       int              timeout;
+       int              i;
+       int              dropped_lock;
+       int              peer_index = 0;
+       unsigned long    deadline = jiffies;
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current (&wait);
+       kiblnd_data.kib_connd = current;
+
+       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+
+               dropped_lock = 0;
+
+               if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+                       conn = list_entry(kiblnd_data.kib_connd_zombies.next,
+                                             kib_conn_t, ibc_list);
+                       list_del(&conn->ibc_list);
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+                                              flags);
+                       dropped_lock = 1;
+
+                       kiblnd_destroy_conn(conn);
+
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+                       conn = list_entry(kiblnd_data.kib_connd_conns.next,
+                                             kib_conn_t, ibc_list);
+                       list_del(&conn->ibc_list);
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+                                              flags);
+                       dropped_lock = 1;
+
+                       kiblnd_disconnect_conn(conn);
+                       kiblnd_conn_decref(conn);
+
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               /* careful with the jiffy wrap... */
+               timeout = (int)(deadline - jiffies);
+               if (timeout <= 0) {
+                       const int n = 4;
+                       const int p = 1;
+                       int       chunk = kiblnd_data.kib_peer_hash_size;
+
+                       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+                       dropped_lock = 1;
+
+                       /* Time to check for RDMA timeouts on a few more
+                        * peers: I do checks every 'p' seconds on a
+                        * proportion of the peer table and I need to check
+                        * every connection 'n' times within a timeout
+                        * interval, to ensure I detect a timeout on any
+                        * connection within (n+1)/n times the timeout
+                        * interval. */
+
+                       if (*kiblnd_tunables.kib_timeout > n * p)
+                               chunk = (chunk * n * p) /
+                                       *kiblnd_tunables.kib_timeout;
+                       if (chunk == 0)
+                               chunk = 1;
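+                       /* A worked example (a sketch: timeout defaults to
+                        * 50s in o2iblnd_modparams.c, and a peer hash of,
+                        * say, 101 buckets is assumed): chunk =
+                        * 101 * 4 * 1 / 50 = 8, so each 1-second pass
+                        * checks 8 buckets and the whole table is covered
+                        * in ~13s, roughly timeout/n as intended. */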
+
+                       for (i = 0; i < chunk; i++) {
+                               kiblnd_check_conns(peer_index);
+                               peer_index = (peer_index + 1) %
+                                            kiblnd_data.kib_peer_hash_size;
+                       }
+
+                       deadline += p * HZ;
+                       spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+               }
+
+               if (dropped_lock)
+                       continue;
+
+               /* Nothing to do for 'timeout' */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+               spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+               waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+               spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
+
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+       kib_conn_t *conn = arg;
+
+       switch (event->event) {
+       case IB_EVENT_COMM_EST:
+               CDEBUG(D_NET, "%s established\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid));
+               return;
+
+       default:
+               CERROR("%s: Async QP event type %d\n",
+                      libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+               return;
+       }
+}
+
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+       switch (kiblnd_wreqid2type(wc->wr_id)) {
+       default:
+               LBUG();
+
+       case IBLND_WID_RDMA:
+               /* We only get RDMA completion notification if it fails.  All
+                * subsequent work items, including the final SEND will fail
+                * too.  However we can't print out any more info about the
+                * failing RDMA because 'tx' might be back on the idle list or
+                * even reused already if we didn't manage to post all our work
+                * items */
+               CNETERR("RDMA (tx: %p) failed: %d\n",
+                       kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+               return;
+
+       case IBLND_WID_TX:
+               kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+               return;
+
+       case IBLND_WID_RX:
+               kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+                                  wc->byte_len);
+               return;
+       }
+}
+
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+       /* NB I'm not allowed to schedule this conn once its refcount has
+        * reached 0.  Since fundamentally I'm racing with scheduler threads
+        * consuming my CQ I could be called after all completions have
+        * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+        * and this CQ is about to be destroyed so I NOOP. */
+       kib_conn_t              *conn = (kib_conn_t *)arg;
+       struct kib_sched_info   *sched = conn->ibc_sched;
+       unsigned long           flags;
+
+       LASSERT(cq == conn->ibc_cq);
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+       conn->ibc_ready = 1;
+
+       if (!conn->ibc_scheduled &&
+           (conn->ibc_nrx > 0 ||
+            conn->ibc_nsends_posted > 0)) {
+               kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+               conn->ibc_scheduled = 1;
+               list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+               if (waitqueue_active(&sched->ibs_waitq))
+                       wake_up(&sched->ibs_waitq);
+       }
+
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+       kib_conn_t *conn = arg;
+
+       CERROR("%s: async CQ event type %d\n",
+              libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int
+kiblnd_scheduler(void *arg)
+{
+       long                    id = (long)arg;
+       struct kib_sched_info   *sched;
+       kib_conn_t              *conn;
+       wait_queue_t            wait;
+       unsigned long           flags;
+       struct ib_wc            wc;
+       int                     did_something;
+       int                     busy_loops = 0;
+       int                     rc;
+
+       cfs_block_allsigs();
+
+       init_waitqueue_entry_current(&wait);
+
+       sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+       rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+       if (rc != 0) {
+               CWARN("Failed to bind on CPT %d, please verify whether "
+                     "all CPUs are healthy and reload modules if necessary, "
+                     "otherwise your system might under risk of low "
+                     "performance\n", sched->ibs_cpt);
+       }
+
+       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+               if (busy_loops++ >= IBLND_RESCHED) {
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+                       cond_resched();
+                       busy_loops = 0;
+
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+               }
+
+               did_something = 0;
+
+               if (!list_empty(&sched->ibs_conns)) {
+                       conn = list_entry(sched->ibs_conns.next,
+                                             kib_conn_t, ibc_sched_list);
+                       /* take over kib_sched_conns' ref on conn... */
+                       LASSERT(conn->ibc_scheduled);
+                       list_del(&conn->ibc_sched_list);
+                       conn->ibc_ready = 0;
+
+                       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
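+                       /* Poll a single completion; if the CQ looks empty,
+                        * re-arm notification first and poll once more to
+                        * catch a completion that raced in before the
+                        * re-arm. */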
+                       rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                       if (rc == 0) {
+                               rc = ib_req_notify_cq(conn->ibc_cq,
+                                                     IB_CQ_NEXT_COMP);
+                               if (rc < 0) {
+                                       CWARN("%s: ib_req_notify_cq failed: %d, "
+                                             "closing connection\n",
+                                             libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                                       kiblnd_close_conn(conn, -EIO);
+                                       kiblnd_conn_decref(conn);
+                                       spin_lock_irqsave(&sched->ibs_lock,
+                                                             flags);
+                                       continue;
+                               }
+
+                               rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                       }
+
+                       if (rc < 0) {
+                               CWARN("%s: ib_poll_cq failed: %d, "
+                                     "closing connection\n",
+                                     libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                                     rc);
+                               kiblnd_close_conn(conn, -EIO);
+                               kiblnd_conn_decref(conn);
+                               spin_lock_irqsave(&sched->ibs_lock, flags);
+                               continue;
+                       }
+
+                       spin_lock_irqsave(&sched->ibs_lock, flags);
+
+                       if (rc != 0 || conn->ibc_ready) {
+                               /* There may be another completion waiting; get
+                                * another scheduler to check while I handle
+                                * this one... */
+                               /* +1 ref for sched_conns */
+                               kiblnd_conn_addref(conn);
+                               list_add_tail(&conn->ibc_sched_list,
+                                                 &sched->ibs_conns);
+                               if (waitqueue_active(&sched->ibs_waitq))
+                                       wake_up(&sched->ibs_waitq);
+                       } else {
+                               conn->ibc_scheduled = 0;
+                       }
+
+                       if (rc != 0) {
+                               spin_unlock_irqrestore(&sched->ibs_lock, flags);
+                               kiblnd_complete(&wc);
+
+                               spin_lock_irqsave(&sched->ibs_lock, flags);
+                       }
+
+                       kiblnd_conn_decref(conn); /* ...drop my ref from above */
+                       did_something = 1;
+               }
+
+               if (did_something)
+                       continue;
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+               spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+               waitq_wait(&wait, TASK_INTERRUPTIBLE);
+               busy_loops = 0;
+
+               remove_wait_queue(&sched->ibs_waitq, &wait);
+               set_current_state(TASK_RUNNING);
+               spin_lock_irqsave(&sched->ibs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
+
+int
+kiblnd_failover_thread(void *arg)
+{
+       rwlock_t        *glock = &kiblnd_data.kib_global_lock;
+       kib_dev_t       *dev;
+       wait_queue_t     wait;
+       unsigned long    flags;
+       int              rc;
+
+       LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current(&wait);
+       write_lock_irqsave(glock, flags);
+
+       while (!kiblnd_data.kib_shutdown) {
+               int     do_failover = 0;
+               int     long_sleep;
+
+               list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+                                   ibd_fail_list) {
+                       if (cfs_time_before(cfs_time_current(),
+                                           dev->ibd_next_failover))
+                               continue;
+                       do_failover = 1;
+                       break;
+               }
+
+               if (do_failover) {
+                       list_del_init(&dev->ibd_fail_list);
+                       dev->ibd_failover = 1;
+                       write_unlock_irqrestore(glock, flags);
+
+                       rc = kiblnd_dev_failover(dev);
+
+                       write_lock_irqsave(glock, flags);
+
+                       LASSERT (dev->ibd_failover);
+                       dev->ibd_failover = 0;
+                       if (rc >= 0) { /* Device is OK or failover succeeded */
+                               dev->ibd_next_failover = cfs_time_shift(3);
+                               continue;
+                       }
+
+                       /* failed to failover, retry later */
+                       dev->ibd_next_failover =
+                               cfs_time_shift(min(dev->ibd_failed_failover, 10));
+                       if (kiblnd_dev_can_failover(dev)) {
+                               list_add_tail(&dev->ibd_fail_list,
+                                             &kiblnd_data.kib_failed_devs);
+                       }
+
+                       continue;
+               }
+
+               /* long sleep if no more pending failover */
+               long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+               write_unlock_irqrestore(glock, flags);
+
+               rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+                                                  cfs_time_seconds(1));
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+               write_lock_irqsave(glock, flags);
+
+               if (!long_sleep || rc != 0)
+                       continue;
+
+               /* After a long sleep, routinely check all active devices.
+                * We need this check because, with no active connection on
+                * the dev and no SEND from the local end, we may listen on
+                * the wrong HCA forever after a bonding failover. */
+               list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+                       if (kiblnd_dev_can_failover(dev)) {
+                               list_add_tail(&dev->ibd_fail_list,
+                                             &kiblnd_data.kib_failed_devs);
+                       }
+               }
+       }
+
+       write_unlock_irqrestore(glock, flags);
+
+       kiblnd_thread_fini();
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644 (file)
index 0000000..e21028b
--- /dev/null
@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+               "service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+/* Number of threads in each scheduler pool, which is per-CPT;
+ * a reasonable value is estimated from the CPU count if this is zero. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+               "number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+               "when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+               "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+               "IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+               "Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+               "RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+               "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+               "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+               "send work-queue sizing");
+
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+               "map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+               "size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+               "# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+               "non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+               "size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+              "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+               "require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+               "use privileged port when initiating connection");
+
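+/* A usage sketch (a hypothetical invocation; the module name ko2iblnd is
+ * an assumption, and exposure as module parameters depends on
+ * CFS_SYSFS_MODULE_PARM):
+ *
+ *   modprobe ko2iblnd timeout=100 peer_credits=16 ipif_name=ib1
+ *
+ * Tunables registered with mode 0644 (timeout, cksum, retry_count,
+ * rnr_retry_count, keepalive and the privileged-port switches) are also
+ * writable at runtime via /sys/module/<module>/parameters/. */
+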
+kib_tunables_t kiblnd_tunables = {
+       .kib_dev_failover       = &dev_failover,
+       .kib_service            = &service,
+       .kib_cksum              = &cksum,
+       .kib_timeout            = &timeout,
+       .kib_keepalive          = &keepalive,
+       .kib_ntx                = &ntx,
+       .kib_credits            = &credits,
+       .kib_peertxcredits      = &peer_credits,
+       .kib_peercredits_hiw    = &peer_credits_hiw,
+       .kib_peerrtrcredits     = &peer_buffer_credits,
+       .kib_peertimeout        = &peer_timeout,
+       .kib_default_ipif       = &ipif_name,
+       .kib_retry_count        = &retry_count,
+       .kib_rnr_retry_count    = &rnr_retry_count,
+       .kib_concurrent_sends   = &concurrent_sends,
+       .kib_ib_mtu             = &ib_mtu,
+       .kib_map_on_demand      = &map_on_demand,
+       .kib_fmr_pool_size      = &fmr_pool_size,
+       .kib_fmr_flush_trigger  = &fmr_flush_trigger,
+       .kib_fmr_cache          = &fmr_cache,
+       .kib_pmr_pool_size      = &pmr_pool_size,
+       .kib_require_priv_port  = &require_privileged_port,
+       .kib_use_priv_port      = &use_privileged_port,
+       .kib_nscheds            = &nscheds
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];
+
+enum {
+       O2IBLND_SERVICE  = 1,
+       O2IBLND_CKSUM,
+       O2IBLND_TIMEOUT,
+       O2IBLND_NTX,
+       O2IBLND_CREDITS,
+       O2IBLND_PEER_TXCREDITS,
+       O2IBLND_PEER_CREDITS_HIW,
+       O2IBLND_PEER_RTRCREDITS,
+       O2IBLND_PEER_TIMEOUT,
+       O2IBLND_IPIF_BASENAME,
+       O2IBLND_RETRY_COUNT,
+       O2IBLND_RNR_RETRY_COUNT,
+       O2IBLND_KEEPALIVE,
+       O2IBLND_CONCURRENT_SENDS,
+       O2IBLND_IB_MTU,
+       O2IBLND_MAP_ON_DEMAND,
+       O2IBLND_FMR_POOL_SIZE,
+       O2IBLND_FMR_FLUSH_TRIGGER,
+       O2IBLND_FMR_CACHE,
+       O2IBLND_PMR_POOL_SIZE,
+       O2IBLND_DEV_FAILOVER
+};
+
+static ctl_table_t kiblnd_ctl_table[] = {
+       {
+               .ctl_name = O2IBLND_SERVICE,
+               .procname = "service",
+               .data     = &service,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CKSUM,
+               .procname = "cksum",
+               .data     = &cksum,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_TIMEOUT,
+               .procname = "timeout",
+               .data     = &timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_NTX,
+               .procname = "ntx",
+               .data     = &ntx,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CREDITS,
+               .procname = "credits",
+               .data     = &credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_TXCREDITS,
+               .procname = "peer_credits",
+               .data     = &peer_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_CREDITS_HIW,
+               .procname = "peer_credits_hiw",
+               .data     = &peer_credits_hiw,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_RTRCREDITS,
+               .procname = "peer_buffer_credits",
+               .data     = &peer_buffer_credits,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PEER_TIMEOUT,
+               .procname = "peer_timeout",
+               .data     = &peer_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_IPIF_BASENAME,
+               .procname = "ipif_name",
+               .data     = ipif_basename_space,
+               .maxlen   = sizeof(ipif_basename_space),
+               .mode     = 0444,
+               .proc_handler = &proc_dostring
+       },
+       {
+               .ctl_name = O2IBLND_RETRY_COUNT,
+               .procname = "retry_count",
+               .data     = &retry_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_RNR_RETRY_COUNT,
+               .procname = "rnr_retry_count",
+               .data     = &rnr_retry_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_KEEPALIVE,
+               .procname = "keepalive",
+               .data     = &keepalive,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_CONCURRENT_SENDS,
+               .procname = "concurrent_sends",
+               .data     = &concurrent_sends,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_IB_MTU,
+               .procname = "ib_mtu",
+               .data     = &ib_mtu,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_MAP_ON_DEMAND,
+               .procname = "map_on_demand",
+               .data     = &map_on_demand,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+
+       {
+               .ctl_name = O2IBLND_FMR_POOL_SIZE,
+               .procname = "fmr_pool_size",
+               .data     = &fmr_pool_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+               .procname = "fmr_flush_trigger",
+               .data     = &fmr_flush_trigger,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_FMR_CACHE,
+               .procname = "fmr_cache",
+               .data     = &fmr_cache,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_PMR_POOL_SIZE,
+               .procname = "pmr_pool_size",
+               .data     = &pmr_pool_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .ctl_name = O2IBLND_DEV_FAILOVER,
+               .procname = "dev_failover",
+               .data     = &dev_failover,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec
+       },
+       {0}
+};
+
+static ctl_table_t kiblnd_top_ctl_table[] = {
+       {
+               .ctl_name = CTL_O2IBLND,
+               .procname = "o2iblnd",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = kiblnd_ctl_table
+       },
+       {0}
+};
+
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
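+       /* strncpy() does not NUL-terminate on truncation, so terminate
+        * explicitly. */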
+       strncpy(space, str, size);
+       space[size-1] = 0;
+}
+
+void
+kiblnd_sysctl_init (void)
+{
+       kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+                             sizeof(ipif_basename_space));
+
+       kiblnd_tunables.kib_sysctl =
+               cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+       if (kiblnd_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+       if (kiblnd_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+int
+kiblnd_tunables_init (void)
+{
+       if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+               CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+                      *kiblnd_tunables.kib_ib_mtu);
+               return -EINVAL;
+       }
+
+       if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
+               *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+
+       if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
+               *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+
+       if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
+               *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+
+       if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
+               *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+
+       if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
+               *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+
+       if (*kiblnd_tunables.kib_map_on_demand < 0 ||
+           *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
+               *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+
+       if (*kiblnd_tunables.kib_map_on_demand == 1)
+               *kiblnd_tunables.kib_map_on_demand = 2; /* it makes no sense to create a map for a single fragment */
+
+       if (*kiblnd_tunables.kib_concurrent_sends == 0) {
+               if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+                   *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
+                       *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
+               else
+                       *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+       }
+
+       if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
+               *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
+               *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+
+       if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+               CWARN("Concurrent sends %d is lower than message queue size: %d, "
+                     "performance may drop slightly.\n",
+                     *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+       }
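+       /* A worked trace with this file's defaults (a sketch, assuming
+        * IBLND_CREDITS_DEFAULT <= 8): peer_credits=8 and credits=256 pass
+        * the clamps unchanged, peer_credits_hiw=0 is raised to 8/2 = 4,
+        * and since map_on_demand=0 the else branch above leaves
+        * concurrent_sends = peer_credits = 8, inside all later bounds. */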
+
+       kiblnd_sysctl_init();
+       return 0;
+}
+
+void
+kiblnd_tunables_fini (void)
+{
+       kiblnd_sysctl_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644 (file)
index 0000000..6494b2b
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644 (file)
index 0000000..c826bf9
--- /dev/null
@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+lnd_t            the_ksocklnd;
+ksock_nal_data_t ksocknal_data;
+
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             i;
+       ksock_interface_t *iface;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               LASSERT(i < LNET_MAX_INTERFACES);
+               iface = &net->ksnn_interfaces[i];
+
+               if (iface->ksni_ipaddr == ip)
+                       return (iface);
+       }
+
+       return (NULL);
+}
+
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+       ksock_route_t *route;
+
+       LIBCFS_ALLOC (route, sizeof (*route));
+       if (route == NULL)
+               return (NULL);
+
+       atomic_set (&route->ksnr_refcount, 1);
+       route->ksnr_peer = NULL;
+       route->ksnr_retry_interval = 0;  /* OK to connect at any time */
+       route->ksnr_ipaddr = ipaddr;
+       route->ksnr_port = port;
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+       route->ksnr_connected = 0;
+       route->ksnr_deleted = 0;
+       route->ksnr_conn_count = 0;
+       route->ksnr_share_count = 0;
+
+       return (route);
+}
+
+void
+ksocknal_destroy_route (ksock_route_t *route)
+{
+       LASSERT (atomic_read(&route->ksnr_refcount) == 0);
+
+       if (route->ksnr_peer != NULL)
+               ksocknal_peer_decref(route->ksnr_peer);
+
+       LIBCFS_FREE (route, sizeof (*route));
+}
+
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_net_t   *net = ni->ni_data;
+       ksock_peer_t  *peer;
+
+       LASSERT (id.nid != LNET_NID_ANY);
+       LASSERT (id.pid != LNET_PID_ANY);
+       LASSERT (!in_interrupt());
+
+       LIBCFS_ALLOC (peer, sizeof (*peer));
+       if (peer == NULL)
+               return -ENOMEM;
+
+       memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
+
+       peer->ksnp_ni = ni;
+       peer->ksnp_id = id;
+       atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
+       peer->ksnp_closing = 0;
+       peer->ksnp_accepting = 0;
+       peer->ksnp_proto = NULL;
+       peer->ksnp_last_alive = 0;
+       peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+       INIT_LIST_HEAD (&peer->ksnp_conns);
+       INIT_LIST_HEAD (&peer->ksnp_routes);
+       INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+       INIT_LIST_HEAD (&peer->ksnp_zc_req_list);
+       spin_lock_init(&peer->ksnp_lock);
+
+       spin_lock_bh(&net->ksnn_lock);
+
+       if (net->ksnn_shutdown) {
+               spin_unlock_bh(&net->ksnn_lock);
+
+               LIBCFS_FREE(peer, sizeof(*peer));
+               CERROR("Can't create peer: network shutdown\n");
+               return -ESHUTDOWN;
+       }
+
+       net->ksnn_npeers++;
+
+       spin_unlock_bh(&net->ksnn_lock);
+
+       *peerp = peer;
+       return 0;
+}
+
+void
+ksocknal_destroy_peer (ksock_peer_t *peer)
+{
+       ksock_net_t    *net = peer->ksnp_ni->ni_data;
+
+       CDEBUG (D_NET, "peer %s %p deleted\n",
+               libcfs_id2str(peer->ksnp_id), peer);
+
+       LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+       LASSERT (peer->ksnp_accepting == 0);
+       LASSERT (list_empty (&peer->ksnp_conns));
+       LASSERT (list_empty (&peer->ksnp_routes));
+       LASSERT (list_empty (&peer->ksnp_tx_queue));
+       LASSERT (list_empty (&peer->ksnp_zc_req_list));
+
+       LIBCFS_FREE (peer, sizeof (*peer));
+
+       /* NB a peer's connections and routes keep a reference on their peer
+        * until they are destroyed, so we can be assured that _all_ state to
+        * do with this peer has been cleaned up when its refcount drops to
+        * zero. */
+       spin_lock_bh(&net->ksnn_lock);
+       net->ksnn_npeers--;
+       spin_unlock_bh(&net->ksnn_lock);
+}
+
+ksock_peer_t *
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       struct list_head       *peer_list = ksocknal_nid2peerlist(id.nid);
+       struct list_head       *tmp;
+       ksock_peer_t     *peer;
+
+       list_for_each (tmp, peer_list) {
+
+               peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+               LASSERT (!peer->ksnp_closing);
+
+               if (peer->ksnp_ni != ni)
+                       continue;
+
+               if (peer->ksnp_id.nid != id.nid ||
+                   peer->ksnp_id.pid != id.pid)
+                       continue;
+
+               CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                      peer, libcfs_id2str(id),
+                      atomic_read(&peer->ksnp_refcount));
+               return (peer);
+       }
+       return (NULL);
+}
+
+ksock_peer_t *
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_peer_t     *peer;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL)                       /* +1 ref for caller? */
+               ksocknal_peer_addref(peer);
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       return (peer);
+}
+
+void
+ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+{
+       int             i;
+       __u32         ip;
+       ksock_interface_t *iface;
+
+       for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+               LASSERT (i < LNET_MAX_INTERFACES);
+               ip = peer->ksnp_passive_ips[i];
+
+               iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+               /* All IPs in peer->ksnp_passive_ips[] come from the
+                * interface list, therefore the call must succeed. */
+               LASSERT (iface != NULL);
+
+               CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+                      peer, iface, iface->ksni_nroutes);
+               iface->ksni_npeers--;
+       }
+
+       LASSERT (list_empty(&peer->ksnp_conns));
+       LASSERT (list_empty(&peer->ksnp_routes));
+       LASSERT (!peer->ksnp_closing);
+       peer->ksnp_closing = 1;
+       list_del (&peer->ksnp_list);
+       /* lose peerlist's ref */
+       ksocknal_peer_decref(peer);
+}
+
+int
+ksocknal_get_peer_info (lnet_ni_t *ni, int index,
+                       lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+                       int *port, int *conn_count, int *share_count)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *ptmp;
+       ksock_route_t     *route;
+       struct list_head        *rtmp;
+       int             i;
+       int             j;
+       int             rc = -ENOENT;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+               list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       if (peer->ksnp_n_passive_ips == 0 &&
+                           list_empty(&peer->ksnp_routes)) {
+                               if (index-- > 0)
+                                       continue;
+
+                               *id = peer->ksnp_id;
+                               *myip = 0;
+                               *peer_ip = 0;
+                               *port = 0;
+                               *conn_count = 0;
+                               *share_count = 0;
+                               rc = 0;
+                               goto out;
+                       }
+
+                       for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+                               if (index-- > 0)
+                                       continue;
+
+                               *id = peer->ksnp_id;
+                               *myip = peer->ksnp_passive_ips[j];
+                               *peer_ip = 0;
+                               *port = 0;
+                               *conn_count = 0;
+                               *share_count = 0;
+                               rc = 0;
+                               goto out;
+                       }
+
+                       list_for_each (rtmp, &peer->ksnp_routes) {
+                               if (index-- > 0)
+                                       continue;
+
+                               route = list_entry(rtmp, ksock_route_t,
+                                                      ksnr_list);
+
+                               *id = peer->ksnp_id;
+                               *myip = route->ksnr_myipaddr;
+                               *peer_ip = route->ksnr_ipaddr;
+                               *port = route->ksnr_port;
+                               *conn_count = route->ksnr_conn_count;
+                               *share_count = route->ksnr_share_count;
+                               rc = 0;
+                               goto out;
+                       }
+               }
+       }
+ out:
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (rc);
+}
+
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+       ksock_peer_t      *peer = route->ksnr_peer;
+       int             type = conn->ksnc_type;
+       ksock_interface_t *iface;
+
+       conn->ksnc_route = route;
+       ksocknal_route_addref(route);
+
+       if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+               if (route->ksnr_myipaddr == 0) {
+                       /* route wasn't bound locally yet (the initial route) */
+                       CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+                              libcfs_id2str(peer->ksnp_id),
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+               } else {
+                       CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+                              "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                              libcfs_id2str(peer->ksnp_id),
+                              HIPQUAD(route->ksnr_ipaddr),
+                              HIPQUAD(route->ksnr_myipaddr),
+                              HIPQUAD(conn->ksnc_myipaddr));
+
+                       iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                                 route->ksnr_myipaddr);
+                       if (iface != NULL)
+                               iface->ksni_nroutes--;
+               }
+               route->ksnr_myipaddr = conn->ksnc_myipaddr;
+               iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                         route->ksnr_myipaddr);
+               if (iface != NULL)
+                       iface->ksni_nroutes++;
+       }
+
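+       /* ksnr_connected is a bitmask of the connection types established
+        * on this route; record this type as connected. */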
+       route->ksnr_connected |= (1<<type);
+       route->ksnr_conn_count++;
+
+       /* Successful connection => further attempts can
+        * proceed immediately */
+       route->ksnr_retry_interval = 0;
+}
+
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+       struct list_head        *tmp;
+       ksock_conn_t      *conn;
+       ksock_route_t     *route2;
+
+       LASSERT (!peer->ksnp_closing);
+       LASSERT (route->ksnr_peer == NULL);
+       LASSERT (!route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+       LASSERT (route->ksnr_connected == 0);
+
+       /* LASSERT(unique) */
+       list_for_each(tmp, &peer->ksnp_routes) {
+               route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+                       CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr));
+                       LBUG();
+               }
+       }
+
+       route->ksnr_peer = peer;
+       ksocknal_peer_addref(peer);
+       /* peer's routelist takes over my ref on 'route' */
+       list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+       list_for_each(tmp, &peer->ksnp_conns) {
+               conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+                       continue;
+
+               ksocknal_associate_route_conn_locked(route, conn);
+               /* keep going (typed routes) */
+       }
+}
+
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+       ksock_peer_t      *peer = route->ksnr_peer;
+       ksock_interface_t *iface;
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+       struct list_head        *cnxt;
+
+       LASSERT (!route->ksnr_deleted);
+
+       /* Close associated conns */
+       list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+               conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_route != route)
+                       continue;
+
+               ksocknal_close_conn_locked (conn, 0);
+       }
+
+       if (route->ksnr_myipaddr != 0) {
+               iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                         route->ksnr_myipaddr);
+               if (iface != NULL)
+                       iface->ksni_nroutes--;
+       }
+
+       route->ksnr_deleted = 1;
+       list_del (&route->ksnr_list);
+       ksocknal_route_decref(route);        /* drop peer's ref */
+
+       if (list_empty (&peer->ksnp_routes) &&
+           list_empty (&peer->ksnp_conns)) {
+               /* I've just removed the last route to a peer with no active
+                * connections */
+               ksocknal_unlink_peer_locked (peer);
+       }
+}
+
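+/* Add an explicitly-configured peer/route; the peer is created if it
+ * isn't in the peer table yet, and the route's share count records the
+ * explicit reference. */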
+int
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+       struct list_head        *tmp;
+       ksock_peer_t      *peer;
+       ksock_peer_t      *peer2;
+       ksock_route_t     *route;
+       ksock_route_t     *route2;
+       int             rc;
+
+       if (id.nid == LNET_NID_ANY ||
+           id.pid == LNET_PID_ANY)
+               return (-EINVAL);
+
+       /* Have a brand new peer ready... */
+       rc = ksocknal_create_peer(&peer, ni, id);
+       if (rc != 0)
+               return rc;
+
+       route = ksocknal_create_route (ipaddr, port);
+       if (route == NULL) {
+               ksocknal_peer_decref(peer);
+               return (-ENOMEM);
+       }
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       /* always called with a ref on ni, so shutdown can't have started */
+       LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+       peer2 = ksocknal_find_peer_locked (ni, id);
+       if (peer2 != NULL) {
+               ksocknal_peer_decref(peer);
+               peer = peer2;
+       } else {
+               /* peer table takes my ref on peer */
+               list_add_tail (&peer->ksnp_list,
+                                  ksocknal_nid2peerlist (id.nid));
+       }
+
+       route2 = NULL;
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route2->ksnr_ipaddr == ipaddr)
+                       break;
+
+               route2 = NULL;
+       }
+       if (route2 == NULL) {
+               ksocknal_add_route_locked(peer, route);
+               route->ksnr_share_count++;
+       } else {
+               ksocknal_route_decref(route);
+               route2->ksnr_share_count++;
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (0);
+}
+
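+/* Delete 'peer''s routes to 'ip' (0 matches all); once no shared
+ * (explicitly-added) routes remain, the auto-created routes and all
+ * conns are removed as well. */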
+void
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+{
+       ksock_conn_t     *conn;
+       ksock_route_t    *route;
+       struct list_head       *tmp;
+       struct list_head       *nxt;
+       int            nshared;
+
+       LASSERT (!peer->ksnp_closing);
+
+       /* Extra ref prevents peer disappearing until I'm done with it */
+       ksocknal_peer_addref(peer);
+
+       list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               /* no match */
+               if (!(ip == 0 || route->ksnr_ipaddr == ip))
+                       continue;
+
+               route->ksnr_share_count = 0;
+               /* This deletes associated conns too */
+               ksocknal_del_route_locked (route);
+       }
+
+       nshared = 0;
+       list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+               nshared += route->ksnr_share_count;
+       }
+
+       if (nshared == 0) {
+               /* remove everything else if there are no explicit entries
+                * left */
+
+               list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+                       route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+                       /* we should only be removing auto-entries */
+                       LASSERT(route->ksnr_share_count == 0);
+                       ksocknal_del_route_locked (route);
+               }
+
+               list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       ksocknal_close_conn_locked(conn, 0);
+               }
+       }
+
+       ksocknal_peer_decref(peer);
+       /* NB peer unlinks itself when last conn/route is removed */
+}
+
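+/* Delete every peer on 'ni' matching 'id' (wildcards allowed),
+ * finalizing any TXs left queued on them; returns -ENOENT if nothing
+ * matched. */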
+int
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+       LIST_HEAD     (zombies);
+       struct list_head        *ptmp;
+       struct list_head        *pnxt;
+       ksock_peer_t      *peer;
+       int             lo;
+       int             hi;
+       int             i;
+       int             rc = -ENOENT;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (id.nid != LNET_NID_ANY)
+               lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+       else {
+               lo = 0;
+               hi = ksocknal_data.ksnd_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt,
+                                       &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+                             (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+                               continue;
+
+                       ksocknal_peer_addref(peer);     /* a ref for me... */
+
+                       ksocknal_del_peer_locked (peer, ip);
+
+                       if (peer->ksnp_closing &&
+                           !list_empty(&peer->ksnp_tx_queue)) {
+                               LASSERT (list_empty(&peer->ksnp_conns));
+                               LASSERT (list_empty(&peer->ksnp_routes));
+
+                               list_splice_init(&peer->ksnp_tx_queue,
+                                                    &zombies);
+                       }
+
+                       ksocknal_peer_decref(peer);     /* ...till here */
+
+                       rc = 0;          /* matched! */
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_txlist_done(ni, &zombies, 1);
+
+       return (rc);
+}
+
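+/* Return the index'th conn on 'ni' (with a ref added for the caller),
+ * or NULL if there are fewer than index + 1 conns. */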
+ksock_conn_t *
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *ptmp;
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+       int             i;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       LASSERT (!peer->ksnp_closing);
+
+                       if (peer->ksnp_ni != ni)
+                               continue;
+
+                       list_for_each (ctmp, &peer->ksnp_conns) {
+                               if (index-- > 0)
+                                       continue;
+
+                               conn = list_entry (ctmp, ksock_conn_t,
+                                                      ksnc_list);
+                               ksocknal_conn_addref(conn);
+                               read_unlock(&ksocknal_data.ksnd_global_lock);
+                               return (conn);
+                       }
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (NULL);
+}
+
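+/* Return the scheduler with the fewest conns in CPT 'cpt'. */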
+ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+       struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+       ksock_sched_t           *sched;
+       int                     i;
+
+       LASSERT(info->ksi_nthreads > 0);
+
+       sched = &info->ksi_scheds[0];
+       /*
+        * NB: it's safe so far, but info->ksi_nthreads could be changed
+        * at runtime when we have dynamic LNet configuration, then we
+        * need to take care of this.
+        */
+       for (i = 1; i < info->ksi_nthreads; i++) {
+               if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+                       sched = &info->ksi_scheds[i];
+       }
+
+       return sched;
+}
+
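+/* Fill 'ipaddrs' with this net's interface addresses and return how
+ * many; returns 0 (offer nothing) unless more than one interface is
+ * configured. */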
+int
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             i;
+       int             nip;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       nip = net->ksnn_ninterfaces;
+       LASSERT (nip <= LNET_MAX_INTERFACES);
+
+       /* Only offer interfaces for additional connections if I have
+        * more than one. */
+       if (nip < 2) {
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return 0;
+       }
+
+       for (i = 0; i < nip; i++) {
+               ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+               LASSERT (ipaddrs[i] != 0);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return (nip);
+}
+
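+/* Pick the peer IP that best matches 'iface': prefer an address on the
+ * same subnet, then the smallest XOR distance.  Zeroed entries
+ * (already matched) are skipped. */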
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+       int   best_netmatch = 0;
+       int   best_xor      = 0;
+       int   best        = -1;
+       int   this_xor;
+       int   this_netmatch;
+       int   i;
+
+       for (i = 0; i < nips; i++) {
+               if (ips[i] == 0)
+                       continue;
+
+               this_xor = (ips[i] ^ iface->ksni_ipaddr);
+               this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+               if (!(best < 0 ||
+                     best_netmatch < this_netmatch ||
+                     (best_netmatch == this_netmatch &&
+                      best_xor > this_xor)))
+                       continue;
+
+               best = i;
+               best_netmatch = this_netmatch;
+               best_xor = this_xor;
+       }
+
+       LASSERT (best >= 0);
+       return (best);
+}
+
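+/* Choose the local interfaces this peer should connect back to,
+ * pairing each with its best-matching peer IP; overwrites 'peerips'
+ * with the chosen local addresses and returns how many. */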
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       ksock_net_t     *net = peer->ksnp_ni->ni_data;
+       ksock_interface_t  *iface;
+       ksock_interface_t  *best_iface;
+       int              n_ips;
+       int              i;
+       int              j;
+       int              k;
+       __u32          ip;
+       __u32          xor;
+       int              this_netmatch;
+       int              best_netmatch;
+       int              best_npeers;
+
+       /* CAVEAT EMPTOR: We do all our interface matching with an
+        * exclusive hold of global lock at IRQ priority.  We're only
+        * expecting to be dealing with small numbers of interfaces, so the
+        * O(n**3)-ness shouldn't matter */
+
+       /* Also note that I'm not going to return more than n_peerips
+        * interfaces, even if I have more myself */
+
+       write_lock_bh(global_lock);
+
+       LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+       LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+       /* Only match interfaces for additional connections
+        * if I have > 1 interface */
+       n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+               MIN(n_peerips, net->ksnn_ninterfaces);
+
+       for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+               /*            ^ yes really... */
+
+               /* If we have any new interfaces, first tick off all the
+                * peer IPs that match old interfaces, then choose new
+                * interfaces to match the remaining peer IPS.
+                * We don't forget interfaces we've stopped using; we might
+                * start using them again... */
+
+               if (i < peer->ksnp_n_passive_ips) {
+                       /* Old interface. */
+                       ip = peer->ksnp_passive_ips[i];
+                       best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+                       /* peer passive ips are kept up to date */
+                       LASSERT(best_iface != NULL);
+               } else {
+                       /* choose a new interface */
+                       LASSERT (i == peer->ksnp_n_passive_ips);
+
+                       best_iface = NULL;
+                       best_netmatch = 0;
+                       best_npeers = 0;
+
+                       for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                               iface = &net->ksnn_interfaces[j];
+                               ip = iface->ksni_ipaddr;
+
+                               for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+                                       if (peer->ksnp_passive_ips[k] == ip)
+                                               break;
+
+                               if (k < peer->ksnp_n_passive_ips) /* using it already */
+                                       continue;
+
+                               k = ksocknal_match_peerip(iface, peerips, n_peerips);
+                               xor = (ip ^ peerips[k]);
+                               this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+                               if (!(best_iface == NULL ||
+                                     best_netmatch < this_netmatch ||
+                                     (best_netmatch == this_netmatch &&
+                                      best_npeers > iface->ksni_npeers)))
+                                       continue;
+
+                               best_iface = iface;
+                               best_netmatch = this_netmatch;
+                               best_npeers = iface->ksni_npeers;
+                       }
+
+                       best_iface->ksni_npeers++;
+                       ip = best_iface->ksni_ipaddr;
+                       peer->ksnp_passive_ips[i] = ip;
+                       peer->ksnp_n_passive_ips = i+1;
+               }
+
+               LASSERT (best_iface != NULL);
+
+               /* mark the best matching peer IP used */
+               j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+               peerips[j] = 0;
+       }
+
+       /* Overwrite input peer IP addresses */
+       memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+       write_unlock_bh(global_lock);
+
+       return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+                      __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+       ksock_route_t       *newroute = NULL;
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       lnet_ni_t          *ni = peer->ksnp_ni;
+       ksock_net_t      *net = ni->ni_data;
+       struct list_head          *rtmp;
+       ksock_route_t       *route;
+       ksock_interface_t   *iface;
+       ksock_interface_t   *best_iface;
+       int               best_netmatch;
+       int               this_netmatch;
+       int               best_nroutes;
+       int               i;
+       int               j;
+
+       /* CAVEAT EMPTOR: We do all our interface matching with an
+        * exclusive hold of global lock at IRQ priority.  We're only
+        * expecting to be dealing with small numbers of interfaces, so the
+        * O(n**3)-ness here shouldn't matter */
+
+       write_lock_bh(global_lock);
+
+       if (net->ksnn_ninterfaces < 2) {
+               /* Only create additional connections
+                * if I have > 1 interface */
+               write_unlock_bh(global_lock);
+               return;
+       }
+
+       LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+       for (i = 0; i < npeer_ipaddrs; i++) {
+               if (newroute != NULL) {
+                       newroute->ksnr_ipaddr = peer_ipaddrs[i];
+               } else {
+                       write_unlock_bh(global_lock);
+
+                       newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+                       if (newroute == NULL)
+                               return;
+
+                       write_lock_bh(global_lock);
+               }
+
+               if (peer->ksnp_closing) {
+                       /* peer got closed under me */
+                       break;
+               }
+
+               /* Already got a route? */
+               route = NULL;
+               list_for_each(rtmp, &peer->ksnp_routes) {
+                       route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+                       if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+                               break;
+
+                       route = NULL;
+               }
+               if (route != NULL)
+                       continue;
+
+               best_iface = NULL;
+               best_nroutes = 0;
+               best_netmatch = 0;
+
+               LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+               /* Select interface to connect from */
+               for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                       iface = &net->ksnn_interfaces[j];
+
+                       /* Using this interface already? */
+                       list_for_each(rtmp, &peer->ksnp_routes) {
+                               route = list_entry(rtmp, ksock_route_t,
+                                                      ksnr_list);
+
+                               if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+                                       break;
+
+                               route = NULL;
+                       }
+                       if (route != NULL)
+                               continue;
+
+                       this_netmatch = (((iface->ksni_ipaddr ^
+                                          newroute->ksnr_ipaddr) &
+                                          iface->ksni_netmask) == 0) ? 1 : 0;
+
+                       if (!(best_iface == NULL ||
+                             best_netmatch < this_netmatch ||
+                             (best_netmatch == this_netmatch &&
+                              best_nroutes > iface->ksni_nroutes)))
+                               continue;
+
+                       best_iface = iface;
+                       best_netmatch = this_netmatch;
+                       best_nroutes = iface->ksni_nroutes;
+               }
+
+               if (best_iface == NULL)
+                       continue;
+
+               newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+               best_iface->ksni_nroutes++;
+
+               ksocknal_add_route_locked(peer, newroute);
+               newroute = NULL;
+       }
+
+       write_unlock_bh(global_lock);
+       if (newroute != NULL)
+               ksocknal_route_decref(newroute);
+}
+
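+/* Queue an incoming connection for a connd thread and wake it; the
+ * acceptor itself does no handshaking, so it never blocks on
+ * connection setup. */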
+int
+ksocknal_accept (lnet_ni_t *ni, socket_t *sock)
+{
+       ksock_connreq_t    *cr;
+       int              rc;
+       __u32          peer_ip;
+       int              peer_port;
+
+       rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+       LASSERT (rc == 0);                    /* we succeeded before */
+
+       LIBCFS_ALLOC(cr, sizeof(*cr));
+       if (cr == NULL) {
+               LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+                                  "%u.%u.%u.%u: memory exhausted\n",
+                                  HIPQUAD(peer_ip));
+               return -ENOMEM;
+       }
+
+       lnet_ni_addref(ni);
+       cr->ksncr_ni   = ni;
+       cr->ksncr_sock = sock;
+
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+       wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+       return 0;
+}
+
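+/* Return nonzero if a connection attempt to 'ipaddr' is already in
+ * progress for this peer. */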
+int
+ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+{
+       ksock_route_t   *route;
+
+       list_for_each_entry (route, &peer->ksnp_routes, ksnr_list) {
+
+               if (route->ksnr_ipaddr == ipaddr)
+                       return route->ksnr_connecting;
+       }
+       return 0;
+}
+
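+/* Establish a new conn on 'sock': 'route' is non-NULL for an active
+ * (outgoing) connect and NULL for a passive (accepted) one.  Performs
+ * the HELLO exchange, resolves connection races, attaches the conn to
+ * a scheduler and takes over any TXs blocked waiting for a conn. */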
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                     socket_t *sock, int type)
+{
+       rwlock_t                *global_lock = &ksocknal_data.ksnd_global_lock;
+       LIST_HEAD     (zombies);
+       lnet_process_id_t  peerid;
+       struct list_head        *tmp;
+       __u64         incarnation;
+       ksock_conn_t      *conn;
+       ksock_conn_t      *conn2;
+       ksock_peer_t      *peer = NULL;
+       ksock_peer_t      *peer2;
+       ksock_sched_t     *sched;
+       ksock_hello_msg_t *hello;
+       int                cpt;
+       ksock_tx_t      *tx;
+       ksock_tx_t      *txtmp;
+       int             rc;
+       int             active;
+       char          *warn = NULL;
+
+       active = (route != NULL);
+
+       LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+       LIBCFS_ALLOC(conn, sizeof(*conn));
+       if (conn == NULL) {
+               rc = -ENOMEM;
+               goto failed_0;
+       }
+
+       memset (conn, 0, sizeof (*conn));
+
+       conn->ksnc_peer = NULL;
+       conn->ksnc_route = NULL;
+       conn->ksnc_sock = sock;
+       /* 2 ref, 1 for conn, another extra ref prevents socket
+        * being closed before establishment of connection */
+       atomic_set (&conn->ksnc_sock_refcount, 2);
+       conn->ksnc_type = type;
+       ksocknal_lib_save_callback(sock, conn);
+       atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+       conn->ksnc_rx_ready = 0;
+       conn->ksnc_rx_scheduled = 0;
+
+       INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+       conn->ksnc_tx_ready = 0;
+       conn->ksnc_tx_scheduled = 0;
+       conn->ksnc_tx_carrier = NULL;
+       atomic_set (&conn->ksnc_tx_nob, 0);
+
+       LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+                                    kshm_ips[LNET_MAX_INTERFACES]));
+       if (hello == NULL) {
+               rc = -ENOMEM;
+               goto failed_1;
+       }
+
+       /* stash conn's local and remote addrs */
+       rc = ksocknal_lib_get_conn_addrs (conn);
+       if (rc != 0)
+               goto failed_1;
+
+       /* Find out/confirm peer's NID and connection type and get the
+        * vector of interfaces she's willing to let me connect to.
+        * Passive connections use the listener timeout since the peer sends
+        * eagerly */
+
+       if (active) {
+               peer = route->ksnr_peer;
+               LASSERT(ni == peer->ksnp_ni);
+
+               /* Active connection sends HELLO eagerly */
+               hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+               peerid = peer->ksnp_id;
+
+               write_lock_bh(global_lock);
+               conn->ksnc_proto = peer->ksnp_proto;
+               write_unlock_bh(global_lock);
+
+               if (conn->ksnc_proto == NULL) {
+                       conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+                       if (*ksocknal_tunables.ksnd_protocol == 2)
+                               conn->ksnc_proto = &ksocknal_protocol_v2x;
+                       else if (*ksocknal_tunables.ksnd_protocol == 1)
+                               conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+               }
+
+               rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+               if (rc != 0)
+                       goto failed_1;
+       } else {
+               peerid.nid = LNET_NID_ANY;
+               peerid.pid = LNET_PID_ANY;
+
+               /* Passive, get protocol from peer */
+               conn->ksnc_proto = NULL;
+       }
+
+       rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+       if (rc < 0)
+               goto failed_1;
+
+       LASSERT (rc == 0 || active);
+       LASSERT (conn->ksnc_proto != NULL);
+       LASSERT (peerid.nid != LNET_NID_ANY);
+
+       cpt = lnet_cpt_of_nid(peerid.nid);
+
+       if (active) {
+               ksocknal_peer_addref(peer);
+               write_lock_bh(global_lock);
+       } else {
+               rc = ksocknal_create_peer(&peer, ni, peerid);
+               if (rc != 0)
+                       goto failed_1;
+
+               write_lock_bh(global_lock);
+
+               /* called with a ref on ni, so shutdown can't have started */
+               LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+               peer2 = ksocknal_find_peer_locked(ni, peerid);
+               if (peer2 == NULL) {
+                       /* NB this puts an "empty" peer in the peer
+                        * table (which takes my ref) */
+                       list_add_tail(&peer->ksnp_list,
+                                         ksocknal_nid2peerlist(peerid.nid));
+               } else {
+                       ksocknal_peer_decref(peer);
+                       peer = peer2;
+               }
+
+               /* +1 ref for me */
+               ksocknal_peer_addref(peer);
+               peer->ksnp_accepting++;
+
+               /* Am I already connecting to this guy?  Resolve in
+                * favour of higher NID... */
+               if (peerid.nid < ni->ni_nid &&
+                   ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+                       rc = EALREADY;
+                       warn = "connection race resolution";
+                       goto failed_2;
+               }
+       }
+
+       if (peer->ksnp_closing ||
+           (active && route->ksnr_deleted)) {
+               /* peer/route got closed under me */
+               rc = -ESTALE;
+               warn = "peer/route removed";
+               goto failed_2;
+       }
+
+       if (peer->ksnp_proto == NULL) {
+               /* Never connected before.
+                * NB recv_hello may have returned EPROTO to signal my peer
+                * wants a different protocol than the one I asked for.
+                */
+               LASSERT (list_empty(&peer->ksnp_conns));
+
+               peer->ksnp_proto = conn->ksnc_proto;
+               peer->ksnp_incarnation = incarnation;
+       }
+
+       if (peer->ksnp_proto != conn->ksnc_proto ||
+           peer->ksnp_incarnation != incarnation) {
+               /* Peer rebooted or I've got the wrong protocol version */
+               ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+               peer->ksnp_proto = NULL;
+               rc = ESTALE;
+               warn = peer->ksnp_incarnation != incarnation ?
+                      "peer rebooted" :
+                      "wrong proto version";
+               goto failed_2;
+       }
+
+       switch (rc) {
+       default:
+               LBUG();
+       case 0:
+               break;
+       case EALREADY:
+               warn = "lost conn race";
+               goto failed_2;
+       case EPROTO:
+               warn = "retry with different protocol version";
+               goto failed_2;
+       }
+
+       /* Refuse to duplicate an existing connection, unless this is a
+        * loopback connection */
+       if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+               list_for_each(tmp, &peer->ksnp_conns) {
+                       conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+                           conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+                           conn2->ksnc_type != conn->ksnc_type)
+                               continue;
+
+                       /* Reply on a passive connection attempt so the peer
+                        * realises we're connected. */
+                       LASSERT (rc == 0);
+                       if (!active)
+                               rc = EALREADY;
+
+                       warn = "duplicate";
+                       goto failed_2;
+               }
+       }
+
+       /* If the connection created by this route didn't bind to the IP
+        * address the route connected to, the connection/route matching
+        * code below probably isn't going to work. */
+       if (active &&
+           route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+               CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+                      libcfs_id2str(peer->ksnp_id),
+                      HIPQUAD(route->ksnr_ipaddr),
+                      HIPQUAD(conn->ksnc_ipaddr));
+       }
+
+       /* Search for a route corresponding to the new connection and
+        * create an association.  This allows incoming connections created
+        * by routes in my peer to match my own route entries so I don't
+        * continually create duplicate routes. */
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+               if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                       continue;
+
+               ksocknal_associate_route_conn_locked(route, conn);
+               break;
+       }
+
+       conn->ksnc_peer = peer;          /* conn takes my ref on peer */
+       peer->ksnp_last_alive = cfs_time_current();
+       peer->ksnp_send_keepalive = 0;
+       peer->ksnp_error = 0;
+
+       sched = ksocknal_choose_scheduler_locked(cpt);
+       sched->kss_nconns++;
+       conn->ksnc_scheduler = sched;
+
+       conn->ksnc_tx_last_post = cfs_time_current();
+       /* Set the deadline for the outgoing HELLO to drain */
+       conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+       conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();   /* order with adding to peer's conn list */
+
+       list_add (&conn->ksnc_list, &peer->ksnp_conns);
+       ksocknal_conn_addref(conn);
+
+       ksocknal_new_packet(conn, 0);
+
+       conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+       /* Take packets blocking for this connection. */
+       list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+               if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+                       continue;
+
+               list_del (&tx->tx_list);
+               ksocknal_queue_tx_locked (tx, conn);
+       }
+
+       write_unlock_bh(global_lock);
+
+       /* We've now got a new connection.  Any errors from here on are just
+        * like "normal" comms errors and we close the connection normally.
+        * NB (a) we still have to send the reply HELLO for passive
+        *      connections,
+        *    (b) normal I/O on the conn is blocked until I setup and call the
+        *      socket callbacks.
+        */
+
+       CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+              " incarnation:"LPD64" sched[%d:%d]\n",
+              libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+              HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+              conn->ksnc_port, incarnation, cpt,
+              (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+       if (active) {
+               /* additional routes after interface exchange? */
+               ksocknal_create_routes(peer, conn->ksnc_port,
+                                      hello->kshm_ips, hello->kshm_nips);
+       } else {
+               hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+                                                      hello->kshm_nips);
+               rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+       }
+
+       LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                   kshm_ips[LNET_MAX_INTERFACES]));
+
+       /* setup the socket AFTER I've received hello (it disables
+        * SO_LINGER).  I might call back to the acceptor who may want
+        * to send a protocol version response and then close the
+        * socket; this ensures the socket only tears down after the
+        * response has been sent. */
+       if (rc == 0)
+               rc = ksocknal_lib_setup_sock(sock);
+
+       write_lock_bh(global_lock);
+
+       /* NB my callbacks block while I hold ksnd_global_lock */
+       ksocknal_lib_set_callback(sock, conn);
+
+       if (!active)
+               peer->ksnp_accepting--;
+
+       write_unlock_bh(global_lock);
+
+       if (rc != 0) {
+               write_lock_bh(global_lock);
+               if (!conn->ksnc_closing) {
+                       /* could be closed by another thread */
+                       ksocknal_close_conn_locked(conn, rc);
+               }
+               write_unlock_bh(global_lock);
+       } else if (ksocknal_connsock_addref(conn) == 0) {
+               /* Allow I/O to proceed. */
+               ksocknal_read_callback(conn);
+               ksocknal_write_callback(conn);
+               ksocknal_connsock_decref(conn);
+       }
+
+       ksocknal_connsock_decref(conn);
+       ksocknal_conn_decref(conn);
+       return rc;
+
+ failed_2:
+       if (!peer->ksnp_closing &&
+           list_empty (&peer->ksnp_conns) &&
+           list_empty (&peer->ksnp_routes)) {
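+               /* splice ksnp_tx_queue onto 'zombies': link in the new
+                * list head, then unlink the old one */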
+               list_add(&zombies, &peer->ksnp_tx_queue);
+               list_del_init(&peer->ksnp_tx_queue);
+               ksocknal_unlink_peer_locked(peer);
+       }
+
+       write_unlock_bh(global_lock);
+
+       if (warn != NULL) {
+               if (rc < 0)
+                       CERROR("Not creating conn %s type %d: %s\n",
+                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+               else
+                       CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+                             libcfs_id2str(peerid), conn->ksnc_type, warn);
+       }
+
+       if (!active) {
+               if (rc > 0) {
+                       /* Request retry by replying with CONN_NONE;
+                        * ksnc_proto has been set already */
+                       conn->ksnc_type = SOCKLND_CONN_NONE;
+                       hello->kshm_nips = 0;
+                       ksocknal_send_hello(ni, conn, peerid.nid, hello);
+               }
+
+               write_lock_bh(global_lock);
+               peer->ksnp_accepting--;
+               write_unlock_bh(global_lock);
+       }
+
+       ksocknal_txlist_done(ni, &zombies, 1);
+       ksocknal_peer_decref(peer);
+
+ failed_1:
+       if (hello != NULL)
+               LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                           kshm_ips[LNET_MAX_INTERFACES]));
+
+       LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+       libcfs_sock_release(sock);
+       return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+       /* This just does the immediate housekeeping, and queues the
+        * connection for the reaper to terminate.
+        * Caller holds ksnd_global_lock exclusively in irq context */
+       ksock_peer_t      *peer = conn->ksnc_peer;
+       ksock_route_t     *route;
+       ksock_conn_t      *conn2;
+       struct list_head        *tmp;
+
+       LASSERT (peer->ksnp_error == 0);
+       LASSERT (!conn->ksnc_closing);
+       conn->ksnc_closing = 1;
+
+       /* ksnd_deathrow_conns takes over peer's ref */
+       list_del (&conn->ksnc_list);
+
+       route = conn->ksnc_route;
+       if (route != NULL) {
+               /* dissociate conn from route... */
+               LASSERT (!route->ksnr_deleted);
+               LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+               conn2 = NULL;
+               list_for_each(tmp, &peer->ksnp_conns) {
+                       conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                       if (conn2->ksnc_route == route &&
+                           conn2->ksnc_type == conn->ksnc_type)
+                               break;
+
+                       conn2 = NULL;
+               }
+               if (conn2 == NULL)
+                       route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+               conn->ksnc_route = NULL;
+
+#if 0     /* irrelevant with only eager routes */
+               /* make route least favourite */
+               list_del (&route->ksnr_list);
+               list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+               ksocknal_route_decref(route);     /* drop conn's ref on route */
+       }
+
+       if (list_empty (&peer->ksnp_conns)) {
+               /* No more connections to this peer */
+
+               if (!list_empty(&peer->ksnp_tx_queue)) {
+                       ksock_tx_t *tx;
+
+                       LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+                       /* throw them to the last connection...,
+                        * these TXs will be sent to /dev/null by the scheduler */
+                       list_for_each_entry(tx, &peer->ksnp_tx_queue,
+                                               tx_list)
+                               ksocknal_tx_prep(conn, tx);
+
+                       spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+                       list_splice_init(&peer->ksnp_tx_queue,
+                                            &conn->ksnc_tx_queue);
+                       spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+               }
+
+               peer->ksnp_proto = NULL;        /* renegotiate protocol version */
+               peer->ksnp_error = error;       /* stash last conn close reason */
+
+               if (list_empty (&peer->ksnp_routes)) {
+                       /* I've just closed last conn belonging to a
+                        * peer with no routes to it */
+                       ksocknal_unlink_peer_locked (peer);
+               }
+       }
+
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       list_add_tail(&conn->ksnc_list,
+                         &ksocknal_data.ksnd_deathrow_conns);
+       wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+       int     notify = 0;
+       cfs_time_t last_alive = 0;
+
+       /* There has been a connection failure or comms error; but I'll only
+        * tell LNET I think the peer is dead if it's to another kernel and
+        * there are no connections or connection attempts in existence. */
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+           list_empty(&peer->ksnp_conns) &&
+           peer->ksnp_accepting == 0 &&
+           ksocknal_find_connecting_route_locked(peer) == NULL) {
+               notify = 1;
+               last_alive = peer->ksnp_last_alive;
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       if (notify)
+               lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+                            last_alive);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       ksock_tx_t       *tx;
+       ksock_tx_t       *tmp;
+       LIST_HEAD    (zlist);
+
+       /* NB safe to finalize TXs because closing of socket will
+        * abort all buffered data */
+       LASSERT (conn->ksnc_sock == NULL);
+
+       spin_lock(&peer->ksnp_lock);
+
+       list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+               if (tx->tx_conn != conn)
+                       continue;
+
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+               tx->tx_msg.ksm_zc_cookies[0] = 0;
+               tx->tx_zc_aborted = 1; /* mark it as not-acked */
+               list_del(&tx->tx_zc_list);
+               list_add(&tx->tx_zc_list, &zlist);
+       }
+
+       spin_unlock(&peer->ksnp_lock);
+
+       while (!list_empty(&zlist)) {
+               tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+
+               list_del(&tx->tx_zc_list);
+               ksocknal_tx_decref(tx);
+       }
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+       /* This gets called by the reaper (guaranteed thread context) to
+        * disengage the socket from its callbacks and close it.
+        * ksnc_refcount will eventually hit zero, and then the reaper will
+        * destroy it. */
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       ksock_sched_t    *sched = conn->ksnc_scheduler;
+       int            failed = 0;
+
+       LASSERT(conn->ksnc_closing);
+
+       /* wake up the scheduler to "send" all remaining packets to /dev/null */
+       spin_lock_bh(&sched->kss_lock);
+
+       /* a closing conn is always ready to tx */
+       conn->ksnc_tx_ready = 1;
+
+       if (!conn->ksnc_tx_scheduled &&
+           !list_empty(&conn->ksnc_tx_queue)){
+               list_add_tail (&conn->ksnc_tx_list,
+                              &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       /* serialise with callbacks */
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+       /* OK, so this conn may not be completely disengaged from its
+        * scheduler yet, but it _has_ committed to terminate... */
+       conn->ksnc_scheduler->kss_nconns--;
+
+       if (peer->ksnp_error != 0) {
+               /* peer's last conn closed in error */
+               LASSERT (list_empty (&peer->ksnp_conns));
+               failed = 1;
+               peer->ksnp_error = 0;     /* avoid multiple notifications */
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (failed)
+               ksocknal_peer_failed(peer);
+
+       /* The socket is closed on the final put; either here, or in
+        * ksocknal_{send,recv}msg().  Since we set up the linger2 option
+        * when the connection was established, this will close the socket
+        * immediately, aborting anything buffered in it. Any hung
+        * zero-copy transmits will therefore complete in finite time. */
+       ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+       /* Queue the conn for the reaper to destroy */
+
+       LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+       wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+       cfs_time_t      last_rcv;
+
+       /* Final coup-de-grace of the reaper */
+       CDEBUG (D_NET, "connection %p\n", conn);
+
+       LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+       LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+       LASSERT (conn->ksnc_sock == NULL);
+       LASSERT (conn->ksnc_route == NULL);
+       LASSERT (!conn->ksnc_tx_scheduled);
+       LASSERT (!conn->ksnc_rx_scheduled);
+       LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+       /* complete current receive if any */
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_LNET_PAYLOAD:
+               last_rcv = conn->ksnc_rx_deadline -
+                          cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+               CERROR("Completing partial receive from %s[%d]"
+                      ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+                      "last alive is %ld secs ago\n",
+                      libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+                      HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                      conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+                      cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+                                       last_rcv)));
+               lnet_finalize (conn->ksnc_peer->ksnp_ni,
+                              conn->ksnc_cookie, -EIO);
+               break;
+       case SOCKNAL_RX_LNET_HEADER:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of lnet header from %s"
+                              ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                              conn->ksnc_proto->pro_version);
+               break;
+       case SOCKNAL_RX_KSM_HEADER:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of ksock message from %s"
+                              ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                              conn->ksnc_proto->pro_version);
+               break;
+       case SOCKNAL_RX_SLOP:
+               if (conn->ksnc_rx_started)
+                       CERROR("Incomplete receive of slops from %s"
+                              ", ip %d.%d.%d.%d:%d, with error\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               break;
+       default:
+               LBUG ();
+               break;
+       }
+
+       ksocknal_peer_decref(conn->ksnc_peer);
+
+       LIBCFS_FREE (conn, sizeof (*conn));
+}
+
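+/* Close all of 'peer''s conns to 'ipaddr' (0 matches all) with close
+ * reason 'why'; returns the number of conns closed. */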
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+       ksock_conn_t       *conn;
+       struct list_head         *ctmp;
+       struct list_head         *cnxt;
+       int              count = 0;
+
+       list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+               conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+               if (ipaddr == 0 ||
+                   conn->ksnc_ipaddr == ipaddr) {
+                       count++;
+                       ksocknal_close_conn_locked (conn, why);
+               }
+       }
+
+       return (count);
+}
+
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+       ksock_peer_t     *peer = conn->ksnc_peer;
+       __u32        ipaddr = conn->ksnc_ipaddr;
+       int            count;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (count);
+}
+
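+/* Close every conn matching 'id'/'ipaddr' (wildcards allowed).
+ * Wildcard requests always succeed; a fully-specified request that
+ * closes nothing returns -ENOENT. */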
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+       ksock_peer_t       *peer;
+       struct list_head         *ptmp;
+       struct list_head         *pnxt;
+       int              lo;
+       int              hi;
+       int              i;
+       int              count = 0;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       if (id.nid != LNET_NID_ANY)
+               lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+       else {
+               lo = 0;
+               hi = ksocknal_data.ksnd_peer_hash_size - 1;
+       }
+
+       for (i = lo; i <= hi; i++) {
+               list_for_each_safe (ptmp, pnxt,
+                                       &ksocknal_data.ksnd_peers[i]) {
+
+                       peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+                       if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+                             (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+                               continue;
+
+                       count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       /* wildcards always succeed */
+       if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+               return (0);
+
+       return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+       /* The router is telling me she's been notified of a change in
+        * gateway state.... */
+       lnet_process_id_t  id = {0};
+
+       id.nid = gw_nid;
+       id.pid = LNET_PID_ANY;
+
+       CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+               alive ? "up" : "down");
+
+       if (!alive) {
+               /* If the gateway crashed, close all open connections... */
+               ksocknal_close_matching_conns (id, 0);
+               return;
+       }
+
+       /* ...otherwise do nothing.  We can only establish new connections
+        * if we have autoroutes, and these connect on demand. */
+}
+
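+/* Report via '*when' when 'nid' was last known alive, crediting any
+ * freshly ACKed socket data, and launch new connection attempts if the
+ * peer still has a connectable route. */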
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+       int             connect = 1;
+       cfs_time_t       last_alive = 0;
+       cfs_time_t       now = cfs_time_current();
+       ksock_peer_t      *peer = NULL;
+       rwlock_t                *glock = &ksocknal_data.ksnd_global_lock;
+       lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+       read_lock(glock);
+
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL) {
+               struct list_head       *tmp;
+               ksock_conn_t     *conn;
+               int            bufnob;
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                       bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+
+                       if (bufnob < conn->ksnc_tx_bufnob) {
+                               /* something got ACKed */
+                               conn->ksnc_tx_deadline =
+                                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                               peer->ksnp_last_alive = now;
+                               conn->ksnc_tx_bufnob = bufnob;
+                       }
+               }
+
+               last_alive = peer->ksnp_last_alive;
+               if (ksocknal_find_connectable_route_locked(peer) == NULL)
+                       connect = 0;
+       }
+
+       read_unlock(glock);
+
+       if (last_alive != 0)
+               *when = last_alive;
+
+       CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+              libcfs_nid2str(nid), peer,
+              last_alive ? cfs_duration_sec(now - last_alive) : -1,
+              connect);
+
+       if (!connect)
+               return;
+
+       ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+       write_lock_bh(glock);
+
+       peer = ksocknal_find_peer_locked(ni, id);
+       if (peer != NULL)
+               ksocknal_launch_all_connections_locked(peer);
+
+       write_unlock_bh(glock);
+       return;
+}
+
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+       int            index;
+       int            i;
+       struct list_head       *tmp;
+       ksock_conn_t     *conn;
+
+       for (index = 0; ; index++) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+
+               i = 0;
+               conn = NULL;
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       if (i++ == index) {
+                               conn = list_entry (tmp, ksock_conn_t,
+                                                      ksnc_list);
+                               ksocknal_conn_addref(conn);
+                               break;
+                       }
+               }
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               if (conn == NULL)
+                       break;
+
+               ksocknal_lib_push_conn (conn);
+               ksocknal_conn_decref(conn);
+       }
+}
+
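+/* Push (flush) the conns of every peer matching 'id'; returns -ENOENT
+ * if no peer matched. */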
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+       ksock_peer_t      *peer;
+       struct list_head        *tmp;
+       int             index;
+       int             i;
+       int             j;
+       int             rc = -ENOENT;
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               for (j = 0; ; j++) {
+                       read_lock(&ksocknal_data.ksnd_global_lock);
+
+                       index = 0;
+                       peer = NULL;
+
+                       list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                               peer = list_entry(tmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               if (!((id.nid == LNET_NID_ANY ||
+                                      id.nid == peer->ksnp_id.nid) &&
+                                     (id.pid == LNET_PID_ANY ||
+                                      id.pid == peer->ksnp_id.pid))) {
+                                       peer = NULL;
+                                       continue;
+                               }
+
+                               if (index++ == j) {
+                                       ksocknal_peer_addref(peer);
+                                       break;
+                               }
+                       }
+
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                       if (peer == NULL)
+                               break;
+
+                       rc = 0;
+                       ksocknal_push_peer (peer);
+                       ksocknal_peer_decref(peer);
+               }
+       }
+
+       return (rc);
+}
+
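+/* Register a local interface with 'ni': duplicates are silently
+ * ignored, and -ENOSPC is returned once LNET_MAX_INTERFACES are
+ * configured.  The new entry's peer/route counts are primed from the
+ * existing peer table; only new connections pay attention to it. */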
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+       ksock_net_t       *net = ni->ni_data;
+       ksock_interface_t *iface;
+       int             rc;
+       int             i;
+       int             j;
+       struct list_head        *ptmp;
+       ksock_peer_t      *peer;
+       struct list_head        *rtmp;
+       ksock_route_t     *route;
+
+       if (ipaddress == 0 ||
+           netmask == 0)
+               return (-EINVAL);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       iface = ksocknal_ip2iface(ni, ipaddress);
+       if (iface != NULL) {
+               /* silently ignore dups */
+               rc = 0;
+       } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+               rc = -ENOSPC;
+       } else {
+               iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+               iface->ksni_ipaddr = ipaddress;
+               iface->ksni_netmask = netmask;
+               iface->ksni_nroutes = 0;
+               iface->ksni_npeers = 0;
+
+               for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                       list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+                               peer = list_entry(ptmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+                                       if (peer->ksnp_passive_ips[j] == ipaddress)
+                                               iface->ksni_npeers++;
+
+                               list_for_each(rtmp, &peer->ksnp_routes) {
+                                       route = list_entry(rtmp,
+                                                              ksock_route_t,
+                                                              ksnr_list);
+
+                                       if (route->ksnr_myipaddr == ipaddress)
+                                               iface->ksni_nroutes++;
+                               }
+                       }
+               }
+
+               rc = 0;
+               /* NB only new connections will pay attention to the new interface! */
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+       struct list_head         *tmp;
+       struct list_head         *nxt;
+       ksock_route_t      *route;
+       ksock_conn_t       *conn;
+       int              i;
+       int              j;
+
+       for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+               if (peer->ksnp_passive_ips[i] == ipaddr) {
+                       for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+                               peer->ksnp_passive_ips[j-1] =
+                                       peer->ksnp_passive_ips[j];
+                       peer->ksnp_n_passive_ips--;
+                       break;
+               }
+
+       list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               if (route->ksnr_myipaddr != ipaddr)
+                       continue;
+
+               if (route->ksnr_share_count != 0) {
+                       /* Manually created; keep, but unbind */
+                       route->ksnr_myipaddr = 0;
+               } else {
+                       ksocknal_del_route_locked(route);
+               }
+       }
+
+       list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+               conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+               if (conn->ksnc_myipaddr == ipaddr)
+                       ksocknal_close_conn_locked (conn, 0);
+       }
+}
+
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+       ksock_net_t       *net = ni->ni_data;
+       int             rc = -ENOENT;
+       struct list_head        *tmp;
+       struct list_head        *nxt;
+       ksock_peer_t      *peer;
+       __u32         this_ip;
+       int             i;
+       int             j;
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+               if (ipaddress != 0 && ipaddress != this_ip)
+                       continue;
+
+               rc = 0;
+
+               for (j = i+1; j < net->ksnn_ninterfaces; j++)
+                       net->ksnn_interfaces[j-1] =
+                               net->ksnn_interfaces[j];
+
+               net->ksnn_ninterfaces--;
+
+               for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+                       list_for_each_safe(tmp, nxt,
+                                              &ksocknal_data.ksnd_peers[j]) {
+                               peer = list_entry(tmp, ksock_peer_t,
+                                                     ksnp_list);
+
+                               if (peer->ksnp_ni != ni)
+                                       continue;
+
+                               ksocknal_peer_del_interface_locked(peer, this_ip);
+                       }
+               }
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       lnet_process_id_t id = {0};
+       struct libcfs_ioctl_data *data = arg;
+       int rc;
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_INTERFACE: {
+               ksock_net_t       *net = ni->ni_data;
+               ksock_interface_t *iface;
+
+               read_lock(&ksocknal_data.ksnd_global_lock);
+
+               if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+                       rc = -ENOENT;
+               } else {
+                       rc = 0;
+                       iface = &net->ksnn_interfaces[data->ioc_count];
+
+                       data->ioc_u32[0] = iface->ksni_ipaddr;
+                       data->ioc_u32[1] = iface->ksni_netmask;
+                       data->ioc_u32[2] = iface->ksni_npeers;
+                       data->ioc_u32[3] = iface->ksni_nroutes;
+               }
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return rc;
+       }
+
+       case IOC_LIBCFS_ADD_INTERFACE:
+               return ksocknal_add_interface(ni,
+                                             data->ioc_u32[0], /* IP address */
+                                             data->ioc_u32[1]); /* net mask */
+
+       case IOC_LIBCFS_DEL_INTERFACE:
+               return ksocknal_del_interface(ni,
+                                             data->ioc_u32[0]); /* IP address */
+
+       case IOC_LIBCFS_GET_PEER: {
+               __u32       myip = 0;
+               __u32       ip = 0;
+               int           port = 0;
+               int           conn_count = 0;
+               int           share_count = 0;
+
+               rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                           &id, &myip, &ip, &port,
+                                           &conn_count,  &share_count);
+               if (rc != 0)
+                       return rc;
+
+               data->ioc_nid    = id.nid;
+               data->ioc_count  = share_count;
+               data->ioc_u32[0] = ip;
+               data->ioc_u32[1] = port;
+               data->ioc_u32[2] = myip;
+               data->ioc_u32[3] = conn_count;
+               data->ioc_u32[4] = id.pid;
+               return 0;
+       }
+
+       case IOC_LIBCFS_ADD_PEER:
+               id.nid = data->ioc_nid;
+               id.pid = LUSTRE_SRV_LNET_PID;
+               return ksocknal_add_peer (ni, id,
+                                         data->ioc_u32[0], /* IP */
+                                         data->ioc_u32[1]); /* port */
+
+       case IOC_LIBCFS_DEL_PEER:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_del_peer (ni, id,
+                                         data->ioc_u32[0]); /* IP */
+
+       case IOC_LIBCFS_GET_CONN: {
+               int        txmem;
+               int        rxmem;
+               int        nagle;
+               ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+               if (conn == NULL)
+                       return -ENOENT;
+
+               ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+               data->ioc_count  = txmem;
+               data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+               data->ioc_flags  = nagle;
+               data->ioc_u32[0] = conn->ksnc_ipaddr;
+               data->ioc_u32[1] = conn->ksnc_port;
+               data->ioc_u32[2] = conn->ksnc_myipaddr;
+               data->ioc_u32[3] = conn->ksnc_type;
+               data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+               data->ioc_u32[5] = rxmem;
+               data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+               ksocknal_conn_decref(conn);
+               return 0;
+       }
+
+       case IOC_LIBCFS_CLOSE_CONNECTION:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_close_matching_conns (id,
+                                                     data->ioc_u32[0]);
+
+       case IOC_LIBCFS_REGISTER_MYNID:
+               /* Ignore if this is a noop */
+               if (data->ioc_nid == ni->ni_nid)
+                       return 0;
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(ni->ni_nid));
+               return -EINVAL;
+
+       case IOC_LIBCFS_PUSH_CONNECTION:
+               id.nid = data->ioc_nid;
+               id.pid = LNET_PID_ANY;
+               return ksocknal_push(ni, id);
+
+       default:
+               return -EINVAL;
+       }
+       /* not reached */
+}
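+
+/*
+ * NB: IOC_LIBCFS_ADD_PEER pins the well-known server pid
+ * (LUSTRE_SRV_LNET_PID), whereas the delete/close/push commands above
+ * match any pid (LNET_PID_ANY).
+ */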
+
+void
+ksocknal_free_buffers (void)
+{
+       LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+       if (ksocknal_data.ksnd_sched_info != NULL) {
+               struct ksock_sched_info *info;
+               int                     i;
+
+               cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+                       if (info->ksi_scheds != NULL) {
+                               LIBCFS_FREE(info->ksi_scheds,
+                                           info->ksi_nthreads_max *
+                                           sizeof(info->ksi_scheds[0]));
+                       }
+               }
+               cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+       }
+
+       LIBCFS_FREE (ksocknal_data.ksnd_peers,
+                    sizeof (struct list_head) *
+                    ksocknal_data.ksnd_peer_hash_size);
+
+       spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+       if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+               struct list_head        zlist;
+               ksock_tx_t      *tx;
+
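+               /* list_add() + list_del_init() splices the whole idle
+                * list onto zlist, so the txs can be freed after
+                * dropping ksnd_tx_lock */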
+               list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+               list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+               while (!list_empty(&zlist)) {
+                       tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+                       list_del(&tx->tx_list);
+                       LIBCFS_FREE(tx, tx->tx_desc_size);
+               }
+       } else {
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       }
+}
+
+void
+ksocknal_base_shutdown(void)
+{
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       int                     i;
+       int                     j;
+
+       CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+              atomic_read (&libcfs_kmemory));
+       LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+       switch (ksocknal_data.ksnd_init) {
+       default:
+               LASSERT (0);
+
+       case SOCKNAL_INIT_ALL:
+       case SOCKNAL_INIT_DATA:
+               LASSERT (ksocknal_data.ksnd_peers != NULL);
+               for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                       LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+               }
+
+               LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+               LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+               LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+               LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+               LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       LASSERT(list_empty(&sched->kss_tx_conns));
+                                       LASSERT(list_empty(&sched->kss_rx_conns));
+                                       LASSERT(list_empty(&sched->kss_zombie_noop_txs));
+                                       LASSERT(sched->kss_nconns == 0);
+                               }
+                       }
+               }
+
+               /* flag threads to terminate; wake and wait for them to die */
+               ksocknal_data.ksnd_shuttingdown = 1;
+               wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+               wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+               if (ksocknal_data.ksnd_sched_info != NULL) {
+                       cfs_percpt_for_each(info, i,
+                                           ksocknal_data.ksnd_sched_info) {
+                               if (info->ksi_scheds == NULL)
+                                       continue;
+
+                               for (j = 0; j < info->ksi_nthreads_max; j++) {
+                                       sched = &info->ksi_scheds[j];
+                                       wake_up_all(&sched->kss_waitq);
+                               }
+                       }
+               }
+
+               i = 4;
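+               /* (i & -i) == i is true only when i is a power of two,
+                * so the D_WARNING message below fires at exponentially
+                * increasing intervals */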
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               while (ksocknal_data.ksnd_nthreads != 0) {
+                       i++;
+                       CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                              "waiting for %d threads to terminate\n",
+                               ksocknal_data.ksnd_nthreads);
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       cfs_pause(cfs_time_seconds(1));
+                       read_lock(&ksocknal_data.ksnd_global_lock);
+               }
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               ksocknal_free_buffers();
+
+               ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+               break;
+       }
+
+       CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+              atomic_read (&libcfs_kmemory));
+
+       module_put(THIS_MODULE);
+}
+
+__u64
+ksocknal_new_incarnation (void)
+{
+       struct timeval tv;
+
+       /* The incarnation number is the time this module loaded and it
+        * identifies this particular instance of the socknal.  Hopefully
+        * we won't be able to reboot more frequently than 1MHz for the
+        * foreseeable future :) */
+
+       do_gettimeofday(&tv);
+
+       return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
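+
+/*
+ * For example, loading at tv_sec = 1, tv_usec = 500000 yields an
+ * incarnation of 1 * 1000000 + 500000 = 1500000; two loads within the
+ * same microsecond would collide, hence the 1MHz remark above.
+ */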
+
+int
+ksocknal_base_startup(void)
+{
+       struct ksock_sched_info *info;
+       int                     rc;
+       int                     i;
+
+       LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+       LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+       memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+       ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+       LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+                     sizeof (struct list_head) *
+                     ksocknal_data.ksnd_peer_hash_size);
+       if (ksocknal_data.ksnd_peers == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+               INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+       rwlock_init(&ksocknal_data.ksnd_global_lock);
+       INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+       spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+       init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+       spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+       init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+       INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+       /* NB memset above zeros whole of ksocknal_data */
+
+       /* flag lists/ptrs/locks initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+       try_module_get(THIS_MODULE);
+
+       ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+                                                        sizeof(*info));
+       if (ksocknal_data.ksnd_sched_info == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+               ksock_sched_t   *sched;
+               int             nthrs;
+
+               nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+               } else {
+                       /* max to half of CPUs, assume another half should be
+                        * reserved for upper layer modules */
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+               }
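+
+               /* e.g. with ksnd_nscheds unset on an 8-CPU CPT, half
+                * the CPUs (nthrs >> 1 == 4) are used, floored at
+                * SOCKNAL_NSCHEDS and capped at the CPT weight */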
+
+               info->ksi_nthreads_max = nthrs;
+               info->ksi_cpt = i;
+
+               LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+                                info->ksi_nthreads_max * sizeof(*sched));
+               if (info->ksi_scheds == NULL)
+                       goto failed;
+
+               for (; nthrs > 0; nthrs--) {
+                       sched = &info->ksi_scheds[nthrs - 1];
+
+                       sched->kss_info = info;
+                       spin_lock_init(&sched->kss_lock);
+                       INIT_LIST_HEAD(&sched->kss_rx_conns);
+                       INIT_LIST_HEAD(&sched->kss_tx_conns);
+                       INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+                       init_waitqueue_head(&sched->kss_waitq);
+               }
+       }
+
+       ksocknal_data.ksnd_connd_starting         = 0;
+       ksocknal_data.ksnd_connd_failed_stamp     = 0;
+       ksocknal_data.ksnd_connd_starting_stamp   = cfs_time_current_sec();
+       /* must have at least 2 connds to remain responsive to accepts while
+        * connecting */
+       if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+               *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+       if (*ksocknal_tunables.ksnd_nconnds_max <
+           *ksocknal_tunables.ksnd_nconnds) {
+               ksocknal_tunables.ksnd_nconnds_max =
+                       ksocknal_tunables.ksnd_nconnds;
+       }
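+
+       /* NB the above is a pointer assignment: ksnd_nconnds_max is made
+        * to alias ksnd_nconnds rather than copying its value, so the
+        * effective max can never again fall below the connd count */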
+
+       for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+               char name[16];
+               spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+               ksocknal_data.ksnd_connd_starting++;
+               spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+               snprintf(name, sizeof(name), "socknal_cd%02d", i);
+               rc = ksocknal_thread_start(ksocknal_connd,
+                                          (void *)((ulong_ptr_t)i), name);
+               if (rc != 0) {
+                       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+                       ksocknal_data.ksnd_connd_starting--;
+                       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+                       CERROR("Can't spawn socknal connd: %d\n", rc);
+                       goto failed;
+               }
+       }
+
+       rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+       if (rc != 0) {
+               CERROR ("Can't spawn socknal reaper: %d\n", rc);
+               goto failed;
+       }
+
+       /* flag everything initialised */
+       ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+       return 0;
+
+ failed:
+       ksocknal_base_shutdown();
+       return -ENETDOWN;
+}
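+
+/*
+ * NB: the failure path above can call ksocknal_base_shutdown() at any
+ * point because ksnd_init records how far initialisation got, and the
+ * shutdown switch cleans up accordingly.
+ */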
+
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+       ksock_peer_t    *peer = NULL;
+       struct list_head        *tmp;
+       int             i;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+               list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                       peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+                       if (peer->ksnp_ni == ni)
+                               break;
+
+                       peer = NULL;
+               }
+       }
+
+       if (peer != NULL) {
+               ksock_route_t *route;
+               ksock_conn_t  *conn;
+
+               CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+                      "closing %d, accepting %d, err %d, zcookie "LPU64", "
+                      "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+                      atomic_read(&peer->ksnp_refcount),
+                      peer->ksnp_sharecount, peer->ksnp_closing,
+                      peer->ksnp_accepting, peer->ksnp_error,
+                      peer->ksnp_zc_next_cookie,
+                      !list_empty(&peer->ksnp_tx_queue),
+                      !list_empty(&peer->ksnp_zc_req_list));
+
+               list_for_each (tmp, &peer->ksnp_routes) {
+                       route = list_entry(tmp, ksock_route_t, ksnr_list);
+                       CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+                              "del %d\n", atomic_read(&route->ksnr_refcount),
+                              route->ksnr_scheduled, route->ksnr_connecting,
+                              route->ksnr_connected, route->ksnr_deleted);
+               }
+
+               list_for_each (tmp, &peer->ksnp_conns) {
+                       conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                       CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+                              atomic_read(&conn->ksnc_conn_refcount),
+                              atomic_read(&conn->ksnc_sock_refcount),
+                              conn->ksnc_type, conn->ksnc_closing);
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+       return;
+}
+
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+       ksock_net_t      *net = ni->ni_data;
+       int            i;
+       lnet_process_id_t anyid = {0};
+
+       anyid.nid = LNET_NID_ANY;
+       anyid.pid = LNET_PID_ANY;
+
+       LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+       LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+       spin_lock_bh(&net->ksnn_lock);
+       net->ksnn_shutdown = 1;          /* prevent new peers */
+       spin_unlock_bh(&net->ksnn_lock);
+
+       /* Delete all peers */
+       ksocknal_del_peer(ni, anyid, 0);
+
+       /* Wait for all peer state to clean up */
+       i = 2;
+       spin_lock_bh(&net->ksnn_lock);
+       while (net->ksnn_npeers != 0) {
+               spin_unlock_bh(&net->ksnn_lock);
+
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                      "waiting for %d peers to disconnect\n",
+                      net->ksnn_npeers);
+               cfs_pause(cfs_time_seconds(1));
+
+               ksocknal_debug_peerhash(ni);
+
+               spin_lock_bh(&net->ksnn_lock);
+       }
+       spin_unlock_bh(&net->ksnn_lock);
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+               LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+       }
+
+       list_del(&net->ksnn_list);
+       LIBCFS_FREE(net, sizeof(*net));
+
+       ksocknal_data.ksnd_nnets--;
+       if (ksocknal_data.ksnd_nnets == 0)
+               ksocknal_base_shutdown();
+}
+
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+       char      **names;
+       int      i;
+       int      j;
+       int      rc;
+       int      n;
+
+       n = libcfs_ipif_enumerate(&names);
+       if (n <= 0) {
+               CERROR("Can't enumerate interfaces: %d\n", n);
+               return n;
+       }
+
+       for (i = j = 0; i < n; i++) {
+               int     up;
+               __u32      ip;
+               __u32      mask;
+
+               if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                       continue;
+
+               rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+               if (rc != 0) {
+                       CWARN("Can't get interface %s info: %d\n",
+                             names[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Ignoring interface %s (down)\n",
+                             names[i]);
+                       continue;
+               }
+
+               if (j == LNET_MAX_INTERFACES) {
+                       CWARN("Ignoring interface %s (too many interfaces)\n",
+                             names[i]);
+                       continue;
+               }
+
+               net->ksnn_interfaces[j].ksni_ipaddr = ip;
+               net->ksnn_interfaces[j].ksni_netmask = mask;
+               strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+                       names[i], IFNAMSIZ);
+               j++;
+       }
+
+       libcfs_ipif_free_enumeration(names, n);
+
+       if (j == 0)
+               CERROR("Can't find any usable interfaces\n");
+
+       return j;
+}
+
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+       int     new_ipif = 0;
+       int     i;
+
+       for (i = 0; i < net->ksnn_ninterfaces; i++) {
+               char            *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+               char            *colon = strchr(ifnam, ':');
+               int             found  = 0;
+               ksock_net_t     *tmp;
+               int             j;
+
+               if (colon != NULL) /* ignore alias device */
+                       *colon = 0;
+
+               list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+                                       ksnn_list) {
+                       for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+                               char *ifnam2 =
+                                       &tmp->ksnn_interfaces[j].ksni_name[0];
+                               char *colon2 = strchr(ifnam2, ':');
+
+                               if (colon2 != NULL)
+                                       *colon2 = 0;
+
+                               found = strcmp(ifnam, ifnam2) == 0;
+                               if (colon2 != NULL)
+                                       *colon2 = ':';
+                       }
+                       if (found)
+                               break;
+               }
+
+               new_ipif += !found;
+               if (colon != NULL)
+                       *colon = ':';
+       }
+
+       return new_ipif;
+}
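+
+/*
+ * NB: the comparisons above temporarily overwrite the ':' with a NUL so
+ * that an alias device such as "eth0:1" compares equal to its base
+ * device, then restore the ':' before returning.
+ */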
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+       int     nthrs;
+       int     rc = 0;
+       int     i;
+
+       if (info->ksi_nthreads == 0) {
+               if (*ksocknal_tunables.ksnd_nscheds > 0) {
+                       nthrs = info->ksi_nthreads_max;
+               } else {
+                       nthrs = cfs_cpt_weight(lnet_cpt_table(),
+                                              info->ksi_cpt);
+                       nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+                       nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+               }
+               nthrs = min(nthrs, info->ksi_nthreads_max);
+       } else {
+               LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+               /* start up to two more threads if there is a new interface */
+               nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+       }
+
+       for (i = 0; i < nthrs; i++) {
+               long            id;
+               char            name[20];
+               ksock_sched_t   *sched;
+               id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+               sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+               snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+                        info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+               rc = ksocknal_thread_start(ksocknal_scheduler,
+                                          (void *)id, name);
+               if (rc == 0)
+                       continue;
+
+               CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+                      info->ksi_cpt, info->ksi_nthreads + i, rc);
+               break;
+       }
+
+       info->ksi_nthreads += i;
+       return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+       int     newif = ksocknal_search_new_ipif(net);
+       int     rc;
+       int     i;
+
+       LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+       for (i = 0; i < ncpts; i++) {
+               struct ksock_sched_info *info;
+               int cpt = (cpts == NULL) ? i : cpts[i];
+
+               LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+               info = ksocknal_data.ksnd_sched_info[cpt];
+
+               if (!newif && info->ksi_nthreads > 0)
+                       continue;
+
+               rc = ksocknal_start_schedulers(info);
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+       ksock_net_t  *net;
+       int        rc;
+       int        i;
+
+       LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+       if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+               rc = ksocknal_base_startup();
+               if (rc != 0)
+                       return rc;
+       }
+
+       LIBCFS_ALLOC(net, sizeof(*net));
+       if (net == NULL)
+               goto fail_0;
+
+       spin_lock_init(&net->ksnn_lock);
+       net->ksnn_incarnation = ksocknal_new_incarnation();
+       ni->ni_data = net;
+       ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
+       ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
+       ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
+       ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+       if (ni->ni_interfaces[0] == NULL) {
+               rc = ksocknal_enumerate_interfaces(net);
+               if (rc <= 0)
+                       goto fail_1;
+
+               net->ksnn_ninterfaces = 1;
+       } else {
+               for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+                       int    up;
+
+                       if (ni->ni_interfaces[i] == NULL)
+                               break;
+
+                       rc = libcfs_ipif_query(
+                               ni->ni_interfaces[i], &up,
+                               &net->ksnn_interfaces[i].ksni_ipaddr,
+                               &net->ksnn_interfaces[i].ksni_netmask);
+
+                       if (rc != 0) {
+                               CERROR("Can't get interface %s info: %d\n",
+                                      ni->ni_interfaces[i], rc);
+                               goto fail_1;
+                       }
+
+                       if (!up) {
+                               CERROR("Interface %s is down\n",
+                                      ni->ni_interfaces[i]);
+                               goto fail_1;
+                       }
+
+                       strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+                               ni->ni_interfaces[i], IFNAMSIZ);
+               }
+               net->ksnn_ninterfaces = i;
+       }
+
+       /* call it before adding net to ksocknal_data.ksnd_nets */
+       rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+       if (rc != 0)
+               goto fail_1;
+
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                               net->ksnn_interfaces[0].ksni_ipaddr);
+       list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+       ksocknal_data.ksnd_nnets++;
+
+       return 0;
+
+ fail_1:
+       LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+       if (ksocknal_data.ksnd_nnets == 0)
+               ksocknal_base_shutdown();
+
+       return -ENETDOWN;
+}
+
+void __exit
+ksocknal_module_fini (void)
+{
+       lnet_unregister_lnd(&the_ksocklnd);
+       ksocknal_tunables_fini();
+}
+
+int __init
+ksocknal_module_init (void)
+{
+       int    rc;
+
+       /* check ksnr_connected/connecting field large enough */
+       CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+       CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+       /* initialize the_ksocklnd */
+       the_ksocklnd.lnd_type     = SOCKLND;
+       the_ksocklnd.lnd_startup  = ksocknal_startup;
+       the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+       the_ksocklnd.lnd_ctl      = ksocknal_ctl;
+       the_ksocklnd.lnd_send     = ksocknal_send;
+       the_ksocklnd.lnd_recv     = ksocknal_recv;
+       the_ksocklnd.lnd_notify   = ksocknal_notify;
+       the_ksocklnd.lnd_query    = ksocknal_query;
+       the_ksocklnd.lnd_accept   = ksocknal_accept;
+
+       rc = ksocknal_tunables_init();
+       if (rc != 0)
+               return rc;
+
+       lnet_register_lnd(&the_ksocklnd);
+
+       return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644 (file)
index 0000000..b483e0c
--- /dev/null
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE  101         /* # peer lists */
+#define SOCKNAL_RESCHED         100         /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000       /* connd appears to be retrying reconnects forever */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK       /* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0     /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0     /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG       0     /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct                           /* per scheduler state */
+{
+       spinlock_t              kss_lock;       /* serialise */
+       struct list_head                kss_rx_conns;   /* conn waiting to be read */
+       /* conn waiting to be written */
+       struct list_head                kss_tx_conns;
+       /* zombie noop tx list */
+       struct list_head                kss_zombie_noop_txs;
+       wait_queue_head_t               kss_waitq;      /* where scheduler sleeps */
+       /* # connections assigned to this scheduler */
+       int                     kss_nconns;
+       struct ksock_sched_info *kss_info;      /* owner of it */
+       struct page             *kss_rx_scratch_pgs[LNET_MAX_IOV];
+       struct iovec            kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {
+       int                     ksi_nthreads_max; /* max allowed threads */
+       int                     ksi_nthreads;   /* number of threads */
+       int                     ksi_cpt;        /* CPT id */
+       ksock_sched_t           *ksi_scheds;    /* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT                        16
+#define KSOCK_THREAD_ID(cpt, sid)      (((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)           ((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id)           ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
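+
+/*
+ * For example, KSOCK_THREAD_ID(2, 5) == (2 << 16) | 5 == 0x20005;
+ * KSOCK_THREAD_CPT(0x20005) == 2 and KSOCK_THREAD_SID(0x20005) == 5.
+ */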
+
+typedef struct                           /* in-use interface */
+{
+       __u32           ksni_ipaddr;            /* interface's IP address */
+       __u32           ksni_netmask;           /* interface's network mask */
+       int             ksni_nroutes;           /* # routes using (active) */
+       int             ksni_npeers;            /* # peers using (passive) */
+       char            ksni_name[IFNAMSIZ];    /* interface name */
+} ksock_interface_t;
+
+typedef struct
+{
+       /* "stuck" socket timeout (seconds) */
+       int           *ksnd_timeout;
+       /* # scheduler threads in each pool while starting */
+       int              *ksnd_nscheds;
+       int           *ksnd_nconnds;     /* # connection daemons */
+       int           *ksnd_nconnds_max;     /* max # connection daemons */
+       int           *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+       int           *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+       int           *ksnd_eager_ack;       /* make TCP ack eagerly? */
+       int           *ksnd_typed_conns;     /* drive sockets by type? */
+       int           *ksnd_min_bulk;   /* smallest "large" message */
+       int           *ksnd_tx_buffer_size;  /* socket tx buffer size */
+       int           *ksnd_rx_buffer_size;  /* socket rx buffer size */
+       int           *ksnd_nagle;         /* enable NAGLE? */
+       int           *ksnd_round_robin;     /* round robin for multiple interfaces */
+       int           *ksnd_keepalive;       /* # secs for sending keepalive NOOP */
+       int           *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+       int           *ksnd_keepalive_count; /* # probes */
+       int           *ksnd_keepalive_intvl; /* time between probes */
+       int           *ksnd_credits;     /* # concurrent sends */
+       int           *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
+       int           *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
+       int           *ksnd_peertimeout;     /* seconds to consider peer dead */
+       int           *ksnd_enable_csum;     /* enable checksum */
+       int           *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+       int           *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
+       unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
+       int           *ksnd_zc_recv;     /* enable ZC receive (for Chelsio TOE) */
+       int           *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct
+{
+       __u64             ksnn_incarnation;     /* my epoch */
+       spinlock_t        ksnn_lock;            /* serialise */
+       struct list_head          ksnn_list;            /* chain on global list */
+       int               ksnn_npeers;          /* # peers */
+       int               ksnn_shutdown;        /* shutting down? */
+       int               ksnn_ninterfaces;     /* IP interfaces */
+       ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT  120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV     1
+
+typedef struct
+{
+       int                     ksnd_init;      /* initialisation state */
+       int                     ksnd_nnets;     /* # networks set up */
+       struct list_head                ksnd_nets;      /* list of nets */
+       /* stabilize peer/conn ops */
+       rwlock_t                ksnd_global_lock;
+       /* hash table of all my known peers */
+       struct list_head                *ksnd_peers;
+       int                     ksnd_peer_hash_size; /* size of ksnd_peers */
+
+       int                     ksnd_nthreads;  /* # live threads */
+       int                     ksnd_shuttingdown; /* tell threads to exit */
+       /* schedulers information */
+       struct ksock_sched_info **ksnd_sched_info;
+
+       atomic_t      ksnd_nactive_txs;    /* #active txs */
+
+       struct list_head        ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+       struct list_head        ksnd_zombie_conns;   /* conns to free: reaper_lock */
+       struct list_head        ksnd_enomem_conns;   /* conns to retry: reaper_lock*/
+       wait_queue_head_t       ksnd_reaper_waitq;   /* reaper sleeps here */
+       cfs_time_t      ksnd_reaper_waketime;/* when reaper will wake */
+       spinlock_t        ksnd_reaper_lock;     /* serialise */
+
+       int            ksnd_enomem_tx;      /* test ENOMEM sender */
+       int            ksnd_stall_tx;       /* test sluggish sender */
+       int            ksnd_stall_rx;       /* test sluggish receiver */
+
+       struct list_head        ksnd_connd_connreqs; /* incoming connection requests */
+       struct list_head        ksnd_connd_routes;   /* routes waiting to be connected */
+       wait_queue_head_t       ksnd_connd_waitq;    /* connds sleep here */
+       int            ksnd_connd_connecting;/* # connds connecting */
+       /** time stamp of the last failed connecting attempt */
+       long          ksnd_connd_failed_stamp;
+       /** # starting connd */
+       unsigned          ksnd_connd_starting;
+       /** time stamp of the last starting connd */
+       long          ksnd_connd_starting_stamp;
+       /** # running connd */
+       unsigned          ksnd_connd_running;
+       spinlock_t        ksnd_connd_lock;      /* serialise */
+
+       struct list_head          ksnd_idle_noop_txs;   /* list head for freed noop tx */
+       spinlock_t        ksnd_tx_lock;         /* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL       2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;                           /* forward ref */
+struct ksock_peer;                           /* forward ref */
+struct ksock_route;                         /* forward ref */
+struct ksock_proto;                         /* forward ref */
+
+typedef struct                           /* transmit packet */
+{
+       struct list_head     tx_list;   /* queue on conn for transmission etc */
+       struct list_head     tx_zc_list;     /* queue on peer for ZC request */
+       atomic_t   tx_refcount;    /* tx reference count */
+       int         tx_nob;      /* # packet bytes */
+       int         tx_resid;       /* residual bytes */
+       int         tx_niov;    /* # packet iovec frags */
+       struct iovec  *tx_iov;   /* packet iovec frags */
+       int         tx_nkiov;       /* # packet page frags */
+       unsigned short tx_zc_aborted;  /* aborted ZC request */
+       unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+       unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+       unsigned short tx_nonblk:1;    /* it's a non-blocking ACK */
+       lnet_kiov_t   *tx_kiov; /* packet page frags */
+       struct ksock_conn  *tx_conn;    /* owning conn */
+       lnet_msg_t    *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+       cfs_time_t     tx_deadline;    /* when (in jiffies) tx times out */
+       ksock_msg_t    tx_msg;   /* socklnd message buffer */
+       int         tx_desc_size;   /* size of this descriptor */
+       union {
+               struct {
+                       struct iovec iov;       /* virt hdr */
+                       lnet_kiov_t  kiov[0];   /* paged payload */
+               }                 paged;
+               struct {
+                       struct iovec iov[1];    /* virt hdr + payload */
+               }                 virt;
+       }                      tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_NOOP_TX_SIZE  ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
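+
+/*
+ * NB: a noop tx carries no payload pages, so only the descriptor bytes
+ * up to (and excluding) the flexible kiov[] array need be allocated;
+ * offsetof() gives exactly that size.
+ */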
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+       struct iovec     iov[LNET_MAX_IOV];
+       lnet_kiov_t      kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER   1             /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2             /* reading lnet message header */
+#define SOCKNAL_RX_PARSE       3              /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4             /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5             /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP         6             /* skipping body */
+
+typedef struct ksock_conn
+{
+       struct ksock_peer  *ksnc_peer;   /* owning peer */
+       struct ksock_route *ksnc_route; /* owning route */
+       struct list_head          ksnc_list;     /* stash on peer's conn list */
+       socket_t       *ksnc_sock;       /* actual socket */
+       void           *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+       void           *ksnc_saved_write_space; /* socket's original write_space() callback */
+       atomic_t        ksnc_conn_refcount; /* conn refcount */
+       atomic_t        ksnc_sock_refcount; /* sock refcount */
+       ksock_sched_t      *ksnc_scheduler;  /* who schedules this connection */
+       __u32          ksnc_myipaddr;   /* my IP */
+       __u32          ksnc_ipaddr;     /* peer's IP */
+       int              ksnc_port;       /* peer's port */
+       signed int        ksnc_type:3;     /* type of connection,
+                                             * should be signed value */
+       unsigned int        ksnc_closing:1;  /* being shut down */
+       unsigned int        ksnc_flip:1;     /* flip or not, only for V2.x */
+       unsigned int        ksnc_zc_capable:1; /* enable to ZC */
+       struct ksock_proto *ksnc_proto;      /* protocol for the connection */
+
+       /* reader */
+       struct list_head  ksnc_rx_list;     /* where I enq waiting input or a forwarding descriptor */
+       cfs_time_t          ksnc_rx_deadline; /* when (in jiffies) receive times out */
+       __u8              ksnc_rx_started;  /* started receiving a message */
+       __u8              ksnc_rx_ready;    /* data ready to read */
+       __u8              ksnc_rx_scheduled;/* being progressed */
+       __u8              ksnc_rx_state;    /* what is being read */
+       int                ksnc_rx_nob_left; /* # bytes to next hdr/body */
+       int                ksnc_rx_nob_wanted; /* bytes actually wanted */
+       int                ksnc_rx_niov;     /* # iovec frags */
+       struct iovec     *ksnc_rx_iov;      /* the iovec frags */
+       int                ksnc_rx_nkiov;    /* # page frags */
+       lnet_kiov_t       *ksnc_rx_kiov;     /* the page frags */
+       ksock_rxiovspace_t    ksnc_rx_iov_space;/* space for frag descriptors */
+       __u32            ksnc_rx_csum;     /* partial checksum for incoming data */
+       void             *ksnc_cookie;      /* rx lnet_finalize passthru arg */
+       ksock_msg_t        ksnc_msg;     /* incoming message buffer:
+                                                * V2.x message takes the
+                                                * whole struct
+                                                * V1.x message is a bare
+                                                * lnet_hdr_t, it's stored in
+                                                * ksnc_msg.ksm_u.lnetmsg */
+
+       /* WRITER */
+       struct list_head            ksnc_tx_list;     /* where I enq waiting for output space */
+       struct list_head            ksnc_tx_queue;    /* packets waiting to be sent */
+       ksock_tx_t         *ksnc_tx_carrier;  /* next TX that can carry a LNet message or ZC-ACK */
+       cfs_time_t          ksnc_tx_deadline; /* when (in jiffies) tx times out */
+       int                ksnc_tx_bufnob;     /* send buffer marker */
+       atomic_t          ksnc_tx_nob;  /* # bytes queued */
+       int                ksnc_tx_ready;      /* write space */
+       int                ksnc_tx_scheduled;  /* being progressed */
+       cfs_time_t          ksnc_tx_last_post;  /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route
+{
+       struct list_head            ksnr_list;  /* chain on peer route list */
+       struct list_head            ksnr_connd_list;  /* chain on ksnr_connd_routes */
+       struct ksock_peer    *ksnr_peer;        /* owning peer */
+       atomic_t          ksnr_refcount;    /* # users */
+       cfs_time_t          ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
+       cfs_duration_t  ksnr_retry_interval; /* how long between retries */
+       __u32            ksnr_myipaddr;    /* my IP */
+       __u32            ksnr_ipaddr;      /* IP address to connect to */
+       int                ksnr_port;   /* port to connect to */
+       unsigned int      ksnr_scheduled:1; /* scheduled for attention */
+       unsigned int      ksnr_connecting:1;/* connection establishment in progress */
+       unsigned int      ksnr_connected:4; /* connections established by type */
+       unsigned int      ksnr_deleted:1;   /* been removed from peer? */
+       unsigned int      ksnr_share_count; /* created explicitly? */
+       int                ksnr_conn_count;  /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING   1       /* cookie for keepalive ping */
+
+typedef struct ksock_peer
+{
+       struct list_head            ksnp_list;  /* stash on global peer list */
+       cfs_time_t          ksnp_last_alive;  /* when (in jiffies) I was last alive */
+       lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
+       atomic_t          ksnp_refcount; /* # users */
+       int                ksnp_sharecount;  /* lconf usage counter */
+       int                ksnp_closing;  /* being closed */
+       int                ksnp_accepting;/* # passive connections pending */
+       int                ksnp_error;    /* errno on closing last conn */
+       __u64            ksnp_zc_next_cookie;/* ZC completion cookie */
+       __u64            ksnp_incarnation;   /* latest known peer incarnation */
+       struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+       struct list_head            ksnp_conns;    /* all active connections */
+       struct list_head            ksnp_routes;   /* routes */
+       struct list_head            ksnp_tx_queue; /* waiting packets */
+       spinlock_t            ksnp_lock;        /* serialize, g_lock unsafe */
+       struct list_head            ksnp_zc_req_list;   /* zero copy requests wait for ACK  */
+       cfs_time_t          ksnp_send_keepalive; /* time to send keepalive */
+       lnet_ni_t           *ksnp_ni;       /* which network */
+       int                ksnp_n_passive_ips; /* # of... */
+       __u32            ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq
+{
+       struct list_head            ksncr_list;     /* stash on ksnd_connd_connreqs */
+       lnet_ni_t           *ksncr_ni;       /* chosen NI */
+       socket_t         *ksncr_sock;     /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO       0       /* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES       1      /* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY       2      /* TX can be sent on the connection, but not preferred */
+
+typedef struct ksock_proto
+{
+       int        pro_version;                                       /* version number of protocol */
+       int      (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake function */
+       int      (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+       void    (*pro_pack)(ksock_tx_t *);                                /* message pack */
+       void    (*pro_unpack)(ksock_msg_t *);                          /* message unpack */
+       ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *);    /* queue tx on the connection */
+       int      (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+       int      (*pro_handle_zcreq)(ksock_conn_t *, __u64, int);           /* handle ZC request */
+       int      (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64);       /* handle ZC ACK */
+       int      (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int);     /* msg type matches the connection type:
+                                                                                * return value:
+                                                                                *   return MATCH_NO  : no
+                                                                                *   return MATCH_YES : matching type
+                                                                                *   return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1   KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE   0UL
+#endif
+
+static inline int
+ksocknal_route_mask(void)
+{
+       if (!*ksocknal_tunables.ksnd_typed_conns)
+               return (1 << SOCKLND_CONN_ANY);
+
+       return ((1 << SOCKLND_CONN_CONTROL) |
+               (1 << SOCKLND_CONN_BULK_IN) |
+               (1 << SOCKLND_CONN_BULK_OUT));
+}
+
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+       unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+       return (&ksocknal_data.ksnd_peers [hash]);
+}
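+
+/*
+ * NB: the (unsigned int) cast keeps the low 32 bits of the NID, which
+ * (per LNET_MKNID() in ksocknal_startup()) carry the interface IP
+ * address; taking that modulo the prime SOCKNAL_PEER_HASH_SIZE spreads
+ * peers across the hash buckets.
+ */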
+
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+       atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+       if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+               ksocknal_queue_zombie_conn(conn);
+}
+
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+       int   rc = -ESHUTDOWN;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+       if (!conn->ksnc_closing) {
+               LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+               atomic_inc(&conn->ksnc_sock_refcount);
+               rc = 0;
+       }
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       return (rc);
+}
+
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+       LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+       if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+               LASSERT (conn->ksnc_closing);
+               libcfs_sock_release(conn->ksnc_sock);
+               conn->ksnc_sock = NULL;
+               ksocknal_finalize_zcreq(conn);
+       }
+}
+
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+       LASSERT (atomic_read(&tx->tx_refcount) > 0);
+       atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
+{
+       LASSERT (atomic_read(&tx->tx_refcount) > 0);
+       if (atomic_dec_and_test(&tx->tx_refcount))
+               ksocknal_tx_done(NULL, tx);
+}
+
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+       LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+       atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+       LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+       if (atomic_dec_and_test(&route->ksnr_refcount))
+               ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+       atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+       LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+       if (atomic_dec_and_test(&peer->ksnp_refcount))
+               ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                 int delayed, unsigned int niov,
+                 struct iovec *iov, lnet_kiov_t *kiov,
+                 unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                                socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+                                             __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+                                              ksock_tx_t *tx, int nonblk);
+
+extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+                                  lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+                                 int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                               lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                               ksock_hello_msg_t *hello, lnet_process_id_t *id,
+                               __u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+                                          int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644 (file)
index 0000000..ad5e241
--- /dev/null
@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
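+/* NOOP tx descriptors are all KSOCK_NOOP_TX_SIZE bytes, so rather than
+ * hitting the allocator each time they are recycled through the
+ * ksnd_idle_noop_txs freelist; only a miss there falls back to
+ * LIBCFS_ALLOC(). */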
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+       ksock_tx_t *tx = NULL;
+
+       if (type == KSOCK_MSG_NOOP) {
+               LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+               /* searching for a noop tx in free list */
+               spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+               if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+                       tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
+                                       ksock_tx_t, tx_list);
+                       LASSERT(tx->tx_desc_size == size);
+                       list_del(&tx->tx_list);
+               }
+
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       }
+
+       if (tx == NULL)
+               LIBCFS_ALLOC(tx, size);
+
+       if (tx == NULL)
+               return NULL;
+
+       atomic_set(&tx->tx_refcount, 1);
+       tx->tx_zc_aborted = 0;
+       tx->tx_zc_capable = 0;
+       tx->tx_zc_checked = 0;
+       tx->tx_desc_size  = size;
+
+       atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+       return tx;
+}
+
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+       ksock_tx_t *tx;
+
+       tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+       if (tx == NULL) {
+               CERROR("Can't allocate noop tx desc\n");
+               return NULL;
+       }
+
+       tx->tx_conn     = NULL;
+       tx->tx_lnetmsg  = NULL;
+       tx->tx_kiov     = NULL;
+       tx->tx_nkiov    = 0;
+       tx->tx_iov      = tx->tx_frags.virt.iov;
+       tx->tx_niov     = 1;
+       tx->tx_nonblk   = nonblk;
+
+       socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+
+       return tx;
+}
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+       atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+       if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+               /* it's a noop tx */
+               spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+               list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+               spin_unlock(&ksocknal_data.ksnd_tx_lock);
+       } else {
+               LIBCFS_FREE(tx, tx->tx_desc_size);
+       }
+}
+
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct iovec  *iov = tx->tx_iov;
+       int    nob;
+       int    rc;
+
+       LASSERT (tx->tx_niov > 0);
+
+       /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+       rc = ksocknal_lib_send_iov(conn, tx);
+
+       if (rc <= 0)                        /* sent nothing? */
+               return (rc);
+
+       nob = rc;
+       LASSERT (nob <= tx->tx_resid);
+       tx->tx_resid -= nob;
+
+       /* "consume" iov */
+       do {
+               LASSERT (tx->tx_niov > 0);
+
+               if (nob < (int) iov->iov_len) {
+                       iov->iov_base = (void *)((char *)iov->iov_base + nob);
+                       iov->iov_len -= nob;
+                       return (rc);
+               }
+
+               nob -= iov->iov_len;
+               tx->tx_iov = ++iov;
+               tx->tx_niov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       lnet_kiov_t    *kiov = tx->tx_kiov;
+       int     nob;
+       int     rc;
+
+       LASSERT (tx->tx_niov == 0);
+       LASSERT (tx->tx_nkiov > 0);
+
+       /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+       rc = ksocknal_lib_send_kiov(conn, tx);
+
+       if (rc <= 0)                        /* sent nothing? */
+               return (rc);
+
+       nob = rc;
+       LASSERT (nob <= tx->tx_resid);
+       tx->tx_resid -= nob;
+
+       /* "consume" kiov */
+       do {
+               LASSERT(tx->tx_nkiov > 0);
+
+               if (nob < (int)kiov->kiov_len) {
+                       kiov->kiov_offset += nob;
+                       kiov->kiov_len -= nob;
+                       return rc;
+               }
+
+               nob -= (int)kiov->kiov_len;
+               tx->tx_kiov = ++kiov;
+               tx->tx_nkiov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
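+/* Push tx out until it has been completely sent, the socket would block,
+ * or an error occurs: returns 0 once tx_resid reaches zero, -EAGAIN when
+ * flow controlled, -ENOMEM when EAGAIN was caused by memory pressure, or
+ * a fatal socket error otherwise. */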
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       int      rc;
+       int      bufnob;
+
+       if (ksocknal_data.ksnd_stall_tx != 0) {
+               cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+       }
+
+       LASSERT (tx->tx_resid != 0);
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               return (-ESHUTDOWN);
+       }
+
+       do {
+               if (ksocknal_data.ksnd_enomem_tx > 0) {
+                       /* testing... */
+                       ksocknal_data.ksnd_enomem_tx--;
+                       rc = -EAGAIN;
+               } else if (tx->tx_niov != 0) {
+                       rc = ksocknal_send_iov (conn, tx);
+               } else {
+                       rc = ksocknal_send_kiov (conn, tx);
+               }
+
+               bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+               if (rc > 0)                  /* sent something? */
+                       conn->ksnc_tx_bufnob += rc; /* account it */
+
+               if (bufnob < conn->ksnc_tx_bufnob) {
+                       /* allocated send buffer bytes < computed; infer
+                        * something got ACKed */
+                       conn->ksnc_tx_deadline =
+                               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+                       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+                       conn->ksnc_tx_bufnob = bufnob;
+                       mb();
+               }
+
+               if (rc <= 0) { /* Didn't write anything? */
+
+                       if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+                               rc = -EAGAIN;
+
+                       /* Check if EAGAIN is due to memory pressure */
+                       if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+                               rc = -ENOMEM;
+
+                       break;
+               }
+
+               /* socket's wmem_queued now includes 'rc' bytes */
+               atomic_sub (rc, &conn->ksnc_tx_nob);
+               rc = 0;
+
+       } while (tx->tx_resid != 0);
+
+       ksocknal_connsock_decref(conn);
+       return (rc);
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+       struct iovec *iov = conn->ksnc_rx_iov;
+       int     nob;
+       int     rc;
+
+       LASSERT (conn->ksnc_rx_niov > 0);
+
+       /* Never touch conn->ksnc_rx_iov or change connection
+        * status inside ksocknal_lib_recv_iov */
+       rc = ksocknal_lib_recv_iov(conn);
+
+       if (rc <= 0)
+               return (rc);
+
+       /* received something... */
+       nob = rc;
+
+       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+       conn->ksnc_rx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();                  /* order with setting rx_started */
+       conn->ksnc_rx_started = 1;
+
+       conn->ksnc_rx_nob_wanted -= nob;
+       conn->ksnc_rx_nob_left -= nob;
+
+       do {
+               LASSERT (conn->ksnc_rx_niov > 0);
+
+               if (nob < (int)iov->iov_len) {
+                       iov->iov_len -= nob;
+                       iov->iov_base = (void *)((char *)iov->iov_base + nob);
+                       return (-EAGAIN);
+               }
+
+               nob -= iov->iov_len;
+               conn->ksnc_rx_iov = ++iov;
+               conn->ksnc_rx_niov--;
+       } while (nob != 0);
+
+       return (rc);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+       lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+       int     nob;
+       int     rc;
+
+       LASSERT (conn->ksnc_rx_nkiov > 0);
+
+       /* Never touch conn->ksnc_rx_kiov or change connection
+        * status inside ksocknal_lib_recv_kiov */
+       rc = ksocknal_lib_recv_kiov(conn);
+
+       if (rc <= 0)
+               return (rc);
+
+       /* received something... */
+       nob = rc;
+
+       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+       conn->ksnc_rx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+       mb();                  /* order with setting rx_started */
+       conn->ksnc_rx_started = 1;
+
+       conn->ksnc_rx_nob_wanted -= nob;
+       conn->ksnc_rx_nob_left -= nob;
+
+       do {
+               LASSERT (conn->ksnc_rx_nkiov > 0);
+
+               if (nob < (int) kiov->kiov_len) {
+                       kiov->kiov_offset += nob;
+                       kiov->kiov_len -= nob;
+                       return -EAGAIN;
+               }
+
+               nob -= kiov->kiov_len;
+               conn->ksnc_rx_kiov = ++kiov;
+               conn->ksnc_rx_nkiov--;
+       } while (nob != 0);
+
+       return 1;
+}
+
+int
+ksocknal_receive (ksock_conn_t *conn)
+{
+       /* Return 1 on success, 0 on EOF, < 0 on error.
+        * Caller checks ksnc_rx_nob_wanted to determine
+        * progress/completion. */
+       int     rc;
+       ENTRY;
+
+       if (ksocknal_data.ksnd_stall_rx != 0) {
+               cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+       }
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               return (-ESHUTDOWN);
+       }
+
+       for (;;) {
+               if (conn->ksnc_rx_niov != 0)
+                       rc = ksocknal_recv_iov (conn);
+               else
+                       rc = ksocknal_recv_kiov (conn);
+
+               if (rc <= 0) {
+                       /* error/EOF or partial receive */
+                       if (rc == -EAGAIN) {
+                               rc = 1;
+                       } else if (rc == 0 && conn->ksnc_rx_started) {
+                               /* EOF in the middle of a message */
+                               rc = -EPROTO;
+                       }
+                       break;
+               }
+
+               /* Completed a fragment */
+
+               if (conn->ksnc_rx_nob_wanted == 0) {
+                       rc = 1;
+                       break;
+               }
+       }
+
+       ksocknal_connsock_decref(conn);
+       RETURN (rc);
+}
+
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+       lnet_msg_t  *lnetmsg = tx->tx_lnetmsg;
+       int       rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+       ENTRY;
+
+       LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+       if (tx->tx_conn != NULL)
+               ksocknal_conn_decref(tx->tx_conn);
+
+       if (ni == NULL && tx->tx_conn != NULL)
+               ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+       ksocknal_free_tx (tx);
+       if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+               lnet_finalize (ni, lnetmsg, rc);
+
+       EXIT;
+}
+
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+       ksock_tx_t *tx;
+
+       while (!list_empty (txlist)) {
+               tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+               if (error && tx->tx_lnetmsg != NULL) {
+                       CNETERR("Deleting packet type %d len %d %s->%s\n",
+                               le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+                               le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+                               libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+                               libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+               } else if (error) {
+                       CNETERR("Deleting noop packet\n");
+               }
+
+               list_del (&tx->tx_list);
+
+               LASSERT (atomic_read(&tx->tx_refcount) == 1);
+               ksocknal_tx_done (ni, tx);
+       }
+}
+
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+       ksock_conn_t   *conn = tx->tx_conn;
+       ksock_peer_t   *peer = conn->ksnc_peer;
+
+       /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+        * to ksnp_zc_req_list if some fragment of this message should be sent
+        * zero-copy.  The peer will send an ACK containing this cookie when
+        * it has received the message, to tell us we can signal completion.
+        * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+        * ksnp_zc_req_list. */
+       LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT (tx->tx_zc_capable);
+
+       tx->tx_zc_checked = 1;
+
+       if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+           !conn->ksnc_zc_capable)
+               return;
+
+       /* assign cookie and queue tx to pending list, it will be released when
+        * a matching ack is received. See ksocknal_handle_zcack() */
+
+       ksocknal_tx_addref(tx);
+
+       spin_lock(&peer->ksnp_lock);
+
+       /* ZC_REQ is going to be pinned to the peer */
+       tx->tx_deadline =
+               cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+       LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+       tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+       if (peer->ksnp_zc_next_cookie == 0)
+               peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+       list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+       spin_unlock(&peer->ksnp_lock);
+}
+
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+       ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+
+       LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT(tx->tx_zc_capable);
+
+       tx->tx_zc_checked = 0;
+
+       spin_lock(&peer->ksnp_lock);
+
+       if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+               /* Not waiting for an ACK */
+               spin_unlock(&peer->ksnp_lock);
+               return;
+       }
+
+       tx->tx_msg.ksm_zc_cookies[0] = 0;
+       list_del(&tx->tx_zc_list);
+
+       spin_unlock(&peer->ksnp_lock);
+
+       ksocknal_tx_decref(tx);
+}
+
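+/* Returns 0 when tx has gone out completely, -EAGAIN when it should be
+ * requeued at the head of the tx queue, -ENOMEM after parking the conn on
+ * ksnd_enomem_conns for the reaper to retry later, or a fatal error after
+ * closing the conn and its siblings. */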
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       int         rc;
+
+       if (tx->tx_zc_capable && !tx->tx_zc_checked)
+               ksocknal_check_zc_req(tx);
+
+       rc = ksocknal_transmit (conn, tx);
+
+       CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+       if (tx->tx_resid == 0) {
+               /* Sent everything OK */
+               LASSERT (rc == 0);
+
+               return (0);
+       }
+
+       if (rc == -EAGAIN)
+               return (rc);
+
+       if (rc == -ENOMEM) {
+               static int counter;
+
+               counter++;   /* exponential backoff warnings */
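+               /* (counter & -counter) == counter exactly when counter is
+                * a power of two, so the warning fires on the 1st, 2nd,
+                * 4th, 8th, ... occurrence */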
+               if ((counter & (-counter)) == counter)
+                       CWARN("%u ENOMEM tx %p (%u allocated)\n",
+                             counter, conn, atomic_read(&libcfs_kmemory));
+
+               /* Queue on ksnd_enomem_conns for retry after a timeout */
+               spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+               /* enomem list takes over scheduler's ref... */
+               LASSERT (conn->ksnc_tx_scheduled);
+               list_add_tail(&conn->ksnc_tx_list,
+                                 &ksocknal_data.ksnd_enomem_conns);
+               if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+                                                  SOCKNAL_ENOMEM_RETRY),
+                                  ksocknal_data.ksnd_reaper_waketime))
+                       wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+               spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+               return (rc);
+       }
+
+       /* Actual error */
+       LASSERT (rc < 0);
+
+       if (!conn->ksnc_closing) {
+               switch (rc) {
+               case -ECONNRESET:
+                       LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+                                     "while we were sending data; it may have "
+                                     "rebooted.\n",
+                                     HIPQUAD(conn->ksnc_ipaddr));
+                       break;
+               default:
+                       LCONSOLE_WARN("There was an unexpected network error "
+                                     "while writing to %u.%u.%u.%u: %d.\n",
+                                     HIPQUAD(conn->ksnc_ipaddr), rc);
+                       break;
+               }
+               CDEBUG(D_NET, "[%p] Error %d on write to %s"
+                      " ip %d.%d.%d.%d:%d\n", conn, rc,
+                      libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                      HIPQUAD(conn->ksnc_ipaddr),
+                      conn->ksnc_port);
+       }
+
+       if (tx->tx_zc_checked)
+               ksocknal_uncheck_zc_req(tx);
+
+       /* it's not an error if conn is being closed */
+       ksocknal_close_conn_and_siblings (conn,
+                                         (conn->ksnc_closing) ? 0 : rc);
+
+       return (rc);
+}
+
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+       /* called holding write lock on ksnd_global_lock */
+
+       LASSERT (!route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+       LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+       route->ksnr_scheduled = 1;            /* scheduling conn for connd */
+       ksocknal_route_addref(route);      /* extra ref for connd */
+
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       list_add_tail(&route->ksnr_connd_list,
+                         &ksocknal_data.ksnd_connd_routes);
+       wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+       ksock_route_t *route;
+
+       /* called holding write lock on ksnd_global_lock */
+       for (;;) {
+               /* launch any/all connections that need it */
+               route = ksocknal_find_connectable_route_locked(peer);
+               if (route == NULL)
+                       return;
+
+               ksocknal_launch_connection_locked(route);
+       }
+}
+
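+/* Select the connection tx should go out on: typed (SOCKNAL_MATCH_YES)
+ * connections are preferred over SOCKNAL_MATCH_MAY fallbacks, and within
+ * each class the least backlogged connection wins (queued bytes plus
+ * socket send buffer), breaking ties round-robin by last post time when
+ * that tunable is set. */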
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+       struct list_head       *tmp;
+       ksock_conn_t     *conn;
+       ksock_conn_t     *typed = NULL;
+       ksock_conn_t     *fallback = NULL;
+       int            tnob     = 0;
+       int            fnob     = 0;
+
+       list_for_each (tmp, &peer->ksnp_conns) {
+               ksock_conn_t *c  = list_entry(tmp, ksock_conn_t, ksnc_list);
+               int        nob = atomic_read(&c->ksnc_tx_nob) +
+                                   cfs_sock_wmem_queued(c->ksnc_sock);
+               int        rc;
+
+               LASSERT (!c->ksnc_closing);
+               LASSERT (c->ksnc_proto != NULL &&
+                        c->ksnc_proto->pro_match_tx != NULL);
+
+               rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+               switch (rc) {
+               default:
+                       LBUG();
+               case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+                       continue;
+
+               case SOCKNAL_MATCH_YES: /* typed connection */
+                       if (typed == NULL || tnob > nob ||
+                           (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+                            cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+                               typed = c;
+                               tnob  = nob;
+                       }
+                       break;
+
+               case SOCKNAL_MATCH_MAY: /* fallback connection */
+                       if (fallback == NULL || fnob > nob ||
+                           (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+                            cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+                               fallback = c;
+                               fnob     = nob;
+                       }
+                       break;
+               }
+       }
+
+       /* prefer the typed selection */
+       conn = (typed != NULL) ? typed : fallback;
+
+       if (conn != NULL)
+               conn->ksnc_tx_last_post = cfs_time_current();
+
+       return conn;
+}
+
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       conn->ksnc_proto->pro_pack(tx);
+
+       atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+       ksocknal_conn_addref(conn); /* +1 ref for tx */
+       tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+       ksock_sched_t *sched = conn->ksnc_scheduler;
+       ksock_msg_t   *msg = &tx->tx_msg;
+       ksock_tx_t    *ztx = NULL;
+       int         bufnob = 0;
+
+       /* called holding global lock (read or irq-write) and caller may
+        * not have dropped this lock between finding conn and calling me,
+        * so we don't need the {get,put}connsock dance to deref
+        * ksnc_sock... */
+       LASSERT(!conn->ksnc_closing);
+
+       CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+               HIPQUAD(conn->ksnc_ipaddr),
+               conn->ksnc_port);
+
+       ksocknal_tx_prep(conn, tx);
+
+       /* Ensure the frags we've been given EXACTLY match the number of
+        * bytes we want to send.  Many TCP/IP stacks disregard any total
+        * size parameters passed to them and just look at the frags.
+        *
+        * We always expect at least 1 mapped fragment containing the
+        * complete ksocknal message header. */
+       LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+                lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+                (unsigned int)tx->tx_nob);
+       LASSERT (tx->tx_niov >= 1);
+       LASSERT (tx->tx_resid == tx->tx_nob);
+
+       CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+               tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+                                              KSOCK_MSG_NOOP,
+               tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+       /*
+        * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+        * but they're used inside spinlocks a lot.
+        */
+       bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+       spin_lock_bh(&sched->kss_lock);
+
+       if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+               /* First packet starts the timeout */
+               conn->ksnc_tx_deadline =
+                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+               if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+                       conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+               conn->ksnc_tx_bufnob = 0;
+               mb(); /* order with adding to tx_queue */
+       }
+
+       if (msg->ksm_type == KSOCK_MSG_NOOP) {
+               /* The packet is a noop ZC-ACK; try to piggyback the ack
+                * cookie on a normal packet so I don't need to send it */
+               LASSERT (msg->ksm_zc_cookies[1] != 0);
+               LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+               if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+                       ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+       } else {
+               /* It's a normal packet - can it piggyback a noop zc-ack that
+                * has been queued already? */
+               LASSERT (msg->ksm_zc_cookies[1] == 0);
+               LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+               ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+               /* ztx will be released later */
+       }
+
+       if (ztx != NULL) {
+               atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+               list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+       }
+
+       if (conn->ksnc_tx_ready &&      /* able to send */
+           !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+               /* +1 ref for scheduler */
+               ksocknal_conn_addref(conn);
+               list_add_tail (&conn->ksnc_tx_list,
+                                  &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+}
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+       cfs_time_t     now = cfs_time_current();
+       struct list_head    *tmp;
+       ksock_route_t *route;
+
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+               if (route->ksnr_scheduled)      /* connections being established */
+                       continue;
+
+               /* all route types connected ? */
+               if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+                       continue;
+
+               if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+                     cfs_time_aftereq(now, route->ksnr_timeout))) {
+                       CDEBUG(D_NET,
+                              "Too soon to retry route %u.%u.%u.%u "
+                              "(cnted %d, interval %ld, %ld secs later)\n",
+                              HIPQUAD(route->ksnr_ipaddr),
+                              route->ksnr_connected,
+                              route->ksnr_retry_interval,
+                              cfs_duration_sec(route->ksnr_timeout - now));
+                       continue;
+               }
+
+               return (route);
+       }
+
+       return (NULL);
+}
+
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+       struct list_head        *tmp;
+       ksock_route_t     *route;
+
+       list_for_each (tmp, &peer->ksnp_routes) {
+               route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+               LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+               if (route->ksnr_scheduled)
+                       return (route);
+       }
+
+       return (NULL);
+}
+
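+/* Fast path: under the read lock, queue tx on an existing connection if no
+ * route still needs connecting.  Otherwise retake the global lock for
+ * writing, launch connections (auto-adding the peer at the acceptor port
+ * on the first pass) and either queue tx or pin it on ksnp_tx_queue until
+ * a connection is established. */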
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+       ksock_peer_t     *peer;
+       ksock_conn_t     *conn;
+       rwlock_t     *g_lock;
+       int            retry;
+       int            rc;
+
+       LASSERT (tx->tx_conn == NULL);
+
+       g_lock = &ksocknal_data.ksnd_global_lock;
+
+       for (retry = 0;; retry = 1) {
+               read_lock(g_lock);
+               peer = ksocknal_find_peer_locked(ni, id);
+               if (peer != NULL) {
+                       if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+                               conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+                               if (conn != NULL) {
+                                       /* I've got no routes that need
+                                        * connecting and I do have an
+                                        * actual connection... */
+                                       ksocknal_queue_tx_locked (tx, conn);
+                                       read_unlock(g_lock);
+                                       return (0);
+                               }
+                       }
+               }
+
+               /* I'll need a write lock... */
+               read_unlock(g_lock);
+
+               write_lock_bh(g_lock);
+
+               peer = ksocknal_find_peer_locked(ni, id);
+               if (peer != NULL)
+                       break;
+
+               write_unlock_bh(g_lock);
+
+               if ((id.pid & LNET_PID_USERFLAG) != 0) {
+                       CERROR("Refusing to create a connection to "
+                              "userspace process %s\n", libcfs_id2str(id));
+                       return -EHOSTUNREACH;
+               }
+
+               if (retry) {
+                       CERROR("Can't find peer %s\n", libcfs_id2str(id));
+                       return -EHOSTUNREACH;
+               }
+
+               rc = ksocknal_add_peer(ni, id,
+                                      LNET_NIDADDR(id.nid),
+                                      lnet_acceptor_port());
+               if (rc != 0) {
+                       CERROR("Can't add peer %s: %d\n",
+                              libcfs_id2str(id), rc);
+                       return rc;
+               }
+       }
+
+       ksocknal_launch_all_connections_locked(peer);
+
+       conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+       if (conn != NULL) {
+               /* Connection exists; queue message on it */
+               ksocknal_queue_tx_locked (tx, conn);
+               write_unlock_bh(g_lock);
+               return (0);
+       }
+
+       if (peer->ksnp_accepting > 0 ||
+           ksocknal_find_connecting_route_locked (peer) != NULL) {
+               /* the message is going to be pinned to the peer */
+               tx->tx_deadline =
+                       cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+               /* Queue the message until a connection is established */
+               list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+               write_unlock_bh(g_lock);
+               return 0;
+       }
+
+       write_unlock_bh(g_lock);
+
+       /* NB Routes may be ignored if connections to them failed recently */
+       CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+       return (-EHOSTUNREACH);
+}
+
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       int            mpflag = 0;
+       int            type = lntmsg->msg_type;
+       lnet_process_id_t target = lntmsg->msg_target;
+       unsigned int      payload_niov = lntmsg->msg_niov;
+       struct iovec     *payload_iov = lntmsg->msg_iov;
+       lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+       unsigned int      payload_offset = lntmsg->msg_offset;
+       unsigned int      payload_nob = lntmsg->msg_len;
+       ksock_tx_t       *tx;
+       int            desc_size;
+       int            rc;
+
+       /* NB 'private' is different depending on what we're sending.
+        * Just ignore it... */
+
+       CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+              payload_nob, payload_niov, libcfs_id2str(target));
+
+       LASSERT (payload_nob == 0 || payload_niov > 0);
+       LASSERT (payload_niov <= LNET_MAX_IOV);
+       /* payload is either all vaddrs or all pages */
+       LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+       LASSERT (!in_interrupt ());
+
+       if (payload_iov != NULL)
+               desc_size = offsetof(ksock_tx_t,
+                                    tx_frags.virt.iov[1 + payload_niov]);
+       else
+               desc_size = offsetof(ksock_tx_t,
+                                    tx_frags.paged.kiov[payload_niov]);
+
+       if (lntmsg->msg_vmflush)
+               mpflag = cfs_memory_pressure_get_and_set();
+       tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+       if (tx == NULL) {
+               CERROR("Can't allocate tx desc type %d size %d\n",
+                      type, desc_size);
+               if (lntmsg->msg_vmflush)
+                       cfs_memory_pressure_restore(mpflag);
+               return (-ENOMEM);
+       }
+
+       tx->tx_conn = NULL;                  /* set when assigned a conn */
+       tx->tx_lnetmsg = lntmsg;
+
+       if (payload_iov != NULL) {
+               tx->tx_kiov = NULL;
+               tx->tx_nkiov = 0;
+               tx->tx_iov = tx->tx_frags.virt.iov;
+               tx->tx_niov = 1 +
+                             lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+                                              payload_niov, payload_iov,
+                                              payload_offset, payload_nob);
+       } else {
+               tx->tx_niov = 1;
+               tx->tx_iov = &tx->tx_frags.paged.iov;
+               tx->tx_kiov = tx->tx_frags.paged.kiov;
+               tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+                                                payload_niov, payload_kiov,
+                                                payload_offset, payload_nob);
+
+               if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+                       tx->tx_zc_capable = 1;
+       }
+
+       socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+       /* The first fragment will be set later in pro_pack */
+       rc = ksocknal_launch_packet(ni, tx, target);
+       if (lntmsg->msg_vmflush)
+               cfs_memory_pressure_restore(mpflag);
+       if (rc == 0)
+               return (0);
+
+       ksocknal_free_tx(tx);
+       return (-EIO);
+}
+
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+       task_t *task = kthread_run(fn, arg, name);
+
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       ksocknal_data.ksnd_nthreads++;
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+       return 0;
+}
+
+void
+ksocknal_thread_fini (void)
+{
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       ksocknal_data.ksnd_nthreads--;
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
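+/* Set conn up to read the next packet header, or to skip nob_to_skip bytes
+ * of slop into a static scratch buffer.  Returns 1 when positioned at a
+ * packet boundary and 0 when (more) slop must be skipped first. */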
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+       static char ksocknal_slop_buffer[4096];
+
+       int         nob;
+       unsigned int   niov;
+       int         skipped;
+
+       LASSERT(conn->ksnc_proto != NULL);
+
+       if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+               /* Remind the socket to ack eagerly... */
+               ksocknal_lib_eager_ack(conn);
+       }
+
+       if (nob_to_skip == 0) {  /* right at next packet boundary now */
+               conn->ksnc_rx_started = 0;
+               mb();                  /* racing with timeout thread */
+
+               switch (conn->ksnc_proto->pro_version) {
+               case  KSOCK_PROTO_V2:
+               case  KSOCK_PROTO_V3:
+                       conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+                       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                       conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+                       conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+                       conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+                       conn->ksnc_rx_iov[0].iov_len  = offsetof(ksock_msg_t, ksm_u);
+                       break;
+
+               case KSOCK_PROTO_V1:
+                       /* Receiving bare lnet_hdr_t */
+                       conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+                       conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+                       conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+                       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                       conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+                       conn->ksnc_rx_iov[0].iov_len  = sizeof (lnet_hdr_t);
+                       break;
+
+               default:
+                       LBUG ();
+               }
+               conn->ksnc_rx_niov = 1;
+
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_nkiov = 0;
+               conn->ksnc_rx_csum = ~0;
+               return (1);
+       }
+
+       /* Set up to skip as much as possible now.  If there's more left
+        * (ran out of iov entries) we'll get called again */
+
+       conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+       conn->ksnc_rx_nob_left = nob_to_skip;
+       conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+       skipped = 0;
+       niov = 0;
+
+       do {
+               nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+               conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+               conn->ksnc_rx_iov[niov].iov_len  = nob;
+               niov++;
+               skipped += nob;
+               nob_to_skip -= nob;
+
+       } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+       conn->ksnc_rx_niov = niov;
+       conn->ksnc_rx_kiov = NULL;
+       conn->ksnc_rx_nkiov = 0;
+       conn->ksnc_rx_nob_wanted = skipped;
+       return (0);
+}
+
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+       lnet_hdr_t      *lhdr;
+       lnet_process_id_t *id;
+       int             rc;
+
+       LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+       /* NB: sched lock NOT held */
+       /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+       LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+                conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+                conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+                conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+       if (conn->ksnc_rx_nob_wanted != 0) {
+               rc = ksocknal_receive(conn);
+
+               if (rc <= 0) {
+                       LASSERT (rc != -EAGAIN);
+
+                       if (rc == 0)
+                               CDEBUG (D_NET, "[%p] EOF from %s"
+                                       " ip %d.%d.%d.%d:%d\n", conn,
+                                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                       else if (!conn->ksnc_closing)
+                               CERROR ("[%p] Error %d on read from %s"
+                                       " ip %d.%d.%d.%d:%d\n",
+                                       conn, rc,
+                                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+
+                       /* it's not an error if conn is being closed */
+                       ksocknal_close_conn_and_siblings (conn,
+                                                         (conn->ksnc_closing) ? 0 : rc);
+                       return (rc == 0 ? -ESHUTDOWN : rc);
+               }
+
+               if (conn->ksnc_rx_nob_wanted != 0) {
+                       /* short read */
+                       return (-EAGAIN);
+               }
+       }
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_KSM_HEADER:
+               if (conn->ksnc_flip) {
+                       __swab32s(&conn->ksnc_msg.ksm_type);
+                       __swab32s(&conn->ksnc_msg.ksm_csum);
+                       __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+                       __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+               }
+
+               if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+                   conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+                       CERROR("%s: Unknown message type: %x\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_type);
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                       return (-EPROTO);
+               }
+
+               if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+                   conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+                   conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+                       /* NOOP Checksum error */
+                       CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                       return (-EIO);
+               }
+
+               if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+                       __u64 cookie = 0;
+
+                       LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+                       if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+                               cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+                       rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+                                              conn->ksnc_msg.ksm_zc_cookies[1]);
+
+                       if (rc != 0) {
+                               CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+                                      libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                      cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+                               ksocknal_new_packet(conn, 0);
+                               ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                               return (rc);
+                       }
+               }
+
+               if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+                       ksocknal_new_packet (conn, 0);
+                       return 0;       /* NOOP is done; just return */
+               }
+
+               conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+               conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+               conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+               conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+               conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+               conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+               conn->ksnc_rx_niov = 1;
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_nkiov = 0;
+
+               goto again;     /* read lnet header now */
+
+       case SOCKNAL_RX_LNET_HEADER:
+               /* unpack message header */
+               conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+               if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+                       /* Userspace peer */
+                       lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+                       id   = &conn->ksnc_peer->ksnp_id;
+
+                       /* Substitute process ID assigned at connection time */
+                       lhdr->src_pid = cpu_to_le32(id->pid);
+                       lhdr->src_nid = cpu_to_le64(id->nid);
+               }
+
+               conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+               ksocknal_conn_addref(conn);     /* ++ref while parsing */
+
+               rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+                               &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+                               conn->ksnc_peer->ksnp_id.nid, conn, 0);
+               if (rc < 0) {
+                       /* I just received garbage: give up on this conn */
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings (conn, rc);
+                       ksocknal_conn_decref(conn);
+                       return (-EPROTO);
+               }
+
+               /* I'm racing with ksocknal_recv() */
+               LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+                        conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+               if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+                       return 0;
+
+               /* ksocknal_recv() got called */
+               goto again;
+
+       case SOCKNAL_RX_LNET_PAYLOAD:
+               /* payload all received */
+               rc = 0;
+
+               if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+                   conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+                   conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+                       CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                              libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                              conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                       rc = -EIO;
+               }
+
+               if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+                       LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+                       lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+                       id   = &conn->ksnc_peer->ksnp_id;
+
+                       rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+                                       conn->ksnc_msg.ksm_zc_cookies[0],
+                                       *ksocknal_tunables.ksnd_nonblk_zcack ||
+                                       le64_to_cpu(lhdr->src_nid) != id->nid);
+               }
+
+               lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+               if (rc != 0) {
+                       ksocknal_new_packet(conn, 0);
+                       ksocknal_close_conn_and_siblings (conn, rc);
+                       return (-EPROTO);
+               }
+               /* Fall through */
+
+       case SOCKNAL_RX_SLOP:
+               /* starting new packet? */
+               if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                       return 0;       /* come back later */
+               goto again;          /* try to finish reading slop now */
+
+       default:
+               break;
+       }
+
+       /* Not Reached */
+       LBUG ();
+       return (-EINVAL);                      /* keep gcc happy */
+}
+
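+/* Complete a receive that lnet_parse() has matched: map the payload into
+ * conn's rx iov/kiov and reschedule the connection, which parked itself
+ * in SOCKNAL_RX_PARSE/PARSE_WAIT while LNet decided where the data
+ * should go. */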
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+              unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       ksock_conn_t  *conn = (ksock_conn_t *)private;
+       ksock_sched_t *sched = conn->ksnc_scheduler;
+
+       LASSERT (mlen <= rlen);
+       LASSERT (niov <= LNET_MAX_IOV);
+
+       conn->ksnc_cookie = msg;
+       conn->ksnc_rx_nob_wanted = mlen;
+       conn->ksnc_rx_nob_left   = rlen;
+
+       if (mlen == 0 || iov != NULL) {
+               conn->ksnc_rx_nkiov = 0;
+               conn->ksnc_rx_kiov = NULL;
+               conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+               conn->ksnc_rx_niov =
+                       lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+                                        niov, iov, offset, mlen);
+       } else {
+               conn->ksnc_rx_niov = 0;
+               conn->ksnc_rx_iov  = NULL;
+               conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+               conn->ksnc_rx_nkiov =
+                       lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+                                         niov, kiov, offset, mlen);
+       }
+
+       LASSERT (mlen ==
+                lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+       LASSERT (conn->ksnc_rx_scheduled);
+
+       spin_lock_bh(&sched->kss_lock);
+
+       switch (conn->ksnc_rx_state) {
+       case SOCKNAL_RX_PARSE_WAIT:
+               list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+               wake_up (&sched->kss_waitq);
+               LASSERT (conn->ksnc_rx_ready);
+               break;
+
+       case SOCKNAL_RX_PARSE:
+               /* scheduler hasn't noticed I'm parsing yet */
+               break;
+       }
+
+       conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+       spin_unlock_bh(&sched->kss_lock);
+       ksocknal_conn_decref(conn);
+       return 0;
+}
+
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+       int        rc;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       rc = (!ksocknal_data.ksnd_shuttingdown &&
+             list_empty(&sched->kss_rx_conns) &&
+             list_empty(&sched->kss_tx_conns));
+
+       spin_unlock_bh(&sched->kss_lock);
+       return rc;
+}
+
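+/* Scheduler thread: it pins itself to its CPT where possible, then
+ * alternates one unit of rx work and one unit of tx work from its
+ * kss_rx_conns/kss_tx_conns queues per loop so neither direction starves
+ * the other; ksnc_rx_scheduled and ksnc_tx_scheduled track queue
+ * membership and carry a conn ref. */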
+int ksocknal_scheduler(void *arg)
+{
+       struct ksock_sched_info *info;
+       ksock_sched_t           *sched;
+       ksock_conn_t            *conn;
+       ksock_tx_t              *tx;
+       int                     rc;
+       int                     nloops = 0;
+       long                    id = (long)arg;
+
+       info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+       sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+       cfs_block_allsigs();
+
+       rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+       if (rc != 0) {
+               CERROR("Can't set CPT affinity to %d: %d\n",
+                      info->ksi_cpt, rc);
+       }
+
+       spin_lock_bh(&sched->kss_lock);
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+               int did_something = 0;
+
+               /* Ensure I progress everything semi-fairly */
+
+               if (!list_empty (&sched->kss_rx_conns)) {
+                       conn = list_entry(sched->kss_rx_conns.next,
+                                             ksock_conn_t, ksnc_rx_list);
+                       list_del(&conn->ksnc_rx_list);
+
+                       LASSERT(conn->ksnc_rx_scheduled);
+                       LASSERT(conn->ksnc_rx_ready);
+
+                       /* clear rx_ready in case receive isn't complete.
+                        * Do it BEFORE we call process_recv, since
+                        * data_ready can set it any time after we release
+                        * kss_lock. */
+                       conn->ksnc_rx_ready = 0;
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       rc = ksocknal_process_receive(conn);
+
+                       spin_lock_bh(&sched->kss_lock);
+
+                       /* I'm the only one that can clear this flag */
+                       LASSERT(conn->ksnc_rx_scheduled);
+
+                       /* Did process_receive get everything it wanted? */
+                       if (rc == 0)
+                               conn->ksnc_rx_ready = 1;
+
+                       if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+                               /* Conn blocked waiting for ksocknal_recv()
+                                * I change its state (under lock) to signal
+                                * it can be rescheduled */
+                               conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+                       } else if (conn->ksnc_rx_ready) {
+                               /* reschedule for rx */
+                               list_add_tail (&conn->ksnc_rx_list,
+                                                  &sched->kss_rx_conns);
+                       } else {
+                               conn->ksnc_rx_scheduled = 0;
+                               /* drop my ref */
+                               ksocknal_conn_decref(conn);
+                       }
+
+                       did_something = 1;
+               }
+
+               if (!list_empty (&sched->kss_tx_conns)) {
+                       LIST_HEAD    (zlist);
+
+                       if (!list_empty(&sched->kss_zombie_noop_txs)) {
+                               list_add(&zlist,
+                                            &sched->kss_zombie_noop_txs);
+                               list_del_init(&sched->kss_zombie_noop_txs);
+                       }
+
+                       conn = list_entry(sched->kss_tx_conns.next,
+                                             ksock_conn_t, ksnc_tx_list);
+                       list_del (&conn->ksnc_tx_list);
+
+                       LASSERT(conn->ksnc_tx_scheduled);
+                       LASSERT(conn->ksnc_tx_ready);
+                       LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+                       tx = list_entry(conn->ksnc_tx_queue.next,
+                                           ksock_tx_t, tx_list);
+
+                       if (conn->ksnc_tx_carrier == tx)
+                               ksocknal_next_tx_carrier(conn);
+
+                       /* dequeue now so empty list => more to send */
+                       list_del(&tx->tx_list);
+
+                       /* Clear tx_ready in case send isn't complete.  Do
+                        * it BEFORE we call process_transmit, since
+                        * write_space can set it any time after we release
+                        * kss_lock. */
+                       conn->ksnc_tx_ready = 0;
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       if (!list_empty(&zlist)) {
+                               /* freeing zombie noop txs is fast because
+                                * they are just returned to the freelist */
+                               ksocknal_txlist_done(NULL, &zlist, 0);
+                       }
+
+                       rc = ksocknal_process_transmit(conn, tx);
+
+                       if (rc == -ENOMEM || rc == -EAGAIN) {
+                               /* Incomplete send: replace tx on HEAD of tx_queue */
+                               spin_lock_bh(&sched->kss_lock);
+                               list_add(&tx->tx_list,
+                                            &conn->ksnc_tx_queue);
+                       } else {
+                               /* Complete send; tx -ref */
+                               ksocknal_tx_decref(tx);
+
+                               spin_lock_bh(&sched->kss_lock);
+                               /* assume space for more */
+                               conn->ksnc_tx_ready = 1;
+                       }
+
+                       if (rc == -ENOMEM) {
+                               /* Do nothing; after a short timeout, this
+                                * conn will be reposted on kss_tx_conns. */
+                       } else if (conn->ksnc_tx_ready &&
+                                  !list_empty (&conn->ksnc_tx_queue)) {
+                               /* reschedule for tx */
+                               list_add_tail (&conn->ksnc_tx_list,
+                                                  &sched->kss_tx_conns);
+                       } else {
+                               conn->ksnc_tx_scheduled = 0;
+                               /* drop my ref */
+                               ksocknal_conn_decref(conn);
+                       }
+
+                       did_something = 1;
+               }
+               if (!did_something ||      /* nothing to do */
+                   ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                       spin_unlock_bh(&sched->kss_lock);
+
+                       nloops = 0;
+
+                       if (!did_something) {   /* wait for something to do */
+                               cfs_wait_event_interruptible_exclusive(
+                                       sched->kss_waitq,
+                                       !ksocknal_sched_cansleep(sched), rc);
+                               LASSERT (rc == 0);
+                       } else {
+                               cond_resched();
+                       }
+
+                       spin_lock_bh(&sched->kss_lock);
+               }
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+       ksocknal_thread_fini();
+       return 0;
+}
+
+/*
+ * Add connection to kss_rx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+       ksock_sched_t *sched;
+       ENTRY;
+
+       sched = conn->ksnc_scheduler;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       conn->ksnc_rx_ready = 1;
+
+       if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+               list_add_tail(&conn->ksnc_rx_list,
+                                 &sched->kss_rx_conns);
+               conn->ksnc_rx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+       spin_unlock_bh(&sched->kss_lock);
+
+       EXIT;
+}
+
+/*
+ * Add connection to kss_tx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+       ksock_sched_t *sched;
+       ENTRY;
+
+       sched = conn->ksnc_scheduler;
+
+       spin_lock_bh(&sched->kss_lock);
+
+       conn->ksnc_tx_ready = 1;
+
+       if (!conn->ksnc_tx_scheduled && /* not being progressed */
+           !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
+               list_add_tail (&conn->ksnc_tx_list,
+                                  &sched->kss_tx_conns);
+               conn->ksnc_tx_scheduled = 1;
+               /* extra ref for scheduler */
+               ksocknal_conn_addref(conn);
+
+               wake_up (&sched->kss_waitq);
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       EXIT;
+}
+
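+/*
+ * Identify the peer's protocol from the first words of its HELLO.
+ * V2/V3 peers send LNET_PROTO_MAGIC (possibly byte-swapped when the
+ * peer's endianness differs) followed by a version; V1 peers start
+ * with an lnet_magicversion_t instead, which is why its size must
+ * equal offsetof(ksock_hello_msg_t, kshm_src_nid).
+ */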
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+       __u32   version = 0;
+
+       if (hello->kshm_magic == LNET_PROTO_MAGIC)
+               version = hello->kshm_version;
+       else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+               version = __swab32(hello->kshm_version);
+
+       if (version != 0) {
+#if SOCKNAL_VERSION_DEBUG
+               if (*ksocknal_tunables.ksnd_protocol == 1)
+                       return NULL;
+
+               if (*ksocknal_tunables.ksnd_protocol == 2 &&
+                   version == KSOCK_PROTO_V3)
+                       return NULL;
+#endif
+               if (version == KSOCK_PROTO_V2)
+                       return &ksocknal_protocol_v2x;
+
+               if (version == KSOCK_PROTO_V3)
+                       return &ksocknal_protocol_v3x;
+
+               return NULL;
+       }
+
+       if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+               lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
+
+               CLASSERT (sizeof (lnet_magicversion_t) ==
+                         offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+               if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+                   hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+                       return &ksocknal_protocol_v1x;
+       }
+
+       return NULL;
+}
+
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                    lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+       /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+       ksock_net_t      *net = (ksock_net_t *)ni->ni_data;
+
+       LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+       /* rely on caller to hold a ref on socket so it won't disappear */
+       LASSERT (conn->ksnc_proto != NULL);
+
+       hello->kshm_src_nid      = ni->ni_nid;
+       hello->kshm_dst_nid      = peer_nid;
+       hello->kshm_src_pid      = the_lnet.ln_pid;
+
+       hello->kshm_src_incarnation = net->ksnn_incarnation;
+       hello->kshm_ctype          = conn->ksnc_type;
+
+       return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
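+/*
+ * Map a connection type onto what the peer should call it: one side's
+ * BULK_IN is the other's BULK_OUT, while ANY and CONTROL are
+ * symmetric.  Anything else maps to SOCKLND_CONN_NONE, which callers
+ * treat as a protocol error.
+ */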
+int
+ksocknal_invert_type(int type)
+{
+       switch (type)
+       {
+       case SOCKLND_CONN_ANY:
+       case SOCKLND_CONN_CONTROL:
+               return (type);
+       case SOCKLND_CONN_BULK_IN:
+               return SOCKLND_CONN_BULK_OUT;
+       case SOCKLND_CONN_BULK_OUT:
+               return SOCKLND_CONN_BULK_IN;
+       default:
+               return (SOCKLND_CONN_NONE);
+       }
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                    ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+                    __u64 *incarnation)
+{
+       /* Return  < 0        fatal error
+        *         0          success
+        *         EALREADY   lost connection race
+        *         EPROTO     protocol version mismatch
+        */
+       socket_t        *sock = conn->ksnc_sock;
+       int               active = (conn->ksnc_proto != NULL);
+       int               timeout;
+       int               proto_match;
+       int               rc;
+       ksock_proto_t       *proto;
+       lnet_process_id_t    recv_id;
+
+       /* socket type set on active connections - not set on passive */
+       LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+       timeout = active ? *ksocknal_tunables.ksnd_timeout :
+                           lnet_acceptor_timeout();
+
+       rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+           hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+           hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+               /* Unexpected magic! */
+               CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+                       "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+                       LNET_PROTO_TCP_MAGIC,
+                       HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       rc = libcfs_sock_read(sock, &hello->kshm_version,
+                             sizeof(hello->kshm_version), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       proto = ksocknal_parse_proto_version(hello);
+       if (proto == NULL) {
+               if (!active) {
+                       /* unknown protocol from peer, tell peer my protocol */
+                       conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+                       if (*ksocknal_tunables.ksnd_protocol == 2)
+                               conn->ksnc_proto = &ksocknal_protocol_v2x;
+                       else if (*ksocknal_tunables.ksnd_protocol == 1)
+                               conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+                       hello->kshm_nips = 0;
+                       ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+               }
+
+               CERROR ("Unknown protocol version (%d.x expected)"
+                       " from %u.%u.%u.%u\n",
+                       conn->ksnc_proto->pro_version,
+                       HIPQUAD(conn->ksnc_ipaddr));
+
+               return -EPROTO;
+       }
+
+       proto_match = (conn->ksnc_proto == proto);
+       conn->ksnc_proto = proto;
+
+       /* receive the rest of hello message anyway */
+       rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading or checking hello from %u.%u.%u.%u\n",
+                      rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0);
+               return rc;
+       }
+
+       *incarnation = hello->kshm_src_incarnation;
+
+       if (hello->kshm_src_nid == LNET_NID_ANY) {
+               CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+                      "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       if (!active &&
+           conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+               /* Userspace NAL assigns peer process ID from socket */
+               recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+               recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+       } else {
+               recv_id.nid = hello->kshm_src_nid;
+               recv_id.pid = hello->kshm_src_pid;
+       }
+
+       if (!active) {
+               *peerid = recv_id;
+
+               /* peer determines type */
+               conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+               if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                       CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+                               hello->kshm_ctype, libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr));
+                       return -EPROTO;
+               }
+
+               return 0;
+       }
+
+       if (peerid->pid != recv_id.pid ||
+           peerid->nid != recv_id.nid) {
+               LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+                                  " %u.%u.%u.%u, but they claimed they were "
+                                  "%s; please check your Lustre "
+                                  "configuration.\n",
+                                  libcfs_id2str(*peerid),
+                                  HIPQUAD(conn->ksnc_ipaddr),
+                                  libcfs_id2str(recv_id));
+               return -EPROTO;
+       }
+
+       if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+               /* Possible protocol mismatch or I lost the connection race */
+               return proto_match ? EALREADY : EPROTO;
+       }
+
+       if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+               CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+                       conn->ksnc_type, libcfs_id2str(*peerid),
+                       HIPQUAD(conn->ksnc_ipaddr),
+                       hello->kshm_ctype);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
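+/*
+ * Establish every connection type this route still wants.  Returns
+ * non-zero if the route should be retried later (e.g. a lost
+ * connection race); on failure the retry interval doubles, clamped
+ * between min_reconnectms and max_reconnectms, and any queued txs are
+ * completed with an error.
+ */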
+int
+ksocknal_connect (ksock_route_t *route)
+{
+       LIST_HEAD    (zombies);
+       ksock_peer_t     *peer = route->ksnr_peer;
+       int            type;
+       int            wanted;
+       socket_t     *sock;
+       cfs_time_t      deadline;
+       int            retry_later = 0;
+       int            rc = 0;
+
+       deadline = cfs_time_add(cfs_time_current(),
+                               cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       LASSERT (route->ksnr_scheduled);
+       LASSERT (!route->ksnr_connecting);
+
+       route->ksnr_connecting = 1;
+
+       for (;;) {
+               wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+               /* stop connecting if peer/route got closed under me, or
+                * route got connected while queued */
+               if (peer->ksnp_closing || route->ksnr_deleted ||
+                   wanted == 0) {
+                       retry_later = 0;
+                       break;
+               }
+
+               /* reschedule if peer is connecting to me */
+               if (peer->ksnp_accepting > 0) {
+                       CDEBUG(D_NET,
+                              "peer %s(%d) already connecting to me, retry later.\n",
+                              libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+                       retry_later = 1;
+               }
+
+               if (retry_later) /* needs reschedule */
+                       break;
+
+               if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+                       type = SOCKLND_CONN_ANY;
+               } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+                       type = SOCKLND_CONN_CONTROL;
+               } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+                       type = SOCKLND_CONN_BULK_IN;
+               } else {
+                       LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+                       type = SOCKLND_CONN_BULK_OUT;
+               }
+
+               write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+               if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+                       rc = -ETIMEDOUT;
+                       lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                  route->ksnr_ipaddr,
+                                                  route->ksnr_port);
+                       goto failed;
+               }
+
+               rc = lnet_connect(&sock, peer->ksnp_id.nid,
+                                 route->ksnr_myipaddr,
+                                 route->ksnr_ipaddr, route->ksnr_port);
+               if (rc != 0)
+                       goto failed;
+
+               rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+               if (rc < 0) {
+                       lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                  route->ksnr_ipaddr,
+                                                  route->ksnr_port);
+                       goto failed;
+               }
+
+               /* A +ve RC means I have to retry because I lost the connection
+                * race or I have to renegotiate protocol version */
+               retry_later = (rc != 0);
+               if (retry_later)
+                       CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+                              libcfs_nid2str(peer->ksnp_id.nid));
+
+               write_lock_bh(&ksocknal_data.ksnd_global_lock);
+       }
+
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+
+       if (retry_later) {
+               /* re-queue for attention; this frees me up to handle
+                * the peer's incoming connection request */
+
+               if (rc == EALREADY ||
+                   (rc == 0 && peer->ksnp_accepting > 0)) {
+                       /* We want to introduce a delay before the next
+                        * connection attempt if we lost the conn race,
+                        * but the race is usually resolved quickly, so
+                        * min_reconnectms should be a good heuristic */
+                       route->ksnr_retry_interval =
+                               cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+                       route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                                          route->ksnr_retry_interval);
+               }
+
+               ksocknal_launch_connection_locked(route);
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+       return retry_later;
+
+ failed:
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       route->ksnr_scheduled = 0;
+       route->ksnr_connecting = 0;
+
+       /* This is a retry rather than a new connection */
+       route->ksnr_retry_interval *= 2;
+       route->ksnr_retry_interval =
+               MAX(route->ksnr_retry_interval,
+                   cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+       route->ksnr_retry_interval =
+               MIN(route->ksnr_retry_interval,
+                   cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+       LASSERT (route->ksnr_retry_interval != 0);
+       route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+                                          route->ksnr_retry_interval);
+
+       if (!list_empty(&peer->ksnp_tx_queue) &&
+           peer->ksnp_accepting == 0 &&
+           ksocknal_find_connecting_route_locked(peer) == NULL) {
+               ksock_conn_t *conn;
+
+               /* ksnp_tx_queue is queued on a conn on successful
+                * connection for V1.x and V2.x */
+               if (!list_empty (&peer->ksnp_conns)) {
+                       conn = list_entry(peer->ksnp_conns.next,
+                                             ksock_conn_t, ksnc_list);
+                       LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+               }
+
+               /* take all the blocked packets while I've got the lock and
+                * complete below... */
+               list_splice_init(&peer->ksnp_tx_queue, &zombies);
+       }
+
+#if 0     /* irrelevant with only eager routes */
+       if (!route->ksnr_deleted) {
+               /* make this route least-favourite for re-selection */
+               list_del(&route->ksnr_list);
+               list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+       }
+#endif
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_peer_failed(peer);
+       ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+       return 0;
+}
+
+/*
+ * Check whether we need to create more connd threads.
+ * It will try to create a new thread if necessary; @timeout can be
+ * updated if thread creation fails, so the caller won't keep retrying
+ * while running out of resources.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+       char name[16];
+       int rc;
+       int total = ksocknal_data.ksnd_connd_starting +
+                   ksocknal_data.ksnd_connd_running;
+
+       if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+               /* still initializing */
+               return 0;
+       }
+
+       if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+           total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+               /* can't create more connd threads, or there are still
+                * enough threads to handle more connection requests */
+               return 0;
+       }
+
+       if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+               /* no pending connecting request */
+               return 0;
+       }
+
+       if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+               /* may run out of resource, retry later */
+               *timeout = cfs_time_seconds(1);
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_starting > 0) {
+               /* serialize starting to avoid flood */
+               return 0;
+       }
+
+       ksocknal_data.ksnd_connd_starting_stamp = sec;
+       ksocknal_data.ksnd_connd_starting++;
+       spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+       /* NB: total is the next id */
+       snprintf(name, sizeof(name), "socknal_cd%02d", total);
+       rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+       spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+       if (rc == 0)
+               return 1;
+
+       /* we tried ... */
+       LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+       ksocknal_data.ksnd_connd_starting--;
+       ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+
+       return 1;
+}
+
+/*
+ * Check whether the current thread can exit; it returns 1 if there are
+ * too many threads and none has been created in the past 120 seconds.
+ * This function may also update @timeout to make the caller come back
+ * to recheck these conditions.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+       int val;
+
+       if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+               /* still initializing */
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_starting > 0) {
+               /* still starting a new thread */
+               return 0;
+       }
+
+       if (ksocknal_data.ksnd_connd_running <=
+           *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+               return 0;
+       }
+
+       /* was a thread created in the past 120 seconds? */
+       val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+                   SOCKNAL_CONND_TIMEOUT - sec);
+
+       *timeout = (val > 0) ? cfs_time_seconds(val) :
+                              cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+       if (val > 0)
+               return 0;
+
+       /* no thread created in the past 120 seconds */
+
+       return ksocknal_data.ksnd_connd_running >
+              ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through the connd_routes queue looking for a route we can process
+ * right now; @timeout_p can be updated if we need to come back later */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+       ksock_route_t *route;
+       cfs_time_t     now;
+
+       now = cfs_time_current();
+
+       /* connd_routes can contain both pending and ordinary routes */
+       list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+                                ksnr_connd_list) {
+
+               if (route->ksnr_retry_interval == 0 ||
+                   cfs_time_aftereq(now, route->ksnr_timeout))
+                       return route;
+
+               if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+                   (int)*timeout_p > (int)(route->ksnr_timeout - now))
+                       *timeout_p = (int)(route->ksnr_timeout - now);
+       }
+
+       return NULL;
+}
+
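+/*
+ * Connection daemon: completes connections accepted by the listener
+ * and initiates outgoing ones.  The thread pool grows and shrinks
+ * between ksnd_nconnds and ksnd_nconnds_max, always keeping
+ * SOCKNAL_CONND_RESV threads free to handle incoming requests.
+ */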
+int
+ksocknal_connd (void *arg)
+{
+       spinlock_t    *connd_lock = &ksocknal_data.ksnd_connd_lock;
+       ksock_connreq_t   *cr;
+       wait_queue_t     wait;
+       int             nloops = 0;
+       int             cons_retry = 0;
+
+       cfs_block_allsigs ();
+
+       init_waitqueue_entry_current (&wait);
+
+       spin_lock_bh(connd_lock);
+
+       LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+       ksocknal_data.ksnd_connd_starting--;
+       ksocknal_data.ksnd_connd_running++;
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+               ksock_route_t *route = NULL;
+               long sec = cfs_time_current_sec();
+               long timeout = MAX_SCHEDULE_TIMEOUT;
+               int  dropped_lock = 0;
+
+               if (ksocknal_connd_check_stop(sec, &timeout)) {
+                       /* wakeup another one to check stop */
+                       wake_up(&ksocknal_data.ksnd_connd_waitq);
+                       break;
+               }
+
+               if (ksocknal_connd_check_start(sec, &timeout)) {
+                       /* created new thread */
+                       dropped_lock = 1;
+               }
+
+               if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                       /* Connection accepted by the listener */
+                       cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+                                           next, ksock_connreq_t, ksncr_list);
+
+                       list_del(&cr->ksncr_list);
+                       spin_unlock_bh(connd_lock);
+                       dropped_lock = 1;
+
+                       ksocknal_create_conn(cr->ksncr_ni, NULL,
+                                            cr->ksncr_sock, SOCKLND_CONN_NONE);
+                       lnet_ni_decref(cr->ksncr_ni);
+                       LIBCFS_FREE(cr, sizeof(*cr));
+
+                       spin_lock_bh(connd_lock);
+               }
+
+               /* Only handle an outgoing connection request if there
+                * is a thread left to handle incoming connections and
+                * to create new connd threads */
+               if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+                   ksocknal_data.ksnd_connd_running) {
+                       route = ksocknal_connd_get_route_locked(&timeout);
+               }
+               if (route != NULL) {
+                       list_del (&route->ksnr_connd_list);
+                       ksocknal_data.ksnd_connd_connecting++;
+                       spin_unlock_bh(connd_lock);
+                       dropped_lock = 1;
+
+                       if (ksocknal_connect(route)) {
+                               /* consecutive retry */
+                               if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+                                       CWARN("too many consecutive "
+                                             "reconnects to %u.%u.%u.%u\n",
+                                             HIPQUAD(route->ksnr_ipaddr));
+                                       cons_retry = 0;
+                               }
+                       } else {
+                               cons_retry = 0;
+                       }
+
+                       ksocknal_route_decref(route);
+
+                       spin_lock_bh(connd_lock);
+                       ksocknal_data.ksnd_connd_connecting--;
+               }
+
+               if (dropped_lock) {
+                       if (++nloops < SOCKNAL_RESCHED)
+                               continue;
+                       spin_unlock_bh(connd_lock);
+                       nloops = 0;
+                       cond_resched();
+                       spin_lock_bh(connd_lock);
+                       continue;
+               }
+
+               /* Nothing to do for 'timeout'  */
+               set_current_state(TASK_INTERRUPTIBLE);
+               add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+               spin_unlock_bh(connd_lock);
+
+               nloops = 0;
+               waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+               spin_lock_bh(connd_lock);
+       }
+       ksocknal_data.ksnd_connd_running--;
+       spin_unlock_bh(connd_lock);
+
+       ksocknal_thread_fini();
+       return 0;
+}
+
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+       /* We're called with a shared lock on ksnd_global_lock */
+       ksock_conn_t      *conn;
+       struct list_head        *ctmp;
+
+       list_for_each (ctmp, &peer->ksnp_conns) {
+               int     error;
+               conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+               /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+               LASSERT (!conn->ksnc_closing);
+
+               /* SOCK_ERROR will reset the socket's error code on
+                * some platforms (like Darwin 8.x) */
+               error = cfs_sock_error(conn->ksnc_sock);
+               if (error != 0) {
+                       ksocknal_conn_addref(conn);
+
+                       switch (error) {
+                       case ECONNRESET:
+                               CNETERR("A connection with %s "
+                                       "(%u.%u.%u.%u:%d) was reset; "
+                                       "it may have rebooted.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       case ETIMEDOUT:
+                               CNETERR("A connection with %s "
+                                       "(%u.%u.%u.%u:%d) timed out; the "
+                                       "network or node may be down.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       default:
+                               CNETERR("An unexpected network error %d "
+                                       "occurred with %s "
+                                       "(%u.%u.%u.%u:%d)\n", error,
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
+                               break;
+                       }
+
+                       return (conn);
+               }
+
+               if (conn->ksnc_rx_started &&
+                   cfs_time_aftereq(cfs_time_current(),
+                                    conn->ksnc_rx_deadline)) {
+                       /* Timed out incomplete incoming message */
+                       ksocknal_conn_addref(conn);
+                       CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+                               "state %d wanted %d left %d\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port,
+                               conn->ksnc_rx_state,
+                               conn->ksnc_rx_nob_wanted,
+                               conn->ksnc_rx_nob_left);
+                       return (conn);
+               }
+
+               if ((!list_empty(&conn->ksnc_tx_queue) ||
+                    cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+                   cfs_time_aftereq(cfs_time_current(),
+                                    conn->ksnc_tx_deadline)) {
+                       /* Timed out messages queued for sending or
+                        * buffered in the socket's send buffer */
+                       ksocknal_conn_addref(conn);
+                       CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d); "
+                               "the network or that node may be down.\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port);
+                       return (conn);
+               }
+       }
+
+       return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+       ksock_tx_t      *tx;
+       LIST_HEAD      (stale_txs);
+
+       write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+       while (!list_empty (&peer->ksnp_tx_queue)) {
+               tx = list_entry (peer->ksnp_tx_queue.next,
+                                    ksock_tx_t, tx_list);
+
+               if (!cfs_time_aftereq(cfs_time_current(),
+                                     tx->tx_deadline))
+                       break;
+
+               list_del (&tx->tx_list);
+               list_add_tail (&tx->tx_list, &stale_txs);
+       }
+
+       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+       ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
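+/*
+ * Called with ksnd_global_lock held for reading; the lock may be
+ * dropped and re-taken.  Returns 1 if a keepalive ping was launched,
+ * 0 if none was needed, or a negative errno.  Only V3 peers are sent
+ * keepalives (see the ksnp_proto check below).
+ */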
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+       ksock_sched_t  *sched;
+       ksock_conn_t   *conn;
+       ksock_tx_t     *tx;
+
+       if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+               return 0;
+
+       if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+               return 0;
+
+       if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+           cfs_time_before(cfs_time_current(),
+                           cfs_time_add(peer->ksnp_last_alive,
+                                        cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+               return 0;
+
+       if (cfs_time_before(cfs_time_current(),
+                           peer->ksnp_send_keepalive))
+               return 0;
+
+       /* retry 10 secs later, so we don't put pressure on this
+        * peer if we failed to send the keepalive this time */
+       peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+       conn = ksocknal_find_conn_locked(peer, NULL, 1);
+       if (conn != NULL) {
+               sched = conn->ksnc_scheduler;
+
+               spin_lock_bh(&sched->kss_lock);
+               if (!list_empty(&conn->ksnc_tx_queue)) {
+                       spin_unlock_bh(&sched->kss_lock);
+                       /* there is a queued ACK; no keepalive needed */
+                       return 0;
+               }
+
+               spin_unlock_bh(&sched->kss_lock);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       /* cookie = 1 is reserved for keepalive PING */
+       tx = ksocknal_alloc_tx_noop(1, 1);
+       if (tx == NULL) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               return -ENOMEM;
+       }
+
+       if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+               read_lock(&ksocknal_data.ksnd_global_lock);
+               return 1;
+       }
+
+       ksocknal_free_tx(tx);
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       return -EIO;
+}
+
+
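+/*
+ * Scan one hash bucket of the peer table for timed-out connections,
+ * stale queued txs and stale zero-copy requests.  Whenever the shared
+ * lock must be dropped to act on something, restart the scan from
+ * 'again' because the list may have changed meanwhile.
+ */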
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+       struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
+       ksock_peer_t     *peer;
+       ksock_conn_t     *conn;
+       ksock_tx_t       *tx;
+
+ again:
+       /* NB. We expect to have a look at all the peers and not find any
+        * connections to time out, so we just use a shared lock while we
+        * take a look... */
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       list_for_each_entry(peer, peers, ksnp_list) {
+               cfs_time_t  deadline = 0;
+               int      resid = 0;
+               int      n     = 0;
+
+               if (ksocknal_send_keepalive_locked(peer) != 0) {
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       goto again;
+               }
+
+               conn = ksocknal_find_timed_out_conn (peer);
+
+               if (conn != NULL) {
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                       ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+                       /* NB we won't find this one again, but we can't
+                        * just proceed with the next peer, since we dropped
+                        * ksnd_global_lock and it might be dead already! */
+                       ksocknal_conn_decref(conn);
+                       goto again;
+               }
+
+               /* we can't process stale txs right here because we're
+                * holding only a shared lock */
+               if (!list_empty (&peer->ksnp_tx_queue)) {
+                       ksock_tx_t *tx =
+                               list_entry (peer->ksnp_tx_queue.next,
+                                               ksock_tx_t, tx_list);
+
+                       if (cfs_time_aftereq(cfs_time_current(),
+                                            tx->tx_deadline)) {
+
+                               ksocknal_peer_addref(peer);
+                               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+                               ksocknal_flush_stale_txs(peer);
+
+                               ksocknal_peer_decref(peer);
+                               goto again;
+                       }
+               }
+
+               if (list_empty(&peer->ksnp_zc_req_list))
+                       continue;
+
+               spin_lock(&peer->ksnp_lock);
+               list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+                       if (!cfs_time_aftereq(cfs_time_current(),
+                                             tx->tx_deadline))
+                               break;
+                       /* ignore the TX if connection is being closed */
+                       if (tx->tx_conn->ksnc_closing)
+                               continue;
+                       n++;
+               }
+
+               if (n == 0) {
+                       spin_unlock(&peer->ksnp_lock);
+                       continue;
+               }
+
+               tx = list_entry(peer->ksnp_zc_req_list.next,
+                                   ksock_tx_t, tx_zc_list);
+               deadline = tx->tx_deadline;
+               resid    = tx->tx_resid;
+               conn     = tx->tx_conn;
+               ksocknal_conn_addref(conn);
+
+               spin_unlock(&peer->ksnp_lock);
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+
+               CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+                      "oldest(%p) timed out %ld secs ago, "
+                      "resid: %d, wmem: %d\n",
+                      n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+                      cfs_duration_sec(cfs_time_current() - deadline),
+                      resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+               ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+               ksocknal_conn_decref(conn);
+               goto again;
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
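+/*
+ * Reaper thread: terminates conns on deathrow, destroys zombie conns,
+ * re-queues conns that stalled on ENOMEM, and paces the peer-table
+ * timeout scan so each connection is checked roughly 'n' times per
+ * timeout interval (see the chunk calculation below).
+ */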
+int
+ksocknal_reaper (void *arg)
+{
+       wait_queue_t     wait;
+       ksock_conn_t      *conn;
+       ksock_sched_t     *sched;
+       struct list_head         enomem_conns;
+       int             nenomem_conns;
+       cfs_duration_t     timeout;
+       int             i;
+       int             peer_index = 0;
+       cfs_time_t       deadline = cfs_time_current();
+
+       cfs_block_allsigs ();
+
+       INIT_LIST_HEAD(&enomem_conns);
+       init_waitqueue_entry_current (&wait);
+
+       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       while (!ksocknal_data.ksnd_shuttingdown) {
+
+               if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+                       conn = list_entry (ksocknal_data. \
+                                              ksnd_deathrow_conns.next,
+                                              ksock_conn_t, ksnc_list);
+                       list_del (&conn->ksnc_list);
+
+                       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+                       ksocknal_terminate_conn(conn);
+                       ksocknal_conn_decref(conn);
+
+                       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+                       continue;
+               }
+
+               if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+                       conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+                                              next, ksock_conn_t, ksnc_list);
+                       list_del (&conn->ksnc_list);
+
+                       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+                       ksocknal_destroy_conn(conn);
+
+                       spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+                       continue;
+               }
+
+               if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+                       list_add(&enomem_conns,
+                                    &ksocknal_data.ksnd_enomem_conns);
+                       list_del_init(&ksocknal_data.ksnd_enomem_conns);
+               }
+
+               spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+               /* reschedule all the connections that stalled with ENOMEM... */
+               nenomem_conns = 0;
+               while (!list_empty (&enomem_conns)) {
+                       conn = list_entry (enomem_conns.next,
+                                              ksock_conn_t, ksnc_tx_list);
+                       list_del (&conn->ksnc_tx_list);
+
+                       sched = conn->ksnc_scheduler;
+
+                       spin_lock_bh(&sched->kss_lock);
+
+                       LASSERT(conn->ksnc_tx_scheduled);
+                       conn->ksnc_tx_ready = 1;
+                       list_add_tail(&conn->ksnc_tx_list,
+                                         &sched->kss_tx_conns);
+                       wake_up(&sched->kss_waitq);
+
+                       spin_unlock_bh(&sched->kss_lock);
+                       nenomem_conns++;
+               }
+
+               /* careful with the jiffy wrap... */
+               while ((timeout = cfs_time_sub(deadline,
+                                              cfs_time_current())) <= 0) {
+                       const int n = 4;
+                       const int p = 1;
+                       int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+                       /* Time to check for timeouts on a few more peers: I do
+                        * checks every 'p' seconds on a proportion of the peer
+                        * table and I need to check every connection 'n' times
+                        * within a timeout interval, to ensure I detect a
+                        * timeout on any connection within (n+1)/n times the
+                        * timeout interval. */
+
+                       if (*ksocknal_tunables.ksnd_timeout > n * p)
+                               chunk = (chunk * n * p) /
+                                       *ksocknal_tunables.ksnd_timeout;
+                       if (chunk == 0)
+                               chunk = 1;
+
+                       for (i = 0; i < chunk; i++) {
+                               ksocknal_check_peer_timeouts (peer_index);
+                               peer_index = (peer_index + 1) %
+                                            ksocknal_data.ksnd_peer_hash_size;
+                       }
+
+                       deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+               }
+
+               if (nenomem_conns != 0) {
+                       /* Reduce my timeout if I rescheduled ENOMEM conns.
+                        * This also prevents me getting woken immediately
+                        * if any go back on my enomem list. */
+                       timeout = SOCKNAL_ENOMEM_RETRY;
+               }
+               ksocknal_data.ksnd_reaper_waketime =
+                       cfs_time_add(cfs_time_current(), timeout);
+
+               set_current_state (TASK_INTERRUPTIBLE);
+               add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+               if (!ksocknal_data.ksnd_shuttingdown &&
+                   list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+                   list_empty (&ksocknal_data.ksnd_zombie_conns))
+                       waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+                                            timeout);
+
+               set_current_state (TASK_RUNNING);
+               remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+               spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+       }
+
+       spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+       ksocknal_thread_fini();
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644 (file)
index 0000000..3e08fe2
--- /dev/null
@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+
+enum {
+       SOCKLND_TIMEOUT = 1,
+       SOCKLND_CREDITS,
+       SOCKLND_PEER_TXCREDITS,
+       SOCKLND_PEER_RTRCREDITS,
+       SOCKLND_PEER_TIMEOUT,
+       SOCKLND_NCONNDS,
+       SOCKLND_RECONNECTS_MIN,
+       SOCKLND_RECONNECTS_MAX,
+       SOCKLND_EAGER_ACK,
+       SOCKLND_ZERO_COPY,
+       SOCKLND_TYPED,
+       SOCKLND_BULK_MIN,
+       SOCKLND_RX_BUFFER_SIZE,
+       SOCKLND_TX_BUFFER_SIZE,
+       SOCKLND_NAGLE,
+       SOCKLND_IRQ_AFFINITY,
+       SOCKLND_ROUND_ROBIN,
+       SOCKLND_KEEPALIVE,
+       SOCKLND_KEEPALIVE_IDLE,
+       SOCKLND_KEEPALIVE_COUNT,
+       SOCKLND_KEEPALIVE_INTVL,
+       SOCKLND_BACKOFF_INIT,
+       SOCKLND_BACKOFF_MAX,
+       SOCKLND_PROTOCOL,
+       SOCKLND_ZERO_COPY_RECV,
+       SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+
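+/*
+ * Tunables exported under /proc/sys/socknal (see ksocknal_top_ctl_table
+ * below); writable entries (mode 0644) can be changed at runtime, e.g.
+ * 'echo 120 > /proc/sys/socknal/timeout', while 0444 entries are
+ * read-only.
+ */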
+static ctl_table_t ksocknal_ctl_table[] = {
+       {
+               .ctl_name = SOCKLND_TIMEOUT,
+               .procname = "timeout",
+               .data     = &ksocknal_tunables.ksnd_timeout,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_CREDITS,
+               .procname = "credits",
+               .data     = &ksocknal_tunables.ksnd_credits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_PEER_TXCREDITS,
+               .procname = "peer_credits",
+               .data     = &ksocknal_tunables.ksnd_peertxcredits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_PEER_RTRCREDITS,
+               .procname = "peer_buffer_credits",
+               .data     = &ksocknal_tunables.ksnd_peerrtrcredits,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_PEER_TIMEOUT,
+               .procname = "peer_timeout",
+               .data     = &ksocknal_tunables.ksnd_peertimeout,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_NCONNDS,
+               .procname = "nconnds",
+               .data     = &ksocknal_tunables.ksnd_nconnds,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RECONNECTS_MIN,
+               .procname = "min_reconnectms",
+               .data     = &ksocknal_tunables.ksnd_min_reconnectms,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RECONNECTS_MAX,
+               .procname = "max_reconnectms",
+               .data     = &ksocknal_tunables.ksnd_max_reconnectms,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_EAGER_ACK,
+               .procname = "eager_ack",
+               .data     = &ksocknal_tunables.ksnd_eager_ack,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY,
+               .procname = "zero_copy",
+               .data     = &ksocknal_tunables.ksnd_zc_min_payload,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY_RECV,
+               .procname = "zero_copy_recv",
+               .data     = &ksocknal_tunables.ksnd_zc_recv,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+               .procname = "zero_copy_recv_min_nfrags",
+               .data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_TYPED,
+               .procname = "typed",
+               .data     = &ksocknal_tunables.ksnd_typed_conns,
+               .maxlen   = sizeof (int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_BULK_MIN,
+               .procname = "min_bulk",
+               .data     = &ksocknal_tunables.ksnd_min_bulk,
+               .maxlen   = sizeof (int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_RX_BUFFER_SIZE,
+               .procname = "rx_buffer_size",
+               .data     = &ksocknal_tunables.ksnd_rx_buffer_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_TX_BUFFER_SIZE,
+               .procname = "tx_buffer_size",
+               .data     = &ksocknal_tunables.ksnd_tx_buffer_size,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_NAGLE,
+               .procname = "nagle",
+               .data     = &ksocknal_tunables.ksnd_nagle,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_ROUND_ROBIN,
+               .procname = "round_robin",
+               .data     = &ksocknal_tunables.ksnd_round_robin,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE,
+               .procname = "keepalive",
+               .data     = &ksocknal_tunables.ksnd_keepalive,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_IDLE,
+               .procname = "keepalive_idle",
+               .data     = &ksocknal_tunables.ksnd_keepalive_idle,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_COUNT,
+               .procname = "keepalive_count",
+               .data     = &ksocknal_tunables.ksnd_keepalive_count,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+       {
+               .ctl_name = SOCKLND_KEEPALIVE_INTVL,
+               .procname = "keepalive_intvl",
+               .data     = &ksocknal_tunables.ksnd_keepalive_intvl,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+#if SOCKNAL_VERSION_DEBUG
+       {
+               .ctl_name = SOCKLND_PROTOCOL,
+               .procname = "protocol",
+               .data     = &ksocknal_tunables.ksnd_protocol,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               .strategy = &sysctl_intvec,
+       },
+#endif
+       {0}
+};
+
+
+ctl_table_t ksocknal_top_ctl_table[] = {
+       {
+               .ctl_name = CTL_SOCKLND,
+               .procname = "socknal",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = ksocknal_ctl_table
+       },
+       { 0 }
+};
+
+int
+ksocknal_lib_tunables_init ()
+{
+       if (!*ksocknal_tunables.ksnd_typed_conns) {
+               int rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+               if (*ksocknal_tunables.ksnd_protocol < 3)
+                       rc = 0;
+#endif
+               if (rc != 0) {
+                       CERROR("Protocol V3.x MUST have typed connections\n");
+                       return rc;
+               }
+       }
+
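+       /* Clamp the ZC receive fragment threshold to a usable range:
+        * below 2 fragments zero-copy receive buys nothing, and more
+        * than LNET_MAX_IOV fragments can't fit in a single IOV. */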
+       if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+               *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+       if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+               *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
+       ksocknal_tunables.ksnd_sysctl =
+               cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+       if (ksocknal_tunables.ksnd_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+       if (ksocknal_tunables.ksnd_sysctl != NULL)
+               unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif /* CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+       int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr,
+                                    &conn->ksnc_port);
+
+       /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+       LASSERT (!conn->ksnc_closing);
+
+       if (rc != 0) {
+               CERROR ("Error %d getting sock peer IP\n", rc);
+               return rc;
+       }
+
+       rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
+       if (rc != 0) {
+               CERROR ("Error %d getting sock local IP\n", rc);
+               return rc;
+       }
+
+       return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+       int  caps = conn->ksnc_sock->sk->sk_route_caps;
+
+       if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+               return 0;
+
+       /* ZC if the socket supports scatter/gather and doesn't need software
+        * checksums */
+       return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct socket *sock = conn->ksnc_sock;
+       int         nob;
+       int         rc;
+
+       if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+           conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+           tx->tx_nob == tx->tx_resid           && /* first sending    */
+           tx->tx_msg.ksm_csum == 0)                /* not checksummed  */
+               ksocknal_lib_csum_tx(tx);
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+
+       {
+#if SOCKNAL_SINGLE_FRAG_TX
+               struct iovec    scratch;
+               struct iovec   *scratchiov = &scratch;
+               unsigned int    niov = 1;
+#else
+               struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+               unsigned int    niov = tx->tx_niov;
+#endif
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = scratchiov,
+                       .msg_iovlen     = niov,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = MSG_DONTWAIT
+               };
+               mm_segment_t oldmm = get_fs();
+               int  i;
+
+               for (nob = i = 0; i < niov; i++) {
+                       scratchiov[i] = tx->tx_iov[i];
+                       nob += scratchiov[i].iov_len;
+               }
+
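+               /* Hint MSG_MORE when more data follows immediately:
+                * either other txs are already queued on this conn or
+                * these iovs don't cover the whole residual message. */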
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   nob < tx->tx_resid)
+                       msg.msg_flags |= MSG_MORE;
+
+               set_fs (KERNEL_DS);
+               rc = sock_sendmsg(sock, &msg, nob);
+               set_fs (oldmm);
+       }
+       return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+       struct socket *sock = conn->ksnc_sock;
+       lnet_kiov_t   *kiov = tx->tx_kiov;
+       int         rc;
+       int         nob;
+
+       /* Not NOOP message */
+       LASSERT (tx->tx_lnetmsg != NULL);
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+               /* Zero copy is enabled */
+               struct sock   *sk = sock->sk;
+               struct page   *page = kiov->kiov_page;
+               int         offset = kiov->kiov_offset;
+               int         fragsize = kiov->kiov_len;
+               int         msgflg = MSG_DONTWAIT;
+
+               CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                              page, offset, kiov->kiov_len);
+
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   fragsize < tx->tx_resid)
+                       msgflg |= MSG_MORE;
+
+               if (sk->sk_prot->sendpage != NULL) {
+                       rc = sk->sk_prot->sendpage(sk, page,
+                                                  offset, fragsize, msgflg);
+               } else {
+                       rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+                                             msgflg);
+               }
+       } else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+               struct iovec  scratch;
+               struct iovec *scratchiov = &scratch;
+               unsigned int  niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+               struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+               unsigned int  niov = tx->tx_nkiov;
+#endif
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = scratchiov,
+                       .msg_iovlen     = niov,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = MSG_DONTWAIT
+               };
+               mm_segment_t  oldmm = get_fs();
+               int        i;
+
+               for (nob = i = 0; i < niov; i++) {
+                       scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+                                                kiov[i].kiov_offset;
+                       nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+               }
+
+               if (!list_empty(&conn->ksnc_tx_queue) ||
+                   nob < tx->tx_resid)
+                       msg.msg_flags |= MSG_MORE;
+
+               set_fs (KERNEL_DS);
+               rc = sock_sendmsg(sock, &msg, nob);
+               set_fs (oldmm);
+
+               for (i = 0; i < niov; i++)
+                       kunmap(kiov[i].kiov_page);
+       }
+       return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+       int         opt = 1;
+       mm_segment_t   oldmm = get_fs();
+       struct socket *sock = conn->ksnc_sock;
+
+       /* Remind the socket to ACK eagerly.  If I don't, the socket might
+        * think I'm about to send something it could piggy-back the ACK
+        * on, introducing delay in completing zero-copy sends in my
+        * peer. */
+
+       set_fs(KERNEL_DS);
+       sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+                              (char *)&opt, sizeof (opt));
+       set_fs(oldmm);
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+       struct iovec  scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int  niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+       struct iovec *iov = conn->ksnc_rx_iov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       mm_segment_t oldmm = get_fs();
+       int       nob;
+       int       i;
+       int       rc;
+       int       fragnob;
+       int       sum;
+       __u32   saved_csum;
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       LASSERT (niov > 0);
+
+       for (nob = i = 0; i < niov; i++) {
+               scratchiov[i] = iov[i];
+               nob += scratchiov[i].iov_len;
+       }
+       LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+       set_fs (KERNEL_DS);
+       rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+       /* NB this is just a boolean..........................^ */
+       set_fs (oldmm);
+
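+       /* The sender computed ksm_csum with the checksum field zeroed
+        * (see ksocknal_lib_csum_tx()), so stash and zero the field
+        * while accumulating the receive checksum, then restore it. */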
+       saved_csum = 0;
+       if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+               saved_csum = conn->ksnc_msg.ksm_csum;
+               conn->ksnc_msg.ksm_csum = 0;
+       }
+
+       if (saved_csum != 0) {
+               /* accumulate checksum */
+               for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                       LASSERT (i < niov);
+
+                       fragnob = iov[i].iov_len;
+                       if (fragnob > sum)
+                               fragnob = sum;
+
+                       conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+                                                          iov[i].iov_base, fragnob);
+               }
+               conn->ksnc_msg.ksm_csum = saved_csum;
+       }
+
+       return rc;
+}
+
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+       if (addr == NULL)
+               return;
+
+       vunmap(addr);
+}
+
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+                      struct iovec *iov, struct page **pages)
+{
+       void         *addr;
+       int            nob;
+       int            i;
+
+       if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+               return NULL;
+
+       LASSERT (niov <= LNET_MAX_IOV);
+
+       if (niov < 2 ||
+           niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+               return NULL;
+
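+       /* A single vmap() only works if the fragments tile whole pages:
+        * only the first may start at a nonzero offset and only the
+        * last may end before PAGE_CACHE_SIZE; anything else falls back
+        * to the caller's per-page kmap() path. */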
+       for (nob = i = 0; i < niov; i++) {
+               if ((kiov[i].kiov_offset != 0 && i > 0) ||
+                   (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+                       return NULL;
+
+               pages[i] = kiov[i].kiov_page;
+               nob += kiov[i].kiov_len;
+       }
+
+       addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+       if (addr == NULL)
+               return NULL;
+
+       iov->iov_base = addr + kiov[0].kiov_offset;
+       iov->iov_len = nob;
+
+       return addr;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+       struct iovec   scratch;
+       struct iovec  *scratchiov = &scratch;
+       struct page  **pages      = NULL;
+       unsigned int   niov       = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+       struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+       unsigned int   niov       = conn->ksnc_rx_nkiov;
+#endif
+       lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       mm_segment_t oldmm = get_fs();
+       int       nob;
+       int       i;
+       int       rc;
+       void    *base;
+       void    *addr;
+       int       sum;
+       int       fragnob;
+
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone. */
+       if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+               nob = scratchiov[0].iov_len;
+               msg.msg_iovlen = 1;
+
+       } else {
+               for (nob = i = 0; i < niov; i++) {
+                       nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+                       scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+                                                kiov[i].kiov_offset;
+               }
+               msg.msg_iovlen = niov;
+       }
+
+       LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+       set_fs (KERNEL_DS);
+       rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+       /* NB this is just a boolean.......................^ */
+       set_fs (oldmm);
+
+       if (conn->ksnc_msg.ksm_csum != 0) {
+               for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                       LASSERT (i < niov);
+
+                       /* Dang! have to kmap again because I have nowhere to stash the
+                        * mapped address.  But by doing it while the page is still
+                        * mapped, the kernel just bumps the map count and returns me
+                        * the address it stashed. */
+                       base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+                       fragnob = kiov[i].kiov_len;
+                       if (fragnob > sum)
+                               fragnob = sum;
+
+                       conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+                                                          base, fragnob);
+
+                       kunmap(kiov[i].kiov_page);
+               }
+       }
+
+       if (addr != NULL) {
+               ksocknal_lib_kiov_vunmap(addr);
+       } else {
+               for (i = 0; i < niov; i++)
+                       kunmap(kiov[i].kiov_page);
+       }
+
+       return (rc);
+}
+
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+       int       i;
+       __u32   csum;
+       void    *base;
+
+       LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+       LASSERT(tx->tx_conn != NULL);
+       LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+       tx->tx_msg.ksm_csum = 0;
+
+       csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+                            tx->tx_iov[0].iov_len);
+
+       if (tx->tx_kiov != NULL) {
+               for (i = 0; i < tx->tx_nkiov; i++) {
+                       base = kmap(tx->tx_kiov[i].kiov_page) +
+                              tx->tx_kiov[i].kiov_offset;
+
+                       csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+
+                       kunmap(tx->tx_kiov[i].kiov_page);
+               }
+       } else {
+               for (i = 1; i < tx->tx_niov; i++)
+                       csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+                                            tx->tx_iov[i].iov_len);
+       }
+
+       if (*ksocknal_tunables.ksnd_inject_csum_error) {
+               csum++;
+               *ksocknal_tunables.ksnd_inject_csum_error = 0;
+       }
+
+       tx->tx_msg.ksm_csum = csum;
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+       mm_segment_t   oldmm = get_fs ();
+       struct socket *sock = conn->ksnc_sock;
+       int         len;
+       int         rc;
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0) {
+               LASSERT (conn->ksnc_closing);
+               *txmem = *rxmem = *nagle = 0;
+               return (-ESHUTDOWN);
+       }
+
+       rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+       if (rc == 0) {
+               len = sizeof(*nagle);
+               set_fs(KERNEL_DS);
+               rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+                                          (char *)nagle, &len);
+               set_fs(oldmm);
+       }
+
+       ksocknal_connsock_decref(conn);
+
+       if (rc == 0)
+               *nagle = !*nagle;
+       else
+               *txmem = *rxmem = *nagle = 0;
+
+       return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+       mm_segment_t    oldmm = get_fs ();
+       int          rc;
+       int          option;
+       int          keep_idle;
+       int          keep_intvl;
+       int          keep_count;
+       int          do_keepalive;
+       struct linger   linger;
+
+       sock->sk->sk_allocation = GFP_NOFS;
+
+       /* Ensure this socket aborts active sends immediately when we close
+        * it. */
+
+       linger.l_onoff = 0;
+       linger.l_linger = 0;
+
+       set_fs (KERNEL_DS);
+       rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+                             (char *)&linger, sizeof (linger));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set SO_LINGER: %d\n", rc);
+               return (rc);
+       }
+
+       option = -1;
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+                                   (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set SO_LINGER2: %d\n", rc);
+               return (rc);
+       }
+
+       if (!*ksocknal_tunables.ksnd_nagle) {
+               option = 1;
+
+               set_fs (KERNEL_DS);
+               rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+                                           (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't disable nagle: %d\n", rc);
+                       return (rc);
+               }
+       }
+
+       rc = libcfs_sock_setbuf(sock,
+                               *ksocknal_tunables.ksnd_tx_buffer_size,
+                               *ksocknal_tunables.ksnd_rx_buffer_size);
+       if (rc != 0) {
+               CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                       *ksocknal_tunables.ksnd_tx_buffer_size,
+                       *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+               return (rc);
+       }
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
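+       /* Rough arithmetic with the module defaults (idle 30s, intvl 5s,
+        * count 5): a silent peer is declared dead about
+        * 30 + 5 * 5 = 55 seconds after its last traffic. */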
+       /* snapshot tunables */
+       keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+       keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+       keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+       do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+       option = (do_keepalive ? 1 : 0);
+       set_fs (KERNEL_DS);
+       rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+                             (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+               return (rc);
+       }
+
+       if (!do_keepalive)
+               return (0);
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+                                   (char *)&keep_idle, sizeof (keep_idle));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+                                   (char *)&keep_intvl, sizeof (keep_intvl));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+                                   (char *)&keep_count, sizeof (keep_count));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+               return (rc);
+       }
+
+       return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+       struct sock    *sk;
+       struct tcp_sock *tp;
+       int          nonagle;
+       int          val = 1;
+       int          rc;
+       mm_segment_t    oldmm;
+
+       rc = ksocknal_connsock_addref(conn);
+       if (rc != 0)                        /* being shut down */
+               return;
+
+       sk = conn->ksnc_sock->sk;
+       tp = tcp_sk(sk);
+
+       lock_sock (sk);
+       nonagle = tp->nonagle;
+       tp->nonagle = 1;
+       release_sock (sk);
+
+       oldmm = get_fs ();
+       set_fs (KERNEL_DS);
+
+       rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                     (char *)&val, sizeof (val));
+       LASSERT (rc == 0);
+
+       set_fs (oldmm);
+
+       lock_sock (sk);
+       tp->nonagle = nonagle;
+       release_sock (sk);
+
+       ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/*
+ * socket callbacks in Linux
+ */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+       ksock_conn_t  *conn;
+       ENTRY;
+
+       /* interleave correctly with closing sockets... */
+       LASSERT(!in_irq());
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = sk->sk_user_data;
+       if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
+               LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
+               sk->sk_data_ready (sk, n);
+       } else
+               ksocknal_read_callback(conn);
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       EXIT;
+}
+
+static void
+ksocknal_write_space (struct sock *sk)
+{
+       ksock_conn_t  *conn;
+       int         wspace;
+       int         min_wspace;
+
+       /* interleave correctly with closing sockets... */
+       LASSERT(!in_irq());
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = sk->sk_user_data;
+       wspace = SOCKNAL_WSPACE(sk);
+       min_wspace = SOCKNAL_MIN_WSPACE(sk);
+
+       CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+              sk, wspace, min_wspace, conn,
+              (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+                                     " ready" : " blocked"),
+              (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                     " scheduled" : " idle"),
+              (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                     " empty" : " queued"));
+
+       if (conn == NULL) {          /* raced with ksocknal_terminate_conn */
+               LASSERT (sk->sk_write_space != &ksocknal_write_space);
+               sk->sk_write_space (sk);
+
+               read_unlock(&ksocknal_data.ksnd_global_lock);
+               return;
+       }
+
+       if (wspace >= min_wspace) {            /* got enough space */
+               ksocknal_write_callback(conn);
+
+               /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+                * ENOMEM check in ksocknal_transmit is race-free (think about
+                * it). */
+
+               clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+       conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+       conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+       sock->sk->sk_user_data = conn;
+       sock->sk->sk_data_ready = ksocknal_data_ready;
+       sock->sk->sk_write_space = ksocknal_write_space;
+       return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+       /* Remove conn's network callbacks.
+        * NB I _have_ to restore the callback, rather than storing a noop,
+        * since the socket could survive past this module being unloaded!! */
+       sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+       sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+       /* A callback could be in progress already; they hold a read lock
+        * on ksnd_global_lock (to serialise with me) and NOOP if
+        * sk_user_data is NULL. */
+       sock->sk->sk_user_data = NULL;
+
+       return ;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+       int         rc = 0;
+       ksock_sched_t *sched;
+
+       sched = conn->ksnc_scheduler;
+       spin_lock_bh(&sched->kss_lock);
+
+       if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+           !conn->ksnc_tx_ready) {
+               /* SOCK_NOSPACE is set when the socket fills
+                * and cleared in the write_space callback
+                * (which also sets ksnc_tx_ready).  If
+                * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+                * zero, I didn't fill the socket and
+                * write_space won't reschedule me, so I
+                * return -ENOMEM to get my caller to retry
+                * after a timeout */
+               rc = -ENOMEM;
+       }
+
+       spin_unlock_bh(&sched->kss_lock);
+
+       return rc;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644 (file)
index 0000000..3c13578
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
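+/* The live branch below delegates to the kernel's crc32_le(); the
+ * disabled #else branch is a trivial rolling sum, presumably kept
+ * around for debugging checksum mismatches. */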
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+       return crc32_le(crc, p, len);
+#else
+       while (len-- > 0)
+               crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff);
+       return crc;
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS                3
+#define SOCKNAL_NSCHEDS_HIGH   (SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644 (file)
index 0000000..8a474f6
--- /dev/null
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+               "dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+               "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each thread pool, which is per-CPU-partition
+ * (percpt); we will estimate a reasonable value based on CPUs if it
+ * is not set. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+               "# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+               "# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+               "max # connection daemons");
+
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+               "min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+               "max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+               "send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+               "use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+               "smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+               "socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+               "socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+               "enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+               "Round robin for multiple interfaces");
+
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+               "# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+               "# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT  5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+               "# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+               "seconds between probes");
+
+static int enable_csum = 0;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+               "enable check sum");
+
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+               "set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+               "always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
+               "minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+CFS_MODULE_PARM(zc_recv, "i", int, 0644,
+               "enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
+               "minimum # of fragments to enable ZC recv");
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+               "protocol version");
+#endif
+
+ksock_tunables_t ksocknal_tunables;
+
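+/* Every ksnd_* tunable below is a pointer at one of the module
+ * parameters above, so the sysctl table (where enabled) and the
+ * module parameters share the same storage. */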
+int ksocknal_tunables_init(void)
+{
+       /* initialize ksocknal_tunables structure */
+       ksocknal_tunables.ksnd_timeout      = &sock_timeout;
+       ksocknal_tunables.ksnd_nscheds            = &nscheds;
+       ksocknal_tunables.ksnd_nconnds      = &nconnds;
+       ksocknal_tunables.ksnd_nconnds_max      = &nconnds_max;
+       ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+       ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+       ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+       ksocknal_tunables.ksnd_typed_conns      = &typed_conns;
+       ksocknal_tunables.ksnd_min_bulk    = &min_bulk;
+       ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+       ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+       ksocknal_tunables.ksnd_nagle          = &nagle;
+       ksocknal_tunables.ksnd_round_robin      = &round_robin;
+       ksocknal_tunables.ksnd_keepalive          = &keepalive;
+       ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+       ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+       ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+       ksocknal_tunables.ksnd_credits      = &credits;
+       ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+       ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+       ksocknal_tunables.ksnd_peertimeout      = &peer_timeout;
+       ksocknal_tunables.ksnd_enable_csum      = &enable_csum;
+       ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+       ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+       ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+       ksocknal_tunables.ksnd_zc_recv      = &zc_recv;
+       ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+#if SOCKNAL_VERSION_DEBUG
+       ksocknal_tunables.ksnd_protocol    = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+       ksocknal_tunables.ksnd_sysctl        =  NULL;
+#endif
+
+       if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+               *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+       /* initialize platform-specific tunables */
+       return ksocknal_lib_tunables_init();
+}
+
+void ksocknal_tunables_fini(void)
+{
+       ksocknal_lib_tunables_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644 (file)
index 0000000..ec57179
--- /dev/null
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries:
+ *   pro_send_hello()     : send hello message
+ *   pro_recv_hello()     : receive hello message
+ *   pro_pack()           : pack message header
+ *   pro_unpack()         : unpack message header
+ *   pro_queue_tx_zcack() : called holding BH lock: kss_lock
+ *                          return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : called holding BH lock: kss_lock
+ *                          return the ACK piggybacked by my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+       /* V1.x, just enqueue it */
+       list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+       return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+       ksock_tx_t     *tx = conn->ksnc_tx_carrier;
+
+       /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+       LASSERT (!list_empty(&conn->ksnc_tx_queue));
+       LASSERT (tx != NULL);
+
+       /* Next TX that can carry ZC-ACK or LNet message */
+       if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+               /* no more packets queued */
+               conn->ksnc_tx_carrier = NULL;
+       } else {
+               conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+                                                      ksock_tx_t, tx_list);
+               LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+       }
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+                          ksock_tx_t *tx_ack, __u64 cookie)
+{
+       ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+       LASSERT (tx_ack == NULL ||
+                tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       /*
+        * Enqueue or piggyback tx_ack / cookie:
+        * . if no queued tx can piggyback the cookie of tx_ack (or
+        *   cookie), just enqueue tx_ack (if tx_ack != NULL) and
+        *   return 0.
+        * . if a queued tx can piggyback the cookie of tx_ack (or
+        *   cookie), piggyback it there and return 1.
+        */
+       if (tx == NULL) {
+               if (tx_ack != NULL) {
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+                       conn->ksnc_tx_carrier = tx_ack;
+               }
+               return 0;
+       }
+
+       if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+               /* tx is noop zc-ack, can't piggyback zc-ack cookie */
+               if (tx_ack != NULL)
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+               return 0;
+       }
+
+       LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+       LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+       if (tx_ack != NULL)
+               cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+       /* piggyback the zc-ack cookie */
+       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+       /* move on to the next TX which can carry cookie */
+       ksocknal_next_tx_carrier(conn);
+
+       return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+       ksock_tx_t  *tx  = conn->ksnc_tx_carrier;
+
+       /*
+        * Enqueue tx_msg:
+        * . If there is no NOOP on the connection, just enqueue
+        *   tx_msg and return NULL
+        * . If there is a NOOP on the connection, piggyback the cookie
+        *   and replace the NOOP tx, and return the NOOP tx.
+        */
+       if (tx == NULL) { /* nothing on queue */
+               list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+               conn->ksnc_tx_carrier = tx_msg;
+               return NULL;
+       }
+
+       if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+               list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+               return NULL;
+       }
+
+       LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       /* There is a NOOP zc-ack that can be piggybacked */
+       tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+       ksocknal_next_tx_carrier(conn);
+
+       /* use new_tx to replace the noop zc-ack packet */
+       list_add(&tx_msg->tx_list, &tx->tx_list);
+       list_del(&tx->tx_list);
+
+       return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+                          ksock_tx_t *tx_ack, __u64 cookie)
+{
+       ksock_tx_t *tx;
+
+       if (conn->ksnc_type != SOCKLND_CONN_ACK)
+               return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+       /* non-blocking ZC-ACK (to router) */
+       LASSERT (tx_ack == NULL ||
+                tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+       if ((tx = conn->ksnc_tx_carrier) == NULL) {
+               if (tx_ack != NULL) {
+                       list_add_tail(&tx_ack->tx_list,
+                                         &conn->ksnc_tx_queue);
+                       conn->ksnc_tx_carrier = tx_ack;
+               }
+               return 0;
+       }
+
+       /* conn->ksnc_tx_carrier != NULL */
+
+       if (tx_ack != NULL)
+               cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+       if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+               return 1;
+
+       if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+               /* replace the keepalive PING with a real ACK */
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+               tx->tx_msg.ksm_zc_cookies[1] = cookie;
+               return 1;
+       }
+
+       if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+           cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+               CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+                     libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+               return 1; /* XXX return error in the future */
+       }
+
+       if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+               /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+               if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+                       tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+                       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+               } else {
+                       tx->tx_msg.ksm_zc_cookies[0] = cookie;
+               }
+
+               if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+                       /* not likely to carry more ACKs, skip it to simplify logic */
+                       ksocknal_next_tx_carrier(conn);
+               }
+
+               return 1;
+       }
+
+       /* takes two or more cookies already */
+
+       if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+               __u64   tmp = 0;
+
+               /* two separated cookies: (a+2, a) or (a+1, a) */
+               LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+                        tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
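+               /* e.g. carrying (a+2, a) and receiving cookie a+1 merges
+                * all three into the inclusive range [a, a+2], stored
+                * below as cookies[0] = tmp - 1, cookies[1] = tmp + 1. */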
+               if (tx->tx_msg.ksm_zc_cookies[0] -
+                   tx->tx_msg.ksm_zc_cookies[1] == 2) {
+                       if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+                               tmp = cookie;
+               } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+                       tmp = tx->tx_msg.ksm_zc_cookies[1];
+               } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+                       tmp = tx->tx_msg.ksm_zc_cookies[0];
+               }
+
+               if (tmp != 0) {
+                       /* range of cookies */
+                       tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+                       tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+                       return 1;
+               }
+
+       } else {
+               /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+               if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+                   cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+                       CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+                             libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+                       return 1; /* XXX: return error in the future */
+               }
+
+               if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+                       tx->tx_msg.ksm_zc_cookies[1] = cookie;
+                       return 1;
+               }
+
+               if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+                       tx->tx_msg.ksm_zc_cookies[0] = cookie;
+                       return 1;
+               }
+       }
+
+       /* failed to piggyback ZC-ACK */
+       if (tx_ack != NULL) {
+               list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+               /* the next tx can piggyback at least 1 ACK */
+               ksocknal_next_tx_carrier(conn);
+       }
+
+       return 0;
+}
+
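+/* Decide whether a tx should go out on this connection.  The return
+ * codes seem to rank preference: SOCKNAL_MATCH_YES for the preferred
+ * connection type, SOCKNAL_MATCH_MAY for an acceptable fallback, and
+ * SOCKNAL_MATCH_NO when this type must not carry the tx. */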
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+       int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+       if (!*ksocknal_tunables.ksnd_typed_conns)
+               return SOCKNAL_MATCH_YES;
+#endif
+
+       if (tx == NULL || tx->tx_lnetmsg == NULL) {
+               /* noop packet */
+               nob = offsetof(ksock_msg_t, ksm_u);
+       } else {
+               nob = tx->tx_lnetmsg->msg_len +
+                     ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+                      sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+       }
+
+       /* default checking for typed connection */
+       switch (conn->ksnc_type) {
+       default:
+               CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+               LBUG();
+       case SOCKLND_CONN_ANY:
+               return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_BULK_IN:
+               return SOCKNAL_MATCH_MAY;
+
+       case SOCKLND_CONN_BULK_OUT:
+               if (nob < *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_CONTROL:
+               if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+       }
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+       int nob;
+
+       if (tx == NULL || tx->tx_lnetmsg == NULL)
+               nob = offsetof(ksock_msg_t, ksm_u);
+       else
+               nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+       switch (conn->ksnc_type) {
+       default:
+               CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+               LBUG();
+       case SOCKLND_CONN_ANY:
+               return SOCKNAL_MATCH_NO;
+
+       case SOCKLND_CONN_ACK:
+               if (nonblk)
+                       return SOCKNAL_MATCH_YES;
+               else if (tx == NULL || tx->tx_lnetmsg == NULL)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_NO;
+
+       case SOCKLND_CONN_BULK_OUT:
+               if (nonblk)
+                       return SOCKNAL_MATCH_NO;
+               else if (nob < *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+
+       case SOCKLND_CONN_CONTROL:
+               if (nonblk)
+                       return SOCKNAL_MATCH_NO;
+               else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+                       return SOCKNAL_MATCH_MAY;
+               else
+                       return SOCKNAL_MATCH_YES;
+       }
+}
+
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+       ksock_peer_t   *peer = c->ksnc_peer;
+       ksock_conn_t   *conn;
+       ksock_tx_t     *tx;
+       int          rc;
+
+       read_lock(&ksocknal_data.ksnd_global_lock);
+
+       conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+       if (conn != NULL) {
+               ksock_sched_t *sched = conn->ksnc_scheduler;
+
+               LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+               spin_lock_bh(&sched->kss_lock);
+
+               rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
+
+               spin_unlock_bh(&sched->kss_lock);
+
+               if (rc) { /* piggybacked */
+                       read_unlock(&ksocknal_data.ksnd_global_lock);
+                       return 0;
+               }
+       }
+
+       read_unlock(&ksocknal_data.ksnd_global_lock);
+
+       /* ACK connection is not ready, or can't piggyback the ACK */
+       tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+       if (tx == NULL)
+               return -ENOMEM;
+
+       if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0)
+               return 0;
+
+       ksocknal_free_tx(tx);
+       return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+       ksock_peer_t      *peer = conn->ksnc_peer;
+       ksock_tx_t      *tx;
+       ksock_tx_t      *tmp;
+       LIST_HEAD     (zlist);
+       int             count;
+
+       if (cookie1 == 0)
+               cookie1 = cookie2;
+
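+       /* cookie1 > cookie2 encodes two separate cookies; otherwise
+        * [cookie1, cookie2] is an inclusive range, mirroring the
+        * cookie coalescing in ksocknal_queue_tx_zcack_v3(). */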
+       count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+       if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+           conn->ksnc_proto == &ksocknal_protocol_v3x) {
+               /* keepalive PING for V3.x, just ignore it */
+               return count == 1 ? 0 : -EPROTO;
+       }
+
+       spin_lock(&peer->ksnp_lock);
+
+       list_for_each_entry_safe(tx, tmp,
+                                    &peer->ksnp_zc_req_list, tx_zc_list) {
+               __u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+               if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+                       tx->tx_msg.ksm_zc_cookies[0] = 0;
+                       list_del(&tx->tx_zc_list);
+                       list_add(&tx->tx_zc_list, &zlist);
+
+                       if (--count == 0)
+                               break;
+               }
+       }
+
+       spin_unlock(&peer->ksnp_lock);
+
+       while (!list_empty(&zlist)) {
+               tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+               list_del(&tx->tx_zc_list);
+               ksocknal_tx_decref(tx);
+       }
+
+       return count == 0 ? 0 : -EPROTO;
+}
+
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+       socket_t        *sock = conn->ksnc_sock;
+       lnet_hdr_t        *hdr;
+       lnet_magicversion_t *hmv;
+       int               rc;
+       int               i;
+
+       CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+       LIBCFS_ALLOC(hdr, sizeof(*hdr));
+       if (hdr == NULL) {
+               CERROR("Can't allocate lnet_hdr_t\n");
+               return -ENOMEM;
+       }
+
+       hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+       /* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+        * header and send out */
+       hmv->magic       = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+       hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+       hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+       if (the_lnet.ln_testprotocompat != 0) {
+               /* single-shot proto check */
+               LNET_LOCK();
+               if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                       hmv->version_major++;   /* just different! */
+                       the_lnet.ln_testprotocompat &= ~1;
+               }
+               if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                       hmv->magic = LNET_PROTO_MAGIC;
+                       the_lnet.ln_testprotocompat &= ~2;
+               }
+               LNET_UNLOCK();
+       }
+
+       hdr->src_nid    = cpu_to_le64 (hello->kshm_src_nid);
+       hdr->src_pid    = cpu_to_le32 (hello->kshm_src_pid);
+       hdr->type          = cpu_to_le32 (LNET_MSG_HELLO);
+       hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+       hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+       hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+       rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               goto out;
+       }
+
+       if (hello->kshm_nips == 0)
+               goto out;
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+       }
+
+       rc = libcfs_sock_write(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32),
+                              lnet_acceptor_timeout());
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO payload (%d)"
+                       " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+       }
+out:
+       LIBCFS_FREE(hdr, sizeof(*hdr));
+
+       return rc;
+}
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+       socket_t   *sock = conn->ksnc_sock;
+       int          rc;
+
+       hello->kshm_magic   = LNET_PROTO_MAGIC;
+       hello->kshm_version = conn->ksnc_proto->pro_version;
+
+       if (the_lnet.ln_testprotocompat != 0) {
+               /* single-shot proto check */
+               LNET_LOCK();
+               if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                       hello->kshm_version++;   /* just different! */
+                       the_lnet.ln_testprotocompat &= ~1;
+               }
+               LNET_UNLOCK();
+       }
+
+       rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+                              lnet_acceptor_timeout());
+
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+               return rc;
+       }
+
+       if (hello->kshm_nips == 0)
+               return 0;
+
+       rc = libcfs_sock_write(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32),
+                              lnet_acceptor_timeout());
+       if (rc != 0) {
+               CNETERR("Error %d sending HELLO payload (%d)"
+                       " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+                       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+       }
+
+       return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,
+                      int timeout)
+{
+       socket_t        *sock = conn->ksnc_sock;
+       lnet_hdr_t        *hdr;
+       int               rc;
+       int               i;
+
+       LIBCFS_ALLOC(hdr, sizeof(*hdr));
+       if (hdr == NULL) {
+               CERROR("Can't allocate lnet_hdr_t\n");
+               return -ENOMEM;
+       }
+
+       rc = libcfs_sock_read(sock, &hdr->src_nid,
+                             sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+                             timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               goto out;
+       }
+
+       /* ...and check we got what we expected */
+       if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+               CERROR ("Expecting a HELLO hdr,"
+                       " but got type %d from %u.%u.%u.%u\n",
+                       le32_to_cpu (hdr->type),
+                       HIPQUAD(conn->ksnc_ipaddr));
+               rc = -EPROTO;
+               goto out;
+       }
+
+       hello->kshm_src_nid      = le64_to_cpu (hdr->src_nid);
+       hello->kshm_src_pid      = le32_to_cpu (hdr->src_pid);
+       hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+       hello->kshm_ctype          = le32_to_cpu (hdr->msg.hello.type);
+       hello->kshm_nips            = le32_to_cpu (hdr->payload_length) /
+                                        sizeof (__u32);
+
+       if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+               CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                      hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+               rc = -EPROTO;
+               goto out;
+       }
+
+       if (hello->kshm_nips == 0)
+               goto out;
+
+       rc = libcfs_sock_read(sock, hello->kshm_ips,
+                             hello->kshm_nips * sizeof(__u32), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               goto out;
+       }
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+               if (hello->kshm_ips[i] == 0) {
+                       CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                              i, HIPQUAD(conn->ksnc_ipaddr));
+                       rc = -EPROTO;
+                       break;
+               }
+       }
+out:
+       LIBCFS_FREE(hdr, sizeof(*hdr));
+
+       return rc;
+}
+
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+       socket_t      *sock = conn->ksnc_sock;
+       int             rc;
+       int             i;
+
+       if (hello->kshm_magic == LNET_PROTO_MAGIC)
+               conn->ksnc_flip = 0;
+       else
+               conn->ksnc_flip = 1;
+
+       rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+                             offsetof(ksock_hello_msg_t, kshm_ips) -
+                                      offsetof(ksock_hello_msg_t, kshm_src_nid),
+                             timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               return rc;
+       }
+
+       if (conn->ksnc_flip) {
+               __swab32s(&hello->kshm_src_pid);
+               __swab64s(&hello->kshm_src_nid);
+               __swab32s(&hello->kshm_dst_pid);
+               __swab64s(&hello->kshm_dst_nid);
+               __swab64s(&hello->kshm_src_incarnation);
+               __swab64s(&hello->kshm_dst_incarnation);
+               __swab32s(&hello->kshm_ctype);
+               __swab32s(&hello->kshm_nips);
+       }
+
+       if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+               CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                      hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+               return -EPROTO;
+       }
+
+       if (hello->kshm_nips == 0)
+               return 0;
+
+       rc = libcfs_sock_read(sock, hello->kshm_ips,
+                             hello->kshm_nips * sizeof(__u32), timeout);
+       if (rc != 0) {
+               CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+               LASSERT (rc < 0 && rc != -EALREADY);
+               return rc;
+       }
+
+       for (i = 0; i < (int) hello->kshm_nips; i++) {
+               if (conn->ksnc_flip)
+                       __swab32s(&hello->kshm_ips[i]);
+
+               if (hello->kshm_ips[i] == 0) {
+                       CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                              i, HIPQUAD(conn->ksnc_ipaddr));
+                       return -EPROTO;
+               }
+       }
+
+       return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+       /* V1.x has no KSOCK_MSG_NOOP */
+       LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+       LASSERT(tx->tx_lnetmsg != NULL);
+
+       tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+       tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+
+       tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+       tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+       if (tx->tx_lnetmsg != NULL) {
+               LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+               tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+               tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+               tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+       } else {
+               LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+               tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+               tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+       }
+       /* Don't checksum before we start sending: a ZC-ACK may still be
+        * piggybacked onto this packet, changing the bytes on the wire. */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+       msg->ksm_csum      = 0;
+       msg->ksm_type      = KSOCK_MSG_LNET;
+       msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+       /* Do nothing */
+}
+
+ksock_proto_t  ksocknal_protocol_v1x =
+{
+       .pro_version            = KSOCK_PROTO_V1,
+       .pro_send_hello         = ksocknal_send_hello_v1,
+       .pro_recv_hello         = ksocknal_recv_hello_v1,
+       .pro_pack               = ksocknal_pack_msg_v1,
+       .pro_unpack             = ksocknal_unpack_msg_v1,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v1,
+       .pro_handle_zcreq       = NULL,
+       .pro_handle_zcack       = NULL,
+       .pro_queue_tx_zcack     = NULL,
+       .pro_match_tx           = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v2x =
+{
+       .pro_version            = KSOCK_PROTO_V2,
+       .pro_send_hello         = ksocknal_send_hello_v2,
+       .pro_recv_hello         = ksocknal_recv_hello_v2,
+       .pro_pack               = ksocknal_pack_msg_v2,
+       .pro_unpack             = ksocknal_unpack_msg_v2,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+       .pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v2,
+       .pro_handle_zcreq       = ksocknal_handle_zcreq,
+       .pro_handle_zcack       = ksocknal_handle_zcack,
+       .pro_match_tx           = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v3x =
+{
+       .pro_version            = KSOCK_PROTO_V3,
+       .pro_send_hello         = ksocknal_send_hello_v2,
+       .pro_recv_hello         = ksocknal_recv_hello_v2,
+       .pro_pack               = ksocknal_pack_msg_v2,
+       .pro_unpack             = ksocknal_unpack_msg_v2,
+       .pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+       .pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v3,
+       .pro_handle_zcreq       = ksocknal_handle_zcreq,
+       .pro_handle_zcack       = ksocknal_handle_zcack,
+       .pro_match_tx           = ksocknal_match_tx_v3
+};
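+
+/*
+ * A minimal dispatch sketch (assuming callers go through ksnc_proto,
+ * as ksocknal_send_hello_v2() above reads pro_version from it):
+ *
+ *     rc = conn->ksnc_proto->pro_send_hello(conn, hello);
+ *
+ * V1 connections take the lnet_hdr_t path, while V2 and V3 share the
+ * ksock_hello_msg_t path and differ only in the zero-copy ACK queueing
+ * (pro_queue_tx_zcack) and tx matching (pro_match_tx) hooks.
+ */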
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644 (file)
index 0000000..1bd9ef7
--- /dev/null
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o    \
+         lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o          \
+         router_proc.o acceptor.o peer.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644 (file)
index 0000000..81ef28b
--- /dev/null
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int   accept_port    = 988;
+static int   accept_backlog = 127;
+static int   accept_timeout = 5;
+
+struct {
+       int                     pta_shutdown;
+       socket_t                *pta_sock;
+       struct completion       pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+       return accept_port;
+}
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+       return (magic == constant ||
+               magic == __swab32(constant));
+}
+
+static char *accept = "secure";
+
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+               "Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+               "Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+               "Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+               "Acceptor's timeout (seconds)");
+
+static char *accept_type = NULL;
+
+int
+lnet_acceptor_get_tunables(void)
+{
+       /* Userland acceptor uses 'accept_type' instead of 'accept', due to
+        * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+        * for compatibility. Hence the trick. */
+       accept_type = accept;
+       return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+       return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+                          __u32 peer_ip, int peer_port)
+{
+       switch (rc) {
+       /* "normal" errors */
+       case -ECONNREFUSED:
+               CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+                       "refused: check that Lustre is running on that node.\n",
+                       libcfs_nid2str(peer_nid),
+                       HIPQUAD(peer_ip), peer_port);
+               break;
+       case -EHOSTUNREACH:
+       case -ENETUNREACH:
+               CNETERR("Connection to %s at host %u.%u.%u.%u "
+                       "was unreachable: the network or that node may "
+                       "be down, or Lustre may be misconfigured.\n",
+                       libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+               break;
+       case -ETIMEDOUT:
+               CNETERR("Connection to %s at host %u.%u.%u.%u on "
+                       "port %d took too long: that node may be hung "
+                       "or experiencing high load.\n",
+                       libcfs_nid2str(peer_nid),
+                       HIPQUAD(peer_ip), peer_port);
+               break;
+       case -ECONNRESET:
+               LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+                                  " on port %d was reset: "
+                                  "is it running a compatible version of "
+                                  "Lustre and is %s one of its NIDs?\n",
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port,
+                                  libcfs_nid2str(peer_nid));
+               break;
+       case -EPROTO:
+               LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+                                  "host %u.%u.%u.%u on port %d: is it running "
+                                  "a compatible version of Lustre?\n",
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       case -EADDRINUSE:
+               LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+                                  "connect to %s at host %u.%u.%u.%u on port "
+                                  "%d\n", libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       default:
+               LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+                                  " at host %u.%u.%u.%u on port %d\n", rc,
+                                  libcfs_nid2str(peer_nid),
+                                  HIPQUAD(peer_ip), peer_port);
+               break;
+       }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+           __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+       lnet_acceptor_connreq_t cr;
+       socket_t           *sock;
+       int                  rc;
+       int                  port;
+       int                  fatal;
+
+       CLASSERT (sizeof(cr) <= 16);        /* not too big to be on the stack */
+
+       for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+            port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+            --port) {
+               /* Iterate through reserved ports. */
+
+               rc = libcfs_sock_connect(&sock, &fatal,
+                                        local_ip, port,
+                                        peer_ip, peer_port);
+               if (rc != 0) {
+                       if (fatal)
+                               goto failed;
+                       continue;
+               }
+
+               CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+               cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+               cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+               cr.acr_nid     = peer_nid;
+
+               if (the_lnet.ln_testprotocompat != 0) {
+                       /* single-shot proto check */
+                       lnet_net_lock(LNET_LOCK_EX);
+                       if ((the_lnet.ln_testprotocompat & 4) != 0) {
+                               cr.acr_version++;
+                               the_lnet.ln_testprotocompat &= ~4;
+                       }
+                       if ((the_lnet.ln_testprotocompat & 8) != 0) {
+                               cr.acr_magic = LNET_PROTO_MAGIC;
+                               the_lnet.ln_testprotocompat &= ~8;
+                       }
+                       lnet_net_unlock(LNET_LOCK_EX);
+               }
+
+               rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                      accept_timeout);
+               if (rc != 0)
+                       goto failed_sock;
+
+               *sockp = sock;
+               return 0;
+       }
+
+       rc = -EADDRINUSE;
+       goto failed;
+
+ failed_sock:
+       libcfs_sock_release(sock);
+ failed:
+       lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+       return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+/* Below is the code common to both kernel and MT user-space */
+
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+       lnet_acceptor_connreq_t cr;
+       __u32              peer_ip;
+       int                  peer_port;
+       int                  rc;
+       int                  flip;
+       lnet_ni_t             *ni;
+       char               *str;
+
+       LASSERT (sizeof(cr) <= 16);          /* not too big for the stack */
+
+       rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+       LASSERT (rc == 0);                    /* we succeeded before */
+
+       if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+               if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+                       /* future version compatibility!
+                        * When LNET unifies protocols over all LNDs, the first
+                        * thing sent will be a version query.  I send back
+                        * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+                       memset (&cr, 0, sizeof(cr));
+                       cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+                       cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+                       rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                              accept_timeout);
+
+                       if (rc != 0)
+                               CERROR("Error sending magic+version in response"
+                                      "to LNET magic from %u.%u.%u.%u: %d\n",
+                                      HIPQUAD(peer_ip), rc);
+                       return -EPROTO;
+               }
+
+               if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+                       str = "'old' socknal/tcpnal";
+               else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+                       str = "'old' ranal";
+               else
+                       str = "unrecognised";
+
+               LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+                                  " magic %08x: %s acceptor protocol\n",
+                                  HIPQUAD(peer_ip), magic, str);
+               return -EPROTO;
+       }
+
+       flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+       rc = libcfs_sock_read(sock, &cr.acr_version,
+                             sizeof(cr.acr_version),
+                             accept_timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading connection request version from "
+                      "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+               return -EIO;
+       }
+
+       if (flip)
+               __swab32s(&cr.acr_version);
+
+       if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+               /* future version compatibility!
+                * An acceptor-specific protocol rev will first send a version
+                * query.  I send back my current version to tell her I'm
+                * "old". */
+               int peer_version = cr.acr_version;
+
+               memset (&cr, 0, sizeof(cr));
+               cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+               cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+               rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                      accept_timeout);
+
+               if (rc != 0)
+                       CERROR("Error sending magic+version in response"
+                              "to version %d from %u.%u.%u.%u: %d\n",
+                              peer_version, HIPQUAD(peer_ip), rc);
+               return -EPROTO;
+       }
+
+       rc = libcfs_sock_read(sock, &cr.acr_nid,
+                             sizeof(cr) -
+                             offsetof(lnet_acceptor_connreq_t, acr_nid),
+                             accept_timeout);
+       if (rc != 0) {
+               CERROR("Error %d reading connection request from "
+                      "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+               return -EIO;
+       }
+
+       if (flip)
+               __swab64s(&cr.acr_nid);
+
+       ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+       if (ni == NULL ||              /* no matching net */
+           ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+               if (ni != NULL)
+                       lnet_ni_decref(ni);
+               LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+                                  " for %s: No matching NI\n",
+                                  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+               return -EPERM;
+       }
+
+       if (ni->ni_lnd->lnd_accept == NULL) {
+               /* This catches a request for the loopback LND */
+               lnet_ni_decref(ni);
+               LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+                                  " for %s: NI does not accept IP connections\n",
+                                  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+               return -EPERM;
+       }
+
+       CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+              libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+       rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+       lnet_ni_decref(ni);
+       return rc;
+}
+
+int
+lnet_acceptor(void *arg)
+{
+       socket_t  *newsock;
+       int         rc;
+       __u32     magic;
+       __u32     peer_ip;
+       int         peer_port;
+       int         secure = (int)((long_ptr_t)arg);
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+       cfs_block_allsigs();
+
+       rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+                               0, accept_port, accept_backlog);
+       if (rc != 0) {
+               if (rc == -EADDRINUSE)
+                       LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+                                          " %d: port already in use\n",
+                                          accept_port);
+               else
+                       LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+                                          "%d: unexpected error %d\n",
+                                          accept_port, rc);
+
+               lnet_acceptor_state.pta_sock = NULL;
+       } else {
+               LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+       }
+
+       /* set init status and unblock parent */
+       lnet_acceptor_state.pta_shutdown = rc;
+       complete(&lnet_acceptor_state.pta_signal);
+
+       if (rc != 0)
+               return rc;
+
+       while (!lnet_acceptor_state.pta_shutdown) {
+
+               rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+               if (rc != 0) {
+                       if (rc != -EAGAIN) {
+                               CWARN("Accept error %d: pausing...\n", rc);
+                               cfs_pause(cfs_time_seconds(1));
+                       }
+                       continue;
+               }
+
+               /* maybe we were woken up by libcfs_sock_abort_accept() */
+               if (lnet_acceptor_state.pta_shutdown) {
+                       libcfs_sock_release(newsock);
+                       break;
+               }
+
+               rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+               if (rc != 0) {
+                       CERROR("Can't determine new connection's address\n");
+                       goto failed;
+               }
+
+               if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+                       CERROR("Refusing connection from %u.%u.%u.%u: "
+                              "insecure port %d\n",
+                              HIPQUAD(peer_ip), peer_port);
+                       goto failed;
+               }
+
+               rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+                                     accept_timeout);
+               if (rc != 0) {
+                       CERROR("Error %d reading connection request from "
+                              "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+                       goto failed;
+               }
+
+               rc = lnet_accept(newsock, magic);
+               if (rc != 0)
+                       goto failed;
+
+               continue;
+
+       failed:
+               libcfs_sock_release(newsock);
+       }
+
+       libcfs_sock_release(lnet_acceptor_state.pta_sock);
+       lnet_acceptor_state.pta_sock = NULL;
+
+       CDEBUG(D_NET, "Acceptor stopping\n");
+
+       /* unblock lnet_acceptor_stop() */
+       complete(&lnet_acceptor_state.pta_signal);
+       return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+       if (!strcmp(acc, "secure")) {
+               *sec = 1;
+               return 1;
+       } else if (!strcmp(acc, "all")) {
+               *sec = 0;
+               return 1;
+       } else if (!strcmp(acc, "none")) {
+               return 0;
+       } else {
+               LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+                                  acc);
+               return -EINVAL;
+       }
+}
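+
+/*
+ * Return convention, as consumed by lnet_acceptor_start() below:
+ *   "secure" -> returns 1, *sec = 1  (acceptor needed; privileged peer ports only)
+ *   "all"    -> returns 1, *sec = 0  (acceptor needed; any peer port)
+ *   "none"   -> returns 0            (no acceptor thread required)
+ *   other    -> returns -EINVAL
+ */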
+
+int
+lnet_acceptor_start(void)
+{
+       int  rc;
+       long rc2;
+       long secure;
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+       rc = lnet_acceptor_get_tunables();
+       if (rc != 0)
+               return rc;
+
+       init_completion(&lnet_acceptor_state.pta_signal);
+       rc = accept2secure(accept_type, &secure);
+       if (rc <= 0) {
+               fini_completion(&lnet_acceptor_state.pta_signal);
+               return rc;
+       }
+
+       if (lnet_count_acceptor_nis() == 0)  /* not required */
+               return 0;
+
+       rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+                                 (void *)(ulong_ptr_t)secure,
+                                 "acceptor_%03ld", secure));
+       if (IS_ERR_VALUE(rc2)) {
+               CERROR("Can't start acceptor thread: %ld\n", rc2);
+               fini_completion(&lnet_acceptor_state.pta_signal);
+
+               return -ESRCH;
+       }
+
+       /* wait for acceptor to startup */
+       wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+       if (!lnet_acceptor_state.pta_shutdown) {
+               /* started OK */
+               LASSERT(lnet_acceptor_state.pta_sock != NULL);
+               return 0;
+       }
+
+       LASSERT(lnet_acceptor_state.pta_sock == NULL);
+       fini_completion(&lnet_acceptor_state.pta_signal);
+
+       return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+       if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+               return;
+
+       lnet_acceptor_state.pta_shutdown = 1;
+       libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+       /* block until acceptor signals exit */
+       wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+       fini_completion(&lnet_acceptor_state.pta_signal);
+}
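+
+/*
+ * Lifecycle sketch: lnet_acceptor_start() spawns lnet_acceptor() and blocks
+ * on pta_signal until the listener is bound (or has failed to bind); the
+ * thread reuses the same completion to signal its exit, which is what
+ * lnet_acceptor_stop() waits for after setting pta_shutdown and aborting
+ * the blocking accept.
+ */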
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644 (file)
index 0000000..695b272
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644 (file)
index 0000000..e88bee3
--- /dev/null
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t      the_lnet;                     /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+               "LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+               "local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+               "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+               "size of remote network hash table");
+
+char *
+lnet_get_routes(void)
+{
+       return routes;
+}
+
+char *
+lnet_get_networks(void)
+{
+       char   *nets;
+       int     rc;
+
+       if (*networks != 0 && *ip2nets != 0) {
+               LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+                                  "'ip2nets' but not both at once\n");
+               return NULL;
+       }
+
+       if (*ip2nets != 0) {
+               rc = lnet_parse_ip2nets(&nets, ip2nets);
+               return (rc == 0) ? nets : NULL;
+       }
+
+       if (*networks != 0)
+               return networks;
+
+       return "tcp";
+}
+
+void
+lnet_init_locks(void)
+{
+       spin_lock_init(&the_lnet.ln_eq_wait_lock);
+       init_waitqueue_head(&the_lnet.ln_eq_waitq);
+       mutex_init(&the_lnet.ln_lnd_mutex);
+       mutex_init(&the_lnet.ln_api_mutex);
+}
+
+void
+lnet_fini_locks(void)
+{
+}
+
+static int
+lnet_create_remote_nets_table(void)
+{
+       int             i;
+       struct list_head        *hash;
+
+       LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+       LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+       LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+       if (hash == NULL) {
+               CERROR("Failed to create remote nets hash table\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&hash[i]);
+       the_lnet.ln_remote_nets_hash = hash;
+       return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+       int             i;
+       struct list_head        *hash;
+
+       if (the_lnet.ln_remote_nets_hash == NULL)
+               return;
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+               LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+       LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+                   LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+       the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+       if (the_lnet.ln_res_lock != NULL) {
+               cfs_percpt_lock_free(the_lnet.ln_res_lock);
+               the_lnet.ln_res_lock = NULL;
+       }
+
+       if (the_lnet.ln_net_lock != NULL) {
+               cfs_percpt_lock_free(the_lnet.ln_net_lock);
+               the_lnet.ln_net_lock = NULL;
+       }
+
+       lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+       lnet_init_locks();
+
+       the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+       if (the_lnet.ln_res_lock == NULL)
+               goto failed;
+
+       the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+       if (the_lnet.ln_net_lock == NULL)
+               goto failed;
+
+       return 0;
+
+ failed:
+       lnet_destroy_locks();
+       return -ENOMEM;
+}
+
+void lnet_assert_wire_constants (void)
+{
+       /* Wire protocol assertions generated by 'wirecheck'
+        * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+        * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+        * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+       /* Constants... */
+       CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+       CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+       CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+       CLASSERT (LNET_MSG_ACK == 0);
+       CLASSERT (LNET_MSG_PUT == 1);
+       CLASSERT (LNET_MSG_GET == 2);
+       CLASSERT (LNET_MSG_REPLY == 3);
+       CLASSERT (LNET_MSG_HELLO == 4);
+
+       /* Checks for struct ptl_handle_wire_t */
+       CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+       CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+       CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+       CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+       CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+       /* Checks for struct lnet_magicversion_t */
+       CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+       CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+       CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+       /* Checks for struct lnet_hdr_t */
+       CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+       CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+       /* Ack */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+       /* Put */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+       /* Get */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+       /* Reply */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+       /* Hello */
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+       CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+       CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+       lnd_t         *lnd;
+       struct list_head         *tmp;
+
+       /* holding lnd mutex */
+       list_for_each (tmp, &the_lnet.ln_lnds) {
+               lnd = list_entry(tmp, lnd_t, lnd_list);
+
+               if ((int)lnd->lnd_type == type)
+                       return lnd;
+       }
+
+       return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+       LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+       list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+       lnd->lnd_refcount = 0;
+
+       CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd (lnd_t *lnd)
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+       LASSERT (lnd->lnd_refcount == 0);
+
+       list_del (&lnd->lnd_list);
+       CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+       lnet_counters_t *ctr;
+       int             i;
+
+       memset(counters, 0, sizeof(*counters));
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+               counters->msgs_max     += ctr->msgs_max;
+               counters->msgs_alloc   += ctr->msgs_alloc;
+               counters->errors       += ctr->errors;
+               counters->send_count   += ctr->send_count;
+               counters->recv_count   += ctr->recv_count;
+               counters->route_count  += ctr->route_count;
+               counters->drop_count   += ctr->drop_count;
+               counters->send_length  += ctr->send_length;
+               counters->recv_length  += ctr->recv_length;
+               counters->route_length += ctr->route_length;
+               counters->drop_length  += ctr->drop_length;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+       lnet_counters_t *counters;
+       int             i;
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+               memset(counters, 0, sizeof(lnet_counters_t));
+
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+       char *space;
+
+       LASSERT (n > 0);
+
+       size += offsetof (lnet_freeobj_t, fo_contents);
+
+       LIBCFS_ALLOC(space, n * size);
+       if (space == NULL)
+               return (-ENOMEM);
+
+       INIT_LIST_HEAD (&fl->fl_list);
+       fl->fl_objs = space;
+       fl->fl_nobjs = n;
+       fl->fl_objsize = size;
+
+       do {
+               memset (space, 0, size);
+               list_add ((struct list_head *)space, &fl->fl_list);
+               space += size;
+       } while (--n != 0);
+
+       return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+       struct list_head       *el;
+       int            count;
+
+       if (fl->fl_nobjs == 0)
+               return;
+
+       count = 0;
+       for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+               count++;
+
+       LASSERT (count == fl->fl_nobjs);
+
+       LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+       memset (fl, 0, sizeof (*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+__u64
+lnet_create_interface_cookie (void)
+{
+       /* NB the interface cookie in wire handles guards against delayed
+        * replies and ACKs appearing valid after reboot. Initialisation time,
+        * even if it's only implemented to millisecond resolution, is
+        * probably easily good enough. */
+       struct timeval tv;
+       __u64     cookie;
+       do_gettimeofday(&tv);
+       cookie = tv.tv_sec;
+       cookie *= 1000000;
+       cookie += tv.tv_usec;
+       return cookie;
+}
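+
+/*
+ * A worked example of the arithmetic above: for
+ * tv = { .tv_sec = 1371495420, .tv_usec = 123456 } the cookie is
+ * 1371495420 * 1000000 + 123456 == 1371495420123456, i.e. microseconds
+ * since the epoch, so even reboots a millisecond apart get distinct
+ * cookies.
+ */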
+
+static char *
+lnet_res_type2str(int type)
+{
+       switch (type) {
+       default:
+               LBUG();
+       case LNET_COOKIE_TYPE_MD:
+               return "MD";
+       case LNET_COOKIE_TYPE_ME:
+               return "ME";
+       case LNET_COOKIE_TYPE_EQ:
+               return "EQ";
+       }
+}
+
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+       int     count = 0;
+
+       if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+               return;
+
+       while (!list_empty(&rec->rec_active)) {
+               struct list_head *e = rec->rec_active.next;
+
+               list_del_init(e);
+               if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+                       lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+               } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+                       lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+               } else { /* NB: Active MEs should be attached on portals */
+                       LBUG();
+               }
+               count++;
+       }
+
+       if (count > 0) {
+               /* Found live MDs/MEs/EQs; the user really should unlink/free
+                * all of them before finalizing LNet, but if someone didn't,
+                * we have to recycle the garbage for them */
+               CERROR("%d active elements on exit of %s container\n",
+                      count, lnet_res_type2str(rec->rec_type));
+       }
+
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_fini(&rec->rec_freelist);
+#endif
+       if (rec->rec_lh_hash != NULL) {
+               LIBCFS_FREE(rec->rec_lh_hash,
+                           LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+               rec->rec_lh_hash = NULL;
+       }
+
+       rec->rec_type = 0; /* mark it as finalized */
+}
+
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+                        int cpt, int type, int objnum, int objsz)
+{
+       int     rc = 0;
+       int     i;
+
+       LASSERT(rec->rec_type == 0);
+
+       rec->rec_type = type;
+       INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+       memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+       rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+       if (rc != 0)
+               goto out;
+#endif
+       rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+       /* Arbitrary choice of hash table size */
+       LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+                        LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+       if (rec->rec_lh_hash == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+       return 0;
+
+out:
+       CERROR("Failed to setup %s resource container\n",
+              lnet_res_type2str(type));
+       lnet_res_container_cleanup(rec);
+       return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+       struct lnet_res_container       *rec;
+       int                             i;
+
+       cfs_percpt_for_each(rec, i, recs)
+               lnet_res_container_cleanup(rec);
+
+       cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+       struct lnet_res_container       **recs;
+       struct lnet_res_container       *rec;
+       int                             rc;
+       int                             i;
+
+       recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+       if (recs == NULL) {
+               CERROR("Failed to allocate %s resource containers\n",
+                      lnet_res_type2str(type));
+               return NULL;
+       }
+
+       cfs_percpt_for_each(rec, i, recs) {
+               rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+               if (rc != 0) {
+                       lnet_res_containers_destroy(recs);
+                       return NULL;
+               }
+       }
+
+       return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+       /* ALWAYS called with lnet_res_lock held */
+       struct list_head                *head;
+       lnet_libhandle_t        *lh;
+       unsigned int            hash;
+
+       if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+               return NULL;
+
+       hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+       head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+       list_for_each_entry(lh, head, lh_hash_chain) {
+               if (lh->lh_cookie == cookie)
+                       return lh;
+       }
+
+       return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+       /* ALWAYS called with lnet_res_lock held */
+       unsigned int    ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+       unsigned int    hash;
+
+       lh->lh_cookie = rec->rec_lh_cookie;
+       rec->rec_lh_cookie += 1 << ibits;
+
+       hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+       list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
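+
+/*
+ * A sketch of the cookie layout implied above (bit widths are assumptions
+ * for illustration, e.g. LNET_COOKIE_TYPE_BITS == 2): the low type bits
+ * encode MD/ME/EQ, the next LNET_CPT_BITS encode the owning CPT, and
+ * everything above is a per-container sequence number bumped by 1 << ibits
+ * on each allocation, so a stale handle fails the lh_cookie comparison in
+ * lnet_res_lh_lookup() instead of aliasing a recycled object.
+ */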
+
+int lnet_unprepare(void);
+
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+       /* Prepare to bring up the network */
+       struct lnet_res_container **recs;
+       int                       rc = 0;
+
+       LASSERT (the_lnet.ln_refcount == 0);
+
+       the_lnet.ln_routing = 0;
+
+       LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+       the_lnet.ln_pid = requested_pid;
+
+       INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+       INIT_LIST_HEAD(&the_lnet.ln_nis);
+       INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+       INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+       INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+       rc = lnet_create_remote_nets_table();
+       if (rc != 0)
+               goto failed;
+
+       the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+       the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+                                               sizeof(lnet_counters_t));
+       if (the_lnet.ln_counters == NULL) {
+               CERROR("Failed to allocate counters for LNet\n");
+               rc = -ENOMEM;
+               goto failed;
+       }
+
+       rc = lnet_peer_tables_create();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_msg_containers_create();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+                                     LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+                                     sizeof(lnet_eq_t));
+       if (rc != 0)
+               goto failed;
+
+       recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+                                         sizeof(lnet_me_t));
+       if (recs == NULL)
+               goto failed;
+
+       the_lnet.ln_me_containers = recs;
+
+       recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+                                         sizeof(lnet_libmd_t));
+       if (recs == NULL)
+               goto failed;
+
+       the_lnet.ln_md_containers = recs;
+
+       rc = lnet_portals_create();
+       if (rc != 0) {
+               CERROR("Failed to create portals for LNet: %d\n", rc);
+               goto failed;
+       }
+
+       return 0;
+
+ failed:
+       lnet_unprepare();
+       return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+       /* NB no LNET_LOCK since this is the last reference.  All LND instances
+        * have shut down already, so it is safe to unlink and free all
+        * descriptors, even those that appear committed to a network op (eg MD
+        * with non-zero pending count) */
+
+       lnet_fail_nid(LNET_NID_ANY, 0);
+
+       LASSERT(the_lnet.ln_refcount == 0);
+       LASSERT(list_empty(&the_lnet.ln_test_peers));
+       LASSERT(list_empty(&the_lnet.ln_nis));
+       LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+       lnet_portals_destroy();
+
+       if (the_lnet.ln_md_containers != NULL) {
+               lnet_res_containers_destroy(the_lnet.ln_md_containers);
+               the_lnet.ln_md_containers = NULL;
+       }
+
+       if (the_lnet.ln_me_containers != NULL) {
+               lnet_res_containers_destroy(the_lnet.ln_me_containers);
+               the_lnet.ln_me_containers = NULL;
+       }
+
+       lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+       lnet_msg_containers_destroy();
+       lnet_peer_tables_destroy();
+       lnet_rtrpools_free();
+
+       if (the_lnet.ln_counters != NULL) {
+               cfs_percpt_free(the_lnet.ln_counters);
+               the_lnet.ln_counters = NULL;
+       }
+       lnet_destroy_remote_nets_table();
+
+       return 0;
+}
+
+lnet_ni_t  *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+       struct list_head        *tmp;
+       lnet_ni_t       *ni;
+
+       LASSERT(cpt != LNET_LOCK_EX);
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (LNET_NIDNET(ni->ni_nid) == net) {
+                       lnet_ni_addref_locked(ni, cpt);
+                       return ni;
+               }
+       }
+
+       return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+       lnet_ni_t *ni;
+
+       lnet_net_lock(0);
+       ni = lnet_net2ni_locked(net, 0);
+       lnet_net_unlock(0);
+
+       return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+       __u64           key = nid;
+       unsigned int    val;
+
+       LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+       if (number == 1)
+               return 0;
+
+       val = cfs_hash_long(key, LNET_CPT_BITS);
+       /* NB: LNET_CPT_NUMBER doesn't have to be a power of 2 */
+       if (val < number)
+               return val;
+
+       return (unsigned int)(key + val + (val >> 1)) % number;
+}
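+
+/*
+ * A worked example of the non-power-of-2 fallback above (assumed values):
+ * with number == 3, any val >= 3 from cfs_hash_long() is folded as
+ * (key + val + (val >> 1)) % 3, so NIDs still spread across all three
+ * CPTs even though 3 is not a power of 2.
+ */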
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+       struct lnet_ni *ni;
+
+       /* must be called while holding lnet_net_lock */
+       if (LNET_CPT_NUMBER == 1)
+               return 0; /* the only one */
+
+       /* taking lnet_net_lock(any) would be OK */
+       if (!list_empty(&the_lnet.ln_nis_cpt)) {
+               list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+                       if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+                               continue;
+
+                       LASSERT(ni->ni_cpts != NULL);
+                       return ni->ni_cpts[lnet_nid_cpt_hash
+                                          (nid, ni->ni_ncpts)];
+               }
+       }
+
+       return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+       int     cpt;
+       int     cpt2;
+
+       if (LNET_CPT_NUMBER == 1)
+               return 0; /* the only one */
+
+       if (list_empty(&the_lnet.ln_nis_cpt))
+               return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+       cpt = lnet_net_lock_current();
+       cpt2 = lnet_cpt_of_nid_locked(nid);
+       lnet_net_unlock(cpt);
+
+       return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+
+       ni = lnet_net2ni_locked(net, cpt);
+       if (ni != NULL)
+               lnet_ni_decref_locked(ni, cpt);
+
+       lnet_net_unlock(cpt);
+
+       return ni != NULL;
+}
+
+lnet_ni_t  *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+       struct lnet_ni  *ni;
+       struct list_head        *tmp;
+
+       LASSERT(cpt != LNET_LOCK_EX);
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (ni->ni_nid == nid) {
+                       lnet_ni_addref_locked(ni, cpt);
+                       return ni;
+               }
+       }
+
+       return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+       ni = lnet_nid2ni_locked(nid, cpt);
+       if (ni != NULL)
+               lnet_ni_decref_locked(ni, cpt);
+       lnet_net_unlock(cpt);
+
+       return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis(void)
+{
+       /* Return the # of NIs that need the acceptor. */
+       int             count = 0;
+       struct list_head        *tmp;
+       struct lnet_ni  *ni;
+       int             cpt;
+
+       cpt = lnet_net_lock_current();
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (ni->ni_lnd->lnd_accept != NULL)
+                       count++;
+       }
+
+       lnet_net_unlock(cpt);
+
+       return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+       int     credits;
+
+       LASSERT(ni->ni_ncpts >= 1);
+
+       if (ni->ni_ncpts == 1)
+               return ni->ni_maxtxcredits;
+
+       credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+       credits = max(credits, 8 * ni->ni_peertxcredits);
+       credits = min(credits, ni->ni_maxtxcredits);
+
+       return credits;
+}
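+
+/*
+ * For example (hypothetical numbers): an NI with ni_maxtxcredits == 256,
+ * ni_peertxcredits == 8 and ni_ncpts == 4 starts from 256 / 4 == 64
+ * credits per queue, is raised to at least 8 * 8 == 64 and capped at
+ * 256, so each of the 4 TX queues ends up with 64 credits.
+ */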
+
+void
+lnet_shutdown_lndnis(void)
+{
+       int             i;
+       int             islo;
+       lnet_ni_t        *ni;
+
+       /* NB called holding the global mutex */
+
+       /* All quiet on the API front */
+       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_refcount == 0);
+       LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_shutdown = 1;       /* flag shutdown */
+
+       /* Unlink NIs from the global table */
+       while (!list_empty(&the_lnet.ln_nis)) {
+               ni = list_entry(the_lnet.ln_nis.next,
+                                   lnet_ni_t, ni_list);
+               /* move it to the zombie list so nobody can find it anymore */
+               list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+               lnet_ni_decref_locked(ni, 0);   /* drop ln_nis' ref */
+
+               if (!list_empty(&ni->ni_cptlist)) {
+                       list_del_init(&ni->ni_cptlist);
+                       lnet_ni_decref_locked(ni, 0);
+               }
+       }
+
+       /* Drop the cached eqwait NI. */
+       if (the_lnet.ln_eq_waitni != NULL) {
+               lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+               the_lnet.ln_eq_waitni = NULL;
+       }
+
+       /* Drop the cached loopback NI. */
+       if (the_lnet.ln_loni != NULL) {
+               lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+               the_lnet.ln_loni = NULL;
+       }
+
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       /* Clear lazy portals and drop delayed messages which hold refs
+        * on their lnet_msg_t::msg_rxpeer */
+       for (i = 0; i < the_lnet.ln_nportals; i++)
+               LNetClearLazyPortal(i);
+
+       /* Clear the peer table and wait for all peers to go (they hold refs on
+        * their NIs) */
+       lnet_peer_tables_cleanup();
+
+       lnet_net_lock(LNET_LOCK_EX);
+       /* Now wait for the NIs I just nuked to show up on ln_nis_zombie
+        * and shut them down in guaranteed thread context */
+       i = 2;
+       while (!list_empty(&the_lnet.ln_nis_zombie)) {
+               int     *ref;
+               int     j;
+
+               ni = list_entry(the_lnet.ln_nis_zombie.next,
+                                   lnet_ni_t, ni_list);
+               list_del_init(&ni->ni_list);
+               cfs_percpt_for_each(ref, j, ni->ni_refs) {
+                       if (*ref == 0)
+                               continue;
+                       /* still busy, add it back to zombie list */
+                       list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+                       break;
+               }
+
+               while (!list_empty(&ni->ni_list)) {
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       ++i;
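+                       /* (i & (-i)) == i only when i is a power of 2,
+                        * i.e. warn with exponential backoff */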
+                       if ((i & (-i)) == i) {
+                               CDEBUG(D_WARNING,
+                                      "Waiting for zombie LNI %s\n",
+                                      libcfs_nid2str(ni->ni_nid));
+                       }
+                       cfs_pause(cfs_time_seconds(1));
+                       lnet_net_lock(LNET_LOCK_EX);
+                       continue;
+               }
+
+               ni->ni_lnd->lnd_refcount--;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               islo = ni->ni_lnd->lnd_type == LOLND;
+
+               LASSERT (!in_interrupt ());
+               (ni->ni_lnd->lnd_shutdown)(ni);
+
+               /* can't deref lnd anymore now; it might have unregistered
+                * itself...  */
+
+               if (!islo)
+                       CDEBUG(D_LNI, "Removed LNI %s\n",
+                              libcfs_nid2str(ni->ni_nid));
+
+               lnet_ni_free(ni);
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       the_lnet.ln_shutdown = 0;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       if (the_lnet.ln_network_tokens != NULL) {
+               LIBCFS_FREE(the_lnet.ln_network_tokens,
+                           the_lnet.ln_network_tokens_nob);
+               the_lnet.ln_network_tokens = NULL;
+       }
+}
+
+int
+lnet_startup_lndnis(void)
+{
+       lnd_t                   *lnd;
+       struct lnet_ni          *ni;
+       struct lnet_tx_queue    *tq;
+       struct list_head                nilist;
+       int                     i;
+       int             rc = 0;
+       int             lnd_type;
+       int             nicount = 0;
+       char          *nets = lnet_get_networks();
+
+       INIT_LIST_HEAD(&nilist);
+
+       if (nets == NULL)
+               goto failed;
+
+       rc = lnet_parse_networks(&nilist, nets);
+       if (rc != 0)
+               goto failed;
+
+       while (!list_empty(&nilist)) {
+               ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+               lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+               LASSERT (libcfs_isknown_lnd(lnd_type));
+
+               if (lnd_type == CIBLND    ||
+                   lnd_type == OPENIBLND ||
+                   lnd_type == IIBLND    ||
+                   lnd_type == VIBLND) {
+                       CERROR("LND %s obsoleted\n",
+                              libcfs_lnd2str(lnd_type));
+                       goto failed;
+               }
+
+               LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+               lnd = lnet_find_lnd_by_type(lnd_type);
+
+               if (lnd == NULL) {
+                       LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+                       rc = request_module("%s",
+                                               libcfs_lnd2modname(lnd_type));
+                       LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+                       lnd = lnet_find_lnd_by_type(lnd_type);
+                       if (lnd == NULL) {
+                               LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+                               CERROR("Can't load LND %s, module %s, rc=%d\n",
+                                      libcfs_lnd2str(lnd_type),
+                                      libcfs_lnd2modname(lnd_type), rc);
+                               goto failed;
+                       }
+               }
+
+               lnet_net_lock(LNET_LOCK_EX);
+               lnd->lnd_refcount++;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               ni->ni_lnd = lnd;
+
+               rc = (lnd->lnd_startup)(ni);
+
+               LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+               if (rc != 0) {
+                       LCONSOLE_ERROR_MSG(0x105,
+                                          "Error %d starting up LNI %s\n",
+                                          rc, libcfs_lnd2str(lnd->lnd_type));
+                       lnet_net_lock(LNET_LOCK_EX);
+                       lnd->lnd_refcount--;
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       goto failed;
+               }
+
+               LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+               list_del(&ni->ni_list);
+
+               lnet_net_lock(LNET_LOCK_EX);
+               /* refcount for ln_nis */
+               lnet_ni_addref_locked(ni, 0);
+               list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+               if (ni->ni_cpts != NULL) {
+                       list_add_tail(&ni->ni_cptlist,
+                                         &the_lnet.ln_nis_cpt);
+                       lnet_ni_addref_locked(ni, 0);
+               }
+
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               if (lnd->lnd_type == LOLND) {
+                       lnet_ni_addref(ni);
+                       LASSERT (the_lnet.ln_loni == NULL);
+                       the_lnet.ln_loni = ni;
+                       continue;
+               }
+
+               if (ni->ni_peertxcredits == 0 ||
+                   ni->ni_maxtxcredits == 0) {
+                       LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+                                          libcfs_lnd2str(lnd->lnd_type),
+                                          ni->ni_peertxcredits == 0 ?
+                                          "" : "per-peer ");
+                       goto failed;
+               }
+
+               cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+                       tq->tq_credits_min =
+                       tq->tq_credits_max =
+                       tq->tq_credits = lnet_ni_tq_credits(ni);
+               }
+
+               CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+                      libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+                      lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+                      ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+               nicount++;
+       }
+
+       if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+               lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+               LCONSOLE_ERROR_MSG(0x109,
+                                  "LND %s can only run single-network\n",
+                                  libcfs_lnd2str(lnd_type));
+               goto failed;
+       }
+
+       return 0;
+
+ failed:
+       lnet_shutdown_lndnis();
+
+       while (!list_empty(&nilist)) {
+               ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+               list_del(&ni->ni_list);
+               lnet_ni_free(ni);
+       }
+
+       return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace programs need to call this function; it's automatically
+ * called in the kernel at module loading time. The caller has to call
+ * LNetFini() after a call to LNetInit(), if and only if the latter
+ * returned 0. It must be called exactly once.
+ *
+ * \return 0 on success, and a negative error code on failure.
+ */
+int
+LNetInit(void)
+{
+       int     rc;
+
+       lnet_assert_wire_constants();
+       LASSERT(!the_lnet.ln_init);
+
+       memset(&the_lnet, 0, sizeof(the_lnet));
+
+       /* refer to global cfs_cpt_table for now */
+       the_lnet.ln_cpt_table   = cfs_cpt_table;
+       the_lnet.ln_cpt_number  = cfs_cpt_number(cfs_cpt_table);
+
+       LASSERT(the_lnet.ln_cpt_number > 0);
+       if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+               /* we are at risk of consuming all lh_cookie values */
+               CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+                      "please change the CPT-table setting and retry\n",
+                      the_lnet.ln_cpt_number, LNET_CPT_MAX);
+               return -1;
+       }
+
+       while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+               the_lnet.ln_cpt_bits++;
+
+       rc = lnet_create_locks();
+       if (rc != 0) {
+               CERROR("Can't create LNet global locks: %d\n", rc);
+               return -1;
+       }
+
+       the_lnet.ln_refcount = 0;
+       the_lnet.ln_init = 1;
+       LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+       INIT_LIST_HEAD(&the_lnet.ln_lnds);
+       INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+       INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+       /* The hash table size is the number of bits it takes to express the
+        * set ln_num_routes, minus 1 (better to underestimate than
+        * overestimate so we don't waste memory). */
+       if (rnet_htable_size <= 0)
+               rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+       else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+               rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+       the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+                                          order_base_2(rnet_htable_size) - 1);
+
+       /* All LNDs apart from the LOLND are in separate modules.  They
+        * register themselves when their module loads, and unregister
+        * themselves when their module is unloaded. */
+       lnet_register_lnd(&the_lolnd);
+       return 0;
+}
+EXPORT_SYMBOL(LNetInit);
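+
+/*
+ * Typical lifecycle for a userspace caller (sketch only, error handling
+ * elided and the requested PID is hypothetical):
+ *
+ *     if (LNetInit() == 0) {
+ *             if (LNetNIInit(12345) >= 0) {
+ *                     ... use LNet ...
+ *                     LNetNIFini();
+ *             }
+ *             LNetFini();
+ *     }
+ */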
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace programs need to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() returned success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount == 0);
+
+       while (!list_empty(&the_lnet.ln_lnds))
+               lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+                                                  lnd_t, lnd_list));
+       lnet_destroy_locks();
+
+       the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace programs should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and a < 0 error code on failure.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+       int      im_a_router = 0;
+       int      rc;
+
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+       if (the_lnet.ln_refcount > 0) {
+               rc = the_lnet.ln_refcount++;
+               goto out;
+       }
+
+       lnet_get_tunables();
+
+       if (requested_pid == LNET_PID_ANY) {
+               /* Don't instantiate LNET just for me */
+               rc = -ENETDOWN;
+               goto failed0;
+       }
+
+       rc = lnet_prepare(requested_pid);
+       if (rc != 0)
+               goto failed0;
+
+       rc = lnet_startup_lndnis();
+       if (rc != 0)
+               goto failed1;
+
+       rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_check_routes();
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_rtrpools_alloc(im_a_router);
+       if (rc != 0)
+               goto failed2;
+
+       rc = lnet_acceptor_start();
+       if (rc != 0)
+               goto failed2;
+
+       the_lnet.ln_refcount = 1;
+       /* Now I may use my own API functions... */
+
+       /* NB router checker needs the_lnet.ln_ping_info in
+        * lnet_router_checker -> lnet_update_ni_status_locked */
+       rc = lnet_ping_target_init();
+       if (rc != 0)
+               goto failed3;
+
+       rc = lnet_router_checker_start();
+       if (rc != 0)
+               goto failed4;
+
+       lnet_proc_init();
+       goto out;
+
+ failed4:
+       lnet_ping_target_fini();
+ failed3:
+       the_lnet.ln_refcount = 0;
+       lnet_acceptor_stop();
+ failed2:
+       lnet_destroy_routes();
+       lnet_shutdown_lndnis();
+ failed1:
+       lnet_unprepare();
+ failed0:
+       LASSERT (rc < 0);
+ out:
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+       return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return Always 0 in the current implementation.
+ */
+int
+LNetNIFini(void)
+{
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (the_lnet.ln_refcount != 1) {
+               the_lnet.ln_refcount--;
+       } else {
+               LASSERT (!the_lnet.ln_niinit_self);
+
+               lnet_proc_fini();
+               lnet_router_checker_stop();
+               lnet_ping_target_fini();
+
+               /* Teardown fns that use my own API functions BEFORE here */
+               the_lnet.ln_refcount = 0;
+
+               lnet_acceptor_stop();
+               lnet_destroy_routes();
+               lnet_shutdown_lndnis();
+               lnet_unprepare();
+       }
+
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated; don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to the system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, the process ID of the peer.
+ *
+ * \return Always 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+       lnet_process_id_t        id = {0};
+       lnet_ni_t               *ni;
+       int                    rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_NI:
+               rc = LNetGetId(data->ioc_count, &id);
+               data->ioc_nid = id.nid;
+               return rc;
+
+       case IOC_LIBCFS_FAIL_NID:
+               return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+       case IOC_LIBCFS_ADD_ROUTE:
+               rc = lnet_add_route(data->ioc_net, data->ioc_count,
+                                   data->ioc_nid);
+               return (rc != 0) ? rc : lnet_check_routes();
+
+       case IOC_LIBCFS_DEL_ROUTE:
+               return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+       case IOC_LIBCFS_GET_ROUTE:
+               return lnet_get_route(data->ioc_count,
+                                     &data->ioc_net, &data->ioc_count,
+                                     &data->ioc_nid, &data->ioc_flags);
+       case IOC_LIBCFS_NOTIFY_ROUTER:
+               return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+                                  cfs_time_current() -
+                                  cfs_time_seconds(cfs_time_current_sec() -
+                                                   (time_t)data->ioc_u64[0]));
+
+       case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+               /* This can be removed once lustre stops calling it */
+               return 0;
+
+       case IOC_LIBCFS_LNET_DIST:
+               rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+               if (rc < 0 && rc != -EHOSTUNREACH)
+                       return rc;
+
+               data->ioc_u32[0] = rc;
+               return 0;
+
+       case IOC_LIBCFS_TESTPROTOCOMPAT:
+               lnet_net_lock(LNET_LOCK_EX);
+               the_lnet.ln_testprotocompat = data->ioc_flags;
+               lnet_net_unlock(LNET_LOCK_EX);
+               return 0;
+
+       case IOC_LIBCFS_PING:
+               id.nid = data->ioc_nid;
+               id.pid = data->ioc_u32[0];
+               rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+                              (lnet_process_id_t *)data->ioc_pbuf1,
+                              data->ioc_plen1/sizeof(lnet_process_id_t));
+               if (rc < 0)
+                       return rc;
+               data->ioc_count = rc;
+               return 0;
+
+       case IOC_LIBCFS_DEBUG_PEER: {
+               /* CAVEAT EMPTOR: this one is designed to be called
+                * directly, not via an ioctl */
+               id = *((lnet_process_id_t *) arg);
+
+               lnet_debug_peer(id.nid);
+
+               ni = lnet_net2ni(LNET_NIDNET(id.nid));
+               if (ni == NULL) {
+                       CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+               } else {
+                       if (ni->ni_lnd->lnd_ctl == NULL) {
+                               CDEBUG(D_WARNING, "No ctl for %s\n",
+                                      libcfs_id2str(id));
+                       } else {
+                               (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+                       }
+
+                       lnet_ni_decref(ni);
+               }
+               return 0;
+       }
+
+       default:
+               ni = lnet_net2ni(data->ioc_net);
+               if (ni == NULL)
+                       return -EINVAL;
+
+               if (ni->ni_lnd->lnd_ctl == NULL)
+                       rc = -EINVAL;
+               else
+                       rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+               lnet_ni_decref(ni);
+               return rc;
+       }
+       /* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
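+
+/*
+ * Sketch of the direct-call IOC_LIBCFS_DEBUG_PEER usage described above
+ * (the NID string is hypothetical):
+ *
+ *     lnet_process_id_t id;
+ *
+ *     id.nid = libcfs_str2nid("192.168.0.7@tcp");
+ *     id.pid = LNET_PID_ANY;
+ *     (void)LNetCtl(IOC_LIBCFS_DEBUG_PEER, &id);
+ */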
+
+/**
+ * Retrieve the lnet_process_id_t ID of the LNet interface at \a index. Note
+ * that all interfaces share the same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+       struct lnet_ni  *ni;
+       struct list_head        *tmp;
+       int             cpt;
+       int             rc = -ENOENT;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_net_lock_current();
+
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               if (index-- != 0)
+                       continue;
+
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               id->nid = ni->ni_nid;
+               id->pid = the_lnet.ln_pid;
+               rc = 0;
+               break;
+       }
+
+       lnet_net_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
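+
+/*
+ * Enumeration sketch (illustrative only): callers walk the interface
+ * table until -ENOENT, as lnet_create_ping_info() below does:
+ *
+ *     lnet_process_id_t id;
+ *     int i;
+ *
+ *     for (i = 0; LNetGetId(i, &id) == 0; i++)
+ *             CDEBUG(D_NET, "NI[%d] = %s\n", i, libcfs_id2str(id));
+ */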
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+       snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+       int            i;
+       int            n;
+       int            rc;
+       unsigned int      infosz;
+       lnet_ni_t       *ni;
+       lnet_process_id_t id;
+       lnet_ping_info_t *pinfo;
+
+       for (n = 0; ; n++) {
+               rc = LNetGetId(n, &id);
+               if (rc == -ENOENT)
+                       break;
+
+               LASSERT (rc == 0);
+       }
+
+       infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+       LIBCFS_ALLOC(pinfo, infosz);
+       if (pinfo == NULL) {
+               CERROR("Can't allocate ping info[%d]\n", n);
+               return -ENOMEM;
+       }
+
+       pinfo->pi_nnis    = n;
+       pinfo->pi_pid     = the_lnet.ln_pid;
+       pinfo->pi_magic   = LNET_PROTO_PING_MAGIC;
+       pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+       for (i = 0; i < n; i++) {
+               lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+               rc = LNetGetId(i, &id);
+               LASSERT (rc == 0);
+
+               ns->ns_nid    = id.nid;
+               ns->ns_status = LNET_NI_STATUS_UP;
+
+               lnet_net_lock(0);
+
+               ni = lnet_nid2ni_locked(id.nid, 0);
+               LASSERT(ni != NULL);
+
+               lnet_ni_lock(ni);
+               LASSERT(ni->ni_status == NULL);
+               ni->ni_status = ns;
+               lnet_ni_unlock(ni);
+
+               lnet_ni_decref_locked(ni, 0);
+               lnet_net_unlock(0);
+       }
+
+       the_lnet.ln_ping_info = pinfo;
+       return 0;
+}
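+
+/*
+ * Sizing note: because pi_ni[] is a flexible array,
+ * offsetof(lnet_ping_info_t, pi_ni[n]) is exactly the number of bytes
+ * needed for a ping info block describing n NIs; lnet_destroy_ping_info()
+ * below frees the block with the same expression.
+ */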
+
+static void
+lnet_destroy_ping_info(void)
+{
+       struct lnet_ni  *ni;
+
+       lnet_net_lock(0);
+
+       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+               lnet_ni_lock(ni);
+               ni->ni_status = NULL;
+               lnet_ni_unlock(ni);
+       }
+
+       lnet_net_unlock(0);
+
+       LIBCFS_FREE(the_lnet.ln_ping_info,
+                   offsetof(lnet_ping_info_t,
+                            pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+       the_lnet.ln_ping_info = NULL;
+}
+
+int
+lnet_ping_target_init(void)
+{
+       lnet_md_t        md = {0};
+       lnet_handle_me_t  meh;
+       lnet_process_id_t id;
+       int            rc;
+       int            rc2;
+       int            infosz;
+
+       rc = lnet_create_ping_info();
+       if (rc != 0)
+               return rc;
+
+       /* We can have a tiny EQ since we only need to see the unlink event on
+        * teardown, which by definition is the last one! */
+       rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+       if (rc != 0) {
+               CERROR("Can't allocate ping EQ: %d\n", rc);
+               goto failed_0;
+       }
+
+       memset(&id, 0, sizeof(lnet_process_id_t));
+       id.nid = LNET_NID_ANY;
+       id.pid = LNET_PID_ANY;
+
+       rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+                         LNET_PROTO_PING_MATCHBITS, 0,
+                         LNET_UNLINK, LNET_INS_AFTER,
+                         &meh);
+       if (rc != 0) {
+               CERROR("Can't create ping ME: %d\n", rc);
+               goto failed_1;
+       }
+
+       /* initialize md content */
+       infosz = offsetof(lnet_ping_info_t,
+                         pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+       md.start     = the_lnet.ln_ping_info;
+       md.length    = infosz;
+       md.threshold = LNET_MD_THRESH_INF;
+       md.max_size  = 0;
+       md.options   = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+                      LNET_MD_MANAGE_REMOTE;
+       md.user_ptr  = NULL;
+       md.eq_handle = the_lnet.ln_ping_target_eq;
+
+       rc = LNetMDAttach(meh, md,
+                         LNET_RETAIN,
+                         &the_lnet.ln_ping_target_md);
+       if (rc != 0) {
+               CERROR("Can't attach ping MD: %d\n", rc);
+               goto failed_2;
+       }
+
+       return 0;
+
+ failed_2:
+       rc2 = LNetMEUnlink(meh);
+       LASSERT (rc2 == 0);
+ failed_1:
+       rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+       LASSERT (rc2 == 0);
+ failed_0:
+       lnet_destroy_ping_info();
+       return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+       lnet_event_t    event;
+       int          rc;
+       int          which;
+       int          timeout_ms = 1000;
+       sigset_t    blocked = cfs_block_allsigs();
+
+       LNetMDUnlink(the_lnet.ln_ping_target_md);
+       /* NB md could be busy; this just starts the unlink */
+
+       for (;;) {
+               rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+                               timeout_ms, &event, &which);
+
+               /* I expect overflow... */
+               LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+               if (rc == 0) {
+                       /* timed out: provide a diagnostic */
+                       CWARN("Still waiting for ping MD to unlink\n");
+                       timeout_ms *= 2;
+                       continue;
+               }
+
+               /* Got a valid event */
+               if (event.unlinked)
+                       break;
+       }
+
+       rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+       LASSERT (rc == 0);
+       lnet_destroy_ping_info();
+       cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping(lnet_process_id_t id, int timeout_ms,
+         lnet_process_id_t *ids, int n_ids)
+{
+       lnet_handle_eq_t     eqh;
+       lnet_handle_md_t     mdh;
+       lnet_event_t     event;
+       lnet_md_t           md = {0};
+       int               which;
+       int               unlinked = 0;
+       int               replied = 0;
+       const int           a_long_time = 60000; /* ms */
+       int               infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+       lnet_ping_info_t    *info;
+       lnet_process_id_t    tmpid;
+       int               i;
+       int               nob;
+       int               rc;
+       int               rc2;
+       sigset_t         blocked;
+
+       if (n_ids <= 0 ||
+           id.nid == LNET_NID_ANY ||
+           timeout_ms > 500000 ||            /* arbitrary limit! */
+           n_ids > 20)                  /* arbitrary limit! */
+               return -EINVAL;
+
+       if (id.pid == LNET_PID_ANY)
+               id.pid = LUSTRE_SRV_LNET_PID;
+
+       LIBCFS_ALLOC(info, infosz);
+       if (info == NULL)
+               return -ENOMEM;
+
+       /* NB 2 events max (including any unlink event) */
+       rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+       if (rc != 0) {
+               CERROR("Can't allocate EQ: %d\n", rc);
+               goto out_0;
+       }
+
+       /* initialize md content */
+       md.start     = info;
+       md.length    = infosz;
+       md.threshold = 2; /*GET/REPLY*/
+       md.max_size  = 0;
+       md.options   = LNET_MD_TRUNCATE;
+       md.user_ptr  = NULL;
+       md.eq_handle = eqh;
+
+       rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+       if (rc != 0) {
+               CERROR("Can't bind MD: %d\n", rc);
+               goto out_1;
+       }
+
+       rc = LNetGet(LNET_NID_ANY, mdh, id,
+                    LNET_RESERVED_PORTAL,
+                    LNET_PROTO_PING_MATCHBITS, 0);
+
+       if (rc != 0) {
+               /* Don't CERROR; this could be deliberate! */
+
+               rc2 = LNetMDUnlink(mdh);
+               LASSERT (rc2 == 0);
+
+               /* NB must wait for the UNLINK event below... */
+               unlinked = 1;
+               timeout_ms = a_long_time;
+       }
+
+       do {
+               /* MUST block for unlink to complete */
+               if (unlinked)
+                       blocked = cfs_block_allsigs();
+
+               rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+               if (unlinked)
+                       cfs_restore_sigs(blocked);
+
+               CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+                      (rc2 <= 0) ? -1 : event.type,
+                      (rc2 <= 0) ? -1 : event.status,
+                      (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+               LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+               if (rc2 <= 0 || event.status != 0) {
+                       /* timeout or error */
+                       if (!replied && rc == 0)
+                               rc = (rc2 < 0) ? rc2 :
+                                    (rc2 == 0) ? -ETIMEDOUT :
+                                    event.status;
+
+                       if (!unlinked) {
+                               /* Ensure completion in finite time... */
+                               LNetMDUnlink(mdh);
+                               /* No assertion (racing with network) */
+                               unlinked = 1;
+                               timeout_ms = a_long_time;
+                       } else if (rc2 == 0) {
+                               /* timed out waiting for unlink */
+                               CWARN("ping %s: late network completion\n",
+                                     libcfs_id2str(id));
+                       }
+               } else if (event.type == LNET_EVENT_REPLY) {
+                       replied = 1;
+                       rc = event.mlength;
+               }
+
+       } while (rc2 <= 0 || !event.unlinked);
+
+       if (!replied) {
+               if (rc >= 0)
+                       CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+                             libcfs_id2str(id));
+               rc = -EIO;
+               goto out_1;
+       }
+
+       nob = rc;
+       LASSERT (nob >= 0 && nob <= infosz);
+
+       rc = -EPROTO;                      /* if I can't parse... */
+
+       if (nob < 8) {
+               /* can't check magic/version */
+               CERROR("%s: ping info too short %d\n",
+                      libcfs_id2str(id), nob);
+               goto out_1;
+       }
+
+       if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+               lnet_swap_pinginfo(info);
+       } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+               CERROR("%s: Unexpected magic %08x\n",
+                      libcfs_id2str(id), info->pi_magic);
+               goto out_1;
+       }
+
+       if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+               CERROR("%s: ping w/o NI status: 0x%x\n",
+                      libcfs_id2str(id), info->pi_features);
+               goto out_1;
+       }
+
+       if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+               CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+                      nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+               goto out_1;
+       }
+
+       if (info->pi_nnis < n_ids)
+               n_ids = info->pi_nnis;
+
+       if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+               CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+                      nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+               goto out_1;
+       }
+
+       rc = -EFAULT;                      /* If I SEGV... */
+
+       for (i = 0; i < n_ids; i++) {
+               tmpid.pid = info->pi_pid;
+               tmpid.nid = info->pi_ni[i].ns_nid;
+               if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+                       goto out_1;
+       }
+       rc = info->pi_nnis;
+
+ out_1:
+       rc2 = LNetEQFree(eqh);
+       if (rc2 != 0)
+               CERROR("rc2 %d\n", rc2);
+       LASSERT (rc2 == 0);
+
+ out_0:
+       LIBCFS_FREE(info, infosz);
+       return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644 (file)
index 0000000..28711e6
--- /dev/null
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct {                           /* tmp struct for parsing routes */
+       struct list_head         ltb_list;      /* stash on lists */
+       int             ltb_size;       /* allocated size */
+       char           ltb_text[0];     /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob;                         /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)      /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+       static char dots[LNET_SINGLE_TEXTBUF_NOB];
+       static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+       memset(dots, '.', sizeof(dots));
+       dots[sizeof(dots)-1] = 0;
+       memset(dashes, '-', sizeof(dashes));
+       dashes[sizeof(dashes)-1] = 0;
+
+       LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+       LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+                          (int)strlen(name), dots, offset, dots,
+                           (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep(char c)
+{
+       switch (c) {
+       case '\n':
+       case '\r':
+       case ';':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+       struct list_head       *tmp;
+       lnet_ni_t       *ni;
+
+       list_for_each (tmp, nilist) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+               if (LNET_NIDNET(ni->ni_nid) == net)
+                       return 0;
+       }
+
+       return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+       if (ni->ni_refs != NULL)
+               cfs_percpt_free(ni->ni_refs);
+
+       if (ni->ni_tx_queues != NULL)
+               cfs_percpt_free(ni->ni_tx_queues);
+
+       if (ni->ni_cpts != NULL)
+               cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+       LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+       struct lnet_tx_queue    *tq;
+       struct lnet_ni          *ni;
+       int                     rc;
+       int                     i;
+
+       if (!lnet_net_unique(net, nilist)) {
+               LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+                                  libcfs_net2str(net));
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(ni, sizeof(*ni));
+       if (ni == NULL) {
+               CERROR("Out of memory creating network %s\n",
+                      libcfs_net2str(net));
+               return NULL;
+       }
+
+       spin_lock_init(&ni->ni_lock);
+       INIT_LIST_HEAD(&ni->ni_cptlist);
+       ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+                                      sizeof(*ni->ni_refs[0]));
+       if (ni->ni_refs == NULL)
+               goto failed;
+
+       ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(*ni->ni_tx_queues[0]));
+       if (ni->ni_tx_queues == NULL)
+               goto failed;
+
+       cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+               INIT_LIST_HEAD(&tq->tq_delayed);
+
+       if (el == NULL) {
+               ni->ni_cpts  = NULL;
+               ni->ni_ncpts = LNET_CPT_NUMBER;
+       } else {
+               rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+               if (rc <= 0) {
+                       CERROR("Failed to set CPTs for NI %s: %d\n",
+                              libcfs_net2str(net), rc);
+                       goto failed;
+               }
+
+               LASSERT(rc <= LNET_CPT_NUMBER);
+               if (rc == LNET_CPT_NUMBER) {
+                       LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+                       ni->ni_cpts = NULL;
+               }
+
+               ni->ni_ncpts = rc;
+       }
+
+       /* LND will fill in the address part of the NID */
+       ni->ni_nid = LNET_MKNID(net, 0);
+       ni->ni_last_alive = cfs_time_current_sec();
+       list_add_tail(&ni->ni_list, nilist);
+       return ni;
+ failed:
+       lnet_ni_free(ni);
+       return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+       struct cfs_expr_list *el = NULL;
+       int             tokensize = strlen(networks) + 1;
+       char            *tokens;
+       char            *str;
+       char            *tmp;
+       struct lnet_ni  *ni;
+       __u32           net;
+       int             nnets = 0;
+
+       if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _WAY_ conservative */
+               LCONSOLE_ERROR_MSG(0x112,
+                                  "Can't parse networks: string too long\n");
+               return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(tokens, tokensize);
+       if (tokens == NULL) {
+               CERROR("Can't allocate net tokens\n");
+               return -ENOMEM;
+       }
+
+       the_lnet.ln_network_tokens = tokens;
+       the_lnet.ln_network_tokens_nob = tokensize;
+       memcpy(tokens, networks, tokensize);
+       str = tmp = tokens;
+
+       /* Add in the loopback network */
+       ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+       if (ni == NULL)
+               goto failed;
+
+       while (str != NULL && *str != 0) {
+               char    *comma = strchr(str, ',');
+               char    *bracket = strchr(str, '(');
+               char    *square = strchr(str, '[');
+               char    *iface;
+               int     niface;
+               int     rc;
+
+               /* NB we don't check interface conflicts here; it's the LND's
+                * responsibility (if it cares at all) */
+
+               if (square != NULL && (comma == NULL || square < comma)) {
+                       /* e.g. o2ib0(ib0)[1,2]: the numbers between square
+                        * brackets are the CPTs this NI needs to be bound to */
+                       if (bracket != NULL && bracket > square) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       tmp = strchr(square, ']');
+                       if (tmp == NULL) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       rc = cfs_expr_list_parse(square, tmp - square + 1,
+                                                0, LNET_CPT_NUMBER - 1, &el);
+                       if (rc != 0) {
+                               tmp = square;
+                               goto failed_syntax;
+                       }
+
+                       while (square <= tmp)
+                               *square++ = ' ';
+               }
+
+               if (bracket == NULL ||
+                   (comma != NULL && comma < bracket)) {
+
+                       /* no interface list specified */
+
+                       if (comma != NULL)
+                               *comma++ = 0;
+                       net = libcfs_str2net(cfs_trimwhite(str));
+
+                       if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                               LCONSOLE_ERROR_MSG(0x113,
+                                                  "Unrecognised network type\n");
+                               tmp = str;
+                               goto failed_syntax;
+                       }
+
+                       if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+                           lnet_ni_alloc(net, el, nilist) == NULL)
+                               goto failed;
+
+                       if (el != NULL) {
+                               cfs_expr_list_free(el);
+                               el = NULL;
+                       }
+
+                       str = comma;
+                       continue;
+               }
+
+               *bracket = 0;
+               net = libcfs_str2net(cfs_trimwhite(str));
+               if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                       tmp = str;
+                       goto failed_syntax;
+               }
+
+               nnets++;
+               ni = lnet_ni_alloc(net, el, nilist);
+               if (ni == NULL)
+                       goto failed;
+
+               if (el != NULL) {
+                       cfs_expr_list_free(el);
+                       el = NULL;
+               }
+
+               niface = 0;
+               iface = bracket + 1;
+
+               bracket = strchr(iface, ')');
+               if (bracket == NULL) {
+                       tmp = iface;
+                       goto failed_syntax;
+               }
+
+               *bracket = 0;
+               do {
+                       comma = strchr(iface, ',');
+                       if (comma != NULL)
+                               *comma++ = 0;
+
+                       iface = cfs_trimwhite(iface);
+                       if (*iface == 0) {
+                               tmp = iface;
+                               goto failed_syntax;
+                       }
+
+                       if (niface == LNET_MAX_INTERFACES) {
+                               LCONSOLE_ERROR_MSG(0x115,
+                                                  "Too many interfaces for net %s\n",
+                                                  libcfs_net2str(net));
+                               goto failed;
+                       }
+
+                       ni->ni_interfaces[niface++] = iface;
+                       iface = comma;
+               } while (iface != NULL);
+
+               str = bracket + 1;
+               comma = strchr(bracket + 1, ',');
+               if (comma != NULL) {
+                       *comma = 0;
+                       str = cfs_trimwhite(str);
+                       if (*str != 0) {
+                               tmp = str;
+                               goto failed_syntax;
+                       }
+                       str = comma + 1;
+                       continue;
+               }
+
+               str = cfs_trimwhite(str);
+               if (*str != 0) {
+                       tmp = str;
+                       goto failed_syntax;
+               }
+       }
+
+       LASSERT(!list_empty(nilist));
+       return 0;
+
+ failed_syntax:
+       lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+       while (!list_empty(nilist)) {
+               ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+               list_del(&ni->ni_list);
+               lnet_ni_free(ni);
+       }
+
+       if (el != NULL)
+               cfs_expr_list_free(el);
+
+       LIBCFS_FREE(tokens, tokensize);
+       the_lnet.ln_network_tokens = NULL;
+
+       return -EINVAL;
+}
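+
+/*
+ * "networks" syntax by example (interface names and CPT numbers are
+ * hypothetical):
+ *
+ *     networks="tcp"                     one NI, default interface
+ *     networks="tcp0(eth0,eth1)"         one NI bound to two interfaces
+ *     networks="o2ib0(ib0)[0,1]"         NI restricted to CPTs 0 and 1
+ *     networks="tcp0(eth0),o2ib0(ib0)"   two NIs on different networks
+ */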
+
+lnet_text_buf_t *
+lnet_new_text_buf(int str_len)
+{
+       lnet_text_buf_t *ltb;
+       int           nob;
+
+       /* NB allocate space for the terminating 0 */
+       nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+       if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _way_ conservative for "route net gateway..." */
+               CERROR("text buffer too big\n");
+               return NULL;
+       }
+
+       if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+               CERROR("Too many text buffers\n");
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(ltb, nob);
+       if (ltb == NULL)
+               return NULL;
+
+       ltb->ltb_size = nob;
+       ltb->ltb_text[0] = 0;
+       lnet_tbnob += nob;
+       return ltb;
+}
+
+void
+lnet_free_text_buf(lnet_text_buf_t *ltb)
+{
+       lnet_tbnob -= ltb->ltb_size;
+       LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+       lnet_text_buf_t  *ltb;
+
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+       struct list_head        *tmp;
+       lnet_text_buf_t   *ltb;
+
+       list_for_each (tmp, tbs) {
+               ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+               CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+       }
+
+       CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep(struct list_head *tbs, char *str)
+{
+       struct list_head        pending;
+       char         *sep;
+       int            nob;
+       int            i;
+       lnet_text_buf_t  *ltb;
+
+       INIT_LIST_HEAD(&pending);
+
+       /* Split 'str' into separate commands */
+       for (;;) {
+               /* skip leading whitespace */
+               while (cfs_iswhite(*str))
+                       str++;
+
+               /* scan for separator or comment */
+               for (sep = str; *sep != 0; sep++)
+                       if (lnet_issep(*sep) || *sep == '#')
+                               break;
+
+               nob = (int)(sep - str);
+               if (nob > 0) {
+                       ltb = lnet_new_text_buf(nob);
+                       if (ltb == NULL) {
+                               lnet_free_text_bufs(&pending);
+                               return -1;
+                       }
+
+                       for (i = 0; i < nob; i++)
+                               if (cfs_iswhite(str[i]))
+                                       ltb->ltb_text[i] = ' ';
+                               else
+                                       ltb->ltb_text[i] = str[i];
+
+                       ltb->ltb_text[nob] = 0;
+
+                       list_add_tail(&ltb->ltb_list, &pending);
+               }
+
+               if (*sep == '#') {
+                       /* scan for separator */
+                       do {
+                               sep++;
+                       } while (*sep != 0 && !lnet_issep(*sep));
+               }
+
+               if (*sep == 0)
+                       break;
+
+               str = sep + 1;
+       }
+
+       list_splice(&pending, tbs->prev);
+       return 0;
+}
+
+int
+lnet_expand1tb(struct list_head *list,
+              char *str, char *sep1, char *sep2,
+              char *item, int itemlen)
+{
+       int           len1 = (int)(sep1 - str);
+       int           len2 = strlen(sep2 + 1);
+       lnet_text_buf_t *ltb;
+
+       LASSERT (*sep1 == '[');
+       LASSERT (*sep2 == ']');
+
+       ltb = lnet_new_text_buf(len1 + itemlen + len2);
+       if (ltb == NULL)
+               return -ENOMEM;
+
+       memcpy(ltb->ltb_text, str, len1);
+       memcpy(&ltb->ltb_text[len1], item, itemlen);
+       memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+       ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+       list_add_tail(&ltb->ltb_list, list);
+       return 0;
+}
+
+int
+lnet_str2tbs_expand(struct list_head *tbs, char *str)
+{
+       char          num[16];
+       struct list_head        pending;
+       char         *sep;
+       char         *sep2;
+       char         *parsed;
+       char         *enditem;
+       int            lo;
+       int            hi;
+       int            stride;
+       int            i;
+       int            nob;
+       int            scanned;
+
+       INIT_LIST_HEAD(&pending);
+
+       sep = strchr(str, '[');
+       if (sep == NULL)                        /* nothing to expand */
+               return 0;
+
+       sep2 = strchr(sep, ']');
+       if (sep2 == NULL)
+               goto failed;
+
+       for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+               enditem = ++parsed;
+               while (enditem < sep2 && *enditem != ',')
+                       enditem++;
+
+               if (enditem == parsed)          /* no empty items */
+                       goto failed;
+
+               if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+                       if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+                               /* simple string enumeration */
+                               if (lnet_expand1tb(&pending, str, sep, sep2,
+                                                  parsed, (int)(enditem - parsed)) != 0)
+                                       goto failed;
+
+                               continue;
+                       }
+
+                       stride = 1;
+               }
+
+               /* range expansion */
+
+               if (enditem != parsed + scanned) /* no trailing junk */
+                       goto failed;
+
+               if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+                   (hi - lo) % stride != 0)
+                       goto failed;
+
+               for (i = lo; i <= hi; i += stride) {
+
+                       snprintf(num, sizeof(num), "%d", i);
+                       nob = strlen(num);
+                       if (nob + 1 == sizeof(num))
+                               goto failed;
+
+                       if (lnet_expand1tb(&pending, str, sep, sep2,
+                                          num, nob) != 0)
+                               goto failed;
+               }
+       }
+
+       list_splice(&pending, tbs->prev);
+       return 1;
+
+ failed:
+       lnet_free_text_bufs(&pending);
+       return -1;
+}
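+
+/*
+ * Expansion example (hypothetical text): "tcp[0-2]" expands into the
+ * buffers "tcp0", "tcp1" and "tcp2"; the stride form "tcp[0-4/2]"
+ * expands into "tcp0", "tcp2" and "tcp4".  Each call expands only the
+ * first bracket pair; lnet_parse_route() keeps re-expanding the results
+ * until no brackets remain.
+ */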
+
+int
+lnet_parse_hops(char *str, unsigned int *hops)
+{
+       int     len = strlen(str);
+       int     nob = len;
+
+       return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+               nob == len &&
+               *hops > 0 && *hops < 256);
+}
+
+
+int
+lnet_parse_route(char *str, int *im_a_router)
+{
+       /* static scratch buffer OK (single threaded) */
+       static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+       struct list_head        nets;
+       struct list_head        gateways;
+       struct list_head       *tmp1;
+       struct list_head       *tmp2;
+       __u32        net;
+       lnet_nid_t      nid;
+       lnet_text_buf_t  *ltb;
+       int            rc;
+       char         *sep;
+       char         *token = str;
+       int            ntokens = 0;
+       int            myrc = -1;
+       unsigned int      hops;
+       int            got_hops = 0;
+
+       INIT_LIST_HEAD(&gateways);
+       INIT_LIST_HEAD(&nets);
+
+       /* save a copy of the string for error messages */
+       strncpy(cmd, str, sizeof(cmd) - 1);
+       cmd[sizeof(cmd) - 1] = 0;
+
+       sep = str;
+       for (;;) {
+               /* scan for token start */
+               while (cfs_iswhite(*sep))
+                       sep++;
+               if (*sep == 0) {
+                       if (ntokens < (got_hops ? 3 : 2))
+                               goto token_error;
+                       break;
+               }
+
+               ntokens++;
+               token = sep++;
+
+               /* scan for token end */
+               while (*sep != 0 && !cfs_iswhite(*sep))
+                       sep++;
+               if (*sep != 0)
+                       *sep++ = 0;
+
+               if (ntokens == 1) {
+                       tmp2 = &nets;           /* expanding nets */
+               } else if (ntokens == 2 &&
+                          lnet_parse_hops(token, &hops)) {
+                       got_hops = 1;      /* got a hop count */
+                       continue;
+               } else {
+                       tmp2 = &gateways;       /* expanding gateways */
+               }
+
+               ltb = lnet_new_text_buf(strlen(token));
+               if (ltb == NULL)
+                       goto out;
+
+               strcpy(ltb->ltb_text, token);
+               tmp1 = &ltb->ltb_list;
+               list_add_tail(tmp1, tmp2);
+
+               while (tmp1 != tmp2) {
+                       ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+                       rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+                       if (rc < 0)
+                               goto token_error;
+
+                       tmp1 = tmp1->next;
+
+                       if (rc > 0) {           /* expanded! */
+                               list_del(&ltb->ltb_list);
+                               lnet_free_text_buf(ltb);
+                               continue;
+                       }
+
+                       if (ntokens == 1) {
+                               net = libcfs_str2net(ltb->ltb_text);
+                               if (net == LNET_NIDNET(LNET_NID_ANY) ||
+                                   LNET_NETTYP(net) == LOLND)
+                                       goto token_error;
+                       } else {
+                               nid = libcfs_str2nid(ltb->ltb_text);
+                               if (nid == LNET_NID_ANY ||
+                                   LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                                       goto token_error;
+                       }
+               }
+       }
+
+       if (!got_hops)
+               hops = 1;
+
+       LASSERT (!list_empty(&nets));
+       LASSERT (!list_empty(&gateways));
+
+       list_for_each (tmp1, &nets) {
+               ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+               net = libcfs_str2net(ltb->ltb_text);
+               LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+               list_for_each (tmp2, &gateways) {
+                       ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+                       nid = libcfs_str2nid(ltb->ltb_text);
+                       LASSERT (nid != LNET_NID_ANY);
+
+                       if (lnet_islocalnid(nid)) {
+                               *im_a_router = 1;
+                               continue;
+                       }
+
+                       rc = lnet_add_route (net, hops, nid);
+                       if (rc != 0) {
+                               CERROR("Can't create route "
+                                      "to %s via %s\n",
+                                      libcfs_net2str(net),
+                                      libcfs_nid2str(nid));
+                               goto out;
+                       }
+               }
+       }
+
+       myrc = 0;
+       goto out;
+
+ token_error:
+       lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+       lnet_free_text_bufs(&nets);
+       lnet_free_text_bufs(&gateways);
+       return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+       lnet_text_buf_t   *ltb;
+
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+               if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+                       lnet_free_text_bufs(tbs);
+                       return -EINVAL;
+               }
+
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+
+       return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+       struct list_head        tbs;
+       int            rc = 0;
+
+       *im_a_router = 0;
+
+       INIT_LIST_HEAD(&tbs);
+
+       if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+               CERROR("Error parsing routes\n");
+               rc = -EINVAL;
+       } else {
+               rc = lnet_parse_route_tbs(&tbs, im_a_router);
+       }
+
+       LASSERT (lnet_tbnob == 0);
+       return rc;
+}
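+
+/*
+ * Illustrative example (assumed syntax, matching the parser above): each
+ * route entry is "<net> [hops] <gateway NID(s)>", so a module parameter
+ * such as
+ *
+ *     routes="tcp0 1 10.0.0.1@o2ib0"
+ *
+ * adds a route to net tcp0 via gateway 10.0.0.1@o2ib0 with hop count 1
+ * (hops defaults to 1 when omitted).
+ */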
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+       LIST_HEAD       (list);
+       int             rc;
+       int             i;
+
+       rc = cfs_ip_addr_parse(token, len, &list);
+       if (rc != 0)
+               return rc;
+
+       for (rc = i = 0; !rc && i < nip; i++)
+               rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+       cfs_ip_addr_free(&list);
+
+       return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+       static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+       int   matched = 0;
+       int   ntokens = 0;
+       int   len;
+       char *net = NULL;
+       char *sep;
+       char *token;
+       int   rc;
+
+       LASSERT (strlen(net_entry) < sizeof(tokens));
+
+       /* work on a copy of the string */
+       strcpy(tokens, net_entry);
+       sep = tokens;
+       for (;;) {
+               /* scan for token start */
+               while (cfs_iswhite(*sep))
+                       sep++;
+               if (*sep == 0)
+                       break;
+
+               token = sep++;
+
+               /* scan for token end */
+               while (*sep != 0 && !cfs_iswhite(*sep))
+                       sep++;
+               if (*sep != 0)
+                       *sep++ = 0;
+
+               if (ntokens++ == 0) {
+                       net = token;
+                       continue;
+               }
+
+               len = strlen(token);
+
+               rc = lnet_match_network_token(token, len, ipaddrs, nip);
+               if (rc < 0) {
+                       lnet_syntax("ip2nets", net_entry,
+                                   (int)(token - tokens), len);
+                       return rc;
+               }
+
+               matched |= (rc != 0);
+       }
+
+       if (!matched)
+               return 0;
+
+       strcpy(net_entry, net);          /* replace with matched net */
+       return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+       char   *bracket = strchr(netspec, '(');
+       __u32   net;
+
+       if (bracket != NULL)
+               *bracket = 0;
+
+       net = libcfs_str2net(netspec);
+
+       if (bracket != NULL)
+               *bracket = '(';
+
+       return net;
+}
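+
+/*
+ * Example (illustrative): for the netspec "tcp0(eth0)" the '(' is
+ * temporarily NUL-terminated so that libcfs_str2net() parses just
+ * "tcp0", then the '(' is restored before returning.
+ */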
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+       int            offset = 0;
+       int            offset2;
+       int            len;
+       lnet_text_buf_t  *tb;
+       lnet_text_buf_t  *tb2;
+       struct list_head       *t;
+       char         *sep;
+       char         *bracket;
+       __u32        net;
+
+       LASSERT (!list_empty(nets));
+       LASSERT (nets->next == nets->prev);     /* single entry */
+
+       tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+       for (;;) {
+               sep = strchr(tb->ltb_text, ',');
+               bracket = strchr(tb->ltb_text, '(');
+
+               if (sep != NULL &&
+                   bracket != NULL &&
+                   bracket < sep) {
+                       /* netspec lists interfaces... */
+
+                       offset2 = offset + (int)(bracket - tb->ltb_text);
+                       len = strlen(bracket);
+
+                       bracket = strchr(bracket + 1, ')');
+
+                       if (bracket == NULL ||
+                           !(bracket[1] == ',' || bracket[1] == 0)) {
+                               lnet_syntax("ip2nets", source, offset2, len);
+                               return -EINVAL;
+                       }
+
+                       sep = (bracket[1] == 0) ? NULL : bracket + 1;
+               }
+
+               if (sep != NULL)
+                       *sep++ = 0;
+
+               net = lnet_netspec2net(tb->ltb_text);
+               if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                       lnet_syntax("ip2nets", source, offset,
+                                   strlen(tb->ltb_text));
+                       return -EINVAL;
+               }
+
+               list_for_each(t, nets) {
+                       tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+                       if (tb2 == tb)
+                               continue;
+
+                       if (net == lnet_netspec2net(tb2->ltb_text)) {
+                               /* duplicate network */
+                               lnet_syntax("ip2nets", source, offset,
+                                           strlen(tb->ltb_text));
+                               return -EINVAL;
+                       }
+               }
+
+               if (sep == NULL)
+                       return 0;
+
+               offset += (int)(sep - tb->ltb_text);
+               tb2 = lnet_new_text_buf(strlen(sep));
+               if (tb2 == NULL)
+                       return -ENOMEM;
+
+               strcpy(tb2->ltb_text, sep);
+               list_add_tail(&tb2->ltb_list, nets);
+
+               tb = tb2;
+       }
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+       static char     networks[LNET_SINGLE_TEXTBUF_NOB];
+       static char     source[LNET_SINGLE_TEXTBUF_NOB];
+
+       struct list_head          raw_entries;
+       struct list_head          matched_nets;
+       struct list_head          current_nets;
+       struct list_head         *t;
+       struct list_head         *t2;
+       lnet_text_buf_t    *tb;
+       lnet_text_buf_t    *tb2;
+       __u32          net1;
+       __u32          net2;
+       int              len;
+       int              count;
+       int              dup;
+       int              rc;
+
+       INIT_LIST_HEAD(&raw_entries);
+       if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+               CERROR("Error parsing ip2nets\n");
+               LASSERT (lnet_tbnob == 0);
+               return -EINVAL;
+       }
+
+       INIT_LIST_HEAD(&matched_nets);
+       INIT_LIST_HEAD(&current_nets);
+       networks[0] = 0;
+       count = 0;
+       len = 0;
+       rc = 0;
+
+       while (!list_empty(&raw_entries)) {
+               tb = list_entry(raw_entries.next, lnet_text_buf_t,
+                                   ltb_list);
+
+               strncpy(source, tb->ltb_text, sizeof(source)-1);
+               source[sizeof(source)-1] = 0;
+
+               /* on a match, replace ltb_text with the matched network(s) */
+               rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+               if (rc < 0)
+                       break;
+
+               list_del(&tb->ltb_list);
+
+               if (rc == 0) {            /* no match */
+                       lnet_free_text_buf(tb);
+                       continue;
+               }
+
+               /* split into separate networks */
+               INIT_LIST_HEAD(&current_nets);
+               list_add(&tb->ltb_list, &current_nets);
+               rc = lnet_splitnets(source, &current_nets);
+               if (rc < 0)
+                       break;
+
+               dup = 0;
+               list_for_each (t, &current_nets) {
+                       tb = list_entry(t, lnet_text_buf_t, ltb_list);
+                       net1 = lnet_netspec2net(tb->ltb_text);
+                       LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+                       list_for_each(t2, &matched_nets) {
+                               tb2 = list_entry(t2, lnet_text_buf_t,
+                                                    ltb_list);
+                               net2 = lnet_netspec2net(tb2->ltb_text);
+                               LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+                               if (net1 == net2) {
+                                       dup = 1;
+                                       break;
+                               }
+                       }
+
+                       if (dup)
+                               break;
+               }
+
+               if (dup) {
+                       lnet_free_text_bufs(&current_nets);
+                       continue;
+               }
+
+               list_for_each_safe(t, t2, &current_nets) {
+                       tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+                       list_del(&tb->ltb_list);
+                       list_add_tail(&tb->ltb_list, &matched_nets);
+
+                       len += snprintf(networks + len, sizeof(networks) - len,
+                                       "%s%s", (len == 0) ? "" : ",",
+                                       tb->ltb_text);
+
+                       if (len >= sizeof(networks)) {
+                               CERROR("Too many matched networks\n");
+                               rc = -E2BIG;
+                               goto out;
+                       }
+               }
+
+               count++;
+       }
+
+ out:
+       lnet_free_text_bufs(&raw_entries);
+       lnet_free_text_bufs(&matched_nets);
+       lnet_free_text_bufs(&current_nets);
+       LASSERT (lnet_tbnob == 0);
+
+       if (rc < 0)
+               return rc;
+
+       *networksp = networks;
+       return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+       LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+       int     up;
+       __u32      netmask;
+       __u32     *ipaddrs;
+       __u32     *ipaddrs2;
+       int     nip;
+       char     **ifnames;
+       int     nif = libcfs_ipif_enumerate(&ifnames);
+       int     i;
+       int     rc;
+
+       if (nif <= 0)
+               return nif;
+
+       LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+       if (ipaddrs == NULL) {
+               CERROR("Can't allocate ipaddrs[%d]\n", nif);
+               libcfs_ipif_free_enumeration(ifnames, nif);
+               return -ENOMEM;
+       }
+
+       for (i = nip = 0; i < nif; i++) {
+               if (!strcmp(ifnames[i], "lo"))
+                       continue;
+
+               rc = libcfs_ipif_query(ifnames[i], &up,
+                                      &ipaddrs[nip], &netmask);
+               if (rc != 0) {
+                       CWARN("Can't query interface %s: %d\n",
+                             ifnames[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Ignoring interface %s: it's down\n",
+                             ifnames[i]);
+                       continue;
+               }
+
+               nip++;
+       }
+
+       libcfs_ipif_free_enumeration(ifnames, nif);
+
+       if (nip == nif) {
+               *ipaddrsp = ipaddrs;
+       } else {
+               if (nip > 0) {
+                       LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+                       if (ipaddrs2 == NULL) {
+                               CERROR("Can't allocate ipaddrs[%d]\n", nip);
+                               nip = -ENOMEM;
+                       } else {
+                               memcpy(ipaddrs2, ipaddrs,
+                                      nip * sizeof(*ipaddrs));
+                               *ipaddrsp = ipaddrs2;
+                               rc = nip;
+                       }
+               }
+               lnet_ipaddr_free_enumeration(ipaddrs, nif);
+       }
+       return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+       __u32     *ipaddrs;
+       int     nip = lnet_ipaddr_enumerate(&ipaddrs);
+       int     rc;
+
+       if (nip < 0) {
+               LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+                                  "interfaces for ip2nets to match\n", nip);
+               return nip;
+       }
+
+       if (nip == 0) {
+               LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+                                  "for ip2nets to match\n");
+               return -ENOENT;
+       }
+
+       rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+       lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+       if (rc < 0) {
+               LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+               return rc;
+       }
+
+       if (rc == 0) {
+               LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+                                  "any local IP interfaces\n");
+               return -ENOENT;
+       }
+
+       return 0;
+}
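+
+/*
+ * Illustrative example (assumed syntax): on a node whose eth0 has address
+ * 192.168.0.4, an ip2nets string such as
+ *
+ *     ip2nets="tcp(eth0) 192.168.0.*; o2ib 10.10.*.*"
+ *
+ * matches the first entry only, so *networksp is set to "tcp(eth0)".
+ */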
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+       __u32  net = LNET_NIDNET(ni->ni_nid);
+       char **names;
+       int    n;
+       __u32  ip;
+       __u32  netmask;
+       int    up;
+       int    i;
+       int    rc;
+
+       /* Convenience for LNDs that use the IP address of a local interface as
+        * the local address part of their NID */
+
+       if (ni->ni_interfaces[0] != NULL) {
+
+               CLASSERT (LNET_MAX_INTERFACES > 1);
+
+               if (ni->ni_interfaces[1] != NULL) {
+                       CERROR("Net %s doesn't support multiple interfaces\n",
+                              libcfs_net2str(net));
+                       return -EPERM;
+               }
+
+               rc = libcfs_ipif_query(ni->ni_interfaces[0],
+                                      &up, &ip, &netmask);
+               if (rc != 0) {
+                       CERROR("Net %s can't query interface %s: %d\n",
+                              libcfs_net2str(net), ni->ni_interfaces[0], rc);
+                       return -EPERM;
+               }
+
+               if (!up) {
+                       CERROR("Net %s can't use interface %s: it's down\n",
+                              libcfs_net2str(net), ni->ni_interfaces[0]);
+                       return -ENETDOWN;
+               }
+
+               ni->ni_nid = LNET_MKNID(net, ip);
+               return 0;
+       }
+
+       n = libcfs_ipif_enumerate(&names);
+       if (n <= 0) {
+               CERROR("Net %s can't enumerate interfaces: %d\n",
+                      libcfs_net2str(net), n);
+               return 0;
+       }
+
+       for (i = 0; i < n; i++) {
+               if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                       continue;
+
+               rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+               if (rc != 0) {
+                       CWARN("Net %s can't query interface %s: %d\n",
+                             libcfs_net2str(net), names[i], rc);
+                       continue;
+               }
+
+               if (!up) {
+                       CWARN("Net %s ignoring interface %s (down)\n",
+                             libcfs_net2str(net), names[i]);
+                       continue;
+               }
+
+               libcfs_ipif_free_enumeration(names, n);
+               ni->ni_nid = LNET_MKNID(net, ip);
+               return 0;
+       }
+
+       CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+       libcfs_ipif_free_enumeration(names, n);
+       return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644 (file)
index 0000000..78297a7
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when an EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If a parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+           lnet_handle_eq_t *handle)
+{
+       lnet_eq_t     *eq;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+        * overflow, they don't skip entries, so the queue has the same
+        * apparent capacity at all times */
+
+       count = cfs_power2_roundup(count);
+
+       if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+               CWARN("EQ callback is guaranteed to get every event; "
+                     "do you still want to set eqcount %d for polling "
+                     "events, which will have locking overhead? "
+                     "Please contact the developers to confirm\n", count);
+       }
+
+       /* count can be 0 if only the callback is needed; this eliminates
+        * the overhead of enqueueing events */
+       if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+               return -EINVAL;
+
+       eq = lnet_eq_alloc();
+       if (eq == NULL)
+               return -ENOMEM;
+
+       if (count != 0) {
+               LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+               if (eq->eq_events == NULL)
+                       goto failed;
+               /* NB the allocator has set all event sequence numbers to 0,
+                * so all of them should be earlier than eq_deq_seq */
+       }
+
+       eq->eq_deq_seq = 1;
+       eq->eq_enq_seq = 1;
+       eq->eq_size = count;
+       eq->eq_callback = callback;
+
+       eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+                                      sizeof(*eq->eq_refs[0]));
+       if (eq->eq_refs == NULL)
+               goto failed;
+
+       /* MUST hold the exclusive lnet_res_lock */
+       lnet_res_lock(LNET_LOCK_EX);
+       /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+        * both EQ lookup and poll event with only lnet_eq_wait_lock */
+       lnet_eq_wait_lock();
+
+       lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+       list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+       lnet_eq_wait_unlock();
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       lnet_eq2handle(handle, eq);
+       return 0;
+
+failed:
+       if (eq->eq_events != NULL)
+               LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+       if (eq->eq_refs != NULL)
+               cfs_percpt_free(eq->eq_refs);
+
+       lnet_eq_free(eq);
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
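+
+/*
+ * Usage sketch (illustrative only; the function name and sizes below are
+ * made up): allocate a 64-slot polling EQ with no handler, then release it.
+ */
+#if 0
+static int sample_eq_lifecycle(void)
+{
+       lnet_handle_eq_t eqh;
+       int rc;
+
+       rc = LNetEQAlloc(64, LNET_EQ_HANDLER_NONE, &eqh);
+       if (rc != 0)
+               return rc;
+
+       /* ... attach MDs that reference eqh, consume events ... */
+
+       return LNetEQFree(eqh); /* -EBUSY while MDs still reference it */
+}
+#endif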
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY  If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+       struct lnet_eq  *eq;
+       lnet_event_t    *events = NULL;
+       int             **refs = NULL;
+       int             *ref;
+       int             rc = 0;
+       int             size = 0;
+       int             i;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       lnet_res_lock(LNET_LOCK_EX);
+       /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+        * both EQ lookup and poll event with only lnet_eq_wait_lock */
+       lnet_eq_wait_lock();
+
+       eq = lnet_handle2eq(&eqh);
+       if (eq == NULL) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       cfs_percpt_for_each(ref, i, eq->eq_refs) {
+               LASSERT(*ref >= 0);
+               if (*ref == 0)
+                       continue;
+
+               CDEBUG(D_NET, "Event queue (%d: %d) busy on destroy.\n",
+                      i, *ref);
+               rc = -EBUSY;
+               goto out;
+       }
+
+       /* stash for free after lock dropped */
+       events  = eq->eq_events;
+       size    = eq->eq_size;
+       refs    = eq->eq_refs;
+
+       lnet_res_lh_invalidate(&eq->eq_lh);
+       list_del(&eq->eq_list);
+       lnet_eq_free_locked(eq);
+ out:
+       lnet_eq_wait_unlock();
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       if (events != NULL)
+               LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+       if (refs != NULL)
+               cfs_percpt_free(refs);
+
+       return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+       /* MUST be called with the resource lock held, but w/o lnet_eq_wait_lock */
+       int index;
+
+       if (eq->eq_size == 0) {
+               LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+               eq->eq_callback(ev);
+               return;
+       }
+
+       lnet_eq_wait_lock();
+       ev->sequence = eq->eq_enq_seq++;
+
+       LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+       index = ev->sequence & (eq->eq_size - 1);
+
+       eq->eq_events[index] = *ev;
+
+       if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+               eq->eq_callback(ev);
+
+       /* Wake anyone waiting in LNetEQPoll() */
+       if (waitqueue_active(&the_lnet.ln_eq_waitq))
+               wake_up_all(&the_lnet.ln_eq_waitq);
+       lnet_eq_wait_unlock();
+}
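+
+/*
+ * Note (illustrative): because eq_size is a power of two, the masking in
+ * lnet_eq_enqueue_event() is equivalent to "sequence % eq_size"; e.g.
+ * with eq_size == 8, sequence 13 is stored in slot 13 & 7 == 5.
+ */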
+
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+       int             new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+       lnet_event_t    *new_event = &eq->eq_events[new_index];
+       int             rc;
+       ENTRY;
+
+       /* must be called with lnet_eq_wait_lock held */
+       if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+               RETURN(0);
+
+       /* We've got a new event... */
+       *ev = *new_event;
+
+       CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+              new_event, eq->eq_deq_seq, eq->eq_size);
+
+       /* ...but did it overwrite an event we've not seen yet? */
+       if (eq->eq_deq_seq == new_event->sequence) {
+               rc = 1;
+       } else {
+               /* don't complain with CERROR: some EQs are sized small
+                * anyway; if it's important, the caller should complain */
+               CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+                      eq->eq_deq_seq, new_event->sequence);
+               rc = -EOVERFLOW;
+       }
+
+       eq->eq_deq_seq = new_event->sequence + 1;
+       RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0     No pending event in the EQ.
+ * \retval 1     Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+       int which;
+
+       return LNetEQPoll(&eventq, 1, 0,
+                        event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1     Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+       int which;
+
+       return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+                        event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+       int             tms = *timeout_ms;
+       int             wait;
+       wait_queue_t  wl;
+       cfs_time_t      now;
+
+       if (tms == 0)
+               return -1; /* don't want to wait and there's no new event */
+
+       init_waitqueue_entry_current(&wl);
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+       lnet_eq_wait_unlock();
+
+       if (tms < 0) {
+               waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+       } else {
+               struct timeval tv;
+
+               now = cfs_time_current();
+               waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+                                   cfs_time_seconds(tms) / 1000);
+               cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+               tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+               if (tms < 0) /* no time left to wait, but may have new event */
+                       tms = 0;
+       }
+
+       wait = tms != 0; /* might need to call here again */
+       *timeout_ms = tms;
+
+       lnet_eq_wait_lock();
+       remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+       return wait;
+}
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and the size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0     No pending event in the EQs after timeout.
+ * \retval 1     Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT    If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+          lnet_event_t *event, int *which)
+{
+       int     wait = 1;
+       int     rc;
+       int     i;
+       ENTRY;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (neq < 1)
+               RETURN(-ENOENT);
+
+       lnet_eq_wait_lock();
+
+       for (;;) {
+               for (i = 0; i < neq; i++) {
+                       lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+                       if (eq == NULL) {
+                               lnet_eq_wait_unlock();
+                               RETURN(-ENOENT);
+                       }
+
+                       rc = lnet_eq_dequeue_event(eq, event);
+                       if (rc != 0) {
+                               lnet_eq_wait_unlock();
+                               *which = i;
+                               RETURN(rc);
+                       }
+               }
+
+               if (wait == 0)
+                       break;
+
+               /*
+                * return value of lnet_eq_wait_locked:
+                * -1 : did nothing and it's certain there is no new event
+                *  1 : slept and woke up on a new event
+                *  0 : don't want to wait any more, but there might be a
+                *      new event, so dequeue must be called again
+                */
+               wait = lnet_eq_wait_locked(&timeout_ms);
+               if (wait < 0) /* no new event */
+                       break;
+       }
+
+       lnet_eq_wait_unlock();
+       RETURN(0);
+}
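+
+/*
+ * Usage sketch (illustrative only): drain a single EQ without blocking,
+ * treating -EOVERFLOW as "event returned, but older ones were dropped".
+ */
+#if 0
+static void sample_eq_drain(lnet_handle_eq_t eqh)
+{
+       lnet_event_t ev;
+       int which;
+       int rc;
+
+       for (;;) {
+               rc = LNetEQPoll(&eqh, 1, 0, &ev, &which);
+               if (rc == 0)                    /* nothing pending */
+                       break;
+               if (rc < 0 && rc != -EOVERFLOW) /* e.g. bad handle */
+                       break;
+               /* rc == 1 or -EOVERFLOW: ev holds a valid event */
+               /* ... process ev ... */
+       }
+}
+#endif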
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644 (file)
index 0000000..ae643f2
--- /dev/null
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+       if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+               /* first unlink attempt... */
+               lnet_me_t *me = md->md_me;
+
+               md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+               /* Disassociate from ME (if any), and unlink it if it was created
+                * with LNET_UNLINK */
+               if (me != NULL) {
+                       /* detach MD from portal */
+                       lnet_ptl_detach_md(me, md);
+                       if (me->me_unlink == LNET_UNLINK)
+                               lnet_me_unlink(me);
+               }
+
+               /* ensure all future handle lookups fail */
+               lnet_res_lh_invalidate(&md->md_lh);
+       }
+
+       if (md->md_refcount != 0) {
+               CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+               return;
+       }
+
+       CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+       if (md->md_eq != NULL) {
+               int     cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+               LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+               (*md->md_eq->eq_refs[cpt])--;
+       }
+
+       LASSERT(!list_empty(&md->md_list));
+       list_del_init(&md->md_list);
+       lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+       int       i;
+       unsigned int niov;
+       int       total_length = 0;
+
+       lmd->md_me = NULL;
+       lmd->md_start = umd->start;
+       lmd->md_offset = 0;
+       lmd->md_max_size = umd->max_size;
+       lmd->md_options = umd->options;
+       lmd->md_user_ptr = umd->user_ptr;
+       lmd->md_eq = NULL;
+       lmd->md_threshold = umd->threshold;
+       lmd->md_refcount = 0;
+       lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+       if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+               if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+                       return -EINVAL;
+
+               lmd->md_niov = niov = umd->length;
+               memcpy(lmd->md_iov.iov, umd->start,
+                      niov * sizeof (lmd->md_iov.iov[0]));
+
+               for (i = 0; i < (int)niov; i++) {
+                       /* We take the base address on trust */
+                       if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                               return -EINVAL;
+
+                       total_length += lmd->md_iov.iov[i].iov_len;
+               }
+
+               lmd->md_length = total_length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > total_length)) /* illegal max_size */
+                       return -EINVAL;
+
+       } else if ((umd->options & LNET_MD_KIOV) != 0) {
+               lmd->md_niov = niov = umd->length;
+               memcpy(lmd->md_iov.kiov, umd->start,
+                      niov * sizeof (lmd->md_iov.kiov[0]));
+
+               for (i = 0; i < (int)niov; i++) {
+                       /* We take the page pointer on trust */
+                       if (lmd->md_iov.kiov[i].kiov_offset +
+                           lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE)
+                               return -EINVAL; /* invalid length */
+
+                       total_length += lmd->md_iov.kiov[i].kiov_len;
+               }
+
+               lmd->md_length = total_length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > total_length)) /* illegal max_size */
+                       return -EINVAL;
+       } else {   /* contiguous */
+               lmd->md_length = umd->length;
+               lmd->md_niov = niov = 1;
+               lmd->md_iov.iov[0].iov_base = umd->start;
+               lmd->md_iov.iov[0].iov_len = umd->length;
+
+               if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+                   (umd->max_size < 0 ||
+                    umd->max_size > (int)umd->length)) /* illegal max_size */
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+       struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+       /* NB we are passed an allocated, but inactive md.
+        * if we return success, caller may lnet_md_unlink() it.
+        * otherwise caller may only lnet_md_free() it.
+        */
+       /* This implementation doesn't know how to create START events or
+        * disable END events.  Best to LASSERT our caller is compliant so
+        * we find out quickly...  */
+       /* TODO - reevaluate what should be here in light of
+        * the removal of the start and end events
+        * (maybe we shouldn't even allow LNET_EQ_NONE!)
+        * LASSERT (eq == NULL);
+        */
+       if (!LNetHandleIsInvalid(eq_handle)) {
+               md->md_eq = lnet_handle2eq(&eq_handle);
+
+               if (md->md_eq == NULL)
+                       return -ENOENT;
+
+               (*md->md_eq->eq_refs[cpt])++;
+       }
+
+       lnet_res_lh_initialize(container, &md->md_lh);
+
+       LASSERT(list_empty(&md->md_list));
+       list_add(&md->md_list, &container->rec_active);
+
+       return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+       /* NB this doesn't copy out all the iov entries so when a
+        * discontiguous MD is copied out, the target gets to know the
+        * original iov pointer (in start) and the number of entries it had
+        * and that's all.
+        */
+       umd->start = lmd->md_start;
+       umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+                     lmd->md_length : lmd->md_niov;
+       umd->threshold = lmd->md_threshold;
+       umd->max_size = lmd->md_max_size;
+       umd->options = lmd->md_options;
+       umd->user_ptr = lmd->md_user_ptr;
+       lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+       if (umd->start == NULL && umd->length != 0) {
+               CERROR("MD start pointer cannot be NULL with length %u\n",
+                      umd->length);
+               return -EINVAL;
+       }
+
+       if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+           umd->length > LNET_MAX_IOV) {
+               CERROR("Invalid option: too many fragments %u, %d max\n",
+                      umd->length, LNET_MAX_IOV);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY  If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+            lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+       LIST_HEAD               (matches);
+       LIST_HEAD               (drops);
+       struct lnet_me          *me;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (lnet_md_validate(&umd) != 0)
+               return -EINVAL;
+
+       if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+               CERROR("Invalid option: no MD_OP set\n");
+               return -EINVAL;
+       }
+
+       md = lnet_md_alloc(&umd);
+       if (md == NULL)
+               return -ENOMEM;
+
+       rc = lnet_md_build(md, &umd, unlink);
+       cpt = lnet_cpt_of_cookie(meh.cookie);
+
+       lnet_res_lock(cpt);
+       if (rc != 0)
+               goto failed;
+
+       me = lnet_handle2me(&meh);
+       if (me == NULL)
+               rc = -ENOENT;
+       else if (me->me_md != NULL)
+               rc = -EBUSY;
+       else
+               rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+       if (rc != 0)
+               goto failed;
+
+       /* attach this MD to portal of ME and check if it matches any
+        * blocked msgs on this portal */
+       lnet_ptl_attach_md(me, md, &matches, &drops);
+
+       lnet_md2handle(handle, md);
+
+       lnet_res_unlock(cpt);
+
+       lnet_drop_delayed_msg_list(&drops, "Bad match");
+       lnet_recv_delayed_msg_list(&matches);
+
+       return 0;
+
+ failed:
+       lnet_md_free_locked(md);
+
+       lnet_res_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
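+
+/*
+ * Usage sketch (illustrative only; the portal number, match bits, and
+ * helper name are made up): post a passive buffer that matches PUTs
+ * with match bits 0x11 from any peer.
+ */
+#if 0
+static int sample_post_buffer(void *buf, unsigned int len,
+                             lnet_handle_eq_t eqh, lnet_handle_md_t *mdh)
+{
+       lnet_process_id_t any = { .nid = LNET_NID_ANY, .pid = LNET_PID_ANY };
+       lnet_handle_me_t meh;
+       lnet_md_t umd;
+       int rc;
+
+       rc = LNetMEAttach(4, any, 0x11, 0, LNET_UNLINK,
+                         LNET_INS_AFTER, &meh);
+       if (rc != 0)
+               return rc;
+
+       memset(&umd, 0, sizeof(umd));
+       umd.start = buf;
+       umd.length = len;
+       umd.threshold = 1;              /* unlink after one operation */
+       umd.options = LNET_MD_OP_PUT;
+       umd.eq_handle = eqh;
+
+       return LNetMDAttach(meh, umd, LNET_UNLINK, mdh);
+}
+#endif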
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+       lnet_libmd_t    *md;
+       int             cpt;
+       int             rc;
+
+       LASSERT (the_lnet.ln_init);
+       LASSERT (the_lnet.ln_refcount > 0);
+
+       if (lnet_md_validate(&umd) != 0)
+               return -EINVAL;
+
+       if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+               CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+               return -EINVAL;
+       }
+
+       md = lnet_md_alloc(&umd);
+       if (md == NULL)
+               return -ENOMEM;
+
+       rc = lnet_md_build(md, &umd, unlink);
+
+       cpt = lnet_res_lock_current();
+       if (rc != 0)
+               goto failed;
+
+       rc = lnet_md_link(md, umd.eq_handle, cpt);
+       if (rc != 0)
+               goto failed;
+
+       lnet_md2handle(handle, md);
+
+       lnet_res_unlock(cpt);
+       return 0;
+
+ failed:
+       lnet_md_free_locked(md);
+
+       lnet_res_unlock(cpt);
+       return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
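+
+/*
+ * Note (illustrative): a bound MD typically describes the source buffer
+ * of an LNetPut() or the sink of an LNetGet(); e.g. for a PUT that
+ * requests an ACK, a threshold of 2 lets one MD absorb both the SEND
+ * and the ACK events before auto-unlinking.
+ */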
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ *   and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ *   returns, and the unlinked event will be piggybacked on the event of
+ *   the completion of the last operation by setting the unlinked field of
+ *   the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more events will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+       lnet_event_t    ev;
+       lnet_libmd_t    *md;
+       int             cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL) {
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+        * when the NAL is done, the completion event flags that the MD was
+        * unlinked.  Otherwise, we enqueue an event now... */
+
+       if (md->md_eq != NULL &&
+           md->md_refcount == 0) {
+               lnet_build_unlink_event(md, &ev);
+               lnet_eq_enqueue_event(md->md_eq, &ev);
+       }
+
+       lnet_md_unlink(md);
+
+       lnet_res_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
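+
+/*
+ * Note (illustrative): since a busy MD is only marked for deletion here,
+ * callers typically keep the user buffer alive until an event arrives
+ * with ev.unlinked set, which is the last event for that MD.
+ */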
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644 (file)
index 0000000..0081075
--- /dev/null
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+            lnet_process_id_t match_id,
+            __u64 match_bits, __u64 ignore_bits,
+            lnet_unlink_t unlink, lnet_ins_pos_t pos,
+            lnet_handle_me_t *handle)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_me          *me;
+       struct list_head                *head;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if ((int)portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       mtable = lnet_mt_of_attach(portal, match_id,
+                                  match_bits, ignore_bits, pos);
+       if (mtable == NULL) /* can't match portal type */
+               return -EPERM;
+
+       me = lnet_me_alloc();
+       if (me == NULL)
+               return -ENOMEM;
+
+       lnet_res_lock(mtable->mt_cpt);
+
+       me->me_portal = portal;
+       me->me_match_id = match_id;
+       me->me_match_bits = match_bits;
+       me->me_ignore_bits = ignore_bits;
+       me->me_unlink = unlink;
+       me->me_md = NULL;
+
+       lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+                              &me->me_lh);
+       if (ignore_bits != 0)
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+       me->me_pos = head - &mtable->mt_mhash[0];
+       if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+               list_add_tail(&me->me_list, head);
+       else
+               list_add(&me->me_list, head);
+
+       lnet_me2handle(handle, me);
+
+       lnet_res_unlock(mtable->mt_cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0       On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+            lnet_process_id_t match_id,
+            __u64 match_bits, __u64 ignore_bits,
+            lnet_unlink_t unlink, lnet_ins_pos_t pos,
+            lnet_handle_me_t *handle)
+{
+       struct lnet_me          *current_me;
+       struct lnet_me          *new_me;
+       struct lnet_portal      *ptl;
+       int                     cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if (pos == LNET_INS_LOCAL)
+               return -EPERM;
+
+       new_me = lnet_me_alloc();
+       if (new_me == NULL)
+               return -ENOMEM;
+
+       cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+       lnet_res_lock(cpt);
+
+       current_me = lnet_handle2me(&current_meh);
+       if (current_me == NULL) {
+               lnet_me_free_locked(new_me);
+
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+       ptl = the_lnet.ln_portals[current_me->me_portal];
+       if (lnet_ptl_is_unique(ptl)) {
+               /* no sense inserting on a unique portal */
+               lnet_me_free_locked(new_me);
+               lnet_res_unlock(cpt);
+               return -EPERM;
+       }
+
+       new_me->me_pos = current_me->me_pos;
+       new_me->me_portal = current_me->me_portal;
+       new_me->me_match_id = match_id;
+       new_me->me_match_bits = match_bits;
+       new_me->me_ignore_bits = ignore_bits;
+       new_me->me_unlink = unlink;
+       new_me->me_md = NULL;
+
+       lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+       if (pos == LNET_INS_AFTER)
+               list_add(&new_me->me_list, &current_me->me_list);
+       else
+               list_add_tail(&new_me->me_list, &current_me->me_list);
+
+       lnet_me2handle(handle, new_me);
+
+       lnet_res_unlock(cpt);
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+       lnet_me_t       *me;
+       lnet_libmd_t    *md;
+       lnet_event_t    ev;
+       int             cpt;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_cpt_of_cookie(meh.cookie);
+       lnet_res_lock(cpt);
+
+       me = lnet_handle2me(&meh);
+       if (me == NULL) {
+               lnet_res_unlock(cpt);
+               return -ENOENT;
+       }
+
+       md = me->me_md;
+       if (md != NULL &&
+           md->md_eq != NULL &&
+           md->md_refcount == 0) {
+               lnet_build_unlink_event(md, &ev);
+               lnet_eq_enqueue_event(md->md_eq, &ev);
+       }
+
+       lnet_me_unlink(me);
+
+       lnet_res_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
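+
+/*
+ * NB teardown is a single call, e.g. LNetMEUnlink(specific_meh) from the
+ * sketch above; the handle must not be used afterwards, and if an idle MD
+ * with an event queue is still attached an unlink event is fired.
+ */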
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+       list_del(&me->me_list);
+
+       if (me->me_md != NULL) {
+               lnet_libmd_t *md = me->me_md;
+
+               /* detach MD from portal of this ME */
+               lnet_ptl_detach_md(me, md);
+               lnet_md_unlink(md);
+       }
+
+       lnet_res_lh_invalidate(&me->me_lh);
+       lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+       CWARN("Match Entry %p ("LPX64")\n", me,
+             me->me_lh.lh_cookie);
+
+       CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+             me->me_match_bits, me->me_ignore_bits);
+
+       CWARN("\tMD\t= %p\n", me->md);
+       CWARN("\tprev\t= %p\n",
+             list_entry(me->me_list.prev, lnet_me_t, me_list));
+       CWARN("\tnext\t= %p\n",
+             list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644 (file)
index 0000000..49b0f12
--- /dev/null
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+               "Reserved");
+
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+       lnet_test_peer_t  *tp;
+       struct list_head        *el;
+       struct list_head        *next;
+       struct list_head         cull;
+
+       LASSERT (the_lnet.ln_init);
+
+       /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+       if (threshold != 0) {
+               /* Adding a new entry */
+               LIBCFS_ALLOC(tp, sizeof(*tp));
+               if (tp == NULL)
+                       return -ENOMEM;
+
+               tp->tp_nid = nid;
+               tp->tp_threshold = threshold;
+
+               lnet_net_lock(0);
+               list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+               lnet_net_unlock(0);
+               return 0;
+       }
+
+       /* removing entries */
+       INIT_LIST_HEAD(&cull);
+
+       lnet_net_lock(0);
+
+       list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+               tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+               if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+                   nid == LNET_NID_ANY ||       /* removing all entries */
+                   tp->tp_nid == nid)    /* matched this one */
+               {
+                       list_del (&tp->tp_list);
+                       list_add (&tp->tp_list, &cull);
+               }
+       }
+
+       lnet_net_unlock(0);
+
+       while (!list_empty (&cull)) {
+               tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+               list_del (&tp->tp_list);
+               LIBCFS_FREE(tp, sizeof (*tp));
+       }
+       return 0;
+}
+
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+       lnet_test_peer_t *tp;
+       struct list_head       *el;
+       struct list_head       *next;
+       struct list_head        cull;
+       int            fail = 0;
+
+       INIT_LIST_HEAD (&cull);
+
+       /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+       lnet_net_lock(0);
+
+       list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+               tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+               if (tp->tp_threshold == 0) {
+                       /* zombie entry */
+                       if (outgoing) {
+                               /* only cull zombies on outgoing tests,
+                                * since we may be at interrupt priority on
+                                * incoming messages. */
+                               list_del (&tp->tp_list);
+                               list_add (&tp->tp_list, &cull);
+                       }
+                       continue;
+               }
+
+               if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+                   nid == tp->tp_nid) {        /* fail this peer */
+                       fail = 1;
+
+                       if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+                               tp->tp_threshold--;
+                               if (outgoing &&
+                                   tp->tp_threshold == 0) {
+                                       /* see above */
+                                       list_del (&tp->tp_list);
+                                       list_add (&tp->tp_list, &cull);
+                               }
+                       }
+                       break;
+               }
+       }
+
+       lnet_net_unlock(0);
+
+       while (!list_empty (&cull)) {
+               tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+               list_del (&tp->tp_list);
+
+               LIBCFS_FREE(tp, sizeof (*tp));
+       }
+
+       return (fail);
+}
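+
+/*
+ * Worked example: lnet_fail_nid(nid, 3) adds a test peer entry, so
+ * fail_peer() drops the next three messages involving 'nid';
+ * lnet_fail_nid(nid, 0) culls the entries for that nid (plus any spent
+ * ones), and lnet_fail_nid(LNET_NID_ANY, 0) culls them all.
+ */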
+
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+       unsigned int nob = 0;
+
+       while (niov-- > 0)
+               nob += (iov++)->iov_len;
+
+       return (nob);
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                  unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                  unsigned int nob)
+{
+       /* NB diov, siov are READ-ONLY */
+       unsigned int  this_nob;
+
+       if (nob == 0)
+               return;
+
+       /* skip complete frags before 'doffset' */
+       LASSERT (ndiov > 0);
+       while (doffset >= diov->iov_len) {
+               doffset -= diov->iov_len;
+               diov++;
+               ndiov--;
+               LASSERT (ndiov > 0);
+       }
+
+       /* skip complete frags before 'soffset' */
+       LASSERT (nsiov > 0);
+       while (soffset >= siov->iov_len) {
+               soffset -= siov->iov_len;
+               siov++;
+               nsiov--;
+               LASSERT (nsiov > 0);
+       }
+
+       do {
+               LASSERT (ndiov > 0);
+               LASSERT (nsiov > 0);
+               this_nob = MIN(diov->iov_len - doffset,
+                              siov->iov_len - soffset);
+               this_nob = MIN(this_nob, nob);
+
+               memcpy ((char *)diov->iov_base + doffset,
+                       (char *)siov->iov_base + soffset, this_nob);
+               nob -= this_nob;
+
+               if (diov->iov_len > doffset + this_nob) {
+                       doffset += this_nob;
+               } else {
+                       diov++;
+                       ndiov--;
+                       doffset = 0;
+               }
+
+               if (siov->iov_len > soffset + this_nob) {
+                       soffset += this_nob;
+               } else {
+                       siov++;
+                       nsiov--;
+                       soffset = 0;
+               }
+       } while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+                 int src_niov, struct iovec *src,
+                 unsigned int offset, unsigned int len)
+{
+       /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+        * for exactly 'len' bytes, and return the number of entries.
+        * NB not destructive to 'src' */
+       unsigned int    frag_len;
+       unsigned int    niov;
+
+       if (len == 0)                      /* no data => */
+               return (0);                  /* no frags */
+
+       LASSERT (src_niov > 0);
+       while (offset >= src->iov_len) {      /* skip initial frags */
+               offset -= src->iov_len;
+               src_niov--;
+               src++;
+               LASSERT (src_niov > 0);
+       }
+
+       niov = 1;
+       for (;;) {
+               LASSERT (src_niov > 0);
+               LASSERT ((int)niov <= dst_niov);
+
+               frag_len = src->iov_len - offset;
+               dst->iov_base = ((char *)src->iov_base) + offset;
+
+               if (len <= frag_len) {
+                       dst->iov_len = len;
+                       return (niov);
+               }
+
+               dst->iov_len = frag_len;
+
+               len -= frag_len;
+               dst++;
+               src++;
+               niov++;
+               src_niov--;
+               offset = 0;
+       }
+}
+EXPORT_SYMBOL(lnet_extract_iov);
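+
+/*
+ * Worked example: with two source frags of 100 and 200 bytes,
+ * offset = 150 and len = 100, the whole first frag and 50 bytes of the
+ * second are skipped, and 'dst' becomes a single 100-byte frag starting
+ * 50 bytes into the second source frag; the return value is 1.
+ */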
+
+
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+       unsigned int  nob = 0;
+
+       while (niov-- > 0)
+               nob += (kiov++)->kiov_len;
+
+       return (nob);
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+                    unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+                    unsigned int nob)
+{
+       /* NB diov, siov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *daddr = NULL;
+       char       *saddr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (ndiov > 0);
+       while (doffset >= diov->kiov_len) {
+               doffset -= diov->kiov_len;
+               diov++;
+               ndiov--;
+               LASSERT (ndiov > 0);
+       }
+
+       LASSERT (nsiov > 0);
+       while (soffset >= siov->kiov_len) {
+               soffset -= siov->kiov_len;
+               siov++;
+               nsiov--;
+               LASSERT (nsiov > 0);
+       }
+
+       do {
+               LASSERT (ndiov > 0);
+               LASSERT (nsiov > 0);
+               this_nob = MIN(diov->kiov_len - doffset,
+                              siov->kiov_len - soffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (daddr == NULL)
+                       daddr = ((char *)kmap(diov->kiov_page)) +
+                               diov->kiov_offset + doffset;
+               if (saddr == NULL)
+                       saddr = ((char *)kmap(siov->kiov_page)) +
+                               siov->kiov_offset + soffset;
+
+               /* Vanishing risk of kmap deadlock when mapping 2 pages.
+                * In practice, however, at least one of the kiovs will be
+                * permanently mapped kernel pages, so the map/unmap calls
+                * are NOOPs */
+
+               memcpy (daddr, saddr, this_nob);
+               nob -= this_nob;
+
+               if (diov->kiov_len > doffset + this_nob) {
+                       daddr += this_nob;
+                       doffset += this_nob;
+               } else {
+                       kunmap(diov->kiov_page);
+                       daddr = NULL;
+                       diov++;
+                       ndiov--;
+                       doffset = 0;
+               }
+
+               if (siov->kiov_len > soffset + this_nob) {
+                       saddr += this_nob;
+                       soffset += this_nob;
+               } else {
+                       kunmap(siov->kiov_page);
+                       saddr = NULL;
+                       siov++;
+                       nsiov--;
+                       soffset = 0;
+               }
+       } while (nob > 0);
+
+       if (daddr != NULL)
+               kunmap(diov->kiov_page);
+       if (saddr != NULL)
+               kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                   unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                   unsigned int nob)
+{
+       /* NB iov, kiov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *addr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (niov > 0);
+       while (iovoffset >= iov->iov_len) {
+               iovoffset -= iov->iov_len;
+               iov++;
+               niov--;
+               LASSERT (niov > 0);
+       }
+
+       LASSERT (nkiov > 0);
+       while (kiovoffset >= kiov->kiov_len) {
+               kiovoffset -= kiov->kiov_len;
+               kiov++;
+               nkiov--;
+               LASSERT (nkiov > 0);
+       }
+
+       do {
+               LASSERT (niov > 0);
+               LASSERT (nkiov > 0);
+               this_nob = MIN(iov->iov_len - iovoffset,
+                              kiov->kiov_len - kiovoffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (addr == NULL)
+                       addr = ((char *)kmap(kiov->kiov_page)) +
+                               kiov->kiov_offset + kiovoffset;
+
+               memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+               nob -= this_nob;
+
+               if (iov->iov_len > iovoffset + this_nob) {
+                       iovoffset += this_nob;
+               } else {
+                       iov++;
+                       niov--;
+                       iovoffset = 0;
+               }
+
+               if (kiov->kiov_len > kiovoffset + this_nob) {
+                       addr += this_nob;
+                       kiovoffset += this_nob;
+               } else {
+                       kunmap(kiov->kiov_page);
+                       addr = NULL;
+                       kiov++;
+                       nkiov--;
+                       kiovoffset = 0;
+               }
+
+       } while (nob > 0);
+
+       if (addr != NULL)
+               kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                   unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                   unsigned int nob)
+{
+       /* NB kiov, iov are READ-ONLY */
+       unsigned int    this_nob;
+       char       *addr = NULL;
+
+       if (nob == 0)
+               return;
+
+       LASSERT (!in_interrupt ());
+
+       LASSERT (nkiov > 0);
+       while (kiovoffset >= kiov->kiov_len) {
+               kiovoffset -= kiov->kiov_len;
+               kiov++;
+               nkiov--;
+               LASSERT (nkiov > 0);
+       }
+
+       LASSERT (niov > 0);
+       while (iovoffset >= iov->iov_len) {
+               iovoffset -= iov->iov_len;
+               iov++;
+               niov--;
+               LASSERT (niov > 0);
+       }
+
+       do {
+               LASSERT (nkiov > 0);
+               LASSERT (niov > 0);
+               this_nob = MIN(kiov->kiov_len - kiovoffset,
+                              iov->iov_len - iovoffset);
+               this_nob = MIN(this_nob, nob);
+
+               if (addr == NULL)
+                       addr = ((char *)kmap(kiov->kiov_page)) +
+                               kiov->kiov_offset + kiovoffset;
+
+               memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+               nob -= this_nob;
+
+               if (kiov->kiov_len > kiovoffset + this_nob) {
+                       addr += this_nob;
+                       kiovoffset += this_nob;
+               } else {
+                       kunmap(kiov->kiov_page);
+                       addr = NULL;
+                       kiov++;
+                       nkiov--;
+                       kiovoffset = 0;
+               }
+
+               if (iov->iov_len > iovoffset + this_nob) {
+                       iovoffset += this_nob;
+               } else {
+                       iov++;
+                       niov--;
+                       iovoffset = 0;
+               }
+       } while (nob > 0);
+
+       if (addr != NULL)
+               kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                  int src_niov, lnet_kiov_t *src,
+                  unsigned int offset, unsigned int len)
+{
+       /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+        * for exactly 'len' bytes, and return the number of entries.
+        * NB not destructive to 'src' */
+       unsigned int    frag_len;
+       unsigned int    niov;
+
+       if (len == 0)                      /* no data => */
+               return (0);                  /* no frags */
+
+       LASSERT (src_niov > 0);
+       while (offset >= src->kiov_len) {      /* skip initial frags */
+               offset -= src->kiov_len;
+               src_niov--;
+               src++;
+               LASSERT (src_niov > 0);
+       }
+
+       niov = 1;
+       for (;;) {
+               LASSERT (src_niov > 0);
+               LASSERT ((int)niov <= dst_niov);
+
+               frag_len = src->kiov_len - offset;
+               dst->kiov_page = src->kiov_page;
+               dst->kiov_offset = src->kiov_offset + offset;
+
+               if (len <= frag_len) {
+                       dst->kiov_len = len;
+                       LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+                       return (niov);
+               }
+
+               dst->kiov_len = frag_len;
+               LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+               len -= frag_len;
+               dst++;
+               src++;
+               niov++;
+               src_niov--;
+               offset = 0;
+       }
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       unsigned int  niov = 0;
+       struct iovec *iov = NULL;
+       lnet_kiov_t  *kiov = NULL;
+       int        rc;
+
+       LASSERT (!in_interrupt ());
+       LASSERT (mlen == 0 || msg != NULL);
+
+       if (msg != NULL) {
+               LASSERT(msg->msg_receiving);
+               LASSERT(!msg->msg_sending);
+               LASSERT(rlen == msg->msg_len);
+               LASSERT(mlen <= msg->msg_len);
+               LASSERT(msg->msg_offset == offset);
+               LASSERT(msg->msg_wanted == mlen);
+
+               msg->msg_receiving = 0;
+
+               if (mlen != 0) {
+                       niov = msg->msg_niov;
+                       iov  = msg->msg_iov;
+                       kiov = msg->msg_kiov;
+
+                       LASSERT (niov > 0);
+                       LASSERT ((iov == NULL) != (kiov == NULL));
+               }
+       }
+
+       rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+                                   niov, iov, kiov, offset, mlen, rlen);
+       if (rc < 0)
+               lnet_finalize(ni, msg, rc);
+}
+
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+       lnet_libmd_t *md = msg->msg_md;
+
+       LASSERT (msg->msg_len > 0);
+       LASSERT (!msg->msg_routing);
+       LASSERT (md != NULL);
+       LASSERT (msg->msg_niov == 0);
+       LASSERT (msg->msg_iov == NULL);
+       LASSERT (msg->msg_kiov == NULL);
+
+       msg->msg_niov = md->md_niov;
+       if ((md->md_options & LNET_MD_KIOV) != 0)
+               msg->msg_kiov = md->md_iov.kiov;
+       else
+               msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+              unsigned int offset, unsigned int len)
+{
+       msg->msg_type = type;
+       msg->msg_target = target;
+       msg->msg_len = len;
+       msg->msg_offset = offset;
+
+       if (len != 0)
+               lnet_setpayloadbuffer(msg);
+
+       memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+       msg->msg_hdr.type          = cpu_to_le32(type);
+       msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
+       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+       /* src_nid will be set later */
+       msg->msg_hdr.src_pid    = cpu_to_le32(the_lnet.ln_pid);
+       msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       void   *priv = msg->msg_private;
+       int     rc;
+
+       LASSERT (!in_interrupt ());
+       LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+                (msg->msg_txcredit && msg->msg_peertxcredit));
+
+       rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+       if (rc < 0)
+               lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       int     rc;
+
+       LASSERT(!msg->msg_sending);
+       LASSERT(msg->msg_receiving);
+       LASSERT(!msg->msg_rx_ready_delay);
+       LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+       msg->msg_rx_ready_delay = 1;
+       rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+                                         &msg->msg_private);
+       if (rc != 0) {
+               CERROR("recv from %s / send to %s aborted: "
+                      "eager_recv failed %d\n",
+                      libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+                      libcfs_id2str(msg->msg_target), rc);
+               LASSERT(rc < 0); /* required by my callers */
+       }
+
+       return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+       cfs_time_t last_alive = 0;
+
+       LASSERT(lnet_peer_aliveness_enabled(lp));
+       LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+       lnet_net_unlock(lp->lp_cpt);
+       (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+       lnet_net_lock(lp->lp_cpt);
+
+       lp->lp_last_query = cfs_time_current();
+
+       if (last_alive != 0) /* NI has updated timestamp */
+               lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+       int     alive;
+       cfs_time_t deadline;
+
+       LASSERT (lnet_peer_aliveness_enabled(lp));
+
+       /* Trust lnet_notify() if it has more recent aliveness news, but
+        * ignore the initial assumed death (see lnet_peers_start_down()).
+        */
+       if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+           cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+               return 0;
+
+       deadline = cfs_time_add(lp->lp_last_alive,
+                               cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+       alive = cfs_time_after(deadline, now);
+
+       /* Update obsolete lp_alive, except for routers assumed to be dead
+        * initially: the router checker will update their aliveness itself,
+        * and lp_last_alive at peer creation is only an assumed value.
+        */
+       if (alive && !lp->lp_alive &&
+           !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+               lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+       return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+       cfs_time_t now = cfs_time_current();
+
+       if (!lnet_peer_aliveness_enabled(lp))
+               return -ENODEV;
+
+       if (lnet_peer_is_alive(lp, now))
+               return 1;
+
+       /* Peer appears dead, but we should avoid frequent NI queries (at
+        * most once per lnet_queryinterval seconds). */
+       if (lp->lp_last_query != 0) {
+               static const int lnet_queryinterval = 1;
+
+               cfs_time_t next_query =
+                          cfs_time_add(lp->lp_last_query,
+                                       cfs_time_seconds(lnet_queryinterval));
+
+               if (cfs_time_before(now, next_query)) {
+                       if (lp->lp_alive)
+                               CWARN("Unexpected aliveness of peer %s: "
+                                     "%d < %d (%d/%d)\n",
+                                     libcfs_nid2str(lp->lp_nid),
+                                     (int)now, (int)next_query,
+                                     lnet_queryinterval,
+                                     lp->lp_ni->ni_peertimeout);
+                       return 0;
+               }
+       }
+
+       /* query NI for latest aliveness news */
+       lnet_ni_query_locked(lp->lp_ni, lp);
+
+       if (lnet_peer_is_alive(lp, now))
+               return 1;
+
+       lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+       return 0;
+}
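+
+/*
+ * NB with ni_peertimeout of, say, 180 seconds a peer is considered alive
+ * for 180s after lp_last_alive; once that lapses the NI is re-queried for
+ * fresh news at most once per second (lnet_queryinterval above).
+ */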
+
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+       /* lnet_send is going to lnet_net_unlock immediately after this,
+        * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+        * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+        * appears dead, and 0 if sent or OK to send */
+       struct lnet_peer        *lp = msg->msg_txpeer;
+       struct lnet_ni          *ni = lp->lp_ni;
+       struct lnet_tx_queue    *tq;
+       int                     cpt;
+
+       /* non-lnet_send() callers have checked before */
+       LASSERT(!do_send || msg->msg_tx_delayed);
+       LASSERT(!msg->msg_receiving);
+       LASSERT(msg->msg_tx_committed);
+
+       cpt = msg->msg_tx_cpt;
+       tq = ni->ni_tx_queues[cpt];
+
+       /* NB 'lp' is always the next hop */
+       if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+           lnet_peer_alive_locked(lp) == 0) {
+               the_lnet.ln_counters[cpt]->drop_count++;
+               the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+               lnet_net_unlock(cpt);
+
+               CNETERR("Dropping message for %s: peer not alive\n",
+                       libcfs_id2str(msg->msg_target));
+               if (do_send)
+                       lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+               lnet_net_lock(cpt);
+               return EHOSTUNREACH;
+       }
+
+       if (!msg->msg_peertxcredit) {
+               LASSERT ((lp->lp_txcredits < 0) ==
+                        !list_empty(&lp->lp_txq));
+
+               msg->msg_peertxcredit = 1;
+               lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+               lp->lp_txcredits--;
+
+               if (lp->lp_txcredits < lp->lp_mintxcredits)
+                       lp->lp_mintxcredits = lp->lp_txcredits;
+
+               if (lp->lp_txcredits < 0) {
+                       msg->msg_tx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &lp->lp_txq);
+                       return EAGAIN;
+               }
+       }
+
+       if (!msg->msg_txcredit) {
+               LASSERT((tq->tq_credits < 0) ==
+                       !list_empty(&tq->tq_delayed));
+
+               msg->msg_txcredit = 1;
+               tq->tq_credits--;
+
+               if (tq->tq_credits < tq->tq_credits_min)
+                       tq->tq_credits_min = tq->tq_credits;
+
+               if (tq->tq_credits < 0) {
+                       msg->msg_tx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &tq->tq_delayed);
+                       return EAGAIN;
+               }
+       }
+
+       if (do_send) {
+               lnet_net_unlock(cpt);
+               lnet_ni_send(ni, msg);
+               lnet_net_lock(cpt);
+       }
+       return 0;
+}
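+
+/*
+ * NB the positive EAGAIN/EHOSTUNREACH returns above are deliberate;
+ * lnet_send() maps EHOSTUNREACH to -EHOSTUNREACH for its caller and
+ * treats EAGAIN as "queued on credits": the message is sent later when
+ * lnet_return_tx_credits_locked() releases it.
+ */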
+
+
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+       lnet_rtrbufpool_t       *rbp;
+       int                     cpt;
+
+       LASSERT(msg->msg_rx_committed);
+
+       cpt = msg->msg_rx_cpt;
+       rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+       LASSERT(msg->msg_len <= LNET_MTU);
+       while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+               rbp++;
+               LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+       }
+
+       return rbp;
+}
+
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+       /* lnet_parse is going to lnet_net_unlock immediately after this, so it
+        * sets do_recv FALSE and I don't do the unlock/recv/lock bit.  I
+        * return EAGAIN if msg blocked and 0 if received or OK to receive */
+       lnet_peer_t      *lp = msg->msg_rxpeer;
+       lnet_rtrbufpool_t   *rbp;
+       lnet_rtrbuf_t       *rb;
+
+       LASSERT (msg->msg_iov == NULL);
+       LASSERT (msg->msg_kiov == NULL);
+       LASSERT (msg->msg_niov == 0);
+       LASSERT (msg->msg_routing);
+       LASSERT (msg->msg_receiving);
+       LASSERT (!msg->msg_sending);
+
+       /* non-lnet_parse callers only receive delayed messages */
+       LASSERT(!do_recv || msg->msg_rx_delayed);
+
+       if (!msg->msg_peerrtrcredit) {
+               LASSERT ((lp->lp_rtrcredits < 0) ==
+                        !list_empty(&lp->lp_rtrq));
+
+               msg->msg_peerrtrcredit = 1;
+               lp->lp_rtrcredits--;
+               if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+                       lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+               if (lp->lp_rtrcredits < 0) {
+                       /* must have checked eager_recv before here */
+                       LASSERT(msg->msg_rx_ready_delay);
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+                       return EAGAIN;
+               }
+       }
+
+       rbp = lnet_msg2bufpool(msg);
+
+       if (!msg->msg_rtrcredit) {
+               LASSERT ((rbp->rbp_credits < 0) ==
+                        !list_empty(&rbp->rbp_msgs));
+
+               msg->msg_rtrcredit = 1;
+               rbp->rbp_credits--;
+               if (rbp->rbp_credits < rbp->rbp_mincredits)
+                       rbp->rbp_mincredits = rbp->rbp_credits;
+
+               if (rbp->rbp_credits < 0) {
+                       /* must have checked eager_recv before here */
+                       LASSERT(msg->msg_rx_ready_delay);
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+                       return EAGAIN;
+               }
+       }
+
+       LASSERT (!list_empty(&rbp->rbp_bufs));
+       rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+       list_del(&rb->rb_list);
+
+       msg->msg_niov = rbp->rbp_npages;
+       msg->msg_kiov = &rb->rb_kiov[0];
+
+       if (do_recv) {
+               int cpt = msg->msg_rx_cpt;
+
+               lnet_net_unlock(cpt);
+               lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+                            0, msg->msg_len, msg->msg_len);
+               lnet_net_lock(cpt);
+       }
+       return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+       lnet_peer_t     *txpeer = msg->msg_txpeer;
+       lnet_msg_t      *msg2;
+
+       if (msg->msg_txcredit) {
+               struct lnet_ni       *ni = txpeer->lp_ni;
+               struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+               /* give back NI txcredits */
+               msg->msg_txcredit = 0;
+
+               LASSERT((tq->tq_credits < 0) ==
+                       !list_empty(&tq->tq_delayed));
+
+               tq->tq_credits++;
+               if (tq->tq_credits <= 0) {
+                       msg2 = list_entry(tq->tq_delayed.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       LASSERT(msg2->msg_txpeer->lp_ni == ni);
+                       LASSERT(msg2->msg_tx_delayed);
+
+                       (void) lnet_post_send_locked(msg2, 1);
+               }
+       }
+
+       if (msg->msg_peertxcredit) {
+               /* give back peer txcredits */
+               msg->msg_peertxcredit = 0;
+
+               LASSERT((txpeer->lp_txcredits < 0) ==
+                       !list_empty(&txpeer->lp_txq));
+
+               txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+               LASSERT (txpeer->lp_txqnob >= 0);
+
+               txpeer->lp_txcredits++;
+               if (txpeer->lp_txcredits <= 0) {
+                       msg2 = list_entry(txpeer->lp_txq.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       LASSERT(msg2->msg_txpeer == txpeer);
+                       LASSERT(msg2->msg_tx_delayed);
+
+                       (void) lnet_post_send_locked(msg2, 1);
+               }
+       }
+
+       if (txpeer != NULL) {
+               msg->msg_txpeer = NULL;
+               lnet_peer_decref_locked(txpeer);
+       }
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+       lnet_peer_t     *rxpeer = msg->msg_rxpeer;
+       lnet_msg_t      *msg2;
+
+       if (msg->msg_rtrcredit) {
+               /* give back global router credits */
+               lnet_rtrbuf_t     *rb;
+               lnet_rtrbufpool_t *rbp;
+
+               /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+                * there until it gets one allocated, or aborts the wait
+                * itself */
+               LASSERT (msg->msg_kiov != NULL);
+
+               rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+               rbp = rb->rb_pool;
+               LASSERT (rbp == lnet_msg2bufpool(msg));
+
+               msg->msg_kiov = NULL;
+               msg->msg_rtrcredit = 0;
+
+               LASSERT((rbp->rbp_credits < 0) ==
+                       !list_empty(&rbp->rbp_msgs));
+               LASSERT((rbp->rbp_credits > 0) ==
+                       !list_empty(&rbp->rbp_bufs));
+
+               list_add(&rb->rb_list, &rbp->rbp_bufs);
+               rbp->rbp_credits++;
+               if (rbp->rbp_credits <= 0) {
+                       msg2 = list_entry(rbp->rbp_msgs.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       (void) lnet_post_routed_recv_locked(msg2, 1);
+               }
+       }
+
+       if (msg->msg_peerrtrcredit) {
+               /* give back peer router credits */
+               msg->msg_peerrtrcredit = 0;
+
+               LASSERT((rxpeer->lp_rtrcredits < 0) ==
+                       !list_empty(&rxpeer->lp_rtrq));
+
+               rxpeer->lp_rtrcredits++;
+               if (rxpeer->lp_rtrcredits <= 0) {
+                       msg2 = list_entry(rxpeer->lp_rtrq.next,
+                                             lnet_msg_t, msg_list);
+                       list_del(&msg2->msg_list);
+
+                       (void) lnet_post_routed_recv_locked(msg2, 1);
+               }
+       }
+       if (rxpeer != NULL) {
+               msg->msg_rxpeer = NULL;
+               lnet_peer_decref_locked(rxpeer);
+       }
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+       lnet_peer_t *p1 = r1->lr_gateway;
+       lnet_peer_t *p2 = r2->lr_gateway;
+
+       if (r1->lr_hops < r2->lr_hops)
+               return 1;
+
+       if (r1->lr_hops > r2->lr_hops)
+               return -1;
+
+       if (p1->lp_txqnob < p2->lp_txqnob)
+               return 1;
+
+       if (p1->lp_txqnob > p2->lp_txqnob)
+               return -1;
+
+       if (p1->lp_txcredits > p2->lp_txcredits)
+               return 1;
+
+       if (p1->lp_txcredits < p2->lp_txcredits)
+               return -1;
+
+       if (r1->lr_seq - r2->lr_seq <= 0)
+               return 1;
+
+       return -1;
+}
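+
+/*
+ * NB the comparison above prefers, in order: fewer hops, a gateway with
+ * fewer queued bytes (lp_txqnob), more available tx credits, and finally
+ * the lower (least recently used) round-robin sequence number.
+ */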
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *rtr;
+       lnet_route_t            *rtr_best;
+       lnet_route_t            *rtr_last;
+       struct lnet_peer        *lp_best;
+       struct lnet_peer        *lp;
+       int                     rc;
+
+       /* If @rtr_nid is not LNET_NID_ANY, return the gateway whose nid is
+        * rtr_nid; otherwise find the best gateway I can use */
+
+       rnet = lnet_find_net_locked(LNET_NIDNET(target));
+       if (rnet == NULL)
+               return NULL;
+
+       lp_best = NULL;
+       rtr_best = rtr_last = NULL;
+       list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+               lp = rtr->lr_gateway;
+
+               if (!lp->lp_alive || /* gateway is down */
+                   ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+                    rtr->lr_downis != 0)) /* NI to target is down */
+                       continue;
+
+               if (ni != NULL && lp->lp_ni != ni)
+                       continue;
+
+               if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+                       return lp;
+
+               if (lp_best == NULL) {
+                       rtr_best = rtr_last = rtr;
+                       lp_best = lp;
+                       continue;
+               }
+
+               /* no protection on the fields below, but it's harmless */
+               if (rtr_last->lr_seq - rtr->lr_seq < 0)
+                       rtr_last = rtr;
+
+               rc = lnet_compare_routes(rtr, rtr_best);
+               if (rc < 0)
+                       continue;
+
+               rtr_best = rtr;
+               lp_best = lp;
+       }
+
+       /* set sequence number on the best router to the latest sequence + 1
+        * so we can round-robin all routers; it's racy and inaccurate but
+        * harmless and functional */
+       if (rtr_best != NULL)
+               rtr_best->lr_seq = rtr_last->lr_seq + 1;
+       return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+       lnet_nid_t              dst_nid = msg->msg_target.nid;
+       struct lnet_ni          *src_ni;
+       struct lnet_ni          *local_ni;
+       struct lnet_peer        *lp;
+       int                     cpt;
+       int                     cpt2;
+       int                     rc;
+
+       /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+        * but we might want to use pre-determined router for ACK/REPLY
+        * in the future */
+       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+       LASSERT (msg->msg_txpeer == NULL);
+       LASSERT (!msg->msg_sending);
+       LASSERT (!msg->msg_target_is_router);
+       LASSERT (!msg->msg_receiving);
+
+       msg->msg_sending = 1;
+
+       LASSERT(!msg->msg_tx_committed);
+       cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
+       if (src_nid == LNET_NID_ANY) {
+               src_ni = NULL;
+       } else {
+               src_ni = lnet_nid2ni_locked(src_nid, cpt);
+               if (src_ni == NULL) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("Can't send to %s: src %s is not a "
+                                     "local nid\n", libcfs_nid2str(dst_nid),
+                                     libcfs_nid2str(src_nid));
+                       return -EINVAL;
+               }
+               LASSERT (!msg->msg_routing);
+       }
+
+       /* Is this for someone on a local network? */
+       local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+       if (local_ni != NULL) {
+               if (src_ni == NULL) {
+                       src_ni = local_ni;
+                       src_nid = src_ni->ni_nid;
+               } else if (src_ni == local_ni) {
+                       lnet_ni_decref_locked(local_ni, cpt);
+               } else {
+                       lnet_ni_decref_locked(local_ni, cpt);
+                       lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("No route to %s via from %s\n",
+                                     libcfs_nid2str(dst_nid),
+                                     libcfs_nid2str(src_nid));
+                       return -EINVAL;
+               }
+
+               LASSERT(src_nid != LNET_NID_ANY);
+               lnet_msg_commit(msg, cpt);
+
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+               if (src_ni == the_lnet.ln_loni) {
+                       /* No send credit hassles with LOLND */
+                       lnet_net_unlock(cpt);
+                       lnet_ni_send(src_ni, msg);
+
+                       lnet_net_lock(cpt);
+                       lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+                       return 0;
+               }
+
+               rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+               /* lp has ref on src_ni; lose mine */
+               lnet_ni_decref_locked(src_ni, cpt);
+               if (rc != 0) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+                                     libcfs_nid2str(dst_nid));
+                       /* ENOMEM or shutting down */
+                       return rc;
+               }
+               LASSERT (lp->lp_ni == src_ni);
+       } else {
+               /* sending to a remote network */
+               lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+               if (lp == NULL) {
+                       if (src_ni != NULL)
+                               lnet_ni_decref_locked(src_ni, cpt);
+                       lnet_net_unlock(cpt);
+
+                       LCONSOLE_WARN("No route to %s via %s "
+                                     "(all routers down)\n",
+                                     libcfs_id2str(msg->msg_target),
+                                     libcfs_nid2str(src_nid));
+                       return -EHOSTUNREACH;
+               }
+
+               /* rtr_nid is LNET_NID_ANY or the NID of a pre-determined
+                * router; it's possible that rtr_nid isn't LNET_NID_ANY and
+                * yet lp isn't the pre-determined router, which can happen
+                * if the routing table changed while we dropped the lock */
+               if (rtr_nid != lp->lp_nid) {
+                       cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+                       if (cpt2 != cpt) {
+                               if (src_ni != NULL)
+                                       lnet_ni_decref_locked(src_ni, cpt);
+                               lnet_net_unlock(cpt);
+
+                               rtr_nid = lp->lp_nid;
+                               cpt = cpt2;
+                               goto again;
+                       }
+               }
+
+               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+                      libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+                      lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+               if (src_ni == NULL) {
+                       src_ni = lp->lp_ni;
+                       src_nid = src_ni->ni_nid;
+               } else {
+                       LASSERT (src_ni == lp->lp_ni);
+                       lnet_ni_decref_locked(src_ni, cpt);
+               }
+
+               lnet_peer_addref_locked(lp);
+
+               LASSERT(src_nid != LNET_NID_ANY);
+               lnet_msg_commit(msg, cpt);
+
+               if (!msg->msg_routing) {
+                       /* I'm the source and now I know which NI to send on */
+                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+               }
+
+               msg->msg_target_is_router = 1;
+               msg->msg_target.nid = lp->lp_nid;
+               msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+       }
+
+       /* 'lp' is our best choice of peer */
+
+       LASSERT (!msg->msg_peertxcredit);
+       LASSERT (!msg->msg_txcredit);
+       LASSERT (msg->msg_txpeer == NULL);
+
+       msg->msg_txpeer = lp;              /* msg takes my ref on lp */
+
+       rc = lnet_post_send_locked(msg, 0);
+       lnet_net_unlock(cpt);
+
+       if (rc == EHOSTUNREACH)
+               return -EHOSTUNREACH;
+
+       if (rc == 0)
+               lnet_ni_send(src_ni, msg);
+
+       return 0;
+}
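+
+/*
+ * NB lnet_send() ends in one of three ways: a loopback send via LOLND
+ * (no credit accounting), a direct send to a peer on a local network, or
+ * a routed send where the chosen gateway becomes the new msg_target.
+ */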
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+       lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->drop_count++;
+       the_lnet.ln_counters[cpt]->drop_length += nob;
+       lnet_net_unlock(cpt);
+
+       lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t      *hdr = &msg->msg_hdr;
+
+       if (msg->msg_wanted != 0)
+               lnet_setpayloadbuffer(msg);
+
+       lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+       /* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
+        * it back into the ACK during lnet_finalize() */
+       msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+                       (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+       lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+                    msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t              *hdr = &msg->msg_hdr;
+       struct lnet_match_info  info;
+       int                     rc;
+
+       /* Convert put fields to host byte order */
+       hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+       hdr->msg.put.ptl_index  = le32_to_cpu(hdr->msg.put.ptl_index);
+       hdr->msg.put.offset     = le32_to_cpu(hdr->msg.put.offset);
+
+       info.mi_id.nid  = hdr->src_nid;
+       info.mi_id.pid  = hdr->src_pid;
+       info.mi_opc     = LNET_MD_OP_PUT;
+       info.mi_portal  = hdr->msg.put.ptl_index;
+       info.mi_rlength = hdr->payload_length;
+       info.mi_roffset = hdr->msg.put.offset;
+       info.mi_mbits   = hdr->msg.put.match_bits;
+
+       msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+       rc = lnet_ptl_match_md(&info, msg);
+       switch (rc) {
+       default:
+               LBUG();
+
+       case LNET_MATCHMD_OK:
+               lnet_recv_put(ni, msg);
+               return 0;
+
+       case LNET_MATCHMD_NONE:
+               if (msg->msg_rx_delayed) /* attached on delayed list */
+                       return 0;
+
+               rc = lnet_ni_eager_recv(ni, msg);
+               if (rc == 0)
+                       goto again;
+               /* fall through */
+
+       case LNET_MATCHMD_DROP:
+               CNETERR("Dropping PUT from %s portal %d match "LPU64
+                       " offset %d length %d: %d\n",
+                       libcfs_id2str(info.mi_id), info.mi_portal,
+                       info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+               return ENOENT;  /* +ve: OK but no match */
+       }
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+       struct lnet_match_info  info;
+       lnet_hdr_t              *hdr = &msg->msg_hdr;
+       lnet_handle_wire_t      reply_wmd;
+       int                     rc;
+
+       /* Convert get fields to host byte order */
+       hdr->msg.get.match_bits   = le64_to_cpu(hdr->msg.get.match_bits);
+       hdr->msg.get.ptl_index    = le32_to_cpu(hdr->msg.get.ptl_index);
+       hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
+       hdr->msg.get.src_offset   = le32_to_cpu(hdr->msg.get.src_offset);
+
+       info.mi_id.nid  = hdr->src_nid;
+       info.mi_id.pid  = hdr->src_pid;
+       info.mi_opc     = LNET_MD_OP_GET;
+       info.mi_portal  = hdr->msg.get.ptl_index;
+       info.mi_rlength = hdr->msg.get.sink_length;
+       info.mi_roffset = hdr->msg.get.src_offset;
+       info.mi_mbits   = hdr->msg.get.match_bits;
+
+       rc = lnet_ptl_match_md(&info, msg);
+       if (rc == LNET_MATCHMD_DROP) {
+               CNETERR("Dropping GET from %s portal %d match "LPU64
+                       " offset %d length %d\n",
+                       libcfs_id2str(info.mi_id), info.mi_portal,
+                       info.mi_mbits, info.mi_roffset, info.mi_rlength);
+               return ENOENT;  /* +ve: OK but no match */
+       }
+
+       LASSERT(rc == LNET_MATCHMD_OK);
+
+       lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+       reply_wmd = hdr->msg.get.return_wmd;
+
+       lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+                      msg->msg_offset, msg->msg_wanted);
+
+       msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+       if (rdma_get) {
+               /* The LND completes the REPLY from its recv procedure */
+               lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                            msg->msg_offset, msg->msg_len, msg->msg_len);
+               return 0;
+       }
+
+       lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+       msg->msg_receiving = 0;
+
+       rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+       if (rc < 0) {
+               /* didn't get as far as lnet_ni_send() */
+               CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+                      libcfs_nid2str(ni->ni_nid),
+                      libcfs_id2str(info.mi_id), rc);
+
+               lnet_finalize(ni, msg, rc);
+       }
+
+       return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       void         *private = msg->msg_private;
+       lnet_hdr_t       *hdr = &msg->msg_hdr;
+       lnet_process_id_t src = {0};
+       lnet_libmd_t     *md;
+       int            rlength;
+       int            mlength;
+       int                     cpt;
+
+       cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+       lnet_res_lock(cpt);
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       /* NB handles only looked up by creator (no flips) */
+       md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CNETERR("%s: Dropping REPLY from %s for %s "
+                       "MD "LPX64"."LPX64"\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                       (md == NULL) ? "invalid" : "inactive",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("REPLY MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+               return ENOENT;            /* +ve: OK but no match */
+       }
+
+       LASSERT (md->md_offset == 0);
+
+       rlength = hdr->payload_length;
+       mlength = MIN(rlength, (int)md->md_length);
+
+       if (mlength < rlength &&
+           (md->md_options & LNET_MD_TRUNCATE) == 0) {
+               CNETERR("%s: Dropping REPLY from %s length %d "
+                       "for MD "LPX64" would overflow (%d)\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                       rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       mlength);
+               lnet_res_unlock(cpt);
+               return ENOENT;    /* +ve: OK but no match */
+       }
+
+       CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+              mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+       lnet_msg_attach_md(msg, md, 0, mlength);
+
+       if (mlength != 0)
+               lnet_setpayloadbuffer(msg);
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+       lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+       return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       lnet_hdr_t       *hdr = &msg->msg_hdr;
+       lnet_process_id_t src = {0};
+       lnet_libmd_t     *md;
+       int                     cpt;
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       /* Convert ack fields to host byte order */
+       hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+       hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+       cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+       lnet_res_lock(cpt);
+
+       /* NB handles only looked up by creator (no flips) */
+       md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               /* Don't moan; this is expected */
+               CDEBUG(D_NET,
+                      "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+                      libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                      (md == NULL) ? "invalid" : "inactive",
+                      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                      hdr->msg.ack.dst_wmd.wh_object_cookie);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("Source MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+               return ENOENT;            /* +ve! */
+       }
+
+       CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+              hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+       lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+       return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+       int     rc = 0;
+
+       if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+           lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+               if (ni->ni_lnd->lnd_eager_recv == NULL) {
+                       msg->msg_rx_ready_delay = 1;
+               } else {
+                       lnet_net_unlock(msg->msg_rx_cpt);
+                       rc = lnet_ni_eager_recv(ni, msg);
+                       lnet_net_lock(msg->msg_rx_cpt);
+               }
+       }
+
+       if (rc == 0)
+               rc = lnet_post_routed_recv_locked(msg, 0);
+       return rc;
+}
+
+char *
+lnet_msgtyp2str(int type)
+{
+       switch (type) {
+       case LNET_MSG_ACK:
+               return "ACK";
+       case LNET_MSG_PUT:
+               return "PUT";
+       case LNET_MSG_GET:
+               return "GET";
+       case LNET_MSG_REPLY:
+               return "REPLY";
+       case LNET_MSG_HELLO:
+               return "HELLO";
+       default:
+               return "<UNKNOWN>";
+       }
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
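+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * lnet_msgtyp2str() applied to a header fresh off the wire, whose type
+ * field is still in little-endian byte order and must be swapped first,
+ * as lnet_parse() below does.
+ */
+static inline void example_log_msg_type(lnet_hdr_t *hdr)
+{
+       __u32 type = le32_to_cpu(hdr->type);    /* wire format is LE */
+
+       CDEBUG(D_NET, "received %s message\n", lnet_msgtyp2str(type));
+}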
+
+void
+lnet_print_hdr(lnet_hdr_t *hdr)
+{
+       lnet_process_id_t src = {0};
+       lnet_process_id_t dst = {0};
+       char *type_str = lnet_msgtyp2str(hdr->type);
+
+       src.nid = hdr->src_nid;
+       src.pid = hdr->src_pid;
+
+       dst.nid = hdr->dest_nid;
+       dst.pid = hdr->dest_pid;
+
+       CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+       CWARN("    From %s\n", libcfs_id2str(src));
+       CWARN("    To   %s\n", libcfs_id2str(dst));
+
+       switch (hdr->type) {
+       default:
+               break;
+
+       case LNET_MSG_PUT:
+               CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+                     "match bits "LPU64"\n",
+                     hdr->msg.put.ptl_index,
+                     hdr->msg.put.ack_wmd.wh_interface_cookie,
+                     hdr->msg.put.ack_wmd.wh_object_cookie,
+                     hdr->msg.put.match_bits);
+               CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+                     hdr->payload_length, hdr->msg.put.offset,
+                     hdr->msg.put.hdr_data);
+               break;
+
+       case LNET_MSG_GET:
+               CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+                     "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+                     hdr->msg.get.return_wmd.wh_interface_cookie,
+                     hdr->msg.get.return_wmd.wh_object_cookie,
+                     hdr->msg.get.match_bits);
+               CWARN("    Length %d, src offset %d\n",
+                     hdr->msg.get.sink_length,
+                     hdr->msg.get.src_offset);
+               break;
+
+       case LNET_MSG_ACK:
+               CWARN("    dst md "LPX64"."LPX64", "
+                     "manipulated length %d\n",
+                     hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                     hdr->msg.ack.dst_wmd.wh_object_cookie,
+                     hdr->msg.ack.mlength);
+               break;
+
+       case LNET_MSG_REPLY:
+               CWARN("    dst md "LPX64"."LPX64", "
+                     "length %d\n",
+                     hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                     hdr->msg.reply.dst_wmd.wh_object_cookie,
+                     hdr->payload_length);
+       }
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+          void *private, int rdma_req)
+{
+       int             rc = 0;
+       int             cpt;
+       int             for_me;
+       struct lnet_msg *msg;
+       lnet_pid_t     dest_pid;
+       lnet_nid_t     dest_nid;
+       lnet_nid_t     src_nid;
+       __u32     payload_length;
+       __u32     type;
+
+       LASSERT(!in_interrupt());
+
+       type = le32_to_cpu(hdr->type);
+       src_nid = le64_to_cpu(hdr->src_nid);
+       dest_nid = le64_to_cpu(hdr->dest_nid);
+       dest_pid = le32_to_cpu(hdr->dest_pid);
+       payload_length = le32_to_cpu(hdr->payload_length);
+
+       for_me = (ni->ni_nid == dest_nid);
+       cpt = lnet_cpt_of_nid(from_nid);
+
+       switch (type) {
+       case LNET_MSG_ACK:
+       case LNET_MSG_GET:
+               if (payload_length > 0) {
+                       CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+                              libcfs_nid2str(from_nid),
+                              libcfs_nid2str(src_nid),
+                              lnet_msgtyp2str(type), payload_length);
+                       return -EPROTO;
+               }
+               break;
+
+       case LNET_MSG_PUT:
+       case LNET_MSG_REPLY:
+               if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+                       CERROR("%s, src %s: bad %s payload %d "
+                              "(%d max expected)\n",
+                              libcfs_nid2str(from_nid),
+                              libcfs_nid2str(src_nid),
+                              lnet_msgtyp2str(type),
+                              payload_length,
+                              for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+                       return -EPROTO;
+               }
+               break;
+
+       default:
+               CERROR("%s, src %s: Bad message type 0x%x\n",
+                      libcfs_nid2str(from_nid),
+                      libcfs_nid2str(src_nid), type);
+               return -EPROTO;
+       }
+
+       if (the_lnet.ln_routing &&
+           ni->ni_last_alive != cfs_time_current_sec()) {
+               lnet_ni_lock(ni);
+
+               /* NB: so far this is the only place that sets an NI's
+                * status to "up" */
+               ni->ni_last_alive = cfs_time_current_sec();
+               if (ni->ni_status != NULL &&
+                   ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+                       ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+               lnet_ni_unlock(ni);
+       }
+
+       /* Regard a bad destination NID as a protocol error.  Senders should
+        * know what they're doing; if they don't they're misconfigured, buggy
+        * or malicious so we chop them off at the knees :) */
+
+       if (!for_me) {
+               if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+                       /* should have gone direct */
+                       CERROR ("%s, src %s: Bad dest nid %s "
+                               "(should have been sent direct)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (lnet_islocalnid(dest_nid)) {
+                       /* dest is another local NI; sender should have used
+                        * this node's NID on its own network */
+                       CERROR ("%s, src %s: Bad dest nid %s "
+                               "(it's my nid but on a different network)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (rdma_req && type == LNET_MSG_GET) {
+                       CERROR ("%s, src %s: Bad optimized GET for %s "
+                               "(final destination must be me)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       return -EPROTO;
+               }
+
+               if (!the_lnet.ln_routing) {
+                       CERROR ("%s, src %s: Dropping message for %s "
+                               "(routing not enabled)\n",
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid),
+                               libcfs_nid2str(dest_nid));
+                       goto drop;
+               }
+       }
+
+       /* Message looks OK; we're not going to return an error, so we MUST
+        * call back lnd_recv() come what may... */
+
+       if (!list_empty(&the_lnet.ln_test_peers) &&     /* normally we don't */
+           fail_peer(src_nid, 0)) {                    /* shall we now? */
+               CERROR("%s, src %s: Dropping %s to simulate failure\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type));
+               goto drop;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("%s, src %s: Dropping %s (out of memory)\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type));
+               goto drop;
+       }
+
+       /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+       msg->msg_type = type;
+       msg->msg_private = private;
+       msg->msg_receiving = 1;
+       msg->msg_len = msg->msg_wanted = payload_length;
+       msg->msg_offset = 0;
+       msg->msg_hdr = *hdr;
+       /* for building message event */
+       msg->msg_from = from_nid;
+       if (!for_me) {
+               msg->msg_target.pid     = dest_pid;
+               msg->msg_target.nid     = dest_nid;
+               msg->msg_routing        = 1;
+
+       } else {
+               /* convert common msg->hdr fields to host byteorder */
+               msg->msg_hdr.type       = type;
+               msg->msg_hdr.src_nid    = src_nid;
+               msg->msg_hdr.src_pid    = le32_to_cpu(msg->msg_hdr.src_pid);
+               msg->msg_hdr.dest_nid   = dest_nid;
+               msg->msg_hdr.dest_pid   = dest_pid;
+               msg->msg_hdr.payload_length = payload_length;
+       }
+
+       lnet_net_lock(cpt);
+       rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+       if (rc != 0) {
+               lnet_net_unlock(cpt);
+               CERROR("%s, src %s: Dropping %s "
+                      "(error %d looking up sender)\n",
+                      libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                      lnet_msgtyp2str(type), rc);
+               lnet_msg_free(msg);
+               goto drop;
+       }
+
+       lnet_msg_commit(msg, cpt);
+
+       if (!for_me) {
+               rc = lnet_parse_forward_locked(ni, msg);
+               lnet_net_unlock(cpt);
+
+               if (rc < 0)
+                       goto free_drop;
+               if (rc == 0) {
+                       lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                                    0, payload_length, payload_length);
+               }
+               return 0;
+       }
+
+       lnet_net_unlock(cpt);
+
+       switch (type) {
+       case LNET_MSG_ACK:
+               rc = lnet_parse_ack(ni, msg);
+               break;
+       case LNET_MSG_PUT:
+               rc = lnet_parse_put(ni, msg);
+               break;
+       case LNET_MSG_GET:
+               rc = lnet_parse_get(ni, msg, rdma_req);
+               break;
+       case LNET_MSG_REPLY:
+               rc = lnet_parse_reply(ni, msg);
+               break;
+       default:
+               LASSERT(0);
+               rc = -EPROTO;
+               goto free_drop;  /* prevent an unused label if !kernel */
+       }
+
+       if (rc == 0)
+               return 0;
+
+       LASSERT(rc == ENOENT);
+
+ free_drop:
+       LASSERT(msg->msg_md == NULL);
+       lnet_finalize(ni, msg, rc);
+
+ drop:
+       lnet_drop_message(ni, cpt, private, payload_length);
+       return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
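+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * the shape of an LND receive path feeding lnet_parse().  The name
+ * example_lnd_rx and the 'rx_desc' cookie are hypothetical; a real LND
+ * passes its own per-message descriptor as 'private' and gets it back in
+ * its lnd_recv() callback, which lnet_parse() promises to invoke unless
+ * it returns an error.
+ */
+static int example_lnd_rx(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t sender,
+                         void *rx_desc)
+{
+       /* 0: this message was not received as an optimized (RDMA) GET */
+       return lnet_parse(ni, hdr, sender, rx_desc, 0);
+}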
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+       while (!list_empty(head)) {
+               lnet_process_id_t       id = {0};
+               lnet_msg_t              *msg;
+
+               msg = list_entry(head->next, lnet_msg_t, msg_list);
+               list_del(&msg->msg_list);
+
+               id.nid = msg->msg_hdr.src_nid;
+               id.pid = msg->msg_hdr.src_pid;
+
+               LASSERT(msg->msg_md == NULL);
+               LASSERT(msg->msg_rx_delayed);
+               LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+               CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+                     " offset %d length %d: %s\n",
+                     libcfs_id2str(id),
+                     msg->msg_hdr.msg.put.ptl_index,
+                     msg->msg_hdr.msg.put.match_bits,
+                     msg->msg_hdr.msg.put.offset,
+                     msg->msg_hdr.payload_length, reason);
+
+               /* NB I can't drop msg's ref on msg_rxpeer until after I've
+                * called lnet_drop_message(), so I just hang onto msg as well
+                * until that's done */
+
+               lnet_drop_message(msg->msg_rxpeer->lp_ni,
+                                 msg->msg_rxpeer->lp_cpt,
+                                 msg->msg_private, msg->msg_len);
+               /*
+                * NB: the message will not generate an event because it has
+                * no attached MD, but we still pass an error code so that
+                * lnet_msg_decommit() can skip the counter updates and
+                * other checks.
+                */
+               lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+       }
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+       while (!list_empty(head)) {
+               lnet_msg_t        *msg;
+               lnet_process_id_t  id;
+
+               msg = list_entry(head->next, lnet_msg_t, msg_list);
+               list_del(&msg->msg_list);
+
+               /* md won't disappear under me, since each msg
+                * holds a ref on it */
+
+               id.nid = msg->msg_hdr.src_nid;
+               id.pid = msg->msg_hdr.src_pid;
+
+               LASSERT(msg->msg_rx_delayed);
+               LASSERT(msg->msg_md != NULL);
+               LASSERT(msg->msg_rxpeer != NULL);
+               LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+               CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+                      "match "LPU64" offset %d length %d.\n",
+                       libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+                       msg->msg_hdr.msg.put.match_bits,
+                       msg->msg_hdr.msg.put.offset,
+                       msg->msg_hdr.payload_length);
+
+               lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+       }
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+       lnet_process_id_t target, unsigned int portal,
+       __u64 match_bits, unsigned int offset,
+       __u64 hdr_data)
+{
+       struct lnet_msg         *msg;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if (!list_empty(&the_lnet.ln_test_peers) &&     /* normally we don't */
+           fail_peer(target.nid, 1)) {                 /* shall we now? */
+               CERROR("Dropping PUT to %s: simulated failure\n",
+                      libcfs_id2str(target));
+               return -EIO;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+                      libcfs_id2str(target));
+               return -ENOMEM;
+       }
+       msg->msg_vmflush = !!memory_pressure_get();
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+                      match_bits, portal, libcfs_id2str(target),
+                      md == NULL ? -1 : md->md_threshold);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("Source MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+               lnet_res_unlock(cpt);
+
+               lnet_msg_free(msg);
+               return -ENOENT;
+       }
+
+       CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+       msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+       msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+       msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+       msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+       /* NB handles only looked up by creator (no flips) */
+       if (ack == LNET_ACK_REQ) {
+               msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+                       the_lnet.ln_interface_cookie;
+               msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+                       md->md_lh.lh_cookie;
+       } else {
+               msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+                       LNET_WIRE_HANDLE_COOKIE_NONE;
+               msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+                       LNET_WIRE_HANDLE_COOKIE_NONE;
+       }
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+       rc = lnet_send(self, msg, LNET_NID_ANY);
+       if (rc != 0) {
+               CNETERR( "Error sending PUT to %s: %d\n",
+                      libcfs_id2str(target), rc);
+               lnet_finalize (NULL, msg, rc);
+       }
+
+       /* completion will be signalled by an event */
+       return 0;
+}
+EXPORT_SYMBOL(LNetPut);
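+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * a minimal kernel-side PUT.  It assumes an event queue handle 'eqh'
+ * already created with LNetEQAlloc(), and uses LNetMDBind() to build the
+ * "free floating" MD that LNetPut() requires; the portal number and the
+ * match bits below are arbitrary example values.
+ */
+static int example_put(lnet_process_id_t peer, lnet_handle_eq_t eqh,
+                      void *buf, unsigned int len)
+{
+       lnet_md_t        md;
+       lnet_handle_md_t mdh;
+       int              rc;
+
+       memset(&md, 0, sizeof(md));
+       md.start     = buf;
+       md.length    = len;
+       md.threshold = 2;               /* one SEND + one ACK event */
+       md.eq_handle = eqh;
+
+       rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+       if (rc != 0)
+               return rc;
+
+       return LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, peer,
+                      10, 0x1ULL, 0, 0);
+}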
+
+lnet_msg_t *
+lnet_create_reply_msg(lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+       /* The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+        * returns a msg for the LND to pass to lnet_finalize() when the sink
+        * data has been received.
+        *
+        * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+        * lnet_finalize() is called on it, so the LND must call this first */
+
+       struct lnet_msg         *msg = lnet_msg_alloc();
+       struct lnet_libmd       *getmd = getmsg->msg_md;
+       lnet_process_id_t       peer_id = getmsg->msg_target;
+       int                     cpt;
+
+       LASSERT(!getmsg->msg_target_is_router);
+       LASSERT(!getmsg->msg_routing);
+
+       cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+       lnet_res_lock(cpt);
+
+       LASSERT(getmd->md_refcount > 0);
+
+       if (msg == NULL) {
+               CERROR("%s: Dropping REPLY from %s: can't allocate msg\n",
+                      libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+               lnet_res_unlock(cpt);   /* the drop path doesn't release it */
+               goto drop;
+       }
+
+       if (getmd->md_threshold == 0) {
+               CERROR("%s: Dropping REPLY from %s for inactive MD %p\n",
+                      libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+                      getmd);
+               lnet_res_unlock(cpt);
+               goto drop;
+       }
+
+       LASSERT(getmd->md_offset == 0);
+
+       CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+              libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+       /* setup information for lnet_build_msg_event */
+       msg->msg_from = peer_id.nid;
+       msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+       msg->msg_hdr.src_nid = peer_id.nid;
+       msg->msg_hdr.payload_length = getmd->md_length;
+       msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+       lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+       lnet_res_unlock(cpt);
+
+       cpt = lnet_cpt_of_nid(peer_id.nid);
+
+       lnet_net_lock(cpt);
+       lnet_msg_commit(msg, cpt);
+       lnet_net_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+       return msg;
+
+ drop:
+       cpt = lnet_cpt_of_nid(peer_id.nid);
+
+       lnet_net_lock(cpt);
+       the_lnet.ln_counters[cpt]->drop_count++;
+       the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+       lnet_net_unlock(cpt);
+
+       if (msg != NULL)
+               lnet_msg_free(msg);
+
+       return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+       /* Set the REPLY length, now that the RDMA that elides the REPLY
+        * message has completed and the length is known. */
+       LASSERT(reply != NULL);
+       LASSERT(reply->msg_type == LNET_MSG_GET);
+       LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY);
+
+       /* NB I trusted my peer to RDMA.  If she tells me she's written beyond
+        * the end of my buffer, I might as well be dead. */
+       LASSERT(len <= reply->msg_ev.mlength);
+
+       reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
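+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * the optimized-GET sequence an LND follows when it RDMAs the sink data
+ * directly and elides the REPLY message.  The example_rdma_read callback
+ * is hypothetical; the ordering is the contract described above: create
+ * the reply msg before finalizing the GET (which frees it), set the
+ * length once the RDMA completes, then finalize the reply msg to deliver
+ * LNET_EVENT_REPLY.
+ */
+static int example_optimized_get(lnet_ni_t *ni, lnet_msg_t *getmsg,
+                                int (*example_rdma_read)(void))
+{
+       lnet_msg_t *reply;
+       int         nob;
+
+       reply = lnet_create_reply_msg(ni, getmsg);  /* before finalize! */
+       if (reply == NULL)
+               return -ENOMEM;
+
+       lnet_finalize(ni, getmsg, 0);   /* GET sent; getmsg is freed */
+
+       nob = example_rdma_read();      /* returns bytes read or -errno */
+       if (nob < 0) {
+               lnet_finalize(ni, reply, nob);
+               return nob;
+       }
+
+       lnet_set_reply_msg_len(ni, reply, nob);
+       lnet_finalize(ni, reply, 0);    /* delivers LNET_EVENT_REPLY */
+       return 0;
+}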
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+       lnet_process_id_t target, unsigned int portal,
+       __u64 match_bits, unsigned int offset)
+{
+       struct lnet_msg         *msg;
+       struct lnet_libmd       *md;
+       int                     cpt;
+       int                     rc;
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       if (!list_empty(&the_lnet.ln_test_peers) &&     /* normally we don't */
+           fail_peer(target.nid, 1)) {                 /* shall we now? */
+               CERROR("Dropping GET to %s: simulated failure\n",
+                      libcfs_id2str(target));
+               return -EIO;
+       }
+
+       msg = lnet_msg_alloc();
+       if (msg == NULL) {
+               CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+                      libcfs_id2str(target));
+               return -ENOMEM;
+       }
+
+       cpt = lnet_cpt_of_cookie(mdh.cookie);
+       lnet_res_lock(cpt);
+
+       md = lnet_handle2md(&mdh);
+       if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+               CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+                      match_bits, portal, libcfs_id2str(target),
+                      md == NULL ? -1 : md->md_threshold);
+               if (md != NULL && md->md_me != NULL)
+                       CERROR("REPLY MD also attached to portal %d\n",
+                              md->md_me->me_portal);
+
+               lnet_res_unlock(cpt);
+
+               lnet_msg_free(msg);
+
+               return -ENOENT;
+       }
+
+       CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+       lnet_msg_attach_md(msg, md, 0, 0);
+
+       lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+       msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+       msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+       msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+       msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+       /* NB handles only looked up by creator (no flips) */
+       msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+               the_lnet.ln_interface_cookie;
+       msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+               md->md_lh.lh_cookie;
+
+       lnet_res_unlock(cpt);
+
+       lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+       rc = lnet_send(self, msg, LNET_NID_ANY);
+       if (rc < 0) {
+               CNETERR( "Error sending GET to %s: %d\n",
+                      libcfs_id2str(target), rc);
+               lnet_finalize (NULL, msg, rc);
+       }
+
+       /* completion will be signalled by an event */
+       return 0;
+}
+EXPORT_SYMBOL(LNetGet);
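+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * the GET counterpart of example_put() above, fetching up to 'len' bytes
+ * from the remote MD into 'buf'.  Threshold 2 leaves room for the SEND
+ * and REPLY events; 'eqh' and the portal/match-bits values are again
+ * example assumptions.
+ */
+static int example_get(lnet_process_id_t peer, lnet_handle_eq_t eqh,
+                      void *buf, unsigned int len)
+{
+       lnet_md_t        md;
+       lnet_handle_md_t mdh;
+       int              rc;
+
+       memset(&md, 0, sizeof(md));
+       md.start     = buf;
+       md.length    = len;
+       md.threshold = 2;               /* one SEND + one REPLY event */
+       md.eq_handle = eqh;
+
+       rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+       if (rc != 0)
+               return rc;
+
+       return LNetGet(LNET_NID_ANY, mdh, peer, 10, 0x1ULL, 0);
+}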
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+       struct list_head                *e;
+       struct lnet_ni          *ni;
+       lnet_remotenet_t        *rnet;
+       __u32                   dstnet = LNET_NIDNET(dstnid);
+       int                     hops;
+       int                     cpt;
+       __u32                   order = 2;
+       struct list_head                *rn_list;
+
+       /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+        * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+        * keep order 0 free for 0@lo and order 1 free for a local NID
+        * match */
+
+       LASSERT(the_lnet.ln_init);
+       LASSERT(the_lnet.ln_refcount > 0);
+
+       cpt = lnet_net_lock_current();
+
+       list_for_each(e, &the_lnet.ln_nis) {
+               ni = list_entry(e, lnet_ni_t, ni_list);
+
+               if (ni->ni_nid == dstnid) {
+                       if (srcnidp != NULL)
+                               *srcnidp = dstnid;
+                       if (orderp != NULL) {
+                               if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+                                       *orderp = 0;
+                               else
+                                       *orderp = 1;
+                       }
+                       lnet_net_unlock(cpt);
+
+                       return local_nid_dist_zero ? 0 : 1;
+               }
+
+               if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+                       if (srcnidp != NULL)
+                               *srcnidp = ni->ni_nid;
+                       if (orderp != NULL)
+                               *orderp = order;
+                       lnet_net_unlock(cpt);
+                       return 1;
+               }
+
+               order++;
+       }
+
+       rn_list = lnet_net2rnethash(dstnet);
+       list_for_each(e, rn_list) {
+               rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+               if (rnet->lrn_net == dstnet) {
+                       lnet_route_t *route;
+                       lnet_route_t *shortest = NULL;
+
+                       LASSERT(!list_empty(&rnet->lrn_routes));
+
+                       list_for_each_entry(route, &rnet->lrn_routes,
+                                               lr_list) {
+                               if (shortest == NULL ||
+                                   route->lr_hops < shortest->lr_hops)
+                                       shortest = route;
+                       }
+
+                       LASSERT(shortest != NULL);
+                       hops = shortest->lr_hops;
+                       if (srcnidp != NULL)
+                               *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+                       if (orderp != NULL)
+                               *orderp = order;
+                       lnet_net_unlock(cpt);
+                       return hops + 1;
+               }
+               order++;
+       }
+
+       lnet_net_unlock(cpt);
+       return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
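+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * interpreting LNetDist() results per the return-value conventions above
+ * (0 for a local NID when local_nid_dist_zero is set, hops + 1 for a
+ * routed destination, -EHOSTUNREACH otherwise).
+ */
+static void example_dist(lnet_nid_t nid)
+{
+       lnet_nid_t src;
+       __u32      order;
+       int        dist = LNetDist(nid, &src, &order);
+
+       if (dist < 0)
+               CDEBUG(D_NET, "%s: unreachable\n", libcfs_nid2str(nid));
+       else
+               CDEBUG(D_NET, "distance to %s: %d via %s (order %u)\n",
+                      libcfs_nid2str(nid), dist, libcfs_nid2str(src),
+                      order);
+}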
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from the kernel.
+ *
+ * Asynchronous messages are those that can arrive from a target while the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be received eagerly even when the user process is not
+ * running inside LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+       return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644 (file)
index 0000000..8f3a50b
--- /dev/null
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev)
+{
+       ENTRY;
+
+       memset(ev, 0, sizeof(*ev));
+
+       ev->status   = 0;
+       ev->unlinked = 1;
+       ev->type     = LNET_EVENT_UNLINK;
+       lnet_md_deconstruct(md, &ev->md);
+       lnet_md2handle(&ev->md_handle, md);
+       EXIT;
+}
+
+/*
+ * No lock needed; must be called after lnet_msg_commit()
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+       lnet_hdr_t      *hdr = &msg->msg_hdr;
+       lnet_event_t    *ev  = &msg->msg_ev;
+
+       LASSERT(!msg->msg_routing);
+
+       ev->type = ev_type;
+
+       if (ev_type == LNET_EVENT_SEND) {
+               /* event for active message */
+               ev->target.nid    = le64_to_cpu(hdr->dest_nid);
+               ev->target.pid    = le32_to_cpu(hdr->dest_pid);
+               ev->initiator.nid = LNET_NID_ANY;
+               ev->initiator.pid = the_lnet.ln_pid;
+               ev->sender        = LNET_NID_ANY;
+
+       } else {
+               /* event for passive message */
+               ev->target.pid    = hdr->dest_pid;
+               ev->target.nid    = hdr->dest_nid;
+               ev->initiator.pid = hdr->src_pid;
+               ev->initiator.nid = hdr->src_nid;
+               ev->rlength       = hdr->payload_length;
+               ev->sender        = msg->msg_from;
+               ev->mlength       = msg->msg_wanted;
+               ev->offset        = msg->msg_offset;
+       }
+
+       switch (ev_type) {
+       default:
+               LBUG();
+
+       case LNET_EVENT_PUT: /* passive PUT */
+               ev->pt_index   = hdr->msg.put.ptl_index;
+               ev->match_bits = hdr->msg.put.match_bits;
+               ev->hdr_data   = hdr->msg.put.hdr_data;
+               return;
+
+       case LNET_EVENT_GET: /* passive GET */
+               ev->pt_index   = hdr->msg.get.ptl_index;
+               ev->match_bits = hdr->msg.get.match_bits;
+               ev->hdr_data   = 0;
+               return;
+
+       case LNET_EVENT_ACK: /* ACK */
+               ev->match_bits = hdr->msg.ack.match_bits;
+               ev->mlength    = hdr->msg.ack.mlength;
+               return;
+
+       case LNET_EVENT_REPLY: /* REPLY */
+               return;
+
+       case LNET_EVENT_SEND: /* active message */
+               if (msg->msg_type == LNET_MSG_PUT) {
+                       ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
+                       ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+                       ev->offset     = le32_to_cpu(hdr->msg.put.offset);
+                       ev->mlength    =
+                       ev->rlength    = le32_to_cpu(hdr->payload_length);
+                       ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);
+
+               } else {
+                       LASSERT(msg->msg_type == LNET_MSG_GET);
+                       ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
+                       ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+                       ev->mlength    =
+                       ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
+                       ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
+                       ev->hdr_data   = 0;
+               }
+               return;
+       }
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+       struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+       lnet_counters_t           *counters  = the_lnet.ln_counters[cpt];
+
+       /* routed message can be committed for both receiving and sending */
+       LASSERT(!msg->msg_tx_committed);
+
+       if (msg->msg_sending) {
+               LASSERT(!msg->msg_receiving);
+
+               msg->msg_tx_cpt = cpt;
+               msg->msg_tx_committed = 1;
+               if (msg->msg_rx_committed) { /* routed message REPLY */
+                       LASSERT(msg->msg_onactivelist);
+                       return;
+               }
+       } else {
+               LASSERT(!msg->msg_sending);
+               msg->msg_rx_cpt = cpt;
+               msg->msg_rx_committed = 1;
+       }
+
+       LASSERT(!msg->msg_onactivelist);
+       msg->msg_onactivelist = 1;
+       list_add(&msg->msg_activelist, &container->msc_active);
+
+       counters->msgs_alloc++;
+       if (counters->msgs_alloc > counters->msgs_max)
+               counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+       lnet_counters_t *counters;
+       lnet_event_t    *ev = &msg->msg_ev;
+
+       LASSERT(msg->msg_tx_committed);
+       if (status != 0)
+               goto out;
+
+       counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+       switch (ev->type) {
+       default: /* routed message */
+               LASSERT(msg->msg_routing);
+               LASSERT(msg->msg_rx_committed);
+               LASSERT(ev->type == 0);
+
+               counters->route_length += msg->msg_len;
+               counters->route_count++;
+               goto out;
+
+       case LNET_EVENT_PUT:
+               /* should have been decommitted */
+               LASSERT(!msg->msg_rx_committed);
+               /* overwritten while sending ACK */
+               LASSERT(msg->msg_type == LNET_MSG_ACK);
+               msg->msg_type = LNET_MSG_PUT; /* fix type */
+               break;
+
+       case LNET_EVENT_SEND:
+               LASSERT(!msg->msg_rx_committed);
+               if (msg->msg_type == LNET_MSG_PUT)
+                       counters->send_length += msg->msg_len;
+               break;
+
+       case LNET_EVENT_GET:
+               LASSERT(msg->msg_rx_committed);
+               /* overwritten while sending reply, we should never be
+                * here for optimized GET */
+               LASSERT(msg->msg_type == LNET_MSG_REPLY);
+               msg->msg_type = LNET_MSG_GET; /* fix type */
+               break;
+       }
+
+       counters->send_count++;
+ out:
+       lnet_return_tx_credits_locked(msg);
+       msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+       lnet_counters_t *counters;
+       lnet_event_t    *ev = &msg->msg_ev;
+
+       LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+       LASSERT(msg->msg_rx_committed);
+
+       if (status != 0)
+               goto out;
+
+       counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+       switch (ev->type) {
+       default:
+               LASSERT(ev->type == 0);
+               LASSERT(msg->msg_routing);
+               goto out;
+
+       case LNET_EVENT_ACK:
+               LASSERT(msg->msg_type == LNET_MSG_ACK);
+               break;
+
+       case LNET_EVENT_GET:
+               /* type is "REPLY" if it's an optimized GET on passive side,
+                * because optimized GET will never be committed for sending,
+                * so message type wouldn't be changed back to "GET" by
+                * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+               LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+                       msg->msg_type == LNET_MSG_GET);
+               counters->send_length += msg->msg_wanted;
+               break;
+
+       case LNET_EVENT_PUT:
+               LASSERT(msg->msg_type == LNET_MSG_PUT);
+               break;
+
+       case LNET_EVENT_REPLY:
+               /* type is "GET" if it's an optimized GET on active side,
+                * see details in lnet_create_reply_msg() */
+               LASSERT(msg->msg_type == LNET_MSG_GET ||
+                       msg->msg_type == LNET_MSG_REPLY);
+               break;
+       }
+
+       counters->recv_count++;
+       if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+               counters->recv_length += msg->msg_wanted;
+
+ out:
+       lnet_return_rx_credits_locked(msg);
+       msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+       int     cpt2 = cpt;
+
+       LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+       LASSERT(msg->msg_onactivelist);
+
+       if (msg->msg_tx_committed) { /* always decommit for sending first */
+               LASSERT(cpt == msg->msg_tx_cpt);
+               lnet_msg_decommit_tx(msg, status);
+       }
+
+       if (msg->msg_rx_committed) {
+               /* forwarding msg committed for both receiving and sending */
+               if (cpt != msg->msg_rx_cpt) {
+                       lnet_net_unlock(cpt);
+                       cpt2 = msg->msg_rx_cpt;
+                       lnet_net_lock(cpt2);
+               }
+               lnet_msg_decommit_rx(msg, status);
+       }
+
+       list_del(&msg->msg_activelist);
+       msg->msg_onactivelist = 0;
+
+       the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+       if (cpt2 != cpt) {
+               lnet_net_unlock(cpt2);
+               lnet_net_lock(cpt);
+       }
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+                  unsigned int offset, unsigned int mlen)
+{
+       /* NB: @offset and @mlen are only useful for receiving */
+       /* Here we attach the MD to the lnet_msg, mark it busy and
+        * decrement its threshold. Come what may, the lnet_msg "owns"
+        * the MD until a call to lnet_msg_detach_md() or lnet_finalize()
+        * signals completion. */
+       LASSERT(!msg->msg_routing);
+
+       msg->msg_md = md;
+       if (msg->msg_receiving) { /* committed for receiving */
+               msg->msg_offset = offset;
+               msg->msg_wanted = mlen;
+       }
+
+       md->md_refcount++;
+       if (md->md_threshold != LNET_MD_THRESH_INF) {
+               LASSERT(md->md_threshold > 0);
+               md->md_threshold--;
+       }
+
+       /* build umd in event */
+       lnet_md2handle(&msg->msg_ev.md_handle, md);
+       lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+       lnet_libmd_t    *md = msg->msg_md;
+       int             unlink;
+
+       /* Now it's safe to drop my caller's ref */
+       md->md_refcount--;
+       LASSERT(md->md_refcount >= 0);
+
+       unlink = lnet_md_unlinkable(md);
+       if (md->md_eq != NULL) {
+               msg->msg_ev.status   = status;
+               msg->msg_ev.unlinked = unlink;
+               lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+       }
+
+       if (unlink)
+               lnet_md_unlink(md);
+
+       msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+       lnet_handle_wire_t ack_wmd;
+       int             rc;
+       int             status = msg->msg_ev.status;
+
+       LASSERT(msg->msg_onactivelist);
+
+       if (status == 0 && msg->msg_ack) {
+               /* Only send an ACK if the PUT completed successfully */
+
+               lnet_msg_decommit(msg, cpt, 0);
+
+               msg->msg_ack = 0;
+               lnet_net_unlock(cpt);
+
+               LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+               LASSERT(!msg->msg_routing);
+
+               ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+               lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+               msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+               msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+               msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+               /* NB: we probably want to use the NID of msg::msg_from as the
+                * 3rd parameter (router NID) if it's a routed message */
+               rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+               lnet_net_lock(cpt);
+               /*
+                * NB: message is committed for sending, we should return
+                * on success because LND will finalize this message later.
+                *
+                * Also, there is possibility that message is commited for
+                * sending and also failed before delivering to LND,
+                * i.e: ENOMEM, in that case we can't fall through either
+                * because CPT for sending can be different with CPT for
+                * receiving, so we should return back to lnet_finalize()
+                * to make sure we are locking the correct partition.
+                */
+               return rc;
+
+       } else if (status == 0 &&       /* OK so far */
+                  (msg->msg_routing && !msg->msg_sending)) {
+               /* not forwarded */
+               LASSERT(!msg->msg_receiving);   /* called back recv already */
+               lnet_net_unlock(cpt);
+
+               rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+               lnet_net_lock(cpt);
+               /*
+                * NB: the message is committed for sending, so we should
+                * return on success because the LND will finalize it later.
+                *
+                * It is also possible that the message was committed for
+                * sending but failed before reaching the LND (e.g. ENOMEM);
+                * in that case we can't fall through either:
+                * - the rule is that a message committed for both sending
+                *   and receiving must decommit for sending first
+                * - the CPT for sending can differ from the CPT for
+                *   receiving, so we must return to lnet_finalize() to
+                *   make sure we are locking the correct partition.
+                */
+               return rc;
+       }
+
+       lnet_msg_decommit(msg, cpt, status);
+       lnet_msg_free_locked(msg);
+       return 0;
+}
+
+void
+lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+       struct lnet_msg_container       *container;
+       int                             my_slot;
+       int                             cpt;
+       int                             rc;
+       int                             i;
+
+       LASSERT(!in_interrupt());
+
+       if (msg == NULL)
+               return;
+#if 0
+       CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+              lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+              msg->msg_target_is_router ? "t" : "",
+              msg->msg_routing ? "X" : "",
+              msg->msg_ack ? "A" : "",
+              msg->msg_sending ? "S" : "",
+              msg->msg_receiving ? "R" : "",
+              msg->msg_delayed ? "d" : "",
+              msg->msg_txcredit ? "C" : "",
+              msg->msg_peertxcredit ? "c" : "",
+              msg->msg_rtrcredit ? "F" : "",
+              msg->msg_peerrtrcredit ? "f" : "",
+              msg->msg_onactivelist ? "!" : "",
+              msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+              msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+       msg->msg_ev.status = status;
+
+       if (msg->msg_md != NULL) {
+               cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+               lnet_res_lock(cpt);
+               lnet_msg_detach_md(msg, status);
+               lnet_res_unlock(cpt);
+       }
+
+ again:
+       rc = 0;
+       if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+               /* not committed to the network yet */
+               LASSERT(!msg->msg_onactivelist);
+               lnet_msg_free(msg);
+               return;
+       }
+
+       /*
+        * NB: a routed message can be committed for both receiving and
+        * sending; we must finalize in LIFO order to keep the counters
+        * correct (finalize sending first, then finalize receiving).
+        */
+       cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+       lnet_net_lock(cpt);
+
+       container = the_lnet.ln_msg_containers[cpt];
+       list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+       /* Recursion breaker.  Don't complete the message here if I am (or
+        * enough other threads are) already completing messages */
+
+       my_slot = -1;
+       for (i = 0; i < container->msc_nfinalizers; i++) {
+               if (container->msc_finalizers[i] == current)
+                       break;
+
+               if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+                       my_slot = i;
+       }
+
+       if (i < container->msc_nfinalizers || my_slot < 0) {
+               lnet_net_unlock(cpt);
+               return;
+       }
+
+       container->msc_finalizers[my_slot] = current;
+
+       while (!list_empty(&container->msc_finalizing)) {
+               msg = list_entry(container->msc_finalizing.next,
+                                    lnet_msg_t, msg_list);
+
+               list_del(&msg->msg_list);
+
+               /* NB drops and regains the lnet lock if it actually does
+                * anything, so my finalizing friends can chomp along too */
+               rc = lnet_complete_msg_locked(msg, cpt);
+               if (rc != 0)
+                       break;
+       }
+
+       container->msc_finalizers[my_slot] = NULL;
+       lnet_net_unlock(cpt);
+
+       if (rc != 0)
+               goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
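+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * where lnet_finalize() sits in an LND's transmit-completion path.  The
+ * example_tx descriptor is hypothetical; the NULL-message tolerance shown
+ * here is guaranteed by the early return at the top of lnet_finalize().
+ */
+struct example_tx {
+       lnet_msg_t *tx_lntmsg;          /* message being sent, if any */
+};
+
+static void example_tx_done(lnet_ni_t *ni, struct example_tx *tx, int status)
+{
+       lnet_msg_t *msg = tx->tx_lntmsg;
+
+       tx->tx_lntmsg = NULL;
+       lnet_finalize(ni, msg, status); /* no-op if msg == NULL */
+}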
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+       int     count = 0;
+
+       if (container->msc_init == 0)
+               return;
+
+       while (!list_empty(&container->msc_active)) {
+               lnet_msg_t *msg = list_entry(container->msc_active.next,
+                                                lnet_msg_t, msg_activelist);
+
+               LASSERT(msg->msg_onactivelist);
+               msg->msg_onactivelist = 0;
+               list_del(&msg->msg_activelist);
+               lnet_msg_free(msg);
+               count++;
+       }
+
+       if (count > 0)
+               CERROR("%d active msg on exit\n", count);
+
+       if (container->msc_finalizers != NULL) {
+               LIBCFS_FREE(container->msc_finalizers,
+                           container->msc_nfinalizers *
+                           sizeof(*container->msc_finalizers));
+               container->msc_finalizers = NULL;
+       }
+#ifdef LNET_USE_LIB_FREELIST
+       lnet_freelist_fini(&container->msc_freelist);
+#endif
+       container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+       int     rc;
+
+       container->msc_init = 1;
+
+       INIT_LIST_HEAD(&container->msc_active);
+       INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+       memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+       rc = lnet_freelist_init(&container->msc_freelist,
+                               LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+       if (rc != 0) {
+               CERROR("Failed to init freelist for message container\n");
+               lnet_msg_container_cleanup(container);
+               return rc;
+       }
+#else
+       rc = 0;
+#endif
+       /* one finalizer per CPU in this CPT */
+       container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+       LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+                        container->msc_nfinalizers *
+                        sizeof(*container->msc_finalizers));
+
+       if (container->msc_finalizers == NULL) {
+               CERROR("Failed to allocate message finalizers\n");
+               lnet_msg_container_cleanup(container);
+               return -ENOMEM;
+       }
+
+       return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+       struct lnet_msg_container *container;
+       int     i;
+
+       if (the_lnet.ln_msg_containers == NULL)
+               return;
+
+       cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+               lnet_msg_container_cleanup(container);
+
+       cfs_percpt_free(the_lnet.ln_msg_containers);
+       the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+       struct lnet_msg_container *container;
+       int     rc;
+       int     i;
+
+       the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+                                                     sizeof(*container));
+
+       if (the_lnet.ln_msg_containers == NULL) {
+               CERROR("Failed to allocate cpu-partition data for network\n");
+               return -ENOMEM;
+       }
+
+       cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+               rc = lnet_msg_container_setup(container, i);
+               if (rc != 0) {
+                       lnet_msg_containers_destroy();
+                       return rc;
+               }
+       }
+
+       return 0;
+}
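+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * the pairing assumed by the container code above; containers are created
+ * once at startup and destroyed only after every message has been
+ * finalized, since cleanup complains about (and frees) any still-active
+ * messages.
+ */
+static int example_msg_container_lifecycle(void)
+{
+       int rc = lnet_msg_containers_create();
+
+       if (rc != 0)
+               return rc;
+
+       /* ... allocate, commit and finalize messages ... */
+
+       lnet_msg_containers_destroy();
+       return 0;
+}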
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644 (file)
index 0000000..9b9e7d3
--- /dev/null
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+int    portal_rotor    = LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+               "redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+                   __u64 mbits, __u64 ignore_bits)
+{
+       struct lnet_portal      *ptl = the_lnet.ln_portals[index];
+       int                     unique;
+
+       unique = ignore_bits == 0 &&
+                match_id.nid != LNET_NID_ANY &&
+                match_id.pid != LNET_PID_ANY;
+
+       LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+       /* prefer to check w/o any lock */
+       if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+               goto match;
+
+       /* unset, new portal */
+       lnet_ptl_lock(ptl);
+       /* check again with lock */
+       if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+               lnet_ptl_unlock(ptl);
+               goto match;
+       }
+
+       /* still not set */
+       if (unique)
+               lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+       else
+               lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+       lnet_ptl_unlock(ptl);
+
+       return 1;
+
+ match:
+       if ((lnet_ptl_is_unique(ptl) && !unique) ||
+           (lnet_ptl_is_wildcard(ptl) && unique))
+               return 0;
+       return 1;
+}
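+
+/*
+ * Editor's note, an illustrative sketch (not part of the original patch):
+ * the "unique" test above, restated as a predicate.  A portal is typed
+ * lazily by its first match attempt: a fully-specified peer with no
+ * ignore bits makes it LNET_PTL_MATCH_UNIQUE, anything else makes it
+ * LNET_PTL_MATCH_WILDCARD, and later attempts of the other kind fail.
+ */
+static inline int example_match_is_unique(lnet_process_id_t match_id,
+                                         __u64 ignore_bits)
+{
+       return ignore_bits == 0 &&
+              match_id.nid != LNET_NID_ANY &&
+              match_id.pid != LNET_PID_ANY;
+}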
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+       struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+       int                     i;
+
+       /* called with both lnet_res_lock(cpt) and lnet_ptl_lock held */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       mtable->mt_enabled = 1;
+
+       ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+       for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+               LASSERT(ptl->ptl_mt_maps[i] != cpt);
+               if (ptl->ptl_mt_maps[i] < cpt)
+                       break;
+
+               /* swap to keep the map sorted */
+               ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+               ptl->ptl_mt_maps[i] = cpt;
+       }
+
+       ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+       struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+       int                     i;
+
+       /* called with both lnet_res_lock(cpt) and lnet_ptl_lock held */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       if (LNET_CPT_NUMBER == 1)
+               return; /* never disable the only match-table */
+
+       mtable->mt_enabled = 0;
+
+       LASSERT(ptl->ptl_mt_nmaps > 0 &&
+               ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+       /* remove it from mt_maps */
+       ptl->ptl_mt_nmaps--;
+       for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+               if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+                       ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+       }
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+                 struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       /* ALWAYS called holding the lnet_res_lock, and must not drop it
+        * via lnet_res_unlock(); lnet_match_blocked_msg() relies on this
+        * to avoid races */
+       unsigned int    offset;
+       unsigned int    mlength;
+       lnet_me_t       *me = md->md_me;
+
+       /* MD exhausted */
+       if (lnet_md_exhausted(md))
+               return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+       /* mismatched MD op */
+       if ((md->md_options & info->mi_opc) == 0)
+               return LNET_MATCHMD_NONE;
+
+       /* mismatched ME nid/pid? */
+       if (me->me_match_id.nid != LNET_NID_ANY &&
+           me->me_match_id.nid != info->mi_id.nid)
+               return LNET_MATCHMD_NONE;
+
+       if (me->me_match_id.pid != LNET_PID_ANY &&
+           me->me_match_id.pid != info->mi_id.pid)
+               return LNET_MATCHMD_NONE;
+
+       /* mismatched ME matchbits? */
+       if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+               return LNET_MATCHMD_NONE;
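+       /* e.g. (illustrative values) me_match_bits = 0xff00 with
+        * me_ignore_bits = 0x00ff matches any incoming mbits in
+        * 0xff00..0xffff, since XOR differences confined to ignored
+        * bits are masked out */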
+
+       /* Hurrah! This _is_ a match; check it out... */
+
+       if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+               offset = md->md_offset;
+       else
+               offset = info->mi_roffset;
+
+       if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+               mlength = md->md_max_size;
+               LASSERT(md->md_offset + mlength <= md->md_length);
+       } else {
+               mlength = md->md_length - offset;
+       }
+
+       if (info->mi_rlength <= mlength) {      /* fits in allowed space */
+               mlength = info->mi_rlength;
+       } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+               /* this packet _really_ is too big */
+               CERROR("Matching packet from %s, match "LPU64
+                      " length %d too big: %d left, %d allowed\n",
+                      libcfs_id2str(info->mi_id), info->mi_mbits,
+                      info->mi_rlength, md->md_length - offset, mlength);
+
+               return LNET_MATCHMD_DROP;
+       }
+
+       /* Commit to this ME/MD */
+       CDEBUG(D_NET, "Incoming %s index %x from %s of "
+              "length %d/%d into md "LPX64" [%d] + %d\n",
+              (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+              info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+              info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+       lnet_msg_attach_md(msg, md, offset, mlength);
+       md->md_offset = offset + mlength;
+
+       if (!lnet_md_exhausted(md))
+               return LNET_MATCHMD_OK;
+
+       /* Auto-unlink NOW, so the ME gets unlinked if required.
+        * lnet_msg_attach_md() bumped md->md_refcount above, so the MD just
+        * gets flagged for unlink when it is finalized. */
+       if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+               lnet_md_unlink(md);
+
+       return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
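+
+/* NB: lnet_try_match_md() returns a small bitmask: LNET_MATCHMD_OK and
+ * LNET_MATCHMD_DROP (both covered by LNET_MATCHMD_FINISH) end the search,
+ * LNET_MATCHMD_NONE keeps it going, and LNET_MATCHMD_EXHAUSTED may be
+ * OR'ed into any of them so the caller can retire the ME list or the
+ * whole match-table in the same pass. */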
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+       if (LNET_CPT_NUMBER == 1)
+               return ptl->ptl_mtables[0]; /* the only one */
+
+       /* if it's a unique portal, return match-table hashed by NID */
+       return lnet_ptl_is_unique(ptl) ?
+              ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+                 __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+       struct lnet_portal      *ptl;
+       struct lnet_match_table *mtable;
+
+       /* NB: called w/o lock */
+       LASSERT(index < the_lnet.ln_nportals);
+
+       if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+               return NULL;
+
+       ptl = the_lnet.ln_portals[index];
+
+       mtable = lnet_match2mt(ptl, id, mbits);
+       if (mtable != NULL) /* unique portal or only one match-table */
+               return mtable;
+
+       /* it's a wildcard portal */
+       switch (pos) {
+       default:
+               return NULL;
+       case LNET_INS_BEFORE:
+       case LNET_INS_AFTER:
+               /* posted by a thread with no CPT affinity; always hash to
+                * a specific match-table to avoid buffer stealing, which
+                * is expensive */
+               return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+       case LNET_INS_LOCAL:
+               /* posted by cpu-affinity thread */
+               return ptl->ptl_mtables[lnet_cpt_current()];
+       }
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_portal      *ptl;
+       int                     nmaps;
+       int                     rotor;
+       int                     routed;
+       int                     cpt;
+
+       /* NB: called w/o lock */
+       LASSERT(info->mi_portal < the_lnet.ln_nportals);
+       ptl = the_lnet.ln_portals[info->mi_portal];
+
+       LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+       mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+       if (mtable != NULL)
+               return mtable;
+
+       /* it's a wildcard portal */
+       routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+                LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+       if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+           (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+               cpt = lnet_cpt_current();
+               if (ptl->ptl_mtables[cpt]->mt_enabled)
+                       return ptl->ptl_mtables[cpt];
+       }
+
+       rotor = ptl->ptl_rotor++; /* get round-robin factor */
+       if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+               cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+       else
+               cpt = rotor % LNET_CPT_NUMBER;
+
+       if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+               /* is there any active entry for this portal? */
+               nmaps = ptl->ptl_mt_nmaps;
+               /* map to an active mtable to avoid heavy "stealing" */
+               if (nmaps != 0) {
+                       /* NB: ptl_mt_maps can change under us here since
+                        * we don't hold lnet_ptl_lock, but a stale map
+                        * shouldn't hurt anything */
+                       cpt = ptl->ptl_mt_maps[rotor % nmaps];
+               }
+       }
+
+       return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+       __u64   *bmap;
+       int     i;
+
+       if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+               return 0;
+
+       if (pos < 0) { /* check all bits */
+               for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+                       if (mtable->mt_exhausted[i] != (__u64)(-1))
+                               return 0;
+               }
+               return 1;
+       }
+
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+       /* check whether mtable::mt_mhash[pos] is marked as exhausted */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+       return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+       __u64   *bmap;
+
+       LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+       LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+       /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+       bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+       pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+       if (!exhausted)
+               *bmap &= ~(1ULL << pos);
+       else
+               *bmap |= 1ULL << pos;
+}
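+
+/* The exhausted map packs one bit per hash bucket: assuming
+ * LNET_MT_BITS_U64 == 6 (i.e. 64-bit words), bucket 130 lives in word
+ * 130 >> 6 == 2 at bit 130 & 63 == 2. */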
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+                  lnet_process_id_t id, __u64 mbits)
+{
+       struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+       if (lnet_ptl_is_wildcard(ptl)) {
+               return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+       } else {
+               unsigned long hash = mbits + id.nid + id.pid;
+
+               LASSERT(lnet_ptl_is_unique(ptl));
+               hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+               return &mtable->mt_mhash[hash];
+       }
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+                struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct list_head                *head;
+       lnet_me_t               *me;
+       lnet_me_t               *tmp;
+       int                     exhausted = 0;
+       int                     rc;
+
+       /* any ME with ignore bits? */
+       if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+               head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+       else
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+       /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+       if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+               exhausted = LNET_MATCHMD_EXHAUSTED;
+
+       list_for_each_entry_safe(me, tmp, head, me_list) {
+               /* ME attached but MD not attached yet */
+               if (me->me_md == NULL)
+                       continue;
+
+               LASSERT(me == me->me_md->md_me);
+
+               rc = lnet_try_match_md(me->me_md, info, msg);
+               if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+                       exhausted = 0; /* mlist is not empty */
+
+               if ((rc & LNET_MATCHMD_FINISH) != 0) {
+                       /* don't return EXHAUSTED bit because we don't know
+                        * whether the mlist is empty or not */
+                       return rc & ~LNET_MATCHMD_EXHAUSTED;
+               }
+       }
+
+       if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+               lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+               if (!lnet_mt_test_exhausted(mtable, -1))
+                       exhausted = 0;
+       }
+
+       if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+               head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+               goto again; /* re-check MEs w/o ignore-bits */
+       }
+
+       if (info->mi_opc == LNET_MD_OP_GET ||
+           !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+               return LNET_MATCHMD_DROP | exhausted;
+
+       return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+       int     rc;
+
+       /* the message arrived before any MD was posted on this portal;
+        * simply delay or drop it */
+       if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+               return 0;
+
+       lnet_ptl_lock(ptl);
+       /* check again with the lock held */
+       if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+               lnet_ptl_unlock(ptl);
+               return 0;
+       }
+
+       if (lnet_ptl_is_lazy(ptl)) {
+               if (msg->msg_rx_ready_delay) {
+                       msg->msg_rx_delayed = 1;
+                       list_add_tail(&msg->msg_list,
+                                         &ptl->ptl_msg_delayed);
+               }
+               rc = LNET_MATCHMD_NONE;
+       } else {
+               rc = LNET_MATCHMD_DROP;
+       }
+
+       lnet_ptl_unlock(ptl);
+       return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+                    struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       int     first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+       int     rc = 0;
+       int     i;
+
+       /* steal a buffer from other CPTs, and delay the message if there
+        * is nothing to steal; this function is more expensive than a
+        * regular match, but we don't expect it to happen often */
+       LASSERT(lnet_ptl_is_wildcard(ptl));
+
+       for (i = 0; i < LNET_CPT_NUMBER; i++) {
+               struct lnet_match_table *mtable;
+               int                     cpt;
+
+               cpt = (first + i) % LNET_CPT_NUMBER;
+               mtable = ptl->ptl_mtables[cpt];
+               if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+                       continue;
+
+               lnet_res_lock(cpt);
+               lnet_ptl_lock(ptl);
+
+               if (i == 0) { /* the first try, attach on stealing list */
+                       list_add_tail(&msg->msg_list,
+                                         &ptl->ptl_msg_stealing);
+               }
+
+               if (!list_empty(&msg->msg_list)) { /* on stealing list */
+                       rc = lnet_mt_match_md(mtable, info, msg);
+
+                       if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+                           mtable->mt_enabled)
+                               lnet_ptl_disable_mt(ptl, cpt);
+
+                       if ((rc & LNET_MATCHMD_FINISH) != 0)
+                               list_del_init(&msg->msg_list);
+
+               } else {
+                       /* could be matched by lnet_ptl_attach_md()
+                        * which is called by another thread */
+                       rc = msg->msg_md == NULL ?
+                            LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+               }
+
+               if (!list_empty(&msg->msg_list) && /* not matched yet */
+                   (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+                    ptl->ptl_mt_nmaps == 0 ||   /* no active CPT */
+                    (ptl->ptl_mt_nmaps == 1 &&  /* the only active CPT */
+                     ptl->ptl_mt_maps[0] == cpt))) {
+                       /* nothing to steal, delay or drop */
+                       list_del_init(&msg->msg_list);
+
+                       if (lnet_ptl_is_lazy(ptl)) {
+                               msg->msg_rx_delayed = 1;
+                               list_add_tail(&msg->msg_list,
+                                                 &ptl->ptl_msg_delayed);
+                               rc = LNET_MATCHMD_NONE;
+                       } else {
+                               rc = LNET_MATCHMD_DROP;
+                       }
+               }
+
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(cpt);
+
+               if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+                       break;
+       }
+
+       return rc;
+}
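+
+/* NB: the walk above starts from ptl_mt_maps[0], read without the lock;
+ * a stale value is harmless because the message is parked on
+ * ptl_msg_stealing on the first iteration and every later iteration
+ * re-checks it under lnet_ptl_lock. */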
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+       struct lnet_match_table *mtable;
+       struct lnet_portal      *ptl;
+       int                     rc;
+
+       CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+              "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+              info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+       if (info->mi_portal >= the_lnet.ln_nportals) {
+               CERROR("Invalid portal %d not in [0-%d]\n",
+                      info->mi_portal, the_lnet.ln_nportals - 1);
+               return LNET_MATCHMD_DROP;
+       }
+
+       ptl = the_lnet.ln_portals[info->mi_portal];
+       rc = lnet_ptl_match_early(ptl, msg);
+       if (rc != 0) /* matched or delayed early message */
+               return rc;
+
+       mtable = lnet_mt_of_match(info, msg);
+       lnet_res_lock(mtable->mt_cpt);
+
+       if (the_lnet.ln_shutdown) {
+               rc = LNET_MATCHMD_DROP;
+               goto out1;
+       }
+
+       rc = lnet_mt_match_md(mtable, info, msg);
+       if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+               lnet_ptl_lock(ptl);
+               lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+               lnet_ptl_unlock(ptl);
+       }
+
+       if ((rc & LNET_MATCHMD_FINISH) != 0)    /* matched or dropping */
+               goto out1;
+
+       if (!msg->msg_rx_ready_delay)
+               goto out1;
+
+       LASSERT(lnet_ptl_is_lazy(ptl));
+       LASSERT(!msg->msg_rx_delayed);
+
+       /* NB: we don't expect "delay" to happen often */
+       if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+               lnet_ptl_lock(ptl);
+
+               msg->msg_rx_delayed = 1;
+               list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(mtable->mt_cpt);
+
+       } else  {
+               lnet_res_unlock(mtable->mt_cpt);
+               rc = lnet_ptl_match_delay(ptl, info, msg);
+       }
+
+       if (msg->msg_rx_delayed) {
+               CDEBUG(D_NET,
+                      "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+                      info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+                      libcfs_id2str(info->mi_id), info->mi_portal,
+                      info->mi_mbits, info->mi_roffset, info->mi_rlength);
+       }
+       goto out0;
+ out1:
+       lnet_res_unlock(mtable->mt_cpt);
+ out0:
+       /* EXHAUSTED bit is only meaningful for internal functions */
+       return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+       LASSERT(me->me_md == md && md->md_me == me);
+
+       me->me_md = NULL;
+       md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+                  struct list_head *matches, struct list_head *drops)
+{
+       struct lnet_portal      *ptl = the_lnet.ln_portals[me->me_portal];
+       struct lnet_match_table *mtable;
+       struct list_head                *head;
+       lnet_msg_t              *tmp;
+       lnet_msg_t              *msg;
+       int                     exhausted = 0;
+       int                     cpt;
+
+       LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+       me->me_md = md;
+       md->md_me = me;
+
+       cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+       mtable = ptl->ptl_mtables[cpt];
+
+       if (list_empty(&ptl->ptl_msg_stealing) &&
+           list_empty(&ptl->ptl_msg_delayed) &&
+           !lnet_mt_test_exhausted(mtable, me->me_pos))
+               return;
+
+       lnet_ptl_lock(ptl);
+       head = &ptl->ptl_msg_stealing;
+ again:
+       list_for_each_entry_safe(msg, tmp, head, msg_list) {
+               struct lnet_match_info  info;
+               lnet_hdr_t              *hdr;
+               int                     rc;
+
+               LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+               hdr   = &msg->msg_hdr;
+               info.mi_id.nid  = hdr->src_nid;
+               info.mi_id.pid  = hdr->src_pid;
+               info.mi_opc     = LNET_MD_OP_PUT;
+               info.mi_portal  = hdr->msg.put.ptl_index;
+               info.mi_rlength = hdr->payload_length;
+               info.mi_roffset = hdr->msg.put.offset;
+               info.mi_mbits   = hdr->msg.put.match_bits;
+
+               rc = lnet_try_match_md(md, &info, msg);
+
+               exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+               if ((rc & LNET_MATCHMD_NONE) != 0) {
+                       if (exhausted)
+                               break;
+                       continue;
+               }
+
+               /* Hurrah! This _is_ a match */
+               LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+               list_del_init(&msg->msg_list);
+
+               if (head == &ptl->ptl_msg_stealing) {
+                       if (exhausted)
+                               break;
+                       /* stealing thread will handle the message */
+                       continue;
+               }
+
+               if ((rc & LNET_MATCHMD_OK) != 0) {
+                       list_add_tail(&msg->msg_list, matches);
+
+                       CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+                              "match "LPU64" offset %d length %d.\n",
+                              libcfs_id2str(info.mi_id),
+                              info.mi_portal, info.mi_mbits,
+                              info.mi_roffset, info.mi_rlength);
+               } else {
+                       list_add_tail(&msg->msg_list, drops);
+               }
+
+               if (exhausted)
+                       break;
+       }
+
+       if (!exhausted && head == &ptl->ptl_msg_stealing) {
+               head = &ptl->ptl_msg_delayed;
+               goto again;
+       }
+
+       if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+               lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+               if (!mtable->mt_enabled)
+                       lnet_ptl_enable_mt(ptl, cpt);
+       }
+
+       lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+       struct lnet_match_table *mtable;
+       int                     i;
+
+       if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+               return;
+
+       LASSERT(list_empty(&ptl->ptl_msg_delayed));
+       LASSERT(list_empty(&ptl->ptl_msg_stealing));
+       cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+               struct list_head        *mhash;
+               lnet_me_t       *me;
+               int             j;
+
+               if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+                       continue;
+
+               mhash = mtable->mt_mhash;
+               /* cleanup ME */
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+                       while (!list_empty(&mhash[j])) {
+                               me = list_entry(mhash[j].next,
+                                                   lnet_me_t, me_list);
+                               CERROR("Active ME %p on exit\n", me);
+                               list_del(&me->me_list);
+                               lnet_me_free(me);
+                       }
+               }
+               /* the extra entry is for MEs with ignore bits */
+               LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+       }
+
+       cfs_percpt_free(ptl->ptl_mtables);
+       ptl->ptl_mtables = NULL;
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+       struct lnet_match_table *mtable;
+       struct list_head                *mhash;
+       int                     i;
+       int                     j;
+
+       ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(struct lnet_match_table));
+       if (ptl->ptl_mtables == NULL) {
+               CERROR("Failed to create match table for portal %d\n", index);
+               return -ENOMEM;
+       }
+
+       ptl->ptl_index = index;
+       INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+       INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+       spin_lock_init(&ptl->ptl_lock);
+       cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+               /* the extra entry is for MEs with ignore bits */
+               LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+                                sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+               if (mhash == NULL) {
+                       CERROR("Failed to create match hash for portal %d\n",
+                              index);
+                       goto failed;
+               }
+
+               memset(&mtable->mt_exhausted[0], -1,
+                      sizeof(mtable->mt_exhausted[0]) *
+                      LNET_MT_EXHAUSTED_BMAP);
+               mtable->mt_mhash = mhash;
+               for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+                       INIT_LIST_HEAD(&mhash[j]);
+
+               mtable->mt_portal = index;
+               mtable->mt_cpt = i;
+       }
+
+       return 0;
+ failed:
+       lnet_ptl_cleanup(ptl);
+       return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+       int     i;
+
+       if (the_lnet.ln_portals == NULL)
+               return;
+
+       for (i = 0; i < the_lnet.ln_nportals; i++)
+               lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+       cfs_array_free(the_lnet.ln_portals);
+       the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+       int     size;
+       int     i;
+
+       size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+       the_lnet.ln_nportals = MAX_PORTALS;
+       the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+       if (the_lnet.ln_portals == NULL) {
+               CERROR("Failed to allocate portals table\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < the_lnet.ln_nportals; i++) {
+               if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+                       lnet_portals_destroy();
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * While this prevents dropped requests, it should be regarded as the last
+ * line of defense - i.e. users must keep a close watch on the number of
+ * active buffers on a lazy portal and, once it runs low, post more buffers
+ * as soon as possible. This is because delayed requests usually have
+ * detrimental effects on the underlying network connections: a few delayed
+ * requests often suffice to bring a connection to a complete halt, due to
+ * flow control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+       struct lnet_portal *ptl;
+
+       if (portal < 0 || portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+       ptl = the_lnet.ln_portals[portal];
+
+       lnet_res_lock(LNET_LOCK_EX);
+       lnet_ptl_lock(ptl);
+
+       lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+       lnet_ptl_unlock(ptl);
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
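+
+/* A minimal (illustrative) usage sketch for a service that prefers to
+ * queue early requests rather than drop them; MY_REQUEST_PORTAL is a
+ * hypothetical portal index:
+ *
+ *     rc = LNetSetLazyPortal(MY_REQUEST_PORTAL);
+ *     if (rc != 0)
+ *             CERROR("Can't set portal %d lazy: %d\n",
+ *                    MY_REQUEST_PORTAL, rc);
+ *     ...post match-all MDs and serve requests...
+ *     LNetClearLazyPortal(MY_REQUEST_PORTAL); // drops leftover requests
+ */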
+
+/**
+ * Turn off the lazy portal attribute. Any delayed requests on the portal
+ * will be dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+       struct lnet_portal      *ptl;
+       LIST_HEAD               (zombies);
+
+       if (portal < 0 || portal >= the_lnet.ln_nportals)
+               return -EINVAL;
+
+       ptl = the_lnet.ln_portals[portal];
+
+       lnet_res_lock(LNET_LOCK_EX);
+       lnet_ptl_lock(ptl);
+
+       if (!lnet_ptl_is_lazy(ptl)) {
+               lnet_ptl_unlock(ptl);
+               lnet_res_unlock(LNET_LOCK_EX);
+               return 0;
+       }
+
+       if (the_lnet.ln_shutdown)
+               CWARN("Active lazy portal %d on exit\n", portal);
+       else
+               CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+       /* grab all the blocked messages atomically */
+       list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+       lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+       lnet_ptl_unlock(ptl);
+       lnet_res_unlock(LNET_LOCK_EX);
+
+       lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+       return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644 (file)
index 0000000..670dae3
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+       LASSERT (!lntmsg->msg_routing);
+       LASSERT (!lntmsg->msg_target_is_router);
+
+       return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+           int delayed, unsigned int niov,
+           struct iovec *iov, lnet_kiov_t *kiov,
+           unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+       lnet_msg_t *sendmsg = private;
+
+       if (lntmsg != NULL) {              /* not discarding */
+               if (sendmsg->msg_iov != NULL) {
+                       if (iov != NULL)
+                               lnet_copy_iov2iov(niov, iov, offset,
+                                                 sendmsg->msg_niov,
+                                                 sendmsg->msg_iov,
+                                                 sendmsg->msg_offset, mlen);
+                       else
+                               lnet_copy_iov2kiov(niov, kiov, offset,
+                                                  sendmsg->msg_niov,
+                                                  sendmsg->msg_iov,
+                                                  sendmsg->msg_offset, mlen);
+               } else {
+                       if (iov != NULL)
+                               lnet_copy_kiov2iov(niov, iov, offset,
+                                                  sendmsg->msg_niov,
+                                                  sendmsg->msg_kiov,
+                                                  sendmsg->msg_offset, mlen);
+                       else
+                               lnet_copy_kiov2kiov(niov, kiov, offset,
+                                                   sendmsg->msg_niov,
+                                                   sendmsg->msg_kiov,
+                                                   sendmsg->msg_offset, mlen);
+               }
+
+               lnet_finalize(ni, lntmsg, 0);
+       }
+
+       lnet_finalize(ni, sendmsg, 0);
+       return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+       LASSERT (lolnd_instanced);
+
+       lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+       LASSERT (ni->ni_lnd == &the_lolnd);
+       LASSERT (!lolnd_instanced);
+       lolnd_instanced = 1;
+
+       return 0;
+}
+
+lnd_t the_lolnd = {
+       /* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+       /* .lnd_refcount   = */ 0,
+       /* .lnd_type       = */ LOLND,
+       /* .lnd_startup    = */ lolnd_startup,
+       /* .lnd_shutdown   = */ lolnd_shutdown,
+       /* .lnd_ctl        = */ NULL,
+       /* .lnd_send       = */ lolnd_send,
+       /* .lnd_recv       = */ lolnd_recv,
+       /* .lnd_eager_recv = */ NULL,
+       /* .lnd_notify     = */ NULL,
+       /* .lnd_accept     = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644 (file)
index 0000000..c832385
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+               "configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
+int
+lnet_configure (void *arg)
+{
+       /* 'arg' is only there so this function can be passed to kthread_run() */
+       int    rc = 0;
+
+       LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+       if (!the_lnet.ln_niinit_self) {
+               rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+               if (rc >= 0) {
+                       the_lnet.ln_niinit_self = 1;
+                       rc = 0;
+               }
+       }
+
+       LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+       return rc;
+}
+
+int
+lnet_unconfigure (void)
+{
+       int   refcount;
+
+       LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+       if (the_lnet.ln_niinit_self) {
+               the_lnet.ln_niinit_self = 0;
+               LNetNIFini();
+       }
+
+       LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+       refcount = the_lnet.ln_refcount;
+       LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+       LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+       return (refcount == 0) ? 0 : -EBUSY;
+}
+
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+       int   rc;
+
+       switch (cmd) {
+       case IOC_LIBCFS_CONFIGURE:
+               return lnet_configure(NULL);
+
+       case IOC_LIBCFS_UNCONFIGURE:
+               return lnet_unconfigure();
+
+       default:
+               /* Passing LNET_PID_ANY only gives me a ref if the net is up
+                * already; I'll need it to ensure the net can't go down while
+                * I'm called into it */
+               rc = LNetNIInit(LNET_PID_ANY);
+               if (rc >= 0) {
+                       rc = LNetCtl(cmd, data);
+                       LNetNIFini();
+               }
+               return rc;
+       }
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{
+       int               rc;
+       ENTRY;
+
+       mutex_init(&lnet_config_mutex);
+
+       rc = LNetInit();
+       if (rc != 0) {
+               CERROR("LNetInit: error %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+       LASSERT (rc == 0);
+
+       if (config_on_load) {
+               /* Have to schedule a separate thread to avoid deadlocking
+                * in modload */
+               (void) kthread_run(lnet_configure, NULL, "lnet_initd");
+       }
+
+       RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+       int rc;
+
+       rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+       LASSERT (rc == 0);
+
+       LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644 (file)
index 0000000..2869776
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+int
+lnet_peer_tables_create(void)
+{
+       struct lnet_peer_table  *ptable;
+       struct list_head                *hash;
+       int                     i;
+       int                     j;
+
+       the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+                                                  sizeof(*ptable));
+       if (the_lnet.ln_peer_tables == NULL) {
+               CERROR("Failed to allocate cpu-partition peer tables\n");
+               return -ENOMEM;
+       }
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+               LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+                                LNET_PEER_HASH_SIZE * sizeof(*hash));
+               if (hash == NULL) {
+                       CERROR("Failed to create peer hash table\n");
+                       lnet_peer_tables_destroy();
+                       return -ENOMEM;
+               }
+
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+                       INIT_LIST_HEAD(&hash[j]);
+               ptable->pt_hash = hash; /* sign of initialization */
+       }
+
+       return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+       struct lnet_peer_table  *ptable;
+       struct list_head                *hash;
+       int                     i;
+       int                     j;
+
+       if (the_lnet.ln_peer_tables == NULL)
+               return;
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               hash = ptable->pt_hash;
+               if (hash == NULL) /* not initialized */
+                       break;
+
+               LASSERT(list_empty(&ptable->pt_deathrow));
+
+               ptable->pt_hash = NULL;
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+                       LASSERT(list_empty(&hash[j]));
+
+               LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+       }
+
+       cfs_percpt_free(the_lnet.ln_peer_tables);
+       the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+       struct lnet_peer_table  *ptable;
+       int                     i;
+       int                     j;
+
+       LASSERT(the_lnet.ln_shutdown);  /* i.e. no new peers */
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               lnet_net_lock(i);
+
+               for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+                       struct list_head *peers = &ptable->pt_hash[j];
+
+                       while (!list_empty(peers)) {
+                               lnet_peer_t *lp = list_entry(peers->next,
+                                                                lnet_peer_t,
+                                                                lp_hashlist);
+                               list_del_init(&lp->lp_hashlist);
+                               /* lose hash table's ref */
+                               lnet_peer_decref_locked(lp);
+                       }
+               }
+
+               lnet_net_unlock(i);
+       }
+
+       cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+               LIST_HEAD       (deathrow);
+               lnet_peer_t     *lp;
+
+               lnet_net_lock(i);
+
+               for (j = 3; ptable->pt_number != 0; j++) {
+                       lnet_net_unlock(i);
+
+                       if ((j & (j - 1)) == 0) {
+                               CDEBUG(D_WARNING,
+                                      "Waiting for %d peers on peer table\n",
+                                      ptable->pt_number);
+                       }
+                       cfs_pause(cfs_time_seconds(1) / 2);
+                       lnet_net_lock(i);
+               }
+               list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+               lnet_net_unlock(i);
+
+               while (!list_empty(&deathrow)) {
+                       lp = list_entry(deathrow.next,
+                                           lnet_peer_t, lp_hashlist);
+                       list_del(&lp->lp_hashlist);
+                       LIBCFS_FREE(lp, sizeof(*lp));
+               }
+       }
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+       struct lnet_peer_table *ptable;
+
+       LASSERT(lp->lp_refcount == 0);
+       LASSERT(lp->lp_rtr_refcount == 0);
+       LASSERT(list_empty(&lp->lp_txq));
+       LASSERT(list_empty(&lp->lp_hashlist));
+       LASSERT(lp->lp_txqnob == 0);
+
+       ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+       LASSERT(ptable->pt_number > 0);
+       ptable->pt_number--;
+
+       lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+       lp->lp_ni = NULL;
+
+       list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
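+
+/* NB: "destroyed" peers are parked on pt_deathrow rather than freed, so
+ * lnet_nid2peer_locked() can recycle them without re-allocating; the
+ * memory is only released by lnet_peer_tables_cleanup() at shutdown. */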
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+       struct list_head        *peers;
+       lnet_peer_t     *lp;
+
+       LASSERT(!the_lnet.ln_shutdown);
+
+       peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+       list_for_each_entry(lp, peers, lp_hashlist) {
+               if (lp->lp_nid == nid) {
+                       lnet_peer_addref_locked(lp);
+                       return lp;
+               }
+       }
+
+       return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+       struct lnet_peer_table  *ptable;
+       lnet_peer_t             *lp = NULL;
+       lnet_peer_t             *lp2;
+       int                     cpt2;
+       int                     rc = 0;
+
+       *lpp = NULL;
+       if (the_lnet.ln_shutdown) /* it's shutting down */
+               return -ESHUTDOWN;
+
+       /* cpt can be LNET_LOCK_EX if it's called from router functions */
+       cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+       ptable = the_lnet.ln_peer_tables[cpt2];
+       lp = lnet_find_peer_locked(ptable, nid);
+       if (lp != NULL) {
+               *lpp = lp;
+               return 0;
+       }
+
+       if (!list_empty(&ptable->pt_deathrow)) {
+               lp = list_entry(ptable->pt_deathrow.next,
+                                   lnet_peer_t, lp_hashlist);
+               list_del(&lp->lp_hashlist);
+       }
+
+       /*
+        * take extra refcount in case another thread has shutdown LNet
+        * and destroyed locks and peer-table before I finish the allocation
+        */
+       ptable->pt_number++;
+       lnet_net_unlock(cpt);
+
+       if (lp != NULL)
+               memset(lp, 0, sizeof(*lp));
+       else
+               LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+       if (lp == NULL) {
+               rc = -ENOMEM;
+               lnet_net_lock(cpt);
+               goto out;
+       }
+
+       INIT_LIST_HEAD(&lp->lp_txq);
+       INIT_LIST_HEAD(&lp->lp_rtrq);
+       INIT_LIST_HEAD(&lp->lp_routes);
+
+       lp->lp_notify = 0;
+       lp->lp_notifylnd = 0;
+       lp->lp_notifying = 0;
+       lp->lp_alive_count = 0;
+       lp->lp_timestamp = 0;
+       lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+       lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+       lp->lp_last_query = 0; /* haven't asked NI yet */
+       lp->lp_ping_timestamp = 0;
+       lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+       lp->lp_nid = nid;
+       lp->lp_cpt = cpt2;
+       lp->lp_refcount = 2;    /* 1 for caller; 1 for hash */
+       lp->lp_rtr_refcount = 0;
+
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
+
+       lp2 = lnet_find_peer_locked(ptable, nid);
+       if (lp2 != NULL) {
+               *lpp = lp2;
+               goto out;
+       }
+
+       lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+       if (lp->lp_ni == NULL) {
+               rc = -EHOSTUNREACH;
+               goto out;
+       }
+
+       lp->lp_txcredits    =
+       lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+       lp->lp_rtrcredits    =
+       lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+       list_add_tail(&lp->lp_hashlist,
+                         &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+       ptable->pt_version++;
+       *lpp = lp;
+
+       return 0;
+out:
+       if (lp != NULL)
+               list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+       ptable->pt_number--;
+       return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+       char            *aliveness = "NA";
+       lnet_peer_t     *lp;
+       int             rc;
+       int             cpt;
+
+       cpt = lnet_cpt_of_nid(nid);
+       lnet_net_lock(cpt);
+
+       rc = lnet_nid2peer_locked(&lp, nid, cpt);
+       if (rc != 0) {
+               lnet_net_unlock(cpt);
+               CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+               return;
+       }
+
+       if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+               aliveness = lp->lp_alive ? "up" : "down";
+
+       CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+              libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+              aliveness, lp->lp_ni->ni_peertxcredits,
+              lp->lp_rtrcredits, lp->lp_minrtrcredits,
+              lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+       lnet_peer_decref_locked(lp);
+
+       lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644 (file)
index 0000000..a326ce0
--- /dev/null
@@ -0,0 +1,1694 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN      512     /* min value for each CPT */
+#define LNET_NRB_TINY          (LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN     4096    /* min value for each CPT */
+#define LNET_NRB_SMALL         (LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN     256     /* min value for each CPT */
+#define LNET_NRB_LARGE         (LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+               "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+               "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+               "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+               "# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+               "# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+               "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+       /* NI option overrides LNet default */
+       if (ni->ni_peerrtrcredits > 0)
+               return ni->ni_peerrtrcredits;
+       if (peer_buffer_credits > 0)
+               return peer_buffer_credits;
+
+       /* As an approximation, allow this peer the same number of router
+        * buffers as it is allowed outstanding sends */
+       return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+       return 0;
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+               "Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+               "Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+               "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+               "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+               "Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+       return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+       /* out of date information */
+       if (cfs_time_before(when, lp->lp_timestamp)) {
+               CDEBUG(D_NET, "Out of date\n");
+               return;
+       }
+
+       lp->lp_timestamp = when;                /* update timestamp */
+       lp->lp_ping_deadline = 0;              /* disable ping timeout */
+
+       if (lp->lp_alive_count != 0 &&    /* got old news */
+           (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+               CDEBUG(D_NET, "Old news\n");
+               return;
+       }
+
+       /* Flag that notification is outstanding */
+
+       lp->lp_alive_count++;
+       lp->lp_alive = !(!alive);              /* 1 bit! */
+       lp->lp_notify = 1;
+       lp->lp_notifylnd |= notifylnd;
+       if (lp->lp_alive)
+               lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+       int     alive;
+       int     notifylnd;
+
+       /* Notify only in 1 thread at any time to ensure ordered notification.
+        * NB individual events can be missed; the only guarantee is that you
+        * always get the most recent news */
+
+       if (lp->lp_notifying)
+               return;
+
+       lp->lp_notifying = 1;
+
+       while (lp->lp_notify) {
+               alive     = lp->lp_alive;
+               notifylnd = lp->lp_notifylnd;
+
+               lp->lp_notifylnd = 0;
+               lp->lp_notify    = 0;
+
+               if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+                       lnet_net_unlock(lp->lp_cpt);
+
+                       /* A new notification could happen now; I'll handle it
+                        * when control returns to me */
+
+                       (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+                       lnet_net_lock(lp->lp_cpt);
+               }
+       }
+
+       lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+       LASSERT(lp->lp_refcount > 0);
+       LASSERT(lp->lp_rtr_refcount >= 0);
+
+       /* lnet_net_lock must be exclusively locked */
+       lp->lp_rtr_refcount++;
+       if (lp->lp_rtr_refcount == 1) {
+               struct list_head *pos;
+
+               /* a simple insertion sort */
+               list_for_each_prev(pos, &the_lnet.ln_routers) {
+                       lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+                                                         lp_rtr_list);
+
+                       if (rtr->lp_nid < lp->lp_nid)
+                               break;
+               }
+
+               list_add(&lp->lp_rtr_list, pos);
+               /* addref for the_lnet.ln_routers */
+               lnet_peer_addref_locked(lp);
+               the_lnet.ln_routers_version++;
+       }
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+       LASSERT(lp->lp_refcount > 0);
+       LASSERT(lp->lp_rtr_refcount > 0);
+
+       /* lnet_net_lock must be exclusively locked */
+       lp->lp_rtr_refcount--;
+       if (lp->lp_rtr_refcount == 0) {
+               LASSERT(list_empty(&lp->lp_routes));
+
+               if (lp->lp_rcd != NULL) {
+                       list_add(&lp->lp_rcd->rcd_list,
+                                    &the_lnet.ln_rcd_deathrow);
+                       lp->lp_rcd = NULL;
+               }
+
+               list_del(&lp->lp_rtr_list);
+               /* decref for the_lnet.ln_routers */
+               lnet_peer_decref_locked(lp);
+               the_lnet.ln_routers_version++;
+       }
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+       lnet_remotenet_t        *rnet;
+       struct list_head                *tmp;
+       struct list_head                *rn_list;
+
+       LASSERT(!the_lnet.ln_shutdown);
+
+       rn_list = lnet_net2rnethash(net);
+       list_for_each(tmp, rn_list) {
+               rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+               if (rnet->lrn_net == net)
+                       return rnet;
+       }
+       return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+       static int seeded = 0;
+       int lnd_type, seed[2];
+       struct timeval tv;
+       lnet_ni_t *ni;
+       struct list_head *tmp;
+
+       if (seeded)
+               return;
+
+       cfs_get_random_bytes(seed, sizeof(seed));
+
+       /* Nodes with small feet have little entropy; the NID of this node
+        * gives the most entropy in the low bits */
+       list_for_each(tmp, &the_lnet.ln_nis) {
+               ni = list_entry(tmp, lnet_ni_t, ni_list);
+               lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+               if (lnd_type != LOLND)
+                       seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+       }
+
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+       seeded = 1;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+       unsigned int      len = 0;
+       unsigned int      offset = 0;
+       struct list_head       *e;
+
+       lnet_shuffle_seed();
+
+       list_for_each (e, &rnet->lrn_routes) {
+               len++;
+       }
+
+       /* len+1 positions to add a new entry, also prevents division by 0 */
+       offset = cfs_rand() % (len + 1);
+       list_for_each (e, &rnet->lrn_routes) {
+               if (offset == 0)
+                       break;
+               offset--;
+       }
+       list_add(&route->lr_list, e);
+       list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+       the_lnet.ln_remote_nets_version++;
+       lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+       struct list_head        *e;
+       lnet_remotenet_t        *rnet;
+       lnet_remotenet_t        *rnet2;
+       lnet_route_t            *route;
+       lnet_ni_t               *ni;
+       int                     add_route;
+       int                     rc;
+
+       CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+              libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
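+       /* Reject an unspecified or loopback gateway, an unspecified or
+        * loopback target net, a gateway that already sits on the target
+        * net, and hop counts outside 1..255 */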
+       if (gateway == LNET_NID_ANY ||
+           LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+           net == LNET_NIDNET(LNET_NID_ANY) ||
+           LNET_NETTYP(net) == LOLND ||
+           LNET_NIDNET(gateway) == net ||
+           hops < 1 || hops > 255)
+               return -EINVAL;
+
+       if (lnet_islocalnet(net))              /* it's a local network */
+               return 0;                      /* ignore the route entry */
+
+       /* Assume net, route, all new */
+       LIBCFS_ALLOC(route, sizeof(*route));
+       LIBCFS_ALLOC(rnet, sizeof(*rnet));
+       if (route == NULL || rnet == NULL) {
+               CERROR("Out of memory creating route %s %d %s\n",
+                      libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+               if (route != NULL)
+                       LIBCFS_FREE(route, sizeof(*route));
+               if (rnet != NULL)
+                       LIBCFS_FREE(rnet, sizeof(*rnet));
+               return -ENOMEM;
+       }
+
+       INIT_LIST_HEAD(&rnet->lrn_routes);
+       rnet->lrn_net = net;
+       route->lr_hops = hops;
+       route->lr_net = net;
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+       if (rc != 0) {
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               LIBCFS_FREE(route, sizeof(*route));
+               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+               if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+                       return 0;       /* ignore the route entry */
+               } else {
+                       CERROR("Error %d creating route %s %d %s\n", rc,
+                              libcfs_net2str(net), hops,
+                              libcfs_nid2str(gateway));
+               }
+               return rc;
+       }
+
+       LASSERT (!the_lnet.ln_shutdown);
+
+       rnet2 = lnet_find_net_locked(net);
+       if (rnet2 == NULL) {
+               /* new network */
+               list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+               rnet2 = rnet;
+       }
+
+       /* Search for a duplicate route (it's a NOOP if it is) */
+       add_route = 1;
+       list_for_each (e, &rnet2->lrn_routes) {
+               lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+               if (route2->lr_gateway == route->lr_gateway) {
+                       add_route = 0;
+                       break;
+               }
+
+               /* lookups must be consistent: lnet_nid2peer_locked() must
+                * return the same peer for a given NID */
+               LASSERT (route2->lr_gateway->lp_nid != gateway);
+       }
+
+       if (add_route) {
+               lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+               lnet_add_route_to_rnet(rnet2, route);
+
+               ni = route->lr_gateway->lp_ni;
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               /* XXX Assume alive */
+               if (ni->ni_lnd->lnd_notify != NULL)
+                       (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       /* -1 for notify or !add_route */
+       lnet_peer_decref_locked(route->lr_gateway);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       if (!add_route)
+               LIBCFS_FREE(route, sizeof(*route));
+
+       if (rnet != rnet2)
+               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+       return 0;
+}
+
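+/* Verify that all routes to any one remote net go through gateways on the
+ * same local NI; routes to a net via different local NIs are unsupported
+ * and fail this check with -EINVAL. */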
+int
+lnet_check_routes(void)
+{
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       lnet_route_t            *route2;
+       struct list_head        *e1;
+       struct list_head        *e2;
+       int                     cpt;
+       struct list_head        *rn_list;
+       int                     i;
+
+       cpt = lnet_net_lock_current();
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each(e1, rn_list) {
+                       rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                       route2 = NULL;
+                       list_for_each(e2, &rnet->lrn_routes) {
+                               lnet_nid_t      nid1;
+                               lnet_nid_t      nid2;
+                               int             net;
+
+                               route = list_entry(e2, lnet_route_t,
+                                                      lr_list);
+
+                               if (route2 == NULL) {
+                                       route2 = route;
+                                       continue;
+                               }
+
+                               if (route->lr_gateway->lp_ni ==
+                                   route2->lr_gateway->lp_ni)
+                                       continue;
+
+                               nid1 = route->lr_gateway->lp_nid;
+                               nid2 = route2->lr_gateway->lp_nid;
+                               net = rnet->lrn_net;
+
+                               lnet_net_unlock(cpt);
+
+                               CERROR("Routes to %s via %s and %s not supported\n",
+                                      libcfs_net2str(net),
+                                      libcfs_nid2str(nid1),
+                                      libcfs_nid2str(nid2));
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       lnet_net_unlock(cpt);
+       return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+       struct lnet_peer        *gateway;
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       struct list_head        *e1;
+       struct list_head        *e2;
+       int                     rc = -ENOENT;
+       struct list_head        *rn_list;
+       int                     idx = 0;
+
+       CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+              libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+       /* NB Caller may specify either all routes via the given gateway
+        * (gw_nid == LNET_NID_ANY) or a specific route entry (actual NIDs) */
+
+       lnet_net_lock(LNET_LOCK_EX);
+       if (net == LNET_NIDNET(LNET_NID_ANY))
+               rn_list = &the_lnet.ln_remote_nets_hash[0];
+       else
+               rn_list = lnet_net2rnethash(net);
+
+ again:
+       list_for_each(e1, rn_list) {
+               rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+               if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+                       net == rnet->lrn_net))
+                       continue;
+
+               list_for_each(e2, &rnet->lrn_routes) {
+                       route = list_entry(e2, lnet_route_t, lr_list);
+
+                       gateway = route->lr_gateway;
+                       if (!(gw_nid == LNET_NID_ANY ||
+                             gw_nid == gateway->lp_nid))
+                               continue;
+
+                       list_del(&route->lr_list);
+                       list_del(&route->lr_gwlist);
+                       the_lnet.ln_remote_nets_version++;
+
+                       if (list_empty(&rnet->lrn_routes))
+                               list_del(&rnet->lrn_list);
+                       else
+                               rnet = NULL;
+
+                       lnet_rtr_decref_locked(gateway);
+                       lnet_peer_decref_locked(gateway);
+
+                       lnet_net_unlock(LNET_LOCK_EX);
+
+                       LIBCFS_FREE(route, sizeof(*route));
+
+                       if (rnet != NULL)
+                               LIBCFS_FREE(rnet, sizeof(*rnet));
+
+                       rc = 0;
+                       lnet_net_lock(LNET_LOCK_EX);
+                       goto again;
+               }
+       }
+
+       if (net == LNET_NIDNET(LNET_NID_ANY) &&
+           ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+               rn_list = &the_lnet.ln_remote_nets_hash[idx];
+               goto again;
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{
+       lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+              lnet_nid_t *gateway, __u32 *alive)
+{
+       struct list_head        *e1;
+       struct list_head        *e2;
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       int                     cpt;
+       int                     i;
+       struct list_head        *rn_list;
+
+       cpt = lnet_net_lock_current();
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each(e1, rn_list) {
+                       rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                       list_for_each(e2, &rnet->lrn_routes) {
+                               route = list_entry(e2, lnet_route_t,
+                                                      lr_list);
+
+                               if (idx-- == 0) {
+                                       *net     = rnet->lrn_net;
+                                       *hops    = route->lr_hops;
+                                       *gateway = route->lr_gateway->lp_nid;
+                                       *alive   = route->lr_gateway->lp_alive;
+                                       lnet_net_unlock(cpt);
+                                       return 0;
+                               }
+                       }
+               }
+       }
+
+       lnet_net_unlock(cpt);
+       return -ENOENT;
+}
+
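+/* Byte-swap a ping info block received from a peer of the opposite
+ * endianness, detected by a byte-swapped pi_magic. */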
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+       int            i;
+       lnet_ni_status_t *stat;
+
+       __swab32s(&info->pi_magic);
+       __swab32s(&info->pi_features);
+       __swab32s(&info->pi_pid);
+       __swab32s(&info->pi_nnis);
+       for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+               stat = &info->pi_ni[i];
+               __swab64s(&stat->ns_nid);
+               __swab32s(&stat->ns_status);
+       }
+}
+
+/**
+ * Parse router-checker ping info and record the number of down NIs for
+ * remote networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+       lnet_ping_info_t        *info = rcd->rcd_pinginfo;
+       struct lnet_peer        *gw   = rcd->rcd_gateway;
+       lnet_route_t            *rtr;
+
+       if (!gw->lp_alive)
+               return;
+
+       if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+               lnet_swap_pinginfo(info);
+
+       /* NB always racing with network! */
+       if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+               CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+                      libcfs_nid2str(gw->lp_nid), info->pi_magic);
+               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+               return;
+       }
+
+       gw->lp_ping_feats = info->pi_features;
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+               CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+                      libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+               return; /* nothing I can understand */
+       }
+
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+               return; /* can't carry NI status info */
+
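+       /* For each route via this gateway, count how many of the gateway's
+        * NIs are down; ptl NIs count as a single down NI, and only when
+        * every ptl NI is down.  If the NI on a route's target net is up,
+        * downed NIs are ignored for that route. */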
+       list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+               int     ptl_status = LNET_NI_STATUS_INVALID;
+               int     down = 0;
+               int     up = 0;
+               int     i;
+
+               for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                       lnet_ni_status_t *stat = &info->pi_ni[i];
+                       lnet_nid_t       nid = stat->ns_nid;
+
+                       if (nid == LNET_NID_ANY) {
+                               CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+                                      libcfs_nid2str(gw->lp_nid));
+                               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                               return;
+                       }
+
+                       if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                               continue;
+
+                       if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+                               if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+                                       down++;
+                               else if (ptl_status != LNET_NI_STATUS_UP)
+                                       ptl_status = LNET_NI_STATUS_DOWN;
+                               continue;
+                       }
+
+                       if (stat->ns_status == LNET_NI_STATUS_UP) {
+                               if (LNET_NIDNET(nid) == rtr->lr_net) {
+                                       up = 1;
+                                       break;
+                               }
+                               /* ptl NIs are considered down only when
+                                * they're all down */
+                               if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                                       ptl_status = LNET_NI_STATUS_UP;
+                               continue;
+                       }
+
+                       CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+                              libcfs_nid2str(gw->lp_nid), stat->ns_status);
+                       gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+                       return;
+               }
+
+               if (up) { /* ignore downed NIs if NI for dest network is up */
+                       rtr->lr_downis = 0;
+                       continue;
+               }
+               rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+       }
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+       lnet_rc_data_t          *rcd = event->md.user_ptr;
+       struct lnet_peer        *lp;
+
+       LASSERT(rcd != NULL);
+
+       if (event->unlinked) {
+               LNetInvalidateHandle(&rcd->rcd_mdh);
+               return;
+       }
+
+       LASSERT(event->type == LNET_EVENT_SEND ||
+               event->type == LNET_EVENT_REPLY);
+
+       lp = rcd->rcd_gateway;
+       LASSERT(lp != NULL);
+
+       /* NB: this is called while holding lnet_res_lock; a few places
+        * need to hold both locks at the same time, so take care of
+        * lock ordering */
+       lnet_net_lock(lp->lp_cpt);
+       if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+               /* ignore if no longer a router or rcd is replaced */
+               goto out;
+       }
+
+       if (event->type == LNET_EVENT_SEND) {
+               lp->lp_ping_notsent = 0;
+               if (event->status == 0)
+                       goto out;
+       }
+
+       /* LNET_EVENT_REPLY */
+       /* A successful REPLY means the router is up.  If _any_ comms
+        * to the router fail I assume it's down (this will happen if
+        * we ping alive routers to try to detect router death before
+        * apps get burned). */
+
+       lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+       /* The router checker will wake up very shortly and do the
+        * actual notification.
+        * XXX If 'lp' stops being a router before then, it will still
+        * have the notification pending!!! */
+
+       if (avoid_asym_router_failure && event->status == 0)
+               lnet_parse_rc_info(rcd);
+
+ out:
+       lnet_net_unlock(lp->lp_cpt);
+}
+
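+/* Block until every known router has been pinged at least once
+ * (i.e. lp_alive_count != 0), polling the router list once a second. */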
+void
+lnet_wait_known_routerstate(void)
+{
+       lnet_peer_t      *rtr;
+       struct list_head          *entry;
+       int               all_known;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       for (;;) {
+               int     cpt = lnet_net_lock_current();
+
+               all_known = 1;
+               list_for_each (entry, &the_lnet.ln_routers) {
+                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                       if (rtr->lp_alive_count == 0) {
+                               all_known = 0;
+                               break;
+                       }
+               }
+
+               lnet_net_unlock(cpt);
+
+               if (all_known)
+                       return;
+
+               cfs_pause(cfs_time_seconds(1));
+       }
+}
+
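+/* Mark any non-loopback local NI down that hasn't been alive within
+ * router_ping_timeout plus the larger router check interval; peers that
+ * parse our ping info (cf. lnet_parse_rc_info) will then see it as down. */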
+void
+lnet_update_ni_status_locked(void)
+{
+       lnet_ni_t       *ni;
+       long            now;
+       int             timeout;
+
+       LASSERT(the_lnet.ln_routing);
+
+       timeout = router_ping_timeout +
+                 MAX(live_router_check_interval, dead_router_check_interval);
+
+       now = cfs_time_current_sec();
+       list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+               if (ni->ni_lnd->lnd_type == LOLND)
+                       continue;
+
+               if (now < ni->ni_last_alive + timeout)
+                       continue;
+
+               lnet_ni_lock(ni);
+               /* re-check with lock */
+               if (now < ni->ni_last_alive + timeout) {
+                       lnet_ni_unlock(ni);
+                       continue;
+               }
+
+               LASSERT(ni->ni_status != NULL);
+
+               if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+                       CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+                              libcfs_nid2str(ni->ni_nid), timeout);
+                       /* NB: so far, this is the only place to set
+                        * NI status to "down" */
+                       ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+               }
+               lnet_ni_unlock(ni);
+       }
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+       LASSERT(list_empty(&rcd->rcd_list));
+       /* detached from network */
+       LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+       if (rcd->rcd_gateway != NULL) {
+               int cpt = rcd->rcd_gateway->lp_cpt;
+
+               lnet_net_lock(cpt);
+               lnet_peer_decref_locked(rcd->rcd_gateway);
+               lnet_net_unlock(cpt);
+       }
+
+       if (rcd->rcd_pinginfo != NULL)
+               LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+       LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
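+/* Allocate and bind router-checker ping state for @gateway.  Called, and
+ * returns, with lnet_net_lock held on the gateway's CPT, but drops it
+ * around the allocations and LNetMDBind(); if the router table changed or
+ * someone else installed an rcd meanwhile, the new one is discarded and
+ * the gateway's current lp_rcd is returned instead. */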
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+       lnet_rc_data_t          *rcd = NULL;
+       lnet_ping_info_t        *pi;
+       int                     rc;
+       int                     i;
+
+       lnet_net_unlock(gateway->lp_cpt);
+
+       LIBCFS_ALLOC(rcd, sizeof(*rcd));
+       if (rcd == NULL)
+               goto out;
+
+       LNetInvalidateHandle(&rcd->rcd_mdh);
+       INIT_LIST_HEAD(&rcd->rcd_list);
+
+       LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+       if (pi == NULL)
+               goto out;
+
+       memset(pi, 0, LNET_PINGINFO_SIZE);
+       for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+               pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+               pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+       }
+       rcd->rcd_pinginfo = pi;
+
+       LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+       rc = LNetMDBind((lnet_md_t){.start     = pi,
+                                   .user_ptr  = rcd,
+                                   .length    = LNET_PINGINFO_SIZE,
+                                   .threshold = LNET_MD_THRESH_INF,
+                                   .options   = LNET_MD_TRUNCATE,
+                                   .eq_handle = the_lnet.ln_rc_eqh},
+                       LNET_UNLINK,
+                       &rcd->rcd_mdh);
+       if (rc < 0) {
+               CERROR("Can't bind MD: %d\n", rc);
+               goto out;
+       }
+       LASSERT(rc == 0);
+
+       lnet_net_lock(gateway->lp_cpt);
+       /* router table changed or someone has created rcd for this gateway */
+       if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+               lnet_net_unlock(gateway->lp_cpt);
+               goto out;
+       }
+
+       lnet_peer_addref_locked(gateway);
+       rcd->rcd_gateway = gateway;
+       gateway->lp_rcd = rcd;
+       gateway->lp_ping_notsent = 0;
+
+       return rcd;
+
+ out:
+       if (rcd != NULL) {
+               if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                       rc = LNetMDUnlink(rcd->rcd_mdh);
+                       LASSERT(rc == 0);
+               }
+               lnet_destroy_rc_data(rcd);
+       }
+
+       lnet_net_lock(gateway->lp_cpt);
+       return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+       int secs;
+
+       secs = rtr->lp_alive ? live_router_check_interval :
+                              dead_router_check_interval;
+       if (secs < 0)
+               secs = 0;
+
+       return secs;
+}
+
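+/* Ping one router if its check interval has expired: time out any previous
+ * ping, flush pending notifications, then LNetGet() the router's ping info
+ * from the reserved portal.  Called with lnet_net_lock held on the
+ * router's CPT; the lock is dropped around LNetGet(). */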
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+       lnet_rc_data_t *rcd = NULL;
+       cfs_time_t      now = cfs_time_current();
+       int          secs;
+
+       lnet_peer_addref_locked(rtr);
+
+       if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+           cfs_time_after(now, rtr->lp_ping_deadline))
+               lnet_notify_locked(rtr, 1, 0, now);
+
+       /* Run any outstanding notifications */
+       lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+       if (!lnet_isrouter(rtr) ||
+           the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+               /* router table changed or router checker is shutting down */
+               lnet_peer_decref_locked(rtr);
+               return;
+       }
+
+       rcd = rtr->lp_rcd != NULL ?
+             rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+       if (rcd == NULL) {
+               /* drop the ref taken above */
+               lnet_peer_decref_locked(rtr);
+               return;
+       }
+
+       secs = lnet_router_check_interval(rtr);
+
+       CDEBUG(D_NET,
+              "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+              "alive_count %d lp_ping_timestamp %lu\n",
+              libcfs_nid2str(rtr->lp_nid), secs,
+              rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+              rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+       if (secs != 0 && !rtr->lp_ping_notsent &&
+           cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+                                            cfs_time_seconds(secs)))) {
+               int            rc;
+               lnet_process_id_t id;
+               lnet_handle_md_t  mdh;
+
+               id.nid = rtr->lp_nid;
+               id.pid = LUSTRE_SRV_LNET_PID;
+               CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+               rtr->lp_ping_notsent   = 1;
+               rtr->lp_ping_timestamp = now;
+
+               mdh = rcd->rcd_mdh;
+
+               if (rtr->lp_ping_deadline == 0) {
+                       rtr->lp_ping_deadline =
+                               cfs_time_shift(router_ping_timeout);
+               }
+
+               lnet_net_unlock(rtr->lp_cpt);
+
+               rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+                            LNET_PROTO_PING_MATCHBITS, 0);
+
+               lnet_net_lock(rtr->lp_cpt);
+               if (rc != 0)
+                       rtr->lp_ping_notsent = 0; /* no event pending */
+       }
+
+       lnet_peer_decref_locked(rtr);
+}
+
+int
+lnet_router_checker_start(void)
+{
+       int       rc;
+       int       eqsz;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+       if (check_routers_before_use &&
+           dead_router_check_interval <= 0) {
+               LCONSOLE_ERROR_MSG(0x10a,
+                                  "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n");
+               return -EINVAL;
+       }
+
+       if (!the_lnet.ln_routing &&
+           live_router_check_interval <= 0 &&
+           dead_router_check_interval <= 0)
+               return 0;
+
+       sema_init(&the_lnet.ln_rc_signal, 0);
+       /* EQ size doesn't matter; the callback is guaranteed to get every
+        * event */
+       eqsz = 0;
+       rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+                        &the_lnet.ln_rc_eqh);
+       if (rc != 0) {
+               CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+               return -ENOMEM;
+       }
+
+       the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+       rc = PTR_ERR(kthread_run(lnet_router_checker,
+                                NULL, "router_checker"));
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("Can't start router checker thread: %d\n", rc);
+               /* block until event callback signals exit */
+               down(&the_lnet.ln_rc_signal);
+               rc = LNetEQFree(the_lnet.ln_rc_eqh);
+               LASSERT(rc == 0);
+               the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+               return -ENOMEM;
+       }
+
+       if (check_routers_before_use) {
+               /* Note that a helpful side-effect of pinging all known routers
+                * at startup is that it makes them drop stale connections they
+                * may have to a previous instance of me. */
+               lnet_wait_known_routerstate();
+       }
+
+       return 0;
+}
+
+void
+lnet_router_checker_stop (void)
+{
+       int rc;
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+               return;
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+       the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+       /* block until event callback signals exit */
+       down(&the_lnet.ln_rc_signal);
+       LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+       rc = LNetEQFree(the_lnet.ln_rc_eqh);
+       LASSERT (rc == 0);
+}
+
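+/* Reap router-checker ping state: move rcds from the deathrow list through
+ * LNetMDUnlink(), then free zombies once the unlink event has invalidated
+ * their MD handle.  With @wait_unlink set, keep polling until every zombie
+ * is gone (used at shutdown). */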
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+       lnet_rc_data_t          *rcd;
+       lnet_rc_data_t          *tmp;
+       lnet_peer_t             *lp;
+       struct list_head        head;
+       int                     i = 2;
+
+       if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+                  list_empty(&the_lnet.ln_rcd_deathrow) &&
+                  list_empty(&the_lnet.ln_rcd_zombie)))
+               return;
+
+       INIT_LIST_HEAD(&head);
+
+       lnet_net_lock(LNET_LOCK_EX);
+
+       if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+               /* router checker is stopping, prune all */
+               list_for_each_entry(lp, &the_lnet.ln_routers,
+                                       lp_rtr_list) {
+                       if (lp->lp_rcd == NULL)
+                               continue;
+
+                       LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+                       list_add(&lp->lp_rcd->rcd_list,
+                                    &the_lnet.ln_rcd_deathrow);
+                       lp->lp_rcd = NULL;
+               }
+       }
+
+       /* unlink all RCDs on deathrow list */
+       list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+       if (!list_empty(&head)) {
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               list_for_each_entry(rcd, &head, rcd_list)
+                       LNetMDUnlink(rcd->rcd_mdh);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+       /* release all zombie RCDs */
+       while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+               list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+                                            rcd_list) {
+                       if (LNetHandleIsInvalid(rcd->rcd_mdh))
+                               list_move(&rcd->rcd_list, &head);
+               }
+
+               wait_unlink = wait_unlink &&
+                             !list_empty(&the_lnet.ln_rcd_zombie);
+
+               lnet_net_unlock(LNET_LOCK_EX);
+
+               while (!list_empty(&head)) {
+                       rcd = list_entry(head.next,
+                                            lnet_rc_data_t, rcd_list);
+                       list_del_init(&rcd->rcd_list);
+                       lnet_destroy_rc_data(rcd);
+               }
+
+               if (!wait_unlink)
+                       return;
+
+               i++;
+               CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                      "Waiting for rc buffers to unlink\n");
+               cfs_pause(cfs_time_seconds(1) / 4);
+
+               lnet_net_lock(LNET_LOCK_EX);
+       }
+
+       lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if  defined(LNET_ROUTER)
+
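+/* The kernel router checker thread: once a second, walk the router list
+ * (hopping to each router's CPT and rescanning whenever the list version
+ * changes underneath us), ping each router, refresh local NI status and
+ * reap unused ping state. */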
+static int
+lnet_router_checker(void *arg)
+{
+       lnet_peer_t       *rtr;
+       struct list_head        *entry;
+
+       cfs_block_allsigs();
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+               __u64   version;
+               int     cpt;
+               int     cpt2;
+
+               cpt = lnet_net_lock_current();
+rescan:
+               version = the_lnet.ln_routers_version;
+
+               list_for_each(entry, &the_lnet.ln_routers) {
+                       rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                       cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+                       if (cpt != cpt2) {
+                               lnet_net_unlock(cpt);
+                               cpt = cpt2;
+                               lnet_net_lock(cpt);
+                               /* the routers list has changed */
+                               if (version != the_lnet.ln_routers_version)
+                                       goto rescan;
+                       }
+
+                       lnet_ping_router_locked(rtr);
+
+                       /* NB dropped lock */
+                       if (version != the_lnet.ln_routers_version) {
+                               /* the routers list has changed */
+                               goto rescan;
+                       }
+               }
+
+               if (the_lnet.ln_routing)
+                       lnet_update_ni_status_locked();
+
+               lnet_net_unlock(cpt);
+
+               lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+               /* Calling cfs_pause() here would always add 1 to the load
+                * average because the kernel counts # active tasks as
+                * nr_running + nr_uninterruptible. */
+               schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+                                                  cfs_time_seconds(1));
+       }
+
+       LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+       lnet_prune_rc_data(1); /* wait for UNLINK */
+
+       the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+       up(&the_lnet.ln_rc_signal);
+       /* The unlink event callback will signal final completion */
+       return 0;
+}
+
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+       int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+       while (--npages >= 0)
+               __free_page(rb->rb_kiov[npages].kiov_page);
+
+       LIBCFS_FREE(rb, sz);
+}
+
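+/* Allocate one router buffer of rbp->rbp_npages pages, CPT-local where
+ * possible: the descriptor comes from the CPT's memory and the pages are
+ * spread over the CPT's NUMA node(s). */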
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+       int         npages = rbp->rbp_npages;
+       int         sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+       struct page   *page;
+       lnet_rtrbuf_t *rb;
+       int         i;
+
+       LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+       if (rb == NULL)
+               return NULL;
+
+       rb->rb_pool = rbp;
+
+       for (i = 0; i < npages; i++) {
+               page = alloc_pages_node(
+                               cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+                               __GFP_ZERO | GFP_IOFS, 0);
+               if (page == NULL) {
+                       while (--i >= 0)
+                               __free_page(rb->rb_kiov[i].kiov_page);
+
+                       LIBCFS_FREE(rb, sz);
+                       return NULL;
+               }
+
+               rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+               rb->rb_kiov[i].kiov_offset = 0;
+               rb->rb_kiov[i].kiov_page = page;
+       }
+
+       return rb;
+}
+
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+       int             npages = rbp->rbp_npages;
+       int             nbuffers = 0;
+       lnet_rtrbuf_t   *rb;
+
+       if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+               return;
+
+       LASSERT (list_empty(&rbp->rbp_msgs));
+       LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+       while (!list_empty(&rbp->rbp_bufs)) {
+               LASSERT (rbp->rbp_credits > 0);
+
+               rb = list_entry(rbp->rbp_bufs.next,
+                                   lnet_rtrbuf_t, rb_list);
+               list_del(&rb->rb_list);
+               lnet_destroy_rtrbuf(rb, npages);
+               nbuffers++;
+       }
+
+       LASSERT (rbp->rbp_nbuffers == nbuffers);
+       LASSERT (rbp->rbp_credits == nbuffers);
+
+       rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+       lnet_rtrbuf_t *rb;
+       int         i;
+
+       if (rbp->rbp_nbuffers != 0) {
+               LASSERT (rbp->rbp_nbuffers == nbufs);
+               return 0;
+       }
+
+       for (i = 0; i < nbufs; i++) {
+               rb = lnet_new_rtrbuf(rbp, cpt);
+
+               if (rb == NULL) {
+                       CERROR("Failed to allocate %d router bufs of %d pages\n",
+                              nbufs, rbp->rbp_npages);
+                       return -ENOMEM;
+               }
+
+               rbp->rbp_nbuffers++;
+               rbp->rbp_credits++;
+               rbp->rbp_mincredits++;
+               list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+               /* No allocation "under fire"; otherwise we'd need code to
+                * schedule blocked msgs etc */
+               LASSERT(!the_lnet.ln_routing);
+       }
+
+       LASSERT (rbp->rbp_credits == nbufs);
+       return 0;
+}
+
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+       INIT_LIST_HEAD(&rbp->rbp_msgs);
+       INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+       rbp->rbp_npages = npages;
+       rbp->rbp_credits = 0;
+       rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+       lnet_rtrbufpool_t *rtrp;
+       int               i;
+
+       if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+               return;
+
+       cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+               lnet_rtrpool_free_bufs(&rtrp[0]);
+               lnet_rtrpool_free_bufs(&rtrp[1]);
+               lnet_rtrpool_free_bufs(&rtrp[2]);
+       }
+
+       cfs_percpt_free(the_lnet.ln_rtrpools);
+       the_lnet.ln_rtrpools = NULL;
+}
+
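+/* The lnet_nrb_*_calculate() helpers take the built-in default or the
+ * corresponding module parameter, divide it across the CPTs and clamp the
+ * result to a per-CPT minimum; negative module parameters are rejected. */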
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_TINY;
+
+       if (tiny_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "tiny_router_buffers=%d invalid when routing enabled\n",
+                                  tiny_router_buffers);
+               return -1;
+       }
+
+       if (tiny_router_buffers > 0)
+               nrbs = tiny_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_SMALL;
+
+       if (small_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "small_router_buffers=%d invalid when routing enabled\n",
+                                  small_router_buffers);
+               return -1;
+       }
+
+       if (small_router_buffers > 0)
+               nrbs = small_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+       int     nrbs = LNET_NRB_LARGE;
+
+       if (large_router_buffers < 0) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "large_router_buffers=%d invalid when routing enabled\n",
+                                  large_router_buffers);
+               return -1;
+       }
+
+       if (large_router_buffers > 0)
+               nrbs = large_router_buffers;
+
+       nrbs /= LNET_CPT_NUMBER;
+       return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
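+/* Enable routing: honour the "forwarding" module parameter (when unset,
+ * route only if we were asked to be a router), then populate the tiny,
+ * small and large buffer pools on every CPT before flipping
+ * the_lnet.ln_routing on. */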
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+       lnet_rtrbufpool_t *rtrp;
+       int     large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       int     small_pages = 1;
+       int     nrb_tiny;
+       int     nrb_small;
+       int     nrb_large;
+       int     rc;
+       int     i;
+
+       if (!strcmp(forwarding, "")) {
+               /* not set either way */
+               if (!im_a_router)
+                       return 0;
+       } else if (!strcmp(forwarding, "disabled")) {
+               /* explicitly disabled */
+               return 0;
+       } else if (!strcmp(forwarding, "enabled")) {
+               /* explicitly enabled */
+       } else {
+               LCONSOLE_ERROR_MSG(0x10b,
+                                  "'forwarding' not set to either 'enabled' or 'disabled'\n");
+               return -EINVAL;
+       }
+
+       nrb_tiny = lnet_nrb_tiny_calculate(0);
+       if (nrb_tiny < 0)
+               return -EINVAL;
+
+       nrb_small = lnet_nrb_small_calculate(small_pages);
+       if (nrb_small < 0)
+               return -EINVAL;
+
+       nrb_large = lnet_nrb_large_calculate(large_pages);
+       if (nrb_large < 0)
+               return -EINVAL;
+
+       the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+                                               LNET_NRBPOOLS *
+                                               sizeof(lnet_rtrbufpool_t));
+       if (the_lnet.ln_rtrpools == NULL) {
+               LCONSOLE_ERROR_MSG(0x10c,
+                                  "Failed to initialize router buffer pool\n");
+               return -ENOMEM;
+       }
+
+       cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+               lnet_rtrpool_init(&rtrp[0], 0);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+               if (rc != 0)
+                       goto failed;
+
+               lnet_rtrpool_init(&rtrp[1], small_pages);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+               if (rc != 0)
+                       goto failed;
+
+               lnet_rtrpool_init(&rtrp[2], large_pages);
+               rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+               if (rc != 0)
+                       goto failed;
+       }
+
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_routing = 1;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return 0;
+
+ failed:
+       lnet_rtrpools_free();
+       return rc;
+}
+
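+/* Entry point for LNDs (and userspace) to report a peer up or down.
+ * Sanity-check the report (same net, not from the future, auto-down
+ * enabled), then record it and flush the notification to the LND. */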
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+       struct lnet_peer        *lp = NULL;
+       cfs_time_t              now = cfs_time_current();
+       int                     cpt = lnet_cpt_of_nid(nid);
+
+       LASSERT (!in_interrupt ());
+
+       CDEBUG (D_NET, "%s notifying %s: %s\n",
+               (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+               libcfs_nid2str(nid),
+               alive ? "up" : "down");
+
+       if (ni != NULL &&
+           LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+               CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+                       libcfs_nid2str(nid), alive ? "birth" : "death",
+                       libcfs_nid2str(ni->ni_nid));
+               return -EINVAL;
+       }
+
+       /* can't do predictions... */
+       if (cfs_time_after(when, now)) {
+               CWARN ("Ignoring prediction from %s of %s %s %ld seconds in the future\n",
+                      (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+                      libcfs_nid2str(nid), alive ? "up" : "down",
+                      cfs_duration_sec(cfs_time_sub(when, now)));
+               return -EINVAL;
+       }
+
+       if (ni != NULL && !alive &&          /* LND telling me she's down */
+           !auto_down) {                      /* auto-down disabled */
+               CDEBUG(D_NET, "Auto-down disabled\n");
+               return 0;
+       }
+
+       lnet_net_lock(cpt);
+
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
+       lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+       if (lp == NULL) {
+               /* nid not found */
+               lnet_net_unlock(cpt);
+               CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+               return 0;
+       }
+
+       /* We can't fully trust the LND to report an exact peer last_alive
+        * when it notifies us about a dead peer.  For example, ksocklnd can
+        * call us with when == _time_when_the_node_was_booted_ if no
+        * connections were successfully established */
+       if (ni != NULL && !alive && when < lp->lp_last_alive)
+               when = lp->lp_last_alive;
+
+       lnet_notify_locked(lp, ni == NULL, alive, when);
+
+       lnet_ni_notify_locked(ni, lp);
+
+       lnet_peer_decref_locked(lp);
+
+       lnet_net_unlock(cpt);
+       return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables(void)
+{
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+       return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker (void)
+{
+       static time_t last = 0;
+       static int    running = 0;
+
+       time_t      now = cfs_time_current_sec();
+       int            interval = now - last;
+       int            rc;
+       __u64        version;
+       lnet_peer_t      *rtr;
+
+       /* There's no use calling me again within a second; all intervals
+        * and timeouts are measured in seconds */
+       if (last != 0 && interval < 2)
+               return;
+
+       if (last != 0 &&
+           interval > MAX(live_router_check_interval,
+                          dead_router_check_interval))
+               CNETERR("Checker(%d/%d) not called for %d seconds\n",
+                       live_router_check_interval, dead_router_check_interval,
+                       interval);
+
+       LASSERT(LNET_CPT_NUMBER == 1);
+
+       lnet_net_lock(0);
+       LASSERT(!running); /* recursion check */
+       running = 1;
+       lnet_net_unlock(0);
+
+       last = now;
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+               lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+       /* consume all pending events */
+       while (1) {
+               int       i;
+               lnet_event_t ev;
+
+               /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+                * recursion breaker in LNetEQPoll would fail */
+               rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+               if (rc == 0)   /* no event pending */
+                       break;
+
+               /* NB a lost SENT prevents me from pinging a router again */
+               if (rc == -EOVERFLOW) {
+                       CERROR("Dropped an event!!!\n");
+                       abort();
+               }
+
+               LASSERT (rc == 1);
+
+               lnet_router_checker_event(&ev);
+       }
+
+       if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+               lnet_prune_rc_data(1); /* release rcd */
+               the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+               running = 0;
+               return;
+       }
+
+       LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+       lnet_net_lock(0);
+
+       version = the_lnet.ln_routers_version;
+       list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+               lnet_ping_router_locked(rtr);
+               LASSERT (version == the_lnet.ln_routers_version);
+       }
+
+       lnet_net_unlock(0);
+
+       running = 0; /* lock only needed for the recursion check */
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables (void)
+{
+       char *s;
+
+       s = getenv("LNET_ROUTER_PING_TIMEOUT");
+       if (s != NULL) router_ping_timeout = atoi(s);
+
+       s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+       if (s != NULL) live_router_check_interval = atoi(s);
+
+       s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+       if (s != NULL) dead_router_check_interval = atoi(s);
+
+       /* This replaces old lnd_notify mechanism */
+       check_routers_before_use = 1;
+       if (dead_router_check_interval <= 0)
+               dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+       return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644 (file)
index 0000000..3084b0c
--- /dev/null
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET        (0x100)
+enum {
+       PSDEV_LNET_STATS = 100,
+       PSDEV_LNET_ROUTES,
+       PSDEV_LNET_ROUTERS,
+       PSDEV_LNET_PEERS,
+       PSDEV_LNET_BUFFERS,
+       PSDEV_LNET_NIS,
+       PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS                (sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS     (LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS     MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS    LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS    (LNET_LOFFT_BITS -       \
+                                LNET_PROC_CPT_BITS -    \
+                                LNET_PROC_VER_BITS -    \
+                                LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS    (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS    (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK     ((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK     ((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK    ((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK    ((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos)                         \
+       (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos)                         \
+       (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos)                                \
+       (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos)                                \
+       (int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)                \
+       (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+       ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+       ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+       ((off) & LNET_PROC_HOFF_MASK))
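+
+/* A *ppos value is therefore laid out, from the MSB down, as
+ * [unused sign bit | cpt | version | hash bucket | offset within bucket] */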
+
+#define LNET_PROC_VERSION(v)   ((unsigned int)((v) & LNET_PROC_VER_MASK))
+
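+/* The 'stats' proc entry: reading returns the LNet counters on a single
+ * line; any write resets them. */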
+static int __proc_lnet_stats(void *data, int write,
+                            loff_t pos, void *buffer, int nob)
+{
+       int           rc;
+       lnet_counters_t *ctrs;
+       int           len;
+       char        *tmpstr;
+       const int       tmpsiz = 256; /* 7 %u and 4 LPU64 */
+
+       if (write) {
+               lnet_counters_reset();
+               return 0;
+       }
+
+       /* read */
+
+       LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+       if (ctrs == NULL)
+               return -ENOMEM;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL) {
+               LIBCFS_FREE(ctrs, sizeof(*ctrs));
+               return -ENOMEM;
+       }
+
+       lnet_counters_get(ctrs);
+
+       len = snprintf(tmpstr, tmpsiz,
+                      "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+                      LPU64" "LPU64,
+                      ctrs->msgs_alloc, ctrs->msgs_max,
+                      ctrs->errors,
+                      ctrs->send_count, ctrs->recv_count,
+                      ctrs->route_count, ctrs->drop_count,
+                      ctrs->send_length, ctrs->recv_length,
+                      ctrs->route_length, ctrs->drop_length);
+
+       if (pos >= min_t(int, len, strlen(tmpstr)))
+               rc = 0;
+       else
+               rc = cfs_trace_copyout_string(buffer, nob,
+                                             tmpstr + pos, "\n");
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+       LIBCFS_FREE(ctrs, sizeof(*ctrs));
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
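+/* The 'routes' proc entry.  The iterator state (remote-nets version plus
+ * how many routes to skip) is encoded into *ppos via LNET_PROC_POS_MAKE();
+ * if the route table version changes between reads, the encoded version no
+ * longer matches and the read fails with -ESTALE. */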
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+       const int       tmpsiz = 256;
+       char            *tmpstr;
+       char            *s;
+       int             rc = 0;
+       int             len;
+       int             ver;
+       int             off;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       CLASSERT(sizeof(loff_t) >= 4);
+
+       off = LNET_PROC_HOFF_GET(*ppos);
+       ver = LNET_PROC_VER_GET(*ppos);
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+                             the_lnet.ln_routing ? "enabled" : "disabled");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+                             "net", "hops", "state", "router");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               lnet_net_lock(0);
+               ver = (unsigned int)the_lnet.ln_remote_nets_version;
+               lnet_net_unlock(0);
+               *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+       } else {
+               struct list_head        *n;
+               struct list_head        *r;
+               lnet_route_t            *route = NULL;
+               lnet_remotenet_t        *rnet  = NULL;
+               int                     skip  = off - 1;
+               struct list_head        *rn_list;
+               int                     i;
+
+               lnet_net_lock(0);
+
+               if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+                       lnet_net_unlock(0);
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+                    i++) {
+                       rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+                       n = rn_list->next;
+
+                       while (n != rn_list && route == NULL) {
+                               rnet = list_entry(n, lnet_remotenet_t,
+                                                     lrn_list);
+
+                               r = rnet->lrn_routes.next;
+
+                               while (r != &rnet->lrn_routes) {
+                                       lnet_route_t *re =
+                                               list_entry(r, lnet_route_t,
+                                                              lr_list);
+                                       if (skip == 0) {
+                                               route = re;
+                                               break;
+                                       }
+
+                                       skip--;
+                                       r = r->next;
+                               }
+
+                               n = n->next;
+                       }
+               }
+
+               if (route != NULL) {
+                       __u32   net   = rnet->lrn_net;
+                       unsigned int hops  = route->lr_hops;
+                       lnet_nid_t   nid   = route->lr_gateway->lp_nid;
+                       int       alive = route->lr_gateway->lp_alive;
+
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-8s %4u %7s %s\n",
+                                     libcfs_net2str(net), hops,
+                                     alive ? "up" : "down",
+                                     libcfs_nid2str(nid));
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else {
+                       off += 1;
+                       *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+               }
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+       int     rc = 0;
+       char      *tmpstr;
+       char      *s;
+       const int  tmpsiz = 256;
+       int     len;
+       int     ver;
+       int     off;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       off = LNET_PROC_HOFF_GET(*ppos);
+       ver = LNET_PROC_VER_GET(*ppos);
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+                             "ref", "rtr_ref", "alive_cnt", "state",
+                             "last_ping", "ping_sent", "deadline",
+                             "down_ni", "router");
+               LASSERT(tmpstr + tmpsiz - s > 0);
+
+               lnet_net_lock(0);
+               ver = (unsigned int)the_lnet.ln_routers_version;
+               lnet_net_unlock(0);
+               *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+       } else {
+               struct list_head                *r;
+               struct lnet_peer        *peer = NULL;
+               int                     skip = off - 1;
+
+               lnet_net_lock(0);
+
+               if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+                       lnet_net_unlock(0);
+
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               r = the_lnet.ln_routers.next;
+
+               while (r != &the_lnet.ln_routers) {
+                       lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+                                                        lp_rtr_list);
+
+                       if (skip == 0) {
+                               peer = lp;
+                               break;
+                       }
+
+                       skip--;
+                       r = r->next;
+               }
+
+               if (peer != NULL) {
+                       lnet_nid_t nid = peer->lp_nid;
+                       cfs_time_t now = cfs_time_current();
+                       cfs_time_t deadline = peer->lp_ping_deadline;
+                       int nrefs     = peer->lp_refcount;
+                       int nrtrrefs  = peer->lp_rtr_refcount;
+                       int alive_cnt = peer->lp_alive_count;
+                       int alive     = peer->lp_alive;
+                       int pingsent  = !peer->lp_ping_notsent;
+                       int last_ping = cfs_duration_sec(cfs_time_sub(now,
+                                                    peer->lp_ping_timestamp));
+                       int down_ni   = 0;
+                       lnet_route_t *rtr;
+
+                       if ((peer->lp_ping_feats &
+                            LNET_PING_FEAT_NI_STATUS) != 0) {
+                               list_for_each_entry(rtr, &peer->lp_routes,
+                                                       lr_gwlist) {
+                                       /* the down-NI count on any route
+                                        * equals the number of down NIs
+                                        * on the gateway */
+                                       if (rtr->lr_downis != 0) {
+                                               down_ni = rtr->lr_downis;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if (deadline == 0)
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                             "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+                                             nrefs, nrtrrefs, alive_cnt,
+                                             alive ? "up" : "down", last_ping,
+                                             pingsent, "NA", down_ni,
+                                             libcfs_nid2str(nid));
+                       else
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                             "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+                                             nrefs, nrtrrefs, alive_cnt,
+                                             alive ? "up" : "down", last_ping,
+                                             pingsent,
+                                             cfs_duration_sec(cfs_time_sub(deadline, now)),
+                                             down_ni, libcfs_nid2str(nid));
+                       LASSERT (tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else {
+                       off += 1;
+                       *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+               }
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+       const int               tmpsiz  = 256;
+       struct lnet_peer_table  *ptable;
+       char                    *tmpstr;
+       char                    *s;
+       int                     cpt  = LNET_PROC_CPT_GET(*ppos);
+       int                     ver  = LNET_PROC_VER_GET(*ppos);
+       int                     hash = LNET_PROC_HASH_GET(*ppos);
+       int                     hoff = LNET_PROC_HOFF_GET(*ppos);
+       int                     rc = 0;
+       int                     len;
+
+       CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+       LASSERT(!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       if (cpt >= LNET_CPT_NUMBER) {
+               *lenp = 0;
+               return 0;
+       }
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+                             "nid", "refs", "state", "last", "max",
+                             "rtr", "min", "tx", "min", "queue");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+
+               hoff++;
+       } else {
+               struct lnet_peer        *peer;
+               struct list_head                *p;
+               int                     skip;
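+               /* peers live in per-CPT hash tables: scan the current
+                * CPT's table and move on to the next CPT once this one
+                * is exhausted (the cpt++/goto again handling below) */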
+ again:
+               p = NULL;
+               peer = NULL;
+               skip = hoff - 1;
+
+               lnet_net_lock(cpt);
+               ptable = the_lnet.ln_peer_tables[cpt];
+               if (hoff == 1)
+                       ver = LNET_PROC_VERSION(ptable->pt_version);
+
+               if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+                       lnet_net_unlock(cpt);
+                       LIBCFS_FREE(tmpstr, tmpsiz);
+                       return -ESTALE;
+               }
+
+               while (hash < LNET_PEER_HASH_SIZE) {
+                       if (p == NULL)
+                               p = ptable->pt_hash[hash].next;
+
+                       while (p != &ptable->pt_hash[hash]) {
+                               lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+                                                                lp_hashlist);
+                               if (skip == 0) {
+                                       peer = lp;
+
+                                       /* minor optimization: start from idx+1
+                                        * on next iteration if we've just
+                                        * drained lp_hashlist */
+                                       if (lp->lp_hashlist.next ==
+                                           &ptable->pt_hash[hash]) {
+                                               hoff = 1;
+                                               hash++;
+                                       } else {
+                                               hoff++;
+                                       }
+
+                                       break;
+                               }
+
+                               skip--;
+                               p = lp->lp_hashlist.next;
+                       }
+
+                       if (peer != NULL)
+                               break;
+
+                       p = NULL;
+                       hoff = 1;
+                       hash++;
+               }
+
+               if (peer != NULL) {
+                       lnet_nid_t nid       = peer->lp_nid;
+                       int     nrefs     = peer->lp_refcount;
+                       int     lastalive = -1;
+                       char      *aliveness = "NA";
+                       int     maxcr     = peer->lp_ni->ni_peertxcredits;
+                       int     txcr      = peer->lp_txcredits;
+                       int     mintxcr   = peer->lp_mintxcredits;
+                       int     rtrcr     = peer->lp_rtrcredits;
+                       int     minrtrcr  = peer->lp_minrtrcredits;
+                       int     txqnob    = peer->lp_txqnob;
+
+                       if (lnet_isrouter(peer) ||
+                           lnet_peer_aliveness_enabled(peer))
+                               aliveness = peer->lp_alive ? "up" : "down";
+
+                       if (lnet_peer_aliveness_enabled(peer)) {
+                               cfs_time_t     now = cfs_time_current();
+                               cfs_duration_t delta;
+
+                               delta = cfs_time_sub(now, peer->lp_last_alive);
+                               lastalive = cfs_duration_sec(delta);
+
+                               /* no need to clutter the output with
+                                * arbitrarily large integers - it suffices
+                                * to know that lastalive is more than
+                                * 10000 seconds old */
+                               if (lastalive >= 10000)
+                                       lastalive = 9999;
+                       }
+
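+                       /* all the peer fields have been copied to locals,
+                        * so the net lock can be dropped before formatting */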
+                       lnet_net_unlock(cpt);
+
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+                                     libcfs_nid2str(nid), nrefs, aliveness,
+                                     lastalive, maxcr, rtrcr, minrtrcr, txcr,
+                                     mintxcr, txqnob);
+                       LASSERT (tmpstr + tmpsiz - s > 0);
+
+               } else { /* peer is NULL */
+                       lnet_net_unlock(cpt);
+               }
+
+               if (hash == LNET_PEER_HASH_SIZE) {
+                       cpt++;
+                       hash = 0;
+                       hoff = 1;
+                       if (peer == NULL && cpt < LNET_CPT_NUMBER)
+                               goto again;
+               }
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else
+                       *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+                              loff_t pos, void *buffer, int nob)
+{
+       char        *s;
+       char        *tmpstr;
+       int             tmpsiz;
+       int             idx;
+       int             len;
+       int             rc;
+       int             i;
+
+       LASSERT(!write);
+
+       /* a 64-byte line (4 %d fields) per pool, plus one for the
+        * header, for each CPT */
+       tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       s += snprintf(s, tmpstr + tmpsiz - s,
+                     "%5s %5s %7s %7s\n",
+                     "pages", "count", "credits", "min");
+       LASSERT (tmpstr + tmpsiz - s > 0);
+
+       if (the_lnet.ln_rtrpools == NULL)
+               goto out; /* I'm not a router */
+
+       for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+               lnet_rtrbufpool_t *rbp;
+
+               lnet_net_lock(LNET_LOCK_EX);
+               cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+                       s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%5d %5d %7d %7d\n",
+                                     rbp[idx].rbp_npages,
+                                     rbp[idx].rbp_nbuffers,
+                                     rbp[idx].rbp_credits,
+                                     rbp[idx].rbp_mincredits);
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+               lnet_net_unlock(LNET_LOCK_EX);
+       }
+
+ out:
+       len = s - tmpstr;
+
+       if (pos >= min_t(int, len, strlen(tmpstr)))
+               rc = 0;
+       else
+               rc = cfs_trace_copyout_string(buffer, nob,
+                                             tmpstr + pos, NULL);
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+       int     tmpsiz = 128 * LNET_CPT_NUMBER;
+       int     rc = 0;
+       char      *tmpstr;
+       char      *s;
+       int     len;
+
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       LASSERT (!write);
+
+       if (*lenp == 0)
+               return 0;
+
+       LIBCFS_ALLOC(tmpstr, tmpsiz);
+       if (tmpstr == NULL)
+               return -ENOMEM;
+
+       s = tmpstr; /* points to current position in tmpstr[] */
+
+       if (*ppos == 0) {
+               s += snprintf(s, tmpstr + tmpsiz - s,
+                             "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+                             "nid", "status", "alive", "refs", "peer",
+                             "rtr", "max", "tx", "min");
+               LASSERT (tmpstr + tmpsiz - s > 0);
+       } else {
+               struct list_head        *n;
+               lnet_ni_t        *ni   = NULL;
+               int             skip = *ppos - 1;
+
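+               /* *ppos here is a plain 1-based NI index; position 0 is
+                * the header line printed above */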
+               lnet_net_lock(0);
+
+               n = the_lnet.ln_nis.next;
+
+               while (n != &the_lnet.ln_nis) {
+                       lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+                       if (skip == 0) {
+                               ni = a_ni;
+                               break;
+                       }
+
+                       skip--;
+                       n = n->next;
+               }
+
+               if (ni != NULL) {
+                       struct lnet_tx_queue    *tq;
+                       char    *stat;
+                       long    now = cfs_time_current_sec();
+                       int     last_alive = -1;
+                       int     i;
+                       int     j;
+
+                       if (the_lnet.ln_routing)
+                               last_alive = now - ni->ni_last_alive;
+
+                       /* @lo forever alive */
+                       if (ni->ni_lnd->lnd_type == LOLND)
+                               last_alive = 0;
+
+                       lnet_ni_lock(ni);
+                       LASSERT(ni->ni_status != NULL);
+                       stat = (ni->ni_status->ns_status ==
+                               LNET_NI_STATUS_UP) ? "up" : "down";
+                       lnet_ni_unlock(ni);
+
+                       /* we actually output credits information for
+                        * TX queue of each partition */
+                       cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+                               for (j = 0; ni->ni_cpts != NULL &&
+                                    j < ni->ni_ncpts; j++) {
+                                       if (i == ni->ni_cpts[j])
+                                               break;
+                               }
+
+                               if (j == ni->ni_ncpts)
+                                       continue;
+
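+                               /* lock 0 is already held by this reader;
+                                * other partitions are locked on demand */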
+                               if (i != 0)
+                                       lnet_net_lock(i);
+
+                               s += snprintf(s, tmpstr + tmpsiz - s,
+                                     "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+                                     libcfs_nid2str(ni->ni_nid), stat,
+                                     last_alive, *ni->ni_refs[i],
+                                     ni->ni_peertxcredits,
+                                     ni->ni_peerrtrcredits,
+                                     tq->tq_credits_max,
+                                     tq->tq_credits, tq->tq_credits_min);
+                               if (i != 0)
+                                       lnet_net_unlock(i);
+                       }
+                       LASSERT(tmpstr + tmpsiz - s > 0);
+               }
+
+               lnet_net_unlock(0);
+       }
+
+       len = s - tmpstr;     /* how many bytes were written */
+
+       if (len > *lenp) {    /* linux-supplied buffer is too small */
+               rc = -EINVAL;
+       } else if (len > 0) { /* wrote something */
+               if (copy_to_user(buffer, tmpstr, len))
+                       rc = -EFAULT;
+               else
+                       *ppos += 1;
+       }
+
+       LIBCFS_FREE(tmpstr, tmpsiz);
+
+       if (rc == 0)
+               *lenp = len;
+
+       return rc;
+}
+
+struct lnet_portal_rotors {
+       int          pr_value;
+       const char      *pr_name;
+       const char      *pr_desc;
+};
+
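+/*
+ * Names accepted by the "portal_rotor" file, mapped to their
+ * LNET_PTL_ROTOR_* values; the table is terminated by a negative
+ * pr_value / NULL pr_name entry.
+ */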
+static struct lnet_portal_rotors       portal_rotors[] = {
+       {
+               .pr_value = LNET_PTL_ROTOR_OFF,
+               .pr_name  = "OFF",
+               .pr_desc  = "Turn off message rotor for wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_ON,
+               .pr_name  = "ON",
+               .pr_desc  = "round-robin dispatch all PUT messages for "
+                           "wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_RR_RT,
+               .pr_name  = "RR_RT",
+               .pr_desc  = "round-robin dispatch routed PUT message for "
+                           "wildcard portals"
+       },
+       {
+               .pr_value = LNET_PTL_ROTOR_HASH_RT,
+               .pr_name  = "HASH_RT",
+               .pr_desc  = "dispatch routed PUT message by hashing source "
+                           "NID for wildcard portals"
+       },
+       {
+               .pr_value = -1,
+               .pr_name  = NULL,
+               .pr_desc  = NULL
+       },
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+                                   loff_t pos, void *buffer, int nob)
+{
+       const int       buf_len = 128;
+       char            *buf;
+       char            *tmp;
+       int             rc;
+       int             i;
+
+       LIBCFS_ALLOC(buf, buf_len);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       if (!write) {
+               lnet_res_lock(0);
+
+               for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+                       if (portal_rotors[i].pr_value == portal_rotor)
+                               break;
+               }
+
+               LASSERT(portal_rotors[i].pr_value == portal_rotor);
+               lnet_res_unlock(0);
+
+               rc = snprintf(buf, buf_len,
+                             "{\n\tportals: all\n"
+                             "\trotor: %s\n\tdescription: %s\n}",
+                             portal_rotors[i].pr_name,
+                             portal_rotors[i].pr_desc);
+
+               if (pos >= min_t(int, rc, buf_len)) {
+                       rc = 0;
+               } else {
+                       rc = cfs_trace_copyout_string(buffer, nob,
+                                       buf + pos, "\n");
+               }
+               goto out;
+       }
+
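+       /* write: match the supplied name against the rotor table
+        * (case-insensitive) and switch the policy under lnet_res_lock */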
+       rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+       if (rc < 0)
+               goto out;
+
+       tmp = cfs_trimwhite(buf);
+
+       rc = -EINVAL;
+       lnet_res_lock(0);
+       for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+               if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+                                   strlen(portal_rotors[i].pr_name)) == 0) {
+                       portal_rotor = portal_rotors[i].pr_value;
+                       rc = 0;
+                       break;
+               }
+       }
+       lnet_res_unlock(0);
+out:
+       LIBCFS_FREE(buf, buf_len);
+       return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+static ctl_table_t lnet_table[] = {
+       /*
+        * NB No .strategy entries have been provided since sysctl(8) prefers
+        * to go via /proc for portability.
+        */
+       {
+               INIT_CTL_NAME(PSDEV_LNET_STATS)
+               .procname = "stats",
+               .mode     = 0644,
+               .proc_handler = &proc_lnet_stats,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+               .procname = "routes",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_routes,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+               .procname = "routers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_routers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PEERS)
+               .procname = "peers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_peers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PEERS)
+               .procname = "buffers",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_buffers,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_NIS)
+               .procname = "nis",
+               .mode     = 0444,
+               .proc_handler = &proc_lnet_nis,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+               .procname = "portal_rotor",
+               .mode     = 0644,
+               .proc_handler = &proc_lnet_portal_rotor,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+static ctl_table_t top_table[] = {
+       {
+               INIT_CTL_NAME(CTL_LNET)
+               .procname = "lnet",
+               .mode     = 0555,
+               .data     = NULL,
+               .maxlen   = 0,
+               .child    = lnet_table,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header == NULL)
+               lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header != NULL)
+               unregister_sysctl_table(lnet_table_header);
+
+       lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644 (file)
index 0000000..1e40aee
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+                  module.o ping_test.o brw_test.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644 (file)
index 0000000..3bb6fbe
--- /dev/null
@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+               "# data errors to inject randomly, zero by default");
+
+static void
+brw_client_fini (sfw_test_instance_t *tsi)
+{
+       srpc_bulk_t     *bulk;
+       sfw_test_unit_t *tsu;
+
+       LASSERT (tsi->tsi_is_client);
+
+       list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+               bulk = tsu->tsu_private;
+               if (bulk == NULL) continue;
+
+               srpc_free_bulk(bulk);
+               tsu->tsu_private = NULL;
+       }
+}
+
+int
+brw_client_init (sfw_test_instance_t *tsi)
+{
+       sfw_session_t    *sn = tsi->tsi_batch->bat_session;
+       int               flags;
+       int               npg;
+       int               len;
+       int               opc;
+       srpc_bulk_t      *bulk;
+       sfw_test_unit_t  *tsu;
+
+       LASSERT(sn != NULL);
+       LASSERT(tsi->tsi_is_client);
+
+       if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+               test_bulk_req_t  *breq = &tsi->tsi_u.bulk_v0;
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               npg   = breq->blk_npg;
+               /* NB: this is not going to work for variable page size,
+                * but we have to keep it for compatibility */
+               len   = npg * PAGE_CACHE_SIZE;
+
+       } else {
+               test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+               /* we should never get here with an unknown feature,
+                * because make_session rejects unknown features */
+               LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               len   = breq->blk_len;
+               npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       if (npg > LNET_MAX_IOV || npg <= 0)
+               return -EINVAL;
+
+       if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+               return -EINVAL;
+
+       if (flags != LST_BRW_CHECK_NONE &&
+           flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+               return -EINVAL;
+
+       list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+               bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+                                      npg, len, opc == LST_BRW_READ);
+               if (bulk == NULL) {
+                       brw_client_fini(tsi);
+                       return -ENOMEM;
+               }
+
+               tsu->tsu_private = bulk;
+       }
+
+       return 0;
+}
+
+#define BRW_POISON      0xbeefbeefbeefbeefULL
+#define BRW_MAGIC       0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE       sizeof(__u64)
+
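+/* on roughly half of calls (odd tv_usec) consume one pending injected
+ * error; the nonzero return perturbs the magic written by
+ * brw_fill_page() so the remote check fails */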
+int
+brw_inject_one_error (void)
+{
+       struct timeval tv;
+
+       if (brw_inject_errors <= 0) return 0;
+
+       do_gettimeofday(&tv);
+
+       if ((tv.tv_usec & 1) == 0) return 0;
+
+       return brw_inject_errors--;
+}
+
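+/*
+ * LST_BRW_CHECK_SIMPLE stamps the magic into the first and last 8 bytes
+ * of a page; LST_BRW_CHECK_FULL stamps every 8-byte word.
+ * brw_check_page() verifies the same positions on the receiving side.
+ */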
+void
+brw_fill_page (struct page *pg, int pattern, __u64 magic)
+{
+       char *addr = page_address(pg);
+       int   i;
+
+       LASSERT (addr != NULL);
+
+       if (pattern == LST_BRW_CHECK_NONE) return;
+
+       if (magic == BRW_MAGIC)
+               magic += brw_inject_one_error();
+
+       if (pattern == LST_BRW_CHECK_SIMPLE) {
+               memcpy(addr, &magic, BRW_MSIZE);
+               addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+               memcpy(addr, &magic, BRW_MSIZE);
+               return;
+       }
+
+       if (pattern == LST_BRW_CHECK_FULL) {
+               for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+                       memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
+               return;
+       }
+
+       LBUG ();
+}
+
+int
+brw_check_page (struct page *pg, int pattern, __u64 magic)
+{
+       char  *addr = page_address(pg);
+       __u64  data = 0; /* make compiler happy */
+       int    i;
+
+       LASSERT (addr != NULL);
+
+       if (pattern == LST_BRW_CHECK_NONE)
+               return 0;
+
+       if (pattern == LST_BRW_CHECK_SIMPLE) {
+               data = *((__u64 *) addr);
+               if (data != magic) goto bad_data;
+
+               addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+               data = *((__u64 *) addr);
+               if (data != magic) goto bad_data;
+
+               return 0;
+       }
+
+       if (pattern == LST_BRW_CHECK_FULL) {
+               for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+                       data = *(((__u64 *) addr) + i);
+                       if (data != magic) goto bad_data;
+               }
+
+               return 0;
+       }
+
+       LBUG ();
+
+bad_data:
+       CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n",
+               pg, data, magic);
+       return 1;
+}
+
+void
+brw_fill_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+       int      i;
+       struct page *pg;
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               brw_fill_page(pg, pattern, magic);
+       }
+}
+
+int
+brw_check_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+       int      i;
+       struct page *pg;
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               if (brw_check_page(pg, pattern, magic) != 0) {
+                       CERROR ("Bulk page %p (%d/%d) is corrupted!\n",
+                               pg, i, bk->bk_niov);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+brw_client_prep_rpc (sfw_test_unit_t *tsu,
+                    lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+       srpc_bulk_t      *bulk = tsu->tsu_private;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_client_rpc_t   *rpc;
+       srpc_brw_reqst_t    *req;
+       int                  flags;
+       int                  npg;
+       int                  len;
+       int                  opc;
+       int                  rc;
+
+       LASSERT(sn != NULL);
+       LASSERT(bulk != NULL);
+
+       if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+               test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               npg   = breq->blk_npg;
+               len   = npg * PAGE_CACHE_SIZE;
+
+       } else {
+               test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+               /* we should never get here with an unknown feature,
+                * because make_session rejects unknown features */
+               LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+               opc   = breq->blk_opc;
+               flags = breq->blk_flags;
+               len   = breq->blk_len;
+               npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+       if (rc != 0)
+               return rc;
+
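+       /* reuse the bulk buffer set up at test init: copy the descriptor
+        * header plus npg iovec entries into the RPC's embedded bulk */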
+       memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+       if (opc == LST_BRW_WRITE)
+               brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+       else
+               brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+       req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+       req->brw_flags = flags;
+       req->brw_rw    = opc;
+       req->brw_len   = len;
+
+       *rpcpp = rpc;
+       return 0;
+}
+
+static void
+brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+       __u64           magic = BRW_MAGIC;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_msg_t        *msg = &rpc->crpc_replymsg;
+       srpc_brw_reply_t    *reply = &msg->msg_body.brw_reply;
+       srpc_brw_reqst_t    *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+       LASSERT (sn != NULL);
+
+       if (rpc->crpc_status != 0) {
+               CERROR ("BRW RPC to %s failed with %d\n",
+                       libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+               if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                       atomic_inc(&sn->sn_brw_errors);
+               goto out;
+       }
+
+       if (msg->msg_magic != SRPC_MSG_MAGIC) {
+               __swab64s(&magic);
+               __swab32s(&reply->brw_status);
+       }
+
+       CDEBUG (reply->brw_status ? D_WARNING : D_NET,
+               "BRW RPC to %s finished with brw_status: %d\n",
+               libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+       if (reply->brw_status != 0) {
+               atomic_inc(&sn->sn_brw_errors);
+               rpc->crpc_status = -(int)reply->brw_status;
+               goto out;
+       }
+
+       if (reqst->brw_rw == LST_BRW_WRITE) goto out;
+
+       if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+               CERROR ("Bulk data from %s is corrupted!\n",
+                       libcfs_id2str(rpc->crpc_dest));
+               atomic_inc(&sn->sn_brw_errors);
+               rpc->crpc_status = -EBADMSG;
+       }
+
+out:
+       return;
+}
+
+void
+brw_server_rpc_done (srpc_server_rpc_t *rpc)
+{
+       srpc_bulk_t *blk = rpc->srpc_bulk;
+
+       if (blk == NULL) return;
+
+       if (rpc->srpc_status != 0)
+               CERROR ("Bulk transfer %s %s has failed: %d\n",
+                       blk->bk_sink ? "from" : "to",
+                       libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+       else
+               CDEBUG (D_NET, "Transfered %d pages bulk data %s %s\n",
+                       blk->bk_niov, blk->bk_sink ? "from" : "to",
+                       libcfs_id2str(rpc->srpc_peer));
+
+       sfw_free_pages(rpc);
+}
+
+int
+brw_bulk_ready (srpc_server_rpc_t *rpc, int status)
+{
+       __u64        magic = BRW_MAGIC;
+       srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply;
+       srpc_brw_reqst_t *reqst;
+       srpc_msg_t       *reqstmsg;
+
+       LASSERT (rpc->srpc_bulk != NULL);
+       LASSERT (rpc->srpc_reqstbuf != NULL);
+
+       reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       reqst = &reqstmsg->msg_body.brw_reqst;
+
+       if (status != 0) {
+               CERROR ("BRW bulk %s failed for RPC from %s: %d\n",
+                       reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+                       libcfs_id2str(rpc->srpc_peer), status);
+               return -EIO;
+       }
+
+       if (reqst->brw_rw == LST_BRW_READ)
+               return 0;
+
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC)
+               __swab64s(&magic);
+
+       if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) {
+               CERROR ("Bulk data from %s is corrupted!\n",
+                       libcfs_id2str(rpc->srpc_peer));
+               reply->brw_status = EBADMSG;
+       }
+
+       return 0;
+}
+
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       srpc_msg_t       *replymsg = &rpc->srpc_replymsg;
+       srpc_msg_t       *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+       srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+       int               npg;
+       int            rc;
+
+       LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
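+       /* the request came from a peer of opposite endianness:
+        * byte-swap the BRW request fields in place */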
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+               LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+               __swab32s(&reqst->brw_rw);
+               __swab32s(&reqst->brw_len);
+               __swab32s(&reqst->brw_flags);
+               __swab64s(&reqst->brw_rpyid);
+               __swab64s(&reqst->brw_bulkid);
+       }
+       LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+       reply->brw_status = 0;
+       rpc->srpc_done = brw_server_rpc_done;
+
+       if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+           (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+            reqst->brw_flags != LST_BRW_CHECK_FULL &&
+            reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+               reply->brw_status = EINVAL;
+               return 0;
+       }
+
+       if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               replymsg->msg_ses_feats = LST_FEATS_MASK;
+               reply->brw_status = EPROTO;
+               return 0;
+       }
+
+       if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+               /* compat with old version */
+               if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+                       reply->brw_status = EINVAL;
+                       return 0;
+               }
+               npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+       } else {
+               npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       }
+
+       replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+       if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+               reply->brw_status = EINVAL;
+               return 0;
+       }
+
+       rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+                            reqst->brw_len,
+                            reqst->brw_rw == LST_BRW_WRITE);
+       if (rc != 0)
+               return rc;
+
+       if (reqst->brw_rw == LST_BRW_READ)
+               brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+       else
+               brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+       return 0;
+}
+
+sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void)
+{
+       brw_test_client.tso_init       = brw_client_init;
+       brw_test_client.tso_fini       = brw_client_fini;
+       brw_test_client.tso_prep_rpc   = brw_client_prep_rpc;
+       brw_test_client.tso_done_rpc   = brw_client_done_rpc;
+}
+
+srpc_service_t brw_test_service;
+void brw_init_test_service(void)
+{
+       brw_test_service.sv_id   = SRPC_SERVICE_BRW;
+       brw_test_service.sv_name       = "brw_test";
+       brw_test_service.sv_handler    = brw_server_handle;
+       brw_test_service.sv_bulk_ready = brw_bulk_ready;
+       brw_test_service.sv_wi_total   = brw_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644 (file)
index 0000000..bce3d3b
--- /dev/null
@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handler in the kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+       char      *name;
+       int     rc;
+
+       if (args->lstio_ses_idp   == NULL || /* address for output sid */
+           args->lstio_ses_key   == 0 || /* no key is specified */
+           args->lstio_ses_namep == NULL || /* session name */
+           args->lstio_ses_nmlen <= 0 ||
+           args->lstio_ses_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_ses_namep,
+                              args->lstio_ses_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_ses_nmlen] = 0;
+
+       rc = lstcon_session_new(name,
+                               args->lstio_ses_key,
+                               args->lstio_ses_feats,
+                               args->lstio_ses_force,
+                               args->lstio_ses_timeout,
+                               args->lstio_ses_idp);
+
+       LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+       return rc;
+}
+
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+       if (args->lstio_ses_key != console_session.ses_key)
+               return -EACCES;
+
+       return lstcon_session_end();
+}
+
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+       /* no checking of key */
+
+       if (args->lstio_ses_idp   == NULL || /* address for output sid */
+           args->lstio_ses_keyp  == NULL || /* address for output key */
+           args->lstio_ses_featp  == NULL || /* address for output features */
+           args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+           args->lstio_ses_namep == NULL || /* address for output name */
+           args->lstio_ses_nmlen <= 0 ||
+           args->lstio_ses_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_session_info(args->lstio_ses_idp,
+                                  args->lstio_ses_keyp,
+                                  args->lstio_ses_featp,
+                                  args->lstio_ses_ndinfo,
+                                  args->lstio_ses_namep,
+                                  args->lstio_ses_nmlen);
+}
+
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+       char   *name   = NULL;
+       int     client = 1;
+       int     rc;
+
+       if (args->lstio_dbg_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_dbg_resultp == NULL)
+               return -EINVAL;
+
+       if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+           (args->lstio_dbg_nmlen <= 0 ||
+            args->lstio_dbg_nmlen > LST_NAME_SIZE))
+               return -EINVAL;
+
+       if (args->lstio_dbg_namep != NULL) {
+               LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+               if (name == NULL)
+                       return -ENOMEM;
+
+               if (copy_from_user(name, args->lstio_dbg_namep,
+                                      args->lstio_dbg_nmlen)) {
+                       LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+                       return -EFAULT;
+               }
+
+               name[args->lstio_dbg_nmlen] = 0;
+       }
+
+       rc = -EINVAL;
+
+       switch (args->lstio_dbg_type) {
+       case LST_OPC_SESSION:
+               rc = lstcon_session_debug(args->lstio_dbg_timeout,
+                                         args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_BATCHSRV:
+               client = 0;
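+               /* fall through: BATCHSRV is BATCHCLI with client == 0 */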
+       case LST_OPC_BATCHCLI:
+               if (name == NULL)
+                       goto out;
+
+               rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+                                       name, client, args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_GROUP:
+               if (name == NULL)
+                       goto out;
+
+               rc = lstcon_group_debug(args->lstio_dbg_timeout,
+                                       name, args->lstio_dbg_resultp);
+               break;
+
+       case LST_OPC_NODES:
+               if (args->lstio_dbg_count <= 0 ||
+                   args->lstio_dbg_idsp == NULL)
+                       goto out;
+
+               rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+                                       args->lstio_dbg_count,
+                                       args->lstio_dbg_idsp,
+                                       args->lstio_dbg_resultp);
+               break;
+
+       default:
+               break;
+       }
+
+out:
+       if (name != NULL)
+               LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+       char       *name;
+       int          rc;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_add(name);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_del(name);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_resultp == NULL ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                          args->lstio_grp_namep,
+                          args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       switch (args->lstio_grp_opc) {
+       case LST_GROUP_CLEAN:
+               rc = lstcon_group_clean(name, args->lstio_grp_args);
+               break;
+
+       case LST_GROUP_REFRESH:
+               rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+               break;
+
+       case LST_GROUP_RMND:
+               if (args->lstio_grp_count  <= 0 ||
+                   args->lstio_grp_idsp == NULL) {
+                       rc = -EINVAL;
+                       break;
+               }
+               rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+                                        args->lstio_grp_idsp,
+                                        args->lstio_grp_resultp);
+               break;
+
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+       unsigned feats;
+       int     rc;
+       char   *name;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_idsp == NULL || /* array of ids */
+           args->lstio_grp_count <= 0 ||
+           args->lstio_grp_resultp == NULL ||
+           args->lstio_grp_featp == NULL ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name, args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_nodes_add(name, args->lstio_grp_count,
+                             args->lstio_grp_idsp, &feats,
+                             args->lstio_grp_resultp);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+       if (rc == 0 &&
+           copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) {
+               return -EFAULT;
+       }
+
+       return rc;
+}
+
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_idx   < 0 ||
+           args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_group_list(args->lstio_grp_idx,
+                             args->lstio_grp_nmlen,
+                             args->lstio_grp_namep);
+}
+
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+       char       *name;
+       int          ndent;
+       int          index;
+       int          rc;
+
+       if (args->lstio_grp_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_grp_namep == NULL ||
+           args->lstio_grp_nmlen <= 0 ||
+           args->lstio_grp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_grp_entp  == NULL && /* output: group entry */
+           args->lstio_grp_dentsp == NULL)  /* output: node entry */
+               return -EINVAL;
+
+       if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+               if (args->lstio_grp_idxp == NULL || /* node index */
+                   args->lstio_grp_ndentp == NULL) /* # of node entry */
+                       return -EINVAL;
+
+               if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+                                      sizeof(ndent)) ||
+                   copy_from_user(&index, args->lstio_grp_idxp,
+                                      sizeof(index)))
+                       return -EFAULT;
+
+               if (ndent <= 0 || index < 0)
+                       return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_grp_namep,
+                              args->lstio_grp_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_grp_nmlen] = 0;
+
+       rc = lstcon_group_info(name, args->lstio_grp_entp,
+                              &index, &ndent, args->lstio_grp_dentsp);
+
+       LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+       if (rc != 0)
+               return rc;
+
+       if (args->lstio_grp_dentsp != NULL &&
+           (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+            copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+               rc = -EFAULT;
+
+       return rc;
+}
+
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_add(name);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_run(name, args->lstio_bat_timeout,
+                             args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_resultp == NULL ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_stop(name, args->lstio_bat_force,
+                              args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+       char   *name;
+       int     rc;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_resultp == NULL ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_bat_testidx < 0)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep,
+                              args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_test_batch_query(name,
+                                    args->lstio_bat_testidx,
+                                    args->lstio_bat_client,
+                                    args->lstio_bat_timeout,
+                                    args->lstio_bat_resultp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       return rc;
+}
+
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_idx   < 0 ||
+           args->lstio_bat_namep == NULL ||
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       return lstcon_batch_list(args->lstio_bat_idx,
+                             args->lstio_bat_nmlen,
+                             args->lstio_bat_namep);
+}
+
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+       char       *name;
+       int          rc;
+       int          index;
+       int          ndent;
+
+       if (args->lstio_bat_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_bat_namep == NULL || /* batch name */
+           args->lstio_bat_nmlen <= 0 ||
+           args->lstio_bat_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_bat_entp == NULL && /* output: batch entry */
+           args->lstio_bat_dentsp == NULL) /* output: node entry */
+               return -EINVAL;
+
+       if (args->lstio_bat_dentsp != NULL) { /* have node entry */
+               if (args->lstio_bat_idxp == NULL || /* node index */
+                   args->lstio_bat_ndentp == NULL) /* # of node entry */
+                       return -EINVAL;
+
+               if (copy_from_user(&index, args->lstio_bat_idxp,
+                                      sizeof(index)) ||
+                   copy_from_user(&ndent, args->lstio_bat_ndentp,
+                                      sizeof(ndent)))
+                       return -EFAULT;
+
+               if (ndent <= 0 || index < 0)
+                       return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name,
+                              args->lstio_bat_namep, args->lstio_bat_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+               return -EFAULT;
+       }
+
+       name[args->lstio_bat_nmlen] = 0;
+
+       rc = lstcon_batch_info(name,
+                           args->lstio_bat_entp, args->lstio_bat_server,
+                           args->lstio_bat_testidx, &index, &ndent,
+                           args->lstio_bat_dentsp);
+
+       LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+       if (rc != 0)
+               return rc;
+
+       if (args->lstio_bat_dentsp != NULL &&
+           (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+            copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent))))
+               rc = -EFAULT;
+
+       return rc;
+}
+
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+       int          rc;
+       char       *name;
+
+       /* TODO: not finished */
+       if (args->lstio_sta_key != console_session.ses_key)
+               return -EACCES;
+
+       if (args->lstio_sta_resultp == NULL ||
+           (args->lstio_sta_namep  == NULL &&
+            args->lstio_sta_idsp   == NULL) ||
+           args->lstio_sta_nmlen <= 0 ||
+           args->lstio_sta_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_sta_idsp != NULL &&
+           args->lstio_sta_count <= 0)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name, args->lstio_sta_namep,
+                              args->lstio_sta_nmlen)) {
+               LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+               return -EFAULT;
+       }
+
+       if (args->lstio_sta_idsp == NULL) {
+               rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+                                      args->lstio_sta_resultp);
+       } else {
+               rc = lstcon_nodes_stat(args->lstio_sta_count,
+                                      args->lstio_sta_idsp,
+                                      args->lstio_sta_timeout,
+                                      args->lstio_sta_resultp);
+       }
+
+       LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+       return rc;
+}
+
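+/* Add a test to a batch: validate the batch/group names and the optional
+ * test parameter blob, copy them in from user space, and hand them to
+ * lstcon_test_add().  Name buffers are bounded by LST_NAME_SIZE and the
+ * parameter blob by PAGE_CACHE_SIZE - sizeof(lstcon_test_t). */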
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+       char       *name;
+       char       *srcgrp = NULL;
+       char       *dstgrp = NULL;
+       void       *param = NULL;
+       int          ret = 0;
+       int          rc = -ENOMEM;
+
+       if (args->lstio_tes_resultp == NULL ||
+           args->lstio_tes_retp == NULL ||
+           args->lstio_tes_bat_name == NULL || /* no specified batch */
+           args->lstio_tes_bat_nmlen <= 0 ||
+           args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+           args->lstio_tes_sgrp_name == NULL || /* no source group */
+           args->lstio_tes_sgrp_nmlen <= 0 ||
+           args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+           args->lstio_tes_dgrp_name == NULL || /* no target group */
+           args->lstio_tes_dgrp_nmlen <= 0 ||
+           args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+               return -EINVAL;
+
+       if (args->lstio_tes_loop == 0 || /* negative is infinite */
+           args->lstio_tes_concur <= 0 ||
+           args->lstio_tes_dist <= 0 ||
+           args->lstio_tes_span <= 0)
+               return -EINVAL;
+
+       /* have parameter, check if parameter length is valid */
+       if (args->lstio_tes_param != NULL &&
+           (args->lstio_tes_param_len <= 0 ||
+            args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+               return -EINVAL;
+
+       LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+       if (name == NULL)
+               return rc;
+
+       LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+       if (srcgrp == NULL)
+               goto out;
+
+       LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+       if (dstgrp == NULL)
+               goto out;
+
+       if (args->lstio_tes_param != NULL) {
+               LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+               if (param == NULL)
+                       goto out;
+       }
+
+       rc = -EFAULT;
+       if (copy_from_user(name,
+                             args->lstio_tes_bat_name,
+                             args->lstio_tes_bat_nmlen) ||
+           copy_from_user(srcgrp,
+                             args->lstio_tes_sgrp_name,
+                             args->lstio_tes_sgrp_nmlen) ||
+           copy_from_user(dstgrp,
+                             args->lstio_tes_dgrp_name,
+                             args->lstio_tes_dgrp_nmlen) ||
+           (args->lstio_tes_param != NULL &&
+            copy_from_user(param, args->lstio_tes_param,
+                              args->lstio_tes_param_len)))
+               goto out;
+
+       rc = lstcon_test_add(name,
+                           args->lstio_tes_type,
+                           args->lstio_tes_loop,
+                           args->lstio_tes_concur,
+                           args->lstio_tes_dist, args->lstio_tes_span,
+                           srcgrp, dstgrp, param, args->lstio_tes_param_len,
+                           &ret, args->lstio_tes_resultp);
+
+       if (ret != 0)
+               rc = (copy_to_user(args->lstio_tes_retp, &ret,
+                                      sizeof(ret))) ? -EFAULT : 0;
+out:
+       if (name != NULL)
+               LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+       if (srcgrp != NULL)
+               LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+       if (dstgrp != NULL)
+               LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+       if (param != NULL)
+               LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+       return rc;
+}
+
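+/* Single entry point for all LNet selftest console ioctls: the opcode in
+ * ioc_u32[0] selects one of the lst_*_ioctl() handlers above.  The whole
+ * argument block (at most one page) is copied in up front, and the
+ * per-transaction statistics are copied back out on the way back. */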
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+       char   *buf;
+       int     opc = data->ioc_u32[0];
+       int     rc;
+
+       if (cmd != IOC_LIBCFS_LNETST)
+               return -EINVAL;
+
+       if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+               return -EINVAL;
+
+       LIBCFS_ALLOC(buf, data->ioc_plen1);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       /* copy in parameter */
+       if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) {
+               LIBCFS_FREE(buf, data->ioc_plen1);
+               return -EFAULT;
+       }
+
+       mutex_lock(&console_session.ses_mutex);
+
+       console_session.ses_laststamp = cfs_time_current_sec();
+
+       if (console_session.ses_shutdown) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
+
+       if (console_session.ses_expired)
+               lstcon_session_end();
+
+       if (opc != LSTIO_SESSION_NEW &&
+           console_session.ses_state == LST_SESSION_NONE) {
+               CDEBUG(D_NET, "LST no active session\n");
+               rc = -ESRCH;
+               goto out;
+       }
+
+       memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+       switch (opc) {
+       case LSTIO_SESSION_NEW:
+               rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf);
+               break;
+       case LSTIO_SESSION_END:
+               rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf);
+               break;
+       case LSTIO_SESSION_INFO:
+               rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf);
+               break;
+       case LSTIO_DEBUG:
+               rc = lst_debug_ioctl((lstio_debug_args_t *)buf);
+               break;
+       case LSTIO_GROUP_ADD:
+               rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf);
+               break;
+       case LSTIO_GROUP_DEL:
+               rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf);
+               break;
+       case LSTIO_GROUP_UPDATE:
+               rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf);
+               break;
+       case LSTIO_NODES_ADD:
+               rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf);
+               break;
+       case LSTIO_GROUP_LIST:
+               rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf);
+               break;
+       case LSTIO_GROUP_INFO:
+               rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf);
+               break;
+       case LSTIO_BATCH_ADD:
+               rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf);
+               break;
+       case LSTIO_BATCH_START:
+               rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf);
+               break;
+       case LSTIO_BATCH_STOP:
+               rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf);
+               break;
+       case LSTIO_BATCH_QUERY:
+               rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf);
+               break;
+       case LSTIO_BATCH_LIST:
+               rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf);
+               break;
+       case LSTIO_BATCH_INFO:
+               rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf);
+               break;
+       case LSTIO_TEST_ADD:
+               rc = lst_test_add_ioctl((lstio_test_args_t *)buf);
+               break;
+       case LSTIO_STAT_QUERY:
+               rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf);
+               break;
+       default:
+               rc = -EINVAL;
+       }
+
+       if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+                            sizeof(lstcon_trans_stat_t)))
+               rc = -EFAULT;
+out:
+       mutex_unlock(&console_session.ses_mutex);
+
+       LIBCFS_FREE(buf, data->ioc_plen1);
+
+       return rc;
+}
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644 (file)
index 0000000..446de0e
--- /dev/null
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+                          lstcon_node_t *, lstcon_trans_stat_t *);
+
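+/* Completion callback for a console RPC.  An orphaned RPC (one whose
+ * transaction is already gone) is simply released; otherwise the RPC is
+ * stamped and, if it is the last outstanding RPC of its transaction,
+ * the transaction waiter is woken up. */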
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+       lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+       LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+       LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+       spin_lock(&rpc->crpc_lock);
+
+       if (crpc->crp_trans == NULL) {
+               /* Orphan RPC is not in any transaction,
+                * I'm just a poor body and nobody loves me */
+               spin_unlock(&rpc->crpc_lock);
+
+               /* release it */
+               lstcon_rpc_put(crpc);
+               return;
+       }
+
+       /* not an orphan RPC */
+       crpc->crp_finished = 1;
+
+       if (crpc->crp_stamp == 0) {
+               /* not aborted */
+               LASSERT (crpc->crp_status == 0);
+
+               crpc->crp_stamp  = cfs_time_current();
+               crpc->crp_status = rpc->crpc_status;
+       }
+
+       /* wake up the transaction thread if I'm the last RPC in the transaction */
+       if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+               wake_up(&crpc->crp_trans->tas_waitq);
+
+       spin_unlock(&rpc->crpc_lock);
+}
+
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+               int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+       crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+                                      feats, bulk_npg, bulk_len,
+                                      lstcon_rpc_done, (void *)crpc);
+       if (crpc->crp_rpc == NULL)
+               return -ENOMEM;
+
+       crpc->crp_trans    = NULL;
+       crpc->crp_node     = nd;
+       crpc->crp_posted   = 0;
+       crpc->crp_finished = 0;
+       crpc->crp_unpacked = 0;
+       crpc->crp_status   = 0;
+       crpc->crp_stamp    = 0;
+       crpc->crp_embedded = embedded;
+       INIT_LIST_HEAD(&crpc->crp_link);
+
+       atomic_inc(&console_session.ses_rpc_counter);
+
+       return 0;
+}
+
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+               int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+       lstcon_rpc_t  *crpc = NULL;
+       int         rc;
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       if (!list_empty(&console_session.ses_rpc_freelist)) {
+               crpc = list_entry(console_session.ses_rpc_freelist.next,
+                                     lstcon_rpc_t, crp_link);
+               list_del_init(&crpc->crp_link);
+       }
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       if (crpc == NULL) {
+               LIBCFS_ALLOC(crpc, sizeof(*crpc));
+               if (crpc == NULL)
+                       return -ENOMEM;
+       }
+
+       rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+       if (rc == 0) {
+               *crpcpp = crpc;
+               return 0;
+       }
+
+       LIBCFS_FREE(crpc, sizeof(*crpc));
+
+       return rc;
+}
+
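+/* Release a console RPC: free its bulk pages, drop the client RPC
+ * reference, and either zero it in place (embedded RPCs, such as the
+ * per-node ping RPC, live inside another structure and must not be
+ * freed) or park it on the session free list for lstcon_rpc_prep(). */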
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+       srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+       int       i;
+
+       LASSERT (list_empty(&crpc->crp_link));
+
+       for (i = 0; i < bulk->bk_niov; i++) {
+               if (bulk->bk_iovs[i].kiov_page == NULL)
+                       continue;
+
+               __free_page(bulk->bk_iovs[i].kiov_page);
+       }
+
+       srpc_client_rpc_decref(crpc->crp_rpc);
+
+       if (crpc->crp_embedded) {
+               /* embedded RPC, don't recycle it */
+               memset(crpc, 0, sizeof(*crpc));
+               crpc->crp_embedded = 1;
+
+       } else {
+               spin_lock(&console_session.ses_rpc_lock);
+
+               list_add(&crpc->crp_link,
+                            &console_session.ses_rpc_freelist);
+
+               spin_unlock(&console_session.ses_rpc_lock);
+       }
+
+       /* RPC is not alive now */
+       atomic_dec(&console_session.ses_rpc_counter);
+}
+
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+       lstcon_rpc_trans_t *trans = crpc->crp_trans;
+
+       LASSERT (trans != NULL);
+
+       atomic_inc(&trans->tas_remaining);
+       crpc->crp_posted = 1;
+
+       sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+       if (transop == LST_TRANS_SESNEW)
+               return "SESNEW";
+
+       if (transop == LST_TRANS_SESEND)
+               return "SESEND";
+
+       if (transop == LST_TRANS_SESQRY)
+               return "SESQRY";
+
+       if (transop == LST_TRANS_SESPING)
+               return "SESPING";
+
+       if (transop == LST_TRANS_TSBCLIADD)
+               return "TSBCLIADD";
+
+       if (transop == LST_TRANS_TSBSRVADD)
+               return "TSBSRVADD";
+
+       if (transop == LST_TRANS_TSBRUN)
+               return "TSBRUN";
+
+       if (transop == LST_TRANS_TSBSTOP)
+               return "TSBSTOP";
+
+       if (transop == LST_TRANS_TSBCLIQRY)
+               return "TSBCLIQRY";
+
+       if (transop == LST_TRANS_TSBSRVQRY)
+               return "TSBSRVQRY";
+
+       if (transop == LST_TRANS_STATQRY)
+               return "STATQRY";
+
+       return "Unknown";
+}
+
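+/* Create a transaction (a group of RPCs sharing one operation code).
+ * A private transaction may be chained on an owner list via tas_olink;
+ * at most one private transaction is allowed per object. */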
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+                     int transop, lstcon_rpc_trans_t **transpp)
+{
+       lstcon_rpc_trans_t *trans;
+
+       if (translist != NULL) {
+               list_for_each_entry(trans, translist, tas_link) {
+                       /* Can't enqueue two private transactions on
+                        * the same object */
+                       if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+                               return -EPERM;
+               }
+       }
+
+       /* create a trans group */
+       LIBCFS_ALLOC(trans, sizeof(*trans));
+       if (trans == NULL)
+               return -ENOMEM;
+
+       trans->tas_opc = transop;
+
+       if (translist == NULL)
+               INIT_LIST_HEAD(&trans->tas_olink);
+       else
+               list_add_tail(&trans->tas_olink, translist);
+
+       list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+       INIT_LIST_HEAD(&trans->tas_rpcs_list);
+       atomic_set(&trans->tas_remaining, 0);
+       init_waitqueue_head(&trans->tas_waitq);
+
+       spin_lock(&console_session.ses_rpc_lock);
+       trans->tas_features = console_session.ses_features;
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       *transpp = trans;
+       return 0;
+}
+
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+       list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+       crpc->crp_trans = trans;
+}
+
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+       srpc_client_rpc_t *rpc;
+       lstcon_rpc_t      *crpc;
+       lstcon_node_t     *nd;
+
+       list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+               rpc = crpc->crp_rpc;
+
+               spin_lock(&rpc->crpc_lock);
+
+               if (!crpc->crp_posted || /* not posted */
+                   crpc->crp_stamp != 0) { /* rpc done or aborted already */
+                       if (crpc->crp_stamp == 0) {
+                               crpc->crp_stamp = cfs_time_current();
+                               crpc->crp_status = -EINTR;
+                       }
+                       spin_unlock(&rpc->crpc_lock);
+                       continue;
+               }
+
+               crpc->crp_stamp  = cfs_time_current();
+               crpc->crp_status = error;
+
+               spin_unlock(&rpc->crpc_lock);
+
+               sfw_abort_rpc(rpc);
+
+               if (error != -ETIMEDOUT)
+                       continue;
+
+               nd = crpc->crp_node;
+               if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+                       continue;
+
+               nd->nd_stamp = crpc->crp_stamp;
+               nd->nd_state = LST_NODE_DOWN;
+       }
+}
+
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+       if (console_session.ses_shutdown &&
+           !list_empty(&trans->tas_olink)) /* Not an end session RPC */
+               return 1;
+
+       return (atomic_read(&trans->tas_remaining) == 0) ? 1 : 0;
+}
+
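+/* Post every RPC of the transaction and wait (interruptibly, with a
+ * timeout of at least LST_TRANS_MIN_TIMEOUT seconds) for all of them to
+ * complete.  The session mutex is dropped across the wait; any RPC still
+ * outstanding afterwards is aborted. */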
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+       lstcon_rpc_t  *crpc;
+       int         rc;
+
+       if (list_empty(&trans->tas_rpcs_list))
+               return 0;
+
+       if (timeout < LST_TRANS_MIN_TIMEOUT)
+               timeout = LST_TRANS_MIN_TIMEOUT;
+
+       CDEBUG(D_NET, "Transaction %s started\n",
+              lstcon_rpc_trans_name(trans->tas_opc));
+
+       /* post all requests */
+       list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+               LASSERT (!crpc->crp_posted);
+
+               lstcon_rpc_post(crpc);
+       }
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       rc = wait_event_interruptible_timeout(trans->tas_waitq,
+                                             lstcon_rpc_trans_check(trans),
+                                             cfs_time_seconds(timeout));
+       rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       if (console_session.ses_shutdown)
+               rc = -ESHUTDOWN;
+
+       if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+               /* treat short timeout as canceled */
+               if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+                       rc = -EINTR;
+
+               lstcon_rpc_trans_abort(trans, rc);
+       }
+
+       CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+              lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+       lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+       return rc;
+}
+
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+       lstcon_node_t   *nd  = crpc->crp_node;
+       srpc_client_rpc_t    *rpc = crpc->crp_rpc;
+       srpc_generic_reply_t *rep;
+
+       LASSERT (nd != NULL && rpc != NULL);
+       LASSERT (crpc->crp_stamp != 0);
+
+       if (crpc->crp_status != 0) {
+               *msgpp = NULL;
+               return crpc->crp_status;
+       }
+
+       *msgpp = &rpc->crpc_replymsg;
+       if (!crpc->crp_unpacked) {
+               sfw_unpack_message(*msgpp);
+               crpc->crp_unpacked = 1;
+       }
+
+       if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+               return 0;
+
+       nd->nd_stamp = crpc->crp_stamp;
+       rep = &(*msgpp)->msg_body.reply;
+
+       if (rep->sid.ses_nid == LNET_NID_ANY)
+               nd->nd_state = LST_NODE_UNKNOWN;
+       else if (lstcon_session_match(rep->sid))
+               nd->nd_state = LST_NODE_ACTIVE;
+       else
+               nd->nd_state = LST_NODE_BUSY;
+
+       return 0;
+}
+
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+       lstcon_rpc_t      *crpc;
+       srpc_msg_t      *rep;
+       int             error;
+
+       LASSERT (stat != NULL);
+
+       memset(stat, 0, sizeof(*stat));
+
+       list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+               lstcon_rpc_stat_total(stat, 1);
+
+               LASSERT (crpc->crp_stamp != 0);
+
+               error = lstcon_rpc_get_reply(crpc, &rep);
+               if (error != 0) {
+                       lstcon_rpc_stat_failure(stat, 1);
+                       if (stat->trs_rpc_errno == 0)
+                               stat->trs_rpc_errno = -error;
+
+                       continue;
+               }
+
+               lstcon_rpc_stat_success(stat, 1);
+
+               lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+       }
+
+       if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+               stat->trs_fwk_errno =
+                     lstcon_session_feats_check(trans->tas_features);
+       }
+
+       CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+                     "RPC error(%d), Framework error(%d)\n",
+              lstcon_rpc_trans_name(trans->tas_opc),
+              lstcon_rpc_stat_success(stat, 0),
+              lstcon_rpc_stat_failure(stat, 0),
+              lstcon_rpc_stat_total(stat, 0),
+              stat->trs_rpc_errno, stat->trs_fwk_errno);
+
+       return;
+}
+
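+/* Walk the user-space list of lstcon_rpc_ent_t entries in lock step with
+ * the RPCs of the transaction, copying peer id, timestamp, node state and
+ * error codes out to each entry; the optional readent callback decodes
+ * the operation-specific part of each reply. */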
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+                            struct list_head *head_up,
+                            lstcon_rpc_readent_func_t readent)
+{
+       struct list_head            tmp;
+       struct list_head           *next;
+       lstcon_rpc_ent_t     *ent;
+       srpc_generic_reply_t *rep;
+       lstcon_rpc_t     *crpc;
+       srpc_msg_t         *msg;
+       lstcon_node_t   *nd;
+       cfs_duration_t  dur;
+       struct timeval  tv;
+       int                error;
+
+       LASSERT (head_up != NULL);
+
+       next = head_up;
+
+       list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+               if (copy_from_user(&tmp, next,
+                                      sizeof(struct list_head)))
+                       return -EFAULT;
+
+               if (tmp.next == head_up)
+                       return 0;
+
+               next = tmp.next;
+
+               ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+               LASSERT (crpc->crp_stamp != 0);
+
+               error = lstcon_rpc_get_reply(crpc, &msg);
+
+               nd = crpc->crp_node;
+
+               dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+                     (cfs_time_t)console_session.ses_id.ses_stamp);
+               cfs_duration_usec(dur, &tv);
+
+               if (copy_to_user(&ent->rpe_peer,
+                                    &nd->nd_id, sizeof(lnet_process_id_t)) ||
+                   copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+                   copy_to_user(&ent->rpe_state,
+                                    &nd->nd_state, sizeof(nd->nd_state)) ||
+                   copy_to_user(&ent->rpe_rpc_errno, &error,
+                                    sizeof(error)))
+                       return -EFAULT;
+
+               if (error != 0)
+                       continue;
+
+               /* RPC is done */
+               rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+               if (copy_to_user(&ent->rpe_sid,
+                                    &rep->sid, sizeof(lst_sid_t)) ||
+                   copy_to_user(&ent->rpe_fwk_errno,
+                                    &rep->status, sizeof(rep->status)))
+                       return -EFAULT;
+
+               if (readent == NULL)
+                       continue;
+
+               error = readent(trans->tas_opc, msg, ent);
+               if (error != 0)
+                       return error;
+       }
+
+       return 0;
+}
+
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+       srpc_client_rpc_t *rpc;
+       lstcon_rpc_t      *crpc;
+       lstcon_rpc_t      *tmp;
+       int             count = 0;
+
+       list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+                                crp_link) {
+               rpc = crpc->crp_rpc;
+
+               spin_lock(&rpc->crpc_lock);
+
+               /* free it if not posted or finished already */
+               if (!crpc->crp_posted || crpc->crp_finished) {
+                       spin_unlock(&rpc->crpc_lock);
+
+                       list_del_init(&crpc->crp_link);
+                       lstcon_rpc_put(crpc);
+
+                       continue;
+               }
+
+               /* RPCs may still be uncompleted (even after LNetMDUnlink is
+                * called) because of the huge timeout on an inaccessible
+                * network; don't make the user wait for them, just abandon
+                * them, they will be recycled in the done callback */
+
+               LASSERT (crpc->crp_status != 0);
+
+               crpc->crp_node  = NULL;
+               crpc->crp_trans = NULL;
+               list_del_init(&crpc->crp_link);
+               count++;
+
+               spin_unlock(&rpc->crpc_lock);
+
+               atomic_dec(&trans->tas_remaining);
+       }
+
+       LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+       list_del(&trans->tas_link);
+       if (!list_empty(&trans->tas_olink))
+               list_del(&trans->tas_olink);
+
+       CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+              lstcon_rpc_trans_name(trans->tas_opc), count);
+
+       LIBCFS_FREE(trans, sizeof(*trans));
+
+       return;
+}
+
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+                  unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_mksn_reqst_t *msrq;
+       srpc_rmsn_reqst_t *rsrq;
+       int             rc;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+               rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+                                    feats, 0, 0, crpc);
+               if (rc != 0)
+                       return rc;
+
+               msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+               msrq->mksn_sid     = console_session.ses_id;
+               msrq->mksn_force   = console_session.ses_force;
+               strncpy(msrq->mksn_name, console_session.ses_name,
+                       strlen(console_session.ses_name));
+               break;
+
+       case LST_TRANS_SESEND:
+               rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+                                    feats, 0, 0, crpc);
+               if (rc != 0)
+                       return rc;
+
+               rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+               rsrq->rmsn_sid = console_session.ses_id;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       return 0;
+}
+
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_debug_reqst_t *drq;
+       int                 rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+       drq->dbg_sid   = console_session.ses_id;
+       drq->dbg_flags = 0;
+
+       return rc;
+}
+
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+                  lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+       lstcon_batch_t     *batch;
+       srpc_batch_reqst_t *brq;
+       int                 rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+       brq->bar_sid     = console_session.ses_id;
+       brq->bar_bid     = tsb->tsb_id;
+       brq->bar_testidx = tsb->tsb_index;
+       brq->bar_opc     = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN :
+                          (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP :
+                           SRPC_BATCH_OPC_QUERY);
+
+       if (transop != LST_TRANS_TSBRUN &&
+           transop != LST_TRANS_TSBSTOP)
+               return 0;
+
+       LASSERT (tsb->tsb_index == 0);
+
+       batch = (lstcon_batch_t *)tsb;
+       brq->bar_arg = batch->bat_arg;
+
+       return 0;
+}
+
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+       srpc_stat_reqst_t *srq;
+       int                rc;
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+       if (rc != 0)
+               return rc;
+
+       srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+
+       srq->str_sid  = console_session.ses_id;
+       srq->str_type = 0; /* XXX remove it */
+
+       return 0;
+}
+
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+       lnet_process_id_packed_t *pid;
+       int                    i;
+
+       i = idx / SFW_ID_PER_PAGE;
+
+       LASSERT (i < nkiov);
+
+       pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+       return &pid[idx % SFW_ID_PER_PAGE];
+}
+
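+/* Fill the bulk pages with the destination ids for one client node.
+ * Clients are grouped in runs of 'dist'; each run targets a window of
+ * 'span' consecutive nodes of the destination group, wrapping around
+ * the group if needed.  E.g. with dist = 2, span = 3 and a 5-node
+ * group, clients 0-1 get nodes {0,1,2} and clients 2-3 get {3,4,0}. */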
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+                    int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+       lnet_process_id_packed_t *pid;
+       lstcon_ndlink_t   *ndl;
+       lstcon_node_t       *nd;
+       int                    start;
+       int                    end;
+       int                    i = 0;
+
+       LASSERT (dist >= 1);
+       LASSERT (span >= 1);
+       LASSERT (grp->grp_nnode >= 1);
+
+       if (span > grp->grp_nnode)
+               return -EINVAL;
+
+       start = ((idx / dist) * span) % grp->grp_nnode;
+       end   = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+               nd = ndl->ndl_node;
+               if (i < start) {
+                       i++;
+                       continue;
+               }
+
+               if (i > (end >= start ? end : grp->grp_nnode))
+                       break;
+
+               pid = lstcon_next_id((i - start), nkiov, kiov);
+               pid->nid = nd->nd_id.nid;
+               pid->pid = nd->nd_id.pid;
+               i++;
+       }
+
+       if (start <= end) /* done */
+               return 0;
+
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+               if (i > grp->grp_nnode + end)
+                       break;
+
+               nd = ndl->ndl_node;
+               pid = lstcon_next_id((i - start), nkiov, kiov);
+               pid->nid = nd->nd_id.nid;
+               pid->pid = nd->nd_id.pid;
+               i++;
+       }
+
+       return 0;
+}
+
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+       test_ping_req_t *prq = &req->tsr_u.ping;
+
+       prq->png_size   = param->png_size;
+       prq->png_flags  = param->png_flags;
+       /* TODO dest */
+       return 0;
+}
+
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+       test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
+
+       brq->blk_opc    = param->blk_opc;
+       brq->blk_npg    = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+       brq->blk_flags  = param->blk_flags;
+
+       return 0;
+}
+
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+       test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1;
+
+       brq->blk_opc    = param->blk_opc;
+       brq->blk_flags  = param->blk_flags;
+       brq->blk_len    = param->blk_size;
+       brq->blk_offset = 0; /* reserved */
+
+       return 0;
+}
+
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+                   lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+       lstcon_group_t    *sgrp = test->tes_src_grp;
+       lstcon_group_t    *dgrp = test->tes_dst_grp;
+       srpc_test_reqst_t *trq;
+       srpc_bulk_t       *bulk;
+       int             i;
+       int                npg = 0;
+       int                nob = 0;
+       int                rc  = 0;
+
+       if (transop == LST_TRANS_TSBCLIADD) {
+               npg = sfw_id_pages(test->tes_span);
+               nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+                     npg * PAGE_CACHE_SIZE :
+                     sizeof(lnet_process_id_packed_t) * test->tes_span;
+       }
+
+       rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+       if (rc != 0)
+               return rc;
+
+       trq  = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+       if (transop == LST_TRANS_TSBSRVADD) {
+               int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+               int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+               int nmax = (ndist + nspan - 1) / nspan;
+
+               trq->tsr_ndest = 0;
+               trq->tsr_loop  = nmax * test->tes_dist * test->tes_concur;
+
+       } else {
+               bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+               for (i = 0; i < npg; i++) {
+                       int     len;
+
+                       LASSERT(nob > 0);
+
+                       len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+                             PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+                       nob -= len;
+
+                       bulk->bk_iovs[i].kiov_offset = 0;
+                       bulk->bk_iovs[i].kiov_len    = len;
+                       bulk->bk_iovs[i].kiov_page   =
+                               alloc_page(GFP_IOFS);
+
+                       if (bulk->bk_iovs[i].kiov_page == NULL) {
+                               lstcon_rpc_put(*crpc);
+                               return -ENOMEM;
+                       }
+               }
+
+               bulk->bk_sink = 0;
+
+               LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+               rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+                                         test->tes_cliidx++,
+                                         test->tes_dist,
+                                         test->tes_span,
+                                         npg, &bulk->bk_iovs[0]);
+               if (rc != 0) {
+                       lstcon_rpc_put(*crpc);
+                       return rc;
+               }
+
+               trq->tsr_ndest = test->tes_span;
+               trq->tsr_loop  = test->tes_loop;
+       }
+
+       trq->tsr_sid    = console_session.ses_id;
+       trq->tsr_bid    = test->tes_hdr.tsb_id;
+       trq->tsr_concur     = test->tes_concur;
+       trq->tsr_is_client  = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+       trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+       switch (test->tes_type) {
+       case LST_TEST_PING:
+               trq->tsr_service = SRPC_SERVICE_PING;
+               rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+                                        &test->tes_param[0], trq);
+               break;
+
+       case LST_TEST_BULK:
+               trq->tsr_service = SRPC_SERVICE_BRW;
+               if ((feats & LST_FEAT_BULK_LEN) == 0) {
+                       rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+                                                   &test->tes_param[0], trq);
+               } else {
+                       rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+                                                   &test->tes_param[0], trq);
+               }
+
+               break;
+       default:
+               LBUG();
+               break;
+       }
+
+       return rc;
+}
+
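+/* Digest a MAKE_SESSION reply: reject replies advertising features outside
+ * LST_FEATS_MASK, latch the feature bits of the first good reply into the
+ * transaction, and treat any later reply with different features as a
+ * protocol error (EPROTO). */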
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+                        lstcon_node_t *nd, srpc_msg_t *reply)
+{
+       srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+       int                status   = mksn_rep->mksn_status;
+
+       if (status == 0 &&
+           (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               mksn_rep->mksn_status = EPROTO;
+               status = EPROTO;
+       }
+
+       if (status == EPROTO) {
+               CNETERR("session protocol error from %s: %u\n",
+                       libcfs_nid2str(nd->nd_id.nid),
+                       reply->msg_ses_feats);
+       }
+
+       if (status != 0)
+               return status;
+
+       if (!trans->tas_feats_updated) {
+               trans->tas_feats_updated = 1;
+               trans->tas_features = reply->msg_ses_feats;
+       }
+
+       if (reply->msg_ses_feats != trans->tas_features) {
+               CNETERR("Framework features %x from %s is different with "
+                       "features on this transaction: %x\n",
+                        reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+                        trans->tas_features);
+               status = mksn_rep->mksn_status = EPROTO;
+       }
+
+       if (status == 0) {
+               /* session timeout on remote node */
+               nd->nd_timeout = mksn_rep->mksn_timeout;
+       }
+
+       return status;
+}
+
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+                     lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+       srpc_rmsn_reply_t  *rmsn_rep;
+       srpc_debug_reply_t *dbg_rep;
+       srpc_batch_reply_t *bat_rep;
+       srpc_test_reply_t  *test_rep;
+       srpc_stat_reply_t  *stat_rep;
+       int              rc = 0;
+
+       switch (trans->tas_opc) {
+       case LST_TRANS_SESNEW:
+               rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+               if (rc == 0) {
+                       lstcon_sesop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_sesop_stat_failure(stat, 1);
+               break;
+
+       case LST_TRANS_SESEND:
+               rmsn_rep = &msg->msg_body.rmsn_reply;
+               /* ESRCH is not an error for end session */
+               if (rmsn_rep->rmsn_status == 0 ||
+                   rmsn_rep->rmsn_status == ESRCH) {
+                       lstcon_sesop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_sesop_stat_failure(stat, 1);
+               rc = rmsn_rep->rmsn_status;
+               break;
+
+       case LST_TRANS_SESQRY:
+       case LST_TRANS_SESPING:
+               dbg_rep = &msg->msg_body.dbg_reply;
+
+               if (dbg_rep->dbg_status == ESRCH) {
+                       lstcon_sesqry_stat_unknown(stat, 1);
+                       return;
+               }
+
+               if (lstcon_session_match(dbg_rep->dbg_sid))
+                       lstcon_sesqry_stat_active(stat, 1);
+               else
+                       lstcon_sesqry_stat_busy(stat, 1);
+               return;
+
+       case LST_TRANS_TSBRUN:
+       case LST_TRANS_TSBSTOP:
+               bat_rep = &msg->msg_body.bat_reply;
+
+               if (bat_rep->bar_status == 0) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               if (bat_rep->bar_status == EPERM &&
+                   trans->tas_opc == LST_TRANS_TSBSTOP) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_tsbop_stat_failure(stat, 1);
+               rc = bat_rep->bar_status;
+               break;
+
+       case LST_TRANS_TSBCLIQRY:
+       case LST_TRANS_TSBSRVQRY:
+               bat_rep = &msg->msg_body.bat_reply;
+
+               if (bat_rep->bar_active != 0)
+                       lstcon_tsbqry_stat_run(stat, 1);
+               else
+                       lstcon_tsbqry_stat_idle(stat, 1);
+
+               if (bat_rep->bar_status == 0)
+                       return;
+
+               lstcon_tsbqry_stat_failure(stat, 1);
+               rc = bat_rep->bar_status;
+               break;
+
+       case LST_TRANS_TSBCLIADD:
+       case LST_TRANS_TSBSRVADD:
+               test_rep = &msg->msg_body.tes_reply;
+
+               if (test_rep->tsr_status == 0) {
+                       lstcon_tsbop_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_tsbop_stat_failure(stat, 1);
+               rc = test_rep->tsr_status;
+               break;
+
+       case LST_TRANS_STATQRY:
+               stat_rep = &msg->msg_body.stat_reply;
+
+               if (stat_rep->str_status == 0) {
+                       lstcon_statqry_stat_success(stat, 1);
+                       return;
+               }
+
+               lstcon_statqry_stat_failure(stat, 1);
+               rc = stat_rep->str_status;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       if (stat->trs_fwk_errno == 0)
+               stat->trs_fwk_errno = rc;
+
+       return;
+}
+
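+/* Build a transaction covering a list of nodes: for each node that passes
+ * the optional condition callback, prepare the RPC matching the transaction
+ * opcode and queue it on the transaction.  On any failure the half-built
+ * transaction is destroyed. */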
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+                       struct list_head *translist, int transop,
+                       void *arg, lstcon_rpc_cond_func_t condition,
+                       lstcon_rpc_trans_t **transpp)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_ndlink_t    *ndl;
+       lstcon_node_t      *nd;
+       lstcon_rpc_t       *rpc;
+       unsigned            feats;
+       int              rc;
+
+       /* Create an RPC transaction for a list of nodes */
+
+       rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction %d: %d\n", transop, rc);
+               return rc;
+       }
+
+       feats = trans->tas_features;
+       list_for_each_entry(ndl, ndlist, ndl_link) {
+               rc = condition == NULL ? 1 :
+                    condition(transop, ndl->ndl_node, arg);
+
+               if (rc == 0)
+                       continue;
+
+               if (rc < 0) {
+                       CDEBUG(D_NET, "Condition error while creating RPC "
+                                     " for transaction %d: %d\n", transop, rc);
+                       break;
+               }
+
+               nd = ndl->ndl_node;
+
+               switch (transop) {
+               case LST_TRANS_SESNEW:
+               case LST_TRANS_SESEND:
+                       rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+                       break;
+               case LST_TRANS_SESQRY:
+               case LST_TRANS_SESPING:
+                       rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+                       break;
+               case LST_TRANS_TSBCLIADD:
+               case LST_TRANS_TSBSRVADD:
+                       rc = lstcon_testrpc_prep(nd, transop, feats,
+                                                (lstcon_test_t *)arg, &rpc);
+                       break;
+               case LST_TRANS_TSBRUN:
+               case LST_TRANS_TSBSTOP:
+               case LST_TRANS_TSBCLIQRY:
+               case LST_TRANS_TSBSRVQRY:
+                       rc = lstcon_batrpc_prep(nd, transop, feats,
+                                               (lstcon_tsb_hdr_t *)arg, &rpc);
+                       break;
+               case LST_TRANS_STATQRY:
+                       rc = lstcon_statrpc_prep(nd, feats, &rpc);
+                       break;
+               default:
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (rc != 0) {
+                       CERROR("Failed to create RPC for transaction %s: %d\n",
+                              lstcon_rpc_trans_name(transop), rc);
+                       break;
+               }
+
+               lstcon_rpc_trans_addreq(trans, rpc);
+       }
+
+       if (rc == 0) {
+               *transpp = trans;
+               return 0;
+       }
+
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+void
+lstcon_rpc_pinger(void *arg)
+{
+       stt_timer_t     *ptimer = (stt_timer_t *)arg;
+       lstcon_rpc_trans_t *trans;
+       lstcon_rpc_t       *crpc;
+       srpc_msg_t       *rep;
+       srpc_debug_reqst_t *drq;
+       lstcon_ndlink_t    *ndl;
+       lstcon_node_t      *nd;
+       time_t        intv;
+       int              count = 0;
+       int              rc;
+
+       /* The RPC pinger is a special case of transaction; it is
+        * called by a timer every LST_PING_INTERVAL (8) seconds.
+        */
+       mutex_lock(&console_session.ses_mutex);
+
+       if (console_session.ses_shutdown || console_session.ses_expired) {
+               mutex_unlock(&console_session.ses_mutex);
+               return;
+       }
+
+       if (!console_session.ses_expired &&
+           cfs_time_current_sec() - console_session.ses_laststamp >
+           (time_t)console_session.ses_timeout)
+               console_session.ses_expired = 1;
+
+       trans = console_session.ses_ping;
+
+       LASSERT (trans != NULL);
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+               nd = ndl->ndl_node;
+
+               if (console_session.ses_expired) {
+                       /* idle console, end session on all nodes */
+                       if (nd->nd_state != LST_NODE_ACTIVE)
+                               continue;
+
+                       rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+                                               trans->tas_features, &crpc);
+                       if (rc != 0) {
+                               CERROR("Out of memory\n");
+                               break;
+                       }
+
+                       lstcon_rpc_trans_addreq(trans, crpc);
+                       lstcon_rpc_post(crpc);
+
+                       continue;
+               }
+
+               crpc = &nd->nd_ping;
+
+               if (crpc->crp_rpc != NULL) {
+                       LASSERT (crpc->crp_trans == trans);
+                       LASSERT (!list_empty(&crpc->crp_link));
+
+                       spin_lock(&crpc->crp_rpc->crpc_lock);
+
+                       LASSERT(crpc->crp_posted);
+
+                       if (!crpc->crp_finished) {
+                               /* in flight */
+                               spin_unlock(&crpc->crp_rpc->crpc_lock);
+                               continue;
+                       }
+
+                       spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+                       lstcon_rpc_get_reply(crpc, &rep);
+
+                       list_del_init(&crpc->crp_link);
+
+                       lstcon_rpc_put(crpc);
+               }
+
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       continue;
+
+               intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+                                                    nd->nd_stamp));
+               if (intv < (time_t)nd->nd_timeout / 2)
+                       continue;
+
+               rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+                                    trans->tas_features, 0, 0, 1, crpc);
+               if (rc != 0) {
+                       CERROR("Out of memory\n");
+                       break;
+               }
+
+               drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+               drq->dbg_sid   = console_session.ses_id;
+               drq->dbg_flags = 0;
+
+               lstcon_rpc_trans_addreq(trans, crpc);
+               lstcon_rpc_post(crpc);
+
+               count++;
+       }
+
+       if (console_session.ses_expired) {
+               mutex_unlock(&console_session.ses_mutex);
+               return;
+       }
+
+       CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+       ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+       stt_add_timer(ptimer);
+
+       mutex_unlock(&console_session.ses_mutex);
+}
+
+int
+lstcon_rpc_pinger_start(void)
+{
+       stt_timer_t    *ptimer;
+       int          rc;
+
+       LASSERT (list_empty(&console_session.ses_rpc_freelist));
+       LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+
+       rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+                                  &console_session.ses_ping);
+       if (rc != 0) {
+               CERROR("Failed to create console pinger\n");
+               return rc;
+       }
+
+       ptimer = &console_session.ses_ping_timer;
+       ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+
+       stt_add_timer(ptimer);
+
+       return 0;
+}
+
+void
+lstcon_rpc_pinger_stop(void)
+{
+       LASSERT (console_session.ses_shutdown);
+
+       stt_del_timer(&console_session.ses_ping_timer);
+
+       lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN);
+       lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat());
+       lstcon_rpc_trans_destroy(console_session.ses_ping);
+
+       memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+       console_session.ses_ping = NULL;
+}
+
+void
+lstcon_rpc_cleanup_wait(void)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_rpc_t       *crpc;
+       struct list_head         *pacer;
+       struct list_head          zlist;
+
+       /* Called with the global mutex held */
+
+       LASSERT (console_session.ses_shutdown);
+
+       while (!list_empty(&console_session.ses_trans_list)) {
+               list_for_each(pacer, &console_session.ses_trans_list) {
+                       trans = list_entry(pacer, lstcon_rpc_trans_t,
+                                              tas_link);
+
+                       CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+                              lstcon_rpc_trans_name(trans->tas_opc));
+
+                       wake_up(&trans->tas_waitq);
+               }
+
+               mutex_unlock(&console_session.ses_mutex);
+
+               CWARN("Session is shutting down, "
+                     "waiting for termination of transactions\n");
+               cfs_pause(cfs_time_seconds(1));
+
+               mutex_lock(&console_session.ses_mutex);
+       }
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+                      console_session.ses_rpc_lock,
+                      "Network is not accessable or target is down, "
+                      "waiting for %d console RPCs to being recycled\n",
+                      atomic_read(&console_session.ses_rpc_counter));
+
+       list_add(&zlist, &console_session.ses_rpc_freelist);
+       list_del_init(&console_session.ses_rpc_freelist);
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       while (!list_empty(&zlist)) {
+               crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+               list_del(&crpc->crp_link);
+               LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+       }
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+       INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+       console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+       console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+       console_session.ses_ping = NULL;
+
+       spin_lock_init(&console_session.ses_rpc_lock);
+       atomic_set(&console_session.ses_rpc_counter, 0);
+       INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+       return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+       LASSERT (list_empty(&console_session.ses_rpc_freelist));
+       LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644 (file)
index 0000000..9aba24a
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conrpc.h
+ *
+ * Console RPC
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+#define LST_TRANS_TIMEOUT       30
+#define LST_TRANS_MIN_TIMEOUT   3
+
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX((t), LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+#define LST_PING_INTERVAL       8
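+/* The intervals above are in seconds: LST_PING_INTERVAL is added to
+ * cfs_time_current_sec() in conrpc.c, and the transaction timeouts are
+ * passed to lstcon_rpc_trans_postwait() (seconds assumed from that usage). */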
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+       struct list_head               crp_link;       /* chain on rpc transaction */
+       srpc_client_rpc_t       *crp_rpc;       /* client rpc */
+       struct lstcon_node      *crp_node;       /* destination node */
+       struct lstcon_rpc_trans *crp_trans;     /* conrpc transaction */
+
+       unsigned int             crp_posted:1;   /* rpc is posted */
+       unsigned int             crp_finished:1; /* rpc is finished */
+       unsigned int             crp_unpacked:1; /* reply is unpacked */
+       /* RPC is embedded in another structure and can't be freed */
+       unsigned int             crp_embedded:1;
+       int                   crp_status;     /* console rpc errors */
+       cfs_time_t             crp_stamp;      /* replied time stamp */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+       struct list_head            tas_olink;     /* link chain on owner list */
+       struct list_head            tas_link;      /* link chain on global list */
+       int                tas_opc;       /* operation code of transaction */
+       /* features mask is up to date */
+       unsigned              tas_feats_updated;
+       /* test features mask */
+       unsigned              tas_features;
+       wait_queue_head_t          tas_waitq;     /* wait queue head */
+       atomic_t          tas_remaining; /* # of un-scheduled rpcs */
+       struct list_head            tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+#define LST_TRANS_PRIVATE       0x1000
+
+#define LST_TRANS_SESNEW       (LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND       (LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY       0x03
+#define LST_TRANS_SESPING      0x04
+
+#define LST_TRANS_TSBCLIADD    (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD    (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN       (LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP      (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY    0x15
+#define LST_TRANS_TSBSRVQRY    0x16
+
+#define LST_TRANS_STATQRY       0x21
+
+typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+typedef int (*lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+int  lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+                       unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_dbgrpc_prep(struct lstcon_node *nd,
+                       unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+                       struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int  lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+                        struct lstcon_test *test, lstcon_rpc_t **crpc);
+int  lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+                        lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+int  lstcon_rpc_trans_prep(struct list_head *translist,
+                          int transop, lstcon_rpc_trans_t **transpp);
+int  lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+                            struct list_head *translist, int transop,
+                            void *arg, lstcon_rpc_cond_func_t condition,
+                            lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+                          lstcon_trans_stat_t *stat);
+int  lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+                                 struct list_head *head_up,
+                                 lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int  lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+int  lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int  lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+#endif /* __LST_CONRPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644 (file)
index 0000000..78e8d04
--- /dev/null
@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p)                  \
+do {                                                   \
+       if ((nd)->nd_state == LST_NODE_ACTIVE)          \
+               (p)->nle_nactive++;                     \
+       else if ((nd)->nd_state == LST_NODE_BUSY)       \
+               (p)->nle_nbusy++;                       \
+       else if ((nd)->nd_state == LST_NODE_DOWN)       \
+               (p)->nle_ndown++;                       \
+       else                                            \
+               (p)->nle_nunknown++;                    \
+       (p)->nle_nnode++;                               \
+} while (0)
+
+lstcon_session_t       console_session;
+
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+       LASSERT (nd->nd_ref >= 1);
+
+       nd->nd_ref++;
+}
+
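+/*
+ * Look up a node by LNet process id in the session-global hash, optionally
+ * creating it on a miss.  The node and its session ndlink are allocated as
+ * one chunk, so the link lives directly behind the node; see the
+ * (lstcon_ndlink_t *)(*ndpp + 1) arithmetic here and in lstcon_node_put().
+ */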
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+       lstcon_ndlink_t *ndl;
+       unsigned int     idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+       LASSERT (id.nid != LNET_NID_ANY);
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+               if (ndl->ndl_node->nd_id.nid != id.nid ||
+                   ndl->ndl_node->nd_id.pid != id.pid)
+                       continue;
+
+               lstcon_node_get(ndl->ndl_node);
+               *ndpp = ndl->ndl_node;
+               return 0;
+       }
+
+       if (!create)
+               return -ENOENT;
+
+       LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+       if (*ndpp == NULL)
+               return -ENOMEM;
+
+       ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+       ndl->ndl_node = *ndpp;
+
+       ndl->ndl_node->nd_ref   = 1;
+       ndl->ndl_node->nd_id    = id;
+       ndl->ndl_node->nd_stamp = cfs_time_current();
+       ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+       ndl->ndl_node->nd_timeout = 0;
+       memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+       /* queued in global hash & list; no refcount is taken by the
+        * global hash & list, so the node is released as soon as the
+        * caller drops its refcount */
+       list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+       list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+       return 0;
+}
+
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+       lstcon_ndlink_t  *ndl;
+
+       LASSERT (nd->nd_ref > 0);
+
+       if (--nd->nd_ref > 0)
+               return;
+
+       ndl = (lstcon_ndlink_t *)(nd + 1);
+
+       LASSERT (!list_empty(&ndl->ndl_link));
+       LASSERT (!list_empty(&ndl->ndl_hlink));
+
+       /* remove from session */
+       list_del(&ndl->ndl_link);
+       list_del(&ndl->ndl_hlink);
+
+       LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
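+/*
+ * Find or create a node link in @hash.  @create selects the behaviour:
+ * 0 - lookup only; 1 - also create the session node if it is missing;
+ * 2 - create the link, but only for a node that already exists in the
+ * session (note the (create == 1) test below).
+ */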
+static int
+lstcon_ndlink_find(struct list_head *hash,
+                  lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+       unsigned int     idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+       lstcon_ndlink_t *ndl;
+       lstcon_node_t   *nd;
+       int           rc;
+
+       if (id.nid == LNET_NID_ANY)
+               return -EINVAL;
+
+       /* search in hash */
+       list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+               if (ndl->ndl_node->nd_id.nid != id.nid ||
+                   ndl->ndl_node->nd_id.pid != id.pid)
+                       continue;
+
+               *ndlpp = ndl;
+               return 0;
+       }
+
+       if (create == 0)
+               return -ENOENT;
+
+       /* find or create in session hash */
+       rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+       if (rc != 0)
+               return rc;
+
+       LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+       if (ndl == NULL) {
+               lstcon_node_put(nd);
+               return -ENOMEM;
+       }
+
+       *ndlpp = ndl;
+
+       ndl->ndl_node = nd;
+       INIT_LIST_HEAD(&ndl->ndl_link);
+       list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+       return 0;
+}
+
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+       LASSERT (list_empty(&ndl->ndl_link));
+       LASSERT (!list_empty(&ndl->ndl_hlink));
+
+       list_del(&ndl->ndl_hlink); /* delete from hash */
+       lstcon_node_put(ndl->ndl_node);
+
+       LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+       lstcon_group_t *grp;
+       int          i;
+
+       LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+                                  grp_ndl_hash[LST_NODE_HASHSIZE]));
+       if (grp == NULL)
+               return -ENOMEM;
+
+       memset(grp, 0, offsetof(lstcon_group_t,
+                               grp_ndl_hash[LST_NODE_HASHSIZE]));
+
+       grp->grp_ref = 1;
+       if (name != NULL)
+               strcpy(grp->grp_name, name);
+
+       INIT_LIST_HEAD(&grp->grp_link);
+       INIT_LIST_HEAD(&grp->grp_ndl_list);
+       INIT_LIST_HEAD(&grp->grp_trans_list);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++)
+               INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+       *grpp = grp;
+
+       return 0;
+}
+
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+       grp->grp_ref++;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+       lstcon_ndlink_t *ndl;
+       lstcon_ndlink_t *tmp;
+
+       list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) {
+               if ((ndl->ndl_node->nd_state & keep) == 0)
+                       lstcon_group_ndlink_release(grp, ndl);
+       }
+}
+
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+       int     i;
+
+       if (--grp->grp_ref > 0)
+               return;
+
+       if (!list_empty(&grp->grp_link))
+               list_del(&grp->grp_link);
+
+       lstcon_group_drain(grp, 0);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               LASSERT (list_empty(&grp->grp_ndl_hash[i]));
+       }
+
+       LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+                                 grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+       lstcon_group_t   *grp;
+
+       list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+               if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0)
+                       continue;
+
+               lstcon_group_addref(grp);  /* +1 ref for caller */
+               *grpp = grp;
+               return 0;
+       }
+
+       return -ENOENT;
+}
+
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+       lstcon_group_decref(grp);
+}
+
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+                        lstcon_ndlink_t **ndlpp, int create)
+{
+       int     rc;
+
+       rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+       if (rc != 0)
+               return rc;
+
+       if (!list_empty(&(*ndlpp)->ndl_link))
+               return 0;
+
+       list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list);
+       grp->grp_nnode++;
+
+       return 0;
+}
+
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+       list_del_init(&ndl->ndl_link);
+       lstcon_ndlink_release(ndl);
+       grp->grp_nnode--;
+}
+
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+                        lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+       unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) %
+                          LST_NODE_HASHSIZE;
+
+       list_del(&ndl->ndl_hlink);
+       list_del(&ndl->ndl_link);
+       old->grp_nnode--;
+
+       list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]);
+       list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+       new->grp_nnode++;
+}
+
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+       lstcon_ndlink_t *ndl;
+
+       while (!list_empty(&old->grp_ndl_list)) {
+               ndl = list_entry(old->grp_ndl_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               lstcon_group_ndlink_move(old, new, ndl);
+       }
+}
+
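+/*
+ * Condition callback for session transactions: decide whether a node takes
+ * part.  Return 1 to include the node, 0 to skip it; negative returns
+ * report per-node errors (cf. lstcon_batrpc_condition() below).
+ */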
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+               if (nd->nd_state == LST_NODE_ACTIVE)
+                       return 0;
+               break;
+
+       case LST_TRANS_SESEND:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return 0;
+
+               if (grp != NULL && nd->nd_ref > 1)
+                       return 0;
+               break;
+
+       case LST_TRANS_SESQRY:
+               break;
+
+       default:
+               LBUG();
+       }
+
+       return 1;
+}
+
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+                     lstcon_rpc_ent_t *ent_up)
+{
+       srpc_debug_reply_t *rep;
+
+       switch (transop) {
+       case LST_TRANS_SESNEW:
+       case LST_TRANS_SESEND:
+               return 0;
+
+       case LST_TRANS_SESQRY:
+               rep = &msg->msg_body.dbg_reply;
+
+               if (copy_to_user(&ent_up->rpe_priv[0],
+                                    &rep->dbg_timeout, sizeof(int)) ||
+                   copy_to_user(&ent_up->rpe_payload[0],
+                                    &rep->dbg_name, LST_NAME_SIZE))
+                       return -EFAULT;
+
+               return 0;
+
+       default:
+               LBUG();
+       }
+
+       return 0;
+}
+
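+/*
+ * Add nodes to @grp: stage the new ids in a temporary group, run a
+ * LST_TRANS_SESNEW transaction on them, then merge the temporary group
+ * into @grp.  Ids already present in @grp are skipped.
+ */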
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+                      int count, lnet_process_id_t *ids_up,
+                      unsigned *featp, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t      *trans;
+       lstcon_ndlink_t  *ndl;
+       lstcon_group_t    *tmp;
+       lnet_process_id_t       id;
+       int                   i;
+       int                   rc;
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0 ; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* skip if it's in this group already */
+               rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+               if (rc == 0)
+                       continue;
+
+               /* add to tmp group */
+               rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+               if (rc != 0) {
+                       CERROR("Can't create ndlink, out of memory\n");
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+                                    &tmp->grp_trans_list, LST_TRANS_SESNEW,
+                                    tmp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       /* post all RPCs */
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_sesrpc_readent);
+       *featp = trans->tas_features;
+
+       /* destroy all RPCs */
+       lstcon_rpc_trans_destroy(trans);
+
+       lstcon_group_move(tmp, grp);
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+                         int count, lnet_process_id_t *ids_up,
+                         struct list_head *result_up)
+{
+       lstcon_rpc_trans_t     *trans;
+       lstcon_ndlink_t *ndl;
+       lstcon_group_t   *tmp;
+       lnet_process_id_t       id;
+       int                  rc;
+       int                  i;
+
+       /* End session and remove node from the group */
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       goto error;
+               }
+
+               /* move node to tmp group */
+               if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+                       lstcon_group_ndlink_move(grp, tmp, ndl);
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+                                    &tmp->grp_trans_list, LST_TRANS_SESEND,
+                                    tmp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               goto error;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* release nodes anyway, because we can't rollback status */
+       lstcon_group_put(tmp);
+
+       return rc;
+error:
+       lstcon_group_move(tmp, grp);
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+       lstcon_group_t *grp;
+       int          rc;
+
+       rc = (lstcon_group_find(name, &grp) == 0) ? -EEXIST : 0;
+       if (rc != 0) {
+               /* found a group with the same name */
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       rc = lstcon_group_alloc(name, &grp);
+       if (rc != 0) {
+               CERROR("Can't allocate descriptor for group %s\n", name);
+               return -ENOMEM;
+       }
+
+       list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+       return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+                unsigned *featp, struct list_head *result_up)
+{
+       lstcon_group_t   *grp;
+       int                  rc;
+
+       LASSERT (count > 0);
+       LASSERT (ids_up != NULL);
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by other threads or tests */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+
+               return -EBUSY;
+       }
+
+       rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_group_t     *grp;
+       int              rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by other threads or tests */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &grp->grp_trans_list, LST_TRANS_SESEND,
+                                    grp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       lstcon_rpc_trans_destroy(trans);
+
+       lstcon_group_put(grp);
+       /* -ref for session; status can't be rolled back,
+        * so destroy the group anyway */
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+       lstcon_group_t *grp = NULL;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+               LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+       lstcon_group_drain(grp, args);
+
+       lstcon_group_put(grp);
+       /* release empty group */
+       if (list_empty(&grp->grp_ndl_list))
+               lstcon_group_put(grp);
+
+       return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+                   lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+       lstcon_group_t *grp = NULL;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+       lstcon_group_put(grp);
+       /* release empty group */
+       if (list_empty(&grp->grp_ndl_list))
+               lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t      *trans;
+       lstcon_group_t    *grp;
+       int                   rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group: %s\n", name);
+               return rc;
+       }
+
+       if (grp->grp_ref > 2) {
+               /* referred by test */
+               CDEBUG(D_NET, "Group %s is busy\n", name);
+               lstcon_group_put(grp);
+               return -EBUSY;
+       }
+
+       /* re-invite all inactive nodes in the group */
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &grp->grp_trans_list, LST_TRANS_SESNEW,
+                                    grp, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               /* local error, return */
+               CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* -ref for me */
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+       lstcon_group_t *grp;
+
+       LASSERT (index >= 0);
+       LASSERT (name_up != NULL);
+
+       list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+               if (index-- == 0) {
+                       return copy_to_user(name_up, grp->grp_name, len) ?
+                              -EFAULT : 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
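+/*
+ * Copy node entries from @head to userspace.  On entry *index_p is the
+ * first index wanted and *count_p the room available in @dents_up; on
+ * return they hold the next index and the number of entries copied.
+ */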
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+                   int *count_p, lstcon_node_ent_t *dents_up)
+{
+       lstcon_ndlink_t  *ndl;
+       lstcon_node_t    *nd;
+       int            count = 0;
+       int            index = 0;
+
+       LASSERT (index_p != NULL && count_p != NULL);
+       LASSERT (dents_up != NULL);
+       LASSERT (*index_p >= 0);
+       LASSERT (*count_p > 0);
+
+       list_for_each_entry(ndl, head, ndl_link) {
+               if (index++ < *index_p)
+                       continue;
+
+               if (count >= *count_p)
+                       break;
+
+               nd = ndl->ndl_node;
+               if (copy_to_user(&dents_up[count].nde_id,
+                                    &nd->nd_id, sizeof(nd->nd_id)) ||
+                   copy_to_user(&dents_up[count].nde_state,
+                                    &nd->nd_state, sizeof(nd->nd_state)))
+                       return -EFAULT;
+
+               count++;
+       }
+
+       if (index <= *index_p)
+               return -ENOENT;
+
+       *count_p = count;
+       *index_p = index;
+
+       return 0;
+}
+
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+                 int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+       lstcon_ndlist_ent_t *gentp;
+       lstcon_group_t      *grp;
+       lstcon_ndlink_t     *ndl;
+       int               rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", name);
+               return rc;
+       }
+
+       if (dents_up != NULL) {
+               /* verbose query */
+               rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+                                        index_p, count_p, dents_up);
+               lstcon_group_put(grp);
+
+               return rc;
+       }
+
+       /* non-verbose query */
+       LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+       if (gentp == NULL) {
+               CERROR("Can't allocate ndlist_ent\n");
+               lstcon_group_put(grp);
+
+               return -ENOMEM;
+       }
+
+       memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+       list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+       rc = copy_to_user(gents_p, gentp,
+                             sizeof(lstcon_ndlist_ent_t)) ? -EFAULT : 0;
+
+       LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+       lstcon_batch_t   *bat;
+
+       list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+               if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+                       *batpp = bat;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+       lstcon_batch_t   *bat;
+       int            i;
+       int            rc;
+
+       rc = (lstcon_batch_find(name, &bat) == 0) ? -EEXIST : 0;
+       if (rc != 0) {
+               CDEBUG(D_NET, "Batch %s already exists\n", name);
+               return rc;
+       }
+
+       LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+       if (bat == NULL) {
+               CERROR("Can't allocate descriptor for batch %s\n", name);
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(bat->bat_cli_hash,
+                    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       if (bat->bat_cli_hash == NULL) {
+               CERROR("Can't allocate hash for batch %s\n", name);
+               LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+               return -ENOMEM;
+       }
+
+       LIBCFS_ALLOC(bat->bat_srv_hash,
+                    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       if (bat->bat_srv_hash == NULL) {
+               CERROR("Can't allocate hash for batch %s\n", name);
+               LIBCFS_FREE(bat->bat_cli_hash,
+                           sizeof(struct list_head) * LST_NODE_HASHSIZE);
+               LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+               return -ENOMEM;
+       }
+
+       strcpy(bat->bat_name, name);
+       bat->bat_hdr.tsb_index = 0;
+       bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+       bat->bat_ntest = 0;
+       bat->bat_state = LST_BATCH_IDLE;
+
+       INIT_LIST_HEAD(&bat->bat_cli_list);
+       INIT_LIST_HEAD(&bat->bat_srv_list);
+       INIT_LIST_HEAD(&bat->bat_test_list);
+       INIT_LIST_HEAD(&bat->bat_trans_list);
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+               INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+       }
+
+       list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+       return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+       lstcon_batch_t    *bat;
+
+       LASSERT (name_up != NULL);
+       LASSERT (index >= 0);
+
+       list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+               if (index-- == 0) {
+                       return copy_to_user(name_up, bat->bat_name, len) ?
+                              -EFAULT : 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+                 int testidx, int *index_p, int *ndent_p,
+                 lstcon_node_ent_t *dents_up)
+{
+       lstcon_test_batch_ent_t *entp;
+       struct list_head              *clilst;
+       struct list_head              *srvlst;
+       lstcon_test_t      *test = NULL;
+       lstcon_batch_t    *bat;
+       lstcon_ndlink_t  *ndl;
+       int                   rc;
+
+       rc = lstcon_batch_find(name, &bat);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       if (testidx > 0) {
+               /* query a test; test index starts from 1 */
+               list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+                       if (testidx-- == 1)
+                               break;
+               }
+
+               if (testidx > 0) {
+                       CDEBUG(D_NET, "Can't find specified test in batch\n");
+                       return -ENOENT;
+               }
+       }
+
+       clilst = (test == NULL) ? &bat->bat_cli_list :
+                                 &test->tes_src_grp->grp_ndl_list;
+       srvlst = (test == NULL) ? &bat->bat_srv_list :
+                                 &test->tes_dst_grp->grp_ndl_list;
+
+       if (dents_up != NULL) {
+               rc = lstcon_nodes_getent((server ? srvlst : clilst),
+                                        index_p, ndent_p, dents_up);
+               return rc;
+       }
+
+       /* non-verbose query */
+       LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+       if (entp == NULL)
+               return -ENOMEM;
+
+       memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+       if (test == NULL) {
+               entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+               entp->u.tbe_batch.bae_state = bat->bat_state;
+       } else {
+               entp->u.tbe_test.tse_type   = test->tes_type;
+               entp->u.tbe_test.tse_loop   = test->tes_loop;
+               entp->u.tbe_test.tse_concur = test->tes_concur;
+       }
+
+       list_for_each_entry(ndl, clilst, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+       list_for_each_entry(ndl, srvlst, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+       rc = copy_to_user(ent_up, entp,
+                             sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+       LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+       return rc;
+}
+
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       switch (transop) {
+       case LST_TRANS_TSBRUN:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return -ENETDOWN;
+               break;
+
+       case LST_TRANS_TSBSTOP:
+               if (nd->nd_state != LST_NODE_ACTIVE)
+                       return 0;
+               break;
+
+       case LST_TRANS_TSBCLIQRY:
+       case LST_TRANS_TSBSRVQRY:
+               break;
+       }
+
+       return 1;
+}
+
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+               struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+                                    &bat->bat_trans_list, transop,
+                                    bat, lstcon_batrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       if (lstcon_batch_find(name, &bat) != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       bat->bat_arg = timeout;
+
+       rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+       /* mark batch as running if it's started in any node */
+       if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+               bat->bat_state = LST_BATCH_RUNNING;
+
+       return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       if (lstcon_batch_find(name, &bat) != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return -ENOENT;
+       }
+
+       bat->bat_arg = force;
+
+       rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+       /* mark batch as stopped if all RPCs finished */
+       if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+               bat->bat_state = LST_BATCH_IDLE;
+
+       return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+       lstcon_ndlink_t    *ndl;
+       lstcon_test_t      *test;
+       int              i;
+
+       list_del(&bat->bat_link);
+
+       while (!list_empty(&bat->bat_test_list)) {
+               test = list_entry(bat->bat_test_list.next,
+                                     lstcon_test_t, tes_link);
+               LASSERT (list_empty(&test->tes_trans_list));
+
+               list_del(&test->tes_link);
+
+               lstcon_group_put(test->tes_src_grp);
+               lstcon_group_put(test->tes_dst_grp);
+
+               LIBCFS_FREE(test, offsetof(lstcon_test_t,
+                                          tes_param[test->tes_paramlen]));
+       }
+
+       LASSERT (list_empty(&bat->bat_trans_list));
+
+       while (!list_empty(&bat->bat_cli_list)) {
+               ndl = list_entry(bat->bat_cli_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               list_del_init(&ndl->ndl_link);
+
+               lstcon_ndlink_release(ndl);
+       }
+
+       while (!list_empty(&bat->bat_srv_list)) {
+               ndl = list_entry(bat->bat_srv_list.next,
+                                    lstcon_ndlink_t, ndl_link);
+               list_del_init(&ndl->ndl_link);
+
+               lstcon_ndlink_release(ndl);
+       }
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               LASSERT (list_empty(&bat->bat_cli_hash[i]));
+               LASSERT (list_empty(&bat->bat_srv_hash[i]));
+       }
+
+       LIBCFS_FREE(bat->bat_cli_hash,
+                   sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       LIBCFS_FREE(bat->bat_srv_hash,
+                   sizeof(struct list_head) * LST_NODE_HASHSIZE);
+       LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
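+/*
+ * Condition callback for test transactions: skip the server side of a
+ * one-sided test, fail inactive nodes with -ENETDOWN, and register each
+ * participating node in the batch's client or server list and hash.
+ */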
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+       lstcon_test_t    *test;
+       lstcon_batch_t   *batch;
+       lstcon_ndlink_t  *ndl;
+       struct list_head       *hash;
+       struct list_head       *head;
+
+       test = (lstcon_test_t *)arg;
+       LASSERT (test != NULL);
+
+       batch = test->tes_batch;
+       LASSERT (batch != NULL);
+
+       if (test->tes_oneside &&
+           transop == LST_TRANS_TSBSRVADD)
+               return 0;
+
+       if (nd->nd_state != LST_NODE_ACTIVE)
+               return -ENETDOWN;
+
+       if (transop == LST_TRANS_TSBCLIADD) {
+               hash = batch->bat_cli_hash;
+               head = &batch->bat_cli_list;
+
+       } else {
+               LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+               hash = batch->bat_srv_hash;
+               head = &batch->bat_srv_list;
+       }
+
+       LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+       if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+               return -ENOMEM;
+
+       if (list_empty(&ndl->ndl_link))
+               list_add_tail(&ndl->ndl_link, head);
+
+       return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t     *trans;
+       lstcon_group_t   *grp;
+       int                  transop;
+       int                  rc;
+
+       LASSERT (test->tes_src_grp != NULL);
+       LASSERT (test->tes_dst_grp != NULL);
+
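+       /* two passes: set up the server side (destination group) first and
+        * the client side second, presumably so servers are ready before
+        * any client starts (see the 'again' loop below) */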
+       transop = LST_TRANS_TSBSRVADD;
+       grp  = test->tes_dst_grp;
+again:
+       rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+                                    &test->tes_trans_list, transop,
+                                    test, lstcon_testrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+           lstcon_trans_stat()->trs_fwk_errno != 0) {
+               lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+               lstcon_rpc_trans_destroy(trans);
+               /* return if any error */
+               CDEBUG(D_NET, "Failed to add test %s, "
+                             "RPC error %d, framework error %d\n",
+                      transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+                      lstcon_trans_stat()->trs_rpc_errno,
+                      lstcon_trans_stat()->trs_fwk_errno);
+
+               return rc;
+       }
+
+       lstcon_rpc_trans_destroy(trans);
+
+       if (transop == LST_TRANS_TSBCLIADD)
+               return rc;
+
+       transop = LST_TRANS_TSBCLIADD;
+       grp = test->tes_src_grp;
+       test->tes_cliidx = 0;
+
+       /* requests to test clients */
+       goto again;
+}
+
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+               int dist, int span, char *src_name, char *dst_name,
+               void *param, int paramlen, int *retp,
+               struct list_head *result_up)
+{
+       lstcon_group_t  *src_grp = NULL;
+       lstcon_group_t  *dst_grp = NULL;
+       lstcon_test_t   *test    = NULL;
+       lstcon_batch_t  *batch;
+       int           rc;
+
+       rc = lstcon_batch_find(name, &batch);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch %s\n", name);
+               return rc;
+       }
+
+       if (batch->bat_state != LST_BATCH_IDLE) {
+               CDEBUG(D_NET, "Can't change running batch %s\n", name);
+               return -EBUSY;
+       }
+
+       rc = lstcon_group_find(src_name, &src_grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", src_name);
+               goto out;
+       }
+
+       rc = lstcon_group_find(dst_name, &dst_grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+               goto out;
+       }
+
+       if (dst_grp->grp_userland)
+               *retp = 1;
+
+       LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+       if (!test) {
+               CERROR("Can't allocate test descriptor\n");
+               rc = -ENOMEM;
+
+               goto out;
+       }
+
+       memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+       test->tes_hdr.tsb_id    = batch->bat_hdr.tsb_id;
+       test->tes_batch         = batch;
+       test->tes_type          = type;
+       test->tes_oneside       = 0; /* TODO */
+       test->tes_loop          = loop;
+       test->tes_concur        = concur;
+       test->tes_stop_onerr    = 1; /* TODO */
+       test->tes_span          = span;
+       test->tes_dist          = dist;
+       test->tes_cliidx        = 0; /* just used for creating RPC */
+       test->tes_src_grp       = src_grp;
+       test->tes_dst_grp       = dst_grp;
+       INIT_LIST_HEAD(&test->tes_trans_list);
+
+       if (param != NULL) {
+               test->tes_paramlen = paramlen;
+               memcpy(&test->tes_param[0], param, paramlen);
+       }
+
+       rc = lstcon_test_nodes_add(test, result_up);
+
+       if (rc != 0)
+               goto out;
+
+       if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+           lstcon_trans_stat()->trs_fwk_errno != 0)
+               CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name);
+
+       /* add to test list anyway, so user can check what's going on */
+       list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+       batch->bat_ntest++;
+       test->tes_hdr.tsb_index = batch->bat_ntest;
+
+       /* hold groups so nobody can change them */
+       return rc;
+out:
+       if (test != NULL)
+               LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+       if (dst_grp != NULL)
+               lstcon_group_put(dst_grp);
+
+       if (src_grp != NULL)
+               lstcon_group_put(src_grp);
+
+       return rc;
+}
+
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+       lstcon_test_t *test;
+
+       list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+               if (idx == test->tes_hdr.tsb_index) {
+                       *testpp = test;
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+                     lstcon_rpc_ent_t *ent_up)
+{
+       srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+       LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+                transop == LST_TRANS_TSBSRVQRY);
+
+       /* positive errno, framework error code */
+       if (copy_to_user(&ent_up->rpe_priv[0],
+                            &rep->bar_active, sizeof(rep->bar_active)))
+               return -EFAULT;
+
+       return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+                       int timeout, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       struct list_head         *translist;
+       struct list_head         *ndlist;
+       lstcon_tsb_hdr_t   *hdr;
+       lstcon_batch_t     *batch;
+       lstcon_test_t      *test = NULL;
+       int              transop;
+       int              rc;
+
+       rc = lstcon_batch_find(name, &batch);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find batch: %s\n", name);
+               return rc;
+       }
+
+       if (testidx == 0) {
+               translist = &batch->bat_trans_list;
+               ndlist    = &batch->bat_cli_list;
+               hdr       = &batch->bat_hdr;
+
+       } else {
+               /* query specified test only */
+               rc = lstcon_test_find(batch, testidx, &test);
+               if (rc != 0) {
+                       CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+                       return rc;
+               }
+
+               translist = &test->tes_trans_list;
+               ndlist    = &test->tes_src_grp->grp_ndl_list;
+               hdr       = &test->tes_hdr;
+       }
+
+       transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+                                    lstcon_batrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, timeout);
+
+       if (testidx == 0 && /* query a batch, not a test */
+           lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+           lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+               /* all RPCs finished, and no active test */
+               batch->bat_state = LST_BATCH_IDLE;
+       }
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_tsbrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+                      lstcon_rpc_ent_t *ent_up)
+{
+       srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+       sfw_counters_t    *sfwk_stat;
+       srpc_counters_t   *srpc_stat;
+       lnet_counters_t   *lnet_stat;
+
+       if (rep->str_status != 0)
+               return 0;
+
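+       /* the reply payload carries the three counter blocks back to back:
+        * framework, RPC, then LNet counters */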
+       sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+       srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+       lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+       if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+           copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+           copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+               return -EFAULT;
+
+       return 0;
+}
+
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+                  int timeout, struct list_head *result_up)
+{
+       struct list_head          head;
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       INIT_LIST_HEAD(&head);
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+                                    LST_TRANS_STATQRY, NULL, NULL, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_statrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+       lstcon_group_t     *grp;
+       int              rc;
+
+       rc = lstcon_group_find(grp_name, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+               return rc;
+       }
+
+       rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+                 int timeout, struct list_head *result_up)
+{
+       lstcon_ndlink_t  *ndl;
+       lstcon_group_t    *tmp;
+       lnet_process_id_t       id;
+       int                   i;
+       int                   rc;
+
+       rc = lstcon_group_alloc(NULL, &tmp);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0 ; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* add to tmp group */
+               rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+               if (rc != 0) {
+                       CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+                              "Failed to find or create %s: %d\n",
+                              libcfs_id2str(id), rc);
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(tmp);
+               return rc;
+       }
+
+       rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+       lstcon_group_put(tmp);
+
+       return rc;
+}
+
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+                   struct list_head *translist,
+                   int timeout, struct list_head *result_up)
+{
+       lstcon_rpc_trans_t *trans;
+       int              rc;
+
+       rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+                                    NULL, lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+       rc = lstcon_rpc_trans_interpreter(trans, result_up,
+                                         lstcon_sesrpc_readent);
+       lstcon_rpc_trans_destroy(trans);
+
+       return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+       return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+                                  NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+                  int client, struct list_head *result_up)
+{
+       lstcon_batch_t *bat;
+       int          rc;
+
+       rc = lstcon_batch_find(name, &bat);
+       if (rc != 0)
+               return -ENOENT;
+
+       rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+                                         &bat->bat_srv_list,
+                                NULL, timeout, result_up);
+
+       return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+                  struct list_head *result_up)
+{
+       lstcon_group_t *grp;
+       int          rc;
+
+       rc = lstcon_group_find(name, &grp);
+       if (rc != 0)
+               return -ENOENT;
+
+       rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+                                timeout, result_up);
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+                  int count, lnet_process_id_t *ids_up,
+                  struct list_head *result_up)
+{
+       lnet_process_id_t  id;
+       lstcon_ndlink_t   *ndl;
+       lstcon_group_t    *grp;
+       int             i;
+       int             rc;
+
+       rc = lstcon_group_alloc(NULL, &grp);
+       if (rc != 0) {
+               CDEBUG(D_NET, "Out of memory\n");
+               return rc;
+       }
+
+       for (i = 0; i < count; i++) {
+               if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+                       rc = -EFAULT;
+                       break;
+               }
+
+               /* node is added to tmp group */
+               rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+               if (rc != 0) {
+                       CERROR("Can't create node link\n");
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               lstcon_group_put(grp);
+               return rc;
+       }
+
+       rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+                                timeout, result_up);
+
+       lstcon_group_put(grp);
+
+       return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+       return (console_session.ses_id.ses_nid   == sid.ses_nid &&
+               console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1 : 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+       lnet_process_id_t      id;
+
+       LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+       LNetGetId(1, &id);
+       sid->ses_nid   = id.nid;
+       sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+                  int timeout, int force, lst_sid_t *sid_up)
+{
+       int     rc = 0;
+       int     i;
+
+       if (console_session.ses_state != LST_SESSION_NONE) {
+               /* session exists */
+               if (!force) {
+                       CNETERR("Session %s already exists\n",
+                               console_session.ses_name);
+                       return -EEXIST;
+               }
+
+               rc = lstcon_session_end();
+
+               /* lstcon_session_end() only returns local errors */
+               if (rc != 0)
+                       return rc;
+       }
+
+       if ((feats & ~LST_FEATS_MASK) != 0) {
+               CNETERR("Unknown session features %x\n",
+                       (feats & ~LST_FEATS_MASK));
+               return -EINVAL;
+       }
+
+       for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+               LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+       lstcon_new_session_id(&console_session.ses_id);
+
+       console_session.ses_key     = key;
+       console_session.ses_state   = LST_SESSION_ACTIVE;
+       console_session.ses_force   = !!force;
+       console_session.ses_features = feats;
+       console_session.ses_feats_updated = 0;
+       console_session.ses_timeout = (timeout <= 0) ?
+                                     LST_CONSOLE_TIMEOUT : timeout;
+       strlcpy(console_session.ses_name, name,
+               sizeof(console_session.ses_name));
+
+       rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+       if (rc != 0)
+               return rc;
+
+       rc = lstcon_rpc_pinger_start();
+       if (rc != 0) {
+               lstcon_batch_t *bat = NULL;
+
+               lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+               lstcon_batch_destroy(bat);
+
+               return rc;
+       }
+
+       if (copy_to_user(sid_up, &console_session.ses_id,
+                            sizeof(lst_sid_t)) == 0)
+               return rc;
+
+       lstcon_session_end();
+
+       return -EFAULT;
+}
+
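+/*
+ * Copy the current session's id, key, feature mask, per-state node
+ * counters and name back to userspace.
+ */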
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+                   lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+       lstcon_ndlist_ent_t *entp;
+       lstcon_ndlink_t     *ndl;
+       int               rc = 0;
+
+       if (console_session.ses_state != LST_SESSION_ACTIVE)
+               return -ESRCH;
+
+       LIBCFS_ALLOC(entp, sizeof(*entp));
+       if (entp == NULL)
+               return -ENOMEM;
+
+       memset(entp, 0, sizeof(*entp));
+
+       list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+               LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+       if (copy_to_user(sid_up, &console_session.ses_id,
+                            sizeof(lst_sid_t)) ||
+           copy_to_user(key_up, &console_session.ses_key,
+                            sizeof(*key_up)) ||
+           copy_to_user(featp, &console_session.ses_features,
+                            sizeof(*featp)) ||
+           copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+           copy_to_user(name_up, console_session.ses_name, len))
+               rc = -EFAULT;
+
+       LIBCFS_FREE(entp, sizeof(*entp));
+
+       return rc;
+}
+
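+/*
+ * End the current session: broadcast a session-end RPC to all nodes,
+ * stop the pinger, wait for orphan RPCs to drain, then destroy every
+ * batch and group.  Only local errors are returned; remote RPC
+ * failures are ignored.
+ */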
+int
+lstcon_session_end(void)
+{
+       lstcon_rpc_trans_t *trans;
+       lstcon_group_t     *grp;
+       lstcon_batch_t     *bat;
+       int              rc = 0;
+
+       LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+       rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+                                    NULL, LST_TRANS_SESEND, NULL,
+                                    lstcon_sesrpc_condition, &trans);
+       if (rc != 0) {
+               CERROR("Can't create transaction: %d\n", rc);
+               return rc;
+       }
+
+       console_session.ses_shutdown = 1;
+
+       lstcon_rpc_pinger_stop();
+
+       lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+       lstcon_rpc_trans_destroy(trans);
+       /* the user can do nothing even if an RPC failed, so carry on */
+
+       /* waiting for orphan rpcs to die */
+       lstcon_rpc_cleanup_wait();
+
+       console_session.ses_id    = LST_INVALID_SID;
+       console_session.ses_state = LST_SESSION_NONE;
+       console_session.ses_key   = 0;
+       console_session.ses_force = 0;
+       console_session.ses_feats_updated = 0;
+
+       /* destroy all batches */
+       while (!list_empty(&console_session.ses_bat_list)) {
+               bat = list_entry(console_session.ses_bat_list.next,
+                                    lstcon_batch_t, bat_link);
+
+               lstcon_batch_destroy(bat);
+       }
+
+       /* destroy all groups */
+       while (!list_empty(&console_session.ses_grp_list)) {
+               grp = list_entry(console_session.ses_grp_list.next,
+                                    lstcon_group_t, grp_link);
+               LASSERT (grp->grp_ref == 1);
+
+               lstcon_group_put(grp);
+       }
+
+       /* all nodes should be released */
+       LASSERT (list_empty(&console_session.ses_ndl_list));
+
+       console_session.ses_shutdown = 0;
+       console_session.ses_expired  = 0;
+
+       return rc;
+}
+
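+/*
+ * Validate the feature mask reported by a remote node.  The first
+ * update pins the session's feature mask; any later mismatch is
+ * rejected with -EPROTO.
+ */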
+int
+lstcon_session_feats_check(unsigned feats)
+{
+       int rc = 0;
+
+       if ((feats & ~LST_FEATS_MASK) != 0) {
+               CERROR("Can't support these features: %x\n",
+                      (feats & ~LST_FEATS_MASK));
+               return -EPROTO;
+       }
+
+       spin_lock(&console_session.ses_rpc_lock);
+
+       if (!console_session.ses_feats_updated) {
+               console_session.ses_feats_updated = 1;
+               console_session.ses_features = feats;
+       }
+
+       if (console_session.ses_features != feats)
+               rc = -EPROTO;
+
+       spin_unlock(&console_session.ses_rpc_lock);
+
+       if (rc != 0) {
+               CERROR("remote features %x do not match session features %x of console\n",
+                      feats, console_session.ses_features);
+       }
+
+       return rc;
+}
+
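+/*
+ * Handle an incoming JOIN request: validate the session and feature
+ * mask, find or create the target group, link the joining node into
+ * it, and reply with the session name and timeout.
+ */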
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+       srpc_msg_t      *rep  = &rpc->srpc_replymsg;
+       srpc_msg_t      *req  = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+       srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+       lstcon_group_t    *grp  = NULL;
+       lstcon_ndlink_t   *ndl;
+       int             rc   = 0;
+
+       sfw_unpack_message(req);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       jrep->join_sid = console_session.ses_id;
+
+       if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+               jrep->join_status = ESRCH;
+               goto out;
+       }
+
+       if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+               jrep->join_status = EPROTO;
+               goto out;
+       }
+
+       if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+            !lstcon_session_match(jreq->join_sid)) {
+               jrep->join_status = EBUSY;
+               goto out;
+       }
+
+       if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+               rc = lstcon_group_alloc(jreq->join_group, &grp);
+               if (rc != 0) {
+                       CERROR("Out of memory\n");
+                       goto out;
+               }
+
+               list_add_tail(&grp->grp_link,
+                                 &console_session.ses_grp_list);
+               lstcon_group_addref(grp);
+       }
+
+       if (grp->grp_ref > 2) {
+               /* group is in use */
+               jrep->join_status = EBUSY;
+               goto out;
+       }
+
+       rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+       if (rc == 0) {
+               jrep->join_status = EEXIST;
+               goto out;
+       }
+
+       rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+       if (rc != 0) {
+               CERROR("Out of memory\n");
+               goto out;
+       }
+
+       ndl->ndl_node->nd_state   = LST_NODE_ACTIVE;
+       ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+       if (grp->grp_userland == 0)
+               grp->grp_userland = 1;
+
+       strcpy(jrep->join_session, console_session.ses_name);
+       jrep->join_timeout = console_session.ses_timeout;
+       jrep->join_status  = 0;
+
+out:
+       rep->msg_ses_feats = console_session.ses_features;
+       if (grp != NULL)
+               lstcon_group_put(grp);
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+void lstcon_init_acceptor_service(void)
+{
+       /* initialize selftest console acceptor service table */
+       lstcon_acceptor_service.sv_name    = "join session";
+       lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle;
+       lstcon_acceptor_service.sv_id      = SRPC_SERVICE_JOIN;
+       lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/* initialize console */
+int
+lstcon_console_init(void)
+{
+       int     i;
+       int     rc;
+
+       memset(&console_session, 0, sizeof(lstcon_session_t));
+
+       console_session.ses_id              = LST_INVALID_SID;
+       console_session.ses_state           = LST_SESSION_NONE;
+       console_session.ses_timeout         = 0;
+       console_session.ses_force           = 0;
+       console_session.ses_expired         = 0;
+       console_session.ses_feats_updated   = 0;
+       console_session.ses_features        = LST_FEATS_MASK;
+       console_session.ses_laststamp       = cfs_time_current_sec();
+
+       mutex_init(&console_session.ses_mutex);
+
+       INIT_LIST_HEAD(&console_session.ses_ndl_list);
+       INIT_LIST_HEAD(&console_session.ses_grp_list);
+       INIT_LIST_HEAD(&console_session.ses_bat_list);
+       INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+       LIBCFS_ALLOC(console_session.ses_ndl_hash,
+                    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+       if (console_session.ses_ndl_hash == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+               INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+       /* initialize acceptor service table */
+       lstcon_init_acceptor_service();
+
+       rc = srpc_add_service(&lstcon_acceptor_service);
+       LASSERT (rc != -EBUSY);
+       if (rc != 0) {
+               LIBCFS_FREE(console_session.ses_ndl_hash,
+                           sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+               return rc;
+       }
+
+       rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+                                     lstcon_acceptor_service.sv_wi_total);
+       if (rc != 0) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+       if (rc == 0) {
+               lstcon_rpc_module_init();
+               return 0;
+       }
+
+out:
+       srpc_shutdown_service(&lstcon_acceptor_service);
+       srpc_remove_service(&lstcon_acceptor_service);
+
+       LIBCFS_FREE(console_session.ses_ndl_hash,
+                   sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+       srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+       return rc;
+}
+
+int
+lstcon_console_fini(void)
+{
+       int     i;
+
+       libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+       mutex_lock(&console_session.ses_mutex);
+
+       srpc_shutdown_service(&lstcon_acceptor_service);
+       srpc_remove_service(&lstcon_acceptor_service);
+
+       if (console_session.ses_state != LST_SESSION_NONE)
+               lstcon_session_end();
+
+       lstcon_rpc_module_fini();
+
+       mutex_unlock(&console_session.ses_mutex);
+
+       LASSERT (list_empty(&console_session.ses_ndl_list));
+       LASSERT (list_empty(&console_session.ses_grp_list));
+       LASSERT (list_empty(&console_session.ses_bat_list));
+       LASSERT (list_empty(&console_session.ses_trans_list));
+
+       for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+               LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+       }
+
+       LIBCFS_FREE(console_session.ses_ndl_hash,
+                   sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+       srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+       return 0;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644 (file)
index 0000000..e61b266
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+       lnet_process_id_t  nd_id;      /* id of the node */
+       int                nd_ref;     /* reference count */
+       int                nd_state;   /* state of the node */
+       int                nd_timeout; /* session timeout */
+       cfs_time_t         nd_stamp;   /* timestamp of last replied RPC */
+       struct lstcon_rpc  nd_ping;    /* ping rpc */
+} lstcon_node_t;                       /*** node descriptor */
+
+typedef struct {
+       struct list_head   ndl_link;   /* chain on list */
+       struct list_head   ndl_hlink;  /* chain on hash */
+       lstcon_node_t     *ndl_node;   /* pointer to node */
+} lstcon_ndlink_t;                     /*** node link descriptor */
+
+typedef struct {
+       struct list_head   grp_link;     /* chain on global group list */
+       int                grp_ref;      /* reference count */
+       int                grp_userland; /* has userland nodes */
+       int                grp_nnode;    /* # of nodes */
+       char               grp_name[LST_NAME_SIZE]; /* group name */
+
+       struct list_head   grp_trans_list;  /* transaction list */
+       struct list_head   grp_ndl_list;    /* nodes list */
+       struct list_head   grp_ndl_hash[0]; /* hash table for nodes */
+} lstcon_group_t;              /*** (alias of nodes) group descriptor */
+
+#define LST_BATCH_IDLE         0xB0    /* idle batch */
+#define LST_BATCH_RUNNING      0xB1    /* running batch */
+
+typedef struct lstcon_tsb_hdr {
+       lst_bid_t          tsb_id;     /* batch ID */
+       int                tsb_index;  /* test index */
+} lstcon_tsb_hdr_t;
+
+typedef struct {
+       lstcon_tsb_hdr_t   bat_hdr;    /* test_batch header */
+       struct list_head   bat_link;   /* chain on session's batches list */
+       int                bat_ntest;  /* # of tests */
+       int                bat_state;  /* state of the batch */
+       int                bat_arg;    /* parameter for run|stop: timeout for run, force for stop */
+       char               bat_name[LST_NAME_SIZE]; /* name of batch */
+
+       struct list_head   bat_test_list;  /* list head of tests (lstcon_test_t) */
+       struct list_head   bat_trans_list; /* list head of transactions */
+       struct list_head   bat_cli_list;   /* list head of client nodes (lstcon_node_t) */
+       struct list_head  *bat_cli_hash;   /* hash table of client nodes */
+       struct list_head   bat_srv_list;   /* list head of server nodes */
+       struct list_head  *bat_srv_hash;   /* hash table of server nodes */
+} lstcon_batch_t;                      /*** (tests) batch descriptor */
+
+typedef struct lstcon_test {
+       lstcon_tsb_hdr_t   tes_hdr;         /* test batch header */
+       struct list_head   tes_link;        /* chain on batch's tests list */
+       lstcon_batch_t    *tes_batch;       /* pointer to batch */
+
+       int                tes_type;        /* type of the test, e.g. bulk, ping */
+       int                tes_stop_onerr;  /* stop on error */
+       int                tes_oneside;     /* one-sided test */
+       int                tes_concur;      /* concurrency */
+       int                tes_loop;        /* loop count */
+       int                tes_dist;        /* nodes distribution of target group */
+       int                tes_span;        /* nodes span of target group */
+       int                tes_cliidx;      /* client index, used for RPC creation */
+
+       struct list_head   tes_trans_list;  /* transaction list */
+       lstcon_group_t    *tes_src_grp;     /* group that runs the test */
+       lstcon_group_t    *tes_dst_grp;     /* target group */
+
+       int                tes_paramlen;    /* test parameter length */
+       char               tes_param[0];    /* test parameter */
+} lstcon_test_t;                       /*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE    503     /* global nodes hash table size */
+#define LST_NODE_HASHSIZE      239     /* node hash table (for batch or group) */
+
+#define LST_SESSION_NONE       0x0     /* no session */
+#define LST_SESSION_ACTIVE     0x1     /* working session */
+
+#define LST_CONSOLE_TIMEOUT    300     /* default console timeout */
+
+typedef struct {
+       struct mutex        ses_mutex;      /* only 1 thread in session */
+       lst_sid_t           ses_id;         /* global session id */
+       int                 ses_key;        /* local session key */
+       int                 ses_state;      /* state of session */
+       int                 ses_timeout;    /* timeout in seconds */
+       time_t              ses_laststamp;  /* last operation stamp (seconds) */
+       /** test features of the session */
+       unsigned            ses_features;
+       /** features are synced with remote test nodes */
+       unsigned            ses_feats_updated:1;
+       /** force creating */
+       unsigned            ses_force:1;
+       /** session is shutting down */
+       unsigned            ses_shutdown:1;
+       /** console has timed out */
+       unsigned            ses_expired:1;
+       __u64               ses_id_cookie;  /* batch id cookie */
+       char                ses_name[LST_NAME_SIZE]; /* session name */
+       lstcon_rpc_trans_t *ses_ping;       /* session pinger */
+       stt_timer_t         ses_ping_timer; /* timer for pinger */
+       lstcon_trans_stat_t ses_trans_stat; /* transaction stats */
+
+       struct list_head    ses_trans_list; /* global list of transactions */
+       struct list_head    ses_grp_list;   /* global list of groups */
+       struct list_head    ses_bat_list;   /* global list of batches */
+       struct list_head    ses_ndl_list;   /* global list of nodes */
+       struct list_head   *ses_ndl_hash;   /* hash table of nodes */
+
+       spinlock_t          ses_rpc_lock;   /* serialize */
+       atomic_t            ses_rpc_counter;/* # of initialized RPCs */
+       struct list_head    ses_rpc_freelist; /* idle console rpcs */
+} lstcon_session_t;                    /*** session descriptor */
+
+extern lstcon_session_t         console_session;
+
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+       return &console_session.ses_trans_stat;
+}
+
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+       unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+       return &hash[idx];
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+                             int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+                              lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+                             int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+                             struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+                             struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+                           unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+                              struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+                            int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+                           struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+                            struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+                                  int client, int timeout,
+                                  struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+                            int server, int testidx, int *index_p,
+                            int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+                            struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+                            int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+                          int dist, int span, char *src_name, char *dst_name,
+                          void *param, int paramlen, int *retp,
+                          struct list_head *result_up);
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644 (file)
index 0000000..483c785
--- /dev/null
@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen  <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+static int session_timeout = 100;
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+               "test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+               "rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id)             \
+do {                               \
+       __swab64s(&(id).nid);      \
+       __swab32s(&(id).pid);      \
+} while (0)
+
+#define sfw_unpack_sid(sid)         \
+do {                               \
+       __swab64s(&(sid).ses_nid);      \
+       __swab64s(&(sid).ses_stamp);    \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc)     \
+do {                                 \
+       __swab32s(&(fc).running_ms);      \
+       __swab32s(&(fc).active_batches);  \
+       __swab32s(&(fc).zombie_sessions); \
+       __swab32s(&(fc).brw_errors);      \
+       __swab32s(&(fc).ping_errors);     \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc)     \
+do {                               \
+       __swab32s(&(rc).errors);        \
+       __swab32s(&(rc).rpcs_sent);     \
+       __swab32s(&(rc).rpcs_rcvd);     \
+       __swab32s(&(rc).rpcs_dropped);  \
+       __swab32s(&(rc).rpcs_expired);  \
+       __swab64s(&(rc).bulk_get);      \
+       __swab64s(&(rc).bulk_put);      \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc)    \
+do {                               \
+       __swab32s(&(lc).errors);        \
+       __swab32s(&(lc).msgs_max);      \
+       __swab32s(&(lc).msgs_alloc);    \
+       __swab32s(&(lc).send_count);    \
+       __swab32s(&(lc).recv_count);    \
+       __swab32s(&(lc).drop_count);    \
+       __swab32s(&(lc).route_count);   \
+       __swab64s(&(lc).send_length);   \
+       __swab64s(&(lc).recv_length);   \
+       __swab64s(&(lc).drop_length);   \
+       __swab64s(&(lc).route_length);  \
+} while (0)
+
+#define sfw_test_active(t)      (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b)     (atomic_read(&(b)->bat_nactive) != 0)
+
+struct smoketest_framework {
+       struct list_head   fw_zombie_rpcs;     /* RPCs to be recycled */
+       struct list_head   fw_zombie_sessions; /* stopping sessions */
+       struct list_head   fw_tests;           /* registered test cases */
+       atomic_t           fw_nzombies;        /* # zombie sessions */
+       spinlock_t         fw_lock;            /* serialise */
+       sfw_session_t     *fw_session;         /* _the_ session */
+       int                fw_shuttingdown;    /* shutdown in progress */
+       srpc_server_rpc_t *fw_active_srpc;     /* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+       sfw_test_case_t *tsc;
+
+       LASSERT (id <= SRPC_SERVICE_MAX_ID);
+       LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               if (tsc->tsc_srv_service->sv_id == id)
+                       return tsc;
+       }
+
+       return NULL;
+}
+
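+/* register a test case (client ops plus server-side service); fails
+ * with -EEXIST if the service id is already registered */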
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+       sfw_test_case_t *tsc;
+
+       if (sfw_find_test_case(service->sv_id) != NULL) {
+               CERROR ("Failed to register test %s (%d)\n",
+                       service->sv_name, service->sv_id);
+               return -EEXIST;
+       }
+
+       LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t));
+       if (tsc == NULL)
+               return -ENOMEM;
+
+       memset(tsc, 0, sizeof(sfw_test_case_t));
+       tsc->tsc_cli_ops     = cliops;
+       tsc->tsc_srv_service = service;
+
+       list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+       return 0;
+}
+
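+/*
+ * Arm the session expiry timer; a no-op if there is no session or the
+ * session never times out.
+ */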
+void
+sfw_add_session_timer (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       stt_timer_t   *timer;
+
+       LASSERT (!sfw_data.fw_shuttingdown);
+
+       if (sn == NULL || sn->sn_timeout == 0)
+               return;
+
+       LASSERT (!sn->sn_timer_active);
+
+       /* only touch sn->sn_timer once sn is known to be non-NULL */
+       timer = &sn->sn_timer;
+       sn->sn_timer_active = 1;
+       timer->stt_expires = cfs_time_add(sn->sn_timeout,
+                                         cfs_time_current_sec());
+       stt_add_timer(timer);
+       return;
+}
+
+int
+sfw_del_session_timer (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       if (sn == NULL || !sn->sn_timer_active)
+               return 0;
+
+       LASSERT (sn->sn_timeout != 0);
+
+       if (stt_del_timer(&sn->sn_timer)) { /* timer defused */
+               sn->sn_timer_active = 0;
+               return 0;
+       }
+
+       return EBUSY; /* racing with sfw_session_expired() */
+}
+
+/* called with sfw_data.fw_lock held */
+static void
+sfw_deactivate_session (void)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       int         nactive = 0;
+       sfw_batch_t   *tsb;
+       sfw_test_case_t *tsc;
+
+       if (sn == NULL)
+               return;
+
+       LASSERT (!sn->sn_timer_active);
+
+       sfw_data.fw_session = NULL;
+       atomic_inc(&sfw_data.fw_nzombies);
+       list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+               srpc_abort_service(tsc->tsc_srv_service);
+       }
+
+       spin_lock(&sfw_data.fw_lock);
+
+       list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+               if (sfw_batch_active(tsb)) {
+                       nactive++;
+                       sfw_stop_batch(tsb, 1);
+               }
+       }
+
+       if (nactive != 0)
+               return;   /* wait for active batches to stop */
+
+       list_del_init(&sn->sn_list);
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_destroy_session(sn);
+
+       spin_lock(&sfw_data.fw_lock);
+}
+
+void
+sfw_session_expired (void *data)
+{
+       sfw_session_t *sn = data;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       LASSERT (sn->sn_timer_active);
+       LASSERT (sn == sfw_data.fw_session);
+
+       CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+              libcfs_nid2str(sn->sn_id.ses_nid),
+              sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+       sn->sn_timer_active = 0;
+       sfw_deactivate_session();
+
+       spin_unlock(&sfw_data.fw_lock);
+}
+
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+                unsigned features, const char *name)
+{
+       stt_timer_t *timer = &sn->sn_timer;
+
+       memset(sn, 0, sizeof(sfw_session_t));
+       INIT_LIST_HEAD(&sn->sn_list);
+       INIT_LIST_HEAD(&sn->sn_batches);
+       atomic_set(&sn->sn_refcount, 1);        /* +1 for caller */
+       atomic_set(&sn->sn_brw_errors, 0);
+       atomic_set(&sn->sn_ping_errors, 0);
+       strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+       sn->sn_timer_active = 0;
+       sn->sn_id          = sid;
+       sn->sn_features     = features;
+       sn->sn_timeout      = session_timeout;
+       sn->sn_started      = cfs_time_current();
+
+       timer->stt_data = sn;
+       timer->stt_func = sfw_session_expired;
+       INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/* completion handler for incoming framework RPCs */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv     = rpc->srpc_scd->scd_svc;
+       int                     status  = rpc->srpc_status;
+
+       CDEBUG (D_NET,
+               "Incoming framework RPC done: "
+               "service %s, peer %s, status %s:%d\n",
+               sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+               swi_state2str(rpc->srpc_wi.swi_state),
+               status);
+
+       if (rpc->srpc_bulk != NULL)
+               sfw_free_pages(rpc);
+       return;
+}
+
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+       LASSERT (rpc->crpc_bulk.bk_niov == 0);
+       LASSERT (list_empty(&rpc->crpc_list));
+       LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+       CDEBUG (D_NET,
+               "Outgoing framework RPC done: "
+               "service %d, peer %s, status %s:%d:%d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(rpc->crpc_wi.swi_state),
+               rpc->crpc_aborted, rpc->crpc_status);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       /* my callers must finish all RPCs before shutting me down */
+       LASSERT(!sfw_data.fw_shuttingdown);
+       list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+       spin_unlock(&sfw_data.fw_lock);
+}
+
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       sfw_batch_t   *bat;
+
+       LASSERT (sn != NULL);
+
+       list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+               if (bat->bat_id.bat_id == bid.bat_id)
+                       return bat;
+       }
+
+       return NULL;
+}
+
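+/* find the batch with the given id, creating it if it doesn't exist yet */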
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       sfw_batch_t   *bat;
+
+       LASSERT (sn != NULL);
+
+       bat = sfw_find_batch(bid);
+       if (bat != NULL)
+               return bat;
+
+       LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+       if (bat == NULL)
+               return NULL;
+
+       bat->bat_error    = 0;
+       bat->bat_session  = sn;
+       bat->bat_id       = bid;
+       atomic_set(&bat->bat_nactive, 0);
+       INIT_LIST_HEAD(&bat->bat_tests);
+
+       list_add_tail(&bat->bat_list, &sn->sn_batches);
+       return bat;
+}
+
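+/* fill a STAT reply with the framework, RPC and LNet counters */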
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+       sfw_session_t  *sn = sfw_data.fw_session;
+       sfw_counters_t *cnt = &reply->str_fw;
+       sfw_batch_t    *bat;
+       struct timeval  tv;
+
+       reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->str_sid.ses_nid == LNET_NID_ANY) {
+               reply->str_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+               reply->str_status = ESRCH;
+               return 0;
+       }
+
+       lnet_counters_get(&reply->str_lnet);
+       srpc_get_counters(&reply->str_rpc);
+
+       /* send over the msecs since the session was started;
+        * with 32 bits to send, this is ~49 days */
+       cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+                                      sn->sn_started), &tv);
+
+       cnt->running_ms      = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+       cnt->brw_errors      = atomic_read(&sn->sn_brw_errors);
+       cnt->ping_errors     = atomic_read(&sn->sn_ping_errors);
+       cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+       cnt->active_batches = 0;
+       list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+               if (atomic_read(&bat->bat_nactive) > 0)
+                       cnt->active_batches++;
+       }
+
+       reply->str_status = 0;
+       return 0;
+}
+
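+/*
+ * Create (or re-attach to) a session on this test node in response to
+ * a console MKSN request; an existing session is only replaced if the
+ * request sets mksn_force.
+ */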
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       srpc_msg_t    *msg = container_of(request, srpc_msg_t,
+                                         msg_body.mksn_reqst);
+       int            cplen = 0;
+
+       if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+               reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+               reply->mksn_status = EINVAL;
+               return 0;
+       }
+
+       if (sn != NULL) {
+               reply->mksn_status  = 0;
+               reply->mksn_sid     = sn->sn_id;
+               reply->mksn_timeout = sn->sn_timeout;
+
+               if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+                       atomic_inc(&sn->sn_refcount);
+                       return 0;
+               }
+
+               if (!request->mksn_force) {
+                       reply->mksn_status = EBUSY;
+                       cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+                                       sizeof(reply->mksn_name));
+                       if (cplen >= sizeof(reply->mksn_name))
+                               return -E2BIG;
+                       return 0;
+               }
+       }
+
+       /* reject the request if it requires unknown features.
+        * NB: old versions always accept all features because they are not
+        * aware of srpc_msg_t::msg_ses_feats; it's a defect but a harmless
+        * one, because they return a zero feature mask to the console, and
+        * it's the console's responsibility to make sure all nodes in a
+        * session have the same feature mask. */
+       if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               reply->mksn_status = EPROTO;
+               return 0;
+       }
+
+       /* brand new or create by force */
+       LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+       if (sn == NULL) {
+               CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+               return -ENOMEM;
+       }
+
+       sfw_init_session(sn, request->mksn_sid,
+                        msg->msg_ses_feats, &request->mksn_name[0]);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       sfw_deactivate_session();
+       LASSERT(sfw_data.fw_session == NULL);
+       sfw_data.fw_session = sn;
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       reply->mksn_status  = 0;
+       reply->mksn_sid     = sn->sn_id;
+       reply->mksn_timeout = sn->sn_timeout;
+       return 0;
+}
+
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+               reply->rmsn_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+               reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY;
+               return 0;
+       }
+
+       if (!atomic_dec_and_test(&sn->sn_refcount)) {
+               reply->rmsn_status = 0;
+               return 0;
+       }
+
+       spin_lock(&sfw_data.fw_lock);
+       sfw_deactivate_session();
+       spin_unlock(&sfw_data.fw_lock);
+
+       reply->rmsn_status = 0;
+       reply->rmsn_sid    = LST_INVALID_SID;
+       LASSERT(sfw_data.fw_session == NULL);
+       return 0;
+}
+
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+
+       if (sn == NULL) {
+               reply->dbg_status = ESRCH;
+               reply->dbg_sid    = LST_INVALID_SID;
+               return 0;
+       }
+
+       reply->dbg_status  = 0;
+       reply->dbg_sid     = sn->sn_id;
+       reply->dbg_timeout = sn->sn_timeout;
+       if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+           >= sizeof(reply->dbg_name))
+               return -E2BIG;
+
+       return 0;
+}
+
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+       sfw_test_unit_t     *tsu = rpc->crpc_priv;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+       /* Called with hold of tsi->tsi_lock */
+       LASSERT (list_empty(&rpc->crpc_list));
+       list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+       struct sfw_test_case    *tsc = sfw_find_test_case(tsi->tsi_service);
+       struct srpc_service     *svc = tsc->tsc_srv_service;
+       int                     nbuf;
+
+       nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+       return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+       struct sfw_test_case    *tsc;
+       struct srpc_service     *svc;
+       int                     nbuf;
+       int                     rc;
+
+       LASSERT(tsi != NULL);
+       tsc = sfw_find_test_case(tsi->tsi_service);
+       LASSERT(tsc != NULL);
+       /* check tsc before sfw_test_buffers(), which looks it up again */
+       nbuf = sfw_test_buffers(tsi);
+       svc = tsc->tsc_srv_service;
+
+       if (tsi->tsi_is_client) {
+               tsi->tsi_ops = tsc->tsc_cli_ops;
+               return 0;
+       }
+
+       rc = srpc_service_add_buffers(svc, nbuf);
+       if (rc != 0) {
+               CWARN("Failed to reserve enough buffers: service %s, %d needed, rc = %d\n",
+                     svc->sv_name, nbuf, rc);
+               /* NB: this error handler is not strictly correct, because
+                * it may release more buffers than were actually allocated,
+                * but that doesn't matter: the request portal should be a
+                * lazy portal and will grow its buffers if necessary. */
+               srpc_service_remove_buffers(svc, nbuf);
+               return -ENOMEM;
+       }
+
+       CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+              nbuf * (srpc_serv_is_framework(svc) ?
+                      1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+       return 0;
+}
+
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+       struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+       LASSERT(tsc != NULL);
+
+       if (tsi->tsi_is_client)
+               return;
+
+       /* shrink buffers: the request portal is a lazy portal that can
+        * grow buffers at runtime, so we may leave some buffers behind,
+        * but never mind... */
+       srpc_service_remove_buffers(tsc->tsc_srv_service,
+                                   sfw_test_buffers(tsi));
+       return;
+}
+
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+       srpc_client_rpc_t *rpc;
+       sfw_test_unit_t   *tsu;
+
+       if (!tsi->tsi_is_client)
+               goto clean;
+
+       tsi->tsi_ops->tso_fini(tsi);
+
+       LASSERT (!tsi->tsi_stopping);
+       LASSERT (list_empty(&tsi->tsi_active_rpcs));
+       LASSERT (!sfw_test_active(tsi));
+
+       while (!list_empty(&tsi->tsi_units)) {
+               tsu = list_entry(tsi->tsi_units.next,
+                                    sfw_test_unit_t, tsu_list);
+               list_del(&tsu->tsu_list);
+               LIBCFS_FREE(tsu, sizeof(*tsu));
+       }
+
+       while (!list_empty(&tsi->tsi_free_rpcs)) {
+               rpc = list_entry(tsi->tsi_free_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       }
+
+clean:
+       sfw_unload_test(tsi);
+       LIBCFS_FREE(tsi, sizeof(*tsi));
+       return;
+}
+
+void
+sfw_destroy_batch (sfw_batch_t *tsb)
+{
+       sfw_test_instance_t *tsi;
+
+       LASSERT (!sfw_batch_active(tsb));
+       LASSERT (list_empty(&tsb->bat_list));
+
+       while (!list_empty(&tsb->bat_tests)) {
+               tsi = list_entry(tsb->bat_tests.next,
+                                    sfw_test_instance_t, tsi_list);
+               list_del_init(&tsi->tsi_list);
+               sfw_destroy_test_instance(tsi);
+       }
+
+       LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+       return;
+}
+
+void
+sfw_destroy_session (sfw_session_t *sn)
+{
+       sfw_batch_t *batch;
+
+       LASSERT (list_empty(&sn->sn_list));
+       LASSERT (sn != sfw_data.fw_session);
+
+       while (!list_empty(&sn->sn_batches)) {
+               batch = list_entry(sn->sn_batches.next,
+                                      sfw_batch_t, bat_list);
+               list_del_init(&batch->bat_list);
+               sfw_destroy_batch(batch);
+       }
+
+       LIBCFS_FREE(sn, sizeof(*sn));
+       atomic_dec(&sfw_data.fw_nzombies);
+       return;
+}
+
+void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+       srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+       LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+       LASSERT (req->tsr_is_client);
+
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+       if (req->tsr_service == SRPC_SERVICE_BRW) {
+               if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+                       test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+                       __swab32s(&bulk->blk_opc);
+                       __swab32s(&bulk->blk_npg);
+                       __swab32s(&bulk->blk_flags);
+
+               } else {
+                       test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+                       __swab16s(&bulk->blk_opc);
+                       __swab16s(&bulk->blk_flags);
+                       __swab32s(&bulk->blk_offset);
+                       __swab32s(&bulk->blk_len);
+               }
+
+               return;
+       }
+
+       if (req->tsr_service == SRPC_SERVICE_PING) {
+               test_ping_req_t *ping = &req->tsr_u.ping;
+
+               __swab32s(&ping->png_size);
+               __swab32s(&ping->png_flags);
+               return;
+       }
+
+       LBUG ();
+       return;
+}
+
+int
+sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+       srpc_msg_t        *msg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_test_reqst_t   *req = &msg->msg_body.tes_reqst;
+       srpc_bulk_t      *bk = rpc->srpc_bulk;
+       int               ndest = req->tsr_ndest;
+       sfw_test_unit_t     *tsu;
+       sfw_test_instance_t *tsi;
+       int               i;
+       int               rc;
+
+       LIBCFS_ALLOC(tsi, sizeof(*tsi));
+       if (tsi == NULL) {
+               CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+                       tsb->bat_id.bat_id);
+               return -ENOMEM;
+       }
+
+       memset(tsi, 0, sizeof(*tsi));
+       spin_lock_init(&tsi->tsi_lock);
+       atomic_set(&tsi->tsi_nactive, 0);
+       INIT_LIST_HEAD(&tsi->tsi_units);
+       INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+       INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+       tsi->tsi_stopping      = 0;
+       tsi->tsi_batch         = tsb;
+       tsi->tsi_loop          = req->tsr_loop;
+       tsi->tsi_concur        = req->tsr_concur;
+       tsi->tsi_service       = req->tsr_service;
+       tsi->tsi_is_client     = !!(req->tsr_is_client);
+       tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+       rc = sfw_load_test(tsi);
+       if (rc != 0) {
+               LIBCFS_FREE(tsi, sizeof(*tsi));
+               return rc;
+       }
+
+       LASSERT (!sfw_batch_active(tsb));
+
+       if (!tsi->tsi_is_client) {
+               /* it's test server, just add it to tsb */
+               list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+               return 0;
+       }
+
+       LASSERT (bk != NULL);
+       LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+       LASSERT((unsigned int)bk->bk_len >=
+               sizeof(lnet_process_id_packed_t) * ndest);
+
+       sfw_unpack_addtest_req(msg);
+       memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+       for (i = 0; i < ndest; i++) {
+               lnet_process_id_packed_t *dests;
+               lnet_process_id_packed_t  id;
+               int                    j;
+
+               dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+               LASSERT (dests != NULL);  /* my pages are within KVM always */
+               id = dests[i % SFW_ID_PER_PAGE];
+               if (msg->msg_magic != SRPC_MSG_MAGIC)
+                       sfw_unpack_id(id);
+
+               for (j = 0; j < tsi->tsi_concur; j++) {
+                       LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+                       if (tsu == NULL) {
+                               rc = -ENOMEM;
+                               CERROR ("Can't allocate tsu for %d\n",
+                                       tsi->tsi_service);
+                               goto error;
+                       }
+
+                       tsu->tsu_dest.nid = id.nid;
+                       tsu->tsu_dest.pid = id.pid;
+                       tsu->tsu_instance = tsi;
+                       tsu->tsu_private  = NULL;
+                       list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+               }
+       }
+
+       rc = tsi->tsi_ops->tso_init(tsi);
+       if (rc == 0) {
+               list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+               return 0;
+       }
+
+error:
+       LASSERT (rc != 0);
+       sfw_destroy_test_instance(tsi);
+       return rc;
+}
+
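+/*
+ * A test unit has finished.  When the last unit of an instance is done
+ * and the owning session is already a zombie with no other active
+ * batches, the session is destroyed here as well.
+ */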
+static void
+sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_batch_t      *tsb = tsi->tsi_batch;
+       sfw_session_t       *sn = tsb->bat_session;
+
+       LASSERT (sfw_test_active(tsi));
+
+       if (!atomic_dec_and_test(&tsi->tsi_nactive))
+               return;
+
+       /* the test instance is done */
+       spin_lock(&tsi->tsi_lock);
+
+       tsi->tsi_stopping = 0;
+
+       spin_unlock(&tsi->tsi_lock);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!atomic_dec_and_test(&tsb->bat_nactive) || /* tsb still active */
+           sn == sfw_data.fw_session) {               /* sn also active */
+               spin_unlock(&sfw_data.fw_lock);
+               return;
+       }
+
+       LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+       list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+               if (sfw_batch_active(tsb)) {
+                       spin_unlock(&sfw_data.fw_lock);
+                       return;
+               }
+       }
+
+       list_del_init(&sn->sn_list);
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_destroy_session(sn);
+       return;
+}
+
+void
+sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+       sfw_test_unit_t     *tsu = rpc->crpc_priv;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       int               done = 0;
+
+       tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+       spin_lock(&tsi->tsi_lock);
+
+       LASSERT (sfw_test_active(tsi));
+       LASSERT (!list_empty(&rpc->crpc_list));
+
+       list_del_init(&rpc->crpc_list);
+
+       /* batch is stopping, the loop is done, or we got an error */
+       if (tsi->tsi_stopping ||
+           tsu->tsu_loop == 0 ||
+           (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
+               done = 1;
+
+       /* dec ref for poster */
+       srpc_client_rpc_decref(rpc);
+
+       spin_unlock(&tsi->tsi_lock);
+
+       if (!done) {
+               swi_schedule_workitem(&tsu->tsu_worker);
+               return;
+       }
+
+       sfw_test_unit_done(tsu);
+       return;
+}
+
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+                   unsigned features, int nblk, int blklen,
+                   srpc_client_rpc_t **rpcpp)
+{
+       srpc_client_rpc_t   *rpc = NULL;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+       spin_lock(&tsi->tsi_lock);
+
+       LASSERT (sfw_test_active(tsi));
+
+       if (!list_empty(&tsi->tsi_free_rpcs)) {
+               /* pick request from buffer */
+               rpc = list_entry(tsi->tsi_free_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+               list_del_init(&rpc->crpc_list);
+       }
+
+       spin_unlock(&tsi->tsi_lock);
+
+       if (rpc == NULL) {
+               rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+                                            blklen, sfw_test_rpc_done,
+                                            sfw_test_rpc_fini, tsu);
+       } else {
+               srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+                                    blklen, sfw_test_rpc_done,
+                                    sfw_test_rpc_fini, tsu);
+       }
+
+       if (rpc == NULL) {
+               CERROR("Can't create rpc for test %d\n", tsi->tsi_service);
+               return -ENOMEM;
+       }
+
+       rpc->crpc_reqstmsg.msg_ses_feats = features;
+       *rpcpp = rpc;
+
+       return 0;
+}
+
+int
+sfw_run_test (swi_workitem_t *wi)
+{
+       sfw_test_unit_t     *tsu = wi->swi_workitem.wi_data;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       srpc_client_rpc_t   *rpc = NULL;
+
+       LASSERT (wi == &tsu->tsu_worker);
+
+       if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+               LASSERT (rpc == NULL);
+               goto test_done;
+       }
+
+       LASSERT (rpc != NULL);
+
+       spin_lock(&tsi->tsi_lock);
+
+       if (tsi->tsi_stopping) {
+               list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+               spin_unlock(&tsi->tsi_lock);
+               goto test_done;
+       }
+
+       if (tsu->tsu_loop > 0)
+               tsu->tsu_loop--;
+
+       list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+       spin_unlock(&tsi->tsi_lock);
+
+       rpc->crpc_timeout = rpc_timeout;
+
+       spin_lock(&rpc->crpc_lock);
+       srpc_post_rpc(rpc);
+       spin_unlock(&rpc->crpc_lock);
+       return 0;
+
+test_done:
+       /*
+        * No one can schedule me now since:
+        * - the previous RPC, if any, has completed, and
+        * - no new RPC has been initiated, and
+        * - my batch is still active; no one can run it again now.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       swi_exit_workitem(wi);
+       sfw_test_unit_done(tsu);
+       return 1;
+}
+
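+/*
+ * Launch a batch: schedule one workitem per test unit of every client
+ * test instance, each on the scheduler matching the destination's CPT.
+ */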
+int
+sfw_run_batch (sfw_batch_t *tsb)
+{
+       swi_workitem_t      *wi;
+       sfw_test_unit_t     *tsu;
+       sfw_test_instance_t *tsi;
+
+       if (sfw_batch_active(tsb)) {
+               CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+                      tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               if (!tsi->tsi_is_client) /* skip server instances */
+                       continue;
+
+               LASSERT (!tsi->tsi_stopping);
+               LASSERT (!sfw_test_active(tsi));
+
+               atomic_inc(&tsb->bat_nactive);
+
+               list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+                       atomic_inc(&tsi->tsi_nactive);
+                       tsu->tsu_loop = tsi->tsi_loop;
+                       wi = &tsu->tsu_worker;
+                       swi_init_workitem(wi, tsu, sfw_run_test,
+                               lst_sched_test[lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+                       swi_schedule_workitem(wi);
+               }
+       }
+
+       return 0;
+}
+
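+/*
+ * Stop a running batch.  Every client test instance is flagged as
+ * stopping; if force is set, its in-flight RPCs are aborted as well.
+ */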
+int
+sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+       sfw_test_instance_t *tsi;
+       srpc_client_rpc_t   *rpc;
+
+       if (!sfw_batch_active(tsb)) {
+               CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               spin_lock(&tsi->tsi_lock);
+
+               if (!tsi->tsi_is_client ||
+                   !sfw_test_active(tsi) || tsi->tsi_stopping) {
+                       spin_unlock(&tsi->tsi_lock);
+                       continue;
+               }
+
+               tsi->tsi_stopping = 1;
+
+               if (!force) {
+                       spin_unlock(&tsi->tsi_lock);
+                       continue;
+               }
+
+               /* abort launched rpcs in the test */
+               list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+                       spin_lock(&rpc->crpc_lock);
+
+                       srpc_abort_rpc(rpc, -EINTR);
+
+                       spin_unlock(&rpc->crpc_lock);
+               }
+
+               spin_unlock(&tsi->tsi_lock);
+       }
+
+       return 0;
+}
+
+int
+sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+       sfw_test_instance_t *tsi;
+
+       if (testidx < 0)
+               return -EINVAL;
+
+       if (testidx == 0) {
+               reply->bar_active = atomic_read(&tsb->bat_nactive);
+               return 0;
+       }
+
+       list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+               if (testidx-- > 1)
+                       continue;
+
+               reply->bar_active = atomic_read(&tsi->tsi_nactive);
+               return 0;
+       }
+
+       return -ENOENT;
+}
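
sfw_query_batch() resolves testidx with a decrement-as-you-walk idiom: index 0
reports the whole batch and index N the Nth test instance. A minimal userspace
sketch of the same idiom, using a plain array in place of the kernel list
(names invented):

#include <stdio.h>

/* Sketch only: query the n-th entry, where 0 means "the whole set". */
static int query(const int *active, int nentries, int testidx, int *out)
{
        int i;

        if (testidx < 0)
                return -1;              /* -EINVAL in the kernel code */

        if (testidx == 0) {             /* whole batch */
                int sum = 0;

                for (i = 0; i < nentries; i++)
                        sum += active[i];
                *out = sum;
                return 0;
        }

        for (i = 0; i < nentries; i++) {
                if (testidx-- > 1)      /* skip until the n-th entry */
                        continue;
                *out = active[i];
                return 0;
        }
        return -2;                      /* -ENOENT: index past the end */
}

int main(void)
{
        int active[] = { 3, 1, 4 };
        int v;

        if (query(active, 3, 2, &v) == 0)
                printf("active units at testidx 2: %d\n", v);   /* 1 */
        return 0;
}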
+
+void
+sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+       srpc_free_bulk(rpc->srpc_bulk);
+       rpc->srpc_bulk = NULL;
+}
+
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+               int sink)
+{
+       LASSERT(rpc->srpc_bulk == NULL);
+       LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+       rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+       if (rpc->srpc_bulk == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+int
+sfw_add_test (srpc_server_rpc_t *rpc)
+{
+       sfw_session_t     *sn = sfw_data.fw_session;
+       srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+       srpc_test_reqst_t *request;
+       int             rc;
+       sfw_batch_t       *bat;
+
+       request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+       reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (request->tsr_loop == 0 ||
+           request->tsr_concur == 0 ||
+           request->tsr_sid.ses_nid == LNET_NID_ANY ||
+           request->tsr_ndest > SFW_MAX_NDESTS ||
+           (request->tsr_is_client && request->tsr_ndest == 0) ||
+           request->tsr_concur > SFW_MAX_CONCUR ||
+           request->tsr_service > SRPC_SERVICE_MAX_ID ||
+           request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+               reply->tsr_status = EINVAL;
+               return 0;
+       }
+
+       if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+           sfw_find_test_case(request->tsr_service) == NULL) {
+               reply->tsr_status = ENOENT;
+               return 0;
+       }
+
+       bat = sfw_bid2batch(request->tsr_bid);
+       if (bat == NULL) {
+               CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+                       rpc->srpc_scd->scd_svc->sv_name,
+                       libcfs_id2str(rpc->srpc_peer));
+               return -ENOMEM;
+       }
+
+       if (sfw_batch_active(bat)) {
+               reply->tsr_status = EBUSY;
+               return 0;
+       }
+
+       if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+               /* rpc will be resumed later in sfw_bulk_ready */
+               int     npg = sfw_id_pages(request->tsr_ndest);
+               int     len;
+
+               if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+                       len = npg * PAGE_CACHE_SIZE;
+
+               } else  {
+                       len = sizeof(lnet_process_id_packed_t) *
+                             request->tsr_ndest;
+               }
+
+               return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+       }
+
+       rc = sfw_add_test_instance(bat, rpc);
+       CDEBUG (rc == 0 ? D_NET : D_WARNING,
+               "%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+               rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+               request->tsr_is_client ? "client" : "server",
+               request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+       reply->tsr_status = (rc < 0) ? -rc : rc;
+       return 0;
+}
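
Note the convention at the end of sfw_add_test(): the handler returns 0
whenever a reply can be sent, and carries the request's own outcome as a
positive errno in tsr_status; a negative return (such as -ENOMEM above) is
reserved for local failures that drop the RPC instead. A toy illustration of
that convention, with the struct and names invented for the sketch:

#include <errno.h>
#include <stdio.h>

struct reply { int status; };   /* stands in for srpc_test_reply_t */

static int do_add(int busy) { return busy ? -EBUSY : 0; }

/* Sketch only: 0 from the handler means "send the reply"; the request's
 * outcome travels as a positive errno in the reply body. */
static int handler(struct reply *rep, int busy)
{
        int rc = do_add(busy);

        rep->status = (rc < 0) ? -rc : rc;
        return 0;
}

int main(void)
{
        struct reply rep;

        handler(&rep, 1);
        printf("reply status %d (EBUSY is %d)\n", rep.status, EBUSY);
        return 0;
}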
+
+int
+sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+       sfw_session_t *sn = sfw_data.fw_session;
+       int         rc = 0;
+       sfw_batch_t   *bat;
+
+       reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+       if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+               reply->bar_status = ESRCH;
+               return 0;
+       }
+
+       bat = sfw_find_batch(request->bar_bid);
+       if (bat == NULL) {
+               reply->bar_status = ENOENT;
+               return 0;
+       }
+
+       switch (request->bar_opc) {
+       case SRPC_BATCH_OPC_RUN:
+               rc = sfw_run_batch(bat);
+               break;
+
+       case SRPC_BATCH_OPC_STOP:
+               rc = sfw_stop_batch(bat, request->bar_arg);
+               break;
+
+       case SRPC_BATCH_OPC_QUERY:
+               rc = sfw_query_batch(bat, request->bar_testidx, reply);
+               break;
+
+       default:
+               return -EINVAL; /* drop it */
+       }
+
+       reply->bar_status = (rc < 0) ? -rc : rc;
+       return 0;
+}
+
+int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       srpc_msg_t     *reply   = &rpc->srpc_replymsg;
+       srpc_msg_t     *request = &rpc->srpc_reqstbuf->buf_msg;
+       unsigned        features = LST_FEATS_MASK;
+       int             rc = 0;
+
+       LASSERT(sfw_data.fw_active_srpc == NULL);
+       LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (sfw_data.fw_shuttingdown) {
+               spin_unlock(&sfw_data.fw_lock);
+               return -ESHUTDOWN;
+       }
+
+       /* Remove the timer to avoid racing with it or expiring the
+        * active session */
+       if (sfw_del_session_timer() != 0) {
+               CERROR("Dropping RPC (%s) from %s: racing with expiry timer.",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+               spin_unlock(&sfw_data.fw_lock);
+               return -EAGAIN;
+       }
+
+       sfw_data.fw_active_srpc = rpc;
+       spin_unlock(&sfw_data.fw_lock);
+
+       sfw_unpack_message(request);
+       LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+       /* rpc module should have checked this */
+       LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+       if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+           sv->sv_id != SRPC_SERVICE_DEBUG) {
+               sfw_session_t *sn = sfw_data.fw_session;
+
+               if (sn != NULL &&
+                   sn->sn_features != request->msg_ses_feats) {
+                       CNETERR("Features of framework RPC don't match "
+                               "features of current session: %x/%x\n",
+                               request->msg_ses_feats, sn->sn_features);
+                       reply->msg_body.reply.status = EPROTO;
+                       reply->msg_body.reply.sid    = sn->sn_id;
+                       goto out;
+               }
+
+       } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               /* NB: at this point, an old version will ignore the
+                * features and create a new session anyway, so the
+                * console should be able to handle this */
+               reply->msg_body.reply.status = EPROTO;
+               goto out;
+       }
+
+       switch (sv->sv_id) {
+       default:
+               LBUG ();
+       case SRPC_SERVICE_TEST:
+               rc = sfw_add_test(rpc);
+               break;
+
+       case SRPC_SERVICE_BATCH:
+               rc = sfw_control_batch(&request->msg_body.bat_reqst,
+                                      &reply->msg_body.bat_reply);
+               break;
+
+       case SRPC_SERVICE_QUERY_STAT:
+               rc = sfw_get_stats(&request->msg_body.stat_reqst,
+                                  &reply->msg_body.stat_reply);
+               break;
+
+       case SRPC_SERVICE_DEBUG:
+               rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+                                      &reply->msg_body.dbg_reply);
+               break;
+
+       case SRPC_SERVICE_MAKE_SESSION:
+               rc = sfw_make_session(&request->msg_body.mksn_reqst,
+                                     &reply->msg_body.mksn_reply);
+               break;
+
+       case SRPC_SERVICE_REMOVE_SESSION:
+               rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+                                       &reply->msg_body.rmsn_reply);
+               break;
+       }
+
+       if (sfw_data.fw_session != NULL)
+               features = sfw_data.fw_session->sn_features;
+ out:
+       reply->msg_ses_feats = features;
+       rpc->srpc_done = sfw_server_rpc_done;
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!sfw_data.fw_shuttingdown)
+               sfw_add_session_timer();
+
+       sfw_data.fw_active_srpc = NULL;
+       spin_unlock(&sfw_data.fw_lock);
+       return rc;
+}
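
Both sfw_handle_server_rpc() above and sfw_bulk_ready() below must win a race
with the session-expiry timer before touching the session: if
sfw_del_session_timer() fails, the timer is presumably already firing, so the
RPC is dropped with -EAGAIN rather than run concurrently with expiry. A toy
model of that cancel-or-back-off decision, with invented flags standing in for
real timer state:

#include <stdio.h>

/* Sketch only: a timer whose handler has already started cannot be
 * cancelled; the caller must back off instead of racing with it. */
static int timer_armed;         /* timeout is pending */
static int timer_firing;        /* handler has started running */

static int try_del_timer(void)
{
        if (!timer_armed)
                return 0;       /* nothing to cancel */
        if (timer_firing)
                return -1;      /* too late: handler owns the session */
        timer_armed = 0;
        return 0;               /* cancelled before it fired */
}

int main(void)
{
        timer_armed = 1;
        timer_firing = 1;
        printf("try_del_timer: %d (nonzero means back off)\n",
               try_del_timer());
        return 0;
}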
+
+int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+       struct srpc_service     *sv = rpc->srpc_scd->scd_svc;
+       int                     rc;
+
+       LASSERT(rpc->srpc_bulk != NULL);
+       LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+       LASSERT(sfw_data.fw_active_srpc == NULL);
+       LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (status != 0) {
+               CERROR("Bulk transfer failed for RPC: "
+                      "service %s, peer %s, status %d\n",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+               spin_unlock(&sfw_data.fw_lock);
+               return -EIO;
+       }
+
+       if (sfw_data.fw_shuttingdown) {
+               spin_unlock(&sfw_data.fw_lock);
+               return -ESHUTDOWN;
+       }
+
+       if (sfw_del_session_timer() != 0) {
+               CERROR("Dropping RPC (%s) from %s: racing with expiry timer",
+                      sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+               spin_unlock(&sfw_data.fw_lock);
+               return -EAGAIN;
+       }
+
+       sfw_data.fw_active_srpc = rpc;
+       spin_unlock(&sfw_data.fw_lock);
+
+       rc = sfw_add_test(rpc);
+
+       spin_lock(&sfw_data.fw_lock);
+
+       if (!sfw_data.fw_shuttingdown)
+               sfw_add_session_timer();
+
+       sfw_data.fw_active_srpc = NULL;
+       spin_unlock(&sfw_data.fw_lock);
+       return rc;
+}
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+              unsigned features, int nbulkiov, int bulklen,
+              void (*done)(srpc_client_rpc_t *), void *priv)
+{
+       srpc_client_rpc_t *rpc = NULL;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       LASSERT (!sfw_data.fw_shuttingdown);
+       LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+               rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+
+               srpc_init_client_rpc(rpc, peer, service, 0, 0,
+                                    done, sfw_client_rpc_fini, priv);
+       }
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       if (rpc == NULL) {
+               rpc = srpc_create_client_rpc(peer, service,
+                                            nbulkiov, bulklen, done,
+                                            nbulkiov != 0 ?  NULL :
+                                            sfw_client_rpc_fini,
+                                            priv);
+       }
+
+       if (rpc != NULL) /* "session" is a framework-level concept */
+               rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+       return rpc;
+}
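
sfw_create_rpc() prefers recycling a retired RPC from fw_zombie_rpcs (under
fw_lock) over a fresh allocation whenever no bulk is attached. The free-list
idiom in isolation, as a single-threaded sketch with invented names (the
kernel version must of course hold the lock while touching the list):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

static struct node *free_list;

static struct node *get_node(void)
{
        struct node *n = free_list;

        if (n != NULL) {
                free_list = n->next;    /* recycle a retired object */
                return n;
        }
        return malloc(sizeof(*n));      /* slow path: fresh allocation */
}

static void put_node(struct node *n)
{
        n->next = free_list;            /* retire for later reuse */
        free_list = n;
}

int main(void)
{
        struct node *a = get_node();
        struct node *b;

        put_node(a);
        b = get_node();
        printf("reused: %s\n", b == a ? "yes" : "no");
        free(b);
        return 0;
}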
+
+void
+sfw_unpack_message (srpc_msg_t *msg)
+{
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       /* the srpc module should guarantee we never see a corrupt message */
+       LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+       if (msg->msg_type == SRPC_MSG_STAT_REQST) {
+               srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+               __swab32s(&req->str_type);
+               __swab64s(&req->str_rpyid);
+               sfw_unpack_sid(req->str_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+               srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+               __swab32s(&rep->str_status);
+               sfw_unpack_sid(rep->str_sid);
+               sfw_unpack_fw_counters(rep->str_fw);
+               sfw_unpack_rpc_counters(rep->str_rpc);
+               sfw_unpack_lnet_counters(rep->str_lnet);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+               srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+               __swab64s(&req->mksn_rpyid);
+               __swab32s(&req->mksn_force);
+               sfw_unpack_sid(req->mksn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+               srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+               __swab32s(&rep->mksn_status);
+               __swab32s(&rep->mksn_timeout);
+               sfw_unpack_sid(rep->mksn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+               srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+               __swab64s(&req->rmsn_rpyid);
+               sfw_unpack_sid(req->rmsn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+               srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+               __swab32s(&rep->rmsn_status);
+               sfw_unpack_sid(rep->rmsn_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+               srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+               __swab64s(&req->dbg_rpyid);
+               __swab32s(&req->dbg_flags);
+               sfw_unpack_sid(req->dbg_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+               srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+               __swab32s(&rep->dbg_nbatch);
+               __swab32s(&rep->dbg_timeout);
+               sfw_unpack_sid(rep->dbg_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+               srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+               __swab32s(&req->bar_opc);
+               __swab64s(&req->bar_rpyid);
+               __swab32s(&req->bar_testidx);
+               __swab32s(&req->bar_arg);
+               sfw_unpack_sid(req->bar_sid);
+               __swab64s(&req->bar_bid.bat_id);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+               srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+               __swab32s(&rep->bar_status);
+               sfw_unpack_sid(rep->bar_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_TEST_REQST) {
+               srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+               __swab64s(&req->tsr_rpyid);
+               __swab64s(&req->tsr_bulkid);
+               __swab32s(&req->tsr_loop);
+               __swab32s(&req->tsr_ndest);
+               __swab32s(&req->tsr_concur);
+               __swab32s(&req->tsr_service);
+               sfw_unpack_sid(req->tsr_sid);
+               __swab64s(&req->tsr_bid.bat_id);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+               srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+               __swab32s(&rep->tsr_status);
+               sfw_unpack_sid(rep->tsr_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+               srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+               __swab64s(&req->join_rpyid);
+               sfw_unpack_sid(req->join_sid);
+               return;
+       }
+
+       if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+               srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+               __swab32s(&rep->join_status);
+               __swab32s(&rep->join_timeout);
+               sfw_unpack_sid(rep->join_sid);
+               return;
+       }
+
+       LBUG ();
+       return;
+}
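
sfw_unpack_message() detects a cross-endian peer purely from the magic: if it
matches byte-reversed, every multi-byte field is swapped in place. A
self-contained sketch of that check, using GCC/Clang's __builtin_bswap32 and
an invented magic value:

#include <stdint.h>
#include <stdio.h>

#define MSG_MAGIC 0xbabeface            /* stands in for SRPC_MSG_MAGIC */

static uint32_t swab32(uint32_t v) { return __builtin_bswap32(v); }

/* Sketch only: swap fields in place when the sender's magic arrives
 * byte-reversed, i.e. the peer has the opposite endianness. */
static void unpack(uint32_t *magic, uint32_t *field)
{
        if (*magic == MSG_MAGIC)
                return;                 /* same endianness, nothing to do */

        if (*magic != swab32(MSG_MAGIC))
                return;                 /* corrupt: matches neither form */

        *magic = swab32(*magic);
        *field = swab32(*field);
}

int main(void)
{
        uint32_t magic = swab32(MSG_MAGIC); /* as sent by a foreign peer */
        uint32_t field = swab32(42);

        unpack(&magic, &field);
        printf("field = %u\n", field);      /* prints 42 */
        return 0;
}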
+
+void
+sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+       LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+       spin_lock(&rpc->crpc_lock);
+       srpc_abort_rpc(rpc, -EINTR);
+       spin_unlock(&rpc->crpc_lock);
+       return;
+}
+
+void
+sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+       spin_lock(&rpc->crpc_lock);
+
+       LASSERT (!rpc->crpc_closed);
+       LASSERT (!rpc->crpc_aborted);
+       LASSERT (list_empty(&rpc->crpc_list));
+       LASSERT (!sfw_data.fw_shuttingdown);
+
+       rpc->crpc_timeout = rpc_timeout;
+       srpc_post_rpc(rpc);
+
+       spin_unlock(&rpc->crpc_lock);
+       return;
+}
+
+static srpc_service_t sfw_services[] =
+{
+       {
+               /* sv_id */    SRPC_SERVICE_DEBUG,
+               /* sv_name */  "debug",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_QUERY_STAT,
+               /* sv_name */  "query stats",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_MAKE_SESSION,
+               /* sv_name */  "make session",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_REMOVE_SESSION,
+               /* sv_name */  "remove session",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_BATCH,
+               /* sv_name */  "batch service",
+               0
+       },
+       {
+               /* sv_id */    SRPC_SERVICE_TEST,
+               /* sv_name */  "test service",
+               0
+       },
+       {
+               /* sv_id */    0,
+               /* sv_name */  NULL,
+               0
+       }
+};
+
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t  ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t  brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+int
+sfw_startup (void)
+{
+       int           i;
+       int           rc;
+       int           error;
+       srpc_service_t  *sv;
+       sfw_test_case_t *tsc;
+
+       if (session_timeout < 0) {
+               CERROR ("Session timeout must be non-negative: %d\n",
+                       session_timeout);
+               return -EINVAL;
+       }
+
+       if (rpc_timeout < 0) {
+               CERROR ("RPC timeout must be non-negative: %d\n",
+                       rpc_timeout);
+               return -EINVAL;
+       }
+
+       if (session_timeout == 0)
+               CWARN ("Zero session_timeout specified "
+                      "- test sessions never expire.\n");
+
+       if (rpc_timeout == 0)
+               CWARN ("Zero rpc_timeout specified "
+                      "- test RPC never expire.\n");
+
+       memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+       sfw_data.fw_session     = NULL;
+       sfw_data.fw_active_srpc = NULL;
+       spin_lock_init(&sfw_data.fw_lock);
+       atomic_set(&sfw_data.fw_nzombies, 0);
+       INIT_LIST_HEAD(&sfw_data.fw_tests);
+       INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+       INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+       brw_init_test_client();
+       brw_init_test_service();
+       rc = sfw_register_test(&brw_test_service, &brw_test_client);
+       LASSERT (rc == 0);
+
+       ping_init_test_client();
+       ping_init_test_service();
+       rc = sfw_register_test(&ping_test_service, &ping_test_client);
+       LASSERT (rc == 0);
+
+       error = 0;
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               sv = tsc->tsc_srv_service;
+
+               rc = srpc_add_service(sv);
+               LASSERT (rc != -EBUSY);
+               if (rc != 0) {
+                       CWARN ("Failed to add %s service: %d\n",
+                              sv->sv_name, rc);
+                       error = rc;
+               }
+       }
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL)
+                       break;
+
+               sv->sv_bulk_ready = NULL;
+               sv->sv_handler    = sfw_handle_server_rpc;
+               sv->sv_wi_total   = SFW_FRWK_WI_MAX;
+               if (sv->sv_id == SRPC_SERVICE_TEST)
+                       sv->sv_bulk_ready = sfw_bulk_ready;
+
+               rc = srpc_add_service(sv);
+               LASSERT (rc != -EBUSY);
+               if (rc != 0) {
+                       CWARN ("Failed to add %s service: %d\n",
+                              sv->sv_name, rc);
+                       error = rc;
+               }
+
+               /* sfw_shutdown() is imminent, no need to add buffers */
+               if (error != 0)
+                       continue;
+
+               rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+               if (rc != 0) {
+                       CWARN("Failed to reserve enough buffers: "
+                             "service %s, %d needed: %d\n",
+                             sv->sv_name, sv->sv_wi_total, rc);
+                       error = -ENOMEM;
+               }
+       }
+
+       if (error != 0)
+               sfw_shutdown();
+       return error;
+}
+
+void
+sfw_shutdown (void)
+{
+       srpc_service_t  *sv;
+       sfw_test_case_t *tsc;
+       int              i;
+
+       spin_lock(&sfw_data.fw_lock);
+
+       sfw_data.fw_shuttingdown = 1;
+       lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+                      "waiting for active RPC to finish.\n");
+
+       if (sfw_del_session_timer() != 0)
+               lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+                              "waiting for session timer to explode.\n");
+
+       sfw_deactivate_session();
+       lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+                      sfw_data.fw_lock,
+                      "waiting for %d zombie sessions to die.\n",
+                      atomic_read(&sfw_data.fw_nzombies));
+
+       spin_unlock(&sfw_data.fw_lock);
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL)
+                       break;
+
+               srpc_shutdown_service(sv);
+               srpc_remove_service(sv);
+       }
+
+       list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+               sv = tsc->tsc_srv_service;
+               srpc_shutdown_service(sv);
+               srpc_remove_service(sv);
+       }
+
+       while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+               srpc_client_rpc_t *rpc;
+
+               rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+                                    srpc_client_rpc_t, crpc_list);
+               list_del(&rpc->crpc_list);
+
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       }
+
+       for (i = 0; ; i++) {
+               sv = &sfw_services[i];
+               if (sv->sv_name == NULL)
+                       break;
+
+               srpc_wait_service_shutdown(sv);
+       }
+
+       while (!list_empty(&sfw_data.fw_tests)) {
+               tsc = list_entry(sfw_data.fw_tests.next,
+                                    sfw_test_case_t, tsc_list);
+
+               srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+               list_del(&tsc->tsc_list);
+               LIBCFS_FREE(tsc, sizeof(*tsc));
+       }
+
+       return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644 (file)
index 0000000..5257e56
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {
+       LST_INIT_NONE           = 0,
+       LST_INIT_WI_SERIAL,
+       LST_INIT_WI_TEST,
+       LST_INIT_RPC,
+       LST_INIT_FW,
+       LST_INIT_CONSOLE
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;
+
+struct cfs_wi_sched *lst_sched_serial;
+struct cfs_wi_sched **lst_sched_test;
+
+void
+lnet_selftest_fini(void)
+{
+       int     i;
+
+       switch (lst_init_step) {
+       case LST_INIT_CONSOLE:
+               lstcon_console_fini();
+       case LST_INIT_FW:
+               sfw_shutdown();
+       case LST_INIT_RPC:
+               srpc_shutdown();
+       case LST_INIT_WI_TEST:
+               for (i = 0; i < cfs_cpt_number(lnet_cpt_table()); i++) {
+                       if (lst_sched_test[i] == NULL)
+                               continue;
+                       cfs_wi_sched_destroy(lst_sched_test[i]);
+               }
+               LIBCFS_FREE(lst_sched_test,
+                           sizeof(lst_sched_test[0]) *
+                           cfs_cpt_number(lnet_cpt_table()));
+               lst_sched_test = NULL;
+
+       case LST_INIT_WI_SERIAL:
+               cfs_wi_sched_destroy(lst_sched_serial);
+               lst_sched_serial = NULL;
+       case LST_INIT_NONE:
+               break;
+       default:
+               LBUG();
+       }
+       return;
+}
+
+void
+lnet_selftest_structure_assertion(void)
+{
+       CLASSERT(sizeof(srpc_msg_t) == 160);
+       CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+       CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+       CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+       CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+       CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
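
The CLASSERT() calls above pin wire-visible structure sizes and offsets at
compile time, so an accidental layout change breaks the build rather than
on-the-wire interoperability. Outside the kernel the same check can be written
with C11 _Static_assert; the structure and size below are invented for the
sketch:

#include <stdint.h>

struct wire_hdr {                       /* hypothetical wire structure */
        uint32_t magic;
        uint32_t version;
        uint64_t matchbits;
};

/* Fails the build, rather than the test run, if padding creeps in. */
_Static_assert(sizeof(struct wire_hdr) == 16, "wire_hdr size changed");

int main(void) { return 0; }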
+
+int
+lnet_selftest_init(void)
+{
+       int     nscheds;
+       int     rc;
+       int     i;
+
+       rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+                                1, &lst_sched_serial);
+       if (rc != 0) {
+               CERROR("Failed to create serial WI scheduler for LST\n");
+               return rc;
+       }
+       lst_init_step = LST_INIT_WI_SERIAL;
+
+       nscheds = cfs_cpt_number(lnet_cpt_table());
+       LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+       if (lst_sched_test == NULL)
+               goto error;
+
+       lst_init_step = LST_INIT_WI_TEST;
+       for (i = 0; i < nscheds; i++) {
+               int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+               /* reserve at least one CPU for LND */
+               nthrs = max(nthrs - 1, 1);
+               rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+                                        nthrs, &lst_sched_test[i]);
+               if (rc != 0) {
+                       CERROR("Failed to create CPT affinity WI scheduler "
+                              "%d for LST\n", i);
+                       goto error;
+               }
+       }
+
+       rc = srpc_startup();
+       if (rc != 0) {
+               CERROR("LST can't startup rpc\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_RPC;
+
+       rc = sfw_startup();
+       if (rc != 0) {
+               CERROR("LST can't startup framework\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_FW;
+
+       rc = lstcon_console_init();
+       if (rc != 0) {
+               CERROR("LST can't startup console\n");
+               goto error;
+       }
+       lst_init_step = LST_INIT_CONSOLE;
+       return 0;
+error:
+       lnet_selftest_fini();
+       return rc;
+}
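
lnet_selftest_init() records the last completed stage in lst_init_step so that
lnet_selftest_fini() can unwind exactly the completed stages, in LIFO order,
through deliberate switch fallthrough. A standalone sketch of the pattern with
invented stage names:

#include <stdio.h>

enum { INIT_NONE, INIT_A, INIT_B, INIT_C };

static int init_step = INIT_NONE;

/* Sketch only: deliberate fallthrough unwinds completed stages LIFO. */
static void fini(void)
{
        switch (init_step) {
        case INIT_C:
                printf("undo C\n");
        case INIT_B:
                printf("undo B\n");
        case INIT_A:
                printf("undo A\n");
        case INIT_NONE:
                break;
        }
}

static int init(int fail_at)
{
        init_step = INIT_A;
        if (fail_at == INIT_B)
                goto error;
        init_step = INIT_B;
        if (fail_at == INIT_C)
                goto error;
        init_step = INIT_C;
        return 0;
error:
        fini();                 /* undoes only what succeeded */
        return -1;
}

int main(void)
{
        init(INIT_C);           /* prints "undo B" then "undo A" */
        return 0;
}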
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);
diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644 (file)
index 0000000..f0f9194
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC     0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+typedef struct {
+       spinlock_t      pnd_lock;       /* serialize */
+       int             pnd_counter;    /* sequence counter */
+} lst_ping_data_t;
+
+static lst_ping_data_t  lst_ping_data;
+
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+       sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+       LASSERT(tsi->tsi_is_client);
+       LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+       spin_lock_init(&lst_ping_data.pnd_lock);
+       lst_ping_data.pnd_counter = 0;
+
+       return 0;
+}
+
+static void
+ping_client_fini (sfw_test_instance_t *tsi)
+{
+       sfw_session_t *sn = tsi->tsi_batch->bat_session;
+       int         errors;
+
+       LASSERT (sn != NULL);
+       LASSERT (tsi->tsi_is_client);
+
+       errors = atomic_read(&sn->sn_ping_errors);
+       if (errors)
+               CWARN ("%d pings have failed.\n", errors);
+       else
+               CDEBUG (D_NET, "Ping test finished OK.\n");
+}
+
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+                    lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+       srpc_ping_reqst_t   *req;
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn  = tsi->tsi_batch->bat_session;
+       struct timeval       tv;
+       int                  rc;
+
+       LASSERT(sn != NULL);
+       LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+       rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+       if (rc != 0)
+               return rc;
+
+       req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+
+       req->pnr_magic = LST_PING_TEST_MAGIC;
+
+       spin_lock(&lst_ping_data.pnd_lock);
+       req->pnr_seq = lst_ping_data.pnd_counter++;
+       spin_unlock(&lst_ping_data.pnd_lock);
+
+       cfs_fs_timeval(&tv);
+       req->pnr_time_sec  = tv.tv_sec;
+       req->pnr_time_usec = tv.tv_usec;
+
+       return rc;
+}
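
The ping sequence number above is drawn from a shared counter under pnd_lock,
so concurrent test units still obtain unique, ordered values. The same idiom
as a minimal pthread sketch (names invented):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t seq_lock = PTHREAD_MUTEX_INITIALIZER;
static int seq_counter;

/* Sketch only: every caller gets a unique, ordered sequence number. */
static int next_seq(void)
{
        int seq;

        pthread_mutex_lock(&seq_lock);
        seq = seq_counter++;
        pthread_mutex_unlock(&seq_lock);
        return seq;
}

int main(void)
{
        int i;

        for (i = 0; i < 3; i++)
                printf("seq %d\n", next_seq());         /* 0, 1, 2 */
        return 0;
}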
+
+static void
+ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+       sfw_test_instance_t *tsi = tsu->tsu_instance;
+       sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+       srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+       srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+       struct timeval       tv;
+
+       LASSERT (sn != NULL);
+
+       if (rpc->crpc_status != 0) {
+               if (!tsi->tsi_stopping) /* rpc could have been aborted */
+                       atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Unable to ping %s (%d): %d\n",
+                       libcfs_id2str(rpc->crpc_dest),
+                       reqst->pnr_seq, rpc->crpc_status);
+               return;
+       }
+
+       if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+               __swab32s(&reply->pnr_seq);
+               __swab32s(&reply->pnr_magic);
+               __swab32s(&reply->pnr_status);
+       }
+
+       if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+               rpc->crpc_status = -EBADMSG;
+               atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Bad magic %u from %s, %u expected.\n",
+                       reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+                       LST_PING_TEST_MAGIC);
+               return;
+       }
+
+       if (reply->pnr_seq != reqst->pnr_seq) {
+               rpc->crpc_status = -EBADMSG;
+               atomic_inc(&sn->sn_ping_errors);
+               CERROR ("Bad seq %u from %s, %u expected.\n",
+                       reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+                       reqst->pnr_seq);
+               return;
+       }
+
+       cfs_fs_timeval(&tv);
+       CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+               (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+                          + (tv.tv_usec - reqst->pnr_time_usec)));
+       return;
+}
+
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+       struct srpc_service     *sv  = rpc->srpc_scd->scd_svc;
+       srpc_msg_t      *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+       srpc_msg_t        *replymsg = &rpc->srpc_replymsg;
+       srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst;
+       srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply;
+
+       LASSERT (sv->sv_id == SRPC_SERVICE_PING);
+
+       if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+               LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+               __swab32s(&req->pnr_seq);
+               __swab32s(&req->pnr_magic);
+               __swab64s(&req->pnr_time_sec);
+               __swab64s(&req->pnr_time_usec);
+       }
+       LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+       if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+               CERROR ("Unexpect magic %08x from %s\n",
+                       req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+               return -EINVAL;
+       }
+
+       rep->pnr_seq   = req->pnr_seq;
+       rep->pnr_magic = LST_PING_TEST_MAGIC;
+
+       if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+               replymsg->msg_ses_feats = LST_FEATS_MASK;
+               rep->pnr_status = EPROTO;
+               return 0;
+       }
+
+       replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+       CDEBUG(D_NET, "Get ping %d from %s\n",
+              req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+       return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void)
+{
+       ping_test_client.tso_init     = ping_client_init;
+       ping_test_client.tso_fini     = ping_client_fini;
+       ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+       ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;
+void ping_init_test_service(void)
+{
+       ping_test_service.sv_id       = SRPC_SERVICE_PING;
+       ping_test_service.sv_name     = "ping_test";
+       ping_test_service.sv_handler  = ping_server_handle;
+       ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644 (file)
index 0000000..bc1f38b
--- /dev/null
@@ -0,0 +1,1666 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+       SRPC_STATE_NONE,
+       SRPC_STATE_NI_INIT,
+       SRPC_STATE_EQ_INIT,
+       SRPC_STATE_RUNNING,
+       SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+struct smoketest_rpc {
+       spinlock_t       rpc_glock;     /* global lock */
+       srpc_service_t  *rpc_services[SRPC_SERVICE_MAX_ID + 1];
+       lnet_handle_eq_t rpc_lnet_eq;   /* _the_ LNet event queue */
+       srpc_state_t     rpc_state;
+       srpc_counters_t  rpc_counters;
+       __u64            rpc_matchbits; /* matchbits counter */
+} srpc_data;
+
+static inline int
+srpc_serv_portal(int svc_id)
+{
+       return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+              SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+void srpc_get_counters (srpc_counters_t *cnt)
+{
+       spin_lock(&srpc_data.rpc_glock);
+       *cnt = srpc_data.rpc_counters;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters (const srpc_counters_t *cnt)
+{
+       spin_lock(&srpc_data.rpc_glock);
+       srpc_data.rpc_counters = *cnt;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+       nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+       LASSERT(nob > 0);
+       LASSERT(i >= 0 && i < bk->bk_niov);
+
+       bk->bk_iovs[i].kiov_offset = 0;
+       bk->bk_iovs[i].kiov_page   = pg;
+       bk->bk_iovs[i].kiov_len    = nob;
+       return nob;
+}
+
+void
+srpc_free_bulk (srpc_bulk_t *bk)
+{
+       int      i;
+       struct page *pg;
+
+       LASSERT (bk != NULL);
+
+       for (i = 0; i < bk->bk_niov; i++) {
+               pg = bk->bk_iovs[i].kiov_page;
+               if (pg == NULL)
+                       break;
+
+               __free_page(pg);
+       }
+
+       LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+       return;
+}
+
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+       srpc_bulk_t  *bk;
+       int           i;
+
+       LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+       LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+                        offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+       if (bk == NULL) {
+               CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+               return NULL;
+       }
+
+       memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+       bk->bk_sink   = sink;
+       bk->bk_len    = bulk_len;
+       bk->bk_niov   = bulk_npg;
+
+       for (i = 0; i < bulk_npg; i++) {
+               struct page *pg;
+               int         nob;
+
+               pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+                                     GFP_IOFS, 0);
+               if (pg == NULL) {
+                       CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+                       srpc_free_bulk(bk);
+                       return NULL;
+               }
+
+               nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+               bulk_len -= nob;
+       }
+
+       return bk;
+}
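
srpc_alloc_bulk() fills bk_iovs page by page and, on a mid-loop allocation
failure, hands the partly filled descriptor to srpc_free_bulk(), which stops
at the first NULL page. The unwind-partial-work idiom in standalone form
(sizes and names invented):

#include <stdlib.h>

#define NCHUNK 8

/* Sketch only: on a mid-loop failure, release the chunks already
 * acquired before reporting failure. */
static void **alloc_all(int n)
{
        void **v = calloc(n, sizeof(*v));
        int i;

        if (v == NULL)
                return NULL;

        for (i = 0; i < n; i++) {
                v[i] = malloc(4096);
                if (v[i] == NULL) {
                        while (--i >= 0)
                                free(v[i]);     /* unwind partial work */
                        free(v);
                        return NULL;
                }
        }
        return v;
}

int main(void)
{
        void **v = alloc_all(NCHUNK);
        int i;

        if (v != NULL) {
                for (i = 0; i < NCHUNK; i++)
                        free(v[i]);
                free(v);
        }
        return 0;
}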
+
+static inline __u64
+srpc_next_id (void)
+{
+       __u64 id;
+
+       spin_lock(&srpc_data.rpc_glock);
+       id = srpc_data.rpc_matchbits++;
+       spin_unlock(&srpc_data.rpc_glock);
+       return id;
+}
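
srpc_next_id() serializes the matchbits counter with the global rpc_glock.
Taken in isolation, the counter could also be one atomic read-modify-write, as
sketched below with C11 atomics; whether that would suit the driver depends on
what else rpc_glock has to guard, so this is only an aside:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch only: a lock-free alternative to a lock-protected counter. */
static _Atomic uint64_t matchbits = 1;

static uint64_t next_id(void)
{
        return atomic_fetch_add(&matchbits, 1);
}

int main(void)
{
        int i;

        for (i = 0; i < 3; i++)
                printf("id %llu\n", (unsigned long long)next_id());
        return 0;
}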
+
+void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+                    struct srpc_service_cd *scd,
+                    struct srpc_buffer *buffer)
+{
+       memset(rpc, 0, sizeof(*rpc));
+       swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+                         srpc_serv_is_framework(scd->scd_svc) ?
+                         lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+       rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+       rpc->srpc_scd      = scd;
+       rpc->srpc_reqstbuf = buffer;
+       rpc->srpc_peer     = buffer->buf_peer;
+       rpc->srpc_self     = buffer->buf_self;
+       LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       struct srpc_buffer      *buf;
+       struct list_head                *q;
+       int                     i;
+
+       if (svc->sv_cpt_data == NULL)
+               return;
+
+       cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+               while (1) {
+                       if (!list_empty(&scd->scd_buf_posted))
+                               q = &scd->scd_buf_posted;
+                       else if (!list_empty(&scd->scd_buf_blocked))
+                               q = &scd->scd_buf_blocked;
+                       else
+                               break;
+
+                       while (!list_empty(q)) {
+                               buf = list_entry(q->next,
+                                                    struct srpc_buffer,
+                                                    buf_list);
+                               list_del(&buf->buf_list);
+                               LIBCFS_FREE(buf, sizeof(*buf));
+                       }
+               }
+
+               LASSERT(list_empty(&scd->scd_rpc_active));
+
+               while (!list_empty(&scd->scd_rpc_free)) {
+                       rpc = list_entry(scd->scd_rpc_free.next,
+                                            struct srpc_server_rpc,
+                                            srpc_list);
+                       list_del(&rpc->srpc_list);
+                       LIBCFS_FREE(rpc, sizeof(*rpc));
+               }
+       }
+
+       cfs_percpt_free(svc->sv_cpt_data);
+       svc->sv_cpt_data = NULL;
+}
+
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+       int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+       return srpc_serv_is_framework(svc) ?
+              max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     nrpcs;
+       int                     i;
+       int                     j;
+
+       svc->sv_shuttingdown = 0;
+
+       svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+                                           sizeof(struct srpc_service_cd));
+       if (svc->sv_cpt_data == NULL)
+               return -ENOMEM;
+
+       svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+                       1 : cfs_cpt_number(lnet_cpt_table());
+       nrpcs = srpc_service_nrpcs(svc);
+
+       cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+               scd->scd_cpt = i;
+               scd->scd_svc = svc;
+               spin_lock_init(&scd->scd_lock);
+               INIT_LIST_HEAD(&scd->scd_rpc_free);
+               INIT_LIST_HEAD(&scd->scd_rpc_active);
+               INIT_LIST_HEAD(&scd->scd_buf_posted);
+               INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+               scd->scd_ev.ev_data = scd;
+               scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+               /* NB: don't use lst_sched_serial for adding buffers;
+                * see details in srpc_service_add_buffers() */
+               swi_init_workitem(&scd->scd_buf_wi, scd,
+                                 srpc_add_buffer, lst_sched_test[i]);
+
+               if (i != 0 && srpc_serv_is_framework(svc)) {
+                       /* NB: a framework service only needs srpc_service_cd
+                        * for one partition, but we allocate for all to keep
+                        * the implementation simple; it wastes a little
+                        * memory but nobody should care */
+                       continue;
+               }
+
+               for (j = 0; j < nrpcs; j++) {
+                       LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+                                        i, sizeof(*rpc));
+                       if (rpc == NULL) {
+                               srpc_service_fini(svc);
+                               return -ENOMEM;
+                       }
+                       list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+               }
+       }
+
+       return 0;
+}
+
+int
+srpc_add_service(struct srpc_service *sv)
+{
+       int id = sv->sv_id;
+
+       LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+       if (srpc_service_init(sv) != 0)
+               return -ENOMEM;
+
+       spin_lock(&srpc_data.rpc_glock);
+
+       LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+       if (srpc_data.rpc_services[id] != NULL) {
+               spin_unlock(&srpc_data.rpc_glock);
+               goto failed;
+       }
+
+       srpc_data.rpc_services[id] = sv;
+       spin_unlock(&srpc_data.rpc_glock);
+
+       CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+       return 0;
+
+ failed:
+       srpc_service_fini(sv);
+       return -EBUSY;
+}
+
+int
+srpc_remove_service (srpc_service_t *sv)
+{
+       int id = sv->sv_id;
+
+       spin_lock(&srpc_data.rpc_glock);
+
+       if (srpc_data.rpc_services[id] != sv) {
+               spin_unlock(&srpc_data.rpc_glock);
+               return -ENOENT;
+       }
+
+       srpc_data.rpc_services[id] = NULL;
+       spin_unlock(&srpc_data.rpc_glock);
+       return 0;
+}
+
+int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+                      int len, int options, lnet_process_id_t peer,
+                      lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       int              rc;
+       lnet_md_t        md;
+       lnet_handle_me_t meh;
+
+       rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+                         local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+       if (rc != 0) {
+               CERROR ("LNetMEAttach failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               return -ENOMEM;
+       }
+
+       md.threshold = 1;
+       md.user_ptr  = ev;
+       md.start     = buf;
+       md.length    = len;
+       md.options   = options;
+       md.eq_handle = srpc_data.rpc_lnet_eq;
+
+       rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+       if (rc != 0) {
+               CERROR ("LNetMDAttach failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+
+               rc = LNetMEUnlink(meh);
+               LASSERT (rc == 0);
+               return -ENOMEM;
+       }
+
+       CDEBUG (D_NET,
+               "Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+               libcfs_id2str(peer), portal, matchbits);
+       return 0;
+}
+
+int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+                     int options, lnet_process_id_t peer, lnet_nid_t self,
+                     lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       int       rc;
+       lnet_md_t md;
+
+       md.user_ptr  = ev;
+       md.start     = buf;
+       md.length    = len;
+       md.eq_handle = srpc_data.rpc_lnet_eq;
+       md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+       md.options   = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+       rc = LNetMDBind(md, LNET_UNLINK, mdh);
+       if (rc != 0) {
+               CERROR ("LNetMDBind failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               return -ENOMEM;
+       }
+
+       /* This is kind of an abuse of the LNET_MD_OP_{PUT,GET} options:
+        * they're only meaningful for MDs attached to an ME, i.e. passive
+        * buffers. */
+       if ((options & LNET_MD_OP_PUT) != 0) {
+               rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+                            portal, matchbits, 0, 0);
+       } else {
+               LASSERT ((options & LNET_MD_OP_GET) != 0);
+
+               rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+       }
+
+       if (rc != 0) {
+               CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n",
+                       ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+                       libcfs_id2str(peer), portal, matchbits, rc);
+
+               /* The forthcoming unlink event will complete this operation
+                * with failure, so fall through and return success here.
+                */
+               rc = LNetMDUnlink(*mdh);
+               LASSERT (rc == 0);
+       } else {
+               CDEBUG (D_NET,
+                       "Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+                       libcfs_id2str(peer), portal, matchbits);
+       }
+       return 0;
+}
+
+int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+                       int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       return srpc_post_active_rdma(srpc_serv_portal(service), service,
+                                    buf, len, LNET_MD_OP_PUT, peer,
+                                    LNET_NID_ANY, mdh, ev);
+}
+
+int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+                        lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+       lnet_process_id_t any = {0};
+
+       any.nid = LNET_NID_ANY;
+       any.pid = LNET_PID_ANY;
+
+       return srpc_post_passive_rdma(srpc_serv_portal(service),
+                                     local, service, buf, len,
+                                     LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+       struct srpc_service     *sv = scd->scd_svc;
+       struct srpc_msg         *msg = &buf->buf_msg;
+       int                     rc;
+
+       LNetInvalidateHandle(&buf->buf_mdh);
+       list_add(&buf->buf_list, &scd->scd_buf_posted);
+       scd->scd_buf_nposted++;
+       spin_unlock(&scd->scd_lock);
+
+       rc = srpc_post_passive_rqtbuf(sv->sv_id,
+                                     !srpc_serv_is_framework(sv),
+                                     msg, sizeof(*msg), &buf->buf_mdh,
+                                     &scd->scd_ev);
+
+       /* At this point, an RPC (new or delayed) may have arrived in msg
+        * and its event handler called. So we must add buf to
+        * scd_buf_posted _before_ dropping scd_lock */
+
+       spin_lock(&scd->scd_lock);
+
+       if (rc == 0) {
+               if (!sv->sv_shuttingdown)
+                       return 0;
+
+               spin_unlock(&scd->scd_lock);
+               /* srpc_shutdown_service might have tried to unlink me
+                * when my buf_mdh was still invalid */
+               LNetMDUnlink(buf->buf_mdh);
+               spin_lock(&scd->scd_lock);
+               return 0;
+       }
+
+       scd->scd_buf_nposted--;
+       if (sv->sv_shuttingdown)
+               return rc; /* don't allow scd_buf_posted to change */
+
+       list_del(&buf->buf_list);
+       spin_unlock(&scd->scd_lock);
+
+       LIBCFS_FREE(buf, sizeof(*buf));
+
+       spin_lock(&scd->scd_lock);
+       return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+       struct srpc_service_cd  *scd = wi->swi_workitem.wi_data;
+       struct srpc_buffer      *buf;
+       int                     rc = 0;
+
+       /* This is called by workitem scheduler threads; these threads
+        * have their CPT affinity set, so buffers will be posted on the
+        * CPT-local list of the portal */
+       spin_lock(&scd->scd_lock);
+
+       while (scd->scd_buf_adjust > 0 &&
+              !scd->scd_svc->sv_shuttingdown) {
+               scd->scd_buf_adjust--; /* consume it */
+               scd->scd_buf_posting++;
+
+               spin_unlock(&scd->scd_lock);
+
+               LIBCFS_ALLOC(buf, sizeof(*buf));
+               if (buf == NULL) {
+                       CERROR("Failed to add new buf to service: %s\n",
+                              scd->scd_svc->sv_name);
+                       spin_lock(&scd->scd_lock);
+                       rc = -ENOMEM;
+                       break;
+               }
+
+               spin_lock(&scd->scd_lock);
+               if (scd->scd_svc->sv_shuttingdown) {
+                       spin_unlock(&scd->scd_lock);
+                       LIBCFS_FREE(buf, sizeof(*buf));
+
+                       spin_lock(&scd->scd_lock);
+                       rc = -ESHUTDOWN;
+                       break;
+               }
+
+               rc = srpc_service_post_buffer(scd, buf);
+               if (rc != 0)
+                       break; /* buf has been freed inside */
+
+               LASSERT(scd->scd_buf_posting > 0);
+               scd->scd_buf_posting--;
+               scd->scd_buf_total++;
+               scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+       }
+
+       if (rc != 0) {
+               scd->scd_buf_err_stamp = cfs_time_current_sec();
+               scd->scd_buf_err = rc;
+
+               LASSERT(scd->scd_buf_posting > 0);
+               scd->scd_buf_posting--;
+       }
+
+       spin_unlock(&scd->scd_lock);
+       return 0;
+}
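
srpc_add_buffer() drops scd_lock around the blocking allocation and must
re-validate sv_shuttingdown after retaking it, since the state can change
while the lock is dropped. The drop-allocate-relock-revalidate idiom as a
standalone pthread sketch (names invented):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shutting_down;

/* Sketch only: never allocate under a lock that others spin on;
 * allocate first, then re-check the state you depend on after
 * relocking. */
static int add_buffer(void)
{
        void *buf = malloc(64);         /* may block; done unlocked */
        int rc = 0;

        pthread_mutex_lock(&lock);
        if (buf == NULL) {
                rc = -1;                /* report failure under the lock */
        } else if (shutting_down) {     /* state may have changed */
                free(buf);
                rc = -2;
        } else {
                /* publish buf to the posted list here, still locked */
                free(buf);              /* sketch: discard instead */
        }
        pthread_mutex_unlock(&lock);
        return rc;
}

int main(void)
{
        return add_buffer() == 0 ? 0 : 1;
}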
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+       struct srpc_service_cd  *scd;
+       int                     rc = 0;
+       int                     i;
+
+       LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               scd->scd_buf_err = 0;
+               scd->scd_buf_err_stamp = 0;
+               scd->scd_buf_posting = 0;
+               scd->scd_buf_adjust = nbuffer;
+               /* start to post buffers */
+               swi_schedule_workitem(&scd->scd_buf_wi);
+               spin_unlock(&scd->scd_lock);
+
+               /* a framework service only posts buffers for one partition */
+               if (srpc_serv_is_framework(sv))
+                       break;
+       }
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+               /*
+                * NB: srpc_service_add_buffers() can be called inside the
+                * thread context of lst_sched_serial, and we don't normally
+                * allow sleeping inside the thread context of a WI scheduler
+                * because it blocks the current scheduler thread from doing
+                * anything else; even worse, it could deadlock if it waits
+                * on the result of another WI of the same scheduler.
+                * However, it's safe here because scd_buf_wi is scheduled
+                * by a thread in a different WI scheduler (lst_sched_test),
+                * so there is no risk of deadlock, though this could block
+                * all WIs pending on lst_sched_serial for a moment, which
+                * is not good but not fatal.
+                */
+               lst_wait_until(scd->scd_buf_err != 0 ||
+                              (scd->scd_buf_adjust == 0 &&
+                               scd->scd_buf_posting == 0),
+                              scd->scd_lock, "waiting to add buffers\n");
+
+               if (scd->scd_buf_err != 0 && rc == 0)
+                       rc = scd->scd_buf_err;
+
+               spin_unlock(&scd->scd_lock);
+       }
+
+       return rc;
+}
+
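+/* shrink each partition of @sv by up to @nbuffer buffers; nothing is freed
+ * here, scd_buf_adjust simply goes negative and the surplus buffers are
+ * released lazily by srpc_service_recycle_buffer() */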
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+       struct srpc_service_cd  *scd;
+       int                     num;
+       int                     i;
+
+       LASSERT(!sv->sv_shuttingdown);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               num = scd->scd_buf_total + scd->scd_buf_posting;
+               scd->scd_buf_adjust -= min(nbuffer, num);
+
+               spin_unlock(&scd->scd_lock);
+       }
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     i;
+
+       LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+               if (!swi_deschedule_workitem(&scd->scd_buf_wi))
+                       return 0;
+
+               if (scd->scd_buf_nposted > 0) {
+                       CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n",
+                              scd->scd_buf_nposted);
+                       spin_unlock(&scd->scd_lock);
+                       return 0;
+               }
+
+               if (list_empty(&scd->scd_rpc_active)) {
+                       spin_unlock(&scd->scd_lock);
+                       continue;
+               }
+
+               rpc = list_entry(scd->scd_rpc_active.next,
+                                    struct srpc_server_rpc, srpc_list);
+               CNETERR("Active RPC %p on shutdown: sv %s, peer %s, "
+                       "wi %s scheduled %d running %d, "
+                       "ev fired %d type %d status %d lnet %d\n",
+                       rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+                       swi_state2str(rpc->srpc_wi.swi_state),
+                       rpc->srpc_wi.swi_workitem.wi_scheduled,
+                       rpc->srpc_wi.swi_workitem.wi_running,
+                       rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+                       rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+               spin_unlock(&scd->scd_lock);
+               return 0;
+       }
+
+       /* no lock needed from now on */
+       srpc_service_fini(sv);
+       return 1;
+}
+
+/* called with scd->scd_lock held */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+       if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+               if (srpc_service_post_buffer(scd, buf) != 0) {
+                       CWARN("Failed to post %s buffer\n",
+                             scd->scd_svc->sv_name);
+               }
+               return;
+       }
+
+       /* service is shutting down, or we want to recycle some buffers */
+       scd->scd_buf_total--;
+
+       if (scd->scd_buf_adjust < 0) {
+               scd->scd_buf_adjust++;
+               if (scd->scd_buf_adjust < 0 &&
+                   scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+                       CDEBUG(D_INFO,
+                              "Try to recycle %d buffers but nothing left\n",
+                              scd->scd_buf_adjust);
+                       scd->scd_buf_adjust = 0;
+               }
+       }
+
+       spin_unlock(&scd->scd_lock);
+       LIBCFS_FREE(buf, sizeof(*buf));
+       spin_lock(&scd->scd_lock);
+}
+
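+/* mark every in-flight RPC of @sv aborted and reschedule its workitem so
+ * the abort is noticed promptly */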
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       int                     i;
+
+       CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+              sv->sv_id, sv->sv_name);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               /* schedule in-flight RPCs to notice the abort; NB this races
+                * with incoming RPCs, and a complete fix would make test
+                * RPCs carry the session ID in their headers */
+               list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+                       rpc->srpc_aborted = 1;
+                       swi_schedule_workitem(&rpc->srpc_wi);
+               }
+
+               spin_unlock(&scd->scd_lock);
+       }
+}
+
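+/* stop @sv: raise sv_shuttingdown under all partition locks so no new RPC
+ * can start, kick active RPCs so they bail out, then unlink every posted
+ * request buffer */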
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+       struct srpc_service_cd  *scd;
+       struct srpc_server_rpc  *rpc;
+       srpc_buffer_t           *buf;
+       int                     i;
+
+       CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+              sv->sv_id, sv->sv_name);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+               spin_lock(&scd->scd_lock);
+
+       sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+               spin_unlock(&scd->scd_lock);
+
+       cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+               spin_lock(&scd->scd_lock);
+
+               /* schedule in-flight RPCs to notice the shutdown */
+               list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+                       swi_schedule_workitem(&rpc->srpc_wi);
+
+               spin_unlock(&scd->scd_lock);
+
+               /* OK to traverse scd_buf_posted without the lock, since no
+                * one else touches it now */
+               list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+                       LNetMDUnlink(buf->buf_mdh);
+       }
+}
+
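+/* client side: send the request message to the peer's request portal;
+ * crpc_reqstev (SRPC_REQUEST_SENT) fires when LNet is done with it */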
+int
+srpc_send_request (srpc_client_rpc_t *rpc)
+{
+       srpc_event_t *ev = &rpc->crpc_reqstev;
+       int        rc;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REQUEST_SENT;
+
+       rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+                                    &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+                                    &rpc->crpc_reqstmdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
+int
+srpc_prepare_reply (srpc_client_rpc_t *rpc)
+{
+       srpc_event_t *ev = &rpc->crpc_replyev;
+       __u64   *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+       int        rc;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REPLY_RCVD;
+
+       *id = srpc_next_id();
+
+       rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+                                   &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+                                   LNET_MD_OP_PUT, rpc->crpc_dest,
+                                   &rpc->crpc_replymdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
+int
+srpc_prepare_bulk (srpc_client_rpc_t *rpc)
+{
+       srpc_bulk_t  *bk = &rpc->crpc_bulk;
+       srpc_event_t *ev = &rpc->crpc_bulkev;
+       __u64   *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+       int        rc;
+       int        opt;
+
+       LASSERT (bk->bk_niov <= LNET_MAX_IOV);
+
+       if (bk->bk_niov == 0) return 0; /* nothing to do */
+
+       opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET;
+       opt |= LNET_MD_KIOV;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_BULK_REQ_RCVD;
+
+       *id = srpc_next_id();
+
+       rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+                                   &bk->bk_iovs[0], bk->bk_niov, opt,
+                                   rpc->crpc_dest, &bk->bk_mdh, ev);
+       if (rc != 0) {
+               LASSERT (rc == -ENOMEM);
+               ev->ev_fired = 1;  /* no more event expected */
+       }
+       return rc;
+}
+
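+/* server side: move the bulk data, a GET from the client if we are the
+ * sink or a PUT to the client if we are the source; srpc_ev fires on
+ * completion */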
+int
+srpc_do_bulk (srpc_server_rpc_t *rpc)
+{
+       srpc_event_t  *ev = &rpc->srpc_ev;
+       srpc_bulk_t   *bk = rpc->srpc_bulk;
+       __u64     id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+       int         rc;
+       int         opt;
+
+       LASSERT (bk != NULL);
+
+       opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+       opt |= LNET_MD_KIOV;
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+       rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+                                  &bk->bk_iovs[0], bk->bk_niov, opt,
+                                  rpc->srpc_peer, rpc->srpc_self,
+                                  &bk->bk_mdh, ev);
+       if (rc != 0)
+               ev->ev_fired = 1;  /* no more event expected */
+       return rc;
+}
+
+/* only called from srpc_handle_rpc */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv  = scd->scd_svc;
+       srpc_buffer_t           *buffer;
+
+       LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+       rpc->srpc_status = status;
+
+       CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+               "Server RPC %p done: service %s, peer %s, status %s:%d\n",
+               rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+               swi_state2str(rpc->srpc_wi.swi_state), status);
+
+       if (status != 0) {
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.rpcs_dropped++;
+               spin_unlock(&srpc_data.rpc_glock);
+       }
+
+       if (rpc->srpc_done != NULL)
+               (*rpc->srpc_done) (rpc);
+       LASSERT(rpc->srpc_bulk == NULL);
+
+       spin_lock(&scd->scd_lock);
+
+       if (rpc->srpc_reqstbuf != NULL) {
+               /* NB we might drop scd_lock in srpc_service_recycle_buffer,
+                * but sv can't go away while scd_rpc_active is non-empty */
+               srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+               rpc->srpc_reqstbuf = NULL;
+       }
+
+       list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+       /*
+        * No one can schedule me now since:
+        * - I'm not on scd_rpc_active.
+        * - all LNet events have been fired.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       LASSERT(rpc->srpc_ev.ev_fired);
+       swi_exit_workitem(&rpc->srpc_wi);
+
+       if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+               buffer = list_entry(scd->scd_buf_blocked.next,
+                                       srpc_buffer_t, buf_list);
+               list_del(&buffer->buf_list);
+
+               srpc_init_server_rpc(rpc, scd, buffer);
+               list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+               swi_schedule_workitem(&rpc->srpc_wi);
+       } else {
+               list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+       }
+
+       spin_unlock(&scd->scd_lock);
+       return;
+}
+
+/* handles an incoming RPC */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+       struct srpc_server_rpc  *rpc = wi->swi_workitem.wi_data;
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv = scd->scd_svc;
+       srpc_event_t            *ev = &rpc->srpc_ev;
+       int                     rc = 0;
+
+       LASSERT(wi == &rpc->srpc_wi);
+
+       spin_lock(&scd->scd_lock);
+
+       if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+               spin_unlock(&scd->scd_lock);
+
+               if (rpc->srpc_bulk != NULL)
+                       LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+               LNetMDUnlink(rpc->srpc_replymdh);
+
+               if (ev->ev_fired) { /* no more event, OK to finish */
+                       srpc_server_rpc_done(rpc, -ESHUTDOWN);
+                       return 1;
+               }
+               return 0;
+       }
+
+       spin_unlock(&scd->scd_lock);
+
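+       /* state machine: NEWBORN handles the request and starts bulk (if
+        * any), BULK_STARTED submits the reply, REPLY_SUBMITTED completes;
+        * cases fall through deliberately when there is no event to wait
+        * for */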
+       switch (wi->swi_state) {
+       default:
+               LBUG ();
+       case SWI_STATE_NEWBORN: {
+               srpc_msg_t         *msg;
+               srpc_generic_reply_t *reply;
+
+               msg = &rpc->srpc_reqstbuf->buf_msg;
+               reply = &rpc->srpc_replymsg.msg_body.reply;
+
+               if (msg->msg_magic == 0) {
+                       /* moaned already in srpc_lnet_ev_handler */
+                       srpc_server_rpc_done(rpc, -EBADMSG);
+                       return 1;
+               }
+
+               srpc_unpack_msg_hdr(msg);
+               if (msg->msg_version != SRPC_MSG_VERSION) {
+                       CWARN("Version mismatch: %u, %u expected, from %s\n",
+                             msg->msg_version, SRPC_MSG_VERSION,
+                             libcfs_id2str(rpc->srpc_peer));
+                       reply->status = EPROTO;
+                       /* fall through and send reply */
+               } else {
+                       reply->status = 0;
+                       rc = (*sv->sv_handler)(rpc);
+                       LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+                       if (rc != 0) {
+                               srpc_server_rpc_done(rpc, rc);
+                               return 1;
+                       }
+               }
+
+               wi->swi_state = SWI_STATE_BULK_STARTED;
+
+               if (rpc->srpc_bulk != NULL) {
+                       rc = srpc_do_bulk(rpc);
+                       if (rc == 0)
+                               return 0; /* wait for bulk */
+
+                       LASSERT (ev->ev_fired);
+                       ev->ev_status = rc;
+               }
+       }
+       case SWI_STATE_BULK_STARTED:
+               LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+               if (rpc->srpc_bulk != NULL) {
+                       rc = ev->ev_status;
+
+                       if (sv->sv_bulk_ready != NULL)
+                               rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+                       if (rc != 0) {
+                               srpc_server_rpc_done(rpc, rc);
+                               return 1;
+                       }
+               }
+
+               wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+               rc = srpc_send_reply(rpc);
+               if (rc == 0)
+                       return 0; /* wait for reply */
+               srpc_server_rpc_done(rpc, rc);
+               return 1;
+
+       case SWI_STATE_REPLY_SUBMITTED:
+               if (!ev->ev_fired) {
+                       CERROR("RPC %p: bulk %p, service %d\n",
+                              rpc, rpc->srpc_bulk, sv->sv_id);
+                       CERROR("Event: status %d, type %d, lnet %d\n",
+                              ev->ev_status, ev->ev_type, ev->ev_lnet);
+                       LASSERT (ev->ev_fired);
+               }
+
+               wi->swi_state = SWI_STATE_DONE;
+               srpc_server_rpc_done(rpc, ev->ev_status);
+               return 1;
+       }
+
+       return 0;
+}
+
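+/* timer callback: the RPC has not completed within crpc_timeout seconds,
+ * abort it with -ETIMEDOUT and account it in the global counters */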
+void
+srpc_client_rpc_expired (void *data)
+{
+       srpc_client_rpc_t *rpc = data;
+
+       CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+              rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+              rpc->crpc_timeout);
+
+       spin_lock(&rpc->crpc_lock);
+
+       rpc->crpc_timeout = 0;
+       srpc_abort_rpc(rpc, -ETIMEDOUT);
+
+       spin_unlock(&rpc->crpc_lock);
+
+       spin_lock(&srpc_data.rpc_glock);
+       srpc_data.rpc_counters.rpcs_expired++;
+       spin_unlock(&srpc_data.rpc_glock);
+}
+
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+       stt_timer_t *timer = &rpc->crpc_timer;
+
+       if (rpc->crpc_timeout == 0) return;
+
+       INIT_LIST_HEAD(&timer->stt_list);
+       timer->stt_data    = rpc;
+       timer->stt_func    = srpc_client_rpc_expired;
+       timer->stt_expires = cfs_time_add(rpc->crpc_timeout,
+                                         cfs_time_current_sec());
+       stt_add_timer(timer);
+       return;
+}
+
+/*
+ * Called with rpc->crpc_lock held.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+       /* timer not planted or already exploded */
+       if (rpc->crpc_timeout == 0)
+               return;
+
+       /* timer successfully defused */
+       if (stt_del_timer(&rpc->crpc_timer))
+               return;
+
+       /* timer detonated, wait for it to explode */
+       while (rpc->crpc_timeout != 0) {
+               spin_unlock(&rpc->crpc_lock);
+
+               schedule();
+
+               spin_lock(&rpc->crpc_lock);
+       }
+}
+
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+       swi_workitem_t *wi = &rpc->crpc_wi;
+
+       LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+       spin_lock(&rpc->crpc_lock);
+
+       rpc->crpc_closed = 1;
+       if (rpc->crpc_status == 0)
+               rpc->crpc_status = status;
+
+       srpc_del_client_rpc_timer(rpc);
+
+       CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+               "Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+       /*
+        * No one can schedule me now since:
+        * - RPC timer has been defused.
+        * - all LNet events have been fired.
+        * - crpc_closed has been set, preventing srpc_abort_rpc from
+        *   scheduling me.
+        * Cancel pending schedules and prevent future schedule attempts:
+        */
+       LASSERT (!srpc_event_pending(rpc));
+       swi_exit_workitem(wi);
+
+       spin_unlock(&rpc->crpc_lock);
+
+       (*rpc->crpc_done)(rpc);
+       return;
+}
+
+/* sends an outgoing RPC */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+       int             rc = 0;
+       srpc_client_rpc_t *rpc;
+       srpc_msg_t      *reply;
+       int             do_bulk;
+
+       LASSERT(wi != NULL);
+
+       rpc = wi->swi_workitem.wi_data;
+
+       LASSERT (rpc != NULL);
+       LASSERT (wi == &rpc->crpc_wi);
+
+       reply = &rpc->crpc_replymsg;
+       do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+       spin_lock(&rpc->crpc_lock);
+
+       if (rpc->crpc_aborted) {
+               spin_unlock(&rpc->crpc_lock);
+               goto abort;
+       }
+
+       spin_unlock(&rpc->crpc_lock);
+
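+       /* advance the client state machine as LNet events fire: NEWBORN
+        * posts the reply/bulk buffers and sends the request; each later
+        * case breaks out early if its event hasn't fired yet, and falls
+        * through otherwise */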
+       switch (wi->swi_state) {
+       default:
+               LBUG ();
+       case SWI_STATE_NEWBORN:
+               LASSERT (!srpc_event_pending(rpc));
+
+               rc = srpc_prepare_reply(rpc);
+               if (rc != 0) {
+                       srpc_client_rpc_done(rpc, rc);
+                       return 1;
+               }
+
+               rc = srpc_prepare_bulk(rpc);
+               if (rc != 0) break;
+
+               wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+               rc = srpc_send_request(rpc);
+               break;
+
+       case SWI_STATE_REQUEST_SUBMITTED:
+               /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+                * order; however, they're processed in a strict order:
+                * rqt, rpy, and bulk. */
+               if (!rpc->crpc_reqstev.ev_fired) break;
+
+               rc = rpc->crpc_reqstev.ev_status;
+               if (rc != 0) break;
+
+               wi->swi_state = SWI_STATE_REQUEST_SENT;
+               /* perhaps more events, fall thru */
+       case SWI_STATE_REQUEST_SENT: {
+               srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+               if (!rpc->crpc_replyev.ev_fired) break;
+
+               rc = rpc->crpc_replyev.ev_status;
+               if (rc != 0) break;
+
+               srpc_unpack_msg_hdr(reply);
+               if (reply->msg_type != type ||
+                   (reply->msg_magic != SRPC_MSG_MAGIC &&
+                    reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+                       CWARN ("Bad message from %s: type %u (%d expected),"
+                              " magic %u (%u expected).\n",
+                              libcfs_id2str(rpc->crpc_dest),
+                              reply->msg_type, type,
+                              reply->msg_magic, SRPC_MSG_MAGIC);
+                       rc = -EBADMSG;
+                       break;
+               }
+
+               if (do_bulk && reply->msg_body.reply.status != 0) {
+                       CWARN ("Remote error %d at %s, unlink bulk buffer in "
+                              "case peer didn't initiate bulk transfer\n",
+                              reply->msg_body.reply.status,
+                              libcfs_id2str(rpc->crpc_dest));
+                       LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+               }
+
+               wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+       }
+       case SWI_STATE_REPLY_RECEIVED:
+               if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+               rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+               /* Bulk buffer was unlinked due to remote error. Clear error
+                * since reply buffer still contains valid data.
+                * NB rpc->crpc_done shouldn't look into bulk data in case of
+                * remote error. */
+               if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+                   rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+                       rc = 0;
+
+               wi->swi_state = SWI_STATE_DONE;
+               srpc_client_rpc_done(rpc, rc);
+               return 1;
+       }
+
+       if (rc != 0) {
+               spin_lock(&rpc->crpc_lock);
+               srpc_abort_rpc(rpc, rc);
+               spin_unlock(&rpc->crpc_lock);
+       }
+
+abort:
+       if (rpc->crpc_aborted) {
+               LNetMDUnlink(rpc->crpc_reqstmdh);
+               LNetMDUnlink(rpc->crpc_replymdh);
+               LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+               if (!srpc_event_pending(rpc)) {
+                       srpc_client_rpc_done(rpc, -EINTR);
+                       return 1;
+               }
+       }
+       return 0;
+}
+
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+                       int nbulkiov, int bulklen,
+                       void (*rpc_done)(srpc_client_rpc_t *),
+                       void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+       srpc_client_rpc_t *rpc;
+
+       LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+                                  crpc_bulk.bk_iovs[nbulkiov]));
+       if (rpc == NULL)
+               return NULL;
+
+       srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+                            bulklen, rpc_done, rpc_fini, priv);
+       return rpc;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+       LASSERT (why != 0);
+
+       if (rpc->crpc_aborted || /* already aborted */
+           rpc->crpc_closed)    /* callback imminent */
+               return;
+
+       CDEBUG (D_NET,
+               "Aborting RPC: service %d, peer %s, state %s, why %d\n",
+               rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+               swi_state2str(rpc->crpc_wi.swi_state), why);
+
+       rpc->crpc_aborted = 1;
+       rpc->crpc_status  = why;
+       swi_schedule_workitem(&rpc->crpc_wi);
+       return;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT (!rpc->crpc_aborted);
+       LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+       CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+               libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+               rpc->crpc_timeout);
+
+       srpc_add_client_rpc_timer(rpc);
+       swi_schedule_workitem(&rpc->crpc_wi);
+       return;
+}
+
+
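+/* server side: repost the request buffer (unless the service is shutting
+ * down or is a framework service) and PUT the reply to the matchbits the
+ * client advertised in its request */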
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+       srpc_event_t            *ev = &rpc->srpc_ev;
+       struct srpc_msg         *msg = &rpc->srpc_replymsg;
+       struct srpc_buffer      *buffer = rpc->srpc_reqstbuf;
+       struct srpc_service_cd  *scd = rpc->srpc_scd;
+       struct srpc_service     *sv = scd->scd_svc;
+       __u64                   rpyid;
+       int                     rc;
+
+       LASSERT(buffer != NULL);
+       rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+
+       spin_lock(&scd->scd_lock);
+
+       if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+               /* Repost buffer before replying since test client
+                * might send me another RPC once it gets the reply */
+               if (srpc_service_post_buffer(scd, buffer) != 0)
+                       CWARN("Failed to repost %s buffer\n", sv->sv_name);
+               rpc->srpc_reqstbuf = NULL;
+       }
+
+       spin_unlock(&scd->scd_lock);
+
+       ev->ev_fired = 0;
+       ev->ev_data  = rpc;
+       ev->ev_type  = SRPC_REPLY_SENT;
+
+       msg->msg_magic   = SRPC_MSG_MAGIC;
+       msg->msg_version = SRPC_MSG_VERSION;
+       msg->msg_type    = srpc_service2reply(sv->sv_id);
+
+       rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+                                  sizeof(*msg), LNET_MD_OP_PUT,
+                                  rpc->srpc_peer, rpc->srpc_self,
+                                  &rpc->srpc_replymdh, ev);
+       if (rc != 0)
+               ev->ev_fired = 1;  /* no more event expected */
+       return rc;
+}
+
+/* in the kernel this is always called with LNET_LOCK() held, and in
+ * thread context */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+       struct srpc_service_cd  *scd;
+       srpc_event_t      *rpcev = ev->md.user_ptr;
+       srpc_client_rpc_t *crpc;
+       srpc_server_rpc_t *srpc;
+       srpc_buffer_t     *buffer;
+       srpc_service_t    *sv;
+       srpc_msg_t      *msg;
+       srpc_msg_type_t    type;
+
+       LASSERT (!in_interrupt());
+
+       if (ev->status != 0) {
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.errors++;
+               spin_unlock(&srpc_data.rpc_glock);
+       }
+
+       rpcev->ev_lnet = ev->type;
+
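+       /* demultiplex on the srpc-level event type recorded when the MD was
+        * posted; NB deliberate fall-throughs: SRPC_REQUEST_SENT joins the
+        * client-event cases below it, and the SRPC_BULK_* cases join
+        * SRPC_REPLY_SENT */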
+       switch (rpcev->ev_type) {
+       default:
+               CERROR("Unknown event: status %d, type %d, lnet %d\n",
+                      rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+               LBUG ();
+       case SRPC_REQUEST_SENT:
+               if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+                       spin_lock(&srpc_data.rpc_glock);
+                       srpc_data.rpc_counters.rpcs_sent++;
+                       spin_unlock(&srpc_data.rpc_glock);
+               }
+       case SRPC_REPLY_RCVD:
+       case SRPC_BULK_REQ_RCVD:
+               crpc = rpcev->ev_data;
+
+               if (rpcev != &crpc->crpc_reqstev &&
+                   rpcev != &crpc->crpc_replyev &&
+                   rpcev != &crpc->crpc_bulkev) {
+                       CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+                              rpcev, crpc, &crpc->crpc_reqstev,
+                              &crpc->crpc_replyev, &crpc->crpc_bulkev);
+                       CERROR("Bad event: status %d, type %d, lnet %d\n",
+                              rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+                       LBUG ();
+               }
+
+               spin_lock(&crpc->crpc_lock);
+
+               LASSERT(rpcev->ev_fired == 0);
+               rpcev->ev_fired  = 1;
+               rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+                                               -EINTR : ev->status;
+               swi_schedule_workitem(&crpc->crpc_wi);
+
+               spin_unlock(&crpc->crpc_lock);
+               break;
+
+       case SRPC_REQUEST_RCVD:
+               scd = rpcev->ev_data;
+               sv = scd->scd_svc;
+
+               LASSERT(rpcev == &scd->scd_ev);
+
+               spin_lock(&scd->scd_lock);
+
+               LASSERT (ev->unlinked);
+               LASSERT (ev->type == LNET_EVENT_PUT ||
+                        ev->type == LNET_EVENT_UNLINK);
+               LASSERT (ev->type != LNET_EVENT_UNLINK ||
+                        sv->sv_shuttingdown);
+
+               buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+               buffer->buf_peer = ev->initiator;
+               buffer->buf_self = ev->target.nid;
+
+               LASSERT(scd->scd_buf_nposted > 0);
+               scd->scd_buf_nposted--;
+
+               if (sv->sv_shuttingdown) {
+                       /* Leave buffer on scd->scd_buf_posted since
+                        * srpc_finish_service needs to traverse it. */
+                       spin_unlock(&scd->scd_lock);
+                       break;
+               }
+
+               if (scd->scd_buf_err_stamp != 0 &&
+                   scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+                       /* re-enable adding buffer */
+                       scd->scd_buf_err_stamp = 0;
+                       scd->scd_buf_err = 0;
+               }
+
+               if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+                   scd->scd_buf_adjust == 0 &&
+                   scd->scd_buf_nposted < scd->scd_buf_low) {
+                       scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+                                                 SFW_TEST_WI_MIN);
+                       swi_schedule_workitem(&scd->scd_buf_wi);
+               }
+
+               list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+               msg = &buffer->buf_msg;
+               type = srpc_service2request(sv->sv_id);
+
+               if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+                   (msg->msg_type != type &&
+                    msg->msg_type != __swab32(type)) ||
+                   (msg->msg_magic != SRPC_MSG_MAGIC &&
+                    msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+                       CERROR ("Dropping RPC (%s) from %s: "
+                               "status %d mlength %d type %u magic %u.\n",
+                               sv->sv_name, libcfs_id2str(ev->initiator),
+                               ev->status, ev->mlength,
+                               msg->msg_type, msg->msg_magic);
+
+                       /* NB can't call srpc_service_recycle_buffer here since
+                        * it may call LNetM[DE]Attach. The invalid magic tells
+                        * srpc_handle_rpc to drop this RPC */
+                       msg->msg_magic = 0;
+               }
+
+               if (!list_empty(&scd->scd_rpc_free)) {
+                       srpc = list_entry(scd->scd_rpc_free.next,
+                                             struct srpc_server_rpc,
+                                             srpc_list);
+                       list_del(&srpc->srpc_list);
+
+                       srpc_init_server_rpc(srpc, scd, buffer);
+                       list_add_tail(&srpc->srpc_list,
+                                         &scd->scd_rpc_active);
+                       swi_schedule_workitem(&srpc->srpc_wi);
+               } else {
+                       list_add_tail(&buffer->buf_list,
+                                         &scd->scd_buf_blocked);
+               }
+
+               spin_unlock(&scd->scd_lock);
+
+               spin_lock(&srpc_data.rpc_glock);
+               srpc_data.rpc_counters.rpcs_rcvd++;
+               spin_unlock(&srpc_data.rpc_glock);
+               break;
+
+       case SRPC_BULK_GET_RPLD:
+               LASSERT (ev->type == LNET_EVENT_SEND ||
+                        ev->type == LNET_EVENT_REPLY ||
+                        ev->type == LNET_EVENT_UNLINK);
+
+               if (!ev->unlinked)
+                       break; /* wait for final event */
+
+       case SRPC_BULK_PUT_SENT:
+               if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+                       spin_lock(&srpc_data.rpc_glock);
+
+                       if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+                               srpc_data.rpc_counters.bulk_get += ev->mlength;
+                       else
+                               srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+                       spin_unlock(&srpc_data.rpc_glock);
+               }
+       case SRPC_REPLY_SENT:
+               srpc = rpcev->ev_data;
+               scd  = srpc->srpc_scd;
+
+               LASSERT(rpcev == &srpc->srpc_ev);
+
+               spin_lock(&scd->scd_lock);
+
+               rpcev->ev_fired  = 1;
+               rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+                                  -EINTR : ev->status;
+               swi_schedule_workitem(&srpc->srpc_wi);
+
+               spin_unlock(&scd->scd_lock);
+               break;
+       }
+}
+
+
+int
+srpc_startup (void)
+{
+       int rc;
+
+       memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+       spin_lock_init(&srpc_data.rpc_glock);
+
+       /* 1 second pause to avoid timestamp reuse */
+       cfs_pause(cfs_time_seconds(1));
+       srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
+
+       srpc_data.rpc_state = SRPC_STATE_NONE;
+
+       rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+       if (rc < 0) {
+               CERROR ("LNetNIInit() has failed: %d\n", rc);
+               return rc;
+       }
+
+       srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+       LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+       rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+       if (rc != 0) {
+               CERROR("LNetEQAlloc() has failed: %d\n", rc);
+               goto bail;
+       }
+
+       rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+       LASSERT(rc == 0);
+       rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+       LASSERT(rc == 0);
+
+       srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+       rc = stt_startup();
+
+bail:
+       if (rc != 0)
+               srpc_shutdown();
+       else
+               srpc_data.rpc_state = SRPC_STATE_RUNNING;
+
+       return rc;
+}
+
+void
+srpc_shutdown (void)
+{
+       int i;
+       int rc;
+       int state;
+
+       state = srpc_data.rpc_state;
+       srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+       switch (state) {
+       default:
+               LBUG ();
+       case SRPC_STATE_RUNNING:
+               spin_lock(&srpc_data.rpc_glock);
+
+               for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+                       srpc_service_t *sv = srpc_data.rpc_services[i];
+
+                       LASSERTF (sv == NULL,
+                                 "service not empty: id %d, name %s\n",
+                                 i, sv->sv_name);
+               }
+
+               spin_unlock(&srpc_data.rpc_glock);
+
+               stt_shutdown();
+
+       case SRPC_STATE_EQ_INIT:
+               rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+               rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+               LASSERT (rc == 0);
+               rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+               LASSERT (rc == 0); /* the EQ should have no user by now */
+
+       case SRPC_STATE_NI_INIT:
+               LNetNIFini();
+       }
+
+       return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644 (file)
index 0000000..b905d49
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ */
+typedef enum {
+       SRPC_MSG_MKSN_REQST     = 0,
+       SRPC_MSG_MKSN_REPLY     = 1,
+       SRPC_MSG_RMSN_REQST     = 2,
+       SRPC_MSG_RMSN_REPLY     = 3,
+       SRPC_MSG_BATCH_REQST    = 4,
+       SRPC_MSG_BATCH_REPLY    = 5,
+       SRPC_MSG_STAT_REQST     = 6,
+       SRPC_MSG_STAT_REPLY     = 7,
+       SRPC_MSG_TEST_REQST     = 8,
+       SRPC_MSG_TEST_REPLY     = 9,
+       SRPC_MSG_DEBUG_REQST    = 10,
+       SRPC_MSG_DEBUG_REPLY    = 11,
+       SRPC_MSG_BRW_REQST      = 12,
+       SRPC_MSG_BRW_REPLY      = 13,
+       SRPC_MSG_PING_REQST     = 14,
+       SRPC_MSG_PING_REPLY     = 15,
+       SRPC_MSG_JOIN_REQST     = 16,
+       SRPC_MSG_JOIN_REPLY     = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * The 1st field of every srpc_*_reqst_t must be the matchbits of its reply
+ * buffer, and the 2nd field the matchbits of its bulk buffer, if any.
+ *
+ * The 1st field of every srpc_*_reply_t must be a __u32 status, and the
+ * 2nd field the session id, if needed.
+ */
+typedef struct {
+       __u64                   rpyid;          /* reply buffer matchbits */
+       __u64                   bulkid;         /* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+       __u32              status;
+       lst_sid_t              sid;
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+       __u64                   mksn_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              mksn_sid;        /* session id */
+       __u32                   mksn_force;      /* use brute force */
+       char                    mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t;                 /* make session request */
+
+typedef struct {
+       __u32              mksn_status;      /* session status */
+       lst_sid_t              mksn_sid;         /* session id */
+       __u32              mksn_timeout;     /* session timeout */
+       char                    mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+       __u64                   rmsn_rpyid;      /* reply buffer matchbits */
+       lst_sid_t               rmsn_sid;       /* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+       __u32                   rmsn_status;
+       lst_sid_t               rmsn_sid;       /* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+       __u64                   join_rpyid;     /* reply buffer matchbits */
+       lst_sid_t              join_sid;       /* session id to join */
+       char                join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+       __u32              join_status;    /* returned status */
+       lst_sid_t              join_sid;       /* session id */
+       __u32                   join_timeout;   /* # seconds' inactivity to expire */
+       char                join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+       __u64              dbg_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              dbg_sid; /* session id */
+       __u32              dbg_flags;      /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+       __u32              dbg_status;     /* returned code */
+       lst_sid_t              dbg_sid; /* session id */
+       __u32              dbg_timeout;    /* session timeout */
+       __u32              dbg_nbatch;     /* # of batches in the node */
+       char                dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN      1
+#define SRPC_BATCH_OPC_STOP     2
+#define SRPC_BATCH_OPC_QUERY    3
+
+typedef struct {
+       __u64              bar_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              bar_sid; /* session id */
+       lst_bid_t              bar_bid; /* batch id */
+       __u32              bar_opc;     /* create/start/stop batch */
+       __u32              bar_testidx;    /* index of test */
+       __u32              bar_arg;     /* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+       __u32              bar_status;     /* status of request */
+       lst_sid_t              bar_sid; /* session id */
+       __u32              bar_active;     /* # of active tests in batch/test */
+       __u32              bar_time;       /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+       __u64              str_rpyid;      /* reply buffer matchbits */
+       lst_sid_t              str_sid; /* session id */
+       __u32              str_type;       /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+       __u32              str_status;
+       lst_sid_t              str_sid;
+       sfw_counters_t    str_fw;
+       srpc_counters_t  str_rpc;
+       lnet_counters_t  str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+       __u32              blk_opc;     /* bulk operation code */
+       __u32              blk_npg;     /* # of pages */
+       __u32              blk_flags;      /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+       /** bulk operation code */
+       __u16                   blk_opc;
+       /** data check flags */
+       __u16                   blk_flags;
+       /** data length */
+       __u32                   blk_len;
+       /** reserved: offset */
+       __u32              blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+       __u32                   png_size;       /* size of ping message */
+       __u32                   png_flags;      /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+       __u64                   tsr_rpyid;      /* reply buffer matchbits */
+       __u64                   tsr_bulkid;     /* bulk buffer matchbits */
+       lst_sid_t               tsr_sid;        /* session id */
+       lst_bid_t               tsr_bid;        /* batch id */
+       __u32                   tsr_service;    /* test type: bulk|ping|... */
+       /* test client loop count or # server buffers needed */
+       __u32                   tsr_loop;
+       __u32                   tsr_concur;     /* concurrency of test */
+       __u8                    tsr_is_client;  /* is test client or not */
+       __u8                    tsr_stop_onerr; /* stop on error */
+       __u32                   tsr_ndest;      /* # of dest nodes */
+
+       union {
+               test_ping_req_t         ping;
+               test_bulk_req_t         bulk_v0;
+               test_bulk_req_v1_t      bulk_v1;
+       }               tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+       __u32                   tsr_status;     /* returned code */
+       lst_sid_t               tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+       __u64              pnr_rpyid;
+       __u32              pnr_magic;
+       __u32              pnr_seq;
+       __u64              pnr_time_sec;
+       __u64              pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+       __u32              pnr_status;
+       __u32              pnr_magic;
+       __u32              pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+       __u64              brw_rpyid;      /* reply buffer matchbits */
+       __u64              brw_bulkid;     /* bulk buffer matchbits */
+       __u32              brw_rw;       /* read or write */
+       __u32              brw_len;     /* bulk data len */
+       __u32              brw_flags;      /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+       __u32              brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC           0xeeb0f00d
+#define SRPC_MSG_VERSION               1
+
+typedef struct srpc_msg {
+       /** magic number */
+       __u32   msg_magic;
+       /** message version number */
+       __u32   msg_version;
+       /** type of message body: srpc_msg_type_t */
+       __u32   msg_type;
+       __u32   msg_reserved0;
+       __u32   msg_reserved1;
+       /** test session features */
+       __u32   msg_ses_feats;
+       union {
+               srpc_generic_reqst_t reqst;
+               srpc_generic_reply_t reply;
+
+               srpc_mksn_reqst_t    mksn_reqst;
+               srpc_mksn_reply_t    mksn_reply;
+               srpc_rmsn_reqst_t    rmsn_reqst;
+               srpc_rmsn_reply_t    rmsn_reply;
+               srpc_debug_reqst_t   dbg_reqst;
+               srpc_debug_reply_t   dbg_reply;
+               srpc_batch_reqst_t   bat_reqst;
+               srpc_batch_reply_t   bat_reply;
+               srpc_stat_reqst_t    stat_reqst;
+               srpc_stat_reply_t    stat_reply;
+               srpc_test_reqst_t    tes_reqst;
+               srpc_test_reply_t    tes_reply;
+               srpc_join_reqst_t    join_reqst;
+               srpc_join_reply_t    join_reply;
+
+               srpc_ping_reqst_t    ping_reqst;
+               srpc_ping_reply_t    ping_reply;
+               srpc_brw_reqst_t     brw_reqst;
+               srpc_brw_reply_t     brw_reply;
+       }     msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+       if (msg->msg_magic == SRPC_MSG_MAGIC)
+               return; /* no flipping needed */
+
+       /* We do not swap the magic number here as it is needed to
+          determine whether the body needs to be swapped. */
+       /* __swab32s(&msg->msg_magic); */
+       __swab32s(&msg->msg_type);
+       __swab32s(&msg->msg_version);
+       __swab32s(&msg->msg_ses_feats);
+       __swab32s(&msg->msg_reserved0);
+       __swab32s(&msg->msg_reserved1);
+}
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644 (file)
index 0000000..8053b05
--- /dev/null
@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN                0
+#define SWI_STATE_REPLY_SUBMITTED        1
+#define SWI_STATE_REPLY_SENT          2
+#define SWI_STATE_REQUEST_SUBMITTED    3
+#define SWI_STATE_REQUEST_SENT      4
+#define SWI_STATE_REPLY_RECEIVED          5
+#define SWI_STATE_BULK_STARTED      6
+#define SWI_STATE_DONE              10
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG           0
+#define SRPC_SERVICE_MAKE_SESSION       1
+#define SRPC_SERVICE_REMOVE_SESSION     2
+#define SRPC_SERVICE_BATCH           3
+#define SRPC_SERVICE_TEST             4
+#define SRPC_SERVICE_QUERY_STAT         5
+#define SRPC_SERVICE_JOIN             6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID   10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW               11
+#define SRPC_SERVICE_PING             12
+#define SRPC_SERVICE_MAX_ID         12
+
+#define SRPC_REQUEST_PORTAL         50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL   51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL               52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+       switch (service) {
+       default:
+               LBUG ();
+       case SRPC_SERVICE_DEBUG:
+               return SRPC_MSG_DEBUG_REQST;
+
+       case SRPC_SERVICE_MAKE_SESSION:
+               return SRPC_MSG_MKSN_REQST;
+
+       case SRPC_SERVICE_REMOVE_SESSION:
+               return SRPC_MSG_RMSN_REQST;
+
+       case SRPC_SERVICE_BATCH:
+               return SRPC_MSG_BATCH_REQST;
+
+       case SRPC_SERVICE_TEST:
+               return SRPC_MSG_TEST_REQST;
+
+       case SRPC_SERVICE_QUERY_STAT:
+               return SRPC_MSG_STAT_REQST;
+
+       case SRPC_SERVICE_BRW:
+               return SRPC_MSG_BRW_REQST;
+
+       case SRPC_SERVICE_PING:
+               return SRPC_MSG_PING_REQST;
+
+       case SRPC_SERVICE_JOIN:
+               return SRPC_MSG_JOIN_REQST;
+       }
+}
+
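+/* a reply type is defined to be its request type + 1, see srpc_msg_type_t
+ * in rpc.h */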
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+       return srpc_service2request(service) + 1;
+}
+
+typedef enum {
+       SRPC_BULK_REQ_RCVD   = 1, /* passive bulk request (PUT sink/GET source) received */
+       SRPC_BULK_PUT_SENT   = 2, /* active bulk PUT sent (source) */
+       SRPC_BULK_GET_RPLD   = 3, /* active bulk GET replied (sink) */
+       SRPC_REPLY_RCVD      = 4, /* incoming reply received */
+       SRPC_REPLY_SENT      = 5, /* outgoing reply sent */
+       SRPC_REQUEST_RCVD    = 6, /* incoming request received */
+       SRPC_REQUEST_SENT    = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event */
+typedef struct {
+       srpc_event_type_t ev_type;   /* what's up */
+       lnet_event_kind_t ev_lnet;   /* LNet event type */
+       int            ev_fired;  /* LNet event fired? */
+       int            ev_status; /* LNet event status */
+       void         *ev_data;   /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+       int           bk_len;  /* len of bulk data */
+       lnet_handle_md_t bk_mdh;
+       int           bk_sink; /* sink/source */
+       int           bk_niov; /* # iov in bk_iovs */
+       lnet_kiov_t      bk_iovs[0];
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+       struct list_head           buf_list; /* chain on srpc_service::*_msgq */
+       srpc_msg_t         buf_msg;
+       lnet_handle_md_t     buf_mdh;
+       lnet_nid_t         buf_self;
+       lnet_process_id_t    buf_peer;
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *);
+
+typedef struct swi_workitem {
+       struct cfs_wi_sched     *swi_sched;
+       cfs_workitem_t       swi_workitem;
+       swi_action_t     swi_action;
+       int               swi_state;
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+       /* chain on srpc_service::*_rpcq */
+       struct list_head                srpc_list;
+       struct srpc_service_cd *srpc_scd;
+       swi_workitem_t       srpc_wi;
+       srpc_event_t     srpc_ev;      /* bulk/reply event */
+       lnet_nid_t         srpc_self;
+       lnet_process_id_t    srpc_peer;
+       srpc_msg_t         srpc_replymsg;
+       lnet_handle_md_t     srpc_replymdh;
+       srpc_buffer_t       *srpc_reqstbuf;
+       srpc_bulk_t      *srpc_bulk;
+
+       unsigned int     srpc_aborted; /* being given up */
+       int               srpc_status;
+       void           (*srpc_done)(struct srpc_server_rpc *);
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC */
+typedef struct srpc_client_rpc {
+       struct list_head                crpc_list;      /* chain on user's lists */
+       spinlock_t              crpc_lock;      /* serialize */
+       int               crpc_service;
+       atomic_t         crpc_refcount;
+       int               crpc_timeout; /* # seconds to wait for reply */
+       stt_timer_t       crpc_timer;
+       swi_workitem_t       crpc_wi;
+       lnet_process_id_t    crpc_dest;
+
+       void           (*crpc_done)(struct srpc_client_rpc *);
+       void           (*crpc_fini)(struct srpc_client_rpc *);
+       int               crpc_status;    /* completion status */
+       void            *crpc_priv;      /* caller data */
+
+       /* state flags */
+       unsigned int     crpc_aborted:1; /* being given up */
+       unsigned int     crpc_closed:1;  /* completed */
+
+       /* RPC events */
+       srpc_event_t     crpc_bulkev;    /* bulk event */
+       srpc_event_t     crpc_reqstev;   /* request event */
+       srpc_event_t     crpc_replyev;   /* reply event */
+
+       /* bulk, request(reqst), and reply exchanged on wire */
+       srpc_msg_t         crpc_reqstmsg;
+       srpc_msg_t         crpc_replymsg;
+       lnet_handle_md_t     crpc_reqstmdh;
+       lnet_handle_md_t     crpc_replymdh;
+       srpc_bulk_t       crpc_bulk;
+} srpc_client_rpc_t;
+
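+/* true allocation size of a client RPC: the struct itself plus the inline
+ * bk_iovs[] array sized by crpc_bulk.bk_niov */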
+#define srpc_client_rpc_size(rpc)                                     \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])
+
+#define srpc_client_rpc_addref(rpc)                                 \
+do {                                                               \
+       CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n",                  \
+              (rpc), libcfs_id2str((rpc)->crpc_dest),            \
+              atomic_read(&(rpc)->crpc_refcount));              \
+       LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);            \
+       atomic_inc(&(rpc)->crpc_refcount);                        \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc)                                 \
+do {                                                               \
+       CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n",                  \
+              (rpc), libcfs_id2str((rpc)->crpc_dest),            \
+              atomic_read(&(rpc)->crpc_refcount));              \
+       LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);            \
+       if (atomic_dec_and_test(&(rpc)->crpc_refcount))      \
+               srpc_destroy_client_rpc(rpc);                      \
+} while (0)
+
+#define srpc_event_pending(rpc)   ((rpc)->crpc_bulkev.ev_fired == 0 ||  \
+                                  (rpc)->crpc_reqstev.ev_fired == 0 || \
+                                  (rpc)->crpc_replyev.ev_fired == 0)
+
+/* CPU partition data of srpc service */
+struct srpc_service_cd {
+       /** serialize */
+       spinlock_t              scd_lock;
+       /** backref to service */
+       struct srpc_service     *scd_svc;
+       /** event buffer */
+       srpc_event_t            scd_ev;
+       /** free RPC descriptors */
+       struct list_head                scd_rpc_free;
+       /** in-flight RPCs */
+       struct list_head                scd_rpc_active;
+       /** workitem for posting buffer */
+       swi_workitem_t          scd_buf_wi;
+       /** CPT id */
+       int                     scd_cpt;
+       /** error code for scd_buf_wi */
+       int                     scd_buf_err;
+       /** timestamp for scd_buf_err */
+       unsigned long      scd_buf_err_stamp;
+       /** total # request buffers */
+       int                     scd_buf_total;
+       /** # posted request buffers */
+       int                     scd_buf_nposted;
+       /** in progress of buffer posting */
+       int                     scd_buf_posting;
+       /** allocate more buffers if scd_buf_nposted < scd_buf_low */
+       int                     scd_buf_low;
+       /** increase/decrease some buffers */
+       int                     scd_buf_adjust;
+       /** posted message buffers */
+       struct list_head                scd_buf_posted;
+       /** blocked for RPC descriptor */
+       struct list_head                scd_buf_blocked;
+};
+
+/* number of server workitems (mini-threads) for the testing service */
+#define SFW_TEST_WI_MIN                256
+#define SFW_TEST_WI_MAX                2048
+/* extra buffers for tolerating buggy peers, or an unbalanced number
+ * of peers between partitions */
+#define SFW_TEST_WI_EXTRA      64
+
+/* number of server workitems (mini-threads) for the framework service */
+#define SFW_FRWK_WI_MIN                16
+#define SFW_FRWK_WI_MAX                256
+
+typedef struct srpc_service {
+       int                     sv_id;          /* service id */
+       const char              *sv_name;       /* human readable name */
+       int                     sv_wi_total;    /* total server workitems */
+       int                     sv_shuttingdown;
+       int                     sv_ncpts;
+       /* percpt data for srpc_service */
+       struct srpc_service_cd  **sv_cpt_data;
+       /* Service callbacks:
+        * - sv_handler: process incoming RPC request
+        * - sv_bulk_ready: notify bulk data
+        */
+       int           (*sv_handler) (srpc_server_rpc_t *);
+       int           (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+       struct list_head        sn_list;    /* chain on fw_zombie_sessions */
+       lst_sid_t        sn_id;      /* unique identifier */
+       unsigned int      sn_timeout; /* # seconds' inactivity to expire */
+       int            sn_timer_active;
+       unsigned int      sn_features;
+       stt_timer_t       sn_timer;
+       struct list_head        sn_batches; /* list of batches */
+       char          sn_name[LST_NAME_SIZE];
+       atomic_t      sn_refcount;
+       atomic_t      sn_brw_errors;
+       atomic_t      sn_ping_errors;
+       cfs_time_t      sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
+                                      (sid0).ses_stamp == (sid1).ses_stamp)
+
+typedef struct {
+       struct list_head        bat_list;      /* chain on sn_batches */
+       lst_bid_t        bat_id;        /* batch id */
+       int            bat_error;     /* error code of batch */
+       sfw_session_t    *bat_session;   /* batch's session */
+       atomic_t      bat_nactive;   /* # of active tests */
+       struct list_head        bat_tests;     /* test instances */
+} sfw_batch_t;
+
+typedef struct {
+       int  (*tso_init)(struct sfw_test_instance *tsi); /* initialize test client */
+       void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+       int  (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+                            lnet_process_id_t dest,
+                            srpc_client_rpc_t **rpc);   /* prep a test rpc */
+       void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+                            srpc_client_rpc_t *rpc);    /* complete a test rpc */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+       struct list_head              tsi_list;  /* chain on batch */
+       int                  tsi_service;      /* test type */
+       sfw_batch_t         *tsi_batch; /* batch */
+       sfw_test_client_ops_t  *tsi_ops;          /* test client operations */
+
+       /* public parameter for all test units */
+       unsigned int            tsi_is_client:1;     /* is test client */
+       unsigned int            tsi_stoptsu_onerr:1; /* stop tsu on error */
+       int                  tsi_concur;          /* concurrency */
+       int                  tsi_loop;      /* loop count */
+
+       /* status of test instance */
+       spinlock_t              tsi_lock;         /* serialize */
+       unsigned int            tsi_stopping:1;   /* test is stopping */
+       atomic_t            tsi_nactive;      /* # of active test unit */
+       struct list_head              tsi_units;        /* test units */
+       struct list_head              tsi_free_rpcs;    /* free rpcs */
+       struct list_head              tsi_active_rpcs;  /* active rpcs */
+
+       union {
+               test_ping_req_t         ping;     /* ping parameter */
+               test_bulk_req_t         bulk_v0;  /* bulk parameter */
+               test_bulk_req_v1_t      bulk_v1;  /* bulk v1 parameter */
+       } tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_packed_t)) bytes
+ * at the end of each page are not used */
+#define SFW_MAX_CONCUR     LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS     (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
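+
+/*
+ * Worked example (assuming 4096-byte pages and a 12-byte packed id,
+ * i.e. a 64-bit nid plus a 32-bit pid): SFW_ID_PER_PAGE = 4096 / 12
+ * = 341 ids per page, so sfw_id_pages(1000) = (1000 + 340) / 341 = 3
+ * pages, with the trailing 4096 % 12 = 4 bytes of each page unused.
+ */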
+
+typedef struct sfw_test_unit {
+       struct list_head            tsu_list;    /* chain on lst_test_instance */
+       lnet_process_id_t     tsu_dest;  /* id of dest node */
+       int                tsu_loop;     /* loop count of the test */
+       sfw_test_instance_t  *tsu_instance;     /* pointer to test instance */
+       void             *tsu_private;      /* private data */
+       swi_workitem_t  tsu_worker;       /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+       struct list_head              tsc_list;  /* chain on fw_tests */
+       srpc_service_t   *tsc_srv_service;  /* test service */
+       sfw_test_client_ops_t  *tsc_cli_ops;      /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+              unsigned features, int nbulkiov, int bulklen,
+              void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+                       lnet_process_id_t peer, unsigned features,
+                       int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+                   int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+                      int nbulkiov, int bulklen,
+                      void (*rpc_done)(srpc_client_rpc_t *),
+                      void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+                            int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+       return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+       swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem);
+
+       return swi->swi_action(swi);
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+                 swi_action_t action, struct cfs_wi_sched *sched)
+{
+       swi->swi_sched  = sched;
+       swi->swi_action = action;
+       swi->swi_state  = SWI_STATE_NEWBORN;
+       cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+       cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem);
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+       cfs_wi_exit(swi->swi_sched, &swi->swi_workitem);
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+       return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem);
+}
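+
+/*
+ * Usage sketch for the helpers above (my_action, my_sched and the body
+ * of the action are hypothetical; the action's return value is passed
+ * straight through to the cfs workitem scheduler by swi_wi_action()):
+ *
+ *     static int my_action(struct swi_workitem *swi)
+ *     {
+ *             // advance swi->swi_state, reschedule or clean up
+ *             return 0;
+ *     }
+ *
+ *     swi_workitem_t wi;
+ *
+ *     swi_init_workitem(&wi, &wi, my_action, my_sched);
+ *     swi_schedule_workitem(&wi);
+ */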
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+       LASSERT (rpc != NULL);
+       LASSERT (!srpc_event_pending(rpc));
+       LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+       if (rpc->crpc_fini == NULL) {
+               LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+       } else {
+               (*rpc->crpc_fini) (rpc);
+       }
+
+       return;
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+                     int service, int nbulkiov, int bulklen,
+                     void (*rpc_done)(srpc_client_rpc_t *),
+                     void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+       LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+       memset(rpc, 0, offsetof(srpc_client_rpc_t,
+                               crpc_bulk.bk_iovs[nbulkiov]));
+
+       INIT_LIST_HEAD(&rpc->crpc_list);
+       swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+                         lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+       spin_lock_init(&rpc->crpc_lock);
+       atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+
+       rpc->crpc_dest   = peer;
+       rpc->crpc_priv   = priv;
+       rpc->crpc_service      = service;
+       rpc->crpc_bulk.bk_len  = bulklen;
+       rpc->crpc_bulk.bk_niov = nbulkiov;
+       rpc->crpc_done   = rpc_done;
+       rpc->crpc_fini   = rpc_fini;
+       LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+       LNetInvalidateHandle(&rpc->crpc_replymdh);
+       LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+
+       /* no event is expected at this point */
+       rpc->crpc_bulkev.ev_fired  =
+       rpc->crpc_reqstev.ev_fired =
+       rpc->crpc_replyev.ev_fired = 1;
+
+       rpc->crpc_reqstmsg.msg_magic   = SRPC_MSG_MAGIC;
+       rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+       rpc->crpc_reqstmsg.msg_type    = srpc_service2request(service);
+       return;
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+       switch (state) {
+               default:
+                       LBUG();
+               STATE2STR(SWI_STATE_NEWBORN);
+               STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+               STATE2STR(SWI_STATE_REPLY_SENT);
+               STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+               STATE2STR(SWI_STATE_REQUEST_SENT);
+               STATE2STR(SWI_STATE_REPLY_RECEIVED);
+               STATE2STR(SWI_STATE_BULK_STARTED);
+               STATE2STR(SWI_STATE_DONE);
+       }
+#undef STATE2STR
+}
+
+#define UNUSED(x)       ( (void)(x) )
+
+
+#define selftest_wait_events() cfs_pause(cfs_time_seconds(1) / 10)
+
+
+#define lst_wait_until(cond, lock, fmt, ...)                           \
+do {                                                                   \
+       int __I = 2;                                                    \
+       while (!(cond)) {                                               \
+               CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET,               \
+                      fmt, ## __VA_ARGS__);                            \
+               spin_unlock(&(lock));                                   \
+                                                                       \
+               selftest_wait_events();                                 \
+                                                                       \
+               spin_lock(&(lock));                                     \
+       }                                                               \
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+       int i = 2;
+
+       LASSERT(sv->sv_shuttingdown);
+
+       while (srpc_finish_service(sv) == 0) {
+               i++;
+               CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+                       "Waiting for %s service to shutdown...\n",
+                       sv->sv_name);
+               selftest_wait_events();
+       }
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644 (file)
index 0000000..2c07855
--- /dev/null
@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL        3   /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME       (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK   (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS        (1 << 7)
+#define STTIMER_SLOT(t)               (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+                                                   (STTIMER_NSLOTS - 1))])
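+
+/*
+ * For example, a timer expiring at t = 1000 s hashes to slot
+ * (1000 >> 3) & 127 = 125, while t = 1030 s gives (1030 >> 3) & 127 =
+ * 128 & 127 = 0; the hash wraps around after 128 * 8 = 1024 seconds.
+ */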
+
+struct st_timer_data {
+       spinlock_t       stt_lock;
+       /* start time of the slot processed previously */
+       cfs_time_t       stt_prev_slot;
+       struct list_head       stt_hash[STTIMER_NSLOTS];
+       int           stt_shuttingdown;
+       wait_queue_head_t      stt_waitq;
+       int           stt_nthreads;
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+       struct list_head *pos;
+
+       spin_lock(&stt_data.stt_lock);
+
+       LASSERT (stt_data.stt_nthreads > 0);
+       LASSERT (!stt_data.stt_shuttingdown);
+       LASSERT (timer->stt_func != NULL);
+       LASSERT (list_empty(&timer->stt_list));
+       LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+       /* a simple insertion sort */
+       list_for_each_prev (pos, STTIMER_SLOT(timer->stt_expires)) {
+               stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list);
+
+               if (cfs_time_aftereq(timer->stt_expires, old->stt_expires))
+                       break;
+       }
+       list_add(&timer->stt_list, pos);
+
+       spin_unlock(&stt_data.stt_lock);
+}
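+
+/*
+ * Minimal usage sketch (my_expire and my_cookie are hypothetical; the
+ * fields are declared in timer.h and the list head must be empty when
+ * the timer is armed):
+ *
+ *     stt_timer_t t = {
+ *             .stt_expires = cfs_time_current_sec() + 30,
+ *             .stt_func    = my_expire,
+ *             .stt_data    = my_cookie,
+ *     };
+ *
+ *     INIT_LIST_HEAD(&t.stt_list);
+ *     stt_add_timer(&t);
+ */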
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (i.e. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+       int ret = 0;
+
+       spin_lock(&stt_data.stt_lock);
+
+       LASSERT (stt_data.stt_nthreads > 0);
+       LASSERT (!stt_data.stt_shuttingdown);
+
+       if (!list_empty(&timer->stt_list)) {
+               ret = 1;
+               list_del_init(&timer->stt_list);
+       }
+
+       spin_unlock(&stt_data.stt_lock);
+       return ret;
+}
+
+/* called with stt_data.stt_lock held */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+       int       expired = 0;
+       stt_timer_t *timer;
+
+       while (!list_empty(slot)) {
+               timer = list_entry(slot->next, stt_timer_t, stt_list);
+
+               if (cfs_time_after(timer->stt_expires, now))
+                       break;
+
+               list_del_init(&timer->stt_list);
+               spin_unlock(&stt_data.stt_lock);
+
+               expired++;
+               (*timer->stt_func) (timer->stt_data);
+
+               spin_lock(&stt_data.stt_lock);
+       }
+
+       return expired;
+}
+
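+/*
+ * Walk every slot from the one containing the current time back down
+ * to the slot processed on the previous call (*last), firing expired
+ * timers along the way, then record the new slot start in *last.
+ * Returns the number of timers that fired.
+ */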
+int
+stt_check_timers (cfs_time_t *last)
+{
+       int     expired = 0;
+       cfs_time_t now;
+       cfs_time_t this_slot;
+
+       now = cfs_time_current_sec();
+       this_slot = now & STTIMER_SLOTTIMEMASK;
+
+       spin_lock(&stt_data.stt_lock);
+
+       while (cfs_time_aftereq(this_slot, *last)) {
+               expired += stt_expire_list(STTIMER_SLOT(this_slot), now);
+               this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME);
+       }
+
+       *last = now & STTIMER_SLOTTIMEMASK;
+       spin_unlock(&stt_data.stt_lock);
+       return expired;
+}
+
+
+int
+stt_timer_main (void *arg)
+{
+       int rc = 0;
+       UNUSED(arg);
+
+       SET_BUT_UNUSED(rc);
+
+       cfs_block_allsigs();
+
+       while (!stt_data.stt_shuttingdown) {
+               stt_check_timers(&stt_data.stt_prev_slot);
+
+               rc = wait_event_timeout(stt_data.stt_waitq,
+                                       stt_data.stt_shuttingdown,
+                                       cfs_time_seconds(STTIMER_SLOTTIME));
+       }
+
+       spin_lock(&stt_data.stt_lock);
+       stt_data.stt_nthreads--;
+       spin_unlock(&stt_data.stt_lock);
+       return 0;
+}
+
+int
+stt_start_timer_thread (void)
+{
+       task_t *task;
+
+       LASSERT(!stt_data.stt_shuttingdown);
+
+       task = kthread_run(stt_timer_main, NULL, "st_timer");
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+
+       spin_lock(&stt_data.stt_lock);
+       stt_data.stt_nthreads++;
+       spin_unlock(&stt_data.stt_lock);
+       return 0;
+}
+
+
+int
+stt_startup (void)
+{
+       int rc = 0;
+       int i;
+
+       stt_data.stt_shuttingdown = 0;
+       stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+       spin_lock_init(&stt_data.stt_lock);
+       for (i = 0; i < STTIMER_NSLOTS; i++)
+               INIT_LIST_HEAD(&stt_data.stt_hash[i]);
+
+       stt_data.stt_nthreads = 0;
+       init_waitqueue_head(&stt_data.stt_waitq);
+       rc = stt_start_timer_thread();
+       if (rc != 0)
+               CERROR ("Can't spawn timer thread: %d\n", rc);
+
+       return rc;
+}
+
+void
+stt_shutdown (void)
+{
+       int i;
+
+       spin_lock(&stt_data.stt_lock);
+
+       for (i = 0; i < STTIMER_NSLOTS; i++)
+               LASSERT (list_empty(&stt_data.stt_hash[i]));
+
+       stt_data.stt_shuttingdown = 1;
+
+       wake_up(&stt_data.stt_waitq);
+       lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+                      "waiting for %d threads to terminate\n",
+                      stt_data.stt_nthreads);
+
+       spin_unlock(&stt_data.stt_lock);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644 (file)
index 0000000..56dbfe5
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+       struct list_head        stt_list;
+       cfs_time_t      stt_expires;
+       void        (*stt_func) (void *);
+       void         *stt_data;
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
new file mode 100644 (file)
index 0000000..ee91ae1
--- /dev/null
@@ -0,0 +1,51 @@
+config LUSTRE_FS
+       tristate "Lustre file system client support"
+       depends on STAGING && INET
+       select LNET
+       select CRYPTO
+       select CRYPTO_CRC32
+       select CRYPTO_CRC32_PCLMUL if X86
+       select CRYPTO_CRC32C
+       select CRYPTO_MD5
+       select CRYPTO_SHA1
+       select CRYPTO_SHA256
+       select CRYPTO_SHA512
+       help
+         This option enables Lustre file system client support. Choose Y
+         here if you want to access a Lustre file system cluster. To compile
+         this file system support as a module, choose M here: the module will
+         be called lustre.
+
+         To mount Lustre file systems, you also need to install the user space
+         mount.lustre and other user space commands which can be found in the
+         lustre-client package, available from
+         http://downloads.whamcloud.com/public/lustre/
+
+         Lustre is a cluster file system widely used in high performance
+         computing. Source code for both the kernel space and user space
+         Lustre components can also be found at
+         http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary
+
+         If unsure, say N.
+
+         See also http://wiki.lustre.org/
+
+config LUSTRE_OBD_MAX_IOCTL_BUFFER
+       int "Lustre obd max ioctl buffer bytes (default 8KB)"
+       depends on LUSTRE_FS
+       default 8192
+       help
+         This option defines the maximum buffer size, in bytes, that user
+         space applications can pass to the Lustre kernel modules through
+         the ioctl interface.
+
+         If unsure, use default.
+
+config LUSTRE_DEBUG_EXPENSIVE_CHECK
+       bool "Enable Lustre DEBUG checks"
+       depends on LUSTRE_FS
+       default n
+       help
+         This option is mainly for debugging. It enables the Lustre code to
+         do expensive checks that may have a performance impact.
+
+         Use with caution. If unsure, say N.
diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile
new file mode 100644 (file)
index 0000000..3fb94fc
--- /dev/null
@@ -0,0 +1,2 @@
+obj-$(CONFIG_LUSTRE_FS) := fid/ lvfs/ obdclass/ ptlrpc/ obdecho/ mgc/ lov/ \
+                          osc/ mdc/ lmv/ llite/ fld/ libcfs/
diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile
new file mode 100644 (file)
index 0000000..b8d6d21
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fid.o
+fid-y := fid_handler.o fid_store.o fid_request.o lproc_fid.o fid_lib.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fid/fid_handler.c b/drivers/staging/lustre/lustre/fid/fid_handler.c
new file mode 100644 (file)
index 0000000..bbbb3cf
--- /dev/null
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_handler.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+int client_fid_init(struct obd_device *obd,
+                   struct obd_export *exp, enum lu_cli_type type)
+{
+       struct client_obd *cli = &obd->u.cli;
+       char *prefix;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(cli->cl_seq);
+       if (cli->cl_seq == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+       if (prefix == NULL)
+               GOTO(out_free_seq, rc = -ENOMEM);
+
+       snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name);
+
+       /* Init client side sequence-manager */
+       rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL);
+       OBD_FREE(prefix, MAX_OBD_NAME + 5);
+       if (rc)
+               GOTO(out_free_seq, rc);
+
+       RETURN(rc);
+out_free_seq:
+       OBD_FREE_PTR(cli->cl_seq);
+       cli->cl_seq = NULL;
+       return rc;
+}
+EXPORT_SYMBOL(client_fid_init);
+
+int client_fid_fini(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       ENTRY;
+
+       if (cli->cl_seq != NULL) {
+               seq_client_fini(cli->cl_seq);
+               OBD_FREE_PTR(cli->cl_seq);
+               cli->cl_seq = NULL;
+       }
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(client_fid_fini);
+
+static void seq_server_proc_fini(struct lu_server_seq *seq);
+
+/* Assigns client to sequence controller node. */
+int seq_server_set_cli(struct lu_server_seq *seq,
+                      struct lu_client_seq *cli,
+                      const struct lu_env *env)
+{
+       int rc = 0;
+       ENTRY;
+
+       /*
+        * Asking the client for a new range, assigning that range to
+        * ->seq_space and writing the seq state to the backing store
+        * should be atomic.
+        */
+       mutex_lock(&seq->lss_mutex);
+
+       if (cli == NULL) {
+               CDEBUG(D_INFO, "%s: Detached sequence client %s\n",
+                      seq->lss_name, cli->lcs_name);
+               seq->lss_cli = cli;
+               GOTO(out_up, rc = 0);
+       }
+
+       if (seq->lss_cli != NULL) {
+               CDEBUG(D_HA, "%s: Sequence controller is already "
+                      "assigned\n", seq->lss_name);
+               GOTO(out_up, rc = -EEXIST);
+       }
+
+       CDEBUG(D_INFO, "%s: Attached sequence controller %s\n",
+              seq->lss_name, cli->lcs_name);
+
+       seq->lss_cli = cli;
+       cli->lcs_space.lsr_index = seq->lss_site->ss_node_id;
+       EXIT;
+out_up:
+       mutex_unlock(&seq->lss_mutex);
+       return rc;
+}
+EXPORT_SYMBOL(seq_server_set_cli);
+/*
+ * Allocate \a width units of sequence from range \a from.
+ */
+static inline void range_alloc(struct lu_seq_range *to,
+                              struct lu_seq_range *from,
+                              __u64 width)
+{
+       width = min(range_space(from), width);
+       to->lsr_start = from->lsr_start;
+       to->lsr_end = from->lsr_start + width;
+       from->lsr_start += width;
+}
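+
+/*
+ * For example, with from = [0x100, 0x200) and width = 0x50 this yields
+ * to = [0x100, 0x150) and leaves from = [0x150, 0x200); when fewer than
+ * width sequences remain, the allocation is clamped to what is left.
+ */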
+
+/**
+ * On the controller node, allocate a new super sequence for a regular
+ * sequence server.  As the super sequence controller, this node is
+ * supposed to maintain the FLD and update the index.
+ * \a out range always has the correct MDS node number of the requester.
+ */
+static int __seq_server_alloc_super(struct lu_server_seq *seq,
+                                   struct lu_seq_range *out,
+                                   const struct lu_env *env)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc;
+       ENTRY;
+
+       LASSERT(range_is_sane(space));
+
+       if (range_is_exhausted(space)) {
+               CERROR("%s: Sequences space is exhausted\n",
+                      seq->lss_name);
+               RETURN(-ENOSPC);
+       } else {
+               range_alloc(out, space, seq->lss_width);
+       }
+
+       rc = seq_store_update(env, seq, out, 1 /* sync */);
+
+       LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
+                     seq->lss_name, rc, PRANGE(out));
+
+       RETURN(rc);
+}
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+                          struct lu_seq_range *out,
+                          const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lss_mutex);
+       rc = __seq_server_alloc_super(seq, out, env);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+
+static int __seq_set_init(const struct lu_env *env,
+                           struct lu_server_seq *seq)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc;
+
+       range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width);
+       range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width);
+
+       rc = seq_store_update(env, seq, NULL, 1);
+
+       return rc;
+}
+
+/*
+ * This function implements the new seq allocation algorithm using async
+ * updates to the seq file on disk; see bug 18857 for details.
+ * The following ranges keep track of this process:
+ *
+ * lss_space       - the remaining unallocated sequence space
+ * lss_lowater_set - lu_seq_range of seqs before the barrier, i.e. safe to use
+ * lss_hiwater_set - lu_seq_range after the barrier, i.e. allocated but maybe
+ *                   not yet committed
+ *
+ * When lss_lowater_set reaches its end it is replaced with the hiwater one
+ * and a write operation is initiated to allocate a new hiwater range.
+ * If the last seq write operation is still not committed, the current
+ * operation is flagged as a sync write op.
+ */
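+/*
+ * Sketch of one refill step (the numbers are illustrative): with
+ * lss_width = 1, lss_set_width = 200 and 100 connected clients,
+ * exhausting lss_lowater_set promotes lss_hiwater_set into its place
+ * and carves a new max(200, 100 * 1) = 200 sequence hiwater range out
+ * of lss_space, so ordinary allocations keep being served from a range
+ * that is already committed on disk.
+ */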
+static int range_alloc_set(const struct lu_env *env,
+                           struct lu_seq_range *out,
+                           struct lu_server_seq *seq)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       struct lu_seq_range *loset = &seq->lss_lowater_set;
+       struct lu_seq_range *hiset = &seq->lss_hiwater_set;
+       int rc = 0;
+
+       if (range_is_zero(loset))
+               __seq_set_init(env, seq);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */
+               loset->lsr_start = loset->lsr_end;
+
+       if (range_is_exhausted(loset)) {
+               /* reached high water mark. */
+               struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev;
+               int obd_num_clients = dev->ld_obd->obd_num_exports;
+               __u64 set_sz;
+
+               /* calculate new seq width based on number of clients */
+               set_sz = max(seq->lss_set_width,
+                            obd_num_clients * seq->lss_width);
+               set_sz = min(range_space(space), set_sz);
+
+               /* Switch to hiwater range now */
+               *loset = *hiset;
+               /* allocate new hiwater range */
+               range_alloc(hiset, space, set_sz);
+
+               /* update ondisk seq with new *space */
+               rc = seq_store_update(env, seq, NULL, seq->lss_need_sync);
+       }
+
+       LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset),
+                DRANGE"\n", PRANGE(loset));
+
+       if (rc == 0)
+               range_alloc(out, loset, seq->lss_width);
+
+       RETURN(rc);
+}
+
+static int __seq_server_alloc_meta(struct lu_server_seq *seq,
+                                  struct lu_seq_range *out,
+                                  const struct lu_env *env)
+{
+       struct lu_seq_range *space = &seq->lss_space;
+       int rc = 0;
+
+       ENTRY;
+
+       LASSERT(range_is_sane(space));
+
+       /* Check if available space ends and allocate new super seq */
+       if (range_is_exhausted(space)) {
+               if (!seq->lss_cli) {
+                       CERROR("%s: No sequence controller is attached.\n",
+                              seq->lss_name);
+                       RETURN(-ENODEV);
+               }
+
+               rc = seq_client_alloc_super(seq->lss_cli, env);
+               if (rc) {
+                       CERROR("%s: Can't allocate super-sequence, rc %d\n",
+                              seq->lss_name, rc);
+                       RETURN(rc);
+               }
+
+               /* Saving new range to allocation space. */
+               *space = seq->lss_cli->lcs_space;
+               LASSERT(range_is_sane(space));
+       }
+
+       rc = range_alloc_set(env, out, seq);
+       if (rc != 0) {
+               CERROR("%s: Allocated meta-sequence failed: rc = %d\n",
+                       seq->lss_name, rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n",
+               seq->lss_name, PRANGE(out));
+
+       RETURN(rc);
+}
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+                         struct lu_seq_range *out,
+                         const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lss_mutex);
+       rc = __seq_server_alloc_meta(seq, out, env);
+       mutex_unlock(&seq->lss_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_server_alloc_meta);
+
+static int seq_server_handle(struct lu_site *site,
+                            const struct lu_env *env,
+                            __u32 opc, struct lu_seq_range *out)
+{
+       int rc;
+       struct seq_server_site *ss_site;
+       ENTRY;
+
+       ss_site = lu_site2seq(site);
+
+       switch (opc) {
+       case SEQ_ALLOC_META:
+               if (!ss_site->ss_server_seq) {
+                       CERROR("Sequence server is not "
+                              "initialized\n");
+                       RETURN(-EINVAL);
+               }
+               rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);
+               break;
+       case SEQ_ALLOC_SUPER:
+               if (!ss_site->ss_control_seq) {
+                       CERROR("Sequence controller is not "
+                              "initialized\n");
+                       RETURN(-EINVAL);
+               }
+               rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);
+               break;
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       RETURN(rc);
+}
+
+static int seq_req_handle(struct ptlrpc_request *req,
+                         const struct lu_env *env,
+                         struct seq_thread_info *info)
+{
+       struct lu_seq_range *out, *tmp;
+       struct lu_site *site;
+       int rc = -EPROTO;
+       __u32 *opc;
+       ENTRY;
+
+       LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY));
+       site = req->rq_export->exp_obd->obd_lu_dev->ld_site;
+       LASSERT(site != NULL);
+
+       rc = req_capsule_server_pack(info->sti_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC);
+       if (opc != NULL) {
+               out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE);
+               if (out == NULL)
+                       RETURN(err_serious(-EPROTO));
+
+               tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE);
+
+               /* The seq client passed the MDT id; pass it back via the
+                * out range parameter. */
+
+               out->lsr_index = tmp->lsr_index;
+               out->lsr_flags = tmp->lsr_flags;
+               rc = seq_server_handle(site, env, *opc, out);
+       } else
+               rc = err_serious(-EPROTO);
+
+       RETURN(rc);
+}
+
+/* context key constructor/destructor: seq_key_init, seq_key_fini */
+LU_KEY_INIT_FINI(seq, struct seq_thread_info);
+
+/* context key: seq_thread_key */
+LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD);
+
+static void seq_thread_info_init(struct ptlrpc_request *req,
+                                struct seq_thread_info *info)
+{
+       info->sti_pill = &req->rq_pill;
+       /* Init request capsule */
+       req_capsule_init(info->sti_pill, req, RCL_SERVER);
+       req_capsule_set(info->sti_pill, &RQF_SEQ_QUERY);
+}
+
+static void seq_thread_info_fini(struct seq_thread_info *info)
+{
+       req_capsule_fini(info->sti_pill);
+}
+
+int seq_handle(struct ptlrpc_request *req)
+{
+       const struct lu_env *env;
+       struct seq_thread_info *info;
+       int rc;
+
+       env = req->rq_svc_thread->t_env;
+       LASSERT(env != NULL);
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       seq_thread_info_init(req, info);
+       rc = seq_req_handle(req, env, info);
+       /* XXX: we don't need replay, but the MDT assigns a transno in any
+        * case; remove it manually before replying. */
+       lustre_msg_set_transno(req->rq_repmsg, 0);
+       seq_thread_info_fini(info);
+
+       return rc;
+}
+EXPORT_SYMBOL(seq_handle);
+
+/*
+ * Entry point for handling SEQ RPCs called from the MDT.
+ */
+int seq_query(struct com_thread_info *info)
+{
+       return seq_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(seq_query);
+
+
+#ifdef LPROCFS
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       seq->lss_proc_dir = lprocfs_register(seq->lss_name,
+                                            seq_type_proc_dir,
+                                            NULL, NULL);
+       if (IS_ERR(seq->lss_proc_dir)) {
+               rc = PTR_ERR(seq->lss_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(seq->lss_proc_dir,
+                             seq_server_proc_list, seq);
+       if (rc) {
+               CERROR("%s: Can't init sequence manager "
+                      "proc, rc %d\n", seq->lss_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       seq_server_proc_fini(seq);
+       return rc;
+}
+
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+       ENTRY;
+       if (seq->lss_proc_dir != NULL) {
+               if (!IS_ERR(seq->lss_proc_dir))
+                       lprocfs_remove(&seq->lss_proc_dir);
+               seq->lss_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int seq_server_proc_init(struct lu_server_seq *seq)
+{
+       return 0;
+}
+
+static void seq_server_proc_fini(struct lu_server_seq *seq)
+{
+       return;
+}
+#endif
+
+
+int seq_server_init(struct lu_server_seq *seq,
+                   struct dt_device *dev,
+                   const char *prefix,
+                   enum lu_mgr_type type,
+                   struct seq_server_site *ss,
+                   const struct lu_env *env)
+{
+       int rc, is_srv = (type == LUSTRE_SEQ_SERVER);
+       ENTRY;
+
+       LASSERT(dev != NULL);
+       LASSERT(prefix != NULL);
+       LASSERT(ss != NULL);
+       LASSERT(ss->ss_lu != NULL);
+
+       seq->lss_cli = NULL;
+       seq->lss_type = type;
+       seq->lss_site = ss;
+       range_init(&seq->lss_space);
+
+       range_init(&seq->lss_lowater_set);
+       range_init(&seq->lss_hiwater_set);
+       seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH;
+
+       mutex_init(&seq->lss_mutex);
+
+       seq->lss_width = is_srv ?
+               LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH;
+
+       snprintf(seq->lss_name, sizeof(seq->lss_name),
+                "%s-%s", (is_srv ? "srv" : "ctl"), prefix);
+
+       rc = seq_store_init(seq, env, dev);
+       if (rc)
+               GOTO(out, rc);
+       /* Request backing store for saved sequence info. */
+       rc = seq_store_read(seq, env);
+       if (rc == -ENODATA) {
+               /* Nothing was read, initialize with default values. */
+               seq->lss_space = is_srv ?
+                       LUSTRE_SEQ_ZERO_RANGE :
+                       LUSTRE_SEQ_SPACE_RANGE;
+
+               LASSERT(ss != NULL);
+               seq->lss_space.lsr_index = ss->ss_node_id;
+               LCONSOLE_INFO("%s: No data found "
+                             "on store. Initialize space\n",
+                             seq->lss_name);
+
+               rc = seq_store_update(env, seq, NULL, 0);
+               if (rc) {
+                       CERROR("%s: Can't write space data, "
+                              "rc %d\n", seq->lss_name, rc);
+               }
+       } else if (rc) {
+               CERROR("%s: Can't read space data, rc %d\n",
+                      seq->lss_name, rc);
+               GOTO(out, rc);
+       }
+
+       if (is_srv) {
+               LASSERT(range_is_sane(&seq->lss_space));
+       } else {
+               LASSERT(!range_is_zero(&seq->lss_space) &&
+                       range_is_sane(&seq->lss_space));
+       }
+
+       rc  = seq_server_proc_init(seq);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+out:
+       if (rc)
+               seq_server_fini(seq, env);
+       return rc;
+}
+EXPORT_SYMBOL(seq_server_init);
+
+void seq_server_fini(struct lu_server_seq *seq,
+                    const struct lu_env *env)
+{
+       ENTRY;
+
+       seq_server_proc_fini(seq);
+       seq_store_fini(seq, env);
+
+       EXIT;
+}
+EXPORT_SYMBOL(seq_server_fini);
+
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss)
+{
+       if (ss == NULL)
+               RETURN(0);
+
+       if (ss->ss_server_seq) {
+               seq_server_fini(ss->ss_server_seq, env);
+               OBD_FREE_PTR(ss->ss_server_seq);
+               ss->ss_server_seq = NULL;
+       }
+
+       if (ss->ss_control_seq) {
+               seq_server_fini(ss->ss_control_seq, env);
+               OBD_FREE_PTR(ss->ss_control_seq);
+               ss->ss_control_seq = NULL;
+       }
+
+       if (ss->ss_client_seq) {
+               seq_client_fini(ss->ss_client_seq);
+               OBD_FREE_PTR(ss->ss_client_seq);
+               ss->ss_client_seq = NULL;
+       }
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(seq_site_fini);
+
+proc_dir_entry_t *seq_type_proc_dir = NULL;
+
+static int __init fid_mod_init(void)
+{
+       seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME,
+                                            proc_lustre_root,
+                                            NULL, NULL);
+       if (IS_ERR(seq_type_proc_dir))
+               return PTR_ERR(seq_type_proc_dir);
+
+       LU_CONTEXT_KEY_INIT(&seq_thread_key);
+       lu_context_key_register(&seq_thread_key);
+       return 0;
+}
+
+static void __exit fid_mod_exit(void)
+{
+       lu_context_key_degister(&seq_thread_key);
+       if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) {
+               lprocfs_remove(&seq_type_proc_dir);
+               seq_type_proc_dir = NULL;
+       }
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FID Module");
+MODULE_LICENSE("GPL");
+
+cfs_module(fid, "0.1.0", fid_mod_init, fid_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h
new file mode 100644 (file)
index 0000000..407a743
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+#ifndef __FID_INTERNAL_H
+#define __FID_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_thread_info {
+       struct req_capsule     *sti_pill;
+       struct lu_seq_range     sti_space;
+       struct lu_buf      sti_buf;
+};
+
+enum {
+       SEQ_TXN_STORE_CREDITS = 20
+};
+
+extern struct lu_context_key seq_thread_key;
+
+int seq_client_alloc_super(struct lu_client_seq *seq,
+                          const struct lu_env *env);
+/* Store API functions. */
+int seq_store_init(struct lu_server_seq *seq,
+                  const struct lu_env *env,
+                  struct dt_device *dt);
+
+void seq_store_fini(struct lu_server_seq *seq,
+                   const struct lu_env *env);
+
+int seq_store_read(struct lu_server_seq *seq,
+                  const struct lu_env *env);
+
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+                    struct lu_seq_range *out, int sync);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars seq_server_proc_list[];
+extern struct lprocfs_vars seq_client_proc_list[];
+#endif
+
+
+extern proc_dir_entry_t *seq_type_proc_dir;
+
+#endif /* __FID_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c
new file mode 100644 (file)
index 0000000..eaff51a
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_lib.c
+ *
+ * Miscellaneous fid functions.
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <lu_object.h>
+#include <lustre_fid.h>
+
+/**
+ * A cluster-wide range from which fid-sequences are granted to servers and
+ * then clients.
+ *
+ * Fid namespace:
+ * <pre>
+ * Normal FID: seq:64 [2^33,2^64-1]        oid:32   ver:32
+ * IGIF      : 0:32, ino:32                gen:32   0:32
+ * IDIF      : 0:31, 1:1, ost-index:16,    objd:48  0:32
+ * </pre>
+ *
+ * The first 0x400 sequences of normal FID are reserved for special purpose.
+ * FID_SEQ_START + 1 is for local file id generation.
+ * FID_SEQ_START + 2 is for .lustre directory and its objects
+ */
+const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = {
+       FID_SEQ_NORMAL,
+       (__u64)~0ULL
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE);
+
+/* Zero range, used for init and other purposes. */
+const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = {
+       0,
+       0
+};
+EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE);
+
+/* Lustre Big Fs Lock fid. */
+const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL,
+                                      .f_oid = FID_OID_SPECIAL_BFL,
+                                      .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LUSTRE_BFL_FID);
+
+/** Special fid for ".lustre" directory */
+const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+                                         .f_oid = FID_OID_DOT_LUSTRE,
+                                         .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_DOT_LUSTRE_FID);
+
+/** Special fid for "fid" special object in .lustre */
+const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE,
+                                  .f_oid = FID_OID_DOT_LUSTRE_OBF,
+                                  .f_ver = 0x0000000000000000 };
+EXPORT_SYMBOL(LU_OBF_FID);
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
new file mode 100644 (file)
index 0000000..fcaaca7
--- /dev/null
@@ -0,0 +1,522 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_request.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+/* mdc RPC locks */
+#include <lustre_mdc.h>
+#include "fid_internal.h"
+
+static int seq_client_rpc(struct lu_client_seq *seq,
+                         struct lu_seq_range *output, __u32 opc,
+                         const char *opcname)
+{
+       struct obd_export     *exp = seq->lcs_exp;
+       struct ptlrpc_request *req;
+       struct lu_seq_range   *out, *in;
+       __u32            *op;
+       unsigned int       debug_mask;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY,
+                                       LUSTRE_MDS_VERSION, SEQ_QUERY);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* Init operation code */
+       op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC);
+       *op = opc;
+
+       /* Zero out input range, this is not recovery yet. */
+       in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE);
+       range_init(in);
+
+       ptlrpc_request_set_replen(req);
+
+       in->lsr_index = seq->lcs_space.lsr_index;
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               fld_range_set_mdt(in);
+       else
+               fld_range_set_ost(in);
+
+       if (opc == SEQ_ALLOC_SUPER) {
+               req->rq_request_portal = SEQ_CONTROLLER_PORTAL;
+               req->rq_reply_portal = MDC_REPLY_PORTAL;
+               /* While allocating a super sequence for a data object,
+                * the current thread might hold the export of MDT0 (MDT0
+                * is precreating objects on this OST) and will send the
+                * request to MDT0 here, so we must not keep resending the
+                * request; otherwise, if MDT0 fails (is umounted), it can
+                * never release the export of MDT0. */
+               if (seq->lcs_type == LUSTRE_SEQ_DATA)
+                       req->rq_no_delay = req->rq_no_resend = 1;
+               debug_mask = D_CONSOLE;
+       } else {
+               if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+                       req->rq_request_portal = SEQ_METADATA_PORTAL;
+               else
+                       req->rq_request_portal = SEQ_DATA_PORTAL;
+               debug_mask = D_INFO;
+       }
+
+       ptlrpc_at_set_req_timeout(req);
+
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc)
+               GOTO(out_req, rc);
+
+       out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
+       *output = *out;
+
+       if (!range_is_sane(output)) {
+               CERROR("%s: Invalid range received from server: "
+                      DRANGE"\n", seq->lcs_name, PRANGE(output));
+               GOTO(out_req, rc = -EINVAL);
+       }
+
+       if (range_is_exhausted(output)) {
+               CERROR("%s: Range received from server is exhausted: "
+                      DRANGE"\n", seq->lcs_name, PRANGE(output));
+               GOTO(out_req, rc = -EINVAL);
+       }
+
+       CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"\n",
+                    seq->lcs_name, opcname, PRANGE(output));
+
+       EXIT;
+out_req:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* Request sequence-controller node to allocate new super-sequence. */
+int seq_client_alloc_super(struct lu_client_seq *seq,
+                          const struct lu_env *env)
+{
+       int rc;
+       ENTRY;
+
+       mutex_lock(&seq->lcs_mutex);
+
+       if (seq->lcs_srv) {
+               LASSERT(env != NULL);
+               rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space,
+                                           env);
+       } else {
+               /* Check whether the connection to the seq controller
+                * has been set up (lcs_exp != NULL). */
+               if (seq->lcs_exp == NULL) {
+                       mutex_unlock(&seq->lcs_mutex);
+                       RETURN(-EINPROGRESS);
+               }
+
+               rc = seq_client_rpc(seq, &seq->lcs_space,
+                                   SEQ_ALLOC_SUPER, "super");
+       }
+       mutex_unlock(&seq->lcs_mutex);
+       RETURN(rc);
+}
+
+/* Request sequence-controller node to allocate new meta-sequence. */
+static int seq_client_alloc_meta(const struct lu_env *env,
+                                struct lu_client_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       if (seq->lcs_srv) {
+               LASSERT(env != NULL);
+               rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env);
+       } else {
+               do {
+                       /* If the meta server returns -EINPROGRESS or
+                        * -EAGAIN, it might not yet be ready to allocate
+                        * a super sequence from the sequence controller
+                        * (MDT0), so keep retrying. */
+                       rc = seq_client_rpc(seq, &seq->lcs_space,
+                                           SEQ_ALLOC_META, "meta");
+               } while (rc == -EINPROGRESS || rc == -EAGAIN);
+       }
+       RETURN(rc);
+}
+
+/* Allocate new sequence for client. */
+static int seq_client_alloc_seq(const struct lu_env *env,
+                               struct lu_client_seq *seq, seqno_t *seqnr)
+{
+       int rc;
+       ENTRY;
+
+       LASSERT(range_is_sane(&seq->lcs_space));
+
+       if (range_is_exhausted(&seq->lcs_space)) {
+               rc = seq_client_alloc_meta(env, seq);
+               if (rc) {
+                       CERROR("%s: Can't allocate new meta-sequence, "
+                              "rc %d\n", seq->lcs_name, rc);
+                       RETURN(rc);
+               } else {
+                       CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+                              seq->lcs_name, PRANGE(&seq->lcs_space));
+               }
+       } else {
+               rc = 0;
+       }
+
+       LASSERT(!range_is_exhausted(&seq->lcs_space));
+       *seqnr = seq->lcs_space.lsr_start;
+       seq->lcs_space.lsr_start += 1;
+
+       CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+              *seqnr);
+
+       RETURN(rc);
+}
+
+static int seq_fid_alloc_prep(struct lu_client_seq *seq,
+                             wait_queue_t *link)
+{
+       if (seq->lcs_update) {
+               add_wait_queue(&seq->lcs_waitq, link);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               mutex_unlock(&seq->lcs_mutex);
+
+               waitq_wait(link, TASK_UNINTERRUPTIBLE);
+
+               mutex_lock(&seq->lcs_mutex);
+               remove_wait_queue(&seq->lcs_waitq, link);
+               set_current_state(TASK_RUNNING);
+               return -EAGAIN;
+       }
+       ++seq->lcs_update;
+       mutex_unlock(&seq->lcs_mutex);
+       return 0;
+}
+
+static void seq_fid_alloc_fini(struct lu_client_seq *seq)
+{
+       LASSERT(seq->lcs_update == 1);
+       mutex_lock(&seq->lcs_mutex);
+       --seq->lcs_update;
+       wake_up(&seq->lcs_waitq);
+}
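
The seq_fid_alloc_prep()/seq_fid_alloc_fini() pair above is a single-updater gate: whichever thread finds lcs_update clear becomes the updater and drops the mutex while it refills the sequence; every other thread sleeps on lcs_waitq and retries. A minimal userspace sketch of the same pattern with POSIX threads (the names alloc_prep, alloc_fini and the gate variable are illustrative, not Lustre symbols):

#include <pthread.h>

static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static int updating;                    /* analogue of lcs_update */

/* Called with the mutex held. Returns 1 to ask the caller to retry,
 * 0 when this thread became the updater (mutex released on return). */
static int alloc_prep(void)
{
        if (updating) {
                pthread_cond_wait(&waitq, &lock);  /* sleep, then retry */
                return 1;
        }
        updating = 1;
        pthread_mutex_unlock(&lock);    /* refill runs without the lock */
        return 0;
}

static void alloc_fini(void)
{
        pthread_mutex_lock(&lock);
        updating = 0;
        pthread_cond_broadcast(&waitq); /* analogue of wake_up(&lcs_waitq) */
}

int main(void)
{
        pthread_mutex_lock(&lock);
        while (alloc_prep())
                ;                       /* mirrors the retry loop above */
        /* ... refill the sequence range here, lock not held ... */
        alloc_fini();
        pthread_mutex_unlock(&lock);
        return 0;
}

As in the kernel version, a woken waiter does not assume the refill covered its needs; it re-checks the shared state under the mutex.
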
+
+/**
+ * Allocate the whole seq to the caller.
+ **/
+int seq_client_get_seq(const struct lu_env *env,
+                      struct lu_client_seq *seq, seqno_t *seqnr)
+{
+       wait_queue_t link;
+       int rc;
+
+       LASSERT(seqnr != NULL);
+       mutex_lock(&seq->lcs_mutex);
+       init_waitqueue_entry_current(&link);
+
+       while (1) {
+               rc = seq_fid_alloc_prep(seq, &link);
+               if (rc == 0)
+                       break;
+       }
+
+       rc = seq_client_alloc_seq(env, seq, seqnr);
+       if (rc) {
+               CERROR("%s: Can't allocate new sequence, "
+                      "rc %d\n", seq->lcs_name, rc);
+               seq_fid_alloc_fini(seq);
+               mutex_unlock(&seq->lcs_mutex);
+               return rc;
+       }
+
+       CDEBUG(D_INFO, "%s: allocate sequence "
+              "[0x%16.16"LPF64"x]\n", seq->lcs_name, *seqnr);
+
+       /* Since the caller requires the whole sequence,
+        * mark this sequence as fully used. */
+       if (seq->lcs_type == LUSTRE_SEQ_METADATA)
+               seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+       else
+               seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+       seq->lcs_fid.f_seq = *seqnr;
+       seq->lcs_fid.f_ver = 0;
+       /*
+        * Inform the caller that a sequence switch was performed so that
+        * it can set up the FLD for it.
+        */
+       seq_fid_alloc_fini(seq);
+       mutex_unlock(&seq->lcs_mutex);
+
+       return rc;
+}
+EXPORT_SYMBOL(seq_client_get_seq);
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+int seq_client_alloc_fid(const struct lu_env *env,
+                        struct lu_client_seq *seq, struct lu_fid *fid)
+{
+       wait_queue_t link;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+       LASSERT(fid != NULL);
+
+       init_waitqueue_entry_current(&link);
+       mutex_lock(&seq->lcs_mutex);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+               seq->lcs_fid.f_oid = seq->lcs_width;
+
+       while (1) {
+               seqno_t seqnr;
+
+               if (!fid_is_zero(&seq->lcs_fid) &&
+                   fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+                       /* Just bump last allocated fid and return to caller. */
+                       seq->lcs_fid.f_oid += 1;
+                       rc = 0;
+                       break;
+               }
+
+               rc = seq_fid_alloc_prep(seq, &link);
+               if (rc)
+                       continue;
+
+               rc = seq_client_alloc_seq(env, seq, &seqnr);
+               if (rc) {
+                       CERROR("%s: Can't allocate new sequence, "
+                              "rc %d\n", seq->lcs_name, rc);
+                       seq_fid_alloc_fini(seq);
+                       mutex_unlock(&seq->lcs_mutex);
+                       RETURN(rc);
+               }
+
+               CDEBUG(D_INFO, "%s: Switch to sequence "
+                      "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr);
+
+               seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+               seq->lcs_fid.f_seq = seqnr;
+               seq->lcs_fid.f_ver = 0;
+
+               /*
+                * Inform the caller that a sequence switch was performed
+                * so that it can set up the FLD for it.
+                */
+               rc = 1;
+
+               seq_fid_alloc_fini(seq);
+               break;
+       }
+
+       *fid = seq->lcs_fid;
+       mutex_unlock(&seq->lcs_mutex);
+
+       CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name,  PFID(fid));
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_alloc_fid);
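
seq_client_alloc_fid() above boils down to a counter with a rollover: bump f_oid while it stays below lcs_width, otherwise obtain a fresh sequence, reset the oid, and tell the caller (rc == 1) that a switch happened so it can update the FLD. A self-contained userspace model of that logic, assuming an illustrative WIDTH and a fake sequence source:

#include <stdint.h>
#include <stdio.h>

#define WIDTH 4                         /* stand-in for seq->lcs_width */

struct fid { uint64_t seq; uint32_t oid; };

static uint64_t next_seq = 0x200000400ULL;  /* fake sequence source */

/* Returns 1 when a sequence switch happened (caller must update the
 * location database), 0 when the oid was simply bumped. */
static int alloc_fid(struct fid *cur, struct fid *out)
{
        int switched = 0;

        if (cur->seq == 0 || cur->oid >= WIDTH) {
                cur->seq = next_seq++;  /* "allocate a new sequence" */
                cur->oid = 1;           /* first object in the sequence */
                switched = 1;
        } else {
                cur->oid += 1;          /* just bump the last fid */
        }
        *out = *cur;
        return switched;
}

int main(void)
{
        struct fid cur = { 0, 0 }, f;
        int i;

        for (i = 0; i < 10; i++) {
                int sw = alloc_fid(&cur, &f);
                printf("fid [0x%llx:0x%x]%s\n",
                       (unsigned long long)f.seq, (unsigned)f.oid,
                       sw ? "  <- new sequence" : "");
        }
        return 0;
}
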
+
+/*
+ * Finish the current sequence due to disconnect.
+ * See mdc_import_event()
+ */
+void seq_client_flush(struct lu_client_seq *seq)
+{
+       wait_queue_t link;
+
+       LASSERT(seq != NULL);
+       init_waitqueue_entry_current(&link);
+       mutex_lock(&seq->lcs_mutex);
+
+       while (seq->lcs_update) {
+               add_wait_queue(&seq->lcs_waitq, &link);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               mutex_unlock(&seq->lcs_mutex);
+
+               waitq_wait(&link, TASK_UNINTERRUPTIBLE);
+
+               mutex_lock(&seq->lcs_mutex);
+               remove_wait_queue(&seq->lcs_waitq, &link);
+               set_current_state(TASK_RUNNING);
+       }
+
+       fid_zero(&seq->lcs_fid);
+       /**
+        * This index should not be used for seq range allocation;
+        * it is set to -1 for debug checks.
+        */
+
+       seq->lcs_space.lsr_index = -1;
+
+       range_init(&seq->lcs_space);
+       mutex_unlock(&seq->lcs_mutex);
+}
+EXPORT_SYMBOL(seq_client_flush);
+
+static void seq_client_proc_fini(struct lu_client_seq *seq);
+
+#ifdef LPROCFS
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+       int rc;
+       ENTRY;
+
+       seq->lcs_proc_dir = lprocfs_register(seq->lcs_name,
+                                            seq_type_proc_dir,
+                                            NULL, NULL);
+
+       if (IS_ERR(seq->lcs_proc_dir)) {
+               CERROR("%s: LProcFS failed in seq-init\n",
+                      seq->lcs_name);
+               rc = PTR_ERR(seq->lcs_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(seq->lcs_proc_dir,
+                             seq_client_proc_list, seq);
+       if (rc) {
+               CERROR("%s: Can't init sequence manager "
+                      "proc, rc %d\n", seq->lcs_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       seq_client_proc_fini(seq);
+       return rc;
+}
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+       ENTRY;
+       if (seq->lcs_proc_dir) {
+               if (!IS_ERR(seq->lcs_proc_dir))
+                       lprocfs_remove(&seq->lcs_proc_dir);
+               seq->lcs_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+       return 0;
+}
+
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+       return;
+}
+#endif
+
+int seq_client_init(struct lu_client_seq *seq,
+                   struct obd_export *exp,
+                   enum lu_cli_type type,
+                   const char *prefix,
+                   struct lu_server_seq *srv)
+{
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+       LASSERT(prefix != NULL);
+
+       seq->lcs_srv = srv;
+       seq->lcs_type = type;
+
+       mutex_init(&seq->lcs_mutex);
+       if (type == LUSTRE_SEQ_METADATA)
+               seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+       else
+               seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH;
+
+       init_waitqueue_head(&seq->lcs_waitq);
+       /* Make sure that things are clear before work is started. */
+       seq_client_flush(seq);
+
+       if (exp != NULL)
+               seq->lcs_exp = class_export_get(exp);
+       else if (type == LUSTRE_SEQ_METADATA)
+               LASSERT(seq->lcs_srv != NULL);
+
+       snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+                "cli-%s", prefix);
+
+       rc = seq_client_proc_init(seq);
+       if (rc)
+               seq_client_fini(seq);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(seq_client_init);
+
+void seq_client_fini(struct lu_client_seq *seq)
+{
+       ENTRY;
+
+       seq_client_proc_fini(seq);
+
+       if (seq->lcs_exp != NULL) {
+               class_export_put(seq->lcs_exp);
+               seq->lcs_exp = NULL;
+       }
+
+       seq->lcs_srv = NULL;
+       EXIT;
+}
+EXPORT_SYMBOL(seq_client_fini);
diff --git a/drivers/staging/lustre/lustre/fid/fid_store.c b/drivers/staging/lustre/lustre/fid/fid_store.c
new file mode 100644
index 0000000..a90e6e3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/fid_store.c
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/fid_store.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+static struct lu_buf *seq_store_buf(struct seq_thread_info *info)
+{
+       struct lu_buf *buf;
+
+       buf = &info->sti_buf;
+       buf->lb_buf = &info->sti_space;
+       buf->lb_len = sizeof(info->sti_space);
+       return buf;
+}
+
+struct seq_update_callback {
+       struct dt_txn_commit_cb suc_cb;
+       struct lu_server_seq   *suc_seq;
+};
+
+void seq_update_cb(struct lu_env *env, struct thandle *th,
+                  struct dt_txn_commit_cb *cb, int err)
+{
+       struct seq_update_callback *ccb;
+
+       ccb = container_of0(cb, struct seq_update_callback, suc_cb);
+
+       LASSERT(ccb->suc_seq != NULL);
+
+       ccb->suc_seq->lss_need_sync = 0;
+       OBD_FREE_PTR(ccb);
+}
+
+int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq)
+{
+       struct seq_update_callback *ccb;
+       struct dt_txn_commit_cb    *dcb;
+       int                        rc;
+
+       OBD_ALLOC_PTR(ccb);
+       if (ccb == NULL)
+               return -ENOMEM;
+
+       ccb->suc_seq       = seq;
+       seq->lss_need_sync = 1;
+
+       dcb            = &ccb->suc_cb;
+       dcb->dcb_func  = seq_update_cb;
+       INIT_LIST_HEAD(&dcb->dcb_linkage);
+       strncpy(dcb->dcb_name, "seq_update_cb", MAX_COMMIT_CB_STR_LEN);
+       dcb->dcb_name[MAX_COMMIT_CB_STR_LEN - 1] = '\0';
+
+       rc = dt_trans_cb_add(th, dcb);
+       if (rc)
+               OBD_FREE_PTR(ccb);
+       return rc;
+}
+
+/* This function assumes that the caller takes care of locking. */
+int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq,
+                    struct lu_seq_range *out, int sync)
+{
+       struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev);
+       struct seq_thread_info *info;
+       struct thandle *th;
+       loff_t pos = 0;
+       int rc;
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       th = dt_trans_create(env, dt_dev);
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       rc = dt_declare_record_write(env, seq->lss_obj,
+                                    sizeof(struct lu_seq_range), 0, th);
+       if (rc)
+               GOTO(exit, rc);
+
+       if (out != NULL) {
+               rc = fld_declare_server_create(env,
+                                              seq->lss_site->ss_server_fld,
+                                              out, th);
+               if (rc)
+                       GOTO(exit, rc);
+       }
+
+       rc = dt_trans_start_local(env, dt_dev, th);
+       if (rc)
+               GOTO(exit, rc);
+
+       /* Store ranges in le format. */
+       range_cpu_to_le(&info->sti_space, &seq->lss_space);
+
+       rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th);
+       if (rc) {
+               CERROR("%s: Can't write space data, rc %d\n",
+                      seq->lss_name, rc);
+               GOTO(exit, rc);
+       } else if (out != NULL) {
+               rc = fld_server_create(env, seq->lss_site->ss_server_fld, out,
+                                      th);
+               if (rc) {
+                       CERROR("%s: Can't update fld database, rc %d\n",
+                               seq->lss_name, rc);
+                       GOTO(exit, rc);
+               }
+       }
+       /* The next sequence update will need a sync until this update is
+        * committed; for a sync operation this is obviously not needed. */
+       if (!sync)
+               /* If the callback can't be added, then always sync. */
+               sync = !!seq_update_cb_add(th, seq);
+
+       th->th_sync |= sync;
+exit:
+       dt_trans_stop(env, dt_dev, th);
+       return rc;
+}
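
The tail of seq_store_update() encodes a small durability policy: prefer an asynchronous commit tracked by a callback, and fall back to a synchronous transaction only when the callback cannot be registered. A sketch of that decision in plain C (the tx struct and its single callback slot are illustrative stand-ins for the thandle machinery):

#include <stdio.h>

struct tx {
        int sync;                       /* analogue of th->th_sync */
        void (*commit_cb)(void *);      /* one slot instead of a cb list */
        void *cb_arg;
};

static int need_sync = 1;               /* analogue of lss_need_sync */

static void update_done(void *arg)
{
        (void)arg;
        need_sync = 0;                  /* update is durable on disk now */
}

/* Returns nonzero on failure, like seq_update_cb_add(). */
static int cb_add(struct tx *t)
{
        if (t->commit_cb)
                return -1;              /* no free slot: registration fails */
        t->commit_cb = update_done;
        return 0;
}

static void store_update(struct tx *t, int sync)
{
        /* ... write the range record inside the transaction ... */
        if (!sync)
                sync = !!cb_add(t);     /* can't track the commit? sync */
        t->sync |= sync;
}

int main(void)
{
        struct tx t = { 0, 0, 0 };

        store_update(&t, 0);
        printf("tx sync=%d need_sync=%d\n", t.sync, need_sync);
        if (t.commit_cb)
                t.commit_cb(t.cb_arg);  /* simulate the commit firing */
        printf("after commit: need_sync=%d\n", need_sync);
        return 0;
}
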
+
+/*
+ * This function assumes that the caller takes care of locking, or that
+ * locking is not needed (init time).
+ */
+int seq_store_read(struct lu_server_seq *seq,
+                  const struct lu_env *env)
+{
+       struct seq_thread_info *info;
+       loff_t pos = 0;
+       int rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &seq_thread_key);
+       LASSERT(info != NULL);
+
+       rc = seq->lss_obj->do_body_ops->dbo_read(env, seq->lss_obj,
+                                                seq_store_buf(info),
+                                                &pos, BYPASS_CAPA);
+
+       if (rc == sizeof(info->sti_space)) {
+               range_le_to_cpu(&seq->lss_space, &info->sti_space);
+               CDEBUG(D_INFO, "%s: Space - "DRANGE"\n",
+                      seq->lss_name, PRANGE(&seq->lss_space));
+               rc = 0;
+       } else if (rc == 0) {
+               rc = -ENODATA;
+       } else if (rc > 0) {
+               CERROR("%s: Read only %d bytes of %d\n", seq->lss_name,
+                      rc, (int)sizeof(info->sti_space));
+               rc = -EIO;
+       }
+
+       RETURN(rc);
+}
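
seq_store_read() distinguishes three outcomes of the raw read: a complete record, which is converted from little-endian to CPU order; an empty object (-ENODATA); and a short read (-EIO). A userspace sketch of the same triage using glibc's le64toh()/htole64() (the on-disk record layout here is illustrative, not the real lu_seq_range):

#include <endian.h>                     /* le64toh()/htole64(), glibc */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

struct range_le { uint64_t start, end; };   /* illustrative on-disk record */
struct range    { uint64_t start, end; };

/* Mirrors the rc == sizeof / rc == 0 / 0 < rc < sizeof triage. */
static int store_read(const void *buf, ssize_t rc, struct range *out)
{
        struct range_le disk;

        if (rc == (ssize_t)sizeof(disk)) {
                memcpy(&disk, buf, sizeof(disk));
                out->start = le64toh(disk.start);   /* le -> cpu */
                out->end   = le64toh(disk.end);
                return 0;
        }
        if (rc == 0)
                return -ENODATA;                    /* nothing stored yet */
        if (rc > 0) {
                fprintf(stderr, "short read: %zd of %zu bytes\n",
                        rc, sizeof(disk));
                return -EIO;
        }
        return (int)rc;                             /* propagate read error */
}

int main(void)
{
        struct range_le disk = { htole64(0x400), htole64(0x800) };
        struct range r;

        if (store_read(&disk, sizeof(disk), &r) == 0)
                printf("range [0x%llx - 0x%llx)\n",
                       (unsigned long long)r.start,
                       (unsigned long long)r.end);
        return 0;
}
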
+
+int seq_store_init(struct lu_server_seq *seq,
+                  const struct lu_env *env,
+                  struct dt_device *dt)
+{
+       struct dt_object *dt_obj;
+       struct lu_fid fid;
+       struct lu_attr attr;
+       struct dt_object_format dof;
+       const char *name;
+       int rc;
+       ENTRY;
+
+       name = seq->lss_type == LUSTRE_SEQ_SERVER ?
+               LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME;
+
+       if (seq->lss_type == LUSTRE_SEQ_SERVER)
+               lu_local_obj_fid(&fid, FID_SEQ_SRV_OID);
+       else
+               lu_local_obj_fid(&fid, FID_SEQ_CTL_OID);
+
+       memset(&attr, 0, sizeof(attr));
+       attr.la_valid = LA_MODE;
+       attr.la_mode = S_IFREG | 0666;
+       dof.dof_type = DFT_REGULAR;
+
+       dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr);
+       if (!IS_ERR(dt_obj)) {
+               seq->lss_obj = dt_obj;
+               rc = 0;
+       } else {
+               CERROR("%s: Can't find \"%s\" obj %d\n",
+                      seq->lss_name, name, (int)PTR_ERR(dt_obj));
+               rc = PTR_ERR(dt_obj);
+       }
+
+       RETURN(rc);
+}
+
+void seq_store_fini(struct lu_server_seq *seq,
+                   const struct lu_env *env)
+{
+       ENTRY;
+
+       if (seq->lss_obj != NULL) {
+               if (!IS_ERR(seq->lss_obj))
+                       lu_object_put(env, &seq->lss_obj->do_lu);
+               seq->lss_obj = NULL;
+       }
+
+       EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c
new file mode 100644
index 0000000..af817a8
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fid/lproc_fid.c
+ *
+ * Lustre Sequence Manager
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FID
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fid.h>
+#include "fid_internal.h"
+
+#ifdef LPROCFS
+/*
+ * Note: this function is only used for testing; it is not safe for
+ * production use.
+ */
+static int
+lprocfs_fid_write_common(const char *buffer, unsigned long count,
+                        struct lu_seq_range *range)
+{
+       struct lu_seq_range tmp;
+       int rc;
+       ENTRY;
+
+       LASSERT(range != NULL);
+
+       rc = sscanf(buffer, "[%llx - %llx]\n",
+                   (long long unsigned *)&tmp.lsr_start,
+                   (long long unsigned *)&tmp.lsr_end);
+       if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp))
+               RETURN(-EINVAL);
+       *range = tmp;
+       RETURN(0);
+}
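
lprocfs_fid_write_common() accepts a range written as "[start - end]" in hex and rejects input that does not yield exactly two values or fails the sanity checks. A standalone sketch of that parse-and-validate step (parse_range and the simplified sanity rules are illustrative):

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Returns 0 on success, -1 on a malformed or insane range. */
static int parse_range(const char *buf, struct range *r)
{
        struct range tmp;

        if (sscanf(buf, "[%llx - %llx]", &tmp.start, &tmp.end) != 2)
                return -1;
        /* range_is_sane()/range_is_zero() analogues: ordered, nonzero */
        if (tmp.start > tmp.end || (tmp.start == 0 && tmp.end == 0))
                return -1;
        *r = tmp;
        return 0;
}

int main(void)
{
        struct range r;

        if (parse_range("[0x400 - 0x800]\n", &r) == 0)
                printf("parsed [0x%llx - 0x%llx]\n", r.start, r.end);
        if (parse_range("[8 - 4]\n", &r) != 0)
                printf("rejected reversed range\n");
        return 0;
}
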
+
+/* Client side procfs stuff */
+static ssize_t
+lprocfs_fid_space_seq_write(struct file *file, const char *buffer,
+                           size_t count, loff_t *off)
+{
+       struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = lprocfs_fid_write_common(buffer, count, &seq->lcs_space);
+
+       if (rc == 0) {
+               CDEBUG(D_INFO, "%s: Space: "DRANGE"\n",
+                      seq->lcs_name, PRANGE(&seq->lcs_space));
+       }
+
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(count);
+}
+
+static int
+lprocfs_fid_space_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = seq_printf(m, "["LPX64" - "LPX64"]:%x:%s\n", PRANGE(&seq->lcs_space));
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static ssize_t
+lprocfs_fid_width_seq_write(struct file *file, const char *buffer,
+                           size_t count, loff_t *off)
+{
+       struct lu_client_seq *seq = ((struct seq_file *)file->private_data)->private;
+       __u64  max;
+       int rc, val;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               RETURN(rc);
+
+       mutex_lock(&seq->lcs_mutex);
+       if (seq->lcs_type == LUSTRE_SEQ_DATA)
+               max = LUSTRE_DATA_SEQ_MAX_WIDTH;
+       else
+               max = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+
+       if (val <= max && val > 0) {
+               seq->lcs_width = val;
+
+               if (rc == 0) {
+                       CDEBUG(D_INFO, "%s: Sequence size: "LPU64"\n",
+                              seq->lcs_name, seq->lcs_width);
+               }
+       }
+
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(count);
+}
+
+static int
+lprocfs_fid_width_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = seq_printf(m, LPU64"\n", seq->lcs_width);
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static int
+lprocfs_fid_fid_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       mutex_lock(&seq->lcs_mutex);
+       rc = seq_printf(m, DFID"\n", PFID(&seq->lcs_fid));
+       mutex_unlock(&seq->lcs_mutex);
+
+       RETURN(rc);
+}
+
+static int
+lprocfs_fid_server_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_seq *seq = (struct lu_client_seq *)m->private;
+       struct client_obd *cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(seq != NULL);
+
+       if (seq->lcs_exp != NULL) {
+               cli = &seq->lcs_exp->exp_obd->u.cli;
+               rc = seq_printf(m, "%s\n", cli->cl_target_uuid.uuid);
+       } else {
+               rc = seq_printf(m, "%s\n", seq->lcs_srv->lss_name);
+       }
+       RETURN(rc);
+}
+
+struct lprocfs_vars seq_server_proc_list[] = {
+};
+
+LPROC_SEQ_FOPS(lprocfs_fid_space);
+LPROC_SEQ_FOPS(lprocfs_fid_width);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_server);
+LPROC_SEQ_FOPS_RO(lprocfs_fid_fid);
+
+struct lprocfs_vars seq_client_proc_list[] = {
+       { "space", &lprocfs_fid_space_fops },
+       { "width", &lprocfs_fid_width_fops },
+       { "server", &lprocfs_fid_server_fops },
+       { "fid", &lprocfs_fid_fid_fops },
+       { NULL }
+};
+#endif
diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile
new file mode 100644
index 0000000..e7f2881
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += fld.o
+fld-y := fld_handler.o fld_request.o fld_cache.o fld_index.o lproc_fld.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c
new file mode 100644
index 0000000..347f2ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_cache.c
@@ -0,0 +1,566 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_cache.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+/**
+ * create fld cache.
+ */
+struct fld_cache *fld_cache_init(const char *name,
+                                int cache_size, int cache_threshold)
+{
+       struct fld_cache *cache;
+       ENTRY;
+
+       LASSERT(name != NULL);
+       LASSERT(cache_threshold < cache_size);
+
+       OBD_ALLOC_PTR(cache);
+       if (cache == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       INIT_LIST_HEAD(&cache->fci_entries_head);
+       INIT_LIST_HEAD(&cache->fci_lru);
+
+       cache->fci_cache_count = 0;
+       rwlock_init(&cache->fci_lock);
+
+       strlcpy(cache->fci_name, name,
+               sizeof(cache->fci_name));
+
+       cache->fci_cache_size = cache_size;
+       cache->fci_threshold = cache_threshold;
+
+       /* Init fld cache info. */
+       memset(&cache->fci_stat, 0, sizeof(cache->fci_stat));
+
+       CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n",
+              cache->fci_name, cache_size, cache_threshold);
+
+       RETURN(cache);
+}
+
+/**
+ * destroy fld cache.
+ */
+void fld_cache_fini(struct fld_cache *cache)
+{
+       __u64 pct;
+       ENTRY;
+
+       LASSERT(cache != NULL);
+       fld_cache_flush(cache);
+
+       if (cache->fci_stat.fst_count > 0) {
+               pct = cache->fci_stat.fst_cache * 100;
+               do_div(pct, cache->fci_stat.fst_count);
+       } else {
+               pct = 0;
+       }
+
+       CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name);
+       CDEBUG(D_INFO, "  Total reqs: "LPU64"\n", cache->fci_stat.fst_count);
+       CDEBUG(D_INFO, "  Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache);
+       CDEBUG(D_INFO, "  Cache hits: "LPU64"%%\n", pct);
+
+       OBD_FREE_PTR(cache);
+
+       EXIT;
+}
+
+/**
+ * delete given node from list.
+ */
+void fld_cache_entry_delete(struct fld_cache *cache,
+                           struct fld_cache_entry *node)
+{
+       list_del(&node->fce_list);
+       list_del(&node->fce_lru);
+       cache->fci_cache_count--;
+       OBD_FREE_PTR(node);
+}
+
+/**
+ * Fix the list by checking each new entry against the NEXT entry in order.
+ */
+static void fld_fix_new_list(struct fld_cache *cache)
+{
+       struct fld_cache_entry *f_curr;
+       struct fld_cache_entry *f_next;
+       struct lu_seq_range *c_range;
+       struct lu_seq_range *n_range;
+       struct list_head *head = &cache->fci_entries_head;
+       ENTRY;
+
+restart_fixup:
+
+       list_for_each_entry_safe(f_curr, f_next, head, fce_list) {
+               c_range = &f_curr->fce_range;
+               n_range = &f_next->fce_range;
+
+               LASSERT(range_is_sane(c_range));
+               if (&f_next->fce_list == head)
+                       break;
+
+               if (c_range->lsr_flags != n_range->lsr_flags)
+                       continue;
+
+               LASSERTF(c_range->lsr_start <= n_range->lsr_start,
+                        "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n",
+                        PRANGE(c_range), PRANGE(n_range));
+
+               /* check merge possibility with next range */
+               if (c_range->lsr_end == n_range->lsr_start) {
+                       if (c_range->lsr_index != n_range->lsr_index)
+                               continue;
+                       n_range->lsr_start = c_range->lsr_start;
+                       fld_cache_entry_delete(cache, f_curr);
+                       continue;
+               }
+
+               /* check if current range overlaps with next range. */
+               if (n_range->lsr_start < c_range->lsr_end) {
+                       if (c_range->lsr_index == n_range->lsr_index) {
+                               n_range->lsr_start = c_range->lsr_start;
+                               n_range->lsr_end = max(c_range->lsr_end,
+                                                      n_range->lsr_end);
+                               fld_cache_entry_delete(cache, f_curr);
+                       } else {
+                               if (n_range->lsr_end <= c_range->lsr_end) {
+                                       *n_range = *c_range;
+                                       fld_cache_entry_delete(cache, f_curr);
+                               } else
+                                       n_range->lsr_start = c_range->lsr_end;
+                       }
+
+                       /* we could have overlap over next
+                        * range too. better restart. */
+                       goto restart_fixup;
+               }
+
+               /* kill duplicates */
+               if (c_range->lsr_start == n_range->lsr_start &&
+                   c_range->lsr_end == n_range->lsr_end)
+                       fld_cache_entry_delete(cache, f_curr);
+       }
+
+       EXIT;
+}
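
The merge rule in fld_fix_new_list() is: when two consecutive entries target the same index and the first ends exactly where the second starts, the second absorbs the first. The same rule applied to a sorted array, as a self-contained sketch (struct rng and fix_list are illustrative names):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct rng { uint64_t start, end; int index; };

/* Merge abutting same-index ranges in a sorted array; returns new count. */
static int fix_list(struct rng *r, int n)
{
        int i = 0;

        while (i + 1 < n) {
                if (r[i].index == r[i + 1].index &&
                    r[i].end == r[i + 1].start) {
                        r[i + 1].start = r[i].start;  /* successor absorbs */
                        memmove(&r[i], &r[i + 1],
                                (size_t)(n - i - 1) * sizeof(*r));
                        n--;            /* entry deleted; re-check here */
                } else {
                        i++;
                }
        }
        return n;
}

int main(void)
{
        struct rng r[] = {
                { 0x000, 0x400, 0 },
                { 0x400, 0x800, 0 },    /* abuts previous, same index */
                { 0x800, 0xc00, 1 },    /* abuts, but different index */
        };
        int i, n = fix_list(r, 3);

        for (i = 0; i < n; i++)
                printf("[0x%llx - 0x%llx):%d\n",
                       (unsigned long long)r[i].start,
                       (unsigned long long)r[i].end, r[i].index);
        return 0;
}
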
+
+/**
+ * add node to fld cache
+ */
+static inline void fld_cache_entry_add(struct fld_cache *cache,
+                                      struct fld_cache_entry *f_new,
+                                      struct list_head *pos)
+{
+       list_add(&f_new->fce_list, pos);
+       list_add(&f_new->fce_lru, &cache->fci_lru);
+
+       cache->fci_cache_count++;
+       fld_fix_new_list(cache);
+}
+
+/**
+ * Check if the cache needs to be shrunk, and if so, do it: remove
+ * entries from the LRU list one by one until the cache is small enough.
+ */
+static int fld_cache_shrink(struct fld_cache *cache)
+{
+       struct fld_cache_entry *flde;
+       struct list_head *curr;
+       int num = 0;
+       ENTRY;
+
+       LASSERT(cache != NULL);
+
+       if (cache->fci_cache_count < cache->fci_cache_size)
+               RETURN(0);
+
+       curr = cache->fci_lru.prev;
+
+       while (cache->fci_cache_count + cache->fci_threshold >
+              cache->fci_cache_size && curr != &cache->fci_lru) {
+
+               flde = list_entry(curr, struct fld_cache_entry, fce_lru);
+               curr = curr->prev;
+               fld_cache_entry_delete(cache, flde);
+               num++;
+       }
+
+       CDEBUG(D_INFO, "%s: FLD cache - Shrunk by "
+              "%d entries\n", cache->fci_name, num);
+
+       RETURN(0);
+}
+
+/**
+ * kill all fld cache entries.
+ */
+void fld_cache_flush(struct fld_cache *cache)
+{
+       ENTRY;
+
+       write_lock(&cache->fci_lock);
+       cache->fci_cache_size = 0;
+       fld_cache_shrink(cache);
+       write_unlock(&cache->fci_lock);
+
+       EXIT;
+}
+
+/**
+ * Punch a hole in an existing range: divide the range and add new
+ * entries accordingly.
+ */
+
+void fld_cache_punch_hole(struct fld_cache *cache,
+                         struct fld_cache_entry *f_curr,
+                         struct fld_cache_entry *f_new)
+{
+       const struct lu_seq_range *range = &f_new->fce_range;
+       const seqno_t new_start  = range->lsr_start;
+       const seqno_t new_end  = range->lsr_end;
+       struct fld_cache_entry *fldt;
+
+       ENTRY;
+       OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC);
+       if (!fldt) {
+               OBD_FREE_PTR(f_new);
+               EXIT;
+               /* Overlap is not allowed, so don't mess up the list. */
+               return;
+       }
+       /* Break the f_curr range into three ranges:
+        *      f_curr, f_new, fldt
+        */
+
+       /* f_new = *range */
+
+       /* fldt */
+       fldt->fce_range.lsr_start = new_end;
+       fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end;
+       fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index;
+
+       /* f_curr */
+       f_curr->fce_range.lsr_end = new_start;
+
+       /* add these two entries to list */
+       fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+       fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+       /* no need to fixup */
+       EXIT;
+}
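
Punching a hole turns one cached range into three pieces: the truncated left part keeps the old entry, the hole becomes the new entry, and the remainder becomes a third entry on the old target. A sketch of just the interval arithmetic (list insertion omitted; the names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t start, end; int index; };

/* Split *cur around *hole: *cur keeps the left piece, *right receives
 * the remainder. The caller guarantees strict containment, i.e.
 * cur->start < hole->start and hole->end < cur->end. */
static void punch_hole(struct rng *cur, const struct rng *hole,
                       struct rng *right)
{
        right->start = hole->end;
        right->end   = cur->end;
        right->index = cur->index;      /* remainder keeps the old target */

        cur->end = hole->start;         /* truncate the left piece */
}

int main(void)
{
        struct rng cur  = { 0x000, 0x1000, 0 };
        struct rng hole = { 0x400, 0x800, 7 };
        struct rng right;

        punch_hole(&cur, &hole, &right);
        printf("[0x%llx-0x%llx):%d  [0x%llx-0x%llx):%d  [0x%llx-0x%llx):%d\n",
               (unsigned long long)cur.start,
               (unsigned long long)cur.end, cur.index,
               (unsigned long long)hole.start,
               (unsigned long long)hole.end, hole.index,
               (unsigned long long)right.start,
               (unsigned long long)right.end, right.index);
        return 0;
}
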
+
+/**
+ * handle range overlap in fld cache.
+ */
+static void fld_cache_overlap_handle(struct fld_cache *cache,
+                               struct fld_cache_entry *f_curr,
+                               struct fld_cache_entry *f_new)
+{
+       const struct lu_seq_range *range = &f_new->fce_range;
+       const seqno_t new_start  = range->lsr_start;
+       const seqno_t new_end  = range->lsr_end;
+       const mdsno_t mdt = range->lsr_index;
+
+       /* This is the overlap case; these branches only check overlap with
+        * the previous range. Fixup handles overlap with the next range. */
+
+       if (f_curr->fce_range.lsr_index == mdt) {
+               f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+                                                 new_start);
+
+               f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+                                               new_end);
+
+               OBD_FREE_PTR(f_new);
+               fld_fix_new_list(cache);
+
+       } else if (new_start <= f_curr->fce_range.lsr_start &&
+                       f_curr->fce_range.lsr_end <= new_end) {
+               /* Case 1: the new range completely overshadows the existing
+                * range, e.g. the whole range migrated. Update the entry. */
+
+               f_curr->fce_range = *range;
+               OBD_FREE_PTR(f_new);
+               fld_fix_new_list(cache);
+
+       } else if (f_curr->fce_range.lsr_start < new_start &&
+                       new_end < f_curr->fce_range.lsr_end) {
+               /* Case 2: the new range fits within the existing range. */
+
+               fld_cache_punch_hole(cache, f_curr, f_new);
+
+       } else  if (new_end <= f_curr->fce_range.lsr_end) {
+               /* Case 3: overlap:
+                *       [new_start [c_start  new_end)  c_end)
+                */
+
+               LASSERT(new_start <= f_curr->fce_range.lsr_start);
+
+               f_curr->fce_range.lsr_start = new_end;
+               fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+       } else if (f_curr->fce_range.lsr_start <= new_start) {
+               /* Case 4: overlap:
+                *       [c_start [new_start c_end) new_end)
+                */
+
+               LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+               f_curr->fce_range.lsr_end = new_start;
+               fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+       } else
+               CERROR("NEW range = "DRANGE" curr = "DRANGE"\n",
+                      PRANGE(range), PRANGE(&f_curr->fce_range));
+}
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range)
+{
+       struct fld_cache_entry *f_new;
+
+       LASSERT(range_is_sane(range));
+
+       OBD_ALLOC_PTR(f_new);
+       if (!f_new)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       f_new->fce_range = *range;
+       RETURN(f_new);
+}
+
+/**
+ * Insert FLD entry in FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ */
+int fld_cache_insert_nolock(struct fld_cache *cache,
+                           struct fld_cache_entry *f_new)
+{
+       struct fld_cache_entry *f_curr;
+       struct fld_cache_entry *n;
+       struct list_head *head;
+       struct list_head *prev = NULL;
+       const seqno_t new_start  = f_new->fce_range.lsr_start;
+       const seqno_t new_end  = f_new->fce_range.lsr_end;
+       __u32 new_flags  = f_new->fce_range.lsr_flags;
+       ENTRY;
+
+       /*
+        * Duplicate entries are eliminated during insert, so we do not
+        * need to search for the new entry before starting the
+        * insertion loop.
+        */
+
+       if (!cache->fci_no_shrink)
+               fld_cache_shrink(cache);
+
+       head = &cache->fci_entries_head;
+
+       list_for_each_entry_safe(f_curr, n, head, fce_list) {
+               /* Stop here: the new range sorts before this entry. */
+               if (new_end < f_curr->fce_range.lsr_start ||
+                  (new_end == f_curr->fce_range.lsr_start &&
+                   new_flags != f_curr->fce_range.lsr_flags))
+                       break;
+
+               prev = &f_curr->fce_list;
+               /* Check whether this entry overlaps the new range. */
+               if (new_start < f_curr->fce_range.lsr_end &&
+                   new_flags == f_curr->fce_range.lsr_flags) {
+                       fld_cache_overlap_handle(cache, f_curr, f_new);
+                       goto out;
+               }
+       }
+
+       if (prev == NULL)
+               prev = head;
+
+       CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range));
+       /* Add new entry to cache and lru list. */
+       fld_cache_entry_add(cache, f_new, prev);
+out:
+       RETURN(0);
+}
+
+int fld_cache_insert(struct fld_cache *cache,
+                    const struct lu_seq_range *range)
+{
+       struct fld_cache_entry  *flde;
+       int rc;
+
+       flde = fld_cache_entry_create(range);
+       if (IS_ERR(flde))
+               RETURN(PTR_ERR(flde));
+
+       write_lock(&cache->fci_lock);
+       rc = fld_cache_insert_nolock(cache, flde);
+       write_unlock(&cache->fci_lock);
+       if (rc)
+               OBD_FREE_PTR(flde);
+
+       RETURN(rc);
+}
+
+void fld_cache_delete_nolock(struct fld_cache *cache,
+                     const struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *tmp;
+       struct list_head *head;
+
+       head = &cache->fci_entries_head;
+       list_for_each_entry_safe(flde, tmp, head, fce_list) {
+               /* Delete the entry that matches the given range. */
+               if (range->lsr_start == flde->fce_range.lsr_start ||
+                  (range->lsr_end == flde->fce_range.lsr_end &&
+                   range->lsr_flags == flde->fce_range.lsr_flags)) {
+                       fld_cache_entry_delete(cache, flde);
+                       break;
+               }
+       }
+}
+
+/**
+ * Delete FLD entry in FLD cache.
+ *
+ */
+void fld_cache_delete(struct fld_cache *cache,
+                     const struct lu_seq_range *range)
+{
+       write_lock(&cache->fci_lock);
+       fld_cache_delete_nolock(cache, range);
+       write_unlock(&cache->fci_lock);
+}
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+                             struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *got = NULL;
+       struct list_head *head;
+
+       head = &cache->fci_entries_head;
+       list_for_each_entry(flde, head, fce_list) {
+               if (range->lsr_start == flde->fce_range.lsr_start ||
+                  (range->lsr_end == flde->fce_range.lsr_end &&
+                   range->lsr_flags == flde->fce_range.lsr_flags)) {
+                       got = flde;
+                       break;
+               }
+       }
+
+       RETURN(got);
+}
+
+/**
+ * Lookup the given \a range in the fld cache.
+ */
+struct fld_cache_entry
+*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range)
+{
+       struct fld_cache_entry *got = NULL;
+       ENTRY;
+
+       read_lock(&cache->fci_lock);
+       got = fld_cache_entry_lookup_nolock(cache, range);
+       read_unlock(&cache->fci_lock);
+       RETURN(got);
+}
+
+/**
+ * Lookup the range containing \a seq in the fld cache.
+ */
+int fld_cache_lookup(struct fld_cache *cache,
+                    const seqno_t seq, struct lu_seq_range *range)
+{
+       struct fld_cache_entry *flde;
+       struct fld_cache_entry *prev = NULL;
+       struct list_head *head;
+       ENTRY;
+
+       read_lock(&cache->fci_lock);
+       head = &cache->fci_entries_head;
+
+       cache->fci_stat.fst_count++;
+       list_for_each_entry(flde, head, fce_list) {
+               if (flde->fce_range.lsr_start > seq) {
+                       if (prev != NULL)
+                               *range = prev->fce_range;
+                       break;
+               }
+
+               prev = flde;
+               if (range_within(&flde->fce_range, seq)) {
+                       *range = flde->fce_range;
+
+                       cache->fci_stat.fst_cache++;
+                       read_unlock(&cache->fci_lock);
+                       RETURN(0);
+               }
+       }
+       read_unlock(&cache->fci_lock);
+       RETURN(-ENOENT);
+}
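
fld_cache_lookup() scans the sorted entry list for the range containing a sequence; when it walks past the spot where the sequence would live, it leaves the closest preceding range in *range as a hint and still returns -ENOENT. The containment test over a sorted array, as a small sketch (cache_lookup and struct rng are illustrative names):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct rng { uint64_t start, end; int index; };

/* Ranges are half-open [start, end) and sorted by start. */
static int cache_lookup(const struct rng *r, int n, uint64_t seq,
                        struct rng *out)
{
        const struct rng *prev = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (r[i].start > seq) {
                        if (prev)
                                *out = *prev;   /* closest-match hint */
                        break;
                }
                prev = &r[i];
                if (seq >= r[i].start && seq < r[i].end) {
                        *out = r[i];            /* range_within() analogue */
                        return 0;
                }
        }
        return -ENOENT;
}

int main(void)
{
        struct rng cache[] = {
                { 0x000, 0x400, 0 },
                { 0x800, 0xc00, 1 },
        };
        struct rng hit;

        if (cache_lookup(cache, 2, 0x900, &hit) == 0)
                printf("seq 0x900 -> index %d\n", hit.index);
        if (cache_lookup(cache, 2, 0x500, &hit) != 0)
                printf("seq 0x500 -> miss (hint index %d)\n", hit.index);
        return 0;
}
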
diff --git a/drivers/staging/lustre/lustre/fld/fld_handler.c b/drivers/staging/lustre/lustre/fld/fld_handler.c
new file mode 100644
index 0000000..d2707ae
--- /dev/null
+++ b/drivers/staging/lustre/lustre/fld/fld_handler.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_handler.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Pravin Shelar <pravin.shelar@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+# include <linux/libcfs/libcfs.h>
+# include <linux/module.h>
+# include <linux/jbd.h>
+# include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <md_object.h>
+#include <lustre_fid.h>
+#include <lustre_req_layout.h>
+#include "fld_internal.h"
+
+/* context key constructor/destructor: fld_key_init, fld_key_fini */
+LU_KEY_INIT_FINI(fld, struct fld_thread_info);
+
+/* context key: fld_thread_key */
+LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD);
+
+proc_dir_entry_t *fld_type_proc_dir = NULL;
+
+static int __init fld_mod_init(void)
+{
+       fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME,
+                                            proc_lustre_root,
+                                            NULL, NULL);
+       if (IS_ERR(fld_type_proc_dir))
+               return PTR_ERR(fld_type_proc_dir);
+
+       LU_CONTEXT_KEY_INIT(&fld_thread_key);
+       lu_context_key_register(&fld_thread_key);
+       return 0;
+}
+
+static void __exit fld_mod_exit(void)
+{
+       lu_context_key_degister(&fld_thread_key);
+       if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) {
+               lprocfs_remove(&fld_type_proc_dir);
+               fld_type_proc_dir = NULL;
+       }
+}
+
+int fld_declare_server_create(const struct lu_env *env,
+                             struct lu_server_fld *fld,
+                             struct lu_seq_range *range,
+                             struct thandle *th)
+{
+       int rc;
+
+       rc = fld_declare_index_create(env, fld, range, th);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_declare_server_create);
+
+/**
+ * Insert FLD index entry and update FLD cache.
+ *
+ * This function is called from the sequence allocator when a super-sequence
+ * is granted to a server.
+ */
+int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld,
+                     struct lu_seq_range *range, struct thandle *th)
+{
+       int rc;
+
+       mutex_lock(&fld->lsf_lock);
+       rc = fld_index_create(env, fld, range, th);
+       mutex_unlock(&fld->lsf_lock);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_create);
+
+/**
+ *  Lookup mds by seq, returns a range for given seq.
+ *
+ *  If that entry is not cached in fld cache, request is sent to super
+ *  sequence controller node (MDT0). All other MDT[1...N] and client
+ *  cache fld entries, but this cache is not persistent.
+ */
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                     seqno_t seq, struct lu_seq_range *range)
+{
+       struct lu_seq_range *erange;
+       struct fld_thread_info *info;
+       int rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+       erange = &info->fti_lrange;
+
+       /* Lookup it in the cache. */
+       rc = fld_cache_lookup(fld->lsf_cache, seq, erange);
+       if (rc == 0) {
+               if (unlikely(fld_range_type(erange) != fld_range_type(range) &&
+                            !fld_range_is_any(range))) {
+                       CERROR("%s: FLD cache range "DRANGE" does not match "
+                              "requested flag %x: rc = %d\n", fld->lsf_name,
+                              PRANGE(erange), range->lsr_flags, -EIO);
+                       RETURN(-EIO);
+               }
+               *range = *erange;
+               RETURN(0);
+       }
+
+       if (fld->lsf_obj) {
+               /* On the server side, all entries should be in the cache.
+                * If one cannot be found there, just return an error. */
+               CERROR("%s: Cannot find sequence "LPX64": rc = %d\n",
+                       fld->lsf_name, seq, -EIO);
+               RETURN(-EIO);
+       } else {
+               LASSERT(fld->lsf_control_exp);
+               /* Send the request to MDT0, i.e. the super sequence
+                * controller. This is a temporary solution; the long-term
+                * solution is FLD replication on all MDT servers.
+                */
+               range->lsr_start = seq;
+               rc = fld_client_rpc(fld->lsf_control_exp,
+                                   range, FLD_LOOKUP);
+               if (rc == 0)
+                       fld_cache_insert(fld->lsf_cache, range);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_server_lookup);
+
+/**
+ * All MDT servers handle the fld lookup operation, but only MDT0 has the
+ * fld index; if an entry is not found in the cache, the lookup request
+ * must be forwarded to MDT0.
+ */
+
+static int fld_server_handle(struct lu_server_fld *fld,
+                            const struct lu_env *env,
+                            __u32 opc, struct lu_seq_range *range,
+                            struct fld_thread_info *info)
+{
+       int rc;
+       ENTRY;
+
+       switch (opc) {
+       case FLD_LOOKUP:
+               rc = fld_server_lookup(env, fld, range->lsr_start, range);
+               break;
+       default:
+               rc = -EINVAL;
+               break;
+       }
+
+       CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %d, range: "
+              DRANGE")\n", fld->lsf_name, rc, opc, PRANGE(range));
+
+       RETURN(rc);
+}
+
+static int fld_req_handle(struct ptlrpc_request *req,
+                         struct fld_thread_info *info)
+{
+       struct obd_export *exp = req->rq_export;
+       struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site;
+       struct lu_seq_range *in;
+       struct lu_seq_range *out;
+       int rc;
+       __u32 *opc;
+       ENTRY;
+
+       rc = req_capsule_server_pack(info->fti_pill);
+       if (rc)
+               RETURN(err_serious(rc));
+
+       opc = req_capsule_client_get(info->fti_pill, &RMF_FLD_OPC);
+       if (opc != NULL) {
+               in = req_capsule_client_get(info->fti_pill, &RMF_FLD_MDFLD);
+               if (in == NULL)
+                       RETURN(err_serious(-EPROTO));
+               out = req_capsule_server_get(info->fti_pill, &RMF_FLD_MDFLD);
+               if (out == NULL)
+                       RETURN(err_serious(-EPROTO));
+               *out = *in;
+
+               /* For old 2.0 clients, 'lsr_flags' is uninitialized.
+                * Set it to 'LU_SEQ_RANGE_MDT' by default. */
+               if (!(exp_connect_flags(exp) & OBD_CONNECT_64BITHASH) &&
+                   !(exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
+                   !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) &&
+                   !exp->exp_libclient)
+                       fld_range_set_mdt(out);
+
+               rc = fld_server_handle(lu_site2seq(site)->ss_server_fld,
+                                      req->rq_svc_thread->t_env,
+                                      *opc, out, info);
+       } else {
+               rc = err_serious(-EPROTO);
+       }
+
+       RETURN(rc);
+}
+
+static void fld_thread_info_init(struct ptlrpc_request *req,
+                                struct fld_thread_info *info)
+{
+       info->fti_pill = &req->rq_pill;
+       /* Init request capsule. */
+       req_capsule_init(info->fti_pill, req, RCL_SERVER);
+       req_capsule_set(info->fti_pill, &RQF_FLD_QUERY);
+}
+
+static void fld_thread_info_fini(struct fld_thread_info *info)
+{
+       req_capsule_fini(info->fti_pill);
+}
+
+static int fld_handle(struct ptlrpc_request *req)
+{
+       struct fld_thread_info *info;
+       const struct lu_env *env;
+       int rc;
+
+       env = req->rq_svc_thread->t_env;
+       LASSERT(env != NULL);
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+
+       fld_thread_info_init(req, info);
+       rc = fld_req_handle(req, info);
+       fld_thread_info_fini(info);
+
+       return rc;
+}
+
+/*
+ * Entry point for handling FLD RPCs called from MDT.
+ */
+int fld_query(struct com_thread_info *info)
+{
+       return fld_handle(info->cti_pill->rc_req);
+}
+EXPORT_SYMBOL(fld_query);
+
+/*
+ * Returns true if the fid is local to this server node.
+ *
+ * WARNING: this function is *not* guaranteed to return false if fid is
+ * remote: it makes an educated conservative guess only.
+ *
+ * fid_is_local() is supposed to be used in assertion checks only.
+ */
+int fid_is_local(const struct lu_env *env,
+                struct lu_site *site, const struct lu_fid *fid)
+{
+       int result;
+       struct seq_server_site *ss_site;
+       struct lu_seq_range *range;
+       struct fld_thread_info *info;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       range = &info->fti_lrange;
+
+       result = 1; /* conservatively assume fid is local */
+       ss_site = lu_site2seq(site);
+       if (ss_site->ss_client_fld != NULL) {
+               int rc;
+
+               rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache,
+                                     fid_seq(fid), range);
+               if (rc == 0)
+                       result = (range->lsr_index == ss_site->ss_node_id);
+       }
+       return result;
+}
+EXPORT_SYMBOL(fid_is_local);
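+
+/*
+ * A minimal usage sketch (caller names are assumed, not taken from this
+ * patch): since fid_is_local() may report a remote fid as local but never
+ * the reverse, it is only safe inside assertions, e.g.:
+ *
+ *     LASSERT(fid_is_local(env, dev->ld_site, lu_object_fid(obj)));
+ */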
+
+static void fld_server_proc_fini(struct lu_server_fld *fld);
+
+#ifdef LPROCFS
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+       int rc = 0;
+       ENTRY;
+
+       fld->lsf_proc_dir = lprocfs_register(fld->lsf_name,
+                                            fld_type_proc_dir,
+                                            fld_server_proc_list, fld);
+       if (IS_ERR(fld->lsf_proc_dir)) {
+               rc = PTR_ERR(fld->lsf_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444,
+                               &fld_proc_seq_fops, fld);
+       if (rc) {
+               lprocfs_remove(&fld->lsf_proc_dir);
+               fld->lsf_proc_dir = NULL;
+       }
+
+       RETURN(rc);
+}
+
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+       ENTRY;
+       if (fld->lsf_proc_dir != NULL) {
+               if (!IS_ERR(fld->lsf_proc_dir))
+                       lprocfs_remove(&fld->lsf_proc_dir);
+               fld->lsf_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int fld_server_proc_init(struct lu_server_fld *fld)
+{
+       return 0;
+}
+
+static void fld_server_proc_fini(struct lu_server_fld *fld)
+{
+       return;
+}
+#endif
+
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+                   struct dt_device *dt, const char *prefix, int mds_node_id,
+                   int type)
+{
+       int cache_size, cache_threshold;
+       int rc;
+       ENTRY;
+
+       snprintf(fld->lsf_name, sizeof(fld->lsf_name),
+                "srv-%s", prefix);
+
+       cache_size = FLD_SERVER_CACHE_SIZE /
+               sizeof(struct fld_cache_entry);
+
+       cache_threshold = cache_size *
+               FLD_SERVER_CACHE_THRESHOLD / 100;
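+       /*
+        * For scale (illustrative arithmetic, entry size is build-dependent):
+        * with the 4MB FLD_SERVER_CACHE_SIZE budget and a fld_cache_entry of
+        * roughly 56 bytes on a 64-bit build, this yields on the order of
+        * 75k cached entries with a shrink threshold around 7.5k (10%).
+        */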
+
+       mutex_init(&fld->lsf_lock);
+       fld->lsf_cache = fld_cache_init(fld->lsf_name,
+                                       cache_size, cache_threshold);
+       if (IS_ERR(fld->lsf_cache)) {
+               rc = PTR_ERR(fld->lsf_cache);
+               fld->lsf_cache = NULL;
+               GOTO(out, rc);
+       }
+
+       if (!mds_node_id && type == LU_SEQ_RANGE_MDT) {
+               rc = fld_index_init(env, fld, dt);
+               if (rc)
+                       GOTO(out, rc);
+       } else {
+               fld->lsf_obj = NULL;
+       }
+
+       rc = fld_server_proc_init(fld);
+       if (rc)
+               GOTO(out, rc);
+
+       fld->lsf_control_exp = NULL;
+
+       GOTO(out, rc);
+
+out:
+       if (rc)
+               fld_server_fini(env, fld);
+       return rc;
+}
+EXPORT_SYMBOL(fld_server_init);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+       ENTRY;
+
+       fld_server_proc_fini(fld);
+       fld_index_fini(env, fld);
+
+       if (fld->lsf_cache != NULL) {
+               if (!IS_ERR(fld->lsf_cache))
+                       fld_cache_fini(fld->lsf_cache);
+               fld->lsf_cache = NULL;
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(fld_server_fini);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre FLD");
+MODULE_LICENSE("GPL");
+
+cfs_module(fld, "0.1.0", fld_mod_init, fld_mod_exit);
diff --git a/drivers/staging/lustre/lustre/fld/fld_index.c b/drivers/staging/lustre/lustre/fld/fld_index.c
new file mode 100644 (file)
index 0000000..ec68a54
--- /dev/null
@@ -0,0 +1,426 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_index.c
+ *
+ * Author: WangDi <wangdi@clusterfs.com>
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/module.h>
+#include <linux/jbd.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_mdc.h>
+#include <lustre_fid.h>
+#include <lustre_fld.h>
+#include "fld_internal.h"
+
+const char fld_index_name[] = "fld";
+
+static const struct lu_seq_range IGIF_FLD_RANGE = {
+       .lsr_start = FID_SEQ_IGIF,
+       .lsr_end   = FID_SEQ_IGIF_MAX + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = {
+       .lsr_start = FID_SEQ_DOT_LUSTRE,
+       .lsr_end   = FID_SEQ_DOT_LUSTRE + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+static const struct lu_seq_range ROOT_FLD_RANGE = {
+       .lsr_start = FID_SEQ_ROOT,
+       .lsr_end   = FID_SEQ_ROOT + 1,
+       .lsr_index = 0,
+       .lsr_flags = LU_SEQ_RANGE_MDT
+};
+
+const struct dt_index_features fld_index_features = {
+       .dif_flags       = DT_IND_UPDATE,
+       .dif_keysize_min = sizeof(seqno_t),
+       .dif_keysize_max = sizeof(seqno_t),
+       .dif_recsize_min = sizeof(struct lu_seq_range),
+       .dif_recsize_max = sizeof(struct lu_seq_range),
+       .dif_ptrsize     = 4
+};
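+
+/*
+ * Layout sketch, inferred from the features above and from
+ * fld_index_create(): each index record is keyed by the big-endian
+ * lsr_start of a range, and the record body is the whole lu_seq_range,
+ * also stored big-endian (hence the range_cpu_to_be() calls before
+ * dt_insert()/dt_delete()).
+ */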
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_declare_index_create(const struct lu_env *env,
+                            struct lu_server_fld *fld,
+                            const struct lu_seq_range *new_range,
+                            struct thandle *th)
+{
+       struct lu_seq_range     *tmp;
+       struct lu_seq_range     *range;
+       struct fld_thread_info  *info;
+       int                     rc = 0;
+
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       range = &info->fti_lrange;
+       tmp = &info->fti_irange;
+       memset(range, 0, sizeof(*range));
+
+       rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+       if (rc == 0) {
+               /* In case of a duplicate entry, the location must be the same */
+               LASSERT((range_compare_loc(new_range, range) == 0));
+               GOTO(out, rc = -EEXIST);
+       }
+
+       if (rc != -ENOENT) {
+               CERROR("%s: lookup range "DRANGE" error: rc = %d\n",
+                       fld->lsf_name, PRANGE(range), rc);
+               GOTO(out, rc);
+       }
+
+       /* Check for the merge case: since fld entries can only grow
+        * incrementally, we only check whether it can be merged from the left. */
+       if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+           range_compare_loc(new_range, range) == 0) {
+               range_cpu_to_be(tmp, range);
+               rc = dt_declare_delete(env, fld->lsf_obj,
+                                      (struct dt_key *)&tmp->lsr_start, th);
+               if (rc) {
+                       CERROR("%s: declare record "DRANGE" failed: rc = %d\n",
+                              fld->lsf_name, PRANGE(range), rc);
+                       GOTO(out, rc);
+               }
+               memcpy(tmp, new_range, sizeof(*new_range));
+               tmp->lsr_start = range->lsr_start;
+       } else {
+               memcpy(tmp, new_range, sizeof(*new_range));
+       }
+
+       range_cpu_to_be(tmp, tmp);
+       rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+                              (struct dt_key *)&tmp->lsr_start, th);
+out:
+       RETURN(rc);
+}
+
+/**
+ * Insert a range into the fld store.
+ *
+ *      \param  range  range to be inserted
+ *      \param  th     transaction for this operation, as it may be part
+ *                  of a compound transaction.
+ *
+ *      \retval  0  success
+ *      \retval  -ve error
+ *
+ * The whole fld index insertion is protected by seq->lss_mutex (see
+ * seq_server_alloc_super), i.e. only one thread accesses the fldb at a
+ * time, so we need not worry about the fld file and cache changing
+ * between declare and create.
+ * Because fld entries can only grow incrementally, we only check whether
+ * the new range can be merged from the left.
+ **/
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+                    const struct lu_seq_range *new_range, struct thandle *th)
+{
+       struct lu_seq_range     *range;
+       struct lu_seq_range     *tmp;
+       struct fld_thread_info  *info;
+       int                     rc = 0;
+       int                     deleted = 0;
+       struct fld_cache_entry  *flde;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+
+       LASSERT(mutex_is_locked(&fld->lsf_lock));
+
+       range = &info->fti_lrange;
+       memset(range, 0, sizeof(*range));
+       tmp = &info->fti_irange;
+       rc = fld_index_lookup(env, fld, new_range->lsr_start, range);
+       if (rc != -ENOENT) {
+               rc = rc == 0 ? -EEXIST : rc;
+               GOTO(out, rc);
+       }
+
+       if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 &&
+           range_compare_loc(new_range, range) == 0) {
+               range_cpu_to_be(tmp, range);
+               rc = dt_delete(env, fld->lsf_obj,
+                              (struct dt_key *)&tmp->lsr_start, th,
+                               BYPASS_CAPA);
+               if (rc != 0)
+                       GOTO(out, rc);
+               memcpy(tmp, new_range, sizeof(*new_range));
+               tmp->lsr_start = range->lsr_start;
+               deleted = 1;
+       } else {
+               memcpy(tmp, new_range, sizeof(*new_range));
+       }
+
+       range_cpu_to_be(tmp, tmp);
+       rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp,
+                      (struct dt_key *)&tmp->lsr_start, th, BYPASS_CAPA, 1);
+       if (rc != 0) {
+               CERROR("%s: insert range "DRANGE" failed: rc = %d\n",
+                      fld->lsf_name, PRANGE(new_range), rc);
+               GOTO(out, rc);
+       }
+
+       flde = fld_cache_entry_create(new_range);
+       if (IS_ERR(flde))
+               GOTO(out, rc = PTR_ERR(flde));
+
+       write_lock(&fld->lsf_cache->fci_lock);
+       if (deleted)
+               fld_cache_delete_nolock(fld->lsf_cache, new_range);
+       rc = fld_cache_insert_nolock(fld->lsf_cache, flde);
+       write_unlock(&fld->lsf_cache->fci_lock);
+       if (rc)
+               OBD_FREE_PTR(flde);
+out:
+       RETURN(rc);
+}
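+
+/*
+ * Worked example for the merge path above (hypothetical values): if the
+ * index already holds [0x100-0x200) on index 0 and [0x200-0x300) on
+ * index 0 is inserted, the left neighbour is deleted and a single
+ * widened record [0x100-0x300) is written instead; the cache is then
+ * updated to match under fci_lock.
+ */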
+
+/**
+ * Look up the range for a given seq. Note that we only care about the
+ * start/end here; the caller should handle the attached location data
+ * (flags, index).
+ *
+ * \param  seq     seq to look up.
+ * \param  range   result of the lookup.
+ *
+ * \retval  0     found, \a range is the matched range;
+ * \retval -ENOENT      not found, \a range is the left-side range;
+ * \retval  -ve         other error;
+ */
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                    seqno_t seq, struct lu_seq_range *range)
+{
+       struct lu_seq_range     *fld_rec;
+       struct fld_thread_info  *info;
+       int rc;
+
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       fld_rec = &info->fti_rec;
+
+       rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec);
+       if (rc == 0) {
+               *range = *fld_rec;
+               if (range_within(range, seq))
+                       rc = 0;
+               else
+                       rc = -ENOENT;
+       }
+
+       CDEBUG(D_INFO, "%s: lookup seq = "LPX64" range : "DRANGE" rc = %d\n",
+              fld->lsf_name, seq, PRANGE(range), rc);
+
+       RETURN(rc);
+}
+
+int fld_insert_entry(const struct lu_env *env,
+                    struct lu_server_fld *fld,
+                    const struct lu_seq_range *range)
+{
+       struct thandle *th;
+       int rc;
+       ENTRY;
+
+       th = dt_trans_create(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev));
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       rc = fld_declare_index_create(env, fld, range, th);
+       if (rc != 0) {
+               if (rc == -EEXIST)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+
+       rc = dt_trans_start_local(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev),
+                                 th);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = fld_index_create(env, fld, range, th);
+       if (rc == -EEXIST)
+               rc = 0;
+out:
+       dt_trans_stop(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev), th);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_insert_entry);
+
+static int fld_insert_special_entries(const struct lu_env *env,
+                                     struct lu_server_fld *fld)
+{
+       int rc;
+
+       rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE);
+
+       RETURN(rc);
+}
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+                  struct dt_device *dt)
+{
+       struct dt_object        *dt_obj = NULL;
+       struct lu_fid           fid;
+       struct lu_attr          *attr = NULL;
+       struct lu_seq_range     *range = NULL;
+       struct fld_thread_info  *info;
+       struct dt_object_format dof;
+       struct dt_it            *it;
+       const struct dt_it_ops  *iops;
+       int                     rc;
+       ENTRY;
+
+       info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+       LASSERT(info != NULL);
+
+       lu_local_obj_fid(&fid, FLD_INDEX_OID);
+       OBD_ALLOC_PTR(attr);
+       if (attr == NULL)
+               RETURN(-ENOMEM);
+
+       attr->la_valid = LA_MODE;
+       attr->la_mode = S_IFREG | 0666;
+       dof.dof_type = DFT_INDEX;
+       dof.u.dof_idx.di_feat = &fld_index_features;
+
+       dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr);
+       if (IS_ERR(dt_obj)) {
+               rc = PTR_ERR(dt_obj);
+               CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name,
+                       fld_index_name, rc);
+               dt_obj = NULL;
+               GOTO(out, rc);
+       }
+
+       fld->lsf_obj = dt_obj;
+       rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features);
+       if (rc != 0) {
+               CERROR("%s: File \"%s\" is not an index: rc = %d!\n",
+                      fld->lsf_name, fld_index_name, rc);
+               GOTO(out, rc);
+       }
+
+       range = &info->fti_rec;
+       /* Load fld entry to cache */
+       iops = &dt_obj->do_index_ops->dio_it;
+       it = iops->init(env, dt_obj, 0, NULL);
+       if (IS_ERR(it))
+               GOTO(out, rc = PTR_ERR(it));
+
+       rc = iops->load(env, it, 0);
+       if (rc < 0)
+               GOTO(out_it_fini, rc);
+
+       if (rc > 0) {
+               /* Load FLD entry into server cache */
+               do {
+                       rc = iops->rec(env, it, (struct dt_rec *)range, 0);
+                       if (rc != 0)
+                               GOTO(out_it_put, rc);
+                       LASSERT(range != NULL);
+                       range_be_to_cpu(range, range);
+                       rc = fld_cache_insert(fld->lsf_cache, range);
+                       if (rc != 0)
+                               GOTO(out_it_put, rc);
+                       rc = iops->next(env, it);
+               } while (rc == 0);
+       }
+
+       /* Note: fld_insert_entry will detect whether these
+        * special entries already exist inside FLDB */
+       mutex_lock(&fld->lsf_lock);
+       rc = fld_insert_special_entries(env, fld);
+       mutex_unlock(&fld->lsf_lock);
+       if (rc != 0) {
+               CERROR("%s: insert special entries failed!: rc = %d\n",
+                      fld->lsf_name, rc);
+               GOTO(out_it_put, rc);
+       }
+
+out_it_put:
+       iops->put(env, it);
+out_it_fini:
+       iops->fini(env, it);
+out:
+       if (attr != NULL)
+               OBD_FREE_PTR(attr);
+
+       if (rc != 0) {
+               if (dt_obj != NULL)
+                       lu_object_put(env, &dt_obj->do_lu);
+               fld->lsf_obj = NULL;
+       }
+       RETURN(rc);
+}
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld)
+{
+       ENTRY;
+       if (fld->lsf_obj != NULL) {
+               if (!IS_ERR(fld->lsf_obj))
+                       lu_object_put(env, &fld->lsf_obj->do_lu);
+               fld->lsf_obj = NULL;
+       }
+       EXIT;
+}
diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h
new file mode 100644 (file)
index 0000000..9fa9e01
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_internal.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Tom WangDi <wangdi@clusterfs.com>
+ */
+#ifndef __FLD_INTERNAL_H
+#define __FLD_INTERNAL_H
+
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+
+enum {
+       LUSTRE_FLD_INIT = 1 << 0,
+       LUSTRE_FLD_RUN  = 1 << 1
+};
+
+struct fld_stats {
+       __u64   fst_count;
+       __u64   fst_cache;
+       __u64   fst_inflight;
+};
+
+typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64);
+
+typedef struct lu_fld_target *
+(*fld_scan_func_t) (struct lu_client_fld *, __u64);
+
+struct lu_fld_hash {
+       const char            *fh_name;
+       fld_hash_func_t   fh_hash_func;
+       fld_scan_func_t   fh_scan_func;
+};
+
+struct fld_cache_entry {
+       struct list_head               fce_lru;
+       struct list_head               fce_list;
+       /**
+        * fld cache entries are sorted on range->lsr_start field. */
+       struct lu_seq_range      fce_range;
+};
+
+struct fld_cache {
+       /**
+        * Cache guard; mostly protects fci_hash, since the other fields are
+        * immutable once init has finished.
+        */
+       rwlock_t                 fci_lock;
+
+       /**
+        * Cache shrink threshold */
+       int                   fci_threshold;
+
+       /**
+        * Preferred number of cached entries */
+       int                   fci_cache_size;
+
+       /**
+        * Current number of cached entries. Protected by \a fci_lock */
+       int                   fci_cache_count;
+
+       /**
+        * LRU list fld entries. */
+       struct list_head               fci_lru;
+
+       /**
+        * sorted fld entries. */
+       struct list_head               fci_entries_head;
+
+       /**
+        * Cache statistics. */
+       struct fld_stats         fci_stat;
+
+       /**
+        * Cache name used for debug and messages. */
+       char                 fci_name[80];
+       unsigned int             fci_no_shrink:1;
+};
+
+enum fld_op {
+       FLD_CREATE = 0,
+       FLD_DELETE = 1,
+       FLD_LOOKUP = 2
+};
+
+enum {
+       /* 4M of FLD cache will not hurt the server much. */
+       FLD_SERVER_CACHE_SIZE      = (4 * 0x100000),
+
+       /* 1M of FLD cache will not hurt the client much. */
+       FLD_CLIENT_CACHE_SIZE      = (1 * 0x100000)
+};
+
+enum {
+       /* Cache threshold is 10 percent of size. */
+       FLD_SERVER_CACHE_THRESHOLD = 10,
+
+       /* Cache threshold is 10 percent of size. */
+       FLD_CLIENT_CACHE_THRESHOLD = 10
+};
+
+extern struct lu_fld_hash fld_hash[];
+
+
+struct fld_thread_info {
+       struct req_capsule *fti_pill;
+       __u64          fti_key;
+       struct lu_seq_range fti_rec;
+       struct lu_seq_range fti_lrange;
+       struct lu_seq_range fti_irange;
+};
+
+extern struct lu_context_key fld_thread_key;
+
+int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld,
+                  struct dt_device *dt);
+
+void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_index_create(const struct lu_env *env,
+                            struct lu_server_fld *fld,
+                            const struct lu_seq_range *new,
+                            struct thandle *th);
+
+int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld,
+                    const struct lu_seq_range *new, struct thandle *th);
+
+int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                    seqno_t seq, struct lu_seq_range *range);
+
+int fld_client_rpc(struct obd_export *exp,
+                  struct lu_seq_range *range, __u32 fld_op);
+
+#ifdef LPROCFS
+extern struct lprocfs_vars fld_server_proc_list[];
+extern struct lprocfs_vars fld_client_proc_list[];
+#endif
+
+
+struct fld_cache *fld_cache_init(const char *name,
+                                int cache_size, int cache_threshold);
+
+void fld_cache_fini(struct fld_cache *cache);
+
+void fld_cache_flush(struct fld_cache *cache);
+
+int fld_cache_insert(struct fld_cache *cache,
+                    const struct lu_seq_range *range);
+
+struct fld_cache_entry
+*fld_cache_entry_create(const struct lu_seq_range *range);
+
+int fld_cache_insert_nolock(struct fld_cache *cache,
+                           struct fld_cache_entry *f_new);
+void fld_cache_delete(struct fld_cache *cache,
+                     const struct lu_seq_range *range);
+void fld_cache_delete_nolock(struct fld_cache *cache,
+                            const struct lu_seq_range *range);
+int fld_cache_lookup(struct fld_cache *cache,
+                    const seqno_t seq, struct lu_seq_range *range);
+
+struct fld_cache_entry*
+fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range);
+void fld_cache_entry_delete(struct fld_cache *cache,
+                           struct fld_cache_entry *node);
+void fld_dump_cache_entries(struct fld_cache *cache);
+
+struct fld_cache_entry
+*fld_cache_entry_lookup_nolock(struct fld_cache *cache,
+                             struct lu_seq_range *range);
+int fld_write_range(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_seq_range *range, struct thandle *th);
+
+static inline const char *
+fld_target_name(struct lu_fld_target *tar)
+{
+       if (tar->ft_srv != NULL)
+               return tar->ft_srv->lsf_name;
+
+       return (const char *)tar->ft_exp->exp_obd->obd_name;
+}
+
+extern proc_dir_entry_t *fld_type_proc_dir;
+extern struct file_operations fld_proc_seq_fops;
+#endif /* __FLD_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
new file mode 100644 (file)
index 0000000..e9f0739
--- /dev/null
@@ -0,0 +1,519 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/fld_request.c
+ *
+ * FLD (Fids Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/module.h>
+#include <linux/jbd.h>
+#include <asm/div64.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_ver.h>
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#include <dt_object.h>
+#include <md_object.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_mdc.h>
+#include "fld_internal.h"
+
+/* TODO: these three functions are copies of the flow-control code from
+ * mdc_lib.c. They should be made common, as should the mdc RPC lock. */
+static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+       int rc;
+       ENTRY;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&mcw->mcw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       RETURN(rc);
+};
+
+static void fld_enter_request(struct client_obd *cli)
+{
+       struct mdc_cache_waiter mcw;
+       struct l_wait_info lwi = { 0 };
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+               list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+               init_waitqueue_head(&mcw.mcw_waitq);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi);
+       } else {
+               cli->cl_r_in_flight++;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+}
+
+static void fld_exit_request(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct mdc_cache_waiter *mcw;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_r_in_flight--;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+
+               if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+                       /* No free request slots anymore */
+                       break;
+               }
+
+               mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+               list_del_init(&mcw->mcw_entry);
+               cli->cl_r_in_flight++;
+               wake_up(&mcw->mcw_waitq);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
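+
+/*
+ * Note: fld_enter_request()/fld_exit_request() always bracket the actual
+ * RPC, bounding the number of concurrent FLD requests by
+ * cl_max_rpcs_in_flight; see fld_client_rpc() below for the call site.
+ */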
+
+static int fld_rrb_hash(struct lu_client_fld *fld,
+                       seqno_t seq)
+{
+       LASSERT(fld->lcf_count > 0);
+       return do_div(seq, fld->lcf_count);
+}
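+
+/*
+ * For example (illustrative): fld_rrb_hash() reduces to seq % lcf_count,
+ * so with three targets consecutive normal sequences map to indices
+ * 0, 1, 2, 0, ... in round-robin order.
+ */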
+
+static struct lu_fld_target *
+fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq)
+{
+       struct lu_fld_target *target;
+       int hash;
+       ENTRY;
+
+       /* Because almost all special sequences are located on MDT0, they
+        * should go to index 0 directly, instead of calculating the hash
+        * again; also, if the other MDTs are not yet connected, fld lookup
+        * requests (for seqs on MDT0) should not be blocked because of
+        * those other MDTs. */
+       if (fid_seq_is_norm(seq))
+               hash = fld_rrb_hash(fld, seq);
+       else
+               hash = 0;
+
+       list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+               if (target->ft_idx == hash)
+                       RETURN(target);
+       }
+
+       CERROR("%s: Can't find target by hash %d (seq "LPX64"). "
+              "Targets (%d):\n", fld->lcf_name, hash, seq,
+              fld->lcf_count);
+
+       list_for_each_entry(target, &fld->lcf_targets, ft_chain) {
+               const char *srv_name = target->ft_srv != NULL  ?
+                       target->ft_srv->lsf_name : "<null>";
+               const char *exp_name = target->ft_exp != NULL ?
+                       (char *)target->ft_exp->exp_obd->obd_uuid.uuid :
+                       "<null>";
+
+               CERROR("  exp: 0x%p (%s), srv: 0x%p (%s), idx: "LPU64"\n",
+                      target->ft_exp, exp_name, target->ft_srv,
+                      srv_name, target->ft_idx);
+       }
+
+       /*
+        * If target is not found, there is logical error anyway, so here is
+        * LBUG() to catch this situation.
+        */
+       LBUG();
+       RETURN(NULL);
+}
+
+struct lu_fld_hash fld_hash[] = {
+       {
+               .fh_name = "RRB",
+               .fh_hash_func = fld_rrb_hash,
+               .fh_scan_func = fld_rrb_scan
+       },
+       {
+               0,
+       }
+};
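+
+/*
+ * The table above is zero-terminated and indexed by the "hash" argument
+ * of fld_client_init(); e.g. (illustrative) fld_client_init(fld, "lmv", 0)
+ * selects the round-robin ("RRB") hash.
+ */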
+
+static struct lu_fld_target *
+fld_client_get_target(struct lu_client_fld *fld, seqno_t seq)
+{
+       struct lu_fld_target *target;
+       ENTRY;
+
+       LASSERT(fld->lcf_hash != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       target = fld->lcf_hash->fh_scan_func(fld, seq);
+       spin_unlock(&fld->lcf_lock);
+
+       if (target != NULL) {
+               CDEBUG(D_INFO, "%s: Found target (idx "LPU64
+                      ") by seq "LPX64"\n", fld->lcf_name,
+                      target->ft_idx, seq);
+       }
+
+       RETURN(target);
+}
+
+/*
+ * Add an export to the FLD. This is usually done by CMM and LMV, as they
+ * are the main users of the FLD module.
+ */
+int fld_client_add_target(struct lu_client_fld *fld,
+                         struct lu_fld_target *tar)
+{
+       const char *name;
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       LASSERT(tar != NULL);
+       name = fld_target_name(tar);
+       LASSERT(name != NULL);
+       LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL);
+
+       if (fld->lcf_flags != LUSTRE_FLD_INIT) {
+               CERROR("%s: Attempt to add target %s (idx "LPU64") "
+                      "on fly - skip it\n", fld->lcf_name, name,
+                      tar->ft_idx);
+               RETURN(0);
+       } else {
+               CDEBUG(D_INFO, "%s: Adding target %s (idx "
+                      LPU64")\n", fld->lcf_name, name, tar->ft_idx);
+       }
+
+       OBD_ALLOC_PTR(target);
+       if (target == NULL)
+               RETURN(-ENOMEM);
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) {
+               if (tmp->ft_idx == tar->ft_idx) {
+                       spin_unlock(&fld->lcf_lock);
+                       OBD_FREE_PTR(target);
+                       CERROR("Target %s exists in FLD and known as %s:#"LPU64"\n",
+                              name, fld_target_name(tmp), tmp->ft_idx);
+                       RETURN(-EEXIST);
+               }
+       }
+
+       target->ft_exp = tar->ft_exp;
+       if (target->ft_exp != NULL)
+               class_export_get(target->ft_exp);
+       target->ft_srv = tar->ft_srv;
+       target->ft_idx = tar->ft_idx;
+
+       list_add_tail(&target->ft_chain,
+                         &fld->lcf_targets);
+
+       fld->lcf_count++;
+       spin_unlock(&fld->lcf_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(fld_client_add_target);
+
+/* Remove export from FLD */
+int fld_client_del_target(struct lu_client_fld *fld, __u64 idx)
+{
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry_safe(target, tmp,
+                                    &fld->lcf_targets, ft_chain) {
+               if (target->ft_idx == idx) {
+                       fld->lcf_count--;
+                       list_del(&target->ft_chain);
+                       spin_unlock(&fld->lcf_lock);
+
+                       if (target->ft_exp != NULL)
+                               class_export_put(target->ft_exp);
+
+                       OBD_FREE_PTR(target);
+                       RETURN(0);
+               }
+       }
+       spin_unlock(&fld->lcf_lock);
+       RETURN(-ENOENT);
+}
+EXPORT_SYMBOL(fld_client_del_target);
+
+#ifdef LPROCFS
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+       int rc;
+       ENTRY;
+
+       fld->lcf_proc_dir = lprocfs_register(fld->lcf_name,
+                                            fld_type_proc_dir,
+                                            NULL, NULL);
+
+       if (IS_ERR(fld->lcf_proc_dir)) {
+               CERROR("%s: LProcFS failed in fld-init\n",
+                      fld->lcf_name);
+               rc = PTR_ERR(fld->lcf_proc_dir);
+               RETURN(rc);
+       }
+
+       rc = lprocfs_add_vars(fld->lcf_proc_dir,
+                             fld_client_proc_list, fld);
+       if (rc) {
+               CERROR("%s: Can't init FLD proc, rc %d\n",
+                      fld->lcf_name, rc);
+               GOTO(out_cleanup, rc);
+       }
+
+       RETURN(0);
+
+out_cleanup:
+       fld_client_proc_fini(fld);
+       return rc;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+       ENTRY;
+       if (fld->lcf_proc_dir) {
+               if (!IS_ERR(fld->lcf_proc_dir))
+                       lprocfs_remove(&fld->lcf_proc_dir);
+               fld->lcf_proc_dir = NULL;
+       }
+       EXIT;
+}
+#else
+static int fld_client_proc_init(struct lu_client_fld *fld)
+{
+       return 0;
+}
+
+void fld_client_proc_fini(struct lu_client_fld *fld)
+{
+       return;
+}
+#endif
+
+EXPORT_SYMBOL(fld_client_proc_fini);
+
+static inline int hash_is_sane(int hash)
+{
+       /* Exclude the terminating zero entry of fld_hash[]. */
+       return (hash >= 0 && hash < ARRAY_SIZE(fld_hash) - 1);
+}
+
+int fld_client_init(struct lu_client_fld *fld,
+                   const char *prefix, int hash)
+{
+       int cache_size, cache_threshold;
+       int rc;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       snprintf(fld->lcf_name, sizeof(fld->lcf_name),
+                "cli-%s", prefix);
+
+       if (!hash_is_sane(hash)) {
+               CERROR("%s: Wrong hash function %#x\n",
+                      fld->lcf_name, hash);
+               RETURN(-EINVAL);
+       }
+
+       fld->lcf_count = 0;
+       spin_lock_init(&fld->lcf_lock);
+       fld->lcf_hash = &fld_hash[hash];
+       fld->lcf_flags = LUSTRE_FLD_INIT;
+       INIT_LIST_HEAD(&fld->lcf_targets);
+
+       cache_size = FLD_CLIENT_CACHE_SIZE /
+               sizeof(struct fld_cache_entry);
+
+       cache_threshold = cache_size *
+               FLD_CLIENT_CACHE_THRESHOLD / 100;
+
+       fld->lcf_cache = fld_cache_init(fld->lcf_name,
+                                       cache_size, cache_threshold);
+       if (IS_ERR(fld->lcf_cache)) {
+               rc = PTR_ERR(fld->lcf_cache);
+               fld->lcf_cache = NULL;
+               GOTO(out, rc);
+       }
+
+       rc = fld_client_proc_init(fld);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+out:
+       if (rc)
+               fld_client_fini(fld);
+       else
+               CDEBUG(D_INFO, "%s: Using \"%s\" hash\n",
+                      fld->lcf_name, fld->lcf_hash->fh_name);
+       return rc;
+}
+EXPORT_SYMBOL(fld_client_init);
+
+void fld_client_fini(struct lu_client_fld *fld)
+{
+       struct lu_fld_target *target, *tmp;
+       ENTRY;
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry_safe(target, tmp,
+                                    &fld->lcf_targets, ft_chain) {
+               fld->lcf_count--;
+               list_del(&target->ft_chain);
+               if (target->ft_exp != NULL)
+                       class_export_put(target->ft_exp);
+               OBD_FREE_PTR(target);
+       }
+       spin_unlock(&fld->lcf_lock);
+
+       if (fld->lcf_cache != NULL) {
+               if (!IS_ERR(fld->lcf_cache))
+                       fld_cache_fini(fld->lcf_cache);
+               fld->lcf_cache = NULL;
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(fld_client_fini);
+
+int fld_client_rpc(struct obd_export *exp,
+                  struct lu_seq_range *range, __u32 fld_op)
+{
+       struct ptlrpc_request *req;
+       struct lu_seq_range   *prange;
+       __u32            *op;
+       int                 rc;
+       struct obd_import     *imp;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+
+       imp = class_exp2cliimp(exp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION,
+                                       FLD_QUERY);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC);
+       *op = fld_op;
+
+       prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD);
+       *prange = *range;
+
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = FLD_REQUEST_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (fld_op == FLD_LOOKUP &&
+           imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS)
+               req->rq_allow_replay = 1;
+
+       if (fld_op != FLD_LOOKUP)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       fld_enter_request(&exp->exp_obd->u.cli);
+       rc = ptlrpc_queue_wait(req);
+       fld_exit_request(&exp->exp_obd->u.cli);
+       if (fld_op != FLD_LOOKUP)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc)
+               GOTO(out_req, rc);
+
+       prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD);
+       if (prange == NULL)
+               GOTO(out_req, rc = -EFAULT);
+       *range = *prange;
+       EXIT;
+out_req:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+                     __u32 flags, const struct lu_env *env)
+{
+       struct lu_seq_range res = { 0 };
+       struct lu_fld_target *target;
+       int rc;
+       ENTRY;
+
+       fld->lcf_flags |= LUSTRE_FLD_RUN;
+
+       rc = fld_cache_lookup(fld->lcf_cache, seq, &res);
+       if (rc == 0) {
+               *mds = res.lsr_index;
+               RETURN(0);
+       }
+
+       /* Cannot find it in the cache */
+       target = fld_client_get_target(fld, seq);
+       LASSERT(target != NULL);
+
+       CDEBUG(D_INFO, "%s: Lookup fld entry (seq: "LPX64") on "
+              "target %s (idx "LPU64")\n", fld->lcf_name, seq,
+              fld_target_name(target), target->ft_idx);
+
+       res.lsr_start = seq;
+       fld_range_set_type(&res, flags);
+       if (target->ft_srv != NULL) {
+               LASSERT(env != NULL);
+               rc = fld_server_lookup(env, target->ft_srv, seq, &res);
+       } else {
+               rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP);
+       }
+
+       if (rc == 0) {
+               *mds = res.lsr_index;
+
+               fld_cache_insert(fld->lcf_cache, &res);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(fld_client_lookup);
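+
+/*
+ * A usage sketch (surrounding names are assumed): resolve which MDT
+ * stores a fid's sequence, consulting the cache first and falling back
+ * to a local server lookup or an RPC:
+ *
+ *     mdsno_t mds;
+ *
+ *     rc = fld_client_lookup(fld, fid_seq(fid), &mds,
+ *                            LU_SEQ_RANGE_MDT, env);
+ *     if (rc == 0)
+ *             route_to_mdt(mds);
+ *
+ * where route_to_mdt() stands for whatever the caller does with the
+ * resolved index.
+ */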
+
+void fld_client_flush(struct lu_client_fld *fld)
+{
+       fld_cache_flush(fld->lcf_cache);
+}
+EXPORT_SYMBOL(fld_client_flush);
diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c
new file mode 100644 (file)
index 0000000..c1bd803
--- /dev/null
@@ -0,0 +1,373 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/fld/lproc_fld.c
+ *
+ * FLD (FIDs Location Database)
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ *     Di Wang <di.wang@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FLD
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/module.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <dt_object.h>
+#include <md_object.h>
+#include <obd_support.h>
+#include <lustre_req_layout.h>
+#include <lustre_fld.h>
+#include <lustre_fid.h>
+#include "fld_internal.h"
+
+#ifdef LPROCFS
+static int
+fld_proc_targets_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)m->private;
+       struct lu_fld_target *target;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       list_for_each_entry(target,
+                               &fld->lcf_targets, ft_chain)
+               seq_printf(m, "%s\n", fld_target_name(target));
+       spin_unlock(&fld->lcf_lock);
+
+       RETURN(0);
+}
+
+static int
+fld_proc_hash_seq_show(struct seq_file *m, void *unused)
+{
+       struct lu_client_fld *fld = (struct lu_client_fld *)m->private;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       spin_lock(&fld->lcf_lock);
+       seq_printf(m, "%s\n", fld->lcf_hash->fh_name);
+       spin_unlock(&fld->lcf_lock);
+
+       RETURN(0);
+}
+
+static ssize_t
+fld_proc_hash_seq_write(struct file *file, const char __user *buffer,
+                       size_t count, loff_t *off)
+{
+       struct lu_client_fld *fld;
+       struct lu_fld_hash *hash = NULL;
+       char fh_name[8];
+       int i;
+       ENTRY;
+
+       fld = ((struct seq_file *)file->private_data)->private;
+       LASSERT(fld != NULL);
+
+       /* The user buffer cannot be dereferenced directly; copy the
+        * candidate hash name into a small kernel buffer first. */
+       if (count > sizeof(fh_name))
+               RETURN(-E2BIG);
+
+       if (copy_from_user(fh_name, buffer, count))
+               RETURN(-EFAULT);
+
+       for (i = 0; fld_hash[i].fh_name != NULL; i++) {
+               if (count != strlen(fld_hash[i].fh_name))
+                       continue;
+
+               if (!strncmp(fld_hash[i].fh_name, fh_name, count)) {
+                       hash = &fld_hash[i];
+                       break;
+               }
+       }
+
+       if (hash != NULL) {
+               spin_lock(&fld->lcf_lock);
+               fld->lcf_hash = hash;
+               spin_unlock(&fld->lcf_lock);
+
+               CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n",
+                      fld->lcf_name, hash->fh_name);
+       }
+
+       RETURN(count);
+}
+
+static ssize_t
+fld_proc_cache_flush_write(struct file *file, const char __user *buffer,
+                              size_t count, loff_t *pos)
+{
+       struct lu_client_fld *fld = file->private_data;
+       ENTRY;
+
+       LASSERT(fld != NULL);
+
+       fld_cache_flush(fld->lcf_cache);
+
+       CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name);
+
+       RETURN(count);
+}
+
+static int fld_proc_cache_flush_open(struct inode *inode, struct file *file)
+{
+       file->private_data = PDE_DATA(inode);
+       return 0;
+}
+
+static int fld_proc_cache_flush_release(struct inode *inode, struct file *file)
+{
+       file->private_data = NULL;
+       return 0;
+}
+
+struct file_operations fld_proc_cache_flush_fops = {
+       .owner          = THIS_MODULE,
+       .open           = fld_proc_cache_flush_open,
+       .write          = fld_proc_cache_flush_write,
+       .release        = fld_proc_cache_flush_release,
+};
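+
+/*
+ * Any write to the corresponding proc file flushes the lookup cache,
+ * e.g. (the path root is illustrative and depends on fld_type_proc_dir):
+ *
+ *     echo 1 > /proc/fs/lustre/fld/cli-lmv-.../cache_flush
+ */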
+
+struct fld_seq_param {
+       struct lu_env           fsp_env;
+       struct dt_it            *fsp_it;
+       struct lu_server_fld    *fsp_fld;
+       unsigned int            fsp_stop:1;
+};
+
+static void *fldb_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+
+       if (param == NULL || param->fsp_stop)
+               return NULL;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       iops->load(&param->fsp_env, param->fsp_it, *pos);
+
+       *pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+       return param;
+}
+
+static void fldb_seq_stop(struct seq_file *p, void *v)
+{
+       struct fld_seq_param    *param = p->private;
+       const struct dt_it_ops  *iops;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+
+       if (param == NULL)
+               return;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       iops->put(&param->fsp_env, param->fsp_it);
+}
+
+static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       int                     rc;
+
+       if (param == NULL || param->fsp_stop)
+               return NULL;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       rc = iops->next(&param->fsp_env, param->fsp_it);
+       if (rc > 0) {
+               param->fsp_stop = 1;
+               return NULL;
+       }
+
+       *pos = be64_to_cpu(*(__u64 *)iops->key(&param->fsp_env, param->fsp_it));
+       return param;
+}
+
+static int fldb_seq_show(struct seq_file *p, void *v)
+{
+       struct fld_seq_param    *param = p->private;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       struct fld_thread_info  *info;
+       struct lu_seq_range     *fld_rec;
+       int                     rc;
+
+       if (param == NULL || param->fsp_stop)
+               return 0;
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       info = lu_context_key_get(&param->fsp_env.le_ctx,
+                                 &fld_thread_key);
+       fld_rec = &info->fti_rec;
+       rc = iops->rec(&param->fsp_env, param->fsp_it,
+                      (struct dt_rec *)fld_rec, 0);
+       if (rc != 0) {
+               CERROR("%s:read record error: rc %d\n",
+                      fld->lsf_name, rc);
+       } else if (fld_rec->lsr_start != 0) {
+               range_be_to_cpu(fld_rec, fld_rec);
+               rc = seq_printf(p, DRANGE"\n", PRANGE(fld_rec));
+       }
+
+       return rc;
+}
+
+struct seq_operations fldb_sops = {
+       .start = fldb_seq_start,
+       .stop = fldb_seq_stop,
+       .next = fldb_seq_next,
+       .show = fldb_seq_show,
+};
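+
+/*
+ * Reading the resulting "fldb" proc file walks the index with the
+ * iterator above and prints one DRANGE-formatted line per record,
+ * roughly "[0x400000400-0x440000400):0:mdt" (values illustrative; the
+ * exact format comes from the DRANGE/PRANGE macros).
+ */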
+
+static int fldb_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file         *seq;
+       struct lu_server_fld    *fld = (struct lu_server_fld *)PDE_DATA(inode);
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+       struct fld_seq_param    *param = NULL;
+       int                     env_init = 0;
+       int                     rc;
+
+       rc = seq_open(file, &fldb_sops);
+       if (rc)
+               GOTO(out, rc);
+
+       obj = fld->lsf_obj;
+       if (obj == NULL) {
+               seq = file->private_data;
+               seq->private = NULL;
+               return 0;
+       }
+
+       OBD_ALLOC_PTR(param);
+       if (param == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = lu_env_init(&param->fsp_env, LCT_MD_THREAD);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       env_init = 1;
+       iops = &obj->do_index_ops->dio_it;
+       param->fsp_it = iops->init(&param->fsp_env, obj, 0, NULL);
+       if (IS_ERR(param->fsp_it))
+               GOTO(out, rc = PTR_ERR(param->fsp_it));
+
+       param->fsp_fld = fld;
+       param->fsp_stop = 0;
+
+       seq = file->private_data;
+       seq->private = param;
+out:
+       if (rc != 0) {
+               if (env_init == 1)
+                       lu_env_fini(&param->fsp_env);
+               if (param != NULL)
+                       OBD_FREE_PTR(param);
+       }
+       return rc;
+}
+
+static int fldb_seq_release(struct inode *inode, struct file *file)
+{
+       struct seq_file         *seq = file->private_data;
+       struct fld_seq_param    *param;
+       struct lu_server_fld    *fld;
+       struct dt_object        *obj;
+       const struct dt_it_ops  *iops;
+
+       param = seq->private;
+       if (param == NULL) {
+               lprocfs_seq_release(inode, file);
+               return 0;
+       }
+
+       fld = param->fsp_fld;
+       obj = fld->lsf_obj;
+       LASSERT(obj != NULL);
+       iops = &obj->do_index_ops->dio_it;
+
+       LASSERT(iops != NULL);
+       LASSERT(param->fsp_it != NULL);
+       iops->fini(&param->fsp_env, param->fsp_it);
+       lu_env_fini(&param->fsp_env);
+       OBD_FREE_PTR(param);
+       lprocfs_seq_release(inode, file);
+
+       return 0;
+}
+
+struct lprocfs_vars fld_server_proc_list[] = {
+       { NULL }};
+
+LPROC_SEQ_FOPS_RO(fld_proc_targets);
+LPROC_SEQ_FOPS(fld_proc_hash);
+
+struct lprocfs_vars fld_client_proc_list[] = {
+       { "targets", &fld_proc_targets_fops },
+       { "hash", &fld_proc_hash_fops },
+       { "cache_flush", &fld_proc_cache_flush_fops },
+       { NULL }};
+
+struct file_operations fld_proc_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = fldb_seq_open,
+       .read    = seq_read,
+       .release = fldb_seq_release,
+};
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h
new file mode 100644 (file)
index 0000000..4bb6880
--- /dev/null
@@ -0,0 +1,3279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ *   - cl_object
+ *
+ *   - cl_page
+ *
+ *   - cl_lock     represents an extent lock on an object.
+ *
+ *   - cl_io       represents high-level i/o activity such as a whole
+ *              read/write system call, or the write-out of pages from under
+ *              a lock being canceled. cl_io has sub-ios that can be stopped
+ *              and resumed independently, thus achieving a high degree of
+ *              transfer parallelism. A single cl_io can be advanced forward
+ *              by multiple threads (although in the most usual case of a
+ *              read/write system call it is associated with the single user
+ *              thread that issued the system call).
+ *
+ *   - cl_req      represents a collection of pages for a transfer. cl_req is
+ *              constructed by a req-forming engine that tries to saturate
+ *              the transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ *     - to avoid confusion, a high-level I/O operation like a read or write
+ *     system call is referred to as "an io", whereas a low-level I/O
+ *     operation, like an RPC, is referred to as "a transfer"
+ *
+ *     - "generic code" means generic (not file system specific) code in the
+ *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ *     is not layer specific.
+ *
+ * Locking.
+ *
+ *  - i_mutex
+ *      - PG_locked
+ *       - cl_object_header::coh_page_guard
+ *       - cl_object_header::coh_lock_guard
+ *       - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+       /**
+        * Initialize cl_req. This method is called top-to-bottom on all
+        * devices in the stack to get them a chance to allocate layer-private
+        * data, and to attach them to the cl_req by calling
+        * cl_req_slice_add().
+        *
+        * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+        * \see ccc_req_init()
+        */
+       int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+                           struct cl_req *req);
+};
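+
+/*
+ * Illustrative sketch (not part of the original header): the typical shape
+ * of a layer's cdo_req_init() method. "my_req", "my_req_init" and
+ * "my_req_ops" are hypothetical names; cl_req_slice_add() is the attachment
+ * step described above, assumed to take (req, slice, dev, ops) as in the
+ * osc/lov implementations.
+ */
+#if 0 /* example only */
+struct my_req {
+	struct cl_req_slice mr_cl;	/* generic slice, registered below */
+	/* ... layer-private per-transfer state ... */
+};
+
+static const struct cl_req_operations my_req_ops;
+
+static int my_req_init(const struct lu_env *env, struct cl_device *dev,
+		       struct cl_req *req)
+{
+	struct my_req *mr;
+
+	/* allocate layer-private data for this transfer */
+	mr = kzalloc(sizeof(*mr), GFP_NOFS);
+	if (mr == NULL)
+		return -ENOMEM;
+	/* attach it to the cl_req so this layer sees transfer events */
+	cl_req_slice_add(req, &mr->mr_cl, dev, &my_req_ops);
+	return 0;
+}
+#endif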
+
+/**
+ * Device in the client stack.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+       /** Super-class. */
+       struct lu_device                   cd_lu_dev;
+       /** Per-layer operation vector. */
+       const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ */
+struct cl_attr {
+       /** Object size, in bytes */
+       loff_t cat_size;
+       /**
+        * Known minimal size, in bytes.
+        *
+        * This is only valid when at least one DLM lock is held.
+        */
+       loff_t cat_kms;
+       /** Modification time. Measured in seconds since epoch. */
+       time_t cat_mtime;
+       /** Access time. Measured in seconds since epoch. */
+       time_t cat_atime;
+       /** Change time. Measured in seconds since epoch. */
+       time_t cat_ctime;
+       /**
+        * Blocks allocated to this cl_object on the server file system.
+        *
+        * \todo XXX An interface for block size is needed.
+        */
+       __u64  cat_blocks;
+       /**
+        * User identifier for quota purposes.
+        */
+       uid_t  cat_uid;
+       /**
+        * Group identifier for quota purposes.
+        */
+       gid_t  cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+       CAT_SIZE   = 1 << 0,
+       CAT_KMS    = 1 << 1,
+       CAT_MTIME  = 1 << 3,
+       CAT_ATIME  = 1 << 4,
+       CAT_CTIME  = 1 << 5,
+       CAT_BLOCKS = 1 << 6,
+       CAT_UID    = 1 << 7,
+       CAT_GID    = 1 << 8
+};
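+
+/*
+ * Illustrative sketch (not part of the original header): composing a
+ * cl_attr_valid mask and pushing new attributes down the layers. It assumes
+ * the cl_object_attr_lock()/cl_object_attr_set() helpers declared elsewhere
+ * in this header; "example_set_size" is a hypothetical caller.
+ */
+#if 0 /* example only */
+static int example_set_size(const struct lu_env *env, struct cl_object *obj,
+			    loff_t size)
+{
+	struct cl_attr attr = { 0 };
+	int result;
+
+	attr.cat_size  = size;
+	attr.cat_mtime = get_seconds();
+	/* coh_attr_guard of the top-object must be held, see coo_attr_set() */
+	cl_object_attr_lock(obj);
+	result = cl_object_attr_set(env, obj, &attr, CAT_SIZE | CAT_MTIME);
+	cl_object_attr_unlock(obj);
+	return result;
+}
+#endif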
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ *    stripe. cl_object is based on lu_object: it is identified by a fid,
+ *    layered, cached, hashed, and lrued. An important distinction from the
+ *    server side, where md_object and dt_object are used, is that cl_object
+ *    "fans out" at the lov/sns level: depending on the file layout, a single
+ *    file is represented as a set of "sub-objects" (stripes). At the
+ *    implementation level, struct lov_object contains an array of cl_objects.
+ *    Each sub-object is a full-fledged cl_object, having its own fid and
+ *    living in the lru and hash table.
+ *
+ *    This leads to the next important difference from the server side: on the
+ *    client, it is quite usual to have objects with different sequences of
+ *    layers. For example, a typical top-object is composed of the following
+ *    layers:
+ *
+ *     - vvp
+ *     - lov
+ *
+ *    whereas its sub-objects are composed of
+ *
+ *     - lovsub
+ *     - osc
+ *
+ *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ *    track of the object-subobject relationship.
+ *
+ *    Sub-objects are not cached independently: when the top-object is about
+ *    to be discarded from memory, all its sub-objects are torn down and
+ *    destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+       /** super class */
+       struct lu_object                   co_lu;
+       /** per-object-layer operations */
+       const struct cl_object_operations *co_ops;
+       /** offset of page slice in cl_page buffer */
+       int                                co_slice_off;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than a
+ * fid alone.
+ */
+struct cl_object_conf {
+       /** Super-class. */
+       struct lu_object_conf     coc_lu;
+       union {
+               /**
+                * Object layout. This is consumed by lov.
+                */
+               struct lustre_md *coc_md;
+               /**
+                * Description of particular stripe location in the
+                * cluster. This is consumed by osc.
+                */
+               struct lov_oinfo *coc_oinfo;
+       } u;
+       /**
+        * VFS inode. This is consumed by vvp.
+        */
+       struct inode         *coc_inode;
+       /**
+        * Layout lock handle.
+        */
+       struct ldlm_lock         *coc_lock;
+       /**
+        * Operation to handle layout, OBJECT_CONF_XYZ.
+        */
+       int                       coc_opc;
+};
+
+enum {
+       /** configure layout, set up a new stripe, must be called while
+        * holding the layout lock. */
+       OBJECT_CONF_SET = 0,
+       /** invalidate the current stripe configuration due to losing
+        * the layout lock. */
+       OBJECT_CONF_INVALIDATE = 1,
+       /** wait for the old layout to go away so that a new layout can be
+        * set up. */
+       OBJECT_CONF_WAIT = 2
+};
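+
+/*
+ * Illustrative sketch (not part of the original header): dropping the
+ * current layout when the layout lock is lost, using OBJECT_CONF_INVALIDATE.
+ * The cl_conf_set() entry point is assumed, as declared later in the Lustre
+ * headers; "example_layout_invalidate" is a hypothetical caller.
+ */
+#if 0 /* example only */
+static int example_layout_invalidate(const struct lu_env *env,
+				     struct cl_object *obj)
+{
+	struct cl_object_conf conf = {
+		.coc_opc = OBJECT_CONF_INVALIDATE,
+	};
+
+	/* propagated top-to-bottom through coo_conf_set() on every layer */
+	return cl_conf_set(env, obj, &conf);
+}
+#endif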
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+       /**
+        * Initialize page slice for this layer. Called top-to-bottom through
+        * every object layer when a new cl_page is instantiated. A layer
+        * keeping private per-page data, or requiring its own page operations
+        * vector, should allocate this data here and attach it to the page
+        * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+        * sense). Optional.
+        *
+        * \retval 0 success.
+        *
+        * \retval -ve failure code.
+        */
+       int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, struct page *vmpage);
+       /**
+        * Initialize lock slice for this layer. Called top-to-bottom through
+        * every object layer when a new cl_lock is instantiated. A layer
+        * keeping private per-lock data, or requiring its own lock operations
+        * vector, should allocate this data here and attach it to the lock
+        * by calling cl_lock_slice_add(). Mandatory.
+        */
+       int  (*coo_lock_init)(const struct lu_env *env,
+                             struct cl_object *obj, struct cl_lock *lock,
+                             const struct cl_io *io);
+       /**
+        * Initialize io state for a given layer.
+        *
+        * Called top-to-bottom once per io instance to initialize io
+        * state. If a layer wants to keep some state for this type of io, it
+        * has to embed struct cl_io_slice in lu_env::le_ses, and register the
+        * slice with cl_io_slice_add(). It is guaranteed that all threads
+        * participating in this io share the same session.
+        */
+       int  (*coo_io_init)(const struct lu_env *env,
+                           struct cl_object *obj, struct cl_io *io);
+       /**
+        * Fill portion of \a attr that this layer controls. This method is
+        * called top-to-bottom through all object layers.
+        *
+        * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+        *
+        * \return   0: to continue
+        * \return +ve: to stop iterating through layers (but 0 is returned
+        * from enclosing cl_object_attr_get())
+        * \return -ve: to signal error
+        */
+       int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+                           struct cl_attr *attr);
+       /**
+        * Update attributes.
+        *
+        * \a valid is a bitmask composed of bits from enum #cl_attr_valid,
+        * indicating which attributes are to be set.
+        *
+        * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+        *
+        * \return the same convention as for
+        * cl_object_operations::coo_attr_get() is used.
+        */
+       int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_attr *attr, unsigned valid);
+       /**
+        * Update object configuration. Called top-to-bottom to modify object
+        * configuration.
+        *
+        * XXX error conditions and handling.
+        */
+       int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_object_conf *conf);
+       /**
+        * Glimpse AST. Executed when a glimpse AST arrives for a lock on this
+        * object. Layers are supposed to fill parts of \a lvb that will be
+        * shipped to the glimpse originator as a glimpse result.
+        *
+        * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+        * \see osc_object_glimpse()
+        */
+       int (*coo_glimpse)(const struct lu_env *env,
+                          const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+       /** Standard lu_object_header. cl_object::co_lu::lo_header points
+        * here. */
+       struct lu_object_header  coh_lu;
+       /** \name locks
+        * \todo XXX move locks below to the separate cache-lines, they are
+        * mostly useless otherwise.
+        */
+       /** @{ */
+       /** Lock protecting page tree. */
+       spinlock_t               coh_page_guard;
+       /** Lock protecting lock list. */
+       spinlock_t               coh_lock_guard;
+       /** @} locks */
+       /** Radix tree of cl_page's, cached for this object. */
+       struct radix_tree_root   coh_tree;
+       /** # of pages in radix tree. */
+       unsigned long       coh_pages;
+       /** List of cl_lock's granted for this object. */
+       struct list_head               coh_locks;
+
+       /**
+        * Parent object. It is assumed that an object has a well-defined
+        * parent, but not a well-defined child (there may be multiple
+        * sub-objects for the same top-object). cl_object_header::coh_parent
+        * field allows certain code to be written generically, without
+        * limiting possible cl_object layouts unduly.
+        */
+       struct cl_object_header *coh_parent;
+       /**
+        * Protects consistency between the cl_attr of the parent object and
+        * the attributes of the sub-objects from which the former is
+        * calculated ("merged").
+        *
+        * \todo XXX this can be read/write lock if needed.
+        */
+       spinlock_t               coh_attr_guard;
+       /**
+        * Size of cl_page + page slices
+        */
+       unsigned short           coh_page_bufsize;
+       /**
+        * Number of objects above this one: 0 for a top-object, 1 for its
+        * sub-object, etc.
+        */
+       unsigned char            coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj)                               \
+       list_for_each_entry((slice),                                \
+                               &(obj)->co_lu.lo_header->loh_layers,    \
+                               co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj)                        \
+       list_for_each_entry_reverse((slice),                         \
+                                       &(obj)->co_lu.lo_header->loh_layers, \
+                                       co_lu.lo_linkage)
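+
+/*
+ * Illustrative sketch (not part of the original header): how generic code
+ * walks the layers with cl_object_for_each(), following the coo_attr_get()
+ * convention (0 continues, +ve stops the iteration, -ve is an error). A
+ * simplified rendition of what cl_object_attr_get() does in cl_object.c.
+ */
+#if 0 /* example only */
+static int example_attr_get(const struct lu_env *env, struct cl_object *top,
+			    struct cl_attr *attr)
+{
+	struct cl_object *obj;
+	int result = 0;
+
+	cl_object_for_each(obj, top) {
+		if (obj->co_ops->coo_attr_get != NULL) {
+			result = obj->co_ops->coo_attr_get(env, obj, attr);
+			if (result != 0) {
+				if (result > 0)	/* a layer asked to stop */
+					result = 0;
+				break;
+			}
+		}
+	}
+	return result;
+}
+#endif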
+/** @} cl_object */
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in memory. All pages
+ *    of the given file are of the same size, and are kept in the radix tree
+ *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ *    of the top-level file object are first-class cl_objects, they have their
+ *    own radix trees of pages, and hence a page is implemented as a sequence
+ *    of struct cl_page's, linked into a doubly-linked list through
+ *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ *    corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ *    page in the Linux kernel, for example). It is assumed that this
+ *    association is implemented by one of the cl_page layers (the top layer
+ *    in the current design) that
+ *
+ *     - intercepts per-VM-page call-backs made by the environment (e.g.,
+ *       memory pressure),
+ *
+ *     - translates state (page flag bits) and locking between lustre and
+ *       environment.
+ *
+ *    The association between cl_page and struct page is immutable and
+ *    established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ *    this io an exclusive access to this page w.r.t. other io attempts and
+ *    various events changing page state (such as transfer completion, or
+ *    eviction of the page from the memory). Note, that in general cl_io
+ *    cannot be identified with a particular thread, and page ownership is not
+ *    exactly equal to the current thread holding a lock on the page. Layer
+ *    implementing association between cl_page and struct page has to implement
+ *    ownership on top of available synchronization mechanisms.
+ *
+ *    While the Lustre client maintains the notion of page ownership by io,
+ *    the hosting MM/VM usually has its own page concurrency control
+ *    mechanisms. For example, in Linux, page access is synchronized by the
+ *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ *    takes care to acquire and release such locks as necessary around the
+ *    calls to the file system methods (->readpage(), ->prepare_write(),
+ *    ->commit_write(), etc.). This leads to the situation when there are two
+ *    different ways to own a page in the client:
+ *
+ *     - client code explicitly and voluntarily owns the page (cl_page_own());
+ *
+ *     - VM locks a page and then calls the client, which then has to
+ *       "assume" ownership from the VM (cl_page_assume()).
+ *
+ *    Dual methods to release ownership are cl_page_disown() and
+ *    cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ *    drops to 0, the page is returned to the cache, unless it is in
+ *    cl_page_state::CPS_FREEING state, in which case it is immediately
+ *    destroyed.
+ *
+ *    The general logic guaranteeing the absence of "existential races" for
+ *    pages is the following:
+ *
+ *     - there are fixed known ways for a thread to obtain a new reference
+ *       to a page:
+ *
+ *         - by doing a lookup in the cl_object radix tree, protected by the
+ *           spin-lock;
+ *
+ *         - by starting from VM-locked struct page and following some
+ *           hosting environment method (e.g., following ->private pointer in
+ *           the case of Linux kernel), see cl_vmpage_page();
+ *
+ *     - when the page enters cl_page_state::CPS_FREEING state, all these
+ *       ways are severed with the proper synchronization
+ *       (cl_page_delete());
+ *
+ *     - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *       lock;
+ *
+ *     - no new references to the page in cl_page_state::CPS_FREEING state
+ *       are allowed (checked in cl_page_get()).
+ *
+ *    Together this guarantees that when the last reference to a
+ *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ *    page, as no new references to it can be acquired at that point and no
+ *    existing ones remain.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ *    cl_page_state. Possible state transitions are enumerated in
+ *    cl_page_state_set(). State transition process (i.e., actual changing of
+ *    cl_page::cp_state field) is protected by the lock on the underlying VM
+ *    page.
+ *
+ * Linux Kernel implementation.
+ *
+ *    Binding between cl_page and struct page is implemented in the vvp
+ *    layer. cl_page is attached to the
+ *    ->private pointer of the struct page, together with the setting of
+ *    PG_private bit in page->flags, and acquiring additional reference on the
+ *    struct page (much like struct buffer_head, or any similar file system
+ *    private data structures).
+ *
+ *    PG_locked lock is used to implement both ownership and transfer
+ *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ *    states. No additional references are acquired for the duration of the
+ *    transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *       write-out is "protected" by the special PG_writeback bit.
+ */
+
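+/*
+ * Illustrative sketch (not part of the original header): the voluntary
+ * ownership round-trip described above, using the cl_page_own() and
+ * cl_page_disown() calls referenced in the text. Error handling is trimmed;
+ * "example_touch_page" is a hypothetical caller.
+ */
+#if 0 /* example only */
+static void example_touch_page(const struct lu_env *env, struct cl_io *io,
+			       struct cl_page *pg)
+{
+	if (cl_page_own(env, io, pg) == 0) {
+		/* CPS_CACHED -> CPS_OWNED: io has exclusive access now */
+		/* ... operate on the page ... */
+		cl_page_disown(env, io, pg);
+		/* back to CPS_CACHED */
+	}
+}
+#endif
+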
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+       /**
+        * Page is in the cache, un-owned. Page leaves cached state in the
+        * following cases:
+        *
+        *     - [cl_page_state::CPS_OWNED] io comes across the page and
+        *     owns it;
+        *
+        *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+        *     req-formation engine decides that it wants to include this page
+        *     in a cl_req being constructed, and yanks it from the cache;
+        *
+        *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+        *     evict the page from memory;
+        *
+        * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+        */
+       CPS_CACHED,
+       /**
+        * Page is exclusively owned by some cl_io. Page may end up in this
+        * state as a result of
+        *
+        *     - io creating new page and immediately owning it;
+        *
+        *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+        *     and owning it;
+        *
+        *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+        *     and waiting for owner to release the page;
+        *
+        * Page leaves owned state in the following cases:
+        *
+        *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+        *     the cache, doing nothing;
+        *
+        *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+        *     this page;
+        *
+        *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+        *     transfer for this page;
+        *
+        *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+        *     page (e.g., as part of truncate or extent lock cancellation).
+        *
+        * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+        */
+       CPS_OWNED,
+       /**
+        * Page is being written out, as a part of a transfer. This state is
+        * entered when req-formation logic decided that it wants this page to
+        * be sent through the wire _now_. Specifically, it means that once
+        * this state is achieved, transfer completion handler (with either
+        * success or failure indication) is guaranteed to be executed against
+        * this page independently of any locks and any scheduling decisions
+        * made by the hosting environment (that effectively means that the
+        * page is never put into cl_page_state::CPS_PAGEOUT state "in
+        * advance". This property is mentioned, because it is important when
+        * reasoning about possible dead-locks in the system). The page can
+        * enter this state as a result of
+        *
+        *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+        *     write-out of this page, or
+        *
+        *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+        *     that it has enough dirty pages cached to issue a "good"
+        *     transfer.
+        *
+        * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+        * is completed: it is moved into cl_page_state::CPS_CACHED state.
+        *
+        * Underlying VM page is locked for the duration of transfer.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+        */
+       CPS_PAGEOUT,
+       /**
+        * Page is being read in, as a part of a transfer. This is quite
+        * similar to the cl_page_state::CPS_PAGEOUT state, except that
+        * read-in is always "immediate": there is no such thing as a sudden
+        * construction of a read cl_req from cached, presumably not
+        * up-to-date, pages.
+        *
+        * Underlying VM page is locked for the duration of transfer.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+        */
+       CPS_PAGEIN,
+       /**
+        * Page is being destroyed. This state is entered when client decides
+        * that page has to be deleted from its host object, as, e.g., a part
+        * of truncate.
+        *
+        * Once this state is reached, there is no way to escape it.
+        *
+        * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+        */
+       CPS_FREEING,
+       CPS_NR
+};
+
+enum cl_page_type {
+       /** Host page, the page is from the host inode to which the cl_page
+        * belongs. */
+       CPT_CACHEABLE = 1,
+
+       /** Transient page, a transient cl_page is used to bind a cl_page
+        *  to a vmpage that does not belong to the same object as the
+        *  cl_page. It is used in DirectIO, lockless IO and liblustre. */
+       CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+       /**
+        * Set when pagein completes. Used for debugging (read completes at
+        * most once for a page).
+        */
+       CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on struct page, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+       /** Reference counter. */
+       atomic_t             cp_ref;
+       /** An object this page is a part of. Immutable after creation. */
+       struct cl_object        *cp_obj;
+       /** Logical page index within the object. Immutable after creation. */
+       pgoff_t           cp_index;
+       /** List of slices. Immutable after creation. */
+       struct list_head               cp_layers;
+       /** Parent page, NULL for top-level page. Immutable after creation. */
+       struct cl_page    *cp_parent;
+       /** Lower-layer page. NULL for bottommost page. Immutable after
+        * creation. */
+       struct cl_page    *cp_child;
+       /**
+        * Page state. This field is const to avoid accidental update, it is
+        * modified only internally within cl_page.c. Protected by a VM lock.
+        */
+       const enum cl_page_state cp_state;
+       /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
+       struct list_head                cp_batch;
+       /** Mutex serializing membership of a page in a batch. */
+       struct mutex            cp_mutex;
+       /** Linkage of pages within cl_req. */
+       struct list_head               cp_flight;
+       /** Transfer error. */
+       int                   cp_error;
+
+       /**
+        * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+        * creation.
+        */
+       enum cl_page_type       cp_type;
+
+       /**
+        * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+        * by sub-io. Protected by a VM lock.
+        */
+       struct cl_io        *cp_owner;
+       /**
+        * Debug information: the task owning the page.
+        */
+       task_t        *cp_task;
+       /**
+        * Owning IO request in cl_page_state::CPS_PAGEOUT and
+        * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+        * the top-level pages. Protected by a VM lock.
+        */
+       struct cl_req      *cp_req;
+       /** List of references to this page, for debugging. */
+       struct lu_ref       cp_reference;
+       /** Link to an object, for debugging. */
+       struct lu_ref_link      *cp_obj_ref;
+       /** Link to a queue, for debugging. */
+       struct lu_ref_link      *cp_queue_ref;
+       /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+       unsigned                 cp_flags;
+       /** Assigned if doing a sync_io */
+       struct cl_sync_io       *cp_sync_io;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+       struct cl_page            *cpl_page;
+       /**
+        * Object slice corresponding to this page slice. Immutable after
+        * creation.
+        */
+       struct cl_object                *cpl_obj;
+       const struct cl_page_operations *cpl_ops;
+       /** Linkage into cl_page::cp_layers. Immutable after creation. */
+       struct list_head                       cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+       /**
+        * Mode of a lock that protects no data, and exists only as a
+        * placeholder. This is used for `glimpse' requests. A phantom lock
+        * might get promoted to real lock at some point.
+        */
+       CLM_PHANTOM,
+       CLM_READ,
+       CLM_WRITE,
+       CLM_GROUP
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+       CRT_READ,
+       CRT_WRITE,
+       CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+       /**
+        * cl_page<->struct page methods. Only one layer in the stack has to
+        * implement these. Current code assumes that this functionality is
+        * provided by the topmost layer, see cl_page_disown0() as an example.
+        */
+
+       /**
+        * \return the underlying VM page. Optional.
+        */
+       struct page *(*cpo_vmpage)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice);
+       /**
+        * Called when \a io acquires this page into the exclusive
+        * ownership. When this method returns, it is guaranteed that the page
+        * is not owned by another io, and no transfer is going on against
+        * it. Optional.
+        *
+        * \see cl_page_own()
+        * \see vvp_page_own(), lov_page_own()
+        */
+       int  (*cpo_own)(const struct lu_env *env,
+                       const struct cl_page_slice *slice,
+                       struct cl_io *io, int nonblock);
+       /** Called when ownership is yielded. Optional.
+        *
+        * \see cl_page_disown()
+        * \see vvp_page_disown()
+        */
+       void (*cpo_disown)(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+       /**
+        * Called for a page that is already "owned" by \a io from VM point of
+        * view. Optional.
+        *
+        * \see cl_page_assume()
+        * \see vvp_page_assume(), lov_page_assume()
+        */
+       void (*cpo_assume)(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+       /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+        * bottom-to-top when IO releases a page without actually unlocking
+        * it.
+        *
+        * \see cl_page_unassume()
+        * \see vvp_page_unassume()
+        */
+       void (*cpo_unassume)(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+       /**
+        * Announces whether the page contains valid data or not, as
+        * indicated by \a uptodate.
+        *
+        * \see cl_page_export()
+        * \see vvp_page_export()
+        */
+       void  (*cpo_export)(const struct lu_env *env,
+                           const struct cl_page_slice *slice, int uptodate);
+       /**
+        * Unmaps page from the user space (if it is mapped).
+        *
+        * \see cl_page_unmap()
+        * \see vvp_page_unmap()
+        */
+       int (*cpo_unmap)(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io);
+       /**
+        * Checks whether underlying VM page is locked (in the suitable
+        * sense). Used for assertions.
+        *
+        * \retval    -EBUSY: page is protected by a lock of a given mode;
+        * \retval  -ENODATA: page is not protected by a lock;
+        * \retval       0: this layer cannot decide. (Should never happen.)
+        */
+       int (*cpo_is_vmlocked)(const struct lu_env *env,
+                              const struct cl_page_slice *slice);
+       /**
+        * Page destruction.
+        */
+
+       /**
+        * Called when page is truncated from the object. Optional.
+        *
+        * \see cl_page_discard()
+        * \see vvp_page_discard(), osc_page_discard()
+        */
+       void (*cpo_discard)(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+       /**
+        * Called when page is removed from the cache, and is about to be
+        * destroyed. Optional.
+        *
+        * \see cl_page_delete()
+        * \see vvp_page_delete(), osc_page_delete()
+        */
+       void (*cpo_delete)(const struct lu_env *env,
+                          const struct cl_page_slice *slice);
+       /** Destructor. Frees resources and slice itself. */
+       void (*cpo_fini)(const struct lu_env *env,
+                        struct cl_page_slice *slice);
+
+       /**
+        * Checks whether the page is protected by a cl_lock. This is a
+        * per-layer method, because certain layers have ways to check for the
+        * lock much more efficiently than through the generic locks scan, or
+        * implement locking mechanisms separate from cl_lock, e.g.,
+        * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, also check for
+        * locks being canceled, or scheduled for cancellation as soon as the
+        * last user goes away.
+        *
+        * \retval    -EBUSY: page is protected by a lock of a given mode;
+        * \retval  -ENODATA: page is not protected by a lock;
+        * \retval       0: this layer cannot decide.
+        *
+        * \see cl_page_is_under_lock()
+        */
+       int (*cpo_is_under_lock)(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+
+       /**
+        * Optional debugging helper. Prints given page slice.
+        *
+        * \see cl_page_print()
+        */
+       int (*cpo_print)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        void *cookie, lu_printer_t p);
+       /**
+        * \name transfer
+        *
+        * Transfer methods. See comment on cl_req for a description of
+        * transfer formation and life-cycle.
+        *
+        * @{
+        */
+       /**
+        * Request type dependent vector of operations.
+        *
+        * Transfer operations depend on transfer mode (cl_req_type). To avoid
+        * passing transfer mode to each and every of these methods, and to
+        * avoid branching on request type inside of the methods, separate
+        * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
+        * provided. That is, method invocation usually looks like
+        *
+        *       slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
+        */
+       struct {
+               /**
+                * Called when a page is submitted for a transfer as a part of
+                * cl_page_list.
+                *
+                * \return    0  : page is eligible for submission;
+                * \return    -EALREADY : skip this page;
+                * \return    -ve       : error.
+                *
+                * \see cl_page_prep()
+                */
+               int  (*cpo_prep)(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+               /**
+                * Completion handler. This is guaranteed to be eventually
+                * fired after cl_page_operations::cpo_prep() or
+                * cl_page_operations::cpo_make_ready() call.
+                *
+                * This method can be called in a non-blocking context. It is
+                * guaranteed however, that the page involved and its object
+                * are pinned in memory (and, hence, calling cl_page_put() is
+                * safe).
+                *
+                * \see cl_page_completion()
+                */
+               void (*cpo_completion)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      int ioret);
+               /**
+                * Called when cached page is about to be added to the
+                * cl_req as a part of req formation.
+                *
+                * \return    0       : proceed with this page;
+                * \return    -EAGAIN : skip this page;
+                * \return    -ve     : error.
+                *
+                * \see cl_page_make_ready()
+                */
+               int  (*cpo_make_ready)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice);
+               /**
+                * Announce that this page is to be written out
+                * opportunistically, that is, page is dirty, it is not
+                * necessary to start write-out transfer right now, but
+                * eventually page has to be written out.
+                *
+                * Main caller of this is the write path (see
+                * vvp_io_commit_write()), using this method to build a
+                * "transfer cache" from which large transfers are then
+                * constructed by the req-formation engine.
+                *
+                * \todo XXX it would make sense to add page-age tracking
+                * semantics here, and to oblige the req-formation engine to
+                * send the page out before it grows too old.
+                *
+                * \see cl_page_cache_add()
+                */
+               int  (*cpo_cache_add)(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *io);
+       } io[CRT_NR];
+       /**
+        * Tell the transfer engine that only the [from, to] part of a page
+        * should be transmitted.
+        *
+        * This is used for immediate transfers.
+        *
+        * \todo XXX this is not a very good interface. It would be much better
+        * if all transfer parameters were supplied as arguments to
+        * cl_io_operations::cio_submit() call, but it is not clear how to do
+        * this for page queues.
+        *
+        * \see cl_page_clip()
+        */
+       void (*cpo_clip)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        int from, int to);
+       /**
+        * \pre  the page was queued for transferring.
+        * \post page is removed from the client's pending list, or -EBUSY
+        *       is returned if it is already being transferred.
+        *
+        * This is one of the few page operations that:
+        * 0. is called from the top level;
+        * 1. does not have the vmpage locked;
+        * 2. requires every layer to synchronize execution of its
+        *    ->cpo_cancel() with completion handlers. Osc uses the client
+        *    obd lock for this purpose. Since there is no vvp_page_cancel()
+        *    and no lov_page_cancel(), cpo_cancel() is de facto protected by
+        *    the client lock.
+        *
+        * \see osc_page_cancel().
+        */
+       int (*cpo_cancel)(const struct lu_env *env,
+                         const struct cl_page_slice *slice);
+       /**
+        * Write out a page on behalf of the kernel. This is currently only
+        * called by ll_writepage().
+        *
+        * \see cl_page_flush()
+        */
+       int (*cpo_flush)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io);
+       /** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)                 \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
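+
+/*
+ * Illustrative usage (not part of the original header); D_ERROR is the
+ * usual libcfs debug mask for messages of this kind:
+ *
+ *	CL_PAGE_DEBUG(D_ERROR, env, page, "unexpected state: %d\n",
+ *		      page->cp_state);
+ */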
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)                     \
+do {                                                                     \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                      \
+                                                                             \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                    \
+               cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
+               CDEBUG(mask, format , ## __VA_ARGS__);                  \
+       }                                                                    \
+} while (0)
+
+static inline int __page_in_use(const struct cl_page *page, int refc)
+{
+       if (page->cp_type == CPT_CACHEABLE)
+               ++refc;
+       LASSERT(atomic_read(&page->cp_ref) > 0);
+       return (atomic_read(&page->cp_ref) > refc);
+}
+#define cl_page_in_use(pg)       __page_in_use(pg, 1)
+#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *     struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We
+ * could sort it by starting lock offset, or use an altogether different data
+ * structure, such as a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing a stripe
+ * sub-object or the file to which the top-level cl_lock is associated), and
+ * is linked into that object's cl_object_header::coh_locks. In this respect
+ * cl_lock is similar to cl_object (which at the lov layer also fans out into
+ * multiple sub-objects), and is different from cl_page, which doesn't fan out
+ * (there is usually exactly one osc_page for every vvp_page). We shall call
+ * the vvp-lov portion of the lock a "top-lock" and its lovsub-osc portion a
+ * "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of the client IO re-write was to make the IO path non-blocking,
+ * or at least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for communication with the
+ * server, it should, instead of waiting, remember its current state and
+ * switch to some other work. E.g., instead of waiting for a lock enqueue,
+ * the client should proceed doing IO on the next stripe, etc. Obviously this
+ * is a rather radical redesign, and it is not planned to be fully implemented
+ * at this time; instead we are putting some infrastructure in place that
+ * would make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where the old locking code goes to sleep (waiting for
+ * enqueue, for example), the new code returns cl_lock_transition::CLO_WAIT.
+ * When the enqueue reply comes, its completion handler signals that the lock
+ * state-machine is ready to transit to the next state. There is some generic
+ * code in cl_lock.c that sleeps, waiting for these signals. As a result, for
+ * users of this cl_lock.c code, it looks like locking is done in the normal
+ * blocking fashion, and at the same time it is possible to switch to
+ * non-blocking locking (simply by returning cl_lock_transition::CLO_WAIT
+ * from cl_lock.c functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that lock will not be moved
+ *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ *       can be only acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. Hold protects lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when last hold on a lock is released;
+ *
+ *     - placing a "user" on a lock guarantees that lock will not leave
+ *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ *       states, once it enters this set. That is, if a user is added onto a
+ *       lock in a state not from this set, it doesn't immediately enforce
+ *       lock to move to this set, but once lock enters this set it will
+ *       remain there until all users are removed. Lock users are counted in
+ *       cl_lock::cll_users.
+ *
+ *       A user is used to assure that a lock is not canceled or destroyed
+ *       while it is being enqueued, or actively used by some IO.
+ *
+ *       Currently, a user always comes with a hold (cl_lock_invariant()
+ *       checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how the lock state-machine operates. struct cl_lock contains a
+ * mutex, cl_lock::cll_guard, that protects the structure's fields.
+ *
+ *     - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states into which the lock
+ *       can move. They are tried in order. Attempts to move into the next
+ *       state are done by the _try() functions in cl_lock.c:
+ *       cl_{enqueue,unlock,wait}_try().
+ *
+ *     - if the transition can be performed immediately, state is changed,
+ *       and mutex is released.
+ *
+ *     - if the transition requires blocking, _try() function returns
+ *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ *       sleep, waiting for possibility of lock state change. It is woken
+ *       up when some event occurs, that makes lock state change possible
+ *       (e.g., the reception of the reply from the server), and repeats
+ *       the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes, and the latter has to be taken
+ * first to avoid deadlock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provides a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference from the old client locking model is that the new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks were
+ * represented by "request sets" that were created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as following:
+ *
+ *     - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock()
+ *       is called on each layer. The responsibility of this method is to add
+ *       the locks needed by a given layer to cl_io.ci_lockset.
+ *
+ *     - once locks for all layers were collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided without
+ * sacrificing correctness. These include:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, the lov layer implements lov_lock_fits_into(), which allows
+ * multi-stripe locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
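+/*
+ * Illustrative sketch (not part of the original header): the generic
+ * mutex/examine/try/sleep loop described under CONCURRENCY, shown for the
+ * ENQUEUED -> HELD transition. cl_wait_try(), cl_lock_state_wait() and the
+ * cl_lock_mutex_get()/cl_lock_mutex_put() helpers live in cl_lock.c; the
+ * real loop there also handles CLO_REPEAT, so treat this as a simplified
+ * rendition.
+ */
+#if 0 /* example only */
+static int example_wait_for_grant(const struct lu_env *env,
+				  struct cl_lock *lock)
+{
+	int result;
+
+	cl_lock_mutex_get(env, lock);
+	while ((result = cl_wait_try(env, lock)) == CLO_WAIT) {
+		/* drops the mutex, sleeps until a state change is signalled
+		 * on cl_lock::cll_wq, then re-acquires the mutex */
+		result = cl_lock_state_wait(env, lock);
+		if (result != 0)
+			break;	/* e.g., interrupted by a signal */
+	}
+	cl_lock_mutex_put(env, lock);
+	return result;
+}
+#endif
+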
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+       /** Object this lock is granted for. */
+       struct cl_object *cld_obj;
+       /** Index of the first page protected by this lock. */
+       pgoff_t    cld_start;
+       /** Index of the last page (inclusive) protected by this lock. */
+       pgoff_t    cld_end;
+       /** Group ID, for group lock */
+       __u64        cld_gid;
+       /** Lock mode. */
+       enum cl_lock_mode cld_mode;
+       /**
+        * flags to enqueue lock. A combination of bit-flags from
+        * enum cl_enq_flags.
+        */
+       __u32        cld_enq_flags;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr)                                             \
+       cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,        \
+       (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
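+
+/*
+ * Illustrative usage (not part of the original header); D_DLMTRACE is the
+ * conventional debug mask for lock traces:
+ *
+ *	CDEBUG(D_DLMTRACE, "matched lock: " DDESCR "\n",
+ *	       PDESCR(&lock->cll_descr));
+ */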
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *           +------------------>NEW
+ *           |                    |
+ *           |                    | cl_enqueue_try()
+ *           |                    |
+ *           |    cl_unuse_try()  V
+ *           |  +--------------QUEUING (*)
+ *           |  |                 |
+ *           |  |                 | cl_enqueue_try()
+ *           |  |                 |
+ *           |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |              |
+ *           |  |  |              | cl_wait_try()
+ *           |  |  |              |
+ *           |  |  |             (R)
+ *           |  |  |              |
+ *           |  |  |              V
+ *           |  |  |             HELD<---------+
+ *           |  |  |              |            |
+ *           |  |  |              |            | cl_use_try()
+ *           |  |  | cl_unuse_try() |          |
+ *           |  |  |              |            |
+ *           |  |  |              V         ---+
+ *           |  |  +------------>INTRANSIT (D) <--+
+ *           |  |                 |               |
+ *           |  |    cl_unuse_try() |             | cached lock found
+ *           |  |                 |               | cl_use_try()
+ *           |  |                 |               |
+ *           |  |                 V               |
+ *           |  +---------------CACHED------------+
+ *           |                    |
+ *           |                   (C)
+ *           |                    |
+ *           |                    V
+ *           +----------------FREEING
+ *
+ * Legend:
+ *
+ *      In states marked with (*) transition to the same state (i.e., a loop
+ *      in the diagram) is possible.
+ *
+ *      (R) is the point where Receive call-back is invoked: it allows layers
+ *      to handle arrival of lock reply.
+ *
+ *      (C) is the point where Cancellation call-back is invoked.
+ *
+ *      (D) is the transit state which means the lock is changing.
+ *
+ *      Transition to FREEING state is possible from any other state in the
+ *      diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in the different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine a lock on a file F
+ * that intersects 3 stripes S0, S1, and S2. To enqueue F, the client has to
+ * send an enqueue to S0, wait for its completion, then send an enqueue for
+ * S1, wait for its completion, and at last enqueue a lock for S2 and wait for
+ * its completion. In that case, the top-lock is in the QUEUING state while
+ * S0 and S1 are handled, and is in the ENQUEUED state after the enqueue to S2
+ * has been sent (note that in this case, sub-locks move from state to state,
+ * while the top-lock remains in the same state).
+ */
+enum cl_lock_state {
+       /**
+        * Lock that wasn't yet enqueued
+        */
+       CLS_NEW,
+       /**
+        * Enqueue is in progress, blocking for some intermediate interaction
+        * with the other side.
+        */
+       CLS_QUEUING,
+       /**
+        * Lock is fully enqueued, waiting for server to reply when it is
+        * granted.
+        */
+       CLS_ENQUEUED,
+       /**
+        * Lock granted, actively used by some IO.
+        */
+       CLS_HELD,
+       /**
+        * This state is used to mark that the lock is in transit between
+        * being used and unused. We need this state because the lock may
+        * have several sublocks, so there is no atomic way to bring all
+        * sublocks into the CLS_HELD state on use, or all sublocks into
+        * CLS_CACHED on unuse.
+        * If a thread referring to a lock sees the lock in this state, it
+        * must wait for the lock.
+        * See the state diagram for details.
+        */
+       CLS_INTRANSIT,
+       /**
+        * Lock granted, not used.
+        */
+       CLS_CACHED,
+       /**
+        * Lock is being destroyed.
+        */
+       CLS_FREEING,
+       CLS_NR
+};
+
+enum cl_lock_flags {
+       /**
+        * lock has been cancelled. This flag is never cleared once set (by
+        * cl_lock_cancel0()).
+        */
+       CLF_CANCELLED  = 1 << 0,
+       /** cancellation is pending for this lock. */
+       CLF_CANCELPEND = 1 << 1,
+       /** destruction is pending for this lock. */
+       CLF_DOOMED     = 1 << 2,
+       /** from enqueue RPC reply upcall. */
+       CLF_FROM_UPCALL = 1 << 3,
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (the lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on a sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+       /**
+        * Lock that is mutexed when closure construction is started. When
+        * the closure is in `wait' mode (cl_lock_closure::clc_wait), the
+        * mutex on the origin is released before waiting.
+        */
+       struct cl_lock   *clc_origin;
+       /**
+        * List of enclosed locks, so far. Locks are linked here through
+        * cl_lock::cll_inclosure.
+        */
+       struct list_head        clc_list;
+       /**
+        * True iff closure is in a `wait' mode. This determines what
+        * cl_lock_enclosure() does when a lock L to be added to the closure
+        * is currently mutexed by some other thread.
+        *
+        * If cl_lock_closure::clc_wait is not set, then closure construction
+        * fails with CLO_REPEAT immediately.
+        *
+        * In wait mode, cl_lock_enclosure() waits until next attempt to build
+        * a closure might succeed. To this end it releases an origin mutex
+        * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+        * owned by the current thread, and then waits on L mutex (by grabbing
+        * it and immediately releasing), before returning CLO_REPEAT to the
+        * caller.
+        */
+       int            clc_wait;
+       /** Number of locks in the closure. */
+       int            clc_nr;
+};
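+
+/*
+ * Editor's usage sketch (illustrative only): building a closure around an
+ * origin lock in `wait' mode and repeating on CLO_REPEAT, as described
+ * above. The signature of cl_lock_enclosure() is an assumption inferred
+ * from the commentary on cl_lock_closure::clc_wait; error handling and the
+ * matching closure fini step are omitted.
+ *
+ *	struct cl_lock_closure closure;
+ *	int rc;
+ *
+ *	cl_lock_closure_init(env, &closure, origin, 1);
+ *	do {
+ *		rc = cl_lock_enclosure(env, some_lock, &closure);
+ *	} while (rc == CLO_REPEAT);
+ */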
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+       /** Reference counter. */
+       atomic_t          cll_ref;
+       /** List of slices. Immutable after creation. */
+       struct list_head            cll_layers;
+       /**
+        * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+        * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+        */
+       struct list_head            cll_linkage;
+       /**
+        * Parameters of this lock. Protected by
+        * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+        * cl_lock::cll_guard. Modified only on lock creation and in
+        * cl_lock_modify().
+        */
+       struct cl_lock_descr  cll_descr;
+       /** Protected by cl_lock::cll_guard. */
+       enum cl_lock_state    cll_state;
+       /** signals state changes. */
+       wait_queue_head_t          cll_wq;
+       /**
+        * Recursive lock, most fields in cl_lock{} are protected by this.
+        *
+        * Locking rules: this mutex is never held across network
+        * communication, except when lock is being canceled.
+        *
+        * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+        * of a top-lock. The other direction is implemented through a
+        * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+        * by try-locking.
+        *
+        * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+        */
+       struct mutex            cll_guard;
+       task_t     *cll_guarder;
+       int                cll_depth;
+
+       /**
+        * the owner for INTRANSIT state
+        */
+       task_t     *cll_intransit_owner;
+       int                cll_error;
+       /**
+        * Number of holds on a lock. A hold prevents a lock from being
+        * canceled and destroyed. Protected by cl_lock::cll_guard.
+        *
+        * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+        */
+       int                cll_holds;
+        /**
+         * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+         * only. Lock user pins lock in CLS_HELD state. Protected by
+         * cl_lock::cll_guard.
+         *
+         * \see cl_wait(), cl_unuse().
+         */
+       int                cll_users;
+       /**
+        * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+        * protected by cl_lock::cll_guard.
+        */
+       unsigned long    cll_flags;
+       /**
+        * A linkage into a list of locks in a closure.
+        *
+        * \see cl_lock_closure
+        */
+       struct list_head            cll_inclosure;
+       /**
+        * Conflict lock at queuing time.
+        */
+       struct cl_lock       *cll_conflict;
+       /**
+        * A list of references to this lock, for debugging.
+        */
+       struct lu_ref    cll_reference;
+       /**
+        * A list of holds on this lock, for debugging.
+        */
+       struct lu_ref    cll_holders;
+       /**
+        * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+        */
+       struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+       /* "dep_map" name is assumed by lockdep.h macros. */
+       struct lockdep_map    dep_map;
+#endif
+};
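+
+/*
+ * Editor's sketch of the cll_guard ordering rule above (illustrative only;
+ * it assumes cl_lock_mutex_try() returns 0 on success): a thread already
+ * holding a top-lock mutex may only try-lock a sub-lock mutex, dropping
+ * everything and repeating on contention.
+ *
+ *	cl_lock_mutex_get(env, top);
+ *	if (cl_lock_mutex_try(env, sub) != 0) {
+ *		cl_lock_mutex_put(env, top);
+ *		goto repeat;
+ *	}
+ */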
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+       struct cl_lock            *cls_lock;
+       /** Object slice corresponding to this lock slice. Immutable after
+        * creation. */
+       struct cl_object                *cls_obj;
+       const struct cl_lock_operations *cls_ops;
+       /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+       struct list_head                       cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+       /** operation cannot be completed immediately. Wait for state change. */
+       CLO_WAIT        = 1,
+       /** operation had to release lock mutex, restart. */
+       CLO_REPEAT      = 2,
+       /** lower layer re-enqueued. */
+       CLO_REENQUEUED  = 3,
+};
+
+/**
+ * Per-layer lock operations.
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+       /**
+        * \name statemachine
+        *
+        * State machine transitions. These 3 methods are called to transfer
+        * lock from one state to another, as described in the commentary
+        * above enum #cl_lock_state.
+        *
+        * \retval 0      this layer has nothing more to do before the
+        *                     transition to the target state happens;
+        *
+        * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+        *                  mutex, repeat invocation of transition method
+        *                  across all layers;
+        *
+        * \retval CLO_WAIT   this layer cannot move to the target state
+        *                  immediately, as it has to wait for a certain
+        *                  event (e.g., communication with the server). It
+        *                  is guaranteed that when the state transfer
+        *                  becomes possible, the cl_lock::cll_wq wait-queue
+        *                  is signaled. The caller can wait for this event
+        *                  by calling cl_lock_state_wait();
+        *
+        * \retval -ve  failure, abort state transition, move the lock
+        *                  into cl_lock_state::CLS_FREEING state, and set
+        *                  cl_lock::cll_error.
+        *
+        * Once all layers have voted to agree to the transition (by
+        * returning 0), the lock is moved into the corresponding target
+        * state. All state transition methods are optional.
+        */
+       /** @{ */
+       /**
+        * Attempts to enqueue the lock. Called top-to-bottom.
+        *
+        * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+        * \see osc_lock_enqueue()
+        */
+       int  (*clo_enqueue)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *io, __u32 enqflags);
+       /**
+        * Attempts to wait for enqueue result. Called top-to-bottom.
+        *
+        * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+        */
+       int  (*clo_wait)(const struct lu_env *env,
+                        const struct cl_lock_slice *slice);
+       /**
+        * Attempts to unlock the lock. Called bottom-to-top. In addition to
+        * usual return values of lock state-machine methods, this can return
+        * -ESTALE to indicate that lock cannot be returned to the cache, and
+        * has to be re-initialized.
+        * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
+        *
+        * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
+        */
+       int  (*clo_unuse)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+       /**
+        * Notifies the layer that a cached lock is starting to be used.
+        *
+        * \pre lock->cll_state == CLS_CACHED
+        *
+        * \see lov_lock_use(), osc_lock_use()
+        */
+       int  (*clo_use)(const struct lu_env *env,
+                       const struct cl_lock_slice *slice);
+       /** @} statemachine */
+       /**
+        * A method invoked when the lock state is changed (as a result of a
+        * state transition). This is used, for example, to track when the
+        * state of a sub-lock changes, in order to propagate this change to
+        * the corresponding top-lock. Optional.
+        *
+        * \see lovsub_lock_state()
+        */
+       void (*clo_state)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice,
+                         enum cl_lock_state st);
+       /**
+        * Returns true iff the given lock is suitable for the given io. The
+        * idea is that there are certain "unsafe" locks, e.g., ones acquired
+        * for O_APPEND writes, that we don't want to re-use for a normal
+        * write, to avoid the danger of cascading evictions. Optional. Runs
+        * under cl_object_header::coh_lock_guard.
+        *
+        * XXX this should take more information about lock needed by
+        * io. Probably lock description or something similar.
+        *
+        * \see lov_fits_into()
+        */
+       int (*clo_fits_into)(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            const struct cl_lock_descr *need,
+                            const struct cl_io *io);
+       /**
+        * \name ast
+        * Asynchronous System Traps. All of them are optional; all are
+        * executed bottom-to-top.
+        */
+       /** @{ */
+
+       /**
+        * Cancellation callback. Cancels a lock voluntarily, or at the
+        * request of the server.
+        */
+       void (*clo_cancel)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+       /**
+        * Lock weighting ast. Executed to estimate how precious this lock
+        * is. The sum of results across all layers is used to determine
+        * whether the lock is worth keeping in cache given present memory
+        * usage.
+        *
+        * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+        */
+       unsigned long (*clo_weigh)(const struct lu_env *env,
+                                  const struct cl_lock_slice *slice);
+       /** @} ast */
+
+       /**
+        * \see lovsub_lock_closure()
+        */
+       int (*clo_closure)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          struct cl_lock_closure *closure);
+       /**
+        * Executed bottom-to-top when the lock description changes (e.g., as
+        * a result of the server granting a more generous lock than was
+        * requested).
+        *
+        * \see lovsub_lock_modify()
+        */
+       int (*clo_modify)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice,
+                         const struct cl_lock_descr *updated);
+       /**
+        * Notifies layers (bottom-to-top) that the lock is going to be
+        * destroyed. It is the responsibility of layers to prevent new
+        * references to this lock from being acquired once this method
+        * returns.
+        *
+        * This can be called multiple times due to races.
+        *
+        * \see cl_lock_delete()
+        * \see osc_lock_delete(), lovsub_lock_delete()
+        */
+       void (*clo_delete)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+       /**
+        * Destructor. Frees resources and the slice.
+        *
+        * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+        * \see osc_lock_fini()
+        */
+       void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+       /**
+        * Optional debugging helper. Prints given lock slice.
+        */
+       int (*clo_print)(const struct lu_env *env,
+                        void *cookie, lu_printer_t p,
+                        const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)                 \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
+
+#define CL_LOCK_ASSERT(expr, env, lock) do {                       \
+       if (likely(expr))                                              \
+               break;                                            \
+                                                                       \
+       CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
+       LBUG();                                                  \
+} while (0)
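+
+/*
+ * Editor's usage sketch for the two macros above (illustrative only;
+ * D_DLMTRACE is one of the libcfs debug masks):
+ *
+ *	CL_LOCK_ASSERT(lock->cll_holds > 0, env, lock);
+ *	CL_LOCK_DEBUG(D_DLMTRACE, env, lock, "state: %d\n", lock->cll_state);
+ */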
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of a certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When the list is finalized, it releases references on all pages it still
+ * has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+       unsigned             pl_nr;
+       struct list_head           pl_pages;
+       task_t    *pl_owner;
+};
+
+/**
+ * A 2-queue of pages: a convenience data-type for a common use case. A
+ * 2-queue contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+       struct cl_page_list c2_qin;
+       struct cl_page_list c2_qout;
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock}, but with one
+ * important distinction. We want to minimize the number of calls to the
+ * allocator in the fast path, e.g., in the case of read(2) when everything is
+ * cached: the client already owns the lock over the region being read, and
+ * the data are cached due to read-ahead. To avoid allocation of cl_io layers
+ * in such situations, per-layer io state is stored in the session associated
+ * with the io; see struct {vvp,lov,osc}_io for example. Session allocation is
+ * amortized by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when the io is done and its state can be safely
+ * released.
+ *
+ * For read/write io, the overall execution plan is as follows (sketched in
+ * code after this comment):
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *      (cl_io_operations::cio_read_page() for read,
+ *      cl_io_operations::cio_prepare_write(),
+ *      cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", the lov layer creates sub-io's
+ * (lazily, to address the allocation efficiency issues mentioned above), and
+ * returns a special error condition from the per-page method when the
+ * current sub-io has to block. This causes the io loop to be repeated, and
+ * lov switches to the next sub-io in its
+ * cl_io_operations::cio_iter_init() implementation.
+ */
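+
+/*
+ * Editor's sketch of the plan above (illustrative only): the generic
+ * helpers cl_io_iter_init(), cl_io_lock(), cl_io_start(), cl_io_end(),
+ * cl_io_unlock() and cl_io_iter_fini() are assumed to drive the per-layer
+ * methods of struct cl_io_operations, mirroring what cl_io_loop() does:
+ *
+ *	do {
+ *		rc = cl_io_iter_init(env, io);
+ *		if (rc == 0) {
+ *			rc = cl_io_lock(env, io);
+ *			if (rc == 0) {
+ *				rc = cl_io_start(env, io);
+ *				cl_io_end(env, io);
+ *				cl_io_unlock(env, io);
+ *			}
+ *		}
+ *		cl_io_iter_fini(env, io);
+ *	} while (rc == 0 && io->ci_continue);
+ */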
+
+/** IO types */
+enum cl_io_type {
+       /** read system call */
+       CIT_READ,
+       /** write system call */
+       CIT_WRITE,
+       /** truncate, utime system calls */
+       CIT_SETATTR,
+       /**
+        * page fault handling
+        */
+       CIT_FAULT,
+       /**
+        * fsync system call handling
+        * To write out a range of file
+        */
+       CIT_FSYNC,
+       /**
+        * Miscellaneous io. This is used for occasional io activity that
+        * doesn't fit into other types. Currently this is used for:
+        *
+        *     - cancellation of an extent lock. This io exists as a context
+        *     to write dirty pages from under the lock being canceled back
+        *     to the server;
+        *
+        *     - VM induced page write-out. An io context for writing page out
+        *     for memory cleansing;
+        *
+        *     - glimpse. An io context to acquire a glimpse lock;
+        *
+        *     - grouplock. An io context to acquire a group lock.
+        *
+        * CIT_MISC io is used simply as a context in which locks and pages
+        * are manipulated. Such io has no internal "process", that is,
+        * cl_io_loop() is never called for it.
+        */
+       CIT_MISC,
+       CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+       /** Not initialized. */
+       CIS_ZERO,
+       /** Initialized. */
+       CIS_INIT,
+       /** IO iteration started. */
+       CIS_IT_STARTED,
+       /** Locks taken. */
+       CIS_LOCKED,
+       /** Actual IO is in progress. */
+       CIS_IO_GOING,
+       /** IO for the current iteration finished. */
+       CIS_IO_FINISHED,
+       /** Locks released. */
+       CIS_UNLOCKED,
+       /** Iteration completed. */
+       CIS_IT_ENDED,
+       /** cl_io finalized. */
+       CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+       struct cl_io              *cis_io;
+       /** corresponding object slice. Immutable after creation. */
+       struct cl_object              *cis_obj;
+       /** io operations. Immutable after creation. */
+       const struct cl_io_operations *cis_iop;
+       /**
+        * linkage into a list of all slices for a given cl_io, hanging off
+        * cl_io::ci_layers. Immutable after creation.
+        */
+       struct list_head                     cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+       /**
+        * Vector of io state transition methods for every io type.
+        *
+        * \see cl_page_operations::io
+        */
+       struct {
+               /**
+                * Prepare io iteration at a given layer.
+                *
+                * Called top-to-bottom at the beginning of each iteration of
+                * the "io loop" (if it makes sense for this type of io). Here
+                * the layer selects what work it will do during this
+                * iteration.
+                *
+                * \see cl_io_operations::cio_iter_fini()
+                */
+               int (*cio_iter_init) (const struct lu_env *env,
+                                     const struct cl_io_slice *slice);
+               /**
+                * Finalize io iteration.
+                *
+                * Called bottom-to-top at the end of each iteration of "io
+                * loop". Here layers can decide whether IO has to be
+                * continued.
+                *
+                * \see cl_io_operations::cio_iter_init()
+                */
+               void (*cio_iter_fini) (const struct lu_env *env,
+                                      const struct cl_io_slice *slice);
+               /**
+                * Collect locks for the current iteration of io.
+                *
+                * Called top-to-bottom to collect all locks necessary for
+                * this iteration. This method shouldn't actually enqueue
+                * anything; instead it should post a lock through
+                * cl_io_lock_add(). Once all locks are collected, they are
+                * sorted and enqueued in the proper order.
+                */
+               int  (*cio_lock) (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Finalize unlocking.
+                *
+                * Called bottom-to-top to finish layer-specific unlocking
+                * functionality, after the generic code has released all
+                * locks acquired by cl_io_operations::cio_lock().
+                */
+               void  (*cio_unlock)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice);
+               /**
+                * Start io iteration.
+                *
+                * Once all locks are acquired, called top-to-bottom to
+                * commence actual IO. In the current implementation,
+                * top-level vvp_io_{read,write}_start() does all the work
+                * synchronously by calling generic_file_*(), so other layers
+                * are called when everything is done.
+                */
+               int  (*cio_start)(const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Called top-to-bottom at the end of the io loop. Here a
+                * layer might wait for an unfinished asynchronous io.
+                */
+               void (*cio_end)  (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+               /**
+                * Called bottom-to-top to notify layers that read/write IO
+                * iteration finished, with \a nob bytes transferred.
+                */
+               void (*cio_advance)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice,
+                                   size_t nob);
+               /**
+                * Called once per io, bottom-to-top to release io resources.
+                */
+               void (*cio_fini) (const struct lu_env *env,
+                                 const struct cl_io_slice *slice);
+       } op[CIT_OP_NR];
+       struct {
+               /**
+                * Submit pages from \a queue->c2_qin for IO, and move
+                * successfully submitted pages into \a queue->c2_qout. Return
+                * non-zero if it failed to submit even a single page. If
+                * submission failed after some pages were moved into \a
+                * queue->c2_qout, the completion callback with a non-zero
+                * ioret is executed on them.
+                */
+               int  (*cio_submit)(const struct lu_env *env,
+                                  const struct cl_io_slice *slice,
+                                  enum cl_req_type crt,
+                                  struct cl_2queue *queue);
+       } req_op[CRT_NR];
+       /**
+        * Read missing page.
+        *
+        * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+        * method, when it hits a not-up-to-date page in the range. Optional.
+        *
+        * \pre io->ci_type == CIT_READ
+        */
+       int (*cio_read_page)(const struct lu_env *env,
+                            const struct cl_io_slice *slice,
+                            const struct cl_page_slice *page);
+       /**
+        * Prepare write of a \a page. Called bottom-to-top by a top-level
+        * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page
+        * for receiving data from the user-level buffer.
+        *
+        * \pre io->ci_type == CIT_WRITE
+        *
+        * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+        * osc_io_prepare_write().
+        */
+       int (*cio_prepare_write)(const struct lu_env *env,
+                                const struct cl_io_slice *slice,
+                                const struct cl_page_slice *page,
+                                unsigned from, unsigned to);
+       /**
+        * Commit the write of a \a page.
+        *
+        * \pre io->ci_type == CIT_WRITE
+        *
+        * \see vvp_io_commit_write(), lov_io_commit_write(),
+        * osc_io_commit_write().
+        */
+       int (*cio_commit_write)(const struct lu_env *env,
+                               const struct cl_io_slice *slice,
+                               const struct cl_page_slice *page,
+                               unsigned from, unsigned to);
+       /**
+        * Optional debugging helper. Print given io slice.
+        */
+       int (*cio_print)(const struct lu_env *env, void *cookie,
+                        lu_printer_t p, const struct cl_io_slice *slice);
+};
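+
+/*
+ * Editor's sketch (illustrative only): a minimal per-layer operation
+ * vector. The names my_io_lock and my_io_ops are hypothetical; a real layer
+ * fills in only the slots it needs for each io type and leaves the rest
+ * NULL, since all methods are optional.
+ *
+ *	static int my_io_lock(const struct lu_env *env,
+ *			      const struct cl_io_slice *slice)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	static const struct cl_io_operations my_io_ops = {
+ *		.op = {
+ *			[CIT_READ] = {
+ *				.cio_lock = my_io_lock,
+ *			},
+ *		},
+ *	};
+ */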
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+       /**
+        * instruct the server not to block if a conflicting lock is found.
+        * Instead, -EWOULDBLOCK is returned immediately.
+        */
+       CEF_NONBLOCK     = 0x00000001,
+       /**
+        * take lock asynchronously (out of order), as it cannot
+        * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+        */
+       CEF_ASYNC       = 0x00000002,
+       /**
+        * tell the server to instruct (through a flag in the blocking ast)
+        * the owner of the conflicting lock that it can drop dirty pages
+        * protected by this lock, without sending them to the server.
+        */
+       CEF_DISCARD_DATA = 0x00000004,
+       /**
+        * tell the sub layers that it must be a `real' lock. This is used
+        * for mmapped-buffer locks and glimpse locks that must never be
+        * converted into lockless mode.
+        *
+        * \see vvp_mmap_locks(), cl_glimpse_lock().
+        */
+       CEF_MUST         = 0x00000008,
+       /**
+        * tell the sub layers to never request a `real' lock. This flag is
+        * not used currently.
+        *
+        * The cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify the
+        * lockless conversion policy: ci_lockreq describes the generic lock
+        * requirement for this IO, especially for locks which belong to the
+        * object doing IO; however, the lock itself may have precise
+        * requirements that are described by the enqueue flags.
+        */
+       CEF_NEVER       = 0x00000010,
+       /**
+        * for async glimpse lock.
+        */
+       CEF_AGL   = 0x00000020,
+       /**
+        * mask of enq_flags.
+        */
+       CEF_MASK         = 0x0000003f,
+};
+
+/**
+ * Link between a lock and an io. An intermediate structure is needed because
+ * the same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+       /** linkage into one of cl_lockset lists. */
+       struct list_head           cill_linkage;
+       struct cl_lock_descr cill_descr;
+       struct cl_lock      *cill_lock;
+       /** optional destructor */
+       void           (*cill_fini)(const struct lu_env *env,
+                                       struct cl_io_lock_link *link);
+};
+
+/**
+ * A lock-set represents a collection of locks that an io needs at a given
+ * time. Generally speaking, the client tries to avoid holding multiple locks
+ * when possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *     "cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *     see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when a user level buffer, supplied to {read,write}(file0),
+ *     is a part of a memory mapped lustre file, the client has to take dlm
+ *     locks on file0 and on all files that back the buffer (or the part of
+ *     the buffer that is being processed in the current chunk); in any
+ *     case, there are situations where at least 2 locks are necessary.
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ */
+struct cl_lockset {
+       /** locks to be acquired. */
+       struct list_head  cls_todo;
+       /** locks currently being processed. */
+       struct list_head  cls_curr;
+       /** locks acquired. */
+       struct list_head  cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO. It should be cl_io_lock_req,
+ * but 'req' is always to be thought of as 'request' :-)
+ */
+enum cl_io_lock_dmd {
+       /** Always lock data (e.g., O_APPEND). */
+       CILR_MANDATORY = 0,
+       /** Layers are free to decide between local and global locking. */
+       CILR_MAYBE,
+       /** Never lock: there is no cache (e.g., liblustre). */
+       CILR_NEVER
+};
+
+enum cl_fsync_mode {
+       /** start writeback, do not wait for the pages to finish */
+       CL_FSYNC_NONE  = 0,
+       /** start writeback and wait for the pages to finish */
+       CL_FSYNC_LOCAL = 1,
+       /** discard all dirty pages in a specific file range */
+       CL_FSYNC_DISCARD = 2,
+       /** start writeback and make sure the pages have reached storage
+        * before returning. An OST_SYNC RPC must be issued and finished */
+       CL_FSYNC_ALL   = 3
+};
+
+struct cl_io_rw_common {
+       loff_t      crw_pos;
+       size_t      crw_count;
+       int      crw_nonblock;
+};
+
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in the current
+ * implementation only one thread advances the IO, but the parallel IO design
+ * and concurrent copy_*_user() require multiple threads acting on the same
+ * IO). It is up to these threads to serialize their activities, including
+ * updates to mutable cl_io fields.
+ */
+struct cl_io {
+       /** type of this IO. Immutable after creation. */
+       enum cl_io_type         ci_type;
+       /** current state of cl_io state machine. */
+       enum cl_io_state               ci_state;
+       /** main object this io is against. Immutable after creation. */
+       struct cl_object              *ci_obj;
+       /**
+        * Upper layer io of which this io is a part. Immutable after
+        * creation.
+        */
+       struct cl_io              *ci_parent;
+       /** List of slices. Immutable after creation. */
+       struct list_head                     ci_layers;
+       /** list of locks (to be) acquired by this io. */
+       struct cl_lockset             ci_lockset;
+       /** lock requirements; this is just helper info for sublayers. */
+       enum cl_io_lock_dmd         ci_lockreq;
+       union {
+               struct cl_rd_io {
+                       struct cl_io_rw_common rd;
+               } ci_rd;
+               struct cl_wr_io {
+                       struct cl_io_rw_common wr;
+                       int                 wr_append;
+                       int                 wr_sync;
+               } ci_wr;
+               struct cl_io_rw_common ci_rw;
+               struct cl_setattr_io {
+                       struct ost_lvb   sa_attr;
+                       unsigned int     sa_valid;
+                       struct obd_capa *sa_capa;
+               } ci_setattr;
+               struct cl_fault_io {
+                       /** page index within file. */
+                       pgoff_t  ft_index;
+                       /** number of valid bytes on a faulted page. */
+                       int          ft_nob;
+                       /** writable page? for nopage() only */
+                       int          ft_writable;
+                       /** page of an executable? */
+                       int          ft_executable;
+                       /** page_mkwrite() */
+                       int          ft_mkwrite;
+                       /** resulting page */
+                       struct cl_page *ft_page;
+               } ci_fault;
+               struct cl_fsync_io {
+                       loff_t       fi_start;
+                       loff_t       fi_end;
+                       struct obd_capa   *fi_capa;
+                       /** file system level fid */
+                       struct lu_fid     *fi_fid;
+                       enum cl_fsync_mode fi_mode;
+                       /* how many pages were written/discarded */
+                       unsigned int       fi_nr_written;
+               } ci_fsync;
+       } u;
+       struct cl_2queue     ci_queue;
+       size_t         ci_nob;
+       int               ci_result;
+       unsigned int     ci_continue:1,
+       /**
+        * This io holds a grouplock, to inform sublayers that they must
+        * not do lockless i/o.
+        */
+                            ci_no_srvlock:1,
+       /**
+        * The whole IO needs to be restarted because the layout has changed.
+        */
+                            ci_need_restart:1,
+       /**
+        * Do not refresh the layout - the IO issuer knows that the layout
+        * won't change (page operations; a layout change causes all pages
+        * to be discarded), or it doesn't matter if it changes (sync).
+        */
+                            ci_ignore_layout:1,
+       /**
+        * Check whether the layout changed after the IO finishes. Mainly an
+        * HSM requirement. IO against open files does not need to verify
+        * the layout, because HSM won't release open files.
+        * Right now, only two operations need to verify the layout: glimpse
+        * and setattr.
+        */
+                            ci_verify_layout:1;
+       /**
+        * Number of pages owned by this IO. For invariant checking.
+        */
+       unsigned             ci_owned_nr;
+};
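+
+/*
+ * Editor's sketch (illustrative only): filling in the CIT_FSYNC member of
+ * the union for a local sync of the byte range [start, end] of the file
+ * identified by fid. Allocation and initialization of the io itself is
+ * assumed to happen through the usual cl_io setup path and is not shown.
+ *
+ *	io->u.ci_fsync.fi_start      = start;
+ *	io->u.ci_fsync.fi_end        = end;
+ *	io->u.ci_fsync.fi_fid        = fid;
+ *	io->u.ci_fsync.fi_mode       = CL_FSYNC_LOCAL;
+ *	io->u.ci_fsync.fi_nr_written = 0;
+ */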
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ *     - immediate transfer: this is started when a high level io wants a page
+ *       or a collection of pages to be transferred right away. Examples:
+ *       read-ahead, synchronous read in the case of non-page aligned write,
+ *       page write-out as a part of extent lock cancellation, page write-out
+ *       as a part of memory cleansing. Immediate transfer can be both
+ *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ *       when io wants to transfer a page to the server some time later, when
+ *       it can be done efficiently. Example: pages dirtied by the write(2)
+ *       path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that an efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of the osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, the size of the dirty cache, etc.
+ *
+ * For the immediate transfer, io submits a cl_page_list that the
+ * req-formation engine slices into cl_req's, possibly adding cached pages to
+ * some of the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. The cl_page_operations::cpo_prep() method at a particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if a page submitted for read
+ * became up-to-date in the meantime, or if a page submitted for write does
+ * not have its dirty bit set. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * The plan is to divide transfers into "priority bands" (indicated when
+ * submitting a cl_page_list, and when queuing a page for an opportunistic
+ * transfer) and to allow gluing of cached pages to immediate transfers only
+ * within a single band. This would make high priority transfers (like lock
+ * cancellation or memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+       /** Generic attributes for the server consumption. */
+       struct obdo     *cra_oa;
+       /** Capability. */
+       struct obd_capa *cra_capa;
+       /** Jobid */
+       char             cra_jobid[JOBSTATS_JOBID_SIZE];
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+       /**
+        * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+        * complete (all pages are added).
+        *
+        * \see osc_req_prep()
+        */
+       int  (*cro_prep)(const struct lu_env *env,
+                        const struct cl_req_slice *slice);
+       /**
+        * Called top-to-bottom to fill in \a oa fields. This is called twice
+        * with different flags, see bug 10150 and osc_build_req().
+        *
+        * \param obj an object from cl_req whose attributes are to be set in
+        *          \a oa.
+        *
+        * \param oa struct obdo where attributes are placed
+        *
+        * \param flags \a oa fields to be filled.
+        */
+       void (*cro_attr_set)(const struct lu_env *env,
+                            const struct cl_req_slice *slice,
+                            const struct cl_object *obj,
+                            struct cl_req_attr *attr, obd_valid flags);
+       /**
+        * Called top-to-bottom from cl_req_completion() to notify layers that
+        * transfer completed. Has to free all state allocated by
+        * cl_device_operations::cdo_req_init().
+        */
+       void (*cro_completion)(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * Per-object state that a (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+       /** object itself */
+       struct cl_object   *ro_obj;
+       /** reference to cl_req_obj::ro_obj. For debugging. */
+       struct lu_ref_link *ro_obj_ref;
+       /* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because the IO sub-system
+ * owns them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()); the req keeps track of all
+ * objects it contains pages for.
+ *
+ * Once all pages have been collected, the cl_page_operations::cpo_prep()
+ * method is called top-to-bottom. At that point layers can modify the req,
+ * let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+       enum cl_req_type      crq_type;
+       /** A list of pages being transferred */
+       struct list_head            crq_pages;
+       /** Number of pages in cl_req::crq_pages */
+       unsigned              crq_nrpages;
+       /** An array of objects which pages are in ->crq_pages */
+       struct cl_req_obj    *crq_o;
+       /** Number of elements in cl_req::crq_o[] */
+       unsigned              crq_nrobjs;
+       struct list_head            crq_layers;
+};
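+
+/*
+ * Editor's sketch of the life cycle above (illustrative only; the exact
+ * signatures of cl_req_alloc(), cl_req_page_add(), cl_req_prep() and
+ * cl_req_completion() are assumptions based on the commentary):
+ *
+ *	req = cl_req_alloc(env, page, crt, nr_objects);
+ *	cl_req_page_add(env, req, page);
+ *	rc = cl_req_prep(env, req);
+ *	... the transfer itself happens here ...
+ *	cl_req_completion(env, req, ioret);
+ */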
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+       struct cl_req    *crs_req;
+       struct cl_device *crs_dev;
+       struct list_head        crs_linkage;
+       const struct cl_req_operations *crs_ops;
+};
+
+/* @} cl_req */
+
+enum cache_stats_item {
+       /** how many cache lookups were performed */
+       CS_lookup = 0,
+       /** how many times cache lookup resulted in a hit */
+       CS_hit,
+       /** how many entities are in the cache right now */
+       CS_total,
+       /** how many entities in the cache are actively used (and cannot be
+        * evicted) right now */
+       CS_busy,
+       /** how many entities were created at all */
+       CS_create,
+       CS_NR
+};
+
+#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+       const char    *cs_name;
+       atomic_t   cs_stats[CS_NR];
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
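+
+/*
+ * Editor's usage sketch (illustrative only; "my-cache" and the found flag
+ * are hypothetical): the counters are plain atomics indexed by
+ * enum cache_stats_item.
+ *
+ *	struct cache_stats cs;
+ *
+ *	cache_stats_init(&cs, "my-cache");
+ *	atomic_inc(&cs.cs_stats[CS_lookup]);
+ *	if (found)
+ *		atomic_inc(&cs.cs_stats[CS_hit]);
+ */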
+
+/**
+ * Client-side site. This represents a particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in a single address space.
+ */
+struct cl_site {
+       struct lu_site  cs_lu;
+       /**
+        * Statistical counters. Atomics do not scale; something better,
+        * like per-cpu counters, is needed.
+        *
+        * These are exported as /proc/fs/lustre/llite/.../site
+        *
+        * When interpreting these, keep in mind that both sub-locks (and
+        * sub-pages) and top-locks (and top-pages) are accounted here.
+        */
+       struct cache_stats    cs_pages;
+       struct cache_stats    cs_locks;
+       atomic_t          cs_pages_state[CPS_NR];
+       atomic_t          cs_locks_state[CLS_NR];
+};
+
+int  cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessory functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+       return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+       return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+       LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+       return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+       return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+       return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+       return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+       return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
+}
+
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
+       return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+       return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+       return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+       return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+       return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+       lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                    struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                     struct cl_device *dev,
+                     const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+                                const struct lu_fid *fid,
+                                const struct cl_object_conf *c);
+
+int  cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put     (const struct lu_env *env, struct cl_object *o);
+void cl_object_get     (struct cl_object *o);
+void cl_object_attr_lock  (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_attr *attr);
+int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_attr *attr, unsigned valid);
+int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
+                          struct ost_lvb *lvb);
+int  cl_conf_set         (const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_object_conf *conf);
+void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
+int  cl_object_has_locks  (struct cl_object *obj);
+
+/**
+ * Returns true iff \a o0 and \a o1 are slices of the same object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+       return cl_object_header(o0) == cl_object_header(o1);
+}
+
+static inline void cl_object_page_init(struct cl_object *clob, int size)
+{
+       clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
+       cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
+}
+
+static inline void *cl_object_page_slice(struct cl_object *clob,
+                                        struct cl_page *page)
+{
+       return (void *)((char *)page + clob->co_slice_off);
+}
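+
+/*
+ * Editor's sketch (illustrative only; struct my_page_slice is
+ * hypothetical): a layer reserves room for its per-page slice when the
+ * object is initialized, and later locates that slice within a cl_page
+ * buffer.
+ *
+ *	cl_object_page_init(clob, sizeof(struct my_page_slice));
+ *	...
+ *	struct my_page_slice *mps = cl_object_page_slice(clob, page);
+ */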
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+enum {
+       CLP_GANG_OKAY = 0,
+       CLP_GANG_RESCHED,
+       CLP_GANG_AGAIN,
+       CLP_GANG_ABORT
+};
+
+/* callback of cl_page_gang_lookup() */
+typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
+                                    struct cl_page *, void *);
+int         cl_page_gang_lookup (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    struct cl_io *io,
+                                    pgoff_t start, pgoff_t end,
+                                    cl_page_gang_cb_t cb, void *cbdata);
+struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
+                                    pgoff_t index);
+struct cl_page *cl_page_find   (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    pgoff_t idx, struct page *vmpage,
+                                    enum cl_page_type type);
+struct cl_page *cl_page_find_sub    (const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    pgoff_t idx, struct page *vmpage,
+                                    struct cl_page *parent);
+void       cl_page_get  (struct cl_page *page);
+void       cl_page_put  (const struct lu_env *env,
+                                    struct cl_page *page);
+void       cl_page_print       (const struct lu_env *env, void *cookie,
+                                    lu_printer_t printer,
+                                    const struct cl_page *pg);
+void       cl_page_header_print(const struct lu_env *env, void *cookie,
+                                    lu_printer_t printer,
+                                    const struct cl_page *pg);
+struct page     *cl_page_vmpage      (const struct lu_env *env,
+                                    struct cl_page *page);
+struct cl_page *cl_vmpage_page      (struct page *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top     (struct cl_page *page);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                      const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int  cl_page_own       (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+int  cl_page_own_try    (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+void cl_page_assume     (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+void cl_page_unassume   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *pg);
+void cl_page_disown     (const struct lu_env *env,
+                        struct cl_io *io, struct cl_page *page);
+int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+                        struct cl_page *pg, enum cl_req_type crt, int ioret);
+int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+                        enum cl_req_type crt);
+int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
+                        int from, int to);
+int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *pg);
+void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_unmap  (const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *pg);
+int     cl_page_is_vmlocked  (const struct lu_env *env,
+                             const struct cl_page *pg);
+void    cl_page_export       (const struct lu_env *env,
+                             struct cl_page *pg, int uptodate);
+int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page *page);
+loff_t  cl_offset          (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index            (const struct cl_object *obj, loff_t offset);
+int     cl_page_size    (const struct cl_object *obj);
+int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print      (const struct lu_env *env, void *cookie,
+                        lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                        lu_printer_t printer,
+                        const struct cl_lock_descr *descr);
+/* @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                               const struct cl_lock_descr *need,
+                               const char *scope, const void *source);
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+                                struct cl_object *obj, pgoff_t index,
+                                struct cl_lock *except, int pending,
+                                int canceld);
+static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
+                                             struct cl_object *obj,
+                                             struct cl_page *page,
+                                             struct cl_lock *except,
+                                             int pending, int canceld)
+{
+       LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
+       return cl_lock_at_pgoff(env, obj, page->cp_index, except,
+                               pending, canceld);
+}
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                      const struct lu_device_type *dtype);
+
+void  cl_lock_get       (struct cl_lock *lock);
+void  cl_lock_get_trust (struct cl_lock *lock);
+void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
+void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
+                        const char *scope, const void *source);
+void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+                                    struct cl_lock *lock);
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state);
+int cl_lock_is_intransit(struct cl_lock *lock);
+
+int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
+                        int keep_mutex);
+
+/** \name statemachine Lock state machine
+ * The interface to the lock state machine consists of three parts:
+ *
+ *     - "try" functions that attempt to effect a state transition. If state
+ *     transition is not possible right now (e.g., if it has to wait for some
+ *     asynchronous event to occur), these functions return
+ *     cl_lock_transition::CLO_WAIT.
+ *
+ *     - "non-try" functions that implement synchronous blocking interface on
+ *     top of non-blocking "try" functions. These functions repeatedly call
+ *     corresponding "try" versions, and if state transition is not possible
+ *     immediately, wait for lock state change.
+ *
+ *     - methods from cl_lock_operations, called by "try" functions. Lock can
+ *     be advanced to the target state only when all layers voted that they
+ *     are ready for this transition. "Try" functions call methods under lock
+ *     mutex. If a layer had to release a mutex, it re-acquires it and returns
+ *     cl_lock_transition::CLO_REPEAT, causing "try" function to call all
+ *     layers again.
+ *
+ * TRY              NON-TRY      METHOD                            FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
+ *
+ * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
+ *
+ * cl_use_try()     NONE         cl_lock_operations::clo_use()     CLS_HELD
+ *
+ * @{ */
+
+int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
+                    struct cl_io *io, __u32 flags);
+int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
+int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                    struct cl_io *io, __u32 flags);
+int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
+int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock, int atomic);
+
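+/*
+ * Illustrative sketch only (not the actual implementation): a "non-try"
+ * function typically loops over its "try" counterpart, blocking between
+ * attempts, e.g.:
+ *
+ *     while ((result = cl_wait_try(env, lock)) == CLO_WAIT) {
+ *             if (cl_lock_state_wait(env, lock) != 0)
+ *                     break;
+ *     }
+ *
+ * where cl_lock_state_wait() sleeps until the lock state changes.
+ */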
+/** @} statemachine */
+
+void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
+                         enum cl_lock_state state);
+int  cl_queue_match      (const struct list_head *queue,
+                         const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_is_mutexed (struct cl_lock *lock);
+int  cl_lock_nr_mutexed (const struct lu_env *env);
+int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_ext_match  (const struct cl_lock_descr *has,
+                        const struct cl_lock_descr *need);
+int  cl_lock_descr_match(const struct cl_lock_descr *has,
+                        const struct cl_lock_descr *need);
+int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
+                        const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+                          struct cl_lock_closure *closure,
+                          struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure);
+void cl_lock_disclosure   (const struct lu_env *env,
+                          struct cl_lock_closure *closure);
+int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int   cl_io_init        (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_sub_init     (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_rw_init      (const struct lu_env *env, struct cl_io *io,
+                         enum cl_io_type iot, loff_t pos, size_t count);
+int   cl_io_loop        (const struct lu_env *env, struct cl_io *io);
+
+void  cl_io_fini        (const struct lu_env *env, struct cl_io *io);
+int   cl_io_iter_init    (const struct lu_env *env, struct cl_io *io);
+void  cl_io_iter_fini    (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock        (const struct lu_env *env, struct cl_io *io);
+void  cl_io_unlock       (const struct lu_env *env, struct cl_io *io);
+int   cl_io_start      (const struct lu_env *env, struct cl_io *io);
+void  cl_io_end          (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock_add     (const struct lu_env *env, struct cl_io *io,
+                         struct cl_io_lock_link *link);
+int   cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                          struct cl_lock_descr *descr);
+int   cl_io_read_page    (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page);
+int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
+                         enum cl_req_type iot, struct cl_2queue *queue);
+int   cl_io_submit_sync  (const struct lu_env *env, struct cl_io *io,
+                         enum cl_req_type iot, struct cl_2queue *queue,
+                         long timeout);
+void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
+                         size_t nob);
+int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page_list *queue);
+int   cl_io_is_going     (const struct lu_env *env);
+
+/**
+ * True, iff \a io is an O_APPEND write(2).
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+       return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
+}
+
+static inline int cl_io_is_sync_write(const struct cl_io *io)
+{
+       return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
+}
+
+static inline int cl_io_is_mkwrite(const struct cl_io *io)
+{
+       return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
+}
+
+/**
+ * True, iff \a io is a truncate(2).
+ */
+static inline int cl_io_is_trunc(const struct cl_io *io)
+{
+       return io->ci_type == CIT_SETATTR &&
+               (io->u.ci_setattr.sa_valid & ATTR_SIZE);
+}
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+                lu_printer_t printer, const struct cl_io *io);
+
+#define CL_IO_SLICE_CLEAN(foo_io, base)                                 \
+do {                                                               \
+       typeof(foo_io) __foo_io = (foo_io);                          \
+                                                                       \
+       CLASSERT(offsetof(typeof(*__foo_io), base) == 0);              \
+       memset(&__foo_io->base + 1, 0,                            \
+              (sizeof *__foo_io) - sizeof __foo_io->base);          \
+} while (0)
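+/*
+ * Usage sketch (struct foo_io and its members are hypothetical). The slice
+ * named by \a base must be the first member; everything after it is zeroed:
+ *
+ *     struct foo_io {
+ *             struct cl_io_slice fi_cl;
+ *             int                fi_private;
+ *     };
+ *
+ *     struct foo_io *fio;
+ *     CL_IO_SLICE_CLEAN(fio, fi_cl);
+ */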
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Last page in the page list.
+ */
+static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
+{
+       LASSERT(plist->pl_nr > 0);
+       return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
+}
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list)                             \
+       list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list)               \
+       list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
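+/*
+ * Illustrative example (do_something() is a hypothetical placeholder):
+ *
+ *     struct cl_page *page;
+ *
+ *     cl_page_list_for_each(page, plist)
+ *             do_something(page);
+ */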
+
+void cl_page_list_init   (struct cl_page_list *plist);
+void cl_page_list_add    (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move   (struct cl_page_list *dst, struct cl_page_list *src,
+                         struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+                         struct cl_page_list *head);
+void cl_page_list_del    (const struct lu_env *env,
+                         struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_own    (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_unmap  (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini   (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init     (struct cl_2queue *queue);
+void cl_2queue_add      (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume   (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard  (const struct lu_env *env,
+                        struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini     (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                           enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add  (const struct lu_env *env, struct cl_req *req,
+                      struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int  cl_req_prep      (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set  (const struct lu_env *env, struct cl_req *req,
+                      struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on the stack by the
+ * thread doing a synchronous transfer, and a pointer to it is set up in
+ * every page submitted for transfer. The transfer completion routine
+ * updates the anchor and wakes up the waiting thread when the transfer is
+ * complete.
+ */
+struct cl_sync_io {
+       /** number of pages yet to be transferred. */
+       atomic_t                csi_sync_nr;
+       /** error code. */
+       int                     csi_sync_rc;
+       /** barrier to destroying this structure */
+       atomic_t                csi_barrier;
+       /** completion to be signaled when transfer is complete. */
+       wait_queue_head_t               csi_waitq;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page_list *queue, struct cl_sync_io *anchor,
+                    long timeout);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
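+/*
+ * Typical usage sketch (illustrative): the submitting thread does
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, nrpages);
+ *     ... submit the pages, each one carrying a pointer to &anchor ...
+ *     rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
+ *
+ * while transfer completion calls cl_sync_io_note(&anchor, ioret) for each
+ * page.
+ */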
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context, a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ *     - there is a (mostly) fixed number of threads, and
+ *
+ *     - call chains have no non-lustre portions inserted between pieces of
+ *     lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM code that calls back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ *     - allocation and destruction of environments are amortized by caching
+ *     environments that are no longer used, instead of destroying them;
+ *
+ *     - there is a notion of "current" environment, attached to the kernel
+ *     data structure representing the current thread. Top-level lustre code
+ *     allocates an environment and makes it current, then calls into
+ *     non-lustre code, which in turn calls lustre back. Low-level lustre
+ *     code thus called can fetch the environment created by the top-level
+ *     code and reuse it, avoiding an additional environment allocation.
+ *       Right now, three interfaces can attach the cl_env to a running
+ *       thread:
+ *       - cl_env_get
+ *       - cl_env_implant
+ *       - cl_env_reexit (cl_env_reenter must have been called first)
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+       int   cen_refcheck;
+       void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek       (int *refcheck);
+struct lu_env *cl_env_get      (int *refcheck);
+struct lu_env *cl_env_alloc      (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void      cl_env_put   (struct lu_env *env, int *refcheck);
+void      cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void     *cl_env_reenter    (void);
+void      cl_env_reexit     (void *cookie);
+void      cl_env_implant    (struct lu_env *env, int *refcheck);
+void      cl_env_unplant    (struct lu_env *env, int *refcheck);
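+/*
+ * Common usage pattern (a sketch, not the authoritative sequence):
+ *
+ *     int refcheck;
+ *     struct lu_env *env;
+ *
+ *     env = cl_env_get(&refcheck);
+ *     if (!IS_ERR(env)) {
+ *             ... run clio code in this environment ...
+ *             cl_env_put(env, &refcheck);
+ *     }
+ */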
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                               struct lu_device_type *ldt,
+                               struct lu_device *next);
+/** @} clio */
+
+int cl_global_init(void);
+void cl_global_fini(void);
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h
new file mode 100644 (file)
index 0000000..e116bb2
--- /dev/null
@@ -0,0 +1,1498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_DT_OBJECT_H
+#define __LUSTRE_DT_OBJECT_H
+
+/** \defgroup dt dt
+ * Sub-class of lu_object with methods common for "data" objects in the OST
+ * stack.
+ *
+ * Data objects behave like regular files: you can read/write them, and get
+ * and set their attributes. An implementation of the dt interface is
+ * supposed to provide some form of garbage collection, normally a reference
+ * counting (nlink) based one.
+ *
+ * Example: osd (lustre/osd) is an implementation of the dt interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+
+struct thandle;
+struct dt_device;
+struct dt_object;
+struct dt_index_features;
+struct niobuf_local;
+struct niobuf_remote;
+struct ldlm_enqueue_info;
+
+typedef enum {
+       MNTOPT_USERXATTR        = 0x00000001,
+       MNTOPT_ACL            = 0x00000002,
+} mntopt_t;
+
+struct dt_device_param {
+       unsigned           ddp_max_name_len;
+       unsigned           ddp_max_nlink;
+       unsigned           ddp_block_shift;
+       mntopt_t           ddp_mntopts;
+       unsigned           ddp_max_ea_size;
+       void          *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */
+       int             ddp_mount_type;
+       unsigned long long ddp_maxbytes;
+       /* percentage of available space to reserve for grant error margin */
+       int             ddp_grant_reserved;
+       /* per-inode space consumption */
+       short         ddp_inodespace;
+       /* per-fragment grant overhead to be used by client for grant
+        * calculation */
+       int             ddp_grant_frag;
+};
+
+/**
+ * Per-transaction commit callback function
+ */
+struct dt_txn_commit_cb;
+typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th,
+                       struct dt_txn_commit_cb *cb, int err);
+/**
+ * Special per-transaction callback for cases when just a commit callback
+ * is needed and per-device callbacks are not convenient to use
+ */
+#define TRANS_COMMIT_CB_MAGIC  0xa0a00a0a
+#define MAX_COMMIT_CB_STR_LEN  32
+
+struct dt_txn_commit_cb {
+       struct list_head        dcb_linkage;
+       dt_cb_t         dcb_func;
+       __u32           dcb_magic;
+       char            dcb_name[MAX_COMMIT_CB_STR_LEN];
+};
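+/*
+ * Registration sketch (foo_commit_cb and mycb are hypothetical; dcb_magic
+ * is filled in by dt_trans_cb_add(), defined later in this header):
+ *
+ *     static void foo_commit_cb(struct lu_env *env, struct thandle *th,
+ *                               struct dt_txn_commit_cb *cb, int err)
+ *     {
+ *             ... act on commit; err is the commit result ...
+ *     }
+ *
+ *     mycb.dcb_func = foo_commit_cb;
+ *     strlcpy(mycb.dcb_name, "foo", sizeof(mycb.dcb_name));
+ *     rc = dt_trans_cb_add(th, &mycb);
+ */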
+
+/**
+ * Operations on dt device.
+ */
+struct dt_device_operations {
+       /**
+        * Return device-wide statistics.
+        */
+       int   (*dt_statfs)(const struct lu_env *env,
+                          struct dt_device *dev, struct obd_statfs *osfs);
+       /**
+        * Create a new transaction.
+        */
+       struct thandle *(*dt_trans_create)(const struct lu_env *env,
+                                          struct dt_device *dev);
+       /**
+        * Start a previously created transaction.
+        */
+       int   (*dt_trans_start)(const struct lu_env *env,
+                               struct dt_device *dev, struct thandle *th);
+       /**
+        * Finish previously started transaction.
+        */
+       int   (*dt_trans_stop)(const struct lu_env *env,
+                              struct thandle *th);
+       /**
+        * Add commit callback to the transaction.
+        */
+       int   (*dt_trans_cb_add)(struct thandle *th,
+                                struct dt_txn_commit_cb *dcb);
+       /**
+        * Return fid of root index object.
+        */
+       int   (*dt_root_get)(const struct lu_env *env,
+                            struct dt_device *dev, struct lu_fid *f);
+       /**
+        * Return device configuration data.
+        */
+       void  (*dt_conf_get)(const struct lu_env *env,
+                            const struct dt_device *dev,
+                            struct dt_device_param *param);
+       /**
+        * Handle device state, mostly for tests.
+        */
+       int   (*dt_sync)(const struct lu_env *env, struct dt_device *dev);
+       int   (*dt_ro)(const struct lu_env *env, struct dt_device *dev);
+       /**
+        * Start a transaction commit asynchronously.
+        *
+        * \param env environment
+        * \param dev dt_device to start commit on
+        *
+        * \return 0 on success, negative value on error
+        */
+       int   (*dt_commit_async)(const struct lu_env *env,
+                                struct dt_device *dev);
+       /**
+        * Initialize capability context.
+        */
+       int   (*dt_init_capa_ctxt)(const struct lu_env *env,
+                                  struct dt_device *dev,
+                                  int mode, unsigned long timeout,
+                                  __u32 alg, struct lustre_capa_key *keys);
+};
+
+struct dt_index_features {
+       /** required feature flags from enum dt_index_flags */
+       __u32 dif_flags;
+       /** minimal required key size */
+       size_t dif_keysize_min;
+       /** maximal required key size, 0 if no limit */
+       size_t dif_keysize_max;
+       /** minimal required record size */
+       size_t dif_recsize_min;
+       /** maximal required record size, 0 if no limit */
+       size_t dif_recsize_max;
+       /** pointer size for record */
+       size_t dif_ptrsize;
+};
+
+enum dt_index_flags {
+       /** index supports variable sized keys */
+       DT_IND_VARKEY = 1 << 0,
+       /** index supports variable sized records */
+       DT_IND_VARREC = 1 << 1,
+       /** index can be modified */
+       DT_IND_UPDATE = 1 << 2,
+       /** index supports records with non-unique (duplicate) keys */
+       DT_IND_NONUNQ = 1 << 3,
+       /**
+        * index supports fixed-size keys sorted in natural numerical order
+        * and is able to return the left-side value if no exact match is found
+        */
+       DT_IND_RANGE = 1 << 4,
+};
+
+/**
+ * Features required from an index to support file system directories
+ * (mapping names to fids).
+ */
+extern const struct dt_index_features dt_directory_features;
+extern const struct dt_index_features dt_otable_features;
+extern const struct dt_index_features dt_lfsck_features;
+
+/* index features supported by the accounting objects */
+extern const struct dt_index_features dt_acct_features;
+
+/* index features supported by the quota global indexes */
+extern const struct dt_index_features dt_quota_glb_features;
+
+/* index features supported by the quota slave indexes */
+extern const struct dt_index_features dt_quota_slv_features;
+
+/**
+ * A general purpose dt allocation hint.
+ * Currently it contains only the parent object, but it may carry additional
+ * allocation hints in the future.
+ */
+struct dt_allocation_hint {
+       struct dt_object           *dah_parent;
+       __u32                  dah_mode;
+};
+
+/**
+ * object type specifier.
+ */
+
+enum dt_format_type {
+       DFT_REGULAR,
+       DFT_DIR,
+       /** for mknod */
+       DFT_NODE,
+       /** for special index */
+       DFT_INDEX,
+       /** for symbolic link */
+       DFT_SYM,
+};
+
+/**
+ * object format specifier.
+ */
+struct dt_object_format {
+       /** type for dt object */
+       enum dt_format_type dof_type;
+       union {
+               struct dof_regular {
+                       int striped;
+               } dof_reg;
+               struct dof_dir {
+               } dof_dir;
+               struct dof_node {
+               } dof_node;
+               /**
+                * special index need feature as parameter to create
+                * special idx
+                */
+               struct dof_index {
+                       const struct dt_index_features *di_feat;
+               } dof_idx;
+       } u;
+};
+
+enum dt_format_type dt_mode_to_dft(__u32 mode);
+
+typedef __u64 dt_obj_version_t;
+
+/**
+ * Per-dt-object operations.
+ */
+struct dt_object_operations {
+       void  (*do_read_lock)(const struct lu_env *env,
+                             struct dt_object *dt, unsigned role);
+       void  (*do_write_lock)(const struct lu_env *env,
+                              struct dt_object *dt, unsigned role);
+       void  (*do_read_unlock)(const struct lu_env *env,
+                               struct dt_object *dt);
+       void  (*do_write_unlock)(const struct lu_env *env,
+                                struct dt_object *dt);
+       int  (*do_write_locked)(const struct lu_env *env,
+                               struct dt_object *dt);
+       /**
+        * Note: the following ->do_{x,}attr_{set,get}() operations are very
+        * similar to the ->moo_{x,}attr_{set,get}() operations in struct
+        * md_object_operations (see md_object.h). These operations are not in
+        * lu_object_operations, because the ->do_{x,}attr_set() versions take
+        * a transaction handle as an argument (the transaction is started by
+        * the caller). We might factor ->do_{x,}attr_get() into
+        * lu_object_operations, but that would break the existing symmetry.
+        */
+
+       /**
+        * Return standard attributes.
+        *
+        * precondition: lu_object_exists(&dt->do_lu);
+        */
+       int   (*do_attr_get)(const struct lu_env *env,
+                            struct dt_object *dt, struct lu_attr *attr,
+                            struct lustre_capa *capa);
+       /**
+        * Set standard attributes.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_attr_set)(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    const struct lu_attr *attr,
+                                    struct thandle *handle);
+       int   (*do_attr_set)(const struct lu_env *env,
+                            struct dt_object *dt,
+                            const struct lu_attr *attr,
+                            struct thandle *handle,
+                            struct lustre_capa *capa);
+       /**
+        * Return a value of an extended attribute.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt,
+                             struct lu_buf *buf, const char *name,
+                             struct lustre_capa *capa);
+       /**
+        * Set value of an extended attribute.
+        *
+        * \a fl - flags from enum lu_xattr_flags
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_xattr_set)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_buf *buf,
+                                     const char *name, int fl,
+                                     struct thandle *handle);
+       int   (*do_xattr_set)(const struct lu_env *env,
+                             struct dt_object *dt, const struct lu_buf *buf,
+                             const char *name, int fl, struct thandle *handle,
+                             struct lustre_capa *capa);
+       /**
+        * Delete existing extended attribute.
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_xattr_del)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const char *name, struct thandle *handle);
+       int   (*do_xattr_del)(const struct lu_env *env,
+                             struct dt_object *dt,
+                             const char *name, struct thandle *handle,
+                             struct lustre_capa *capa);
+       /**
+        * Place the list of existing extended attributes into \a buf (the
+        * buffer length is carried by \a buf itself).
+        *
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_xattr_list)(const struct lu_env *env,
+                              struct dt_object *dt, struct lu_buf *buf,
+                              struct lustre_capa *capa);
+       /**
+        * Init allocation hint using parent object and child mode.
+        * (1) The \a parent might be NULL if this is a partial creation for
+        *     a remote object.
+        * (2) The type of the child is in \a child_mode.
+        * (3) The resulting hint is stored in \a ah.
+        */
+       void  (*do_ah_init)(const struct lu_env *env,
+                           struct dt_allocation_hint *ah,
+                           struct dt_object *parent,
+                           struct dt_object *child,
+                           umode_t child_mode);
+       /**
+        * Create new object on this device.
+        *
+        * precondition: !dt_object_exists(dt);
+        * postcondition: ergo(result == 0, dt_object_exists(dt));
+        */
+       int   (*do_declare_create)(const struct lu_env *env,
+                                  struct dt_object *dt,
+                                  struct lu_attr *attr,
+                                  struct dt_allocation_hint *hint,
+                                  struct dt_object_format *dof,
+                                  struct thandle *th);
+       int   (*do_create)(const struct lu_env *env, struct dt_object *dt,
+                          struct lu_attr *attr,
+                          struct dt_allocation_hint *hint,
+                          struct dt_object_format *dof,
+                          struct thandle *th);
+
+       /**
+        * Destroy object on this device.
+        *
+        * precondition: dt_object_exists(dt);
+        * postcondition: ergo(result == 0, !dt_object_exists(dt));
+        */
+       int   (*do_declare_destroy)(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct thandle *th);
+       int   (*do_destroy)(const struct lu_env *env, struct dt_object *dt,
+                           struct thandle *th);
+
+       /**
+        * Announce that this object is going to be used as an index. This
+        * operation checks that the object supports indexing operations and
+        * installs the appropriate dt_index_operations vector on success.
+        *
+        * Also probes for features. The operation is successful if all
+        * required features are supported.
+        */
+       int   (*do_index_try)(const struct lu_env *env,
+                             struct dt_object *dt,
+                             const struct dt_index_features *feat);
+       /**
+        * Increment nlink of the object.
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_ref_add)(const struct lu_env *env,
+                                   struct dt_object *dt, struct thandle *th);
+       int   (*do_ref_add)(const struct lu_env *env,
+                           struct dt_object *dt, struct thandle *th);
+       /**
+        * Decrement nlink of the object.
+        * precondition: dt_object_exists(dt);
+        */
+       int   (*do_declare_ref_del)(const struct lu_env *env,
+                                   struct dt_object *dt, struct thandle *th);
+       int   (*do_ref_del)(const struct lu_env *env,
+                           struct dt_object *dt, struct thandle *th);
+
+       struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+                                       struct dt_object *dt,
+                                       struct lustre_capa *old,
+                                       __u64 opc);
+       int (*do_object_sync)(const struct lu_env *, struct dt_object *);
+       /**
+        * Get object info of the next level. Currently, this only gets the
+        * inode from the osd. It is only used by quota (b=16542).
+        * precondition: dt_object_exists(dt);
+        */
+       int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+                          void **data);
+
+       /**
+        * Lock object.
+        */
+       int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt,
+                             struct lustre_handle *lh,
+                             struct ldlm_enqueue_info *einfo,
+                             void *policy);
+};
+
+/**
+ * Per-dt-object operations on "file body".
+ */
+struct dt_body_operations {
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt,
+                           struct lu_buf *buf, loff_t *pos,
+                           struct lustre_capa *capa);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       ssize_t (*dbo_declare_write)(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    const loff_t size, loff_t pos,
+                                    struct thandle *handle);
+       ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt,
+                            const struct lu_buf *buf, loff_t *pos,
+                            struct thandle *handle, struct lustre_capa *capa,
+                            int ignore_quota);
+       /*
+        * methods for zero-copy IO
+        */
+
+       /*
+        * precondition: dt_object_exists(dt);
+        * returns:
+        * < 0 - error code
+        * = 0 - illegal
+        * > 0 - number of local buffers prepared
+        */
+       int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt,
+                           loff_t pos, ssize_t len, struct niobuf_local *lb,
+                           int rw, struct lustre_capa *capa);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt,
+                           struct niobuf_local *lb, int nr);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt,
+                             struct niobuf_local *lb, int nr);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_declare_write_commit)(const struct lu_env *env,
+                                       struct dt_object *dt,
+                                       struct niobuf_local *,
+                                       int, struct thandle *);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt,
+                               struct niobuf_local *, int, struct thandle *);
+       /*
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt,
+                            struct niobuf_local *lnb, int nr);
+       int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt,
+                             struct ll_user_fiemap *fm);
+       /**
+        * Punch object's content
+        * precondition: regular object, not index
+        */
+       int   (*dbo_declare_punch)(const struct lu_env *, struct dt_object *,
+                                 __u64, __u64, struct thandle *th);
+       int   (*dbo_punch)(const struct lu_env *env, struct dt_object *dt,
+                         __u64 start, __u64 end, struct thandle *th,
+                         struct lustre_capa *capa);
+};
+
+/**
+ * Incomplete type of index record.
+ */
+struct dt_rec;
+
+/**
+ * Incomplete type of index key.
+ */
+struct dt_key;
+
+/**
+ * Incomplete type of dt iterator.
+ */
+struct dt_it;
+
+/**
+ * Per-dt-object operations on object as index.
+ */
+struct dt_index_operations {
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt,
+                         struct dt_rec *rec, const struct dt_key *key,
+                         struct lustre_capa *capa);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_declare_insert)(const struct lu_env *env,
+                                 struct dt_object *dt,
+                                 const struct dt_rec *rec,
+                                 const struct dt_key *key,
+                                 struct thandle *handle);
+       int (*dio_insert)(const struct lu_env *env, struct dt_object *dt,
+                         const struct dt_rec *rec, const struct dt_key *key,
+                         struct thandle *handle, struct lustre_capa *capa,
+                         int ignore_quota);
+       /**
+        * precondition: dt_object_exists(dt);
+        */
+       int (*dio_declare_delete)(const struct lu_env *env,
+                                 struct dt_object *dt,
+                                 const struct dt_key *key,
+                                 struct thandle *handle);
+       int (*dio_delete)(const struct lu_env *env, struct dt_object *dt,
+                         const struct dt_key *key, struct thandle *handle,
+                         struct lustre_capa *capa);
+       /**
+        * Iterator interface
+        */
+       struct dt_it_ops {
+               /**
+                * Allocate and initialize new iterator.
+                *
+                * precondition: dt_object_exists(dt);
+                */
+               struct dt_it *(*init)(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     __u32 attr,
+                                     struct lustre_capa *capa);
+               void      (*fini)(const struct lu_env *env,
+                                     struct dt_it *di);
+               int         (*get)(const struct lu_env *env,
+                                     struct dt_it *di,
+                                     const struct dt_key *key);
+               void       (*put)(const struct lu_env *env,
+                                     struct dt_it *di);
+               int        (*next)(const struct lu_env *env,
+                                     struct dt_it *di);
+               struct dt_key *(*key)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int       (*key_size)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int         (*rec)(const struct lu_env *env,
+                                     const struct dt_it *di,
+                                     struct dt_rec *rec,
+                                     __u32 attr);
+               __u64   (*store)(const struct lu_env *env,
+                                     const struct dt_it *di);
+               int        (*load)(const struct lu_env *env,
+                                     const struct dt_it *di, __u64 hash);
+               int     (*key_rec)(const struct lu_env *env,
+                                     const struct dt_it *di, void* key_rec);
+       } dio_it;
+};
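+/*
+ * Illustrative walk over an index (a sketch; the precise return-value
+ * conventions of get()/next() are implementation-defined, and error
+ * handling is elided):
+ *
+ *     const struct dt_it_ops *iops = &dt->do_index_ops->dio_it;
+ *     struct dt_it *it;
+ *
+ *     it = iops->init(env, dt, attr, capa);
+ *     if (!IS_ERR(it)) {
+ *             iops->get(env, it, key);
+ *             do {
+ *                     ... consume iops->key()/iops->rec() ...
+ *             } while (iops->next(env, it) == 0);
+ *             iops->put(env, it);
+ *             iops->fini(env, it);
+ *     }
+ */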
+
+enum dt_otable_it_valid {
+       DOIV_ERROR_HANDLE       = 0x0001,
+};
+
+enum dt_otable_it_flags {
+       /* Exit on failure. */
+       DOIF_FAILOUT    = 0x0001,
+
+       /* Reset iteration position to the device beginning. */
+       DOIF_RESET      = 0x0002,
+
+       /* An upper-layer component is using the iteration. */
+       DOIF_OUTUSED    = 0x0004,
+};
+
+/* Otable-based iteration uses the common DT iteration APIs.
+ * To initialize the iteration, dio_it::init() must be called first.
+ * Here is how otable-based iteration should prepare the arguments for
+ * dt_it_ops::init().
+ *
+ * For otable-based iteration, the 32-bit 'attr' for dt_it_ops::init()
+ * is composed of two parts:
+ * the low 16 bits are valid bits, the high 16 bits are flag bits. */
+#define DT_OTABLE_IT_FLAGS_SHIFT       16
+#define DT_OTABLE_IT_FLAGS_MASK        0xffff0000
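+/*
+ * E.g. (illustrative only): to request a reset iteration with error-out
+ * behaviour one would compose
+ *
+ *     attr = DOIV_ERROR_HANDLE |
+ *            ((DOIF_FAILOUT | DOIF_RESET) << DT_OTABLE_IT_FLAGS_SHIFT);
+ */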
+
+struct dt_device {
+       struct lu_device                   dd_lu_dev;
+       const struct dt_device_operations *dd_ops;
+
+       /**
+        * List of dt_txn_callback (see below). This is not protected in any
+        * way, because callbacks are supposed to be added/deleted only during
+        * single-threaded start-up/shut-down procedures.
+        */
+       struct list_head                         dd_txn_callbacks;
+};
+
+int  dt_device_init(struct dt_device *dev, struct lu_device_type *t);
+void dt_device_fini(struct dt_device *dev);
+
+static inline int lu_device_is_dt(const struct lu_device *d)
+{
+       return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT);
+}
+
+static inline struct dt_device *lu2dt_dev(struct lu_device *l)
+{
+       LASSERT(lu_device_is_dt(l));
+       return container_of0(l, struct dt_device, dd_lu_dev);
+}
+
+struct dt_object {
+       struct lu_object                   do_lu;
+       const struct dt_object_operations *do_ops;
+       const struct dt_body_operations   *do_body_ops;
+       const struct dt_index_operations  *do_index_ops;
+};
+
+/*
+ * In-core representation of per-device local object OID storage
+ */
+struct local_oid_storage {
+       /* all initialized llog systems on this node linked by this */
+       struct list_head          los_list;
+
+       /* how many handles reference this los */
+       atomic_t          los_refcount;
+       struct dt_device *los_dev;
+       struct dt_object *los_obj;
+
+       /* data used to generate new fids */
+       struct mutex     los_id_lock;
+       __u64             los_seq;
+       __u32             los_last_oid;
+};
+
+static inline struct dt_object *lu2dt(struct lu_object *l)
+{
+       LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev));
+       return container_of0(l, struct dt_object, do_lu);
+}
+
+int  dt_object_init(struct dt_object *obj,
+                   struct lu_object_header *h, struct lu_device *d);
+
+void dt_object_fini(struct dt_object *obj);
+
+static inline int dt_object_exists(const struct dt_object *dt)
+{
+       return lu_object_exists(&dt->do_lu);
+}
+
+static inline int dt_object_remote(const struct dt_object *dt)
+{
+       return lu_object_remote(&dt->do_lu);
+}
+
+static inline struct dt_object *lu2dt_obj(struct lu_object *o)
+{
+       LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev)));
+       return container_of0(o, struct dt_object, do_lu);
+}
+
+/**
+ * This is the general purpose transaction handle.
+ * 1. Transaction Life Cycle
+ *      This transaction handle is allocated upon starting a new transaction,
+ *      and deallocated after this transaction is committed.
+ * 2. Transaction Nesting
+ *      Nested transactions are _NOT_ supported. Every thread should have at
+ *      most one active transaction, and a transaction belongs to exactly one
+ *      thread. Because of this, the transaction handle needs no reference
+ *      count.
+ * 3. Transaction & dt_object locking
+ *      dt_object locks should be taken inside a transaction.
+ * 4. Transaction & RPC
+ *      No RPC request should be issued inside a transaction.
+ */
+struct thandle {
+       /** the dt device on which the transactions are executed */
+       struct dt_device *th_dev;
+
+       /** context for this transaction, tag is LCT_TX_HANDLE */
+       struct lu_context th_ctx;
+
+       /** additional tags (layers can add in declare) */
+       __u32        th_tags;
+
+       /** the last operation result in this transaction.
+        * This value is used in recovery */
+       __s32        th_result;
+
+       /** whether we need sync commit */
+       unsigned int            th_sync:1;
+
+       /* local transaction; no need to inform other layers */
+       unsigned int            th_local:1;
+
+       /* In DNE, one transaction can be disassembled into
+        * updates on several different MDTs, and these updates
+        * will be attached to th_remote_update_list per target.
+        * Only a single thread will access the list, so no lock
+        * is needed.
+        */
+       struct list_head                th_remote_update_list;
+       struct update_request   *th_current_request;
+};
+
+/**
+ * Transaction call-backs.
+ *
+ * These are invoked by osd (or underlying transaction engine) when
+ * transaction changes state.
+ *
+ * Call-backs are used by upper layers to modify transaction parameters and to
+ * perform some actions for each transaction state transition. A typical
+ * example is the mdt registering a call-back to write into the last-received
+ * file before each transaction commit.
+ */
+struct dt_txn_callback {
+       int (*dtc_txn_start)(const struct lu_env *env,
+                            struct thandle *txn, void *cookie);
+       int (*dtc_txn_stop)(const struct lu_env *env,
+                           struct thandle *txn, void *cookie);
+       void (*dtc_txn_commit)(struct thandle *txn, void *cookie);
+       void            *dtc_cookie;
+       __u32           dtc_tag;
+       struct list_head           dtc_linkage;
+};
+
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb);
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb);
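+/*
+ * Registration sketch (foo_txn_start and mycb are hypothetical):
+ *
+ *     static int foo_txn_start(const struct lu_env *env,
+ *                              struct thandle *txn, void *cookie)
+ *     {
+ *             return 0;
+ *     }
+ *
+ *     mycb.dtc_txn_start = foo_txn_start;
+ *     mycb.dtc_cookie = my_data;
+ *     dt_txn_callback_add(dev, &mycb);
+ *     ...
+ *     dt_txn_callback_del(dev, &mycb);
+ */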
+
+int dt_txn_hook_start(const struct lu_env *env,
+                     struct dt_device *dev, struct thandle *txn);
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn);
+void dt_txn_hook_commit(struct thandle *txn);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj);
+
+/**
+ * Callback function used for parsing path.
+ * \see llo_store_resolve
+ */
+typedef int (*dt_entry_func_t)(const struct lu_env *env,
+                           const char *name,
+                           void *pvt);
+
+#define DT_MAX_PATH 1024
+
+int dt_path_parser(const struct lu_env *env,
+                  char *local, dt_entry_func_t entry_func,
+                  void *data);
+
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+                const char *path, struct lu_fid *fid);
+
+struct dt_object *dt_store_open(const struct lu_env *env,
+                               struct dt_device *dt,
+                               const char *dirname,
+                               const char *filename,
+                               struct lu_fid *fid);
+
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object_format *dof,
+                                   struct lu_attr *attr);
+
+struct dt_object *dt_locate_at(const struct lu_env *env,
+                              struct dt_device *dev,
+                              const struct lu_fid *fid,
+                              struct lu_device *top_dev);
+static inline struct dt_object *
+dt_locate(const struct lu_env *env, struct dt_device *dev,
+         const struct lu_fid *fid)
+{
+       return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev);
+}
+
+
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+                          const struct lu_fid *first_fid,
+                          struct local_oid_storage **los);
+void local_oid_storage_fini(const struct lu_env *env,
+                           struct local_oid_storage *los);
+int local_object_fid_generate(const struct lu_env *env,
+                             struct local_oid_storage *los,
+                             struct lu_fid *fid);
+int local_object_declare_create(const struct lu_env *env,
+                               struct local_oid_storage *los,
+                               struct dt_object *o,
+                               struct lu_attr *attr,
+                               struct dt_object_format *dof,
+                               struct thandle *th);
+int local_object_create(const struct lu_env *env,
+                       struct local_oid_storage *los,
+                       struct dt_object *o,
+                       struct lu_attr *attr, struct dt_object_format *dof,
+                       struct thandle *th);
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+                                           struct local_oid_storage *los,
+                                           struct dt_object *parent,
+                                           const char *name, __u32 mode);
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+                                                    struct dt_device *dt,
+                                                    const struct lu_fid *fid,
+                                                    struct dt_object *parent,
+                                                    const char *name,
+                                                    __u32 mode);
+struct dt_object *
+local_index_find_or_create(const struct lu_env *env,
+                          struct local_oid_storage *los,
+                          struct dt_object *parent,
+                          const char *name, __u32 mode,
+                          const struct dt_index_features *ft);
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object *parent,
+                                   const char *name, __u32 mode,
+                                   const struct dt_index_features *ft);
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+                       struct dt_object *parent, const char *name);
+
+static inline int dt_object_lock(const struct lu_env *env,
+                                struct dt_object *o, struct lustre_handle *lh,
+                                struct ldlm_enqueue_info *einfo,
+                                void *policy)
+{
+       LASSERT(o);
+       LASSERT(o->do_ops);
+       LASSERT(o->do_ops->do_object_lock);
+       return o->do_ops->do_object_lock(env, o, lh, einfo, policy);
+}
+
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+                 const char *name, struct lu_fid *fid);
+
+static inline int dt_object_sync(const struct lu_env *env,
+                                struct dt_object *o)
+{
+       LASSERT(o);
+       LASSERT(o->do_ops);
+       LASSERT(o->do_ops->do_object_sync);
+       return o->do_ops->do_object_sync(env, o);
+}
+
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+                          struct thandle *th);
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+                   dt_obj_version_t version, struct thandle *th);
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o);
+
+
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+           struct lu_buf *buf, loff_t *pos);
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+                  struct lu_buf *buf, loff_t *pos);
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_buf *buf, loff_t *pos, struct thandle *th);
+typedef int (*dt_index_page_build_t)(const struct lu_env *env,
+                                    union lu_page *lp, int nob,
+                                    const struct dt_it_ops *iops,
+                                    struct dt_it *it, __u32 attr, void *arg);
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+                 const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+                 void *arg);
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+                 struct idx_info *ii, const struct lu_rdpg *rdpg);
+
+static inline struct thandle *dt_trans_create(const struct lu_env *env,
+                                             struct dt_device *d)
+{
+       LASSERT(d->dd_ops->dt_trans_create);
+       return d->dd_ops->dt_trans_create(env, d);
+}
+
+static inline int dt_trans_start(const struct lu_env *env,
+                                struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_start);
+       return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+/* for this transaction, hooks shouldn't be called */
+static inline int dt_trans_start_local(const struct lu_env *env,
+                                      struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_start);
+       th->th_local = 1;
+       return d->dd_ops->dt_trans_start(env, d, th);
+}
+
+static inline int dt_trans_stop(const struct lu_env *env,
+                               struct dt_device *d, struct thandle *th)
+{
+       LASSERT(d->dd_ops->dt_trans_stop);
+       return d->dd_ops->dt_trans_stop(env, th);
+}
+
+static inline int dt_trans_cb_add(struct thandle *th,
+                                 struct dt_txn_commit_cb *dcb)
+{
+       LASSERT(th->th_dev->dd_ops->dt_trans_cb_add);
+       dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC;
+       return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb);
+}
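+/*
+ * Illustrative transaction life cycle (a sketch; obj, buf, size and pos are
+ * placeholders, and error handling is abbreviated):
+ *
+ *     struct thandle *th = dt_trans_create(env, dev);
+ *
+ *     if (!IS_ERR(th)) {
+ *             rc = dt_declare_record_write(env, obj, size, pos, th);
+ *             if (rc == 0)
+ *                     rc = dt_trans_start(env, dev, th);
+ *             if (rc == 0)
+ *                     rc = dt_record_write(env, obj, buf, &pos, th);
+ *             dt_trans_stop(env, dev, th);
+ *     }
+ */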
+/** @} dt */
+
+
+static inline int dt_declare_record_write(const struct lu_env *env,
+                                         struct dt_object *dt,
+                                         int size, loff_t pos,
+                                         struct thandle *th)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+       LASSERT(th != NULL);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_declare_write);
+       rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th);
+       return rc;
+}
+
+static inline int dt_declare_create(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct lu_attr *attr,
+                                   struct dt_allocation_hint *hint,
+                                   struct dt_object_format *dof,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_create);
+       return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_create(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   struct lu_attr *attr,
+                                   struct dt_allocation_hint *hint,
+                                   struct dt_object_format *dof,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_create);
+       return dt->do_ops->do_create(env, dt, attr, hint, dof, th);
+}
+
+static inline int dt_declare_destroy(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_destroy);
+       return dt->do_ops->do_declare_destroy(env, dt, th);
+}
+
+static inline int dt_destroy(const struct lu_env *env,
+                            struct dt_object *dt,
+                            struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_destroy);
+       return dt->do_ops->do_destroy(env, dt, th);
+}
+
+static inline void dt_read_lock(const struct lu_env *env,
+                               struct dt_object *dt,
+                               unsigned role)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_read_lock);
+       dt->do_ops->do_read_lock(env, dt, role);
+}
+
+static inline void dt_write_lock(const struct lu_env *env,
+                               struct dt_object *dt,
+                               unsigned role)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_lock);
+       dt->do_ops->do_write_lock(env, dt, role);
+}
+
+static inline void dt_read_unlock(const struct lu_env *env,
+                               struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_read_unlock);
+       dt->do_ops->do_read_unlock(env, dt);
+}
+
+static inline void dt_write_unlock(const struct lu_env *env,
+                               struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_unlock);
+       dt->do_ops->do_write_unlock(env, dt);
+}
+
+static inline int dt_write_locked(const struct lu_env *env,
+                                 struct dt_object *dt)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_write_locked);
+       return dt->do_ops->do_write_locked(env, dt);
+}
+
+static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt,
+                             struct lu_attr *la, void *arg)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_attr_get);
+       return dt->do_ops->do_attr_get(env, dt, la, arg);
+}
+
+static inline int dt_declare_attr_set(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_attr *la,
+                                     struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_attr_set);
+       return dt->do_ops->do_declare_attr_set(env, dt, la, th);
+}
+
+static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt,
+                             const struct lu_attr *la, struct thandle *th,
+                             struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_attr_set);
+       return dt->do_ops->do_attr_set(env, dt, la, th, capa);
+}
+
+static inline int dt_declare_ref_add(const struct lu_env *env,
+                                    struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_ref_add);
+       return dt->do_ops->do_declare_ref_add(env, dt, th);
+}
+
+static inline int dt_ref_add(const struct lu_env *env,
+                            struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_ref_add);
+       return dt->do_ops->do_ref_add(env, dt, th);
+}
+
+static inline int dt_declare_ref_del(const struct lu_env *env,
+                                    struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_ref_del);
+       return dt->do_ops->do_declare_ref_del(env, dt, th);
+}
+
+static inline int dt_ref_del(const struct lu_env *env,
+                            struct dt_object *dt, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_ref_del);
+       return dt->do_ops->do_ref_del(env, dt, th);
+}
+
+static inline struct obd_capa *dt_capa_get(const struct lu_env *env,
+                                          struct dt_object *dt,
+                                          struct lustre_capa *old, __u64 opc)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_capa_get);
+       return dt->do_ops->do_capa_get(env, dt, old, opc);
+}
+
+static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
+                             struct niobuf_remote *rnb,
+                             struct niobuf_local *lnb, int rw,
+                             struct lustre_capa *capa)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_bufs_get);
+       return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset,
+                                           rnb->len, lnb, rw, capa);
+}
+
+static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d,
+                             struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_bufs_put);
+       return d->do_body_ops->dbo_bufs_put(env, d, lnb, n);
+}
+
+static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d,
+                               struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_write_prep);
+       return d->do_body_ops->dbo_write_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_write_commit(const struct lu_env *env,
+                                         struct dt_object *d,
+                                         struct niobuf_local *lnb,
+                                         int n, struct thandle *th)
+{
+       LASSERTF(d != NULL, "dt is NULL when we want to declare write\n");
+       LASSERT(th != NULL);
+       return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_write_commit(const struct lu_env *env,
+                                 struct dt_object *d, struct niobuf_local *lnb,
+                                 int n, struct thandle *th)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_write_commit);
+       return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th);
+}
+
+static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d,
+                              struct niobuf_local *lnb, int n)
+{
+       LASSERT(d);
+       LASSERT(d->do_body_ops);
+       LASSERT(d->do_body_ops->dbo_read_prep);
+       return d->do_body_ops->dbo_read_prep(env, d, lnb, n);
+}
+
+static inline int dt_declare_punch(const struct lu_env *env,
+                                  struct dt_object *dt, __u64 start,
+                                  __u64 end, struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_declare_punch);
+       return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th);
+}
+
+static inline int dt_punch(const struct lu_env *env, struct dt_object *dt,
+                          __u64 start, __u64 end, struct thandle *th,
+                          struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_punch);
+       return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa);
+}
+
+static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d,
+                               struct ll_user_fiemap *fm)
+{
+       LASSERT(d);
+       if (d->do_body_ops == NULL)
+               return -EPROTO;
+       if (d->do_body_ops->dbo_fiemap_get == NULL)
+               return -EOPNOTSUPP;
+       return d->do_body_ops->dbo_fiemap_get(env, d, fm);
+}
+
+static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev,
+                           struct obd_statfs *osfs)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_statfs);
+       return dev->dd_ops->dt_statfs(env, dev, osfs);
+}
+
+static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev,
+                             struct lu_fid *f)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_root_get);
+       return dev->dd_ops->dt_root_get(env, dev, f);
+}
+
+static inline void dt_conf_get(const struct lu_env *env,
+                              const struct dt_device *dev,
+                              struct dt_device_param *param)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_conf_get);
+       dev->dd_ops->dt_conf_get(env, dev, param);
+}
+
+static inline int dt_sync(const struct lu_env *env, struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_sync);
+       return dev->dd_ops->dt_sync(env, dev);
+}
+
+static inline int dt_ro(const struct lu_env *env, struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_ro);
+       return dev->dd_ops->dt_ro(env, dev);
+}
+
+static inline int dt_declare_insert(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   const struct dt_rec *rec,
+                                   const struct dt_key *key,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_declare_insert);
+       return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th);
+}
+
+static inline int dt_insert(const struct lu_env *env,
+                           struct dt_object *dt,
+                           const struct dt_rec *rec,
+                           const struct dt_key *key,
+                           struct thandle *th,
+                           struct lustre_capa *capa,
+                           int noquota)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_insert);
+       return dt->do_index_ops->dio_insert(env, dt, rec, key, th,
+                                           capa, noquota);
+}
+
+static inline int dt_declare_xattr_del(const struct lu_env *env,
+                                      struct dt_object *dt,
+                                      const char *name,
+                                      struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_xattr_del);
+       return dt->do_ops->do_declare_xattr_del(env, dt, name, th);
+}
+
+static inline int dt_xattr_del(const struct lu_env *env,
+                              struct dt_object *dt, const char *name,
+                              struct thandle *th,
+                              struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_del);
+       return dt->do_ops->do_xattr_del(env, dt, name, th, capa);
+}
+
+static inline int dt_declare_xattr_set(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     const struct lu_buf *buf,
+                                     const char *name, int fl,
+                                     struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_declare_xattr_set);
+       return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th);
+}
+
+static inline int dt_xattr_set(const struct lu_env *env,
+                             struct dt_object *dt, const struct lu_buf *buf,
+                             const char *name, int fl, struct thandle *th,
+                             struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_set);
+       return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa);
+}
+
+static inline int dt_xattr_get(const struct lu_env *env,
+                             struct dt_object *dt, struct lu_buf *buf,
+                             const char *name, struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_get);
+       return dt->do_ops->do_xattr_get(env, dt, buf, name, capa);
+}
+
+static inline int dt_xattr_list(const struct lu_env *env,
+                              struct dt_object *dt, struct lu_buf *buf,
+                              struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_ops);
+       LASSERT(dt->do_ops->do_xattr_list);
+       return dt->do_ops->do_xattr_list(env, dt, buf, capa);
+}
+
+static inline int dt_declare_delete(const struct lu_env *env,
+                                   struct dt_object *dt,
+                                   const struct dt_key *key,
+                                   struct thandle *th)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_declare_delete);
+       return dt->do_index_ops->dio_declare_delete(env, dt, key, th);
+}
+
+static inline int dt_delete(const struct lu_env *env,
+                           struct dt_object *dt,
+                           const struct dt_key *key,
+                           struct thandle *th,
+                           struct lustre_capa *capa)
+{
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_delete);
+       return dt->do_index_ops->dio_delete(env, dt, key, th, capa);
+}
+
+static inline int dt_commit_async(const struct lu_env *env,
+                                 struct dt_device *dev)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_commit_async);
+       return dev->dd_ops->dt_commit_async(env, dev);
+}
+
+static inline int dt_init_capa_ctxt(const struct lu_env *env,
+                                   struct dt_device *dev,
+                                   int mode, unsigned long timeout,
+                                   __u32 alg, struct lustre_capa_key *keys)
+{
+       LASSERT(dev);
+       LASSERT(dev->dd_ops);
+       LASSERT(dev->dd_ops->dt_init_capa_ctxt);
+       return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode,
+                                             timeout, alg, keys);
+}
+
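+/* Index lookup: dio_lookup() returns positive on a hit and 0 on a miss;
+ * translate that to the usual 0 / -ENOENT convention for callers. */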
+static inline int dt_lookup(const struct lu_env *env,
+                           struct dt_object *dt,
+                           struct dt_rec *rec,
+                           const struct dt_key *key,
+                           struct lustre_capa *capa)
+{
+       int ret;
+
+       LASSERT(dt);
+       LASSERT(dt->do_index_ops);
+       LASSERT(dt->do_index_ops->dio_lookup);
+
+       ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa);
+       if (ret > 0)
+               ret = 0;
+       else if (ret == 0)
+               ret = -ENOENT;
+       return ret;
+}
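+
+/*
+ * Usage sketch (editorial): the wrappers above implement the OSD two-phase
+ * transaction protocol -- declare every change against the handle, start
+ * the transaction, execute, then stop.  Assuming the dt_trans_create() and
+ * dt_trans_start() wrappers declared earlier in this header:
+ *
+ *     struct thandle *th = dt_trans_create(env, dev);
+ *
+ *     if (IS_ERR(th))
+ *             return PTR_ERR(th);
+ *     rc = dt_declare_attr_set(env, obj, attr, th);
+ *     if (rc == 0)
+ *             rc = dt_trans_start(env, dev, th);
+ *     if (rc == 0)
+ *             rc = dt_attr_set(env, obj, attr, th, NULL);
+ *     dt_trans_stop(env, dev, th);
+ */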
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+struct dt_find_hint {
+       struct lu_fid   *dfh_fid;
+       struct dt_device     *dfh_dt;
+       struct dt_object     *dfh_o;
+};
+
+struct dt_thread_info {
+       char                 dti_buf[DT_MAX_PATH];
+       struct dt_find_hint      dti_dfh;
+       struct lu_attr     dti_attr;
+       struct lu_fid       dti_fid;
+       struct dt_object_format  dti_dof;
+       struct lustre_mdt_attrs  dti_lma;
+       struct lu_buf       dti_lb;
+       loff_t             dti_off;
+};
+
+extern struct lu_context_key dt_key;
+
+static inline struct dt_thread_info *dt_info(const struct lu_env *env)
+{
+       struct dt_thread_info *dti;
+
+       dti = lu_context_key_get(&env->le_ctx, &dt_key);
+       LASSERT(dti);
+       return dti;
+}
+
+int dt_global_init(void);
+void dt_global_fini(void);
+
+# ifdef LPROCFS
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+                             int count, int *eof, void *data);
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+                           int count, int *eof, void *data);
+# endif /* LPROCFS */
+
+#endif /* __LUSTRE_DT_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h
new file mode 100644 (file)
index 0000000..dfdb8aa
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/interval_tree.h
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <linux/libcfs/libcfs.h>   /* LASSERT. */
+
+struct interval_node {
+       struct interval_node   *in_left;
+       struct interval_node   *in_right;
+       struct interval_node   *in_parent;
+       unsigned                in_color:1,
+                               in_intree:1, /** set if the node is in tree */
+                               in_res1:30;
+       __u8                in_res2[4];  /** tags, 8-byte aligned */
+       __u64              in_max_high;
+       struct interval_node_extent {
+               __u64 start;
+               __u64 end;
+       } in_extent;
+};
+
+enum interval_iter {
+       INTERVAL_ITER_CONT = 1,
+       INTERVAL_ITER_STOP = 2
+};
+
+static inline int interval_is_intree(struct interval_node *node)
+{
+       return node->in_intree == 1;
+}
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+       return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+       return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+                               __u64 start, __u64 end)
+{
+       LASSERT(start <= end);
+       node->in_extent.start = start;
+       node->in_extent.end = end;
+       node->in_max_high = end;
+}
+
+/* Rules for writing an interval callback:
+ *  - the callback returns INTERVAL_ITER_STOP when the iteration should
+ *    stop; the iteration function then returns INTERVAL_ITER_STOP
+ *    immediately.
+ *  - callbacks for interval_iterate and interval_iterate_reverse: every
+ *    node in the tree is passed as @node before the callback is called.
+ *  - callback for interval_search: only nodes overlapping the search
+ *    extent are passed as @node before the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+                                                 void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                     struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each extent that
+ * overlaps @ex. */
+enum interval_iter interval_search(struct interval_node *root,
+                                  struct interval_node_extent *ex,
+                                  interval_callback_t func, void *data);
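+
+/* Example (editorial sketch): a callback obeying the rules above, stopping
+ * as soon as a node starting at the searched offset is seen:
+ *
+ *     static enum interval_iter match_start_cb(struct interval_node *node,
+ *                                              void *args)
+ *     {
+ *             __u64 *start = args;
+ *
+ *             return interval_low(node) == *start ?
+ *                    INTERVAL_ITER_STOP : INTERVAL_ITER_CONT;
+ *     }
+ *
+ * used as: interval_search(root, &ext, match_start_cb, &start);
+ */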
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+                                   interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                   interval_callback_t func, void *data);
+
+void interval_expand(struct interval_node *root,
+                    struct interval_node_extent *ext,
+                    struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root,
+                          struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+                                   struct interval_node_extent *ex);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h
new file mode 100644 (file)
index 0000000..227c261
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _IOWR
+
+/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h,
+ * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H.
+ *
+ * We can avoid any problems with the kernel header being included again by
+ * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h>
+ * does not include the kernel's ioctl.h after this one. b=14746 */
+#define _ASM_I386_IOCTL_H
+#define _ASM_GENERIC_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits; the highest 2 bits encode the ``access mode'' of
+ * the parameter structure.
+ * NOTE: This limits the max parameter size to 16kB - 1!
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms.  The i386 ioctl numbering scheme doesn't really enforce
+ * a type field.  De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here.  Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS     8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK     ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE       0U
+#define _IOC_WRITE      1U
+#define _IOC_READ       2U
+
+#define _IOC(dir,type,nr,size)                                         \
+       (((dir)  << _IOC_DIRSHIFT)  |                                   \
+        ((type) << _IOC_TYPESHIFT) |                                   \
+        ((nr)   << _IOC_NRSHIFT)   |                                   \
+        ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)       _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)      _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size)     _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)       (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)     (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)         (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)     (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
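+/* Example (editorial): a hypothetical command defined as
+ *     #define EX_IOC_GETDATA _IOR('f', 1, long)
+ * decodes as _IOC_DIR() == _IOC_READ, _IOC_TYPE() == 'f', _IOC_NR() == 1
+ * and _IOC_SIZE() == sizeof(long). */
+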
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN   (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT         (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT       ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK    (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT   (_IOC_SIZESHIFT)
+
+#endif /* _IOWR */
diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h
new file mode 100644 (file)
index 0000000..9d4011f
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+blkcnt_t dirty_cnt(struct inode *inode);
+
+int cl_glimpse_size0(struct inode *inode, int agl);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                   struct inode *inode, struct cl_object *clob, int agl);
+
+static inline int cl_glimpse_size(struct inode *inode)
+{
+       return cl_glimpse_size0(inode, 0);
+}
+
+static inline int cl_agl(struct inode *inode)
+{
+       return cl_glimpse_size0(inode, 1);
+}
+
+/**
+ * Locking policy for setattr.
+ */
+enum ccc_setattr_lock_type {
+       /** Locking is done by server */
+       SETATTR_NOLOCK,
+       /** Extent lock is enqueued */
+       SETATTR_EXTENT_LOCK,
+       /** Existing local extent lock is used */
+       SETATTR_MATCH_LOCK
+};
+
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+       /** super class */
+       struct cl_io_slice     cui_cl;
+       struct cl_io_lock_link cui_link;
+       /**
+        * I/O vector information to or from which read/write is going.
+        */
+       struct iovec *cui_iov;
+       unsigned long cui_nrsegs;
+       /**
+        * Total iov count for left IO.
+        */
+       unsigned long cui_tot_nrsegs;
+       /**
+        * Old length for iov that was truncated partially.
+        */
+       size_t cui_iov_olen;
+       /**
+        * Total size for the left IO.
+        */
+       size_t cui_tot_count;
+
+       union {
+               struct {
+                       enum ccc_setattr_lock_type cui_local_lock;
+               } setattr;
+       } u;
+       /**
+        * True iff io is processing glimpse right now.
+        */
+       int               cui_glimpse;
+       /**
+        * Layout version when this IO is initialized
+        */
+       __u32           cui_layout_gen;
+       /**
+        * File descriptor against which IO is done.
+        */
+       struct ll_file_data *cui_fd;
+       struct kiocb *cui_iocb;
+};
+
+/**
+ * True if \a io is a normal io, false for others (sendfile, splice*);
+ * must be implemented in arch-specific code.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io);
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+       struct cl_lock_descr cti_descr;
+       struct cl_io     cti_io;
+       struct cl_attr       cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+       struct ccc_thread_info      *info;
+
+       info = lu_context_key_get(&env->le_ctx, &ccc_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+       struct cl_attr *attr = &ccc_env_info(env)->cti_attr;
+       memset(attr, 0, sizeof(*attr));
+       return attr;
+}
+
+static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env)
+{
+       struct cl_io *io = &ccc_env_info(env)->cti_io;
+       memset(io, 0, sizeof(*io));
+       return io;
+}
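+
+/* Usage sketch (editorial): these accessors hand out per-environment
+ * scratch objects, so callers avoid allocating the large cl_io/cl_attr
+ * structures on the stack; a typical (hypothetical) caller looks like:
+ *
+ *     struct cl_io *io = ccc_env_thread_io(env);
+ *
+ *     io->ci_obj = clob;
+ *     rc = cl_io_init(env, io, CIT_MISC, clob);
+ */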
+
+struct ccc_session {
+       struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+       struct ccc_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &ccc_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+       return &ccc_env_session(env)->cs_ios;
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+       struct cl_object_header cob_header;
+       struct cl_object        cob_cl;
+       struct inode       *cob_inode;
+
+       /**
+        * A list of dirty pages pending IO in the cache. Used by
+        * SOM. Protected by ll_inode_info::lli_lock.
+        *
+        * \see ccc_page::cpg_pending_linkage
+        */
+       struct list_head             cob_pending_list;
+
+       /**
+        * Access to this counter is protected by inode->i_sem. Now that
+        * the lifetime of transient pages must be covered by the inode
+        * semaphore, we do not need to hold any other lock.
+        */
+       int                  cob_transient_pages;
+       /**
+        * Number of outstanding mmaps on this file.
+        *
+        * \see ll_vm_open(), ll_vm_close().
+        */
+       atomic_t            cob_mmap_cnt;
+
+       /**
+        * Various flags.
+        * cob_discard_page_warned
+        *     If pages belonging to this object are discarded when a client
+        * is evicted, some debug info is printed. This flag is set while
+        * processing the first discarded page, so the debug message is not
+        * repeated for every subsequent discarded page.
+        *
+        * \see ll_dirty_page_discard_warn.
+        */
+       unsigned int            cob_discard_page_warned:1;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+       struct cl_page_slice cpg_cl;
+       int               cpg_defer_uptodate;
+       int               cpg_ra_used;
+       int               cpg_write_queued;
+       /**
+        * Non-empty iff this page is already counted in
+        * ccc_object::cob_pending_list. Protected by
+        * ccc_object::cob_pending_guard. This list is only used as a flag,
+        * that is, never iterated through, only checked for list_empty(), but
+        * having a list is useful for debugging.
+        */
+       struct list_head           cpg_pending_linkage;
+       /** VM page */
+       struct page       *cpg_page;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+       return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page    *ccc_vmpage_page_transient(struct page *vmpage);
+
+struct ccc_device {
+       struct cl_device    cdv_cl;
+       struct super_block *cdv_sb;
+       struct cl_device   *cdv_next;
+};
+
+struct ccc_lock {
+       struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+       struct cl_req_slice  crq_cl;
+};
+
+void *ccc_key_init     (const struct lu_context *ctx,
+                          struct lu_context_key *key);
+void  ccc_key_fini     (const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key);
+void  ccc_session_key_fini(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+
+int          ccc_device_init  (const struct lu_env *env,
+                                  struct lu_device *d,
+                                  const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+                                  struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                  struct lu_device_type *t,
+                                  struct lustre_cfg *cfg,
+                                  const struct lu_device_operations *luops,
+                                  const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+                                  struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev,
+                                  const struct cl_object_operations *clops,
+                                  const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env, struct ccc_object *vob,
+                    const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io,
+                 const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+                      const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_object_conf *conf);
+struct page *ccc_page_vmpage(const struct lu_env *env,
+                           const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+int  ccc_transient_page_own(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io, int nonblock);
+void ccc_transient_page_assume(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+                               const struct cl_page_slice *slice,
+                               struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+void ccc_lock_delete(const struct lu_env *env,
+                    const struct cl_lock_slice *slice);
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,
+                    const struct cl_lock_slice *slice,
+                    struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env,
+                  const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env,
+                 const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+                      const struct cl_lock_slice *slice,
+                      const struct cl_lock_descr *need,
+                      const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+                   const struct cl_lock_slice *slice,
+                   enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                         __u32 enqflags, enum cl_lock_mode mode,
+                         pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                   __u32 enqflags, enum cl_lock_mode mode,
+                   loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+                   size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+                      struct cl_io *io);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_io *io, loff_t start, size_t count, int *exceed);
+void ccc_req_completion(const struct lu_env *env,
+                       const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,
+                     const struct cl_req_slice *slice,
+                     const struct cl_object *obj,
+                     struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device   *ccc2lu_dev      (struct ccc_device *vdv);
+struct lu_object   *ccc2lu       (struct ccc_object *vob);
+struct ccc_device  *lu2ccc_dev      (const struct lu_device *d);
+struct ccc_device  *cl2ccc_dev      (const struct cl_device *d);
+struct ccc_object  *lu2ccc       (const struct lu_object *obj);
+struct ccc_object  *cl2ccc       (const struct cl_object *obj);
+struct ccc_lock    *cl2ccc_lock     (const struct cl_lock_slice *slice);
+struct ccc_io      *cl2ccc_io       (const struct lu_env *env,
+                                    const struct cl_io_slice *slice);
+struct ccc_req     *cl2ccc_req      (const struct cl_req_slice *slice);
+struct page     *cl2vm_page      (const struct cl_page_slice *slice);
+struct inode       *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object  *cl_inode2ccc    (struct inode *inode);
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+                  struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+__u16 ll_dirent_type_get(struct lu_dirent *ent);
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32);
+__u32 cl_fid_build_gen(const struct lu_fid *fid);
+
+# define CLOBINVRNT(env, clob, expr)                               \
+       ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
+int cl_ocd_update(struct obd_device *host,
+                 struct obd_device *watched,
+                 enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+       struct lu_env   *cg_env;
+       struct cl_io    *cg_io;
+       struct cl_lock  *cg_lock;
+       unsigned long    cg_gid;
+};
+
+int  cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                     struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
+
+/**
+ * New interfaces to get and put lov_stripe_md from lov layer. This violates
+ * layering because lov_stripe_md is supposed to be a private data in lov.
+ *
+ * NB: If you find you have to use these interfaces for your new code, please
+ * think about it again. These interfaces may be removed in the future for
+ * better layering. */
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj);
+void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm);
+int lov_read_and_clear_async_rc(struct cl_object *clob);
+
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode);
+void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm);
+
+/**
+ * Data structure managing a client's cached clean pages. An LRU of
+ * pages is maintained, along with other statistics.
+ */
+struct cl_client_cache {
+       atomic_t        ccc_users;    /* # of users (OSCs) of this data */
+       struct list_head        ccc_lru;      /* LRU list of cached clean pages */
+       spinlock_t      ccc_lru_lock; /* lock for list */
+       atomic_t        ccc_lru_left; /* # of LRU entries available */
+       unsigned long   ccc_lru_max;  /* Max # of LRU entries possible */
+       unsigned int    ccc_lru_shrinkers; /* # of threads reclaiming */
+};
+
+#endif /* LCLIENT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h
new file mode 100644 (file)
index 0000000..5866922
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LINUX_LPROCFS_SNMP_H
+#define _LINUX_LPROCFS_SNMP_H
+
+#ifndef _LPROCFS_SNMP_H
+#error Do not #include this file directly. #include <lprocfs_status.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include <linux/smp.h>
+#include <linux/rwsem.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/statfs.h>
+
+
+#endif /* _LINUX_LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h
new file mode 100644 (file)
index 0000000..ff4fc4f
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_acl.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_LINUX_ACL_H
+#define _LUSTRE_LINUX_ACL_H
+
+#ifndef _LUSTRE_ACL_H
+#error Do not #include this file directly. #include <lustre_acl.h> instead
+#endif
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+#  define LUSTRE_POSIX_ACL_MAX_ENTRIES 32
+#  define LUSTRE_POSIX_ACL_MAX_SIZE                                    \
+       (sizeof(posix_acl_xattr_header) +                               \
+        LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry))
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */
+
+#ifndef LUSTRE_POSIX_ACL_MAX_SIZE
+# define LUSTRE_POSIX_ACL_MAX_SIZE   0
+#endif
+
+#endif /* _LUSTRE_LINUX_ACL_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h
new file mode 100644 (file)
index 0000000..d1783a3
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef LUSTRE_COMMON_H
+#define LUSTRE_COMMON_H
+
+#include <linux/sched.h>
+
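+/* Drop all supplementary groups from the current task by installing a
+ * freshly allocated, empty group_info. */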
+static inline int cfs_cleanup_group_info(void)
+{
+       struct group_info *ginfo;
+
+       ginfo = groups_alloc(0);
+       if (!ginfo)
+               return -ENOMEM;
+
+       set_current_groups(ginfo);
+       put_group_info(ginfo);
+
+       return 0;
+}
+
+#define ll_inode_blksize(a)            (1<<(a)->i_blkbits)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h
new file mode 100644 (file)
index 0000000..dff0468
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_COMPAT25_H
+#define _LINUX_COMPAT25_H
+
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/lustre_patchless_compat.h>
+
+# define LOCK_FS_STRUCT(fs)    spin_lock(&(fs)->lock)
+# define UNLOCK_FS_STRUCT(fs)  spin_unlock(&(fs)->lock)
+
+static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+                                struct dentry *dentry)
+{
+       struct path path;
+       struct path old_pwd;
+
+       path.mnt = mnt;
+       path.dentry = dentry;
+       LOCK_FS_STRUCT(fs);
+       old_pwd = fs->pwd;
+       path_get(&path);
+       fs->pwd = path;
+       UNLOCK_FS_STRUCT(fs);
+
+       if (old_pwd.dentry)
+               path_put(&old_pwd);
+}
+
+
+/*
+ * set ATTR_BLOCKS to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_BLOCKS    (1 << 27)
+
+#define current_ngroups current_cred()->group_info->ngroups
+#define current_groups current_cred()->group_info->small_block
+
+/*
+ * OBD need working random driver, thus all our
+ * initialization routines must be called after device
+ * driver initialization
+ */
+#ifndef MODULE
+#undef module_init
+#define module_init(a)     late_initcall(a)
+#endif
+
+
+#define LTIME_S(time)             (time.tv_sec)
+
+#define ll_permission(inode,mask,nd)    inode_permission(inode,mask)
+
+# define ll_generic_permission(inode, mask, flags, check_acl) \
+        generic_permission(inode, mask)
+
+#define ll_blkdev_put(a, b) blkdev_put(a, b)
+
+#define ll_dentry_open(a,b,c)  dentry_open(a,b,c)
+
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                      vfs_symlink(dir, dentry, path)
+
+
+#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \
+               generic_file_llseek_size(file, offset, origin, maxbytes, eof)
+
+/* inode_dio_wait(i) use as-is for write lock */
+# define inode_dio_write_done(i)       do {} while (0) /* for write unlock */
+# define inode_dio_read(i)             atomic_inc(&(i)->i_dio_count)
+/* inode_dio_done(i) use as-is for read unlock */
+
+#define TREE_READ_LOCK_IRQ(mapping)    spin_lock_irq(&(mapping)->tree_lock)
+#define TREE_READ_UNLOCK_IRQ(mapping)  spin_unlock_irq(&(mapping)->tree_lock)
+
+static inline
+int ll_unregister_blkdev(unsigned int dev, const char *name)
+{
+       unregister_blkdev(dev, name);
+       return 0;
+}
+
+#define ll_invalidate_bdev(a,b)         invalidate_bdev((a))
+
+#ifndef FS_HAS_FIEMAP
+#define FS_HAS_FIEMAP                  (0)
+#endif
+
+/* add a lustre compatible layer for crypto API */
+#include <linux/crypto.h>
+#define ll_crypto_hash   crypto_hash
+#define ll_crypto_cipher       crypto_blkcipher
+#define ll_crypto_alloc_hash(name, type, mask)  crypto_alloc_hash(name, type, mask)
+#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen)
+#define ll_crypto_hash_init(desc)             crypto_hash_init(desc)
+#define ll_crypto_hash_update(desc, sl, bytes)  crypto_hash_update(desc, sl, bytes)
+#define ll_crypto_hash_final(desc, out)         crypto_hash_final(desc, out)
+#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
+               crypto_blkcipher_setkey(tfm, key, keylen)
+#define ll_crypto_blkcipher_set_iv(tfm, src, len) \
+               crypto_blkcipher_set_iv(tfm, src, len)
+#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \
+               crypto_blkcipher_get_iv(tfm, dst, len)
+#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \
+               crypto_blkcipher_encrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \
+               crypto_blkcipher_decrypt(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \
+               crypto_blkcipher_encrypt_iv(desc, dst, src, bytes)
+#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \
+               crypto_blkcipher_decrypt_iv(desc, dst, src, bytes)
+
+static inline
+struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name,
+                                                  u32 type, u32 mask)
+{
+       struct ll_crypto_cipher *rtn = crypto_alloc_blkcipher(name, type, mask);
+
+       return (rtn == NULL ? ERR_PTR(-ENOMEM) : rtn);
+}
+
+static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
+                                u8 *key, unsigned int *keylen,
+                                struct scatterlist *sg,
+                                unsigned int size, u8 *result)
+{
+       struct hash_desc desc;
+       int           rv;
+       desc.tfm   = tfm;
+       desc.flags = 0;
+       rv = crypto_hash_setkey(desc.tfm, key, *keylen);
+       if (rv) {
+               CERROR("failed to hash setkey: %d\n", rv);
+               return rv;
+       }
+       return crypto_hash_digest(&desc, sg, size, result);
+}
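+
+/*
+ * Example (editorial sketch): one-shot HMAC over a single flat buffer,
+ * assuming the "hmac(sha1)" algorithm is available in the kernel:
+ *
+ *     struct ll_crypto_hash *tfm = ll_crypto_alloc_hash("hmac(sha1)", 0, 0);
+ *
+ *     if (!IS_ERR(tfm)) {
+ *             struct scatterlist sg;
+ *
+ *             sg_init_one(&sg, buf, len);
+ *             rc = ll_crypto_hmac(tfm, key, &keylen, &sg, len, digest);
+ *             ll_crypto_free_hash(tfm);
+ *     }
+ */
+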
+static inline
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+{
+       return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
+}
+static inline
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+{
+       return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
+}
+
+#define ll_crypto_hash_blocksize(tfm)       crypto_hash_blocksize(tfm)
+#define ll_crypto_hash_digestsize(tfm)      crypto_hash_digestsize(tfm)
+#define ll_crypto_blkcipher_ivsize(tfm)     crypto_blkcipher_ivsize(tfm)
+#define ll_crypto_blkcipher_blocksize(tfm)  crypto_blkcipher_blocksize(tfm)
+#define ll_crypto_free_hash(tfm)           crypto_free_hash(tfm)
+#define ll_crypto_free_blkcipher(tfm)       crypto_free_blkcipher(tfm)
+
+#define ll_vfs_rmdir(dir,entry,mnt)         vfs_rmdir(dir,entry)
+#define ll_vfs_mkdir(inode,dir,mnt,mode)       vfs_mkdir(inode,dir,mode)
+#define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
+#define ll_vfs_unlink(inode,entry,mnt)   vfs_unlink(inode,entry)
+#define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
+#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
+               vfs_rename(old,old_dir,new,new_dir)
+
+#ifdef for_each_possible_cpu
+#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
+#elif defined(for_each_cpu)
+#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu)
+#endif
+
+#define cfs_bio_io_error(a,b)   bio_io_error((a))
+#define cfs_bio_endio(a,b,c)    bio_endio((a),(c))
+
+#define cfs_fs_pwd(fs)       ((fs)->pwd.dentry)
+#define cfs_fs_mnt(fs)       ((fs)->pwd.mnt)
+#define cfs_path_put(nd)     path_put(&(nd)->path)
+
+
+#ifndef SLAB_DESTROY_BY_RCU
+#define SLAB_DESTROY_BY_RCU 0
+#endif
+
+static inline int
+ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount)
+{
+       int rc;
+
+       if (sb->s_qcop->quota_on) {
+               struct path path;
+
+               rc = kern_path(name, LOOKUP_FOLLOW, &path);
+               if (rc)
+                       return rc;
+               rc = sb->s_qcop->quota_on(sb, off, ver, &path);
+               path_put(&path);
+               return rc;
+       } else {
+               return -ENOSYS;
+       }
+}
+
+static inline int ll_quota_off(struct super_block *sb, int off, int remount)
+{
+       if (sb->s_qcop->quota_off)
+               return sb->s_qcop->quota_off(sb, off);
+       else
+               return -ENOSYS;
+}
+
+# define ll_vfs_dq_init             dquot_initialize
+# define ll_vfs_dq_drop             dquot_drop
+# define ll_vfs_dq_transfer         dquot_transfer
+# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1)
+
+#define queue_max_phys_segments(rq) queue_max_segments(rq)
+#define queue_max_hw_segments(rq)   queue_max_segments(rq)
+
+#define ll_kmap_atomic(a, b)   kmap_atomic(a)
+#define ll_kunmap_atomic(a, b) kunmap_atomic(a)
+
+#define ll_d_hlist_node hlist_node
+#define ll_d_hlist_empty(list) hlist_empty(list)
+#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name)
+#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry)
+#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \
+       p = NULL; hlist_for_each_entry(dentry, i_dentry, alias)
+
+#define bio_hw_segments(q, bio) 0
+
+#define ll_pagevec_init(pv, cold)       do {} while (0)
+#define ll_pagevec_add(pv, pg)          (0)
+#define ll_pagevec_lru_add_file(pv)     do {} while (0)
+
+#ifndef QUOTA_OK
+# define QUOTA_OK 0
+#endif
+#ifndef NO_QUOTA
+# define NO_QUOTA (-EDQUOT)
+#endif
+
+#ifndef SEEK_DATA
+#define SEEK_DATA      3       /* seek to the next data */
+#endif
+#ifndef SEEK_HOLE
+#define SEEK_HOLE      4       /* seek to the next hole */
+#endif
+
+#ifndef FMODE_UNSIGNED_OFFSET
+#define FMODE_UNSIGNED_OFFSET  ((__force fmode_t)0x2000)
+#endif
+
+#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit)
+# define ext2_set_bit             __test_and_set_bit_le
+# define ext2_clear_bit           __test_and_clear_bit_le
+# define ext2_test_bit            test_bit_le
+# define ext2_find_first_zero_bit find_first_zero_bit_le
+# define ext2_find_next_zero_bit  find_next_zero_bit_le
+#endif
+
+#ifdef ATTR_TIMES_SET
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+#else
+# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET)
+#endif
+
+/*
+ * After kernel 3.1, nameidata.intent.open.flags differs from Lustre's
+ * lookup_intent.it_flags: the lower bits of it_flags equal FMODE_xxx,
+ * while the kernel does not translate the lower bits of
+ * nameidata.intent.open.flags to FMODE_xxx.
+ */
+#include <linux/version.h>
+static inline int ll_namei_to_lookup_intent_flag(int flag)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0)
+       flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag);
+#endif
+       return flag;
+}
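+
+/*
+ * Worked example (illustrative): with flag == O_RDWR, OPEN_FMODE(flag)
+ * yields FMODE_READ | FMODE_WRITE, so the O_ACCMODE bits are replaced by
+ * their FMODE_* equivalents while all other O_* bits pass through intact.
+ */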
+
+# define ll_mrf_ret void
+# define LL_MRF_RETURN(rc)
+
+#include <linux/fs.h>
+
+# define ll_umode_t    umode_t
+
+#include <linux/dcache.h>
+
+# define ll_dirty_inode(inode, flag) \
+       (inode)->i_sb->s_op->dirty_inode((inode), flag)
+
+#endif /* _COMPAT25_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h
new file mode 100644 (file)
index 0000000..11deac7
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DEBUG_H
+#define _LINUX_LUSTRE_DEBUG_H
+
+#ifndef _LUSTRE_DEBUG_H
+#error Do not #include this file directly. #include <lustre_debug.h> instead
+#endif
+
+#define LL_CDEBUG_PAGE(mask, page, fmt, arg...)                               \
+       CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
+              fmt, page, page->mapping, page->index, (long)page->flags,      \
+              page_count(page), page_private(page), ## arg)
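+
+/*
+ * Usage sketch (illustrative; D_PAGE and the error path are assumptions):
+ *
+ *     if (rc != 0)
+ *             LL_CDEBUG_PAGE(D_PAGE, page, "read failed: %d\n", rc);
+ */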
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h
new file mode 100644 (file)
index 0000000..207df03
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_DLM_H__
+#define _LINUX_LUSTRE_DLM_H__
+
+#ifndef _LUSTRE_DLM_H__
+#error Do not #include this file directly. #include <lustre_dlm.h> instead
+#endif
+
+#include <linux/proc_fs.h>
+#include <asm/processor.h>
+#include <linux/bit_spinlock.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h
new file mode 100644 (file)
index 0000000..6c72609
--- /dev/null
@@ -0,0 +1,181 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LINUX_LUSTRE_FSFILT_H
+#define _LINUX_LUSTRE_FSFILT_H
+
+#ifndef _LUSTRE_FSFILT_H
+#error Do not #include this file directly. #include <lustre_fsfilt.h> instead
+#endif
+
+
+#include <obd.h>
+#include <obd_class.h>
+
+typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
+                           void *data, int error);
+
+struct fsfilt_operations {
+       struct list_head fs_list;
+       module_t *fs_owner;
+       char   *fs_type;
+       char   *(*fs_getlabel)(struct super_block *sb);
+       void   *(*fs_start)(struct inode *inode, int op, void *desc_private,
+                           int logs);
+       int     (*fs_commit)(struct inode *inode, void *handle,
+                            int force_sync);
+       int     (*fs_map_inode_pages)(struct inode *inode, struct page **page,
+                                     int pages, unsigned long *blocks,
+                                     int create, struct mutex *sem);
+       int     (*fs_write_record)(struct file *, void *, int size, loff_t *,
+                                  int force_sync);
+       int     (*fs_read_record)(struct file *, void *, int size, loff_t *);
+       int     (*fs_setup)(struct super_block *sb);
+};
+
+extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
+extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops);
+extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
+extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
+
+static inline char *fsfilt_get_label(struct obd_device *obd,
+                                    struct super_block *sb)
+{
+       if (obd->obd_fsops->fs_getlabel == NULL)
+               return NULL;
+       if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0')
+               return NULL;
+
+       return obd->obd_fsops->fs_getlabel(sb);
+}
+
+#define FSFILT_OP_UNLINK               1
+#define FSFILT_OP_CANCEL_UNLINK         10
+
+#define __fsfilt_check_slow(obd, start, msg)                                  \
+do {                                                                          \
+       if (cfs_time_before(jiffies, start + 15 * HZ))                         \
+               break;                                                         \
+       else if (cfs_time_before(jiffies, start + 30 * HZ))                    \
+               CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name,        \
+                      msg, (jiffies - start) / HZ);                           \
+       else if (cfs_time_before(jiffies, start + DISK_TIMEOUT * HZ))          \
+               CWARN("%s: slow %s %lus\n", obd->obd_name, msg,                \
+                     (jiffies - start) / HZ);                                 \
+       else                                                                   \
+               CERROR("%s: slow %s %lus\n", obd->obd_name, msg,               \
+                      (jiffies - start) / HZ);                                \
+} while (0)
+
+#define fsfilt_check_slow(obd, start, msg)                                    \
+do {                                                                          \
+       __fsfilt_check_slow(obd, start, msg);                                  \
+       start = jiffies;                                                       \
+} while (0)
+
+static inline void *fsfilt_start_log(struct obd_device *obd,
+                                    struct inode *inode, int op,
+                                    struct obd_trans_info *oti, int logs)
+{
+       unsigned long now = jiffies;
+       void *parent_handle = oti ? oti->oti_handle : NULL;
+       void *handle;
+
+       handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs);
+       CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
+
+       if (oti != NULL) {
+               if (parent_handle == NULL) {
+                       oti->oti_handle = handle;
+               } else if (handle != parent_handle) {
+                       CERROR("mismatch: parent %p, handle %p, oti %p\n",
+                              parent_handle, handle, oti);
+                       LBUG();
+               }
+       }
+       fsfilt_check_slow(obd, now, "journal start");
+       return handle;
+}
+
+static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
+                               void *handle, int force_sync)
+{
+       unsigned long now = jiffies;
+       int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
+
+       CDEBUG(D_INFO, "committing handle %p\n", handle);
+       fsfilt_check_slow(obd, now, "journal commit");
+
+       return rc;
+}
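+
+/*
+ * Minimal pairing sketch (illustrative, not code from this patch; "obd",
+ * "inode" and "oti" stand in for caller state):
+ *
+ *     void *handle = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, oti, 1);
+ *     if (!IS_ERR(handle)) {
+ *             ... modify the filesystem under the open handle ...
+ *             rc = fsfilt_commit(obd, inode, handle, 0);
+ *     }
+ */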
+
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+                                        struct inode *inode,
+                                        struct page **page, int pages,
+                                        unsigned long *blocks,
+                                        int create, struct mutex *mutex)
+{
+       return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+                                                 create, mutex);
+}
+
+static inline int fsfilt_read_record(struct obd_device *obd, struct file *file,
+                                    void *buf, loff_t size, loff_t *offs)
+{
+       return obd->obd_fsops->fs_read_record(file, buf, size, offs);
+}
+
+static inline int fsfilt_write_record(struct obd_device *obd, struct file *file,
+                                     void *buf, loff_t size, loff_t *offs,
+                                     int force_sync)
+{
+       return obd->obd_fsops->fs_write_record(file, buf, size, offs,
+                                              force_sync);
+}
+
+static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs)
+{
+       if (obd->obd_fsops->fs_setup)
+               return obd->obd_fsops->fs_setup(fs);
+       return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h
new file mode 100644 (file)
index 0000000..ecf1840
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_LUSTRE_HANDLES_H_
+#define __LINUX_LUSTRE_HANDLES_H_
+
+#ifndef __LUSTRE_HANDLES_H_
+#error Do not #include this file directly. #include <lustre_handles.h> instead
+#endif
+
+#include <asm/types.h>
+#include <asm/atomic.h>
+#include <linux/list.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/rcupdate.h> /* for rcu_head{} */
+typedef struct rcu_head cfs_rcu_head_t;
+
+#endif /* __LINUX_LUSTRE_HANDLES_H_ */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h
new file mode 100644 (file)
index 0000000..b10ddfa
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_INTENT_H
+#define LUSTRE_INTENT_H
+
+/* intent IT_XXX are defined in lustre/include/obd.h */
+struct lustre_intent_data {
+       int             it_disposition;
+       int             it_status;
+       __u64           it_lock_handle;
+       __u64           it_lock_bits;
+       int             it_lock_mode;
+       int             it_remote_lock_mode;
+       __u64           it_remote_lock_handle;
+       void            *it_data;
+       unsigned int    it_lock_set:1;
+};
+
+struct lookup_intent {
+       int     it_op;
+       int     it_flags;
+       int     it_create_mode;
+       union {
+               struct lustre_intent_data lustre;
+       } d;
+};
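+
+/*
+ * Illustrative initialization (an assumption, not code from this patch;
+ * IT_OPEN and FMODE_READ come from other Lustre/kernel headers):
+ *
+ *     struct lookup_intent it = {
+ *             .it_op    = IT_OPEN,
+ *             .it_flags = FMODE_READ,
+ *     };
+ */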
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h
new file mode 100644 (file)
index 0000000..b2f755a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LINUX_LUSTRE_LIB_H
+#define _LINUX_LUSTRE_LIB_H
+
+#ifndef _LUSTRE_LIB_H
+#error Do not #include this file directly. #include <lustre_lib.h> instead
+#endif
+
+# include <linux/rwsem.h>
+# include <linux/sched.h>
+# include <linux/signal.h>
+# include <linux/types.h>
+# include <linux/lustre_compat25.h>
+# include <linux/lustre_common.h>
+
+#ifndef LP_POISON
+#if BITS_PER_LONG > 32
+# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
+#endif
+
+/* This macro is only for compatibility reasons with older Linux Lustre user
+ * tools. New ioctls should NOT use this macro as the ioctl "size". Instead
+ * the ioctl should get a "size" argument which is the actual data type used
+ * by the ioctl, to ensure the ioctl interface is versioned correctly. */
+#define OBD_IOC_DATA_TYPE             long
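+
+/*
+ * Hedged sketch of the two styles (the names and ioctl numbers below are
+ * hypothetical, for illustration only):
+ *
+ *     // legacy: "size" is only OBD_IOC_DATA_TYPE, not the real payload
+ *     #define OBD_IOC_EXAMPLE_OLD _IOWR('f', 0xf0, OBD_IOC_DATA_TYPE)
+ *     // preferred: encode the actual data type so the ABI stays versioned
+ *     #define OBD_IOC_EXAMPLE_NEW _IOWR('f', 0xf1, struct my_ioc_data)
+ */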
+
+#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) |                \
+                          sigmask(SIGTERM) | sigmask(SIGQUIT) |               \
+                          sigmask(SIGALRM))
+
+/* initialize ost_lvb according to inode */
+static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
+{
+       lvb->lvb_size = i_size_read(inode);
+       lvb->lvb_blocks = inode->i_blocks;
+       lvb->lvb_mtime = LTIME_S(inode->i_mtime);
+       lvb->lvb_atime = LTIME_S(inode->i_atime);
+       lvb->lvb_ctime = LTIME_S(inode->i_ctime);
+}
+
+#endif /* _LUSTRE_LIB_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h
new file mode 100644 (file)
index 0000000..c95dff9
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LL_H
+#define _LINUX_LL_H
+
+#ifndef _LL_H
+#error Do not #include this file directly. #include <lustre_lite.h> instead
+#endif
+
+#include <linux/version.h>
+
+#include <asm/statfs.h>
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/proc_fs.h>
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_ha.h>
+
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/pagemap.h>
+
+/* lprocfs.c */
+enum {
+       LPROC_LL_DIRTY_HITS = 0,
+       LPROC_LL_DIRTY_MISSES,
+       LPROC_LL_READ_BYTES,
+       LPROC_LL_WRITE_BYTES,
+       LPROC_LL_BRW_READ,
+       LPROC_LL_BRW_WRITE,
+       LPROC_LL_OSC_READ,
+       LPROC_LL_OSC_WRITE,
+       LPROC_LL_IOCTL,
+       LPROC_LL_OPEN,
+       LPROC_LL_RELEASE,
+       LPROC_LL_MAP,
+       LPROC_LL_LLSEEK,
+       LPROC_LL_FSYNC,
+       LPROC_LL_READDIR,
+       LPROC_LL_SETATTR,
+       LPROC_LL_TRUNC,
+       LPROC_LL_FLOCK,
+       LPROC_LL_GETATTR,
+       LPROC_LL_CREATE,
+       LPROC_LL_LINK,
+       LPROC_LL_UNLINK,
+       LPROC_LL_SYMLINK,
+       LPROC_LL_MKDIR,
+       LPROC_LL_RMDIR,
+       LPROC_LL_MKNOD,
+       LPROC_LL_RENAME,
+       LPROC_LL_STAFS,
+       LPROC_LL_ALLOC_INODE,
+       LPROC_LL_SETXATTR,
+       LPROC_LL_GETXATTR,
+       LPROC_LL_LISTXATTR,
+       LPROC_LL_REMOVEXATTR,
+       LPROC_LL_INODE_PERM,
+       LPROC_LL_FILE_OPCODES
+};
+
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h
new file mode 100644 (file)
index 0000000..e9c8e56
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *  - orphan recovery: OST adds record on create
+ *  - mtime/size consistency: the OST adds a record on first write
+ *  - open/unlinked objects: OST adds a record on destroy
+ *
+ *  - mds unlink log: the MDS adds an entry upon delete
+ *
+ *  - raid1 replication log between OST's
+ *  - MDS replication logs
+ */
+
+#ifndef _LINUX_LUSTRE_LOG_H
+#define _LINUX_LUSTRE_LOG_H
+
+#ifndef _LUSTRE_LOG_H
+#error Do not #include this file directly. #include <lustre_log.h> instead
+#endif
+
+#define LUSTRE_LOG_SERVER
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h
new file mode 100644 (file)
index 0000000..2d7c425
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_NET_H
+#define _LINUX_LUSTRE_NET_H
+
+#ifndef _LUSTRE_NET_H
+#error Do not #include this file directly. #include <lustre_net.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+/* XXX Liang: this should be moved to another header instead of here */
+#ifndef WITH_GROUP_INFO
+#define WITH_GROUP_INFO
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
new file mode 100644 (file)
index 0000000..f050808
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LUSTRE_PATCHLESS_COMPAT_H
+#define LUSTRE_PATCHLESS_COMPAT_H
+
+#include <linux/fs.h>
+
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+
+#define ll_delete_from_page_cache(page) delete_from_page_cache(page)
+
+static inline void
+truncate_complete_page(struct address_space *mapping, struct page *page)
+{
+       if (page->mapping != mapping)
+               return;
+
+       if (PagePrivate(page))
+               page->mapping->a_ops->invalidatepage(page, 0);
+
+       cancel_dirty_page(page, PAGE_SIZE);
+       ClearPageMappedToDisk(page);
+       ll_delete_from_page_cache(page);
+}
+
+#define d_refcount(d)          ((d)->d_count)
+
+#ifdef ATTR_OPEN
+# define ATTR_FROM_OPEN ATTR_OPEN
+#else
+# ifndef ATTR_FROM_OPEN
+#  define ATTR_FROM_OPEN 0
+# endif
+#endif /* ATTR_OPEN */
+
+#ifndef ATTR_RAW
+#define ATTR_RAW 0
+#endif
+
+#ifndef ATTR_CTIME_SET
+/*
+ * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other
+ * ATTR_* attributes (see bug 13828)
+ */
+#define ATTR_CTIME_SET (1 << 28)
+#endif
+
+#endif /* LUSTRE_PATCHLESS_COMPAT_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h
new file mode 100644 (file)
index 0000000..421866b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_LUSTRE_QUOTA_H
+#define _LINUX_LUSTRE_QUOTA_H
+
+#ifndef _LUSTRE_QUOTA_H
+#error Do not #include this file directly. #include <lustre_quota.h> instead
+#endif
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h
new file mode 100644 (file)
index 0000000..ebaf929
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LINUX_LUSTRE_USER_H
+#define _LINUX_LUSTRE_USER_H
+
+# include <linux/version.h>
+# include <linux/quota.h>
+
+/*
+ * asm-x86_64/processor.h on some SLES 9 distros seems to use
+ * kernel-only typedefs.  Fortunately, skipping it altogether is OK
+ * (for now).
+ */
+#define __ASM_X86_64_PROCESSOR_H
+
+#include <linux/string.h>
+
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+    defined(__craynv) || defined (__mips64__) || defined(__powerpc64__)
+typedef struct stat     lstat_t;
+#define lstat_f         lstat
+#define HAVE_LOV_USER_MDS_DATA
+#else
+typedef struct stat64   lstat_t;
+#define lstat_f         lstat64
+#define HAVE_LOV_USER_MDS_DATA
+#endif
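+
+/*
+ * Usage sketch (illustrative; "path" is assumed): lstat_f/lstat_t give
+ * user-space tools a large-file-safe stat on both 32- and 64-bit builds.
+ *
+ *     lstat_t st;
+ *     if (lstat_f(path, &st) == 0)
+ *             printf("size %lld\n", (long long)st.st_size);
+ */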
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h
new file mode 100644 (file)
index 0000000..b4db6cb
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/linux/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LINUX_LVFS_H__
+#define __LINUX_LVFS_H__
+
+#ifndef __LVFS_H__
+#error Do not #include this file directly. #include <lvfs.h> instead
+#endif
+
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/lvfs_linux.h>
+
+#define LLOG_LVFS
+
+/* simple.c */
+
+struct lvfs_ucred {
+       __u32                   luc_uid;
+       __u32                   luc_gid;
+       __u32                   luc_fsuid;
+       __u32                   luc_fsgid;
+       kernel_cap_t            luc_cap;
+       __u32                   luc_umask;
+       struct group_info       *luc_ginfo;
+       struct md_identity      *luc_identity;
+};
+
+struct lvfs_callback_ops {
+       struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr,
+                                      void *data);
+};
+
+#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
+#define OBD_CTXT_DEBUG   /* development-only debugging */
+struct lvfs_run_ctxt {
+       struct vfsmount          *pwdmnt;
+       struct dentry            *pwd;
+       mm_segment_t              fs;
+       struct lvfs_ucred         luc;
+       int                       ngroups;
+       struct lvfs_callback_ops  cb_ops;
+       struct group_info        *group_info;
+       struct dt_device         *dt;
+#ifdef OBD_CTXT_DEBUG
+       __u32                     magic;
+#endif
+};
+
+#ifdef OBD_CTXT_DEBUG
+#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
+#else
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while (0)
+#endif
+
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname,
+                 char *newname);
+
+static inline void l_dput(struct dentry *de)
+{
+       if (!de || IS_ERR(de))
+               return;
+       /* shrink_dcache_parent(de); */
+       LASSERT(d_refcount(de) > 0);
+       dput(de);
+}
+
+/* We need to hold the inode semaphore over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry.  We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(const char *fid_name,
+                                              struct dentry *dparent,
+                                              int fid_namelen)
+{
+       struct dentry *dchild;
+
+       mutex_lock(&dparent->d_inode->i_mutex);
+       dchild = lookup_one_len(fid_name, dparent, fid_namelen);
+       mutex_unlock(&dparent->d_inode->i_mutex);
+
+       if (IS_ERR(dchild) || dchild->d_inode == NULL)
+               return dchild;
+
+       if (is_bad_inode(dchild->d_inode)) {
+               CERROR("bad inode returned %lu/%u\n",
+                      dchild->d_inode->i_ino, dchild->d_inode->i_generation);
+               dput(dchild);
+               dchild = ERR_PTR(-ENOENT);
+       }
+       return dchild;
+}
+
+#endif /* __LINUX_LVFS_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h
new file mode 100644 (file)
index 0000000..140a60f
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LVFS_LINUX_H__
+#define __LVFS_LINUX_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include <lvfs.h>
+
+#define l_file file
+#define l_dentry dentry
+
+#define l_filp_open filp_open
+
+struct lvfs_run_ctxt;
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
+                            int flags);
+
+struct l_linux_dirent {
+       struct list_head        lld_list;
+       ino_t                   lld_ino;
+       unsigned long           lld_off;
+       char                    lld_name[LL_FID_NAMELEN];
+};
+
+struct l_readdir_callback {
+       struct l_linux_dirent   *lrc_dirent;
+       struct list_head        *lrc_list;
+};
+
+#endif /*  __LVFS_LINUX_H__ */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h
new file mode 100644 (file)
index 0000000..2c36c0d
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_OBD_H
+#define __LINUX_OBD_H
+
+#ifndef __OBD_H
+#error Do not #include this file directly. #include <obd.h> instead
+#endif
+
+#include <obd_support.h>
+
+# include <linux/fs.h>
+# include <linux/list.h>
+# include <linux/sched.h>  /* for struct task_struct, for current.h */
+# include <linux/proc_fs.h>
+# include <linux/mount.h>
+# include <linux/lustre_intent.h>
+
+struct ll_iattr {
+       struct iattr    iattr;
+       unsigned int    ia_attr_flags;
+};
+
+#define CLIENT_OBD_LIST_LOCK_DEBUG 1
+
+typedef struct {
+       spinlock_t              lock;
+
+       unsigned long           time;
+       struct task_struct      *task;
+       const char              *func;
+       int                     line;
+} client_obd_lock_t;
+
+static inline void __client_obd_list_lock(client_obd_lock_t *lock,
+                                         const char *func, int line)
+{
+       unsigned long cur = jiffies;
+
+       while (1) {
+               if (spin_trylock(&lock->lock)) {
+                       LASSERT(lock->task == NULL);
+                       lock->task = current;
+                       lock->func = func;
+                       lock->line = line;
+                       lock->time = jiffies;
+                       break;
+               }
+
+               if ((jiffies - cur > 5 * HZ) &&
+                   (jiffies - lock->time > 5 * HZ)) {
+                       struct task_struct *task = lock->task;
+
+                       if (task == NULL)
+                               continue;
+
+                       LCONSOLE_WARN("%s:%d: lock %p was acquired by <%s:%d:%s:%d> for %lu seconds.\n",
+                                     current->comm, current->pid,
+                                     lock, task->comm, task->pid,
+                                     lock->func, lock->line,
+                                     (jiffies - lock->time) / HZ);
+                       LCONSOLE_WARN("====== for process holding the lock =====\n");
+                       libcfs_debug_dumpstack(task);
+                       LCONSOLE_WARN("====== for current process =====\n");
+                       libcfs_debug_dumpstack(NULL);
+                       LCONSOLE_WARN("====== end =======\n");
+                       cfs_pause(1000 * HZ);
+               }
+               cpu_relax();
+       }
+}
+
+#define client_obd_list_lock(lock) \
+       __client_obd_list_lock(lock, __func__, __LINE__)
+
+static inline void client_obd_list_unlock(client_obd_lock_t *lock)
+{
+       LASSERT(lock->task != NULL);
+       lock->task = NULL;
+       lock->time = jiffies;
+       spin_unlock(&lock->lock);
+}
+
+static inline void client_obd_list_lock_init(client_obd_lock_t *lock)
+{
+       spin_lock_init(&lock->lock);
+}
+
+static inline void client_obd_list_lock_done(client_obd_lock_t *lock)
+{}
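+
+/*
+ * Minimal usage sketch (illustrative; "cl_lock" is a hypothetical
+ * client_obd_lock_t embedded in some client state):
+ *
+ *     client_obd_list_lock_init(&cl_lock);
+ *     client_obd_list_lock(&cl_lock);     // records owner, func and line
+ *     ... touch the protected lists ...
+ *     client_obd_list_unlock(&cl_lock);
+ *     client_obd_list_lock_done(&cl_lock);
+ */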
+
+#endif /* __LINUX_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h
new file mode 100644 (file)
index 0000000..021ead6
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_CLASS_OBD_H
+#define __LINUX_CLASS_OBD_H
+
+#ifndef __CLASS_OBD_H
+#error Do not #include this file directly. #include <obd_class.h> instead
+#endif
+
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+/* obdo.c */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid);
+void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid);
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+#define ll_inode_flags(inode)   (inode->i_flags)
+
+#endif /* __LINUX_CLASS_OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h
new file mode 100644 (file)
index 0000000..9166503
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LINUX_OBD_SUPPORT
+#define _LINUX_OBD_SUPPORT
+
+#ifndef _OBD_SUPPORT
+#error Do not #include this file directly. #include <obd_support.h> instead
+#endif
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <asm/processor.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lustre_common.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <lvfs.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h
new file mode 100644 (file)
index 0000000..e770d02
--- /dev/null
@@ -0,0 +1,1043 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lprocfs_status.h
+ *
+ * Top level header file for LProc SNMP
+ *
+ * Author: Hariharan Thantry thantry@users.sourceforge.net
+ */
+#ifndef _LPROCFS_SNMP_H
+#define _LPROCFS_SNMP_H
+
+#include <linux/lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/libcfs/params_tree.h>
+
+struct lprocfs_vars {
+       const char              *name;
+       struct file_operations  *fops;
+       void                    *data;
+       /**
+        * /proc file mode.
+        */
+       mode_t                  proc_mode;
+};
+
+struct lprocfs_static_vars {
+       struct lprocfs_vars *module_vars;
+       struct lprocfs_vars *obd_vars;
+};
+
+/* if we find more consumers this could be generalized */
+#define OBD_HIST_MAX 32
+struct obd_histogram {
+       spinlock_t      oh_lock;
+       unsigned long   oh_buckets[OBD_HIST_MAX];
+};
+
+enum {
+       BRW_R_PAGES = 0,
+       BRW_W_PAGES,
+       BRW_R_RPC_HIST,
+       BRW_W_RPC_HIST,
+       BRW_R_IO_TIME,
+       BRW_W_IO_TIME,
+       BRW_R_DISCONT_PAGES,
+       BRW_W_DISCONT_PAGES,
+       BRW_R_DISCONT_BLOCKS,
+       BRW_W_DISCONT_BLOCKS,
+       BRW_R_DISK_IOSIZE,
+       BRW_W_DISK_IOSIZE,
+       BRW_R_DIO_FRAGS,
+       BRW_W_DIO_FRAGS,
+       BRW_LAST,
+};
+
+struct brw_stats {
+       struct obd_histogram hist[BRW_LAST];
+};
+
+enum {
+       RENAME_SAMEDIR_SIZE = 0,
+       RENAME_CROSSDIR_SRC_SIZE,
+       RENAME_CROSSDIR_TGT_SIZE,
+       RENAME_LAST,
+};
+
+struct rename_stats {
+       struct obd_histogram hist[RENAME_LAST];
+};
+
+/* An lprocfs counter can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks
+ * are not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples
+ * (i.e. the counter can be incremented by more than "1"). When
+ * specified, the counter maintains min, max and sum in addition to a
+ * simple invocation count, which allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter;
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track the sum
+ * of squares (for multi-valued counter samples only). This allows
+ * external computation of the standard deviation, but involves a
+ * 64-bit multiply per counter increment.
+ */
+
+enum {
+       LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
+       LPROCFS_CNTR_AVGMINMAX    = 0x0002,
+       LPROCFS_CNTR_STDDEV       = 0x0004,
+
+       /* counter data type */
+       LPROCFS_TYPE_REGS       = 0x0100,
+       LPROCFS_TYPE_BYTES      = 0x0200,
+       LPROCFS_TYPE_PAGES      = 0x0400,
+       LPROCFS_TYPE_CYCLE      = 0x0800,
+};
+
+#define LC_MIN_INIT ((~(__u64)0) >> 1)
+
+struct lprocfs_counter_header {
+       unsigned int            lc_config;
+       const char              *lc_name;   /* must be static */
+       const char              *lc_units;  /* must be static */
+};
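+
+/*
+ * Illustrative configuration (an assumption, not part of this patch): a
+ * bytes counter that keeps min/max/sum so averages can be derived from
+ * the exported stats.
+ *
+ *     static struct lprocfs_counter_header hdr = {
+ *             .lc_config = LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES,
+ *             .lc_name   = "read_bytes",
+ *             .lc_units  = "bytes",
+ *     };
+ */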
+
+struct lprocfs_counter {
+       __s64   lc_count;
+       __s64   lc_min;
+       __s64   lc_max;
+       __s64   lc_sumsquare;
+       /*
+        * Every counter has lc_array_sum[0]; lc_array_sum[1] exists only
+        * for counters used in IRQ context, i.e. stats created with the
+        * LPROCFS_STATS_FLAG_IRQ_SAFE flag.
+        */
+       __s64   lc_array_sum[1];
+};
+#define lc_sum         lc_array_sum[0]
+#define lc_sum_irq     lc_array_sum[1]
+
+struct lprocfs_percpu {
+#ifndef __GNUC__
+       __s64                   pad;
+#endif
+       struct lprocfs_counter lp_cntr[0];
+};
+
+#define LPROCFS_GET_NUM_CPU 0x0001
+#define LPROCFS_GET_SMP_ID  0x0002
+
+enum lprocfs_stats_flags {
+       LPROCFS_STATS_FLAG_NONE     = 0x0000, /* per cpu counter */
+       LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu
+                                              * area and need locking */
+       LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */
+};
+
+enum lprocfs_fields_flags {
+       LPROCFS_FIELDS_FLAGS_CONFIG     = 0x0001,
+       LPROCFS_FIELDS_FLAGS_SUM        = 0x0002,
+       LPROCFS_FIELDS_FLAGS_MIN        = 0x0003,
+       LPROCFS_FIELDS_FLAGS_MAX        = 0x0004,
+       LPROCFS_FIELDS_FLAGS_AVG        = 0x0005,
+       LPROCFS_FIELDS_FLAGS_SUMSQUARE  = 0x0006,
+       LPROCFS_FIELDS_FLAGS_COUNT      = 0x0007,
+};
+
+struct lprocfs_stats {
+       /* # of counters */
+       unsigned short                  ls_num;
+       /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */
+       unsigned short                  ls_biggest_alloc_num;
+       enum lprocfs_stats_flags        ls_flags;
+       /* Lock used when there are no percpu stats areas; for percpu stats,
+        * it is used to protect changes to ls_biggest_alloc_num */
+       spinlock_t                      ls_lock;
+
+       /* has ls_num of counter headers */
+       struct lprocfs_counter_header   *ls_cnt_header;
+       struct lprocfs_percpu           *ls_percpu[0];
+};
+
+#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC)
+
+/* Pack all opcodes down into a single monotonically increasing index */
+static inline int opcode_offset(__u32 opc)
+{
+       if (opc < OST_LAST_OPC) {
+               /* OST opcode */
+               return (opc - OST_FIRST_OPC);
+       } else if (opc < MDS_LAST_OPC) {
+               /* MDS opcode */
+               return (opc - MDS_FIRST_OPC +
+                       OPC_RANGE(OST));
+       } else if (opc < LDLM_LAST_OPC) {
+               /* LDLM Opcode */
+               return (opc - LDLM_FIRST_OPC +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < MGS_LAST_OPC) {
+               /* MGS Opcode */
+               return (opc - MGS_FIRST_OPC +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < OBD_LAST_OPC) {
+               /* OBD Ping */
+               return (opc - OBD_FIRST_OPC +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < LLOG_LAST_OPC) {
+               /* LLOG Opcode */
+               return (opc - LLOG_FIRST_OPC +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < QUOTA_LAST_OPC) {
+               /* LQUOTA Opcode */
+               return (opc - QUOTA_FIRST_OPC +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < SEQ_LAST_OPC) {
+               /* SEQ opcode */
+               return (opc - SEQ_FIRST_OPC +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < SEC_LAST_OPC) {
+               /* SEC opcode */
+               return (opc - SEC_FIRST_OPC +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < FLD_LAST_OPC) {
+               /* FLD opcode */
+               return (opc - FLD_FIRST_OPC +
+                       OPC_RANGE(SEC) +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else if (opc < UPDATE_LAST_OPC) {
+               /* update opcode */
+               return (opc - UPDATE_FIRST_OPC +
+                       OPC_RANGE(FLD) +
+                       OPC_RANGE(SEC) +
+                       OPC_RANGE(SEQ) +
+                       OPC_RANGE(QUOTA) +
+                       OPC_RANGE(LLOG) +
+                       OPC_RANGE(OBD) +
+                       OPC_RANGE(MGS) +
+                       OPC_RANGE(LDLM) +
+                       OPC_RANGE(MDS) +
+                       OPC_RANGE(OST));
+       } else {
+               /* Unknown Opcode */
+               return -1;
+       }
+}
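+
+/*
+ * Worked example: for an MDS opcode (MDS_FIRST_OPC <= opc < MDS_LAST_OPC)
+ * the packed index computed above is
+ *
+ *     opcode_offset(opc) == (opc - MDS_FIRST_OPC) + OPC_RANGE(OST)
+ *
+ * i.e. the MDS opcodes occupy the index range immediately after the OST
+ * opcodes, and each later segment follows the same pattern down the chain.
+ */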
+
+
+#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST)  + \
+                           OPC_RANGE(MDS)  + \
+                           OPC_RANGE(LDLM) + \
+                           OPC_RANGE(MGS)  + \
+                           OPC_RANGE(OBD)  + \
+                           OPC_RANGE(LLOG) + \
+                           OPC_RANGE(QUOTA) + \
+                           OPC_RANGE(SEQ)  + \
+                           OPC_RANGE(SEC)  + \
+                           OPC_RANGE(FLD)  + \
+                           OPC_RANGE(UPDATE))
+
+#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR)  + \
+                           OPC_RANGE(EXTRA))
+
+enum {
+       PTLRPC_REQWAIT_CNTR = 0,
+       PTLRPC_REQQDEPTH_CNTR,
+       PTLRPC_REQACTIVE_CNTR,
+       PTLRPC_TIMEOUT,
+       PTLRPC_REQBUF_AVAIL_CNTR,
+       PTLRPC_LAST_CNTR
+};
+
+#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
+
+enum {
+       LDLM_GLIMPSE_ENQUEUE = 0,
+       LDLM_PLAIN_ENQUEUE,
+       LDLM_EXTENT_ENQUEUE,
+       LDLM_FLOCK_ENQUEUE,
+       LDLM_IBITS_ENQUEUE,
+       MDS_REINT_SETATTR,
+       MDS_REINT_CREATE,
+       MDS_REINT_LINK,
+       MDS_REINT_UNLINK,
+       MDS_REINT_RENAME,
+       MDS_REINT_OPEN,
+       MDS_REINT_SETXATTR,
+       BRW_READ_BYTES,
+       BRW_WRITE_BYTES,
+       EXTRA_LAST_OPC
+};
+
+#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
+/* class_obd.c */
+extern proc_dir_entry_t *proc_lustre_root;
+
+struct obd_device;
+struct obd_histogram;
+
+/* Days / hours / mins / seconds format */
+struct dhms {
+       int d, h, m, s;
+};
+static inline void s2dhms(struct dhms *ts, time_t secs)
+{
+       ts->d = secs / 86400;
+       secs = secs % 86400;
+       ts->h = secs / 3600;
+       secs = secs % 3600;
+       ts->m = secs / 60;
+       ts->s = secs % 60;
+}
+#define DHMS_FMT "%dd%dh%02dm%02ds"
+#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
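+
+/*
+ * Usage sketch (illustrative; "age_seconds" and the seq_file "m" are
+ * assumed to exist in the caller):
+ *
+ *     struct dhms ts;
+ *
+ *     s2dhms(&ts, age_seconds);
+ *     seq_printf(m, "age: "DHMS_FMT"\n", DHMS_VARS(&ts));
+ */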
+
+#define JOBSTATS_JOBID_VAR_MAX_LEN     20
+#define JOBSTATS_DISABLE               "disable"
+#define JOBSTATS_PROCNAME_UID          "procname_uid"
+
+typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);
+
+struct obd_job_stats {
+       cfs_hash_t      *ojs_hash;
+       struct list_head         ojs_list;
+       rwlock_t       ojs_lock; /* protect the obj_list */
+       cntr_init_callback ojs_cntr_init_fn;
+       int             ojs_cntr_num;
+       int             ojs_cleanup_interval;
+       time_t             ojs_last_cleanup;
+};
+
+#ifdef LPROCFS
+
+extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
+                                  unsigned int cpuid);
+/*
+ * \return value
+ *      < 0     : on error (only possible when opc is LPROCFS_GET_SMP_ID)
+ */
+static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
+                                    unsigned long *flags)
+{
+       int             rc = 0;
+
+       switch (opc) {
+       default:
+               LBUG();
+
+       case LPROCFS_GET_SMP_ID:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, *flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       return 0;
+               } else {
+                       unsigned int cpuid = get_cpu();
+
+                       if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
+                               rc = lprocfs_stats_alloc_one(stats, cpuid);
+                               if (rc < 0) {
+                                       put_cpu();
+                                       return rc;
+                               }
+                       }
+                       return cpuid;
+               }
+
+       case LPROCFS_GET_NUM_CPU:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, *flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       return 1;
+               } else {
+                       return stats->ls_biggest_alloc_num;
+               }
+       }
+}
+
+static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
+                                       unsigned long *flags)
+{
+       switch (opc) {
+       default:
+               LBUG();
+
+       case LPROCFS_GET_SMP_ID:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock,
+                                                          *flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               } else {
+                       put_cpu();
+               }
+               return;
+
+       case LPROCFS_GET_NUM_CPU:
+               if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock,
+                                                          *flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               }
+               return;
+       }
+}
+
+static inline unsigned int
+lprocfs_stats_counter_size(struct lprocfs_stats *stats)
+{
+       unsigned int percpusize;
+
+       percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
+
+       /* irq safe stats need lc_array_sum[1] */
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+               percpusize += stats->ls_num * sizeof(__s64);
+
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
+               percpusize = L1_CACHE_ALIGN(percpusize);
+
+       return percpusize;
+}
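+
+/*
+ * Worked example (the cache line size is machine dependent): with
+ * ls_num == 4 and LPROCFS_STATS_FLAG_IRQ_SAFE set, each percpu area
+ * holds four struct lprocfs_counter entries plus 4 * sizeof(__s64)
+ * extra bytes for the per-counter lc_sum_irq slots; the total is then
+ * rounded up to a multiple of the L1 cache line size so that percpu
+ * areas of different CPUs never share a cache line.
+ */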
+
+static inline struct lprocfs_counter *
+lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
+                         int index)
+{
+       struct lprocfs_counter *cntr;
+
+       cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
+
+       if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+               cntr = (void *)cntr + index * sizeof(__s64);
+
+       return cntr;
+}
+
+/* Two optimized LPROCFS counter manipulation functions are provided:
+ *     lprocfs_counter_incr(stats, idx) - optimized for by-one counters
+ *     lprocfs_counter_add(stats, idx, amount) - used for multi-valued
+ *             counters
+ * The counter data layout allows the config flag, counter lock and the
+ * count itself to reside within a single cache line. A usage sketch
+ * follows the macros below.
+ */
+
+extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx,
+                               long amount);
+extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx,
+                               long amount);
+
+#define lprocfs_counter_incr(stats, idx) \
+       lprocfs_counter_add(stats, idx, 1)
+#define lprocfs_counter_decr(stats, idx) \
+       lprocfs_counter_sub(stats, idx, 1)
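+
+/*
+ * Usage sketch (the stats object, indices and "count" are assumptions):
+ *
+ *     lprocfs_counter_incr(stats, LDLM_PLAIN_ENQUEUE);
+ *     lprocfs_counter_add(stats, BRW_WRITE_BYTES, count);
+ */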
+
+extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+                                struct lprocfs_counter_header *header,
+                                enum lprocfs_stats_flags flags,
+                                enum lprocfs_fields_flags field);
+static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats,
+                                           int idx,
+                                           enum lprocfs_fields_flags field)
+{
+       int           i;
+       unsigned int  num_cpu;
+       unsigned long flags     = 0;
+       __u64         ret       = 0;
+
+       LASSERT(stats != NULL);
+
+       num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       for (i = 0; i < num_cpu; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               ret += lprocfs_read_helper(
+                               lprocfs_stats_counter_get(stats, i, idx),
+                               &stats->ls_cnt_header[idx], stats->ls_flags,
+                               field);
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       return ret;
+}
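+
+/*
+ * Usage sketch (illustrative): the cumulative sum of a multi-valued
+ * counter across all percpu areas can be read with
+ *
+ *     __u64 bytes = lprocfs_stats_collector(stats, BRW_READ_BYTES,
+ *                                           LPROCFS_FIELDS_FLAGS_SUM);
+ */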
+
+extern struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags);
+extern void lprocfs_clear_stats(struct lprocfs_stats *stats);
+extern void lprocfs_free_stats(struct lprocfs_stats **stats);
+extern void lprocfs_init_ops_stats(int num_private_stats,
+                                  struct lprocfs_stats *stats);
+extern void lprocfs_init_mps_stats(int num_private_stats,
+                                  struct lprocfs_stats *stats);
+extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
+extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+                                  unsigned int num_private_stats);
+extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
+                                 unsigned int num_private_stats);
+extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+                                unsigned conf, const char *name,
+                                const char *units);
+extern void lprocfs_free_obd_stats(struct obd_device *obddev);
+extern void lprocfs_free_md_stats(struct obd_device *obddev);
+struct obd_export;
+struct nid_stat;
+extern int lprocfs_add_clear_entry(struct obd_device * obd,
+                                  proc_dir_entry_t *entry);
+extern int lprocfs_exp_setup(struct obd_export *exp,
+                            lnet_nid_t *peer_nid, int *newnid);
+extern int lprocfs_exp_cleanup(struct obd_export *exp);
+extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                               char *name,
+                                               void *data,
+                                               struct file_operations *fops);
+extern struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+                   const char *format, ...);
+extern void lprocfs_free_per_client_stats(struct obd_device *obd);
+extern int
+lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                             unsigned long count, void *data);
+extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data);
+
+extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
+                                 struct lprocfs_stats *stats);
+
+/* lprocfs_status.c */
+extern int lprocfs_add_vars(proc_dir_entry_t *root,
+                           struct lprocfs_vars *var,
+                           void *data);
+
+extern proc_dir_entry_t *lprocfs_register(const char *name,
+                                             proc_dir_entry_t *parent,
+                                             struct lprocfs_vars *list,
+                                             void *data);
+
+extern void lprocfs_remove(proc_dir_entry_t **root);
+extern void lprocfs_remove_proc_entry(const char *name,
+                                     struct proc_dir_entry *parent);
+
+extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
+extern int lprocfs_obd_cleanup(struct obd_device *obd);
+
+extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
+                             mode_t mode,
+                             const struct file_operations *seq_fops,
+                             void *data);
+extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
+                                 mode_t mode,
+                                 const struct file_operations *seq_fops,
+                                 void *data);
+
+/* Generic callbacks */
+
+extern int lprocfs_rd_u64(struct seq_file *m, void *data);
+extern int lprocfs_rd_atomic(struct seq_file *m, void *data);
+extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
+                            unsigned long count, void *data);
+extern int lprocfs_rd_uint(struct seq_file *m, void *data);
+extern int lprocfs_wr_uint(struct file *file, const char *buffer,
+                          unsigned long count, void *data);
+extern int lprocfs_rd_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_name(struct seq_file *m, void *data);
+extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data);
+extern int lprocfs_rd_import(struct seq_file *m, void *data);
+extern int lprocfs_rd_state(struct seq_file *m, void *data);
+extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data);
+extern int lprocfs_rd_num_exports(struct seq_file *m, void *data);
+extern int lprocfs_rd_numrefs(struct seq_file *m, void *data);
+
+struct adaptive_timeout;
+extern int lprocfs_at_hist_helper(struct seq_file *m,
+                                 struct adaptive_timeout *at);
+extern int lprocfs_rd_timeouts(struct seq_file *m, void *data);
+extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
+                              unsigned long count, void *data);
+extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+                           size_t count, loff_t *off);
+extern int lprocfs_wr_ping(struct file *file, const char *buffer,
+                          size_t count, loff_t *off);
+extern int lprocfs_wr_import(struct file *file, const char *buffer,
+                     size_t count, loff_t *off);
+extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n);
+extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                                  size_t count, loff_t *off);
+
+/* Statfs helpers */
+extern int lprocfs_rd_blksize(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data);
+extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data);
+extern int lprocfs_rd_filestotal(struct seq_file *m, void *data);
+extern int lprocfs_rd_filesfree(struct seq_file *m, void *data);
+
+extern int lprocfs_write_helper(const char *buffer, unsigned long count,
+                               int *val);
+extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+                                    int *val, int mult);
+extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult);
+extern int lprocfs_read_frac_helper(char *buffer, unsigned long count,
+                                   long val, int mult);
+extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count,
+                                   __u64 *val);
+extern int lprocfs_write_frac_u64_helper(const char *buffer,
+                                        unsigned long count,
+                                        __u64 *val, int mult);
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+                               unsigned long *count);
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
+void lprocfs_oh_clear(struct obd_histogram *oh);
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
+
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt);
+
+extern int lprocfs_single_release(cfs_inode_t *, struct file *);
+extern int lprocfs_seq_release(cfs_inode_t *, struct file *);
+
+/* You must use these macros when you want to refer to
+ * the import in a client obd_device for an lprocfs entry */
+#define LPROCFS_CLIMP_CHECK(obd) do {     \
+       typecheck(struct obd_device *, obd);    \
+       down_read(&(obd)->u.cli.cl_sem);    \
+       if ((obd)->u.cli.cl_import == NULL) {   \
+            up_read(&(obd)->u.cli.cl_sem); \
+            return -ENODEV;                \
+       }                                      \
+} while (0)
+#define LPROCFS_CLIMP_EXIT(obd)                 \
+       up_read(&(obd)->u.cli.cl_sem);
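+
+/*
+ * Usage sketch (illustrative; the field printed is an assumption): a
+ * seq_show handler that dereferences the client import is bracketed as
+ *
+ *     LPROCFS_CLIMP_CHECK(obd);
+ *     seq_printf(m, "%s\n", obd->u.cli.cl_import->imp_obd->obd_name);
+ *     LPROCFS_CLIMP_EXIT(obd);
+ */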
+
+
+/* Write the name##_seq_show function and call LPROC_SEQ_FOPS_RO for
+ * read-only proc entries; otherwise, also define a name##_seq_write
+ * function for a read-write proc entry and call LPROC_SEQ_FOPS instead.
+ * Finally, call lprocfs_obd_seq_create(obd, filename, 0444,
+ * &name##_fops, data); see the usage sketch below. */
+#define __LPROC_SEQ_FOPS(name, custom_seq_write)                       \
+static int name##_single_open(cfs_inode_t *inode, struct file *file)   \
+{                                                                      \
+       return single_open(file, name##_seq_show, PDE_DATA(inode));     \
+}                                                                      \
+struct file_operations name##_fops = {                              \
+       .owner   = THIS_MODULE,                                     \
+       .open    = name##_single_open,                               \
+       .read    = seq_read,                                           \
+       .write   = custom_seq_write,                                   \
+       .llseek  = seq_lseek,                                         \
+       .release = lprocfs_single_release,                               \
+}
+
+#define LPROC_SEQ_FOPS_RO(name)         __LPROC_SEQ_FOPS(name, NULL)
+#define LPROC_SEQ_FOPS(name)       __LPROC_SEQ_FOPS(name, name##_seq_write)
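+
+/*
+ * Usage sketch ("foo_blah" and the file name are assumptions): a
+ * read-only entry needs only the show function:
+ *
+ *     static int foo_blah_seq_show(struct seq_file *m, void *v)
+ *     {
+ *             return seq_printf(m, "blah\n");
+ *     }
+ *     LPROC_SEQ_FOPS_RO(foo_blah);
+ *
+ * followed by
+ *
+ *     lprocfs_obd_seq_create(obd, "blah", 0444, &foo_blah_fops, data);
+ */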
+
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)                             \
+       static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+       {                                                               \
+               return lprocfs_rd_##type(m, m->private);                \
+       }                                                               \
+       LPROC_SEQ_FOPS_RO(name##_##type)
+
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)                             \
+       static int name##_##type##_seq_show(struct seq_file *m, void *v)\
+       {                                                               \
+               return lprocfs_rd_##type(m, m->private);                \
+       }                                                               \
+       static ssize_t name##_##type##_seq_write(struct file *file,     \
+                       const char *buffer, size_t count, loff_t *off)  \
+       {                                                               \
+               struct seq_file *seq = file->private_data;              \
+               return lprocfs_wr_##type(file, buffer,                  \
+                                        count, seq->private);          \
+       }                                                               \
+       LPROC_SEQ_FOPS(name##_##type);
+
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)                             \
+       static ssize_t name##_##type##_write(struct file *file,         \
+                       const char *buffer, size_t count, loff_t *off)  \
+       {                                                               \
+               return lprocfs_wr_##type(file, buffer, count, off);     \
+       }                                                               \
+       static int name##_##type##_open(cfs_inode_t *inode, struct file *file) \
+       {                                                               \
+               return single_open(file, NULL, PDE_DATA(inode));        \
+       }                                                               \
+       struct file_operations name##_##type##_fops = {                 \
+               .open   = name##_##type##_open,                         \
+               .write  = name##_##type##_write,                        \
+               .release = lprocfs_single_release,                      \
+       };
+
+/* lprocfs_jobstats.c */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+                         int event, long amount);
+void lprocfs_job_stats_fini(struct obd_device *obd);
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback fn);
+int lprocfs_rd_job_interval(struct seq_file *m, void *data);
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+                           unsigned long count, void *data);
+
+/* lproc_ptlrpc.c */
+struct ptlrpc_request;
+extern void target_print_req(void *seq_file, struct ptlrpc_request *req);
+
+/* lproc_status.c */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data);
+int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off);
+
+/* all quota proc functions */
+extern int lprocfs_quota_rd_bunit(char *page, char **start,
+                                 loff_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_btune(char *page, char **start,
+                                 loff_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_iunit(char *page, char **start,
+                                 loff_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_itune(char *page, char **start,
+                                 loff_t off, int count,
+                                 int *eof, void *data);
+extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count,
+                                int *eof, void *data);
+extern int lprocfs_quota_wr_type(struct file *file, const char *buffer,
+                                unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off,
+                                          int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_seconds(struct file *file,
+                                          const char *buffer,
+                                          unsigned long count, void *data);
+extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off,
+                                    int count, int *eof, void *data);
+extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer,
+                                    unsigned long count, void *data);
+extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off,
+                                     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_switch_qs(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off,
+                                           int count, int *eof, void *data);
+extern int lprocfs_quota_wr_boundary_factor(struct file *file,
+                                           const char *buffer,
+                                           unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off,
+                                       int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_bunit(struct file *file,
+                                       const char *buffer,
+                                       unsigned long count, void *data);
+extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off,
+                                       int count, int *eof, void *data);
+extern int lprocfs_quota_wr_least_iunit(struct file *file,
+                                       const char *buffer,
+                                       unsigned long count, void *data);
+extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off,
+                                     int count, int *eof, void *data);
+extern int lprocfs_quota_wr_qs_factor(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data);
+
+
+
+#else
+/* LPROCFS is not defined */
+
+#define proc_lustre_root NULL
+
+static inline void lprocfs_counter_add(struct lprocfs_stats *stats,
+                                      int index, long amount)
+{ return; }
+static inline void lprocfs_counter_incr(struct lprocfs_stats *stats,
+                                       int index)
+{ return; }
+static inline void lprocfs_counter_sub(struct lprocfs_stats *stats,
+                                      int index, long amount)
+{ return; }
+static inline void lprocfs_counter_decr(struct lprocfs_stats *stats,
+                                       int index)
+{ return; }
+static inline void lprocfs_counter_init(struct lprocfs_stats *stats,
+                                       int index, unsigned conf,
+                                       const char *name, const char *units)
+{ return; }
+
+static inline __u64 lc_read_helper(struct lprocfs_counter *lc,
+                                  enum lprocfs_fields_flags field)
+{ return 0; }
+
+/* NB: we return !NULL to satisfy error checker */
+static inline struct lprocfs_stats *
+lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags)
+{ return (struct lprocfs_stats *)1; }
+static inline void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_free_stats(struct lprocfs_stats **stats)
+{ return; }
+static inline int lprocfs_register_stats(proc_dir_entry_t *root,
+                                        const char *name,
+                                        struct lprocfs_stats *stats)
+{ return 0; }
+static inline void lprocfs_init_ops_stats(int num_private_stats,
+                                         struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_mps_stats(int num_private_stats,
+                                         struct lprocfs_stats *stats)
+{ return; }
+static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{ return; }
+static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev,
+                                         unsigned int num_private_stats)
+{ return 0; }
+static inline int lprocfs_alloc_md_stats(struct obd_device *obddev,
+                                        unsigned int num_private_stats)
+{ return 0; }
+static inline void lprocfs_free_obd_stats(struct obd_device *obddev)
+{ return; }
+static inline void lprocfs_free_md_stats(struct obd_device *obddev)
+{ return; }
+
+struct obd_export;
+static inline int lprocfs_add_clear_entry(struct obd_export *exp)
+{ return 0; }
+static inline int lprocfs_exp_setup(struct obd_export *exp,
+                                   lnet_nid_t *peer_nid, int *newnid)
+{ return 0; }
+static inline int lprocfs_exp_cleanup(struct obd_export *exp)
+{ return 0; }
+static inline proc_dir_entry_t *
+lprocfs_add_simple(struct proc_dir_entry *root, char *name,
+                  void *data, struct file_operations *fops)
+{ return NULL; }
+static inline struct proc_dir_entry *
+lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
+                   const char *format, ...)
+{ return NULL; }
+static inline void lprocfs_free_per_client_stats(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{ return count; }
+static inline
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{ return 0; }
+
+static inline proc_dir_entry_t *
+lprocfs_register(const char *name, proc_dir_entry_t *parent,
+                struct lprocfs_vars *list, void *data)
+{ return NULL; }
+static inline int lprocfs_add_vars(proc_dir_entry_t *root,
+                                  struct lprocfs_vars *var,
+                                  void *data)
+{ return 0; }
+static inline void lprocfs_remove(proc_dir_entry_t **root)
+{ return; }
+static inline void lprocfs_remove_proc_entry(const char *name,
+                                            struct proc_dir_entry *parent)
+{ return; }
+static inline int lprocfs_obd_setup(struct obd_device *dev,
+                                   struct lprocfs_vars *list)
+{ return 0; }
+static inline int lprocfs_obd_cleanup(struct obd_device *dev)
+{ return 0; }
+static inline int lprocfs_rd_u64(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_name(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_import(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{ return 0; }
+static inline int lprocfs_rd_state(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{ return 0; }
+struct adaptive_timeout;
+static inline int lprocfs_at_hist_helper(struct seq_file *m,
+                                        struct adaptive_timeout *at)
+{ return 0; }
+static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{ return 0; }
+static inline int lprocfs_wr_timeouts(struct file *file,
+                                     const char *buffer,
+                                     unsigned long count, void *data)
+{ return 0; }
+static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+                                   size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_ping(struct file *file, const char *buffer,
+                          size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_import(struct file *file, const char *buffer,
+                             size_t count, loff_t *off)
+{ return 0; }
+static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                                       size_t count, loff_t *off)
+{ return 0; }
+
+/* Statfs helpers */
+static inline
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{ return 0; }
+static inline
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{ return; }
+static inline
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{ return; }
+static inline
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{ return 0; }
+static inline
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt)
+{ return; }
+static inline
+__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx,
+                              enum lprocfs_fields_flags field)
+{ return (__u64)0; }
+
+#define LPROC_SEQ_FOPS_RO(name)
+#define LPROC_SEQ_FOPS(name)
+#define LPROC_SEQ_FOPS_RO_TYPE(name, type)
+#define LPROC_SEQ_FOPS_RW_TYPE(name, type)
+#define LPROC_SEQ_FOPS_WR_ONLY(name, type)
+
+/* lprocfs_jobstats.c */
+static inline
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event,
+                         long amount)
+{ return 0; }
+static inline
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{ return; }
+static inline
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback fn)
+{ return 0; }
+
+
+/* lproc_ptlrpc.c */
+#define target_print_req NULL
+
+#endif /* LPROCFS */
+
+#endif /* LPROCFS_SNMP_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
new file mode 100644 (file)
index 0000000..d40ad81
--- /dev/null
@@ -0,0 +1,1346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_LU_OBJECT_H
+#define __LUSTRE_LU_OBJECT_H
+
+#include <stdarg.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+
+struct seq_file;
+struct proc_dir_entry;
+struct lustre_cfg;
+struct lprocfs_stats;
+
+/** \defgroup lu lu
+ * lu_* data-types represent server-side entities shared by data and meta-data
+ * stacks.
+ *
+ * Design goals:
+ *
+ * -# support for layering.
+ *
+ *     Server side object is split into layers, one per device in the
+ *     corresponding device stack. Individual layer is represented by struct
+ *     lu_object. Compound layered object --- by struct lu_object_header. Most
+ *     interface functions take lu_object as an argument and operate on the
+ *     whole compound object. This decision was made for the following
+ *     reasons:
+ *
+ *     - it's envisaged that lu_object will be used much more often than
+ *     lu_object_header;
+ *
+ *     - we want lower (non-top) layers to be able to initiate operations
+ *     on the whole object.
+ *
+ *     Generic code supports layering more complex than simple stacking, e.g.,
+ *     it is possible that at some layer object "spawns" multiple sub-objects
+ *     on the lower layer.
+ *
+ * -# fid-based identification.
+ *
+ *     Compound object is uniquely identified by its fid. Objects are indexed
+ *     by their fids (hash table is used for index).
+ *
+ * -# caching and life-cycle management.
+ *
+ *     Object's life-time is controlled by reference counting. When reference
+ *     count drops to 0, object is returned to cache. Cached objects still
+ *     retain their identity (i.e., fid), and can be recovered from cache.
+ *
+ *     Objects are kept in the global LRU list, and lu_site_purge() function
+ *     can be used to reclaim given number of unused objects from the tail of
+ *     the LRU.
+ *
+ * -# avoiding recursion.
+ *
+ *     Generic code tries to replace recursion through layers by iterations
+ *     where possible. Additionally, to reduce stack consumption, data are,
+ *     when practically possible, allocated through the lu_context_key
+ *     interface rather than on the stack.
+ * @{
+ */
+
+struct lu_site;
+struct lu_object;
+struct lu_device;
+struct lu_object_header;
+struct lu_context;
+struct lu_env;
+
+/**
+ * Operations common for data and meta-data devices.
+ */
+struct lu_device_operations {
+       /**
+        * Allocate object for the given device (without lower-layer
+        * parts). This is called by lu_object_operations::loo_object_init()
+        * from the parent layer, and should setup at least lu_object::lo_dev
+        * and lu_object::lo_ops fields of resulting lu_object.
+        *
+        * Object creation protocol.
+        *
+        * Due to design goal of avoiding recursion, object creation (see
+        * lu_object_alloc()) is somewhat involved:
+        *
+        *  - first, lu_device_operations::ldo_object_alloc() method of the
+        *  top-level device in the stack is called. It should allocate top
+        *  level object (including lu_object_header), but without any
+        *  lower-layer sub-object(s).
+        *
+        *  - then lu_object_alloc() sets fid in the header of newly created
+        *  object.
+        *
+        *  - then lu_object_operations::loo_object_init() is called. It has
+        *  to allocate lower-layer object(s). To do this,
+        *  lu_object_operations::loo_object_init() calls ldo_object_alloc()
+        *  of the lower-layer device(s).
+        *
+        *  - for all new objects allocated by
+        *  lu_object_operations::loo_object_init() (and inserted into object
+        *  stack), lu_object_operations::loo_object_init() is called again
+        *  repeatedly, until no new objects are created.
+        *
+        * \post ergo(!IS_ERR(result), result->lo_dev == d &&
+        *                           result->lo_ops != NULL);
+        */
+       struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
+                                             const struct lu_object_header *h,
+                                             struct lu_device *d);
+       /**
+        * process config specific for device.
+        */
+       int (*ldo_process_config)(const struct lu_env *env,
+                                 struct lu_device *, struct lustre_cfg *);
+       int (*ldo_recovery_complete)(const struct lu_env *,
+                                    struct lu_device *);
+
+       /**
+        * Initialize local objects for the device. This method is called
+        * after the layer has been initialized (after the LCFG_SETUP stage)
+        * and before it starts serving user requests.
+        */
+
+       int (*ldo_prepare)(const struct lu_env *,
+                          struct lu_device *parent,
+                          struct lu_device *dev);
+
+};
+
+/**
+ * For lu_object_conf flags
+ */
+typedef enum {
+       /* This is a new object to be allocated, or the file
+        * corresponding to the object does not exist. */
+       LOC_F_NEW       = 0x00000001,
+} loc_flags_t;
+
+/**
+ * Object configuration, describing particulars of object being created. On
+ * server this is not used, as server objects are full identified by fid. On
+ * client configuration contains struct lustre_md.
+ */
+struct lu_object_conf {
+       /**
+        * Some hints for obj find and alloc.
+        */
+       loc_flags_t     loc_flags;
+};
+
+/**
+ * Type of "printer" function used by lu_object_operations::loo_object_print()
+ * method.
+ *
+ * Printer function is needed to provide some flexibility in (semi-)debugging
+ * output: possible implementations: printk, CDEBUG, sysfs/seq_file
+ */
+typedef int (*lu_printer_t)(const struct lu_env *env,
+                           void *cookie, const char *format, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Operations specific for particular lu_object.
+ */
+struct lu_object_operations {
+
+       /**
+        * Allocate lower-layer parts of the object by calling
+        * lu_device_operations::ldo_object_alloc() of the corresponding
+        * underlying device.
+        *
+        * This method is called once for each object inserted into the
+        * object stack. It is the responsibility of this method to insert
+        * the lower-layer object(s) it creates into the appropriate places
+        * of the object stack.
+        */
+       int (*loo_object_init)(const struct lu_env *env,
+                              struct lu_object *o,
+                              const struct lu_object_conf *conf);
+       /**
+        * Called (in top-to-bottom order) during object allocation after all
+        * layers were allocated and initialized. Can be used to perform
+        * initialization depending on lower layers.
+        */
+       int (*loo_object_start)(const struct lu_env *env,
+                               struct lu_object *o);
+       /**
+        * Called before lu_object_operations::loo_object_free() to signal
+        * that object is being destroyed. Dual to
+        * lu_object_operations::loo_object_init().
+        */
+       void (*loo_object_delete)(const struct lu_env *env,
+                                 struct lu_object *o);
+       /**
+        * Dual to lu_device_operations::ldo_object_alloc(). Called when
+        * object is removed from memory.
+        */
+       void (*loo_object_free)(const struct lu_env *env,
+                               struct lu_object *o);
+       /**
+        * Called when last active reference to the object is released (and
+        * object returns to the cache). This method is optional.
+        */
+       void (*loo_object_release)(const struct lu_env *env,
+                                  struct lu_object *o);
+       /**
+        * Optional debugging helper. Print given object.
+        */
+       int (*loo_object_print)(const struct lu_env *env, void *cookie,
+                               lu_printer_t p, const struct lu_object *o);
+       /**
+        * Optional debugging method. Returns true iff the object is
+        * internally consistent.
+        */
+       int (*loo_object_invariant)(const struct lu_object *o);
+};
+
+/**
+ * Type of lu_device.
+ */
+struct lu_device_type;
+
+/**
+ * Device: a layer in the server side abstraction stacking.
+ */
+struct lu_device {
+       /**
+        * reference count. This is incremented, in particular, on each object
+        * created at this layer.
+        *
+        * \todo XXX which means that atomic_t is probably too small.
+        */
+       atomic_t                       ld_ref;
+       /**
+        * Pointer to device type. Never modified once set.
+        */
+       struct lu_device_type       *ld_type;
+       /**
+        * Operation vector for this device.
+        */
+       const struct lu_device_operations *ld_ops;
+       /**
+        * Stack this device belongs to.
+        */
+       struct lu_site              *ld_site;
+       struct proc_dir_entry        *ld_proc_entry;
+
+       /** \todo XXX: temporary back pointer into obd. */
+       struct obd_device                *ld_obd;
+       /**
+        * A list of references to this object, for debugging.
+        */
+       struct lu_ref                 ld_reference;
+       /**
+        * Link the device to the site.
+        **/
+       struct list_head                         ld_linkage;
+};
+
+struct lu_device_type_operations;
+
+/**
+ * Tag bits for device type. They are used to distinguish certain groups of
+ * device types.
+ */
+enum lu_device_tag {
+       /** this is meta-data device */
+       LU_DEVICE_MD = (1 << 0),
+       /** this is data device */
+       LU_DEVICE_DT = (1 << 1),
+       /** data device in the client stack */
+       LU_DEVICE_CL = (1 << 2)
+};
+
+/**
+ * Type of device.
+ */
+struct lu_device_type {
+       /**
+        * Tag bits. Taken from enum lu_device_tag. Never modified once set.
+        */
+       __u32                              ldt_tags;
+       /**
+        * Name of this class. Unique system-wide. Never modified once set.
+        */
+       char                               *ldt_name;
+       /**
+        * Operations for this type.
+        */
+       const struct lu_device_type_operations *ldt_ops;
+       /**
+        * \todo XXX: temporary pointer to associated obd_type.
+        */
+       struct obd_type                 *ldt_obd_type;
+       /**
+        * \todo XXX: temporary: context tags used by obd_*() calls.
+        */
+       __u32                              ldt_ctx_tags;
+       /**
+        * Number of existing device type instances.
+        */
+       unsigned                                ldt_device_nr;
+       /**
+        * Linkage into a global list of all device types.
+        *
+        * \see lu_device_types.
+        */
+       struct list_head                              ldt_linkage;
+};
+
+/**
+ * Operations on a device type.
+ */
+struct lu_device_type_operations {
+       /**
+        * Allocate new device.
+        */
+       struct lu_device *(*ldto_device_alloc)(const struct lu_env *env,
+                                              struct lu_device_type *t,
+                                              struct lustre_cfg *lcfg);
+       /**
+        * Free device. Dual to
+        * lu_device_type_operations::ldto_device_alloc(). Returns pointer to
+        * the next device in the stack.
+        */
+       struct lu_device *(*ldto_device_free)(const struct lu_env *,
+                                             struct lu_device *);
+
+       /**
+        * Initialize the devices after allocation
+        */
+       int  (*ldto_device_init)(const struct lu_env *env,
+                                struct lu_device *, const char *,
+                                struct lu_device *);
+       /**
+        * Finalize device. Dual to
+        * lu_device_type_operations::ldto_device_init(). Returns pointer to
+        * the next device in the stack.
+        */
+       struct lu_device *(*ldto_device_fini)(const struct lu_env *env,
+                                             struct lu_device *);
+       /**
+        * Initialize device type. This is called on module load.
+        */
+       int  (*ldto_init)(struct lu_device_type *t);
+       /**
+        * Finalize device type. Dual to
+        * lu_device_type_operations::ldto_init(). Called on module unload.
+        */
+       void (*ldto_fini)(struct lu_device_type *t);
+       /**
+        * Called when the first device is created.
+        */
+       void (*ldto_start)(struct lu_device_type *t);
+       /**
+        * Called when number of devices drops to 0.
+        */
+       void (*ldto_stop)(struct lu_device_type *t);
+};
+
+static inline int lu_device_is_md(const struct lu_device *d)
+{
+       return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD);
+}
+
+/**
+ * Flags for the object layers.
+ */
+enum lu_object_flags {
+       /**
+        * this flag is set if lu_object_operations::loo_object_init() has
+        * been called for this layer. Used by lu_object_alloc().
+        */
+       LU_OBJECT_ALLOCATED = (1 << 0)
+};
+
+/**
+ * Common object attributes.
+ */
+struct lu_attr {
+       /** size in bytes */
+       __u64     la_size;
+       /** modification time in seconds since Epoch */
+       obd_time       la_mtime;
+       /** access time in seconds since Epoch */
+       obd_time       la_atime;
+       /** change time in seconds since Epoch */
+       obd_time       la_ctime;
+       /** 512-byte blocks allocated to object */
+       __u64     la_blocks;
+       /** permission bits and file type */
+       __u32     la_mode;
+       /** owner id */
+       __u32     la_uid;
+       /** group id */
+       __u32     la_gid;
+       /** object flags */
+       __u32     la_flags;
+       /** number of persistent references to this object */
+       __u32     la_nlink;
+       /** blk bits of the object */
+       __u32     la_blkbits;
+       /** blk size of the object */
+       __u32     la_blksize;
+       /** real device */
+       __u32     la_rdev;
+       /**
+        * valid bits
+        *
+        * \see enum la_valid
+        */
+       __u64     la_valid;
+};
+
+/** Bit-mask of valid attributes */
+enum la_valid {
+       LA_ATIME = 1 << 0,
+       LA_MTIME = 1 << 1,
+       LA_CTIME = 1 << 2,
+       LA_SIZE  = 1 << 3,
+       LA_MODE  = 1 << 4,
+       LA_UID   = 1 << 5,
+       LA_GID   = 1 << 6,
+       LA_BLOCKS = 1 << 7,
+       LA_TYPE   = 1 << 8,
+       LA_FLAGS  = 1 << 9,
+       LA_NLINK  = 1 << 10,
+       LA_RDEV   = 1 << 11,
+       LA_BLKSIZE = 1 << 12,
+       LA_KILL_SUID = 1 << 13,
+       LA_KILL_SGID = 1 << 14,
+};
+
+/**
+ * Layer in the layered object.
+ */
+struct lu_object {
+       /**
+        * Header for this object.
+        */
+       struct lu_object_header    *lo_header;
+       /**
+        * Device for this layer.
+        */
+       struct lu_device                  *lo_dev;
+       /**
+        * Operations for this object.
+        */
+       const struct lu_object_operations *lo_ops;
+       /**
+        * Linkage into list of all layers.
+        */
+       struct list_head                         lo_linkage;
+       /**
+        * Depth. Top level layer depth is 0.
+        */
+       int                             lo_depth;
+       /**
+        * Flags from enum lu_object_flags.
+        */
+       __u32                                   lo_flags;
+       /**
+        * Link to the device, for debugging.
+        */
+       struct lu_ref_link              *lo_dev_ref;
+};
+
+enum lu_object_header_flags {
+       /**
+        * Don't keep this object in cache. Object will be destroyed as soon
+        * as last reference to it is released. This flag cannot be cleared
+        * once set.
+        */
+       LU_OBJECT_HEARD_BANSHEE = 0,
+       /**
+        * Mark that this object has already been taken out of the cache.
+        */
+       LU_OBJECT_UNHASHED = 1
+};
+
+enum lu_object_header_attr {
+       LOHA_EXISTS   = 1 << 0,
+       LOHA_REMOTE   = 1 << 1,
+       /**
+        * UNIX file type is stored in S_IFMT bits.
+        */
+       LOHA_FT_START = 001 << 12, /**< S_IFIFO */
+       LOHA_FT_END   = 017 << 12, /**< S_IFMT */
+};
+
+/**
+ * "Compound" object, consisting of multiple layers.
+ *
+ * A compound object with a given fid is unique within a given lu_site.
+ *
+ * Note that an object does *not* necessarily correspond to a real object in
+ * persistent storage: an object is an anchor for locking and method calling,
+ * so it is created for things like the not-yet-existing child created by
+ * mkdir or create calls. lu_object_operations::loo_exists() can be used to
+ * check whether an object is backed by a persistent storage entity.
+ */
+struct lu_object_header {
+       /**
+        * Object flags from enum lu_object_header_flags. Set and checked
+        * atomically.
+        */
+       unsigned long     loh_flags;
+       /**
+        * Object reference count. Protected by lu_site::ls_guard.
+        */
+       atomic_t           loh_ref;
+       /**
+        * Fid, uniquely identifying this object.
+        */
+       struct lu_fid     loh_fid;
+       /**
+        * Common object attributes, cached for efficiency. From enum
+        * lu_object_header_attr.
+        */
+       __u32             loh_attr;
+       /**
+        * Linkage into per-site hash table. Protected by lu_site::ls_guard.
+        */
+       struct hlist_node       loh_hash;
+       /**
+        * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
+        */
+       struct list_head             loh_lru;
+       /**
+        * Linkage into list of layers. Never modified once set (except lately
+        * during object destruction). No locking is necessary.
+        */
+       struct list_head             loh_layers;
+       /**
+        * A list of references to this object, for debugging.
+        */
+       struct lu_ref     loh_reference;
+};
+
+struct fld;
+
+struct lu_site_bkt_data {
+       /**
+        * number of busy objects in this bucket
+        */
+       long                  lsb_busy;
+       /**
+        * LRU list, updated on each access to object. Protected by
+        * bucket lock of lu_site::ls_obj_hash.
+        *
+        * "Cold" end of LRU is lu_site::ls_lru.next. Accessed objects are
+        * moved to lu_site::ls_lru.prev (this is due to the non-existence
+        * of list_for_each_entry_safe_reverse()).
+        */
+       struct list_head                lsb_lru;
+       /**
+        * Wait-queue signaled when an object in this site is ultimately
+        * destroyed (lu_object_free()). It is used by lu_object_find() to
+        * wait before retrying when an object in the process of destruction is
+        * found in the hash table.
+        *
+        * \see htable_lookup().
+        */
+       wait_queue_head_t              lsb_marche_funebre;
+};
+
+enum {
+       LU_SS_CREATED    = 0,
+       LU_SS_CACHE_HIT,
+       LU_SS_CACHE_MISS,
+       LU_SS_CACHE_RACE,
+       LU_SS_CACHE_DEATH_RACE,
+       LU_SS_LRU_PURGED,
+       LU_SS_LAST_STAT
+};
+
+/**
+ * lu_site is a "compartment" within which objects are unique, and LRU
+ * discipline is maintained.
+ *
+ * lu_site exists so that multiple layered stacks can co-exist in the same
+ * address space.
+ *
+ * lu_site has the same relation to lu_device as lu_object_header to
+ * lu_object.
+ */
+struct lu_site {
+       /**
+        * hash table of objects
+        */
+       cfs_hash_t             *ls_obj_hash;
+       /**
+        * index of the bucket in the hash table at which to resume purging
+        */
+       int                    ls_purge_start;
+       /**
+        * Top-level device for this stack.
+        */
+       struct lu_device         *ls_top_dev;
+       /**
+        * Bottom-level device for this stack.
+        */
+       struct lu_device        *ls_bottom_dev;
+       /**
+        * Linkage into global list of sites.
+        */
+       struct list_head                ls_linkage;
+       /**
+        * List of lu_device instances for this site, protected
+        * by ls_ld_lock.
+        */
+       struct list_head                ls_ld_linkage;
+       spinlock_t              ls_ld_lock;
+
+       /**
+        * lu_site stats
+        */
+       struct lprocfs_stats    *ls_stats;
+       /**
+        * XXX: a hack! fld has to find md_site via site, remove when possible
+        */
+       struct seq_server_site  *ld_seq_site;
+};
+
+static inline struct lu_site_bkt_data *
+lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid)
+{
+       cfs_hash_bd_t bd;
+
+       cfs_hash_bd_get(site->ls_obj_hash, fid, &bd);
+       return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+}
+
+/** \name ctors
+ * Constructors/destructors.
+ * @{
+ */
+
+int  lu_site_init       (struct lu_site *s, struct lu_device *d);
+void lu_site_fini       (struct lu_site *s);
+int  lu_site_init_finish  (struct lu_site *s);
+void lu_stack_fini     (const struct lu_env *env, struct lu_device *top);
+void lu_device_get     (struct lu_device *d);
+void lu_device_put     (struct lu_device *d);
+int  lu_device_init       (struct lu_device *d, struct lu_device_type *t);
+void lu_device_fini       (struct lu_device *d);
+int  lu_object_header_init(struct lu_object_header *h);
+void lu_object_header_fini(struct lu_object_header *h);
+int  lu_object_init       (struct lu_object *o,
+                          struct lu_object_header *h, struct lu_device *d);
+void lu_object_fini       (struct lu_object *o);
+void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
+void lu_object_add     (struct lu_object *before, struct lu_object *o);
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d);
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d);
+
+/**
+ * Helpers to initialize and finalize device types.
+ */
+
+int  lu_device_type_init(struct lu_device_type *ldt);
+void lu_device_type_fini(struct lu_device_type *ldt);
+void lu_types_stop(void);
+
+/** @} ctors */
+
+/** \name caching
+ * Caching and reference counting.
+ * @{
+ */
+
+/**
+ * Acquire an additional reference to the given object. To acquire the
+ * initial reference use lu_object_find().
+ */
+static inline void lu_object_get(struct lu_object *o)
+{
+       LASSERT(atomic_read(&o->lo_header->loh_ref) > 0);
+       atomic_inc(&o->lo_header->loh_ref);
+}
+
+/**
+ * Return true if the object will not be cached after the last reference to
+ * it is released.
+ */
+static inline int lu_object_is_dying(const struct lu_object_header *h)
+{
+       return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags);
+}
+
+void lu_object_put(const struct lu_env *env, struct lu_object *o);
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o);
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
+
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr);
+
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+                  lu_printer_t printer);
+struct lu_object *lu_object_find(const struct lu_env *env,
+                                struct lu_device *dev, const struct lu_fid *f,
+                                const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+                                   struct lu_device *dev,
+                                   const struct lu_fid *f,
+                                   const struct lu_object_conf *conf);
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf);
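+
+/*
+ * A minimal lookup/release sketch (illustrative only; assumes a valid
+ * \a env, \a dev, \a fid and \a conf are already available):
+ *
+ * \code
+ *     struct lu_object *o;
+ *
+ *     o = lu_object_find(env, dev, fid, conf);
+ *     if (!IS_ERR(o)) {
+ *             // ... use the object ...
+ *             lu_object_put(env, o);
+ *     }
+ * \endcode
+ */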
+/** @} caching */
+
+/** \name helpers
+ * Helpers.
+ * @{
+ */
+
+/**
+ * First (topmost) sub-object of the given compound object.
+ */
+static inline struct lu_object *lu_object_top(struct lu_object_header *h)
+{
+       LASSERT(!list_empty(&h->loh_layers));
+       return container_of0(h->loh_layers.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Next sub-object in the layering
+ */
+static inline struct lu_object *lu_object_next(const struct lu_object *o)
+{
+       return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage);
+}
+
+/**
+ * Pointer to the fid of this object.
+ */
+static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
+{
+       return &o->lo_header->loh_fid;
+}
+
+/**
+ * Return the device operations vector for this object.
+ */
+static inline const struct lu_device_operations *
+lu_object_ops(const struct lu_object *o)
+{
+       return o->lo_dev->ld_ops;
+}
+
+/**
+ * Given a compound object, find its slice corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+                                  const struct lu_device_type *dtype);
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+                     void *cookie, const char *format, ...);
+
+/**
+ * Print object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_DEBUG(mask, env, object, format, ...)                   \
+do {                                                                 \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                  \
+                                                                         \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                \
+               lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
+               CDEBUG(mask, format , ## __VA_ARGS__);              \
+       }                                                                \
+} while (0)
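+
+/*
+ * e.g. (illustrative; mask and message text are examples only):
+ *
+ *     LU_OBJECT_DEBUG(D_INFO, env, o, "found in cache\n");
+ */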
+
+/**
+ * Print short object description followed by a user-supplied message.
+ */
+#define LU_OBJECT_HEADER(mask, env, object, format, ...)               \
+do {                                                               \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
+                                                                       \
+       if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
+               lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
+                                      (object)->lo_header);        \
+               lu_cdebug_printer(env, &msgdata, "\n");          \
+               CDEBUG(mask, format , ## __VA_ARGS__);            \
+       }                                                              \
+} while (0)
+
+void lu_object_print       (const struct lu_env *env, void *cookie,
+                           lu_printer_t printer, const struct lu_object *o);
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t printer,
+                           const struct lu_object_header *hdr);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o);
+
+
+/**
+ * Check whether the object exists, on either local or remote storage.
+ * Note: LOHA_EXISTS will be set once someone has created the object;
+ * the creation does not need to be committed to storage.
+ */
+#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
+
+/**
+ * Check whether the object is on remote storage.
+ */
+#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
+
+static inline int lu_object_assert_exists(const struct lu_object *o)
+{
+       return lu_object_exists(o);
+}
+
+static inline int lu_object_assert_not_exists(const struct lu_object *o)
+{
+       return !lu_object_exists(o);
+}
+
+/**
+ * Attributes of this object.
+ */
+static inline __u32 lu_object_attr(const struct lu_object *o)
+{
+       LASSERT(lu_object_exists(o) != 0);
+       return o->lo_header->loh_attr;
+}
+
+static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o,
+                                                   const char *scope,
+                                                   const void *source)
+{
+       return lu_ref_add(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del(struct lu_object *o,
+                                    const char *scope, const void *source)
+{
+       lu_ref_del(&o->lo_header->loh_reference, scope, source);
+}
+
+static inline void lu_object_ref_del_at(struct lu_object *o,
+                                       struct lu_ref_link *link,
+                                       const char *scope, const void *source)
+{
+       lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
+}
+
+/** input parameters, to be filled out by the mdt */
+struct lu_rdpg {
+       /** hash */
+       __u64              rp_hash;
+       /** count in bytes */
+       unsigned int        rp_count;
+       /** number of pages */
+       unsigned int        rp_npages;
+       /** requested attr */
+       __u32              rp_attrs;
+       /** pointers to pages */
+       struct page        **rp_pages;
+};
+
+enum lu_xattr_flags {
+       LU_XATTR_REPLACE = (1 << 0),
+       LU_XATTR_CREATE  = (1 << 1)
+};
+
+/** @} helpers */
+
+/** \name lu_context
+ * @{ */
+
+/** For lu_context health-checks */
+enum lu_context_state {
+       LCS_INITIALIZED = 1,
+       LCS_ENTERED,
+       LCS_LEFT,
+       LCS_FINALIZED
+};
+
+/**
+ * lu_context. Execution context for lu_object methods. Currently associated
+ * with a thread.
+ *
+ * All lu_object methods, except device and device type methods (called during
+ * system initialization and shutdown), are executed "within" some
+ * lu_context. This means that a pointer to some "current" lu_context is
+ * passed as an argument to all methods.
+ *
+ * All service ptlrpc threads create a lu_context as part of their
+ * initialization. It is possible to create a "stand-alone" context for other
+ * execution environments (like system calls).
+ *
+ * lu_object methods mainly use lu_context through the lu_context_key
+ * interface, which allows each layer to associate arbitrary pieces of data
+ * with each context (see pthread_key_create(3) for a similar interface).
+ *
+ * On a client, lu_context is bound to a thread, see cl_env_get().
+ *
+ * \see lu_context_key
+ */
+struct lu_context {
+       /**
+        * lu_context is used on the client side too. Yet we don't want to
+        * allocate values of server-side keys for the client contexts and
+        * vice versa.
+        *
+        * To achieve this, a set of tags is introduced. Contexts and keys
+        * are marked with tags. Key values are created only for contexts
+        * whose set of tags has a non-empty intersection with that of the
+        * key. Tags are taken from enum lu_context_tag.
+        */
+       __u32             lc_tags;
+       enum lu_context_state  lc_state;
+       /**
+        * Pointer to the home service thread. NULL for other execution
+        * contexts.
+        */
+       struct ptlrpc_thread  *lc_thread;
+       /**
+        * Pointer to an array with key values. Internal implementation
+        * detail.
+        */
+       void             **lc_value;
+       /**
+        * Linkage into a list of all remembered contexts. Only
+        * `non-transient' contexts, i.e., ones created for service threads,
+        * are placed here.
+        */
+       struct list_head             lc_remember;
+       /**
+        * Version counter used to skip calls to lu_context_refill() when no
+        * keys were registered.
+        */
+       unsigned               lc_version;
+       /**
+        * Debugging cookie.
+        */
+       unsigned               lc_cookie;
+};
+
+/**
+ * lu_context_key interface. Similar to pthread_key.
+ */
+
+enum lu_context_tag {
+       /**
+        * Thread on md server
+        */
+       LCT_MD_THREAD = 1 << 0,
+       /**
+        * Thread on dt server
+        */
+       LCT_DT_THREAD = 1 << 1,
+       /**
+        * Context for transaction handle
+        */
+       LCT_TX_HANDLE = 1 << 2,
+       /**
+        * Thread on client
+        */
+       LCT_CL_THREAD = 1 << 3,
+       /**
+        * A per-request session on a server, and a per-system-call session on
+        * a client.
+        */
+       LCT_SESSION   = 1 << 4,
+       /**
+        * Per-request data on an OSP device
+        */
+       LCT_OSP_THREAD = 1 << 5,
+       /**
+        * MGS device thread
+        */
+       LCT_MG_THREAD = 1 << 6,
+       /**
+        * Context for local operations
+        */
+       LCT_LOCAL = 1 << 7,
+       /**
+        * Set when at least one of the keys having values in this context
+        * has a non-NULL lu_context_key::lct_exit() method. This is used to
+        * optimize the lu_context_exit() call.
+        */
+       LCT_HAS_EXIT  = 1 << 28,
+       /**
+        * Don't add references for modules creating key values in that context.
+        * This is only for contexts used internally by lu_object framework.
+        */
+       LCT_NOREF     = 1 << 29,
+       /**
+        * Key is being prepared for retiring, don't create new values for it.
+        */
+       LCT_QUIESCENT = 1 << 30,
+       /**
+        * Context should be remembered.
+        */
+       LCT_REMEMBER  = 1 << 31,
+       /**
+        * Contexts usable in cache shrinker thread.
+        */
+       LCT_SHRINKER  = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF
+};
+
+/**
+ * Key. Represents per-context value slot.
+ *
+ * Keys are usually registered when the module owning the key is initialized,
+ * and de-registered when the module is unloaded. Once a key is registered,
+ * all new contexts with matching tags will get the key value. "Old" contexts,
+ * already initialized at the time of key registration, can be forced to get
+ * the key value by calling lu_context_refill().
+ *
+ * Every key value is counted in lu_context_key::lct_used and acquires a
+ * reference on the owning module. This means that all key values have to be
+ * destroyed before the module can be unloaded. This is usually achieved by
+ * stopping threads started by the module, which created contexts in their
+ * entry functions. The situation is complicated by threads shared by
+ * multiple modules, like the ptlrpcd daemon on a client. To work around this
+ * problem, contexts created in such threads are `remembered' (see
+ * LCT_REMEMBER)---i.e., added into a global list. When a module is preparing
+ * for unloading, it does the following:
+ *
+ *     - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT)
+ *       preventing new key values from being allocated in the new contexts,
+ *       and
+ *
+ *     - scans a list of remembered contexts, destroying values of module
+ *       keys, thus releasing references to the module.
+ *
+ * This is done by lu_context_key_quiesce(). If the module is re-activated
+ * before the key has been de-registered, a lu_context_key_revive() call
+ * clears the `quiescent' marker.
+ *
+ * lu_context code doesn't provide any internal synchronization for these
+ * activities---it's assumed that startup (including threads start-up) and
+ * shutdown are serialized by some external means.
+ *
+ * \see lu_context
+ */
+struct lu_context_key {
+       /**
+        * Set of tags for which values of this key are to be instantiated.
+        */
+       __u32 lct_tags;
+       /**
+        * Value constructor. This is called when a new value is created for
+        * a context. Returns a pointer to the new value or an error pointer.
+        */
+       void  *(*lct_init)(const struct lu_context *ctx,
+                          struct lu_context_key *key);
+       /**
+        * Value destructor. Called when a context with a previously
+        * allocated value of this slot is destroyed. \a data is the value
+        * that was returned by a matching call to lu_context_key::lct_init().
+        */
+       void   (*lct_fini)(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+       /**
+        * Optional method called on lu_context_exit() for all allocated
+        * keys. Can be used by debugging code checking that locks are
+        * released, etc.
+        */
+       void   (*lct_exit)(const struct lu_context *ctx,
+                          struct lu_context_key *key, void *data);
+       /**
+        * Internal implementation detail: index within lu_context::lc_value[]
+        * reserved for this key.
+        */
+       int      lct_index;
+       /**
+        * Internal implementation detail: number of values created for this
+        * key.
+        */
+       atomic_t lct_used;
+       /**
+        * Internal implementation detail: module for this key.
+        */
+       module_t *lct_owner;
+       /**
+        * References to this key. For debugging.
+        */
+       struct lu_ref  lct_reference;
+};
+
+#define LU_KEY_INIT(mod, type)                             \
+       static void *mod##_key_init(const struct lu_context *ctx, \
+                                   struct lu_context_key *key)   \
+       {                                                        \
+               type *value;                                  \
+                                                                 \
+               CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value));       \
+                                                                 \
+               OBD_ALLOC_PTR(value);                        \
+               if (value == NULL)                              \
+                       value = ERR_PTR(-ENOMEM);                \
+                                                                 \
+               return value;                                \
+       }                                                        \
+       struct __##mod##__dummy_init {;} /* semicolon catcher */
+
+#define LU_KEY_FINI(mod, type)                                       \
+       static void mod##_key_fini(const struct lu_context *ctx,            \
+                                   struct lu_context_key *key, void *data) \
+       {                                                                  \
+               type *info = data;                                        \
+                                                                           \
+               OBD_FREE_PTR(info);                                      \
+       }                                                                  \
+       struct __##mod##__dummy_fini {;} /* semicolon catcher */
+
+#define LU_KEY_INIT_FINI(mod, type)   \
+       LU_KEY_INIT(mod, type); \
+       LU_KEY_FINI(mod, type)
+
+#define LU_CONTEXT_KEY_DEFINE(mod, tags)               \
+       struct lu_context_key mod##_thread_key = {      \
+               .lct_tags = tags,                      \
+               .lct_init = mod##_key_init,          \
+               .lct_fini = mod##_key_fini            \
+       }
+
+#define LU_CONTEXT_KEY_INIT(key)                       \
+do {                                               \
+       (key)->lct_owner = THIS_MODULE;          \
+} while (0)
+
+int   lu_context_key_register(struct lu_context_key *key);
+void  lu_context_key_degister(struct lu_context_key *key);
+void *lu_context_key_get     (const struct lu_context *ctx,
+                              const struct lu_context_key *key);
+void  lu_context_key_quiesce (struct lu_context_key *key);
+void  lu_context_key_revive  (struct lu_context_key *key);
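+
+/*
+ * Sketch of typical key definition and registration (the "foo" names and
+ * struct foo_thread_info are illustrative, not part of this API):
+ *
+ * \code
+ *     struct foo_thread_info {
+ *             ...
+ *     };
+ *
+ *     LU_KEY_INIT_FINI(foo, struct foo_thread_info);
+ *     LU_CONTEXT_KEY_DEFINE(foo, LCT_MD_THREAD);
+ *
+ *     // at module initialization:
+ *     LU_CONTEXT_KEY_INIT(&foo_thread_key);
+ *     rc = lu_context_key_register(&foo_thread_key);
+ *
+ *     // a per-context value is then obtained with:
+ *     info = lu_context_key_get(&env->le_ctx, &foo_thread_key);
+ * \endcode
+ */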
+
+
+/*
+ * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
+ * owning module.
+ */
+
+#define LU_KEY_INIT_GENERIC(mod)                                       \
+       static void mod##_key_init_generic(struct lu_context_key *k, ...) \
+       {                                                              \
+               struct lu_context_key *key = k;                  \
+               va_list args;                                      \
+                                                                       \
+               va_start(args, k);                                    \
+               do {                                                \
+                       LU_CONTEXT_KEY_INIT(key);                      \
+                       key = va_arg(args, struct lu_context_key *);    \
+               } while (key != NULL);                            \
+               va_end(args);                                      \
+       }
+
+#define LU_TYPE_INIT(mod, ...)                                   \
+       LU_KEY_INIT_GENERIC(mod)                                        \
+       static int mod##_type_init(struct lu_device_type *t)        \
+       {                                                              \
+               mod##_key_init_generic(__VA_ARGS__, NULL);            \
+               return lu_context_key_register_many(__VA_ARGS__, NULL); \
+       }                                                              \
+       struct __##mod##_dummy_type_init {;}
+
+#define LU_TYPE_FINI(mod, ...)                                   \
+       static void mod##_type_fini(struct lu_device_type *t)      \
+       {                                                              \
+               lu_context_key_degister_many(__VA_ARGS__, NULL);        \
+       }                                                              \
+       struct __##mod##_dummy_type_fini {;}
+
+#define LU_TYPE_START(mod, ...)                                 \
+       static void mod##_type_start(struct lu_device_type *t)  \
+       {                                                      \
+               lu_context_key_revive_many(__VA_ARGS__, NULL);  \
+       }                                                      \
+       struct __##mod##_dummy_type_start {;}
+
+#define LU_TYPE_STOP(mod, ...)                           \
+       static void mod##_type_stop(struct lu_device_type *t)   \
+       {                                                      \
+               lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
+       }                                                      \
+       struct __##mod##_dummy_type_stop {;}
+
+
+
+#define LU_TYPE_INIT_FINI(mod, ...)         \
+       LU_TYPE_INIT(mod, __VA_ARGS__);  \
+       LU_TYPE_FINI(mod, __VA_ARGS__);  \
+       LU_TYPE_START(mod, __VA_ARGS__);        \
+       LU_TYPE_STOP(mod, __VA_ARGS__)
+
+int   lu_context_init  (struct lu_context *ctx, __u32 tags);
+void  lu_context_fini  (struct lu_context *ctx);
+void  lu_context_enter (struct lu_context *ctx);
+void  lu_context_exit  (struct lu_context *ctx);
+int   lu_context_refill(struct lu_context *ctx);
+
+/*
+ * Helper functions to operate on multiple keys. These are used by the default
+ * device type operations, defined by LU_TYPE_INIT_FINI().
+ */
+
+int  lu_context_key_register_many(struct lu_context_key *k, ...);
+void lu_context_key_degister_many(struct lu_context_key *k, ...);
+void lu_context_key_revive_many  (struct lu_context_key *k, ...);
+void lu_context_key_quiesce_many (struct lu_context_key *k, ...);
+
+/*
+ * update/clear ctx/ses tags.
+ */
+void lu_context_tags_update(__u32 tags);
+void lu_context_tags_clear(__u32 tags);
+void lu_session_tags_update(__u32 tags);
+void lu_session_tags_clear(__u32 tags);
+
+/**
+ * Environment.
+ */
+struct lu_env {
+       /**
+        * "Local" context, used to store data instead of the stack.
+        */
+       struct lu_context  le_ctx;
+       /**
+        * "Session" context for per-request data.
+        */
+       struct lu_context *le_ses;
+};
+
+int  lu_env_init  (struct lu_env *env, __u32 tags);
+void lu_env_fini  (struct lu_env *env);
+int  lu_env_refill(struct lu_env *env);
+int  lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags);
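+
+/*
+ * Stand-alone usage sketch (illustrative; the tag choice depends on the
+ * caller):
+ *
+ * \code
+ *     struct lu_env env;
+ *     int rc;
+ *
+ *     rc = lu_env_init(&env, LCT_CL_THREAD);
+ *     if (rc == 0) {
+ *             // ... invoke lu_object methods with &env ...
+ *             lu_env_fini(&env);
+ *     }
+ * \endcode
+ */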
+
+/** @} lu_context */
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m);
+
+/**
+ * Common name structure to be passed around for various name-related methods.
+ */
+struct lu_name {
+       const char    *ln_name;
+       int         ln_namelen;
+};
+
+/**
+ * Common buffer structure to be passed around for various xattr_{s,g}et()
+ * methods.
+ */
+struct lu_buf {
+       void   *lb_buf;
+       ssize_t lb_len;
+};
+
+#define DLUBUF "(%p %zd)"
+#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len
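+/*
+ * e.g. (illustrative):
+ *
+ *     CDEBUG(D_INFO, "buf "DLUBUF"\n", PLUBUF(buf));
+ */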
+/**
+ * One-time initializers, called at obdclass module initialization, not
+ * exported.
+ */
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void);
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void);
+
+struct lu_kmem_descr {
+       struct kmem_cache **ckd_cache;
+       const char       *ckd_name;
+       const size_t      ckd_size;
+};
+
+int  lu_kmem_init(struct lu_kmem_descr *caches);
+void lu_kmem_fini(struct lu_kmem_descr *caches);
+
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_fid *fid);
+struct lu_object *lu_object_anon(const struct lu_env *env,
+                                struct lu_device *dev,
+                                const struct lu_object_conf *conf);
+
+/** null buffer */
+extern struct lu_buf LU_BUF_NULL;
+
+void lu_buf_free(struct lu_buf *buf);
+void lu_buf_alloc(struct lu_buf *buf, int size);
+void lu_buf_realloc(struct lu_buf *buf, int size);
+
+int lu_buf_check_and_grow(struct lu_buf *buf, int len);
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len);
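+
+/*
+ * Usage sketch (illustrative; error handling abbreviated): grow a buffer
+ * before copying len bytes of data into it.
+ *
+ * \code
+ *     struct lu_buf buf = { NULL, 0 };
+ *
+ *     if (lu_buf_check_and_grow(&buf, len) == 0) {
+ *             memcpy(buf.lb_buf, data, len);
+ *             // ...
+ *             lu_buf_free(&buf);
+ *     }
+ * \endcode
+ */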
+
+/** @} lu */
+#endif /* __LUSTRE_LU_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h
new file mode 100644 (file)
index 0000000..624c19b
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LUSTRE_LU_REF_H
+#define __LUSTRE_LU_REF_H
+
+#include <linux/list.h>
+
+/** \defgroup lu_ref lu_ref
+ *
+ * An interface to track references between objects. Mostly for debugging.
+ *
+ * Suppose there is a reference-counted data structure struct foo. To track
+ * who acquired references to an instance of struct foo, add a lu_ref field
+ * to it:
+ *
+ * \code
+ *      struct foo {
+ *              atomic_t      foo_refcount;
+ *              struct lu_ref foo_reference;
+ *              ...
+ *      };
+ * \endcode
+ *
+ * foo::foo_reference has to be initialized by calling
+ * lu_ref_init(). Typically there will be functions or macros to increment and
+ * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo)
+ * and foo_put(struct foo *foo), respectively.
+ *
+ * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add()
+ * has to be called to insert into foo::foo_reference a record describing the
+ * acquired reference. Dually, lu_ref_del() removes the matching record.
+ * Typical usages are:
+ *
+ * \code
+ *     struct bar *bar;
+ *
+ *     // bar owns a reference to foo.
+ *     bar->bar_foo = foo_get(foo);
+ *     lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *     ...
+ *
+ *     // reference from bar to foo is released.
+ *     lu_ref_del(&foo->foo_reference, "bar", bar);
+ *     foo_put(bar->bar_foo);
+ *
+ *
+ *     // current thread acquired a temporary reference to foo.
+ *     foo_get(foo);
+ *     lu_ref_add(&foo->reference, __FUNCTION__, current);
+ *
+ *     ...
+ *
+ *     // temporary reference is released.
+ *     lu_ref_del(&foo->reference, __FUNCTION__, current);
+ *     foo_put(foo);
+ * \endcode
+ *
+ * \e Et \e cetera. Often it makes sense to include lu_ref_add() and
+ * lu_ref_del() calls into foo_get() and foo_put(). When an instance of
+ * struct foo is destroyed, lu_ref_fini() has to be called; it checks that
+ * no pending references remain. lu_ref_print() can be used to dump a list
+ * of pending references while hunting down a leak.
+ *
+ * For objects to which a large number of references can be acquired,
+ * lu_ref_del() can become CPU-consuming, as it has to scan the list of
+ * references. To work around this, remember the result of lu_ref_add()
+ * (usually in the same place where the pointer to struct foo is stored),
+ * and use lu_ref_del_at():
+ *
+ * \code
+ *     // There is a large number of bar's for a single foo.
+ *     bar->bar_foo     = foo_get(foo);
+ *     bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar);
+ *
+ *     ...
+ *
+ *     // reference from bar to foo is released.
+ *     lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar);
+ *     foo_put(bar->bar_foo);
+ * \endcode
+ *
+ * lu_ref interface degrades gracefully in case of memory shortages.
+ *
+ * @{
+ */
+
+
+struct lu_ref  {};
+
+static inline void lu_ref_init(struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_fini(struct lu_ref *ref)
+{
+}
+
+static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref,
+                                            const char *scope,
+                                            const void *source)
+{
+       return NULL;
+}
+
+static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref,
+                                                   const char *scope,
+                                                   const void *source)
+{
+       return NULL;
+}
+
+static inline void lu_ref_del(struct lu_ref *ref, const char *scope,
+                             const void *source)
+{
+}
+
+static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
+                                const char *scope, const void *source0,
+                                const void *source1)
+{
+}
+
+static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
+                                const char *scope, const void *source)
+{
+}
+
+static inline int lu_ref_global_init(void)
+{
+       return 0;
+}
+
+static inline void lu_ref_global_fini(void)
+{
+}
+
+static inline void lu_ref_print(const struct lu_ref *ref)
+{
+}
+
+static inline void lu_ref_print_all(void)
+{
+}
+
+/** @} lu */
+
+#endif /* __LUSTRE_LU_REF_H */
diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h
new file mode 100644 (file)
index 0000000..8d48cf4
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_LU_TARGET_H
+#define _LUSTRE_LU_TARGET_H
+
+#include <dt_object.h>
+#include <lustre_disk.h>
+
+struct lu_target {
+       struct obd_device       *lut_obd;
+       struct dt_device        *lut_bottom;
+       /** last_rcvd file */
+       struct dt_object        *lut_last_rcvd;
+       /* transaction callbacks */
+       struct dt_txn_callback   lut_txn_cb;
+       /** server data in last_rcvd file */
+       struct lr_server_data    lut_lsd;
+       /** Server last transaction number */
+       __u64               lut_last_transno;
+       /** Lock protecting last transaction number */
+       spinlock_t               lut_translock;
+       /** Lock protecting client bitmap */
+       spinlock_t               lut_client_bitmap_lock;
+       /** Bitmap of known clients */
+       unsigned long      *lut_client_bitmap;
+};
+
+typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno,
+                        void *data, int err);
+struct tgt_commit_cb {
+       tgt_cb_t  tgt_cb_func;
+       void     *tgt_cb_data;
+};
+
+void tgt_boot_epoch_update(struct lu_target *lut);
+int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut,
+                          struct obd_export *exp, __u64 transno);
+int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp);
+int tgt_init(const struct lu_env *env, struct lu_target *lut,
+            struct obd_device *obd, struct dt_device *dt);
+void tgt_fini(const struct lu_env *env, struct lu_target *lut);
+int tgt_client_alloc(struct obd_export *exp);
+void tgt_client_free(struct obd_export *exp);
+int tgt_client_del(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int);
+int tgt_client_new(const struct lu_env *env, struct obd_export *exp);
+int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg,
+                        struct lsd_client_data *lcd, loff_t *off, int index);
+int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg,
+                         struct lsd_client_data *lcd, loff_t *off, struct thandle *th);
+int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg);
+int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg,
+                         struct thandle *th);
+int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync);
+int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off);
+
+#endif /* __LUSTRE_LU_TARGET_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h
new file mode 100644 (file)
index 0000000..e8e0b08
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/libiam.h
+ *
+ * iam user level library
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+/*
+ *  lustre/libiam.h
+ */
+
+#ifndef __IAM_ULIB_H__
+#define __IAM_ULIB_H__
+
+/** \defgroup libiam libiam
+ *
+ * @{
+ */
+
+
+#define DX_FMT_NAME_LEN 16
+
+enum iam_fmt_t {
+       FMT_LFIX,
+       FMT_LVAR
+};
+
+struct iam_uapi_info {
+       __u16 iui_keysize;
+       __u16 iui_recsize;
+       __u16 iui_ptrsize;
+       __u16 iui_height;
+       char  iui_fmt_name[DX_FMT_NAME_LEN];
+};
+
+/*
+ * Create an iam file, but do NOT open it.
+ * Returns 0 on success, -1 otherwise.
+ */
+int iam_creat(char *filename, enum iam_fmt_t fmt,
+             int blocksize, int keysize, int recsize, int ptrsize);
+
+/*
+ * Open an iam file, but do NOT create it if the file doesn't exist.
+ * Please use iam_creat() to create the file before using iam_open().
+ * Returns a file descriptor (fd) on success, -1 otherwise.
+ */
+int iam_open(char *filename, struct iam_uapi_info *ua);
+
+/*
+ * Close file opened by iam_open.
+ */
+int iam_close(int fd);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_insert(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *keybuf,
+              int rec_need_convert, char *recbuf);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_lookup(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *key_buf,
+              int *keysize, char *save_key,
+              int rec_need_convert, char *rec_buf,
+              int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_delete(int fd, struct iam_uapi_info *ua,
+              int key_need_convert, char *keybuf,
+              int rec_need_convert, char *recbuf);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_it_start(int fd, struct iam_uapi_info *ua,
+                int key_need_convert, char *key_buf,
+                int *keysize, char *save_key,
+                int rec_need_convert, char *rec_buf,
+                int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_it_next(int fd, struct iam_uapi_info *ua,
+               int key_need_convert, char *key_buf,
+               int *keysize, char *save_key,
+               int rec_need_convert, char *rec_buf,
+               int *recsize, char *save_rec);
+
+/*
+ * Please use iam_open() before using this function.
+ */
+int iam_it_stop(int fd, struct iam_uapi_info *ua,
+               int key_need_convert, char *keybuf,
+               int rec_need_convert, char *recbuf);
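+
+/*
+ * Typical calling sequence (illustrative only; the path, geometry values
+ * and buffers are examples, and error handling is omitted):
+ *
+ *     struct iam_uapi_info ua;
+ *     int fd;
+ *
+ *     iam_creat("/path/to/iam_file", FMT_LFIX, 4096, 8, 8, 4);
+ *     fd = iam_open("/path/to/iam_file", &ua);
+ *     iam_insert(fd, &ua, 1, keybuf, 1, recbuf);
+ *     iam_close(fd);
+ */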
+
+/*
+ * Change iam file mode.
+ */
+int iam_polymorph(char *filename, unsigned long mode);
+
+/** @} libiam */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h
new file mode 100644 (file)
index 0000000..707eb74
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ * NOTE: This file is DEPRECATED!  Please include lustreapi.h directly
+ * instead of this file.  This file will be removed from a future version
+ * of lustre!
+ */
+
+#ifndef _LIBLUSTREAPI_H_
+#define _LIBLUSTREAPI_H_
+
+#include <lustre/lustreapi.h>
+#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly."
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h
new file mode 100644 (file)
index 0000000..ad253c6
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/ll_fiemap.h
+ *
+ * FIEMAP data structures and flags. This header file will be used until
+ * fiemap.h is available in the upstream kernel.
+ *
+ * Author: Kalpak Shah <kalpak.shah@sun.com>
+ * Author: Andreas Dilger <adilger@sun.com>
+ */
+
+#ifndef _LUSTRE_FIEMAP_H
+#define _LUSTRE_FIEMAP_H
+
+
+
+struct ll_fiemap_extent {
+       __u64 fe_logical;  /* logical offset in bytes for the start of
+                           * the extent from the beginning of the file */
+       __u64 fe_physical; /* physical offset in bytes for the start
+                           * of the extent from the beginning of the disk */
+       __u64 fe_length;   /* length in bytes for this extent */
+       __u64 fe_reserved64[2];
+       __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+       __u32 fe_device;   /* device number for this extent */
+       __u32 fe_reserved[2];
+};
+
+struct ll_user_fiemap {
+       __u64 fm_start;  /* logical offset (inclusive) at
+                         * which to start mapping (in) */
+       __u64 fm_length; /* logical length of mapping which
+                         * userspace wants (in) */
+       __u32 fm_flags;  /* FIEMAP_FLAG_* flags for request (in/out) */
+       __u32 fm_mapped_extents;/* number of extents that were mapped (out) */
+       __u32 fm_extent_count;  /* size of fm_extents array (in) */
+       __u32 fm_reserved;
+       struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
+};
+
+#define FIEMAP_MAX_OFFSET      (~0ULL)
+
+#define FIEMAP_FLAG_SYNC        0x00000001 /* sync file data before map */
+#define FIEMAP_FLAG_XATTR      0x00000002 /* map extended attribute tree */
+
+#define FIEMAP_EXTENT_LAST           0x00000001 /* Last extent in file. */
+#define FIEMAP_EXTENT_UNKNOWN     0x00000002 /* Data location unknown. */
+#define FIEMAP_EXTENT_DELALLOC   0x00000004 /* Location still pending.
+                                                   * Sets EXTENT_UNKNOWN. */
+#define FIEMAP_EXTENT_ENCODED     0x00000008 /* Data cannot be read
+                                                   * while fs is unmounted */
+#define FIEMAP_EXTENT_DATA_ENCRYPTED    0x00000080 /* Data is encrypted by fs.
+                                                   * Sets EXTENT_NO_DIRECT. */
+#define FIEMAP_EXTENT_NOT_ALIGNED       0x00000100 /* Extent offsets may not be
+                                                   * block aligned. */
+#define FIEMAP_EXTENT_DATA_INLINE       0x00000200 /* Data mixed with metadata.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_DATA_TAIL         0x00000400 /* Multiple files in block.
+                                                   * Sets EXTENT_NOT_ALIGNED.*/
+#define FIEMAP_EXTENT_UNWRITTEN         0x00000800 /* Space allocated, but
+                                                   * no data (i.e. zero). */
+#define FIEMAP_EXTENT_MERGED       0x00001000 /* File does not natively
+                                                   * support extents. Result
+                                                   * merged for efficiency. */
+
+
+static inline size_t fiemap_count_to_size(size_t extent_count)
+{
+       return (sizeof(struct ll_user_fiemap) + extent_count *
+                                              sizeof(struct ll_fiemap_extent));
+}
+
+static inline unsigned fiemap_size_to_count(size_t array_size)
+{
+       return ((array_size - sizeof(struct ll_user_fiemap)) /
+                                              sizeof(struct ll_fiemap_extent));
+}
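+
+/*
+ * For example (illustrative), a request for at most 16 extents needs a
+ * buffer of fiemap_count_to_size(16) bytes, i.e.
+ * sizeof(struct ll_user_fiemap) + 16 * sizeof(struct ll_fiemap_extent),
+ * and fiemap_size_to_count() maps that size back to 16.
+ */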
+
+#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */
+
+#ifdef FIEMAP_FLAGS_COMPAT
+#undef FIEMAP_FLAGS_COMPAT
+#endif
+
+/* Lustre specific flags - use a high bit, don't conflict with upstream flag */
+#define FIEMAP_EXTENT_NO_DIRECT         0x40000000 /* Data mapping undefined */
+#define FIEMAP_EXTENT_NET             0x80000000 /* Data stored remotely.
+                                                   * Sets NO_DIRECT flag */
+
+#endif /* _LUSTRE_FIEMAP_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h
new file mode 100644 (file)
index 0000000..93a3d7d
--- /dev/null
@@ -0,0 +1,2 @@
+#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0"
+#define LUSTRE_RELEASE 3.9.0_g6e62c21
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
new file mode 100644 (file)
index 0000000..8825460
--- /dev/null
@@ -0,0 +1,3653 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_idl.h
+ *
+ * Lustre wire protocol definitions.
+ */
+
+/** \defgroup lustreidl lustreidl
+ *
+ * Lustre wire protocol definitions.
+ *
+ * ALL structs passing over the wire should be declared here.  Structs
+ * that are used in interfaces with userspace should go in lustre_user.h.
+ *
+ * All structs being declared here should be built from simple fixed-size
+ * types (__u8, __u16, __u32, __u64) or be built from other types or
+ * structs also declared in this file.  Similarly, all flags and magic
+ * values in those structs should also be declared here.  This ensures
+ * that the Lustre wire protocol is not influenced by external dependencies.
+ *
+ * The only other acceptable items in this file are VERY SIMPLE accessor
+ * functions to avoid callers grubbing inside the structures, and the
+ * prototypes of the swabber functions for each struct.  Nothing that
+ * depends on external functions or definitions should be in here.
+ *
+ * Structs must be properly aligned to put 64-bit values on an 8-byte
+ * boundary.  Any structs being added here must also be added to
+ * utils/wirecheck.c and "make newwiretest" run to regenerate the
+ * utils/wiretest.c sources.  This allows us to verify that wire structs
+ * have the proper alignment/size on all architectures.
+ *
+ * DO NOT CHANGE any of the structs, flags, values declared here and used
+ * in released Lustre versions.  Some structs may have padding fields that
+ * can be used.  Some structs might allow addition at the end (verify this
+ * in the code to ensure that new/old clients that see this larger struct
+ * do not fail, otherwise you need to implement protocol compatibility).
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ *
+ * A swabber takes a single pointer argument.  The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_IDL_H_
+#define _LUSTRE_IDL_H_
+
+#if !defined(LASSERT) && !defined(LPU64)
+#include <linux/libcfs/libcfs.h> /* for LASSERT, LPU64, etc. */
+#endif
+
+/* Definitions shared with user-space. */
+#include <lustre/lustre_user.h>
+
+/*
+ *  GENERAL STUFF
+ */
+/* FOO_REQUEST_PORTAL is for incoming requests on the FOO
+ * FOO_REPLY_PORTAL   is for incoming replies on the FOO
+ * FOO_BULK_PORTAL    is for incoming bulk on the FOO
+ */
+
+#define CONNMGR_REQUEST_PORTAL   1
+#define CONNMGR_REPLY_PORTAL       2
+//#define OSC_REQUEST_PORTAL       3
+#define OSC_REPLY_PORTAL               4
+//#define OSC_BULK_PORTAL             5
+#define OST_IO_PORTAL             6
+#define OST_CREATE_PORTAL             7
+#define OST_BULK_PORTAL                 8
+//#define MDC_REQUEST_PORTAL       9
+#define MDC_REPLY_PORTAL              10
+//#define MDC_BULK_PORTAL            11
+#define MDS_REQUEST_PORTAL          12
+//#define MDS_REPLY_PORTAL          13
+#define MDS_BULK_PORTAL                14
+#define LDLM_CB_REQUEST_PORTAL  15
+#define LDLM_CB_REPLY_PORTAL      16
+#define LDLM_CANCEL_REQUEST_PORTAL     17
+#define LDLM_CANCEL_REPLY_PORTAL       18
+//#define PTLBD_REQUEST_PORTAL    19
+//#define PTLBD_REPLY_PORTAL        20
+//#define PTLBD_BULK_PORTAL          21
+#define MDS_SETATTR_PORTAL          22
+#define MDS_READPAGE_PORTAL        23
+#define MDS_MDS_PORTAL          24
+
+#define MGC_REPLY_PORTAL              25
+#define MGS_REQUEST_PORTAL          26
+#define MGS_REPLY_PORTAL              27
+#define OST_REQUEST_PORTAL          28
+#define FLD_REQUEST_PORTAL          29
+#define SEQ_METADATA_PORTAL        30
+#define SEQ_DATA_PORTAL                31
+#define SEQ_CONTROLLER_PORTAL    32
+#define MGS_BULK_PORTAL                33
+
+/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */
+
+/* packet types */
+#define PTL_RPC_MSG_REQUEST 4711
+#define PTL_RPC_MSG_ERR     4712
+#define PTL_RPC_MSG_REPLY   4713
+
+/* DON'T use swabbed values of MAGIC as magic! */
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B
+#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B
+
+#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2
+
+#define PTLRPC_MSG_VERSION  0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION  0x00010000
+#define LUSTRE_MDS_VERSION  0x00020000
+#define LUSTRE_OST_VERSION  0x00030000
+#define LUSTRE_DLM_VERSION  0x00040000
+#define LUSTRE_LOG_VERSION  0x00050000
+#define LUSTRE_MGS_VERSION  0x00060000
+
+typedef __u32 mdsno_t;
+typedef __u64 seqno_t;
+typedef __u64 obd_id;
+typedef __u64 obd_seq;
+typedef __s64 obd_time;
+typedef __u64 obd_size;
+typedef __u64 obd_off;
+typedef __u64 obd_blocks;
+typedef __u64 obd_valid;
+typedef __u32 obd_blksize;
+typedef __u32 obd_mode;
+typedef __u32 obd_uid;
+typedef __u32 obd_gid;
+typedef __u32 obd_flag;
+typedef __u32 obd_count;
+
+/**
+ * Describes a range of sequences: lsr_start is included in the range but
+ * lsr_end is not.
+ * The same structure is used in the fld module, where the lsr_index field
+ * holds the mdt id of the home mdt.
+ */
+struct lu_seq_range {
+       __u64 lsr_start;
+       __u64 lsr_end;
+       __u32 lsr_index;
+       __u32 lsr_flags;
+};
+
+#define LU_SEQ_RANGE_MDT       0x0
+#define LU_SEQ_RANGE_OST       0x1
+#define LU_SEQ_RANGE_ANY       0x3
+
+#define LU_SEQ_RANGE_MASK      0x3
+
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+       return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_OST;
+}
+
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * The ANY range type is only used when the fld client sends a fld query
+ * request but does not know whether the seq belongs to an MDT or an OST;
+ * the request is sent with the ANY type, meaning either seq type may be
+ * returned by the lookup.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+       return fld_range_type(range) == LU_SEQ_RANGE_ANY;
+}
+
+static inline void fld_range_set_type(struct lu_seq_range *range,
+                                     unsigned flags)
+{
+       LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+       range->lsr_flags |= flags;
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_MDT);
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+       fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns width of given range \a range
+ */
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+       return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ */
+static inline void range_init(struct lu_seq_range *range)
+{
+       range->lsr_start = range->lsr_end = range->lsr_index = 0;
+}
+
+/**
+ * check if given seq id \a s is within given range \a range
+ */
+static inline int range_within(const struct lu_seq_range *range,
+                              __u64 s)
+{
+       return s >= range->lsr_start && s < range->lsr_end;
+}
+
+static inline int range_is_sane(const struct lu_seq_range *range)
+{
+       return (range->lsr_end >= range->lsr_start);
+}
+
+static inline int range_is_zero(const struct lu_seq_range *range)
+{
+       return (range->lsr_start == 0 && range->lsr_end == 0);
+}
+
+static inline int range_is_exhausted(const struct lu_seq_range *range)
+{
+       return range_space(range) == 0;
+}
+
+/* return 0 if the two ranges have the same location */
+static inline int range_compare_loc(const struct lu_seq_range *r1,
+                                   const struct lu_seq_range *r2)
+{
+       return r1->lsr_index != r2->lsr_index ||
+              r1->lsr_flags != r2->lsr_flags;
+}
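+
+/*
+ * A minimal usage sketch of the helpers above (illustration only, kept
+ * out of the build): build an MDT range covering seqs [100, 200) and
+ * query it.
+ */
+#if 0
+static void lu_seq_range_usage(void)
+{
+       struct lu_seq_range range = { 0 };
+
+       range.lsr_start = 100;
+       range.lsr_end   = 200;                  /* end is excluded */
+       fld_range_set_mdt(&range);
+
+       LASSERT(range_is_sane(&range));
+       LASSERT(range_space(&range) == 100);    /* width = end - start */
+       LASSERT(range_within(&range, 100));     /* start is included */
+       LASSERT(!range_within(&range, 200));    /* end is not */
+       LASSERT(fld_range_is_mdt(&range));
+}
+#endif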
+
+#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s"
+
+#define PRANGE(range)          \
+       (range)->lsr_start,     \
+       (range)->lsr_end,       \
+       (range)->lsr_index,     \
+       fld_range_is_mdt(range) ? "mdt" : "ost"
+
+
+/** \defgroup lu_fid lu_fid
+ * @{ */
+
+/**
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
+ * Deprecated since HSM and SOM attributes are now stored in separate
+ * on-disk xattrs.
+ */
+enum lma_compat {
+       LMAC_HSM = 0x00000001,
+       LMAC_SOM = 0x00000002,
+};
+
+/**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ */
+enum lma_incompat {
+       LMAI_RELEASED = 0x00000001, /* file is released */
+       LMAI_AGENT = 0x00000002, /* agent inode */
+       LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object
+                                           is on the remote MDT */
+};
+#define LMA_INCOMPAT_SUPP      (LMAI_AGENT | LMAI_REMOTE_PARENT)
+
+extern void lustre_lma_swab(struct lustre_mdt_attrs *lma);
+extern void lustre_lma_init(struct lustre_mdt_attrs *lma,
+                           const struct lu_fid *fid, __u32 incompat);
+/**
+ * SOM on-disk attributes stored in a separate xattr.
+ */
+struct som_attrs {
+       /** Bitfield for supported data in this structure. For future use. */
+       __u32   som_compat;
+
+       /** Incompat feature list. The supported feature mask is available
+        * in SOM_INCOMPAT_SUPP */
+       __u32   som_incompat;
+
+       /** IO Epoch the SOM attributes belong to */
+       __u64   som_ioepoch;
+       /** total file size in objects */
+       __u64   som_size;
+       /** total fs blocks in objects */
+       __u64   som_blocks;
+       /** mds mount id the size is valid for */
+       __u64   som_mountid;
+};
+extern void lustre_som_swab(struct som_attrs *attrs);
+
+#define SOM_INCOMPAT_SUPP 0x0
+
+/**
+ * HSM on-disk attributes stored in a separate xattr.
+ */
+struct hsm_attrs {
+       /** Bitfield for supported data in this structure. For future use. */
+       __u32   hsm_compat;
+
+       /** HSM flags, see hsm_flags enum below */
+       __u32   hsm_flags;
+       /** backend archive id associated with the file */
+       __u64   hsm_arch_id;
+       /** version associated with the last archiving, if any */
+       __u64   hsm_arch_ver;
+};
+extern void lustre_hsm_swab(struct hsm_attrs *attrs);
+
+/**
+ * fid constants
+ */
+enum {
+       /** LASTID file has zero OID */
+       LUSTRE_FID_LASTID_OID = 0UL,
+       /** initial fid id value */
+       LUSTRE_FID_INIT_OID  = 1UL
+};
+
+/** returns fid object sequence */
+static inline __u64 fid_seq(const struct lu_fid *fid)
+{
+       return fid->f_seq;
+}
+
+/** returns fid object id */
+static inline __u32 fid_oid(const struct lu_fid *fid)
+{
+       return fid->f_oid;
+}
+
+/** returns fid object version */
+static inline __u32 fid_ver(const struct lu_fid *fid)
+{
+       return fid->f_ver;
+}
+
+static inline void fid_zero(struct lu_fid *fid)
+{
+       memset(fid, 0, sizeof(*fid));
+}
+
+static inline obd_id fid_ver_oid(const struct lu_fid *fid)
+{
+       return ((__u64)fid_ver(fid) << 32 | fid_oid(fid));
+}
+
+/**
+ * Note that SEQ numbers below 12 map to ldiskfs inodes that are reserved
+ * in the IGIF namespace, so these SEQ numbers can be used for other
+ * purposes without risking collisions with existing inodes.
+ *
+ * Different FID Format
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0
+ */
+enum fid_seq {
+       FID_SEQ_OST_MDT0        = 0,
+       FID_SEQ_LLOG            = 1, /* unnamed llogs */
+       FID_SEQ_ECHO            = 2,
+       FID_SEQ_OST_MDT1        = 3,
+       FID_SEQ_OST_MAX         = 9, /* Max MDT count before OST_on_FID */
+       FID_SEQ_LLOG_NAME       = 10, /* named llogs */
+       FID_SEQ_RSVD            = 11,
+       FID_SEQ_IGIF            = 12,
+       FID_SEQ_IGIF_MAX        = 0x0ffffffffULL,
+       FID_SEQ_IDIF            = 0x100000000ULL,
+       FID_SEQ_IDIF_MAX        = 0x1ffffffffULL,
+       /* Normal FID sequence starts from this value, i.e. 1<<33 */
+       FID_SEQ_START           = 0x200000000ULL,
+       /* sequence for local pre-defined FIDs listed in local_oid */
+       FID_SEQ_LOCAL_FILE      = 0x200000001ULL,
+       FID_SEQ_DOT_LUSTRE      = 0x200000002ULL,
+       /* sequence is used for local named objects FIDs generated
+        * by local_object_storage library */
+       FID_SEQ_LOCAL_NAME      = 0x200000003ULL,
+       /* Because the current FLD caches only the fid sequence (not the
+        * oid) on the client side, if a FID needs to be exposed to
+        * clients, all fids under one sequence must be located on one
+        * MDT. */
+       FID_SEQ_SPECIAL         = 0x200000004ULL,
+       FID_SEQ_QUOTA           = 0x200000005ULL,
+       FID_SEQ_QUOTA_GLB       = 0x200000006ULL,
+       FID_SEQ_ROOT            = 0x200000007ULL,  /* Located on MDT0 */
+       FID_SEQ_NORMAL          = 0x200000400ULL,
+       FID_SEQ_LOV_DEFAULT     = 0xffffffffffffffffULL
+};
+
+#define OBIF_OID_MAX_BITS      32
+#define OBIF_MAX_OID           (1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK          ((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS      48
+#define IDIF_MAX_OID           (1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK          ((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+       /* Big Filesystem Lock to serialize rename operations */
+       FID_OID_SPECIAL_BFL     = 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+       FID_OID_DOT_LUSTRE  = 1UL,
+       FID_OID_DOT_LUSTRE_OBF = 2UL,
+};
+
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+       return (seq == FID_SEQ_OST_MDT0);
+}
+
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+       return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+}
+
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+       return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+       return fid_seq_is_echo(fid_seq(fid));
+}
+
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+       return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+       /* file with OID == 0 is not llog but contains last oid */
+       return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0;
+}
+
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+       return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+}
+
+static inline int fid_seq_is_special(const __u64 seq)
+{
+       return seq == FID_SEQ_SPECIAL;
+}
+
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+       return seq == FID_SEQ_LOCAL_FILE ||
+              seq == FID_SEQ_LOCAL_NAME;
+}
+
+static inline int fid_seq_is_root(const __u64 seq)
+{
+       return seq == FID_SEQ_ROOT;
+}
+
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+       return seq == FID_SEQ_DOT_LUSTRE;
+}
+
+static inline int fid_seq_is_default(const __u64 seq)
+{
+       return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+       return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+       fid->f_seq = FID_SEQ_ROOT;
+       fid->f_oid = 1;
+       fid->f_ver = 0;
+}
+
+/**
+ * Check if a fid is igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an igif; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+       return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX;
+}
+
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+       return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a fid is idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is an idif; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+       return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX;
+}
+
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+       return fid_seq_is_idif(fid_seq(fid));
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+       return fid_seq_is_local_file(fid_seq(fid));
+}
+
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+       return (seq >= FID_SEQ_NORMAL);
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+       return fid_seq_is_norm(fid_seq(fid));
+}
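+
+/*
+ * A worked illustration (not part of the original interface) of how the
+ * predicates above partition the sequence space from enum fid_seq:
+ *
+ *   seq = 0              -> fid_seq_is_mdt0()  (legacy OST objects)
+ *   seq = 5              -> fid_seq_is_rsvd()  (1..11 are reserved)
+ *   seq = 0x12345        -> fid_seq_is_igif()  (12..2^32-1, ldiskfs inode#)
+ *   seq = 0x100010000ULL -> fid_seq_is_idif()  (2^32..2^33-1)
+ *   seq = 0x200000400ULL -> fid_seq_is_norm()  (>= FID_SEQ_NORMAL)
+ */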
+
+/* convert an OST objid into an IDIF FID SEQ number */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+       return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* convert a packed IDIF FID into an OST objid */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+       return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid;
+}
+
+/* extract ost index from IDIF FID */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+       LASSERT(fid_is_idif(fid));
+       return (fid_seq(fid) >> 16) & 0xffff;
+}
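+
+/*
+ * Worked example (illustrative values): OST object id 0x123400005678 on
+ * OST index 2 maps into the IDIF namespace and back as follows:
+ *
+ *   fid_idif_seq(0x123400005678ULL, 2)
+ *      = 0x100000000 | (2 << 16) | 0x1234 = 0x100021234
+ *   fid_idif_id(0x100021234ULL, 0x5678, 0)
+ *      = (0x1234ULL << 32) | 0x5678       = 0x123400005678
+ *   fid_idif_ost_idx() recovers (0x100021234 >> 16) & 0xffff = 2
+ */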
+
+/* extract OST sequence (group) from a wire ost_id (id/seq) pair */
+static inline obd_seq ostid_seq(const struct ost_id *ostid)
+{
+       if (fid_seq_is_mdt0(ostid->oi.oi_seq))
+               return FID_SEQ_OST_MDT0;
+
+       if (fid_seq_is_default(ostid->oi.oi_seq))
+               return FID_SEQ_LOV_DEFAULT;
+
+       if (fid_is_idif(&ostid->oi_fid))
+               return FID_SEQ_OST_MDT0;
+
+       return fid_seq(&ostid->oi_fid);
+}
+
+/* extract OST objid from a wire ost_id (id/seq) pair */
+static inline obd_id ostid_id(const struct ost_id *ostid)
+{
+       if (fid_seq_is_mdt0(ostid_seq(ostid)))
+               return ostid->oi.oi_id & IDIF_OID_MASK;
+
+       if (fid_is_idif(&ostid->oi_fid))
+               return fid_idif_id(fid_seq(&ostid->oi_fid),
+                                  fid_oid(&ostid->oi_fid), 0);
+
+       return fid_oid(&ostid->oi_fid);
+}
+
+static inline void ostid_set_seq(struct ost_id *oi, __u64 seq)
+{
+       if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) {
+               oi->oi.oi_seq = seq;
+       } else {
+               oi->oi_fid.f_seq = seq;
+               /* Note: if f_oid + f_ver is zero, we need to init it to
+                * 1; otherwise ostid_seq will treat this as an old ostid
+                * (oi_seq == 0) */
+               if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0)
+                       oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID;
+       }
+}
+
+static inline void ostid_set_seq_mdt0(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_OST_MDT0);
+}
+
+static inline void ostid_set_seq_echo(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_ECHO);
+}
+
+static inline void ostid_set_seq_llog(struct ost_id *oi)
+{
+       ostid_set_seq(oi, FID_SEQ_LLOG);
+}
+
+/**
+ * Note: we need to check oi_seq to decide where to set oi_id,
+ * so oi_seq should always be set before oi_id.
+ */
+static inline void ostid_set_id(struct ost_id *oi, __u64 oid)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               if (oid >= IDIF_MAX_OID) {
+                       CERROR("Bad "LPU64" to set "DOSTID"\n",
+                               oid, POSTID(oi));
+                       return;
+               }
+               oi->oi.oi_id = oid;
+       } else {
+               if (oid >= OBIF_MAX_OID) {
+                       CERROR("Bad "LPU64" to set "DOSTID"\n",
+                               oid, POSTID(oi));
+                       return;
+               }
+               oi->oi_fid.f_oid = oid;
+       }
+}
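+
+/*
+ * A minimal sketch (illustration only, kept out of the build) of the
+ * required ordering: set the sequence first so ostid_set_id() picks the
+ * right union member.
+ */
+#if 0
+static void ostid_fill_example(struct ost_id *oi)
+{
+       ostid_set_seq_mdt0(oi); /* selects the legacy oi_id/oi_seq fields */
+       ostid_set_id(oi, 42);   /* now lands in oi->oi.oi_id */
+}
+#endif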
+
+static inline void ostid_inc_id(struct ost_id *oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) {
+                       CERROR("Bad inc "DOSTID"\n", POSTID(oi));
+                       return;
+               }
+               oi->oi.oi_id++;
+       } else {
+               oi->oi_fid.f_oid++;
+       }
+}
+
+static inline void ostid_dec_id(struct ost_id *oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(oi)))
+               oi->oi.oi_id--;
+       else
+               oi->oi_fid.f_oid--;
+}
+
+/**
+ * Unpack an OST object id/seq (group) into a FID.  This is needed for
+ * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper
+ * FIDs.  Note that if an id/seq is already in FID/IDIF format it will
+ * be passed through unchanged.  Only legacy OST objects in "group 0"
+ * will be mapped into the IDIF namespace so that they can fit into the
+ * struct lu_fid fields without loss.  For reference see:
+ * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs
+ */
+static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid,
+                              __u32 ost_idx)
+{
+       if (ost_idx > 0xffff) {
+               CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid),
+                      ost_idx);
+               return -EBADF;
+       }
+
+       if (fid_seq_is_mdt0(ostid_seq(ostid))) {
+               /* This is a "legacy" (old 1.x/2.early) OST object in "group 0"
+                * that we map into the IDIF namespace.  It allows up to 2^48
+                * objects per OST, as this is the object namespace that has
+                * been in production for years.  This can handle create rates
+                * of 1M objects/s/OST for 9 years, or combinations thereof. */
+               if (ostid_id(ostid) >= IDIF_MAX_OID) {
+                       CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+                              POSTID(ostid), ost_idx);
+                       return -EBADF;
+               }
+               fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx);
+               /* truncate to 32 bits by assignment */
+               fid->f_oid = ostid_id(ostid);
+               /* in theory, not currently used */
+               fid->f_ver = ostid_id(ostid) >> 48;
+       } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ {
+              /* This is either an IDIF object, which identifies objects across
+               * all OSTs, or a regular FID.  The IDIF namespace maps legacy
+               * OST objects into the FID namespace.  In both cases, we just
+               * pass the FID through, no conversion needed. */
+               if (ostid->oi_fid.f_ver != 0) {
+                       CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n",
+                               POSTID(ostid), ost_idx);
+                       return -EBADF;
+               }
+               *fid = ostid->oi_fid;
+       }
+
+       return 0;
+}
+
+/* pack any OST FID into an ostid (id/seq) for the wire/disk */
+static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid)
+{
+       if (unlikely(fid_seq_is_igif(fid->f_seq))) {
+               CERROR("bad IGIF, "DFID"\n", PFID(fid));
+               return -EBADF;
+       }
+
+       if (fid_is_idif(fid)) {
+               ostid_set_seq_mdt0(ostid);
+               ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid),
+                                               fid_ver(fid)));
+       } else {
+               ostid->oi_fid = *fid;
+       }
+
+       return 0;
+}
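+
+/*
+ * A round-trip sketch (illustration only, kept out of the build): pack a
+ * legacy "group 0" object into an IDIF FID with ostid_to_fid() and
+ * unpack it again; values are arbitrary.
+ */
+#if 0
+static void ostid_fid_roundtrip(void)
+{
+       struct ost_id oi;
+       struct lu_fid fid;
+
+       ostid_set_seq_mdt0(&oi);
+       ostid_set_id(&oi, 0x5678);
+
+       if (ostid_to_fid(&fid, &oi, 2) == 0) {  /* ost_idx = 2 */
+               struct ost_id back;
+
+               LASSERT(fid_is_idif(&fid));
+               fid_to_ostid(&fid, &back);
+               LASSERT(ostid_id(&back) == 0x5678);
+       }
+}
+#endif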
+
+/* Check whether the fid is for LAST_ID */
+static inline int fid_is_last_id(const struct lu_fid *fid)
+{
+       return (fid_oid(fid) == 0);
+}
+
+/**
+ * Get inode number from an igif.
+ * \param fid an igif to get inode number from.
+ * \return inode number for the igif.
+ */
+static inline ino_t lu_igif_ino(const struct lu_fid *fid)
+{
+       return fid_seq(fid);
+}
+
+extern void lustre_swab_ost_id(struct ost_id *oid);
+
+/**
+ * Get inode generation from an igif.
+ * \param fid an igif to get inode generation from.
+ * \return inode generation for the igif.
+ */
+static inline __u32 lu_igif_gen(const struct lu_fid *fid)
+{
+       return fid_oid(fid);
+}
+
+/**
+ * Build igif from the inode number/generation.
+ */
+static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen)
+{
+       fid->f_seq = ino;
+       fid->f_oid = gen;
+       fid->f_ver = 0;
+}
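+
+/*
+ * Illustration only (kept out of the build): an igif packs an ldiskfs
+ * inode number and generation directly into a FID, so the mapping is
+ * trivially reversible.
+ */
+#if 0
+static void igif_roundtrip(__u32 ino, __u32 gen)
+{
+       struct lu_fid fid;
+
+       lu_igif_build(&fid, ino, gen);
+       LASSERT(lu_igif_ino(&fid) == ino);
+       LASSERT(lu_igif_gen(&fid) == gen);
+}
+#endif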
+
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
+static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = cpu_to_le64(fid_seq(src));
+       dst->f_oid = cpu_to_le32(fid_oid(src));
+       dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+
+static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = le64_to_cpu(fid_seq(src));
+       dst->f_oid = le32_to_cpu(fid_oid(src));
+       dst->f_ver = le32_to_cpu(fid_ver(src));
+}
+
+static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = cpu_to_be64(fid_seq(src));
+       dst->f_oid = cpu_to_be32(fid_oid(src));
+       dst->f_ver = cpu_to_be32(fid_ver(src));
+}
+
+static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+       /* check that all fields are converted */
+       CLASSERT(sizeof *src ==
+                sizeof fid_seq(src) +
+                sizeof fid_oid(src) + sizeof fid_ver(src));
+       dst->f_seq = be64_to_cpu(fid_seq(src));
+       dst->f_oid = be32_to_cpu(fid_oid(src));
+       dst->f_ver = be32_to_cpu(fid_ver(src));
+}
+
+static inline int fid_is_sane(const struct lu_fid *fid)
+{
+       return fid != NULL &&
+              ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) ||
+               fid_is_igif(fid) || fid_is_idif(fid) ||
+               fid_seq_is_rsvd(fid_seq(fid)));
+}
+
+static inline int fid_is_zero(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == 0 && fid_oid(fid) == 0;
+}
+
+extern void lustre_swab_lu_fid(struct lu_fid *fid);
+extern void lustre_swab_lu_seq_range(struct lu_seq_range *range);
+
+static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1)
+{
+       /* Check that there is no alignment padding. */
+       CLASSERT(sizeof *f0 ==
+                sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver);
+       return memcmp(f0, f1, sizeof *f0) == 0;
+}
+
+#define __diff_normalize(val0, val1)                           \
+({                                                             \
+       typeof(val0) __val0 = (val0);                           \
+       typeof(val1) __val1 = (val1);                           \
+                                                               \
+       (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1);     \
+})
+
+static inline int lu_fid_cmp(const struct lu_fid *f0,
+                            const struct lu_fid *f1)
+{
+       return
+               __diff_normalize(fid_seq(f0), fid_seq(f1)) ?:
+               __diff_normalize(fid_oid(f0), fid_oid(f1)) ?:
+               __diff_normalize(fid_ver(f0), fid_ver(f1));
+}
+
+static inline void ostid_cpu_to_le(struct ost_id *src_oi,
+                                  struct ost_id *dst_oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+               dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+               dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+       } else {
+               fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid);
+       }
+}
+
+static inline void ostid_le_to_cpu(struct ost_id *src_oi,
+                                  struct ost_id *dst_oi)
+{
+       if (fid_seq_is_mdt0(ostid_seq(src_oi))) {
+               dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+               dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+       } else {
+               fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid);
+       }
+}
+
+/** @} lu_fid */
+
+/** \defgroup lu_dir lu_dir
+ * @{ */
+
+/**
+ * Enumeration of possible directory entry attributes.
+ *
+ * Attributes follow directory entry header in the order they appear in this
+ * enumeration.
+ */
+enum lu_dirent_attrs {
+       LUDA_FID                = 0x0001,
+       LUDA_TYPE               = 0x0002,
+       LUDA_64BITHASH          = 0x0004,
+
+       /* The following attrs are for MDT internal use only,
+        * not visible to the client */
+
+       /* Verify the dirent consistency */
+       LUDA_VERIFY             = 0x8000,
+       /* Only check but not repair the dirent inconsistency */
+       LUDA_VERIFY_DRYRUN      = 0x4000,
+       /* The dirent has been repaired, or to be repaired (dryrun). */
+       LUDA_REPAIR             = 0x2000,
+       /* The system has been upgraded; the entry has been or is to be
+        * repaired (dryrun). */
+       LUDA_UPGRADE            = 0x1000,
+       /* Ignore this record, go directly to the next one. */
+       LUDA_IGNORE             = 0x0800,
+};
+
+#define LU_DIRENT_ATTRS_MASK   0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+       /** valid if LUDA_FID is set. */
+       struct lu_fid lde_fid;
+       /** a unique entry identifier: a hash or an offset. */
+       __u64    lde_hash;
+       /** total record length, including all attributes. */
+       __u16    lde_reclen;
+       /** name length */
+       __u16    lde_namelen;
+       /** optional variable size attributes following this entry.
+        *  taken from enum lu_dirent_attrs.
+        */
+       __u32    lde_attrs;
+       /** name is followed by the attributes indicated in ->lde_attrs, in
+        *  their natural order. After the last attribute, padding bytes are
+        *  added to make ->lde_reclen a multiple of 8.
+        */
+       char      lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way.
+ * It is assumed that the consumer of an attribute knows its format. This
+ * means that it is impossible to skip over an unknown attribute, except by
+ * skipping over all remaining attributes (by using ->lde_reclen), which is
+ * not too constraining, because new server versions will append new
+ * attributes at the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will almost always be requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */
+/* To have compatibility with 1.8, let's keep the fid in the lu_dirent struct. */
+
+/**
+ * File type.
+ *
+ * Aligned to 2 bytes.
+ */
+struct luda_type {
+       __u16 lt_type;
+};
+
+struct lu_dirpage {
+       __u64       ldp_hash_start;
+       __u64       ldp_hash_end;
+       __u32       ldp_flags;
+       __u32       ldp_pad0;
+       struct lu_dirent ldp_entries[0];
+};
+
+enum lu_dirpage_flags {
+       /**
+        * dirpage contains no entry.
+        */
+       LDF_EMPTY   = 1 << 0,
+       /**
+        * last entry's lde_hash equals ldp_hash_end.
+        */
+       LDF_COLLIDE = 1 << 1
+};
+
+static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
+{
+       if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
+               return NULL;
+       else
+               return dp->ldp_entries;
+}
+
+static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
+{
+       struct lu_dirent *next;
+
+       if (le16_to_cpu(ent->lde_reclen) != 0)
+               next = ((void *)ent) + le16_to_cpu(ent->lde_reclen);
+       else
+               next = NULL;
+
+       return next;
+}
+
+static inline int lu_dirent_calc_size(int namelen, __u16 attr)
+{
+       int size;
+
+       if (attr & LUDA_TYPE) {
+               const unsigned align = sizeof(struct luda_type) - 1;
+               size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
+               size += sizeof(struct luda_type);
+       } else {
+               size = sizeof(struct lu_dirent) + namelen;
+       }
+
+       return (size + 7) & ~7;
+}
+
+static inline int lu_dirent_size(struct lu_dirent *ent)
+{
+       if (le16_to_cpu(ent->lde_reclen) == 0) {
+               return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen),
+                                          le32_to_cpu(ent->lde_attrs));
+       }
+       return le16_to_cpu(ent->lde_reclen);
+}
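+
+/*
+ * A minimal walker over the wire format above (sketch only, kept out of
+ * the build): visit every entry in one lu_dirpage, stopping when
+ * lu_dirent_next() returns NULL at the final (lde_reclen == 0) entry.
+ */
+#if 0
+static void lu_dirpage_walk(struct lu_dirpage *dp)
+{
+       struct lu_dirent *ent;
+
+       for (ent = lu_dirent_start(dp); ent != NULL;
+            ent = lu_dirent_next(ent)) {
+               /* lde_name is not NUL-terminated; bound it by lde_namelen */
+               CDEBUG(D_INFO, "%.*s\n",
+                      le16_to_cpu(ent->lde_namelen), ent->lde_name);
+       }
+}
+#endif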
+
+#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
+
+/**
+ * MDS_READPAGE page size
+ *
+ * This is the directory page size packed in MDS_READPAGE RPC.
+ * It differs from PAGE_CACHE_SIZE because the client needs to access
+ * the struct lu_dirpage header packed at the beginning of the "page";
+ * without a fixed size there would be no way to find the lu_dirpage
+ * header if client and server PAGE_CACHE_SIZE differ.
+ */
+#define LU_PAGE_SHIFT 12
+#define LU_PAGE_SIZE  (1UL << LU_PAGE_SHIFT)
+#define LU_PAGE_MASK  (~(LU_PAGE_SIZE - 1))
+
+#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
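+
+/* Illustration: with 4K kernel pages (PAGE_CACHE_SHIFT == 12) this gives
+ * LU_PAGE_COUNT == 1, while a 64K-page kernel (shift 16) packs 16
+ * lu_dirpages into each cache page. */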
+
+/** @} lu_dir */
+
+struct lustre_handle {
+       __u64 cookie;
+};
+#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL
+
+static inline int lustre_handle_is_used(struct lustre_handle *lh)
+{
+       return lh->cookie != 0ull;
+}
+
+static inline int lustre_handle_equal(const struct lustre_handle *lh1,
+                                     const struct lustre_handle *lh2)
+{
+       return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+                                     struct lustre_handle *src)
+{
+       tgt->cookie = src->cookie;
+}
+
+/* flags for lm_flags */
+#define MSGHDR_AT_SUPPORT              0x1
+#define MSGHDR_CKSUM_INCOMPAT18        0x2
+
+#define lustre_msg lustre_msg_v2
+/* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
+struct lustre_msg_v2 {
+       __u32 lm_bufcount;
+       __u32 lm_secflvr;
+       __u32 lm_magic;
+       __u32 lm_repsize;
+       __u32 lm_cksum;
+       __u32 lm_flags;
+       __u32 lm_padding_2;
+       __u32 lm_padding_3;
+       __u32 lm_buflens[0];
+};
+
+/* without gss, ptlrpc_body is put in the first buffer. */
+#define PTLRPC_NUM_VERSIONS     4
+#define JOBSTATS_JOBID_SIZE     32  /* 32-byte string */
+struct ptlrpc_body_v3 {
+       struct lustre_handle pb_handle;
+       __u32 pb_type;
+       __u32 pb_version;
+       __u32 pb_opc;
+       __u32 pb_status;
+       __u64 pb_last_xid;
+       __u64 pb_last_seen;
+       __u64 pb_last_committed;
+       __u64 pb_transno;
+       __u32 pb_flags;
+       __u32 pb_op_flags;
+       __u32 pb_conn_cnt;
+       __u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+       __u32 pb_service_time; /* for rep, actual service time */
+       __u32 pb_limit;
+       __u64 pb_slv;
+       /* VBR: pre-versions */
+       __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+       /* padding for future needs */
+       __u64 pb_padding[4];
+       char  pb_jobid[JOBSTATS_JOBID_SIZE];
+};
+#define ptlrpc_body     ptlrpc_body_v3
+
+struct ptlrpc_body_v2 {
+       struct lustre_handle pb_handle;
+       __u32 pb_type;
+       __u32 pb_version;
+       __u32 pb_opc;
+       __u32 pb_status;
+       __u64 pb_last_xid;
+       __u64 pb_last_seen;
+       __u64 pb_last_committed;
+       __u64 pb_transno;
+       __u32 pb_flags;
+       __u32 pb_op_flags;
+       __u32 pb_conn_cnt;
+       __u32 pb_timeout;  /* for req, the deadline, for rep, the service est */
+       __u32 pb_service_time; /* for rep, actual service time, also used for
+                                 net_latency of req */
+       __u32 pb_limit;
+       __u64 pb_slv;
+       /* VBR: pre-versions */
+       __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS];
+       /* padding for future needs */
+       __u64 pb_padding[4];
+};
+
+extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
+
+/* message body offset for lustre_msg_v2 */
+/* ptlrpc body offset in all request/reply messages */
+#define MSG_PTLRPC_BODY_OFF            0
+
+/* normal request/reply message record offset */
+#define REQ_REC_OFF                    1
+#define REPLY_REC_OFF                  1
+
+/* ldlm request message body offset */
+#define DLM_LOCKREQ_OFF                1 /* lockreq offset */
+#define DLM_REQ_REC_OFF                2 /* normal dlm request record offset */
+
+/* ldlm intent lock message body offset */
+#define DLM_INTENT_IT_OFF              2 /* intent lock it offset */
+#define DLM_INTENT_REC_OFF             3 /* intent lock record offset */
+
+/* ldlm reply message body offset */
+#define DLM_LOCKREPLY_OFF              1 /* lockrep offset */
+#define DLM_REPLY_REC_OFF              2 /* reply record offset */
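+
+/* Illustration: an ldlm intent request therefore lays out its buffers as
+ *   [0] ptlrpc_body, [1] ldlm_request, [2] intent, [3] intent record,
+ * matching MSG_PTLRPC_BODY_OFF, DLM_LOCKREQ_OFF, DLM_INTENT_IT_OFF and
+ * DLM_INTENT_REC_OFF above. */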
+
+/** only use in req->rq_{req,rep}_swab_mask */
+#define MSG_PTLRPC_HEADER_OFF     31
+
+/* Flags that are operation-specific go in the top 16 bits. */
+#define MSG_OP_FLAG_MASK   0xffff0000
+#define MSG_OP_FLAG_SHIFT  16
+
+/* Flags that apply to all requests are in the bottom 16 bits */
+#define MSG_GEN_FLAG_MASK      0x0000ffff
+#define MSG_LAST_REPLAY        0x0001
+#define MSG_RESENT             0x0002
+#define MSG_REPLAY             0x0004
+/* #define MSG_AT_SUPPORT      0x0008
+ * This was used in early prototypes of adaptive timeouts; while there
+ * shouldn't be any users of that code, there also isn't a need for using
+ * these bits. Defer usage until at least 1.10 to avoid potential
+ * conflicts. */
+#define MSG_DELAY_REPLAY       0x0010
+#define MSG_VERSION_REPLAY     0x0020
+#define MSG_REQ_REPLAY_DONE    0x0040
+#define MSG_LOCK_REPLAY_DONE   0x0080
+
+/*
+ * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
+ */
+
+#define MSG_CONNECT_RECOVERING  0x00000001
+#define MSG_CONNECT_RECONNECT   0x00000002
+#define MSG_CONNECT_REPLAYABLE  0x00000004
+//#define MSG_CONNECT_PEER     0x8
+#define MSG_CONNECT_LIBCLIENT   0x00000010
+#define MSG_CONNECT_INITIAL     0x00000020
+#define MSG_CONNECT_ASYNC       0x00000040
+#define MSG_CONNECT_NEXT_VER    0x00000080 /* use next version of lustre_msg */
+#define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
+
+/* Connect flags */
+#define OBD_CONNECT_RDONLY             0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX               0x2ULL /*connect specific LOV idx */
+#define OBD_CONNECT_MDS                   0x4ULL /*connect from MDT to OST */
+#define OBD_CONNECT_GRANT               0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK          0x10ULL /*server takes locks for cli */
+#define OBD_CONNECT_VERSION          0x20ULL /*Lustre versions in ocd */
+#define OBD_CONNECT_REQPORTAL      0x40ULL /*Separate non-IO req portal */
+#define OBD_CONNECT_ACL                  0x80ULL /*access control lists */
+#define OBD_CONNECT_XATTR             0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW               0x200ULL /*MDS+OST create obj on write*/
+#define OBD_CONNECT_TRUNCLOCK     0x400ULL /*locks on server for punch */
+#define OBD_CONNECT_TRANSNO         0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS            0x1000ULL /*support for inodebits locks*/
+#define OBD_CONNECT_JOIN              0x2000ULL /*files can be concatenated.
+                                                 *We no longer support JOIN
+                                                 *FILE; this flag is reserved
+                                                 *just to prevent the bit
+                                                 *from being reused.*/
+#define OBD_CONNECT_ATTRFID        0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH        0x8000ULL /*No open hndl on specl nodes*/
+#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */
+#define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
+#define OBD_CONNECT_BRW_SIZE     0x40000ULL /*Max bytes per rpc */
+#define OBD_CONNECT_QUOTA64       0x80000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_MDS_CAPA    0x100000ULL /*MDS capability */
+#define OBD_CONNECT_OSS_CAPA    0x200000ULL /*OSS capability */
+#define OBD_CONNECT_CANCELSET  0x400000ULL /*Early batched cancels. */
+#define OBD_CONNECT_SOM              0x800000ULL /*Size on MDS */
+#define OBD_CONNECT_AT       0x1000000ULL /*client uses AT */
+#define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
+#define OBD_CONNECT_MDS_MDS     0x4000000ULL /*MDS-MDS connection */
+#define OBD_CONNECT_REAL           0x8000000ULL /*real connection */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*Not used since 2.4 */
+#define OBD_CONNECT_CKSUM        0x20000000ULL /*support several cksum algos*/
+#define OBD_CONNECT_FID            0x40000000ULL /*FID is supported by server */
+#define OBD_CONNECT_VBR            0x80000000ULL /*version based recovery */
+#define OBD_CONNECT_LOV_V3     0x100000000ULL /*client supports LOV v3 EA */
+#define OBD_CONNECT_GRANT_SHRINK  0x200000000ULL /* support grant shrink */
+#define OBD_CONNECT_SKIP_ORPHAN   0x400000000ULL /* don't reuse orphan objids */
+#define OBD_CONNECT_MAX_EASIZE    0x800000000ULL /* preserved for large EA */
+#define OBD_CONNECT_FULL20       0x1000000000ULL /* it is 2.0 client */
+#define OBD_CONNECT_LAYOUTLOCK   0x2000000000ULL /* client uses layout lock */
+#define OBD_CONNECT_64BITHASH    0x4000000000ULL /* client supports 64-bit
+                                                 * directory hash */
+#define OBD_CONNECT_MAXBYTES     0x8000000000ULL /* max stripe size */
+#define OBD_CONNECT_IMP_RECOV   0x10000000000ULL /* imp recovery support */
+#define OBD_CONNECT_JOBSTATS    0x20000000000ULL /* jobid in ptlrpc_body */
+#define OBD_CONNECT_UMASK       0x40000000000ULL /* create uses client umask */
+#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS
+                                                 * RPC error properly */
+#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for
+                                                 * finer space reservation */
+#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8
+                                                  * policy and 2.x server */
+#define OBD_CONNECT_LVB_TYPE   0x400000000000ULL /* variable type of LVB */
+#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */
+#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */
+#define OBD_CONNECT_SHORTIO     0x2000000000000ULL/* short io */
+#define OBD_CONNECT_PINGLESS   0x4000000000000ULL/* pings not required */
+/* XXX README XXX:
+ * Please DO NOT add flag values here before first ensuring that this same
+ * flag value is not in use on some other branch.  Please clear any such
+ * changes with senior engineers before starting to use a new flag.  Then,
+ * submit a small patch against EVERY branch that ONLY adds the new flag,
+ * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the
+ * flag to check_obd_connect_data(), and updates wiretests accordingly, so it
+ * can be approved and landed easily to reserve the flag for future use. */
+
+/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS
+ * connection.  It is a temporary bug fix for Imperative Recovery interop
+ * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for
+ * 2.2 clients/servers is no longer needed.  LU-1252/LU-1644. */
+#define OBD_CONNECT_MNE_SWAB            OBD_CONNECT_MDS_MDS
+
+#define OCD_HAS_FLAG(ocd, flg)  \
+       (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg))
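+
+/* Usage sketch (illustration only): with a negotiated obd_connect_data,
+ * OCD_HAS_FLAG(ocd, GRANT) expands to
+ * !!((ocd)->ocd_connect_flags & OBD_CONNECT_GRANT), i.e. a 0/1 feature
+ * test that reads naturally at call sites. */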
+
+
+#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE
+
+#define MDT_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+                               OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+                               OBD_CONNECT_IBITS | \
+                               OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \
+                               OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+                               OBD_CONNECT_RMT_CLIENT | \
+                               OBD_CONNECT_RMT_CLIENT_FORCE | \
+                               OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \
+                               OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \
+                               OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \
+                               OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \
+                               OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \
+                               OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \
+                               OBD_CONNECT_EINPROGRESS | \
+                               OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \
+                               OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\
+                               OBD_CONNECT_PINGLESS)
+#define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
+                               OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
+                               OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
+                               OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \
+                               OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \
+                               LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \
+                               OBD_CONNECT_RMT_CLIENT | \
+                               OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \
+                               OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \
+                               OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \
+                               OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \
+                               OBD_CONNECT_MAX_EASIZE | \
+                               OBD_CONNECT_EINPROGRESS | \
+                               OBD_CONNECT_JOBSTATS | \
+                               OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\
+                               OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \
+                               OBD_CONNECT_PINGLESS)
+#define ECHO_CONNECT_SUPPORTED (0)
+#define MGS_CONNECT_SUPPORTED  (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \
+                               OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \
+                               OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS)
+
+/* Features required for this version of the client to work with server */
+#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \
+                                OBD_CONNECT_FULL20)
+
+#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
+                                               ((patch)<<8) + (fix))
+#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255)
+#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255)
+#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255)
+#define OBD_OCD_VERSION_FIX(version)   ((int)(version)&255)
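+
+/* Worked example (illustrative): OBD_OCD_VERSION(2, 4, 0, 0) packs to
+ * 0x02040000; OBD_OCD_VERSION_MAJOR() and OBD_OCD_VERSION_MINOR() then
+ * recover 2 and 4 by shifting the same byte lanes back down. */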
+
+/* This structure is used for both request and reply.
+ *
+ * If we eventually have separate connect data for different types, which we
+ * almost certainly will, then perhaps we stick a union in here. */
+struct obd_connect_data_v1 {
+       __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+       __u32 ocd_version;       /* lustre release version number */
+       __u32 ocd_grant;         /* initial cache grant amount (bytes) */
+       __u32 ocd_index;         /* LOV index to connect to */
+       __u32 ocd_brw_size;      /* Maximum BRW size in bytes, must be 2^n */
+       __u64 ocd_ibits_known;   /* inode bits this client understands */
+       __u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+       __u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+       __u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+       __u32 ocd_unused;       /* also fix lustre_swab_connect */
+       __u64 ocd_transno;       /* first transno from client to be replayed */
+       __u32 ocd_group;         /* MDS group on OST */
+       __u32 ocd_cksum_types;   /* supported checksum algorithms */
+       __u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+       __u32 ocd_instance;      /* also fix lustre_swab_connect */
+       __u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+};
+
+struct obd_connect_data {
+       __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+       __u32 ocd_version;       /* lustre release version number */
+       __u32 ocd_grant;         /* initial cache grant amount (bytes) */
+       __u32 ocd_index;         /* LOV index to connect to */
+       __u32 ocd_brw_size;      /* Maximum BRW size in bytes */
+       __u64 ocd_ibits_known;   /* inode bits this client understands */
+       __u8  ocd_blocksize;     /* log2 of the backend filesystem blocksize */
+       __u8  ocd_inodespace;    /* log2 of the per-inode space consumption */
+       __u16 ocd_grant_extent;  /* per-extent grant overhead, in 1K blocks */
+       __u32 ocd_unused;       /* also fix lustre_swab_connect */
+       __u64 ocd_transno;       /* first transno from client to be replayed */
+       __u32 ocd_group;         /* MDS group on OST */
+       __u32 ocd_cksum_types;   /* supported checksum algorithms */
+       __u32 ocd_max_easize;    /* How big LOV EA can be on MDS */
+       __u32 ocd_instance;      /* instance # of this target */
+       __u64 ocd_maxbytes;      /* Maximum stripe size in bytes */
+       /* Fields after ocd_maxbytes are only accessible by the receiver
+        * if the corresponding flag in ocd_connect_flags is set. Accessing
+        * any field after ocd_maxbytes on the receiver without a valid flag
+        * may result in out-of-bounds memory access and a kernel oops. */
+       __u64 padding1;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding2;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding3;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding4;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding5;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding6;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding7;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding8;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 padding9;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingA;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingB;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingC;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingD;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingE;   /* added 2.1.0. also fix lustre_swab_connect */
+       __u64 paddingF;   /* added 2.1.0. also fix lustre_swab_connect */
+};
+/* XXX README XXX:
+ * Please DO NOT use any fields here before first ensuring that this same
+ * field is not in use on some other branch.  Please clear any such changes
+ * with senior engineers before starting to use a new field.  Then, submit
+ * a small patch against EVERY branch that ONLY adds the new field along with
+ * the matching OBD_CONNECT flag, so that can be approved and landed easily to
+ * reserve the flag for future use. */
+
+
+extern void lustre_swab_connect(struct obd_connect_data *ocd);
+
+/*
+ * Supported checksum algorithms. Up to 32 checksum types are supported.
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
+ * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
+ * algorithm and also the OBD_FL_CKSUM* flags.
+ */
+typedef enum {
+       OBD_CKSUM_CRC32 = 0x00000001,
+       OBD_CKSUM_ADLER = 0x00000002,
+       OBD_CKSUM_CRC32C = 0x00000004,
+} cksum_type_t;
+
+/*
+ *   OST requests: OBDO & OBD request records
+ */
+
+/* opcodes */
+typedef enum {
+       OST_REPLY      =  0,       /* reply ? */
+       OST_GETATTR    =  1,
+       OST_SETATTR    =  2,
+       OST_READ       =  3,
+       OST_WRITE      =  4,
+       OST_CREATE     =  5,
+       OST_DESTROY    =  6,
+       OST_GET_INFO   =  7,
+       OST_CONNECT    =  8,
+       OST_DISCONNECT =  9,
+       OST_PUNCH      = 10,
+       OST_OPEN       = 11,
+       OST_CLOSE      = 12,
+       OST_STATFS     = 13,
+       OST_SYNC       = 16,
+       OST_SET_INFO   = 17,
+       OST_QUOTACHECK = 18,
+       OST_QUOTACTL   = 19,
+       OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
+       OST_LAST_OPC
+} ost_cmd_t;
+#define OST_FIRST_OPC  OST_REPLY
+
+enum obdo_flags {
+       OBD_FL_INLINEDATA   = 0x00000001,
+       OBD_FL_OBDMDEXISTS  = 0x00000002,
+       OBD_FL_DELORPHAN    = 0x00000004, /* if set in o_flags delete orphans */
+       OBD_FL_NORPC    = 0x00000008, /* if set in o_flags do in OSC not OST */
+       OBD_FL_IDONLY       = 0x00000010, /* if set in o_flags, only adjust obj id */
+       OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+       OBD_FL_DEBUG_CHECK  = 0x00000040, /* echo client/server debug check */
+       OBD_FL_NO_USRQUOTA  = 0x00000100, /* the object's owner is over quota */
+       OBD_FL_NO_GRPQUOTA  = 0x00000200, /* the object's group is over quota */
+       OBD_FL_CREATE_CROW  = 0x00000400, /* object should be created on write */
+       OBD_FL_SRVLOCK      = 0x00000800, /* delegate DLM locking to server */
+       OBD_FL_CKSUM_CRC32  = 0x00001000, /* CRC32 checksum type */
+       OBD_FL_CKSUM_ADLER  = 0x00002000, /* ADLER checksum type */
+       OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
+       OBD_FL_CKSUM_RSVD2  = 0x00008000, /* for future cksum types */
+       OBD_FL_CKSUM_RSVD3  = 0x00010000, /* for future cksum types */
+       OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+       OBD_FL_MMAP      = 0x00040000, /* object is mmapped on the client.
+                                          * XXX: obsoleted - reserved for old
+                                          * clients prior to 2.2 */
+       OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
+       OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */
+
+       /* Note that while these checksum values are currently separate bits,
+        * in 2.x we can actually allow all values from 1-31 if we wanted. */
+       OBD_FL_CKSUM_ALL    = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
+                             OBD_FL_CKSUM_CRC32C,
+
+       /* mask for local-only flag, which won't be sent over network */
+       OBD_FL_LOCAL_MASK   = 0xF0000000,
+};
+
+#define LOV_MAGIC_V1      0x0BD10BD0
+#define LOV_MAGIC       LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
+
+/*
+ * magic for fully defined striping
+ * the idea is that we should have different magics for striping "hints"
+ * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct
+ * lov_mds_md_v[13]). at the moment the magics are used in the wire
+ * protocol, so we can't just change them without lengthy preparation, but
+ * we still need a mechanism to allow LOD to differentiate hint versus
+ * ready striping. so, at the moment we do a trick: the MDT knows what to
+ * expect from a request depending on the case (replay uses ready striping,
+ * a non-replay req uses hints), so the MDT replaces the magic with the
+ * appropriate one and LOD can easily understand what's inside -bzzz
+ */
+#define LOV_MAGIC_V1_DEF  0x0CD10BD0
+#define LOV_MAGIC_V3_DEF  0x0CD30BD0
+
+#define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
+#define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
+#define LOV_PATTERN_FIRST 0x100   /* first stripe is not in round-robin */
+#define LOV_PATTERN_CMOBD 0x200
+
+#define lov_ost_data lov_ost_data_v1
+struct lov_ost_data_v1 {         /* per-stripe data structure (little-endian)*/
+       struct ost_id l_ost_oi;   /* OST object ID */
+       __u32 l_ost_gen;          /* generation of this l_ost_idx */
+       __u32 l_ost_idx;          /* OST index in LOV (lov_tgt_desc->tgts) */
+};
+
+#define lov_mds_md lov_mds_md_v1
+struct lov_mds_md_v1 {     /* LOV EA mds/wire data (little-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_MAGIC_V1 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id   lmm_oi;   /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       /* lmm_stripe_count used to be __u32 */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       __u16 lmm_layout_gen;     /* layout generation number */
+       struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+/**
+ * Sigh, because pre-2.4 uses
+ * struct lov_mds_md_v1 {
+ *     ........
+ *     __u64 lmm_object_id;
+ *     __u64 lmm_object_seq;
+ *      ......
+ *      }
+ * to identify the LOV(MDT) object, and lmm_object_seq will
+ * be normal_fid, which makes it hard to fold these conversions
+ * into ostid_to_fid(), so we do the lmm_oi/fid conversion separately.
+ *
+ * We can tell the lmm_oi variants apart as follows:
+ * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0
+ * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL
+ * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k},
+ *      lmm_oi.f_ver = 0
+ *
+ * But currently lmm_oi/lsm_oi does not have any "real" usages,
+ * except for printing some information, and the user can always
+ * get the real FID from the LMA; besides, this multiple-case check
+ * might make swabbing more complicated. So we will keep using id/seq
+ * for lmm_oi.
+ */
+
+static inline void fid_to_lmm_oi(const struct lu_fid *fid,
+                                struct ost_id *oi)
+{
+       oi->oi.oi_id = fid_oid(fid);
+       oi->oi.oi_seq = fid_seq(fid);
+}
+
+static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq)
+{
+       oi->oi.oi_seq = seq;
+}
+
+static inline __u64 lmm_oi_id(struct ost_id *oi)
+{
+       return oi->oi.oi_id;
+}
+
+static inline __u64 lmm_oi_seq(struct ost_id *oi)
+{
+       return oi->oi.oi_seq;
+}
+
+static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi,
+                                   struct ost_id *src_oi)
+{
+       dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id);
+       dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq);
+}
+
+static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi,
+                                   struct ost_id *src_oi)
+{
+       dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id);
+       dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq);
+}
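+
+/*
+ * Illustration only (kept out of the build): filling lmm_oi from a FID
+ * per the table above; the sequence lands in oi_seq and the oid in
+ * oi_id verbatim.
+ */
+#if 0
+static void lmm_oi_fill_example(struct ost_id *oi, const struct lu_fid *fid)
+{
+       fid_to_lmm_oi(fid, oi);
+       LASSERT(lmm_oi_seq(oi) == fid_seq(fid));
+       LASSERT(lmm_oi_id(oi) == fid_oid(fid));
+}
+#endif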
+
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
+
+#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
+#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
+
+#define XATTR_NAME_ACL_ACCESS   "system.posix_acl_access"
+#define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
+#define XATTR_USER_PREFIX       "user."
+#define XATTR_TRUSTED_PREFIX    "trusted."
+#define XATTR_SECURITY_PREFIX   "security."
+#define XATTR_LUSTRE_PREFIX     "lustre."
+
+#define XATTR_NAME_LOV   "trusted.lov"
+#define XATTR_NAME_LMA   "trusted.lma"
+#define XATTR_NAME_LMV   "trusted.lmv"
+#define XATTR_NAME_LINK         "trusted.link"
+#define XATTR_NAME_FID   "trusted.fid"
+#define XATTR_NAME_VERSION      "trusted.version"
+#define XATTR_NAME_SOM         "trusted.som"
+#define XATTR_NAME_HSM         "trusted.hsm"
+#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace"
+
+struct lov_mds_md_v3 {     /* LOV EA mds/wire data (little-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_MAGIC_V3 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id   lmm_oi;   /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       /* lmm_stripe_count used to be __u32 */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       __u16 lmm_layout_gen;     /* layout generation number */
+       char  lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */
+       struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+#define OBD_MD_FLID    (0x00000001ULL) /* object ID */
+#define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
+#define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
+#define OBD_MD_FLCTIME     (0x00000008ULL) /* change time */
+#define OBD_MD_FLSIZE      (0x00000010ULL) /* size */
+#define OBD_MD_FLBLOCKS    (0x00000020ULL) /* allocated blocks count */
+#define OBD_MD_FLBLKSZ     (0x00000040ULL) /* block size */
+#define OBD_MD_FLMODE      (0x00000080ULL) /* access bits (mode & ~S_IFMT) */
+#define OBD_MD_FLTYPE      (0x00000100ULL) /* object type (mode & S_IFMT) */
+#define OBD_MD_FLUID       (0x00000200ULL) /* user ID */
+#define OBD_MD_FLGID       (0x00000400ULL) /* group ID */
+#define OBD_MD_FLFLAGS     (0x00000800ULL) /* flags word */
+#define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
+#define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
+/*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
+#define OBD_MD_FLRDEV      (0x00010000ULL) /* device number */
+#define OBD_MD_FLEASIZE    (0x00020000ULL) /* extended attribute data */
+#define OBD_MD_LINKNAME    (0x00040000ULL) /* symbolic link target */
+#define OBD_MD_FLHANDLE    (0x00080000ULL) /* file/lock handle */
+#define OBD_MD_FLCKSUM     (0x00100000ULL) /* bulk data checksum */
+#define OBD_MD_FLQOS       (0x00200000ULL) /* quality of service stats */
+/*#define OBD_MD_FLOSCOPQ    (0x00400000ULL) osc opaque data, never used */
+#define OBD_MD_FLCOOKIE    (0x00800000ULL) /* log cancellation cookie */
+#define OBD_MD_FLGROUP     (0x01000000ULL) /* group */
+#define OBD_MD_FLFID       (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLEPOCH     (0x04000000ULL) /* ->ost write with ioepoch */
+                                          /* ->mds if epoch opens or closes */
+#define OBD_MD_FLGRANT     (0x08000000ULL) /* ost preallocation space grant */
+#define OBD_MD_FLDIREA     (0x10000000ULL) /* dir's extended attribute data */
+#define OBD_MD_FLUSRQUOTA  (0x20000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLGRPQUOTA  (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
+
+#define OBD_MD_MDS      (0x0000000100000000ULL) /* where an inode lives on */
+#define OBD_MD_REINT       (0x0000000200000000ULL) /* reintegrate oa */
+#define OBD_MD_MEA      (0x0000000400000000ULL) /* CMD split EA  */
+
+/* OBD_MD_MDTIDX was used to get the MDT index, but it has never been used
+ * over the wire and has been obsolete since 2.3 */
+/* #define OBD_MD_MDTIDX      (0x0000000800000000ULL) */
+
+#define OBD_MD_FLXATTR       (0x0000001000000000ULL) /* xattr */
+#define OBD_MD_FLXATTRLS     (0x0000002000000000ULL) /* xattr list */
+#define OBD_MD_FLXATTRRM     (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL    (0x0000008000000000ULL) /* ACL */
+#define OBD_MD_FLRMTPERM     (0x0000010000000000ULL) /* remote permission */
+#define OBD_MD_FLMDSCAPA     (0x0000020000000000ULL) /* MDS capability */
+#define OBD_MD_FLOSSCAPA     (0x0000040000000000ULL) /* OSS capability */
+#define OBD_MD_FLCKSPLIT     (0x0000080000000000ULL) /* Check split on server */
+#define OBD_MD_FLCROSSREF    (0x0000100000000000ULL) /* Cross-ref case */
+#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes
+                                                     * under lock */
+#define OBD_MD_FLOBJCOUNT    (0x0000400000000000ULL) /* for multiple destroy */
+
+#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */
+#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */
+#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */
+#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
+
+#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
+
+#define OBD_MD_FLGETATTR (OBD_MD_FLID    | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
+                         OBD_MD_FLCTIME | OBD_MD_FLSIZE  | OBD_MD_FLBLKSZ | \
+                         OBD_MD_FLMODE  | OBD_MD_FLTYPE  | OBD_MD_FLUID   | \
+                         OBD_MD_FLGID   | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \
+                         OBD_MD_FLGENER | OBD_MD_FLRDEV  | OBD_MD_FLGROUP)
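+
+/* Usage sketch (editor's illustration, not part of the wire protocol): a
+ * reply consumer tests the valid mask before trusting a field, e.g.
+ *
+ *     if (body->valid & OBD_MD_FLSIZE)
+ *             i_size_write(inode, body->size);
+ *
+ * OBD_MD_FLGETATTR above is simply the union of the basic stat(2) bits.
+ */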
+
+/* don't forget obdo_fid which is way down at the bottom so it can
+ * come after the definition of llog_cookie */
+
+enum hss_valid {
+       HSS_SETMASK     = 0x01,
+       HSS_CLEARMASK   = 0x02,
+       HSS_ARCHIVE_ID  = 0x04,
+};
+
+struct hsm_state_set {
+       __u32   hss_valid;
+       __u32   hss_archive_id;
+       __u64   hss_setmask;
+       __u64   hss_clearmask;
+};
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss);
+
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
+/* ost_body.data values for OST_BRW */
+
+#define OBD_BRW_READ       0x01
+#define OBD_BRW_WRITE     0x02
+#define OBD_BRW_RWMASK   (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_SYNC       0x08 /* this page is a part of synchronous
+                                     * transfer and is not accounted in
+                                     * the grant. */
+#define OBD_BRW_CHECK     0x10
+#define OBD_BRW_FROM_GRANT      0x20 /* the osc manages this under llite */
+#define OBD_BRW_GRANTED         0x40 /* the ost manages this */
+#define OBD_BRW_NOCACHE         0x80 /* this page is a part of non-cached IO */
+#define OBD_BRW_NOQUOTA        0x100
+#define OBD_BRW_SRVLOCK        0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC    0x400 /* Server may delay commit to disk */
+#define OBD_BRW_MEMALLOC       0x800 /* Client runs in the "kswapd" context */
+#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */
+#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */
+
+#define OBD_OBJECT_EOF 0xffffffffffffffffULL
+
+#define OST_MIN_PRECREATE 32
+#define OST_MAX_PRECREATE 20000
+
+struct obd_ioobj {
+       struct ost_id   ioo_oid;        /* object ID, if multi-obj BRW */
+       __u32           ioo_max_brw;    /* low 16 bits were o_mode before 2.4,
+                                        * now (PTLRPC_BULK_OPS_COUNT - 1) in
+                                        * high 16 bits in 2.4 and later */
+       __u32           ioo_bufcnt;     /* number of niobufs for this object */
+};
+
+#define IOOBJ_MAX_BRW_BITS     16
+#define IOOBJ_TYPE_MASK                ((1U << IOOBJ_MAX_BRW_BITS) - 1)
+#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1)
+#define ioobj_max_brw_set(ioo, num)                                    \
+do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0)
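+
+/* Round-trip sketch (editor's illustration): the BRW count is stored biased
+ * by one in the high 16 bits, so for example
+ *
+ *     ioobj_max_brw_set(&ioo, 4);        // ioo.ioo_max_brw == 3 << 16
+ *     n = ioobj_max_brw_get(&ioo);       // n == 4
+ */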
+
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can safely be put in an array */
+struct niobuf_remote {
+       __u64 offset;
+       __u32 len;
+       __u32 flags;
+};
+
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
+/* lock value block communicated between the filter and llite */
+
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
+ * negative, i.e. because ((MASK + rc) & MASK) != MASK. */
+#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
+#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
+#define OST_LVB_IS_ERR(blocks)                                   \
+       ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK)
+#define OST_LVB_SET_ERR(blocks, rc)                                 \
+       do { blocks = OST_LVB_ERR_INIT + rc; } while (0)
+#define OST_LVB_GET_ERR(blocks)    (int)(blocks - OST_LVB_ERR_INIT)
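+
+/* Worked example (editor's illustration): for rc == -2,
+ *
+ *     OST_LVB_SET_ERR(blocks, -2);    // blocks == 0xffbadbad7ffffffeULL
+ *     OST_LVB_IS_ERR(blocks)          // true, the 0xffbadbad tag survives
+ *     OST_LVB_GET_ERR(blocks)         // -2
+ *
+ * The extra 0x80000000 in OST_LVB_ERR_INIT absorbs the borrow caused by
+ * adding a negative rc, so the tag tested by OST_LVB_ERR_MASK is preserved.
+ */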
+
+struct ost_lvb_v1 {
+       __u64           lvb_size;
+       obd_time        lvb_mtime;
+       obd_time        lvb_atime;
+       obd_time        lvb_ctime;
+       __u64           lvb_blocks;
+};
+
+extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb);
+
+struct ost_lvb {
+       __u64           lvb_size;
+       obd_time        lvb_mtime;
+       obd_time        lvb_atime;
+       obd_time        lvb_ctime;
+       __u64           lvb_blocks;
+       __u32           lvb_mtime_ns;
+       __u32           lvb_atime_ns;
+       __u32           lvb_ctime_ns;
+       __u32           lvb_padding;
+};
+
+extern void lustre_swab_ost_lvb(struct ost_lvb *lvb);
+
+/*
+ *   lquota data structures
+ */
+
+#ifndef QUOTABLOCK_BITS
+#define QUOTABLOCK_BITS 10
+#endif
+
+#ifndef QUOTABLOCK_SIZE
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+#endif
+
+#ifndef toqb
+#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
+#endif
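+
+/* Worked example (editor's illustration): toqb() rounds a byte count up to
+ * the next 1KB quota block, so toqb(0) == 0, toqb(1) == 1, toqb(1024) == 1
+ * and toqb(1025) == 2. */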
+
+/* The lquota_id structure is a union of all the possible identifier types
+ * that can be used with quota; this includes:
+ * - 64-bit user ID
+ * - 64-bit group ID
+ * - a FID which can be used for per-directory quota in the future */
+union lquota_id {
+       struct lu_fid   qid_fid; /* FID for per-directory quota */
+       __u64           qid_uid; /* user identifier */
+       __u64           qid_gid; /* group identifier */
+};
+
+/* quotactl management */
+struct obd_quotactl {
+       __u32                   qc_cmd;
+       __u32                   qc_type; /* see Q_* flag below */
+       __u32                   qc_id;
+       __u32                   qc_stat;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+};
+
+extern void lustre_swab_obd_quotactl(struct obd_quotactl *q);
+
+#define Q_QUOTACHECK   0x800100 /* deprecated as of 2.4 */
+#define Q_INITQUOTA    0x800101 /* deprecated as of 2.4  */
+#define Q_GETOINFO     0x800102 /* get obd quota info */
+#define Q_GETOQUOTA    0x800103 /* get obd quotas */
+#define Q_FINVALIDATE  0x800104 /* deprecated as of 2.4 */
+
+#define Q_COPY(out, in, member) (out)->member = (in)->member
+
+#define QCTL_COPY(out, in)             \
+do {                                   \
+       Q_COPY(out, in, qc_cmd);        \
+       Q_COPY(out, in, qc_type);       \
+       Q_COPY(out, in, qc_id);         \
+       Q_COPY(out, in, qc_stat);       \
+       Q_COPY(out, in, qc_dqinfo);     \
+       Q_COPY(out, in, qc_dqblk);      \
+} while (0)
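+
+/* Usage sketch (editor's illustration, hypothetical variables): QCTL_COPY()
+ * duplicates only the quotactl members listed above, member by member:
+ *
+ *     struct obd_quotactl repl;
+ *     QCTL_COPY(&repl, oqctl);        // repl now mirrors *oqctl
+ */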
+
+/* Body of quota request used for quota acquire/release RPCs between quota
+ * master (aka QMT) and slaves (aka QSD). */
+struct quota_body {
+       struct lu_fid   qb_fid;     /* FID of global index packing the pool ID
+                                     * and type (data or metadata) as well as
+                                     * the quota type (user or group). */
+       union lquota_id qb_id;      /* uid or gid or directory FID */
+       __u32           qb_flags;   /* see below */
+       __u32           qb_padding;
+       __u64           qb_count;   /* acquire/release count (kbytes/inodes) */
+       __u64           qb_usage;   /* current slave usage (kbytes/inodes) */
+       __u64           qb_slv_ver; /* slave index file version */
+       struct lustre_handle    qb_lockh;     /* per-ID lock handle */
+       struct lustre_handle    qb_glb_lockh; /* global lock handle */
+       __u64           qb_padding1[4];
+};
+
+/* When quota_body is used in the reply to the quota global intent lock
+ * (IT_QUOTA_CONN), qb_fid contains the slave index file FID. */
+#define qb_slv_fid     qb_fid
+/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in
+ * quota reply */
+#define qb_qunit       qb_usage
+
+#define QUOTA_DQACQ_FL_ACQ     0x1  /* acquire quota */
+#define QUOTA_DQACQ_FL_PREACQ  0x2  /* pre-acquire */
+#define QUOTA_DQACQ_FL_REL     0x4  /* release quota */
+#define QUOTA_DQACQ_FL_REPORT  0x8  /* report usage */
+
+extern void lustre_swab_quota_body(struct quota_body *b);
+
+/* Quota types currently supported */
+enum {
+       LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */
+       LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */
+       LQUOTA_TYPE_MAX
+};
+
+/* There are 2 different resource types on which a quota limit can be enforced:
+ * - inodes on the MDTs
+ * - blocks on the OSTs */
+enum {
+       LQUOTA_RES_MD           = 0x01, /* skip 0 to avoid null oid in FID */
+       LQUOTA_RES_DT           = 0x02,
+       LQUOTA_LAST_RES,
+       LQUOTA_FIRST_RES        = LQUOTA_RES_MD
+};
+#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1)
+
+/*
+ * Space accounting support
+ * Format of an accounting record, providing disk usage information for a given
+ * user or group
+ */
+struct lquota_acct_rec { /* 16 bytes */
+       __u64 bspace;  /* current space in use */
+       __u64 ispace;  /* current # inodes in use */
+};
+
+/*
+ * Global quota index support
+ * Format of a global record, providing global quota settings for a given quota
+ * identifier
+ */
+struct lquota_glb_rec { /* 32 bytes */
+       __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */
+       __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */
+       __u64 qbr_time;      /* grace time, in seconds */
+       __u64 qbr_granted;   /* how much is granted to slaves, in #inodes or
+                             * kbytes */
+};
+
+/*
+ * Slave index support
+ * Format of a slave record, recording how much space is granted to a given
+ * slave
+ */
+struct lquota_slv_rec { /* 8 bytes */
+       __u64 qsr_granted; /* space granted to the slave for the key=ID,
+                           * in #inodes or kbytes */
+};
+
+/* Data structures associated with the quota locks */
+
+/* Glimpse descriptor used for the index & per-ID quota locks */
+struct ldlm_gl_lquota_desc {
+       union lquota_id gl_id;    /* quota ID subject to the glimpse */
+       __u64           gl_flags; /* see LQUOTA_FL* below */
+       __u64           gl_ver;   /* new index version */
+       __u64           gl_hardlimit; /* new hardlimit or qunit value */
+       __u64           gl_softlimit; /* new softlimit */
+       __u64           gl_time;
+       __u64           gl_pad2;
+};
+#define gl_qunit       gl_hardlimit /* current qunit value used when
+                                     * glimpsing per-ID quota locks */
+
+/* quota glimpse flags */
+#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */
+
+/* LVB used with quota (global and per-ID) locks */
+struct lquota_lvb {
+       __u64   lvb_flags;      /* see LQUOTA_FL* above */
+       __u64   lvb_id_may_rel; /* space that might be released later */
+       __u64   lvb_id_rel;     /* space released by the slave for this ID */
+       __u64   lvb_id_qunit;   /* current qunit value */
+       __u64   lvb_pad1;
+};
+
+extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb);
+
+/* LVB used with global quota lock */
+#define lvb_glb_ver  lvb_id_may_rel /* current version of the global index */
+
+/* op codes */
+typedef enum {
+       QUOTA_DQACQ     = 601,
+       QUOTA_DQREL     = 602,
+       QUOTA_LAST_OPC
+} quota_cmd_t;
+#define QUOTA_FIRST_OPC        QUOTA_DQACQ
+
+/*
+ *   MDS REQ RECORDS
+ */
+
+/* opcodes */
+typedef enum {
+       MDS_GETATTR             = 33,
+       MDS_GETATTR_NAME        = 34,
+       MDS_CLOSE               = 35,
+       MDS_REINT               = 36,
+       MDS_READPAGE            = 37,
+       MDS_CONNECT             = 38,
+       MDS_DISCONNECT          = 39,
+       MDS_GETSTATUS           = 40,
+       MDS_STATFS              = 41,
+       MDS_PIN                 = 42,
+       MDS_UNPIN               = 43,
+       MDS_SYNC                = 44,
+       MDS_DONE_WRITING        = 45,
+       MDS_SET_INFO            = 46,
+       MDS_QUOTACHECK          = 47,
+       MDS_QUOTACTL            = 48,
+       MDS_GETXATTR            = 49,
+       MDS_SETXATTR            = 50, /* obsolete, now it's MDS_REINT op */
+       MDS_WRITEPAGE           = 51,
+       MDS_IS_SUBDIR           = 52,
+       MDS_GET_INFO            = 53,
+       MDS_HSM_STATE_GET       = 54,
+       MDS_HSM_STATE_SET       = 55,
+       MDS_HSM_ACTION          = 56,
+       MDS_HSM_PROGRESS        = 57,
+       MDS_HSM_REQUEST         = 58,
+       MDS_HSM_CT_REGISTER     = 59,
+       MDS_HSM_CT_UNREGISTER   = 60,
+       MDS_SWAP_LAYOUTS        = 61,
+       MDS_LAST_OPC
+} mds_cmd_t;
+
+#define MDS_FIRST_OPC    MDS_GETATTR
+
+
+/* opcodes for object update */
+typedef enum {
+       UPDATE_OBJ      = 1000,
+       UPDATE_LAST_OPC
+} update_cmd_t;
+
+#define UPDATE_FIRST_OPC    UPDATE_OBJ
+
+/*
+ * Do not exceed 63
+ */
+
+typedef enum {
+       REINT_SETATTR  = 1,
+       REINT_CREATE   = 2,
+       REINT_LINK     = 3,
+       REINT_UNLINK   = 4,
+       REINT_RENAME   = 5,
+       REINT_OPEN     = 6,
+       REINT_SETXATTR = 7,
+       REINT_RMENTRY  = 8,
+/*     REINT_WRITE    = 9, */
+       REINT_MAX
+} mds_reint_t, mdt_reint_t;
+
+extern void lustre_swab_generic_32s (__u32 *val);
+
+/* the disposition of the intent outlines what was executed */
+#define DISP_IT_EXECD  0x00000001
+#define DISP_LOOKUP_EXECD    0x00000002
+#define DISP_LOOKUP_NEG      0x00000004
+#define DISP_LOOKUP_POS      0x00000008
+#define DISP_OPEN_CREATE     0x00000010
+#define DISP_OPEN_OPEN       0x00000020
+#define DISP_ENQ_COMPLETE    0x00400000
+#define DISP_ENQ_OPEN_REF    0x00800000
+#define DISP_ENQ_CREATE_REF  0x01000000
+#define DISP_OPEN_LOCK       0x02000000
+
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001       /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002       /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN   0x000004       /* For opened files */
+#define MDS_INODELOCK_LAYOUT 0x000008       /* for layout */
+#define MDS_INODELOCK_PERM   0x000010       /* for permission */
+
+#define MDS_INODELOCK_MAXSHIFT 4
+/* This FULL lock is useful to take on unlink-type operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
+
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
+
+/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * name[2,3] fields that need to be used for the quota id (also a FID). */
+enum {
+       LUSTRE_RES_ID_SEQ_OFF = 0,
+       LUSTRE_RES_ID_VER_OID_OFF = 1,
+       LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
+       LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
+       LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
+       LUSTRE_RES_ID_HSH_OFF = 3
+};
+
+#define MDS_STATUS_CONN 1
+#define MDS_STATUS_LOV 2
+
+/* mdt_thread_info.mti_flags. */
+enum md_op_flags {
+       /* The flag indicates Size-on-MDS attributes are changed. */
+       MF_SOM_CHANGE      = (1 << 0),
+       /* Flag indicates an epoch opens or closes. */
+       MF_EPOCH_OPEN      = (1 << 1),
+       MF_EPOCH_CLOSE    = (1 << 2),
+       MF_MDC_CANCEL_FID1      = (1 << 3),
+       MF_MDC_CANCEL_FID2      = (1 << 4),
+       MF_MDC_CANCEL_FID3      = (1 << 5),
+       MF_MDC_CANCEL_FID4      = (1 << 6),
+       /* There is a pending attribute update. */
+       MF_SOM_AU              = (1 << 7),
+       /* Cancel OST locks while getting OST attributes. */
+       MF_GETATTR_LOCK  = (1 << 8),
+       MF_GET_MDT_IDX    = (1 << 9),
+};
+
+#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE)
+
+#define LUSTRE_BFLAG_UNCOMMITTED_WRITES   0x1
+
+/* these should be identical to their EXT4_*_FL counterparts, they are
+ * redefined here only to avoid dragging in fs/ext4/ext4.h */
+#define LUSTRE_SYNC_FL  0x00000008 /* Synchronous updates */
+#define LUSTRE_IMMUTABLE_FL    0x00000010 /* Immutable file */
+#define LUSTRE_APPEND_FL       0x00000020 /* writes to file may only append */
+#define LUSTRE_NOATIME_FL      0x00000080 /* do not update atime */
+#define LUSTRE_DIRSYNC_FL      0x00010000 /* dirsync behaviour (dir only) */
+
+/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values
+ * for the client inode i_flags.  The LUSTRE_*_FL are the Lustre wire
+ * protocol equivalents of LDISKFS_*_FL values stored on disk, while
+ * the S_* flags are kernel-internal values that change between kernel
+ * versions.  These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS.
+ * See b=16526 for a full history. */
+static inline int ll_ext_to_inode_flags(int flags)
+{
+       return (((flags & LUSTRE_SYNC_FL)      ? S_SYNC      : 0) |
+               ((flags & LUSTRE_NOATIME_FL)   ? S_NOATIME   : 0) |
+               ((flags & LUSTRE_APPEND_FL)    ? S_APPEND    : 0) |
+#if defined(S_DIRSYNC)
+               ((flags & LUSTRE_DIRSYNC_FL)   ? S_DIRSYNC   : 0) |
+#endif
+               ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0));
+}
+
+static inline int ll_inode_to_ext_flags(int iflags)
+{
+       return (((iflags & S_SYNC)      ? LUSTRE_SYNC_FL      : 0) |
+               ((iflags & S_NOATIME)   ? LUSTRE_NOATIME_FL   : 0) |
+               ((iflags & S_APPEND)    ? LUSTRE_APPEND_FL    : 0) |
+#if defined(S_DIRSYNC)
+               ((iflags & S_DIRSYNC)   ? LUSTRE_DIRSYNC_FL   : 0) |
+#endif
+               ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0));
+}
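+
+/* Round-trip note (editor's illustration): only the five flags above survive
+ * conversion, e.g.
+ *
+ *     ll_inode_to_ext_flags(ll_ext_to_inode_flags(LUSTRE_APPEND_FL))
+ *
+ * yields LUSTRE_APPEND_FL again, while any unknown wire bit is dropped.
+ */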
+
+struct mdt_body {
+       struct lu_fid  fid1;
+       struct lu_fid  fid2;
+       struct lustre_handle handle;
+       __u64     valid;
+       __u64     size;   /* Offset, in the case of MDS_READPAGE */
+       obd_time        mtime;
+       obd_time        atime;
+       obd_time        ctime;
+       __u64     blocks; /* XID, in the case of MDS_READPAGE */
+       __u64     ioepoch;
+       __u64          unused1; /* was "ino" until 2.4.0 */
+       __u32     fsuid;
+       __u32     fsgid;
+       __u32     capability;
+       __u32     mode;
+       __u32     uid;
+       __u32     gid;
+       __u32     flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */
+       __u32     rdev;
+       __u32     nlink; /* #bytes to read in the case of MDS_READPAGE */
+       __u32          unused2; /* was "generation" until 2.4.0 */
+       __u32     suppgid;
+       __u32     eadatasize;
+       __u32     aclsize;
+       __u32     max_mdsize;
+       __u32     max_cookiesize;
+       __u32     uid_h; /* high 32-bits of uid, for FUID */
+       __u32     gid_h; /* high 32-bits of gid, for FUID */
+       __u32     padding_5; /* also fix lustre_swab_mdt_body */
+       __u64     padding_6;
+       __u64     padding_7;
+       __u64     padding_8;
+       __u64     padding_9;
+       __u64     padding_10;
+}; /* 216 */
+
+extern void lustre_swab_mdt_body (struct mdt_body *b);
+
+struct mdt_ioepoch {
+       struct lustre_handle handle;
+       __u64  ioepoch;
+       __u32  flags;
+       __u32  padding;
+};
+
+extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b);
+
+/* permissions for md_perm.mp_perm */
+enum {
+       CFS_SETUID_PERM = 0x01,
+       CFS_SETGID_PERM = 0x02,
+       CFS_SETGRP_PERM = 0x04,
+       CFS_RMTACL_PERM = 0x08,
+       CFS_RMTOWN_PERM = 0x10
+};
+
+/* inode access permission for remote user; the inode info is omitted
+ * since the client already knows it. */
+struct mdt_remote_perm {
+       __u32      rp_uid;
+       __u32      rp_gid;
+       __u32      rp_fsuid;
+       __u32      rp_fsuid_h;
+       __u32      rp_fsgid;
+       __u32      rp_fsgid_h;
+       __u32      rp_access_perm; /* MAY_READ/WRITE/EXEC */
+       __u32      rp_padding;
+};
+
+extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p);
+
+struct mdt_rec_setattr {
+       __u32      sa_opcode;
+       __u32      sa_cap;
+       __u32      sa_fsuid;
+       __u32      sa_fsuid_h;
+       __u32      sa_fsgid;
+       __u32      sa_fsgid_h;
+       __u32      sa_suppgid;
+       __u32      sa_suppgid_h;
+       __u32      sa_padding_1;
+       __u32      sa_padding_1_h;
+       struct lu_fid   sa_fid;
+       __u64      sa_valid;
+       __u32      sa_uid;
+       __u32      sa_gid;
+       __u64      sa_size;
+       __u64      sa_blocks;
+       obd_time        sa_mtime;
+       obd_time        sa_atime;
+       obd_time        sa_ctime;
+       __u32      sa_attr_flags;
+       __u32      sa_mode;
+       __u32      sa_bias;      /* some operation flags */
+       __u32      sa_padding_3;
+       __u32      sa_padding_4;
+       __u32      sa_padding_5;
+};
+
+extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
+
+/*
+ * Attribute flags used in mdt_rec_setattr::sa_valid.
+ * The kernel's #defines for ATTR_* should not be used over the network
+ * since the client and MDS may run different kernels (see bug 13828)
+ * Therefore, we should only use MDS_ATTR_* attributes for sa_valid.
+ */
+#define MDS_ATTR_MODE    0x1ULL /* = 1 */
+#define MDS_ATTR_UID      0x2ULL /* = 2 */
+#define MDS_ATTR_GID      0x4ULL /* = 4 */
+#define MDS_ATTR_SIZE    0x8ULL /* = 8 */
+#define MDS_ATTR_ATIME 0x10ULL /* = 16 */
+#define MDS_ATTR_MTIME 0x20ULL /* = 32 */
+#define MDS_ATTR_CTIME 0x40ULL /* = 64 */
+#define MDS_ATTR_ATIME_SET    0x80ULL /* = 128 */
+#define MDS_ATTR_MTIME_SET   0x100ULL /* = 256 */
+#define MDS_ATTR_FORCE       0x200ULL /* = 512, not a change, but force the change */
+#define MDS_ATTR_ATTR_FLAG   0x400ULL /* = 1024 */
+#define MDS_ATTR_KILL_SUID   0x800ULL /* = 2048 */
+#define MDS_ATTR_KILL_SGID  0x1000ULL /* = 4096 */
+#define MDS_ATTR_CTIME_SET  0x2000ULL /* = 8192 */
+#define MDS_ATTR_FROM_OPEN  0x4000ULL /* = 16384, called from open path, i.e. O_TRUNC */
+#define MDS_ATTR_BLOCKS     0x8000ULL /* = 32768 */
+
+#ifndef FMODE_READ
+#define FMODE_READ            00000001
+#define FMODE_WRITE          00000002
+#endif
+
+#define MDS_FMODE_CLOSED        00000000
+#define MDS_FMODE_EXEC    00000004
+/* IO Epoch is opened on a closed file. */
+#define MDS_FMODE_EPOCH          01000000
+/* IO Epoch is opened on a file truncate. */
+#define MDS_FMODE_TRUNC          02000000
+/* Size-on-MDS Attribute Update is pending. */
+#define MDS_FMODE_SOM      04000000
+
+#define MDS_OPEN_CREATED        00000010
+#define MDS_OPEN_CROSS    00000020
+
+#define MDS_OPEN_CREAT    00000100
+#define MDS_OPEN_EXCL      00000200
+#define MDS_OPEN_TRUNC    00001000
+#define MDS_OPEN_APPEND          00002000
+#define MDS_OPEN_SYNC      00010000
+#define MDS_OPEN_DIRECTORY       00200000
+
+#define MDS_OPEN_BY_FID                040000000 /* open_by_fid for known object */
+#define MDS_OPEN_DELAY_CREATE  0100000000 /* delay initial object create */
+#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE     0400000000 /* open for join file.
+                                          * JOIN FILE is no longer
+                                          * supported; this flag is
+                                          * reserved only to prevent
+                                          * the bit from being reused. */
+
+#define MDS_OPEN_LOCK   04000000000 /* This open requires open lock */
+#define MDS_OPEN_HAS_EA      010000000000 /* specify object create pattern */
+#define MDS_OPEN_HAS_OBJS    020000000000 /* just set the EA, the objects exist */
+#define MDS_OPEN_NORESTORE  0100000000000ULL /* Do not restore file at open */
+#define MDS_OPEN_NEWSTRIPE  0200000000000ULL /* New stripe needed (restripe or
+                                             * hsm restore) */
+#define MDS_OPEN_VOLATILE   0400000000000ULL /* File is volatile = created
+                                               unlinked */
+
+/* permission to create a non-directory file */
+#define MAY_CREATE      (1 << 7)
+/* permission to create a directory file */
+#define MAY_LINK       (1 << 8)
+/* permission to delete from the directory */
+#define MAY_UNLINK      (1 << 9)
+/* source's permission for rename */
+#define MAY_RENAME_SRC  (1 << 10)
+/* target's permission for rename */
+#define MAY_RENAME_TAR  (1 << 11)
+/* partial (parent's) VTX permission check */
+#define MAY_VTX_PART    (1 << 12)
+/* full VTX permission check */
+#define MAY_VTX_FULL    (1 << 13)
+/* lfs rgetfacl permission check */
+#define MAY_RGETFACL    (1 << 14)
+
+enum {
+       MDS_CHECK_SPLIT         = 1 << 0,
+       MDS_CROSS_REF           = 1 << 1,
+       MDS_VTX_BYPASS          = 1 << 2,
+       MDS_PERM_BYPASS         = 1 << 3,
+       MDS_SOM                 = 1 << 4,
+       MDS_QUOTA_IGNORE        = 1 << 5,
+       MDS_CLOSE_CLEANUP       = 1 << 6,
+       MDS_KEEP_ORPHAN         = 1 << 7,
+       MDS_RECOV_OPEN          = 1 << 8,
+       MDS_DATA_MODIFIED       = 1 << 9,
+       MDS_CREATE_VOLATILE     = 1 << 10,
+       MDS_OWNEROVERRIDE       = 1 << 11,
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_create {
+       __u32      cr_opcode;
+       __u32      cr_cap;
+       __u32      cr_fsuid;
+       __u32      cr_fsuid_h;
+       __u32      cr_fsgid;
+       __u32      cr_fsgid_h;
+       __u32      cr_suppgid1;
+       __u32      cr_suppgid1_h;
+       __u32      cr_suppgid2;
+       __u32      cr_suppgid2_h;
+       struct lu_fid   cr_fid1;
+       struct lu_fid   cr_fid2;
+       struct lustre_handle cr_old_handle; /* handle in case of open replay */
+       obd_time        cr_time;
+       __u64      cr_rdev;
+       __u64      cr_ioepoch;
+       __u64      cr_padding_1;   /* rr_blocks */
+       __u32      cr_mode;
+       __u32      cr_bias;
+       /* the helpers set/get_mrc_cr_flags() must be used to access the
+        * 64-bit cr_flags [cr_flags_l, cr_flags_h]; the split lets cr_flags
+        * grow without breaking 1.8 compatibility */
+       __u32      cr_flags_l;     /* for use with open, low  32 bits  */
+       __u32      cr_flags_h;     /* for use with open, high 32 bits */
+       __u32      cr_umask;       /* umask for create */
+       __u32      cr_padding_4;   /* rr_padding_4 */
+};
+
+static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags)
+{
+       mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFULL);
+       mrc->cr_flags_h = (__u32)(flags >> 32);
+}
+
+static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc)
+{
+       return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32));
+}
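+
+/* Round-trip sketch (editor's illustration):
+ *
+ *     set_mrc_cr_flags(&mrc, MDS_OPEN_CREAT | MDS_OPEN_NORESTORE);
+ *     flags = get_mrc_cr_flags(&mrc);  // both flags are back
+ *
+ * MDS_OPEN_CREAT fits in cr_flags_l, while MDS_OPEN_NORESTORE (bit 33)
+ * lands in cr_flags_h, which is why the helpers are mandatory.
+ */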
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_link {
+       __u32      lk_opcode;
+       __u32      lk_cap;
+       __u32      lk_fsuid;
+       __u32      lk_fsuid_h;
+       __u32      lk_fsgid;
+       __u32      lk_fsgid_h;
+       __u32      lk_suppgid1;
+       __u32      lk_suppgid1_h;
+       __u32      lk_suppgid2;
+       __u32      lk_suppgid2_h;
+       struct lu_fid   lk_fid1;
+       struct lu_fid   lk_fid2;
+       obd_time        lk_time;
+       __u64      lk_padding_1;   /* rr_atime */
+       __u64      lk_padding_2;   /* rr_ctime */
+       __u64      lk_padding_3;   /* rr_size */
+       __u64      lk_padding_4;   /* rr_blocks */
+       __u32      lk_bias;
+       __u32      lk_padding_5;   /* rr_mode */
+       __u32      lk_padding_6;   /* rr_flags */
+       __u32      lk_padding_7;   /* rr_padding_2 */
+       __u32      lk_padding_8;   /* rr_padding_3 */
+       __u32      lk_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_unlink {
+       __u32      ul_opcode;
+       __u32      ul_cap;
+       __u32      ul_fsuid;
+       __u32      ul_fsuid_h;
+       __u32      ul_fsgid;
+       __u32      ul_fsgid_h;
+       __u32      ul_suppgid1;
+       __u32      ul_suppgid1_h;
+       __u32      ul_suppgid2;
+       __u32      ul_suppgid2_h;
+       struct lu_fid   ul_fid1;
+       struct lu_fid   ul_fid2;
+       obd_time        ul_time;
+       __u64      ul_padding_2;   /* rr_atime */
+       __u64      ul_padding_3;   /* rr_ctime */
+       __u64      ul_padding_4;   /* rr_size */
+       __u64      ul_padding_5;   /* rr_blocks */
+       __u32      ul_bias;
+       __u32      ul_mode;
+       __u32      ul_padding_6;   /* rr_flags */
+       __u32      ul_padding_7;   /* rr_padding_2 */
+       __u32      ul_padding_8;   /* rr_padding_3 */
+       __u32      ul_padding_9;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_rename {
+       __u32      rn_opcode;
+       __u32      rn_cap;
+       __u32      rn_fsuid;
+       __u32      rn_fsuid_h;
+       __u32      rn_fsgid;
+       __u32      rn_fsgid_h;
+       __u32      rn_suppgid1;
+       __u32      rn_suppgid1_h;
+       __u32      rn_suppgid2;
+       __u32      rn_suppgid2_h;
+       struct lu_fid   rn_fid1;
+       struct lu_fid   rn_fid2;
+       obd_time        rn_time;
+       __u64      rn_padding_1;   /* rr_atime */
+       __u64      rn_padding_2;   /* rr_ctime */
+       __u64      rn_padding_3;   /* rr_size */
+       __u64      rn_padding_4;   /* rr_blocks */
+       __u32      rn_bias;     /* some operation flags */
+       __u32      rn_mode;     /* cross-ref rename has mode */
+       __u32      rn_padding_5;   /* rr_flags */
+       __u32      rn_padding_6;   /* rr_padding_2 */
+       __u32      rn_padding_7;   /* rr_padding_3 */
+       __u32      rn_padding_8;   /* rr_padding_4 */
+};
+
+/* instance of mdt_reint_rec */
+struct mdt_rec_setxattr {
+       __u32      sx_opcode;
+       __u32      sx_cap;
+       __u32      sx_fsuid;
+       __u32      sx_fsuid_h;
+       __u32      sx_fsgid;
+       __u32      sx_fsgid_h;
+       __u32      sx_suppgid1;
+       __u32      sx_suppgid1_h;
+       __u32      sx_suppgid2;
+       __u32      sx_suppgid2_h;
+       struct lu_fid   sx_fid;
+       __u64      sx_padding_1;   /* These three are rr_fid2 */
+       __u32      sx_padding_2;
+       __u32      sx_padding_3;
+       __u64      sx_valid;
+       obd_time        sx_time;
+       __u64      sx_padding_5;   /* rr_ctime */
+       __u64      sx_padding_6;   /* rr_size */
+       __u64      sx_padding_7;   /* rr_blocks */
+       __u32      sx_size;
+       __u32      sx_flags;
+       __u32      sx_padding_8;   /* rr_flags */
+       __u32      sx_padding_9;   /* rr_padding_2 */
+       __u32      sx_padding_10;  /* rr_padding_3 */
+       __u32      sx_padding_11;  /* rr_padding_4 */
+};
+
+/*
+ * mdt_rec_reint is the template for all mdt_reint_xxx structures.
+ * Do NOT change the size of various members, otherwise the swabbing
+ * in lustre_swab_mdt_rec_reint() will be broken.
+ *
+ * If you add new members in other mdt_reint_xxx structures and need to use
+ * the rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also.
+ */
+struct mdt_rec_reint {
+       __u32      rr_opcode;
+       __u32      rr_cap;
+       __u32      rr_fsuid;
+       __u32      rr_fsuid_h;
+       __u32      rr_fsgid;
+       __u32      rr_fsgid_h;
+       __u32      rr_suppgid1;
+       __u32      rr_suppgid1_h;
+       __u32      rr_suppgid2;
+       __u32      rr_suppgid2_h;
+       struct lu_fid   rr_fid1;
+       struct lu_fid   rr_fid2;
+       obd_time        rr_mtime;
+       obd_time        rr_atime;
+       obd_time        rr_ctime;
+       __u64      rr_size;
+       __u64      rr_blocks;
+       __u32      rr_bias;
+       __u32      rr_mode;
+       __u32      rr_flags;
+       __u32      rr_flags_h;
+       __u32      rr_umask;
+       __u32      rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
+};
+
+extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr);
+
+struct lmv_desc {
+       __u32 ld_tgt_count;             /* how many MDS's */
+       __u32 ld_active_tgt_count;       /* how many active */
+       __u32 ld_default_stripe_count;     /* how many objects are used */
+       __u32 ld_pattern;                 /* default MEA_MAGIC_* */
+       __u64 ld_default_hash_size;
+       __u64 ld_padding_1;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_padding_2;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_qos_maxage;           /* in seconds */
+       __u32 ld_padding_3;             /* also fix lustre_swab_lmv_desc */
+       __u32 ld_padding_4;             /* also fix lustre_swab_lmv_desc */
+       struct obd_uuid ld_uuid;
+};
+
+extern void lustre_swab_lmv_desc (struct lmv_desc *ld);
+
+/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */
+struct lmv_stripe_md {
+       __u32    mea_magic;
+       __u32    mea_count;
+       __u32    mea_master;
+       __u32    mea_padding;
+       char      mea_pool_name[LOV_MAXPOOLNAME];
+       struct lu_fid mea_ids[0];
+};
+
+extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea);
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32        0x7fffffffUL
+#define MAX_HASH_SIZE      0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+enum fld_rpc_opc {
+       FLD_QUERY                      = 900,
+       FLD_LAST_OPC,
+       FLD_FIRST_OPC              = FLD_QUERY
+};
+
+enum seq_rpc_opc {
+       SEQ_QUERY                      = 700,
+       SEQ_LAST_OPC,
+       SEQ_FIRST_OPC              = SEQ_QUERY
+};
+
+enum seq_op {
+       SEQ_ALLOC_SUPER = 0,
+       SEQ_ALLOC_META = 1
+};
+
+/*
+ *  LOV data structures
+ */
+
+#define LOV_MAX_UUID_BUFFER_SIZE  8192
+/* The size of the buffer the lov/mdc reserves for the
+ * array of UUIDs returned by the MDS.  With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
+#define LOV_DESC_MAGIC 0xB0CCDE5C
+
+/* LOV settings descriptor (should only contain static info) */
+struct lov_desc {
+       __u32 ld_tgt_count;             /* how many OBD's */
+       __u32 ld_active_tgt_count;       /* how many active */
+       __u32 ld_default_stripe_count;     /* how many objects are used */
+       __u32 ld_pattern;                 /* default PATTERN_RAID0 */
+       __u64 ld_default_stripe_size;      /* in bytes */
+       __u64 ld_default_stripe_offset;    /* in bytes */
+       __u32 ld_padding_0;             /* unused */
+       __u32 ld_qos_maxage;           /* in seconds */
+       __u32 ld_padding_1;             /* also fix lustre_swab_lov_desc */
+       __u32 ld_padding_2;             /* also fix lustre_swab_lov_desc */
+       struct obd_uuid ld_uuid;
+};
+
+#define ld_magic ld_active_tgt_count       /* for swabbing from llogs */
+
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
+/*
+ *   LDLM requests:
+ */
+/* opcodes -- MUST be distinct from OST/MDS opcodes */
+typedef enum {
+       LDLM_ENQUEUE     = 101,
+       LDLM_CONVERT     = 102,
+       LDLM_CANCEL      = 103,
+       LDLM_BL_CALLBACK = 104,
+       LDLM_CP_CALLBACK = 105,
+       LDLM_GL_CALLBACK = 106,
+       LDLM_SET_INFO    = 107,
+       LDLM_LAST_OPC
+} ldlm_cmd_t;
+#define LDLM_FIRST_OPC LDLM_ENQUEUE
+
+#define RES_NAME_SIZE 4
+struct ldlm_res_id {
+       __u64 name[RES_NAME_SIZE];
+};
+
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
+static inline int ldlm_res_eq(const struct ldlm_res_id *res0,
+                             const struct ldlm_res_id *res1)
+{
+       return !memcmp(res0, res1, sizeof(*res0));
+}
+
+/* lock types */
+typedef enum {
+       LCK_MINMODE = 0,
+       LCK_EX      = 1,
+       LCK_PW      = 2,
+       LCK_PR      = 4,
+       LCK_CW      = 8,
+       LCK_CR      = 16,
+       LCK_NL      = 32,
+       LCK_GROUP   = 64,
+       LCK_COS     = 128,
+       LCK_MAXMODE
+} ldlm_mode_t;
+
+#define LCK_MODE_NUM    8
+
+typedef enum {
+       LDLM_PLAIN     = 10,
+       LDLM_EXTENT    = 11,
+       LDLM_FLOCK     = 12,
+       LDLM_IBITS     = 13,
+       LDLM_MAX_TYPE
+} ldlm_type_t;
+
+#define LDLM_MIN_TYPE LDLM_PLAIN
+
+struct ldlm_extent {
+       __u64 start;
+       __u64 end;
+       __u64 gid;
+};
+
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+                                     struct ldlm_extent *ex2)
+{
+       return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
+/* check if @ex1 contains @ex2 */
+static inline int ldlm_extent_contain(struct ldlm_extent *ex1,
+                                     struct ldlm_extent *ex2)
+{
+       return (ex1->start <= ex2->start) && (ex1->end >= ex2->end);
+}
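+
+/* Worked example (editor's illustration): for ex1 = [0, 100] and
+ * ex2 = [50, 150], ldlm_extent_overlap() is true but ldlm_extent_contain()
+ * is false; extents are inclusive on both ends. */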
+
+struct ldlm_inodebits {
+       __u64 bits;
+};
+
+struct ldlm_flock_wire {
+       __u64 lfw_start;
+       __u64 lfw_end;
+       __u64 lfw_owner;
+       __u32 lfw_padding;
+       __u32 lfw_pid;
+};
+
+/* it's important that the fields of the ldlm_extent structure match
+ * the first fields of the ldlm_flock structure because there is only
+ * one ldlm_swab routine to process the ldlm_policy_data_t union. if
+ * this ever changes we will need to swab the union differently based
+ * on the resource type. */
+
+typedef union {
+       struct ldlm_extent l_extent;
+       struct ldlm_flock_wire l_flock;
+       struct ldlm_inodebits l_inodebits;
+} ldlm_wire_policy_data_t;
+
+extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d);
+
+union ldlm_gl_desc {
+       struct ldlm_gl_lquota_desc      lquota_desc;
+};
+
+extern void lustre_swab_gl_desc(union ldlm_gl_desc *);
+
+struct ldlm_intent {
+       __u64 opc;
+};
+
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
+struct ldlm_resource_desc {
+       ldlm_type_t lr_type;
+       __u32 lr_padding;       /* also fix lustre_swab_ldlm_resource_desc */
+       struct ldlm_res_id lr_name;
+};
+
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
+struct ldlm_lock_desc {
+       struct ldlm_resource_desc l_resource;
+       ldlm_mode_t l_req_mode;
+       ldlm_mode_t l_granted_mode;
+       ldlm_wire_policy_data_t l_policy_data;
+};
+
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
+#define LDLM_LOCKREQ_HANDLES 2
+#define LDLM_ENQUEUE_CANCEL_OFF 1
+
+struct ldlm_request {
+       __u32 lock_flags;
+       __u32 lock_count;
+       struct ldlm_lock_desc lock_desc;
+       struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES];
+};
+
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
+/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available.
+ * Otherwise, 2 are available. */
+#define ldlm_request_bufsize(count, type)                              \
+({                                                                   \
+       int _avail = LDLM_LOCKREQ_HANDLES;                            \
+       _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \
+       sizeof(struct ldlm_request) +                              \
+       (count > _avail ? count - _avail : 0) *                  \
+       sizeof(struct lustre_handle);                              \
+})
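+
+/* Worked example (editor's illustration): with LDLM_LOCKREQ_HANDLES == 2,
+ * ldlm_request_bufsize(3, LDLM_ENQUEUE) reserves one embedded slot for the
+ * enqueue itself, leaving one free, so it adds space for two extra
+ * lustre_handles; ldlm_request_bufsize(3, LDLM_CANCEL) adds only one. */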
+
+struct ldlm_reply {
+       __u32 lock_flags;
+       __u32 lock_padding;     /* also fix lustre_swab_ldlm_reply */
+       struct ldlm_lock_desc lock_desc;
+       struct lustre_handle lock_handle;
+       __u64  lock_policy_res1;
+       __u64  lock_policy_res2;
+};
+
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
+#define ldlm_flags_to_wire(flags)    ((__u32)(flags))
+#define ldlm_flags_from_wire(flags)  ((__u64)(flags))
+
+/*
+ * Opcodes for mountconf (mgs and mgc)
+ */
+typedef enum {
+       MGS_CONNECT = 250,
+       MGS_DISCONNECT,
+       MGS_EXCEPTION,   /* node died, etc. */
+       MGS_TARGET_REG, /* whenever target starts up */
+       MGS_TARGET_DEL,
+       MGS_SET_INFO,
+       MGS_CONFIG_READ,
+       MGS_LAST_OPC
+} mgs_cmd_t;
+#define MGS_FIRST_OPC MGS_CONNECT
+
+#define MGS_PARAM_MAXLEN 1024
+#define KEY_SET_INFO "set_info"
+
+struct mgs_send_param {
+       char         mgs_param[MGS_PARAM_MAXLEN];
+};
+
+/* We pass this info to the MGS so it can write config logs */
+#define MTI_NAME_MAXLEN  64
+#define MTI_PARAM_MAXLEN 4096
+#define MTI_NIDS_MAX     32
+struct mgs_target_info {
+       __u32       mti_lustre_ver;
+       __u32       mti_stripe_index;
+       __u32       mti_config_ver;
+       __u32       mti_flags;
+       __u32       mti_nid_count;
+       __u32       mti_instance; /* Running instance of target */
+       char         mti_fsname[MTI_NAME_MAXLEN];
+       char         mti_svname[MTI_NAME_MAXLEN];
+       char         mti_uuid[sizeof(struct obd_uuid)];
+       __u64       mti_nids[MTI_NIDS_MAX];     /* host nids (lnet_nid_t)*/
+       char         mti_params[MTI_PARAM_MAXLEN];
+};
+extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo);
+
+struct mgs_nidtbl_entry {
+       __u64      mne_version;    /* table version of this entry */
+       __u32      mne_instance;   /* target instance # */
+       __u32      mne_index;      /* target index */
+       __u32      mne_length;     /* length of this entry, in bytes */
+       __u8        mne_type;       /* target type LDD_F_SV_TYPE_OST/MDT */
+       __u8        mne_nid_type;   /* type of NID (must be zero); reserved for IPv6 */
+       __u8        mne_nid_size;   /* size of each NID, in bytes */
+       __u8        mne_nid_count;  /* # of NIDs in buffer */
+       union {
+               lnet_nid_t nids[0];     /* variable size buffer for NIDs. */
+       } u;
+};
+extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo);
+
+struct mgs_config_body {
+       char     mcb_name[MTI_NAME_MAXLEN]; /* logname */
+       __u64    mcb_offset;    /* next index of config log to request */
+       __u16    mcb_type;      /* type of log: CONFIG_T_[CONFIG|RECOVER] */
+       __u8     mcb_reserved;
+       __u8     mcb_bits;      /* bits unit size of config log */
+       __u32    mcb_units;     /* # of units for bulk transfer */
+};
+extern void lustre_swab_mgs_config_body(struct mgs_config_body *body);
+
+struct mgs_config_res {
+       __u64    mcr_offset;    /* index of last config log */
+       __u64    mcr_size;      /* size of the log */
+};
+extern void lustre_swab_mgs_config_res(struct mgs_config_res *body);
+
+/* Config marker flags (in config log) */
+#define CM_START       0x01
+#define CM_END  0x02
+#define CM_SKIP        0x04
+#define CM_UPGRADE146  0x08
+#define CM_EXCLUDE     0x10
+#define CM_START_SKIP (CM_START | CM_SKIP)
+
+struct cfg_marker {
+       __u32        cm_step;       /* aka config version */
+       __u32        cm_flags;
+       __u32        cm_vers;       /* lustre release version number */
+       __u32        cm_padding;    /* 64 bit align */
+       obd_time          cm_createtime; /* when this record was first created */
+       obd_time          cm_canceltime; /* when this record is no longer valid */
+       char          cm_tgtname[MTI_NAME_MAXLEN];
+       char          cm_comment[MTI_NAME_MAXLEN];
+};
+
+extern void lustre_swab_cfg_marker(struct cfg_marker *marker,
+                                  int swab, int size);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+typedef enum {
+       OBD_PING = 400,
+       OBD_LOG_CANCEL,
+       OBD_QC_CALLBACK,
+       OBD_IDX_READ,
+       OBD_LAST_OPC
+} obd_cmd_t;
+#define OBD_FIRST_OPC OBD_PING
+
+/* catalog of log objects */
+
+/** Identifier for a single log object */
+struct llog_logid {
+       struct ost_id           lgl_oi;
+       __u32              lgl_ogen;
+} __attribute__((packed));
+
+/** Records written to the CATALOGS list */
+#define CATLIST "CATALOGS"
+struct llog_catid {
+       struct llog_logid       lci_logid;
+       __u32              lci_padding1;
+       __u32              lci_padding2;
+       __u32              lci_padding3;
+} __attribute__((packed));
+
+/* Log data record types - there is no specific reason that these need to
+ * be related to the RPC opcodes, but no reason not to (may be handy later?)
+ */
+#define LLOG_OP_MAGIC 0x10600000
+#define LLOG_OP_MASK  0xfff00000
+
+typedef enum {
+       LLOG_PAD_MAGIC          = LLOG_OP_MAGIC | 0x00000,
+       OST_SZ_REC              = LLOG_OP_MAGIC | 0x00f00,
+       /* OST_RAID1_REC        = LLOG_OP_MAGIC | 0x01000, never used */
+       MDS_UNLINK_REC          = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) |
+                                 REINT_UNLINK, /* obsolete after 2.5.0 */
+       MDS_UNLINK64_REC        = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+                                 REINT_UNLINK,
+       /* MDS_SETATTR_REC      = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */
+       MDS_SETATTR64_REC       = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) |
+                                 REINT_SETATTR,
+       OBD_CFG_REC             = LLOG_OP_MAGIC | 0x20000,
+       /* PTL_CFG_REC          = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */
+       LLOG_GEN_REC            = LLOG_OP_MAGIC | 0x40000,
+       /* LLOG_JOIN_REC        = LLOG_OP_MAGIC | 0x50000, obsolete  1.8.0 */
+       CHANGELOG_REC           = LLOG_OP_MAGIC | 0x60000,
+       CHANGELOG_USER_REC      = LLOG_OP_MAGIC | 0x70000,
+       LLOG_HDR_MAGIC          = LLOG_OP_MAGIC | 0x45539,
+       LLOG_LOGID_MAGIC        = LLOG_OP_MAGIC | 0x4553b,
+} llog_op_type;
+
+#define LLOG_REC_HDR_NEEDS_SWABBING(r) \
+       (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC))
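+
+/* Worked example (editor's note): llog records are stored in the writer's
+ * byte order.  Read on an opposite-endian host, OBD_CFG_REC (0x10620000)
+ * appears as __swab32(0x10620000) == 0x00006210, and
+ * 0x00006210 & __swab32(LLOG_OP_MASK) == 0x00006010 ==
+ * __swab32(LLOG_OP_MAGIC), so the macro correctly reports that the whole
+ * record needs swabbing. */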
+
+/** Log record header - stored in little endian order.
+ * Each record must start with this struct, end with a llog_rec_tail,
+ * and be a multiple of 256 bits in size.
+ */
+struct llog_rec_hdr {
+       __u32   lrh_len;
+       __u32   lrh_index;
+       __u32   lrh_type;
+       __u32   lrh_id;
+};
+
+struct llog_rec_tail {
+       __u32   lrt_len;
+       __u32   lrt_index;
+};
+
+/* Where the data follows just after the header */
+#define REC_DATA(ptr)                                          \
+       ((void *)((char *)ptr + sizeof(struct llog_rec_hdr)))
+
+#define REC_DATA_LEN(rec)                                      \
+       (rec->lrh_len - sizeof(struct llog_rec_hdr) -           \
+        sizeof(struct llog_rec_tail))
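+
+/* Usage sketch (editor's illustration): given a struct llog_rec_hdr *rec,
+ *
+ *     void *payload = REC_DATA(rec);
+ *     int   paylen  = REC_DATA_LEN(rec);
+ *
+ * yields everything between the header and the trailing llog_rec_tail.
+ */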
+
+struct llog_logid_rec {
+       struct llog_rec_hdr     lid_hdr;
+       struct llog_logid       lid_id;
+       __u32                   lid_padding1;
+       __u64                   lid_padding2;
+       __u64                   lid_padding3;
+       struct llog_rec_tail    lid_tail;
+} __attribute__((packed));
+
+struct llog_unlink_rec {
+       struct llog_rec_hdr     lur_hdr;
+       obd_id                  lur_oid;
+       obd_count               lur_oseq;
+       obd_count               lur_count;
+       struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_unlink64_rec {
+       struct llog_rec_hdr     lur_hdr;
+       struct lu_fid           lur_fid;
+       obd_count               lur_count; /* to destroy the lost precreated */
+       __u32                   lur_padding1;
+       __u64                   lur_padding2;
+       __u64                   lur_padding3;
+       struct llog_rec_tail    lur_tail;
+} __attribute__((packed));
+
+struct llog_setattr64_rec {
+       struct llog_rec_hdr     lsr_hdr;
+       struct ost_id           lsr_oi;
+       __u32                   lsr_uid;
+       __u32                   lsr_uid_h;
+       __u32                   lsr_gid;
+       __u32                   lsr_gid_h;
+       __u64                   lsr_padding;
+       struct llog_rec_tail    lsr_tail;
+} __attribute__((packed));
+
+struct llog_size_change_rec {
+       struct llog_rec_hdr     lsc_hdr;
+       struct ll_fid           lsc_fid;
+       __u32                   lsc_ioepoch;
+       __u32                   lsc_padding1;
+       __u64                   lsc_padding2;
+       __u64                   lsc_padding3;
+       struct llog_rec_tail    lsc_tail;
+} __attribute__((packed));
+
+#define CHANGELOG_MAGIC 0xca103000
+
+/** \a changelog_rec_type's that can't be masked */
+#define CHANGELOG_MINMASK (1 << CL_MARK)
+/** bits covering all \a changelog_rec_type's */
+#define CHANGELOG_ALLMASK 0xFFFFFFFF
+/** default \a changelog_rec_type mask */
+#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE))
+
+/* changelog llog name, needed by client replicators */
+#define CHANGELOG_CATALOG "changelog_catalog"
+
+struct changelog_setinfo {
+       __u64 cs_recno;
+       __u32 cs_id;
+} __attribute__((packed));
+
+/** changelog record */
+struct llog_changelog_rec {
+       struct llog_rec_hdr  cr_hdr;
+       struct changelog_rec cr;
+       struct llog_rec_tail cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+struct llog_changelog_ext_rec {
+       struct llog_rec_hdr      cr_hdr;
+       struct changelog_ext_rec cr;
+       struct llog_rec_tail     cr_tail; /**< for_sizeof_only */
+} __attribute__((packed));
+
+#define CHANGELOG_USER_PREFIX "cl"
+
+struct llog_changelog_user_rec {
+       struct llog_rec_hdr   cur_hdr;
+       __u32            cur_id;
+       __u32            cur_padding;
+       __u64            cur_endrec;
+       struct llog_rec_tail  cur_tail;
+} __attribute__((packed));
+
+/* Old llog gen for compatibility */
+struct llog_gen {
+       __u64 mnt_cnt;
+       __u64 conn_cnt;
+} __attribute__((packed));
+
+struct llog_gen_rec {
+       struct llog_rec_hdr     lgr_hdr;
+       struct llog_gen         lgr_gen;
+       __u64                   padding1;
+       __u64                   padding2;
+       __u64                   padding3;
+       struct llog_rec_tail    lgr_tail;
+};
+
+/* On-disk header structure of each log object, stored in little endian order */
+#define LLOG_CHUNK_SIZE         8192
+#define LLOG_HEADER_SIZE       (96)
+#define LLOG_BITMAP_BYTES       (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE)
+
+#define LLOG_MIN_REC_SIZE       (24) /* round(llog_rec_hdr + llog_rec_tail) */
+
+/* flags for the logs */
+enum llog_flag {
+       LLOG_F_ZAP_WHEN_EMPTY   = 0x1,
+       LLOG_F_IS_CAT           = 0x2,
+       LLOG_F_IS_PLAIN         = 0x4,
+};
+
+struct llog_log_hdr {
+       struct llog_rec_hdr     llh_hdr;
+       obd_time                llh_timestamp;
+       __u32              llh_count;
+       __u32              llh_bitmap_offset;
+       __u32              llh_size;
+       __u32              llh_flags;
+       __u32              llh_cat_idx;
+       /* for a catalog the first plain slot is next to it */
+       struct obd_uuid  llh_tgtuuid;
+       __u32              llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23];
+       __u32              llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)];
+       struct llog_rec_tail    llh_tail;
+} __attribute__((packed));
+
+#define LLOG_BITMAP_SIZE(llh)  (__u32)((llh->llh_hdr.lrh_len -         \
+                                       llh->llh_bitmap_offset -        \
+                                       sizeof(llh->llh_tail)) * 8)
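+
+/* Worked example (editor's illustration, assuming the default layout): with
+ * an 8192-byte chunk the bitmap starts at offset 88 and the tail takes the
+ * last 8 bytes, so LLOG_BITMAP_SIZE() == (8192 - 88 - 8) * 8 == 64768 bits,
+ * one per possible record index in the log. */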
+
+/** log cookies are used to reference a specific log file and a record therein */
+struct llog_cookie {
+       struct llog_logid       lgc_lgl;
+       __u32              lgc_subsys;
+       __u32              lgc_index;
+       __u32              lgc_padding;
+} __attribute__((packed));
+
+/** llog protocol */
+enum llogd_rpc_ops {
+       LLOG_ORIGIN_HANDLE_CREATE       = 501,
+       LLOG_ORIGIN_HANDLE_NEXT_BLOCK   = 502,
+       LLOG_ORIGIN_HANDLE_READ_HEADER  = 503,
+       LLOG_ORIGIN_HANDLE_WRITE_REC    = 504,
+       LLOG_ORIGIN_HANDLE_CLOSE        = 505,
+       LLOG_ORIGIN_CONNECT          = 506,
+       LLOG_CATINFO                    = 507,  /* deprecated */
+       LLOG_ORIGIN_HANDLE_PREV_BLOCK   = 508,
+       LLOG_ORIGIN_HANDLE_DESTROY      = 509,  /* for destroy llog object*/
+       LLOG_LAST_OPC,
+       LLOG_FIRST_OPC            = LLOG_ORIGIN_HANDLE_CREATE
+};
+
+struct llogd_body {
+       struct llog_logid  lgd_logid;
+       __u32 lgd_ctxt_idx;
+       __u32 lgd_llh_flags;
+       __u32 lgd_index;
+       __u32 lgd_saved_index;
+       __u32 lgd_len;
+       __u64 lgd_cur_offset;
+} __attribute__((packed));
+
+struct llogd_conn_body {
+       struct llog_gen  lgdc_gen;
+       struct llog_logid       lgdc_logid;
+       __u32              lgdc_ctxt_idx;
+} __attribute__((packed));
+
+/* Note: 64-bit types are 64-bit aligned in structure */
+struct obdo {
+       obd_valid              o_valid; /* hot fields in this obdo */
+       struct ost_id      o_oi;
+       obd_id            o_parent_seq;
+       obd_size                o_size;  /* o_size-o_blocks == ost_lvb */
+       obd_time                o_mtime;
+       obd_time                o_atime;
+       obd_time                o_ctime;
+       obd_blocks            o_blocks;       /* brw: cli sent cached bytes */
+       obd_size                o_grant;
+
+       /* 32-bit fields start here: keep an even number of them via padding */
+       obd_blksize          o_blksize;      /* optimal IO blocksize */
+       obd_mode                o_mode;  /* brw: cli sent cache remain */
+       obd_uid          o_uid;
+       obd_gid          o_gid;
+       obd_flag                o_flags;
+       obd_count              o_nlink; /* brw: checksum */
+       obd_count              o_parent_oid;
+       obd_count               o_misc;         /* brw: o_dropped */
+
+       __u64              o_ioepoch;      /* epoch in ost writes */
+       __u32              o_stripe_idx;   /* holds stripe idx */
+       __u32              o_parent_ver;
+       struct lustre_handle    o_handle;       /* brw: lock handle to prolong
+                                                * locks */
+       struct llog_cookie      o_lcookie;      /* destroy: unlink cookie from
+                                                * MDS */
+       __u32                   o_uid_h;
+       __u32                   o_gid_h;
+
+       __u64                   o_data_version; /* getattr: sum of iversion for
+                                                * each stripe.
+                                                * brw: grant space consumed on
+                                                * the client for the write */
+       __u64                   o_padding_4;
+       __u64                   o_padding_5;
+       __u64                   o_padding_6;
+};
+
+#define o_dirty   o_blocks
+#define o_undirty o_mode
+#define o_dropped o_misc
+#define o_cksum   o_nlink
+#define o_grant_used o_data_version
+
+static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd,
+                                       struct obdo *wobdo, struct obdo *lobdo)
+{
+       memcpy(wobdo, lobdo, sizeof(*lobdo));
+       wobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+       if (ocd == NULL)
+               return;
+
+       if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+           fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) {
+               /* Currently OBD_FL_OSTID will only be used when a 2.4 echo
+                * client communicates with a pre-2.4 server */
+               wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid);
+               wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid);
+       }
+}
+
+static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd,
+                                       struct obdo *lobdo, struct obdo *wobdo)
+{
+       obd_flag local_flags = 0;
+
+       if (lobdo->o_valid & OBD_MD_FLFLAGS)
+                local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+       LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+
+       memcpy(lobdo, wobdo, sizeof(*lobdo));
+       if (local_flags != 0) {
+               lobdo->o_valid |= OBD_MD_FLFLAGS;
+               lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+               lobdo->o_flags |= local_flags;
+       }
+       if (ocd == NULL)
+               return;
+
+       if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) &&
+           fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) {
+               /* see above */
+               lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq;
+               lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id;
+               lobdo->o_oi.oi_fid.f_ver = 0;
+       }
+}
+
+extern void lustre_swab_obdo(struct obdo *o);
+
+/* request structure for OSTs */
+struct ost_body {
+       struct  obdo oa;
+};
+
+/* Key for FIEMAP to be used in get_info calls */
+struct ll_fiemap_info_key {
+       char    name[8];
+       struct  obdo oa;
+       struct  ll_user_fiemap fiemap;
+};
+
+extern void lustre_swab_ost_body(struct ost_body *b);
+extern void lustre_swab_ost_last_id(obd_id *id);
+extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
+
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                           int stripe_count);
+extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm);
+
+/* llog_swab.c */
+extern void lustre_swab_llogd_body(struct llogd_body *d);
+extern void lustre_swab_llog_hdr(struct llog_log_hdr *h);
+extern void lustre_swab_llogd_conn_body(struct llogd_conn_body *d);
+extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec);
+extern void lustre_swab_llog_id(struct llog_logid *lid);
+
+struct lustre_cfg;
+extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
+
+/* Functions for dumping PTLRPC fields */
+void dump_rniobuf(struct niobuf_remote *rnb);
+void dump_ioo(struct obd_ioobj *nb);
+void dump_obdo(struct obdo *oa);
+void dump_ost_body(struct ost_body *ob);
+void dump_rcs(__u32 *rc);
+
+#define IDX_INFO_MAGIC 0x3D37CC37
+
+/* Index file transfer through the network. The server serializes the index into
+ * a byte stream which is sent to the client via a bulk transfer */
+struct idx_info {
+       __u32           ii_magic;
+
+       /* reply: see idx_info_flags below */
+       __u32           ii_flags;
+
+       /* request & reply: number of lu_idxpage (to be) transferred */
+       __u16           ii_count;
+       __u16           ii_pad0;
+
+       /* request: requested attributes passed down to the iterator API */
+       __u32           ii_attrs;
+
+       /* request & reply: index file identifier (FID) */
+       struct lu_fid   ii_fid;
+
+       /* reply: version of the index file before starting to walk the index.
+        * Please note that the version can be modified at any time during the
+        * transfer */
+       __u64           ii_version;
+
+       /* request: hash to start with:
+        * reply: hash of the first entry of the first lu_idxpage and hash
+        *      of the entry to read next if any */
+       __u64           ii_hash_start;
+       __u64           ii_hash_end;
+
+       /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is
+        * set */
+       __u16           ii_keysize;
+
+       /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC
+        * is set */
+       __u16           ii_recsize;
+
+       __u32           ii_pad1;
+       __u64           ii_pad2;
+       __u64           ii_pad3;
+};
+extern void lustre_swab_idx_info(struct idx_info *ii);
+
+#define II_END_OFF     MDS_DIR_END_OFF /* all entries have been read */
+
+/* List of flags used in idx_info::ii_flags */
+enum idx_info_flags {
+       II_FL_NOHASH    = 1 << 0, /* client doesn't care about hash value */
+       II_FL_VARKEY    = 1 << 1, /* keys can be of variable size */
+       II_FL_VARREC    = 1 << 2, /* records can be of variable size */
+       II_FL_NONUNQ    = 1 << 3, /* index supports non-unique keys */
+};
+
+#define LIP_MAGIC 0x8A6D6B6C
+
+/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */
+struct lu_idxpage {
+       /* 16-byte header */
+       __u32   lip_magic;
+       __u16   lip_flags;
+       __u16   lip_nr;   /* number of entries in the container */
+       __u64   lip_pad0; /* additional padding for future use */
+
+       /* key/record pairs are stored in the remaining 4080 bytes.
+        * depending upon the flags in idx_info::ii_flags, each key/record
+        * pair might be preceded by:
+        * - a hash value
+        * - the key size (II_FL_VARKEY is set)
+        * - the record size (II_FL_VARREC is set)
+        *
+        * For the time being, we only support fixed-size key & record. */
+       char    lip_entries[0];
+};
+extern void lustre_swab_lip_header(struct lu_idxpage *lip);
+
+#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries))
+
+/* Gather all possible type associated with a 4KB container */
+union lu_page {
+       struct lu_dirpage       lp_dir; /* for MDS_READPAGE */
+       struct lu_idxpage       lp_idx; /* for OBD_IDX_READ */
+       char                    lp_array[LU_PAGE_SIZE];
+};
+
+/* security opcodes */
+typedef enum {
+       SEC_CTX_INIT        = 801,
+       SEC_CTX_INIT_CONT       = 802,
+       SEC_CTX_FINI        = 803,
+       SEC_LAST_OPC,
+       SEC_FIRST_OPC      = SEC_CTX_INIT
+} sec_cmd_t;
+
+/*
+ * capa related definitions
+ */
+#define CAPA_HMAC_MAX_LEN       64
+#define CAPA_HMAC_KEY_MAX_LEN   56
+
+/* NB take care when changing the sequence of elements in this struct,
+ * because the offset info is used in find_capa() */
+struct lustre_capa {
+       struct lu_fid   lc_fid;  /** fid */
+       __u64      lc_opc;       /** operations allowed */
+       __u64      lc_uid;       /** file owner */
+       __u64      lc_gid;       /** file group */
+       __u32      lc_flags;       /** HMAC algorithm & flags */
+       __u32      lc_keyid;       /** key# used for the capability */
+       __u32      lc_timeout;     /** capa timeout value (sec) */
+       __u32      lc_expiry;      /** expiry time (sec) */
+       __u8        lc_hmac[CAPA_HMAC_MAX_LEN];   /** HMAC */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa(struct lustre_capa *c);
+
+/** lustre_capa::lc_opc */
+enum {
+       CAPA_OPC_BODY_WRITE   = 1<<0,  /**< write object data */
+       CAPA_OPC_BODY_READ    = 1<<1,  /**< read object data */
+       CAPA_OPC_INDEX_LOOKUP = 1<<2,  /**< lookup object fid */
+       CAPA_OPC_INDEX_INSERT = 1<<3,  /**< insert object fid */
+       CAPA_OPC_INDEX_DELETE = 1<<4,  /**< delete object fid */
+       CAPA_OPC_OSS_WRITE    = 1<<5,  /**< write oss object data */
+       CAPA_OPC_OSS_READ     = 1<<6,  /**< read oss object data */
+       CAPA_OPC_OSS_TRUNC    = 1<<7,  /**< truncate oss object */
+       CAPA_OPC_OSS_DESTROY  = 1<<8,  /**< destroy oss object */
+       CAPA_OPC_META_WRITE   = 1<<9,  /**< write object meta data */
+       CAPA_OPC_META_READ    = 1<<10, /**< read object meta data */
+};
+
+#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE)
+#define CAPA_OPC_MDS_ONLY                                                 \
+       (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \
+        CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE)
+#define CAPA_OPC_OSS_ONLY                                                 \
+       (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC |      \
+        CAPA_OPC_OSS_DESTROY)
+#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY
+#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY)
+
+/* MDS capability covers object capability for operations of body r/w
+ * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w,
+ * while OSS capability only covers object capability for operations of
+ * OSS data (file content) r/w/truncate.
+ */
+static inline int capa_for_mds(struct lustre_capa *c)
+{
+       return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0;
+}
+
+static inline int capa_for_oss(struct lustre_capa *c)
+{
+       return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0;
+}
+
+/* HMAC algorithm, stored in the top byte of lustre_capa::lc_flags */
+enum {
+       CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */
+       CAPA_HMAC_ALG_MAX,
+};
+
+#define CAPA_FL_MASK       0x00ffffff
+#define CAPA_HMAC_ALG_MASK      0xff000000
+
+struct lustre_capa_key {
+       __u64   lk_seq;       /**< mds# */
+       __u32   lk_keyid;     /**< key# */
+       __u32   lk_padding;
+       __u8    lk_key[CAPA_HMAC_KEY_MAX_LEN];    /**< key */
+} __attribute__((packed));
+
+extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
+
+/** The link ea holds 1 \a link_ea_entry for each hardlink */
+#define LINK_EA_MAGIC 0x11EAF1DFUL
+struct link_ea_header {
+       __u32 leh_magic;
+       __u32 leh_reccount;
+       __u64 leh_len;      /* total size */
+       /* future use */
+       __u32 padding1;
+       __u32 padding2;
+};
+
+/** Hardlink data is name and parent fid.
+ * Stored in this crazy struct for maximum packing and endian-neutrality
+ */
+struct link_ea_entry {
+       /** __u16 stored big-endian, unaligned */
+       unsigned char      lee_reclen[2];
+       unsigned char      lee_parent_fid[sizeof(struct lu_fid)];
+       char           lee_name[0];
+} __attribute__((packed));
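+
+/*
+ * Illustrative sketch (not part of the original patch): since lee_reclen is
+ * stored big-endian and unaligned, readers reassemble it byte by byte
+ * instead of casting to a __u16:
+ *
+ *	reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
+ */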
+
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+       struct lu_fid   gf_fid;
+       __u64      gf_recno;
+       __u32      gf_linkno;
+       __u32      gf_pathlen;
+       char        gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf);
+
+enum {
+       LAYOUT_INTENT_ACCESS    = 0,
+       LAYOUT_INTENT_READ      = 1,
+       LAYOUT_INTENT_WRITE     = 2,
+       LAYOUT_INTENT_GLIMPSE   = 3,
+       LAYOUT_INTENT_TRUNC     = 4,
+       LAYOUT_INTENT_RELEASE   = 5,
+       LAYOUT_INTENT_RESTORE   = 6
+};
+
+/* enqueue layout lock with intent */
+struct layout_intent {
+       __u32 li_opc; /* intent operation for enqueue, read, write etc */
+       __u32 li_flags;
+       __u64 li_start;
+       __u64 li_end;
+};
+
+void lustre_swab_layout_intent(struct layout_intent *li);
+
+/**
+ * On the wire version of hsm_progress structure.
+ *
+ * Contains the userspace hsm_progress and some internal fields.
+ */
+struct hsm_progress_kernel {
+       /* Field taken from struct hsm_progress */
+       lustre_fid              hpk_fid;
+       __u64                   hpk_cookie;
+       struct hsm_extent       hpk_extent;
+       __u16                   hpk_flags;
+       __u16                   hpk_errval; /* positive val */
+       __u32                   hpk_padding1;
+       /* Additional fields */
+       __u64                   hpk_data_version;
+       __u64                   hpk_padding2;
+} __attribute__((packed));
+
+extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus);
+extern void lustre_swab_hsm_current_action(struct hsm_current_action *action);
+extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk);
+extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui);
+extern void lustre_swab_hsm_request(struct hsm_request *hr);
+
+/**
+ * These are object update opcode under UPDATE_OBJ, which is currently
+ * being used by cross-ref operations between MDT.
+ *
+ * During the cross-ref operation, the Master MDT, to which the client sends
+ * the request, will disassemble the operation into object updates, then OSP
+ * will send these updates to the remote MDT to be executed.
+ *
+ *   Update request format
+ *   magic:  UPDATE_BUFFER_MAGIC_V1
+ *   count:  how many updates are in the request.
+ *   bufs[0]: the following are packed object updates.
+ *   update[0]:
+ *             type: object_update_op, the op code of the update
+ *             fid: the object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   update[1]:
+ *             type: object_update_op, the op code of the update
+ *             fid: the object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   ..........
+ *   update[7]:        type: object_update_op, the op code of the update
+ *             fid: the object fid of the update.
+ *             lens/bufs: other parameters of the update.
+ *   Currently at most 8 updates are allowed per object update request.
+ *
+ *******************************************************************
+ *   update reply format:
+ *
+ *   ur_version: UPDATE_REPLY_V1
+ *   ur_count:   The count of the reply, which is usually equal
+ *              to the number of updates in the request.
+ *   ur_lens:    The reply lengths of each object update.
+ *
+ *   replies:    1st update reply  [4bytes_ret: other body]
+ *              2nd update reply  [4bytes_ret: other body]
+ *              .....
+ *              nth update reply  [4bytes_ret: other body]
+ *
+ *   For each update reply, the format is
+ *      result (4 bytes) : other body
+ */
+
+#define UPDATE_MAX_OPS         10
+#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001
+#define UPDATE_BUFFER_MAGIC    UPDATE_BUFFER_MAGIC_V1
+#define UPDATE_BUF_COUNT       8
+enum object_update_op {
+       OBJ_CREATE              = 1,
+       OBJ_DESTROY             = 2,
+       OBJ_REF_ADD             = 3,
+       OBJ_REF_DEL             = 4,
+       OBJ_ATTR_SET            = 5,
+       OBJ_ATTR_GET            = 6,
+       OBJ_XATTR_SET           = 7,
+       OBJ_XATTR_GET           = 8,
+       OBJ_INDEX_LOOKUP        = 9,
+       OBJ_INDEX_INSERT        = 10,
+       OBJ_INDEX_DELETE        = 11,
+       OBJ_LAST
+};
+
+struct update {
+       __u32           u_type;
+       __u32           u_batchid;
+       struct lu_fid   u_fid;
+       __u32           u_lens[UPDATE_BUF_COUNT];
+       __u32           u_bufs[0];
+};
+
+struct update_buf {
+       __u32   ub_magic;
+       __u32   ub_count;
+       __u32   ub_bufs[0];
+};
+
+#define UPDATE_REPLY_V1                0x00BD0001
+struct update_reply {
+       __u32   ur_version;
+       __u32   ur_count;
+       __u32   ur_lens[0];
+};
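+
+/*
+ * Illustrative sketch (an assumption, not from the original patch): with the
+ * reply format described above, the body of reply 'idx' follows the fixed
+ * header, the ur_lens[] array and the bodies of all earlier replies:
+ *
+ *	ptr = (char *)ur + sizeof(*ur) + ur->ur_count * sizeof(__u32);
+ *	for (i = 0; i < idx; i++)
+ *		ptr += ur->ur_lens[i];
+ */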
+
+void lustre_swab_update_buf(struct update_buf *ub);
+void lustre_swab_update_reply_buf(struct update_reply *ur);
+
+/** layout swap request structure
+ * fid1 and fid2 are in mdt_body
+ */
+struct mdc_swap_layouts {
+       __u64      msl_flags;
+} __packed;
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
+
+#endif
+/** @} lustreidl */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h
new file mode 100644 (file)
index 0000000..1c87a61
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre/lustre_lfsck_user.h
+ *
+ * Lustre LFSCK userspace interfaces.
+ *
+ * Author: Fan Yong <yong.fan@whamcloud.com>
+ */
+
+#ifndef _LUSTRE_LFSCK_USER_H
+# define _LUSTRE_LFSCK_USER_H
+
+enum lfsck_param_flags {
+       /* Reset LFSCK iterator position to the device beginning. */
+       LPF_RESET       = 0x0001,
+
+       /* Exit when fail. */
+       LPF_FAILOUT     = 0x0002,
+
+       /* Dryrun mode, only check without modification */
+       LPF_DRYRUN      = 0x0004,
+};
+
+enum lfsck_type {
+       /* For MDT-OST consistency check/repair. */
+       LT_LAYOUT       = 0x0001,
+
+       /* For MDT-MDT consistency check/repair. */
+       LT_DNE          = 0x0002,
+
+       /* For FID-in-dirent and linkEA consistency check/repair. */
+       LT_NAMESPACE    = 0x0004,
+};
+
+#define LFSCK_VERSION_V1       1
+#define LFSCK_VERSION_V2       2
+
+#define LFSCK_TYPES_ALL                ((__u16)(~0))
+#define LFSCK_TYPES_DEF                ((__u16)0)
+#define LFSCK_TYPES_SUPPORTED  LT_NAMESPACE
+
+#define LFSCK_SPEED_NO_LIMIT   0
+#define LFSCK_SPEED_LIMIT_DEF  LFSCK_SPEED_NO_LIMIT
+
+enum lfsck_start_valid {
+       LSV_SPEED_LIMIT         = 0x00000001,
+       LSV_ERROR_HANDLE        = 0x00000002,
+       LSV_DRYRUN              = 0x00000004,
+};
+
+/* Arguments for starting lfsck. */
+struct lfsck_start {
+       /* Which arguments are valid, see 'enum lfsck_start_valid'. */
+       __u32   ls_valid;
+
+       /* How many items can be scanned at most per second. */
+       __u32   ls_speed_limit;
+
+       /* For compatibility between user space tools and kernel service. */
+       __u16   ls_version;
+
+       /* Which LFSCK components to be (have been) started. */
+       __u16   ls_active;
+
+       /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */
+       __u16   ls_flags;
+
+       /* For 64-bits aligned. */
+       __u16   ls_padding;
+};
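+
+/*
+ * Usage sketch (illustrative, not part of the original patch): starting a
+ * rate-limited namespace LFSCK would fill the struct like
+ *
+ *	struct lfsck_start ls = {
+ *		.ls_version     = LFSCK_VERSION_V1,
+ *		.ls_active      = LT_NAMESPACE,
+ *		.ls_valid       = LSV_SPEED_LIMIT,
+ *		.ls_speed_limit = 1000,
+ *	};
+ *
+ * marking in ls_valid only the optional fields that were actually set.
+ */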
+
+#endif /* _LUSTRE_LFSCK_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h
new file mode 100644 (file)
index 0000000..7e9f575
--- /dev/null
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre/lustre_user.h
+ *
+ * Lustre public user-space interface definitions.
+ */
+
+#ifndef _LUSTRE_USER_H
+#define _LUSTRE_USER_H
+
+/** \defgroup lustreuser lustreuser
+ *
+ * @{
+ */
+
+#include <lustre/ll_fiemap.h>
+#include <linux/lustre_user.h>
+
+/* for statfs() */
+#define LL_SUPER_MAGIC 0x0BD00BD0
+
+#ifndef FSFILT_IOC_GETFLAGS
+#define FSFILT_IOC_GETFLAGS           _IOR('f', 1, long)
+#define FSFILT_IOC_SETFLAGS           _IOW('f', 2, long)
+#define FSFILT_IOC_GETVERSION       _IOR('f', 3, long)
+#define FSFILT_IOC_SETVERSION       _IOW('f', 4, long)
+#define FSFILT_IOC_GETVERSION_OLD       _IOR('v', 1, long)
+#define FSFILT_IOC_SETVERSION_OLD       _IOW('v', 2, long)
+#define FSFILT_IOC_FIEMAP               _IOWR('f', 11, struct ll_user_fiemap)
+#endif
+
+/* FIEMAP flags supported by Lustre */
+#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER)
+
+enum obd_statfs_state {
+       OS_STATE_DEGRADED       = 0x00000001, /**< RAID degraded/rebuilding */
+       OS_STATE_READONLY       = 0x00000002, /**< filesystem is read-only */
+       OS_STATE_RDONLY_1       = 0x00000004, /**< obsolete 1.6, was EROFS=30 */
+       OS_STATE_RDONLY_2       = 0x00000008, /**< obsolete 1.6, was EROFS=30 */
+       OS_STATE_RDONLY_3       = 0x00000010, /**< obsolete 1.6, was EROFS=30 */
+};
+
+struct obd_statfs {
+       __u64      os_type;
+       __u64      os_blocks;
+       __u64      os_bfree;
+       __u64      os_bavail;
+       __u64      os_files;
+       __u64      os_ffree;
+       __u8        os_fsid[40];
+       __u32      os_bsize;
+       __u32      os_namelen;
+       __u64      os_maxbytes;
+       __u32      os_state;       /**< obd_statfs_state OS_STATE_* flag */
+       __u32      os_fprecreated;      /* objs available now to the caller */
+                                       /* used in QoS code to find preferred
+                                        * OSTs */
+       __u32      os_spare2;
+       __u32      os_spare3;
+       __u32      os_spare4;
+       __u32      os_spare5;
+       __u32      os_spare6;
+       __u32      os_spare7;
+       __u32      os_spare8;
+       __u32      os_spare9;
+};
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ **/
+struct lu_fid {
+       /**
+       * FID sequence. Sequence is a unit of migration: all files (objects)
+       * with FIDs from a given sequence are stored on the same server.
+       * Lustre should support 2^64 objects, so even if each sequence
+       * has only a single object we can still enumerate 2^64 objects.
+       **/
+       __u64 f_seq;
+       /* FID number within sequence. */
+       __u32 f_oid;
+       /**
+        * FID version, used to distinguish different versions (in the sense
+        * of snapshots, etc.) of the same file system object. Not currently
+        * used.
+        **/
+       __u32 f_ver;
+};
+
+struct filter_fid {
+       struct lu_fid   ff_parent;  /* ff_parent.f_ver == file stripe number */
+};
+
+/* keep this one for compatibility */
+struct filter_fid_old {
+       struct lu_fid   ff_parent;
+       __u64           ff_objid;
+       __u64           ff_seq;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+ * to print or parse them.  Other functions (e.g. compare, swab) could be moved
+ * here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/**
+ * Following struct for object attributes, that will be kept inode's EA.
+ * Introduced in 2.0 release (please see b15993, for details)
+ * Added to all objects since Lustre 2.4 as contains self FID
+ */
+struct lustre_mdt_attrs {
+       /**
+        * Bitfield for supported data in this structure. From enum lma_compat.
+        * lma_self_fid and lma_flags are always available.
+        */
+       __u32   lma_compat;
+       /**
+        * Per-file incompat feature list. Lustre version should support all
+        * flags set in this field. The supported feature mask is available in
+        * LMA_INCOMPAT_SUPP.
+        */
+       __u32   lma_incompat;
+       /** FID of this inode */
+       struct lu_fid  lma_self_fid;
+};
+
+/**
+ * Prior to 2.4, the LMA structure also included SOM attributes, which have
+ * since been moved to a dedicated xattr.
+ * lma_flags was also removed in favour of the lma_compat/incompat fields.
+ */
+#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+
+/**
+ * OST object IDentifier.
+ */
+struct ost_id {
+       union {
+               struct ostid {
+                       __u64   oi_id;
+                       __u64   oi_seq;
+               } oi;
+               struct lu_fid oi_fid;
+       };
+};
+
+#define DOSTID LPX64":"LPU64
+#define POSTID(oi) ostid_seq(oi), ostid_id(oi)
+
+/*
+ * The ioctl naming rules:
+ * LL_*     - works on the currently opened filehandle instead of parent dir
+ * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
+ * *_MDC_*  - gets/sets data related to MDC
+ * *_LOV_*  - gets/sets data related to OSC/LOV
+ * *FILE*   - called on parent dir and passes in a filename
+ * *STRIPE* - set/get lov_user_md
+ * *INFO    - set/get lov_user_mds_data
+ */
+/* see <lustre_lib.h> for ioctl numbers 101-150 */
+#define LL_IOC_GETFLAGS                 _IOR ('f', 151, long)
+#define LL_IOC_SETFLAGS                 _IOW ('f', 152, long)
+#define LL_IOC_CLRFLAGS                 _IOW ('f', 153, long)
+/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */
+#define LL_IOC_LOV_SETSTRIPE       _IOW ('f', 154, long)
+/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */
+#define LL_IOC_LOV_GETSTRIPE       _IOW ('f', 155, long)
+/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */
+#define LL_IOC_LOV_SETEA               _IOW ('f', 156, long)
+#define LL_IOC_RECREATE_OBJ         _IOW ('f', 157, long)
+#define LL_IOC_RECREATE_FID         _IOW ('f', 157, struct lu_fid)
+#define LL_IOC_GROUP_LOCK             _IOW ('f', 158, long)
+#define LL_IOC_GROUP_UNLOCK         _IOW ('f', 159, long)
+/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */
+#define LL_IOC_QUOTACHECK             _IOW ('f', 160, int)
+/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */
+#define LL_IOC_POLL_QUOTACHECK   _IOR ('f', 161, struct if_quotacheck *)
+/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */
+#define LL_IOC_QUOTACTL                 _IOWR('f', 162, struct if_quotactl)
+#define IOC_OBD_STATFS           _IOWR('f', 164, struct obd_statfs *)
+#define IOC_LOV_GETINFO                 _IOWR('f', 165, struct lov_user_mds_data *)
+#define LL_IOC_FLUSHCTX                 _IOW ('f', 166, long)
+#define LL_IOC_RMTACL             _IOW ('f', 167, long)
+#define LL_IOC_GETOBDCOUNT           _IOR ('f', 168, long)
+#define LL_IOC_LLOOP_ATTACH         _IOWR('f', 169, long)
+#define LL_IOC_LLOOP_DETACH         _IOWR('f', 170, long)
+#define LL_IOC_LLOOP_INFO             _IOWR('f', 171, struct lu_fid)
+#define LL_IOC_LLOOP_DETACH_BYDEV       _IOWR('f', 172, long)
+#define LL_IOC_PATH2FID                 _IOR ('f', 173, long)
+#define LL_IOC_GET_CONNECT_FLAGS       _IOWR('f', 174, __u64 *)
+#define LL_IOC_GET_MDTIDX             _IOR ('f', 175, int)
+
+/* see <lustre_lib.h> for ioctl numbers 177-210 */
+
+#define LL_IOC_HSM_STATE_GET           _IOR('f', 211, struct hsm_user_state)
+#define LL_IOC_HSM_STATE_SET           _IOW('f', 212, struct hsm_state_set)
+#define LL_IOC_HSM_CT_START            _IOW('f', 213, struct lustre_kernelcomm)
+#define LL_IOC_HSM_COPY_START          _IOW('f', 214, struct hsm_copy *)
+#define LL_IOC_HSM_COPY_END            _IOW('f', 215, struct hsm_copy *)
+#define LL_IOC_HSM_PROGRESS            _IOW('f', 216, struct hsm_user_request)
+#define LL_IOC_HSM_REQUEST             _IOW('f', 217, struct hsm_user_request)
+#define LL_IOC_DATA_VERSION            _IOR('f', 218, struct ioc_data_version)
+#define LL_IOC_LOV_SWAP_LAYOUTS                _IOW('f', 219, \
+                                               struct lustre_swap_layouts)
+#define LL_IOC_HSM_ACTION              _IOR('f', 220, \
+                                               struct hsm_current_action)
+/* see <lustre_lib.h> for ioctl numbers 221-232 */
+
+#define LL_IOC_LMV_SETSTRIPE       _IOWR('f', 240, struct lmv_user_md)
+#define LL_IOC_LMV_GETSTRIPE       _IOWR('f', 241, struct lmv_user_md)
+#define LL_IOC_REMOVE_ENTRY        _IOWR('f', 242, __u64)
+
+#define LL_STATFS_LMV     1
+#define LL_STATFS_LOV     2
+#define LL_STATFS_NODELAY      4
+
+#define IOC_MDC_TYPE       'i'
+#define IOC_MDC_LOOKUP   _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_GETFILESTRIPE   _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *)
+#define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *)
+#define LL_IOC_MDC_GETINFO      _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *)
+
+/* Keep these for backward compatibility. */
+#define LL_IOC_OBD_STATFS       IOC_OBD_STATFS
+#define IOC_MDC_GETSTRIPE       IOC_MDC_GETFILESTRIPE
+
+
+#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */
+
+/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags.
+ * Previously it was defined as 0100000000, which conflicts with FMODE_NONOTIFY
+ * (added in kernel 2.6.36), so we redefine it as 020000000.
+ * To stay compatible with statically linked binaries built against the old
+ * value, it is finally defined as (020000000 | 0100000000) = 0120000000.
+ */
+#define O_LOV_DELAY_CREATE      0120000000
+
+#define LL_FILE_IGNORE_LOCK     0x00000001
+#define LL_FILE_GROUP_LOCKED    0x00000002
+#define LL_FILE_READAHEA       0x00000004
+#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */
+#define LL_FILE_LOCKLESS_IO     0x00000010 /* server-side locks with cio */
+#define LL_FILE_RMTACL   0x00000020
+
+#define LOV_USER_MAGIC_V1 0x0BD10BD0
+#define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
+
+#define LMV_MAGIC_V1      0x0CD10CD0    /*normal stripe lmv magic */
+#define LMV_USER_MAGIC    0x0CD20CD0    /*default lmv magic*/
+
+#define LOV_PATTERN_RAID0 0x001
+#define LOV_PATTERN_RAID1 0x002
+#define LOV_PATTERN_FIRST 0x100
+
+#define LOV_MAXPOOLNAME 16
+#define LOV_POOLNAMEF "%.16s"
+
+#define LOV_MIN_STRIPE_BITS 16   /* maximum PAGE_SIZE (ia64), power of 2 */
+#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
+#define LOV_MAX_STRIPE_COUNT_OLD 160
+/* This calculation is crafted so that an input of 4096 results in 160,
+ * which in turn equals the old maximal stripe count.
+ * XXX: In fact this is too simplified for now; what it also needs is an
+ * ea_type argument to clearly know how much space each stripe consumes.
+ *
+ * The limit of 12 pages is somewhat arbitrary, but is a reasonably large
+ * allocation that is sufficient for the current generation of systems.
+ *
+ * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */
+#define LOV_MAX_STRIPE_COUNT 2000  /* ((12 * 4096 - 256) / 24) */
+#define LOV_ALL_STRIPES       0xffff /* only valid for directories */
+#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */
+
+#define lov_user_ost_data lov_user_ost_data_v1
+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
+       struct ost_id l_ost_oi;   /* OST object ID */
+       __u32 l_ost_gen;          /* generation of this OST index */
+       __u32 l_ost_idx;          /* OST index in LOV */
+} __attribute__((packed));
+
+#define lov_user_md lov_user_md_v1
+struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id lmm_oi;     /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       union {
+               __u16 lmm_stripe_offset;  /* starting stripe offset in
+                                          * lmm_objects, use when writing */
+               __u16 lmm_layout_gen;     /* layout generation number
+                                          * used when reading */
+       };
+       struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed,  __may_alias__));
+
+struct lov_user_md_v3 {           /* LOV EA user data (host-endian) */
+       __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V3 */
+       __u32 lmm_pattern;      /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+       struct ost_id lmm_oi;     /* LOV object ID */
+       __u32 lmm_stripe_size;    /* size of stripe in bytes */
+       __u16 lmm_stripe_count;   /* num stripes in use for this object */
+       union {
+               __u16 lmm_stripe_offset;  /* starting stripe offset in
+                                          * lmm_objects, use when writing */
+               __u16 lmm_layout_gen;     /* layout generation number
+                                          * used when reading */
+       };
+       char  lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+       struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
+/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
+ * use this.  It is unsafe to #define those values in this header as it
+ * is possible the application has already #included <sys/stat.h>. */
+#ifdef HAVE_LOV_USER_MDS_DATA
+#define lov_user_mds_data lov_user_mds_data_v1
+struct lov_user_mds_data_v1 {
+       lstat_t lmd_st;          /* MDS stat struct */
+       struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+       lstat_t lmd_st;          /* MDS stat struct */
+       struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
+} __attribute__((packed));
+#endif
+
+/* keep this to be the same size as lov_user_ost_data_v1 */
+struct lmv_user_mds_data {
+       struct lu_fid   lum_fid;
+       __u32           lum_padding;
+       __u32           lum_mds;
+};
+
+/* lum_type */
+enum {
+       LMV_STRIPE_TYPE = 0,
+       LMV_DEFAULT_TYPE = 1,
+};
+
+#define lmv_user_md lmv_user_md_v1
+struct lmv_user_md_v1 {
+       __u32   lum_magic;       /* must be the first field */
+       __u32   lum_stripe_count;  /* dirstripe count */
+       __u32   lum_stripe_offset; /* MDT idx for default dirstripe */
+       __u32   lum_hash_type;     /* Dir stripe policy */
+       __u32   lum_type;         /* LMV type: default or normal */
+       __u32   lum_padding1;
+       __u32   lum_padding2;
+       __u32   lum_padding3;
+       char    lum_pool_name[LOV_MAXPOOLNAME];
+       struct  lmv_user_mds_data  lum_objects[0];
+};
+
+static inline int lmv_user_md_size(int stripes, int lmm_magic)
+{
+       return sizeof(struct lmv_user_md) +
+                     stripes * sizeof(struct lmv_user_mds_data);
+}
+
+extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
+
+struct ll_recreate_obj {
+       __u64 lrc_id;
+       __u32 lrc_ost_idx;
+};
+
+struct ll_fid {
+       __u64 id;        /* holds object id */
+       __u32 generation; /* holds object generation */
+       __u32 f_type;     /* holds object type or stripe idx when passing it to
+                          * OST for saving into EA. */
+};
+
+#define UUID_MAX       40
+struct obd_uuid {
+       char uuid[UUID_MAX];
+};
+
+static inline int obd_uuid_equals(const struct obd_uuid *u1,
+                                 const struct obd_uuid *u2)
+{
+       return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0;
+}
+
+static inline int obd_uuid_empty(struct obd_uuid *uuid)
+{
+       return uuid->uuid[0] == '\0';
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
+{
+       strncpy((char *)uuid->uuid, tmp, sizeof(*uuid));
+       uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
+/* For printf's only, make sure uuid is terminated */
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
+{
+       if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
+               /* Obviously not safe, but for printfs, no real harm done...
+                  we're always null-terminated, even in a race. */
+               static char temp[sizeof(*uuid)];
+               memcpy(temp, uuid->uuid, sizeof(*uuid) - 1);
+               temp[sizeof(*uuid) - 1] = '\0';
+               return temp;
+       }
+       return (char *)(uuid->uuid);
+}
+
+/* Extract fsname from uuid (or target name) of a target
+   e.g. (myfs-OST0007_UUID -> myfs)
+   see also deuuidify. */
+static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen)
+{
+       char *p;
+
+       strncpy(buf, uuid, buflen - 1);
+       buf[buflen - 1] = '\0';
+       p = strrchr(buf, '-');
+       if (p)
+               *p = '\0';
+}
+
+/* printf display format
+   e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID_NOBRACE LPX64":0x%x:0x%x"
+#define DFID "["DFID_NOBRACE"]"
+#define PFID(fid)     \
+       (fid)->f_seq, \
+       (fid)->f_oid, \
+       (fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+   e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX""
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *'
+liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *'
+*/
+#define SFID "0x"LPX64i":0x%x:0x%x"
+#define RFID(fid)     \
+       &((fid)->f_seq), \
+       &((fid)->f_oid), \
+       &((fid)->f_ver)
+
+
+/********* Quotas **********/
+
+/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
+#define LUSTRE_Q_QUOTAON    0x800002     /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF   0x800003     /* turn quotas off */
+#define LUSTRE_Q_GETINFO    0x800005     /* get information about quota files */
+#define LUSTRE_Q_SETINFO    0x800006     /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA   0x800007     /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA   0x800008     /* set user quota structure */
+/* lustre-specific control commands */
+#define LUSTRE_Q_INVALIDATE  0x80000b     /* invalidate quota data */
+#define LUSTRE_Q_FINVALIDATE 0x80000c     /* invalidate filter quota data */
+
+#define UGQUOTA 2       /* set both USRQUOTA and GRPQUOTA */
+
+struct if_quotacheck {
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
+
+#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629
+
+/* permission */
+#define N_PERMS_MAX      64
+
+struct perm_downcall_data {
+       __u64 pdd_nid;
+       __u32 pdd_perm;
+       __u32 pdd_padding;
+};
+
+struct identity_downcall_data {
+       __u32                       idd_magic;
+       __u32                       idd_err;
+       __u32                       idd_uid;
+       __u32                       idd_gid;
+       __u32                       idd_nperms;
+       __u32                       idd_ngroups;
+       struct perm_downcall_data idd_perms[N_PERMS_MAX];
+       __u32                       idd_groups[0];
+};
+
+/* for non-mapped uid/gid */
+#define NOBODY_UID      99
+#define NOBODY_GID      99
+
+#define INVALID_ID      (-1)
+
+enum {
+       RMT_LSETFACL    = 1,
+       RMT_LGETFACL    = 2,
+       RMT_RSETFACL    = 3,
+       RMT_RGETFACL    = 4
+};
+
+#ifdef NEED_QUOTA_DEFS
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS     1
+#define QIF_SPACE       2
+#define QIF_ILIMITS     4
+#define QIF_INODES      8
+#define QIF_BTIME       16
+#define QIF_ITIME       32
+#define QIF_LIMITS      (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE       (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES       (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL         (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif
+
+#endif /* NEED_QUOTA_DEFS */
+
+/* lustre volatile file support
+ * file name header: ".^L^S^T^R:VOLATILE"
+ */
+#define LUSTRE_VOLATILE_HDR    ".\x0c\x13\x14\x12:VOLATILE"
+#define LUSTRE_VOLATILE_HDR_LEN        14
+/* hdr + MDT index */
+#define LUSTRE_VOLATILE_IDX    LUSTRE_VOLATILE_HDR":%.4X:"
+
+typedef enum lustre_quota_version {
+       LUSTRE_QUOTA_V2 = 1
+} lustre_quota_version_t;
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
+       __u64 dqi_bgrace;
+       __u64 dqi_igrace;
+       __u32 dqi_flags;
+       __u32 dqi_valid;
+};
+
+/* XXX: same as if_dqblk struct in kernel, plus one padding */
+struct obd_dqblk {
+       __u64 dqb_bhardlimit;
+       __u64 dqb_bsoftlimit;
+       __u64 dqb_curspace;
+       __u64 dqb_ihardlimit;
+       __u64 dqb_isoftlimit;
+       __u64 dqb_curinodes;
+       __u64 dqb_btime;
+       __u64 dqb_itime;
+       __u32 dqb_valid;
+       __u32 dqb_padding;
+};
+
+enum {
+       QC_GENERAL      = 0,
+       QC_MDTIDX       = 1,
+       QC_OSTIDX       = 2,
+       QC_UUID  = 3
+};
+
+struct if_quotactl {
+       __u32              qc_cmd;
+       __u32              qc_type;
+       __u32              qc_id;
+       __u32              qc_stat;
+       __u32              qc_valid;
+       __u32              qc_idx;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
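+
+/*
+ * Usage sketch (illustrative, not part of the original patch): querying a
+ * user's quota through the LL_IOC_QUOTACTL ioctl defined earlier, assuming
+ * USRQUOTA from <sys/quota.h>:
+ *
+ *	struct if_quotactl qctl = {
+ *		.qc_cmd  = LUSTRE_Q_GETQUOTA,
+ *		.qc_type = USRQUOTA,
+ *		.qc_id   = uid,
+ *	};
+ *	rc = ioctl(fd, LL_IOC_QUOTACTL, &qctl);
+ */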
+
+/* swap layout flags */
+#define        SWAP_LAYOUTS_CHECK_DV1          (1 << 0)
+#define        SWAP_LAYOUTS_CHECK_DV2          (1 << 1)
+#define        SWAP_LAYOUTS_KEEP_MTIME         (1 << 2)
+#define        SWAP_LAYOUTS_KEEP_ATIME         (1 << 3)
+struct lustre_swap_layouts {
+       __u64   sl_flags;
+       __u32   sl_fd;
+       __u32   sl_gid;
+       __u64   sl_dv1;
+       __u64   sl_dv2;
+};
+
+
+/********* Changelogs **********/
+/** Changelog record types */
+enum changelog_rec_type {
+       CL_MARK     = 0,
+       CL_CREATE   = 1,  /* namespace */
+       CL_MKDIR    = 2,  /* namespace */
+       CL_HARDLINK = 3,  /* namespace */
+       CL_SOFTLINK = 4,  /* namespace */
+       CL_MKNOD    = 5,  /* namespace */
+       CL_UNLINK   = 6,  /* namespace */
+       CL_RMDIR    = 7,  /* namespace */
+       CL_RENAME   = 8,  /* namespace */
+       CL_EXT      = 9,  /* namespace extended record (2nd half of rename) */
+       CL_OPEN     = 10, /* not currently used */
+       CL_CLOSE    = 11, /* may be written to log only with mtime change */
+       CL_LAYOUT   = 12, /* file layout/striping modified */
+       CL_TRUNC    = 13,
+       CL_SETATTR  = 14,
+       CL_XATTR    = 15,
+       CL_HSM      = 16, /* HSM specific events, see flags */
+       CL_MTIME    = 17, /* Precedence: setattr > mtime > ctime > atime */
+       CL_CTIME    = 18,
+       CL_ATIME    = 19,
+       CL_LAST
+};
+
+static inline const char *changelog_type2str(int type)
+{
+       static const char *changelog_str[] = {
+               "MARK",  "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
+               "RMDIR", "RENME", "RNMTO", "OPEN",  "CLOSE", "LYOUT", "TRUNC",
+               "SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME",
+       };
+
+       if (type >= 0 && type < CL_LAST)
+               return changelog_str[type];
+       return NULL;
+}
+
+/* per-record flags */
+#define CLF_VERSION     0x1000
+#define CLF_EXT_VERSION 0x2000
+#define CLF_FLAGSHIFT   12
+#define CLF_FLAGMASK    ((1U << CLF_FLAGSHIFT) - 1)
+#define CLF_VERMASK     (~CLF_FLAGMASK)
+/* Anything under the flagmask may be per-type (if desired) */
+/* Flags for unlink */
+#define CLF_UNLINK_LAST       0x0001 /* Unlink of last hardlink */
+#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */
+                                    /* HSM cleaning needed */
+/* Flags for rename */
+#define CLF_RENAME_LAST       0x0001 /* rename unlink last hardlink of target */
+
+/* Flags for HSM */
+/* 12b used (from high weight to low weight):
+ * 2b for flags
+ * 3b for event
+ * 7b for error code
+ */
+#define CLF_HSM_ERR_L  0 /* HSM return code, 7 bits */
+#define CLF_HSM_ERR_H  6
+#define CLF_HSM_EVENT_L      7 /* HSM event, 3 bits, see enum hsm_event */
+#define CLF_HSM_EVENT_H      9
+#define CLF_HSM_FLAG_L      10 /* HSM flags, 2 bits, 1 used, 1 spare */
+#define CLF_HSM_FLAG_H      11
+#define CLF_HSM_SPARE_L     12 /* 4 spare bits */
+#define CLF_HSM_SPARE_H     15
+#define CLF_HSM_LAST   15
+
+/* Remove bits higher than _h, then extract the value
+ * between _h and _l by shifting lower weight to bit 0. */
+#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \
+                                  >> (CLF_HSM_LAST - _h + _l))
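+
+/*
+ * Worked example: extracting the HSM event (bits 7..9) from a 16-bit flags
+ * word with CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L):
+ * flags is shifted left by CLF_HSM_LAST - 9 = 6 so bit 9 becomes bit 15,
+ * masked to 16 bits to drop anything higher, then shifted right by
+ * 6 + 7 = 13 so that bit 7 lands at bit 0, yielding the 3-bit event value.
+ */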
+
+#define CLF_HSM_SUCCESS      0x00
+#define CLF_HSM_MAXERROR     0x7E
+#define CLF_HSM_ERROVERFLOW  0x7F
+
+#define CLF_HSM_DIRTY  1 /* file is dirty after HSM request end */
+
+/* 3 bits field => 8 values allowed */
+enum hsm_event {
+       HE_ARCHIVE      = 0,
+       HE_RESTORE      = 1,
+       HE_CANCEL       = 2,
+       HE_RELEASE      = 3,
+       HE_REMOVE       = 4,
+       HE_STATE        = 5,
+       HE_SPARE1       = 6,
+       HE_SPARE2       = 7,
+};
+
+static inline enum hsm_event hsm_get_cl_event(__u16 flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L);
+}
+
+static inline void hsm_set_cl_event(int *flags, enum hsm_event he)
+{
+       *flags |= (he << CLF_HSM_EVENT_L);
+}
+
+static inline __u16 hsm_get_cl_flags(int flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L);
+}
+
+static inline void hsm_set_cl_flags(int *flags, int bits)
+{
+       *flags |= (bits << CLF_HSM_FLAG_L);
+}
+
+static inline int hsm_get_cl_error(int flags)
+{
+       return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L);
+}
+
+static inline void hsm_set_cl_error(int *flags, int error)
+{
+       *flags |= (error << CLF_HSM_ERR_L);
+}
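+
+/*
+ * Usage sketch (illustrative, not part of the original patch): composing
+ * the packed HSM changelog flags for a successful archive event:
+ *
+ *	int flags = 0;
+ *
+ *	hsm_set_cl_event(&flags, HE_ARCHIVE);
+ *	hsm_set_cl_error(&flags, CLF_HSM_SUCCESS);
+ */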
+
+#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec))
+
+struct changelog_rec {
+       __u16            cr_namelen;
+       __u16            cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */
+       __u32            cr_type;  /**< \a changelog_rec_type */
+       __u64            cr_index; /**< changelog record number */
+       __u64            cr_prev;  /**< last index for this target fid */
+       __u64            cr_time;
+       union {
+               lustre_fid    cr_tfid;  /**< target fid */
+               __u32    cr_markerflags; /**< CL_MARK flags */
+       };
+       lustre_fid          cr_pfid;    /**< parent fid */
+       char              cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+/* changelog_ext_rec is 2 * sizeof(lu_fid) bigger than changelog_rec. To save
+ * space, only rename uses changelog_ext_rec, while other record types use
+ * changelog_rec.
+ */
+struct changelog_ext_rec {
+       __u16                   cr_namelen;
+       __u16                   cr_flags; /**< (flags & CLF_FLAGMASK) |
+                                               CLF_EXT_VERSION */
+       __u32                   cr_type;  /**< \a changelog_rec_type */
+       __u64                   cr_index; /**< changelog record number */
+       __u64                   cr_prev;  /**< last index for this target fid */
+       __u64                   cr_time;
+       union {
+               lustre_fid      cr_tfid;        /**< target fid */
+               __u32           cr_markerflags; /**< CL_MARK flags */
+       };
+       lustre_fid              cr_pfid;        /**< target parent fid */
+       lustre_fid              cr_sfid;        /**< source fid, or zero */
+       lustre_fid              cr_spfid;       /**< source parent fid, or zero */
+       char                    cr_name[0];     /**< last element */
+} __attribute__((packed));
+
+#define CHANGELOG_REC_EXTENDED(rec) \
+       (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION)
+
+static inline int changelog_rec_size(struct changelog_rec *rec)
+{
+       return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec) :
+                                            sizeof(*rec);
+}
+
+static inline char *changelog_rec_name(struct changelog_rec *rec)
+{
+       return CHANGELOG_REC_EXTENDED(rec) ?
+               ((struct changelog_ext_rec *)rec)->cr_name : rec->cr_name;
+}
+
+static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec)
+{
+       return rec->cr_namelen - strlen(rec->cr_name) - 1;
+}
+
+static inline char *changelog_rec_sname(struct changelog_ext_rec *rec)
+{
+       return rec->cr_name + strlen(rec->cr_name) + 1;
+}
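+
+/*
+ * Illustrative sketch (deduced from the helpers above): for an extended
+ * (rename) record, cr_name holds the target name followed by a NUL and
+ * then the source name, all counted by cr_namelen:
+ *
+ *	tname = changelog_rec_name(rec);
+ *	sname = changelog_rec_sname((struct changelog_ext_rec *)rec);
+ */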
+
+struct ioc_changelog {
+       __u64 icc_recno;
+       __u32 icc_mdtindex;
+       __u32 icc_id;
+       __u32 icc_flags;
+};
+
+enum changelog_message_type {
+       CL_RECORD = 10, /* message is a changelog_rec */
+       CL_EOF    = 11, /* at end of current changelog */
+};
+
+/********* Misc **********/
+
+struct ioc_data_version {
+       __u64 idv_version;
+       __u64 idv_flags;     /* See LL_DV_xxx */
+};
+#define LL_DV_NOFLUSH 0x01   /* Do not take READ EXTENT LOCK before sampling
+                               version. Dirty caches are left unchanged. */
+
+#ifndef offsetof
+# define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define dot_lustre_name ".lustre"
+
+
+/********* HSM **********/
+
+/** HSM per-file state
+ * See HSM_FLAGS below.
+ */
+enum hsm_states {
+       HS_EXISTS       = 0x00000001,
+       HS_DIRTY        = 0x00000002,
+       HS_RELEASED     = 0x00000004,
+       HS_ARCHIVED     = 0x00000008,
+       HS_NORELEASE    = 0x00000010,
+       HS_NOARCHIVE    = 0x00000020,
+       HS_LOST         = 0x00000040,
+};
+
+/* HSM user-setable flags. */
+#define HSM_USER_MASK   (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY)
+
+/* Other HSM flags. */
+#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED)
+
+/*
+ * All HSM-related possible flags that could be applied to a file.
+ * This should be kept in sync with hsm_states.
+ */
+#define HSM_FLAGS_MASK  (HSM_USER_MASK | HSM_STATUS_MASK)
+
+/**
+ * HSM request progress state
+ */
+enum hsm_progress_states {
+       HPS_WAITING     = 1,
+       HPS_RUNNING     = 2,
+       HPS_DONE        = 3,
+};
+#define HPS_NONE       0
+
+static inline char *hsm_progress_state2name(enum hsm_progress_states s)
+{
+       switch  (s) {
+       case HPS_WAITING:       return "waiting";
+       case HPS_RUNNING:       return "running";
+       case HPS_DONE:          return "done";
+       default:                return "unknown";
+       }
+}
+
+struct hsm_extent {
+       __u64 offset;
+       __u64 length;
+} __attribute__((packed));
+
+/**
+ * Current HSM states of a Lustre file.
+ *
+ * The purpose of this structure is mainly to be sent to user space. It
+ * describes the current HSM flags and in-progress action.
+ */
+struct hsm_user_state {
+       /** Current HSM states, from enum hsm_states. */
+       __u32                   hus_states;
+       __u32                   hus_archive_id;
+       /**  The current undergoing action, if there is one */
+       __u32                   hus_in_progress_state;
+       __u32                   hus_in_progress_action;
+       struct hsm_extent       hus_in_progress_location;
+       char                    hus_extended_info[];
+};
+
+struct hsm_state_set_ioc {
+       struct lu_fid   hssi_fid;
+       __u64           hssi_setmask;
+       __u64           hssi_clearmask;
+};
+
+/*
+ * This structure describes the current in-progress action for a file.
+ * It is returned to user space and sent over the wire.
+ */
+struct hsm_current_action {
+       /**  The current undergoing action, if there is one */
+       /* state is one of hsm_progress_states */
+       __u32                   hca_state;
+       /* action is one of hsm_user_action */
+       __u32                   hca_action;
+       struct hsm_extent       hca_location;
+};
+
+/***** HSM user requests ******/
+/* User-generated (lfs/ioctl) request types */
+enum hsm_user_action {
+       HUA_NONE    =  1, /* no action (noop) */
+       HUA_ARCHIVE = 10, /* copy to hsm */
+       HUA_RESTORE = 11, /* prestage */
+       HUA_RELEASE = 12, /* drop ost objects */
+       HUA_REMOVE  = 13, /* remove from archive */
+       HUA_CANCEL  = 14  /* cancel a request */
+};
+
+static inline char *hsm_user_action2name(enum hsm_user_action  a)
+{
+       switch  (a) {
+       case HUA_NONE:    return "NOOP";
+       case HUA_ARCHIVE: return "ARCHIVE";
+       case HUA_RESTORE: return "RESTORE";
+       case HUA_RELEASE: return "RELEASE";
+       case HUA_REMOVE:  return "REMOVE";
+       case HUA_CANCEL:  return "CANCEL";
+       default:          return "UNKNOWN";
+       }
+}
+
+/*
+ * List of hr_flags (bit field)
+ */
+#define HSM_FORCE_ACTION 0x0001
+/* used by CT, cannot be set by user */
+#define HSM_GHOST_COPY   0x0002
+
+/**
+ * Contains the fixed part of struct hsm_user_request.
+ */
+struct hsm_request {
+       __u32 hr_action;        /* enum hsm_user_action */
+       __u32 hr_archive_id;    /* archive id, used only with HUA_ARCHIVE */
+       __u64 hr_flags;         /* request flags */
+       __u32 hr_itemcount;     /* item count in hur_user_item vector */
+       __u32 hr_data_len;
+};
+
+struct hsm_user_item {
+       lustre_fid      hui_fid;
+       struct hsm_extent hui_extent;
+} __attribute__((packed));
+
+struct hsm_user_request {
+       struct hsm_request      hur_request;
+       struct hsm_user_item    hur_user_item[0];
+       /* extra data blob at end of struct (after all
+        * hur_user_items), only use helpers to access it
+        */
+} __attribute__((packed));
+
+/** Return pointer to data field in a hsm user request */
+static inline void *hur_data(struct hsm_user_request *hur)
+{
+       return &(hur->hur_user_item[hur->hur_request.hr_itemcount]);
+}
+
+/** Compute the current length of the provided hsm_user_request. */
+static inline int hur_len(struct hsm_user_request *hur)
+{
+       return offsetof(struct hsm_user_request,
+                       hur_user_item[hur->hur_request.hr_itemcount]) +
+               hur->hur_request.hr_data_len;
+}
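+
+/*
+ * Usage sketch (illustrative, not part of the original patch): a caller
+ * sizes the request from the item count and opaque data length, then fills
+ * the trailing data through hur_data():
+ *
+ *	len = offsetof(struct hsm_user_request, hur_user_item[count]) +
+ *	      data_len;
+ *	hur = malloc(len);
+ *	hur->hur_request.hr_itemcount = count;
+ *	hur->hur_request.hr_data_len = data_len;
+ *	memcpy(hur_data(hur), data, data_len);
+ */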
+
+/****** HSM RPCs to copytool *****/
+/* Message types the copytool may receive */
+enum hsm_message_type {
+       HMT_ACTION_LIST = 100, /* message is a hsm_action_list */
+};
+
+/* Actions the copytool may be instructed to take for a given action_item */
+enum hsm_copytool_action {
+       HSMA_NONE    = 10, /* no action */
+       HSMA_ARCHIVE = 20, /* arbitrary offset */
+       HSMA_RESTORE = 21,
+       HSMA_REMOVE  = 22,
+       HSMA_CANCEL  = 23
+};
+
+static inline char *hsm_copytool_action2name(enum hsm_copytool_action  a)
+{
+       switch  (a) {
+       case HSMA_NONE:    return "NOOP";
+       case HSMA_ARCHIVE: return "ARCHIVE";
+       case HSMA_RESTORE: return "RESTORE";
+       case HSMA_REMOVE:  return "REMOVE";
+       case HSMA_CANCEL:  return "CANCEL";
+       default:           return "UNKNOWN";
+       }
+}
+
+/* Copytool item action description */
+struct hsm_action_item {
+       __u32      hai_len;     /* valid size of this struct */
+       __u32      hai_action;  /* hsm_copytool_action, but use known size */
+       lustre_fid hai_fid;     /* Lustre FID to operate on */
+       lustre_fid hai_dfid;    /* fid used for data access */
+       struct hsm_extent hai_extent;  /* byte range to operate on */
+       __u64      hai_cookie;  /* action cookie from coordinator */
+       __u64      hai_gid;     /* grouplock id */
+       char       hai_data[0]; /* variable length */
+} __attribute__((packed));
+
+/*
+ * Helper function which prints in hex the first bytes of the
+ * hai opaque data field.
+ * \param hai [IN] record to print
+ * \param buffer [OUT] output buffer
+ * \param len [IN] max buffer length
+ * \retval buffer
+ */
+static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+                                       char *buffer, int len)
+{
+       int i, sz, data_len;
+       char *ptr;
+
+       ptr = buffer;
+       sz = len;
+       data_len = hai->hai_len - sizeof(*hai);
+       /* each byte formats to two hex chars; keep room for them plus NUL */
+       for (i = 0; (i < data_len) && (sz > 2); i++) {
+               int cnt;
+
+               cnt = snprintf(ptr, sz, "%.2X",
+                              (unsigned char)hai->hai_data[i]);
+               ptr += cnt;
+               sz -= cnt;
+       }
+       *ptr = '\0';
+       return buffer;
+}
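+
+/*
+ * Usage sketch (illustrative): dumping the opaque payload of an action item
+ * into a local buffer for logging:
+ *
+ *	char buf[64];
+ *
+ *	printf("data: %s\n", hai_dump_data_field(hai, buf, sizeof(buf)));
+ */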
+
+/* Copytool action list */
+#define HAL_VERSION 1
+#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */
+struct hsm_action_list {
+       __u32 hal_version;
+       __u32 hal_count;       /* number of hai's to follow */
+       __u64 hal_compound_id; /* returned by coordinator */
+       __u64 hal_flags;
+       __u32 hal_archive_id; /* which archive backend */
+       __u32 padding1;
+       char  hal_fsname[0];   /* null-terminated */
+       /* struct hsm_action_item[hal_count] follows, aligned on 8-byte
+          boundaries. See hai_zero */
+} __attribute__((packed));
+
+#ifndef HAVE_CFS_SIZE_ROUND
+static inline int cfs_size_round(int val)
+{
+       return (val + 7) & (~0x7);
+}
+#define HAVE_CFS_SIZE_ROUND
+#endif
+
+/* Return pointer to first hai in action list */
+static inline struct hsm_action_item *hai_zero(struct hsm_action_list *hal)
+{
+       return (struct hsm_action_item *)(hal->hal_fsname +
+                                         cfs_size_round(strlen(hal->hal_fsname)));
+}
+/* Return pointer to next hai */
+static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai)
+{
+       return (struct hsm_action_item *)((char *)hai +
+                                         cfs_size_round(hai->hai_len));
+}
+
+/* Return size of an hsm_action_list */
+static inline int hal_size(struct hsm_action_list *hal)
+{
+       int i, sz;
+       struct hsm_action_item *hai;
+
+       sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname));
+       hai = hai_zero(hal);
+       for (i = 0 ; i < hal->hal_count ; i++) {
+               sz += cfs_size_round(hai->hai_len);
+               hai = hai_next(hai);
+       }
+       return sz;
+}
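+
+/*
+ * A minimal walk over an action list, as used by hal_size() above
+ * (sketch only; "hal" is assumed to have been received from the
+ * coordinator):
+ *
+ *     struct hsm_action_item *hai = hai_zero(hal);
+ *     int i;
+ *
+ *     for (i = 0; i < hal->hal_count; i++) {
+ *             CDEBUG(D_HSM, "%s on "DFID"\n",
+ *                    hsm_copytool_action2name(hai->hai_action),
+ *                    PFID(&hai->hai_fid));
+ *             hai = hai_next(hai);
+ *     }
+ */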
+
+/* Copytool progress reporting */
+#define HP_FLAG_COMPLETED 0x01
+#define HP_FLAG_RETRY     0x02
+
+struct hsm_progress {
+       lustre_fid              hp_fid;
+       __u64                   hp_cookie;
+       struct hsm_extent       hp_extent;
+       __u16                   hp_flags;
+       __u16                   hp_errval; /* positive val */
+       __u32                   padding;
+};
+
+/**
+ * Used by the copytool for any HSM request it handles.
+ * This structure is initialized by llapi_hsm_copy_start(),
+ * which is a helper over the ioctl() interface.
+ * It stores Lustre-internal data and is for internal use only.
+ */
+struct hsm_copy {
+       __u64                   hc_data_version;
+       __u16                   hc_flags;
+       __u16                   hc_errval; /* positive val */
+       __u32                   padding;
+       struct hsm_action_item  hc_hai;
+};
+
+/** @} lustreuser */
+
+#endif /* _LUSTRE_USER_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h
new file mode 100644 (file)
index 0000000..63da665
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTREAPI_H_
+#define _LUSTREAPI_H_
+
+/** \defgroup llapi llapi
+ *
+ * @{
+ */
+
+#include <lustre/lustre_user.h>
+
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name,
+                          char *obd_uuid, void *args);
+
+/* lustreapi message severity level */
+enum llapi_message_level {
+       LLAPI_MSG_OFF    = 0,
+       LLAPI_MSG_FATAL  = 1,
+       LLAPI_MSG_ERROR  = 2,
+       LLAPI_MSG_WARN   = 3,
+       LLAPI_MSG_NORMAL = 4,
+       LLAPI_MSG_INFO   = 5,
+       LLAPI_MSG_DEBUG  = 6,
+       LLAPI_MSG_MAX
+};
+
+/* the bottom three bits reserved for llapi_message_level */
+#define LLAPI_MSG_MASK   0x00000007
+#define LLAPI_MSG_NO_ERRNO      0x00000010
+
+extern void llapi_msg_set_level(int level);
+extern void llapi_error(int level, int rc, char *fmt, ...);
+#define llapi_err_noerrno(level, fmt, a...)                         \
+       llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a)
+extern void llapi_printf(int level, char *fmt, ...);
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
+                            int stripe_offset, int stripe_count,
+                            int stripe_pattern);
+extern int llapi_file_open(const char *name, int flags, int mode,
+                          unsigned long long stripe_size, int stripe_offset,
+                          int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name,
+                                 unsigned long long stripe_size,
+                                 int stripe_offset, int stripe_count,
+                                 int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+                               unsigned long long stripe_size,
+                               int stripe_offset, int stripe_count,
+                               int stripe_pattern, char *pool_name);
+extern int llapi_poollist(const char *name);
+extern int llapi_get_poollist(const char *name, char **poollist, int list_size,
+                             char *buffer, int buffer_size);
+extern int llapi_get_poolmembers(const char *poolname, char **members,
+                                int list_size, char *buffer, int buffer_size);
+extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
+#define HAVE_LLAPI_FILE_LOOKUP
+extern int llapi_file_lookup(int dirfd, const char *name);
+
+#define VERBOSE_COUNT      0x1
+#define VERBOSE_SIZE       0x2
+#define VERBOSE_OFFSET     0x4
+#define VERBOSE_POOL       0x8
+#define VERBOSE_DETAIL     0x10
+#define VERBOSE_OBJID      0x20
+#define VERBOSE_GENERATION 0x40
+#define VERBOSE_MDTINDEX   0x80
+#define VERBOSE_ALL    (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+                           VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION)
+
+struct find_param {
+       unsigned int maxdepth;
+       time_t  atime;
+       time_t  mtime;
+       time_t  ctime;
+       int     asign;  /* cannot be bitfields due to using pointers to */
+       int     csign;  /* access them during argument parsing. */
+       int     msign;
+       int     type;
+       int          size_sign:2,       /* these need to be signed values */
+                       stripesize_sign:2,
+                       stripecount_sign:2;
+       unsigned long long size;
+       unsigned long long size_units;
+       uid_t uid;
+       gid_t gid;
+
+       unsigned long   zeroend:1,
+                       recursive:1,
+                       exclude_pattern:1,
+                       exclude_type:1,
+                       exclude_obd:1,
+                       exclude_mdt:1,
+                       exclude_gid:1,
+                       exclude_uid:1,
+                       check_gid:1,        /* group ID */
+                       check_uid:1,        /* user ID */
+                       check_pool:1,      /* LOV pool name */
+                       check_size:1,      /* file size */
+                       exclude_pool:1,
+                       exclude_size:1,
+                       exclude_atime:1,
+                       exclude_mtime:1,
+                       exclude_ctime:1,
+                       get_lmv:1,            /* get MDT list from LMV */
+                       raw:1,            /* do not fill in defaults */
+                       check_stripesize:1,     /* LOV stripe size */
+                       exclude_stripesize:1,
+                       check_stripecount:1,    /* LOV stripe count */
+                       exclude_stripecount:1;
+
+       int     verbose;
+       int     quiet;
+
+       /* regular expression */
+       char   *pattern;
+
+       char   *print_fmt;
+
+       struct  obd_uuid       *obduuid;
+       int                  num_obds;
+       int                  num_alloc_obds;
+       int                  obdindex;
+       int                 *obdindexes;
+
+       struct  obd_uuid       *mdtuuid;
+       int                  num_mdts;
+       int                  num_alloc_mdts;
+       int                  mdtindex;
+       int                 *mdtindexes;
+       int                  file_mdtindex;
+
+       int     lumlen;
+       struct  lov_user_mds_data *lmd;
+
+       char poolname[LOV_MAXPOOLNAME + 1];
+
+       int                     fp_lmv_count;
+       struct lmv_user_md      *fp_lmv_md;
+
+       unsigned long long stripesize;
+       unsigned long long stripesize_units;
+       unsigned long long stripecount;
+
+       /* In-process parameters. */
+       unsigned long   got_uuids:1,
+                       obds_printed:1,
+                       have_fileinfo:1;        /* file attrs and LOV xattr */
+       unsigned int    depth;
+       dev_t      st_dev;
+};
+
+extern int llapi_ostlist(char *path, struct find_param *param);
+extern int llapi_uuid_match(char *real_uuid, char *search_uuid);
+extern int llapi_getstripe(char *path, struct find_param *param);
+extern int llapi_find(char *path, struct find_param *param);
+
+extern int llapi_file_fget_mdtidx(int fd, int *mdtidx);
+extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset,
+                                int stripe_count, int stripe_pattern,
+                                char *poolname);
+extern int llapi_direntry_remove(char *dname);
+extern int llapi_obd_statfs(char *path, __u32 type, __u32 index,
+                           struct obd_statfs *stat_buf,
+                           struct obd_uuid *uuid_buf);
+extern int llapi_ping(char *obd_type, char *obd_name);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
+extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
+extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count);
+extern int llapi_is_lustre_mnttype(const char *type);
+extern int llapi_search_ost(char *fsname, char *poolname, char *ostname);
+extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
+extern int parse_size(char *optarg, unsigned long long *size,
+                     unsigned long long *size_units, int bytes_spec);
+extern int llapi_search_mounts(const char *pathname, int index,
+                              char *mntdir, char *fsname);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
+extern int llapi_getname(const char *path, char *buf, size_t size);
+
+extern void llapi_ping_target(char *obd_type, char *obd_name,
+                             char *obd_uuid, void *args);
+
+extern int llapi_search_rootpath(char *pathname, const char *fsname);
+
+struct mntent;
+#define HAVE_LLAPI_IS_LUSTRE_MNT
+extern int llapi_is_lustre_mnt(struct mntent *mnt);
+extern int llapi_quotachown(char *path, int flag);
+extern int llapi_quotacheck(char *mnt, int check_type);
+extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
+extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+                               llapi_cb_t cb);
+extern int llapi_get_connect_flags(const char *mnt, __u64 *flags);
+extern int llapi_lsetfacl(int argc, char *argv[]);
+extern int llapi_lgetfacl(int argc, char *argv[]);
+extern int llapi_rsetfacl(int argc, char *argv[]);
+extern int llapi_rgetfacl(int argc, char *argv[]);
+extern int llapi_cp(int argc, char *argv[]);
+extern int llapi_ls(int argc, char *argv[]);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+                         int pathlen, long long *recno, int *linkno);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_fd2fid(const int fd, lustre_fid *fid);
+
+extern int llapi_get_version(char *buffer, int buffer_size, char **version);
+extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
+extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask,
+                              __u32 archive_id);
+
+extern int llapi_create_volatile_idx(char *directory, int idx, int mode);
+static inline int llapi_create_volatile(char *directory, int mode)
+{
+       return llapi_create_volatile_idx(directory, -1, mode);
+}
+
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+                              __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+                             __u64 dv1, __u64 dv2, __u64 flags);
+
+/* Changelog interface.  priv is private state, managed internally
+ * by these functions. */
+#define CHANGELOG_FLAG_FOLLOW 0x01   /* Not yet implemented */
+/* Blocking I/O makes sense in case of slow user parsing of the records,
+ * but it also prevents us from cleaning up if the records are not
+ * consumed. */
+#define CHANGELOG_FLAG_BLOCK  0x02
+
+/* Records are now received in extended format; though most of them are
+ * still written to disk in the compact changelog_rec format (to save
+ * space and time), they are converted to the extended format in the
+ * Lustre API to ease changelog analysis. */
+#define HAVE_CHANGELOG_EXTEND_REC 1
+
+extern int llapi_changelog_start(void **priv, int flags, const char *mdtname,
+                                long long startrec);
+extern int llapi_changelog_fini(void **priv);
+extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech);
+extern int llapi_changelog_free(struct changelog_ext_rec **rech);
+/* Allow records up to endrec to be destroyed; requires registered id. */
+extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
+                                long long endrec);
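+
+/*
+ * Typical consumer loop for the changelog interface above (a sketch:
+ * error handling is elided and the MDT name is an example value):
+ *
+ *     void *priv;
+ *     struct changelog_ext_rec *rec;
+ *
+ *     llapi_changelog_start(&priv, CHANGELOG_FLAG_BLOCK,
+ *                           "lustre-MDT0000", 0LL);
+ *     while (llapi_changelog_recv(priv, &rec) == 0) {
+ *             ... process rec ...
+ *             llapi_changelog_free(&rec);
+ *     }
+ *     llapi_changelog_fini(&priv);
+ */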
+
+/* HSM copytool interface.
+ * priv is private state, managed internally by these functions
+ */
+struct hsm_copytool_private;
+extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv,
+                                   char *fsname, int flags,
+                                   int archive_count, int *archives);
+extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv);
+extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv,
+                                  struct hsm_action_list **hal, int *msgsize);
+extern int llapi_hsm_copytool_free(struct hsm_action_list **hal);
+extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy,
+                               const struct hsm_action_item *hai);
+extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy,
+                             const struct hsm_progress *hp);
+extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp);
+extern int llapi_hsm_import(const char *dst, int archive, struct stat *st,
+                           unsigned long long stripe_size, int stripe_offset,
+                           int stripe_count, int stripe_pattern,
+                           char *pool_name, lustre_fid *newfid);
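+
+/*
+ * Sketch of a copytool main loop built from the interface above (error
+ * handling elided; the fsname is an example value and passing a zero
+ * archive count with a NULL list is assumed to mean "all archives"):
+ *
+ *     struct hsm_copytool_private *ct;
+ *     struct hsm_action_list *hal;
+ *     int msgsize;
+ *
+ *     llapi_hsm_copytool_start(&ct, "lustre", 0, 0, NULL);
+ *     while (llapi_hsm_copytool_recv(ct, &hal, &msgsize) == 0) {
+ *             ... walk hal with hai_zero()/hai_next(), calling
+ *             llapi_hsm_copy_start()/llapi_hsm_copy_end() per item ...
+ *             llapi_hsm_copytool_free(&hal);
+ *     }
+ *     llapi_hsm_copytool_fini(&ct);
+ */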
+
+/* HSM user interface */
+extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount,
+                                                            int data_len);
+extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request);
+extern int llapi_hsm_current_action(const char *path,
+                                   struct hsm_current_action *hca);
+/** @} llapi */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h
new file mode 100644 (file)
index 0000000..5cfb87b
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_acl.h
+ */
+
+#ifndef _LUSTRE_ACL_H
+#define _LUSTRE_ACL_H
+
+#include <linux/lustre_acl.h>
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h
new file mode 100644 (file)
index 0000000..d77bffc
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_capa.h
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#ifndef __LINUX_CAPA_H_
+#define __LINUX_CAPA_H_
+
+/** \defgroup capa capa
+ *
+ * @{
+ */
+
+/*
+ * capability
+ */
+#include <linux/crypto.h>
+#include <lustre/lustre_idl.h>
+
+#define CAPA_TIMEOUT 1800              /* sec, == 30 min */
+#define CAPA_KEY_TIMEOUT (24 * 60 * 60)  /* sec, == 1 day */
+
+struct capa_hmac_alg {
+       const char     *ha_name;
+       int          ha_len;
+       int          ha_keylen;
+};
+
+#define DEF_CAPA_HMAC_ALG(name, type, len, keylen)     \
+[CAPA_HMAC_ALG_ ## type] = {                           \
+       .ha_name   = name,                              \
+       .ha_len    = len,                               \
+       .ha_keylen = keylen,                            \
+}
+
+struct client_capa {
+       struct inode         *inode;
+       struct list_head                lli_list;     /* link to lli_oss_capas */
+};
+
+struct target_capa {
+       struct hlist_node         c_hash;       /* link to capa hash */
+};
+
+struct obd_capa {
+       struct list_head                c_list;       /* link to capa_list */
+
+       struct lustre_capa      c_capa;       /* capa */
+       atomic_t              c_refc;       /* ref count */
+       cfs_time_t              c_expiry;     /* jiffies */
+       spinlock_t              c_lock; /* protect capa content */
+       int                     c_site;
+
+       union {
+               struct client_capa      cli;
+               struct target_capa      tgt;
+       } u;
+};
+
+enum {
+       CAPA_SITE_CLIENT = 0,
+       CAPA_SITE_SERVER,
+       CAPA_SITE_MAX
+};
+
+static inline struct lu_fid *capa_fid(struct lustre_capa *capa)
+{
+       return &capa->lc_fid;
+}
+
+static inline __u64 capa_opc(struct lustre_capa *capa)
+{
+       return capa->lc_opc;
+}
+
+static inline __u64 capa_uid(struct lustre_capa *capa)
+{
+       return capa->lc_uid;
+}
+
+static inline __u64 capa_gid(struct lustre_capa *capa)
+{
+       return capa->lc_gid;
+}
+
+static inline __u32 capa_flags(struct lustre_capa *capa)
+{
+       return capa->lc_flags & 0xffffff;
+}
+
+static inline __u32 capa_alg(struct lustre_capa *capa)
+{
+       return (capa->lc_flags >> 24);
+}
+
+static inline __u32 capa_keyid(struct lustre_capa *capa)
+{
+       return capa->lc_keyid;
+}
+
+static inline __u64 capa_key_seq(struct lustre_capa_key *key)
+{
+       return key->lk_seq;
+}
+
+static inline __u32 capa_key_keyid(struct lustre_capa_key *key)
+{
+       return key->lk_keyid;
+}
+
+static inline __u32 capa_timeout(struct lustre_capa *capa)
+{
+       return capa->lc_timeout;
+}
+
+static inline __u32 capa_expiry(struct lustre_capa *capa)
+{
+       return capa->lc_expiry;
+}
+
+void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *,
+                const char *fmt, ...);
+#define DEBUG_CAPA(level, capa, fmt, args...)                            \
+do {                                                                      \
+       if (((level) & D_CANTMASK) != 0 ||                                   \
+           ((libcfs_debug & (level)) != 0 &&                             \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) {               \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);             \
+               _debug_capa((capa), &msgdata, fmt, ##args);                 \
+       }                                                                     \
+} while (0)
+
+#define DEBUG_CAPA_KEY(level, k, fmt, args...)                           \
+do {                                                                     \
+       CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n",    \
+              ##args, k, capa_key_seq(k), capa_key_keyid(k));            \
+} while (0)
+
+typedef int (*renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *);
+
+/* obdclass/capa.c */
+extern struct list_head capa_list[];
+extern spinlock_t capa_lock;
+extern int capa_count[];
+extern struct kmem_cache *capa_cachep;
+
+struct hlist_head *init_capa_hash(void);
+void cleanup_capa_hash(struct hlist_head *hash);
+
+struct obd_capa *capa_add(struct hlist_head *hash,
+                         struct lustre_capa *capa);
+struct obd_capa *capa_lookup(struct hlist_head *hash,
+                            struct lustre_capa *capa, int alive);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key);
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen);
+void capa_cpy(void *dst, struct obd_capa *ocapa);
+static inline struct obd_capa *alloc_capa(int site)
+{
+       struct obd_capa *ocapa;
+
+       if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
+               return ERR_PTR(-EINVAL);
+
+       OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
+       if (unlikely(!ocapa))
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&ocapa->c_list);
+       atomic_set(&ocapa->c_refc, 1);
+       spin_lock_init(&ocapa->c_lock);
+       ocapa->c_site = site;
+       if (ocapa->c_site == CAPA_SITE_CLIENT)
+               INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+       else
+               INIT_HLIST_NODE(&ocapa->u.tgt.c_hash);
+
+       return ocapa;
+}
+
+static inline struct obd_capa *capa_get(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return NULL;
+
+       atomic_inc(&ocapa->c_refc);
+       return ocapa;
+}
+
+static inline void capa_put(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return;
+
+       if (atomic_read(&ocapa->c_refc) == 0) {
+               DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for");
+               LBUG();
+       }
+
+       if (atomic_dec_and_test(&ocapa->c_refc)) {
+               LASSERT(list_empty(&ocapa->c_list));
+               if (ocapa->c_site == CAPA_SITE_CLIENT) {
+                       LASSERT(list_empty(&ocapa->u.cli.lli_list));
+               } else {
+                       struct hlist_node *hnode;
+
+                       hnode = &ocapa->u.tgt.c_hash;
+                       LASSERT(!hnode->next && !hnode->pprev);
+               }
+               OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa));
+       }
+}
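+
+/*
+ * Reference-counting sketch for the helpers above: alloc_capa() returns
+ * a capability with one reference held; each capa_get() must be matched
+ * by a capa_put(), and the final put frees the capability:
+ *
+ *     struct obd_capa *ocapa = alloc_capa(CAPA_SITE_CLIENT);
+ *
+ *     if (!IS_ERR(ocapa)) {
+ *             capa_get(ocapa);        ... extra user of the capa ...
+ *             capa_put(ocapa);        ... drop the extra reference ...
+ *             capa_put(ocapa);        ... last ref, frees the capa ...
+ *     }
+ */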
+
+static inline int open_flags_to_accmode(int flags)
+{
+       int mode = flags;
+
+       if ((mode + 1) & O_ACCMODE)
+               mode++;
+       if (mode & O_TRUNC)
+               mode |= 2;
+
+       return mode;
+}
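+
+/*
+ * Worked example for the accmode conversion above: O_RDONLY (0) maps to
+ * FMODE_READ (1), O_WRONLY (1) to FMODE_WRITE (2), O_RDWR (2) to
+ * FMODE_READ | FMODE_WRITE (3), and O_TRUNC additionally implies write
+ * access.
+ */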
+
+static inline __u64 capa_open_opc(int mode)
+{
+       return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ;
+}
+
+static inline void set_capa_expiry(struct obd_capa *ocapa)
+{
+       cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry,
+                                        cfs_time_current_sec());
+       ocapa->c_expiry = cfs_time_add(cfs_time_current(),
+                                      cfs_time_seconds(expiry));
+}
+
+static inline int capa_is_expired_sec(struct lustre_capa *capa)
+{
+       return (capa->lc_expiry - cfs_time_current_sec() <= 0);
+}
+
+static inline int capa_is_expired(struct obd_capa *ocapa)
+{
+       return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current());
+}
+
+static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc)
+{
+       return (capa_opc(capa) & opc) == opc;
+}
+
+struct filter_capa_key {
+       struct list_head              k_list;
+       struct lustre_capa_key  k_key;
+};
+
+enum {
+       LC_ID_NONE      = 0,
+       LC_ID_PLAIN     = 1,
+       LC_ID_CONVERT   = 2
+};
+
+#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT)
+
+/** @} capa */
+
+#endif /* __LINUX_CAPA_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h
new file mode 100644 (file)
index 0000000..f12429f
--- /dev/null
@@ -0,0 +1,299 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_CFG_H
+#define _LUSTRE_CFG_H
+
+/** \defgroup cfg cfg
+ *
+ * @{
+ */
+
+/*
+ * 1cf6
+ * lcfG
+ */
+#define LUSTRE_CFG_VERSION 0x1cf60001
+#define LUSTRE_CFG_MAX_BUFCOUNT 8
+
+#define LCFG_HDR_SIZE(count) \
+       cfs_size_round(offsetof(struct lustre_cfg, lcfg_buflens[(count)]))
+
+/** If the LCFG_REQUIRED bit is set in a configuration command,
+ * then the client is required to understand this parameter
+ * in order to mount the filesystem. If it does not understand
+ * a REQUIRED command the client mount will fail. */
+#define LCFG_REQUIRED   0x0001000
+
+enum lcfg_command_type {
+       LCFG_ATTACH          = 0x00cf001, /**< create a new obd instance */
+       LCFG_DETACH          = 0x00cf002, /**< destroy obd instance */
+       LCFG_SETUP            = 0x00cf003, /**< call type-specific setup */
+       LCFG_CLEANUP        = 0x00cf004, /**< call type-specific cleanup */
+       LCFG_ADD_UUID      = 0x00cf005, /**< add a nid to a niduuid */
+       LCFG_DEL_UUID      = 0x00cf006, /**< remove a nid from a niduuid */
+       LCFG_MOUNTOPT      = 0x00cf007, /**< create a profile (mdc, osc) */
+       LCFG_DEL_MOUNTOPT       = 0x00cf008, /**< destroy a profile */
+       LCFG_SET_TIMEOUT        = 0x00cf009, /**< set obd_timeout */
+       LCFG_SET_UPCALL  = 0x00cf00a, /**< deprecated */
+       LCFG_ADD_CONN      = 0x00cf00b, /**< add a failover niduuid to an obd */
+       LCFG_DEL_CONN      = 0x00cf00c, /**< remove a failover niduuid */
+       LCFG_LOV_ADD_OBD        = 0x00cf00d, /**< add an osc to a lov */
+       LCFG_LOV_DEL_OBD        = 0x00cf00e, /**< remove an osc from a lov */
+       LCFG_PARAM            = 0x00cf00f, /**< set a proc parameter */
+       LCFG_MARKER          = 0x00cf010, /**< metadata about next cfg rec */
+       LCFG_LOG_START    = 0x00ce011, /**< mgc only, process a cfg log */
+       LCFG_LOG_END        = 0x00ce012, /**< stop processing updates */
+       LCFG_LOV_ADD_INA        = 0x00ce013, /**< like LOV_ADD_OBD, inactive */
+       LCFG_ADD_MDC        = 0x00cf014, /**< add an mdc to a lmv */
+       LCFG_DEL_MDC        = 0x00cf015, /**< remove an mdc from a lmv */
+       LCFG_SPTLRPC_CONF       = 0x00ce016, /**< security */
+       LCFG_POOL_NEW      = 0x00ce020, /**< create an ost pool name */
+       LCFG_POOL_ADD      = 0x00ce021, /**< add an ost to a pool */
+       LCFG_POOL_REM      = 0x00ce022, /**< remove an ost from a pool */
+       LCFG_POOL_DEL      = 0x00ce023, /**< destroy an ost pool name */
+       LCFG_SET_LDLM_TIMEOUT   = 0x00ce030, /**< set ldlm_timeout */
+       LCFG_PRE_CLEANUP        = 0x00cf031, /**< call type-specific
+                                             * pre-cleanup */
+};
+
+struct lustre_cfg_bufs {
+       void    *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT];
+       __u32    lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT];
+       __u32    lcfg_bufcount;
+};
+
+struct lustre_cfg {
+       __u32 lcfg_version;
+       __u32 lcfg_command;
+
+       __u32 lcfg_num;
+       __u32 lcfg_flags;
+       __u64 lcfg_nid;
+       __u32 lcfg_nal;         /* not used any more */
+
+       __u32 lcfg_bufcount;
+       __u32 lcfg_buflens[0];
+};
+
+enum cfg_record_type {
+       PORTALS_CFG_TYPE = 1,
+       LUSTRE_CFG_TYPE = 123,
+};
+
+#define LUSTRE_CFG_BUFLEN(lcfg, idx)           \
+       ((lcfg)->lcfg_bufcount <= (idx)         \
+        ? 0                                    \
+        : (lcfg)->lcfg_buflens[(idx)])
+
+static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs,
+                                      __u32 index, void *buf, __u32 buflen)
+{
+       if (index >= LUSTRE_CFG_MAX_BUFCOUNT)
+               return;
+       if (bufs == NULL)
+               return;
+
+       if (bufs->lcfg_bufcount <= index)
+               bufs->lcfg_bufcount = index + 1;
+
+       bufs->lcfg_buf[index]    = buf;
+       bufs->lcfg_buflen[index] = buflen;
+}
+
+static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs,
+                                             __u32 index,
+                                             char *str)
+{
+       lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0);
+}
+
+static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name)
+{
+       memset((bufs), 0, sizeof(*bufs));
+       if (name)
+               lustre_cfg_bufs_set_string(bufs, 0, name);
+}
+
+static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index)
+{
+       int i;
+       int offset;
+       int bufcount;
+
+       LASSERT(lcfg != NULL);
+       LASSERT(index >= 0);
+
+       bufcount = lcfg->lcfg_bufcount;
+       if (index >= bufcount)
+               return NULL;
+
+       offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+       for (i = 0; i < index; i++)
+               offset += cfs_size_round(lcfg->lcfg_buflens[i]);
+       return (char *)lcfg + offset;
+}
+
+static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs,
+                                       struct lustre_cfg *lcfg)
+{
+       int i;
+       bufs->lcfg_bufcount = lcfg->lcfg_bufcount;
+       for (i = 0; i < bufs->lcfg_bufcount; i++) {
+               bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i];
+               bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i);
+       }
+}
+
+static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index)
+{
+       char *s;
+
+       if (lcfg->lcfg_buflens[index] == 0)
+               return NULL;
+
+       s = lustre_cfg_buf(lcfg, index);
+       if (s == NULL)
+               return NULL;
+
+       /*
+        * Make sure it's NUL-terminated, even if this kills a char
+        * of data.  Try to use the padding first, though.
+        */
+       if (s[lcfg->lcfg_buflens[index] - 1] != '\0') {
+               int last = min((int)lcfg->lcfg_buflens[index],
+                              cfs_size_round(lcfg->lcfg_buflens[index]) - 1);
+               char lost = s[last];
+               s[last] = '\0';
+               if (lost != '\0') {
+                       CWARN("Truncated buf %d to '%s' (lost '%c'...)\n",
+                             index, s, lost);
+               }
+       }
+       return s;
+}
+
+static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens)
+{
+       int i;
+       int len;
+       ENTRY;
+
+       len = LCFG_HDR_SIZE(bufcount);
+       for (i = 0; i < bufcount; i++)
+               len += cfs_size_round(buflens[i]);
+
+       RETURN(cfs_size_round(len));
+}
+
+#include <obd_support.h>
+
+static inline struct lustre_cfg *lustre_cfg_new(int cmd,
+                                               struct lustre_cfg_bufs *bufs)
+{
+       struct lustre_cfg *lcfg;
+       char *ptr;
+       int i;
+
+       ENTRY;
+
+       OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
+                                      bufs->lcfg_buflen));
+       if (!lcfg)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lcfg->lcfg_version = LUSTRE_CFG_VERSION;
+       lcfg->lcfg_command = cmd;
+       lcfg->lcfg_bufcount = bufs->lcfg_bufcount;
+
+       ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount);
+       for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+               lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i];
+               LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr);
+       }
+       RETURN(lcfg);
+}
+
+static inline void lustre_cfg_free(struct lustre_cfg *lcfg)
+{
+       int len;
+
+       len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens);
+
+       OBD_FREE(lcfg, len);
+       EXIT;
+       return;
+}
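+
+/*
+ * Putting the helpers together (a sketch; the obd name and parameter
+ * string are illustrative):
+ *
+ *     struct lustre_cfg_bufs bufs;
+ *     struct lustre_cfg *lcfg;
+ *
+ *     lustre_cfg_bufs_reset(&bufs, "lustre-MDT0000");
+ *     lustre_cfg_bufs_set_string(&bufs, 1, "some.param=value");
+ *     lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+ *     if (!IS_ERR(lcfg)) {
+ *             ... hand lcfg to the config machinery ...
+ *             lustre_cfg_free(lcfg);
+ *     }
+ */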
+
+static inline int lustre_cfg_sanity_check(void *buf, int len)
+{
+       struct lustre_cfg *lcfg = (struct lustre_cfg *)buf;
+       ENTRY;
+       if (!lcfg)
+               RETURN(-EINVAL);
+
+       /* check that the first bits of the struct are valid */
+       if (len < LCFG_HDR_SIZE(0))
+               RETURN(-EINVAL);
+
+       if (lcfg->lcfg_version != LUSTRE_CFG_VERSION)
+               RETURN(-EINVAL);
+
+       if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT)
+               RETURN(-EINVAL);
+
+       /* check that the buflens are valid */
+       if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount))
+               RETURN(-EINVAL);
+
+       /* make sure all the pointers point inside the data */
+       if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens))
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
+
+#include <lustre/lustre_user.h>
+
+#ifndef INVALID_UID
+#define INVALID_UID     (-1)
+#endif
+
+/** @} cfg */
+
+#endif /* _LUSTRE_CFG_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h
new file mode 100644 (file)
index 0000000..3d9e446
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_DEBUG_H
+#define _LUSTRE_DEBUG_H
+
+/** \defgroup debug debug
+ *
+ * @{
+ */
+
+#include <lustre_net.h>
+#include <obd.h>
+
+#include <linux/lustre_debug.h>
+
+#define ASSERT_MAX_SIZE_MB 60000ULL
+#define ASSERT_PAGE_INDEX(index, OP)                                   \
+do {                                                                   \
+       if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) {    \
+               CERROR("bad page index %lu > %llu\n", index,            \
+                      ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT));  \
+               libcfs_debug = ~0UL;                                    \
+               OP;                                                     \
+       }                                                               \
+} while (0)
+
+#define ASSERT_FILE_OFFSET(offset, OP)                                 \
+do {                                                                   \
+       if (offset > ASSERT_MAX_SIZE_MB << 20) {                        \
+               CERROR("bad file offset %llu > %llu\n", offset,         \
+                      ASSERT_MAX_SIZE_MB << 20);                       \
+               libcfs_debug = ~0UL;                                    \
+               OP;                                                     \
+       }                                                               \
+} while (0)
+
+/* lib/debug.c */
+void dump_lniobuf(struct niobuf_local *lnb);
+int dump_req(struct ptlrpc_request *req);
+void dump_lsm(int level, struct lov_stripe_md *lsm);
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id);
+int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id);
+
+/** @} debug */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
new file mode 100644 (file)
index 0000000..8db6086
--- /dev/null
@@ -0,0 +1,543 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_disk.h
+ *
+ * Lustre disk format definitions.
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_DISK_H
+#define _LUSTRE_DISK_H
+
+/** \defgroup disk disk
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/types.h>
+
+/****************** on-disk files *********************/
+
+#define MDT_LOGS_DIR      "LOGS"  /* COMPAT_146 */
+#define MOUNT_CONFIGS_DIR "CONFIGS"
+#define CONFIGS_FILE      "mountdata"
+/** Persistent mount data are stored on the disk in this file. */
+#define MOUNT_DATA_FILE    MOUNT_CONFIGS_DIR"/"CONFIGS_FILE
+#define LAST_RCVD       "last_rcvd"
+#define LOV_OBJID       "lov_objid"
+#define LOV_OBJSEQ             "lov_objseq"
+#define HEALTH_CHECK      "health_check"
+#define CAPA_KEYS       "capa_keys"
+#define CHANGELOG_USERS   "changelog_users"
+#define MGS_NIDTBL_DIR    "NIDTBL_VERSIONS"
+#define QMT_DIR           "quota_master"
+#define QSD_DIR           "quota_slave"
+
+/****************** persistent mount data *********************/
+
+#define LDD_F_SV_TYPE_MDT   0x0001
+#define LDD_F_SV_TYPE_OST   0x0002
+#define LDD_F_SV_TYPE_MGS   0x0004
+#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT  | \
+                           LDD_F_SV_TYPE_OST  | \
+                           LDD_F_SV_TYPE_MGS)
+#define LDD_F_SV_ALL   0x0008
+/** need an index assignment */
+#define LDD_F_NEED_INDEX    0x0010
+/** never registered */
+#define LDD_F_VIRGIN   0x0020
+/** update the config logs for this server */
+#define LDD_F_UPDATE   0x0040
+/** rewrite the LDD */
+#define LDD_F_REWRITE_LDD   0x0080
+/** regenerate config logs for this fs or server */
+#define LDD_F_WRITECONF     0x0100
+/** COMPAT_14 */
+#define LDD_F_UPGRADE14     0x0200
+/** process as lctl conf_param */
+#define LDD_F_PARAM     0x0400
+/** all nodes are specified as service nodes */
+#define LDD_F_NO_PRIMNODE   0x1000
+/** IR enable flag */
+#define LDD_F_IR_CAPABLE    0x2000
+/** the MGS refused to register the target. */
+#define LDD_F_ERROR     0x4000
+
+/* opc for target register */
+#define LDD_F_OPC_REG   0x10000000
+#define LDD_F_OPC_UNREG 0x20000000
+#define LDD_F_OPC_READY 0x40000000
+#define LDD_F_OPC_MASK  0xf0000000
+
+#define LDD_F_ONDISK_MASK  (LDD_F_SV_TYPE_MASK)
+
+#define LDD_F_MASK       0xFFFF
+
+enum ldd_mount_type {
+       LDD_MT_EXT3 = 0,
+       LDD_MT_LDISKFS,
+       LDD_MT_SMFS,
+       LDD_MT_REISERFS,
+       LDD_MT_LDISKFS2,
+       LDD_MT_ZFS,
+       LDD_MT_LAST
+};
+
+static inline char *mt_str(enum ldd_mount_type mt)
+{
+       static char *mount_type_string[] = {
+               "ext3",
+               "ldiskfs",
+               "smfs",
+               "reiserfs",
+               "ldiskfs2",
+               "zfs",
+       };
+       return mount_type_string[mt];
+}
+
+static inline char *mt_type(enum ldd_mount_type mt)
+{
+       static char *mount_type_string[] = {
+               "osd-ldiskfs",
+               "osd-ldiskfs",
+               "osd-smfs",
+               "osd-reiserfs",
+               "osd-ldiskfs",
+               "osd-zfs",
+       };
+       return mount_type_string[mt];
+}
+
+#define LDD_INCOMPAT_SUPP 0
+#define LDD_ROCOMPAT_SUPP 0
+
+#define LDD_MAGIC 0x1dd00001
+
+/* On-disk configuration file. In host-endian order. */
+struct lustre_disk_data {
+       __u32      ldd_magic;
+       __u32      ldd_feature_compat;  /* compatible feature flags */
+       __u32      ldd_feature_rocompat;/* read-only compatible feature flags */
+       __u32      ldd_feature_incompat;/* incompatible feature flags */
+
+       __u32      ldd_config_ver;      /* config rewrite count - not used */
+       __u32      ldd_flags;      /* LDD_SV_TYPE */
+       __u32      ldd_svindex;  /* server index (0001), must match
+                                          svname */
+       __u32      ldd_mount_type;      /* target fs type LDD_MT_* */
+       char       ldd_fsname[64];      /* filesystem this server is part of,
+                                          MTI_NAME_MAXLEN */
+       char       ldd_svname[64];      /* this server's name (lustre-mdt0001)*/
+       __u8       ldd_uuid[40];        /* server UUID (COMPAT_146) */
+
+/*200*/ char       ldd_userdata[1024 - 200]; /* arbitrary user string */
+/*1024*/__u8       ldd_padding[4096 - 1024];
+/*4096*/char       ldd_mount_opts[4096]; /* target fs mount opts */
+/*8192*/char       ldd_params[4096];     /* key=value pairs */
+};
+
+
+#define IS_MDT(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MDT)
+#define IS_OST(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_OST)
+#define IS_MGS(data)    ((data)->lsi_flags & LDD_F_SV_TYPE_MGS)
+#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \
+                        LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST))
+#define MT_STR(data)    mt_str((data)->ldd_mount_type)
+
+/* Make the mdt/ost server obd name based on the filesystem name */
+static inline int server_make_name(__u32 flags, __u16 index, char *fs,
+                                  char *name)
+{
+       if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) {
+               if (!(flags & LDD_F_SV_ALL))
+                       sprintf(name, "%.8s%c%s%04x", fs,
+                               (flags & LDD_F_VIRGIN) ? ':' :
+                                       ((flags & LDD_F_WRITECONF) ? '=' : '-'),
+                               (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST",
+                               index);
+       } else if (flags & LDD_F_SV_TYPE_MGS) {
+               sprintf(name, "MGS");
+       } else {
+               CERROR("unknown server type %#x\n", flags);
+               return 1;
+       }
+       return 0;
+}
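+
+/*
+ * Examples of names generated above for fs "lustre", index 1:
+ * "lustre-MDT0001" for a registered MDT, "lustre:MDT0001" while still
+ * virgin, and "lustre=MDT0001" when a writeconf is pending.
+ */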
+
+/****************** mount command *********************/
+
+/* The lmd is only used internally by Lustre; mount simply passes
+   everything as string options */
+
+#define LMD_MAGIC    0xbdacbd03
+
+/* gleaned from the mount command - no persistent info here */
+struct lustre_mount_data {
+       __u32      lmd_magic;
+       __u32      lmd_flags;    /* lustre mount flags */
+       int     lmd_mgs_failnodes; /* mgs failover node count */
+       int     lmd_exclude_count;
+       int     lmd_recovery_time_soft;
+       int     lmd_recovery_time_hard;
+       char      *lmd_dev;        /* device name */
+       char      *lmd_profile;       /* client only */
+       char      *lmd_mgssec;  /* sptlrpc flavor to mgs */
+       char      *lmd_opts;      /* lustre mount options (as opposed to
+                                        _device_ mount options) */
+       char      *lmd_params;  /* lustre params */
+       __u32     *lmd_exclude;       /* array of OSTs to ignore */
+       char    *lmd_mgs;          /* MGS nid */
+       char    *lmd_osd_type;      /* OSD type */
+};
+
+#define LMD_FLG_SERVER       0x0001  /* Mounting a server */
+#define LMD_FLG_CLIENT       0x0002  /* Mounting a client */
+#define LMD_FLG_ABORT_RECOV  0x0008  /* Abort recovery */
+#define LMD_FLG_NOSVC        0x0010  /* Only start MGS/MGC for servers,
+                                        no other services */
+#define LMD_FLG_NOMGS        0x0020  /* Only start target for servers,
+                                        reusing existing MGS services */
+#define LMD_FLG_WRITECONF    0x0040  /* Rewrite config log */
+#define LMD_FLG_NOIR         0x0080  /* No imperative recovery */
+#define LMD_FLG_NOSCRUB      0x0100  /* Do not trigger scrub automatically */
+#define LMD_FLG_MGS          0x0200  /* Also start MGS along with server */
+#define LMD_FLG_IAM          0x0400  /* IAM dir */
+#define LMD_FLG_NO_PRIMNODE  0x0800  /* all nodes are service nodes */
+#define LMD_FLG_VIRGIN       0x1000  /* the service registers for the
+                                        first time */
+#define LMD_FLG_UPDATE       0x2000  /* update parameters */
+
+#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT)
+
+
+/****************** last_rcvd file *********************/
+
+/** version recovery epoch */
+#define LR_EPOCH_BITS   32
+#define lr_epoch(a) ((a) >> LR_EPOCH_BITS)
+#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
+#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */
+
+#define LR_SERVER_SIZE   512
+#define LR_CLIENT_START 8192
+#define LR_CLIENT_SIZE   128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+
+/*
+ * This limit is arbitrary (131072 clients on x86), but it is convenient to use
+ * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation.
+ * If we need more than 131072 clients (order-2 allocation on x86) then this
+ * should become an array of single-page pointers that are allocated on demand.
+ */
+#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
+#define LR_MAX_CLIENTS (128 * 1024UL)
+#else
+#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8)
+#endif
+
+/** COMPAT_146: this is an OST (temporary) */
+#define OBD_COMPAT_OST   0x00000002
+/** COMPAT_146: this is an MDT (temporary) */
+#define OBD_COMPAT_MDT   0x00000004
+/** 2.0 server, interop flag to show server version is changed */
+#define OBD_COMPAT_20     0x00000008
+
+/** MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_LOVOBJID   0x00000001
+
+/** OST handles group subdirs */
+#define OBD_INCOMPAT_GROUPS     0x00000001
+/** this is an OST */
+#define OBD_INCOMPAT_OST       0x00000002
+/** this is an MDT */
+#define OBD_INCOMPAT_MDT       0x00000004
+/** common last_rcvd format */
+#define OBD_INCOMPAT_COMMON_LR  0x00000008
+/** FID is enabled */
+#define OBD_INCOMPAT_FID       0x00000010
+/** Size-on-MDS is enabled */
+#define OBD_INCOMPAT_SOM       0x00000020
+/** filesystem using iam format to store directory entries */
+#define OBD_INCOMPAT_IAM_DIR    0x00000040
+/** LMA attribute contains per-inode incompatible flags */
+#define OBD_INCOMPAT_LMA       0x00000080
+/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
+ * bits are now used to store a generation. Once we start changing the layout
+ * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
+ * will be confused by interpreting stripe_count | gen << 16 as the actual
+ * stripe count */
+#define OBD_INCOMPAT_LMM_VER    0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI   0x00000200
+
+/* Data stored per server at the head of the last_rcvd file.  In le32 order.
+   This should be common to filter_internal.h, lustre_mds.h */
+struct lr_server_data {
+       __u8  lsd_uuid[40];     /* server UUID */
+       __u64 lsd_last_transno;    /* last completed transaction ID */
+       __u64 lsd_compat14;     /* reserved - compat with old last_rcvd */
+       __u64 lsd_mount_count;     /* incarnation number */
+       __u32 lsd_feature_compat;  /* compatible feature flags */
+       __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
+       __u32 lsd_feature_incompat;/* incompatible feature flags */
+       __u32 lsd_server_size;     /* size of server data area */
+       __u32 lsd_client_start;    /* start of per-client data area */
+       __u16 lsd_client_size;     /* size of per-client data area */
+       __u16 lsd_subdir_count;    /* number of subdirectories for objects */
+       __u64 lsd_catalog_oid;     /* recovery catalog object id */
+       __u32 lsd_catalog_ogen;    /* recovery catalog inode generation */
+       __u8  lsd_peeruuid[40];    /* UUID of MDS associated with this OST */
+       __u32 lsd_osd_index;       /* index number of OST in LOV */
+       __u32 lsd_padding1;     /* was lsd_mdt_index, unused in 2.4.0 */
+       __u32 lsd_start_epoch;     /* VBR: start epoch from last boot */
+       /** transaction values since lsd_trans_table_time */
+       __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
+       /** start point of transno table below */
+       __u32 lsd_trans_table_time; /* time of first slot in table above */
+       __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
+       __u8  lsd_padding[LR_SERVER_SIZE - 288];
+};
+
+/* Data stored per client in the last_rcvd file.  In le32 order. */
+struct lsd_client_data {
+       __u8  lcd_uuid[40];      /* client UUID */
+       __u64 lcd_last_transno; /* last completed transaction ID */
+       __u64 lcd_last_xid;     /* xid for the last transaction */
+       __u32 lcd_last_result;  /* result from last RPC */
+       __u32 lcd_last_data;    /* per-op data (disposition for open &c.) */
+       /* for MDS_CLOSE requests */
+       __u64 lcd_last_close_transno; /* last completed transaction ID */
+       __u64 lcd_last_close_xid;     /* xid for the last transaction */
+       __u32 lcd_last_close_result;  /* result from last RPC */
+       __u32 lcd_last_close_data;    /* per-op data */
+       /* VBR: last versions */
+       __u64 lcd_pre_versions[4];
+       __u32 lcd_last_epoch;
+       /** orphans handling for delayed export rely on that */
+       __u32 lcd_first_epoch;
+       __u8  lcd_padding[LR_CLIENT_SIZE - 128];
+};
+
+/* bug20354: the lcd_uuid for export of clients may be wrong */
+static inline void check_lcd(char *obd_name, int index,
+                            struct lsd_client_data *lcd)
+{
+       int length = sizeof(lcd->lcd_uuid);
+
+       if (strnlen((char *)lcd->lcd_uuid, length) == length) {
+               lcd->lcd_uuid[length - 1] = '\0';
+
+               LCONSOLE_ERROR("the client UUID (%s) on %s for exports "
+                              "stored in last_rcvd (index = %d) is bad!\n",
+                              lcd->lcd_uuid, obd_name, index);
+       }
+}
+
+/* last_rcvd handling */
+static inline void lsd_le_to_cpu(struct lr_server_data *buf,
+                                struct lr_server_data *lsd)
+{
+       int i;
+       memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
+       lsd->lsd_last_transno     = le64_to_cpu(buf->lsd_last_transno);
+       lsd->lsd_compat14        = le64_to_cpu(buf->lsd_compat14);
+       lsd->lsd_mount_count      = le64_to_cpu(buf->lsd_mount_count);
+       lsd->lsd_feature_compat   = le32_to_cpu(buf->lsd_feature_compat);
+       lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
+       lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
+       lsd->lsd_server_size      = le32_to_cpu(buf->lsd_server_size);
+       lsd->lsd_client_start     = le32_to_cpu(buf->lsd_client_start);
+       lsd->lsd_client_size      = le16_to_cpu(buf->lsd_client_size);
+       lsd->lsd_subdir_count     = le16_to_cpu(buf->lsd_subdir_count);
+       lsd->lsd_catalog_oid      = le64_to_cpu(buf->lsd_catalog_oid);
+       lsd->lsd_catalog_ogen     = le32_to_cpu(buf->lsd_catalog_ogen);
+       memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
+       lsd->lsd_osd_index      = le32_to_cpu(buf->lsd_osd_index);
+       lsd->lsd_padding1       = le32_to_cpu(buf->lsd_padding1);
+       lsd->lsd_start_epoch      = le32_to_cpu(buf->lsd_start_epoch);
+       for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+               lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
+       lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
+       lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
+}
+
+static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
+                                struct lr_server_data *buf)
+{
+       int i;
+       memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
+       buf->lsd_last_transno     = cpu_to_le64(lsd->lsd_last_transno);
+       buf->lsd_compat14        = cpu_to_le64(lsd->lsd_compat14);
+       buf->lsd_mount_count      = cpu_to_le64(lsd->lsd_mount_count);
+       buf->lsd_feature_compat   = cpu_to_le32(lsd->lsd_feature_compat);
+       buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
+       buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
+       buf->lsd_server_size      = cpu_to_le32(lsd->lsd_server_size);
+       buf->lsd_client_start     = cpu_to_le32(lsd->lsd_client_start);
+       buf->lsd_client_size      = cpu_to_le16(lsd->lsd_client_size);
+       buf->lsd_subdir_count     = cpu_to_le16(lsd->lsd_subdir_count);
+       buf->lsd_catalog_oid      = cpu_to_le64(lsd->lsd_catalog_oid);
+       buf->lsd_catalog_ogen     = cpu_to_le32(lsd->lsd_catalog_ogen);
+       memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
+       buf->lsd_osd_index        = cpu_to_le32(lsd->lsd_osd_index);
+       buf->lsd_padding1         = cpu_to_le32(lsd->lsd_padding1);
+       buf->lsd_start_epoch      = cpu_to_le32(lsd->lsd_start_epoch);
+       for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
+               buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
+       buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
+       buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
+}
+
+static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
+                                struct lsd_client_data *lcd)
+{
+       memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof(lcd->lcd_uuid));
+       lcd->lcd_last_transno       = le64_to_cpu(buf->lcd_last_transno);
+       lcd->lcd_last_xid          = le64_to_cpu(buf->lcd_last_xid);
+       lcd->lcd_last_result    = le32_to_cpu(buf->lcd_last_result);
+       lcd->lcd_last_data        = le32_to_cpu(buf->lcd_last_data);
+       lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
+       lcd->lcd_last_close_xid     = le64_to_cpu(buf->lcd_last_close_xid);
+       lcd->lcd_last_close_result  = le32_to_cpu(buf->lcd_last_close_result);
+       lcd->lcd_last_close_data    = le32_to_cpu(buf->lcd_last_close_data);
+       lcd->lcd_pre_versions[0]    = le64_to_cpu(buf->lcd_pre_versions[0]);
+       lcd->lcd_pre_versions[1]    = le64_to_cpu(buf->lcd_pre_versions[1]);
+       lcd->lcd_pre_versions[2]    = le64_to_cpu(buf->lcd_pre_versions[2]);
+       lcd->lcd_pre_versions[3]    = le64_to_cpu(buf->lcd_pre_versions[3]);
+       lcd->lcd_last_epoch      = le32_to_cpu(buf->lcd_last_epoch);
+       lcd->lcd_first_epoch    = le32_to_cpu(buf->lcd_first_epoch);
+}
+
+static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
+                                struct lsd_client_data *buf)
+{
+       memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof(lcd->lcd_uuid));
+       buf->lcd_last_transno       = cpu_to_le64(lcd->lcd_last_transno);
+       buf->lcd_last_xid          = cpu_to_le64(lcd->lcd_last_xid);
+       buf->lcd_last_result    = cpu_to_le32(lcd->lcd_last_result);
+       buf->lcd_last_data        = cpu_to_le32(lcd->lcd_last_data);
+       buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
+       buf->lcd_last_close_xid     = cpu_to_le64(lcd->lcd_last_close_xid);
+       buf->lcd_last_close_result  = cpu_to_le32(lcd->lcd_last_close_result);
+       buf->lcd_last_close_data    = cpu_to_le32(lcd->lcd_last_close_data);
+       buf->lcd_pre_versions[0]    = cpu_to_le64(lcd->lcd_pre_versions[0]);
+       buf->lcd_pre_versions[1]    = cpu_to_le64(lcd->lcd_pre_versions[1]);
+       buf->lcd_pre_versions[2]    = cpu_to_le64(lcd->lcd_pre_versions[2]);
+       buf->lcd_pre_versions[3]    = cpu_to_le64(lcd->lcd_pre_versions[3]);
+       buf->lcd_last_epoch      = cpu_to_le32(lcd->lcd_last_epoch);
+       buf->lcd_first_epoch    = cpu_to_le32(lcd->lcd_first_epoch);
+}
+
+static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
+{
+       return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
+               lcd->lcd_last_transno : lcd->lcd_last_close_transno);
+}
+
+static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
+{
+       return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
+               lcd->lcd_last_xid : lcd->lcd_last_close_xid);
+}
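+
+/*
+ * Usage sketch (illustrative only; the helper name is hypothetical): how the
+ * helpers above are typically combined when a client slot read from
+ * last_rcvd is decoded.  "buf" is the raw little-endian on-disk record.
+ */
+static inline __u64 lcd_read_example(char *obd_name, int index,
+                                    struct lsd_client_data *buf,
+                                    struct lsd_client_data *lcd)
+{
+       /* convert the little-endian on-disk record to CPU byte order */
+       lcd_le_to_cpu(buf, lcd);
+       /* guard against the corrupted UUIDs described in bug 20354 */
+       check_lcd(obd_name, index, lcd);
+       /* the effective last transno covers both open and close paths */
+       return lcd_last_transno(lcd);
+}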
+
+/****************** superblock additional info *********************/
+
+struct ll_sb_info;
+
+struct lustre_sb_info {
+       int                    lsi_flags;
+       struct obd_device       *lsi_mgc;     /* mgc obd */
+       struct lustre_mount_data *lsi_lmd;     /* mount command info */
+       struct ll_sb_info       *lsi_llsbi;   /* add'l client sbi info */
+       struct dt_device         *lsi_dt_dev;  /* dt device to access disk fs*/
+       struct vfsmount   *lsi_srv_mnt; /* the one server mount */
+       atomic_t              lsi_mounts;  /* references to the srv_mnt */
+       char                      lsi_svname[MTI_NAME_MAXLEN];
+       char                      lsi_osd_obdname[64];
+       char                      lsi_osd_uuid[64];
+       struct obd_export        *lsi_osd_exp;
+       char                      lsi_osd_type[16];
+       char                      lsi_fstype[16];
+       struct backing_dev_info   lsi_bdi;     /* each client mountpoint needs
+                                                 own backing_dev_info */
+};
+
+#define LSI_UMOUNT_FAILOVER          0x00200000
+#define LSI_BDI_INITIALIZED          0x00400000
+
+#define     s2lsi(sb)  ((struct lustre_sb_info *)((sb)->s_fs_info))
+#define     s2lsi_nocast(sb) ((sb)->s_fs_info)
+
+#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
+#define            get_mount_flags(sb)    (s2lsi(sb)->lsi_lmd->lmd_flags)
+#define            get_mntdev_name(sb)    (s2lsi(sb)->lsi_lmd->lmd_dev)
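+
+/*
+ * Usage sketch (illustrative only; the helper name is hypothetical, and it
+ * assumes struct lustre_mount_data is fully defined earlier in this header):
+ * the accessors above are meant to be used from super_block callbacks.
+ */
+static inline const char *lsi_profile_example(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+
+       /* lmd may be absent on some mounts, so callers must check it */
+       return lsi->lsi_lmd != NULL ? get_profile_name(sb) : NULL;
+}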
+
+
+/****************** mount lookup info *********************/
+
+struct lustre_mount_info {
+       char             *lmi_name;
+       struct super_block   *lmi_sb;
+       struct vfsmount      *lmi_mnt;
+       struct list_head            lmi_list_chain;
+};
+
+/****************** prototypes *********************/
+
+/* obd_mount.c */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr);
+int server_name2index(const char *svname, __u32 *idx, const char **endptr);
+int server_name2svname(const char *label, char *svname, const char **endptr,
+                      size_t svsize);
+
+int lustre_put_lsi(struct super_block *sb);
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+                       char *s1, char *s2, char *s3, char *s4);
+int lustre_start_mgc(struct super_block *sb);
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+                                                 struct vfsmount *mnt));
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
+int lustre_common_put_super(struct super_block *sb);
+
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
+
+/** @} disk */
+
+#endif /* _LUSTRE_DISK_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h
new file mode 100644 (file)
index 0000000..317f928
--- /dev/null
@@ -0,0 +1,1671 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** \defgroup LDLM Lustre Distributed Lock Manager
+ *
+ * Lustre DLM is based on VAX DLM.
+ * Its two main roles are:
+ *   - To provide locking that assures the consistency of data on all
+ *     Lustre nodes.
+ *   - To allow clients to cache state protected by a lock by holding the
+ *     lock until a conflicting lock is requested or it is expired by the LRU.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_DLM_H__
+#define _LUSTRE_DLM_H__
+
+#include <linux/lustre_dlm.h>
+
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_handles.h>
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
+#include <lu_ref.h>
+
+struct obd_ops;
+struct obd_device;
+
+#define OBD_LDLM_DEVICENAME  "ldlm"
+
+#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus())
+#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000))
+#define LDLM_CTIME_AGE_LIMIT (10)
+#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024
+
+/**
+ * LDLM non-error return states
+ */
+typedef enum {
+       ELDLM_OK = 0,
+
+       ELDLM_LOCK_CHANGED = 300,
+       ELDLM_LOCK_ABORTED = 301,
+       ELDLM_LOCK_REPLACED = 302,
+       ELDLM_NO_LOCK_DATA = 303,
+       ELDLM_LOCK_WOULDBLOCK = 304,
+
+       ELDLM_NAMESPACE_EXISTS = 400,
+       ELDLM_BAD_NAMESPACE    = 401
+} ldlm_error_t;
+
+/**
+ * LDLM namespace type.
+ * The "client" type is actually an indication that this is a narrow local view
+ * into the complete namespace on the server. Such namespaces cannot make any
+ * decisions about lack of conflicts or do any autonomous lock granting without
+ * first speaking to a server.
+ */
+typedef enum {
+       LDLM_NAMESPACE_SERVER = 1 << 0,
+       LDLM_NAMESPACE_CLIENT = 1 << 1
+} ldlm_side_t;
+
+/**
+ * Declaration of flags sent through the wire.
+ **/
+#define LDLM_FL_LOCK_CHANGED   0x000001 /* extent, mode, or resource changed */
+
+/**
+ * If the server returns one of these flags, then the lock was put on that list.
+ * If the client sends one of these flags (during recovery ONLY!), it wants the
+ * lock added to the specified list, no questions asked.
+ */
+#define LDLM_FL_BLOCK_GRANTED  0x000002
+#define LDLM_FL_BLOCK_CONV     0x000004
+#define LDLM_FL_BLOCK_WAIT     0x000008
+
+/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */
+
+#define LDLM_FL_AST_SENT       0x000020 /* blocking or cancel packet was
+                                        * queued for sending. */
+/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040   moved to non-wire flags */
+/* Used to be LDLM_FL_CANCEL   0x000080   moved to non-wire flags */
+
+/**
+ * Lock is being replayed.  This could probably be implied by the fact that one
+ * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous.
+ */
+#define LDLM_FL_REPLAY  0x000100
+
+#define LDLM_FL_INTENT_ONLY    0x000200 /* Don't grant lock, just do intent. */
+
+/* Used to be LDLM_FL_LOCAL_ONLY 0x000400  moved to non-wire flags */
+/* Used to be LDLM_FL_FAILED     0x000800  moved to non-wire flags */
+
+#define LDLM_FL_HAS_INTENT     0x001000 /* lock request has intent */
+
+/* Used to be LDLM_FL_CANCELING  0x002000  moved to non-wire flags */
+/* Used to be LDLM_FL_LOCAL      0x004000  moved to non-wire flags */
+
+#define LDLM_FL_DISCARD_DATA   0x010000 /* discard (no writeback) on cancel */
+
+#define LDLM_FL_NO_TIMEOUT     0x020000 /* Blocked by group lock - wait
+                                        * indefinitely */
+
+/** file & record locking */
+#define LDLM_FL_BLOCK_NOWAIT   0x040000 /* Server told not to wait if blocked.
+                                        * For AGL, OST will not send glimpse
+                                        * callback. */
+#define LDLM_FL_TEST_LOCK      0x080000 /* return blocking lock */
+
+/* Used to be LDLM_FL_LVB_READY  0x100000 moved to non-wire flags */
+/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */
+/* Used to be LDLM_FL_NO_LRU     0x400000 moved to non-wire flags */
+
+/* Immediately cancel such locks when they block some other locks. Send
+ * a cancel notification to the original lock holder, but expect no reply.
+ * This is for clients (like liblustre) that cannot be expected to reliably
+ * respond to blocking ASTs. */
+#define LDLM_FL_CANCEL_ON_BLOCK 0x800000
+
+/* Flags inherited from the parent lock when doing intents. */
+#define LDLM_INHERIT_FLAGS     (LDLM_FL_CANCEL_ON_BLOCK)
+
+/* Used to be LDLM_FL_CP_REQD  0x1000000 moved to non-wire flags */
+/* Used to be LDLM_FL_CLEANED  0x2000000 moved to non-wire flags */
+/* Used to be LDLM_FL_ATOMIC_CB      0x4000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_AST    0x10000000 moved to non-wire flags */
+/* Used to be LDLM_FL_BL_DONE  0x20000000 moved to non-wire flags */
+
+/* measure lock contention and return -EUSERS if locking contention is high */
+#define LDLM_FL_DENY_ON_CONTENTION 0x40000000
+
+/* These are flags that are mapped into the flags and ASTs of blocking locks */
+#define LDLM_AST_DISCARD_DATA  0x80000000 /* Add FL_DISCARD to blocking ASTs */
+
+/* Flags sent in AST lock_flags to be mapped into the receiving lock. */
+#define LDLM_AST_FLAGS  (LDLM_FL_DISCARD_DATA)
+
+/*
+ * --------------------------------------------------------------------------
+ * NOTE! Starting from this point, LDLM_FL_* flags with values above
+ * 0x80000000 are not sent over the wire.
+ * --------------------------------------------------------------------------
+ */
+
+/**
+ * Declaration of flags not sent through the wire.
+ **/
+
+/**
+ * Used for marking lock as a target for -EINTR while cp_ast sleep
+ * emulation + race with upcoming bl_ast.
+ */
+#define LDLM_FL_FAIL_LOC       0x100000000ULL
+
+/**
+ * Used while processing the unused list to know that we have already
+ * handled this lock and decided to skip it.
+ */
+#define LDLM_FL_SKIPPED        0x200000000ULL
+/* this lock is being destroyed */
+#define LDLM_FL_CBPENDING      0x400000000ULL
+/* not a real flag, not saved in lock */
+#define LDLM_FL_WAIT_NOREPROC  0x800000000ULL
+/* cancellation callback already run */
+#define LDLM_FL_CANCEL  0x1000000000ULL
+#define LDLM_FL_LOCAL_ONLY     0x2000000000ULL
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_FAILED  0x4000000000ULL
+/* lock cancel has already been sent */
+#define LDLM_FL_CANCELING      0x8000000000ULL
+/* local lock (ie, no srv/cli split) */
+#define LDLM_FL_LOCAL    0x10000000000ULL
+/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that
+ * the LVB filling happens _after_ the lock has been granted, so another thread
+ * can match it before the LVB has been updated.  As a dirty hack, we set
+ * LDLM_FL_LVB_READY only after we have finished filling the LVB.
+ * This is only needed on LOV/OSC now, where the LVB is actually used and
+ * callers must set it in the input flags.
+ *
+ * The proper fix is to do the granting inside of the completion AST, which can
+ * be replaced with an LVB-aware wrapping function for OSC locks.  That change
+ * is pretty high-risk, though, and would need a lot more testing. */
+#define LDLM_FL_LVB_READY      0x20000000000ULL
+/* A lock contributes to the known minimum size (KMS) calculation until it has
+ * finished the part of its cancelation that performs write back on its dirty
+ * pages.  It can remain on the granted list during this whole time.  Threads
+ * racing to update the KMS after performing their writeback need to know to
+ * exclude each other's locks from the calculation as they walk the granted
+ * list. */
+#define LDLM_FL_KMS_IGNORE     0x40000000000ULL
+/* completion AST to be executed */
+#define LDLM_FL_CP_REQD        0x80000000000ULL
+/* cleanup_resource has already handled the lock */
+#define LDLM_FL_CLEANED        0x100000000000ULL
+/* optimization hint: LDLM can run the blocking callback from the current
+ * context without involving a separate thread, in order to decrease the
+ * context-switch rate */
+#define LDLM_FL_ATOMIC_CB      0x200000000000ULL
+
+/* It may happen that a client initiates two operations, e.g. unlink and
+ * mkdir, such that the server sends a blocking AST for conflicting
+ * locks to this client for the first operation, whereas the second
+ * operation has canceled this lock and is waiting for rpc_lock which is
+ * taken by the first operation. LDLM_FL_BL_AST is set by
+ * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel
+ * (ELC) code from cancelling it.
+ *
+ * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock
+ * cache is dropped to let ldlm_callback_handler() return EINVAL to the
+ * server. It is used when ELC RPC is already prepared and is waiting
+ * for rpc_lock, too late to send a separate CANCEL RPC. */
+#define LDLM_FL_BL_AST   0x400000000000ULL
+#define LDLM_FL_BL_DONE         0x800000000000ULL
+/* Don't put the lock into the LRU list, so that it is not canceled due to
+ * aging. Used by MGC locks; they are cancelled only at unmount or by
+ * callback. */
+#define LDLM_FL_NO_LRU         0x1000000000000ULL
+
+/**
+ * The blocking callback is overloaded to perform two functions.  These flags
+ * indicate which operation should be performed.
+ */
+#define LDLM_CB_BLOCKING    1
+#define LDLM_CB_CANCELING   2
+
+/**
+ * \name Lock Compatibility Matrix.
+ *
+ * A lock has both a type (extent, flock, inode bits, or plain) and a mode.
+ * Lock types are described in their respective implementation files:
+ * ldlm_{extent,flock,inodebits,plain}.c.
+ *
+ * There are six lock modes along with a compatibility matrix to indicate if
+ * two locks are compatible.
+ *
+ * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock
+ *   on the parent.
+ * - PW: Protective Write (normal write) mode. When a client requests a write
+ *   lock from an OST, a lock with PW mode will be issued.
+ * - PR: Protective Read (normal read) mode. When a client requests a read from
+ *   an OST, a lock with PR mode will be issued. Also, if the client opens a
+ *   file for execution, it is granted a lock with PR mode.
+ * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client
+ *   requests a write lock during a file open operation.
+ * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants
+ *   an inodebit lock with the CR mode on the intermediate path component.
+ * - NL Null mode.
+ *
+ * <PRE>
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * </PRE>
+ */
+/** @{ */
+#define LCK_COMPAT_EX  LCK_NL
+#define LCK_COMPAT_PW  (LCK_COMPAT_EX | LCK_CR)
+#define LCK_COMPAT_PR  (LCK_COMPAT_PW | LCK_PR)
+#define LCK_COMPAT_CW  (LCK_COMPAT_PW | LCK_CW)
+#define LCK_COMPAT_CR  (LCK_COMPAT_CW | LCK_PR | LCK_PW)
+#define LCK_COMPAT_NL  (LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
+#define LCK_COMPAT_GROUP  (LCK_GROUP | LCK_NL)
+#define LCK_COMPAT_COS (LCK_COS)
+/** @} Lock Compatibility Matrix */
+
+extern ldlm_mode_t lck_compat_array[];
+
+static inline void lockmode_verify(ldlm_mode_t mode)
+{
+       LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
+}
+
+static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
+{
+       return (lck_compat_array[exist_mode] & new_mode);
+}
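+
+/*
+ * Illustrative sketch (the helper name is hypothetical): the matrix above is
+ * exactly what lockmode_compat() evaluates, e.g. two PR locks are compatible
+ * while PR and PW conflict.
+ */
+static inline int example_pr_conflicts(void)
+{
+       lockmode_verify(LCK_PR);
+       /* non-zero: PR is compatible with PR */
+       if (!lockmode_compat(LCK_PR, LCK_PR))
+               return -EINVAL;
+       /* zero: PR conflicts with PW */
+       return lockmode_compat(LCK_PR, LCK_PW) ? -EINVAL : 0;
+}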
+
+/*
+ *
+ * cluster name spaces
+ *
+ */
+
+#define DLM_OST_NAMESPACE 1
+#define DLM_MDS_NAMESPACE 2
+
+/* XXX
+   - do we just separate this by security domains and use a prefix for
+     multiple namespaces in the same domain?
+*/
+
+/**
+ * Locking rules for LDLM:
+ *
+ * lr_lock
+ *
+ * lr_lock
+ *     waiting_locks_spinlock
+ *
+ * lr_lock
+ *     led_lock
+ *
+ * lr_lock
+ *     ns_lock
+ *
+ * lr_lvb_mutex
+ *     lr_lock
+ *
+ */
+
+struct ldlm_pool;
+struct ldlm_lock;
+struct ldlm_resource;
+struct ldlm_namespace;
+
+/**
+ * Operations on LDLM pools.
+ * An LDLM pool is a pool of locks in a namespace without any explicitly
+ * specified limits.
+ * Locks in the pool are organized in an LRU list.
+ * Local memory pressure or server instructions (e.g. memory pressure on the
+ * server) can trigger freeing of locks from the pool.
+ */
+struct ldlm_pool_ops {
+       /** Recalculate pool \a pl usage */
+       int (*po_recalc)(struct ldlm_pool *pl);
+       /** Cancel at least \a nr locks from pool \a pl */
+       int (*po_shrink)(struct ldlm_pool *pl, int nr,
+                        unsigned int gfp_mask);
+       int (*po_setup)(struct ldlm_pool *pl, int limit);
+};
+
+/** One-second pools thread check interval. Each pool has its own period. */
+#define LDLM_POOLS_THREAD_PERIOD (1)
+
+/** ~6% margin for modest pools. See ldlm_pool.c for details. */
+#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
+
+/** Default recalc period for server side pools in sec. */
+#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
+
+/** Default recalc period for client side pools in sec. */
+#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
+
+/**
+ * LDLM pool structure to track granted locks.
+ * For purposes of determining when to release locks on e.g. memory pressure.
+ * This feature is commonly referred to as lru_resize.
+ */
+struct ldlm_pool {
+       /** Pool proc directory. */
+       proc_dir_entry_t        *pl_proc_dir;
+       /** Pool name, must be long enough to hold compound proc entry name. */
+       char                    pl_name[100];
+       /** Lock for protecting SLV/CLV updates. */
+       spinlock_t              pl_lock;
+       /** Number of allowed locks in the pool, on both client and server side. */
+       atomic_t                pl_limit;
+       /** Number of granted locks in the pool. */
+       atomic_t                pl_granted;
+       /** Grant rate per T (the recalculation period). */
+       atomic_t                pl_grant_rate;
+       /** Cancel rate per T (the recalculation period). */
+       atomic_t                pl_cancel_rate;
+       /** Server lock volume (SLV). Protected by pl_lock. */
+       __u64                   pl_server_lock_volume;
+       /** Current biggest client lock volume. Protected by pl_lock. */
+       __u64                   pl_client_lock_volume;
+       /** Lock volume factor. SLV on the client is calculated as follows:
+        *  server_slv * lock_volume_factor. */
+       atomic_t                pl_lock_volume_factor;
+       /** Time when last SLV from server was obtained. */
+       time_t                  pl_recalc_time;
+       /** Recalculation period for pool. */
+       time_t                  pl_recalc_period;
+       /** Recalculation and shrink operations. */
+       struct ldlm_pool_ops    *pl_ops;
+       /** Number of planned locks for next period. */
+       int                     pl_grant_plan;
+       /** Pool statistics. */
+       struct lprocfs_stats    *pl_stats;
+};
+
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **,
+                              void *req_cookie, ldlm_mode_t mode, __u64 flags,
+                              void *data);
+
+typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock);
+
+/**
+ * LVB operations.
+ * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could
+ * be associated with an LDLM lock and transferred from client to server and
+ * back.
+ *
+ * Currently LVBs are used by:
+ *  - OSC-OST code to maintain current object size/times
+ *  - layout lock code to return the layout when the layout lock is granted
+ */
+struct ldlm_valblock_ops {
+       int (*lvbo_init)(struct ldlm_resource *res);
+       int (*lvbo_update)(struct ldlm_resource *res,
+                          struct ptlrpc_request *r,
+                          int increase);
+       int (*lvbo_free)(struct ldlm_resource *res);
+       /* Return size of LVB data so an appropriately sized RPC buffer can
+        * be reserved */
+       int (*lvbo_size)(struct ldlm_lock *lock);
+       /* Called to fill in lvb data to RPC buffer @buf */
+       int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen);
+};
+
+/**
+ * Type of lock pool in a namespace, as used by the LDLM pools code.
+ * Greedy means cached locks are released aggressively.
+ */
+typedef enum {
+       LDLM_NAMESPACE_GREEDY = 1 << 0,
+       LDLM_NAMESPACE_MODEST = 1 << 1
+} ldlm_appetite_t;
+
+/**
+ * Default values for the "max_nolock_size", "contention_time" and
+ * "contended_locks" namespace tunables.
+ */
+#define NS_DEFAULT_MAX_NOLOCK_BYTES 0
+#define NS_DEFAULT_CONTENTION_SECONDS 2
+#define NS_DEFAULT_CONTENDED_LOCKS 32
+
+struct ldlm_ns_bucket {
+       /** back pointer to namespace */
+       struct ldlm_namespace      *nsb_namespace;
+       /**
+        * Estimated lock callback time.  Used by adaptive timeout code to
+        * avoid spurious client evictions due to unresponsiveness when in
+        * fact the network or overall system load is at fault
+        */
+       struct adaptive_timeout     nsb_at_estimate;
+};
+
+enum {
+       /** LDLM namespace lock stats */
+       LDLM_NSS_LOCKS    = 0,
+       LDLM_NSS_LAST
+};
+
+typedef enum {
+       /** invalid type */
+       LDLM_NS_TYPE_UNKNOWN    = 0,
+       /** mdc namespace */
+       LDLM_NS_TYPE_MDC,
+       /** mds namespace */
+       LDLM_NS_TYPE_MDT,
+       /** osc namespace */
+       LDLM_NS_TYPE_OSC,
+       /** ost namespace */
+       LDLM_NS_TYPE_OST,
+       /** mgc namespace */
+       LDLM_NS_TYPE_MGC,
+       /** mgs namespace */
+       LDLM_NS_TYPE_MGT,
+} ldlm_ns_type_t;
+
+/**
+ * LDLM Namespace.
+ *
+ * Namespace serves to contain locks related to a particular service.
+ * There are two kinds of namespaces:
+ * - Server namespace has knowledge of all locks and is therefore authoritative
+ *   to make decisions like what locks could be granted and what conflicts
+ *   exist during new lock enqueue.
+ * - Client namespace only has limited knowledge about locks in the namespace,
+ *   only seeing locks held by the client.
+ *
+ * Every Lustre service has one server namespace present on the server serving
+ * that service. Every client connected to the service has a client namespace
+ * for it.
+ * Every lock obtained by client in that namespace is actually represented by
+ * two in-memory locks. One on the server and one on the client. The locks are
+ * linked by a special cookie by which one node can tell the other which lock
+ * it actually means during communications. Such locks are called remote locks.
+ * Locks held only by the server, without any reference to a client, are
+ * called local locks.
+ */
+struct ldlm_namespace {
+       /** Backward link to OBD, required for LDLM pool to store new SLV. */
+       struct obd_device       *ns_obd;
+
+       /** Flag indicating if namespace is on client instead of server */
+       ldlm_side_t             ns_client;
+
+       /** Resource hash table for namespace. */
+       cfs_hash_t              *ns_rs_hash;
+
+       /** serialize */
+       spinlock_t              ns_lock;
+
+       /** big refcount (by bucket) */
+       atomic_t                ns_bref;
+
+       /**
+        * Namespace connect flags supported by server (may be changed via
+        * /proc, LRU resize may be disabled/enabled).
+        */
+       __u64                   ns_connect_flags;
+
+       /** Client side original connect flags supported by server. */
+       __u64                   ns_orig_connect_flags;
+
+       /* namespace proc dir entry */
+       struct proc_dir_entry   *ns_proc_dir_entry;
+
+       /**
+        * Position in global namespace list linking all namespaces on
+        * the node.
+        */
+       struct list_head                ns_list_chain;
+
+       /**
+        * List of unused locks for this namespace. This list is also called
+        * LRU lock list.
+        * Unused locks are locks with zero reader/writer reference counts.
+        * This list is only used on clients for lock caching purposes.
+        * When we want to release some locks voluntarily or if server wants
+        * us to release some locks due to e.g. memory pressure, we take locks
+        * to release from the head of this list.
+        * Locks are linked via l_lru field in \see struct ldlm_lock.
+        */
+       struct list_head                ns_unused_list;
+       /** Number of locks in the LRU list above */
+       int                     ns_nr_unused;
+
+       /**
+        * Maximum number of locks permitted in the LRU. If 0, locks are
+        * managed by pools and there is no preset limit; instead it is all
+        * controlled by the available memory on this client and on the server.
+        */
+       unsigned int            ns_max_unused;
+       /** Maximum allowed age (last used time) for locks in the LRU */
+       unsigned int            ns_max_age;
+       /**
+        * Server only: number of times we evicted clients due to lack of reply
+        * to ASTs.
+        */
+       unsigned int            ns_timeouts;
+       /**
+        * Number of seconds since the file change time after which the
+        * MDT will return an UPDATE lock along with a LOOKUP lock.
+        * This allows the client to start caching negative dentries
+        * for a directory and may save an RPC for a later stat.
+        */
+       unsigned int            ns_ctime_age_limit;
+
+       /**
+        * Used to rate-limit ldlm_namespace_dump calls.
+        * \see ldlm_namespace_dump. Increased by 10 seconds every time
+        * it is called.
+        */
+       cfs_time_t              ns_next_dump;
+
+       /** "policy" function that does actual lock conflict determination */
+       ldlm_res_policy         ns_policy;
+
+       /**
+        * LVB operations for this namespace.
+        * \see struct ldlm_valblock_ops
+        */
+       struct ldlm_valblock_ops *ns_lvbo;
+
+       /**
+        * Used by filter code to store pointer to OBD of the service.
+        * Should be dropped in favor of \a ns_obd
+        */
+       void                    *ns_lvbp;
+
+       /**
+        * Wait queue used by __ldlm_namespace_free. Gets woken up every time
+        * a resource is removed.
+        */
+       wait_queue_head_t               ns_waitq;
+       /** LDLM pool structure for this namespace */
+       struct ldlm_pool        ns_pool;
+       /** Definition of how eagerly unused locks will be released from LRU */
+       ldlm_appetite_t         ns_appetite;
+
+       /**
+        * If more than \a ns_contended_locks are found, the resource is
+        * considered to be contended. Lock enqueues might specify that no
+        * contended locks should be granted
+        */
+       unsigned                ns_contended_locks;
+
+       /**
+        * The resources in this namespace remember contended state during
+        * \a ns_contention_time, in seconds.
+        */
+       unsigned                ns_contention_time;
+
+       /**
+        * Limit size of contended extent locks, in bytes.
+        * If an extent lock is requested for more than this many bytes and
+        * the caller instructs us not to grant contended locks, we disregard
+        * such a request.
+        */
+       unsigned                ns_max_nolock_size;
+
+       /** Limit of parallel AST RPC count. */
+       unsigned                ns_max_parallel_ast;
+
+       /** Callback to cancel locks before replaying it during recovery. */
+       ldlm_cancel_for_recovery ns_cancel_for_recovery;
+
+       /** LDLM lock stats */
+       struct lprocfs_stats    *ns_stats;
+
+       /**
+        * Flag to indicate namespace is being freed. Used to determine if
+        * recalculation of LDLM pool statistics should be skipped.
+        */
+       unsigned                ns_stopping:1;
+};
+
+/**
+ * Returns 1 if namespace \a ns is a client namespace.
+ */
+static inline int ns_is_client(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+                                   LDLM_NAMESPACE_SERVER)));
+       LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+               ns->ns_client == LDLM_NAMESPACE_SERVER);
+       return ns->ns_client == LDLM_NAMESPACE_CLIENT;
+}
+
+/**
+ * Returns 1 if namespace \a ns is a server namespace.
+ */
+static inline int ns_is_server(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT |
+                                   LDLM_NAMESPACE_SERVER)));
+       LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT ||
+               ns->ns_client == LDLM_NAMESPACE_SERVER);
+       return ns->ns_client == LDLM_NAMESPACE_SERVER;
+}
+
+/**
+ * Returns 1 if namespace \a ns supports early lock cancel (ELC).
+ */
+static inline int ns_connect_cancelset(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET);
+}
+
+/**
+ * Returns 1 if this namespace supports lru_resize.
+ */
+static inline int ns_connect_lru_resize(struct ldlm_namespace *ns)
+{
+       LASSERT(ns != NULL);
+       return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline void ns_register_cancel(struct ldlm_namespace *ns,
+                                     ldlm_cancel_for_recovery arg)
+{
+       LASSERT(ns != NULL);
+       ns->ns_cancel_for_recovery = arg;
+}
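+
+/*
+ * Illustrative sketch (both helper names are hypothetical): registering a
+ * cancel-for-recovery callback on a namespace, gated on the predicates
+ * above.  The gating condition is an example, not a required policy.
+ */
+static inline int example_cancel_for_recovery(struct ldlm_lock *lock)
+{
+       /* trivial policy for illustration: replay everything, cancel nothing */
+       return 0;
+}
+
+static inline void example_ns_setup(struct ldlm_namespace *ns)
+{
+       if (ns_is_client(ns) && ns_connect_lru_resize(ns))
+               ns_register_cancel(ns, example_cancel_for_recovery);
+}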
+
+struct ldlm_lock;
+
+/** Type for blocking callback function of a lock. */
+typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
+                                     struct ldlm_lock_desc *new, void *data,
+                                     int flag);
+/** Type for completion callback function of a lock. */
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
+                                       void *data);
+/** Type for glimpse callback function of a lock. */
+typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
+/** Type for weight callback function of a lock. */
+typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock);
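+
+/*
+ * Illustrative sketch (the function name is hypothetical): the shape of a
+ * blocking callback matching ldlm_blocking_callback.  The flag argument
+ * distinguishes the two roles of LDLM_CB_BLOCKING and LDLM_CB_CANCELING
+ * declared above.
+ */
+static inline int example_blocking_ast(struct ldlm_lock *lock,
+                                      struct ldlm_lock_desc *desc,
+                                      void *data, int flag)
+{
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               /* a conflicting lock was queued: arrange to cancel ours */
+               break;
+       case LDLM_CB_CANCELING:
+               /* our lock is being cancelled: drop any cached state */
+               break;
+       }
+       return 0;
+}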
+
+/** Work list for sending GL ASTs to multiple locks. */
+struct ldlm_glimpse_work {
+       struct ldlm_lock        *gl_lock; /* lock to glimpse */
+       struct list_head                 gl_list; /* linkage to other gl work structs */
+       __u32                    gl_flags; /* see LDLM_GL_WORK_* below */
+       union ldlm_gl_desc      *gl_desc; /* glimpse descriptor to be packed in
+                                          * glimpse callback request */
+};
+
+/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
+#define LDLM_GL_WORK_NOFREE 0x1
+
+/** Interval node data for each LDLM_EXTENT lock. */
+struct ldlm_interval {
+       struct interval_node    li_node;  /* node for tree management */
+       struct list_head                li_group; /* the locks which have the same
+                                          * policy - group of the policy */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/**
+ * Interval tree for extent locks.
+ * The interval tree must be accessed under the resource lock.
+ * Interval trees are used for granted extent locks to speed up conflict
+ * lookup. See ldlm/interval_tree.c for more details.
+ */
+struct ldlm_interval_tree {
+       /** Tree size. */
+       int                     lit_size;
+       ldlm_mode_t             lit_mode;  /* lock mode */
+       struct interval_node    *lit_root; /* actual ldlm_interval */
+};
+
+/** Whether to track references to exports by LDLM locks. */
+#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
+
+/** Cancel flags. */
+typedef enum {
+       LCF_ASYNC      = 0x1, /* Cancel locks asynchronously. */
+       LCF_LOCAL      = 0x2, /* Cancel locks locally, not notifying server */
+       LCF_BL_AST     = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
+                              * in the same RPC */
+} ldlm_cancel_flags_t;
+
+struct ldlm_flock {
+       __u64 start;
+       __u64 end;
+       __u64 owner;
+       __u64 blocking_owner;
+       struct obd_export *blocking_export;
+       /* Protected by the hash lock */
+       __u32 blocking_refs;
+       __u32 pid;
+};
+
+typedef union {
+       struct ldlm_extent l_extent;
+       struct ldlm_flock l_flock;
+       struct ldlm_inodebits l_inodebits;
+} ldlm_policy_data_t;
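+
+/*
+ * Illustrative sketch (the helper name is hypothetical, and it assumes the
+ * conventional start/end/gid fields of struct ldlm_extent): filling the
+ * per-type policy data for an extent lock covering [start, end].
+ */
+static inline void example_extent_policy(ldlm_policy_data_t *policy,
+                                        __u64 start, __u64 end)
+{
+       policy->l_extent.start = start;
+       policy->l_extent.end   = end;
+       policy->l_extent.gid   = 0;     /* no group lock */
+}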
+
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+                                const ldlm_policy_data_t *lpolicy,
+                                ldlm_wire_policy_data_t *wpolicy);
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+                                 const ldlm_wire_policy_data_t *wpolicy,
+                                 ldlm_policy_data_t *lpolicy);
+
+enum lvb_type {
+       LVB_T_NONE      = 0,
+       LVB_T_OST       = 1,
+       LVB_T_LQUOTA    = 2,
+       LVB_T_LAYOUT    = 3,
+};
+
+/**
+ * LDLM lock structure
+ *
+ * Represents a single LDLM lock and its state in memory. Each lock is
+ * associated with a single ldlm_resource, the object which is being
+ * locked. There may be multiple ldlm_locks on a single resource,
+ * depending on the lock type and whether the locks are conflicting or
+ * not.
+ */
+struct ldlm_lock {
+       /**
+        * Local lock handle.
+        * When the remote side wants to tell us about a lock, it addresses
+        * it by this opaque handle.  The handle does not hold a
+        * reference on the ldlm_lock, so it can be safely passed to
+        * other threads or nodes. When the lock needs to be accessed
+        * from the handle, it is looked up again in the lock table, and
+        * may no longer exist.
+        *
+        * Must be first in the structure.
+        */
+       struct portals_handle   l_handle;
+       /**
+        * Lock reference count.
+        * This is how many users have pointers to actual structure, so that
+        * we do not accidentally free lock structure that is in use.
+        */
+       atomic_t                l_refc;
+       /**
+        * Internal spinlock protects l_resource.  We should hold this lock
+        * first before taking res_lock.
+        */
+       spinlock_t              l_lock;
+       /**
+        * Pointer to actual resource this lock is in.
+        * ldlm_lock_change_resource() can change this.
+        */
+       struct ldlm_resource    *l_resource;
+       /**
+        * List item for client side LRU list.
+        * Protected by ns_lock in struct ldlm_namespace.
+        */
+       struct list_head                l_lru;
+       /**
+        * Linkage to resource's lock queues according to current lock state.
+        * (could be granted, waiting or converting)
+        * Protected by lr_lock in struct ldlm_resource.
+        */
+       struct list_head                l_res_link;
+       /**
+        * Tree node for ldlm_extent.
+        */
+       struct ldlm_interval    *l_tree_node;
+       /**
+        * Per export hash of locks.
+        * Protected by per-bucket exp->exp_lock_hash locks.
+        */
+       struct hlist_node       l_exp_hash;
+       /**
+        * Per export hash of flock locks.
+        * Protected by per-bucket exp->exp_flock_hash locks.
+        */
+       struct hlist_node       l_exp_flock_hash;
+       /**
+        * Requested mode.
+        * Protected by lr_lock.
+        */
+       ldlm_mode_t             l_req_mode;
+       /**
+        * Granted mode, also protected by lr_lock.
+        */
+       ldlm_mode_t             l_granted_mode;
+       /** Lock completion handler pointer. Called when lock is granted. */
+       ldlm_completion_callback l_completion_ast;
+       /**
+        * Lock blocking AST handler pointer.
+        * It plays two roles:
+        * - as a notification of an attempt to queue a conflicting lock (once)
+        * - as a notification when the lock is being cancelled.
+        *
+        * As such it's typically called twice: once for the initial conflict
+        * and then once more when the last user went away and the lock is
+        * cancelled (could happen recursively).
+        */
+       ldlm_blocking_callback  l_blocking_ast;
+       /**
+        * Lock glimpse handler.
+        * The glimpse handler is used by the server to obtain LVB updates
+        * from a client.
+        */
+       ldlm_glimpse_callback   l_glimpse_ast;
+
+       /** XXX apparently unused "weight" handler. To be removed? */
+       ldlm_weigh_callback     l_weigh_ast;
+
+       /**
+        * Lock export.
+        * This is a pointer to actual client export for locks that were granted
+        * to clients. Used server-side.
+        */
+       struct obd_export       *l_export;
+       /**
+        * Lock connection export.
+        * Pointer to server export on a client.
+        */
+       struct obd_export       *l_conn_export;
+
+       /**
+        * Remote lock handle.
+        * If the lock is remote, this is the handle of the other side lock
+        * (l_handle)
+        */
+       struct lustre_handle    l_remote_handle;
+
+       /**
+        * Representation of private data specific for a lock type.
+        * Examples are: extent range for extent lock or bitmask for ibits locks
+        */
+       ldlm_policy_data_t      l_policy_data;
+
+       /**
+        * Lock state flags.
+        * E.g. whether we have received any blocking requests for this lock.
+        * Protected by lr_lock.
+        */
+       __u64                   l_flags;
+       /**
+        * Lock r/w usage counters.
+        * Protected by lr_lock.
+        */
+       __u32                   l_readers;
+       __u32                   l_writers;
+       /**
+        * If the lock is granted, a process sleeps on this waitq to learn when
+        * it's no longer in use.  If the lock is not granted, a process sleeps
+        * on this waitq to learn when it becomes granted.
+        */
+       wait_queue_head_t               l_waitq;
+
+       /**
+        * Seconds. It will be updated if there is any activity related to
+        * the lock, e.g. enqueuing the lock or sending a blocking AST.
+        */
+       cfs_time_t              l_last_activity;
+
+       /**
+        * Time when the lock was last used, e.g. matched by lock match.
+        * In jiffies; should be converted to time if needed.
+        */
+       cfs_time_t              l_last_used;
+
+       /** Originally requested extent for the extent lock. */
+       struct ldlm_extent      l_req_extent;
+
+       unsigned int            l_failed:1,
+       /**
+        * Set for locks that were removed from class hash table and will be
+        * destroyed when last reference to them is released. Set by
+        * ldlm_lock_destroy_internal().
+        *
+        * Protected by lock and resource locks.
+        */
+                               l_destroyed:1,
+       /*
+        * it's set in lock_res_and_lock() and unset in unlock_res_and_lock().
+        *
+        * NB: compared with check_res_locked(), checking this bit is cheaper.
+        * Also, spin_is_locked() is deprecated for kernel code; one reason is
+        * that it works only on SMP, so users need to add extra macros like
+        * LASSERT_SPIN_LOCKED for uniprocessor kernels.
+        */
+                               l_res_locked:1,
+       /*
+        * It's set once we call ldlm_add_waiting_lock_res_locked()
+        * to start the lock-timeout timer and it will never be reset.
+        *
+        * Protected by lock_res_and_lock().
+        */
+                               l_waited:1,
+       /** Flag whether this is a server namespace lock. */
+                               l_ns_srv:1;
+
+       /*
+        * Client-side-only members.
+        */
+
+       enum lvb_type         l_lvb_type;
+
+       /**
+        * Temporary storage for a LVB received during an enqueue operation.
+        */
+       __u32                   l_lvb_len;
+       void                    *l_lvb_data;
+
+       /** Private storage for lock user. Opaque to LDLM. */
+       void                    *l_ast_data;
+
+       /*
+        * Server-side-only members.
+        */
+
+       /**
+        * Connection cookie for the client originating the operation.
+        * Used by Commit on Share (COS) code. Currently only used for
+        * inodebits locks on MDS.
+        */
+       __u64                   l_client_cookie;
+
+       /**
+        * List item for locks waiting for cancellation from clients.
+        * The lists this could be linked into are:
+        * waiting_locks_list (protected by waiting_locks_spinlock),
+        * then if the lock timed out, it is moved to
+        * expired_lock_thread.elt_expired_locks for further processing.
+        * Protected by elt_lock.
+        */
+       struct list_head                l_pending_chain;
+
+       /**
+        * Set when lock is sent a blocking AST. Time in seconds when timeout
+        * is reached and client holding this lock could be evicted.
+        * This timeout could be further extended by e.g. certain IO activity
+        * under this lock.
+        * \see ost_rw_prolong_locks
+        */
+       cfs_time_t              l_callback_timeout;
+
+       /** Local PID of process which created this lock. */
+       __u32                   l_pid;
+
+       /**
+        * Number of times blocking AST was sent for this lock.
+        * This is for debugging. Valid values are 0 and 1; if there is an
+        * attempt to send a blocking AST more than once, an assertion is
+        * hit. \see ldlm_work_bl_ast_lock
+        */
+       int                     l_bl_ast_run;
+       /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */
+       struct list_head                l_bl_ast;
+       /** List item ldlm_add_ast_work_item() for case of completion ASTs. */
+       struct list_head                l_cp_ast;
+       /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */
+       struct list_head                l_rk_ast;
+
+       /**
+        * Pointer to a conflicting lock that caused blocking AST to be sent
+        * for this lock
+        */
+       struct ldlm_lock        *l_blocking_lock;
+
+       /**
+        * Protected by lr_lock, linkages to "skip lists".
+        * For more explanations of skip lists see ldlm/ldlm_inodebits.c
+        */
+       struct list_head                l_sl_mode;
+       struct list_head                l_sl_policy;
+
+       /** Reference tracking structure to debug leaked locks. */
+       struct lu_ref           l_reference;
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       /* Debugging stuff for bug 20498, for tracking export references. */
+       /** number of export references taken */
+       int                     l_exp_refs_nr;
+       /** link all locks referencing one export */
+       struct list_head                l_exp_refs_link;
+       /** referenced export object */
+       struct obd_export       *l_exp_refs_target;
+#endif
+       /**
+        * export blocking dlm lock list, protected by
+        * l_export->exp_bl_list_lock.
+        * Lock order of waiting_locks_spinlock, exp_bl_list_lock and res lock
+        * is: res lock -> exp_bl_list_lock -> waiting_locks_spinlock.
+        */
+       struct list_head                l_exp_list;
+};
+
+/**
+ * LDLM resource description.
+ * Basically, resource is a representation for a single object.
+ * An object has a name, which is currently four 64-bit integers. The LDLM
+ * user is responsible for creating a mapping between the objects it wants
+ * protected and resource names.
+ *
+ * A resource can only hold locks of a single lock type, though there may be
+ * multiple ldlm_locks on a single resource, depending on the lock type and
+ * whether the locks are conflicting or not.
+ */
+struct ldlm_resource {
+       struct ldlm_ns_bucket   *lr_ns_bucket;
+
+       /**
+        * List item for list in namespace hash.
+        * protected by ns_lock
+        */
+       struct hlist_node       lr_hash;
+
+       /** Spinlock to protect locks under this resource. */
+       spinlock_t              lr_lock;
+
+       /**
+        * protected by lr_lock
+        * @{ */
+       /** List of locks in granted state */
+       struct list_head                lr_granted;
+       /** List of locks waiting to change their granted mode (converted) */
+       struct list_head                lr_converting;
+       /**
+        * List of locks that could not be granted due to conflicts and
+        * that are waiting for conflicts to go away */
+       struct list_head                lr_waiting;
+       /** @} */
+
+       /* XXX No longer needed? Remove ASAP */
+       ldlm_mode_t             lr_most_restr;
+
+       /** Type of locks this resource can hold. Only one type per resource. */
+       ldlm_type_t             lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */
+
+       /** Resource name */
+       struct ldlm_res_id      lr_name;
+       /** Reference count for this resource */
+       atomic_t                lr_refcount;
+
+       /**
+        * Interval trees (only for extent locks) for all modes of this resource
+        */
+       struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];
+
+       /**
+        * Server-side-only lock value block elements.
+        * To serialize lvbo_init.
+        */
+       struct mutex            lr_lvb_mutex;
+       int                     lr_lvb_len;
+       /** protected by lr_lock */
+       void                    *lr_lvb_data;
+
+       /** When the resource was considered as contended. */
+       cfs_time_t              lr_contention_time;
+       /** List of references to this resource. For debugging. */
+       struct lu_ref           lr_reference;
+
+       struct inode            *lr_lvb_inode;
+};
+
+static inline bool ldlm_has_layout(struct ldlm_lock *lock)
+{
+       return lock->l_resource->lr_type == LDLM_IBITS &&
+               lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT;
+}
+
+static inline char *
+ldlm_ns_name(struct ldlm_namespace *ns)
+{
+       return ns->ns_rs_hash->hs_name;
+}
+
+static inline struct ldlm_namespace *
+ldlm_res_to_ns(struct ldlm_resource *res)
+{
+       return res->lr_ns_bucket->nsb_namespace;
+}
+
+static inline struct ldlm_namespace *
+ldlm_lock_to_ns(struct ldlm_lock *lock)
+{
+       return ldlm_res_to_ns(lock->l_resource);
+}
+
+static inline char *
+ldlm_lock_to_ns_name(struct ldlm_lock *lock)
+{
+       return ldlm_ns_name(ldlm_lock_to_ns(lock));
+}
+
+static inline struct adaptive_timeout *
+ldlm_lock_to_ns_at(struct ldlm_lock *lock)
+{
+       return &lock->l_resource->lr_ns_bucket->nsb_at_estimate;
+}
+
+static inline int ldlm_lvbo_init(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+       if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL)
+               return ns->ns_lvbo->lvbo_init(res);
+
+       return 0;
+}
+
+static inline int ldlm_lvbo_size(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL)
+               return ns->ns_lvbo->lvbo_size(lock);
+
+       return 0;
+}
+
+static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       if (ns->ns_lvbo != NULL) {
+               LASSERT(ns->ns_lvbo->lvbo_fill != NULL);
+               return ns->ns_lvbo->lvbo_fill(lock, buf, len);
+       }
+       return 0;
+}
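+
+/*
+ * Usage sketch (illustrative only; the helper name is hypothetical, and the
+ * kmalloc-based buffer handling is purely for illustration): ldlm_lvbo_size()
+ * and ldlm_lvbo_fill() are meant to be used as a pair, first to reserve a
+ * buffer and then to pack the LVB into it.
+ */
+static inline int example_pack_lvb(struct ldlm_lock *lock)
+{
+       int len = ldlm_lvbo_size(lock);
+       void *buf;
+
+       if (len == 0)
+               return 0;               /* no LVB for this namespace */
+       buf = kmalloc(len, GFP_NOFS);
+       if (buf == NULL)
+               return -ENOMEM;
+       len = ldlm_lvbo_fill(lock, buf, len);
+       /* ... send buf in the reply ... */
+       kfree(buf);
+       return len < 0 ? len : 0;
+}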
+
+struct ldlm_ast_work {
+       struct ldlm_lock      *w_lock;
+       int                 w_blocking;
+       struct ldlm_lock_desc  w_desc;
+       struct list_head             w_list;
+       int                 w_flags;
+       void              *w_data;
+       int                 w_datalen;
+};
+
+/**
+ * Common ldlm_enqueue parameters
+ */
+struct ldlm_enqueue_info {
+       __u32 ei_type;   /** Type of the lock being enqueued. */
+       __u32 ei_mode;   /** Mode of the lock being enqueued. */
+       void *ei_cb_bl;  /** blocking lock callback */
+       void *ei_cb_cp;  /** lock completion callback */
+       void *ei_cb_gl;  /** lock glimpse callback */
+       void *ei_cb_wg;  /** lock weigh callback */
+       void *ei_cbdata; /** Data to be passed into callbacks. */
+};
+
+extern struct obd_ops ldlm_obd_ops;
+
+extern char *ldlm_lockname[];
+extern char *ldlm_typename[];
+extern char *ldlm_it2str(int it);
+
+/**
+ * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG.
+ * For the cases where we do not have actual lock to print along
+ * with a debugging message that is ldlm-related
+ */
+#define LDLM_DEBUG_NOLOCK(format, a...)                        \
+       CDEBUG(D_DLMTRACE, "### " format "\n" , ##a)
+
+/**
+ * Support function for lock information printing into debug logs.
+ * \see LDLM_DEBUG
+ */
+#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do {      \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                      \
+                                                                       \
+       if (((mask) & D_CANTMASK) != 0 ||                              \
+           ((libcfs_debug & (mask)) != 0 &&                        \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))    \
+               _ldlm_lock_debug(lock, msgdata, fmt, ##a);            \
+} while (0)
+
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+                     struct libcfs_debug_msg_data *data,
+                     const char *fmt, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Rate-limited version of lock printing function.
+ */
+#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do {                    \
+       static cfs_debug_limit_state_t _ldlm_cdls;                         \
+       LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls);       \
+       ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\
+} while (0)
+
+#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a)
+#define LDLM_WARN(lock, fmt, a...)  LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a)
+
+/** Non-rate-limited lock printing function for debugging purposes. */
+#define LDLM_DEBUG(lock, fmt, a...)   do {                               \
+       if (likely(lock != NULL)) {                                         \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL);      \
+               ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock,           \
+                               "### " fmt , ##a);                          \
+       } else {                                                            \
+               LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a);                \
+       }                                                                   \
+} while (0)
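+
+/*
+ * Usage sketch (illustrative only; the helper name is hypothetical): typical
+ * use of the lock printing macros above.
+ */
+static inline void example_lock_trace(struct ldlm_lock *lock, int rc)
+{
+       if (rc != 0)
+               LDLM_ERROR(lock, "enqueue failed: rc = %d", rc);
+       else
+               LDLM_DEBUG(lock, "enqueue succeeded");
+}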
+
+typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags,
+                                     int first_enq, ldlm_error_t *err,
+                                     struct list_head *work_list);
+
+/**
+ * Return values for lock iterators.
+ * Also used during deciding of lock grants and cancellations.
+ */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP     2 /* stop iterating */
+
+typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
+
+/** \defgroup ldlm_iterator Lock iterators
+ *
+ * LDLM provides for a way to iterate through every lock on a resource or
+ * namespace or every resource in a namespace.
+ * @{ */
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+                         void *closure);
+void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
+                           void *closure);
+int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *,
+                         ldlm_iterator_t iter, void *data);
+/** @} ldlm_iterator */
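+
+/*
+ * Illustrative sketch (the callback name is hypothetical): an iterator
+ * callback that counts granted locks on a resource; stopping early is
+ * possible by returning LDLM_ITER_STOP instead.
+ * Usage: ldlm_resource_foreach(res, example_count_granted, &count);
+ */
+static inline int example_count_granted(struct ldlm_lock *lock, void *closure)
+{
+       int *count = closure;
+
+       if (lock->l_granted_mode == lock->l_req_mode)
+               (*count)++;
+       return LDLM_ITER_CONTINUE;
+}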
+
+int ldlm_replay_locks(struct obd_import *imp);
+
+/* ldlm_flock.c */
+int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+
+/* ldlm_extent.c */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
+
+struct ldlm_callback_suite {
+       ldlm_completion_callback lcs_completion;
+       ldlm_blocking_callback   lcs_blocking;
+       ldlm_glimpse_callback    lcs_glimpse;
+       ldlm_weigh_callback      lcs_weigh;
+};
+
+/* ldlm_lockd.c */
+int ldlm_del_waiting_lock(struct ldlm_lock *lock);
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout);
+int ldlm_get_ref(void);
+void ldlm_put_ref(void);
+int ldlm_init_export(struct obd_export *exp);
+void ldlm_destroy_export(struct obd_export *exp);
+struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req);
+
+/* ldlm_lock.c */
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg);
+void ldlm_lock2handle(const struct ldlm_lock *lock,
+                     struct lustre_handle *lockh);
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags);
+void ldlm_cancel_callback(struct ldlm_lock *);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *);
+int ldlm_lock_set_data(struct lustre_handle *, void *);
+
+/**
+ * Obtain a lock reference by its handle.
+ */
+static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
+{
+       return __ldlm_handle2lock(h, 0);
+}
+
+#define LDLM_LOCK_REF_DEL(lock) \
+       lu_ref_del(&lock->l_reference, "handle", current)
+
+static inline struct ldlm_lock *
+ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags)
+{
+       struct ldlm_lock *lock;
+
+       lock = __ldlm_handle2lock(h, flags);
+       if (lock != NULL)
+               LDLM_LOCK_REF_DEL(lock);
+       return lock;
+}
+
+/**
+ * Update Lock Value Block Operations (LVBO) on a resource taking into account
+ * data from request \a r
+ */
+static inline int ldlm_res_lvbo_update(struct ldlm_resource *res,
+                                      struct ptlrpc_request *r, int increase)
+{
+       if (ldlm_res_to_ns(res)->ns_lvbo &&
+           ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) {
+               return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r,
+                                                                increase);
+       }
+       return 0;
+}
+
+int ldlm_error2errno(ldlm_error_t error);
+ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this
+                                           * confuses user-space. */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp);
+#endif
+
+/**
+ * Release a temporary lock reference obtained by ldlm_handle2lock() or
+ * __ldlm_handle2lock().
+ */
+#define LDLM_LOCK_PUT(lock)                 \
+do {                                       \
+       LDLM_LOCK_REF_DEL(lock);                \
+       /*LDLM_DEBUG((lock), "put");*/    \
+       ldlm_lock_put(lock);                \
+} while (0)
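+
+/*
+ * For illustration, the usual pairing with ldlm_handle2lock() looks like:
+ *
+ *     struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+ *     if (lock != NULL) {
+ *             ... examine the lock ...
+ *             LDLM_LOCK_PUT(lock);
+ *     }
+ *
+ * where `lockh` is an assumed caller-held lustre_handle.
+ */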
+
+/**
+ * Release a lock reference obtained by some other means (see
+ * LDLM_LOCK_PUT()).
+ */
+#define LDLM_LOCK_RELEASE(lock)                 \
+do {                                       \
+       /*LDLM_DEBUG((lock), "put");*/    \
+       ldlm_lock_put(lock);                \
+} while (0)
+
+#define LDLM_LOCK_GET(lock)                 \
+({                                           \
+       ldlm_lock_get(lock);                \
+       /*LDLM_DEBUG((lock), "get");*/    \
+       lock;                              \
+})
+
+#define ldlm_lock_list_put(head, member, count)                     \
+({                                                               \
+       struct ldlm_lock *_lock, *_next;                            \
+       int c = count;                                        \
+       list_for_each_entry_safe(_lock, _next, head, member) {  \
+               if (c-- == 0)                                  \
+                       break;                                \
+               list_del_init(&_lock->member);            \
+               LDLM_LOCK_RELEASE(_lock);                          \
+       }                                                          \
+       LASSERT(c <= 0);                                            \
+})
+
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+void ldlm_lock_put(struct ldlm_lock *lock);
+void ldlm_lock_destroy(struct ldlm_lock *lock);
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc);
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode);
+int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock);
+void ldlm_lock_fail_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match(struct ldlm_lock *lock);
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock);
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+                           const struct ldlm_res_id *, ldlm_type_t type,
+                           ldlm_policy_data_t *, ldlm_mode_t mode,
+                           struct lustre_handle *, int unref);
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+                                       __u64 *bits);
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+                                       __u32 *flags);
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
+void ldlm_lock_cancel(struct ldlm_lock *lock);
+void ldlm_reprocess_all(struct ldlm_resource *res);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req);
+
+/* resource.c */
+struct ldlm_namespace *
+ldlm_namespace_new(struct obd_device *obd, char *name,
+                  ldlm_side_t client, ldlm_appetite_t apt,
+                  ldlm_ns_type_t ns_type);
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags);
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+                        struct obd_import *imp, int force);
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client);
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client);
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client);
+void ldlm_namespace_get(struct ldlm_namespace *ns);
+void ldlm_namespace_put(struct ldlm_namespace *ns);
+int ldlm_proc_setup(void);
+#ifdef LPROCFS
+void ldlm_proc_cleanup(void);
+#else
+static inline void ldlm_proc_cleanup(void) {}
+#endif
+
+/* resource.c - internal */
+struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
+                                       struct ldlm_resource *parent,
+                                       const struct ldlm_res_id *,
+                                       ldlm_type_t type, int create);
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
+int ldlm_resource_putref(struct ldlm_resource *res);
+void ldlm_resource_add_lock(struct ldlm_resource *res,
+                           struct list_head *head,
+                           struct ldlm_lock *lock);
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock);
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level);
+void ldlm_namespace_dump(int level, struct ldlm_namespace *);
+void ldlm_resource_dump(int level, struct ldlm_resource *);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+                             const struct ldlm_res_id *);
+
+#define LDLM_RESOURCE_ADDREF(res) do {                           \
+       lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current);  \
+} while (0)
+
+#define LDLM_RESOURCE_DELREF(res) do {                           \
+       lu_ref_del(&(res)->lr_reference, __FUNCTION__, current);  \
+} while (0)
+
+/* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
+/** \defgroup ldlm_local_ast Default AST handlers for local locks
+ * These AST handlers are typically used for server-side local locks and are
+ * also used by client-side lock handlers to perform a minimal level of base
+ * processing.
+ * @{ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock);
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                     void *data, int flag);
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp);
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data);
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data);
+/** @} ldlm_local_ast */
+
+/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users.
+ * These are typically used by client and server (*_local versions)
+ * to obtain and release locks.
+ * @{ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+                    struct ldlm_enqueue_info *einfo,
+                    const struct ldlm_res_id *res_id,
+                    ldlm_policy_data_t const *policy, __u64 *flags,
+                    void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+                    struct lustre_handle *lockh, int async);
+int ldlm_prep_enqueue_req(struct obd_export *exp,
+                         struct ptlrpc_request *req,
+                         struct list_head *cancels,
+                         int count);
+int ldlm_prep_elc_req(struct obd_export *exp,
+                     struct ptlrpc_request *req,
+                     int version, int opc, int canceloff,
+                     struct list_head *cancels, int count);
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len);
+int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req,
+                        const struct ldlm_request *dlm_req,
+                        const struct ldlm_callback_suite *cbs);
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+                         ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+                         __u64 *flags, void *lvb, __u32 lvb_len,
+                         struct lustre_handle *lockh, int rc);
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_type_t type, ldlm_policy_data_t *policy,
+                          ldlm_mode_t mode, __u64 *flags,
+                          ldlm_blocking_callback blocking,
+                          ldlm_completion_callback completion,
+                          ldlm_glimpse_callback glimpse,
+                          void *data, __u32 lvb_len, enum lvb_type lvb_type,
+                          const __u64 *client_cookie,
+                          struct lustre_handle *lockh);
+int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
+                   void *data, __u32 data_len);
+int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags);
+int ldlm_cli_update_pool(struct ptlrpc_request *req);
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+                   ldlm_cancel_flags_t cancel_flags);
+int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *,
+                          ldlm_cancel_flags_t flags, void *opaque);
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+                                   const struct ldlm_res_id *res_id,
+                                   ldlm_policy_data_t *policy,
+                                   ldlm_mode_t mode,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque);
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head,
+                       int count, ldlm_cancel_flags_t flags);
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+                              struct list_head *cancels,
+                              ldlm_policy_data_t *policy,
+                              ldlm_mode_t mode, int lock_flags,
+                              ldlm_cancel_flags_t cancel_flags, void *opaque);
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+                              ldlm_cancel_flags_t flags);
+int ldlm_cli_cancel_list(struct list_head *head, int count,
+                        struct ptlrpc_request *req, ldlm_cancel_flags_t flags);
+/** @} ldlm_cli_api */
+
+/* mds/handler.c */
+/* This has to be here because recursive inclusion sucks. */
+int intent_disposition(struct ldlm_reply *rep, int flag);
+void intent_set_disposition(struct ldlm_reply *rep, int flag);
+
+
+/* ioctls for trying requests */
+#define IOC_LDLM_TYPE             'f'
+#define IOC_LDLM_MIN_NR                 40
+
+#define IOC_LDLM_TEST             _IOWR('f', 40, long)
+#define IOC_LDLM_DUMP             _IOWR('f', 41, long)
+#define IOC_LDLM_REGRESS_START   _IOWR('f', 42, long)
+#define IOC_LDLM_REGRESS_STOP     _IOWR('f', 43, long)
+#define IOC_LDLM_MAX_NR                 43
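+
+/*
+ * For illustration, a hypothetical user-space caller would issue these
+ * against an assumed control-device file descriptor `fd`:
+ *
+ *     long arg = 0;
+ *     ioctl(fd, IOC_LDLM_DUMP, &arg);
+ */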
+
+/**
+ * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more
+ * than one lock_res is deadlock safe.
+ */
+enum lock_res_type {
+       LRT_NORMAL,
+       LRT_NEW
+};
+
+/** Lock resource. */
+static inline void lock_res(struct ldlm_resource *res)
+{
+       spin_lock(&res->lr_lock);
+}
+
+/** Lock resource, telling lockdep that this nesting of lock_res is safe. */
+static inline void lock_res_nested(struct ldlm_resource *res,
+                                  enum lock_res_type mode)
+{
+       spin_lock_nested(&res->lr_lock, mode);
+}
+
+/** Unlock resource. */
+static inline void unlock_res(struct ldlm_resource *res)
+{
+       spin_unlock(&res->lr_lock);
+}
+
+/** Check if resource is already locked, assert if not. */
+static inline void check_res_locked(struct ldlm_resource *res)
+{
+       LASSERT(spin_is_locked(&res->lr_lock));
+}
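+
+/*
+ * For illustration, taking two resource locks without a lockdep complaint
+ * uses the nesting modes above; the wrapper name and the res1/res2 ordering
+ * convention are assumptions, not part of the API.
+ */
+#if 0  /* illustrative sketch */
+static inline void lock_two_res(struct ldlm_resource *res1,
+                               struct ldlm_resource *res2)
+{
+       lock_res(res1);                 /* outer lock, implicit LRT_NORMAL */
+       lock_res_nested(res2, LRT_NEW); /* inner lock, marked for lockdep */
+}
+#endif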
+
+struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock);
+void unlock_res_and_lock(struct ldlm_lock *lock);
+
+/* ldlm_pool.c */
+/** \defgroup ldlm_pools Various LDLM pool related functions
+ * These are not used outside of LDLM.
+ * @{
+ */
+void ldlm_pools_recalc(ldlm_side_t client);
+int ldlm_pools_init(void);
+void ldlm_pools_fini(void);
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+                  int idx, ldlm_side_t client);
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+                    unsigned int gfp_mask);
+void ldlm_pool_fini(struct ldlm_pool *pl);
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit);
+int ldlm_pool_recalc(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl);
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl);
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl);
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv);
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv);
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit);
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock);
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock);
+/** @} */
+
+#endif
+/** @} LDLM */
diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h
new file mode 100644 (file)
index 0000000..b94f76a
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_EACL_H
+#define _LUSTRE_EACL_H
+
+/** \defgroup eacl eacl
+ *
+ * @{
+ */
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <linux/posix_acl_xattr.h>
+
+typedef struct {
+       __u16              e_tag;
+       __u16              e_perm;
+       __u32              e_id;
+       __u32              e_stat;
+} ext_acl_xattr_entry;
+
+typedef struct {
+       __u32              a_count;
+       ext_acl_xattr_entry     a_entries[0];
+} ext_acl_xattr_header;
+
+#define CFS_ACL_XATTR_SIZE(count, prefix) \
+       (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry))
+
+#define CFS_ACL_XATTR_COUNT(size, prefix) \
+       (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry))
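+
+/*
+ * For illustration, sizing an extended ACL xattr buffer for a hypothetical
+ * entry count, and recovering the count from a size, with the helpers above:
+ *
+ *     size_t bytes = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+ *     int n = CFS_ACL_XATTR_COUNT(bytes, ext_acl_xattr);
+ */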
+
+
+extern ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size);
+extern int
+lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+                             posix_acl_xattr_header **out);
+extern void
+lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size);
+extern void
+lustre_ext_acl_xattr_free(ext_acl_xattr_header *header);
+extern int
+lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+                            ext_acl_xattr_header *ext_header,
+                            posix_acl_xattr_header **out);
+extern ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+                          ext_acl_xattr_header *ext_header);
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+/** @} eacl */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h
new file mode 100644 (file)
index 0000000..d61c020
--- /dev/null
@@ -0,0 +1,389 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_export PortalRPC export definitions
+ *
+ * @{
+ */
+
+#ifndef __EXPORT_H
+#define __EXPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+struct mds_client_data;
+struct mdt_client_data;
+struct mds_idmap_table;
+struct mdt_idmap_table;
+
+/**
+ * Target-specific export data
+ */
+struct tg_export_data {
+       /** Protects ted_lcd below */
+       struct mutex            ted_lcd_lock;
+       /** Per-client data for each export */
+       struct lsd_client_data  *ted_lcd;
+       /** Offset of record in last_rcvd file */
+       loff_t                  ted_lr_off;
+       /** Client index in last_rcvd file */
+       int                     ted_lr_idx;
+};
+
+/**
+ * MDT-specific export data
+ */
+struct mdt_export_data {
+       struct tg_export_data   med_ted;
+       /** List of all files opened by client on this MDT */
+       struct list_head                med_open_head;
+       spinlock_t              med_open_lock; /* med_open_head, mfd_list */
+       /** Bitmask of all ibit locks this MDT understands */
+       __u64                   med_ibits_known;
+       struct mutex            med_idmap_mutex;
+       struct lustre_idmap_table *med_idmap;
+};
+
+struct ec_export_data { /* echo client */
+       struct list_head eced_locks;
+};
+
+/* In-memory access to client data from OST struct */
+/** Filter (oss-side) specific import data */
+struct filter_export_data {
+       struct tg_export_data   fed_ted;
+       spinlock_t              fed_lock;       /**< protects fed_mod_list */
+       long                   fed_dirty;    /* in bytes */
+       long                   fed_grant;    /* in bytes */
+       struct list_head                 fed_mod_list; /* files being modified */
+       int                     fed_mod_count;/* items in fed_mod_list */
+       long                   fed_pending;  /* bytes just being written */
+       __u32                 fed_group;
+       __u8                   fed_pagesize; /* log2 of client page size */
+};
+
+struct mgs_export_data {
+       struct list_head                med_clients;    /* mgc fs client via this exp */
+       spinlock_t              med_lock;       /* protect med_clients */
+};
+
+/**
+ * per-NID statistics structure.
+ * It tracks access patterns to this export on a per-client-NID basis
+ */
+struct nid_stat {
+       lnet_nid_t             nid;
+       struct hlist_node        nid_hash;
+       struct list_head               nid_list;
+       struct obd_device       *nid_obd;
+       struct proc_dir_entry   *nid_proc;
+       struct lprocfs_stats    *nid_stats;
+       struct lprocfs_stats    *nid_ldlm_stats;
+       atomic_t             nid_exp_ref_count; /* for obd_nid_stats_hash
+                                                * and exp_nid_stats */
+};
+
+#define nidstat_getref(nidstat)                                                \
+do {                                                                      \
+       atomic_inc(&(nidstat)->nid_exp_ref_count);                       \
+} while(0)
+
+#define nidstat_putref(nidstat)                                                \
+do {                                                                      \
+       atomic_dec(&(nidstat)->nid_exp_ref_count);                       \
+       LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0,         \
+                "stat %p nid_exp_ref_count < 0\n", nidstat);             \
+} while(0)
+
+enum obd_option {
+       OBD_OPT_FORCE =  0x0001,
+       OBD_OPT_FAILOVER =      0x0002,
+       OBD_OPT_ABORT_RECOV =   0x0004,
+};
+
+/**
+ * Export structure. Represents the target side of a connection in portals.
+ * Also used in Lustre to connect between layers on the same node when
+ * there is no network connection in between.
+ * For every connected client there is an export structure on the server
+ * attached to the same obd device.
+ */
+struct obd_export {
+       /**
+        * Export handle: its id is provided to the client on connect.
+        * Subsequent client RPCs contain this handle id to identify
+        * which export they are talking to.
+        */
+       struct portals_handle     exp_handle;
+       atomic_t              exp_refcount;
+       /**
+        * The set of counters below tracks where export references are
+        * kept. exp_rpc_count is also used for reconnect handling;
+        * cb_count and locks_count are for debug purposes only for now.
+        * Their sum should be less than exp_refcount by 3.
+        */
+       atomic_t              exp_rpc_count; /* RPC references */
+       atomic_t              exp_cb_count; /* Commit callback references */
+       /** Number of queued replay requests to be processed */
+       atomic_t                  exp_replay_count;
+       atomic_t              exp_locks_count; /** Lock references */
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       struct list_head                exp_locks_list;
+       spinlock_t                exp_locks_list_guard;
+#endif
+       /** UUID of client connected to this export */
+       struct obd_uuid    exp_client_uuid;
+       /** To link all exports on an obd device */
+       struct list_head                exp_obd_chain;
+       struct hlist_node         exp_uuid_hash; /** uuid-export hash */
+       struct hlist_node         exp_nid_hash; /** nid-export hash */
+       /**
+        * All exports eligible for ping evictor are linked into a list
+        * through this field in "most time since last request on this export"
+        * order
+        * protected by obd_dev_lock
+        */
+       struct list_head                exp_obd_chain_timed;
+       /** Obd device of this export */
+       struct obd_device       *exp_obd;
+       /**
+        * "reverse" import to send requests (e.g. from ldlm) back to client
+        * exp_lock protect its change
+        */
+       struct obd_import       *exp_imp_reverse;
+       struct nid_stat   *exp_nid_stats;
+       struct lprocfs_stats     *exp_md_stats;
+       /** Active connection */
+       struct ptlrpc_connection *exp_connection;
+       /** Connection count value from last successful reconnect RPC */
+       __u32                exp_conn_cnt;
+       /** Hash list of all ldlm locks granted on this export */
+       cfs_hash_t             *exp_lock_hash;
+       /**
+        * Hash list for Posix lock deadlock detection, added with
+        * ldlm_lock::l_exp_flock_hash.
+        */
+       cfs_hash_t             *exp_flock_hash;
+       struct list_head                exp_outstanding_replies;
+       struct list_head                exp_uncommitted_replies;
+       spinlock_t                exp_uncommitted_replies_lock;
+       /** Last committed transno for this export */
+       __u64                exp_last_committed;
+       /** When was last request received */
+       cfs_time_t              exp_last_request_time;
+       /** On replay all requests waiting for replay are linked here */
+       struct list_head                exp_req_replay_queue;
+       /**
+        * protects exp_flags, exp_outstanding_replies and the change
+        * of exp_imp_reverse
+        */
+       spinlock_t                exp_lock;
+       /** Compatibility flags for this export are embedded into
+        *  exp_connect_data */
+       struct obd_connect_data   exp_connect_data;
+       enum obd_option    exp_flags;
+       unsigned long        exp_failed:1,
+                                 exp_in_recovery:1,
+                                 exp_disconnected:1,
+                                 exp_connecting:1,
+                                 /** VBR: export missed recovery */
+                                 exp_delayed:1,
+                                 /** VBR: failed version checking */
+                                 exp_vbr_failed:1,
+                                 exp_req_replay_needed:1,
+                                 exp_lock_replay_needed:1,
+                                 exp_need_sync:1,
+                                 exp_flvr_changed:1,
+                                 exp_flvr_adapt:1,
+                                 exp_libclient:1, /* liblustre client? */
+                                 /* client timed out and tried to reconnect,
+                                  * but couldn't because of active rpcs */
+                                 exp_abort_active_req:1,
+                                 /* whether to swap nidtbl entries for 2.2
+                                  * clients. Only used by the MGS to fix
+                                  * LU-1644. */
+                                 exp_need_mne_swab:1;
+       /* also protected by exp_lock */
+       enum lustre_sec_part      exp_sp_peer;
+       struct sptlrpc_flavor     exp_flvr;          /* current */
+       struct sptlrpc_flavor     exp_flvr_old[2];      /* about-to-expire */
+       cfs_time_t              exp_flvr_expire[2];   /* seconds */
+
+       /** protects exp_hp_rpcs */
+       spinlock_t                exp_rpc_lock;
+       struct list_head                  exp_hp_rpcs;  /* (potential) HP RPCs */
+
+       /** blocking dlm lock list, protected by exp_bl_list_lock */
+       struct list_head                exp_bl_list;
+       spinlock_t                exp_bl_list_lock;
+
+       /** Target specific data */
+       union {
+               struct tg_export_data     eu_target_data;
+               struct mdt_export_data    eu_mdt_data;
+               struct filter_export_data eu_filter_data;
+               struct ec_export_data     eu_ec_data;
+               struct mgs_export_data    eu_mgs_data;
+       } u;
+};
+
+#define exp_target_data u.eu_target_data
+#define exp_mdt_data    u.eu_mdt_data
+#define exp_filter_data u.eu_filter_data
+#define exp_ec_data     u.eu_ec_data
+
+static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp)
+{
+       return &exp->exp_connect_data.ocd_connect_flags;
+}
+
+static inline __u64 exp_connect_flags(struct obd_export *exp)
+{
+       return *exp_connect_flags_ptr(exp);
+}
+
+static inline int exp_max_brw_size(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE)
+               return exp->exp_connect_data.ocd_brw_size;
+
+       return ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_connect_multibulk(struct obd_export *exp)
+{
+       return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE;
+}
+
+static inline int exp_expired(struct obd_export *exp, cfs_duration_t age)
+{
+       LASSERT(exp->exp_delayed);
+       return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age),
+                              cfs_time_current_sec());
+}
+
+static inline int exp_connect_cancelset(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET);
+}
+
+static inline int exp_connect_lru_resize(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_rmtclient(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int client_is_remote(struct obd_export *exp)
+{
+       struct obd_import *imp = class_exp2cliimp(exp);
+
+       return !!(imp->imp_connect_data.ocd_connect_flags &
+                 OBD_CONNECT_RMT_CLIENT);
+}
+
+static inline int exp_connect_vbr(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       LASSERT(exp->exp_connection);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR);
+}
+
+static inline int exp_connect_som(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM);
+}
+
+static inline int exp_connect_umask(struct obd_export *exp)
+{
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK);
+}
+
+static inline int imp_connect_lru_resize(struct obd_import *imp)
+{
+       struct obd_connect_data *ocd;
+
+       LASSERT(imp != NULL);
+       ocd = &imp->imp_connect_data;
+       return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE);
+}
+
+static inline int exp_connect_layout(struct obd_export *exp)
+{
+       return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK);
+}
+
+static inline bool exp_connect_lvb_type(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE)
+               return true;
+       else
+               return false;
+}
+
+static inline bool imp_connect_lvb_type(struct obd_import *imp)
+{
+       struct obd_connect_data *ocd;
+
+       LASSERT(imp != NULL);
+       ocd = &imp->imp_connect_data;
+       if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE)
+               return true;
+       else
+               return false;
+}
+
+extern struct obd_export *class_conn2export(struct lustre_handle *conn);
+extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
+
+/** @} export */
+
+#endif /* __EXPORT_H */
+/** @} obd_export */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h
new file mode 100644 (file)
index 0000000..7d20cba
--- /dev/null
@@ -0,0 +1,762 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fid.h
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+#ifndef __LINUX_FID_H
+#define __LINUX_FID_H
+
+/** \defgroup fid fid
+ *
+ * @{
+ *
+ * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs
+ * describes the FID namespace and interoperability requirements for FIDs.
+ * The important parts of that document are included here for reference.
+ *
+ * FID
+ *   File IDentifier generated by client from range allocated by the SEQuence
+ *   service and stored in struct lu_fid. The FID is composed of three parts:
+ *   SEQuence, ObjectID, and VERsion.  The SEQ component is a filesystem-
+ *   unique 64-bit integer, and only one client is ever assigned any SEQ value.
+ *   The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved
+ *   for system use.  The OID component is a 32-bit value generated by the
+ *   client on a per-SEQ basis to allow creating many unique FIDs without
+ *   communication with the server.  The VER component is a 32-bit value that
+ *   distinguishes between different FID instantiations, such as snapshots or
+ *   separate subtrees within the filesystem.  FIDs with the same VER field
+ *   are considered part of the same namespace.
+ *
+ * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and
+ *   MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while
+ *   OSTs use 64-bit Lustre object IDs and generation numbers.
+ *
+ * NEW filesystems are those formatted since the introduction of FIDs.
+ *
+ * IGIF
+ *   Inode and Generation In FID, a surrogate FID used to globally identify
+ *   an existing object on OLD formatted MDT file system. This would only be
+ *   used on MDT0 in a DNE filesystem, because there cannot be more than one
+ *   MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1]
+ *   range, where inode number is stored in SEQ, and inode generation is in OID.
+ *   NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem,
+ *   which is the maximum possible for an ldiskfs backend.  It also assumes
+ *   that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible
+ *   to clients, which has always been true.
+ *
+ * IDIF
+ *   object ID In FID, a surrogate FID used to globally identify an existing
+ *   OST object on OLD formatted OST file system. Belongs to a sequence in
+ *   [2^32, 2^33 - 1]. Sequence number is calculated as:
+ *
+ *      1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff)
+ *
+ *   that is, SEQ consists of 16-bit OST index, and higher 16 bits of object
+ *   ID. The generation of unique SEQ values per OST allows the IDIF FIDs to
+ *   be identified in the FLD correctly. The OID field is calculated as:
+ *
+ *      objid & 0xffffffff
+ *
+ *   that is, it consists of lower 32 bits of object ID.  For objects within
+ *   the IDIF range, object ID extraction will be:
+ *
+ *      o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid;
+ *      o_seq = 0;  // formerly group number
+ *
+ *   NOTE: This assumes that no more than 2^48-1 objects have ever been created
+ *   on any OST, and that no more than 65535 OSTs are in use.  Both are very
+ *   reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming
+ *   a maximum creation rate of 1M objects per second for a maximum of 9 years,
+ *   or combinations thereof.
+ *
+ * OST_MDT0
+ *   Surrogate FID used to identify an existing object on OLD formatted OST
+ *   filesystem. Belongs to the reserved SEQuence 0, and is used prior to
+ *   the introduction of FID-on-OST, at which point IDIF will be used to
+ *   identify objects as residing on a specific OST.
+ *
+ * LLOG
+ *   For Lustre Log objects the object sequence 1 is used. This is compatible
+ *   with both OLD and NEW namespaces, as this SEQ number is in the
+ *   ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * ECHO
+ *   For testing OST IO performance the object sequence 2 is used. This is
+ *   compatible with both OLD and NEW namespaces, as this SEQ number is in
+ *   the ext3/ldiskfs reserved inode range and does not conflict with IGIF
+ *   sequence numbers.
+ *
+ * OST_MDT1 .. OST_MAX
+ *   For testing with multiple MDTs, object sequences 3 through 9 are used,
+ *   allowing direct mapping of MDTs 1 through 7 respectively, for a total
+ *   of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group"
+ *   mappings. However, this SEQ range is only for testing prior to any
+ *   production DNE release, as the objects in this range conflict across all
+ *   OSTs, as the OST index is not part of the FID.  For production DNE usage,
+ *   OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs.
+ *
+ * DLM OST objid to IDIF mapping
+ *   For compatibility with existing OLD OST network protocol structures, the
+ *   FID must map onto the o_id and o_seq in a manner that ensures existing
+ *   objects are identified consistently for IO, as well as onto the LDLM
+ *   namespace to ensure that for IDIFs there is only a single resource name
+ *   for any object in the DLM.  The OLD OST object DLM resource mapping is:
+ *
+ *      resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases
+ *
+ *   The NEW OST object DLM resource mapping is the same for both MDT and OST:
+ *
+ *      resource[] = {SEQ, OID, VER, HASH};
+ *
+ *  NOTE: for mapping IDIF values to DLM resource names the o_id may be
+ *  larger than the 2^33 reserved sequence numbers for IDIF, so it is possible
+ *  for the o_id numbers to overlap FID SEQ numbers in the resource. However,
+ *  in all production releases the OLD o_seq field is always zero, and all
+ *  valid FID OID values are non-zero, so the lock resources will not collide.
+ *  Even so, the MDT and OST resources are also in different LDLM namespaces.
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <lustre_mdt.h>
+#include <obd.h>
+
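+/*
+ * For illustration, a helper following the IDIF formulas quoted in the
+ * comment above; the function name and parameters are assumptions, not part
+ * of the API.
+ */
+#if 0  /* illustrative sketch */
+static inline void idif_from_objid(__u64 objid, __u32 ost_index,
+                                  __u64 *seq, __u32 *oid)
+{
+       /* SEQ: 1 << 32 | 16-bit OST index | high 16 bits of object ID */
+       *seq = (1ULL << 32) | ((__u64)ost_index << 16) |
+              ((objid >> 32) & 0xffff);
+       /* OID: low 32 bits of object ID */
+       *oid = objid & 0xffffffff;
+}
+#endif
+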
+
+struct lu_site;
+struct lu_context;
+
+/* Whole sequences space range and zero range definitions */
+extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE;
+extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE;
+extern const struct lu_fid LUSTRE_BFL_FID;
+extern const struct lu_fid LU_OBF_FID;
+extern const struct lu_fid LU_DOT_LUSTRE_FID;
+
+enum {
+       /*
+        * This is how many metadata FIDs may be allocated in one sequence (128k)
+        */
+       LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL,
+
+       /*
+        * This is how many data FIDs could be allocated in one sequence (4B - 1)
+        */
+       LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL,
+
+       /*
+        * How many sequences to allocate to a client at once.
+        */
+       LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL,
+
+       /*
+        * seq allocation pool size.
+        */
+       LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000,
+
+       /*
+        * This is how many sequences may be in one super-sequence allocated to
+        * MDTs.
+        */
+       LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
+};
+
+enum {
+       /** 2^6 FIDs for OI containers */
+       OSD_OI_FID_OID_BITS     = 6,
+       /** reserve enough FIDs in case we want more in the future */
+       OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
+/** special OID for local objects */
+enum local_oid {
+       /** \see fld_mod_init */
+       FLD_INDEX_OID           = 3UL,
+       /** \see fid_mod_init */
+       FID_SEQ_CTL_OID         = 4UL,
+       FID_SEQ_SRV_OID         = 5UL,
+       /** \see mdd_mod_init */
+       MDD_ROOT_INDEX_OID      = 6UL, /* deprecated in 2.4 */
+       MDD_ORPHAN_OID          = 7UL, /* deprecated in 2.4 */
+       MDD_LOV_OBJ_OID         = 8UL,
+       MDD_CAPA_KEYS_OID       = 9UL,
+       /** \see mdt_mod_init */
+       LAST_RECV_OID           = 11UL,
+       OSD_FS_ROOT_OID         = 13UL,
+       ACCT_USER_OID           = 15UL,
+       ACCT_GROUP_OID          = 16UL,
+       LFSCK_BOOKMARK_OID      = 17UL,
+       OTABLE_IT_OID           = 18UL,
+       /* These two definitions are obsolete
+        * OFD_GROUP0_LAST_OID     = 20UL,
+        * OFD_GROUP4K_LAST_OID    = 20UL+4096,
+        */
+       OFD_LAST_GROUP_OID      = 4117UL,
+       LLOG_CATALOGS_OID       = 4118UL,
+       MGS_CONFIGS_OID         = 4119UL,
+       OFD_HEALTH_CHECK_OID    = 4120UL,
+       MDD_LOV_OBJ_OSEQ        = 4121UL,
+       LFSCK_NAMESPACE_OID     = 4122UL,
+       REMOTE_PARENT_DIR_OID   = 4123UL,
+};
+
+static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+       fid->f_seq = FID_SEQ_LOCAL_FILE;
+       fid->f_oid = oid;
+       fid->f_ver = 0;
+}
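+
+/*
+ * For illustration, naming a well-known local object (here the FLD index)
+ * with the helper above:
+ *
+ *     struct lu_fid fid;
+ *     lu_local_obj_fid(&fid, FLD_INDEX_OID);
+ */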
+
+static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid)
+{
+       fid->f_seq = FID_SEQ_LOCAL_NAME;
+       fid->f_oid = oid;
+       fid->f_ver = 0;
+}
+
+/* For a new FS (>= 2.4), the root FID will be changed to
+ * [FID_SEQ_ROOT:1:0]; for an existing FS (upgraded to 2.4),
+ * the root FID will still be an IGIF */
+static inline int fid_is_root(const struct lu_fid *fid)
+{
+       return unlikely((fid_seq(fid) == FID_SEQ_ROOT &&
+                        fid_oid(fid) == 1));
+}
+
+static inline int fid_is_dot_lustre(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+                       fid_oid(fid) == FID_OID_DOT_LUSTRE);
+}
+
+static inline int fid_is_obf(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE &&
+                       fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF);
+}
+
+static inline int fid_is_otable_it(const struct lu_fid *fid)
+{
+       return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+                       fid_oid(fid) == OTABLE_IT_OID);
+}
+
+static inline int fid_is_acct(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
+              (fid_oid(fid) == ACCT_USER_OID ||
+               fid_oid(fid) == ACCT_GROUP_OID);
+}
+
+static inline int fid_is_quota(const struct lu_fid *fid)
+{
+       return fid_seq(fid) == FID_SEQ_QUOTA ||
+              fid_seq(fid) == FID_SEQ_QUOTA_GLB;
+}
+
+static inline int fid_is_namespace_visible(const struct lu_fid *fid)
+{
+       const __u64 seq = fid_seq(fid);
+
+       /* Here, we cannot distinguish whether a normal FID is for an OST
+        * object or not. It is the caller's duty to check further if needed. */
+       return (!fid_is_last_id(fid) &&
+               (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) ||
+              fid_is_root(fid) || fid_is_dot_lustre(fid);
+}
+
+static inline int fid_seq_in_fldb(__u64 seq)
+{
+       return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) ||
+              fid_seq_is_root(seq) || fid_seq_is_dot(seq);
+}
+
+static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq)
+{
+       if (fid_seq_is_mdt0(seq)) {
+               fid->f_seq = fid_idif_seq(0, 0);
+       } else {
+               LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) ||
+                        fid_seq_is_idif(seq), LPX64"\n", seq);
+               fid->f_seq = seq;
+       }
+       fid->f_oid = 0;
+       fid->f_ver = 0;
+}
+
+enum lu_mgr_type {
+       LUSTRE_SEQ_SERVER,
+       LUSTRE_SEQ_CONTROLLER
+};
+
+struct lu_server_seq;
+
+/* Client sequence manager interface. */
+struct lu_client_seq {
+       /* Sequence-controller export. */
+       struct obd_export      *lcs_exp;
+       struct mutex            lcs_mutex;
+
+       /*
+        * Range of sequences allowed for allocation. When lu_client_seq is
+        * used on clients, this contains the meta-sequence range; for servers
+        * it contains the super-sequence range.
+        */
+       struct lu_seq_range      lcs_space;
+
+       /* Seq related proc */
+       proc_dir_entry_t   *lcs_proc_dir;
+
+       /* This holds last allocated fid in last obtained seq */
+       struct lu_fid      lcs_fid;
+
+       /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */
+       enum lu_cli_type        lcs_type;
+
+       /*
+        * Service uuid, combined with the seq name passed from the MDT to
+        * form a unique seq name for use with procfs.
+        */
+       char                lcs_name[80];
+
+       /*
+        * Sequence width, that is how many objects may be allocated in one
+        * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+        */
+       __u64              lcs_width;
+
+       /* Seq-server for direct talking */
+       struct lu_server_seq   *lcs_srv;
+
+       /* wait queue for fid allocation and update indicator */
+       wait_queue_head_t            lcs_waitq;
+       int                  lcs_update;
+};
+
+/* server sequence manager interface */
+struct lu_server_seq {
+       /* Available sequences space */
+       struct lu_seq_range      lss_space;
+
+       /* keeps highwater in lsr_end for seq allocation algorithm */
+       struct lu_seq_range      lss_lowater_set;
+       struct lu_seq_range      lss_hiwater_set;
+
+       /*
+        * Device for server side seq manager needs (saving sequences to backing
+        * store).
+        */
+       struct dt_device       *lss_dev;
+
+       /* /seq file object device */
+       struct dt_object       *lss_obj;
+
+       /* Seq related proc */
+       proc_dir_entry_t   *lss_proc_dir;
+
+       /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */
+       enum lu_mgr_type       lss_type;
+
+       /* Client interface to the request controller */
+       struct lu_client_seq   *lss_cli;
+
+       /* Mutex for protecting allocation */
+       struct mutex            lss_mutex;
+
+       /*
+        * Service uuid, combined with the seq name passed from the MDT to
+        * form a unique seq name for use with procfs.
+        */
+       char                lss_name[80];
+
+       /*
+        * Allocation chunks for super and meta sequences. Default values are
+        * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH.
+        */
+       __u64              lss_width;
+
+       /*
+        * minimum lss_alloc_set size that should be allocated from
+        * lss_space
+        */
+       __u64              lss_set_width;
+
+       /* sync is needed for update operation */
+       __u32              lss_need_sync;
+
+       /**
+        * Pointer to site object, required to access site fld.
+        */
+       struct seq_server_site  *lss_site;
+};
+
+int seq_query(struct com_thread_info *info);
+int seq_handle(struct ptlrpc_request *req);
+
+/* Server methods */
+int seq_server_init(struct lu_server_seq *seq,
+                   struct dt_device *dev,
+                   const char *prefix,
+                   enum lu_mgr_type type,
+                   struct seq_server_site *ss,
+                   const struct lu_env *env);
+
+void seq_server_fini(struct lu_server_seq *seq,
+                    const struct lu_env *env);
+
+int seq_server_alloc_super(struct lu_server_seq *seq,
+                          struct lu_seq_range *out,
+                          const struct lu_env *env);
+
+int seq_server_alloc_meta(struct lu_server_seq *seq,
+                         struct lu_seq_range *out,
+                         const struct lu_env *env);
+
+int seq_server_set_cli(struct lu_server_seq *seq,
+                      struct lu_client_seq *cli,
+                      const struct lu_env *env);
+
+/* Client methods */
+int seq_client_init(struct lu_client_seq *seq,
+                   struct obd_export *exp,
+                   enum lu_cli_type type,
+                   const char *prefix,
+                   struct lu_server_seq *srv);
+
+void seq_client_fini(struct lu_client_seq *seq);
+
+void seq_client_flush(struct lu_client_seq *seq);
+
+int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq,
+                        struct lu_fid *fid);
+int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq,
+                      seqno_t *seqnr);
+int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss);
+/* Fids common stuff */
+int fid_is_local(const struct lu_env *env,
+                struct lu_site *site, const struct lu_fid *fid);
+
+int client_fid_init(struct obd_device *obd, struct obd_export *exp,
+                   enum lu_cli_type type);
+int client_fid_fini(struct obd_device *obd);
+
+/* fid locking */
+
+struct ldlm_namespace;
+
+/*
+ * Build (DLM) resource name from FID.
+ *
+ * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
+ * but was moved into name[1] along with the OID to avoid consuming the
+ * remaining name[2,3] fields that need to be used for the quota identifier.
+ */
+static inline struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f,
+                      struct ldlm_res_id *name)
+{
+       memset(name, 0, sizeof *name);
+       name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+       name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f);
+       return name;
+}
+
+/*
+ * Build (DLM) resource identifier from global quota FID and quota ID.
+ */
+static inline struct ldlm_res_id *
+fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid,
+                     struct ldlm_res_id *res)
+{
+       fid_build_reg_res_name(glb_fid, res);
+       res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid);
+       res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid);
+       return res;
+}
+
+/*
+ * Extract global FID and quota ID from resource name
+ */
+static inline void fid_extract_quota_resid(struct ldlm_res_id *res,
+                                          struct lu_fid *glb_fid,
+                                          union lquota_id *qid)
+{
+       glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF];
+       glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF];
+       glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+       qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF];
+       qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF];
+       qid->qid_fid.f_ver =
+               (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32);
+}
+
+/*
+ * Return true if resource is for object identified by fid.
+ */
+static inline int fid_res_name_eq(const struct lu_fid *f,
+                                 const struct ldlm_res_id *name)
+{
+       return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+              name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f);
+}
+
+/* reverse function of fid_build_reg_res_name() */
+static inline void fid_build_from_res_name(struct lu_fid *f,
+                                          const struct ldlm_res_id *name)
+{
+       fid_zero(f);
+       f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF];
+       f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff;
+       f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32;
+       LASSERT(fid_res_name_eq(f, name));
+}
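+
+/*
+ * For illustration, a FID survives a round trip through a regular DLM
+ * resource name; the wrapper below is an assumption, not existing API.
+ */
+#if 0  /* illustrative sketch */
+static inline void fid_res_name_roundtrip(const struct lu_fid *fid)
+{
+       struct ldlm_res_id name;
+       struct lu_fid copy;
+
+       fid_build_reg_res_name(fid, &name);     /* FID -> resource name */
+       fid_build_from_res_name(&copy, &name);  /* resource name -> FID */
+       /* fid_build_from_res_name() LASSERTs that the two now match */
+}
+#endif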
+
+static inline struct ldlm_res_id *
+fid_build_pdo_res_name(const struct lu_fid *f,
+                      unsigned int hash,
+                      struct ldlm_res_id *name)
+{
+       fid_build_reg_res_name(f, name);
+       name->name[LUSTRE_RES_ID_HSH_OFF] = hash;
+       return name;
+}
+
+/**
+ * Build a DLM resource name from object id & seq. This will eventually be
+ * removed when we replace ost_id with FID in the data stack.
+ *
+ * Currently, resid from the old client, whose res[0] = object_id,
+ * res[1] = object_seq, is just the opposite of the metadata
+ * resid, where res[0] = fid->f_seq, res[1] = fid->f_oid.
+ * To unify the resid identification, we will reverse the data
+ * resid to match the metadata resid, i.e.
+ *
+ * For a resid from an old client,
+ *    res[0] = objid, res[1] = 0, still keeping the original order
+ *    for compatibility.
+ *
+ * For new resid
+ *    res will be built from normal FID directly, i.e. res[0] = f_seq,
+ *    res[1] = f_oid + f_ver.
+ */
+static inline void ostid_build_res_name(struct ost_id *oi,
+                                       struct ldlm_res_id *name)
+{
+       memset(name, 0, sizeof *name);
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi);
+               name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi);
+       } else {
+               fid_build_reg_res_name((struct lu_fid *)oi, name);
+       }
+}
+
+static inline void ostid_res_name_to_id(struct ost_id *oi,
+                                       struct ldlm_res_id *name)
+{
+       if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) {
+               /* old resid */
+               ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+               ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+       } else {
+               /* new resid */
+               fid_build_from_res_name((struct lu_fid *)oi, name);
+       }
+}
+
+/**
+ * Return true if the resource is for the object identified by this id & group.
+ */
+static inline int ostid_res_name_eq(struct ost_id *oi,
+                                   struct ldlm_res_id *name)
+{
+       /* Note: this is just a trick to save some effort; the strictly
+        * correct way would be to turn both into FIDs and compare them */
+       if (fid_seq_is_mdt0(ostid_seq(oi))) {
+               return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) &&
+                      name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi);
+       } else {
+               return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) &&
+                      name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi);
+       }
+}
+
+/* The same as osc_build_res_name() */
+static inline void ost_fid_build_resid(const struct lu_fid *fid,
+                                      struct ldlm_res_id *resname)
+{
+       if (fid_is_mdt0(fid) || fid_is_idif(fid)) {
+               struct ost_id oi;
+               oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */
+               if (fid_to_ostid(fid, &oi) != 0)
+                       return;
+               ostid_build_res_name(&oi, resname);
+       } else {
+               fid_build_reg_res_name(fid, resname);
+       }
+}
+
+static inline void ost_fid_from_resid(struct lu_fid *fid,
+                                     const struct ldlm_res_id *name)
+{
+       if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) {
+               /* old resid */
+               struct ost_id oi;
+               ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]);
+               ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]);
+               ostid_to_fid(fid, &oi, 0);
+       } else {
+               /* new resid */
+               fid_build_from_res_name(fid, name);
+       }
+}
+
+/**
+ * Flatten 128-bit FID values into a 64-bit value for use as an inode number.
+ * For non-IGIF FIDs this starts just over 2^32, and continues without
+ * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ
+ * into the range where there may not be many OID values in use, to minimize
+ * the risk of conflict.
+ *
+ * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently
+ * true, the time between re-used inode numbers is very long - 2^40 SEQ
+ * numbers, or about 2^40 client mounts, if clients create fewer than 2^24
+ * files/mount.
+ */
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+       __u64 ino;
+       __u64 seq;
+
+       if (fid_is_igif(fid)) {
+               ino = lu_igif_ino(fid);
+               RETURN(ino);
+       }
+
+       seq = fid_seq(fid);
+
+       ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+
+       RETURN(ino ? ino : fid_oid(fid));
+}
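+
+/* Worked example (illustrative values): for seq = 0x200000401, oid = 0x5,
+ *
+ *     seq << 24                     = 0x200000401000000
+ *     (seq >> 24) & 0xffffff0000ULL = 0
+ *     ino                           = 0x200000401000005
+ *
+ * so consecutive OIDs within one SEQ map to consecutive inode numbers.
+ */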
+
+static inline __u32 fid_hash(const struct lu_fid *f, int bits)
+{
+       /* All objects with the same id and different versions will belong to
+        * the same collision list. */
+       return cfs_hash_long(fid_flatten(f), bits);
+}
+
+/**
+ * Map a FID to a 32-bit inode number on 32-bit systems.
+ */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+       __u32 ino;
+       __u64 seq;
+
+       if (fid_is_igif(fid)) {
+               ino = lu_igif_ino(fid);
+               RETURN(ino);
+       }
+
+       seq = fid_seq(fid) - FID_SEQ_START;
+
+       /* Map the high bits of the OID into higher bits of the inode number so
+        * that inodes generated at about the same time have a reduced chance
+        * of collisions. This will give a period of 2^12 = 4096 unique clients
+        * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+        * (from OID), or up to 128M inodes without collisions for new files. */
+       ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+              (seq >> (64 - (40-8)) & 0xffffff00) +
+              (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+       RETURN(ino ? ino : fid_oid(fid));
+}
+
+static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2)
+{
+       LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n",
+                PFID(fid1), PFID(fid2));
+
+       if (fid_is_idif(fid1) && fid_is_idif(fid2))
+               return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) -
+                      fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver);
+
+       return fid_oid(fid1) - fid_oid(fid2);
+}
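+
+/* For example (illustrative FIDs), two non-IDIF FIDs in the same sequence
+ * with oids 7 and 3 give lu_fid_diff() == 4, i.e. the distance between the
+ * two objects within the sequence. */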
+
+#define LUSTRE_SEQ_SRV_NAME "seq_srv"
+#define LUSTRE_SEQ_CTL_NAME "seq_ctl"
+
+/* Range common stuff */
+static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = cpu_to_le64(src->lsr_start);
+       dst->lsr_end = cpu_to_le64(src->lsr_end);
+       dst->lsr_index = cpu_to_le32(src->lsr_index);
+       dst->lsr_flags = cpu_to_le32(src->lsr_flags);
+}
+
+static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = le64_to_cpu(src->lsr_start);
+       dst->lsr_end = le64_to_cpu(src->lsr_end);
+       dst->lsr_index = le32_to_cpu(src->lsr_index);
+       dst->lsr_flags = le32_to_cpu(src->lsr_flags);
+}
+
+static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = cpu_to_be64(src->lsr_start);
+       dst->lsr_end = cpu_to_be64(src->lsr_end);
+       dst->lsr_index = cpu_to_be32(src->lsr_index);
+       dst->lsr_flags = cpu_to_be32(src->lsr_flags);
+}
+
+static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src)
+{
+       dst->lsr_start = be64_to_cpu(src->lsr_start);
+       dst->lsr_end = be64_to_cpu(src->lsr_end);
+       dst->lsr_index = be32_to_cpu(src->lsr_index);
+       dst->lsr_flags = be32_to_cpu(src->lsr_flags);
+}
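+
+/* Usage sketch (hypothetical sender/receiver pair, assuming the on-wire
+ * representation is little-endian, as elsewhere in ptlrpc):
+ *
+ *     range_cpu_to_le(wire_buf, &cpu_range);   before sending
+ *     ...
+ *     range_le_to_cpu(&cpu_range, wire_buf);   after receiving
+ *
+ * so lsr_start/lsr_end/lsr_index/lsr_flags are byte-swapped only on
+ * big-endian hosts.
+ */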
+
+/** @} fid */
+
+#endif /* __LINUX_FID_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h
new file mode 100644 (file)
index 0000000..11e034a
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_FLD_H
+#define __LINUX_FLD_H
+
+/** \defgroup fld fld
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_mdt.h>
+#include <dt_object.h>
+
+#include <linux/libcfs/libcfs.h>
+
+struct lu_client_fld;
+struct lu_server_fld;
+struct lu_fld_hash;
+struct fld_cache;
+
+extern const struct dt_index_features fld_index_features;
+extern const char fld_index_name[];
+
+/*
+ * FLD (Fid Location Database) interface.
+ */
+enum {
+       LUSTRE_CLI_FLD_HASH_DHT = 0,
+       LUSTRE_CLI_FLD_HASH_RRB
+};
+
+
+struct lu_fld_target {
+       struct list_head               ft_chain;
+       struct obd_export       *ft_exp;
+       struct lu_server_fld    *ft_srv;
+       __u64               ft_idx;
+};
+
+struct lu_server_fld {
+       /**
+        * Fld dir proc entry. */
+       proc_dir_entry_t    *lsf_proc_dir;
+
+       /**
+        * /fld file object device */
+       struct dt_object        *lsf_obj;
+
+       /**
+        * Super sequence controller export, needed to forward fld
+        * lookup requests. */
+       struct obd_export       *lsf_control_exp;
+
+       /**
+        * Client FLD cache. */
+       struct fld_cache        *lsf_cache;
+
+       /**
+        * Protect index modifications */
+       struct mutex            lsf_lock;
+
+       /**
+        * Fld service name in the form "fld-srv-lustre-MDTXXX" */
+       char                 lsf_name[80];
+
+};
+
+struct lu_client_fld {
+       /**
+        * Client side proc entry. */
+       proc_dir_entry_t    *lcf_proc_dir;
+
+       /**
+        * List of exports client FLD knows about. */
+       struct list_head               lcf_targets;
+
+       /**
+        * Current hash to be used to choose an export. */
+       struct lu_fld_hash      *lcf_hash;
+
+       /**
+        * Exports count. */
+       int                   lcf_count;
+
+       /**
+        * Lock protecting exports list and fld_hash. */
+       spinlock_t               lcf_lock;
+
+       /**
+        * Client FLD cache. */
+       struct fld_cache        *lcf_cache;
+
+       /**
+        * Client fld proc entry name. */
+       char                 lcf_name[80];
+
+       const struct lu_context *lcf_ctx;
+
+       int                   lcf_flags;
+};
+
+/**
+ * Number of blocks to reserve for particular operations. Should be a
+ * function of ... something. Stub for now.
+ */
+enum {
+       /* one insert operation can involve two deletes and one insert */
+       FLD_TXN_INDEX_INSERT_CREDITS  = 60,
+       FLD_TXN_INDEX_DELETE_CREDITS  = 20,
+};
+
+int fld_query(struct com_thread_info *info);
+
+/* Server methods */
+int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld,
+                   struct dt_device *dt, const char *prefix, int mds_node_id,
+                   int type);
+
+void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld);
+
+int fld_declare_server_create(const struct lu_env *env,
+                             struct lu_server_fld *fld,
+                             struct lu_seq_range *new,
+                             struct thandle *th);
+
+int fld_server_create(const struct lu_env *env,
+                     struct lu_server_fld *fld,
+                     struct lu_seq_range *add_range,
+                     struct thandle *th);
+
+int fld_insert_entry(const struct lu_env *env,
+                    struct lu_server_fld *fld,
+                    const struct lu_seq_range *range);
+
+int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld,
+                     seqno_t seq, struct lu_seq_range *range);
+
+/* Client methods */
+int fld_client_init(struct lu_client_fld *fld,
+                   const char *prefix, int hash);
+
+void fld_client_fini(struct lu_client_fld *fld);
+
+void fld_client_flush(struct lu_client_fld *fld);
+
+int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds,
+                     __u32 flags, const struct lu_env *env);
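+
+/* Minimal lookup sketch (hypothetical caller, error handling omitted):
+ *
+ *     mdsno_t mds;
+ *     rc = fld_client_lookup(fld, fid_seq(fid), &mds,
+ *                            LU_SEQ_RANGE_MDT, env);
+ *     if (rc == 0)
+ *             use mds as the index of the MDT serving this sequence
+ */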
+
+int fld_client_create(struct lu_client_fld *fld,
+                     struct lu_seq_range *range,
+                     const struct lu_env *env);
+
+int fld_client_delete(struct lu_client_fld *fld,
+                     seqno_t seq,
+                     const struct lu_env *env);
+
+int fld_client_add_target(struct lu_client_fld *fld,
+                         struct lu_fld_target *tar);
+
+int fld_client_del_target(struct lu_client_fld *fld,
+                         __u64 idx);
+
+void fld_client_proc_fini(struct lu_client_fld *fld);
+
+/** @} fld */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h
new file mode 100644 (file)
index 0000000..9dcc332
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_fsfilt.h
+ *
+ * Filesystem interface helper.
+ */
+
+#ifndef _LUSTRE_FSFILT_H
+#define _LUSTRE_FSFILT_H
+
+#include <linux/lustre_fsfilt.h>
+
+#define LU221_BAD_TIME (0x80000000U + 24 * 3600)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h
new file mode 100644 (file)
index 0000000..105f6d6
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_HA_H
+#define _LUSTRE_HA_H
+
+/** \defgroup ha ha
+ *
+ * @{
+ */
+
+struct obd_import;
+struct obd_export;
+struct obd_device;
+struct ptlrpc_request;
+
+
+int ptlrpc_replay(struct obd_import *imp);
+int ptlrpc_resend(struct obd_import *imp);
+void ptlrpc_free_committed(struct obd_import *imp);
+void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_activate_import(struct obd_import *imp);
+void ptlrpc_deactivate_import(struct obd_import *imp);
+void ptlrpc_invalidate_import(struct obd_import *imp);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
+int ptlrpc_check_suspend(void);
+void ptlrpc_activate_timeouts(struct obd_import *imp);
+void ptlrpc_deactivate_timeouts(struct obd_import *imp);
+
+/** @} ha */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h
new file mode 100644 (file)
index 0000000..fcd40f3
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LUSTRE_HANDLES_H_
+#define __LUSTRE_HANDLES_H_
+
+/** \defgroup handles handles
+ *
+ * @{
+ */
+
+#include <linux/lustre_handles.h>
+
+#include <linux/libcfs/libcfs.h>
+
+
+struct portals_handle_ops {
+       void (*hop_addref)(void *object);
+       void (*hop_free)(void *object, int size);
+};
+
+/* These handles are most easily used by having them appear at the very top of
+ * whatever object you want to make handles for, e.g.:
+ *
+ * struct ldlm_lock {
+ *      struct portals_handle handle;
+ *      ...
+ * };
+ *
+ * Now you're able to assign the results of cookie2handle directly to an
+ * ldlm_lock.  If it's not at the top, you'll want to use container_of()
+ * to compute the start of the structure based on the handle field. */
+struct portals_handle {
+       struct list_head                        h_link;
+       __u64                           h_cookie;
+       struct portals_handle_ops       *h_ops;
+
+       /* newly added fields to handle the RCU issue. -jxiong */
+       cfs_rcu_head_t                  h_rcu;
+       spinlock_t                      h_lock;
+       unsigned int                    h_size:31;
+       unsigned int                    h_in:1;
+};
+#define RCU2HANDLE(rcu)    container_of(rcu, struct portals_handle, h_rcu)
+
+/* handles.c */
+
+/* Add a handle to the hash table */
+void class_handle_hash(struct portals_handle *,
+                      struct portals_handle_ops *ops);
+void class_handle_unhash(struct portals_handle *);
+void class_handle_hash_back(struct portals_handle *);
+void *class_handle2object(__u64 cookie);
+void class_handle_free_cb(cfs_rcu_head_t *);
+int class_handle_init(void);
+void class_handle_cleanup(void);
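+
+/* Typical lifecycle (an illustrative sketch; 'obj' is any structure that
+ * embeds a struct portals_handle as described above):
+ *
+ *     class_handle_hash(&obj->handle, &obj_handle_ops);  publish cookie
+ *     ...
+ *     obj = class_handle2object(cookie);   lookup; calls hop_addref()
+ *     ...
+ *     class_handle_unhash(&obj->handle);   withdraw before freeing obj
+ */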
+
+/** @} handles */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h
new file mode 100644 (file)
index 0000000..084bdd6
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/include/lustre_idmap.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_IDMAP_H
+#define _LUSTRE_IDMAP_H
+
+/** \defgroup idmap idmap
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_NGROUPS_PER_BLOCK   ((int)(PAGE_CACHE_SIZE / sizeof(gid_t)))
+
+#define CFS_GROUP_AT(gi, i) \
+       ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK])
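+
+/* For example, with 4 KiB pages and a 4-byte gid_t, CFS_NGROUPS_PER_BLOCK is
+ * 1024, so CFS_GROUP_AT(gi, 1500) reads blocks[1][476]
+ * (1500 / 1024 = 1, 1500 % 1024 = 476). */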
+
+enum {
+       CFS_IC_NOTHING     = 0,    /* convert nothing */
+       CFS_IC_ALL       = 1,    /* convert all items */
+       CFS_IC_MAPPED      = 2,    /* convert mapped uid/gid */
+       CFS_IC_UNMAPPED    = 3     /* convert unmapped uid/gid */
+};
+
+#define  CFS_IDMAP_NOTFOUND     (-1)
+
+#define CFS_IDMAP_HASHSIZE      32
+
+enum lustre_idmap_idx {
+       RMT_UIDMAP_IDX,
+       LCL_UIDMAP_IDX,
+       RMT_GIDMAP_IDX,
+       LCL_GIDMAP_IDX,
+       CFS_IDMAP_N_HASHES
+};
+
+struct lustre_idmap_table {
+       spinlock_t      lit_lock;
+       struct list_head        lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE];
+};
+
+struct lu_ucred;
+
+extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist);
+extern void lustre_groups_sort(group_info_t *group_info);
+extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp);
+
+extern int lustre_idmap_add(struct lustre_idmap_table *t,
+                           uid_t ruid, uid_t luid,
+                           gid_t rgid, gid_t lgid);
+extern int lustre_idmap_del(struct lustre_idmap_table *t,
+                           uid_t ruid, uid_t luid,
+                           gid_t rgid, gid_t lgid);
+extern int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+                                  struct lustre_idmap_table *t,
+                                  int reverse, uid_t uid);
+extern int lustre_idmap_lookup_gid(struct lu_ucred *mu,
+                                  struct lustre_idmap_table *t,
+                                  int reverse, gid_t gid);
+extern struct lustre_idmap_table *lustre_idmap_init(void);
+extern void lustre_idmap_fini(struct lustre_idmap_table *t);
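+
+/* Usage sketch (hypothetical ids; assuming reverse == 0 maps remote to
+ * local; error handling omitted):
+ *
+ *     t = lustre_idmap_init();
+ *     lustre_idmap_add(t, 500, 1000, 500, 1000);      remote 500 -> local 1000
+ *     luid = lustre_idmap_lookup_uid(mu, t, 0, 500);  == 1000
+ *     lustre_idmap_fini(t);
+ */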
+
+/** @} idmap */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h
new file mode 100644 (file)
index 0000000..3a5dd6a
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup obd_import PtlRPC import definitions
+ * Imports are client-side representation of remote obd target.
+ *
+ * @{
+ */
+
+#ifndef __IMPORT_H
+#define __IMPORT_H
+
+/** \defgroup export export
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <lustre/lustre_idl.h>
+
+
+/**
+ * Adaptive Timeout stuff
+ *
+ * @{
+ */
+#define D_ADAPTTO D_OTHER
+#define AT_BINS 4                /* "bin" means "N seconds of history" */
+#define AT_FLG_NOHIST 0x1        /* use last reported value only */
+
+struct adaptive_timeout {
+       time_t          at_binstart;     /* bin start time */
+       unsigned int    at_hist[AT_BINS];    /* timeout history bins */
+       unsigned int    at_flags;
+       unsigned int    at_current;       /* current timeout value */
+       unsigned int    at_worst_ever;       /* worst-ever timeout value */
+       time_t          at_worst_time;       /* worst-ever timeout timestamp */
+       spinlock_t      at_lock;
+};
+
+struct ptlrpc_at_array {
+       struct list_head       *paa_reqs_array; /** array to hold requests */
+       __u32        paa_size;       /** the size of array */
+       __u32        paa_count;      /** the total count of reqs */
+       time_t      paa_deadline;   /** the earliest deadline of reqs */
+       __u32       *paa_reqs_count; /** the count of reqs in each entry */
+};
+
+#define IMP_AT_MAX_PORTALS 8
+struct imp_at {
+       int                  iat_portal[IMP_AT_MAX_PORTALS];
+       struct adaptive_timeout iat_net_latency;
+       struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS];
+};
+
+
+/** @} */
+
+/** Possible import states */
+enum lustre_imp_state {
+       LUSTRE_IMP_CLOSED     = 1,
+       LUSTRE_IMP_NEW  = 2,
+       LUSTRE_IMP_DISCON     = 3,
+       LUSTRE_IMP_CONNECTING = 4,
+       LUSTRE_IMP_REPLAY     = 5,
+       LUSTRE_IMP_REPLAY_LOCKS = 6,
+       LUSTRE_IMP_REPLAY_WAIT  = 7,
+       LUSTRE_IMP_RECOVER    = 8,
+       LUSTRE_IMP_FULL       = 9,
+       LUSTRE_IMP_EVICTED    = 10,
+};
+
+/** Returns a text string representation of numeric import state \a state */
+static inline char *ptlrpc_import_state_name(enum lustre_imp_state state)
+{
+       static char *import_state_names[] = {
+               "<UNKNOWN>", "CLOSED", "NEW", "DISCONN",
+               "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT",
+               "RECOVER", "FULL", "EVICTED",
+       };
+
+       LASSERT(state <= LUSTRE_IMP_EVICTED);
+       return import_state_names[state];
+}
+
+/**
+ * List of import event types
+ */
+enum obd_import_event {
+       IMP_EVENT_DISCON     = 0x808001,
+       IMP_EVENT_INACTIVE   = 0x808002,
+       IMP_EVENT_INVALIDATE = 0x808003,
+       IMP_EVENT_ACTIVE     = 0x808004,
+       IMP_EVENT_OCD   = 0x808005,
+       IMP_EVENT_DEACTIVATE = 0x808006,
+       IMP_EVENT_ACTIVATE   = 0x808007,
+};
+
+/**
+ * Definition of import connection structure
+ */
+struct obd_import_conn {
+       /** Item for linking connections together */
+       struct list_head                oic_item;
+       /** Pointer to actual PortalRPC connection */
+       struct ptlrpc_connection *oic_conn;
+       /** uuid of remote side */
+       struct obd_uuid    oic_uuid;
+       /**
+        * Time (64 bit jiffies) of last connection attempt on this connection
+        */
+       __u64                oic_last_attempt;
+};
+
+/* state history */
+#define IMP_STATE_HIST_LEN 16
+struct import_state_hist {
+       enum lustre_imp_state ish_state;
+       time_t          ish_time;
+};
+
+/**
+ * Definition of the PortalRPC import structure.
+ * Imports represent the client-side view of a remote target.
+ */
+struct obd_import {
+       /** Local handle (== id) for this import. */
+       struct portals_handle     imp_handle;
+       /** Reference counter */
+       atomic_t              imp_refcount;
+       struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
+       /** Currently active connection */
+       struct ptlrpc_connection *imp_connection;
+       /** PortalRPC client structure for this import */
+       struct ptlrpc_client     *imp_client;
+       /** List element for linking into pinger chain */
+       struct list_head                imp_pinger_chain;
+       /** List element for linking into chain for destruction */
+       struct list_head                imp_zombie_chain;
+
+       /**
+        * Lists of requests that are retained for replay, waiting for a reply,
+        * or waiting for recovery to complete, respectively.
+        * @{
+        */
+       struct list_head                imp_replay_list;
+       struct list_head                imp_sending_list;
+       struct list_head                imp_delayed_list;
+       /** @} */
+
+       /** obd device for this import */
+       struct obd_device       *imp_obd;
+
+       /**
+        * some security-related fields
+        * @{
+        */
+       struct ptlrpc_sec       *imp_sec;
+       struct mutex              imp_sec_mutex;
+       cfs_time_t              imp_sec_expire;
+       /** @} */
+
+       /** Wait queue for those who need to wait for recovery completion */
+       wait_queue_head_t              imp_recovery_waitq;
+
+       /** Number of requests currently in-flight */
+       atomic_t              imp_inflight;
+       /** Number of requests currently unregistering */
+       atomic_t              imp_unregistering;
+       /** Number of replay requests inflight */
+       atomic_t              imp_replay_inflight;
+       /** Number of currently happening import invalidations */
+       atomic_t              imp_inval_count;
+       /** Number of request timeouts */
+       atomic_t              imp_timeouts;
+       /** Current import state */
+       enum lustre_imp_state     imp_state;
+       /** History of import states */
+       struct import_state_hist  imp_state_hist[IMP_STATE_HIST_LEN];
+       int                    imp_state_hist_idx;
+       /** Current import generation. Incremented on every reconnect */
+       int                    imp_generation;
+       /** Incremented every time we send reconnection request */
+       __u32                imp_conn_cnt;
+       /**
+        * \see ptlrpc_free_committed remembers imp_generation value here
+        * after a check to save on unnecessary replay list iterations
+        */
+       int                    imp_last_generation_checked;
+       /** Last transno we replayed */
+       __u64                imp_last_replay_transno;
+       /** Last transno committed on remote side */
+       __u64                imp_peer_committed_transno;
+       /**
+        * \see ptlrpc_free_committed remembers last_transno since its last
+        * check here and if last_transno did not change since last run of
+        * ptlrpc_free_committed and import generation is the same, we can
+        * skip looking for requests to remove from replay list as optimisation
+        */
+       __u64                imp_last_transno_checked;
+       /**
+        * Remote export handle. This is how remote side knows what export
+        * we are talking to. Filled from response to connect request
+        */
+       struct lustre_handle      imp_remote_handle;
+       /** When to perform the next ping. Time in jiffies. */
+       cfs_time_t              imp_next_ping;
+       /** When we last successfully connected. Time in 64-bit jiffies */
+       __u64                imp_last_success_conn;
+
+       /** List of all possible connection for import. */
+       struct list_head                imp_conn_list;
+       /**
+        * Current connection. \a imp_connection is imp_conn_current->oic_conn
+        */
+       struct obd_import_conn   *imp_conn_current;
+
+       /** Protects flags, level, generation, conn_cnt, *_list */
+       spinlock_t                imp_lock;
+
+       /* flags */
+       unsigned long        imp_no_timeout:1, /* timeouts are disabled */
+                                 imp_invalid:1,    /* evicted */
+                                 /* administratively disabled */
+                                 imp_deactive:1,
+                                 /* try to recover the import */
+                                 imp_replayable:1,
+                                 /* don't run recovery (timeout instead) */
+                                 imp_dlm_fake:1,
+                                 /* use 1/2 timeout on MDS' OSCs */
+                                 imp_server_timeout:1,
+                                 /* VBR: imp in delayed recovery */
+                                 imp_delayed_recovery:1,
+                                 /* VBR: if gap was found then no lock replays
+                                  */
+                                 imp_no_lock_replay:1,
+                                 /* recovery by versions was failed */
+                                 imp_vbr_failed:1,
+                                 /* force an immediate ping */
+                                 imp_force_verify:1,
+                                 /* force a scheduled ping */
+                                 imp_force_next_verify:1,
+                                 /* pingable */
+                                 imp_pingable:1,
+                                 /* resend for replay */
+                                 imp_resend_replay:1,
+                                 /* disable normal recovery, for test only. */
+                                 imp_no_pinger_recover:1,
+                                 /* need IR MNE swab */
+                                 imp_need_mne_swab:1,
+                                 /* import must be reconnected instead of
+                                  * choosing a new connection */
+                                 imp_force_reconnect:1,
+                                 /* import has tried to connect with server */
+                                 imp_connect_tried:1;
+       __u32                imp_connect_op;
+       struct obd_connect_data   imp_connect_data;
+       __u64                imp_connect_flags_orig;
+       int                    imp_connect_error;
+
+       __u32                imp_msg_magic;
+       __u32                imp_msghdr_flags;       /* adjusted based on server capability */
+
+       struct ptlrpc_request_pool *imp_rq_pool;          /* emergency request pool */
+
+       struct imp_at        imp_at;             /* adaptive timeout data */
+       time_t              imp_last_reply_time;    /* for health check */
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+                                   int event, void *event_arg, void *cb_data);
+
+/**
+ * Structure for import observer.
+ * It is possible to register an "observer" on an import; every time
+ * something happens to the import (like connect/evict/disconnect) the
+ * observer's callback is called with the event type.
+ */
+struct obd_import_observer {
+       struct list_head           oio_chain;
+       obd_import_callback  oio_cb;
+       void            *oio_cb_data;
+};
+
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+                         void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+                           void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+                                  void *event_arg);
+
+/* import.c */
+static inline unsigned int at_est2timeout(unsigned int val)
+{
+       /* add an arbitrary minimum: 125% +5 sec */
+       return (val + (val >> 2) + 5);
+}
+
+static inline unsigned int at_timeout2est(unsigned int val)
+{
+       /* restore estimate value from timeout: e=4/5(t-5) */
+       LASSERT(val);
+       return (max((val << 2) / 5, 5U) - 4);
+}
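+
+/* Worked example: at_est2timeout(20) = 20 + 20/4 + 5 = 30, and
+ * at_timeout2est(30) = max((30 << 2) / 5, 5) - 4 = 24 - 4 = 20, so the two
+ * conversions round-trip for typical estimates. */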
+
+static inline void at_reset(struct adaptive_timeout *at, int val) {
+       at->at_current = val;
+       at->at_worst_ever = val;
+       at->at_worst_time = cfs_time_current_sec();
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+       memset(at, 0, sizeof(*at));
+       spin_lock_init(&at->at_lock);
+       at->at_flags = flags;
+       at_reset(at, val);
+}
+extern unsigned int at_min;
+static inline int at_get(struct adaptive_timeout *at) {
+       return (at->at_current > at_min) ? at->at_current : at_min;
+}
+int at_measured(struct adaptive_timeout *at, unsigned int val);
+int import_at_get_index(struct obd_import *imp, int portal);
+extern unsigned int at_max;
+#define AT_OFF (at_max == 0)
+
+/* genops.c */
+struct obd_export;
+extern struct obd_import *class_exp2cliimp(struct obd_export *);
+extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
+
+/** @} import */
+
+#endif /* __IMPORT_H */
+
+/** @} obd_import */
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
new file mode 100644 (file)
index 0000000..bdfc539
--- /dev/null
@@ -0,0 +1,667 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_lib.h
+ *
+ * Basic Lustre library routines.
+ */
+
+#ifndef _LUSTRE_LIB_H
+#define _LUSTRE_LIB_H
+
+/** \defgroup lib lib
+ *
+ * @{
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ver.h>
+#include <lustre_cfg.h>
+#include <linux/lustre_lib.h>
+
+/* target.c */
+struct ptlrpc_request;
+struct obd_export;
+struct lu_target;
+struct l_wait_info;
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lvfs.h>
+
+
+int target_pack_pool_reply(struct ptlrpc_request *req);
+int do_set_info_async(struct obd_import *imp,
+                     int opcode, int version,
+                     obd_count keylen, void *key,
+                     obd_count vallen, void *val,
+                     struct ptlrpc_request_set *set);
+
+#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */
+#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
+
+/* client.c */
+
+int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+struct client_obd *client_conn2cli(struct lustre_handle *conn);
+
+struct md_open_data;
+struct obd_client_handle {
+       struct lustre_handle  och_fh;
+       struct lu_fid    och_fid;
+       struct md_open_data  *och_mod;
+       __u32 och_magic;
+       int och_flags;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
+
+/* statfs_pack.c */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs);
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs);
+
+/* l_lock.c */
+struct lustre_lock {
+       int                     l_depth;
+       task_t          *l_owner;
+       struct semaphore        l_sem;
+       spinlock_t              l_spin;
+};
+
+void l_lock_init(struct lustre_lock *);
+void l_lock(struct lustre_lock *);
+void l_unlock(struct lustre_lock *);
+int l_has_lock(struct lustre_lock *);
+
+/*
+ * For md echo client
+ */
+enum md_echo_cmd {
+       ECHO_MD_CREATE       = 1, /* Open/Create file on MDT */
+       ECHO_MD_MKDIR   = 2, /* Mkdir on MDT */
+       ECHO_MD_DESTROY      = 3, /* Unlink file on MDT */
+       ECHO_MD_RMDIR   = 4, /* Rmdir on MDT */
+       ECHO_MD_LOOKUP       = 5, /* Lookup on MDT */
+       ECHO_MD_GETATTR      = 6, /* Getattr on MDT */
+       ECHO_MD_SETATTR      = 7, /* Setattr on MDT */
+       ECHO_MD_ALLOC_FID    = 8, /* Get FIDs from MDT */
+};
+
+/*
+ *   OBD IOCTLS
+ */
+#define OBD_IOCTL_VERSION 0x00010004
+
+struct obd_ioctl_data {
+       __u32 ioc_len;
+       __u32 ioc_version;
+
+       union {
+               __u64 ioc_cookie;
+               __u64 ioc_u64_1;
+       };
+       union {
+               __u32 ioc_conn1;
+               __u32 ioc_u32_1;
+       };
+       union {
+               __u32 ioc_conn2;
+               __u32 ioc_u32_2;
+       };
+
+       struct obdo ioc_obdo1;
+       struct obdo ioc_obdo2;
+
+       obd_size ioc_count;
+       obd_off  ioc_offset;
+       __u32    ioc_dev;
+       __u32    ioc_command;
+
+       __u64 ioc_nid;
+       __u32 ioc_nal;
+       __u32 ioc_type;
+
+       /* buffers the kernel will treat as user pointers */
+       __u32  ioc_plen1;
+       char  *ioc_pbuf1;
+       __u32  ioc_plen2;
+       char  *ioc_pbuf2;
+
+       /* inline buffers for various arguments */
+       __u32  ioc_inllen1;
+       char  *ioc_inlbuf1;
+       __u32  ioc_inllen2;
+       char  *ioc_inlbuf2;
+       __u32  ioc_inllen3;
+       char  *ioc_inlbuf3;
+       __u32  ioc_inllen4;
+       char  *ioc_inlbuf4;
+
+       char    ioc_bulk[0];
+};
+
+struct obd_ioctl_hdr {
+       __u32 ioc_len;
+       __u32 ioc_version;
+};
+
+static inline int obd_ioctl_packlen(struct obd_ioctl_data *data)
+{
+       int len = cfs_size_round(sizeof(struct obd_ioctl_data));
+       len += cfs_size_round(data->ioc_inllen1);
+       len += cfs_size_round(data->ioc_inllen2);
+       len += cfs_size_round(data->ioc_inllen3);
+       len += cfs_size_round(data->ioc_inllen4);
+       return len;
+}
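+
+/* Worked example (illustrative lengths), assuming cfs_size_round() rounds up
+ * to a multiple of 8: with ioc_inllen1 = 13 and the other three lengths 0,
+ * packlen = round8(sizeof(struct obd_ioctl_data)) + 16, since round8(13) = 16
+ * and the zero lengths round to 0. */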
+
+
+static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data)
+{
+       if (data->ioc_len > (1<<30)) {
+               CERROR("OBD ioctl: ioc_len larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen1 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen2 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen3 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inllen4 > (1<<30)) {
+               CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+               CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+               CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf3 && !data->ioc_inllen3) {
+               CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_inlbuf4 && !data->ioc_inllen4) {
+               CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf1 && !data->ioc_plen1) {
+               CERROR("OBD ioctl: pbuf1 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_pbuf2 && !data->ioc_plen2) {
+               CERROR("OBD ioctl: pbuf2 pointer but 0 length\n");
+               return 1;
+       }
+       if (data->ioc_plen1 && !data->ioc_pbuf1) {
+               CERROR("OBD ioctl: plen1 set but NULL pointer\n");
+               return 1;
+       }
+       if (data->ioc_plen2 && !data->ioc_pbuf2) {
+               CERROR("OBD ioctl: plen2 set but NULL pointer\n");
+               return 1;
+       }
+       if (obd_ioctl_packlen(data) > data->ioc_len) {
+               CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n",
+                      obd_ioctl_packlen(data), data->ioc_len);
+               return 1;
+       }
+       return 0;
+}
+
+
+#include <obd_support.h>
+
+/* function defined in lustre/obdclass/<platform>/<platform>-module.c */
+int obd_ioctl_getdata(char **buf, int *len, void *arg);
+int obd_ioctl_popdata(void *arg, void *data, int len);
+
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+       ENTRY;
+
+       OBD_FREE_LARGE(buf, len);
+       EXIT;
+       return;
+}
+
+/*
+ * BSD ioctl description:
+ * #define IOC_V1       _IOR(g, n1, long)
+ * #define IOC_V2       _IOW(g, n2, long)
+ *
+ * ioctl(f, IOC_V1, arg);
+ * arg will be treated as a long value,
+ *
+ * ioctl(f, IOC_V2, arg)
+ * arg will be treated as a pointer, bsd will call
+ * copyin(buf, arg, sizeof(long))
+ *
+ * To make the BSD ioctl handle arguments correctly and simply,
+ * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data
+ * for us. Does this change affect Linux?  (XXX Liang)
+ */
+#define OBD_IOC_CREATE          _IOWR('f', 101, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DESTROY                _IOW ('f', 104, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PREALLOCATE        _IOWR('f', 105, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_SETATTR                _IOW ('f', 107, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETATTR                _IOWR ('f', 108, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ              _IOWR('f', 109, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_WRITE            _IOWR('f', 110, OBD_IOC_DATA_TYPE)
+
+
+#define OBD_IOC_STATFS          _IOWR('f', 113, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SYNC              _IOW ('f', 114, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_READ2            _IOWR('f', 115, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FORMAT          _IOWR('f', 116, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARTITION            _IOWR('f', 117, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_COPY              _IOWR('f', 120, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_MIGR              _IOWR('f', 121, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PUNCH            _IOWR('f', 122, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_MODULE_DEBUG      _IOWR('f', 124, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_READ              _IOWR('f', 125, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_BRW_WRITE            _IOWR('f', 126, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_NAME2DEV              _IOWR('f', 127, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_UUID2DEV              _IOWR('f', 130, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GETNAME                _IOWR('f', 131, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETMDNAME            _IOR('f', 131, char[MAX_OBD_NAME])
+#define OBD_IOC_GETDTNAME             OBD_IOC_GETNAME
+
+#define OBD_IOC_LOV_GET_CONFIG  _IOWR('f', 132, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLIENT_RECOVER  _IOW ('f', 133, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PING_TARGET        _IOW ('f', 136, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
+#define OBD_IOC_NO_TRANSNO          _IOW ('f', 140, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_SET_READONLY      _IOW ('f', 141, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ABORT_RECOVERY  _IOR ('f', 142, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_ROOT_SQUASH        _IOWR('f', 143, OBD_IOC_DATA_TYPE)
+
+#define OBD_GET_VERSION                _IOWR ('f', 144, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GSS_SUPPORT        _IOWR('f', 145, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CLOSE_UUID          _IOWR ('f', 147, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CHANGELOG_SEND  _IOW ('f', 148, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_GETDEVICE            _IOWR ('f', 149, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_FID2PATH              _IOWR ('f', 150, OBD_IOC_DATA_TYPE)
+/* see also <lustre/lustre_user.h> for ioctls 151-153 */
+/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */
+#define OBD_IOC_LOV_SETSTRIPE    _IOW ('f', 154, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */
+#define OBD_IOC_LOV_GETSTRIPE    _IOW ('f', 155, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */
+#define OBD_IOC_LOV_SETEA            _IOW ('f', 156, OBD_IOC_DATA_TYPE)
+/* see <lustre/lustre_user.h> for ioctls 157-159 */
+/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */
+#define OBD_IOC_QUOTACHECK          _IOW ('f', 160, int)
+/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */
+#define OBD_IOC_POLL_QUOTACHECK        _IOR ('f', 161, struct if_quotacheck *)
+/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */
+#define OBD_IOC_QUOTACTL              _IOWR('f', 162, struct if_quotactl)
+/* see  also <lustre/lustre_user.h> for ioctls 163-176 */
+#define OBD_IOC_CHANGELOG_REG    _IOW ('f', 177, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_DEREG        _IOW ('f', 178, struct obd_ioctl_data)
+#define OBD_IOC_CHANGELOG_CLEAR        _IOW ('f', 179, struct obd_ioctl_data)
+#define OBD_IOC_RECORD          _IOWR('f', 180, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_ENDRECORD            _IOWR('f', 181, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARSE            _IOWR('f', 182, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DORECORD              _IOWR('f', 183, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PROCESS_CFG        _IOWR('f', 184, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_DUMP_LOG              _IOWR('f', 185, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_CLEAR_LOG            _IOWR('f', 186, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PARAM            _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL              _IOWR('f', 188, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_REPLACE_NIDS      _IOWR('f', 189, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_CATLOGLIST          _IOWR('f', 190, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_INFO            _IOWR('f', 191, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_PRINT          _IOWR('f', 192, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CANCEL        _IOWR('f', 193, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_REMOVE        _IOWR('f', 194, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_LLOG_CHECK          _IOWR('f', 195, OBD_IOC_DATA_TYPE)
+/* OBD_IOC_LLOG_CATINFO is deprecated */
+#define OBD_IOC_LLOG_CATINFO      _IOWR('f', 196, OBD_IOC_DATA_TYPE)
+
+#define ECHO_IOC_GET_STRIPE        _IOWR('f', 200, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_SET_STRIPE        _IOWR('f', 201, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_ENQUEUE              _IOWR('f', 202, OBD_IOC_DATA_TYPE)
+#define ECHO_IOC_CANCEL                _IOWR('f', 203, OBD_IOC_DATA_TYPE)
+
+#define OBD_IOC_GET_OBJ_VERSION        _IOR('f', 210, OBD_IOC_DATA_TYPE)
+
+/* <lustre/lustre_user.h> defines ioctl number 218-219 */
+#define OBD_IOC_GET_MNTOPT          _IOW('f', 220, mntopt_t)
+
+#define OBD_IOC_ECHO_MD                _IOR('f', 221, struct obd_ioctl_data)
+#define OBD_IOC_ECHO_ALLOC_SEQ  _IOWR('f', 222, struct obd_ioctl_data)
+
+#define OBD_IOC_START_LFSCK           _IOWR('f', 230, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_STOP_LFSCK            _IOW('f', 231, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_PAUSE_LFSCK           _IOW('f', 232, OBD_IOC_DATA_TYPE)
+
+/* XXX _IOWR('f', 250, long) has been defined in
+ * libcfs/include/libcfs/libcfs_private.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
+
+/* Special values for remove LOV EA from disk */
+#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \
+                                                offset == (typeof(offset))(-1))
+
+/* #define POISON_BULK 0 */
+
+/*
+ * l_wait_event is a flexible sleeping function, permitting simple caller
+ * configuration of interrupt and timeout sensitivity along with actions to
+ * be performed in the event of either exception.
+ *
+ * The first form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler,
+ *                                        intr_handler, callback_data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * l_wait_event() makes the current process wait on 'waitq' until 'condition'
+ * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending.  It
+ * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before
+ * 'condition' becomes true, it optionally calls the specified 'intr_handler'
+ * if not NULL, and returns -EINTR.
+ *
+ * If a non-zero timeout is specified, signals are ignored until the timeout
+ * has expired.  At this time, if 'timeout_handler' is not NULL it is called.
+ * If it returns FALSE l_wait_event() continues to wait as described above with
+ * signals enabled.  Otherwise it returns -ETIMEDOUT.
+ *
+ * LWI_INTR(intr_handler, callback_data) is shorthand for
+ * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data)
+ *
+ * The second form of usage looks like this:
+ *
+ * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ *
+ * This form is the same as the first except that it COMPLETELY IGNORES
+ * SIGNALS.  The caller must therefore beware that if 'timeout' is zero, or if
+ * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
+ * can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *                                            timeout_handler);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as the previous case, but the condition is checked once
+ * every 'interval' jiffies (if non-zero).
+ *
+ * Subtle synchronization point: this macro does *not* necessarily take the
+ * wait-queue spin-lock before returning and, hence, the following idiom is
+ * safe ONLY when the caller provides some external locking:
+ *
+ *          Thread1                        Thread2
+ *
+ *   l_wait_event(&obj->wq, ....);                                    (1)
+ *
+ *                                 wake_up(&obj->wq):           (2)
+ *                                      spin_lock(&q->lock);     (2.1)
+ *                                      __wake_up_common(q, ...);     (2.2)
+ *                                      spin_unlock(&q->lock, flags); (2.3)
+ *
+ *   OBD_FREE_PTR(obj);                                                  (3)
+ *
+ * As l_wait_event() may "short-cut" execution and return without taking
+ * wait-queue spin-lock, some additional synchronization is necessary to
+ * guarantee that step (3) can begin only after (2.3) finishes.
+ *
+ * XXX nikita: some ptlrpc daemon threads have races of that sort.
+ *
+ */
+static inline int back_to_sleep(void *arg)
+{
+       return 0;
+}
+
+#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
+
+struct l_wait_info {
+       cfs_duration_t lwi_timeout;
+       cfs_duration_t lwi_interval;
+       int         lwi_allow_intr;
+       int  (*lwi_on_timeout)(void *);
+       void (*lwi_on_signal)(void *);
+       void  *lwi_cb_data;
+};
+
+/* NB: LWI_TIMEOUT ignores signals completely */
+#define LWI_TIMEOUT(time, cb, data)         \
+((struct l_wait_info) {                         \
+       .lwi_timeout    = time,          \
+       .lwi_on_timeout = cb,              \
+       .lwi_cb_data    = data,          \
+       .lwi_interval   = 0,                \
+       .lwi_allow_intr = 0                  \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data)  \
+((struct l_wait_info) {                                 \
+       .lwi_timeout    = time,                  \
+       .lwi_on_timeout = cb,                      \
+       .lwi_cb_data    = data,                  \
+       .lwi_interval   = interval,                  \
+       .lwi_allow_intr = 0                          \
+})
+
+#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data)   \
+((struct l_wait_info) {                                 \
+       .lwi_timeout    = time,                  \
+       .lwi_on_timeout = time_cb,                    \
+       .lwi_on_signal  = sig_cb,                      \
+       .lwi_cb_data    = data,                  \
+       .lwi_interval   = 0,                        \
+       .lwi_allow_intr = 0                          \
+})
+
+#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data)       \
+((struct l_wait_info) {                                         \
+       .lwi_timeout    = time,                          \
+       .lwi_on_timeout = time_cb,                            \
+       .lwi_on_signal  = sig_cb,                              \
+       .lwi_cb_data    = data,                          \
+       .lwi_interval   = 0,                                \
+       .lwi_allow_intr = 1                                  \
+})
+
+#define LWI_INTR(cb, data)  LWI_TIMEOUT_INTR(0, NULL, cb, data)
+
+
+/*
+ * Wait for @condition to become true, but no longer than the timeout
+ * specified by @info.
+ */
+#define __l_wait_event(wq, condition, info, ret, l_add_wait)              \
+do {                                                                      \
+       wait_queue_t __wait;                                             \
+       cfs_duration_t __timeout = info->lwi_timeout;                     \
+       sigset_t   __blocked;                                         \
+       int   __allow_intr = info->lwi_allow_intr;                           \
+                                                                              \
+       ret = 0;                                                               \
+       if (condition)                                                   \
+               break;                                                   \
+                                                                              \
+       init_waitqueue_entry_current(&__wait);                                      \
+       l_add_wait(&wq, &__wait);                                             \
+                                                                              \
+       /* Block all signals (just the non-fatal ones if no timeout). */       \
+       if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr))   \
+               __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);             \
+       else                                                               \
+               __blocked = cfs_block_sigsinv(0);                             \
+                                                                              \
+       for (;;) {                                                           \
+               unsigned       __wstate;                                       \
+                                                                              \
+               __wstate = info->lwi_on_signal != NULL &&                     \
+                          (__timeout == 0 || __allow_intr) ?             \
+                       TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;             \
+                                                                              \
+               set_current_state(TASK_INTERRUPTIBLE);           \
+                                                                              \
+               if (condition)                                           \
+                       break;                                           \
+                                                                              \
+               if (__timeout == 0) {                                     \
+                       waitq_wait(&__wait, __wstate);               \
+               } else {                                                       \
+                       cfs_duration_t interval = info->lwi_interval ?         \
+                                            min_t(cfs_duration_t,             \
+                                                  info->lwi_interval,         \
+                                                  __timeout) :                \
+                                            __timeout;                        \
+                       cfs_duration_t remaining = waitq_timedwait(&__wait,\
+                                                  __wstate,               \
+                                                  interval);             \
+                       __timeout = cfs_time_sub(__timeout,                 \
+                                           cfs_time_sub(interval, remaining));\
+                       if (__timeout == 0) {                             \
+                               if (info->lwi_on_timeout == NULL ||         \
+                                   info->lwi_on_timeout(info->lwi_cb_data)) { \
+                                       ret = -ETIMEDOUT;                     \
+                                       break;                           \
+                               }                                             \
+                               /* Take signals after the timeout expires. */  \
+                               if (info->lwi_on_signal != NULL)               \
+                                   (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\
+                       }                                                     \
+               }                                                             \
+                                                                              \
+               if (condition)                                           \
+                       break;                                           \
+               if (cfs_signal_pending()) {                                 \
+                       if (info->lwi_on_signal != NULL &&                   \
+                           (__timeout == 0 || __allow_intr)) {         \
+                               if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \
+                                       info->lwi_on_signal(info->lwi_cb_data);\
+                               ret = -EINTR;                             \
+                               break;                                   \
+                       }                                                     \
+                       /* We have to do this here because some signals */     \
+                       /* are not blockable, e.g. those from strace(1).*/     \
+                       /* In these cases we want to schedule_timeout() */     \
+                       /* again, because we don't want that to return  */     \
+                       /* -EINTR when the RPC actually succeeded.      */     \
+                       /* The recalc_sigpending() done below will      */     \
+                       /* deliver the signal properly.                 */     \
+                       cfs_clear_sigpending();                         \
+               }                                                             \
+       }                                                                     \
+                                                                              \
+       cfs_restore_sigs(__blocked);                                       \
+                                                                              \
+       set_current_state(TASK_RUNNING);                               \
+       remove_wait_queue(&wq, &__wait);                                           \
+} while (0)
+
+#define l_wait_event(wq, condition, info)                     \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue);             \
+       __ret;                                            \
+})
+
+#define l_wait_event_exclusive(wq, condition, info)         \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue_exclusive);         \
+       __ret;                                            \
+})
+
+#define l_wait_event_exclusive_head(wq, condition, info)       \
+({                                                           \
+       int              __ret;                       \
+       struct l_wait_info *__info = (info);                \
+                                                               \
+       __l_wait_event(wq, condition, __info,              \
+                      __ret, add_wait_queue_exclusive_head);    \
+       __ret;                                            \
+})
+
+#define l_wait_condition(wq, condition)                         \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event(wq, condition, &lwi);                    \
+})
+
+#define l_wait_condition_exclusive(wq, condition)             \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event_exclusive(wq, condition, &lwi);        \
+})
+
+#define l_wait_condition_exclusive_head(wq, condition)   \
+({                                                           \
+       struct l_wait_info lwi = { 0 };                  \
+       l_wait_event_exclusive_head(wq, condition, &lwi);       \
+})
+
+#define LIBLUSTRE_CLIENT (0)
+
+/** @} lib */
+
+#endif /* _LUSTRE_LIB_H */
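[Illustration only, not part of the patch: a minimal sketch of combining the
LWI_* initializers above with l_wait_event(). The example_dev type, its waitq
and ready members, and example_on_timeout are hypothetical names.]

    /* Hedged sketch: wait up to 30 seconds for dev->ready to become true. */
    static int example_on_timeout(void *data)
    {
            /* Returning nonzero makes l_wait_event() fail with -ETIMEDOUT. */
            return 1;
    }

    static int example_wait_ready(struct example_dev *dev)
    {
            struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(30),
                                                 example_on_timeout, dev);

            /* 0 once dev->ready is true, -ETIMEDOUT if the timeout fires. */
            return l_wait_event(dev->waitq, dev->ready, &lwi);
    }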
diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h
new file mode 100644 (file)
index 0000000..5790be9
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: di wang <di.wang@intel.com>
+ */
+
+struct linkea_data {
+       /**
+        * Buffer to keep link EA body.
+        */
+       struct lu_buf           *ld_buf;
+       /**
+        * The matched header, entry and its length in the EA
+        */
+       struct link_ea_header   *ld_leh;
+       struct link_ea_entry    *ld_lee;
+       int                     ld_reclen;
+};
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf);
+int linkea_init(struct linkea_data *ldata);
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+                        struct lu_name *lname, struct lu_fid *pfid);
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+                  const struct lu_fid *pfid);
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname);
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+                     const struct lu_fid  *pfid);
+
+#define LINKEA_NEXT_ENTRY(ldata)       \
+       ((struct link_ea_entry *)((char *)(ldata).ld_lee + (ldata).ld_reclen))
+
+#define LINKEA_FIRST_ENTRY(ldata)      \
+       ((struct link_ea_entry *)((ldata).ld_leh + 1))
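[Illustration only, not part of the patch: a hedged sketch of walking link EA
entries with the macros above; the leh_reccount field and the lu_name/lu_fid
types are assumptions taken from the wider Lustre tree.]

    /* Visit every (name, parent fid) link in an initialized linkea. */
    static void example_walk_links(struct linkea_data ldata)
    {
            struct lu_name name;
            struct lu_fid pfid;
            int i;

            ldata.ld_lee = LINKEA_FIRST_ENTRY(ldata);
            for (i = 0; i < ldata.ld_leh->leh_reccount; i++) {
                    linkea_entry_unpack(ldata.ld_lee, &ldata.ld_reclen,
                                        &name, &pfid);
                    /* name/pfid now describe one parent-directory link */
                    ldata.ld_lee = LINKEA_NEXT_ENTRY(ldata);
            }
    }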
diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h
new file mode 100644 (file)
index 0000000..25f8bfa
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LL_H
+#define _LL_H
+
+/** \defgroup lite lite
+ *
+ * @{
+ */
+
+#include <linux/lustre_lite.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre_net.h>
+#include <lustre_mds.h>
+#include <lustre_ha.h>
+
+/* 4UL * 1024 * 1024 */
+#define LL_MAX_BLKSIZE_BITS     (22)
+#define LL_MAX_BLKSIZE         (1UL << LL_MAX_BLKSIZE_BITS)
+
+#include <lustre/lustre_user.h>
+
+struct lustre_rw_params {
+       int             lrp_lock_mode;
+       ldlm_policy_data_t lrp_policy;
+       obd_flag           lrp_brw_flags;
+       int             lrp_ast_flags;
+};
+
+/*
+ * XXX nikita: this function lives in the header because it is used by both
+ * the llite kernel module and the liblustre library, and there is no (?)
+ * better place to put it.
+ */
+static inline void lustre_build_lock_params(int cmd, unsigned long open_flags,
+                                           __u64 connect_flags,
+                                           loff_t pos, ssize_t len,
+                                           struct lustre_rw_params *params)
+{
+       params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW;
+       params->lrp_brw_flags = 0;
+
+       params->lrp_policy.l_extent.start = pos;
+       params->lrp_policy.l_extent.end = pos + len - 1;
+       /*
+        * for now O_APPEND always takes local locks.
+        */
+       if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) {
+               params->lrp_policy.l_extent.start = 0;
+               params->lrp_policy.l_extent.end   = OBD_OBJECT_EOF;
+       } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) {
+               /*
+                * liblustre: OST-side locking for all non-O_APPEND
+                * reads/writes.
+                */
+               params->lrp_lock_mode = LCK_NL;
+               params->lrp_brw_flags = OBD_BRW_SRVLOCK;
+       } else {
+               /*
+                * nothing special for the kernel. In the future llite may use
+                * OST-side locks for small writes into highly contended
+                * files.
+                */
+       }
+       params->lrp_ast_flags = (open_flags & O_NONBLOCK) ?
+               LDLM_FL_BLOCK_NOWAIT : 0;
+}
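[Illustration only, not part of the patch: a hedged sketch of a read path
filling lustre_rw_params with the helper above; open_flags and connect_flags
are assumed to come from the open file and the import, respectively.]

    struct lustre_rw_params params;

    /* Lock parameters for a 1 MiB read starting at offset pos. */
    lustre_build_lock_params(OBD_BRW_READ, open_flags, connect_flags,
                             pos, 1024 * 1024, &params);
    /* For a plain kernel read: params.lrp_lock_mode == LCK_PR and
     * params.lrp_policy covers the extent [pos, pos + 1048575]. */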
+
+/*
+ * This is embedded into liblustre and llite super-blocks to keep track of
+ * connect flags (capabilities) supported by all imports given mount is
+ * connected to.
+ */
+struct lustre_client_ocd {
+       /*
+        * This is conjunction of connect_flags across all imports (LOVs) this
+        * mount is connected to. This field is updated by cl_ocd_update()
+        * under ->lco_lock.
+        */
+       __u64         lco_flags;
+       struct mutex       lco_lock;
+       struct obd_export *lco_md_exp;
+       struct obd_export *lco_dt_exp;
+};
+
+/*
+ * Chain of hash overflow pages.
+ */
+struct ll_dir_chain {
+       /* XXX something. Later */
+};
+
+static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
+{
+}
+
+static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
+{
+}
+
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+       if (BITS_PER_LONG == 32 && hash64)
+               hash >>= 32;
+       return ~0UL - hash;
+}
+
+/** @} lite */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h
new file mode 100644 (file)
index 0000000..714ab37
--- /dev/null
@@ -0,0 +1,576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_log.h
+ *
+ * Generic infrastructure for managing a collection of logs.
+ * These logs are used for:
+ *
+ * - orphan recovery: OST adds record on create
+ * - mtime/size consistency: the OST adds a record on first write
+ * - open/unlinked objects: OST adds a record on destroy
+ *
+ * - mds unlink log: the MDS adds an entry upon delete
+ *
+ * - raid1 replication log between OST's
+ * - MDS replication logs
+ */
+
+#ifndef _LUSTRE_LOG_H
+#define _LUSTRE_LOG_H
+
+/** \defgroup log log
+ *
+ * @{
+ */
+
+#include <linux/lustre_log.h>
+
+#include <obd_class.h>
+#include <obd_ost.h>
+#include <lustre/lustre_idl.h>
+#include <dt_object.h>
+
+#define LOG_NAME_LIMIT(logname, name)             \
+       snprintf(logname, sizeof(logname), "LOGS/%s", name)
+#define LLOG_EEMPTY 4711
+
+enum llog_open_param {
+       LLOG_OPEN_EXISTS        = 0x0000,
+       LLOG_OPEN_NEW           = 0x0001,
+};
+
+struct plain_handle_data {
+       struct list_head          phd_entry;
+       struct llog_handle *phd_cat_handle;
+       struct llog_cookie  phd_cookie; /* cookie of this log in its cat */
+};
+
+struct cat_handle_data {
+       struct list_head              chd_head;
+       struct llog_handle     *chd_current_log; /* currently open log */
+       struct llog_handle      *chd_next_log; /* llog to be used next */
+};
+
+static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid)
+{
+       /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS)
+        * logids by a non-zero ogen (inode generation) and convert them
+        * into IGIFs. */
+       if (id->lgl_ogen == 0) {
+               fid->f_seq = id->lgl_oi.oi.oi_seq;
+               fid->f_oid = id->lgl_oi.oi.oi_id;
+               fid->f_ver = 0;
+       } else {
+               lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen);
+       }
+}
+
+static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id)
+{
+       id->lgl_oi.oi.oi_seq = fid->f_seq;
+       id->lgl_oi.oi.oi_id = fid->f_oid;
+       id->lgl_ogen = 0;
+}
+
+static inline void logid_set_id(struct llog_logid *log_id, __u64 id)
+{
+       log_id->lgl_oi.oi.oi_id = id;
+}
+
+static inline __u64 logid_id(struct llog_logid *log_id)
+{
+       return log_id->lgl_oi.oi.oi_id;
+}
+
+struct llog_handle;
+
+/* llog.c  -  general API */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+                    int flags, struct obd_uuid *uuid);
+int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh,
+                     struct llog_rec_hdr *rec, void *data);
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+                llog_cb_t cb, void *data, void *catdata);
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_reverse_process(const struct lu_env *env,
+                        struct llog_handle *loghandle, llog_cb_t cb,
+                        void *data, void *catdata);
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+                   int index);
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+             struct llog_handle **lgh, struct llog_logid *logid,
+             char *name, enum llog_open_param open_param);
+int llog_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_get_size(struct llog_handle *loghandle);
+
+/* llog_process flags */
+#define LLOG_FLAG_NODEAMON 0x0001
+
+/* llog_cat.c - catalog api */
+struct llog_process_data {
+       /**
+        * Any useful data needed while processing catalog. This is
+        * passed later to process callback.
+        */
+       void            *lpd_data;
+       /**
+        * Catalog process callback function, called for each record
+        * in catalog.
+        */
+       llog_cb_t           lpd_cb;
+       /**
+        * Start processing the catalog from startcat/startidx
+        */
+       int               lpd_startcat;
+       int               lpd_startidx;
+};
+
+struct llog_process_cat_data {
+       /**
+        * Temporary stored first_idx while scanning log.
+        */
+       int               lpcd_first_idx;
+       /**
+        * Temporary stored last_idx while scanning log.
+        */
+       int               lpcd_last_idx;
+};
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle);
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                    void *buf, struct thandle *th);
+int llog_cat_declare_add_rec(const struct lu_env *env,
+                            struct llog_handle *cathandle,
+                            struct llog_rec_hdr *rec, struct thandle *th);
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+                struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                void *buf);
+int llog_cat_cancel_records(const struct lu_env *env,
+                           struct llog_handle *cathandle, int count,
+                           struct llog_cookie *cookies);
+int llog_cat_process_or_fork(const struct lu_env *env,
+                            struct llog_handle *cat_llh, llog_cb_t cb,
+                            void *data, int startcat, int startidx, bool fork);
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+                    llog_cb_t cb, void *data, int startcat, int startidx);
+int llog_cat_reverse_process(const struct lu_env *env,
+                            struct llog_handle *cat_llh, llog_cb_t cb,
+                            void *data);
+int llog_cat_init_and_process(const struct lu_env *env,
+                             struct llog_handle *llh);
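[Illustration only, not part of the patch: a hedged sketch of a record
callback and a catalog scan; the llog_cb_t signature is assumed from the
wider tree, and env/cat_llh are assumed to exist in the caller.]

    /* Count the records reachable from a catalog. */
    static int example_count_cb(const struct lu_env *env,
                                struct llog_handle *llh,
                                struct llog_rec_hdr *rec, void *data)
    {
            (*(int *)data)++;
            return 0;       /* a nonzero return stops the scan early */
    }

    int count = 0;
    int rc = llog_cat_process(env, cat_llh, example_count_cb, &count, 0, 0);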
+
+/* llog_obd.c */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+              struct obd_llog_group *olg, int index,
+              struct obd_device *disk_obd, struct llog_operations *op);
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt);
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *);
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags);
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+                struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                struct llog_cookie *logcookies, int numcookies);
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+               struct lov_stripe_md *lsm, int count,
+               struct llog_cookie *cookies, int flags);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *idx);
+
+int obd_llog_finish(struct obd_device *obd, int count);
+
+/* llog_ioctl.c */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+              struct obd_ioctl_data *data);
+
+/* llog_net.c */
+int llog_initiator_connect(struct llog_ctxt *ctxt);
+
+struct llog_operations {
+       int (*lop_destroy)(const struct lu_env *env,
+                          struct llog_handle *handle);
+       int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
+                             int *curr_idx, int next_idx, __u64 *offset,
+                             void *buf, int len);
+       int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
+                             int prev_idx, void *buf, int len);
+       int (*lop_read_header)(const struct lu_env *env,
+                              struct llog_handle *handle);
+       int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
+                        struct obd_llog_group *olg, int ctxt_idx,
+                        struct obd_device *disk_obd);
+       int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
+                       int flags);
+       int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
+       int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
+                         struct lov_stripe_md *lsm, int count,
+                         struct llog_cookie *cookies, int flags);
+       int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
+                          struct llog_gen *gen, struct obd_uuid *uuid);
+       /**
+        * Any llog file must be opened first using llog_open().  A llog can
+        * be opened by name, by logid, or by neither, in which case a new
+        * logid will be generated.
+        */
+       int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
+                       struct llog_logid *logid, char *name,
+                       enum llog_open_param);
+       /**
+        * An opened llog may not exist yet; where needed, this must be
+        * checked using the llog_exist() call.
+        */
+       int (*lop_exist)(struct llog_handle *lgh);
+       /**
+        * Close the llog file; llog_free_handle() is called implicitly.
+        * Any opened llog must be closed by a llog_close() call.
+        */
+       int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
+       /**
+        * Create a new llog file. The llog must already be opened.
+        * Must be used only for local llog operations.
+        */
+       int (*lop_declare_create)(const struct lu_env *env,
+                                 struct llog_handle *handle,
+                                 struct thandle *th);
+       int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
+                         struct thandle *th);
+       /**
+        * Write a new record to the llog. It usually appends records,
+        * but can overwrite existing records too.
+        */
+       int (*lop_declare_write_rec)(const struct lu_env *env,
+                                    struct llog_handle *lgh,
+                                    struct llog_rec_hdr *rec,
+                                    int idx, struct thandle *th);
+       int (*lop_write_rec)(const struct lu_env *env,
+                            struct llog_handle *loghandle,
+                            struct llog_rec_hdr *rec,
+                            struct llog_cookie *cookie, int cookiecount,
+                            void *buf, int idx, struct thandle *th);
+       /**
+        * Add a new record to the llog catalog. Does the same as
+        * llog_write_rec(), but through the llog catalog.
+        */
+       int (*lop_declare_add)(const struct lu_env *env,
+                              struct llog_handle *lgh,
+                              struct llog_rec_hdr *rec, struct thandle *th);
+       int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
+                      struct llog_rec_hdr *rec, struct llog_cookie *cookie,
+                      void *buf, struct thandle *th);
+       /* Old llog_add version, still used by MDS-LOV-OSC and to be removed
+        * with the LOD/OSP replacement */
+       int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                          struct llog_cookie *logcookies, int numcookies);
+};
+
+/* In-memory descriptor for a log object or log catalog */
+struct llog_handle {
+       struct rw_semaphore      lgh_lock;
+       spinlock_t               lgh_hdr_lock; /* protect lgh_hdr data */
+       struct llog_logid        lgh_id; /* id of this log */
+       struct llog_log_hdr     *lgh_hdr;
+       struct file             *lgh_file;
+       struct dt_object        *lgh_obj;
+       int                      lgh_last_idx;
+       int                      lgh_cur_idx; /* used during llog_process */
+       __u64                    lgh_cur_offset; /* used during llog_process */
+       struct llog_ctxt        *lgh_ctxt;
+       union {
+               struct plain_handle_data         phd;
+               struct cat_handle_data           chd;
+       } u;
+       char                    *lgh_name;
+       void                    *private_data;
+       struct llog_operations  *lgh_logops;
+       atomic_t                 lgh_refcount;
+};
+
+/* llog_lvfs.c */
+extern struct llog_operations llog_lvfs_ops;
+
+/* llog_osd.c */
+extern struct llog_operations llog_osd_ops;
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count,
+                         struct llog_catid *idarray);
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count,
+                         struct llog_catid *idarray);
+
+#define LLOG_CTXT_FLAG_UNINITIALIZED     0x00000001
+#define LLOG_CTXT_FLAG_STOP             0x00000002
+
+struct llog_ctxt {
+       int                   loc_idx; /* my index in the obd array of ctxts */
+       struct obd_device       *loc_obd; /* points back to the containing obd*/
+       struct obd_llog_group   *loc_olg; /* group containing that ctxt */
+       struct obd_export       *loc_exp; /* parent "disk" export (e.g. MDS) */
+       struct obd_import       *loc_imp; /* to use in RPC's: can be backward
+                                            pointing import */
+       struct llog_operations  *loc_logops;
+       struct llog_handle      *loc_handle;
+       struct mutex             loc_mutex; /* protect loc_imp */
+       atomic_t             loc_refcount;
+       long                 loc_flags; /* flags, see above defines */
+       struct dt_object        *loc_dir;
+};
+
+#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
+
+static inline int llog_obd2ops(struct llog_ctxt *ctxt,
+                              struct llog_operations **lop)
+{
+       if (ctxt == NULL)
+               return -ENOTCONN;
+
+       *lop = ctxt->loc_logops;
+       if (*lop == NULL)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static inline int llog_handle2ops(struct llog_handle *loghandle,
+                                 struct llog_operations **lop)
+{
+       if (loghandle == NULL || loghandle->lgh_logops == NULL)
+               return -EINVAL;
+
+       *lop = loghandle->lgh_logops;
+       return 0;
+}
+
+static inline int llog_data_len(int len)
+{
+       return cfs_size_round(len);
+}
+
+static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt)
+{
+       atomic_inc(&ctxt->loc_refcount);
+       CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt,
+              atomic_read(&ctxt->loc_refcount));
+       return ctxt;
+}
+
+static inline void llog_ctxt_put(struct llog_ctxt *ctxt)
+{
+       if (ctxt == NULL)
+               return;
+       LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt,
+              atomic_read(&ctxt->loc_refcount) - 1);
+       __llog_ctxt_put(NULL, ctxt);
+}
+
+static inline void llog_group_init(struct obd_llog_group *olg, int group)
+{
+       init_waitqueue_head(&olg->olg_waitq);
+       spin_lock_init(&olg->olg_lock);
+       mutex_init(&olg->olg_cat_processing);
+       olg->olg_seq = group;
+}
+
+static inline int llog_group_set_ctxt(struct obd_llog_group *olg,
+                                     struct llog_ctxt *ctxt, int index)
+{
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+       spin_lock(&olg->olg_lock);
+       if (olg->olg_ctxts[index] != NULL) {
+               spin_unlock(&olg->olg_lock);
+               return -EEXIST;
+       }
+       olg->olg_ctxts[index] = ctxt;
+       spin_unlock(&olg->olg_lock);
+       return 0;
+}
+
+static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg,
+                                                   int index)
+{
+       struct llog_ctxt *ctxt;
+
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+
+       spin_lock(&olg->olg_lock);
+       if (olg->olg_ctxts[index] == NULL)
+               ctxt = NULL;
+       else
+               ctxt = llog_ctxt_get(olg->olg_ctxts[index]);
+       spin_unlock(&olg->olg_lock);
+       return ctxt;
+}
+
+static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index)
+{
+       LASSERT(index >= 0 && index < LLOG_MAX_CTXTS);
+       spin_lock(&olg->olg_lock);
+       olg->olg_ctxts[index] = NULL;
+       spin_unlock(&olg->olg_lock);
+}
+
+static inline struct llog_ctxt *llog_get_context(struct obd_device *obd,
+                                                int index)
+{
+       return llog_group_get_ctxt(&obd->obd_olg, index);
+}
+
+static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index)
+{
+       return (olg->olg_ctxts[index] == NULL);
+}
+
+static inline int llog_ctxt_null(struct obd_device *obd, int index)
+{
+       return (llog_group_ctxt_null(&obd->obd_olg, index));
+}
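[Illustration only, not part of the patch: the reference-counted
lookup/release pattern the helpers above support; EXAMPLE_CTXT_IDX is a
hypothetical context index.]

    /* Borrow a context by index, use it, then drop the reference. */
    struct llog_ctxt *ctxt = llog_get_context(obd, EXAMPLE_CTXT_IDX);

    if (ctxt != NULL) {
            /* ... use ctxt->loc_handle, ctxt->loc_logops, ... */
            llog_ctxt_put(ctxt);    /* releases the reference taken above */
    }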
+
+static inline int llog_destroy(const struct lu_env *env,
+                              struct llog_handle *handle)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_destroy == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_destroy(env, handle);
+       RETURN(rc);
+}
+
+static inline int llog_next_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle, int *cur_idx,
+                                 int next_idx, __u64 *cur_offset, void *buf,
+                                 int len)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_next_block == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx,
+                                cur_offset, buf, len);
+       RETURN(rc);
+}
+
+static inline int llog_prev_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int prev_idx, void *buf, int len)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_prev_block == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len);
+       RETURN(rc);
+}
+
+static inline int llog_connect(struct llog_ctxt *ctxt,
+                              struct llog_logid *logid, struct llog_gen *gen,
+                              struct obd_uuid *uuid)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_obd2ops(ctxt, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_connect == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_connect(ctxt, logid, gen, uuid);
+       RETURN(rc);
+}
+
+/* llog.c */
+int llog_exist(struct llog_handle *loghandle);
+int llog_declare_create(const struct lu_env *env,
+                       struct llog_handle *loghandle, struct thandle *th);
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+               struct thandle *th);
+int llog_declare_write_rec(const struct lu_env *env,
+                          struct llog_handle *handle,
+                          struct llog_rec_hdr *rec, int idx,
+                          struct thandle *th);
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+                  struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+                  int numcookies, void *buf, int idx, struct thandle *th);
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+            struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+            void *buf, struct thandle *th);
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+                    struct llog_rec_hdr *rec, struct thandle *th);
+int lustre_process_log(struct super_block *sb, char *logname,
+                      struct config_llog_instance *cfg);
+int lustre_end_log(struct super_block *sb, char *logname,
+                  struct config_llog_instance *cfg);
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+                    struct llog_handle **res, struct llog_logid *logid,
+                    char *name);
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+              struct llog_logid *logid, char *name);
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+              struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+              int cookiecount, void *buf, int idx);
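[Illustration only, not part of the patch: a hedged end-to-end sketch of the
open/init/process/close sequence declared in this header; the "example_log"
name, the LLOG_F_IS_PLAIN flag, and example_count_cb are assumptions.]

    struct llog_handle *llh;
    int count = 0;
    int rc;

    rc = llog_open(env, ctxt, &llh, NULL, "example_log", LLOG_OPEN_EXISTS);
    if (rc == 0) {
            rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
            if (rc == 0)
                    rc = llog_process(env, llh, example_count_cb,
                                      &count, NULL);
            llog_close(env, llh);
    }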
+
+/** @} log */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
new file mode 100644 (file)
index 0000000..fb1561a
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mdc.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDC_H
+#define _LUSTRE_MDC_H
+
+/** \defgroup mdc mdc
+ *
+ * @{
+ */
+
+# include <linux/fs.h>
+# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/posix_acl_xattr.h>
+# endif /* CONFIG_FS_POSIX_ACL */
+# include <linux/lustre_intent.h>
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct ptlrpc_client;
+struct obd_export;
+struct ptlrpc_request;
+struct obd_device;
+
+struct mdc_rpc_lock {
+       struct mutex            rpcl_mutex;
+       struct lookup_intent    *rpcl_it;
+       int                     rpcl_fakes;
+};
+
+#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL)
+
+static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
+{
+       mutex_init(&lck->rpcl_mutex);
+       lck->rpcl_it = NULL;
+}
+
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck,
+                                   struct lookup_intent *it)
+{
+       ENTRY;
+
+       if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+               return;
+
+       /* This would normally block until the existing request finishes.
+        * If fail_loc is set it will block until the regular request is
+        * done, then set rpcl_it to MDC_FAKE_RPCL_IT.  Once that is set
+        * it will only be cleared when all fake requests are finished.
+        * Only when all fake requests are finished can normal requests
+        * be sent, to ensure they are recoverable again. */
+ again:
+       mutex_lock(&lck->rpcl_mutex);
+
+       if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) {
+               lck->rpcl_it = MDC_FAKE_RPCL_IT;
+               lck->rpcl_fakes++;
+               mutex_unlock(&lck->rpcl_mutex);
+               return;
+       }
+
+       /* This will only happen when the CFS_FAIL_CHECK() was
+        * just turned off but there are still requests in progress.
+        * Wait until they finish.  It doesn't need to be efficient
+        * in this extremely rare case, just have low overhead in
+        * the common case when it isn't true. */
+       while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) {
+               mutex_unlock(&lck->rpcl_mutex);
+               schedule_timeout(cfs_time_seconds(1) / 4);
+               goto again;
+       }
+
+       LASSERT(lck->rpcl_it == NULL);
+       lck->rpcl_it = it;
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
+                                   struct lookup_intent *it)
+{
+       if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP))
+               goto out;
+
+       if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */
+               mutex_lock(&lck->rpcl_mutex);
+
+               LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes);
+               lck->rpcl_fakes--;
+
+               if (lck->rpcl_fakes == 0)
+                       lck->rpcl_it = NULL;
+
+       } else {
+               LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it);
+               lck->rpcl_it = NULL;
+       }
+
+       mutex_unlock(&lck->rpcl_mutex);
+ out:
+       EXIT;
+}
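[Illustration only, not part of the patch: the intended lock/unlock bracket
around a modifying metadata RPC; rpc_lock and example_send_request are
hypothetical names.]

    /* rpc_lock is assumed to be the client's per-import mdc_rpc_lock. */
    mdc_get_rpc_lock(rpc_lock, it);         /* serialize with other MDC RPCs */
    rc = example_send_request(exp, req);    /* hypothetical RPC send */
    mdc_put_rpc_lock(rpc_lock, it);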
+
+static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
+                                              struct mdt_body *body)
+{
+       if (body->valid & OBD_MD_FLMODEASIZE) {
+               if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize)
+                       exp->exp_obd->u.cli.cl_max_mds_easize =
+                                               body->max_mdsize;
+               if (exp->exp_obd->u.cli.cl_max_mds_cookiesize <
+                                               body->max_cookiesize)
+                       exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+                                               body->max_cookiesize;
+       }
+}
+
+struct mdc_cache_waiter {
+       struct list_head              mcw_entry;
+       wait_queue_head_t            mcw_waitq;
+};
+
+/* mdc/mdc_locks.c */
+int it_disposition(struct lookup_intent *it, int flag);
+void it_clear_disposition(struct lookup_intent *it, int flag);
+void it_set_disposition(struct lookup_intent *it, int flag);
+int it_open_error(int phase, struct lookup_intent *it);
+
+/** @} mdc */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h
new file mode 100644 (file)
index 0000000..b386f87
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_mds.h
+ *
+ * MDS data structures.
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_MDS_H
+#define _LUSTRE_MDS_H
+
+/** \defgroup mds mds
+ *
+ * @{
+ */
+
+#include <lustre_handles.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_export.h>
+
+struct mds_group_info {
+       struct obd_uuid *uuid;
+       int group;
+};
+
+struct mds_capa_info {
+       struct obd_uuid *uuid;
+       struct lustre_capa_key *capa;
+};
+
+#define MDD_OBD_NAME     "mdd_obd"
+#define MDD_OBD_UUID     "mdd_obd_uuid"
+
+static inline int md_should_create(__u64 flags)
+{
+       return !(flags & MDS_OPEN_DELAY_CREATE) && (flags & FMODE_WRITE);
+}
+
+/* these are local flags, used only on the client, private */
+#define M_CHECK_STALE     0200000000
+
+/** @} mds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h
new file mode 100644 (file)
index 0000000..dba26a6
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LINUX_MDT_H
+#define __LINUX_MDT_H
+
+/** \defgroup mdt mdt
+ *
+ * @{
+ */
+
+#include <lustre/lustre_idl.h>
+#include <lustre_req_layout.h>
+#include <md_object.h>
+#include <dt_object.h>
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Common thread info for mdt, seq and fld
+ */
+struct com_thread_info {
+       /*
+        * for req-layout interface.
+        */
+       struct req_capsule *cti_pill;
+};
+
+enum {
+       ESERIOUS = 0x0001000
+};
+
+static inline int err_serious(int rc)
+{
+       LASSERT(rc < 0);
+       LASSERT(-rc < ESERIOUS);
+       return -(-rc | ESERIOUS);
+}
+
+static inline int clear_serious(int rc)
+{
+       if (rc < 0)
+               rc = -(-rc & ~ESERIOUS);
+       return rc;
+}
+
+static inline int is_serious(int rc)
+{
+       return (rc < 0 && (-rc & ESERIOUS));
+}
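[Illustration only, not part of the patch: a worked round trip through the
ESERIOUS encoding above, assuming ESERIOUS == 0x1000 and ENOMEM == 12.]

    int rc = err_serious(-ENOMEM); /* rc == -(12 | 0x1000) == -0x100c */

    if (is_serious(rc))             /* true: 0x100c & 0x1000 != 0 */
            rc = clear_serious(rc); /* back to -12, i.e. -ENOMEM */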
+
+/** @} mdt */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
new file mode 100644 (file)
index 0000000..293dd90
--- /dev/null
@@ -0,0 +1,3451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/** \defgroup PtlRPC Portal RPC and networking module.
+ *
+ * PortalRPC is the layer used by the rest of the Lustre code to achieve
+ * network communication: establishing connections with corresponding export
+ * and import states, listening for a service, and sending and receiving RPCs.
+ * PortalRPC also includes the base recovery framework: packet resending and
+ * replaying, reconnections, and the pinger.
+ *
+ * PortalRPC utilizes LNet as its transport layer.
+ *
+ * @{
+ */
+
+#ifndef _LUSTRE_NET_H
+#define _LUSTRE_NET_H
+
+/** \defgroup net net
+ *
+ * @{
+ */
+
+#include <linux/lustre_net.h>
+
+#include <linux/libcfs/libcfs.h>
+/* #include <obd.h> */
+#include <linux/lnet/lnet.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_ha.h>
+#include <lustre_sec.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lu_object.h>
+#include <lustre_req_layout.h>
+
+#include <obd_support.h>
+#include <lustre_ver.h>
+
+/* MD flags we _always_ use */
+#define PTLRPC_MD_OPTIONS  0
+
+/**
+ * Max # of bulk operations in one request.
+ * In order for the client and server to properly negotiate the maximum
+ * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two
+ * value.  The client is free to limit the actual RPC size for any bulk
+ * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */
+#define PTLRPC_BULK_OPS_BITS   2
+#define PTLRPC_BULK_OPS_COUNT  (1U << PTLRPC_BULK_OPS_BITS)
+/**
+ * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and
+ * should not be used on the server at all.  Otherwise, it imposes a
+ * protocol limitation on the maximum RPC size that can be used by any
+ * RPC sent to that server in the future.  Instead, the server should
+ * use the negotiated per-client ocd_brw_size to determine the bulk
+ * RPC count. */
+#define PTLRPC_BULK_OPS_MASK   (~((__u64)PTLRPC_BULK_OPS_COUNT - 1))
+
+/**
+ * Define maxima for bulk I/O.
+ *
+ * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT
+ * of LNET_MTU sized RDMA transfers.  Clients and servers negotiate the
+ * currently supported maximum between peers at connect via ocd_brw_size.
+ */
+#define PTLRPC_MAX_BRW_BITS    (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
+#define PTLRPC_MAX_BRW_SIZE    (1 << PTLRPC_MAX_BRW_BITS)
+#define PTLRPC_MAX_BRW_PAGES   (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+
+#define ONE_MB_BRW_SIZE                (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_SIZE                (1 << LNET_MTU_BITS)
+#define MD_MAX_BRW_PAGES       (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_SIZE                PTLRPC_MAX_BRW_SIZE
+#define DT_MAX_BRW_PAGES       (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define OFD_MAX_BRW_SIZE       (1 << LNET_MTU_BITS)
+
+/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
+# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0)
+#  error "PTLRPC_MAX_BRW_PAGES isn't a power of two"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE))
+#  error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE"
+# endif
+# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_SIZE too big"
+# endif
+# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT)
+#  error "PTLRPC_MAX_BRW_PAGES too big"
+# endif
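[Illustration only, not part of the patch: the bulk maxima above worked for a
common configuration, assuming LNET_MTU_BITS == 20 (1 MiB LNet MTU) and
PAGE_CACHE_SHIFT == 12 (4 KiB pages).]

    /*
     * PTLRPC_MAX_BRW_BITS  = 20 + 2      = 22
     * PTLRPC_MAX_BRW_SIZE  = 1 << 22     = 4 MiB (4 bulk ops of 1 MiB each)
     * PTLRPC_MAX_BRW_PAGES = 4 MiB >> 12 = 1024 pages per BRW request
     */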
+
+#define PTLRPC_NTHRS_INIT      2
+
+/**
+ * Buffer Constants
+ *
+ * Constants determine how memory is used to buffer incoming service requests.
+ *
+ * ?_NBUFS              # buffers to allocate when growing the pool
+ * ?_BUFSIZE            # bytes in a single request buffer
+ * ?_MAXREQSIZE         # maximum request the service will receive
+ *
+ * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk
+ * of ?_NBUFS is added to the pool.
+ *
+ * Messages larger than ?_MAXREQSIZE are dropped.  Request buffers are
+ * considered full when less than ?_MAXREQSIZE is left in them.
+ */
+/**
+ * Thread Constants
+ *
+ * Constants determine how threads are created for ptlrpc service.
+ *
+ * ?_NTHRS_INIT         # threads to create for each service partition on
+ *                        initialization. If it's a non-affinity service and
+ *                        there is only one partition, it's the overall number
+ *                        of threads for the service while initializing.
+ * ?_NTHRS_BASE         # threads to create at minimum for each ptlrpc
+ *                        partition to keep the service healthy. It's the
+ *                        low-water mark of the per-partition thread
+ *                        upper limit.
+ * ?_THR_FACTOR         # threads that can be added to the thread upper limit
+ *                        for each CPU core. This factor is only a reference;
+ *                        we might decrease it if the number of cores per CPT
+ *                        is above a limit.
+ * ?_NTHRS_MAX          # overall threads that can be created for a service.
+ *                        It's a soft limit, because if the service runs on a
+ *                        machine with hundreds of cores and tens of CPU
+ *                        partitions, we must guarantee each partition has
+ *                        ?_NTHRS_BASE threads, so the total can be
+ *                        ?_NTHRS_BASE * number_of_cpts and may exceed
+ *                        ?_NTHRS_MAX.
+ *
+ * Examples
+ *
+ * #define MDS_NTHRS_INIT      2
+ * #define MDS_NTHRS_BASE      64
+ * #define MDS_NTHRS_FACTOR    8
+ * #define MDS_NTHRS_MAX       1024
+ *
+ * Example 1):
+ * ---------------------------------------------------------------------
+ * Server(A) has 16 cores and the user configured it to 4 partitions, so each
+ * partition has 4 cores; the actual number of service threads on each
+ * partition is then:
+ *     MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96
+ *
+ * Total number of threads for the service is:
+ *     96 * partitions(4) = 384
+ *
+ * Example 2):
+ * ---------------------------------------------------------------------
+ * Server(B) has 32 cores and the user configured it to 4 partitions, so each
+ * partition has 8 cores; the actual number of service threads on each
+ * partition is then:
+ *     MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128
+ *
+ * Total number of threads for the service is:
+ *     128 * partitions(4) = 512
+ *
+ * Example 3):
+ * ---------------------------------------------------------------------
+ * Server(C) has 96 cores and the user configured it to 8 partitions, so each
+ * partition has 12 cores; the actual number of service threads on each
+ * partition is then:
+ *     MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160
+ *
+ * Total number of threads for the service is:
+ *     160 * partitions(8) = 1280
+ *
+ * However, this is above the soft limit MDS_NTHRS_MAX, so we instead choose
+ * this number as the upper limit of the thread count for each partition:
+ *     MDS_NTHRS_MAX(1024) / partitions(8) = 128
+ *
+ * Example 4):
+ * ---------------------------------------------------------------------
+ * Server(D) has a thousand cores and the user configured it to 32 partitions:
+ *     MDS_NTHRS_BASE(64) * 32 = 2048
+ *
+ * which is already above the soft limit MDS_NTHRS_MAX(1024), but we still
+ * need to guarantee that each partition has at least MDS_NTHRS_BASE(64)
+ * threads to keep the service healthy, so the total number of threads will
+ * just be 2048.
+ *
+ * NB: we don't suggest to choose server with that many cores because backend
+ *     filesystem itself, buffer cache, or underlying network stack might
+ *     have some SMP scalability issues at that large scale.
+ *
+ *     If the user already has a fat machine with hundreds or thousands of
+ *     cores, there are two configuration choices:
+ *     a) create a CPU table from a subset of all CPUs and run Lustre on
+ *        top of this subset
+ *     b) bind service threads to a few partitions; see the module
+ *        parameters of MDS and OSS for details
+ *
+ * NB: these calculations (and the examples above) are simplified to help
+ *     understanding; the real implementation is a little more complex,
+ *     please see ptlrpc_server_nthreads_check() for details.
+ *
+ */
+
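+/*
+ * A minimal sketch of the per-partition calculation described above; the
+ * helper name and signature below are illustrative only, the real logic
+ * lives in ptlrpc_server_nthreads_check():
+ */
+static inline int nthrs_per_cpt_example(int base, int factor, int max,
+                                       int cores_per_cpt, int ncpts)
+{
+       /* examples 1-3: base plus a per-core increment */
+       int nthrs = base + cores_per_cpt * factor;
+
+       /* soft limit: shrink the per-partition count (example 3) */
+       if (nthrs * ncpts > max)
+               nthrs = max / ncpts;
+       /* but never drop below the low-water mark (example 4) */
+       return nthrs < base ? base : nthrs;
+}
+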
+ /*
+  * LDLM threads constants:
+  *
+  * Given 8 as factor and 24 as base threads number
+  *
+  * example 1)
+  * On 4-core machine we will have 24 + 8 * 4 = 56 threads.
+  *
+  * example 2)
+  * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56
+  * threads for each partition and total threads number will be 112.
+  *
+  * example 3)
+  * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24)
+  * threads for each partition to keep service healthy, so total threads
+  * number should be 24 * 8 = 192.
+  *
+  * So with these constants, the number of threads will be at a level
+  * similar to old versions, unless the target machine has over a hundred cores.
+  */
+#define LDLM_THR_FACTOR                8
+#define LDLM_NTHRS_INIT                PTLRPC_NTHRS_INIT
+#define LDLM_NTHRS_BASE                24
+#define LDLM_NTHRS_MAX         (num_online_cpus() == 1 ? 64 : 128)
+
+#define LDLM_BL_THREADS   LDLM_NTHRS_AUTO_INIT
+#define LDLM_CLIENT_NBUFS 1
+#define LDLM_SERVER_NBUFS 64
+#define LDLM_BUFSIZE      (8 * 1024)
+#define LDLM_MAXREQSIZE   (5 * 1024)
+#define LDLM_MAXREPSIZE   (1024)
+
+ /*
+  * MDS threads constants:
+  *
+ * Please see the examples in "Thread Constants"; the number of MDS threads
+ * will be comparable to old versions, unless the server has many cores.
+  */
+#ifndef MDS_MAX_THREADS
+#define MDS_MAX_THREADS                1024
+#define MDS_MAX_OTHR_THREADS   256
+
+#else /* MDS_MAX_THREADS */
+#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT
+#undef MDS_MAX_THREADS
+#define MDS_MAX_THREADS        PTLRPC_NTHRS_INIT
+#endif
+#define MDS_MAX_OTHR_THREADS   max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2)
+#endif
+
+/* default service */
+#define MDS_THR_FACTOR         8
+#define MDS_NTHRS_INIT         PTLRPC_NTHRS_INIT
+#define MDS_NTHRS_MAX          MDS_MAX_THREADS
+#define MDS_NTHRS_BASE         min(64, MDS_NTHRS_MAX)
+
+/* read-page service */
+#define MDS_RDPG_THR_FACTOR    4
+#define MDS_RDPG_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_RDPG_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+#define MDS_RDPG_NTHRS_BASE    min(48, MDS_RDPG_NTHRS_MAX)
+
+/* these should be removed when we remove setattr service in the future */
+#define MDS_SETA_THR_FACTOR    4
+#define MDS_SETA_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_SETA_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+#define MDS_SETA_NTHRS_BASE    min(48, MDS_SETA_NTHRS_MAX)
+
+/* non-affinity threads */
+#define MDS_OTHR_NTHRS_INIT    PTLRPC_NTHRS_INIT
+#define MDS_OTHR_NTHRS_MAX     MDS_MAX_OTHR_THREADS
+
+#define MDS_NBUFS              64
+
+/**
+ * Assume file name length = FNAME_MAX = 256 (true for ext3).
+ *       path name length = PATH_MAX = 4096
+ *       LOV MD size max  = EA_MAX = 24 * 2000
+ *             (NB: 24 is size of lov_ost_data)
+ *       LOV LOGCOOKIE size max = 32 * 2000
+ *             (NB: 32 is size of llog_cookie)
+ * symlink:  FNAME_MAX + PATH_MAX  <- largest
+ * link:     FNAME_MAX + PATH_MAX  (mds_rec_link < mds_rec_create)
+ * rename:   FNAME_MAX + FNAME_MAX
+ * open:     FNAME_MAX + EA_MAX
+ *
+ * MDS_MAXREQSIZE ~= 4736 bytes =
+ * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ *
+ * Realistic size is about 512 bytes (20 character name + 128 char symlink),
+ * except in the open case where there are a large number of OSTs in a LOV.
+ */
+#define MDS_MAXREQSIZE         (5 * 1024)      /* >= 4736 */
+#define MDS_MAXREPSIZE         (9 * 1024)      /* >= 8300 */
+
+/**
+ * MDS incoming request with LOV EA
+ * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate
+ */
+#define MDS_LOV_MAXREQSIZE     max(MDS_MAXREQSIZE, \
+                                   362 + LOV_MAX_STRIPE_COUNT * 24)
+/**
+ * MDS outgoing reply with LOV EA
+ *
+ * NB: max reply size Lustre 2.4+ client can get from old MDS is:
+ * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes
+ *
+ * but a 2.4 or later MDS will never send a reply with llog_cookie to any
+ * version of client. This macro is defined for the server-side reply buffer
+ * size.
+ */
+#define MDS_LOV_MAXREPSIZE     MDS_LOV_MAXREQSIZE
+
+/**
+ * This is the size of a maximum REINT_SETXATTR request:
+ *
+ *   lustre_msg                 56 (32 + 4 x 5 + 4)
+ *   ptlrpc_body       184
+ *   mdt_rec_setxattr  136
+ *   lustre_capa       120
+ *   name              256 (XATTR_NAME_MAX)
+ *   value           65536 (XATTR_SIZE_MAX)
+ */
+#define MDS_EA_MAXREQSIZE      66288
+
+/**
+ * These are the maximum request and reply sizes (rounded up to 1 KB
+ * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL.
+ */
+#define MDS_REG_MAXREQSIZE     (((max(MDS_EA_MAXREQSIZE, \
+                                      MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10)
+#define MDS_REG_MAXREPSIZE     MDS_REG_MAXREQSIZE
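+
+/*
+ * The round-up idiom above, ((x + 1023) >> 10) << 10, rounds x up to the
+ * next 1 KB boundary; e.g. MDS_EA_MAXREQSIZE (66288) rounds to 66560, the
+ * 65 KB figure referenced in the MDS_REG_BUFSIZE comment below.
+ */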
+
+/**
+ * The update request includes all of the updates from the create, which might
+ * include linkea (4K maximum), together with other updates; we set it to 9K:
+ * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K)
+ */
+#define MDS_OUT_MAXREQSIZE     (9 * 1024)
+#define MDS_OUT_MAXREPSIZE     MDS_MAXREPSIZE
+
+/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */
+#define MDS_BUFSIZE            max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   8 * 1024)
+
+/**
+ * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD.
+ * However, we need to allocate a much larger buffer for it because LNet
+ * requires each MD(rqbd) to have at least MDS_REG_MAXREQSIZE bytes left to
+ * avoid dropping a maximum-sized incoming request.  So if MDS_REG_BUFSIZE is
+ * only a little larger than MDS_REG_MAXREQSIZE, it can fit only one request
+ * even when there are about MDS_REG_MAXREQSIZE bytes left in a rqbd, and
+ * memory utilization is very low.
+ *
+ * Meanwhile, the size of a rqbd can't be too large, because a rqbd can't be
+ * reused until all requests that fit in it have been processed and released,
+ * which means one long-blocked request can prevent the rqbd from being
+ * reused.  Now we set the request buffer size to 160 KB, so even if each rqbd
+ * is unlinked from LNet with 65 KB unused, buffer utilization will be about 59%.
+ * Please check LU-2432 for details.
+ */
+#define MDS_REG_BUFSIZE                max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   160 * 1024)
+
+/**
+ * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K), which is
+ * about 10K.  For the same reason as MDS_REG_BUFSIZE, we also give some
+ * extra bytes to each request buffer to improve the buffer utilization rate.
+ */
+#define MDS_OUT_BUFSIZE                max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \
+                                   24 * 1024)
+
+/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */
+#define FLD_MAXREQSIZE  (160)
+
+/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */
+#define FLD_MAXREPSIZE  (152)
+#define FLD_BUFSIZE    (1 << 12)
+
+/**
+ * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc +
+ * lu_range + __u32 padding
+ */
+#define SEQ_MAXREQSIZE  (160)
+
+/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */
+#define SEQ_MAXREPSIZE  (152)
+#define SEQ_BUFSIZE    (1 << 12)
+
+/** MGS threads must be >= 3, see bug 22458 comment #28 */
+#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1)
+#define MGS_NTHRS_MAX  32
+
+#define MGS_NBUFS       64
+#define MGS_BUFSIZE     (8 * 1024)
+#define MGS_MAXREQSIZE  (7 * 1024)
+#define MGS_MAXREPSIZE  (9 * 1024)
+
+ /*
+  * OSS threads constants:
+  *
+  * Given 8 as factor and 64 as base threads number
+  *
+  * example 1):
+  * On 8-core server configured to 2 partitions, we will have
+  * 64 + 8 * 4 = 96 threads for each partition, 192 total threads.
+  *
+  * example 2):
+  * On 32-core machine configured to 4 partitions, we will have
+  * 64 + 8 * 8 = 128 threads for each partition, so total threads number
+  * will be 128 * 4 = 512.
+  *
+  * example 3):
+  * On 64-core machine configured to 4 partitions, we will have
+  * 64 + 16 * 8 = 192 threads for each partition, so total threads number
+  * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we
+  * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads
+  * for each partition.
+  *
+  * So we can see that with these constants, the number of threads will be
+  * at a level similar to old versions, unless the server has many cores.
+  */
+ /* reduce the threads factor for VMs with a small memory size */
+#define OSS_THR_FACTOR         min_t(int, 8, \
+                               NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT))
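+/*
+ * For reference: with 4 KB pages (PAGE_CACHE_SHIFT == 12) the shift above
+ * grants one factor unit per 2^16 cached pages, i.e. per 256 MB, so the
+ * factor only reaches its cap of 8 on hosts with 2 GB or more of page cache.
+ */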
+#define OSS_NTHRS_INIT         (PTLRPC_NTHRS_INIT + 1)
+#define OSS_NTHRS_BASE         64
+#define OSS_NTHRS_MAX          512
+
+/* threads for handling "create" request */
+#define OSS_CR_THR_FACTOR      1
+#define OSS_CR_NTHRS_INIT      PTLRPC_NTHRS_INIT
+#define OSS_CR_NTHRS_BASE      8
+#define OSS_CR_NTHRS_MAX       64
+
+/**
+ * OST_IO_MAXREQSIZE ~=
+ *     lustre_msg + ptlrpc_body + obdo + obd_ioobj +
+ *     DT_MAX_BRW_PAGES * niobuf_remote
+ *
+ * - single object with 16 pages is 512 bytes
+ * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover
+ * - Must be a multiple of 1024
+ * - actual size is about 18K
+ */
+#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \
+                            sizeof(struct ptlrpc_body) + \
+                            sizeof(struct obdo) + \
+                            sizeof(struct obd_ioobj) + \
+                            sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES)
+/**
+ * FIEMAP request can be 4K+ for now
+ */
+#define OST_MAXREQSIZE         (5 * 1024)
+#define OST_IO_MAXREQSIZE      max_t(int, OST_MAXREQSIZE, \
+                               (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1))
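+
+/*
+ * The masking idiom above, ((x - 1) | 1023) + 1, rounds x up to the next
+ * multiple of 1024; it is equivalent to the shift-based round-up used for
+ * MDS_REG_MAXREQSIZE earlier in this file.
+ */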
+
+#define OST_MAXREPSIZE         (9 * 1024)
+#define OST_IO_MAXREPSIZE      OST_MAXREPSIZE
+
+#define OST_NBUFS              64
+/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */
+#define OST_BUFSIZE            max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024)
+/**
+ * OST_IO_MAXREQSIZE is about 18K; granting an extra 46K increases the buffer
+ * utilization rate; please check the comment above MDS_REG_BUFSIZE for details.
+ */
+#define OST_IO_BUFSIZE         max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024)
+
+/* Macro to hide a typecast. */
+#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args)
+
+/**
+ * Structure defining a single portal connection.
+ */
+struct ptlrpc_connection {
+       /** linkage for connections hash table */
+       struct hlist_node       c_hash;
+       /** Our own lnet nid for this connection */
+       lnet_nid_t            c_self;
+       /** Remote side nid for this connection */
+       lnet_process_id_t       c_peer;
+       /** UUID of the other side */
+       struct obd_uuid  c_remote_uuid;
+       /** reference counter for this connection */
+       atomic_t            c_refcount;
+};
+
+/** Client definition for PortalRPC */
+struct ptlrpc_client {
+       /** What lnet portal does this client send messages to by default */
+       __u32              cli_request_portal;
+       /** What portal do we expect replies on */
+       __u32              cli_reply_portal;
+       /** Name of the client */
+       char               *cli_name;
+};
+
+/** state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
+#define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
+#define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
+
+#define REQ_MAX_ACK_LOCKS 8
+
+union ptlrpc_async_args {
+       /**
+        * Scratchpad for passing args to completion interpreter. Users
+        * cast to the struct of their choosing, and CLASSERT that this is
+        * big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+        * a pointer to it here.  The pointer_arg ensures this struct is at
+        * least big enough for that.
+        */
+       void      *pointer_arg[11];
+       __u64      space[7];
+};
+
+struct ptlrpc_request_set;
+typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int);
+typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *);
+
+/**
+ * Definition of request set structure.
+ * A request set is a list of requests (not necessarily to the same target)
+ * that, once populated with RPCs, can be sent in parallel.
+ * There are two kinds of request sets: general purpose, and with a dedicated
+ * serving thread; an example of the latter is the ptlrpcd set.
+ * For general purpose sets, once the set has started sending, it is
+ * impossible to add new requests to it.
+ * Provides a way to call "completion callbacks" when all requests in the set
+ * have returned.
+ */
+struct ptlrpc_request_set {
+       atomic_t          set_refcount;
+       /** number of in queue requests */
+       atomic_t          set_new_count;
+       /** number of uncompleted requests */
+       atomic_t          set_remaining;
+       /** wait queue to wait on for request events */
+       wait_queue_head_t          set_waitq;
+       wait_queue_head_t         *set_wakeup_ptr;
+       /** List of requests in the set */
+       struct list_head            set_requests;
+       /**
+        * List of completion callbacks to be called when the set is completed
+        * This is only used if \a set_interpret is NULL.
+        * Links struct ptlrpc_set_cbdata.
+        */
+       struct list_head            set_cblist;
+       /** Completion callback, if only one. */
+       set_interpreter_func  set_interpret;
+       /** opaque argument passed to completion \a set_interpret callback. */
+       void             *set_arg;
+       /**
+        * Lock for \a set_new_requests manipulations.
+        * Taken so that any caller can communicate requests to
+        * the set holder, who can then fold them into the lock-free set.
+        */
+       spinlock_t              set_new_req_lock;
+       /** List of new yet unsent requests. Only used with ptlrpcd now. */
+       struct list_head            set_new_requests;
+
+       /** rq_status of requests that have been freed already */
+       int                set_rc;
+       /** Additional fields used by the flow control extension */
+       /** Maximum number of RPCs in flight */
+       int                set_max_inflight;
+       /** Callback function used to generate RPCs */
+       set_producer_func     set_producer;
+       /** opaque argument passed to the producer callback */
+       void             *set_producer_arg;
+};
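+
+/*
+ * A sketch of typical client-side usage of a general purpose set, assuming
+ * the ptlrpc_prep_set()/ptlrpc_set_wait() helpers declared elsewhere in
+ * this layer:
+ *
+ *     set = ptlrpc_prep_set();
+ *     ptlrpc_set_add_req(set, req);   (repeat for each prepared request)
+ *     rc = ptlrpc_set_wait(set);      (sends all, waits for completion)
+ *     ptlrpc_set_destroy(set);
+ */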
+
+/**
+ * Description of a single ptlrpc_set callback
+ */
+struct ptlrpc_set_cbdata {
+       /** List linkage item */
+       struct list_head              psc_item;
+       /** Pointer to interpreting function */
+       set_interpreter_func    psc_interpret;
+       /** Opaque argument to pass to the callback */
+       void               *psc_data;
+};
+
+struct ptlrpc_bulk_desc;
+struct ptlrpc_service_part;
+struct ptlrpc_service;
+
+/**
+ * ptlrpc callback & work item stuff
+ */
+struct ptlrpc_cb_id {
+       void   (*cbid_fn)(lnet_event_t *ev);     /* specific callback fn */
+       void    *cbid_arg;                    /* additional arg */
+};
+
+/** Maximum number of locks to fit into reply state */
+#define RS_MAX_LOCKS 8
+#define RS_DEBUG     0
+
+/**
+ * Structure defining the reply state on the server.
+ * Reply state holds various reply message information. Also, for "difficult"
+ * replies (rep-ack case) we store the state after sending the reply and wait
+ * for the client to acknowledge its reception. In these cases locks can be
+ * added to the state for replay/failover consistency guarantees.
+ */
+struct ptlrpc_reply_state {
+       /** Callback description */
+       struct ptlrpc_cb_id    rs_cb_id;
+       /** Linkage for list of all reply states in a system */
+       struct list_head             rs_list;
+       /** Linkage for list of all reply states on same export */
+       struct list_head             rs_exp_list;
+       /** Linkage for list of all reply states for same obd */
+       struct list_head             rs_obd_list;
+#if RS_DEBUG
+       struct list_head             rs_debug_list;
+#endif
+       /** A spinlock to protect the reply state flags */
+       spinlock_t              rs_lock;
+       /** Reply state flags */
+       unsigned long     rs_difficult:1;     /* ACK/commit stuff */
+       unsigned long     rs_no_ack:1;    /* no ACK, even for
+                                                 difficult requests */
+       unsigned long     rs_scheduled:1;     /* being handled? */
+       unsigned long     rs_scheduled_ever:1;/* any schedule attempts? */
+       unsigned long     rs_handled:1;  /* been handled yet? */
+       unsigned long     rs_on_net:1;   /* reply_out_callback pending? */
+       unsigned long     rs_prealloc:1; /* rs from prealloc list */
+       unsigned long     rs_committed:1;/* the transaction was committed
+                                                and the rs was dispatched
+                                                by ptlrpc_commit_replies */
+       /** Size of the state */
+       int                 rs_size;
+       /** opcode */
+       __u32             rs_opc;
+       /** Transaction number */
+       __u64             rs_transno;
+       /** xid */
+       __u64             rs_xid;
+       struct obd_export     *rs_export;
+       struct ptlrpc_service_part *rs_svcpt;
+       /** Lnet metadata handle for the reply */
+       lnet_handle_md_t       rs_md_h;
+       atomic_t           rs_refcount;
+
+       /** Context for the service thread */
+       struct ptlrpc_svc_ctx *rs_svc_ctx;
+       /** Reply buffer (actually sent to the client), encoded if needed */
+       struct lustre_msg     *rs_repbuf;       /* wrapper */
+       /** Size of the reply buffer */
+       int                 rs_repbuf_len;   /* wrapper buf length */
+       /** Size of the reply message */
+       int                 rs_repdata_len;  /* wrapper msg length */
+       /**
+        * Actual reply message. Its content is encrypted (if needed) to
+        * produce the reply buffer for actual sending. In the simple case
+        * of no network encryption we just set \a rs_repbuf to \a rs_msg.
+        */
+       struct lustre_msg     *rs_msg;    /* reply message */
+
+       /** Number of locks awaiting client ACK */
+       int                 rs_nlocks;
+       /** Handles of locks awaiting client reply ACK */
+       struct lustre_handle   rs_locks[RS_MAX_LOCKS];
+       /** Lock modes of locks in \a rs_locks */
+       ldlm_mode_t         rs_modes[RS_MAX_LOCKS];
+};
+
+struct ptlrpc_thread;
+
+/** RPC stages */
+enum rq_phase {
+       RQ_PHASE_NEW        = 0xebc0de00,
+       RQ_PHASE_RPC        = 0xebc0de01,
+       RQ_PHASE_BULK      = 0xebc0de02,
+       RQ_PHASE_INTERPRET      = 0xebc0de03,
+       RQ_PHASE_COMPLETE       = 0xebc0de04,
+       RQ_PHASE_UNREGISTERING  = 0xebc0de05,
+       RQ_PHASE_UNDEFINED      = 0xebc0de06
+};
+
+/** Type of request interpreter call-back */
+typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
+                                   struct ptlrpc_request *req,
+                                   void *arg, int rc);
+
+/**
+ * Definition of request pool structure.
+ * The pool is used to store empty preallocated requests for the case
+ * when we would actually need to send something without performing
+ * any allocations (to avoid e.g. OOM).
+ */
+struct ptlrpc_request_pool {
+       /** Locks the list */
+       spinlock_t prp_lock;
+       /** list of ptlrpc_request structs */
+       struct list_head prp_req_list;
+       /** Maximum message size that would fit into a request from this pool */
+       int prp_rq_size;
+       /** Function to allocate more requests for this pool */
+       void (*prp_populate)(struct ptlrpc_request_pool *, int);
+};
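+
+/*
+ * A sketch of typical pool usage, assuming the ptlrpc_init_rq_pool() and
+ * ptlrpc_free_rq_pool() helpers declared elsewhere in this layer; the
+ * pool pre-allocates a few maximum-sized requests so a sender can fall
+ * back to them when regular allocation would fail:
+ *
+ *     pool = ptlrpc_init_rq_pool(4, MDS_MAXREQSIZE, populate_fn);
+ *     ...
+ *     ptlrpc_free_rq_pool(pool);
+ */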
+
+struct lu_context;
+struct lu_env;
+
+struct ldlm_lock;
+
+/**
+ * \defgroup nrs Network Request Scheduler
+ * @{
+ */
+struct ptlrpc_nrs_policy;
+struct ptlrpc_nrs_resource;
+struct ptlrpc_nrs_request;
+
+/**
+ * NRS control operations.
+ *
+ * These are common for all policies.
+ */
+enum ptlrpc_nrs_ctl {
+       /**
+        * Not a valid opcode.
+        */
+       PTLRPC_NRS_CTL_INVALID,
+       /**
+        * Activate the policy.
+        */
+       PTLRPC_NRS_CTL_START,
+       /**
+        * Reserved for multiple primary policies, which may be a possibility
+        * in the future.
+        */
+       PTLRPC_NRS_CTL_STOP,
+       /**
+        * Policies can start using opcodes from this value and onwards for
+        * their own purposes; the assigned value itself is arbitrary.
+        */
+       PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20,
+};
+
+/**
+ * ORR policy operations
+ */
+enum nrs_ctl_orr {
+       NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       NRS_CTL_ORR_WR_QUANTUM,
+       NRS_CTL_ORR_RD_OFF_TYPE,
+       NRS_CTL_ORR_WR_OFF_TYPE,
+       NRS_CTL_ORR_RD_SUPP_REQ,
+       NRS_CTL_ORR_WR_SUPP_REQ,
+};
+
+/**
+ * NRS policy operations.
+ *
+ * These determine the behaviour of a policy, and are called in response to
+ * NRS core events.
+ */
+struct ptlrpc_nrs_pol_ops {
+       /**
+        * Called during policy registration; this operation is optional.
+        *
+        * \param[in,out] policy The policy being initialized
+        */
+       int     (*op_policy_init) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called during policy unregistration; this operation is optional.
+        *
+        * \param[in,out] policy The policy being unregistered/finalized
+        */
+       void    (*op_policy_fini) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called when activating a policy via lprocfs; policies allocate and
+        * initialize their resources here; this operation is optional.
+        *
+        * \param[in,out] policy The policy being started
+        *
+        * \see nrs_policy_start_locked()
+        */
+       int     (*op_policy_start) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Called when deactivating a policy via lprocfs; policies deallocate
+        * their resources here; this operation is optional
+        *
+        * \param[in,out] policy The policy being stopped
+        *
+        * \see nrs_policy_stop0()
+        */
+       void    (*op_policy_stop) (struct ptlrpc_nrs_policy *policy);
+       /**
+        * Used for policy-specific operations; i.e. not generic ones like
+        * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous
+        * to an ioctl; this operation is optional.
+        *
+        * \param[in,out]        policy The policy carrying out operation \a opc
+        * \param[in]     opc    The command operation being carried out
+        * \param[in,out] arg    A generic buffer for communication between the
+        *                       user and the control operation
+        *
+        * \retval -ve error
+        * \retval   0 success
+        *
+        * \see ptlrpc_nrs_policy_control()
+        */
+       int     (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy,
+                                 enum ptlrpc_nrs_ctl opc, void *arg);
+
+       /**
+        * Called when obtaining references to the resources of the resource
+        * hierarchy for a request that has arrived for handling at the PTLRPC
+        * service. Policies should return -ve for requests they do not wish
+        * to handle. This operation is mandatory.
+        *
+        * \param[in,out] policy  The policy we're getting resources for.
+        * \param[in,out] nrq     The request we are getting resources for.
+        * \param[in]     parent  The parent resource of the resource being
+        *                        requested; set to NULL if none.
+        * \param[out]    resp    The resource is to be returned here; the
+        *                        fallback policy in an NRS head should
+        *                        \e always return a non-NULL pointer value.
+        * \param[in]  moving_req When set, signifies that this is an attempt
+        *                        to obtain resources for a request being moved
+        *                        to the high-priority NRS head by
+        *                        ldlm_lock_reorder_req().
+        *                        This implies two things:
+        *                        1. We are under obd_export::exp_rpc_lock and
+        *                        so should not sleep.
+        *                        2. We should not perform non-idempotent
+        *                        operations, and can skip idempotent operations
+        *                        that were already carried out when resources
+        *                        were first taken for the request when it was
+        *                        initialized in ptlrpc_nrs_req_initialize().
+        *
+        * \retval 0, +ve The level of the returned resource in the resource
+        *                hierarchy; currently only 0 (for a non-leaf resource)
+        *                and 1 (for a leaf resource) are supported by the
+        *                framework.
+        * \retval -ve    error
+        *
+        * \see ptlrpc_nrs_req_initialize()
+        * \see ptlrpc_nrs_hpreq_add_nolock()
+        * \see ptlrpc_nrs_req_hp_move()
+        */
+       int     (*op_res_get) (struct ptlrpc_nrs_policy *policy,
+                              struct ptlrpc_nrs_request *nrq,
+                              const struct ptlrpc_nrs_resource *parent,
+                              struct ptlrpc_nrs_resource **resp,
+                              bool moving_req);
+       /**
+        * Called when releasing references taken for resources in the resource
+        * hierarchy for the request; this operation is optional.
+        *
+        * \param[in,out] policy The policy the resource belongs to
+        * \param[in] res        The resource to be freed
+        *
+        * \see ptlrpc_nrs_req_finalize()
+        * \see ptlrpc_nrs_hpreq_add_nolock()
+        * \see ptlrpc_nrs_req_hp_move()
+        */
+       void    (*op_res_put) (struct ptlrpc_nrs_policy *policy,
+                              const struct ptlrpc_nrs_resource *res);
+
+       /**
+        * Obtains a request for handling from the policy, and optionally
+        * removes the request from the policy; this operation is mandatory.
+        *
+        * \param[in,out] policy The policy to poll
+        * \param[in]     peek   When set, signifies that we just want to
+        *                       examine the request, and not handle it, so the
+        *                       request is not removed from the policy.
+        * \param[in]     force  When set, it will force a policy to return a
+        *                       request if it has one queued.
+        *
+        * \retval NULL No request available for handling
+        * \retval valid-pointer The request polled for handling
+        *
+        * \see ptlrpc_nrs_req_get_nolock()
+        */
+       struct ptlrpc_nrs_request *
+               (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek,
+                              bool force);
+       /**
+        * Called when attempting to add a request to a policy for later
+        * handling; this operation is mandatory.
+        *
+        * \param[in,out] policy  The policy on which to enqueue \a nrq
+        * \param[in,out] nrq The request to enqueue
+        *
+        * \retval 0    success
+        * \retval != 0 error
+        *
+        * \see ptlrpc_nrs_req_add_nolock()
+        */
+       int     (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy,
+                                  struct ptlrpc_nrs_request *nrq);
+       /**
+        * Removes a request from the policy's set of pending requests. Normally
+        * called after a request has been polled successfully from the policy
+        * for handling; this operation is mandatory.
+        *
+        * \param[in,out] policy The policy the request \a nrq belongs to
+        * \param[in,out] nrq    The request to dequeue
+        *
+        * \see ptlrpc_nrs_req_del_nolock()
+        */
+       void    (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy,
+                                  struct ptlrpc_nrs_request *nrq);
+       /**
+        * Called after the request has been handled. Could be used for
+        * job/resource control; this operation is optional.
+        *
+        * \param[in,out] policy The policy that is finishing handling request
+        *                       \a nrq
+        * \param[in,out] nrq    The request
+        *
+        * \pre spin_is_locked(&svcpt->scp_req_lock)
+        *
+        * \see ptlrpc_nrs_req_stop_nolock()
+        */
+       void    (*op_req_stop) (struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_request *nrq);
+       /**
+        * Registers the policy's lprocfs interface with a PTLRPC service.
+        *
+        * \param[in] svc The service
+        *
+        * \retval 0    success
+        * \retval != 0 error
+        */
+       int     (*op_lprocfs_init) (struct ptlrpc_service *svc);
+       /**
+        * Unregisters the policy's lprocfs interface from a PTLRPC service.
+        *
+        * In cases of failed policy registration in
+        * \e ptlrpc_nrs_policy_register(), this function may be called for a
+        * service which has not registered the policy successfully, so
+        * implementations of this method should make sure their operations are
+        * safe in such cases.
+        *
+        * \param[in] svc The service
+        */
+       void    (*op_lprocfs_fini) (struct ptlrpc_service *svc);
+};
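+
+/*
+ * A minimal sketch of a policy supplying only the mandatory operations;
+ * the handler names are illustrative only:
+ *
+ *     static const struct ptlrpc_nrs_pol_ops nrs_example_ops = {
+ *             .op_res_get     = nrs_example_res_get,
+ *             .op_req_get     = nrs_example_req_get,
+ *             .op_req_enqueue = nrs_example_req_enqueue,
+ *             .op_req_dequeue = nrs_example_req_dequeue,
+ *     };
+ */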
+
+/**
+ * Policy flags
+ */
+enum nrs_policy_flags {
+       /**
+        * Fallback policy, use this flag only on a single supported policy per
+        * service. The flag cannot be used on policies that use
+        * \e PTLRPC_NRS_FL_REG_EXTERN
+        */
+       PTLRPC_NRS_FL_FALLBACK          = (1 << 0),
+       /**
+        * Start policy immediately after registering.
+        */
+       PTLRPC_NRS_FL_REG_START         = (1 << 1),
+       /**
+        * This is a policy registering from a module different to the one NRS
+        * core ships in (currently ptlrpc).
+        */
+       PTLRPC_NRS_FL_REG_EXTERN        = (1 << 2),
+};
+
+/**
+ * NRS queue type.
+ *
+ * Denotes whether an NRS instance is for handling normal or high-priority
+ * RPCs, or whether an operation pertains to one or both of the NRS instances
+ * in a service.
+ */
+enum ptlrpc_nrs_queue_type {
+       PTLRPC_NRS_QUEUE_REG    = (1 << 0),
+       PTLRPC_NRS_QUEUE_HP     = (1 << 1),
+       PTLRPC_NRS_QUEUE_BOTH   = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
+};
+
+/**
+ * NRS head
+ *
+ * A PTLRPC service has at least one NRS head instance for handling normal
+ * priority RPCs, and may optionally have a second NRS head instance for
+ * handling high-priority RPCs. Each NRS head maintains a list of available
+ * policies, of which one and only one policy is acting as the fallback policy,
+ * and optionally a different policy may be acting as the primary policy. For
+ * all RPCs handled by this NRS head instance, NRS core will first attempt to
+ * enqueue the RPC using the primary policy (if any). The fallback policy is
+ * used in the following cases:
+ * - when there was no primary policy in the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
+ *   was initialized.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized indicated that it did not wish, or for some other
+ *   reason was not able, to handle the request, by returning a non-valid NRS
+ *   resource reference.
+ * - when the primary policy that was at the
+ *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
+ *   RPC was initialized, fails later during the request enqueueing stage.
+ *
+ * \see nrs_resource_get_safe()
+ * \see nrs_request_enqueue()
+ */
+struct ptlrpc_nrs {
+       spinlock_t                      nrs_lock;
+       /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */
+       /**
+        * List of registered policies
+        */
+       struct list_head                        nrs_policy_list;
+       /**
+        * List of policies with queued requests. Policies that have any
+        * outstanding requests are queued here, and this list is queried
+        * in a round-robin manner from NRS core when obtaining a request
+        * for handling. This ensures that requests from policies that at some
+        * point transition away from the
+        * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained.
+        */
+       struct list_head                        nrs_policy_queued;
+       /**
+        * Service partition for this NRS head
+        */
+       struct ptlrpc_service_part     *nrs_svcpt;
+       /**
+        * Primary policy, which is the preferred policy for handling RPCs
+        */
+       struct ptlrpc_nrs_policy       *nrs_policy_primary;
+       /**
+        * Fallback policy, which is the backup policy for handling RPCs
+        */
+       struct ptlrpc_nrs_policy       *nrs_policy_fallback;
+       /**
+        * This NRS head handles either HP or regular requests
+        */
+       enum ptlrpc_nrs_queue_type      nrs_queue_type;
+       /**
+        * # queued requests from all policies in this NRS head
+        */
+       unsigned long                   nrs_req_queued;
+       /**
+        * # scheduled requests from all policies in this NRS head
+        */
+       unsigned long                   nrs_req_started;
+       /**
+        * # policies on this NRS
+        */
+       unsigned                        nrs_num_pols;
+       /**
+        * This NRS head is in progress of starting a policy
+        */
+       unsigned                        nrs_policy_starting:1;
+       /**
+        * In progress of shutting down the whole NRS head; used during
+        * unregistration
+        */
+       unsigned                        nrs_stopping:1;
+};
+
+#define NRS_POL_NAME_MAX               16
+
+struct ptlrpc_nrs_pol_desc;
+
+/**
+ * Service compatibility predicate; this determines whether a policy is adequate
+ * for handling RPCs of a particular PTLRPC service.
+ *
+ * XXX: This should give the same result during policy registration and
+ * unregistration, and for all partitions of a service; the result should not
+ * depend on temporal service state or other properties that may change.
+ */
+typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc,
+                                      const struct ptlrpc_nrs_pol_desc *desc);
+
+struct ptlrpc_nrs_pol_conf {
+       /**
+        * Human-readable policy name
+        */
+       char                               nc_name[NRS_POL_NAME_MAX];
+       /**
+        * NRS operations for this policy
+        */
+       const struct ptlrpc_nrs_pol_ops   *nc_ops;
+       /**
+        * Service compatibility predicate
+        */
+       nrs_pol_desc_compat_t              nc_compat;
+       /**
+        * Set for policies that support a single ptlrpc service, i.e. ones that
+        * have \a pd_compat set to nrs_policy_compat_one(). The variable value
+        * depicts the name of the single service that such policies are
+        * compatible with.
+        */
+       const char                        *nc_compat_svc_name;
+       /**
+        * Owner module for this policy descriptor; policies registering from a
+        * different module to the one the NRS framework is held within
+        * (currently ptlrpc), should set this field to THIS_MODULE.
+        */
+       module_t                          *nc_owner;
+       /**
+        * Policy registration flags; a bitmask of \e nrs_policy_flags
+        */
+       unsigned                           nc_flags;
+};
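+
+/*
+ * A sketch of how an external module would fill this in and register with
+ * ptlrpc_nrs_policy_register(); all values are illustrative:
+ *
+ *     static struct ptlrpc_nrs_pol_conf nrs_example_conf = {
+ *             .nc_name        = "example",
+ *             .nc_ops         = &nrs_example_ops,
+ *             .nc_compat      = nrs_policy_compat_all,
+ *             .nc_owner       = THIS_MODULE,
+ *             .nc_flags       = PTLRPC_NRS_FL_REG_EXTERN,
+ *     };
+ *     rc = ptlrpc_nrs_policy_register(&nrs_example_conf);
+ */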
+
+/**
+ * NRS policy registering descriptor
+ *
+ * Is used to hold a description of a policy that can be passed to NRS core in
+ * order to register the policy with NRS heads in different PTLRPC services.
+ */
+struct ptlrpc_nrs_pol_desc {
+       /**
+        * Human-readable policy name
+        */
+       char                                    pd_name[NRS_POL_NAME_MAX];
+       /**
+        * Link into nrs_core::nrs_policies
+        */
+       struct list_head                                pd_list;
+       /**
+        * NRS operations for this policy
+        */
+       const struct ptlrpc_nrs_pol_ops        *pd_ops;
+       /**
+        * Service compatibility predicate
+        */
+       nrs_pol_desc_compat_t                   pd_compat;
+       /**
+        * Set for policies that are compatible with only one PTLRPC service.
+        *
+        * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
+        */
+       const char                             *pd_compat_svc_name;
+       /**
+        * Owner module for this policy descriptor.
+        *
+        * We need to hold a reference to the module whenever we might make use
+        * of any of the module's contents, i.e.
+        * - If one or more instances of the policy are at a state where they
+        *   might be handling a request, i.e.
+        *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
+        *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
+        *   call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
+        *   is taken on the module when
+        *   \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
+        *   becomes 0, so that we hold only one reference to the module maximum
+        *   at any time.
+        *
+        *   We do not need to hold a reference to the module, even though we
+        *   might use code and data from the module, in the following cases:
+        * - During external policy registration, because this should happen in
+        *   the module's init() function, in which case the module is safe from
+        *   removal because a reference is being held on the module by the
+        *   kernel, and iirc kmod (and I guess module-init-tools also) will
+        *   serialize any racing processes properly anyway.
+        * - During external policy unregistration, because this should happen
+        *   in a module's exit() function, and any attempts to start a policy
+        *   instance would need to take a reference on the module, and this is
+        *   not possible once we have reached the point where the exit()
+        *   handler is called.
+        * - During service registration and unregistration, as service setup
+        *   and cleanup, and policy registration, unregistration and policy
+        *   instance starting, are serialized by \e nrs_core::nrs_mutex, so
+        *   as long as users adhere to the convention of registering policies
+        *   in init() and unregistering them in module exit() functions, there
+        *   should not be a race between these operations.
+        * - During any policy-specific lprocfs operations, because a reference
+        *   is held by the kernel on a proc entry that has been entered by a
+        *   syscall, so as long as proc entries are removed at unregistration
+        *   time, unregistration and lprocfs operations will be properly
+        *   serialized.
+        */
+       module_t                               *pd_owner;
+       /**
+        * Bitmask of \e nrs_policy_flags
+        */
+       unsigned                                pd_flags;
+       /**
+        * # of references on this descriptor
+        */
+       atomic_t                                pd_refs;
+};
+
+/**
+ * NRS policy state
+ *
+ * Policies transition from one state to the other during their lifetime
+ */
+enum ptlrpc_nrs_pol_state {
+       /**
+        * Not a valid policy state.
+        */
+       NRS_POL_STATE_INVALID,
+       /**
+        * Policies are at this state either at the start of their life, or
+        * transition here when the user selects a different policy to act
+        * as the primary one.
+        */
+       NRS_POL_STATE_STOPPED,
+       /**
+        * Policy is in the process of stopping
+        */
+       NRS_POL_STATE_STOPPING,
+       /**
+        * Policy is in the process of starting
+        */
+       NRS_POL_STATE_STARTING,
+       /**
+        * A policy is in this state in two cases:
+        * - it is the fallback policy, which is always in this state.
+        * - it has been activated by the user, i.e. it is the primary policy.
+        */
+       NRS_POL_STATE_STARTED,
+};
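+
+/*
+ * The usual lifecycle of a primary policy, simplified:
+ *
+ *     STOPPED -> STARTING -> STARTED -> STOPPING -> STOPPED
+ */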
+
+/**
+ * NRS policy information
+ *
+ * Used for obtaining information for the status of a policy via lprocfs
+ */
+struct ptlrpc_nrs_pol_info {
+       /**
+        * Policy name
+        */
+       char                            pi_name[NRS_POL_NAME_MAX];
+       /**
+        * Current policy state
+        */
+       enum ptlrpc_nrs_pol_state       pi_state;
+       /**
+        * # RPCs enqueued for later dispatching by the policy
+        */
+       long                            pi_req_queued;
+       /**
+        * # RPCs started for dispatch by the policy
+        */
+       long                            pi_req_started;
+       /**
+        * Is this a fallback policy?
+        */
+       unsigned                        pi_fallback:1;
+};
+
+/**
+ * NRS policy
+ *
+ * There is one instance of this for each policy in each NRS head of each
+ * PTLRPC service partition.
+ */
+struct ptlrpc_nrs_policy {
+       /**
+        * Linkage into the NRS head's list of policies,
+        * ptlrpc_nrs:nrs_policy_list
+        */
+       struct list_head                        pol_list;
+       /**
+        * Linkage into the NRS head's list of policies with enqueued
+        * requests ptlrpc_nrs:nrs_policy_queued
+        */
+       struct list_head                        pol_list_queued;
+       /**
+        * Current state of this policy
+        */
+       enum ptlrpc_nrs_pol_state       pol_state;
+       /**
+        * Bitmask of nrs_policy_flags
+        */
+       unsigned                        pol_flags;
+       /**
+        * # RPCs enqueued for later dispatching by the policy
+        */
+       long                            pol_req_queued;
+       /**
+        * # RPCs started for dispatch by the policy
+        */
+       long                            pol_req_started;
+       /**
+        * Usage reference count taken on the policy instance
+        */
+       long                            pol_ref;
+       /**
+        * The NRS head this policy instance belongs to
+        */
+       struct ptlrpc_nrs              *pol_nrs;
+       /**
+        * Private policy data; varies by policy type
+        */
+       void                           *pol_private;
+       /**
+        * Policy descriptor for this policy instance.
+        */
+       struct ptlrpc_nrs_pol_desc     *pol_desc;
+};
+
+/**
+ * NRS resource
+ *
+ * Resources are embedded into two types of NRS entities:
+ * - Inside NRS policies, in the policy's private data in
+ *   ptlrpc_nrs_policy::pol_private
+ * - In objects that act as prime-level scheduling entities in different NRS
+ *   policies; e.g. on a policy that performs round robin or similar order
+ *   scheduling across client NIDs, there would be one NRS resource per unique
+ *   client NID. On a policy which performs round robin scheduling across
+ *   backend filesystem objects, there would be one resource associated with
+ *   each of the backend filesystem objects partaking in the scheduling
+ *   performed by the policy.
+ *
+ * NRS resources share a parent-child relationship, in which resources embedded
+ * in policy instances are the parent entities, with all scheduling entities
+ * a policy schedules across being the children, thus forming a simple resource
+ * hierarchy. This hierarchy may be extended with one or more levels in the
+ * future if the ability to have more than one primary policy is added.
+ *
+ * Upon request initialization, references to the then active NRS policies are
+ * taken and used to later handle the dispatching of the request with one of
+ * these policies.
+ *
+ * \see nrs_resource_get_safe()
+ * \see ptlrpc_nrs_req_add()
+ */
+struct ptlrpc_nrs_resource {
+       /**
+        * This NRS resource's parent; is NULL for resources embedded in NRS
+        * policy instances; i.e. those are top-level ones.
+        */
+       struct ptlrpc_nrs_resource     *res_parent;
+       /**
+        * The policy associated with this resource.
+        */
+       struct ptlrpc_nrs_policy       *res_policy;
+};
+
+enum {
+       NRS_RES_FALLBACK,
+       NRS_RES_PRIMARY,
+       NRS_RES_MAX
+};
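+
+/*
+ * As a concrete example of the hierarchy: in the CRR-N policy below,
+ * nrs_crrn_net::cn_res (embedded in the policy's private data) is the
+ * parent resource, and each nrs_crrn_client::cc_res is a leaf, one per
+ * client NID.
+ */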
+
+/* \name fifo
+ *
+ * FIFO policy
+ *
+ * This policy is a logical wrapper around previous, non-NRS functionality.
+ * It dispatches RPCs in the same order as they arrive from the network. This
+ * policy is currently used as the fallback policy, and the only enabled policy
+ * on all NRS heads of all PTLRPC service partitions.
+ * @{
+ */
+
+/**
+ * Private data structure for the FIFO policy
+ */
+struct nrs_fifo_head {
+       /**
+        * Resource object for policy instance.
+        */
+       struct ptlrpc_nrs_resource      fh_res;
+       /**
+        * List of queued requests.
+        */
+       struct list_head                        fh_list;
+       /**
+        * For debugging purposes.
+        */
+       __u64                           fh_sequence;
+};
+
+struct nrs_fifo_req {
+       struct list_head                fr_list;
+       __u64                   fr_sequence;
+};
+
+/** @} fifo */
+
+/**
+ * \name CRR-N
+ *
+ * CRR-N, Client Round Robin over NIDs
+ * @{
+ */
+
+/**
+ * private data structure for CRR-N NRS
+ */
+struct nrs_crrn_net {
+       struct ptlrpc_nrs_resource      cn_res;
+       cfs_binheap_t                  *cn_binheap;
+       cfs_hash_t                     *cn_cli_hash;
+       /**
+        * Used when a new scheduling round commences, in order to synchronize
+        * all clients with the new round number.
+        */
+       __u64                           cn_round;
+       /**
+        * Determines the relevant ordering amongst request batches within a
+        * scheduling round.
+        */
+       __u64                           cn_sequence;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that each request
+        * batch for each client can have in a scheduling round.
+        */
+       __u16                           cn_quantum;
+};
+
+/**
+ * Object representing a client in CRR-N, as identified by its NID
+ */
+struct nrs_crrn_client {
+       struct ptlrpc_nrs_resource      cc_res;
+       struct hlist_node               cc_hnode;
+       lnet_nid_t                      cc_nid;
+       /**
+        * The round number against which this client is currently scheduling
+        * requests.
+        */
+       __u64                           cc_round;
+       /**
+        * The sequence number used for requests scheduled by this client during
+        * the current round number.
+        */
+       __u64                           cc_sequence;
+       atomic_t                        cc_ref;
+       /**
+        * Round Robin quantum; the maximum number of RPCs the client is allowed
+        * to schedule in a single batch of each round.
+        */
+       __u16                           cc_quantum;
+       /**
+        * # of pending requests for this client, on all existing rounds
+        */
+       __u16                           cc_active;
+};
+
+/**
+ * CRR-N NRS request definition
+ */
+struct nrs_crrn_req {
+       /**
+        * Round number for this request; shared with all other requests in the
+        * same batch.
+        */
+       __u64                   cr_round;
+       /**
+        * Sequence number for this request; shared with all other requests in
+        * the same batch.
+        */
+       __u64                   cr_sequence;
+};
+
+/**
+ * CRR-N policy operations.
+ */
+enum nrs_ctl_crr {
+       /**
+        * Read the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC,
+       /**
+        * Write the RR quantum size of a CRR-N policy.
+        */
+       NRS_CTL_CRRN_WR_QUANTUM,
+};
+
+/** @} CRR-N */
+
+/**
+ * \name ORR/TRR
+ *
+ * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies
+ * @{
+ */
+
+/**
+ * Lower and upper byte offsets of a brw RPC
+ */
+struct nrs_orr_req_range {
+       __u64           or_start;
+       __u64           or_end;
+};
+
+/**
+ * RPC types supported by the ORR/TRR policies
+ */
+enum nrs_orr_supp {
+       NOS_OST_READ  = (1 << 0),
+       NOS_OST_WRITE = (1 << 1),
+       NOS_OST_RW    = (NOS_OST_READ | NOS_OST_WRITE),
+       /**
+        * Default value for policies.
+        */
+       NOS_DFLT      = NOS_OST_READ
+};
+
+/**
+ * As unique keys for grouping RPCs together, we use the object's OST FID for
+ * the ORR policy, and the OST index for the TRR policy.
+ *
+ * XXX: We waste some space for TRR policy instances by using a union, but it
+ *     allows us to consolidate some of the code between ORR and TRR, and these
+ *     policies will probably eventually merge into one anyway.
+ */
+struct nrs_orr_key {
+       union {
+               /** object FID for ORR */
+               struct lu_fid   ok_fid;
+               /** OST index for TRR */
+               __u32           ok_idx;
+       };
+};
+
+/**
+ * The largest base string for unique hash/slab object names is
+ * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT
+ * id number, so this _should_ be more than enough for the maximum number of
+ * CPTs on any system. If it does happen that this statement is incorrect,
+ * nrs_orr_genobjname() will inevitably yield a non-unique name and cause
+ * kmem_cache_create() to complain (on Linux), so the erroneous situation
+ * will hopefully not go unnoticed.
+ */
+#define NRS_ORR_OBJ_NAME_MAX   (sizeof("nrs_orr_reg_") + 3)
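+
+/*
+ * For example, a name built as (the snippet is illustrative only):
+ *     snprintf(name, NRS_ORR_OBJ_NAME_MAX, "nrs_orr_reg_%d", cpt);
+ * fits CPT ids of up to three digits in the buffer sized above.
+ */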
+
+/**
+ * private data structure for ORR and TRR NRS
+ */
+struct nrs_orr_data {
+       struct ptlrpc_nrs_resource      od_res;
+       cfs_binheap_t                  *od_binheap;
+       cfs_hash_t                     *od_obj_hash;
+       struct kmem_cache                      *od_cache;
+       /**
+        * Used when a new scheduling round commences, in order to synchronize
+        * all object or OST batches with the new round number.
+        */
+       __u64                           od_round;
+       /**
+        * Determines the relevant ordering amongst request batches within a
+        * scheduling round.
+        */
+       __u64                           od_sequence;
+       /**
+        * RPC types that are currently supported.
+        */
+       enum nrs_orr_supp               od_supp;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that each request
+        * batch for each object or OST can have in a scheduling round.
+        */
+       __u16                           od_quantum;
+       /**
+        * Whether to use physical disk offsets or logical file offsets.
+        */
+       bool                            od_physical;
+       /**
+        * XXX: We need to provide a persistently allocated string to hold
+        * unique object names for this policy, since in the Linux versions
+        * currently supported by Lustre, kmem_cache_create() just sets a
+        * pointer to the name string provided. kstrdup() is used in the
+        * version of kmem_cache_create() in current Linux mainline, so we
+        * may be able to remove this in the future.
+        */
+       char                            od_objname[NRS_ORR_OBJ_NAME_MAX];
+};
+
+/**
+ * Represents a backend-fs object or OST in the ORR and TRR policies
+ * respectively
+ */
+struct nrs_orr_object {
+       struct ptlrpc_nrs_resource      oo_res;
+       struct hlist_node               oo_hnode;
+       /**
+        * The round number against which requests are being scheduled for this
+        * object or OST
+        */
+       __u64                           oo_round;
+       /**
+        * The sequence number used for requests scheduled for this object or
+        * OST during the current round number.
+        */
+       __u64                           oo_sequence;
+       /**
+        * The key of the object or OST for which this structure instance is
+        * scheduling RPCs
+        */
+       struct nrs_orr_key              oo_key;
+       atomic_t                        oo_ref;
+       /**
+        * Round Robin quantum; the maximum number of RPCs that are allowed to
+        * be scheduled for the object or OST in a single batch of each round.
+        */
+       __u16                           oo_quantum;
+       /**
+        * # of pending requests for this object or OST, on all existing rounds
+        */
+       __u16                           oo_active;
+};
+
+/**
+ * ORR/TRR NRS request definition
+ */
+struct nrs_orr_req {
+       /**
+        * The offset range this request covers
+        */
+       struct nrs_orr_req_range        or_range;
+       /**
+        * Round number for this request; shared with all other requests in the
+        * same batch.
+        */
+       __u64                           or_round;
+       /**
+        * Sequence number for this request; shared with all other requests in
+        * the same batch.
+        */
+       __u64                           or_sequence;
+       /**
+        * For debugging purposes.
+        */
+       struct nrs_orr_key              or_key;
+       /**
+        * An ORR policy instance has filled in request information while
+        * enqueueing the request on the service partition's regular NRS head.
+        */
+       unsigned int                    or_orr_set:1;
+       /**
+        * A TRR policy instance has filled in request information while
+        * enqueueing the request on the service partition's regular NRS head.
+        */
+       unsigned int                    or_trr_set:1;
+       /**
+        * Request offset ranges have been filled in with logical offset
+        * values.
+        */
+       unsigned int                    or_logical_set:1;
+       /**
+        * Request offset ranges have been filled in with physical offset
+        * values.
+        */
+       unsigned int                    or_physical_set:1;
+};
+
+/** @} ORR/TRR */
+
+/**
+ * NRS request
+ *
+ * Instances of this object exist embedded within ptlrpc_request; the main
+ * purpose of this object is to hold references to the request's resources
+ * for the lifetime of the request, and to hold properties that policies
+ * use for determining the request's scheduling priority.
+ */
+struct ptlrpc_nrs_request {
+       /**
+        * The request's resource hierarchy.
+        */
+       struct ptlrpc_nrs_resource     *nr_res_ptrs[NRS_RES_MAX];
+       /**
+        * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
+        * policy that was used to enqueue the request.
+        *
+        * \see nrs_request_enqueue()
+        */
+       unsigned                        nr_res_idx;
+       unsigned                        nr_initialized:1;
+       unsigned                        nr_enqueued:1;
+       unsigned                        nr_started:1;
+       unsigned                        nr_finalized:1;
+       cfs_binheap_node_t              nr_node;
+
+       /**
+        * Policy-specific fields, used for determining a request's scheduling
+        * priority, and other supporting functionality.
+        */
+       union {
+               /**
+                * Fields for the FIFO policy
+                */
+               struct nrs_fifo_req     fifo;
+               /**
+                * CRR-N request definition
+                */
+               struct nrs_crrn_req     crr;
+               /** ORR and TRR share the same request definition */
+               struct nrs_orr_req      orr;
+       } nr_u;
+       /**
+        * Externally-registering policies may want to use this to allocate
+        * their own request properties.
+        */
+       void                           *ext;
+};
+
+/** @} nrs */
+
+/**
+ * Basic request prioritization operations structure.
+ * The whole idea is centered around locks and RPCs that might affect locks.
+ * When a lock is contended we try to give priority to RPCs that might lead
+ * to the fastest release of that lock.
+ * Currently implemented only for OSTs, in a way that gives all IO and
+ * truncate RPCs coming from a locked region where a lock is contended
+ * priority over other requests.
+ */
+struct ptlrpc_hpreq_ops {
+       /**
+        * Check if the lock handle of the given lock is the same as
+        * the one taken from the request.
+        */
+       int  (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *);
+       /**
+        * Check if the request is a high priority one.
+        */
+       int  (*hpreq_check)(struct ptlrpc_request *);
+       /**
+        * Called after the request has been handled.
+        */
+       void (*hpreq_fini)(struct ptlrpc_request *);
+};
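+
+/*
+ * Illustrative sketch only, not part of this patch: a minimal ops table for
+ * a service that never promotes requests. A real ->hpreq_check() would
+ * inspect the request against contended locks; returning 0 here ("normal
+ * priority") is an assumption of this example.
+ */
+static int example_hpreq_check(struct ptlrpc_request *req)
+{
+       return 0;
+}
+
+static struct ptlrpc_hpreq_ops example_hpreq_ops = {
+       .hpreq_check = example_hpreq_check,
+};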
+
+/**
+ * Represents remote procedure call.
+ *
+ * This is a staple structure used by everybody wanting to send a request
+ * in Lustre.
+ */
+struct ptlrpc_request {
+       /* Request type: one of PTL_RPC_MSG_* */
+       int rq_type;
+       /** Result of request processing */
+       int rq_status;
+       /**
+        * Linkage item through which this request is included into
+        * sending/delayed lists on client and into rqbd list on server
+        */
+       struct list_head rq_list;
+       /**
+        * Server side list of incoming unserved requests sorted by arrival
+        * time.  Traversed from time to time to notice requests about to
+        * expire and send back "early replies" to clients to let them know
+        * the server is alive and well, just too busy to service their
+        * requests in time.
+        */
+       struct list_head rq_timed_list;
+       /** server-side history, used for debugging purposes. */
+       struct list_head rq_history_list;
+       /** server-side per-export list */
+       struct list_head rq_exp_list;
+       /** server-side hp handlers */
+       struct ptlrpc_hpreq_ops *rq_ops;
+
+       /** initial thread servicing this request */
+       struct ptlrpc_thread *rq_svc_thread;
+
+       /** history sequence # */
+       __u64 rq_history_seq;
+       /** \addtogroup  nrs
+        * @{
+        */
+       /** stub for NRS request */
+       struct ptlrpc_nrs_request rq_nrq;
+       /** @} nrs */
+       /** the index of service's srv_at_array into which request is linked */
+       time_t rq_at_index;
+       /** Lock to protect request flags and some other important bits, like
+        * rq_list
+        */
+       spinlock_t rq_lock;
+       /** client-side flags are serialized by rq_lock */
+       unsigned int rq_intr:1, rq_replied:1, rq_err:1,
+               rq_timedout:1, rq_resend:1, rq_restart:1,
+               /**
+                * when ->rq_replay is set, the request is kept by the client
+                * even after the server commits the corresponding transaction.
+                * This is used for operations that require a sequence of
+                * multiple requests to be replayed. The only example currently
+                * is file open/close. When the last request in such a sequence
+                * is committed, ->rq_replay is cleared on all requests in the
+                * sequence.
+                */
+               rq_replay:1,
+               rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
+               rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
+               rq_early:1, rq_must_unlink:1,
+               rq_memalloc:1,      /* req originated from "kswapd" */
+               /* server-side flags */
+               rq_packed_final:1,  /* packed final reply */
+               rq_hp:1,            /* high priority RPC */
+               rq_at_linked:1,     /* link into service's srv_at_array */
+               rq_reply_truncate:1,
+               rq_committed:1,
+               /* whether the "rq_set" is a valid one */
+               rq_invalid_rqset:1,
+               rq_generation_set:1,
+               /* do not resend request on -EINPROGRESS */
+               rq_no_retry_einprogress:1,
+               /* allow the req to be sent if the import is in recovery
+                * status */
+               rq_allow_replay:1;
+
+       unsigned int rq_nr_resend;
+
+       enum rq_phase rq_phase; /* one of RQ_PHASE_* */
+       enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */
+       atomic_t rq_refcount;/* client-side refcount for SENT race,
+                                   server-side refcount for multiple replies */
+
+       /** Portal to which this request would be sent */
+       short rq_request_portal;  /* XXX FIXME bug 249 */
+       /** Portal where to wait for reply and where reply would be sent */
+       short rq_reply_portal;    /* XXX FIXME bug 249 */
+
+       /**
+        * client-side:
+        * !rq_truncate : # reply bytes actually received,
+        *  rq_truncate : required repbuf_len for resend
+        */
+       int rq_nob_received;
+       /** Request length */
+       int rq_reqlen;
+       /** Reply length */
+       int rq_replen;
+       /** Request message - what client sent */
+       struct lustre_msg *rq_reqmsg;
+       /** Reply message - server response */
+       struct lustre_msg *rq_repmsg;
+       /** Transaction number */
+       __u64 rq_transno;
+       /** xid */
+       __u64 rq_xid;
+       /**
+        * List item for the replay list. Not yet committed requests get linked
+        * there.
+        * Also see \a rq_replay comment above.
+        */
+       struct list_head rq_replay_list;
+
+       /**
+        * security and encryption data
+        * @{ */
+       struct ptlrpc_cli_ctx   *rq_cli_ctx;     /**< client's half ctx */
+       struct ptlrpc_svc_ctx   *rq_svc_ctx;     /**< server's half ctx */
+       struct list_head               rq_ctx_chain;   /**< link to waited ctx */
+
+       struct sptlrpc_flavor    rq_flvr;       /**< for client & server */
+       enum lustre_sec_part     rq_sp_from;
+
+       /* client/server security flags */
+       unsigned int
+                                rq_ctx_init:1,      /* context initiation */
+                                rq_ctx_fini:1,      /* context destroy */
+                                rq_bulk_read:1,     /* request bulk read */
+                                rq_bulk_write:1,    /* request bulk write */
+                                /* server authentication flags */
+                                rq_auth_gss:1,      /* authenticated by gss */
+                                rq_auth_remote:1,   /* authed as remote user */
+                                rq_auth_usr_root:1, /* authed as root */
+                                rq_auth_usr_mdt:1,  /* authed as mdt */
+                                rq_auth_usr_ost:1,  /* authed as ost */
+                                /* security tfm flags */
+                                rq_pack_udesc:1,
+                                rq_pack_bulk:1,
+                                /* doesn't expect reply FIXME */
+                                rq_no_reply:1,
+                                rq_pill_init:1;     /* pill initialized */
+
+       uid_t               rq_auth_uid;        /* authed uid */
+       uid_t               rq_auth_mapped_uid; /* authed uid mapped to */
+
+       /* (server side), pointed directly into req buffer */
+       struct ptlrpc_user_desc *rq_user_desc;
+
+       /* various buffer pointers */
+       struct lustre_msg       *rq_reqbuf;      /* req wrapper */
+       char                *rq_repbuf;      /* rep buffer */
+       struct lustre_msg       *rq_repdata;     /* rep wrapper msg */
+       struct lustre_msg       *rq_clrbuf;      /* only in priv mode */
+       int                   rq_reqbuf_len;  /* req wrapper buf len */
+       int                   rq_reqdata_len; /* req wrapper msg len */
+       int                   rq_repbuf_len;  /* rep buffer len */
+       int                   rq_repdata_len; /* rep wrapper msg len */
+       int                   rq_clrbuf_len;  /* only in priv mode */
+       int                   rq_clrdata_len; /* only in priv mode */
+
+       /** early replies go to offset 0, regular replies go after that */
+       unsigned int         rq_reply_off;
+
+       /** @} */
+
+       /** Fields that help to see if request and reply were swabbed or not */
+       __u32 rq_req_swab_mask;
+       __u32 rq_rep_swab_mask;
+
+       /** What was import generation when this request was sent */
+       int rq_import_generation;
+       enum lustre_imp_state rq_send_state;
+
+       /** how many early replies (for stats) */
+       int rq_early_count;
+
+       /** client+server request */
+       lnet_handle_md_t     rq_req_md_h;
+       struct ptlrpc_cb_id  rq_req_cbid;
+       /** optional time limit for send attempts */
+       cfs_duration_t       rq_delay_limit;
+       /** time request was first queued */
+       cfs_time_t         rq_queued_time;
+
+       /* server-side... */
+       /** request arrival time */
+       struct timeval       rq_arrival_time;
+       /** separated reply state */
+       struct ptlrpc_reply_state *rq_reply_state;
+       /** incoming request buffer */
+       struct ptlrpc_request_buffer_desc *rq_rqbd;
+
+       /** client-only incoming reply */
+       lnet_handle_md_t     rq_reply_md_h;
+       wait_queue_head_t         rq_reply_waitq;
+       struct ptlrpc_cb_id  rq_reply_cbid;
+
+       /** our LNet NID */
+       lnet_nid_t         rq_self;
+       /** Peer description (the other side) */
+       lnet_process_id_t    rq_peer;
+       /** Server-side, export on which request was received */
+       struct obd_export   *rq_export;
+       /** Client side, import where request is being sent */
+       struct obd_import   *rq_import;
+
+       /** Replay callback, called after request is replayed at recovery */
+       void (*rq_replay_cb)(struct ptlrpc_request *);
+       /**
+        * Commit callback, called when request is committed and about to be
+        * freed.
+        */
+       void (*rq_commit_cb)(struct ptlrpc_request *);
+       /** Opaque data for replay and commit callbacks. */
+       void  *rq_cb_data;
+
+       /** For bulk requests on client only: bulk descriptor */
+       struct ptlrpc_bulk_desc *rq_bulk;
+
+       /** client outgoing req */
+       /**
+        * when request/reply sent (secs), or time when request should be sent
+        */
+       time_t rq_sent;
+       /** time for request really sent out */
+       time_t rq_real_sent;
+
+       /** when request must finish. volatile
+        * so that servers' early reply updates to the deadline aren't
+        * kept in per-cpu cache */
+       volatile time_t rq_deadline;
+       /** when req reply unlink must finish. */
+       time_t rq_reply_deadline;
+       /** when req bulk unlink must finish. */
+       time_t rq_bulk_deadline;
+       /**
+        * service time estimate (secs)
+        * If the request is not served by this time, it is marked as timed out.
+        */
+       int    rq_timeout;
+
+       /** Multi-rpc bits */
+       /** Per-request waitq introduced by bug 21938 for recovery waiting */
+       wait_queue_head_t rq_set_waitq;
+       /** Link item for request set lists */
+       struct list_head  rq_set_chain;
+       /** Link back to the request set */
+       struct ptlrpc_request_set *rq_set;
+       /** Async completion handler, called when reply is received */
+       ptlrpc_interpterer_t rq_interpret_reply;
+       /** Async completion context */
+       union ptlrpc_async_args rq_async_args;
+
+       /** Pool if request is from preallocated list */
+       struct ptlrpc_request_pool *rq_pool;
+
+       struct lu_context          rq_session;
+       struct lu_context          rq_recov_session;
+
+       /** request format description */
+       struct req_capsule        rq_pill;
+};
+
+/**
+ * Call the completion handler for the RPC, if any; return its status or the original
+ * rc if there was no handler defined for this request.
+ */
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+                                      struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_interpret_reply != NULL) {
+               req->rq_status = req->rq_interpret_reply(env, req,
+                                                        &req->rq_async_args,
+                                                        rc);
+               return req->rq_status;
+       }
+       return rc;
+}
+
+/** \addtogroup  nrs
+ * @{
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf);
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf);
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req);
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_pol_info *info);
+
+/*
+ * Can the request be moved from the regular NRS head to the high-priority NRS
+ * head (of the same PTLRPC service partition), if any?
+ *
+ * For a reliable result, this should be checked under svcpt->scp_req_lock.
+ */
+static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_request *nrq = &req->rq_nrq;
+
+       /**
+        * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the
+        * request has been enqueued first, and ptlrpc_nrs_request::nr_started
+        * to make sure it has not been scheduled yet (analogous to previous
+        * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list)).
+        */
+       return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp;
+}
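+
+/*
+ * Illustrative usage (a sketch, not part of this patch); the check is taken
+ * under svcpt->scp_req_lock as required above:
+ *
+ *     spin_lock(&svcpt->scp_req_lock);
+ *     can_move = ptlrpc_nrs_req_can_move(req);
+ *     spin_unlock(&svcpt->scp_req_lock);
+ */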
+/** @} nrs */
+
+/**
+ * Returns 1 if request buffer at offset \a index was already swabbed
+ */
+static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+       return req->rq_req_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request reply buffer at offset \a index was already swabbed
+ */
+static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+       return req->rq_rep_swab_mask & (1 << index);
+}
+
+/**
+ * Returns 1 if request needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req)
+{
+       return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Returns 1 if request reply needs to be swabbed into local cpu byteorder
+ */
+static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req)
+{
+       return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+}
+
+/**
+ * Mark the request buffer at offset \a index as already swabbed
+ */
+static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
+       LASSERT((req->rq_req_swab_mask & (1 << index)) == 0);
+       req->rq_req_swab_mask |= 1 << index;
+}
+
+/**
+ * Mark the request reply buffer at offset \a index as already swabbed
+ */
+static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index)
+{
+       LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8);
+       LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0);
+       req->rq_rep_swab_mask |= 1 << index;
+}
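+
+/*
+ * Typical swab pattern (an illustrative sketch, not part of this patch):
+ * swab a reply buffer at most once, recording the fact in the mask so the
+ * same buffer is not swabbed twice. "swabber" stands in for one of the
+ * lustre_swab_*() helpers and "buf" for the corresponding reply buffer.
+ */
+static inline void example_swab_rep_buf(struct ptlrpc_request *req, int index,
+                                       void *buf, void (*swabber)(void *))
+{
+       if (ptlrpc_rep_need_swab(req) && !lustre_rep_swabbed(req, index)) {
+               swabber(buf);
+               lustre_set_rep_swabbed(req, index);
+       }
+}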
+
+/**
+ * Convert numerical request phase value \a phase into text string description
+ */
+static inline const char *
+ptlrpc_phase2str(enum rq_phase phase)
+{
+       switch (phase) {
+       case RQ_PHASE_NEW:
+               return "New";
+       case RQ_PHASE_RPC:
+               return "Rpc";
+       case RQ_PHASE_BULK:
+               return "Bulk";
+       case RQ_PHASE_INTERPRET:
+               return "Interpret";
+       case RQ_PHASE_COMPLETE:
+               return "Complete";
+       case RQ_PHASE_UNREGISTERING:
+               return "Unregistering";
+       default:
+               return "?Phase?";
+       }
+}
+
+/**
+ * Convert numerical request phase of the request \a req into text string
+ * description
+ */
+static inline const char *
+ptlrpc_rqphase2str(struct ptlrpc_request *req)
+{
+       return ptlrpc_phase2str(req->rq_phase);
+}
+
+/**
+ * Debugging functions and helpers to print request structure into debug log
+ * @{
+ */
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+/** Convert bit flags into a string */
+#define DEBUG_REQ_FLAGS(req)                                               \
+       ptlrpc_rqphase2str(req),                                                \
+       FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),                \
+       FLAG(req->rq_err, "E"),                                          \
+       FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"),   \
+       FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),            \
+       FLAG(req->rq_no_resend, "N"),                                      \
+       FLAG(req->rq_waiting, "W"),                                          \
+       FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"),                  \
+       FLAG(req->rq_committed, "M")
+
+#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s"
+
+void _debug_req(struct ptlrpc_request *req,
+               struct libcfs_debug_msg_data *data, const char *fmt, ...)
+       __attribute__ ((format (printf, 3, 4)));
+
+/**
+ * Helper that decides if we need to print the request according to current debug
+ * level settings
+ */
+#define debug_req(msgdata, mask, cdls, req, fmt, a...)                 \
+do {                                                                     \
+       CFS_CHECK_STACK(msgdata, mask, cdls);                            \
+                                                                             \
+       if (((mask) & D_CANTMASK) != 0 ||                                    \
+           ((libcfs_debug & (mask)) != 0 &&                              \
+            (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))          \
+               _debug_req((req), msgdata, fmt, ##a);                    \
+} while (0)
+
+/**
+ * This is the debug print function you need to use to print request structure
+ * content into the lustre debug log.
+ * For most callers (level is a constant) this is resolved at compile time */
+#define DEBUG_REQ(level, req, fmt, args...)                               \
+do {                                                                     \
+       if ((level) & (D_ERROR | D_WARNING)) {                          \
+               static cfs_debug_limit_state_t cdls;                      \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls);          \
+               debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\
+       } else {                                                              \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL);            \
+               debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \
+       }                                                                    \
+} while (0)
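+
+/*
+ * Usage sketch (illustrative): "rc" is whatever status the caller wants
+ * logged; _debug_req() adds the request details to the message.
+ *
+ *     DEBUG_REQ(D_ERROR, req, "processing failed: rc = %d", rc);
+ */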
+/** @} */
+
+/**
+ * Structure that defines a single page of a bulk transfer
+ */
+struct ptlrpc_bulk_page {
+       /** Linkage to list of pages in a bulk */
+       struct list_head       bp_link;
+       /**
+        * Number of bytes in a page to transfer starting from \a bp_pageoffset
+        */
+       int           bp_buflen;
+       /** offset within a page */
+       int           bp_pageoffset;
+       /** The page itself */
+       struct page     *bp_page;
+};
+
+#define BULK_GET_SOURCE   0
+#define BULK_PUT_SINK     1
+#define BULK_GET_SINK     2
+#define BULK_PUT_SOURCE   3
+
+/**
+ * Definition of bulk descriptor.
+ * Bulks are special "two phase" RPCs where the initial request message
+ * is sent first and is followed by a transfer (or receiving) of a large
+ * amount of data to be settled into pages referenced from the bulk descriptors.
+ * Bulk transfers (the actual data following the small requests) are done
+ * on separate LNet portals.
+ * In Lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
+ * Another user is readpage for the MDT.
+ */
+struct ptlrpc_bulk_desc {
+       /** completed with failure */
+       unsigned long bd_failure:1;
+       /** {put,get}{source,sink} */
+       unsigned long bd_type:2;
+       /** client side */
+       unsigned long bd_registered:1;
+       /** For serialization with callback */
+       spinlock_t bd_lock;
+       /** Import generation when request for this bulk was sent */
+       int bd_import_generation;
+       /** LNet portal for this bulk */
+       __u32 bd_portal;
+       /** Server side - export this bulk created for */
+       struct obd_export *bd_export;
+       /** Client side - import this bulk was sent on */
+       struct obd_import *bd_import;
+       /** Back pointer to the request */
+       struct ptlrpc_request *bd_req;
+       wait_queue_head_t           bd_waitq;   /* server side only WQ */
+       int                 bd_iov_count;    /* # entries in bd_iov */
+       int                 bd_max_iov;      /* allocated size of bd_iov */
+       int                 bd_nob;       /* # bytes covered */
+       int                 bd_nob_transferred; /* # bytes GOT/PUT */
+
+       __u64             bd_last_xid;
+
+       struct ptlrpc_cb_id    bd_cbid;  /* network callback info */
+       lnet_nid_t           bd_sender;       /* stash event::sender */
+       int                     bd_md_count;    /* # valid entries in bd_mds */
+       int                     bd_md_max_brw;  /* max entries in bd_mds */
+       /** array of associated MDs */
+       lnet_handle_md_t        bd_mds[PTLRPC_BULK_OPS_COUNT];
+
+       /*
+        * encrypt iov, size is either 0 or bd_iov_count.
+        */
+       lnet_kiov_t        *bd_enc_iov;
+
+       lnet_kiov_t         bd_iov[0];
+};
+
+enum {
+       SVC_STOPPED     = 1 << 0,
+       SVC_STOPPING    = 1 << 1,
+       SVC_STARTING    = 1 << 2,
+       SVC_RUNNING     = 1 << 3,
+       SVC_EVENT       = 1 << 4,
+       SVC_SIGNAL      = 1 << 5,
+};
+
+#define PTLRPC_THR_NAME_LEN            32
+/**
+ * Definition of server service thread structure
+ */
+struct ptlrpc_thread {
+       /**
+        * List of active threads in svc->srv_threads
+        */
+       struct list_head t_link;
+       /**
+        * thread-private data (preallocated memory)
+        */
+       void *t_data;
+       __u32 t_flags;
+       /**
+        * service thread index, from ptlrpc_start_threads
+        */
+       unsigned int t_id;
+       /**
+        * service thread pid
+        */
+       pid_t t_pid;
+       /**
+        * put watchdog in the structure per thread b=14840
+        */
+       struct lc_watchdog *t_watchdog;
+       /**
+        * the svc this thread belongs to b=18582
+        */
+       struct ptlrpc_service_part      *t_svcpt;
+       wait_queue_head_t                       t_ctl_waitq;
+       struct lu_env                   *t_env;
+       char                            t_name[PTLRPC_THR_NAME_LEN];
+};
+
+static inline int thread_is_init(struct ptlrpc_thread *thread)
+{
+       return thread->t_flags == 0;
+}
+
+static inline int thread_is_stopped(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STOPPED);
+}
+
+static inline int thread_is_stopping(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STOPPING);
+}
+
+static inline int thread_is_starting(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_STARTING);
+}
+
+static inline int thread_is_running(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_RUNNING);
+}
+
+static inline int thread_is_event(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_EVENT);
+}
+
+static inline int thread_is_signal(struct ptlrpc_thread *thread)
+{
+       return !!(thread->t_flags & SVC_SIGNAL);
+}
+
+static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags &= ~flags;
+}
+
+static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags = flags;
+}
+
+static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags)
+{
+       thread->t_flags |= flags;
+}
+
+static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread,
+                                             __u32 flags)
+{
+       if (thread->t_flags & flags) {
+               thread->t_flags &= ~flags;
+               return 1;
+       }
+       return 0;
+}
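+
+/*
+ * Illustrative lifecycle sketch (not part of this patch): how a service
+ * thread's t_flags typically move through the SVC_* states defined above.
+ */
+static inline void example_thread_lifecycle(struct ptlrpc_thread *thread)
+{
+       thread_set_flags(thread, SVC_STARTING); /* overwrite: just created */
+       thread_clear_flags(thread, SVC_STARTING);
+       thread_add_flags(thread, SVC_RUNNING);  /* OR in: now serving */
+       thread_clear_flags(thread, SVC_RUNNING);
+       thread_add_flags(thread, SVC_STOPPED);  /* final state */
+}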
+
+/**
+ * Request buffer descriptor structure.
+ * This is a structure that contains one posted request buffer for service.
+ * Once data lands in a buffer, the event callback creates the actual request
+ * and wakes one of the service threads to process the new incoming request.
+ * More than one request can fit into the buffer.
+ */
+struct ptlrpc_request_buffer_desc {
+       /** Link item for rqbds on a service */
+       struct list_head             rqbd_list;
+       /** History of requests for this buffer */
+       struct list_head             rqbd_reqs;
+       /** Back pointer to service for which this buffer is registered */
+       struct ptlrpc_service_part *rqbd_svcpt;
+       /** LNet descriptor */
+       lnet_handle_md_t       rqbd_md_h;
+       int                 rqbd_refcount;
+       /** The buffer itself */
+       char              *rqbd_buffer;
+       struct ptlrpc_cb_id    rqbd_cbid;
+       /**
+        * This "embedded" request structure is only used for the
+        * last request to fit into the buffer
+        */
+       struct ptlrpc_request  rqbd_req;
+};
+
+typedef int  (*svc_handler_t)(struct ptlrpc_request *req);
+
+struct ptlrpc_service_ops {
+       /**
+        * if non-NULL, called during thread creation (ptlrpc_start_thread())
+        * to initialize service-specific per-thread state.
+        */
+       int             (*so_thr_init)(struct ptlrpc_thread *thr);
+       /**
+        * if non-NULL, called during thread shutdown (ptlrpc_main()) to
+        * destruct state created by ->so_thr_init().
+        */
+       void            (*so_thr_done)(struct ptlrpc_thread *thr);
+       /**
+        * Handler function for incoming requests for this service
+        */
+       int             (*so_req_handler)(struct ptlrpc_request *req);
+       /**
+        * function to determine the priority of the request; it is called
+        * for every new request
+        */
+       int             (*so_hpreq_handler)(struct ptlrpc_request *);
+       /**
+        * service-specific print fn
+        */
+       void            (*so_req_printer)(void *, struct ptlrpc_request *);
+};
+
+#ifndef __cfs_cacheline_aligned
+/* NB: put it here to reduce patch dependencies */
+# define __cfs_cacheline_aligned
+#endif
+
+/**
+ * How many high priority requests to serve before serving one normal
+ * priority request
+ */
+#define PTLRPC_SVC_HP_RATIO 10
+
+/**
+ * Definition of PortalRPC service.
+ * The service listens on a particular portal (like a TCP port)
+ * and performs actions for a specific server, like the IO service for an OST
+ * or the general metadata service for an MDS.
+ */
+struct ptlrpc_service {
+       /** serialize /proc operations */
+       spinlock_t                      srv_lock;
+       /** most often accessed fields */
+       /** chain thru all services */
+       struct list_head                      srv_list;
+       /** service operations table */
+       struct ptlrpc_service_ops       srv_ops;
+       /** only statically allocated strings here; we don't clean them */
+       char                       *srv_name;
+       /** only statically allocated strings here; we don't clean them */
+       char                       *srv_thread_name;
+       /** service thread list */
+       struct list_head                      srv_threads;
+       /** # of threads to create for each partition on initialization */
+       int                             srv_nthrs_cpt_init;
+       /** limit of threads number for each partition */
+       int                             srv_nthrs_cpt_limit;
+       /** Root of /proc dir tree for this service */
+       proc_dir_entry_t           *srv_procroot;
+       /** Pointer to statistic data for this service */
+       struct lprocfs_stats       *srv_stats;
+       /** # hp per lp reqs to handle */
+       int                          srv_hpreq_ratio;
+       /** biggest request to receive */
+       int                          srv_max_req_size;
+       /** biggest reply to send */
+       int                          srv_max_reply_size;
+       /** size of individual buffers */
+       int                          srv_buf_size;
+       /** # buffers to allocate in 1 group */
+       int                          srv_nbuf_per_group;
+       /** Local portal on which to receive requests */
+       __u32                      srv_req_portal;
+       /** Portal on the client to send replies to */
+       __u32                      srv_rep_portal;
+       /**
+        * Tags for lu_context associated with this thread, see struct
+        * lu_context.
+        */
+       __u32                      srv_ctx_tags;
+       /** soft watchdog timeout multiplier */
+       int                          srv_watchdog_factor;
+       /** under unregister_service */
+       unsigned                        srv_is_stopping:1;
+
+       /** max # request buffers in history per partition */
+       int                             srv_hist_nrqbds_cpt_max;
+       /** number of CPTs this service bound on */
+       int                             srv_ncpts;
+       /** CPTs array this service bound on */
+       __u32                           *srv_cpts;
+       /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
+       int                             srv_cpt_bits;
+       /** CPT table this service is running over */
+       struct cfs_cpt_table            *srv_cptable;
+       /**
+        * partition data for ptlrpc service
+        */
+       struct ptlrpc_service_part      *srv_parts[0];
+};
+
+/**
+ * Definition of PortalRPC service partition data.
+ * Although a service has only one instance right now, we will have
+ * multiple instances very soon (one instance per CPT).
+ *
+ * it has four locks:
+ * \a scp_lock
+ *    serialize operations on rqbd and requests waiting for preprocess
+ * \a scp_req_lock
+ *    serialize operations on active requests sent to this portal
+ * \a scp_at_lock
+ *    serialize adaptive timeout stuff
+ * \a scp_rep_lock
+ *    serialize operations on RS list (reply states)
+ *
+ * We don't have any use-case to take two or more locks at the same time
+ * for now, so there is no lock order issue.
+ */
+struct ptlrpc_service_part {
+       /** back reference to owner */
+       struct ptlrpc_service           *scp_service __cfs_cacheline_aligned;
+       /* CPT id, reserved */
+       int                             scp_cpt;
+       /** always increasing number */
+       int                             scp_thr_nextid;
+       /** # of starting threads */
+       int                             scp_nthrs_starting;
+       /** # of stopping threads, reserved for shrinking threads */
+       int                             scp_nthrs_stopping;
+       /** # running threads */
+       int                             scp_nthrs_running;
+       /** service threads list */
+       struct list_head                        scp_threads;
+
+       /**
+        * serialize the following fields, used for protecting
+        * rqbd list and incoming requests waiting for preprocess,
+        * threads starting & stopping are also protected by this lock.
+        */
+       spinlock_t                      scp_lock  __cfs_cacheline_aligned;
+       /** total # req buffer descs allocated */
+       int                             scp_nrqbds_total;
+       /** # posted request buffers for receiving */
+       int                             scp_nrqbds_posted;
+       /** in progress of allocating rqbd */
+       int                             scp_rqbd_allocating;
+       /** # incoming reqs */
+       int                             scp_nreqs_incoming;
+       /** request buffers to be reposted */
+       struct list_head                        scp_rqbd_idle;
+       /** req buffers receiving */
+       struct list_head                        scp_rqbd_posted;
+       /** incoming reqs */
+       struct list_head                        scp_req_incoming;
+       /** timeout before re-posting reqs, in ticks */
+       cfs_duration_t                  scp_rqbd_timeout;
+       /**
+        * all threads sleep on this. This wait-queue is signalled when a new
+        * incoming request arrives and when a difficult reply has to be handled.
+        */
+       wait_queue_head_t                       scp_waitq;
+
+       /** request history */
+       struct list_head                        scp_hist_reqs;
+       /** request buffer history */
+       struct list_head                        scp_hist_rqbds;
+       /** # request buffers in history */
+       int                             scp_hist_nrqbds;
+       /** sequence number for request */
+       __u64                           scp_hist_seq;
+       /** highest seq culled from history */
+       __u64                           scp_hist_seq_culled;
+
+       /**
+        * serialize the following fields, used for processing requests
+        * sent to this portal
+        */
+       spinlock_t                      scp_req_lock __cfs_cacheline_aligned;
+       /** # reqs in either of the NRS heads below */
+       /** # reqs being served */
+       int                             scp_nreqs_active;
+       /** # HPreqs being served */
+       int                             scp_nhreqs_active;
+       /** # hp requests handled */
+       int                             scp_hreq_count;
+
+       /** NRS head for regular requests */
+       struct ptlrpc_nrs               scp_nrs_reg;
+       /** NRS head for HP requests; this is only valid for services that can
+        *  handle HP requests */
+       struct ptlrpc_nrs              *scp_nrs_hp;
+
+       /** AT stuff */
+       /** @{ */
+       /**
+        * serialize the following fields, used for changes on
+        * adaptive timeout
+        */
+       spinlock_t                      scp_at_lock __cfs_cacheline_aligned;
+       /** estimated rpc service time */
+       struct adaptive_timeout         scp_at_estimate;
+       /** reqs waiting for replies */
+       struct ptlrpc_at_array          scp_at_array;
+       /** early reply timer */
+       timer_list_t                    scp_at_timer;
+       /** debug */
+       cfs_time_t                      scp_at_checktime;
+       /** check early replies */
+       unsigned                        scp_at_check;
+       /** @} */
+
+       /**
+        * serialize the following fields, used for processing
+        * replies for this portal
+        */
+       spinlock_t                      scp_rep_lock __cfs_cacheline_aligned;
+       /** all the active replies */
+       struct list_head                        scp_rep_active;
+       /** List of free reply_states */
+       struct list_head                        scp_rep_idle;
+       /** waitq to run, when adding stuff to scp_rep_idle */
+       wait_queue_head_t                       scp_rep_waitq;
+       /** # 'difficult' replies */
+       atomic_t                        scp_nreps_difficult;
+};
+
+#define ptlrpc_service_for_each_part(part, i, svc)                     \
+       for (i = 0;                                                     \
+            i < (svc)->srv_ncpts &&                                    \
+            (svc)->srv_parts != NULL &&                                \
+            ((part) = (svc)->srv_parts[i]) != NULL; i++)
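+
+/*
+ * Usage sketch for the iterator above (illustrative only): summing the
+ * active request counts over all partitions of a service.
+ */
+static inline int example_count_active_reqs(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part *part;
+       int i;
+       int total = 0;
+
+       ptlrpc_service_for_each_part(part, i, svc)
+               total += part->scp_nreqs_active;
+
+       return total;
+}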
+
+/**
+ * Declaration of ptlrpcd control structure
+ */
+struct ptlrpcd_ctl {
+       /**
+        * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
+        */
+       unsigned long                   pc_flags;
+       /**
+        * Thread lock protecting structure fields.
+        */
+       spinlock_t                      pc_lock;
+       /**
+        * Start completion.
+        */
+       struct completion               pc_starting;
+       /**
+        * Stop completion.
+        */
+       struct completion               pc_finishing;
+       /**
+        * Thread requests set.
+        */
+       struct ptlrpc_request_set  *pc_set;
+       /**
+        * Thread name used in cfs_daemonize()
+        */
+       char                    pc_name[16];
+       /**
+        * Environment for request interpreters to run in.
+        */
+       struct lu_env          pc_env;
+       /**
+        * Index of ptlrpcd thread in the array.
+        */
+       int                      pc_index;
+       /**
+        * Number of the ptlrpcd's partners.
+        */
+       int                      pc_npartners;
+       /**
+        * Pointer to the array of partners' ptlrpcd_ctl structure.
+        */
+       struct ptlrpcd_ctl      **pc_partners;
+       /**
+        * Record the partner index to be processed next.
+        */
+       int                      pc_cursor;
+};
+
+/* Bits for pc_flags */
+enum ptlrpcd_ctl_flags {
+       /**
+        * Ptlrpc thread start flag.
+        */
+       LIOD_START       = 1 << 0,
+       /**
+        * Ptlrpc thread stop flag.
+        */
+       LIOD_STOP       = 1 << 1,
+       /**
+        * Ptlrpc thread force flag (only stop force so far).
+        * This will cause aborting any inflight rpcs handled
+        * by thread if LIOD_STOP is specified.
+        */
+       LIOD_FORCE       = 1 << 2,
+       /**
+        * This is a recovery ptlrpc thread.
+        */
+       LIOD_RECOVERY    = 1 << 3,
+       /**
+        * The ptlrpcd is bound to some CPU core.
+        */
+       LIOD_BIND       = 1 << 4,
+};
+
+/**
+ * \addtogroup nrs
+ * @{
+ *
+ * Service compatibility function; the policy is compatible with all services.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval true The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       return true;
+}
+
+/**
+ * Service compatibility function; the policy is compatible only with a
+ * specific service, identified by its human-readable name in
+ * ptlrpc_service::srv_name.
+ *
+ * \param[in] svc  The service the policy is attempting to register with.
+ * \param[in] desc The policy descriptor
+ *
+ * \retval false The policy is not compatible with the service
+ * \retval true         The policy is compatible with the service
+ *
+ * \see ptlrpc_nrs_pol_desc::pd_compat()
+ */
+static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       LASSERT(desc->pd_compat_svc_name != NULL);
+       return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0;
+}
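+
+/*
+ * Illustrative sketch (the values are assumptions of this example, not
+ * taken from this patch): a policy limited to one service would set in
+ * its ptlrpc_nrs_pol_desc
+ *
+ *     .pd_compat          = nrs_policy_compat_one,
+ *     .pd_compat_svc_name = "ost_io",
+ *
+ * while a policy usable everywhere would simply use nrs_policy_compat_all.
+ */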
+
+/** @} nrs */
+
+/* ptlrpc/events.c */
+extern lnet_handle_eq_t ptlrpc_eq_h;
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+                              lnet_process_id_t *peer, lnet_nid_t *self);
+/**
+ * These callbacks are invoked by LNet when something happened to
+ * underlying buffer
+ * @{
+ */
+extern void request_out_callback(lnet_event_t *ev);
+extern void reply_in_callback(lnet_event_t *ev);
+extern void client_bulk_callback(lnet_event_t *ev);
+extern void request_in_callback(lnet_event_t *ev);
+extern void reply_out_callback(lnet_event_t *ev);
+/** @} */
+
+/* ptlrpc/connection.c */
+struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer,
+                                               lnet_nid_t self,
+                                               struct obd_uuid *uuid);
+int ptlrpc_connection_put(struct ptlrpc_connection *c);
+struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
+int ptlrpc_connection_init(void);
+void ptlrpc_connection_fini(void);
+extern lnet_pid_t ptl_get_pid(void);
+
+/* ptlrpc/niobuf.c */
+/**
+ * Actual interfacing with LNet to put/get/register/unregister stuff
+ * @{
+ */
+
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async);
+
+static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc;
+       int                   rc;
+
+       LASSERT(req != NULL);
+       desc = req->rq_bulk;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+           req->rq_bulk_deadline > cfs_time_current_sec())
+               return 1;
+
+       if (!desc)
+               return 0;
+
+       spin_lock(&desc->bd_lock);
+       rc = desc->bd_md_count;
+       spin_unlock(&desc->bd_lock);
+       return rc;
+}
+
+#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01
+#define PTLRPC_REPLY_EARLY        0x02
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags);
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_send_error(struct ptlrpc_request *req, int difficult);
+int ptlrpc_error(struct ptlrpc_request *req);
+void ptlrpc_resend_req(struct ptlrpc_request *request);
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd);
+/** @} */
+
+/* ptlrpc/client.c */
+/**
+ * Client-side portals API. Everything to send requests, receive replies,
+ * request queues, request management, etc.
+ * @{
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+                       struct ptlrpc_client *);
+void ptlrpc_cleanup_client(struct obd_import *imp);
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
+
+int ptlrpc_queue_wait(struct ptlrpc_request *req);
+int ptlrpc_replay_req(struct ptlrpc_request *req);
+int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async);
+void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+void ptlrpc_cleanup_imp(struct obd_import *imp);
+void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+                                            void *arg);
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+                     set_interpreter_func fn, void *data);
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *);
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+int ptlrpc_expired_set(void *data);
+void ptlrpc_interrupted_set(void *data);
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                           struct ptlrpc_request *req);
+
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
+
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int, int,
+                   void (*populate_pool)(struct ptlrpc_request_pool *, int));
+
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+                                           const struct req_format *format);
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+                                           struct ptlrpc_request_pool *,
+                                           const struct req_format *format);
+void ptlrpc_request_free(struct ptlrpc_request *request);
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+                       __u32 version, int opcode);
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+                                               const struct req_format *format,
+                                               __u32 version, int opcode);
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                            __u32 version, int opcode, char **bufs,
+                            struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
+                                      int opcode, int count, __u32 *lengths,
+                                      char **bufs);
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp,
+                                            __u32 version, int opcode,
+                                           int count, __u32 *lengths, char **bufs,
+                                           struct ptlrpc_request_pool *pool);
+void ptlrpc_req_finished(struct ptlrpc_request *request);
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request);
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+                                             unsigned npages, unsigned max_brw,
+                                             unsigned type, unsigned portal);
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin);
+static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk)
+{
+       __ptlrpc_free_bulk(bulk, 1);
+}
+static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk)
+{
+       __ptlrpc_free_bulk(bulk, 0);
+}
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                            struct page *page, int pageoffset, int len, int);
+static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc,
+                                            struct page *page, int pageoffset,
+                                            int len)
+{
+       __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1);
+}
+
+static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc,
+                                              struct page *page, int pageoffset,
+                                              int len)
+{
+       __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0);
+}
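+
+/*
+ * Illustrative sketch (the bulk type, page length and error convention are
+ * assumptions of this example): a client preparing a bulk descriptor for a
+ * write, where the server GETs data from the client, attaching whole pages.
+ */
+static inline struct ptlrpc_bulk_desc *
+example_prep_bulk_write(struct ptlrpc_request *req, struct page **pages,
+                       unsigned npages, unsigned portal)
+{
+       struct ptlrpc_bulk_desc *desc;
+       unsigned i;
+
+       desc = ptlrpc_prep_bulk_imp(req, npages, 1, BULK_GET_SOURCE, portal);
+       if (desc == NULL)
+               return NULL;
+
+       for (i = 0; i < npages; i++)
+               ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_SIZE);
+
+       return desc;
+}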
+
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+                                     struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
+
+/* Set of routines to run a function in ptlrpcd context */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+                        int (*cb)(const struct lu_env *, void *), void *data);
+void ptlrpcd_destroy_work(void *handler);
+int ptlrpcd_queue_work(void *handler);
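+
+/*
+ * Typical synchronous client sequence (an illustrative sketch; error
+ * handling is abbreviated and ptlrpc_request_set_replen() is declared
+ * further below):
+ *
+ *     req = ptlrpc_request_alloc_pack(imp, format, version, opcode);
+ *     if (req == NULL)
+ *             return -ENOMEM;
+ *     ptlrpc_request_set_replen(req);
+ *     rc = ptlrpc_queue_wait(req);
+ *     ptlrpc_req_finished(req);
+ */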
+
+/** @} */
+struct ptlrpc_service_buf_conf {
+       /* bc_nbufs is the # of buffers to allocate when growing the pool */
+       unsigned int                    bc_nbufs;
+       /* buffer size to post */
+       unsigned int                    bc_buf_size;
+       /* portal to listen for requests on */
+       unsigned int                    bc_req_portal;
+       /* portal to send replies to */
+       unsigned int                    bc_rep_portal;
+       /* maximum request size to be accepted for this service */
+       unsigned int                    bc_req_max_size;
+       /* maximum reply size this service can ever send */
+       unsigned int                    bc_rep_max_size;
+};
+
+struct ptlrpc_service_thr_conf {
+       /* threadname should be 8 characters or less - 6 will be added on */
+       char                            *tc_thr_name;
+       /* threads increasing factor for each CPU */
+       unsigned int                    tc_thr_factor;
+       /* service threads # to start on each partition while initializing */
+       unsigned int                    tc_nthrs_init;
+       /*
+        * low water of threads # upper-limit on each partition while running,
+        * low water mark of the per-partition thread # upper limit while
+        * running; service availability may be impacted if the number of
+        * threads is lower than this value. It can be ZERO if the service
+        * doesn't require CPU affinity or there is only one partition.
+       unsigned int                    tc_nthrs_base;
+       /* "soft" limit for total threads number */
+       unsigned int                    tc_nthrs_max;
+       /* user-specified thread count; it will be validated against the
+        * other members of this structure. */
+       unsigned int                    tc_nthrs_user;
+       /* set NUMA node affinity for service threads */
+       unsigned int                    tc_cpu_affinity;
+       /* Tags for lu_context associated with service thread */
+       __u32                           tc_ctx_tags;
+};
+
+struct ptlrpc_service_cpt_conf {
+       struct cfs_cpt_table            *cc_cptable;
+       /* string pattern to describe CPTs for a service */
+       char                            *cc_pattern;
+};
+
+struct ptlrpc_service_conf {
+       /* service name */
+       char                            *psc_name;
+       /* soft watchdog timeout multiplier to print stuck service traces */
+       unsigned int                    psc_watchdog_factor;
+       /* buffer information */
+       struct ptlrpc_service_buf_conf  psc_buf;
+       /* thread information */
+       struct ptlrpc_service_thr_conf  psc_thr;
+       /* CPU partition information */
+       struct ptlrpc_service_cpt_conf  psc_cpt;
+       /* function table */
+       struct ptlrpc_service_ops       psc_ops;
+};
+
+/* ptlrpc/service.c */
+/**
+ * Server-side services API. Register/unregister service, request state
+ * management, service thread management
+ *
+ * @{
+ */
+void ptlrpc_save_lock(struct ptlrpc_request *req,
+                     struct lustre_handle *lock, int mode, int no_ack);
+void ptlrpc_commit_replies(struct obd_export *exp);
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs);
+void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs);
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req);
+struct ptlrpc_service *ptlrpc_register_service(
+                               struct ptlrpc_service_conf *conf,
+                               struct proc_dir_entry *proc_entry);
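+
+/*
+ * Illustrative registration sketch (every value below is a placeholder
+ * chosen for this example, not taken from this patch):
+ */
+static inline struct ptlrpc_service *
+example_register_service(struct proc_dir_entry *proc_entry,
+                        svc_handler_t handler)
+{
+       struct ptlrpc_service_conf conf = {
+               .psc_name               = "example_svc",
+               .psc_watchdog_factor    = 1,
+               .psc_buf                = {
+                       .bc_nbufs        = 64,
+                       .bc_buf_size     = 8192,
+                       .bc_req_portal   = 0,   /* placeholder portal */
+                       .bc_rep_portal   = 0,   /* placeholder portal */
+                       .bc_req_max_size = 8192,
+                       .bc_rep_max_size = 8192,
+               },
+               .psc_thr                = {
+                       .tc_thr_name    = "example",
+                       .tc_nthrs_init  = 2,
+                       .tc_nthrs_max   = 8,
+               },
+               .psc_ops                = {
+                       .so_req_handler = handler,
+               },
+       };
+
+       return ptlrpc_register_service(&conf, proc_entry);
+}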
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc);
+int ptlrpc_unregister_service(struct ptlrpc_service *service);
+int liblustre_check_services(void *arg);
+void ptlrpc_daemonize(char *name);
+int ptlrpc_service_health_check(struct ptlrpc_service *);
+void ptlrpc_server_drop_request(struct ptlrpc_request *req);
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+                                 struct obd_export *export);
+
+int ptlrpc_hr_init(void);
+void ptlrpc_hr_fini(void);
+
+/** @} */
+
+/* ptlrpc/import.c */
+/**
+ * Import API
+ * @{
+ */
+int ptlrpc_connect_import(struct obd_import *imp);
+int ptlrpc_init_import(struct obd_import *imp);
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+void deuuidify(char *uuid, const char *prefix, char **uuid_start,
+              int *uuid_len);
+
+/* ptlrpc/pack_generic.c */
+int ptlrpc_reconnect_import(struct obd_import *imp);
+/** @} */
+
+/**
+ * ptlrpc msg buffer and swab interface
+ *
+ * @{
+ */
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+                        int index);
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+                               int index);
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len);
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len);
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+                       char **bufs);
+int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count,
+                       __u32 *lens, char **bufs);
+int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens,
+                     char **bufs);
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+                        __u32 *lens, char **bufs, int flags);
+#define LPRFL_EARLY_REPLY 1
+int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens,
+                           char **bufs, int flags);
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+                     unsigned int newlen, int move_data);
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
+int __lustre_unpack_msg(struct lustre_msg *m, int len);
+int lustre_msg_hdr_size(__u32 magic, int count);
+int lustre_msg_size(__u32 magic, int count, __u32 *lengths);
+int lustre_msg_size_v2(int count, __u32 *lengths);
+int lustre_packed_msg_size(struct lustre_msg *msg);
+int lustre_msg_early_size(void);
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size);
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+int lustre_msg_buflen(struct lustre_msg *m, int n);
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len);
+int lustre_msg_bufcount(struct lustre_msg *m);
+char *lustre_msg_string(struct lustre_msg *m, int n, int max_len);
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg);
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags);
+__u32 lustre_msg_get_flags(struct lustre_msg *msg);
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags);
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg);
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags);
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags);
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg);
+__u32 lustre_msg_get_type(struct lustre_msg *msg);
+__u32 lustre_msg_get_version(struct lustre_msg *msg);
+void lustre_msg_add_version(struct lustre_msg *msg, int version);
+__u32 lustre_msg_get_opc(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg);
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg);
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg);
+__u64 lustre_msg_get_transno(struct lustre_msg *msg);
+__u64 lustre_msg_get_slv(struct lustre_msg *msg);
+__u32 lustre_msg_get_limit(struct lustre_msg *msg);
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv);
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit);
+int lustre_msg_get_status(struct lustre_msg *msg);
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg);
+int lustre_msg_is_v1(struct lustre_msg *msg);
+__u32 lustre_msg_get_magic(struct lustre_msg *msg);
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg);
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg);
+char *lustre_msg_get_jobid(struct lustre_msg *msg);
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18);
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg);
+#endif
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle);
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type);
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc);
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid);
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed);
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions);
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno);
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status);
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt);
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes);
+void ptlrpc_request_set_replen(struct ptlrpc_request *req);
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout);
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time);
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid);
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum);
+
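+/*
+ * Illustrative sketch (not part of the original header): callers manipulate
+ * message flags through the accessors above rather than touching lustre_msg
+ * fields directly, e.g. (assuming the MSG_RESENT flag from lustre_idl.h):
+ *
+ *     if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)
+ *             lustre_msg_add_flags(req->rq_repmsg, MSG_RESENT);
+ */
+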
+static inline void
+lustre_shrink_reply(struct ptlrpc_request *req, int segment,
+                   unsigned int newlen, int move_data)
+{
+       LASSERT(req->rq_reply_state);
+       LASSERT(req->rq_repmsg);
+       req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment,
+                                          newlen, move_data);
+}
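+
+/*
+ * Usage sketch (illustrative, not part of the original source): a server
+ * handler that packed fewer bytes than it reserved can give the space back
+ * before the reply is sent; the segment index 1 below is hypothetical:
+ *
+ *     lustre_shrink_reply(req, 1, actual_len, 1);
+ */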
+/** @} */
+
+/** Change request phase of \a req to \a new_phase */
+static inline void
+ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase)
+{
+       if (req->rq_phase == new_phase)
+               return;
+
+       if (new_phase == RQ_PHASE_UNREGISTERING) {
+               req->rq_next_phase = req->rq_phase;
+               if (req->rq_import)
+                       atomic_inc(&req->rq_import->imp_unregistering);
+       }
+
+       if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+               if (req->rq_import)
+                       atomic_dec(&req->rq_import->imp_unregistering);
+       }
+
+       DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"",
+                 ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase));
+
+       req->rq_phase = new_phase;
+}
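+
+/*
+ * Illustrative sketch (not part of the original source): phase changes are
+ * funneled through the helper above so the imp_unregistering counter and the
+ * debug trace stay consistent, e.g.:
+ *
+ *     ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+ *     ...
+ *     ptlrpc_rqphase_move(req, req->rq_next_phase);
+ */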
+
+/**
+ * Returns true if request \a req got an early reply and the hard deadline is not met
+ */
+static inline int
+ptlrpc_client_early(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 0;
+       return req->rq_early;
+}
+
+/**
+ * Returns true if we got a real reply from the server for this request
+ */
+static inline int
+ptlrpc_client_replied(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 0;
+       return req->rq_replied;
+}
+
+/** Returns true if request \a req is in the process of receiving the server reply */
+static inline int
+ptlrpc_client_recv(struct ptlrpc_request *req)
+{
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec())
+               return 1;
+       return req->rq_receiving_reply;
+}
+
+static inline int
+ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req)
+{
+       int rc;
+
+       spin_lock(&req->rq_lock);
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           req->rq_reply_deadline > cfs_time_current_sec()) {
+               spin_unlock(&req->rq_lock);
+               return 1;
+       }
+       rc = req->rq_receiving_reply || req->rq_must_unlink;
+       spin_unlock(&req->rq_lock);
+       return rc;
+}
+
+static inline void
+ptlrpc_client_wake_req(struct ptlrpc_request *req)
+{
+       if (req->rq_set == NULL)
+               wake_up(&req->rq_reply_waitq);
+       else
+               wake_up(&req->rq_set->set_waitq);
+}
+
+static inline void
+ptlrpc_rs_addref(struct ptlrpc_reply_state *rs)
+{
+       LASSERT(atomic_read(&rs->rs_refcount) > 0);
+       atomic_inc(&rs->rs_refcount);
+}
+
+static inline void
+ptlrpc_rs_decref(struct ptlrpc_reply_state *rs)
+{
+       LASSERT(atomic_read(&rs->rs_refcount) > 0);
+       if (atomic_dec_and_test(&rs->rs_refcount))
+               lustre_free_reply_state(rs);
+}
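+
+/*
+ * Illustrative sketch (not part of the original source): reply-state
+ * references must be strictly paired, e.g. to keep \a rs alive across a
+ * lock drop:
+ *
+ *     ptlrpc_rs_addref(rs);
+ *     ...
+ *     ptlrpc_rs_decref(rs);   // frees rs on the last reference
+ */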
+
+/* Should only be called once per req */
+static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req)
+{
+       if (req->rq_reply_state == NULL)
+               return; /* shouldn't occur */
+       ptlrpc_rs_decref(req->rq_reply_state);
+       req->rq_reply_state = NULL;
+       req->rq_repmsg = NULL;
+}
+
+static inline __u32 lustre_request_magic(struct ptlrpc_request *req)
+{
+       return lustre_msg_get_magic(req->rq_reqmsg);
+}
+
+static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req)
+{
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return req->rq_reqmsg->lm_repsize;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n",
+                        req->rq_reqmsg->lm_magic);
+               return -EFAULT;
+       }
+}
+
+static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req)
+{
+       if (req->rq_delay_limit != 0 &&
+           cfs_time_before(cfs_time_add(req->rq_queued_time,
+                                        cfs_time_seconds(req->rq_delay_limit)),
+                           cfs_time_current())) {
+               return 1;
+       }
+       return 0;
+}
+
+static inline int ptlrpc_no_resend(struct ptlrpc_request *req)
+{
+       if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) {
+               spin_lock(&req->rq_lock);
+               req->rq_no_resend = 1;
+               spin_unlock(&req->rq_lock);
+       }
+       return req->rq_no_resend;
+}
+
+static inline int
+ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
+{
+       int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate);
+
+       return svcpt->scp_service->srv_watchdog_factor *
+              max_t(int, at, obd_timeout);
+}
+
+static inline struct ptlrpc_service *
+ptlrpc_req2svc(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_rqbd != NULL);
+       return req->rq_rqbd->rqbd_svcpt->scp_service;
+}
+
+/* ldlm/ldlm_lib.c */
+/**
+ * Target client logic
+ * @{
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg);
+int client_obd_cleanup(struct obd_device *obddev);
+int client_connect_import(const struct lu_env *env,
+                         struct obd_export **exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid, struct obd_connect_data *,
+                         void *localdata);
+int client_disconnect_export(struct obd_export *exp);
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority);
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid);
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+                           struct obd_uuid *uuid);
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
+void client_destroy_import(struct obd_import *imp);
+/** @} */
+
+
+/* ptlrpc/pinger.c */
+/**
+ * Pinger API (client side only)
+ * @{
+ */
+enum timeout_event {
+       TIMEOUT_GRANT = 1
+};
+struct timeout_item;
+typedef int (*timeout_cb_t)(struct timeout_item *, void *);
+int ptlrpc_pinger_add_import(struct obd_import *imp);
+int ptlrpc_pinger_del_import(struct obd_import *imp);
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+                             timeout_cb_t cb, void *data,
+                             struct list_head *obd_list);
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+                             enum timeout_event event);
+struct ptlrpc_request *ptlrpc_prep_ping(struct obd_import *imp);
+int ptlrpc_obd_ping(struct obd_device *obd);
+cfs_time_t ptlrpc_suspend_wakeup_time(void);
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
+void ptlrpc_pinger_ir_up(void);
+void ptlrpc_pinger_ir_down(void);
+/** @} */
+int ptlrpc_pinger_suppress_pings(void);
+
+/* ptlrpc daemon bind policy */
+typedef enum {
+       /* all ptlrpcd threads are in free mode */
+       PDB_POLICY_NONE   = 1,
+       /* all ptlrpcd threads are in bound mode */
+       PDB_POLICY_FULL   = 2,
+       /* <free1 bound1> <free2 bound2> ... <freeN boundN> */
+       PDB_POLICY_PAIR   = 3,
+       /* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>,
+        * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1].
+        * If the kernel supports NUMA, ptlrpcd threads are bound and
+        * grouped by NUMA node. */
+       PDB_POLICY_NEIGHBOR      = 4,
+} pdb_policy_t;
+
+/* ptlrpc daemon load policy
+ * It is the caller's duty to specify how to push an async RPC into some
+ * ptlrpcd queue, but this is not strictly enforced and is affected by
+ * "ptlrpcd_bind_policy": if it is "PDB_POLICY_FULL", the RPC will be
+ * processed by the selected ptlrpcd; otherwise, the RPC may be processed
+ * by either the selected ptlrpcd or its partner, whichever is scheduled
+ * first, to accelerate RPC processing. */
+typedef enum {
+       /* on the same CPU core as the caller */
+       PDL_POLICY_SAME  = 1,
+       /* within the same CPU partition, but not the same core as the caller */
+       PDL_POLICY_LOCAL        = 2,
+       /* round-robin on all CPU cores, but not the same core as the caller */
+       PDL_POLICY_ROUND        = 3,
+       /* the specified CPU core is preferred, but not enforced */
+       PDL_POLICY_PREFERRED    = 4,
+} pdl_policy_t;
+
+/* ptlrpc/ptlrpcd.c */
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
+void ptlrpcd_free(struct ptlrpcd_ctl *pc);
+void ptlrpcd_wake(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx);
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
+int ptlrpcd_addref(void);
+void ptlrpcd_decref(void);
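+
+/*
+ * Illustrative sketch (not part of the original source): a typical async
+ * caller pins the ptlrpcd threads, queues the request under one of the
+ * pdl_policy_t load policies above, and drops its reference when done
+ * (assuming ptlrpcd_addref() returns 0 on success):
+ *
+ *     if (ptlrpcd_addref() == 0) {
+ *             ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ *             ...
+ *             ptlrpcd_decref();
+ *     }
+ */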
+
+/* ptlrpc/lproc_ptlrpc.c */
+/**
+ * procfs output related functions
+ * @{
+ */
+const char *ll_opcode2str(__u32 opcode);
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd);
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes);
+#else
+static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {}
+static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {}
+#endif
+/** @} */
+
+/* ptlrpc/llog_server.c */
+int llog_origin_handle_open(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
+int llog_origin_handle_next_block(struct ptlrpc_request *req);
+int llog_origin_handle_read_header(struct ptlrpc_request *req);
+int llog_origin_handle_close(struct ptlrpc_request *req);
+int llog_origin_handle_cancel(struct ptlrpc_request *req);
+
+/* ptlrpc/llog_client.c */
+extern struct llog_operations llog_client_ops;
+
+/** @} net */
+
+#endif
+/** @} PtlRPC */
diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h
new file mode 100644 (file)
index 0000000..ed65468
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_param.h
+ *
+ * User-settable parameter keys
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_PARAM_H
+#define _LUSTRE_PARAM_H
+
+/** \defgroup param param
+ *
+ * @{
+ */
+
+/* For interoperability */
+struct cfg_interop_param {
+       char *old_param;
+       char *new_param;
+};
+
+/* obd_config.c */
+int class_find_param(char *buf, char *key, char **valp);
+struct cfg_interop_param *class_find_old_param(const char *param,
+                                              struct cfg_interop_param *ptr);
+int class_get_next_param(char **params, char *copy);
+int class_match_param(char *buf, char *key, char **valp);
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh);
+int class_parse_net(char *buf, __u32 *net, char **endh);
+int class_match_nid(char *buf, char *key, lnet_nid_t nid);
+int class_match_net(char *buf, char *key, __u32 net);
+/* obd_mount.c */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+           char *s1, char *s2, char *s3, char *s4);
+
+
+
+/****************** User-settable parameter keys *********************/
+/* e.g.
+       tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda
+       lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0
+                   ... testfs-MDT0000.lov.stripesize=4M
+                   ... testfs-OST0000.ost.client_cache_seconds=15
+                   ... testfs.sys.timeout=<secs>
+                   ... testfs.llite.max_read_ahead_mb=16
+*/
+
+/* System global or special params not handled in obd's proc
+ * See mgs_write_log_sys()
+ */
+#define PARAM_TIMEOUT          "timeout="          /* global */
+#define PARAM_LDLM_TIMEOUT     "ldlm_timeout="     /* global */
+#define PARAM_AT_MIN           "at_min="           /* global */
+#define PARAM_AT_MAX           "at_max="           /* global */
+#define PARAM_AT_EXTRA         "at_extra="         /* global */
+#define PARAM_AT_EARLY_MARGIN  "at_early_margin="  /* global */
+#define PARAM_AT_HISTORY       "at_history="       /* global */
+#define PARAM_JOBID_VAR        "jobid_var="        /* global */
+#define PARAM_MGSNODE          "mgsnode="          /* only at mount time */
+#define PARAM_FAILNODE         "failover.node="    /* add failover nid */
+#define PARAM_FAILMODE         "failover.mode="    /* initial mount only */
+#define PARAM_ACTIVE           "active="           /* activate/deactivate */
+#define PARAM_NETWORK          "network="          /* bind on nid */
+#define PARAM_ID_UPCALL        "identity_upcall="  /* identity upcall */
+
+/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */
+#define PARAM_OST              "ost."
+#define PARAM_OSC              "osc."
+#define PARAM_MDT              "mdt."
+#define PARAM_MDD              "mdd."
+#define PARAM_MDC              "mdc."
+#define PARAM_LLITE            "llite."
+#define PARAM_LOV              "lov."
+#define PARAM_LOD              "lod."
+#define PARAM_OSP              "osp."
+#define PARAM_SYS              "sys."              /* global */
+#define PARAM_SRPC             "srpc."
+#define PARAM_SRPC_FLVR        "srpc.flavor."
+#define PARAM_SRPC_UDESC       "srpc.udesc.cli2mdt"
+#define PARAM_SEC              "security."
+#define PARAM_QUOTA            "quota."            /* global */
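+
+/*
+ * Illustrative sketch (not part of the original header): the helpers declared
+ * above match these keys inside a configuration string, e.g. (assuming
+ * class_match_param() returns 0 on a match):
+ *
+ *     char *val;
+ *     lnet_nid_t nid;
+ *
+ *     if (class_match_param(param, PARAM_FAILNODE, &val) == 0)
+ *             rc = class_parse_nid(val, &nid, &val);
+ */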
+
+/** @} param */
+
+#endif /* _LUSTRE_PARAM_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h
new file mode 100644 (file)
index 0000000..1c3041f
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LUSTRE_QUOTA_H
+#define _LUSTRE_QUOTA_H
+
+/** \defgroup quota quota
+ *
+ */
+
+#include <linux/lustre_quota.h>
+
+#include <dt_object.h>
+#include <lustre_fid.h>
+#include <lustre_dlm.h>
+
+#ifndef MAX_IQ_TIME
+#define MAX_IQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+#ifndef MAX_DQ_TIME
+#define MAX_DQ_TIME  604800     /* (7*24*60*60) 1 week */
+#endif
+
+struct lquota_id_info;
+struct lquota_trans;
+
+/* Gather all quota record types in a union that can be used to read any
+ * record from disk. All fields of these records must be 64-bit aligned,
+ * otherwise the OSD layer may swab them incorrectly. */
+union lquota_rec {
+       struct lquota_glb_rec   lqr_glb_rec;
+       struct lquota_slv_rec   lqr_slv_rec;
+       struct lquota_acct_rec  lqr_acct_rec;
+};
+
+/* Index features supported by the global index objects
+ * Only used for migration purpose and should be removed once on-disk migration
+ * is no longer needed */
+extern struct dt_index_features dt_quota_iusr_features;
+extern struct dt_index_features dt_quota_busr_features;
+extern struct dt_index_features dt_quota_igrp_features;
+extern struct dt_index_features dt_quota_bgrp_features;
+
+/* Names used in the configuration logs to identify the default metadata pool
+ * (composed of all the MDTs, with pool ID 0) and the default data pool (all
+ * the OSTs, with pool ID 0 too). */
+#define QUOTA_METAPOOL_NAME   "mdt="
+#define QUOTA_DATAPOOL_NAME   "ost="
+
+/*
+ * Quota Master Target support
+ */
+
+/* Request handlers for quota master operations.
+ * This is used by the MDT to pass quota/lock requests to the quota master
+ * target. It won't be needed once the QMT is a real target and no longer
+ * relies on the MDT service threads and namespace. */
+struct qmt_handlers {
+       /* Handle quotactl request from client. */
+       int (*qmth_quotactl)(const struct lu_env *, struct lu_device *,
+                            struct obd_quotactl *);
+
+       /* Handle dqacq/dqrel request from slave. */
+       int (*qmth_dqacq)(const struct lu_env *, struct lu_device *,
+                         struct ptlrpc_request *);
+
+       /* LDLM intent policy associated with quota locks */
+       int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *,
+                                 struct ptlrpc_request *, struct ldlm_lock **,
+                                 int);
+
+       /* Initialize LVB of ldlm resource associated with quota objects */
+       int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *);
+
+       /* Update LVB of ldlm resource associated with quota objects */
+       int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *,
+                               struct ptlrpc_request *, int);
+
+       /* Return size of LVB to be packed in ldlm message */
+       int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *);
+
+       /* Fill request buffer with lvb */
+       int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *,
+                             int);
+
+       /* Free lvb associated with ldlm resource */
+       int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *);
+};
+
+/* actual handlers are defined in lustre/quota/qmt_handler.c */
+extern struct qmt_handlers qmt_hdls;
+
+/*
+ * Quota enforcement support on slaves
+ */
+
+struct qsd_instance;
+
+/* The quota slave feature is implemented in the form of a library.
+ * The API is the following:
+ *
+ * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd
+ *            instance via qsd_init(). This creates all required structures
+ *            to manage quota enforcement for this target and performs all
+ *            low-level initialization which does not involve any lustre
+ *            object. qsd_init() should typically be called when the OSD
+ *            is being set up.
+ *
+ * - qsd_prepare(): This sets up on-disk objects associated with the quota slave
+ *               feature and initiates the quota reintegration procedure if
+ *               needed. qsd_prepare() should typically be called when
+ *               ->ldo_prepare is invoked.
+ *
+ * - qsd_start(): a qsd instance should be started once recovery is completed
+ *             (i.e. when ->ldo_recovery_complete is called). This is used
+ *             to notify the qsd layer that quota should now be enforced
+ *             again via the qsd_op_begin/end functions. The last step of the
+ *             reintegration procedure (namely usage reconciliation) will be
+ *             completed during start.
+ *
+ * - qsd_fini(): is used to release a qsd_instance structure allocated with
+ *            qsd_init(). This releases all quota slave objects and frees the
+ *            structures associated with the qsd_instance.
+ *
+ * - qsd_op_begin(): is used to enforce quota, it must be called in the
+ *                declaration of each operation. qsd_op_end() should then be
+ *                invoked later once all operations have been completed in
+ *                order to release/adjust the quota space.
+ *                Running qsd_op_begin() before qsd_start() isn't fatal and
+ *                will return success.
+ *                Once qsd_start() has been run, qsd_op_begin() will block
+ *                until the reintegration procedure is completed.
+ *
+ * - qsd_op_end(): performs the post-operation quota processing. This must be
+ *              called after the operation's transaction has stopped.
+ *              While qsd_op_begin() must be invoked each time a new
+ *              operation is declared, qsd_op_end() should be called only
+ *              once for the whole transaction.
+ *
+ * - qsd_op_adjust(): triggers pre-acquire/release if necessary.
+ *
+ * Below are the function prototypes to be used by OSD layer to manage quota
+ * enforcement. Arguments are documented where each function is defined.  */
+
+struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *,
+                             proc_dir_entry_t *);
+int qsd_prepare(const struct lu_env *, struct qsd_instance *);
+int qsd_start(const struct lu_env *, struct qsd_instance *);
+void qsd_fini(const struct lu_env *, struct qsd_instance *);
+int qsd_op_begin(const struct lu_env *, struct qsd_instance *,
+                struct lquota_trans *, struct lquota_id_info *, int *);
+void qsd_op_end(const struct lu_env *, struct qsd_instance *,
+               struct lquota_trans *);
+void qsd_op_adjust(const struct lu_env *, struct qsd_instance *,
+                  union lquota_id *, int);
+/* This is exported for the ldiskfs quota migration only,
+ * see convert_quota_file() */
+int lquota_disk_write_glb(const struct lu_env *, struct dt_object *,
+                         __u64, struct lquota_glb_rec *);
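+
+/*
+ * Lifecycle sketch (illustrative, not part of the original header), following
+ * the API description above; error handling omitted:
+ *
+ *     qsd = qsd_init(env, svname, dev, proc);   // OSD setup
+ *     rc  = qsd_prepare(env, qsd);              // from ->ldo_prepare
+ *     rc  = qsd_start(env, qsd);                // recovery completed
+ *     rc  = qsd_op_begin(env, qsd, trans, qi, &flags);
+ *     ...                                       // operation runs
+ *     qsd_op_end(env, qsd, trans);              // after transaction stop
+ *     qsd_fini(env, qsd);                       // teardown
+ */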
+
+/*
+ * Quota information attached to a transaction
+ */
+
+struct lquota_entry;
+
+struct lquota_id_info {
+       /* quota identifier */
+       union lquota_id          lqi_id;
+
+       /* USRQUOTA or GRPQUOTA for now, could be expanded for
+        * directory quota or other types later.  */
+       int                      lqi_type;
+
+       /* inodes or kbytes to be consumed or released, it could
+        * be negative when releasing space.  */
+       long long                lqi_space;
+
+       /* quota slave entry structure associated with this ID */
+       struct lquota_entry     *lqi_qentry;
+
+       /* whether we are reporting blocks or inodes */
+       bool                     lqi_is_blk;
+};
+
+/* Since we enforce only inode quota on the metadata pool (MDTs) and only
+ * block quota on the data pool (OSTs), at most 4 quota IDs are enforced in
+ * a single transaction; the worst case is a chown transaction involving the
+ * original uid and gid and the new uid and gid.
+ *
+ * This value might need to be revised when directory quota is added. */
+#define QUOTA_MAX_TRANSIDS    4
+
+/* all qids involved in a single transaction */
+struct lquota_trans {
+       unsigned short          lqt_id_cnt;
+       struct lquota_id_info   lqt_ids[QUOTA_MAX_TRANSIDS];
+};
+
+/* flags for quota local enforcement */
+#define QUOTA_FL_OVER_USRQUOTA 0x01
+#define QUOTA_FL_OVER_GRPQUOTA 0x02
+#define QUOTA_FL_SYNC          0x04
+
+#define IS_LQUOTA_RES(res)                                             \
+       (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA ||   \
+        res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB)
+
+/* helper function used by MDT & OFD to retrieve quota accounting information
+ * on slave */
+int lquotactl_slv(const struct lu_env *, struct dt_device *,
+                 struct obd_quotactl *);
+/** @} quota */
+#endif /* _LUSTRE_QUOTA_H */
diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h
new file mode 100644 (file)
index 0000000..f4d3820
--- /dev/null
@@ -0,0 +1,334 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lustre_req_layout.h
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#ifndef _LUSTRE_REQ_LAYOUT_H__
+#define _LUSTRE_REQ_LAYOUT_H__
+
+/** \defgroup req_layout req_layout
+ *
+ * @{
+ */
+
+struct req_msg_field;
+struct req_format;
+struct req_capsule;
+
+struct ptlrpc_request;
+
+enum req_location {
+       RCL_CLIENT,
+       RCL_SERVER,
+       RCL_NR
+};
+
+/* Maximal number of fields (buffers) in a request message. */
+#define REQ_MAX_FIELD_NR  9
+
+struct req_capsule {
+       struct ptlrpc_request   *rc_req;
+       const struct req_format *rc_fmt;
+       enum req_location        rc_loc;
+       __u32                    rc_area[RCL_NR][REQ_MAX_FIELD_NR];
+};
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_net.h>
+
+void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req,
+                     enum req_location location);
+void req_capsule_fini(struct req_capsule *pill);
+
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt);
+void req_capsule_client_dump(struct req_capsule *pill);
+void req_capsule_server_dump(struct req_capsule *pill);
+void req_capsule_init_area(struct req_capsule *pill);
+int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc);
+int  req_capsule_server_pack(struct req_capsule *pill);
+
+void *req_capsule_client_get(struct req_capsule *pill,
+                            const struct req_msg_field *field);
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber);
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len);
+void *req_capsule_server_get(struct req_capsule *pill,
+                            const struct req_msg_field *field);
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len);
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber);
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+                                       const struct req_msg_field *field,
+                                       int len, void *swabber);
+const void *req_capsule_other_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field);
+
+void req_capsule_set_size(struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc, int size);
+int req_capsule_get_size(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc);
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc);
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+                        enum req_location loc);
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt);
+
+int req_capsule_has_field(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc);
+int req_capsule_field_present(const struct req_capsule *pill,
+                             const struct req_msg_field *field,
+                             enum req_location loc);
+void req_capsule_shrink(struct req_capsule *pill,
+                       const struct req_msg_field *field,
+                       unsigned int newlen,
+                       enum req_location loc);
+int req_capsule_server_grow(struct req_capsule *pill,
+                           const struct req_msg_field *field,
+                           unsigned int newlen);
+int  req_layout_init(void);
+void req_layout_fini(void);
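+
+/*
+ * Illustrative sketch (not part of the original header): typical server-side
+ * capsule usage, assuming the request embeds its capsule as req->rq_pill;
+ * the RQF/RMF names are examples picked from the lists below:
+ *
+ *     req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);
+ *     req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, md_size);
+ *     rc = req_capsule_server_pack(&req->rq_pill);
+ *     body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ */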
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+
+extern struct req_format RQF_OBD_PING;
+extern struct req_format RQF_OBD_SET_INFO;
+extern struct req_format RQF_SEC_CTX;
+extern struct req_format RQF_OBD_IDX_READ;
+/* MGS req_format */
+extern struct req_format RQF_MGS_TARGET_REG;
+extern struct req_format RQF_MGS_SET_INFO;
+extern struct req_format RQF_MGS_CONFIG_READ;
+/* fid/fld req_format */
+extern struct req_format RQF_SEQ_QUERY;
+extern struct req_format RQF_FLD_QUERY;
+/* MDS req_format */
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_STATFS;
+extern struct req_format RQF_MDS_GETSTATUS;
+extern struct req_format RQF_MDS_SYNC;
+extern struct req_format RQF_MDS_GETXATTR;
+extern struct req_format RQF_MDS_GETATTR;
+extern struct req_format RQF_UPDATE_OBJ;
+
+/*
+ * This is format of direct (non-intent) MDS_GETATTR_NAME request.
+ */
+extern struct req_format RQF_MDS_GETATTR_NAME;
+extern struct req_format RQF_MDS_CLOSE;
+extern struct req_format RQF_MDS_PIN;
+extern struct req_format RQF_MDS_UNPIN;
+extern struct req_format RQF_MDS_CONNECT;
+extern struct req_format RQF_MDS_DISCONNECT;
+extern struct req_format RQF_MDS_GET_INFO;
+extern struct req_format RQF_MDS_READPAGE;
+extern struct req_format RQF_MDS_WRITEPAGE;
+extern struct req_format RQF_MDS_IS_SUBDIR;
+extern struct req_format RQF_MDS_DONE_WRITING;
+extern struct req_format RQF_MDS_REINT;
+extern struct req_format RQF_MDS_REINT_CREATE;
+extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL;
+extern struct req_format RQF_MDS_REINT_CREATE_SLAVE;
+extern struct req_format RQF_MDS_REINT_CREATE_SYM;
+extern struct req_format RQF_MDS_REINT_OPEN;
+extern struct req_format RQF_MDS_REINT_UNLINK;
+extern struct req_format RQF_MDS_REINT_LINK;
+extern struct req_format RQF_MDS_REINT_RENAME;
+extern struct req_format RQF_MDS_REINT_SETATTR;
+extern struct req_format RQF_MDS_REINT_SETXATTR;
+extern struct req_format RQF_MDS_QUOTACHECK;
+extern struct req_format RQF_MDS_QUOTACTL;
+extern struct req_format RQF_QC_CALLBACK;
+extern struct req_format RQF_QUOTA_DQACQ;
+extern struct req_format RQF_MDS_SWAP_LAYOUTS;
+/* MDS hsm formats */
+extern struct req_format RQF_MDS_HSM_STATE_GET;
+extern struct req_format RQF_MDS_HSM_STATE_SET;
+extern struct req_format RQF_MDS_HSM_ACTION;
+extern struct req_format RQF_MDS_HSM_PROGRESS;
+extern struct req_format RQF_MDS_HSM_CT_REGISTER;
+extern struct req_format RQF_MDS_HSM_CT_UNREGISTER;
+extern struct req_format RQF_MDS_HSM_REQUEST;
+/* OST req_format */
+extern struct req_format RQF_OST_CONNECT;
+extern struct req_format RQF_OST_DISCONNECT;
+extern struct req_format RQF_OST_QUOTACHECK;
+extern struct req_format RQF_OST_QUOTACTL;
+extern struct req_format RQF_OST_GETATTR;
+extern struct req_format RQF_OST_SETATTR;
+extern struct req_format RQF_OST_CREATE;
+extern struct req_format RQF_OST_PUNCH;
+extern struct req_format RQF_OST_SYNC;
+extern struct req_format RQF_OST_DESTROY;
+extern struct req_format RQF_OST_BRW_READ;
+extern struct req_format RQF_OST_BRW_WRITE;
+extern struct req_format RQF_OST_STATFS;
+extern struct req_format RQF_OST_SET_GRANT_INFO;
+extern struct req_format RQF_OST_GET_INFO_GENERIC;
+extern struct req_format RQF_OST_GET_INFO_LAST_ID;
+extern struct req_format RQF_OST_GET_INFO_LAST_FID;
+extern struct req_format RQF_OST_SET_INFO_LAST_FID;
+extern struct req_format RQF_OST_GET_INFO_FIEMAP;
+
+/* LDLM req_format */
+extern struct req_format RQF_LDLM_ENQUEUE;
+extern struct req_format RQF_LDLM_ENQUEUE_LVB;
+extern struct req_format RQF_LDLM_CONVERT;
+extern struct req_format RQF_LDLM_INTENT;
+extern struct req_format RQF_LDLM_INTENT_BASIC;
+extern struct req_format RQF_LDLM_INTENT_LAYOUT;
+extern struct req_format RQF_LDLM_INTENT_GETATTR;
+extern struct req_format RQF_LDLM_INTENT_OPEN;
+extern struct req_format RQF_LDLM_INTENT_CREATE;
+extern struct req_format RQF_LDLM_INTENT_UNLINK;
+extern struct req_format RQF_LDLM_INTENT_QUOTA;
+extern struct req_format RQF_LDLM_CANCEL;
+extern struct req_format RQF_LDLM_CALLBACK;
+extern struct req_format RQF_LDLM_CP_CALLBACK;
+extern struct req_format RQF_LDLM_BL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_CALLBACK;
+extern struct req_format RQF_LDLM_GL_DESC_CALLBACK;
+/* LOG req_format */
+extern struct req_format RQF_LOG_CANCEL;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK;
+extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER;
+extern struct req_format RQF_LLOG_ORIGIN_CONNECT;
+
+extern struct req_msg_field RMF_GENERIC_DATA;
+extern struct req_msg_field RMF_PTLRPC_BODY;
+extern struct req_msg_field RMF_MDT_BODY;
+extern struct req_msg_field RMF_MDT_EPOCH;
+extern struct req_msg_field RMF_OBD_STATFS;
+extern struct req_msg_field RMF_NAME;
+extern struct req_msg_field RMF_SYMTGT;
+extern struct req_msg_field RMF_TGTUUID;
+extern struct req_msg_field RMF_CLUUID;
+extern struct req_msg_field RMF_SETINFO_VAL;
+extern struct req_msg_field RMF_SETINFO_KEY;
+extern struct req_msg_field RMF_GETINFO_VAL;
+extern struct req_msg_field RMF_GETINFO_VALLEN;
+extern struct req_msg_field RMF_GETINFO_KEY;
+extern struct req_msg_field RMF_IDX_INFO;
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ */
+extern struct req_msg_field RMF_CONN;
+extern struct req_msg_field RMF_CONNECT_DATA;
+extern struct req_msg_field RMF_DLM_REQ;
+extern struct req_msg_field RMF_DLM_REP;
+extern struct req_msg_field RMF_DLM_LVB;
+extern struct req_msg_field RMF_DLM_GL_DESC;
+extern struct req_msg_field RMF_LDLM_INTENT;
+extern struct req_msg_field RMF_LAYOUT_INTENT;
+extern struct req_msg_field RMF_MDT_MD;
+extern struct req_msg_field RMF_REC_REINT;
+extern struct req_msg_field RMF_EADATA;
+extern struct req_msg_field RMF_ACL;
+extern struct req_msg_field RMF_LOGCOOKIES;
+extern struct req_msg_field RMF_CAPA1;
+extern struct req_msg_field RMF_CAPA2;
+extern struct req_msg_field RMF_OBD_QUOTACHECK;
+extern struct req_msg_field RMF_OBD_QUOTACTL;
+extern struct req_msg_field RMF_QUOTA_BODY;
+extern struct req_msg_field RMF_STRING;
+extern struct req_msg_field RMF_SWAP_LAYOUTS;
+extern struct req_msg_field RMF_MDS_HSM_PROGRESS;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+extern struct req_msg_field RMF_MDS_HSM_USER_ITEM;
+extern struct req_msg_field RMF_MDS_HSM_ARCHIVE;
+extern struct req_msg_field RMF_HSM_USER_STATE;
+extern struct req_msg_field RMF_HSM_STATE_SET;
+extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION;
+extern struct req_msg_field RMF_MDS_HSM_REQUEST;
+
+/* seq-mgr fields */
+extern struct req_msg_field RMF_SEQ_OPC;
+extern struct req_msg_field RMF_SEQ_RANGE;
+extern struct req_msg_field RMF_FID_SPACE;
+
+/* FLD fields */
+extern struct req_msg_field RMF_FLD_OPC;
+extern struct req_msg_field RMF_FLD_MDFLD;
+
+extern struct req_msg_field RMF_LLOGD_BODY;
+extern struct req_msg_field RMF_LLOG_LOG_HDR;
+extern struct req_msg_field RMF_LLOGD_CONN_BODY;
+
+extern struct req_msg_field RMF_MGS_TARGET_INFO;
+extern struct req_msg_field RMF_MGS_SEND_PARAM;
+
+extern struct req_msg_field RMF_OST_BODY;
+extern struct req_msg_field RMF_OBD_IOOBJ;
+extern struct req_msg_field RMF_OBD_ID;
+extern struct req_msg_field RMF_FID;
+extern struct req_msg_field RMF_NIOBUF_REMOTE;
+extern struct req_msg_field RMF_RCS;
+extern struct req_msg_field RMF_FIEMAP_KEY;
+extern struct req_msg_field RMF_FIEMAP_VAL;
+extern struct req_msg_field RMF_OST_ID;
+
+/* MGS config read message format */
+extern struct req_msg_field RMF_MGS_CONFIG_BODY;
+extern struct req_msg_field RMF_MGS_CONFIG_RES;
+
+/* generic uint32 */
+extern struct req_msg_field RMF_U32;
+
+/* OBJ update format */
+extern struct req_msg_field RMF_UPDATE;
+extern struct req_msg_field RMF_UPDATE_REPLY;
+/** @} req_layout */
+
+#endif /* _LUSTRE_REQ_LAYOUT_H__ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h
new file mode 100644 (file)
index 0000000..9e0908e
--- /dev/null
@@ -0,0 +1,1145 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LUSTRE_SEC_H_
+#define _LUSTRE_SEC_H_
+
+/** \defgroup sptlrpc sptlrpc
+ *
+ * @{
+ */
+
+/*
+ * to avoid include
+ */
+struct obd_import;
+struct obd_export;
+struct ptlrpc_request;
+struct ptlrpc_reply_state;
+struct ptlrpc_bulk_desc;
+struct brw_page;
+/* Linux specific */
+struct key;
+struct seq_file;
+
+/*
+ * forward declaration
+ */
+struct ptlrpc_sec_policy;
+struct ptlrpc_sec_cops;
+struct ptlrpc_sec_sops;
+struct ptlrpc_sec;
+struct ptlrpc_svc_ctx;
+struct ptlrpc_cli_ctx;
+struct ptlrpc_ctx_ops;
+
+/**
+ * \addtogroup flavor flavor
+ *
+ * An RPC flavor is represented by a 32-bit integer. Currently the high 12
+ * bits are unused and must be set to 0 to allow future expansion.
+ * <pre>
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * </pre>
+ *
+ * @{
+ */
+
+/*
+ * flavor constants
+ */
+enum sptlrpc_policy {
+       SPTLRPC_POLICY_NULL     = 0,
+       SPTLRPC_POLICY_PLAIN    = 1,
+       SPTLRPC_POLICY_GSS      = 2,
+       SPTLRPC_POLICY_MAX,
+};
+
+enum sptlrpc_mech_null {
+       SPTLRPC_MECH_NULL       = 0,
+       SPTLRPC_MECH_NULL_MAX,
+};
+
+enum sptlrpc_mech_plain {
+       SPTLRPC_MECH_PLAIN      = 0,
+       SPTLRPC_MECH_PLAIN_MAX,
+};
+
+enum sptlrpc_mech_gss {
+       SPTLRPC_MECH_GSS_NULL   = 0,
+       SPTLRPC_MECH_GSS_KRB5   = 1,
+       SPTLRPC_MECH_GSS_MAX,
+};
+
+enum sptlrpc_service_type {
+       SPTLRPC_SVC_NULL        = 0,    /**< no security */
+       SPTLRPC_SVC_AUTH        = 1,    /**< authentication only */
+       SPTLRPC_SVC_INTG        = 2,    /**< integrity */
+       SPTLRPC_SVC_PRIV        = 3,    /**< privacy */
+       SPTLRPC_SVC_MAX,
+};
+
+enum sptlrpc_bulk_type {
+       SPTLRPC_BULK_DEFAULT    = 0,    /**< follow rpc flavor */
+       SPTLRPC_BULK_HASH       = 1,    /**< hash integrity */
+       SPTLRPC_BULK_MAX,
+};
+
+enum sptlrpc_bulk_service {
+       SPTLRPC_BULK_SVC_NULL   = 0,    /**< no security */
+       SPTLRPC_BULK_SVC_AUTH   = 1,    /**< authentication only */
+       SPTLRPC_BULK_SVC_INTG   = 2,    /**< integrity */
+       SPTLRPC_BULK_SVC_PRIV   = 3,    /**< privacy */
+       SPTLRPC_BULK_SVC_MAX,
+};
+
+/*
+ * compose/extract macros
+ */
+#define FLVR_POLICY_OFFSET     (0)
+#define FLVR_MECH_OFFSET       (4)
+#define FLVR_SVC_OFFSET        (8)
+#define FLVR_BULK_TYPE_OFFSET  (12)
+#define FLVR_BULK_SVC_OFFSET   (16)
+
+#define MAKE_FLVR(policy, mech, svc, btype, bsvc)              \
+       (((__u32)(policy) << FLVR_POLICY_OFFSET) |              \
+        ((__u32)(mech)   << FLVR_MECH_OFFSET)   |              \
+        ((__u32)(svc)    << FLVR_SVC_OFFSET)    |              \
+        ((__u32)(btype)  << FLVR_BULK_TYPE_OFFSET) |           \
+        ((__u32)(bsvc)   << FLVR_BULK_SVC_OFFSET))
+
+/*
+ * extraction
+ */
+#define SPTLRPC_FLVR_POLICY(flavor)                            \
+       ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_MECH(flavor)                              \
+       ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_SVC(flavor)                               \
+       ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_TYPE(flavor)                         \
+       ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF)
+#define SPTLRPC_FLVR_BULK_SVC(flavor)                          \
+       ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF)
+
+#define SPTLRPC_FLVR_BASE(flavor)                              \
+       ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF)
+#define SPTLRPC_FLVR_BASE_SUB(flavor)                          \
+       ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF)
+
+/*
+ * gss subflavors
+ */
+#define MAKE_BASE_SUBFLVR(mech, svc)                           \
+       ((__u32)(mech) |                                        \
+        ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET)))
+
+#define SPTLRPC_SUBFLVR_KRB5N                                  \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL)
+#define SPTLRPC_SUBFLVR_KRB5A                                  \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH)
+#define SPTLRPC_SUBFLVR_KRB5I                                  \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG)
+#define SPTLRPC_SUBFLVR_KRB5P                                  \
+       MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV)
+
+/*
+ * "end user" flavors
+ */
+#define SPTLRPC_FLVR_NULL                              \
+       MAKE_FLVR(SPTLRPC_POLICY_NULL,                  \
+                 SPTLRPC_MECH_NULL,                    \
+                 SPTLRPC_SVC_NULL,                     \
+                 SPTLRPC_BULK_DEFAULT,                 \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_PLAIN                             \
+       MAKE_FLVR(SPTLRPC_POLICY_PLAIN,                 \
+                 SPTLRPC_MECH_PLAIN,                   \
+                 SPTLRPC_SVC_NULL,                     \
+                 SPTLRPC_BULK_HASH,                    \
+                 SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5N                             \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_NULL,                     \
+                 SPTLRPC_BULK_DEFAULT,                 \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5A                             \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_AUTH,                     \
+                 SPTLRPC_BULK_DEFAULT,                 \
+                 SPTLRPC_BULK_SVC_NULL)
+#define SPTLRPC_FLVR_KRB5I                             \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_INTG,                     \
+                 SPTLRPC_BULK_DEFAULT,                 \
+                 SPTLRPC_BULK_SVC_INTG)
+#define SPTLRPC_FLVR_KRB5P                             \
+       MAKE_FLVR(SPTLRPC_POLICY_GSS,                   \
+                 SPTLRPC_MECH_GSS_KRB5,                \
+                 SPTLRPC_SVC_PRIV,                     \
+                 SPTLRPC_BULK_DEFAULT,                 \
+                 SPTLRPC_BULK_SVC_PRIV)
+
+#define SPTLRPC_FLVR_DEFAULT   SPTLRPC_FLVR_NULL
+
+#define SPTLRPC_FLVR_INVALID   ((__u32) 0xFFFFFFFF)
+#define SPTLRPC_FLVR_ANY       ((__u32) 0xFFF00000)
+
+/**
+ * extract the useful part from wire flavor
+ */
+#define WIRE_FLVR(wflvr)               (((__u32) (wflvr)) & 0x000FFFFF)
+
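+/*
+ * Worked example (illustrative, not part of the original header): the krb5i
+ * flavor decomposes through the extraction macros above as
+ *
+ *     SPTLRPC_FLVR_POLICY(SPTLRPC_FLVR_KRB5I)   == SPTLRPC_POLICY_GSS
+ *     SPTLRPC_FLVR_MECH(SPTLRPC_FLVR_KRB5I)     == SPTLRPC_MECH_GSS_KRB5
+ *     SPTLRPC_FLVR_SVC(SPTLRPC_FLVR_KRB5I)      == SPTLRPC_SVC_INTG
+ *     SPTLRPC_FLVR_BULK_SVC(SPTLRPC_FLVR_KRB5I) == SPTLRPC_BULK_SVC_INTG
+ */
+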
+/** @} flavor */
+
+static inline void flvr_set_svc(__u32 *flvr, __u32 svc)
+{
+       LASSERT(svc < SPTLRPC_SVC_MAX);
+       *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+                         SPTLRPC_FLVR_MECH(*flvr),
+                         svc,
+                         SPTLRPC_FLVR_BULK_TYPE(*flvr),
+                         SPTLRPC_FLVR_BULK_SVC(*flvr));
+}
+
+static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc)
+{
+       LASSERT(svc < SPTLRPC_BULK_SVC_MAX);
+       *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr),
+                         SPTLRPC_FLVR_MECH(*flvr),
+                         SPTLRPC_FLVR_SVC(*flvr),
+                         SPTLRPC_FLVR_BULK_TYPE(*flvr),
+                         svc);
+}
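+
+/*
+ * Illustrative sketch (not part of the original source): the setters above
+ * rewrite one nibble of a flavor while preserving the others, e.g. upgrading
+ * a flavor to integrity service:
+ *
+ *     __u32 flvr = SPTLRPC_FLVR_KRB5N;
+ *
+ *     flvr_set_svc(&flvr, SPTLRPC_SVC_INTG);
+ */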
+
+struct bulk_spec_hash {
+       __u8    hash_alg;
+};
+
+/**
+ * Full description of the flavors being used on a ptlrpc connection,
+ * including both the regular RPC and bulk transfer parts.
+ */
+struct sptlrpc_flavor {
+       /**
+        * wire flavor, should be renamed to sf_wire.
+        */
+       __u32   sf_rpc;
+       /**
+        * general flags of PTLRPC_SEC_FL_*
+        */
+       __u32   sf_flags;
+       /**
+        * rpc flavor specification
+        */
+       union {
+               /* nothing for now */
+       } u_rpc;
+       /**
+        * bulk flavor specification
+        */
+       union {
+               struct bulk_spec_hash hash;
+       } u_bulk;
+};
+
+/**
+ * Identify what part of Lustre an RPC was generated from. It is encoded into
+ * RPC requests and checked by the ptlrpc service.
+ */
+enum lustre_sec_part {
+       LUSTRE_SP_CLI      = 0,
+       LUSTRE_SP_MDT,
+       LUSTRE_SP_OST,
+       LUSTRE_SP_MGC,
+       LUSTRE_SP_MGS,
+       LUSTRE_SP_ANY      = 0xFF
+};
+
+const char *sptlrpc_part2name(enum lustre_sec_part sp);
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd);
+
+/**
+ * A rule specifies a flavor to be used by a ptlrpc connection between
+ * two Lustre parts.
+ */
+struct sptlrpc_rule {
+       __u32                   sr_netid;   /* LNET network ID */
+       __u8                    sr_from;    /* sec_part */
+       __u8                    sr_to;      /* sec_part */
+       __u16                   sr_padding;
+       struct sptlrpc_flavor   sr_flvr;
+};
+
+/**
+ * A set of rules in memory.
+ *
+ * Rules are generated and stored on MGS, and propagated to MDT, OST,
+ * and client when needed.
+ */
+struct sptlrpc_rule_set {
+       int                     srs_nslot;
+       int                     srs_nrule;
+       struct sptlrpc_rule     *srs_rules;
+};
+
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr);
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr);
+
+static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set)
+{
+       memset(set, 0, sizeof(*set));
+}
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set);
+int  sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set,
+                           struct sptlrpc_rule *rule);
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+                           enum lustre_sec_part from,
+                           enum lustre_sec_part to,
+                           lnet_nid_t nid,
+                           struct sptlrpc_flavor *sf);
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set);
+
+int  sptlrpc_process_config(struct lustre_cfg *lcfg);
+void sptlrpc_conf_log_start(const char *logname);
+void sptlrpc_conf_log_stop(const char *logname);
+void sptlrpc_conf_log_update_begin(const char *logname);
+void sptlrpc_conf_log_update_end(const char *logname);
+void sptlrpc_conf_client_adapt(struct obd_device *obd);
+int  sptlrpc_conf_target_get_rules(struct obd_device *obd,
+                                  struct sptlrpc_rule_set *rset,
+                                  int initial);
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+                                 enum lustre_sec_part from,
+                                 lnet_nid_t nid,
+                                 struct sptlrpc_flavor *flavor);
+
+/* The maximum length of the security payload. 1024 is enough for Kerberos 5,
+ * and should be enough for other future mechanisms, though this is not
+ * certain. Only used by the pre-allocated request/reply pool.
+ */
+#define SPTLRPC_MAX_PAYLOAD     (1024)
+
+
+struct vfs_cred {
+       uint32_t        vc_uid;
+       uint32_t        vc_gid;
+};
+
+struct ptlrpc_ctx_ops {
+       /**
+        * To determine whether it's suitable to use the \a ctx for \a vcred.
+        */
+       int     (*match)       (struct ptlrpc_cli_ctx *ctx,
+                               struct vfs_cred *vcred);
+
+       /**
+        * To bring the \a ctx up to date.
+        */
+       int     (*refresh)     (struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * Validate the \a ctx.
+        */
+       int     (*validate)    (struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * Force the \a ctx to die.
+        */
+       void    (*die)   (struct ptlrpc_cli_ctx *ctx,
+                               int grace);
+       int     (*display)     (struct ptlrpc_cli_ctx *ctx,
+                               char *buf, int bufsize);
+
+       /**
+        * Sign the request message using \a ctx.
+        *
+        * \pre req->rq_reqmsg points to the request message.
+        * \pre req->rq_reqlen is the request message length.
+        * \post req->rq_reqbuf points to the request message with signature.
+        * \post req->rq_reqdata_len is set to the final request message size.
+        *
+        * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign().
+        */
+       int     (*sign) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Verify the reply message using \a ctx.
+        *
+        * \pre req->rq_repdata points to the reply message with signature.
+        * \pre req->rq_repdata_len is the total reply message length.
+        * \post req->rq_repmsg points to the reply message without signature.
+        * \post req->rq_replen is the reply message length.
+        *
+        * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify().
+        */
+       int     (*verify)      (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Encrypt the request message using \a ctx.
+        *
+        * \pre req->rq_reqmsg points to the request message in clear text.
+        * \pre req->rq_reqlen is the request message length.
+        * \post req->rq_reqbuf points to the request message.
+        * \post req->rq_reqdata_len is set to the final request message size.
+        *
+        * \see gss_cli_ctx_seal().
+        */
+       int     (*seal) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Decrypt the reply message using \a ctx.
+        *
+        * \pre req->rq_repdata points to the encrypted reply message.
+        * \pre req->rq_repdata_len is the total cipher text length.
+        * \post req->rq_repmsg points to the reply message in clear text.
+        * \post req->rq_replen is the reply message length in clear text.
+        *
+        * \see gss_cli_ctx_unseal().
+        */
+       int     (*unseal)      (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req);
+
+       /**
+        * Wrap bulk request data. This is called before wrapping RPC
+        * request message.
+        *
+        * \pre the bulk buffer is described by desc->bd_iov and
+        * desc->bd_iov_count. Note that for a read it is just a buffer and
+        * no data needs to be sent; for a write it contains data in clear
+        * text.
+        * \post when necessary, a ptlrpc_bulk_sec_desc was properly prepared
+        * (usually inside of the RPC request message).
+        * - encryption: the cipher text bulk buffer is described by
+        *   desc->bd_enc_iov and desc->bd_iov_count (currently the iov
+        *   count is assumed to remain the same).
+        * - otherwise: the bulk buffer is still desc->bd_iov and
+        *   desc->bd_iov_count.
+        *
+        * \return 0: success.
+        * \return -ev: error code.
+        *
+        * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk().
+        */
+       int     (*wrap_bulk)   (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req,
+                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Unwrap bulk reply data. This is called after unwrapping the RPC
+        * reply message.
+        *
+        * \pre the bulk buffer is described by desc->bd_iov/desc->bd_enc_iov
+        * and desc->bd_iov_count, according to wrap_bulk().
+        * \post the final bulk data in clear text is placed in the buffer
+        * described by desc->bd_iov and desc->bd_iov_count.
+        * \return +ve nob of actual bulk data in clear text.
+        * \return -ve error code.
+        *
+        * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+        */
+       int     (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx,
+                               struct ptlrpc_request *req,
+                               struct ptlrpc_bulk_desc *desc);
+};
+
+#define PTLRPC_CTX_NEW_BIT             (0)  /* newly created */
+#define PTLRPC_CTX_UPTODATE_BIT        (1)  /* uptodate */
+#define PTLRPC_CTX_DEAD_BIT            (2)  /* mark expired gracefully */
+#define PTLRPC_CTX_ERROR_BIT           (3)  /* fatal error (refresh, etc.) */
+#define PTLRPC_CTX_CACHED_BIT          (8)  /* in ctx cache (hash etc.) */
+#define PTLRPC_CTX_ETERNAL_BIT         (9)  /* always valid */
+
+#define PTLRPC_CTX_NEW                 (1 << PTLRPC_CTX_NEW_BIT)
+#define PTLRPC_CTX_UPTODATE            (1 << PTLRPC_CTX_UPTODATE_BIT)
+#define PTLRPC_CTX_DEAD                (1 << PTLRPC_CTX_DEAD_BIT)
+#define PTLRPC_CTX_ERROR               (1 << PTLRPC_CTX_ERROR_BIT)
+#define PTLRPC_CTX_CACHED              (1 << PTLRPC_CTX_CACHED_BIT)
+#define PTLRPC_CTX_ETERNAL             (1 << PTLRPC_CTX_ETERNAL_BIT)
+
+#define PTLRPC_CTX_STATUS_MASK  (PTLRPC_CTX_NEW_BIT    |       \
+                                       PTLRPC_CTX_UPTODATE   |       \
+                                       PTLRPC_CTX_DEAD       |       \
+                                       PTLRPC_CTX_ERROR)
+
+struct ptlrpc_cli_ctx {
+       struct hlist_node       cc_cache;        /* linked into ctx cache */
+       atomic_t                cc_refcount;
+       struct ptlrpc_sec      *cc_sec;
+       struct ptlrpc_ctx_ops  *cc_ops;
+       cfs_time_t              cc_expire;       /* in seconds */
+       unsigned int            cc_early_expire:1;
+       unsigned long           cc_flags;
+       struct vfs_cred         cc_vcred;
+       spinlock_t              cc_lock;
+       struct list_head        cc_req_list;     /* waiting reqs linked here */
+       struct list_head        cc_gc_chain;     /* linked to gc chain */
+};
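
/*
 * Usage sketch (editorial, not part of the patch): the *_BIT constants
 * above are bit numbers for the kernel's atomic bitops on cc_flags,
 * while the shifted PTLRPC_CTX_* masks are for plain tests.
 */
static inline void demo_mark_ctx_uptodate(struct ptlrpc_cli_ctx *ctx)
{
        set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);  /* atomic set */
}

static inline int demo_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
{
        return (ctx->cc_flags & PTLRPC_CTX_DEAD) != 0;     /* mask test */
}
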
+
+/**
+ * client side policy operation vector.
+ */
+struct ptlrpc_sec_cops {
+       /**
+        * Given an \a imp, create and initialize a ptlrpc_sec structure.
+        * \param ctx service context:
+        * - regular import: \a ctx should be NULL;
+        * - reverse import: \a ctx is obtained from incoming request.
+        * \param flavor specify what flavor to use.
+        *
+        * When necessary, the policy module is responsible for taking a
+        * reference on the import.
+        *
+        * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+        */
+       struct ptlrpc_sec *     (*create_sec)  (struct obd_import *imp,
+                                               struct ptlrpc_svc_ctx *ctx,
+                                               struct sptlrpc_flavor *flavor);
+
+       /**
+        * Destructor of ptlrpc_sec. When called, refcount has been dropped
+        * to 0 and all contexts have been destroyed.
+        *
+        * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr().
+        */
+       void                (*destroy_sec) (struct ptlrpc_sec *sec);
+
+       /**
+        * Notify that this ptlrpc_sec is going to die. The policy module
+        * is supposed to set sec->ps_dying and take whatever other actions
+        * are necessary.
+        *
+        * \see plain_kill_sec(), gss_sec_kill().
+        */
+       void                (*kill_sec)    (struct ptlrpc_sec *sec);
+
+       /**
+        * Given \a vcred, lookup and/or create its context. The policy module
+        * is supposed to maintain its own context cache.
+        * XXX currently \a create and \a remove_dead are always 1, perhaps
+        * they should be removed completely.
+        *
+        * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr().
+        */
+       struct ptlrpc_cli_ctx * (*lookup_ctx)  (struct ptlrpc_sec *sec,
+                                               struct vfs_cred *vcred,
+                                               int create,
+                                               int remove_dead);
+
+       /**
+        * Called when the reference count of \a ctx drops to 0. The policy
+        * module is supposed to destroy this context or whatever else
+        * according to its cache maintenance mechanism.
+        *
+        * \param sync if zero, we need not wait for the context to be
+        * destroyed completely.
+        *
+        * \see plain_release_ctx(), gss_sec_release_ctx_kr().
+        */
+       void                (*release_ctx) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_cli_ctx *ctx,
+                                               int sync);
+
+       /**
+        * Flush the context cache.
+        *
+        * \param uid flush contexts of this user only; -1 means all contexts.
+        * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected
+        * contexts should be cleared immediately.
+        * \param force if zero, only idle contexts will be flushed.
+        *
+        * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+        */
+       int                  (*flush_ctx_cache)
+                                              (struct ptlrpc_sec *sec,
+                                               uid_t uid,
+                                               int grace,
+                                               int force);
+
+       /**
+        * Called periodically by garbage collector to remove dead contexts
+        * from cache.
+        *
+        * \see gss_sec_gc_ctx_kr().
+        */
+       void                (*gc_ctx)      (struct ptlrpc_sec *sec);
+
+       /**
+        * Given a context \a ctx, install a corresponding reverse service
+        * context on the client side.
+        * XXX currently it's only used by the GSS module, maybe we should
+        * remove this from the general API.
+        */
+       int                  (*install_rctx)(struct obd_import *imp,
+                                               struct ptlrpc_sec *sec,
+                                               struct ptlrpc_cli_ctx *ctx);
+
+       /**
+        * Allocate a request buffer for \a req.
+        *
+        * \pre req->rq_reqmsg == NULL.
+        * \pre req->rq_reqbuf == NULL, otherwise it was pre-allocated and
+        * we are not supposed to free it.
+        * \post on success, req->rq_reqmsg points to a buffer with size
+        * at least \a lustre_msg_size.
+        *
+        * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf().
+        */
+       int                  (*alloc_reqbuf)(struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int lustre_msg_size);
+
+       /**
+        * Free the request buffer of \a req.
+        *
+        * \pre req->rq_reqbuf != NULL.
+        *
+        * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf().
+        */
+       void                (*free_reqbuf) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req);
+
+       /**
+        * Allocate a reply buffer for \a req.
+        *
+        * \pre req->rq_repbuf == NULL.
+        * \post on success, req->rq_repbuf points to a buffer of size
+        * req->rq_repbuf_len, which is large enough to receive a reply
+        * transformed from \a lustre_msg_size bytes of clear text.
+        *
+        * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf().
+        */
+       int                  (*alloc_repbuf)(struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int lustre_msg_size);
+
+       /**
+        * Free the reply buffer of \a req.
+        *
+        * \pre req->rq_repbuf != NULL.
+        * \post req->rq_repbuf == NULL.
+        * \post req->rq_repbuf_len == 0.
+        *
+        * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+        */
+       void                (*free_repbuf) (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req);
+
+       /**
+        * Expand the request buffer of \a req, so that the \a segment in
+        * the request message pointed to by req->rq_reqmsg can accommodate
+        * at least \a newsize bytes of data.
+        *
+        * \pre req->rq_reqmsg->lm_buflens[segment] < newsize.
+        *
+        * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(),
+        * gss_enlarge_reqbuf().
+        */
+       int                  (*enlarge_reqbuf)
+                                              (struct ptlrpc_sec *sec,
+                                               struct ptlrpc_request *req,
+                                               int segment, int newsize);
+       /*
+        * misc
+        */
+       int                  (*display)     (struct ptlrpc_sec *sec,
+                                               struct seq_file *seq);
+};
+
+/**
+ * server side policy operation vector.
+ */
+struct ptlrpc_sec_sops {
+       /**
+        * Verify an incoming request.
+        *
+        * \pre request message is pointed to by req->rq_reqbuf, its size is
+        * req->rq_reqdata_len; and the message has been unpacked to
+        * host byte order.
+        *
+        * \retval SECSVC_OK success, req->rq_reqmsg points to the request
+        * message in clear text, size is req->rq_reqlen; req->rq_svc_ctx is
+        * set; req->rq_sp_from is decoded from the request.
+        * \retval SECSVC_COMPLETE success, the request has been fully
+        * processed, and the reply message has been prepared; req->rq_sp_from
+        * is decoded from the request.
+        * \retval SECSVC_DROP failed, this request should be dropped.
+        *
+        * \see null_accept(), plain_accept(), gss_svc_accept_kr().
+        */
+       int                  (*accept)      (struct ptlrpc_request *req);
+
+       /**
+        * Perform security transformation upon reply message.
+        *
+        * \pre reply message is pointed to by req->rq_reply_state->rs_msg,
+        * its size is req->rq_replen.
+        * \post req->rq_reply_state->rs_repdata_len is the final message
+        * size.
+        * \post req->rq_reply_off is set.
+        *
+        * \see null_authorize(), plain_authorize(), gss_svc_authorize().
+        */
+       int                  (*authorize)   (struct ptlrpc_request *req);
+
+       /**
+        * Invalidate server context \a ctx.
+        *
+        * \see gss_svc_invalidate_ctx().
+        */
+       void                (*invalidate_ctx)
+                                              (struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Allocate a ptlrpc_reply_state.
+        *
+        * \param msgsize size of the reply message in clear text.
+        * \pre if req->rq_reply_state != NULL, then it's pre-allocated and we
+        * should simply use it; otherwise we are responsible for allocating
+        * a new one.
+        * \post req->rq_reply_state != NULL;
+        * \post req->rq_reply_state->rs_msg != NULL;
+        *
+        * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs().
+        */
+       int                  (*alloc_rs)    (struct ptlrpc_request *req,
+                                               int msgsize);
+
+       /**
+        * Free a ptlrpc_reply_state.
+        */
+       void                (*free_rs)     (struct ptlrpc_reply_state *rs);
+
+       /**
+        * Release the server context \a ctx.
+        *
+        * \see gss_svc_free_ctx().
+        */
+       void                (*free_ctx)    (struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Install a reverse context based on the server context \a ctx.
+        *
+        * \see gss_svc_install_rctx_kr().
+        */
+       int                  (*install_rctx)(struct obd_import *imp,
+                                               struct ptlrpc_svc_ctx *ctx);
+
+       /**
+        * Prepare buffer for incoming bulk write.
+        *
+        * \pre desc->bd_iov and desc->bd_iov_count describe the buffer
+        * intended to receive the write.
+        *
+        * \see gss_svc_prep_bulk().
+        */
+       int                  (*prep_bulk)   (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Unwrap the bulk write data.
+        *
+        * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk().
+        */
+       int                  (*unwrap_bulk) (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+
+       /**
+        * Wrap the bulk read data.
+        *
+        * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk().
+        */
+       int                  (*wrap_bulk)   (struct ptlrpc_request *req,
+                                               struct ptlrpc_bulk_desc *desc);
+};
+
+struct ptlrpc_sec_policy {
+       module_t                *sp_owner;
+       char                    *sp_name;
+       __u16                    sp_policy;  /* policy number */
+       struct ptlrpc_sec_cops  *sp_cops;    /* client ops */
+       struct ptlrpc_sec_sops  *sp_sops;    /* server ops */
+};
+
+#define PTLRPC_SEC_FL_REVERSE          0x0001 /* reverse sec */
+#define PTLRPC_SEC_FL_ROOTONLY         0x0002 /* treat everyone as root */
+#define PTLRPC_SEC_FL_UDESC            0x0004 /* ship udesc */
+#define PTLRPC_SEC_FL_BULK             0x0008 /* intensive bulk i/o expected */
+#define PTLRPC_SEC_FL_PAG              0x0010 /* PAG mode */
+
+/**
+ * The ptlrpc_sec represents the client side ptlrpc security facilities,
+ * each obd_import (both regular and reverse import) must associate with
+ * a ptlrpc_sec.
+ *
+ * \see sptlrpc_import_sec_adapt().
+ */
+struct ptlrpc_sec {
+       struct ptlrpc_sec_policy *ps_policy;
+       atomic_t                  ps_refcount;
+       /** statistic only */
+       atomic_t                  ps_nctx;
+       /** unique identifier */
+       int                       ps_id;
+       struct sptlrpc_flavor     ps_flvr;
+       enum lustre_sec_part      ps_part;
+       /** after set, no more new context will be created */
+       unsigned int              ps_dying:1;
+       /** owning import */
+       struct obd_import        *ps_import;
+       spinlock_t                ps_lock;
+
+       /*
+        * garbage collection
+        */
+       struct list_head          ps_gc_list;
+       cfs_time_t                ps_gc_interval; /* in seconds */
+       cfs_time_t                ps_gc_next;     /* in seconds */
+};
+
+static inline int sec_is_reverse(struct ptlrpc_sec *sec)
+{
+       return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE);
+}
+
+static inline int sec_is_rootonly(struct ptlrpc_sec *sec)
+{
+       return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY);
+}
+
+
+struct ptlrpc_svc_ctx {
+       atomic_t                  sc_refcount;
+       struct ptlrpc_sec_policy *sc_policy;
+};
+
+/*
+ * user identity descriptor
+ */
+#define LUSTRE_MAX_GROUPS             (128)
+
+struct ptlrpc_user_desc {
+       __u32   pud_uid;
+       __u32   pud_gid;
+       __u32   pud_fsuid;
+       __u32   pud_fsgid;
+       __u32   pud_cap;
+       __u32   pud_ngroups;
+       __u32   pud_groups[0];
+};
+
+/*
+ * bulk flavors
+ */
+enum sptlrpc_bulk_hash_alg {
+       BULK_HASH_ALG_NULL      = 0,
+       BULK_HASH_ALG_ADLER32,
+       BULK_HASH_ALG_CRC32,
+       BULK_HASH_ALG_MD5,
+       BULK_HASH_ALG_SHA1,
+       BULK_HASH_ALG_SHA256,
+       BULK_HASH_ALG_SHA384,
+       BULK_HASH_ALG_SHA512,
+       BULK_HASH_ALG_MAX
+};
+
+const char *sptlrpc_get_hash_name(__u8 hash_alg);
+__u8 sptlrpc_get_hash_alg(const char *algname);
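
/*
 * Round-trip sketch (editorial; the exact algorithm name string is an
 * assumption about the table behind these helpers): the two functions
 * are inverses over the set of known bulk hash algorithms.
 */
static void demo_hash_name_roundtrip(void)
{
        __u8 alg = sptlrpc_get_hash_alg("sha256");

        if (alg < BULK_HASH_ALG_MAX)
                /* expected to print the canonical name back */
                CDEBUG(D_SEC, "bulk hash: %s\n", sptlrpc_get_hash_name(alg));
}
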
+
+enum {
+       BSD_FL_ERR      = 1,
+};
+
+struct ptlrpc_bulk_sec_desc {
+       __u8    bsd_version;    /* 0 */
+       __u8    bsd_type;       /* SPTLRPC_BULK_XXX */
+       __u8    bsd_svc;        /* SPTLRPC_BULK_SVC_XXXX */
+       __u8    bsd_flags;      /* flags */
+       __u32   bsd_nob;        /* nob of bulk data */
+       __u8    bsd_data[0];    /* policy-specific token */
+};
+
+
+/*
+ * lprocfs
+ */
+struct proc_dir_entry;
+extern struct proc_dir_entry *sptlrpc_proc_root;
+
+/*
+ * round size up to next power of 2, for slab allocation.
+ * @size must be sane (can't overflow after round up)
+ */
+static inline int size_roundup_power2(int size)
+{
+       size--;
+       size |= size >> 1;
+       size |= size >> 2;
+       size |= size >> 4;
+       size |= size >> 8;
+       size |= size >> 16;
+       size++;
+       return size;
+}
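
/*
 * Worked example (editorial, not in the patch): the shifts OR the
 * highest set bit into every lower position, so the final increment
 * yields the next power of two, and exact powers map to themselves:
 *
 *   size_roundup_power2(1)    == 1
 *   size_roundup_power2(3000) == 4096   (2999 = 0xbb7 -> 0xfff -> 0x1000)
 *   size_roundup_power2(4096) == 4096
 */
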
+
+/*
+ * internal support libraries
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+                                 int segment, int newsize);
+
+/*
+ * security policies
+ */
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy);
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy);
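
/*
 * Registration sketch (editorial; the policy name, number, and ops
 * tables below are invented for illustration).  A policy module fills
 * in a ptlrpc_sec_policy and registers it at module load time.
 */
static struct ptlrpc_sec_cops demo_cops; /* client ops, filled elsewhere */
static struct ptlrpc_sec_sops demo_sops; /* server ops, filled elsewhere */

static struct ptlrpc_sec_policy demo_policy = {
        .sp_owner  = THIS_MODULE,
        .sp_name   = "demo",
        .sp_policy = 0,                  /* hypothetical policy number */
        .sp_cops   = &demo_cops,
        .sp_sops   = &demo_sops,
};

static int __init demo_sec_init(void)
{
        return sptlrpc_register_policy(&demo_policy);
}

static void __exit demo_sec_exit(void)
{
        sptlrpc_unregister_policy(&demo_policy);
}
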
+
+__u32 sptlrpc_name2flavor_base(const char *name);
+const char *sptlrpc_flavor2name_base(__u32 flvr);
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                              char *buf, int bufsize);
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize);
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize);
+
+static inline
+struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy)
+{
+       __module_get(policy->sp_owner);
+       return policy;
+}
+
+static inline
+void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy)
+{
+       module_put(policy->sp_owner);
+}
+
+/*
+ * client credential
+ */
+static inline
+unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx)
+{
+       return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK);
+}
+
+static inline
+int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx)
+{
+       return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE);
+}
+
+static inline
+int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx)
+{
+       return (cli_ctx_status(ctx) != 0);
+}
+
+static inline
+int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0);
+}
+
+static inline
+int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0);
+}
+
+static inline
+int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0);
+}
+
+static inline
+int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx)
+{
+       return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0);
+}
+
+/*
+ * sec get/put
+ */
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec);
+void sptlrpc_sec_put(struct ptlrpc_sec *sec);
+
+/*
+ * internal APIs used only by policy implementations
+ */
+int  sptlrpc_get_next_secid(void);
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec);
+
+/*
+ * exported client context api
+ */
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync);
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx);
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+
+/*
+ * exported client context wrap/buffers
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req);
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req);
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+                              int segment, int newsize);
+int  sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+                                   struct ptlrpc_request **req_ret);
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req);
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req);
+
+/*
+ * exported higher interface of import & request
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+                            struct ptlrpc_svc_ctx *ctx,
+                            struct sptlrpc_flavor *flvr);
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp);
+void sptlrpc_import_sec_put(struct obd_import *imp);
+
+int  sptlrpc_import_check_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp);
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp);
+int  sptlrpc_req_get_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync);
+int  sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout);
+int  sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req);
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode);
+
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule);
+
+/* gc */
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec);
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx);
+
+/* misc */
+const char *sec2target_str(struct ptlrpc_sec *sec);
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev);
+
+/*
+ * server side
+ */
+enum secsvc_accept_res {
+       SECSVC_OK       = 0,
+       SECSVC_COMPLETE,
+       SECSVC_DROP,
+};
+
+int  sptlrpc_svc_unwrap_request(struct ptlrpc_request *req);
+int  sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  sptlrpc_svc_wrap_reply(struct ptlrpc_request *req);
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs);
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req);
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req);
+
+int  sptlrpc_target_export_check(struct obd_export *exp,
+                                struct ptlrpc_request *req);
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+                                     struct sptlrpc_rule_set *rset);
+
+/*
+ * reverse context
+ */
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_svc_ctx *ctx);
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_cli_ctx *ctx);
+
+/* bulk security api */
+int sptlrpc_enc_pool_add_user(void);
+int sptlrpc_enc_pool_del_user(void);
+int  sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc);
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc);
+
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc);
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+                                struct ptlrpc_bulk_desc *desc,
+                                int nob);
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+                                 struct ptlrpc_bulk_desc *desc);
+
+/* bulk helpers (internal use only by policies) */
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                             void *buf, int buflen);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed);
+
+/* user descriptor helpers */
+static inline int sptlrpc_user_desc_size(int ngroups)
+{
+       return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32);
+}
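
/*
 * Sizing sketch (editorial): the descriptor ends in a flexible array,
 * so any buffer holding it is sized with sptlrpc_user_desc_size()
 * before packing.
 */
static struct ptlrpc_user_desc *demo_alloc_user_desc(int ngroups)
{
        struct ptlrpc_user_desc *pud;

        pud = kzalloc(sptlrpc_user_desc_size(ngroups), GFP_KERNEL);
        if (pud != NULL)
                pud->pud_ngroups = ngroups;
        return pud;
}
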
+
+int sptlrpc_current_user_desc_size(void);
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset);
+int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed);
+
+
+#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN)
+#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE)
+
+enum {
+       LUSTRE_SEC_NONE         = 0,
+       LUSTRE_SEC_REMOTE       = 1,
+       LUSTRE_SEC_SPECIFY      = 2,
+       LUSTRE_SEC_ALL          = 3
+};
+
+/** @} sptlrpc */
+
+#endif /* _LUSTRE_SEC_H_ */
diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h
new file mode 100644 (file)
index 0000000..84defce
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * lustre/include/lustre_update.h
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#ifndef _LUSTRE_UPDATE_H
+#define _LUSTRE_UPDATE_H
+
+#define UPDATE_BUFFER_SIZE     8192
+struct update_request {
+       struct dt_device        *ur_dt;
+       struct list_head         ur_list;    /* attached to the thandle */
+       int                      ur_flags;
+       int                      ur_rc;      /* request result */
+       int                      ur_batchid; /* current batch (trans) id */
+       struct update_buf       *ur_buf;     /* holds the update request */
+};
+
+static inline unsigned long update_size(struct update *update)
+{
+       unsigned long size;
+       int        i;
+
+       size = cfs_size_round(offsetof(struct update, u_bufs[0]));
+       for (i = 0; i < UPDATE_BUF_COUNT; i++)
+               size += cfs_size_round(update->u_lens[i]);
+
+       return size;
+}
+
+static inline void *update_param_buf(struct update *update, int index,
+                                    int *size)
+{
+       int     i;
+       void    *ptr;
+
+       if (index >= UPDATE_BUF_COUNT)
+               return NULL;
+
+       ptr = (char *)update + cfs_size_round(offsetof(struct update,
+                                                      u_bufs[0]));
+       for (i = 0; i < index; i++) {
+               LASSERT(update->u_lens[i] > 0);
+               ptr += cfs_size_round(update->u_lens[i]);
+       }
+
+       if (size != NULL)
+               *size = update->u_lens[index];
+
+       return ptr;
+}
+
+static inline unsigned long update_buf_size(struct update_buf *buf)
+{
+       unsigned long size;
+       int        i = 0;
+
+       size = cfs_size_round(offsetof(struct update_buf, ub_bufs[0]));
+       for (i = 0; i < buf->ub_count; i++) {
+               struct update *update;
+
+               update = (struct update *)((char *)buf + size);
+               size += update_size(update);
+       }
+       LASSERT(size <= UPDATE_BUFFER_SIZE);
+       return size;
+}
+
+static inline void *update_buf_get(struct update_buf *buf, int index, int *size)
+{
+       int     count = buf->ub_count;
+       void    *ptr;
+       int     i = 0;
+
+       if (index >= count)
+               return NULL;
+
+       ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf,
+                                                   ub_bufs[0]));
+       for (i = 0; i < index; i++)
+               ptr += update_size((struct update *)ptr);
+
+       if (size != NULL)
+               *size = update_size((struct update *)ptr);
+
+       return ptr;
+}
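
/*
 * Iteration sketch (editorial): update_buf_get() does the offset walk,
 * so visiting every update in a buffer is a simple indexed loop.
 */
static inline void demo_walk_updates(struct update_buf *buf)
{
        int i;

        for (i = 0; i < buf->ub_count; i++) {
                int size;
                struct update *update = update_buf_get(buf, i, &size);

                if (update == NULL)
                        break;
                /* inspect update->u_lens[] / update_param_buf() here */
        }
}
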
+
+static inline void update_init_reply_buf(struct update_reply *reply, int count)
+{
+       reply->ur_version = UPDATE_REPLY_V1;
+       reply->ur_count = count;
+}
+
+static inline void *update_get_buf_internal(struct update_reply *reply,
+                                           int index, int *size)
+{
+       char *ptr;
+       int count = reply->ur_count;
+       int i;
+
+       if (index >= count)
+               return NULL;
+
+       ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply,
+                                            ur_lens[count]));
+       for (i = 0; i < index; i++) {
+               LASSERT(reply->ur_lens[i] > 0);
+               ptr += cfs_size_round(reply->ur_lens[i]);
+       }
+
+       if (size != NULL)
+               *size = reply->ur_lens[index];
+
+       return ptr;
+}
+
+static inline void update_insert_reply(struct update_reply *reply, void *data,
+                                      int data_len, int index, int rc)
+{
+       char *ptr;
+
+       ptr = update_get_buf_internal(reply, index, NULL);
+       LASSERT(ptr != NULL);
+
+       *(int *)ptr = cpu_to_le32(rc);
+       ptr += sizeof(int);
+       if (data_len > 0) {
+               LASSERT(data != NULL);
+               memcpy(ptr, data, data_len);
+       }
+       reply->ur_lens[index] = data_len + sizeof(int);
+}
+
+static inline int update_get_reply_buf(struct update_reply *reply, void **buf,
+                                      int index)
+{
+       char *ptr;
+       int  size = 0;
+       int  result;
+
+       ptr = update_get_buf_internal(reply, index, &size);
+       LASSERT(ptr != NULL && size >= sizeof(int));
+       result = *(int *)ptr;
+
+       if (result < 0)
+               return result;
+
+       *buf = ptr + sizeof(int);
+       return size - sizeof(int);
+}
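
/*
 * Round-trip sketch (editorial; assumes \a reply was set up with
 * update_init_reply_buf() and is large enough for slot 0): a slot
 * written by update_insert_reply() is read back by
 * update_get_reply_buf(), which strips the leading result word.
 */
static inline int demo_reply_roundtrip(struct update_reply *reply)
{
        char data[] = "demo";
        void *buf = NULL;
        int rc;

        update_insert_reply(reply, data, sizeof(data), 0, 0);
        rc = update_get_reply_buf(reply, &buf, 0);
        return rc == sizeof(data) ? 0 : -EPROTO; /* rc is the data length */
}
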
+
+static inline int update_get_reply_result(struct update_reply *reply,
+                                         void **buf, int index)
+{
+       void *ptr;
+       int  size;
+
+       ptr = update_get_buf_internal(reply, index, &size);
+       LASSERT(ptr != NULL && size >= sizeof(int));
+       return *(int *)ptr;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h
new file mode 100644 (file)
index 0000000..dc187b8
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef _LUSTRE_VER_H_
+#define _LUSTRE_VER_H_
+/* This file automatically generated from lustre/include/lustre_ver.h.in,
+ * based on parameters in lustre/autoconf/lustre-version.ac.
+ * Changes made directly to this file will be lost. */
+
+#define LUSTRE_MAJOR 2
+#define LUSTRE_MINOR 3
+#define LUSTRE_PATCH 64
+#define LUSTRE_FIX 0
+#define LUSTRE_VERSION_STRING "2.3.64"
+
+#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX)
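
/*
 * Worked example (editorial, assuming the usual OBD_OCD_VERSION()
 * encoding of one byte per component: major<<24 | minor<<16 |
 * patch<<8 | fix):
 *
 *   2.3.64.0  ->  0x02034000  (33767424)
 */
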
+
+/* liblustre clients are only allowed to connect if their LUSTRE_FIX differs
+ * by at most this amount (set in lustre/autoconf/lustre-version.ac). */
+#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32)
+
+/* If the Lustre versions of a client and the servers it connects to differ
+ * by more than this amount, the client issues a warning.
+ * (set in lustre/autoconf/lustre-version.ac) */
+#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h
new file mode 100644 (file)
index 0000000..28f1a6b
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/lvfs.h
+ *
+ * lustre VFS/process permission interface
+ */
+
+#ifndef __LVFS_H__
+#define __LVFS_H__
+
+#define LL_FID_NAMELEN (16 + 1 + 8 + 1)
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lvfs.h>
+
+#include <linux/libcfs/lucache.h>
+
+
+/* lvfs_common.c */
+struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64, void *data);
+
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+              struct lvfs_ucred *cred);
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+             struct lvfs_ucred *cred);
+#endif
diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h
new file mode 100644 (file)
index 0000000..92d6420
--- /dev/null
@@ -0,0 +1,908 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/md_object.h
+ *
+ * Extension of lu_object.h for metadata objects
+ */
+
+#ifndef _LUSTRE_MD_OBJECT_H
+#define _LUSTRE_MD_OBJECT_H
+
+/** \defgroup md md
+ * Sub-class of lu_object with methods common for "meta-data" objects in MDT
+ * stack.
+ *
+ * Meta-data objects implement namespace operations: you can link, unlink
+ * them, and treat them as directories.
+ *
+ * Examples: mdt, cmm, and mdd are implementations of the md interface.
+ * @{
+ */
+
+
+/*
+ * super-class definitions.
+ */
+#include <dt_object.h>
+
+struct md_device;
+struct md_device_operations;
+struct md_object;
+struct obd_export;
+
+enum {
+       UCRED_INVALID   = -1,
+       UCRED_INIT      = 0,
+       UCRED_OLD       = 1,
+       UCRED_NEW       = 2
+};
+
+enum {
+       MD_CAPAINFO_MAX = 5
+};
+
+/** there are at most 5 fids in one operation (see rename); NOTE the last
+ * one is a temporary fid used for is_subdir() */
+struct md_capainfo {
+       __u32                   mc_auth;
+       __u32                   mc_padding;
+       struct lu_fid           mc_fid[MD_CAPAINFO_MAX];
+       struct lustre_capa     *mc_capa[MD_CAPAINFO_MAX];
+};
+
+struct md_quota {
+       struct obd_export       *mq_exp;
+};
+
+/**
+ * Implemented in mdd/mdd_handler.c.
+ *
+ * XXX should be moved into separate .h/.c together with all md security
+ * related definitions.
+ */
+struct md_capainfo *md_capainfo(const struct lu_env *env);
+struct md_quota *md_quota(const struct lu_env *env);
+
+/** metadata attributes */
+enum ma_valid {
+       MA_INODE     = (1 << 0),
+       MA_LOV       = (1 << 1),
+       MA_COOKIE    = (1 << 2),
+       MA_FLAGS     = (1 << 3),
+       MA_LMV       = (1 << 4),
+       MA_ACL_DEF   = (1 << 5),
+       MA_LOV_DEF   = (1 << 6),
+       MA_LAY_GEN   = (1 << 7),
+       MA_HSM       = (1 << 8),
+       MA_SOM       = (1 << 9),
+       MA_PFID      = (1 << 10)
+};
+
+typedef enum {
+       MDL_MINMODE  = 0,
+       MDL_EX       = 1,
+       MDL_PW       = 2,
+       MDL_PR       = 4,
+       MDL_CW       = 8,
+       MDL_CR       = 16,
+       MDL_NL       = 32,
+       MDL_GROUP    = 64,
+       MDL_MAXMODE
+} mdl_mode_t;
+
+typedef enum {
+       MDT_NUL_LOCK = 0,
+       MDT_REG_LOCK = (1 << 0),
+       MDT_PDO_LOCK = (1 << 1)
+} mdl_type_t;
+
+/* memory structure for hsm attributes;
+ * for a description of the fields see the on-disk structure hsm_attrs,
+ * defined in lustre_idl.h
+ */
+struct md_hsm {
+       __u32   mh_compat;
+       __u32   mh_flags;
+       __u64   mh_arch_id;
+       __u64   mh_arch_ver;
+};
+
+#define IOEPOCH_INVAL 0
+
+/* memory structure for som attributes;
+ * for a description of the fields see the on-disk structure som_attrs,
+ * defined in lustre_idl.h
+ */
+struct md_som_data {
+       __u32   msd_compat;
+       __u32   msd_incompat;
+       __u64   msd_ioepoch;
+       __u64   msd_size;
+       __u64   msd_blocks;
+       __u64   msd_mountid;
+};
+
+struct md_attr {
+       __u64                   ma_valid;
+       __u64                   ma_need;
+       __u64                   ma_attr_flags;
+       struct lu_attr          ma_attr;
+       struct lu_fid           ma_pfid;
+       struct md_hsm           ma_hsm;
+       struct lov_mds_md      *ma_lmm;
+       struct lmv_stripe_md   *ma_lmv;
+       void                   *ma_acl;
+       struct llog_cookie     *ma_cookie;
+       struct lustre_capa     *ma_capa;
+       struct md_som_data     *ma_som;
+       int                     ma_lmm_size;
+       int                     ma_lmv_size;
+       int                     ma_acl_size;
+       int                     ma_cookie_size;
+       __u16                   ma_layout_gen;
+};
+
+/** Additional parameters for create */
+struct md_op_spec {
+       union {
+               /** symlink target */
+               const char             *sp_symname;
+               /** parent FID for cross-ref mkdir */
+               const struct lu_fid      *sp_pfid;
+               /** eadata for regular files */
+               struct md_spec_reg {
+                       /** lov objs exist already */
+                       const struct lu_fid   *fid;
+                       const void *eadata;
+                       int  eadatalen;
+               } sp_ea;
+       } u;
+
+       /** Create flag from client: such as MDS_OPEN_CREAT, and others. */
+       __u64      sp_cr_flags;
+
+       /** don't create lov objects or llog cookies - this is a replay */
+       unsigned int no_create:1,
+                    sp_cr_lookup:1, /* do lookup sanity check or not */
+                    sp_rm_entry:1;  /* only remove name entry */
+
+       /** Current lock mode for the parent dir where the create is being
+        * performed. */
+       mdl_mode_t sp_cr_mode;
+
+       /** index features used to create a directory */
+       const struct dt_index_features *sp_feat;
+};
+
+/**
+ * Operations implemented for each md object (both directory and leaf).
+ */
+struct md_object_operations {
+       int (*moo_permission)(const struct lu_env *env,
+                             struct md_object *pobj, struct md_object *cobj,
+                             struct md_attr *attr, int mask);
+
+       int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj,
+                           struct md_attr *attr);
+
+       int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj,
+                           const struct md_attr *attr);
+
+       int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj,
+                            struct lu_buf *buf, const char *name);
+
+       int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj,
+                             struct lu_buf *buf);
+
+       int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj,
+                            const struct lu_buf *buf, const char *name,
+                            int fl);
+
+       int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj,
+                            const char *name);
+
+       /** This method is used to swap the layouts between 2 objects */
+       int (*moo_swap_layouts)(const struct lu_env *env,
+                              struct md_object *obj1, struct md_object *obj2,
+                              __u64 flags);
+
+       /** \retval number of bytes actually read upon success */
+       int (*moo_readpage)(const struct lu_env *env, struct md_object *obj,
+                           const struct lu_rdpg *rdpg);
+
+       int (*moo_readlink)(const struct lu_env *env, struct md_object *obj,
+                           struct lu_buf *buf);
+       int (*moo_changelog)(const struct lu_env *env,
+                            enum changelog_rec_type type, int flags,
+                            struct md_object *obj);
+       /** part of cross-ref operation */
+       int (*moo_object_create)(const struct lu_env *env,
+                                struct md_object *obj,
+                                const struct md_op_spec *spec,
+                                struct md_attr *ma);
+
+       int (*moo_ref_add)(const struct lu_env *env,
+                          struct md_object *obj,
+                          const struct md_attr *ma);
+
+       int (*moo_ref_del)(const struct lu_env *env,
+                          struct md_object *obj,
+                          struct md_attr *ma);
+
+       int (*moo_open)(const struct lu_env *env,
+                       struct md_object *obj, int flag);
+
+       int (*moo_close)(const struct lu_env *env, struct md_object *obj,
+                        struct md_attr *ma, int mode);
+
+       int (*moo_capa_get)(const struct lu_env *, struct md_object *,
+                           struct lustre_capa *, int renewal);
+
+       int (*moo_object_sync)(const struct lu_env *, struct md_object *);
+
+       int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj,
+                            struct lov_mds_md *lmm, struct ldlm_extent *extent,
+                            struct lustre_handle *lockh);
+       int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj,
+                              struct lov_mds_md *lmm,
+                              struct lustre_handle *lockh);
+       int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj,
+                              struct lustre_handle *lh,
+                              struct ldlm_enqueue_info *einfo,
+                              void *policy);
+};
+
+/**
+ * Operations implemented for each directory object.
+ */
+struct md_dir_operations {
+       int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj,
+                             const struct lu_fid *fid, struct lu_fid *sfid);
+
+       int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj,
+                         const struct lu_name *lname, struct lu_fid *fid,
+                         struct md_op_spec *spec);
+
+       mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env,
+                                   struct md_object *obj,
+                                   mdl_mode_t mode);
+
+       int (*mdo_create)(const struct lu_env *env, struct md_object *pobj,
+                         const struct lu_name *lname, struct md_object *child,
+                         struct md_op_spec *spec,
+                         struct md_attr *ma);
+
+       /** This method is used for creating the data object for this meta
+        * object. */
+       int (*mdo_create_data)(const struct lu_env *env, struct md_object *p,
+                              struct md_object *o,
+                              const struct md_op_spec *spec,
+                              struct md_attr *ma);
+
+       int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj,
+                         struct md_object *tpobj, const struct lu_fid *lf,
+                         const struct lu_name *lsname, struct md_object *tobj,
+                         const struct lu_name *ltname, struct md_attr *ma);
+
+       int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj,
+                       struct md_object *src_obj, const struct lu_name *lname,
+                       struct md_attr *ma);
+
+       int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj,
+                         struct md_object *cobj, const struct lu_name *lname,
+                         struct md_attr *ma, int no_name);
+
+       /** This method is used to compare a requested layout to an existing
+        * layout (struct lov_mds_md_v1/3 vs struct lov_mds_md_v1/3) */
+       int (*mdo_lum_lmm_cmp)(const struct lu_env *env,
+                              struct md_object *cobj,
+                              const struct md_op_spec *spec,
+                              struct md_attr *ma);
+
+       /** partial ops for cross-ref case */
+       int (*mdo_name_insert)(const struct lu_env *env,
+                              struct md_object *obj,
+                              const struct lu_name *lname,
+                              const struct lu_fid *fid,
+                              const struct md_attr *ma);
+
+       int (*mdo_name_remove)(const struct lu_env *env,
+                              struct md_object *obj,
+                              const struct lu_name *lname,
+                              const struct md_attr *ma);
+
+       int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj,
+                             struct md_object *tobj, const struct lu_fid *fid,
+                             const struct lu_name *lname, struct md_attr *ma);
+};
+
+struct md_device_operations {
+       /** meta-data device related handlers. */
+       int (*mdo_root_get)(const struct lu_env *env, struct md_device *m,
+                           struct lu_fid *f);
+
+       int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m,
+                              int *md_size, int *cookie_size);
+
+       int (*mdo_statfs)(const struct lu_env *env, struct md_device *m,
+                         struct obd_statfs *sfs);
+
+       int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m,
+                                 int mode, unsigned long timeout, __u32 alg,
+                                 struct lustre_capa_key *keys);
+
+       int (*mdo_update_capa_key)(const struct lu_env *env,
+                                  struct md_device *m,
+                                  struct lustre_capa_key *key);
+
+       int (*mdo_llog_ctxt_get)(const struct lu_env *env,
+                                struct md_device *m, int idx, void **h);
+
+       int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m,
+                            unsigned int cmd, int len, void *data);
+};
+
+enum md_upcall_event {
+       /** Sync the md layer */
+       MD_LOV_SYNC = (1 << 0),
+       /** Just for split; no transaction needed (replay only) */
+       MD_NO_TRANS = (1 << 1),
+       MD_LOV_CONFIG = (1 << 2),
+       /** Trigger quota recovery */
+       MD_LOV_QUOTA = (1 << 3)
+};
+
+struct md_upcall {
+       /** this lock protects the upcall against removal: the read lock is
+        * for using the upcall, the write lock for init/fini */
+       struct rw_semaphore     mu_upcall_sem;
+       /** device to call, upper layer normally */
+       struct md_device       *mu_upcall_dev;
+       /** upcall function */
+       int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
+                        enum md_upcall_event ev, void *data);
+};
+
+struct md_device {
+       struct lu_device                   md_lu_dev;
+       const struct md_device_operations *md_ops;
+       struct md_upcall                   md_upcall;
+};
+
+static inline void md_upcall_init(struct md_device *m, void *upcl)
+{
+       init_rwsem(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = NULL;
+       m->md_upcall.mu_upcall = upcl;
+}
+
+static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up)
+{
+       down_write(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = up;
+       up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline void md_upcall_fini(struct md_device *m)
+{
+       down_write(&m->md_upcall.mu_upcall_sem);
+       m->md_upcall.mu_upcall_dev = NULL;
+       m->md_upcall.mu_upcall = NULL;
+       up_write(&m->md_upcall.mu_upcall_sem);
+}
+
+static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
+                               enum md_upcall_event ev, void *data)
+{
+       int rc = 0;
+       down_read(&m->md_upcall.mu_upcall_sem);
+       if (m->md_upcall.mu_upcall_dev != NULL &&
+           m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) {
+               rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env,
+                                             m->md_upcall.mu_upcall_dev,
+                                             ev, data);
+       }
+       up_read(&m->md_upcall.mu_upcall_sem);
+       return rc;
+}
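
/*
 * Wiring sketch (editorial; the handler and helper are hypothetical).
 * The upper device supplies the hook via md_upcall_init(); the lower
 * device is then pointed at it with md_upcall_dev_set(), after which
 * md_do_upcall() on the lower device dispatches upward under the
 * semaphore.
 */
static int demo_md_upcall(const struct lu_env *env, struct md_device *md,
                          enum md_upcall_event ev, void *data)
{
        return 0;       /* a real handler would act on \a ev */
}

static void demo_wire_upcall(const struct lu_env *env,
                             struct md_device *lower,
                             struct md_device *upper)
{
        md_upcall_init(upper, demo_md_upcall);
        md_upcall_dev_set(lower, upper);
        (void)md_do_upcall(env, lower, MD_LOV_SYNC, NULL);
}
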
+
+struct md_object {
+       struct lu_object                   mo_lu;
+       const struct md_object_operations *mo_ops;
+       const struct md_dir_operations    *mo_dir_ops;
+};
+
+/**
+ * seq-server site.
+ */
+struct seq_server_site {
+       struct lu_site       *ss_lu;
+       /**
+        * mds number of this site.
+        */
+       mdsno_t        ss_node_id;
+       /**
+        * Fid location database
+        */
+       struct lu_server_fld *ss_server_fld;
+       struct lu_client_fld *ss_client_fld;
+
+       /**
+        * Server Seq Manager
+        */
+       struct lu_server_seq *ss_server_seq;
+
+       /**
+        * Controller Seq Manager
+        */
+       struct lu_server_seq *ss_control_seq;
+       struct obd_export    *ss_control_exp;
+
+       /**
+        * Client Seq Manager
+        */
+       struct lu_client_seq *ss_client_seq;
+};
+
+static inline struct md_device *lu2md_dev(const struct lu_device *d)
+{
+       LASSERT(IS_ERR(d) || lu_device_is_md(d));
+       return container_of0(d, struct md_device, md_lu_dev);
+}
+
+static inline struct lu_device *md2lu_dev(struct md_device *d)
+{
+       return &d->md_lu_dev;
+}
+
+static inline struct md_object *lu2md(const struct lu_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev));
+       return container_of0(o, struct md_object, mo_lu);
+}
+
+static inline struct md_object *md_object_next(const struct md_object *obj)
+{
+       return (obj ? lu2md(lu_object_next(&obj->mo_lu)) : NULL);
+}
+
+static inline struct md_device *md_obj2dev(const struct md_object *o)
+{
+       LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->mo_lu.lo_dev));
+       return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev);
+}
+
+static inline struct seq_server_site *lu_site2seq(const struct lu_site *s)
+{
+       return s->ld_seq_site;
+}
+
+static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
+{
+       return lu_device_init(&md->md_lu_dev, t);
+}
+
+static inline void md_device_fini(struct md_device *md)
+{
+       lu_device_fini(&md->md_lu_dev);
+}
+
+static inline struct md_object *md_object_find_slice(const struct lu_env *env,
+                                                    struct md_device *md,
+                                                    const struct lu_fid *f)
+{
+       return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL));
+}
+
+
+/** md operations */
+static inline int mo_permission(const struct lu_env *env,
+                               struct md_object *p,
+                               struct md_object *c,
+                               struct md_attr *at,
+                               int mask)
+{
+       LASSERT(c->mo_ops->moo_permission);
+       return c->mo_ops->moo_permission(env, p, c, at, mask);
+}
+
+static inline int mo_attr_get(const struct lu_env *env,
+                             struct md_object *m,
+                             struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_attr_get);
+       return m->mo_ops->moo_attr_get(env, m, at);
+}
+
+static inline int mo_readlink(const struct lu_env *env,
+                             struct md_object *m,
+                             struct lu_buf *buf)
+{
+       LASSERT(m->mo_ops->moo_readlink);
+       return m->mo_ops->moo_readlink(env, m, buf);
+}
+
+static inline int mo_changelog(const struct lu_env *env,
+                              enum changelog_rec_type type,
+                              int flags, struct md_object *m)
+{
+       LASSERT(m->mo_ops->moo_changelog);
+       return m->mo_ops->moo_changelog(env, type, flags, m);
+}
+
+static inline int mo_attr_set(const struct lu_env *env,
+                             struct md_object *m,
+                             const struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_attr_set);
+       return m->mo_ops->moo_attr_set(env, m, at);
+}
+
+static inline int mo_xattr_get(const struct lu_env *env,
+                              struct md_object *m,
+                              struct lu_buf *buf,
+                              const char *name)
+{
+       LASSERT(m->mo_ops->moo_xattr_get);
+       return m->mo_ops->moo_xattr_get(env, m, buf, name);
+}
+
+static inline int mo_xattr_del(const struct lu_env *env,
+                              struct md_object *m,
+                              const char *name)
+{
+       LASSERT(m->mo_ops->moo_xattr_del);
+       return m->mo_ops->moo_xattr_del(env, m, name);
+}
+
+static inline int mo_xattr_set(const struct lu_env *env,
+                              struct md_object *m,
+                              const struct lu_buf *buf,
+                              const char *name,
+                              int flags)
+{
+       LASSERT(m->mo_ops->moo_xattr_set);
+       return m->mo_ops->moo_xattr_set(env, m, buf, name, flags);
+}
+
+static inline int mo_xattr_list(const struct lu_env *env,
+                               struct md_object *m,
+                               struct lu_buf *buf)
+{
+       LASSERT(m->mo_ops->moo_xattr_list);
+       return m->mo_ops->moo_xattr_list(env, m, buf);
+}
+
+static inline int mo_swap_layouts(const struct lu_env *env,
+                                 struct md_object *o1,
+                                 struct md_object *o2, __u64 flags)
+{
+       LASSERT(o1->mo_ops->moo_swap_layouts);
+       LASSERT(o2->mo_ops->moo_swap_layouts);
+       if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts)
+               return -EPERM;
+       return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags);
+}
+
+static inline int mo_open(const struct lu_env *env,
+                         struct md_object *m,
+                         int flags)
+{
+       LASSERT(m->mo_ops->moo_open);
+       return m->mo_ops->moo_open(env, m, flags);
+}
+
+static inline int mo_close(const struct lu_env *env,
+                          struct md_object *m,
+                          struct md_attr *ma,
+                          int mode)
+{
+       LASSERT(m->mo_ops->moo_close);
+       return m->mo_ops->moo_close(env, m, ma, mode);
+}
+
+static inline int mo_readpage(const struct lu_env *env,
+                             struct md_object *m,
+                             const struct lu_rdpg *rdpg)
+{
+       LASSERT(m->mo_ops->moo_readpage);
+       return m->mo_ops->moo_readpage(env, m, rdpg);
+}
+
+static inline int mo_object_create(const struct lu_env *env,
+                                  struct md_object *m,
+                                  const struct md_op_spec *spc,
+                                  struct md_attr *at)
+{
+       LASSERT(m->mo_ops->moo_object_create);
+       return m->mo_ops->moo_object_create(env, m, spc, at);
+}
+
+static inline int mo_ref_add(const struct lu_env *env,
+                            struct md_object *m,
+                            const struct md_attr *ma)
+{
+       LASSERT(m->mo_ops->moo_ref_add);
+       return m->mo_ops->moo_ref_add(env, m, ma);
+}
+
+static inline int mo_ref_del(const struct lu_env *env,
+                            struct md_object *m,
+                            struct md_attr *ma)
+{
+       LASSERT(m->mo_ops->moo_ref_del);
+       return m->mo_ops->moo_ref_del(env, m, ma);
+}
+
+static inline int mo_capa_get(const struct lu_env *env,
+                             struct md_object *m,
+                             struct lustre_capa *c,
+                             int renewal)
+{
+       LASSERT(m->mo_ops->moo_capa_get);
+       return m->mo_ops->moo_capa_get(env, m, c, renewal);
+}
+
+static inline int mo_object_sync(const struct lu_env *env, struct md_object *m)
+{
+       LASSERT(m->mo_ops->moo_object_sync);
+       return m->mo_ops->moo_object_sync(env, m);
+}
+
+static inline int mo_file_lock(const struct lu_env *env, struct md_object *m,
+                              struct lov_mds_md *lmm,
+                              struct ldlm_extent *extent,
+                              struct lustre_handle *lockh)
+{
+       LASSERT(m->mo_ops->moo_file_lock);
+       return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh);
+}
+
+static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m,
+                                struct lov_mds_md *lmm,
+                                struct lustre_handle *lockh)
+{
+       LASSERT(m->mo_ops->moo_file_unlock);
+       return m->mo_ops->moo_file_unlock(env, m, lmm, lockh);
+}
+
+static inline int mo_object_lock(const struct lu_env *env,
+                                struct md_object *m,
+                                struct lustre_handle *lh,
+                                struct ldlm_enqueue_info *einfo,
+                                void *policy)
+{
+       LASSERT(m->mo_ops->moo_object_lock);
+       return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy);
+}
+
+static inline int mdo_lookup(const struct lu_env *env,
+                            struct md_object *p,
+                            const struct lu_name *lname,
+                            struct lu_fid *f,
+                            struct md_op_spec *spec)
+{
+       LASSERT(p->mo_dir_ops->mdo_lookup);
+       return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec);
+}
+
+static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env,
+                                      struct md_object *mo,
+                                      mdl_mode_t lm)
+{
+       if (mo->mo_dir_ops->mdo_lock_mode == NULL)
+               return MDL_MINMODE;
+       return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm);
+}
+
+static inline int mdo_create(const struct lu_env *env,
+                            struct md_object *p,
+                            const struct lu_name *lchild_name,
+                            struct md_object *c,
+                            struct md_op_spec *spc,
+                            struct md_attr *at)
+{
+       LASSERT(p->mo_dir_ops->mdo_create);
+       return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at);
+}
+
+static inline int mdo_create_data(const struct lu_env *env,
+                                 struct md_object *p,
+                                 struct md_object *c,
+                                 const struct md_op_spec *spec,
+                                 struct md_attr *ma)
+{
+       LASSERT(c->mo_dir_ops->mdo_create_data);
+       return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma);
+}
+
+static inline int mdo_rename(const struct lu_env *env,
+                            struct md_object *sp,
+                            struct md_object *tp,
+                            const struct lu_fid *lf,
+                            const struct lu_name *lsname,
+                            struct md_object *t,
+                            const struct lu_name *ltname,
+                            struct md_attr *ma)
+{
+       LASSERT(tp->mo_dir_ops->mdo_rename);
+       return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname,
+                                         ma);
+}
+
+static inline int mdo_is_subdir(const struct lu_env *env,
+                               struct md_object *mo,
+                               const struct lu_fid *fid,
+                               struct lu_fid *sfid)
+{
+       LASSERT(mo->mo_dir_ops->mdo_is_subdir);
+       return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid);
+}
+
+static inline int mdo_link(const struct lu_env *env,
+                          struct md_object *p,
+                          struct md_object *s,
+                          const struct lu_name *lname,
+                          struct md_attr *ma)
+{
+       LASSERT(s->mo_dir_ops->mdo_link);
+       return s->mo_dir_ops->mdo_link(env, p, s, lname, ma);
+}
+
+static inline int mdo_unlink(const struct lu_env *env,
+                            struct md_object *p,
+                            struct md_object *c,
+                            const struct lu_name *lname,
+                            struct md_attr *ma, int no_name)
+{
+       LASSERT(p->mo_dir_ops->mdo_unlink);
+       return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name);
+}
+
+static inline int mdo_lum_lmm_cmp(const struct lu_env *env,
+                                 struct md_object *c,
+                                 const struct md_op_spec *spec,
+                                 struct md_attr *ma)
+{
+       LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp);
+       return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma);
+}
+
+static inline int mdo_name_insert(const struct lu_env *env,
+                                 struct md_object *p,
+                                 const struct lu_name *lname,
+                                 const struct lu_fid *f,
+                                 const struct md_attr *ma)
+{
+       LASSERT(p->mo_dir_ops->mdo_name_insert);
+       return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma);
+}
+
+static inline int mdo_name_remove(const struct lu_env *env,
+                                 struct md_object *p,
+                                 const struct lu_name *lname,
+                                 const struct md_attr *ma)
+{
+       LASSERT(p->mo_dir_ops->mdo_name_remove);
+       return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma);
+}
+
+static inline int mdo_rename_tgt(const struct lu_env *env,
+                                struct md_object *p,
+                                struct md_object *t,
+                                const struct lu_fid *lf,
+                                const struct lu_name *lname,
+                                struct md_attr *ma)
+{
+       if (t) {
+               LASSERT(t->mo_dir_ops->mdo_rename_tgt);
+               return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+       } else {
+               LASSERT(p->mo_dir_ops->mdo_rename_tgt);
+               return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma);
+       }
+}
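+
+/*
+ * Usage note (illustrative, not part of this patch): mdo_rename_tgt()
+ * dispatches on the target object when one exists (rename over an
+ * existing name) and falls back to the parent directory otherwise:
+ *
+ *     rc = mdo_rename_tgt(env, parent, NULL, lf, lname, ma);
+ *             // no existing target: parent's mo_dir_ops do the insert
+ *     rc = mdo_rename_tgt(env, parent, tgt, lf, lname, ma);
+ *             // existing target: tgt's mo_dir_ops do the replacement
+ */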
+
+/**
+ * Used in MDD/OUT layer for object lock rule
+ **/
+enum mdd_object_role {
+       MOR_SRC_PARENT,
+       MOR_SRC_CHILD,
+       MOR_TGT_PARENT,
+       MOR_TGT_CHILD,
+       MOR_TGT_ORPHAN
+};
+
+struct dt_device;
+/**
+ * Structure to hold object information. This is used to create an object.
+ * \pre llod_dir exists
+ */
+struct lu_local_obj_desc {
+       const char                    *llod_dir;
+       const char                    *llod_name;
+       __u32                       llod_oid;
+       int                           llod_is_index;
+       const struct dt_index_features  *llod_feat;
+       struct list_head                       llod_linkage;
+};
+
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+
+struct lu_ucred {
+       __u32          uc_valid;
+       __u32          uc_o_uid;
+       __u32          uc_o_gid;
+       __u32          uc_o_fsuid;
+       __u32          uc_o_fsgid;
+       __u32          uc_uid;
+       __u32          uc_gid;
+       __u32          uc_fsuid;
+       __u32          uc_fsgid;
+       __u32          uc_suppgids[2];
+       cfs_cap_t          uc_cap;
+       __u32          uc_umask;
+       group_info_t   *uc_ginfo;
+       struct md_identity *uc_identity;
+};
+
+struct lu_ucred *lu_ucred(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_check(const struct lu_env *env);
+
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env);
+
+int lu_ucred_global_init(void);
+
+void lu_ucred_global_fini(void);
+
+#define md_cap_t(x) (x)
+
+#define MD_CAP_TO_MASK(x) (1 << (x))
+
+#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag))
+
+/* capable() is copied from linux kernel! */
+static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap)
+{
+       if (md_cap_raised(uc->uc_cap, cap))
+               return 1;
+       return 0;
+}
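+
+/*
+ * Worked example (illustrative, not part of this patch): with
+ * CFS_CAP_CHOWN == 0, MD_CAP_TO_MASK(CFS_CAP_CHOWN) == (1 << 0) == 0x1,
+ * so a credential whose uc_cap has bit 0 raised passes the check:
+ *
+ *     if (!md_capable(uc, CFS_CAP_CHOWN))
+ *             return -EPERM;
+ *
+ * Any other cfs_cap_t constant works the same way, one bit per capability.
+ */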
+
+/** @} md */
+#endif /* _LINUX_MD_OBJECT_H */
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
new file mode 100644 (file)
index 0000000..0a251fd
--- /dev/null
@@ -0,0 +1,1677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_H
+#define __OBD_H
+
+#include <linux/obd.h>
+
+#define IOC_OSC_TYPE    'h'
+#define IOC_OSC_MIN_NR       20
+#define IOC_OSC_SET_ACTIVE   _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
+#define IOC_OSC_MAX_NR       50
+
+#define IOC_MDC_TYPE    'i'
+#define IOC_MDC_MIN_NR       20
+#define IOC_MDC_MAX_NR       50
+
+#include <lustre/lustre_idl.h>
+#include <lu_ref.h>
+#include <lustre_lib.h>
+#include <lustre_export.h>
+#include <lustre_fld.h>
+#include <lustre_capa.h>
+
+#include <linux/libcfs/bitmap.h>
+
+#define MAX_OBD_DEVICES 8192
+
+struct osc_async_rc {
+       int     ar_rc;
+       int     ar_force_sync;
+       __u64   ar_min_xid;
+};
+
+struct lov_oinfo {              /* per-stripe data structure */
+       struct ost_id   loi_oi;    /* object ID/Sequence on the target OST */
+       int loi_ost_idx;           /* OST stripe index in lov_tgt_desc->tgts */
+       int loi_ost_gen;           /* generation of this loi_ost_idx */
+
+       unsigned long loi_kms_valid:1;
+       __u64 loi_kms;       /* known minimum size */
+       struct ost_lvb loi_lvb;
+       struct osc_async_rc     loi_ar;
+};
+
+static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms)
+{
+       oinfo->loi_kms = kms;
+       oinfo->loi_kms_valid = 1;
+}
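+
+/*
+ * Usage sketch (illustrative, not part of this patch): KMS is the "known
+ * minimum size" -- a lower bound on the stripe object's size that the
+ * client has already proven. After extending a stripe to 4096 bytes, the
+ * osc layer would record:
+ *
+ *     loi_kms_set(loi, 4096);   // loi_kms = 4096, loi_kms_valid = 1
+ */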
+
+static inline void loi_init(struct lov_oinfo *loi)
+{
+}
+
+struct lov_stripe_md {
+       atomic_t     lsm_refc;
+       spinlock_t      lsm_lock;
+       pid_t       lsm_lock_owner; /* debugging */
+
+       /* maximum possible file size; might change as OST status changes,
+        * e.g. disconnected, deactivated */
+       __u64       lsm_maxbytes;
+       struct {
+               /* Public members. */
+               struct ost_id lw_object_oi; /* lov object id/seq */
+
+               /* LOV-private members start here -- only for use in lov/. */
+               __u32 lw_magic;
+               __u32 lw_stripe_size;      /* size of the stripe */
+               __u32 lw_pattern;         /* striping pattern (RAID0, RAID1) */
+               __u16 lw_stripe_count;  /* number of objects being striped over */
+               __u16 lw_layout_gen;       /* generation of the layout */
+               char  lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */
+       } lsm_wire;
+
+       struct lov_oinfo *lsm_oinfo[0];
+};
+
+#define lsm_oi          lsm_wire.lw_object_oi
+#define lsm_magic      lsm_wire.lw_magic
+#define lsm_layout_gen   lsm_wire.lw_layout_gen
+#define lsm_stripe_size  lsm_wire.lw_stripe_size
+#define lsm_pattern      lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
+
+struct obd_info;
+
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
+
+/* obd info for a particular level (lov, osc). */
+struct obd_info {
+       /* Lock policy. It keeps an extent which is specific to a particular
+        * OSC (e.g. lov_prep_enqueue_set initialises the extent of the policy,
+        * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue). */
+       ldlm_policy_data_t      oi_policy;
+       /* Flags used to set request-specific flags:
+          - during lock handling, the flags obtained on the enqueue
+            request are set here;
+          - during statfs, the flags used to control delay/resend;
+          - during setattr, the flags used to distinguish punch operations.
+        */
+       __u64              oi_flags;
+       /* Lock handle specific for every OSC lock. */
+       struct lustre_handle   *oi_lockh;
+       /* lsm data specific for every OSC. */
+       struct lov_stripe_md   *oi_md;
+       /* obdo data specific for every OSC, if needed at all. */
+       struct obdo         *oi_oa;
+       /* statfs data specific for every OSC, if needed at all. */
+       struct obd_statfs      *oi_osfs;
+       /* An update callback which is called to update some data at the
+        * upper level. E.g. it is used to update lsm->lsm_oinfo at every
+        * received request at the osc level for enqueue requests. It is also
+        * possible to update some caller data from the LOV layer if needed. */
+       obd_enqueue_update_f    oi_cb_up;
+       /* OSS capability; its type is obd_capa on the client to avoid a copy,
+        * whereas it is lustre_capa on the OSS. */
+       void               *oi_capa;
+       /* transfer jobid from ost_sync() to filter_sync()... */
+       char               *oi_jobid;
+};
+
+/* compare all relevant fields. */
+static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
+                                   struct lov_stripe_md *m2)
+{
+       /*
+        * ->lsm_wire contains padding, but it should be zeroed out during
+        * allocation.
+        */
+       return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof(m1->lsm_wire));
+}
+
+static inline int lov_lum_lsm_cmp(struct lov_user_md *lum,
+                                 struct lov_stripe_md  *lsm)
+{
+       if (lsm->lsm_magic != lum->lmm_magic)
+               return 1;
+       if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) &&
+           (lsm->lsm_stripe_count != lum->lmm_stripe_count))
+               return 2;
+       if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) &&
+           (lsm->lsm_stripe_size != lum->lmm_stripe_size))
+               return 3;
+       if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) &&
+           (lsm->lsm_pattern != lum->lmm_pattern))
+               return 4;
+       if ((lsm->lsm_magic == LOV_MAGIC_V3) &&
+           (strncmp(lsm->lsm_pool_name,
+                    ((struct lov_user_md_v3 *)lum)->lmm_pool_name,
+                    LOV_MAXPOOLNAME) != 0))
+               return 5;
+       return 0;
+}
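+
+/*
+ * Usage note (illustrative, not part of this patch): the non-zero return
+ * values above are ordinals, not errnos -- they identify which field of
+ * the user request disagrees with the existing striping:
+ *
+ *     switch (lov_lum_lsm_cmp(lum, lsm)) {
+ *     case 0: break;          // compatible
+ *     case 1: ...             // magic differs
+ *     case 2: ...             // stripe_count differs
+ *     case 3: ...             // stripe_size differs
+ *     case 4: ...             // pattern differs
+ *     case 5: ...             // pool name differs (LOV_MAGIC_V3 only)
+ *     }
+ */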
+
+static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3,
+                                        int *lmm_magic,
+                                        struct lov_user_md *lum)
+{
+       if (lum && copy_from_user(lumv3, lum, sizeof(struct lov_user_md_v1)))
+               return -EFAULT;
+
+       *lmm_magic = lumv3->lmm_magic;
+
+       if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+               lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3);
+               *lmm_magic = LOV_USER_MAGIC_V1;
+       } else if (*lmm_magic == LOV_USER_MAGIC_V3) {
+               if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+                       return -EFAULT;
+       } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+               if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3)))
+                       return -EFAULT;
+               lustre_swab_lov_user_md_v3(lumv3);
+               *lmm_magic = LOV_USER_MAGIC_V3;
+       } else if (*lmm_magic != LOV_USER_MAGIC_V1) {
+               CDEBUG(D_IOCTL,
+                      "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+                      *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+               return -EINVAL;
+       }
+       return 0;
+}
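+
+/*
+ * Design note (illustrative): the magic-based endianness detection above
+ * works because the LOV magics are asymmetric byte patterns. A buffer from
+ * a peer of the opposite endianness reads back as __swab32(magic), so a
+ * match against the pre-swabbed constant means the whole descriptor needs
+ * swabbing -- e.g. for LOV_USER_MAGIC_V1 == 0x0BD10BD0, a foreign-endian
+ * buffer shows 0xD00BD10B on the local host.
+ */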
+
+void lov_stripe_lock(struct lov_stripe_md *md);
+void lov_stripe_unlock(struct lov_stripe_md *md);
+
+struct obd_type {
+       struct list_head typ_chain;
+       struct obd_ops *typ_dt_ops;
+       struct md_ops *typ_md_ops;
+       proc_dir_entry_t *typ_procroot;
+       char *typ_name;
+       int  typ_refcnt;
+       struct lu_device_type *typ_lu;
+       spinlock_t obd_type_lock;
+};
+
+struct brw_page {
+       obd_off  off;
+       struct page *pg;
+       int count;
+       obd_flag flag;
+};
+
+/* Individual type definitions */
+
+struct ost_server_data;
+
+struct osd_properties {
+       size_t osd_max_ea_size;
+};
+
+#define OBT_MAGIC       0xBDDECEAE
+/* hold common fields for "target" device */
+struct obd_device_target {
+       __u32                obt_magic;
+       __u32                obt_instance;
+       struct super_block       *obt_sb;
+       /** last_rcvd file */
+       struct file           *obt_rcvd_filp;
+       __u64                obt_mount_count;
+       struct rw_semaphore       obt_rwsem;
+       struct vfsmount   *obt_vfsmnt;
+       struct file           *obt_health_check_filp;
+       struct osd_properties     obt_osd_properties;
+       struct obd_job_stats      obt_jobstats;
+};
+
+/* llog contexts */
+enum llog_ctxt_id {
+       LLOG_CONFIG_ORIG_CTXT  =  0,
+       LLOG_CONFIG_REPL_CTXT,
+       LLOG_MDS_OST_ORIG_CTXT,
+       LLOG_MDS_OST_REPL_CTXT,
+       LLOG_SIZE_ORIG_CTXT,
+       LLOG_SIZE_REPL_CTXT,
+       LLOG_RD1_ORIG_CTXT,
+       LLOG_RD1_REPL_CTXT,
+       LLOG_TEST_ORIG_CTXT,
+       LLOG_TEST_REPL_CTXT,
+       LLOG_LOVEA_ORIG_CTXT,
+       LLOG_LOVEA_REPL_CTXT,
+       LLOG_CHANGELOG_ORIG_CTXT,      /**< changelog generation on mdd */
+       LLOG_CHANGELOG_REPL_CTXT,      /**< changelog access on clients */
+       LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */
+       LLOG_MAX_CTXTS
+};
+
+#define FILTER_SUBDIR_COUNT      32        /* set to zero for no subdirs */
+
+struct filter_subdirs {
+       struct dentry *dentry[FILTER_SUBDIR_COUNT];
+};
+
+struct filter_ext {
+       __u64           fe_start;
+       __u64           fe_end;
+};
+
+struct filter_obd {
+       /* NB this field MUST be first */
+       struct obd_device_target fo_obt;
+       const char              *fo_fstype;
+
+       int                     fo_group_count;
+       struct dentry           *fo_dentry_O;
+       struct dentry           **fo_dentry_O_groups;
+       struct filter_subdirs   *fo_dentry_O_sub;
+       struct mutex            fo_init_lock;   /* group initialization lock*/
+       int                     fo_committed_group;
+
+       spinlock_t              fo_objidlock;   /* protect fo_lastobjid */
+
+       unsigned long           fo_destroys_in_progress;
+       struct mutex            fo_create_locks[FILTER_SUBDIR_COUNT];
+
+       struct list_head fo_export_list;
+       int               fo_subdir_count;
+
+       obd_size             fo_tot_dirty;      /* protected by obd_osfs_lock */
+       obd_size             fo_tot_granted;    /* all values in bytes */
+       obd_size             fo_tot_pending;
+       int               fo_tot_granted_clients;
+
+       obd_size             fo_readcache_max_filesize;
+       spinlock_t              fo_flags_lock;
+       unsigned int     fo_read_cache:1,   /**< enable read-only cache */
+                            fo_writethrough_cache:1,/**< read cache writes */
+                            fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/
+                            fo_raid_degraded:1;/**< RAID device degraded */
+
+       struct obd_import   *fo_mdc_imp;
+       struct obd_uuid      fo_mdc_uuid;
+       struct lustre_handle fo_mdc_conn;
+       struct file     **fo_last_objid_files;
+       __u64          *fo_last_objids; /* last created objid for groups,
+                                             * protected by fo_objidlock */
+
+       struct mutex            fo_alloc_lock;
+
+       atomic_t         fo_r_in_flight;
+       atomic_t         fo_w_in_flight;
+
+       /*
+        * per-filter pool of kiobufs allocated by filter_common_setup() and
+        * torn down by filter_cleanup().
+        *
+        * This pool contains the kiobufs used by
+        * filter_{prep,commit}rw_{read,write}() and is shared by all OST
+        * threads.
+        *
+        * Locking: protected by the internal lock of cfs_hash; a pool can be
+        * found in this hash table by the t_id of a ptlrpc_thread.
+        */
+       struct cfs_hash         *fo_iobuf_hash;
+
+       struct brw_stats         fo_filter_stats;
+
+       int                   fo_fmd_max_num; /* per exp filter_mod_data */
+       int                   fo_fmd_max_age; /* jiffies to fmd expiry */
+       unsigned long       fo_syncjournal:1, /* sync journal on writes */
+                                fo_sync_lock_cancel:2;/* sync on lock cancel */
+
+       /* sptlrpc stuff */
+       rwlock_t                fo_sptlrpc_lock;
+       struct sptlrpc_rule_set  fo_sptlrpc_rset;
+
+       /* capability related */
+       unsigned int         fo_fl_oss_capa;
+       struct list_head               fo_capa_keys;
+       struct hlist_head       *fo_capa_hash;
+       int                   fo_sec_level;
+};
+
+struct timeout_item {
+       enum timeout_event ti_event;
+       cfs_time_t       ti_timeout;
+       timeout_cb_t       ti_cb;
+       void          *ti_cb_data;
+       struct list_head         ti_obd_list;
+       struct list_head         ti_chain;
+};
+
+#define OSC_MAX_RIF_DEFAULT       8
+#define MDS_OSC_MAX_RIF_DEFAULT   50
+#define OSC_MAX_RIF_MAX         256
+#define OSC_MAX_DIRTY_DEFAULT  (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX   2048     /* arbitrary, but < MAX_LONG bytes */
+#define OSC_DEFAULT_RESENDS      10
+
+/* possible values for fo_sync_lock_cancel */
+enum {
+       NEVER_SYNC_ON_CANCEL = 0,
+       BLOCKING_SYNC_ON_CANCEL = 1,
+       ALWAYS_SYNC_ON_CANCEL = 2,
+       NUM_SYNC_ON_CANCEL_STATES
+};
+
+#define MDC_MAX_RIF_DEFAULT       8
+#define MDC_MAX_RIF_MAX         512
+
+struct mdc_rpc_lock;
+struct obd_import;
+struct client_obd {
+       struct rw_semaphore  cl_sem;
+       struct obd_uuid   cl_target_uuid;
+       struct obd_import       *cl_import; /* ptlrpc connection state */
+       int                   cl_conn_count;
+       /* max_mds_easize is purely a performance thing so we don't have to
+        * call obd_size_diskmd() all the time. */
+       int                   cl_default_mds_easize;
+       int                   cl_max_mds_easize;
+       int                   cl_max_mds_cookiesize;
+
+       enum lustre_sec_part     cl_sp_me;
+       enum lustre_sec_part     cl_sp_to;
+       struct sptlrpc_flavor    cl_flvr_mgc;   /* fixed flavor of mgc->mgs */
+
+       /* the grant values are protected by loi_list_lock below */
+       long                 cl_dirty;   /* all _dirty_ in bytes */
+       long                 cl_dirty_max;     /* allowed w/o rpc */
+       long                 cl_dirty_transit; /* dirty synchronous */
+       long                 cl_avail_grant;   /* bytes of credit for ost */
+       long                 cl_lost_grant;    /* lost credits (trunc) */
+
+       /* since we allocate grant by blocks, we don't know how much grant will
+        * be used to add a page into the cache. As a solution, we reserve the
+        * maximum grant before trying to dirty a page and unreserve the rest.
+        * See osc_{reserve|unreserve}_grant for details. */
+       long             cl_reserved_grant;
+       struct list_head           cl_cache_waiters; /* waiting for cache/grant */
+       cfs_time_t         cl_next_shrink_grant;   /* jiffies */
+       struct list_head           cl_grant_shrink_list;  /* Timeout event list */
+       int               cl_grant_shrink_interval; /* seconds */
+
+       /* A chunk is an optimal size used by osc_extent to determine
+        * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */
+       int               cl_chunkbits;
+       int               cl_chunk;
+       int               cl_extent_tax; /* extent overhead, by bytes */
+
+       /* keep track of objects that have lois that contain pages which
+        * have been queued for async brw.  this lock also protects the
+        * lists of osc_client_pages that hang off of the loi */
+       /*
+        * ->cl_loi_list_lock protects consistency of
+        * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and
+        * ->ap_completion() call-backs are executed under this lock. As we
+        * cannot guarantee that these call-backs never block on all platforms
+        * (as a matter of fact they do block on Mac OS X), type of
+        * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux
+        * and blocking mutex on Mac OS X. (Alternative is to make this lock
+        * blocking everywhere, but we don't want to slow down fast-path of
+        * our main platform.)
+        *
+        * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together
+        * with client_obd_list_{un,}lock() and
+        * client_obd_list_lock_{init,done}() functions.
+        *
+        * NB by Jinshan: though the field names still say _loi_, the lists
+        * actually contain osc_object{}s.
+        */
+       client_obd_lock_t       cl_loi_list_lock;
+       struct list_head               cl_loi_ready_list;
+       struct list_head               cl_loi_hp_ready_list;
+       struct list_head               cl_loi_write_list;
+       struct list_head               cl_loi_read_list;
+       int                   cl_r_in_flight;
+       int                   cl_w_in_flight;
+       /* just a sum of the loi/lop pending numbers to be exported by /proc */
+       atomic_t             cl_pending_w_pages;
+       atomic_t             cl_pending_r_pages;
+       __u32                    cl_max_pages_per_rpc;
+       int                   cl_max_rpcs_in_flight;
+       struct obd_histogram     cl_read_rpc_hist;
+       struct obd_histogram     cl_write_rpc_hist;
+       struct obd_histogram     cl_read_page_hist;
+       struct obd_histogram     cl_write_page_hist;
+       struct obd_histogram     cl_read_offset_hist;
+       struct obd_histogram     cl_write_offset_hist;
+
+       /* lru for osc caching pages */
+       struct cl_client_cache  *cl_cache;
+       struct list_head                 cl_lru_osc; /* member of cl_cache->ccc_lru */
+       atomic_t                *cl_lru_left;
+       atomic_t                 cl_lru_busy;
+       atomic_t                 cl_lru_shrinkers;
+       atomic_t                 cl_lru_in_list;
+       struct list_head                 cl_lru_list; /* lru page list */
+       client_obd_lock_t        cl_lru_list_lock; /* page list protector */
+
+       /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */
+       atomic_t             cl_destroy_in_flight;
+       wait_queue_head_t             cl_destroy_waitq;
+
+       struct mdc_rpc_lock     *cl_rpc_lock;
+       struct mdc_rpc_lock     *cl_close_lock;
+
+       /* mgc datastruct */
+       struct semaphore         cl_mgc_sem;
+       struct vfsmount  *cl_mgc_vfsmnt;
+       struct dentry      *cl_mgc_configs_dir;
+       atomic_t             cl_mgc_refcount;
+       struct obd_export       *cl_mgc_mgsexp;
+
+       /* checksumming for data sent over the network */
+       unsigned int         cl_checksum:1; /* 0 = disabled, 1 = enabled */
+       /* supported checksum types that are worked out at connect time */
+       __u32               cl_supp_cksum_types;
+       /* checksum algorithm to be used */
+       cksum_type_t         cl_cksum_type;
+
+       /* also protected by the poorly named _loi_list_lock lock above */
+       struct osc_async_rc      cl_ar;
+
+       /* used by quotacheck when the servers are older than 2.4 */
+       int                   cl_qchk_stat; /* quotacheck stat of the peer */
+#define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0)
+#warning "please consider removing quotacheck compatibility code"
+#endif
+
+       /* sequence manager */
+       struct lu_client_seq    *cl_seq;
+
+       atomic_t             cl_resends; /* resend count */
+
+       /* ptlrpc work for writeback in ptlrpcd context */
+       void                *cl_writeback_work;
+       /* hash tables for osc_quota_info */
+       cfs_hash_t            *cl_quota_hash[MAXQUOTAS];
+};
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
+
+struct obd_id_info {
+       __u32   idx;
+       obd_id  *data;
+};
+
+struct echo_obd {
+       struct obd_device_target eo_obt;
+       struct obdo             eo_oa;
+       spinlock_t               eo_lock;
+       __u64                    eo_lastino;
+       struct lustre_handle    eo_nl_lock;
+       atomic_t                eo_prep;
+};
+
+struct ost_obd {
+       struct ptlrpc_service   *ost_service;
+       struct ptlrpc_service   *ost_create_service;
+       struct ptlrpc_service   *ost_io_service;
+       struct ptlrpc_service   *ost_seq_service;
+       struct mutex            ost_health_mutex;
+};
+
+struct echo_client_obd {
+       struct obd_export       *ec_exp;   /* the local connection to osc/lov */
+       spinlock_t              ec_lock;
+       struct list_head           ec_objects;
+       struct list_head           ec_locks;
+       int               ec_nstripes;
+       __u64           ec_unique;
+};
+
+struct lov_qos_oss {
+       struct obd_uuid     lqo_uuid;       /* ptlrpc's c_remote_uuid */
+       struct list_head          lqo_oss_list;   /* link to lov_qos */
+       __u64          lqo_bavail;     /* total bytes avail on OSS */
+       __u64          lqo_penalty;    /* current penalty */
+       __u64          lqo_penalty_per_obj;/* penalty decrease every obj*/
+       time_t        lqo_used;       /* last used time, seconds */
+       __u32          lqo_ost_count;  /* number of osts on this oss */
+};
+
+struct ltd_qos {
+       struct lov_qos_oss *ltq_oss;     /* oss info */
+       __u64          ltq_penalty;     /* current penalty */
+       __u64          ltq_penalty_per_obj; /* penalty decrease every obj*/
+       __u64          ltq_weight;      /* net weighting */
+       time_t        ltq_used; /* last used time, seconds */
+       unsigned int    ltq_usable:1;    /* usable for striping */
+};
+
+/* Generic subset of OSTs */
+struct ost_pool {
+       __u32         *op_array;      /* array of indices into
+                                                  lov_obd->lov_tgts */
+       unsigned int    op_count;      /* number of OSTs in the array */
+       unsigned int    op_size;       /* allocated size of op_array */
+       struct rw_semaphore op_rw_sem;     /* to protect ost_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+       __u32          lqr_start_idx;   /* start index of new inode */
+       __u32          lqr_offset_idx;  /* aliasing for start_idx  */
+       int              lqr_start_count; /* reseed counter */
+       struct ost_pool     lqr_pool;   /* round-robin optimized list */
+       unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* allow statfs data caching for 1 second */
+#define OBD_STATFS_CACHE_SECONDS 1
+
+struct lov_statfs_data {
+       struct obd_info   lsd_oi;
+       struct obd_statfs lsd_statfs;
+};
+/* Stripe placement optimization */
+struct lov_qos {
+       struct list_head          lq_oss_list; /* list of OSSs that targets use */
+       struct rw_semaphore lq_rw_sem;
+       __u32          lq_active_oss_count;
+       unsigned int    lq_prio_free;   /* priority for free space */
+       unsigned int    lq_threshold_rr;/* priority for rr */
+       struct lov_qos_rr   lq_rr;        /* round robin qos data */
+       unsigned long       lq_dirty:1,     /* recalc qos data */
+                           lq_same_space:1,/* the OSTs all have approx.
+                                              the same space avail */
+                           lq_reset:1,     /* zero current penalties */
+                           lq_statfs_in_progress:1; /* statfs op in
+                                                       progress */
+       /* qos statfs data */
+       struct lov_statfs_data *lq_statfs_data;
+       wait_queue_head_t        lq_statfs_waitq; /* waitqueue to notify statfs
+                                             * requests completion */
+};
+
+struct lov_tgt_desc {
+       struct list_head          ltd_kill;
+       struct obd_uuid     ltd_uuid;
+       struct obd_device  *ltd_obd;
+       struct obd_export  *ltd_exp;
+       struct ltd_qos      ltd_qos;     /* qos info per target */
+       __u32          ltd_gen;
+       __u32          ltd_index;   /* index in lov_obd->tgts */
+       unsigned long       ltd_active:1,/* is this target up for requests */
+                           ltd_activate:1,/* should  target be activated */
+                           ltd_reap:1;  /* should this target be deleted */
+};
+
+/* Pool metadata */
+#define pool_tgt_size(_p)   _p->pool_obds.op_size
+#define pool_tgt_count(_p)  _p->pool_obds.op_count
+#define pool_tgt_array(_p)  _p->pool_obds.op_array
+#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem
+
+struct pool_desc {
+       char              pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */
+       struct ost_pool       pool_obds;              /* pool members */
+       atomic_t          pool_refcount;          /* pool ref. counter */
+       struct lov_qos_rr     pool_rr;          /* round robin qos */
+       struct hlist_node      pool_hash;             /* access by poolname */
+       struct list_head            pool_list;        /* serial access */
+       proc_dir_entry_t *pool_proc_entry;      /* file in /proc */
+       struct obd_device    *pool_lobd;              /* obd of the lov/lod to which
+                                                      * this pool belongs */
+};
+
+struct lov_obd {
+       struct lov_desc  desc;
+       struct lov_tgt_desc   **lov_tgts;             /* sparse array */
+       struct ost_pool  lov_packed;        /* all OSTs in a packed
+                                                         array */
+       struct mutex            lov_lock;
+       struct obd_connect_data lov_ocd;
+       atomic_t            lov_refcount;
+       __u32              lov_tgt_count;        /* how many OBD's */
+       __u32              lov_active_tgt_count;  /* how many active */
+       __u32              lov_death_row;/* tgts scheduled to be deleted */
+       __u32              lov_tgt_size;   /* size of tgts array */
+       int                  lov_connects;
+       int                  lov_pool_count;
+       cfs_hash_t           *lov_pools_hash_body; /* used for key access */
+       struct list_head              lov_pool_list; /* used for sequential access */
+       proc_dir_entry_t   *lov_pool_proc_entry;
+       enum lustre_sec_part    lov_sp_me;
+
+       /* Cached LRU pages from upper layer */
+       void                   *lov_cache;
+
+       struct rw_semaphore     lov_notify_lock;
+};
+
+struct lmv_tgt_desc {
+       struct obd_uuid         ltd_uuid;
+       struct obd_export       *ltd_exp;
+       int                     ltd_idx;
+       struct mutex            ltd_fid_mutex;
+       unsigned long           ltd_active:1; /* target up for requests */
+};
+
+enum placement_policy {
+       PLACEMENT_CHAR_POLICY   = 0,
+       PLACEMENT_NID_POLICY    = 1,
+       PLACEMENT_INVAL_POLICY  = 2,
+       PLACEMENT_MAX_POLICY
+};
+
+typedef enum placement_policy placement_policy_t;
+
+struct lmv_obd {
+       int                     refcount;
+       struct lu_client_fld    lmv_fld;
+       spinlock_t              lmv_lock;
+       placement_policy_t      lmv_placement;
+       struct lmv_desc         desc;
+       struct obd_uuid         cluuid;
+       struct obd_export       *exp;
+
+       struct mutex            init_mutex;
+       int                     connected;
+       int                     max_easize;
+       int                     max_def_easize;
+       int                     max_cookiesize;
+       int                     server_timeout;
+
+       int                     tgts_size; /* size of tgts array */
+       struct lmv_tgt_desc     **tgts;
+
+       struct obd_connect_data conn_data;
+};
+
+struct niobuf_local {
+       __u64           lnb_file_offset;
+       __u32           lnb_page_offset;
+       __u32           len;
+       __u32           flags;
+       struct page     *page;
+       struct dentry   *dentry;
+       int             lnb_grant_used;
+       int             rc;
+};
+
+#define LUSTRE_FLD_NAME         "fld"
+#define LUSTRE_SEQ_NAME         "seq"
+
+#define LUSTRE_MDD_NAME         "mdd"
+#define LUSTRE_OSD_LDISKFS_NAME        "osd-ldiskfs"
+#define LUSTRE_OSD_ZFS_NAME     "osd-zfs"
+#define LUSTRE_VVP_NAME         "vvp"
+#define LUSTRE_LMV_NAME         "lmv"
+#define LUSTRE_SLP_NAME         "slp"
+#define LUSTRE_LOD_NAME                "lod"
+#define LUSTRE_OSP_NAME                "osp"
+#define LUSTRE_LWP_NAME                "lwp"
+
+/* obd device type names */
+/* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
+#define LUSTRE_MDS_NAME         "mds"
+#define LUSTRE_MDT_NAME         "mdt"
+#define LUSTRE_MDC_NAME         "mdc"
+#define LUSTRE_OSS_NAME         "ost"       /* FIXME change name to oss */
+#define LUSTRE_OST_NAME         "obdfilter" /* FIXME change name to ost */
+#define LUSTRE_OSC_NAME         "osc"
+#define LUSTRE_LOV_NAME         "lov"
+#define LUSTRE_MGS_NAME         "mgs"
+#define LUSTRE_MGC_NAME         "mgc"
+
+#define LUSTRE_ECHO_NAME       "obdecho"
+#define LUSTRE_ECHO_CLIENT_NAME "echo_client"
+#define LUSTRE_QMT_NAME         "qmt"
+
+/* Constant obd names (post-rename) */
+#define LUSTRE_MDS_OBDNAME "MDS"
+#define LUSTRE_OSS_OBDNAME "OSS"
+#define LUSTRE_MGS_OBDNAME "MGS"
+#define LUSTRE_MGC_OBDNAME "MGC"
+
+static inline int is_osp_on_mdt(char *name)
+{
+       char   *ptr;
+
+       ptr = strrchr(name, '-');
+       if (ptr == NULL) {
+               CERROR("%s is not an obdname\n", name);
+               return 0;
+       }
+
+       /* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */
+       if (strncmp(ptr + 1, "osc", 3) == 0)
+               return 1;
+
+       if (strncmp(ptr + 1, "MDT", 3) != 0)
+               return 0;
+
+       while (*(--ptr) != '-' && ptr != name);
+
+       if (ptr == name)
+               return 0;
+
+       if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 &&
+           strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0)
+               return 0;
+
+       return 1;
+}
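+
+/*
+ * Examples (illustrative, not part of this patch) of names the parser
+ * above accepts and rejects, assuming a filesystem named "lustre":
+ *
+ *     "lustre-OST0000-osc-MDT0000" -> 1  (OSP/OSC device on an MDT)
+ *     "lustre-OST0000-osc"         -> 1  (1.8-style OSC name on an MDT)
+ *     "lustre-MDT0001-mdc"         -> 0  (plain MDC)
+ *     "lustre"                     -> 0  (no '-', not an obdname)
+ */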
+
+/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */
+#define N_LOCAL_TEMP_PAGE 0x10000000
+
+struct obd_trans_info {
+       __u64               oti_transno;
+       __u64               oti_xid;
+       /* Only used on the server side for tracking acks. */
+       struct oti_req_ack_lock {
+               struct lustre_handle lock;
+               __u32           mode;
+       }                       oti_ack_locks[4];
+       void                *oti_handle;
+       struct llog_cookie       oti_onecookie;
+       struct llog_cookie      *oti_logcookies;
+       int                   oti_numcookies;
+       /** synchronous write is needed */
+       unsigned long            oti_sync_write:1;
+
+       /* initial thread handling transaction */
+       struct ptlrpc_thread *   oti_thread;
+       __u32               oti_conn_cnt;
+       /** VBR: versions */
+       __u64               oti_pre_version;
+       /** JobID */
+       char                *oti_jobid;
+
+       struct obd_uuid  *oti_ost_uuid;
+};
+
+static inline void oti_init(struct obd_trans_info *oti,
+                           struct ptlrpc_request *req)
+{
+       if (oti == NULL)
+               return;
+       memset(oti, 0, sizeof(*oti));
+
+       if (req == NULL)
+               return;
+
+       oti->oti_xid = req->rq_xid;
+       /** VBR: take versions from request */
+       if (req->rq_reqmsg != NULL &&
+           lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+               __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg);
+               oti->oti_pre_version = pre_version ? pre_version[0] : 0;
+               oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg);
+       }
+
+       /** called from mds_create_objects */
+       if (req->rq_repmsg != NULL)
+               oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+       oti->oti_thread = req->rq_svc_thread;
+       if (req->rq_reqmsg != NULL)
+               oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+}
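+
+/*
+ * Usage sketch (illustrative, not part of this patch): a server-side
+ * handler seeds the transaction info from the incoming request before
+ * performing the object operation:
+ *
+ *     struct obd_trans_info oti;
+ *
+ *     oti_init(&oti, req);    // copies xid, replay versions, conn count
+ *     rc = obd_create(env, exp, oa, &lsm, &oti);
+ *
+ * obd_create() here stands for any OBD method taking an obd_trans_info.
+ */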
+
+static inline void oti_alloc_cookies(struct obd_trans_info *oti,
+                                    int num_cookies)
+{
+       if (!oti)
+               return;
+
+       if (num_cookies == 1)
+               oti->oti_logcookies = &oti->oti_onecookie;
+       else
+               OBD_ALLOC_LARGE(oti->oti_logcookies,
+                               num_cookies * sizeof(oti->oti_onecookie));
+
+       oti->oti_numcookies = num_cookies;
+}
+
+static inline void oti_free_cookies(struct obd_trans_info *oti)
+{
+       if (!oti || !oti->oti_logcookies)
+               return;
+
+       if (oti->oti_logcookies == &oti->oti_onecookie)
+               LASSERT(oti->oti_numcookies == 1);
+       else
+               OBD_FREE_LARGE(oti->oti_logcookies,
+                              oti->oti_numcookies*sizeof(oti->oti_onecookie));
+       oti->oti_logcookies = NULL;
+       oti->oti_numcookies = 0;
+}
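+
+/*
+ * Pairing note (illustrative): oti_alloc_cookies() stores a single cookie
+ * inline in oti_onecookie and only allocates for num_cookies > 1, so the
+ * matching oti_free_cookies() frees only when the pointer does not alias
+ * the inline cookie:
+ *
+ *     oti_alloc_cookies(oti, stripe_count);
+ *     // ... fill and consume oti->oti_logcookies ...
+ *     oti_free_cookies(oti);
+ */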
+
+/*
+ * Events signalled through obd_notify() upcall-chain.
+ */
+enum obd_notify_event {
+       /* target added */
+       OBD_NOTIFY_CREATE,
+       /* Device connect start */
+       OBD_NOTIFY_CONNECT,
+       /* Device activated */
+       OBD_NOTIFY_ACTIVE,
+       /* Device deactivated */
+       OBD_NOTIFY_INACTIVE,
+       /* Device disconnected */
+       OBD_NOTIFY_DISCON,
+       /* Connect data for import were changed */
+       OBD_NOTIFY_OCD,
+       /* Sync request */
+       OBD_NOTIFY_SYNC_NONBLOCK,
+       OBD_NOTIFY_SYNC,
+       /* Configuration event */
+       OBD_NOTIFY_CONFIG,
+       /* Administratively deactivate/activate event */
+       OBD_NOTIFY_DEACTIVATE,
+       OBD_NOTIFY_ACTIVATE
+};
+
+/*
+ * Data structure used to pass obd_notify()-event to non-obd listeners (llite
+ * and liblustre being main examples).
+ */
+struct obd_notify_upcall {
+       int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
+                         enum obd_notify_event ev, void *owner, void *data);
+       /* Opaque datum supplied by upper layer listener */
+       void *onu_owner;
+};
+
+struct target_recovery_data {
+       svc_handler_t           trd_recovery_handler;
+       pid_t                   trd_processing_task;
+       struct completion       trd_starting;
+       struct completion       trd_finishing;
+};
+
+struct obd_llog_group {
+       int             olg_seq;
+       struct llog_ctxt  *olg_ctxts[LLOG_MAX_CTXTS];
+       wait_queue_head_t       olg_waitq;
+       spinlock_t         olg_lock;
+       struct mutex       olg_cat_processing;
+};
+
+/* corresponds to one of the obd's */
+#define OBD_DEVICE_MAGIC       0XAB5CD6EF
+#define OBD_DEV_BY_DEVNAME      0xffffd0de
+
+struct obd_device {
+       struct obd_type *obd_type;
+       __u32              obd_magic;
+
+       /* common and UUID name of this device */
+       char                obd_name[MAX_OBD_NAME];
+       struct obd_uuid  obd_uuid;
+
+       struct lu_device       *obd_lu_dev;
+
+       int                  obd_minor;
+       /* bitfield modification is protected by obd_dev_lock */
+       unsigned long obd_attached:1,      /* finished attach */
+                     obd_set_up:1,     /* finished setup */
+                     obd_recovering:1,    /* there are recoverable clients */
+                     obd_abort_recovery:1,/* recovery expired */
+                     obd_version_recov:1, /* obd uses version checking */
+                     obd_replayable:1,    /* recovery is enabled; inform clients */
+                     obd_no_transno:1,    /* no committed-transno notification */
+                     obd_no_recov:1,      /* fail instead of retry messages */
+                     obd_stopping:1,      /* started cleanup */
+                     obd_starting:1,      /* started setup */
+                     obd_force:1,       /* cleanup with > 0 obd refcount */
+                     obd_fail:1,         /* cleanup with failover */
+                     obd_async_recov:1,   /* allow asynchronous orphan cleanup */
+                     obd_no_conn:1,       /* deny new connections */
+                     obd_inactive:1,      /* device active/inactive
+                                          * (for /proc/status only!!) */
+                     obd_no_ir:1,       /* no imperative recovery. */
+                     obd_process_conf:1;  /* device is processing mgs config */
+       /* use a separate field, as it is set in interrupt context, to avoid
+        * messing with the protection of the other bits using the _bh lock */
+       unsigned long obd_recovery_expired:1;
+       /* uuid-export hash body */
+       cfs_hash_t           *obd_uuid_hash;
+       /* nid-export hash body */
+       cfs_hash_t           *obd_nid_hash;
+       /* nid stats body */
+       cfs_hash_t           *obd_nid_stats_hash;
+       struct list_head              obd_nid_stats;
+       atomic_t            obd_refcount;
+       wait_queue_head_t            obd_refcount_waitq;
+       struct list_head              obd_exports;
+       struct list_head              obd_unlinked_exports;
+       struct list_head              obd_delayed_exports;
+       int                  obd_num_exports;
+       spinlock_t              obd_nid_lock;
+       struct ldlm_namespace  *obd_namespace;
+       struct ptlrpc_client    obd_ldlm_client; /* XXX OST/MDS only */
+       /* a spinlock is OK for what we do now, may need a semaphore later */
+       spinlock_t              obd_dev_lock; /* protect OBD bitfield above */
+       struct mutex            obd_dev_mutex;
+       __u64                   obd_last_committed;
+       struct fsfilt_operations *obd_fsops;
+       spinlock_t              obd_osfs_lock;
+       struct obd_statfs       obd_osfs;       /* locked by obd_osfs_lock */
+       __u64                   obd_osfs_age;
+       struct lvfs_run_ctxt    obd_lvfs_ctxt;
+       struct obd_llog_group   obd_olg;        /* default llog group */
+       struct obd_device       *obd_observer;
+       struct rw_semaphore     obd_observer_link_sem;
+       struct obd_notify_upcall obd_upcall;
+       struct obd_export       *obd_self_export;
+       /* list of exports in LRU order, for ping evictor, with obd_dev_lock */
+       struct list_head              obd_exports_timed;
+       time_t            obd_eviction_timer; /* for ping evictor */
+
+       int                           obd_max_recoverable_clients;
+       atomic_t                     obd_connected_clients;
+       int                           obd_stale_clients;
+       int                           obd_delayed_clients;
+       /* this lock protects all recovery list_heads, timer and
+        * obd_next_recovery_transno value */
+       spinlock_t                       obd_recovery_task_lock;
+       __u64                       obd_next_recovery_transno;
+       int                           obd_replayed_requests;
+       int                           obd_requests_queued_for_recovery;
+       wait_queue_head_t                     obd_next_transno_waitq;
+       /* protected by obd_recovery_task_lock */
+       timer_list_t                  obd_recovery_timer;
+       time_t                     obd_recovery_start; /* seconds */
+       time_t                     obd_recovery_end; /* seconds, for lprocfs_status */
+       int                           obd_recovery_time_hard;
+       int                           obd_recovery_timeout;
+       int                           obd_recovery_ir_factor;
+
+       /* new recovery stuff from CMD2 */
+       struct target_recovery_data      obd_recovery_data;
+       int                           obd_replayed_locks;
+       atomic_t                     obd_req_replay_clients;
+       atomic_t                     obd_lock_replay_clients;
+       /* all lists are protected by obd_recovery_task_lock */
+       struct list_head                       obd_req_replay_queue;
+       struct list_head                       obd_lock_replay_queue;
+       struct list_head                       obd_final_req_queue;
+       int                           obd_recovery_stage;
+
+       union {
+               struct obd_device_target obt;
+               struct filter_obd filter;
+               struct client_obd cli;
+               struct ost_obd ost;
+               struct echo_client_obd echo_client;
+               struct echo_obd echo;
+               struct lov_obd lov;
+               struct lmv_obd lmv;
+       } u;
+       /* Fields used by LProcFS */
+       unsigned int       obd_cntr_base;
+       struct lprocfs_stats  *obd_stats;
+
+       unsigned int       md_cntr_base;
+       struct lprocfs_stats  *md_stats;
+
+       proc_dir_entry_t  *obd_proc_entry;
+       void              *obd_proc_private; /* type private PDEs */
+       proc_dir_entry_t  *obd_proc_exports_entry;
+       proc_dir_entry_t  *obd_svc_procroot;
+       struct lprocfs_stats  *obd_svc_stats;
+       atomic_t           obd_evict_inprogress;
+       wait_queue_head_t           obd_evict_inprogress_waitq;
+       struct list_head             obd_evict_list; /* protected with pet_lock */
+
+       /**
+        * Ldlm pool part. Save last calculated SLV and Limit.
+        */
+       rwlock_t                obd_pool_lock;
+       int                 obd_pool_limit;
+       __u64             obd_pool_slv;
+
+       /**
+        * A list of outstanding class_incref()'s against this obd. For
+        * debugging.
+        */
+       struct lu_ref     obd_reference;
+
+       int                    obd_conn_inprogress;
+};
+
+#define OBD_LLOG_FL_SENDNOW     0x0001
+#define OBD_LLOG_FL_EXIT       0x0002
+
+enum obd_cleanup_stage {
+/* Special case hack for MDS LOVs */
+       OBD_CLEANUP_EARLY,
+/* can be directly mapped to .ldto_device_fini() */
+       OBD_CLEANUP_EXPORTS,
+};
+
+/* get/set_info keys */
+#define KEY_ASYNC             "async"
+#define KEY_BLOCKSIZE_BITS      "blocksize_bits"
+#define KEY_BLOCKSIZE     "blocksize"
+#define KEY_CAPA_KEY       "capa_key"
+#define KEY_CHANGELOG_CLEAR     "changelog_clear"
+#define KEY_FID2PATH       "fid2path"
+#define KEY_CHECKSUM       "checksum"
+#define KEY_CLEAR_FS       "clear_fs"
+#define KEY_CONN_DATA     "conn_data"
+#define KEY_EVICT_BY_NID       "evict_by_nid"
+#define KEY_FIEMAP           "fiemap"
+#define KEY_FLUSH_CTX     "flush_ctx"
+#define KEY_GRANT_SHRINK       "grant_shrink"
+#define KEY_HSM_COPYTOOL_SEND   "hsm_send"
+#define KEY_INIT_RECOV_BACKUP   "init_recov_bk"
+#define KEY_INIT_RECOV   "initial_recov"
+#define KEY_INTERMDS       "inter_mds"
+#define KEY_LAST_ID         "last_id"
+#define KEY_LAST_FID           "last_fid"
+#define KEY_LOCK_TO_STRIPE      "lock_to_stripe"
+#define KEY_LOVDESC         "lovdesc"
+#define KEY_LOV_IDX         "lov_idx"
+#define KEY_MAX_EASIZE   "max_easize"
+#define KEY_MDS_CONN       "mds_conn"
+#define KEY_MGSSEC           "mgssec"
+#define KEY_NEXT_ID         "next_id"
+#define KEY_READ_ONLY     "read-only"
+#define KEY_REGISTER_TARGET     "register_target"
+#define KEY_SET_FS           "set_fs"
+#define KEY_TGT_COUNT     "tgt_count"
+/*      KEY_SET_INFO in lustre_idl.h */
+#define KEY_SPTLRPC_CONF       "sptlrpc_conf"
+#define KEY_CONNECT_FLAG       "connect_flags"
+#define KEY_SYNC_LOCK_CANCEL    "sync_lock_cancel"
+
+#define KEY_CACHE_SET          "cache_set"
+#define KEY_CACHE_LRU_SHRINK   "cache_lru_shrink"
+#define KEY_CHANGELOG_INDEX    "changelog_index"
+
+struct lu_context;
+
+/* /!\ must be coherent with include/linux/namei.h on patched kernel */
+#define IT_OPEN     (1 << 0)
+#define IT_CREAT    (1 << 1)
+#define IT_READDIR  (1 << 2)
+#define IT_GETATTR  (1 << 3)
+#define IT_LOOKUP   (1 << 4)
+#define IT_UNLINK   (1 << 5)
+#define IT_TRUNC    (1 << 6)
+#define IT_GETXATTR (1 << 7)
+#define IT_EXEC     (1 << 8)
+#define IT_PIN      (1 << 9)
+#define IT_LAYOUT   (1 << 10)
+#define IT_QUOTA_DQACQ (1 << 11)
+#define IT_QUOTA_CONN  (1 << 12)
+
+static inline int it_to_lock_mode(struct lookup_intent *it)
+{
+       /* CREAT needs to be tested before open (both could be set) */
+       if (it->it_op & IT_CREAT)
+               return LCK_CW;
+       else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP |
+                             IT_LAYOUT))
+               return LCK_CR;
+
+       LASSERTF(0, "Invalid it_op: %d\n", it->it_op);
+       return -EINVAL;
+}
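+
+/*
+ * Worked example (illustrative, not part of this patch): an open-with-create
+ * intent carries both IT_OPEN and IT_CREAT; since IT_CREAT is tested first,
+ * the intent maps to the stronger LCK_CW mode rather than LCK_CR:
+ *
+ *     it.it_op = IT_OPEN | IT_CREAT;
+ *     mode = it_to_lock_mode(&it);    // returns LCK_CW
+ */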
+
+struct md_op_data {
+       struct lu_fid      op_fid1; /* operation fid1 (usually parent) */
+       struct lu_fid      op_fid2; /* operation fid2 (usually child) */
+       struct lu_fid      op_fid3; /* 2 extra fids used to find */
+       struct lu_fid      op_fid4; /* conflicting operation locks. */
+       mdsno_t          op_mds;  /* what mds server open will go to */
+       struct lustre_handle    op_handle;
+       obd_time                op_mod_time;
+       const char           *op_name;
+       int                  op_namelen;
+       __u32              op_mode;
+       struct lmv_stripe_md   *op_mea1;
+       struct lmv_stripe_md   *op_mea2;
+       __u32              op_suppgids[2];
+       __u32              op_fsuid;
+       __u32              op_fsgid;
+       cfs_cap_t              op_cap;
+       void               *op_data;
+
+       /* iattr fields and blocks. */
+       struct iattr        op_attr;
+       unsigned int        op_attr_flags;
+       __u64              op_valid;
+       loff_t            op_attr_blocks;
+
+       /* Size-on-MDS epoch and flags. */
+       __u64              op_ioepoch;
+       __u32              op_flags;
+
+       /* Capa fields */
+       struct obd_capa *op_capa1;
+       struct obd_capa *op_capa2;
+
+       /* Various operation flags. */
+       __u32              op_bias;
+
+       /* Operation type */
+       __u32              op_opc;
+
+       /* Used by readdir */
+       __u64              op_offset;
+
+       /* Used by readdir */
+       __u32              op_npages;
+
+       /* used to transfer info between the stacks of the MD client;
+        * see enum op_cli_flags */
+       __u32                   op_cli_flags;
+};
+
+enum op_cli_flags {
+       CLI_SET_MEA     = 1 << 0,
+       CLI_RM_ENTRY    = 1 << 1,
+};
+
+struct md_enqueue_info;
+/* metadata stat-ahead */
+typedef int (*md_enqueue_cb_t)(struct ptlrpc_request *req,
+                               struct md_enqueue_info *minfo,
+                               int rc);
+
+/* seq client type */
+enum lu_cli_type {
+       LUSTRE_SEQ_METADATA = 1,
+       LUSTRE_SEQ_DATA
+};
+
+struct md_enqueue_info {
+       struct md_op_data       mi_data;
+       struct lookup_intent    mi_it;
+       struct lustre_handle    mi_lockh;
+       struct inode       *mi_dir;
+       md_enqueue_cb_t  mi_cb;
+       __u64              mi_cbdata;
+       unsigned int        mi_generation;
+};
+
+struct obd_ops {
+       module_t *o_owner;
+       int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len,
+                          void *karg, void *uarg);
+       int (*o_get_info)(const struct lu_env *env, struct obd_export *,
+                         __u32 keylen, void *key, __u32 *vallen, void *val,
+                         struct lov_stripe_md *lsm);
+       int (*o_set_info_async)(const struct lu_env *, struct obd_export *,
+                               __u32 keylen, void *key,
+                               __u32 vallen, void *val,
+                               struct ptlrpc_request_set *set);
+       int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
+       int (*o_detach)(struct obd_device *dev);
+       int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg);
+       int (*o_precleanup)(struct obd_device *dev,
+                           enum obd_cleanup_stage cleanup_stage);
+       int (*o_cleanup)(struct obd_device *dev);
+       int (*o_process_config)(struct obd_device *dev, obd_count len,
+                               void *data);
+       int (*o_postrecov)(struct obd_device *dev);
+       int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid,
+                         int priority);
+       int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid);
+       /* connect to the target device with given connection
+        * data. @ocd->ocd_connect_flags is modified to reflect flags actually
+        * granted by the target, which are guaranteed to be a subset of flags
+        * asked for. If @ocd == NULL, use default parameters. */
+       int (*o_connect)(const struct lu_env *env,
+                        struct obd_export **exp, struct obd_device *src,
+                        struct obd_uuid *cluuid, struct obd_connect_data *ocd,
+                        void *localdata);
+       int (*o_reconnect)(const struct lu_env *env,
+                          struct obd_export *exp, struct obd_device *src,
+                          struct obd_uuid *cluuid,
+                          struct obd_connect_data *ocd,
+                          void *localdata);
+       int (*o_disconnect)(struct obd_export *exp);
+
+       /* Initialize/finalize fids infrastructure. */
+       int (*o_fid_init)(struct obd_device *obd,
+                         struct obd_export *exp, enum lu_cli_type type);
+       int (*o_fid_fini)(struct obd_device *obd);
+
+       /* Allocate a new fid according to the hint in @op_data. */
+       int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid,
+                          struct md_op_data *op_data);
+
+       int (*o_statfs)(const struct lu_env *, struct obd_export *exp,
+                       struct obd_statfs *osfs, __u64 max_age, __u32 flags);
+       int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo,
+                             __u64 max_age, struct ptlrpc_request_set *set);
+       int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
+                       struct lov_stripe_md *mem_src);
+       int (*o_unpackmd)(struct obd_export *exp, struct lov_stripe_md **mem_tgt,
+                         struct lov_mds_md *disk_src, int disk_len);
+       int (*o_preallocate)(struct lustre_handle *, obd_count *req,
+                            obd_id *ids);
+       /* FIXME: add fid capability support for create & destroy! */
+       int (*o_precreate)(struct obd_export *exp);
+       int (*o_create)(const struct lu_env *env, struct obd_export *exp,
+                       struct obdo *oa, struct lov_stripe_md **ea,
+                       struct obd_trans_info *oti);
+       int (*o_create_async)(struct obd_export *exp,  struct obd_info *oinfo,
+                             struct lov_stripe_md **ea,
+                             struct obd_trans_info *oti);
+       int (*o_destroy)(const struct lu_env *env, struct obd_export *exp,
+                        struct obdo *oa, struct lov_stripe_md *ea,
+                        struct obd_trans_info *oti, struct obd_export *md_exp,
+                        void *capa);
+       int (*o_setattr)(const struct lu_env *, struct obd_export *exp,
+                        struct obd_info *oinfo, struct obd_trans_info *oti);
+       int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+                              struct obd_trans_info *oti,
+                              struct ptlrpc_request_set *rqset);
+       int (*o_getattr)(const struct lu_env *env, struct obd_export *exp,
+                        struct obd_info *oinfo);
+       int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo,
+                              struct ptlrpc_request_set *set);
+       int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pgarr,
+                    struct obd_trans_info *oti);
+       int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          struct ost_lvb *lvb, int kms_only);
+       int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
+                           obd_off size, int shrink);
+       int (*o_punch)(const struct lu_env *, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti,
+                      struct ptlrpc_request_set *rqset);
+       int (*o_sync)(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_info *oinfo, obd_size start, obd_size end,
+                     struct ptlrpc_request_set *set);
+       int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst,
+                        struct lov_stripe_md *src, obd_size start,
+                        obd_size end, struct obd_trans_info *oti);
+       int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst,
+                     struct lustre_handle *srconn, struct lov_stripe_md *src,
+                     obd_size start, obd_size end, struct obd_trans_info *);
+       int (*o_iterate)(struct lustre_handle *conn,
+                        int (*)(obd_id, obd_seq, void *),
+                        obd_id *startid, obd_seq seq, void *data);
+       int (*o_preprw)(const struct lu_env *env, int cmd,
+                       struct obd_export *exp, struct obdo *oa, int objcount,
+                       struct obd_ioobj *obj, struct niobuf_remote *remote,
+                       int *nr_pages, struct niobuf_local *local,
+                       struct obd_trans_info *oti, struct lustre_capa *capa);
+       int (*o_commitrw)(const struct lu_env *env, int cmd,
+                         struct obd_export *exp, struct obdo *oa,
+                         int objcount, struct obd_ioobj *obj,
+                         struct niobuf_remote *remote, int pages,
+                         struct niobuf_local *local,
+                         struct obd_trans_info *oti, int rc);
+       int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct ptlrpc_request_set *rqset);
+       int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
+                              ldlm_iterator_t it, void *data);
+       int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *,
+                            ldlm_iterator_t it, void *data);
+       int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
+                       __u32 mode, struct lustre_handle *);
+       int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *,
+                              ldlm_cancel_flags_t flags, void *opaque);
+       int (*o_init_export)(struct obd_export *exp);
+       int (*o_destroy_export)(struct obd_export *exp);
+       int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *,
+                            int cmd, obd_off *);
+
+       /* llog related obd_methods */
+       int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp,
+                          struct obd_device *disk_obd, int *idx);
+       int (*o_llog_finish)(struct obd_device *obd, int count);
+       int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
+
+       /* metadata-only methods */
+       int (*o_pin)(struct obd_export *, const struct lu_fid *fid,
+                    struct obd_capa *, struct obd_client_handle *, int flag);
+       int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int);
+
+       int (*o_import_event)(struct obd_device *, struct obd_import *,
+                             enum obd_import_event);
+
+       int (*o_notify)(struct obd_device *obd, struct obd_device *watched,
+                       enum obd_notify_event ev, void *data);
+
+       int (*o_health_check)(const struct lu_env *env, struct obd_device *);
+       struct obd_uuid *(*o_get_uuid) (struct obd_export *exp);
+
+       /* quota methods */
+       int (*o_quotacheck)(struct obd_device *, struct obd_export *,
+                           struct obd_quotactl *);
+       int (*o_quotactl)(struct obd_device *, struct obd_export *,
+                         struct obd_quotactl *);
+
+       int (*o_ping)(const struct lu_env *, struct obd_export *exp);
+
+       /* pools methods */
+       int (*o_pool_new)(struct obd_device *obd, char *poolname);
+       int (*o_pool_del)(struct obd_device *obd, char *poolname);
+       int (*o_pool_add)(struct obd_device *obd, char *poolname,
+                         char *ostname);
+       int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+                         char *ostname);
+       void (*o_getref)(struct obd_device *obd);
+       void (*o_putref)(struct obd_device *obd);
+       /*
+        * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
+        * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
+        * Also, add a wrapper function in include/linux/obd_class.h. */
+};
+
+enum {
+       LUSTRE_OPC_MKDIR    = (1 << 0),
+       LUSTRE_OPC_SYMLINK  = (1 << 1),
+       LUSTRE_OPC_MKNOD    = (1 << 2),
+       LUSTRE_OPC_CREATE   = (1 << 3),
+       LUSTRE_OPC_ANY      = (1 << 4)
+};
+
+/* lmv structures */
+#define MEA_MAGIC_LAST_CHAR      0xb2221ca1
+#define MEA_MAGIC_ALL_CHARS      0xb222a11c
+#define MEA_MAGIC_HASH_SEGMENT   0xb222a11b
+
+#define MAX_HASH_SIZE_32        0x7fffffffUL
+#define MAX_HASH_SIZE      0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000ULL
+
+struct lustre_md {
+       struct mdt_body  *body;
+       struct lov_stripe_md    *lsm;
+       struct lmv_stripe_md    *mea;
+#ifdef CONFIG_FS_POSIX_ACL
+       struct posix_acl        *posix_acl;
+#endif
+       struct mdt_remote_perm  *remote_perm;
+       struct obd_capa  *mds_capa;
+       struct obd_capa  *oss_capa;
+};
+
+struct md_open_data {
+       struct obd_client_handle *mod_och;
+       struct ptlrpc_request    *mod_open_req;
+       struct ptlrpc_request    *mod_close_req;
+       atomic_t              mod_refcount;
+};
+
+struct lookup_intent;
+
+struct md_ops {
+       int (*m_getstatus)(struct obd_export *, struct lu_fid *,
+                          struct obd_capa **);
+       int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
+       int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *,
+                            ldlm_iterator_t, void *);
+       int (*m_close)(struct obd_export *, struct md_op_data *,
+                      struct md_open_data *, struct ptlrpc_request **);
+       int (*m_create)(struct obd_export *, struct md_op_data *,
+                       const void *, int, int, __u32, __u32, cfs_cap_t,
+                       __u64, struct ptlrpc_request **);
+       int (*m_done_writing)(struct obd_export *, struct md_op_data  *,
+                             struct md_open_data *);
+       int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *,
+                        struct lookup_intent *, struct md_op_data *,
+                        struct lustre_handle *, void *, int,
+                        struct ptlrpc_request **, __u64);
+       int (*m_getattr)(struct obd_export *, struct md_op_data *,
+                        struct ptlrpc_request **);
+       int (*m_getattr_name)(struct obd_export *, struct md_op_data *,
+                             struct ptlrpc_request **);
+       int (*m_intent_lock)(struct obd_export *, struct md_op_data *,
+                            void *, int, struct lookup_intent *, int,
+                            struct ptlrpc_request **,
+                            ldlm_blocking_callback, __u64);
+       int (*m_link)(struct obd_export *, struct md_op_data *,
+                     struct ptlrpc_request **);
+       int (*m_rename)(struct obd_export *, struct md_op_data *,
+                       const char *, int, const char *, int,
+                       struct ptlrpc_request **);
+       int (*m_is_subdir)(struct obd_export *, const struct lu_fid *,
+                          const struct lu_fid *,
+                          struct ptlrpc_request **);
+       int (*m_setattr)(struct obd_export *, struct md_op_data *, void *,
+                        int , void *, int, struct ptlrpc_request **,
+                        struct md_open_data **mod);
+       int (*m_sync)(struct obd_export *, const struct lu_fid *,
+                     struct obd_capa *, struct ptlrpc_request **);
+       int (*m_readpage)(struct obd_export *, struct md_op_data *,
+                         struct page **, struct ptlrpc_request **);
+
+       int (*m_unlink)(struct obd_export *, struct md_op_data *,
+                       struct ptlrpc_request **);
+
+       int (*m_setxattr)(struct obd_export *, const struct lu_fid *,
+                         struct obd_capa *, obd_valid, const char *,
+                         const char *, int, int, int, __u32,
+                         struct ptlrpc_request **);
+
+       int (*m_getxattr)(struct obd_export *, const struct lu_fid *,
+                         struct obd_capa *, obd_valid, const char *,
+                         const char *, int, int, int,
+                         struct ptlrpc_request **);
+
+       int (*m_init_ea_size)(struct obd_export *, int, int, int);
+
+       int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *,
+                              struct obd_export *, struct obd_export *,
+                              struct lustre_md *);
+
+       int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *);
+
+       int (*m_set_open_replay_data)(struct obd_export *,
+                                     struct obd_client_handle *,
+                                     struct ptlrpc_request *);
+       int (*m_clear_open_replay_data)(struct obd_export *,
+                                       struct obd_client_handle *);
+       int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *);
+
+       ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64,
+                                   const struct lu_fid *, ldlm_type_t,
+                                   ldlm_policy_data_t *, ldlm_mode_t,
+                                   struct lustre_handle *);
+
+       int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *,
+                              ldlm_policy_data_t *, ldlm_mode_t,
+                              ldlm_cancel_flags_t flags, void *opaque);
+       int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc,
+                           renew_capa_cb_t cb);
+       int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *,
+                            const struct req_msg_field *, struct obd_capa **);
+
+       int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *,
+                                struct obd_capa *, __u32,
+                                struct ptlrpc_request **);
+
+       int (*m_intent_getattr_async)(struct obd_export *,
+                                     struct md_enqueue_info *,
+                                     struct ldlm_enqueue_info *);
+
+       int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *,
+                                struct lu_fid *, __u64 *bits);
+
+       /*
+        * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to
+        * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a
+        * wrapper function in include/linux/obd_class.h.
+        */
+};
+
+struct lsm_operations {
+       void (*lsm_free)(struct lov_stripe_md *);
+       int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+                          struct obd_export *md_exp);
+       void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+                                   obd_off *);
+       void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+                                    obd_off *);
+       int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+                              __u16 *stripe_count);
+       int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+                            struct lov_mds_md *lmm);
+};
+
+extern const struct lsm_operations lsm_v1_ops;
+extern const struct lsm_operations lsm_v3_ops;
+static inline const struct lsm_operations *lsm_op_find(int magic)
+{
+       switch (magic) {
+       case LOV_MAGIC_V1:
+              return &lsm_v1_ops;
+       case LOV_MAGIC_V3:
+              return &lsm_v3_ops;
+       default:
+              CERROR("Cannot recognize lsm_magic %08x\n", magic);
+              return NULL;
+       }
+}
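+
+/*
+ * Editor's illustrative sketch, not part of the original header:
+ * dispatching an on-disk lov_mds_md through the per-magic operations
+ * table.  Assumes lmm_magic has already been swabbed to host order.
+ */
+static inline int example_lmm_verify(struct lov_mds_md *lmm, int lmm_bytes,
+                                     __u16 *stripe_count)
+{
+       const struct lsm_operations *op = lsm_op_find(lmm->lmm_magic);
+
+       if (op == NULL)
+               return -EINVAL; /* unknown magic, already CERROR()ed */
+
+       return op->lsm_lmm_verify(lmm, lmm_bytes, stripe_count);
+}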
+
+/* Requests for obd_extent_calc() */
+#define OBD_CALC_STRIPE_START   1
+#define OBD_CALC_STRIPE_END     2
+
+static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)
+{
+       return oinfo->oi_capa;
+}
+
+static inline struct md_open_data *obd_mod_alloc(void)
+{
+       struct md_open_data *mod;
+       OBD_ALLOC_PTR(mod);
+       if (mod == NULL)
+               return NULL;
+       atomic_set(&mod->mod_refcount, 1);
+       return mod;
+}
+
+#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount)
+#define obd_mod_put(mod)                                       \
+({                                                           \
+       if (atomic_dec_and_test(&(mod)->mod_refcount)) {          \
+               if ((mod)->mod_open_req)                          \
+                       ptlrpc_req_finished((mod)->mod_open_req);   \
+               OBD_FREE_PTR(mod);                            \
+       }                                                      \
+})
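+
+/*
+ * Editor's illustrative sketch, not part of the original header: the
+ * intended md_open_data reference cycle.  Allocation returns one
+ * reference; each extra user pairs obd_mod_get() with obd_mod_put(),
+ * and the embedded open request is released on the last put.
+ */
+static inline struct md_open_data *example_mod_cycle(void)
+{
+       struct md_open_data *mod = obd_mod_alloc();
+
+       if (mod == NULL)
+               return NULL;
+
+       obd_mod_get(mod);       /* second user, e.g. a pending close */
+       obd_mod_put(mod);       /* ... which eventually drops it */
+
+       return mod;             /* caller still owns the initial ref */
+}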
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent);
+
+/* return 1 if the client should resend the request */
+static inline int client_should_resend(int resend, struct client_obd *cli)
+{
+       return atomic_read(&cli->cl_resends) ?
+              atomic_read(&cli->cl_resends) > resend : 1;
+}
+
+/**
+ * Return the device name for this device.
+ *
+ * XXX: lu_device is declared before obd_device, and lu_device holds a
+ * pointer back to obd_device, so this helper is defined here instead
+ * of in lu_object.h.
+ */
+static inline const char *lu_dev_name(const struct lu_device *lu_dev)
+{
+       return lu_dev->ld_obd->obd_name;
+}
+
+static inline bool filename_is_volatile(const char *name, int namelen, int *idx)
+{
+       const char      *start;
+       char            *end;
+
+       if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0)
+               return false;
+
+       /* caller does not care about idx */
+       if (idx == NULL)
+               return true;
+
+       /* Volatile file: the MDT can be set from the name.
+        * The name format is LUSTRE_VOLATILE_HDR:[idx]:
+        * If no MDT is specified, use the standard way. */
+       if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2)
+               goto bad_format;
+       /* test for no MDT idx case */
+       if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') &&
+           (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) {
+               *idx = -1;
+               return true;
+       }
+       /* we have an idx, read it */
+       start = name + LUSTRE_VOLATILE_HDR_LEN + 1;
+       *idx = strtoul(start, &end, 0);
+       /* error cases:
+        * no digit, no trailing :, negative value
+        */
+       if (((*idx == 0) && (end == start)) ||
+           (*end != ':') || (*idx < 0))
+               goto bad_format;
+
+       return true;
+bad_format:
+       /* bad format of the MDT idx; we cannot return an error
+        * to the caller, so fall back to the hash algorithm */
+       CERROR("Bad volatile file name format: %s\n",
+              name + LUSTRE_VOLATILE_HDR_LEN);
+       return false;
+}
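+
+/*
+ * Editor's illustrative sketch, not part of the original header:
+ * expected results for the volatile-name format parsed above, writing
+ * <HDR> for LUSTRE_VOLATILE_HDR (the sample names are assumptions):
+ *
+ *   "<HDR>::"    -> true,  *idx == -1  (no MDT specified)
+ *   "<HDR>:3:"   -> true,  *idx == 3
+ *   "<HDR>:x:"   -> false  (bad format, hash algorithm used instead)
+ *   "plain.txt"  -> false  (not a volatile name at all)
+ */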
+
+static inline int cli_brw_size(struct obd_device *obd)
+{
+       LASSERT(obd != NULL);
+       return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+}
+
+#endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h
new file mode 100644 (file)
index 0000000..c8249fb
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h
new file mode 100644 (file)
index 0000000..5f740f1
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __OBD_CKSUM
+#define __OBD_CKSUM
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+
+static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type)
+{
+       switch (cksum_type) {
+       case OBD_CKSUM_CRC32:
+               return CFS_HASH_ALG_CRC32;
+       case OBD_CKSUM_ADLER:
+               return CFS_HASH_ALG_ADLER32;
+       case OBD_CKSUM_CRC32C:
+               return CFS_HASH_ALG_CRC32C;
+       default:
+               CERROR("Unknown checksum type (%x)!!!\n", cksum_type);
+               LBUG();
+       }
+       return 0;
+}
+
+/* The OBD_FL_CKSUM_* flags are packed into 5 bits of o_flags, since there can
+ * only be a single checksum type per RPC.
+ *
+ * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask
+ * since they need to represent the full range of checksum algorithms that
+ * both the client and server can understand.
+ *
+ * In case of unsupported types/flags we fall back to ADLER, because it
+ * has been supported by all clients since 1.8.
+ *
+ * In case multiple algorithms are supported, the best one is used. */
+static inline obd_flag cksum_type_pack(cksum_type_t cksum_type)
+{
+       unsigned int    performance = 0, tmp;
+       obd_flag        flag = OBD_FL_CKSUM_ADLER;
+
+       if (cksum_type & OBD_CKSUM_CRC32) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_CRC32;
+               }
+       }
+       if (cksum_type & OBD_CKSUM_CRC32C) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_CRC32C;
+               }
+       }
+       if (cksum_type & OBD_CKSUM_ADLER) {
+               tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER));
+               if (tmp > performance) {
+                       performance = tmp;
+                       flag = OBD_FL_CKSUM_ADLER;
+               }
+       }
+       if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C |
+                                                  OBD_CKSUM_CRC32 |
+                                                  OBD_CKSUM_ADLER))))
+               CWARN("unknown cksum type %x\n", cksum_type);
+
+       return flag;
+}
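+
+/*
+ * Editor's illustrative sketch, not part of the original header: how a
+ * sender might stamp an RPC with the packed flag.  The obdo usage and
+ * OBD_MD_FLFLAGS bit shown here are assumptions for illustration:
+ *
+ *     oa->o_valid |= OBD_MD_FLFLAGS;
+ *     oa->o_flags |= cksum_type_pack(cksum_type);
+ */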
+
+static inline cksum_type_t cksum_type_unpack(obd_flag o_flags)
+{
+       switch (o_flags & OBD_FL_CKSUM_ALL) {
+       case OBD_FL_CKSUM_CRC32C:
+               return OBD_CKSUM_CRC32C;
+       case OBD_FL_CKSUM_CRC32:
+               return OBD_CKSUM_CRC32;
+       default:
+               break;
+       }
+
+       return OBD_CKSUM_ADLER;
+}
+
+/* Return a bitmask of the checksum types supported on this system.
+ * ADLER is the baseline: it has been supported since 1.8 and does not
+ * depend on hardware. The client uses all locally available algorithms.
+ */
+static inline cksum_type_t cksum_types_supported_client(void)
+{
+       cksum_type_t ret = OBD_CKSUM_ADLER;
+
+       CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0)
+               ret |= OBD_CKSUM_CRC32C;
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0)
+               ret |= OBD_CKSUM_CRC32;
+
+       return ret;
+}
+
+/* The server uses algorithms that perform at 50% or better of the Adler speed */
+static inline cksum_type_t cksum_types_supported_server(void)
+{
+       int          base_speed;
+       cksum_type_t    ret = OBD_CKSUM_ADLER;
+
+       CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n",
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)),
+              cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)));
+
+       base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2;
+
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >=
+           base_speed)
+               ret |= OBD_CKSUM_CRC32C;
+       if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >=
+           base_speed)
+               ret |= OBD_CKSUM_CRC32;
+
+       return ret;
+}
+
+
+/* Select the best checksum algorithm among those supplied in the cksum_types
+ * input.
+ *
+ * Currently, calling cksum_type_pack() with a mask will return the fastest
+ * checksum type due to its benchmarking at libcfs module load.
+ * Caution is advised, however, since what is fastest on a single client may
+ * not be the fastest or most efficient algorithm on the server.  */
+static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types)
+{
+       return cksum_type_unpack(cksum_type_pack(cksum_types));
+}
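+
+/*
+ * Editor's illustrative sketch, not part of the original header: a
+ * client would typically intersect its local mask with the types the
+ * server advertised (e.g. in ocd_cksum_types) and let
+ * cksum_type_select() pick the fastest of what remains.  The function
+ * name and fallback policy are assumptions.
+ */
+static inline cksum_type_t example_negotiate_cksum(cksum_type_t server_types)
+{
+       cksum_type_t common = cksum_types_supported_client() & server_types;
+
+       if (common == 0)
+               common = OBD_CKSUM_ADLER;       /* universal fallback */
+
+       return cksum_type_select(common);
+}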
+
+/* Checksum algorithm names. Must be defined in the same order as the
+ * OBD_CKSUM_* flags. */
+#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"}
+
+#endif /* __OBD_CKSUM */
diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h
new file mode 100644 (file)
index 0000000..de5c585
--- /dev/null
@@ -0,0 +1,2281 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef __CLASS_OBD_H
+#define __CLASS_OBD_H
+
+
+#include <obd_support.h>
+#include <lustre_import.h>
+#include <lustre_net.h>
+#include <obd.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_class.h>
+
+#define OBD_STATFS_NODELAY      0x0001  /* requests should be sent without delay
+                                        * and without resends, to avoid deadlocks */
+#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
+                                        * obd_osfs_age */
+#define OBD_STATFS_PTLRPCD      0x0004  /* requests will be sent via ptlrpcd
+                                        * instead of a specific set. This
+                                        * means that we cannot rely on the set
+                                        * interpret routine to be called.
+                                        * lov_statfs_fini() must thus be called
+                                        * by the request interpret routine */
+#define OBD_STATFS_FOR_MDT0    0x0008  /* The statfs is only for retrieving
+                                        * information from MDT0. */
+#define OBD_FL_PUNCH    0x00000001      /* indicates a punch operation */
+
+/* OBD Device Declarations */
+extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
+extern rwlock_t obd_dev_lock;
+
+/* OBD Operations Declarations */
+extern struct obd_device *class_conn2obd(struct lustre_handle *);
+extern struct obd_device *class_exp2obd(struct obd_export *);
+extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
+extern int lustre_get_jobid(char *jobid);
+
+struct lu_device_type;
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *, struct md_ops *,
+                       struct lprocfs_vars *, const char *nm,
+                       struct lu_device_type *ldt);
+int class_unregister_type(const char *nm);
+
+struct obd_device *class_newdev(const char *type_name, const char *name);
+void class_release_dev(struct obd_device *obd);
+
+int class_name2dev(const char *name);
+struct obd_device *class_name2obd(const char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+void class_obd_list(void);
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+                                         const char * typ_name,
+                                         struct obd_uuid *grp_uuid);
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
+                                          int *next);
+struct obd_device * class_num2obd(int num);
+int get_devices_count(void);
+
+int class_notify_sptlrpc_conf(const char *fsname, int namelen);
+
+char *obd_export_nid2str(struct obd_export *exp);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);
+
+int obd_zombie_impexp_init(void);
+void obd_zombie_impexp_stop(void);
+void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
+int kuc_len(int payload_len);
+struct kuc_hdr * kuc_ptr(void *p);
+int kuc_ispayload(void *p);
+void *kuc_alloc(int payload_len, int transport, int type);
+void kuc_free(void *p, int payload_len);
+
+struct llog_handle;
+struct llog_rec_hdr;
+typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
+                        struct llog_rec_hdr *, void *);
+/* obd_config.c */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+                                    const char *new_name);
+int class_process_config(struct lustre_cfg *lcfg);
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+                            struct lustre_cfg *lcfg, void *data);
+int class_attach(struct lustre_cfg *lcfg);
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd,
+                               const char *scope, const void *source);
+void class_decref(struct obd_device *obd,
+                 const char *scope, const void *source);
+void dump_exports(struct obd_device *obd, int locks);
+int class_config_llog_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data);
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
+int class_add_uuid(const char *uuid, __u64 nid);
+
+/* obdecho */
+#ifdef LPROCFS
+extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+#define CFG_F_START     0x01   /* Set when we start updating from a log */
+#define CFG_F_MARKER    0x02   /* We are within a marker */
+#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
+#define CFG_F_COMPAT146 0x08   /* Allow old-style logs */
+#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */
+
+/* Passed as data param to class_config_parse_llog */
+struct config_llog_instance {
+       char           *cfg_obdname;
+       void           *cfg_instance;
+       struct super_block *cfg_sb;
+       struct obd_uuid     cfg_uuid;
+       llog_cb_t           cfg_callback;
+       int              cfg_last_idx; /* for partial llog processing */
+       int              cfg_flags;
+};
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                           char *name, struct config_llog_instance *cfg);
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          char *name, struct config_llog_instance *cfg);
+
+enum {
+       CONFIG_T_CONFIG  = 0,
+       CONFIG_T_SPTLRPC = 1,
+       CONFIG_T_RECOVER = 2,
+       CONFIG_T_MAX     = 3
+};
+
+/* list of active configuration logs  */
+struct config_llog_data {
+       struct ldlm_res_id        cld_resid;
+       struct config_llog_instance cld_cfg;
+       struct list_head                  cld_list_chain;
+       atomic_t                cld_refcount;
+       struct config_llog_data    *cld_sptlrpc; /* dependent sptlrpc log */
+       struct config_llog_data    *cld_recover;    /* imperative recover log */
+       struct obd_export         *cld_mgcexp;
+       struct mutex                cld_lock;
+       int                      cld_type;
+       unsigned int            cld_stopping:1, /* we were told to stop
+                                                    * watching */
+                                   cld_lostlock:1; /* lock not requeued */
+       char                    cld_logname[0];
+};
+
+struct lustre_profile {
+       struct list_head       lp_list;
+       char        *lp_profile;
+       char        *lp_dt;
+       char        *lp_md;
+};
+
+struct lustre_profile *class_get_profile(const char * prof);
+void class_del_profile(const char *prof);
+void class_del_profiles(void);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *);
+void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *);
+extern void (*class_export_dump_hook)(struct obd_export *);
+
+#else
+
+#define __class_export_add_lock_ref(exp, lock)      do {} while(0)
+#define __class_export_del_lock_ref(exp, lock)      do {} while(0)
+
+#endif
+
+#define class_export_rpc_inc(exp)                                     \
+({                                                                   \
+       atomic_inc(&(exp)->exp_rpc_count);                        \
+       CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n",    \
+              (exp), atomic_read(&(exp)->exp_rpc_count));        \
+})
+
+#define class_export_rpc_dec(exp)                                     \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_rpc_count);                        \
+       atomic_dec(&(exp)->exp_rpc_count);                        \
+       CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n",    \
+              (exp), atomic_read(&(exp)->exp_rpc_count));        \
+})
+
+#define class_export_lock_get(exp, lock)                               \
+({                                                                   \
+       atomic_inc(&(exp)->exp_locks_count);                    \
+       __class_export_add_lock_ref(exp, lock);                  \
+       CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \
+              (exp), atomic_read(&(exp)->exp_locks_count));    \
+       class_export_get(exp);                                    \
+})
+
+#define class_export_lock_put(exp, lock)                               \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_locks_count);                    \
+       atomic_dec(&(exp)->exp_locks_count);                    \
+       __class_export_del_lock_ref(exp, lock);                  \
+       CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \
+              (exp), atomic_read(&(exp)->exp_locks_count));    \
+       class_export_put(exp);                                    \
+})
+
+#define class_export_cb_get(exp)                                       \
+({                                                                   \
+       atomic_inc(&(exp)->exp_cb_count);                          \
+       CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\
+              (exp), atomic_read(&(exp)->exp_cb_count));          \
+       class_export_get(exp);                                    \
+})
+
+#define class_export_cb_put(exp)                                       \
+({                                                                   \
+       LASSERT_ATOMIC_POS(&exp->exp_cb_count);                  \
+       atomic_dec(&(exp)->exp_cb_count);                          \
+       CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\
+              (exp), atomic_read(&(exp)->exp_cb_count));          \
+       class_export_put(exp);                                    \
+})
+
+/* genops.c */
+struct obd_export *class_export_get(struct obd_export *exp);
+void class_export_put(struct obd_export *exp);
+struct obd_export *class_new_export(struct obd_device *obddev,
+                                   struct obd_uuid *cluuid);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(struct obd_device *obd);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_search_type(const char *name);
+struct obd_type *class_get_type(const char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+                 struct obd_uuid *cluuid);
+int class_disconnect(struct obd_export *exp);
+void class_fail_export(struct obd_export *exp);
+int class_connected_export(struct obd_export *exp);
+void class_disconnect_exports(struct obd_device *obddev);
+int class_manual_cleanup(struct obd_device *obd);
+void class_disconnect_stale_exports(struct obd_device *,
+                                   int (*test_export)(struct obd_export *));
+static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
+{
+       return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) |
+               (obd->obd_force ? OBD_OPT_FORCE : 0) |
+               (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) |
+               0);
+}
+
+
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid);
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj);
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr,
+                    unsigned int ia_valid);
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid);
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid);
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+                 unsigned int valid);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo);
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo);
+
+#define OBT(dev)       (dev)->obd_type
+#define OBP(dev, op)    (dev)->obd_type->typ_dt_ops->o_ ## op
+#define MDP(dev, op)    (dev)->obd_type->typ_md_ops->m_ ## op
+#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op
+
+/* Ensure obd_setup: used by cleanup paths, which must still work
+   while the obd is stopping */
+#define OBD_CHECK_DEV(obd)                                   \
+do {                                                       \
+       if (!(obd)) {                                      \
+               CERROR("NULL device\n");                        \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+} while (0)
+
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd)                             \
+do {                                                       \
+       OBD_CHECK_DEV(obd);                                  \
+       if (!(obd)->obd_set_up || (obd)->obd_stopping) {        \
+               CERROR("Device %d not setup\n",          \
+                      (obd)->obd_minor);                      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+} while (0)
+
+
+#ifdef LPROCFS
+#define OBD_COUNTER_OFFSET(op)                           \
+       ((offsetof(struct obd_ops, o_ ## op) -            \
+         offsetof(struct obd_ops, o_iocontrol))                \
+        / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+#define OBD_COUNTER_INCREMENT(obdx, op)                           \
+       if ((obdx)->obd_stats != NULL) {                          \
+               unsigned int coffset;                        \
+               coffset = (unsigned int)((obdx)->obd_cntr_base) + \
+                       OBD_COUNTER_OFFSET(op);            \
+               LASSERT(coffset < (obdx)->obd_stats->ls_num);     \
+               lprocfs_counter_incr((obdx)->obd_stats, coffset); \
+       }
+
+#define EXP_COUNTER_INCREMENT(export, op)                                  \
+       if ((export)->exp_obd->obd_stats != NULL) {                       \
+               unsigned int coffset;                                   \
+               coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \
+                       OBD_COUNTER_OFFSET(op);                       \
+               LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num);     \
+               lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \
+               if ((export)->exp_nid_stats != NULL &&                 \
+                   (export)->exp_nid_stats->nid_stats != NULL)       \
+                       lprocfs_counter_incr(                           \
+                               (export)->exp_nid_stats->nid_stats, coffset);\
+       }
+
+#define MD_COUNTER_OFFSET(op)                             \
+       ((offsetof(struct md_ops, m_ ## op) -              \
+         offsetof(struct md_ops, m_getstatus))          \
+        / sizeof(((struct md_ops *)(0))->m_getstatus))
+
+#define MD_COUNTER_INCREMENT(obdx, op)                    \
+       if ((obdx)->md_stats != NULL) {                    \
+               unsigned int coffset;                       \
+               coffset = (unsigned int)((obdx)->md_cntr_base) + \
+                       MD_COUNTER_OFFSET(op);             \
+               LASSERT(coffset < (obdx)->md_stats->ls_num);     \
+               lprocfs_counter_incr((obdx)->md_stats, coffset); \
+       }
+
+#define EXP_MD_COUNTER_INCREMENT(export, op)                            \
+       if ((export)->exp_obd->md_stats != NULL) {                        \
+               unsigned int coffset;                                   \
+               coffset = (unsigned int)((export)->exp_obd->md_cntr_base) +  \
+                       MD_COUNTER_OFFSET(op);                         \
+               LASSERT(coffset < (export)->exp_obd->md_stats->ls_num);      \
+               lprocfs_counter_incr((export)->exp_obd->md_stats, coffset);  \
+               if ((export)->exp_md_stats != NULL)                       \
+                       lprocfs_counter_incr(                           \
+                               (export)->exp_md_stats, coffset);           \
+       }
+
+#else
+#define OBD_COUNTER_OFFSET(op)
+#define OBD_COUNTER_INCREMENT(obd, op)
+#define EXP_COUNTER_INCREMENT(exp, op)
+#define MD_COUNTER_INCREMENT(obd, op)
+#define EXP_MD_COUNTER_INCREMENT(exp, op)
+#endif
+
+static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp)
+{
+       /* Always add in ldlm_stats */
+       tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+                                                 LPROCFS_STATS_FLAG_NOPERCPU);
+       if (tmp->nid_ldlm_stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats);
+
+       return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats",
+                                     tmp->nid_ldlm_stats);
+}
+
+#define OBD_CHECK_MD_OP(obd, op, err)                     \
+do {                                                       \
+       if (!OBT(obd) || !MDP((obd), op)) {                  \
+               if (err)                                        \
+                       CERROR("md_" #op ": dev %s/%d no operation\n", \
+                              obd->obd_name, obd->obd_minor);  \
+               RETURN(err);                                \
+       }                                                      \
+} while (0)
+
+#define EXP_CHECK_MD_OP(exp, op)                               \
+do {                                                       \
+       if ((exp) == NULL) {                                \
+               CERROR("obd_" #op ": NULL export\n");      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+       if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+               CERROR("obd_" #op ": cleaned up obd\n");        \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+       if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \
+               CERROR("obd_" #op ": dev %s/%d no operation\n", \
+                      (exp)->exp_obd->obd_name,                \
+                      (exp)->exp_obd->obd_minor);            \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+} while (0)
+
+
+#define OBD_CHECK_DT_OP(obd, op, err)                     \
+do {                                                       \
+       if (!OBT(obd) || !OBP((obd), op)) {                  \
+               if (err)                                        \
+                       CERROR("obd_" #op ": dev %d no operation\n",    \
+                              obd->obd_minor);          \
+               RETURN(err);                                \
+       }                                                      \
+} while (0)
+
+#define EXP_CHECK_DT_OP(exp, op)                               \
+do {                                                       \
+       if ((exp) == NULL) {                                \
+               CERROR("obd_" #op ": NULL export\n");      \
+               RETURN(-ENODEV);                                \
+       }                                                      \
+       if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) {   \
+               CERROR("obd_" #op ": cleaned up obd\n");        \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+       if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \
+               CERROR("obd_" #op ": dev %d no operation\n",    \
+                      (exp)->exp_obd->obd_minor);            \
+               RETURN(-EOPNOTSUPP);                        \
+       }                                                      \
+} while (0)
+
+#define CTXT_CHECK_OP(ctxt, op, err)                            \
+do {                                                            \
+       if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) {             \
+               if (err)                                             \
+                       CERROR("lop_" #op ": dev %d no operation\n", \
+                              ctxt->loc_obd->obd_minor);           \
+               RETURN(err);                                     \
+       }                                                           \
+} while (0)
+
+static inline int class_devno_max(void)
+{
+       return MAX_OBD_DEVICES;
+}
+
+static inline int obd_get_info(const struct lu_env *env,
+                              struct obd_export *exp, __u32 keylen,
+                              void *key, __u32 *vallen, void *val,
+                              struct lov_stripe_md *lsm)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, get_info);
+       EXP_COUNTER_INCREMENT(exp, get_info);
+
+       rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val,
+                                        lsm);
+       RETURN(rc);
+}
+
+static inline int obd_set_info_async(const struct lu_env *env,
+                                    struct obd_export *exp, obd_count keylen,
+                                    void *key, obd_count vallen, void *val,
+                                    struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, set_info_async);
+       EXP_COUNTER_INCREMENT(exp, set_info_async);
+
+       rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen,
+                                              val, set);
+       RETURN(rc);
+}
+
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into new lu_device-based layering, but some
+ * pieces of configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call exactly one
+ * of the two.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
+#define DECLARE_LU_VARS(ldt, d)                 \
+       struct lu_device_type *ldt;       \
+       struct lu_device *d
+
+static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       ldt = obd->obd_type->typ_lu;
+       if (ldt != NULL) {
+               struct lu_context  session_ctx;
+               struct lu_env env;
+               lu_context_init(&session_ctx, LCT_SESSION);
+               session_ctx.lc_thread = NULL;
+               lu_context_enter(&session_ctx);
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       env.le_ses = &session_ctx;
+                       d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
+                       lu_env_fini(&env);
+                       if (!IS_ERR(d)) {
+                               obd->obd_lu_dev = d;
+                               d->ld_obd = obd;
+                               rc = 0;
+                       } else
+                               rc = PTR_ERR(d);
+               }
+               lu_context_exit(&session_ctx);
+               lu_context_fini(&session_ctx);
+
+       } else {
+               OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
+               OBD_COUNTER_INCREMENT(obd, setup);
+               rc = OBP(obd, setup)(obd, cfg);
+       }
+       RETURN(rc);
+}
+
+static inline int obd_precleanup(struct obd_device *obd,
+                                enum obd_cleanup_stage cleanup_stage)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
+                       struct lu_env env;
+
+                       rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+                       if (rc == 0) {
+                               ldt->ldt_ops->ldto_device_fini(&env, d);
+                               lu_env_fini(&env);
+                       }
+               }
+       }
+       OBD_CHECK_DT_OP(obd, precleanup, 0);
+       OBD_COUNTER_INCREMENT(obd, precleanup);
+
+       rc = OBP(obd, precleanup)(obd, cleanup_stage);
+       RETURN(rc);
+}
+
+static inline int obd_cleanup(struct obd_device *obd)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               struct lu_env env;
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       ldt->ldt_ops->ldto_device_free(&env, d);
+                       lu_env_fini(&env);
+                       obd->obd_lu_dev = NULL;
+               }
+       }
+       OBD_CHECK_DT_OP(obd, cleanup, 0);
+       OBD_COUNTER_INCREMENT(obd, cleanup);
+
+       rc = OBP(obd, cleanup)(obd);
+       RETURN(rc);
+}
+
+static inline void obd_cleanup_client_import(struct obd_device *obd)
+{
+       ENTRY;
+
+       /* If we set up but never connected, the
+          client import will not have been cleaned. */
+       down_write(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import) {
+               struct obd_import *imp;
+               imp = obd->u.cli.cl_import;
+               CDEBUG(D_CONFIG, "%s: client import never connected\n",
+                      obd->obd_name);
+               ptlrpc_invalidate_import(imp);
+               if (imp->imp_rq_pool) {
+                       ptlrpc_free_rq_pool(imp->imp_rq_pool);
+                       imp->imp_rq_pool = NULL;
+               }
+               client_destroy_import(imp);
+               obd->u.cli.cl_import = NULL;
+       }
+       up_write(&obd->u.cli.cl_sem);
+
+       EXIT;
+}
+
+static inline int
+obd_process_config(struct obd_device *obd, int datalen, void *data)
+{
+       int rc;
+       DECLARE_LU_VARS(ldt, d);
+       ENTRY;
+
+       OBD_CHECK_DEV(obd);
+
+       obd->obd_process_conf = 1;
+       ldt = obd->obd_type->typ_lu;
+       d = obd->obd_lu_dev;
+       if (ldt != NULL && d != NULL) {
+               struct lu_env env;
+
+               rc = lu_env_init(&env, ldt->ldt_ctx_tags);
+               if (rc == 0) {
+                       rc = d->ld_ops->ldo_process_config(&env, d, data);
+                       lu_env_fini(&env);
+               }
+       } else {
+               OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
+               rc = OBP(obd, process_config)(obd, datalen, data);
+       }
+       OBD_COUNTER_INCREMENT(obd, process_config);
+       obd->obd_process_conf = 0;
+
+       RETURN(rc);
+}
+
+/* Pack an in-memory MD struct for storage on disk.
+ * Returns +ve size of packed MD (0 for free), or -ve error.
+ *
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated.
+ */
+static inline int obd_packmd(struct obd_export *exp,
+                            struct lov_mds_md **disk_tgt,
+                            struct lov_stripe_md *mem_src)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, packmd);
+       EXP_COUNTER_INCREMENT(exp, packmd);
+
+       rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src);
+       RETURN(rc);
+}
+
+static inline int obd_size_diskmd(struct obd_export *exp,
+                                 struct lov_stripe_md *mem_src)
+{
+       return obd_packmd(exp, NULL, mem_src);
+}
+
+/* helper functions */
+static inline int obd_alloc_diskmd(struct obd_export *exp,
+                                  struct lov_mds_md **disk_tgt)
+{
+       LASSERT(disk_tgt);
+       LASSERT(*disk_tgt == NULL);
+       return obd_packmd(exp, disk_tgt, NULL);
+}
+
+static inline int obd_free_diskmd(struct obd_export *exp,
+                                 struct lov_mds_md **disk_tgt)
+{
+       LASSERT(disk_tgt);
+       LASSERT(*disk_tgt);
+       /*
+        * LU-2590: for the caller's convenience, *disk_tgt may be in host
+        * endianness, so swab it to little-endian if necessary.  Only the
+        * lov_mds_md header needs swabbing here, to figure out how much
+        * memory has to be freed.
+        */
+       if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+           (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) ||
+            ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3)))
+               lustre_swab_lov_mds_md(*disk_tgt);
+       return obd_packmd(exp, disk_tgt, NULL);
+}
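+
+/*
+ * Illustrative round trip (a sketch, not part of this patch; "exp" and
+ * "lsm" are assumed caller variables):
+ *
+ *     struct lov_mds_md *lmm = NULL;
+ *     int lmm_size = obd_size_diskmd(exp, lsm); // size query only
+ *     int rc = obd_alloc_diskmd(exp, &lmm);     // allocate max-size MD
+ *     if (rc > 0)
+ *             rc = obd_free_diskmd(exp, &lmm);  // release the buffer
+ */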
+
+/* Unpack an MD struct from disk to in-memory format.
+ * Returns +ve size of unpacked MD (0 for free), or -ve error.
+ *
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
+ * If @*mem_tgt == NULL, it will be allocated.
+ */
+static inline int obd_unpackmd(struct obd_export *exp,
+                              struct lov_stripe_md **mem_tgt,
+                              struct lov_mds_md *disk_src,
+                              int disk_len)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, unpackmd);
+       EXP_COUNTER_INCREMENT(exp, unpackmd);
+
+       rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len);
+       RETURN(rc);
+}
+
+/* helper functions */
+static inline int obd_alloc_memmd(struct obd_export *exp,
+                                 struct lov_stripe_md **mem_tgt)
+{
+       LASSERT(mem_tgt);
+       LASSERT(*mem_tgt == NULL);
+       return obd_unpackmd(exp, mem_tgt, NULL, 0);
+}
+
+static inline int obd_free_memmd(struct obd_export *exp,
+                                struct lov_stripe_md **mem_tgt)
+{
+       int rc;
+
+       LASSERT(mem_tgt);
+       LASSERT(*mem_tgt);
+       rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+       *mem_tgt = NULL;
+       return rc;
+}
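+
+/*
+ * Illustrative pairing (a sketch; "exp" is an assumed caller variable):
+ * obd_alloc_memmd() builds a maximally-sized in-memory MD, and
+ * obd_free_memmd() releases it and resets the pointer:
+ *
+ *     struct lov_stripe_md *lsm = NULL;
+ *     int rc = obd_alloc_memmd(exp, &lsm);    // rc > 0 on success
+ *     if (rc > 0)
+ *             rc = obd_free_memmd(exp, &lsm); // lsm is NULL afterwards
+ */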
+
+static inline int obd_precreate(struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, precreate);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, precreate);
+
+       rc = OBP(exp->exp_obd, precreate)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_create_async(struct obd_export *exp,
+                                  struct obd_info *oinfo,
+                                  struct lov_stripe_md **ea,
+                                  struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, create_async);
+       EXP_COUNTER_INCREMENT(exp, create_async);
+
+       rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+       RETURN(rc);
+}
+
+static inline int obd_create(const struct lu_env *env, struct obd_export *exp,
+                            struct obdo *obdo, struct lov_stripe_md **ea,
+                            struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, create);
+       EXP_COUNTER_INCREMENT(exp, create);
+
+       rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti);
+       RETURN(rc);
+}
+
+static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp,
+                             struct obdo *obdo, struct lov_stripe_md *ea,
+                             struct obd_trans_info *oti,
+                             struct obd_export *md_exp, void *capa)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, destroy);
+       EXP_COUNTER_INCREMENT(exp, destroy);
+
+       rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa);
+       RETURN(rc);
+}
+
+static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp,
+                             struct obd_info *oinfo)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, getattr);
+       EXP_COUNTER_INCREMENT(exp, getattr);
+
+       rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo);
+       RETURN(rc);
+}
+
+static inline int obd_getattr_async(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, getattr_async);
+       EXP_COUNTER_INCREMENT(exp, getattr_async);
+
+       rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set);
+       RETURN(rc);
+}
+
+static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp,
+                             struct obd_info *oinfo,
+                             struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr);
+       EXP_COUNTER_INCREMENT(exp, setattr);
+
+       rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti);
+       RETURN(rc);
+}
+
+/* This performs all the request set init/wait/destroy actions. */
+static inline int obd_setattr_rqset(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct obd_trans_info *oti)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr_async);
+       EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+/* This adds all the requests into @set if @set != NULL; otherwise
+   all requests are sent asynchronously without waiting for a response. */
+static inline int obd_setattr_async(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct obd_trans_info *oti,
+                                   struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, setattr_async);
+       EXP_COUNTER_INCREMENT(exp, setattr_async);
+
+       rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set);
+       RETURN(rc);
+}
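+
+/*
+ * Illustrative contrast (a sketch; exp, oinfo1, oinfo2 and oti are
+ * assumed caller variables): obd_setattr_rqset() above is the
+ * synchronous form, while obd_setattr_async() lets the caller batch
+ * several requests into one set and wait once:
+ *
+ *     struct ptlrpc_request_set *set = ptlrpc_prep_set();
+ *     if (set == NULL)
+ *             return -ENOMEM;
+ *     rc = obd_setattr_async(exp, oinfo1, oti, set);
+ *     if (rc == 0)
+ *             rc = obd_setattr_async(exp, oinfo2, oti, set);
+ *     if (rc == 0)
+ *             rc = ptlrpc_set_wait(set);
+ *     ptlrpc_set_destroy(set);
+ */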
+
+static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                              int priority)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, add_conn);
+
+       rc = OBP(obd, add_conn)(imp, uuid, priority);
+       RETURN(rc);
+}
+
+static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, del_conn);
+
+       rc = OBP(obd, del_conn)(imp, uuid);
+       RETURN(rc);
+}
+
+static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp)
+{
+       struct obd_uuid *uuid;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL);
+       EXP_COUNTER_INCREMENT(exp, get_uuid);
+
+       uuid = OBP(exp->exp_obd, get_uuid)(exp);
+       RETURN(uuid);
+}
+
+/** Create a new \a exp on device \a obd for the uuid \a cluuid.
+ * @param exp New export handle.
+ * @param data Connect data; supported flags are set, and the flags also
+ *    understood by the obd are returned.
+ */
+static inline int obd_connect(const struct lu_env *env,
+                             struct obd_export **exp,struct obd_device *obd,
+                             struct obd_uuid *cluuid,
+                             struct obd_connect_data *data,
+                             void *localdata)
+{
+       int rc;
+       /* for post-condition check */
+       __u64 ocf = data ? data->ocd_connect_flags : 0;
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, connect);
+
+       rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata);
+       /* check that only subset is granted */
+       LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) ==
+                                   data->ocd_connect_flags));
+       RETURN(rc);
+}
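+
+/*
+ * Post-condition sketch (illustrative; exp/obd/cluuid/data are assumed
+ * caller variables): the target may only mask requested connect flags
+ * off, never add new ones, so a caller can probe what was granted:
+ *
+ *     data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION;
+ *     rc = obd_connect(env, &exp, obd, cluuid, data, NULL);
+ *     if (rc == 0 && !(data->ocd_connect_flags & OBD_CONNECT_GRANT))
+ *             CDEBUG(D_HA, "server refused the grant feature\n");
+ */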
+
+static inline int obd_reconnect(const struct lu_env *env,
+                               struct obd_export *exp,
+                               struct obd_device *obd,
+                               struct obd_uuid *cluuid,
+                               struct obd_connect_data *d,
+                               void *localdata)
+{
+       int rc;
+       /* for post-condition check */
+       __u64 ocf = d ? d->ocd_connect_flags : 0;
+
+       ENTRY;
+
+       OBD_CHECK_DEV_ACTIVE(obd);
+       OBD_CHECK_DT_OP(obd, reconnect, 0);
+       OBD_COUNTER_INCREMENT(obd, reconnect);
+
+       rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata);
+       /* check that only subset is granted */
+       LASSERT(ergo(d != NULL,
+                    (d->ocd_connect_flags & ocf) == d->ocd_connect_flags));
+       RETURN(rc);
+}
+
+static inline int obd_disconnect(struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, disconnect);
+       EXP_COUNTER_INCREMENT(exp, disconnect);
+
+       rc = OBP(exp->exp_obd, disconnect)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp,
+                              enum lu_cli_type type)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, fid_init, 0);
+       OBD_COUNTER_INCREMENT(obd, fid_init);
+
+       rc = OBP(obd, fid_init)(obd, exp, type);
+       RETURN(rc);
+}
+
+static inline int obd_fid_fini(struct obd_device *obd)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, fid_fini, 0);
+       OBD_COUNTER_INCREMENT(obd, fid_fini);
+
+       rc = OBP(obd, fid_fini)(obd);
+       RETURN(rc);
+}
+
+static inline int obd_fid_alloc(struct obd_export *exp,
+                               struct lu_fid *fid,
+                               struct md_op_data *op_data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, fid_alloc);
+       EXP_COUNTER_INCREMENT(exp, fid_alloc);
+
+       rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data);
+       RETURN(rc);
+}
+
+static inline int obd_ping(const struct lu_env *env, struct obd_export *exp)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, ping, 0);
+       EXP_COUNTER_INCREMENT(exp, ping);
+
+       rc = OBP(exp->exp_obd, ping)(env, exp);
+       RETURN(rc);
+}
+
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_new);
+
+       rc = OBP(obd, pool_new)(obd, poolname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_del);
+
+       rc = OBP(obd, pool_del)(obd, poolname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname,
+                              char *ostname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_add);
+
+       rc = OBP(obd, pool_add)(obd, poolname, ostname);
+       RETURN(rc);
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname,
+                              char *ostname)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+       rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+       RETURN(rc);
+}
+
+static inline void obd_getref(struct obd_device *obd)
+{
+       ENTRY;
+       if (OBT(obd) && OBP(obd, getref)) {
+               OBD_COUNTER_INCREMENT(obd, getref);
+               OBP(obd, getref)(obd);
+       }
+       EXIT;
+}
+
+static inline void obd_putref(struct obd_device *obd)
+{
+       ENTRY;
+       if (OBT(obd) && OBP(obd, putref)) {
+               OBD_COUNTER_INCREMENT(obd, putref);
+               OBP(obd, putref)(obd);
+       }
+       EXIT;
+}
+
+static inline int obd_init_export(struct obd_export *exp)
+{
+       int rc = 0;
+
+       ENTRY;
+       if (exp->exp_obd != NULL && OBT(exp->exp_obd) &&
+           OBP(exp->exp_obd, init_export))
+               rc = OBP(exp->exp_obd, init_export)(exp);
+       RETURN(rc);
+}
+
+static inline int obd_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+       if (exp->exp_obd != NULL && OBT(exp->exp_obd) &&
+           OBP(exp->exp_obd, destroy_export))
+               OBP(exp->exp_obd, destroy_export)(exp);
+       RETURN(0);
+}
+
+static inline int obd_extent_calc(struct obd_export *exp,
+                                 struct lov_stripe_md *md,
+                                 int cmd, obd_off *offset)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_DT_OP(exp, extent_calc);
+       rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset);
+       RETURN(rc);
+}
+
+static inline struct dentry *
+obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen)
+{
+       struct lvfs_run_ctxt *ctxt = &exp->exp_obd->obd_lvfs_ctxt;
+       LASSERT(exp->exp_obd);
+
+       return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi),
+                                        exp->exp_obd);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs_async(struct obd_export *exp,
+                                  struct obd_info *oinfo,
+                                  __u64 max_age,
+                                  struct ptlrpc_request_set *rqset)
+{
+       int rc = 0;
+       struct obd_device *obd;
+       ENTRY;
+
+       if (exp == NULL || exp->exp_obd == NULL)
+               RETURN(-EINVAL);
+
+       obd = exp->exp_obd;
+       OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, statfs);
+
+       CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n",
+              obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age);
+       if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+               rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset);
+       } else {
+               CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64
+                      " objects "LPU64"/"LPU64"\n",
+                      obd->obd_name, &obd->obd_osfs,
+                      obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+                      obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs));
+               spin_unlock(&obd->obd_osfs_lock);
+               oinfo->oi_flags |= OBD_STATFS_FROM_CACHE;
+               if (oinfo->oi_cb_up)
+                       oinfo->oi_cb_up(oinfo, 0);
+       }
+       RETURN(rc);
+}
+
+static inline int obd_statfs_rqset(struct obd_export *exp,
+                                  struct obd_statfs *osfs, __u64 max_age,
+                                  __u32 flags)
+{
+       struct ptlrpc_request_set *set = NULL;
+       struct obd_info oinfo = { { { 0 } } };
+       int rc = 0;
+       ENTRY;
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       oinfo.oi_osfs = osfs;
+       oinfo.oi_flags = flags;
+       rc = obd_statfs_async(exp, &oinfo, max_age, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+/* @max_age is the oldest time in jiffies that we accept using cached data.
+ * If the cache is older than @max_age we will get a new value from the
+ * target.  Use a value of "cfs_time_current() + HZ" to guarantee freshness. */
+static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp,
+                            struct obd_statfs *osfs, __u64 max_age,
+                            __u32 flags)
+{
+       int rc = 0;
+       struct obd_device *obd = exp->exp_obd;
+       ENTRY;
+
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP);
+       OBD_COUNTER_INCREMENT(obd, statfs);
+
+       CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n",
+              obd->obd_osfs_age, max_age);
+       if (cfs_time_before_64(obd->obd_osfs_age, max_age)) {
+               rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags);
+               if (rc == 0) {
+                       spin_lock(&obd->obd_osfs_lock);
+                       memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs));
+                       obd->obd_osfs_age = cfs_time_current_64();
+                       spin_unlock(&obd->obd_osfs_lock);
+               }
+       } else {
+               CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64
+                      " objects "LPU64"/"LPU64"\n",
+                      obd->obd_name, &obd->obd_osfs,
+                      obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks,
+                      obd->obd_osfs.os_ffree, obd->obd_osfs.os_files);
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
+               spin_unlock(&obd->obd_osfs_lock);
+       }
+       RETURN(rc);
+}
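+
+/*
+ * Caller sketch (illustrative): cfs_time_current_64() + HZ is always
+ * newer than any cached obd_osfs_age, so it forces a fresh statfs RPC;
+ * a max_age further in the past accepts correspondingly older cached
+ * data.
+ *
+ *     struct obd_statfs osfs;
+ *     rc = obd_statfs(env, exp, &osfs, cfs_time_current_64() + HZ, 0);
+ */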
+
+static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo,
+                                obd_size start, obd_size end)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+       EXP_COUNTER_INCREMENT(exp, sync);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_sync(const struct lu_env *env, struct obd_export *exp,
+                          struct obd_info *oinfo, obd_size start, obd_size end,
+                          struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP);
+       EXP_COUNTER_INCREMENT(exp, sync);
+
+       rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set);
+       RETURN(rc);
+}
+
+static inline int obd_punch_rqset(struct obd_export *exp,
+                                 struct obd_info *oinfo,
+                                 struct obd_trans_info *oti)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, punch);
+       EXP_COUNTER_INCREMENT(exp, punch);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_punch(const struct lu_env *env, struct obd_export *exp,
+                           struct obd_info *oinfo, struct obd_trans_info *oti,
+                           struct ptlrpc_request_set *rqset)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, punch);
+       EXP_COUNTER_INCREMENT(exp, punch);
+
+       rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset);
+       RETURN(rc);
+}
+
+static inline int obd_brw(int cmd, struct obd_export *exp,
+                         struct obd_info *oinfo, obd_count oa_bufs,
+                         struct brw_page *pg, struct obd_trans_info *oti)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, brw);
+       EXP_COUNTER_INCREMENT(exp, brw);
+
+       if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+               CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+                      "or OBD_BRW_CHECK\n");
+               LBUG();
+       }
+
+       rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti);
+       RETURN(rc);
+}
+
+static inline int obd_preprw(const struct lu_env *env, int cmd,
+                            struct obd_export *exp, struct obdo *oa,
+                            int objcount, struct obd_ioobj *obj,
+                            struct niobuf_remote *remote, int *pages,
+                            struct niobuf_local *local,
+                            struct obd_trans_info *oti,
+                            struct lustre_capa *capa)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, preprw);
+       EXP_COUNTER_INCREMENT(exp, preprw);
+
+       rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote,
+                                      pages, local, oti, capa);
+       RETURN(rc);
+}
+
+static inline int obd_commitrw(const struct lu_env *env, int cmd,
+                              struct obd_export *exp, struct obdo *oa,
+                              int objcount, struct obd_ioobj *obj,
+                              struct niobuf_remote *rnb, int pages,
+                              struct niobuf_local *local,
+                              struct obd_trans_info *oti, int rc)
+{
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, commitrw);
+       EXP_COUNTER_INCREMENT(exp, commitrw);
+
+       rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj,
+                                        rnb, pages, local, oti, rc);
+       RETURN(rc);
+}
+
+static inline int obd_merge_lvb(struct obd_export *exp,
+                               struct lov_stripe_md *lsm,
+                               struct ost_lvb *lvb, int kms_only)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, merge_lvb);
+       EXP_COUNTER_INCREMENT(exp, merge_lvb);
+
+       rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only);
+       RETURN(rc);
+}
+
+static inline int obd_adjust_kms(struct obd_export *exp,
+                                struct lov_stripe_md *lsm, obd_off size,
+                                int shrink)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, adjust_kms);
+       EXP_COUNTER_INCREMENT(exp, adjust_kms);
+
+       rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink);
+       RETURN(rc);
+}
+
+static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp,
+                               int len, void *karg, void *uarg)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, iocontrol);
+       EXP_COUNTER_INCREMENT(exp, iocontrol);
+
+       rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg);
+       RETURN(rc);
+}
+
+static inline int obd_enqueue_rqset(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct ldlm_enqueue_info *einfo)
+{
+       struct ptlrpc_request_set *set = NULL;
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, enqueue);
+       EXP_COUNTER_INCREMENT(exp, enqueue);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+       RETURN(rc);
+}
+
+static inline int obd_enqueue(struct obd_export *exp,
+                             struct obd_info *oinfo,
+                             struct ldlm_enqueue_info *einfo,
+                             struct ptlrpc_request_set *set)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, enqueue);
+       EXP_COUNTER_INCREMENT(exp, enqueue);
+
+       rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set);
+       RETURN(rc);
+}
+
+static inline int obd_change_cbdata(struct obd_export *exp,
+                                   struct lov_stripe_md *lsm,
+                                   ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, change_cbdata);
+       EXP_COUNTER_INCREMENT(exp, change_cbdata);
+
+       rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data);
+       RETURN(rc);
+}
+
+static inline int obd_find_cbdata(struct obd_export *exp,
+                                 struct lov_stripe_md *lsm,
+                                 ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, find_cbdata);
+       EXP_COUNTER_INCREMENT(exp, find_cbdata);
+
+       rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data);
+       RETURN(rc);
+}
+
+static inline int obd_cancel(struct obd_export *exp,
+                            struct lov_stripe_md *ea, __u32 mode,
+                            struct lustre_handle *lockh)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, cancel);
+       EXP_COUNTER_INCREMENT(exp, cancel);
+
+       rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh);
+       RETURN(rc);
+}
+
+static inline int obd_cancel_unused(struct obd_export *exp,
+                                   struct lov_stripe_md *ea,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, cancel_unused);
+       EXP_COUNTER_INCREMENT(exp, cancel_unused);
+
+       rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque);
+       RETURN(rc);
+}
+
+static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid,
+                         struct obd_capa *oc, struct obd_client_handle *handle,
+                         int flag)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, pin);
+       EXP_COUNTER_INCREMENT(exp, pin);
+
+       rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag);
+       RETURN(rc);
+}
+
+static inline int obd_unpin(struct obd_export *exp,
+                           struct obd_client_handle *handle, int flag)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, unpin);
+       EXP_COUNTER_INCREMENT(exp, unpin);
+
+       rc = OBP(exp->exp_obd, unpin)(exp, handle, flag);
+       RETURN(rc);
+}
+
+static inline void obd_import_event(struct obd_device *obd,
+                                   struct obd_import *imp,
+                                   enum obd_import_event event)
+{
+       ENTRY;
+       if (!obd) {
+               CERROR("NULL device\n");
+               EXIT;
+               return;
+       }
+       if (obd->obd_set_up && OBP(obd, import_event)) {
+               OBD_COUNTER_INCREMENT(obd, import_event);
+               OBP(obd, import_event)(obd, imp, event);
+       }
+       EXIT;
+}
+
+static inline int obd_llog_connect(struct obd_export *exp,
+                                  struct llogd_conn_body *body)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0);
+       EXP_COUNTER_INCREMENT(exp, llog_connect);
+
+       rc = OBP(exp->exp_obd, llog_connect)(exp, body);
+       RETURN(rc);
+}
+
+static inline int obd_notify(struct obd_device *obd,
+                            struct obd_device *watched,
+                            enum obd_notify_event ev,
+                            void *data)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DEV(obd);
+
+       /* the check for async_recov is a complete hack - I'm hereby
+          overloading the meaning to also mean "this was called from
+          mds_postsetup".  I know that my mds is able to handle notifies
+          by this point, and it needs to get them to execute mds_postrecov. */
+       if (!obd->obd_set_up && !obd->obd_async_recov) {
+               CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       if (!OBP(obd, notify)) {
+               CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name);
+               RETURN(-ENOSYS);
+       }
+
+       OBD_COUNTER_INCREMENT(obd, notify);
+       rc = OBP(obd, notify)(obd, watched, ev, data);
+       RETURN(rc);
+}
+
+static inline int obd_notify_observer(struct obd_device *observer,
+                                     struct obd_device *observed,
+                                     enum obd_notify_event ev,
+                                     void *data)
+{
+       int rc1;
+       int rc2;
+
+       struct obd_notify_upcall *onu;
+
+       if (observer->obd_observer)
+               rc1 = obd_notify(observer->obd_observer, observed, ev, data);
+       else
+               rc1 = 0;
+       /*
+        * Also, call non-obd listener, if any
+        */
+       onu = &observer->obd_upcall;
+       if (onu->onu_upcall != NULL)
+               rc2 = onu->onu_upcall(observer, observed, ev,
+                                     onu->onu_owner, NULL);
+       else
+               rc2 = 0;
+
+       return rc1 ? rc1 : rc2;
+}
+
+static inline int obd_quotacheck(struct obd_export *exp,
+                                struct obd_quotactl *oqctl)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, quotacheck);
+       EXP_COUNTER_INCREMENT(exp, quotacheck);
+
+       rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl);
+       RETURN(rc);
+}
+
+static inline int obd_quotactl(struct obd_export *exp,
+                              struct obd_quotactl *oqctl)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_DT_OP(exp, quotactl);
+       EXP_COUNTER_INCREMENT(exp, quotactl);
+
+       rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl);
+       RETURN(rc);
+}
+
+static inline int obd_health_check(const struct lu_env *env,
+                                  struct obd_device *obd)
+{
+       /* returns: 0 on healthy
+        *       >0 on unhealthy + reason code/flag
+        *          however the only supported reason == 1 right now;
+        *          we'll need to define better reasons or flags in
+        *          the future.
+        *       <0 on error
+        */
+       int rc;
+       ENTRY;
+
+       /* don't use EXP_CHECK_DT_OP, because a NULL method is normal here */
+       if (obd == NULL || !OBT(obd)) {
+               CERROR("cleaned up obd\n");
+               RETURN(-EOPNOTSUPP);
+       }
+       if (!obd->obd_set_up || obd->obd_stopping)
+               RETURN(0);
+       if (!OBP(obd, health_check))
+               RETURN(0);
+
+       rc = OBP(obd, health_check)(env, obd);
+       RETURN(rc);
+}
+
+static inline int obd_register_observer(struct obd_device *obd,
+                                       struct obd_device *observer)
+{
+       ENTRY;
+       OBD_CHECK_DEV(obd);
+       down_write(&obd->obd_observer_link_sem);
+       if (obd->obd_observer && observer) {
+               up_write(&obd->obd_observer_link_sem);
+               RETURN(-EALREADY);
+       }
+       obd->obd_observer = observer;
+       up_write(&obd->obd_observer_link_sem);
+       RETURN(0);
+}
+
+static inline int obd_pin_observer(struct obd_device *obd,
+                                  struct obd_device **observer)
+{
+       ENTRY;
+       down_read(&obd->obd_observer_link_sem);
+       if (!obd->obd_observer) {
+               *observer = NULL;
+               up_read(&obd->obd_observer_link_sem);
+               RETURN(-ENOENT);
+       }
+       *observer = obd->obd_observer;
+       RETURN(0);
+}
+
+static inline int obd_unpin_observer(struct obd_device *obd)
+{
+       ENTRY;
+       up_read(&obd->obd_observer_link_sem);
+       RETURN(0);
+}
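+
+/*
+ * Illustrative pairing (a sketch): obd_pin_observer() returns with
+ * obd_observer_link_sem read-held on success, so every successful pin
+ * must be matched by an obd_unpin_observer() call:
+ *
+ *     struct obd_device *obs;
+ *
+ *     if (obd_pin_observer(obd, &obs) == 0) {
+ *             // ... use obs while the link is pinned ...
+ *             obd_unpin_observer(obd);
+ *     }
+ */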
+
+#if 0
+static inline int obd_register_page_removal_cb(struct obd_export *exp,
+                                              obd_page_removal_cb_t cb,
+                                              obd_pin_extent_cb pin_cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb);
+
+       rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb);
+       RETURN(rc);
+}
+
+static inline int obd_unregister_page_removal_cb(struct obd_export *exp,
+                                                obd_page_removal_cb_t cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb);
+
+       rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb);
+       RETURN(rc);
+}
+
+static inline int obd_register_lock_cancel_cb(struct obd_export *exp,
+                                             obd_lock_cancel_cb cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb);
+
+       rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb);
+       RETURN(rc);
+}
+
+static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
+                                                obd_lock_cancel_cb cb)
+{
+       int rc;
+       ENTRY;
+
+       OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0);
+       OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb);
+
+       rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
+       RETURN(rc);
+}
+#endif
+
+/* metadata helpers */
+static inline int md_getstatus(struct obd_export *exp,
+                              struct lu_fid *fid, struct obd_capa **pc)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_MD_OP(exp, getstatus);
+       EXP_MD_COUNTER_INCREMENT(exp, getstatus);
+       rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc);
+       RETURN(rc);
+}
+
+static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data,
+                            struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getattr);
+       EXP_MD_COUNTER_INCREMENT(exp, getattr);
+       rc = MDP(exp->exp_obd, getattr)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_null_inode(struct obd_export *exp,
+                                  const struct lu_fid *fid)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, null_inode);
+       EXP_MD_COUNTER_INCREMENT(exp, null_inode);
+       rc = MDP(exp->exp_obd, null_inode)(exp, fid);
+       RETURN(rc);
+}
+
+static inline int md_find_cbdata(struct obd_export *exp,
+                                const struct lu_fid *fid,
+                                ldlm_iterator_t it, void *data)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, find_cbdata);
+       EXP_MD_COUNTER_INCREMENT(exp, find_cbdata);
+       rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data);
+       RETURN(rc);
+}
+
+static inline int md_close(struct obd_export *exp, struct md_op_data *op_data,
+                          struct md_open_data *mod,
+                          struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, close);
+       EXP_MD_COUNTER_INCREMENT(exp, close);
+       rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request);
+       RETURN(rc);
+}
+
+static inline int md_create(struct obd_export *exp, struct md_op_data *op_data,
+                           const void *data, int datalen, int mode, __u32 uid,
+                           __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+                           struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, create);
+       EXP_MD_COUNTER_INCREMENT(exp, create);
+       rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode,
+                                      uid, gid, cap_effective, rdev, request);
+       RETURN(rc);
+}
+
+static inline int md_done_writing(struct obd_export *exp,
+                                 struct md_op_data *op_data,
+                                 struct md_open_data *mod)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, done_writing);
+       EXP_MD_COUNTER_INCREMENT(exp, done_writing);
+       rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod);
+       RETURN(rc);
+}
+
+static inline int md_enqueue(struct obd_export *exp,
+                            struct ldlm_enqueue_info *einfo,
+                            struct lookup_intent *it,
+                            struct md_op_data *op_data,
+                            struct lustre_handle *lockh,
+                            void *lmm, int lmmsize,
+                            struct ptlrpc_request **req,
+                            int extra_lock_flags)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, enqueue);
+       EXP_MD_COUNTER_INCREMENT(exp, enqueue);
+       rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh,
+                                       lmm, lmmsize, req, extra_lock_flags);
+       RETURN(rc);
+}
+
+static inline int md_getattr_name(struct obd_export *exp,
+                                 struct md_op_data *op_data,
+                                 struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getattr_name);
+       EXP_MD_COUNTER_INCREMENT(exp, getattr_name);
+       rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_intent_lock(struct obd_export *exp,
+                                struct md_op_data *op_data, void *lmm,
+                                int lmmsize, struct lookup_intent *it,
+                                int lookup_flags, struct ptlrpc_request **reqp,
+                                ldlm_blocking_callback cb_blocking,
+                                __u64 extra_lock_flags)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, intent_lock);
+       EXP_MD_COUNTER_INCREMENT(exp, intent_lock);
+       rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize,
+                                           it, lookup_flags, reqp, cb_blocking,
+                                           extra_lock_flags);
+       RETURN(rc);
+}
+
+static inline int md_link(struct obd_export *exp, struct md_op_data *op_data,
+                         struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, link);
+       EXP_MD_COUNTER_INCREMENT(exp, link);
+       rc = MDP(exp->exp_obd, link)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data,
+                           const char *old, int oldlen, const char *new,
+                           int newlen, struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, rename);
+       EXP_MD_COUNTER_INCREMENT(exp, rename);
+       rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new,
+                                      newlen, request);
+       RETURN(rc);
+}
+
+static inline int md_is_subdir(struct obd_export *exp,
+                              const struct lu_fid *pfid,
+                              const struct lu_fid *cfid,
+                              struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, is_subdir);
+       EXP_MD_COUNTER_INCREMENT(exp, is_subdir);
+       rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request);
+       RETURN(rc);
+}
+
+static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data,
+                            void *ea, int ealen, void *ea2, int ea2len,
+                            struct ptlrpc_request **request,
+                            struct md_open_data **mod)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, setattr);
+       EXP_MD_COUNTER_INCREMENT(exp, setattr);
+       rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen,
+                                       ea2, ea2len, request, mod);
+       RETURN(rc);
+}
+
+static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid,
+                         struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, sync);
+       EXP_MD_COUNTER_INCREMENT(exp, sync);
+       rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request);
+       RETURN(rc);
+}
+
+static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata,
+                             struct page **pages,
+                             struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, readpage);
+       EXP_MD_COUNTER_INCREMENT(exp, readpage);
+       rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request);
+       RETURN(rc);
+}
+
+static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data,
+                           struct ptlrpc_request **request)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, unlink);
+       EXP_MD_COUNTER_INCREMENT(exp, unlink);
+       rc = MDP(exp->exp_obd, unlink)(exp, op_data, request);
+       RETURN(rc);
+}
+
+static inline int md_get_lustre_md(struct obd_export *exp,
+                                  struct ptlrpc_request *req,
+                                  struct obd_export *dt_exp,
+                                  struct obd_export *md_exp,
+                                  struct lustre_md *md)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, get_lustre_md);
+       EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md);
+       RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md));
+}
+
+static inline int md_free_lustre_md(struct obd_export *exp,
+                                   struct lustre_md *md)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, free_lustre_md);
+       EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md);
+       RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md));
+}
+
+static inline int md_setxattr(struct obd_export *exp,
+                             const struct lu_fid *fid, struct obd_capa *oc,
+                             obd_valid valid, const char *name,
+                             const char *input, int input_size,
+                             int output_size, int flags, __u32 suppgid,
+                             struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, setxattr);
+       EXP_MD_COUNTER_INCREMENT(exp, setxattr);
+       RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input,
+                                          input_size, output_size, flags,
+                                          suppgid, request));
+}
+
+static inline int md_getxattr(struct obd_export *exp,
+                             const struct lu_fid *fid, struct obd_capa *oc,
+                             obd_valid valid, const char *name,
+                             const char *input, int input_size,
+                             int output_size, int flags,
+                             struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, getxattr);
+       EXP_MD_COUNTER_INCREMENT(exp, getxattr);
+       RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input,
+                                          input_size, output_size, flags,
+                                          request));
+}
+
+static inline int md_set_open_replay_data(struct obd_export *exp,
+                                         struct obd_client_handle *och,
+                                         struct ptlrpc_request *open_req)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, set_open_replay_data);
+       EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data);
+       RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req));
+}
+
+static inline int md_clear_open_replay_data(struct obd_export *exp,
+                                           struct obd_client_handle *och)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, clear_open_replay_data);
+       EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data);
+       RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och));
+}
+
+static inline int md_set_lock_data(struct obd_export *exp,
+                                  __u64 *lockh, void *data, __u64 *bits)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, set_lock_data);
+       EXP_MD_COUNTER_INCREMENT(exp, set_lock_data);
+       RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits));
+}
+
+static inline int md_cancel_unused(struct obd_export *exp,
+                                  const struct lu_fid *fid,
+                                  ldlm_policy_data_t *policy,
+                                  ldlm_mode_t mode,
+                                  ldlm_cancel_flags_t flags,
+                                  void *opaque)
+{
+       int rc;
+       ENTRY;
+
+       EXP_CHECK_MD_OP(exp, cancel_unused);
+       EXP_MD_COUNTER_INCREMENT(exp, cancel_unused);
+
+       rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode,
+                                             flags, opaque);
+       RETURN(rc);
+}
+
+static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags,
+                                       const struct lu_fid *fid,
+                                       ldlm_type_t type,
+                                       ldlm_policy_data_t *policy,
+                                       ldlm_mode_t mode,
+                                       struct lustre_handle *lockh)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, lock_match);
+       EXP_MD_COUNTER_INCREMENT(exp, lock_match);
+       RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type,
+                                            policy, mode, lockh));
+}
+
+static inline int md_init_ea_size(struct obd_export *exp, int easize,
+                                 int def_asize, int cookiesize)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, init_ea_size);
+       EXP_MD_COUNTER_INCREMENT(exp, init_ea_size);
+       RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize,
+                                              cookiesize));
+}
+
+static inline int md_get_remote_perm(struct obd_export *exp,
+                                    const struct lu_fid *fid,
+                                    struct obd_capa *oc, __u32 suppgid,
+                                    struct ptlrpc_request **request)
+{
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, get_remote_perm);
+       EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm);
+       RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid,
+                                                 request));
+}
+
+static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa,
+                               renew_capa_cb_t cb)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, renew_capa);
+       EXP_MD_COUNTER_INCREMENT(exp, renew_capa);
+       rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb);
+       RETURN(rc);
+}
+
+static inline int md_unpack_capa(struct obd_export *exp,
+                                struct ptlrpc_request *req,
+                                const struct req_msg_field *field,
+                                struct obd_capa **oc)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, unpack_capa);
+       EXP_MD_COUNTER_INCREMENT(exp, unpack_capa);
+       rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc);
+       RETURN(rc);
+}
+
+static inline int md_intent_getattr_async(struct obd_export *exp,
+                                         struct md_enqueue_info *minfo,
+                                         struct ldlm_enqueue_info *einfo)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, intent_getattr_async);
+       EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async);
+       rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo);
+       RETURN(rc);
+}
+
+static inline int md_revalidate_lock(struct obd_export *exp,
+                                    struct lookup_intent *it,
+                                    struct lu_fid *fid, __u64 *bits)
+{
+       int rc;
+       ENTRY;
+       EXP_CHECK_MD_OP(exp, revalidate_lock);
+       EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock);
+       rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits);
+       RETURN(rc);
+}
+
+/* OBD Metadata Support */
+
+extern int obd_init_caches(void);
+extern void obd_cleanup_caches(void);
+
+/* support routines */
+extern struct kmem_cache *obdo_cachep;
+
+#define OBDO_ALLOC(ptr)                                                \
+do {                                                                   \
+       OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO);          \
+} while (0)
+
+#define OBDO_FREE(ptr)                                                 \
+do {                                                                   \
+       OBD_SLAB_FREE_PTR((ptr), obdo_cachep);                         \
+} while (0)
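+
+/*
+ * Usage sketch (illustrative): OBD_SLAB_ALLOC_PTR_GFP() leaves the
+ * pointer NULL on allocation failure, so the pair is used as:
+ *
+ *     struct obdo *oa;
+ *
+ *     OBDO_ALLOC(oa);
+ *     if (oa == NULL)
+ *             return -ENOMEM;
+ *     // ... fill in and use oa ...
+ *     OBDO_FREE(oa);
+ */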
+
+static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid)
+{
+       /* something here */
+}
+
+static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa)
+{
+       /* something here */
+}
+
+typedef int (*register_lwp_cb)(void *data);
+
+struct lwp_register_item {
+       struct obd_export **lri_exp;
+       register_lwp_cb     lri_cb_func;
+       void               *lri_cb_data;
+       struct list_head            lri_list;
+       char                lri_name[MTI_NAME_MAXLEN];
+};
+
+/* I'm as embarrassed about this as you are.
+ *
+ * <shaver> // XXX do not look into _superhack with remaining eye
+ * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
+extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+
+/* obd_mount.c */
+
+/* sysctl.c */
+extern void obd_sysctl_init(void);
+extern void obd_sysctl_clean(void);
+
+/* uuid.c */
+typedef __u8 class_uuid_t[16];
+void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out);
+
+/* lustre_peer.c */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index);
+int class_add_uuid(const char *uuid, __u64 nid);
+int class_del_uuid(const char *uuid);
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid);
+void class_init_uuidlist(void);
+void class_exit_uuidlist(void);
+
+/* mea.c */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen);
+int raw_name2idx(int hashtype, int count, const char *name, int namelen);
+
+/* prng.c */
+#define ll_generate_random_uuid(uuid_out) \
+       cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t))
+
+#endif /* __LINUX_OBD_CLASS_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h
new file mode 100644 (file)
index 0000000..d82f334
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_LOV_H__
+#define _OBD_LOV_H__
+
+#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS)
+
+static inline int lov_stripe_md_size(__u16 stripes)
+{
+       return sizeof(struct lov_stripe_md) +
+              stripes * sizeof(struct lov_oinfo *);
+}
+
+static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
+{
+       if (lmm_magic == LOV_MAGIC_V3)
+               return sizeof(struct lov_mds_md_v3) +
+                       stripes * sizeof(struct lov_ost_data_v1);
+       else
+               return sizeof(struct lov_mds_md_v1) +
+                       stripes * sizeof(struct lov_ost_data_v1);
+}
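+
+/*
+ * Worked example (illustrative): a 4-stripe V1 layout occupies
+ * lov_mds_md_size(4, LOV_MAGIC_V1) ==
+ *     sizeof(struct lov_mds_md_v1) + 4 * sizeof(struct lov_ost_data_v1)
+ * bytes on disk; only the header size differs for LOV_MAGIC_V3.
+ */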
+
+struct lov_version_size {
+       __u32   lvs_magic;
+       size_t  lvs_lmm_size;
+       size_t  lvs_lod_size;
+};
+
+static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic)
+{
+       static const struct lov_version_size lmm_ver_size[] = {
+                       { .lvs_magic = LOV_MAGIC_V3,
+                         .lvs_lmm_size = sizeof(struct lov_mds_md_v3),
+                         .lvs_lod_size = sizeof(struct lov_ost_data_v1) },
+                       { .lvs_magic = LOV_MAGIC_V1,
+                         .lvs_lmm_size = sizeof(struct lov_mds_md_v1),
+                         .lvs_lod_size = sizeof(struct lov_ost_data_v1)} };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) {
+               if (lmm_magic == lmm_ver_size[i].lvs_magic) {
+                       if (ea_size <= lmm_ver_size[i].lvs_lmm_size)
+                               return 0;
+                       return (ea_size - lmm_ver_size[i].lvs_lmm_size) /
+                               lmm_ver_size[i].lvs_lod_size;
+               }
+       }
+
+       /* Invalid LOV magic, so no stripes could fit */
+       return 0;
+}
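+
+/*
+ * Note (illustrative): for a valid magic and n > 0 this function is the
+ * inverse of lov_mds_md_size():
+ *
+ *     lov_mds_md_stripecnt(lov_mds_md_size(n, magic), magic) == n
+ */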
+
+/* lov_do_div64(a, b) returns a % b, and a = a / b.
+ * The 32-bit code is LOV-specific due to knowing about stripe limits in
+ * order to reduce the divisor to a 32-bit number.  If the divisor is
+ * already a 32-bit value the compiler handles this directly. */
+#if BITS_PER_LONG > 32
+# define lov_do_div64(n,base) ({                                       \
+       uint64_t __base = (base);                                       \
+       uint64_t __rem;                                                 \
+       __rem = ((uint64_t)(n)) % __base;                               \
+       (n) = ((uint64_t)(n)) / __base;                                 \
+       __rem;                                                          \
+  })
+#else
+# define lov_do_div64(n,base) ({                                       \
+       uint64_t __rem;                                                 \
+       if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) {  \
+               int __remainder;                                              \
+               LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \
+                        "division %llu / %llu\n", (n), (uint64_t)(base));    \
+               __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);          \
+               (n) >>= LOV_MIN_STRIPE_BITS;                            \
+               __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS);       \
+               __rem <<= LOV_MIN_STRIPE_BITS;                          \
+               __rem += __remainder;                                   \
+       } else {                                                        \
+               __rem = do_div(n, base);                                \
+       }                                                               \
+       __rem;                                                          \
+  })
+#endif
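+
+/* Usage sketch (illustrative, not part of the original header): when both
+ * the quotient and the remainder of a stripe calculation are needed:
+ *
+ *     __u64 off = file_offset;
+ *     __u64 in_stripe = lov_do_div64(off, stripe_size);
+ *     // 'off' now holds file_offset / stripe_size (the stripe number),
+ *     // 'in_stripe' holds file_offset % stripe_size.
+ */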
+
+#define IOC_LOV_TYPE              'g'
+#define IOC_LOV_MIN_NR          50
+#define IOC_LOV_SET_OSC_ACTIVE  _IOWR('g', 50, long)
+#define IOC_LOV_MAX_NR          50
+
+#define QOS_DEFAULT_THRESHOLD     10 /* MB */
+#define QOS_DEFAULT_MAXAGE           5  /* Seconds */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h
new file mode 100644 (file)
index 0000000..af89843
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/include/obd_ost.h
+ *
+ * Data structures for object storage targets and client: OST & OSC's
+ *
+ * See also lustre_idl.h for wire formats of requests.
+ */
+
+#ifndef _LUSTRE_OST_H
+#define _LUSTRE_OST_H
+
+#include <obd_class.h>
+
+struct osc_brw_async_args {
+       struct obdo       *aa_oa;
+       int             aa_requested_nob;
+       int             aa_nio_count;
+       obd_count         aa_page_count;
+       int             aa_resends;
+       struct brw_page  **aa_ppga;
+       struct client_obd *aa_cli;
+       struct list_head         aa_oaps;
+       struct list_head         aa_exts;
+       struct obd_capa   *aa_ocapa;
+       struct cl_req     *aa_clerq;
+};
+
+#define osc_grant_args osc_brw_async_args
+struct osc_async_args {
+       struct obd_info   *aa_oi;
+};
+
+struct osc_setattr_args {
+       struct obdo      *sa_oa;
+       obd_enqueue_update_f sa_upcall;
+       void            *sa_cookie;
+};
+
+struct osc_fsync_args {
+       struct obd_info     *fa_oi;
+       obd_enqueue_update_f fa_upcall;
+       void            *fa_cookie;
+};
+
+struct osc_enqueue_args {
+       struct obd_export       *oa_exp;
+       __u64               *oa_flags;
+       obd_enqueue_update_f      oa_upcall;
+       void                 *oa_cookie;
+       struct ost_lvb     *oa_lvb;
+       struct lustre_handle     *oa_lockh;
+       struct ldlm_enqueue_info *oa_ei;
+       unsigned int          oa_agl:1;
+};
+
+#if 0
+int osc_extent_blocking_cb(struct ldlm_lock *lock,
+                          struct ldlm_lock_desc *new, void *data,
+                          int flag);
+#endif
+
+#endif
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
new file mode 100644 (file)
index 0000000..b5d40af
--- /dev/null
@@ -0,0 +1,851 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _OBD_SUPPORT
+#define _OBD_SUPPORT
+
+#include <linux/libcfs/libcfs.h>
+#include <lvfs.h>
+#include <lprocfs_status.h>
+
+#include <linux/obd_support.h>
+
+/* global variables */
+extern struct lprocfs_stats *obd_memory;
+enum {
+       OBD_MEMORY_STAT = 0,
+       OBD_MEMORY_PAGES_STAT = 1,
+       OBD_STATS_NUM,
+};
+
+extern unsigned int obd_debug_peer_on_timeout;
+extern unsigned int obd_dump_on_timeout;
+extern unsigned int obd_dump_on_eviction;
+/* obd_timeout should only be used for recovery, not for
+   networking / disk / timings affected by load (use Adaptive Timeouts) */
+extern unsigned int obd_timeout;         /* seconds */
+extern unsigned int ldlm_timeout;       /* seconds */
+extern unsigned int obd_timeout_set;
+extern unsigned int ldlm_timeout_set;
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
+extern unsigned int obd_sync_filter;
+extern unsigned int obd_max_dirty_pages;
+extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
+extern unsigned int obd_alloc_fail_rate;
+extern char obd_jobid_var[];
+
+/* lvfs.c */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+                  size_t size, const char *file, int line);
+
+/* Some hash init argument constants */
+#define HASH_POOLS_BKT_BITS 3
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_BKT_BITS 5
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_BKT_BITS 5
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_BKT_BITS 5
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQE_BKT_BITS 5
+#define HASH_LQE_CUR_BITS 7
+#define HASH_LQE_MAX_BITS 12
+#define HASH_CONN_BKT_BITS 5
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+#define HASH_EXP_LOCK_BKT_BITS  5
+#define HASH_EXP_LOCK_CUR_BITS  7
+#define HASH_EXP_LOCK_MAX_BITS  16
+#define HASH_CL_ENV_BKT_BITS    5
+#define HASH_CL_ENV_BITS       10
+#define HASH_JOB_STATS_BKT_BITS 5
+#define HASH_JOB_STATS_CUR_BITS 7
+#define HASH_JOB_STATS_MAX_BITS 12
+
+/* Timeout definitions */
+#define OBD_TIMEOUT_DEFAULT         100
+#define LDLM_TIMEOUT_DEFAULT       20
+#define MDS_LDLM_TIMEOUT_DEFAULT       6
+/* Time to wait for all clients to reconnect during recovery (hard limit) */
+#define OBD_RECOVERY_TIME_HARD   (obd_timeout * 9)
+/* Time to wait for all clients to reconnect during recovery (soft limit) */
+/* Should be very conservative; must catch the first reconnect after reboot */
+#define OBD_RECOVERY_TIME_SOFT   (obd_timeout * 3)
+/* Change recovery-small 26b time if you change this */
+#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U)
+/* A client may skip one ping, so we must wait at least 2.5 intervals. But
+ * with multiple failover targets the client only pings one server at a
+ * time, and pings can be lost on a loaded network. Since eviction has
+ * serious consequences, and there's no urgent need to evict a client just
+ * because it's idle, we should be very conservative here. */
+#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6)
+#define DISK_TIMEOUT 50          /* Beyond this we warn about disk speed */
+#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */
+ /* Max connect interval for nonresponsive servers; ~50s to avoid building up
+    connect requests in the LND queues, but within obd_timeout so we don't
+    miss the recovery window */
+#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_INC 5  /* Connection timeout backoff */
+/* In general this should be low to have quick detection of a system
+   running on a backup server. (If it's too low, import_select_connection
+   will increase the timeout anyhow.)  */
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout / 20)
+/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */
+#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \
+                            INITIAL_CONNECT_TIMEOUT)
+/* The min time a target should wait for clients to reconnect in recovery */
+#define OBD_RECOVERY_TIME_MIN    (2*RECONNECT_DELAY_MAX)
+#define OBD_IR_FACTOR_MIN       1
+#define OBD_IR_FACTOR_MAX       10
+#define OBD_IR_FACTOR_DEFAULT    (OBD_IR_FACTOR_MAX/2)
+/* default timeout for the MGS to become IR_FULL */
+#define OBD_IR_MGS_TIMEOUT       (4*obd_timeout)
+#define LONG_UNLINK 300          /* Unlink should happen before now */
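+
+/* Worked example (illustrative): with the default obd_timeout of 100
+ * seconds, PING_INTERVAL = max(100 / 4, 1) = 25s and PING_EVICT_TIMEOUT =
+ * 25 * 6 = 150s, i.e. an idle client must miss six consecutive ping
+ * intervals before eviction is considered. */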
+
+/**
+ * Grant shrink interval: if the client has been "idle" for more than this
+ * interval, the ll_grant thread returns the requested grant space to the
+ * filter.
+ */
+#define GRANT_SHRINK_INTERVAL      1200 /* 20 minutes */
+
+#define OBD_FAIL_MDS                0x100
+#define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
+#define OBD_FAIL_MDS_GETATTR_NET        0x102
+#define OBD_FAIL_MDS_GETATTR_PACK      0x103
+#define OBD_FAIL_MDS_READPAGE_NET      0x104
+#define OBD_FAIL_MDS_READPAGE_PACK       0x105
+#define OBD_FAIL_MDS_SENDPAGE      0x106
+#define OBD_FAIL_MDS_REINT_NET    0x107
+#define OBD_FAIL_MDS_REINT_UNPACK      0x108
+#define OBD_FAIL_MDS_REINT_SETATTR       0x109
+#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a
+#define OBD_FAIL_MDS_REINT_CREATE      0x10b
+#define OBD_FAIL_MDS_REINT_CREATE_WRITE  0x10c
+#define OBD_FAIL_MDS_REINT_UNLINK      0x10d
+#define OBD_FAIL_MDS_REINT_UNLINK_WRITE  0x10e
+#define OBD_FAIL_MDS_REINT_LINK          0x10f
+#define OBD_FAIL_MDS_REINT_LINK_WRITE    0x110
+#define OBD_FAIL_MDS_REINT_RENAME      0x111
+#define OBD_FAIL_MDS_REINT_RENAME_WRITE  0x112
+#define OBD_FAIL_MDS_OPEN_NET      0x113
+#define OBD_FAIL_MDS_OPEN_PACK    0x114
+#define OBD_FAIL_MDS_CLOSE_NET    0x115
+#define OBD_FAIL_MDS_CLOSE_PACK          0x116
+#define OBD_FAIL_MDS_CONNECT_NET        0x117
+#define OBD_FAIL_MDS_CONNECT_PACK      0x118
+#define OBD_FAIL_MDS_REINT_NET_REP       0x119
+#define OBD_FAIL_MDS_DISCONNECT_NET      0x11a
+#define OBD_FAIL_MDS_GETSTATUS_NET       0x11b
+#define OBD_FAIL_MDS_GETSTATUS_PACK      0x11c
+#define OBD_FAIL_MDS_STATFS_PACK        0x11d
+#define OBD_FAIL_MDS_STATFS_NET          0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET    0x11f
+#define OBD_FAIL_MDS_PIN_NET        0x120
+#define OBD_FAIL_MDS_UNPIN_NET    0x121
+#define OBD_FAIL_MDS_ALL_REPLY_NET       0x122
+#define OBD_FAIL_MDS_ALL_REQUEST_NET     0x123
+#define OBD_FAIL_MDS_SYNC_NET      0x124
+#define OBD_FAIL_MDS_SYNC_PACK    0x125
+#define OBD_FAIL_MDS_DONE_WRITING_NET    0x126
+#define OBD_FAIL_MDS_DONE_WRITING_PACK   0x127
+#define OBD_FAIL_MDS_ALLOC_OBDO          0x128
+#define OBD_FAIL_MDS_PAUSE_OPEN          0x129
+#define OBD_FAIL_MDS_STATFS_LCW_SLEEP    0x12a
+#define OBD_FAIL_MDS_OPEN_CREATE        0x12b
+#define OBD_FAIL_MDS_OST_SETATTR        0x12c
+#define OBD_FAIL_MDS_QUOTACHECK_NET      0x12d
+#define OBD_FAIL_MDS_QUOTACTL_NET      0x12e
+#define OBD_FAIL_MDS_CLIENT_ADD          0x12f
+#define OBD_FAIL_MDS_GETXATTR_NET      0x130
+#define OBD_FAIL_MDS_GETXATTR_PACK       0x131
+#define OBD_FAIL_MDS_SETXATTR_NET      0x132
+#define OBD_FAIL_MDS_SETXATTR      0x133
+#define OBD_FAIL_MDS_SETXATTR_WRITE      0x134
+#define OBD_FAIL_MDS_FS_SETUP      0x135
+#define OBD_FAIL_MDS_RESEND          0x136
+#define OBD_FAIL_MDS_LLOG_CREATE_FAILED  0x137
+#define OBD_FAIL_MDS_LOV_SYNC_RACE       0x138
+#define OBD_FAIL_MDS_OSC_PRECREATE       0x139
+#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT   0x13a
+#define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
+#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
+#define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
+#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD    0x140
+#define OBD_FAIL_MDS_LOV_PREP_CREATE     0x141
+#define OBD_FAIL_MDS_REINT_DELAY        0x142
+#define OBD_FAIL_MDS_READLINK_EPROTO     0x143
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE    0x144
+#define OBD_FAIL_MDS_PDO_LOCK      0x145
+#define OBD_FAIL_MDS_PDO_LOCK2    0x146
+#define OBD_FAIL_MDS_OSC_CREATE_FAIL     0x147
+#define OBD_FAIL_MDS_NEGATIVE_POSITIVE  0x148
+#define OBD_FAIL_MDS_HSM_STATE_GET_NET         0x149
+#define OBD_FAIL_MDS_HSM_STATE_SET_NET         0x14a
+#define OBD_FAIL_MDS_HSM_PROGRESS_NET          0x14b
+#define OBD_FAIL_MDS_HSM_REQUEST_NET           0x14c
+#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET       0x14d
+#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET     0x14e
+#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET          0x14f
+#define OBD_FAIL_MDS_HSM_ACTION_NET            0x150
+#define OBD_FAIL_MDS_CHANGELOG_INIT            0x151
+
+/* layout lock */
+#define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
+#define OBD_FAIL_MDS_NO_LL_OPEN                 0x171
+#define OBD_FAIL_MDS_LL_BLOCK           0x172
+
+/* CMD */
+#define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
+#define OBD_FAIL_MDS_IS_SUBDIR_PACK      0x181
+#define OBD_FAIL_MDS_SET_INFO_NET      0x182
+#define OBD_FAIL_MDS_WRITEPAGE_NET       0x183
+#define OBD_FAIL_MDS_WRITEPAGE_PACK      0x184
+#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET      0x186
+#define OBD_FAIL_MDS_DQACQ_NET    0x187
+
+/* OI scrub */
+#define OBD_FAIL_OSD_SCRUB_DELAY                       0x190
+#define OBD_FAIL_OSD_SCRUB_CRASH                       0x191
+#define OBD_FAIL_OSD_SCRUB_FATAL                       0x192
+#define OBD_FAIL_OSD_FID_MAPPING                       0x193
+#define OBD_FAIL_OSD_LMA_INCOMPAT                      0x194
+
+#define OBD_FAIL_OST                0x200
+#define OBD_FAIL_OST_CONNECT_NET        0x201
+#define OBD_FAIL_OST_DISCONNECT_NET      0x202
+#define OBD_FAIL_OST_GET_INFO_NET      0x203
+#define OBD_FAIL_OST_CREATE_NET          0x204
+#define OBD_FAIL_OST_DESTROY_NET        0x205
+#define OBD_FAIL_OST_GETATTR_NET        0x206
+#define OBD_FAIL_OST_SETATTR_NET        0x207
+#define OBD_FAIL_OST_OPEN_NET      0x208
+#define OBD_FAIL_OST_CLOSE_NET    0x209
+#define OBD_FAIL_OST_BRW_NET        0x20a
+#define OBD_FAIL_OST_PUNCH_NET    0x20b
+#define OBD_FAIL_OST_STATFS_NET          0x20c
+#define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
+#define OBD_FAIL_OST_SYNC_NET      0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET       0x211
+#define OBD_FAIL_OST_ALL_REQUEST_NET     0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
+#define OBD_FAIL_OST_BRW_PAUSE_BULK      0x214
+#define OBD_FAIL_OST_ENOSPC          0x215
+#define OBD_FAIL_OST_EROFS            0x216
+#define OBD_FAIL_OST_ENOENT          0x217
+#define OBD_FAIL_OST_QUOTACHECK_NET      0x218
+#define OBD_FAIL_OST_QUOTACTL_NET      0x219
+#define OBD_FAIL_OST_CHECKSUM_RECEIVE    0x21a
+#define OBD_FAIL_OST_CHECKSUM_SEND       0x21b
+#define OBD_FAIL_OST_BRW_SIZE      0x21c
+#define OBD_FAIL_OST_DROP_REQ      0x21d
+#define OBD_FAIL_OST_SETATTR_CREDITS     0x21e
+#define OBD_FAIL_OST_HOLD_WRITE_RPC      0x21f
+#define OBD_FAIL_OST_BRW_WRITE_BULK2     0x220
+#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
+#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
+#define OBD_FAIL_OST_PAUSE_CREATE      0x223
+#define OBD_FAIL_OST_BRW_PAUSE_PACK      0x224
+#define OBD_FAIL_OST_CONNECT_NET2      0x225
+#define OBD_FAIL_OST_NOMEM            0x226
+#define OBD_FAIL_OST_BRW_PAUSE_BULK2     0x227
+#define OBD_FAIL_OST_MAPBLK_ENOSPC       0x228
+#define OBD_FAIL_OST_ENOINO          0x229
+#define OBD_FAIL_OST_DQACQ_NET    0x230
+#define OBD_FAIL_OST_STATFS_EINPROGRESS  0x231
+
+#define OBD_FAIL_LDLM              0x300
+#define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
+#define OBD_FAIL_LDLM_ENQUEUE_NET                      0x302
+#define OBD_FAIL_LDLM_CONVERT_NET                      0x303
+#define OBD_FAIL_LDLM_CANCEL_NET                       0x304
+#define OBD_FAIL_LDLM_BL_CALLBACK_NET                  0x305
+#define OBD_FAIL_LDLM_CP_CALLBACK_NET                  0x306
+#define OBD_FAIL_LDLM_GL_CALLBACK_NET                  0x307
+#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308
+#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309
+#define OBD_FAIL_LDLM_CREATE_RESOURCE    0x30a
+#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED    0x30b
+#define OBD_FAIL_LDLM_REPLY          0x30c
+#define OBD_FAIL_LDLM_RECOV_CLIENTS      0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+#define OBD_FAIL_LDLM_GLIMPSE      0x30f
+#define OBD_FAIL_LDLM_CANCEL_RACE      0x310
+#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE  0x311
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
+#define OBD_FAIL_LDLM_CP_CB_WAIT        0x315
+#define OBD_FAIL_LDLM_OST_FAIL_RACE      0x316
+#define OBD_FAIL_LDLM_INTR_CP_AST      0x317
+#define OBD_FAIL_LDLM_CP_BL_RACE        0x318
+#define OBD_FAIL_LDLM_NEW_LOCK    0x319
+#define OBD_FAIL_LDLM_AGL_DELAY          0x31a
+#define OBD_FAIL_LDLM_AGL_NOLOCK        0x31b
+#define OBD_FAIL_LDLM_OST_LVB           0x31c
+
+/* LOCKLESS IO */
+#define OBD_FAIL_LDLM_SET_CONTENTION     0x385
+
+#define OBD_FAIL_OSC                0x400
+#define OBD_FAIL_OSC_BRW_READ_BULK       0x401
+#define OBD_FAIL_OSC_BRW_WRITE_BULK      0x402
+#define OBD_FAIL_OSC_LOCK_BL_AST        0x403
+#define OBD_FAIL_OSC_LOCK_CP_AST        0x404
+#define OBD_FAIL_OSC_MATCH            0x405
+#define OBD_FAIL_OSC_BRW_PREP_REQ      0x406
+#define OBD_FAIL_OSC_SHUTDOWN      0x407
+#define OBD_FAIL_OSC_CHECKSUM_RECEIVE    0x408
+#define OBD_FAIL_OSC_CHECKSUM_SEND       0x409
+#define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
+#define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
+#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE    0x40d
+#define OBD_FAIL_OSC_OBJECT_CONTENTION   0x40e
+#define OBD_FAIL_OSC_CP_CANCEL_RACE      0x40f
+#define OBD_FAIL_OSC_CP_ENQ_RACE        0x410
+#define OBD_FAIL_OSC_NO_GRANT      0x411
+#define OBD_FAIL_OSC_DELAY_SETTIME      0x412
+
+#define OBD_FAIL_PTLRPC                  0x500
+#define OBD_FAIL_PTLRPC_ACK          0x501
+#define OBD_FAIL_PTLRPC_RQBD        0x502
+#define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
+#define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
+#define OBD_FAIL_PTLRPC_DROP_RPC        0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
+#define OBD_FAIL_PTLRPC_DELAY_RECOV      0x507
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+#define OBD_FAIL_PTLRPC_PAUSE_REQ      0x50a
+#define OBD_FAIL_PTLRPC_PAUSE_REP      0x50c
+#define OBD_FAIL_PTLRPC_IMP_DEACTIVE     0x50d
+#define OBD_FAIL_PTLRPC_DUMP_LOG        0x50e
+#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f
+#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510
+#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT    0x511
+#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
+#define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
+#define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
+#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
+
+#define OBD_FAIL_OBD_PING_NET      0x600
+#define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
+#define OBD_FAIL_OBD_LOGD_NET      0x602
+#define OBD_FAIL_OBD_QC_CALLBACK_NET     0x603
+#define OBD_FAIL_OBD_DQACQ            0x604
+#define OBD_FAIL_OBD_LLOG_SETUP          0x605
+#define OBD_FAIL_OBD_LOG_CANCEL_REP      0x606
+#define OBD_FAIL_OBD_IDX_READ_NET      0x607
+#define OBD_FAIL_OBD_IDX_READ_BREAK     0x608
+#define OBD_FAIL_OBD_NO_LRU             0x609
+
+#define OBD_FAIL_TGT_REPLY_NET    0x700
+#define OBD_FAIL_TGT_CONN_RACE    0x701
+#define OBD_FAIL_TGT_FORCE_RECONNECT     0x702
+#define OBD_FAIL_TGT_DELAY_CONNECT       0x703
+#define OBD_FAIL_TGT_DELAY_RECONNECT     0x704
+#define OBD_FAIL_TGT_DELAY_PRECREATE     0x705
+#define OBD_FAIL_TGT_TOOMANY_THREADS     0x706
+#define OBD_FAIL_TGT_REPLAY_DROP        0x707
+#define OBD_FAIL_TGT_FAKE_EXP      0x708
+#define OBD_FAIL_TGT_REPLAY_DELAY      0x709
+#define OBD_FAIL_TGT_LAST_REPLAY        0x710
+#define OBD_FAIL_TGT_CLIENT_ADD          0x711
+#define OBD_FAIL_TGT_RCVG_FLAG    0x712
+
+#define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
+#define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
+#define OBD_FAIL_MDC_OLD_EXT_FLAGS       0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
+#define OBD_FAIL_MDC_RPCS_SEM           0x804
+#define OBD_FAIL_MDC_LIGHTWEIGHT        0x805
+
+#define OBD_FAIL_MGS                0x900
+#define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
+#define OBD_FAIL_MGS_ALL_REPLY_NET       0x902
+#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG   0x903
+#define OBD_FAIL_MGS_PAUSE_REQ    0x904
+#define OBD_FAIL_MGS_PAUSE_TARGET_REG    0x905
+
+#define OBD_FAIL_QUOTA_DQACQ_NET                       0xA01
+#define OBD_FAIL_QUOTA_EDQUOT      0xA02
+#define OBD_FAIL_QUOTA_DELAY_REINT       0xA03
+#define OBD_FAIL_QUOTA_RECOVERABLE_ERR   0xA04
+
+#define OBD_FAIL_LPROC_REMOVE      0xB00
+
+#define OBD_FAIL_GENERAL_ALLOC    0xC00
+
+#define OBD_FAIL_SEQ                0x1000
+#define OBD_FAIL_SEQ_QUERY_NET    0x1001
+#define OBD_FAIL_SEQ_EXHAUST            0x1002
+
+#define OBD_FAIL_FLD                0x1100
+#define OBD_FAIL_FLD_QUERY_NET    0x1101
+
+#define OBD_FAIL_SEC_CTX                0x1200
+#define OBD_FAIL_SEC_CTX_INIT_NET      0x1201
+#define OBD_FAIL_SEC_CTX_INIT_CONT_NET   0x1202
+#define OBD_FAIL_SEC_CTX_FINI_NET      0x1203
+#define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
+
+#define OBD_FAIL_LLOG                         0x1300
+#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET           0x1301
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET      0x1302
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET     0x1303
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET  0x1305
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET  0x1306
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET   0x1307
+#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET       0x1308
+#define OBD_FAIL_LLOG_CATINFO_NET                 0x1309
+#define OBD_FAIL_MDS_SYNC_CAPA_SL                 0x1310
+#define OBD_FAIL_SEQ_ALLOC                       0x1311
+
+#define OBD_FAIL_LLITE                       0x1400
+#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE             0x1401
+#define OBD_FAIL_LOCK_STATE_WAIT_INTR         0x1402
+#define OBD_FAIL_LOV_INIT                          0x1403
+#define OBD_FAIL_GLIMPSE_DELAY                     0x1404
+
+#define OBD_FAIL_FID_INDIR     0x1501
+#define OBD_FAIL_FID_INLMA     0x1502
+#define OBD_FAIL_FID_IGIF      0x1504
+#define OBD_FAIL_FID_LOOKUP    0x1505
+#define OBD_FAIL_FID_NOLMA     0x1506
+
+/* LFSCK */
+#define OBD_FAIL_LFSCK_DELAY1          0x1600
+#define OBD_FAIL_LFSCK_DELAY2          0x1601
+#define OBD_FAIL_LFSCK_DELAY3          0x1602
+#define OBD_FAIL_LFSCK_LINKEA_CRASH    0x1603
+#define OBD_FAIL_LFSCK_LINKEA_MORE     0x1604
+#define OBD_FAIL_LFSCK_FATAL1          0x1608
+#define OBD_FAIL_LFSCK_FATAL2          0x1609
+#define OBD_FAIL_LFSCK_CRASH           0x160a
+#define OBD_FAIL_LFSCK_NO_AUTO         0x160b
+#define OBD_FAIL_LFSCK_NO_DOUBLESCAN   0x160c
+
+/* UPDATE */
+#define OBD_FAIL_UPDATE_OBJ_NET                        0x1700
+#define OBD_FAIL_UPDATE_OBJ_NET_REP            0x1701
+
+
+/* Assign references to moved code to reduce code changes */
+#define OBD_FAIL_PRECHECK(id)             CFS_FAIL_PRECHECK(id)
+#define OBD_FAIL_CHECK(id)                   CFS_FAIL_CHECK(id)
+#define OBD_FAIL_CHECK_VALUE(id, value)         CFS_FAIL_CHECK_VALUE(id, value)
+#define OBD_FAIL_CHECK_ORSET(id, value)         CFS_FAIL_CHECK_ORSET(id, value)
+#define OBD_FAIL_CHECK_RESET(id, value)         CFS_FAIL_CHECK_RESET(id, value)
+#define OBD_FAIL_RETURN(id, ret)               CFS_FAIL_RETURN(id, ret)
+#define OBD_FAIL_TIMEOUT(id, secs)           CFS_FAIL_TIMEOUT(id, secs)
+#define OBD_FAIL_TIMEOUT_MS(id, ms)         CFS_FAIL_TIMEOUT_MS(id, ms)
+#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs)
+#define OBD_RACE(id)                       CFS_RACE(id)
+#define OBD_FAIL_ONCE                     CFS_FAIL_ONCE
+#define OBD_FAILED                           CFS_FAILED
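+
+/* Usage sketch (illustrative): a fail point is compiled into the code path
+ * and is a no-op unless the matching fail id has been armed at runtime
+ * (typically through the cfs fail_loc tunable), e.g.
+ *
+ *     if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_NET))
+ *             RETURN(-EIO);   // simulate a lost GETATTR request
+ */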
+
+extern atomic_t libcfs_kmemory;
+
+#ifdef LPROCFS
+#define obd_memory_add(size)                                             \
+       lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sub(size)                                             \
+       lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size))
+#define obd_memory_sum()                                                     \
+       lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT,              \
+                               LPROCFS_FIELDS_FLAGS_SUM)
+#define obd_pages_add(order)                                             \
+       lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                           (long)(1 << (order)))
+#define obd_pages_sub(order)                                             \
+       lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                           (long)(1 << (order)))
+#define obd_pages_sum()                                                       \
+       lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT,          \
+                               LPROCFS_FIELDS_FLAGS_SUM)
+
+extern void obd_update_maxusage(void);
+extern __u64 obd_memory_max(void);
+extern __u64 obd_pages_max(void);
+
+#else
+
+extern __u64 obd_alloc;
+extern __u64 obd_pages;
+
+extern __u64 obd_max_alloc;
+extern __u64 obd_max_pages;
+
+static inline void obd_memory_add(long size)
+{
+       obd_alloc += size;
+       if (obd_alloc > obd_max_alloc)
+               obd_max_alloc = obd_alloc;
+}
+
+static inline void obd_memory_sub(long size)
+{
+       obd_alloc -= size;
+}
+
+static inline void obd_pages_add(int order)
+{
+       obd_pages += 1 << order;
+       if (obd_pages > obd_max_pages)
+               obd_max_pages = obd_pages;
+}
+
+static inline void obd_pages_sub(int order)
+{
+       obd_pages -= 1 << order;
+}
+
+#define obd_memory_sum() (obd_alloc)
+#define obd_pages_sum()  (obd_pages)
+
+#define obd_memory_max() (obd_max_alloc)
+#define obd_pages_max() (obd_max_pages)
+
+#endif
+
+#define OBD_DEBUG_MEMUSAGE (1)
+
+#if OBD_DEBUG_MEMUSAGE
+#define OBD_ALLOC_POST(ptr, size, name)                                 \
+               obd_memory_add(size);                              \
+               CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",       \
+                      (int)(size), ptr)
+
+#define OBD_FREE_PRE(ptr, size, name)                             \
+       LASSERT(ptr);                                              \
+       obd_memory_sub(size);                                      \
+       CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n",              \
+              (int)(size), ptr);                                      \
+       POISON(ptr, 0x5a, size)
+
+#else /* !OBD_DEBUG_MEMUSAGE */
+
+#define OBD_ALLOC_POST(ptr, size, name) ((void)0)
+#define OBD_FREE_PRE(ptr, size, name)   ((void)0)
+
+#endif /* !OBD_DEBUG_MEMUSAGE */
+
+#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC)
+
+#define OBD_ALLOC_FAIL_BITS 24
+#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1)
+#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100)
+
+#if defined(LUSTRE_UTILS) /* this version is for utils only */
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)                   \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmalloc(size, flags) :                                \
+               kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt));   \
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n",  \
+                      (int)(size), __FILE__, __LINE__);                      \
+       } else {                                                              \
+               memset(ptr, 0, size);                                         \
+               CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n",          \
+                      (int)(size), ptr);                                     \
+       }                                                                     \
+} while (0)
+
+#else /* this version is for the kernel and liblustre */
+#define OBD_FREE_RTN0(ptr)                                                 \
+({                                                                         \
+       kfree(ptr);                                                     \
+       (ptr) = NULL;                                                    \
+       0;                                                                  \
+})
+
+#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags)                   \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmalloc(size, flags | __GFP_ZERO) :                           \
+               kmalloc_node(size, flags | __GFP_ZERO,                        \
+                            cfs_cpt_spread_node(cptab, cpt));                \
+       if (likely((ptr) != NULL &&                                        \
+                  (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+                   !obd_alloc_fail(ptr, #ptr, "km", size,                  \
+                                   __FILE__, __LINE__) ||                  \
+                   OBD_FREE_RTN0(ptr)))){                                  \
+               OBD_ALLOC_POST(ptr, size, "kmalloced");                \
+       }                                                                    \
+} while (0)
+#endif
+
+#define OBD_ALLOC_GFP(ptr, size, gfp_mask)                                   \
+       __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask)
+
+#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO)
+#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS)
+#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr))
+#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr))
+
+#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask)                   \
+       __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask)
+
+#define OBD_CPT_ALLOC(ptr, cptab, cpt, size)                                 \
+       OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO)
+
+#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt)                                   \
+       OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr))
+
+#define __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size)                         \
+do {                                                                         \
+       (ptr) = cptab == NULL ?                                               \
+               vzalloc(size) :                                               \
+               vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt));          \
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n",        \
+                      (int)(size));                                      \
+               CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \
+                      obd_memory_sum(), atomic_read(&libcfs_kmemory));   \
+       } else {                                                              \
+               OBD_ALLOC_POST(ptr, size, "vmalloced");                \
+       }                                                                    \
+} while (0)
+
+#define OBD_VMALLOC(ptr, size)                                               \
+        __OBD_VMALLOC_VERBOSE(ptr, NULL, 0, size)
+#define OBD_CPT_VMALLOC(ptr, cptab, cpt, size)                               \
+        __OBD_VMALLOC_VERBOSE(ptr, cptab, cpt, size)
+
+
+/* Allocations above this size are considered too big to be done
+ * atomically.
+ *
+ * Be very careful when changing this value, especially when decreasing it,
+ * since vmalloc in Linux doesn't perform well on multi-core systems; calling
+ * vmalloc on a critical path would hurt performance badly. See LU-66.
+ */
+#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE)
+
+#define OBD_ALLOC_LARGE(ptr, size)                                         \
+do {                                                                     \
+       if (size > OBD_ALLOC_BIG)                                            \
+               OBD_VMALLOC(ptr, size);                                \
+       else                                                              \
+               OBD_ALLOC(ptr, size);                                    \
+} while (0)
+
+#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size)                           \
+do {                                                                         \
+       if (size > OBD_ALLOC_BIG)                                             \
+               OBD_CPT_VMALLOC(ptr, cptab, cpt, size);                       \
+       else                                                                  \
+               OBD_CPT_ALLOC(ptr, cptab, cpt, size);                         \
+} while (0)
+
+#define OBD_FREE_LARGE(ptr, size)                                           \
+do {                                                                     \
+       if (size > OBD_ALLOC_BIG)                                            \
+               OBD_VFREE(ptr, size);                                    \
+       else                                                              \
+               OBD_FREE(ptr, size);                                      \
+} while (0)
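+
+/* Usage sketch (illustrative): callers pair OBD_ALLOC_LARGE() with
+ * OBD_FREE_LARGE() using the same size, and the macros transparently pick
+ * kmalloc or vmalloc based on OBD_ALLOC_BIG:
+ *
+ *     char *buf;
+ *     OBD_ALLOC_LARGE(buf, count);
+ *     if (buf == NULL)
+ *             RETURN(-ENOMEM);
+ *     ...
+ *     OBD_FREE_LARGE(buf, count);
+ */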
+
+
+#ifdef CONFIG_DEBUG_SLAB
+#define POISON(ptr, c, s) do {} while (0)
+#define POISON_PTR(ptr)  ((void)0)
+#else
+#define POISON(ptr, c, s) memset(ptr, c, s)
+#define POISON_PTR(ptr)  (ptr) = (void *)0xdeadbeef
+#endif
+
+#ifdef POISON_BULK
+#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE);   \
+                                   kunmap(page); } while (0)
+#else
+#define POISON_PAGE(page, val) do { } while (0)
+#endif
+
+#define OBD_FREE(ptr, size)                                               \
+do {                                                                     \
+       OBD_FREE_PRE(ptr, size, "kfreed");                                  \
+       kfree(ptr);                                                     \
+       POISON_PTR(ptr);                                                      \
+} while (0)
+
+
+#define OBD_FREE_RCU(ptr, size, handle)                                              \
+do {                                                                         \
+       struct portals_handle *__h = (handle);                                \
+                                                                             \
+       LASSERT(handle != NULL);                                              \
+       __h->h_cookie = (unsigned long)(ptr);                                 \
+       __h->h_size = (size);                                                 \
+       call_rcu(&__h->h_rcu, class_handle_free_cb);                          \
+       POISON_PTR(ptr);                                                      \
+} while (0)
+
+
+#define OBD_VFREE(ptr, size)                           \
+       do {                                            \
+               OBD_FREE_PRE(ptr, size, "vfreed");      \
+               vfree(ptr);                     \
+               POISON_PTR(ptr);                        \
+       } while (0)
+
+/* We memset() the slab object to 0 when allocation succeeds, so DO NOT
+ * HAVE A CTOR THAT DOES ANYTHING: its work would be wiped out here.  We'd
+ * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */
+#define OBD_SLAB_FREE_RTN0(ptr, slab)                                   \
+({                                                                         \
+       kmem_cache_free((slab), (ptr));                             \
+       (ptr) = NULL;                                                    \
+       0;                                                                  \
+})
+
+#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type)          \
+do {                                                                         \
+       LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt()));         \
+       (ptr) = (cptab) == NULL ?                                             \
+               kmem_cache_alloc(slab, type | __GFP_ZERO) :             \
+               kmem_cache_alloc_node(slab, type | __GFP_ZERO,          \
+                                     cfs_cpt_spread_node(cptab, cpt)); \
+       if (likely((ptr) != NULL &&                                        \
+                  (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 ||       \
+                   !obd_alloc_fail(ptr, #ptr, "slab-", size,            \
+                                   __FILE__, __LINE__) ||                  \
+                   OBD_SLAB_FREE_RTN0(ptr, slab)))) {                  \
+               OBD_ALLOC_POST(ptr, size, "slab-alloced");                  \
+       }                                                                    \
+} while (0)
+
+#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags)                           \
+       __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags)
+#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags)           \
+       __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags)
+
+#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr))
+
+#define OBD_SLAB_FREE(ptr, slab, size)                                 \
+do {                                                                     \
+       OBD_FREE_PRE(ptr, size, "slab-freed");                          \
+       kmem_cache_free(slab, ptr);                                     \
+       POISON_PTR(ptr);                                                      \
+} while (0)
+
+#define OBD_SLAB_ALLOC(ptr, slab, size)                                              \
+       OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO)
+
+#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size)                              \
+       OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO)
+
+#define OBD_SLAB_ALLOC_PTR(ptr, slab)                                        \
+       OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr))
+
+#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt)                        \
+       OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr))
+
+#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags)                             \
+       OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags)
+
+#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags)                     \
+       OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags)
+
+#define OBD_SLAB_FREE_PTR(ptr, slab)                                         \
+       OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr))
+
+#define KEY_IS(str) \
+       (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0)
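+
+/* KEY_IS() assumes variables named 'key' and 'keylen' are in scope at the
+ * call site, as in the obd get_info/set_info handlers, e.g. (illustrative):
+ *
+ *     if (KEY_IS(KEY_READONLY))
+ *             ... handle the "read-only" key ...
+ */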
+
+/* Wrapper for contiguous page frame allocation */
+#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)                  \
+do {                                                                         \
+       (ptr) = (cptab) == NULL ?                                             \
+               alloc_page(gfp_mask) :                                \
+               alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\
+       if (unlikely((ptr) == NULL)) {                                  \
+               CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\
+                      "failed\n", (int)1,                                  \
+                      (__u64)(1 << PAGE_CACHE_SHIFT));                  \
+               CERROR(LPU64" total bytes and "LPU64" total pages "        \
+                      "("LPU64" bytes) allocated by Lustre, "          \
+                      "%d total bytes by LNET\n",                          \
+                      obd_memory_sum(),                                      \
+                      obd_pages_sum() << PAGE_CACHE_SHIFT,                  \
+                      obd_pages_sum(),                                \
+                      atomic_read(&libcfs_kmemory));                \
+       } else {                                                              \
+               obd_pages_add(0);                                            \
+               CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / "      \
+                      LPU64" bytes at %p.\n",                          \
+                      (int)1,                                          \
+                      (__u64)(1 << PAGE_CACHE_SHIFT), ptr);                \
+       }                                                                    \
+} while (0)
+
+#define OBD_PAGE_ALLOC(ptr, gfp_mask)                                        \
+       __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask)
+#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask)                        \
+       __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask)
+
+#define OBD_PAGE_FREE(ptr)                                                 \
+do {                                                                     \
+       LASSERT(ptr);                                                    \
+       obd_pages_sub(0);                                                    \
+       CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \
+              "at %p.\n",                                                  \
+              (int)1, (__u64)(1 << PAGE_CACHE_SHIFT),                    \
+              ptr);                                                      \
+       __free_page(ptr);                                                  \
+       (ptr) = (void *)0xdeadbeef;                                        \
+} while (0)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c
new file mode 100644 (file)
index 0000000..7f3974b
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+
+#include <lustre_dlm.h>
+#include <lustre_lite.h>
+#include <lustre_mdc.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+
+#include "cl_object.h"
+#include "lclient.h"
+#include "../llite/llite_internal.h"
+
+static const struct cl_lock_descr whole_file = {
+       .cld_start = 0,
+       .cld_end   = CL_PAGE_EOF,
+       .cld_mode  = CLM_READ
+};
+
+/*
+ * Check whether the file possibly has unwritten pages.
+ *
+ * \retval 1    file is mmap-ed or has dirty pages
+ * \retval 0    otherwise
+ */
+blkcnt_t dirty_cnt(struct inode *inode)
+{
+       blkcnt_t cnt = 0;
+       struct ccc_object *vob = cl_inode2ccc(inode);
+       void          *results[1];
+
+       if (inode->i_mapping != NULL)
+               cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree,
+                                                 results, 0, 1,
+                                                 PAGECACHE_TAG_DIRTY);
+       if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0)
+               cnt = 1;
+
+       return (cnt > 0) ? 1 : 0;
+}
+
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                   struct inode *inode, struct cl_object *clob, int agl)
+{
+       struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+       struct cl_inode_info *lli   = cl_i2info(inode);
+       const struct lu_fid  *fid   = lu_object_fid(&clob->co_lu);
+       struct ccc_io   *cio   = ccc_env_io(env);
+       struct cl_lock       *lock;
+       int result;
+
+       ENTRY;
+       result = 0;
+       if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) {
+               CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+               if (lli->lli_has_smd) {
+                       /* NOTE: this looks like a DLM lock request, but it
+                        *       may not be one. Due to the CEF_ASYNC flag
+                        *       (translated to LDLM_FL_HAS_INTENT by osc),
+                        *       this is a glimpse request that won't revoke
+                        *       any conflicting DLM locks held. Instead,
+                        *       ll_glimpse_callback() will be called on each
+                        *       client holding a DLM lock against this file,
+                        *       and the resulting size will be returned for
+                        *       each stripe. A DLM lock on [0, EOF] is
+                        *       acquired only if there were no conflicting
+                        *       locks; if there were, enqueuing or waiting
+                        *       fails with -ENAVAIL, but valid inode
+                        *       attributes are returned anyway. */
+                       *descr = whole_file;
+                       descr->cld_obj   = clob;
+                       descr->cld_mode  = CLM_PHANTOM;
+                       descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
+                       if (agl)
+                               descr->cld_enq_flags |= CEF_AGL;
+                       cio->cui_glimpse = 1;
+                       /*
+                        * CEF_ASYNC is used because glimpse sub-locks cannot
+                        * deadlock (because they never conflict with other
+                        * locks) and, hence, can be enqueued out-of-order.
+                        *
+                        * CEF_MUST protects glimpse lock from conversion into
+                        * a lockless mode.
+                        */
+                       lock = cl_lock_request(env, io, descr, "glimpse",
+                                              current);
+                       cio->cui_glimpse = 0;
+
+                       if (lock == NULL)
+                               RETURN(0);
+
+                       if (IS_ERR(lock))
+                               RETURN(PTR_ERR(lock));
+
+                       LASSERT(agl == 0);
+                       result = cl_wait(env, lock);
+                       if (result == 0) {
+                               cl_merge_lvb(env, inode);
+                               if (cl_isize_read(inode) > 0 &&
+                                   inode->i_blocks == 0) {
+                                       /*
+                                        * LU-417: Add dirty pages block count
+                                        * lest i_blocks reports 0, some "cp" or
+                                        * "tar" may think it's a completely
+                                        * sparse file and skip it.
+                                        */
+                                       inode->i_blocks = dirty_cnt(inode);
+                               }
+                               cl_unuse(env, lock);
+                       }
+                       cl_lock_release(env, lock, "glimpse", current);
+               } else {
+                       CDEBUG(D_DLMTRACE, "No objects for inode\n");
+                       cl_merge_lvb(env, inode);
+               }
+       }
+
+       RETURN(result);
+}
+
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+                    struct cl_io **ioout, int *refcheck)
+{
+       struct lu_env     *env;
+       struct cl_io       *io;
+       struct cl_inode_info   *lli = cl_i2info(inode);
+       struct cl_object       *clob = lli->lli_clob;
+       int result;
+
+       if (S_ISREG(cl_inode_mode(inode))) {
+               env = cl_env_get(refcheck);
+               if (!IS_ERR(env)) {
+                       io = ccc_env_thread_io(env);
+                       io->ci_obj = clob;
+                       *envout = env;
+                       *ioout  = io;
+                       result = +1;
+               } else
+                       result = PTR_ERR(env);
+       } else
+               result = 0;
+       return result;
+}
+
+int cl_glimpse_size0(struct inode *inode, int agl)
+{
+       /*
+        * We don't need ast_flags argument to cl_glimpse_size(), because
+        * osc_lock_enqueue() takes care of the possible deadlock that said
+        * argument was introduced to avoid.
+        */
+       /*
+        * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+        * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+        * blocking anyway.
+        */
+       struct lu_env     *env = NULL;
+       struct cl_io       *io  = NULL;
+       int                  result;
+       int                  refcheck;
+
+       ENTRY;
+
+       result = cl_io_get(inode, &env, &io, &refcheck);
+       if (result > 0) {
+       again:
+               io->ci_verify_layout = 1;
+               result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+               if (result > 0)
+                       /*
+                        * Nothing to do for this io; this currently happens
+                        * when stripe sub-objects are not yet created.
+                        */
+                       result = io->ci_result;
+               else if (result == 0)
+                       result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+                                                agl);
+
+               OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
+               cl_io_fini(env, io);
+               if (unlikely(io->ci_need_restart))
+                       goto again;
+               cl_env_put(env, &refcheck);
+       }
+       RETURN(result);
+}
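+
+/* Callers typically invoke this with agl = 0 for a plain glimpse and agl = 1
+ * for asynchronous glimpse locks (AGL); a sketch of the expected call
+ * pattern:
+ *
+ *     rc = cl_glimpse_size0(inode, 0);        // synchronous glimpse
+ */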
+
+int cl_local_size(struct inode *inode)
+{
+       struct lu_env      *env = NULL;
+       struct cl_io        *io  = NULL;
+       struct ccc_thread_info  *cti;
+       struct cl_object        *clob;
+       struct cl_lock_descr    *descr;
+       struct cl_lock    *lock;
+       int                   result;
+       int                   refcheck;
+
+       ENTRY;
+
+       if (!cl_i2info(inode)->lli_has_smd)
+               RETURN(0);
+
+       result = cl_io_get(inode, &env, &io, &refcheck);
+       if (result <= 0)
+               RETURN(result);
+
+       clob = io->ci_obj;
+       result = cl_io_init(env, io, CIT_MISC, clob);
+       if (result > 0)
+               result = io->ci_result;
+       else if (result == 0) {
+               cti = ccc_env_info(env);
+               descr = &cti->cti_descr;
+
+               *descr = whole_file;
+               descr->cld_obj = clob;
+               lock = cl_lock_peek(env, io, descr, "localsize", current);
+               if (lock != NULL) {
+                       cl_merge_lvb(env, inode);
+                       cl_unuse(env, lock);
+                       cl_lock_release(env, lock, "localsize", current);
+                       result = 0;
+               } else
+                       result = -ENODATA;
+       }
+       cl_io_fini(env, io);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
new file mode 100644 (file)
index 0000000..4a01666
--- /dev/null
@@ -0,0 +1,1325 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include <lclient.h>
+
+#include "../llite/llite_internal.h"
+
+const struct cl_req_operations ccc_req_ops;
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static struct kmem_cache *ccc_lock_kmem;
+static struct kmem_cache *ccc_object_kmem;
+static struct kmem_cache *ccc_thread_kmem;
+static struct kmem_cache *ccc_session_kmem;
+static struct kmem_cache *ccc_req_kmem;
+
+static struct lu_kmem_descr ccc_caches[] = {
+       {
+               .ckd_cache = &ccc_lock_kmem,
+               .ckd_name  = "ccc_lock_kmem",
+               .ckd_size  = sizeof (struct ccc_lock)
+       },
+       {
+               .ckd_cache = &ccc_object_kmem,
+               .ckd_name  = "ccc_object_kmem",
+               .ckd_size  = sizeof (struct ccc_object)
+       },
+       {
+               .ckd_cache = &ccc_thread_kmem,
+               .ckd_name  = "ccc_thread_kmem",
+               .ckd_size  = sizeof (struct ccc_thread_info),
+       },
+       {
+               .ckd_cache = &ccc_session_kmem,
+               .ckd_name  = "ccc_session_kmem",
+               .ckd_size  = sizeof (struct ccc_session)
+       },
+       {
+               .ckd_cache = &ccc_req_kmem,
+               .ckd_name  = "ccc_req_kmem",
+               .ckd_size  = sizeof (struct ccc_req)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/*****************************************************************************
+ *
+ * ccc device and device type functions.
+ *
+ */
+
+void *ccc_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct ccc_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+void ccc_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct ccc_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, ccc_thread_kmem);
+}
+
+void *ccc_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct ccc_session *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+void ccc_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct ccc_session *session = data;
+       OBD_SLAB_FREE_PTR(session, ccc_session_kmem);
+}
+
+struct lu_context_key ccc_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = ccc_key_init,
+       .lct_fini = ccc_key_fini
+};
+
+struct lu_context_key ccc_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = ccc_session_key_init,
+       .lct_fini = ccc_session_key_fini
+};
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key);
+
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       struct ccc_device  *vdv;
+       int rc;
+       ENTRY;
+
+       vdv = lu2ccc_dev(d);
+       vdv->cdv_next = lu2cl_dev(next);
+
+       LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+       next->ld_site = d->ld_site;
+       rc = next->ld_type->ldt_ops->ldto_device_init(
+                       env, next, next->ld_type->ldt_name, NULL);
+       if (rc == 0) {
+               lu_device_get(next);
+               lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+       }
+       RETURN(rc);
+}
+
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       return cl2lu_dev(lu2ccc_dev(d)->cdv_next);
+}
+
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                  struct lu_device_type *t,
+                                  struct lustre_cfg *cfg,
+                                  const struct lu_device_operations *luops,
+                                  const struct cl_device_operations *clops)
+{
+       struct ccc_device *vdv;
+       struct lu_device  *lud;
+       struct cl_site    *site;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(vdv);
+       if (vdv == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lud = &vdv->cdv_cl.cd_lu_dev;
+       cl_device_init(&vdv->cdv_cl, t);
+       ccc2lu_dev(vdv)->ld_ops = luops;
+       vdv->cdv_cl.cd_ops = clops;
+
+       OBD_ALLOC_PTR(site);
+       if (site != NULL) {
+               rc = cl_site_init(site, &vdv->cdv_cl);
+               if (rc == 0)
+                       rc = lu_site_init_finish(&site->cs_lu);
+               else {
+                       LASSERT(lud->ld_site == NULL);
+                       CERROR("Cannot init lu_site, rc %d.\n", rc);
+                       OBD_FREE_PTR(site);
+               }
+       } else
+               rc = -ENOMEM;
+       if (rc != 0) {
+               ccc_device_free(env, lud);
+               lud = ERR_PTR(rc);
+       }
+       RETURN(lud);
+}
+
+struct lu_device *ccc_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct ccc_device *vdv  = lu2ccc_dev(d);
+       struct cl_site    *site = lu2cl_site(d->ld_site);
+       struct lu_device  *next = cl2lu_dev(vdv->cdv_next);
+
+       if (d->ld_site != NULL) {
+               cl_site_fini(site);
+               OBD_FREE_PTR(site);
+       }
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(vdv);
+       return next;
+}
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                       struct cl_req *req)
+{
+       struct ccc_req *vrq;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO);
+       if (vrq != NULL) {
+               cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+/**
+ * An `emergency' environment used by cl_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by the
+ * ccc_inode_fini_guard mutex.
+ */
+static struct lu_env *ccc_inode_fini_env = NULL;
+
+/**
+ * A mutex serializing calls to cl_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck;
+
+int ccc_global_init(struct lu_device_type *device_type)
+{
+       int result;
+
+       result = lu_kmem_init(ccc_caches);
+       if (result)
+               return result;
+
+       result = lu_device_type_init(device_type);
+       if (result)
+               goto out_kmem;
+
+       ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+                                         LCT_REMEMBER|LCT_NOREF);
+       if (IS_ERR(ccc_inode_fini_env)) {
+               result = PTR_ERR(ccc_inode_fini_env);
+               goto out_device;
+       }
+
+       ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+       return 0;
+out_device:
+       lu_device_type_fini(device_type);
+out_kmem:
+       lu_kmem_fini(ccc_caches);
+       return result;
+}
+
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+       if (ccc_inode_fini_env != NULL) {
+               cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+               ccc_inode_fini_env = NULL;
+       }
+       lu_device_type_fini(device_type);
+       lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev,
+                                  const struct cl_object_operations *clops,
+                                  const struct lu_object_operations *luops)
+{
+       struct ccc_object *vob;
+       struct lu_object  *obj;
+
+       OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO);
+       if (vob != NULL) {
+               struct cl_object_header *hdr;
+
+               obj = ccc2lu(vob);
+               hdr = &vob->cob_header;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+
+               vob->cob_cl.co_ops = clops;
+               obj->lo_ops = luops;
+       } else
+               obj = NULL;
+       return obj;
+}
+
+int ccc_object_init0(const struct lu_env *env,
+                           struct ccc_object *vob,
+                           const struct cl_object_conf *conf)
+{
+       vob->cob_inode = conf->coc_inode;
+       vob->cob_transient_pages = 0;
+       cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
+       return 0;
+}
+
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf)
+{
+       struct ccc_device *dev = lu2ccc_dev(obj->lo_dev);
+       struct ccc_object *vob = lu2ccc(obj);
+       struct lu_object  *below;
+       struct lu_device  *under;
+       int result;
+
+       under = &dev->cdv_next->cd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+       if (below != NULL) {
+               const struct cl_object_conf *cconf;
+
+               cconf = lu2cl_conf(conf);
+               INIT_LIST_HEAD(&vob->cob_pending_list);
+               lu_object_add(obj, below);
+               result = ccc_object_init0(env, vob, cconf);
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct ccc_object *vob = lu2ccc(obj);
+
+       lu_object_fini(obj);
+       lu_object_header_fini(obj->lo_header);
+       OBD_SLAB_FREE_PTR(vob, ccc_object_kmem);
+}
+
+int ccc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *unused,
+                 const struct cl_lock_operations *lkops)
+{
+       struct ccc_lock *clk;
+       int result;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO);
+       if (clk != NULL) {
+               cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid)
+{
+       return 0;
+}
+
+int ccc_object_glimpse(const struct lu_env *env,
+                      const struct cl_object *obj, struct ost_lvb *lvb)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       ENTRY;
+       lvb->lvb_mtime = cl_inode_mtime(inode);
+       lvb->lvb_atime = cl_inode_atime(inode);
+       lvb->lvb_ctime = cl_inode_ctime(inode);
+       /*
+        * LU-417: add the dirty page block count lest i_blocks report 0;
+        * otherwise "cp" or "tar" on a remote node may take the file for a
+        * completely sparse one and skip it.
+        */
+       if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+               lvb->lvb_blocks = dirty_cnt(inode);
+       RETURN(0);
+}
+
+
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_object_conf *conf)
+{
+       /* TODO: destroy all pages attached to this object. */
+       return 0;
+}
+
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       cl_isize_lock(inode);
+       cl_object_attr_lock(obj);
+}
+
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       cl_object_attr_unlock(obj);
+       cl_isize_unlock(inode);
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+struct page *ccc_page_vmpage(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       return cl2vm_page(slice);
+}
+
+int ccc_page_is_under_lock(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          struct cl_io *io)
+{
+       struct ccc_io        *cio  = ccc_env_io(env);
+       struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
+       struct cl_page       *page = slice->cpl_page;
+       int                   result;
+
+       ENTRY;
+
+       if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+           io->ci_type == CIT_FAULT) {
+               if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+                       result = -EBUSY;
+               else {
+                       desc->cld_start = page->cp_index;
+                       desc->cld_end   = page->cp_index;
+                       desc->cld_obj   = page->cp_obj;
+                       desc->cld_mode  = CLM_READ;
+                       result = cl_queue_match(&io->ci_lockset.cls_done,
+                                               desc) ? -EBUSY : 0;
+               }
+       } else
+               result = 0;
+       RETURN(result);
+}
+
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+       /*
+        * Cached read?
+        */
+       LBUG();
+       return 0;
+}
+
+void ccc_transient_page_verify(const struct cl_page *page)
+{
+}
+
+int ccc_transient_page_own(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *unused,
+                                  int nonblock)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+       return 0;
+}
+
+void ccc_transient_page_assume(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_disown(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       ccc_transient_page_verify(slice->cpl_page);
+}
+
+void ccc_transient_page_discard(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *unused)
+{
+       struct cl_page *page = slice->cpl_page;
+
+       ccc_transient_page_verify(slice->cpl_page);
+
+       /*
+        * For a transient page, remove it from the radix tree.
+        */
+       cl_page_delete(env, page);
+}
+
+int ccc_transient_page_prep(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *unused)
+{
+       ENTRY;
+       /* transient page should always be sent. */
+       RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+void ccc_lock_delete(const struct lu_env *env,
+                    const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+}
+
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+       struct ccc_lock *clk = cl2ccc_lock(slice);
+       OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem);
+}
+
+int ccc_lock_enqueue(const struct lu_env *env,
+                    const struct cl_lock_slice *slice,
+                    struct cl_io *unused, __u32 enqflags)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+       CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+       return 0;
+}
+
+/**
+ * Implementation of the cl_lock_operations::clo_fits_into() method for the
+ * ccc layer. This function is executed every time an io finds an existing
+ * lock in the lock cache while creating a new lock, and has to decide
+ * whether the cached lock "fits" the io.
+ *
+ * \param slice lock to be checked
+ * \param io    IO that wants a lock.
+ *
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+                      const struct cl_lock_slice *slice,
+                      const struct cl_lock_descr *need,
+                      const struct cl_io *io)
+{
+       const struct cl_lock       *lock  = slice->cls_lock;
+       const struct cl_lock_descr *descr = &lock->cll_descr;
+       const struct ccc_io        *cio   = ccc_env_io(env);
+       int                         result;
+
+       ENTRY;
+       /*
+        * Work around a DLM peculiarity: it assumes that a glimpse
+        * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read
+        * lock when asked for an LCK_PW lock with LDLM_FL_HAS_INTENT set.
+        * Make sure that glimpse doesn't get a CLM_WRITE top-lock, so that
+        * it doesn't enqueue CLM_WRITE sub-locks.
+        */
+       if (cio->cui_glimpse)
+               result = descr->cld_mode != CLM_WRITE;
+
+       /*
+        * Also, don't match incomplete write locks for read, otherwise read
+        * would enqueue missing sub-locks in the write mode.
+        */
+       else if (need->cld_mode != descr->cld_mode)
+               result = lock->cll_state >= CLS_ENQUEUED;
+       else
+               result = 1;
+       RETURN(result);
+}
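+
+/*
+ * A hypothetical walk-through of the branches above: a glimpse io
+ * (cui_glimpse set) that finds a cached CLM_WRITE lock gets result == 0,
+ * i.e. the cached lock does not fit, and the glimpse enqueues its own
+ * non-CLM_WRITE top-lock instead.
+ */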
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the ccc layer,
+ * invoked whenever the lock state changes. Transfers object attributes that
+ * might have been updated as a result of lock acquisition into the inode.
+ */
+void ccc_lock_state(const struct lu_env *env,
+                   const struct cl_lock_slice *slice,
+                   enum cl_lock_state state)
+{
+       struct cl_lock *lock = slice->cls_lock;
+       ENTRY;
+
+       /*
+        * Refresh inode attributes when the lock is moving into CLS_HELD
+        * state, and only when this is a result of real enqueue, rather than
+        * of finding lock in the cache.
+        */
+       if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+               struct cl_object *obj;
+               struct inode     *inode;
+
+               obj   = slice->cls_obj;
+               inode = ccc_object_inode(obj);
+
+               /* vmtruncate() sets the i_size
+                * under both a DLM lock and the
+                * ll_inode_size_lock().  If we don't get the
+                * ll_inode_size_lock() here we can match the DLM lock and
+                * reset i_size.  generic_file_write can then trust the
+                * stale i_size when doing appending writes and effectively
+                * cancel the result of the truncate.  Getting the
+                * ll_inode_size_lock() after the enqueue maintains the DLM
+                * -> ll_inode_size_lock() acquiring order. */
+               if (lock->cll_descr.cld_start == 0 &&
+                   lock->cll_descr.cld_end == CL_PAGE_EOF)
+                       cl_merge_lvb(env, inode);
+       }
+       EXIT;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct cl_io *io = ios->cis_io;
+
+       CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+}
+
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                         __u32 enqflags, enum cl_lock_mode mode,
+                         pgoff_t start, pgoff_t end)
+{
+       struct ccc_io          *cio   = ccc_env_io(env);
+       struct cl_lock_descr   *descr = &cio->cui_link.cill_descr;
+       struct cl_object       *obj   = io->ci_obj;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
+
+       memset(&cio->cui_link, 0, sizeof cio->cui_link);
+
+       if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+               descr->cld_mode = CLM_GROUP;
+               descr->cld_gid  = cio->cui_fd->fd_grouplock.cg_gid;
+       } else {
+               descr->cld_mode  = mode;
+       }
+       descr->cld_obj   = obj;
+       descr->cld_start = start;
+       descr->cld_end   = end;
+       descr->cld_enq_flags = enqflags;
+
+       cl_io_lock_add(env, io, &cio->cui_link);
+       RETURN(0);
+}
+
+void ccc_io_update_iov(const struct lu_env *env,
+                      struct ccc_io *cio, struct cl_io *io)
+{
+       int i;
+       size_t size = io->u.ci_rw.crw_count;
+
+       cio->cui_iov_olen = 0;
+       if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0)
+               return;
+
+       for (i = 0; i < cio->cui_tot_nrsegs; i++) {
+               struct iovec *iv = &cio->cui_iov[i];
+
+               if (iv->iov_len < size)
+                       size -= iv->iov_len;
+               else {
+                       if (iv->iov_len > size) {
+                               cio->cui_iov_olen = iv->iov_len;
+                               iv->iov_len = size;
+                       }
+                       break;
+               }
+       }
+
+       cio->cui_nrsegs = i + 1;
+       LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs,
+                "tot_nrsegs: %lu, nrsegs: %lu\n",
+                cio->cui_tot_nrsegs, cio->cui_nrsegs);
+}
+
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                   __u32 enqflags, enum cl_lock_mode mode,
+                   loff_t start, loff_t end)
+{
+       struct cl_object *obj = io->ci_obj;
+       return ccc_io_one_lock_index(env, io, enqflags, mode,
+                                    cl_index(obj, start), cl_index(obj, end));
+}
+
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       CLOBINVRNT(env, ios->cis_io->ci_obj,
+                  ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+void ccc_io_advance(const struct lu_env *env,
+                   const struct cl_io_slice *ios,
+                   size_t nob)
+{
+       struct ccc_io    *cio = cl2ccc_io(env, ios);
+       struct cl_io     *io  = ios->cis_io;
+       struct cl_object *obj = ios->cis_io->ci_obj;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       if (!cl_is_normalio(env, io))
+               return;
+
+       LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
+       LASSERT(cio->cui_tot_count  >= nob);
+
+       cio->cui_iov    += cio->cui_nrsegs;
+       cio->cui_tot_nrsegs -= cio->cui_nrsegs;
+       cio->cui_tot_count  -= nob;
+
+       /* update the iov */
+       if (cio->cui_iov_olen > 0) {
+               struct iovec *iv;
+
+               cio->cui_iov--;
+               cio->cui_tot_nrsegs++;
+               iv = &cio->cui_iov[0];
+               if (io->ci_continue) {
+                       iv->iov_base += iv->iov_len;
+                       LASSERT(cio->cui_iov_olen > iv->iov_len);
+                       iv->iov_len = cio->cui_iov_olen - iv->iov_len;
+               } else {
+                       /* restore the iov_len, in case of restart io. */
+                       iv->iov_len = cio->cui_iov_olen;
+               }
+               cio->cui_iov_olen = 0;
+       }
+}
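+
+/*
+ * A worked example with hypothetical numbers: for an iovec of two
+ * 4096-byte segments and crw_count == 6000, ccc_io_update_iov() trims the
+ * second segment to 1904 bytes and records cui_iov_olen = 4096. Once the
+ * chunk completes, ccc_io_advance() steps back to that segment and, if the
+ * io continues, resumes at iov_base + 1904 with the remaining 2192 bytes.
+ */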
+
+/**
+ * Helper function that, if necessary, adjusts the file size (inode->i_size)
+ * when the position at offset \a pos is accessed. The file size can be
+ * arbitrarily stale on a Lustre client, but the client at least knows the
+ * KMS (known minimum size). If the accessed area is inside [0, KMS], set
+ * the file size to KMS; otherwise glimpse the file size.
+ *
+ * Locking: cl_isize_lock is used to serialize changes to inode size and to
+ * protect consistency between inode size and cl_object
+ * attributes. cl_object_size_lock() protects consistency between cl_attr's of
+ * top-object and sub-objects.
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_io *io, loff_t start, size_t count, int *exceed)
+{
+       struct cl_attr *attr  = ccc_env_thread_attr(env);
+       struct inode   *inode = ccc_object_inode(obj);
+       loff_t          pos   = start + count - 1;
+       loff_t kms;
+       int result;
+
+       /*
+        * Consistency guarantees: following possibilities exist for the
+        * relation between region being accessed and real file size at this
+        * moment:
+        *
+        *  (A): the region is completely inside of the file;
+        *
+        *  (B-x): x bytes of region are inside of the file, the rest is
+        *  outside;
+        *
+        *  (C): the region is completely outside of the file.
+        *
+        * This classification is stable under DLM lock already acquired by
+        * the caller, because to change the class, other client has to take
+        * DLM lock conflicting with our lock. Also, any updates to ->i_size
+        * by other threads on this client are serialized by
+        * ll_inode_size_lock(). This guarantees that short reads are handled
+        * correctly in the face of concurrent writes and truncates.
+        */
+       ccc_object_size_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       if (result == 0) {
+               kms = attr->cat_kms;
+               if (pos > kms) {
+                       /*
+                        * A glimpse is necessary to determine whether we
+                        * return a short read (B) or some zeroes at the end
+                        * of the buffer (C)
+                        */
+                       ccc_object_size_unlock(obj);
+                       result = cl_glimpse_lock(env, io, inode, obj, 0);
+                       if (result == 0 && exceed != NULL) {
+                               /* If the requested page index exceeds the
+                                * end-of-file page index, return directly;
+                                * do not expect the kernel to handle this
+                                * case correctly (linux-2.6.18-128.1.1
+                                * fails to do so). --bug 17336 */
+                               loff_t size = cl_isize_read(inode);
+                               unsigned long cur_index = start >> PAGE_CACHE_SHIFT;
+
+                               if ((size == 0 && cur_index != 0) ||
+                                   (((size - 1) >> PAGE_CACHE_SHIFT) < cur_index))
+                                       *exceed = 1;
+                       }
+                       return result;
+               } else {
+                       /*
+                        * region is within kms and, hence, within real file
+                        * size (A). We need to increase i_size to cover the
+                        * read region so that generic_file_read() will do its
+                        * job, but that doesn't mean the kms size is
+                        * _correct_, it is only the _minimum_ size. If
+                        * someone does a stat they will get the correct size
+                        * which will always be >= the kms value here.
+                        * b=11081
+                        */
+                       if (cl_isize_read(inode) < kms) {
+                               cl_isize_write_nolock(inode, kms);
+                               CDEBUG(D_VFSTRACE,
+                                      DFID" updating i_size "LPU64"\n",
+                                      PFID(lu_object_fid(&obj->co_lu)),
+                                      (__u64)cl_isize_read(inode));
+
+                       }
+               }
+       }
+       ccc_object_size_unlock(obj);
+       return result;
+}
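+
+/*
+ * A numeric illustration with hypothetical values: with kms == 8192, a
+ * read of [0, 4096) is case (A) and at most bumps i_size up to kms; a read
+ * of [4096, 12288) has pos == 12287 > kms and must glimpse to learn
+ * whether it is a short read (B) or lies past end of file (C).
+ */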
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+void ccc_req_completion(const struct lu_env *env,
+                       const struct cl_req_slice *slice, int ioret)
+{
+       struct ccc_req *vrq;
+
+       if (ioret > 0)
+               cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
+       vrq = cl2ccc_req(slice);
+       OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for ccc
+ * layer. ccc is responsible for
+ *
+ *    - o_[mac]time
+ *
+ *    - o_mode
+ *
+ *    - o_parent_seq
+ *
+ *    - o_[ug]id
+ *
+ *    - o_parent_oid
+ *
+ *    - o_parent_ver
+ *
+ *    - o_ioepoch,
+ *
+ *  and capability.
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+                     const struct cl_req_slice *slice,
+                     const struct cl_object *obj,
+                     struct cl_req_attr *attr, obd_valid flags)
+{
+       struct inode *inode;
+       struct obdo  *oa;
+       obd_flag      valid_flags;
+
+       oa = attr->cra_oa;
+       inode = ccc_object_inode(obj);
+       valid_flags = OBD_MD_FLTYPE;
+
+       if ((flags & OBD_MD_FLOSSCAPA) != 0) {
+               LASSERT(attr->cra_capa == NULL);
+               attr->cra_capa = cl_capa_lookup(inode,
+                                               slice->crs_req->crq_type);
+       }
+
+       if (slice->crs_req->crq_type == CRT_WRITE) {
+               if (flags & OBD_MD_FLEPOCH) {
+                       oa->o_valid |= OBD_MD_FLEPOCH;
+                       oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
+                       valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                                      OBD_MD_FLUID | OBD_MD_FLGID;
+               }
+       }
+       obdo_from_inode(oa, inode, valid_flags & flags);
+       obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid);
+       memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid,
+              JOBSTATS_JOBID_SIZE);
+}
+
+const struct cl_req_operations ccc_req_ops = {
+       .cro_attr_set   = ccc_req_attr_set,
+       .cro_completion = ccc_req_completion
+};
+
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+                  struct obd_capa *capa)
+{
+       struct lu_env *env;
+       struct cl_io  *io;
+       int         result;
+       int         refcheck;
+
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = cl_i2info(inode)->lli_clob;
+
+       io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+       io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+       io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+       io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+       io->u.ci_setattr.sa_valid = attr->ia_valid;
+       io->u.ci_setattr.sa_capa = capa;
+
+again:
+       if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
+               struct ccc_io *cio = ccc_env_io(env);
+
+               if (attr->ia_valid & ATTR_FILE)
+                       /* populate the file descriptor for ftruncate to honor
+                        * group lock - see LU-787 */
+                       cio->cui_fd = cl_iattr2fd(inode, attr);
+
+               result = cl_io_loop(env, io);
+       } else {
+               result = io->ci_result;
+       }
+       cl_io_fini(env, io);
+       if (unlikely(io->ci_need_restart))
+               goto again;
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+       return &vdv->cdv_cl.cd_lu_dev;
+}
+
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+       return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+}
+
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+       return container_of0(d, struct ccc_device, cdv_cl);
+}
+
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+       return &vob->cob_cl.co_lu;
+}
+
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+       return container_of0(obj, struct ccc_object, cob_cl.co_lu);
+}
+
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+       return container_of0(obj, struct ccc_object, cob_cl);
+}
+
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+       return container_of(slice, struct ccc_lock, clk_cl);
+}
+
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+                        const struct cl_io_slice *slice)
+{
+       struct ccc_io *cio;
+
+       cio = container_of(slice, struct ccc_io, cui_cl);
+       LASSERT(cio == ccc_env_io(env));
+       return cio;
+}
+
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct ccc_req, crq_cl);
+}
+
+struct page *cl2vm_page(const struct cl_page_slice *slice)
+{
+       return cl2ccc_page(slice)->cpg_page;
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+       struct inode     *inode = ccc_object_inode(obj);
+       struct cl_inode_info *lli   = cl_i2info(inode);
+
+       return (S_ISREG(cl_inode_mode(inode)) ||
+               /* i_mode of unlinked inode is zeroed. */
+               cl_inode_mode(inode) == 0) && lli->lli_clob == obj;
+}
+
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+       return cl2ccc(obj)->cob_inode;
+}
+
+/**
+ * Returns a pointer to cl_page associated with \a vmpage, without acquiring
+ * additional reference to the resulting page. This is an unsafe version of
+ * cl_vmpage_page() that can only be used under vmpage lock.
+ */
+struct cl_page *ccc_vmpage_page_transient(struct page *vmpage)
+{
+       KLASSERT(PageLocked(vmpage));
+       return (struct cl_page *)vmpage->private;
+}
+
+/**
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
+ *
+ * \param inode regular file inode
+ * \param md    new file metadata from MDS
+ *
+ * - allocates cl_object if necessary;
+ * - updates the layout if the object was already here.
+ */
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
+{
+       struct lu_env        *env;
+       struct cl_inode_info *lli;
+       struct cl_object     *clob;
+       struct lu_site       *site;
+       struct lu_fid        *fid;
+       struct cl_object_conf conf = {
+               .coc_inode = inode,
+               .u = {
+                       .coc_md    = md
+               }
+       };
+       int result = 0;
+       int refcheck;
+
+       LASSERT(md->body->valid & OBD_MD_FLID);
+       LASSERT(S_ISREG(cl_inode_mode(inode)));
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+
+       site = cl_i2sbi(inode)->ll_site;
+       lli  = cl_i2info(inode);
+       fid  = &lli->lli_fid;
+       LASSERT(fid_is_sane(fid));
+
+       if (lli->lli_clob == NULL) {
+               /* clob is a slave of the inode: an empty lli_clob means this
+                * is a new inode for which no clob with the given fid is
+                * cached, so it is unnecessary to perform
+                * lookup-alloc-lookup-insert; just alloc and insert
+                * directly. */
+               LASSERT(inode->i_state & I_NEW);
+               conf.coc_lu.loc_flags = LOC_F_NEW;
+               clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+                                     fid, &conf);
+               if (!IS_ERR(clob)) {
+                       /*
+                        * No locking is necessary, as new inode is
+                        * locked by I_NEW bit.
+                        */
+                       lli->lli_clob = clob;
+                       lli->lli_has_smd = md->lsm != NULL;
+                       lu_object_ref_add(&clob->co_lu, "inode", inode);
+               } else
+                       result = PTR_ERR(clob);
+       } else {
+               result = cl_conf_set(env, lli->lli_clob, &conf);
+       }
+
+       cl_env_put(env, &refcheck);
+
+       if (result != 0)
+               CERROR("Failure to initialize cl object "DFID": %d\n",
+                      PFID(fid), result);
+       return result;
+}
+
+/**
+ * Wait for others to drop their references to the object first, then drop
+ * the last one ourselves, which causes the object to be destroyed
+ * immediately. Must be called after cl_object_kill() against this object.
+ *
+ * The reason we want this is that destroying the top object waits for its
+ * sub-objects to be destroyed first, so we can't let the bottom layer
+ * (e.g. from ASTs) initiate destruction of the top object, which may
+ * deadlock. See bz22520.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+       struct lu_object_header *header = obj->co_lu.lo_header;
+       wait_queue_t       waiter;
+
+       if (unlikely(atomic_read(&header->loh_ref) != 1)) {
+               struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+               struct lu_site_bkt_data *bkt;
+
+               bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+               init_waitqueue_entry_current(&waiter);
+               add_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+
+               while (1) {
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       if (atomic_read(&header->loh_ref) == 1)
+                               break;
+                       waitq_wait(&waiter, TASK_UNINTERRUPTIBLE);
+               }
+
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&bkt->lsb_marche_funebre, &waiter);
+       }
+
+       cl_object_put(env, obj);
+}
+
+void cl_inode_fini(struct inode *inode)
+{
+       struct lu_env        *env;
+       struct cl_inode_info *lli  = cl_i2info(inode);
+       struct cl_object     *clob = lli->lli_clob;
+       int refcheck;
+       int emergency;
+
+       if (clob != NULL) {
+               void                *cookie;
+
+               cookie = cl_env_reenter();
+               env = cl_env_get(&refcheck);
+               emergency = IS_ERR(env);
+               if (emergency) {
+                       mutex_lock(&ccc_inode_fini_guard);
+                       LASSERT(ccc_inode_fini_env != NULL);
+                       cl_env_implant(ccc_inode_fini_env, &refcheck);
+                       env = ccc_inode_fini_env;
+               }
+               /*
+                * The cl_object cache is a slave to the inode cache (which,
+                * in turn, is a slave to the dentry cache); don't keep a
+                * cl_object in memory once its master has been evicted.
+                */
+               cl_object_kill(env, clob);
+               lu_object_ref_del(&clob->co_lu, "inode", inode);
+               cl_object_put_last(env, clob);
+               lli->lli_clob = NULL;
+               if (emergency) {
+                       cl_env_unplant(ccc_inode_fini_env, &refcheck);
+                       mutex_unlock(&ccc_inode_fini_guard);
+               } else
+                       cl_env_put(env, &refcheck);
+               cl_env_reexit(cookie);
+       }
+}
+
+/**
+ * Return the IF_* type for a given lu_dirent entry.
+ * The IF_* flag should be converted to the particular OS file type in the
+ * platform llite module.
+ */
+__u16 ll_dirent_type_get(struct lu_dirent *ent)
+{
+       __u16 type = 0;
+       struct luda_type *lt;
+       int len = 0;
+
+       if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
+               const unsigned align = sizeof(struct luda_type) - 1;
+
+               len = le16_to_cpu(ent->lde_namelen);
+               len = (len + align) & ~align;
+               lt = (void *)ent->lde_name + len;
+               type = IFTODT(le16_to_cpu(lt->lt_type));
+       }
+       return type;
+}
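+
+/*
+ * For example, assuming sizeof(struct luda_type) == 2: a name of length 5
+ * gives align == 1, so len is rounded up to 6 and the luda_type record
+ * starts on the next 2-byte boundary after the name.
+ */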
+
+/**
+ * Build the inode number from the passed @fid.
+ */
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
+{
+       if (BITS_PER_LONG == 32 || api32)
+               RETURN(fid_flatten32(fid));
+       else
+               RETURN(fid_flatten(fid));
+}
+
+/**
+ * Build the inode generation from the passed @fid.  If our FID overflows
+ * the 32-bit inode number then return a non-zero generation to distinguish
+ * them.
+ */
+__u32 cl_fid_build_gen(const struct lu_fid *fid)
+{
+       __u32 gen;
+       ENTRY;
+
+       if (fid_is_igif(fid)) {
+               gen = lu_igif_gen(fid);
+               RETURN(gen);
+       }
+
+       gen = (fid_flatten(fid) >> 32);
+       RETURN(gen);
+}
+
+/* The lsm is unreliable after the HSM implementation, as the layout can be
+ * changed at any time. This is only to support old, non-clio-ized
+ * interfaces. Calling clio operations while holding this extra layout
+ * refcount can deadlock, because if the layout changes during the IO,
+ * ll_layout_refresh() has to wait for the refcount to drop to zero before
+ * it can destroy the older layout.
+ *
+ * Notice that the lsm returned by this function may not be valid unless
+ * called inside the layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+       return lov_lsm_get(cl_i2info(inode)->lli_clob);
+}
+
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+       lov_lsm_put(cl_i2info(inode)->lli_clob, lsm);
+}
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c
new file mode 100644 (file)
index 0000000..8ecbef9
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <cl_object.h>
+#include <lclient.h>
+
+#include <lustre_lite.h>
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
+{
+       struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
+       __u32 valsize = sizeof(struct lov_desc);
+       int rc, easize, def_easize, cookiesize;
+       struct lov_desc desc;
+       __u16 stripes;
+       ENTRY;
+
+       rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC,
+                         &valsize, &desc, NULL);
+       if (rc)
+               RETURN(rc);
+
+       stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+       lsm.lsm_stripe_count = stripes;
+       easize = obd_size_diskmd(dt_exp, &lsm);
+
+       lsm.lsm_stripe_count = desc.ld_default_stripe_count;
+       def_easize = obd_size_diskmd(dt_exp, &lsm);
+
+       cookiesize = stripes * sizeof(struct llog_cookie);
+
+       CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+              easize, cookiesize);
+
+       rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize);
+       RETURN(rc);
+}
+
+/**
+ * This function is used as an upcall callback, hooked by liblustre and
+ * llite clients into the obd_notify() listeners chain, to handle
+ * notifications about changes of the import connect_flags. See
+ * llu_fsswop_mount() and lustre_common_fill_super().
+ */
+int cl_ocd_update(struct obd_device *host,
+                 struct obd_device *watched,
+                 enum obd_notify_event ev, void *owner, void *data)
+{
+       struct lustre_client_ocd *lco;
+       struct client_obd       *cli;
+       __u64 flags;
+       int   result;
+
+       ENTRY;
+       if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+               cli = &watched->u.cli;
+               lco = owner;
+               flags = cli->cl_import->imp_connect_data.ocd_connect_flags;
+               CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n",
+                      lco->lco_flags, flags);
+               mutex_lock(&lco->lco_lock);
+               lco->lco_flags &= flags;
+               /* for each osc event update ea size */
+               if (lco->lco_dt_exp)
+                       cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp);
+
+               mutex_unlock(&lco->lco_lock);
+               result = 0;
+       } else {
+               CERROR("unexpected notification from %s %s!\n",
+                      watched->obd_type->typ_name,
+                      watched->obd_name);
+               result = -EINVAL;
+       }
+       RETURN(result);
+}
+
+#define GROUPLOCK_SCOPE "grouplock"
+
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                    struct ccc_grouplock *cg)
+{
+       struct lu_env        *env;
+       struct cl_io         *io;
+       struct cl_lock       *lock;
+       struct cl_lock_descr *descr;
+       __u32                 enqflags;
+       int                   refcheck;
+       int                   rc;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return PTR_ERR(env);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = obj;
+       io->ci_ignore_layout = 1;
+
+       rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (rc) {
+               LASSERT(rc < 0);
+               cl_env_put(env, &refcheck);
+               return rc;
+       }
+
+       descr = &ccc_env_info(env)->cti_descr;
+       descr->cld_obj = obj;
+       descr->cld_start = 0;
+       descr->cld_end = CL_PAGE_EOF;
+       descr->cld_gid = gid;
+       descr->cld_mode = CLM_GROUP;
+
+       enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+       descr->cld_enq_flags = enqflags;
+
+       lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current);
+       if (IS_ERR(lock)) {
+               cl_io_fini(env, io);
+               cl_env_put(env, &refcheck);
+               return PTR_ERR(lock);
+       }
+
+       cg->cg_env  = cl_env_get(&refcheck);
+       cg->cg_io   = io;
+       cg->cg_lock = lock;
+       cg->cg_gid  = gid;
+       LASSERT(cg->cg_env == env);
+
+       cl_env_unplant(env, &refcheck);
+       return 0;
+}
+
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+       struct lu_env  *env  = cg->cg_env;
+       struct cl_io   *io   = cg->cg_io;
+       struct cl_lock *lock = cg->cg_lock;
+       int          refcheck;
+
+       LASSERT(cg->cg_env);
+       LASSERT(cg->cg_gid);
+
+       cl_env_implant(env, &refcheck);
+       cl_env_put(env, &refcheck);
+
+       cl_unuse(env, lock);
+       cl_lock_release(env, lock, GROUPLOCK_SCOPE, current);
+       cl_io_fini(env, io);
+       cl_env_put(env, NULL);
+}
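+
+/*
+ * A minimal usage sketch (hypothetical caller) pairing the two helpers
+ * above around some group-locked IO:
+ *
+ *     struct ccc_grouplock cg;
+ *     int rc = cl_get_grouplock(obj, gid, 0, &cg);
+ *
+ *     if (rc == 0) {
+ *             ... perform IO under the group lock ...
+ *             cl_put_grouplock(&cg);
+ *     }
+ */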
diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c
new file mode 100644 (file)
index 0000000..ce90c7e
--- /dev/null
@@ -0,0 +1,764 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/interval_tree.c
+ *
+ * Interval tree library used by ldlm extent lock code
+ *
+ * Author: Huang Wei <huangwei@clusterfs.com>
+ * Author: Jay Xiong <jinshan.xiong@sun.com>
+ */
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <interval_tree.h>
+
+enum {
+       INTERVAL_RED = 0,
+       INTERVAL_BLACK = 1
+};
+
+static inline int node_is_left_child(struct interval_node *node)
+{
+       LASSERT(node->in_parent != NULL);
+       return node == node->in_parent->in_left;
+}
+
+static inline int node_is_right_child(struct interval_node *node)
+{
+       LASSERT(node->in_parent != NULL);
+       return node == node->in_parent->in_right;
+}
+
+static inline int node_is_red(struct interval_node *node)
+{
+       return node->in_color == INTERVAL_RED;
+}
+
+static inline int node_is_black(struct interval_node *node)
+{
+       return node->in_color == INTERVAL_BLACK;
+}
+
+static inline int extent_compare(struct interval_node_extent *e1,
+                                struct interval_node_extent *e2)
+{
+       int rc;
+       if (e1->start == e2->start) {
+               if (e1->end < e2->end)
+                       rc = -1;
+               else if (e1->end > e2->end)
+                       rc = 1;
+               else
+                       rc = 0;
+       } else {
+               if (e1->start < e2->start)
+                       rc = -1;
+               else
+                       rc = 1;
+       }
+       return rc;
+}
+
+static inline int extent_equal(struct interval_node_extent *e1,
+                              struct interval_node_extent *e2)
+{
+       return (e1->start == e2->start) && (e1->end == e2->end);
+}
+
+static inline int extent_overlapped(struct interval_node_extent *e1,
+                                   struct interval_node_extent *e2)
+{
+       return (e1->start <= e2->end) && (e2->start <= e1->end);
+}
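+
+/*
+ * E.g. the extents [1, 5] and [4, 9] overlap (1 <= 9 && 4 <= 5), while
+ * [1, 3] and [4, 9] do not (4 <= 3 fails). extent_compare() orders by
+ * start first and end second, so [1, 3] < [1, 5] < [4, 9].
+ */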
+
+static inline int node_compare(struct interval_node *n1,
+                              struct interval_node *n2)
+{
+       return extent_compare(&n1->in_extent, &n2->in_extent);
+}
+
+static inline int node_equal(struct interval_node *n1,
+                            struct interval_node *n2)
+{
+       return extent_equal(&n1->in_extent, &n2->in_extent);
+}
+
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+       return x > y ? x : y;
+}
+
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+       return x < y ? x : y;
+}
+
+#define interval_for_each(node, root)             \
+for (node = interval_first(root); node != NULL;         \
+     node = interval_next(node))
+
+#define interval_for_each_reverse(node, root)     \
+for (node = interval_last(root); node != NULL;   \
+     node = interval_prev(node))
+
+static struct interval_node *interval_first(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       while (node->in_left)
+               node = node->in_left;
+       RETURN(node);
+}
+
+static struct interval_node *interval_last(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       while (node->in_right)
+               node = node->in_right;
+       RETURN(node);
+}
+
+static struct interval_node *interval_next(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+       if (node->in_right)
+               RETURN(interval_first(node->in_right));
+       while (node->in_parent && node_is_right_child(node))
+               node = node->in_parent;
+       RETURN(node->in_parent);
+}
+
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+       ENTRY;
+
+       if (!node)
+               RETURN(NULL);
+
+       if (node->in_left)
+               RETURN(interval_last(node->in_left));
+
+       while (node->in_parent && node_is_left_child(node))
+               node = node->in_parent;
+
+       RETURN(node->in_parent);
+}
+
+enum interval_iter interval_iterate(struct interval_node *root,
+                                   interval_callback_t func,
+                                   void *data)
+{
+       struct interval_node *node;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+       ENTRY;
+
+       interval_for_each(node, root) {
+               rc = func(node, data);
+               if (rc == INTERVAL_ITER_STOP)
+                       break;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate);
+
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                           interval_callback_t func,
+                                           void *data)
+{
+       struct interval_node *node;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+       ENTRY;
+
+       interval_for_each_reverse(node, root) {
+               rc = func(node, data);
+               if (rc == INTERVAL_ITER_STOP)
+                       break;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
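+
+/*
+ * A minimal usage sketch of interval_iterate(): count the nodes of a tree.
+ * The helper names below are hypothetical; only the declarations from this
+ * file and interval_tree.h are assumed.
+ */
+static enum interval_iter interval_count_cb(struct interval_node *node,
+                                           void *data)
+{
+       (*(int *)data)++;               /* visit every node */
+       return INTERVAL_ITER_CONT;      /* keep walking */
+}
+
+static int interval_count(struct interval_node *root)
+{
+       int nr = 0;
+
+       interval_iterate(root, interval_count_cb, &nr);
+       return nr;
+}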
+
+/* Try to find a node with the same interval in the tree;
+ * if found, return a pointer to the node, otherwise return NULL. */
+struct interval_node *interval_find(struct interval_node *root,
+                                   struct interval_node_extent *ex)
+{
+       struct interval_node *walk = root;
+       int rc;
+       ENTRY;
+
+       while (walk) {
+               rc = extent_compare(ex, &walk->in_extent);
+               if (rc == 0)
+                       break;
+               else if (rc < 0)
+                       walk = walk->in_left;
+               else
+                       walk = walk->in_right;
+       }
+
+       RETURN(walk);
+}
+EXPORT_SYMBOL(interval_find);
+
+static void __rotate_change_maxhigh(struct interval_node *node,
+                                   struct interval_node *rotate)
+{
+       __u64 left_max, right_max;
+
+       rotate->in_max_high = node->in_max_high;
+       left_max = node->in_left ? node->in_left->in_max_high : 0;
+       right_max = node->in_right ? node->in_right->in_max_high : 0;
+       node->in_max_high = max_u64(interval_high(node),
+                                   max_u64(left_max, right_max));
+}
+
+/* The left rotation "pivots" around the link from node to node->right:
+ * - node becomes the left child of node->right, and
+ * - node->right's old left child becomes node's right child. */
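+/*
+ * Shape of the left rotation around N with right child R (a, b and c
+ * are arbitrary subtrees); the right rotation below is its mirror image:
+ *
+ *      N                 R
+ *     / \               / \
+ *    a   R     --->    N   c
+ *       / \           / \
+ *      b   c         a   b
+ */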
+static void __rotate_left(struct interval_node *node,
+                         struct interval_node **root)
+{
+       struct interval_node *right = node->in_right;
+       struct interval_node *parent = node->in_parent;
+
+       node->in_right = right->in_left;
+       if (node->in_right)
+               right->in_left->in_parent = node;
+
+       right->in_left = node;
+       right->in_parent = parent;
+       if (parent) {
+               if (node_is_left_child(node))
+                       parent->in_left = right;
+               else
+                       parent->in_right = right;
+       } else {
+               *root = right;
+       }
+       node->in_parent = right;
+
+       /* update max_high for node and right */
+       __rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left:
+ * - node becomes the right child of node->left, and
+ * - node->left's old right child becomes node's left child. */
+static void __rotate_right(struct interval_node *node,
+                          struct interval_node **root)
+{
+       struct interval_node *left = node->in_left;
+       struct interval_node *parent = node->in_parent;
+
+       node->in_left = left->in_right;
+       if (node->in_left)
+               left->in_right->in_parent = node;
+       left->in_right = node;
+
+       left->in_parent = parent;
+       if (parent) {
+               if (node_is_right_child(node))
+                       parent->in_right = left;
+               else
+                       parent->in_left = left;
+       } else {
+               *root = left;
+       }
+       node->in_parent = left;
+
+       /* update max_high for node and left */
+       __rotate_change_maxhigh(node, left);
+}
+
+#define interval_swap(a, b) do {                       \
+       struct interval_node *c = a; a = b; b = c;      \
+} while (0)
+
+/*
+ * The INSERT and DELETE operations, when run on a tree with n keys,
+ * take O(log n) time. Because they modify the tree, the result may
+ * violate the red-black properties: every node is red or black, the
+ * root is black, a red node has only black children, and every path
+ * from the root to a leaf contains the same number of black nodes.
+ * To restore these properties, we must change the colors of some of
+ * the nodes in the tree and also change the pointer structure.
+ */
+static void interval_insert_color(struct interval_node *node,
+                                 struct interval_node **root)
+{
+       struct interval_node *parent, *gparent;
+       ENTRY;
+
+       while ((parent = node->in_parent) && node_is_red(parent)) {
+               gparent = parent->in_parent;
+               /* Parent is RED, so gparent must not be NULL */
+               if (node_is_left_child(parent)) {
+                       struct interval_node *uncle;
+                       uncle = gparent->in_right;
+                       if (uncle && node_is_red(uncle)) {
+                               uncle->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_BLACK;
+                               gparent->in_color = INTERVAL_RED;
+                               node = gparent;
+                               continue;
+                       }
+
+                       if (parent->in_right == node) {
+                               __rotate_left(parent, root);
+                               interval_swap(node, parent);
+                       }
+
+                       parent->in_color = INTERVAL_BLACK;
+                       gparent->in_color = INTERVAL_RED;
+                       __rotate_right(gparent, root);
+               } else {
+                       struct interval_node *uncle;
+                       uncle = gparent->in_left;
+                       if (uncle && node_is_red(uncle)) {
+                               uncle->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_BLACK;
+                               gparent->in_color = INTERVAL_RED;
+                               node = gparent;
+                               continue;
+                       }
+
+                       if (node_is_left_child(node)) {
+                               __rotate_right(parent, root);
+                               interval_swap(node, parent);
+                       }
+
+                       parent->in_color = INTERVAL_BLACK;
+                       gparent->in_color = INTERVAL_RED;
+                       __rotate_left(gparent, root);
+               }
+       }
+
+       (*root)->in_color = INTERVAL_BLACK;
+       EXIT;
+}
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                     struct interval_node **root)
+{
+       struct interval_node **p, *parent = NULL;
+       ENTRY;
+
+       LASSERT(!interval_is_intree(node));
+       p = root;
+       while (*p) {
+               parent = *p;
+               if (node_equal(parent, node))
+                       RETURN(parent);
+
+               /* max_high field must be updated after each iteration */
+               if (parent->in_max_high < interval_high(node))
+                       parent->in_max_high = interval_high(node);
+
+               if (node_compare(node, parent) < 0)
+                       p = &parent->in_left;
+               else
+                       p = &parent->in_right;
+       }
+
+       /* link node into the tree */
+       node->in_parent = parent;
+       node->in_color = INTERVAL_RED;
+       node->in_left = node->in_right = NULL;
+       *p = node;
+
+       interval_insert_color(node, root);
+       node->in_intree = 1;
+
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_insert);
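+
+/* A minimal usage sketch (node and root are the caller's; the interval
+ * [100, 199] is illustrative): link a node unless an equal interval is
+ * already in the tree.
+ *
+ *     interval_set(node, 100, 199);
+ *     if (interval_insert(node, &root) != NULL)
+ *             ...;    an equal interval already exists, node not linked
+ */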
+
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+       return !node || node_is_black(node);
+}
+
+static void interval_erase_color(struct interval_node *node,
+                                struct interval_node *parent,
+                                struct interval_node **root)
+{
+       struct interval_node *tmp;
+       ENTRY;
+
+       while (node_is_black_or_0(node) && node != *root) {
+               if (parent->in_left == node) {
+                       tmp = parent->in_right;
+                       if (node_is_red(tmp)) {
+                               tmp->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_RED;
+                               __rotate_left(parent, root);
+                               tmp = parent->in_right;
+                       }
+                       if (node_is_black_or_0(tmp->in_left) &&
+                           node_is_black_or_0(tmp->in_right)) {
+                               tmp->in_color = INTERVAL_RED;
+                               node = parent;
+                               parent = node->in_parent;
+                       } else {
+                               if (node_is_black_or_0(tmp->in_right)) {
+                                       struct interval_node *o_left;
+
+                                       o_left = tmp->in_left;
+                                       if (o_left)
+                                               o_left->in_color = INTERVAL_BLACK;
+                                       tmp->in_color = INTERVAL_RED;
+                                       __rotate_right(tmp, root);
+                                       tmp = parent->in_right;
+                               }
+                               tmp->in_color = parent->in_color;
+                               parent->in_color = INTERVAL_BLACK;
+                               if (tmp->in_right)
+                                       tmp->in_right->in_color = INTERVAL_BLACK;
+                               __rotate_left(parent, root);
+                               node = *root;
+                               break;
+                       }
+               } else {
+                       tmp = parent->in_left;
+                       if (node_is_red(tmp)) {
+                               tmp->in_color = INTERVAL_BLACK;
+                               parent->in_color = INTERVAL_RED;
+                               __rotate_right(parent, root);
+                               tmp = parent->in_left;
+                       }
+                       if (node_is_black_or_0(tmp->in_left) &&
+                           node_is_black_or_0(tmp->in_right)) {
+                               tmp->in_color = INTERVAL_RED;
+                               node = parent;
+                               parent = node->in_parent;
+                       } else {
+                               if (node_is_black_or_0(tmp->in_left)) {
+                                       struct interval_node *o_right;
+
+                                       o_right = tmp->in_right;
+                                       if (o_right)
+                                               o_right->in_color = INTERVAL_BLACK;
+                                       tmp->in_color = INTERVAL_RED;
+                                       __rotate_left(tmp, root);
+                                       tmp = parent->in_left;
+                               }
+                               tmp->in_color = parent->in_color;
+                               parent->in_color = INTERVAL_BLACK;
+                               if (tmp->in_left)
+                                       tmp->in_left->in_color = INTERVAL_BLACK;
+                               __rotate_right(parent, root);
+                               node = *root;
+                               break;
+                       }
+               }
+       }
+       if (node)
+               node->in_color = INTERVAL_BLACK;
+       EXIT;
+}
+
+/*
+ * If the @max_high value of @node has changed, this function walks the
+ * path from @node up towards the root, updating max_high along the way
+ * so the annotations stay consistent for the whole tree.
+ */
+static void update_maxhigh(struct interval_node *node,
+                          __u64  old_maxhigh)
+{
+       __u64 left_max, right_max;
+       ENTRY;
+
+       while (node) {
+               left_max = node->in_left ? node->in_left->in_max_high : 0;
+               right_max = node->in_right ? node->in_right->in_max_high : 0;
+               node->in_max_high = max_u64(interval_high(node),
+                                           max_u64(left_max, right_max));
+
+               if (node->in_max_high >= old_maxhigh)
+                       break;
+               node = node->in_parent;
+       }
+       EXIT;
+}
+
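+/* Erase @node from the tree. When both children exist, the in-order
+ * successor (which has no left child) is spliced out of its old
+ * position and takes over the victim's place, colour and links; the
+ * red-black fixup then runs at the successor's old position, using the
+ * successor's original colour. */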
+void interval_erase(struct interval_node *node,
+                   struct interval_node **root)
+{
+       struct interval_node *child, *parent;
+       int color;
+       ENTRY;
+
+       LASSERT(interval_is_intree(node));
+       node->in_intree = 0;
+       if (!node->in_left) {
+               child = node->in_right;
+       } else if (!node->in_right) {
+               child = node->in_left;
+       } else { /* both left and right children are non-NULL */
+               struct interval_node *old = node;
+
+               node = interval_next(node);
+               child = node->in_right;
+               parent = node->in_parent;
+               color = node->in_color;
+
+               if (child)
+                       child->in_parent = parent;
+               if (parent == old)
+                       parent->in_right = child;
+               else
+                       parent->in_left = child;
+
+               node->in_color = old->in_color;
+               node->in_right = old->in_right;
+               node->in_left = old->in_left;
+               node->in_parent = old->in_parent;
+
+               if (old->in_parent) {
+                       if (node_is_left_child(old))
+                               old->in_parent->in_left = node;
+                       else
+                               old->in_parent->in_right = node;
+               } else {
+                       *root = node;
+               }
+
+               old->in_left->in_parent = node;
+               if (old->in_right)
+                       old->in_right->in_parent = node;
+               update_maxhigh(child ? : parent, node->in_max_high);
+               update_maxhigh(node, old->in_max_high);
+               if (parent == old)
+                        parent = node;
+               goto color;
+       }
+       parent = node->in_parent;
+       color = node->in_color;
+
+       if (child)
+               child->in_parent = parent;
+       if (parent) {
+               if (node_is_left_child(node))
+                       parent->in_left = child;
+               else
+                       parent->in_right = child;
+       } else {
+               *root = child;
+       }
+
+       update_maxhigh(child ? : parent, node->in_max_high);
+
+color:
+       if (color == INTERVAL_BLACK)
+               interval_erase_color(child, parent, root);
+       EXIT;
+}
+EXPORT_SYMBOL(interval_erase);
+
+static inline int interval_may_overlap(struct interval_node *node,
+                                         struct interval_node_extent *ext)
+{
+       return (ext->start <= node->in_max_high &&
+               ext->end >= interval_low(node));
+}
+
+/*
+ * This function finds all intervals that overlap the interval @ext and
+ * calls @func on each resulting interval in turn. In Lustre, it is used
+ * to find all conflicting locks in the granted queue and add them to
+ * the AST work list.
+ *
+ * The equivalent recursive algorithm:
+ *
+ * {
+ *       if (node == NULL)
+ *            return 0;
+ *       if (ext->end < interval_low(node)) {
+ *            interval_search(node->in_left, ext, func, data);
+ *       } else if (interval_may_overlap(node, ext)) {
+ *            if (extent_overlapped(ext, &node->in_extent))
+ *                    func(node, data);
+ *            interval_search(node->in_left, ext, func, data);
+ *            interval_search(node->in_right, ext, func, data);
+ *       }
+ *       return 0;
+ * }
+ */
+enum interval_iter interval_search(struct interval_node *node,
+                                  struct interval_node_extent *ext,
+                                  interval_callback_t func,
+                                  void *data)
+{
+       struct interval_node *parent;
+       enum interval_iter rc = INTERVAL_ITER_CONT;
+
+       LASSERT(ext != NULL);
+       LASSERT(func != NULL);
+
+       while (node) {
+               if (ext->end < interval_low(node)) {
+                       if (node->in_left) {
+                               node = node->in_left;
+                               continue;
+                       }
+               } else if (interval_may_overlap(node, ext)) {
+                       if (extent_overlapped(ext, &node->in_extent)) {
+                               rc = func(node, data);
+                               if (rc == INTERVAL_ITER_STOP)
+                                       break;
+                       }
+
+                       if (node->in_left) {
+                               node = node->in_left;
+                               continue;
+                       }
+                       if (node->in_right) {
+                               node = node->in_right;
+                               continue;
+                       }
+               }
+
+               parent = node->in_parent;
+               while (parent) {
+                       if (node_is_left_child(node) &&
+                           parent->in_right) {
+                               /* If we ever got the left, it means that the
+                                * parent met ext->end<interval_low(parent), or
+                                * may_overlap(parent). If the former is true,
+                                * we needn't go back. So stop early and check
+                                * may_overlap(parent) after this loop.  */
+                               node = parent->in_right;
+                               break;
+                       }
+                       node = parent;
+                       parent = parent->in_parent;
+               }
+               if (parent == NULL || !interval_may_overlap(parent, ext))
+                       break;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+                                             void *args)
+{
+       *(int *)args = 1;
+       return INTERVAL_ITER_STOP;
+}
+
+int interval_is_overlapped(struct interval_node *root,
+                          struct interval_node_extent *ext)
+{
+       int has = 0;
+       (void)interval_search(root, ext, interval_overlap_cb, &has);
+       return has;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
+
+/* Don't expand the low end. Expanding downwards is expensive, and of
+ * little use for most extents, because programs seldom do IO backwards.
+ *
+ * The recursive algorithm for expanding low:
+ * expand_low {
+ *     struct interval_node *tmp;
+ *     static __u64 res = 0;
+ *
+ *     if (root == NULL)
+ *             return res;
+ *     if (root->in_max_high < low) {
+ *             res = max_u64(root->in_max_high + 1, res);
+ *             return res;
+ *     } else if (low < interval_low(root)) {
+ *             interval_expand_low(root->in_left, low);
+ *             return res;
+ *     }
+ *
+ *     if (interval_high(root) < low)
+ *             res = max_u64(interval_high(root) + 1, res);
+ *     interval_expand_low(root->in_left, low);
+ *     interval_expand_low(root->in_right, low);
+ *
+ *     return res;
+ * }
+ *
+ * It's easy enough to eliminate the recursion; see interval_search for
+ * an example. -jay
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+       /* only the empty-tree case is handled for now */
+       if (root == NULL)
+               return 0;
+       return low;
+}
+
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+       __u64 result = ~0;
+
+       while (node != NULL) {
+               if (node->in_max_high < high)
+                       break;
+
+               if (interval_low(node) > high) {
+                       result = interval_low(node) - 1;
+                       node = node->in_left;
+               } else {
+                       node = node->in_right;
+               }
+       }
+
+       return result;
+}
+
+/* Expand the extent @ext, within the bounds given by @limiter. */
+void interval_expand(struct interval_node *root,
+                    struct interval_node_extent *ext,
+                    struct interval_node_extent *limiter)
+{
+       /* The interval_is_overlapped() assertions are expensive, because
+        * they may traverse many nodes to find an overlapping one. */
+       LASSERT(interval_is_overlapped(root, ext) == 0);
+       if (!limiter || limiter->start < ext->start)
+               ext->start = interval_expand_low(root, ext->start);
+       if (!limiter || limiter->end > ext->end)
+               ext->end = interval_expand_high(root, ext->end);
+       LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);
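+
+/* A worked example (values are illustrative): with granted extents
+ * [0, 99] and [200, 299] in the tree and no limiter, expanding
+ * ext = [100, 120] leaves start at 100 (low expansion is a no-op for a
+ * non-empty tree) and grows end to 199, one short of the next
+ * interval's start. */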
diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c
new file mode 100644 (file)
index 0000000..853409a
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <lustre_lib.h>
+
+/**
+ * Lock a lock and its resource.
+ *
+ * LDLM locking uses resource to serialize access to locks
+ * but there is a case when we change resource of lock upon
+ * enqueue reply. We rely on lock->l_resource = new_res
+ * being an atomic operation.
+ */
+struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock)
+{
+       /* on the server side the lock's resource does not change */
+       if (!lock->l_ns_srv)
+               spin_lock(&lock->l_lock);
+
+       lock_res(lock->l_resource);
+
+       lock->l_res_locked = 1;
+       return lock->l_resource;
+}
+EXPORT_SYMBOL(lock_res_and_lock);
+
+/**
+ * Unlock a lock and its resource previously locked with lock_res_and_lock
+ */
+void unlock_res_and_lock(struct ldlm_lock *lock)
+{
+       /* on the server side the lock's resource does not change */
+       lock->l_res_locked = 0;
+
+       unlock_res(lock->l_resource);
+       if (!lock->l_ns_srv)
+               spin_unlock(&lock->l_lock);
+}
+EXPORT_SYMBOL(unlock_res_and_lock);
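+
+/* Typical usage (a minimal sketch; the caller already holds a
+ * reference on lock):
+ *
+ *     struct ldlm_resource *res = lock_res_and_lock(lock);
+ *     ... inspect or modify the lock state under the resource lock ...
+ *     unlock_res_and_lock(lock);
+ */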
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c
new file mode 100644 (file)
index 0000000..f7432f7
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_extent.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains implementation of EXTENT lock type
+ *
+ * EXTENT lock type is for locking a contiguous range of values, represented
+ * by 64-bit starting and ending offsets (inclusive). There are several extent
+ * lock modes, some of which may be mutually incompatible. Extent locks are
+ * considered incompatible if their modes are incompatible and their extents
+ * intersect.  See the lock mode compatibility matrix in lustre_dlm.h.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+/* When a lock is cancelled by a client, the KMS may undergo change if this
+ * is the "highest lock".  This function returns the new KMS value.
+ * Caller must hold lr_lock already.
+ *
+ * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
+__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       struct list_head *tmp;
+       struct ldlm_lock *lck;
+       __u64 kms = 0;
+       ENTRY;
+
+       /* don't let another thread in ldlm_extent_shift_kms race in
+        * just after we finish and take our lock into account in its
+        * calculation of the kms */
+       lock->l_flags |= LDLM_FL_KMS_IGNORE;
+
+       list_for_each(tmp, &res->lr_granted) {
+               lck = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (lck->l_flags & LDLM_FL_KMS_IGNORE)
+                       continue;
+
+               if (lck->l_policy_data.l_extent.end >= old_kms)
+                       RETURN(old_kms);
+
+               /* This extent _has_ to be smaller than old_kms (checked above)
+                * so kms can only ever be smaller or the same as old_kms. */
+               if (lck->l_policy_data.l_extent.end + 1 > kms)
+                       kms = lck->l_policy_data.l_extent.end + 1;
+       }
+       LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms);
+
+       RETURN(kms);
+}
+EXPORT_SYMBOL(ldlm_extent_shift_kms);
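+
+/* A worked example (values are illustrative): with granted extents
+ * [0, 4095] and [4096, 8191], old_kms is 8192. Cancelling the lock on
+ * [4096, 8191] marks it LDLM_FL_KMS_IGNORE, so only [0, 4095] is
+ * counted and the new KMS becomes 4096. */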
+
+struct kmem_cache *ldlm_interval_slab;
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+       struct ldlm_interval *node;
+       ENTRY;
+
+       LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+       OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+       if (node == NULL)
+               RETURN(NULL);
+
+       INIT_LIST_HEAD(&node->li_group);
+       ldlm_interval_attach(node, lock);
+       RETURN(node);
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+       if (node) {
+               LASSERT(list_empty(&node->li_group));
+               LASSERT(!interval_is_intree(&node->li_node));
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       }
+}
+
+/* interval tree, for LDLM_EXTENT. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+                         struct ldlm_lock *l)
+{
+       LASSERT(l->l_tree_node == NULL);
+       LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+       list_add_tail(&l->l_sl_policy, &n->li_group);
+       l->l_tree_node = n;
+}
+
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+       struct ldlm_interval *n = l->l_tree_node;
+
+       if (n == NULL)
+               return NULL;
+
+       LASSERT(!list_empty(&n->li_group));
+       l->l_tree_node = NULL;
+       list_del_init(&l->l_sl_policy);
+
+       return (list_empty(&n->li_group) ? n : NULL);
+}
+
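+/* Convert a power-of-two lock mode to its bit index; e.g. a mode of
+ * 1 << 3 maps to index 3 (the loop counts how many right shifts it
+ * takes to consume the single set bit). */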
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+       int index;
+
+       LASSERT(mode != 0);
+       LASSERT(IS_PO2(mode));
+       for (index = -1; mode; index++, mode >>= 1)
+               ;
+       LASSERT(index < LCK_MODE_NUM);
+       return index;
+}
+
+/** Add newly granted lock into interval tree for the resource. */
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+                         struct ldlm_lock *lock)
+{
+       struct interval_node *found, **root;
+       struct ldlm_interval *node;
+       struct ldlm_extent *extent;
+       int idx;
+
+       LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+       node = lock->l_tree_node;
+       LASSERT(node != NULL);
+       LASSERT(!interval_is_intree(&node->li_node));
+
+       idx = lock_mode_to_index(lock->l_granted_mode);
+       LASSERT(lock->l_granted_mode == 1 << idx);
+       LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+
+       /* initialize the node's extent */
+       extent = &lock->l_policy_data.l_extent;
+       interval_set(&node->li_node, extent->start, extent->end);
+
+       root = &res->lr_itree[idx].lit_root;
+       found = interval_insert(&node->li_node, root);
+       if (found) { /* a matching policy group already exists */
+               struct ldlm_interval *tmp = ldlm_interval_detach(lock);
+               LASSERT(tmp != NULL);
+               ldlm_interval_free(tmp);
+               ldlm_interval_attach(to_ldlm_interval(found), lock);
+       }
+       res->lr_itree[idx].lit_size++;
+
+       /* even though we use the interval tree to manage the extent
+        * locks, we also add them to the granted list for debugging */
+       ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/** Remove cancelled lock from resource interval tree. */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       struct ldlm_interval *node = lock->l_tree_node;
+       struct ldlm_interval_tree *tree;
+       int idx;
+
+       if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */
+               return;
+
+       idx = lock_mode_to_index(lock->l_granted_mode);
+       LASSERT(lock->l_granted_mode == 1 << idx);
+       tree = &res->lr_itree[idx];
+
+       LASSERT(tree->lit_root != NULL); /* the tree must not be empty */
+
+       tree->lit_size--;
+       node = ldlm_interval_detach(lock);
+       if (node) {
+               interval_erase(&node->li_node, &tree->lit_root);
+               ldlm_interval_free(node);
+       }
+}
+
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_extent.start = wpolicy->l_extent.start;
+       lpolicy->l_extent.end = wpolicy->l_extent.end;
+       lpolicy->l_extent.gid = wpolicy->l_extent.gid;
+}
+
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_extent.start = lpolicy->l_extent.start;
+       wpolicy->l_extent.end = lpolicy->l_extent.end;
+       wpolicy->l_extent.gid = lpolicy->l_extent.gid;
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c
new file mode 100644 (file)
index 0000000..f100a84
--- /dev/null
@@ -0,0 +1,849 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003 Hewlett-Packard Development Company LP.
+ * Developed under the sponsorship of the US Government under
+ * Subcontract No. B514193
+ *
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file implements the POSIX lock type for Lustre.
+ * Its policy properties are the start and end of the extent and the PID.
+ *
+ * These locks are only handled through the MDS, because POSIX semantics
+ * require that, e.g., a lock can be only partially released and thereby
+ * split into two parts, and that two adjacent locks from the same
+ * process can be merged into a single wider lock.
+ *
+ * Lock modes are mapped like this:
+ * PR and PW for READ and WRITE locks
+ * NL to request the release of a portion of a lock
+ *
+ * These flock locks never time out.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <linux/list.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag);
+
+/**
+ * list_for_remaining_safe - iterate over the remaining entries in a list
+ *           and safeguard against removal of a list entry.
+ * \param pos   the &struct list_head to use as a loop counter. pos MUST
+ *           have been initialized prior to using it in this macro.
+ * \param n     another &struct list_head to use as temporary storage
+ * \param head  the head for your list.
+ */
+#define list_for_remaining_safe(pos, n, head) \
+       for (n = pos->next; pos != (head); pos = n, n = pos->next)
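+
+/* A minimal sketch (mirroring the use further below): resume iteration
+ * at a position found by an earlier list_for_each(), while staying safe
+ * against removal of the current entry:
+ *
+ *     list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+ *             lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+ *             ...
+ *     }
+ */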
+
+static inline int
+ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+       return((new->l_policy_data.l_flock.owner ==
+               lock->l_policy_data.l_flock.owner) &&
+              (new->l_export == lock->l_export));
+}
+
+static inline int
+ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new)
+{
+       return((new->l_policy_data.l_flock.start <=
+               lock->l_policy_data.l_flock.end) &&
+              (new->l_policy_data.l_flock.end >=
+               lock->l_policy_data.l_flock.start));
+}
+
+static inline int ldlm_flock_blocking_link(struct ldlm_lock *req,
+                                          struct ldlm_lock *lock)
+{
+       int rc = 0;
+
+       /* For server only */
+       if (req->l_export == NULL)
+               return 0;
+
+       if (unlikely(req->l_export->exp_flock_hash == NULL)) {
+               rc = ldlm_init_flock_export(req->l_export);
+               if (rc)
+                       goto error;
+       }
+
+       LASSERT(hlist_unhashed(&req->l_exp_flock_hash));
+
+       req->l_policy_data.l_flock.blocking_owner =
+               lock->l_policy_data.l_flock.owner;
+       req->l_policy_data.l_flock.blocking_export =
+               lock->l_export;
+       req->l_policy_data.l_flock.blocking_refs = 0;
+
+       cfs_hash_add(req->l_export->exp_flock_hash,
+                    &req->l_policy_data.l_flock.owner,
+                    &req->l_exp_flock_hash);
+error:
+       return rc;
+}
+
+static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req)
+{
+       /* For server only */
+       if (req->l_export == NULL)
+               return;
+
+       check_res_locked(req->l_resource);
+       if (req->l_export->exp_flock_hash != NULL &&
+           !hlist_unhashed(&req->l_exp_flock_hash))
+               cfs_hash_del(req->l_export->exp_flock_hash,
+                            &req->l_policy_data.l_flock.owner,
+                            &req->l_exp_flock_hash);
+}
+
+static inline void
+ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags)
+{
+       ENTRY;
+
+       LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)",
+                  mode, flags);
+
+       /* Safe to not lock here, since it should be empty anyway */
+       LASSERT(hlist_unhashed(&lock->l_exp_flock_hash));
+
+       list_del_init(&lock->l_res_link);
+       if (flags == LDLM_FL_WAIT_NOREPROC &&
+           !(lock->l_flags & LDLM_FL_FAILED)) {
+               /* client side - set a flag to prevent sending a CANCEL */
+               lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
+
+               /* when reaching here, it is under lock_res_and_lock(), so
+                * we must call the nolock version of
+                * ldlm_lock_decref_internal */
+               ldlm_lock_decref_internal_nolock(lock, mode);
+       }
+
+       ldlm_lock_destroy_nolock(lock);
+       EXIT;
+}
+
+/**
+ * POSIX locks deadlock detection code.
+ *
+ * Given a new lock \a req and an existing lock \a bl_lock that it
+ * conflicts with, iterate through all blocked POSIX locks for this
+ * export and check whether a deadlock condition would arise, i.e.
+ * whether one client holds a lock on something and wants a lock on
+ * something else while, at the same time, another client is in the
+ * opposite situation.
+ */
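+/*
+ * A minimal sketch of a cycle being detected (owners O1 and O2):
+ *
+ *   O1 holds a lock on [0, 9]   and waits for [10, 19] (blocked by O2)
+ *   O2 holds a lock on [10, 19] and waits for [0, 9]   (blocked by O1)
+ *
+ * Following the blocking_owner/blocking_export chain from \a bl_lock
+ * eventually leads back to \a req's owner and export, so the request
+ * is refused with -EDEADLK.
+ */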
+static int
+ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock)
+{
+       struct obd_export *req_exp = req->l_export;
+       struct obd_export *bl_exp = bl_lock->l_export;
+       __u64 req_owner = req->l_policy_data.l_flock.owner;
+       __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner;
+
+       /* For server only */
+       if (req_exp == NULL)
+               return 0;
+
+       class_export_get(bl_exp);
+       while (1) {
+               struct obd_export *bl_exp_new;
+               struct ldlm_lock *lock = NULL;
+               struct ldlm_flock *flock;
+
+               if (bl_exp->exp_flock_hash != NULL)
+                       lock = cfs_hash_lookup(bl_exp->exp_flock_hash,
+                                              &bl_owner);
+               if (lock == NULL)
+                       break;
+
+               flock = &lock->l_policy_data.l_flock;
+               LASSERT(flock->owner == bl_owner);
+               bl_owner = flock->blocking_owner;
+               bl_exp_new = class_export_get(flock->blocking_export);
+               class_export_put(bl_exp);
+
+               cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash);
+               bl_exp = bl_exp_new;
+
+               if (bl_owner == req_owner && bl_exp == req_exp) {
+                       class_export_put(bl_exp);
+                       return 1;
+               }
+       }
+       class_export_put(bl_exp);
+
+       return 0;
+}
+
+/**
+ * Process a granting attempt for flock lock.
+ * Must be called under ns lock held.
+ *
+ * This function looks for any conflicts for \a lock in the granted or
+ * waiting queues. The lock is granted if no conflicts are found in
+ * either queue.
+ *
+ * It is also responsible for splitting a lock if a portion of the lock
+ * is released.
+ *
+ * If \a first_enq is 0 (i.e., called from ldlm_reprocess_queue):
+ *   - blocking ASTs have already been sent
+ *
+ * If \a first_enq is 1 (i.e., called from ldlm_lock_enqueue):
+ *   - blocking ASTs have not been sent yet, so the list of conflicting
+ *     locks is collected and the ASTs are sent.
+ */
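+/*
+ * An example of the split case handled below: the process holds a PW
+ * lock on [0, 99] and releases [40, 59] with an NL request. The
+ * existing lock is trimmed to keep [60, 99] and a second lock, new2,
+ * is created to keep [0, 39].
+ */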
+int
+ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq,
+                       ldlm_error_t *err, struct list_head *work_list)
+{
+       struct ldlm_resource *res = req->l_resource;
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+       struct list_head *tmp;
+       struct list_head *ownlocks = NULL;
+       struct ldlm_lock *lock = NULL;
+       struct ldlm_lock *new = req;
+       struct ldlm_lock *new2 = NULL;
+       ldlm_mode_t mode = req->l_req_mode;
+       int local = ns_is_client(ns);
+       int added = (mode == LCK_NL);
+       int overlaps = 0;
+       int splitted = 0;
+       const struct ldlm_callback_suite null_cbs = { NULL };
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE, "flags %#llx owner "LPU64" pid %u mode %u start "
+              LPU64" end "LPU64"\n", *flags,
+              new->l_policy_data.l_flock.owner,
+              new->l_policy_data.l_flock.pid, mode,
+              req->l_policy_data.l_flock.start,
+              req->l_policy_data.l_flock.end);
+
+       *err = ELDLM_OK;
+
+       if (local) {
+               /* No blocking ASTs are sent to the clients for
+                * POSIX file & record locks */
+               req->l_blocking_ast = NULL;
+       } else {
+               /* Called on the server for lock cancels. */
+               req->l_blocking_ast = ldlm_flock_blocking_ast;
+       }
+
+reprocess:
+       if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) {
+               /* This loop determines where this process's locks start
+                * in the resource lr_granted list. */
+               list_for_each(tmp, &res->lr_granted) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+                       if (ldlm_same_flock_owner(lock, req)) {
+                               ownlocks = tmp;
+                               break;
+                       }
+               }
+       } else {
+               lockmode_verify(mode);
+
+               /* This loop determines if there are existing locks
+                * that conflict with the new lock request. */
+               list_for_each(tmp, &res->lr_granted) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+
+                       if (ldlm_same_flock_owner(lock, req)) {
+                               if (!ownlocks)
+                                       ownlocks = tmp;
+                               continue;
+                       }
+
+                       /* locks are compatible, overlap doesn't matter */
+                       if (lockmode_compat(lock->l_granted_mode, mode))
+                               continue;
+
+                       if (!ldlm_flocks_overlap(lock, req))
+                               continue;
+
+                       if (!first_enq)
+                               RETURN(LDLM_ITER_CONTINUE);
+
+                       if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = -EAGAIN;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       if (*flags & LDLM_FL_TEST_LOCK) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               req->l_req_mode = lock->l_granted_mode;
+                               req->l_policy_data.l_flock.pid =
+                                       lock->l_policy_data.l_flock.pid;
+                               req->l_policy_data.l_flock.start =
+                                       lock->l_policy_data.l_flock.start;
+                               req->l_policy_data.l_flock.end =
+                                       lock->l_policy_data.l_flock.end;
+                               *flags |= LDLM_FL_LOCK_CHANGED;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       if (ldlm_flock_deadlock(req, lock)) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = -EDEADLK;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+
+                       rc = ldlm_flock_blocking_link(req, lock);
+                       if (rc) {
+                               ldlm_flock_destroy(req, mode, *flags);
+                               *err = rc;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+                       ldlm_resource_add_lock(res, &res->lr_waiting, req);
+                       *flags |= LDLM_FL_BLOCK_GRANTED;
+                       RETURN(LDLM_ITER_STOP);
+               }
+       }
+
+       if (*flags & LDLM_FL_TEST_LOCK) {
+               ldlm_flock_destroy(req, mode, *flags);
+               req->l_req_mode = LCK_NL;
+               *flags |= LDLM_FL_LOCK_CHANGED;
+               RETURN(LDLM_ITER_STOP);
+       }
+
+       /* In case we had slept on this lock request, take it off the
+        * deadlock detection hash list. */
+       ldlm_flock_blocking_unlink(req);
+
+       /* Scan the locks owned by this process that overlap this request.
+        * We may have to merge or split existing locks. */
+
+       if (!ownlocks)
+               ownlocks = &res->lr_granted;
+
+       list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) {
+               lock = list_entry(ownlocks, struct ldlm_lock, l_res_link);
+
+               if (!ldlm_same_flock_owner(lock, new))
+                       break;
+
+               if (lock->l_granted_mode == mode) {
+                       /* If the modes are the same then we need to process
+                        * locks that overlap OR adjoin the new lock. The
+                        * extra conditions are necessary to deal with
+                        * arithmetic overflow and underflow. */
+                       if ((new->l_policy_data.l_flock.start >
+                            (lock->l_policy_data.l_flock.end + 1))
+                           && (lock->l_policy_data.l_flock.end !=
+                               OBD_OBJECT_EOF))
+                               continue;
+
+                       if ((new->l_policy_data.l_flock.end <
+                            (lock->l_policy_data.l_flock.start - 1))
+                           && (lock->l_policy_data.l_flock.start != 0))
+                               break;
+
+                       if (new->l_policy_data.l_flock.start <
+                           lock->l_policy_data.l_flock.start) {
+                               lock->l_policy_data.l_flock.start =
+                                       new->l_policy_data.l_flock.start;
+                       } else {
+                               new->l_policy_data.l_flock.start =
+                                       lock->l_policy_data.l_flock.start;
+                       }
+
+                       if (new->l_policy_data.l_flock.end >
+                           lock->l_policy_data.l_flock.end) {
+                               lock->l_policy_data.l_flock.end =
+                                       new->l_policy_data.l_flock.end;
+                       } else {
+                               new->l_policy_data.l_flock.end =
+                                       lock->l_policy_data.l_flock.end;
+                       }
+
+                       if (added) {
+                               ldlm_flock_destroy(lock, mode, *flags);
+                       } else {
+                               new = lock;
+                               added = 1;
+                       }
+                       continue;
+               }
+
+               if (new->l_policy_data.l_flock.start >
+                   lock->l_policy_data.l_flock.end)
+                       continue;
+
+               if (new->l_policy_data.l_flock.end <
+                   lock->l_policy_data.l_flock.start)
+                       break;
+
+               ++overlaps;
+
+               if (new->l_policy_data.l_flock.start <=
+                   lock->l_policy_data.l_flock.start) {
+                       if (new->l_policy_data.l_flock.end <
+                           lock->l_policy_data.l_flock.end) {
+                               lock->l_policy_data.l_flock.start =
+                                       new->l_policy_data.l_flock.end + 1;
+                               break;
+                       }
+                       ldlm_flock_destroy(lock, lock->l_req_mode, *flags);
+                       continue;
+               }
+               if (new->l_policy_data.l_flock.end >=
+                   lock->l_policy_data.l_flock.end) {
+                       lock->l_policy_data.l_flock.end =
+                               new->l_policy_data.l_flock.start - 1;
+                       continue;
+               }
+
+               /* split the existing lock into two locks */
+
+               /* If this is an F_UNLCK operation we could avoid allocating
+                * a new lock and reuse the req lock passed in with the
+                * request, but that would complicate the reply processing,
+                * since updates to req get reflected in the reply. The
+                * client side replays the lock request, so it must see the
+                * original lock data in the reply. */
+
+               /* XXX - if ldlm_lock_new() can sleep we should
+                * release the lr_lock, allocate the new lock,
+                * and restart processing this lock. */
+               if (!new2) {
+                       unlock_res_and_lock(req);
+                       new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK,
+                                               lock->l_granted_mode, &null_cbs,
+                                               NULL, 0, LVB_T_NONE);
+                       lock_res_and_lock(req);
+                       if (!new2) {
+                               ldlm_flock_destroy(req, lock->l_granted_mode,
+                                                  *flags);
+                               *err = -ENOLCK;
+                               RETURN(LDLM_ITER_STOP);
+                       }
+                       goto reprocess;
+               }
+
+               splitted = 1;
+
+               new2->l_granted_mode = lock->l_granted_mode;
+               new2->l_policy_data.l_flock.pid =
+                       new->l_policy_data.l_flock.pid;
+               new2->l_policy_data.l_flock.owner =
+                       new->l_policy_data.l_flock.owner;
+               new2->l_policy_data.l_flock.start =
+                       lock->l_policy_data.l_flock.start;
+               new2->l_policy_data.l_flock.end =
+                       new->l_policy_data.l_flock.start - 1;
+               lock->l_policy_data.l_flock.start =
+                       new->l_policy_data.l_flock.end + 1;
+               new2->l_conn_export = lock->l_conn_export;
+               if (lock->l_export != NULL) {
+                       new2->l_export = class_export_lock_get(lock->l_export, new2);
+                       if (new2->l_export->exp_lock_hash &&
+                           hlist_unhashed(&new2->l_exp_hash))
+                               cfs_hash_add(new2->l_export->exp_lock_hash,
+                                            &new2->l_remote_handle,
+                                            &new2->l_exp_hash);
+               }
+               if (*flags == LDLM_FL_WAIT_NOREPROC)
+                       ldlm_lock_addref_internal_nolock(new2,
+                                                        lock->l_granted_mode);
+
+               /* insert new2 at lock */
+               ldlm_resource_add_lock(res, ownlocks, new2);
+               LDLM_LOCK_RELEASE(new2);
+               break;
+       }
+
+       /* if new2 was created but never used, destroy it */
+       if (splitted == 0 && new2 != NULL)
+               ldlm_lock_destroy_nolock(new2);
+
+       /* At this point we're granting the lock request. */
+       req->l_granted_mode = req->l_req_mode;
+
+       /* Add req to the granted queue before calling ldlm_reprocess_all(). */
+       if (!added) {
+               list_del_init(&req->l_res_link);
+               /* insert new lock before ownlocks in list. */
+               ldlm_resource_add_lock(res, ownlocks, req);
+       }
+
+       if (*flags != LDLM_FL_WAIT_NOREPROC) {
+               /* The only case in which the client side calls this flock
+                * policy function is from ldlm_flock_completion_ast, which
+                * always passes the LDLM_FL_WAIT_NOREPROC flag. */
+               CERROR("Illegal parameter for client-side-only module.\n");
+               LBUG();
+       }
+
+       /* In case we're reprocessing the requested lock we can't destroy
+        * it until after calling ldlm_add_ast_work_item() above so that laawi()
+        * can bump the reference count on \a req. Otherwise \a req
+        * could be freed before the completion AST can be sent.  */
+       if (added)
+               ldlm_flock_destroy(req, mode, *flags);
+
+       ldlm_resource_dump(D_INFO, res);
+       RETURN(LDLM_ITER_CONTINUE);
+}
+
+struct ldlm_flock_wait_data {
+       struct ldlm_lock *fwd_lock;
+       int            fwd_generation;
+};
+
+static void
+ldlm_flock_interrupted_wait(void *data)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock;
+
+       /* take lock off the deadlock detection hash list. */
+       lock_res_and_lock(lock);
+       ldlm_flock_blocking_unlink(lock);
+
+       /* client side - set flag to prevent lock from being put on LRU list */
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       unlock_res_and_lock(lock);
+
+       EXIT;
+}
+
+/**
+ * Flock completion callback function.
+ *
+ * \param lock [in,out]: A lock to be handled
+ * \param flags    [in]: flags
+ * \param *data    [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg
+ *
+ * \retval 0    : success
+ * \retval <0   : failure
+ */
+int
+ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       struct file_lock                *getlk = lock->l_ast_data;
+       struct obd_device             *obd;
+       struct obd_import             *imp = NULL;
+       struct ldlm_flock_wait_data     fwd;
+       struct l_wait_info            lwi;
+       ldlm_error_t                err;
+       int                          rc = 0;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n",
+              flags, data, getlk);
+
+       /* Import invalidation. We need to actually release the lock
+        * references being held, so that it can go away. There is no
+        * point in holding the lock even if the app still believes it
+        * has it, since the server already dropped it anyway. This
+        * applies only to granted locks. */
+       if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
+           (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
+               if (lock->l_req_mode == lock->l_granted_mode &&
+                   lock->l_granted_mode != LCK_NL &&
+                   NULL == data)
+                       ldlm_lock_decref_internal(lock, lock->l_req_mode);
+
+               /* Need to wake up the waiter if we were evicted */
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LASSERT(flags != LDLM_FL_WAIT_NOREPROC);
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               if (NULL == data)
+                       /* mds granted the lock in the reply */
+                       goto granted;
+               /* CP AST RPC: lock get granted, wake it up */
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "sleeping");
+       fwd.fwd_lock = lock;
+       obd = class_exp2obd(lock->l_conn_export);
+
+       /* if this is a local lock, there is no import */
+       if (NULL != obd)
+               imp = obd->u.cli.cl_import;
+
+       if (NULL != imp) {
+               spin_lock(&imp->imp_lock);
+               fwd.fwd_generation = imp->imp_generation;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd);
+
+       /* Go to sleep until the lock is granted. */
+       rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi);
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+granted:
+       OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10);
+
+       if (lock->l_destroyed) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed");
+               RETURN(0);
+       }
+
+       if (lock->l_flags & LDLM_FL_FAILED) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed");
+               RETURN(-EIO);
+       }
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue granted");
+
+       lock_res_and_lock(lock);
+
+       /* take lock off the deadlock detection hash list. */
+       ldlm_flock_blocking_unlink(lock);
+
+       /* ldlm_lock_enqueue() has already placed lock on the granted list. */
+       list_del_init(&lock->l_res_link);
+
+       if (flags & LDLM_FL_TEST_LOCK) {
+               /* fcntl(F_GETLK) request */
+               /* The old mode was saved in getlk->fl_type so that if the
+                * mode in the lock changes we can decref the appropriate
+                * refcount. */
+               ldlm_flock_destroy(lock, flock_type(getlk),
+                                  LDLM_FL_WAIT_NOREPROC);
+               switch (lock->l_granted_mode) {
+               case LCK_PR:
+                       flock_set_type(getlk, F_RDLCK);
+                       break;
+               case LCK_PW:
+                       flock_set_type(getlk, F_WRLCK);
+                       break;
+               default:
+                       flock_set_type(getlk, F_UNLCK);
+               }
+               flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid);
+               flock_set_start(getlk,
+                               (loff_t)lock->l_policy_data.l_flock.start);
+               flock_set_end(getlk,
+                             (loff_t)lock->l_policy_data.l_flock.end);
+       } else {
+               __u64 noreproc = LDLM_FL_WAIT_NOREPROC;
+
+               /* We need to reprocess the lock to do merges or splits
+                * with existing locks owned by this process. */
+               ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL);
+       }
+       unlock_res_and_lock(lock);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_flock_completion_ast);
+
+int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag)
+{
+       ENTRY;
+
+       LASSERT(lock);
+       LASSERT(flag == LDLM_CB_CANCELING);
+
+       /* take lock off the deadlock detection hash list. */
+       lock_res_and_lock(lock);
+       ldlm_flock_blocking_unlink(lock);
+       unlock_res_and_lock(lock);
+       RETURN(0);
+}
+
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                      ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+       lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+       lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+       /* Compat code: old clients had no idea about the owner field and
+        * relied solely on the pid for ownership. Introduced in LU-104,
+        * 2.1, April 2011. */
+       lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid;
+}
+
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                      ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_flock.start = wpolicy->l_flock.lfw_start;
+       lpolicy->l_flock.end = wpolicy->l_flock.lfw_end;
+       lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid;
+       lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner;
+}
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_flock.lfw_start = lpolicy->l_flock.start;
+       wpolicy->l_flock.lfw_end = lpolicy->l_flock.end;
+       wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid;
+       wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner;
+}
+
+/*
+ * Export handle<->flock hash operations.
+ */
+static unsigned
+ldlm_export_flock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u64_hash(*(__u64 *)key, mask);
+}
+
+static void *
+ldlm_export_flock_key(struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       return &lock->l_policy_data.l_flock.owner;
+}
+
+static int
+ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64));
+}
+
+static void *
+ldlm_export_flock_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+}
+
+static void
+ldlm_export_flock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_flock *flock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       LDLM_LOCK_GET(lock);
+
+       flock = &lock->l_policy_data.l_flock;
+       LASSERT(flock->blocking_export != NULL);
+       class_export_get(flock->blocking_export);
+       flock->blocking_refs++;
+}
+
+static void
+ldlm_export_flock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_flock *flock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash);
+       LDLM_LOCK_RELEASE(lock);
+
+       flock = &lock->l_policy_data.l_flock;
+       LASSERT(flock->blocking_export != NULL);
+       class_export_put(flock->blocking_export);
+       if (--flock->blocking_refs == 0) {
+               flock->blocking_owner = 0;
+               flock->blocking_export = NULL;
+       }
+}
+
+static cfs_hash_ops_t ldlm_export_flock_ops = {
+       .hs_hash        = ldlm_export_flock_hash,
+       .hs_key         = ldlm_export_flock_key,
+       .hs_keycmp      = ldlm_export_flock_keycmp,
+       .hs_object      = ldlm_export_flock_object,
+       .hs_get         = ldlm_export_flock_get,
+       .hs_put         = ldlm_export_flock_put,
+       .hs_put_locked  = ldlm_export_flock_put,
+};
+
+int ldlm_init_flock_export(struct obd_export *exp)
+{
+       exp->exp_flock_hash =
+               cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+                               HASH_EXP_LOCK_CUR_BITS,
+                               HASH_EXP_LOCK_MAX_BITS,
+                               HASH_EXP_LOCK_BKT_BITS, 0,
+                               CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+                               &ldlm_export_flock_ops,
+                               CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE);
+       if (!exp->exp_flock_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_flock_export);
+
+void ldlm_destroy_flock_export(struct obd_export *exp)
+{
+       ENTRY;
+       if (exp->exp_flock_hash) {
+               cfs_hash_putref(exp->exp_flock_hash);
+               exp->exp_flock_hash = NULL;
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_flock_export);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c
new file mode 100644 (file)
index 0000000..574b2ff
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_inodebits.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the IBITS lock type.
+ *
+ * The IBITS lock type contains a bit mask determining various properties of
+ * an object. The meanings of specific bits are specific to the caller and
+ * are opaque to LDLM code.
+ *
+ * Locks with intersecting bit masks and conflicting lock modes (e.g. LCK_PW)
+ * are considered conflicting. See the lock mode compatibility matrix in
+ * lustre_dlm.h, and the illustrative sketch at the end of this file.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       memset(lpolicy, 0, sizeof(*lpolicy));
+       lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits;
+}
+
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       memset(wpolicy, 0, sizeof(*wpolicy));
+       wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits;
+}
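+
+/*
+ * Editorial sketch, not part of the original file: a minimal illustration
+ * of the conflict rule described in the header comment above, assuming the
+ * lockmode_compat() helper from lustre_dlm.h. The function name is
+ * hypothetical and nothing in this file calls it.
+ */
+static inline int ldlm_ibits_would_conflict(const struct ldlm_lock *req,
+                                           const struct ldlm_lock *lock)
+{
+       /* Disjoint bit masks never conflict, whatever the lock modes are. */
+       if (!(req->l_policy_data.l_inodebits.bits &
+             lock->l_policy_data.l_inodebits.bits))
+               return 0;
+
+       /* Intersecting masks conflict iff the modes are incompatible. */
+       return !lockmode_compat(lock->l_granted_mode, req->l_req_mode);
+}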
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h
new file mode 100644 (file)
index 0000000..141a957
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define MAX_STRING_SIZE 128
+
+extern atomic_t ldlm_srv_namespace_nr;
+extern atomic_t ldlm_cli_namespace_nr;
+extern struct mutex ldlm_srv_namespace_lock;
+extern struct list_head ldlm_srv_namespace_list;
+extern struct mutex ldlm_cli_namespace_lock;
+extern struct list_head ldlm_cli_namespace_list;
+
+static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_nr : &ldlm_cli_namespace_nr;
+}
+
+static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_list : &ldlm_cli_namespace_list;
+}
+
+static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client)
+{
+       return client == LDLM_NAMESPACE_SERVER ?
+               &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock;
+}
+
+/* ldlm_request.c */
+/* Cancel lru flag, it indicates we cancel aged locks. */
+enum {
+       LDLM_CANCEL_AGED   = 1 << 0, /* Cancel aged locks (non lru resize). */
+       LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */
+       LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */
+       LDLM_CANCEL_LRUR   = 1 << 3, /* Cancel locks from lru resize. */
+       LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither
+                                     * sending nor waiting for any RPCs) */
+};
+
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+                   ldlm_cancel_flags_t sync, int flags);
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns,
+                         struct list_head *cancels, int count, int max,
+                         ldlm_cancel_flags_t cancel_flags, int flags);
+extern int ldlm_enqueue_min;
+int ldlm_get_enq_timeout(struct ldlm_lock *lock);
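+
+/*
+ * Editorial sketch, not part of the original header: one way a caller
+ * might combine the LDLM_CANCEL_* flags above with an asynchronous cancel
+ * when flushing aged locks. The wrapper name is hypothetical; only
+ * ldlm_cancel_lru() above is part of the real interface, and LCF_ASYNC
+ * comes from lustre_dlm.h.
+ */
+static inline int ldlm_cancel_aged_nowait(struct ldlm_namespace *ns)
+{
+       /* Cancel aged locks without sending or waiting for any RPCs. */
+       return ldlm_cancel_lru(ns, 0, LCF_ASYNC,
+                              LDLM_CANCEL_AGED | LDLM_CANCEL_NO_WAIT);
+}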
+
+/* ldlm_resource.c */
+int ldlm_resource_putref_locked(struct ldlm_resource *res);
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+                                    struct ldlm_lock *new);
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+                              struct obd_import *imp, int force);
+void ldlm_namespace_free_post(struct ldlm_namespace *ns);
+/* ldlm_lock.c */
+
+struct ldlm_cb_set_arg {
+       struct ptlrpc_request_set       *set;
+       int                              type; /* LDLM_{CP,BL,GL}_CALLBACK */
+       atomic_t                         restart;
+       struct list_head                *list;
+       union ldlm_gl_desc              *gl_desc; /* glimpse AST descriptor */
+};
+
+typedef enum {
+       LDLM_WORK_BL_AST,
+       LDLM_WORK_CP_AST,
+       LDLM_WORK_REVOKE_AST,
+       LDLM_WORK_GL_AST
+} ldlm_desc_ast_t;
+
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+                 enum req_location loc, void *data, int size);
+struct ldlm_lock *
+ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *,
+                ldlm_type_t type, ldlm_mode_t,
+                const struct ldlm_callback_suite *cbs,
+                void *data, __u32 lvb_len, enum lvb_type lvb_type);
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
+                              void *cookie, __u64 *flags);
+void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode);
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode);
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                           struct list_head *work_list);
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+                     ldlm_desc_ast_t ast_type);
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq);
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock);
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock);
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock);
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock);
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock);
+
+void ldlm_cancel_locks_for_export(struct obd_export *export);
+
+/* ldlm_lockd.c */
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct ldlm_lock *lock);
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns,
+                          struct ldlm_lock_desc *ld,
+                          struct list_head *cancels, int count,
+                          ldlm_cancel_flags_t cancel_flags);
+
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld, struct ldlm_lock *lock);
+
+/* ldlm_extent.c */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
+
+/* ldlm_flock.c */
+int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags,
+                           int first_enq, ldlm_error_t *err,
+                           struct list_head *work_list);
+int ldlm_init_flock_export(struct obd_export *exp);
+void ldlm_destroy_flock_export(struct obd_export *exp);
+
+/* l_lock.c */
+void l_check_ns_lock(struct ldlm_namespace *ns);
+void l_check_no_ns_lock(struct ldlm_namespace *ns);
+
+extern proc_dir_entry_t *ldlm_svc_proc_dir;
+extern proc_dir_entry_t *ldlm_type_proc_dir;
+
+struct ldlm_state {
+       struct ptlrpc_service *ldlm_cb_service;
+       struct ptlrpc_service *ldlm_cancel_service;
+       struct ptlrpc_client *ldlm_client;
+       struct ptlrpc_connection *ldlm_server_conn;
+       struct ldlm_bl_pool *ldlm_bl_pool;
+};
+
+/* interval tree, for LDLM_EXTENT. */
+extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+extern void ldlm_interval_free(struct ldlm_interval *node);
+/* this function must be called with res lock held */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+       struct ldlm_lock *lock;
+
+       LASSERT(!list_empty(&node->li_group));
+
+       lock = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy);
+       return &lock->l_policy_data.l_extent;
+}
+
+int ldlm_init(void);
+void ldlm_exit(void);
+
+enum ldlm_policy_res {
+       LDLM_POLICY_CANCEL_LOCK,
+       LDLM_POLICY_KEEP_LOCK,
+       LDLM_POLICY_SKIP_LOCK
+};
+
+typedef enum ldlm_policy_res ldlm_policy_res_t;
+
+#define LDLM_POOL_PROC_READER_SEQ_SHOW(var, type)                          \
+       static int lprocfs_##var##_seq_show(struct seq_file *m, void *v) \
+       {                                                                   \
+               struct ldlm_pool *pl = m->private;                          \
+               type tmp;                                                   \
+                                                                           \
+               spin_lock(&pl->pl_lock);                                    \
+               tmp = pl->pl_##var;                                         \
+               spin_unlock(&pl->pl_lock);                                  \
+                                                                           \
+               return lprocfs_rd_uint(m, &tmp);                            \
+       }                                                                   \
+       struct __##var##__dummy_read {;} /* semicolon catcher */
+
+#define LDLM_POOL_PROC_WRITER(var, type)                                   \
+       int lprocfs_wr_##var(struct file *file, const char *buffer,         \
+                            unsigned long count, void *data)               \
+       {                                                                   \
+               struct ldlm_pool *pl = data;                                \
+               type tmp;                                                   \
+               int rc;                                                     \
+                                                                           \
+               rc = lprocfs_wr_uint(file, buffer, count, &tmp);            \
+               if (rc < 0) {                                               \
+                       CERROR("Can't parse user input, rc = %d\n", rc);    \
+                       return rc;                                          \
+               }                                                           \
+                                                                           \
+               spin_lock(&pl->pl_lock);                                    \
+               pl->pl_##var = tmp;                                         \
+               spin_unlock(&pl->pl_lock);                                  \
+                                                                           \
+               return rc;                                                  \
+       }                                                                   \
+       struct __##var##__dummy_write {;} /* semicolon catcher */
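+
+/*
+ * Editorial note on the "semicolon catcher" above: the dummy struct lets
+ * each macro instantiation carry a terminating semicolon, e.g. the
+ * hypothetical
+ *
+ *     LDLM_POOL_PROC_READER_SEQ_SHOW(granted, atomic_t);
+ *
+ * expands to a function definition followed by a harmless struct
+ * definition that consumes the trailing semicolon.
+ */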
+
+static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
+{
+       int ret = 0;
+
+       lock_res_and_lock(lock);
+       if (((lock->l_req_mode == lock->l_granted_mode) &&
+            !(lock->l_flags & LDLM_FL_CP_REQD)) ||
+           (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL)))
+               ret = 1;
+       unlock_res_and_lock(lock);
+
+       return ret;
+}
+
+typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *,
+                                           ldlm_policy_data_t *);
+
+typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *,
+                                           ldlm_wire_policy_data_t *);
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
+void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy);
+
+void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
new file mode 100644 (file)
index 0000000..42df530
--- /dev/null
@@ -0,0 +1,868 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/**
+ * This file deals with various client/target related logic including recovery.
+ *
+ * TODO: This code more logically belongs in the ptlrpc module than in ldlm and
+ * should be moved.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <linux/libcfs/libcfs.h>
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ldlm_internal.h"
+
+/* @priority: If non-zero, move the selected connection to the list head.
+ * @create: If zero, only search in existing connections.
+ */
+static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority, int create)
+{
+       struct ptlrpc_connection *ptlrpc_conn;
+       struct obd_import_conn *imp_conn = NULL, *item;
+       int rc = 0;
+       ENTRY;
+
+       if (!create && !priority) {
+               CDEBUG(D_HA, "Nothing to do\n");
+               RETURN(-EINVAL);
+       }
+
+       ptlrpc_conn = ptlrpc_uuid_to_connection(uuid);
+       if (!ptlrpc_conn) {
+               CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid);
+               RETURN(-ENOENT);
+       }
+
+       if (create) {
+               OBD_ALLOC(imp_conn, sizeof(*imp_conn));
+               if (!imp_conn)
+                       GOTO(out_put, rc = -ENOMEM);
+       }
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_entry(item, &imp->imp_conn_list, oic_item) {
+               if (obd_uuid_equals(uuid, &item->oic_uuid)) {
+                       if (priority) {
+                               list_del(&item->oic_item);
+                               list_add(&item->oic_item,
+                                            &imp->imp_conn_list);
+                               item->oic_last_attempt = 0;
+                       }
+                       CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n",
+                              imp, imp->imp_obd->obd_name, uuid->uuid,
+                              (priority ? ", moved to head" : ""));
+                       spin_unlock(&imp->imp_lock);
+                       GOTO(out_free, rc = 0);
+               }
+       }
+       /* No existing import connection found for \a uuid. */
+       if (create) {
+               imp_conn->oic_conn = ptlrpc_conn;
+               imp_conn->oic_uuid = *uuid;
+               imp_conn->oic_last_attempt = 0;
+               if (priority)
+                       list_add(&imp_conn->oic_item, &imp->imp_conn_list);
+               else
+                       list_add_tail(&imp_conn->oic_item,
+                                         &imp->imp_conn_list);
+               CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n",
+                      imp, imp->imp_obd->obd_name, uuid->uuid,
+                      (priority ? "head" : "tail"));
+       } else {
+               spin_unlock(&imp->imp_lock);
+               GOTO(out_free, rc = -ENOENT);
+       }
+
+       spin_unlock(&imp->imp_lock);
+       RETURN(0);
+out_free:
+       if (imp_conn)
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+out_put:
+       ptlrpc_connection_put(ptlrpc_conn);
+       RETURN(rc);
+}
+
+int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       return import_set_conn(imp, uuid, 1, 0);
+}
+
+int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid,
+                          int priority)
+{
+       return import_set_conn(imp, uuid, priority, 1);
+}
+EXPORT_SYMBOL(client_import_add_conn);
+
+int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid)
+{
+       struct obd_import_conn *imp_conn;
+       struct obd_export *dlmexp;
+       int rc = -ENOENT;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (list_empty(&imp->imp_conn_list)) {
+               LASSERT(!imp->imp_connection);
+               GOTO(out, rc);
+       }
+
+       list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) {
+               if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid))
+                       continue;
+               LASSERT(imp_conn->oic_conn);
+
+               if (imp_conn == imp->imp_conn_current) {
+                       LASSERT(imp_conn->oic_conn == imp->imp_connection);
+
+                       if (imp->imp_state != LUSTRE_IMP_CLOSED &&
+                           imp->imp_state != LUSTRE_IMP_DISCON) {
+                               CERROR("can't remove current connection\n");
+                               GOTO(out, rc = -EBUSY);
+                       }
+
+                       ptlrpc_connection_put(imp->imp_connection);
+                       imp->imp_connection = NULL;
+
+                       dlmexp = class_conn2export(&imp->imp_dlm_handle);
+                       if (dlmexp && dlmexp->exp_connection) {
+                               LASSERT(dlmexp->exp_connection ==
+                                       imp_conn->oic_conn);
+                               ptlrpc_connection_put(dlmexp->exp_connection);
+                               dlmexp->exp_connection = NULL;
+                       }
+               }
+
+               list_del(&imp_conn->oic_item);
+               ptlrpc_connection_put(imp_conn->oic_conn);
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+               CDEBUG(D_HA, "imp %p@%s: remove connection %s\n",
+                      imp, imp->imp_obd->obd_name, uuid->uuid);
+               rc = 0;
+               break;
+       }
+out:
+       spin_unlock(&imp->imp_lock);
+       if (rc == -ENOENT)
+               CERROR("connection %s not found\n", uuid->uuid);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_del_conn);
+
+/**
+ * Find conn UUID by peer NID. \a peer is a server NID. This function is used
+ * to find a conn uuid of \a imp which can reach \a peer.
+ */
+int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer,
+                           struct obd_uuid *uuid)
+{
+       struct obd_import_conn *conn;
+       int rc = -ENOENT;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               /* Check if conn UUID does have this peer NID. */
+               if (class_check_uuid(&conn->oic_uuid, peer)) {
+                       *uuid = conn->oic_uuid;
+                       rc = 0;
+                       break;
+               }
+       }
+       spin_unlock(&imp->imp_lock);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_import_find_conn);
+
+void client_destroy_import(struct obd_import *imp)
+{
+       /* Drop security policy instance after all RPCs have finished/aborted
+        * to let all busy contexts be released. */
+       class_import_get(imp);
+       class_destroy_import(imp);
+       sptlrpc_import_sec_put(imp);
+       class_import_put(imp);
+}
+EXPORT_SYMBOL(client_destroy_import);
+
+/**
+ * Check whether or not the OSC is on the MDT.
+ * In the config log, an OSC on the MDT is set up as
+ *     setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID
+ * while an OSC on a client is set up as
+ *     setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID
+ */
+static int osc_on_mdt(char *obdname)
+{
+       char *ptr;
+
+       ptr = strrchr(obdname, '-');
+       if (ptr == NULL)
+               return 0;
+
+       if (strncmp(ptr + 1, "MDT", 3) == 0)
+               return 1;
+
+       return 0;
+}
+
+/* Configure an RPC client OBD device.
+ *
+ * lcfg parameters:
+ * 1 - client UUID
+ * 2 - server UUID
+ * 3 - inactive-on-startup
+ */
+int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+       struct client_obd *cli = &obddev->u.cli;
+       struct obd_import *imp;
+       struct obd_uuid server_uuid;
+       int rq_portal, rp_portal, connect_op;
+       char *name = obddev->obd_type->typ_name;
+       ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN;
+       int rc;
+       char    *cli_name = lustre_cfg_buf(lcfg, 0);
+       ENTRY;
+
+       /* In a more perfect world, we would hang a ptlrpc_client off of
+        * obd_type and just use the values from there. */
+       if (!strcmp(name, LUSTRE_OSC_NAME) ||
+           (!(strcmp(name, LUSTRE_OSP_NAME)) &&
+            (is_osp_on_mdt(cli_name) &&
+              strstr(lustre_cfg_buf(lcfg, 1), "OST") != NULL))) {
+               /* OSC or OSP_on_MDT for OSTs */
+               rq_portal = OST_REQUEST_PORTAL;
+               rp_portal = OSC_REPLY_PORTAL;
+               connect_op = OST_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_CLI;
+               cli->cl_sp_to = LUSTRE_SP_OST;
+               ns_type = LDLM_NS_TYPE_OSC;
+       } else if (!strcmp(name, LUSTRE_MDC_NAME) ||
+                  !strcmp(name, LUSTRE_LWP_NAME) ||
+                  (!strcmp(name, LUSTRE_OSP_NAME) &&
+                   (is_osp_on_mdt(cli_name) &&
+                    strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL))) {
+               /* MDC or OSP_on_MDT for other MDTs */
+               rq_portal = MDS_REQUEST_PORTAL;
+               rp_portal = MDC_REPLY_PORTAL;
+               connect_op = MDS_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_CLI;
+               cli->cl_sp_to = LUSTRE_SP_MDT;
+               ns_type = LDLM_NS_TYPE_MDC;
+       } else if (!strcmp(name, LUSTRE_MGC_NAME)) {
+               rq_portal = MGS_REQUEST_PORTAL;
+               rp_portal = MGC_REPLY_PORTAL;
+               connect_op = MGS_CONNECT;
+               cli->cl_sp_me = LUSTRE_SP_MGC;
+               cli->cl_sp_to = LUSTRE_SP_MGS;
+               cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID;
+               ns_type = LDLM_NS_TYPE_MGC;
+       } else {
+               CERROR("unknown client OBD type \"%s\", can't setup\n",
+                      name);
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("requires a TARGET UUID\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) {
+               CERROR("client UUID must be less than 38 characters\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) {
+               CERROR("setup requires a SERVER UUID\n");
+               RETURN(-EINVAL);
+       }
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) {
+               CERROR("target UUID must be less than 38 characters\n");
+               RETURN(-EINVAL);
+       }
+
+       init_rwsem(&cli->cl_sem);
+       sema_init(&cli->cl_mgc_sem, 1);
+       cli->cl_conn_count = 0;
+       memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2),
+              min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2),
+                    sizeof(server_uuid)));
+
+       cli->cl_dirty = 0;
+       cli->cl_avail_grant = 0;
+       /* FIXME: Should limit this for the sum of all cl_dirty_max. */
+       cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
+       if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > num_physpages / 8)
+               cli->cl_dirty_max = num_physpages << (PAGE_CACHE_SHIFT - 3);
+       INIT_LIST_HEAD(&cli->cl_cache_waiters);
+       INIT_LIST_HEAD(&cli->cl_loi_ready_list);
+       INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
+       INIT_LIST_HEAD(&cli->cl_loi_write_list);
+       INIT_LIST_HEAD(&cli->cl_loi_read_list);
+       client_obd_list_lock_init(&cli->cl_loi_list_lock);
+       atomic_set(&cli->cl_pending_w_pages, 0);
+       atomic_set(&cli->cl_pending_r_pages, 0);
+       cli->cl_r_in_flight = 0;
+       cli->cl_w_in_flight = 0;
+
+       spin_lock_init(&cli->cl_read_rpc_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_rpc_hist.oh_lock);
+       spin_lock_init(&cli->cl_read_page_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_page_hist.oh_lock);
+       spin_lock_init(&cli->cl_read_offset_hist.oh_lock);
+       spin_lock_init(&cli->cl_write_offset_hist.oh_lock);
+
+       /* lru for osc. */
+       INIT_LIST_HEAD(&cli->cl_lru_osc);
+       atomic_set(&cli->cl_lru_shrinkers, 0);
+       atomic_set(&cli->cl_lru_busy, 0);
+       atomic_set(&cli->cl_lru_in_list, 0);
+       INIT_LIST_HEAD(&cli->cl_lru_list);
+       client_obd_list_lock_init(&cli->cl_lru_list_lock);
+
+       init_waitqueue_head(&cli->cl_destroy_waitq);
+       atomic_set(&cli->cl_destroy_in_flight, 0);
+       /* Turn on checksumming by default. */
+       cli->cl_checksum = 1;
+       /*
+        * The supported checksum types will be worked out at connect time.
+        * Set cl_cksum* to CRC32 for now to avoid returning bogus info
+        * through procfs.
+        */
+       cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32;
+       atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
+
+       /* This value may be reduced at connect time in
+        * ptlrpc_connect_interpret(). We initialize it to only 1MB until we
+        * know what the performance looks like. In the future this should
+        * likely be increased. LU-1431 */
+       cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
+                                         LNET_MTU >> PAGE_CACHE_SHIFT);
+
+       if (!strcmp(name, LUSTRE_MDC_NAME)) {
+               cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 2;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 3;
+       } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+               cli->cl_max_rpcs_in_flight = 4;
+       } else {
+               if (osc_on_mdt(obddev->obd_name))
+                       cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT;
+               else
+                       cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
+       }
+       rc = ldlm_get_ref();
+       if (rc) {
+               CERROR("ldlm_get_ref failed: %d\n", rc);
+               GOTO(err, rc);
+       }
+
+       ptlrpc_init_client(rq_portal, rp_portal, name,
+                          &obddev->obd_ldlm_client);
+
+       imp = class_new_import(obddev);
+       if (imp == NULL)
+               GOTO(err_ldlm, rc = -ENOENT);
+       imp->imp_client = &obddev->obd_ldlm_client;
+       imp->imp_connect_op = connect_op;
+       memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+              LUSTRE_CFG_BUFLEN(lcfg, 1));
+       class_import_put(imp);
+
+       rc = client_import_add_conn(imp, &server_uuid, 1);
+       if (rc) {
+               CERROR("can't add initial connection\n");
+               GOTO(err_import, rc);
+       }
+
+       cli->cl_import = imp;
+       /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
+       cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
+       cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
+               if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
+                       CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
+                              name, obddev->obd_name,
+                              cli->cl_target_uuid.uuid);
+                       spin_lock(&imp->imp_lock);
+                       imp->imp_deactive = 1;
+                       spin_unlock(&imp->imp_lock);
+               }
+       }
+
+       obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name,
+                                                  LDLM_NAMESPACE_CLIENT,
+                                                  LDLM_NAMESPACE_GREEDY,
+                                                  ns_type);
+       if (obddev->obd_namespace == NULL) {
+               CERROR("Unable to create client namespace - %s\n",
+                      obddev->obd_name);
+               GOTO(err_import, rc = -ENOMEM);
+       }
+
+       cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
+
+       RETURN(rc);
+
+err_import:
+       class_destroy_import(imp);
+err_ldlm:
+       ldlm_put_ref();
+err:
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_obd_setup);
+
+int client_obd_cleanup(struct obd_device *obddev)
+{
+       ENTRY;
+
+       ldlm_namespace_free_post(obddev->obd_namespace);
+       obddev->obd_namespace = NULL;
+
+       LASSERT(obddev->u.cli.cl_import == NULL);
+
+       ldlm_put_ref();
+       RETURN(0);
+}
+EXPORT_SYMBOL(client_obd_cleanup);
+
+/* ->o_connect() method for client side (OSC and MDC and MGC) */
+int client_connect_import(const struct lu_env *env,
+                         struct obd_export **exp,
+                         struct obd_device *obd, struct obd_uuid *cluuid,
+                         struct obd_connect_data *data, void *localdata)
+{
+       struct client_obd       *cli    = &obd->u.cli;
+       struct obd_import       *imp    = cli->cl_import;
+       struct obd_connect_data *ocd;
+       struct lustre_handle     conn   = { 0 };
+       int                      rc;
+       ENTRY;
+
+       *exp = NULL;
+       down_write(&cli->cl_sem);
+       if (cli->cl_conn_count > 0)
+               GOTO(out_sem, rc = -EALREADY);
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc)
+               GOTO(out_sem, rc);
+
+       cli->cl_conn_count++;
+       *exp = class_conn2export(&conn);
+
+       LASSERT(obd->obd_namespace);
+
+       imp->imp_dlm_handle = conn;
+       rc = ptlrpc_init_import(imp);
+       if (rc != 0)
+               GOTO(out_ldlm, rc);
+
+       ocd = &imp->imp_connect_data;
+       if (data) {
+               *ocd = *data;
+               imp->imp_connect_flags_orig = data->ocd_connect_flags;
+       }
+
+       rc = ptlrpc_connect_import(imp);
+       if (rc != 0) {
+               LASSERT(imp->imp_state == LUSTRE_IMP_DISCON);
+               GOTO(out_ldlm, rc);
+       }
+       LASSERT((*exp)->exp_connection);
+
+       if (data) {
+               LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) ==
+                        ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n",
+                        data->ocd_connect_flags, ocd->ocd_connect_flags);
+               data->ocd_connect_flags = ocd->ocd_connect_flags;
+       }
+
+       ptlrpc_pinger_add_import(imp);
+
+       EXIT;
+
+       if (rc) {
+out_ldlm:
+               cli->cl_conn_count--;
+               class_disconnect(*exp);
+               *exp = NULL;
+       }
+out_sem:
+       up_write(&cli->cl_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(client_connect_import);
+
+int client_disconnect_export(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct client_obd *cli;
+       struct obd_import *imp;
+       int rc = 0, err;
+       ENTRY;
+
+       if (!obd) {
+               CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n",
+                      exp, exp ? exp->exp_handle.h_cookie : -1);
+               RETURN(-EINVAL);
+       }
+
+       cli = &obd->u.cli;
+       imp = cli->cl_import;
+
+       down_write(&cli->cl_sem);
+       CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name,
+              cli->cl_conn_count);
+
+       if (!cli->cl_conn_count) {
+               CERROR("disconnecting disconnected device (%s)\n",
+                      obd->obd_name);
+               GOTO(out_disconnect, rc = -EINVAL);
+       }
+
+       cli->cl_conn_count--;
+       if (cli->cl_conn_count)
+               GOTO(out_disconnect, rc = 0);
+
+       /* Mark import deactivated now, so we don't try to reconnect if any
+        * of the cleanup RPCs fails (e.g. LDLM cancel, etc).  We don't
+        * fully deactivate the import, or that would drop all requests. */
+       spin_lock(&imp->imp_lock);
+       imp->imp_deactive = 1;
+       spin_unlock(&imp->imp_lock);
+
+       /* Some non-replayable imports (MDS's OSCs) are pinged, so just
+        * delete it regardless.  (It's safe to delete an import that was
+        * never added.) */
+       (void)ptlrpc_pinger_del_import(imp);
+
+       if (obd->obd_namespace != NULL) {
+               /* obd_force == local only */
+               ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+                                      obd->obd_force ? LCF_LOCAL : 0, NULL);
+               ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force);
+       }
+
+       /* There's no need to hold sem while disconnecting an import,
+        * and it may actually cause deadlock in GSS. */
+       up_write(&cli->cl_sem);
+       rc = ptlrpc_disconnect_import(imp, 0);
+       down_write(&cli->cl_sem);
+
+       ptlrpc_invalidate_import(imp);
+
+       EXIT;
+
+out_disconnect:
+       /* Use server style - class_disconnect should be always called for
+        * o_disconnect. */
+       err = class_disconnect(exp);
+       if (!rc && err)
+               rc = err;
+
+       up_write(&cli->cl_sem);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(client_disconnect_export);
+
+/**
+ * Packs current SLV and Limit into \a req.
+ */
+int target_pack_pool_reply(struct ptlrpc_request *req)
+{
+       struct obd_device *obd;
+       ENTRY;
+
+       /* Check that we still have all structures alive as this may
+        * be some late RPC at shutdown time. */
+       if (unlikely(!req->rq_export || !req->rq_export->exp_obd ||
+                    !exp_connect_lru_resize(req->rq_export))) {
+               lustre_msg_set_slv(req->rq_repmsg, 0);
+               lustre_msg_set_limit(req->rq_repmsg, 0);
+               RETURN(0);
+       }
+
+       /* OBD is alive here as export is alive, which we checked above. */
+       obd = req->rq_export->exp_obd;
+
+       read_lock(&obd->obd_pool_lock);
+       lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv);
+       lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit);
+       read_unlock(&obd->obd_pool_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(target_pack_pool_reply);
+
+int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id)
+{
+       if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) {
+               DEBUG_REQ(D_ERROR, req, "dropping reply");
+               return (-ECOMM);
+       }
+
+       if (unlikely(rc)) {
+               DEBUG_REQ(D_NET, req, "processing error (%d)", rc);
+               req->rq_status = rc;
+               return (ptlrpc_send_error(req, 1));
+       } else {
+               DEBUG_REQ(D_NET, req, "sending reply");
+       }
+
+       return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT));
+}
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+       struct ptlrpc_service_part *svcpt;
+       int netrc;
+       struct ptlrpc_reply_state *rs;
+       struct obd_export *exp;
+       ENTRY;
+
+       if (req->rq_no_reply) {
+               EXIT;
+               return;
+       }
+
+       svcpt = req->rq_rqbd->rqbd_svcpt;
+       rs = req->rq_reply_state;
+       if (rs == NULL || !rs->rs_difficult) {
+               /* no notifiers */
+               target_send_reply_msg(req, rc, fail_id);
+               EXIT;
+               return;
+       }
+
+       /* must be an export if locks saved */
+       LASSERT(req->rq_export != NULL);
+       /* req/reply consistent */
+       LASSERT(rs->rs_svcpt == svcpt);
+
+       /* "fresh" reply */
+       LASSERT(!rs->rs_scheduled);
+       LASSERT(!rs->rs_scheduled_ever);
+       LASSERT(!rs->rs_handled);
+       LASSERT(!rs->rs_on_net);
+       LASSERT(rs->rs_export == NULL);
+       LASSERT(list_empty(&rs->rs_obd_list));
+       LASSERT(list_empty(&rs->rs_exp_list));
+
+       exp = class_export_get(req->rq_export);
+
+       /* disable reply scheduling while I'm setting up */
+       rs->rs_scheduled = 1;
+       rs->rs_on_net    = 1;
+       rs->rs_xid       = req->rq_xid;
+       rs->rs_transno   = req->rq_transno;
+       rs->rs_export    = exp;
+       rs->rs_opc       = lustre_msg_get_opc(req->rq_reqmsg);
+
+       spin_lock(&exp->exp_uncommitted_replies_lock);
+       CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
+              rs->rs_transno, exp->exp_last_committed);
+       if (rs->rs_transno > exp->exp_last_committed) {
+               /* not committed already */
+               list_add_tail(&rs->rs_obd_list,
+                                 &exp->exp_uncommitted_replies);
+       }
+       spin_unlock(&exp->exp_uncommitted_replies_lock);
+
+       spin_lock(&exp->exp_lock);
+       list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies);
+       spin_unlock(&exp->exp_lock);
+
+       netrc = target_send_reply_msg(req, rc, fail_id);
+
+       spin_lock(&svcpt->scp_rep_lock);
+
+       atomic_inc(&svcpt->scp_nreps_difficult);
+
+       if (netrc != 0) {
+               /* error sending: reply is off the net.  Also we need +1
+                * reply ref until ptlrpc_handle_rs() is done
+                * with the reply state (if the send was successful, there
+                * would have been +1 ref for the net, which
+                * reply_out_callback leaves alone) */
+               rs->rs_on_net = 0;
+               ptlrpc_rs_addref(rs);
+       }
+
+       spin_lock(&rs->rs_lock);
+       if (rs->rs_transno <= exp->exp_last_committed ||
+           (!rs->rs_on_net && !rs->rs_no_ack) ||
+           list_empty(&rs->rs_exp_list) ||     /* completed already */
+           list_empty(&rs->rs_obd_list)) {
+               CDEBUG(D_HA, "Schedule reply immediately\n");
+               ptlrpc_dispatch_difficult_reply(rs);
+       } else {
+               list_add(&rs->rs_list, &svcpt->scp_rep_active);
+               rs->rs_scheduled = 0;   /* allow notifier to schedule */
+       }
+       spin_unlock(&rs->rs_lock);
+       spin_unlock(&svcpt->scp_rep_lock);
+       EXIT;
+}
+EXPORT_SYMBOL(target_send_reply);
+
+ldlm_mode_t lck_compat_array[] = {
+       [LCK_EX]    = LCK_COMPAT_EX,
+       [LCK_PW]    = LCK_COMPAT_PW,
+       [LCK_PR]    = LCK_COMPAT_PR,
+       [LCK_CW]    = LCK_COMPAT_CW,
+       [LCK_CR]    = LCK_COMPAT_CR,
+       [LCK_NL]    = LCK_COMPAT_NL,
+       [LCK_GROUP] = LCK_COMPAT_GROUP,
+       [LCK_COS]   = LCK_COMPAT_COS,
+};
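+
+/*
+ * Editorial sketch, not part of the original file: how the table above is
+ * consulted. Each LCK_* mode is a distinct bit, so a granted mode is
+ * compatible with a requested mode iff the requested mode's bit is set in
+ * the granted mode's compat mask (this mirrors lockmode_compat() in
+ * lustre_dlm.h; the helper name here is hypothetical).
+ */
+static inline int lck_mode_is_compat(ldlm_mode_t granted, ldlm_mode_t req)
+{
+       return (lck_compat_array[granted] & req) != 0;
+}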
+
+/**
+ * Rather arbitrary mapping from LDLM error codes to errno values. This should
+ * not escape to the user level.
+ */
+int ldlm_error2errno(ldlm_error_t error)
+{
+       int result;
+
+       switch (error) {
+       case ELDLM_OK:
+               result = 0;
+               break;
+       case ELDLM_LOCK_CHANGED:
+               result = -ESTALE;
+               break;
+       case ELDLM_LOCK_ABORTED:
+               result = -ENAVAIL;
+               break;
+       case ELDLM_LOCK_REPLACED:
+               result = -ESRCH;
+               break;
+       case ELDLM_NO_LOCK_DATA:
+               result = -ENOENT;
+               break;
+       case ELDLM_NAMESPACE_EXISTS:
+               result = -EEXIST;
+               break;
+       case ELDLM_BAD_NAMESPACE:
+               result = -EBADF;
+               break;
+       default:
+               if (((int)error) < 0) {
+                       /* Cast to signed type, as ldlm_error_t can be
+                        * unsigned. */
+                       result = error;
+               } else {
+                       CERROR("Invalid DLM result code: %d\n", error);
+                       result = -EPROTO;
+               }
+       }
+       return result;
+}
+EXPORT_SYMBOL(ldlm_error2errno);
+
+/**
+ * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t.
+ */
+ldlm_error_t ldlm_errno2error(int err_no)
+{
+       int error;
+
+       switch (err_no) {
+       case 0:
+               error = ELDLM_OK;
+               break;
+       case -ESTALE:
+               error = ELDLM_LOCK_CHANGED;
+               break;
+       case -ENAVAIL:
+               error = ELDLM_LOCK_ABORTED;
+               break;
+       case -ESRCH:
+               error = ELDLM_LOCK_REPLACED;
+               break;
+       case -ENOENT:
+               error = ELDLM_NO_LOCK_DATA;
+               break;
+       case -EEXIST:
+               error = ELDLM_NAMESPACE_EXISTS;
+               break;
+       case -EBADF:
+               error = ELDLM_BAD_NAMESPACE;
+               break;
+       default:
+               error = err_no;
+       }
+       return error;
+}
+EXPORT_SYMBOL(ldlm_errno2error);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void ldlm_dump_export_locks(struct obd_export *exp)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+       if (!list_empty(&exp->exp_locks_list)) {
+               struct ldlm_lock *lock;
+
+               CERROR("dumping locks for export %p,"
+                      "ignore if the unmount doesn't hang\n", exp);
+               list_for_each_entry(lock, &exp->exp_locks_list,
+                                       l_exp_refs_link)
+                       LDLM_ERROR(lock, "lock:");
+       }
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
new file mode 100644 (file)
index 0000000..33b76a1
--- /dev/null
@@ -0,0 +1,2429 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lock.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lustre_intent.h>
+
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+/* lock types */
+char *ldlm_lockname[] = {
+       [0]          = "--",
+       [LCK_EX]     = "EX",
+       [LCK_PW]     = "PW",
+       [LCK_PR]     = "PR",
+       [LCK_CW]     = "CW",
+       [LCK_CR]     = "CR",
+       [LCK_NL]     = "NL",
+       [LCK_GROUP]  = "GROUP",
+       [LCK_COS]    = "COS"
+};
+EXPORT_SYMBOL(ldlm_lockname);
+
+char *ldlm_typename[] = {
+       [LDLM_PLAIN]  = "PLN",
+       [LDLM_EXTENT] = "EXT",
+       [LDLM_FLOCK]  = "FLK",
+       [LDLM_IBITS]  = "IBT",
+};
+EXPORT_SYMBOL(ldlm_typename);
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+       [LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire18_to_local,
+       [LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_wire_to_local,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local,
+       [LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_wire21_to_local,
+       [LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_wire_to_local,
+};
+
+static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = {
+       [LDLM_PLAIN - LDLM_MIN_TYPE]  = ldlm_plain_policy_local_to_wire,
+       [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire,
+       [LDLM_FLOCK - LDLM_MIN_TYPE]  = ldlm_flock_policy_local_to_wire,
+       [LDLM_IBITS - LDLM_MIN_TYPE]  = ldlm_ibits_policy_local_to_wire,
+};
+
+/**
+ * Converts a lock policy from the local format to the on-the-wire
+ * lock_desc format.
+ */
+void ldlm_convert_policy_to_wire(ldlm_type_t type,
+                                const ldlm_policy_data_t *lpolicy,
+                                ldlm_wire_policy_data_t *wpolicy)
+{
+       ldlm_policy_local_to_wire_t convert;
+
+       convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE];
+
+       convert(lpolicy, wpolicy);
+}
+
+/**
+ * Converts a lock policy from the on-the-wire lock_desc format to the
+ * local format.
+ */
+void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type,
+                                 const ldlm_wire_policy_data_t *wpolicy,
+                                 ldlm_policy_data_t *lpolicy)
+{
+       ldlm_policy_wire_to_local_t convert;
+       int new_client;
+
+       /* some badness for 2.0.0 clients, but 2.0.0 isn't supported */
+       new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0;
+       if (new_client)
+               convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE];
+       else
+               convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE];
+
+       convert(wpolicy, lpolicy);
+}
+
+char *ldlm_it2str(int it)
+{
+       switch (it) {
+       case IT_OPEN:
+               return "open";
+       case IT_CREAT:
+               return "creat";
+       case (IT_OPEN | IT_CREAT):
+               return "open|creat";
+       case IT_READDIR:
+               return "readdir";
+       case IT_GETATTR:
+               return "getattr";
+       case IT_LOOKUP:
+               return "lookup";
+       case IT_UNLINK:
+               return "unlink";
+       case IT_GETXATTR:
+               return "getxattr";
+       case IT_LAYOUT:
+               return "layout";
+       default:
+               CERROR("Unknown intent %d\n", it);
+               return "UNKNOWN";
+       }
+}
+EXPORT_SYMBOL(ldlm_it2str);
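+
+/*
+ * For example, ldlm_it2str(IT_OPEN | IT_CREAT) returns "open|creat", while
+ * any intent value not handled above logs a CERROR and returns "UNKNOWN".
+ */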
+
+extern struct kmem_cache *ldlm_lock_slab;
+
+void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg)
+{
+       ns->ns_policy = arg;
+}
+EXPORT_SYMBOL(ldlm_register_intent);
+
+/*
+ * REFCOUNTED LOCK OBJECTS
+ */
+
+/**
+ * Get a reference on a lock.
+ *
+ * Lock refcounts, during creation:
+ *   - one special one for allocation, dec'd only once in destroy
+ *   - one for being a lock that's in-use
+ *   - one for the addref associated with a new lock
+ */
+struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock)
+{
+       atomic_inc(&lock->l_refc);
+       return lock;
+}
+EXPORT_SYMBOL(ldlm_lock_get);
+
+/**
+ * Release lock reference.
+ *
+ * Also frees the lock if it was the last reference.
+ */
+void ldlm_lock_put(struct ldlm_lock *lock)
+{
+       ENTRY;
+
+       LASSERT(lock->l_resource != LP_POISON);
+       LASSERT(atomic_read(&lock->l_refc) > 0);
+       if (atomic_dec_and_test(&lock->l_refc)) {
+               struct ldlm_resource *res;
+
+               LDLM_DEBUG(lock,
+                          "final lock_put on destroyed lock, freeing it.");
+
+               res = lock->l_resource;
+               LASSERT(lock->l_destroyed);
+               LASSERT(list_empty(&lock->l_res_link));
+               LASSERT(list_empty(&lock->l_pending_chain));
+
+               lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats,
+                                    LDLM_NSS_LOCKS);
+               lu_ref_del(&res->lr_reference, "lock", lock);
+               ldlm_resource_putref(res);
+               lock->l_resource = NULL;
+               if (lock->l_export) {
+                       class_export_lock_put(lock->l_export, lock);
+                       lock->l_export = NULL;
+               }
+
+               if (lock->l_lvb_data != NULL)
+                       OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
+
+               ldlm_interval_free(ldlm_interval_detach(lock));
+               lu_ref_fini(&lock->l_reference);
+               OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_put);
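+
+/*
+ * A minimal sketch of the reference discipline (the caller shown is
+ * hypothetical): every LDLM_LOCK_GET() must eventually be balanced by an
+ * LDLM_LOCK_PUT(), and the final put frees the (already destroyed) lock.
+ *
+ *      struct ldlm_lock *lock = LDLM_LOCK_GET(some_lock);
+ *
+ *      ... use the lock while the reference is held ...
+ *      LDLM_LOCK_PUT(lock);
+ */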
+
+/**
+ * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked.
+ */
+int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock)
+{
+       int rc = 0;
+
+       if (!list_empty(&lock->l_lru)) {
+               struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+               LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+               list_del_init(&lock->l_lru);
+               if (lock->l_flags & LDLM_FL_SKIPPED)
+                       lock->l_flags &= ~LDLM_FL_SKIPPED;
+               LASSERT(ns->ns_nr_unused > 0);
+               ns->ns_nr_unused--;
+               rc = 1;
+       }
+       return rc;
+}
+
+/**
+ * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first.
+ */
+int ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+       int rc;
+
+       ENTRY;
+       if (lock->l_ns_srv) {
+               LASSERT(list_empty(&lock->l_lru));
+               RETURN(0);
+       }
+
+       spin_lock(&ns->ns_lock);
+       rc = ldlm_lock_remove_from_lru_nolock(lock);
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+       return rc;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked.
+ */
+void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       lock->l_last_used = cfs_time_current();
+       LASSERT(list_empty(&lock->l_lru));
+       LASSERT(lock->l_resource->lr_type != LDLM_FLOCK);
+       list_add_tail(&lock->l_lru, &ns->ns_unused_list);
+       LASSERT(ns->ns_nr_unused >= 0);
+       ns->ns_nr_unused++;
+}
+
+/**
+ * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks
+ * first.
+ */
+void ldlm_lock_add_to_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       ENTRY;
+       spin_lock(&ns->ns_lock);
+       ldlm_lock_add_to_lru_nolock(lock);
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+}
+
+/**
+ * Moves LDLM lock \a lock that is already in namespace LRU to the tail of
+ * the LRU. Performs the necessary LRU locking.
+ */
+void ldlm_lock_touch_in_lru(struct ldlm_lock *lock)
+{
+       struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
+
+       ENTRY;
+       if (lock->l_ns_srv) {
+               LASSERT(list_empty(&lock->l_lru));
+               EXIT;
+               return;
+       }
+
+       spin_lock(&ns->ns_lock);
+       if (!list_empty(&lock->l_lru)) {
+               ldlm_lock_remove_from_lru_nolock(lock);
+               ldlm_lock_add_to_lru_nolock(lock);
+       }
+       spin_unlock(&ns->ns_lock);
+       EXIT;
+}
+
+/**
+ * Helper to destroy a locked lock.
+ *
+ * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock.
+ * Must be called with l_lock and lr_lock held.
+ *
+ * Does not actually free the lock data, but rather marks the lock as
+ * destroyed by setting the l_destroyed field in the lock to 1.  Destroys the
+ * handle->lock association too, so that the lock can no longer be found,
+ * and removes the lock from the LRU list.  The actual lock freeing occurs
+ * when the last lock reference goes away.
+ *
+ * Original comment (of some historical value):
+ * This used to have a 'strict' flag, which recovery would use to mark an
+ * in-use lock as needing-to-die.  Lest I am ever tempted to put it back, I
+ * shall explain why it's gone: with the new hash table scheme, once you call
+ * ldlm_lock_destroy, you can never drop your final references on this lock.
+ * Because it's not in the hash table anymore.  -phil
+ */
+int ldlm_lock_destroy_internal(struct ldlm_lock *lock)
+{
+       ENTRY;
+
+       if (lock->l_readers || lock->l_writers) {
+               LDLM_ERROR(lock, "lock still has references");
+               LBUG();
+       }
+
+       if (!list_empty(&lock->l_res_link)) {
+               LDLM_ERROR(lock, "lock still on resource");
+               LBUG();
+       }
+
+       if (lock->l_destroyed) {
+               LASSERT(list_empty(&lock->l_lru));
+               EXIT;
+               return 0;
+       }
+       lock->l_destroyed = 1;
+
+       if (lock->l_export && lock->l_export->exp_lock_hash) {
+               /* NB: it's safe to call cfs_hash_del() even if the lock
+                * isn't in exp_lock_hash. */
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_del(lock->l_export->exp_lock_hash,
+                            &lock->l_remote_handle, &lock->l_exp_hash);
+       }
+
+       ldlm_lock_remove_from_lru(lock);
+       class_handle_unhash(&lock->l_handle);
+
+#if 0
+       /* Wake anyone waiting for this lock */
+       /* FIXME: I should probably add yet another flag, instead of using
+        * l_export to only call this on clients */
+       if (lock->l_export && lock->l_completion_ast)
+               lock->l_completion_ast(lock, 0);
+       if (lock->l_export)
+               class_export_put(lock->l_export);
+       lock->l_export = NULL;
+#endif
+       EXIT;
+       return 1;
+}
+
+/**
+ * Destroys an LDLM lock \a lock. Performs necessary locking first.
+ */
+void ldlm_lock_destroy(struct ldlm_lock *lock)
+{
+       int first;
+       ENTRY;
+       lock_res_and_lock(lock);
+       first = ldlm_lock_destroy_internal(lock);
+       unlock_res_and_lock(lock);
+
+       /* drop reference from hashtable only for first destroy */
+       if (first) {
+               lu_ref_del(&lock->l_reference, "hash", lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+       EXIT;
+}
+
+/**
+ * Destroys an LDLM lock \a lock that is already locked.
+ */
+void ldlm_lock_destroy_nolock(struct ldlm_lock *lock)
+{
+       int first;
+       ENTRY;
+       first = ldlm_lock_destroy_internal(lock);
+       /* drop reference from hashtable only for first destroy */
+       if (first) {
+               lu_ref_del(&lock->l_reference, "hash", lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+       EXIT;
+}
+
+/* This is called by portals_handle2object with the handle lock taken. */
+static void lock_handle_addref(void *lock)
+{
+       LDLM_LOCK_GET((struct ldlm_lock *)lock);
+}
+
+static void lock_handle_free(void *lock, int size)
+{
+       LASSERT(size == sizeof(struct ldlm_lock));
+       OBD_SLAB_FREE(lock, ldlm_lock_slab, size);
+}
+
+struct portals_handle_ops lock_handle_ops = {
+       .hop_addref = lock_handle_addref,
+       .hop_free   = lock_handle_free,
+};
+
+/**
+ * Allocate and initialize a new lock structure.
+ *
+ * Usage: pass in a resource on which ldlm_resource_get() has been done;
+ * the new lock will take over that refcount.
+ *
+ * \retval lock with refcount 2 - one for the current caller and one for remote
+ */
+static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       if (resource == NULL)
+               LBUG();
+
+       OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, __GFP_IO);
+       if (lock == NULL)
+               RETURN(NULL);
+
+       spin_lock_init(&lock->l_lock);
+       lock->l_resource = resource;
+       lu_ref_add(&resource->lr_reference, "lock", lock);
+
+       atomic_set(&lock->l_refc, 2);
+       INIT_LIST_HEAD(&lock->l_res_link);
+       INIT_LIST_HEAD(&lock->l_lru);
+       INIT_LIST_HEAD(&lock->l_pending_chain);
+       INIT_LIST_HEAD(&lock->l_bl_ast);
+       INIT_LIST_HEAD(&lock->l_cp_ast);
+       INIT_LIST_HEAD(&lock->l_rk_ast);
+       init_waitqueue_head(&lock->l_waitq);
+       lock->l_blocking_lock = NULL;
+       INIT_LIST_HEAD(&lock->l_sl_mode);
+       INIT_LIST_HEAD(&lock->l_sl_policy);
+       INIT_HLIST_NODE(&lock->l_exp_hash);
+       INIT_HLIST_NODE(&lock->l_exp_flock_hash);
+
+       lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats,
+                            LDLM_NSS_LOCKS);
+       INIT_LIST_HEAD(&lock->l_handle.h_link);
+       class_handle_hash(&lock->l_handle, &lock_handle_ops);
+
+       lu_ref_init(&lock->l_reference);
+       lu_ref_add(&lock->l_reference, "hash", lock);
+       lock->l_callback_timeout = 0;
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       INIT_LIST_HEAD(&lock->l_exp_refs_link);
+       lock->l_exp_refs_nr = 0;
+       lock->l_exp_refs_target = NULL;
+#endif
+       INIT_LIST_HEAD(&lock->l_exp_list);
+
+       RETURN(lock);
+}
+
+/**
+ * Moves LDLM lock \a lock to another resource.
+ * This is used on the client when the server returns a lock different from
+ * the one requested (typically as a result of an intent operation).
+ */
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                             const struct ldlm_res_id *new_resid)
+{
+       struct ldlm_resource *oldres = lock->l_resource;
+       struct ldlm_resource *newres;
+       int type;
+       ENTRY;
+
+       LASSERT(ns_is_client(ns));
+
+       lock_res_and_lock(lock);
+       if (memcmp(new_resid, &lock->l_resource->lr_name,
+                  sizeof(lock->l_resource->lr_name)) == 0) {
+               /* Nothing to do */
+               unlock_res_and_lock(lock);
+               RETURN(0);
+       }
+
+       LASSERT(new_resid->name[0] != 0);
+
+       /* This function assumes that the lock isn't on any lists */
+       LASSERT(list_empty(&lock->l_res_link));
+
+       type = oldres->lr_type;
+       unlock_res_and_lock(lock);
+
+       newres = ldlm_resource_get(ns, NULL, new_resid, type, 1);
+       if (newres == NULL)
+               RETURN(-ENOMEM);
+
+       lu_ref_add(&newres->lr_reference, "lock", lock);
+       /*
+        * To flip the lock from the old to the new resource, lock, oldres and
+        * newres have to be locked. Resource spin-locks are nested within
+        * lock->l_lock, and are taken in the memory address order to avoid
+        * dead-locks.
+        */
+       spin_lock(&lock->l_lock);
+       oldres = lock->l_resource;
+       if (oldres < newres) {
+               lock_res(oldres);
+               lock_res_nested(newres, LRT_NEW);
+       } else {
+               lock_res(newres);
+               lock_res_nested(oldres, LRT_NEW);
+       }
+       LASSERT(memcmp(new_resid, &oldres->lr_name,
+                      sizeof(oldres->lr_name)) != 0);
+       lock->l_resource = newres;
+       unlock_res(oldres);
+       unlock_res_and_lock(lock);
+
+       /* ...and the flowers are still standing! */
+       lu_ref_del(&oldres->lr_reference, "lock", lock);
+       ldlm_resource_putref(oldres);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_lock_change_resource);
+
+/** \defgroup ldlm_handles LDLM HANDLES
+ * Ways to get hold of locks without any addresses.
+ * @{
+ */
+
+/**
+ * Fills in the handle for LDLM lock \a lock into the supplied \a lockh.
+ * Does not take any references.
+ */
+void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh)
+{
+       lockh->cookie = lock->l_handle.h_cookie;
+}
+EXPORT_SYMBOL(ldlm_lock2handle);
+
+/**
+ * Obtain a lock reference by handle.
+ *
+ * If \a flags is non-zero: atomically get the lock and set the flags;
+ * return NULL if any of the flags is already set.
+ */
+struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle,
+                                    __u64 flags)
+{
+       struct ldlm_lock *lock;
+       ENTRY;
+
+       LASSERT(handle);
+
+       lock = class_handle2object(handle->cookie);
+       if (lock == NULL)
+               RETURN(NULL);
+
+       /* It's unlikely but possible that someone marked the lock as
+        * destroyed after we did handle2object on it */
+       if (flags == 0 && !lock->l_destroyed) {
+               lu_ref_add(&lock->l_reference, "handle", current);
+               RETURN(lock);
+       }
+
+       lock_res_and_lock(lock);
+
+       LASSERT(lock->l_resource != NULL);
+
+       lu_ref_add_atomic(&lock->l_reference, "handle", current);
+       if (unlikely(lock->l_destroyed)) {
+               unlock_res_and_lock(lock);
+               CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock);
+               LDLM_LOCK_PUT(lock);
+               RETURN(NULL);
+       }
+
+       if (flags && (lock->l_flags & flags)) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+               RETURN(NULL);
+       }
+
+       if (flags)
+               lock->l_flags |= flags;
+
+       unlock_res_and_lock(lock);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(__ldlm_handle2lock);
+/** @} ldlm_handles */
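+
+/*
+ * Handle round trip, as an illustrative sketch: a lock is flattened into a
+ * lustre_handle cookie with ldlm_lock2handle() and later resolved back to a
+ * referenced lock (error handling is schematic; names are hypothetical).
+ *
+ *      struct lustre_handle lockh;
+ *
+ *      ldlm_lock2handle(lock, &lockh);
+ *      ...
+ *      lock = __ldlm_handle2lock(&lockh, 0);
+ *      if (lock != NULL) {
+ *              ... a reference is held on the lock here ...
+ *              LDLM_LOCK_PUT(lock);
+ *      }
+ */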
+
+/**
+ * Fill in "on the wire" representation for given LDLM lock into supplied
+ * lock descriptor \a desc structure.
+ */
+void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
+{
+       struct obd_export *exp = lock->l_export ?: lock->l_conn_export;
+
+       /* INODEBITS_INTEROP: If the other side does not support
+        * inodebits, reply with a plain lock descriptor. */
+       if ((lock->l_resource->lr_type == LDLM_IBITS) &&
+           (exp && !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) {
+               /* Make sure all the right bits are set in this lock we
+                  are going to pass to the client */
+               LASSERTF(lock->l_policy_data.l_inodebits.bits ==
+                        (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                         MDS_INODELOCK_LAYOUT),
+                        "Inappropriate inode lock bits during "
+                        "conversion " LPU64 "\n",
+                        lock->l_policy_data.l_inodebits.bits);
+
+               ldlm_res2desc(lock->l_resource, &desc->l_resource);
+               desc->l_resource.lr_type = LDLM_PLAIN;
+
+               /* Convert "new" lock mode to something an old client can
+                  understand */
+               if ((lock->l_req_mode == LCK_CR) ||
+                   (lock->l_req_mode == LCK_CW))
+                       desc->l_req_mode = LCK_PR;
+               else
+                       desc->l_req_mode = lock->l_req_mode;
+               if ((lock->l_granted_mode == LCK_CR) ||
+                   (lock->l_granted_mode == LCK_CW)) {
+                       desc->l_granted_mode = LCK_PR;
+               } else {
+                       /* We never grant PW/EX locks to clients */
+                       LASSERT((lock->l_granted_mode != LCK_PW) &&
+                               (lock->l_granted_mode != LCK_EX));
+                       desc->l_granted_mode = lock->l_granted_mode;
+               }
+
+               /* We do not copy policy here, because there is no
+                  policy for plain locks */
+       } else {
+               ldlm_res2desc(lock->l_resource, &desc->l_resource);
+               desc->l_req_mode = lock->l_req_mode;
+               desc->l_granted_mode = lock->l_granted_mode;
+               ldlm_convert_policy_to_wire(lock->l_resource->lr_type,
+                                           &lock->l_policy_data,
+                                           &desc->l_policy_data);
+       }
+}
+EXPORT_SYMBOL(ldlm_lock2desc);
+
+/**
+ * Add a lock to list of conflicting locks to send AST to.
+ *
+ * Only add if we have not sent a blocking AST to the lock yet.
+ */
+void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                          struct list_head *work_list)
+{
+       if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) {
+               LDLM_DEBUG(lock, "lock incompatible; sending blocking AST.");
+               lock->l_flags |= LDLM_FL_AST_SENT;
+               /* If the enqueuing client said so, tell the AST recipient to
+                * discard dirty data, rather than writing back. */
+               if (new->l_flags & LDLM_AST_DISCARD_DATA)
+                       lock->l_flags |= LDLM_FL_DISCARD_DATA;
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, work_list);
+               LDLM_LOCK_GET(lock);
+               LASSERT(lock->l_blocking_lock == NULL);
+               lock->l_blocking_lock = LDLM_LOCK_GET(new);
+       }
+}
+
+/**
+ * Add a lock to list of just granted locks to send completion AST to.
+ */
+void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list)
+{
+       if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) {
+               lock->l_flags |= LDLM_FL_CP_REQD;
+               LDLM_DEBUG(lock, "lock granted; sending completion AST.");
+               LASSERT(list_empty(&lock->l_cp_ast));
+               list_add(&lock->l_cp_ast, work_list);
+               LDLM_LOCK_GET(lock);
+       }
+}
+
+/**
+ * Aggregator function to add AST work items into a list. Determines
+ * what sort of an AST work needs to be done and calls the proper
+ * adding function.
+ * Must be called with lr_lock held.
+ */
+void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
+                           struct list_head *work_list)
+{
+       ENTRY;
+       check_res_locked(lock->l_resource);
+       if (new)
+               ldlm_add_bl_work_item(lock, new, work_list);
+       else
+               ldlm_add_cp_work_item(lock, work_list);
+       EXIT;
+}
+
+/**
+ * Add specified reader/writer reference to LDLM lock with handle \a lockh.
+ * The r/w reference type is determined by \a mode.
+ * Calls ldlm_lock_addref_internal.
+ */
+void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock;
+
+       lock = ldlm_handle2lock(lockh);
+       LASSERT(lock != NULL);
+       ldlm_lock_addref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_addref);
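+
+/*
+ * Hedged usage sketch: each ldlm_lock_addref() on a handle is balanced by
+ * an ldlm_lock_decref() (defined below) with the same mode.
+ *
+ *      ldlm_lock_addref(&lockh, LCK_PR);
+ *      ... read under the PR reference ...
+ *      ldlm_lock_decref(&lockh, LCK_PR);
+ */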
+
+/**
+ * Helper function.
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * The r/w reference type is determined by \a mode.
+ * Removes the lock from the LRU if it is there.
+ * Assumes the LDLM lock is already locked.
+ */
+void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+       ldlm_lock_remove_from_lru(lock);
+       if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+               lock->l_readers++;
+               lu_ref_add_atomic(&lock->l_reference, "reader", lock);
+       }
+       if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+               lock->l_writers++;
+               lu_ref_add_atomic(&lock->l_reference, "writer", lock);
+       }
+       LDLM_LOCK_GET(lock);
+       lu_ref_add_atomic(&lock->l_reference, "user", lock);
+       LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
+}
+
+/**
+ * Attempts to add reader/writer reference to a lock with handle \a lockh, and
+ * fails if lock is already LDLM_FL_CBPENDING or destroyed.
+ *
+ * \retval 0 success, lock was addref-ed
+ *
+ * \retval -EAGAIN lock is being canceled.
+ */
+int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock;
+       int            result;
+
+       result = -EAGAIN;
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL) {
+               lock_res_and_lock(lock);
+               if (lock->l_readers != 0 || lock->l_writers != 0 ||
+                   !(lock->l_flags & LDLM_FL_CBPENDING)) {
+                       ldlm_lock_addref_internal_nolock(lock, mode);
+                       result = 0;
+               }
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+       }
+       return result;
+}
+EXPORT_SYMBOL(ldlm_lock_addref_try);
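+
+/*
+ * Illustrative sketch: the _try variant lets a caller back off instead of
+ * pinning a lock that is already being canceled.
+ *
+ *      if (ldlm_lock_addref_try(&lockh, LCK_CR) == 0) {
+ *              ... lock is usable; drop it with ldlm_lock_decref() ...
+ *      } else {
+ *              ... -EAGAIN: the lock is going away, enqueue a fresh one ...
+ *      }
+ */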
+
+/**
+ * Add specified reader/writer reference to LDLM lock \a lock.
+ * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work.
+ * Only called for local locks.
+ */
+void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_addref_internal_nolock(lock, mode);
+       unlock_res_and_lock(lock);
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Assumes LDLM lock is already locked.
+ * Only called in ldlm_flock_destroy and for local locks.
+ * Does NOT add the lock to the LRU if no r/w references are left, so as to
+ * accommodate flock locks that cannot be placed in the LRU.
+ */
+void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode)
+{
+       LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+       if (mode & (LCK_NL | LCK_CR | LCK_PR)) {
+               LASSERT(lock->l_readers > 0);
+               lu_ref_del(&lock->l_reference, "reader", lock);
+               lock->l_readers--;
+       }
+       if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) {
+               LASSERT(lock->l_writers > 0);
+               lu_ref_del(&lock->l_reference, "writer", lock);
+               lock->l_writers--;
+       }
+
+       lu_ref_del(&lock->l_reference, "user", lock);
+       LDLM_LOCK_RELEASE(lock);    /* matches the LDLM_LOCK_GET() in addref */
+}
+
+/**
+ * Removes reader/writer reference for LDLM lock \a lock.
+ * Locks LDLM lock first.
+ * If this is a client-side lock, the r/w refcount drops to zero, and the
+ * lock is not blocked, the lock is added to the namespace's LRU.
+ * For blocked LDLM locks, if the r/w count drops to zero, blocking_ast is
+ * called.
+ */
+void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
+{
+       struct ldlm_namespace *ns;
+       ENTRY;
+
+       lock_res_and_lock(lock);
+
+       ns = ldlm_lock_to_ns(lock);
+
+       ldlm_lock_decref_internal_nolock(lock, mode);
+
+       if (lock->l_flags & LDLM_FL_LOCAL &&
+           !lock->l_readers && !lock->l_writers) {
+               /* If this is a local lock on a server namespace and this was
+                * the last reference, cancel the lock. */
+               CDEBUG(D_INFO, "forcing cancel of local lock\n");
+               lock->l_flags |= LDLM_FL_CBPENDING;
+       }
+
+       if (!lock->l_readers && !lock->l_writers &&
+           (lock->l_flags & LDLM_FL_CBPENDING)) {
+               /* If we received a blocking AST and this was the last
+                * reference, run the callback. */
+               if (lock->l_ns_srv && lock->l_export)
+                       CERROR("FL_CBPENDING set on non-local lock--just a warning\n");
+
+               LDLM_DEBUG(lock, "final decref done on cbpending lock");
+
+               LDLM_LOCK_GET(lock); /* dropped by bl thread */
+               ldlm_lock_remove_from_lru(lock);
+               unlock_res_and_lock(lock);
+
+               if (lock->l_flags & LDLM_FL_FAIL_LOC)
+                       OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+               if ((lock->l_flags & LDLM_FL_ATOMIC_CB) ||
+                   ldlm_bl_to_thread_lock(ns, NULL, lock) != 0)
+                       ldlm_handle_bl_callback(ns, NULL, lock);
+       } else if (ns_is_client(ns) &&
+                  !lock->l_readers && !lock->l_writers &&
+                  !(lock->l_flags & LDLM_FL_NO_LRU) &&
+                  !(lock->l_flags & LDLM_FL_BL_AST)) {
+
+               LDLM_DEBUG(lock, "add lock into lru list");
+
+               /* If this is a client-side namespace and this was the last
+                * reference, put it on the LRU. */
+               ldlm_lock_add_to_lru(lock);
+               unlock_res_and_lock(lock);
+
+               if (lock->l_flags & LDLM_FL_FAIL_LOC)
+                       OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+               /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE
+                * are not supported by the server, otherwise, it is done on
+                * enqueue. */
+               if (!exp_connect_cancelset(lock->l_conn_export) &&
+                   !ns_connect_lru_resize(ns))
+                       ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0);
+       } else {
+               LDLM_DEBUG(lock, "do not add lock into lru list");
+               unlock_res_and_lock(lock);
+       }
+
+       EXIT;
+}
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle \a lockh
+ */
+void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+       LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie);
+       ldlm_lock_decref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref);
+
+/**
+ * Decrease reader/writer refcount for LDLM lock with handle
+ * \a lockh and mark it for subsequent cancellation once r/w refcount
+ * drops to zero instead of putting into LRU.
+ *
+ * Typical usage is for GROUP locks which we cannot allow to be cached.
+ */
+void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
+{
+       struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
+       ENTRY;
+
+       LASSERT(lock != NULL);
+
+       LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
+       lock_res_and_lock(lock);
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       unlock_res_and_lock(lock);
+       ldlm_lock_decref_internal(lock, mode);
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
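+
+/*
+ * Illustrative GROUP-lock teardown (the caller is hypothetical): GROUP
+ * locks must not linger in the LRU, so the final decref also requests
+ * cancellation.
+ *
+ *      ldlm_lock_addref(&lockh, LCK_GROUP);
+ *      ... I/O under the group lock ...
+ *      ldlm_lock_decref_and_cancel(&lockh, LCK_GROUP);
+ */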
+
+struct sl_insert_point {
+       struct list_head *res_link;
+       struct list_head *mode_link;
+       struct list_head *policy_link;
+};
+
+/**
+ * Finds a position to insert the new lock into granted lock list.
+ *
+ * Used for locks eligible for skiplist optimization.
+ *
+ * Parameters:
+ *      queue [input]:  the granted list on which the search acts;
+ *      req [input]:    the lock whose position is to be located;
+ *      prev [output]:  positions within 3 lists to insert \a req into
+ * Return Value:
+ *      filled \a prev
+ * NOTE: called by
+ *  - ldlm_grant_lock_with_skiplist
+ */
+static void search_granted_lock(struct list_head *queue,
+                               struct ldlm_lock *req,
+                               struct sl_insert_point *prev)
+{
+       struct list_head *tmp;
+       struct ldlm_lock *lock, *mode_end, *policy_end;
+       ENTRY;
+
+       list_for_each(tmp, queue) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               mode_end = list_entry(lock->l_sl_mode.prev,
+                                         struct ldlm_lock, l_sl_mode);
+
+               if (lock->l_req_mode != req->l_req_mode) {
+                       /* jump to last lock of mode group */
+                       tmp = &mode_end->l_res_link;
+                       continue;
+               }
+
+               /* suitable mode group is found */
+               if (lock->l_resource->lr_type == LDLM_PLAIN) {
+                       /* insert point is last lock of the mode group */
+                       prev->res_link = &mode_end->l_res_link;
+                       prev->mode_link = &mode_end->l_sl_mode;
+                       prev->policy_link = &req->l_sl_policy;
+                       EXIT;
+                       return;
+               } else if (lock->l_resource->lr_type == LDLM_IBITS) {
+                       for (;;) {
+                               policy_end =
+                                       list_entry(lock->l_sl_policy.prev,
+                                                      struct ldlm_lock,
+                                                      l_sl_policy);
+
+                               if (lock->l_policy_data.l_inodebits.bits ==
+                                   req->l_policy_data.l_inodebits.bits) {
+                                       /* insert point is last lock of
+                                        * the policy group */
+                                       prev->res_link =
+                                               &policy_end->l_res_link;
+                                       prev->mode_link =
+                                               &policy_end->l_sl_mode;
+                                       prev->policy_link =
+                                               &policy_end->l_sl_policy;
+                                       EXIT;
+                                       return;
+                               }
+
+                               if (policy_end == mode_end)
+                                       /* done with mode group */
+                                       break;
+
+                               /* go to next policy group within mode group */
+                               tmp = policy_end->l_res_link.next;
+                               lock = list_entry(tmp, struct ldlm_lock,
+                                                     l_res_link);
+                       }  /* loop over policy groups within the mode group */
+
+                       /* insert point is last lock of the mode group,
+                        * new policy group is started */
+                       prev->res_link = &mode_end->l_res_link;
+                       prev->mode_link = &mode_end->l_sl_mode;
+                       prev->policy_link = &req->l_sl_policy;
+                       EXIT;
+                       return;
+               } else {
+                       LDLM_ERROR(lock, "is not LDLM_PLAIN or LDLM_IBITS lock");
+                       LBUG();
+               }
+       }
+
+       /* insert point is last lock on the queue,
+        * new mode group and new policy group are started */
+       prev->res_link = queue->prev;
+       prev->mode_link = &req->l_sl_mode;
+       prev->policy_link = &req->l_sl_policy;
+       EXIT;
+       return;
+}
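+
+/*
+ * Granted-list layout assumed by search_granted_lock(), as a rough sketch:
+ * locks are grouped by mode and, for IBITS locks, by policy within each
+ * mode group, e.g.
+ *
+ *      [PR,bits=A] [PR,bits=A] [PR,bits=B] [PW,bits=C] ...
+ *      '---policy group---'    '-policy-'  '--next mode group--> ...
+ *
+ * l_sl_mode chains the locks of a mode group so the group tail is reachable
+ * from its head; l_sl_policy does the same within a policy group.
+ */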
+
+/**
+ * Add a lock into resource granted list after a position described by
+ * \a prev.
+ */
+static void ldlm_granted_list_add_lock(struct ldlm_lock *lock,
+                                      struct sl_insert_point *prev)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       ENTRY;
+
+       check_res_locked(res);
+
+       ldlm_resource_dump(D_INFO, res);
+       LDLM_DEBUG(lock, "About to add lock:");
+
+       if (lock->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               return;
+       }
+
+       LASSERT(list_empty(&lock->l_res_link));
+       LASSERT(list_empty(&lock->l_sl_mode));
+       LASSERT(list_empty(&lock->l_sl_policy));
+
+       /*
+        * lock->link == prev->link means the lock is the first one in its
+        * group; don't link it to itself, to suppress kernel warnings.
+        */
+       if (&lock->l_res_link != prev->res_link)
+               list_add(&lock->l_res_link, prev->res_link);
+       if (&lock->l_sl_mode != prev->mode_link)
+               list_add(&lock->l_sl_mode, prev->mode_link);
+       if (&lock->l_sl_policy != prev->policy_link)
+               list_add(&lock->l_sl_policy, prev->policy_link);
+
+       EXIT;
+}
+
+/**
+ * Add a lock to granted list on a resource maintaining skiplist
+ * correctness.
+ */
+static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock)
+{
+       struct sl_insert_point prev;
+       ENTRY;
+
+       LASSERT(lock->l_req_mode == lock->l_granted_mode);
+
+       search_granted_lock(&lock->l_resource->lr_granted, lock, &prev);
+       ldlm_granted_list_add_lock(lock, &prev);
+       EXIT;
+}
+
+/**
+ * Perform lock granting bookkeeping.
+ *
+ * Includes putting the lock into granted list and updating lock mode.
+ * NOTE: called by
+ *  - ldlm_lock_enqueue
+ *  - ldlm_reprocess_queue
+ *  - ldlm_lock_convert
+ *
+ * must be called with lr_lock held
+ */
+void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
+{
+       struct ldlm_resource *res = lock->l_resource;
+       ENTRY;
+
+       check_res_locked(res);
+
+       lock->l_granted_mode = lock->l_req_mode;
+       if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
+               ldlm_grant_lock_with_skiplist(lock);
+       else if (res->lr_type == LDLM_EXTENT)
+               ldlm_extent_add_lock(res, lock);
+       else
+               ldlm_resource_add_lock(res, &res->lr_granted, lock);
+
+       if (lock->l_granted_mode < res->lr_most_restr)
+               res->lr_most_restr = lock->l_granted_mode;
+
+       if (work_list && lock->l_completion_ast != NULL)
+               ldlm_add_ast_work_item(lock, NULL, work_list);
+
+       ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock);
+       EXIT;
+}
+
+/**
+ * Search for a lock with given properties in a queue.
+ *
+ * \retval a referenced lock or NULL.  See the flag descriptions below, in the
+ * comment above ldlm_lock_match
+ */
+static struct ldlm_lock *search_queue(struct list_head *queue,
+                                     ldlm_mode_t *mode,
+                                     ldlm_policy_data_t *policy,
+                                     struct ldlm_lock *old_lock,
+                                     __u64 flags, int unref)
+{
+       struct ldlm_lock *lock;
+       struct list_head       *tmp;
+
+       list_for_each(tmp, queue) {
+               ldlm_mode_t match;
+
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (lock == old_lock)
+                       break;
+
+               /* llite sometimes wants to match locks that will be
+                * canceled when their users drop, but we allow it to match
+                * if it passes in CBPENDING and the lock still has users.
+                * this is generally only going to be used by children
+                * whose parents already hold a lock so forward progress
+                * can still happen. */
+               if (lock->l_flags & LDLM_FL_CBPENDING &&
+                   !(flags & LDLM_FL_CBPENDING))
+                       continue;
+               if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
+                   lock->l_readers == 0 && lock->l_writers == 0)
+                       continue;
+
+               if (!(lock->l_req_mode & *mode))
+                       continue;
+               match = lock->l_req_mode;
+
+               if (lock->l_resource->lr_type == LDLM_EXTENT &&
+                   (lock->l_policy_data.l_extent.start >
+                    policy->l_extent.start ||
+                    lock->l_policy_data.l_extent.end < policy->l_extent.end))
+                       continue;
+
+               if (unlikely(match == LCK_GROUP) &&
+                   lock->l_resource->lr_type == LDLM_EXTENT &&
+                   lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
+                       continue;
+
+               /* We match if we have an existing lock with the same or a
+                  wider set of bits. */
+               if (lock->l_resource->lr_type == LDLM_IBITS &&
+                    ((lock->l_policy_data.l_inodebits.bits &
+                     policy->l_inodebits.bits) !=
+                     policy->l_inodebits.bits))
+                       continue;
+
+               if (!unref &&
+                   (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+                    lock->l_failed))
+                       continue;
+
+               if ((flags & LDLM_FL_LOCAL_ONLY) &&
+                   !(lock->l_flags & LDLM_FL_LOCAL))
+                       continue;
+
+               if (flags & LDLM_FL_TEST_LOCK) {
+                       LDLM_LOCK_GET(lock);
+                       ldlm_lock_touch_in_lru(lock);
+               } else {
+                       ldlm_lock_addref_internal_nolock(lock, match);
+               }
+               *mode = match;
+               return lock;
+       }
+
+       return NULL;
+}
+
+void ldlm_lock_fail_match_locked(struct ldlm_lock *lock)
+{
+       if (!lock->l_failed) {
+               lock->l_failed = 1;
+               wake_up_all(&lock->l_waitq);
+       }
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match_locked);
+
+void ldlm_lock_fail_match(struct ldlm_lock *lock)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_fail_match_locked(lock);
+       unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_fail_match);
+
+/**
+ * Mark lock as "matchable" by OST.
+ *
+ * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB
+ * is not yet valid.
+ * Assumes LDLM lock is already locked.
+ */
+void ldlm_lock_allow_match_locked(struct ldlm_lock *lock)
+{
+       lock->l_flags |= LDLM_FL_LVB_READY;
+       wake_up_all(&lock->l_waitq);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match_locked);
+
+/**
+ * Mark lock as "matchable" by OST.
+ * Locks the lock and then \see ldlm_lock_allow_match_locked
+ */
+void ldlm_lock_allow_match(struct ldlm_lock *lock)
+{
+       lock_res_and_lock(lock);
+       ldlm_lock_allow_match_locked(lock);
+       unlock_res_and_lock(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_allow_match);
+
+/**
+ * Attempt to find a lock with specified properties.
+ *
+ * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is
+ * set in \a flags
+ *
+ * Can be called in two ways:
+ *
+ * If 'ns' is NULL, then lockh describes an existing lock that we want to look
+ * for a duplicate of.
+ *
+ * Otherwise, all of the fields must be filled in, to match against.
+ *
+ * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the
+ *     server (i.e., connh is NULL)
+ * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
+ *     list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ *     to be canceled can still be matched as long as they still have reader
+ *     or writer references
+ * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock,
+ *     just tell us if we would have matched.
+ *
+ * \retval 1 if it finds an already-existing lock that is compatible; in this
+ * case, lockh is filled in with an addref()ed lock
+ *
+ * We also check the security context; if that fails we simply return 0 (to
+ * keep caller code unchanged), and the context failure will be discovered
+ * by the caller sometime later.
+ */
+ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags,
+                           const struct ldlm_res_id *res_id, ldlm_type_t type,
+                           ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                           struct lustre_handle *lockh, int unref)
+{
+       struct ldlm_resource *res;
+       struct ldlm_lock *lock, *old_lock = NULL;
+       int rc = 0;
+       ENTRY;
+
+       if (ns == NULL) {
+               old_lock = ldlm_handle2lock(lockh);
+               LASSERT(old_lock);
+
+               ns = ldlm_lock_to_ns(old_lock);
+               res_id = &old_lock->l_resource->lr_name;
+               type = old_lock->l_resource->lr_type;
+               mode = old_lock->l_req_mode;
+       }
+
+       res = ldlm_resource_get(ns, NULL, res_id, type, 0);
+       if (res == NULL) {
+               LASSERT(old_lock == NULL);
+               RETURN(0);
+       }
+
+       LDLM_RESOURCE_ADDREF(res);
+       lock_res(res);
+
+       lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+       if (flags & LDLM_FL_BLOCK_GRANTED)
+               GOTO(out, rc = 0);
+       lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+       lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+                           flags, unref);
+       if (lock != NULL)
+               GOTO(out, rc = 1);
+
+       EXIT;
+ out:
+       unlock_res(res);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+
+       if (lock) {
+               ldlm_lock2handle(lock, lockh);
+               if ((flags & LDLM_FL_LVB_READY) &&
+                   (!(lock->l_flags & LDLM_FL_LVB_READY))) {
+                       struct l_wait_info lwi;
+                       if (lock->l_completion_ast) {
+                               int err = lock->l_completion_ast(lock,
+                                                         LDLM_FL_WAIT_NOREPROC,
+                                                                NULL);
+                               if (err) {
+                                       if (flags & LDLM_FL_TEST_LOCK)
+                                               LDLM_LOCK_RELEASE(lock);
+                                       else
+                                               ldlm_lock_decref_internal(lock,
+                                                                         mode);
+                                       rc = 0;
+                                       goto out2;
+                               }
+                       }
+
+                       lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
+                                              NULL, LWI_ON_SIGNAL_NOOP, NULL);
+
+                       /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */
+                       l_wait_event(lock->l_waitq,
+                                    lock->l_flags & LDLM_FL_LVB_READY ||
+                                    lock->l_destroyed || lock->l_failed,
+                                    &lwi);
+                       if (!(lock->l_flags & LDLM_FL_LVB_READY)) {
+                               if (flags & LDLM_FL_TEST_LOCK)
+                                       LDLM_LOCK_RELEASE(lock);
+                               else
+                                       ldlm_lock_decref_internal(lock, mode);
+                               rc = 0;
+                       }
+               }
+       }
+ out2:
+       if (rc) {
+               LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")",
+                          (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                               res_id->name[2] : policy->l_extent.start,
+                          (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                               res_id->name[3] : policy->l_extent.end);
+
+               /* check user's security context */
+               if (lock->l_conn_export &&
+                   sptlrpc_import_check_ctx(
+                               class_exp2cliimp(lock->l_conn_export))) {
+                       if (!(flags & LDLM_FL_TEST_LOCK))
+                               ldlm_lock_decref_internal(lock, mode);
+                       rc = 0;
+               }
+
+               if (flags & LDLM_FL_TEST_LOCK)
+                       LDLM_LOCK_RELEASE(lock);
+
+       } else if (!(flags & LDLM_FL_TEST_LOCK)) { /* less verbose for test-only */
+               LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
+                                 LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
+                                 type, mode, res_id->name[0], res_id->name[1],
+                                 (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                                       res_id->name[2] : policy->l_extent.start,
+                                 (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                                       res_id->name[3] : policy->l_extent.end);
+       }
+       if (old_lock)
+               LDLM_LOCK_PUT(old_lock);
+
+       return rc ? mode : 0;
+}
+EXPORT_SYMBOL(ldlm_lock_match);
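+
+/*
+ * Hedged usage sketch (all names are hypothetical): probing for a
+ * compatible cached lock before enqueueing a new one.
+ *
+ *      ldlm_mode_t mode;
+ *
+ *      mode = ldlm_lock_match(ns, LDLM_FL_LVB_READY, &res_id, LDLM_EXTENT,
+ *                             &policy, LCK_PR | LCK_PW, &lockh, 0);
+ *      if (mode != 0) {
+ *              ... matched; lockh holds an addref'ed lock ...
+ *              ldlm_lock_decref(&lockh, mode);
+ *      }
+ */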
+
+ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh,
+                                       __u64 *bits)
+{
+       struct ldlm_lock *lock;
+       ldlm_mode_t mode = 0;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL) {
+               lock_res_and_lock(lock);
+               if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED ||
+                   lock->l_failed)
+                       GOTO(out, mode);
+
+               if (lock->l_flags & LDLM_FL_CBPENDING &&
+                   lock->l_readers == 0 && lock->l_writers == 0)
+                       GOTO(out, mode);
+
+               if (bits)
+                       *bits = lock->l_policy_data.l_inodebits.bits;
+               mode = lock->l_granted_mode;
+               ldlm_lock_addref_internal_nolock(lock, mode);
+       }
+
+       EXIT;
+
+out:
+       if (lock != NULL) {
+               unlock_res_and_lock(lock);
+               LDLM_LOCK_PUT(lock);
+       }
+       return mode;
+}
+EXPORT_SYMBOL(ldlm_revalidate_lock_handle);
+
+/** The caller must guarantee that the buffer is large enough. */
+int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
+                 enum req_location loc, void *data, int size)
+{
+       void *lvb;
+       ENTRY;
+
+       LASSERT(data != NULL);
+       LASSERT(size >= 0);
+
+       switch (lock->l_lvb_type) {
+       case LVB_T_OST:
+               if (size == sizeof(struct ost_lvb)) {
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb);
+                       else
+                               lvb = req_capsule_server_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+               } else if (size == sizeof(struct ost_lvb_v1)) {
+                       struct ost_lvb *olvb = data;
+
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_ost_lvb_v1);
+                       else
+                               lvb = req_capsule_server_sized_swab_get(pill,
+                                               &RMF_DLM_LVB, size,
+                                               lustre_swab_ost_lvb_v1);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+                       olvb->lvb_mtime_ns = 0;
+                       olvb->lvb_atime_ns = 0;
+                       olvb->lvb_ctime_ns = 0;
+               } else {
+                       LDLM_ERROR(lock, "Replied unexpected ost LVB size %d",
+                                  size);
+                       RETURN(-EINVAL);
+               }
+               break;
+       case LVB_T_LQUOTA:
+               if (size == sizeof(struct lquota_lvb)) {
+                       if (loc == RCL_CLIENT)
+                               lvb = req_capsule_client_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_lquota_lvb);
+                       else
+                               lvb = req_capsule_server_swab_get(pill,
+                                               &RMF_DLM_LVB,
+                                               lustre_swab_lquota_lvb);
+                       if (unlikely(lvb == NULL)) {
+                               LDLM_ERROR(lock, "no LVB");
+                               RETURN(-EPROTO);
+                       }
+
+                       memcpy(data, lvb, size);
+               } else {
+                       LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d",
+                                  size);
+                       RETURN(-EINVAL);
+               }
+               break;
+       case LVB_T_LAYOUT:
+               if (size == 0)
+                       break;
+
+               if (loc == RCL_CLIENT)
+                       lvb = req_capsule_client_get(pill, &RMF_DLM_LVB);
+               else
+                       lvb = req_capsule_server_get(pill, &RMF_DLM_LVB);
+               if (unlikely(lvb == NULL)) {
+                       LDLM_ERROR(lock, "no LVB");
+                       RETURN(-EPROTO);
+               }
+
+               memcpy(data, lvb, size);
+               break;
+       default:
+               LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
+               libcfs_debug_dumpstack(NULL);
+               RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Create and fill in a new LDLM lock with the specified properties.
+ * Returns a referenced lock.
+ */
+struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
+                                  const struct ldlm_res_id *res_id,
+                                  ldlm_type_t type,
+                                  ldlm_mode_t mode,
+                                  const struct ldlm_callback_suite *cbs,
+                                  void *data, __u32 lvb_len,
+                                  enum lvb_type lvb_type)
+{
+       struct ldlm_lock *lock;
+       struct ldlm_resource *res;
+       ENTRY;
+
+       res = ldlm_resource_get(ns, NULL, res_id, type, 1);
+       if (res == NULL)
+               RETURN(NULL);
+
+       lock = ldlm_lock_new(res);
+
+       if (lock == NULL)
+               RETURN(NULL);
+
+       lock->l_req_mode = mode;
+       lock->l_ast_data = data;
+       lock->l_pid = current_pid();
+       lock->l_ns_srv = !!ns_is_server(ns);
+       if (cbs) {
+               lock->l_blocking_ast = cbs->lcs_blocking;
+               lock->l_completion_ast = cbs->lcs_completion;
+               lock->l_glimpse_ast = cbs->lcs_glimpse;
+               lock->l_weigh_ast = cbs->lcs_weigh;
+       }
+
+       lock->l_tree_node = NULL;
+       /* if this is an extent lock, allocate the interval tree node */
+       if (type == LDLM_EXTENT) {
+               if (ldlm_interval_alloc(lock) == NULL)
+                       GOTO(out, 0);
+       }
+
+       if (lvb_len) {
+               lock->l_lvb_len = lvb_len;
+               OBD_ALLOC(lock->l_lvb_data, lvb_len);
+               if (lock->l_lvb_data == NULL)
+                       GOTO(out, 0);
+       }
+
+       lock->l_lvb_type = lvb_type;
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK))
+               GOTO(out, 0);
+
+       RETURN(lock);
+
+out:
+       ldlm_lock_destroy(lock);
+       LDLM_LOCK_RELEASE(lock);
+       return NULL;
+}
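+
+/*
+ * Illustrative client-side flow (arguments are hypothetical): a lock is
+ * created against a resource and then fed to ldlm_lock_enqueue() below,
+ * which grants it or parks it on a waiting list according to *flags.
+ *
+ *      struct ldlm_lock *lock;
+ *      __u64 flags = 0;
+ *
+ *      lock = ldlm_lock_create(ns, &res_id, LDLM_PLAIN, LCK_PR, &cbs,
+ *                              NULL, 0, LVB_T_NONE);
+ *      if (lock != NULL)
+ *              rc = ldlm_lock_enqueue(ns, &lock, NULL, &flags);
+ */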
+
+/**
+ * Enqueue (request) a lock.
+ *
+ * Does not block. As a result of the enqueue the lock will be put into the
+ * granted or waiting list.
+ *
+ * If the namespace has an intent policy set and the lock has the
+ * LDLM_FL_HAS_INTENT flag set, skip all the enqueueing and delegate lock
+ * processing to the intent policy function.
+ */
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+                              struct ldlm_lock **lockp,
+                              void *cookie, __u64 *flags)
+{
+       struct ldlm_lock *lock = *lockp;
+       struct ldlm_resource *res = lock->l_resource;
+       int local = ns_is_client(ldlm_res_to_ns(res));
+       ldlm_error_t rc = ELDLM_OK;
+       struct ldlm_interval *node = NULL;
+       ENTRY;
+
+       lock->l_last_activity = cfs_time_current_sec();
+       /* policies are not executed on the client or during replay */
+       if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT
+           && !local && ns->ns_policy) {
+               rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags,
+                                  NULL);
+               if (rc == ELDLM_LOCK_REPLACED) {
+                       /* The lock that was returned has already been granted,
+                        * and placed into lockp.  If it's not the same as the
+                        * one we passed in, then destroy the old one and our
+                        * work here is done. */
+                       if (lock != *lockp) {
+                               ldlm_lock_destroy(lock);
+                               LDLM_LOCK_RELEASE(lock);
+                       }
+                       *flags |= LDLM_FL_LOCK_CHANGED;
+                       RETURN(0);
+               } else if (rc != ELDLM_OK || (*flags & LDLM_FL_INTENT_ONLY)) {
+                       ldlm_lock_destroy(lock);
+                       RETURN(rc);
+               }
+       }
+
+       /* For a replaying lock, it might already be in the granted list, so
+        * unlinking the lock would cause the interval node to be freed.  We
+        * have to allocate the interval node early, otherwise we can't
+        * regrant this lock in the future. - jay */
+       if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+               OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+
+       lock_res_and_lock(lock);
+       if (local && lock->l_req_mode == lock->l_granted_mode) {
+               /* The server returned a blocked lock, but it was granted
+                * before we got a chance to actually enqueue it.  We don't
+                * need to do anything else. */
+               *flags &= ~(LDLM_FL_BLOCK_GRANTED |
+                           LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
+               GOTO(out, ELDLM_OK);
+       }
+
+       ldlm_resource_unlink_lock(lock);
+       if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+               if (node == NULL) {
+                       ldlm_lock_destroy_nolock(lock);
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               INIT_LIST_HEAD(&node->li_group);
+               ldlm_interval_attach(node, lock);
+               node = NULL;
+       }
+
+       /* Some flags from the enqueue want to make it into the AST, via the
+        * lock's l_flags. */
+       lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA;
+
+       /* This distinction between local lock trees is very important; a client
+        * namespace only has information about locks taken by that client, and
+        * thus doesn't have enough information to decide for itself whether
+        * the lock can be granted (below).  In this case, we do exactly what
+        * the server tells us to do, as dictated by the 'flags'.
+        *
+        * We do exactly the same thing during recovery, when the server is
+        * more or less trusting the clients not to lie.
+        *
+        * FIXME (bug 268): Detect obvious lies by checking compatibility in
+        * granted/converting queues. */
+       if (local) {
+               if (*flags & LDLM_FL_BLOCK_CONV)
+                       ldlm_resource_add_lock(res, &res->lr_converting, lock);
+               else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED))
+                       ldlm_resource_add_lock(res, &res->lr_waiting, lock);
+               else
+                       ldlm_grant_lock(lock, NULL);
+               GOTO(out, ELDLM_OK);
+       } else {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+
+out:
+       unlock_res_and_lock(lock);
+       if (node)
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       return rc;
+}
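+
+/*
+ * Illustrative sketch, not part of the original code: the shape of an
+ * ns_policy callback as suggested by the call above. The name
+ * example_intent_policy and the matched_lock variable are hypothetical;
+ * only the return-value contract is taken from the code:
+ *
+ *     static int example_intent_policy(struct ldlm_namespace *ns,
+ *                                      struct ldlm_lock **lockp,
+ *                                      void *req_cookie, ldlm_mode_t mode,
+ *                                      __u64 flags, void *data)
+ *     {
+ *             ... intent handling finds an already-granted lock ...
+ *             *lockp = matched_lock;
+ *             return ELDLM_LOCK_REPLACED;
+ *     }
+ *
+ * On ELDLM_LOCK_REPLACED the code above destroys the original lock and
+ * reports LDLM_FL_LOCK_CHANGED; ELDLM_OK with LDLM_FL_INTENT_ONLY in the
+ * caller's flags likewise ends the enqueue early.
+ */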
+
+/**
+ * Process a call to blocking AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg *arg = opaq;
+       struct ldlm_lock_desc   d;
+       int                  rc;
+       struct ldlm_lock       *lock;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast);
+
+       /* nobody should touch l_bl_ast */
+       lock_res_and_lock(lock);
+       list_del_init(&lock->l_bl_ast);
+
+       LASSERT(lock->l_flags & LDLM_FL_AST_SENT);
+       LASSERT(lock->l_bl_ast_run == 0);
+       LASSERT(lock->l_blocking_lock);
+       lock->l_bl_ast_run++;
+       unlock_res_and_lock(lock);
+
+       ldlm_lock2desc(lock->l_blocking_lock, &d);
+
+       rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING);
+       LDLM_LOCK_RELEASE(lock->l_blocking_lock);
+       lock->l_blocking_lock = NULL;
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to completion AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg  *arg = opaq;
+       int                   rc = 0;
+       struct ldlm_lock        *lock;
+       ldlm_completion_callback completion_callback;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast);
+
+       /* It's possible to receive a completion AST before we've set
+        * the l_completion_ast pointer: either because the AST arrived
+        * before the reply, or simply because there's a small race
+        * window between receiving the reply and finishing the local
+        * enqueue. (bug 842)
+        *
+        * This can't happen with the blocking_ast, however, because we
+        * will never call the local blocking_ast until we drop our
+        * reader/writer reference, which we won't do until we get the
+        * reply and finish enqueueing. */
+
+       /* nobody should touch l_cp_ast */
+       lock_res_and_lock(lock);
+       list_del_init(&lock->l_cp_ast);
+       LASSERT(lock->l_flags & LDLM_FL_CP_REQD);
+       /* save l_completion_ast since it can be changed by
+        * mds_intent_policy(), see bug 14225 */
+       completion_callback = lock->l_completion_ast;
+       lock->l_flags &= ~LDLM_FL_CP_REQD;
+       unlock_res_and_lock(lock);
+
+       if (completion_callback != NULL)
+               rc = completion_callback(lock, 0, (void *)arg);
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to revocation AST callback for a lock in ast_work list
+ */
+static int
+ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg *arg = opaq;
+       struct ldlm_lock_desc   desc;
+       int                  rc;
+       struct ldlm_lock       *lock;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast);
+       list_del_init(&lock->l_rk_ast);
+
+       /* the desc just pretends the lock is exclusive */
+       ldlm_lock2desc(lock, &desc);
+       desc.l_req_mode = LCK_EX;
+       desc.l_granted_mode = 0;
+
+       rc = lock->l_blocking_ast(lock, &desc, (void *)arg, LDLM_CB_BLOCKING);
+       LDLM_LOCK_RELEASE(lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Process a call to glimpse AST callback for a lock in ast_work list
+ */
+int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq)
+{
+       struct ldlm_cb_set_arg          *arg = opaq;
+       struct ldlm_glimpse_work        *gl_work;
+       struct ldlm_lock                *lock;
+       int                              rc = 0;
+       ENTRY;
+
+       if (list_empty(arg->list))
+               RETURN(-ENOENT);
+
+       gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work,
+                                gl_list);
+       list_del_init(&gl_work->gl_list);
+
+       lock = gl_work->gl_lock;
+
+       /* transfer the glimpse descriptor to ldlm_cb_set_arg */
+       arg->gl_desc = gl_work->gl_desc;
+
+       /* invoke the actual glimpse callback */
+       if (lock->l_glimpse_ast(lock, (void *)arg) == 0)
+               rc = 1;
+
+       LDLM_LOCK_RELEASE(lock);
+
+       if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0)
+               OBD_FREE_PTR(gl_work);
+
+       RETURN(rc);
+}
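+
+/*
+ * Ownership note, added for clarity: gl_work items are normally
+ * heap-allocated by the caller and freed here once the glimpse AST has
+ * run; a caller that embeds the work item in a longer-lived structure
+ * sets LDLM_GL_WORK_NOFREE to skip the OBD_FREE_PTR() above.
+ */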
+
+/**
+ * Process list of locks in need of ASTs being sent.
+ *
+ * Used on server to send multiple ASTs together instead of sending one by
+ * one.
+ */
+int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list,
+                     ldlm_desc_ast_t ast_type)
+{
+       struct ldlm_cb_set_arg *arg;
+       set_producer_func       work_ast_lock;
+       int                  rc;
+
+       if (list_empty(rpc_list))
+               RETURN(0);
+
+       OBD_ALLOC_PTR(arg);
+       if (arg == NULL)
+               RETURN(-ENOMEM);
+
+       atomic_set(&arg->restart, 0);
+       arg->list = rpc_list;
+
+       switch (ast_type) {
+       case LDLM_WORK_BL_AST:
+               arg->type = LDLM_BL_CALLBACK;
+               work_ast_lock = ldlm_work_bl_ast_lock;
+               break;
+       case LDLM_WORK_CP_AST:
+               arg->type = LDLM_CP_CALLBACK;
+               work_ast_lock = ldlm_work_cp_ast_lock;
+               break;
+       case LDLM_WORK_REVOKE_AST:
+               arg->type = LDLM_BL_CALLBACK;
+               work_ast_lock = ldlm_work_revoke_ast_lock;
+               break;
+       case LDLM_WORK_GL_AST:
+               arg->type = LDLM_GL_CALLBACK;
+               work_ast_lock = ldlm_work_gl_ast_lock;
+               break;
+       default:
+               LBUG();
+       }
+
+       /* We create a ptlrpc request set with flow control extension.
+        * This request set will use the work_ast_lock function to produce new
+        * requests and will send a new request each time one completes in
+        * order to cap the number of requests in flight at
+        * ns_max_parallel_ast. */
+       arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX,
+                                    work_ast_lock, arg);
+       if (arg->set == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       ptlrpc_set_wait(arg->set);
+       ptlrpc_set_destroy(arg->set);
+
+       rc = atomic_read(&arg->restart) ? -ERESTART : 0;
+       GOTO(out, rc);
+out:
+       OBD_FREE_PTR(arg);
+       return rc;
+}
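+
+/*
+ * A minimal usage sketch, mirroring how ldlm_handle_cp_callback() later
+ * in this patch drives this function; the list must be threaded through
+ * the l_cp_ast linkage that ldlm_work_cp_ast_lock() consumes:
+ *
+ *     LIST_HEAD(ast_list);
+ *
+ *     ldlm_grant_lock(lock, &ast_list);
+ *     rc = ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+ *     if (rc == -ERESTART)
+ *             ... at least one AST must be resent, per arg->restart ...
+ */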
+
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{
+       ldlm_reprocess_all(res);
+       return LDLM_ITER_CONTINUE;
+}
+
+static int ldlm_reprocess_res(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                             struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       int    rc;
+
+       rc = reprocess_one_queue(res, arg);
+
+       return rc == LDLM_ITER_STOP;
+}
+
+/**
+ * Iterate through all resources on a namespace attempting to grant waiting
+ * locks.
+ */
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+       ENTRY;
+
+       if (ns != NULL) {
+               cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                        ldlm_reprocess_res, NULL);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
+
+/**
+ * Try to grant all waiting locks on a resource.
+ *
+ * On a server this calls ldlm_reprocess_queue on the converting and
+ * waiting queues; in this client-side-only module the function is
+ * effectively a no-op and only asserts that it runs in a client
+ * namespace.
+ *
+ * Typically called after some resource locks are cancelled to see
+ * if anything could be granted as a result of the cancellation.
+ */
+void ldlm_reprocess_all(struct ldlm_resource *res)
+{
+       LIST_HEAD(rpc_list);
+
+       ENTRY;
+       if (!ns_is_client(ldlm_res_to_ns(res))) {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+       EXIT;
+}
+
+/**
+ * Helper function to call blocking AST for LDLM lock \a lock in a
+ * "cancelling" mode.
+ */
+void ldlm_cancel_callback(struct ldlm_lock *lock)
+{
+       check_res_locked(lock->l_resource);
+       if (!(lock->l_flags & LDLM_FL_CANCEL)) {
+               lock->l_flags |= LDLM_FL_CANCEL;
+               if (lock->l_blocking_ast) {
+                       unlock_res_and_lock(lock);
+                       lock->l_blocking_ast(lock, NULL, lock->l_ast_data,
+                                            LDLM_CB_CANCELING);
+                       lock_res_and_lock(lock);
+               } else {
+                       LDLM_DEBUG(lock, "no blocking ast");
+               }
+       }
+       lock->l_flags |= LDLM_FL_BL_DONE;
+}
+
+/**
+ * Remove skiplist-enabled LDLM lock \a req from granted list
+ */
+void ldlm_unlink_lock_skiplist(struct ldlm_lock *req)
+{
+       if (req->l_resource->lr_type != LDLM_PLAIN &&
+           req->l_resource->lr_type != LDLM_IBITS)
+               return;
+
+       list_del_init(&req->l_sl_policy);
+       list_del_init(&req->l_sl_mode);
+}
+
+/**
+ * Attempts to cancel LDLM lock \a lock that has no reader/writer references.
+ */
+void ldlm_lock_cancel(struct ldlm_lock *lock)
+{
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns;
+       ENTRY;
+
+       lock_res_and_lock(lock);
+
+       res = lock->l_resource;
+       ns  = ldlm_res_to_ns(res);
+
+       /* Please do not, no matter how tempting, remove this LBUG without
+        * talking to me first. -phik */
+       if (lock->l_readers || lock->l_writers) {
+               LDLM_ERROR(lock, "lock still has references");
+               LBUG();
+       }
+
+       if (lock->l_waited)
+               ldlm_del_waiting_lock(lock);
+
+       /* Releases cancel callback. */
+       ldlm_cancel_callback(lock);
+
+       /* Yes, second time, just in case it was added again while we were
+        * running with no res lock in ldlm_cancel_callback */
+       if (lock->l_waited)
+               ldlm_del_waiting_lock(lock);
+
+       ldlm_resource_unlink_lock(lock);
+       ldlm_lock_destroy_nolock(lock);
+
+       if (lock->l_granted_mode == lock->l_req_mode)
+               ldlm_pool_del(&ns->ns_pool, lock);
+
+       /* Make sure we will not be called again for same lock what is possible
+        * if not to zero out lock->l_granted_mode */
+       lock->l_granted_mode = LCK_MINMODE;
+       unlock_res_and_lock(lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_cancel);
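+
+/*
+ * Expected calling pattern, as used by ldlm_cancel_locks_for_export_cb()
+ * below (sketch only): take private references, cancel, then reprocess
+ * the resource so that other waiters can be granted:
+ *
+ *     res = ldlm_resource_getref(lock->l_resource);
+ *     LDLM_LOCK_GET(lock);
+ *     ldlm_lock_cancel(lock);
+ *     ldlm_reprocess_all(res);
+ *     ldlm_resource_putref(res);
+ *     LDLM_LOCK_RELEASE(lock);
+ */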
+
+/**
+ * Set opaque data into the lock that only makes sense to upper layer.
+ */
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+       int rc = -EINVAL;
+       ENTRY;
+
+       if (lock) {
+               if (lock->l_ast_data == NULL)
+                       lock->l_ast_data = data;
+               if (lock->l_ast_data == data)
+                       rc = 0;
+               LDLM_LOCK_PUT(lock);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_lock_set_data);
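+
+/*
+ * Usage sketch: the first caller wins, a repeated call with the same
+ * pointer still succeeds, and a conflicting pointer gets -EINVAL.
+ * Here my_object stands for a hypothetical upper-layer cookie, not a
+ * name from this file:
+ *
+ *     if (ldlm_lock_set_data(&lockh, my_object) != 0)
+ *             ... somebody else already owns l_ast_data ...
+ */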
+
+struct export_cl_data {
+       struct obd_export       *ecl_exp;
+       int                     ecl_loop;
+};
+
+/**
+ * Iterator function for ldlm_cancel_locks_for_export.
+ * Cancels passed locks.
+ */
+int ldlm_cancel_locks_for_export_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                   struct hlist_node *hnode, void *data)
+
+{
+       struct export_cl_data   *ecl = (struct export_cl_data *)data;
+       struct obd_export       *exp  = ecl->ecl_exp;
+       struct ldlm_lock     *lock = cfs_hash_object(hs, hnode);
+       struct ldlm_resource *res;
+
+       res = ldlm_resource_getref(lock->l_resource);
+       LDLM_LOCK_GET(lock);
+
+       LDLM_DEBUG(lock, "export %p", exp);
+       ldlm_res_lvbo_update(res, NULL, 1);
+       ldlm_lock_cancel(lock);
+       ldlm_reprocess_all(res);
+       ldlm_resource_putref(res);
+       LDLM_LOCK_RELEASE(lock);
+
+       ecl->ecl_loop++;
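+       /* (x & -x) == x holds only for powers of two, so the CDEBUG below
+        * fires on loop 1, 2, 4, 8, ... rather than on every cancel. */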
+       if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) {
+               CDEBUG(D_INFO,
+                      "Cancel lock %p for export %p (loop %d), still have "
+                      "%d locks left on hash table.\n",
+                      lock, exp, ecl->ecl_loop,
+                      atomic_read(&hs->hs_count));
+       }
+
+       return 0;
+}
+
+/**
+ * Cancel all locks for given export.
+ *
+ * Typically called on client disconnection/eviction
+ */
+void ldlm_cancel_locks_for_export(struct obd_export *exp)
+{
+       struct export_cl_data   ecl = {
+               .ecl_exp        = exp,
+               .ecl_loop       = 0,
+       };
+
+       cfs_hash_for_each_empty(exp->exp_lock_hash,
+                               ldlm_cancel_locks_for_export_cb, &ecl);
+}
+
+/**
+ * Downgrade an exclusive lock.
+ *
+ * A fast variant of ldlm_lock_convert for conversion of exclusive
+ * locks. The conversion always succeeds.
+ * Used by Commit on Sharing (COS) code.
+ *
+ * \param lock A lock to convert
+ * \param new_mode new lock mode
+ */
+void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode)
+{
+       ENTRY;
+
+       LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX));
+       LASSERT(new_mode == LCK_COS);
+
+       lock_res_and_lock(lock);
+       ldlm_resource_unlink_lock(lock);
+       /*
+        * Remove the lock from pool as it will be added again in
+        * ldlm_grant_lock() called below.
+        */
+       ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock);
+
+       lock->l_req_mode = new_mode;
+       ldlm_grant_lock(lock, NULL);
+       unlock_res_and_lock(lock);
+       ldlm_reprocess_all(lock->l_resource);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_lock_downgrade);
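+
+/*
+ * Sketch of the Commit-on-Sharing pattern this helper serves, under the
+ * preconditions asserted above (lock granted in PW or EX mode, target
+ * mode LCK_COS):
+ *
+ *     ldlm_lock_downgrade(lock, LCK_COS);
+ *
+ * There is no return value to check because the conversion cannot fail.
+ */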
+
+/**
+ * Attempt to convert already granted lock to a different mode.
+ *
+ * While lock conversion is not currently used, future client-side
+ * optimizations could take advantage of it to avoid discarding cached
+ * pages on a file.
+ */
+struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
+                                       __u32 *flags)
+{
+       LIST_HEAD(rpc_list);
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns;
+       int granted = 0;
+       struct ldlm_interval *node;
+       ENTRY;
+
+       /* Just return if mode is unchanged. */
+       if (new_mode == lock->l_granted_mode) {
+               *flags |= LDLM_FL_BLOCK_GRANTED;
+               RETURN(lock->l_resource);
+       }
+
+       /* I can't check the type of lock here because the lock's bitlock
+        * is not held here, so do the allocation blindly. -jay */
+       OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO);
+       if (node == NULL)  /* Actually, this causes EDEADLOCK to be returned */
+               RETURN(NULL);
+
+       LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR),
+                "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
+
+       lock_res_and_lock(lock);
+
+       res = lock->l_resource;
+       ns  = ldlm_res_to_ns(res);
+
+       lock->l_req_mode = new_mode;
+       if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) {
+               ldlm_resource_unlink_lock(lock);
+       } else {
+               ldlm_resource_unlink_lock(lock);
+               if (res->lr_type == LDLM_EXTENT) {
+                       /* FIXME: ugly code, I have to attach the lock to an
+                        * interval node again since it may be granted soon */
+                       INIT_LIST_HEAD(&node->li_group);
+                       ldlm_interval_attach(node, lock);
+                       node = NULL;
+               }
+       }
+
+       /*
+        * Remove old lock from the pool before adding the lock with new
+        * mode below in ->policy()
+        */
+       ldlm_pool_del(&ns->ns_pool, lock);
+
+       /* If this is a local resource, put it on the appropriate list. */
+       if (ns_is_client(ldlm_res_to_ns(res))) {
+               if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) {
+                       ldlm_resource_add_lock(res, &res->lr_converting, lock);
+               } else {
+                       /* This should never happen, because of the way the
+                        * server handles conversions. */
+                       LDLM_ERROR(lock, "Erroneous flags %x on local lock\n",
+                                  *flags);
+                       LBUG();
+
+                       ldlm_grant_lock(lock, &rpc_list);
+                       granted = 1;
+                       /* FIXME: completion handling not with lr_lock held ! */
+                       if (lock->l_completion_ast)
+                               lock->l_completion_ast(lock, 0, NULL);
+               }
+       } else {
+               CERROR("This is client-side-only module, cannot handle "
+                      "LDLM_NAMESPACE_SERVER resource type lock.\n");
+               LBUG();
+       }
+       unlock_res_and_lock(lock);
+
+       if (granted)
+               ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST);
+       if (node)
+               OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+       RETURN(res);
+}
+EXPORT_SYMBOL(ldlm_lock_convert);
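+
+/*
+ * Hedged usage sketch: per the LASSERTF above only a PR -> PW conversion
+ * is legal here, and a NULL return means the interval-node allocation
+ * failed (treated as EDEADLOCK by callers, per the comment at the
+ * allocation site):
+ *
+ *     __u32 flags = 0;
+ *
+ *     res = ldlm_lock_convert(lock, LCK_PW, &flags);
+ *     if (res == NULL)
+ *             ... handle as EDEADLOCK ...
+ */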
+
+/**
+ * Print lock with lock handle \a lockh description into debug log.
+ *
+ * Used when printing all locks on a resource for debug purposes.
+ */
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
+{
+       struct ldlm_lock *lock;
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       lock = ldlm_handle2lock(lockh);
+       if (lock == NULL)
+               return;
+
+       LDLM_DEBUG_LIMIT(level, lock, "###");
+
+       LDLM_LOCK_PUT(lock);
+}
+EXPORT_SYMBOL(ldlm_lock_dump_handle);
+
+/**
+ * Print lock information with custom message into debug log.
+ * Helper function.
+ */
+void _ldlm_lock_debug(struct ldlm_lock *lock,
+                     struct libcfs_debug_msg_data *msgdata,
+                     const char *fmt, ...)
+{
+       va_list args;
+       struct obd_export *exp = lock->l_export;
+       struct ldlm_resource *resource = lock->l_resource;
+       char *nid = "local";
+
+       va_start(args, fmt);
+
+       if (exp && exp->exp_connection) {
+               nid = libcfs_nid2str(exp->exp_connection->c_peer.nid);
+       } else if (exp && exp->exp_obd != NULL) {
+               struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
+               nid = libcfs_nid2str(imp->imp_connection->c_peer.nid);
+       }
+
+       if (resource == NULL) {
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" nid: %s "
+                      "remote: "LPX64" expref: %d pid: %u timeout: %lu "
+                      "lvb_type: %d\n",
+                      lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               va_end(args);
+               return;
+       }
+
+       switch (resource->lr_type) {
+       case LDLM_EXTENT:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
+                      "] (req "LPU64"->"LPU64") flags: "LPX64" nid: %s remote:"
+                      " "LPX64" expref: %d pid: %u timeout: %lu lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock), lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_policy_data.l_extent.start,
+                      lock->l_policy_data.l_extent.end,
+                      lock->l_req_extent.start, lock->l_req_extent.end,
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+
+       case LDLM_FLOCK:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
+                      "["LPU64"->"LPU64"] flags: "LPX64" nid: %s remote: "LPX64
+                      " expref: %d pid: %u timeout: %lu\n",
+                      ldlm_lock_to_ns_name(lock), lock,
+                      lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_policy_data.l_flock.pid,
+                      lock->l_policy_data.l_flock.start,
+                      lock->l_policy_data.l_flock.end,
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout);
+               break;
+
+       case LDLM_IBITS:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
+                      "flags: "LPX64" nid: %s remote: "LPX64" expref: %d "
+                      "pid: %u timeout: %lu lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock),
+                      lock, lock->l_handle.h_cookie,
+                      atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      lock->l_policy_data.l_inodebits.bits,
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+
+       default:
+               libcfs_debug_vmsg2(msgdata, fmt, args,
+                      " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
+                      "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+                      "nid: %s remote: "LPX64" expref: %d pid: %u timeout: %lu"
+                      "lvb_type: %d\n",
+                      ldlm_lock_to_ns_name(lock),
+                      lock, lock->l_handle.h_cookie,
+                      atomic_read(&lock->l_refc),
+                      lock->l_readers, lock->l_writers,
+                      ldlm_lockname[lock->l_granted_mode],
+                      ldlm_lockname[lock->l_req_mode],
+                      resource->lr_name.name[0],
+                      resource->lr_name.name[1],
+                      atomic_read(&resource->lr_refcount),
+                      ldlm_typename[resource->lr_type],
+                      lock->l_flags, nid, lock->l_remote_handle.cookie,
+                      exp ? atomic_read(&exp->exp_refcount) : -99,
+                      lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type);
+               break;
+       }
+       va_end(args);
+}
+EXPORT_SYMBOL(_ldlm_lock_debug);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
new file mode 100644 (file)
index 0000000..324d5e4
--- /dev/null
@@ -0,0 +1,1238 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_lockd.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <linux/list.h>
+#include "ldlm_internal.h"
+
+static int ldlm_num_threads;
+CFS_MODULE_PARM(ldlm_num_threads, "i", int, 0444,
+               "number of DLM service threads to start");
+
+static char *ldlm_cpts;
+CFS_MODULE_PARM(ldlm_cpts, "s", charp, 0444,
+               "CPU partitions ldlm threads should run on");
+
+extern struct kmem_cache *ldlm_resource_slab;
+extern struct kmem_cache *ldlm_lock_slab;
+static struct mutex    ldlm_ref_mutex;
+static int ldlm_refcount;
+
+struct ldlm_cb_async_args {
+       struct ldlm_cb_set_arg *ca_set_arg;
+       struct ldlm_lock       *ca_lock;
+};
+
+/* LDLM state */
+
+static struct ldlm_state *ldlm_state;
+
+inline cfs_time_t round_timeout(cfs_time_t timeout)
+{
+       return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1);
+}
+
+/* timeout for initial callback (AST) reply (bz10399) */
+static inline unsigned int ldlm_get_rq_timeout(void)
+{
+       /* Non-AT value */
+       unsigned int timeout = min(ldlm_timeout, obd_timeout / 3);
+
+       return timeout < 1 ? 1 : timeout;
+}
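+
+/*
+ * Worked example, assuming the usual defaults of ldlm_timeout = 20s and
+ * obd_timeout = 100s: min(20, 100 / 3) = 20s. The final clamp to 1s only
+ * matters when both tunables are set very low.
+ */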
+
+#define ELT_STOPPED   0
+#define ELT_READY     1
+#define ELT_TERMINATE 2
+
+struct ldlm_bl_pool {
+       spinlock_t              blp_lock;
+
+       /*
+        * blp_prio_list is used for callbacks that should be handled
+        * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
+        * see bug 13843
+        */
+       struct list_head              blp_prio_list;
+
+       /*
+        * blp_list is used for all other callbacks which are likely
+        * to take longer to process.
+        */
+       struct list_head              blp_list;
+
+       wait_queue_head_t            blp_waitq;
+       struct completion       blp_comp;
+       atomic_t            blp_num_threads;
+       atomic_t            blp_busy_threads;
+       int                  blp_min_threads;
+       int                  blp_max_threads;
+};
+
+struct ldlm_bl_work_item {
+       struct list_head              blwi_entry;
+       struct ldlm_namespace  *blwi_ns;
+       struct ldlm_lock_desc   blwi_ld;
+       struct ldlm_lock       *blwi_lock;
+       struct list_head              blwi_head;
+       int                  blwi_count;
+       struct completion       blwi_comp;
+       ldlm_cancel_flags_t     blwi_flags;
+       int                  blwi_mem_pressure;
+};
+
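+/* The waiting-lock (AST timeout) list is a server-side construct; in
+ * this client-side-only module the two helpers below are stubs that
+ * simply report success. */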
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
+{
+       RETURN(0);
+}
+
+int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout)
+{
+       RETURN(0);
+}
+
+/**
+ * Callback handler for receiving incoming blocking ASTs.
+ *
+ * This can only happen on client side.
+ */
+void ldlm_handle_bl_callback(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld, struct ldlm_lock *lock)
+{
+       int do_ast;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client blocking AST callback handler");
+
+       lock_res_and_lock(lock);
+       lock->l_flags |= LDLM_FL_CBPENDING;
+
+       if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+               lock->l_flags |= LDLM_FL_CANCEL;
+
+       do_ast = (!lock->l_readers && !lock->l_writers);
+       unlock_res_and_lock(lock);
+
+       if (do_ast) {
+               CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n",
+                      lock, lock->l_blocking_ast);
+               if (lock->l_blocking_ast != NULL)
+                       lock->l_blocking_ast(lock, ld, lock->l_ast_data,
+                                            LDLM_CB_BLOCKING);
+       } else {
+               CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n",
+                      lock);
+       }
+
+       LDLM_DEBUG(lock, "client blocking callback handler END");
+       LDLM_LOCK_RELEASE(lock);
+       EXIT;
+}
+
+/**
+ * Callback handler for receiving incoming completion ASTs.
+ *
+ * This can only happen on client side.
+ */
+static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns,
+                                   struct ldlm_request *dlm_req,
+                                   struct ldlm_lock *lock)
+{
+       int lvb_len;
+       LIST_HEAD(ast_list);
+       int rc = 0;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client completion callback handler START");
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+               int to = cfs_time_seconds(1);
+               while (to > 0) {
+                       schedule_timeout_and_set_state(
+                               TASK_INTERRUPTIBLE, to);
+                       if (lock->l_granted_mode == lock->l_req_mode ||
+                           lock->l_destroyed)
+                               break;
+               }
+       }
+
+       lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT);
+       if (lvb_len < 0) {
+               LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len);
+               GOTO(out, rc = lvb_len);
+       } else if (lvb_len > 0) {
+               if (lock->l_lvb_len > 0) {
+                       /* for extent lock, lvb contains ost_lvb{}. */
+                       LASSERT(lock->l_lvb_data != NULL);
+
+                       if (unlikely(lock->l_lvb_len < lvb_len)) {
+                               LDLM_ERROR(lock, "Replied LVB is larger than "
+                                          "expectation, expected = %d, "
+                                          "replied = %d",
+                                          lock->l_lvb_len, lvb_len);
+                               GOTO(out, rc = -EINVAL);
+                       }
+               } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has
+                                                    * variable length */
+                       void *lvb_data;
+
+                       OBD_ALLOC(lvb_data, lvb_len);
+                       if (lvb_data == NULL) {
+                               LDLM_ERROR(lock, "No memory: %d.\n", lvb_len);
+                               GOTO(out, rc = -ENOMEM);
+                       }
+
+                       lock_res_and_lock(lock);
+                       LASSERT(lock->l_lvb_data == NULL);
+                       lock->l_lvb_data = lvb_data;
+                       lock->l_lvb_len = lvb_len;
+                       unlock_res_and_lock(lock);
+               }
+       }
+
+       lock_res_and_lock(lock);
+       if (lock->l_destroyed ||
+           lock->l_granted_mode == lock->l_req_mode) {
+               /* bug 11300: the lock has already been granted */
+               unlock_res_and_lock(lock);
+               LDLM_DEBUG(lock, "Double grant race happened");
+               GOTO(out, rc = 0);
+       }
+
+       /* If we receive the completion AST before the actual enqueue returned,
+        * then we might need to switch lock modes, resources, or extents. */
+       if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) {
+               lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
+               LDLM_DEBUG(lock, "completion AST, new lock mode");
+       }
+
+       if (lock->l_resource->lr_type != LDLM_PLAIN) {
+               ldlm_convert_policy_to_local(req->rq_export,
+                                         dlm_req->lock_desc.l_resource.lr_type,
+                                         &dlm_req->lock_desc.l_policy_data,
+                                         &lock->l_policy_data);
+               LDLM_DEBUG(lock, "completion AST, new policy data");
+       }
+
+       ldlm_resource_unlink_lock(lock);
+       if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
+                  &lock->l_resource->lr_name,
+                  sizeof(lock->l_resource->lr_name)) != 0) {
+               unlock_res_and_lock(lock);
+               rc = ldlm_lock_change_resource(ns, lock,
+                               &dlm_req->lock_desc.l_resource.lr_name);
+               if (rc < 0) {
+                       LDLM_ERROR(lock, "Failed to allocate resource");
+                       GOTO(out, rc);
+               }
+               LDLM_DEBUG(lock, "completion AST, new resource");
+               CERROR("change resource!\n");
+               lock_res_and_lock(lock);
+       }
+
+       if (dlm_req->lock_flags & LDLM_FL_AST_SENT) {
+               /* BL_AST locks are not needed in LRU.
+                * Let ldlm_cancel_lru() be fast. */
+               ldlm_lock_remove_from_lru(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+               LDLM_DEBUG(lock, "completion AST includes blocking AST");
+       }
+
+       if (lock->l_lvb_len > 0) {
+               rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT,
+                                  lock->l_lvb_data, lvb_len);
+               if (rc < 0) {
+                       unlock_res_and_lock(lock);
+                       GOTO(out, rc);
+               }
+       }
+
+       ldlm_grant_lock(lock, &ast_list);
+       unlock_res_and_lock(lock);
+
+       LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+
+       /* Let the enqueue path call osc_lock_upcall() and initialize
+        * l_ast_data */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2);
+
+       ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST);
+
+       LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)",
+                         lock);
+       GOTO(out, rc);
+
+out:
+       if (rc < 0) {
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_FAILED;
+               unlock_res_and_lock(lock);
+               wake_up(&lock->l_waitq);
+       }
+       LDLM_LOCK_RELEASE(lock);
+}
+
+/**
+ * Callback handler for receiving incoming glimpse ASTs.
+ *
+ * This can only happen on client side.  After handling the glimpse AST
+ * we also consider dropping the lock here if it is unused locally for a
+ * long time.
+ */
+static void ldlm_handle_gl_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns,
+                                   struct ldlm_request *dlm_req,
+                                   struct ldlm_lock *lock)
+{
+       int rc = -ENOSYS;
+       ENTRY;
+
+       LDLM_DEBUG(lock, "client glimpse AST callback handler");
+
+       if (lock->l_glimpse_ast != NULL)
+               rc = lock->l_glimpse_ast(lock, req);
+
+       if (req->rq_repmsg != NULL) {
+               ptlrpc_reply(req);
+       } else {
+               req->rq_status = rc;
+               ptlrpc_error(req);
+       }
+
+       lock_res_and_lock(lock);
+       if (lock->l_granted_mode == LCK_PW &&
+           !lock->l_readers && !lock->l_writers &&
+           cfs_time_after(cfs_time_current(),
+                          cfs_time_add(lock->l_last_used,
+                                       cfs_time_seconds(10)))) {
+               unlock_res_and_lock(lock);
+               if (ldlm_bl_to_thread_lock(ns, NULL, lock))
+                       ldlm_handle_bl_callback(ns, NULL, lock);
+
+               EXIT;
+               return;
+       }
+       unlock_res_and_lock(lock);
+       LDLM_LOCK_RELEASE(lock);
+       EXIT;
+}
+
+static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
+{
+       if (req->rq_no_reply)
+               return 0;
+
+       req->rq_status = rc;
+       if (!req->rq_packed_final) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       return rc;
+       }
+       return ptlrpc_reply(req);
+}
+
+static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
+                              ldlm_cancel_flags_t cancel_flags)
+{
+       struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+       ENTRY;
+
+       spin_lock(&blp->blp_lock);
+       if (blwi->blwi_lock &&
+           blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) {
+               /* add LDLM_FL_DISCARD_DATA requests to the priority list */
+               list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list);
+       } else {
+               /* other blocking callbacks are added to the regular list */
+               list_add_tail(&blwi->blwi_entry, &blp->blp_list);
+       }
+       spin_unlock(&blp->blp_lock);
+
+       wake_up(&blp->blp_waitq);
+
+       /* cannot check blwi->blwi_flags as blwi could already be freed in
+        * LCF_ASYNC mode */
+       if (!(cancel_flags & LCF_ASYNC))
+               wait_for_completion(&blwi->blwi_comp);
+
+       RETURN(0);
+}
+
+static inline void init_blwi(struct ldlm_bl_work_item *blwi,
+                            struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld,
+                            struct list_head *cancels, int count,
+                            struct ldlm_lock *lock,
+                            ldlm_cancel_flags_t cancel_flags)
+{
+       init_completion(&blwi->blwi_comp);
+       INIT_LIST_HEAD(&blwi->blwi_head);
+
+       if (memory_pressure_get())
+               blwi->blwi_mem_pressure = 1;
+
+       blwi->blwi_ns = ns;
+       blwi->blwi_flags = cancel_flags;
+       if (ld != NULL)
+               blwi->blwi_ld = *ld;
+       if (count) {
+               list_add(&blwi->blwi_head, cancels);
+               list_del_init(cancels);
+               blwi->blwi_count = count;
+       } else {
+               blwi->blwi_lock = lock;
+       }
+}
+
+/**
+ * Queues a list of locks \a cancels containing \a count locks
+ * for later processing by a blocking thread.  If \a count is zero,
+ * then the lock referenced as \a lock is queued instead.
+ *
+ * The blocking thread would then call ->l_blocking_ast callback in the lock.
+ * If list addition fails an error is returned and caller is supposed to
+ * call ->l_blocking_ast itself.
+ */
+static int ldlm_bl_to_thread(struct ldlm_namespace *ns,
+                            struct ldlm_lock_desc *ld,
+                            struct ldlm_lock *lock,
+                            struct list_head *cancels, int count,
+                            ldlm_cancel_flags_t cancel_flags)
+{
+       ENTRY;
+
+       if (cancels && count == 0)
+               RETURN(0);
+
+       if (cancel_flags & LCF_ASYNC) {
+               struct ldlm_bl_work_item *blwi;
+
+               OBD_ALLOC(blwi, sizeof(*blwi));
+               if (blwi == NULL)
+                       RETURN(-ENOMEM);
+               init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags);
+
+               RETURN(__ldlm_bl_to_thread(blwi, cancel_flags));
+       } else {
+               /* if it is a synchronous call, do minimal memory allocation,
+                * as it could be triggered from the kernel shrinker
+                */
+               struct ldlm_bl_work_item blwi;
+
+               memset(&blwi, 0, sizeof(blwi));
+               init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags);
+               RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags));
+       }
+}
+
+int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct ldlm_lock *lock)
+{
+       return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC);
+}
+
+int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld,
+                          struct list_head *cancels, int count,
+                          ldlm_cancel_flags_t cancel_flags)
+{
+       return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags);
+}
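+
+/*
+ * Sketch of the two call modes, matching the branches in
+ * ldlm_bl_to_thread() above. LCF_ASYNC heap-allocates the work item and
+ * returns at once; without it the item lives on the caller's stack and
+ * the caller blocks in __ldlm_bl_to_thread() until blwi_comp fires:
+ *
+ *     ldlm_bl_to_thread_lock(ns, ld, lock);
+ *             ... asynchronous: LCF_ASYNC is passed internally ...
+ *
+ *     ldlm_bl_to_thread_list(ns, NULL, &cancels, count, 0);
+ *             ... synchronous: no LCF_ASYNC, caller waits ...
+ */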
+
+/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */
+static int ldlm_handle_setinfo(struct ptlrpc_request *req)
+{
+       struct obd_device *obd = req->rq_export->exp_obd;
+       char *key;
+       void *val;
+       int keylen, vallen;
+       int rc = -ENOSYS;
+       ENTRY;
+
+       DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name);
+
+       req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO);
+
+       key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       if (key == NULL) {
+               DEBUG_REQ(D_IOCTL, req, "no set_info key");
+               RETURN(-EFAULT);
+       }
+       keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                                     RCL_CLIENT);
+       val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+       if (val == NULL) {
+               DEBUG_REQ(D_IOCTL, req, "no set_info val");
+               RETURN(-EFAULT);
+       }
+       vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                                     RCL_CLIENT);
+
+       /* We are responsible for swabbing contents of val */
+
+       if (KEY_IS(KEY_HSM_COPYTOOL_SEND))
+               /* Pass it on to mdc (the "export" in this case) */
+               rc = obd_set_info_async(req->rq_svc_thread->t_env,
+                                       req->rq_export,
+                                       sizeof(KEY_HSM_COPYTOOL_SEND),
+                                       KEY_HSM_COPYTOOL_SEND,
+                                       vallen, val, NULL);
+       else
+               DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key);
+
+       return rc;
+}
+
+static inline void ldlm_callback_errmsg(struct ptlrpc_request *req,
+                                       const char *msg, int rc,
+                                       struct lustre_handle *handle)
+{
+       DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req,
+                 "%s: [nid %s] [rc %d] [lock "LPX64"]",
+                 msg, libcfs_id2str(req->rq_peer), rc,
+                 handle ? handle->cookie : 0);
+       if (req->rq_no_reply)
+               CWARN("No reply was sent, which may be the cause of bug 21636.\n");
+       else if (rc)
+               CWARN("Sending the reply failed, which may be the cause of bug 21636.\n");
+}
+
+static int ldlm_handle_qc_callback(struct ptlrpc_request *req)
+{
+       struct obd_quotactl *oqctl;
+       struct client_obd *cli = &req->rq_export->exp_obd->u.cli;
+
+       oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       if (oqctl == NULL) {
+               CERROR("Can't unpack obd_quotactl\n");
+               RETURN(-EPROTO);
+       }
+
+       cli->cl_qchk_stat = oqctl->qc_stat;
+       return 0;
+}
+
+/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */
+static int ldlm_callback_handler(struct ptlrpc_request *req)
+{
+       struct ldlm_namespace *ns;
+       struct ldlm_request *dlm_req;
+       struct ldlm_lock *lock;
+       int rc;
+       ENTRY;
+
+       /* Requests arrive in sender's byte order.  The ptlrpc service
+        * handler has already checked and, if necessary, byte-swapped the
+        * incoming request message body, but I am responsible for the
+        * message buffers. */
+
+       /* do nothing for sec context finalize */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI)
+               RETURN(0);
+
+       req_capsule_init(&req->rq_pill, req, RCL_SERVER);
+
+       if (req->rq_export == NULL) {
+               rc = ldlm_callback_reply(req, -ENOTCONN);
+               ldlm_callback_errmsg(req, "Operate on unconnected server",
+                                    rc, NULL);
+               RETURN(0);
+       }
+
+       LASSERT(req->rq_export != NULL);
+       LASSERT(req->rq_export->exp_obd != NULL);
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case LDLM_BL_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_CP_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_GL_CALLBACK:
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET))
+                       RETURN(0);
+               break;
+       case LDLM_SET_INFO:
+               rc = ldlm_handle_setinfo(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */
+               CERROR("shouldn't be handling OBD_LOG_CANCEL on DLM thread\n");
+               req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_cancel(req);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP))
+                       RETURN(0);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_CREATE:
+               req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_open(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+               req_capsule_set(&req->rq_pill,
+                               &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_next_block(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_READ_HEADER:
+               req_capsule_set(&req->rq_pill,
+                               &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_read_header(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case LLOG_ORIGIN_HANDLE_CLOSE:
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET))
+                       RETURN(0);
+               rc = llog_origin_handle_close(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       case OBD_QC_CALLBACK:
+               req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK);
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET))
+                       RETURN(0);
+               rc = ldlm_handle_qc_callback(req);
+               ldlm_callback_reply(req, rc);
+               RETURN(0);
+       default:
+               CERROR("unknown opcode %u\n",
+                      lustre_msg_get_opc(req->rq_reqmsg));
+               ldlm_callback_reply(req, -EPROTO);
+               RETURN(0);
+       }
+
+       ns = req->rq_export->exp_obd->obd_namespace;
+       LASSERT(ns != NULL);
+
+       req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK);
+
+       dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       if (dlm_req == NULL) {
+               rc = ldlm_callback_reply(req, -EPROTO);
+               ldlm_callback_errmsg(req, "Operate without parameter", rc,
+                                    NULL);
+               RETURN(0);
+       }
+
+       /* Force a known safe race, send a cancel to the server for a lock
+        * which the server has already started a blocking callback on. */
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+               rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0);
+               if (rc < 0)
+                       CERROR("ldlm_cli_cancel: %d\n", rc);
+       }
+
+       lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0);
+       if (!lock) {
+               CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock "
+                      "disappeared\n", dlm_req->lock_handle[0].cookie);
+               rc = ldlm_callback_reply(req, -EINVAL);
+               ldlm_callback_errmsg(req, "Operate with invalid parameter", rc,
+                                    &dlm_req->lock_handle[0]);
+               RETURN(0);
+       }
+
+       if ((lock->l_flags & LDLM_FL_FAIL_LOC) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK)
+               OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE);
+
+       /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */
+       lock_res_and_lock(lock);
+       lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags &
+                                             LDLM_AST_FLAGS);
+       if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+               /* If somebody cancels the lock and the cache is already
+                * dropped, or the lock fails before the cp_ast is received
+                * on the client, we can tell the server we have no lock.
+                * Otherwise, we should send the cancel after dropping the
+                * cache. */
+               if (((lock->l_flags & LDLM_FL_CANCELING) &&
+                   (lock->l_flags & LDLM_FL_BL_DONE)) ||
+                   (lock->l_flags & LDLM_FL_FAILED)) {
+                       LDLM_DEBUG(lock, "callback on lock "
+                                  LPX64" - lock disappeared\n",
+                                  dlm_req->lock_handle[0].cookie);
+                       unlock_res_and_lock(lock);
+                       LDLM_LOCK_RELEASE(lock);
+                       rc = ldlm_callback_reply(req, -EINVAL);
+                       ldlm_callback_errmsg(req, "Operate on stale lock", rc,
+                                            &dlm_req->lock_handle[0]);
+                       RETURN(0);
+               }
+               /* BL_AST locks are not needed in LRU.
+                * Let ldlm_cancel_lru() be fast. */
+               ldlm_lock_remove_from_lru(lock);
+               lock->l_flags |= LDLM_FL_BL_AST;
+       }
+       unlock_res_and_lock(lock);
+
+       /* We want the ost thread to get this reply so that it can respond
+        * to ost requests (write cache writeback) that might be triggered
+        * in the callback.
+        *
+        * But we'd also like to be able to indicate in the reply that we're
+        * cancelling right now, because it's unused, or have an intent result
+        * in the reply, so we might have to push the responsibility for sending
+        * the reply down into the AST handlers, alas. */
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case LDLM_BL_CALLBACK:
+               CDEBUG(D_INODE, "blocking ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK);
+               if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
+                       rc = ldlm_callback_reply(req, 0);
+                       if (req->rq_no_reply || rc)
+                               ldlm_callback_errmsg(req, "Normal process", rc,
+                                                    &dlm_req->lock_handle[0]);
+               }
+               if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock))
+                       ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock);
+               break;
+       case LDLM_CP_CALLBACK:
+               CDEBUG(D_INODE, "completion ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK);
+               ldlm_callback_reply(req, 0);
+               ldlm_handle_cp_callback(req, ns, dlm_req, lock);
+               break;
+       case LDLM_GL_CALLBACK:
+               CDEBUG(D_INODE, "glimpse ast\n");
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
+               ldlm_handle_gl_callback(req, ns, dlm_req, lock);
+               break;
+       default:
+               LBUG();                  /* checked above */
+       }
+
+       RETURN(0);
+}
+
+static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp)
+{
+       struct ldlm_bl_work_item *blwi = NULL;
+       static unsigned int num_bl = 0;
+
+       spin_lock(&blp->blp_lock);
+       /* process a request from the blp_list at least once every
+        * blp_num_threads dequeues */
+       if (!list_empty(&blp->blp_list) &&
+           (list_empty(&blp->blp_prio_list) || num_bl == 0))
+               blwi = list_entry(blp->blp_list.next,
+                                 struct ldlm_bl_work_item, blwi_entry);
+       else if (!list_empty(&blp->blp_prio_list))
+               blwi = list_entry(blp->blp_prio_list.next,
+                                 struct ldlm_bl_work_item, blwi_entry);
+
+       if (blwi) {
+               if (++num_bl >= atomic_read(&blp->blp_num_threads))
+                       num_bl = 0;
+               list_del(&blwi->blwi_entry);
+       }
+       spin_unlock(&blp->blp_lock);
+
+       return blwi;
+}
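+
+/*
+ * Scheduling note, added for clarity: num_bl makes the priority list
+ * preferred but not starving; every time num_bl wraps to 0 (once per
+ * blp_num_threads dequeues) the pick is forced to come from the regular
+ * blp_list. With two threads and both lists busy the order is regular,
+ * prio, regular, prio, ...
+ */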
+
+/* This only contains temporary data until the thread starts */
+struct ldlm_bl_thread_data {
+       char                    bltd_name[CFS_CURPROC_COMM_MAX];
+       struct ldlm_bl_pool     *bltd_blp;
+       struct completion       bltd_comp;
+       int                     bltd_num;
+};
+
+static int ldlm_bl_thread_main(void *arg);
+
+static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp)
+{
+       struct ldlm_bl_thread_data bltd = { .bltd_blp = blp };
+       task_t *task;
+
+       init_completion(&bltd.bltd_comp);
+       bltd.bltd_num = atomic_read(&blp->blp_num_threads);
+       snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1,
+               "ldlm_bl_%02d", bltd.bltd_num);
+       task = kthread_run(ldlm_bl_thread_main, &bltd, bltd.bltd_name);
+       if (IS_ERR(task)) {
+               CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n",
+                      atomic_read(&blp->blp_num_threads), PTR_ERR(task));
+               return PTR_ERR(task);
+       }
+       wait_for_completion(&bltd.bltd_comp);
+
+       return 0;
+}
+
+/**
+ * Main blocking requests processing thread.
+ *
+ * Callers put locks into its queue by calling ldlm_bl_to_thread.
+ * This thread in the end ends up doing actual call to ->l_blocking_ast
+ * for queued locks.
+ */
+static int ldlm_bl_thread_main(void *arg)
+{
+       struct ldlm_bl_pool *blp;
+       ENTRY;
+
+       {
+               struct ldlm_bl_thread_data *bltd = arg;
+
+               blp = bltd->bltd_blp;
+
+               atomic_inc(&blp->blp_num_threads);
+               atomic_inc(&blp->blp_busy_threads);
+
+               complete(&bltd->bltd_comp);
+               /* cannot use bltd after this, it is only on caller's stack */
+       }
+
+       while (1) {
+               struct l_wait_info lwi = { 0 };
+               struct ldlm_bl_work_item *blwi = NULL;
+               int busy;
+
+               blwi = ldlm_bl_get_work(blp);
+
+               if (blwi == NULL) {
+                       atomic_dec(&blp->blp_busy_threads);
+                       l_wait_event_exclusive(blp->blp_waitq,
+                                        (blwi = ldlm_bl_get_work(blp)) != NULL,
+                                        &lwi);
+                       busy = atomic_inc_return(&blp->blp_busy_threads);
+               } else {
+                       busy = atomic_read(&blp->blp_busy_threads);
+               }
+
+               if (blwi->blwi_ns == NULL)
+                       /* added by ldlm_cleanup() */
+                       break;
+
+               /* Not fatal if racy and we end up with a few too many threads */
+               if (unlikely(busy < blp->blp_max_threads &&
+                            busy >= atomic_read(&blp->blp_num_threads) &&
+                            !blwi->blwi_mem_pressure))
+                       /* discard the return value, we tried */
+                       ldlm_bl_thread_start(blp);
+
+               if (blwi->blwi_mem_pressure)
+                       memory_pressure_set();
+
+               if (blwi->blwi_count) {
+                       int count;
+                       /* The special case when we cancel locks in the LRU
+                        * asynchronously: the list of locks is passed in
+                        * here, already marked LDLM_FL_CANCELING but NOT
+                        * yet cancelled locally. */
+                       count = ldlm_cli_cancel_list_local(&blwi->blwi_head,
+                                                          blwi->blwi_count,
+                                                          LCF_BL_AST);
+                       ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL,
+                                            blwi->blwi_flags);
+               } else {
+                       ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld,
+                                               blwi->blwi_lock);
+               }
+               if (blwi->blwi_mem_pressure)
+                       memory_pressure_clr();
+
+               if (blwi->blwi_flags & LCF_ASYNC)
+                       OBD_FREE(blwi, sizeof(*blwi));
+               else
+                       complete(&blwi->blwi_comp);
+       }
+
+       atomic_dec(&blp->blp_busy_threads);
+       atomic_dec(&blp->blp_num_threads);
+       complete(&blp->blp_comp);
+       RETURN(0);
+}
+
+
+static int ldlm_setup(void);
+static int ldlm_cleanup(void);
+
+int ldlm_get_ref(void)
+{
+       int rc = 0;
+       ENTRY;
+       mutex_lock(&ldlm_ref_mutex);
+       if (++ldlm_refcount == 1) {
+               rc = ldlm_setup();
+               if (rc)
+                       ldlm_refcount--;
+       }
+       mutex_unlock(&ldlm_ref_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_get_ref);
+
+void ldlm_put_ref(void)
+{
+       ENTRY;
+       mutex_lock(&ldlm_ref_mutex);
+       if (ldlm_refcount == 1) {
+               int rc = ldlm_cleanup();
+               if (rc)
+                       CERROR("ldlm_cleanup failed: %d\n", rc);
+               else
+                       ldlm_refcount--;
+       } else {
+               ldlm_refcount--;
+       }
+       mutex_unlock(&ldlm_ref_mutex);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_put_ref);
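+/*
+ * Usage sketch (illustrative only, not part of this file): code that needs
+ * the LDLM running brackets its use with the reference pair above.  The
+ * first reference triggers ldlm_setup(), the last one ldlm_cleanup():
+ *
+ *     rc = ldlm_get_ref();
+ *     if (rc != 0)
+ *             return rc;
+ *     ... enqueue and cancel locks ...
+ *     ldlm_put_ref();
+ */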
+
+/*
+ * Export handle<->lock hash operations.
+ */
+static unsigned
+ldlm_export_lock_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask);
+}
+
+static void *
+ldlm_export_lock_key(struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       return &lock->l_remote_handle;
+}
+
+static void
+ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key)
+{
+       struct ldlm_lock     *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       lock->l_remote_handle = *(struct lustre_handle *)key;
+}
+
+static int
+ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return lustre_handle_equal(ldlm_export_lock_key(hnode), key);
+}
+
+static void *
+ldlm_export_lock_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+}
+
+static void
+ldlm_export_lock_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       LDLM_LOCK_GET(lock);
+}
+
+static void
+ldlm_export_lock_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_lock *lock;
+
+       lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash);
+       LDLM_LOCK_RELEASE(lock);
+}
+
+static cfs_hash_ops_t ldlm_export_lock_ops = {
+       .hs_hash        = ldlm_export_lock_hash,
+       .hs_key         = ldlm_export_lock_key,
+       .hs_keycmp      = ldlm_export_lock_keycmp,
+       .hs_keycpy      = ldlm_export_lock_keycpy,
+       .hs_object      = ldlm_export_lock_object,
+       .hs_get         = ldlm_export_lock_get,
+       .hs_put         = ldlm_export_lock_put,
+       .hs_put_locked  = ldlm_export_lock_put,
+};
+
+int ldlm_init_export(struct obd_export *exp)
+{
+       ENTRY;
+
+       exp->exp_lock_hash =
+               cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid),
+                               HASH_EXP_LOCK_CUR_BITS,
+                               HASH_EXP_LOCK_MAX_BITS,
+                               HASH_EXP_LOCK_BKT_BITS, 0,
+                               CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
+                               &ldlm_export_lock_ops,
+                               CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY |
+                               CFS_HASH_NBLK_CHANGE);
+
+       if (!exp->exp_lock_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_init_export);
+
+void ldlm_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+       cfs_hash_putref(exp->exp_lock_hash);
+       exp->exp_lock_hash = NULL;
+
+       ldlm_destroy_flock_export(exp);
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_destroy_export);
+
+static int ldlm_setup(void)
+{
+       static struct ptlrpc_service_conf       conf;
+       struct ldlm_bl_pool                     *blp = NULL;
+       int rc = 0;
+       int i;
+       ENTRY;
+
+       if (ldlm_state != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC(ldlm_state, sizeof(*ldlm_state));
+       if (ldlm_state == NULL)
+               RETURN(-ENOMEM);
+
+#ifdef LPROCFS
+       rc = ldlm_proc_setup();
+       if (rc != 0)
+               GOTO(out, rc);
+#endif
+
+       memset(&conf, 0, sizeof(conf));
+       conf = (typeof(conf)) {
+               .psc_name               = "ldlm_cbd",
+               .psc_watchdog_factor    = 2,
+               .psc_buf                = {
+                       .bc_nbufs               = LDLM_CLIENT_NBUFS,
+                       .bc_buf_size            = LDLM_BUFSIZE,
+                       .bc_req_max_size        = LDLM_MAXREQSIZE,
+                       .bc_rep_max_size        = LDLM_MAXREPSIZE,
+                       .bc_req_portal          = LDLM_CB_REQUEST_PORTAL,
+                       .bc_rep_portal          = LDLM_CB_REPLY_PORTAL,
+               },
+               .psc_thr                = {
+                       .tc_thr_name            = "ldlm_cb",
+                       .tc_thr_factor          = LDLM_THR_FACTOR,
+                       .tc_nthrs_init          = LDLM_NTHRS_INIT,
+                       .tc_nthrs_base          = LDLM_NTHRS_BASE,
+                       .tc_nthrs_max           = LDLM_NTHRS_MAX,
+                       .tc_nthrs_user          = ldlm_num_threads,
+                       .tc_cpu_affinity        = 1,
+                       .tc_ctx_tags            = LCT_MD_THREAD | LCT_DT_THREAD,
+               },
+               .psc_cpt                = {
+                       .cc_pattern             = ldlm_cpts,
+               },
+               .psc_ops                = {
+                       .so_req_handler         = ldlm_callback_handler,
+               },
+       };
+       ldlm_state->ldlm_cb_service =
+                       ptlrpc_register_service(&conf, ldlm_svc_proc_dir);
+       if (IS_ERR(ldlm_state->ldlm_cb_service)) {
+               CERROR("failed to start service\n");
+               rc = PTR_ERR(ldlm_state->ldlm_cb_service);
+               ldlm_state->ldlm_cb_service = NULL;
+               GOTO(out, rc);
+       }
+
+
+       OBD_ALLOC(blp, sizeof(*blp));
+       if (blp == NULL)
+               GOTO(out, rc = -ENOMEM);
+       ldlm_state->ldlm_bl_pool = blp;
+
+       spin_lock_init(&blp->blp_lock);
+       INIT_LIST_HEAD(&blp->blp_list);
+       INIT_LIST_HEAD(&blp->blp_prio_list);
+       init_waitqueue_head(&blp->blp_waitq);
+       atomic_set(&blp->blp_num_threads, 0);
+       atomic_set(&blp->blp_busy_threads, 0);
+
+       if (ldlm_num_threads == 0) {
+               blp->blp_min_threads = LDLM_NTHRS_INIT;
+               blp->blp_max_threads = LDLM_NTHRS_MAX;
+       } else {
+               blp->blp_min_threads = blp->blp_max_threads =
+                       min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT,
+                                                        ldlm_num_threads));
+       }
+
+       for (i = 0; i < blp->blp_min_threads; i++) {
+               rc = ldlm_bl_thread_start(blp);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+
+       rc = ldlm_pools_init();
+       if (rc) {
+               CERROR("Failed to initialize LDLM pools: %d\n", rc);
+               GOTO(out, rc);
+       }
+       RETURN(0);
+
+ out:
+       ldlm_cleanup();
+       RETURN(rc);
+}
+
+static int ldlm_cleanup(void)
+{
+       ENTRY;
+
+       if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) ||
+           !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) {
+               CERROR("ldlm still has namespaces; clean these up first.\n");
+               ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+               ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+               RETURN(-EBUSY);
+       }
+
+       ldlm_pools_fini();
+
+       if (ldlm_state->ldlm_bl_pool != NULL) {
+               struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+
+               while (atomic_read(&blp->blp_num_threads) > 0) {
+                       struct ldlm_bl_work_item blwi = { .blwi_ns = NULL };
+
+                       init_completion(&blp->blp_comp);
+
+                       spin_lock(&blp->blp_lock);
+                       list_add_tail(&blwi.blwi_entry, &blp->blp_list);
+                       wake_up(&blp->blp_waitq);
+                       spin_unlock(&blp->blp_lock);
+
+                       wait_for_completion(&blp->blp_comp);
+               }
+
+               OBD_FREE(blp, sizeof(*blp));
+       }
+
+       if (ldlm_state->ldlm_cb_service != NULL)
+               ptlrpc_unregister_service(ldlm_state->ldlm_cb_service);
+
+       ldlm_proc_cleanup();
+
+
+       OBD_FREE(ldlm_state, sizeof(*ldlm_state));
+       ldlm_state = NULL;
+
+       RETURN(0);
+}
+
+int ldlm_init(void)
+{
+       mutex_init(&ldlm_ref_mutex);
+       mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER));
+       mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT));
+       ldlm_resource_slab = kmem_cache_create("ldlm_resources",
+                                              sizeof(struct ldlm_resource), 0,
+                                              SLAB_HWCACHE_ALIGN, NULL);
+       if (ldlm_resource_slab == NULL)
+               return -ENOMEM;
+
+       ldlm_lock_slab = kmem_cache_create("ldlm_locks",
+                             sizeof(struct ldlm_lock), 0,
+                             SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);
+       if (ldlm_lock_slab == NULL) {
+               kmem_cache_destroy(ldlm_resource_slab);
+               return -ENOMEM;
+       }
+
+       ldlm_interval_slab = kmem_cache_create("interval_node",
+                                       sizeof(struct ldlm_interval),
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
+       if (ldlm_interval_slab == NULL) {
+               kmem_cache_destroy(ldlm_resource_slab);
+               kmem_cache_destroy(ldlm_lock_slab);
+               return -ENOMEM;
+       }
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       class_export_dump_hook = ldlm_dump_export_locks;
+#endif
+       return 0;
+}
+
+void ldlm_exit(void)
+{
+       if (ldlm_refcount)
+               CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
+       kmem_cache_destroy(ldlm_resource_slab);
+       /* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so we must
+        * call synchronize_rcu() to wait for a grace period to elapse,
+        * giving ldlm_lock_free() a chance to run. */
+       synchronize_rcu();
+       kmem_cache_destroy(ldlm_lock_slab);
+       kmem_cache_destroy(ldlm_interval_slab);
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c
new file mode 100644 (file)
index 0000000..ec29e28
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_plain.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+/**
+ * This file contains the implementation of the PLAIN lock type.
+ *
+ * PLAIN locks are the simplest form of LDLM locking, and are used when
+ * only a single lock on a resource is needed.  This avoids some of the
+ * complexity of the EXTENT and IBITS lock types, but does not allow
+ * different "parts" of a resource to be locked concurrently.  Example
+ * use cases for PLAIN locks include locking of MGS configuration logs
+ * and (as of Lustre 2.4) quota records.
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+
+#include "ldlm_internal.h"
+
+
+void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy,
+                                    ldlm_policy_data_t *lpolicy)
+{
+       /* No policy for plain locks */
+}
+
+void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy,
+                                    ldlm_wire_policy_data_t *wpolicy)
+{
+       /* No policy for plain locks */
+}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
new file mode 100644 (file)
index 0000000..b3b6028
--- /dev/null
@@ -0,0 +1,1384 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_pool.c
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ */
+
+/*
+ * The idea of this code is rather simple.  Every second, for each server
+ * namespace, we have an SLV - server lock volume - which is calculated from
+ * the current number of granted locks, the grant speed for the past period,
+ * etc. - that is, the locking load.  For simplicity, this SLV number may be
+ * thought of as a flow definition.  It is sent to clients at every
+ * opportunity to let them know the current load situation on the server.
+ * Initially, the SLV on the server is set to a maximum value, calculated as
+ * follows: allow one client to hold all ->pl_limit locks for 10h.
+ *
+ * Next, on clients, the number of cached locks is no longer limited
+ * artificially in any way, as it was before.  Instead, the client calculates
+ * a CLV - client lock volume - for each lock and compares it with the last
+ * SLV from the server.  The CLV is calculated as the number of locks in the
+ * LRU times the lock's live time in seconds.  If CLV > SLV, the lock is
+ * cancelled.
+ *
+ * The client also has an LVF - lock volume factor - which regulates how
+ * sensitive the client should be to the last SLV from the server.  The
+ * higher the LVF, the more locks will be cancelled on the client.  The
+ * default value is 1.  Setting LVF to 2 means the client will cancel locks
+ * twice as fast.
+ *
+ * Locks on a client will be cancelled more aggressively when:
+ * (1) the SLV is smaller, that is, the load on the server is higher;
+ * (2) the client holds many locks (the more locks a client holds, the
+ *     greater the chance that some of them should be cancelled);
+ * (3) the client has old locks (taken some time ago).
+ *
+ * Thus, in the flow paradigm we use to better understand the SLV, the CLV
+ * is the volume of a particle in the flow described by the SLV.  If the
+ * flow gets thinner, more and more particles fall outside of it, and as
+ * the particles are locks, they should be cancelled.
+ *
+ * The general idea belongs to Vitaly Fertman (vitaly@clusterfs.com).
+ * Andreas Dilger (adilger@clusterfs.com) proposed a few nice ideas, such as
+ * using the LVF, and many cleanups.  The flow definition that makes the
+ * logic easier to understand belongs to Nikita Danilov
+ * (nikita@clusterfs.com), as do many cleanups and fixes.  The design and
+ * implementation are by Yury Umanets (umka@clusterfs.com).
+ *
+ * Glossary for terms used:
+ *
+ * pl_limit - Number of allowed locks in pool.  Applies to server and client
+ * side (tunable);
+ *
+ * pl_granted - Number of granted locks (calculated);
+ * pl_grant_rate - Number of granted locks for last T (calculated);
+ * pl_cancel_rate - Number of cancelled locks for last T (calculated);
+ * pl_grant_speed - Grant speed (GR - CR) for last T (calculated);
+ * pl_grant_plan - Planned number of granted locks for next T (calculated);
+ * pl_server_lock_volume - Current server lock volume (calculated);
+ *
+ * As can be seen from the list above, we have a few tunables which can
+ * significantly affect behavior.  They may all be modified via proc, which
+ * also makes it possible to construct several pre-defined behavior
+ * policies; if none of the predefined policies suits the workload in use,
+ * a new one may be "constructed" via the proc tunables.
+ */
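+/*
+ * Worked example (all numbers hypothetical): a client receives SLV =
+ * 10,000,000 from the server and holds 200 locks in its LRU.  A lock that
+ * has lived 60,000 seconds has CLV = 200 * 60,000 = 12,000,000; since
+ * CLV > SLV it is cancelled.  With LVF = 2 the same lock would be dropped
+ * at roughly half that volume, i.e. twice as eagerly.
+ */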
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+# include <lustre_dlm.h>
+
+#include <cl_object.h>
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include "ldlm_internal.h"
+
+
+/*
+ * 50 ldlm locks for 1MB of RAM.
+ */
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
+
+/*
+ * Maximal possible grant step plan in %.
+ */
+#define LDLM_POOL_MAX_GSP (30)
+
+/*
+ * Minimal possible grant step plan in %.
+ */
+#define LDLM_POOL_MIN_GSP (1)
+
+/*
+ * This controls the speed of reaching LDLM_POOL_MAX_GSP
+ * with increasing thread period.
+ */
+#define LDLM_POOL_GSP_STEP_SHIFT (2)
+
+/*
+ * LDLM_POOL_MAX_GSP% of all locks is the default GP.
+ */
+#define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
+
+/*
+ * Max age for locks on clients.
+ */
+#define LDLM_POOL_MAX_AGE (36000)
+
+/*
+ * The granularity of SLV calculation.
+ */
+#define LDLM_POOL_SLV_SHIFT (10)
+
+extern proc_dir_entry_t *ldlm_ns_proc_dir;
+
+static inline __u64 dru(__u64 val, __u32 shift, int round_up)
+{
+       return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift;
+}
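+/*
+ * For example, dru(1023, 10, 0) == 0 (divide by 1024, rounding down),
+ * while dru(1023, 10, 1) == 1 (the same division, rounded up).
+ */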
+
+static inline __u64 ldlm_pool_slv_max(__u32 L)
+{
+       /*
+        * Allow one client to hold all locks for 10 hrs.
+        * The formula is: limit * 10h / 1 client.
+        */
+       __u64 lim = (__u64)L *  LDLM_POOL_MAX_AGE / 1;
+       return lim;
+}
+
+static inline __u64 ldlm_pool_slv_min(__u32 L)
+{
+       return 1;
+}
+
+enum {
+       LDLM_POOL_FIRST_STAT = 0,
+       LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT,
+       LDLM_POOL_GRANT_STAT,
+       LDLM_POOL_CANCEL_STAT,
+       LDLM_POOL_GRANT_RATE_STAT,
+       LDLM_POOL_CANCEL_RATE_STAT,
+       LDLM_POOL_GRANT_PLAN_STAT,
+       LDLM_POOL_SLV_STAT,
+       LDLM_POOL_SHRINK_REQTD_STAT,
+       LDLM_POOL_SHRINK_FREED_STAT,
+       LDLM_POOL_RECALC_STAT,
+       LDLM_POOL_TIMING_STAT,
+       LDLM_POOL_LAST_STAT
+};
+
+static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
+{
+       return container_of(pl, struct ldlm_namespace, ns_pool);
+}
+
+/**
+ * Calculates the suggested grant_step, in % of available locks, for the
+ * passed thread period \a t.  This is later used in grant_plan calculations.
+ */
+static inline int ldlm_pool_t2gsp(unsigned int t)
+{
+       /*
+        * This yields a 1% grant step for anything below LDLM_POOL_GSP_STEP
+        * and up to 30% for anything higher than LDLM_POOL_GSP_STEP.
+        *
+        * This affects execution as follows:
+        *
+        * - for a thread period of 1s we get a grant_step of 1%, which is
+        * good from the point of view of taking load off the server and
+        * pushing it out to clients.  A 1% grant_step means the server will
+        * not let clients take lots of locks in a short period of time while
+        * keeping all their old locks cached; clients will always have to
+        * give some locks back if they want to take new ones;
+        *
+        * - for the default thread period of 10s we get 23%, which means
+        * clients have enough room to take new locks without giving any
+        * back.  Any part of this 23% not consumed by clients in the current
+        * period contributes to SLV growth, and a growing SLV means more
+        * locks cached on clients, until the limit or the grant plan is
+        * reached.
+        */
+       return LDLM_POOL_MAX_GSP -
+               ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >>
+                (t >> LDLM_POOL_GSP_STEP_SHIFT));
+}
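+/*
+ * Plugging in the constants above: t = 1s gives 30 - (29 >> 0) = 1%,
+ * t = 10s (the default period) gives 30 - (29 >> 2) = 23%, and any
+ * t >= 20s saturates at 30 - (29 >> 5) = 30%.
+ */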
+
+/**
+ * Recalculates next grant limit on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
+{
+       int granted, grant_step, limit;
+
+       limit = ldlm_pool_get_limit(pl);
+       granted = atomic_read(&pl->pl_granted);
+
+       grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+       grant_step = ((limit - granted) * grant_step) / 100;
+       pl->pl_grant_plan = granted + grant_step;
+       limit = (limit * 5) >> 2;
+       if (pl->pl_grant_plan > limit)
+               pl->pl_grant_plan = limit;
+}
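+/*
+ * Example (hypothetical numbers): limit = 1000, granted = 800 and a 10s
+ * recalc period give grant_step = 23% of the 200 free locks = 46, so
+ * pl_grant_plan = 846.  The plan is further capped at 5/4 of the limit,
+ * here 1250.
+ */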
+
+/**
+ * Recalculates next SLV on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
+{
+       int granted;
+       int grant_plan;
+       int round_up;
+       __u64 slv;
+       __u64 slv_factor;
+       __u64 grant_usage;
+       __u32 limit;
+
+       slv = pl->pl_server_lock_volume;
+       grant_plan = pl->pl_grant_plan;
+       limit = ldlm_pool_get_limit(pl);
+       granted = atomic_read(&pl->pl_granted);
+       round_up = granted < limit;
+
+       grant_usage = max_t(int, limit - (granted - grant_plan), 1);
+
+       /*
+        * Find the SLV change factor, which is the ratio of grant usage to
+        * the limit.  The SLV changes as fast as the rate of grant plan
+        * consumption: the more of the grant plan clients left unconsumed in
+        * the last interval (idle time), the faster the SLV grows, and
+        * conversely, the more the grant plan is over-consumed (load time),
+        * the faster the SLV drops.
+        */
+       slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT);
+       do_div(slv_factor, limit);
+       slv = slv * slv_factor;
+       slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up);
+
+       if (slv > ldlm_pool_slv_max(limit))
+               slv = ldlm_pool_slv_max(limit);
+       else if (slv < ldlm_pool_slv_min(limit))
+               slv = ldlm_pool_slv_min(limit);
+
+       pl->pl_server_lock_volume = slv;
+}
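+/*
+ * Continuing the example above (hypothetical numbers): limit = 1000,
+ * granted = 900 and grant_plan = 846 give grant_usage = 1000 - 54 = 946
+ * and slv_factor = (946 << 10) / 1000 = 968, so the new SLV is about
+ * 968/1024 ~ 0.95 of the old one.  Over-consuming the plan shrinks the
+ * SLV; under-consuming it (grant_usage > limit) makes the SLV grow.
+ */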
+
+/**
+ * Recalculates next stats on passed \a pl.
+ *
+ * \pre ->pl_lock is locked.
+ */
+static void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
+{
+       int grant_plan = pl->pl_grant_plan;
+       __u64 slv = pl->pl_server_lock_volume;
+       int granted = atomic_read(&pl->pl_granted);
+       int grant_rate = atomic_read(&pl->pl_grant_rate);
+       int cancel_rate = atomic_read(&pl->pl_cancel_rate);
+
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
+                           slv);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+                           granted);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+                           grant_rate);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+                           grant_plan);
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+                           cancel_rate);
+}
+
+/**
+ * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd.
+ */
+static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
+{
+       struct obd_device *obd;
+
+       /*
+        * Set the new SLV in an obd field for later use without accessing
+        * the pool.  This is required to avoid a race between sending a
+        * reply to a client with the new SLV and cleaning up the server
+        * stack, during which we cannot guarantee that the namespace is
+        * still alive.  We only know that the obd is alive as long as a
+        * valid export is.
+        */
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL);
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_slv = pl->pl_server_lock_volume;
+       write_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates all pool fields on passed \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       ENTRY;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period) {
+               spin_unlock(&pl->pl_lock);
+               RETURN(0);
+       }
+       /*
+        * Recalc SLV after last period. This should be done
+        * _before_ recalculating new grant plan.
+        */
+       ldlm_pool_recalc_slv(pl);
+
+       /*
+        * Make sure that pool informed obd of last SLV changes.
+        */
+       ldlm_srv_pool_push_slv(pl);
+
+       /*
+        * Update grant_plan for new period.
+        */
+       ldlm_pool_recalc_grant_plan(pl);
+
+       pl->pl_recalc_time = cfs_time_current_sec();
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                           recalc_interval_sec);
+       spin_unlock(&pl->pl_lock);
+       RETURN(0);
+}
+
+/**
+ * This function is the main entry point for memory pressure handling on
+ * the server side.  It decreases the SLV of \a pl according to the passed
+ * \a nr and \a gfp_mask.
+ *
+ * Our goal here is to decrease the SLV in such a way that clients hold
+ * approximately \a nr fewer locks over the next 10h.
+ */
+static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
+                               int nr, unsigned int gfp_mask)
+{
+       __u32 limit;
+
+       /*
+        * VM is asking how many entries may be potentially freed.
+        */
+       if (nr == 0)
+               return atomic_read(&pl->pl_granted);
+
+       /*
+        * Clients have already cancelled locks, but the server is already in
+        * the shrinker and can't cancel anything itself.  Catch this race.
+        */
+       if (atomic_read(&pl->pl_granted) == 0)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+
+       /*
+        * We want the shrinker to cause cancellation of about @nr locks on
+        * clients, or to grant approximately @nr fewer locks in the next
+        * intervals.
+        *
+        * This is why we decrease the SLV by @nr.  The effect lasts only one
+        * recalc interval (1s these days), which should be enough to pass
+        * the decreased SLV to all clients.  On the next recalc interval the
+        * pool will either increase the SLV if the locking load is not high,
+        * keep it at the same level, or decrease it again; either way the
+        * shrinker-decreased SLV affects the next recalc intervals and thus
+        * lowers the locking load.
+        */
+       if (nr < pl->pl_server_lock_volume) {
+               pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
+       } else {
+               limit = ldlm_pool_get_limit(pl);
+               pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
+       }
+
+       /*
+        * Make sure that pool informed obd of last SLV changes.
+        */
+       ldlm_srv_pool_push_slv(pl);
+       spin_unlock(&pl->pl_lock);
+
+       /*
+        * We have not actually freed any memory here so far; it may only be
+        * freed later.  Return 0 so as not to confuse the VM.
+        */
+       return 0;
+}
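+/*
+ * Example (hypothetical numbers): with SLV = 10,000 and nr = 50 the SLV
+ * drops to 9,950; clients see the smaller volume with the next RPC and
+ * cancel roughly that many locks over the following recalc intervals.
+ */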
+
+/**
+ * Setup server side pool \a pl with passed \a limit.
+ */
+static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
+{
+       struct obd_device *obd;
+
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL && obd != LP_POISON);
+       LASSERT(obd->obd_type != LP_POISON);
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_limit = limit;
+       write_unlock(&obd->obd_pool_lock);
+
+       ldlm_pool_set_limit(pl, limit);
+       return 0;
+}
+
+/**
+ * Sets the SLV and limit from ldlm_pl2ns(pl)->ns_obd in the passed \a pl.
+ */
+static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
+{
+       struct obd_device *obd;
+
+       /*
+        * Get the new SLV and limit from the obd, which is updated by
+        * incoming RPCs.
+        */
+       obd = ldlm_pl2ns(pl)->ns_obd;
+       LASSERT(obd != NULL);
+       read_lock(&obd->obd_pool_lock);
+       pl->pl_server_lock_volume = obd->obd_pool_slv;
+       ldlm_pool_set_limit(pl, obd->obd_pool_limit);
+       read_unlock(&obd->obd_pool_lock);
+}
+
+/**
+ * Recalculates the client side pool \a pl according to the current SLV and
+ * limit.
+ */
+static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       ENTRY;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period)
+               RETURN(0);
+
+       spin_lock(&pl->pl_lock);
+       /*
+        * Check if we really need to recalc the pool now.
+        */
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec < pl->pl_recalc_period) {
+               spin_unlock(&pl->pl_lock);
+               RETURN(0);
+       }
+
+       /*
+        * Make sure that pool knows last SLV and Limit from obd.
+        */
+       ldlm_cli_pool_pop_slv(pl);
+
+       pl->pl_recalc_time = cfs_time_current_sec();
+       lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                           recalc_interval_sec);
+       spin_unlock(&pl->pl_lock);
+
+       /*
+        * Do not cancel locks in case lru resize is disabled for this ns.
+        */
+       if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
+               RETURN(0);
+
+       /*
+        * While cancelling locks on a client we do not need to maintain
+        * sharp timing; we only want to cancel locks as soon as possible
+        * according to the new SLV.  This may be called when the SLV has
+        * changed a lot, which is why we do not take pl->pl_recalc_time
+        * into account here.
+        */
+       RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC,
+                              LDLM_CANCEL_LRUR));
+}
+
+/**
+ * This function is the main entry point for memory pressure handling on
+ * the client side.  Its main goal is to cancel some number of locks on the
+ * passed \a pl according to \a nr and \a gfp_mask.
+ */
+static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
+                               int nr, unsigned int gfp_mask)
+{
+       struct ldlm_namespace *ns;
+       int canceled = 0, unused;
+
+       ns = ldlm_pl2ns(pl);
+
+       /*
+        * Do not cancel locks in case lru resize is disabled for this ns.
+        */
+       if (!ns_connect_lru_resize(ns))
+               RETURN(0);
+
+       /*
+        * Make sure that pool knows last SLV and Limit from obd.
+        */
+       ldlm_cli_pool_pop_slv(pl);
+
+       spin_lock(&ns->ns_lock);
+       unused = ns->ns_nr_unused;
+       spin_unlock(&ns->ns_lock);
+
+       if (nr) {
+               canceled = ldlm_cancel_lru(ns, nr, LCF_ASYNC,
+                                          LDLM_CANCEL_SHRINK);
+       }
+       /*
+        * Return the number of potentially reclaimable locks.
+        */
+       return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure;
+}
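+/*
+ * Example (hypothetical numbers): with 1000 unused locks, 100 of them
+ * cancelled by this call and sysctl_vfs_cache_pressure = 100, the function
+ * reports ((1000 - 100) / 100) * 100 = 900 potentially reclaimable locks.
+ */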
+
+struct ldlm_pool_ops ldlm_srv_pool_ops = {
+       .po_recalc = ldlm_srv_pool_recalc,
+       .po_shrink = ldlm_srv_pool_shrink,
+       .po_setup  = ldlm_srv_pool_setup
+};
+
+struct ldlm_pool_ops ldlm_cli_pool_ops = {
+       .po_recalc = ldlm_cli_pool_recalc,
+       .po_shrink = ldlm_cli_pool_shrink
+};
+
+/**
+ * Pool recalc wrapper.  Calls either the client or the server pool recalc
+ * callback, depending on which kind of pool \a pl is.
+ */
+int ldlm_pool_recalc(struct ldlm_pool *pl)
+{
+       time_t recalc_interval_sec;
+       int count;
+
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec <= 0)
+               goto recalc;
+
+       spin_lock(&pl->pl_lock);
+       recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time;
+       if (recalc_interval_sec > 0) {
+               /*
+                * Update pool statistics every 1s.
+                */
+               ldlm_pool_recalc_stats(pl);
+
+               /*
+                * Zero out all rates and speed for the last period.
+                */
+               atomic_set(&pl->pl_grant_rate, 0);
+               atomic_set(&pl->pl_cancel_rate, 0);
+       }
+       spin_unlock(&pl->pl_lock);
+
+ recalc:
+       if (pl->pl_ops->po_recalc != NULL) {
+               count = pl->pl_ops->po_recalc(pl);
+               lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+                                   count);
+               return count;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_recalc);
+
+/**
+ * Pool shrink wrapper.  Calls either the client or the server pool shrink
+ * callback, depending on which kind of pool \a pl is.
+ */
+int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
+                    unsigned int gfp_mask)
+{
+       int cancel = 0;
+
+       if (pl->pl_ops->po_shrink != NULL) {
+               cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
+               if (nr > 0) {
+                       lprocfs_counter_add(pl->pl_stats,
+                                           LDLM_POOL_SHRINK_REQTD_STAT,
+                                           nr);
+                       lprocfs_counter_add(pl->pl_stats,
+                                           LDLM_POOL_SHRINK_FREED_STAT,
+                                           cancel);
+                       CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
+                              "shrunk %d\n", pl->pl_name, nr, cancel);
+               }
+       }
+       return cancel;
+}
+EXPORT_SYMBOL(ldlm_pool_shrink);
+
+/**
+ * Pool setup wrapper.  Calls either the client or the server pool setup
+ * callback, depending on which kind of pool \a pl is.
+ *
+ * Sets the passed \a limit in pool \a pl.
+ */
+int ldlm_pool_setup(struct ldlm_pool *pl, int limit)
+{
+       if (pl->pl_ops->po_setup != NULL)
+               return pl->pl_ops->po_setup(pl, limit);
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_pool_setup);
+
+static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused)
+{
+       int granted, grant_rate, cancel_rate, grant_step;
+       int grant_speed, grant_plan, lvf;
+       struct ldlm_pool *pl = m->private;
+       __u64 slv, clv;
+       __u32 limit;
+
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_server_lock_volume;
+       clv = pl->pl_client_lock_volume;
+       limit = ldlm_pool_get_limit(pl);
+       grant_plan = pl->pl_grant_plan;
+       granted = atomic_read(&pl->pl_granted);
+       grant_rate = atomic_read(&pl->pl_grant_rate);
+       cancel_rate = atomic_read(&pl->pl_cancel_rate);
+       grant_speed = grant_rate - cancel_rate;
+       lvf = atomic_read(&pl->pl_lock_volume_factor);
+       grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period);
+       spin_unlock(&pl->pl_lock);
+
+       seq_printf(m, "LDLM pool state (%s):\n"
+                     "  SLV: "LPU64"\n"
+                     "  CLV: "LPU64"\n"
+                     "  LVF: %d\n",
+                     pl->pl_name, slv, clv, lvf);
+
+       if (ns_is_server(ldlm_pl2ns(pl))) {
+               seq_printf(m, "  GSP: %d%%\n"
+                             "  GP:  %d\n",
+                             grant_step, grant_plan);
+       }
+       seq_printf(m, "  GR:  %d\n" "  CR:  %d\n" "  GS:  %d\n"
+                     "  G:   %d\n" "  L:   %d\n",
+                     grant_rate, cancel_rate, grant_speed,
+                     granted, limit);
+
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(lprocfs_pool_state);
+
+static int lprocfs_grant_speed_seq_show(struct seq_file *m, void *unused)
+{
+       struct ldlm_pool *pl = m->private;
+       int            grant_speed;
+
+       spin_lock(&pl->pl_lock);
+       /* serialize with ldlm_pool_recalc */
+       grant_speed = atomic_read(&pl->pl_grant_rate) -
+                       atomic_read(&pl->pl_cancel_rate);
+       spin_unlock(&pl->pl_lock);
+       return lprocfs_rd_uint(m, &grant_speed);
+}
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(grant_plan, int);
+LPROC_SEQ_FOPS_RO(lprocfs_grant_plan);
+
+LDLM_POOL_PROC_READER_SEQ_SHOW(recalc_period, int);
+LDLM_POOL_PROC_WRITER(recalc_period, int);
+static ssize_t lprocfs_recalc_period_seq_write(struct file *file, const char *buf,
+                                          size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+
+       return lprocfs_wr_recalc_period(file, buf, len, seq->private);
+}
+LPROC_SEQ_FOPS(lprocfs_recalc_period);
+
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, u64);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm_pool, atomic);
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_pool_rw, atomic);
+
+LPROC_SEQ_FOPS_RO(lprocfs_grant_speed);
+
+#define LDLM_POOL_ADD_VAR(name, var, ops)                      \
+       do {                                                    \
+               snprintf(var_name, MAX_STRING_SIZE, "%s", name);\
+               pool_vars[0].data = var;                        \
+               pool_vars[0].fops = ops;                        \
+               lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);\
+       } while (0)
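+/*
+ * For reference, LDLM_POOL_ADD_VAR("granted", &pl->pl_granted,
+ * &ldlm_pool_atomic_fops) expands to:
+ *
+ *     snprintf(var_name, MAX_STRING_SIZE, "%s", "granted");
+ *     pool_vars[0].data = &pl->pl_granted;
+ *     pool_vars[0].fops = &ldlm_pool_atomic_fops;
+ *     lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0);
+ *
+ * i.e. each invocation registers one file under the pool's proc directory.
+ */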
+
+static int ldlm_pool_proc_init(struct ldlm_pool *pl)
+{
+       struct ldlm_namespace *ns = ldlm_pl2ns(pl);
+       struct proc_dir_entry *parent_ns_proc;
+       struct lprocfs_vars pool_vars[2];
+       char *var_name = NULL;
+       int rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(var_name, MAX_STRING_SIZE + 1);
+       if (!var_name)
+               RETURN(-ENOMEM);
+
+       parent_ns_proc = ns->ns_proc_dir_entry;
+       if (parent_ns_proc == NULL) {
+               CERROR("%s: proc entry is not initialized\n",
+                      ldlm_ns_name(ns));
+               GOTO(out_free_name, rc = -EINVAL);
+       }
+       pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc,
+                                          NULL, NULL);
+       if (IS_ERR(pl->pl_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-pool-init\n");
+               rc = PTR_ERR(pl->pl_proc_dir);
+               GOTO(out_free_name, rc);
+       }
+
+       var_name[MAX_STRING_SIZE] = '\0';
+       memset(pool_vars, 0, sizeof(pool_vars));
+       pool_vars[0].name = var_name;
+
+       LDLM_POOL_ADD_VAR("server_lock_volume", &pl->pl_server_lock_volume,
+                         &ldlm_pool_u64_fops);
+       LDLM_POOL_ADD_VAR("limit", &pl->pl_limit, &ldlm_pool_rw_atomic_fops);
+       LDLM_POOL_ADD_VAR("granted", &pl->pl_granted, &ldlm_pool_atomic_fops);
+       LDLM_POOL_ADD_VAR("grant_speed", pl, &lprocfs_grant_speed_fops);
+       LDLM_POOL_ADD_VAR("cancel_rate", &pl->pl_cancel_rate,
+                         &ldlm_pool_atomic_fops);
+       LDLM_POOL_ADD_VAR("grant_rate", &pl->pl_grant_rate,
+                         &ldlm_pool_atomic_fops);
+       LDLM_POOL_ADD_VAR("grant_plan", pl, &lprocfs_grant_plan_fops);
+       LDLM_POOL_ADD_VAR("recalc_period", pl, &lprocfs_recalc_period_fops);
+       LDLM_POOL_ADD_VAR("lock_volume_factor", &pl->pl_lock_volume_factor,
+                         &ldlm_pool_rw_atomic_fops);
+       LDLM_POOL_ADD_VAR("state", pl, &lprocfs_pool_state_fops);
+
+       pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT -
+                                          LDLM_POOL_FIRST_STAT, 0);
+       if (!pl->pl_stats)
+               GOTO(out_free_name, rc = -ENOMEM);
+
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "granted", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "cancel", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant_rate", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "cancel_rate", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "grant_plan", "locks/s");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "slv", "slv");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "shrink_request", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "shrink_freed", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "recalc_freed", "locks");
+       lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT,
+                            LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
+                            "recalc_timing", "sec");
+       rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats);
+
+       EXIT;
+out_free_name:
+       OBD_FREE(var_name, MAX_STRING_SIZE + 1);
+       return rc;
+}
+
+static void ldlm_pool_proc_fini(struct ldlm_pool *pl)
+{
+       if (pl->pl_stats != NULL) {
+               lprocfs_free_stats(&pl->pl_stats);
+               pl->pl_stats = NULL;
+       }
+       if (pl->pl_proc_dir != NULL) {
+               lprocfs_remove(&pl->pl_proc_dir);
+               pl->pl_proc_dir = NULL;
+       }
+}
+
+int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns,
+                  int idx, ldlm_side_t client)
+{
+       int rc;
+       ENTRY;
+
+       spin_lock_init(&pl->pl_lock);
+       atomic_set(&pl->pl_granted, 0);
+       pl->pl_recalc_time = cfs_time_current_sec();
+       atomic_set(&pl->pl_lock_volume_factor, 1);
+
+       atomic_set(&pl->pl_grant_rate, 0);
+       atomic_set(&pl->pl_cancel_rate, 0);
+       pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L);
+
+       snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d",
+                ldlm_ns_name(ns), idx);
+
+       if (client == LDLM_NAMESPACE_SERVER) {
+               pl->pl_ops = &ldlm_srv_pool_ops;
+               ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L);
+               pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD;
+               pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L);
+       } else {
+               ldlm_pool_set_limit(pl, 1);
+               pl->pl_server_lock_volume = 0;
+               pl->pl_ops = &ldlm_cli_pool_ops;
+               pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
+       }
+       pl->pl_client_lock_volume = 0;
+       rc = ldlm_pool_proc_init(pl);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pool_init);
+
+void ldlm_pool_fini(struct ldlm_pool *pl)
+{
+       ENTRY;
+       ldlm_pool_proc_fini(pl);
+
+       /*
+        * The pool should not be used after this point.  We can't free it
+        * here, as it lives in struct ldlm_namespace, but we are still
+        * interested in catching any abnormal use.
+        */
+       POISON(pl, 0x5a, sizeof(*pl));
+       EXIT;
+}
+EXPORT_SYMBOL(ldlm_pool_fini);
+
+/**
+ * Add new taken ldlm lock \a lock into pool \a pl accounting.
+ */
+void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+       /*
+        * FLOCK locks are special in the sense that they are almost never
+        * cancelled; instead, a special kind of lock is used to drop them.
+        * There is also no LRU for flock locks, so there is no point in
+        * tracking them anyway.
+        */
+       if (lock->l_resource->lr_type == LDLM_FLOCK)
+               return;
+
+       atomic_inc(&pl->pl_granted);
+       atomic_inc(&pl->pl_grant_rate);
+       lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
+       /*
+        * Do not do pool recalc on the client side, as all locks which may
+        * potentially be cancelled have already been packed into the
+        * enqueue/cancel RPC.  Also, we do not want to run out of stack
+        * with overly long call paths.
+        */
+       if (ns_is_server(ldlm_pl2ns(pl)))
+               ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_add);
+
+/**
+ * Remove ldlm lock \a lock from pool \a pl accounting.
+ */
+void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
+{
+       /*
+        * Filter out FLOCK locks.  See the comment in ldlm_pool_add() above.
+        */
+       if (lock->l_resource->lr_type == LDLM_FLOCK)
+               return;
+
+       LASSERT(atomic_read(&pl->pl_granted) > 0);
+       atomic_dec(&pl->pl_granted);
+       atomic_inc(&pl->pl_cancel_rate);
+
+       lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
+
+       if (ns_is_server(ldlm_pl2ns(pl)))
+               ldlm_pool_recalc(pl);
+}
+EXPORT_SYMBOL(ldlm_pool_del);
+
+/**
+ * Returns current \a pl SLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
+{
+       __u64 slv;
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_server_lock_volume;
+       spin_unlock(&pl->pl_lock);
+       return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_slv);
+
+/**
+ * Sets passed \a slv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
+{
+       spin_lock(&pl->pl_lock);
+       pl->pl_server_lock_volume = slv;
+       spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_slv);
+
+/**
+ * Returns current \a pl CLV.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+__u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
+{
+       __u64 slv;
+       spin_lock(&pl->pl_lock);
+       slv = pl->pl_client_lock_volume;
+       spin_unlock(&pl->pl_lock);
+       return slv;
+}
+EXPORT_SYMBOL(ldlm_pool_get_clv);
+
+/**
+ * Sets passed \a clv to \a pl.
+ *
+ * \pre ->pl_lock is not locked.
+ */
+void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
+{
+       spin_lock(&pl->pl_lock);
+       pl->pl_client_lock_volume = clv;
+       spin_unlock(&pl->pl_lock);
+}
+EXPORT_SYMBOL(ldlm_pool_set_clv);
+
+/**
+ * Returns current \a pl limit.
+ */
+__u32 ldlm_pool_get_limit(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_limit);
+}
+EXPORT_SYMBOL(ldlm_pool_get_limit);
+
+/**
+ * Sets passed \a limit to \a pl.
+ */
+void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit)
+{
+       atomic_set(&pl->pl_limit, limit);
+}
+EXPORT_SYMBOL(ldlm_pool_set_limit);
+
+/**
+ * Returns current LVF from \a pl.
+ */
+__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_lock_volume_factor);
+}
+EXPORT_SYMBOL(ldlm_pool_get_lvf);
+
+static int ldlm_pool_granted(struct ldlm_pool *pl)
+{
+       return atomic_read(&pl->pl_granted);
+}
+
+static struct ptlrpc_thread *ldlm_pools_thread;
+static struct shrinker *ldlm_pools_srv_shrinker;
+static struct shrinker *ldlm_pools_cli_shrinker;
+static struct completion ldlm_pools_comp;
+
+/*
+ * Cancel \a nr locks from all namespaces (if possible).  Returns the number
+ * of cached locks after the shrink has finished.  All namespaces are asked
+ * to cancel an approximately equal number of locks to keep the load
+ * balanced.
+ */
+static int ldlm_pools_shrink(ldlm_side_t client, int nr,
+                            unsigned int gfp_mask)
+{
+       int total = 0, cached = 0, nr_ns;
+       struct ldlm_namespace *ns;
+       void *cookie;
+
+       if (client == LDLM_NAMESPACE_CLIENT && nr != 0 &&
+           !(gfp_mask & __GFP_FS))
+               return -1;
+
+       CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n",
+              nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
+
+       cookie = cl_env_reenter();
+
+       /*
+        * Find out how many resources we may release.
+        */
+       for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+            nr_ns > 0; nr_ns--) {
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       cl_env_reexit(cookie);
+                       return 0;
+               }
+               ns = ldlm_namespace_first_locked(client);
+               ldlm_namespace_get(ns);
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+               total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
+               ldlm_namespace_put(ns);
+       }
+
+       if (nr == 0 || total == 0) {
+               cl_env_reexit(cookie);
+               return total;
+       }
+
+       /*
+        * Shrink at least ldlm_namespace_nr(client) namespaces.
+        */
+       for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+            nr_ns > 0; nr_ns--) {
+               int cancel, nr_locks;
+
+               /*
+                * Do not call shrink under ldlm_namespace_lock(client)
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       /*
+                        * If the list is empty, we can't return any
+                        * @cached > 0, as that would probably cause a
+                        * needless shrinker call.
+                        */
+                       cached = 0;
+                       break;
+               }
+               ns = ldlm_namespace_first_locked(client);
+               ldlm_namespace_get(ns);
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+
+               nr_locks = ldlm_pool_granted(&ns->ns_pool);
+               cancel = 1 + nr_locks * nr / total;
+               ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
+               cached += ldlm_pool_granted(&ns->ns_pool);
+               ldlm_namespace_put(ns);
+       }
+       cl_env_reexit(cookie);
+       /* We only decrease the SLV in the server pools' shrinker; return -1
+        * to the kernel to avoid a needless loop. LU-1128 */
+       return (client == LDLM_NAMESPACE_SERVER) ? -1 : cached;
+}
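+/*
+ * Example (hypothetical numbers): if the first pass found total = 1000
+ * granted locks and the VM asked for nr = 100, a namespace holding 300
+ * locks is asked to cancel 1 + 300 * 100 / 1000 = 31 of them, spreading
+ * the pressure proportionally across namespaces.
+ */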
+
+static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER,
+                                shrink_param(sc, nr_to_scan),
+                                shrink_param(sc, gfp_mask));
+}
+
+static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT,
+                                shrink_param(sc, nr_to_scan),
+                                shrink_param(sc, gfp_mask));
+}
+
+void ldlm_pools_recalc(ldlm_side_t client)
+{
+       __u32 nr_l = 0, nr_p = 0, l;
+       struct ldlm_namespace *ns;
+       int nr, equal = 0;
+
+       /*
+        * No need to setup pool limit for client pools.
+        */
+       if (client == LDLM_NAMESPACE_SERVER) {
+               /*
+                * Check all modest namespaces first.
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               list_for_each_entry(ns, ldlm_namespace_list(client),
+                                   ns_list_chain) {
+                       if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
+                               continue;
+
+                       l = ldlm_pool_granted(&ns->ns_pool);
+                       if (l == 0)
+                               l = 1;
+
+                       /*
+                        * Set the modest pools' limit equal to their average
+                        * number of granted locks + ~6%.
+                        */
+                       l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0);
+                       ldlm_pool_setup(&ns->ns_pool, l);
+                       nr_l += l;
+                       nr_p++;
+               }
+
+               /*
+                * Make sure that modest namespaces did not eat more than 2/3
+                * of the limit.
+                */
+               if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
+                       CWARN("\"Modest\" pools eat out 2/3 of server locks "
+                             "limit (%d of %lu). This means that you have too "
+                             "many clients for this amount of server RAM. "
+                             "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L);
+                       equal = 1;
+               }
+
+               /*
+                * The rest is given to greedy namespaces.
+                */
+               list_for_each_entry(ns, ldlm_namespace_list(client),
+                                       ns_list_chain)
+               {
+                       if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
+                               continue;
+
+                       if (equal) {
+                               /*
+                                * In the case where 2/3 of the locks are
+                                * eaten by modest pools, we re-set up an
+                                * equal limit for _all_ pools.
+                                */
+                               l = LDLM_POOL_HOST_L /
+                                       atomic_read(
+                                               ldlm_namespace_nr(client));
+                       } else {
+                               /*
+                                * The remaining greedy pools share the
+                                * remaining locks in equal parts.
+                                */
+                               l = (LDLM_POOL_HOST_L - nr_l) /
+                                       (atomic_read(
+                                               ldlm_namespace_nr(client)) -
+                                        nr_p);
+                       }
+                       ldlm_pool_setup(&ns->ns_pool, l);
+               }
+               mutex_unlock(ldlm_namespace_lock(client));
+       }
+
+       /*
+        * Recalc at least ldlm_namespace_nr(client) namespaces.
+        */
+       for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
+               int     skip;
+               /*
+                * Lock the list, get first @ns in the list, getref, move it
+                * to the tail, unlock and call pool recalc. This way we avoid
+                * calling recalc under the @ns lock, which is really good as
+                * it gets rid of a potential deadlock on client nodes when
+                * canceling locks synchronously.
+                */
+               mutex_lock(ldlm_namespace_lock(client));
+               if (list_empty(ldlm_namespace_list(client))) {
+                       mutex_unlock(ldlm_namespace_lock(client));
+                       break;
+               }
+               ns = ldlm_namespace_first_locked(client);
+
+               spin_lock(&ns->ns_lock);
+               /*
+                * Skip an ns that is being freed, as we don't want to increase
+                * its refcount again, not even temporarily. bz21519 & LU-499.
+                */
+               if (ns->ns_stopping) {
+                       skip = 1;
+               } else {
+                       skip = 0;
+                       ldlm_namespace_get(ns);
+               }
+               spin_unlock(&ns->ns_lock);
+
+               ldlm_namespace_move_locked(ns, client);
+               mutex_unlock(ldlm_namespace_lock(client));
+
+               /*
+                * After setup is done - recalc the pool.
+                */
+               if (!skip) {
+                       ldlm_pool_recalc(&ns->ns_pool);
+                       ldlm_namespace_put(ns);
+               }
+       }
+}
+EXPORT_SYMBOL(ldlm_pools_recalc);
+
+static int ldlm_pools_thread_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+       ENTRY;
+
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n",
+               "ldlm_poold", current_pid());
+
+       while (1) {
+               struct l_wait_info lwi;
+
+               /*
+                * Recalc all pools on this tick.
+                */
+               ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
+               ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
+
+               /*
+                * Wait until the next check time, or until we're
+                * stopped.
+                */
+               lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
+                                 NULL, NULL);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopping(thread) ||
+                            thread_is_event(thread),
+                            &lwi);
+
+               if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+                       break;
+               else
+                       thread_test_and_clear_flags(thread, SVC_EVENT);
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n",
+               "ldlm_poold", current_pid());
+
+       complete_and_exit(&ldlm_pools_comp, 0);
+}
+
+static int ldlm_pools_thread_start(void)
+{
+       struct l_wait_info lwi = { 0 };
+       task_t *task;
+       ENTRY;
+
+       if (ldlm_pools_thread != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC_PTR(ldlm_pools_thread);
+       if (ldlm_pools_thread == NULL)
+               RETURN(-ENOMEM);
+
+       init_completion(&ldlm_pools_comp);
+       init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq);
+
+       task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread,
+                          "ldlm_poold");
+       if (IS_ERR(task)) {
+               CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task));
+               OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread));
+               ldlm_pools_thread = NULL;
+               RETURN(PTR_ERR(task));
+       }
+       l_wait_event(ldlm_pools_thread->t_ctl_waitq,
+                    thread_is_running(ldlm_pools_thread), &lwi);
+       RETURN(0);
+}
+
+static void ldlm_pools_thread_stop(void)
+{
+       ENTRY;
+
+       if (ldlm_pools_thread == NULL) {
+               EXIT;
+               return;
+       }
+
+       thread_set_flags(ldlm_pools_thread, SVC_STOPPING);
+       wake_up(&ldlm_pools_thread->t_ctl_waitq);
+
+       /*
+        * Make sure that pools thread is finished before freeing @thread.
+        * This fixes possible race and oops due to accessing freed memory
+        * in pools thread.
+        */
+       wait_for_completion(&ldlm_pools_comp);
+       OBD_FREE_PTR(ldlm_pools_thread);
+       ldlm_pools_thread = NULL;
+       EXIT;
+}
+
+int ldlm_pools_init(void)
+{
+       int rc;
+       ENTRY;
+
+       rc = ldlm_pools_thread_start();
+       if (rc == 0) {
+               ldlm_pools_srv_shrinker =
+                       set_shrinker(DEFAULT_SEEKS,
+                                        ldlm_pools_srv_shrink);
+               ldlm_pools_cli_shrinker =
+                       set_shrinker(DEFAULT_SEEKS,
+                                        ldlm_pools_cli_shrink);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_pools_init);
+
+void ldlm_pools_fini(void)
+{
+       if (ldlm_pools_srv_shrinker != NULL) {
+               remove_shrinker(ldlm_pools_srv_shrinker);
+               ldlm_pools_srv_shrinker = NULL;
+       }
+       if (ldlm_pools_cli_shrinker != NULL) {
+               remove_shrinker(ldlm_pools_cli_shrinker);
+               ldlm_pools_cli_shrinker = NULL;
+       }
+       ldlm_pools_thread_stop();
+}
+EXPORT_SYMBOL(ldlm_pools_fini);
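+
+/*
+ * A minimal lifecycle sketch (hedged; the caller function names are
+ * illustrative): ldlm_pools_init() starts ldlm_poold and registers both
+ * shrinkers, and ldlm_pools_fini() must undo both on teardown:
+ *
+ *     static int __init my_setup(void)
+ *     {
+ *             int rc = ldlm_pools_init();
+ *
+ *             if (rc)
+ *                     return rc;
+ *             return 0;
+ *     }
+ *
+ *     static void __exit my_teardown(void)
+ *     {
+ *             ldlm_pools_fini();
+ *     }
+ */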
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
new file mode 100644 (file)
index 0000000..1a690ed
--- /dev/null
@@ -0,0 +1,2333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/**
+ * This file contains Asynchronous System Trap (AST) handlers and related
+ * LDLM request-processing routines.
+ *
+ * An AST is a callback issued on a lock when its state is changed. There are
+ * several different types of ASTs (callbacks) registered for each lock:
+ *
+ * - completion AST: when a lock is enqueued by some process, but cannot be
+ *   granted immediately due to other conflicting locks on the same resource,
+ *   the completion AST is sent to notify the caller when the lock is
+ *   eventually granted
+ *
+ * - blocking AST: when a lock is granted to some process, if another process
+ *   enqueues a conflicting (blocking) lock on a resource, a blocking AST is
+ *   sent to notify the holder(s) of the lock(s) of the conflicting lock
+ *   request. The lock holder(s) must release their lock(s) on that resource in
+ *   a timely manner or be evicted by the server.
+ *
+ * - glimpse AST: this is used when a process wants information about a lock
+ *   (i.e. the lock value block (LVB)) but does not necessarily require holding
+ *   the lock. If the resource is locked, the lock holder(s) are sent glimpse
+ *   ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
+ *   their lock(s) if they are idle. If the resource is not locked, the server
+ *   may grant the lock.
+ */
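+
+/*
+ * Shape of a typical blocking AST implementation (a hedged sketch only;
+ * ldlm_blocking_ast() below is the canonical variant, and any name here
+ * that is not defined in this file is illustrative):
+ *
+ *     static int my_blocking_ast(struct ldlm_lock *lock,
+ *                                struct ldlm_lock_desc *desc,
+ *                                void *data, int flag)
+ *     {
+ *             if (flag == LDLM_CB_CANCELING)
+ *                     return 0;
+ *             lock_res_and_lock(lock);
+ *             return ldlm_blocking_ast_nocheck(lock);
+ *     }
+ *
+ * Note that ldlm_blocking_ast_nocheck() expects the resource lock to be
+ * held and drops it itself.
+ */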
+
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <obd.h>
+
+#include "ldlm_internal.h"
+
+int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
+CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
+               "lock enqueue timeout minimum");
+
+/* in client side, whether the cached locks will be canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
+static void interrupted_completion_wait(void *data)
+{
+}
+
+struct lock_wait_data {
+       struct ldlm_lock *lwd_lock;
+       __u32        lwd_conn_cnt;
+};
+
+struct ldlm_async_args {
+       struct lustre_handle lock_handle;
+};
+
+int ldlm_expired_completion_wait(void *data)
+{
+       struct lock_wait_data *lwd = data;
+       struct ldlm_lock *lock = lwd->lwd_lock;
+       struct obd_import *imp;
+       struct obd_device *obd;
+
+       ENTRY;
+       if (lock->l_conn_export == NULL) {
+               static cfs_time_t next_dump = 0, last_dump = 0;
+
+               if (ptlrpc_check_suspend())
+                       RETURN(0);
+
+               LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", "
+                             CFS_DURATION_T"s ago)\n",
+                             lock->l_last_activity,
+                             cfs_time_sub(cfs_time_current_sec(),
+                                          lock->l_last_activity));
+               LDLM_DEBUG(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+                          CFS_DURATION_T"s ago); not entering recovery in "
+                          "server code, just going back to sleep",
+                          lock->l_last_activity,
+                          cfs_time_sub(cfs_time_current_sec(),
+                                       lock->l_last_activity));
+               if (cfs_time_after(cfs_time_current(), next_dump)) {
+                       last_dump = next_dump;
+                       next_dump = cfs_time_shift(300);
+                       ldlm_namespace_dump(D_DLMTRACE,
+                                           ldlm_lock_to_ns(lock));
+                       if (last_dump == 0)
+                               libcfs_debug_dumplog();
+               }
+               RETURN(0);
+       }
+
+       obd = lock->l_conn_export->exp_obd;
+       imp = obd->u.cli.cl_import;
+       ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
+       LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", "
+                 CFS_DURATION_T"s ago), entering recovery for %s@%s",
+                 lock->l_last_activity,
+                 cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity),
+                 obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
+
+/* We use the same basis for both server-side and client-side functions
+   from a single node. */
+int ldlm_get_enq_timeout(struct ldlm_lock *lock)
+{
+       int timeout = at_get(ldlm_lock_to_ns_at(lock));
+       if (AT_OFF)
+               return obd_timeout / 2;
+       /* Since these are non-updating timeouts, we should be conservative.
+          It would be nice to have some kind of "early reply" mechanism for
+          lock callbacks too... */
+       timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
+       return max(timeout, ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_get_enq_timeout);
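+
+/*
+ * For illustration (values hypothetical): with an adaptive estimate of
+ * 20s and at_max = 600s, the timeout above becomes min(600, 20 + 10) = 30s
+ * and is floored by ldlm_enqueue_min; with adaptive timeouts off (AT_OFF)
+ * it is simply obd_timeout / 2.
+ */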
+
+/**
+ * Helper function for ldlm_completion_ast(), updating timings when lock is
+ * actually granted.
+ */
+static int ldlm_completion_tail(struct ldlm_lock *lock)
+{
+       long delay;
+       int  result;
+
+       if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED) {
+               LDLM_DEBUG(lock, "client-side enqueue: destroyed");
+               result = -EIO;
+       } else {
+               delay = cfs_time_sub(cfs_time_current_sec(),
+                                    lock->l_last_activity);
+               LDLM_DEBUG(lock, "client-side enqueue: granted after "
+                          CFS_DURATION_T"s", delay);
+
+               /* Update our time estimate */
+               at_measured(ldlm_lock_to_ns_at(lock),
+                           delay);
+               result = 0;
+       }
+       return result;
+}
+
+/**
+ * Implementation of ->l_completion_ast() for a client that doesn't wait
+ * until the lock is granted. Suitable for locks enqueued through ptlrpcd, or
+ * other threads that cannot block for long.
+ */
+int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       ENTRY;
+
+       if (flags == LDLM_FL_WAIT_NOREPROC) {
+               LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+               RETURN(0);
+       }
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               wake_up(&lock->l_waitq);
+               RETURN(ldlm_completion_tail(lock));
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "going forward");
+       ldlm_reprocess_all(lock->l_resource);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_completion_ast_async);
+
+/**
+ * Generic LDLM "completion" AST. This is called in several cases:
+ *
+ *     - when a reply to an ENQUEUE RPC is received from the server
+ *       (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
+ *       this point (determined by flags);
+ *
+ *     - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
+ *       been granted;
+ *
+ *     - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
+ *       gets correct lvb;
+ *
+ *     - to force all locks when resource is destroyed (cleanup_resource());
+ *
+ *     - during lock conversion (not used currently).
+ *
+ * If the lock is not granted in the first case, this function waits until
+ * the second or penultimate case happens in some other thread.
+ *
+ */
+int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
+{
+       /* XXX ALLOCATE - 160 bytes */
+       struct lock_wait_data lwd;
+       struct obd_device *obd;
+       struct obd_import *imp = NULL;
+       struct l_wait_info lwi;
+       __u32 timeout;
+       int rc = 0;
+       ENTRY;
+
+       if (flags == LDLM_FL_WAIT_NOREPROC) {
+               LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
+               goto noreproc;
+       }
+
+       if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
+                      LDLM_FL_BLOCK_CONV))) {
+               wake_up(&lock->l_waitq);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
+                  "sleeping");
+
+noreproc:
+
+       obd = class_exp2obd(lock->l_conn_export);
+
+       /* if this is a local lock, then there is no import */
+       if (obd != NULL) {
+               imp = obd->u.cli.cl_import;
+       }
+
+       /* Wait a long time for enqueue - server may have to callback a
+          lock from another client.  Server will evict the other client if it
+          doesn't respond reasonably, and then give us the lock. */
+       timeout = ldlm_get_enq_timeout(lock) * 2;
+
+       lwd.lwd_lock = lock;
+
+       if (lock->l_flags & LDLM_FL_NO_TIMEOUT) {
+               LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
+               lwi = LWI_INTR(interrupted_completion_wait, &lwd);
+       } else {
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+                                      ldlm_expired_completion_wait,
+                                      interrupted_completion_wait, &lwd);
+       }
+
+       if (imp != NULL) {
+               spin_lock(&imp->imp_lock);
+               lwd.lwd_conn_cnt = imp->imp_conn_cnt;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       if (ns_is_client(ldlm_lock_to_ns(lock)) &&
+           OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
+                                OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
+               lock->l_flags |= LDLM_FL_FAIL_LOC;
+               rc = -EINTR;
+       } else {
+               /* Go to sleep until the lock is granted or cancelled. */
+               rc = l_wait_event(lock->l_waitq,
+                                 is_granted_or_cancelled(lock), &lwi);
+       }
+
+       if (rc) {
+               LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
+                          rc);
+               RETURN(rc);
+       }
+
+       RETURN(ldlm_completion_tail(lock));
+}
+EXPORT_SYMBOL(ldlm_completion_ast);
+
+/**
+ * A helper to build a blocking AST function
+ *
+ * Perform a common operation for blocking ASTs:
+ * deferred lock cancellation.
+ *
+ * \param lock the lock on which the blocking or canceling AST was called
+ * \retval 0
+ * \see mdt_blocking_ast
+ * \see ldlm_blocking_ast
+ */
+int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
+{
+       int do_ast;
+       ENTRY;
+
+       lock->l_flags |= LDLM_FL_CBPENDING;
+       do_ast = (!lock->l_readers && !lock->l_writers);
+       unlock_res_and_lock(lock);
+
+       if (do_ast) {
+               struct lustre_handle lockh;
+               int rc;
+
+               LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               if (rc < 0)
+                       CERROR("ldlm_cli_cancel: %d\n", rc);
+       } else {
+               LDLM_DEBUG(lock, "Lock still has references, will be "
+                          "cancelled later");
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
+
+/**
+ * Server blocking AST
+ *
+ * ->l_blocking_ast() callback for LDLM locks acquired by server-side
+ * OBDs.
+ *
+ * \param lock the lock that blocks a request or is being cancelled
+ * \param desc unused
+ * \param data unused
+ * \param flag indicates whether this is a cancelling or a blocking callback
+ * \retval 0
+ * \see ldlm_blocking_ast_nocheck
+ */
+int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                     void *data, int flag)
+{
+       ENTRY;
+
+       if (flag == LDLM_CB_CANCELING) {
+               /* Don't need to do anything here. */
+               RETURN(0);
+       }
+
+       lock_res_and_lock(lock);
+       /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
+        * that ldlm_blocking_ast is called just before intent_policy method
+        * takes the lr_lock, then by the time we get the lock, we might not
+        * be the correct blocking function anymore.  So check, and return
+        * early, if so. */
+       if (lock->l_blocking_ast != ldlm_blocking_ast) {
+               unlock_res_and_lock(lock);
+               RETURN(0);
+       }
+       RETURN(ldlm_blocking_ast_nocheck(lock));
+}
+EXPORT_SYMBOL(ldlm_blocking_ast);
+
+/**
+ * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See
+ * comment in filter_intent_policy() on why you may need this.
+ */
+int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
+{
+       /*
+        * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for
+        * that is rather subtle: with OST-side locking, it may so happen that
+        * _all_ extent locks are held by the OST. If the client wants to
+        * obtain the current file size it calls ll{,u}_glimpse_size(), and
+        * (as the locks are on the server) the dummy glimpse callback fires
+        * and does nothing. The client still receives the correct file size
+        * due to the following fragment in filter_intent_policy():
+        *
+        * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB
+        * if (rc != 0 && res->lr_namespace->ns_lvbo &&
+        *     res->lr_namespace->ns_lvbo->lvbo_update) {
+        *       res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1);
+        * }
+        *
+        * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and
+        * returns correct file size to the client.
+        */
+       return -ELDLM_NO_LOCK_DATA;
+}
+EXPORT_SYMBOL(ldlm_glimpse_ast);
+
+/**
+ * Enqueue a local lock (typically on a server).
+ */
+int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_type_t type, ldlm_policy_data_t *policy,
+                          ldlm_mode_t mode, __u64 *flags,
+                          ldlm_blocking_callback blocking,
+                          ldlm_completion_callback completion,
+                          ldlm_glimpse_callback glimpse,
+                          void *data, __u32 lvb_len, enum lvb_type lvb_type,
+                          const __u64 *client_cookie,
+                          struct lustre_handle *lockh)
+{
+       struct ldlm_lock *lock;
+       int err;
+       const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
+                                                .lcs_blocking   = blocking,
+                                                .lcs_glimpse    = glimpse,
+       };
+       ENTRY;
+
+       LASSERT(!(*flags & LDLM_FL_REPLAY));
+       if (unlikely(ns_is_client(ns))) {
+               CERROR("Trying to enqueue local lock in a shadow namespace\n");
+               LBUG();
+       }
+
+       lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
+                               lvb_type);
+       if (unlikely(!lock))
+               GOTO(out_nolock, err = -ENOMEM);
+
+       ldlm_lock2handle(lock, lockh);
+
+       /* NB: we don't need to take any lock here (lock_res_and_lock)
+        * because this is a new lock that nobody else can see yet */
+       ldlm_lock_addref_internal_nolock(lock, mode);
+       lock->l_flags |= LDLM_FL_LOCAL;
+       if (*flags & LDLM_FL_ATOMIC_CB)
+               lock->l_flags |= LDLM_FL_ATOMIC_CB;
+
+       if (policy != NULL)
+               lock->l_policy_data = *policy;
+       if (client_cookie != NULL)
+               lock->l_client_cookie = *client_cookie;
+       if (type == LDLM_EXTENT)
+               lock->l_req_extent = policy->l_extent;
+
+       err = ldlm_lock_enqueue(ns, &lock, policy, flags);
+       if (unlikely(err != ELDLM_OK))
+               GOTO(out, err);
+
+       if (policy != NULL)
+               *policy = lock->l_policy_data;
+
+       if (lock->l_completion_ast)
+               lock->l_completion_ast(lock, *flags, NULL);
+
+       LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
+       EXIT;
+ out:
+       LDLM_LOCK_RELEASE(lock);
+ out_nolock:
+       return err;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
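+
+/*
+ * A minimal usage sketch for ldlm_cli_enqueue_local() (hedged; @ns and
+ * @res_id are assumed to come from the caller's context, and the generic
+ * ASTs defined in this file are used):
+ *
+ *     struct lustre_handle lockh;
+ *     __u64 flags = 0;
+ *     int rc;
+ *
+ *     rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_PLAIN, NULL, LCK_EX,
+ *                                 &flags, ldlm_blocking_ast,
+ *                                 ldlm_completion_ast, NULL, NULL, 0,
+ *                                 LVB_T_NONE, NULL, &lockh);
+ *     if (rc == ELDLM_OK)
+ *             ... use the lock; drop it via ldlm_lock_decref() later ...
+ */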
+
+static void failed_lock_cleanup(struct ldlm_namespace *ns,
+                               struct ldlm_lock *lock, int mode)
+{
+       int need_cancel = 0;
+
+       /* Set a flag to prevent us from sending a CANCEL (bug 407) */
+       lock_res_and_lock(lock);
+       /* Check that lock is not granted or failed, we might race. */
+       if ((lock->l_req_mode != lock->l_granted_mode) &&
+           !(lock->l_flags & LDLM_FL_FAILED)) {
+               /* Make sure that this lock will not be found by a raced
+                * bl_ast, and that an -EINVAL reply is sent to the server
+                * anyway. bug 17645 */
+               lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
+                                LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
+               need_cancel = 1;
+       }
+       unlock_res_and_lock(lock);
+
+       if (need_cancel)
+               LDLM_DEBUG(lock,
+                          "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
+                          "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+       else
+               LDLM_DEBUG(lock, "lock was granted or failed in race");
+
+       ldlm_lock_decref_internal(lock, mode);
+
+       /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
+        *       from llite/file.c/ll_file_flock(). */
+       /* This code makes up for the fact that we do not have a blocking
+        * handler on a client for flock locks. As such, this is the place
+        * where we must completely kill failed locks (those that were
+        * interrupted and those that were waiting to be granted when the
+        * server evicted us). */
+       if (lock->l_resource->lr_type == LDLM_FLOCK) {
+               lock_res_and_lock(lock);
+               ldlm_resource_unlink_lock(lock);
+               ldlm_lock_destroy_nolock(lock);
+               unlock_res_and_lock(lock);
+       }
+}
+
+/**
+ * Finishing portion of client lock enqueue code.
+ *
+ * Called after receiving reply from server.
+ */
+int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
+                         ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
+                         __u64 *flags, void *lvb, __u32 lvb_len,
+                         struct lustre_handle *lockh, int rc)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       int is_replay = *flags & LDLM_FL_REPLAY;
+       struct ldlm_lock *lock;
+       struct ldlm_reply *reply;
+       int cleanup_phase = 1;
+       int size = 0;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       /* ldlm_cli_enqueue is holding a reference on this lock. */
+       if (!lock) {
+               LASSERT(type == LDLM_FLOCK);
+               RETURN(-ENOLCK);
+       }
+
+       LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
+                "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);
+
+       if (rc != ELDLM_OK) {
+               LASSERT(!is_replay);
+               LDLM_DEBUG(lock, "client-side enqueue END (%s)",
+                          rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
+
+               if (rc != ELDLM_LOCK_ABORTED)
+                       GOTO(cleanup, rc);
+       }
+
+       /* Before we return, swab the reply */
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(cleanup, rc = -EPROTO);
+
+       if (lvb_len != 0) {
+               LASSERT(lvb != NULL);
+
+               size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
+                                           RCL_SERVER);
+               if (size < 0) {
+                       LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
+                       GOTO(cleanup, rc = size);
+               } else if (unlikely(size > lvb_len)) {
+                       LDLM_ERROR(lock, "Replied LVB is larger than "
+                                  "expectation, expected = %d, replied = %d",
+                                  lvb_len, size);
+                       GOTO(cleanup, rc = -EINVAL);
+               }
+       }
+
+       if (rc == ELDLM_LOCK_ABORTED) {
+               if (lvb_len != 0)
+                       rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+                                          lvb, size);
+               GOTO(cleanup, rc = (rc != 0 ? rc : ELDLM_LOCK_ABORTED));
+       }
+
+       /* lock enqueued on the server */
+       cleanup_phase = 0;
+
+       lock_res_and_lock(lock);
+       /* Key change: rehash the lock in the per-export hash with the new key */
+       if (exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_rehash_key(exp->exp_lock_hash,
+                                   &lock->l_remote_handle,
+                                   &reply->lock_handle,
+                                   &lock->l_exp_hash);
+       } else {
+               lock->l_remote_handle = reply->lock_handle;
+       }
+
+       *flags = ldlm_flags_from_wire(reply->lock_flags);
+       lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+                                             LDLM_INHERIT_FLAGS);
+       /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match()
+        * to wait with no timeout as well */
+       lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
+                                             LDLM_FL_NO_TIMEOUT);
+       unlock_res_and_lock(lock);
+
+       CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%llx\n",
+              lock, reply->lock_handle.cookie, *flags);
+
+       /* If enqueue returned a blocked lock but the completion handler has
+        * already run, then it fixed up the resource and we don't need to do it
+        * again. */
+       if ((*flags) & LDLM_FL_LOCK_CHANGED) {
+               int newmode = reply->lock_desc.l_req_mode;
+               LASSERT(!is_replay);
+               if (newmode && newmode != lock->l_req_mode) {
+                       LDLM_DEBUG(lock, "server returned different mode %s",
+                                  ldlm_lockname[newmode]);
+                       lock->l_req_mode = newmode;
+               }
+
+               if (memcmp(reply->lock_desc.l_resource.lr_name.name,
+                         lock->l_resource->lr_name.name,
+                         sizeof(struct ldlm_res_id))) {
+                       CDEBUG(D_INFO, "remote intent success, locking "
+                                       "(%ld,%ld,%ld) instead of "
+                                       "(%ld,%ld,%ld)\n",
+                             (long)reply->lock_desc.l_resource.lr_name.name[0],
+                             (long)reply->lock_desc.l_resource.lr_name.name[1],
+                             (long)reply->lock_desc.l_resource.lr_name.name[2],
+                             (long)lock->l_resource->lr_name.name[0],
+                             (long)lock->l_resource->lr_name.name[1],
+                             (long)lock->l_resource->lr_name.name[2]);
+
+                       rc = ldlm_lock_change_resource(ns, lock,
+                                       &reply->lock_desc.l_resource.lr_name);
+                       if (rc || lock->l_resource == NULL)
+                               GOTO(cleanup, rc = -ENOMEM);
+                       LDLM_DEBUG(lock, "client-side enqueue, new resource");
+               }
+               if (with_policy)
+                       if (!(type == LDLM_IBITS &&
+                             !(exp_connect_flags(exp) & OBD_CONNECT_IBITS)))
+                               /* We assume lock type cannot change on server */
+                               ldlm_convert_policy_to_local(exp,
+                                               lock->l_resource->lr_type,
+                                               &reply->lock_desc.l_policy_data,
+                                               &lock->l_policy_data);
+               if (type != LDLM_PLAIN)
+                       LDLM_DEBUG(lock, "client-side enqueue, new policy data");
+       }
+
+       if ((*flags) & LDLM_FL_AST_SENT ||
+           /* Cancel extent locks as soon as possible on a liblustre client,
+            * because it cannot handle asynchronous ASTs robustly (see
+            * bug 7311). */
+           (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) {
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
+               unlock_res_and_lock(lock);
+               LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
+       }
+
+       /* If the lock has already been granted by a completion AST, don't
+        * clobber the LVB with an older one. */
+       if (lvb_len != 0) {
+               /* We must lock or a racing completion might update lvb without
+                * letting us know and we'll clobber the correct value.
+                * Cannot unlock after the check either, as that still leaves
+                * a tiny window for completion to get in */
+               lock_res_and_lock(lock);
+               if (lock->l_req_mode != lock->l_granted_mode)
+                       rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
+                                          lock->l_lvb_data, size);
+               unlock_res_and_lock(lock);
+               if (rc < 0) {
+                       cleanup_phase = 1;
+                       GOTO(cleanup, rc);
+               }
+       }
+
+       if (!is_replay) {
+               rc = ldlm_lock_enqueue(ns, &lock, NULL, flags);
+               if (lock->l_completion_ast != NULL) {
+                       int err = lock->l_completion_ast(lock, *flags, NULL);
+                       if (!rc)
+                               rc = err;
+                       if (rc)
+                               cleanup_phase = 1;
+               }
+       }
+
+       if (lvb_len && lvb != NULL) {
+               /* Copy the LVB here, and not earlier, because the completion
+                * AST (if any) can override what we got in the reply */
+               memcpy(lvb, lock->l_lvb_data, lvb_len);
+       }
+
+       LDLM_DEBUG(lock, "client-side enqueue END");
+       EXIT;
+cleanup:
+       if (cleanup_phase == 1 && rc)
+               failed_lock_cleanup(ns, lock, mode);
+       /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
+       LDLM_LOCK_PUT(lock);
+       LDLM_LOCK_RELEASE(lock);
+       return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
+
+/**
+ * Estimate the number of lock handles that would fit into a request of the
+ * given size.  PAGE_CACHE_SIZE - 512 is to allow TCP/IP and LNET headers to
+ * fit into a single page on the send/receive side. XXX: 512 should be
+ * changed to a more adequate value.
+ */
+static inline int ldlm_req_handles_avail(int req_size, int off)
+{
+       int avail;
+
+       avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size;
+       if (likely(avail >= 0))
+               avail /= (int)sizeof(struct lustre_handle);
+       else
+               avail = 0;
+       avail += LDLM_LOCKREQ_HANDLES - off;
+
+       return avail;
+}
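+
+/*
+ * For illustration (numbers hypothetical): if the capped buffer size is
+ * 4096 - 512 = 3584 bytes and req_size is 1024, the remaining 2560 bytes
+ * hold 2560 / sizeof(struct lustre_handle) extra handles, on top of the
+ * LDLM_LOCKREQ_HANDLES - off slots already present in the base request.
+ */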
+
+static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
+                                            enum req_location loc,
+                                            int off)
+{
+       int size = req_capsule_msg_size(pill, loc);
+       return ldlm_req_handles_avail(size, off);
+}
+
+static inline int ldlm_format_handles_avail(struct obd_import *imp,
+                                           const struct req_format *fmt,
+                                           enum req_location loc, int off)
+{
+       int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc);
+       return ldlm_req_handles_avail(size, off);
+}
+
+/**
+ * Cancel LRU locks and pack them into the enqueue request. Also pack the
+ * given \a count locks from \a cancels.
+ *
+ * This is to be called by functions preparing their own requests that
+ * might contain lists of locks to cancel in addition to the actual
+ * operation that needs to be performed.
+ */
+int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
+                     int version, int opc, int canceloff,
+                     struct list_head *cancels, int count)
+{
+       struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
+       struct req_capsule      *pill = &req->rq_pill;
+       struct ldlm_request     *dlm = NULL;
+       int flags, avail, to_free, pack = 0;
+       LIST_HEAD(head);
+       int rc;
+       ENTRY;
+
+       if (cancels == NULL)
+               cancels = &head;
+       if (ns_connect_cancelset(ns)) {
+               /* Estimate the amount of available space in the request. */
+               req_capsule_filled_sizes(pill, RCL_CLIENT);
+               avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
+
+               flags = ns_connect_lru_resize(ns) ?
+                       LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+               to_free = !ns_connect_lru_resize(ns) &&
+                         opc == LDLM_ENQUEUE ? 1 : 0;
+
+               /* Cancel LRU locks here _only_ if the server supports
+                * EARLY_CANCEL. Otherwise we have to send an extra CANCEL
+                * RPC, which will make us slower. */
+               if (avail > count)
+                       count += ldlm_cancel_lru_local(ns, cancels, to_free,
+                                                      avail - count, 0, flags);
+               if (avail > count)
+                       pack = count;
+               else
+                       pack = avail;
+               req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
+                                    ldlm_request_bufsize(pack, opc));
+       }
+
+       rc = ptlrpc_request_pack(req, version, opc);
+       if (rc) {
+               ldlm_lock_list_put(cancels, l_bl_ast, count);
+               RETURN(rc);
+       }
+
+       if (ns_connect_cancelset(ns)) {
+               if (canceloff) {
+                       dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
+                       LASSERT(dlm);
+                       /* Skip the first lock handle in ldlm_request_pack();
+                        * this method will increment @lock_count according
+                        * to the number of lock handles actually written to
+                        * the buffer. */
+                       dlm->lock_count = canceloff;
+               }
+               /* Pack @pack lock handles into the request. */
+               ldlm_cli_cancel_list(cancels, pack, req, 0);
+               /* Prepare and send separate cancel RPC for others. */
+               ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
+       } else {
+               ldlm_lock_list_put(cancels, l_bl_ast, count);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_prep_elc_req);
+
+int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
+                         struct list_head *cancels, int count)
+{
+       return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+                                LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
+}
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
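+
+/*
+ * A hedged sketch of a typical ELC caller: gather cancel candidates on a
+ * local list, then let the prep helper pack as many as fit into the
+ * enqueue request (@exp and @req are assumed from the caller's context;
+ * how @cancels is filled depends on the operation):
+ *
+ *     LIST_HEAD(cancels);
+ *     int count = 0;
+ *
+ *     ... add locks to &cancels via their l_bl_ast linkage, bump count ...
+ *     rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+ *
+ * On failure the helper has already put the locks on @cancels, so the
+ * caller only needs to drop the request.
+ */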
+
+struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+       struct ptlrpc_request *req;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+EXPORT_SYMBOL(ldlm_enqueue_pack);
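+
+/*
+ * Usage sketch (hedged): callers that want a ready-to-send enqueue RPC,
+ * e.g. for a glimpse that expects an LVB in the reply, can do:
+ *
+ *     req = ldlm_enqueue_pack(exp, sizeof(struct ost_lvb));
+ *     if (IS_ERR(req))
+ *             RETURN(PTR_ERR(req));
+ */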
+
+/**
+ * Client-side lock enqueue.
+ *
+ * If a request has some specific initialisation it is passed in \a reqp,
+ * otherwise it is created in ldlm_cli_enqueue.
+ *
+ * Supports sync and async requests, pass \a async flag accordingly. If a
+ * request was created in ldlm_cli_enqueue and it is the async request,
+ * pass it to the caller in \a reqp.
+ */
+int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
+                    struct ldlm_enqueue_info *einfo,
+                    const struct ldlm_res_id *res_id,
+                    ldlm_policy_data_t const *policy, __u64 *flags,
+                    void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
+                    struct lustre_handle *lockh, int async)
+{
+       struct ldlm_namespace *ns;
+       struct ldlm_lock      *lock;
+       struct ldlm_request   *body;
+       int                 is_replay = *flags & LDLM_FL_REPLAY;
+       int                 req_passed_in = 1;
+       int                 rc, err;
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+
+       ns = exp->exp_obd->obd_namespace;
+
+       /* If we're replaying this lock, just check some invariants.
+        * If we're creating a new lock, set everything up nicely. */
+       if (is_replay) {
+               lock = ldlm_handle2lock_long(lockh, 0);
+               LASSERT(lock != NULL);
+               LDLM_DEBUG(lock, "client-side enqueue START");
+               LASSERT(exp == lock->l_conn_export);
+       } else {
+               const struct ldlm_callback_suite cbs = {
+                       .lcs_completion = einfo->ei_cb_cp,
+                       .lcs_blocking   = einfo->ei_cb_bl,
+                       .lcs_glimpse    = einfo->ei_cb_gl,
+                       .lcs_weigh      = einfo->ei_cb_wg
+               };
+               lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
+                                       einfo->ei_mode, &cbs, einfo->ei_cbdata,
+                                       lvb_len, lvb_type);
+               if (lock == NULL)
+                       RETURN(-ENOMEM);
+               /* for the local lock, add the reference */
+               ldlm_lock_addref_internal(lock, einfo->ei_mode);
+               ldlm_lock2handle(lock, lockh);
+               if (policy != NULL) {
+                       /* INODEBITS_INTEROP: If the server does not support
+                        * inodebits, we will request a plain lock in the
+                        * descriptor (ldlm_lock2desc() below) but use an
+                        * inodebits lock internally with both bits set.
+                        */
+                       if (einfo->ei_type == LDLM_IBITS &&
+                           !(exp_connect_flags(exp) &
+                             OBD_CONNECT_IBITS))
+                               lock->l_policy_data.l_inodebits.bits =
+                                       MDS_INODELOCK_LOOKUP |
+                                       MDS_INODELOCK_UPDATE;
+                       else
+                               lock->l_policy_data = *policy;
+               }
+
+               if (einfo->ei_type == LDLM_EXTENT)
+                       lock->l_req_extent = policy->l_extent;
+               LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n",
+                          *flags);
+       }
+
+       lock->l_conn_export = exp;
+       lock->l_export = NULL;
+       lock->l_blocking_ast = einfo->ei_cb_bl;
+       lock->l_flags |= (*flags & LDLM_FL_NO_LRU);
+
+       /* lock not sent to server yet */
+
+       if (reqp == NULL || *reqp == NULL) {
+               req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                               &RQF_LDLM_ENQUEUE,
+                                               LUSTRE_DLM_VERSION,
+                                               LDLM_ENQUEUE);
+               if (req == NULL) {
+                       failed_lock_cleanup(ns, lock, einfo->ei_mode);
+                       LDLM_LOCK_RELEASE(lock);
+                       RETURN(-ENOMEM);
+               }
+               req_passed_in = 0;
+               if (reqp)
+                       *reqp = req;
+       } else {
+               int len;
+
+               req = *reqp;
+               len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
+                                          RCL_CLIENT);
+               LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
+                        DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
+       }
+
+       /* Dump lock data into the request buffer */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       ldlm_lock2desc(lock, &body->lock_desc);
+       body->lock_flags = ldlm_flags_to_wire(*flags);
+       body->lock_handle[0] = *lockh;
+
+       /* Continue as normal. */
+       if (!req_passed_in) {
+               if (lvb_len > 0)
+                       req_capsule_extend(&req->rq_pill,
+                                          &RQF_LDLM_ENQUEUE_LVB);
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                                    lvb_len);
+               ptlrpc_request_set_replen(req);
+       }
+
+       /*
+        * Liblustre client doesn't get extent locks, except for O_APPEND case
+        * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where
+        * [i_size, OBD_OBJECT_EOF] lock is taken.
+        */
+       LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT ||
+                    policy->l_extent.end == OBD_OBJECT_EOF));
+
+       if (async) {
+               LASSERT(reqp != NULL);
+               RETURN(0);
+       }
+
+       LDLM_DEBUG(lock, "sending request");
+
+       rc = ptlrpc_queue_wait(req);
+
+       err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
+                                   einfo->ei_mode, flags, lvb, lvb_len,
+                                   lockh, rc);
+
+       /* If ldlm_cli_enqueue_fini did not find the lock, we need to free
+        * one reference that we took */
+       if (err == -ENOLCK)
+               LDLM_LOCK_RELEASE(lock);
+       else
+               rc = err;
+
+       if (!req_passed_in && req != NULL) {
+               ptlrpc_req_finished(req);
+               if (reqp)
+                       *reqp = NULL;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_cli_enqueue);
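+
+/*
+ * A minimal synchronous usage sketch for ldlm_cli_enqueue() (hedged;
+ * @exp and @res_id are assumed from the caller's context, and the
+ * generic ASTs defined above are reused):
+ *
+ *     struct ldlm_enqueue_info einfo = {
+ *             .ei_type  = LDLM_PLAIN,
+ *             .ei_mode  = LCK_PR,
+ *             .ei_cb_bl = ldlm_blocking_ast,
+ *             .ei_cb_cp = ldlm_completion_ast,
+ *     };
+ *     struct lustre_handle lockh;
+ *     __u64 flags = 0;
+ *     int rc;
+ *
+ *     rc = ldlm_cli_enqueue(exp, NULL, &einfo, res_id, NULL, &flags,
+ *                           NULL, 0, LVB_T_NONE, &lockh, 0);
+ */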
+
+static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
+                                 __u32 *flags)
+{
+       struct ldlm_resource *res;
+       int rc;
+       ENTRY;
+       if (ns_is_client(ldlm_lock_to_ns(lock))) {
+               CERROR("Trying to cancel local lock\n");
+               LBUG();
+       }
+       LDLM_DEBUG(lock, "client-side local convert");
+
+       res = ldlm_lock_convert(lock, new_mode, flags);
+       if (res) {
+               ldlm_reprocess_all(res);
+               rc = 0;
+       } else {
+               rc = EDEADLOCK;
+       }
+       LDLM_DEBUG(lock, "client-side local convert handler END");
+       LDLM_LOCK_PUT(lock);
+       RETURN(rc);
+}
+
+/* FIXME: one of ldlm_cli_convert or the server side should reject attempted
+ * conversion of locks which are on the waiting or converting queue */
+/* Caller of this code is supposed to take care of lock readers/writers
+   accounting */
+int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
+{
+       struct ldlm_request   *body;
+       struct ldlm_reply     *reply;
+       struct ldlm_lock      *lock;
+       struct ldlm_resource  *res;
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       lock = ldlm_handle2lock(lockh);
+       if (!lock) {
+               LBUG();
+               RETURN(-EINVAL);
+       }
+       *flags = 0;
+
+       if (lock->l_conn_export == NULL)
+               RETURN(ldlm_cli_convert_local(lock, new_mode, flags));
+
+       LDLM_DEBUG(lock, "client-side convert");
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export),
+                                       &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
+                                       LDLM_CONVERT);
+       if (req == NULL) {
+               LDLM_LOCK_PUT(lock);
+               RETURN(-ENOMEM);
+       }
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       body->lock_handle[0] = lock->l_remote_handle;
+
+       body->lock_desc.l_req_mode = new_mode;
+       body->lock_flags = ldlm_flags_to_wire(*flags);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc != ELDLM_OK)
+               GOTO(out, rc);
+
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       if (req->rq_status)
+               GOTO(out, rc = req->rq_status);
+
+       res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
+       if (res != NULL) {
+               ldlm_reprocess_all(res);
+               /* Go to sleep until the lock is granted. */
+               /* FIXME: or cancelled. */
+               if (lock->l_completion_ast) {
+                       rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+                                                   NULL);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+       } else {
+               rc = EDEADLOCK;
+       }
+       EXIT;
+ out:
+       LDLM_LOCK_PUT(lock);
+       ptlrpc_req_finished(req);
+       return rc;
+}
+EXPORT_SYMBOL(ldlm_cli_convert);
+
+/**
+ * Cancel locks locally.
+ * Returns:
+ * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server;
+ * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC;
+ * \retval LDLM_FL_CANCELING otherwise.
+ */
+static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
+{
+       __u64 rc = LDLM_FL_LOCAL_ONLY;
+       ENTRY;
+
+       if (lock->l_conn_export) {
+               bool local_only;
+
+               LDLM_DEBUG(lock, "client-side cancel");
+               /* Set this flag to prevent others from getting new references */
+               lock_res_and_lock(lock);
+               lock->l_flags |= LDLM_FL_CBPENDING;
+               local_only = !!(lock->l_flags &
+                               (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
+               ldlm_cancel_callback(lock);
+               rc = (lock->l_flags & LDLM_FL_BL_AST) ?
+                       LDLM_FL_BL_AST : LDLM_FL_CANCELING;
+               unlock_res_and_lock(lock);
+
+               if (local_only) {
+                       CDEBUG(D_DLMTRACE, "not sending request (at caller's "
+                              "instruction)\n");
+                       rc = LDLM_FL_LOCAL_ONLY;
+               }
+               ldlm_lock_cancel(lock);
+       } else {
+               if (ns_is_client(ldlm_lock_to_ns(lock))) {
+                       LDLM_ERROR(lock, "Trying to cancel local lock");
+                       LBUG();
+               }
+               LDLM_DEBUG(lock, "server-side local cancel");
+               ldlm_lock_cancel(lock);
+               ldlm_reprocess_all(lock->l_resource);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Pack \a count locks in \a head into ldlm_request buffer of request \a req.
+ */
+static void ldlm_cancel_pack(struct ptlrpc_request *req,
+                            struct list_head *head, int count)
+{
+       struct ldlm_request *dlm;
+       struct ldlm_lock *lock;
+       int max, packed = 0;
+       ENTRY;
+
+       dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       LASSERT(dlm != NULL);
+
+       /* Check the room in the request buffer. */
+       max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
+               sizeof(struct ldlm_request);
+       max /= sizeof(struct lustre_handle);
+       max += LDLM_LOCKREQ_HANDLES;
+       LASSERT(max >= dlm->lock_count + count);
+
+       /* XXX: it would be better to pack lock handles grouped by resource,
+        * so that the server cancel would call filter_lvbo_update() less
+        * frequently. */
+       list_for_each_entry(lock, head, l_bl_ast) {
+               if (!count--)
+                       break;
+               LASSERT(lock->l_conn_export);
+               /* Pack the lock handle to the given request buffer. */
+               LDLM_DEBUG(lock, "packing");
+               dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
+               packed++;
+       }
+       CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
+       EXIT;
+}
+
+/**
+ * Prepare and send a batched cancel RPC. It will include \a count lock
+ * handles of the locks given in the \a cancels list. */
+int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
+                       int count, ldlm_cancel_flags_t flags)
+{
+       struct ptlrpc_request *req = NULL;
+       struct obd_import *imp;
+       int free, sent = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(exp != NULL);
+       LASSERT(count > 0);
+
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
+
+       if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+               RETURN(count);
+
+       free = ldlm_format_handles_avail(class_exp2cliimp(exp),
+                                        &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
+       if (count > free)
+               count = free;
+
+       while (1) {
+               imp = class_exp2cliimp(exp);
+               if (imp == NULL || imp->imp_invalid) {
+                       CDEBUG(D_DLMTRACE,
+                              "skipping cancel on invalid import %p\n", imp);
+                       RETURN(count);
+               }
+
+               req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
+               if (req == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
+                                    ldlm_request_bufsize(count, LDLM_CANCEL));
+
+               rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       GOTO(out, rc);
+               }
+
+               req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
+               req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+               ptlrpc_at_set_req_timeout(req);
+
+               ldlm_cancel_pack(req, cancels, count);
+
+               ptlrpc_request_set_replen(req);
+               if (flags & LCF_ASYNC) {
+                       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+                       sent = count;
+                       GOTO(out, 0);
+               } else {
+                       rc = ptlrpc_queue_wait(req);
+               }
+               if (rc == ESTALE) {
+                       CDEBUG(D_DLMTRACE, "client/server (nid %s) "
+                              "out of sync -- not fatal\n",
+                              libcfs_nid2str(req->rq_import->
+                                             imp_connection->c_peer.nid));
+                       rc = 0;
+               } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
+                          req->rq_import_generation == imp->imp_generation) {
+                       ptlrpc_req_finished(req);
+                       continue;
+               } else if (rc != ELDLM_OK) {
+                       /* -ESHUTDOWN is common on umount */
+                       CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+                                    "Got rc %d from cancel RPC: "
+                                    "canceling anyway\n", rc);
+                       break;
+               }
+               sent = count;
+               break;
+       }
+
+       ptlrpc_req_finished(req);
+       EXIT;
+out:
+       return sent ? sent : rc;
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
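+
+/*
+ * Illustrative usage sketch (not part of this patch): callers gather locks
+ * on a private list linked through l_bl_ast and then flush them in one RPC:
+ *
+ *     LIST_HEAD(cancels);
+ *     ... collect @count locks onto @cancels ...
+ *     sent = ldlm_cli_cancel_req(exp, &cancels, count, LCF_ASYNC);
+ *
+ * With LCF_ASYNC the RPC is queued on ptlrpcd and the call returns at once;
+ * without it the call blocks in ptlrpc_queue_wait() until the reply arrives.
+ */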
+
+static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
+{
+       LASSERT(imp != NULL);
+       return &imp->imp_obd->obd_namespace->ns_pool;
+}
+
+/**
+ * Update client's OBD pool related fields with new SLV and Limit from \a req.
+ */
+int ldlm_cli_update_pool(struct ptlrpc_request *req)
+{
+       struct obd_device *obd;
+       __u64 new_slv;
+       __u32 new_limit;
+       ENTRY;
+       if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
+                    !imp_connect_lru_resize(req->rq_import))) {
+               /* Do nothing for corner cases. */
+               RETURN(0);
+       }
+
+       /* In some cases RPC may contain SLV and limit zeroed out. This
+        * is the case when server does not support LRU resize feature.
+        * This is also possible in some recovery cases when server-side
+        * reqs have no reference to the OBD export and thus access to
+        * server-side namespace is not possible. */
+       if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
+           lustre_msg_get_limit(req->rq_repmsg) == 0) {
+               DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
+                         "(SLV: "LPU64", Limit: %u)",
+                         lustre_msg_get_slv(req->rq_repmsg),
+                         lustre_msg_get_limit(req->rq_repmsg));
+               RETURN(0);
+       }
+
+       new_limit = lustre_msg_get_limit(req->rq_repmsg);
+       new_slv = lustre_msg_get_slv(req->rq_repmsg);
+       obd = req->rq_import->imp_obd;
+
+       /* Set new SLV and limit in OBD fields to make them accessible
+        * to the pool thread. We do not access obd_namespace and pool
+        * directly here as there is no reliable way to make sure that
+        * they are still alive at cleanup time. Evil races are possible
+        * which may cause Oops at that time. */
+       write_lock(&obd->obd_pool_lock);
+       obd->obd_pool_slv = new_slv;
+       obd->obd_pool_limit = new_limit;
+       write_unlock(&obd->obd_pool_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_update_pool);
+
+/**
+ * Client side lock cancel.
+ *
+ * Lock must not have any readers or writers by this time.
+ */
+int ldlm_cli_cancel(struct lustre_handle *lockh,
+                   ldlm_cancel_flags_t cancel_flags)
+{
+       struct obd_export *exp;
+       int avail, flags, count = 1;
+       __u64 rc = 0;
+       struct ldlm_namespace *ns;
+       struct ldlm_lock *lock;
+       LIST_HEAD(cancels);
+       ENTRY;
+
+       /* concurrent cancels on the same handle can happen */
+       lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING);
+       if (lock == NULL) {
+               LDLM_DEBUG_NOLOCK("lock is already being destroyed\n");
+               RETURN(0);
+       }
+
+       rc = ldlm_cli_cancel_local(lock);
+       if (rc == LDLM_FL_LOCAL_ONLY) {
+               LDLM_LOCK_RELEASE(lock);
+               RETURN(0);
+       }
+       /* Even if the lock is marked as LDLM_FL_BL_AST, this is an LDLM_CANCEL
+        * RPC, which goes to the canceld portal, so we can cancel other LRU
+        * locks here and send them all in one LDLM_CANCEL RPC. */
+       LASSERT(list_empty(&lock->l_bl_ast));
+       list_add(&lock->l_bl_ast, &cancels);
+
+       exp = lock->l_conn_export;
+       if (exp_connect_cancelset(exp)) {
+               avail = ldlm_format_handles_avail(class_exp2cliimp(exp),
+                                                 &RQF_LDLM_CANCEL,
+                                                 RCL_CLIENT, 0);
+               LASSERT(avail > 0);
+
+               ns = ldlm_lock_to_ns(lock);
+               flags = ns_connect_lru_resize(ns) ?
+                       LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
+               count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
+                                              LCF_BL_AST, flags);
+       }
+       ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel);
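+
+/*
+ * Illustrative usage sketch (assumed caller context, not from this patch):
+ * given a lustre_handle @lockh whose lock has no readers or writers left,
+ *
+ *     rc = ldlm_cli_cancel(&lockh, 0);
+ *
+ * cancels it synchronously; passing LCF_ASYNC instead lets the CANCEL RPC
+ * complete in the background. As the code above shows, servers supporting
+ * early lock cancel may get extra LRU locks batched into the same RPC.
+ */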
+
+/**
+ * Locally cancel up to \a count locks in list \a cancels.
+ * Return the number of locks that remain on \a cancels and still need
+ * to be sent to the server in a CANCEL RPC.
+ */
+int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
+                              ldlm_cancel_flags_t flags)
+{
+       LIST_HEAD(head);
+       struct ldlm_lock *lock, *next;
+       int left = 0, bl_ast = 0;
+       __u64 rc;
+
+       left = count;
+       list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+               if (left-- == 0)
+                       break;
+
+               if (flags & LCF_LOCAL) {
+                       rc = LDLM_FL_LOCAL_ONLY;
+                       ldlm_lock_cancel(lock);
+               } else {
+                       rc = ldlm_cli_cancel_local(lock);
+               }
+               /* Until we have compound requests and can send LDLM_CANCEL
+                * requests batched with generic RPCs, we need to send cancels
+                * with the LDLM_FL_BL_AST flag in a separate RPC from
+                * the one being generated now. */
+               if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+                       LDLM_DEBUG(lock, "Cancel lock separately");
+                       list_del_init(&lock->l_bl_ast);
+                       list_add(&lock->l_bl_ast, &head);
+                       bl_ast++;
+                       continue;
+               }
+               if (rc == LDLM_FL_LOCAL_ONLY) {
+                       /* CANCEL RPC should not be sent to server. */
+                       list_del_init(&lock->l_bl_ast);
+                       LDLM_LOCK_RELEASE(lock);
+                       count--;
+               }
+       }
+       if (bl_ast > 0) {
+               count -= bl_ast;
+               ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
+       }
+
+       RETURN(count);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
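+
+/*
+ * Illustrative note (not part of this patch): with LCF_LOCAL every lock on
+ * the list is canceled locally and no RPC is prepared, e.g.
+ *
+ *     left = ldlm_cli_cancel_list_local(&cancels, count, LCF_LOCAL);
+ *
+ * where @left is the number of locks remaining on @cancels for a subsequent
+ * server-side CANCEL RPC (zero in the pure-local case).
+ */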
+
+/**
+ * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                   struct ldlm_lock *lock,
+                                                   int unused, int added,
+                                                   int count)
+{
+       ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+       ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+       lock_res_and_lock(lock);
+
+       /* Don't check added & count since we want to process all locks
+        * from the unused list. */
+       switch (lock->l_resource->lr_type) {
+       case LDLM_EXTENT:
+       case LDLM_IBITS:
+               if (cb && cb(lock))
+                       break;
+               /* fall through */
+       default:
+               result = LDLM_POLICY_SKIP_LOCK;
+               lock->l_flags |= LDLM_FL_SKIPPED;
+               break;
+       }
+
+       unlock_res_and_lock(lock);
+       RETURN(result);
+}
+
+/**
+ * Callback function for LRU-resize policy. Decides whether to keep
+ * \a lock in LRU for the current LRU size \a unused, the number of locks
+ * added so far in this scan \a added, and the number of locks to be
+ * preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
+                                                struct ldlm_lock *lock,
+                                                int unused, int added,
+                                                int count)
+{
+       cfs_time_t cur = cfs_time_current();
+       struct ldlm_pool *pl = &ns->ns_pool;
+       __u64 slv, lvf, lv;
+       cfs_time_t la;
+
+       /* Stop LRU processing when we reach past @count or have checked all
+        * locks in LRU. */
+       if (count && added >= count)
+               return LDLM_POLICY_KEEP_LOCK;
+
+       slv = ldlm_pool_get_slv(pl);
+       lvf = ldlm_pool_get_lvf(pl);
+       la = cfs_duration_sec(cfs_time_sub(cur, lock->l_last_used));
+       lv = lvf * la * unused;
+
+       /* Inform the pool about the current CLV so it can be seen via proc. */
+       ldlm_pool_set_clv(pl, lv);
+
+       /* Stop when the SLV has not yet been received from the server, or
+        * when lv is smaller than it. */
+       return (slv == 0 || lv < slv) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
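+
+/*
+ * Worked example (illustrative numbers only): with a lock volume factor
+ * lvf = 100, a lock idle for la = 10 seconds, and unused = 1000 locks in
+ * the LRU, the lock volume is lv = lvf * la * unused = 1,000,000. The lock
+ * is canceled only if the server has published a nonzero SLV and lv >= slv;
+ * otherwise it is kept and the LRU scan stops.
+ */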
+
+/**
+ * Callback function for the policy used via procfs (LDLM_CANCEL_PASSED).
+ * Decides whether to keep \a lock in LRU for the current LRU size \a unused,
+ * the number of locks added so far in this scan \a added, and the number of
+ * locks to be preferably canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
+                                                  struct ldlm_lock *lock,
+                                                  int unused, int added,
+                                                  int count)
+{
+       /* Stop LRU processing when we reach past @count or have checked all
+        * locks in LRU. */
+       return (added >= count) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for aged policy. Decides whether to keep \a lock in
+ * LRU for the current LRU size \a unused, the number of locks added so far
+ * in this scan \a added, and the number of locks to be preferably canceled
+ * \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
+                                                struct ldlm_lock *lock,
+                                                int unused, int added,
+                                                int count)
+{
+       /* Stop LRU processing if a young lock is found and we have already
+        * reached past \a count. */
+       return ((added >= count) &&
+               cfs_time_before(cfs_time_current(),
+                               cfs_time_add(lock->l_last_used,
+                                            ns->ns_max_age))) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+/**
+ * Callback function for default policy. Decides whether to keep \a lock
+ * in LRU for the current LRU size \a unused, the number of locks added so
+ * far in this scan \a added, and the number of locks to be preferably
+ * canceled \a count.
+ *
+ * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
+ *
+ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
+ */
+static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns,
+                                                   struct ldlm_lock *lock,
+                                                   int unused, int added,
+                                                   int count)
+{
+       /* Stop LRU processing when we reach past count or have checked all
+        * locks in LRU. */
+       return (added >= count) ?
+               LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
+}
+
+typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
+                                                     struct ldlm_lock *, int,
+                                                     int, int);
+
+static ldlm_cancel_lru_policy_t
+ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
+{
+       if (flags & LDLM_CANCEL_NO_WAIT)
+               return ldlm_cancel_no_wait_policy;
+
+       if (ns_connect_lru_resize(ns)) {
+               if (flags & LDLM_CANCEL_SHRINK)
+                       /* We kill passed number of old locks. */
+                       return ldlm_cancel_passed_policy;
+               else if (flags & LDLM_CANCEL_LRUR)
+                       return ldlm_cancel_lrur_policy;
+               else if (flags & LDLM_CANCEL_PASSED)
+                       return ldlm_cancel_passed_policy;
+       } else {
+               if (flags & LDLM_CANCEL_AGED)
+                       return ldlm_cancel_aged_policy;
+       }
+
+       return ldlm_cancel_default_policy;
+}
+
+/**
+ * - Free space in LRU for \a count new locks; redundant unused locks are
+ *   canceled locally;
+ * - also cancel locally unused aged locks;
+ * - do not cancel more than \a max locks;
+ * - GET the found locks and add them into the \a cancels list.
+ *
+ * A client lock can be added to the l_bl_ast list only when it is
+ * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing
+ * CANCEL.  There are the following use cases:
+ * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
+ * ldlm_cli_cancel(), which check and set this flag properly. As any
+ * attempt to cancel a lock relies on this flag, the l_bl_ast list is
+ * accessed later without any special locking.
+ *
+ * Calling policies for enabled LRU resize:
+ * ----------------------------------------
+ * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to
+ *                         cancel not more than \a count locks;
+ *
+ * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at
+ *                           the beginning of LRU list);
+ *
+ * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to
+ *                           the memory pressure policy function;
+ *
+ * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                            (typically before replaying locks) w/o
+ *                            sending any RPCs or waiting for any
+ *                            outstanding RPC to complete.
+ */
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
+                                struct list_head *cancels,
+                                int count, int max, int flags)
+{
+       ldlm_cancel_lru_policy_t pf;
+       struct ldlm_lock *lock, *next;
+       int added = 0, unused, remained;
+       ENTRY;
+
+       spin_lock(&ns->ns_lock);
+       unused = ns->ns_nr_unused;
+       remained = unused;
+
+       if (!ns_connect_lru_resize(ns))
+               count += unused - ns->ns_max_unused;
+
+       pf = ldlm_cancel_lru_policy(ns, flags);
+       LASSERT(pf != NULL);
+
+       while (!list_empty(&ns->ns_unused_list)) {
+               ldlm_policy_res_t result;
+
+               /* all unused locks */
+               if (remained-- <= 0)
+                       break;
+
+               /* For any flags, stop scanning if @max is reached. */
+               if (max && added >= max)
+                       break;
+
+               list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+                                        l_lru) {
+                       /* No locks which got blocking requests. */
+                       LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
+
+                       if (flags & LDLM_CANCEL_NO_WAIT &&
+                           lock->l_flags & LDLM_FL_SKIPPED)
+                               /* already processed */
+                               continue;
+
+                       /* If somebody is already doing CANCEL, there is no
+                        * need for this lock in LRU; remove it so we do not
+                        * traverse it again. Otherwise we have found a
+                        * candidate, so stop scanning. */
+                       if (!(lock->l_flags & LDLM_FL_CANCELING))
+                               break;
+
+                       ldlm_lock_remove_from_lru_nolock(lock);
+               }
+               if (&lock->l_lru == &ns->ns_unused_list)
+                       break;
+
+               LDLM_LOCK_GET(lock);
+               spin_unlock(&ns->ns_lock);
+               lu_ref_add(&lock->l_reference, __func__, current);
+
+               /* Pass the lock through the policy filter and see if it
+                * should stay in LRU.
+                *
+                * Even for shrinker policy we stop scanning if
+                * we find a lock that should stay in the cache.
+                * We should take into account lock age anyway
+                * as a new lock is a valuable resource even if
+                * it has a low weight.
+                *
+                * That is, for shrinker policy we drop only
+                * old locks, but additionally choose them by
+                * their weight. Big extent locks will stay in
+                * the cache. */
+               result = pf(ns, lock, unused, added, count);
+               if (result == LDLM_POLICY_KEEP_LOCK) {
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       break;
+               }
+               if (result == LDLM_POLICY_SKIP_LOCK) {
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       continue;
+               }
+
+               lock_res_and_lock(lock);
+               /* Check flags again under the lock. */
+               if ((lock->l_flags & LDLM_FL_CANCELING) ||
+                   (ldlm_lock_remove_from_lru(lock) == 0)) {
+                       /* Another thread is removing lock from LRU, or
+                        * somebody is already doing CANCEL, or there
+                        * is a blocking request which will send cancel
+                        * by itself, or the lock is no longer unused. */
+                       unlock_res_and_lock(lock);
+                       lu_ref_del(&lock->l_reference,
+                                  __func__, current);
+                       LDLM_LOCK_RELEASE(lock);
+                       spin_lock(&ns->ns_lock);
+                       continue;
+               }
+               LASSERT(!lock->l_readers && !lock->l_writers);
+
+               /* If we have chosen to cancel this lock voluntarily, we had
+                * better send a cancel notification to the server, so that it
+                * can free the appropriate state. This might lead to a race
+                * where, while we are cancelling here, the server is silently
+                * cancelling this lock as well. */
+               lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
+               /* Setting the CBPENDING flag is a little misleading,
+                * but prevents an important race; namely, once
+                * CBPENDING is set, the lock can accumulate no more
+                * readers/writers. Since readers and writers are
+                * already zero here, ldlm_lock_decref() won't see
+                * this flag and call l_blocking_ast */
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
+
+               /* We can't re-add to l_lru as it confuses the
+                * refcounting in ldlm_lock_remove_from_lru() if an AST
+                * arrives after we drop lr_lock below. We use l_bl_ast
+                * and can't use l_pending_chain, as the latter is used on
+                * both server and client (even though bug 5666 says it is
+                * used only on the server). */
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, cancels);
+               unlock_res_and_lock(lock);
+               lu_ref_del(&lock->l_reference, __func__, current);
+               spin_lock(&ns->ns_lock);
+               added++;
+               unused--;
+       }
+       spin_unlock(&ns->ns_lock);
+       RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
+                         int count, int max, ldlm_cancel_flags_t cancel_flags,
+                         int flags)
+{
+       int added;
+       added = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+       if (added <= 0)
+               return added;
+       return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
+}
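+
+/*
+ * Illustrative usage sketch (mirrors the ELC path in ldlm_cli_cancel()
+ * above): fill the remaining slots of a CANCEL RPC with LRU victims:
+ *
+ *     count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
+ *                                    LCF_BL_AST, LDLM_CANCEL_LRUR);
+ */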
+
+/**
+ * Cancel at least \a nr locks from given namespace LRU.
+ *
+ * When called with LCF_ASYNC the blocking callback will be handled
+ * in a thread and this function will return after the thread has been
+ * asked to call the callback.  When called without LCF_ASYNC the blocking
+ * callback will be performed in this function.
+ */
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
+                   ldlm_cancel_flags_t cancel_flags,
+                   int flags)
+{
+       LIST_HEAD(cancels);
+       int count, rc;
+       ENTRY;
+
+       /* Just prepare the list of locks, do not actually cancel them yet.
+        * Locks are cancelled later in a separate thread. */
+       count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+       rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags);
+       if (rc == 0)
+               RETURN(count);
+
+       RETURN(0);
+}
+
+/**
+ * Find and cancel locally unused locks found on the resource that match
+ * the given policy and mode. GET the found locks and add them into the
+ * \a cancels list.
+ */
+int ldlm_cancel_resource_local(struct ldlm_resource *res,
+                              struct list_head *cancels,
+                              ldlm_policy_data_t *policy,
+                              ldlm_mode_t mode, int lock_flags,
+                              ldlm_cancel_flags_t cancel_flags, void *opaque)
+{
+       struct ldlm_lock *lock;
+       int count = 0;
+       ENTRY;
+
+       lock_res(res);
+       list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+               if (opaque != NULL && lock->l_ast_data != opaque) {
+                       LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+                                  lock->l_ast_data, opaque);
+                       /* LBUG(); */
+                       continue;
+               }
+
+               if (lock->l_readers || lock->l_writers)
+                       continue;
+
+               /* If somebody is already doing CANCEL, or blocking AST came,
+                * skip this lock. */
+               if (lock->l_flags & LDLM_FL_BL_AST ||
+                   lock->l_flags & LDLM_FL_CANCELING)
+                       continue;
+
+               if (lockmode_compat(lock->l_granted_mode, mode))
+                       continue;
+
+               /* If policy is given and this is IBITS lock, add to list only
+                * those locks that match by policy. */
+               if (policy && (lock->l_resource->lr_type == LDLM_IBITS) &&
+                   !(lock->l_policy_data.l_inodebits.bits &
+                     policy->l_inodebits.bits))
+                       continue;
+
+               /* See CBPENDING comment in ldlm_cancel_lru */
+               lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
+                                lock_flags;
+
+               LASSERT(list_empty(&lock->l_bl_ast));
+               list_add(&lock->l_bl_ast, cancels);
+               LDLM_LOCK_GET(lock);
+               count++;
+       }
+       unlock_res(res);
+
+       RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
+}
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
+
+/**
+ * Cancel client-side locks from a list and send/prepare cancel RPCs to the
+ * server.
+ * If \a req is NULL, send the CANCEL request to the server with the handles
+ * of the locks in the \a cancels list. If EARLY_CANCEL is not supported,
+ * send CANCEL requests separately, one per lock.
+ * If \a req is not NULL, put handles of locks in \a cancels into the request
+ * buffer at the offset \a off.
+ * Destroy \a cancels at the end.
+ */
+int ldlm_cli_cancel_list(struct list_head *cancels, int count,
+                        struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
+{
+       struct ldlm_lock *lock;
+       int res = 0;
+       ENTRY;
+
+       if (list_empty(cancels) || count == 0)
+               RETURN(0);
+
+       /* XXX: requests (both batched and not) could be sent in parallel.
+        * Usually it is enough to have just 1 RPC, but it is possible that
+        * there are too many locks to be cancelled in LRU or on a resource.
+        * It would also speed up the case when the server does not support
+        * the feature. */
+       while (count > 0) {
+               LASSERT(!list_empty(cancels));
+               lock = list_entry(cancels->next, struct ldlm_lock, l_bl_ast);
+               LASSERT(lock->l_conn_export);
+
+               if (exp_connect_cancelset(lock->l_conn_export)) {
+                       res = count;
+                       if (req)
+                               ldlm_cancel_pack(req, cancels, count);
+                       else
+                               res = ldlm_cli_cancel_req(lock->l_conn_export,
+                                                         cancels, count,
+                                                         flags);
+               } else {
+                       res = ldlm_cli_cancel_req(lock->l_conn_export,
+                                                 cancels, 1, flags);
+               }
+
+               if (res < 0) {
+                       CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
+                                    "ldlm_cli_cancel_list: %d\n", res);
+                       res = count;
+               }
+
+               count -= res;
+               ldlm_lock_list_put(cancels, l_bl_ast, res);
+       }
+       LASSERT(count == 0);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
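+
+/*
+ * Illustrative note (not part of this patch): when the export supports
+ * OBD_CONNECT_CANCELSET the whole list goes out as one batched request, e.g.
+ *
+ *     rc = ldlm_cli_cancel_list(&cancels, count, NULL, 0);
+ *
+ * while older servers receive one ldlm_cli_cancel_req() call per lock.
+ */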
+
+/**
+ * Cancel all locks on a resource that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server.
+ */
+int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+                                   const struct ldlm_res_id *res_id,
+                                   ldlm_policy_data_t *policy,
+                                   ldlm_mode_t mode,
+                                   ldlm_cancel_flags_t flags,
+                                   void *opaque)
+{
+       struct ldlm_resource *res;
+       LIST_HEAD(cancels);
+       int count;
+       int rc;
+       ENTRY;
+
+       res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+       if (res == NULL) {
+               /* This is not a problem. */
+               CDEBUG(D_INFO, "No resource "LPU64"\n", res_id->name[0]);
+               RETURN(0);
+       }
+
+       LDLM_RESOURCE_ADDREF(res);
+       count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
+                                          0, flags | LCF_BL_AST, opaque);
+       rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
+       if (rc != ELDLM_OK)
+               CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
+
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
+
+struct ldlm_cli_cancel_arg {
+       int     lc_flags;
+       void   *lc_opaque;
+};
+
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                      struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource       *res = cfs_hash_object(hs, hnode);
+       struct ldlm_cli_cancel_arg     *lc = arg;
+       int                          rc;
+
+       rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+                                            NULL, LCK_MINMODE,
+                                            lc->lc_flags, lc->lc_opaque);
+       if (rc != 0) {
+               CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+                      res->lr_name.name[0], rc);
+       }
+       /* must return 0 for hash iteration */
+       return 0;
+}
+
+/**
+ * Cancel all locks on a namespace (or a specific resource, if given)
+ * that have 0 readers/writers.
+ *
+ * If flags & LCF_LOCAL, throw the locks away without trying
+ * to notify the server.
+ */
+int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct ldlm_cli_cancel_arg arg = {
+               .lc_flags       = flags,
+               .lc_opaque      = opaque,
+       };
+
+       ENTRY;
+
+       if (ns == NULL)
+               RETURN(ELDLM_OK);
+
+       if (res_id != NULL) {
+               RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
+                                                      LCK_MINMODE, flags,
+                                                      opaque));
+       } else {
+               cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                        ldlm_cli_hash_cancel_unused, &arg);
+               RETURN(ELDLM_OK);
+       }
+}
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
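+
+/*
+ * Illustrative usage sketch (assumed umount-style caller, not from this
+ * patch): drop every idle lock in a namespace without telling the server:
+ *
+ *     rc = ldlm_cli_cancel_unused(ns, NULL, LCF_LOCAL, NULL);
+ *
+ * Passing a non-NULL res_id instead restricts the sweep to one resource.
+ */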
+
+/* Lock iterators. */
+
+int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
+                         void *closure)
+{
+       struct list_head *tmp, *next;
+       struct ldlm_lock *lock;
+       int rc = LDLM_ITER_CONTINUE;
+
+       ENTRY;
+
+       if (!res)
+               RETURN(LDLM_ITER_CONTINUE);
+
+       lock_res(res);
+       list_for_each_safe(tmp, next, &res->lr_granted) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+
+       list_for_each_safe(tmp, next, &res->lr_converting) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+
+       list_for_each_safe(tmp, next, &res->lr_waiting) {
+               lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+
+               if (iter(lock, closure) == LDLM_ITER_STOP)
+                       GOTO(out, rc = LDLM_ITER_STOP);
+       }
+ out:
+       unlock_res(res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_foreach);
+
+struct iter_helper_data {
+       ldlm_iterator_t iter;
+       void *closure;
+};
+
+static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
+{
+       struct iter_helper_data *helper = closure;
+       return helper->iter(lock, helper->closure);
+}
+
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                               struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+       return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+              LDLM_ITER_STOP;
+}
+
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+                           ldlm_iterator_t iter, void *closure)
+{
+       struct iter_helper_data helper = { .iter = iter, .closure = closure };
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                ldlm_res_iter_helper, &helper);
+}
+EXPORT_SYMBOL(ldlm_namespace_foreach);
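+
+/*
+ * Illustrative usage sketch (not part of this patch): counting all locks in
+ * a namespace with the iterator interface. The callback must return
+ * LDLM_ITER_CONTINUE to keep walking or LDLM_ITER_STOP to abort the walk:
+ *
+ *     static int count_one_lock(struct ldlm_lock *lock, void *closure)
+ *     {
+ *             (*(int *)closure)++;
+ *             return LDLM_ITER_CONTINUE;
+ *     }
+ *
+ *     int n = 0;
+ *     ldlm_namespace_foreach(ns, count_one_lock, &n);
+ */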
+
+/* Non-blocking function to manipulate a lock whose cb_data is being put away.
+ * Return  0:  no resource found
+ *       > 0:  must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE.
+ *       < 0:  error
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+                         const struct ldlm_res_id *res_id,
+                         ldlm_iterator_t iter, void *data)
+{
+       struct ldlm_resource *res;
+       int rc;
+       ENTRY;
+
+       if (ns == NULL) {
+               CERROR("must pass in namespace\n");
+               LBUG();
+       }
+
+       res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       LDLM_RESOURCE_ADDREF(res);
+       rc = ldlm_resource_foreach(res, iter, data);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_resource_iterate);
+
+/* Lock replay */
+
+static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
+{
+       struct list_head *list = closure;
+
+       /* we use l_pending_chain here, because it's unused on clients. */
+       LASSERTF(list_empty(&lock->l_pending_chain),
+                "lock %p next %p prev %p\n",
+                lock, &lock->l_pending_chain.next, &lock->l_pending_chain.prev);
+       /* bug 9573: don't replay locks left after eviction, or
+        * bug 17614: locks being actively cancelled. Get a reference
+        * on a lock so that it does not disappear under us (e.g. due to cancel)
+        */
+       if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+               list_add(&lock->l_pending_chain, list);
+               LDLM_LOCK_GET(lock);
+       }
+
+       return LDLM_ITER_CONTINUE;
+}
+
+static int replay_lock_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct ldlm_async_args *aa, int rc)
+{
+       struct ldlm_lock     *lock;
+       struct ldlm_reply    *reply;
+       struct obd_export    *exp;
+
+       ENTRY;
+       atomic_dec(&req->rq_import->imp_replay_inflight);
+       if (rc != ELDLM_OK)
+               GOTO(out, rc);
+
+       reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       if (reply == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lock = ldlm_handle2lock(&aa->lock_handle);
+       if (!lock) {
+               CERROR("received replay ack for unknown local cookie "LPX64
+                      " remote cookie "LPX64 " from server %s id %s\n",
+                      aa->lock_handle.cookie, reply->lock_handle.cookie,
+                      req->rq_export->exp_client_uuid.uuid,
+                      libcfs_id2str(req->rq_peer));
+               GOTO(out, rc = -ESTALE);
+       }
+
+       /* Key change rehash lock in per-export hash with new key */
+       exp = req->rq_export;
+       if (exp && exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               cfs_hash_rehash_key(exp->exp_lock_hash,
+                                   &lock->l_remote_handle,
+                                   &reply->lock_handle,
+                                   &lock->l_exp_hash);
+       } else {
+               lock->l_remote_handle = reply->lock_handle;
+       }
+
+       LDLM_DEBUG(lock, "replayed lock:");
+       ptlrpc_import_recovery_state_machine(req->rq_import);
+       LDLM_LOCK_PUT(lock);
+out:
+       if (rc != ELDLM_OK)
+               ptlrpc_connect_import(req->rq_import);
+
+       RETURN(rc);
+}
+
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
+{
+       struct ptlrpc_request *req;
+       struct ldlm_async_args *aa;
+       struct ldlm_request   *body;
+       int flags;
+       ENTRY;
+
+       /* Bug 11974: Do not replay a lock which is actively being canceled */
+       if (lock->l_flags & LDLM_FL_CANCELING) {
+               LDLM_DEBUG(lock, "Not replaying canceled lock:");
+               RETURN(0);
+       }
+
+       /* If this is a reply-less callback lock, we cannot replay it, since
+        * the server might have dropped it long ago, with the notification of
+        * that event lost by the network (and the server may already have
+        * granted a conflicting lock). */
+       if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+               LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+               ldlm_lock_cancel(lock);
+               RETURN(0);
+       }
+
+       /*
+        * If granted mode matches the requested mode, this lock is granted.
+        *
+        * If they differ, but we have a granted mode, then we were granted
+        * one mode and now want another: ergo, converting.
+        *
+        * If we haven't been granted anything and are on a resource list,
+        * then we're blocked/waiting.
+        *
+        * If we haven't been granted anything and we're NOT on a resource list,
+        * then we haven't got a reply yet and don't have a known disposition.
+        * This happens whenever a lock enqueue is the request that triggers
+        * recovery.
+        */
+       if (lock->l_granted_mode == lock->l_req_mode)
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+       else if (lock->l_granted_mode)
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+       else if (!list_empty(&lock->l_res_link))
+               flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+       else
+               flags = LDLM_FL_REPLAY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
+                                       LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* We're part of recovery, so don't wait for it. */
+       req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
+       ldlm_lock2desc(lock, &body->lock_desc);
+       body->lock_flags = ldlm_flags_to_wire(flags);
+
+       ldlm_lock2handle(lock, &body->lock_handle[0]);
+       if (lock->l_lvb_len > 0)
+               req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                            lock->l_lvb_len);
+       ptlrpc_request_set_replen(req);
+       /* Notify the server that we've replayed all requests.
+        * Also, we mark the request to be put on a dedicated
+        * queue to be processed after all request replays.
+        * Bug 6063. */
+       lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
+
+       LDLM_DEBUG(lock, "replaying lock:");
+
+       atomic_inc(&req->rq_import->imp_replay_inflight);
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->lock_handle = body->lock_handle[0];
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+       RETURN(0);
+}
+
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can neither send any RPC to the server nor wait for
+ * any outstanding RPC to complete.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+       int canceled;
+       LIST_HEAD(cancels);
+
+       CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before"
+                          "replay for namespace %s (%d)\n",
+                          ldlm_ns_name(ns), ns->ns_nr_unused);
+
+       /* We don't need to care whether or not LRU resize is enabled
+        * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+        * count parameter */
+       canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                        LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+       CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                          canceled, ldlm_ns_name(ns));
+}
+
+int ldlm_replay_locks(struct obd_import *imp)
+{
+       struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+       LIST_HEAD(list);
+       struct ldlm_lock *lock, *next;
+       int rc = 0;
+
+       ENTRY;
+
+       LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+
+       /* don't replay locks if import failed recovery */
+       if (imp->imp_vbr_failed)
+               RETURN(0);
+
+       /* ensure this doesn't fall to 0 before all have been queued */
+       atomic_inc(&imp->imp_replay_inflight);
+
+       if (ldlm_cancel_unused_locks_before_replay)
+               ldlm_cancel_unused_locks_for_replay(ns);
+
+       ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+
+       list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+               list_del_init(&lock->l_pending_chain);
+               if (rc) {
+                       LDLM_LOCK_RELEASE(lock);
+                       continue; /* or try to do the rest? */
+               }
+               rc = replay_one_lock(imp, lock);
+               LDLM_LOCK_RELEASE(lock);
+       }
+
+       atomic_dec(&imp->imp_replay_inflight);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ldlm_replay_locks);
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
new file mode 100644 (file)
index 0000000..9052dc5
--- /dev/null
@@ -0,0 +1,1409 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ldlm/ldlm_resource.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LDLM
+#include <lustre_dlm.h>
+
+#include <lustre_fid.h>
+#include <obd_class.h>
+#include "ldlm_internal.h"
+
+struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab;
+
+atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0);
+atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0);
+
+struct mutex ldlm_srv_namespace_lock;
+LIST_HEAD(ldlm_srv_namespace_list);
+
+struct mutex ldlm_cli_namespace_lock;
+LIST_HEAD(ldlm_cli_namespace_list);
+
+proc_dir_entry_t *ldlm_type_proc_dir = NULL;
+proc_dir_entry_t *ldlm_ns_proc_dir = NULL;
+proc_dir_entry_t *ldlm_svc_proc_dir = NULL;
+
+extern unsigned int ldlm_cancel_unused_locks_before_replay;
+
+/* During a debug dump, print at most this many granted locks for one
+ * resource, to avoid flooding the logs (DDoS). */
+unsigned int ldlm_dump_granted_max = 256;
+
+#ifdef LPROCFS
+static ssize_t lprocfs_wr_dump_ns(struct file *file, const char *buffer,
+                                 size_t count, loff_t *off)
+{
+       ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE);
+       ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE);
+       RETURN(count);
+}
+LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns);
+
+LPROC_SEQ_FOPS_RW_TYPE(ldlm_rw, uint);
+LPROC_SEQ_FOPS_RO_TYPE(ldlm, uint);
+
+int ldlm_proc_setup(void)
+{
+       int rc;
+       struct lprocfs_vars list[] = {
+               { "dump_namespaces", &ldlm_dump_ns_fops, 0, 0222 },
+               { "dump_granted_max", &ldlm_rw_uint_fops,
+                 &ldlm_dump_granted_max },
+               { "cancel_unused_locks_before_replay", &ldlm_rw_uint_fops,
+                 &ldlm_cancel_unused_locks_before_replay },
+               { NULL }};
+       ENTRY;
+       LASSERT(ldlm_ns_proc_dir == NULL);
+
+       ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME,
+                                             proc_lustre_root,
+                                             NULL, NULL);
+       if (IS_ERR(ldlm_type_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_type_proc_dir);
+               GOTO(err, rc);
+       }
+
+       ldlm_ns_proc_dir = lprocfs_register("namespaces",
+                                           ldlm_type_proc_dir,
+                                           NULL, NULL);
+       if (IS_ERR(ldlm_ns_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_ns_proc_dir);
+               GOTO(err_type, rc);
+       }
+
+       ldlm_svc_proc_dir = lprocfs_register("services",
+                                           ldlm_type_proc_dir,
+                                           NULL, NULL);
+       if (IS_ERR(ldlm_svc_proc_dir)) {
+               CERROR("LProcFS failed in ldlm-init\n");
+               rc = PTR_ERR(ldlm_svc_proc_dir);
+               GOTO(err_ns, rc);
+       }
+
+       rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL);
+       if (rc != 0)
+               GOTO(err_svc, rc);
+
+       RETURN(0);
+
+err_svc:
+       lprocfs_remove(&ldlm_svc_proc_dir);
+err_ns:
+       lprocfs_remove(&ldlm_ns_proc_dir);
+err_type:
+       lprocfs_remove(&ldlm_type_proc_dir);
+err:
+       ldlm_svc_proc_dir = NULL;
+       ldlm_type_proc_dir = NULL;
+       ldlm_ns_proc_dir = NULL;
+       RETURN(rc);
+}
+
+void ldlm_proc_cleanup(void)
+{
+       if (ldlm_svc_proc_dir)
+               lprocfs_remove(&ldlm_svc_proc_dir);
+
+       if (ldlm_ns_proc_dir)
+               lprocfs_remove(&ldlm_ns_proc_dir);
+
+       if (ldlm_type_proc_dir)
+               lprocfs_remove(&ldlm_type_proc_dir);
+
+       ldlm_svc_proc_dir = NULL;
+       ldlm_type_proc_dir = NULL;
+       ldlm_ns_proc_dir = NULL;
+}
+
+static int lprocfs_ns_resources_seq_show(struct seq_file *m, void *v)
+{
+       struct ldlm_namespace *ns  = m->private;
+       __u64             res = 0;
+       cfs_hash_bd_t     bd;
+       int                 i;
+
+       /* The result is not strictly consistent. */
+       cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i)
+               res += cfs_hash_bd_count_get(&bd);
+       return lprocfs_rd_u64(m, &res);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_resources);
+
+static int lprocfs_ns_locks_seq_show(struct seq_file *m, void *v)
+{
+       struct ldlm_namespace *ns = m->private;
+       __u64             locks;
+
+       locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS,
+                                       LPROCFS_FIELDS_FLAGS_SUM);
+       return lprocfs_rd_u64(m, &locks);
+}
+LPROC_SEQ_FOPS_RO(lprocfs_ns_locks);
+
+static int lprocfs_lru_size_seq_show(struct seq_file *m, void *v)
+{
+       struct ldlm_namespace *ns = m->private;
+       __u32 *nr = &ns->ns_max_unused;
+
+       if (ns_connect_lru_resize(ns))
+               nr = &ns->ns_nr_unused;
+       return lprocfs_rd_uint(m, nr);
+}
+
+static ssize_t lprocfs_lru_size_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private;
+       char dummy[MAX_STRING_SIZE + 1], *end;
+       unsigned long tmp;
+       int lru_resize;
+
+       dummy[MAX_STRING_SIZE] = '\0';
+       if (copy_from_user(dummy, buffer, MAX_STRING_SIZE))
+               return -EFAULT;
+
+       if (strncmp(dummy, "clear", 5) == 0) {
+               CDEBUG(D_DLMTRACE,
+                      "dropping all unused locks from namespace %s\n",
+                      ldlm_ns_name(ns));
+               if (ns_connect_lru_resize(ns)) {
+                       int canceled, unused  = ns->ns_nr_unused;
+
+                       /* Try to cancel all @ns_nr_unused locks. */
+                       canceled = ldlm_cancel_lru(ns, unused, 0,
+                                                  LDLM_CANCEL_PASSED);
+                       if (canceled < unused) {
+                               CDEBUG(D_DLMTRACE,
+                                      "not all requested locks are canceled, "
+                                      "requested: %d, canceled: %d\n", unused,
+                                      canceled);
+                               return -EINVAL;
+                       }
+               } else {
+                       tmp = ns->ns_max_unused;
+                       ns->ns_max_unused = 0;
+                       ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED);
+                       ns->ns_max_unused = tmp;
+               }
+               return count;
+       }
+
+       tmp = simple_strtoul(dummy, &end, 0);
+       if (dummy == end) {
+               CERROR("invalid value written\n");
+               return -EINVAL;
+       }
+       lru_resize = (tmp == 0);
+
+       if (ns_connect_lru_resize(ns)) {
+               if (!lru_resize)
+                       ns->ns_max_unused = (unsigned int)tmp;
+
+               if (tmp > ns->ns_nr_unused)
+                       tmp = ns->ns_nr_unused;
+               tmp = ns->ns_nr_unused - tmp;
+
+               CDEBUG(D_DLMTRACE,
+                      "changing namespace %s unused locks from %u to %u\n",
+                      ldlm_ns_name(ns), ns->ns_nr_unused,
+                      (unsigned int)tmp);
+               ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+               if (!lru_resize) {
+                       CDEBUG(D_DLMTRACE,
+                              "disable lru_resize for namespace %s\n",
+                              ldlm_ns_name(ns));
+                       ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE;
+               }
+       } else {
+               CDEBUG(D_DLMTRACE,
+                      "changing namespace %s max_unused from %u to %u\n",
+                      ldlm_ns_name(ns), ns->ns_max_unused,
+                      (unsigned int)tmp);
+               ns->ns_max_unused = (unsigned int)tmp;
+               ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED);
+
+               /* Make sure that LRU resize was originally supported before
+                * turning it on here. */
+               if (lru_resize &&
+                   (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) {
+                       CDEBUG(D_DLMTRACE,
+                              "enable lru_resize for namespace %s\n",
+                              ldlm_ns_name(ns));
+                       ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+               }
+       }
+
+       return count;
+}
+LPROC_SEQ_FOPS(lprocfs_lru_size);
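+
+/*
+ * Illustrative usage (the path is the conventional procfs layout and may
+ * differ per setup): the LRU can be flushed or resized from user space:
+ *
+ *     echo clear > /proc/fs/lustre/ldlm/namespaces/<ns>/lru_size
+ *     echo 400 > /proc/fs/lustre/ldlm/namespaces/<ns>/lru_size
+ *
+ * "clear" cancels all unused locks; a nonzero value sets ns_max_unused and
+ * disables LRU resize, while writing 0 re-enables LRU resize if the server
+ * originally advertised OBD_CONNECT_LRU_RESIZE.
+ */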
+
+static int lprocfs_elc_seq_show(struct seq_file *m, void *v)
+{
+       struct ldlm_namespace *ns = m->private;
+       unsigned int supp = ns_connect_cancelset(ns);
+
+       return lprocfs_rd_uint(m, &supp);
+}
+
+static ssize_t lprocfs_elc_seq_write(struct file *file, const char *buffer,
+                                size_t count, loff_t *off)
+{
+       struct ldlm_namespace *ns = ((struct seq_file *)file->private_data)->private;
+       unsigned int supp = -1;
+       int rc;
+
+       rc = lprocfs_wr_uint(file, buffer, count, &supp);
+       if (rc < 0)
+               return rc;
+
+       if (supp == 0)
+               ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET;
+       else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET)
+               ns->ns_connect_flags |= OBD_CONNECT_CANCELSET;
+       return count;
+}
+LPROC_SEQ_FOPS(lprocfs_elc);
+
+void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns)
+{
+       if (ns->ns_proc_dir_entry == NULL)
+               CERROR("dlm namespace %s has no procfs dir?\n",
+                      ldlm_ns_name(ns));
+       else
+               lprocfs_remove(&ns->ns_proc_dir_entry);
+
+       if (ns->ns_stats != NULL)
+               lprocfs_free_stats(&ns->ns_stats);
+}
+
+#define LDLM_NS_ADD_VAR(name, var, ops)                                \
+       do {                                                    \
+               snprintf(lock_name, MAX_STRING_SIZE, name);     \
+               lock_vars[0].data = var;                        \
+               lock_vars[0].fops = ops;                        \
+               lprocfs_add_vars(ns_pde, lock_vars, 0);         \
+       } while (0)
+
+int ldlm_namespace_proc_register(struct ldlm_namespace *ns)
+{
+       struct lprocfs_vars lock_vars[2];
+       char lock_name[MAX_STRING_SIZE + 1];
+       proc_dir_entry_t *ns_pde;
+
+       LASSERT(ns != NULL);
+       LASSERT(ns->ns_rs_hash != NULL);
+
+       if (ns->ns_proc_dir_entry != NULL) {
+               ns_pde = ns->ns_proc_dir_entry;
+       } else {
+               ns_pde = proc_mkdir(ldlm_ns_name(ns), ldlm_ns_proc_dir);
+               if (ns_pde == NULL)
+                       return -ENOMEM;
+               ns->ns_proc_dir_entry = ns_pde;
+       }
+
+       ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0);
+       if (ns->ns_stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS,
+                            LPROCFS_CNTR_AVGMINMAX, "locks", "locks");
+
+       lock_name[MAX_STRING_SIZE] = '\0';
+
+       memset(lock_vars, 0, sizeof(lock_vars));
+       lock_vars[0].name = lock_name;
+
+       LDLM_NS_ADD_VAR("resource_count", ns, &lprocfs_ns_resources_fops);
+       LDLM_NS_ADD_VAR("lock_count", ns, &lprocfs_ns_locks_fops);
+
+       if (ns_is_client(ns)) {
+               LDLM_NS_ADD_VAR("lock_unused_count", &ns->ns_nr_unused,
+                               &ldlm_uint_fops);
+               LDLM_NS_ADD_VAR("lru_size", ns, &lprocfs_lru_size_fops);
+               LDLM_NS_ADD_VAR("lru_max_age", &ns->ns_max_age,
+                               &ldlm_rw_uint_fops);
+               LDLM_NS_ADD_VAR("early_lock_cancel", ns, &lprocfs_elc_fops);
+       } else {
+               LDLM_NS_ADD_VAR("ctime_age_limit", &ns->ns_ctime_age_limit,
+                               &ldlm_rw_uint_fops);
+               LDLM_NS_ADD_VAR("lock_timeouts", &ns->ns_timeouts,
+                               &ldlm_uint_fops);
+               LDLM_NS_ADD_VAR("max_nolock_bytes", &ns->ns_max_nolock_size,
+                               &ldlm_rw_uint_fops);
+               LDLM_NS_ADD_VAR("contention_seconds", &ns->ns_contention_time,
+                               &ldlm_rw_uint_fops);
+               LDLM_NS_ADD_VAR("contended_locks", &ns->ns_contended_locks,
+                               &ldlm_rw_uint_fops);
+               LDLM_NS_ADD_VAR("max_parallel_ast", &ns->ns_max_parallel_ast,
+                               &ldlm_rw_uint_fops);
+       }
+       return 0;
+}
+#undef MAX_STRING_SIZE
+#else /* LPROCFS */
+
+#define ldlm_namespace_proc_unregister(ns)      ({;})
+#define ldlm_namespace_proc_register(ns)       ({0;})
+
+#endif /* LPROCFS */
+
+static unsigned ldlm_res_hop_hash(cfs_hash_t *hs,
+                                 const void *key, unsigned mask)
+{
+       const struct ldlm_res_id     *id  = key;
+       unsigned                val = 0;
+       unsigned                i;
+
+       for (i = 0; i < RES_NAME_SIZE; i++)
+               val += id->name[i];
+       return val & mask;
+}
+
+static unsigned ldlm_res_hop_fid_hash(cfs_hash_t *hs,
+                                     const void *key, unsigned mask)
+{
+       const struct ldlm_res_id *id = key;
+       struct lu_fid       fid;
+       __u32          hash;
+       __u32          val;
+
+       fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF];
+       fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF];
+       fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32);
+
+       hash = fid_flatten32(&fid);
+       hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+       if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) {
+               val = id->name[LUSTRE_RES_ID_HSH_OFF];
+               hash += (val >> 5) + (val << 11);
+       } else {
+               val = fid_oid(&fid);
+       }
+       hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+       /* give me another random factor */
+       hash -= cfs_hash_long((unsigned long)hs, val % 11 + 3);
+
+       hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+       hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1);
+
+       return hash & mask;
+}
+
+static void *ldlm_res_hop_key(struct hlist_node *hnode)
+{
+       struct ldlm_resource   *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       return &res->lr_name;
+}
+
+static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct ldlm_resource   *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       return ldlm_res_eq((const struct ldlm_res_id *)key,
+                          (const struct ldlm_res_id *)&res->lr_name);
+}
+
+static void *ldlm_res_hop_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ldlm_resource, lr_hash);
+}
+
+static void ldlm_res_hop_get_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       ldlm_resource_getref(res);
+}
+
+static void ldlm_res_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       /* cfs_hash_for_each_nolock is the only path that calls this. */
+       ldlm_resource_putref_locked(res);
+}
+
+static void ldlm_res_hop_put(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ldlm_resource *res;
+
+       res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+       ldlm_resource_putref(res);
+}
+
+cfs_hash_ops_t ldlm_ns_hash_ops = {
+       .hs_hash        = ldlm_res_hop_hash,
+       .hs_key         = ldlm_res_hop_key,
+       .hs_keycmp      = ldlm_res_hop_keycmp,
+       .hs_keycpy      = NULL,
+       .hs_object      = ldlm_res_hop_object,
+       .hs_get         = ldlm_res_hop_get_locked,
+       .hs_put_locked  = ldlm_res_hop_put_locked,
+       .hs_put         = ldlm_res_hop_put
+};
+
+cfs_hash_ops_t ldlm_ns_fid_hash_ops = {
+       .hs_hash        = ldlm_res_hop_fid_hash,
+       .hs_key         = ldlm_res_hop_key,
+       .hs_keycmp      = ldlm_res_hop_keycmp,
+       .hs_keycpy      = NULL,
+       .hs_object      = ldlm_res_hop_object,
+       .hs_get         = ldlm_res_hop_get_locked,
+       .hs_put_locked  = ldlm_res_hop_put_locked,
+       .hs_put         = ldlm_res_hop_put
+};
+
+typedef struct {
+       ldlm_ns_type_t  nsd_type;
+       /** hash bucket bits */
+       unsigned        nsd_bkt_bits;
+       /** hash bits */
+       unsigned        nsd_all_bits;
+       /** hash operations */
+       cfs_hash_ops_t *nsd_hops;
+} ldlm_ns_hash_def_t;
+
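+/* Per-namespace-type hash table geometry.  ldlm_namespace_new() scans
+ * this table linearly; the LDLM_NS_TYPE_UNKNOWN entry is the sentinel
+ * that terminates the scan. */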
+ldlm_ns_hash_def_t ldlm_ns_hash_defs[] =
+{
+       {
+               .nsd_type       = LDLM_NS_TYPE_MDC,
+               .nsd_bkt_bits   = 11,
+               .nsd_all_bits   = 16,
+               .nsd_hops       = &ldlm_ns_fid_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MDT,
+               .nsd_bkt_bits   = 14,
+               .nsd_all_bits   = 21,
+               .nsd_hops       = &ldlm_ns_fid_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_OSC,
+               .nsd_bkt_bits   = 8,
+               .nsd_all_bits   = 12,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_OST,
+               .nsd_bkt_bits   = 11,
+               .nsd_all_bits   = 17,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MGC,
+               .nsd_bkt_bits   = 4,
+               .nsd_all_bits   = 4,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_MGT,
+               .nsd_bkt_bits   = 4,
+               .nsd_all_bits   = 4,
+               .nsd_hops       = &ldlm_ns_hash_ops,
+       },
+       {
+               .nsd_type       = LDLM_NS_TYPE_UNKNOWN,
+       },
+};
+
+/**
+ * Create and initialize a new, empty namespace.
+ */
+struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name,
+                                         ldlm_side_t client,
+                                         ldlm_appetite_t apt,
+                                         ldlm_ns_type_t ns_type)
+{
+       struct ldlm_namespace *ns = NULL;
+       struct ldlm_ns_bucket *nsb;
+       ldlm_ns_hash_def_t    *nsd;
+       cfs_hash_bd_t     bd;
+       int                 idx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+
+       rc = ldlm_get_ref();
+       if (rc) {
+               CERROR("ldlm_get_ref failed: %d\n", rc);
+               RETURN(NULL);
+       }
+
+       for (idx = 0; ; idx++) {
+               nsd = &ldlm_ns_hash_defs[idx];
+               if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) {
+                       CERROR("Unknown type %d for ns %s\n", ns_type, name);
+                       GOTO(out_ref, NULL);
+               }
+
+               if (nsd->nsd_type == ns_type)
+                       break;
+       }
+
+       OBD_ALLOC_PTR(ns);
+       if (!ns)
+               GOTO(out_ref, NULL);
+
+       ns->ns_rs_hash = cfs_hash_create(name,
+                                        nsd->nsd_all_bits, nsd->nsd_all_bits,
+                                        nsd->nsd_bkt_bits, sizeof(*nsb),
+                                        CFS_HASH_MIN_THETA,
+                                        CFS_HASH_MAX_THETA,
+                                        nsd->nsd_hops,
+                                        CFS_HASH_DEPTH |
+                                        CFS_HASH_BIGNAME |
+                                        CFS_HASH_SPIN_BKTLOCK |
+                                        CFS_HASH_NO_ITEMREF);
+       if (ns->ns_rs_hash == NULL)
+               GOTO(out_ns, NULL);
+
+       cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) {
+               nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+               at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0);
+               nsb->nsb_namespace = ns;
+       }
+
+       ns->ns_obd      = obd;
+       ns->ns_appetite = apt;
+       ns->ns_client   = client;
+
+       INIT_LIST_HEAD(&ns->ns_list_chain);
+       INIT_LIST_HEAD(&ns->ns_unused_list);
+       spin_lock_init(&ns->ns_lock);
+       atomic_set(&ns->ns_bref, 0);
+       init_waitqueue_head(&ns->ns_waitq);
+
+       ns->ns_max_nolock_size    = NS_DEFAULT_MAX_NOLOCK_BYTES;
+       ns->ns_contention_time    = NS_DEFAULT_CONTENTION_SECONDS;
+       ns->ns_contended_locks    = NS_DEFAULT_CONTENDED_LOCKS;
+
+       ns->ns_max_parallel_ast   = LDLM_DEFAULT_PARALLEL_AST_LIMIT;
+       ns->ns_nr_unused          = 0;
+       ns->ns_max_unused        = LDLM_DEFAULT_LRU_SIZE;
+       ns->ns_max_age      = LDLM_DEFAULT_MAX_ALIVE;
+       ns->ns_ctime_age_limit    = LDLM_CTIME_AGE_LIMIT;
+       ns->ns_timeouts    = 0;
+       ns->ns_orig_connect_flags = 0;
+       ns->ns_connect_flags      = 0;
+       ns->ns_stopping    = 0;
+       rc = ldlm_namespace_proc_register(ns);
+       if (rc != 0) {
+               CERROR("Can't initialize ns proc, rc %d\n", rc);
+               GOTO(out_hash, rc);
+       }
+
+       idx = atomic_read(ldlm_namespace_nr(client));
+       rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client);
+       if (rc) {
+               CERROR("Can't initialize lock pool, rc %d\n", rc);
+               GOTO(out_proc, rc);
+       }
+
+       ldlm_namespace_register(ns, client);
+       RETURN(ns);
+out_proc:
+       ldlm_namespace_proc_unregister(ns);
+       ldlm_namespace_cleanup(ns, 0);
+out_hash:
+       cfs_hash_putref(ns->ns_rs_hash);
+out_ns:
+       OBD_FREE_PTR(ns);
+out_ref:
+       ldlm_put_ref();
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(ldlm_namespace_new);
+
+extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
+
+/**
+ * Cancel and destroy all locks on a resource.
+ *
+ * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just
+ * clean up.  This is currently only used for recovery, and we make
+ * certain assumptions as a result--notably, that we shouldn't cancel
+ * locks with refs.
+ */
+static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
+                            __u64 flags)
+{
+       struct list_head *tmp;
+       int rc = 0, client = ns_is_client(ldlm_res_to_ns(res));
+       bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY);
+
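+       /* Repeatedly pick the first not-yet-cleaned lock under the resource
+        * lock, mark it CLEANED and take a reference, then drop the lock to
+        * cancel it; the CLEANED flag guarantees forward progress. */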
+       do {
+               struct ldlm_lock *lock = NULL;
+
+               /* First, look for a lock that has not been cleaned yet;
+                * all cleaned locks are marked with the CLEANED flag. */
+               lock_res(res);
+               list_for_each(tmp, q) {
+                       lock = list_entry(tmp, struct ldlm_lock,
+                                             l_res_link);
+                       if (lock->l_flags & LDLM_FL_CLEANED) {
+                               lock = NULL;
+                               continue;
+                       }
+                       LDLM_LOCK_GET(lock);
+                       lock->l_flags |= LDLM_FL_CLEANED;
+                       break;
+               }
+
+               if (lock == NULL) {
+                       unlock_res(res);
+                       break;
+               }
+
+               /* Set CBPENDING so nothing in the cancellation path
+                * can match this lock. */
+               lock->l_flags |= LDLM_FL_CBPENDING;
+               lock->l_flags |= LDLM_FL_FAILED;
+               lock->l_flags |= flags;
+
+               /* ... without sending a CANCEL message for local_only. */
+               if (local_only)
+                       lock->l_flags |= LDLM_FL_LOCAL_ONLY;
+
+               if (local_only && (lock->l_readers || lock->l_writers)) {
+                       /* This is a little bit gross, but much better than the
+                        * alternative: pretend that we got a blocking AST from
+                        * the server, so that when the lock is decref'd, it
+                        * will go away ... */
+                       unlock_res(res);
+                       LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY");
+                       if (lock->l_completion_ast)
+                               lock->l_completion_ast(lock, 0, NULL);
+                       LDLM_LOCK_RELEASE(lock);
+                       continue;
+               }
+
+               if (client) {
+                       struct lustre_handle lockh;
+
+                       unlock_res(res);
+                       ldlm_lock2handle(lock, &lockh);
+                       rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+                       if (rc)
+                               CERROR("ldlm_cli_cancel: %d\n", rc);
+               } else {
+                       ldlm_resource_unlink_lock(lock);
+                       unlock_res(res);
+                       LDLM_DEBUG(lock, "Freeing a lock still held by a "
+                                  "client node");
+                       ldlm_lock_destroy(lock);
+               }
+               LDLM_LOCK_RELEASE(lock);
+       } while (1);
+}
+
+static int ldlm_resource_clean(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                              struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       __u64 flags = *(__u64 *)arg;
+
+       cleanup_resource(res, &res->lr_granted, flags);
+       cleanup_resource(res, &res->lr_converting, flags);
+       cleanup_resource(res, &res->lr_waiting, flags);
+
+       return 0;
+}
+
+static int ldlm_resource_complain(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                 struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource  *res = cfs_hash_object(hs, hnode);
+
+       lock_res(res);
+       CERROR("Namespace %s resource refcount nonzero "
+              "(%d) after lock cleanup; forcing "
+              "cleanup.\n",
+              ldlm_ns_name(ldlm_res_to_ns(res)),
+              atomic_read(&res->lr_refcount) - 1);
+
+       CERROR("Resource: %p ("LPU64"/"LPU64"/"LPU64"/"
+              LPU64") (rc: %d)\n", res,
+              res->lr_name.name[0], res->lr_name.name[1],
+              res->lr_name.name[2], res->lr_name.name[3],
+              atomic_read(&res->lr_refcount) - 1);
+
+       ldlm_resource_dump(D_ERROR, res);
+       unlock_res(res);
+       return 0;
+}
+
+/**
+ * Cancel and destroy all locks in the namespace.
+ *
+ * Typically used during evictions when server notified client that it was
+ * evicted and all of its state needs to be destroyed.
+ * Also used during shutdown.
+ */
+int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags)
+{
+       if (ns == NULL) {
+               CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+               return ELDLM_OK;
+       }
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags);
+       cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL);
+       return ELDLM_OK;
+}
+EXPORT_SYMBOL(ldlm_namespace_cleanup);
+
+/**
+ * Attempts to free namespace.
+ *
+ * Only used when namespace goes away, like during an unmount.
+ */
+static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
+{
+       ENTRY;
+
+       /* At shutdown time, don't call the cancellation callback */
+       ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
+
+       if (atomic_read(&ns->ns_bref) > 0) {
+               struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               int rc;
+               CDEBUG(D_DLMTRACE,
+                      "dlm namespace %s free waiting on refcount %d\n",
+                      ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
+force_wait:
+               if (force)
+                       lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL);
+
+               rc = l_wait_event(ns->ns_waitq,
+                                 atomic_read(&ns->ns_bref) == 0, &lwi);
+
+               /* Forced cleanups should be able to reclaim all references,
+                * so it's safe to wait forever... we can't leak locks... */
+               if (force && rc == -ETIMEDOUT) {
+                       LCONSOLE_ERROR("Forced cleanup waiting for %s "
+                                      "namespace with %d resources in use, "
+                                      "(rc=%d)\n", ldlm_ns_name(ns),
+                                      atomic_read(&ns->ns_bref), rc);
+                       GOTO(force_wait, rc);
+               }
+
+               if (atomic_read(&ns->ns_bref)) {
+                       LCONSOLE_ERROR("Cleanup waiting for %s namespace "
+                                      "with %d resources in use, (rc=%d)\n",
+                                      ldlm_ns_name(ns),
+                                      atomic_read(&ns->ns_bref), rc);
+                       RETURN(ELDLM_NAMESPACE_EXISTS);
+               }
+               CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
+                      ldlm_ns_name(ns));
+       }
+
+       RETURN(ELDLM_OK);
+}
+
+/**
+ * Performs various cleanups for passed \a ns to make it drop its refcount
+ * and be ready for freeing. Waits for refcount == 0.
+ *
+ * The following is done:
+ * (0) Unregister \a ns from its list to make it inaccessible to potential
+ * users such as the pools thread;
+ * (1) Clear all locks in \a ns.
+ */
+void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
+                              struct obd_import *imp,
+                              int force)
+{
+       int rc;
+       ENTRY;
+       if (!ns) {
+               EXIT;
+               return;
+       }
+
+       spin_lock(&ns->ns_lock);
+       ns->ns_stopping = 1;
+       spin_unlock(&ns->ns_lock);
+
+       /*
+        * Can fail with -EINTR when force == 0 in which case try harder.
+        */
+       rc = __ldlm_namespace_free(ns, force);
+       if (rc != ELDLM_OK) {
+               if (imp) {
+                       ptlrpc_disconnect_import(imp, 0);
+                       ptlrpc_invalidate_import(imp);
+               }
+
+               /*
+                * With all requests dropped and the import inactive
+                * we are guaranteed that all references will be dropped.
+                */
+               rc = __ldlm_namespace_free(ns, 1);
+               LASSERT(rc == 0);
+       }
+       EXIT;
+}
+
+/**
+ * Frees the memory structures related to \a ns. This is only done when
+ * ldlm_namespace_free_prior() has successfully removed all resources
+ * referencing \a ns and its refcount has reached 0.
+ */
+void ldlm_namespace_free_post(struct ldlm_namespace *ns)
+{
+       ENTRY;
+       if (!ns) {
+               EXIT;
+               return;
+       }
+
+       /* Make sure that nobody can find this ns in its list. */
+       ldlm_namespace_unregister(ns, ns->ns_client);
+       /* Fini pool _before_ parent proc dir is removed. This is important as
+        * ldlm_pool_fini() removes own proc dir which is child to @dir.
+        * Removing it after @dir may cause oops. */
+       ldlm_pool_fini(&ns->ns_pool);
+
+       ldlm_namespace_proc_unregister(ns);
+       cfs_hash_putref(ns->ns_rs_hash);
+       /* Namespace \a ns should not be on any list at this point;
+        * otherwise the pools thread could end up using the freed \a ns. */
+       LASSERT(list_empty(&ns->ns_list_chain));
+       OBD_FREE_PTR(ns);
+       ldlm_put_ref();
+       EXIT;
+}
+
+/**
+ * Cleanup the resource, and free namespace.
+ * bug 12864:
+ * Deadlock issue:
+ * proc1: destroy import
+ *     class_disconnect_export(grab cl_sem) ->
+ *           -> ldlm_namespace_free ->
+ *           -> lprocfs_remove(grab _lprocfs_lock).
+ * proc2: read proc info
+ *     lprocfs_fops_read(grab _lprocfs_lock) ->
+ *           -> osc_rd_active, etc(grab cl_sem).
+ *
+ * To avoid this, ldlm_namespace_free is split into two parts: the first,
+ * ldlm_namespace_free_prior, cleans up resources that are still in use;
+ * the second, ldlm_namespace_free_post, unregisters the lprocfs entries
+ * and then frees the memory. The latter is called without cli->cl_sem
+ * held.
+ */
+void ldlm_namespace_free(struct ldlm_namespace *ns,
+                        struct obd_import *imp,
+                        int force)
+{
+       ldlm_namespace_free_prior(ns, imp, force);
+       ldlm_namespace_free_post(ns);
+}
+EXPORT_SYMBOL(ldlm_namespace_free);
+
+void ldlm_namespace_get(struct ldlm_namespace *ns)
+{
+       atomic_inc(&ns->ns_bref);
+}
+EXPORT_SYMBOL(ldlm_namespace_get);
+
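+/* Drop a namespace busy reference.  atomic_dec_and_lock() acquires
+ * ns_lock only when the count drops to zero, so the ns_waitq wakeup is
+ * issued under the lock exactly once per final put. */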
+void ldlm_namespace_put(struct ldlm_namespace *ns)
+{
+       if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) {
+               wake_up(&ns->ns_waitq);
+               spin_unlock(&ns->ns_lock);
+       }
+}
+EXPORT_SYMBOL(ldlm_namespace_put);
+
+/** Register \a ns in the list of namespaces */
+void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       mutex_lock(ldlm_namespace_lock(client));
+       LASSERT(list_empty(&ns->ns_list_chain));
+       list_add(&ns->ns_list_chain, ldlm_namespace_list(client));
+       atomic_inc(ldlm_namespace_nr(client));
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Unregister \a ns from the list of namespaces. */
+void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       mutex_lock(ldlm_namespace_lock(client));
+       LASSERT(!list_empty(&ns->ns_list_chain));
+       /* Some asserts and possibly other parts of the code are still
+        * using list_empty(&ns->ns_list_chain). This is why it is
+        * important to use list_del_init() here. */
+       list_del_init(&ns->ns_list_chain);
+       atomic_dec(ldlm_namespace_nr(client));
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client)
+{
+       LASSERT(!list_empty(&ns->ns_list_chain));
+       LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+       list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client));
+}
+
+/** Should be called with ldlm_namespace_lock(client) taken. */
+struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client)
+{
+       LASSERT(mutex_is_locked(ldlm_namespace_lock(client)));
+       LASSERT(!list_empty(ldlm_namespace_list(client)));
+       return container_of(ldlm_namespace_list(client)->next,
+               struct ldlm_namespace, ns_list_chain);
+}
+
+/** Create and initialize new resource. */
+static struct ldlm_resource *ldlm_resource_new(void)
+{
+       struct ldlm_resource *res;
+       int idx;
+
+       OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, __GFP_IO);
+       if (res == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&res->lr_granted);
+       INIT_LIST_HEAD(&res->lr_converting);
+       INIT_LIST_HEAD(&res->lr_waiting);
+
+       /* Initialize interval trees for each lock mode. */
+       for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+               res->lr_itree[idx].lit_size = 0;
+               res->lr_itree[idx].lit_mode = 1 << idx;
+               res->lr_itree[idx].lit_root = NULL;
+       }
+
+       atomic_set(&res->lr_refcount, 1);
+       spin_lock_init(&res->lr_lock);
+       lu_ref_init(&res->lr_reference);
+
+       /* The creator of the resource must unlock the mutex after LVB
+        * initialization. */
+       mutex_init(&res->lr_lvb_mutex);
+       mutex_lock(&res->lr_lvb_mutex);
+
+       return res;
+}
+
+/**
+ * Return a reference to resource with given name, creating it if necessary.
+ * Args: namespace with ns_lock unlocked
+ * Locks: takes and releases NS hash-lock and res->lr_lock
+ * Returns: referenced, unlocked ldlm_resource or NULL
+ */
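+/* The lookup below is optimistic: search under the shared bucket lock and
+ * record the bucket version, drop the lock, allocate a new resource, then
+ * relock exclusively; a second lookup is needed only if the version
+ * changed while the lock was dropped. */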
+struct ldlm_resource *
+ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent,
+                 const struct ldlm_res_id *name, ldlm_type_t type, int create)
+{
+       struct hlist_node     *hnode;
+       struct ldlm_resource *res;
+       cfs_hash_bd_t    bd;
+       __u64            version;
+
+       LASSERT(ns != NULL);
+       LASSERT(parent == NULL);
+       LASSERT(ns->ns_rs_hash != NULL);
+       LASSERT(name->name[0] != 0);
+
+       cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0);
+       hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+       if (hnode != NULL) {
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+               res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+               /* Synchronize with regard to resource creation. */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+                       mutex_lock(&res->lr_lvb_mutex);
+                       mutex_unlock(&res->lr_lvb_mutex);
+               }
+
+               if (unlikely(res->lr_lvb_len < 0)) {
+                       ldlm_resource_putref(res);
+                       res = NULL;
+               }
+               return res;
+       }
+
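+       /* Remember the bucket version before unlocking; if it is unchanged
+        * when we relock for the insert, nobody can have added this resource
+        * in the meantime and the second lookup is skipped. */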
+       version = cfs_hash_bd_version_get(&bd);
+       cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0);
+
+       if (create == 0)
+               return NULL;
+
+       LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE,
+                "type: %d\n", type);
+       res = ldlm_resource_new();
+       if (!res)
+               return NULL;
+
+       res->lr_ns_bucket  = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd);
+       res->lr_name       = *name;
+       res->lr_type       = type;
+       res->lr_most_restr = LCK_NL;
+
+       cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+       hnode = (version == cfs_hash_bd_version_get(&bd)) ?  NULL :
+               cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name);
+
+       if (hnode != NULL) {
+               /* Someone won the race and already added the resource. */
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               /* Clean lu_ref for failed resource. */
+               lu_ref_fini(&res->lr_reference);
+               /* We have taken lr_lvb_mutex. Drop it. */
+               mutex_unlock(&res->lr_lvb_mutex);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+               res = hlist_entry(hnode, struct ldlm_resource, lr_hash);
+               /* Synchronize with regard to resource creation. */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+                       mutex_lock(&res->lr_lvb_mutex);
+                       mutex_unlock(&res->lr_lvb_mutex);
+               }
+
+               if (unlikely(res->lr_lvb_len < 0)) {
+                       ldlm_resource_putref(res);
+                       res = NULL;
+               }
+               return res;
+       }
+       /* We won! Let's add the resource. */
+       cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash);
+       if (cfs_hash_bd_count_get(&bd) == 1)
+               ldlm_namespace_get(ns);
+
+       cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+       if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) {
+               int rc;
+
+               OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2);
+               rc = ns->ns_lvbo->lvbo_init(res);
+               if (rc < 0) {
+                       CERROR("lvbo_init failed for resource "
+                              LPU64": rc %d\n", name->name[0], rc);
+                       if (res->lr_lvb_data) {
+                               OBD_FREE(res->lr_lvb_data, res->lr_lvb_len);
+                               res->lr_lvb_data = NULL;
+                       }
+                       res->lr_lvb_len = rc;
+                       mutex_unlock(&res->lr_lvb_mutex);
+                       ldlm_resource_putref(res);
+                       return NULL;
+               }
+       }
+
+       /* We create resource with locked lr_lvb_mutex. */
+       mutex_unlock(&res->lr_lvb_mutex);
+
+       return res;
+}
+EXPORT_SYMBOL(ldlm_resource_get);
+
+struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res)
+{
+       LASSERT(res != NULL);
+       LASSERT(res != LP_POISON);
+       atomic_inc(&res->lr_refcount);
+       CDEBUG(D_INFO, "getref res: %p count: %d\n", res,
+              atomic_read(&res->lr_refcount));
+       return res;
+}
+
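+/* Final-put path, called with the bucket lock held: verify that no locks
+ * remain on the resource, unlink it from the hash, and drop the namespace
+ * busy reference once this bucket becomes empty. */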
+static void __ldlm_resource_putref_final(cfs_hash_bd_t *bd,
+                                        struct ldlm_resource *res)
+{
+       struct ldlm_ns_bucket *nsb = res->lr_ns_bucket;
+
+       if (!list_empty(&res->lr_granted)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       if (!list_empty(&res->lr_converting)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       if (!list_empty(&res->lr_waiting)) {
+               ldlm_resource_dump(D_ERROR, res);
+               LBUG();
+       }
+
+       cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash,
+                              bd, &res->lr_hash);
+       lu_ref_fini(&res->lr_reference);
+       if (cfs_hash_bd_count_get(bd) == 0)
+               ldlm_namespace_put(nsb->nsb_namespace);
+}
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+       cfs_hash_bd_t   bd;
+
+       LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "putref res: %p count: %d\n",
+              res, atomic_read(&res->lr_refcount) - 1);
+
+       cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd);
+       if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) {
+               __ldlm_resource_putref_final(&bd, res);
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+                       ns->ns_lvbo->lvbo_free(res);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ldlm_resource_putref);
+
+/* Returns 1 if the resource was freed, 0 if it remains. */
+int ldlm_resource_putref_locked(struct ldlm_resource *res)
+{
+       struct ldlm_namespace *ns = ldlm_res_to_ns(res);
+
+       LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "putref res: %p count: %d\n",
+              res, atomic_read(&res->lr_refcount) - 1);
+
+       if (atomic_dec_and_test(&res->lr_refcount)) {
+               cfs_hash_bd_t bd;
+
+               cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash,
+                               &res->lr_name, &bd);
+               __ldlm_resource_putref_final(&bd, res);
+               cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1);
+               /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, so we
+                * can never get here via cfs_hash_del; cfs_hash_for_each_nolock
+                * is the only path that reaches this point, and there it is
+                * safe to release cfs_hash_bd_lock.
+                */
+               if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free)
+                       ns->ns_lvbo->lvbo_free(res);
+               OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
+
+               cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1);
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * Add a lock into a given resource into specified lock list.
+ */
+void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
+                           struct ldlm_lock *lock)
+{
+       check_res_locked(res);
+
+       LDLM_DEBUG(lock, "About to add this lock:\n");
+
+       if (lock->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               return;
+       }
+
+       LASSERT(list_empty(&lock->l_res_link));
+
+       list_add_tail(&lock->l_res_link, head);
+}
+
+/**
+ * Insert a lock into resource after specified lock.
+ *
+ * Obtain resource description from the lock we are inserting after.
+ */
+void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
+                                    struct ldlm_lock *new)
+{
+       struct ldlm_resource *res = original->l_resource;
+
+       check_res_locked(res);
+
+       ldlm_resource_dump(D_INFO, res);
+       LDLM_DEBUG(new, "About to insert this lock after %p:\n", original);
+
+       if (new->l_destroyed) {
+               CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n");
+               goto out;
+       }
+
+       LASSERT(list_empty(&new->l_res_link));
+
+       list_add(&new->l_res_link, &original->l_res_link);
+ out:;
+}
+
+void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
+{
+       int type = lock->l_resource->lr_type;
+
+       check_res_locked(lock->l_resource);
+       if (type == LDLM_IBITS || type == LDLM_PLAIN)
+               ldlm_unlink_lock_skiplist(lock);
+       else if (type == LDLM_EXTENT)
+               ldlm_extent_unlink_lock(lock);
+       list_del_init(&lock->l_res_link);
+}
+EXPORT_SYMBOL(ldlm_resource_unlink_lock);
+
+void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
+{
+       desc->lr_type = res->lr_type;
+       desc->lr_name = res->lr_name;
+}
+
+/**
+ * Print information about all locks in all namespaces on this node to debug
+ * log.
+ */
+void ldlm_dump_all_namespaces(ldlm_side_t client, int level)
+{
+       struct list_head *tmp;
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       mutex_lock(ldlm_namespace_lock(client));
+
+       list_for_each(tmp, ldlm_namespace_list(client)) {
+               struct ldlm_namespace *ns;
+               ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain);
+               ldlm_namespace_dump(level, ns);
+       }
+
+       mutex_unlock(ldlm_namespace_lock(client));
+}
+EXPORT_SYMBOL(ldlm_dump_all_namespaces);
+
+static int ldlm_res_hash_dump(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                             struct hlist_node *hnode, void *arg)
+{
+       struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+       int    level = (int)(unsigned long)arg;
+
+       lock_res(res);
+       ldlm_resource_dump(level, res);
+       unlock_res(res);
+
+       return 0;
+}
+
+/**
+ * Print information about all locks in this namespace on this node to debug
+ * log.
+ */
+void ldlm_namespace_dump(int level, struct ldlm_namespace *ns)
+{
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n",
+              ldlm_ns_name(ns), atomic_read(&ns->ns_bref),
+              ns_is_client(ns) ? "client" : "server");
+
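+       /* Rate-limit the expensive per-resource dump: skip it if one ran in
+        * the last 10 seconds (ns_next_dump is advanced below). */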
+       if (cfs_time_before(cfs_time_current(), ns->ns_next_dump))
+               return;
+
+       cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                ldlm_res_hash_dump,
+                                (void *)(unsigned long)level);
+       spin_lock(&ns->ns_lock);
+       ns->ns_next_dump = cfs_time_shift(10);
+       spin_unlock(&ns->ns_lock);
+}
+EXPORT_SYMBOL(ldlm_namespace_dump);
+
+/**
+ * Print information about all locks in this resource to debug log.
+ */
+void ldlm_resource_dump(int level, struct ldlm_resource *res)
+{
+       struct ldlm_lock *lock;
+       unsigned int granted = 0;
+
+       CLASSERT(RES_NAME_SIZE == 4);
+
+       if (!((libcfs_debug | D_ERROR) & level))
+               return;
+
+       CDEBUG(level, "--- Resource: %p ("LPU64"/"LPU64"/"LPU64"/"LPU64
+              ") (rc: %d)\n", res, res->lr_name.name[0], res->lr_name.name[1],
+              res->lr_name.name[2], res->lr_name.name[3],
+              atomic_read(&res->lr_refcount));
+
+       if (!list_empty(&res->lr_granted)) {
+               CDEBUG(level, "Granted locks (in reverse order):\n");
+               list_for_each_entry_reverse(lock, &res->lr_granted,
+                                               l_res_link) {
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+                       if (!(level & D_CANTMASK) &&
+                           ++granted > ldlm_dump_granted_max) {
+                               CDEBUG(level, "only dump %d granted locks to "
+                                      "avoid DDOS.\n", granted);
+                               break;
+                       }
+               }
+       }
+       if (!list_empty(&res->lr_converting)) {
+               CDEBUG(level, "Converting locks:\n");
+               list_for_each_entry(lock, &res->lr_converting, l_res_link)
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+       }
+       if (!list_empty(&res->lr_waiting)) {
+               CDEBUG(level, "Waiting locks:\n");
+               list_for_each_entry(lock, &res->lr_waiting, l_res_link)
+                       LDLM_DEBUG_LIMIT(level, lock, "###");
+       }
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile
new file mode 100644 (file)
index 0000000..bf5c563
--- /dev/null
@@ -0,0 +1,21 @@
+obj-$(CONFIG_LUSTRE_FS) += libcfs.o
+
+libcfs-linux-objs := linux-tracefile.o linux-debug.o
+libcfs-linux-objs += linux-prim.o linux-cpu.o
+libcfs-linux-objs += linux-tcpip.o
+libcfs-linux-objs += linux-proc.o linux-curproc.o
+libcfs-linux-objs += linux-module.o
+libcfs-linux-objs += linux-crypto.o
+libcfs-linux-objs += linux-crypto-adler.o
+
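+# The Linux-specific objects live in the linux/ subdirectory; prefix them
+# so the build finds their sources.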
+libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))
+
+libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
+                  watchdog.o libcfs_string.o hash.o kernel_user_comm.o \
+                  prng.o workitem.o upcall_cache.o libcfs_cpu.o \
+                  libcfs_mem.o libcfs_lock.o
+
+libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
+
+ccflags-y := -I$(src)/../include
+ccflags-y += -I$(src)/
diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c
new file mode 100644 (file)
index 0000000..5a87b08
--- /dev/null
@@ -0,0 +1,476 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+static char debug_file_name[1024];
+
+unsigned int libcfs_subsystem_debug = ~0;
+CFS_MODULE_PARM(libcfs_subsystem_debug, "i", int, 0644,
+               "Lustre kernel debug subsystem mask");
+EXPORT_SYMBOL(libcfs_subsystem_debug);
+
+unsigned int libcfs_debug = (D_CANTMASK |
+                            D_NETERROR | D_HA | D_CONFIG | D_IOCTL);
+CFS_MODULE_PARM(libcfs_debug, "i", int, 0644,
+               "Lustre kernel debug mask");
+EXPORT_SYMBOL(libcfs_debug);
+
+unsigned int libcfs_debug_mb = 0;
+CFS_MODULE_PARM(libcfs_debug_mb, "i", uint, 0644,
+               "Total debug buffer size.");
+EXPORT_SYMBOL(libcfs_debug_mb);
+
+unsigned int libcfs_printk = D_CANTMASK;
+CFS_MODULE_PARM(libcfs_printk, "i", uint, 0644,
+               "Lustre kernel debug console mask");
+EXPORT_SYMBOL(libcfs_printk);
+
+unsigned int libcfs_console_ratelimit = 1;
+CFS_MODULE_PARM(libcfs_console_ratelimit, "i", uint, 0644,
+               "Lustre kernel debug console ratelimit (0 to disable)");
+EXPORT_SYMBOL(libcfs_console_ratelimit);
+
+unsigned int libcfs_console_max_delay;
+CFS_MODULE_PARM(libcfs_console_max_delay, "l", uint, 0644,
+               "Lustre kernel debug console max delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_max_delay);
+
+unsigned int libcfs_console_min_delay;
+CFS_MODULE_PARM(libcfs_console_min_delay, "l", uint, 0644,
+               "Lustre kernel debug console min delay (jiffies)");
+EXPORT_SYMBOL(libcfs_console_min_delay);
+
+unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF;
+CFS_MODULE_PARM(libcfs_console_backoff, "i", uint, 0644,
+               "Lustre kernel debug console backoff factor");
+EXPORT_SYMBOL(libcfs_console_backoff);
+
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack = 3 * THREAD_SIZE / 4;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+unsigned int libcfs_watchdog_ratelimit = 300;
+EXPORT_SYMBOL(libcfs_watchdog_ratelimit);
+
+unsigned int libcfs_panic_on_lbug = 1;
+CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644,
+               "Lustre kernel panic on LBUG");
+EXPORT_SYMBOL(libcfs_panic_on_lbug);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
+
+static wait_queue_head_t debug_ctlwq;
+
+char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT;
+
+/* We need to pass a pointer here, but elsewhere this must be a const */
+char *libcfs_debug_file_path;
+CFS_MODULE_PARM(libcfs_debug_file_path, "s", charp, 0644,
+               "Path for dumping debug logs, "
+               "set 'NONE' to prevent log dumping");
+
+int libcfs_panic_in_progress;
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_subsys2str(int subsys)
+{
+       switch (1 << subsys) {
+       default:
+               return NULL;
+       case S_UNDEFINED:
+               return "undefined";
+       case S_MDC:
+               return "mdc";
+       case S_MDS:
+               return "mds";
+       case S_OSC:
+               return "osc";
+       case S_OST:
+               return "ost";
+       case S_CLASS:
+               return "class";
+       case S_LOG:
+               return "log";
+       case S_LLITE:
+               return "llite";
+       case S_RPC:
+               return "rpc";
+       case S_LNET:
+               return "lnet";
+       case S_LND:
+               return "lnd";
+       case S_PINGER:
+               return "pinger";
+       case S_FILTER:
+               return "filter";
+       case S_ECHO:
+               return "echo";
+       case S_LDLM:
+               return "ldlm";
+       case S_LOV:
+               return "lov";
+       case S_LQUOTA:
+               return "lquota";
+       case S_OSD:
+               return "osd";
+       case S_LMV:
+               return "lmv";
+       case S_SEC:
+               return "sec";
+       case S_GSS:
+               return "gss";
+       case S_MGC:
+               return "mgc";
+       case S_MGS:
+               return "mgs";
+       case S_FID:
+               return "fid";
+       case S_FLD:
+               return "fld";
+       }
+}
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_dbg2str(int debug)
+{
+       switch (1 << debug) {
+       default:
+               return NULL;
+       case D_TRACE:
+               return "trace";
+       case D_INODE:
+               return "inode";
+       case D_SUPER:
+               return "super";
+       case D_EXT2:
+               return "ext2";
+       case D_MALLOC:
+               return "malloc";
+       case D_CACHE:
+               return "cache";
+       case D_INFO:
+               return "info";
+       case D_IOCTL:
+               return "ioctl";
+       case D_NETERROR:
+               return "neterror";
+       case D_NET:
+               return "net";
+       case D_WARNING:
+               return "warning";
+       case D_BUFFS:
+               return "buffs";
+       case D_OTHER:
+               return "other";
+       case D_DENTRY:
+               return "dentry";
+       case D_NETTRACE:
+               return "nettrace";
+       case D_PAGE:
+               return "page";
+       case D_DLMTRACE:
+               return "dlmtrace";
+       case D_ERROR:
+               return "error";
+       case D_EMERG:
+               return "emerg";
+       case D_HA:
+               return "ha";
+       case D_RPCTRACE:
+               return "rpctrace";
+       case D_VFSTRACE:
+               return "vfstrace";
+       case D_READA:
+               return "reada";
+       case D_MMAP:
+               return "mmap";
+       case D_CONFIG:
+               return "config";
+       case D_CONSOLE:
+               return "console";
+       case D_QUOTA:
+               return "quota";
+       case D_SEC:
+               return "sec";
+       case D_LFSCK:
+               return "lfsck";
+       }
+}
+
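+/* Render a debug/subsystem mask as a space-separated token string; for
+ * example, a mask with both the D_TRACE and D_INFO bits set renders as
+ * "trace info", tokens being emitted in order of increasing bit position.
+ * Returns the length the full string needs, even if it was truncated to
+ * 'size' bytes. */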
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+       const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                libcfs_debug_dbg2str;
+       int        len = 0;
+       const char   *token;
+       int        i;
+
+       if (mask == 0) {                        /* "0" */
+               if (size > 0)
+                       str[0] = '0';
+               len = 1;
+       } else {                                /* space-separated tokens */
+               for (i = 0; i < 32; i++) {
+                       if ((mask & (1 << i)) == 0)
+                               continue;
+
+                       token = fn(i);
+                       if (token == NULL)            /* unused bit */
+                               continue;
+
+                       if (len > 0) {            /* separator? */
+                               if (len < size)
+                                       str[len] = ' ';
+                               len++;
+                       }
+
+                       while (*token != 0) {
+                               if (len < size)
+                                       str[len] = *token;
+                               token++;
+                               len++;
+                       }
+               }
+       }
+
+       /* terminate 'str' */
+       if (len < size)
+               str[len] = 0;
+       else
+               str[size - 1] = 0;
+
+       return len;
+}
+
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+       const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                libcfs_debug_dbg2str;
+       int      m = 0;
+       int      matched;
+       int      n;
+       int      t;
+
+       /* Allow a number for backwards compatibility */
+
+       for (n = strlen(str); n > 0; n--)
+               if (!isspace(str[n-1]))
+                       break;
+       matched = n;
+
+       if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 &&
+           matched == n) {
+               /* don't print warning for lctl set_param debug=0 or -1 */
+               if (m != 0 && m != -1)
+                       CWARN("You are trying to use a numerical value for the "
+                             "mask - this will be deprecated in a future "
+                             "release.\n");
+               *mask = m;
+               return 0;
+       }
+
+       return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK,
+                           0xffffffff);
+}
+
+/**
+ * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages()
+ */
+void libcfs_debug_dumplog_internal(void *arg)
+{
+       DECL_JOURNAL_DATA;
+
+       PUSH_JOURNAL;
+
+       if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) {
+               snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+                        "%s.%ld." LPLD, libcfs_debug_file_path_arr,
+                        cfs_time_current_sec(), (long_ptr_t)arg);
+               printk(KERN_ALERT "LustreError: dumping log to %s\n",
+                      debug_file_name);
+               cfs_tracefile_dump_all_pages(debug_file_name);
+               libcfs_run_debug_log_upcall(debug_file_name);
+       }
+       POP_JOURNAL;
+}
+
+int libcfs_debug_dumplog_thread(void *arg)
+{
+       libcfs_debug_dumplog_internal(arg);
+       wake_up(&debug_ctlwq);
+       return 0;
+}
+
+void libcfs_debug_dumplog(void)
+{
+       wait_queue_t wait;
+       task_t    *dumper;
+       ENTRY;
+
+       /* we're being careful to ensure that the kernel thread is
+        * able to set our state to running as it exits before we
+        * get to schedule() */
+       init_waitqueue_entry_current(&wait);
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(&debug_ctlwq, &wait);
+
+       dumper = kthread_run(libcfs_debug_dumplog_thread,
+                            (void *)(long)current_pid(),
+                            "libcfs_debug_dumper");
+       if (IS_ERR(dumper))
+               printk(KERN_ERR "LustreError: cannot start log dump thread:"
+                      " %ld\n", PTR_ERR(dumper));
+       else
+               waitq_wait(&wait, TASK_INTERRUPTIBLE);
+
+       /* be sure to tear down if kthread_run() failed */
+       remove_wait_queue(&debug_ctlwq, &wait);
+       set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+
+int libcfs_debug_init(unsigned long bufsize)
+{
+       int    rc = 0;
+       unsigned int max = libcfs_debug_mb;
+
+       init_waitqueue_head(&debug_ctlwq);
+
+       if (libcfs_console_max_delay <= 0 || /* not set by user or */
+           libcfs_console_min_delay <= 0 || /* set to invalid values */
+           libcfs_console_min_delay >= libcfs_console_max_delay) {
+               libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY;
+               libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY;
+       }
+
+       if (libcfs_debug_file_path != NULL) {
+               memset(libcfs_debug_file_path_arr, 0, PATH_MAX);
+               strncpy(libcfs_debug_file_path_arr,
+                       libcfs_debug_file_path, PATH_MAX-1);
+       }
+
+       /* If libcfs_debug_mb is uninitialized or set to an invalid value,
+        * just make the total buffer size num_possible_cpus() * TCD_MAX_PAGES;
+        * otherwise split it across CPUs and convert from MB to pages. */
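+       /* For example, libcfs_debug_mb = 256 with 4 possible CPUs gives each
+        * CPU a 64 MB share, i.e. 64 << (20 - PAGE_CACHE_SHIFT) pages. */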
+       if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) {
+               max = TCD_MAX_PAGES;
+       } else {
+               max = (max / num_possible_cpus());
+               max = (max << (20 - PAGE_CACHE_SHIFT));
+       }
+       rc = cfs_tracefile_init(max);
+
+       if (rc == 0)
+               libcfs_register_panic_notifier();
+
+       return rc;
+}
+
+int libcfs_debug_cleanup(void)
+{
+       libcfs_unregister_panic_notifier();
+       cfs_tracefile_exit();
+       return 0;
+}
+
+int libcfs_debug_clear_buffer(void)
+{
+       cfs_trace_flush_pages();
+       return 0;
+}
+
+/* Debug markers, although printed by S_LNET,
+ * should not be marked as such. */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int libcfs_debug_mark_buffer(const char *text)
+{
+       CDEBUG(D_TRACE,"***************************************************\n");
+       LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text);
+       CDEBUG(D_TRACE,"***************************************************\n");
+
+       return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_LNET
+
+void libcfs_debug_set_level(unsigned int debug_level)
+{
+       printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
+              debug_level);
+       libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_set_level);
+
+long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc)
+{
+       libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n",
+                        rc, rc, rc);
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_log_return);
+
+void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label,
+                    long_ptr_t rc)
+{
+       libcfs_debug_msg(msgdata, "Process leaving via %s (rc=" LPLU " : " LPLD
+                        " : " LPLX ")\n", label, (ulong_ptr_t)rc, rc, rc);
+}
+EXPORT_SYMBOL(libcfs_log_goto);
diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c
new file mode 100644 (file)
index 0000000..c54448d
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores,
+ * CA 94065 USA or visit www.oracle.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Oracle Corporation, Inc.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+unsigned long cfs_fail_loc = 0;
+unsigned int cfs_fail_val = 0;
+wait_queue_head_t cfs_race_waitq;
+int cfs_race_state;
+
+EXPORT_SYMBOL(cfs_fail_loc);
+EXPORT_SYMBOL(cfs_fail_val);
+EXPORT_SYMBOL(cfs_race_waitq);
+EXPORT_SYMBOL(cfs_race_state);
+
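+/* Core of the fault-injection check; the calling macros have already
+ * matched the site id against cfs_fail_loc.  Mode bits: CFS_FAIL_RAND
+ * fails about 1/cfs_fail_val of the time, CFS_FAIL_SKIP skips the first
+ * cfs_fail_val hits and then fails, CFS_FAIL_SOME fails cfs_fail_val
+ * times, and CFS_FAIL_ONCE limits the site to a single failure.  Returns
+ * 1 when the caller should inject the failure. */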
+int __cfs_fail_check_set(__u32 id, __u32 value, int set)
+{
+       static atomic_t cfs_fail_count = ATOMIC_INIT(0);
+
+       LASSERT(!(id & CFS_FAIL_ONCE));
+
+       if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) ==
+           (CFS_FAILED | CFS_FAIL_ONCE)) {
+               atomic_set(&cfs_fail_count, 0); /* paranoia */
+               return 0;
+       }
+
+       /* Fail 1/cfs_fail_val times */
+       if (cfs_fail_loc & CFS_FAIL_RAND) {
+               if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0)
+                       return 0;
+       }
+
+       /* Skip the first cfs_fail_val, then fail */
+       if (cfs_fail_loc & CFS_FAIL_SKIP) {
+               if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val)
+                       return 0;
+       }
+
+       /* check cfs_fail_val... */
+       if (set == CFS_FAIL_LOC_VALUE) {
+               if (cfs_fail_val != -1 && cfs_fail_val != value)
+                       return 0;
+       }
+
+       /* Fail cfs_fail_val times, overridden by FAIL_ONCE */
+       if (cfs_fail_loc & CFS_FAIL_SOME &&
+           (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) {
+               int count = atomic_inc_return(&cfs_fail_count);
+
+               if (count >= cfs_fail_val) {
+                       set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+                       atomic_set(&cfs_fail_count, 0);
+                       /* we lost the race to increment */
+                       if (count > cfs_fail_val)
+                               return 0;
+               }
+       }
+
+       if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) &&
+           (value & CFS_FAIL_ONCE))
+               set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc);
+       /* Lost race to set CFS_FAILED_BIT. */
+       if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) {
+               /* If CFS_FAIL_ONCE is set, only one process may fail;
+                * otherwise multiple processes may fail at the same time. */
+               if (cfs_fail_loc & CFS_FAIL_ONCE)
+                       return 0;
+       }
+
+       switch (set) {
+               case CFS_FAIL_LOC_NOSET:
+               case CFS_FAIL_LOC_VALUE:
+                       break;
+               case CFS_FAIL_LOC_ORSET:
+                       cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE);
+                       break;
+               case CFS_FAIL_LOC_RESET:
+                       cfs_fail_loc = value;
+                       break;
+               default:
+                       LASSERTF(0, "called with bad set %u\n", set);
+                       break;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(__cfs_fail_check_set);
+
+int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set)
+{
+       int ret = 0;
+
+       ret = __cfs_fail_check_set(id, value, set);
+       if (ret) {
+               CERROR("cfs_fail_timeout id %x sleeping for %dms\n",
+                      id, ms);
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(ms) / 1000);
+               set_current_state(TASK_RUNNING);
+               CERROR("cfs_fail_timeout id %x awake\n", id);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__cfs_fail_timeout_set);
diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c
new file mode 100644 (file)
index 0000000..98c76df
--- /dev/null
@@ -0,0 +1,2123 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/hash.c
+ *
+ * Implement a hash class for the Lustre system.
+ *
+ * Author: YuZhangyong <yzy@clusterfs.com>
+ *
+ * 2008-08-15: Brian Behlendorf <behlendorf1@llnl.gov>
+ * - Simplified API and improved documentation
+ * - Added per-hash feature flags:
+ *   * CFS_HASH_DEBUG additional validation
+ *   * CFS_HASH_REHASH dynamic rehashing
+ * - Added per-hash statistics
+ * - General performance enhancements
+ *
+ * 2009-07-31: Liang Zhen <zhen.liang@sun.com>
+ * - moved everything to libcfs
+ * - don't allow cur_bits != max_bits unless CFS_HASH_REHASH is set
+ * - ignore hs_rwlock if CFS_HASH_REHASH is not set
+ * - buckets are allocated one by one (instead of as contiguous memory)
+ *   to avoid unnecessary cacheline conflicts
+ *
+ * 2010-03-01: Liang Zhen <zhen.liang@sun.com>
+ * - "bucket" is a group of hlist_head now, user can speicify bucket size
+ *   by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share
+ *   one lock for reducing memory overhead.
+ *
+ * - support lockless hash, caller will take care of locks:
+ *   avoid lock overhead for hash tables that are already protected
+ *   by locking in the caller for another reason
+ *
+ * - support both spin_lock and rwlock for buckets:
+ *   the overhead of spinlock contention is lower than the read/write
+ *   contention of an rwlock, so using a spinlock to serialize operations
+ *   on a bucket is more reasonable for frequently changed hash tables
+ *
+ * - support single-lock mode:
+ *   one lock protects all hash operations, avoiding the overhead of
+ *   multiple locks when the hash table is always small
+ *
+ * - removed a lot of unnecessary addref & decref on hash elements:
+ *   addref & decref are atomic operations, which are expensive in
+ *   many use-cases.
+ *
+ * - support non-blocking cfs_hash_add() and cfs_hash_findadd():
+ *   some Lustre use-cases require these functions to be strictly
+ *   non-blocking; in those cases we schedule any required rehash on
+ *   a different thread.
+ *
+ * - safer rehash on large hash tables
+ *   In the old implementation, the rehash function would exclusively
+ *   lock the hash table and finish the rehash in one batch; that is
+ *   dangerous on an SMP system because rehashing millions of elements
+ *   could take a long time. The new rehash implementation can release
+ *   the lock and relax the CPU in the middle of a rehash, so it is safe
+ *   for another thread to search/change the hash table even while it
+ *   is rehashing.
+ *
+ * - support two different refcount modes
+ *   . hash table has refcount on element
+ *   . hash table doesn't change refcount on adding/removing element
+ *
+ * - support long name hash table (for param-tree)
+ *
+ * - fix a bug in cfs_hash_rehash_key:
+ *   in the old implementation, cfs_hash_rehash_key could corrupt the
+ *   hash table because @key was overwritten without any protection.
+ *   Now the user must define hs_keycpy for rehash-enabled hash
+ *   tables; cfs_hash_rehash_key overwrites the hash key under the
+ *   lock by calling hs_keycpy.
+ *
+ * - better hash iteration:
+ *   Now we support both locked & lockless iteration of the hash
+ *   table. Also, the user can break the iteration by returning 1
+ *   from the callback.
+ */
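The hs_keycpy contract mentioned above is easiest to see with a small sketch. The example_entry type and example_keycpy callback below are hypothetical, invented for illustration only (they are not part of this patch); they assume the hs_keycpy signature used by cfs_hash_ops_t.

        struct example_entry {
                __u64                   ee_key;         /* hashed key */
                struct hlist_node       ee_node;        /* linkage into the hash */
        };

        /* copy the new key over the old one; cfs_hash_rehash_key() invokes
         * this through ops->hs_keycpy while the relevant lock is held, so
         * the key is never overwritten without protection */
        static void
        example_keycpy(struct hlist_node *hnode, void *key)
        {
                struct example_entry *ee;

                ee = hlist_entry(hnode, struct example_entry, ee_node);
                ee->ee_key = *(__u64 *)key;
        }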
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/seq_file.h>
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static unsigned int warn_on_depth = 8;
+CFS_MODULE_PARM(warn_on_depth, "i", uint, 0644,
+               "warning when hash depth is high.");
+#endif
+
+struct cfs_wi_sched *cfs_sched_rehash;
+
+static inline void
+cfs_hash_nl_lock(cfs_hash_lock_t *lock, int exclusive) {}
+
+static inline void
+cfs_hash_nl_unlock(cfs_hash_lock_t *lock, int exclusive) {}
+
+static inline void
+cfs_hash_spin_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+       spin_lock(&lock->spin);
+}
+
+static inline void
+cfs_hash_spin_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+       spin_unlock(&lock->spin);
+}
+
+static inline void
+cfs_hash_rw_lock(cfs_hash_lock_t *lock, int exclusive)
+{
+       if (!exclusive)
+               read_lock(&lock->rw);
+       else
+               write_lock(&lock->rw);
+}
+
+static inline void
+cfs_hash_rw_unlock(cfs_hash_lock_t *lock, int exclusive)
+{
+       if (!exclusive)
+               read_unlock(&lock->rw);
+       else
+               write_unlock(&lock->rw);
+}
+
+/** No lock hash */
+static cfs_hash_lock_ops_t cfs_hash_nl_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_nl_lock,
+       .hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** no bucket lock, one spinlock to protect everything */
+static cfs_hash_lock_ops_t cfs_hash_nbl_lops =
+{
+       .hs_lock        = cfs_hash_spin_lock,
+       .hs_unlock      = cfs_hash_spin_unlock,
+       .hs_bkt_lock    = cfs_hash_nl_lock,
+       .hs_bkt_unlock  = cfs_hash_nl_unlock,
+};
+
+/** spin bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops =
+{
+       .hs_lock        = cfs_hash_rw_lock,
+       .hs_unlock      = cfs_hash_rw_unlock,
+       .hs_bkt_lock    = cfs_hash_spin_lock,
+       .hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is enabled */
+static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops =
+{
+       .hs_lock        = cfs_hash_rw_lock,
+       .hs_unlock      = cfs_hash_rw_unlock,
+       .hs_bkt_lock    = cfs_hash_rw_lock,
+       .hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+/** spin bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_spin_lock,
+       .hs_bkt_unlock  = cfs_hash_spin_unlock,
+};
+
+/** rw bucket lock, rehash is disabled */
+static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops =
+{
+       .hs_lock        = cfs_hash_nl_lock,
+       .hs_unlock      = cfs_hash_nl_unlock,
+       .hs_bkt_lock    = cfs_hash_rw_lock,
+       .hs_bkt_unlock  = cfs_hash_rw_unlock,
+};
+
+static void
+cfs_hash_lock_setup(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_no_lock(hs)) {
+               hs->hs_lops = &cfs_hash_nl_lops;
+
+       } else if (cfs_hash_with_no_bktlock(hs)) {
+               hs->hs_lops = &cfs_hash_nbl_lops;
+               spin_lock_init(&hs->hs_lock.spin);
+
+       } else if (cfs_hash_with_rehash(hs)) {
+               rwlock_init(&hs->hs_lock.rw);
+
+               if (cfs_hash_with_rw_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_bkt_rw_lops;
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_bkt_spin_lops;
+               else
+                       LBUG();
+       } else {
+               if (cfs_hash_with_rw_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_nr_bkt_rw_lops;
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       hs->hs_lops = &cfs_hash_nr_bkt_spin_lops;
+               else
+                       LBUG();
+       }
+}
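For reference, the flag-to-lock-ops mapping implemented by cfs_hash_lock_setup() above works out as follows (a summary of this function only; the flags are the ones tested by the cfs_hash_with_*() helpers):

        /*
         *   CFS_HASH_NO_LOCK                    -> cfs_hash_nl_lops
         *   CFS_HASH_NO_BKTLOCK                 -> cfs_hash_nbl_lops (one spinlock)
         *   CFS_HASH_REHASH + rw bucket locks   -> cfs_hash_bkt_rw_lops
         *   CFS_HASH_REHASH + spin bucket locks -> cfs_hash_bkt_spin_lops
         *   no rehash       + rw bucket locks   -> cfs_hash_nr_bkt_rw_lops
         *   no rehash       + spin bucket locks -> cfs_hash_nr_bkt_spin_lops
         */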
+
+/**
+ * Simple hash head without depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+       struct hlist_head       hh_head;        /**< entries list */
+} cfs_hash_head_t;
+
+static int
+cfs_hash_hh_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_head_t);
+}
+
+static struct hlist_head *
+cfs_hash_hh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0];
+
+       return &head[bd->bd_offset].hh_head;
+}
+
+static int
+cfs_hash_hh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd));
+       return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_hh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       hlist_del_init(hnode);
+       return -1; /* unknown depth */
+}
+
+/**
+ * Simple hash head with depth tracking
+ * new element is always added to head of hlist
+ */
+typedef struct {
+       struct hlist_head       hd_head;        /**< entries list */
+       unsigned int        hd_depth;       /**< list length */
+} cfs_hash_head_dep_t;
+
+static int
+cfs_hash_hd_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_head_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_hd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_head_dep_t   *head;
+
+       head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].hd_head;
+}
+
+static int
+cfs_hash_hd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+                                              cfs_hash_head_dep_t, hd_head);
+       hlist_add_head(hnode, &hh->hd_head);
+       return ++hh->hd_depth;
+}
+
+static int
+cfs_hash_hd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd),
+                                              cfs_hash_head_dep_t, hd_head);
+       hlist_del_init(hnode);
+       return --hh->hd_depth;
+}
+
+/**
+ * double links hash head without depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+       struct hlist_head       dh_head;        /**< entries list */
+       struct hlist_node       *dh_tail;       /**< the last entry */
+} cfs_hash_dhead_t;
+
+static int
+cfs_hash_dh_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_dhead_t);
+}
+
+static struct hlist_head *
+cfs_hash_dh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_dhead_t *head;
+
+       head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].dh_head;
+}
+
+static int
+cfs_hash_dh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+                                           cfs_hash_dhead_t, dh_head);
+
+       if (dh->dh_tail != NULL) /* not empty */
+               hlist_add_after(dh->dh_tail, hnode);
+       else /* empty list */
+               hlist_add_head(hnode, &dh->dh_head);
+       dh->dh_tail = hnode;
+       return -1; /* unknown depth */
+}
+
+static int
+cfs_hash_dh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnd)
+{
+       cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd),
+                                           cfs_hash_dhead_t, dh_head);
+
+       if (hnd->next == NULL) { /* it's the tail */
+               dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL :
+                             container_of(hnd->pprev, struct hlist_node, next);
+       }
+       hlist_del_init(hnd);
+       return -1; /* unknown depth */
+}
+
+/**
+ * double links hash head with depth tracking
+ * new element is always added to tail of hlist
+ */
+typedef struct {
+       struct hlist_head       dd_head;        /**< entries list */
+       struct hlist_node       *dd_tail;       /**< the last entry */
+       unsigned int        dd_depth;       /**< list length */
+} cfs_hash_dhead_dep_t;
+
+static int
+cfs_hash_dd_hhead_size(cfs_hash_t *hs)
+{
+       return sizeof(cfs_hash_dhead_dep_t);
+}
+
+static struct hlist_head *
+cfs_hash_dd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd)
+{
+       cfs_hash_dhead_dep_t *head;
+
+       head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0];
+       return &head[bd->bd_offset].dd_head;
+}
+
+static int
+cfs_hash_dd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnode)
+{
+       cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+                                               cfs_hash_dhead_dep_t, dd_head);
+
+       if (dh->dd_tail != NULL) /* not empty */
+               hlist_add_after(dh->dd_tail, hnode);
+       else /* empty list */
+               hlist_add_head(hnode, &dh->dd_head);
+       dh->dd_tail = hnode;
+       return ++dh->dd_depth;
+}
+
+static int
+cfs_hash_dd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                     struct hlist_node *hnd)
+{
+       cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd),
+                                               cfs_hash_dhead_dep_t, dd_head);
+
+       if (hnd->next == NULL) { /* it's the tail */
+               dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL :
+                             container_of(hnd->pprev, struct hlist_node, next);
+       }
+       hlist_del_init(hnd);
+       return --dh->dd_depth;
+}
+
+static cfs_hash_hlist_ops_t cfs_hash_hh_hops = {
+       .hop_hhead      = cfs_hash_hh_hhead,
+       .hop_hhead_size = cfs_hash_hh_hhead_size,
+       .hop_hnode_add  = cfs_hash_hh_hnode_add,
+       .hop_hnode_del  = cfs_hash_hh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_hd_hops = {
+       .hop_hhead      = cfs_hash_hd_hhead,
+       .hop_hhead_size = cfs_hash_hd_hhead_size,
+       .hop_hnode_add  = cfs_hash_hd_hnode_add,
+       .hop_hnode_del  = cfs_hash_hd_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dh_hops = {
+       .hop_hhead      = cfs_hash_dh_hhead,
+       .hop_hhead_size = cfs_hash_dh_hhead_size,
+       .hop_hnode_add  = cfs_hash_dh_hnode_add,
+       .hop_hnode_del  = cfs_hash_dh_hnode_del,
+};
+
+static cfs_hash_hlist_ops_t cfs_hash_dd_hops = {
+       .hop_hhead      = cfs_hash_dd_hhead,
+       .hop_hhead_size = cfs_hash_dd_hhead_size,
+       .hop_hnode_add  = cfs_hash_dd_hnode_add,
+       .hop_hnode_del  = cfs_hash_dd_hnode_del,
+};
+
+static void
+cfs_hash_hlist_setup(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_add_tail(hs)) {
+               hs->hs_hops = cfs_hash_with_depth(hs) ?
+                             &cfs_hash_dd_hops : &cfs_hash_dh_hops;
+       } else {
+               hs->hs_hops = cfs_hash_with_depth(hs) ?
+                             &cfs_hash_hd_hops : &cfs_hash_hh_hops;
+       }
+}
+
+static void
+cfs_hash_bd_from_key(cfs_hash_t *hs, cfs_hash_bucket_t **bkts,
+                    unsigned int bits, const void *key, cfs_hash_bd_t *bd)
+{
+       unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1);
+
+       LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits);
+
+       bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)];
+       bd->bd_offset = index >> (bits - hs->hs_bkt_bits);
+}
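A worked example of the index split performed above, using illustrative values (bits = 10, hs_bkt_bits = 3):

        /*
         *   index = cfs_hash_id(hs, key, 0x3ff)          e.g. 677 (0x2a5)
         *   buckets in the table: 1 << (10 - 3)        = 128
         *   bd_bucket = bkts[677 & 127] = bkts[37]       low bits pick the bucket
         *   bd_offset = 677 >> 7        = 5              high bits pick the hlist
         *
         * so each bucket holds 1 << 3 = 8 hlist heads behind one bucket lock.
         */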
+
+void
+cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (likely(hs->hs_rehash_buckets == NULL)) {
+               cfs_hash_bd_from_key(hs, hs->hs_buckets,
+                                    hs->hs_cur_bits, key, bd);
+       } else {
+               LASSERT(hs->hs_rehash_bits != 0);
+               cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                                    hs->hs_rehash_bits, key, bd);
+       }
+}
+EXPORT_SYMBOL(cfs_hash_bd_get);
+
+static inline void
+cfs_hash_bd_dep_record(cfs_hash_t *hs, cfs_hash_bd_t *bd, int dep_cur)
+{
+       if (likely(dep_cur <= bd->bd_bucket->hsb_depmax))
+               return;
+
+       bd->bd_bucket->hsb_depmax = dep_cur;
+# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+       if (likely(warn_on_depth == 0 ||
+                  max(warn_on_depth, hs->hs_dep_max) >= dep_cur))
+               return;
+
+       spin_lock(&hs->hs_dep_lock);
+       hs->hs_dep_max  = dep_cur;
+       hs->hs_dep_bkt  = bd->bd_bucket->hsb_index;
+       hs->hs_dep_off  = bd->bd_offset;
+       hs->hs_dep_bits = hs->hs_cur_bits;
+       spin_unlock(&hs->hs_dep_lock);
+
+       cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi);
+# endif
+}
+
+void
+cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                      struct hlist_node *hnode)
+{
+       int             rc;
+
+       rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode);
+       cfs_hash_bd_dep_record(hs, bd, rc);
+       bd->bd_bucket->hsb_version++;
+       if (unlikely(bd->bd_bucket->hsb_version == 0))
+               bd->bd_bucket->hsb_version++;
+       bd->bd_bucket->hsb_count++;
+
+       if (cfs_hash_with_counter(hs))
+               atomic_inc(&hs->hs_count);
+       if (!cfs_hash_with_no_itemref(hs))
+               cfs_hash_get(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_add_locked);
+
+void
+cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                      struct hlist_node *hnode)
+{
+       hs->hs_hops->hop_hnode_del(hs, bd, hnode);
+
+       LASSERT(bd->bd_bucket->hsb_count > 0);
+       bd->bd_bucket->hsb_count--;
+       bd->bd_bucket->hsb_version++;
+       if (unlikely(bd->bd_bucket->hsb_version == 0))
+               bd->bd_bucket->hsb_version++;
+
+       if (cfs_hash_with_counter(hs)) {
+               LASSERT(atomic_read(&hs->hs_count) > 0);
+               atomic_dec(&hs->hs_count);
+       }
+       if (!cfs_hash_with_no_itemref(hs))
+               cfs_hash_put_locked(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_bd_del_locked);
+
+void
+cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old,
+                       cfs_hash_bd_t *bd_new, struct hlist_node *hnode)
+{
+       cfs_hash_bucket_t *obkt = bd_old->bd_bucket;
+       cfs_hash_bucket_t *nbkt = bd_new->bd_bucket;
+       int             rc;
+
+       if (cfs_hash_bd_compare(bd_old, bd_new) == 0)
+               return;
+
+       /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops
+        * in cfs_hash_bd_del/add_locked */
+       hs->hs_hops->hop_hnode_del(hs, bd_old, hnode);
+       rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode);
+       cfs_hash_bd_dep_record(hs, bd_new, rc);
+
+       LASSERT(obkt->hsb_count > 0);
+       obkt->hsb_count--;
+       obkt->hsb_version++;
+       if (unlikely(obkt->hsb_version == 0))
+               obkt->hsb_version++;
+       nbkt->hsb_count++;
+       nbkt->hsb_version++;
+       if (unlikely(nbkt->hsb_version == 0))
+               nbkt->hsb_version++;
+}
+EXPORT_SYMBOL(cfs_hash_bd_move_locked);
+
+enum {
+       /** always set, for sanity (avoid ZERO intent) */
+       CFS_HS_LOOKUP_MASK_FIND     = 1 << 0,
+       /** return entry with a ref */
+       CFS_HS_LOOKUP_MASK_REF      = 1 << 1,
+       /** add entry if not existing */
+       CFS_HS_LOOKUP_MASK_ADD      = 1 << 2,
+       /** delete entry, ignore other masks */
+       CFS_HS_LOOKUP_MASK_DEL      = 1 << 3,
+};
+
+typedef enum cfs_hash_lookup_intent {
+       /** return item w/o refcount */
+       CFS_HS_LOOKUP_IT_PEEK       = CFS_HS_LOOKUP_MASK_FIND,
+       /** return item with refcount */
+       CFS_HS_LOOKUP_IT_FIND       = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_REF),
+       /** return item w/o refcount if existed, otherwise add */
+       CFS_HS_LOOKUP_IT_ADD    = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_ADD),
+       /** return item with refcount if existed, otherwise add */
+       CFS_HS_LOOKUP_IT_FINDADD    = (CFS_HS_LOOKUP_IT_FIND |
+                                      CFS_HS_LOOKUP_MASK_ADD),
+       /** delete if existed */
+       CFS_HS_LOOKUP_IT_FINDDEL    = (CFS_HS_LOOKUP_MASK_FIND |
+                                      CFS_HS_LOOKUP_MASK_DEL)
+} cfs_hash_lookup_intent_t;
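Spelling out the bit arithmetic of the intents above:

        /*
         *   CFS_HS_LOOKUP_IT_PEEK    = FIND             = 0x1
         *   CFS_HS_LOOKUP_IT_FIND    = FIND | REF       = 0x3
         *   CFS_HS_LOOKUP_IT_ADD     = FIND | ADD       = 0x5
         *   CFS_HS_LOOKUP_IT_FINDADD = FIND | REF | ADD = 0x7
         *   CFS_HS_LOOKUP_IT_FINDDEL = FIND | DEL       = 0x9
         *
         * cfs_hash_bd_findadd_locked() below builds its intent at runtime as
         * CFS_HS_LOOKUP_IT_ADD | (!noref * CFS_HS_LOOKUP_MASK_REF), i.e. the
         * REF bit is ORed in only when the caller asked for a reference.
         */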
+
+static struct hlist_node *
+cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                         const void *key, struct hlist_node *hnode,
+                         cfs_hash_lookup_intent_t intent)
+{
+       struct hlist_head  *hhead = cfs_hash_bd_hhead(hs, bd);
+       struct hlist_node  *ehnode;
+       struct hlist_node  *match;
+       int  intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0;
+
+       /* with this function, we can avoid a lot of useless refcount ops,
+        * which are expensive atomic operations most of the time. */
+       match = intent_add ? NULL : hnode;
+       hlist_for_each(ehnode, hhead) {
+               if (!cfs_hash_keycmp(hs, key, ehnode))
+                       continue;
+
+               if (match != NULL && match != ehnode) /* can't match */
+                       continue;
+
+               /* match and ... */
+               if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) {
+                       cfs_hash_bd_del_locked(hs, bd, ehnode);
+                       return ehnode;
+               }
+
+               /* caller wants refcount? */
+               if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0)
+                       cfs_hash_get(hs, ehnode);
+               return ehnode;
+       }
+       /* no match item */
+       if (!intent_add)
+               return NULL;
+
+       LASSERT(hnode != NULL);
+       cfs_hash_bd_add_locked(hs, bd, hnode);
+       return hnode;
+}
+
+struct hlist_node *
+cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+                                        CFS_HS_LOOKUP_IT_FIND);
+}
+EXPORT_SYMBOL(cfs_hash_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, NULL,
+                                        CFS_HS_LOOKUP_IT_PEEK);
+}
+EXPORT_SYMBOL(cfs_hash_bd_peek_locked);
+
+struct hlist_node *
+cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          const void *key, struct hlist_node *hnode,
+                          int noref)
+{
+       return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+                                        CFS_HS_LOOKUP_IT_ADD |
+                                        (!noref * CFS_HS_LOOKUP_MASK_REF));
+}
+EXPORT_SYMBOL(cfs_hash_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          const void *key, struct hlist_node *hnode)
+{
+       /* hnode can be NULL, we find the first item with @key */
+       return cfs_hash_bd_lookup_intent(hs, bd, key, hnode,
+                                        CFS_HS_LOOKUP_IT_FINDDEL);
+}
+EXPORT_SYMBOL(cfs_hash_bd_finddel_locked);
+
+static void
+cfs_hash_multi_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                      unsigned n, int excl)
+{
+       cfs_hash_bucket_t *prev = NULL;
+       int             i;
+
+       /**
+        * bds must be ordered by ascending bd->bd_bucket->hsb_index.
+        * NB: it's possible that several bds point to the same bucket but
+        * have different bd::bd_offset, so take care to avoid deadlock.
+        */
+       cfs_hash_for_each_bd(bds, n, i) {
+               if (prev == bds[i].bd_bucket)
+                       continue;
+
+               LASSERT(prev == NULL ||
+                       prev->hsb_index < bds[i].bd_bucket->hsb_index);
+               cfs_hash_bd_lock(hs, &bds[i], excl);
+               prev = bds[i].bd_bucket;
+       }
+}
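Why the ascending order matters: without a global lock order, two threads taking the same pair of bucket locks in opposite orders could each hold one lock while waiting for the other (an ABBA deadlock). A sketch of the failure this prevents:

        /*
         *   thread A: lock(bucket 3); lock(bucket 7);   <- sorted order
         *   thread B: lock(bucket 3); lock(bucket 7);   <- same order, waits
         *
         * had thread B been allowed to take bucket 7 first, A and B could
         * each hold one lock forever.  The "prev" check above also skips
         * duplicate buckets so the same lock is never acquired twice.
         */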
+
+static void
+cfs_hash_multi_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                        unsigned n, int excl)
+{
+       cfs_hash_bucket_t *prev = NULL;
+       int             i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               if (prev != bds[i].bd_bucket) {
+                       cfs_hash_bd_unlock(hs, &bds[i], excl);
+                       prev = bds[i].bd_bucket;
+               }
+       }
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               unsigned n, const void *key)
+{
+       struct hlist_node  *ehnode;
+       unsigned           i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL,
+                                                  CFS_HS_LOOKUP_IT_FIND);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+       return NULL;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_findadd_locked(cfs_hash_t *hs,
+                                cfs_hash_bd_t *bds, unsigned n, const void *key,
+                                struct hlist_node *hnode, int noref)
+{
+       struct hlist_node  *ehnode;
+       int             intent;
+       unsigned           i;
+
+       LASSERT(hnode != NULL);
+       intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF);
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key,
+                                                  NULL, intent);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+
+       if (i == 1) { /* only one bucket */
+               cfs_hash_bd_add_locked(hs, &bds[0], hnode);
+       } else {
+               cfs_hash_bd_t      mybd;
+
+               cfs_hash_bd_get(hs, key, &mybd);
+               cfs_hash_bd_add_locked(hs, &mybd, hnode);
+       }
+
+       return hnode;
+}
+
+static struct hlist_node *
+cfs_hash_multi_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                                unsigned n, const void *key,
+                                struct hlist_node *hnode)
+{
+       struct hlist_node  *ehnode;
+       unsigned           i;
+
+       cfs_hash_for_each_bd(bds, n, i) {
+               ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode,
+                                                  CFS_HS_LOOKUP_IT_FINDDEL);
+               if (ehnode != NULL)
+                       return ehnode;
+       }
+       return NULL;
+}
+
+static void
+cfs_hash_bd_order(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2)
+{
+       int     rc;
+
+       if (bd2->bd_bucket == NULL)
+               return;
+
+       if (bd1->bd_bucket == NULL) {
+               *bd1 = *bd2;
+               bd2->bd_bucket = NULL;
+               return;
+       }
+
+       rc = cfs_hash_bd_compare(bd1, bd2);
+       if (rc == 0) {
+               bd2->bd_bucket = NULL;
+
+       } else if (rc > 0) { /* swap bd1 and bd2 */
+               cfs_hash_bd_t tmp;
+
+               tmp = *bd2;
+               *bd2 = *bd1;
+               *bd1 = tmp;
+       }
+}
+
+void
+cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds)
+{
+       /* NB: caller should hold hs_lock.rw if REHASH is set */
+       cfs_hash_bd_from_key(hs, hs->hs_buckets,
+                            hs->hs_cur_bits, key, &bds[0]);
+       if (likely(hs->hs_rehash_buckets == NULL)) {
+               /* no rehash or not rehashing */
+               bds[1].bd_bucket = NULL;
+               return;
+       }
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                            hs->hs_rehash_bits, key, &bds[1]);
+
+       cfs_hash_bd_order(&bds[0], &bds[1]);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_get);
+
+void
+cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_multi_bd_lock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lock);
+
+void
+cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl)
+{
+       cfs_hash_multi_bd_unlock(hs, bds, 2, excl);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_unlock);
+
+struct hlist_node *
+cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                              const void *key)
+{
+       return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode,
+                               int noref)
+{
+       return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
+                                               hnode, noref);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked);
+
+struct hlist_node *
+cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds,
+                               const void *key, struct hlist_node *hnode)
+{
+       return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked);
+
+static void
+cfs_hash_buckets_free(cfs_hash_bucket_t **buckets,
+                     int bkt_size, int prev_size, int size)
+{
+       int     i;
+
+       for (i = prev_size; i < size; i++) {
+               if (buckets[i] != NULL)
+                       LIBCFS_FREE(buckets[i], bkt_size);
+       }
+
+       LIBCFS_FREE(buckets, sizeof(buckets[0]) * size);
+}
+
+/*
+ * Create or grow bucket memory. Return old_buckets if no allocation was
+ * needed, the newly allocated buckets if allocation was needed and
+ * successful, and NULL on error.
+ */
+static cfs_hash_bucket_t **
+cfs_hash_buckets_realloc(cfs_hash_t *hs, cfs_hash_bucket_t **old_bkts,
+                        unsigned int old_size, unsigned int new_size)
+{
+       cfs_hash_bucket_t **new_bkts;
+       int              i;
+
+       LASSERT(old_size == 0 || old_bkts != NULL);
+
+       if (old_bkts != NULL && old_size == new_size)
+               return old_bkts;
+
+       LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size);
+       if (new_bkts == NULL)
+               return NULL;
+
+       if (old_bkts != NULL) {
+               memcpy(new_bkts, old_bkts,
+                      min(old_size, new_size) * sizeof(*old_bkts));
+       }
+
+       for (i = old_size; i < new_size; i++) {
+               struct hlist_head *hhead;
+               cfs_hash_bd_t     bd;
+
+               LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs));
+               if (new_bkts[i] == NULL) {
+                       cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
+                                             old_size, new_size);
+                       return NULL;
+               }
+
+               new_bkts[i]->hsb_index   = i;
+               new_bkts[i]->hsb_version = 1;  /* shouldn't be zero */
+               new_bkts[i]->hsb_depmax  = -1; /* unknown */
+               bd.bd_bucket = new_bkts[i];
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
+                       INIT_HLIST_HEAD(hhead);
+
+               if (cfs_hash_with_no_lock(hs) ||
+                   cfs_hash_with_no_bktlock(hs))
+                       continue;
+
+               if (cfs_hash_with_rw_bktlock(hs))
+                       rwlock_init(&new_bkts[i]->hsb_lock.rw);
+               else if (cfs_hash_with_spin_bktlock(hs))
+                       spin_lock_init(&new_bkts[i]->hsb_lock.spin);
+               else
+                       LBUG(); /* invalid use-case */
+       }
+       return new_bkts;
+}
+
+/**
+ * Initialize new libcfs hash, where:
+ * @name     - Descriptive hash name
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
+ * @ops      - Registered hash table operations
+ * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
+ *           - CFS_HASH_SORT enable chained hash sort
+ */
+static int cfs_hash_rehash_worker(cfs_workitem_t *wi);
+
+#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
+static int cfs_hash_dep_print(cfs_workitem_t *wi)
+{
+       cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_dep_wi);
+       int      dep;
+       int      bkt;
+       int      off;
+       int      bits;
+
+       spin_lock(&hs->hs_dep_lock);
+       dep  = hs->hs_dep_max;
+       bkt  = hs->hs_dep_bkt;
+       off  = hs->hs_dep_off;
+       bits = hs->hs_dep_bits;
+       spin_unlock(&hs->hs_dep_lock);
+
+       LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
+                     hs->hs_name, bits, dep, bkt, off);
+       spin_lock(&hs->hs_dep_lock);
+       hs->hs_dep_bits = 0; /* mark as workitem done */
+       spin_unlock(&hs->hs_dep_lock);
+       return 0;
+}
+
+static void cfs_hash_depth_wi_init(cfs_hash_t *hs)
+{
+       spin_lock_init(&hs->hs_dep_lock);
+       cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print);
+}
+
+static void cfs_hash_depth_wi_cancel(cfs_hash_t *hs)
+{
+       if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi))
+               return;
+
+       spin_lock(&hs->hs_dep_lock);
+       while (hs->hs_dep_bits != 0) {
+               spin_unlock(&hs->hs_dep_lock);
+               cond_resched();
+               spin_lock(&hs->hs_dep_lock);
+       }
+       spin_unlock(&hs->hs_dep_lock);
+}
+
+#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
+
+static inline void cfs_hash_depth_wi_init(cfs_hash_t *hs) {}
+static inline void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) {}
+
+#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */
+
+cfs_hash_t *
+cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits,
+               unsigned bkt_bits, unsigned extra_bytes,
+               unsigned min_theta, unsigned max_theta,
+               cfs_hash_ops_t *ops, unsigned flags)
+{
+       cfs_hash_t *hs;
+       int      len;
+
+       ENTRY;
+
+       CLASSERT(CFS_HASH_THETA_BITS < 15);
+
+       LASSERT(name != NULL);
+       LASSERT(ops != NULL);
+       LASSERT(ops->hs_key);
+       LASSERT(ops->hs_hash);
+       LASSERT(ops->hs_object);
+       LASSERT(ops->hs_keycmp);
+       LASSERT(ops->hs_get != NULL);
+       LASSERT(ops->hs_put_locked != NULL);
+
+       if ((flags & CFS_HASH_REHASH) != 0)
+               flags |= CFS_HASH_COUNTER; /* must have counter */
+
+       LASSERT(cur_bits > 0);
+       LASSERT(cur_bits >= bkt_bits);
+       LASSERT(max_bits >= cur_bits && max_bits < 31);
+       LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits));
+       LASSERT(ergo((flags & CFS_HASH_REHASH) != 0,
+                    (flags & CFS_HASH_NO_LOCK) == 0));
+       LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0,
+                     ops->hs_keycpy != NULL));
+
+       len = (flags & CFS_HASH_BIGNAME) == 0 ?
+             CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN;
+       LIBCFS_ALLOC(hs, offsetof(cfs_hash_t, hs_name[len]));
+       if (hs == NULL)
+               RETURN(NULL);
+
+       strncpy(hs->hs_name, name, len);
+       hs->hs_name[len - 1] = '\0';
+       hs->hs_flags = flags;
+
+       atomic_set(&hs->hs_refcount, 1);
+       atomic_set(&hs->hs_count, 0);
+
+       cfs_hash_lock_setup(hs);
+       cfs_hash_hlist_setup(hs);
+
+       hs->hs_cur_bits = (__u8)cur_bits;
+       hs->hs_min_bits = (__u8)cur_bits;
+       hs->hs_max_bits = (__u8)max_bits;
+       hs->hs_bkt_bits = (__u8)bkt_bits;
+
+       hs->hs_ops       = ops;
+       hs->hs_extra_bytes = extra_bytes;
+       hs->hs_rehash_bits = 0;
+       cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker);
+       cfs_hash_depth_wi_init(hs);
+
+       if (cfs_hash_with_rehash(hs))
+               __cfs_hash_set_theta(hs, min_theta, max_theta);
+
+       hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0,
+                                                 CFS_HASH_NBKT(hs));
+       if (hs->hs_buckets != NULL)
+               return hs;
+
+       LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[len]));
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(cfs_hash_create);
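A minimal usage sketch of cfs_hash_create() follows. All ex_* names are hypothetical and the callbacks are assumed to be implemented elsewhere with the cfs_hash_ops_t signatures; CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA and CFS_HASH_DEFAULT are assumed to be the usual constants from libcfs_hash.h.

        static cfs_hash_ops_t ex_hash_ops = {
                .hs_hash        = ex_hash,              /* key -> hlist index */
                .hs_key         = ex_key,               /* hnode -> key */
                .hs_keycmp      = ex_keycmp,            /* compare key with hnode */
                .hs_object      = ex_object,            /* hnode -> object */
                .hs_get         = ex_get,               /* take a reference */
                .hs_put_locked  = ex_put_locked,        /* drop ref, bucket locked */
        };

        static cfs_hash_t *
        ex_hash_init(void)
        {
                /* 2^7 hlist heads initially, growable to 2^12, with 2^3
                 * hlist heads per bucket (and hence per bucket lock) */
                return cfs_hash_create("ex_hash", 7, 12, 3, 0,
                                       CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA,
                                       &ex_hash_ops, CFS_HASH_DEFAULT);
        }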
+
+/**
+ * Cleanup libcfs hash @hs.
+ */
+static void
+cfs_hash_destroy(cfs_hash_t *hs)
+{
+       struct hlist_node     *hnode;
+       struct hlist_node     *pos;
+       cfs_hash_bd_t    bd;
+       int                i;
+       ENTRY;
+
+       LASSERT(hs != NULL);
+       LASSERT(!cfs_hash_is_exiting(hs) &&
+               !cfs_hash_is_iterating(hs));
+
+       /**
+        * prohibit further rehashes; no lock is needed because
+        * I'm the only (last) one who can change it.
+        */
+       hs->hs_exiting = 1;
+       if (cfs_hash_with_rehash(hs))
+               cfs_hash_rehash_cancel(hs);
+
+       cfs_hash_depth_wi_cancel(hs);
+       /* rehash should be done/canceled */
+       LASSERT(hs->hs_buckets != NULL &&
+               hs->hs_rehash_buckets == NULL);
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               LASSERT(bd.bd_bucket != NULL);
+               /* taking this lock is not strictly needed, just for consistency */
+               cfs_hash_bd_lock(hs, &bd, 1);
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       hlist_for_each_safe(hnode, pos, hhead) {
+                               LASSERTF(!cfs_hash_with_assert_empty(hs),
+                                        "hash %s bucket %u(%u) is not "
+                                        "empty: %u items left\n",
+                                        hs->hs_name, bd.bd_bucket->hsb_index,
+                                        bd.bd_offset, bd.bd_bucket->hsb_count);
+                               /* can't assert key validity, because
+                                * rehash may have been interrupted */
+                               cfs_hash_bd_del_locked(hs, &bd, hnode);
+                               cfs_hash_exit(hs, hnode);
+                       }
+               }
+               LASSERT(bd.bd_bucket->hsb_count == 0);
+               cfs_hash_bd_unlock(hs, &bd, 1);
+               cond_resched();
+       }
+
+       LASSERT(atomic_read(&hs->hs_count) == 0);
+
+       cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs),
+                             0, CFS_HASH_NBKT(hs));
+       i = cfs_hash_with_bigname(hs) ?
+           CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN;
+       LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[i]));
+
+       EXIT;
+}
+
+cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs)
+{
+       if (atomic_inc_not_zero(&hs->hs_refcount))
+               return hs;
+       return NULL;
+}
+EXPORT_SYMBOL(cfs_hash_getref);
+
+void cfs_hash_putref(cfs_hash_t *hs)
+{
+       if (atomic_dec_and_test(&hs->hs_refcount))
+               cfs_hash_destroy(hs);
+}
+EXPORT_SYMBOL(cfs_hash_putref);
+
+static inline int
+cfs_hash_rehash_bits(cfs_hash_t *hs)
+{
+       if (cfs_hash_with_no_lock(hs) ||
+           !cfs_hash_with_rehash(hs))
+               return -EOPNOTSUPP;
+
+       if (unlikely(cfs_hash_is_exiting(hs)))
+               return -ESRCH;
+
+       if (unlikely(cfs_hash_is_rehashing(hs)))
+               return -EALREADY;
+
+       if (unlikely(cfs_hash_is_iterating(hs)))
+               return -EAGAIN;
+
+       /* XXX: need to handle case with max_theta != 2.0
+        *      and the case with min_theta != 0.5 */
+       if ((hs->hs_cur_bits < hs->hs_max_bits) &&
+           (__cfs_hash_theta(hs) > hs->hs_max_theta))
+               return hs->hs_cur_bits + 1;
+
+       if (!cfs_hash_with_shrink(hs))
+               return 0;
+
+       if ((hs->hs_cur_bits > hs->hs_min_bits) &&
+           (__cfs_hash_theta(hs) < hs->hs_min_theta))
+               return hs->hs_cur_bits - 1;
+
+       return 0;
+}
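A worked example of the grow decision above, assuming theta is the average chain load hs_count / 2^cur_bits (libcfs keeps it in fixed point scaled by CFS_HASH_THETA_BITS):

        /*
         *   cur_bits = 10 (1024 hlist heads), max_theta = 2.0:
         *     hs_count = 2500 -> theta ~ 2.44 > 2.0 -> return cur_bits + 1 (11)
         *     hs_count = 1500 -> theta ~ 1.46       -> 0, no rehash
         *
         * shrinking is symmetric against min_theta, and is only attempted
         * when the hash was created with shrink support.
         */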
+
+/**
+ * don't allow inline rehash if:
+ * - the user wants non-blocking changes (add/del) on the hash table
+ * - there are too many elements
+ */
+static inline int
+cfs_hash_rehash_inline(cfs_hash_t *hs)
+{
+       return !cfs_hash_with_nblk_change(hs) &&
+              atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called when the item is added.
+ */
+void
+cfs_hash_add(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bd;
+       int          bits;
+
+       LASSERT(hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_bd_get_and_lock(hs, key, &bd, 1);
+
+       cfs_hash_key_validate(hs, key, hnode);
+       cfs_hash_bd_add_locked(hs, &bd, hnode);
+
+       cfs_hash_bd_unlock(hs, &bd, 1);
+
+       bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+}
+EXPORT_SYMBOL(cfs_hash_add);
+
+static struct hlist_node *
+cfs_hash_find_or_add(cfs_hash_t *hs, const void *key,
+                    struct hlist_node *hnode, int noref)
+{
+       struct hlist_node *ehnode;
+       cfs_hash_bd_t     bds[2];
+       int            bits = 0;
+
+       LASSERT(hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+       cfs_hash_key_validate(hs, key, hnode);
+       ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key,
+                                                hnode, noref);
+       cfs_hash_dual_bd_unlock(hs, bds, 1);
+
+       if (ehnode == hnode) /* new item added */
+               bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+       return ehnode;
+}
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  The registered
+ * ops->hs_get function will be called if the item was added.
+ * Returns 0 on success or -EALREADY on key collisions.
+ */
+int
+cfs_hash_add_unique(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ?
+              -EALREADY : 0;
+}
+EXPORT_SYMBOL(cfs_hash_add_unique);
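Reusing the hypothetical example_entry sketched earlier, a typical unique insertion looks like this:

        static int
        ex_insert_unique(cfs_hash_t *hs, struct example_entry *ee)
        {
                int rc;

                rc = cfs_hash_add_unique(hs, &ee->ee_key, &ee->ee_node);
                if (rc == -EALREADY)
                        CDEBUG(D_INFO, "duplicate key, entry not added\n");
                return rc;      /* 0 on success */
        }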
+
+/**
+ * Add item @hnode to libcfs hash @hs using @key.  If this @key
+ * already exists in the hash then ops->hs_get will be called on the
+ * conflicting entry and that entry will be returned to the caller.
+ * Otherwise ops->hs_get is called on the item which was added.
+ */
+void *
+cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key,
+                       struct hlist_node *hnode)
+{
+       hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
+
+       return cfs_hash_object(hs, hnode);
+}
+EXPORT_SYMBOL(cfs_hash_findadd_unique);
+
+/**
+ * Delete item @hnode from the libcfs hash @hs using @key.  The @key
+ * is required to ensure the correct hash bucket is locked since there
+ * is no direct linkage from the item to the bucket.  The object
+ * removed from the hash will be returned and ops->hs_put is called
+ * on the removed object.
+ */
+void *
+cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode)
+{
+       void       *obj  = NULL;
+       int          bits = 0;
+       cfs_hash_bd_t   bds[2];
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
+
+       /* NB: do nothing if @hnode is not in hash table */
+       if (hnode == NULL || !hlist_unhashed(hnode)) {
+               if (bds[1].bd_bucket == NULL && hnode != NULL) {
+                       cfs_hash_bd_del_locked(hs, &bds[0], hnode);
+               } else {
+                       hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
+                                                               key, hnode);
+               }
+       }
+
+       if (hnode != NULL) {
+               obj  = cfs_hash_object(hs, hnode);
+               bits = cfs_hash_rehash_bits(hs);
+       }
+
+       cfs_hash_dual_bd_unlock(hs, bds, 1);
+       cfs_hash_unlock(hs, 0);
+       if (bits > 0)
+               cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
+
+       return obj;
+}
+EXPORT_SYMBOL(cfs_hash_del);
+
+/**
+ * Delete item given @key in libcfs hash @hs.  The first @key found in
+ * the hash will be removed; if the key exists multiple times in the
+ * hash @hs, this function must be called once per instance.  The
+ * removed object will be returned and ops->hs_put is called on the
+ * removed object.
+ */
+void *
+cfs_hash_del_key(cfs_hash_t *hs, const void *key)
+{
+       return cfs_hash_del(hs, key, NULL);
+}
+EXPORT_SYMBOL(cfs_hash_del_key);
+
+/**
+ * Lookup an item using @key in the libcfs hash @hs and return it.
+ * If the @key is found in the hash, hs->hs_get() is called and the
+ * matching object is returned.  It is the caller's responsibility
+ * to call the counterpart ops->hs_put using the cfs_hash_put() macro
+ * when finished with the object.  If the @key was not found
+ * in the hash @hs, NULL is returned.
+ */
+void *
+cfs_hash_lookup(cfs_hash_t *hs, const void *key)
+{
+       void             *obj = NULL;
+       struct hlist_node     *hnode;
+       cfs_hash_bd_t    bds[2];
+
+       cfs_hash_lock(hs, 0);
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+       hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
+       if (hnode != NULL)
+               obj = cfs_hash_object(hs, hnode);
+
+       cfs_hash_dual_bd_unlock(hs, bds, 0);
+       cfs_hash_unlock(hs, 0);
+
+       return obj;
+}
+EXPORT_SYMBOL(cfs_hash_lookup);
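A sketch of the lookup/put pairing the comment above requires, again with the hypothetical example_entry (this assumes ops->hs_object resolves an hnode to its containing example_entry):

        static void
        ex_lookup_and_use(cfs_hash_t *hs, __u64 key)
        {
                struct example_entry *ee;

                ee = cfs_hash_lookup(hs, &key);
                if (ee == NULL)
                        return;                 /* key not present */

                /* ... use ee; the reference taken via hs_get keeps it alive ... */

                cfs_hash_put(hs, &ee->ee_node); /* counterpart of ops->hs_get */
        }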
+
+static void
+cfs_hash_for_each_enter(cfs_hash_t *hs)
+{
+       LASSERT(!cfs_hash_is_exiting(hs));
+
+       if (!cfs_hash_with_rehash(hs))
+               return;
+       /*
+        * NB: there is a race on cfs_hash_t::hs_iterating, but it doesn't
+        * matter because it's just an unreliable signal to the rehash
+        * thread, which will try to finish the rehash ASAP when seeing it.
+        */
+       hs->hs_iterating = 1;
+
+       cfs_hash_lock(hs, 1);
+       hs->hs_iterators++;
+
+       /* NB: iteration is mostly called by service threads; rather
+        * than blocking a service thread, we cancel any pending rehash
+        * request and relaunch it after the iteration */
+       if (cfs_hash_is_rehashing(hs))
+               cfs_hash_rehash_cancel_locked(hs);
+       cfs_hash_unlock(hs, 1);
+}
+
+static void
+cfs_hash_for_each_exit(cfs_hash_t *hs)
+{
+       int remained;
+       int bits;
+
+       if (!cfs_hash_with_rehash(hs))
+               return;
+       cfs_hash_lock(hs, 1);
+       remained = --hs->hs_iterators;
+       bits = cfs_hash_rehash_bits(hs);
+       cfs_hash_unlock(hs, 1);
+       /* NB: there is a race on cfs_hash_t::hs_iterating, see above */
+       if (remained == 0)
+               hs->hs_iterating = 0;
+       if (bits > 0) {
+               cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
+                                   CFS_HASH_LOOP_HOG);
+       }
+}
+
+/**
+ * For each item in the libcfs hash @hs call the passed callback @func
+ * and pass to it as an argument each hash item and the private @data.
+ *
+ * a) the function may sleep!
+ * b) during the callback:
+ *    . the bucket lock is held so the callback must never sleep.
+ *    . if @remove_safe is true, the user can remove the current item
+ *      with cfs_hash_bd_del_locked
+ */
+static __u64
+cfs_hash_for_each_tight(cfs_hash_t *hs, cfs_hash_for_each_cb_t func,
+                       void *data, int remove_safe)
+{
+       struct hlist_node     *hnode;
+       struct hlist_node     *pos;
+       cfs_hash_bd_t    bd;
+       __u64            count = 0;
+       int                excl  = !!remove_safe;
+       int                loop  = 0;
+       int                i;
+       ENTRY;
+
+       cfs_hash_for_each_enter(hs);
+
+       cfs_hash_lock(hs, 0);
+       LASSERT(!cfs_hash_is_rehashing(hs));
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, excl);
+               if (func == NULL) { /* only glimpse size */
+                       count += bd.bd_bucket->hsb_count;
+                       cfs_hash_bd_unlock(hs, &bd, excl);
+                       continue;
+               }
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       hlist_for_each_safe(hnode, pos, hhead) {
+                               cfs_hash_bucket_validate(hs, &bd, hnode);
+                               count++;
+                               loop++;
+                               if (func(hs, &bd, hnode, data)) {
+                                       cfs_hash_bd_unlock(hs, &bd, excl);
+                                       goto out;
+                               }
+                       }
+               }
+               cfs_hash_bd_unlock(hs, &bd, excl);
+               if (loop < CFS_HASH_LOOP_HOG)
+                       continue;
+               loop = 0;
+               cfs_hash_unlock(hs, 0);
+               cond_resched();
+               cfs_hash_lock(hs, 0);
+       }
+ out:
+       cfs_hash_unlock(hs, 0);
+
+       cfs_hash_for_each_exit(hs);
+       RETURN(count);
+}
+
+typedef struct {
+       cfs_hash_cond_opt_cb_t  func;
+       void               *arg;
+} cfs_hash_cond_arg_t;
+
+static int
+cfs_hash_cond_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                        struct hlist_node *hnode, void *data)
+{
+       cfs_hash_cond_arg_t *cond = data;
+
+       if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
+               cfs_hash_bd_del_locked(hs, bd, hnode);
+       return 0;
+}
+
+/**
+ * Delete items from the libcfs hash @hs for which @func returns true.
+ * The write lock is held while looping over each bucket so that no
+ * object can gain a reference.
+ */
+void
+cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t func, void *data)
+{
+       cfs_hash_cond_arg_t arg = {
+               .func   = func,
+               .arg    = data,
+       };
+
+       cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1);
+}
+EXPORT_SYMBOL(cfs_hash_cond_del);
+
+void
+cfs_hash_for_each(cfs_hash_t *hs,
+                 cfs_hash_for_each_cb_t func, void *data)
+{
+       cfs_hash_for_each_tight(hs, func, data, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each);
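A sample callback for cfs_hash_for_each(); it runs under the bucket lock, so it must not sleep, and returning nonzero stops the walk early (example_entry is the hypothetical type from earlier). It would be invoked as cfs_hash_for_each(hs, ex_count_even, &count).

        static int
        ex_count_even(cfs_hash_t *hs, cfs_hash_bd_t *bd,
                      struct hlist_node *hnode, void *data)
        {
                struct example_entry *ee;

                ee = hlist_entry(hnode, struct example_entry, ee_node);
                if ((ee->ee_key & 1) == 0)
                        (*(int *)data)++;       /* count entries with even keys */
                return 0;                       /* 0 = continue iteration */
        }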
+
+void
+cfs_hash_for_each_safe(cfs_hash_t *hs,
+                      cfs_hash_for_each_cb_t func, void *data)
+{
+       cfs_hash_for_each_tight(hs, func, data, 1);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_safe);
+
+static int
+cfs_hash_peek(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+             struct hlist_node *hnode, void *data)
+{
+       *(int *)data = 0;
+       return 1; /* return 1 to break the loop */
+}
+
+int
+cfs_hash_is_empty(cfs_hash_t *hs)
+{
+       int empty = 1;
+
+       cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0);
+       return empty;
+}
+EXPORT_SYMBOL(cfs_hash_is_empty);
+
+__u64
+cfs_hash_size_get(cfs_hash_t *hs)
+{
+       return cfs_hash_with_counter(hs) ?
+              atomic_read(&hs->hs_count) :
+              cfs_hash_for_each_tight(hs, NULL, NULL, 0);
+}
+EXPORT_SYMBOL(cfs_hash_size_get);
+
+/*
+ * cfs_hash_for_each_relax:
+ * Iterate the hash table and call @func on each item without
+ * any lock. This function can't guarantee to finish the iteration
+ * if these features are enabled:
+ *
+ *  a. if rehash_key is enabled, an item can be moved from
+ *     one bucket to another
+ *  b. the user can remove a non-zero-ref item from the hash table,
+ *     so the item can disappear from the hash table; even worse,
+ *     it's possible that the user changed the key and inserted the
+ *     item into another hash bucket.
+ * There's no way for us to finish the iteration correctly in the
+ * previous two cases, so the iteration has to be stopped on change.
+ */
+static int
+cfs_hash_for_each_relax(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_node *hnode;
+       struct hlist_node *tmp;
+       cfs_hash_bd_t     bd;
+       __u32        version;
+       int            count = 0;
+       int            stop_on_change;
+       int            rc;
+       int            i;
+       ENTRY;
+
+       stop_on_change = cfs_hash_with_rehash_key(hs) ||
+                        !cfs_hash_with_no_itemref(hs) ||
+                        CFS_HOP(hs, put_locked) == NULL;
+       cfs_hash_lock(hs, 0);
+       LASSERT(!cfs_hash_is_rehashing(hs));
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct hlist_head *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, 0);
+               version = cfs_hash_bd_version_get(&bd);
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       for (hnode = hhead->first; hnode != NULL;) {
+                               cfs_hash_bucket_validate(hs, &bd, hnode);
+                               cfs_hash_get(hs, hnode);
+                               cfs_hash_bd_unlock(hs, &bd, 0);
+                               cfs_hash_unlock(hs, 0);
+
+                               rc = func(hs, &bd, hnode, data);
+                               if (stop_on_change)
+                                       cfs_hash_put(hs, hnode);
+                               cond_resched();
+                               count++;
+
+                               cfs_hash_lock(hs, 0);
+                               cfs_hash_bd_lock(hs, &bd, 0);
+                               if (!stop_on_change) {
+                                       tmp = hnode->next;
+                                       cfs_hash_put_locked(hs, hnode);
+                                       hnode = tmp;
+                               } else { /* bucket changed? */
+                                       if (version !=
+                                           cfs_hash_bd_version_get(&bd))
+                                               break;
+                                       /* safe to continue because no change */
+                                       hnode = hnode->next;
+                               }
+                               if (rc) /* callback wants to break iteration */
+                                       break;
+                       }
+               }
+               cfs_hash_bd_unlock(hs, &bd, 0);
+       }
+       cfs_hash_unlock(hs, 0);
+
+       return count;
+}
+
+int
+cfs_hash_for_each_nolock(cfs_hash_t *hs,
+                        cfs_hash_for_each_cb_t func, void *data)
+{
+       ENTRY;
+
+       if (cfs_hash_with_no_lock(hs) ||
+           cfs_hash_with_rehash_key(hs) ||
+           !cfs_hash_with_no_itemref(hs))
+               RETURN(-EOPNOTSUPP);
+
+       if (CFS_HOP(hs, get) == NULL ||
+           (CFS_HOP(hs, put) == NULL &&
+            CFS_HOP(hs, put_locked) == NULL))
+               RETURN(-EOPNOTSUPP);
+
+       cfs_hash_for_each_enter(hs);
+       cfs_hash_for_each_relax(hs, func, data);
+       cfs_hash_for_each_exit(hs);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_nolock);
+
+/**
+ * For each hash bucket in the libcfs hash @hs call the passed callback
+ * @func until all the hash buckets are empty.  The passed callback @func
+ * or the previously registered callback hs->hs_put must remove the item
+ * from the hash.  You may either use the cfs_hash_del() or hlist_del()
+ * functions.  No rwlocks will be held during the callback @func, so it
+ * is safe to sleep if needed.  This function will not terminate until
+ * the hash is empty.  Note it is still possible to concurrently add new
+ * items into the hash.  It is the caller's responsibility to ensure
+ * the required locking is in place to prevent concurrent insertions.
+ */
+int
+cfs_hash_for_each_empty(cfs_hash_t *hs,
+                       cfs_hash_for_each_cb_t func, void *data)
+{
+       unsigned  i = 0;
+       ENTRY;
+
+       if (cfs_hash_with_no_lock(hs))
+               return -EOPNOTSUPP;
+
+       if (CFS_HOP(hs, get) == NULL ||
+           (CFS_HOP(hs, put) == NULL &&
+            CFS_HOP(hs, put_locked) == NULL))
+               return -EOPNOTSUPP;
+
+       cfs_hash_for_each_enter(hs);
+       while (cfs_hash_for_each_relax(hs, func, data)) {
+               CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n",
+                      hs->hs_name, i++);
+       }
+       cfs_hash_for_each_exit(hs);
+       RETURN(0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_empty);
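A teardown callback for cfs_hash_for_each_empty() must actually remove each item, or the loop never terminates. A hedged sketch, assuming the final reference drop via ops->hs_put is what frees the entry:

        static int
        ex_evict_one(cfs_hash_t *hs, cfs_hash_bd_t *bd,
                     struct hlist_node *hnode, void *data)
        {
                struct example_entry *ee;

                ee = hlist_entry(hnode, struct example_entry, ee_node);
                cfs_hash_del(hs, &ee->ee_key, hnode);   /* unlink from hash */
                /* no bucket lock is held here, so this callback may sleep;
                 * the reference dropped via ops->hs_put should free ee */
                return 0;
        }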
+
+void
+cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex,
+                       cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_head   *hhead;
+       struct hlist_node   *hnode;
+       cfs_hash_bd_t       bd;
+
+       cfs_hash_for_each_enter(hs);
+       cfs_hash_lock(hs, 0);
+       if (hindex >= CFS_HASH_NHLIST(hs))
+               goto out;
+
+       cfs_hash_bd_index_set(hs, hindex, &bd);
+
+       cfs_hash_bd_lock(hs, &bd, 0);
+       hhead = cfs_hash_bd_hhead(hs, &bd);
+       hlist_for_each(hnode, hhead) {
+               if (func(hs, &bd, hnode, data))
+                       break;
+       }
+       cfs_hash_bd_unlock(hs, &bd, 0);
+ out:
+       cfs_hash_unlock(hs, 0);
+       cfs_hash_for_each_exit(hs);
+}
+EXPORT_SYMBOL(cfs_hash_hlist_for_each);
+
+/*
+ * For each item in the libcfs hash @hs which matches the @key call
+ * the passed callback @func and pass to it as an argument each hash
+ * item and the private @data. During the callback the bucket lock
+ * is held so the callback must never sleep.
+ */
+void
+cfs_hash_for_each_key(cfs_hash_t *hs, const void *key,
+                     cfs_hash_for_each_cb_t func, void *data)
+{
+       struct hlist_node   *hnode;
+       cfs_hash_bd_t       bds[2];
+       unsigned            i;
+
+       cfs_hash_lock(hs, 0);
+
+       cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
+
+       cfs_hash_for_each_bd(bds, 2, i) {
+               struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]);
+
+               hlist_for_each(hnode, hlist) {
+                       cfs_hash_bucket_validate(hs, &bds[i], hnode);
+
+                       if (cfs_hash_keycmp(hs, key, hnode)) {
+                               if (func(hs, &bds[i], hnode, data))
+                                       break;
+                       }
+               }
+       }
+
+       cfs_hash_dual_bd_unlock(hs, bds, 0);
+       cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_for_each_key);
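+
+/*
+ * A minimal usage sketch (illustrative; my_count_cb() is hypothetical):
+ * counting every item that matches @key.  The bucket lock is held, so
+ * the callback must not sleep:
+ *
+ *     static int my_count_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+ *                            struct hlist_node *hnode, void *data)
+ *     {
+ *             (*(int *)data)++;
+ *             return 0;       // non-zero would stop scanning the chain
+ *     }
+ *
+ *     int count = 0;
+ *     cfs_hash_for_each_key(hs, key, my_count_cb, &count);
+ */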
+
+/**
+ * Rehash the libcfs hash @hs to the given @bits.  This can be used
+ * to grow the hash size when excessive chaining is detected, or to
+ * shrink the hash when it is larger than needed.  When the CFS_HASH_REHASH
+ * flag is set in @hs the libcfs hash may be dynamically rehashed
+ * during addition or removal if the hash's theta value exceeds
+ * either the hs->hs_min_theta or hs->hs_max_theta values.  By default
+ * these values are tuned to keep the chained hash depth small, and
+ * this approach assumes a reasonably uniform hashing function.  The
+ * theta thresholds for @hs are tunable via cfs_hash_set_theta().
+ */
+void
+cfs_hash_rehash_cancel_locked(cfs_hash_t *hs)
+{
+       int     i;
+
+       /* caller must hold cfs_hash_lock(hs, 1) */
+       LASSERT(cfs_hash_with_rehash(hs) &&
+               !cfs_hash_with_no_lock(hs));
+
+       if (!cfs_hash_is_rehashing(hs))
+               return;
+
+       if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) {
+               hs->hs_rehash_bits = 0;
+               return;
+       }
+
+       for (i = 2; cfs_hash_is_rehashing(hs); i++) {
+               cfs_hash_unlock(hs, 1);
+               /* raise a console warning when we have waited too long */
+               CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO,
+                      "hash %s is still rehashing, rescheduled %d\n",
+                      hs->hs_name, i - 1);
+               cond_resched();
+               cfs_hash_lock(hs, 1);
+       }
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked);
+
+void
+cfs_hash_rehash_cancel(cfs_hash_t *hs)
+{
+       cfs_hash_lock(hs, 1);
+       cfs_hash_rehash_cancel_locked(hs);
+       cfs_hash_unlock(hs, 1);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_cancel);
+
+int
+cfs_hash_rehash(cfs_hash_t *hs, int do_rehash)
+{
+       int     rc;
+
+       LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs));
+
+       cfs_hash_lock(hs, 1);
+
+       rc = cfs_hash_rehash_bits(hs);
+       if (rc <= 0) {
+               cfs_hash_unlock(hs, 1);
+               return rc;
+       }
+
+       hs->hs_rehash_bits = rc;
+       if (!do_rehash) {
+               /* launch and return */
+               cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi);
+               cfs_hash_unlock(hs, 1);
+               return 0;
+       }
+
+       /* rehash right now */
+       cfs_hash_unlock(hs, 1);
+
+       return cfs_hash_rehash_worker(&hs->hs_rehash_wi);
+}
+EXPORT_SYMBOL(cfs_hash_rehash);
+
+static int
+cfs_hash_rehash_bd(cfs_hash_t *hs, cfs_hash_bd_t *old)
+{
+       cfs_hash_bd_t      new;
+       struct hlist_head  *hhead;
+       struct hlist_node  *hnode;
+       struct hlist_node  *pos;
+       void          *key;
+       int             c = 0;
+
+       /* caller holds cfs_hash_lock(hs, 1), so no bucket lock is needed */
+       cfs_hash_bd_for_each_hlist(hs, old, hhead) {
+               hlist_for_each_safe(hnode, pos, hhead) {
+                       key = cfs_hash_key(hs, hnode);
+                       LASSERT(key != NULL);
+                       /* Validate hnode is in the correct bucket. */
+                       cfs_hash_bucket_validate(hs, old, hnode);
+                       /*
+                        * Delete from old hash bucket; move to new bucket.
+                        * ops->hs_key must be defined.
+                        */
+                       cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets,
+                                            hs->hs_rehash_bits, key, &new);
+                       cfs_hash_bd_move_locked(hs, old, &new, hnode);
+                       c++;
+               }
+       }
+
+       return c;
+}
+
+static int
+cfs_hash_rehash_worker(cfs_workitem_t *wi)
+{
+       cfs_hash_t       *hs = container_of(wi, cfs_hash_t, hs_rehash_wi);
+       cfs_hash_bucket_t **bkts;
+       cfs_hash_bd_t       bd;
+       unsigned int    old_size;
+       unsigned int    new_size;
+       int              bsize;
+       int              count = 0;
+       int              rc = 0;
+       int              i;
+
+       LASSERT(hs != NULL && cfs_hash_with_rehash(hs));
+
+       cfs_hash_lock(hs, 0);
+       LASSERT(cfs_hash_is_rehashing(hs));
+
+       old_size = CFS_HASH_NBKT(hs);
+       new_size = CFS_HASH_RH_NBKT(hs);
+
+       cfs_hash_unlock(hs, 0);
+
+       /*
+        * don't need hs::hs_rwlock for hs::hs_buckets,
+        * because nobody can change bkt-table except me.
+        */
+       bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets,
+                                       old_size, new_size);
+       cfs_hash_lock(hs, 1);
+       if (bkts == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       if (bkts == hs->hs_buckets) {
+               bkts = NULL; /* do nothing */
+               goto out;
+       }
+
+       rc = __cfs_hash_theta(hs);
+       if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) {
+               /* free the new allocated bkt-table */
+               old_size = new_size;
+               new_size = CFS_HASH_NBKT(hs);
+               rc = -EALREADY;
+               goto out;
+       }
+
+       LASSERT(hs->hs_rehash_buckets == NULL);
+       hs->hs_rehash_buckets = bkts;
+
+       rc = 0;
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               if (cfs_hash_is_exiting(hs)) {
+                       rc = -ESRCH;
+                       /* someone wants to destroy the hash, abort now */
+                       if (old_size < new_size) /* OK to free old bkt-table */
+                               break;
+                       /* it's shrinking, need free new bkt-table */
+                       hs->hs_rehash_buckets = NULL;
+                       old_size = new_size;
+                       new_size = CFS_HASH_NBKT(hs);
+                       goto out;
+               }
+
+               count += cfs_hash_rehash_bd(hs, &bd);
+               if (count < CFS_HASH_LOOP_HOG ||
+                   cfs_hash_is_iterating(hs)) { /* need to finish ASAP */
+                       continue;
+               }
+
+               count = 0;
+               cfs_hash_unlock(hs, 1);
+               cond_resched();
+               cfs_hash_lock(hs, 1);
+       }
+
+       hs->hs_rehash_count++;
+
+       bkts = hs->hs_buckets;
+       hs->hs_buckets = hs->hs_rehash_buckets;
+       hs->hs_rehash_buckets = NULL;
+
+       hs->hs_cur_bits = hs->hs_rehash_bits;
+ out:
+       hs->hs_rehash_bits = 0;
+       if (rc == -ESRCH) /* never be scheduled again */
+               cfs_wi_exit(cfs_sched_rehash, wi);
+       bsize = cfs_hash_bkt_size(hs);
+       cfs_hash_unlock(hs, 1);
+       /* can't refer to @hs anymore because it could be destroyed */
+       if (bkts != NULL)
+               cfs_hash_buckets_free(bkts, bsize, new_size, old_size);
+       if (rc != 0)
+               CDEBUG(D_INFO, "early exit from rehashing: %d\n", rc);
+       /* return 1 only if cfs_wi_exit is called */
+       return rc == -ESRCH;
+}
+
+/**
+ * Rehash the object referenced by @hnode in the libcfs hash @hs.  The
+ * @old_key must be provided to locate the objects previous location
+ * in the hash, and the @new_key will be used to reinsert the object.
+ * Use this function instead of a cfs_hash_add() + cfs_hash_del()
+ * combo when it is critical that there is no window in time where the
+ * object is missing from the hash.  When an object is being rehashed
+ * the registered cfs_hash_get() and cfs_hash_put() functions will
+ * not be called.
+ */
+void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key,
+                        void *new_key, struct hlist_node *hnode)
+{
+       cfs_hash_bd_t   bds[3];
+       cfs_hash_bd_t   old_bds[2];
+       cfs_hash_bd_t   new_bd;
+
+       LASSERT(!hlist_unhashed(hnode));
+
+       cfs_hash_lock(hs, 0);
+
+       cfs_hash_dual_bd_get(hs, old_key, old_bds);
+       cfs_hash_bd_get(hs, new_key, &new_bd);
+
+       bds[0] = old_bds[0];
+       bds[1] = old_bds[1];
+       bds[2] = new_bd;
+
+       /* NB: bds[0] and bds[1] are ordered already */
+       cfs_hash_bd_order(&bds[1], &bds[2]);
+       cfs_hash_bd_order(&bds[0], &bds[1]);
+
+       cfs_hash_multi_bd_lock(hs, bds, 3, 1);
+       if (likely(old_bds[1].bd_bucket == NULL)) {
+               cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
+       } else {
+               cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
+               cfs_hash_bd_add_locked(hs, &new_bd, hnode);
+       }
+       /* overwrite the key inside the locks, otherwise it may race with
+        * other operations, e.g. rehash */
+       cfs_hash_keycpy(hs, new_key, hnode);
+
+       cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
+       cfs_hash_unlock(hs, 0);
+}
+EXPORT_SYMBOL(cfs_hash_rehash_key);
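+
+/*
+ * Usage sketch (illustrative; "obj", its mo_old_id/mo_new_id fields
+ * and mo_hnode are hypothetical): atomically re-key an object so that
+ * it is never absent from the hash while its identity changes:
+ *
+ *     cfs_hash_rehash_key(hs, &obj->mo_old_id, &obj->mo_new_id,
+ *                         &obj->mo_hnode);
+ */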
+
+int cfs_hash_debug_header(struct seq_file *m)
+{
+       return seq_printf(m, "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%8s%8s%8s%s\n",
+                CFS_HASH_BIGNAME_LEN,
+                "name", "cur", "min", "max", "theta", "t-min", "t-max",
+                "flags", "rehash", "count", "maxdep", "maxdepb",
+                " distribution");
+}
+EXPORT_SYMBOL(cfs_hash_debug_header);
+
+static cfs_hash_bucket_t **
+cfs_hash_full_bkts(cfs_hash_t *hs)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (hs->hs_rehash_buckets == NULL)
+               return hs->hs_buckets;
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       return hs->hs_rehash_bits > hs->hs_cur_bits ?
+              hs->hs_rehash_buckets : hs->hs_buckets;
+}
+
+static unsigned int
+cfs_hash_full_nbkt(cfs_hash_t *hs)
+{
+       /* NB: caller should hold hs->hs_rwlock if REHASH is set */
+       if (hs->hs_rehash_buckets == NULL)
+               return CFS_HASH_NBKT(hs);
+
+       LASSERT(hs->hs_rehash_bits != 0);
+       return hs->hs_rehash_bits > hs->hs_cur_bits ?
+              CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
+}
+
+int cfs_hash_debug_str(cfs_hash_t *hs, struct seq_file *m)
+{
+       int                 dist[8] = { 0, };
+       int                 maxdep  = -1;
+       int                 maxdepb = -1;
+       int                 total   = 0;
+       int                 theta;
+       int                 i;
+
+       cfs_hash_lock(hs, 0);
+       theta = __cfs_hash_theta(hs);
+
+       seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d  0x%02x %6d ",
+                     CFS_HASH_BIGNAME_LEN, hs->hs_name,
+                     1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
+                     1 << hs->hs_max_bits,
+                     __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
+                     __cfs_hash_theta_int(hs->hs_min_theta),
+                     __cfs_hash_theta_frac(hs->hs_min_theta),
+                     __cfs_hash_theta_int(hs->hs_max_theta),
+                     __cfs_hash_theta_frac(hs->hs_max_theta),
+                     hs->hs_flags, hs->hs_rehash_count);
+
+       /*
+        * The distribution is a summary of the chained hash depth in
+        * each of the libcfs hash buckets.  Each bucket's hsb_count is
+        * divided by the hash theta value and used to generate a
+        * histogram of the hash distribution.  A uniform hash will
+        * result in all hash buckets being close to the average, thus
+        * only the first few entries in the histogram will be non-zero.
+        * If your hash function results in a non-uniform hash, this will
+        * be observable as outlier buckets in the distribution histogram.
+        *
+        * Uniform hash distribution:      128/128/0/0/0/0/0/0
+        * Non-Uniform hash distribution:  128/125/0/0/0/0/2/1
+        */
+       for (i = 0; i < cfs_hash_full_nbkt(hs); i++) {
+               cfs_hash_bd_t  bd;
+
+               bd.bd_bucket = cfs_hash_full_bkts(hs)[i];
+               cfs_hash_bd_lock(hs, &bd, 0);
+               if (maxdep < bd.bd_bucket->hsb_depmax) {
+                       maxdep  = bd.bd_bucket->hsb_depmax;
+                       maxdepb = ffz(~maxdep);
+               }
+               total += bd.bd_bucket->hsb_count;
+               dist[min(__cfs_fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++;
+               cfs_hash_bd_unlock(hs, &bd, 0);
+       }
+
+       seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb);
+       for (i = 0; i < 8; i++)
+               seq_printf(m, "%d%c",  dist[i], (i == 7) ? '\n' : '/');
+
+       cfs_hash_unlock(hs, 0);
+
+       return 0;
+}
+EXPORT_SYMBOL(cfs_hash_debug_str);
diff --git a/drivers/staging/lustre/lustre/libcfs/heap.c b/drivers/staging/lustre/lustre/libcfs/heap.c
new file mode 100644 (file)
index 0000000..147e4fe
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ */
+/*
+ * libcfs/libcfs/heap.c
+ *
+ * Author: Eric Barton <eeb@whamcloud.com>
+ *        Liang Zhen   <liang@whamcloud.com>
+ */
+/** \addtogroup heap
+ *
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CBH_ALLOC(ptr, h)                                              \
+do {                                                                   \
+       if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW)                      \
+               LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, h->cbh_cptid, \
+                                    CBH_NOB, GFP_ATOMIC);              \
+       else                                                            \
+               LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, h->cbh_cptid,     \
+                                CBH_NOB);                              \
+} while (0)
+
+#define CBH_FREE(ptr)  LIBCFS_FREE(ptr, CBH_NOB)
+
+/**
+ * Grows the capacity of a binary heap so that it can handle a larger number of
+ * \e cfs_binheap_node_t objects.
+ *
+ * \param[in] h The binary heap
+ *
+ * \retval 0      Successfully grew the heap
+ * \retval -ENOMEM OOM error
+ */
+static int
+cfs_binheap_grow(cfs_binheap_t *h)
+{
+       cfs_binheap_node_t ***frag1 = NULL;
+       cfs_binheap_node_t  **frag2;
+       int hwm = h->cbh_hwm;
+
+       /* need a whole new chunk of pointers */
+       LASSERT((h->cbh_hwm & CBH_MASK) == 0);
+
+       if (hwm == 0) {
+               /* first use of single indirect */
+               CBH_ALLOC(h->cbh_elements1, h);
+               if (h->cbh_elements1 == NULL)
+                       return -ENOMEM;
+
+               goto out;
+       }
+
+       hwm -= CBH_SIZE;
+       if (hwm < CBH_SIZE * CBH_SIZE) {
+               /* not filled double indirect */
+               CBH_ALLOC(frag2, h);
+               if (frag2 == NULL)
+                       return -ENOMEM;
+
+               if (hwm == 0) {
+                       /* first use of double indirect */
+                       CBH_ALLOC(h->cbh_elements2, h);
+                       if (h->cbh_elements2 == NULL) {
+                               CBH_FREE(frag2);
+                               return -ENOMEM;
+                       }
+               }
+
+               h->cbh_elements2[hwm >> CBH_SHIFT] = frag2;
+               goto out;
+       }
+
+       hwm -= CBH_SIZE * CBH_SIZE;
+#if (CBH_SHIFT * 3 < 32)
+       if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) {
+               /* filled triple indirect */
+               return -ENOMEM;
+       }
+#endif
+       CBH_ALLOC(frag2, h);
+       if (frag2 == NULL)
+               return -ENOMEM;
+
+       if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) {
+               /* first use of this 2nd level index */
+               CBH_ALLOC(frag1, h);
+               if (frag1 == NULL) {
+                       CBH_FREE(frag2);
+                       return -ENOMEM;
+               }
+       }
+
+       if (hwm == 0) {
+               /* first use of triple indirect */
+               CBH_ALLOC(h->cbh_elements3, h);
+               if (h->cbh_elements3 == NULL) {
+                       CBH_FREE(frag2);
+                       CBH_FREE(frag1);
+                       return -ENOMEM;
+               }
+       }
+
+       if (frag1 != NULL) {
+               LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL);
+               h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1;
+       } else {
+               frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)];
+               LASSERT(frag1 != NULL);
+       }
+
+       frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2;
+
+ out:
+       h->cbh_hwm += CBH_SIZE;
+       return 0;
+}
+
+/**
+ * Creates and initializes a binary heap instance.
+ *
+ * \param[in] ops   The operations to be used
+ * \param[in] flags The heap flags
+ * \param[in] count The initial heap capacity in # of elements
+ * \param[in] arg   An optional private argument
+ * \param[in] cptab The CPT table this heap instance will operate over
+ * \param[in] cptid The CPT id of \a cptab this heap instance will operate over
+ *
+ * \retval valid-pointer A newly-created and initialized binary heap object
+ * \retval NULL                 error
+ */
+cfs_binheap_t *
+cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags,
+                  unsigned count, void *arg, struct cfs_cpt_table *cptab,
+                  int cptid)
+{
+       cfs_binheap_t *h;
+
+       LASSERT(ops != NULL);
+       LASSERT(ops->hop_compare != NULL);
+       LASSERT(cptab != NULL);
+       LASSERT(cptid == CFS_CPT_ANY ||
+              (cptid >= 0 && cptid < cptab->ctb_nparts));
+
+       LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h));
+       if (h == NULL)
+               return NULL;
+
+       h->cbh_ops        = ops;
+       h->cbh_nelements  = 0;
+       h->cbh_hwm        = 0;
+       h->cbh_private    = arg;
+       h->cbh_flags      = flags & (~CBH_FLAG_ATOMIC_GROW);
+       h->cbh_cptab      = cptab;
+       h->cbh_cptid      = cptid;
+
+       while (h->cbh_hwm < count) { /* preallocate */
+               if (cfs_binheap_grow(h) != 0) {
+                       cfs_binheap_destroy(h);
+                       return NULL;
+               }
+       }
+
+       h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW;
+
+       return h;
+}
+EXPORT_SYMBOL(cfs_binheap_create);
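+
+/*
+ * A minimal usage sketch (illustrative; "struct my_task" is
+ * hypothetical): a min-heap ordered by deadline.  hop_compare() must
+ * return non-zero when its first argument belongs closer to the root:
+ *
+ *     struct my_task {
+ *             cfs_binheap_node_t      mt_node;
+ *             __u64                   mt_deadline;
+ *     };
+ *
+ *     static int my_task_compare(cfs_binheap_node_t *a,
+ *                                cfs_binheap_node_t *b)
+ *     {
+ *             struct my_task *ta = container_of(a, struct my_task,
+ *                                               mt_node);
+ *             struct my_task *tb = container_of(b, struct my_task,
+ *                                               mt_node);
+ *
+ *             return ta->mt_deadline < tb->mt_deadline;
+ *     }
+ *
+ *     static cfs_binheap_ops_t my_task_heap_ops = {
+ *             .hop_enter   = NULL,
+ *             .hop_exit    = NULL,
+ *             .hop_compare = my_task_compare,
+ *     };
+ *
+ *     heap = cfs_binheap_create(&my_task_heap_ops, 0, 128, NULL,
+ *                               cfs_cpt_table, CFS_CPT_ANY);
+ */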
+
+/**
+ * Releases all resources associated with a binary heap instance.
+ *
+ * Deallocates memory for all indirection levels and the binary heap object
+ * itself.
+ *
+ * \param[in] h The binary heap object
+ */
+void
+cfs_binheap_destroy(cfs_binheap_t *h)
+{
+       int idx0;
+       int idx1;
+       int n;
+
+       LASSERT(h != NULL);
+
+       n = h->cbh_hwm;
+
+       if (n > 0) {
+               CBH_FREE(h->cbh_elements1);
+               n -= CBH_SIZE;
+       }
+
+       if (n > 0) {
+               for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) {
+                       CBH_FREE(h->cbh_elements2[idx0]);
+                       n -= CBH_SIZE;
+               }
+
+               CBH_FREE(h->cbh_elements2);
+       }
+
+       if (n > 0) {
+               for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) {
+                       for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) {
+                               CBH_FREE(h->cbh_elements3[idx0][idx1]);
+                               n -= CBH_SIZE;
+                       }
+
+                       CBH_FREE(h->cbh_elements3[idx0]);
+               }
+
+               CBH_FREE(h->cbh_elements3);
+       }
+
+       LIBCFS_FREE(h, sizeof(*h));
+}
+EXPORT_SYMBOL(cfs_binheap_destroy);
+
+/**
+ * Obtains a double pointer to a heap element, given its index into the binary
+ * tree.
+ *
+ * \param[in] h          The binary heap instance
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer A double pointer to a heap pointer entry
+ */
+static cfs_binheap_node_t **
+cfs_binheap_pointer(cfs_binheap_t *h, unsigned int idx)
+{
+       if (idx < CBH_SIZE)
+               return &(h->cbh_elements1[idx]);
+
+       idx -= CBH_SIZE;
+       if (idx < CBH_SIZE * CBH_SIZE)
+               return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]);
+
+       idx -= CBH_SIZE * CBH_SIZE;
+       return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)]
+                                [(idx >> CBH_SHIFT) & CBH_MASK]
+                                [idx & CBH_MASK]);
+}
+
+/**
+ * Obtains a pointer to a heap element, given its index into the binary tree.
+ *
+ * \param[in] h          The binary heap
+ * \param[in] idx The requested node's index
+ *
+ * \retval valid-pointer The requested heap node
+ * \retval NULL                 Supplied index is out of bounds
+ */
+cfs_binheap_node_t *
+cfs_binheap_find(cfs_binheap_t *h, unsigned int idx)
+{
+       if (idx >= h->cbh_nelements)
+               return NULL;
+
+       return *cfs_binheap_pointer(h, idx);
+}
+EXPORT_SYMBOL(cfs_binheap_find);
+
+/**
+ * Moves a node upwards, towards the root of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_bubble(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         cur_idx = e->chn_index;
+       cfs_binheap_node_t **cur_ptr;
+       unsigned int         parent_idx;
+       cfs_binheap_node_t **parent_ptr;
+       int                  did_sth = 0;
+
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       while (cur_idx > 0) {
+               parent_idx = (cur_idx - 1) >> 1;
+
+               parent_ptr = cfs_binheap_pointer(h, parent_idx);
+               LASSERT((*parent_ptr)->chn_index == parent_idx);
+
+               if (h->cbh_ops->hop_compare(*parent_ptr, e))
+                       break;
+
+               (*parent_ptr)->chn_index = cur_idx;
+               *cur_ptr = *parent_ptr;
+               cur_ptr = parent_ptr;
+               cur_idx = parent_idx;
+               did_sth = 1;
+       }
+
+       e->chn_index = cur_idx;
+       *cur_ptr = e;
+
+       return did_sth;
+}
+
+/**
+ * Moves a node downwards, towards the last level of the binary tree.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 1 The position of \a e in the tree was changed at least once
+ * \retval 0 The position of \a e in the tree was not changed
+ */
+static int
+cfs_binheap_sink(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         n = h->cbh_nelements;
+       unsigned int         child_idx;
+       cfs_binheap_node_t **child_ptr;
+       cfs_binheap_node_t  *child;
+       unsigned int         child2_idx;
+       cfs_binheap_node_t **child2_ptr;
+       cfs_binheap_node_t  *child2;
+       unsigned int         cur_idx;
+       cfs_binheap_node_t **cur_ptr;
+       int                  did_sth = 0;
+
+       cur_idx = e->chn_index;
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       while (cur_idx < n) {
+               child_idx = (cur_idx << 1) + 1;
+               if (child_idx >= n)
+                       break;
+
+               child_ptr = cfs_binheap_pointer(h, child_idx);
+               child = *child_ptr;
+
+               child2_idx = child_idx + 1;
+               if (child2_idx < n) {
+                       child2_ptr = cfs_binheap_pointer(h, child2_idx);
+                       child2 = *child2_ptr;
+
+                       if (h->cbh_ops->hop_compare(child2, child)) {
+                               child_idx = child2_idx;
+                               child_ptr = child2_ptr;
+                               child = child2;
+                       }
+               }
+
+               LASSERT(child->chn_index == child_idx);
+
+               if (h->cbh_ops->hop_compare(e, child))
+                       break;
+
+               child->chn_index = cur_idx;
+               *cur_ptr = child;
+               cur_ptr = child_ptr;
+               cur_idx = child_idx;
+               did_sth = 1;
+       }
+
+       e->chn_index = cur_idx;
+       *cur_ptr = e;
+
+       return did_sth;
+}
+
+/**
+ * Sort-inserts a node into the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ *
+ * \retval 0   Element inserted successfully
+ * \retval != 0 error
+ */
+int
+cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       cfs_binheap_node_t **new_ptr;
+       unsigned int         new_idx = h->cbh_nelements;
+       int                  rc;
+
+       if (new_idx == h->cbh_hwm) {
+               rc = cfs_binheap_grow(h);
+               if (rc != 0)
+                       return rc;
+       }
+
+       if (h->cbh_ops->hop_enter) {
+               rc = h->cbh_ops->hop_enter(h, e);
+               if (rc != 0)
+                       return rc;
+       }
+
+       e->chn_index = new_idx;
+       new_ptr = cfs_binheap_pointer(h, new_idx);
+       h->cbh_nelements++;
+       *new_ptr = e;
+
+       cfs_binheap_bubble(h, e);
+
+       return 0;
+}
+EXPORT_SYMBOL(cfs_binheap_insert);
+
+/**
+ * Removes a node from the binary heap.
+ *
+ * \param[in] h The heap
+ * \param[in] e The node
+ */
+void
+cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e)
+{
+       unsigned int         n = h->cbh_nelements;
+       unsigned int         cur_idx = e->chn_index;
+       cfs_binheap_node_t **cur_ptr;
+       cfs_binheap_node_t  *last;
+
+       LASSERT(cur_idx != CBH_POISON);
+       LASSERT(cur_idx < n);
+
+       cur_ptr = cfs_binheap_pointer(h, cur_idx);
+       LASSERT(*cur_ptr == e);
+
+       n--;
+       last = *cfs_binheap_pointer(h, n);
+       h->cbh_nelements = n;
+       if (last == e)
+               return;
+
+       last->chn_index = cur_idx;
+       *cur_ptr = last;
+       if (!cfs_binheap_bubble(h, *cur_ptr))
+               cfs_binheap_sink(h, *cur_ptr);
+
+       e->chn_index = CBH_POISON;
+       if (h->cbh_ops->hop_exit)
+               h->cbh_ops->hop_exit(h, e);
+}
+EXPORT_SYMBOL(cfs_binheap_remove);
+
+/** @} heap */
diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c
new file mode 100644 (file)
index 0000000..d6d3b2e
--- /dev/null
@@ -0,0 +1,346 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ *
+ * Kernel <-> userspace communication routines.
+ * Using pipes for all arches.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_KUC D_OTHER
+
+#include <linux/libcfs/libcfs.h>
+
+#ifdef LUSTRE_UTILS
+/* This is the userspace side. */
+
+/** Start the userspace side of a KUC pipe.
+ * @param link Private descriptor for pipe/socket.
+ * @param group KUC broadcast group to listen to
+ *       (can be 0 for unicast to this pid)
+ */
+int libcfs_ukuc_start(lustre_kernelcomm *link, int group)
+{
+       int pfd[2];
+
+       if (pipe(pfd) < 0)
+               return -errno;
+
+       memset(link, 0, sizeof(*link));
+       link->lk_rfd = pfd[0];
+       link->lk_wfd = pfd[1];
+       link->lk_group = group;
+       link->lk_uid = getpid();
+       return 0;
+}
+
+int libcfs_ukuc_stop(lustre_kernelcomm *link)
+{
+       if (link->lk_wfd > 0)
+               close(link->lk_wfd);
+       return close(link->lk_rfd);
+}
+
+#define lhsz sizeof(*kuch)
+
+/** Read a message from the link.
+ * Allocates memory, returns handle
+ *
+ * @param link Private descriptor for pipe/socket.
+ * @param buf Buffer to read into, must include size for kuc_hdr
+ * @param maxsize Maximum message size allowed
+ * @param transport Only listen to messages on this transport
+ *      (and the generic transport)
+ */
+int libcfs_ukuc_msg_get(lustre_kernelcomm *link, char *buf, int maxsize,
+                       int transport)
+{
+       struct kuc_hdr *kuch;
+       int rc = 0;
+
+       memset(buf, 0, maxsize);
+
+       CDEBUG(D_KUC, "Waiting for message from kernel on fd %d\n",
+              link->lk_rfd);
+
+       while (1) {
+               /* Read header first to get message size */
+               rc = read(link->lk_rfd, buf, lhsz);
+               if (rc <= 0) {
+                       rc = -errno;
+                       break;
+               }
+               kuch = (struct kuc_hdr *)buf;
+
+               CDEBUG(D_KUC, "Received message mg=%x t=%d m=%d l=%d\n",
+                      kuch->kuc_magic, kuch->kuc_transport, kuch->kuc_msgtype,
+                      kuch->kuc_msglen);
+
+               if (kuch->kuc_magic != KUC_MAGIC) {
+                       CERROR("bad message magic %x != %x\n",
+                              kuch->kuc_magic, KUC_MAGIC);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kuch->kuc_msglen > maxsize) {
+                       rc = -EMSGSIZE;
+                       break;
+               }
+
+               /* Read payload */
+               rc = read(link->lk_rfd, buf + lhsz, kuch->kuc_msglen - lhsz);
+               if (rc < 0) {
+                       rc = -errno;
+                       break;
+               }
+               if (rc < (kuch->kuc_msglen - lhsz)) {
+                       CERROR("short read: got %d of %d bytes\n",
+                              rc, kuch->kuc_msglen);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               if (kuch->kuc_transport == transport ||
+                   kuch->kuc_transport == KUC_TRANSPORT_GENERIC) {
+                       return 0;
+               }
+               /* Drop messages for other transports */
+       }
+       return rc;
+}
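+
+/*
+ * A usage sketch for the userspace side (illustrative; the group
+ * number and handle_msg() are hypothetical):
+ *
+ *     lustre_kernelcomm kuc;
+ *     char buf[8192];
+ *
+ *     if (libcfs_ukuc_start(&kuc, group) == 0) {
+ *             while (libcfs_ukuc_msg_get(&kuc, buf, sizeof(buf),
+ *                                        KUC_TRANSPORT_GENERIC) == 0)
+ *                     handle_msg((struct kuc_hdr *)buf);
+ *             libcfs_ukuc_stop(&kuc);
+ *     }
+ */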
+
+#else /* LUSTRE_UTILS */
+/* This is the kernel side (liblustre as well). */
+
+/**
+ * libcfs_kkuc_msg_put - send a message from kernel to userspace
+ * @param filp file pointer to send the message to
+ * @param payload Payload data.  First field of payload is always
+ *   struct kuc_hdr
+ */
+int libcfs_kkuc_msg_put(struct file *filp, void *payload)
+{
+       struct kuc_hdr *kuch = (struct kuc_hdr *)payload;
+       ssize_t count = kuch->kuc_msglen;
+       loff_t offset = 0;
+       mm_segment_t fs;
+       int rc = -ENOSYS;
+
+       if (filp == NULL || IS_ERR(filp))
+               return -EBADF;
+
+       if (kuch->kuc_magic != KUC_MAGIC) {
+               CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic);
+               return -ENOSYS;
+       }
+
+       fs = get_fs();
+       set_fs(KERNEL_DS);
+       while (count > 0) {
+               rc = vfs_write(filp, (void __force __user *)payload,
+                              count, &offset);
+               if (rc < 0)
+                       break;
+               count -= rc;
+               payload += rc;
+               rc = 0;
+       }
+       set_fs(fs);
+
+       if (rc < 0)
+               CWARN("message send failed (%d)\n", rc);
+       else
+               CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_kkuc_msg_put);
+
+/* Broadcast groups are global across all mounted filesystems;
+ * i.e. registering for a group on 1 fs will get messages for that
+ * group from any fs */
+/** A single group registration has a uid and a file pointer */
+struct kkuc_reg {
+       struct list_head        kr_chain;
+       int             kr_uid;
+       struct file     *kr_fp;
+       __u32           kr_data;
+};
+static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {};
+/* Protect message sending against remove and adds */
+static DECLARE_RWSEM(kg_sem);
+
+/** Add a receiver to a broadcast group
+ * @param filp pipe to write into
+ * @param uid identifier for this receiver
+ * @param group group number
+ */
+int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data)
+{
+       struct kkuc_reg *reg;
+
+       if (group > KUC_GRP_MAX) {
+               CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+               return -EINVAL;
+       }
+
+       /* fput in group_rem */
+       if (filp == NULL)
+               return -EBADF;
+
+       /* freed in group_rem */
+       reg = kmalloc(sizeof(*reg), GFP_KERNEL);
+       if (reg == NULL)
+               return -ENOMEM;
+
+       reg->kr_fp = filp;
+       reg->kr_uid = uid;
+       reg->kr_data = data;
+
+       down_write(&kg_sem);
+       if (kkuc_groups[group].next == NULL)
+               INIT_LIST_HEAD(&kkuc_groups[group]);
+       list_add(&reg->kr_chain, &kkuc_groups[group]);
+       up_write(&kg_sem);
+
+       CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group);
+
+       return 0;
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_add);
+
+int libcfs_kkuc_group_rem(int uid, int group)
+{
+       struct kkuc_reg *reg, *next;
+       ENTRY;
+
+       if (kkuc_groups[group].next == NULL)
+               RETURN(0);
+
+       if (uid == 0) {
+               /* Broadcast a shutdown message */
+               struct kuc_hdr lh;
+
+               lh.kuc_magic = KUC_MAGIC;
+               lh.kuc_transport = KUC_TRANSPORT_GENERIC;
+               lh.kuc_msgtype = KUC_MSG_SHUTDOWN;
+               lh.kuc_msglen = sizeof(lh);
+               libcfs_kkuc_group_put(group, &lh);
+       }
+
+       down_write(&kg_sem);
+       list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) {
+               if ((uid == 0) || (uid == reg->kr_uid)) {
+                       list_del(&reg->kr_chain);
+                       CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n",
+                              reg->kr_uid, reg->kr_fp, group);
+                       if (reg->kr_fp != NULL)
+                               fput(reg->kr_fp);
+                       kfree(reg);
+               }
+       }
+       up_write(&kg_sem);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_rem);
+
+int libcfs_kkuc_group_put(int group, void *payload)
+{
+       struct kkuc_reg *reg;
+       int              rc = 0;
+       int one_success = 0;
+       ENTRY;
+
+       down_read(&kg_sem);
+       list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+               if (reg->kr_fp != NULL) {
+                       rc = libcfs_kkuc_msg_put(reg->kr_fp, payload);
+                       if (rc == 0)
+                               one_success = 1;
+                       else if (rc == -EPIPE) {
+                               fput(reg->kr_fp);
+                               reg->kr_fp = NULL;
+                       }
+               }
+       }
+       up_read(&kg_sem);
+
+       /* don't return an error if the message has been delivered
+        * to at least one agent */
+       if (one_success)
+               rc = 0;
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_put);
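+
+/*
+ * A usage sketch for broadcasting (illustrative; "msg" is a
+ * hypothetical structure whose first field is a struct kuc_hdr, and
+ * my_msgtype is a hypothetical message type):
+ *
+ *     msg.kuc_hdr.kuc_magic     = KUC_MAGIC;
+ *     msg.kuc_hdr.kuc_transport = KUC_TRANSPORT_GENERIC;
+ *     msg.kuc_hdr.kuc_msgtype   = my_msgtype;
+ *     msg.kuc_hdr.kuc_msglen    = sizeof(msg);
+ *     rc = libcfs_kkuc_group_put(group, &msg);
+ */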
+
+/**
+ * Calls a callback function for each link of the given kuc group.
+ * @param group the group to call the function on.
+ * @param cb_func the function to be called.
+ * @param cb_arg extra argument to be passed to the callback function.
+ */
+int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func,
+                             void *cb_arg)
+{
+       struct kkuc_reg *reg;
+       int rc = 0;
+       ENTRY;
+
+       if (group > KUC_GRP_MAX) {
+               CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group);
+               RETURN(-EINVAL);
+       }
+
+       /* no link for this group */
+       if (kkuc_groups[group].next == NULL)
+               RETURN(0);
+
+       down_read(&kg_sem);
+       list_for_each_entry(reg, &kkuc_groups[group], kr_chain) {
+               if (reg->kr_fp != NULL) {
+                       rc = cb_func(reg->kr_data, cb_arg);
+               }
+       }
+       up_read(&kg_sem);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(libcfs_kkuc_group_foreach);
+
+#endif /* LUSTRE_UTILS */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c
new file mode 100644 (file)
index 0000000..8e88eb5
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/** Global CPU partition table */
+struct cfs_cpt_table   *cfs_cpt_table __read_mostly = NULL;
+EXPORT_SYMBOL(cfs_cpt_table);
+
+#ifndef HAVE_LIBCFS_CPT
+
+#define CFS_CPU_VERSION_MAGIC     0xbabecafe
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+       struct cfs_cpt_table *cptab;
+
+       if (ncpt != 1) {
+               CERROR("Can't support cpu partition number %d\n", ncpt);
+               return NULL;
+       }
+
+       LIBCFS_ALLOC(cptab, sizeof(*cptab));
+       if (cptab != NULL) {
+               cptab->ctb_version = CFS_CPU_VERSION_MAGIC;
+               cptab->ctb_nparts  = ncpt;
+       }
+
+       return cptab;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+       LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC);
+
+       LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+void
+cfs_cpu_fini(void)
+{
+       if (cfs_cpt_table != NULL) {
+               cfs_cpt_table_free(cfs_cpt_table);
+               cfs_cpt_table = NULL;
+       }
+}
+
+int
+cfs_cpu_init(void)
+{
+       cfs_cpt_table = cfs_cpt_table_alloc(1);
+
+       return cfs_cpt_table != NULL ? 0 : -1;
+}
+
+#endif /* HAVE_LIBCFS_CPT */
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c
new file mode 100644 (file)
index 0000000..8d6c4ad
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+
+/** destroy cpu-partition lock, see libcfs_private.h for more detail */
+void
+cfs_percpt_lock_free(struct cfs_percpt_lock *pcl)
+{
+       LASSERT(pcl->pcl_locks != NULL);
+       LASSERT(!pcl->pcl_locked);
+
+       cfs_percpt_free(pcl->pcl_locks);
+       LIBCFS_FREE(pcl, sizeof(*pcl));
+}
+EXPORT_SYMBOL(cfs_percpt_lock_free);
+
+/**
+ * create cpu-partition lock, see libcfs_private.h for more detail.
+ *
+ * cpu-partition lock is designed for large-scale SMP systems, so we need
+ * to reduce cache-line conflicts as much as we can; that is why we
+ * always allocate cacheline-aligned memory blocks.
+ */
+struct cfs_percpt_lock *
+cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab)
+{
+       struct cfs_percpt_lock  *pcl;
+       spinlock_t              *lock;
+       int                     i;
+
+       /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */
+       LIBCFS_ALLOC(pcl, sizeof(*pcl));
+       if (pcl == NULL)
+               return NULL;
+
+       pcl->pcl_cptab = cptab;
+       pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock));
+       if (pcl->pcl_locks == NULL) {
+               LIBCFS_FREE(pcl, sizeof(*pcl));
+               return NULL;
+       }
+
+       cfs_percpt_for_each(lock, i, pcl->pcl_locks)
+               spin_lock_init(lock);
+
+       return pcl;
+}
+EXPORT_SYMBOL(cfs_percpt_lock_alloc);
+
+/**
+ * lock a CPU partition
+ *
+ * \a index != CFS_PERCPT_LOCK_EX
+ *     hold private lock indexed by \a index
+ *
+ * \a index == CFS_PERCPT_LOCK_EX
+ *     exclusively lock @pcl and nobody can take private lock
+ */
+void
+cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index)
+{
+       int     ncpt = cfs_cpt_number(pcl->pcl_cptab);
+       int     i;
+
+       LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt);
+
+       if (ncpt == 1) {
+               index = 0;
+       } else { /* serialize with exclusive lock */
+               while (pcl->pcl_locked)
+                       cpu_relax();
+       }
+
+       if (likely(index != CFS_PERCPT_LOCK_EX)) {
+               spin_lock(pcl->pcl_locks[index]);
+               return;
+       }
+
+       /* exclusive lock request */
+       for (i = 0; i < ncpt; i++) {
+               spin_lock(pcl->pcl_locks[i]);
+               if (i == 0) {
+                       LASSERT(!pcl->pcl_locked);
+                       /* nobody should take a private lock after this,
+                        * so we cannot starve for too long */
+                       pcl->pcl_locked = 1;
+               }
+       }
+}
+EXPORT_SYMBOL(cfs_percpt_lock);
+
+/** unlock a CPU partition */
+void
+cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index)
+{
+       int     ncpt = cfs_cpt_number(pcl->pcl_cptab);
+       int     i;
+
+       index = ncpt == 1 ? 0 : index;
+
+       if (likely(index != CFS_PERCPT_LOCK_EX)) {
+               spin_unlock(pcl->pcl_locks[index]);
+               return;
+       }
+
+       for (i = ncpt - 1; i >= 0; i--) {
+               if (i == 0) {
+                       LASSERT(pcl->pcl_locked);
+                       pcl->pcl_locked = 0;
+               }
+               spin_unlock(pcl->pcl_locks[i]);
+       }
+}
+EXPORT_SYMBOL(cfs_percpt_unlock);
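+
+/*
+ * A usage sketch (illustrative): common paths lock only their own
+ * partition, while a rare global operation takes the exclusive lock:
+ *
+ *     // fast path: serialize within the current partition only
+ *     cpt = cfs_cpt_current(pcl->pcl_cptab, 0);
+ *     cfs_percpt_lock(pcl, cpt);
+ *     ... touch per-partition state ...
+ *     cfs_percpt_unlock(pcl, cpt);
+ *
+ *     // slow path: exclude all partitions at once
+ *     cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
+ *     ... touch global state ...
+ *     cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);
+ */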
+
+
+/** free cpu-partition refcount */
+void
+cfs_percpt_atomic_free(atomic_t **refs)
+{
+       cfs_percpt_free(refs);
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_free);
+
+/** allocate cpu-partition refcount with initial value @init_val */
+atomic_t **
+cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val)
+{
+       atomic_t        **refs;
+       atomic_t        *ref;
+       int             i;
+
+       refs = cfs_percpt_alloc(cptab, sizeof(*ref));
+       if (refs == NULL)
+               return NULL;
+
+       cfs_percpt_for_each(ref, i, refs)
+               atomic_set(ref, init_val);
+       return refs;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_alloc);
+
+/** return sum of cpu-partition refs */
+int
+cfs_percpt_atomic_summary(atomic_t **refs)
+{
+       atomic_t        *ref;
+       int             i;
+       int             val = 0;
+
+       cfs_percpt_for_each(ref, i, refs)
+               val += atomic_read(ref);
+
+       return val;
+}
+EXPORT_SYMBOL(cfs_percpt_atomic_summary);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c
new file mode 100644 (file)
index 0000000..8791373
--- /dev/null
@@ -0,0 +1,205 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+struct cfs_var_array {
+       unsigned int            va_count;       /* # of buffers */
+       unsigned int            va_size;        /* size of each var */
+       struct cfs_cpt_table    *va_cptab;      /* cpu partition table */
+       void                    *va_ptrs[0];    /* buffer addresses */
+};
+
+/*
+ * free per-cpu data, see more detail in cfs_percpt_alloc
+ */
+void
+cfs_percpt_free(void *vars)
+{
+       struct  cfs_var_array *arr;
+       int     i;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       for (i = 0; i < arr->va_count; i++) {
+               if (arr->va_ptrs[i] != NULL)
+                       LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+       }
+
+       LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+                                 va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_percpt_free);
+
+/*
+ * allocate per cpu-partition variables; the returned value is an array
+ * of pointers which can be indexed by CPU partition ID, i.e.:
+ *
+ *     arr = cfs_percpt_alloc(cfs_cpu_pt, size);
+ *     then caller can access memory block for CPU 0 by arr[0],
+ *     memory block for CPU 1 by arr[1]...
+ *     memory block for CPU N by arr[N]...
+ *
+ * cacheline aligned.
+ */
+void *
+cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size)
+{
+       struct cfs_var_array    *arr;
+       int                     count;
+       int                     i;
+
+       count = cfs_cpt_number(cptab);
+
+       LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+       if (arr == NULL)
+               return NULL;
+
+       arr->va_size    = size = L1_CACHE_ALIGN(size);
+       arr->va_count   = count;
+       arr->va_cptab   = cptab;
+
+       for (i = 0; i < count; i++) {
+               LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size);
+               if (arr->va_ptrs[i] == NULL) {
+                       cfs_percpt_free((void *)&arr->va_ptrs[0]);
+                       return NULL;
+               }
+       }
+
+       return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_percpt_alloc);
+
+/*
+ * return number of CPUs (or number of elements in per-cpu data)
+ * according to cptab of @vars
+ */
+int
+cfs_percpt_number(void *vars)
+{
+       struct cfs_var_array *arr;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       return arr->va_count;
+}
+EXPORT_SYMBOL(cfs_percpt_number);
+
+/*
+ * return the memory block for the current CPU partition
+ */
+void *
+cfs_percpt_current(void *vars)
+{
+       struct cfs_var_array *arr;
+       int    cpt;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+       cpt = cfs_cpt_current(arr->va_cptab, 0);
+       if (cpt < 0)
+               return NULL;
+
+       return arr->va_ptrs[cpt];
+}
+EXPORT_SYMBOL(cfs_percpt_current);
+
+void *
+cfs_percpt_index(void *vars, int idx)
+{
+       struct cfs_var_array *arr;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       LASSERT(idx >= 0 && idx < arr->va_count);
+       return arr->va_ptrs[idx];
+}
+EXPORT_SYMBOL(cfs_percpt_index);
+
+/*
+ * free variable array, see more detail in cfs_array_alloc
+ */
+void
+cfs_array_free(void *vars)
+{
+       struct cfs_var_array    *arr;
+       int                     i;
+
+       arr = container_of(vars, struct cfs_var_array, va_ptrs[0]);
+
+       for (i = 0; i < arr->va_count; i++) {
+               if (arr->va_ptrs[i] == NULL)
+                       continue;
+
+               LIBCFS_FREE(arr->va_ptrs[i], arr->va_size);
+       }
+       LIBCFS_FREE(arr, offsetof(struct cfs_var_array,
+                                 va_ptrs[arr->va_count]));
+}
+EXPORT_SYMBOL(cfs_array_free);
+
+/*
+ * allocate a variable array, returned value is an array of pointers.
+ * Caller can specify length of array by @count, @size is size of each
+ * memory block in array.
+ */
+void *
+cfs_array_alloc(int count, unsigned int size)
+{
+       struct cfs_var_array    *arr;
+       int                     i;
+
+       LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count]));
+       if (arr == NULL)
+               return NULL;
+
+       arr->va_count   = count;
+       arr->va_size    = size;
+
+       for (i = 0; i < count; i++) {
+               LIBCFS_ALLOC(arr->va_ptrs[i], size);
+
+               if (arr->va_ptrs[i] == NULL) {
+                       cfs_array_free((void *)&arr->va_ptrs[0]);
+                       return NULL;
+               }
+       }
+
+       return (void *)&arr->va_ptrs[0];
+}
+EXPORT_SYMBOL(cfs_array_alloc);
diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c
new file mode 100644 (file)
index 0000000..9edccc9
--- /dev/null
@@ -0,0 +1,647 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * String manipulation functions.
+ *
+ * libcfs/libcfs/libcfs_string.c
+ *
+ * Author: Nathan Rutman <nathan.rutman@sun.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/* non-0 = don't match */
+int cfs_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+       if (s1 == NULL || s2 == NULL)
+               return 1;
+
+       if (n == 0)
+               return 0;
+
+       while (n-- != 0 && tolower(*s1) == tolower(*s2)) {
+               if (n == 0 || *s1 == '\0' || *s2 == '\0')
+                       break;
+               s1++;
+               s2++;
+       }
+
+       return tolower(*(unsigned char *)s1) - tolower(*(unsigned char *)s2);
+}
+EXPORT_SYMBOL(cfs_strncasecmp);
+
+/* Convert a text string to a bitmask */
+int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
+                int *oldmask, int minmask, int allmask)
+{
+       const char *debugstr;
+       char op = 0;
+       int newmask = minmask, i, len, found = 0;
+       ENTRY;
+
+       /* <str> must be a list of tokens separated by whitespace
+        * and optionally an operator ('+' or '-').  If an operator
+        * appears first in <str>, '*oldmask' is used as the starting point
+        * (relative), otherwise minmask is used (absolute).  An operator
+        * applies to all following tokens up to the next operator. */
+       while (*str != 0) {
+               while (isspace(*str))
+                       str++;
+               if (*str == 0)
+                       break;
+               if (*str == '+' || *str == '-') {
+                       op = *str++;
+                       if (!found)
+                               /* only if first token is relative */
+                               newmask = *oldmask;
+                       while (isspace(*str))
+                               str++;
+                       if (*str == 0)    /* trailing op */
+                               return -EINVAL;
+               }
+
+               /* find token length */
+               for (len = 0; str[len] != 0 && !isspace(str[len]) &&
+                     str[len] != '+' && str[len] != '-'; len++);
+
+               /* match token */
+               found = 0;
+               for (i = 0; i < 32; i++) {
+                       debugstr = bit2str(i);
+                       if (debugstr != NULL &&
+                           strlen(debugstr) == len &&
+                           cfs_strncasecmp(str, debugstr, len) == 0) {
+                               if (op == '-')
+                                       newmask &= ~(1 << i);
+                               else
+                                       newmask |= (1 << i);
+                               found = 1;
+                               break;
+                       }
+               }
+               if (!found && len == 3 &&
+                   (cfs_strncasecmp(str, "ALL", len) == 0)) {
+                       if (op == '-')
+                               newmask = minmask;
+                       else
+                               newmask = allmask;
+                       found = 1;
+               }
+               if (!found) {
+                       CWARN("unknown mask '%.*s'.\n"
+                             "mask usage: [+|-]<all|type> ...\n", len, str);
+                       return -EINVAL;
+               }
+               str += len;
+       }
+
+       *oldmask = newmask;
+       return 0;
+}
+EXPORT_SYMBOL(cfs_str2mask);
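
A sketch of how a caller might drive cfs_str2mask(); the my_bit2str()
callback and its three bit names are purely illustrative stand-ins for a
subsystem's real bit-name table:

    /* hypothetical bit-name table, bits 0..2 only */
    static const char *my_bit2str(int bit)
    {
            static const char *names[] = { "trace", "inode", "super" };

            return (bit >= 0 && bit < 3) ? names[bit] : NULL;
    }

    static int demo_str2mask(void)
    {
            int mask = 0;

            /* absolute form: mask becomes exactly trace|super = 0x5 */
            if (cfs_str2mask("trace super", my_bit2str, &mask, 0, 0x7) != 0)
                    return -EINVAL;

            /* relative form: starts from *oldmask, leaves mask = 0x6 */
            return cfs_str2mask("+inode -trace", my_bit2str, &mask, 0, 0x7);
    }
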
+
+/* Duplicate a string in a platform-independent way */
+char *cfs_strdup(const char *str, u_int32_t flags)
+{
+       size_t lenz; /* length of str + zero byte */
+       char *dup_str;
+
+       lenz = strlen(str) + 1;
+
+       dup_str = kmalloc(lenz, flags);
+       if (dup_str == NULL)
+               return NULL;
+
+       memcpy(dup_str, str, lenz);
+
+       return dup_str;
+}
+EXPORT_SYMBOL(cfs_strdup);
+
+/**
+ * cfs_{v}snprintf() returns the number of characters actually written,
+ * rather than the number that would have been written as the standard
+ * functions report.
+ */
+/* safe vsnprintf */
+int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+       int i;
+
+       LASSERT(size > 0);
+       i = vsnprintf(buf, size, fmt, args);
+
+       return (i >= size ? size - 1 : i);
+}
+EXPORT_SYMBOL(cfs_vsnprintf);
+
+/* safe snprintf */
+int cfs_snprintf(char *buf, size_t size, const char *fmt, ...)
+{
+       va_list args;
+       int i;
+
+       va_start(args, fmt);
+       i = cfs_vsnprintf(buf, size, fmt, args);
+       va_end(args);
+
+       return i;
+}
+EXPORT_SYMBOL(cfs_snprintf);
+
+/* get the first non-whitespace token out of @str */
+char *cfs_firststr(char *str, size_t size)
+{
+       size_t i = 0;
+       char  *end;
+
+       /* trim leading spaces */
+       while (i < size && *str && isspace(*str)) {
+               ++i;
+               ++str;
+       }
+
+       /* string with all spaces */
+       if (*str == '\0')
+               goto out;
+
+       end = str;
+       while (i < size && *end != '\0' && !isspace(*end)) {
+               ++i;
+               ++end;
+       }
+
+       *end = '\0';
+out:
+       return str;
+}
+EXPORT_SYMBOL(cfs_firststr);
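
A short sketch of cfs_firststr() in use; note it writes a '\0' into the
buffer, so the argument must be writable:

    static void demo_firststr(void)
    {
            char line[] = "  eth0 eth1\n";
            char *tok = cfs_firststr(line, sizeof(line));

            /* tok now points at "eth0"; the trailing " eth1\n" has been
             * cut off by the '\0' written over the first whitespace */
    }
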
+
+char *
+cfs_trimwhite(char *str)
+{
+       char *end;
+
+       while (cfs_iswhite(*str))
+               str++;
+
+       end = str + strlen(str);
+       while (end > str) {
+               if (!cfs_iswhite(end[-1]))
+                       break;
+               end--;
+       }
+
+       *end = 0;
+       return str;
+}
+EXPORT_SYMBOL(cfs_trimwhite);
+
+/**
+ * Extracts tokens from strings.
+ *
+ * Looks for \a delim in string \a next, sets \a res to point to
+ * substring before the delimiter, sets \a next right after the found
+ * delimiter.
+ *
+ * \retval 1 if \a res points to a string of non-whitespace characters
+ * \retval 0 otherwise
+ */
+int
+cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res)
+{
+       char *end;
+
+       if (next->ls_str == NULL)
+               return 0;
+
+       /* skip leading white spaces */
+       while (next->ls_len) {
+               if (!cfs_iswhite(*next->ls_str))
+                       break;
+               next->ls_str++;
+               next->ls_len--;
+       }
+
+       if (next->ls_len == 0) /* whitespaces only */
+               return 0;
+
+       if (*next->ls_str == delim) {
+               /* first non-whitespace character is the delimiter */
+               return 0;
+       }
+
+       res->ls_str = next->ls_str;
+       end = memchr(next->ls_str, delim, next->ls_len);
+       if (end == NULL) {
+               /* no delimiter was found in the string */
+               end = next->ls_str + next->ls_len;
+               next->ls_str = NULL;
+       } else {
+               next->ls_str = end + 1;
+               next->ls_len -= (end - res->ls_str + 1);
+       }
+
+       /* skip ending whitespaces */
+       while (--end != res->ls_str) {
+               if (!cfs_iswhite(*end))
+                       break;
+       }
+
+       res->ls_len = end - res->ls_str + 1;
+       return 1;
+}
+EXPORT_SYMBOL(cfs_gettok);
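
A sketch of iterating comma-separated tokens with cfs_gettok(); the source
buffer is left unmodified and each token is framed by ls_str/ls_len:

    static void demo_gettok(void)
    {
            char buf[] = " a , bb ,c ";
            struct cfs_lstr next = { .ls_str = buf, .ls_len = strlen(buf) };
            struct cfs_lstr tok;

            while (cfs_gettok(&next, ',', &tok)) {
                    /* tok frames "a", then "bb", then "c"; surrounding
                     * whitespace is already trimmed off */
            }
    }
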
+
+/**
+ * Converts string to integer.
+ *
+ * Accepts both decimal and hexadecimal notation.
+ *
+ * \retval 1 if first \a nob chars of \a str convert to decimal or
+ * hexadecimal integer in the range [\a min, \a max]
+ * \retval 0 otherwise
+ */
+int
+cfs_str2num_check(char *str, int nob, unsigned *num,
+                 unsigned min, unsigned max)
+{
+       char    *endp;
+
+       str = cfs_trimwhite(str);
+       *num = strtoul(str, &endp, 0);
+       if (endp == str)
+               return 0;
+
+       for (; endp < str + nob; endp++) {
+               if (!cfs_iswhite(*endp))
+                       return 0;
+       }
+
+       return (*num >= min && *num <= max);
+}
+EXPORT_SYMBOL(cfs_str2num_check);
+
+/**
+ * Parses a \<range_expr\> token of the syntax. If \a bracketed is false,
+ * \a src should only have a single token which can be \<number\> or '*'.
+ *
+ * On success a pointer to the allocated range_expr, with its
+ * range_expr::re_lo, range_expr::re_hi and range_expr::re_stride
+ * initialized, is returned through \a expr if \a src parses to
+ * \<number\> |
+ * \<number\> '-' \<number\> |
+ * \<number\> '-' \<number\> '/' \<number\>
+ *
+ * \retval 0 if \a src could be parsed
+ * \retval -EINVAL or -ENOMEM otherwise
+ */
+int
+cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max,
+                    int bracketed, struct cfs_range_expr **expr)
+{
+       struct cfs_range_expr   *re;
+       struct cfs_lstr         tok;
+
+       LIBCFS_ALLOC(re, sizeof(*re));
+       if (re == NULL)
+               return -ENOMEM;
+
+       if (src->ls_len == 1 && src->ls_str[0] == '*') {
+               re->re_lo = min;
+               re->re_hi = max;
+               re->re_stride = 1;
+               goto out;
+       }
+
+       if (cfs_str2num_check(src->ls_str, src->ls_len,
+                             &re->re_lo, min, max)) {
+               /* <number> is parsed */
+               re->re_hi = re->re_lo;
+               re->re_stride = 1;
+               goto out;
+       }
+
+       if (!bracketed || !cfs_gettok(src, '-', &tok))
+               goto failed;
+
+       if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+                              &re->re_lo, min, max))
+               goto failed;
+
+       /* <number> - */
+       if (cfs_str2num_check(src->ls_str, src->ls_len,
+                             &re->re_hi, min, max)) {
+               /* <number> - <number> is parsed */
+               re->re_stride = 1;
+               goto out;
+       }
+
+       /* go to check <number> '-' <number> '/' <number> */
+       if (cfs_gettok(src, '/', &tok)) {
+               if (!cfs_str2num_check(tok.ls_str, tok.ls_len,
+                                      &re->re_hi, min, max))
+                       goto failed;
+
+               /* <number> - <number> / ... */
+               if (cfs_str2num_check(src->ls_str, src->ls_len,
+                                     &re->re_stride, min, max)) {
+                       /* <number> - <number> / <number> is parsed */
+                       goto out;
+               }
+       }
+
+ out:
+       *expr = re;
+       return 0;
+
+ failed:
+       LIBCFS_FREE(re, sizeof(*re));
+       return -EINVAL;
+}
+EXPORT_SYMBOL(cfs_range_expr_parse);
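
With min = 0 and max = 15, for example, the parser accepts "7" (re_lo =
re_hi = 7, stride 1), "*" (the whole [min, max] range), and, when \a
bracketed is set, "0-15" (stride 1) and "0-15/4" (stride 4). A minimal
sketch of a direct call (buffer content illustrative):

    static int demo_range_expr(void)
    {
            char buf[] = "0-15/4";
            struct cfs_lstr src = { .ls_str = buf, .ls_len = strlen(buf) };
            struct cfs_range_expr *expr;
            int rc;

            /* parses to re_lo = 0, re_hi = 15, re_stride = 4 */
            rc = cfs_range_expr_parse(&src, 0, 15, 1, &expr);
            if (rc == 0)
                    LIBCFS_FREE(expr, sizeof(*expr));
            return rc;
    }
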
+
+/**
+ * Matches value (\a value) against ranges expression list \a expr_list.
+ *
+ * \retval 1 if \a value matches
+ * \retval 0 otherwise
+ */
+int
+cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list)
+{
+       struct cfs_range_expr   *expr;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               if (value >= expr->re_lo && value <= expr->re_hi &&
+                   ((value - expr->re_lo) % expr->re_stride) == 0)
+                       return 1;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(cfs_expr_list_match);
+
+/**
+ * Convert expression list (\a expr_list) to an array of all matched values
+ *
+ * \retval N total number of matched values
+ * \retval 0 if expression list is empty
+ * \retval < 0 for failure
+ */
+int
+cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp)
+{
+       struct cfs_range_expr   *expr;
+       __u32                   *val;
+       int                     count = 0;
+       int                     i;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               for (i = expr->re_lo; i <= expr->re_hi; i++) {
+                       if (((i - expr->re_lo) % expr->re_stride) == 0)
+                               count++;
+               }
+       }
+
+       if (count == 0) /* empty expression list */
+               return 0;
+
+       if (count > max) {
+               CERROR("Number of values %d exceeds max allowed %d\n",
+                      max, count);
+               return -EINVAL;
+       }
+
+       LIBCFS_ALLOC(val, sizeof(val[0]) * count);
+       if (val == NULL)
+               return -ENOMEM;
+
+       count = 0;
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               for (i = expr->re_lo; i <= expr->re_hi; i++) {
+                       if (((i - expr->re_lo) % expr->re_stride) == 0)
+                               val[count++] = i;
+               }
+       }
+
+       *valpp = val;
+       return count;
+}
+EXPORT_SYMBOL(cfs_expr_list_values);
+
+/**
+ * Frees cfs_range_expr structures of \a expr_list.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free(struct cfs_expr_list *expr_list)
+{
+       while (!list_empty(&expr_list->el_exprs)) {
+               struct cfs_range_expr *expr;
+
+               expr = list_entry(expr_list->el_exprs.next,
+                                 struct cfs_range_expr, re_link);
+               list_del(&expr->re_link);
+               LIBCFS_FREE(expr, sizeof(*expr));
+       }
+
+       LIBCFS_FREE(expr_list, sizeof(*expr_list));
+}
+EXPORT_SYMBOL(cfs_expr_list_free);
+
+void
+cfs_expr_list_print(struct cfs_expr_list *expr_list)
+{
+       struct cfs_range_expr *expr;
+
+       list_for_each_entry(expr, &expr_list->el_exprs, re_link) {
+               CDEBUG(D_WARNING, "%d-%d/%d\n",
+                      expr->re_lo, expr->re_hi, expr->re_stride);
+       }
+}
+EXPORT_SYMBOL(cfs_expr_list_print);
+
+/**
+ * Parses \<cfs_expr_list\> token of the syntax.
+ *
+ * \retval 0 if \a str parses to \<number\> | \<expr_list\>
+ * \retval -errno otherwise
+ */
+int
+cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max,
+                   struct cfs_expr_list **elpp)
+{
+       struct cfs_expr_list    *expr_list;
+       struct cfs_range_expr   *expr;
+       struct cfs_lstr         src;
+       int                     rc;
+
+       LIBCFS_ALLOC(expr_list, sizeof(*expr_list));
+       if (expr_list == NULL)
+               return -ENOMEM;
+
+       src.ls_str = str;
+       src.ls_len = len;
+
+       INIT_LIST_HEAD(&expr_list->el_exprs);
+
+       if (src.ls_str[0] == '[' &&
+           src.ls_str[src.ls_len - 1] == ']') {
+               src.ls_str++;
+               src.ls_len -= 2;
+
+               rc = -EINVAL;
+               while (src.ls_str != NULL) {
+                       struct cfs_lstr tok;
+
+                       if (!cfs_gettok(&src, ',', &tok)) {
+                               rc = -EINVAL;
+                               break;
+                       }
+
+                       rc = cfs_range_expr_parse(&tok, min, max, 1, &expr);
+                       if (rc != 0)
+                               break;
+
+                       list_add_tail(&expr->re_link,
+                                         &expr_list->el_exprs);
+               }
+       } else {
+               rc = cfs_range_expr_parse(&src, min, max, 0, &expr);
+               if (rc == 0) {
+                       list_add_tail(&expr->re_link,
+                                         &expr_list->el_exprs);
+               }
+       }
+
+       if (rc != 0)
+               cfs_expr_list_free(expr_list);
+       else
+               *elpp = expr_list;
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_expr_list_parse);
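
Putting the pieces together, a sketch that parses a bracketed list, tests
membership and expands it to an array (buffer content illustrative):

    static int demo_expr_list(void)
    {
            char buf[] = "[0-7/2,9]";
            struct cfs_expr_list *el;
            __u32 *vals;
            int rc;

            rc = cfs_expr_list_parse(buf, strlen(buf), 0, 15, &el);
            if (rc != 0)
                    return rc;

            /* the list matches 0, 2, 4, 6 and 9 */
            LASSERT(cfs_expr_list_match(4, el));

            rc = cfs_expr_list_values(el, 16, &vals);       /* rc == 5 */
            if (rc > 0)
                    LIBCFS_FREE(vals, sizeof(vals[0]) * rc);

            cfs_expr_list_free(el);
            return rc < 0 ? rc : 0;
    }
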
+
+/**
+ * Frees cfs_expr_list structures of \a list.
+ *
+ * For each struct cfs_expr_list structure found on \a list it frees
+ * range_expr list attached to it and frees the cfs_expr_list itself.
+ *
+ * \retval none
+ */
+void
+cfs_expr_list_free_list(struct list_head *list)
+{
+       struct cfs_expr_list *el;
+
+       while (!list_empty(list)) {
+               el = list_entry(list->next,
+                                   struct cfs_expr_list, el_link);
+               list_del(&el->el_link);
+               cfs_expr_list_free(el);
+       }
+}
+EXPORT_SYMBOL(cfs_expr_list_free_list);
+
+int
+cfs_ip_addr_parse(char *str, int len, struct list_head *list)
+{
+       struct cfs_expr_list    *el;
+       struct cfs_lstr         src;
+       int                     rc;
+       int                     i;
+
+       src.ls_str = str;
+       src.ls_len = len;
+       i = 0;
+
+       while (src.ls_str != NULL) {
+               struct cfs_lstr res;
+
+               if (!cfs_gettok(&src, '.', &res)) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+
+               rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el);
+               if (rc != 0)
+                       goto out;
+
+               list_add_tail(&el->el_link, list);
+               i++;
+       }
+
+       if (i == 4)
+               return 0;
+
+       rc = -EINVAL;
+ out:
+       cfs_expr_list_free_list(list);
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_ip_addr_parse);
+
+/**
+ * Matches address (\a addr) against address set encoded in \a list.
+ *
+ * \retval 1 if \a addr matches
+ * \retval 0 otherwise
+ */
+int
+cfs_ip_addr_match(__u32 addr, struct list_head *list)
+{
+       struct cfs_expr_list *el;
+       int i = 0;
+
+       list_for_each_entry_reverse(el, list, el_link) {
+               if (!cfs_expr_list_match(addr & 0xff, el))
+                       return 0;
+               addr >>= 8;
+               i++;
+       }
+
+       return i == 4;
+}
+EXPORT_SYMBOL(cfs_ip_addr_match);
+
+void
+cfs_ip_addr_free(struct list_head *list)
+{
+       cfs_expr_list_free_list(list);
+}
+EXPORT_SYMBOL(cfs_ip_addr_free);
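
A sketch tying the IP helpers together. Note that cfs_ip_addr_match() walks
the four expression lists in reverse because they were appended
most-significant octet first, so the least-significant octet is tested
first:

    static int demo_ip_match(void)
    {
            char buf[] = "192.168.[1-3].*";
            LIST_HEAD(list);
            int rc;

            rc = cfs_ip_addr_parse(buf, strlen(buf), &list);
            if (rc != 0)
                    return rc;

            /* 192.168.2.7 encodes as 0xc0a80207 */
            LASSERT(cfs_ip_addr_match(0xc0a80207, &list));

            cfs_ip_addr_free(&list);
            return 0;
    }
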
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
new file mode 100644 (file)
index 0000000..95142d1
--- /dev/null
@@ -0,0 +1,1085 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/libcfs/libcfs.h>
+
+#ifdef CONFIG_SMP
+
+/**
+ * modparam for setting number of partitions
+ *
+ *  0 : estimate best value based on cores or NUMA nodes
+ *  1 : disable multiple partitions
+ * >1 : specify number of partitions
+ */
+static int     cpu_npartitions;
+CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions");
+
+/**
+ * modparam for setting CPU partition patterns:
+ *
+ * e.g. "0[0,1,2,3] 1[4,5,6,7]": the number before each bracket is the CPU
+ *      partition ID, the numbers inside the brackets are processor IDs
+ *      (core or HT)
+ *
+ * e.g. "N 0[0,1] 1[2,3]": the leading character 'N' means the numbers inside
+ *      the brackets are NUMA node IDs, the number before each bracket is the
+ *      CPU partition ID.
+ *
+ * NB: if the user specifies cpu_pattern, cpu_npartitions is ignored
+ */
+static char    *cpu_pattern = "";
+CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern");
+
+struct cfs_cpt_data {
+       /* serialize hotplug etc */
+       spinlock_t              cpt_lock;
+       /* reserved for hotplug */
+       unsigned long           cpt_version;
+       /* mutex to protect cpt_cpumask */
+       struct semaphore        cpt_mutex;
+       /* scratch buffer for set/unset_node */
+       cpumask_t               *cpt_cpumask;
+};
+
+static struct cfs_cpt_data     cpt_data;
+
+void
+cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
+{
+       /* return cpumask of cores in the same socket */
+       cpumask_copy(mask, topology_core_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_core_siblings);
+
+/* return number of cores in the same socket of \a cpu */
+int
+cfs_cpu_core_nsiblings(int cpu)
+{
+       int     num;
+
+       down(&cpt_data.cpt_mutex);
+
+       cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask);
+       num = cpus_weight(*cpt_data.cpt_cpumask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return num;
+}
+EXPORT_SYMBOL(cfs_cpu_core_nsiblings);
+
+/* return cpumask of HTs in the same core */
+void
+cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
+{
+       cpumask_copy(mask, topology_thread_cpumask(cpu));
+}
+EXPORT_SYMBOL(cfs_cpu_ht_siblings);
+
+/* return number of HTs in the same core of \a cpu */
+int
+cfs_cpu_ht_nsiblings(int cpu)
+{
+       int     num;
+
+       down(&cpt_data.cpt_mutex);
+
+       cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
+       num = cpus_weight(*cpt_data.cpt_cpumask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return num;
+}
+EXPORT_SYMBOL(cfs_cpu_ht_nsiblings);
+
+void
+cfs_node_to_cpumask(int node, cpumask_t *mask)
+{
+       cpumask_copy(mask, cpumask_of_node(node));
+}
+EXPORT_SYMBOL(cfs_node_to_cpumask);
+
+void
+cfs_cpt_table_free(struct cfs_cpt_table *cptab)
+{
+       int     i;
+
+       if (cptab->ctb_cpu2cpt != NULL) {
+               LIBCFS_FREE(cptab->ctb_cpu2cpt,
+                           num_possible_cpus() *
+                           sizeof(cptab->ctb_cpu2cpt[0]));
+       }
+
+       for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
+               struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+               if (part->cpt_nodemask != NULL) {
+                       LIBCFS_FREE(part->cpt_nodemask,
+                                   sizeof(*part->cpt_nodemask));
+               }
+
+               if (part->cpt_cpumask != NULL)
+                       LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
+       }
+
+       if (cptab->ctb_parts != NULL) {
+               LIBCFS_FREE(cptab->ctb_parts,
+                           cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
+       }
+
+       if (cptab->ctb_nodemask != NULL)
+               LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+       if (cptab->ctb_cpumask != NULL)
+               LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
+
+       LIBCFS_FREE(cptab, sizeof(*cptab));
+}
+EXPORT_SYMBOL(cfs_cpt_table_free);
+
+struct cfs_cpt_table *
+cfs_cpt_table_alloc(unsigned int ncpt)
+{
+       struct cfs_cpt_table *cptab;
+       int     i;
+
+       LIBCFS_ALLOC(cptab, sizeof(*cptab));
+       if (cptab == NULL)
+               return NULL;
+
+       cptab->ctb_nparts = ncpt;
+
+       LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
+       LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
+
+       if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
+               goto failed;
+
+       LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
+                    num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+       if (cptab->ctb_cpu2cpt == NULL)
+               goto failed;
+
+       memset(cptab->ctb_cpu2cpt, -1,
+              num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
+
+       LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
+       if (cptab->ctb_parts == NULL)
+               goto failed;
+
+       for (i = 0; i < ncpt; i++) {
+               struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
+
+               LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
+               LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
+               if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
+                       goto failed;
+       }
+
+       spin_lock(&cpt_data.cpt_lock);
+       /* Reserved for hotplug */
+       cptab->ctb_version = cpt_data.cpt_version;
+       spin_unlock(&cpt_data.cpt_lock);
+
+       return cptab;
+
+ failed:
+       cfs_cpt_table_free(cptab);
+       return NULL;
+}
+EXPORT_SYMBOL(cfs_cpt_table_alloc);
+
+int
+cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
+{
+       char    *tmp = buf;
+       int     rc = 0;
+       int     i;
+       int     j;
+
+       for (i = 0; i < cptab->ctb_nparts; i++) {
+               if (len > 0) {
+                       rc = snprintf(tmp, len, "%d\t: ", i);
+                       len -= rc;
+               }
+
+               if (len <= 0) {
+                       rc = -EFBIG;
+                       goto out;
+               }
+
+               tmp += rc;
+               for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) {
+                       rc = snprintf(tmp, len, "%d ", j);
+                       len -= rc;
+                       if (len <= 0) {
+                               rc = -EFBIG;
+                               goto out;
+                       }
+                       tmp += rc;
+               }
+
+               *tmp = '\n';
+               tmp++;
+               len--;
+       }
+
+ out:
+       if (rc < 0)
+               return rc;
+
+       return tmp - buf;
+}
+EXPORT_SYMBOL(cfs_cpt_table_print);
+
+int
+cfs_cpt_number(struct cfs_cpt_table *cptab)
+{
+       return cptab->ctb_nparts;
+}
+EXPORT_SYMBOL(cfs_cpt_number);
+
+int
+cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cpus_weight(*cptab->ctb_cpumask) :
+              cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask);
+}
+EXPORT_SYMBOL(cfs_cpt_weight);
+
+int
+cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS :
+              any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS;
+}
+EXPORT_SYMBOL(cfs_cpt_online);
+
+cpumask_t *
+cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
+}
+EXPORT_SYMBOL(cfs_cpt_cpumask);
+
+nodemask_t *
+cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
+{
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       return cpt == CFS_CPT_ANY ?
+              cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
+}
+EXPORT_SYMBOL(cfs_cpt_nodemask);
+
+int
+cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       int     node;
+
+       LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
+
+       if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) {
+               CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
+               return 0;
+       }
+
+       if (cptab->ctb_cpu2cpt[cpu] != -1) {
+               CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
+                      cpu, cptab->ctb_cpu2cpt[cpu]);
+               return 0;
+       }
+
+       cptab->ctb_cpu2cpt[cpu] = cpt;
+
+       LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask));
+       LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+
+       cpu_set(cpu, *cptab->ctb_cpumask);
+       cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+
+       node = cpu_to_node(cpu);
+
+       /* first CPU of @node in this CPT table */
+       if (!node_isset(node, *cptab->ctb_nodemask))
+               node_set(node, *cptab->ctb_nodemask);
+
+       /* first CPU of @node in this partition */
+       if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
+               node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpu);
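
A hedged sketch of building a tiny two-partition table by hand, assuming
CPUs 0 and 1 are online (normal callers go through the pattern/estimate
paths further below instead):

    static struct cfs_cpt_table *demo_two_cpts(void)
    {
            struct cfs_cpt_table *cptab = cfs_cpt_table_alloc(2);

            if (cptab == NULL)
                    return NULL;

            /* CPU 0 into partition 0, CPU 1 into partition 1 */
            if (!cfs_cpt_set_cpu(cptab, 0, 0) ||
                !cfs_cpt_set_cpu(cptab, 1, 1)) {
                    cfs_cpt_table_free(cptab);
                    return NULL;
            }
            return cptab;
    }
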
+
+void
+cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
+{
+       int     node;
+       int     i;
+
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       if (cpu < 0 || cpu >= NR_CPUS) {
+               CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
+               return;
+       }
+
+       if (cpt == CFS_CPT_ANY) {
+               /* caller doesn't know the partition ID */
+               cpt = cptab->ctb_cpu2cpt[cpu];
+               if (cpt < 0) { /* not set in this CPT-table */
+                       CDEBUG(D_INFO, "Try to unset cpu %d which is "
+                                      "not in CPT-table %p\n", cpt, cptab);
+                       return;
+               }
+
+       } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
+               CDEBUG(D_INFO,
+                      "CPU %d is not in cpu-partition %d\n", cpu, cpt);
+               return;
+       }
+
+       LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask));
+       LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask));
+
+       cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask);
+       cpu_clear(cpu, *cptab->ctb_cpumask);
+       cptab->ctb_cpu2cpt[cpu] = -1;
+
+       node = cpu_to_node(cpu);
+
+       LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
+       LASSERT(node_isset(node, *cptab->ctb_nodemask));
+
+       for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) {
+               /* this CPT has other CPU belonging to this node? */
+               if (cpu_to_node(i) == node)
+                       break;
+       }
+
+       if (i == NR_CPUS)
+               node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
+
+       for_each_cpu_mask(i, *cptab->ctb_cpumask) {
+               /* this CPT-table has other CPU belonging to this node? */
+               if (cpu_to_node(i) == node)
+                       break;
+       }
+
+       if (i == NR_CPUS)
+               node_clear(node, *cptab->ctb_nodemask);
+
+       return;
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpu);
+
+int
+cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       int     i;
+
+       if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) {
+               CDEBUG(D_INFO, "No online CPU is found in the CPU mask "
+                              "for CPU partition %d\n", cpt);
+               return 0;
+       }
+
+       for_each_cpu_mask(i, *mask) {
+               if (!cfs_cpt_set_cpu(cptab, cpt, i))
+                       return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_cpumask);
+
+void
+cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
+{
+       int     i;
+
+       for_each_cpu_mask(i, *mask)
+               cfs_cpt_unset_cpu(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
+
+int
+cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       cpumask_t       *mask;
+       int             rc;
+
+       if (node < 0 || node >= MAX_NUMNODES) {
+               CDEBUG(D_INFO,
+                      "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+               return 0;
+       }
+
+       down(&cpt_data.cpt_mutex);
+
+       mask = cpt_data.cpt_cpumask;
+       cfs_node_to_cpumask(node, mask);
+
+       rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
+
+       up(&cpt_data.cpt_mutex);
+
+       return rc;
+}
+EXPORT_SYMBOL(cfs_cpt_set_node);
+
+void
+cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
+{
+       cpumask_t *mask;
+
+       if (node < 0 || node >= MAX_NUMNODES) {
+               CDEBUG(D_INFO,
+                      "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
+               return;
+       }
+
+       down(&cpt_data.cpt_mutex);
+
+       mask = cpt_data.cpt_cpumask;
+       cfs_node_to_cpumask(node, mask);
+
+       cfs_cpt_unset_cpumask(cptab, cpt, mask);
+
+       up(&cpt_data.cpt_mutex);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_node);
+
+int
+cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       int     i;
+
+       for_each_node_mask(i, *mask) {
+               if (!cfs_cpt_set_node(cptab, cpt, i))
+                       return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_cpt_set_nodemask);
+
+void
+cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
+{
+       int     i;
+
+       for_each_node_mask(i, *mask)
+               cfs_cpt_unset_node(cptab, cpt, i);
+}
+EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
+
+void
+cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
+{
+       int     last;
+       int     i;
+
+       if (cpt == CFS_CPT_ANY) {
+               last = cptab->ctb_nparts - 1;
+               cpt = 0;
+       } else {
+               last = cpt;
+       }
+
+       for (; cpt <= last; cpt++) {
+               for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask)
+                       cfs_cpt_unset_cpu(cptab, cpt, i);
+       }
+}
+EXPORT_SYMBOL(cfs_cpt_clear);
+
+int
+cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
+{
+       nodemask_t      *mask;
+       int             weight;
+       int             rotor;
+       int             node;
+
+       /* convert CPU partition ID to HW node id */
+
+       if (cpt < 0 || cpt >= cptab->ctb_nparts) {
+               mask = cptab->ctb_nodemask;
+               rotor = cptab->ctb_spread_rotor++;
+       } else {
+               mask = cptab->ctb_parts[cpt].cpt_nodemask;
+               rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
+       }
+
+       weight = nodes_weight(*mask);
+       LASSERT(weight > 0);
+
+       rotor %= weight;
+
+       for_each_node_mask(node, *mask) {
+               if (rotor-- == 0)
+                       return node;
+       }
+
+       LBUG();
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_spread_node);
+
+int
+cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
+{
+       int     cpu = smp_processor_id();
+       int     cpt = cptab->ctb_cpu2cpt[cpu];
+
+       if (cpt < 0) {
+               if (!remap)
+                       return cpt;
+
+               /* don't return a negative value, for the safety of the upper
+                * layer; instead map the unknown CPU to a valid partition ID */
+               cpt = cpu % cptab->ctb_nparts;
+       }
+
+       return cpt;
+}
+EXPORT_SYMBOL(cfs_cpt_current);
+
+int
+cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
+{
+       LASSERT(cpu >= 0 && cpu < NR_CPUS);
+
+       return cptab->ctb_cpu2cpt[cpu];
+}
+EXPORT_SYMBOL(cfs_cpt_of_cpu);
+
+int
+cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
+{
+       cpumask_t       *cpumask;
+       nodemask_t      *nodemask;
+       int             rc;
+       int             i;
+
+       LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
+
+       if (cpt == CFS_CPT_ANY) {
+               cpumask = cptab->ctb_cpumask;
+               nodemask = cptab->ctb_nodemask;
+       } else {
+               cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
+               nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
+       }
+
+       if (any_online_cpu(*cpumask) == NR_CPUS) {
+               CERROR("No online CPU found in CPU partition %d, did someone "
+                      "do CPU hotplug on system? You might need to reload "
+                      "Lustre modules to keep system working well.\n", cpt);
+               return -EINVAL;
+       }
+
+       for_each_online_cpu(i) {
+               if (cpu_isset(i, *cpumask))
+                       continue;
+
+               rc = set_cpus_allowed_ptr(current, cpumask);
+               set_mems_allowed(*nodemask);
+               if (rc == 0)
+                       schedule(); /* switch to allowed CPU */
+
+               return rc;
+       }
+
+       /* don't need to set affinity because all online CPUs are covered */
+       return 0;
+}
+EXPORT_SYMBOL(cfs_cpt_bind);
+
+/**
+ * Choose at most \a number CPUs from \a node and set them in \a cpt.
+ * We always prefer to choose CPU in the same core/socket.
+ */
+static int
+cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
+                    cpumask_t *node, int number)
+{
+       cpumask_t       *socket = NULL;
+       cpumask_t       *core = NULL;
+       int             rc = 0;
+       int             cpu;
+
+       LASSERT(number > 0);
+
+       if (number >= cpus_weight(*node)) {
+               while (!cpus_empty(*node)) {
+                       cpu = first_cpu(*node);
+
+                       rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
+                       if (!rc)
+                               return -EINVAL;
+                       cpu_clear(cpu, *node);
+               }
+               return 0;
+       }
+
+       /* allocate scratch buffer */
+       LIBCFS_ALLOC(socket, cpumask_size());
+       LIBCFS_ALLOC(core, cpumask_size());
+       if (socket == NULL || core == NULL) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       while (!cpus_empty(*node)) {
+               cpu = first_cpu(*node);
+
+               /* get cpumask for cores in the same socket */
+               cfs_cpu_core_siblings(cpu, socket);
+               cpus_and(*socket, *socket, *node);
+
+               LASSERT(!cpus_empty(*socket));
+
+               while (!cpus_empty(*socket)) {
+                       int     i;
+
+                       /* get cpumask for hts in the same core */
+                       cfs_cpu_ht_siblings(cpu, core);
+                       cpus_and(*core, *core, *node);
+
+                       LASSERT(!cpus_empty(*core));
+
+                       for_each_cpu_mask(i, *core) {
+                               cpu_clear(i, *socket);
+                               cpu_clear(i, *node);
+
+                               rc = cfs_cpt_set_cpu(cptab, cpt, i);
+                               if (!rc) {
+                                       rc = -EINVAL;
+                                       goto out;
+                               }
+
+                               if (--number == 0)
+                                       goto out;
+                       }
+                       cpu = first_cpu(*socket);
+               }
+       }
+
+ out:
+       if (socket != NULL)
+               LIBCFS_FREE(socket, cpumask_size());
+       if (core != NULL)
+               LIBCFS_FREE(core, cpumask_size());
+       return rc;
+}
+
+#define CPT_WEIGHT_MIN  4u
+
+static unsigned int
+cfs_cpt_num_estimate(void)
+{
+       unsigned nnode = num_online_nodes();
+       unsigned ncpu  = num_online_cpus();
+       unsigned ncpt;
+
+       if (ncpu <= CPT_WEIGHT_MIN) {
+               ncpt = 1;
+               goto out;
+       }
+
+       /* generate a reasonable number of CPU partitions based on the total
+        * number of CPUs; the preferred N is a power of 2 satisfying:
+        * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
+       for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}
+
+       if (ncpt <= nnode) { /* fat numa system */
+               while (nnode > ncpt)
+                       nnode >>= 1;
+
+       } else { /* ncpt > nnode */
+               while ((nnode << 1) <= ncpt)
+                       nnode <<= 1;
+       }
+
+       ncpt = nnode;
+
+ out:
+#if (BITS_PER_LONG == 32)
+       /* configuring many CPU partitions on a 32-bit system could consume
+        * too much memory */
+       ncpt = min(2U, ncpt);
+#endif
+       while (ncpu % ncpt != 0)
+               ncpt--; /* worst case is 1 */
+
+       return ncpt;
+}
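
Worked example: with 16 online CPUs on 2 NUMA nodes, the loop stops at
ncpt = 4 (16 > 2*2*2 but 16 <= 2*4*4); since ncpt > nnode, nnode doubles
to 4 and ncpt becomes 4. As 16 % 4 == 0, the trailing adjustment loop
leaves the result at 4 partitions of 4 CPUs each.
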
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create(int ncpt)
+{
+       struct cfs_cpt_table *cptab = NULL;
+       cpumask_t       *mask = NULL;
+       int             cpt = 0;
+       int             num;
+       int             rc;
+       int             i;
+
+       rc = cfs_cpt_num_estimate();
+       if (ncpt <= 0)
+               ncpt = rc;
+
+       if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
+               CWARN("CPU partition number %d is larger than suggested "
+                     "value (%d), your system may have performance"
+                     "issue or run out of memory while under pressure\n",
+                     ncpt, rc);
+       }
+
+       if (num_online_cpus() % ncpt != 0) {
+               CERROR("CPU number %d is not multiple of cpu_npartition %d, "
+                      "please try different cpu_npartitions value or"
+                      "set pattern string by cpu_pattern=STRING\n",
+                      (int)num_online_cpus(), ncpt);
+               goto failed;
+       }
+
+       cptab = cfs_cpt_table_alloc(ncpt);
+       if (cptab == NULL) {
+               CERROR("Failed to allocate CPU map(%d)\n", ncpt);
+               goto failed;
+       }
+
+       num = num_online_cpus() / ncpt;
+       if (num == 0) {
+               CERROR("CPU changed while setting CPU partition\n");
+               goto failed;
+       }
+
+       LIBCFS_ALLOC(mask, cpumask_size());
+       if (mask == NULL) {
+               CERROR("Failed to allocate scratch cpumask\n");
+               goto failed;
+       }
+
+       for_each_online_node(i) {
+               cfs_node_to_cpumask(i, mask);
+
+               while (!cpus_empty(*mask)) {
+                       struct cfs_cpu_partition *part;
+                       int    n;
+
+                       if (cpt >= ncpt)
+                               goto failed;
+
+                       part = &cptab->ctb_parts[cpt];
+
+                       n = num - cpus_weight(*part->cpt_cpumask);
+                       LASSERT(n > 0);
+
+                       rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
+                       if (rc < 0)
+                               goto failed;
+
+                       LASSERT(num >= cpus_weight(*part->cpt_cpumask));
+                       if (num == cpus_weight(*part->cpt_cpumask))
+                               cpt++;
+               }
+       }
+
+       if (cpt != ncpt ||
+           num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
+               CERROR("Expect %d(%d) CPU partitions but got %d(%d), "
+                      "CPU hotplug/unplug while setting?\n",
+                      cptab->ctb_nparts, num, cpt,
+                      cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask));
+               goto failed;
+       }
+
+       LIBCFS_FREE(mask, cpumask_size());
+
+       return cptab;
+
+ failed:
+       CERROR("Failed to setup CPU-partition-table with %d "
+              "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
+              ncpt, num_online_nodes(), num_online_cpus());
+
+       if (mask != NULL)
+               LIBCFS_FREE(mask, cpumask_size());
+
+       if (cptab != NULL)
+               cfs_cpt_table_free(cptab);
+
+       return NULL;
+}
+
+static struct cfs_cpt_table *
+cfs_cpt_table_create_pattern(char *pattern)
+{
+       struct cfs_cpt_table    *cptab;
+       char                    *str    = pattern;
+       int                     node    = 0;
+       int                     high;
+       int                     ncpt;
+       int                     c;
+
+       for (ncpt = 0;; ncpt++) { /* quick scan for brackets */
+               str = strchr(str, '[');
+               if (str == NULL)
+                       break;
+               str++;
+       }
+
+       str = cfs_trimwhite(pattern);
+       if (*str == 'n' || *str == 'N') {
+               pattern = str + 1;
+               node = 1;
+       }
+
+       if (ncpt == 0 ||
+           (node && ncpt > num_online_nodes()) ||
+           (!node && ncpt > num_online_cpus())) {
+               CERROR("Invalid pattern %s, or too many partitions %d\n",
+                      pattern, ncpt);
+               return NULL;
+       }
+
+       high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1;
+
+       cptab = cfs_cpt_table_alloc(ncpt);
+       if (cptab == NULL) {
+               CERROR("Failed to allocate cpu partition table\n");
+               return NULL;
+       }
+
+       for (str = cfs_trimwhite(pattern), c = 0;; c++) {
+               struct cfs_range_expr   *range;
+               struct cfs_expr_list    *el;
+               char                    *bracket = strchr(str, '[');
+               int                     cpt;
+               int                     rc;
+               int                     i;
+               int                     n;
+
+               if (bracket == NULL) {
+                       if (*str != 0) {
+                               CERROR("Invalid pattern %s\n", str);
+                               goto failed;
+                       } else if (c != ncpt) {
+                               CERROR("expect %d partitions but found %d\n",
+                                      ncpt, c);
+                               goto failed;
+                       }
+                       break;
+               }
+
+               if (sscanf(str, "%u%n", &cpt, &n) < 1) {
+                       CERROR("Invalid cpu pattern %s\n", str);
+                       goto failed;
+               }
+
+               if (cpt < 0 || cpt >= ncpt) {
+                       CERROR("Invalid partition id %d, total partitions %d\n",
+                              cpt, ncpt);
+                       goto failed;
+               }
+
+               if (cfs_cpt_weight(cptab, cpt) != 0) {
+                       CERROR("Partition %d has already been set.\n", cpt);
+                       goto failed;
+               }
+
+               str = cfs_trimwhite(str + n);
+               if (str != bracket) {
+                       CERROR("Invalid pattern %s\n", str);
+                       goto failed;
+               }
+
+               bracket = strchr(str, ']');
+               if (bracket == NULL) {
+                       CERROR("missing right bracket for cpt %d, %s\n",
+                              cpt, str);
+                       goto failed;
+               }
+
+               if (cfs_expr_list_parse(str, (bracket - str) + 1,
+                                       0, high, &el) != 0) {
+                       CERROR("Can't parse number range: %s\n", str);
+                       goto failed;
+               }
+
+               list_for_each_entry(range, &el->el_exprs, re_link) {
+                       for (i = range->re_lo; i <= range->re_hi; i++) {
+                               if ((i - range->re_lo) % range->re_stride != 0)
+                                       continue;
+
+                               rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
+                                           cfs_cpt_set_cpu(cptab, cpt, i);
+                               if (!rc) {
+                                       cfs_expr_list_free(el);
+                                       goto failed;
+                               }
+                       }
+               }
+
+               cfs_expr_list_free(el);
+
+               if (!cfs_cpt_online(cptab, cpt)) {
+                       CERROR("No online CPU is found on partition %d\n", cpt);
+                       goto failed;
+               }
+
+               str = cfs_trimwhite(bracket + 1);
+       }
+
+       return cptab;
+
+ failed:
+       cfs_cpt_table_free(cptab);
+       return NULL;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int
+cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int  cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               spin_lock(&cpt_data.cpt_lock);
+               cpt_data.cpt_version++;
+               spin_unlock(&cpt_data.cpt_lock);
+               /* fall through: warn on any hotplug event */
+       default:
+               CWARN("Lustre: can't support CPU hotplug well now, "
+                     "performance and stability could be impacted "
+                     "[CPU %u notify: %lx]\n", cpu, action);
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block cfs_cpu_notifier = {
+       .notifier_call  = cfs_cpu_notify,
+       .priority       = 0
+};
+
+#endif
+
+void
+cfs_cpu_fini(void)
+{
+       if (cfs_cpt_table != NULL)
+               cfs_cpt_table_free(cfs_cpt_table);
+
+#ifdef CONFIG_HOTPLUG_CPU
+       unregister_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+       if (cpt_data.cpt_cpumask != NULL)
+               LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
+}
+
+int
+cfs_cpu_init(void)
+{
+       LASSERT(cfs_cpt_table == NULL);
+
+       memset(&cpt_data, 0, sizeof(cpt_data));
+
+       LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
+       if (cpt_data.cpt_cpumask == NULL) {
+               CERROR("Failed to allocate scratch buffer\n");
+               return -1;
+       }
+
+       spin_lock_init(&cpt_data.cpt_lock);
+       sema_init(&cpt_data.cpt_mutex, 1);
+
+#ifdef CONFIG_HOTPLUG_CPU
+       register_hotcpu_notifier(&cfs_cpu_notifier);
+#endif
+
+       if (*cpu_pattern != 0) {
+               cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
+               if (cfs_cpt_table == NULL) {
+                       CERROR("Failed to create cptab from pattern %s\n",
+                              cpu_pattern);
+                       goto failed;
+               }
+
+       } else {
+               cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
+               if (cfs_cpt_table == NULL) {
+                       CERROR("Failed to create ptable with npartitions %d\n",
+                              cpu_npartitions);
+                       goto failed;
+               }
+       }
+
+       spin_lock(&cpt_data.cpt_lock);
+       if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
+               spin_unlock(&cpt_data.cpt_lock);
+               CERROR("CPU hotplug/unplug during setup\n");
+               goto failed;
+       }
+       spin_unlock(&cpt_data.cpt_lock);
+
+       LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
+                num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
+       return 0;
+
+ failed:
+       cfs_cpu_fini();
+       return -1;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c
new file mode 100644 (file)
index 0000000..20b2d61
--- /dev/null
@@ -0,0 +1,144 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ */
+
+/*
+ * This is crypto api shash wrappers to zlib_adler32.
+ */
+
+#include <linux/module.h>
+#include <linux/zutil.h>
+#include <crypto/internal/hash.h>
+
+
+#define CHKSUM_BLOCK_SIZE      1
+#define CHKSUM_DIGEST_SIZE     4
+
+
+static u32 __adler32(u32 cksum, unsigned char const *p, size_t len)
+{
+       return zlib_adler32(cksum, p, len);
+}
+
+static int adler32_cra_init(struct crypto_tfm *tfm)
+{
+       u32 *key = crypto_tfm_ctx(tfm);
+
+       *key = 1;
+
+       return 0;
+}
+
+static int adler32_setkey(struct crypto_shash *hash, const u8 *key,
+                         unsigned int keylen)
+{
+       u32 *mctx = crypto_shash_ctx(hash);
+
+       if (keylen != sizeof(u32)) {
+               crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       *mctx = *(u32 *)key;
+       return 0;
+}
+
+static int adler32_init(struct shash_desc *desc)
+{
+       u32 *mctx = crypto_shash_ctx(desc->tfm);
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *cksump = *mctx;
+
+       return 0;
+}
+
+static int adler32_update(struct shash_desc *desc, const u8 *data,
+                         unsigned int len)
+{
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *cksump = __adler32(*cksump, data, len);
+       return 0;
+}
+
+static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len,
+                          u8 *out)
+{
+       *(u32 *)out = __adler32(*cksump, data, len);
+       return 0;
+}
+
+static int adler32_finup(struct shash_desc *desc, const u8 *data,
+                        unsigned int len, u8 *out)
+{
+       return __adler32_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int adler32_final(struct shash_desc *desc, u8 *out)
+{
+       u32 *cksump = shash_desc_ctx(desc);
+
+       *(u32 *)out = *cksump;
+       return 0;
+}
+
+static int adler32_digest(struct shash_desc *desc, const u8 *data,
+                         unsigned int len, u8 *out)
+{
+       return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, out);
+}
+
+static struct shash_alg alg = {
+       .setkey         = adler32_setkey,
+       .init           = adler32_init,
+       .update         = adler32_update,
+       .final          = adler32_final,
+       .finup          = adler32_finup,
+       .digest         = adler32_digest,
+       .descsize       = sizeof(u32),
+       .digestsize     = CHKSUM_DIGEST_SIZE,
+       .base           = {
+               .cra_name               = "adler32",
+               .cra_driver_name        = "adler32-zlib",
+               .cra_priority           = 100,
+               .cra_blocksize          = CHKSUM_BLOCK_SIZE,
+               .cra_ctxsize            = sizeof(u32),
+               .cra_module             = THIS_MODULE,
+               .cra_init               = adler32_cra_init,
+       }
+};
+
+
+int cfs_crypto_adler32_register(void)
+{
+       return crypto_register_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_register);
+
+void cfs_crypto_adler32_unregister(void)
+{
+       crypto_unregister_shash(&alg);
+}
+EXPORT_SYMBOL(cfs_crypto_adler32_unregister);
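
Once registered, the hash is reachable through the regular kernel shash API
under the name "adler32". A hedged sketch of a one-shot digest (the
on-stack descriptor sizing assumes descsize == sizeof(u32), as declared
above):

    #include <crypto/hash.h>

    static int demo_adler32(const u8 *data, unsigned int len, u32 *out)
    {
            struct crypto_shash *tfm = crypto_alloc_shash("adler32", 0, 0);
            struct {
                    struct shash_desc shash;
                    char ctx[sizeof(u32)];          /* matches .descsize */
            } desc;
            int rc;

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            desc.shash.tfm = tfm;
            desc.shash.flags = 0;
            rc = crypto_shash_digest(&desc.shash, data, len, (u8 *)out);
            crypto_free_shash(tfm);
            return rc;
    }
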
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c
new file mode 100644 (file)
index 0000000..8e35777
--- /dev/null
@@ -0,0 +1,289 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please  visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/linux-crypto.h>
+/**
+ * Array of hash algorithm speeds in MB per second
+ */
+static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX];
+
+
+
+static int cfs_crypto_hash_alloc(unsigned char alg_id,
+                                const struct cfs_crypto_hash_type **type,
+                                struct hash_desc *desc, unsigned char *key,
+                                unsigned int key_len)
+{
+       int     err = 0;
+
+       *type = cfs_crypto_hash_type(alg_id);
+
+       if (*type == NULL) {
+               CWARN("Unsupported hash algorithm id = %d, max id is %d\n",
+                     alg_id, CFS_HASH_ALG_MAX);
+               return -EINVAL;
+       }
+       desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0);
+
+       if (desc->tfm == NULL)
+               return -EINVAL;
+
+       if (IS_ERR(desc->tfm)) {
+               CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n",
+                      (*type)->cht_name);
+               return PTR_ERR(desc->tfm);
+       }
+
+       desc->flags = 0;
+
+       /* Shash has different initialization logic than digest:
+        *   shash:  crypto_hash_setkey, crypto_hash_init
+        *   digest: crypto_digest_init, crypto_digest_setkey
+        * Skip this step for digest, because we use the shash logic in
+        * cfs_crypto_hash_alloc.
+        */
+       if (key != NULL) {
+               err = crypto_hash_setkey(desc->tfm, key, key_len);
+       } else if ((*type)->cht_key != 0) {
+               err = crypto_hash_setkey(desc->tfm,
+                                        (unsigned char *)&((*type)->cht_key),
+                                        (*type)->cht_size);
+       }
+
+       if (err != 0) {
+               crypto_free_hash(desc->tfm);
+               return err;
+       }
+
+       CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n",
+              (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name,
+              (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name,
+              cfs_crypto_hash_speeds[alg_id]);
+
+       return crypto_hash_init(desc);
+}
+
+int cfs_crypto_hash_digest(unsigned char alg_id,
+                          const void *buf, unsigned int buf_len,
+                          unsigned char *key, unsigned int key_len,
+                          unsigned char *hash, unsigned int *hash_len)
+{
+       struct scatterlist      sl;
+       struct hash_desc        hdesc;
+       int                     err;
+       const struct cfs_crypto_hash_type       *type;
+
+       if (buf == NULL || buf_len == 0 || hash_len == NULL)
+               return -EINVAL;
+
+       err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len);
+       if (err != 0)
+               return err;
+
+       if (hash == NULL || *hash_len < type->cht_size) {
+               *hash_len = type->cht_size;
+               crypto_free_hash(hdesc.tfm);
+               return -ENOSPC;
+       }
+       sg_init_one(&sl, (void *)buf, buf_len);
+
+       hdesc.flags = 0;
+       err = crypto_hash_digest(&hdesc, &sl, sl.length, hash);
+       crypto_free_hash(hdesc.tfm);
+
+       return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_digest);
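+
+/*
+ * A minimal usage sketch for the one-shot interface above (kept under
+ * "#if 0" so it is never built).  Passing hash == NULL first is the
+ * intended way to discover the digest size; the adler32 algorithm id
+ * is an illustrative assumption.
+ */
+#if 0
+static int example_one_shot_digest(const void *buf, unsigned int buf_len)
+{
+       unsigned char digest[64];
+       unsigned int  digest_len = 0;
+       int           rc;
+
+       /* First call fails with -ENOSPC and reports the required size. */
+       rc = cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, buf_len,
+                                   NULL, 0, NULL, &digest_len);
+       if (rc != -ENOSPC || digest_len > sizeof(digest))
+               return rc;
+
+       return cfs_crypto_hash_digest(CFS_HASH_ALG_ADLER32, buf, buf_len,
+                                     NULL, 0, digest, &digest_len);
+}
+#endif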
+
+struct cfs_crypto_hash_desc *
+       cfs_crypto_hash_init(unsigned char alg_id,
+                            unsigned char *key, unsigned int key_len)
+{
+
+       struct  hash_desc       *hdesc;
+       int                  err;
+       const struct cfs_crypto_hash_type       *type;
+
+       hdesc = kmalloc(sizeof(*hdesc), GFP_KERNEL);
+       if (hdesc == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len);
+
+       if (err) {
+               kfree(hdesc);
+               return ERR_PTR(err);
+       }
+       return (struct cfs_crypto_hash_desc *)hdesc;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_init);
+
+int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc,
+                               struct page *page, unsigned int offset,
+                               unsigned int len)
+{
+       struct scatterlist sl;
+
+       sg_init_table(&sl, 1);
+       sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK);
+
+       return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update_page);
+
+int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc,
+                          const void *buf, unsigned int buf_len)
+{
+       struct scatterlist sl;
+
+       sg_init_one(&sl, (void *)buf, buf_len);
+
+       return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length);
+}
+EXPORT_SYMBOL(cfs_crypto_hash_update);
+
+/*      If hash_len pointer is NULL - destroy descriptor. */
+int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc,
+                         unsigned char *hash, unsigned int *hash_len)
+{
+       int     err;
+       int     size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm);
+
+       if (hash_len == NULL) {
+               crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+               kfree(hdesc);
+               return 0;
+       }
+       if (hash == NULL || *hash_len < size) {
+               *hash_len = size;
+               return -ENOSPC;
+       }
+       err = crypto_hash_final((struct hash_desc *) hdesc, hash);
+
+       if (err < 0) {
+               /* the caller may be able to handle the error */
+               return err;
+       }
+       crypto_free_hash(((struct hash_desc *)hdesc)->tfm);
+       kfree(hdesc);
+       return err;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_final);
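+
+/*
+ * A minimal usage sketch for the streaming interface above (kept under
+ * "#if 0" so it is never built).  The adler32 algorithm id and the
+ * 64-byte digest buffer are illustrative assumptions.
+ */
+#if 0
+static int example_streaming_digest(const void *a, unsigned int a_len,
+                                   const void *b, unsigned int b_len)
+{
+       struct cfs_crypto_hash_desc *desc;
+       unsigned char digest[64];
+       unsigned int  digest_len = sizeof(digest);
+       int           rc;
+
+       /* NULL key: the algorithm default key, if any, is used. */
+       desc = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
+       if (IS_ERR(desc))
+               return PTR_ERR(desc);
+
+       rc = cfs_crypto_hash_update(desc, a, a_len);
+       if (rc == 0)
+               rc = cfs_crypto_hash_update(desc, b, b_len);
+       if (rc != 0) {
+               /* hash_len == NULL only destroys the descriptor. */
+               cfs_crypto_hash_final(desc, NULL, NULL);
+               return rc;
+       }
+       /* On success, final writes the digest and frees the descriptor. */
+       return cfs_crypto_hash_final(desc, digest, &digest_len);
+}
+#endif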
+
+static void cfs_crypto_performance_test(unsigned char alg_id,
+                                       const unsigned char *buf,
+                                       unsigned int buf_len)
+{
+       unsigned long              start, end;
+       int                          bcount, err = 0;
+       int                          sec = 1; /* do test only 1 sec */
+       unsigned char              hash[64];
+       unsigned int                hash_len = 64;
+
+       for (start = jiffies, end = start + sec * HZ, bcount = 0;
+            time_before(jiffies, end); bcount++) {
+               err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0,
+                                            hash, &hash_len);
+               if (err)
+                       break;
+
+       }
+       end = jiffies;
+
+       if (err) {
+               cfs_crypto_hash_speeds[alg_id] =  -1;
+               CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n",
+                      cfs_crypto_hash_name(alg_id), err);
+       } else {
+               unsigned long   tmp;
+               tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) *
+                      1000) / (1024 * 1024);
+               cfs_crypto_hash_speeds[alg_id] = (int)tmp;
+       }
+       CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n",
+              cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]);
+}
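+
+/*
+ * Worked example of the speed computation above, with illustrative
+ * numbers: 3200 iterations over a 128 KiB buffer in 1000 ms gives
+ * ((3200 * 131072 / 1000) * 1000) / (1024 * 1024) ~= 400 MB/s.
+ */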
+
+int cfs_crypto_hash_speed(unsigned char hash_alg)
+{
+       if (hash_alg < CFS_HASH_ALG_MAX)
+               return cfs_crypto_hash_speeds[hash_alg];
+       else
+               return -1;
+}
+EXPORT_SYMBOL(cfs_crypto_hash_speed);
+
+/**
+ * Do performance test for all hash algorithms.
+ */
+static int cfs_crypto_test_hashes(void)
+{
+       unsigned char      i;
+       unsigned char      *data;
+       unsigned int        j;
+       /* Data block size for testing hash; the maximum kmalloc size
+        * on 2.6.18 kernels is 128K. */
+       unsigned int        data_len = 1 * 128 * 1024;
+
+       data = kmalloc(data_len, GFP_KERNEL);
+       if (data == NULL) {
+               CERROR("Failed to allocate mem\n");
+               return -ENOMEM;
+       }
+
+       for (j = 0; j < data_len; j++)
+               data[j] = j & 0xff;
+
+       for (i = 0; i < CFS_HASH_ALG_MAX; i++)
+               cfs_crypto_performance_test(i, data, data_len);
+
+       kfree(data);
+       return 0;
+}
+
+static int adler32;
+
+int cfs_crypto_register(void)
+{
+       adler32 = cfs_crypto_adler32_register();
+
+       /* check all algorithms and do performance test */
+       cfs_crypto_test_hashes();
+       return 0;
+}
+void cfs_crypto_unregister(void)
+{
+       if (adler32 == 0)
+               cfs_crypto_adler32_unregister();
+
+       return;
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c
new file mode 100644 (file)
index 0000000..f236510
--- /dev/null
@@ -0,0 +1,339 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-curproc.c
+ *
+ * Lustre curproc API implementation for Linux kernel
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+
+#include <linux/compat.h>
+#include <linux/thread_info.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for Linux kernel.
+ */
+
+int    cfs_curproc_groups_nr(void)
+{
+       int nr;
+
+       task_lock(current);
+       nr = current_cred()->group_info->ngroups;
+       task_unlock(current);
+       return nr;
+}
+
+void   cfs_curproc_groups_dump(gid_t *array, int size)
+{
+       task_lock(current);
+       size = min_t(int, size, current_cred()->group_info->ngroups);
+       memcpy(array, current_cred()->group_info->blocks[0], size * sizeof(__u32));
+       task_unlock(current);
+}
+
+
+int    current_is_in_group(gid_t gid)
+{
+       return in_group_p(gid);
+}
+
+/* Currently all the CFS_CAP_* defines match CAP_* ones. */
+#define cfs_cap_pack(cap) (cap)
+#define cfs_cap_unpack(cap) (cap)
+
+void cfs_cap_raise(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cap_raise(cred->cap_effective, cfs_cap_unpack(cap));
+               commit_creds(cred);
+       }
+}
+
+void cfs_cap_lower(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cap_lower(cred->cap_effective, cfs_cap_unpack(cap));
+               commit_creds(cred);
+       }
+}
+
+int cfs_cap_raised(cfs_cap_t cap)
+{
+       return cap_raised(current_cap(), cfs_cap_unpack(cap));
+}
+
+void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+       *cap = cfs_cap_pack(kcap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+       *cap = cfs_cap_pack(kcap[0]);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+       /* XXX lost high byte */
+       *cap = cfs_cap_pack(kcap.cap[0]);
+#else
+       #error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap)
+{
+#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330
+       *kcap = cfs_cap_unpack(cap);
+#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026
+       (*kcap)[0] = cfs_cap_unpack(cap);
+#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522
+       kcap->cap[0] = cfs_cap_unpack(cap);
+#else
+       #error "need correct _KERNEL_CAPABILITY_VERSION "
+#endif
+}
+
+cfs_cap_t cfs_curproc_cap_pack(void)
+{
+       cfs_cap_t cap;
+       cfs_kernel_cap_pack(current_cap(), &cap);
+       return cap;
+}
+
+void cfs_curproc_cap_unpack(cfs_cap_t cap)
+{
+       struct cred *cred;
+       if ((cred = prepare_creds())) {
+               cfs_kernel_cap_unpack(&cred->cap_effective, cap);
+               commit_creds(cred);
+       }
+}
+
+int cfs_capable(cfs_cap_t cap)
+{
+       return capable(cfs_cap_unpack(cap));
+}
+
+/* Check if task is running in 32-bit API mode, for the purpose of
+ * userspace binary interfaces.  On 32-bit Linux this is (unfortunately)
+ * always true, even if the application is using LARGEFILE64 and 64-bit
+ * APIs, because Linux provides no way for the filesystem to know if it
+ * is called via 32-bit or 64-bit APIs.  Other clients may vary.  On
+ * 64-bit systems, this will only be true if the binary is calling a
+ * 32-bit system call. */
+int current_is_32bit(void)
+{
+       return is_compat_task();
+}
+
+static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr,
+                                void *buf, int len, int write)
+{
+       /* Copied from the kernel, for kernels that don't export
+        * access_process_vm(). */
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       struct page *page;
+       void *old_buf = buf;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;
+
+       down_read(&mm->mmap_sem);
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, rc, offset;
+               void *maddr;
+
+               rc = get_user_pages(tsk, mm, addr, 1,
+                                    write, 1, &page, &vma);
+               if (rc <= 0)
+                       break;
+
+               bytes = len;
+               offset = addr & (PAGE_SIZE-1);
+               if (bytes > PAGE_SIZE-offset)
+                       bytes = PAGE_SIZE-offset;
+
+               maddr = kmap(page);
+               if (write) {
+                       copy_to_user_page(vma, page, addr,
+                                         maddr + offset, buf, bytes);
+                       set_page_dirty_lock(page);
+               } else {
+                       copy_from_user_page(vma, page, addr,
+                                           buf, maddr + offset, bytes);
+               }
+               kunmap(page);
+               page_cache_release(page);
+               len -= bytes;
+               buf += bytes;
+               addr += bytes;
+       }
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+
+       return buf - old_buf;
+}
+
+/* Read the environment variable of current process specified by @key. */
+int cfs_get_environ(const char *key, char *value, int *val_len)
+{
+       struct mm_struct *mm;
+       char *buffer;
+       int buf_len = PAGE_CACHE_SIZE;
+       int key_len = strlen(key);
+       unsigned long addr;
+       int rc;
+       ENTRY;
+
+       buffer = kmalloc(buf_len, GFP_USER);
+       if (!buffer)
+               RETURN(-ENOMEM);
+
+       mm = get_task_mm(current);
+       if (!mm) {
+               kfree(buffer);
+               RETURN(-EINVAL);
+       }
+
+       /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(),
+        * which is already holding mmap_sem for writes.  If some other
+        * thread gets the write lock in the meantime, this thread will
+        * block, but at least it won't deadlock on itself.  LU-1735 */
+       if (down_read_trylock(&mm->mmap_sem) == 0) {
+               kfree(buffer);
+               mmput(mm);
+               return -EDEADLK;
+       }
+       up_read(&mm->mmap_sem);
+
+       addr = mm->env_start;
+       while (addr < mm->env_end) {
+               int this_len, retval, scan_len;
+               char *env_start, *env_end;
+
+               memset(buffer, 0, buf_len);
+
+               this_len = min_t(int, mm->env_end - addr, buf_len);
+               retval = cfs_access_process_vm(current, addr, buffer,
+                                              this_len, 0);
+               if (retval != this_len)
+                       break;
+
+               addr += retval;
+
+               /* Parse the buffer to find out the specified key/value pair.
+                * The "key=value" entries are separated by '\0'. */
+               env_start = buffer;
+               scan_len = this_len;
+               while (scan_len) {
+                       char *entry;
+                       int entry_len;
+
+                       env_end = memscan(env_start, '\0', scan_len);
+                       LASSERT(env_end >= env_start &&
+                               env_end <= env_start + scan_len);
+
+                       /* The last entry in this buffer crosses the
+                        * buffer boundary; reread it in the next cycle. */
+                       if (unlikely(env_end - env_start == scan_len)) {
+                               /* This entry is too large to fit in buffer */
+                               if (unlikely(scan_len == this_len)) {
+                                       CERROR("Too long env variable.\n");
+                                       GOTO(out, rc = -EINVAL);
+                               }
+                               addr -= scan_len;
+                               break;
+                       }
+
+                       entry = env_start;
+                       entry_len = env_end - env_start;
+
+                       /* Key length + length of '=' */
+                       if (entry_len > key_len + 1 &&
+                           !memcmp(entry, key, key_len)) {
+                               entry += key_len + 1;
+                               entry_len -= key_len + 1;
+                               /* The 'value' buffer passed in is too small.*/
+                               if (entry_len >= *val_len)
+                                       GOTO(out, rc = -EOVERFLOW);
+
+                               memcpy(value, entry, entry_len);
+                               *val_len = entry_len;
+                               GOTO(out, rc = 0);
+                       }
+
+                       scan_len -= (env_end - env_start + 1);
+                       env_start = env_end + 1;
+               }
+       }
+       GOTO(out, rc = -ENOENT);
+
+out:
+       mmput(mm);
+       kfree(buffer);
+       return rc;
+}
+EXPORT_SYMBOL(cfs_get_environ);
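+
+/*
+ * A minimal usage sketch for cfs_get_environ() (kept under "#if 0" so it
+ * is never built).  The variable name and buffer size are illustrative
+ * assumptions; note that the returned value is not NUL-terminated.
+ */
+#if 0
+static void example_read_environ(void)
+{
+       char val[64];
+       int  val_len = sizeof(val);
+       int  rc;
+
+       rc = cfs_get_environ("LUSTRE_JOBID", val, &val_len);
+       if (rc == 0)
+               CDEBUG(D_INFO, "LUSTRE_JOBID=%.*s\n", val_len, val);
+       else if (rc == -EOVERFLOW)
+               CDEBUG(D_INFO, "value does not fit in %d bytes\n", val_len);
+}
+#endif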
+
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(current_is_in_group);
+EXPORT_SYMBOL(cfs_cap_raise);
+EXPORT_SYMBOL(cfs_cap_lower);
+EXPORT_SYMBOL(cfs_cap_raised);
+EXPORT_SYMBOL(cfs_curproc_cap_pack);
+EXPORT_SYMBOL(cfs_curproc_cap_unpack);
+EXPORT_SYMBOL(cfs_capable);
+EXPORT_SYMBOL(current_is_32bit);
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
new file mode 100644 (file)
index 0000000..e2c195b
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-debug.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include "tracefile.h"
+
+#include <linux/kallsyms.h>
+
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
+char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall";
+
+/**
+ * Upcall function once a Lustre log has been dumped.
+ *
+ * \param file  path of the dumped log
+ */
+void libcfs_run_debug_log_upcall(char *file)
+{
+       char *argv[3];
+       int   rc;
+       char *envp[] = {
+               "HOME=/",
+               "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+               NULL};
+       ENTRY;
+
+       argv[0] = lnet_debug_log_upcall;
+
+       LASSERTF(file != NULL, "called on a null filename\n");
+       argv[1] = file;  /* only the path of the dumped file is needed */
+
+       argv[2] = NULL;
+
+       rc = USERMODEHELPER(argv[0], argv, envp);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("Error %d invoking LNET debug log upcall %s %s; "
+                      "check /proc/sys/lnet/debug_log_upcall\n",
+                      rc, argv[0], argv[1]);
+       } else {
+               CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n",
+                      argv[0], argv[1]);
+       }
+
+       EXIT;
+}
+
+void libcfs_run_upcall(char **argv)
+{
+       int   rc;
+       int   argc;
+       char *envp[] = {
+               "HOME=/",
+               "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+               NULL};
+       ENTRY;
+
+       argv[0] = lnet_upcall;
+       argc = 1;
+       while (argv[argc] != NULL)
+               argc++;
+
+       LASSERT(argc >= 2);
+
+       rc = USERMODEHELPER(argv[0], argv, envp);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; "
+                      "check /proc/sys/lnet/upcall\n",
+                      rc, argv[0], argv[1],
+                      argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+                      argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+                      argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+                      argc < 6 ? "" : ",...");
+       } else {
+               CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
+                      argv[0], argv[1],
+                      argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+                      argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+                      argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+                      argc < 6 ? "" : ",...");
+       }
+}
+
+void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata)
+{
+       char *argv[6];
+       char buf[32];
+
+       ENTRY;
+       snprintf (buf, sizeof buf, "%d", msgdata->msg_line);
+
+       argv[1] = "LBUG";
+       argv[2] = (char *)msgdata->msg_file;
+       argv[3] = (char *)msgdata->msg_fn;
+       argv[4] = buf;
+       argv[5] = NULL;
+
+       libcfs_run_upcall (argv);
+}
+
+/* coverity[+kill] */
+void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
+{
+       libcfs_catastrophe = 1;
+       libcfs_debug_msg(msgdata, "LBUG\n");
+
+       if (in_interrupt()) {
+               panic("LBUG in interrupt.\n");
+               /* not reached */
+       }
+
+       libcfs_debug_dumpstack(NULL);
+       if (!libcfs_panic_on_lbug)
+               libcfs_debug_dumplog();
+       libcfs_run_lbug_upcall(msgdata);
+       if (libcfs_panic_on_lbug)
+               panic("LBUG");
+       set_task_state(current, TASK_UNINTERRUPTIBLE);
+       while (1)
+               schedule();
+}
+
+
+#include <linux/nmi.h>
+#include <asm/stacktrace.h>
+
+
+static int print_trace_stack(void *data, char *name)
+{
+       printk(" <%s> ", name);
+       return 0;
+}
+
+# define RELIABLE reliable
+# define DUMP_TRACE_CONST const
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+       char fmt[32];
+       touch_nmi_watchdog();
+       sprintf(fmt, " [<%016lx>] %s%%s\n", addr, RELIABLE ? "": "? ");
+       __print_symbol(fmt, addr);
+}
+
+static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = {
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+       .walk_stack = print_context_stack,
+};
+
+void libcfs_debug_dumpstack(struct task_struct *tsk)
+{
+       /* dump_stack() */
+       /* show_trace() */
+       if (tsk == NULL)
+               tsk = current;
+       printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm);
+       /* show_trace_log_lvl() */
+       printk("\nCall Trace:\n");
+       dump_trace(tsk, NULL, NULL,
+                  0,
+                  &print_trace_ops, NULL);
+       printk("\n");
+}
+
+task_t *libcfs_current(void)
+{
+       CWARN("current task struct is %p\n", current);
+       return current;
+}
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+                        void *unused2)
+{
+       if (libcfs_panic_in_progress)
+               return 0;
+
+       libcfs_panic_in_progress = 1;
+       mb();
+
+       return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+       .notifier_call  = panic_notifier,
+       .next           = NULL,
+       .priority       = 10000,
+};
+
+void libcfs_register_panic_notifier(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+void libcfs_unregister_panic_notifier(void)
+{
+       atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumpstack);
+EXPORT_SYMBOL(libcfs_current);
+
+
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c
new file mode 100644 (file)
index 0000000..2c7d4a3
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define LNET_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+       struct libcfs_ioctl_hdr   *hdr;
+       struct libcfs_ioctl_data  *data;
+       ENTRY;
+
+       hdr = (struct libcfs_ioctl_hdr *)buf;
+       data = (struct libcfs_ioctl_data *)buf;
+
+       if (copy_from_user(buf, (void *)arg, sizeof(*hdr)))
+               RETURN(-EFAULT);
+
+       if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+               CERROR("PORTALS: version mismatch kernel vs application\n");
+               RETURN(-EINVAL);
+       }
+
+       if (hdr->ioc_len + buf >= end) {
+               CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+               RETURN(-EINVAL);
+       }
+
+
+       if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+               CERROR("PORTALS: user buffer too small for ioctl\n");
+               RETURN(-EINVAL);
+       }
+
+       if (copy_from_user(buf, (void *)arg, hdr->ioc_len))
+               RETURN(-EFAULT);
+
+       if (libcfs_ioctl_is_invalid(data)) {
+               CERROR("PORTALS: ioctl not correctly formatted\n");
+               RETURN(-EINVAL);
+       }
+
+       if (data->ioc_inllen1)
+               data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+       if (data->ioc_inllen2)
+               data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                       cfs_size_round(data->ioc_inllen1);
+
+       RETURN(0);
+}
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+       if (copy_to_user((char *)arg, data, size))
+               return -EFAULT;
+       return 0;
+}
+
+extern struct cfs_psdev_ops      libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(struct inode * inode, struct file * file)
+{
+       struct libcfs_device_userstate **pdu = NULL;
+       int    rc = 0;
+
+       if (!inode)
+               return (-EINVAL);
+       pdu = (struct libcfs_device_userstate **)&file->private_data;
+       if (libcfs_psdev_ops.p_open != NULL)
+               rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+       else
+               return (-EPERM);
+       return rc;
+}
+
+/* called when closing /dev/device */
+static int
+libcfs_psdev_release(struct inode * inode, struct file * file)
+{
+       struct libcfs_device_userstate *pdu;
+       int    rc = 0;
+
+       if (!inode)
+               return (-EINVAL);
+       pdu = file->private_data;
+       if (libcfs_psdev_ops.p_close != NULL)
+               rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+       else
+               rc = -EPERM;
+       return rc;
+}
+
+static long libcfs_ioctl(struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+       struct cfs_psdev_file    pfile;
+       int    rc = 0;
+
+       if (current_fsuid() != 0)
+               return -EACCES;
+
+       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {
+               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+               return (-EINVAL);
+       }
+
+       /* Handle platform-dependent IOC requests */
+       switch (cmd) {
+       case IOC_LIBCFS_PANIC:
+               if (!cfs_capable(CFS_CAP_SYS_BOOT))
+                       return (-EPERM);
+               panic("debugctl-invoked panic");
+               return (0);
+       case IOC_LIBCFS_MEMHOG:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       return -EPERM;
+               /* fall through */
+       }
+
+       pfile.off = 0;
+       pfile.private_data = file->private_data;
+       if (libcfs_psdev_ops.p_ioctl != NULL)
+               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+       else
+               rc = -EPERM;
+       return (rc);
+}
+
+static struct file_operations libcfs_fops = {
+       .unlocked_ioctl = libcfs_ioctl,
+       .open           = libcfs_psdev_open,
+       .release        = libcfs_psdev_release,
+};
+
+psdev_t libcfs_dev = {
+       LNET_MINOR,
+       "lnet",
+       &libcfs_fops
+};
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c
new file mode 100644 (file)
index 0000000..b652a79
--- /dev/null
@@ -0,0 +1,259 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs_struct.h>
+#include <linux/sched.h>
+
+#include <linux/libcfs/libcfs.h>
+
+#if defined(CONFIG_KGDB)
+#include <asm/kgdb.h>
+#endif
+
+#define LINUX_WAITQ(w) ((wait_queue_t *) w)
+#define LINUX_WAITQ_HEAD(w) ((wait_queue_head_t *) w)
+
+void
+init_waitqueue_entry_current(wait_queue_t *link)
+{
+       init_waitqueue_entry(LINUX_WAITQ(link), current);
+}
+EXPORT_SYMBOL(init_waitqueue_entry_current);
+
+/**
+ * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * waiting threads, which is not always desirable: all threads are woken
+ * up again and again even when only a few of them need to be active most
+ * of the time, which hurts performance by polluting the CPU cache with
+ * many different threads.
+ *
+ * A LIFO list resolves this problem because, by default, we always wake
+ * up the most recently active thread.
+ *
+ * NB: please don't mix non-exclusive and exclusive waits on the same
+ * waitq if add_wait_queue_exclusive_head is used.
+ */
+void
+add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&LINUX_WAITQ_HEAD(waitq)->lock, flags);
+       __add_wait_queue_exclusive(LINUX_WAITQ_HEAD(waitq), LINUX_WAITQ(link));
+       spin_unlock_irqrestore(&LINUX_WAITQ_HEAD(waitq)->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive_head);
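+
+/*
+ * A minimal usage sketch for the LIFO exclusive wait above (kept under
+ * "#if 0" so it is never built).  The @has_work condition is an
+ * illustrative assumption standing in for real per-caller state.
+ */
+#if 0
+static void example_worker_wait(wait_queue_head_t *waitq, int *has_work)
+{
+       wait_queue_t link;
+
+       init_waitqueue_entry_current(&link);
+       add_wait_queue_exclusive_head(waitq, &link);
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (!*has_work)
+               waitq_wait(&link, TASK_INTERRUPTIBLE);
+       __set_current_state(TASK_RUNNING);
+       remove_wait_queue(waitq, &link);
+}
+#endif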
+
+void
+waitq_wait(wait_queue_t *link, cfs_task_state_t state)
+{
+       schedule();
+}
+EXPORT_SYMBOL(waitq_wait);
+
+int64_t
+waitq_timedwait(wait_queue_t *link, cfs_task_state_t state,
+                   int64_t timeout)
+{
+       return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(waitq_timedwait);
+
+void
+schedule_timeout_and_set_state(cfs_task_state_t state, int64_t timeout)
+{
+       set_current_state(state);
+       schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_and_set_state);
+
+/* deschedule for a bit... */
+void
+cfs_pause(cfs_duration_t ticks)
+{
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule_timeout(ticks);
+}
+EXPORT_SYMBOL(cfs_pause);
+
+void cfs_init_timer(timer_list_t *t)
+{
+       init_timer(t);
+}
+EXPORT_SYMBOL(cfs_init_timer);
+
+void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg)
+{
+       init_timer(t);
+       t->function = func;
+       t->data = (unsigned long)arg;
+}
+EXPORT_SYMBOL(cfs_timer_init);
+
+void cfs_timer_done(timer_list_t *t)
+{
+       return;
+}
+EXPORT_SYMBOL(cfs_timer_done);
+
+void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline)
+{
+       mod_timer(t, deadline);
+}
+EXPORT_SYMBOL(cfs_timer_arm);
+
+void cfs_timer_disarm(timer_list_t *t)
+{
+       del_timer(t);
+}
+EXPORT_SYMBOL(cfs_timer_disarm);
+
+int  cfs_timer_is_armed(timer_list_t *t)
+{
+       return timer_pending(t);
+}
+EXPORT_SYMBOL(cfs_timer_is_armed);
+
+cfs_time_t cfs_timer_deadline(timer_list_t *t)
+{
+       return t->expires;
+}
+EXPORT_SYMBOL(cfs_timer_deadline);
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+//     BREAKPOINT();
+#else
+       /* nothing */
+#endif
+}
+
+
+sigset_t
+cfs_block_allsigs(void)
+{
+       unsigned long     flags;
+       sigset_t        old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigfillset(&current->blocked);
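+/* Zero iff the adler32 shash was registered and must be unregistered. */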
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+
+       return old;
+}
+
+sigset_t cfs_block_sigs(unsigned long sigs)
+{
+       unsigned long  flags;
+       sigset_t        old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigaddsetmask(&current->blocked, sigs);
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+       return old;
+}
+
+/* Block all signals except for the @sigs */
+sigset_t cfs_block_sigsinv(unsigned long sigs)
+{
+       unsigned long flags;
+       sigset_t old;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       old = current->blocked;
+       sigaddsetmask(&current->blocked, ~sigs);
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+
+       return old;
+}
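+
+/*
+ * A usage sketch (illustrative): leave only fatal signals deliverable
+ * around a blocking section, then restore the original mask:
+ *
+ *     sigset_t old = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+ *     ...blocking section, interruptible only by KILL/TERM...
+ *     cfs_restore_sigs(old);
+ */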
+
+void
+cfs_restore_sigs (sigset_t old)
+{
+       unsigned long  flags;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       current->blocked = old;
+       recalc_sigpending();
+       SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+cfs_signal_pending(void)
+{
+       return signal_pending(current);
+}
+
+void
+cfs_clear_sigpending(void)
+{
+       unsigned long flags;
+
+       SIGNAL_MASK_LOCK(current, flags);
+       clear_tsk_thread_flag(current, TIF_SIGPENDING);
+       SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+libcfs_arch_init(void)
+{
+       return 0;
+}
+
+void
+libcfs_arch_cleanup(void)
+{
+       return;
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_enter_debugger);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_block_sigsinv);
+EXPORT_SYMBOL(cfs_restore_sigs);
+EXPORT_SYMBOL(cfs_signal_pending);
+EXPORT_SYMBOL(cfs_clear_sigpending);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c
new file mode 100644 (file)
index 0000000..522b28e
--- /dev/null
@@ -0,0 +1,580 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/linux/linux-proc.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <asm/div64.h>
+#include "tracefile.h"
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_header_t *lnet_table_header = NULL;
+#endif
+extern char lnet_upcall[1024];
+/**
+ * The path of debug log dump upcall script.
+ */
+extern char lnet_debug_log_upcall[1024];
+
+#define CTL_LNET       (0x100)
+enum {
+       PSDEV_DEBUG = 1,          /* control debugging */
+       PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
+       PSDEV_PRINTK,        /* force all messages to console */
+       PSDEV_CONSOLE_RATELIMIT,  /* ratelimit console messages */
+       PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */
+       PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */
+       PSDEV_CONSOLE_BACKOFF,    /* delay increase factor */
+       PSDEV_DEBUG_PATH,        /* crashdump log location */
+       PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
+       PSDEV_CPT_TABLE,          /* information about cpu partitions */
+       PSDEV_LNET_UPCALL,      /* User mode upcall script  */
+       PSDEV_LNET_MEMUSED,       /* bytes currently PORTAL_ALLOCated */
+       PSDEV_LNET_CATASTROPHE,   /* if we have LBUGged or panic'd */
+       PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */
+       PSDEV_LNET_DUMP_KERNEL,   /* snapshot kernel debug buffer to file */
+       PSDEV_LNET_DAEMON_FILE,   /* spool kernel debug buffer to file */
+       PSDEV_LNET_DEBUG_MB,      /* size of debug buffer */
+       PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+       PSDEV_LNET_WATCHDOG_RATELIMIT,  /* ratelimit watchdog messages  */
+       PSDEV_LNET_FORCE_LBUG,    /* hook to force an LBUG */
+       PSDEV_LNET_FAIL_LOC,      /* control test failures instrumentation */
+       PSDEV_LNET_FAIL_VAL,      /* userdata for fail loc */
+};
+
+int
+proc_call_handler(void *data, int write,
+                 loff_t *ppos, void *buffer, size_t *lenp,
+                 int (*handler)(void *data, int write,
+                                loff_t pos, void *buffer, int len))
+{
+       int rc = handler(data, write, *ppos, buffer, *lenp);
+
+       if (rc < 0)
+               return rc;
+
+       if (write) {
+               *ppos += *lenp;
+       } else {
+               *lenp = rc;
+               *ppos += rc;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(proc_call_handler);
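+
+/*
+ * A minimal sketch of the handler shape proc_call_handler() expects
+ * (kept under "#if 0" so it is never built; "example_value" is an
+ * illustrative assumption).  Reads return the number of bytes produced;
+ * writes return 0 on success or a negative errno.
+ */
+#if 0
+static int example_value;
+
+static int __proc_example(void *data, int write,
+                         loff_t pos, void *buffer, int nob)
+{
+       char tmp[16];
+       int  len;
+
+       if (write)
+               return -EPERM;
+
+       len = snprintf(tmp, sizeof(tmp), "%d", example_value);
+       if (pos >= len)
+               return 0;
+       return cfs_trace_copyout_string(buffer, nob, tmp + pos, "\n");
+}
+DECLARE_PROC_HANDLER(proc_example)
+#endif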
+
+static int __proc_dobitmasks(void *data, int write,
+                            loff_t pos, void *buffer, int nob)
+{
+       const int     tmpstrlen = 512;
+       char     *tmpstr;
+       int        rc;
+       unsigned int *mask = data;
+       int        is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+       int        is_printk = (mask == &libcfs_printk) ? 1 : 0;
+
+       rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen);
+       if (rc < 0)
+               return rc;
+
+       if (!write) {
+               libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys);
+               rc = strlen(tmpstr);
+
+               if (pos >= rc) {
+                       rc = 0;
+               } else {
+                       rc = cfs_trace_copyout_string(buffer, nob,
+                                                     tmpstr + pos, "\n");
+               }
+       } else {
+               rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob);
+               if (rc < 0) {
+                       cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+                       return rc;
+               }
+
+               rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys);
+               /* Always print LBUG/LASSERT to console, so keep this mask */
+               if (is_printk)
+                       *mask |= D_EMERG;
+       }
+
+       cfs_trace_free_string_buffer(tmpstr, tmpstrlen);
+       return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_dobitmasks)
+
+static int min_watchdog_ratelimit = 0;   /* disable ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */
+
+static int __proc_dump_kernel(void *data, int write,
+                             loff_t pos, void *buffer, int nob)
+{
+       if (!write)
+               return 0;
+
+       return cfs_trace_dump_debug_buffer_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_dump_kernel)
+
+static int __proc_daemon_file(void *data, int write,
+                             loff_t pos, void *buffer, int nob)
+{
+       if (!write) {
+               int len = strlen(cfs_tracefile);
+
+               if (pos >= len)
+                       return 0;
+
+               return cfs_trace_copyout_string(buffer, nob,
+                                               cfs_tracefile + pos, "\n");
+       }
+
+       return cfs_trace_daemon_command_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_daemon_file)
+
+static int __proc_debug_mb(void *data, int write,
+                          loff_t pos, void *buffer, int nob)
+{
+       if (!write) {
+               char tmpstr[32];
+               int  len = snprintf(tmpstr, sizeof(tmpstr), "%d",
+                                   cfs_trace_get_debug_mb());
+
+               if (pos >= len)
+                       return 0;
+
+               return cfs_trace_copyout_string(buffer, nob, tmpstr + pos,
+                      "\n");
+       }
+
+       return cfs_trace_set_debug_mb_usrstr(buffer, nob);
+}
+
+DECLARE_PROC_HANDLER(proc_debug_mb)
+
+int LL_PROC_PROTO(proc_console_max_delay_cs)
+{
+       int rc, max_delay_cs;
+       ctl_table_t dummy = *table;
+       cfs_duration_t d;
+
+       dummy.data = &max_delay_cs;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100);
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       max_delay_cs = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (max_delay_cs <= 0)
+               return -EINVAL;
+
+       d = cfs_time_seconds(max_delay_cs) / 100;
+       if (d == 0 || d < libcfs_console_min_delay)
+               return -EINVAL;
+       libcfs_console_max_delay = d;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_console_min_delay_cs)
+{
+       int rc, min_delay_cs;
+       ctl_table_t dummy = *table;
+       cfs_duration_t d;
+
+       dummy.data = &min_delay_cs;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100);
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       min_delay_cs = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (min_delay_cs <= 0)
+               return -EINVAL;
+
+       d = cfs_time_seconds(min_delay_cs) / 100;
+       if (d == 0 || d > libcfs_console_max_delay)
+               return -EINVAL;
+       libcfs_console_min_delay = d;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_console_backoff)
+{
+       int rc, backoff;
+       ctl_table_t dummy = *table;
+
+       dummy.data = &backoff;
+       dummy.proc_handler = &proc_dointvec;
+
+       if (!write) { /* read */
+               backoff = libcfs_console_backoff;
+               rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+               return rc;
+       }
+
+       /* write */
+       backoff = 0;
+       rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
+       if (rc < 0)
+               return rc;
+       if (backoff <= 0)
+               return -EINVAL;
+
+       libcfs_console_backoff = backoff;
+
+       return rc;
+}
+
+int LL_PROC_PROTO(libcfs_force_lbug)
+{
+       if (write)
+               LBUG();
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_fail_loc)
+{
+       int rc;
+       long old_fail_loc = cfs_fail_loc;
+
+       rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos);
+       if (old_fail_loc != cfs_fail_loc)
+               wake_up(&cfs_race_waitq);
+       return rc;
+}
+
+static int __proc_cpt_table(void *data, int write,
+                           loff_t pos, void *buffer, int nob)
+{
+       char *buf = NULL;
+       int   len = 4096;
+       int   rc  = 0;
+
+       if (write)
+               return -EPERM;
+
+       LASSERT(cfs_cpt_table != NULL);
+
+       while (1) {
+               LIBCFS_ALLOC(buf, len);
+               if (buf == NULL)
+                       return -ENOMEM;
+
+               rc = cfs_cpt_table_print(cfs_cpt_table, buf, len);
+               if (rc >= 0)
+                       break;
+
+               LIBCFS_FREE(buf, len);
+               if (rc == -EFBIG) {
+                       len <<= 1;
+                       continue;
+               }
+               goto out;
+       }
+
+       if (pos >= rc) {
+               rc = 0;
+               goto out;
+       }
+
+       rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL);
+ out:
+       if (buf != NULL)
+               LIBCFS_FREE(buf, len);
+       return rc;
+}
+DECLARE_PROC_HANDLER(proc_cpt_table)
+
+static ctl_table_t lnet_table[] = {
+       /*
+        * NB No .strategy entries have been provided since sysctl(8) prefers
+        * to go via /proc for portability.
+        */
+       {
+               INIT_CTL_NAME(PSDEV_DEBUG)
+               .procname = "debug",
+               .data     = &libcfs_debug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_SUBSYSTEM_DEBUG)
+               .procname = "subsystem_debug",
+               .data     = &libcfs_subsystem_debug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_PRINTK)
+               .procname = "printk",
+               .data     = &libcfs_printk,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dobitmasks,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_RATELIMIT)
+               .procname = "console_ratelimit",
+               .data     = &libcfs_console_ratelimit,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_MAX_DELAY_CS)
+               .procname = "console_max_delay_centisecs",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_max_delay_cs
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_MIN_DELAY_CS)
+               .procname = "console_min_delay_centisecs",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_min_delay_cs
+       },
+       {
+               INIT_CTL_NAME(PSDEV_CONSOLE_BACKOFF)
+               .procname = "console_backoff",
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_console_backoff
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_DEBUG_PATH)
+               .procname = "debug_path",
+               .data     = libcfs_debug_file_path_arr,
+               .maxlen   = sizeof(libcfs_debug_file_path_arr),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_CPT_TABLE)
+               .procname = "cpu_partition_table",
+               .maxlen   = 128,
+               .mode     = 0444,
+               .proc_handler = &proc_cpt_table,
+       },
+
+       {
+               INIT_CTL_NAME(PSDEV_LNET_UPCALL)
+               .procname = "upcall",
+               .data     = lnet_upcall,
+               .maxlen   = sizeof(lnet_upcall),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DEBUG_LOG_UPCALL)
+               .procname = "debug_log_upcall",
+               .data     = lnet_debug_log_upcall,
+               .maxlen   = sizeof(lnet_debug_log_upcall),
+               .mode     = 0644,
+               .proc_handler = &proc_dostring,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_MEMUSED)
+               .procname = "lnet_memused",
+               .data     = (int *)&libcfs_kmemory.counter,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_CATASTROPHE)
+               .procname = "catastrophe",
+               .data     = &libcfs_catastrophe,
+               .maxlen   = sizeof(int),
+               .mode     = 0444,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_PANIC_ON_LBUG)
+               .procname = "panic_on_lbug",
+               .data     = &libcfs_panic_on_lbug,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec,
+               INIT_STRATEGY(&sysctl_intvec)
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DUMP_KERNEL)
+               .procname = "dump_kernel",
+               .maxlen   = 256,
+               .mode     = 0200,
+               .proc_handler = &proc_dump_kernel,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DAEMON_FILE)
+               .procname = "daemon_file",
+               .mode     = 0644,
+               .maxlen   = 256,
+               .proc_handler = &proc_daemon_file,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_DEBUG_MB)
+               .procname = "debug_mb",
+               .mode     = 0644,
+               .proc_handler = &proc_debug_mb,
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_WATCHDOG_RATELIMIT)
+               .procname = "watchdog_ratelimit",
+               .data     = &libcfs_watchdog_ratelimit,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec_minmax,
+               .extra1   = &min_watchdog_ratelimit,
+               .extra2   = &max_watchdog_ratelimit,
+       },
+       {       INIT_CTL_NAME(PSDEV_LNET_FORCE_LBUG)
+               .procname = "force_lbug",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0200,
+               .proc_handler = &libcfs_force_lbug
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_FAIL_LOC)
+               .procname = "fail_loc",
+               .data     = &cfs_fail_loc,
+               .maxlen   = sizeof(cfs_fail_loc),
+               .mode     = 0644,
+               .proc_handler = &proc_fail_loc
+       },
+       {
+               INIT_CTL_NAME(PSDEV_LNET_FAIL_VAL)
+               .procname = "fail_val",
+               .data     = &cfs_fail_val,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t top_table[] = {
+       {
+               INIT_CTL_NAME(CTL_LNET)
+               .procname = "lnet",
+               .mode     = 0555,
+               .data     = NULL,
+               .maxlen   = 0,
+               .child    = lnet_table,
+       },
+       {
+               INIT_CTL_NAME(0)
+       }
+};
+#endif
+
+int insert_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header == NULL)
+               lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+       return 0;
+}
+
+void remove_proc(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (lnet_table_header != NULL)
+               unregister_sysctl_table(lnet_table_header);
+
+       lnet_table_header = NULL;
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c
new file mode 100644 (file)
index 0000000..855c7e8
--- /dev/null
@@ -0,0 +1,659 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#include <linux/syscalls.h>
+
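+/* Issue a socket ioctl by creating a throwaway TCP socket, wrapping it
+ * in a struct file and invoking its unlocked_ioctl under KERNEL_DS. */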
+int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+       mm_segment_t    oldmm = get_fs();
+       struct socket  *sock;
+       int          rc;
+       struct file    *sock_filp;
+
+       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return rc;
+       }
+
+       sock_filp = sock_alloc_file(sock, 0, NULL);
+       if (IS_ERR(sock_filp)) {
+               sock_release(sock);
+               rc = PTR_ERR(sock_filp);
+               goto out;
+       }
+
+       set_fs(KERNEL_DS);
+       if (sock_filp->f_op->unlocked_ioctl)
+               rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+       set_fs(oldmm);
+
+       fput(sock_filp);
+out:
+       return rc;
+}
+
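+/* Query whether an interface is up and, if so, its IPv4 address and
+ * netmask; both are returned in host byte order. */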
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+       struct ifreq   ifr;
+       int         nob;
+       int         rc;
+       __u32     val;
+
+       nob = strnlen(name, IFNAMSIZ);
+       if (nob == IFNAMSIZ) {
+               CERROR("Interface name %s too long\n", name);
+               return -EINVAL;
+       }
+
+       CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+       strcpy(ifr.ifr_name, name);
+       rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get flags for interface %s\n", name);
+               return rc;
+       }
+
+       if ((ifr.ifr_flags & IFF_UP) == 0) {
+               CDEBUG(D_NET, "Interface %s down\n", name);
+               *up = 0;
+               *ip = *mask = 0;
+               return 0;
+       }
+
+       *up = 1;
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get IP address for interface %s\n", name);
+               return rc;
+       }
+
+       val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+       *ip = ntohl(val);
+
+       strcpy(ifr.ifr_name, name);
+       ifr.ifr_addr.sa_family = AF_INET;
+       rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+       if (rc != 0) {
+               CERROR("Can't get netmask for interface %s\n", name);
+               return rc;
+       }
+
+       val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+       *mask = ntohl(val);
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
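+/* Enumerate IPv4 interface names via SIOCGIFCONF, doubling the ifreq
+ * buffer until everything fits (capped at one page's worth). */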
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+       /* Allocate and fill in 'names', returning # interfaces/error */
+       char       **names;
+       int          toobig;
+       int          nalloc;
+       int          nfound;
+       struct ifreq   *ifr;
+       struct ifconf   ifc;
+       int          rc;
+       int          nob;
+       int          i;
+
+
+       nalloc = 16;    /* first guess at max interfaces */
+       toobig = 0;
+       for (;;) {
+               if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+                       toobig = 1;
+                       nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
+                       CWARN("Too many interfaces: only enumerating first %d\n",
+                             nalloc);
+               }
+
+               LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+               if (ifr == NULL) {
+                       CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                       rc = -ENOMEM;
+                       goto out0;
+               }
+
+               ifc.ifc_buf = (char *)ifr;
+               ifc.ifc_len = nalloc * sizeof(*ifr);
+
+               rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+
+               if (rc < 0) {
+                       CERROR ("Error %d enumerating interfaces\n", rc);
+                       goto out1;
+               }
+
+               LASSERT (rc == 0);
+
+               nfound = ifc.ifc_len/sizeof(*ifr);
+               LASSERT (nfound <= nalloc);
+
+               if (nfound < nalloc || toobig)
+                       break;
+
+               LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+               nalloc *= 2;
+       }
+
+       if (nfound == 0)
+               goto out1;
+
+       LIBCFS_ALLOC(names, nfound * sizeof(*names));
+       if (names == NULL) {
+               rc = -ENOMEM;
+               goto out1;
+       }
+       /* NULL out all names[i] */
+       memset (names, 0, nfound * sizeof(*names));
+
+       for (i = 0; i < nfound; i++) {
+
+               nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+               if (nob == IFNAMSIZ) {
+                       /* no space for the terminating NUL */
+                       CERROR("interface name %.*s too long (%d max)\n",
+                              nob, ifr[i].ifr_name, IFNAMSIZ);
+                       rc = -ENAMETOOLONG;
+                       goto out2;
+               }
+
+               LIBCFS_ALLOC(names[i], IFNAMSIZ);
+               if (names[i] == NULL) {
+                       rc = -ENOMEM;
+                       goto out2;
+               }
+
+               memcpy(names[i], ifr[i].ifr_name, nob);
+               names[i][nob] = 0;
+       }
+
+       *namesp = names;
+       rc = nfound;
+
+ out2:
+       if (rc < 0)
+               libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+       LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+       int      i;
+
+       LASSERT (n > 0);
+
+       for (i = 0; i < n && names[i] != NULL; i++)
+               LIBCFS_FREE(names[i], IFNAMSIZ);
+
+       LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
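+/* Send 'nob' bytes, looping until the whole buffer has gone out or the
+ * timeout (in seconds) expires; a zero timeout makes the send
+ * non-blocking. */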
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+       int         rc;
+       mm_segment_t   oldmm = get_fs();
+       long       ticks = timeout * HZ;
+       unsigned long  then;
+       struct timeval tv;
+
+       LASSERT (nob > 0);
+       /* Caller may pass a zero timeout if she thinks the socket buffer is
+        * empty enough to take the whole message immediately */
+
+       for (;;) {
+               struct iovec  iov = {
+                       .iov_base = buffer,
+                       .iov_len  = nob
+               };
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = &iov,
+                       .msg_iovlen     = 1,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+               };
+
+               if (timeout != 0) {
+                       /* Set send timeout to remaining time */
+                       tv = (struct timeval) {
+                               .tv_sec = ticks / HZ,
+                               .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                       };
+                       set_fs(KERNEL_DS);
+                       rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                                            (char *)&tv, sizeof(tv));
+                       set_fs(oldmm);
+                       if (rc != 0) {
+                               CERROR("Can't set socket send timeout "
+                                      "%ld.%06d: %d\n",
+                                      (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                               return rc;
+                       }
+               }
+
+               set_fs (KERNEL_DS);
+               then = jiffies;
+               rc = sock_sendmsg (sock, &msg, iov.iov_len);
+               ticks -= jiffies - then;
+               set_fs (oldmm);
+
+               if (rc == nob)
+                       return 0;
+
+               if (rc < 0)
+                       return rc;
+
+               if (rc == 0) {
+                       CERROR ("Unexpected zero rc\n");
+                       return (-ECONNABORTED);
+               }
+
+               if (ticks <= 0)
+                       return -EAGAIN;
+
+               buffer = ((char *)buffer) + rc;
+               nob -= rc;
+       }
+
+       return (0);
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
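+/* Receive exactly 'nob' bytes within 'timeout' seconds (which must be
+ * positive), trimming SO_RCVTIMEO to the time remaining on each pass. */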
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+       int         rc;
+       mm_segment_t   oldmm = get_fs();
+       long       ticks = timeout * HZ;
+       unsigned long  then;
+       struct timeval tv;
+
+       LASSERT (nob > 0);
+       LASSERT (ticks > 0);
+
+       for (;;) {
+               struct iovec  iov = {
+                       .iov_base = buffer,
+                       .iov_len  = nob
+               };
+               struct msghdr msg = {
+                       .msg_name       = NULL,
+                       .msg_namelen    = 0,
+                       .msg_iov        = &iov,
+                       .msg_iovlen     = 1,
+                       .msg_control    = NULL,
+                       .msg_controllen = 0,
+                       .msg_flags      = 0
+               };
+
+               /* Set receive timeout to remaining time */
+               tv = (struct timeval) {
+                       .tv_sec = ticks / HZ,
+                       .tv_usec = ((ticks % HZ) * 1000000) / HZ
+               };
+               set_fs(KERNEL_DS);
+               rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                                    (char *)&tv, sizeof(tv));
+               set_fs(oldmm);
+               if (rc != 0) {
+                       CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+                              (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                       return rc;
+               }
+
+               set_fs(KERNEL_DS);
+               then = jiffies;
+               rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+               ticks -= jiffies - then;
+               set_fs(oldmm);
+
+               if (rc < 0)
+                       return rc;
+
+               if (rc == 0)
+                       return -ECONNRESET;
+
+               buffer = ((char *)buffer) + rc;
+               nob -= rc;
+
+               if (nob == 0)
+                       return 0;
+
+               if (ticks <= 0)
+                       return -ETIMEDOUT;
+       }
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
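+/* Create a TCP socket with SO_REUSEADDR set and, if requested, bind it
+ * to the given local IP/port. */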
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+                   __u32 local_ip, int local_port)
+{
+       struct sockaddr_in  locaddr;
+       struct socket      *sock;
+       int              rc;
+       int              option;
+       mm_segment_t    oldmm = get_fs();
+
+       /* All errors are fatal except bind failure if the port is in use */
+       *fatal = 1;
+
+       rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+       *sockp = sock;
+       if (rc != 0) {
+               CERROR ("Can't create socket: %d\n", rc);
+               return (rc);
+       }
+
+       set_fs (KERNEL_DS);
+       option = 1;
+       rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                            (char *)&option, sizeof (option));
+       set_fs (oldmm);
+       if (rc != 0) {
+               CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+               goto failed;
+       }
+
+       if (local_ip != 0 || local_port != 0) {
+               memset(&locaddr, 0, sizeof(locaddr));
+               locaddr.sin_family = AF_INET;
+               locaddr.sin_port = htons(local_port);
+               locaddr.sin_addr.s_addr = (local_ip == 0) ?
+                                         INADDR_ANY : htonl(local_ip);
+
+               rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+                                    sizeof(locaddr));
+               if (rc == -EADDRINUSE) {
+                       CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                       *fatal = 0;
+                       goto failed;
+               }
+               if (rc != 0) {
+                       CERROR("Error trying to bind to port %d: %d\n",
+                              local_port, rc);
+                       goto failed;
+               }
+       }
+
+       return 0;
+
+ failed:
+       sock_release(sock);
+       return rc;
+}
+
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+       mm_segment_t    oldmm = get_fs();
+       int              option;
+       int              rc;
+
+       if (txbufsize != 0) {
+               option = txbufsize;
+               set_fs (KERNEL_DS);
+               rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                                    (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't set send buffer %d: %d\n",
+                               option, rc);
+                       return (rc);
+               }
+       }
+
+       if (rxbufsize != 0) {
+               option = rxbufsize;
+               set_fs (KERNEL_DS);
+               rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                     (char *)&option, sizeof (option));
+               set_fs (oldmm);
+               if (rc != 0) {
+                       CERROR ("Can't set receive buffer %d: %d\n",
+                               option, rc);
+                       return (rc);
+               }
+       }
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+       struct sockaddr_in sin;
+       int             len = sizeof (sin);
+       int             rc;
+
+       rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len,
+                                remote ? 2 : 0);
+       if (rc != 0) {
+               CERROR ("Error %d getting sock %s IP/port\n",
+                       rc, remote ? "peer" : "local");
+               return rc;
+       }
+
+       if (ip != NULL)
+               *ip = ntohl (sin.sin_addr.s_addr);
+
+       if (port != NULL)
+               *port = ntohs (sin.sin_port);
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+       if (txbufsize != NULL)
+               *txbufsize = sock->sk->sk_sndbuf;
+
+       if (rxbufsize != NULL)
+               *rxbufsize = sock->sk->sk_rcvbuf;
+
+       return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+int
+libcfs_sock_listen (struct socket **sockp,
+                   __u32 local_ip, int local_port, int backlog)
+{
+       int      fatal;
+       int      rc;
+
+       rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+       if (rc != 0) {
+               if (!fatal)
+                       CERROR("Can't create socket: port %d already in use\n",
+                              local_port);
+               return rc;
+       }
+
+       rc = (*sockp)->ops->listen(*sockp, backlog);
+       if (rc == 0)
+               return 0;
+
+       CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+       sock_release(*sockp);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
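+/* Accept one connection, sleeping interruptibly on the listening
+ * socket's wait queue until a peer arrives; the accept call itself is
+ * always non-blocking. */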
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+       wait_queue_t   wait;
+       struct socket *newsock;
+       int         rc;
+
+       init_waitqueue_entry(&wait, current);
+
+       /* XXX this should add a ref to sock->ops->owner, if
+        * TCP could be a module */
+       rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+       if (rc) {
+               CERROR("Can't allocate socket\n");
+               return rc;
+       }
+
+       newsock->ops = sock->ops;
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       add_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+
+       rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       if (rc == -EAGAIN) {
+               /* Nothing ready, so wait for activity */
+               schedule();
+               rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+       }
+
+       remove_wait_queue(cfs_sk_sleep(sock->sk), &wait);
+       set_current_state(TASK_RUNNING);
+
+       if (rc != 0)
+               goto failed;
+
+       *newsockp = newsock;
+       return 0;
+
+ failed:
+       sock_release(newsock);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+       wake_up_all(cfs_sk_sleep(sock->sk));
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+                    __u32 local_ip, int local_port,
+                    __u32 peer_ip, int peer_port)
+{
+       struct sockaddr_in  srvaddr;
+       int              rc;
+
+       rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+       if (rc != 0)
+               return rc;
+
+       memset (&srvaddr, 0, sizeof (srvaddr));
+       srvaddr.sin_family = AF_INET;
+       srvaddr.sin_port = htons(peer_port);
+       srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+       rc = (*sockp)->ops->connect(*sockp,
+                                   (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                   0);
+       if (rc == 0)
+               return 0;
+
+       /* EADDRNOTAVAIL probably means we're already connected to the same
+        * peer/port on the same local port on a differently typed
+        * connection.  Let our caller retry with a different local
+        * port... */
+       *fatal = !(rc == -EADDRNOTAVAIL);
+
+       CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
+              "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+              HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+       sock_release(*sockp);
+       return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+void
+libcfs_sock_release (struct socket *sock)
+{
+       sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c
new file mode 100644 (file)
index 0000000..6f56343
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+/* percentage of the total debug memory allotted to each type */
+static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = {
+       80,  /* 80% pages for CFS_TCD_TYPE_PROC */
+       10,  /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */
+       10   /* 10% pages for CFS_TCD_TYPE_IRQ */
+};
+
+char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+
+struct rw_semaphore cfs_tracefile_sem;
+
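+/* Allocate per-CPU trace data for every context type and a console
+ * staging buffer for each CPU/type pair. */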
+int cfs_tracefile_init_arch(void)
+{
+       int    i;
+       int    j;
+       struct cfs_trace_cpu_data *tcd;
+
+       init_rwsem(&cfs_tracefile_sem);
+
+       /* initialize trace_data */
+       memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
+       for (i = 0; i < CFS_TCD_TYPE_MAX; i++) {
+               cfs_trace_data[i] =
+                       kmalloc(sizeof(union cfs_trace_data_union) *
+                               num_possible_cpus(), GFP_KERNEL);
+               if (cfs_trace_data[i] == NULL)
+                       goto out;
+
+       }
+
+       /* arch related info initialized */
+       cfs_tcd_for_each(tcd, i, j) {
+               spin_lock_init(&tcd->tcd_lock);
+               tcd->tcd_pages_factor = pages_factor[i];
+               tcd->tcd_type = i;
+               tcd->tcd_cpu = j;
+       }
+
+       for (i = 0; i < num_possible_cpus(); i++)
+               for (j = 0; j < CFS_TCD_TYPE_MAX; j++) {
+                       cfs_trace_console_buffers[i][j] =
+                               kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                       GFP_KERNEL);
+
+                       if (cfs_trace_console_buffers[i][j] == NULL)
+                               goto out;
+               }
+
+       return 0;
+
+out:
+       cfs_tracefile_fini_arch();
+       printk(KERN_ERR "lnet: Not enough memory\n");
+       return -ENOMEM;
+}
+
+void cfs_tracefile_fini_arch(void)
+{
+       int    i;
+       int    j;
+
+       for (i = 0; i < num_possible_cpus(); i++)
+               for (j = 0; j < CFS_TCD_TYPE_MAX; j++)
+                       if (cfs_trace_console_buffers[i][j] != NULL) {
+                               kfree(cfs_trace_console_buffers[i][j]);
+                               cfs_trace_console_buffers[i][j] = NULL;
+                       }
+
+       for (i = 0; cfs_trace_data[i] != NULL; i++) {
+               kfree(cfs_trace_data[i]);
+               cfs_trace_data[i] = NULL;
+       }
+
+       fini_rwsem(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_lock(void)
+{
+       down_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_read_unlock(void)
+{
+       up_read(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_lock(void)
+{
+       down_write(&cfs_tracefile_sem);
+}
+
+void cfs_tracefile_write_unlock(void)
+{
+       up_write(&cfs_tracefile_sem);
+}
+
+cfs_trace_buf_type_t cfs_trace_buf_idx_get(void)
+{
+       if (in_irq())
+               return CFS_TCD_TYPE_IRQ;
+       else if (in_softirq())
+               return CFS_TCD_TYPE_SOFTIRQ;
+       else
+               return CFS_TCD_TYPE_PROC;
+}
+
+/*
+ * The walking argument indicates that the caller is iterating over all
+ * tcd types, so we must take the lock and disable local irqs to avoid
+ * deadlocks with locks taken from interrupt context. See LU-1311 for
+ * details.
+ */
+int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+       __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+       if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+               spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
+       else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+               spin_lock_bh(&tcd->tcd_lock);
+       else if (unlikely(walking))
+               spin_lock_irq(&tcd->tcd_lock);
+       else
+               spin_lock(&tcd->tcd_lock);
+       return 1;
+}
+
+void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
+{
+       __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX);
+       if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
+               spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
+       else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
+               spin_unlock_bh(&tcd->tcd_lock);
+       else if (unlikely(walking))
+               spin_unlock_irq(&tcd->tcd_lock);
+       else
+               spin_unlock(&tcd->tcd_lock);
+}
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+                     struct cfs_trace_page *tage)
+{
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       return tcd->tcd_cpu == tage->cpu;
+}
+
+void
+cfs_set_ptldebug_header(struct ptldebug_header *header,
+                       struct libcfs_debug_msg_data *msgdata,
+                       unsigned long stack)
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+
+       header->ph_subsys = msgdata->msg_subsys;
+       header->ph_mask = msgdata->msg_mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_type = cfs_trace_buf_idx_get();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = current->pid;
+       header->ph_line_num = msgdata->msg_line;
+       header->ph_extern_pid = 0;
+}
+
+static char *
+dbghdr_to_err_string(struct ptldebug_header *hdr)
+{
+       switch (hdr->ph_subsys) {
+       case S_LND:
+       case S_LNET:
+               return "LNetError";
+       default:
+               return "LustreError";
+       }
+}
+
+static char *
+dbghdr_to_info_string(struct ptldebug_header *hdr)
+{
+       switch (hdr->ph_subsys) {
+       case S_LND:
+       case S_LNET:
+               return "LNet";
+       default:
+               return "Lustre";
+       }
+}
+
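+/* Emit a debug message on the console, picking the printk severity and
+ * the "LNet"/"Lustre" prefix from the message mask and subsystem. */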
+void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+                         const char *buf, int len, const char *file,
+                         const char *fn)
+{
+       char *prefix = "Lustre", *ptype = NULL;
+
+       if ((mask & D_EMERG) != 0) {
+               prefix = dbghdr_to_err_string(hdr);
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = dbghdr_to_err_string(hdr);
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = dbghdr_to_info_string(hdr);
+               ptype = KERN_WARNING;
+       } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) {
+               prefix = dbghdr_to_info_string(hdr);
+               ptype = KERN_INFO;
+       }
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %.*s", ptype, prefix, len, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix,
+                      hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num,
+                      fn, len, buf);
+       }
+}
+
+int cfs_trace_max_debug_mb(void)
+{
+       int  total_mb = (num_physpages >> (20 - PAGE_SHIFT));
+
+       return MAX(512, (total_mb * 80)/100);
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h
new file mode 100644 (file)
index 0000000..ba84e4f
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_LINUX_TRACEFILE_H__
+#define __LIBCFS_LINUX_TRACEFILE_H__
+
+/**
+ * The three types of trace data on Linux.
+ */
+typedef enum {
+       CFS_TCD_TYPE_PROC = 0,
+       CFS_TCD_TYPE_SOFTIRQ,
+       CFS_TCD_TYPE_IRQ,
+       CFS_TCD_TYPE_MAX
+} cfs_trace_buf_type_t;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/lwt.c b/drivers/staging/lustre/lustre/libcfs/lwt.c
new file mode 100644 (file)
index 0000000..b631f7d
--- /dev/null
@@ -0,0 +1,266 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/lwt.c
+ *
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#if LWT_SUPPORT
+
+#if !KLWT_SUPPORT
+int     lwt_enabled;
+lwt_cpu_t   lwt_cpus[NR_CPUS];
+#endif
+
+int     lwt_pages_per_cpu;
+
+/* NB only root is allowed to retrieve LWT info; it's an open door into the
+ * kernel... */
+
+int
+lwt_lookup_string (int *size, char *knl_ptr,
+                  char *user_ptr, int user_size)
+{
+       int   maxsize = 128;
+
+       /* knl_ptr was retrieved from an LWT snapshot and the caller wants to
+        * turn it into a string.  NB we can crash with an access violation
+        * trying to determine the string length, so we're trusting our
+        * caller... */
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       if (user_size > 0 &&
+           maxsize > user_size)
+               maxsize = user_size;
+
+       *size = strnlen (knl_ptr, maxsize - 1) + 1;
+
+       if (user_ptr != NULL) {
+               if (user_size < 4)
+                       return (-EINVAL);
+
+               if (copy_to_user (user_ptr, knl_ptr, *size))
+                       return (-EFAULT);
+
+               /* Did I truncate the string?  */
+               if (knl_ptr[*size - 1] != 0)
+                       copy_to_user (user_ptr + *size - 4, "...", 4);
+       }
+
+       return (0);
+}
+
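+/* Enable or disable lightweight tracing, optionally clearing every
+ * CPU's event pages; disabling yields briefly so in-flight tracers
+ * can finish. */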
+int
+lwt_control (int enable, int clear)
+{
+       lwt_page_t  *p;
+       int       i;
+       int       j;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       if (!enable) {
+               LWT_EVENT(0,0,0,0);
+               lwt_enabled = 0;
+               mb();
+               /* give people some time to stop adding traces */
+               schedule_timeout(10);
+       }
+
+       for (i = 0; i < num_online_cpus(); i++) {
+               p = lwt_cpus[i].lwtc_current_page;
+
+               if (p == NULL)
+                       return (-ENODATA);
+
+               if (!clear)
+                       continue;
+
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       memset (p->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+                       p = list_entry (p->lwtp_list.next,
+                                           lwt_page_t, lwtp_list);
+               }
+       }
+
+       if (enable) {
+               lwt_enabled = 1;
+               mb();
+               LWT_EVENT(0,0,0,0);
+       }
+
+       return (0);
+}
+
+int
+lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size,
+             void *user_ptr, int user_size)
+{
+       const int    events_per_page = PAGE_CACHE_SIZE / sizeof(lwt_event_t);
+       const int    bytes_per_page = events_per_page * sizeof(lwt_event_t);
+       lwt_page_t  *p;
+       int       i;
+       int       j;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               return (-EPERM);
+
+       *ncpu = num_online_cpus();
+       *total_size = num_online_cpus() * lwt_pages_per_cpu *
+               bytes_per_page;
+       *now = get_cycles();
+
+       if (user_ptr == NULL)
+               return (0);
+
+       for (i = 0; i < num_online_cpus(); i++) {
+               p = lwt_cpus[i].lwtc_current_page;
+
+               if (p == NULL)
+                       return (-ENODATA);
+
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       if (copy_to_user(user_ptr, p->lwtp_events,
+                                            bytes_per_page))
+                               return (-EFAULT);
+
+                       user_ptr = ((char *)user_ptr) + bytes_per_page;
+                       p = list_entry(p->lwtp_list.next,
+                                          lwt_page_t, lwtp_list);
+               }
+       }
+
+       return (0);
+}
+
+int
+lwt_init(void)
+{
+       int     i;
+       int     j;
+
+       for (i = 0; i < num_online_cpus(); i++)
+               if (lwt_cpus[i].lwtc_current_page != NULL)
+                       return (-EALREADY);
+
+       LASSERT (!lwt_enabled);
+
+       /* NULL pointers, zero scalars */
+       memset (lwt_cpus, 0, sizeof (lwt_cpus));
+       lwt_pages_per_cpu =
+               LWT_MEMORY / (num_online_cpus() * PAGE_CACHE_SIZE);
+
+       for (i = 0; i < num_online_cpus(); i++)
+               for (j = 0; j < lwt_pages_per_cpu; j++) {
+                       struct page *page = alloc_page (GFP_KERNEL);
+                       lwt_page_t  *lwtp;
+
+                       if (page == NULL) {
+                               CERROR ("Can't allocate page\n");
+                               lwt_fini ();
+                               return (-ENOMEM);
+                       }
+
+                       LIBCFS_ALLOC(lwtp, sizeof (*lwtp));
+                       if (lwtp == NULL) {
+                               CERROR ("Can't allocate lwtp\n");
+                               __free_page(page);
+                               lwt_fini ();
+                               return (-ENOMEM);
+                       }
+
+                       lwtp->lwtp_page = page;
+                       lwtp->lwtp_events = page_address(page);
+                       memset (lwtp->lwtp_events, 0, PAGE_CACHE_SIZE);
+
+                       if (j == 0) {
+                               INIT_LIST_HEAD (&lwtp->lwtp_list);
+                               lwt_cpus[i].lwtc_current_page = lwtp;
+                       } else {
+                               list_add (&lwtp->lwtp_list,
+                                   &lwt_cpus[i].lwtc_current_page->lwtp_list);
+                       }
+               }
+
+       lwt_enabled = 1;
+       mb();
+
+       LWT_EVENT(0,0,0,0);
+
+       return (0);
+}
+
+void
+lwt_fini(void)
+{
+       int    i;
+
+       lwt_control(0, 0);
+
+       for (i = 0; i < num_online_cpus(); i++)
+               while (lwt_cpus[i].lwtc_current_page != NULL) {
+                       lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page;
+
+                       if (list_empty (&lwtp->lwtp_list)) {
+                               lwt_cpus[i].lwtc_current_page = NULL;
+                       } else {
+                               lwt_cpus[i].lwtc_current_page =
+                                       list_entry (lwtp->lwtp_list.next,
+                                                       lwt_page_t, lwtp_list);
+
+                               list_del (&lwtp->lwtp_list);
+                       }
+
+                       __free_page (lwtp->lwtp_page);
+                       LIBCFS_FREE (lwtp, sizeof (*lwtp));
+               }
+}
+
+EXPORT_SYMBOL(lwt_enabled);
+EXPORT_SYMBOL(lwt_cpus);
+
+EXPORT_SYMBOL(lwt_init);
+EXPORT_SYMBOL(lwt_fini);
+EXPORT_SYMBOL(lwt_lookup_string);
+EXPORT_SYMBOL(lwt_control);
+EXPORT_SYMBOL(lwt_snapshot);
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c
new file mode 100644 (file)
index 0000000..3372537
--- /dev/null
@@ -0,0 +1,498 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/libcfs_crypto.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet.h>
+#include "tracefile.h"
+
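+/* Memhog pages are held in a shallow tree: a root page of pointers to
+ * first-level pages, each of which points at leaf pages.  Walk the
+ * tree and free everything. */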
+void
+kportal_memhog_free (struct libcfs_device_userstate *ldu)
+{
+       struct page **level0p = &ldu->ldu_memhog_root_page;
+       struct page **level1p;
+       struct page **level2p;
+       int        count1;
+       int        count2;
+
+       if (*level0p != NULL) {
+
+               level1p = (struct page **)page_address(*level0p);
+               count1 = 0;
+
+               while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+                      *level1p != NULL) {
+
+                       level2p = (struct page **)page_address(*level1p);
+                       count2 = 0;
+
+                       while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) &&
+                              *level2p != NULL) {
+
+                               __free_page(*level2p);
+                               ldu->ldu_memhog_pages--;
+                               level2p++;
+                               count2++;
+                       }
+
+                       __free_page(*level1p);
+                       ldu->ldu_memhog_pages--;
+                       level1p++;
+                       count1++;
+               }
+
+               __free_page(*level0p);
+               ldu->ldu_memhog_pages--;
+
+               *level0p = NULL;
+       }
+
+       LASSERT (ldu->ldu_memhog_pages == 0);
+}
+
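+/* Build the memhog page tree up to npages pages, bailing out early if
+ * a signal is pending so large allocations remain interruptible. */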
+int
+kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags)
+{
+       struct page **level0p;
+       struct page **level1p;
+       struct page **level2p;
+       int        count1;
+       int        count2;
+
+       LASSERT (ldu->ldu_memhog_pages == 0);
+       LASSERT (ldu->ldu_memhog_root_page == NULL);
+
+       if (npages < 0)
+               return -EINVAL;
+
+       if (npages == 0)
+               return 0;
+
+       level0p = &ldu->ldu_memhog_root_page;
+       *level0p = alloc_page(flags);
+       if (*level0p == NULL)
+               return -ENOMEM;
+       ldu->ldu_memhog_pages++;
+
+       level1p = (struct page **)page_address(*level0p);
+       count1 = 0;
+       memset(level1p, 0, PAGE_CACHE_SIZE);
+
+       while (ldu->ldu_memhog_pages < npages &&
+              count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+               if (cfs_signal_pending())
+                       return (-EINTR);
+
+               *level1p = alloc_page(flags);
+               if (*level1p == NULL)
+                       return -ENOMEM;
+               ldu->ldu_memhog_pages++;
+
+               level2p = (struct page **)page_address(*level1p);
+               count2 = 0;
+               memset(level2p, 0, PAGE_CACHE_SIZE);
+
+               while (ldu->ldu_memhog_pages < npages &&
+                      count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) {
+
+                       if (cfs_signal_pending())
+                               return (-EINTR);
+
+                       *level2p = alloc_page(flags);
+                       if (*level2p == NULL)
+                               return (-ENOMEM);
+                       ldu->ldu_memhog_pages++;
+
+                       level2p++;
+                       count2++;
+               }
+
+               level1p++;
+               count1++;
+       }
+
+       return 0;
+}
+
+/* called when opening /dev/device */
+static int libcfs_psdev_open(unsigned long flags, void *args)
+{
+       struct libcfs_device_userstate *ldu;
+       ENTRY;
+
+       try_module_get(THIS_MODULE);
+
+       LIBCFS_ALLOC(ldu, sizeof(*ldu));
+       if (ldu != NULL) {
+               ldu->ldu_memhog_pages = 0;
+               ldu->ldu_memhog_root_page = NULL;
+       }
+       *(struct libcfs_device_userstate **)args = ldu;
+
+       RETURN(0);
+}
+
+/* called when closing /dev/device */
+static int libcfs_psdev_release(unsigned long flags, void *args)
+{
+       struct libcfs_device_userstate *ldu;
+       ENTRY;
+
+       ldu = (struct libcfs_device_userstate *)args;
+       if (ldu != NULL) {
+               kportal_memhog_free(ldu);
+               LIBCFS_FREE(ldu, sizeof(*ldu));
+       }
+
+       module_put(THIS_MODULE);
+       RETURN(0);
+}
+
+static struct rw_semaphore ioctl_list_sem;
+static struct list_head ioctl_list;
+
+int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
+{
+       int rc = 0;
+
+       down_write(&ioctl_list_sem);
+       if (!list_empty(&hand->item))
+               rc = -EBUSY;
+       else
+               list_add_tail(&hand->item, &ioctl_list);
+       up_write(&ioctl_list_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_register_ioctl);
+
+int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
+{
+       int rc = 0;
+
+       down_write(&ioctl_list_sem);
+       if (list_empty(&hand->item))
+               rc = -ENOENT;
+       else
+               list_del_init(&hand->item);
+       up_write(&ioctl_list_sem);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_deregister_ioctl);
+
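+/* Central ioctl dispatcher: generic libcfs commands are handled here,
+ * anything else is offered to the registered handlers on ioctl_list. */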
+static int libcfs_ioctl_int(struct cfs_psdev_file *pfile,unsigned long cmd,
+                           void *arg, struct libcfs_ioctl_data *data)
+{
+       int err = -EINVAL;
+       ENTRY;
+
+       switch (cmd) {
+       case IOC_LIBCFS_CLEAR_DEBUG:
+               libcfs_debug_clear_buffer();
+               RETURN(0);
+       /*
+        * case IOC_LIBCFS_PANIC:
+        * Handled in arch/cfs_module.c
+        */
+       case IOC_LIBCFS_MARK_DEBUG:
+               if (data->ioc_inlbuf1 == NULL ||
+                   data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                       RETURN(-EINVAL);
+               libcfs_debug_mark_buffer(data->ioc_inlbuf1);
+               RETURN(0);
+#if LWT_SUPPORT
+       case IOC_LIBCFS_LWT_CONTROL:
+               err = lwt_control ((data->ioc_flags & 1) != 0,
+                                  (data->ioc_flags & 2) != 0);
+               break;
+
+       case IOC_LIBCFS_LWT_SNAPSHOT: {
+               cfs_cycles_t   now;
+               int         ncpu;
+               int         total_size;
+
+               err = lwt_snapshot (&now, &ncpu, &total_size,
+                                   data->ioc_pbuf1, data->ioc_plen1);
+               data->ioc_u64[0] = now;
+               data->ioc_u32[0] = ncpu;
+               data->ioc_u32[1] = total_size;
+
+               /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
+               data->ioc_u32[2] = sizeof(lwt_event_t);
+               data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where);
+
+               if (err == 0 &&
+                   libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+                       err = -EFAULT;
+               break;
+       }
+
+       case IOC_LIBCFS_LWT_LOOKUP_STRING:
+               err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+                                        data->ioc_pbuf2, data->ioc_plen2);
+               if (err == 0 &&
+                   libcfs_ioctl_popdata(arg, data, sizeof (*data)))
+                       err = -EFAULT;
+               break;
+#endif
+       case IOC_LIBCFS_MEMHOG:
+               if (pfile->private_data == NULL) {
+                       err = -EINVAL;
+               } else {
+                       kportal_memhog_free(pfile->private_data);
+                       /* XXX ioc_flags is not a GFP mask yet; this needs to be fixed */
+                       err = kportal_memhog_alloc(pfile->private_data,
+                                                  data->ioc_count,
+                                                  data->ioc_flags);
+                       if (err != 0)
+                               kportal_memhog_free(pfile->private_data);
+               }
+               break;
+
+       case IOC_LIBCFS_PING_TEST: {
+               extern void (kping_client)(struct libcfs_ioctl_data *);
+               void (*ping)(struct libcfs_ioctl_data *);
+
+               CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+                      data->ioc_count, libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(data->ioc_nid));
+               ping = symbol_get(kping_client);
+               if (!ping)
+                       CERROR("symbol_get failed\n");
+               else {
+                       ping(data);
+                       symbol_put(kping_client);
+               }
+               RETURN(0);
+       }
+
+       default: {
+               struct libcfs_ioctl_handler *hand;
+               err = -EINVAL;
+               down_read(&ioctl_list_sem);
+               list_for_each_entry(hand, &ioctl_list, item) {
+                       err = hand->handle_ioctl(cmd, data);
+                       if (err != -EINVAL) {
+                               if (err == 0)
+                                       err = libcfs_ioctl_popdata(arg,
+                                                       data, sizeof (*data));
+                               break;
+                       }
+               }
+               up_read(&ioctl_list_sem);
+               break;
+       }
+       }
+
+       RETURN(err);
+}
+
+static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg)
+{
+       char    *buf;
+       struct libcfs_ioctl_data *data;
+       int err = 0;
+       ENTRY;
+
+       LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS);
+       if (buf == NULL)
+               RETURN(-ENOMEM);
+
+       /* 'cmd' and permissions get checked in our arch-specific caller */
+       if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+               CERROR("PORTALS ioctl: data error\n");
+               GOTO(out, err = -EINVAL);
+       }
+       data = (struct libcfs_ioctl_data *)buf;
+
+       err = libcfs_ioctl_int(pfile, cmd, arg, data);
+
+out:
+       LIBCFS_FREE(buf, 1024);
+       RETURN(err);
+}
+
+
+struct cfs_psdev_ops libcfs_psdev_ops = {
+       libcfs_psdev_open,
+       libcfs_psdev_release,
+       NULL,
+       NULL,
+       libcfs_ioctl
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+extern psdev_t libcfs_dev;
+extern struct rw_semaphore cfs_tracefile_sem;
+extern struct mutex cfs_trace_thread_mutex;
+extern struct cfs_wi_sched *cfs_sched_rehash;
+
+extern void libcfs_init_nidstrings(void);
+extern int libcfs_arch_init(void);
+extern void libcfs_arch_cleanup(void);
+
+static int init_libcfs_module(void)
+{
+       int rc;
+
+       libcfs_arch_init();
+       libcfs_init_nidstrings();
+       init_rwsem(&cfs_tracefile_sem);
+       mutex_init(&cfs_trace_thread_mutex);
+       init_rwsem(&ioctl_list_sem);
+       INIT_LIST_HEAD(&ioctl_list);
+       init_waitqueue_head(&cfs_race_waitq);
+
+       rc = libcfs_debug_init(5 * 1024 * 1024);
+       if (rc < 0) {
+               printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc);
+               return (rc);
+       }
+
+       rc = cfs_cpu_init();
+       if (rc != 0)
+               goto cleanup_debug;
+
+#if LWT_SUPPORT
+       rc = lwt_init();
+       if (rc != 0) {
+               CERROR("lwt_init: error %d\n", rc);
+               goto cleanup_debug;
+       }
+#endif
+       rc = misc_register(&libcfs_dev);
+       if (rc) {
+               CERROR("misc_register: error %d\n", rc);
+               goto cleanup_lwt;
+       }
+
+       rc = cfs_wi_startup();
+       if (rc) {
+               CERROR("initialize workitem: error %d\n", rc);
+               goto cleanup_deregister;
+       }
+
+       /* max to 4 threads, should be enough for rehash */
+       rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4);
+       rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY,
+                                rc, &cfs_sched_rehash);
+       if (rc != 0) {
+               CERROR("Startup workitem scheduler: error: %d\n", rc);
+               goto cleanup_deregister;
+       }
+
+       rc = cfs_crypto_register();
+       if (rc) {
+               CERROR("cfs_crypto_regster: error %d\n", rc);
+               goto cleanup_wi;
+       }
+
+
+       rc = insert_proc();
+       if (rc) {
+               CERROR("insert_proc: error %d\n", rc);
+               goto cleanup_crypto;
+       }
+
+       CDEBUG (D_OTHER, "portals setup OK\n");
+       return 0;
+ cleanup_crypto:
+       cfs_crypto_unregister();
+ cleanup_wi:
+       cfs_wi_shutdown();
+ cleanup_deregister:
+       misc_deregister(&libcfs_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+       lwt_fini();
+#endif
+ cleanup_debug:
+       libcfs_debug_cleanup();
+       return rc;
+}
+
+static void exit_libcfs_module(void)
+{
+       int rc;
+
+       remove_proc();
+
+       CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+              atomic_read(&libcfs_kmemory));
+
+       if (cfs_sched_rehash != NULL) {
+               cfs_wi_sched_destroy(cfs_sched_rehash);
+               cfs_sched_rehash = NULL;
+       }
+
+       cfs_crypto_unregister();
+       cfs_wi_shutdown();
+
+       rc = misc_deregister(&libcfs_dev);
+       if (rc)
+               CERROR("misc_deregister error %d\n", rc);
+
+#if LWT_SUPPORT
+       lwt_fini();
+#endif
+       cfs_cpu_fini();
+
+       if (atomic_read(&libcfs_kmemory) != 0)
+               CERROR("Portals memory leaked: %d bytes\n",
+                      atomic_read(&libcfs_kmemory));
+
+       rc = libcfs_debug_cleanup();
+       if (rc)
+               printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n",
+                      rc);
+
+       fini_rwsem(&ioctl_list_sem);
+       fini_rwsem(&cfs_tracefile_sem);
+
+       libcfs_arch_cleanup();
+}
+
+cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module);
diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c
new file mode 100644 (file)
index 0000000..ccfd107
--- /dev/null
@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/nidstrings.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racey temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int       libcfs_nidstring_idx;
+
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings (void)
+{
+       spin_lock_init(&libcfs_nidstring_lock);
+}
+
+# define NIDSTR_LOCK(f)   spin_lock_irqsave(&libcfs_nidstring_lock, f)
+# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f)
+
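+/* Return the next buffer from the static ring of nidstrings; see the
+ * CAVEAT EMPTOR above about the result being volatile under load. */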
+static char *
+libcfs_next_nidstring (void)
+{
+       char      *str;
+       unsigned long  flags;
+
+       NIDSTR_LOCK(flags);
+
+       str = libcfs_nidstrings[libcfs_nidstring_idx++];
+       if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings))
+               libcfs_nidstring_idx = 0;
+
+       NIDSTR_UNLOCK(flags);
+       return str;
+}
+
+static int  libcfs_lo_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_ip_addr2str(__u32 addr, char *str);
+static int  libcfs_ip_str2addr(const char *str, int nob, __u32 *addr);
+static void libcfs_decnum_addr2str(__u32 addr, char *str);
+static void libcfs_hexnum_addr2str(__u32 addr, char *str);
+static int  libcfs_num_str2addr(const char *str, int nob, __u32 *addr);
+static int  libcfs_num_parse(char *str, int len, struct list_head *list);
+static int  libcfs_num_match(__u32 addr, struct list_head *list);
+
+struct netstrfns {
+       int       nf_type;
+       char    *nf_name;
+       char    *nf_modname;
+       void       (*nf_addr2str)(__u32 addr, char *str);
+       int     (*nf_str2addr)(const char *str, int nob, __u32 *addr);
+       int     (*nf_parse_addrlist)(char *str, int len,
+                                       struct list_head *list);
+       int     (*nf_match_addr)(__u32 addr, struct list_head *list);
+};
+
+static struct netstrfns  libcfs_netstrfns[] = {
+       {/* .nf_type      */  LOLND,
+        /* .nf_name      */  "lo",
+        /* .nf_modname   */  "klolnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_lo_str2addr,
+        /* .nf_parse_addr*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  SOCKLND,
+        /* .nf_name      */  "tcp",
+        /* .nf_modname   */  "ksocklnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  O2IBLND,
+        /* .nf_name      */  "o2ib",
+        /* .nf_modname   */  "ko2iblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  CIBLND,
+        /* .nf_name      */  "cib",
+        /* .nf_modname   */  "kciblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  OPENIBLND,
+        /* .nf_name      */  "openib",
+        /* .nf_modname   */  "kopeniblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  IIBLND,
+        /* .nf_name      */  "iib",
+        /* .nf_modname   */  "kiiblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  VIBLND,
+        /* .nf_name      */  "vib",
+        /* .nf_modname   */  "kviblnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  RALND,
+        /* .nf_name      */  "ra",
+        /* .nf_modname   */  "kralnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  QSWLND,
+        /* .nf_name      */  "elan",
+        /* .nf_modname   */  "kqswlnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  GMLND,
+        /* .nf_name      */  "gm",
+        /* .nf_modname   */  "kgmlnd",
+        /* .nf_addr2str  */  libcfs_hexnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  MXLND,
+        /* .nf_name      */  "mx",
+        /* .nf_modname   */  "kmxlnd",
+        /* .nf_addr2str  */  libcfs_ip_addr2str,
+        /* .nf_str2addr  */  libcfs_ip_str2addr,
+        /* .nf_parse_addrlist*/  cfs_ip_addr_parse,
+        /* .nf_match_addr*/  cfs_ip_addr_match},
+       {/* .nf_type      */  PTLLND,
+        /* .nf_name      */  "ptl",
+        /* .nf_modname   */  "kptllnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       {/* .nf_type      */  GNILND,
+        /* .nf_name      */  "gni",
+        /* .nf_modname   */  "kgnilnd",
+        /* .nf_addr2str  */  libcfs_decnum_addr2str,
+        /* .nf_str2addr  */  libcfs_num_str2addr,
+        /* .nf_parse_addrlist*/  libcfs_num_parse,
+        /* .nf_match_addr*/  libcfs_num_match},
+       /* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+       {/* .nf_type      */  -1},
+};
+
+const int libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns);
+
+int
+libcfs_lo_str2addr(const char *str, int nob, __u32 *addr)
+{
+       *addr = 0;
+       return 1;
+}
+
+void
+libcfs_ip_addr2str(__u32 addr, char *str)
+{
+       snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+                (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+                (addr >> 8) & 0xff, addr & 0xff);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '\0' in a string,
+ * so I initialise the %n variable to the expected length.  If sscanf sets it,
+ * fine; if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
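+
+/* Worked example (illustrative): for str = "10.0.0.1x" with nob = 9,
+ * sscanf() matches the four octets and sets %n to 8, so n != nob and
+ * the trailing 'x' is rejected; for a clean "10.0.0.1", n ends up equal
+ * to nob whether sscanf assigns %n or the preset value survives.
+ */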
+
+int
+libcfs_ip_str2addr(const char *str, int nob, __u32 *addr)
+{
+       int   a;
+       int   b;
+       int   c;
+       int   d;
+       int   n = nob;                    /* XscanfX */
+
+       /* numeric IP? */
+       if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+           n == nob &&
+           (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+           (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+               *addr = ((a<<24)|(b<<16)|(c<<8)|d);
+               return 1;
+       }
+
+       return 0;
+}
+
+void
+libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+       snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+void
+libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+       snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+int
+libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
+{
+       int     n;
+
+       n = nob;
+       if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       n = nob;
+       if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       n = nob;
+       if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+               return 1;
+
+       return 0;
+}
+
+struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+       int    i;
+
+       if (lnd >= 0)
+               for (i = 0; i < libcfs_nnetstrfns; i++)
+                       if (lnd == libcfs_netstrfns[i].nf_type)
+                               return &libcfs_netstrfns[i];
+
+       return NULL;
+}
+
+struct netstrfns *
+libcfs_namenum2netstrfns(const char *name)
+{
+       struct netstrfns *nf;
+       int            i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++) {
+               nf = &libcfs_netstrfns[i];
+               if (nf->nf_type >= 0 &&
+                   !strncmp(name, nf->nf_name, strlen(nf->nf_name)))
+                       return nf;
+       }
+       return NULL;
+}
+
+struct netstrfns *
+libcfs_name2netstrfns(const char *name)
+{
+       int    i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++)
+               if (libcfs_netstrfns[i].nf_type >= 0 &&
+                   !strcmp(libcfs_netstrfns[i].nf_name, name))
+                       return &libcfs_netstrfns[i];
+
+       return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+       return libcfs_lnd2netstrfns(type) != NULL;
+}
+
+char *
+libcfs_lnd2modname(int lnd)
+{
+       struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+       return (nf == NULL) ? NULL : nf->nf_modname;
+}
+
+char *
+libcfs_lnd2str(int lnd)
+{
+       char       *str;
+       struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+       if (nf != NULL)
+               return nf->nf_name;
+
+       str = libcfs_next_nidstring();
+       snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd);
+       return str;
+}
+
+int
+libcfs_str2lnd(const char *str)
+{
+       struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+       if (nf != NULL)
+               return nf->nf_type;
+
+       return -1;
+}
+
+char *
+libcfs_net2str(__u32 net)
+{
+       int            lnd = LNET_NETTYP(net);
+       int            num = LNET_NETNUM(net);
+       struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+       char         *str = libcfs_next_nidstring();
+
+       if (nf == NULL)
+               snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num);
+       else if (num == 0)
+               snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+       else
+               snprintf(str, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num);
+
+       return str;
+}
+
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+       __u32        addr = LNET_NIDADDR(nid);
+       __u32        net = LNET_NIDNET(nid);
+       int            lnd = LNET_NETTYP(net);
+       int            nnum = LNET_NETNUM(net);
+       struct netstrfns *nf;
+       char         *str;
+       int            nob;
+
+       if (nid == LNET_NID_ANY)
+               return "<?>";
+
+       nf = libcfs_lnd2netstrfns(lnd);
+       str = libcfs_next_nidstring();
+
+       if (nf == NULL)
+               snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum);
+       else {
+               nf->nf_addr2str(addr, str);
+               nob = strlen(str);
+               if (nnum == 0)
+                       snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s",
+                                nf->nf_name);
+               else
+                       snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%u",
+                                nf->nf_name, nnum);
+       }
+
+       return str;
+}
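+
+/* Example renderings (illustrative): address 0xc0a80102 on tcp1 prints
+ * as "192.168.1.2@tcp1", network number 0 drops the numeric suffix
+ * ("@tcp"), and an unknown lnd falls back to the raw "%x@<lnd:netnum>"
+ * form.
+ */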
+
+static struct netstrfns *
+libcfs_str2net_internal(const char *str, __u32 *net)
+{
+       struct netstrfns *uninitialized_var(nf);
+       int            nob;
+       int            netnum;
+       int            i;
+
+       for (i = 0; i < libcfs_nnetstrfns; i++) {
+               nf = &libcfs_netstrfns[i];
+               if (nf->nf_type >= 0 &&
+                   !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+                       break;
+       }
+
+       if (i == libcfs_nnetstrfns)
+               return NULL;
+
+       nob = strlen(nf->nf_name);
+
+       if (strlen(str) == (unsigned int)nob) {
+               netnum = 0;
+       } else {
+               if (nf->nf_type == LOLND) /* net number not allowed */
+                       return NULL;
+
+               str += nob;
+               i = strlen(str);
+               if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+                   i != (int)strlen(str))
+                       return NULL;
+       }
+
+       *net = LNET_MKNET(nf->nf_type, netnum);
+       return nf;
+}
+
+__u32
+libcfs_str2net(const char *str)
+{
+       __u32  net;
+
+       if (libcfs_str2net_internal(str, &net) != NULL)
+               return net;
+
+       return LNET_NIDNET(LNET_NID_ANY);
+}
+
+lnet_nid_t
+libcfs_str2nid(const char *str)
+{
+       const char       *sep = strchr(str, '@');
+       struct netstrfns *nf;
+       __u32        net;
+       __u32        addr;
+
+       if (sep != NULL) {
+               nf = libcfs_str2net_internal(sep + 1, &net);
+               if (nf == NULL)
+                       return LNET_NID_ANY;
+       } else {
+               sep = str + strlen(str);
+               net = LNET_MKNET(SOCKLND, 0);
+               nf = libcfs_lnd2netstrfns(SOCKLND);
+               LASSERT(nf != NULL);
+       }
+
+       if (!nf->nf_str2addr(str, (int)(sep - str), &addr))
+               return LNET_NID_ANY;
+
+       return LNET_MKNID(net, addr);
+}
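+
+/* Illustrative behaviour: "10.0.0.1@tcp2" parses against the tcp
+ * handlers with network number 2, while a bare "10.0.0.1" (no '@')
+ * defaults to SOCKLND network 0, i.e. it is read as "10.0.0.1@tcp".
+ */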
+
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+       char *str = libcfs_next_nidstring();
+
+       if (id.pid == LNET_PID_ANY) {
+               snprintf(str, LNET_NIDSTR_SIZE,
+                        "LNET_PID_ANY-%s", libcfs_nid2str(id.nid));
+               return str;
+       }
+
+       snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+                ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+                (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+       return str;
+}
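+
+/* Illustrative output: pid 12345 on nid 10.0.0.1@tcp renders as
+ * "12345-10.0.0.1@tcp", a userspace pid gains a "U" prefix, and
+ * LNET_PID_ANY renders as "LNET_PID_ANY-<nid>".
+ */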
+
+int
+libcfs_str2anynid(lnet_nid_t *nidp, const char *str)
+{
+       if (!strcmp(str, "*")) {
+               *nidp = LNET_NID_ANY;
+               return 1;
+       }
+
+       *nidp = libcfs_str2nid(str);
+       return *nidp != LNET_NID_ANY;
+}
+
+/**
+ * Nid range list syntax.
+ * \verbatim
+ *
+ * <nidlist>       :== <nidrange> [ ' ' <nidrange> ]
+ * <nidrange>      :== <addrrange> '@' <net>
+ * <addrrange>     :== '*' |
+ *                     <ipaddr_range> |
+ *                     <cfs_expr_list>
+ * <ipaddr_range>  :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
+ *                     <cfs_expr_list>
+ * <cfs_expr_list> :== <number> |
+ *                     <expr_list>
+ * <expr_list>     :== '[' <range_expr> [ ',' <range_expr> ] ']'
+ * <range_expr>    :== <number> |
+ *                     <number> '-' <number> |
+ *                     <number> '-' <number> '/' <number>
+ * <net>           :== <netname> | <netname><number>
+ * <netname>       :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
+ *                     "vib" | "ra" | "elan" | "mx" | "ptl"
+ * \endverbatim
+ */
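+
+/*
+ * Example strings accepted by the grammar above (illustrative):
+ *
+ *	"*@elan3"			every nid on network elan3
+ *	"192.168.0.[2-10/2]@tcp"	a stepped <ipaddr_range> on tcp0
+ *	"[0-31]@ptl 12@ptl1"		two <nidrange>s separated by a space
+ */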
+
+/**
+ * Structure to represent \<nidrange\> token of the syntax.
+ *
+ * One of this is created for each \<net\> parsed.
+ */
+struct nidrange {
+       /**
+        * Link to the list of these structures, which is built during
+        * nid range list parsing.
+        */
+       struct list_head nr_link;
+       /**
+        * List head for addrrange::ar_link.
+        */
+       struct list_head nr_addrranges;
+       /**
+        * Flag indicating that *@<net> is found.
+        */
+       int nr_all;
+       /**
+        * Pointer to corresponding element of libcfs_netstrfns.
+        */
+       struct netstrfns *nr_netstrfns;
+       /**
+        * Network number, e.g. 5 if \<net\> is "elan5".
+        */
+       int nr_netnum;
+};
+
+/**
+ * Structure to represent \<addrrange\> token of the syntax.
+ */
+struct addrrange {
+       /**
+        * Link to nidrange::nr_addrranges.
+        */
+       struct list_head ar_link;
+       /**
+        * List head for cfs_expr_list::el_list.
+        */
+       struct list_head ar_numaddr_ranges;
+};
+
+/**
+ * nf_parse_addrlist method for networks using numeric addresses.
+ *
+ * Examples of such networks are gm and elan.
+ *
+ * \retval 0 if \a str parses to a numeric address list
+ * \retval errno otherwise
+ */
+static int
+libcfs_num_parse(char *str, int len, struct list_head *list)
+{
+       struct cfs_expr_list *el;
+       int     rc;
+
+       rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el);
+       if (rc == 0)
+               list_add_tail(&el->el_link, list);
+
+       return rc;
+}
+
+/**
+ * Parses the \<addrrange\> token of the syntax.
+ *
+ * Allocates struct addrrange and links to \a nidrange via
+ * (nidrange::nr_addrranges)
+ *
+ * \retval 1 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
+ * \retval 0 otherwise
+ */
+static int
+parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
+{
+       struct addrrange *addrrange;
+
+       if (src->ls_len == 1 && src->ls_str[0] == '*') {
+               nidrange->nr_all = 1;
+               return 1;
+       }
+
+       LIBCFS_ALLOC(addrrange, sizeof(struct addrrange));
+       if (addrrange == NULL)
+               return 0;
+       list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
+       INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
+
+       return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
+                                               src->ls_len,
+                                               &addrrange->ar_numaddr_ranges);
+}
+
+/**
+ * Finds or creates struct nidrange.
+ *
+ * Checks if \a src is a valid network name, looks for the corresponding
+ * nidrange on the list of nidranges (\a nidlist), and creates a new
+ * struct nidrange if it is not found.
+ *
+ * \retval pointer to struct nidrange matching network specified via \a src
+ * \retval NULL if \a src does not match any network
+ */
+static struct nidrange *
+add_nidrange(const struct cfs_lstr *src,
+            struct list_head *nidlist)
+{
+       struct netstrfns *nf;
+       struct nidrange *nr;
+       int endlen;
+       unsigned netnum;
+
+       if (src->ls_len >= LNET_NIDSTR_SIZE)
+               return NULL;
+
+       nf = libcfs_namenum2netstrfns(src->ls_str);
+       if (nf == NULL)
+               return NULL;
+       endlen = src->ls_len - strlen(nf->nf_name);
+       if (endlen == 0) {
+               /* network name only, e.g. "elan" or "tcp" */
+               netnum = 0;
+       } else {
+               /* e.g. "elan25" or "tcp23", refuse to parse if
+                * network name is not appended with decimal or
+                * hexadecimal number */
+               if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
+                                      endlen, &netnum, 0, MAX_NUMERIC_VALUE))
+                       return NULL;
+       }
+
+       list_for_each_entry(nr, nidlist, nr_link) {
+               if (nr->nr_netstrfns != nf)
+                       continue;
+               if (nr->nr_netnum != netnum)
+                       continue;
+               return nr;
+       }
+
+       LIBCFS_ALLOC(nr, sizeof(struct nidrange));
+       if (nr == NULL)
+               return NULL;
+       list_add_tail(&nr->nr_link, nidlist);
+       INIT_LIST_HEAD(&nr->nr_addrranges);
+       nr->nr_netstrfns = nf;
+       nr->nr_all = 0;
+       nr->nr_netnum = netnum;
+
+       return nr;
+}
+
+/**
+ * Parses \<nidrange\> token of the syntax.
+ *
+ * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
+ * \retval 0 otherwise
+ */
+static int
+parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
+{
+       struct cfs_lstr addrrange;
+       struct cfs_lstr net;
+       struct cfs_lstr tmp;
+       struct nidrange *nr;
+
+       tmp = *src;
+       if (cfs_gettok(src, '@', &addrrange) == 0)
+               goto failed;
+
+       if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL)
+               goto failed;
+
+       nr = add_nidrange(&net, nidlist);
+       if (nr == NULL)
+               goto failed;
+
+       if (parse_addrange(&addrrange, nr) != 0)
+               goto failed;
+
+       return 1;
+ failed:
+       CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str);
+       return 0;
+}
+
+/**
+ * Frees addrrange structures of \a list.
+ *
+ * For each struct addrrange structure found on \a list it frees the
+ * cfs_expr_list attached to it and frees the addrrange itself.
+ *
+ * \retval none
+ */
+static void
+free_addrranges(struct list_head *list)
+{
+       while (!list_empty(list)) {
+               struct addrrange *ar;
+
+               ar = list_entry(list->next, struct addrrange, ar_link);
+
+               cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
+               list_del(&ar->ar_link);
+               LIBCFS_FREE(ar, sizeof(struct addrrange));
+       }
+}
+
+/**
+ * Frees nidrange structures of \a list.
+ *
+ * For each struct nidrange structure found on \a list it frees
+ * addrrange list attached to it and frees the nidrange itself.
+ *
+ * \retval none
+ */
+void
+cfs_free_nidlist(struct list_head *list)
+{
+       struct list_head *pos, *next;
+       struct nidrange *nr;
+
+       list_for_each_safe(pos, next, list) {
+               nr = list_entry(pos, struct nidrange, nr_link);
+               free_addrranges(&nr->nr_addrranges);
+               list_del(pos);
+               LIBCFS_FREE(nr, sizeof(struct nidrange));
+       }
+}
+
+/**
+ * Parses nid range list.
+ *
+ * Parses \a str, with rigorous syntax and overflow checking, into
+ * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into a set of
+ * structures, and links those structures to \a nidlist.  The resulting
+ * list can be used to match a NID against the set of NIDs defined by
+ * \a str.
+ * \see cfs_match_nid
+ *
+ * \retval 1 on success
+ * \retval 0 otherwise
+ */
+int
+cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
+{
+       struct cfs_lstr src;
+       struct cfs_lstr res;
+       int rc;
+       ENTRY;
+
+       src.ls_str = str;
+       src.ls_len = len;
+       INIT_LIST_HEAD(nidlist);
+       while (src.ls_str) {
+               rc = cfs_gettok(&src, ' ', &res);
+               if (rc == 0) {
+                       cfs_free_nidlist(nidlist);
+                       RETURN(0);
+               }
+               rc = parse_nidrange(&res, nidlist);
+               if (rc == 0) {
+                       cfs_free_nidlist(nidlist);
+                       RETURN(0);
+               }
+       }
+       RETURN(1);
+}
+
+/*
+ * nf_match_addr method for networks using numeric addresses
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+static int
+libcfs_num_match(__u32 addr, struct list_head *numaddr)
+{
+       struct cfs_expr_list *el;
+
+       LASSERT(!list_empty(numaddr));
+       el = list_entry(numaddr->next, struct cfs_expr_list, el_link);
+
+       return cfs_expr_list_match(addr, el);
+}
+
+/**
+ * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
+ *
+ * \see cfs_parse_nidlist()
+ *
+ * \retval 1 on match
+ * \retval 0 otherwise
+ */
+int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
+{
+       struct nidrange *nr;
+       struct addrrange *ar;
+       ENTRY;
+
+       list_for_each_entry(nr, nidlist, nr_link) {
+               if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
+                       continue;
+               if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
+                       continue;
+               if (nr->nr_all)
+                       RETURN(1);
+               list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
+                       if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
+                                                      &ar->ar_numaddr_ranges))
+                               RETURN(1);
+       }
+       RETURN(0);
+}
+
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+EXPORT_SYMBOL(libcfs_lnd2modname);
+EXPORT_SYMBOL(libcfs_lnd2str);
+EXPORT_SYMBOL(libcfs_str2lnd);
+EXPORT_SYMBOL(libcfs_net2str);
+EXPORT_SYMBOL(libcfs_nid2str);
+EXPORT_SYMBOL(libcfs_str2net);
+EXPORT_SYMBOL(libcfs_str2nid);
+EXPORT_SYMBOL(libcfs_id2str);
+EXPORT_SYMBOL(libcfs_str2anynid);
+EXPORT_SYMBOL(cfs_free_nidlist);
+EXPORT_SYMBOL(cfs_parse_nidlist);
+EXPORT_SYMBOL(cfs_match_nid);
diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c
new file mode 100644 (file)
index 0000000..69224d8
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/prng.c
+ *
+ * A concatenation of the following two 16-bit multiply-with-carry
+ * generators, x(n) = a*x(n-1) + carry mod 2^16 and
+ * y(n) = b*y(n-1) + carry mod 2^16, with the number and carry packed
+ * within the same 32-bit integer.  Algorithm recommended by Marsaglia.
+ */
+
+#include <linux/libcfs/libcfs.h>
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+
+/**
+ * cfs_rand - generate a pseudo-random 32-bit integer
+ *
+ * First it creates new seeds from the previous seeds.  Then it generates
+ * a new pseudo-random number for use.
+ *
+ * Returns a pseudo-random 32-bit integer
+ */
+unsigned int cfs_rand(void)
+{
+       seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16);
+       seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16);
+
+       return ((seed_x << 16) + (seed_y & 65535));
+}
+EXPORT_SYMBOL(cfs_rand);
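+
+/* One MWC step spelled out (illustrative): writing seed_x as
+ * (carry << 16) | x, the update above computes RANDOM_CONST_A * x + carry;
+ * the low 16 bits of the result are the next x and the high bits the
+ * next carry, matching x(n) = a*x(n-1) + carry mod 2^16 from the header
+ * comment.
+ */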
+
+/**
+ * cfs_srand - set the initial seeds
+ * @seed1 : (seed_x) should have the most entropy in the low bits of the word
+ * @seed2 : (seed_y) should have the most entropy in the high bits of the word
+ *
+ * Replaces the original seeds with new values.  Used to start a new
+ * sequence of pseudo-random numbers.
+ */
+void cfs_srand(unsigned int seed1, unsigned int seed2)
+{
+       if (seed1)
+               seed_x = seed1; /* use default seeds if parameter is 0 */
+       if (seed2)
+               seed_y = seed2;
+}
+EXPORT_SYMBOL(cfs_srand);
+
+/**
+ * cfs_get_random_bytes - fill a buffer with random bytes
+ * @buf : buffer to fill with random bytes
+ * @size: size of passed in buffer
+ *
+ * Fills a buffer with random bytes, mixing kernel entropy obtained via
+ * get_random_bytes() with the cfs PRNG stream.
+ */
+void cfs_get_random_bytes(void *buf, int size)
+{
+       int *p = buf;
+       int rem, tmp;
+
+       LASSERT(size >= 0);
+
+       rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size);
+       if (rem) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               tmp ^= cfs_rand();
+               memcpy(buf, &tmp, rem);
+               p = buf + rem;
+               size -= rem;
+       }
+
+       while (size >= sizeof(int)) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               *p = cfs_rand() ^ tmp;
+               size -= sizeof(int);
+               p++;
+       }
+       buf = p;
+       if (size) {
+               get_random_bytes(&tmp, sizeof(tmp));
+               tmp ^= cfs_rand();
+               memcpy(buf, &tmp, size);
+       }
+}
+EXPORT_SYMBOL(cfs_get_random_bytes);
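+
+/* Minimal usage sketch (illustrative, hypothetical caller): draw an
+ * opaque 64-bit identifier seeded from both entropy sources:
+ *
+ *	__u64 xid;
+ *
+ *	cfs_get_random_bytes(&xid, sizeof(xid));
+ */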
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c
new file mode 100644 (file)
index 0000000..439e71d
--- /dev/null
@@ -0,0 +1,1195 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/tracefile.c
+ *
+ * Author: Zach Brown <zab@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+#include "tracefile.h"
+
+#include <linux/libcfs/libcfs.h>
+
+/* XXX move things up to the top, comment */
+union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned;
+
+char cfs_tracefile[TRACEFILE_NAME_SIZE];
+long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+static struct tracefiled_ctl trace_tctl;
+struct mutex cfs_trace_thread_mutex;
+static int thread_running;
+
+atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                        struct cfs_trace_cpu_data *tcd);
+
+static inline struct cfs_trace_page *
+cfs_tage_from_list(struct list_head *list)
+{
+       return list_entry(list, struct cfs_trace_page, linkage);
+}
+
+static struct cfs_trace_page *cfs_tage_alloc(int gfp)
+{
+       struct page         *page;
+       struct cfs_trace_page *tage;
+
+       /* My caller is trying to free memory */
+       if (!in_interrupt() && memory_pressure_get())
+               return NULL;
+
+       /*
+        * Don't spam console with allocation failures: they will be reported
+        * by upper layer anyway.
+        */
+       gfp |= __GFP_NOWARN;
+       page = alloc_page(gfp);
+       if (page == NULL)
+               return NULL;
+
+       tage = kmalloc(sizeof(*tage), gfp);
+       if (tage == NULL) {
+               __free_page(page);
+               return NULL;
+       }
+
+       tage->page = page;
+       atomic_inc(&cfs_tage_allocated);
+       return tage;
+}
+
+static void cfs_tage_free(struct cfs_trace_page *tage)
+{
+       __LASSERT(tage != NULL);
+       __LASSERT(tage->page != NULL);
+
+       __free_page(tage->page);
+       kfree(tage);
+       atomic_dec(&cfs_tage_allocated);
+}
+
+static void cfs_tage_to_tail(struct cfs_trace_page *tage,
+                            struct list_head *queue)
+{
+       __LASSERT(tage != NULL);
+       __LASSERT(queue != NULL);
+
+       list_move_tail(&tage->linkage, queue);
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+                          struct list_head *stock)
+{
+       int i;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES; ++i) {
+               struct cfs_trace_page *tage;
+
+               tage = cfs_tage_alloc(gfp);
+               if (tage == NULL)
+                       break;
+               list_add_tail(&tage->linkage, stock);
+       }
+       return i;
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *
+cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
+{
+       struct cfs_trace_page *tage;
+
+       if (tcd->tcd_cur_pages > 0) {
+               __LASSERT(!list_empty(&tcd->tcd_pages));
+               tage = cfs_tage_from_list(tcd->tcd_pages.prev);
+               if (tage->used + len <= PAGE_CACHE_SIZE)
+                       return tage;
+       }
+
+       if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
+               if (tcd->tcd_cur_stock_pages > 0) {
+                       tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
+                       --tcd->tcd_cur_stock_pages;
+                       list_del_init(&tage->linkage);
+               } else {
+                       tage = cfs_tage_alloc(GFP_ATOMIC);
+                       if (unlikely(tage == NULL)) {
+                               if ((!memory_pressure_get() ||
+                                    in_interrupt()) && printk_ratelimit())
+                                       printk(KERN_WARNING
+                                              "cannot allocate a tage (%ld)\n",
+                                              tcd->tcd_cur_pages);
+                               return NULL;
+                       }
+               }
+
+               tage->used = 0;
+               tage->cpu = smp_processor_id();
+               tage->type = tcd->tcd_type;
+               list_add_tail(&tage->linkage, &tcd->tcd_pages);
+               tcd->tcd_cur_pages++;
+
+               if (tcd->tcd_cur_pages > 8 && thread_running) {
+                       struct tracefiled_ctl *tctl = &trace_tctl;
+                       /*
+                        * wake up tracefiled to process some pages.
+                        */
+                       wake_up(&tctl->tctl_waitq);
+               }
+               return tage;
+       }
+       return NULL;
+}
+
+static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
+{
+       int pgcount = tcd->tcd_cur_pages / 10;
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       if (printk_ratelimit())
+               printk(KERN_WARNING "debug daemon buffer overflowed; "
+                      "discarding 10%% of pages (%d of %ld)\n",
+                      pgcount + 1, tcd->tcd_cur_pages);
+
+       INIT_LIST_HEAD(&pc.pc_pages);
+       spin_lock_init(&pc.pc_lock);
+
+       list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+               if (pgcount-- == 0)
+                       break;
+
+               list_move_tail(&tage->linkage, &pc.pc_pages);
+               tcd->tcd_cur_pages--;
+       }
+       put_pages_on_tcd_daemon_list(&pc, tcd);
+}
+
+/* return a page that has 'len' bytes left at the end */
+static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
+                                                unsigned long len)
+{
+       struct cfs_trace_page *tage;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       if (len > PAGE_CACHE_SIZE) {
+               printk(KERN_ERR
+                      "cowardly refusing to write %lu bytes in a page\n", len);
+               return NULL;
+       }
+
+       tage = cfs_trace_get_tage_try(tcd, len);
+       if (tage != NULL)
+               return tage;
+       if (thread_running)
+               cfs_tcd_shrink(tcd);
+       if (tcd->tcd_cur_pages > 0) {
+               tage = cfs_tage_from_list(tcd->tcd_pages.next);
+               tage->used = 0;
+               cfs_tage_to_tail(tage, &tcd->tcd_pages);
+       }
+       return tage;
+}
+
+int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
+                    const char *format, ...)
+{
+       va_list args;
+       int     rc;
+
+       va_start(args, format);
+       rc = libcfs_debug_vmsg2(msgdata, format, args, NULL);
+       va_end(args);
+
+       return rc;
+}
+EXPORT_SYMBOL(libcfs_debug_msg);
+
+int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
+                      const char *format1, va_list args,
+                      const char *format2, ...)
+{
+       struct cfs_trace_cpu_data *tcd = NULL;
+       struct ptldebug_header     header = {0};
+       struct cfs_trace_page     *tage;
+       /* string_buf is used only if tcd != NULL, and is always set then */
+       char                  *string_buf = NULL;
+       char                  *debug_buf;
+       int                     known_size;
+       int                     needed = 85; /* average message length */
+       int                     max_nob;
+       va_list             ap;
+       int                     depth;
+       int                     i;
+       int                     remain;
+       int                     mask = msgdata->msg_mask;
+       char                  *file = (char *)msgdata->msg_file;
+       cfs_debug_limit_state_t   *cdls = msgdata->msg_cdls;
+
+       if (strchr(file, '/'))
+               file = strrchr(file, '/') + 1;
+
+       tcd = cfs_trace_get_tcd();
+
+       /* cfs_trace_get_tcd() grabs a lock, which disables preemption and
+        * pins us to a particular CPU.  This avoids an smp_processor_id()
+        * warning on Linux when debugging is enabled. */
+       cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());
+
+       if (tcd == NULL)                /* arch may not log in IRQ context */
+               goto console;
+
+       if (tcd->tcd_cur_pages == 0)
+               header.ph_flags |= PH_FLAG_FIRST_RECORD;
+
+       if (tcd->tcd_shutting_down) {
+               cfs_trace_put_tcd(tcd);
+               tcd = NULL;
+               goto console;
+       }
+
+       depth = __current_nesting_level();
+       known_size = strlen(file) + 1 + depth;
+       if (msgdata->msg_fn)
+               known_size += strlen(msgdata->msg_fn) + 1;
+
+       if (libcfs_debug_binary)
+               known_size += sizeof(header);
+
+       /*
+        * The loop below runs at most twice ('2'): vsnprintf() returns the
+        * real size required for the output _without_ the terminating NUL,
+        * so if 'needed' turns out too small for this format we grab a
+        * fresh page and try again.
+        */
+       for (i = 0; i < 2; i++) {
+               tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
+               if (tage == NULL) {
+                       if (needed + known_size > PAGE_CACHE_SIZE)
+                               mask |= D_ERROR;
+
+                       cfs_trace_put_tcd(tcd);
+                       tcd = NULL;
+                       goto console;
+               }
+
+               string_buf = (char *)page_address(tage->page) +
+                                       tage->used + known_size;
+
+               max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+               if (max_nob <= 0) {
+                       printk(KERN_EMERG "negative max_nob: %d\n",
+                              max_nob);
+                       mask |= D_ERROR;
+                       cfs_trace_put_tcd(tcd);
+                       tcd = NULL;
+                       goto console;
+               }
+
+               needed = 0;
+               if (format1) {
+                       va_copy(ap, args);
+                       needed = vsnprintf(string_buf, max_nob, format1, ap);
+                       va_end(ap);
+               }
+
+               if (format2) {
+                       remain = max_nob - needed;
+                       if (remain < 0)
+                               remain = 0;
+
+                       va_start(ap, format2);
+                       needed += vsnprintf(string_buf + needed, remain,
+                                           format2, ap);
+                       va_end(ap);
+               }
+
+               if (needed < max_nob) /* well. printing ok.. */
+                       break;
+       }
+
+       if (*(string_buf + needed - 1) != '\n')
+               printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
+                      file, msgdata->msg_line, msgdata->msg_fn);
+
+       header.ph_len = known_size + needed;
+       debug_buf = (char *)page_address(tage->page) + tage->used;
+
+       if (libcfs_debug_binary) {
+               memcpy(debug_buf, &header, sizeof(header));
+               tage->used += sizeof(header);
+               debug_buf += sizeof(header);
+       }
+
+       /* indent message according to the nesting level */
+       while (depth-- > 0) {
+               *(debug_buf++) = '.';
+               ++tage->used;
+       }
+
+       strcpy(debug_buf, file);
+       tage->used += strlen(file) + 1;
+       debug_buf += strlen(file) + 1;
+
+       if (msgdata->msg_fn) {
+               strcpy(debug_buf, msgdata->msg_fn);
+               tage->used += strlen(msgdata->msg_fn) + 1;
+               debug_buf += strlen(msgdata->msg_fn) + 1;
+       }
+
+       __LASSERT(debug_buf == string_buf);
+
+       tage->used += needed;
+       __LASSERT(tage->used <= PAGE_CACHE_SIZE);
+
+console:
+       if ((mask & libcfs_printk) == 0) {
+               /* no console output requested */
+               if (tcd != NULL)
+                       cfs_trace_put_tcd(tcd);
+               return 1;
+       }
+
+       if (cdls != NULL) {
+               if (libcfs_console_ratelimit &&
+                   cdls->cdls_next != 0 &&     /* not first time ever */
+                   !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+                       /* skipping a console message */
+                       cdls->cdls_count++;
+                       if (tcd != NULL)
+                               cfs_trace_put_tcd(tcd);
+                       return 1;
+               }
+
+               if (cfs_time_after(cfs_time_current(), cdls->cdls_next +
+                                                      libcfs_console_max_delay
+                                                      + cfs_time_seconds(10))) {
+                       /* last timeout was a long time ago */
+                       cdls->cdls_delay /= libcfs_console_backoff * 4;
+               } else {
+                       cdls->cdls_delay *= libcfs_console_backoff;
+
+                       if (cdls->cdls_delay < libcfs_console_min_delay)
+                               cdls->cdls_delay = libcfs_console_min_delay;
+                       else if (cdls->cdls_delay > libcfs_console_max_delay)
+                               cdls->cdls_delay = libcfs_console_max_delay;
+               }
+
+               /* ensure cdls_next is never zero after it's been seen */
+               cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+       }
+
+       if (tcd != NULL) {
+               cfs_print_to_console(&header, mask, string_buf, needed, file,
+                                    msgdata->msg_fn);
+               cfs_trace_put_tcd(tcd);
+       } else {
+               string_buf = cfs_trace_get_console_buffer();
+
+               needed = 0;
+               if (format1 != NULL) {
+                       va_copy(ap, args);
+                       needed = vsnprintf(string_buf,
+                                          CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                          format1, ap);
+                       va_end(ap);
+               }
+               if (format2 != NULL) {
+                       remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed;
+                       if (remain > 0) {
+                               va_start(ap, format2);
+                               needed += vsnprintf(string_buf+needed, remain,
+                                                   format2, ap);
+                               va_end(ap);
+                       }
+               }
+               cfs_print_to_console(&header, mask,
+                                    string_buf, needed, file, msgdata->msg_fn);
+
+               cfs_trace_put_console_buffer(string_buf);
+       }
+
+       if (cdls != NULL && cdls->cdls_count != 0) {
+               string_buf = cfs_trace_get_console_buffer();
+
+               needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE,
+                                 "Skipped %d previous similar message%s\n",
+                                 cdls->cdls_count,
+                                 (cdls->cdls_count > 1) ? "s" : "");
+
+               cfs_print_to_console(&header, mask,
+                                    string_buf, needed, file, msgdata->msg_fn);
+
+               cfs_trace_put_console_buffer(string_buf);
+               cdls->cdls_count = 0;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+void
+cfs_trace_assertion_failed(const char *str,
+                          struct libcfs_debug_msg_data *msgdata)
+{
+       struct ptldebug_header hdr;
+
+       libcfs_panic_in_progress = 1;
+       libcfs_catastrophe = 1;
+       mb();
+
+       cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());
+
+       cfs_print_to_console(&hdr, D_EMERG, str, strlen(str),
+                            msgdata->msg_file, msgdata->msg_fn);
+
+       panic("Lustre debug assertion failure\n");
+
+       /* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+       /* Do the collect_pages job on a single CPU: assumes that all other
+        * CPUs have been stopped during a panic.  If this isn't true for some
+        * arch, this will have to be implemented separately in each arch.  */
+       int                     i;
+       int                     j;
+       struct cfs_trace_cpu_data *tcd;
+
+       INIT_LIST_HEAD(&pc->pc_pages);
+
+       cfs_tcd_for_each(tcd, i, j) {
+               list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+               tcd->tcd_cur_pages = 0;
+
+               if (pc->pc_want_daemon_pages) {
+                       list_splice_init(&tcd->tcd_daemon_pages,
+                                            &pc->pc_pages);
+                       tcd->tcd_cur_daemon_pages = 0;
+               }
+       }
+}
+
+static void collect_pages_on_all_cpus(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int i, cpu;
+
+       spin_lock(&pc->pc_lock);
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
+                       tcd->tcd_cur_pages = 0;
+                       if (pc->pc_want_daemon_pages) {
+                               list_splice_init(&tcd->tcd_daemon_pages,
+                                                    &pc->pc_pages);
+                               tcd->tcd_cur_daemon_pages = 0;
+                       }
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void collect_pages(struct page_collection *pc)
+{
+       INIT_LIST_HEAD(&pc->pc_pages);
+
+       if (libcfs_panic_in_progress)
+               panic_collect_pages(pc);
+       else
+               collect_pages_on_all_cpus(pc);
+}
+
+static void put_pages_back_on_all_cpus(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       struct list_head *cur_head;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       int i, cpu;
+
+       spin_lock(&pc->pc_lock);
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       cur_head = tcd->tcd_pages.next;
+
+                       list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
+                                                linkage) {
+
+                               __LASSERT_TAGE_INVARIANT(tage);
+
+                               if (tage->cpu != cpu || tage->type != i)
+                                       continue;
+
+                               cfs_tage_to_tail(tage, cur_head);
+                               tcd->tcd_cur_pages++;
+                       }
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_back(struct page_collection *pc)
+{
+       if (!libcfs_panic_in_progress)
+               put_pages_back_on_all_cpus(pc);
+}
+
+/* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
+ * we have a good amount of data at all times for dumping during an LBUG, even
+ * if we have been steadily writing (and otherwise discarding) pages via the
+ * debug daemon. */
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                        struct cfs_trace_cpu_data *tcd)
+{
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock(&pc->pc_lock);
+       list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
+                       continue;
+
+               cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
+               tcd->tcd_cur_daemon_pages++;
+
+               if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
+                       struct cfs_trace_page *victim;
+
+                       __LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+                       victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
+
+                       __LASSERT_TAGE_INVARIANT(victim);
+
+                       list_del(&victim->linkage);
+                       cfs_tage_free(victim);
+                       tcd->tcd_cur_daemon_pages--;
+               }
+       }
+       spin_unlock(&pc->pc_lock);
+}
+
+static void put_pages_on_daemon_list(struct page_collection *pc)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int i, cpu;
+
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu)
+                       put_pages_on_tcd_daemon_list(pc, tcd);
+       }
+}
+
+void cfs_trace_debug_print(void)
+{
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock_init(&pc.pc_lock);
+
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+               char *p, *file, *fn;
+               struct page *page;
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               page = tage->page;
+               p = page_address(page);
+               while (p < ((char *)page_address(page) + tage->used)) {
+                       struct ptldebug_header *hdr;
+                       int len;
+                       hdr = (void *)p;
+                       p += sizeof(*hdr);
+                       file = p;
+                       p += strlen(file) + 1;
+                       fn = p;
+                       p += strlen(fn) + 1;
+                       len = hdr->ph_len - (int)(p - (char *)hdr);
+
+                       cfs_print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+                       p += len;
+               }
+
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+}
+
+int cfs_tracefile_dump_all_pages(char *filename)
+{
+       struct page_collection  pc;
+       struct file             *filp;
+       struct cfs_trace_page   *tage;
+       struct cfs_trace_page   *tmp;
+       int rc;
+
+       DECL_MMSPACE;
+
+       cfs_tracefile_write_lock();
+
+       filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
+       if (IS_ERR(filp)) {
+               rc = PTR_ERR(filp);
+               filp = NULL;
+               printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
+                     filename, rc);
+               goto out;
+       }
+
+       spin_lock_init(&pc.pc_lock);
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       if (list_empty(&pc.pc_pages)) {
+               rc = 0;
+               goto close;
+       }
+
+       /* OK, for now, just write the pages.  In the future we'll be building
+        * iobufs with the pages and calling generic_direct_IO */
+       MMSPACE_OPEN;
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               rc = filp_write(filp, page_address(tage->page),
+                               tage->used, filp_poff(filp));
+               if (rc != (int)tage->used) {
+                       printk(KERN_WARNING "wanted to write %u but wrote "
+                              "%d\n", tage->used, rc);
+                       put_pages_back(&pc);
+                       __LASSERT(list_empty(&pc.pc_pages));
+                       break;
+               }
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+       MMSPACE_CLOSE;
+       rc = filp_fsync(filp);
+       if (rc)
+               printk(KERN_ERR "sync returns %d\n", rc);
+close:
+       filp_close(filp, NULL);
+out:
+       cfs_tracefile_write_unlock();
+       return rc;
+}
+
+void cfs_trace_flush_pages(void)
+{
+       struct page_collection pc;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+
+       spin_lock_init(&pc.pc_lock);
+
+       pc.pc_want_daemon_pages = 1;
+       collect_pages(&pc);
+       list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
+
+               __LASSERT_TAGE_INVARIANT(tage);
+
+               list_del(&tage->linkage);
+               cfs_tage_free(tage);
+       }
+}
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                           const char *usr_buffer, int usr_buffer_nob)
+{
+       int    nob;
+
+       if (usr_buffer_nob > knl_buffer_nob)
+               return -EOVERFLOW;
+
+       if (copy_from_user((void *)knl_buffer,
+                          (void *)usr_buffer, usr_buffer_nob))
+               return -EFAULT;
+
+       nob = strnlen(knl_buffer, usr_buffer_nob);
+       while (--nob >= 0)                    /* strip trailing whitespace */
+               if (!isspace(knl_buffer[nob]))
+                       break;
+
+       if (nob < 0)                        /* empty string */
+               return -EINVAL;
+
+       if (nob == knl_buffer_nob - 1)        /* no space to terminate */
+               return -EOVERFLOW;
+
+       knl_buffer[nob + 1] = 0;                /* terminate */
+       return 0;
+}
+EXPORT_SYMBOL(cfs_trace_copyin_string);
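+
+/* Illustrative behaviour: copying the user string "/tmp/lustre-log  \n"
+ * into a sufficiently large kernel buffer strips the trailing
+ * whitespace and NUL-terminates, leaving "/tmp/lustre-log"; an
+ * all-whitespace user string yields -EINVAL.
+ */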
+
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                            const char *knl_buffer, char *append)
+{
+       /* NB if 'append' != NULL, it's a single character to append to the
+        * copied out string - usually "\n" for /proc entries and "" (i.e. a
+        * terminating zero byte) for sysctl entries */
+       int   nob = strlen(knl_buffer);
+
+       if (nob > usr_buffer_nob)
+               nob = usr_buffer_nob;
+
+       if (copy_to_user(usr_buffer, knl_buffer, nob))
+               return -EFAULT;
+
+       if (append != NULL && nob < usr_buffer_nob) {
+               if (copy_to_user(usr_buffer + nob, append, 1))
+                       return -EFAULT;
+
+               nob++;
+       }
+
+       return nob;
+}
+EXPORT_SYMBOL(cfs_trace_copyout_string);
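+
+/* Illustrative behaviour: with append = "\n", copying "on" into a
+ * 16-byte user buffer transfers "on\n" and returns 3; note that no NUL
+ * is appended unless the caller passes append = "".
+ */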
+
+int cfs_trace_allocate_string_buffer(char **str, int nob)
+{
+       if (nob > 2 * PAGE_CACHE_SIZE)      /* string must be "sensible" */
+               return -EINVAL;
+
+       *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO);
+       if (*str == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void cfs_trace_free_string_buffer(char *str, int nob)
+{
+       kfree(str);
+}
+
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob)
+{
+       char     *str;
+       int        rc;
+
+       rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+       if (rc != 0)
+               return rc;
+
+       rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+                                    usr_str, usr_str_nob);
+       if (rc != 0)
+               goto out;
+
+       if (str[0] != '/') {
+               rc = -EINVAL;
+               goto out;
+       }
+       rc = cfs_tracefile_dump_all_pages(str);
+out:
+       cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+       return rc;
+}
+
+int cfs_trace_daemon_command(char *str)
+{
+       int       rc = 0;
+
+       cfs_tracefile_write_lock();
+
+       if (strcmp(str, "stop") == 0) {
+               cfs_tracefile_write_unlock();
+               cfs_trace_stop_thread();
+               cfs_tracefile_write_lock();
+               memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
+
+       } else if (strncmp(str, "size=", 5) == 0) {
+               cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0);
+               if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480)
+                       cfs_tracefile_size = CFS_TRACEFILE_SIZE;
+               else
+                       cfs_tracefile_size <<= 20;
+
+       } else if (strlen(str) >= sizeof(cfs_tracefile)) {
+               rc = -ENAMETOOLONG;
+       } else if (str[0] != '/') {
+               rc = -EINVAL;
+       } else {
+               strcpy(cfs_tracefile, str);
+
+               printk(KERN_INFO
+                      "Lustre: debug daemon will attempt to start writing "
+                      "to %s (%lukB max)\n", cfs_tracefile,
+                      (long)(cfs_tracefile_size >> 10));
+
+               cfs_trace_start_thread();
+       }
+
+       cfs_tracefile_write_unlock();
+       return rc;
+}
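+
+/*
+ * Editor's note (not part of the original patch): the commands accepted
+ * above, as a user would issue them:
+ *
+ *   "stop"             stop the daemon and clear the trace file name
+ *   "size=400"         limit the trace file to 400 MB; values outside
+ *                      10..20480 fall back to CFS_TRACEFILE_SIZE
+ *   "/tmp/lustre.dmp"  an absolute path (example only) starts the
+ *                      daemon writing to that file
+ */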
+
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob)
+{
+       char *str;
+       int   rc;
+
+       rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1);
+       if (rc != 0)
+               return rc;
+
+       rc = cfs_trace_copyin_string(str, usr_str_nob + 1,
+                                usr_str, usr_str_nob);
+       if (rc == 0)
+               rc = cfs_trace_daemon_command(str);
+
+       cfs_trace_free_string_buffer(str, usr_str_nob + 1);
+       return rc;
+}
+
+int cfs_trace_set_debug_mb(int mb)
+{
+       int i;
+       int j;
+       int pages;
+       int limit = cfs_trace_max_debug_mb();
+       struct cfs_trace_cpu_data *tcd;
+
+       if (mb < num_possible_cpus()) {
+               printk(KERN_WARNING
+                      "Lustre: %d MB is too small for debug buffer size, "
+                      "setting it to %d MB.\n", mb, num_possible_cpus());
+               mb = num_possible_cpus();
+       }
+
+       if (mb > limit) {
+               printk(KERN_WARNING
+                      "Lustre: %d MB is too large for debug buffer size, "
+                      "setting it to %d MB.\n", mb, limit);
+               mb = limit;
+       }
+
+       mb /= num_possible_cpus();
+       pages = mb << (20 - PAGE_CACHE_SHIFT);
+
+       cfs_tracefile_write_lock();
+
+       cfs_tcd_for_each(tcd, i, j)
+               tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
+
+       cfs_tracefile_write_unlock();
+
+       return 0;
+}
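+
+/*
+ * Worked example (editor's note, not part of the original patch):
+ * with 4 possible CPUs, 4 KiB pages (PAGE_CACHE_SHIFT == 12) and
+ * mb == 40, each CPU gets 40 / 4 = 10 MB, i.e.
+ * pages = 10 << (20 - 12) = 2560 pages, which are then divided among
+ * the TCD types in proportion to tcd_pages_factor (percent).
+ */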
+
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob)
+{
+       char     str[32];
+       int      rc;
+
+       rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob);
+       if (rc < 0)
+               return rc;
+
+       return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0));
+}
+
+int cfs_trace_get_debug_mb(void)
+{
+       int i;
+       int j;
+       struct cfs_trace_cpu_data *tcd;
+       int total_pages = 0;
+
+       cfs_tracefile_read_lock();
+
+       cfs_tcd_for_each(tcd, i, j)
+               total_pages += tcd->tcd_max_pages;
+
+       cfs_tracefile_read_unlock();
+
+       return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+}
+
+static int tracefiled(void *arg)
+{
+       struct page_collection pc;
+       struct tracefiled_ctl *tctl = arg;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       struct file *filp;
+       int last_loop = 0;
+       int rc;
+
+       DECL_MMSPACE;
+
+       /* we're started late enough that we pick up init's fs context */
+       /* this is so broken in uml?  what on earth is going on? */
+
+       spin_lock_init(&pc.pc_lock);
+       complete(&tctl->tctl_start);
+
+       while (1) {
+               wait_queue_t __wait;
+
+               pc.pc_want_daemon_pages = 0;
+               collect_pages(&pc);
+               if (list_empty(&pc.pc_pages))
+                       goto end_loop;
+
+               filp = NULL;
+               cfs_tracefile_read_lock();
+               if (cfs_tracefile[0] != 0) {
+                       filp = filp_open(cfs_tracefile,
+                                        O_CREAT | O_RDWR | O_LARGEFILE,
+                                        0600);
+                       if (IS_ERR(filp)) {
+                               rc = PTR_ERR(filp);
+                               filp = NULL;
+                               printk(KERN_WARNING "couldn't open %s: "
+                                      "%d\n", cfs_tracefile, rc);
+                       }
+               }
+               cfs_tracefile_read_unlock();
+               if (filp == NULL) {
+                       put_pages_on_daemon_list(&pc);
+                       __LASSERT(list_empty(&pc.pc_pages));
+                       goto end_loop;
+               }
+
+               MMSPACE_OPEN;
+
+               list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+                                                  linkage) {
+                       static loff_t f_pos;
+
+                       __LASSERT_TAGE_INVARIANT(tage);
+
+                       if (f_pos >= (off_t)cfs_tracefile_size)
+                               f_pos = 0;
+                       else if (f_pos > (off_t)filp_size(filp))
+                               f_pos = filp_size(filp);
+
+                       rc = filp_write(filp, page_address(tage->page),
+                                       tage->used, &f_pos);
+                       if (rc != (int)tage->used) {
+                               printk(KERN_WARNING "wanted to write %u "
+                                      "but wrote %d\n", tage->used, rc);
+                               put_pages_back(&pc);
+                               __LASSERT(list_empty(&pc.pc_pages));
+                       }
+               }
+               MMSPACE_CLOSE;
+
+               filp_close(filp, NULL);
+               put_pages_on_daemon_list(&pc);
+               if (!list_empty(&pc.pc_pages)) {
+                       int i;
+
+                       printk(KERN_ALERT "Lustre: trace pages aren't "
+                              " empty\n");
+                       printk(KERN_ERR "total cpus(%d): ",
+                              num_possible_cpus());
+                       for (i = 0; i < num_possible_cpus(); i++)
+                               if (cpu_online(i))
+                                       printk(KERN_ERR "%d(on) ", i);
+                               else
+                                       printk(KERN_ERR "%d(off) ", i);
+                       printk(KERN_ERR "\n");
+
+                       i = 0;
+                       list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
+                                                    linkage)
+                               printk(KERN_ERR "page %d belongs to cpu "
+                                      "%d\n", ++i, tage->cpu);
+                       printk(KERN_ERR "There are %d pages unwritten\n",
+                              i);
+               }
+               __LASSERT(list_empty(&pc.pc_pages));
+end_loop:
+               if (atomic_read(&tctl->tctl_shutdown)) {
+                       if (last_loop == 0) {
+                               last_loop = 1;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+               init_waitqueue_entry_current(&__wait);
+               add_wait_queue(&tctl->tctl_waitq, &__wait);
+               set_current_state(TASK_INTERRUPTIBLE);
+               waitq_timedwait(&__wait, TASK_INTERRUPTIBLE,
+                                   cfs_time_seconds(1));
+               remove_wait_queue(&tctl->tctl_waitq, &__wait);
+       }
+       complete(&tctl->tctl_stop);
+       return 0;
+}
+
+int cfs_trace_start_thread(void)
+{
+       struct tracefiled_ctl *tctl = &trace_tctl;
+       int rc = 0;
+
+       mutex_lock(&cfs_trace_thread_mutex);
+       if (thread_running)
+               goto out;
+
+       init_completion(&tctl->tctl_start);
+       init_completion(&tctl->tctl_stop);
+       init_waitqueue_head(&tctl->tctl_waitq);
+       atomic_set(&tctl->tctl_shutdown, 0);
+
+       if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) {
+               rc = -ECHILD;
+               goto out;
+       }
+
+       wait_for_completion(&tctl->tctl_start);
+       thread_running = 1;
+out:
+       mutex_unlock(&cfs_trace_thread_mutex);
+       return rc;
+}
+
+void cfs_trace_stop_thread(void)
+{
+       struct tracefiled_ctl *tctl = &trace_tctl;
+
+       mutex_lock(&cfs_trace_thread_mutex);
+       if (thread_running) {
+               printk(KERN_INFO
+                      "Lustre: shutting down debug daemon thread...\n");
+               atomic_set(&tctl->tctl_shutdown, 1);
+               wait_for_completion(&tctl->tctl_stop);
+               thread_running = 0;
+       }
+       mutex_unlock(&cfs_trace_thread_mutex);
+}
+
+int cfs_tracefile_init(int max_pages)
+{
+       struct cfs_trace_cpu_data *tcd;
+       int                 i;
+       int                 j;
+       int                 rc;
+       int                 factor;
+
+       rc = cfs_tracefile_init_arch();
+       if (rc != 0)
+               return rc;
+
+       cfs_tcd_for_each(tcd, i, j) {
+               /* tcd_pages_factor is initialized in cfs_tracefile_init_arch(). */
+               factor = tcd->tcd_pages_factor;
+               INIT_LIST_HEAD(&tcd->tcd_pages);
+               INIT_LIST_HEAD(&tcd->tcd_stock_pages);
+               INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+               tcd->tcd_cur_pages = 0;
+               tcd->tcd_cur_stock_pages = 0;
+               tcd->tcd_cur_daemon_pages = 0;
+               tcd->tcd_max_pages = (max_pages * factor) / 100;
+               LASSERT(tcd->tcd_max_pages > 0);
+               tcd->tcd_shutting_down = 0;
+       }
+
+       return 0;
+}
+
+static void trace_cleanup_on_all_cpus(void)
+{
+       struct cfs_trace_cpu_data *tcd;
+       struct cfs_trace_page *tage;
+       struct cfs_trace_page *tmp;
+       int i, cpu;
+
+       cfs_for_each_possible_cpu(cpu) {
+               cfs_tcd_for_each_type_lock(tcd, i, cpu) {
+                       tcd->tcd_shutting_down = 1;
+
+                       list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages,
+                                                          linkage) {
+                               __LASSERT_TAGE_INVARIANT(tage);
+
+                               list_del(&tage->linkage);
+                               cfs_tage_free(tage);
+                       }
+
+                       tcd->tcd_cur_pages = 0;
+               }
+       }
+}
+
+static void cfs_trace_cleanup(void)
+{
+       struct page_collection pc;
+
+       INIT_LIST_HEAD(&pc.pc_pages);
+       spin_lock_init(&pc.pc_lock);
+
+       trace_cleanup_on_all_cpus();
+
+       cfs_tracefile_fini_arch();
+}
+
+void cfs_tracefile_exit(void)
+{
+       cfs_trace_stop_thread();
+       cfs_trace_cleanup();
+}
diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h
new file mode 100644 (file)
index 0000000..7e8d17c
--- /dev/null
@@ -0,0 +1,340 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LIBCFS_TRACEFILE_H__
+#define __LIBCFS_TRACEFILE_H__
+
+#include <linux/libcfs/libcfs.h>
+
+#include "linux/linux-tracefile.h"
+
+/* trace file lock routines */
+
+#define TRACEFILE_NAME_SIZE 1024
+extern char      cfs_tracefile[TRACEFILE_NAME_SIZE];
+extern long long cfs_tracefile_size;
+
+extern void libcfs_run_debug_log_upcall(char *file);
+
+int  cfs_tracefile_init_arch(void);
+void cfs_tracefile_fini_arch(void);
+
+void cfs_tracefile_read_lock(void);
+void cfs_tracefile_read_unlock(void);
+void cfs_tracefile_write_lock(void);
+void cfs_tracefile_write_unlock(void);
+
+int cfs_tracefile_dump_all_pages(char *filename);
+void cfs_trace_debug_print(void);
+void cfs_trace_flush_pages(void);
+int cfs_trace_start_thread(void);
+void cfs_trace_stop_thread(void);
+int cfs_tracefile_init(int max_pages);
+void cfs_tracefile_exit(void);
+
+
+int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob,
+                           const char *usr_buffer, int usr_buffer_nob);
+int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob,
+                            const char *knl_str, char *append);
+int cfs_trace_allocate_string_buffer(char **str, int nob);
+void cfs_trace_free_string_buffer(char *str, int nob);
+int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_daemon_command(char *str);
+int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_set_debug_mb(int mb);
+int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob);
+int cfs_trace_get_debug_mb(void);
+
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+extern int  cfs_trace_max_debug_mb(void);
+
+#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
+#define CFS_TRACEFILE_SIZE (500 << 20)
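+
+/*
+ * Editor's note (not part of the original patch): with 4 KiB pages
+ * (PAGE_CACHE_SHIFT == 12), TCD_MAX_PAGES is 5 << 8 == 1280 pages,
+ * i.e. 5 MB of trace pages per CPU buffer (before the
+ * tcd_pages_factor split), and CFS_TRACEFILE_SIZE defaults to
+ * 500 << 20 == 500 MB.
+ */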
+
+#ifdef LUSTRE_TRACEFILE_PRIVATE
+
+/*
+ * Private declarations for tracefile
+ */
+
+/* Size of a buffer for sprintf'ing console messages if we can't get a page
+ * from the system */
+#define CFS_TRACE_CONSOLE_BUFFER_SIZE   1024
+
+union cfs_trace_data_union {
+       struct cfs_trace_cpu_data {
+               /*
+                * Even though this structure is meant to be per-CPU, locking
+                * is needed because in some places the data may be accessed
+                * from other CPUs. This lock is directly used in trace_get_tcd
+                * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and
+                * tcd_for_each_type_lock
+                */
+               spinlock_t              tcd_lock;
+               unsigned long      tcd_lock_flags;
+
+               /*
+                * pages with trace records not yet processed by tracefiled.
+                */
+               struct list_head              tcd_pages;
+               /* number of pages on ->tcd_pages */
+               unsigned long      tcd_cur_pages;
+
+               /*
+                * pages with trace records already processed by
+                * tracefiled. These pages are kept in memory, so that some
+                * portion of log can be written in the event of LBUG. This
+                * list is maintained in LRU order.
+                *
+                * Pages are moved to ->tcd_daemon_pages by tracefiled()
+                * (put_pages_on_daemon_list()). LRU pages from this list are
+                * discarded when list grows too large.
+                */
+               struct list_head              tcd_daemon_pages;
+               /* number of pages on ->tcd_daemon_pages */
+               unsigned long      tcd_cur_daemon_pages;
+
+               /*
+                * Maximal number of pages allowed on ->tcd_pages and
+                * ->tcd_daemon_pages each.
+                * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
+                * implementation.
+                */
+               unsigned long      tcd_max_pages;
+
+               /*
+                * preallocated pages to write trace records into. Pages from
+                * ->tcd_stock_pages are moved to ->tcd_pages by
+                * portals_debug_msg().
+                *
+                * This list is necessary, because on some platforms it's
+                * impossible to perform efficient atomic page allocation in a
+                * non-blockable context.
+                *
+                * Such platforms fill ->tcd_stock_pages "on occasion", when
+                * tracing code is entered in blockable context.
+                *
+                * trace_get_tage_try() tries to get a page from
+                * ->tcd_stock_pages first and resorts to atomic page
+                * allocation only if this queue is empty. ->tcd_stock_pages
+                * is replenished when tracing code is entered in blocking
+                * context (darwin-tracefile.c:trace_get_tcd()). We try to
+                * maintain TCD_STOCK_PAGES (40 by default) pages in this
+                * queue. Atomic allocation is only required if more than
+                * TCD_STOCK_PAGES pagesful are consumed by trace records all
+                * emitted in non-blocking contexts. Which is quite unlikely.
+                */
+               struct list_head              tcd_stock_pages;
+               /* number of pages on ->tcd_stock_pages */
+               unsigned long      tcd_cur_stock_pages;
+
+               unsigned short    tcd_shutting_down;
+               unsigned short    tcd_cpu;
+               unsigned short    tcd_type;
+               /* The factors to share debug memory. */
+               unsigned short    tcd_pages_factor;
+       } tcd;
+       char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))];
+};
+
+#define TCD_MAX_TYPES      8
+extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS];
+
+#define cfs_tcd_for_each(tcd, i, j)                                   \
+    for (i = 0; cfs_trace_data[i] != NULL; i++)                           \
+       for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);            \
+            j < num_possible_cpus();                            \
+            j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)
+
+#define cfs_tcd_for_each_type_lock(tcd, i, cpu)                           \
+    for (i = 0; cfs_trace_data[i] &&                                 \
+        (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&                      \
+        cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct page_collection {
+       struct list_head        pc_pages;
+       /*
+        * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+        * call-back functions. XXX nikita: Which is horrible: all processors
+        * receive NMI at the same time only to be serialized by this
+        * lock. Probably ->pc_pages should be replaced with an array of
+        * NR_CPUS elements accessed locklessly.
+        */
+       spinlock_t      pc_lock;
+       /*
+        * if this flag is set, collect_pages() will spill both
+        * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+        * only ->tcd_pages are spilled.
+        */
+       int             pc_want_daemon_pages;
+};
+
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct tracefiled_ctl {
+       struct completion       tctl_start;
+       struct completion       tctl_stop;
+       wait_queue_head_t               tctl_waitq;
+       pid_t                   tctl_pid;
+       atomic_t                tctl_shutdown;
+};
+
+/*
+ * small data-structure for each page owned by tracefiled.
+ */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
+struct cfs_trace_page {
+       /*
+        * page itself
+        */
+       struct page       *page;
+       /*
+        * linkage into one of the lists in trace_data_union or
+        * page_collection
+        */
+       struct list_head           linkage;
+       /*
+        * number of bytes used within this page
+        */
+       unsigned int     used;
+       /*
+        * cpu that owns this page
+        */
+       unsigned short       cpu;
+       /*
+        * type(context) of this page
+        */
+       unsigned short       type;
+};
+
+extern void cfs_set_ptldebug_header(struct ptldebug_header *header,
+                                   struct libcfs_debug_msg_data *m,
+                                   unsigned long stack);
+extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
+                                const char *buf, int len, const char *file,
+                                const char *fn);
+
+extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking);
+
+/**
+ * cfs_trace_buf_type_t, cfs_trace_buf_idx_get() and cfs_trace_console_buffers[][]
+ * are not public libcfs API; they should be defined in
+ * platform-specific tracefile include files
+ * (see, for example, linux-tracefile.h).
+ */
+
+extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX];
+extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void);
+
+static inline char *
+cfs_trace_get_console_buffer(void)
+{
+       unsigned int i = get_cpu();
+       unsigned int j = cfs_trace_buf_idx_get();
+
+       return cfs_trace_console_buffers[i][j];
+}
+
+static inline void
+cfs_trace_put_console_buffer(char *buffer)
+{
+       put_cpu();
+}
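+
+/*
+ * Usage sketch (editor's illustration, not part of the original patch;
+ * example_console_print() is hypothetical): get_cpu() in
+ * cfs_trace_get_console_buffer() disables preemption, so the buffer
+ * must be released promptly via cfs_trace_put_console_buffer().
+ */
+#if 0
+static void example_console_print(const char *msg)
+{
+       char *buf = cfs_trace_get_console_buffer();
+
+       snprintf(buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, "Lustre: %s\n", msg);
+       printk(KERN_INFO "%s", buf);
+       cfs_trace_put_console_buffer(buf);
+}
+#endif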
+
+static inline struct cfs_trace_cpu_data *
+cfs_trace_get_tcd(void)
+{
+       struct cfs_trace_cpu_data *tcd =
+               &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;
+
+       cfs_trace_lock_tcd(tcd, 0);
+
+       return tcd;
+}
+
+static inline void
+cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd)
+{
+       cfs_trace_unlock_tcd(tcd, 0);
+
+       put_cpu();
+}
+
+int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp,
+                          struct list_head *stock);
+
+
+int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd,
+                     struct cfs_trace_page *tage);
+
+extern void cfs_trace_assertion_failed(const char *str,
+                                      struct libcfs_debug_msg_data *m);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond)                                                 \
+do {                                                               \
+       if (unlikely(!(cond))) {                                        \
+               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL);     \
+               cfs_trace_assertion_failed("ASSERTION("#cond") failed", \
+                                          &msgdata);              \
+       }                                                              \
+} while (0)
+
+#define __LASSERT_TAGE_INVARIANT(tage)                           \
+do {                                                               \
+       __LASSERT(tage != NULL);                                        \
+       __LASSERT(tage->page != NULL);                            \
+       __LASSERT(tage->used <= PAGE_CACHE_SIZE);                        \
+       __LASSERT(page_count(tage->page) > 0);                \
+} while (0)
+
+#endif /* LUSTRE_TRACEFILE_PRIVATE */
+
+#endif /* __LIBCFS_TRACEFILE_H__ */
diff --git a/drivers/staging/lustre/lustre/libcfs/upcall_cache.c b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c
new file mode 100644 (file)
index 0000000..18c68c3
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/upcall_cache.c
+ *
+ * Supplementary groups cache.
+ */
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/lucache.h>
+
+static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache,
+                                             __u64 key, void *args)
+{
+       struct upcall_cache_entry *entry;
+
+       LIBCFS_ALLOC(entry, sizeof(*entry));
+       if (!entry)
+               return NULL;
+
+       UC_CACHE_SET_NEW(entry);
+       INIT_LIST_HEAD(&entry->ue_hash);
+       entry->ue_key = key;
+       atomic_set(&entry->ue_refcount, 0);
+       init_waitqueue_head(&entry->ue_waitq);
+       if (cache->uc_ops->init_entry)
+               cache->uc_ops->init_entry(entry, args);
+       return entry;
+}
+
+/* protected by cache lock */
+static void free_entry(struct upcall_cache *cache,
+                      struct upcall_cache_entry *entry)
+{
+       if (cache->uc_ops->free_entry)
+               cache->uc_ops->free_entry(cache, entry);
+
+       list_del(&entry->ue_hash);
+       CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n",
+              entry, entry->ue_key);
+       LIBCFS_FREE(entry, sizeof(*entry));
+}
+
+static inline int upcall_compare(struct upcall_cache *cache,
+                                struct upcall_cache_entry *entry,
+                                __u64 key, void *args)
+{
+       if (entry->ue_key != key)
+               return -1;
+
+       if (cache->uc_ops->upcall_compare)
+               return cache->uc_ops->upcall_compare(cache, entry, key, args);
+
+       return 0;
+}
+
+static inline int downcall_compare(struct upcall_cache *cache,
+                                  struct upcall_cache_entry *entry,
+                                  __u64 key, void *args)
+{
+       if (entry->ue_key != key)
+               return -1;
+
+       if (cache->uc_ops->downcall_compare)
+               return cache->uc_ops->downcall_compare(cache, entry, key, args);
+
+       return 0;
+}
+
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+       atomic_inc(&entry->ue_refcount);
+}
+
+static inline void put_entry(struct upcall_cache *cache,
+                            struct upcall_cache_entry *entry)
+{
+       if (atomic_dec_and_test(&entry->ue_refcount) &&
+           (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) {
+               free_entry(cache, entry);
+       }
+}
+
+static int check_unlink_entry(struct upcall_cache *cache,
+                             struct upcall_cache_entry *entry)
+{
+       if (UC_CACHE_IS_VALID(entry) &&
+           cfs_time_before(cfs_time_current(), entry->ue_expire))
+               return 0;
+
+       if (UC_CACHE_IS_ACQUIRING(entry)) {
+               if (entry->ue_acquire_expire == 0 ||
+                   cfs_time_before(cfs_time_current(),
+                                   entry->ue_acquire_expire))
+                       return 0;
+
+               UC_CACHE_SET_EXPIRED(entry);
+               wake_up_all(&entry->ue_waitq);
+       } else if (!UC_CACHE_IS_INVALID(entry)) {
+               UC_CACHE_SET_EXPIRED(entry);
+       }
+
+       list_del_init(&entry->ue_hash);
+       if (!atomic_read(&entry->ue_refcount))
+               free_entry(cache, entry);
+       return 1;
+}
+
+static inline int refresh_entry(struct upcall_cache *cache,
+                        struct upcall_cache_entry *entry)
+{
+       LASSERT(cache->uc_ops->do_upcall);
+       return cache->uc_ops->do_upcall(cache, entry);
+}
+
+struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache,
+                                                 __u64 key, void *args)
+{
+       struct upcall_cache_entry *entry = NULL, *new = NULL, *next;
+       struct list_head *head;
+       wait_queue_t wait;
+       int rc, found;
+       ENTRY;
+
+       LASSERT(cache);
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+find_again:
+       found = 0;
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry_safe(entry, next, head, ue_hash) {
+               /* check invalid & expired items */
+               if (check_unlink_entry(cache, entry))
+                       continue;
+               if (upcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               if (!new) {
+                       spin_unlock(&cache->uc_lock);
+                       new = alloc_entry(cache, key, args);
+                       if (!new) {
+                               CERROR("fail to alloc entry\n");
+                               RETURN(ERR_PTR(-ENOMEM));
+                       }
+                       goto find_again;
+               } else {
+                       list_add(&new->ue_hash, head);
+                       entry = new;
+               }
+       } else {
+               if (new) {
+                       free_entry(cache, new);
+                       new = NULL;
+               }
+               list_move(&entry->ue_hash, head);
+       }
+       get_entry(entry);
+
+       /* acquire for new one */
+       if (UC_CACHE_IS_NEW(entry)) {
+               UC_CACHE_SET_ACQUIRING(entry);
+               UC_CACHE_CLEAR_NEW(entry);
+               spin_unlock(&cache->uc_lock);
+               rc = refresh_entry(cache, entry);
+               spin_lock(&cache->uc_lock);
+               entry->ue_acquire_expire =
+                       cfs_time_shift(cache->uc_acquire_expire);
+               if (rc < 0) {
+                       UC_CACHE_CLEAR_ACQUIRING(entry);
+                       UC_CACHE_SET_INVALID(entry);
+                       wake_up_all(&entry->ue_waitq);
+                       if (unlikely(rc == -EREMCHG)) {
+                               put_entry(cache, entry);
+                               GOTO(out, entry = ERR_PTR(rc));
+                       }
+               }
+       }
+       /* someone (and only one) is doing an upcall on this item;
+        * wait for it to complete */
+       if (UC_CACHE_IS_ACQUIRING(entry)) {
+               long expiry = (entry == new) ?
+                             cfs_time_seconds(cache->uc_acquire_expire) :
+                             MAX_SCHEDULE_TIMEOUT;
+               long left;
+
+               init_waitqueue_entry_current(&wait);
+               add_wait_queue(&entry->ue_waitq, &wait);
+               set_current_state(TASK_INTERRUPTIBLE);
+               spin_unlock(&cache->uc_lock);
+
+               left = waitq_timedwait(&wait, TASK_INTERRUPTIBLE,
+                                          expiry);
+
+               spin_lock(&cache->uc_lock);
+               remove_wait_queue(&entry->ue_waitq, &wait);
+               if (UC_CACHE_IS_ACQUIRING(entry)) {
+                       /* we're interrupted or upcall failed in the middle */
+                       rc = left > 0 ? -EINTR : -ETIMEDOUT;
+                       CERROR("acquire for key "LPU64": error %d\n",
+                              entry->ue_key, rc);
+                       put_entry(cache, entry);
+                       GOTO(out, entry = ERR_PTR(rc));
+               }
+       }
+
+       /* invalid means error, don't need to try again */
+       if (UC_CACHE_IS_INVALID(entry)) {
+               put_entry(cache, entry);
+               GOTO(out, entry = ERR_PTR(-EIDRM));
+       }
+
+       /* check for expiry.
+        * We can't refresh the existing entry because its
+        * memory might be shared by multiple processes.
+        */
+       if (check_unlink_entry(cache, entry)) {
+               /* If expired, try again. But if this entry was
+                * created by this lookup and expired immediately
+                * without any error, give it at least one
+                * chance to be used.
+                */
+               if (entry != new) {
+                       put_entry(cache, entry);
+                       spin_unlock(&cache->uc_lock);
+                       new = NULL;
+                       goto find_again;
+               }
+       }
+
+       /* Now we know it's good */
+out:
+       spin_unlock(&cache->uc_lock);
+       RETURN(entry);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+void upcall_cache_put_entry(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry)
+{
+       ENTRY;
+
+       if (!entry) {
+               EXIT;
+               return;
+       }
+
+       LASSERT(atomic_read(&entry->ue_refcount) > 0);
+       spin_lock(&cache->uc_lock);
+       put_entry(cache, entry);
+       spin_unlock(&cache->uc_lock);
+       EXIT;
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
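+
+/*
+ * Usage sketch (editor's illustration, not part of the original patch;
+ * example_lookup() is hypothetical): every entry returned by
+ * upcall_cache_get_entry() must be released with
+ * upcall_cache_put_entry(); the get may sleep while the user-space
+ * upcall completes.
+ */
+#if 0
+static int example_lookup(struct upcall_cache *cache, __u64 key, void *args)
+{
+       struct upcall_cache_entry *entry;
+
+       entry = upcall_cache_get_entry(cache, key, args);
+       if (IS_ERR(entry))
+               return PTR_ERR(entry);
+
+       /* ... use the data filled in by ops->parse_downcall() ... */
+
+       upcall_cache_put_entry(cache, entry);
+       return 0;
+}
+#endif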
+
+int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key,
+                         void *args)
+{
+       struct upcall_cache_entry *entry = NULL;
+       struct list_head *head;
+       int found = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(cache);
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry(entry, head, ue_hash) {
+               if (downcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       get_entry(entry);
+                       break;
+               }
+       }
+
+       if (!found) {
+               CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n",
+                      cache->uc_name, key);
+               /* not found; that's possible */
+               spin_unlock(&cache->uc_lock);
+               RETURN(-EINVAL);
+       }
+
+       if (err) {
+               CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n",
+                      cache->uc_name, entry->ue_key, err);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (!UC_CACHE_IS_ACQUIRING(entry)) {
+               CDEBUG(D_RPCTRACE,"%s: found uptodate entry %p (key "LPU64")\n",
+                      cache->uc_name, entry, entry->ue_key);
+               GOTO(out, rc = 0);
+       }
+
+       if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) {
+               CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n",
+                      cache->uc_name, entry, entry->ue_key);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       spin_unlock(&cache->uc_lock);
+       if (cache->uc_ops->parse_downcall)
+               rc = cache->uc_ops->parse_downcall(cache, entry, args);
+       spin_lock(&cache->uc_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       entry->ue_expire = cfs_time_shift(cache->uc_entry_expire);
+       UC_CACHE_SET_VALID(entry);
+       CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n",
+              cache->uc_name, entry, entry->ue_key);
+out:
+       if (rc) {
+               UC_CACHE_SET_INVALID(entry);
+               list_del_init(&entry->ue_hash);
+       }
+       UC_CACHE_CLEAR_ACQUIRING(entry);
+       spin_unlock(&cache->uc_lock);
+       wake_up_all(&entry->ue_waitq);
+       put_entry(cache, entry);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+static void cache_flush(struct upcall_cache *cache, int force)
+{
+       struct upcall_cache_entry *entry, *next;
+       int i;
+       ENTRY;
+
+       spin_lock(&cache->uc_lock);
+       for (i = 0; i < UC_CACHE_HASH_SIZE; i++) {
+               list_for_each_entry_safe(entry, next,
+                                        &cache->uc_hashtable[i], ue_hash) {
+                       if (!force && atomic_read(&entry->ue_refcount)) {
+                               UC_CACHE_SET_EXPIRED(entry);
+                               continue;
+                       }
+                       LASSERT(!atomic_read(&entry->ue_refcount));
+                       free_entry(cache, entry);
+               }
+       }
+       spin_unlock(&cache->uc_lock);
+       EXIT;
+}
+
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+       cache_flush(cache, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+       cache_flush(cache, 1);
+}
+EXPORT_SYMBOL(upcall_cache_flush_all);
+
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args)
+{
+       struct list_head *head;
+       struct upcall_cache_entry *entry;
+       int found = 0;
+       ENTRY;
+
+       head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)];
+
+       spin_lock(&cache->uc_lock);
+       list_for_each_entry(entry, head, ue_hash) {
+               if (upcall_compare(cache, entry, key, args) == 0) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               CWARN("%s: flush entry %p: key "LPU64", ref %d, fl %x, "
+                     "cur %lu, ex %ld/%ld\n",
+                     cache->uc_name, entry, entry->ue_key,
+                     atomic_read(&entry->ue_refcount), entry->ue_flags,
+                     cfs_time_current_sec(), entry->ue_acquire_expire,
+                     entry->ue_expire);
+               UC_CACHE_SET_EXPIRED(entry);
+               if (!atomic_read(&entry->ue_refcount))
+                       free_entry(cache, entry);
+       }
+       spin_unlock(&cache->uc_lock);
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+struct upcall_cache *upcall_cache_init(const char *name, const char *upcall,
+                                      struct upcall_cache_ops *ops)
+{
+       struct upcall_cache *cache;
+       int i;
+       ENTRY;
+
+       LIBCFS_ALLOC(cache, sizeof(*cache));
+       if (!cache)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       spin_lock_init(&cache->uc_lock);
+       rwlock_init(&cache->uc_upcall_rwlock);
+       for (i = 0; i < UC_CACHE_HASH_SIZE; i++)
+               INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+       strncpy(cache->uc_name, name, sizeof(cache->uc_name) - 1);
+       /* upcall pathname proc tunable */
+       strncpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall) - 1);
+       cache->uc_entry_expire = 20 * 60;
+       cache->uc_acquire_expire = 30;
+       cache->uc_ops = ops;
+
+       RETURN(cache);
+}
+EXPORT_SYMBOL(upcall_cache_init);
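+
+/*
+ * Usage sketch (editor's illustration, not part of the original patch;
+ * the example_* names are hypothetical and assume prototypes matching
+ * struct upcall_cache_ops): do_upcall typically launches the helper
+ * named by 'upcall', and parse_downcall consumes the answer that
+ * arrives later through upcall_cache_downcall().
+ */
+#if 0
+static struct upcall_cache_ops example_ops = {
+       .init_entry     = example_init_entry,
+       .free_entry     = example_free_entry,
+       .do_upcall      = example_do_upcall,
+       .parse_downcall = example_parse_downcall,
+};
+
+static struct upcall_cache *example_cache_setup(void)
+{
+       return upcall_cache_init("example", "/usr/sbin/example_upcall",
+                                &example_ops);
+}
+#endif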
+
+void upcall_cache_cleanup(struct upcall_cache *cache)
+{
+       if (!cache)
+               return;
+       upcall_cache_flush_all(cache);
+       LIBCFS_FREE(cache, sizeof(*cache));
+}
+EXPORT_SYMBOL(upcall_cache_cleanup);
diff --git a/drivers/staging/lustre/lustre/libcfs/watchdog.c b/drivers/staging/lustre/lustre/libcfs/watchdog.c
new file mode 100644 (file)
index 0000000..7c385ad
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/watchdog.c
+ *
+ * Author: Jacob Berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+#include "tracefile.h"
+
+struct lc_watchdog {
+       spinlock_t  lcw_lock;     /* check or change lcw_list */
+       int          lcw_refcount; /* must hold lcw_pending_timers_lock */
+       timer_list_t     lcw_timer;    /* kernel timer */
+       struct list_head      lcw_list;     /* chain on pending list */
+       cfs_time_t      lcw_last_touched; /* last touched stamp */
+       task_t     *lcw_task;     /* owner task */
+       void      (*lcw_callback)(pid_t, void *);
+       void       *lcw_data;
+
+       pid_t      lcw_pid;
+
+       enum {
+               LC_WATCHDOG_DISABLED,
+               LC_WATCHDOG_ENABLED,
+               LC_WATCHDOG_EXPIRED
+       } lcw_state;
+};
+
+#ifdef WITH_WATCHDOG
+/*
+ * The dispatcher will complete lcw_start_completion when it starts,
+ * and lcw_stop_completion when it exits.
+ * Wake lcw_event_waitq to signal timer callback dispatches.
+ */
+static struct completion lcw_start_completion;
+static struct completion  lcw_stop_completion;
+static wait_queue_head_t lcw_event_waitq;
+
+/*
+ * Set this and wake lcw_event_waitq to stop the dispatcher.
+ */
+enum {
+       LCW_FLAG_STOP = 0
+};
+static unsigned long lcw_flags = 0;
+
+/*
+ * Number of outstanding watchdogs.
+ * When it hits 1, we start the dispatcher.
+ * When it hits 0, we stop the dispatcher.
+ */
+static __u32    lcw_refcount = 0;
+static DEFINE_MUTEX(lcw_refcount_mutex);
+
+/*
+ * List of timers that have fired that need their callbacks run by the
+ * dispatcher.
+ */
+/* BH lock! */
+static DEFINE_SPINLOCK(lcw_pending_timers_lock);
+static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers);
+
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+
+static void
+lcw_dump(struct lc_watchdog *lcw)
+{
+       ENTRY;
+       rcu_read_lock();
+       if (lcw->lcw_task == NULL) {
+               LCONSOLE_WARN("Process " LPPID " was not found in the task "
+                             "list; watchdog callback may be incomplete\n",
+                             (int)lcw->lcw_pid);
+       } else {
+               libcfs_debug_dumpstack(lcw->lcw_task);
+       }
+
+       rcu_read_unlock();
+       EXIT;
+}
+
+static void lcw_cb(ulong_ptr_t data)
+{
+       struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+       ENTRY;
+
+       if (lcw->lcw_state != LC_WATCHDOG_ENABLED) {
+               EXIT;
+               return;
+       }
+
+       lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+
+       spin_lock_bh(&lcw->lcw_lock);
+       LASSERT(list_empty(&lcw->lcw_list));
+
+       spin_lock_bh(&lcw_pending_timers_lock);
+       lcw->lcw_refcount++; /* +1 for pending list */
+       list_add(&lcw->lcw_list, &lcw_pending_timers);
+       wake_up(&lcw_event_waitq);
+
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       spin_unlock_bh(&lcw->lcw_lock);
+       EXIT;
+}
+
+static int is_watchdog_fired(void)
+{
+       int rc;
+
+       if (test_bit(LCW_FLAG_STOP, &lcw_flags))
+               return 1;
+
+       spin_lock_bh(&lcw_pending_timers_lock);
+       rc = !list_empty(&lcw_pending_timers);
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       return rc;
+}
+
+static void lcw_dump_stack(struct lc_watchdog *lcw)
+{
+       cfs_time_t      current_time;
+       cfs_duration_t  delta_time;
+       struct timeval  timediff;
+
+       current_time = cfs_time_current();
+       delta_time = cfs_time_sub(current_time, lcw->lcw_last_touched);
+       cfs_duration_usec(delta_time, &timediff);
+
+       /*
+        * Check to see if we should throttle the watchdog timer to avoid
+        * too many dumps going to the console thus triggering an NMI.
+        */
+       delta_time = cfs_duration_sec(cfs_time_sub(current_time,
+                                                  lcw_last_watchdog_time));
+
+       if (delta_time < libcfs_watchdog_ratelimit &&
+           lcw_recent_watchdog_count > 3) {
+               LCONSOLE_WARN("Service thread pid %u was inactive for "
+                             "%lu.%.02lus. Watchdog stack traces are limited "
+                             "to 3 per %d seconds, skipping this one.\n",
+                             (int)lcw->lcw_pid,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000,
+                             libcfs_watchdog_ratelimit);
+       } else {
+               if (delta_time < libcfs_watchdog_ratelimit) {
+                       lcw_recent_watchdog_count++;
+               } else {
+                       memcpy(&lcw_last_watchdog_time, &current_time,
+                              sizeof(current_time));
+                       lcw_recent_watchdog_count = 0;
+               }
+
+               LCONSOLE_WARN("Service thread pid %u was inactive for "
+                             "%lu.%.02lus. The thread might be hung, or it "
+                             "might only be slow and will resume later. "
+                             "Dumping the stack trace for debugging purposes:"
+                             "\n",
+                             (int)lcw->lcw_pid,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000);
+               lcw_dump(lcw);
+       }
+}
+
+static int lcw_dispatch_main(void *data)
+{
+       int              rc = 0;
+       struct lc_watchdog *lcw;
+       LIST_HEAD(zombies);
+
+       ENTRY;
+
+       complete(&lcw_start_completion);
+
+       while (1) {
+               int dumplog = 1;
+
+               cfs_wait_event_interruptible(lcw_event_waitq,
+                                            is_watchdog_fired(), rc);
+               CDEBUG(D_INFO, "Watchdog got woken up...\n");
+               if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
+                       CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n");
+
+                       spin_lock_bh(&lcw_pending_timers_lock);
+                       rc = !list_empty(&lcw_pending_timers);
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+                       if (rc) {
+                               CERROR("pending timers list was not empty at "
+                                      "time of watchdog dispatch shutdown\n");
+                       }
+                       break;
+               }
+
+               spin_lock_bh(&lcw_pending_timers_lock);
+               while (!list_empty(&lcw_pending_timers)) {
+                       int is_dumplog;
+
+                       lcw = list_entry(lcw_pending_timers.next,
+                                            struct lc_watchdog, lcw_list);
+                       /* +1 ref for callback to make sure lcw won't be
+                        * deleted after releasing lcw_pending_timers_lock */
+                       lcw->lcw_refcount++;
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+
+                       /* lock ordering */
+                       spin_lock_bh(&lcw->lcw_lock);
+                       spin_lock_bh(&lcw_pending_timers_lock);
+
+                       if (list_empty(&lcw->lcw_list)) {
+                               /* already removed from pending list */
+                               lcw->lcw_refcount--; /* -1 ref for callback */
+                               if (lcw->lcw_refcount == 0)
+                                       list_add(&lcw->lcw_list, &zombies);
+                               spin_unlock_bh(&lcw->lcw_lock);
+                               /* still hold lcw_pending_timers_lock */
+                               continue;
+                       }
+
+                       list_del_init(&lcw->lcw_list);
+                       lcw->lcw_refcount--; /* -1 ref for pending list */
+
+                       spin_unlock_bh(&lcw_pending_timers_lock);
+                       spin_unlock_bh(&lcw->lcw_lock);
+
+                       CDEBUG(D_INFO, "found lcw for pid " LPPID "\n",
+                              lcw->lcw_pid);
+                       lcw_dump_stack(lcw);
+
+                       is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog;
+                       if (lcw->lcw_state != LC_WATCHDOG_DISABLED &&
+                           (dumplog || !is_dumplog)) {
+                               lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
+                               if (dumplog && is_dumplog)
+                                       dumplog = 0;
+                       }
+
+                       spin_lock_bh(&lcw_pending_timers_lock);
+                       lcw->lcw_refcount--; /* -1 ref for callback */
+                       if (lcw->lcw_refcount == 0)
+                               list_add(&lcw->lcw_list, &zombies);
+               }
+               spin_unlock_bh(&lcw_pending_timers_lock);
+
+               while (!list_empty(&zombies)) {
+                       lcw = list_entry(zombies.next,
+                                        struct lc_watchdog, lcw_list);
+                       list_del(&lcw->lcw_list);
+                       LIBCFS_FREE(lcw, sizeof(*lcw));
+               }
+       }
+
+       complete(&lcw_stop_completion);
+
+       RETURN(rc);
+}
+
+static void lcw_dispatch_start(void)
+{
+       task_t *task;
+
+       ENTRY;
+       LASSERT(lcw_refcount == 1);
+
+       init_completion(&lcw_stop_completion);
+       init_completion(&lcw_start_completion);
+       init_waitqueue_head(&lcw_event_waitq);
+
+       CDEBUG(D_INFO, "starting dispatch thread\n");
+       task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd");
+       if (IS_ERR(task)) {
+               CERROR("error spawning watchdog dispatch thread: %ld\n",
+                       PTR_ERR(task));
+               EXIT;
+               return;
+       }
+       wait_for_completion(&lcw_start_completion);
+       CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n");
+
+       EXIT;
+}
+
+static void lcw_dispatch_stop(void)
+{
+       ENTRY;
+       LASSERT(lcw_refcount == 0);
+
+       CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");
+
+       set_bit(LCW_FLAG_STOP, &lcw_flags);
+       wake_up(&lcw_event_waitq);
+
+       wait_for_completion(&lcw_stop_completion);
+
+       CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n");
+
+       EXIT;
+}
+
+struct lc_watchdog *lc_watchdog_add(int timeout,
+                                   void (*callback)(pid_t, void *),
+                                   void *data)
+{
+       struct lc_watchdog *lcw = NULL;
+       ENTRY;
+
+       LIBCFS_ALLOC(lcw, sizeof(*lcw));
+       if (lcw == NULL) {
+               CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n");
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       spin_lock_init(&lcw->lcw_lock);
+       lcw->lcw_refcount = 1; /* refcount for owner */
+       lcw->lcw_task     = current;
+       lcw->lcw_pid      = current_pid();
+       lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog;
+       lcw->lcw_data     = data;
+       lcw->lcw_state    = LC_WATCHDOG_DISABLED;
+
+       INIT_LIST_HEAD(&lcw->lcw_list);
+       cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw);
+
+       mutex_lock(&lcw_refcount_mutex);
+       if (++lcw_refcount == 1)
+               lcw_dispatch_start();
+       mutex_unlock(&lcw_refcount_mutex);
+
+       /* Keep this working in case we enable them by default */
+       if (lcw->lcw_state == LC_WATCHDOG_ENABLED) {
+               lcw->lcw_last_touched = cfs_time_current();
+               cfs_timer_arm(&lcw->lcw_timer, cfs_time_seconds(timeout) +
+                             cfs_time_current());
+       }
+
+       RETURN(lcw);
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
+{
+       cfs_time_t newtime = cfs_time_current();
+
+       if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
+               struct timeval timediff;
+               cfs_time_t delta_time = cfs_time_sub(newtime,
+                                                    lcw->lcw_last_touched);
+               cfs_duration_usec(delta_time, &timediff);
+
+               LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. "
+                             "This indicates the system was overloaded (too "
+                             "many service threads, or there were not enough "
+                             "hardware resources).\n",
+                             lcw->lcw_pid,
+                             message,
+                             timediff.tv_sec,
+                             timediff.tv_usec / 10000);
+       }
+       lcw->lcw_last_touched = newtime;
+}
+
+static void lc_watchdog_del_pending(struct lc_watchdog *lcw)
+{
+       spin_lock_bh(&lcw->lcw_lock);
+       if (unlikely(!list_empty(&lcw->lcw_list))) {
+               spin_lock_bh(&lcw_pending_timers_lock);
+               list_del_init(&lcw->lcw_list);
+               lcw->lcw_refcount--; /* -1 ref for pending list */
+               spin_unlock_bh(&lcw_pending_timers_lock);
+       }
+
+       spin_unlock_bh(&lcw->lcw_lock);
+}
+
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       lc_watchdog_del_pending(lcw);
+
+       lcw_update_time(lcw, "resumed");
+       lcw->lcw_state = LC_WATCHDOG_ENABLED;
+
+       cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() +
+                     cfs_time_seconds(timeout));
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       lc_watchdog_del_pending(lcw);
+
+       lcw_update_time(lcw, "completed");
+       lcw->lcw_state = LC_WATCHDOG_DISABLED;
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+       int dead;
+
+       ENTRY;
+       LASSERT(lcw != NULL);
+
+       cfs_timer_disarm(&lcw->lcw_timer);
+
+       lcw_update_time(lcw, "stopped");
+
+       spin_lock_bh(&lcw->lcw_lock);
+       spin_lock_bh(&lcw_pending_timers_lock);
+       if (unlikely(!list_empty(&lcw->lcw_list))) {
+               list_del_init(&lcw->lcw_list);
+               lcw->lcw_refcount--; /* -1 ref for pending list */
+       }
+
+       lcw->lcw_refcount--; /* -1 ref for owner */
+       dead = lcw->lcw_refcount == 0;
+       spin_unlock_bh(&lcw_pending_timers_lock);
+       spin_unlock_bh(&lcw->lcw_lock);
+
+       if (dead)
+               LIBCFS_FREE(lcw, sizeof(*lcw));
+
+       mutex_lock(&lcw_refcount_mutex);
+       if (--lcw_refcount == 0)
+               lcw_dispatch_stop();
+       mutex_unlock(&lcw_refcount_mutex);
+
+       EXIT;
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
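+
+/*
+ * Usage sketch (editor's illustration, not part of the original patch;
+ * the example_* helpers are hypothetical): a service thread arms the
+ * watchdog before each request, disables it while idle, and deletes
+ * it on exit. A NULL callback defaults to lc_watchdog_dumplog().
+ */
+#if 0
+static int example_service_main(void *data)
+{
+       struct lc_watchdog *watchdog;
+
+       watchdog = lc_watchdog_add(30, NULL, NULL);
+       if (IS_ERR(watchdog))
+               return PTR_ERR(watchdog);
+
+       while (!example_service_stopping()) {
+               lc_watchdog_touch(watchdog, 30);  /* re-arm for 30s */
+               example_handle_one_request();
+               lc_watchdog_disable(watchdog);    /* quiet while idle */
+       }
+
+       lc_watchdog_delete(watchdog);
+       return 0;
+}
+#endif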
+
+/*
+ * Provided watchdog handlers
+ */
+
+void lc_watchdog_dumplog(pid_t pid, void *data)
+{
+       libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid));
+}
+EXPORT_SYMBOL(lc_watchdog_dumplog);
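+
+/*
+ * Usage sketch for the API above (hypothetical caller: the timeout value,
+ * service_running() and do_one_request() are illustrative assumptions, not
+ * code from this file):
+ *
+ *     struct lc_watchdog *lcw;
+ *
+ *     lcw = lc_watchdog_add(timeout, lc_watchdog_dumplog, NULL);
+ *     while (service_running()) {
+ *             lc_watchdog_touch(lcw, timeout);   re-arm before each request
+ *             do_one_request();
+ *             lc_watchdog_disable(lcw);          idle again: stop timing
+ *     }
+ *     lc_watchdog_delete(lcw);                   drop the owner reference
+ */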
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+struct lc_watchdog *lc_watchdog_add(int timeout,
+                                   void (*callback)(pid_t pid, void *),
+                                   void *data)
+{
+       static struct lc_watchdog      watchdog;
+       return &watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c
new file mode 100644 (file)
index 0000000..b533666
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/workitem.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *      Liang Zhen  <zhen.liang@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/libcfs/libcfs.h>
+
+#define CFS_WS_NAME_LEN         16
+
+typedef struct cfs_wi_sched {
+       struct list_head                ws_list;        /* chain on global list */
+       /** serialised workitems */
+       spinlock_t              ws_lock;
+       /** where schedulers sleep */
+       wait_queue_head_t               ws_waitq;
+       /** concurrent workitems */
+       struct list_head                ws_runq;
+       /** rescheduled running-workitems: a workitem can be rescheduled
+        * while running in wi_action(), but we don't want to execute it
+        * again until it returns from wi_action(), so we put it on
+        * ws_rerunq while rescheduling, and move it back to ws_runq when
+        * wi_action() returns */
+       struct list_head                ws_rerunq;
+       /** CPT-table for this scheduler */
+       struct cfs_cpt_table    *ws_cptab;
+       /** CPT id for affinity */
+       int                     ws_cpt;
+       /** number of scheduled workitems */
+       int                     ws_nscheduled;
+       /** started scheduler thread, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_nthreads:30;
+       /** shutting down, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_stopping:1;
+       /** serialize starting thread, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_starting:1;
+       /** scheduler name */
+       char                    ws_name[CFS_WS_NAME_LEN];
+} cfs_wi_sched_t;
+
+struct cfs_workitem_data {
+       /** serialize */
+       spinlock_t              wi_glock;
+       /** list of all schedulers */
+       struct list_head                wi_scheds;
+       /** WI module is initialized */
+       int                     wi_init;
+       /** shutting down the whole WI module */
+       int                     wi_stopping;
+} cfs_wi_data;
+
+static inline void
+cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+{
+       spin_lock(&sched->ws_lock);
+}
+
+static inline void
+cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
+{
+       spin_unlock(&sched->ws_lock);
+}
+
+static inline int
+cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
+{
+       cfs_wi_sched_lock(sched);
+       if (sched->ws_stopping) {
+               cfs_wi_sched_unlock(sched);
+               return 0;
+       }
+
+       if (!list_empty(&sched->ws_runq)) {
+               cfs_wi_sched_unlock(sched);
+               return 0;
+       }
+       cfs_wi_sched_unlock(sched);
+       return 1;
+}
+
+
+/* XXX:
+ * 0. it only works when called from wi->wi_action.
+ * 1. when it returns no one shall try to schedule the workitem.
+ */
+void
+cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       cfs_wi_sched_lock(sched);
+
+       LASSERT(wi->wi_running);
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
+
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
+       }
+
+       LASSERT(list_empty(&wi->wi_list));
+
+       /* leave wi_scheduled set so that any future schedule attempt
+        * will trip the LASSERT above and LBUG */
+       wi->wi_scheduled = 1;
+       cfs_wi_sched_unlock(sched);
+
+       return;
+}
+EXPORT_SYMBOL(cfs_wi_exit);
+
+/**
+ * cancel schedule request of workitem \a wi
+ */
+int
+cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       int     rc;
+
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       /*
+        * return 0 if it's running already, otherwise return 1, which
+        * means the workitem will not be scheduled and will not have
+        * any race with wi_action.
+        */
+       cfs_wi_sched_lock(sched);
+
+       rc = !(wi->wi_running);
+
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
+
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
+
+               wi->wi_scheduled = 0;
+       }
+
+       LASSERT(list_empty(&wi->wi_list));
+
+       cfs_wi_sched_unlock(sched);
+       return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
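+
+/*
+ * Caller-side note (an interpretation of the return convention above):
+ * a return of 1 means the pending schedule was cancelled and wi_action()
+ * will not run, so the caller may free the work item; a return of 0 means
+ * wi_action() is running (or about to run), so the caller must synchronise
+ * with it, e.g. via cfs_wi_exit() from wi_action() itself, before freeing.
+ */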
+
+/*
+ * Workitem scheduled with (serial == 1) is strictly serialised not only with
+ * itself, but also with others scheduled this way.
+ *
+ * Now there's only one static serialised queue, but in the future more might
+ * be added, and even dynamic creation of serialised queues might be supported.
+ */
+void
+cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
+{
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       cfs_wi_sched_lock(sched);
+
+       if (!wi->wi_scheduled) {
+               LASSERT(list_empty(&wi->wi_list));
+
+               wi->wi_scheduled = 1;
+               sched->ws_nscheduled++;
+               if (!wi->wi_running) {
+                       list_add_tail(&wi->wi_list, &sched->ws_runq);
+                       wake_up(&sched->ws_waitq);
+               } else {
+                       list_add(&wi->wi_list, &sched->ws_rerunq);
+               }
+       }
+
+       LASSERT(!list_empty(&wi->wi_list));
+       cfs_wi_sched_unlock(sched);
+       return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
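+
+/*
+ * Rescheduling sketch (hypothetical action, shown only to illustrate the
+ * ws_rerunq behaviour documented above; my_sched and have_more_work() are
+ * assumptions):
+ *
+ *     static int my_wi_action(cfs_workitem_t *wi)
+ *     {
+ *             if (have_more_work())
+ *                     cfs_wi_schedule(my_sched, wi);  lands on ws_rerunq;
+ *                                                     runs again only after
+ *                                                     this call returns
+ *             return 0;   non-zero would mean the wi may be dead or freed
+ *     }
+ */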
+
+
+static int
+cfs_wi_scheduler(void *arg)
+{
+       struct cfs_wi_sched     *sched = (cfs_wi_sched_t *)arg;
+
+       cfs_block_allsigs();
+
+       /* CPT affinity scheduler? */
+       if (sched->ws_cptab != NULL)
+               cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+
+       LASSERT(sched->ws_starting == 1);
+       sched->ws_starting--;
+       sched->ws_nthreads++;
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+
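+       /* Main loop: drain ws_runq in batches of up to CFS_WI_RESCHED
+        * actions, yielding the CPU between batches, and sleep on ws_waitq
+        * when the queue is empty and the scheduler is not stopping. */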
+       cfs_wi_sched_lock(sched);
+
+       while (!sched->ws_stopping) {
+               int          nloops = 0;
+               int          rc;
+               cfs_workitem_t *wi;
+
+               while (!list_empty(&sched->ws_runq) &&
+                      nloops < CFS_WI_RESCHED) {
+                       wi = list_entry(sched->ws_runq.next,
+                                           cfs_workitem_t, wi_list);
+                       LASSERT(wi->wi_scheduled && !wi->wi_running);
+
+                       list_del_init(&wi->wi_list);
+
+                       LASSERT(sched->ws_nscheduled > 0);
+                       sched->ws_nscheduled--;
+
+                       wi->wi_running   = 1;
+                       wi->wi_scheduled = 0;
+
+                       cfs_wi_sched_unlock(sched);
+                       nloops++;
+
+                       rc = (*wi->wi_action)(wi);
+
+                       cfs_wi_sched_lock(sched);
+                       if (rc != 0) /* WI should be dead, even be freed! */
+                               continue;
+
+                       wi->wi_running = 0;
+                       if (list_empty(&wi->wi_list))
+                               continue;
+
+                       LASSERT(wi->wi_scheduled);
+                       /* wi is rescheduled, should be on rerunq now, we
+                        * move it to runq so it can run action now */
+                       list_move_tail(&wi->wi_list, &sched->ws_runq);
+               }
+
+               if (!list_empty(&sched->ws_runq)) {
+                       cfs_wi_sched_unlock(sched);
+                       /* don't sleep because some workitems still
+                        * expect me to come back soon */
+                       cond_resched();
+                       cfs_wi_sched_lock(sched);
+                       continue;
+               }
+
+               cfs_wi_sched_unlock(sched);
+               cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
+                               !cfs_wi_sched_cansleep(sched), rc);
+               cfs_wi_sched_lock(sched);
+       }
+
+       cfs_wi_sched_unlock(sched);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       sched->ws_nthreads--;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       return 0;
+}
+
+
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
+{
+       int     i;
+
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       if (sched->ws_stopping) {
+               CDEBUG(D_INFO, "%s is in the process of stopping\n",
+                      sched->ws_name);
+               spin_unlock(&cfs_wi_data.wi_glock);
+               return;
+       }
+
+       LASSERT(!list_empty(&sched->ws_list));
+       sched->ws_stopping = 1;
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       i = 2;
+       wake_up_all(&sched->ws_waitq);
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       while (sched->ws_nthreads > 0) {
+               CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
+                      "waiting for %d threads of WI sched[%s] to terminate\n",
+                      sched->ws_nthreads, sched->ws_name);
+
+               spin_unlock(&cfs_wi_data.wi_glock);
+               cfs_pause(cfs_time_seconds(1) / 20);
+               spin_lock(&cfs_wi_data.wi_glock);
+       }
+
+       list_del(&sched->ws_list);
+
+       spin_unlock(&cfs_wi_data.wi_glock);
+       LASSERT(sched->ws_nscheduled == 0);
+
+       LIBCFS_FREE(sched, sizeof(*sched));
+}
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
+
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+                   int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
+{
+       struct cfs_wi_sched     *sched;
+       int                     rc;
+
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
+       LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+               (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+       LIBCFS_ALLOC(sched, sizeof(*sched));
+       if (sched == NULL)
+               return -ENOMEM;
+
+       strncpy(sched->ws_name, name, CFS_WS_NAME_LEN - 1);
+       sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
+       sched->ws_cptab = cptab;
+       sched->ws_cpt = cpt;
+
+       spin_lock_init(&sched->ws_lock);
+       init_waitqueue_head(&sched->ws_waitq);
+       INIT_LIST_HEAD(&sched->ws_runq);
+       INIT_LIST_HEAD(&sched->ws_rerunq);
+       INIT_LIST_HEAD(&sched->ws_list);
+
+       rc = 0;
+       while (nthrs > 0) {
+               char    name[16];
+               task_t  *task;
+               spin_lock(&cfs_wi_data.wi_glock);
+               while (sched->ws_starting > 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       schedule();
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+
+               sched->ws_starting++;
+               spin_unlock(&cfs_wi_data.wi_glock);
+
+               if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+                       snprintf(name, sizeof(name), "%s_%02d_%02d",
+                                sched->ws_name, sched->ws_cpt,
+                                sched->ws_nthreads);
+               } else {
+                       snprintf(name, sizeof(name), "%s_%02d",
+                                sched->ws_name, sched->ws_nthreads);
+               }
+
+               task = kthread_run(cfs_wi_scheduler, sched, name);
+               if (!IS_ERR(task)) {
+                       nthrs--;
+                       continue;
+               }
+               rc = PTR_ERR(task);
+
+               CERROR("Failed to create thread for WI scheduler %s: %d\n",
+                      name, rc);
+
+               spin_lock(&cfs_wi_data.wi_glock);
+
+               /* set up state so cfs_wi_sched_destroy() can clean up */
+               list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+               sched->ws_starting--;
+
+               spin_unlock(&cfs_wi_data.wi_glock);
+
+               cfs_wi_sched_destroy(sched);
+               return rc;
+       }
+       spin_lock(&cfs_wi_data.wi_glock);
+       list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       *sched_pp = sched;
+       return 0;
+}
+EXPORT_SYMBOL(cfs_wi_sched_create);
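+
+/*
+ * Minimal creation/teardown sketch (hypothetical caller; the name, thread
+ * count and work item my_wi are assumptions):
+ *
+ *     struct cfs_wi_sched *sched;
+ *
+ *     if (cfs_wi_sched_create("demo_wi", NULL, CFS_CPT_ANY, 2, &sched) == 0) {
+ *             cfs_wi_schedule(sched, &my_wi);
+ *             ...
+ *             cfs_wi_sched_destroy(sched);
+ *     }
+ */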
+
+int
+cfs_wi_startup(void)
+{
+       memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
+
+       spin_lock_init(&cfs_wi_data.wi_glock);
+       INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+       cfs_wi_data.wi_init = 1;
+
+       return 0;
+}
+
+void
+cfs_wi_shutdown(void)
+{
+       struct cfs_wi_sched     *sched;
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       cfs_wi_data.wi_stopping = 1;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       /* nobody should contend on this list */
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               sched->ws_stopping = 1;
+               wake_up_all(&sched->ws_waitq);
+       }
+
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               spin_lock(&cfs_wi_data.wi_glock);
+
+               while (sched->ws_nthreads != 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       cfs_pause(cfs_time_seconds(1) / 20);
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+               spin_unlock(&cfs_wi_data.wi_glock);
+       }
+       while (!list_empty(&cfs_wi_data.wi_scheds)) {
+               sched = list_entry(cfs_wi_data.wi_scheds.next,
+                                      struct cfs_wi_sched, ws_list);
+               list_del(&sched->ws_list);
+               LIBCFS_FREE(sched, sizeof(*sched));
+       }
+
+       cfs_wi_data.wi_stopping = 0;
+       cfs_wi_data.wi_init = 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile
new file mode 100644 (file)
index 0000000..dff0c04
--- /dev/null
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += lustre.o
+obj-$(CONFIG_LUSTRE_FS) += llite_lloop.o
+lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \
+           rw.o lproc_llite.o namei.o symlink.o llite_mmap.o \
+           xattr.o remote_perm.o llite_rmtacl.o llite_capa.o \
+           rw26.o super25.o statahead.o \
+           ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \
+           vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
+
+llite_lloop-y := lloop.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
new file mode 100644 (file)
index 0000000..7d6abff
--- /dev/null
@@ -0,0 +1,675 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/quotaops.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+
+#include "llite_internal.h"
+
+static void free_dentry_data(struct rcu_head *head)
+{
+       struct ll_dentry_data *lld;
+
+       lld = container_of(head, struct ll_dentry_data, lld_rcu_head);
+       OBD_FREE_PTR(lld);
+}
+
+/* should NOT be called with the dcache lock, see fs/dcache.c */
+static void ll_release(struct dentry *de)
+{
+       struct ll_dentry_data *lld;
+       ENTRY;
+       LASSERT(de != NULL);
+       lld = ll_d2d(de);
+       if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */
+               RETURN_EXIT;
+
+       if (lld->lld_it) {
+               ll_intent_release(lld->lld_it);
+               OBD_FREE(lld->lld_it, sizeof(*lld->lld_it));
+       }
+       LASSERT(lld->lld_cwd_count == 0);
+       LASSERT(lld->lld_mnt_count == 0);
+       de->d_fsdata = NULL;
+       call_rcu(&lld->lld_rcu_head, free_dentry_data);
+
+       EXIT;
+}
+
+/* Compare if two dentries are the same.  Don't match if the existing dentry
+ * is marked invalid.  Returns 1 if different, 0 if the same.
+ *
+ * This avoids a race where ll_lookup_it() instantiates a dentry, but we get
+ * an AST before calling d_revalidate_it().  The dentry still exists (marked
+ * INVALID) so d_lookup() matches it, but we have no lock on it (so
+ * lock_match() fails) and we spin around real_lookup(). */
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+               const struct dentry *dentry, const struct inode *inode,
+               unsigned int len, const char *str, const struct qstr *name)
+{
+       ENTRY;
+
+       if (len != name->len)
+               RETURN(1);
+
+       if (memcmp(str, name->name, len))
+               RETURN(1);
+
+       CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
+              name->len, name->name, dentry, dentry->d_flags,
+              d_refcount(dentry));
+
+       /* mountpoint is always valid */
+       if (d_mountpoint((struct dentry *)dentry))
+               RETURN(0);
+
+       if (d_lustre_invalid(dentry))
+               RETURN(1);
+
+       RETURN(0);
+}
+
+static inline int return_if_equal(struct ldlm_lock *lock, void *data)
+{
+       if ((lock->l_flags &
+            (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) ==
+           (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA))
+               return LDLM_ITER_CONTINUE;
+       return LDLM_ITER_STOP;
+}
+
+/* find any ldlm lock of the inode in mdc and lov
+ * return 0    not find
+ *     1    find one
+ *      < 0    error */
+static int find_cbdata(struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct lov_stripe_md *lsm;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(inode);
+       rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode),
+                           return_if_equal, NULL);
+       if (rc != 0)
+               RETURN(rc);
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               RETURN(rc);
+
+       rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL);
+       ccc_inode_lsm_put(inode, lsm);
+
+       RETURN(rc);
+}
+
+/**
+ * Called when last reference to a dentry is dropped and dcache wants to know
+ * whether or not it should cache it:
+ * - return 1 to delete the dentry immediately
+ * - return 0 to cache the dentry
+ * Should NOT be called with the dcache lock, see fs/dcache.c
+ */
+static int ll_ddelete(const struct dentry *de)
+{
+       ENTRY;
+       LASSERT(de);
+
+       CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n",
+              d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping",
+              de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+              d_unhashed((struct dentry *)de) ? "" : "hashed,",
+              list_empty(&de->d_subdirs) ? "" : "subdirs");
+
+       /* Since kernel 2.6.38, the last refcount is decreased after this function. */
+       LASSERT(d_refcount(de) == 1);
+
+       /* This code is disabled temporarily, because it is called inside
+        * dcache_lock, so it is not appropriate to do lots of work here.
+        * ATTENTION: before re-enabling it, LU-2487 must be resolved. */
+#if 0
+       /* if there is no ldlm lock for this inode, set i_nlink to 0 so that
+        * this inode can be recycled later b=20433 */
+       if (de->d_inode && !find_cbdata(de->d_inode))
+               clear_nlink(de->d_inode);
+#endif
+
+       if (d_lustre_invalid((struct dentry *)de))
+               RETURN(1);
+       RETURN(0);
+}
+
+static int ll_set_dd(struct dentry *de)
+{
+       ENTRY;
+       LASSERT(de != NULL);
+
+       CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
+               de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+               d_refcount(de));
+
+       if (de->d_fsdata == NULL) {
+               struct ll_dentry_data *lld;
+
+               OBD_ALLOC_PTR(lld);
+               if (likely(lld != NULL)) {
+                       spin_lock(&de->d_lock);
+                       if (likely(de->d_fsdata == NULL))
+                               de->d_fsdata = lld;
+                       else
+                               OBD_FREE_PTR(lld);
+                       spin_unlock(&de->d_lock);
+               } else {
+                       RETURN(-ENOMEM);
+               }
+       }
+
+       RETURN(0);
+}
+
+int ll_dops_init(struct dentry *de, int block, int init_sa)
+{
+       struct ll_dentry_data *lld = ll_d2d(de);
+       int rc = 0;
+
+       if (lld == NULL && block != 0) {
+               rc = ll_set_dd(de);
+               if (rc)
+                       return rc;
+
+               lld = ll_d2d(de);
+       }
+
+       if (lld != NULL && init_sa != 0)
+               lld->lld_sa_generation = 0;
+
+       /* Since kernel 2.6.38, d_op is set in d_alloc() */
+       LASSERT(de->d_op == &ll_d_ops);
+       return rc;
+}
+
+void ll_intent_drop_lock(struct lookup_intent *it)
+{
+       if (it->it_op && it->d.lustre.it_lock_mode) {
+               struct lustre_handle handle;
+
+               handle.cookie = it->d.lustre.it_lock_handle;
+
+               CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
+                      " from it %p\n", handle.cookie, it);
+               ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode);
+
+               /* bug 494: intent_release may be called multiple times, from
+                * this thread and we don't want to double-decref this lock */
+               it->d.lustre.it_lock_mode = 0;
+               if (it->d.lustre.it_remote_lock_mode != 0) {
+                       handle.cookie = it->d.lustre.it_remote_lock_handle;
+
+                       CDEBUG(D_DLMTRACE, "releasing remote lock with cookie"
+                              LPX64" from it %p\n", handle.cookie, it);
+                       ldlm_lock_decref(&handle,
+                                        it->d.lustre.it_remote_lock_mode);
+                       it->d.lustre.it_remote_lock_mode = 0;
+               }
+       }
+}
+
+void ll_intent_release(struct lookup_intent *it)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "intent %p released\n", it);
+       ll_intent_drop_lock(it);
+       /* We are still holding extra reference on a request, need to free it */
+       if (it_disposition(it, DISP_ENQ_OPEN_REF))
+               ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */
+       if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */
+               ptlrpc_req_finished(it->d.lustre.it_data);
+       if (it_disposition(it, DISP_ENQ_COMPLETE)) /* saved req from revalidate
+                                                   * to lookup */
+               ptlrpc_req_finished(it->d.lustre.it_data);
+
+       it->d.lustre.it_disposition = 0;
+       it->d.lustre.it_data = NULL;
+       EXIT;
+}
+
+void ll_invalidate_aliases(struct inode *inode)
+{
+       struct dentry *dentry;
+       struct ll_d_hlist_node *p;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       ll_lock_dcache(inode);
+       ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+               CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p "
+                      "inode %p flags %d\n", dentry->d_name.len,
+                      dentry->d_name.name, dentry, dentry->d_parent,
+                      dentry->d_inode, dentry->d_flags);
+
+               if (dentry->d_name.len == 1 && dentry->d_name.name[0] == '/') {
+                       CERROR("called on root (?) dentry=%p, inode=%p "
+                              "ino=%lu\n", dentry, inode, inode->i_ino);
+                       lustre_dump_dentry(dentry, 1);
+                       libcfs_debug_dumpstack(NULL);
+               }
+
+               d_lustre_invalidate(dentry, 0);
+       }
+       ll_unlock_dcache(inode);
+
+       EXIT;
+}
+
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+                           struct lookup_intent *it,
+                           struct dentry *de)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!request)
+               RETURN(0);
+
+       if (it_disposition(it, DISP_LOOKUP_NEG))
+               RETURN(-ENOENT);
+
+       rc = ll_prep_inode(&de->d_inode, request, NULL, it);
+
+       RETURN(rc);
+}
+
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
+{
+       LASSERT(it != NULL);
+       LASSERT(dentry != NULL);
+
+       if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
+               struct inode *inode = dentry->d_inode;
+               struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+
+               CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+                      inode, inode->i_ino, inode->i_generation);
+               ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+       }
+
+       /* drop lookup or getattr locks immediately */
+       if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) {
+               /* on 2.6 there are situations when several lookups and
+                * revalidations may be requested during a single operation.
+                * therefore, we don't release the intent here -bzzz */
+               ll_intent_drop_lock(it);
+       }
+}
+
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
+{
+       struct lookup_intent *it = *itp;
+
+       if (!it || it->it_op == IT_GETXATTR)
+               it = *itp = deft;
+}
+
+int ll_revalidate_it(struct dentry *de, int lookup_flags,
+                    struct lookup_intent *it)
+{
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+       struct obd_export *exp;
+       struct inode *parent = de->d_parent->d_inode;
+       int rc;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
+              LL_IT2STR(it));
+
+       if (de->d_inode == NULL) {
+               __u64 ibits;
+
+               /* We can only use negative dentries if this is a stat or
+                * lookup; for opens and the like we do need to query the
+                * server. */
+               /* If IT_CREAT is set in the intent op, we must throw away
+                * this negative dentry and actually ask the kernel to create
+                * whatever needs to be created (if possible). */
+               if (it && (it->it_op & IT_CREAT))
+                       RETURN(0);
+
+               if (d_lustre_invalid(de))
+                       RETURN(0);
+
+               ibits = MDS_INODELOCK_UPDATE;
+               rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE);
+               GOTO(out_sa, rc);
+       }
+
+       /* Never execute intents for mount points.
+        * Attributes will be fixed up in ll_inode_revalidate_it */
+       if (d_mountpoint(de))
+               GOTO(out_sa, rc = 1);
+
+       /* need to get attributes in case root got changed from another client */
+       if (de == de->d_sb->s_root) {
+               rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
+               if (rc == 0)
+                       rc = 1;
+               GOTO(out_sa, rc);
+       }
+
+       exp = ll_i2mdexp(de->d_inode);
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
+       ll_frob_intent(&it, &lookup_it);
+       LASSERT(it);
+
+       if (it->it_op == IT_LOOKUP && !d_lustre_invalid(de))
+               RETURN(1);
+
+       if (it->it_op == IT_OPEN) {
+               struct inode *inode = de->d_inode;
+               struct ll_inode_info *lli = ll_i2info(inode);
+               struct obd_client_handle **och_p;
+               __u64 *och_usecount;
+               __u64 ibits;
+
+               /*
+                * We used to check for MDS_INODELOCK_OPEN here, but in fact
+                * just having the LOOKUP lock is enough to justify that the
+                * inode is the same. And if the inode is the same and we have
+                * a suitable openhandle, there is no point in doing another
+                * OPEN RPC just to throw away the newly received openhandle.
+                * There are no security implications either: if the file
+                * owner or access mode changes, the LOOKUP lock is revoked.
+                */
+
+               if (it->it_flags & FMODE_WRITE) {
+                       och_p = &lli->lli_mds_write_och;
+                       och_usecount = &lli->lli_open_fd_write_count;
+               } else if (it->it_flags & FMODE_EXEC) {
+                       och_p = &lli->lli_mds_exec_och;
+                       och_usecount = &lli->lli_open_fd_exec_count;
+               } else {
+                       och_p = &lli->lli_mds_read_och;
+                       och_usecount = &lli->lli_open_fd_read_count;
+               }
+               /* Check for the proper lock. */
+               ibits = MDS_INODELOCK_LOOKUP;
+               if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE))
+                       goto do_lock;
+               mutex_lock(&lli->lli_och_mutex);
+               if (*och_p) { /* Everything is open already, do nothing */
+                       /*(*och_usecount)++;  Do not let them steal our open
+                         handle from under us */
+                       SET_BUT_UNUSED(och_usecount);
+                       /* XXX The line above was my original idea, but if we
+                          have the handle yet cannot use it because of later
+                          checks (e.g. O_CREAT|O_EXCL flags set), nobody
+                          would decrement the counter incremented here. So we
+                          just hope the lock won't be invalidated in between;
+                          if it is, we will redo the open request to the MDS
+                          later, during the file open path */
+                       mutex_unlock(&lli->lli_och_mutex);
+                       RETURN(1);
+               } else {
+                       mutex_unlock(&lli->lli_och_mutex);
+               }
+       }
+
+       if (it->it_op == IT_GETATTR) {
+               rc = ll_statahead_enter(parent, &de, 0);
+               if (rc == 1)
+                       goto mark;
+               else if (rc != -EAGAIN && rc != 0)
+                       GOTO(out, rc = 0);
+       }
+
+do_lock:
+       op_data = ll_prep_md_op_data(NULL, parent, de->d_inode,
+                                    de->d_name.name, de->d_name.len,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       if (!IS_POSIXACL(parent) || !exp_connect_umask(exp))
+               it->it_create_mode &= ~current_umask();
+       it->it_create_mode |= M_CHECK_STALE;
+       rc = md_intent_lock(exp, op_data, NULL, 0, it,
+                           lookup_flags,
+                           &req, ll_md_blocking_ast, 0);
+       it->it_create_mode &= ~M_CHECK_STALE;
+       ll_finish_md_op_data(op_data);
+
+       /* If req is NULL, then md_intent_lock only tried to do a lock match;
+        * if all was well, it will return 1 if it found locks, 0 otherwise. */
+       if (req == NULL && rc >= 0) {
+               if (!rc)
+                       goto do_lookup;
+               GOTO(out, rc);
+       }
+
+       if (rc < 0) {
+               if (rc != -ESTALE) {
+                       CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status "
+                              "%d\n", rc, it->d.lustre.it_status);
+               }
+               GOTO(out, rc = 0);
+       }
+
+revalidate_finish:
+       rc = ll_revalidate_it_finish(req, it, de);
+       if (rc != 0) {
+               if (rc != -ESTALE && rc != -ENOENT)
+                       ll_intent_release(it);
+               GOTO(out, rc = 0);
+       }
+
+       if ((it->it_op & IT_OPEN) && de->d_inode &&
+           !S_ISREG(de->d_inode->i_mode) &&
+           !S_ISDIR(de->d_inode->i_mode)) {
+               ll_release_openhandle(de, it);
+       }
+       rc = 1;
+
+out:
+       /* We do not free the request as it may be reused during a following
+        * lookup (see comment in mdc/mdc_locks.c::mdc_intent_lock()); the
+        * request will be freed in ll_lookup_it or in ll_intent_release. But
+        * if the request was not completed, we need to free it. (bug 5154, 9903) */
+       if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
+               ptlrpc_req_finished(req);
+       if (rc == 0) {
+               /* mdt may grant layout lock for the newly created file, so
+                * release the lock to avoid leaking */
+               ll_intent_drop_lock(it);
+               ll_invalidate_aliases(de->d_inode);
+       } else {
+               __u64 bits = 0;
+               __u64 matched_bits = 0;
+
+               CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
+                      "inode %p refc %d\n", de->d_name.len,
+                      de->d_name.name, de, de->d_parent, de->d_inode,
+                      d_refcount(de));
+
+               ll_set_lock_data(exp, de->d_inode, it, &bits);
+
+               /* Note: we have to match both the LOOKUP and PERM locks
+                * here to make sure the dentry is valid and nobody is
+                * changing the permissions.
+                * But a server older than 2.4 will only grant the LOOKUP
+                * lock, so against an old server we can only match the
+                * LOOKUP lock */
+               if (exp_connect_flags(ll_i2mdexp(de->d_inode)) &
+                   OBD_CONNECT_LVB_TYPE)
+                       matched_bits =
+                               MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM;
+               else
+                       matched_bits = MDS_INODELOCK_LOOKUP;
+
+               if (((bits & matched_bits) == matched_bits) &&
+                   d_lustre_invalid(de))
+                       d_lustre_revalidate(de);
+               ll_lookup_finish_locks(it, de);
+       }
+
+mark:
+       if (it != NULL && it->it_op == IT_GETATTR && rc > 0)
+               ll_statahead_mark(parent, de);
+       RETURN(rc);
+
+       /*
+        * This part is here to combat evil-evil race in real_lookup on 2.6
+        * kernels.  The race details are: We enter do_lookup() looking for some
+        * name, there is nothing in dcache for this name yet and d_lookup()
+        * returns NULL.  We proceed to real_lookup(), and while we do this,
+        * another process opens the same file we are looking up (the simplest
+        * reproducer); the open succeeds and the dentry is added. Now back to
+        * us. In real_lookup() we do d_lookup() again and suddenly find the
+        * dentry, so we call d_revalidate on it, but there is no lock, so
+        * without this code we would return 0, but unpatched real_lookup just
+        * returns -ENOENT in such a case instead of retrying the lookup. Once
+        * this is dealt with in real_lookup(), all of this ugly mess can go and
+        * we can just check locks in ->d_revalidate without doing any RPCs
+        * ever.
+        */
+do_lookup:
+       if (it != &lookup_it) {
+               /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */
+               if (it->it_op == IT_GETATTR)
+                       lookup_it.it_op = IT_GETATTR;
+               ll_lookup_finish_locks(it, de);
+               it = &lookup_it;
+       }
+
+       /* Do real lookup here. */
+       op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name,
+                                    de->d_name.len, 0, (it->it_op & IT_CREAT ?
+                                                        LUSTRE_OPC_CREATE :
+                                                        LUSTRE_OPC_ANY), NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       rc = md_intent_lock(exp, op_data, NULL, 0,  it, 0, &req,
+                           ll_md_blocking_ast, 0);
+       if (rc >= 0) {
+               struct mdt_body *mdt_body;
+               struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0};
+               mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+               if (de->d_inode)
+                       fid = *ll_inode2fid(de->d_inode);
+
+               /* see if we got same inode, if not - return error */
+               if (lu_fid_eq(&fid, &mdt_body->fid1)) {
+                       ll_finish_md_op_data(op_data);
+                       op_data = NULL;
+                       goto revalidate_finish;
+               }
+               ll_intent_release(it);
+       }
+       ll_finish_md_op_data(op_data);
+       GOTO(out, rc = 0);
+
+out_sa:
+       /*
+        * For rc == 1 case, should not return directly to prevent losing
+        * statahead windows; for rc == 0 case, the "lookup" will be done later.
+        */
+       if (it != NULL && it->it_op == IT_GETATTR && rc == 1)
+               ll_statahead_enter(parent, &de, 1);
+       goto mark;
+}
+
+/*
+ * Always trust cached dentries. Update statahead window if necessary.
+ */
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags)
+{
+       struct inode *parent = dentry->d_parent->d_inode;
+       int unplug = 0;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%s,flags=%u\n",
+              dentry->d_name.name, flags);
+
+       if (!(flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) &&
+           ll_need_statahead(parent, dentry) > 0) {
+               if (flags & LOOKUP_RCU)
+                       RETURN(-ECHILD);
+
+               if (dentry->d_inode == NULL)
+                       unplug = 1;
+               do_statahead_enter(parent, &dentry, unplug);
+               ll_statahead_mark(parent, dentry);
+       }
+
+       RETURN(1);
+}
+
+
+void ll_d_iput(struct dentry *de, struct inode *inode)
+{
+       LASSERT(inode);
+       if (!find_cbdata(inode))
+               clear_nlink(inode);
+       iput(inode);
+}
+
+struct dentry_operations ll_d_ops = {
+       .d_revalidate = ll_revalidate_nd,
+       .d_release = ll_release,
+       .d_delete  = ll_ddelete,
+       .d_iput    = ll_d_iput,
+       .d_compare = ll_dcompare,
+};
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
new file mode 100644 (file)
index 0000000..23c61fe
--- /dev/null
@@ -0,0 +1,1978 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/dir.c
+ *
+ * Directory code for lustre client.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/buffer_head.h>   /* for wait_on_buffer */
+#include <linux/pagevec.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_fid.h>
+#include "llite_internal.h"
+
+/*
+ * (new) readdir implementation overview.
+ *
+ * The original lustre readdir implementation cached an exact copy of the raw
+ * directory pages on the client. These pages were indexed in the client page
+ * cache by their logical offset in the directory file. This design, while
+ * very simple and intuitive, had some inherent problems:
+ *
+ *     . it implies that byte offset to the directory entry serves as a
+ *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
+ *     ext3/htree directory entries may move due to splits, and more
+ *     importantly,
+ *
+ *     . it is incompatible with the design of split directories for cmd3,
+ *     that assumes that names are distributed across nodes based on their
+ *     hash, and so readdir should be done in hash order.
+ *
+ * The new readdir implementation does readdir in hash order, and uses the
+ * hash of a file name as a telldir/seekdir cookie. This led to a number of
+ * complications:
+ *
+ *     . hash is not unique, so it cannot be used to index cached directory
+ *     pages on the client (note, that it requires a whole pageful of hash
+ *     collided entries to cause two pages to have identical hashes);
+ *
+ *     . hash is not unique, so it cannot, strictly speaking, be used as an
+ *     entry cookie. ext3/htree has the same problem and lustre implementation
+ *     mimics their solution: seekdir(hash) positions directory at the first
+ *     entry with the given hash.
+ *
+ * Client side.
+ *
+ * 0. caching
+ *
+ * Client caches directory pages using hash of the first entry as an index. As
+ * noted above hash is not unique, so this solution doesn't work as is:
+ * special processing is needed for "page hash chains" (i.e., sequences of
+ * pages filled with entries all having the same hash value).
+ *
+ * First, such chains have to be detected. To this end, server returns to the
+ * client the hash of the first entry on the page next to one returned. When
+ * client detects that this hash is the same as hash of the first entry on the
+ * returned page, a page hash collision has to be handled. Pages in the
+ * hash chain, except the first one, are termed "overflow pages".
+ *
+ * The solution to the index uniqueness problem is to not cache overflow
+ * pages. Instead, when a page hash collision is detected, all overflow pages
+ * from emerging chain are immediately requested from the server and placed in
+ * a special data structure (struct ll_dir_chain). This data structure is used
+ * by ll_readdir() to process entries from overflow pages. When readdir
+ * invocation finishes, overflow pages are discarded. If the page hash
+ * collision chain wasn't completely processed, the next call to readdir will
+ * again detect the collision, again read the overflow pages in, process the
+ * next portion of
+ * entries and again discard the pages. This is not as wasteful as it looks,
+ * because, given reasonable hash, page hash collisions are extremely rare.
+ *
+ * 1. directory positioning
+ *
+ * When seekdir(hash) is called, original
+ *
+ * Server.
+ *
+ * identification of and access to overflow pages
+ *
+ * page format
+ *
+ * A page in the MDS_READPAGE RPC is packed in LU_PAGE_SIZE units, and each
+ * page contains a header lu_dirpage which describes the start/end hash and
+ * whether this page is empty (contains no dir entry) or its hash collides
+ * with the next page.
+ * After the client receives the reply, several pages will be integrated into
+ * a dir page of PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE is greater than
+ * LU_PAGE_SIZE), and the
+ * lu_dirpage for this integrated page will be adjusted. See
+ * lmv_adjust_dirpages().
+ *
+ */
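+
+/*
+ * Illustrative read loop implied by the scheme above (a sketch, not code
+ * from this file; the entry-walking step and the loop structure are
+ * assumptions):
+ *
+ *     __u64 pos = 0;                            start-of-directory hash
+ *     while (pos != MDS_DIR_END_OFF) {
+ *             page = ll_get_dir_page(dir, pos, &chain);
+ *             dp = page_address(page);
+ *             ... emit entries from dp in hash order ...
+ *             pos = le64_to_cpu(dp->ldp_hash_end);  telldir cookie for the
+ *                                                   next page
+ *             ll_release_page(page, 0);
+ *     }
+ */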
+
+/* returns the page unlocked, but with a reference */
+static int ll_dir_filler(void *_hash, struct page *page0)
+{
+       struct inode *inode = page0->mapping->host;
+       int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
+       struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
+       struct ptlrpc_request *request;
+       struct mdt_body *body;
+       struct md_op_data *op_data;
+       __u64 hash = *((__u64 *)_hash);
+       struct page **page_pool;
+       struct page *page;
+       struct lu_dirpage *dp;
+       struct pagevec lru_pvec; /* for batched LRU page insertion below */
+       int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+       int nrdpgs = 0; /* number of pages actually read */
+       int npages;
+       int i;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash "LPU64"\n",
+              inode->i_ino, inode->i_generation, inode, hash);
+
+       LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
+
+       OBD_ALLOC(page_pool, sizeof(page) * max_pages);
+       if (page_pool != NULL) {
+               page_pool[0] = page0;
+       } else {
+               page_pool = &page0;
+               max_pages = 1;
+       }
+       for (npages = 1; npages < max_pages; npages++) {
+               page = page_cache_alloc_cold(inode->i_mapping);
+               if (!page)
+                       break;
+               page_pool[npages] = page;
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data)) {
+               /* the cleanup loop below releases the preallocated pages */
+               rc = PTR_ERR(op_data);
+               request = NULL;
+       } else {
+               op_data->op_npages = npages;
+               op_data->op_offset = hash;
+               rc = md_readpage(exp, op_data, page_pool, &request);
+               ll_finish_md_op_data(op_data);
+       }
+       if (rc == 0) {
+               body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+               /* Checked by mdc_readpage() */
+               LASSERT(body != NULL);
+
+               if (body->valid & OBD_MD_FLSIZE)
+                       cl_isize_write(inode, body->size);
+
+               nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
+                        >> PAGE_CACHE_SHIFT;
+               SetPageUptodate(page0);
+       }
+       unlock_page(page0);
+       ptlrpc_req_finished(request);
+
+       CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
+
+       ll_pagevec_init(&lru_pvec, 0);
+       for (i = 1; i < npages; i++) {
+               unsigned long offset;
+               int ret;
+
+               page = page_pool[i];
+
+               if (rc < 0 || i >= nrdpgs) {
+                       page_cache_release(page);
+                       continue;
+               }
+
+               SetPageUptodate(page);
+
+               dp = kmap(page);
+               hash = le64_to_cpu(dp->ldp_hash_start);
+               kunmap(page);
+
+               offset = hash_x_index(hash, hash64);
+
+               prefetchw(&page->flags);
+               ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
+                                           GFP_KERNEL);
+               if (ret == 0) {
+                       unlock_page(page);
+                       if (ll_pagevec_add(&lru_pvec, page) == 0)
+                               ll_pagevec_lru_add_file(&lru_pvec);
+               } else {
+                       CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:"
+                              " %d\n", offset, ret);
+               }
+               page_cache_release(page);
+       }
+       ll_pagevec_lru_add_file(&lru_pvec);
+
+       if (page_pool != &page0)
+               OBD_FREE(page_pool, sizeof(struct page *) * max_pages);
+       EXIT;
+       return rc;
+}
+
+static void ll_check_page(struct inode *dir, struct page *page)
+{
+       /* XXX: check page format later */
+       SetPageChecked(page);
+}
+
+void ll_release_page(struct page *page, int remove)
+{
+       kunmap(page);
+       if (remove) {
+               lock_page(page);
+               if (likely(page->mapping != NULL))
+                       truncate_complete_page(page->mapping, page);
+               unlock_page(page);
+       }
+       page_cache_release(page);
+}
+
+/*
+ * Find, kmap and return page that contains given hash.
+ */
+static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
+                                      __u64 *start, __u64 *end)
+{
+       int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+       struct address_space *mapping = dir->i_mapping;
+       /*
+        * Complement of hash is used as an index so that
+        * radix_tree_gang_lookup() can be used to find a page with starting
+        * hash _smaller_ than one we are looking for.
+        */
+       unsigned long offset = hash_x_index(*hash, hash64);
+       struct page *page;
+       int found;
+
+       TREE_READ_LOCK_IRQ(mapping);
+       found = radix_tree_gang_lookup(&mapping->page_tree,
+                                      (void **)&page, offset, 1);
+       if (found > 0) {
+               struct lu_dirpage *dp;
+
+               page_cache_get(page);
+               TREE_READ_UNLOCK_IRQ(mapping);
+               /*
+                * In contrast to find_lock_page() we are sure that directory
+                * page cannot be truncated (while DLM lock is held) and,
+                * hence, can avoid restart.
+                *
+                * In fact, page cannot be locked here at all, because
+                * ll_dir_filler() does synchronous io.
+                */
+               wait_on_page_locked(page);
+               if (PageUptodate(page)) {
+                       dp = kmap(page);
+                       if (BITS_PER_LONG == 32 && hash64) {
+                               *start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+                               *end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+                               *hash  = *hash >> 32;
+                       } else {
+                               *start = le64_to_cpu(dp->ldp_hash_start);
+                               *end   = le64_to_cpu(dp->ldp_hash_end);
+                       }
+                       LASSERTF(*start <= *hash, "start = "LPX64",end = "
+                                LPX64",hash = "LPX64"\n", *start, *end, *hash);
+                       CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash "LPU64"\n",
+                              offset, *start, *end, *hash);
+                       if (*hash > *end) {
+                               ll_release_page(page, 0);
+                               page = NULL;
+                       } else if (*end != *start && *hash == *end) {
+                               /*
+                                * upon hash collision, remove this page,
+                                * otherwise put page reference, and
+                                * ll_get_dir_page() will issue RPC to fetch
+                                * the page we want.
+                                */
+                               ll_release_page(page,
+                                   le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                               page = NULL;
+                       }
+               } else {
+                       page_cache_release(page);
+                       page = ERR_PTR(-EIO);
+               }
+
+       } else {
+               TREE_READ_UNLOCK_IRQ(mapping);
+               page = NULL;
+       }
+       return page;
+}
+
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+                            struct ll_dir_chain *chain)
+{
+       ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
+       struct address_space *mapping = dir->i_mapping;
+       struct lustre_handle lockh;
+       struct lu_dirpage *dp;
+       struct page *page;
+       ldlm_mode_t mode;
+       int rc;
+       __u64 start = 0;
+       __u64 end = 0;
+       __u64 lhash = hash;
+       struct ll_inode_info *lli = ll_i2info(dir);
+       int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
+
+       mode = LCK_PR;
+       rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
+                          ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
+       if (!rc) {
+               struct ldlm_enqueue_info einfo = {.ei_type = LDLM_IBITS,
+                                                 .ei_mode = mode,
+                                                 .ei_cb_bl =
+                                                 ll_md_blocking_ast,
+                                                 .ei_cb_cp =
+                                                 ldlm_completion_ast,
+                                                 .ei_cb_gl = NULL,
+                                                 .ei_cb_wg = NULL,
+                                                 .ei_cbdata = NULL};
+               struct lookup_intent it = { .it_op = IT_READDIR };
+               struct ptlrpc_request *request;
+               struct md_op_data *op_data;
+
+               op_data = ll_prep_md_op_data(NULL, dir, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       return (void *)op_data;
+
+               rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
+                               op_data, &lockh, NULL, 0, NULL, 0);
+
+               ll_finish_md_op_data(op_data);
+
+               request = (struct ptlrpc_request *)it.d.lustre.it_data;
+               if (request)
+                       ptlrpc_req_finished(request);
+               if (rc < 0) {
+                       CERROR("lock enqueue: "DFID" at "LPU64": rc %d\n",
+                               PFID(ll_inode2fid(dir)), hash, rc);
+                       return ERR_PTR(rc);
+               }
+
+               CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n",
+                      dir, dir->i_ino, dir->i_generation);
+               md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
+                                &it.d.lustre.it_lock_handle, dir, NULL);
+       } else {
+               /* for cross-ref object, l_ast_data of the lock may not be set,
+                * we reset it here */
+               md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
+                                dir, NULL);
+       }
+       ldlm_lock_dump_handle(D_OTHER, &lockh);
+
+       mutex_lock(&lli->lli_readdir_mutex);
+       page = ll_dir_page_locate(dir, &lhash, &start, &end);
+       if (IS_ERR(page)) {
+               CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n",
+                      PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
+               GOTO(out_unlock, page);
+       } else if (page != NULL) {
+               /*
+                * XXX nikita: not entirely correct handling of a corner case:
+                * suppose hash chain of entries with hash value HASH crosses
+                * border between pages P0 and P1. First both P0 and P1 are
+                * cached, seekdir() is called for some entry from the P0 part
+                * of the chain. Later P0 goes out of cache. telldir(HASH)
+                * happens and finds P1, as it starts with matching hash
+                * value. Remaining entries from P0 part of the chain are
+                * skipped. (Is that really a bug?)
+                *
+                * Possible solutions: 0. don't cache P1 in such a case, handle
+                * it as an "overflow" page. 1. invalidate all pages at
+                * once. 2. use HASH|1 as an index for P1.
+                */
+               GOTO(hash_collision, page);
+       }
+
+       page = read_cache_page(mapping, hash_x_index(hash, hash64),
+                              ll_dir_filler, &lhash);
+       if (IS_ERR(page)) {
+               CERROR("read cache page: "DFID" at "LPU64": rc %ld\n",
+                      PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
+               GOTO(out_unlock, page);
+       }
+
+       wait_on_page_locked(page);
+       (void)kmap(page);
+       if (!PageUptodate(page)) {
+               CERROR("page not updated: "DFID" at "LPU64": rc %d\n",
+                      PFID(ll_inode2fid(dir)), hash, -EIO);
+               goto fail;
+       }
+       if (!PageChecked(page))
+               ll_check_page(dir, page);
+       if (PageError(page)) {
+               CERROR("page error: "DFID" at "LPU64": rc %d\n",
+                      PFID(ll_inode2fid(dir)), hash, -EIO);
+               goto fail;
+       }
+hash_collision:
+       dp = page_address(page);
+       if (BITS_PER_LONG == 32 && hash64) {
+               start = le64_to_cpu(dp->ldp_hash_start) >> 32;
+               end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
+               lhash = hash >> 32;
+       } else {
+               start = le64_to_cpu(dp->ldp_hash_start);
+               end   = le64_to_cpu(dp->ldp_hash_end);
+               lhash = hash;
+       }
+       if (end == start) {
+               LASSERT(start == lhash);
+               CWARN("Page-wide hash collision: "LPU64"\n", end);
+               if (BITS_PER_LONG == 32 && hash64)
+                       CWARN("Real page-wide hash collision at ["LPU64" "LPU64
+                             "] with hash "LPU64"\n",
+                             le64_to_cpu(dp->ldp_hash_start),
+                             le64_to_cpu(dp->ldp_hash_end), hash);
+               /*
+                * Fetch whole overflow chain...
+                *
+                * XXX not yet.
+                */
+               goto fail;
+       }
+out_unlock:
+       mutex_unlock(&lli->lli_readdir_mutex);
+       ldlm_lock_decref(&lockh, mode);
+       return page;
+
+fail:
+       ll_release_page(page, 1);
+       page = ERR_PTR(-EIO);
+       goto out_unlock;
+}
+
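+/*
+ * Iterate over directory pages starting at hash position *_pos and feed
+ * each live entry (dummy zero-length records are skipped) to @filldir.
+ * On return *_pos holds the hash to resume from, or MDS_DIR_END_OFF once
+ * the end of the directory has been reached.
+ */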
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+               filldir_t filldir)
+{
+       struct ll_inode_info *info   = ll_i2info(inode);
+       struct ll_sb_info    *sbi    = ll_i2sbi(inode);
+       __u64                 pos    = *_pos;
+       int                   api32  = ll_need_32bit_api(sbi);
+       int                   hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
+       struct page          *page;
+       struct ll_dir_chain   chain;
+       int                   done = 0;
+       int                   rc = 0;
+       ENTRY;
+
+       ll_dir_chain_init(&chain);
+
+       page = ll_get_dir_page(inode, pos, &chain);
+
+       while (rc == 0 && !done) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (!IS_ERR(page)) {
+                       /*
+                        * If page is empty (end of directory is reached),
+                        * use this value.
+                        */
+                       __u64 hash = MDS_DIR_END_OFF;
+                       __u64 next;
+
+                       dp = page_address(page);
+                       for (ent = lu_dirent_start(dp); ent != NULL && !done;
+                            ent = lu_dirent_next(ent)) {
+                               __u16          type;
+                               int            namelen;
+                               struct lu_fid  fid;
+                               __u64          lhash;
+                               __u64          ino;
+
+                               /*
+                                * XXX: implement correct swabbing here.
+                                */
+
+                               hash = le64_to_cpu(ent->lde_hash);
+                               if (hash < pos)
+                                       /*
+                                        * Skip until we find target hash
+                                        * value.
+                                        */
+                                       continue;
+
+                               namelen = le16_to_cpu(ent->lde_namelen);
+                               if (namelen == 0)
+                                       /*
+                                        * Skip dummy record.
+                                        */
+                                       continue;
+
+                               if (api32 && hash64)
+                                       lhash = hash >> 32;
+                               else
+                                       lhash = hash;
+                               fid_le_to_cpu(&fid, &ent->lde_fid);
+                               ino = cl_fid_build_ino(&fid, api32);
+                               type = ll_dirent_type_get(ent);
+                               /* For 'll_nfs_get_name_filldir()', it will try
+                                * to access the 'ent' through its 'lde_name',
+                                * so the parameter 'name' for 'filldir()' must
+                                * be part of the 'ent'. */
+                               done = filldir(cookie, ent->lde_name, namelen,
+                                              lhash, ino, type);
+                       }
+                       next = le64_to_cpu(dp->ldp_hash_end);
+                       if (!done) {
+                               pos = next;
+                               if (pos == MDS_DIR_END_OFF) {
+                                       /*
+                                        * End of directory reached.
+                                        */
+                                       done = 1;
+                                       ll_release_page(page, 0);
+                               } else if (1 /* chain is exhausted */) {
+                                       /*
+                                        * Normal case: continue to the next
+                                        * page.
+                                        */
+                                       ll_release_page(page,
+                                           le32_to_cpu(dp->ldp_flags) &
+                                                       LDF_COLLIDE);
+                                       next = pos;
+                                       page = ll_get_dir_page(inode, pos,
+                                                              &chain);
+                               } else {
+                                       /*
+                                        * go into overflow page.
+                                        */
+                                       LASSERT(le32_to_cpu(dp->ldp_flags) &
+                                               LDF_COLLIDE);
+                                       ll_release_page(page, 1);
+                               }
+                       } else {
+                               pos = hash;
+                               ll_release_page(page, 0);
+                       }
+               } else {
+                       rc = PTR_ERR(page);
+                       CERROR("error reading dir "DFID" at %lu: rc %d\n",
+                              PFID(&info->lli_fid), (unsigned long)pos, rc);
+               }
+       }
+
+       *_pos = pos;
+       ll_dir_chain_fini(&chain);
+       RETURN(rc);
+}
+
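+/*
+ * VFS readdir entry point.  The on-wire position is a 64-bit hash; when
+ * the caller needs the 32-bit API only the top half is exposed in f_pos.
+ */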
+static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
+{
+       struct inode            *inode  = filp->f_dentry->d_inode;
+       struct ll_file_data     *lfd    = LUSTRE_FPRIVATE(filp);
+       struct ll_sb_info       *sbi    = ll_i2sbi(inode);
+       __u64                   pos     = lfd->lfd_pos;
+       int                     hash64  = sbi->ll_flags & LL_SBI_64BIT_HASH;
+       int                     api32   = ll_need_32bit_api(sbi);
+       int                     rc;
+       struct path             path;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu "
+              "32bit_api %d\n", inode->i_ino, inode->i_generation,
+              inode, (unsigned long)pos, i_size_read(inode), api32);
+
+       if (pos == MDS_DIR_END_OFF)
+               /*
+                * end-of-file.
+                */
+               GOTO(out, rc = 0);
+
+       rc = ll_dir_read(inode, &pos, cookie, filldir);
+       lfd->lfd_pos = pos;
+       if (pos == MDS_DIR_END_OFF) {
+               if (api32)
+                       filp->f_pos = LL_DIR_END_OFF_32BIT;
+               else
+                       filp->f_pos = LL_DIR_END_OFF;
+       } else {
+               if (api32 && hash64)
+                       filp->f_pos = pos >> 32;
+               else
+                       filp->f_pos = pos;
+       }
+       filp->f_version = inode->i_version;
+       path.mnt = filp->f_path.mnt;
+       path.dentry = filp->f_dentry;
+       touch_atime(&path);
+
+out:
+       if (!rc)
+               ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
+
+       RETURN(rc);
+}
+
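+/* Send a "key=value" style parameter string to the MGS through
+ * obd_set_info_async(). */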
+int ll_send_mgc_param(struct obd_export *mgc, char *string)
+{
+       struct mgs_send_param *msp;
+       int rc = 0;
+
+       OBD_ALLOC_PTR(msp);
+       if (!msp)
+               return -ENOMEM;
+
+       strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param));
+       rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
+                               sizeof(struct mgs_send_param), msp, NULL);
+       if (rc)
+               CERROR("Failed to set parameter: %d\n", rc);
+       OBD_FREE_PTR(msp);
+
+       return rc;
+}
+
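+/*
+ * Create the striped directory @filename under @dir: the lmv_user_md is
+ * handed to the MDS through md_create() with CLI_SET_MEA set.
+ */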
+int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump,
+                       char *filename)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int mode;
+       int err;
+
+       ENTRY;
+
+       mode = (0755 & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, filename,
+                                    strlen(filename), mode, LUSTRE_OPC_MKDIR,
+                                    lump);
+       if (IS_ERR(op_data))
+               GOTO(err_exit, err = PTR_ERR(op_data));
+
+       op_data->op_cli_flags |= CLI_SET_MEA;
+       err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
+                       current_fsuid(), current_fsgid(),
+                       cfs_curproc_cap_pack(), 0, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(err_exit, err);
+err_exit:
+       ptlrpc_req_finished(request);
+       return err;
+}
+
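+/*
+ * Set the striping of directory @inode from the user-supplied lov_user_md
+ * (swabbed to little endian before it is sent).  With @set_default on the
+ * filesystem root, the stripe size/count/offset are also pushed to the MGS
+ * so that they become the filesystem-wide defaults.
+ */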
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+                    int set_default)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       int rc = 0;
+       struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int lum_size;
+       ENTRY;
+
+       if (lump != NULL) {
+               /*
+                * This is coming from userspace, so should be in
+                * local endian.  But the MDS would like it in little
+                * endian, so we swab it before we send it.
+                */
+               switch (lump->lmm_magic) {
+               case LOV_USER_MAGIC_V1: {
+                       if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                               lustre_swab_lov_user_md_v1(lump);
+                       lum_size = sizeof(struct lov_user_md_v1);
+                       break;
+               }
+               case LOV_USER_MAGIC_V3: {
+                       if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                               lustre_swab_lov_user_md_v3(
+                                       (struct lov_user_md_v3 *)lump);
+                       lum_size = sizeof(struct lov_user_md_v3);
+                       break;
+               }
+               default: {
+                       CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                       " %#08x != %#08x nor %#08x\n",
+                                       lump->lmm_magic, LOV_USER_MAGIC_V1,
+                                       LOV_USER_MAGIC_V3);
+                       RETURN(-EINVAL);
+               }
+               }
+       } else {
+               lum_size = sizeof(struct lov_user_md_v1);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC))
+               op_data->op_cli_flags |= CLI_SET_MEA;
+
+       /* swabbing is done in lov_setstripe() on server side */
+       rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
+                       NULL, 0, &req, NULL);
+       ll_finish_md_op_data(op_data);
+       ptlrpc_req_finished(req);
+       if (rc) {
+               if (rc != -EPERM && rc != -EACCES)
+                       CERROR("mdc_setattr fails: rc = %d\n", rc);
+       }
+
+       /* In the following we use the fact that LOV_USER_MAGIC_V1 and
+        * LOV_USER_MAGIC_V3 have the same initial fields, so we do not
+        * need to make the distinction between the two versions. */
+       if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
+               char *param = NULL;
+               char *buf;
+
+               OBD_ALLOC(param, MGS_PARAM_MAXLEN);
+               if (param == NULL)
+                       GOTO(end, rc = -ENOMEM);
+
+               buf = param;
+               /* Get fsname and assume devname to be -MDT0000. */
+               ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
+               strcat(buf, "-MDT0000.lov");
+               buf += strlen(buf);
+
+               /* Set root stripesize */
+               sprintf(buf, ".stripesize=%u",
+                       lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+               if (rc)
+                       GOTO(end, rc);
+
+               /* Set root stripecount */
+               sprintf(buf, ".stripecount=%hd",
+                       lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+               if (rc)
+                       GOTO(end, rc);
+
+               /* Set root stripeoffset */
+               sprintf(buf, ".stripeoffset=%hd",
+                       lump ? le16_to_cpu(lump->lmm_stripe_offset) :
+                       (typeof(lump->lmm_stripe_offset))(-1));
+               rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
+
+end:
+               if (param != NULL)
+                       OBD_FREE(param, MGS_PARAM_MAXLEN);
+       }
+       RETURN(rc);
+}
+
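+/*
+ * Fetch the striping EA of directory @inode from the MDS.  On success the
+ * (CPU-endian) lov_mds_md and its size are returned through @lmmp and
+ * @lmm_size; the caller must drop @request with ptlrpc_req_finished().
+ */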
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+                    int *lmm_size, struct ptlrpc_request **request)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct mdt_body   *body;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *req = NULL;
+       int rc, lmmsize;
+       struct md_op_data *op_data;
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc)
+               RETURN(rc);
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                    0, lmmsize, LUSTRE_OPC_ANY,
+                                    NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+       rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr failed on inode "
+                      "%lu/%u: rc %d\n", inode->i_ino,
+                      inode->i_generation, rc);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       lmmsize = body->eadatasize;
+
+       if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+           lmmsize == 0) {
+               GOTO(out, rc = -ENODATA);
+       }
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill,
+                                          &RMF_MDT_MD, lmmsize);
+       LASSERT(lmm != NULL);
+
+       /*
+        * This is coming from the MDS, so is probably in
+        * little endian.  We convert it to host endian before
+        * passing it to userspace.
+        */
+       /* We don't swab objects for directories */
+       switch (le32_to_cpu(lmm->lmm_magic)) {
+       case LOV_MAGIC_V1:
+               if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+               break;
+       case LOV_MAGIC_V3:
+               if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+               break;
+       default:
+               CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+               rc = -EPROTO;
+       }
+out:
+       *lmmp = lmm;
+       *lmm_size = lmmsize;
+       *request = req;
+       return rc;
+}
+
+/*
+ *  Get MDT index for the inode.
+ */
+int ll_get_mdt_idx(struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       int rc, mdtidx;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_flags |= MF_GET_MDT_IDX;
+       rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
+       mdtidx = op_data->op_mds;
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+               RETURN(rc);
+       }
+       return mdtidx;
+}
+
+/**
+ * Generic handler to do any pre-copy work.
+ *
+ * It sends a first hsm_progress (with extent length == 0) to the coordinator
+ * as an early notification that the real work has started.
+ *
+ * Moreover, for an ARCHIVE request, it will sample the file data version and
+ * store it in \a copy.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
+{
+       struct ll_sb_info               *sbi = ll_s2sbi(sb);
+       struct hsm_progress_kernel       hpk;
+       int                              rc;
+       ENTRY;
+
+       /* Forge a hsm_progress based on data from copy. */
+       hpk.hpk_fid = copy->hc_hai.hai_fid;
+       hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+       hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
+       hpk.hpk_extent.length = 0;
+       hpk.hpk_flags = 0;
+       hpk.hpk_errval = 0;
+       hpk.hpk_data_version = 0;
+
+       /* For archive request, we need to read the current file version. */
+       if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
+               struct inode    *inode;
+               __u64            data_version = 0;
+
+               /* Get inode for this fid */
+               inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+               if (IS_ERR(inode)) {
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval is >= 0 */
+                       hpk.hpk_errval = -PTR_ERR(inode);
+                       GOTO(progress, rc = PTR_ERR(inode));
+               }
+
+               /* Read current file data version */
+               rc = ll_data_version(inode, &data_version, 1);
+               iput(inode);
+               if (rc != 0) {
+                       CDEBUG(D_HSM, "Could not read file data version of "
+                                     DFID" (rc = %d). Archive request ("
+                                     LPX64") could not be done.\n",
+                                     PFID(&copy->hc_hai.hai_fid), rc,
+                                     copy->hc_hai.hai_cookie);
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = -rc;
+                       GOTO(progress, rc);
+               }
+
+               /* Store it in the hsm_copy for later copytool use.
+                * Always modified even if no lsm. */
+               copy->hc_data_version = data_version;
+       }
+
+progress:
+       rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+                          &hpk, NULL);
+
+       RETURN(rc);
+}
+
+/**
+ * Generic handler to do any post-copy work.
+ *
+ * It will send the last hsm_progress update to coordinator to inform it
+ * that copy is finished and whether it was successful or not.
+ *
+ * Moreover,
+ * - for ARCHIVE request, it will sample the file data version and compare it
+ *   with the version saved in ll_ioc_copy_start(). If they do not match, copy
+ *   will be considered as failed.
+ * - for RESTORE request, it will sample the file data version and send it to
+ *   coordinator which is useful if the file was imported as 'released'.
+ *
+ * \return 0 on success.
+ */
+static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
+{
+       struct ll_sb_info               *sbi = ll_s2sbi(sb);
+       struct hsm_progress_kernel       hpk;
+       int                              rc;
+       ENTRY;
+
+       /* If you modify the logic here, also check llapi_hsm_copy_end(). */
+       /* Take care: copy->hc_hai.hai_action, len, gid and data are not
+        * initialized if copy_end was called with copy == NULL.
+        */
+
+       /* Forge a hsm_progress based on data from copy. */
+       hpk.hpk_fid = copy->hc_hai.hai_fid;
+       hpk.hpk_cookie = copy->hc_hai.hai_cookie;
+       hpk.hpk_extent = copy->hc_hai.hai_extent;
+       hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
+       hpk.hpk_errval = copy->hc_errval;
+       hpk.hpk_data_version = 0;
+
+       /* For archive request, we need to check the file data was not changed.
+        *
+        * For restore request, we need to send the file data version, this is
+        * useful when the file was created using hsm_import.
+        */
+       if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
+            (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
+           (copy->hc_errval == 0)) {
+               struct inode    *inode;
+               __u64            data_version = 0;
+
+               /* Get lsm for this fid */
+               inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
+               if (IS_ERR(inode)) {
+                       hpk.hpk_flags |= HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = -PTR_ERR(inode);
+                       GOTO(progress, rc = PTR_ERR(inode));
+               }
+
+               rc = ll_data_version(inode, &data_version,
+                                    copy->hc_hai.hai_action == HSMA_ARCHIVE);
+               iput(inode);
+               if (rc) {
+                       CDEBUG(D_HSM, "Could not read file data version. "
+                                     "Request could not be confirmed.\n");
+                       if (hpk.hpk_errval == 0)
+                               hpk.hpk_errval = -rc;
+                       GOTO(progress, rc);
+               }
+
+               /* Store it in the progress record for later copytool use.
+                * Always modified even if no lsm. */
+               hpk.hpk_data_version = data_version;
+
+               /* File could have been stripped during archiving, so we need
+                * to check anyway. */
+               if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
+                   (copy->hc_data_version != data_version)) {
+                       CDEBUG(D_HSM, "File data version mismatched. "
+                             "File content was changed during archiving. "
+                              DFID", start:"LPX64" current:"LPX64"\n",
+                              PFID(&copy->hc_hai.hai_fid),
+                              copy->hc_data_version, data_version);
+                       /* File was changed, send an error to the cdt. Do not
+                        * ask for a retry because if a file is modified
+                        * frequently the cdt will loop on retried archive
+                        * requests.  The policy engine will ask for a new
+                        * archive later, once the file has not been modified
+                        * for some tunable time. */
+                       /* we do not notify caller */
+                       hpk.hpk_flags &= ~HP_FLAG_RETRY;
+                       /* hpk_errval must be >= 0 */
+                       hpk.hpk_errval = EBUSY;
+               }
+
+       }
+
+progress:
+       rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
+                          &hpk, NULL);
+
+       RETURN(rc);
+}
+
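+/* Copy a fixed-size ioctl argument in from user space and hand the kernel
+ * copy to obd_iocontrol(). */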
+static int copy_and_ioctl(int cmd, struct obd_export *exp, void *data, int len)
+{
+       void *ptr;
+       int rc;
+
+       OBD_ALLOC(ptr, len);
+       if (ptr == NULL)
+               return -ENOMEM;
+       if (copy_from_user(ptr, data, len)) {
+               OBD_FREE(ptr, len);
+               return -EFAULT;
+       }
+       rc = obd_iocontrol(cmd, exp, len, ptr, NULL);
+       OBD_FREE(ptr, len);
+       return rc;
+}
+
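+/*
+ * Dispatch an if_quotactl request: per-target queries (QC_MDTIDX, QC_OSTIDX,
+ * QC_UUID) are converted to Q_GETO* commands and sent via obd_iocontrol(),
+ * while QC_GENERAL requests go through obd_quotactl(), collecting space
+ * usage from the OSTs and MDTs when the master copy lacks QIF_SPACE.
+ */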
+static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
+{
+       int cmd = qctl->qc_cmd;
+       int type = qctl->qc_type;
+       int id = qctl->qc_id;
+       int valid = qctl->qc_valid;
+       int rc = 0;
+       ENTRY;
+
+       switch (cmd) {
+       case LUSTRE_Q_INVALIDATE:
+       case LUSTRE_Q_FINVALIDATE:
+       case Q_QUOTAON:
+       case Q_QUOTAOFF:
+       case Q_SETQUOTA:
+       case Q_SETINFO:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+               break;
+       case Q_GETQUOTA:
+               if (((type == USRQUOTA && current_euid() != id) ||
+                    (type == GRPQUOTA && !in_egroup_p(id))) &&
+                   (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                    sbi->ll_flags & LL_SBI_RMT_CLIENT))
+                       RETURN(-EPERM);
+               break;
+       case Q_GETINFO:
+               break;
+       default:
+               CERROR("unsupported quotactl op: %#x\n", cmd);
+               RETURN(-ENOTTY);
+       }
+
+       if (valid != QC_GENERAL) {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EOPNOTSUPP);
+
+               if (cmd == Q_GETINFO)
+                       qctl->qc_cmd = Q_GETOINFO;
+               else if (cmd == Q_GETQUOTA)
+                       qctl->qc_cmd = Q_GETOQUOTA;
+               else
+                       RETURN(-EINVAL);
+
+               switch (valid) {
+               case QC_MDTIDX:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       break;
+               case QC_OSTIDX:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       break;
+               case QC_UUID:
+                       rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
+                                          sizeof(*qctl), qctl, NULL);
+                       if (rc == -EAGAIN)
+                               rc = obd_iocontrol(OBD_IOC_QUOTACTL,
+                                                  sbi->ll_dt_exp,
+                                                  sizeof(*qctl), qctl, NULL);
+                       break;
+               default:
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (rc)
+                       RETURN(rc);
+
+               qctl->qc_cmd = cmd;
+       } else {
+               struct obd_quotactl *oqctl;
+
+               OBD_ALLOC_PTR(oqctl);
+               if (oqctl == NULL)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(sbi->ll_md_exp, oqctl);
+               if (rc) {
+                       if (rc != -EALREADY && cmd == Q_QUOTAON) {
+                               oqctl->qc_cmd = Q_QUOTAOFF;
+                               obd_quotactl(sbi->ll_md_exp, oqctl);
+                       }
+                       OBD_FREE_PTR(oqctl);
+                       RETURN(rc);
+               }
+               /* If QIF_SPACE is not set, client should collect the
+                * space usage from OSSs by itself */
+               if (cmd == Q_GETQUOTA &&
+                   !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
+                   !oqctl->qc_dqblk.dqb_curspace) {
+                       struct obd_quotactl *oqctl_tmp;
+
+                       OBD_ALLOC_PTR(oqctl_tmp);
+                       if (oqctl_tmp == NULL)
+                               GOTO(out, rc = -ENOMEM);
+
+                       oqctl_tmp->qc_cmd = Q_GETOQUOTA;
+                       oqctl_tmp->qc_id = oqctl->qc_id;
+                       oqctl_tmp->qc_type = oqctl->qc_type;
+
+                       /* collect space usage from OSTs */
+                       oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+                       rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
+                       if (!rc || rc == -EREMOTEIO) {
+                               oqctl->qc_dqblk.dqb_curspace =
+                                       oqctl_tmp->qc_dqblk.dqb_curspace;
+                               oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
+                       }
+
+                       /* collect space & inode usage from MDTs */
+                       oqctl_tmp->qc_dqblk.dqb_curspace = 0;
+                       oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
+                       rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
+                       if (!rc || rc == -EREMOTEIO) {
+                               oqctl->qc_dqblk.dqb_curspace +=
+                                       oqctl_tmp->qc_dqblk.dqb_curspace;
+                               oqctl->qc_dqblk.dqb_curinodes =
+                                       oqctl_tmp->qc_dqblk.dqb_curinodes;
+                               oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
+                       } else {
+                               oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
+                       }
+
+                       OBD_FREE_PTR(oqctl_tmp);
+               }
+out:
+               QCTL_COPY(qctl, oqctl);
+               OBD_FREE_PTR(oqctl);
+       }
+
+       RETURN(rc);
+}
+
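+/* Copy a NUL-terminated filename from user space into a __getname() buffer,
+ * returning an ERR_PTR on failure. */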
+static char *
+ll_getname(const char __user *filename)
+{
+       int ret = 0, len;
+       char *tmp = __getname();
+
+       if (!tmp)
+               return ERR_PTR(-ENOMEM);
+
+       len = strncpy_from_user(tmp, filename, PATH_MAX);
+       if (len < 0)
+               ret = len;
+       else if (len == 0)
+               ret = -ENOENT;
+       else if (len >= PATH_MAX)
+               ret = -ENAMETOOLONG;
+
+       if (ret) {
+               __putname(tmp);
+               tmp = ERR_PTR(ret);
+       }
+       return tmp;
+}
+
+#define ll_putname(filename) __putname(filename)
+
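+/*
+ * ioctl handler for Lustre directories: striping, quota, HSM, changelog and
+ * fid operations that have no regular VFS equivalent.  Commands not handled
+ * here fall through to obd_iocontrol() on the data export.
+ */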
+static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_ioctl_data *data;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n",
+              inode->i_ino, inode->i_generation, inode, cmd);
+
+       /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
+       if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+               return -ENOTTY;
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+       switch (cmd) {
+       case FSFILT_IOC_GETFLAGS:
+       case FSFILT_IOC_SETFLAGS:
+               RETURN(ll_iocontrol(inode, file, cmd, arg));
+       case FSFILT_IOC_GETVERSION_OLD:
+       case FSFILT_IOC_GETVERSION:
+               RETURN(put_user(inode->i_generation, (int *)arg));
+       /* We need to special case any other ioctls we want to handle,
+        * to send them to the MDS/OST as appropriate and to properly
+        * network encode the arg field.
+       case FSFILT_IOC_SETVERSION_OLD:
+       case FSFILT_IOC_SETVERSION:
+       */
+       case LL_IOC_GET_MDTIDX: {
+               int mdtidx;
+
+               mdtidx = ll_get_mdt_idx(inode);
+               if (mdtidx < 0)
+                       RETURN(mdtidx);
+
+               if (put_user((int)mdtidx, (int *)arg))
+                       RETURN(-EFAULT);
+
+               return 0;
+       }
+       case IOC_MDC_LOOKUP: {
+               struct ptlrpc_request *request = NULL;
+               int namelen, len = 0;
+               char *buf = NULL;
+               char *filename;
+               struct md_op_data *op_data;
+
+               rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+               if (rc)
+                       RETURN(rc);
+               data = (void *)buf;
+
+               filename = data->ioc_inlbuf1;
+               namelen = strlen(filename);
+
+               if (namelen < 1) {
+                       CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+                       GOTO(out_free, rc = -EINVAL);
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen,
+                                            0, LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       GOTO(out_free, rc = PTR_ERR(op_data));
+
+               op_data->op_valid = OBD_MD_FLID;
+               rc = md_getattr_name(sbi->ll_md_exp, op_data, &request);
+               ll_finish_md_op_data(op_data);
+               if (rc < 0) {
+                       CDEBUG(D_INFO, "md_getattr_name: %d\n", rc);
+                       GOTO(out_free, rc);
+               }
+               ptlrpc_req_finished(request);
+               EXIT;
+out_free:
+               obd_ioctl_freedata(buf, len);
+               return rc;
+       }
+       case LL_IOC_LMV_SETSTRIPE: {
+               struct lmv_user_md *lum;
+               char               *buf = NULL;
+               char               *filename;
+               int                 namelen = 0;
+               int                 lumlen = 0;
+               int                 len;
+               int                 rc;
+
+               rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+               if (rc)
+                       RETURN(rc);
+
+               data = (void *)buf;
+               if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL ||
+                   data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0)
+                       GOTO(lmv_out_free, rc = -EINVAL);
+
+               filename = data->ioc_inlbuf1;
+               namelen = data->ioc_inllen1;
+
+               if (namelen < 1) {
+                       CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
+                       GOTO(lmv_out_free, rc = -EINVAL);
+               }
+               lum = (struct lmv_user_md *)data->ioc_inlbuf2;
+               lumlen = data->ioc_inllen2;
+
+               if (lum->lum_magic != LMV_USER_MAGIC ||
+                   lumlen != sizeof(*lum)) {
+                       CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
+                              filename, lum->lum_magic, lumlen, -EINVAL);
+                       GOTO(lmv_out_free, rc = -EINVAL);
+               }
+
+               /*
+                * ll_dir_setdirstripe() will be used to set the dir stripe:
+                *   mdc_create --> mdt_reint_create (with dirstripe)
+                */
+               rc = ll_dir_setdirstripe(inode, lum, filename);
+lmv_out_free:
+               obd_ioctl_freedata(buf, len);
+               RETURN(rc);
+
+       }
+       case LL_IOC_LOV_SETSTRIPE: {
+               struct lov_user_md_v3 lumv3;
+               struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+               struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+               struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
+               int set_default = 0;
+
+               LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+               LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+                       sizeof(lumv3p->lmm_objects[0]));
+               /* first try with v1 which is smaller than v3 */
+               if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
+                       RETURN(-EFAULT);
+
+               if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                       if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
+                               RETURN(-EFAULT);
+               }
+
+               if (inode->i_sb->s_root == file->f_dentry)
+                       set_default = 1;
+
+               /* in v1 and v3 cases lumv1 points to data */
+               rc = ll_dir_setstripe(inode, lumv1, set_default);
+
+               RETURN(rc);
+       }
+       case LL_IOC_LMV_GETSTRIPE: {
+               struct lmv_user_md *lump = (struct lmv_user_md *)arg;
+               struct lmv_user_md lum;
+               struct lmv_user_md *tmp;
+               int lum_size;
+               int rc = 0;
+               int mdtindex;
+
+               if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md)))
+                       RETURN(-EFAULT);
+
+               if (lum.lum_magic != LMV_MAGIC_V1)
+                       RETURN(-EINVAL);
+
+               lum_size = lmv_user_md_size(1, LMV_MAGIC_V1);
+               OBD_ALLOC(tmp, lum_size);
+               if (tmp == NULL)
+                       GOTO(free_lmv, rc = -ENOMEM);
+
+               memcpy(tmp, &lum, sizeof(lum));
+               tmp->lum_type = LMV_STRIPE_TYPE;
+               tmp->lum_stripe_count = 1;
+               mdtindex = ll_get_mdt_idx(inode);
+               if (mdtindex < 0)
+                       GOTO(free_lmv, rc = mdtindex);
+
+               tmp->lum_stripe_offset = mdtindex;
+               tmp->lum_objects[0].lum_mds = mdtindex;
+               memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode),
+                      sizeof(struct lu_fid));
+               if (copy_to_user((void *)arg, tmp, lum_size))
+                       GOTO(free_lmv, rc = -EFAULT);
+free_lmv:
+               if (tmp)
+                       OBD_FREE(tmp, lum_size);
+               RETURN(rc);
+       }
+       case LL_IOC_REMOVE_ENTRY: {
+               char *filename = NULL;
+               int   namelen = 0;
+               int   rc;
+
+               /* Here is a little hack to avoid sending REINT_RMENTRY to an
+                * unsupported server, which might crash the server (LU-2730).
+                * Because both LVB_TYPE and REINT_RMENTRY will be supported
+                * in 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the
+                * server will support REINT_RMENTRY. XXX */
+               if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE))
+                       return -ENOTSUPP;
+
+               filename = ll_getname((const char *)arg);
+               if (IS_ERR(filename))
+                       RETURN(PTR_ERR(filename));
+
+               namelen = strlen(filename);
+               if (namelen < 1)
+                       GOTO(out_rmdir, rc = -EINVAL);
+
+               rc = ll_rmdir_entry(inode, filename, namelen);
+out_rmdir:
+               if (filename)
+                       ll_putname(filename);
+               RETURN(rc);
+       }
+       case LL_IOC_LOV_SWAP_LAYOUTS:
+               RETURN(-EPERM);
+       case LL_IOC_OBD_STATFS:
+               RETURN(ll_obd_statfs(inode, (void *)arg));
+       case LL_IOC_LOV_GETSTRIPE:
+       case LL_IOC_MDC_GETINFO:
+       case IOC_MDC_GETFILEINFO:
+       case IOC_MDC_GETFILESTRIPE: {
+               struct ptlrpc_request *request = NULL;
+               struct lov_user_md *lump;
+               struct lov_mds_md *lmm = NULL;
+               struct mdt_body *body;
+               char *filename = NULL;
+               int lmmsize;
+
+               if (cmd == IOC_MDC_GETFILEINFO ||
+                   cmd == IOC_MDC_GETFILESTRIPE) {
+                       filename = ll_getname((const char *)arg);
+                       if (IS_ERR(filename))
+                               RETURN(PTR_ERR(filename));
+
+                       rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
+                                                     &lmmsize, &request);
+               } else {
+                       rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+               }
+
+               if (request) {
+                       body = req_capsule_server_get(&request->rq_pill,
+                                                     &RMF_MDT_BODY);
+                       LASSERT(body != NULL);
+               } else {
+                       GOTO(out_req, rc);
+               }
+
+               if (rc < 0) {
+                       if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
+                                              cmd == LL_IOC_MDC_GETINFO))
+                               GOTO(skip_lmm, rc = 0);
+                       else
+                               GOTO(out_req, rc);
+               }
+
+               if (cmd == IOC_MDC_GETFILESTRIPE ||
+                   cmd == LL_IOC_LOV_GETSTRIPE) {
+                       lump = (struct lov_user_md *)arg;
+               } else {
+                       struct lov_user_mds_data *lmdp;
+                       lmdp = (struct lov_user_mds_data *)arg;
+                       lump = &lmdp->lmd_lmm;
+               }
+               if (copy_to_user(lump, lmm, lmmsize)) {
+                       if (copy_to_user(lump, lmm, sizeof(*lump)))
+                               GOTO(out_req, rc = -EFAULT);
+                       rc = -EOVERFLOW;
+               }
+       skip_lmm:
+               if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
+                       struct lov_user_mds_data *lmdp;
+                       lstat_t st = { 0 };
+
+                       st.st_dev     = inode->i_sb->s_dev;
+                       st.st_mode    = body->mode;
+                       st.st_nlink   = body->nlink;
+                       st.st_uid     = body->uid;
+                       st.st_gid     = body->gid;
+                       st.st_rdev    = body->rdev;
+                       st.st_size    = body->size;
+                       st.st_blksize = PAGE_CACHE_SIZE;
+                       st.st_blocks  = body->blocks;
+                       st.st_atime   = body->atime;
+                       st.st_mtime   = body->mtime;
+                       st.st_ctime   = body->ctime;
+                       st.st_ino     = inode->i_ino;
+
+                       lmdp = (struct lov_user_mds_data *)arg;
+                       if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st)))
+                               GOTO(out_req, rc = -EFAULT);
+               }
+
+               EXIT;
+       out_req:
+               ptlrpc_req_finished(request);
+               if (filename)
+                       ll_putname(filename);
+               return rc;
+       }
+       case IOC_LOV_GETINFO: {
+               struct lov_user_mds_data *lumd;
+               struct lov_stripe_md *lsm;
+               struct lov_user_md *lum;
+               struct lov_mds_md *lmm;
+               int lmmsize;
+               lstat_t st;
+
+               lumd = (struct lov_user_mds_data *)arg;
+               lum = &lumd->lmd_lmm;
+
+               rc = ll_get_max_mdsize(sbi, &lmmsize);
+               if (rc)
+                       RETURN(rc);
+
+               OBD_ALLOC_LARGE(lmm, lmmsize);
+               if (lmm == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(lmm, lum, lmmsize))
+                       GOTO(free_lmm, rc = -EFAULT);
+
+               switch (lmm->lmm_magic) {
+               case LOV_USER_MAGIC_V1:
+                       if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+                               break;
+                       /* swab objects first so that stripes num will be sane */
+                       lustre_swab_lov_user_md_objects(
+                               ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                               ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                       break;
+               case LOV_USER_MAGIC_V3:
+                       if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+                               break;
+                       /* swab objects first so that stripes num will be sane */
+                       lustre_swab_lov_user_md_objects(
+                               ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                               ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                       break;
+               default:
+                       GOTO(free_lmm, rc = -EINVAL);
+               }
+
+               rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
+               if (rc < 0)
+                       GOTO(free_lmm, rc);
+
+               /* Perform glimpse_size operation. */
+               memset(&st, 0, sizeof(st));
+
+               rc = ll_glimpse_ioctl(sbi, lsm, &st);
+               if (rc)
+                       GOTO(free_lsm, rc);
+
+               if (copy_to_user(&lumd->lmd_st, &st, sizeof(st)))
+                       GOTO(free_lsm, rc = -EFAULT);
+
+               EXIT;
+       free_lsm:
+               obd_free_memmd(sbi->ll_dt_exp, &lsm);
+       free_lmm:
+               OBD_FREE_LARGE(lmm, lmmsize);
+               return rc;
+       }
+       case OBD_IOC_LLOG_CATINFO: {
+               RETURN(-EOPNOTSUPP);
+       }
+       case OBD_IOC_QUOTACHECK: {
+               struct obd_quotactl *oqctl;
+               int error = 0;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+               oqctl->qc_type = arg;
+               rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
+               if (rc < 0) {
+                       CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
+                       error = rc;
+               }
+
+               rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
+               if (rc < 0)
+                       CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
+
+               OBD_FREE_PTR(oqctl);
+               return error ?: rc;
+       }
+       case OBD_IOC_POLL_QUOTACHECK: {
+               struct if_quotacheck *check;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN) ||
+                   sbi->ll_flags & LL_SBI_RMT_CLIENT)
+                       RETURN(-EPERM);
+
+               OBD_ALLOC_PTR(check);
+               if (!check)
+                       RETURN(-ENOMEM);
+
+               rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
+                                  NULL);
+               if (rc) {
+                       CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+                       if (copy_to_user((void *)arg, check,
+                                            sizeof(*check)))
+                               CDEBUG(D_QUOTA, "copy_to_user failed\n");
+                       GOTO(out_poll, rc);
+               }
+
+               rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
+                                  NULL);
+               if (rc) {
+                       CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+                       if (copy_to_user((void *)arg, check,
+                                            sizeof(*check)))
+                               CDEBUG(D_QUOTA, "copy_to_user failed\n");
+                       GOTO(out_poll, rc);
+               }
+       out_poll:
+               OBD_FREE_PTR(check);
+               RETURN(rc);
+       }
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+       case LL_IOC_QUOTACTL_18: {
+               /* copy the old 1.x quota struct for internal use, then copy
+                * back into old format struct.  For 1.8 compatibility. */
+               struct if_quotactl_18 *qctl_18;
+               struct if_quotactl *qctl_20;
+
+               OBD_ALLOC_PTR(qctl_18);
+               if (!qctl_18)
+                       RETURN(-ENOMEM);
+
+               OBD_ALLOC_PTR(qctl_20);
+               if (!qctl_20)
+                       GOTO(out_quotactl_18, rc = -ENOMEM);
+
+               if (copy_from_user(qctl_18, (void *)arg, sizeof(*qctl_18)))
+                       GOTO(out_quotactl_20, rc = -EFAULT);
+
+               QCTL_COPY(qctl_20, qctl_18);
+               qctl_20->qc_idx = 0;
+
+               /* XXX: dqb_valid was borrowed as a flag to mark that
+                *      only mds quota is wanted */
+               if (qctl_18->qc_cmd == Q_GETQUOTA &&
+                   qctl_18->qc_dqblk.dqb_valid) {
+                       qctl_20->qc_valid = QC_MDTIDX;
+                       qctl_20->qc_dqblk.dqb_valid = 0;
+               } else if (qctl_18->obd_uuid.uuid[0] != '\0') {
+                       qctl_20->qc_valid = QC_UUID;
+                       qctl_20->obd_uuid = qctl_18->obd_uuid;
+               } else {
+                       qctl_20->qc_valid = QC_GENERAL;
+               }
+
+               rc = quotactl_ioctl(sbi, qctl_20);
+
+               if (rc == 0) {
+                       QCTL_COPY(qctl_18, qctl_20);
+                       qctl_18->obd_uuid = qctl_20->obd_uuid;
+
+                       if (copy_to_user((void *)arg, qctl_18,
+                                            sizeof(*qctl_18)))
+                               rc = -EFAULT;
+               }
+
+       out_quotactl_20:
+               OBD_FREE_PTR(qctl_20);
+       out_quotactl_18:
+               OBD_FREE_PTR(qctl_18);
+               RETURN(rc);
+       }
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+       case LL_IOC_QUOTACTL: {
+               struct if_quotactl *qctl;
+
+               OBD_ALLOC_PTR(qctl);
+               if (!qctl)
+                       RETURN(-ENOMEM);
+
+               if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
+                       GOTO(out_quotactl, rc = -EFAULT);
+
+               rc = quotactl_ioctl(sbi, qctl);
+
+               if (rc == 0 && copy_to_user((void *)arg, qctl, sizeof(*qctl)))
+                       rc = -EFAULT;
+
+       out_quotactl:
+               OBD_FREE_PTR(qctl);
+               RETURN(rc);
+       }
+       case OBD_IOC_GETDTNAME:
+       case OBD_IOC_GETMDNAME:
+               RETURN(ll_get_obd_name(inode, cmd, arg));
+       case LL_IOC_FLUSHCTX:
+               RETURN(ll_flush_ctx(inode));
+#ifdef CONFIG_FS_POSIX_ACL
+       case LL_IOC_RMTACL: {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+                   inode == inode->i_sb->s_root->d_inode) {
+                       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+                       LASSERT(fd != NULL);
+                       rc = rct_add(&sbi->ll_rct, current_pid(), arg);
+                       if (!rc)
+                               fd->fd_flags |= LL_FILE_RMTACL;
+                       RETURN(rc);
+               }
+               RETURN(0);
+       }
+#endif
+       case LL_IOC_GETOBDCOUNT: {
+               int count, vallen;
+               struct obd_export *exp;
+
+               if (copy_from_user(&count, (int *)arg, sizeof(int)))
+                       RETURN(-EFAULT);
+
+               /* get ost count when count is zero, get mdt count otherwise */
+               exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
+               vallen = sizeof(count);
+               rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
+                                 KEY_TGT_COUNT, &vallen, &count, NULL);
+               if (rc) {
+                       CERROR("get target count failed: %d\n", rc);
+                       RETURN(rc);
+               }
+
+               if (copy_to_user((int *)arg, &count, sizeof(int)))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case LL_IOC_PATH2FID:
+               if (copy_to_user((void *)arg, ll_inode2fid(inode),
+                                    sizeof(struct lu_fid)))
+                       RETURN(-EFAULT);
+               RETURN(0);
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void*)arg));
+       }
+       case OBD_IOC_CHANGELOG_SEND:
+       case OBD_IOC_CHANGELOG_CLEAR:
+               rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+                                   sizeof(struct ioc_changelog));
+               RETURN(rc);
+       case OBD_IOC_FID2PATH:
+               RETURN(ll_fid2path(inode, (void *)arg));
+       case LL_IOC_HSM_REQUEST: {
+               struct hsm_user_request *hur;
+               int                      totalsize;
+
+               OBD_ALLOC_PTR(hur);
+               if (hur == NULL)
+                       RETURN(-ENOMEM);
+
+               /* We don't know the true size yet; copy the fixed-size part */
+               if (copy_from_user(hur, (void *)arg, sizeof(*hur))) {
+                       OBD_FREE_PTR(hur);
+                       RETURN(-EFAULT);
+               }
+
+               /* Compute the whole struct size */
+               totalsize = hur_len(hur);
+               OBD_FREE_PTR(hur);
+               OBD_ALLOC_LARGE(hur, totalsize);
+               if (hur == NULL)
+                       RETURN(-ENOMEM);
+
+               /* Copy the whole struct */
+               if (copy_from_user(hur, (void *)arg, totalsize)) {
+                       OBD_FREE_LARGE(hur, totalsize);
+                       RETURN(-EFAULT);
+               }
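+               /* XXX: userspace may rewrite the buffer between the two
+                * copies above, so the counts used by hur_len() could have
+                * changed; a paranoid implementation would re-check that
+                * hur_len(hur) still equals totalsize here. */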
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
+                                  hur, NULL);
+
+               OBD_FREE_LARGE(hur, totalsize);
+
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_PROGRESS: {
+               struct hsm_progress_kernel      hpk;
+               struct hsm_progress             hp;
+
+               if (copy_from_user(&hp, (void *)arg, sizeof(hp)))
+                       RETURN(-EFAULT);
+
+               hpk.hpk_fid = hp.hp_fid;
+               hpk.hpk_cookie = hp.hp_cookie;
+               hpk.hpk_extent = hp.hp_extent;
+               hpk.hpk_flags = hp.hp_flags;
+               hpk.hpk_errval = hp.hp_errval;
+               hpk.hpk_data_version = 0;
+
+               /* The file may not exist in Lustre; all progress is
+                * reported to the Lustre root. */
+               rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
+                                  NULL);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_CT_START:
+               rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg,
+                                   sizeof(struct lustre_kernelcomm));
+               RETURN(rc);
+
+       case LL_IOC_HSM_COPY_START: {
+               struct hsm_copy *copy;
+               int              rc;
+
+               OBD_ALLOC_PTR(copy);
+               if (copy == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+                       OBD_FREE_PTR(copy);
+                       RETURN(-EFAULT);
+               }
+
+               rc = ll_ioc_copy_start(inode->i_sb, copy);
+               if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+                       rc = -EFAULT;
+
+               OBD_FREE_PTR(copy);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_COPY_END: {
+               struct hsm_copy *copy;
+               int              rc;
+
+               OBD_ALLOC_PTR(copy);
+               if (copy == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(copy, (char *)arg, sizeof(*copy))) {
+                       OBD_FREE_PTR(copy);
+                       RETURN(-EFAULT);
+               }
+
+               rc = ll_ioc_copy_end(inode->i_sb, copy);
+               if (copy_to_user((char *)arg, copy, sizeof(*copy)))
+                       rc = -EFAULT;
+
+               OBD_FREE_PTR(copy);
+               RETURN(rc);
+       }
+       default:
+               RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
+                                    (void *)arg));
+       }
+}
+
+static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       int api32 = ll_need_32bit_api(sbi);
+       loff_t ret = -EINVAL;
+       ENTRY;
+
+       mutex_lock(&inode->i_mutex);
+       switch (origin) {
+       case SEEK_SET:
+               break;
+       case SEEK_CUR:
+               offset += file->f_pos;
+               break;
+       case SEEK_END:
+               if (offset > 0)
+                       GOTO(out, ret);
+               if (api32)
+                       offset += LL_DIR_END_OFF_32BIT;
+               else
+                       offset += LL_DIR_END_OFF;
+               break;
+       default:
+               GOTO(out, ret);
+       }
+
+       if (offset >= 0 &&
+           ((api32 && offset <= LL_DIR_END_OFF_32BIT) ||
+            (!api32 && offset <= LL_DIR_END_OFF))) {
+               if (offset != file->f_pos) {
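+                       /* Translate the VFS offset into an MDS readdir
+                        * cookie: the end offset maps to MDS_DIR_END_OFF,
+                        * and in 32-bit API mode a 64-bit hash is exposed
+                        * to userspace only by its top half, so shift it
+                        * back into place. */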
+                       if ((api32 && offset == LL_DIR_END_OFF_32BIT) ||
+                           (!api32 && offset == LL_DIR_END_OFF))
+                               fd->lfd_pos = MDS_DIR_END_OFF;
+                       else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
+                               fd->lfd_pos = offset << 32;
+                       else
+                               fd->lfd_pos = offset;
+                       file->f_pos = offset;
+                       file->f_version = 0;
+               }
+               ret = offset;
+       }
+       GOTO(out, ret);
+
+out:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
+
+int ll_dir_open(struct inode *inode, struct file *file)
+{
+       ENTRY;
+       RETURN(ll_file_open(inode, file));
+}
+
+int ll_dir_release(struct inode *inode, struct file *file)
+{
+       ENTRY;
+       RETURN(ll_file_release(inode, file));
+}
+
+struct file_operations ll_dir_operations = {
+       .llseek   = ll_dir_seek,
+       .open     = ll_dir_open,
+       .release  = ll_dir_release,
+       .read     = generic_read_dir,
+       .readdir  = ll_readdir,
+       .unlocked_ioctl   = ll_dir_ioctl,
+       .fsync    = ll_fsync,
+};
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
new file mode 100644 (file)
index 0000000..ed1e3f7
--- /dev/null
@@ -0,0 +1,3198 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_dlm.h>
+#include <lustre_lite.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
+
+#include "cl_object.h"
+
+struct ll_file_data *ll_file_data_get(void)
+{
+       struct ll_file_data *fd;
+
+       OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
+       if (fd == NULL)
+               return NULL;
+
+       fd->fd_write_failed = false;
+       return fd;
+}
+
+static void ll_file_data_put(struct ll_file_data *fd)
+{
+       if (fd != NULL)
+               OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
+}
+
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+                         struct lustre_handle *fh)
+{
+       op_data->op_fid1 = ll_i2info(inode)->lli_fid;
+       op_data->op_attr.ia_mode = inode->i_mode;
+       op_data->op_attr.ia_atime = inode->i_atime;
+       op_data->op_attr.ia_mtime = inode->i_mtime;
+       op_data->op_attr.ia_ctime = inode->i_ctime;
+       op_data->op_attr.ia_size = i_size_read(inode);
+       op_data->op_attr_blocks = inode->i_blocks;
+       ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+                                       ll_inode_to_ext_flags(inode->i_flags);
+       op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
+       if (fh)
+               op_data->op_handle = *fh;
+       op_data->op_capa1 = ll_mdscapa_get(inode);
+
+       if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
+               op_data->op_bias |= MDS_DATA_MODIFIED;
+}
+
+/**
+ * Closes the IO epoch and packs all the attributes into @op_data for
+ * the CLOSE rpc.
+ */
+static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
+                            struct obd_client_handle *och)
+{
+       ENTRY;
+
+       op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
+                                       ATTR_MTIME | ATTR_MTIME_SET |
+                                       ATTR_CTIME | ATTR_CTIME_SET;
+
+       if (!(och->och_flags & FMODE_WRITE))
+               goto out;
+
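+       /* Without Size-on-MDS support (or for non-regular files) the close
+        * request carries the size and blocks attributes directly;
+        * otherwise closing the ioepoch decides how size is propagated. */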
+       if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
+               op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+       else
+               ll_ioepoch_close(inode, op_data, &och, 0);
+
+out:
+       ll_pack_inode2opdata(inode, op_data, &och->och_fh);
+       ll_prep_md_op_data(op_data, inode, NULL, NULL,
+                          0, 0, LUSTRE_OPC_ANY, NULL);
+       EXIT;
+}
+
+static int ll_close_inode_openhandle(struct obd_export *md_exp,
+                                    struct inode *inode,
+                                    struct obd_client_handle *och)
+{
+       struct obd_export *exp = ll_i2mdexp(inode);
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req = NULL;
+       struct obd_device *obd = class_exp2obd(exp);
+       int epoch_close = 1;
+       int rc;
+       ENTRY;
+
+       if (obd == NULL) {
+               /*
+                * XXX: in case of LMV, is this correct to access
+                * ->exp_handle?
+                */
+               CERROR("Invalid MDC connection handle "LPX64"\n",
+                      ll_i2mdexp(inode)->exp_handle.h_cookie);
+               GOTO(out, rc = 0);
+       }
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               /* XXX We leak openhandle and request here. */
+               GOTO(out, rc = -ENOMEM);
+
+       ll_prepare_close(inode, op_data, och);
+       epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
+       rc = md_close(md_exp, op_data, och->och_mod, &req);
+       if (rc == -EAGAIN) {
+               /* This close must have the epoch closed. */
+               LASSERT(epoch_close);
+               /* The MDS has instructed us to obtain the Size-on-MDS
+                * attribute from the OSTs and send a setattr back to the
+                * MDS. */
+               rc = ll_som_update(inode, op_data);
+               if (rc) {
+                       CERROR("inode %lu mdc Size-on-MDS update failed: "
+                              "rc = %d\n", inode->i_ino, rc);
+                       rc = 0;
+               }
+       } else if (rc) {
+               CERROR("inode %lu mdc close failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+
+       /* The DATA_MODIFIED flag was successfully sent on close; clear the
+        * data modification flag. */
+       if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       ll_finish_md_op_data(op_data);
+
+       if (rc == 0) {
+               rc = ll_objects_destroy(req, inode);
+               if (rc)
+                       CERROR("inode %lu ll_objects destroy: rc = %d\n",
+                              inode->i_ino, rc);
+       }
+
+       EXIT;
+out:
+
+       if (exp_connect_som(exp) && !epoch_close &&
+           S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
+               ll_queue_done_writing(inode, LLIF_DONE_WRITING);
+       } else {
+               md_clear_open_replay_data(md_exp, och);
+               /* Free @och if it is not waiting for DONE_WRITING. */
+               och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+               OBD_FREE_PTR(och);
+       }
+       if (req) /* This is the close request */
+               ptlrpc_req_finished(req);
+       return rc;
+}
+
+int ll_md_real_close(struct inode *inode, int flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_client_handle **och_p;
+       struct obd_client_handle *och;
+       __u64 *och_usecount;
+       int rc = 0;
+       ENTRY;
+
+       if (flags & FMODE_WRITE) {
+               och_p = &lli->lli_mds_write_och;
+               och_usecount = &lli->lli_open_fd_write_count;
+       } else if (flags & FMODE_EXEC) {
+               och_p = &lli->lli_mds_exec_och;
+               och_usecount = &lli->lli_open_fd_exec_count;
+       } else {
+               LASSERT(flags & FMODE_READ);
+               och_p = &lli->lli_mds_read_och;
+               och_usecount = &lli->lli_open_fd_read_count;
+       }
+
+       mutex_lock(&lli->lli_och_mutex);
+       if (*och_usecount) { /* There are still users of this handle, so
+                               skip freeing it. */
+               mutex_unlock(&lli->lli_och_mutex);
+               RETURN(0);
+       }
+       och = *och_p;
+       *och_p = NULL;
+       mutex_unlock(&lli->lli_och_mutex);
+
+       if (och) { /* There might be a race, and somebody may have freed
+                     this och already */
+               rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+                                              inode, och);
+       }
+
+       RETURN(rc);
+}
+
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+               struct file *file)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc = 0;
+       ENTRY;
+
+       /* clear group lock, if present */
+       if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+               ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
+
+       /* Let's see if we have a good enough OPEN lock on the file and if
+          we can skip talking to the MDS */
+       if (file->f_dentry->d_inode) { /* Can this ever be false? */
+               int lockmode;
+               int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
+               struct lustre_handle lockh;
+               struct inode *inode = file->f_dentry->d_inode;
+               ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
+
+               mutex_lock(&lli->lli_och_mutex);
+               if (fd->fd_omode & FMODE_WRITE) {
+                       lockmode = LCK_CW;
+                       LASSERT(lli->lli_open_fd_write_count);
+                       lli->lli_open_fd_write_count--;
+               } else if (fd->fd_omode & FMODE_EXEC) {
+                       lockmode = LCK_PR;
+                       LASSERT(lli->lli_open_fd_exec_count);
+                       lli->lli_open_fd_exec_count--;
+               } else {
+                       lockmode = LCK_CR;
+                       LASSERT(lli->lli_open_fd_read_count);
+                       lli->lli_open_fd_read_count--;
+               }
+               mutex_unlock(&lli->lli_och_mutex);
+
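+               /* LDLM_FL_TEST_LOCK only checks for a granted OPEN lock
+                * without taking a reference on it; if no matching lock is
+                * cached, the close must really be sent to the MDS. */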
+               if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
+                                  LDLM_IBITS, &policy, lockmode,
+                                  &lockh)) {
+                       rc = ll_md_real_close(file->f_dentry->d_inode,
+                                             fd->fd_omode);
+               }
+       } else {
+               CERROR("Releasing a file %p with negative dentry %p. Name %s",
+                      file, file->f_dentry, file->f_dentry->d_name.name);
+       }
+
+       LUSTRE_FPRIVATE(file) = NULL;
+       ll_file_data_put(fd);
+       ll_capa_close(inode);
+
+       RETURN(rc);
+}
+
+/* While this returns an error code, the caller's fput() ignores it, so we
+ * still need to make every effort to clean up all of our state here.  Also,
+ * applications rarely check close errors, and even if an error is returned
+ * they will not retry the close call.
+ */
+int ll_file_release(struct inode *inode, struct file *file)
+{
+       struct ll_file_data *fd;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           inode == inode->i_sb->s_root->d_inode) {
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               LASSERT(fd != NULL);
+               if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
+                       fd->fd_flags &= ~LL_FILE_RMTACL;
+                       rct_del(&sbi->ll_rct, current_pid());
+                       et_search_free(&sbi->ll_et, current_pid());
+               }
+       }
+#endif
+
+       if (inode->i_sb->s_root != file->f_dentry)
+               ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
+       fd = LUSTRE_FPRIVATE(file);
+       LASSERT(fd != NULL);
+
+       /* The last ref on @file may not be held by the statahead owner pid.
+        * Different processes can open the same dir; "ll_opendir_key" means
+        * it is me who should stop the statahead thread. */
+       if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
+           lli->lli_opendir_pid != 0)
+               ll_stop_statahead(inode, lli->lli_opendir_key);
+
+       if (inode->i_sb->s_root == file->f_dentry) {
+               LUSTRE_FPRIVATE(file) = NULL;
+               ll_file_data_put(fd);
+               RETURN(0);
+       }
+
+       if (!S_ISDIR(inode->i_mode)) {
+               lov_read_and_clear_async_rc(lli->lli_clob);
+               lli->lli_async_rc = 0;
+       }
+
+       rc = ll_md_close(sbi->ll_md_exp, inode, file);
+
+       if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
+               libcfs_debug_dumplog();
+
+       RETURN(rc);
+}
+
+static int ll_intent_file_open(struct file *file, void *lmm,
+                              int lmmsize, struct lookup_intent *itp)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
+       struct dentry *parent = file->f_dentry->d_parent;
+       const char *name = file->f_dentry->d_name.name;
+       const int len = file->f_dentry->d_name.len;
+       struct md_op_data *op_data;
+       struct ptlrpc_request *req;
+       __u32 opc = LUSTRE_OPC_ANY;
+       int rc;
+       ENTRY;
+
+       if (!parent)
+               RETURN(-ENOENT);
+
+       /* Usually we come here only for NFSD, and we want the open lock.
+          But we can also get here with pre-2.6.15 patchless kernels, and in
+          that case that lock is also ok */
+       /* We can also get here if there was a cached open handle in
+        * revalidate_it but it disappeared while we were getting from there
+        * to ll_file_open.  But this means this file was closed and
+        * immediately opened, which makes it a good candidate for using the
+        * OPEN lock */
+       /* If lmmsize & lmm are not 0, we are just setting stripe info
+        * parameters.  No need for the open lock */
+       if (lmm == NULL && lmmsize == 0) {
+               itp->it_flags |= MDS_OPEN_LOCK;
+               if (itp->it_flags & FMODE_WRITE)
+                       opc = LUSTRE_OPC_CREATE;
+       }
+
+       op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
+                                     file->f_dentry->d_inode, name, len,
+                                     O_RDWR, opc, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       itp->it_flags |= MDS_OPEN_BY_FID;
+       rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
+                           0 /*unused */, &req, ll_md_blocking_ast, 0);
+       ll_finish_md_op_data(op_data);
+       if (rc == -ESTALE) {
+               /* Reason to keep our own exit path: don't flood the log
+                * with -ESTALE error messages.
+                */
+               if (!it_disposition(itp, DISP_OPEN_OPEN) ||
+                    it_open_error(DISP_OPEN_OPEN, itp))
+                       GOTO(out, rc);
+               ll_release_openhandle(file->f_dentry, itp);
+               GOTO(out, rc);
+       }
+
+       if (it_disposition(itp, DISP_LOOKUP_NEG))
+               GOTO(out, rc = -ENOENT);
+
+       if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
+               rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
+               CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
+       if (!rc && itp->d.lustre.it_lock_mode)
+               ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
+                                itp, NULL);
+
+out:
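+       /* Done with the intent: drop the request reference taken at
+        * enqueue time and release the DLM lock reference it holds. */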
+       ptlrpc_req_finished(itp->d.lustre.it_data);
+       it_clear_disposition(itp, DISP_ENQ_COMPLETE);
+       ll_intent_drop_lock(itp);
+
+       RETURN(rc);
+}
+
+/**
+ * Assign an obtained @ioepoch to the client's inode.  No lock is needed;
+ * the MDS does not trust attributes if several ioepoch holders exist.
+ * Attributes for the previous ioepoch are likewise skipped by the MDS if a
+ * new one has been opened.
+ */
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+       if (ioepoch && lli->lli_ioepoch != ioepoch) {
+               lli->lli_ioepoch = ioepoch;
+               CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+                      ioepoch, PFID(&lli->lli_fid));
+       }
+}
+
+static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
+                      struct lookup_intent *it, struct obd_client_handle *och)
+{
+       struct ptlrpc_request *req = it->d.lustre.it_data;
+       struct mdt_body *body;
+
+       LASSERT(och);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);                /* reply already checked out */
+
+       memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+       och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+       och->och_fid = lli->lli_fid;
+       och->och_flags = it->it_flags;
+       ll_ioepoch_open(lli, body->ioepoch);
+
+       return md_set_open_replay_data(md_exp, och, req);
+}
+
+int ll_local_open(struct file *file, struct lookup_intent *it,
+                 struct ll_file_data *fd, struct obd_client_handle *och)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       LASSERT(!LUSTRE_FPRIVATE(file));
+
+       LASSERT(fd != NULL);
+
+       if (och) {
+               struct ptlrpc_request *req = it->d.lustre.it_data;
+               struct mdt_body *body;
+               int rc;
+
+               rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
+               if (rc)
+                       RETURN(rc);
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               if ((it->it_flags & FMODE_WRITE) &&
+                   (body->valid & OBD_MD_FLSIZE))
+                       CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+                              lli->lli_ioepoch, PFID(&lli->lli_fid));
+       }
+
+       LUSTRE_FPRIVATE(file) = fd;
+       ll_readahead_init(inode, &fd->fd_ras);
+       fd->fd_omode = it->it_flags;
+       RETURN(0);
+}
+
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.
+ *
+ * If we already have the stripe MD locally then we don't request it in
+ * md_open(), by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used.  We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ */
+int ll_file_open(struct inode *inode, struct file *file)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+                                         .it_flags = file->f_flags };
+       struct obd_client_handle **och_p = NULL;
+       __u64 *och_usecount = NULL;
+       struct ll_file_data *fd;
+       int rc = 0, opendir_set = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
+              inode->i_generation, inode, file->f_flags);
+
+       it = file->private_data; /* XXX: compat macro */
+       file->private_data = NULL; /* prevent ll_local_open assertion */
+
+       fd = ll_file_data_get();
+       if (fd == NULL)
+               GOTO(out_och_free, rc = -ENOMEM);
+
+       fd->fd_file = file;
+       if (S_ISDIR(inode->i_mode)) {
+               spin_lock(&lli->lli_sa_lock);
+               if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
+                   lli->lli_opendir_pid == 0) {
+                       lli->lli_opendir_key = fd;
+                       lli->lli_opendir_pid = current_pid();
+                       opendir_set = 1;
+               }
+               spin_unlock(&lli->lli_sa_lock);
+       }
+
+       if (inode->i_sb->s_root == file->f_dentry) {
+               LUSTRE_FPRIVATE(file) = fd;
+               RETURN(0);
+       }
+
+       if (!it || !it->d.lustre.it_disposition) {
+               /* Convert f_flags into an access mode.  We cannot use
+                * file->f_mode, because everything but the O_ACCMODE mask
+                * was stripped from it */
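+               /* Note: adding 1 to the O_ACCMODE bits maps O_RDONLY (0),
+                * O_WRONLY (1) and O_RDWR (2) to FMODE_READ (1),
+                * FMODE_WRITE (2) and FMODE_READ|FMODE_WRITE (3); the
+                * increment is skipped only for the invalid value 3. */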
+               if ((oit.it_flags + 1) & O_ACCMODE)
+                       oit.it_flags++;
+               if (file->f_flags & O_TRUNC)
+                       oit.it_flags |= FMODE_WRITE;
+
+               /* The kernel only calls f_op->open from dentry_open.
+                * filp_open calls dentry_open after open_namei has checked
+                * permissions.  Only nfsd_open calls dentry_open directly
+                * without checking permissions, which is why the code below
+                * is safe. */
+               if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
+                       oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+               /* We do not want O_EXCL here, presumably we opened the file
+                * already? XXX - NFS implications? */
+               oit.it_flags &= ~O_EXCL;
+
+               /* bug20584, if "it_flags" contains O_CREAT, the file will be
+                * created if necessary, then "IT_CREAT" should be set to keep
+                * consistent with it */
+               if (oit.it_flags & O_CREAT)
+                       oit.it_op |= IT_CREAT;
+
+               it = &oit;
+       }
+
+restart:
+       /* Let's see if we have file open on MDS already. */
+       if (it->it_flags & FMODE_WRITE) {
+               och_p = &lli->lli_mds_write_och;
+               och_usecount = &lli->lli_open_fd_write_count;
+       } else if (it->it_flags & FMODE_EXEC) {
+               och_p = &lli->lli_mds_exec_och;
+               och_usecount = &lli->lli_open_fd_exec_count;
+       } else {
+               och_p = &lli->lli_mds_read_och;
+               och_usecount = &lli->lli_open_fd_read_count;
+       }
+
+       mutex_lock(&lli->lli_och_mutex);
+       if (*och_p) { /* Open handle is present */
+               if (it_disposition(it, DISP_OPEN_OPEN)) {
+                       /* Well, there's an extra open request that we do
+                          not need; let's close it somehow.  This will
+                          decref the request. */
+                       rc = it_open_error(DISP_OPEN_OPEN, it);
+                       if (rc) {
+                               mutex_unlock(&lli->lli_och_mutex);
+                               GOTO(out_openerr, rc);
+                       }
+
+                       ll_release_openhandle(file->f_dentry, it);
+               }
+               (*och_usecount)++;
+
+               rc = ll_local_open(file, it, fd, NULL);
+               if (rc) {
+                       (*och_usecount)--;
+                       mutex_unlock(&lli->lli_och_mutex);
+                       GOTO(out_openerr, rc);
+               }
+       } else {
+               LASSERT(*och_usecount == 0);
+               if (!it->d.lustre.it_disposition) {
+                       /* We cannot just request a lock handle now; the new
+                          ELC code means that one of the other OPEN locks
+                          for this file could be cancelled, and since the
+                          blocking ast handler would attempt to grab
+                          och_mutex as well, that would result in a
+                          deadlock */
+                       mutex_unlock(&lli->lli_och_mutex);
+                       it->it_create_mode |= M_CHECK_STALE;
+                       rc = ll_intent_file_open(file, NULL, 0, it);
+                       it->it_create_mode &= ~M_CHECK_STALE;
+                       if (rc)
+                               GOTO(out_openerr, rc);
+
+                       goto restart;
+               }
+               OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
+               if (!*och_p)
+                       GOTO(out_och_free, rc = -ENOMEM);
+
+               (*och_usecount)++;
+
+               /* md_intent_lock() didn't get a request ref if there was an
+                * open error, so don't do cleanup on the request here
+                * (bug 3430) */
+               /* XXX (green): Should not we bail out on any error here, not
+                * just open error? */
+               rc = it_open_error(DISP_OPEN_OPEN, it);
+               if (rc)
+                       GOTO(out_och_free, rc);
+
+               LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
+
+               rc = ll_local_open(file, it, fd, *och_p);
+               if (rc)
+                       GOTO(out_och_free, rc);
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+       fd = NULL;
+
+       /* Must do this outside the lli_och_mutex lock to prevent a deadlock
+          where a different kind of OPEN lock for this same inode gets
+          cancelled by ldlm_cancel_lru */
+       if (!S_ISREG(inode->i_mode))
+               GOTO(out_och_free, rc);
+
+       ll_capa_open(inode);
+
+       if (!lli->lli_has_smd) {
+               if (file->f_flags & O_LOV_DELAY_CREATE ||
+                   !(file->f_mode & FMODE_WRITE)) {
+                       CDEBUG(D_INODE, "object creation was delayed\n");
+                       GOTO(out_och_free, rc);
+               }
+       }
+       file->f_flags &= ~O_LOV_DELAY_CREATE;
+       GOTO(out_och_free, rc);
+
+out_och_free:
+       if (rc) {
+               if (och_p && *och_p) {
+                       OBD_FREE(*och_p, sizeof(struct obd_client_handle));
+                       *och_p = NULL; /* OBD_FREE writes some magic there */
+                       (*och_usecount)--;
+               }
+               mutex_unlock(&lli->lli_och_mutex);
+
+out_openerr:
+               if (opendir_set != 0)
+                       ll_stop_statahead(inode, lli->lli_opendir_key);
+               if (fd != NULL)
+                       ll_file_data_put(fd);
+       } else {
+               ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
+       }
+
+       if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
+               ptlrpc_req_finished(it->d.lustre.it_data);
+               it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+       }
+
+       return rc;
+}
+
+/* Fills the obdo with the attributes for the lsm */
+static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
+                         struct obd_capa *capa, struct obdo *obdo,
+                         __u64 ioepoch, int sync)
+{
+       struct ptlrpc_request_set *set;
+       struct obd_info     oinfo = { { { 0 } } };
+       int                     rc;
+
+       ENTRY;
+
+       LASSERT(lsm != NULL);
+
+       oinfo.oi_md = lsm;
+       oinfo.oi_oa = obdo;
+       oinfo.oi_oa->o_oi = lsm->lsm_oi;
+       oinfo.oi_oa->o_mode = S_IFREG;
+       oinfo.oi_oa->o_ioepoch = ioepoch;
+       oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+                              OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                              OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
+                              OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                              OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
+                              OBD_MD_FLDATAVERSION;
+       oinfo.oi_capa = capa;
+       if (sync) {
+               oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
+               oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
+       }
+
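+       /* Issue the getattr to the OSTs asynchronously through a ptlrpc
+        * request set, then wait for the whole set to complete. */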
+       set = ptlrpc_prep_set();
+       if (set == NULL) {
+               CERROR("can't allocate ptlrpc set\n");
+               rc = -ENOMEM;
+       } else {
+               rc = obd_getattr_async(exp, &oinfo, set);
+               if (rc == 0)
+                       rc = ptlrpc_set_wait(set);
+               ptlrpc_set_destroy(set);
+       }
+       if (rc == 0)
+               oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                        OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                        OBD_MD_FLCTIME | OBD_MD_FLSIZE |
+                                        OBD_MD_FLDATAVERSION);
+       RETURN(rc);
+}
+
+/**
+ * Performs the getattr on the inode and updates its fields.
+ * If @sync != 0, perform the getattr under the server-side lock.
+ */
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+                    __u64 ioepoch, int sync)
+{
+       struct obd_capa      *capa = ll_mdscapa_get(inode);
+       struct lov_stripe_md *lsm;
+       int rc;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
+                           capa, obdo, ioepoch, sync);
+       capa_put(capa);
+       if (rc == 0) {
+               struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
+
+               obdo_refresh_inode(inode, obdo, obdo->o_valid);
+               CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
+                      " blksize %lu\n", POSTID(oi), i_size_read(inode),
+                      (unsigned long long)inode->i_blocks,
+                      (unsigned long)ll_inode_blksize(inode));
+       }
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_object *obj = lli->lli_clob;
+       struct cl_attr *attr = ccc_env_thread_attr(env);
+       struct ost_lvb lvb;
+       int rc = 0;
+
+       ENTRY;
+
+       ll_inode_size_lock(inode);
+       /* merge the timestamps most recently obtained from the MDS with
+          the timestamps obtained from the OSTs */
+       LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
+       LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
+       LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
+       inode_init_lvb(inode, &lvb);
+
+       cl_object_attr_lock(obj);
+       rc = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+
+       if (rc == 0) {
+               if (lvb.lvb_atime < attr->cat_atime)
+                       lvb.lvb_atime = attr->cat_atime;
+               if (lvb.lvb_ctime < attr->cat_ctime)
+                       lvb.lvb_ctime = attr->cat_ctime;
+               if (lvb.lvb_mtime < attr->cat_mtime)
+                       lvb.lvb_mtime = attr->cat_mtime;
+
+               CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
+                               PFID(&lli->lli_fid), attr->cat_size);
+               cl_isize_write_nolock(inode, attr->cat_size);
+
+               inode->i_blocks = attr->cat_blocks;
+
+               LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
+               LTIME_S(inode->i_atime) = lvb.lvb_atime;
+               LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+       }
+       ll_inode_size_unlock(inode);
+
+       RETURN(rc);
+}
+
+int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
+                    lstat_t *st)
+{
+       struct obdo obdo = { 0 };
+       int rc;
+
+       rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
+       if (rc == 0) {
+               st->st_size   = obdo.o_size;
+               st->st_blocks = obdo.o_blocks;
+               st->st_mtime  = obdo.o_mtime;
+               st->st_atime  = obdo.o_atime;
+               st->st_ctime  = obdo.o_ctime;
+       }
+       return rc;
+}
+
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+
+       io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+       if (write) {
+               io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
+               io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
+                                     file->f_flags & O_DIRECT ||
+                                     IS_SYNC(inode);
+       }
+       io->ci_obj     = ll_i2info(inode)->lli_clob;
+       io->ci_lockreq = CILR_MAYBE;
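+       /* CILR_MAYBE: take DLM locks as needed.  A no-lock file skips
+        * client-side locking entirely, while O_APPEND writes require
+        * mandatory locking to keep the end of the file consistent. */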
+       if (ll_file_nolock(file)) {
+               io->ci_lockreq = CILR_NEVER;
+               io->ci_no_srvlock = 1;
+       } else if (file->f_flags & O_APPEND) {
+               io->ci_lockreq = CILR_MANDATORY;
+       }
+}
+
+static ssize_t
+ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
+                  struct file *file, enum cl_io_type iot,
+                  loff_t *ppos, size_t count)
+{
+       struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
+       struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
+       struct cl_io     *io;
+       ssize_t        result;
+       ENTRY;
+
+restart:
+       io = ccc_env_thread_io(env);
+       ll_io_init(io, file, iot == CIT_WRITE);
+
+       if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+               struct vvp_io *vio = vvp_env_io(env);
+               struct ccc_io *cio = ccc_env_io(env);
+               int write_mutex_locked = 0;
+
+               cio->cui_fd  = LUSTRE_FPRIVATE(file);
+               vio->cui_io_subtype = args->via_io_subtype;
+
+               switch (vio->cui_io_subtype) {
+               case IO_NORMAL:
+                       cio->cui_iov = args->u.normal.via_iov;
+                       cio->cui_nrsegs = args->u.normal.via_nrsegs;
+                       cio->cui_tot_nrsegs = cio->cui_nrsegs;
+                       cio->cui_iocb = args->u.normal.via_iocb;
+                       if ((iot == CIT_WRITE) &&
+                           !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+                               if (mutex_lock_interruptible(
+                                               &lli->lli_write_mutex))
+                                       GOTO(out, result = -ERESTARTSYS);
+                               write_mutex_locked = 1;
+                       } else if (iot == CIT_READ) {
+                               down_read(&lli->lli_trunc_sem);
+                       }
+                       break;
+               case IO_SENDFILE:
+                       vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
+                       vio->u.sendfile.cui_target = args->u.sendfile.via_target;
+                       break;
+               case IO_SPLICE:
+                       vio->u.splice.cui_pipe = args->u.splice.via_pipe;
+                       vio->u.splice.cui_flags = args->u.splice.via_flags;
+                       break;
+               default:
+                       CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
+                       LBUG();
+               }
+               result = cl_io_loop(env, io);
+               if (write_mutex_locked)
+                       mutex_unlock(&lli->lli_write_mutex);
+               else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
+                       up_read(&lli->lli_trunc_sem);
+       } else {
+               /* cl_io_rw_init() handled IO */
+               result = io->ci_result;
+       }
+
+       if (io->ci_nob > 0) {
+               result = io->ci_nob;
+               *ppos = io->u.ci_wr.wr.crw_pos;
+       }
+       GOTO(out, result);
+out:
+       cl_io_fini(env, io);
+       /* If anything has been read/written (result != 0), we just return
+        * a short read/write instead of restarting the io. */
+       if (result == 0 && io->ci_need_restart) {
+               CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
+                      iot == CIT_READ ? "read" : "write",
+                      file->f_dentry->d_name.name, *ppos, count);
+               LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
+               goto restart;
+       }
+
+       if (iot == CIT_READ) {
+               if (result >= 0)
+                       ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+                                          LPROC_LL_READ_BYTES, result);
+       } else if (iot == CIT_WRITE) {
+               if (result >= 0) {
+                       ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
+                                          LPROC_LL_WRITE_BYTES, result);
+                       fd->fd_write_failed = false;
+               } else if (result != -ERESTARTSYS) {
+                       fd->fd_write_failed = true;
+               }
+       }
+
+       return result;
+}
+
+
+/*
+ * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
+ */
+static int ll_file_get_iov_count(const struct iovec *iov,
+                                unsigned long *nr_segs, size_t *count)
+{
+       size_t cnt = 0;
+       unsigned long seg;
+
+       for (seg = 0; seg < *nr_segs; seg++) {
+               const struct iovec *iv = &iov[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               cnt += iv->iov_len;
+               if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+                       return -EINVAL;
+               if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+                       continue;
+               if (seg == 0)
+                       return -EFAULT;
+               *nr_segs = seg;
+               cnt -= iv->iov_len;   /* This segment is no good */
+               break;
+       }
+       *count = cnt;
+       return 0;
+}
+
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                               unsigned long nr_segs, loff_t pos)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       size_t        count;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       result = ll_file_get_iov_count(iov, &nr_segs, &count);
+       if (result)
+               RETURN(result);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_NORMAL);
+       args->u.normal.via_iov = (struct iovec *)iov;
+       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iocb = iocb;
+
+       result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+                                   &iocb->ki_pos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+                           loff_t *ppos)
+{
+       struct lu_env *env;
+       struct iovec  *local_iov;
+       struct kiocb  *kiocb;
+       ssize_t result;
+       int         refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
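+       /* Wrap the plain read buffer in a single-segment iovec and a
+        * synchronous kiocb so the common ll_file_aio_read() path can be
+        * reused. */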
+       local_iov = &vvp_env_info(env)->vti_local_iov;
+       kiocb = &vvp_env_info(env)->vti_kiocb;
+       local_iov->iov_base = (void __user *)buf;
+       local_iov->iov_len = count;
+       init_sync_kiocb(kiocb, file);
+       kiocb->ki_pos = *ppos;
+       kiocb->ki_left = count;
+
+       result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
+
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+/*
+ * Write to a file (through the page cache).
+ */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t pos)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       size_t        count;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       result = ll_file_get_iov_count(iov, &nr_segs, &count);
+       if (result)
+               RETURN(result);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_NORMAL);
+       args->u.normal.via_iov = (struct iovec *)iov;
+       args->u.normal.via_nrsegs = nr_segs;
+       args->u.normal.via_iocb = iocb;
+
+       result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
+                                 &iocb->ki_pos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+                            loff_t *ppos)
+{
+       struct lu_env *env;
+       struct iovec  *local_iov;
+       struct kiocb  *kiocb;
+       ssize_t result;
+       int         refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       local_iov = &vvp_env_info(env)->vti_local_iov;
+       kiocb = &vvp_env_info(env)->vti_kiocb;
+       local_iov->iov_base = (void __user *)buf;
+       local_iov->iov_len = count;
+       init_sync_kiocb(kiocb, file);
+       kiocb->ki_pos = *ppos;
+       kiocb->ki_left = count;
+
+       result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
+       *ppos = kiocb->ki_pos;
+
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+
+
+/*
+ * Send file content (through the pagecache) somewhere with a helper
+ */
+static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
+                                  struct pipe_inode_info *pipe, size_t count,
+                                  unsigned int flags)
+{
+       struct lu_env      *env;
+       struct vvp_io_args *args;
+       ssize_t      result;
+       int              refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       args = vvp_env_args(env, IO_SPLICE);
+       args->u.splice.via_pipe = pipe;
+       args->u.splice.via_flags = flags;
+
+       result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
+       cl_env_put(env, &refcheck);
+       RETURN(result);
+}
+
+static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
+                          obd_count ost_idx)
+{
+       struct obd_export *exp = ll_i2dtexp(inode);
+       struct obd_trans_info oti = { 0 };
+       struct obdo *oa = NULL;
+       int lsm_size;
+       int rc = 0;
+       struct lov_stripe_md *lsm = NULL, *lsm2;
+       ENTRY;
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               RETURN(-ENOMEM);
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               GOTO(out, rc = -ENOENT);
+
+       lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
+                  (lsm->lsm_stripe_count));
+
+       OBD_ALLOC_LARGE(lsm2, lsm_size);
+       if (lsm2 == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       oa->o_oi = *oi;
+       oa->o_nlink = ost_idx;
+       oa->o_flags |= OBD_FL_RECREATE_OBJS;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
+       obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+                                  OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+       obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
+       memcpy(lsm2, lsm, lsm_size);
+       ll_inode_size_lock(inode);
+       rc = obd_create(NULL, exp, oa, &lsm2, &oti);
+       ll_inode_size_unlock(inode);
+
+       OBD_FREE_LARGE(lsm2, lsm_size);
+       GOTO(out, rc);
+out:
+       ccc_inode_lsm_put(inode, lsm);
+       OBDO_FREE(oa);
+       return rc;
+}
+
+static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
+{
+       struct ll_recreate_obj ucreat;
+       struct ost_id           oi;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
+                          sizeof(ucreat)))
+               RETURN(-EFAULT);
+
+       ostid_set_seq_mdt0(&oi);
+       ostid_set_id(&oi, ucreat.lrc_id);
+       RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
+}
+
+static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
+{
+       struct lu_fid   fid;
+       struct ost_id   oi;
+       obd_count       ost_idx;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
+               RETURN(-EFAULT);
+
+       fid_to_ostid(&fid, &oi);
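+       /* For IDIF FIDs the OST index lives in bits 16..31 of the FID
+        * sequence number. */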
+       ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
+       RETURN(ll_lov_recreate(inode, &oi, ost_idx));
+}
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+                            int flags, struct lov_user_md *lum, int lum_size)
+{
+       struct lov_stripe_md *lsm = NULL;
+       struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
+       int rc = 0;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm != NULL) {
+               ccc_inode_lsm_put(inode, lsm);
+               CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
+                      inode->i_ino);
+               RETURN(-EEXIST);
+       }
+
+       ll_inode_size_lock(inode);
+       rc = ll_intent_file_open(file, lum, lum_size, &oit);
+       if (rc)
+               GOTO(out, rc);
+       rc = oit.d.lustre.it_status;
+       if (rc < 0)
+               GOTO(out_req_free, rc);
+
+       ll_release_openhandle(file->f_dentry, &oit);
+
+ out:
+       ll_inode_size_unlock(inode);
+       ll_intent_release(&oit);
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+out_req_free:
+       ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
+       goto out;
+}
+
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+                            struct lov_mds_md **lmmp, int *lmm_size,
+                            struct ptlrpc_request **request)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct mdt_body  *body;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *req = NULL;
+       struct md_op_data *op_data;
+       int rc, lmmsize;
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc)
+               RETURN(rc);
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+                                    strlen(filename), lmmsize,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+       rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr_name failed "
+                      "on %s: rc %d\n", filename, rc);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL); /* checked by mdc_getattr_name */
+
+       lmmsize = body->eadatasize;
+
+       if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
+                       lmmsize == 0) {
+               GOTO(out, rc = -ENODATA);
+       }
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+       LASSERT(lmm != NULL);
+
+       if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+           (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
+               GOTO(out, rc = -EPROTO);
+       }
+
+       /*
+        * This is coming from the MDS, so is probably in
+        * little endian.  We convert it to host endian before
+        * passing it to userspace.
+        */
+       if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+               /* if this function is called for a directory, avoid
+                * swabbing non-existent lsm objects */
+               if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+                       lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                       if (S_ISREG(body->mode))
+                               lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+               } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+                       lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                       if (S_ISREG(body->mode))
+                               lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+               }
+       }
+
+out:
+       *lmmp = lmm;
+       *lmm_size = lmmsize;
+       *request = req;
+       return rc;
+}
+
+static int ll_lov_setea(struct inode *inode, struct file *file,
+                           unsigned long arg)
+{
+       int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
+       struct lov_user_md      *lump;
+       int                      lum_size = sizeof(struct lov_user_md) +
+                                           sizeof(struct lov_user_ost_data);
+       int                      rc;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+               RETURN(-EPERM);
+
+       OBD_ALLOC_LARGE(lump, lum_size);
+       if (lump == NULL)
+               RETURN(-ENOMEM);
+
+       if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
+               OBD_FREE_LARGE(lump, lum_size);
+               RETURN(-EFAULT);
+       }
+
+       rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
+
+       OBD_FREE_LARGE(lump, lum_size);
+       RETURN(rc);
+}
+
+static int ll_lov_setstripe(struct inode *inode, struct file *file,
+                           unsigned long arg)
+{
+       struct lov_user_md_v3    lumv3;
+       struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+       struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
+       struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
+       int                      lum_size, rc;
+       int                      flags = FMODE_WRITE;
+       ENTRY;
+
+       /* first try with v1 which is smaller than v3 */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(lumv1, lumv1p, lum_size))
+               RETURN(-EFAULT);
+
+       if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+               lum_size = sizeof(struct lov_user_md_v3);
+               if (copy_from_user(&lumv3, lumv3p, lum_size))
+                       RETURN(-EFAULT);
+       }
+
+       rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
+       if (rc == 0) {
+               struct lov_stripe_md *lsm;
+               __u32 gen;
+
+               put_user(0, &lumv1p->lmm_stripe_count);
+
+               ll_layout_refresh(inode, &gen);
+               lsm = ccc_inode_lsm_get(inode);
+               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
+                                  0, lsm, (void *)arg);
+               ccc_inode_lsm_put(inode, lsm);
+       }
+       RETURN(rc);
+}
+
+static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
+{
+       struct lov_stripe_md *lsm;
+       int rc = -ENODATA;
+       ENTRY;
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm != NULL)
+               rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
+                                  lsm, (void *)arg);
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+       struct ll_inode_info   *lli = ll_i2info(inode);
+       struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+       struct ccc_grouplock    grouplock;
+       int                  rc;
+       ENTRY;
+
+       if (ll_file_nolock(file))
+               RETURN(-EOPNOTSUPP);
+
+       spin_lock(&lli->lli_lock);
+       if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+               CWARN("group lock already exists with gid %lu\n",
+                     fd->fd_grouplock.cg_gid);
+               spin_unlock(&lli->lli_lock);
+               RETURN(-EINVAL);
+       }
+       LASSERT(fd->fd_grouplock.cg_lock == NULL);
+       spin_unlock(&lli->lli_lock);
+
+       rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+                             arg, (file->f_flags & O_NONBLOCK), &grouplock);
+       if (rc)
+               RETURN(rc);
+
+       spin_lock(&lli->lli_lock);
+       if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+               spin_unlock(&lli->lli_lock);
+               CERROR("another thread just won the race\n");
+               cl_put_grouplock(&grouplock);
+               RETURN(-EINVAL);
+       }
+
+       fd->fd_flags |= LL_FILE_GROUP_LOCKED;
+       fd->fd_grouplock = grouplock;
+       spin_unlock(&lli->lli_lock);
+
+       CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+       RETURN(0);
+}
+
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
+{
+       struct ll_inode_info   *lli = ll_i2info(inode);
+       struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+       struct ccc_grouplock    grouplock;
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+               spin_unlock(&lli->lli_lock);
+               CWARN("no group lock held\n");
+               RETURN(-EINVAL);
+       }
+       LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+       if (fd->fd_grouplock.cg_gid != arg) {
+               CWARN("group lock %lu doesn't match current id %lu\n",
+                      arg, fd->fd_grouplock.cg_gid);
+               spin_unlock(&lli->lli_lock);
+               RETURN(-EINVAL);
+       }
+
+       grouplock = fd->fd_grouplock;
+       memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
+       fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
+       spin_unlock(&lli->lli_lock);
+
+       cl_put_grouplock(&grouplock);
+       CDEBUG(D_INFO, "group lock %lu released\n", arg);
+       RETURN(0);
+}
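+
+/*
+ * Usage sketch (hypothetical userspace caller): the two ioctls above pair up
+ * around I/O that must be coherent across group members; "gid" is an
+ * application-chosen group id and must match on lock and unlock, as
+ * ll_put_grouplock() enforces.
+ *
+ *	unsigned long gid = 1234;
+ *
+ *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);
+ *	... perform I/O ...
+ *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
+ */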
+
+/**
+ * Close inode open handle
+ *
+ * \param dentry [in]     dentry which contains the inode
+ * \param it     [in,out] intent which contains open info and result
+ *
+ * \retval 0     success
+ * \retval <0    failure
+ */
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+       struct inode *inode = dentry->d_inode;
+       struct obd_client_handle *och;
+       int rc;
+       ENTRY;
+
+       LASSERT(inode);
+
+       /* Root? Do nothing. */
+       if (dentry->d_inode->i_sb->s_root == dentry)
+               RETURN(0);
+
+       /* No open handle to close? Move away */
+       if (!it_disposition(it, DISP_OPEN_OPEN))
+               RETURN(0);
+
+       LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
+
+       OBD_ALLOC(och, sizeof(*och));
+       if (!och)
+               GOTO(out, rc = -ENOMEM);
+
+       ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
+                   ll_i2info(inode), it, och);
+
+       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
+                                      inode, och);
+ out:
+       /* this one is in place of ll_file_open */
+       if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
+               ptlrpc_req_finished(it->d.lustre.it_data);
+               it_clear_disposition(it, DISP_ENQ_OPEN_REF);
+       }
+       RETURN(rc);
+}
+
+/**
+ * Get the size of the inode for which the FIEMAP mapping is requested.
+ * Make the FIEMAP get_info call and return the result.
+ */
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+             int num_bytes)
+{
+       struct obd_export *exp = ll_i2dtexp(inode);
+       struct lov_stripe_md *lsm = NULL;
+       struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+       int vallen = num_bytes;
+       int rc;
+       ENTRY;
+
+       /* Checks for fiemap flags */
+       if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+               fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+               return -EBADR;
+       }
+
+       /* Check for FIEMAP_FLAG_SYNC */
+       if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
+               rc = filemap_fdatawrite(inode->i_mapping);
+               if (rc)
+                       return rc;
+       }
+
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL)
+               return -ENOENT;
+
+       /* If the stripe_count > 1 and the application does not understand
+        * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+        */
+       if (lsm->lsm_stripe_count > 1 &&
+           !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
+               GOTO(out, rc = -EOPNOTSUPP);
+
+       fm_key.oa.o_oi = lsm->lsm_oi;
+       fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+       obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
+       obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
+       /* If filesize is 0, then there would be no objects for mapping */
+       if (fm_key.oa.o_size == 0) {
+               fiemap->fm_mapped_extents = 0;
+               GOTO(out, rc = 0);
+       }
+
+       memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
+       rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
+                         fiemap, lsm);
+       if (rc)
+               CERROR("obd_get_info failed: rc = %d\n", rc);
+
+out:
+       ccc_inode_lsm_put(inode, lsm);
+       RETURN(rc);
+}
+
+int ll_fid2path(struct inode *inode, void *arg)
+{
+       struct obd_export       *exp = ll_i2mdexp(inode);
+       struct getinfo_fid2path *gfout, *gfin;
+       int                      outsize, rc;
+       ENTRY;
+
+       if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
+           !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
+               RETURN(-EPERM);
+
+       /* Need to get the buflen */
+       OBD_ALLOC_PTR(gfin);
+       if (gfin == NULL)
+               RETURN(-ENOMEM);
+       if (copy_from_user(gfin, arg, sizeof(*gfin))) {
+               OBD_FREE_PTR(gfin);
+               RETURN(-EFAULT);
+       }
+
+       outsize = sizeof(*gfout) + gfin->gf_pathlen;
+       OBD_ALLOC(gfout, outsize);
+       if (gfout == NULL) {
+               OBD_FREE_PTR(gfin);
+               RETURN(-ENOMEM);
+       }
+       memcpy(gfout, gfin, sizeof(*gfout));
+       OBD_FREE_PTR(gfin);
+
+       /* Call mdc_iocontrol */
+       rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+       if (rc)
+               GOTO(gf_free, rc);
+
+       if (copy_to_user(arg, gfout, outsize))
+               rc = -EFAULT;
+
+gf_free:
+       OBD_FREE(gfout, outsize);
+       RETURN(rc);
+}
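+
+/*
+ * Usage sketch (hypothetical userspace caller): gf_pathlen tells the handler
+ * above how much room follows the header for the returned path; field names
+ * other than gf_pathlen are assumed from lustre_user.h.
+ *
+ *	struct getinfo_fid2path *gf;
+ *
+ *	gf = malloc(sizeof(*gf) + PATH_MAX);
+ *	gf->gf_fid = fid;		// e.g. obtained via LL_IOC_PATH2FID
+ *	gf->gf_pathlen = PATH_MAX;
+ *	ioctl(fd, OBD_IOC_FID2PATH, gf);
+ */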
+
+static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
+{
+       struct ll_user_fiemap *fiemap_s;
+       size_t num_bytes, ret_bytes;
+       unsigned int extent_count;
+       int rc = 0;
+
+       /* Get the extent count so we can calculate the size of the
+        * required fiemap buffer */
+       if (get_user(extent_count,
+           &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+               RETURN(-EFAULT);
+       num_bytes = sizeof(*fiemap_s) + (extent_count *
+                                        sizeof(struct ll_fiemap_extent));
+
+       OBD_ALLOC_LARGE(fiemap_s, num_bytes);
+       if (fiemap_s == NULL)
+               RETURN(-ENOMEM);
+
+       /* get the fiemap value */
+       if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
+                          sizeof(*fiemap_s)))
+               GOTO(error, rc = -EFAULT);
+
+       /* If fm_extent_count is non-zero, read the first extent since
+        * it is used to calculate end_offset and device from the previous
+        * fiemap call. */
+       if (extent_count) {
+               if (copy_from_user(&fiemap_s->fm_extents[0],
+                   (char __user *)arg + sizeof(*fiemap_s),
+                   sizeof(struct ll_fiemap_extent)))
+                       GOTO(error, rc = -EFAULT);
+       }
+
+       rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
+       if (rc)
+               GOTO(error, rc);
+
+       ret_bytes = sizeof(struct ll_user_fiemap);
+
+       if (extent_count != 0)
+               ret_bytes += (fiemap_s->fm_mapped_extents *
+                                sizeof(struct ll_fiemap_extent));
+
+       if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+               rc = -EFAULT;
+
+error:
+       OBD_FREE_LARGE(fiemap_s, num_bytes);
+       RETURN(rc);
+}
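+
+/*
+ * Usage sketch (hypothetical userspace caller): as the handler above shows,
+ * the buffer must be sized for fm_extent_count extents following the fixed
+ * header; fm_length is assumed to mirror the standard fiemap layout.
+ *
+ *	unsigned int count = 32;
+ *	struct ll_user_fiemap *fm;
+ *
+ *	fm = calloc(1, sizeof(*fm) + count * sizeof(struct ll_fiemap_extent));
+ *	fm->fm_extent_count = count;
+ *	fm->fm_length = ~0ULL;			// map the whole file
+ *	ioctl(fd, FSFILT_IOC_FIEMAP, fm);
+ */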
+
+/*
+ * Read the data_version for the inode.
+ *
+ * This value is computed using the stripe object versions on the OSTs.
+ * The version is computed using server-side locking.
+ *
+ * @param extent_lock  Take the extent lock. Not needed if a process is
+ *                     already holding the OST object group locks.
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version,
+                   int extent_lock)
+{
+       struct lov_stripe_md    *lsm = NULL;
+       struct ll_sb_info       *sbi = ll_i2sbi(inode);
+       struct obdo             *obdo = NULL;
+       int                      rc;
+       ENTRY;
+
+       /* If there is no stripe, we consider the version to be 0. */
+       lsm = ccc_inode_lsm_get(inode);
+       if (lsm == NULL) {
+               *data_version = 0;
+               CDEBUG(D_INODE, "No object for inode\n");
+               RETURN(0);
+       }
+
+       OBD_ALLOC_PTR(obdo);
+       if (obdo == NULL) {
+               ccc_inode_lsm_put(inode, lsm);
+               RETURN(-ENOMEM);
+       }
+
+       rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
+       if (!rc) {
+               if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
+                       rc = -EOPNOTSUPP;
+               else
+                       *data_version = obdo->o_data_version;
+       }
+
+       OBD_FREE_PTR(obdo);
+       ccc_inode_lsm_put(inode, lsm);
+
+       RETURN(rc);
+}
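+
+/*
+ * Usage sketch (kernel side, illustrative): detect a concurrent modification
+ * by sampling the data version before and after an operation, as
+ * ll_swap_layouts() below does with its check_dv flags.
+ *
+ *	__u64 dv_before, dv_after;
+ *
+ *	rc = ll_data_version(inode, &dv_before, 1);
+ *	... work on the file ...
+ *	rc = ll_data_version(inode, &dv_after, 1);
+ *	if (dv_before != dv_after)
+ *		... the file data changed in between ...
+ */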
+
+struct ll_swap_stack {
+       struct iattr             ia1, ia2;
+       __u64                    dv1, dv2;
+       struct inode            *inode1, *inode2;
+       bool                     check_dv1, check_dv2;
+};
+
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+                          struct lustre_swap_layouts *lsl)
+{
+       struct mdc_swap_layouts  msl;
+       struct md_op_data       *op_data;
+       __u32                    gid;
+       __u64                    dv;
+       struct ll_swap_stack    *llss = NULL;
+       int                      rc;
+
+       OBD_ALLOC_PTR(llss);
+       if (llss == NULL)
+               RETURN(-ENOMEM);
+
+       llss->inode1 = file1->f_dentry->d_inode;
+       llss->inode2 = file2->f_dentry->d_inode;
+
+       if (!S_ISREG(llss->inode2->i_mode))
+               GOTO(free, rc = -EINVAL);
+
+       if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
+           ll_permission(llss->inode2, MAY_WRITE, NULL))
+               GOTO(free, rc = -EPERM);
+
+       if (llss->inode2->i_sb != llss->inode1->i_sb)
+               GOTO(free, rc = -EXDEV);
+
+       /* we use two bools because they are easier to swap than two bits */
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+               llss->check_dv1 = true;
+
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+               llss->check_dv2 = true;
+
+       /* we cannot use lsl->sl_dvX directly because we may swap them */
+       llss->dv1 = lsl->sl_dv1;
+       llss->dv2 = lsl->sl_dv2;
+
+       rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
+       if (rc == 0) /* same file, done! */
+               GOTO(free, rc = 0);
+
+       if (rc < 0) { /* order the two files consistently by FID */
+               swap(llss->inode1, llss->inode2);
+               swap(file1, file2);
+               swap(llss->dv1, llss->dv2);
+               swap(llss->check_dv1, llss->check_dv2);
+       }
+
+       gid = lsl->sl_gid;
+       if (gid != 0) { /* application asks to flush dirty cache */
+               rc = ll_get_grouplock(llss->inode1, file1, gid);
+               if (rc < 0)
+                       GOTO(free, rc);
+
+               rc = ll_get_grouplock(llss->inode2, file2, gid);
+               if (rc < 0) {
+                       ll_put_grouplock(llss->inode1, file1, gid);
+                       GOTO(free, rc);
+               }
+       }
+
+       /* to be able to restore the mtime and atime after the swap,
+        * we need to save them first */
+       if (lsl->sl_flags &
+           (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_mtime = llss->inode1->i_mtime;
+               llss->ia1.ia_atime = llss->inode1->i_atime;
+               llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+               llss->ia2.ia_mtime = llss->inode2->i_mtime;
+               llss->ia2.ia_atime = llss->inode2->i_atime;
+               llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+       }
+
+       /* final check: before swapping the layouts, verify that the
+        * data version has not changed (if requested) */
+       if (llss->check_dv1) {
+               rc = ll_data_version(llss->inode1, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv1)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
+       if (llss->check_dv2) {
+               rc = ll_data_version(llss->inode2, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv2)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
+       /* struct md_op_data is used to send the swap args to the mdt;
+        * only the flags are missing, so we pass struct mdc_swap_layouts
+        * through md_op_data->op_data */
+       /* flags from user space have to be converted before they are sent to
+        * the server; no flags are sent today, they are only used on the
+        * client */
+       msl.msl_flags = 0;
+       rc = -ENOMEM;
+       op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+                                    0, LUSTRE_OPC_ANY, &msl);
+       if (op_data != NULL) {
+               rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
+                                  ll_i2mdexp(llss->inode1),
+                                  sizeof(*op_data), op_data, NULL);
+               ll_finish_md_op_data(op_data);
+       }
+
+putgl:
+       if (gid != 0) {
+               ll_put_grouplock(llss->inode2, file2, gid);
+               ll_put_grouplock(llss->inode1, file1, gid);
+       }
+
+       /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+       if (rc != 0)
+               GOTO(free, rc);
+
+       /* clear useless flags */
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+               llss->ia1.ia_valid &= ~ATTR_MTIME;
+               llss->ia2.ia_valid &= ~ATTR_MTIME;
+       }
+
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_valid &= ~ATTR_ATIME;
+               llss->ia2.ia_valid &= ~ATTR_ATIME;
+       }
+
+       /* update time if requested */
+       rc = 0;
+       if (llss->ia2.ia_valid != 0) {
+               mutex_lock(&llss->inode1->i_mutex);
+               rc = ll_setattr(file1->f_dentry, &llss->ia2);
+               mutex_unlock(&llss->inode1->i_mutex);
+       }
+
+       if (llss->ia1.ia_valid != 0) {
+               int rc1;
+
+               mutex_lock(&llss->inode2->i_mutex);
+               rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
+               mutex_unlock(&llss->inode2->i_mutex);
+               if (rc == 0)
+                       rc = rc1;
+       }
+
+free:
+       if (llss != NULL)
+               OBD_FREE_PTR(llss);
+
+       RETURN(rc);
+}
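+
+/*
+ * Usage sketch (hypothetical userspace caller): swap the layouts of two open
+ * files through LL_IOC_LOV_SWAP_LAYOUTS (dispatched in ll_file_ioctl()
+ * below), failing with -EAGAIN if file 1 changed since dv1 was sampled.
+ *
+ *	struct lustre_swap_layouts lsl = {
+ *		.sl_fd    = fd2,
+ *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1 | SWAP_LAYOUTS_KEEP_MTIME,
+ *		.sl_gid   = 1234,	// non-zero: flush caches via group locks
+ *		.sl_dv1   = dv1,	// from LL_IOC_DATA_VERSION
+ *	};
+ *	ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
+ */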
+
+long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct inode            *inode = file->f_dentry->d_inode;
+       struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
+       int                      flags, rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
+              inode->i_generation, inode, cmd);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
+
+       /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
+       if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
+               RETURN(-ENOTTY);
+
+       switch (cmd) {
+       case LL_IOC_GETFLAGS:
+               /* Get the current value of the file flags */
+               return put_user(fd->fd_flags, (int *)arg);
+       case LL_IOC_SETFLAGS:
+       case LL_IOC_CLRFLAGS:
+               /* Set or clear specific file flags */
+               /* XXX This probably needs checks to ensure the flags are
+                *     not abused, and to handle any flag side effects.
+                */
+               if (get_user(flags, (int *) arg))
+                       RETURN(-EFAULT);
+
+               if (cmd == LL_IOC_SETFLAGS) {
+                       if ((flags & LL_FILE_IGNORE_LOCK) &&
+                           !(file->f_flags & O_DIRECT)) {
+                               CERROR("%s: unable to disable locking on "
+                                      "non-O_DIRECT file\n", current->comm);
+                               RETURN(-EINVAL);
+                       }
+
+                       fd->fd_flags |= flags;
+               } else {
+                       fd->fd_flags &= ~flags;
+               }
+               RETURN(0);
+       case LL_IOC_LOV_SETSTRIPE:
+               RETURN(ll_lov_setstripe(inode, file, arg));
+       case LL_IOC_LOV_SETEA:
+               RETURN(ll_lov_setea(inode, file, arg));
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               struct file *file2;
+               struct lustre_swap_layouts lsl;
+
+               if (copy_from_user(&lsl, (char *)arg,
+                                      sizeof(struct lustre_swap_layouts)))
+                       RETURN(-EFAULT);
+
+               if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+                       RETURN(-EPERM);
+
+               file2 = fget(lsl.sl_fd);
+               if (file2 == NULL)
+                       RETURN(-EBADF);
+
+               rc = -EPERM;
+               if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+                       rc = ll_swap_layouts(file, file2, &lsl);
+               fput(file2);
+               RETURN(rc);
+       }
+       case LL_IOC_LOV_GETSTRIPE:
+               RETURN(ll_lov_getstripe(inode, arg));
+       case LL_IOC_RECREATE_OBJ:
+               RETURN(ll_lov_recreate_obj(inode, arg));
+       case LL_IOC_RECREATE_FID:
+               RETURN(ll_lov_recreate_fid(inode, arg));
+       case FSFILT_IOC_FIEMAP:
+               RETURN(ll_ioctl_fiemap(inode, arg));
+       case FSFILT_IOC_GETFLAGS:
+       case FSFILT_IOC_SETFLAGS:
+               RETURN(ll_iocontrol(inode, file, cmd, arg));
+       case FSFILT_IOC_GETVERSION_OLD:
+       case FSFILT_IOC_GETVERSION:
+               RETURN(put_user(inode->i_generation, (int *)arg));
+       case LL_IOC_GROUP_LOCK:
+               RETURN(ll_get_grouplock(inode, file, arg));
+       case LL_IOC_GROUP_UNLOCK:
+               RETURN(ll_put_grouplock(inode, file, arg));
+       case IOC_OBD_STATFS:
+               RETURN(ll_obd_statfs(inode, (void *)arg));
+
+       /* We need to special case any other ioctls we want to handle,
+        * to send them to the MDS/OST as appropriate and to properly
+        * network encode the arg field.
+       case FSFILT_IOC_SETVERSION_OLD:
+       case FSFILT_IOC_SETVERSION:
+       */
+       case LL_IOC_FLUSHCTX:
+               RETURN(ll_flush_ctx(inode));
+       case LL_IOC_PATH2FID: {
+               if (copy_to_user((void *)arg, ll_inode2fid(inode),
+                                sizeof(struct lu_fid)))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case OBD_IOC_FID2PATH:
+               RETURN(ll_fid2path(inode, (void *)arg));
+       case LL_IOC_DATA_VERSION: {
+               struct ioc_data_version idv;
+               int                     rc;
+
+               if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
+                       RETURN(-EFAULT);
+
+               rc = ll_data_version(inode, &idv.idv_version,
+                               !(idv.idv_flags & LL_DV_NOFLUSH));
+
+               if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
+                       RETURN(-EFAULT);
+
+               RETURN(rc);
+       }
+
+       case LL_IOC_GET_MDTIDX: {
+               int mdtidx;
+
+               mdtidx = ll_get_mdt_idx(inode);
+               if (mdtidx < 0)
+                       RETURN(mdtidx);
+
+               if (put_user((int)mdtidx, (int*)arg))
+                       RETURN(-EFAULT);
+
+               RETURN(0);
+       }
+       case OBD_IOC_GETDTNAME:
+       case OBD_IOC_GETMDNAME:
+               RETURN(ll_get_obd_name(inode, cmd, arg));
+       case LL_IOC_HSM_STATE_GET: {
+               struct md_op_data       *op_data;
+               struct hsm_user_state   *hus;
+               int                      rc;
+
+               OBD_ALLOC_PTR(hus);
+               if (hus == NULL)
+                       RETURN(-ENOMEM);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hus);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hus);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               if (copy_to_user((void *)arg, hus, sizeof(*hus)))
+                       rc = -EFAULT;
+
+               ll_finish_md_op_data(op_data);
+               OBD_FREE_PTR(hus);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_STATE_SET: {
+               struct md_op_data       *op_data;
+               struct hsm_state_set    *hss;
+               int                      rc;
+
+               OBD_ALLOC_PTR(hss);
+               if (hss == NULL)
+                       RETURN(-ENOMEM);
+               if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-EFAULT);
+               }
+
+               /* Non-root users are forbidden to set or clear flags which are
+                * NOT defined in HSM_USER_MASK. */
+               if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
+                   && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-EPERM);
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hss);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hss);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               ll_finish_md_op_data(op_data);
+
+               OBD_FREE_PTR(hss);
+               RETURN(rc);
+       }
+       case LL_IOC_HSM_ACTION: {
+               struct md_op_data               *op_data;
+               struct hsm_current_action       *hca;
+               int                              rc;
+
+               OBD_ALLOC_PTR(hca);
+               if (hca == NULL)
+                       RETURN(-ENOMEM);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, hca);
+               if (op_data == NULL) {
+                       OBD_FREE_PTR(hca);
+                       RETURN(-ENOMEM);
+               }
+
+               rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
+                                  op_data, NULL);
+
+               if (copy_to_user((char *)arg, hca, sizeof(*hca)))
+                       rc = -EFAULT;
+
+               ll_finish_md_op_data(op_data);
+               OBD_FREE_PTR(hca);
+               RETURN(rc);
+       }
+       default: {
+               int err;
+
+               if (LLIOC_STOP ==
+                    ll_iocontrol_call(inode, file, cmd, arg, &err))
+                       RETURN(err);
+
+               RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
+                                    (void *)arg));
+       }
+       }
+}
+
+loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       loff_t retval, eof = 0;
+
+       ENTRY;
+       retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
+                          (origin == SEEK_CUR) ? file->f_pos : 0);
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
+              inode->i_ino, inode->i_generation, inode, retval, retval,
+              origin);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
+
+       if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
+               retval = ll_glimpse_size(inode);
+               if (retval != 0)
+                       RETURN(retval);
+               eof = i_size_read(inode);
+       }
+
+       retval = ll_generic_file_llseek_size(file, offset, origin,
+                                         ll_file_maxbytes(inode), eof);
+       RETURN(retval);
+}
+
+int ll_flush(struct file *file, fl_owner_t id)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       int rc, err;
+
+       LASSERT(!S_ISDIR(inode->i_mode));
+
+       /* catch async errors that were recorded back when async writeback
+        * failed for pages in this mapping. */
+       rc = lli->lli_async_rc;
+       lli->lli_async_rc = 0;
+       err = lov_read_and_clear_async_rc(lli->lli_clob);
+       if (rc == 0)
+               rc = err;
+
+       /* The application has already been told about the write failure.
+        * Do not report it again. */
+       if (fd->fd_write_failed)
+               return 0;
+       return rc ? -EIO : 0;
+}
+
+/**
+ * Called to make sure a portion of the file has been written out.
+ * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
+ *
+ * Return how many pages have been written.
+ */
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+                      enum cl_fsync_mode mode, int ignore_layout)
+{
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       struct cl_io *io;
+       struct obd_capa *capa = NULL;
+       struct cl_fsync_io *fio;
+       int result;
+       ENTRY;
+
+       if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
+           mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
+               RETURN(-EINVAL);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = cl_i2info(inode)->lli_clob;
+       io->ci_ignore_layout = ignore_layout;
+
+       /* initialize parameters for sync */
+       fio = &io->u.ci_fsync;
+       fio->fi_capa = capa;
+       fio->fi_start = start;
+       fio->fi_end = end;
+       fio->fi_fid = ll_inode2fid(inode);
+       fio->fi_mode = mode;
+       fio->fi_nr_written = 0;
+
+       if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
+               result = cl_io_loop(env, io);
+       else
+               result = io->ci_result;
+       if (result == 0)
+               result = fio->fi_nr_written;
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       capa_put(capa);
+
+       RETURN(result);
+}
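+
+/*
+ * Usage sketch (kernel side, matching the call in ll_fsync() below): flush
+ * the whole file and send OST_SYNC RPCs; a non-negative result is the number
+ * of pages written.
+ *
+ *	int nr = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+ *				    CL_FSYNC_ALL, 0);
+ *	if (nr < 0)
+ *		... error ...
+ */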
+
+/*
+ * The fsync entry point in this kernel no longer receives a dentry, so it
+ * is taken from file->f_dentry below.
+ */
+
+int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct dentry *dentry = file->f_dentry;
+       struct inode *inode = dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ptlrpc_request *req;
+       struct obd_capa *oc;
+       int rc, err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
+
+       rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       mutex_lock(&inode->i_mutex);
+
+       /* catch async errors that were recorded back when async writeback
+        * failed for pages in this mapping. */
+       if (!S_ISDIR(inode->i_mode)) {
+               err = lli->lli_async_rc;
+               lli->lli_async_rc = 0;
+               if (rc == 0)
+                       rc = err;
+               err = lov_read_and_clear_async_rc(lli->lli_clob);
+               if (rc == 0)
+                       rc = err;
+       }
+
+       oc = ll_mdscapa_get(inode);
+       err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
+                     &req);
+       capa_put(oc);
+       if (!rc)
+               rc = err;
+       if (!err)
+               ptlrpc_req_finished(req);
+
+       if (datasync && S_ISREG(inode->i_mode)) {
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+                               CL_FSYNC_ALL, 0);
+               if (rc == 0 && err < 0)
+                       rc = err;
+               if (rc < 0)
+                       fd->fd_write_failed = true;
+               else
+                       fd->fd_write_failed = false;
+       }
+
+       mutex_unlock(&inode->i_mutex);
+       RETURN(rc);
+}
+
+int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
+                                          .ei_cb_cp =ldlm_flock_completion_ast,
+                                          .ei_cbdata = file_lock };
+       struct md_op_data *op_data;
+       struct lustre_handle lockh = {0};
+       ldlm_policy_data_t flock = {{0}};
+       int flags = 0;
+       int rc;
+       int rc2 = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
+              inode->i_ino, file_lock);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
+
+       if (file_lock->fl_flags & FL_FLOCK) {
+               LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
+               /* flocks are whole-file locks */
+               flock.l_flock.end = OFFSET_MAX;
+               /* For flocks the owner is determined by the local file
+                * descriptor */
+               flock.l_flock.owner = (unsigned long)file_lock->fl_file;
+       } else if (file_lock->fl_flags & FL_POSIX) {
+               flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
+               flock.l_flock.start = file_lock->fl_start;
+               flock.l_flock.end = file_lock->fl_end;
+       } else {
+               RETURN(-EINVAL);
+       }
+       flock.l_flock.pid = file_lock->fl_pid;
+
+       /* Somewhat ugly workaround for svc lockd.
+        * lockd installs a custom fl_lmops->lm_compare_owner that checks
+        * that the fl_owner is the same (which it apparently always is
+        * between lockd processes on the local node) and then compares pids.
+        * As such we assign the pid to the owner field to make it all work;
+        * conflicts with normal locks are unlikely since the pid space and
+        * the pointer space for current->files do not intersect */
+       if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
+               flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
+
+       switch (file_lock->fl_type) {
+       case F_RDLCK:
+               einfo.ei_mode = LCK_PR;
+               break;
+       case F_UNLCK:
+               /* An unlock request may or may not have any relation to
+                * existing locks so we may not be able to pass a lock handle
+                * via a normal ldlm_lock_cancel() request. The request may even
+                * unlock a byte range in the middle of an existing lock. In
+                * order to process an unlock request we need all of the same
+                * information that is given with a normal read or write record
+                * lock request. To avoid creating another ldlm unlock (cancel)
+                * message we'll treat a LCK_NL flock request as an unlock. */
+               einfo.ei_mode = LCK_NL;
+               break;
+       case F_WRLCK:
+               einfo.ei_mode = LCK_PW;
+               break;
+       default:
+               CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
+                       file_lock->fl_type);
+               RETURN(-ENOTSUPP);
+       }
+
+       switch (cmd) {
+       case F_SETLKW:
+#ifdef F_SETLKW64
+       case F_SETLKW64:
+#endif
+               flags = 0;
+               break;
+       case F_SETLK:
+#ifdef F_SETLK64
+       case F_SETLK64:
+#endif
+               flags = LDLM_FL_BLOCK_NOWAIT;
+               break;
+       case F_GETLK:
+#ifdef F_GETLK64
+       case F_GETLK64:
+#endif
+               flags = LDLM_FL_TEST_LOCK;
+               /* Save the old mode so that if the mode in the lock changes we
+                * can decrement the appropriate reader or writer refcount. */
+               file_lock->fl_type = einfo.ei_mode;
+               break;
+       default:
+               CERROR("unknown fcntl lock command: %d\n", cmd);
+               RETURN(-EINVAL);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
+              "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
+              flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
+
+       rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+                       op_data, &lockh, &flock, 0, NULL /* req */, flags);
+
+       if ((file_lock->fl_flags & FL_FLOCK) &&
+           (rc == 0 || file_lock->fl_type == F_UNLCK))
+               rc2  = flock_lock_file_wait(file, file_lock);
+       if ((file_lock->fl_flags & FL_POSIX) &&
+           (rc == 0 || file_lock->fl_type == F_UNLCK) &&
+           !(flags & LDLM_FL_TEST_LOCK))
+               rc2  = posix_lock_file_wait(file, file_lock);
+
+       if (rc2 && file_lock->fl_type != F_UNLCK) {
+               einfo.ei_mode = LCK_NL;
+               md_enqueue(sbi->ll_md_exp, &einfo, NULL,
+                       op_data, &lockh, &flock, 0, NULL /* req */, flags);
+               rc = rc2;
+       }
+
+       ll_finish_md_op_data(op_data);
+
+       RETURN(rc);
+}
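+
+/*
+ * Usage sketch (hypothetical userspace caller): both BSD flock() and POSIX
+ * fcntl() byte-range locks funnel into the handler above; F_SETLK maps to a
+ * non-blocking enqueue (LDLM_FL_BLOCK_NOWAIT), F_SETLKW to a blocking one.
+ *
+ *	struct flock fl = {
+ *		.l_type   = F_WRLCK,
+ *		.l_whence = SEEK_SET,
+ *		.l_start  = 0,
+ *		.l_len    = 4096,	// lock the first 4 KiB
+ *	};
+ *	fcntl(fd, F_SETLKW, &fl);
+ */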
+
+int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
+{
+       ENTRY;
+
+       RETURN(-ENOSYS);
+}
+
+/**
+ * Test whether locks matching bits and l_req_mode are acquired:
+ * - the bits can be spread across different locks
+ * - bits that are found are cleared from *bits
+ * - bits that are not found are kept in *bits
+ * \param inode [IN]
+ * \param bits [IN,OUT] searched lock bits
+ * \param l_req_mode [IN] searched lock mode
+ * \retval boolean, true iff all bits are found
+ */
+int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
+{
+       struct lustre_handle lockh;
+       ldlm_policy_data_t policy;
+       ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
+                               (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
+       struct lu_fid *fid;
+       __u64 flags;
+       int i;
+       ENTRY;
+
+       if (!inode)
+              RETURN(0);
+
+       fid = &ll_i2info(inode)->lli_fid;
+       CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
+              ldlm_lockname[mode]);
+
+       flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+       for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
+               policy.l_inodebits.bits = *bits & (1 << i);
+               if (policy.l_inodebits.bits == 0)
+                       continue;
+
+               if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
+                                 &policy, mode, &lockh)) {
+                       struct ldlm_lock *lock;
+
+                       lock = ldlm_handle2lock(&lockh);
+                       if (lock) {
+                               *bits &=
+                                     ~(lock->l_policy_data.l_inodebits.bits);
+                               LDLM_LOCK_PUT(lock);
+                       } else {
+                               *bits &= ~policy.l_inodebits.bits;
+                       }
+               }
+       }
+       RETURN(*bits == 0);
+}
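+
+/*
+ * Usage sketch (kernel side, mirroring the callers below): note that @bits
+ * is both input and output; on return it holds only the bits that were NOT
+ * covered by granted locks.
+ *
+ *	__u64 bits = MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP;
+ *
+ *	if (ll_have_md_lock(inode, &bits, LCK_MINMODE))
+ *		... both bits are covered, cached metadata can be trusted ...
+ */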
+
+ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+                           struct lustre_handle *lockh, __u64 flags)
+{
+       ldlm_policy_data_t policy = { .l_inodebits = {bits}};
+       struct lu_fid *fid;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       fid = &ll_i2info(inode)->lli_fid;
+       CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
+
+       rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
+                          fid, LDLM_IBITS, &policy,
+                          LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
+       RETURN(rc);
+}
+
+static int ll_inode_revalidate_fini(struct inode *inode, int rc)
+{
+       /* Already unlinked. Just update nlink and return success */
+       if (rc == -ENOENT) {
+               clear_nlink(inode);
+               /* This path cannot be hit for regular files except in
+                * the case of obscure races, so there is no need to
+                * validate the size. */
+               if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+                       return 0;
+       } else if (rc != 0) {
+               CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
+                      ll_get_fsname(inode->i_sb, NULL, 0),
+                      PFID(ll_inode2fid(inode)), rc);
+       }
+
+       return rc;
+}
+
+int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+                            __u64 ibits)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *req = NULL;
+       struct obd_export *exp;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
+              inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
+
+       exp = ll_i2mdexp(inode);
+
+       /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
+        *      But in the CMD case it caused some lock issues; this should be
+        *      fixed with the new CMD ibits lock. See bug 12718 */
+       if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
+               struct lookup_intent oit = { .it_op = IT_GETATTR };
+               struct md_op_data *op_data;
+
+               if (ibits == MDS_INODELOCK_LOOKUP)
+                       oit.it_op = IT_LOOKUP;
+
+               /* Call getattr by fid, so do not provide name at all. */
+               op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
+                                            dentry->d_inode, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               oit.it_create_mode |= M_CHECK_STALE;
+               rc = md_intent_lock(exp, op_data, NULL, 0,
+                                   /* we are not interested in name
+                                      based lookup */
+                                   &oit, 0, &req,
+                                   ll_md_blocking_ast, 0);
+               ll_finish_md_op_data(op_data);
+               oit.it_create_mode &= ~M_CHECK_STALE;
+               if (rc < 0) {
+                       rc = ll_inode_revalidate_fini(inode, rc);
+                       GOTO (out, rc);
+               }
+
+               rc = ll_revalidate_it_finish(req, &oit, dentry);
+               if (rc != 0) {
+                       ll_intent_release(&oit);
+                       GOTO(out, rc);
+               }
+
+               /* Unlinked? Unhash the dentry so it is not picked up later by
+                * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
+                * here because we need to preserve get_cwd functionality on
+                * 2.6. Bug 10503 */
+               if (!dentry->d_inode->i_nlink)
+                       d_lustre_invalidate(dentry, 0);
+
+               ll_lookup_finish_locks(&oit, dentry);
+       } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
+               struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+               obd_valid valid = OBD_MD_FLGETATTR;
+               struct md_op_data *op_data;
+               int ealen = 0;
+
+               if (S_ISREG(inode->i_mode)) {
+                       rc = ll_get_max_mdsize(sbi, &ealen);
+                       if (rc)
+                               RETURN(rc);
+                       valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
+               }
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                            0, ealen, LUSTRE_OPC_ANY,
+                                            NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               op_data->op_valid = valid;
+               /* When OBD_CONNECT_ATTRFID is not supported, we can't find a
+                * capa for this inode, because we only keep the capas of
+                * directories fresh. */
+               rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+               ll_finish_md_op_data(op_data);
+               if (rc) {
+                       rc = ll_inode_revalidate_fini(inode, rc);
+                       RETURN(rc);
+               }
+
+               rc = ll_prep_inode(&inode, req, NULL, NULL);
+       }
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+                          __u64 ibits)
+{
+       struct inode *inode = dentry->d_inode;
+       int rc;
+       ENTRY;
+
+       rc = __ll_inode_revalidate_it(dentry, it, ibits);
+       if (rc != 0)
+               RETURN(rc);
+
+       /* if the object isn't a regular file, don't validate its size */
+       if (!S_ISREG(inode->i_mode)) {
+               LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
+               LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
+               LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
+       } else {
+               rc = ll_glimpse_size(inode);
+       }
+       RETURN(rc);
+}
+
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+                 struct lookup_intent *it, struct kstat *stat)
+{
+       struct inode *inode = de->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int res = 0;
+
+       res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
+                                            MDS_INODELOCK_LOOKUP);
+       ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
+
+       if (res)
+               return res;
+
+       stat->dev = inode->i_sb->s_dev;
+       if (ll_need_32bit_api(sbi))
+               stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
+       else
+               stat->ino = inode->i_ino;
+       stat->mode = inode->i_mode;
+       stat->nlink = inode->i_nlink;
+       stat->uid = inode->i_uid;
+       stat->gid = inode->i_gid;
+       stat->rdev = inode->i_rdev;
+       stat->atime = inode->i_atime;
+       stat->mtime = inode->i_mtime;
+       stat->ctime = inode->i_ctime;
+       stat->blksize = 1 << inode->i_blkbits;
+
+       stat->size = i_size_read(inode);
+       stat->blocks = inode->i_blocks;
+
+       return 0;
+}
+
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+       struct lookup_intent it = { .it_op = IT_GETATTR };
+
+       return ll_getattr_it(mnt, de, &it, stat);
+}
+
+struct posix_acl *ll_get_acl(struct inode *inode, int type)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct posix_acl *acl = NULL;
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       /* VFS' acl_permission_check->check_acl will release the refcount */
+       acl = posix_acl_dup(lli->lli_posix_acl);
+       spin_unlock(&lli->lli_lock);
+
+       RETURN(acl);
+}
+
+int ll_inode_permission(struct inode *inode, int mask)
+{
+       int rc = 0;
+       ENTRY;
+
+#ifdef MAY_NOT_BLOCK
+       if (mask & MAY_NOT_BLOCK)
+               return -ECHILD;
+#endif
+
+       /* as the root inode is NOT validated during lookup, we need to
+        * do it here, before the permission check. */
+
+       if (inode == inode->i_sb->s_root->d_inode) {
+               struct lookup_intent it = { .it_op = IT_LOOKUP };
+
+               rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
+                                             MDS_INODELOCK_LOOKUP);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+              inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
+       if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
+               return lustre_check_remote_perm(inode, mask);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
+       rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
+
+       RETURN(rc);
+}
+
+#define READ_METHOD aio_read
+#define READ_FUNCTION ll_file_aio_read
+#define WRITE_METHOD aio_write
+#define WRITE_FUNCTION ll_file_aio_write
+
+/* -o localflock - only provides locally consistent flock locks */
+struct file_operations ll_file_operations = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush
+};
+
+struct file_operations ll_file_operations_flock = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush,
+       .flock    = ll_file_flock,
+       .lock      = ll_file_flock
+};
+
+/* These are for -o noflock - to return ENOSYS on flock calls */
+struct file_operations ll_file_operations_noflock = {
+       .read      = ll_file_read,
+       .READ_METHOD    = READ_FUNCTION,
+       .write    = ll_file_write,
+       .WRITE_METHOD   = WRITE_FUNCTION,
+       .unlocked_ioctl = ll_file_ioctl,
+       .open      = ll_file_open,
+       .release        = ll_file_release,
+       .mmap      = ll_file_mmap,
+       .llseek  = ll_file_seek,
+       .splice_read    = ll_file_splice_read,
+       .fsync    = ll_fsync,
+       .flush    = ll_flush,
+       .flock    = ll_file_noflock,
+       .lock      = ll_file_noflock
+};
+
+struct inode_operations ll_file_inode_operations = {
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
+
+/* dynamic ioctl number support routines */
+static struct llioc_ctl_data {
+       struct rw_semaphore     ioc_sem;
+       struct list_head              ioc_head;
+} llioc = {
+       __RWSEM_INITIALIZER(llioc.ioc_sem),
+       LIST_HEAD_INIT(llioc.ioc_head)
+};
+
+struct llioc_data {
+       struct list_head              iocd_list;
+       unsigned int        iocd_size;
+       llioc_callback_t        iocd_cb;
+       unsigned int        iocd_count;
+       unsigned int        iocd_cmd[0];
+};
+
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
+{
+       unsigned int size;
+       struct llioc_data *in_data = NULL;
+       ENTRY;
+
+       if (cb == NULL || cmd == NULL ||
+           count > LLIOC_MAX_CMD || count < 0)
+               RETURN(NULL);
+
+       size = sizeof(*in_data) + count * sizeof(unsigned int);
+       OBD_ALLOC(in_data, size);
+       if (in_data == NULL)
+               RETURN(NULL);
+
+       memset(in_data, 0, sizeof(*in_data));
+       in_data->iocd_size = size;
+       in_data->iocd_cb = cb;
+       in_data->iocd_count = count;
+       memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
+
+       down_write(&llioc.ioc_sem);
+       list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
+       up_write(&llioc.ioc_sem);
+
+       RETURN(in_data);
+}
+
+void ll_iocontrol_unregister(void *magic)
+{
+       struct llioc_data *tmp;
+
+       if (magic == NULL)
+               return;
+
+       down_write(&llioc.ioc_sem);
+       list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
+               if (tmp == magic) {
+                       unsigned int size = tmp->iocd_size;
+
+                       list_del(&tmp->iocd_list);
+                       up_write(&llioc.ioc_sem);
+
+                       OBD_FREE(tmp, size);
+                       return;
+               }
+       }
+       up_write(&llioc.ioc_sem);
+
+       CWARN("didn't find iocontrol register block with magic: %p\n", magic);
+}
+
+EXPORT_SYMBOL(ll_iocontrol_register);
+EXPORT_SYMBOL(ll_iocontrol_unregister);
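+
+/*
+ * Usage sketch (hypothetical external module, names invented; the callback
+ * signature is inferred from the call in ll_iocontrol_call() below):
+ * register a private ioctl command so ll_file_ioctl()'s default case will
+ * dispatch to it.
+ *
+ *	static enum llioc_iter my_cb(struct inode *inode, struct file *file,
+ *				     unsigned int cmd, unsigned long arg,
+ *				     void *data, int *rcp)
+ *	{
+ *		*rcp = 0;		// handle the command here
+ *		return LLIOC_STOP;	// stop iterating, *rcp is the result
+ *	}
+ *
+ *	static unsigned int my_cmds[] = { MY_IOC_CMD };
+ *	void *magic = ll_iocontrol_register(my_cb, 1, my_cmds);
+ *	...
+ *	ll_iocontrol_unregister(magic);
+ */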
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+                       unsigned int cmd, unsigned long arg, int *rcp)
+{
+       enum llioc_iter ret = LLIOC_CONT;
+       struct llioc_data *data;
+       int rc = -EINVAL, i;
+
+       down_read(&llioc.ioc_sem);
+       list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
+               for (i = 0; i < data->iocd_count; i++) {
+                       if (cmd != data->iocd_cmd[i])
+                               continue;
+
+                       ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
+                       break;
+               }
+
+               if (ret == LLIOC_STOP)
+                       break;
+       }
+       up_read(&llioc.ioc_sem);
+
+       if (rcp)
+               *rcp = rc;
+       return ret;
+}
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       int result;
+       ENTRY;
+
+       if (lli->lli_clob == NULL)
+               RETURN(0);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       result = cl_conf_set(env, lli->lli_clob, conf);
+       cl_env_nested_put(&nest, env);
+
+       if (conf->coc_opc == OBJECT_CONF_SET) {
+               struct ldlm_lock *lock = conf->coc_lock;
+
+               LASSERT(lock != NULL);
+               LASSERT(ldlm_has_layout(lock));
+               if (result == 0) {
+                       /* it can only be allowed to match after the layout is
+                        * applied to the inode; otherwise a false layout would
+                        * be seen. Applying the layout should happen before
+                        * dropping the intent lock. */
+                       ldlm_lock_allow_match(lock);
+               }
+       }
+       RETURN(result);
+}
+
+/* Fetch layout from MDT with getxattr request, if it's not ready yet */
+static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_capa *oc;
+       struct ptlrpc_request *req;
+       struct mdt_body *body;
+       void *lvbdata;
+       void *lmm;
+       int lmmsize;
+       int rc;
+       ENTRY;
+
+       if (lock->l_lvb_data != NULL)
+               RETURN(0);
+
+       /* if the layout lock was granted right away, the layout is returned
+        * within the DLM_LVB of the dlm reply; otherwise, if the lock was
+        * ever blocked and then granted via completion AST, we have to fetch
+        * the layout here. Note that we can't use the LVB buffer in the
+        * completion AST because it is not large enough */
+       oc = ll_mdscapa_get(inode);
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc == 0)
+               rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                               OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
+                               lmmsize, 0, &req);
+       capa_put(oc);
+       if (rc < 0)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL || body->eadatasize > lmmsize)
+               GOTO(out, rc = -EPROTO);
+
+       lmmsize = body->eadatasize;
+       if (lmmsize == 0) /* empty layout */
+               GOTO(out, rc = 0);
+
+       lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
+       if (lmm == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       OBD_ALLOC_LARGE(lvbdata, lmmsize);
+       if (lvbdata == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       memcpy(lvbdata, lmm, lmmsize);
+       lock_res_and_lock(lock);
+       if (lock->l_lvb_data == NULL) {
+               lock->l_lvb_data = lvbdata;
+               lock->l_lvb_len = lmmsize;
+               lvbdata = NULL;
+       }
+       unlock_res_and_lock(lock);
+
+       if (lvbdata != NULL)
+               OBD_FREE_LARGE(lvbdata, lmmsize);
+       EXIT;
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/**
+ * Apply the layout to the inode. Layout lock is held and will be released
+ * in this function.
+ */
+static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
+                               struct inode *inode, __u32 *gen, bool reconf)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info    *sbi = ll_i2sbi(inode);
+       struct ldlm_lock *lock;
+       struct lustre_md md = { NULL };
+       struct cl_object_conf conf;
+       int rc = 0;
+       bool lvb_ready;
+       bool wait_layout = false;
+       ENTRY;
+
+       LASSERT(lustre_handle_is_used(lockh));
+
+       lock = ldlm_handle2lock(lockh);
+       LASSERT(lock != NULL);
+       LASSERT(ldlm_has_layout(lock));
+
+       LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
+               inode, PFID(&lli->lli_fid), reconf);
+
+       /* in case this is a cached lock, reinstate it with the new inode */
+       md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
+
+       lock_res_and_lock(lock);
+       lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
+       unlock_res_and_lock(lock);
+       /* checking lvb_ready is racy but this is okay. The worst case is
+        * that multiple processes may configure the file at the same time. */
+       if (lvb_ready || !reconf) {
+               rc = -ENODATA;
+               if (lvb_ready) {
+                       /* layout_gen must be valid if the layout lock is not
+                        * cancelled and the stripe has already been set */
+                       *gen = lli->lli_layout_gen;
+                       rc = 0;
+               }
+               GOTO(out, rc);
+       }
+
+       rc = ll_layout_fetch(inode, lock);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* for a layout lock, the lmm is returned in the lock's lvb.
+        * lvb_data is immutable while the lock is held, so it's safe to
+        * access it without the res lock. See ldlm_lock_decref_internal()
+        * for the conditions under which the lvb_data of a layout lock is
+        * freed */
+       if (lock->l_lvb_data != NULL) {
+               rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
+                                 lock->l_lvb_data, lock->l_lvb_len);
+               if (rc >= 0) {
+                       *gen = LL_LAYOUT_GEN_EMPTY;
+                       if (md.lsm != NULL)
+                               *gen = md.lsm->lsm_layout_gen;
+                       rc = 0;
+               } else {
+                       CERROR("%s: file "DFID" unpackmd error: %d\n",
+                               ll_get_fsname(inode->i_sb, NULL, 0),
+                               PFID(&lli->lli_fid), rc);
+               }
+       }
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* Set the layout on the file. This is unlikely to fail, as the old
+        * layout was surely eliminated. */
+       memset(&conf, 0, sizeof conf);
+       conf.coc_opc = OBJECT_CONF_SET;
+       conf.coc_inode = inode;
+       conf.coc_lock = lock;
+       conf.u.coc_md = &md;
+       rc = ll_layout_conf(inode, &conf);
+
+       if (md.lsm != NULL)
+               obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+
+       /* refresh layout failed, need to wait */
+       wait_layout = rc == -EBUSY;
+       EXIT;
+
+out:
+       LDLM_LOCK_PUT(lock);
+       ldlm_lock_decref(lockh, mode);
+
+       /* wait for IO to complete if it's still being used. */
+       if (wait_layout) {
+               CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
+                       ll_get_fsname(inode->i_sb, NULL, 0),
+                       inode, PFID(&lli->lli_fid));
+
+               memset(&conf, 0, sizeof conf);
+               conf.coc_opc = OBJECT_CONF_WAIT;
+               conf.coc_inode = inode;
+               rc = ll_layout_conf(inode, &conf);
+               if (rc == 0)
+                       rc = -EAGAIN;
+
+               CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
+                       PFID(&lli->lli_fid), rc);
+       }
+       RETURN(rc);
+}
+
+/**
+ * This function checks whether a LAYOUT lock exists on the client side,
+ * and enqueues one if none is cached.
+ *
+ * This function does not hold the layout lock, so it may be revoked at any
+ * time after this function returns. Any operations that depend on the layout
+ * should be redone in that case.
+ *
+ * This function should be called before lov_io_init() to get an up-to-date
+ * layout version; the caller should save the version number and, after IO
+ * has finished, call this function again to verify that the layout was not
+ * changed while the IO was in flight.
+ */
+int ll_layout_refresh(struct inode *inode, __u32 *gen)
+{
+       struct ll_inode_info  *lli = ll_i2info(inode);
+       struct ll_sb_info     *sbi = ll_i2sbi(inode);
+       struct md_op_data     *op_data;
+       struct lookup_intent   it;
+       struct lustre_handle   lockh;
+       ldlm_mode_t            mode;
+       struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
+                                          .ei_mode = LCK_CR,
+                                          .ei_cb_bl = ll_md_blocking_ast,
+                                          .ei_cb_cp = ldlm_completion_ast,
+                                          .ei_cbdata = NULL };
+       int rc;
+       ENTRY;
+
+       *gen = lli->lli_layout_gen;
+       if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+               RETURN(0);
+
+       /* sanity checks */
+       LASSERT(fid_is_sane(ll_inode2fid(inode)));
+       LASSERT(S_ISREG(inode->i_mode));
+
+       /* The layout lock is usually cached locally, so try to match it
+        * before grabbing the layout lock mutex. */
+       mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+       if (mode != 0) { /* hit cached lock */
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
+               if (rc == 0)
+                       RETURN(0);
+
+               /* Better to hold lli_layout_mutex and try again; otherwise
+                * there is a starvation problem. */
+       }
+
+       /* take layout lock mutex to enqueue layout lock exclusively. */
+       mutex_lock(&lli->lli_layout_mutex);
+
+again:
+       /* try again. Maybe somebody else has done this. */
+       mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
+       if (mode != 0) { /* hit cached lock */
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               if (rc == -EAGAIN)
+                       goto again;
+
+               mutex_unlock(&lli->lli_layout_mutex);
+               RETURN(rc);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
+                       0, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data)) {
+               mutex_unlock(&lli->lli_layout_mutex);
+               RETURN(PTR_ERR(op_data));
+       }
+
+       /* have to enqueue one */
+       memset(&it, 0, sizeof(it));
+       it.it_op = IT_LAYOUT;
+       lockh.cookie = 0ULL;
+
+       LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
+                       ll_get_fsname(inode->i_sb, NULL, 0), inode,
+                       PFID(&lli->lli_fid));
+
+       rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
+                       NULL, 0, NULL, 0);
+       if (it.d.lustre.it_data != NULL)
+               ptlrpc_req_finished(it.d.lustre.it_data);
+       it.d.lustre.it_data = NULL;
+
+       ll_finish_md_op_data(op_data);
+
+       mode = it.d.lustre.it_lock_mode;
+       it.d.lustre.it_lock_mode = 0;
+       ll_intent_drop_lock(&it);
+
+       if (rc == 0) {
+               /* set lock data in case this is a new lock */
+               ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
+               rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
+               if (rc == -EAGAIN)
+                       goto again;
+       }
+       mutex_unlock(&lli->lli_layout_mutex);
+
+       RETURN(rc);
+}
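+
+/*
+ * Illustrative sketch (not from this patch): the comment above describes the
+ * intended calling pattern. A hypothetical caller, with a made-up do_io()
+ * helper, might look like:
+ *
+ *     __u32 gen, gen2;
+ *     int rc;
+ *
+ *     do {
+ *             rc = ll_layout_refresh(inode, &gen);
+ *             if (rc == 0)
+ *                     rc = do_io(inode);       (IO under layout @gen)
+ *             if (rc == 0)
+ *                     rc = ll_layout_refresh(inode, &gen2);
+ *     } while (rc == 0 && gen != gen2);        (redo if layout changed)
+ */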
diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c
new file mode 100644
index 0000000..b6fd959
--- /dev/null
@@ -0,0 +1,661 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_capa.c
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/file.h>
+#include <linux/kmod.h>
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/* Via obd_capa.c_list, a client capa may be in one of three places:
+ * 1. ll_capa_list.
+ * 2. ll_idle_capas.
+ * 3. standalone: just allocated.
+ */
+
+/* capas for oss writeback and those that failed to renew */
+static LIST_HEAD(ll_idle_capas);
+static struct ptlrpc_thread ll_capa_thread;
+static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT];
+
+/* llite capa renewal timer */
+struct timer_list ll_capa_timer;
+/* for debug: indicates whether capa on llite is enabled or not */
+static atomic_t ll_capa_debug = ATOMIC_INIT(0);
+static unsigned long long ll_capa_renewed = 0;
+static unsigned long long ll_capa_renewal_noent = 0;
+static unsigned long long ll_capa_renewal_failed = 0;
+static unsigned long long ll_capa_renewal_retries = 0;
+
+static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry)
+{
+       if (cfs_time_before(expiry, ll_capa_timer.expires) ||
+           !timer_pending(&ll_capa_timer)) {
+               mod_timer(&ll_capa_timer, expiry);
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                          "ll_capa_timer update: %lu/%lu by", expiry, jiffies);
+       }
+}
+
+static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa)
+{
+       return cfs_time_sub(ocapa->c_expiry,
+                           cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2);
+}
+
+static inline int capa_is_to_expire(struct obd_capa *ocapa)
+{
+       return cfs_time_beforeq(capa_renewal_time(ocapa), cfs_time_current());
+}
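+
+/* For example, if lc_timeout = 600 seconds and c_expiry is the issue time
+ * plus lc_timeout, capa_renewal_time() evaluates to c_expiry - 300 seconds,
+ * i.e. renewal starts once half of the capa lifetime remains, leaving the
+ * other half for retries. */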
+
+static inline int have_expired_capa(void)
+{
+       struct obd_capa *ocapa = NULL;
+       int expired = 0;
+
+       /* if ll_capa_list has a client capa about to expire, or ll_idle_capas
+        * has an expired capa, return 1.
+        */
+       spin_lock(&capa_lock);
+       if (!list_empty(ll_capa_list)) {
+               ocapa = list_entry(ll_capa_list->next, struct obd_capa,
+                                      c_list);
+               expired = capa_is_to_expire(ocapa);
+               if (!expired)
+                       update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       } else if (!list_empty(&ll_idle_capas)) {
+               ocapa = list_entry(ll_idle_capas.next, struct obd_capa,
+                                      c_list);
+               expired = capa_is_expired(ocapa);
+               if (!expired)
+                       update_capa_timer(ocapa, ocapa->c_expiry);
+       }
+       spin_unlock(&capa_lock);
+
+       if (expired)
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired");
+       return expired;
+}
+
+static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head)
+{
+       struct obd_capa *tmp;
+       struct list_head *before = NULL;
+
+       /* TODO: client capas are sorted by expiry; this could be optimized */
+       list_for_each_entry_reverse(tmp, head, c_list) {
+               if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) {
+                       before = &tmp->c_list;
+                       break;
+               }
+       }
+
+       LASSERT(&ocapa->c_list != before);
+       list_add(&ocapa->c_list, before ?: head);
+}
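+
+/* For example, if the list already holds capas expiring at t=100 and t=300,
+ * a capa expiring at t=200 is placed between them: the list is walked in
+ * reverse and the new entry is added after the first entry whose expiry is
+ * earlier than or equal to its own, keeping the list in ascending expiry
+ * order. */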
+
+static inline int obd_capa_open_count(struct obd_capa *oc)
+{
+       struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode);
+       return atomic_read(&lli->lli_open_count);
+}
+
+static void ll_delete_capa(struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode);
+
+       if (capa_for_mds(&ocapa->c_capa)) {
+               LASSERT(lli->lli_mds_capa == ocapa);
+               lli->lli_mds_capa = NULL;
+       } else if (capa_for_oss(&ocapa->c_capa)) {
+               list_del_init(&ocapa->u.cli.lli_list);
+       }
+
+       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client");
+       list_del_init(&ocapa->c_list);
+       capa_count[CAPA_SITE_CLIENT]--;
+       /* release the ref taken at allocation */
+       capa_put(ocapa);
+}
+
+/* Three places where a client capa is deleted:
+ * 1. capa_thread_main(), the main place to delete expired capas.
+ * 2. ll_clear_inode_capas() in ll_clear_inode().
+ * 3. ll_truncate_free_capa(), which deletes the truncate capa explicitly in
+ *    ll_setattr_ost().
+ */
+static int capa_thread_main(void *unused)
+{
+       struct obd_capa *ocapa, *tmp, *next;
+       struct inode *inode = NULL;
+       struct l_wait_info lwi = { 0 };
+       int rc;
+       ENTRY;
+
+       thread_set_flags(&ll_capa_thread, SVC_RUNNING);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+
+       while (1) {
+               l_wait_event(ll_capa_thread.t_ctl_waitq,
+                            !thread_is_running(&ll_capa_thread) ||
+                            have_expired_capa(),
+                            &lwi);
+
+               if (!thread_is_running(&ll_capa_thread))
+                       break;
+
+               next = NULL;
+
+               spin_lock(&capa_lock);
+               list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) {
+                       __u64 ibits;
+
+                       LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC);
+
+                       if (!capa_is_to_expire(ocapa)) {
+                               next = ocapa;
+                               break;
+                       }
+
+                       list_del_init(&ocapa->c_list);
+
+                       /* for an MDS capability, only renew those which
+                        * belong to a dir, whose inode is open, or for which
+                        * the client holds a LOOKUP lock.
+                        */
+                       /* ibits may be changed by ll_have_md_lock() so we have
+                        * to set it each time */
+                       ibits = MDS_INODELOCK_LOOKUP;
+                       if (capa_for_mds(&ocapa->c_capa) &&
+                           !S_ISDIR(ocapa->u.cli.inode->i_mode) &&
+                           obd_capa_open_count(ocapa) == 0 &&
+                           !ll_have_md_lock(ocapa->u.cli.inode,
+                                            &ibits, LCK_MINMODE)) {
+                               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                          "skip renewal for");
+                               sort_add_capa(ocapa, &ll_idle_capas);
+                               continue;
+                       }
+
+                       /* for an OSS capability, only renew those whose
+                        * inode is open.
+                        */
+                       if (capa_for_oss(&ocapa->c_capa) &&
+                           obd_capa_open_count(ocapa) == 0) {
+                               /* an oss capa with open count == 0 won't be
+                                * renewed; move it to the idle list */
+                               sort_add_capa(ocapa, &ll_idle_capas);
+                               continue;
+                       }
+
+                       /* NB iput() is in ll_update_capa() */
+                       inode = igrab(ocapa->u.cli.inode);
+                       if (inode == NULL) {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "igrab failed for");
+                               continue;
+                       }
+
+                       capa_get(ocapa);
+                       ll_capa_renewed++;
+                       spin_unlock(&capa_lock);
+                       rc = md_renew_capa(ll_i2mdexp(inode), ocapa,
+                                          ll_update_capa);
+                       spin_lock(&capa_lock);
+                       if (rc) {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renew failed: %d", rc);
+                               ll_capa_renewal_failed++;
+                       }
+               }
+
+               if (next)
+                       update_capa_timer(next, capa_renewal_time(next));
+
+               list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas,
+                                            c_list) {
+                       if (!capa_is_expired(ocapa)) {
+                               if (!next)
+                                       update_capa_timer(ocapa,
+                                                         ocapa->c_expiry);
+                               break;
+                       }
+
+                       if (atomic_read(&ocapa->c_refc) > 1) {
+                               DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                          "expired(c_refc %d), don't release",
+                                          atomic_read(&ocapa->c_refc));
+                               /* don't try to renew any more */
+                               list_del_init(&ocapa->c_list);
+                               continue;
+                       }
+
+                       /* expired capa is released. */
+                       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired");
+                       ll_delete_capa(ocapa);
+               }
+
+               spin_unlock(&capa_lock);
+       }
+
+       thread_set_flags(&ll_capa_thread, SVC_STOPPED);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+       RETURN(0);
+}
+
+void ll_capa_timer_callback(unsigned long unused)
+{
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+}
+
+int ll_capa_thread_start(void)
+{
+       task_t *task;
+       ENTRY;
+
+       init_waitqueue_head(&ll_capa_thread.t_ctl_waitq);
+
+       task = kthread_run(capa_thread_main, NULL, "ll_capa");
+       if (IS_ERR(task)) {
+               CERROR("cannot start expired capa thread: rc %ld\n",
+                       PTR_ERR(task));
+               RETURN(PTR_ERR(task));
+       }
+       wait_event(ll_capa_thread.t_ctl_waitq,
+                      thread_is_running(&ll_capa_thread));
+
+       RETURN(0);
+}
+
+void ll_capa_thread_stop(void)
+{
+       thread_set_flags(&ll_capa_thread, SVC_STOPPING);
+       wake_up(&ll_capa_thread.t_ctl_waitq);
+       wait_event(ll_capa_thread.t_ctl_waitq,
+                      thread_is_stopped(&ll_capa_thread));
+}
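+
+/*
+ * A minimal lifecycle sketch (assumed wiring, not from this file): the timer
+ * callback only wakes the thread; all renewal work runs in
+ * capa_thread_main() context. Module init/fini code would do roughly:
+ *
+ *     init_timer(&ll_capa_timer);
+ *     ll_capa_timer.function = ll_capa_timer_callback;
+ *     rc = ll_capa_thread_start();
+ *     ...
+ *     del_timer(&ll_capa_timer);
+ *     ll_capa_thread_stop();
+ */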
+
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+       int found = 0;
+
+       ENTRY;
+
+       if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
+               RETURN(NULL);
+
+       LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
+               opc == CAPA_OPC_OSS_TRUNC);
+
+       spin_lock(&capa_lock);
+       list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+               if (capa_is_expired(ocapa))
+                       continue;
+               if ((opc & CAPA_OPC_OSS_WRITE) &&
+                   capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) {
+                       found = 1;
+                       break;
+               } else if ((opc & CAPA_OPC_OSS_READ) &&
+                          capa_opc_supported(&ocapa->c_capa,
+                                             CAPA_OPC_OSS_READ)) {
+                       found = 1;
+                       break;
+               } else if ((opc & CAPA_OPC_OSS_TRUNC) &&
+                          capa_opc_supported(&ocapa->c_capa, opc)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+                                 ll_inode2fid(inode)));
+               LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+               capa_get(ocapa);
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+       } else {
+               ocapa = NULL;
+
+               if (atomic_read(&ll_capa_debug)) {
+                       CERROR("no capability for "DFID" opc "LPX64"\n",
+                              PFID(&lli->lli_fid), opc);
+                       atomic_set(&ll_capa_debug, 0);
+               }
+       }
+       spin_unlock(&capa_lock);
+
+       RETURN(ocapa);
+}
+EXPORT_SYMBOL(ll_osscapa_get);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+       ENTRY;
+
+       LASSERT(inode != NULL);
+
+       if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
+               RETURN(NULL);
+
+       spin_lock(&capa_lock);
+       ocapa = capa_get(lli->lli_mds_capa);
+       spin_unlock(&capa_lock);
+       if (!ocapa && atomic_read(&ll_capa_debug)) {
+               CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid));
+               atomic_set(&ll_capa_debug, 0);
+       }
+
+       RETURN(ocapa);
+}
+
+static struct obd_capa *do_add_mds_capa(struct inode *inode,
+                                       struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *old = lli->lli_mds_capa;
+       struct lustre_capa *capa = &ocapa->c_capa;
+
+       if (!old) {
+               ocapa->u.cli.inode = inode;
+               lli->lli_mds_capa = ocapa;
+               capa_count[CAPA_SITE_CLIENT]++;
+
+               DEBUG_CAPA(D_SEC, capa, "add MDS");
+       } else {
+               spin_lock(&old->c_lock);
+               old->c_capa = *capa;
+               spin_unlock(&old->c_lock);
+
+               DEBUG_CAPA(D_SEC, capa, "update MDS");
+
+               capa_put(ocapa);
+               ocapa = old;
+       }
+       return ocapa;
+}
+
+static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa;
+
+       /* inside capa_lock */
+       list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) {
+               if ((capa_opc(&ocapa->c_capa) & opc) != opc)
+                       continue;
+
+               LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa),
+                                 ll_inode2fid(inode)));
+               LASSERT(ocapa->c_site == CAPA_SITE_CLIENT);
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client");
+               return ocapa;
+       }
+
+       return NULL;
+}
+
+static inline void inode_add_oss_capa(struct inode *inode,
+                                     struct obd_capa *ocapa)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *tmp;
+       struct list_head *next = NULL;
+
+       /* capas are sorted in lli_oss_capas so a lookup can always find the
+        * latest one */
+       list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) {
+               if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) {
+                       next = &tmp->u.cli.lli_list;
+                       break;
+               }
+       }
+       LASSERT(&ocapa->u.cli.lli_list != next);
+       list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas);
+}
+
+static struct obd_capa *do_add_oss_capa(struct inode *inode,
+                                       struct obd_capa *ocapa)
+{
+       struct obd_capa *old;
+       struct lustre_capa *capa = &ocapa->c_capa;
+
+       LASSERTF(S_ISREG(inode->i_mode),
+                "inode has oss capa, but not regular file, mode: %d\n",
+                inode->i_mode);
+
+       /* FIXME: can't replace it so easily with fine-grained opc */
+       old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY);
+       if (!old) {
+               ocapa->u.cli.inode = inode;
+               INIT_LIST_HEAD(&ocapa->u.cli.lli_list);
+               capa_count[CAPA_SITE_CLIENT]++;
+
+               DEBUG_CAPA(D_SEC, capa, "add OSS");
+       } else {
+               spin_lock(&old->c_lock);
+               old->c_capa = *capa;
+               spin_unlock(&old->c_lock);
+
+               DEBUG_CAPA(D_SEC, capa, "update OSS");
+
+               capa_put(ocapa);
+               ocapa = old;
+       }
+
+       inode_add_oss_capa(inode, ocapa);
+       return ocapa;
+}
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa)
+{
+       spin_lock(&capa_lock);
+       ocapa = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, ocapa) :
+                                              do_add_oss_capa(inode, ocapa);
+
+       /* truncate capa won't be renewed */
+       if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) {
+               set_capa_expiry(ocapa);
+               list_del_init(&ocapa->c_list);
+               sort_add_capa(ocapa, ll_capa_list);
+
+               update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       }
+
+       spin_unlock(&capa_lock);
+
+       atomic_set(&ll_capa_debug, 1);
+       return ocapa;
+}
+
+static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay)
+{
+       /* NB: set a fake expiry for this capa to prevent it from being
+        * renewed too soon */
+       oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay));
+}
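+
+/* E.g. delay_capa_renew(ocapa, 120) pushes c_expiry two minutes into the
+ * future, so capa_renewal_time() will not pick this capa again right away;
+ * this is how the -EIO retry below is throttled. */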
+
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa)
+{
+       struct inode *inode = ocapa->u.cli.inode;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ocapa);
+
+       if (IS_ERR(capa)) {
+               /* set error code */
+               rc = PTR_ERR(capa);
+               spin_lock(&capa_lock);
+               if (rc == -ENOENT) {
+                       DEBUG_CAPA(D_SEC, &ocapa->c_capa,
+                                  "renewal canceled because object removed");
+                       ll_capa_renewal_noent++;
+               } else {
+                       ll_capa_renewal_failed++;
+
+                       /* a failed capa won't be renewed any longer; but on
+                        * -EIO the client might be doing recovery, so retry
+                        * in 2 min. */
+                       if (rc == -EIO && !capa_is_expired(ocapa)) {
+                               delay_capa_renew(ocapa, 120);
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renewal failed: -EIO, "
+                                          "retry in 2 mins");
+                               ll_capa_renewal_retries++;
+                               GOTO(retry, rc);
+                       } else {
+                               DEBUG_CAPA(D_ERROR, &ocapa->c_capa,
+                                          "renewal failed(rc: %d) for", rc);
+                       }
+               }
+
+               list_del_init(&ocapa->c_list);
+               sort_add_capa(ocapa, &ll_idle_capas);
+               spin_unlock(&capa_lock);
+
+               capa_put(ocapa);
+               iput(inode);
+               RETURN(rc);
+       }
+
+       spin_lock(&ocapa->c_lock);
+       LASSERT(!memcmp(&ocapa->c_capa, capa,
+                       offsetof(struct lustre_capa, lc_opc)));
+       ocapa->c_capa = *capa;
+       set_capa_expiry(ocapa);
+       spin_unlock(&ocapa->c_lock);
+
+       spin_lock(&capa_lock);
+       if (capa_for_oss(capa))
+               inode_add_oss_capa(inode, ocapa);
+       DEBUG_CAPA(D_SEC, capa, "renew");
+       EXIT;
+retry:
+       list_del_init(&ocapa->c_list);
+       sort_add_capa(ocapa, ll_capa_list);
+       update_capa_timer(ocapa, capa_renewal_time(ocapa));
+       spin_unlock(&capa_lock);
+
+       capa_put(ocapa);
+       iput(inode);
+       return rc;
+}
+
+void ll_capa_open(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+           == 0)
+               return;
+
+       if (!S_ISREG(inode->i_mode))
+               return;
+
+       atomic_inc(&lli->lli_open_count);
+}
+
+void ll_capa_close(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+
+       if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+           == 0)
+               return;
+
+       if (!S_ISREG(inode->i_mode))
+               return;
+
+       atomic_dec(&lli->lli_open_count);
+}
+
+/* delete CAPA_OPC_OSS_TRUNC only */
+void ll_truncate_free_capa(struct obd_capa *ocapa)
+{
+       if (!ocapa)
+               return;
+
+       LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC);
+       DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate");
+
+       /* release the ref taken on lookup */
+       capa_put(ocapa);
+       if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) {
+               spin_lock(&capa_lock);
+               ll_delete_capa(ocapa);
+               spin_unlock(&capa_lock);
+       }
+}
+
+void ll_clear_inode_capas(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct obd_capa *ocapa, *tmp;
+
+       spin_lock(&capa_lock);
+       ocapa = lli->lli_mds_capa;
+       if (ocapa)
+               ll_delete_capa(ocapa);
+
+       list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
+                                    u.cli.lli_list)
+               ll_delete_capa(ocapa);
+       spin_unlock(&capa_lock);
+}
+
+void ll_print_capa_stat(struct ll_sb_info *sbi)
+{
+       if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA))
+               LCONSOLE_INFO("Fid capabilities renewed: %llu\n"
+                             "Fid capabilities renewal ENOENT: %llu\n"
+                             "Fid capabilities failed to renew: %llu\n"
+                             "Fid capabilities renewal retries: %llu\n",
+                             ll_capa_renewed, ll_capa_renewal_noent,
+                             ll_capa_renewal_failed, ll_capa_renewal_retries);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c
new file mode 100644
index 0000000..00b2b38
--- /dev/null
@@ -0,0 +1,412 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_close.c
+ *
+ * Lustre Lite routines to issue a secondary close after writeback
+ */
+
+#include <linux/module.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+/** records that a write is in flight */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
+{
+       struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+
+       ENTRY;
+       spin_lock(&lli->lli_lock);
+       lli->lli_flags |= LLIF_SOM_DIRTY;
+       if (page != NULL && list_empty(&page->cpg_pending_linkage))
+               list_add(&page->cpg_pending_linkage,
+                            &club->cob_pending_list);
+       spin_unlock(&lli->lli_lock);
+       EXIT;
+}
+
+/** records that a write has completed */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
+{
+       struct ll_inode_info *lli = ll_i2info(club->cob_inode);
+       int rc = 0;
+
+       ENTRY;
+       spin_lock(&lli->lli_lock);
+       if (page != NULL && !list_empty(&page->cpg_pending_linkage)) {
+               list_del_init(&page->cpg_pending_linkage);
+               rc = 1;
+       }
+       spin_unlock(&lli->lli_lock);
+       if (rc)
+               ll_queue_done_writing(club->cob_inode, 0);
+       EXIT;
+}
+
+/** Queues DONE_WRITING if
+ * - done writing is allowed;
+ * - inode has no dirty pages; */
+void ll_queue_done_writing(struct inode *inode, unsigned long flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       lli->lli_flags |= flags;
+
+       if ((lli->lli_flags & LLIF_DONE_WRITING) &&
+           list_empty(&club->cob_pending_list)) {
+               struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;
+
+               if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+                       CWARN("ino %lu/%u(flags %u) som valid it just after "
+                             "recovery\n",
+                             inode->i_ino, inode->i_generation,
+                             lli->lli_flags);
+               /* DONE_WRITING is allowed and the inode has no dirty pages. */
+               spin_lock(&lcq->lcq_lock);
+
+               LASSERT(list_empty(&lli->lli_close_list));
+               CDEBUG(D_INODE, "adding inode %lu/%u to close list\n",
+                      inode->i_ino, inode->i_generation);
+               list_add_tail(&lli->lli_close_list, &lcq->lcq_head);
+
+               /* Avoid a concurrent insertion into the close thread queue:
+                * an inode is already in the close thread; open(), write()
+                * and close() happen; the epoch is closed as the inode is
+                * marked LLIF_EPOCH_PENDING. When the pages are written, the
+                * inode should not be inserted into the queue again, so
+                * clear this flag to avoid it. */
+               lli->lli_flags &= ~LLIF_DONE_WRITING;
+
+               wake_up(&lcq->lcq_waitq);
+               spin_unlock(&lcq->lcq_lock);
+       }
+       spin_unlock(&lli->lli_lock);
+       EXIT;
+}
+
+/** Pack SOM attributes into @op_data for the CLOSE or DONE_WRITING rpc. */
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       op_data->op_flags |= MF_SOM_CHANGE;
+       /* Check if Size-on-MDS attributes are valid. */
+       if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+               CERROR("ino %lu/%u(flags %u) som valid it just after "
+                      "recovery\n", inode->i_ino, inode->i_generation,
+                      lli->lli_flags);
+
+       if (!cl_local_size(inode)) {
+               /* Send Size-on-MDS Attributes if valid. */
+               op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
+                               ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS;
+       }
+       EXIT;
+}
+
+/** Closes the ioepoch and packs the Size-on-MDS attributes into @op_data if needed. */
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+                     struct obd_client_handle **och, unsigned long flags)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
+       ENTRY;
+
+       spin_lock(&lli->lli_lock);
+       if (!(list_empty(&club->cob_pending_list))) {
+               if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
+                       LASSERT(*och != NULL);
+                       LASSERT(lli->lli_pending_och == NULL);
+                       /* Inode is dirty and there is no pending done-write
+                        * request yet; DONE_WRITE will be sent later. */
+                       lli->lli_flags |= LLIF_EPOCH_PENDING;
+                       lli->lli_pending_och = *och;
+                       spin_unlock(&lli->lli_lock);
+
+                       inode = igrab(inode);
+                       LASSERT(inode);
+                       GOTO(out, 0);
+               }
+               if (flags & LLIF_DONE_WRITING) {
+                       /* Some pages are still dirty; it is too early to send
+                        * DONE_WRITE. Wait until all pages have been flushed
+                        * and try DONE_WRITE again later. */
+                       LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+                       lli->lli_flags |= LLIF_DONE_WRITING;
+                       spin_unlock(&lli->lli_lock);
+
+                       inode = igrab(inode);
+                       LASSERT(inode);
+                       GOTO(out, 0);
+               }
+       }
+       CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n",
+              ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid));
+       op_data->op_flags |= MF_EPOCH_CLOSE;
+
+       if (flags & LLIF_DONE_WRITING) {
+               LASSERT(lli->lli_flags & LLIF_SOM_DIRTY);
+               LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING));
+               *och = lli->lli_pending_och;
+               lli->lli_pending_och = NULL;
+               lli->lli_flags &= ~LLIF_EPOCH_PENDING;
+       } else {
+               /* Pack Size-on-MDS inode attributes only if they have changed */
+               if (!(lli->lli_flags & LLIF_SOM_DIRTY)) {
+                       spin_unlock(&lli->lli_lock);
+                       GOTO(out, 0);
+               }
+
+               /* There is a pending DONE_WRITE -- close epoch with no
+                * attribute change. */
+               if (lli->lli_flags & LLIF_EPOCH_PENDING) {
+                       spin_unlock(&lli->lli_lock);
+                       GOTO(out, 0);
+               }
+       }
+
+       LASSERT(list_empty(&club->cob_pending_list));
+       lli->lli_flags &= ~LLIF_SOM_DIRTY;
+       spin_unlock(&lli->lli_lock);
+       ll_done_writing_attr(inode, op_data);
+
+       EXIT;
+out:
+       return;
+}
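+
+/* Summary of the transitions above: if dirty pages remain and no DONE_WRITE
+ * is pending, *och is stashed in lli_pending_och and LLIF_EPOCH_PENDING is
+ * set; if called from the DONE_WRITING path while pages are still dirty,
+ * LLIF_DONE_WRITING is set and the close is retried later; only when
+ * cob_pending_list is empty does the epoch really close (MF_EPOCH_CLOSE)
+ * and LLIF_SOM_DIRTY get cleared. */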
+
+/**
+ * The client updates SOM attributes on the MDS (including llog cookies):
+ * obd_getattr with no lock and md_setattr.
+ */
+int ll_som_update(struct inode *inode, struct md_op_data *op_data)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ptlrpc_request *request = NULL;
+       __u32 old_flags;
+       struct obdo *oa;
+       int rc;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+               CERROR("ino %lu/%u(flags %u) som valid it just after "
+                      "recovery\n", inode->i_ino, inode->i_generation,
+                      lli->lli_flags);
+
+       OBDO_ALLOC(oa);
+       if (!oa) {
+               CERROR("can't allocate memory for Size-on-MDS update.\n");
+               RETURN(-ENOMEM);
+       }
+
+       old_flags = op_data->op_flags;
+       op_data->op_flags = MF_SOM_CHANGE;
+
+       /* If inode is already in another epoch, skip getattr from OSTs. */
+       if (lli->lli_ioepoch == op_data->op_ioepoch) {
+               rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch,
+                                     old_flags & MF_GETATTR_LOCK);
+               if (rc) {
+                       oa->o_valid = 0;
+                       if (rc != -ENOENT)
+                               CERROR("inode_getattr failed (%d): unable to "
+                                      "send a Size-on-MDS attribute update "
+                                      "for inode %lu/%u\n", rc, inode->i_ino,
+                                      inode->i_generation);
+               } else {
+                       CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n",
+                              PFID(&lli->lli_fid));
+               }
+               /* Install attributes into op_data. */
+               md_from_obdo(op_data, oa, oa->o_valid);
+       }
+
+       rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data,
+                       NULL, 0, NULL, 0, &request, NULL);
+       ptlrpc_req_finished(request);
+
+       OBDO_FREE(oa);
+       RETURN(rc);
+}
+
+/**
+ * Closes the ioepoch and packs all the attributes into @op_data for
+ * DONE_WRITING rpc.
+ */
+static void ll_prepare_done_writing(struct inode *inode,
+                                   struct md_op_data *op_data,
+                                   struct obd_client_handle **och)
+{
+       ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING);
+       /* If there is no @och, we do not send DONE_WRITING yet. */
+       if (*och == NULL)
+               return;
+
+       ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh);
+       ll_prep_md_op_data(op_data, inode, NULL, NULL,
+                          0, 0, LUSTRE_OPC_ANY, NULL);
+}
+
+/** Send a DONE_WRITING rpc. */
+static void ll_done_writing(struct inode *inode)
+{
+       struct obd_client_handle *och = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       LASSERT(exp_connect_som(ll_i2mdexp(inode)));
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL) {
+               CERROR("can't allocate op_data\n");
+               EXIT;
+               return;
+       }
+
+       ll_prepare_done_writing(inode, op_data, &och);
+       /* If there is no @och, we do not send DONE_WRITING yet. */
+       if (och == NULL)
+               GOTO(out, 0);
+
+       rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL);
+       if (rc == -EAGAIN) {
+               /* MDS has instructed us to obtain the Size-on-MDS attributes
+                * from the OSTs and send a setattr back to the MDS. */
+               rc = ll_som_update(inode, op_data);
+       } else if (rc) {
+               CERROR("inode %lu mdc done_writing failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+out:
+       ll_finish_md_op_data(op_data);
+       if (och) {
+               md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och);
+               OBD_FREE_PTR(och);
+       }
+       EXIT;
+}
+
+static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq)
+{
+       struct ll_inode_info *lli = NULL;
+
+       spin_lock(&lcq->lcq_lock);
+
+       if (!list_empty(&lcq->lcq_head)) {
+               lli = list_entry(lcq->lcq_head.next, struct ll_inode_info,
+                                    lli_close_list);
+               list_del_init(&lli->lli_close_list);
+       } else if (atomic_read(&lcq->lcq_stop))
+               lli = ERR_PTR(-EALREADY);
+
+       spin_unlock(&lcq->lcq_lock);
+       return lli;
+}
+
+static int ll_close_thread(void *arg)
+{
+       struct ll_close_queue *lcq = arg;
+       ENTRY;
+
+       complete(&lcq->lcq_comp);
+
+       while (1) {
+               struct l_wait_info lwi = { 0 };
+               struct ll_inode_info *lli;
+               struct inode *inode;
+
+               l_wait_event_exclusive(lcq->lcq_waitq,
+                                      (lli = ll_close_next_lli(lcq)) != NULL,
+                                      &lwi);
+               if (IS_ERR(lli))
+                       break;
+
+               inode = ll_info2i(lli);
+               CDEBUG(D_INFO, "done_writting for inode %lu/%u\n",
+                      inode->i_ino, inode->i_generation);
+               ll_done_writing(inode);
+               iput(inode);
+       }
+
+       CDEBUG(D_INFO, "ll_close exiting\n");
+       complete(&lcq->lcq_comp);
+       RETURN(0);
+}
+
+int ll_close_thread_start(struct ll_close_queue **lcq_ret)
+{
+       struct ll_close_queue *lcq;
+       task_t *task;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD))
+               return -EINTR;
+
+       OBD_ALLOC(lcq, sizeof(*lcq));
+       if (lcq == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&lcq->lcq_lock);
+       INIT_LIST_HEAD(&lcq->lcq_head);
+       init_waitqueue_head(&lcq->lcq_waitq);
+       init_completion(&lcq->lcq_comp);
+
+       task = kthread_run(ll_close_thread, lcq, "ll_close");
+       if (IS_ERR(task)) {
+               OBD_FREE(lcq, sizeof(*lcq));
+               return PTR_ERR(task);
+       }
+
+       wait_for_completion(&lcq->lcq_comp);
+       *lcq_ret = lcq;
+       return 0;
+}
+
+void ll_close_thread_shutdown(struct ll_close_queue *lcq)
+{
+       init_completion(&lcq->lcq_comp);
+       atomic_inc(&lcq->lcq_stop);
+       wake_up(&lcq->lcq_waitq);
+       wait_for_completion(&lcq->lcq_comp);
+       OBD_FREE(lcq, sizeof(*lcq));
+}
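+
+/*
+ * Usage sketch (assumed, based on the ll_lcq reference above): the close
+ * queue is created at mount time and torn down at umount, roughly:
+ *
+ *     struct ll_close_queue *lcq;
+ *
+ *     rc = ll_close_thread_start(&lcq);
+ *     if (rc == 0) {
+ *             sbi->ll_lcq = lcq;      (consumed by ll_queue_done_writing())
+ *             ...
+ *             ll_close_thread_shutdown(lcq);
+ *     }
+ */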
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
new file mode 100644
index 0000000..992cd20
--- /dev/null
@@ -0,0 +1,1576 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LLITE_INTERNAL_H
+#define LLITE_INTERNAL_H
+#include <lustre_debug.h>
+#include <lustre_ver.h>
+#include <lustre_disk.h>  /* for s2sbi */
+#include <lustre_eacl.h>
+
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+#include <lustre_mdc.h>
+#include <linux/lustre_intent.h>
+
+#ifndef FMODE_EXEC
+#define FMODE_EXEC 0
+#endif
+
+#ifndef VM_FAULT_RETRY
+#define VM_FAULT_RETRY 0
+#endif
+
+/* Kernel 3.1 kills LOOKUP_CONTINUE; LOOKUP_PARENT is equivalent to it.
+ * See kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */
+#ifndef LOOKUP_CONTINUE
+#define LOOKUP_CONTINUE LOOKUP_PARENT
+#endif
+
+/** Only used on client-side for indicating the tail of dir hash/offset. */
+#define LL_DIR_END_OFF   0x7fffffffffffffffULL
+#define LL_DIR_END_OFF_32BIT    0x7fffffffUL
+
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+#define LUSTRE_FPRIVATE(file) ((file)->private_data)
+
+struct ll_dentry_data {
+       int                             lld_cwd_count;
+       int                             lld_mnt_count;
+       struct obd_client_handle        lld_cwd_och;
+       struct obd_client_handle        lld_mnt_och;
+       struct lookup_intent            *lld_it;
+       unsigned int                    lld_sa_generation;
+       unsigned int                    lld_invalid:1;
+       struct rcu_head                 lld_rcu_head;
+};
+
+#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata))
+
+extern struct file_operations ll_pgcache_seq_fops;
+
+#define LLI_INODE_MAGIC                 0x111d0de5
+#define LLI_INODE_DEAD           0xdeadd00d
+
+/* remote client permission cache */
+#define REMOTE_PERM_HASHSIZE 16
+
+struct ll_getname_data {
+       char        *lgd_name;      /* points to a buffer of size NAME_MAX+1 */
+       struct lu_fid    lgd_fid;       /* target fid we are looking for */
+       int           lgd_found;     /* inode matched? */
+};
+
+/* llite setxid/access permission for user on remote client */
+struct ll_remote_perm {
+       struct hlist_node       lrp_list;
+       uid_t              lrp_uid;
+       gid_t              lrp_gid;
+       uid_t              lrp_fsuid;
+       gid_t              lrp_fsgid;
+       int                  lrp_access_perm; /* MAY_READ/WRITE/EXEC, this
+                                                   is access permission with
+                                                   lrp_fsuid/lrp_fsgid. */
+};
+
+enum lli_flags {
+       /* MDS has authority over the Size-on-MDS attributes. */
+       LLIF_MDS_SIZE_LOCK      = (1 << 0),
+       /* Epoch close is postponed. */
+       LLIF_EPOCH_PENDING      = (1 << 1),
+       /* DONE WRITING is allowed. */
+       LLIF_DONE_WRITING       = (1 << 2),
+       /* Size-on-MDS attributes have changed. An attribute update needs to
+        * be sent to the MDS. */
+       LLIF_SOM_DIRTY    = (1 << 3),
+       /* File is contended */
+       LLIF_CONTENDED    = (1 << 4),
+       /* Truncate uses server lock for this file */
+       LLIF_SRVLOCK        = (1 << 5),
+       /* File data is modified. */
+       LLIF_DATA_MODIFIED      = (1 << 6),
+};
+
+struct ll_inode_info {
+       __u32                           lli_inode_magic;
+       __u32                           lli_flags;
+       __u64                           lli_ioepoch;
+
+       spinlock_t                      lli_lock;
+       struct posix_acl                *lli_posix_acl;
+
+       struct hlist_head               *lli_remote_perms;
+       struct mutex                            lli_rmtperm_mutex;
+
+       /* identifying fields for both metadata and data stacks. */
+       struct lu_fid              lli_fid;
+       /* Parent fid for accessing default stripe data on parent directory
+        * for allocating OST objects after a mknod() and later open-by-FID. */
+       struct lu_fid              lli_pfid;
+
+       struct list_head                      lli_close_list;
+       struct list_head                      lli_oss_capas;
+       /* open count, currently used only by capabilities; indicates
+        * whether the capability needs renewal */
+       atomic_t                    lli_open_count;
+       struct obd_capa         *lli_mds_capa;
+       cfs_time_t                    lli_rmtperm_time;
+
+       /* handle is to be sent to MDS later on done_writing and setattr.
+        * Open handle data are needed for the recovery to reconstruct
+        * the inode state on the MDS. XXX: recovery is not ready yet. */
+       struct obd_client_handle       *lli_pending_och;
+
+       /* We need all three because every inode may be opened in different
+        * modes */
+       struct obd_client_handle       *lli_mds_read_och;
+       struct obd_client_handle       *lli_mds_write_och;
+       struct obd_client_handle       *lli_mds_exec_och;
+       __u64                      lli_open_fd_read_count;
+       __u64                      lli_open_fd_write_count;
+       __u64                      lli_open_fd_exec_count;
+       /* Protects access to och pointers and their usage counters */
+       struct mutex                    lli_och_mutex;
+
+       struct inode                    lli_vfs_inode;
+
+       /* the most recent timestamps obtained from mds */
+       struct ost_lvb                  lli_lvb;
+       spinlock_t                      lli_agl_lock;
+
+       /* Try to keep the d::member and f::member fields aligned. Before
+        * using these members, make sure whether the inode is a directory
+        * or not. */
+       union {
+               /* for directory */
+               struct {
+                       /* serialize normal readdir and statahead-readdir. */
+                       struct mutex                    d_readdir_mutex;
+
+                       /* metadata statahead */
+                       /* since parent and child threads can share the same
+                        * @file struct, "opendir_key" is the token used at
+                        * dir close to decide, when the parent exits before
+                        * the child, who should clean up the dir readahead. */
+                       void                       *d_opendir_key;
+                       struct ll_statahead_info       *d_sai;
+                       struct posix_acl               *d_def_acl;
+                       /* protect statahead stuff. */
+                       spinlock_t                      d_sa_lock;
+                       /* "opendir_pid" is the token when lookup/revalid
+                        * -- I am the owner of dir statahead. */
+                       pid_t                      d_opendir_pid;
+               } d;
+
+#define lli_readdir_mutex       u.d.d_readdir_mutex
+#define lli_opendir_key         u.d.d_opendir_key
+#define lli_sai                 u.d.d_sai
+#define lli_def_acl         u.d.d_def_acl
+#define lli_sa_lock         u.d.d_sa_lock
+#define lli_opendir_pid         u.d.d_opendir_pid
+
+               /* for non-directory */
+               struct {
+                       struct semaphore                f_size_sem;
+                       void                            *f_size_sem_owner;
+                       char                            *f_symlink_name;
+                       __u64                           f_maxbytes;
+                       /*
+                        * struct rw_semaphore {
+                        *    signed long       count;     // align d.d_def_acl
+                        *    spinlock_t        wait_lock; // align d.d_sa_lock
+                        *    struct list_head wait_list;
+                        * }
+                        */
+                       struct rw_semaphore             f_trunc_sem;
+                       struct mutex                    f_write_mutex;
+
+                       struct rw_semaphore             f_glimpse_sem;
+                       cfs_time_t                      f_glimpse_time;
+                       struct list_head                        f_agl_list;
+                       __u64                           f_agl_index;
+
+                       /* for writepage() only to communicate to fsync */
+                       int                             f_async_rc;
+
+                       /* the volatile-file check is based on the file name;
+                        * this flag caches the test result so the strcmp
+                        * is done only once
+                        */
+                       bool                            f_volatile;
+                       /*
+                        * whenever a process tries to read/write the file,
+                        * its jobid is saved here, and it will be packed
+                        * into the write RPC when flushed later.
+                        *
+                        * so the per-jobid read/write statistics will not be
+                        * accurate if the file is shared by different jobs.
+                        */
+                       char                 f_jobid[JOBSTATS_JOBID_SIZE];
+               } f;
+
+#define lli_size_sem       u.f.f_size_sem
+#define lli_size_sem_owner      u.f.f_size_sem_owner
+#define lli_symlink_name       u.f.f_symlink_name
+#define lli_maxbytes       u.f.f_maxbytes
+#define lli_trunc_sem     u.f.f_trunc_sem
+#define lli_write_mutex         u.f.f_write_mutex
+#define lli_glimpse_sem                u.f.f_glimpse_sem
+#define lli_glimpse_time       u.f.f_glimpse_time
+#define lli_agl_list           u.f.f_agl_list
+#define lli_agl_index          u.f.f_agl_index
+#define lli_async_rc           u.f.f_async_rc
+#define lli_jobid              u.f.f_jobid
+#define lli_volatile           u.f.f_volatile
+
+       } u;
+
+       /* XXX: The following members are frequently used. Although some of
+        *      them are only meaningful for non-directory objects, it would
+        *      waste time to check whether the object is a directory before
+        *      using them. On the other hand, since sizeof(f) > sizeof(d)
+        *      currently, moving those members into u.f would not reduce the
+        *      size of "ll_inode_info". So keep them outside the union.
+        *
+        *      In the future, if more directory-only members are added, some
+        *      of the following members can be moved into u.f.
+        */
+       bool                        lli_has_smd;
+       struct cl_object               *lli_clob;
+
+       /* mutex to request for layout lock exclusively. */
+       struct mutex                    lli_layout_mutex;
+       /* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */
+       __u32                           lli_layout_gen;
+};
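+
+/*
+ * Usage note (illustrative): the u.d/u.f members above are reached through
+ * the #define aliases, so callers must know the inode type first, e.g.:
+ *
+ *     struct ll_inode_info *lli = ll_i2info(inode);
+ *
+ *     if (S_ISDIR(inode->i_mode))
+ *             mutex_lock(&lli->lli_readdir_mutex);    (u.d.d_readdir_mutex)
+ *     else
+ *             mutex_lock(&lli->lli_write_mutex);      (u.f.f_write_mutex)
+ */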
+
+/*
+ * Locking to guarantee consistency of non-atomic updates to long long i_size,
+ * and consistency between the file size and the KMS.
+ *
+ * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order.
+ */
+
+void ll_inode_size_lock(struct inode *inode);
+void ll_inode_size_unlock(struct inode *inode);
+
+// FIXME: replace the name of this with LL_I to conform to kernel stuff
+// static inline struct ll_inode_info *LL_I(struct inode *inode)
+static inline struct ll_inode_info *ll_i2info(struct inode *inode)
+{
+       return container_of(inode, struct ll_inode_info, lli_vfs_inode);
+}
+
+/* default to about 40MB of readahead on a given system. That much tied
+ * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
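+
+/*
+ * Editor's note (worked example, assuming 4KB pages, i.e.
+ * PAGE_CACHE_SHIFT == 12): SBI_DEFAULT_READAHEAD_MAX is
+ * 40UL << (20 - 12) = 40 * 256 = 10240 pages = 40MB, and
+ * SBI_DEFAULT_READAHEAD_WHOLE_MAX is 2UL << 8 = 512 pages = 2MB.
+ * Both macros count pages, which is why the shift subtracts
+ * PAGE_CACHE_SHIFT from 20 (log2 of 1MB).
+ */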
+
+enum ra_stat {
+       RA_STAT_HIT = 0,
+       RA_STAT_MISS,
+       RA_STAT_DISTANT_READPAGE,
+       RA_STAT_MISS_IN_WINDOW,
+       RA_STAT_FAILED_GRAB_PAGE,
+       RA_STAT_FAILED_MATCH,
+       RA_STAT_DISCARDED,
+       RA_STAT_ZERO_LEN,
+       RA_STAT_ZERO_WINDOW,
+       RA_STAT_EOF,
+       RA_STAT_MAX_IN_FLIGHT,
+       RA_STAT_WRONG_GRAB_PAGE,
+       _NR_RA_STAT,
+};
+
+struct ll_ra_info {
+       atomic_t              ra_cur_pages;
+       unsigned long        ra_max_pages;
+       unsigned long        ra_max_pages_per_file;
+       unsigned long        ra_max_read_ahead_whole_pages;
+};
+
+/* ra_io_arg is filled at the beginning of ll_readahead, under ras_lock;
+ * the following ll_read_ahead_pages then reads RA pages according to
+ * this arg. All the items in this structure are counted by page index.
+ */
+struct ra_io_arg {
+       unsigned long ria_start;  /* start offset of read-ahead */
+       unsigned long ria_end;    /* end offset of read-ahead */
+       /* If a stride read pattern is detected, ria_stoff is where the
+        * stride read started. Note: for normal read-ahead, this value
+        * is meaningless and is never accessed. */
+       pgoff_t ria_stoff;
+       /* ria_length and ria_pages are the stride length and the number of
+        * data pages per stride in stride I/O mode; they are also used to
+        * check whether the read-ahead of a page is stride I/O read-ahead. */
+       unsigned long ria_length;
+       unsigned long ria_pages;
+};
+
+/* LL_HIST_MAX=32 causes an overflow */
+#define LL_HIST_MAX 28
+#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
+#define LL_PROCESS_HIST_MAX 10
+struct per_process_info {
+       pid_t pid;
+       struct obd_histogram pp_r_hist;
+       struct obd_histogram pp_w_hist;
+};
+
+/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */
+struct ll_rw_extents_info {
+       struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1];
+};
+
+#define LL_OFFSET_HIST_MAX 100
+struct ll_rw_process_info {
+       pid_t                rw_pid;
+       int                    rw_op;
+       loff_t              rw_range_start;
+       loff_t              rw_range_end;
+       loff_t              rw_last_file_pos;
+       loff_t              rw_offset;
+       size_t              rw_smallest_extent;
+       size_t              rw_largest_extent;
+       struct ll_file_data      *rw_last_file;
+};
+
+enum stats_track_type {
+       STATS_TRACK_ALL = 0,  /* track all processes */
+       STATS_TRACK_PID,      /* track process with this pid */
+       STATS_TRACK_PPID,     /* track processes with this ppid */
+       STATS_TRACK_GID,      /* track processes with this gid */
+       STATS_TRACK_LAST,
+};
+
+/* flags for sbi->ll_flags */
+#define LL_SBI_NOLCK        0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM          0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK        0x04
+#define LL_SBI_USER_XATTR      0x08 /* support user xattr */
+#define LL_SBI_ACL            0x10 /* support ACL */
+#define LL_SBI_RMT_CLIENT      0x40 /* remote client */
+#define LL_SBI_MDS_CAPA          0x80 /* support mds capa */
+#define LL_SBI_OSS_CAPA         0x100 /* support oss capa */
+#define LL_SBI_LOCALFLOCK       0x200 /* Local flocks support by kernel */
+#define LL_SBI_LRU_RESIZE       0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS       0x800 /* lazystatfs mount option */
+#define LL_SBI_SOM_PREVIEW     0x1000 /* SOM preview mount option */
+#define LL_SBI_32BIT_API       0x2000 /* generate 32 bit inodes. */
+#define LL_SBI_64BIT_HASH      0x4000 /* support 64-bits dir hash/offset */
+#define LL_SBI_AGL_ENABLED     0x8000 /* enable agl */
+#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */
+#define LL_SBI_LAYOUT_LOCK    0x20000 /* layout lock support */
+#define LL_SBI_USER_FID2PATH  0x40000 /* allow fid2path by unprivileged users */
+
+#define LL_SBI_FLAGS { \
+       "nolck",        \
+       "checksum",     \
+       "flock",        \
+       "xattr",        \
+       "acl",          \
+       "???",          \
+       "rmt_client",   \
+       "mds_capa",     \
+       "oss_capa",     \
+       "flock",        \
+       "lru_resize",   \
+       "lazy_statfs",  \
+       "som",          \
+       "32bit_api",    \
+       "64bit_hash",   \
+       "agl",          \
+       "verbose",      \
+       "layout",       \
+       "user_fid2path" }
+
+/* default value for ll_sb_info->contention_time */
+#define SBI_DEFAULT_CONTENTION_SECONDS     60
+/* default value for lockless_truncate_enable */
+#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1
+#define RCE_HASHES      32
+
+struct rmtacl_ctl_entry {
+       struct list_head       rce_list;
+       pid_t       rce_key; /* hash key */
+       int           rce_ops; /* acl operation type */
+};
+
+struct rmtacl_ctl_table {
+       spinlock_t      rct_lock;
+       struct list_head        rct_entries[RCE_HASHES];
+};
+
+#define EE_HASHES       32
+
+struct eacl_entry {
+       struct list_head            ee_list;
+       pid_t            ee_key; /* hash key */
+       struct lu_fid    ee_fid;
+       int                ee_type; /* ACL type for ACCESS or DEFAULT */
+       ext_acl_xattr_header *ee_acl;
+};
+
+struct eacl_table {
+       spinlock_t      et_lock;
+       struct list_head        et_entries[EE_HASHES];
+};
+
+struct ll_sb_info {
+       struct list_head                  ll_list;
+       /* this protects pglist and ra_info.  It isn't safe to
+        * grab from interrupt contexts */
+       spinlock_t                ll_lock;
+       spinlock_t                ll_pp_extent_lock; /* pp_extent entry*/
+       spinlock_t                ll_process_lock; /* ll_rw_process_info */
+       struct obd_uuid    ll_sb_uuid;
+       struct obd_export       *ll_md_exp;
+       struct obd_export       *ll_dt_exp;
+       struct proc_dir_entry*    ll_proc_root;
+       struct lu_fid        ll_root_fid; /* root object fid */
+
+       int                    ll_flags;
+       int                       ll_umounting:1;
+       struct list_head                ll_conn_chain; /* per-conn chain of SBs */
+       struct lustre_client_ocd  ll_lco;
+
+       struct list_head                ll_orphan_dentry_list; /*please don't ask -p*/
+       struct ll_close_queue    *ll_lcq;
+
+       struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
+
+       struct cl_client_cache    ll_cache;
+
+       struct lprocfs_stats     *ll_ra_stats;
+
+       struct ll_ra_info        ll_ra_info;
+       unsigned int          ll_namelen;
+       struct file_operations   *ll_fop;
+
+       /* =0 - hold lock over whole read/write
+        * >0 - max. chunk to be read/written w/o lock re-acquiring */
+       unsigned long        ll_max_rw_chunk;
+       unsigned int          ll_md_brw_size; /* used by readdir */
+
+       struct lu_site     *ll_site;
+       struct cl_device         *ll_cl;
+       /* Statistics */
+       struct ll_rw_extents_info ll_rw_extents_info;
+       int                    ll_extent_process_count;
+       struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX];
+       unsigned int          ll_offset_process_count;
+       struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX];
+       unsigned int          ll_rw_offset_entry_count;
+       int                    ll_stats_track_id;
+       enum stats_track_type     ll_stats_track_type;
+       int                    ll_rw_stats_on;
+
+       /* metadata stat-ahead */
+       unsigned int          ll_sa_max;     /* max statahead RPCs */
+       atomic_t                  ll_sa_total;   /* statahead thread started
+                                                 * count */
+       atomic_t                  ll_sa_wrong;   /* statahead thread stopped for
+                                                 * low hit ratio */
+       atomic_t                  ll_agl_total;  /* AGL thread started count */
+
+       dev_t                ll_sdev_orig; /* save s_dev before assign for
+                                                * clustered NFS */
+       struct rmtacl_ctl_table   ll_rct;
+       struct eacl_table        ll_et;
+};
+
+#define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
+
+struct ll_ra_read {
+       pgoff_t      lrr_start;
+       pgoff_t      lrr_count;
+       struct task_struct *lrr_reader;
+       struct list_head          lrr_linkage;
+};
+
+/*
+ * per file-descriptor read-ahead data.
+ */
+struct ll_readahead_state {
+       spinlock_t  ras_lock;
+       /*
+        * index of the last page that read(2) needed and that wasn't in the
+        * cache. Used by ras_update() to detect seeks.
+        *
+        * XXX nikita: if access seeks into cached region, Lustre doesn't see
+        * this.
+        */
+       unsigned long   ras_last_readpage;
+       /*
+        * number of pages read after the last read-ahead window reset. As
+        * the window is reset on each seek, this is effectively the number
+        * of consecutive accesses. Maybe ->ras_accessed_in_window is a
+        * better name.
+        *
+        * XXX nikita: window is also reset (by ras_update()) when Lustre
+        * believes that memory pressure evicts read-ahead pages. In that
+        * case, it probably doesn't make sense to expand window to
+        * PTLRPC_MAX_BRW_PAGES on the third access.
+        */
+       unsigned long   ras_consecutive_pages;
+       /*
+        * number of read requests after the last read-ahead window reset.
+        * As the window is reset on each seek, this is effectively the
+        * number of consecutive read requests and is used to trigger
+        * read-ahead.
+        */
+       unsigned long   ras_consecutive_requests;
+       /*
+        * Parameters of current read-ahead window. Handled by
+        * ras_update(). On the initial access to the file or after a seek,
+        * window is reset to 0. After 3 consecutive accesses, window is
+        * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by
+        * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
+        */
+       unsigned long   ras_window_start, ras_window_len;
+       /*
+        * Where the next read-ahead should start. This lies within the
+        * read-ahead window, which is read in pieces rather than at once
+        * because: 1. lustre limits the total number of pages under
+        * read-ahead by ->ra_max_pages (see ll_ra_count_get()); 2. the
+        * client cannot read pages not covered by a DLM lock.
+        */
+       unsigned long   ras_next_readahead;
+       /*
+        * Total number of ll_file_read requests issued; reads originating
+        * from mmap are not counted in this total.  This value is used to
+        * trigger full file read-ahead after multiple reads to a small file.
+        */
+       unsigned long   ras_requests;
+       /*
+        * Page index with respect to the current request; this value
+        * will not be accurate when dealing with reads issued via mmap.
+        */
+       unsigned long   ras_request_index;
+       /*
+        * list of struct ll_ra_read's, one per read(2) call currently in
+        * progress against this file descriptor. Used by read-ahead code,
+        * protected by ->ras_lock.
+        */
+       struct list_head      ras_read_beads;
+       /*
+        * The following 3 items are used for detecting the stride I/O
+        * mode.
+        * In stride I/O mode,
+        * ...............|-----data-----|****gap*****|--------|******|....
+        *    offset      |-stride_pages-|-stride_gap-|
+        * ras_stride_offset = offset;
+        * ras_stride_length = stride_pages + stride_gap;
+        * ras_stride_pages = stride_pages;
+        * Note: all these three items are counted by pages.
+        */
+       unsigned long   ras_stride_length;
+       unsigned long   ras_stride_pages;
+       pgoff_t  ras_stride_offset;
+       /*
+        * count of consecutive stride requests; similar to
+        * ras_consecutive_requests, but used for stride I/O mode.
+        * Note: stride read-ahead is enabled only after more than 2
+        * consecutive stride requests are detected.
+        */
+       unsigned long   ras_consecutive_stride_requests;
+};
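+
+/*
+ * Editor's note (worked example with made-up numbers): for a reader touching
+ * 4 pages of data followed by a 12-page gap, i.e.
+ *	|--4 data--|-----12 gap-----|--4 data--|...
+ * starting at page index 100, the fields above would end up as
+ *	ras_stride_offset = 100;
+ *	ras_stride_length = 4 + 12 = 16;  // data + gap, in pages
+ *	ras_stride_pages  = 4;
+ * and stride read-ahead is enabled once more than 2 consecutive stride
+ * requests have been seen (ras_consecutive_stride_requests > 2).
+ */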
+
+extern struct kmem_cache *ll_file_data_slab;
+struct lustre_handle;
+struct ll_file_data {
+       struct ll_readahead_state fd_ras;
+       int fd_omode;
+       struct ccc_grouplock fd_grouplock;
+       __u64 lfd_pos;
+       __u32 fd_flags;
+       struct file *fd_file;
+       /* Indicates whether a failure needs to be reported on close.
+        * true: failure is already known, do not report again.
+        * false: failure unknown, should be reported. */
+       bool fd_write_failed;
+};
+
+struct lov_stripe_md;
+
+extern spinlock_t inode_lock;
+
+extern struct proc_dir_entry *proc_lustre_fs_root;
+
+static inline struct inode *ll_info2i(struct ll_inode_info *lli)
+{
+       return &lli->lli_vfs_inode;
+}
+
+struct it_cb_data {
+       struct inode  *icbd_parent;
+       struct dentry **icbd_childp;
+       obd_id  hash;
+};
+
+__u32 ll_i2suppgid(struct inode *i);
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2);
+
+static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
+{
+#if BITS_PER_LONG == 32
+       return 1;
+#else
+       return unlikely(current_is_32bit() || (sbi->ll_flags & LL_SBI_32BIT_API));
+#endif
+}
+
+#define LLAP_MAGIC 98764321
+
+extern struct kmem_cache *ll_async_page_slab;
+extern size_t ll_async_page_slab_size;
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar);
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
+struct ll_ra_read *ll_ra_read_get(struct file *f);
+
+/* llite/lproc_llite.c */
+#ifdef LPROCFS
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                               struct super_block *sb, char *osc, char *mdc);
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi);
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                       struct super_block *sb, char *osc, char *mdc){return 0;}
+static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {}
+static inline void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {}
+static inline void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+
+/* llite/dir.c */
+void ll_release_page(struct page *page, int remove);
+extern struct file_operations ll_dir_operations;
+extern struct inode_operations ll_dir_inode_operations;
+struct page *ll_get_dir_page(struct inode *dir, __u64 hash,
+                            struct ll_dir_chain *chain);
+int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie,
+               filldir_t filldir);
+
+int ll_get_mdt_idx(struct inode *inode);
+/* llite/namei.c */
+int ll_objects_destroy(struct ptlrpc_request *request,
+                      struct inode *dir);
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+                     struct lustre_md *lic);
+int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+                      void *data, int flag);
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de);
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen);
+
+/* llite/rw.c */
+int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
+int ll_writepage(struct page *page, struct writeback_control *wbc);
+int ll_writepages(struct address_space *, struct writeback_control *wbc);
+void ll_removepage(struct page *page);
+int ll_readpage(struct file *file, struct page *page);
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
+int ll_file_punch(struct inode *, loff_t, int);
+ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
+void ll_clear_file_contended(struct inode*);
+int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+                struct ll_readahead_state *ras, struct address_space *mapping,
+                struct cl_page_list *queue, int flags);
+
+/* llite/file.c */
+extern struct file_operations ll_file_operations;
+extern struct file_operations ll_file_operations_flock;
+extern struct file_operations ll_file_operations_noflock;
+extern struct inode_operations ll_file_inode_operations;
+extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+                                 __u64);
+extern int ll_have_md_lock(struct inode *inode, __u64 *bits,
+                          ldlm_mode_t l_req_mode);
+extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
+                                  struct lustre_handle *lockh, __u64 flags);
+int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,
+                            __u64 bits);
+int ll_revalidate_nd(struct dentry *dentry, unsigned int flags);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+                    struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
+int ll_local_open(struct file *file,
+                 struct lookup_intent *it, struct ll_file_data *fd,
+                 struct obd_client_handle *och);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
+int ll_md_close(struct obd_export *md_exp, struct inode *inode,
+               struct file *file);
+int ll_md_real_close(struct inode *inode, int flags);
+void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data,
+                     struct obd_client_handle **och, unsigned long flags);
+void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data);
+int ll_som_update(struct inode *inode, struct md_op_data *op_data);
+int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
+                    __u64 ioepoch, int sync);
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+                 struct md_open_data **mod);
+void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
+                         struct lustre_handle *fh);
+extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                             struct ll_file_data *file, loff_t pos,
+                             size_t count, int rw);
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+              struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
+struct ll_file_data *ll_file_data_get(void);
+struct posix_acl *ll_get_acl(struct inode *inode, int type);
+
+int ll_inode_permission(struct inode *inode, int mask);
+
+int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
+                            int flags, struct lov_user_md *lum,
+                            int lum_size);
+int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
+                            struct lov_mds_md **lmm, int *lmm_size,
+                            struct ptlrpc_request **request);
+int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
+                    int set_default);
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
+                    int *lmm_size, struct ptlrpc_request **request);
+int ll_fsync(struct file *file, loff_t start, loff_t end, int data);
+int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+             int num_bytes);
+int ll_merge_lvb(const struct lu_env *env, struct inode *inode);
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_fid2path(struct inode *inode, void *arg);
+int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock);
+
+/* llite/dcache.c */
+
+int ll_dops_init(struct dentry *de, int block, int init_sa);
+extern struct dentry_operations ll_d_ops;
+void ll_intent_drop_lock(struct lookup_intent *);
+void ll_intent_release(struct lookup_intent *);
+void ll_invalidate_aliases(struct inode *);
+void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
+void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(const struct dentry *parent, const struct inode *pinode,
+               const struct dentry *dentry, const struct inode *inode,
+               unsigned int len, const char *str, const struct qstr *d_name);
+int ll_revalidate_it_finish(struct ptlrpc_request *request,
+                           struct lookup_intent *it, struct dentry *de);
+
+/* llite/llite_lib.c */
+extern struct super_operations lustre_super_operations;
+
+char *ll_read_opt(const char *opt, char *data);
+void ll_lli_init(struct ll_inode_info *lli);
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt);
+void ll_put_super(struct super_block *sb);
+void ll_kill_super(struct super_block *sb);
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock);
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock);
+void ll_clear_inode(struct inode *inode);
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr);
+int ll_setattr(struct dentry *de, struct iattr *attr);
+int ll_statfs(struct dentry *de, struct kstatfs *sfs);
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+                      __u64 max_age, __u32 flags);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
+void ll_read_inode2(struct inode *inode, void *opaque);
+void ll_delete_inode(struct inode *inode);
+int ll_iocontrol(struct inode *inode, struct file *file,
+                unsigned int cmd, unsigned long arg);
+int ll_flush_ctx(struct inode *inode);
+void ll_umount_begin(struct super_block *sb);
+int ll_remount_fs(struct super_block *sb, int *flags, char *data);
+int ll_show_options(struct seq_file *seq, struct dentry *dentry);
+void ll_dirty_page_discard_warn(struct page *page, int ioret);
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+                 struct super_block *, struct lookup_intent *);
+void lustre_dump_dentry(struct dentry *, int recur);
+void lustre_dump_inode(struct inode *);
+int ll_obd_statfs(struct inode *inode, void *arg);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
+int ll_process_config(struct lustre_cfg *lcfg);
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+                                     struct inode *i1, struct inode *i2,
+                                     const char *name, int namelen,
+                                     int mode, __u32 opc, void *data);
+void ll_finish_md_op_data(struct md_op_data *op_data);
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg);
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen);
+
+/* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
+__u32 get_uuid2int(const char *name, int len);
+struct inode *search_inode_for_lustre(struct super_block *sb,
+                                     const struct lu_fid *fid);
+
+/* llite/special.c */
+extern struct inode_operations ll_special_inode_operations;
+extern struct file_operations ll_special_chr_inode_fops;
+extern struct file_operations ll_special_chr_file_fops;
+extern struct file_operations ll_special_blk_inode_fops;
+extern struct file_operations ll_special_fifo_inode_fops;
+extern struct file_operations ll_special_fifo_file_fops;
+extern struct file_operations ll_special_sock_inode_fops;
+
+/* llite/symlink.c */
+extern struct inode_operations ll_fast_symlink_inode_operations;
+
+/* llite/llite_close.c */
+struct ll_close_queue {
+       spinlock_t              lcq_lock;
+       struct list_head                lcq_head;
+       wait_queue_head_t               lcq_waitq;
+       struct completion       lcq_comp;
+       atomic_t                lcq_stop;
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+/* a specific architecture can implement only part of this list */
+enum vvp_io_subtype {
+       /** normal IO */
+       IO_NORMAL,
+       /** io called from .sendfile */
+       IO_SENDFILE,
+       /** io started from splice_{read|write} */
+       IO_SPLICE
+};
+
+/* IO subtypes */
+struct vvp_io {
+       /** io subtype */
+       enum vvp_io_subtype    cui_io_subtype;
+
+       union {
+               struct {
+                       read_actor_t      cui_actor;
+                       void         *cui_target;
+               } sendfile;
+               struct {
+                       struct pipe_inode_info *cui_pipe;
+                       unsigned int        cui_flags;
+               } splice;
+               struct vvp_fault_io {
+                       /**
+                        * Inode modification time that is checked across DLM
+                        * lock request.
+                        */
+                       time_t           ft_mtime;
+                       struct vm_area_struct *ft_vma;
+                       /**
+                        *  locked page returned from vvp_io
+                        */
+                       struct page         *ft_vmpage;
+                       struct vm_fault_api {
+                               /**
+                                * kernel fault info
+                                */
+                               struct vm_fault *ft_vmf;
+                               /**
+                                * fault API used bitflags for return code.
+                                */
+                               unsigned int    ft_flags;
+                       } fault;
+               } fault;
+       } u;
+       /**
+        * Read-ahead state used by read and page-fault IO contexts.
+        */
+       struct ll_ra_read    cui_bead;
+       /**
+        * Set when cui_bead has been initialized.
+        */
+       int               cui_ra_window_set;
+       /**
+        * Partially truncated page, that vvp_io_trunc_start() keeps locked
+        * across truncate.
+        */
+       struct cl_page      *cui_partpage;
+};
+
+/**
+ * IO arguments for various VFS I/O interfaces.
+ */
+struct vvp_io_args {
+       /** normal/sendfile/splice */
+       enum vvp_io_subtype via_io_subtype;
+
+       union {
+               struct {
+                       struct kiocb      *via_iocb;
+                       struct iovec      *via_iov;
+                       unsigned long      via_nrsegs;
+               } normal;
+               struct {
+                       read_actor_t       via_actor;
+                       void          *via_target;
+               } sendfile;
+               struct {
+                       struct pipe_inode_info  *via_pipe;
+                       unsigned int       via_flags;
+               } splice;
+       } u;
+};
+
+struct ll_cl_context {
+       void       *lcc_cookie;
+       struct cl_io   *lcc_io;
+       struct cl_page *lcc_page;
+       struct lu_env  *lcc_env;
+       int          lcc_refcheck;
+       int          lcc_created;
+};
+
+struct vvp_thread_info {
+       struct ost_lvb       vti_lvb;
+       struct cl_2queue     vti_queue;
+       struct iovec     vti_local_iov;
+       struct vvp_io_args   vti_args;
+       struct ra_io_arg     vti_ria;
+       struct kiocb     vti_kiocb;
+       struct ll_cl_context vti_io_ctx;
+};
+
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+       extern struct lu_context_key vvp_key;
+       struct vvp_thread_info      *info;
+
+       info = lu_context_key_get(&env->le_ctx, &vvp_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env,
+                                              enum vvp_io_subtype type)
+{
+       struct vvp_io_args *ret = &vvp_env_info(env)->vti_args;
+
+       ret->via_io_subtype = type;
+
+       return ret;
+}
+
+struct vvp_session {
+       struct vvp_io    vs_ios;
+};
+
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+       extern struct lu_context_key vvp_session_key;
+       struct vvp_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &vvp_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+       return &vvp_env_session(env)->vs_ios;
+}
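+
+/*
+ * Editor's sketch (illustrative, not part of the original patch): the
+ * helpers above all follow the same lu_context_key pattern -- per-thread and
+ * per-session state lives in the lu_env and is fetched back with
+ * lu_context_key_get(). A typical caller would look roughly like this
+ * (error handling elided):
+ *
+ *	static void example_io_setup(const struct lu_env *env)
+ *	{
+ *		struct vvp_io      *vio  = vvp_env_io(env);
+ *		struct vvp_io_args *args = vvp_env_args(env, IO_NORMAL);
+ *
+ *		vio->cui_io_subtype = IO_NORMAL;
+ *		args->u.normal.via_iocb = NULL;  // filled from the kiocb
+ *	}
+ *
+ * Note that vvp_env_args() reuses a single per-thread vti_args slot, so a
+ * caller must consume the returned args before calling the helper again.
+ */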
+
+void ll_queue_done_writing(struct inode *inode, unsigned long flags);
+void ll_close_thread_shutdown(struct ll_close_queue *lcq);
+int ll_close_thread_start(struct ll_close_queue **lcq_ret);
+
+/* llite/llite_mmap.c */
+typedef struct rb_root  rb_root_t;
+typedef struct rb_node  rb_node_t;
+
+struct ll_lock_tree_node;
+struct ll_lock_tree {
+       rb_root_t                      lt_root;
+       struct list_head                      lt_locked_list;
+       struct ll_file_data         *lt_fd;
+};
+
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma);
+struct ll_lock_tree_node *ll_node_from_inode(struct inode *inode, __u64 start,
+                                             __u64 end, ldlm_mode_t mode);
+void policy_from_vma(ldlm_policy_data_t *policy,
+               struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+                              size_t count);
+
+static inline void ll_invalidate_page(struct page *vmpage)
+{
+       struct address_space *mapping = vmpage->mapping;
+       loff_t offset = vmpage->index << PAGE_CACHE_SHIFT;
+
+       LASSERT(PageLocked(vmpage));
+       if (mapping == NULL)
+               return;
+
+       ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+       truncate_complete_page(mapping, vmpage);
+}
+
+#define    ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2dtexp(struct super_block *sb)
+{
+       return ll_s2sbi(sb)->ll_dt_exp;
+}
+
+/* don't need an addref as the sb_info should be holding one */
+static inline struct obd_export *ll_s2mdexp(struct super_block *sb)
+{
+       return ll_s2sbi(sb)->ll_md_exp;
+}
+
+static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi)
+{
+       struct obd_device *obd = sbi->ll_md_exp->exp_obd;
+       if (obd == NULL)
+               LBUG();
+       return &obd->u.cli;
+}
+
+// FIXME: replace the name of this with LL_SB to conform to kernel stuff
+static inline struct ll_sb_info *ll_i2sbi(struct inode *inode)
+{
+       return ll_s2sbi(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2dtexp(struct inode *inode)
+{
+       return ll_s2dtexp(inode->i_sb);
+}
+
+static inline struct obd_export *ll_i2mdexp(struct inode *inode)
+{
+       return ll_s2mdexp(inode->i_sb);
+}
+
+static inline struct lu_fid *ll_inode2fid(struct inode *inode)
+{
+       struct lu_fid *fid;
+
+       LASSERT(inode != NULL);
+       fid = &ll_i2info(inode)->lli_fid;
+
+       return fid;
+}
+
+static inline int ll_mds_max_easize(struct super_block *sb)
+{
+       return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
+}
+
+static inline __u64 ll_file_maxbytes(struct inode *inode)
+{
+       return ll_i2info(inode)->lli_maxbytes;
+}
+
+/* llite/xattr.c */
+int ll_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+                   void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ll_removexattr(struct dentry *dentry, const char *name);
+
+/* llite/remote_perm.c */
+extern struct kmem_cache *ll_remote_perm_cachep;
+extern struct kmem_cache *ll_rmtperm_hash_cachep;
+
+struct hlist_head *alloc_rmtperm_hash(void);
+void free_rmtperm_hash(struct hlist_head *hash);
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm);
+int lustre_check_remote_perm(struct inode *inode, int mask);
+
+/* llite/llite_capa.c */
+extern timer_list_t ll_capa_timer;
+
+int ll_capa_thread_start(void);
+void ll_capa_thread_stop(void);
+void ll_capa_timer_callback(unsigned long unused);
+
+struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa);
+int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa);
+
+void ll_capa_open(struct inode *inode);
+void ll_capa_close(struct inode *inode);
+
+struct obd_capa *ll_mdscapa_get(struct inode *inode);
+struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc);
+
+void ll_truncate_free_capa(struct obd_capa *ocapa);
+void ll_clear_inode_capas(struct inode *inode);
+void ll_print_capa_stat(struct ll_sb_info *sbi);
+
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+               struct ll_readahead_state *ras, unsigned long index,
+               unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+int ll_is_file_contended(struct file *file);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
+/* llite/llite_rmtacl.c */
+#ifdef CONFIG_FS_POSIX_ACL
+obd_valid rce_ops2valid(int ops);
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key);
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops);
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key);
+void rct_init(struct rmtacl_ctl_table *rct);
+void rct_fini(struct rmtacl_ctl_table *rct);
+
+void ee_free(struct eacl_entry *ee);
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+          ext_acl_xattr_header *header);
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+                                struct lu_fid *fid, int type);
+void et_search_free(struct eacl_table *et, pid_t key);
+void et_init(struct eacl_table *et);
+void et_fini(struct eacl_table *et);
+#endif
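+
+/*
+ * Editor's sketch (illustrative, hypothetical sequence): the remote-ACL
+ * control table is a pid-keyed hash, and a typical round trip with the
+ * functions above would be:
+ *
+ *	rct_init(&sbi->ll_rct);
+ *	rc = rct_add(&sbi->ll_rct, current_pid(), ops);
+ *	rce = rct_search(&sbi->ll_rct, current_pid());
+ *	if (rce != NULL)
+ *		valid = rce_ops2valid(rce->rce_ops);
+ *	rct_del(&sbi->ll_rct, current_pid());
+ *	rct_fini(&sbi->ll_rct);
+ */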
+
+/* statahead.c */
+
+#define LL_SA_RPC_MIN     2
+#define LL_SA_RPC_DEF     32
+#define LL_SA_RPC_MAX     8192
+
+#define LL_SA_CACHE_BIT         5
+#define LL_SA_CACHE_SIZE       (1 << LL_SA_CACHE_BIT)
+#define LL_SA_CACHE_MASK       (LL_SA_CACHE_SIZE - 1)
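+
+/*
+ * Editor's note (worked values): LL_SA_CACHE_BIT == 5 gives
+ * LL_SA_CACHE_SIZE == 32 buckets and LL_SA_CACHE_MASK == 0x1f; a statahead
+ * entry would then presumably land in bucket (hash & LL_SA_CACHE_MASK) of
+ * sai_cache[] below, each bucket protected by its own sai_cache_lock[].
+ */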
+
+/* per-inode struct, for directories only */
+struct ll_statahead_info {
+       struct inode       *sai_inode;
+       atomic_t            sai_refcount;   /* hold a refcount when
+                                                * accessing this struct */
+       unsigned int        sai_generation; /* generation for statahead */
+       unsigned int        sai_max;    /* max ahead of lookup */
+       __u64              sai_sent;       /* stat requests sent count */
+       __u64              sai_replied;    /* stat requests which have
+                                                * received a reply */
+       __u64              sai_index;      /* index of statahead entry */
+       __u64              sai_index_wait; /* index of the entry that the
+                                                * caller is waiting for */
+       __u64              sai_hit;     /* hit count */
+       __u64              sai_miss;       /* miss count:
+                                                * for the "ls -al" case, it
+                                                * includes hidden dentry
+                                                * misses; for the "ls -l"
+                                                * case, it does not.
+                                                * "sai_miss_hidden" is used
+                                                * for the latter case.
+                                                */
+       unsigned int        sai_consecutive_miss; /* consecutive miss */
+       unsigned int        sai_miss_hidden;/* "ls -al", but first dentry
+                                                * is not a hidden one */
+       unsigned int        sai_skip_hidden;/* skipped hidden dentry count */
+       unsigned int        sai_ls_all:1,   /* "ls -al", do stat-ahead for
+                                                * hidden entries */
+                               sai_in_readpage:1,/* statahead is in readdir()*/
+                               sai_agl_valid:1;/* AGL is valid for the dir */
+       wait_queue_head_t            sai_waitq;      /* stat-ahead wait queue */
+       struct ptlrpc_thread    sai_thread;     /* stat-ahead thread */
+       struct ptlrpc_thread    sai_agl_thread; /* AGL thread */
+       struct list_head              sai_entries;    /* entry list */
+       struct list_head              sai_entries_received; /* entries returned */
+       struct list_head              sai_entries_stated;   /* entries stated */
+       struct list_head              sai_entries_agl; /* AGL entries to be sent */
+       struct list_head              sai_cache[LL_SA_CACHE_SIZE];
+       spinlock_t              sai_cache_lock[LL_SA_CACHE_SIZE];
+       atomic_t                sai_cache_count; /* entry count in cache */
+};
+
+int do_statahead_enter(struct inode *dir, struct dentry **dentry,
+                      int only_unplug);
+void ll_stop_statahead(struct inode *dir, void *key);
+
+static inline int ll_glimpse_size(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc;
+
+       down_read(&lli->lli_glimpse_sem);
+       rc = cl_glimpse_size(inode);
+       lli->lli_glimpse_time = cfs_time_current();
+       up_read(&lli->lli_glimpse_sem);
+       return rc;
+}
+
+static inline void
+ll_statahead_mark(struct inode *dir, struct dentry *dentry)
+{
+       struct ll_inode_info     *lli = ll_i2info(dir);
+       struct ll_statahead_info *sai = lli->lli_sai;
+       struct ll_dentry_data    *ldd = ll_d2d(dentry);
+
+       /* not the same process, don't mark */
+       if (lli->lli_opendir_pid != current_pid())
+               return;
+
+       if (sai != NULL && ldd != NULL)
+               ldd->lld_sa_generation = sai->sai_generation;
+}
+
+static inline int
+ll_need_statahead(struct inode *dir, struct dentry *dentryp)
+{
+       struct ll_inode_info  *lli;
+       struct ll_dentry_data *ldd;
+
+       if (ll_i2sbi(dir)->ll_sa_max == 0)
+               return -EAGAIN;
+
+       lli = ll_i2info(dir);
+       /* not the same process, don't statahead */
+       if (lli->lli_opendir_pid != current_pid())
+               return -EAGAIN;
+
+       /* statahead has been stopped */
+       if (lli->lli_opendir_key == NULL)
+               return -EAGAIN;
+
+       ldd = ll_d2d(dentryp);
+       /*
+        * When a dentry is stat'ed, the system triggers "revalidate" or
+        * "lookup" more than once: for "getattr", for "getxattr", and maybe
+        * for others. Under patchless client mode, the operation intent is
+        * not accurate, which may misguide the statahead thread. For
+        * example: the "revalidate" calls for "getattr" and "getxattr" of a
+        * dentry may carry the same operation intent -- "IT_GETATTR".
+        * In fact, one dentry should have only one chance to interact with
+        * the statahead thread, otherwise the statahead window will be
+        * confused. The solution is as follows:
+        * assign "lld_sa_generation" from "sai_generation" when a dentry
+        * does "IT_GETATTR" for the first time; every subsequent
+        * "IT_GETATTR" will bypass the statahead thread by checking:
+        * "lld_sa_generation == lli_sai->sai_generation"
+        */
+       if (ldd && lli->lli_sai &&
+           ldd->lld_sa_generation == lli->lli_sai->sai_generation)
+               return -EAGAIN;
+
+       return 1;
+}
+
+static inline int
+ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug)
+{
+       int ret;
+
+       ret = ll_need_statahead(dir, *dentryp);
+       if (ret <= 0)
+               return ret;
+
+       return do_statahead_enter(dir, dentryp, only_unplug);
+}
+
+/* llite ioctl registration support routines */
+enum llioc_iter {
+       LLIOC_CONT = 0,
+       LLIOC_STOP
+};
+
+#define LLIOC_MAX_CMD     256
+
+/*
+ * Rules for writing a callback function:
+ *
+ * Parameters:
+ *  @magic: The dynamic ioctl call routine will feed this value with the
+ *      pointer returned by ll_iocontrol_register.  Callback functions
+ *      should use this data to check for a potential collision of ioctl
+ *      cmds. If a collision is found, the callback function should return
+ *      LLIOC_CONT.
+ *  @rcp: The result of the ioctl command.
+ *
+ *  Return values:
+ *      If @magic matches the pointer returned by ll_iocontrol_register,
+ *      the callback should return LLIOC_STOP; return LLIOC_CONT otherwise.
+ */
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+               struct file *file, unsigned int cmd, unsigned long arg,
+               void *magic, int *rcp);
+
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+               unsigned int cmd, unsigned long arg, int *rcp);
+
+/* export functions */
+/* Register an ioctl block dynamically for a regular file.
+ *
+ * @cmd: the array of ioctl command set
+ * @count: number of commands in the @cmd
+ * @cb: callback function, it will be called if an ioctl command is found to
+ *      belong to the command list @cmd.
+ *
+ * Return value:
+ *      A magic pointer will be returned on success;
+ *      otherwise, NULL will be returned.
+ */
+void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
+void ll_iocontrol_unregister(void *magic);
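+
+/*
+ * Editor's sketch (hypothetical driver code, not part of the original
+ * patch): registering a private ioctl with the dynamic mechanism above.
+ * MYFS_IOC_EXAMPLE, my_magic and my_ioctl_cb are made-up names.
+ *
+ *	static void *my_magic;
+ *
+ *	static enum llioc_iter my_ioctl_cb(struct inode *inode,
+ *			struct file *file, unsigned int cmd,
+ *			unsigned long arg, void *magic, int *rcp)
+ *	{
+ *		if (magic != my_magic)
+ *			return LLIOC_CONT;  // not ours, keep iterating
+ *		*rcp = 0;                   // handle cmd, store the result
+ *		return LLIOC_STOP;
+ *	}
+ *
+ *	static unsigned int my_cmds[] = { MYFS_IOC_EXAMPLE };
+ *
+ *	// at init / exit time:
+ *	my_magic = ll_iocontrol_register(my_ioctl_cb, 1, my_cmds);
+ *	ll_iocontrol_unregister(my_magic);
+ */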
+
+
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+
+static inline struct ll_file_data *cl_iattr2fd(struct inode *inode,
+                                              const struct iattr *attr)
+{
+       LASSERT(attr->ia_valid & ATTR_FILE);
+       return LUSTRE_FPRIVATE(attr->ia_file);
+}
+
+static inline void cl_isize_lock(struct inode *inode)
+{
+       ll_inode_size_lock(inode);
+}
+
+static inline void cl_isize_unlock(struct inode *inode)
+{
+       ll_inode_size_unlock(inode);
+}
+
+static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms)
+{
+       LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0);
+       i_size_write(inode, kms);
+}
+
+static inline void cl_isize_write(struct inode *inode, loff_t kms)
+{
+       ll_inode_size_lock(inode);
+       i_size_write(inode, kms);
+       ll_inode_size_unlock(inode);
+}
+
+#define cl_isize_read(inode)        i_size_read(inode)
+
+static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode)
+{
+       return ll_merge_lvb(env, inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
+int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
+                      enum cl_fsync_mode mode, int ignore_layout);
+
+/** direct write pages */
+struct ll_dio_pages {
+       /** page array to be written. we don't support
+        * partial pages except the last one. */
+       struct page **ldp_pages;
+       /* offset of each page */
+       loff_t       *ldp_offsets;
+       /** if ldp_offsets is NULL, it means the pages are sequential,
+        * and this is the file offset of the first page. */
+       loff_t  ldp_start_offset;
+       /** how many bytes are to be written. */
+       size_t  ldp_size;
+       /** # of pages in the array. */
+       int        ldp_nr;
+};
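+
+/*
+ * Editor's sketch (illustrative; pages, n, file_offset, env, io and inode
+ * are made-up locals): describing a sequential run of pages for
+ * ll_direct_rw_pages() below. With ldp_offsets == NULL the pages are taken
+ * as contiguous starting at ldp_start_offset:
+ *
+ *	struct ll_dio_pages pv = {
+ *		.ldp_pages        = pages,        // struct page *pages[n]
+ *		.ldp_offsets      = NULL,         // sequential mode
+ *		.ldp_start_offset = file_offset,  // offset of pages[0]
+ *		.ldp_size         = n * PAGE_CACHE_SIZE,
+ *		.ldp_nr           = n,
+ *	};
+ *	ssize_t rc = ll_direct_rw_pages(env, io, WRITE, inode, &pv);
+ */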
+
+static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt,
+                                 int rc)
+{
+       int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ :
+                                     LPROC_LL_OSC_WRITE;
+
+       ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc);
+}
+
+extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                                 int rw, struct inode *inode,
+                                 struct ll_dio_pages *pv);
+
+static inline int ll_file_nolock(const struct file *file)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct inode *inode = file->f_dentry->d_inode;
+
+       LASSERT(fd != NULL);
+       return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
+               (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK));
+}
+
+static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
+                                   struct lookup_intent *it, __u64 *bits)
+{
+       if (!it->d.lustre.it_lock_set) {
+               struct lustre_handle handle;
+
+               /* If this inode is a remote object, it will get two
+                * separate locks in different namespaces: the master MDT,
+                * where the name entry is, will grant a LOOKUP lock; the
+                * remote MDT, where the object is, will grant an
+                * UPDATE|PERM lock. The inode will be attached to both
+                * LOOKUP and PERM locks, so revoking either lock will
+                * cause the dcache to be cleared */
+               if (it->d.lustre.it_remote_lock_mode) {
+                       handle.cookie = it->d.lustre.it_remote_lock_handle;
+                       CDEBUG(D_DLMTRACE, "setting l_data to inode %p"
+                              "(%lu/%u) for remote lock "LPX64"\n", inode,
+                              inode->i_ino, inode->i_generation,
+                              handle.cookie);
+                       md_set_lock_data(exp, &handle.cookie, inode, NULL);
+               }
+
+               handle.cookie = it->d.lustre.it_lock_handle;
+
+               CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)"
+                      " for lock "LPX64"\n", inode, inode->i_ino,
+                      inode->i_generation, handle.cookie);
+
+               md_set_lock_data(exp, &handle.cookie, inode,
+                                &it->d.lustre.it_lock_bits);
+               it->d.lustre.it_lock_set = 1;
+       }
+
+       if (bits != NULL)
+               *bits = it->d.lustre.it_lock_bits;
+}
+
+static inline void ll_lock_dcache(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+}
+
+static inline void ll_unlock_dcache(struct inode *inode)
+{
+       spin_unlock(&inode->i_lock);
+}
+
+static inline int d_lustre_invalid(const struct dentry *dentry)
+{
+       struct ll_dentry_data *lld = ll_d2d(dentry);
+
+       return (lld == NULL) || lld->lld_invalid;
+}
+
+static inline void __d_lustre_invalidate(struct dentry *dentry)
+{
+       struct ll_dentry_data *lld = ll_d2d(dentry);
+
+       if (lld != NULL)
+               lld->lld_invalid = 1;
+}
+
+/*
+ * Mark dentry INVALID. If the dentry refcount is zero (this is normally the
+ * case for ll_md_blocking_ast), unhash this dentry and let the dcache
+ * reclaim it later; otherwise dput() of the last refcount will unhash this
+ * dentry and kill it.
+ */
+static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
+{
+       CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
+              "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
+              dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+
+       spin_lock_nested(&dentry->d_lock,
+                        nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
+       __d_lustre_invalidate(dentry);
+       if (d_refcount(dentry) == 0)
+               __d_drop(dentry);
+       spin_unlock(&dentry->d_lock);
+}
+
+static inline void d_lustre_revalidate(struct dentry *dentry)
+{
+       spin_lock(&dentry->d_lock);
+       LASSERT(ll_d2d(dentry) != NULL);
+       ll_d2d(dentry)->lld_invalid = 0;
+       spin_unlock(&dentry->d_lock);
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/* Compatibility for old (1.8) compiled userspace quota code */
+struct if_quotactl_18 {
+       __u32              qc_cmd;
+       __u32              qc_type;
+       __u32              qc_id;
+       __u32              qc_stat;
+       struct obd_dqinfo       qc_dqinfo;
+       struct obd_dqblk        qc_dqblk;
+       char                obd_type[16];
+       struct obd_uuid  obd_uuid;
+};
+#define LL_IOC_QUOTACTL_18           _IOWR('f', 162, struct if_quotactl_18 *)
+/* End compatibility for old (1.8) compiled userspace quota code */
+#else
+#warning "remove old LL_IOC_QUOTACTL_18 compatibility code"
+#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */
+
+enum {
+       LL_LAYOUT_GEN_NONE  = ((__u32)-2),      /* layout lock was cancelled */
+       LL_LAYOUT_GEN_EMPTY = ((__u32)-1)       /* for empty layout */
+};
+
+int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
+int ll_layout_refresh(struct inode *inode, __u32 *gen);
+
+#endif /* LLITE_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
new file mode 100644 (file)
index 0000000..2311b20
--- /dev/null
@@ -0,0 +1,2408 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_lib.c
+ *
+ * Lustre Light Super operations
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+#include <cl_object.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_file_data_slab;
+
+LIST_HEAD(ll_super_blocks);
+DEFINE_SPINLOCK(ll_sb_lock);
+
+#ifndef MS_HAS_NEW_AOPS
+extern struct address_space_operations ll_aops;
+#else
+extern struct address_space_operations_ext ll_aops;
+#endif
+
+#ifndef log2
+#define log2(n) ffz(~(n))
+#endif
+
+static struct ll_sb_info *ll_init_sbi(void)
+{
+       struct ll_sb_info *sbi = NULL;
+       unsigned long pages;
+       unsigned long lru_page_max;
+       struct sysinfo si;
+       class_uuid_t uuid;
+       int i;
+       ENTRY;
+
+       OBD_ALLOC(sbi, sizeof(*sbi));
+       if (!sbi)
+               RETURN(NULL);
+
+       spin_lock_init(&sbi->ll_lock);
+       mutex_init(&sbi->ll_lco.lco_lock);
+       spin_lock_init(&sbi->ll_pp_extent_lock);
+       spin_lock_init(&sbi->ll_process_lock);
+       sbi->ll_rw_stats_on = 0;
+
+       si_meminfo(&si);
+       pages = si.totalram - si.totalhigh;
+       if (pages >> (20 - PAGE_CACHE_SHIFT) < 512)
+               lru_page_max = pages / 2;
+       else
+               lru_page_max = (pages / 4) * 3;
+
+       /* initialize lru data */
+       atomic_set(&sbi->ll_cache.ccc_users, 0);
+       sbi->ll_cache.ccc_lru_max = lru_page_max;
+       atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max);
+       spin_lock_init(&sbi->ll_cache.ccc_lru_lock);
+       INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru);
+
+       sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
+                                          SBI_DEFAULT_READAHEAD_MAX);
+       sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
+       sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+                                          SBI_DEFAULT_READAHEAD_WHOLE_MAX;
+       INIT_LIST_HEAD(&sbi->ll_conn_chain);
+       INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
+
+       ll_generate_random_uuid(uuid);
+       class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+       CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
+
+       spin_lock(&ll_sb_lock);
+       list_add_tail(&sbi->ll_list, &ll_super_blocks);
+       spin_unlock(&ll_sb_lock);
+
+       sbi->ll_flags |= LL_SBI_VERBOSE;
+       sbi->ll_flags |= LL_SBI_CHECKSUM;
+
+       sbi->ll_flags |= LL_SBI_LRU_RESIZE;
+
+       for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+               spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+                              pp_r_hist.oh_lock);
+               spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
+                              pp_w_hist.oh_lock);
+       }
+
+       /* metadata statahead is enabled by default */
+       sbi->ll_sa_max = LL_SA_RPC_DEF;
+       atomic_set(&sbi->ll_sa_total, 0);
+       atomic_set(&sbi->ll_sa_wrong, 0);
+       atomic_set(&sbi->ll_agl_total, 0);
+       sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+
+       RETURN(sbi);
+}
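+
+/*
+ * Editor's note (worked example, assuming 4KB pages): on a machine with 2GB
+ * of low memory, pages >> (20 - PAGE_CACHE_SHIFT) is 2048 (MB), which is
+ * >= 512, so lru_page_max above becomes 3/4 of the pages; on a 256MB
+ * machine the condition is true and lru_page_max is half of the pages.
+ * ra_max_pages_per_file is then min(pages / 32, 10240 pages), i.e. capped
+ * at 40MB of read-ahead per file.
+ */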
+
+void ll_free_sbi(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       ENTRY;
+
+       if (sbi != NULL) {
+               spin_lock(&ll_sb_lock);
+               list_del(&sbi->ll_list);
+               spin_unlock(&ll_sb_lock);
+               OBD_FREE(sbi, sizeof(*sbi));
+       }
+       EXIT;
+}
+
+static struct dentry_operations ll_d_root_ops = {
+       .d_compare = ll_dcompare,
+       .d_revalidate = ll_revalidate_nd,
+};
+
+static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
+                                   struct vfsmount *mnt)
+{
+       struct inode *root = NULL;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       struct obd_capa *oc = NULL;
+       struct obd_statfs *osfs = NULL;
+       struct ptlrpc_request *request = NULL;
+       struct obd_connect_data *data = NULL;
+       struct obd_uuid *uuid;
+       struct md_op_data *op_data;
+       struct lustre_md lmd;
+       obd_valid valid;
+       int size, err, checksum;
+       ENTRY;
+
+       obd = class_name2obd(md);
+       if (!obd) {
+               CERROR("MD %s: not setup or attached\n", md);
+               RETURN(-EINVAL);
+       }
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC_PTR(osfs);
+       if (osfs == NULL) {
+               OBD_FREE_PTR(data);
+               RETURN(-ENOMEM);
+       }
+
+       if (proc_lustre_fs_root) {
+               err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
+                                                 dt, md);
+               if (err < 0)
+                       CERROR("could not register mount in /proc/fs/lustre\n");
+       }
+
+       /* indicate the features supported by this client */
+       data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
+                                 OBD_CONNECT_ATTRFID  |
+                                 OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
+                                 OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
+                                 OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR    |
+                                 OBD_CONNECT_FULL20   | OBD_CONNECT_64BITHASH|
+                                 OBD_CONNECT_EINPROGRESS |
+                                 OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+       if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+               data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+       if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
+               data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+#ifdef CONFIG_FS_POSIX_ACL
+       data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
+#endif
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
+               /* flag mdc connection as lightweight, only used for test
+                * purposes; use with care */
+               data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
+
+       data->ocd_ibits_known = MDS_INODELOCK_FULL;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+
+       if (sb->s_flags & MS_RDONLY)
+               data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+       if (sbi->ll_flags & LL_SBI_USER_XATTR)
+               data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+
+#ifdef HAVE_MS_FLOCK_LOCK
+       /* force vfs to use lustre handler for flock() calls - bug 10743 */
+       sb->s_flags |= MS_FLOCK_LOCK;
+#endif
+#ifdef MS_HAS_NEW_AOPS
+       sb->s_flags |= MS_HAS_NEW_AOPS;
+#endif
+
+       if (sbi->ll_flags & LL_SBI_FLOCK)
+               sbi->ll_fop = &ll_file_operations_flock;
+       else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+               sbi->ll_fop = &ll_file_operations;
+       else
+               sbi->ll_fop = &ll_file_operations_noflock;
+
+       /* real client */
+       data->ocd_connect_flags |= OBD_CONNECT_REAL;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+       data->ocd_brw_size = MD_MAX_BRW_SIZE;
+
+       err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL);
+       if (err == -EBUSY) {
+               LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing "
+                                  "recovery, of which this client is not a "
+                                  "part. Please wait for recovery to complete,"
+                                  " abort, or time out.\n", md);
+               GOTO(out, err);
+       } else if (err) {
+               CERROR("cannot connect to %s: rc = %d\n", md, err);
+               GOTO(out, err);
+       }
+
+       sbi->ll_md_exp->exp_connect_data = *data;
+
+       err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
+                          LUSTRE_SEQ_METADATA);
+       if (err) {
+               CERROR("%s: Can't init metadata layer FID infrastructure, "
+                      "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_md, err);
+       }
+
+       /* For the mount we only need fs info from MDT0; in DNE this also
+        * ensures the client can be mounted as long as MDT0 is
+        * available. */
+       err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
+                       cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                       OBD_STATFS_FOR_MDT0);
+       if (err)
+               GOTO(out_md_fid, err);
+
+       /* This needs to be after statfs to ensure connect has finished.
+        * Note that "data" does NOT contain the valid connect reply.
+        * If connecting to a 1.8 server there will be no LMV device, so
+        * we can access the MDC export directly and exp_connect_flags will
+        * be non-zero, but if accessing an upgraded 2.1 server it will
+        * have the correct flags filled in.
+        * XXX: fill in the LMV exp_connect_flags from MDC(s). */
+       valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
+       if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
+           valid != CLIENT_CONNECT_MDT_REQD) {
+               char *buf;
+
+               OBD_ALLOC_WAIT(buf, PAGE_CACHE_SIZE);
+               obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+                                     valid ^ CLIENT_CONNECT_MDT_REQD, ",");
+               LCONSOLE_ERROR_MSG(0x170, "Server %s does not support "
+                                  "feature(s) needed for correct operation "
+                                  "of this client (%s). Please upgrade "
+                                  "server or downgrade client.\n",
+                                  sbi->ll_md_exp->exp_obd->obd_name, buf);
+               OBD_FREE(buf, PAGE_CACHE_SIZE);
+               GOTO(out_md_fid, err = -EPROTO);
+       }
+
+       size = sizeof(*data);
+       err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
+                          KEY_CONN_DATA,  &size, data, NULL);
+       if (err) {
+               CERROR("%s: Get connect data failed: rc = %d\n",
+                      sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_md_fid, err);
+       }
+
+       LASSERT(osfs->os_bsize);
+       sb->s_blocksize = osfs->os_bsize;
+       sb->s_blocksize_bits = log2(osfs->os_bsize);
+       sb->s_magic = LL_SUPER_MAGIC;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       sbi->ll_namelen = osfs->os_namelen;
+       sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK;
+
+       if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+           !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+               LCONSOLE_INFO("Disabling user_xattr feature because "
+                             "it is not supported on the server\n");
+               sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+               sb->s_flags |= MS_POSIXACL;
+#endif
+               sbi->ll_flags |= LL_SBI_ACL;
+       } else {
+               LCONSOLE_INFO("client wants to enable acl, but mdt does not!\n");
+#ifdef MS_POSIXACL
+               sb->s_flags &= ~MS_POSIXACL;
+#endif
+               sbi->ll_flags &= ~LL_SBI_ACL;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) {
+               if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+                       sbi->ll_flags |= LL_SBI_RMT_CLIENT;
+                       LCONSOLE_INFO("client is set as remote by default.\n");
+               }
+       } else {
+               if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+                       sbi->ll_flags &= ~LL_SBI_RMT_CLIENT;
+                       LCONSOLE_INFO("client claims to be remote, but the "
+                                     "server rejected it; forced to be local.\n");
+               }
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) {
+               LCONSOLE_INFO("client enabled MDS capability!\n");
+               sbi->ll_flags |= LL_SBI_MDS_CAPA;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) {
+               LCONSOLE_INFO("client enabled OSS capability!\n");
+               sbi->ll_flags |= LL_SBI_OSS_CAPA;
+       }
+
+       if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
+               sbi->ll_flags |= LL_SBI_64BIT_HASH;
+
+       if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+               sbi->ll_md_brw_size = data->ocd_brw_size;
+       else
+               sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+
+       if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
+               LCONSOLE_INFO("Layout lock feature supported.\n");
+               sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
+       }
+
+       obd = class_name2obd(dt);
+       if (!obd) {
+               CERROR("DT %s: not setup or attached\n", dt);
+               GOTO(out_md_fid, err = -ENODEV);
+       }
+
+       data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
+                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
+                                 OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
+                                 OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
+                                 OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
+                                 OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
+                                 OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH |
+                                 OBD_CONNECT_MAXBYTES |
+                                 OBD_CONNECT_EINPROGRESS |
+                                 OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
+                                 OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
+
+       if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
+               data->ocd_connect_flags |= OBD_CONNECT_SOM;
+
+       if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
+               /* OBD_CONNECT_CKSUM should always be set, even if checksums are
+                * disabled by default, because it can still be enabled on the
+                * fly via /proc. As a consequence, we still need to come to an
+                * agreement on the supported algorithms at connect time */
+               data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
+
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
+                       data->ocd_cksum_types = OBD_CKSUM_ADLER;
+               else
+                       data->ocd_cksum_types = cksum_types_supported_client();
+       }
+
+       data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE;
+
+       CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
+              "ocd_grant: %d\n", data->ocd_connect_flags,
+              data->ocd_version, data->ocd_grant);
+
+       obd->obd_upcall.onu_owner = &sbi->ll_lco;
+       obd->obd_upcall.onu_upcall = cl_ocd_update;
+
+       data->ocd_brw_size = DT_MAX_BRW_SIZE;
+
+       err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
+                         NULL);
+       if (err == -EBUSY) {
+               LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing "
+                                  "recovery, of which this client is not a "
+                                  "part.  Please wait for recovery to "
+                                  "complete, abort, or time out.\n", dt);
+               GOTO(out_md, err);
+       } else if (err) {
+               CERROR("%s: Cannot connect to %s: rc = %d\n",
+                      sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
+               GOTO(out_md, err);
+       }
+
+       sbi->ll_dt_exp->exp_connect_data = *data;
+
+       err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
+                          LUSTRE_SEQ_METADATA);
+       if (err) {
+               CERROR("%s: Can't init data layer FID infrastructure, "
+                      "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err);
+               GOTO(out_dt, err);
+       }
+
+       mutex_lock(&sbi->ll_lco.lco_lock);
+       sbi->ll_lco.lco_flags = data->ocd_connect_flags;
+       sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
+       sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
+       mutex_unlock(&sbi->ll_lco.lco_lock);
+
+       fid_zero(&sbi->ll_root_fid);
+       err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
+       if (err) {
+               CERROR("cannot mds_connect: rc = %d\n", err);
+               GOTO(out_lock_cn_cb, err);
+       }
+       if (!fid_is_sane(&sbi->ll_root_fid)) {
+               CERROR("%s: Invalid root fid "DFID" during mount\n",
+                      sbi->ll_md_exp->exp_obd->obd_name,
+                      PFID(&sbi->ll_root_fid));
+               GOTO(out_lock_cn_cb, err = -EINVAL);
+       }
+       CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
+
+       sb->s_op = &lustre_super_operations;
+#if THREAD_SIZE >= 8192 /*b=17630*/
+       sb->s_export_op = &lustre_export_operations;
+#endif
+
+       /* make root inode
+        * XXX: move this to after cbd setup? */
+       valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA;
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               valid |= OBD_MD_FLRMTPERM;
+       else if (sbi->ll_flags & LL_SBI_ACL)
+               valid |= OBD_MD_FLACL;
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               GOTO(out_lock_cn_cb, err = -ENOMEM);
+
+       op_data->op_fid1 = sbi->ll_root_fid;
+       op_data->op_mode = 0;
+       op_data->op_capa1 = oc;
+       op_data->op_valid = valid;
+
+       err = md_getattr(sbi->ll_md_exp, op_data, &request);
+       if (oc)
+               capa_put(oc);
+       OBD_FREE_PTR(op_data);
+       if (err) {
+               CERROR("%s: md_getattr failed for root: rc = %d\n",
+                      sbi->ll_md_exp->exp_obd->obd_name, err);
+               GOTO(out_lock_cn_cb, err);
+       }
+
+       err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+                              sbi->ll_md_exp, &lmd);
+       if (err) {
+               CERROR("failed to understand root inode md: rc = %d\n", err);
+               ptlrpc_req_finished(request);
+               GOTO(out_lock_cn_cb, err);
+       }
+
+       LASSERT(fid_is_sane(&sbi->ll_root_fid));
+       root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
+                                           sbi->ll_flags & LL_SBI_32BIT_API),
+                      &lmd);
+       md_free_lustre_md(sbi->ll_md_exp, &lmd);
+       ptlrpc_req_finished(request);
+
+       if (root == NULL || IS_ERR(root)) {
+               if (lmd.lsm)
+                       obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
+#ifdef CONFIG_FS_POSIX_ACL
+               if (lmd.posix_acl) {
+                       posix_acl_release(lmd.posix_acl);
+                       lmd.posix_acl = NULL;
+               }
+#endif
+               err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+               root = NULL;
+               CERROR("lustre_lite: bad iget4 for root\n");
+               GOTO(out_root, err);
+       }
+
+       err = ll_close_thread_start(&sbi->ll_lcq);
+       if (err) {
+               CERROR("cannot start close thread: rc %d\n", err);
+               GOTO(out_root, err);
+       }
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               rct_init(&sbi->ll_rct);
+               et_init(&sbi->ll_et);
+       }
+#endif
+
+       checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
+       err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+                                KEY_CHECKSUM, sizeof(checksum), &checksum,
+                                NULL);
+       cl_sb_init(sb);
+
+       err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
+                                KEY_CACHE_SET, sizeof(sbi->ll_cache),
+                                &sbi->ll_cache, NULL);
+
+       sb->s_root = d_make_root(root);
+       if (sb->s_root == NULL) {
+               CERROR("%s: can't make root dentry\n",
+                       ll_get_fsname(sb, NULL, 0));
+               GOTO(out_root, err = -ENOMEM);
+       }
+
+       /* kernels >= 2.6.38 store dentry operations in sb->s_d_op. */
+       d_set_d_op(sb->s_root, &ll_d_root_ops);
+       sb->s_d_op = &ll_d_ops;
+
+       sbi->ll_sdev_orig = sb->s_dev;
+
+       /* We set sb->s_dev equal on all lustre clients in order to support
+        * NFS export clustering.  NFSD requires that the FSID be the same
+        * on all clients. */
+       /* s_dev is also used in lt_compare() to compare two fs, but that is
+        * only a node-local comparison. */
+       uuid = obd_get_uuid(sbi->ll_md_exp);
+       if (uuid != NULL)
+               sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (osfs != NULL)
+               OBD_FREE_PTR(osfs);
+
+       RETURN(err);
+out_root:
+       if (root)
+               iput(root);
+out_lock_cn_cb:
+       obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+out_dt:
+       obd_disconnect(sbi->ll_dt_exp);
+       sbi->ll_dt_exp = NULL;
+       /* Make sure all OSCs are gone, since cl_cache is accessing sbi. */
+       obd_zombie_barrier();
+out_md_fid:
+       obd_fid_fini(sbi->ll_md_exp->exp_obd);
+out_md:
+       obd_disconnect(sbi->ll_md_exp);
+       sbi->ll_md_exp = NULL;
+out:
+       if (data != NULL)
+               OBD_FREE_PTR(data);
+       if (osfs != NULL)
+               OBD_FREE_PTR(osfs);
+       lprocfs_unregister_mountpoint(sbi);
+       return err;
+}
+
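+/* Query the MDC for the largest EA (striping metadata) size supported by
+ * the server, returning it in *lmmsize. */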
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+       int size, rc;
+
+       *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
+       size = sizeof(int);
+       rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
+                         KEY_MAX_EASIZE, &size, lmmsize, NULL);
+       if (rc)
+               CERROR("Get max mdsize error: rc = %d\n", rc);
+
+       RETURN(rc);
+}
+
+void ll_dump_inode(struct inode *inode)
+{
+       struct ll_d_hlist_node *tmp;
+       int dentry_count = 0;
+
+       LASSERT(inode != NULL);
+
+       ll_d_hlist_for_each(tmp, &inode->i_dentry)
+               dentry_count++;
+
+       CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n",
+              inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino,
+              inode->i_mode, atomic_read(&inode->i_count), dentry_count);
+}
+
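+/* Debug helper: log a dentry, its attached inode, and (up to @recur levels)
+ * its subdirectory dentries. */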
+void lustre_dump_dentry(struct dentry *dentry, int recur)
+{
+       struct list_head *tmp;
+       int subdirs = 0;
+
+       LASSERT(dentry != NULL);
+
+       list_for_each(tmp, &dentry->d_subdirs)
+               subdirs++;
+
+       CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u,"
+              " flags=0x%x, fsdata=%p, %d subdirs\n", dentry,
+              dentry->d_name.len, dentry->d_name.name,
+              dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
+              dentry->d_parent, dentry->d_inode, d_refcount(dentry),
+              dentry->d_flags, dentry->d_fsdata, subdirs);
+       if (dentry->d_inode != NULL)
+               ll_dump_inode(dentry->d_inode);
+
+       if (recur == 0)
+               return;
+
+       list_for_each(tmp, &dentry->d_subdirs) {
+               struct dentry *d = list_entry(tmp, struct dentry, d_u.d_child);
+               lustre_dump_dentry(d, recur - 1);
+       }
+}
+
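+/* Tear down the state set up by client_common_fill_super(): stop the close
+ * thread and disconnect the data and metadata exports. */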
+void client_common_put_super(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       ENTRY;
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               et_fini(&sbi->ll_et);
+               rct_fini(&sbi->ll_rct);
+       }
+#endif
+
+       ll_close_thread_shutdown(sbi->ll_lcq);
+
+       cl_sb_fini(sb);
+
+       list_del(&sbi->ll_conn_chain);
+
+       obd_fid_fini(sbi->ll_dt_exp->exp_obd);
+       obd_disconnect(sbi->ll_dt_exp);
+       sbi->ll_dt_exp = NULL;
+       /* wait till all OSCs are gone, since cl_cache is accessing sbi.
+        * see LU-2543. */
+       obd_zombie_barrier();
+
+       lprocfs_unregister_mountpoint(sbi);
+
+       obd_fid_fini(sbi->ll_md_exp->exp_obd);
+       obd_disconnect(sbi->ll_md_exp);
+       sbi->ll_md_exp = NULL;
+
+       EXIT;
+}
+
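+/* Called from the kill_sb path: restore the original s_dev (it was changed
+ * for NFS export clustering) and mark the sb info as unmounting. */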
+void ll_kill_super(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+
+       ENTRY;
+
+       /* sb not yet initialized? */
+       if (!(sb->s_flags & MS_ACTIVE))
+               return;
+
+       sbi = ll_s2sbi(sb);
+       /* We need to restore the original s_dev (changed for clustered NFS)
+        * before put_super, because newer kernels cache s_dev and changing
+        * sb->s_dev in put_super no longer affects the real device. */
+       if (sbi) {
+               sb->s_dev = sbi->ll_sdev_orig;
+               sbi->ll_umounting = 1;
+       }
+       EXIT;
+}
+
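+/* Return a newly allocated copy of the value part of an "opt=value" mount
+ * string, or NULL if @data does not match @opt or carries no value. */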
+char *ll_read_opt(const char *opt, char *data)
+{
+       char *value;
+       char *retval;
+       ENTRY;
+
+       CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
+       if (strncmp(opt, data, strlen(opt)))
+               RETURN(NULL);
+       if ((value = strchr(data, '=')) == NULL)
+               RETURN(NULL);
+
+       value++;
+       OBD_ALLOC(retval, strlen(value) + 1);
+       if (!retval) {
+               CERROR("out of memory!\n");
+               RETURN(NULL);
+       }
+
+       memcpy(retval, value, strlen(value)+1);
+       CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval);
+       RETURN(retval);
+}
+
+static inline int ll_set_opt(const char *opt, char *data, int fl)
+{
+       if (strncmp(opt, data, strlen(opt)) != 0)
+               return 0;
+       else
+               return fl;
+}
+
+/* non-client-specific mount options are parsed in lmd_parse */
+static int ll_options(char *options, int *flags)
+{
+       int tmp;
+       char *s1 = options, *s2;
+       ENTRY;
+
+       if (!options)
+               RETURN(0);
+
+       CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
+
+       while (*s1) {
+               CDEBUG(D_SUPER, "next opt=%s\n", s1);
+               tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 50, 0)
+               tmp = ll_set_opt("acl", s1, LL_SBI_ACL);
+               if (tmp) {
+                       /* Ignore deprecated mount option.  The client will
+                        * always try to mount with ACL support; whether it is
+                        * used depends on whether the server supports it. */
+                       LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                 "mount option 'acl'.\n");
+                       goto next;
+               }
+               tmp = ll_set_opt("noacl", s1, LL_SBI_ACL);
+               if (tmp) {
+                       LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated "
+                                                 "mount option 'noacl'.\n");
+                       goto next;
+               }
+#else
+#warning "{no}acl options have been deprecated since 1.8, please remove them"
+#endif
+               tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+
+               tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
+               if (tmp) {
+                       *flags |= tmp;
+                       goto next;
+               }
+               tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
+               if (tmp) {
+                       *flags &= ~tmp;
+                       goto next;
+               }
+               LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
+                                  s1);
+               RETURN(-EINVAL);
+
+next:
+               /* Find next opt */
+               s2 = strchr(s1, ',');
+               if (s2 == NULL)
+                       break;
+               s1 = s2 + 1;
+       }
+       RETURN(0);
+}
+
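+/* Initialize the Lustre-private part of a newly allocated inode; directories
+ * get statahead state while other inodes get size/truncate bookkeeping. */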
+void ll_lli_init(struct ll_inode_info *lli)
+{
+       lli->lli_inode_magic = LLI_INODE_MAGIC;
+       lli->lli_flags = 0;
+       lli->lli_ioepoch = 0;
+       lli->lli_maxbytes = MAX_LFS_FILESIZE;
+       spin_lock_init(&lli->lli_lock);
+       lli->lli_posix_acl = NULL;
+       lli->lli_remote_perms = NULL;
+       mutex_init(&lli->lli_rmtperm_mutex);
+       /* Do not set lli_fid, it has been initialized already. */
+       fid_zero(&lli->lli_pfid);
+       INIT_LIST_HEAD(&lli->lli_close_list);
+       INIT_LIST_HEAD(&lli->lli_oss_capas);
+       atomic_set(&lli->lli_open_count, 0);
+       lli->lli_mds_capa = NULL;
+       lli->lli_rmtperm_time = 0;
+       lli->lli_pending_och = NULL;
+       lli->lli_mds_read_och = NULL;
+       lli->lli_mds_write_och = NULL;
+       lli->lli_mds_exec_och = NULL;
+       lli->lli_open_fd_read_count = 0;
+       lli->lli_open_fd_write_count = 0;
+       lli->lli_open_fd_exec_count = 0;
+       mutex_init(&lli->lli_och_mutex);
+       spin_lock_init(&lli->lli_agl_lock);
+       lli->lli_has_smd = false;
+       lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+       lli->lli_clob = NULL;
+
+       LASSERT(lli->lli_vfs_inode.i_mode != 0);
+       if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
+               mutex_init(&lli->lli_readdir_mutex);
+               lli->lli_opendir_key = NULL;
+               lli->lli_sai = NULL;
+               lli->lli_def_acl = NULL;
+               spin_lock_init(&lli->lli_sa_lock);
+               lli->lli_opendir_pid = 0;
+       } else {
+               sema_init(&lli->lli_size_sem, 1);
+               lli->lli_size_sem_owner = NULL;
+               lli->lli_symlink_name = NULL;
+               init_rwsem(&lli->lli_trunc_sem);
+               mutex_init(&lli->lli_write_mutex);
+               init_rwsem(&lli->lli_glimpse_sem);
+               lli->lli_glimpse_time = 0;
+               INIT_LIST_HEAD(&lli->lli_agl_list);
+               lli->lli_agl_index = 0;
+               lli->lli_async_rc = 0;
+               lli->lli_volatile = false;
+       }
+       mutex_init(&lli->lli_layout_mutex);
+}
+
+static inline int ll_bdi_register(struct backing_dev_info *bdi)
+{
+       static atomic_t ll_bdi_num = ATOMIC_INIT(0);
+
+       bdi->name = "lustre";
+       return bdi_register(bdi, NULL, "lustre-%d",
+                           atomic_inc_return(&ll_bdi_num));
+}
+
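+/*
+ * Client mount entry point: allocate the sb info, parse client mount
+ * options, set up the backing device, process the configuration llog to
+ * create the local md/dt devices, and hand off to
+ * client_common_fill_super() for the actual connections.
+ */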
+int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
+{
+       struct lustre_profile *lprof = NULL;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi;
+       char  *dt = NULL, *md = NULL;
+       char  *profilenm = get_profile_name(sb);
+       struct config_llog_instance *cfg;
+       /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */
+       const int instlen = sizeof(cfg->cfg_instance) * 2 + 2;
+       int    err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+       OBD_ALLOC_PTR(cfg);
+       if (cfg == NULL)
+               RETURN(-ENOMEM);
+
+       try_module_get(THIS_MODULE);
+
+       /* client additional sb info */
+       lsi->lsi_llsbi = sbi = ll_init_sbi();
+       if (!sbi) {
+               module_put(THIS_MODULE);
+               OBD_FREE_PTR(cfg);
+               RETURN(-ENOMEM);
+       }
+
+       err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
+       if (err)
+               GOTO(out_free, err);
+
+       err = bdi_init(&lsi->lsi_bdi);
+       if (err)
+               GOTO(out_free, err);
+       lsi->lsi_flags |= LSI_BDI_INITIALIZED;
+       lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY;
+       err = ll_bdi_register(&lsi->lsi_bdi);
+       if (err)
+               GOTO(out_free, err);
+
+       sb->s_bdi = &lsi->lsi_bdi;
+
+       /* Generate a string unique to this super, in case some joker tries
+        * to mount the same fs at two mount points.
+        * Use the address of the super itself. */
+       cfg->cfg_instance = sb;
+       cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
+       cfg->cfg_callback = class_config_llog_handler;
+       /* set up client obds */
+       err = lustre_process_log(sb, profilenm, cfg);
+       if (err < 0) {
+               CERROR("Unable to process log: %d\n", err);
+               GOTO(out_free, err);
+       }
+
+       /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
+       lprof = class_get_profile(profilenm);
+       if (lprof == NULL) {
+               LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be"
+                                  " read from the MGS.  Does that filesystem "
+                                  "exist?\n", profilenm);
+               GOTO(out_free, err = -EINVAL);
+       }
+       CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
+              lprof->lp_md, lprof->lp_dt);
+
+       OBD_ALLOC(dt, strlen(lprof->lp_dt) + instlen + 2);
+       if (!dt)
+               GOTO(out_free, err = -ENOMEM);
+       sprintf(dt, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
+
+       OBD_ALLOC(md, strlen(lprof->lp_md) + instlen + 2);
+       if (!md)
+               GOTO(out_free, err = -ENOMEM);
+       sprintf(md, "%s-%p", lprof->lp_md, cfg->cfg_instance);
+
+       /* connections, registrations, sb setup */
+       err = client_common_fill_super(sb, md, dt, mnt);
+
+out_free:
+       if (md)
+               OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2);
+       if (dt)
+               OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2);
+       if (err)
+               ll_put_super(sb);
+       else if (sbi->ll_flags & LL_SBI_VERBOSE)
+               LCONSOLE_WARN("Mounted %s\n", profilenm);
+
+       OBD_FREE_PTR(cfg);
+       RETURN(err);
+} /* ll_fill_super */
+
+void ll_put_super(struct super_block *sb)
+{
+       struct config_llog_instance cfg;
+       struct obd_device *obd;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       char *profilenm = get_profile_name(sb);
+       int next, force = 1;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
+
+       ll_print_capa_stat(sbi);
+
+       cfg.cfg_instance = sb;
+       lustre_end_log(sb, profilenm, &cfg);
+
+       if (sbi->ll_md_exp) {
+               obd = class_exp2obd(sbi->ll_md_exp);
+               if (obd)
+                       force = obd->obd_force;
+       }
+
+       /* We need to set force before the lov_disconnect in
+        * lustre_common_put_super, since it cleans up OSCs as well. */
+       if (force) {
+               next = 0;
+               while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
+                                                    &next)) != NULL) {
+                       obd->obd_force = force;
+               }
+       }
+
+       if (sbi->ll_lcq) {
+               /* Only if client_common_fill_super succeeded */
+               client_common_put_super(sb);
+       }
+
+       next = 0;
+       while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL) {
+               class_manual_cleanup(obd);
+       }
+
+       if (sbi->ll_flags & LL_SBI_VERBOSE)
+               LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
+
+       if (profilenm)
+               class_del_profile(profilenm);
+
+       if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
+               bdi_destroy(&lsi->lsi_bdi);
+               lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
+       }
+
+       ll_free_sbi(sb);
+       lsi->lsi_llsbi = NULL;
+
+       lustre_common_put_super(sb);
+
+       module_put(THIS_MODULE);
+
+       EXIT;
+} /* ll_put_super */
+
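+/* Return an igrab()bed inode attached to the lock's resource LVB, or NULL
+ * if there is none or its magic shows the inode is already dead. */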
+struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
+{
+       struct inode *inode = NULL;
+
+       /* NOTE: we depend on atomic igrab() -bzzz */
+       lock_res_and_lock(lock);
+       if (lock->l_resource->lr_lvb_inode) {
+               struct ll_inode_info *lli;
+               lli = ll_i2info(lock->l_resource->lr_lvb_inode);
+               if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+                       inode = igrab(lock->l_resource->lr_lvb_inode);
+               } else {
+                       inode = lock->l_resource->lr_lvb_inode;
+                       LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+                                        D_WARNING, lock, "lr_lvb_inode %p is "
+                                        "bogus: magic %08x",
+                                        lock->l_resource->lr_lvb_inode,
+                                        lli->lli_inode_magic);
+                       inode = NULL;
+               }
+       }
+       unlock_res_and_lock(lock);
+       return inode;
+}
+
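+/* As above, but the inode is taken from l_ast_data rather than from the
+ * resource LVB. */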
+struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
+{
+       struct inode *inode = NULL;
+       /* NOTE: we depend on atomic igrab() -bzzz */
+       lock_res_and_lock(lock);
+       if (lock->l_ast_data) {
+               struct ll_inode_info *lli = ll_i2info(lock->l_ast_data);
+               if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
+                       inode = igrab(lock->l_ast_data);
+               } else {
+                       inode = lock->l_ast_data;
+                       LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
+                                        D_WARNING, lock, "l_ast_data %p is "
+                                        "bogus: magic %08x", lock->l_ast_data,
+                                        lli->lli_inode_magic);
+                       inode = NULL;
+               }
+       }
+       unlock_res_and_lock(lock);
+       return inode;
+}
+
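+/* Called when the VFS evicts an inode: close cached MDS open handles, drop
+ * ACLs, remote permissions and capabilities, and finish the cl_object. */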
+void ll_clear_inode(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+              inode->i_generation, inode);
+
+       if (S_ISDIR(inode->i_mode)) {
+               /* these should have been cleared in ll_file_release */
+               LASSERT(lli->lli_opendir_key == NULL);
+               LASSERT(lli->lli_sai == NULL);
+               LASSERT(lli->lli_opendir_pid == 0);
+       }
+
+       lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+       md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
+
+       LASSERT(!lli->lli_open_fd_write_count);
+       LASSERT(!lli->lli_open_fd_read_count);
+       LASSERT(!lli->lli_open_fd_exec_count);
+
+       if (lli->lli_mds_write_och)
+               ll_md_real_close(inode, FMODE_WRITE);
+       if (lli->lli_mds_exec_och)
+               ll_md_real_close(inode, FMODE_EXEC);
+       if (lli->lli_mds_read_och)
+               ll_md_real_close(inode, FMODE_READ);
+
+       if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) {
+               OBD_FREE(lli->lli_symlink_name,
+                        strlen(lli->lli_symlink_name) + 1);
+               lli->lli_symlink_name = NULL;
+       }
+
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               LASSERT(lli->lli_posix_acl == NULL);
+               if (lli->lli_remote_perms) {
+                       free_rmtperm_hash(lli->lli_remote_perms);
+                       lli->lli_remote_perms = NULL;
+               }
+       }
+#ifdef CONFIG_FS_POSIX_ACL
+       else if (lli->lli_posix_acl) {
+               LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+               LASSERT(lli->lli_remote_perms == NULL);
+               posix_acl_release(lli->lli_posix_acl);
+               lli->lli_posix_acl = NULL;
+       }
+#endif
+       lli->lli_inode_magic = LLI_INODE_DEAD;
+
+       ll_clear_inode_capas(inode);
+       if (!S_ISDIR(inode->i_mode))
+               LASSERT(list_empty(&lli->lli_agl_list));
+
+       /*
+        * XXX This has to be done before lsm is freed below, because
+        * cl_object still uses inode lsm.
+        */
+       cl_inode_fini(inode);
+       lli->lli_has_smd = false;
+
+       EXIT;
+}
+
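+/* Send a setattr RPC to the MDS and apply the reply, including any IO epoch
+ * data, to the local inode. */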
+int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
+                 struct md_open_data **mod)
+{
+       struct lustre_md md;
+       struct inode *inode = dentry->d_inode;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *request = NULL;
+       int rc, ia_valid;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
+                       &request, mod);
+       if (rc) {
+               ptlrpc_req_finished(request);
+               if (rc == -ENOENT) {
+                       clear_nlink(inode);
+                       /* Unlinked special device node?  Or just a race?
+                        * Pretend we have done everything. */
+                       if (!S_ISREG(inode->i_mode) &&
+                           !S_ISDIR(inode->i_mode)) {
+                               ia_valid = op_data->op_attr.ia_valid;
+                               op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
+                               rc = simple_setattr(dentry, &op_data->op_attr);
+                               op_data->op_attr.ia_valid = ia_valid;
+                       }
+               } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
+                       CERROR("md_setattr fails: rc = %d\n", rc);
+               }
+               RETURN(rc);
+       }
+
+       rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
+                             sbi->ll_md_exp, &md);
+       if (rc) {
+               ptlrpc_req_finished(request);
+               RETURN(rc);
+       }
+
+       ia_valid = op_data->op_attr.ia_valid;
+       /* inode size will be set in ll_setattr_ost; we can't do it now
+        * since the dirty cache is not cleared yet. */
+       op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+       rc = simple_setattr(dentry, &op_data->op_attr);
+       op_data->op_attr.ia_valid = ia_valid;
+
+       /* Extract epoch data if obtained. */
+       op_data->op_handle = md.body->handle;
+       op_data->op_ioepoch = md.body->ioepoch;
+
+       ll_update_inode(inode, &md);
+       ptlrpc_req_finished(request);
+
+       RETURN(rc);
+}
+
+/* Close IO epoch and send Size-on-MDS attribute update. */
+static int ll_setattr_done_writing(struct inode *inode,
+                                  struct md_op_data *op_data,
+                                  struct md_open_data *mod)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       if (!S_ISREG(inode->i_mode))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID" for truncate\n",
+              op_data->op_ioepoch, PFID(&lli->lli_fid));
+
+       op_data->op_flags = MF_EPOCH_CLOSE;
+       ll_done_writing_attr(inode, op_data);
+       ll_pack_inode2opdata(inode, op_data, NULL);
+
+       rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
+       if (rc == -EAGAIN) {
+               /* MDS has instructed us to obtain the Size-on-MDS attribute
+                * from the OSTs and send a setattr back to the MDS. */
+               rc = ll_som_update(inode, op_data);
+       } else if (rc) {
+               CERROR("inode %lu mdc truncate failed: rc = %d\n",
+                      inode->i_ino, rc);
+       }
+       RETURN(rc);
+}
+
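+/* Send attribute updates to the OSTs, using the truncate capability when
+ * the size is changing and the plain MDS capability otherwise. */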
+static int ll_setattr_ost(struct inode *inode, struct iattr *attr)
+{
+       struct obd_capa *capa;
+       int rc;
+
+       if (attr->ia_valid & ATTR_SIZE)
+               capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
+       else
+               capa = ll_mdscapa_get(inode);
+
+       rc = cl_setattr_ost(inode, attr, capa);
+
+       if (attr->ia_valid & ATTR_SIZE)
+               ll_truncate_free_capa(capa);
+       else
+               capa_put(capa);
+
+       return rc;
+}
+
+
+/* If this inode has objects allocated to it (lsm != NULL), then the OST
+ * object(s) determine the file size and mtime.  Otherwise, the MDS will
+ * keep these values until such a time that objects are allocated for it.
+ * We do the MDS operations first, as it is checking permissions for us.
+ * We don't do the MDS RPC if there is nothing that we want to store there;
+ * otherwise there is no harm in updating mtime/atime on the MDS if we are
+ * going to do an RPC anyway.
+ *
+ * If we are doing a truncate, we will send the mtime and ctime updates
+ * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
+ * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
+ * at the same time.
+ */
+int ll_setattr_raw(struct dentry *dentry, struct iattr *attr)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct md_op_data *op_data = NULL;
+       struct md_open_data *mod = NULL;
+       int rc = 0, rc1 = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, "
+               "valid %x\n", ll_get_fsname(inode->i_sb, NULL, 0), inode,
+               PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size,
+               attr->ia_valid);
+
+       if (attr->ia_valid & ATTR_SIZE) {
+               /* Check new size against VFS/VM file size limit and rlimit */
+               rc = inode_newsize_ok(inode, attr->ia_size);
+               if (rc)
+                       RETURN(rc);
+
+               /* The maximum Lustre file size is variable, based on the
+                * OST maximum object size and number of stripes.  This
+                * needs another check in addition to the VFS check above. */
+               if (attr->ia_size > ll_file_maxbytes(inode)) {
+                       CDEBUG(D_INODE, "file "DFID" too large %llu > "LPU64"\n",
+                              PFID(&lli->lli_fid), attr->ia_size,
+                              ll_file_maxbytes(inode));
+                       RETURN(-EFBIG);
+               }
+
+               attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+       }
+
+       /* POSIX: check before ATTR_*TIME_SET is set (from inode_change_ok) */
+       if (attr->ia_valid & TIMES_SET_FLAGS) {
+               if (current_fsuid() != inode->i_uid &&
+                   !cfs_capable(CFS_CAP_FOWNER))
+                       RETURN(-EPERM);
+       }
+
+       /* We mark all of the fields "set" so MDS/OST does not re-set them */
+       if (attr->ia_valid & ATTR_CTIME) {
+               attr->ia_ctime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_CTIME_SET;
+       }
+       if (!(attr->ia_valid & ATTR_ATIME_SET) &&
+           (attr->ia_valid & ATTR_ATIME)) {
+               attr->ia_atime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_ATIME_SET;
+       }
+       if (!(attr->ia_valid & ATTR_MTIME_SET) &&
+           (attr->ia_valid & ATTR_MTIME)) {
+               attr->ia_mtime = CFS_CURRENT_TIME;
+               attr->ia_valid |= ATTR_MTIME_SET;
+       }
+
+       if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
+               CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n",
+                      LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
+                      cfs_time_current_sec());
+
+       /* If we are changing file size, file content is modified, flag it. */
+       if (attr->ia_valid & ATTR_SIZE) {
+               attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags |= LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       /* We always do an MDS RPC, even if we're only changing the size;
+        * only the MDS knows whether truncate() should fail with -ETXTBSY */
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               RETURN(-ENOMEM);
+
+       if (!S_ISDIR(inode->i_mode)) {
+               if (attr->ia_valid & ATTR_SIZE)
+                       inode_dio_write_done(inode);
+               mutex_unlock(&inode->i_mutex);
+               down_write(&lli->lli_trunc_sem);
+       }
+
+       memcpy(&op_data->op_attr, attr, sizeof(*attr));
+
+       /* Open epoch for truncate. */
+       if (exp_connect_som(ll_i2mdexp(inode)) &&
+           (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
+               op_data->op_flags = MF_EPOCH_OPEN;
+
+       rc = ll_md_setattr(dentry, op_data, &mod);
+       if (rc)
+               GOTO(out, rc);
+
+       /* RPC to MDT is sent, cancel data modification flag */
+       if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags &= ~LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       ll_ioepoch_open(lli, op_data->op_ioepoch);
+       if (!S_ISREG(inode->i_mode))
+               GOTO(out, rc = 0);
+
+       if (attr->ia_valid & (ATTR_SIZE |
+                             ATTR_ATIME | ATTR_ATIME_SET |
+                             ATTR_MTIME | ATTR_MTIME_SET))
+               /* For truncate and utimes RPCs sending attributes to the
+                * OSTs, setting mtime/atime to the past will be performed
+                * under a PW [0:EOF] extent lock (new_size:EOF for truncate).
+                * It may seem excessive to send mtime/atime updates to the
+                * OSTs when not setting times to the past, but it is
+                * necessary due to possible time de-synchronization between
+                * the MDT inode and OST objects */
+               rc = ll_setattr_ost(inode, attr);
+       EXIT;
+out:
+       if (op_data) {
+               if (op_data->op_ioepoch) {
+                       rc1 = ll_setattr_done_writing(inode, op_data, mod);
+                       if (!rc)
+                               rc = rc1;
+               }
+               ll_finish_md_op_data(op_data);
+       }
+       if (!S_ISDIR(inode->i_mode)) {
+               up_write(&lli->lli_trunc_sem);
+               mutex_lock(&inode->i_mutex);
+               if (attr->ia_valid & ATTR_SIZE)
+                       inode_dio_wait(inode);
+       }
+
+       ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
+                       LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
+
+       return rc;
+}
+
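+/* VFS setattr entry point: mirror the kernel's suid/sgid-stripping rules
+ * into ia_valid, then hand off to ll_setattr_raw(). */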
+int ll_setattr(struct dentry *de, struct iattr *attr)
+{
+       int mode = de->d_inode->i_mode;
+
+       if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
+                             (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
+               attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+
+       if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
+                              (ATTR_SIZE|ATTR_MODE)) &&
+           (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
+            (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+             !(attr->ia_mode & S_ISGID))))
+               attr->ia_valid |= ATTR_FORCE;
+
+       if ((mode & S_ISUID) &&
+           !(attr->ia_mode & S_ISUID) &&
+           !(attr->ia_valid & ATTR_KILL_SUID))
+               attr->ia_valid |= ATTR_KILL_SUID;
+
+       if (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
+           !(attr->ia_mode & S_ISGID) &&
+           !(attr->ia_valid & ATTR_KILL_SGID))
+               attr->ia_valid |= ATTR_KILL_SGID;
+
+       return ll_setattr_raw(de, attr);
+}
+
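+/* Aggregate statfs data for the filesystem: inode counts come from the MDS
+ * and block counts from the OSTs, with os_files reduced when the OSTs have
+ * fewer free objects than the MDS has free inodes. */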
+int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
+                      __u64 max_age, __u32 flags)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_statfs obd_osfs;
+       int rc;
+       ENTRY;
+
+       rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
+       if (rc) {
+               CERROR("md_statfs fails: rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       osfs->os_type = sb->s_magic;
+
+       CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+              osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, osfs->os_files);
+
+       if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+               flags |= OBD_STATFS_NODELAY;
+
+       rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags);
+       if (rc) {
+               CERROR("obd_statfs fails: rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
+              obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
+              obd_osfs.os_files);
+
+       osfs->os_bsize = obd_osfs.os_bsize;
+       osfs->os_blocks = obd_osfs.os_blocks;
+       osfs->os_bfree = obd_osfs.os_bfree;
+       osfs->os_bavail = obd_osfs.os_bavail;
+
+       /* If we don't have as many objects free on the OST as inodes
+        * on the MDS, we reduce the total number of inodes to
+        * compensate, so that the "inodes in use" number is correct.
+        */
+       if (obd_osfs.os_ffree < osfs->os_ffree) {
+               osfs->os_files = (osfs->os_files - osfs->os_ffree) +
+                       obd_osfs.os_ffree;
+               osfs->os_ffree = obd_osfs.os_ffree;
+       }
+
+       RETURN(rc);
+}
+
+int ll_statfs(struct dentry *de, struct kstatfs *sfs)
+{
+       struct super_block *sb = de->d_sb;
+       struct obd_statfs osfs;
+       int rc;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64());
+       ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
+
+       /* Some amount of caching on the client is allowed */
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+       if (rc)
+               return rc;
+
+       statfs_unpack(sfs, &osfs);
+
+       /* We need to downshift for all 32-bit kernels, because we can't
+        * tell if the kernel is being called via sys_statfs64() or not.
+        * Stop before overflowing f_bsize - in which case it is better
+        * to just risk EOVERFLOW if caller is using old sys_statfs(). */
+       if (sizeof(long) < 8) {
+               while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
+                       sfs->f_bsize <<= 1;
+
+                       osfs.os_blocks >>= 1;
+                       osfs.os_bfree >>= 1;
+                       osfs.os_bavail >>= 1;
+               }
+       }
+
+       sfs->f_blocks = osfs.os_blocks;
+       sfs->f_bfree = osfs.os_bfree;
+       sfs->f_bavail = osfs.os_bavail;
+
+       return 0;
+}
+
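+/* Serialize i_size updates on non-directory inodes; the semaphore owner is
+ * recorded so that recursive locking trips an assertion. */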
+void ll_inode_size_lock(struct inode *inode)
+{
+       struct ll_inode_info *lli;
+
+       LASSERT(!S_ISDIR(inode->i_mode));
+
+       lli = ll_i2info(inode);
+       LASSERT(lli->lli_size_sem_owner != current);
+       down(&lli->lli_size_sem);
+       LASSERT(lli->lli_size_sem_owner == NULL);
+       lli->lli_size_sem_owner = current;
+}
+
+void ll_inode_size_unlock(struct inode *inode)
+{
+       struct ll_inode_info *lli;
+
+       lli = ll_i2info(inode);
+       LASSERT(lli->lli_size_sem_owner == current);
+       lli->lli_size_sem_owner = NULL;
+       up(&lli->lli_size_sem);
+}
+
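+/*
+ * Merge attributes from an MDS reply into the VFS inode: layout, ACLs or
+ * remote permissions, timestamps (never moved backwards), ownership, size
+ * (under SOM rules for regular files) and any MDS/OSS capabilities.
+ */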
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct mdt_body *body = md->body;
+       struct lov_stripe_md *lsm = md->lsm;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+       LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+       if (lsm != NULL) {
+               if (!lli->lli_has_smd &&
+                   !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
+                       cl_file_inode_init(inode, md);
+
+               lli->lli_maxbytes = lsm->lsm_maxbytes;
+               if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
+                       lli->lli_maxbytes = MAX_LFS_FILESIZE;
+       }
+
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT) {
+               if (body->valid & OBD_MD_FLRMTPERM)
+                       ll_update_remote_perm(inode, md->remote_perm);
+       }
+#ifdef CONFIG_FS_POSIX_ACL
+       else if (body->valid & OBD_MD_FLACL) {
+               spin_lock(&lli->lli_lock);
+               if (lli->lli_posix_acl)
+                       posix_acl_release(lli->lli_posix_acl);
+               lli->lli_posix_acl = md->posix_acl;
+               spin_unlock(&lli->lli_lock);
+       }
+#endif
+       inode->i_ino = cl_fid_build_ino(&body->fid1,
+                                       sbi->ll_flags & LL_SBI_32BIT_API);
+       inode->i_generation = cl_fid_build_gen(&body->fid1);
+
+       if (body->valid & OBD_MD_FLATIME) {
+               if (body->atime > LTIME_S(inode->i_atime))
+                       LTIME_S(inode->i_atime) = body->atime;
+               lli->lli_lvb.lvb_atime = body->atime;
+       }
+       if (body->valid & OBD_MD_FLMTIME) {
+               if (body->mtime > LTIME_S(inode->i_mtime)) {
+                       CDEBUG(D_INODE, "setting ino %lu mtime from %lu "
+                              "to "LPU64"\n", inode->i_ino,
+                              LTIME_S(inode->i_mtime), body->mtime);
+                       LTIME_S(inode->i_mtime) = body->mtime;
+               }
+               lli->lli_lvb.lvb_mtime = body->mtime;
+       }
+       if (body->valid & OBD_MD_FLCTIME) {
+               if (body->ctime > LTIME_S(inode->i_ctime))
+                       LTIME_S(inode->i_ctime) = body->ctime;
+               lli->lli_lvb.lvb_ctime = body->ctime;
+       }
+       if (body->valid & OBD_MD_FLMODE)
+               inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
+       if (body->valid & OBD_MD_FLTYPE)
+               inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
+       LASSERT(inode->i_mode != 0);
+       if (S_ISREG(inode->i_mode)) {
+               inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS);
+       } else {
+               inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+       }
+       if (body->valid & OBD_MD_FLUID)
+               inode->i_uid = body->uid;
+       if (body->valid & OBD_MD_FLGID)
+               inode->i_gid = body->gid;
+       if (body->valid & OBD_MD_FLFLAGS)
+               inode->i_flags = ll_ext_to_inode_flags(body->flags);
+       if (body->valid & OBD_MD_FLNLINK)
+               set_nlink(inode, body->nlink);
+       if (body->valid & OBD_MD_FLRDEV)
+               inode->i_rdev = old_decode_dev(body->rdev);
+
+       if (body->valid & OBD_MD_FLID) {
+               /* FID shouldn't be changed! */
+               if (fid_is_sane(&lli->lli_fid)) {
+                       LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
+                                "Trying to change FID "DFID
+                                " to the "DFID", inode %lu/%u(%p)\n",
+                                PFID(&lli->lli_fid), PFID(&body->fid1),
+                                inode->i_ino, inode->i_generation, inode);
+               } else
+                       lli->lli_fid = body->fid1;
+       }
+
+       LASSERT(fid_seq(&lli->lli_fid) != 0);
+
+       if (body->valid & OBD_MD_FLSIZE) {
+               if (exp_connect_som(ll_i2mdexp(inode)) &&
+                   S_ISREG(inode->i_mode)) {
+                       struct lustre_handle lockh;
+                       ldlm_mode_t mode;
+
+                       /* As a blocking AST may have been processed by this
+                        * time, we need to check that an UPDATE lock is held
+                        * on the client, and set LLIF_MDS_SIZE_LOCK while
+                        * holding it. */
+                       mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
+                                              &lockh, LDLM_FL_CBPENDING);
+                       if (mode) {
+                               if (lli->lli_flags & (LLIF_DONE_WRITING |
+                                                     LLIF_EPOCH_PENDING |
+                                                     LLIF_SOM_DIRTY)) {
+                                       CERROR("ino %lu flags %u still has "
+                                              "size authority! Do not trust "
+                                              "the size from the MDS\n",
+                                              inode->i_ino, lli->lli_flags);
+                               } else {
+                                       /* Use old size assignment to avoid
+                                        * deadlock bz14138 & bz14326 */
+                                       i_size_write(inode, body->size);
+                                       lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
+                               }
+                               ldlm_lock_decref(&lockh, mode);
+                       }
+               } else {
+                       /* Use old size assignment to avoid
+                        * deadlock bz14138 & bz14326 */
+                       i_size_write(inode, body->size);
+
+                       CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
+                              inode->i_ino, (unsigned long long)body->size);
+               }
+
+               if (body->valid & OBD_MD_FLBLOCKS)
+                       inode->i_blocks = body->blocks;
+       }
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               LASSERT(md->mds_capa);
+               ll_add_capa(inode, md->mds_capa);
+       }
+       if (body->valid & OBD_MD_FLOSSCAPA) {
+               LASSERT(md->oss_capa);
+               ll_add_capa(inode, md->oss_capa);
+       }
+}
+
+void ll_read_inode2(struct inode *inode, void *opaque)
+{
+       struct lustre_md *md = opaque;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
+              PFID(&lli->lli_fid), inode);
+
+       LASSERT(!lli->lli_has_smd);
+
+       /* Core attributes from the MDS first.  This is a new inode, and
+        * the VFS doesn't zero times in the core inode so we have to do
+        * it ourselves.  They will be overwritten by either MDS or OST
+        * attributes - we just need to make sure they aren't newer. */
+       LTIME_S(inode->i_mtime) = 0;
+       LTIME_S(inode->i_atime) = 0;
+       LTIME_S(inode->i_ctime) = 0;
+       inode->i_rdev = 0;
+       ll_update_inode(inode, md);
+
+       /* OIDEBUG(inode); */
+
+       /* initializing backing dev info. */
+       inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi;
+
+       if (S_ISREG(inode->i_mode)) {
+               struct ll_sb_info *sbi = ll_i2sbi(inode);
+               inode->i_op = &ll_file_inode_operations;
+               inode->i_fop = sbi->ll_fop;
+               inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops;
+               EXIT;
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &ll_dir_inode_operations;
+               inode->i_fop = &ll_dir_operations;
+               EXIT;
+       } else if (S_ISLNK(inode->i_mode)) {
+               inode->i_op = &ll_fast_symlink_inode_operations;
+               EXIT;
+       } else {
+               inode->i_op = &ll_special_inode_operations;
+
+               init_special_inode(inode, inode->i_mode,
+                                  inode->i_rdev);
+
+               EXIT;
+       }
+}
+
+void ll_delete_inode(struct inode *inode)
+{
+       struct cl_inode_info *lli = cl_i2info(inode);
+       ENTRY;
+
+       if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL)
+               /* discard all dirty pages before truncating them, required by
+                * osc_extent implementation at LU-1030. */
+               cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
+                                  CL_FSYNC_DISCARD, 1);
+
+       truncate_inode_pages(&inode->i_data, 0);
+
+       /* Workaround for LU-118 */
+       if (inode->i_data.nrpages) {
+               TREE_READ_LOCK_IRQ(&inode->i_data);
+               TREE_READ_UNLOCK_IRQ(&inode->i_data);
+               LASSERTF(inode->i_data.nrpages == 0,
+                        "inode=%lu/%u(%p) nrpages=%lu, see "
+                        "http://jira.whamcloud.com/browse/LU-118\n",
+                        inode->i_ino, inode->i_generation, inode,
+                        inode->i_data.nrpages);
+       }
+       /* Workaround end */
+
+       ll_clear_inode(inode);
+       clear_inode(inode);
+
+       EXIT;
+}
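+/* Handle inode-flag ioctls: FSFILT_IOC_GETFLAGS fetches the flags from the
+ * MDS, FSFILT_IOC_SETFLAGS pushes new flags to the MDS and the OSTs. */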
+
+int ll_iocontrol(struct inode *inode, struct file *file,
+                unsigned int cmd, unsigned long arg)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       int rc, flags = 0;
+       ENTRY;
+
+       switch (cmd) {
+       case FSFILT_IOC_GETFLAGS: {
+               struct mdt_body *body;
+               struct md_op_data *op_data;
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
+                                            0, 0, LUSTRE_OPC_ANY,
+                                            NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               op_data->op_valid = OBD_MD_FLFLAGS;
+               rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+               ll_finish_md_op_data(op_data);
+               if (rc) {
+                       CERROR("failure %d inode %lu\n", rc, inode->i_ino);
+                       RETURN(-abs(rc));
+               }
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+               flags = body->flags;
+
+               ptlrpc_req_finished(req);
+
+               RETURN(put_user(flags, (int *)arg));
+       }
+       case FSFILT_IOC_SETFLAGS: {
+               struct lov_stripe_md *lsm;
+               struct obd_info oinfo = { { { 0 } } };
+               struct md_op_data *op_data;
+
+               if (get_user(flags, (int *)arg))
+                       RETURN(-EFAULT);
+
+               op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                            LUSTRE_OPC_ANY, NULL);
+               if (IS_ERR(op_data))
+                       RETURN(PTR_ERR(op_data));
+
+               ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags;
+               op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+               rc = md_setattr(sbi->ll_md_exp, op_data,
+                               NULL, 0, NULL, 0, &req, NULL);
+               ll_finish_md_op_data(op_data);
+               ptlrpc_req_finished(req);
+               if (rc)
+                       RETURN(rc);
+
+               inode->i_flags = ll_ext_to_inode_flags(flags);
+
+               lsm = ccc_inode_lsm_get(inode);
+               if (lsm == NULL)
+                       RETURN(0);
+
+               OBDO_ALLOC(oinfo.oi_oa);
+               if (!oinfo.oi_oa) {
+                       ccc_inode_lsm_put(inode, lsm);
+                       RETURN(-ENOMEM);
+               }
+               oinfo.oi_md = lsm;
+               oinfo.oi_oa->o_oi = lsm->lsm_oi;
+               oinfo.oi_oa->o_flags = flags;
+               oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
+                                      OBD_MD_FLGROUP;
+               oinfo.oi_capa = ll_mdscapa_get(inode);
+               obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
+               rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
+               capa_put(oinfo.oi_capa);
+               OBDO_FREE(oinfo.oi_oa);
+               ccc_inode_lsm_put(inode, lsm);
+
+               if (rc && rc != -EPERM && rc != -EACCES)
+                       CERROR("osc_setattr_async fails: rc = %d\n", rc);
+
+               RETURN(rc);
+       }
+       default:
+               RETURN(-ENOSYS);
+       }
+
+       RETURN(0);
+}
+
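+/* Flush the security contexts for the current user on both the metadata
+ * (MDC) and data (OSC) connections. */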
+int ll_flush_ctx(struct inode *inode)
+{
+       struct ll_sb_info  *sbi = ll_i2sbi(inode);
+
+       CDEBUG(D_SEC, "flush context for user %d\n", current_uid());
+
+       obd_set_info_async(NULL, sbi->ll_md_exp,
+                          sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+                          0, NULL, NULL);
+       obd_set_info_async(NULL, sbi->ll_dt_exp,
+                          sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
+                          0, NULL, NULL);
+       return 0;
+}
+
+/* umount -f client means force down, don't save state */
+void ll_umount_begin(struct super_block *sb)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       struct obd_ioctl_data *ioc_data;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
+              sb->s_count, atomic_read(&sb->s_active));
+
+       obd = class_exp2obd(sbi->ll_md_exp);
+       if (obd == NULL) {
+               CERROR("Invalid MDC connection handle "LPX64"\n",
+                      sbi->ll_md_exp->exp_handle.h_cookie);
+               EXIT;
+               return;
+       }
+       obd->obd_force = 1;
+
+       obd = class_exp2obd(sbi->ll_dt_exp);
+       if (obd == NULL) {
+               CERROR("Invalid LOV connection handle "LPX64"\n",
+                      sbi->ll_dt_exp->exp_handle.h_cookie);
+               EXIT;
+               return;
+       }
+       obd->obd_force = 1;
+
+       OBD_ALLOC_PTR(ioc_data);
+       if (ioc_data) {
+               obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
+                             sizeof(*ioc_data), ioc_data, NULL);
+
+               obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
+                             sizeof(*ioc_data), ioc_data, NULL);
+
+               OBD_FREE_PTR(ioc_data);
+       }
+
+       /* Really, we'd like to wait until there are no requests outstanding,
+        * and then continue.  For now, we just invalidate the requests,
+        * schedule() and sleep one second if needed, and hope.
+        */
+       schedule();
+
+       EXIT;
+}
+
+int ll_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       char *profilenm = get_profile_name(sb);
+       int err;
+       __u32 read_only;
+
+       if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+               read_only = *flags & MS_RDONLY;
+               err = obd_set_info_async(NULL, sbi->ll_md_exp,
+                                        sizeof(KEY_READ_ONLY),
+                                        KEY_READ_ONLY, sizeof(read_only),
+                                        &read_only, NULL);
+               if (err) {
+                       LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
+                                     profilenm, read_only ?
+                                     "read-only" : "read-write", err);
+                       return err;
+               }
+
+               if (read_only)
+                       sb->s_flags |= MS_RDONLY;
+               else
+                       sb->s_flags &= ~MS_RDONLY;
+
+               if (sbi->ll_flags & LL_SBI_VERBOSE)
+                       LCONSOLE_WARN("Remounted %s %s\n", profilenm,
+                                     read_only ?  "read-only" : "read-write");
+       }
+       return 0;
+}
+
+int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
+                 struct super_block *sb, struct lookup_intent *it)
+{
+       struct ll_sb_info *sbi = NULL;
+       struct lustre_md md;
+       int rc;
+       ENTRY;
+
+       LASSERT(*inode || sb);
+       sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+       rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
+                             sbi->ll_md_exp, &md);
+       if (rc)
+               RETURN(rc);
+
+       if (*inode) {
+               ll_update_inode(*inode, &md);
+       } else {
+               LASSERT(sb != NULL);
+
+               /*
+                * At this point the server returns the same FID that the
+                * client generated for the create, so using ->fid1 is okay
+                * here.
+                */
+               LASSERT(fid_is_sane(&md.body->fid1));
+
+               *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
+                                            sbi->ll_flags & LL_SBI_32BIT_API),
+                                &md);
+               if (*inode == NULL || IS_ERR(*inode)) {
+#ifdef CONFIG_FS_POSIX_ACL
+                       if (md.posix_acl) {
+                               posix_acl_release(md.posix_acl);
+                               md.posix_acl = NULL;
+                       }
+#endif
+                       rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+                       *inode = NULL;
+                       CERROR("new_inode - fatal: rc %d\n", rc);
+                       GOTO(out, rc);
+               }
+       }
+
+       /* Handle a piggybacked layout lock.
+        * A layout lock can be piggybacked on getattr and open requests.
+        * The lsm may be applied to the inode only if it comes with a layout
+        * lock, otherwise the correct layout may be overwritten; for example:
+        * 1. proc1: the MDT returns an lsm but does not grant the layout lock
+        * 2. the layout is changed by another client
+        * 3. proc2: refreshes the layout and is granted the layout lock
+        * 4. proc1: would apply the now-stale layout */
+       if (it != NULL && it->d.lustre.it_lock_mode != 0) {
+               struct lustre_handle lockh;
+               struct ldlm_lock *lock;
+
+               lockh.cookie = it->d.lustre.it_lock_handle;
+               lock = ldlm_handle2lock(&lockh);
+               LASSERT(lock != NULL);
+               if (ldlm_has_layout(lock)) {
+                       struct cl_object_conf conf;
+
+                       memset(&conf, 0, sizeof(conf));
+                       conf.coc_opc = OBJECT_CONF_SET;
+                       conf.coc_inode = *inode;
+                       conf.coc_lock = lock;
+                       conf.u.coc_md = &md;
+                       (void)ll_layout_conf(*inode, &conf);
+               }
+               LDLM_LOCK_PUT(lock);
+       }
+
+out:
+       if (md.lsm != NULL)
+               obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
+       md_free_lustre_md(sbi->ll_md_exp, &md);
+       RETURN(rc);
+}
+
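+/* Handle the IOC_OBD_STATFS ioctl: validate the user-supplied buffer layout
+ * and forward the statfs request to the LMV or LOV export. */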
+int ll_obd_statfs(struct inode *inode, void *arg)
+{
+       struct ll_sb_info *sbi = NULL;
+       struct obd_export *exp;
+       char *buf = NULL;
+       struct obd_ioctl_data *data = NULL;
+       __u32 type;
+       __u32 flags;
+       int len = 0, rc;
+
+       if (!inode || !(sbi = ll_i2sbi(inode)))
+               GOTO(out_statfs, rc = -EINVAL);
+
+       rc = obd_ioctl_getdata(&buf, &len, arg);
+       if (rc)
+               GOTO(out_statfs, rc);
+
+       data = (void *)buf;
+       if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
+           !data->ioc_pbuf1 || !data->ioc_pbuf2)
+               GOTO(out_statfs, rc = -EINVAL);
+
+       if (data->ioc_inllen1 != sizeof(__u32) ||
+           data->ioc_inllen2 != sizeof(__u32) ||
+           data->ioc_plen1 != sizeof(struct obd_statfs) ||
+           data->ioc_plen2 != sizeof(struct obd_uuid))
+               GOTO(out_statfs, rc = -EINVAL);
+
+       memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
+       if (type & LL_STATFS_LMV)
+               exp = sbi->ll_md_exp;
+       else if (type & LL_STATFS_LOV)
+               exp = sbi->ll_dt_exp;
+       else
+               GOTO(out_statfs, rc = -ENODEV);
+
+       flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0;
+       rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags);
+       if (rc)
+               GOTO(out_statfs, rc);
+out_statfs:
+       if (buf)
+               obd_ioctl_freedata(buf, len);
+       return rc;
+}
+
+int ll_process_config(struct lustre_cfg *lcfg)
+{
+       char *ptr;
+       void *sb;
+       struct lprocfs_static_vars lvars;
+       unsigned long x;
+       int rc = 0;
+
+       lprocfs_llite_init_vars(&lvars);
+
+       /* The instance name contains the sb: lustre-client-aacfe000 */
+       ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
+       if (!ptr || !*(++ptr))
+               return -EINVAL;
+       if (sscanf(ptr, "%lx", &x) != 1)
+               return -EINVAL;
+       sb = (void *)x;
+       /* This better be a real Lustre superblock! */
+       LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
+
+       /* Note we have not called client_common_fill_super yet, so
+          proc fns must be able to handle that! */
+       rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
+                                     lcfg, sb);
+       if (rc > 0)
+               rc = 0;
+       return rc;
+}
+
+/* This function prepares the md_op_data hint for passing down to the MD
+ * stack. */
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+                                      struct inode *i1, struct inode *i2,
+                                      const char *name, int namelen,
+                                      int mode, __u32 opc, void *data)
+{
+       LASSERT(i1 != NULL);
+
+       if (namelen > ll_i2sbi(i1)->ll_namelen)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       if (op_data == NULL)
+               OBD_ALLOC_PTR(op_data);
+
+       if (op_data == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       ll_i2gids(op_data->op_suppgids, i1, i2);
+       op_data->op_fid1 = *ll_inode2fid(i1);
+       op_data->op_capa1 = ll_mdscapa_get(i1);
+
+       if (i2) {
+               op_data->op_fid2 = *ll_inode2fid(i2);
+               op_data->op_capa2 = ll_mdscapa_get(i2);
+       } else {
+               fid_zero(&op_data->op_fid2);
+               op_data->op_capa2 = NULL;
+       }
+
+       op_data->op_name = name;
+       op_data->op_namelen = namelen;
+       op_data->op_mode = mode;
+       op_data->op_mod_time = cfs_time_current_sec();
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       op_data->op_bias = 0;
+       op_data->op_cli_flags = 0;
+       if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) &&
+            filename_is_volatile(name, namelen, NULL))
+               op_data->op_bias |= MDS_CREATE_VOLATILE;
+       op_data->op_opc = opc;
+       op_data->op_mds = 0;
+       op_data->op_data = data;
+
+       /* If the file is being opened after mknod() (normally due to NFS)
+        * try to use the default stripe data from parent directory for
+        * allocating OST objects.  Try to pass the parent FID to MDS. */
+       if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
+           !ll_i2info(i2)->lli_has_smd) {
+               struct ll_inode_info *lli = ll_i2info(i2);
+
+               spin_lock(&lli->lli_lock);
+               if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
+                       op_data->op_fid1 = lli->lli_pfid;
+               spin_unlock(&lli->lli_lock);
+               /* We ignore the parent's capability temporarily. */
+       }
+
+       /* When called by ll_setattr_raw, file is i1. */
+       if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags)
+               op_data->op_bias |= MDS_DATA_MODIFIED;
+
+       return op_data;
+}
+
+void ll_finish_md_op_data(struct md_op_data *op_data)
+{
+       capa_put(op_data->op_capa1);
+       capa_put(op_data->op_capa2);
+       OBD_FREE_PTR(op_data);
+}
+
+int ll_show_options(struct seq_file *seq, struct dentry *dentry)
+{
+       struct ll_sb_info *sbi;
+
+       LASSERT((seq != NULL) && (dentry != NULL));
+       sbi = ll_s2sbi(dentry->d_sb);
+
+       if (sbi->ll_flags & LL_SBI_NOLCK)
+               seq_puts(seq, ",nolock");
+
+       if (sbi->ll_flags & LL_SBI_FLOCK)
+               seq_puts(seq, ",flock");
+
+       if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
+               seq_puts(seq, ",localflock");
+
+       if (sbi->ll_flags & LL_SBI_USER_XATTR)
+               seq_puts(seq, ",user_xattr");
+
+       if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+               seq_puts(seq, ",lazystatfs");
+
+       if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
+               seq_puts(seq, ",user_fid2path");
+
+       RETURN(0);
+}
+
+/**
+ * Get the obd device name selected by \a cmd and copy it out to user space.
+ */
+int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct obd_device *obd;
+       ENTRY;
+
+       if (cmd == OBD_IOC_GETDTNAME)
+               obd = class_exp2obd(sbi->ll_dt_exp);
+       else if (cmd == OBD_IOC_GETMDNAME)
+               obd = class_exp2obd(sbi->ll_md_exp);
+       else
+               RETURN(-EINVAL);
+
+       if (!obd)
+               RETURN(-ENOENT);
+
+       if (copy_to_user((void *)arg, obd->obd_name,
+                            strlen(obd->obd_name) + 1))
+               RETURN(-EFAULT);
+
+       RETURN(0);
+}
+
+/**
+ * Get the Lustre filesystem name from \a sb. If \a buf is provided
+ * (non-NULL), the fsname will be returned in this buffer; otherwise, a
+ * static buffer will be used to store the fsname and returned to the caller.
+ */
+char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
+{
+       static char fsname_static[MTI_NAME_MAXLEN];
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       char *ptr;
+       int len;
+
+       if (buf == NULL) {
+               /* The caller wants to use the static buffer and does not
+                * care about races; this is usually the error-reporting
+                * path. */
+               buf = fsname_static;
+               buflen = sizeof(fsname_static);
+       }
+
+       len = strlen(lsi->lsi_lmd->lmd_profile);
+       ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+       if (ptr && (strcmp(ptr, "-client") == 0))
+               len -= 7;
+
+       if (unlikely(len >= buflen))
+               len = buflen - 1;
+       strncpy(buf, lsi->lsi_lmd->lmd_profile, len);
+       buf[len] = '\0';
+
+       return buf;
+}
+
+static char *ll_d_path(struct dentry *dentry, char *buf, int bufsize)
+{
+       char *path = NULL;
+       struct path p;
+
+       p.dentry = dentry;
+       p.mnt = current->fs->root.mnt;
+       path_get(&p);
+       path = d_path(&p, buf, bufsize);
+       path_put(&p);
+
+       return path;
+}
+
+void ll_dirty_page_discard_warn(struct page *page, int ioret)
+{
+       char *buf, *path = NULL;
+       struct dentry *dentry = NULL;
+       struct ccc_object *obj = cl_inode2ccc(page->mapping->host);
+
+       /* This can be called inside a spin lock, so use GFP_ATOMIC. */
+       buf = (char *)__get_free_page(GFP_ATOMIC);
+       if (buf != NULL) {
+               dentry = d_find_alias(page->mapping->host);
+               if (dentry != NULL)
+                       path = ll_d_path(dentry, buf, PAGE_SIZE);
+       }
+
+       CWARN("%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted "
+             "(rc %d)\n", ll_get_fsname(page->mapping->host->i_sb, NULL, 0),
+             s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev,
+             PFID(&obj->cob_header.coh_lu.loh_fid),
+             (path && !IS_ERR(path)) ? path : "", ioret);
+
+       if (dentry != NULL)
+               dput(dentry);
+
+       if (buf != NULL)
+               free_page((unsigned long)buf);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
new file mode 100644 (file)
index 0000000..d9590d8
--- /dev/null
@@ -0,0 +1,507 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
+                      int *type);
+
+static struct vm_operations_struct ll_file_vm_ops;
+
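+/* Build the (page-aligned) ldlm extent covering the file range that backs
+ * [addr, addr + count) within \a vma. */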
+void policy_from_vma(ldlm_policy_data_t *policy,
+                           struct vm_area_struct *vma, unsigned long addr,
+                           size_t count)
+{
+       policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
+                                (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+       policy->l_extent.end = (policy->l_extent.start + count - 1) |
+                              ~CFS_PAGE_MASK;
+}
+
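+/* Find a shared vma backed by a Lustre file that overlaps
+ * [addr, addr + count), or NULL if there is none. */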
+struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
+                              size_t count)
+{
+       struct vm_area_struct *vma, *ret = NULL;
+       ENTRY;
+
+       /* mmap_sem must have been held by caller. */
+       LASSERT(!down_write_trylock(&mm->mmap_sem));
+
+       for (vma = find_vma(mm, addr);
+           vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) {
+               if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
+                   vma->vm_flags & VM_SHARED) {
+                       ret = vma;
+                       break;
+               }
+       }
+       RETURN(ret);
+}
+
+/**
+ * API independent part for page fault initialization.
+ * \param vma - virtual memory area addressed by the page fault
+ * \param env - corresponding lu_env for processing
+ * \param nest - nesting level
+ * \param index - page index corresponding to the fault.
+ * \param ra_flags - vma readahead flags.
+ *
+ * \return allocated and initialized env for the fault operation.
+ * \retval EINVAL if the env cannot be allocated
+ * \return other error codes from cl_io_init.
+ */
+struct cl_io *ll_fault_io_init(struct vm_area_struct *vma,
+                              struct lu_env **env_ret,
+                              struct cl_env_nest *nest,
+                              pgoff_t index, unsigned long *ra_flags)
+{
+       struct file       *file  = vma->vm_file;
+       struct inode      *inode = file->f_dentry->d_inode;
+       struct cl_io      *io;
+       struct cl_fault_io *fio;
+       struct lu_env     *env;
+       ENTRY;
+
+       *env_ret = NULL;
+       if (ll_file_nolock(file))
+               RETURN(ERR_PTR(-EOPNOTSUPP));
+
+       /*
+        * A page fault can be called when Lustre IO is already active for
+        * the current thread, e.g., when doing read/write against a user
+        * level buffer mapped from a Lustre file. To avoid stomping on the
+        * existing context, optionally force an allocation of a new one.
+        */
+       env = cl_env_nested_get(nest);
+       if (IS_ERR(env))
+               RETURN(ERR_PTR(-EINVAL));
+
+       *env_ret = env;
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = ll_i2info(inode)->lli_clob;
+       LASSERT(io->ci_obj != NULL);
+
+       fio = &io->u.ci_fault;
+       fio->ft_index      = index;
+       fio->ft_executable = vma->vm_flags & VM_EXEC;
+
+       /*
+        * Disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+        * the kernel will not read pages not covered by ldlm in
+        * filemap_nopage. We do our own readahead in ll_readpage.
+        */
+       if (ra_flags != NULL)
+               *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+       vma->vm_flags &= ~VM_SEQ_READ;
+       vma->vm_flags |= VM_RAND_READ;
+
+       CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
+              fio->ft_index, fio->ft_executable);
+
+       if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+               struct ccc_io *cio = ccc_env_io(env);
+               struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+               LASSERT(cio->cui_cl.cis_io == io);
+
+               /* The mmap lock must be MANDATORY because
+                * it has to cache pages. */
+               io->ci_lockreq = CILR_MANDATORY;
+
+               cio->cui_fd  = fd;
+       }
+
+       return io;
+}
+
+/* Shared code for the page_mkwrite method on RHEL5 and RHEL6 */
+static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage,
+                           bool *retry)
+{
+       struct lu_env      *env;
+       struct cl_io        *io;
+       struct vvp_io      *vio;
+       struct cl_env_nest       nest;
+       int                   result;
+       sigset_t             set;
+       struct inode         *inode;
+       struct ll_inode_info     *lli;
+       ENTRY;
+
+       LASSERT(vmpage != NULL);
+
+       io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL);
+       if (IS_ERR(io))
+               GOTO(out, result = PTR_ERR(io));
+
+       result = io->ci_result;
+       if (result < 0)
+               GOTO(out, result);
+
+       io->u.ci_fault.ft_mkwrite = 1;
+       io->u.ci_fault.ft_writable = 1;
+
+       vio = vvp_env_io(env);
+       vio->u.fault.ft_vma    = vma;
+       vio->u.fault.ft_vmpage = vmpage;
+
+       set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+       /* We grab lli_trunc_sem to exclude the truncate case.
+        * Otherwise, we could add dirty pages into the osc cache
+        * while a truncate is ongoing. */
+       inode = ccc_object_inode(io->ci_obj);
+       lli = ll_i2info(inode);
+       down_read(&lli->lli_trunc_sem);
+
+       result = cl_io_loop(env, io);
+
+       up_read(&lli->lli_trunc_sem);
+
+       cfs_restore_sigs(set);
+
+       if (result == 0) {
+               struct inode *inode = vma->vm_file->f_dentry->d_inode;
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               lock_page(vmpage);
+               if (vmpage->mapping == NULL) {
+                       unlock_page(vmpage);
+
+                       /* page was truncated and lock was cancelled, return
+                        * ENODATA so that VM_FAULT_NOPAGE will be returned
+                        * to handle_mm_fault(). */
+                       if (result == 0)
+                               result = -ENODATA;
+               } else if (!PageDirty(vmpage)) {
+                       /* Race: the page has been cleaned by ptlrpcd after
+                        * it was unlocked, so it has to be added to the
+                        * dirty cache again; otherwise this soon-to-be-dirty
+                        * page won't consume any grants. It is even worse if
+                        * the page is being transferred, because that would
+                        * break the RPC checksum.
+                        */
+                       unlock_page(vmpage);
+
+                       CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has "
+                              "been written out, retry.\n",
+                              vmpage, vmpage->index);
+
+                       *retry = true;
+                       result = -EAGAIN;
+               }
+
+               if (result == 0) {
+                       spin_lock(&lli->lli_lock);
+                       lli->lli_flags |= LLIF_DATA_MODIFIED;
+                       spin_unlock(&lli->lli_lock);
+               }
+       }
+       EXIT;
+
+out:
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result);
+
+       LASSERT(ergo(result == 0, PageLocked(vmpage)));
+       return(result);
+}
+
+
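+/* Translate a cl_io result into the VM_FAULT_* code expected by the VM. */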
+static inline int to_fault_error(int result)
+{
+       switch (result) {
+       case 0:
+               result = VM_FAULT_LOCKED;
+               break;
+       case -EFAULT:
+               result = VM_FAULT_NOPAGE;
+               break;
+       case -ENOMEM:
+               result = VM_FAULT_OOM;
+               break;
+       default:
+               result = VM_FAULT_SIGBUS;
+               break;
+       }
+       return result;
+}
+
+/**
+ * Lustre implementation of the vm_operations_struct::fault() method, called
+ * by the VM to service a page fault (both in kernel and user space).
+ *
+ * \param vma - virtual memory area struct related to the page fault
+ * \param vmf - structure describing the type and address of the fault
+ *
+ * \return allocated and filled _locked_ page for the address
+ * \retval VM_FAULT_ERROR on general error
+ * \retval NOPAGE_OOM if there is no memory to allocate a new page
+ */
+static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct lu_env      *env;
+       struct cl_io        *io;
+       struct vvp_io      *vio = NULL;
+       struct page          *vmpage;
+       unsigned long       ra_flags;
+       struct cl_env_nest       nest;
+       int                   result;
+       int                   fault_ret = 0;
+       ENTRY;
+
+       io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags);
+       if (IS_ERR(io))
+               RETURN(to_fault_error(PTR_ERR(io)));
+
+       result = io->ci_result;
+       if (result == 0) {
+               vio = vvp_env_io(env);
+               vio->u.fault.ft_vma       = vma;
+               vio->u.fault.ft_vmpage    = NULL;
+               vio->u.fault.fault.ft_vmf = vmf;
+
+               result = cl_io_loop(env, io);
+
+               fault_ret = vio->u.fault.fault.ft_flags;
+               vmpage = vio->u.fault.ft_vmpage;
+               if (result != 0 && vmpage != NULL) {
+                       page_cache_release(vmpage);
+                       vmf->page = NULL;
+               }
+       }
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+
+       vma->vm_flags |= ra_flags;
+       if (result != 0 && !(fault_ret & VM_FAULT_RETRY))
+               fault_ret |= to_fault_error(result);
+
+       CDEBUG(D_MMAP, "%s fault %d/%d\n",
+              current->comm, fault_ret, result);
+       RETURN(fault_ret);
+}
+
+static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       int count = 0;
+       bool printed = false;
+       int result;
+       sigset_t set;
+
+       /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite
+        * so that the process can be killed by an admin but other signals
+        * do not cause a segfault. */
+       set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM));
+
+restart:
+       result = ll_fault0(vma, vmf);
+       LASSERT(!(result & VM_FAULT_LOCKED));
+       if (result == 0) {
+               struct page *vmpage = vmf->page;
+
+               /* check if this page has been truncated */
+               lock_page(vmpage);
+               if (unlikely(vmpage->mapping == NULL)) { /* unlucky */
+                       unlock_page(vmpage);
+                       page_cache_release(vmpage);
+                       vmf->page = NULL;
+
+                       if (!printed && ++count > 16) {
+                               CWARN("the page is under heavy contention, "
+                                     "maybe your app (%s) needs revising :-)\n",
+                                     current->comm);
+                               printed = true;
+                       }
+
+                       goto restart;
+               }
+
+               result |= VM_FAULT_LOCKED;
+       }
+       cfs_restore_sigs(set);
+       return result;
+}
+
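+/* Retry ll_page_mkwrite0() while it races with writeback, then translate
+ * the result into a VM_FAULT_* code. */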
+static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       int count = 0;
+       bool printed = false;
+       bool retry;
+       int result;
+
+       do {
+               retry = false;
+               result = ll_page_mkwrite0(vma, vmf->page, &retry);
+
+               if (!printed && ++count > 16) {
+                       CWARN("app(%s): the page %lu of file %lu is under heavy"
+                             " contention.\n",
+                             current->comm, vmf->pgoff,
+                             vma->vm_file->f_dentry->d_inode->i_ino);
+                       printed = true;
+               }
+       } while (retry);
+
+       switch (result) {
+       case 0:
+               LASSERT(PageLocked(vmf->page));
+               result = VM_FAULT_LOCKED;
+               break;
+       case -ENODATA:
+       case -EFAULT:
+               result = VM_FAULT_NOPAGE;
+               break;
+       case -ENOMEM:
+               result = VM_FAULT_OOM;
+               break;
+       case -EAGAIN:
+               result = VM_FAULT_RETRY;
+               break;
+       default:
+               result = VM_FAULT_SIGBUS;
+               break;
+       }
+
+       return result;
+}
+
+/**
+ * To avoid cancelling the locks covering an mmapped region under lock cache
+ * pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
+ */
+static void ll_vm_open(struct vm_area_struct *vma)
+{
+       struct inode *inode    = vma->vm_file->f_dentry->d_inode;
+       struct ccc_object *vob = cl_inode2ccc(inode);
+
+       ENTRY;
+       LASSERT(vma->vm_file);
+       LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+       atomic_inc(&vob->cob_mmap_cnt);
+       EXIT;
+}
+
+/**
+ * Dual to ll_vm_open().
+ */
+static void ll_vm_close(struct vm_area_struct *vma)
+{
+       struct inode      *inode = vma->vm_file->f_dentry->d_inode;
+       struct ccc_object *vob   = cl_inode2ccc(inode);
+
+       ENTRY;
+       LASSERT(vma->vm_file);
+       atomic_dec(&vob->cob_mmap_cnt);
+       LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+       EXIT;
+}
+
+/* return the user space pointer that maps to a file offset via a vma */
+static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte)
+{
+       return vma->vm_start + (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT));
+}
+
+/* XXX put nice comment here.  talk about __free_pte -> dirty pages and
+ * nopage's reference passing to the pte */
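+/* Unmap the byte range [first, last] from every user mapping of this file,
+ * dropping the page table references to those pages. */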
+int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
+{
+       int rc = -ENOENT;
+       ENTRY;
+
+       LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first);
+       if (mapping_mapped(mapping)) {
+               rc = 0;
+               unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+                                   last - first + 1, 0);
+       }
+
+       RETURN(rc);
+}
+
+static struct vm_operations_struct ll_file_vm_ops = {
+       .fault                  = ll_fault,
+       .page_mkwrite           = ll_page_mkwrite,
+       .open                   = ll_vm_open,
+       .close                  = ll_vm_close,
+};
+
+int ll_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       int rc;
+       ENTRY;
+
+       if (ll_file_nolock(file))
+               RETURN(-EOPNOTSUPP);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1);
+       rc = generic_file_mmap(file, vma);
+       if (rc == 0) {
+               vma->vm_ops = &ll_file_vm_ops;
+               vma->vm_ops->open(vma);
+               /* update the inode's size and mtime */
+               rc = ll_glimpse_size(inode);
+       }
+
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c
new file mode 100644 (file)
index 0000000..28cc41e
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/llite_nfs.c
+ *
+ * NFS export of Lustre Light File System
+ *
+ * Author: Yury Umanets <umka@clusterfs.com>
+ * Author: Huang Hua <huanghua@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/exportfs.h>
+
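+/* Hash a UUID string into a 32-bit key using a simple rolling hash. */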
+__u32 get_uuid2int(const char *name, int len)
+{
+       __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
+
+       while (len--) {
+               __u32 key = key1 + (key0 ^ (*name++ * 7152373));
+
+               if (key & 0x80000000)
+                       key -= 0x7fffffff;
+               key1 = key0;
+               key0 = key;
+       }
+       return (key0 << 1);
+}
+
+static int ll_nfs_test_inode(struct inode *inode, void *opaque)
+{
+       return lu_fid_eq(&ll_i2info(inode)->lli_fid,
+                        (struct lu_fid *)opaque);
+}
+
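+/* Look up the inode for \a fid in the inode cache; if it is not there,
+ * fetch its attributes from the MDS and instantiate a new inode. */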
+struct inode *search_inode_for_lustre(struct super_block *sb,
+                                     const struct lu_fid *fid)
+{
+       struct ll_sb_info     *sbi = ll_s2sbi(sb);
+       struct ptlrpc_request *req = NULL;
+       struct inode      *inode = NULL;
+       int                eadatalen = 0;
+       unsigned long         hash = cl_fid_build_ino(fid,
+                                                     ll_need_32bit_api(sbi));
+       struct  md_op_data    *op_data;
+       int                rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid));
+
+       inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid);
+       if (inode)
+               RETURN(inode);
+
+       rc = ll_get_max_mdsize(sbi, &eadatalen);
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       /* Because the inode is NULL, ll_prep_md_op_data() cannot
+        * be used here, so we allocate op_data ourselves. */
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       op_data->op_fid1 = *fid;
+       op_data->op_mode = eadatalen;
+       op_data->op_valid = OBD_MD_FLEASIZE;
+
+       /* mds_fid2dentry ignores f_type */
+       rc = md_getattr(sbi->ll_md_exp, op_data, &req);
+       OBD_FREE_PTR(op_data);
+       if (rc) {
+               CERROR("can't get object attrs, fid "DFID", rc %d\n",
+                      PFID(fid), rc);
+               RETURN(ERR_PTR(rc));
+       }
+       rc = ll_prep_inode(&inode, req, sb, NULL);
+       ptlrpc_req_finished(req);
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       RETURN(inode);
+}
+
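+/* A Lustre NFS file handle: the FID of the file plus the FID of its
+ * parent directory. */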
+struct lustre_nfs_fid {
+       struct lu_fid   lnf_child;
+       struct lu_fid   lnf_parent;
+};
+
+static struct dentry *
+ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent)
+{
+       struct inode  *inode;
+       struct dentry *result;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid));
+       if (!fid_is_sane(fid))
+               RETURN(ERR_PTR(-ESTALE));
+
+       inode = search_inode_for_lustre(sb, fid);
+       if (IS_ERR(inode))
+               RETURN(ERR_CAST(inode));
+
+       if (is_bad_inode(inode)) {
+               /* We didn't find the right inode. */
+               iput(inode);
+               RETURN(ERR_PTR(-ESTALE));
+       }
+
+       /*
+        * It is an anonymous dentry without OST objects created yet.
+        * We have to find the parent to tell the MDS how to initialize
+        * the lov objects.
+        */
+       if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd &&
+           parent != NULL) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               spin_lock(&lli->lli_lock);
+               lli->lli_pfid = *parent;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       result = d_obtain_alias(inode);
+       if (IS_ERR(result))
+               RETURN(result);
+
+       ll_dops_init(result, 1, 0);
+
+       RETURN(result);
+}
+
+#define LUSTRE_NFS_FID   0x97
+
+/**
+ * \a connectable - whether nfsd will connect itself or this should be done
+ *               by Lustre
+ *
+ * The return value is file handle type:
+ * 1 -- contains child file handle;
+ * 2 -- contains child file handle and parent file handle;
+ * 255 -- error.
+ */
+static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen,
+                       struct inode *parent)
+{
+       struct lustre_nfs_fid *nfs_fid = (void *)fh;
+       ENTRY;
+
+       CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n",
+             inode->i_ino, PFID(ll_inode2fid(inode)), *plen,
+             (int)sizeof(struct lustre_nfs_fid));
+
+       if (*plen < sizeof(struct lustre_nfs_fid) / 4)
+               RETURN(255);
+
+       nfs_fid->lnf_child = *ll_inode2fid(inode);
+       nfs_fid->lnf_parent = *ll_inode2fid(parent);
+       *plen = sizeof(struct lustre_nfs_fid) / 4;
+
+       RETURN(LUSTRE_NFS_FID);
+}
+
+static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen,
+                                  loff_t hash, u64 ino, unsigned type)
+{
+       /* It is a hack to access lde_fid for comparison with lgd_fid,
+        * so the input 'name' must be part of the 'lu_dirent'. */
+       struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name);
+       struct ll_getname_data *lgd = cookie;
+       struct lu_fid fid;
+
+       fid_le_to_cpu(&fid, &lde->lde_fid);
+       if (lu_fid_eq(&fid, &lgd->lgd_fid)) {
+               memcpy(lgd->lgd_name, name, namelen);
+               lgd->lgd_name[namelen] = 0;
+               lgd->lgd_found = 1;
+       }
+       return lgd->lgd_found;
+}
+
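+/* Read the directory \a dentry and find the entry whose FID matches that of
+ * \a child, copying its name into \a name. */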
+static int ll_get_name(struct dentry *dentry, char *name,
+                      struct dentry *child)
+{
+       struct inode *dir = dentry->d_inode;
+       struct ll_getname_data lgd;
+       __u64 offset = 0;
+       int rc;
+       ENTRY;
+
+       if (!dir || !S_ISDIR(dir->i_mode))
+               GOTO(out, rc = -ENOTDIR);
+
+       if (!dir->i_fop)
+               GOTO(out, rc = -EINVAL);
+
+       lgd.lgd_name = name;
+       lgd.lgd_fid = ll_i2info(child->d_inode)->lli_fid;
+       lgd.lgd_found = 0;
+
+       mutex_lock(&dir->i_mutex);
+       rc = ll_dir_read(dir, &offset, &lgd, ll_nfs_get_name_filldir);
+       mutex_unlock(&dir->i_mutex);
+       if (!rc && !lgd.lgd_found)
+               rc = -ENOENT;
+       EXIT;
+
+out:
+       return rc;
+}
+
+static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                                     int fh_len, int fh_type)
+{
+       struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+       if (fh_type != LUSTRE_NFS_FID)
+               RETURN(ERR_PTR(-EPROTO));
+
+       RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent));
+}
+
+static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid,
+                                     int fh_len, int fh_type)
+{
+       struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid;
+
+       if (fh_type != LUSTRE_NFS_FID)
+               RETURN(ERR_PTR(-EPROTO));
+
+       RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL));
+}
+
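+/* Look up ".." on the MDS to obtain the parent's FID, then build an NFS
+ * dentry for it. */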
+static struct dentry *ll_get_parent(struct dentry *dchild)
+{
+       struct ptlrpc_request *req = NULL;
+       struct inode      *dir = dchild->d_inode;
+       struct ll_sb_info     *sbi;
+       struct dentry    *result = NULL;
+       struct mdt_body       *body;
+       static char        dotdot[] = "..";
+       struct md_op_data     *op_data;
+       int                rc;
+       int                   lmmsize;
+       ENTRY;
+
+       LASSERT(dir && S_ISDIR(dir->i_mode));
+
+       sbi = ll_s2sbi(dir->i_sb);
+
+       CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n",
+                       dir->i_ino, PFID(ll_inode2fid(dir)));
+
+       rc = ll_get_max_mdsize(sbi, &lmmsize);
+       if (rc != 0)
+               RETURN(ERR_PTR(rc));
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot,
+                                    strlen(dotdot), lmmsize,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN((void *)op_data);
+
+       rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc) {
+               CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+               RETURN(ERR_PTR(rc));
+       }
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body->valid & OBD_MD_FLID);
+
+       CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n",
+               PFID(ll_inode2fid(dir)), PFID(&body->fid1));
+
+       result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL);
+
+       ptlrpc_req_finished(req);
+       RETURN(result);
+}
+
+struct export_operations lustre_export_operations = {
+       .get_parent = ll_get_parent,
+       .encode_fh  = ll_encode_fh,
+       .get_name   = ll_get_name,
+       .fh_to_dentry = ll_fh_to_dentry,
+       .fh_to_parent = ll_fh_to_parent,
+};
diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c
new file mode 100644 (file)
index 0000000..4c61036
--- /dev/null
@@ -0,0 +1,301 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/llite_rmtacl.c
+ *
+ * Lustre Remote User Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#include <lustre_lite.h>
+#include <lustre_eacl.h>
+#include "llite_internal.h"
+
+static inline __u32 rce_hashfunc(uid_t id)
+{
+       return id & (RCE_HASHES - 1);
+}
+
+static inline __u32 ee_hashfunc(uid_t id)
+{
+       return id & (EE_HASHES - 1);
+}
+
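+/* Map a remote ACL operation code to the corresponding OBD_MD_FLRMT* valid
+ * flag. */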
+obd_valid rce_ops2valid(int ops)
+{
+       switch (ops) {
+       case RMT_LSETFACL:
+               return OBD_MD_FLRMTLSETFACL;
+       case RMT_LGETFACL:
+               return OBD_MD_FLRMTLGETFACL;
+       case RMT_RSETFACL:
+               return OBD_MD_FLRMTRSETFACL;
+       case RMT_RGETFACL:
+               return OBD_MD_FLRMTRGETFACL;
+       default:
+               return 0;
+       }
+}
+
+static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       OBD_ALLOC_PTR(rce);
+       if (!rce)
+               return NULL;
+
+       INIT_LIST_HEAD(&rce->rce_list);
+       rce->rce_key = key;
+       rce->rce_ops = ops;
+
+       return rce;
+}
+
+static void rce_free(struct rmtacl_ctl_entry *rce)
+{
+       if (!list_empty(&rce->rce_list))
+               list_del(&rce->rce_list);
+
+       OBD_FREE_PTR(rce);
+}
+
+static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct,
+                                          pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+       struct list_head *head = &rct->rct_entries[rce_hashfunc(key)];
+
+       list_for_each_entry(rce, head, rce_list)
+               if (rce->rce_key == key)
+                       return rce;
+
+       return NULL;
+}
+
+struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       spin_lock(&rct->rct_lock);
+       rce = __rct_search(rct, key);
+       spin_unlock(&rct->rct_lock);
+       return rce;
+}
+
+int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops)
+{
+       struct rmtacl_ctl_entry *rce, *e;
+
+       rce = rce_alloc(key, ops);
+       if (rce == NULL)
+               return -ENOMEM;
+
+       spin_lock(&rct->rct_lock);
+       e = __rct_search(rct, key);
+       if (unlikely(e != NULL)) {
+               CWARN("Unexpected stale rmtacl_entry found: "
+                     "[key: %d] [ops: %d]\n", (int)key, ops);
+               rce_free(e);
+       }
+       list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]);
+       spin_unlock(&rct->rct_lock);
+
+       return 0;
+}
+
+int rct_del(struct rmtacl_ctl_table *rct, pid_t key)
+{
+       struct rmtacl_ctl_entry *rce;
+
+       spin_lock(&rct->rct_lock);
+       rce = __rct_search(rct, key);
+       if (rce)
+               rce_free(rce);
+       spin_unlock(&rct->rct_lock);
+
+       return rce ? 0 : -ENOENT;
+}
+
+void rct_init(struct rmtacl_ctl_table *rct)
+{
+       int i;
+
+       spin_lock_init(&rct->rct_lock);
+       for (i = 0; i < RCE_HASHES; i++)
+               INIT_LIST_HEAD(&rct->rct_entries[i]);
+}
+
+void rct_fini(struct rmtacl_ctl_table *rct)
+{
+       struct rmtacl_ctl_entry *rce;
+       int i;
+
+       spin_lock(&rct->rct_lock);
+       for (i = 0; i < RCE_HASHES; i++)
+               while (!list_empty(&rct->rct_entries[i])) {
+                       rce = list_entry(rct->rct_entries[i].next,
+                                            struct rmtacl_ctl_entry, rce_list);
+                       rce_free(rce);
+               }
+       spin_unlock(&rct->rct_lock);
+}
+
+
+static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type,
+                                  ext_acl_xattr_header *header)
+{
+       struct eacl_entry *ee;
+
+       OBD_ALLOC_PTR(ee);
+       if (!ee)
+               return NULL;
+
+       INIT_LIST_HEAD(&ee->ee_list);
+       ee->ee_key = key;
+       ee->ee_fid = *fid;
+       ee->ee_type = type;
+       ee->ee_acl = header;
+
+       return ee;
+}
+
+void ee_free(struct eacl_entry *ee)
+{
+       if (!list_empty(&ee->ee_list))
+               list_del(&ee->ee_list);
+
+       if (ee->ee_acl)
+               lustre_ext_acl_xattr_free(ee->ee_acl);
+
+       OBD_FREE_PTR(ee);
+}
+
+static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key,
+                                       struct lu_fid *fid, int type)
+{
+       struct eacl_entry *ee;
+       struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+       LASSERT(fid != NULL);
+       list_for_each_entry(ee, head, ee_list)
+               if (ee->ee_key == key) {
+                       if (lu_fid_eq(&ee->ee_fid, fid) &&
+                           ee->ee_type == type) {
+                               list_del_init(&ee->ee_list);
+                               return ee;
+                       }
+               }
+
+       return NULL;
+}
+
+struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key,
+                                struct lu_fid *fid, int type)
+{
+       struct eacl_entry *ee;
+
+       spin_lock(&et->et_lock);
+       ee = __et_search_del(et, key, fid, type);
+       spin_unlock(&et->et_lock);
+       return ee;
+}
+
+void et_search_free(struct eacl_table *et, pid_t key)
+{
+       struct eacl_entry *ee, *next;
+       struct list_head *head = &et->et_entries[ee_hashfunc(key)];
+
+       spin_lock(&et->et_lock);
+       list_for_each_entry_safe(ee, next, head, ee_list)
+               if (ee->ee_key == key)
+                       ee_free(ee);
+
+       spin_unlock(&et->et_lock);
+}
+
+int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type,
+          ext_acl_xattr_header *header)
+{
+       struct eacl_entry *ee, *e;
+
+       ee = ee_alloc(key, fid, type, header);
+       if (ee == NULL)
+               return -ENOMEM;
+
+       spin_lock(&et->et_lock);
+       e = __et_search_del(et, key, fid, type);
+       if (unlikely(e != NULL)) {
+               CWARN("Unexpected stale eacl_entry found: "
+                     "[key: %d] [fid: "DFID"] [type: %d]\n",
+                     (int)key, PFID(fid), type);
+               ee_free(e);
+       }
+       list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]);
+       spin_unlock(&et->et_lock);
+
+       return 0;
+}
+
+void et_init(struct eacl_table *et)
+{
+       int i;
+
+       spin_lock_init(&et->et_lock);
+       for (i = 0; i < EE_HASHES; i++)
+               INIT_LIST_HEAD(&et->et_entries[i]);
+}
+
+void et_fini(struct eacl_table *et)
+{
+       struct eacl_entry *ee;
+       int i;
+
+       spin_lock(&et->et_lock);
+       for (i = 0; i < EE_HASHES; i++)
+               while (!list_empty(&et->et_entries[i])) {
+                       ee = list_entry(et->et_entries[i].next,
+                                           struct eacl_entry, ee_list);
+                       ee_free(ee);
+               }
+       spin_unlock(&et->et_lock);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
new file mode 100644 (file)
index 0000000..9d4c17e
--- /dev/null
@@ -0,0 +1,867 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/*
+ *  linux/drivers/block/loop.c
+ *
+ *  Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations prepare_write and/or commit_write are not available on the
+ * backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>         /* for invalidate_bdev() */
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/pagevec.h>
+
+#include <asm/uaccess.h>
+
+#include <lustre_lib.h>
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+#define LLOOP_MAX_SEGMENTS     LNET_MAX_IOV
+
+/* Possible states of device */
+enum {
+       LLOOP_UNBOUND,
+       LLOOP_BOUND,
+       LLOOP_RUNDOWN,
+};
+
+struct lloop_device {
+       int               lo_number;
+       int               lo_refcnt;
+       loff_t         lo_offset;
+       loff_t         lo_sizelimit;
+       int               lo_flags;
+       int             (*ioctl)(struct lloop_device *, int cmd,
+                                   unsigned long arg);
+
+       struct file      *lo_backing_file;
+       struct block_device *lo_device;
+       unsigned             lo_blocksize;
+
+       int               old_gfp_mask;
+
+       spinlock_t              lo_lock;
+       struct bio              *lo_bio;
+       struct bio              *lo_biotail;
+       int                     lo_state;
+       struct semaphore        lo_sem;
+       struct mutex            lo_ctl_mutex;
+       atomic_t         lo_pending;
+       wait_queue_head_t         lo_bh_wait;
+
+       struct request_queue *lo_queue;
+
+       const struct lu_env *lo_env;
+       struct cl_io     lo_io;
+       struct ll_dio_pages  lo_pvec;
+
+       /* data to handle bio for lustre. */
+       struct lo_request_data {
+               struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
+               loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
+       } lo_requests[1];
+};
+
+/*
+ * Loop flags
+ */
+enum {
+       LO_FLAGS_READ_ONLY       = 1,
+};
+
+static int lloop_major;
+#define MAX_LOOP_DEFAULT  16
+static int max_loop = MAX_LOOP_DEFAULT;
+static struct lloop_device *loop_dev;
+static struct gendisk **disks;
+static struct mutex lloop_mutex;
+static void *ll_iocontrol_magic = NULL;
+
+static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
+{
+       loff_t size, offset, loopsize;
+
+       /* Compute loopsize in bytes */
+       size = i_size_read(file->f_mapping->host);
+       offset = lo->lo_offset;
+       loopsize = size - offset;
+       if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
+               loopsize = lo->lo_sizelimit;
+
+       /*
+        * Unfortunately, if we want to do I/O on the device,
+        * the number of 512-byte sectors has to fit into a sector_t.
+        */
+       return loopsize >> 9;
+}
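
To make the conversion concrete: for a hypothetical 1 GiB backing file with
lo_offset = 0 and no lo_sizelimit, loopsize is 2^30 bytes and the function
returns 2^30 >> 9 = 2097152 sectors of 512 bytes. Setting lo_sizelimit to
4 MiB would clamp that to 8192 sectors.
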
+
+static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
+{
+       const struct lu_env  *env   = lo->lo_env;
+       struct cl_io     *io    = &lo->lo_io;
+       struct inode     *inode = lo->lo_backing_file->f_dentry->d_inode;
+       struct cl_object     *obj = ll_i2info(inode)->lli_clob;
+       pgoff_t        offset;
+       int                ret;
+       int                i;
+       int                rw;
+       obd_count            page_count = 0;
+       struct bio_vec       *bvec;
+       struct bio         *bio;
+       ssize_t        bytes;
+
+       struct ll_dio_pages  *pvec = &lo->lo_pvec;
+       struct page      **pages = pvec->ldp_pages;
+       loff_t         *offsets = pvec->ldp_offsets;
+
+       truncate_inode_pages(inode->i_mapping, 0);
+
+       /* initialize the IO */
+       memset(io, 0, sizeof(*io));
+       io->ci_obj = obj;
+       ret = cl_io_init(env, io, CIT_MISC, obj);
+       if (ret)
+               return io->ci_result;
+       io->ci_lockreq = CILR_NEVER;
+
+       LASSERT(head != NULL);
+       rw = head->bi_rw;
+       for (bio = head; bio != NULL; bio = bio->bi_next) {
+               LASSERT(rw == bio->bi_rw);
+
+               offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
+               bio_for_each_segment(bvec, bio, i) {
+                       BUG_ON(bvec->bv_offset != 0);
+                       BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);
+
+                       pages[page_count] = bvec->bv_page;
+                       offsets[page_count] = offset;
+                       page_count++;
+                       offset += bvec->bv_len;
+               }
+               LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
+       }
+
+       ll_stats_ops_tally(ll_i2sbi(inode),
+                       (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
+                       page_count);
+
+       pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+       pvec->ldp_nr = page_count;
+
+       /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
+        * write those pages into OST. An even worse case is when more pages
+        * are asked to be written out to swap space, and then finally get
+        * here again.
+        * Unfortunately this is NOT easy to fix.
+        * Thoughts on solution:
+        * 0. Define a reserved pool for cl_pages, which could be a list of
+        *    pre-allocated cl_pages;
+        * 1. Define a new operation in cl_object_operations{}, say clo_depth,
+        *    which measures how many layers this lustre object has. Generally
+        *    speaking, the depth would be 2: one for llite and one for lovsub.
+        *    However, for SNS there will be more, since we need an additional
+        *    page to store parity;
+        * 2. Reserve (page_count * depth) cl_pages from the reserved pool.
+        *    Afterwards, the clio would allocate the pages from the reserved
+        *    pool; this guarantees we needn't allocate the cl_pages from the
+        *    generic cl_page slab cache.
+        *    Of course, if there are NOT enough pages in the pool, we might
+        *    be asked to write fewer pages at a time; this purely depends on
+        *    the implementation. Anyway, we should be careful to avoid deadlock.
+        */
+       mutex_lock(&inode->i_mutex);
+       bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
+       mutex_unlock(&inode->i_mutex);
+       cl_io_fini(env, io);
+       return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
+}
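
As a rough illustration of item 0 in the FIXME above, a reserve of
pre-allocated entries could look like the sketch below. The cl_page_reserve
type and the cpr_get() helper are hypothetical names invented here to show
the shape of the idea; none of this exists in clio:

        /* Hypothetical pre-allocated reserve (illustrative only). */
        struct cl_page_reserve {
                spinlock_t       cpr_lock;
                struct list_head cpr_free;   /* free pre-allocated entries */
        };

        /* Move up to 'want' free entries onto 'out'; return how many we got. */
        static int cpr_get(struct cl_page_reserve *res, struct list_head *out,
                           int want)
        {
                int got = 0;

                spin_lock(&res->cpr_lock);
                while (got < want && !list_empty(&res->cpr_free)) {
                        list_move_tail(res->cpr_free.next, out);
                        got++;
                }
                spin_unlock(&res->cpr_lock);
                /* if got < want, the caller writes out fewer pages at a time */
                return got;
        }
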
+
+/*
+ * Add bio to back of pending list
+ */
+static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lo->lo_lock, flags);
+       if (lo->lo_biotail) {
+               lo->lo_biotail->bi_next = bio;
+               lo->lo_biotail = bio;
+       } else
+               lo->lo_bio = lo->lo_biotail = bio;
+       spin_unlock_irqrestore(&lo->lo_lock, flags);
+
+       atomic_inc(&lo->lo_pending);
+       if (waitqueue_active(&lo->lo_bh_wait))
+               wake_up(&lo->lo_bh_wait);
+}
+
+/*
+ * Grab first pending buffer
+ */
+static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
+{
+       struct bio *first;
+       struct bio **bio;
+       unsigned int count = 0;
+       unsigned int page_count = 0;
+       int rw;
+
+       spin_lock_irq(&lo->lo_lock);
+       first = lo->lo_bio;
+       if (unlikely(first == NULL)) {
+               spin_unlock_irq(&lo->lo_lock);
+               return 0;
+       }
+
+       /* TODO: need to split the bio, too bad. */
+       LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);
+
+       rw = first->bi_rw;
+       bio = &lo->lo_bio;
+       while (*bio && (*bio)->bi_rw == rw) {
+               CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
+                      (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
+                      page_count, (*bio)->bi_vcnt);
+               if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
+                       break;
+
+
+               page_count += (*bio)->bi_vcnt;
+               count++;
+               bio = &(*bio)->bi_next;
+       }
+       if (*bio) {
+               /* Some of the bios can't be merged. */
+               lo->lo_bio = *bio;
+               *bio = NULL;
+       } else {
+               /* Hit the end of queue */
+               lo->lo_biotail = NULL;
+               lo->lo_bio = NULL;
+       }
+       *req = first;
+       spin_unlock_irq(&lo->lo_lock);
+       return count;
+}
+
+static ll_mrf_ret
+loop_make_request(struct request_queue *q, struct bio *old_bio)
+{
+       struct lloop_device *lo = q->queuedata;
+       int rw = bio_rw(old_bio);
+       int inactive;
+
+       if (!lo)
+               goto err;
+
+       CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
+              (unsigned long long)old_bio->bi_sector, old_bio->bi_size);
+
+       spin_lock_irq(&lo->lo_lock);
+       inactive = (lo->lo_state != LLOOP_BOUND);
+       spin_unlock_irq(&lo->lo_lock);
+       if (inactive)
+               goto err;
+
+       if (rw == WRITE) {
+               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+                       goto err;
+       } else if (rw == READA) {
+               rw = READ;
+       } else if (rw != READ) {
+               CERROR("lloop: unknown command (%x)\n", rw);
+               goto err;
+       }
+       loop_add_bio(lo, old_bio);
+       LL_MRF_RETURN(0);
+err:
+       cfs_bio_io_error(old_bio, old_bio->bi_size);
+       LL_MRF_RETURN(0);
+}
+
+
+static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
+{
+       int ret;
+       ret = do_bio_lustrebacked(lo, bio);
+       while (bio) {
+               struct bio *tmp = bio->bi_next;
+               bio->bi_next = NULL;
+               cfs_bio_endio(bio, bio->bi_size, ret);
+               bio = tmp;
+       }
+}
+
+static inline int loop_active(struct lloop_device *lo)
+{
+       return atomic_read(&lo->lo_pending) ||
+               (lo->lo_state == LLOOP_RUNDOWN);
+}
+
+/*
+ * worker thread that handles reads/writes to file backed loop devices,
+ * to avoid blocking in our make_request_fn.
+ */
+static int loop_thread(void *data)
+{
+       struct lloop_device *lo = data;
+       struct bio *bio;
+       unsigned int count;
+       unsigned long times = 0;
+       unsigned long total_count = 0;
+
+       struct lu_env *env;
+       int refcheck;
+       int ret = 0;
+
+       set_user_nice(current, -20);
+
+       lo->lo_state = LLOOP_BOUND;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               GOTO(out, ret = PTR_ERR(env));
+
+       lo->lo_env = env;
+       memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
+       lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
+       lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;
+
+       /*
+        * up sem, we are running
+        */
+       up(&lo->lo_sem);
+
+       for (;;) {
+               wait_event(lo->lo_bh_wait, loop_active(lo));
+               if (!atomic_read(&lo->lo_pending)) {
+                       int exiting = 0;
+                       spin_lock_irq(&lo->lo_lock);
+                       exiting = (lo->lo_state == LLOOP_RUNDOWN);
+                       spin_unlock_irq(&lo->lo_lock);
+                       if (exiting)
+                               break;
+               }
+
+               bio = NULL;
+               count = loop_get_bio(lo, &bio);
+               if (!count) {
+                       CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
+                       continue;
+               }
+
+               total_count += count;
+               if (total_count < count) {     /* overflow */
+                       total_count = count;
+                       times = 1;
+               } else {
+                       times++;
+               }
+               if ((times & 127) == 0) {
+                       CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
+                              total_count, times, total_count / times);
+               }
+
+               LASSERT(bio != NULL);
+               LASSERT(count <= atomic_read(&lo->lo_pending));
+               loop_handle_bio(lo, bio);
+               atomic_sub(count, &lo->lo_pending);
+       }
+       cl_env_put(env, &refcheck);
+
+out:
+       up(&lo->lo_sem);
+       return ret;
+}
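
Note the lo_sem handshake: loop_set_fd() starts this thread and then blocks
in down(&lo->lo_sem) until the up() above signals that the thread is bound
and running; on teardown, loop_clr_fd() flips lo_state to LLOOP_RUNDOWN,
wakes lo_bh_wait, and its own down() waits for the final up() in the out:
path here, so the device is never torn down under a live worker thread.
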
+
+static int loop_set_fd(struct lloop_device *lo, struct file *unused,
+                      struct block_device *bdev, struct file *file)
+{
+       struct inode     *inode;
+       struct address_space *mapping;
+       int                lo_flags = 0;
+       int                error;
+       loff_t          size;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       error = -EBUSY;
+       if (lo->lo_state != LLOOP_UNBOUND)
+               goto out;
+
+       mapping = file->f_mapping;
+       inode = mapping->host;
+
+       error = -EINVAL;
+       if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
+               goto out;
+
+       if (!(file->f_mode & FMODE_WRITE))
+               lo_flags |= LO_FLAGS_READ_ONLY;
+
+       size = get_loop_size(lo, file);
+
+       if ((loff_t)(sector_t)size != size) {
+               error = -EFBIG;
+               goto out;
+       }
+
+       /* remove all pages from the cache so that no dirty pages remain. */
+       truncate_inode_pages(mapping, 0);
+
+       set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+       lo->lo_blocksize = PAGE_CACHE_SIZE;
+       lo->lo_device = bdev;
+       lo->lo_flags = lo_flags;
+       lo->lo_backing_file = file;
+       lo->ioctl = NULL;
+       lo->lo_sizelimit = 0;
+       lo->old_gfp_mask = mapping_gfp_mask(mapping);
+       mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+       lo->lo_bio = lo->lo_biotail = NULL;
+
+       /*
+        * set queue make_request_fn, and add limits based on lower level
+        * device
+        */
+       blk_queue_make_request(lo->lo_queue, loop_make_request);
+       lo->lo_queue->queuedata = lo;
+
+       /* queue parameters */
+       CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+       blk_queue_logical_block_size(lo->lo_queue,
+                                    (unsigned short)PAGE_CACHE_SIZE);
+       blk_queue_max_hw_sectors(lo->lo_queue,
+                                LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+       blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
+
+       set_capacity(disks[lo->lo_number], size);
+       bd_set_size(bdev, size << 9);
+
+       set_blocksize(bdev, lo->lo_blocksize);
+
+       kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
+       down(&lo->lo_sem);
+       return 0;
+
+out:
+       /* This is safe: open() is still holding a reference. */
+       module_put(THIS_MODULE);
+       return error;
+}
+
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
+                      int count)
+{
+       struct file *filp = lo->lo_backing_file;
+       int gfp = lo->old_gfp_mask;
+
+       if (lo->lo_state != LLOOP_BOUND)
+               return -ENXIO;
+
+       if (lo->lo_refcnt > count)      /* we needed one fd for the ioctl */
+               return -EBUSY;
+
+       if (filp == NULL)
+               return -EINVAL;
+
+       spin_lock_irq(&lo->lo_lock);
+       lo->lo_state = LLOOP_RUNDOWN;
+       spin_unlock_irq(&lo->lo_lock);
+       wake_up(&lo->lo_bh_wait);
+
+       down(&lo->lo_sem);
+       lo->lo_backing_file = NULL;
+       lo->ioctl = NULL;
+       lo->lo_device = NULL;
+       lo->lo_offset = 0;
+       lo->lo_sizelimit = 0;
+       lo->lo_flags = 0;
+       ll_invalidate_bdev(bdev, 0);
+       set_capacity(disks[lo->lo_number], 0);
+       bd_set_size(bdev, 0);
+       mapping_set_gfp_mask(filp->f_mapping, gfp);
+       lo->lo_state = LLOOP_UNBOUND;
+       fput(filp);
+       /* This is safe: open() is still holding a reference. */
+       module_put(THIS_MODULE);
+       return 0;
+}
+
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+       struct lloop_device *lo = bdev->bd_disk->private_data;
+
+       mutex_lock(&lo->lo_ctl_mutex);
+       lo->lo_refcnt++;
+       mutex_unlock(&lo->lo_ctl_mutex);
+
+       return 0;
+}
+
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+       struct lloop_device *lo = disk->private_data;
+
+       mutex_lock(&lo->lo_ctl_mutex);
+       --lo->lo_refcnt;
+       mutex_unlock(&lo->lo_ctl_mutex);
+}
+
+/* lloop device node's ioctl function. */
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+                   unsigned int cmd, unsigned long arg)
+{
+       struct lloop_device *lo = bdev->bd_disk->private_data;
+       struct inode *inode = NULL;
+       int err = 0;
+
+       mutex_lock(&lloop_mutex);
+       switch (cmd) {
+       case LL_IOC_LLOOP_DETACH: {
+               err = loop_clr_fd(lo, bdev, 2);
+               if (err == 0)
+                       ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+               break;
+       }
+
+       case LL_IOC_LLOOP_INFO: {
+               struct lu_fid fid;
+
+               LASSERT(lo->lo_backing_file != NULL);
+               if (inode == NULL)
+                       inode = lo->lo_backing_file->f_dentry->d_inode;
+               if (lo->lo_state == LLOOP_BOUND)
+                       fid = ll_i2info(inode)->lli_fid;
+               else
+                       fid_zero(&fid);
+
+               if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
+                       err = -EFAULT;
+               break;
+       }
+
+       default:
+               err = -EINVAL;
+               break;
+       }
+       mutex_unlock(&lloop_mutex);
+
+       return err;
+}
+
+static struct block_device_operations lo_fops = {
+       .owner =        THIS_MODULE,
+       .open =  lo_open,
+       .release =      lo_release,
+       .ioctl =        lo_ioctl,
+};
+
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ *
+ * This is an llite regular file ioctl function. It takes the responsibility
+ * of attaching or detaching a file by an lloop device's number.
+ */
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
+                                  unsigned int cmd, unsigned long arg,
+                                  void *magic, int *rcp)
+{
+       struct lloop_device *lo = NULL;
+       struct block_device *bdev = NULL;
+       int err = 0;
+       dev_t dev;
+
+       if (magic != ll_iocontrol_magic)
+               return LLIOC_CONT;
+
+       if (disks == NULL)
+               GOTO(out1, err = -ENODEV);
+
+       CWARN("Enter llop_ioctl\n");
+
+       mutex_lock(&lloop_mutex);
+       switch (cmd) {
+       case LL_IOC_LLOOP_ATTACH: {
+               struct lloop_device *lo_free = NULL;
+               int i;
+
+               for (i = 0; i < max_loop; i++, lo = NULL) {
+                       lo = &loop_dev[i];
+                       if (lo->lo_state == LLOOP_UNBOUND) {
+                               if (!lo_free)
+                                       lo_free = lo;
+                               continue;
+                       }
+                       if (lo->lo_backing_file->f_dentry->d_inode ==
+                           file->f_dentry->d_inode)
+                               break;
+               }
+               if (lo || !lo_free)
+                       GOTO(out, err = -EBUSY);
+
+               lo = lo_free;
+               dev = MKDEV(lloop_major, lo->lo_number);
+
+               /* quit if we cannot write the device number to the user pointer */
+               if (put_user((long)old_encode_dev(dev), (long*)arg))
+                       GOTO(out, err = -EFAULT);
+
+               bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
+               if (IS_ERR(bdev))
+                       GOTO(out, err = PTR_ERR(bdev));
+
+               get_file(file);
+               err = loop_set_fd(lo, NULL, bdev, file);
+               if (err) {
+                       fput(file);
+                       ll_blkdev_put(bdev, 0);
+               }
+
+               break;
+       }
+
+       case LL_IOC_LLOOP_DETACH_BYDEV: {
+               int minor;
+
+               dev = old_decode_dev(arg);
+               if (MAJOR(dev) != lloop_major)
+                       GOTO(out, err = -EINVAL);
+
+               minor = MINOR(dev);
+               if (minor > max_loop - 1)
+                       GOTO(out, err = -EINVAL);
+
+               lo = &loop_dev[minor];
+               if (lo->lo_state != LLOOP_BOUND)
+                       GOTO(out, err = -EINVAL);
+
+               bdev = lo->lo_device;
+               err = loop_clr_fd(lo, bdev, 1);
+               if (err == 0)
+                       ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
+
+               break;
+       }
+
+       default:
+               err = -EINVAL;
+               break;
+       }
+
+out:
+       mutex_unlock(&lloop_mutex);
+out1:
+       if (rcp)
+               *rcp = err;
+       return LLIOC_STOP;
+}
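
From user space the attach/detach flow above would look roughly like the
sketch below. It assumes LL_IOC_LLOOP_ATTACH and LL_IOC_LLOOP_DETACH_BYDEV
are available from the Lustre user headers, and that the long written back by
put_user() above is an old_encode_dev()-style device number; treat it as a
shape, not a tested client:

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        int lloop_attach_example(const char *path)
        {
                long dev = 0;
                int fd = open(path, O_RDWR);

                if (fd < 0)
                        return -1;
                /* kernel picks a free lloop device, writes its number back */
                if (ioctl(fd, LL_IOC_LLOOP_ATTACH, &dev) < 0) {
                        close(fd);
                        return -1;
                }
                printf("attached %s to lloop device %#lx\n", path, dev);
                /* later: ioctl(fd, LL_IOC_LLOOP_DETACH_BYDEV, dev); */
                close(fd);
                return 0;
        }
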
+
+static int __init lloop_init(void)
+{
+       int     i;
+       unsigned int cmdlist[] = {
+               LL_IOC_LLOOP_ATTACH,
+               LL_IOC_LLOOP_DETACH_BYDEV,
+       };
+
+       if (max_loop < 1 || max_loop > 256) {
+               max_loop = MAX_LOOP_DEFAULT;
+               CWARN("lloop: invalid max_loop (must be between"
+                     " 1 and 256), using default (%u)\n", max_loop);
+       }
+
+       lloop_major = register_blkdev(0, "lloop");
+       if (lloop_major < 0)
+               return -EIO;
+
+       CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
+              lloop_major, max_loop);
+
+       ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
+       if (ll_iocontrol_magic == NULL)
+               goto out_mem1;
+
+       OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
+       if (!loop_dev)
+               goto out_mem1;
+
+       OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
+       if (!disks)
+               goto out_mem2;
+
+       for (i = 0; i < max_loop; i++) {
+               disks[i] = alloc_disk(1);
+               if (!disks[i])
+                       goto out_mem3;
+       }
+
+       mutex_init(&lloop_mutex);
+
+       for (i = 0; i < max_loop; i++) {
+               struct lloop_device *lo = &loop_dev[i];
+               struct gendisk *disk = disks[i];
+
+               lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
+               if (!lo->lo_queue)
+                       goto out_mem4;
+
+               mutex_init(&lo->lo_ctl_mutex);
+               sema_init(&lo->lo_sem, 0);
+               init_waitqueue_head(&lo->lo_bh_wait);
+               lo->lo_number = i;
+               spin_lock_init(&lo->lo_lock);
+               disk->major = lloop_major;
+               disk->first_minor = i;
+               disk->fops = &lo_fops;
+               sprintf(disk->disk_name, "lloop%d", i);
+               disk->private_data = lo;
+               disk->queue = lo->lo_queue;
+       }
+
+       /* We cannot fail after we call this, so another loop! */
+       for (i = 0; i < max_loop; i++)
+               add_disk(disks[i]);
+       return 0;
+
+out_mem4:
+       while (i--)
+               blk_cleanup_queue(loop_dev[i].lo_queue);
+       i = max_loop;
+out_mem3:
+       while (i--)
+               put_disk(disks[i]);
+       OBD_FREE(disks, max_loop * sizeof(*disks));
+out_mem2:
+       OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+out_mem1:
+       unregister_blkdev(lloop_major, "lloop");
+       ll_iocontrol_unregister(ll_iocontrol_magic);
+       CERROR("lloop: ran out of memory\n");
+       return -ENOMEM;
+}
+
+static void lloop_exit(void)
+{
+       int i;
+
+       ll_iocontrol_unregister(ll_iocontrol_magic);
+       for (i = 0; i < max_loop; i++) {
+               del_gendisk(disks[i]);
+               blk_cleanup_queue(loop_dev[i].lo_queue);
+               put_disk(disks[i]);
+       }
+       if (ll_unregister_blkdev(lloop_major, "lloop"))
+               CWARN("lloop: cannot unregister blkdev\n");
+       else
+               CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);
+
+       OBD_FREE(disks, max_loop * sizeof(*disks));
+       OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
+}
+
+module_init(lloop_init);
+module_exit(lloop_exit);
+
+CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum number of lloop devices");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre virtual block device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
new file mode 100644 (file)
index 0000000..6a82505
--- /dev/null
@@ -0,0 +1,1370 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include <obd_support.h>
+
+#include "llite_internal.h"
+
+struct proc_dir_entry *proc_lustre_fs_root;
+
+#ifdef LPROCFS
+/* /proc/lustre/llite mount point registration */
+extern struct file_operations vvp_dump_pgcache_file_ops;
+struct file_operations ll_rw_extents_stats_fops;
+struct file_operations ll_rw_extents_stats_pp_fops;
+struct file_operations ll_rw_offset_stats_fops;
+
+static int ll_blksize_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc)
+             rc = seq_printf(m, "%u\n", osfs.os_bsize);
+
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_blksize);
+
+static int ll_kbytestotal_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytestotal);
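
The shift loop above is a multiplication in disguise: it computes
os_blocks * (os_bsize / 1024) by repeated doubling. For example, with
os_bsize = 4096 the starting blk_size is 4, the loop doubles result twice
(4 -> 2 -> 1), and a filesystem of 1000 blocks reports 4000 KiB. The
kbytesfree and kbytesavail handlers below apply the same trick to os_bfree
and os_bavail.
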
+
+static int ll_kbytesfree_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesfree);
+
+static int ll_kbytesavail_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_kbytesavail);
+
+static int ll_filestotal_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc)
+                rc = seq_printf(m, LPU64"\n", osfs.os_files);
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_filestotal);
+
+static int ll_filesfree_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+       struct obd_statfs osfs;
+       int rc;
+
+       LASSERT(sb != NULL);
+       rc = ll_statfs_internal(sb, &osfs,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               OBD_STATFS_NODELAY);
+       if (!rc)
+                rc = seq_printf(m, LPU64"\n", osfs.os_ffree);
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_filesfree);
+
+static int ll_client_type_seq_show(struct seq_file *m, void *v)
+{
+       struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)m->private);
+       int rc;
+
+       LASSERT(sbi != NULL);
+
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT)
+               rc = seq_printf(m, "remote client\n");
+       else
+               rc = seq_printf(m, "local client\n");
+
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(ll_client_type);
+
+static int ll_fstype_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+
+       LASSERT(sb != NULL);
+       return seq_printf(m, "%s\n", sb->s_type->name);
+}
+LPROC_SEQ_FOPS_RO(ll_fstype);
+
+static int ll_sb_uuid_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = (struct super_block *)m->private;
+
+       LASSERT(sb != NULL);
+       return seq_printf(m, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(ll_sb_uuid);
+
+static int ll_site_stats_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+
+       /*
+        * See description of statistical counters in struct cl_site, and
+        * struct lu_site.
+        */
+       return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m);
+}
+LPROC_SEQ_FOPS_RO(ll_site_stats);
+
+static int ll_max_readahead_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_pages;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_readahead_mb_seq_write(struct file *file, const char *buffer,
+                                        size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0 || pages_number > num_physpages / 2) {
+               CERROR("can't set file readahead more than %lu MB\n",
+                      num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_pages = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_mb);
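
The MB-to-pages conversion here is worth spelling out: mult is the number of
pages per MiB, 1 << (20 - PAGE_CACHE_SHIFT), i.e. 256 with 4 KiB pages.
Writing "40" through lprocfs_write_frac_helper() therefore stores
ra_max_pages = 10240, and the show path divides by the same mult via
lprocfs_seq_read_frac_helper(), so the value reads back as 40. The two
per-file readahead handlers below use the identical conversion.
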
+
+static int ll_max_readahead_per_file_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_readahead_per_file_mb_seq_write(struct file *file,
+                                                 const char *buffer,
+                                                 size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0 ||
+               pages_number > sbi->ll_ra_info.ra_max_pages) {
+               CERROR("can't set file readahead more than"
+                      "max_read_ahead_mb %lu MB\n",
+                      sbi->ll_ra_info.ra_max_pages);
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_max_readahead_per_file_mb);
+
+static int ll_max_read_ahead_whole_mb_seq_show(struct seq_file *m, void *unused)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       long pages_number;
+       int mult;
+
+       spin_lock(&sbi->ll_lock);
+       pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
+       spin_unlock(&sbi->ll_lock);
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       return lprocfs_seq_read_frac_helper(m, pages_number, mult);
+}
+
+static ssize_t ll_max_read_ahead_whole_mb_seq_write(struct file *file,
+                                               const char *buffer,
+                                               size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int mult, rc, pages_number;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       /* Cap this at the current max readahead window size; the readahead
+        * algorithm does this anyway, so it's pointless to set it larger. */
+       if (pages_number < 0 ||
+           pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
+               CERROR("can't set max_read_ahead_whole_mb more than "
+                      "max_read_ahead_per_file_mb: %lu\n",
+                       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+               return -ERANGE;
+       }
+
+       spin_lock(&sbi->ll_lock);
+       sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number;
+       spin_unlock(&sbi->ll_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_max_read_ahead_whole_mb);
+
+static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block     *sb    = m->private;
+       struct ll_sb_info      *sbi   = ll_s2sbi(sb);
+       struct cl_client_cache *cache = &sbi->ll_cache;
+       int shift = 20 - PAGE_CACHE_SHIFT;
+       int max_cached_mb;
+       int unused_mb;
+
+       max_cached_mb = cache->ccc_lru_max >> shift;
+       unused_mb = atomic_read(&cache->ccc_lru_left) >> shift;
+       return seq_printf(m,
+                       "users: %d\n"
+                       "max_cached_mb: %d\n"
+                       "used_mb: %d\n"
+                       "unused_mb: %d\n"
+                       "reclaim_count: %u\n",
+                       atomic_read(&cache->ccc_users),
+                       max_cached_mb,
+                       max_cached_mb - unused_mb,
+                       unused_mb,
+                       cache->ccc_lru_shrinkers);
+}
+
+static ssize_t ll_max_cached_mb_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = &sbi->ll_cache;
+       int mult, rc, pages_number;
+       int diff = 0;
+       int nrpages = 0;
+       ENTRY;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       buffer = lprocfs_find_named_value(buffer, "max_cached_mb:", &count);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               RETURN(rc);
+
+       if (pages_number < 0 || pages_number > num_physpages) {
+               CERROR("%s: can't set max cache more than %lu MB\n",
+                      ll_get_fsname(sb, NULL, 0),
+                      num_physpages >> (20 - PAGE_CACHE_SHIFT));
+               RETURN(-ERANGE);
+       }
+
+       if (sbi->ll_dt_exp == NULL)
+               RETURN(-ENODEV);
+
+       spin_lock(&sbi->ll_lock);
+       diff = pages_number - cache->ccc_lru_max;
+       spin_unlock(&sbi->ll_lock);
+
+       /* easy - add more LRU slots. */
+       if (diff >= 0) {
+               atomic_add(diff, &cache->ccc_lru_left);
+               GOTO(out, rc = 0);
+       }
+
+       diff = -diff;
+       while (diff > 0) {
+               int tmp;
+
+               /* reduce LRU budget from free slots. */
+               do {
+                       int ov, nv;
+
+                       ov = atomic_read(&cache->ccc_lru_left);
+                       if (ov == 0)
+                               break;
+
+                       nv = ov > diff ? ov - diff : 0;
+                       rc = cfs_atomic_cmpxchg(&cache->ccc_lru_left, ov, nv);
+                       if (likely(ov == rc)) {
+                               diff -= ov - nv;
+                               nrpages += ov - nv;
+                               break;
+                       }
+               } while (1);
+
+               if (diff <= 0)
+                       break;
+
+               /* difficult - have to ask OSCs to drop LRU slots. */
+               tmp = diff << 1;
+               rc = obd_set_info_async(NULL, sbi->ll_dt_exp,
+                               sizeof(KEY_CACHE_LRU_SHRINK),
+                               KEY_CACHE_LRU_SHRINK,
+                               sizeof(tmp), &tmp, NULL);
+               if (rc < 0)
+                       break;
+       }
+
+out:
+       if (rc >= 0) {
+               spin_lock(&sbi->ll_lock);
+               cache->ccc_lru_max = pages_number;
+               spin_unlock(&sbi->ll_lock);
+               rc = count;
+       } else {
+               atomic_add(nrpages, &cache->ccc_lru_left);
+       }
+       return rc;
+}
+LPROC_SEQ_FOPS(ll_max_cached_mb);
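
The "reduce LRU budget from free slots" loop above is a classic cmpxchg
retry: read the counter, compute a value clamped at zero, and commit only if
nobody raced in between. The same pattern in isolation, as a minimal sketch
using the kernel's atomic_cmpxchg() (the budget_take() name is invented
here):

        /* Take up to 'want' units from a budget, never going below zero. */
        static int budget_take(atomic_t *budget, int want)
        {
                int old, new;

                do {
                        old = atomic_read(budget);
                        if (old == 0)
                                return 0;       /* nothing left to take */
                        new = old > want ? old - want : 0;
                } while (atomic_cmpxchg(budget, old, new) != old);

                return old - new;               /* amount actually taken */
        }
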
+
+static int ll_checksum_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return seq_printf(m, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0);
+}
+
+static ssize_t ll_checksum_seq_write(struct file *file, const char *buffer,
+                                size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       if (!sbi->ll_dt_exp)
+               /* Not set up yet */
+               return -EAGAIN;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       if (val)
+               sbi->ll_flags |= LL_SBI_CHECKSUM;
+       else
+               sbi->ll_flags &= ~LL_SBI_CHECKSUM;
+
+       rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
+                               KEY_CHECKSUM, sizeof(val), &val, NULL);
+       if (rc)
+               CWARN("Failed to set OSC checksum flags: %d\n", rc);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_checksum);
+
+static int ll_max_rw_chunk_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+
+       return seq_printf(m, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk);
+}
+
+static ssize_t ll_max_rw_chunk_seq_write(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       int rc, val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       ll_s2sbi(sb)->ll_max_rw_chunk = val;
+       return count;
+}
+LPROC_SEQ_FOPS(ll_max_rw_chunk);
+
+static int ll_rd_track_id(struct seq_file *m, enum stats_track_type type)
+{
+       struct super_block *sb = m->private;
+
+       if (ll_s2sbi(sb)->ll_stats_track_type == type) {
+               return seq_printf(m, "%d\n",
+                               ll_s2sbi(sb)->ll_stats_track_id);
+
+       } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) {
+               return seq_printf(m, "0 (all)\n");
+       } else {
+               return seq_printf(m, "untracked\n");
+       }
+}
+
+static int ll_wr_track_id(const char *buffer, unsigned long count, void *data,
+                         enum stats_track_type type)
+{
+       struct super_block *sb = data;
+       int rc, pid;
+
+       rc = lprocfs_write_helper(buffer, count, &pid);
+       if (rc)
+               return rc;
+       ll_s2sbi(sb)->ll_stats_track_id = pid;
+       if (pid == 0)
+               ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL;
+       else
+               ll_s2sbi(sb)->ll_stats_track_type = type;
+       lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats);
+       return count;
+}
+
+static int ll_track_pid_seq_show(struct seq_file *m, void *v)
+{
+       return ll_rd_track_id(m, STATS_TRACK_PID);
+}
+
+static ssize_t ll_track_pid_seq_write(struct file *file, const char *buffer,
+                                 size_t count, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PID);
+}
+LPROC_SEQ_FOPS(ll_track_pid);
+
+static int ll_track_ppid_seq_show(struct seq_file *m, void *v)
+{
+       return ll_rd_track_id(m, STATS_TRACK_PPID);
+}
+
+static ssize_t ll_track_ppid_seq_write(struct file *file, const char *buffer,
+                                  size_t count, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_PPID);
+}
+LPROC_SEQ_FOPS(ll_track_ppid);
+
+static int ll_track_gid_seq_show(struct seq_file *m, void *v)
+{
+       return ll_rd_track_id(m, STATS_TRACK_GID);
+}
+
+static ssize_t ll_track_gid_seq_write(struct file *file, const char *buffer,
+                                 size_t count, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       return ll_wr_track_id(buffer, count, seq->private, STATS_TRACK_GID);
+}
+LPROC_SEQ_FOPS(ll_track_gid);
+
+static int ll_statahead_max_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return seq_printf(m, "%u\n", sbi->ll_sa_max);
+}
+
+static ssize_t ll_statahead_max_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val >= 0 && val <= LL_SA_RPC_MAX)
+               sbi->ll_sa_max = val;
+       else
+               CERROR("Bad statahead_max value %d. Valid values are in the "
+                      "range [0, %d]\n", val, LL_SA_RPC_MAX);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_max);
+
+static int ll_statahead_agl_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return seq_printf(m, "%u\n",
+                       sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0);
+}
+
+static ssize_t ll_statahead_agl_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val)
+               sbi->ll_flags |= LL_SBI_AGL_ENABLED;
+       else
+               sbi->ll_flags &= ~LL_SBI_AGL_ENABLED;
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_statahead_agl);
+
+static int ll_statahead_stats_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return seq_printf(m,
+                       "statahead total: %u\n"
+                       "statahead wrong: %u\n"
+                       "agl total: %u\n",
+                       atomic_read(&sbi->ll_sa_total),
+                       atomic_read(&sbi->ll_sa_wrong),
+                       atomic_read(&sbi->ll_agl_total));
+}
+LPROC_SEQ_FOPS_RO(ll_statahead_stats);
+
+static int ll_lazystatfs_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+       return seq_printf(m, "%u\n",
+                       (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0);
+}
+
+static ssize_t ll_lazystatfs_seq_write(struct file *file, const char *buffer,
+                                  size_t count, loff_t *off)
+{
+       struct super_block *sb = ((struct seq_file *)file->private_data)->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val)
+               sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+       else
+               sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+
+       return count;
+}
+LPROC_SEQ_FOPS(ll_lazystatfs);
+
+static int ll_maxea_size_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       unsigned int ealen;
+       int rc;
+
+       rc = ll_get_max_mdsize(sbi, &ealen);
+       if (rc)
+               return rc;
+
+       return seq_printf(m, "%u\n", ealen);
+}
+LPROC_SEQ_FOPS_RO(ll_maxea_size);
+
+static int ll_sbi_flags_seq_show(struct seq_file *m, void *v)
+{
+       const char *str[] = LL_SBI_FLAGS;
+       struct super_block *sb = m->private;
+       int flags = ll_s2sbi(sb)->ll_flags;
+       int i = 0;
+
+       while (flags != 0) {
+               if (ARRAY_SIZE(str) <= i) {
+                       CERROR("%s: Revise array LL_SBI_FLAGS to match sbi "
+                               "flags please.\n", ll_get_fsname(sb, NULL, 0));
+                       return -EINVAL;
+               }
+
+               if (flags & 0x1)
+                       seq_printf(m, "%s ", str[i]);
+               flags >>= 1;
+               ++i;
+       }
+       seq_printf(m, "\b\n");
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(ll_sbi_flags);
+
+static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
+       { "uuid",         &ll_sb_uuid_fops,       0, 0 },
+       //{ "mntpt_path",   ll_rd_path,      0, 0 },
+       { "fstype",       &ll_fstype_fops,        0, 0 },
+       { "site",         &ll_site_stats_fops,    0, 0 },
+       { "blocksize",    &ll_blksize_fops,       0, 0 },
+       { "kbytestotal",  &ll_kbytestotal_fops,   0, 0 },
+       { "kbytesfree",   &ll_kbytesfree_fops,    0, 0 },
+       { "kbytesavail",  &ll_kbytesavail_fops,   0, 0 },
+       { "filestotal",   &ll_filestotal_fops,    0, 0 },
+       { "filesfree",    &ll_filesfree_fops,     0, 0 },
+       { "client_type",  &ll_client_type_fops,   0, 0 },
+       //{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
+       { "max_read_ahead_mb", &ll_max_readahead_mb_fops, 0 },
+       { "max_read_ahead_per_file_mb", &ll_max_readahead_per_file_mb_fops, 0 },
+       { "max_read_ahead_whole_mb", &ll_max_read_ahead_whole_mb_fops, 0 },
+       { "max_cached_mb",    &ll_max_cached_mb_fops, 0 },
+       { "checksum_pages",   &ll_checksum_fops, 0 },
+       { "max_rw_chunk",     &ll_max_rw_chunk_fops, 0 },
+       { "stats_track_pid",  &ll_track_pid_fops, 0 },
+       { "stats_track_ppid", &ll_track_ppid_fops, 0 },
+       { "stats_track_gid",  &ll_track_gid_fops, 0 },
+       { "statahead_max",    &ll_statahead_max_fops, 0 },
+       { "statahead_agl",    &ll_statahead_agl_fops, 0 },
+       { "statahead_stats",  &ll_statahead_stats_fops, 0, 0 },
+       { "lazystatfs",       &ll_lazystatfs_fops, 0 },
+       { "max_easize",       &ll_maxea_size_fops, 0, 0 },
+       { "sbi_flags",        &ll_sbi_flags_fops, 0, 0 },
+       { 0 }
+};
+
+#define MAX_STRING_SIZE 128
+
+struct llite_file_opcode {
+       __u32       opcode;
+       __u32       type;
+       const char *opname;
+} llite_opcode_table[LPROC_LL_FILE_OPCODES] = {
+       /* file operation */
+       { LPROC_LL_DIRTY_HITS,     LPROCFS_TYPE_REGS, "dirty_pages_hits" },
+       { LPROC_LL_DIRTY_MISSES,   LPROCFS_TYPE_REGS, "dirty_pages_misses" },
+       { LPROC_LL_READ_BYTES,     LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "read_bytes" },
+       { LPROC_LL_WRITE_BYTES,    LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "write_bytes" },
+       { LPROC_LL_BRW_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+                                  "brw_read" },
+       { LPROC_LL_BRW_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
+                                  "brw_write" },
+       { LPROC_LL_OSC_READ,       LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "osc_read" },
+       { LPROC_LL_OSC_WRITE,      LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES,
+                                  "osc_write" },
+       { LPROC_LL_IOCTL,          LPROCFS_TYPE_REGS, "ioctl" },
+       { LPROC_LL_OPEN,           LPROCFS_TYPE_REGS, "open" },
+       { LPROC_LL_RELEASE,        LPROCFS_TYPE_REGS, "close" },
+       { LPROC_LL_MAP,            LPROCFS_TYPE_REGS, "mmap" },
+       { LPROC_LL_LLSEEK,         LPROCFS_TYPE_REGS, "seek" },
+       { LPROC_LL_FSYNC,          LPROCFS_TYPE_REGS, "fsync" },
+       { LPROC_LL_READDIR,        LPROCFS_TYPE_REGS, "readdir" },
+       /* inode operation */
+       { LPROC_LL_SETATTR,        LPROCFS_TYPE_REGS, "setattr" },
+       { LPROC_LL_TRUNC,          LPROCFS_TYPE_REGS, "truncate" },
+       { LPROC_LL_FLOCK,          LPROCFS_TYPE_REGS, "flock" },
+       { LPROC_LL_GETATTR,        LPROCFS_TYPE_REGS, "getattr" },
+       /* dir inode operation */
+       { LPROC_LL_CREATE,         LPROCFS_TYPE_REGS, "create" },
+       { LPROC_LL_LINK,           LPROCFS_TYPE_REGS, "link" },
+       { LPROC_LL_UNLINK,         LPROCFS_TYPE_REGS, "unlink" },
+       { LPROC_LL_SYMLINK,        LPROCFS_TYPE_REGS, "symlink" },
+       { LPROC_LL_MKDIR,          LPROCFS_TYPE_REGS, "mkdir" },
+       { LPROC_LL_RMDIR,          LPROCFS_TYPE_REGS, "rmdir" },
+       { LPROC_LL_MKNOD,          LPROCFS_TYPE_REGS, "mknod" },
+       { LPROC_LL_RENAME,         LPROCFS_TYPE_REGS, "rename" },
+       /* special inode operation */
+       { LPROC_LL_STAFS,          LPROCFS_TYPE_REGS, "statfs" },
+       { LPROC_LL_ALLOC_INODE,    LPROCFS_TYPE_REGS, "alloc_inode" },
+       { LPROC_LL_SETXATTR,       LPROCFS_TYPE_REGS, "setxattr" },
+       { LPROC_LL_GETXATTR,       LPROCFS_TYPE_REGS, "getxattr" },
+       { LPROC_LL_LISTXATTR,      LPROCFS_TYPE_REGS, "listxattr" },
+       { LPROC_LL_REMOVEXATTR,    LPROCFS_TYPE_REGS, "removexattr" },
+       { LPROC_LL_INODE_PERM,     LPROCFS_TYPE_REGS, "inode_permission" },
+};
+
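+/*
+ * Add @count to the counter for @op, subject to the stats_track_*
+ * filter: with STATS_TRACK_ALL every caller is counted, otherwise only
+ * callers whose pid/ppid/gid matches ll_stats_track_id.
+ */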
+void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
+{
+       if (!sbi->ll_stats)
+               return;
+       if (sbi->ll_stats_track_type == STATS_TRACK_ALL)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_PID &&
+                sbi->ll_stats_track_id == current->pid)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_PPID &&
+                sbi->ll_stats_track_id == current->parent->pid)
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+       else if (sbi->ll_stats_track_type == STATS_TRACK_GID &&
+                sbi->ll_stats_track_id == current_gid())
+               lprocfs_counter_add(sbi->ll_stats, op, count);
+}
+EXPORT_SYMBOL(ll_stats_ops_tally);
+
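+/* Read-ahead counter names; the indices must stay in sync with the
+ * RA_STAT_* enum, and ARRAY_SIZE() below sizes the stats set. */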
+static const char *ra_stat_string[] = {
+       [RA_STAT_HIT] = "hits",
+       [RA_STAT_MISS] = "misses",
+       [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+       [RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+       [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+       [RA_STAT_FAILED_MATCH] = "failed lock match",
+       [RA_STAT_DISCARDED] = "read but discarded",
+       [RA_STAT_ZERO_LEN] = "zero length file",
+       [RA_STAT_ZERO_WINDOW] = "zero size window",
+       [RA_STAT_EOF] = "read-ahead to EOF",
+       [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+       [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(llite, name);
+LPROC_SEQ_FOPS_RO_TYPE(llite, uuid);
+
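+/*
+ * Build the per-mount proc tree: a "<fsname>-<sb>" directory holding
+ * the seq files above, the "stats" and "read_ahead_stats" counter
+ * sets, and one subdirectory each for the MDC and OSC devices.  On
+ * any failure everything registered so far is torn down again.
+ */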
+int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
+                               struct super_block *sb, char *osc, char *mdc)
+{
+       struct lprocfs_vars lvars[2];
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct obd_device *obd;
+       proc_dir_entry_t *dir;
+       char name[MAX_STRING_SIZE + 1], *ptr;
+       int err, id, len, rc;
+       ENTRY;
+
+       memset(lvars, 0, sizeof(lvars));
+
+       name[MAX_STRING_SIZE] = '\0';
+       lvars[0].name = name;
+
+       LASSERT(sbi != NULL);
+       LASSERT(mdc != NULL);
+       LASSERT(osc != NULL);
+
+       /* Get fsname */
+       len = strlen(lsi->lsi_lmd->lmd_profile);
+       ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
+       if (ptr && (strcmp(ptr, "-client") == 0))
+               len -= 7;
+
+       /* Mount info */
+       snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len,
+                lsi->lsi_lmd->lmd_profile, sb);
+
+       sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL);
+       if (IS_ERR(sbi->ll_proc_root)) {
+               err = PTR_ERR(sbi->ll_proc_root);
+               sbi->ll_proc_root = NULL;
+               RETURN(err);
+       }
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
+                               &vvp_dump_pgcache_file_ops, sbi);
+       if (rc)
+               CWARN("Error adding the dump_page_cache file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644,
+                               &ll_rw_extents_stats_fops, sbi);
+       if (rc)
+               CWARN("Error adding the extents_stats file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process",
+                               0644, &ll_rw_extents_stats_pp_fops, sbi);
+       if (rc)
+               CWARN("Error adding the extents_stats_per_process file\n");
+
+       rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644,
+                               &ll_rw_offset_stats_fops, sbi);
+       if (rc)
+               CWARN("Error adding the offset_stats file\n");
+
+       /* File operations stats */
+       sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES,
+                                           LPROCFS_STATS_FLAG_NONE);
+       if (sbi->ll_stats == NULL)
+               GOTO(out, err = -ENOMEM);
+       /* do counter init */
+       for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) {
+               __u32 type = llite_opcode_table[id].type;
+               void *ptr = NULL;
+               if (type & LPROCFS_TYPE_REGS)
+                       ptr = "regs";
+               else if (type & LPROCFS_TYPE_BYTES)
+                       ptr = "bytes";
+               else if (type & LPROCFS_TYPE_PAGES)
+                       ptr = "pages";
+               lprocfs_counter_init(sbi->ll_stats,
+                                    llite_opcode_table[id].opcode,
+                                    (type & LPROCFS_CNTR_AVGMINMAX),
+                                    llite_opcode_table[id].opname, ptr);
+       }
+       err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats);
+       if (err)
+               GOTO(out, err);
+
+       sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
+                                              LPROCFS_STATS_FLAG_NONE);
+       if (sbi->ll_ra_stats == NULL)
+               GOTO(out, err = -ENOMEM);
+
+       for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
+               lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
+                                    ra_stat_string[id], "pages");
+       err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
+                                    sbi->ll_ra_stats);
+       if (err)
+               GOTO(out, err);
+
+       err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb);
+       if (err)
+               GOTO(out, err);
+
+       /* MDC info */
+       obd = class_name2obd(mdc);
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_name != NULL);
+
+       dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+       if (dir == NULL)
+               GOTO(out, err = -ENOMEM);
+
+       snprintf(name, MAX_STRING_SIZE, "common_name");
+       lvars[0].fops = &llite_name_fops;
+       err = lprocfs_add_vars(dir, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       snprintf(name, MAX_STRING_SIZE, "uuid");
+       lvars[0].fops = &llite_uuid_fops;
+       err = lprocfs_add_vars(dir, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       /* OSC */
+       obd = class_name2obd(osc);
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_name != NULL);
+
+       dir = proc_mkdir(obd->obd_type->typ_name, sbi->ll_proc_root);
+       if (dir == NULL)
+               GOTO(out, err = -ENOMEM);
+
+       snprintf(name, MAX_STRING_SIZE, "common_name");
+       lvars[0].fops = &llite_name_fops;
+       err = lprocfs_add_vars(dir, lvars, obd);
+       if (err)
+               GOTO(out, err);
+
+       snprintf(name, MAX_STRING_SIZE, "uuid");
+       lvars[0].fops = &llite_uuid_fops;
+       err = lprocfs_add_vars(dir, lvars, obd);
+out:
+       if (err) {
+               lprocfs_remove(&sbi->ll_proc_root);
+               lprocfs_free_stats(&sbi->ll_ra_stats);
+               lprocfs_free_stats(&sbi->ll_stats);
+       }
+       RETURN(err);
+}
+
+void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
+{
+       if (sbi->ll_proc_root) {
+               lprocfs_remove(&sbi->ll_proc_root);
+               lprocfs_free_stats(&sbi->ll_ra_stats);
+               lprocfs_free_stats(&sbi->ll_stats);
+       }
+}
+#undef MAX_STRING_SIZE
+
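+/* a as an integer percentage of b, guarding against b == 0. */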
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
+
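+/*
+ * Print one extent-size histogram: bucket i covers I/O sizes up to
+ * 1 << (LL_HIST_START + i) bytes, and start/end are rescaled through
+ * the "KMGTPEZY" unit string each time they pass 1024.
+ */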
+static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
+                                  struct seq_file *seq, int which)
+{
+       unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+       unsigned long start, end, r, w;
+       char *unitp = "KMGTPEZY";
+       int i, units = 10;
+       struct per_process_info *pp_info = &io_extents->pp_extents[which];
+
+       read_cum = 0;
+       write_cum = 0;
+       start = 0;
+
+       for (i = 0; i < LL_HIST_MAX; i++) {
+               read_tot += pp_info->pp_r_hist.oh_buckets[i];
+               write_tot += pp_info->pp_w_hist.oh_buckets[i];
+       }
+
+       for (i = 0; i < LL_HIST_MAX; i++) {
+               r = pp_info->pp_r_hist.oh_buckets[i];
+               w = pp_info->pp_w_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               end = 1 << (i + LL_HIST_START - units);
+               seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu  | "
+                          "%14lu %4lu %4lu\n", start, *unitp, end, *unitp,
+                          (i == LL_HIST_MAX - 1) ? '+' : ' ',
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+               start = end;
+               if (start == 1 << 10) {
+                       start = 1;
+                       units += 10;
+                       unitp++;
+               }
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+}
+
+static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int k;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       seq_printf(seq, "snapshot_time:  %lu.%06lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+       seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+                  "extents", "calls", "%", "cum%",
+                  "calls", "%", "cum%");
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (k = 0; k < LL_PROCESS_HIST_MAX; k++) {
+               if (io_extents->pp_extents[k].pid != 0) {
+                       seq_printf(seq, "\nPID: %d\n",
+                                  io_extents->pp_extents[k].pid);
+                       ll_display_extents_info(io_extents, seq, k);
+               }
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+       return 0;
+}
+
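+/*
+ * A write of 0 or "[Dd]isabled" switches r/w stats collection off;
+ * anything else switches it (back) on.  Either way the accumulated
+ * data is reset.  The same convention applies to the other stats
+ * write handlers below.
+ */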
+static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file,
+                                               const char *buf, size_t len,
+                                               loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int i;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                      strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               io_extents->pp_extents[i].pid = 0;
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats_pp);
+
+static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       seq_printf(seq, "snapshot_time:  %lu.%06lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+
+       seq_printf(seq, "%15s %19s       | %20s\n", " ", "read", "write");
+       seq_printf(seq, "%13s   %14s %4s %4s  | %14s %4s %4s\n",
+                  "extents", "calls", "%", "cum%",
+                  "calls", "%", "cum%");
+       spin_lock(&sbi->ll_lock);
+       ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX);
+       spin_unlock(&sbi->ll_lock);
+
+       return 0;
+}
+
+static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
+                                       size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+       int i;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                      strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+       spin_lock(&sbi->ll_pp_extent_lock);
+       for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
+               io_extents->pp_extents[i].pid = 0;
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist);
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_extents_stats);
+
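+/*
+ * Record one read or write: bump the matching extent-size histogram
+ * (per process and global) and maintain the per-process offset table,
+ * pushing an entry whenever a process turns discontiguous.  Both the
+ * process and offset arrays are fixed-size rings reused modulo
+ * LL_PROCESS_HIST_MAX / LL_OFFSET_HIST_MAX.
+ */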
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                      struct ll_file_data *file, loff_t pos,
+                      size_t count, int rw)
+{
+       int i, cur = -1;
+       struct ll_rw_process_info *process;
+       struct ll_rw_process_info *offset;
+       int *off_count = &sbi->ll_rw_offset_entry_count;
+       int *process_count = &sbi->ll_offset_process_count;
+       struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info;
+
+       if (!sbi->ll_rw_stats_on)
+               return;
+       process = sbi->ll_rw_process_info;
+       offset = sbi->ll_rw_offset_info;
+
+       spin_lock(&sbi->ll_pp_extent_lock);
+       /* Extent statistics */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (io_extents->pp_extents[i].pid == pid) {
+                       cur = i;
+                       break;
+               }
+       }
+
+       if (cur == -1) {
+               /* new process */
+               sbi->ll_extent_process_count =
+                       (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX;
+               cur = sbi->ll_extent_process_count;
+               io_extents->pp_extents[cur].pid = pid;
+               lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist);
+               lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist);
+       }
+
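+       /* Pick the histogram bucket: the smallest i (capped at
+        * LL_HIST_MAX - 1) such that count < 1 << (LL_HIST_START + i). */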
+       for (i = 0; (count >= (1 << LL_HIST_START << i)) &&
+            (i < (LL_HIST_MAX - 1)); i++)
+               ;
+       if (rw == 0) {
+               io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++;
+               io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++;
+       } else {
+               io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++;
+               io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++;
+       }
+       spin_unlock(&sbi->ll_pp_extent_lock);
+
+       spin_lock(&sbi->ll_process_lock);
+       /* Offset statistics */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (process[i].rw_pid == pid) {
+                       if (process[i].rw_last_file != file) {
+                               process[i].rw_range_start = pos;
+                               process[i].rw_last_file_pos = pos + count;
+                               process[i].rw_smallest_extent = count;
+                               process[i].rw_largest_extent = count;
+                               process[i].rw_offset = 0;
+                               process[i].rw_last_file = file;
+                               spin_unlock(&sbi->ll_process_lock);
+                               return;
+                       }
+                       if (process[i].rw_last_file_pos != pos) {
+                               *off_count =
+                                   (*off_count + 1) % LL_OFFSET_HIST_MAX;
+                               offset[*off_count].rw_op = process[i].rw_op;
+                               offset[*off_count].rw_pid = pid;
+                               offset[*off_count].rw_range_start =
+                                       process[i].rw_range_start;
+                               offset[*off_count].rw_range_end =
+                                       process[i].rw_last_file_pos;
+                               offset[*off_count].rw_smallest_extent =
+                                       process[i].rw_smallest_extent;
+                               offset[*off_count].rw_largest_extent =
+                                       process[i].rw_largest_extent;
+                               offset[*off_count].rw_offset =
+                                       process[i].rw_offset;
+                               process[i].rw_op = rw;
+                               process[i].rw_range_start = pos;
+                               process[i].rw_smallest_extent = count;
+                               process[i].rw_largest_extent = count;
+                               process[i].rw_offset = pos -
+                                       process[i].rw_last_file_pos;
+                       }
+                       if (process[i].rw_smallest_extent > count)
+                               process[i].rw_smallest_extent = count;
+                       if (process[i].rw_largest_extent < count)
+                               process[i].rw_largest_extent = count;
+                       process[i].rw_last_file_pos = pos + count;
+                       spin_unlock(&sbi->ll_process_lock);
+                       return;
+               }
+       }
+       *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
+       process[*process_count].rw_pid = pid;
+       process[*process_count].rw_op = rw;
+       process[*process_count].rw_range_start = pos;
+       process[*process_count].rw_last_file_pos = pos + count;
+       process[*process_count].rw_smallest_extent = count;
+       process[*process_count].rw_largest_extent = count;
+       process[*process_count].rw_offset = 0;
+       process[*process_count].rw_last_file = file;
+       spin_unlock(&sbi->ll_process_lock);
+}
+
+static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_process_info *offset = sbi->ll_rw_offset_info;
+       struct ll_rw_process_info *process = sbi->ll_rw_process_info;
+       int i;
+
+       do_gettimeofday(&now);
+
+       if (!sbi->ll_rw_stats_on) {
+               seq_printf(seq, "disabled\n"
+                               "write anything in this file to activate, "
+                               "then 0 or \"[D/d]isabled\" to deactivate\n");
+               return 0;
+       }
+       spin_lock(&sbi->ll_process_lock);
+
+       seq_printf(seq, "snapshot_time:  %lu.%06lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n",
+                  "R/W", "PID", "RANGE START", "RANGE END",
+                  "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET");
+       /* We stored the discontiguous offsets here; print them first */
+       for (i = 0; i < LL_OFFSET_HIST_MAX; i++) {
+               if (offset[i].rw_pid != 0)
+                       seq_printf(seq, "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+                                  offset[i].rw_op ? 'W' : 'R',
+                                  offset[i].rw_pid,
+                                  offset[i].rw_range_start,
+                                  offset[i].rw_range_end,
+                                  (unsigned long)offset[i].rw_smallest_extent,
+                                  (unsigned long)offset[i].rw_largest_extent,
+                                  offset[i].rw_offset);
+       }
+       /* Then print the current offsets for each process */
+       for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
+               if (process[i].rw_pid != 0)
+                       seq_printf(seq, "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu\n",
+                                  process[i].rw_op ? 'W' : 'R',
+                                  process[i].rw_pid,
+                                  process[i].rw_range_start,
+                                  process[i].rw_last_file_pos,
+                                  (unsigned long)process[i].rw_smallest_extent,
+                                  (unsigned long)process[i].rw_largest_extent,
+                                  process[i].rw_offset);
+       }
+       spin_unlock(&sbi->ll_process_lock);
+
+       return 0;
+}
+
+static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct ll_sb_info *sbi = seq->private;
+       struct ll_rw_process_info *process_info = sbi->ll_rw_process_info;
+       struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info;
+       int value = 1, rc = 0;
+
+       rc = lprocfs_write_helper(buf, len, &value);
+
+       if (rc < 0 && (strcmp(buf, "disabled") == 0 ||
+                          strcmp(buf, "Disabled") == 0))
+               value = 0;
+
+       if (value == 0)
+               sbi->ll_rw_stats_on = 0;
+       else
+               sbi->ll_rw_stats_on = 1;
+
+       spin_lock(&sbi->ll_process_lock);
+       sbi->ll_offset_process_count = 0;
+       sbi->ll_rw_offset_entry_count = 0;
+       memset(process_info, 0, sizeof(struct ll_rw_process_info) *
+              LL_PROCESS_HIST_MAX);
+       memset(offset_info, 0, sizeof(struct ll_rw_process_info) *
+              LL_OFFSET_HIST_MAX);
+       spin_unlock(&sbi->ll_process_lock);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(ll_rw_offset_stats);
+
+void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = NULL;
+       lvars->obd_vars    = lprocfs_llite_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
new file mode 100644 (file)
index 0000000..58d59aa
--- /dev/null
@@ -0,0 +1,1279 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include "llite_internal.h"
+
+static int ll_create_it(struct inode *, struct dentry *,
+                       int, struct lookup_intent *);
+
+/*
+ * Check if we have something mounted at the named dchild.
+ * In such a case there would always be a dentry present.
+ */
+static int ll_d_mountpoint(struct dentry *dparent, struct dentry *dchild,
+                          struct qstr *name)
+{
+       int mounted = 0;
+
+       if (unlikely(dchild)) {
+               mounted = d_mountpoint(dchild);
+       } else if (dparent) {
+               dchild = d_lookup(dparent, name);
+               if (dchild) {
+                       mounted = d_mountpoint(dchild);
+                       dput(dchild);
+               }
+       }
+       return mounted;
+}
+
+int ll_unlock(__u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+
+       ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+
+/* called from iget5_locked->find_inode() under inode_lock spinlock */
+static int ll_test_inode(struct inode *inode, void *opaque)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct lustre_md     *md = opaque;
+
+       if (unlikely(!(md->body->valid & OBD_MD_FLID))) {
+               CERROR("MDS body missing FID\n");
+               return 0;
+       }
+
+       if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1))
+               return 0;
+
+       return 1;
+}
+
+static int ll_set_inode(struct inode *inode, void *opaque)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct mdt_body *body = ((struct lustre_md *)opaque)->body;
+
+       if (unlikely(!(body->valid & OBD_MD_FLID))) {
+               CERROR("MDS body missing FID\n");
+               return -EINVAL;
+       }
+
+       lli->lli_fid = body->fid1;
+       if (unlikely(!(body->valid & OBD_MD_FLTYPE))) {
+               CERROR("Cannot initialize inode "DFID" without object type: "
+                      "valid = "LPX64"\n", PFID(&lli->lli_fid), body->valid);
+               return -EINVAL;
+       }
+
+       inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT);
+       if (unlikely(inode->i_mode == 0)) {
+               CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid));
+               return -EINVAL;
+       }
+
+       ll_lli_init(lli);
+
+       return 0;
+}
+
+
+/*
+ * Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns inode or NULL
+ */
+struct inode *ll_iget(struct super_block *sb, ino_t hash,
+                     struct lustre_md *md)
+{
+       struct inode     *inode;
+       ENTRY;
+
+       LASSERT(hash != 0);
+       inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md);
+
+       if (inode) {
+               if (inode->i_state & I_NEW) {
+                       int rc = 0;
+
+                       ll_read_inode2(inode, md);
+                       if (S_ISREG(inode->i_mode) &&
+                           ll_i2info(inode)->lli_clob == NULL) {
+                               CDEBUG(D_INODE,
+                                       "%s: apply lsm %p to inode "DFID".\n",
+                                       ll_get_fsname(sb, NULL, 0), md->lsm,
+                                       PFID(ll_inode2fid(inode)));
+                               rc = cl_file_inode_init(inode, md);
+                       }
+                       if (rc != 0) {
+                               make_bad_inode(inode);
+                               unlock_new_inode(inode);
+                               iput(inode);
+                               inode = ERR_PTR(rc);
+                       } else
+                               unlock_new_inode(inode);
+               } else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+                       ll_update_inode(inode, md);
+               CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+                      inode, PFID(&md->body->fid1));
+       }
+       RETURN(inode);
+}
+
+static void ll_invalidate_negative_children(struct inode *dir)
+{
+       struct dentry *dentry, *tmp_subdir;
+       struct ll_d_hlist_node *p;
+
+       ll_lock_dcache(dir);
+       ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_alias) {
+               spin_lock(&dentry->d_lock);
+               if (!list_empty(&dentry->d_subdirs)) {
+                       struct dentry *child;
+
+                       list_for_each_entry_safe(child, tmp_subdir,
+                                                &dentry->d_subdirs,
+                                                d_u.d_child) {
+                               if (child->d_inode == NULL)
+                                       d_lustre_invalidate(child, 1);
+                       }
+               }
+               spin_unlock(&dentry->d_lock);
+       }
+       ll_unlock_dcache(dir);
+}
+
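+/*
+ * Blocking AST for MDS inodebits locks.  LDLM_CB_BLOCKING starts an
+ * async cancel; LDLM_CB_CANCELING then drops whatever the lock bits
+ * were guarding: open handles, the layout, cached pages and negative
+ * children of a directory, and dentry aliases for the lookup/perm bits.
+ */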
+int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                      void *data, int flag)
+{
+       int rc;
+       struct lustre_handle lockh;
+       ENTRY;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               if (rc < 0) {
+                       CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+                       RETURN(rc);
+               }
+               break;
+       case LDLM_CB_CANCELING: {
+               struct inode *inode = ll_inode_from_resource_lock(lock);
+               struct ll_inode_info *lli;
+               __u64 bits = lock->l_policy_data.l_inodebits.bits;
+               struct lu_fid *fid;
+               ldlm_mode_t mode = lock->l_req_mode;
+
+               /* Inode is set to lock->l_resource->lr_lvb_inode
+                * for mdc - bug 24555 */
+               LASSERT(lock->l_ast_data == NULL);
+
+               /* Invalidate all dentries associated with this inode */
+               if (inode == NULL)
+                       break;
+
+               LASSERT(lock->l_flags & LDLM_FL_CANCELING);
+               /* For OPEN locks we differentiate between lock modes
+                * LCK_CR, LCK_CW, LCK_PR - bug 22891 */
+               if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE |
+                           MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM))
+                       ll_have_md_lock(inode, &bits, LCK_MINMODE);
+
+               if (bits & MDS_INODELOCK_OPEN)
+                       ll_have_md_lock(inode, &bits, mode);
+
+               fid = ll_inode2fid(inode);
+               if (lock->l_resource->lr_name.name[0] != fid_seq(fid) ||
+                   lock->l_resource->lr_name.name[1] != fid_oid(fid) ||
+                   lock->l_resource->lr_name.name[2] != fid_ver(fid)) {
+                       LDLM_ERROR(lock, "data mismatch with object "
+                                  DFID" (%p)", PFID(fid), inode);
+               }
+
+               if (bits & MDS_INODELOCK_OPEN) {
+                       int flags = 0;
+                       switch (lock->l_req_mode) {
+                       case LCK_CW:
+                               flags = FMODE_WRITE;
+                               break;
+                       case LCK_PR:
+                               flags = FMODE_EXEC;
+                               break;
+                       case LCK_CR:
+                               flags = FMODE_READ;
+                               break;
+                       default:
+                               CERROR("Unexpected lock mode for OPEN lock "
+                                      "%d, inode %ld\n", lock->l_req_mode,
+                                      inode->i_ino);
+                       }
+                       ll_md_real_close(inode, flags);
+               }
+
+               lli = ll_i2info(inode);
+               if (bits & MDS_INODELOCK_LAYOUT) {
+                       struct cl_object_conf conf = { { 0 } };
+
+                       conf.coc_opc = OBJECT_CONF_INVALIDATE;
+                       conf.coc_inode = inode;
+                       rc = ll_layout_conf(inode, &conf);
+                       if (rc)
+                               CDEBUG(D_INODE, "invalidating layout %d.\n", rc);
+               }
+
+               if (bits & MDS_INODELOCK_UPDATE)
+                       lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
+
+               if (S_ISDIR(inode->i_mode) &&
+                    (bits & MDS_INODELOCK_UPDATE)) {
+                       CDEBUG(D_INODE, "invalidating inode %lu\n",
+                              inode->i_ino);
+                       truncate_inode_pages(inode->i_mapping, 0);
+                       ll_invalidate_negative_children(inode);
+               }
+
+               if (inode->i_sb->s_root &&
+                   inode != inode->i_sb->s_root->d_inode &&
+                   (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)))
+                       ll_invalidate_aliases(inode);
+               iput(inode);
+               break;
+       }
+       default:
+               LBUG();
+       }
+
+       RETURN(0);
+}
+
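+/* Return i_gid as the supplementary group to pack if the caller is a
+ * member of it, or -1 to indicate "none". */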
+__u32 ll_i2suppgid(struct inode *i)
+{
+       if (current_is_in_group(i->i_gid))
+               return (__u32)i->i_gid;
+       else
+               return (__u32)(-1);
+}
+
+/* Pack the required supplementary groups into the supplied groups array.
+ * If we don't need to use the groups from the target inode(s) then we
+ * instead pack one or more groups from the user's supplementary group
+ * array in case it might be useful.  Not needed if doing an MDS-side upcall. */
+void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2)
+{
+#if 0
+       int i;
+#endif
+
+       LASSERT(i1 != NULL);
+       LASSERT(suppgids != NULL);
+
+       suppgids[0] = ll_i2suppgid(i1);
+
+       if (i2)
+               suppgids[1] = ll_i2suppgid(i2);
+       else
+               suppgids[1] = -1;
+
+#if 0
+       for (i = 0; i < current_ngroups; i++) {
+               if (suppgids[0] == -1) {
+                       if (current_groups[i] != suppgids[1])
+                               suppgids[0] = current_groups[i];
+                       continue;
+               }
+               if (suppgids[1] == -1) {
+                       if (current_groups[i] != suppgids[0])
+                               suppgids[1] = current_groups[i];
+                       continue;
+               }
+               break;
+       }
+#endif
+}
+
+/*
+ * try to reuse three types of dentry:
+ * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid
+ *    by concurrent .revalidate).
+ * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may
+ *    be cleared by others calling d_lustre_revalidate).
+ * 3. DISCONNECTED alias.
+ */
+static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry)
+{
+       struct dentry *alias, *discon_alias, *invalid_alias;
+       struct ll_d_hlist_node *p;
+
+       if (ll_d_hlist_empty(&inode->i_dentry))
+               return NULL;
+
+       discon_alias = invalid_alias = NULL;
+
+       ll_lock_dcache(inode);
+       ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+               LASSERT(alias != dentry);
+
+               spin_lock(&alias->d_lock);
+               if (alias->d_flags & DCACHE_DISCONNECTED)
+                       /* LASSERT(last_discon == NULL); LU-405, bz 20055 */
+                       discon_alias = alias;
+               else if (alias->d_parent == dentry->d_parent &&
+                        alias->d_name.hash == dentry->d_name.hash &&
+                        alias->d_name.len == dentry->d_name.len &&
+                        memcmp(alias->d_name.name, dentry->d_name.name,
+                               dentry->d_name.len) == 0)
+                       invalid_alias = alias;
+               spin_unlock(&alias->d_lock);
+
+               if (invalid_alias)
+                       break;
+       }
+       alias = invalid_alias ?: discon_alias ?: NULL;
+       if (alias) {
+               spin_lock(&alias->d_lock);
+               dget_dlock(alias);
+               spin_unlock(&alias->d_lock);
+       }
+       ll_unlock_dcache(inode);
+
+       return alias;
+}
+
+/*
+ * Similar to d_splice_alias(), but Lustre treats an invalid alias
+ * much like DCACHE_DISCONNECTED and tries to reuse it anyway.
+ */
+struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de)
+{
+       struct dentry *new;
+
+       if (inode) {
+               new = ll_find_alias(inode, de);
+               if (new) {
+                       ll_dops_init(new, 1, 1);
+                       d_move(new, de);
+                       iput(inode);
+                       CDEBUG(D_DENTRY,
+                              "Reuse dentry %p inode %p refc %d flags %#x\n",
+                             new, new->d_inode, d_refcount(new), new->d_flags);
+                       return new;
+               }
+       }
+       ll_dops_init(de, 1, 1);
+       __d_lustre_invalidate(de);
+       d_add(de, inode);
+       CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n",
+              de, de->d_inode, d_refcount(de), de->d_flags);
+       return de;
+}
+
+int ll_lookup_it_finish(struct ptlrpc_request *request,
+                       struct lookup_intent *it, void *data)
+{
+       struct it_cb_data *icbd = data;
+       struct dentry **de = icbd->icbd_childp;
+       struct inode *parent = icbd->icbd_parent;
+       struct inode *inode = NULL;
+       __u64 bits = 0;
+       int rc;
+       ENTRY;
+
+       /* NB 1 request reference will be taken away by ll_intent_lock()
+        * when I return */
+       CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it,
+              it->d.lustre.it_disposition);
+       if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+               rc = ll_prep_inode(&inode, request, (*de)->d_sb, it);
+               if (rc)
+                       RETURN(rc);
+
+               ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits);
+
+               /* We used to query real size from OSTs here, but actually
+                  this is not needed. For stat() calls size would be updated
+                  from subsequent do_revalidate()->ll_inode_revalidate_it() in
+                  2.4 and
+                  vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
+                  Everybody else who needs correct file size would call
+                  ll_glimpse_size or some equivalent themselves anyway.
+                  Also see bug 7198. */
+       }
+
+       /* Only hash *de if it is unhashed (new dentry).
+        * atomic_open may pass in hashed dentries for open.
+        */
+       if (d_unhashed(*de))
+               *de = ll_splice_alias(inode, *de);
+
+       if (!it_disposition(it, DISP_LOOKUP_NEG)) {
+               /* We have the lookup lock - unhide the dentry. */
+               if (bits & MDS_INODELOCK_LOOKUP)
+                       d_lustre_revalidate(*de);
+       } else if (!it_disposition(it, DISP_OPEN_CREATE)) {
+               /* If file created on server, don't depend on parent UPDATE
+                * lock to unhide it. It is left hidden and next lookup can
+                * find it in ll_splice_alias.
+                */
+               /* Check that parent has UPDATE lock. */
+               struct lookup_intent parent_it = {
+                                       .it_op = IT_GETATTR,
+                                       .d.lustre.it_lock_handle = 0 };
+
+               if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it,
+                                      &ll_i2info(parent)->lli_fid, NULL)) {
+                       d_lustre_revalidate(*de);
+                       ll_intent_release(&parent_it);
+               }
+       }
+
+       RETURN(0);
+}
+
+static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
+                                  struct lookup_intent *it, int lookup_flags)
+{
+       struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
+       struct dentry *save = dentry, *retval;
+       struct ptlrpc_request *req = NULL;
+       struct md_op_data *op_data;
+       struct it_cb_data icbd;
+       __u32 opc;
+       int rc;
+       ENTRY;
+
+       if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen)
+               RETURN(ERR_PTR(-ENAMETOOLONG));
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+              dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+              parent->i_generation, parent, LL_IT2STR(it));
+
+       if (d_mountpoint(dentry))
+               CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it));
+
+       ll_frob_intent(&it, &lookup_it);
+
+       /* As do_lookup is called before follow_mount, root dentry may be left
+        * not valid, revalidate it here. */
+       if (parent->i_sb->s_root && (parent->i_sb->s_root->d_inode == parent) &&
+           (it->it_op & (IT_OPEN | IT_CREAT))) {
+               rc = ll_inode_revalidate_it(parent->i_sb->s_root, it,
+                                           MDS_INODELOCK_LOOKUP);
+               if (rc)
+                       RETURN(ERR_PTR(rc));
+       }
+
+       if (it->it_op == IT_GETATTR) {
+               rc = ll_statahead_enter(parent, &dentry, 0);
+               if (rc == 1) {
+                       if (dentry == save)
+                               GOTO(out, retval = NULL);
+                       GOTO(out, retval = dentry);
+               }
+       }
+
+       icbd.icbd_childp = &dentry;
+       icbd.icbd_parent = parent;
+
+       if (it->it_op & IT_CREAT ||
+           (it->it_op & IT_OPEN && it->it_create_mode & O_CREAT))
+               opc = LUSTRE_OPC_CREATE;
+       else
+               opc = LUSTRE_OPC_ANY;
+
+       op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name,
+                                    dentry->d_name.len, lookup_flags, opc,
+                                    NULL);
+       if (IS_ERR(op_data))
+               RETURN((void *)op_data);
+
+       /* enforce umask if acl disabled or MDS doesn't support umask */
+       if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
+               it->it_create_mode &= ~current_umask();
+
+       rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it,
+                           lookup_flags, &req, ll_md_blocking_ast, 0);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0)
+               GOTO(out, retval = ERR_PTR(rc));
+
+       rc = ll_lookup_it_finish(req, it, &icbd);
+       if (rc != 0) {
+               ll_intent_release(it);
+               GOTO(out, retval = ERR_PTR(rc));
+       }
+
+       if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+           !S_ISREG(dentry->d_inode->i_mode) &&
+           !S_ISDIR(dentry->d_inode->i_mode)) {
+               ll_release_openhandle(dentry, it);
+       }
+       ll_lookup_finish_locks(it, dentry);
+
+       if (dentry == save)
+               GOTO(out, retval = NULL);
+       else
+               GOTO(out, retval = dentry);
+ out:
+       if (req)
+               ptlrpc_req_finished(req);
+       if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry))
+               ll_statahead_mark(parent, dentry);
+       return retval;
+}
+
+static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
+                                  unsigned int flags)
+{
+       struct lookup_intent *itp, it = { .it_op = IT_GETATTR };
+       struct dentry *de;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),flags=%u\n",
+              dentry->d_name.len, dentry->d_name.name, parent->i_ino,
+              parent->i_generation, parent, flags);
+
+       /* Optimize away (CREATE && !OPEN). Let .create handle the race. */
+       if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN)) {
+               ll_dops_init(dentry, 1, 1);
+               __d_lustre_invalidate(dentry);
+               d_add(dentry, NULL);
+               return NULL;
+       }
+
+       if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE))
+               itp = NULL;
+       else
+               itp = &it;
+       de = ll_lookup_it(parent, dentry, itp, 0);
+
+       if (itp != NULL)
+               ll_intent_release(itp);
+
+       return de;
+}
+
+/*
+ * For cached negative dentry and new dentry, handle lookup/create/open
+ * together.
+ */
+static int ll_atomic_open(struct inode *dir, struct dentry *dentry,
+                         struct file *file, unsigned open_flags,
+                         umode_t mode, int *opened)
+{
+       struct lookup_intent *it;
+       struct dentry *de;
+       long long lookup_flags = LOOKUP_OPEN;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),file %p,"
+                          "open_flags %x,mode %x opened %d\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, file, open_flags, mode, *opened);
+
+       OBD_ALLOC(it, sizeof(*it));
+       if (!it)
+               RETURN(-ENOMEM);
+
+       it->it_op = IT_OPEN;
+       if (mode) {
+               it->it_op |= IT_CREAT;
+               lookup_flags |= LOOKUP_CREATE;
+       }
+       it->it_create_mode = (mode & S_IALLUGO) | S_IFREG;
+       it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags);
+
+       /* Dentry added to dcache tree in ll_lookup_it */
+       de = ll_lookup_it(dir, dentry, it, lookup_flags);
+       if (IS_ERR(de))
+               rc = PTR_ERR(de);
+       else if (de != NULL)
+               dentry = de;
+
+       if (!rc) {
+               if (it_disposition(it, DISP_OPEN_CREATE)) {
+                       /* Dentry instantiated in ll_create_it. */
+                       rc = ll_create_it(dir, dentry, mode, it);
+                       if (rc) {
+                               /* We dget in ll_splice_alias. */
+                               if (de != NULL)
+                                       dput(de);
+                               goto out_release;
+                       }
+
+                       *opened |= FILE_CREATED;
+               }
+               if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) {
+                       /* Open dentry. */
+                       if (S_ISFIFO(dentry->d_inode->i_mode)) {
+                               /* We cannot call open here as it would
+                                * deadlock.
+                                */
+                               if (it_disposition(it, DISP_ENQ_OPEN_REF))
+                                       ptlrpc_req_finished(
+                                                      (struct ptlrpc_request *)
+                                                         it->d.lustre.it_data);
+                               rc = finish_no_open(file, de);
+                       } else {
+                               file->private_data = it;
+                               rc = finish_open(file, dentry, NULL, opened);
+                               /* We dget in ll_splice_alias. finish_open takes
+                                * care of dget for fd open.
+                                */
+                               if (de != NULL)
+                                       dput(de);
+                       }
+               } else {
+                       rc = finish_no_open(file, de);
+               }
+       }
+
+out_release:
+       ll_intent_release(it);
+       OBD_FREE(it, sizeof(*it));
+
+       RETURN(rc);
+}
+
+
+/* We depend on "mode" being set with the proper file type/umask by now */
+static struct inode *ll_create_node(struct inode *dir, const char *name,
+                                   int namelen, const void *data, int datalen,
+                                   int mode, __u64 extra,
+                                   struct lookup_intent *it)
+{
+       struct inode *inode = NULL;
+       struct ptlrpc_request *request = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int rc;
+       ENTRY;
+
+       LASSERT(it && it->d.lustre.it_disposition);
+
+       LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF));
+       request = it->d.lustre.it_data;
+       it_clear_disposition(it, DISP_ENQ_CREATE_REF);
+       rc = ll_prep_inode(&inode, request, dir->i_sb, it);
+       if (rc)
+               GOTO(out, inode = ERR_PTR(rc));
+
+       LASSERT(ll_d_hlist_empty(&inode->i_dentry));
+
+       /* We asked for a lock on the directory, but were granted a
+        * lock on the inode.  Since we finally have an inode pointer,
+        * stuff it in the lock. */
+       CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
+              inode, inode->i_ino, inode->i_generation);
+       ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL);
+       EXIT;
+ out:
+       ptlrpc_req_finished(request);
+       return inode;
+}
+
+/*
+ * By the time this is called, we already have created the directory cache
+ * entry for the new file, but it is so far negative - it has no inode.
+ *
+ * We defer creating the OBD object(s) until open, to keep the intent and
+ * non-intent code paths similar, and also because we do not have the MDS
+ * inode number before calling ll_create_node() (which is needed for LOV),
+ * so we would need to do yet another RPC to the MDS to store the LOV EA
+ * data on the MDS.  If needed, we would pass the PACKED lmm as data and
+ * lmm_size in datalen (the MDS still has code which will handle that).
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
+                       struct lookup_intent *it)
+{
+       struct inode *inode;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, LL_IT2STR(it));
+
+       rc = it_open_error(DISP_OPEN_CREATE, it);
+       if (rc)
+               RETURN(rc);
+
+       inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
+                              NULL, 0, mode, 0, it);
+       if (IS_ERR(inode))
+               RETURN(PTR_ERR(inode));
+
+       if (filename_is_volatile(dentry->d_name.name, dentry->d_name.len, NULL))
+               ll_i2info(inode)->lli_volatile = true;
+
+       d_instantiate(dentry, inode);
+       RETURN(0);
+}
+
+static void ll_update_times(struct ptlrpc_request *request,
+                           struct inode *inode)
+{
+       struct mdt_body *body = req_capsule_server_get(&request->rq_pill,
+                                                      &RMF_MDT_BODY);
+
+       LASSERT(body);
+       if (body->valid & OBD_MD_FLMTIME &&
+           body->mtime > LTIME_S(inode->i_mtime)) {
+               CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n",
+                      inode->i_ino, LTIME_S(inode->i_mtime), body->mtime);
+               LTIME_S(inode->i_mtime) = body->mtime;
+       }
+       if (body->valid & OBD_MD_FLCTIME &&
+           body->ctime > LTIME_S(inode->i_ctime))
+               LTIME_S(inode->i_ctime) = body->ctime;
+}
+
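+/*
+ * Common creation path for mknod/mkdir/symlink: one md_create() RPC
+ * carrying the caller's fsuid/fsgid/capabilities; on success the
+ * parent's times are refreshed and, when a dentry was supplied, the
+ * new inode is instantiated from the reply.
+ */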
+static int ll_new_node(struct inode *dir, struct qstr *name,
+                      const char *tgt, int mode, int rdev,
+                      struct dentry *dchild, __u32 opc)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       struct inode *inode = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       int tgt_len = 0;
+       int err;
+
+       ENTRY;
+       if (unlikely(tgt != NULL))
+               tgt_len = strlen(tgt) + 1;
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+                                    name->len, 0, opc, NULL);
+       if (IS_ERR(op_data))
+               GOTO(err_exit, err = PTR_ERR(op_data));
+
+       err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode,
+                       current_fsuid(), current_fsgid(),
+                       cfs_curproc_cap_pack(), rdev, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(err_exit, err);
+
+       ll_update_times(request, dir);
+
+       if (dchild) {
+               err = ll_prep_inode(&inode, request, dchild->d_sb, NULL);
+               if (err)
+                       GOTO(err_exit, err);
+
+               d_instantiate(dchild, inode);
+       }
+       EXIT;
+err_exit:
+       ptlrpc_req_finished(request);
+
+       return err;
+}
+
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+                           unsigned rdev, struct dentry *dchild)
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir,
+              mode, rdev);
+
+       if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+               mode &= ~current_umask();
+
+       switch (mode & S_IFMT) {
+       case 0:
+               mode |= S_IFREG; /* for mode = 0 case, fallthrough */
+       case S_IFREG:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFIFO:
+       case S_IFSOCK:
+               err = ll_new_node(dir, name, NULL, mode, rdev, dchild,
+                                 LUSTRE_OPC_MKNOD);
+               break;
+       case S_IFDIR:
+               err = -EPERM;
+               break;
+       default:
+               err = -EINVAL;
+       }
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1);
+
+       RETURN(err);
+}
+
+/*
+ * Plain create. Intent create is handled in atomic_open.
+ */
+static int ll_create_nd(struct inode *dir, struct dentry *dentry,
+                       umode_t mode, bool want_excl)
+{
+       int rc;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),"
+                          "flags=%u, excl=%d\n",
+              dentry->d_name.len, dentry->d_name.name, dir->i_ino,
+              dir->i_generation, dir, mode, want_excl);
+
+       rc = ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
+
+       ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n",
+              dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry));
+
+       return rc;
+}
+
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+                             const char *tgt, struct dentry *dchild)
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%.*s\n",
+              name->len, name->name, dir->i_ino, dir->i_generation,
+              dir, 3000, tgt);
+
+       err = ll_new_node(dir, name, (char *)tgt, S_IFLNK | S_IRWXUGO,
+                         0, dchild, LUSTRE_OPC_SYMLINK);
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1);
+
+       RETURN(err);
+}
+
+static int ll_link_generic(struct inode *src,  struct inode *dir,
+                          struct qstr *name, struct dentry *dchild)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int err;
+
+       ENTRY;
+       CDEBUG(D_VFSTRACE,
+              "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
+              src->i_ino, src->i_generation, src, dir->i_ino,
+              dir->i_generation, dir, name->len, name->name);
+
+       op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len,
+                                    0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       err = md_link(sbi->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (err)
+               GOTO(out, err);
+
+       ll_update_times(request, dir);
+       ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1);
+       EXIT;
+out:
+       ptlrpc_req_finished(request);
+       RETURN(err);
+}
+
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name,
+                           int mode, struct dentry *dchild)
+{
+       int err;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
+               mode &= ~current_umask();
+       mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR;
+       err = ll_new_node(dir, name, NULL, mode, 0, dchild, LUSTRE_OPC_MKDIR);
+
+       if (!err)
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1);
+
+       RETURN(err);
+}
+
+/* Try to find the child dentry by its name.
+ * If found, put the resulting fid into @fid. */
+static void ll_get_child_fid(struct inode *dir, struct qstr *name,
+                            struct lu_fid *fid)
+{
+       struct dentry *parent, *child;
+
+       parent = ll_d_hlist_entry(dir->i_dentry, struct dentry, d_alias);
+       child = d_lookup(parent, name);
+       if (child) {
+               if (child->d_inode)
+                       *fid = *ll_inode2fid(child->d_inode);
+               dput(child);
+       }
+}
+
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+                           struct dentry *dchild, struct qstr *name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len,
+                                    S_IFDIR, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(dir, name, &op_data->op_fid3);
+       op_data->op_fid2 = op_data->op_fid3;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc == 0) {
+               ll_update_times(request, dir);
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+       }
+
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
+/**
+ * Remove a directory entry.
+ */
+int ll_rmdir_entry(struct inode *dir, char *name, int namelen)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              namelen, name, dir->i_ino, dir->i_generation, dir);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name),
+                                    S_IFDIR, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+       op_data->op_cli_flags |= CLI_RM_ENTRY;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc == 0) {
+               ll_update_times(request, dir);
+               ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1);
+       }
+
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
+int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
+{
+       struct mdt_body *body;
+       struct lov_mds_md *eadata;
+       struct lov_stripe_md *lsm = NULL;
+       struct obd_trans_info oti = { 0 };
+       struct obdo *oa;
+       struct obd_capa *oc = NULL;
+       int rc;
+       ENTRY;
+
+       /* req is swabbed so this is safe */
+       body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+       if (!(body->valid & OBD_MD_FLEASIZE))
+               RETURN(0);
+
+       if (body->eadatasize == 0) {
+               CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n");
+               GOTO(out, rc = -EPROTO);
+       }
+
+       /* The MDS sent back the EA because we unlinked the last reference
+        * to this file. Use this EA to unlink the objects on the OST.
+        * It's opaque so we don't swab here; we leave it to obd_unpackmd() to
+        * check it is complete and sensible. */
+       eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD,
+                                             body->eadatasize);
+       LASSERT(eadata != NULL);
+
+       rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize);
+       if (rc < 0) {
+               CERROR("obd_unpackmd: %d\n", rc);
+               GOTO(out, rc);
+       }
+       LASSERT(rc >= sizeof(*lsm));
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               GOTO(out_free_memmd, rc = -ENOMEM);
+
+       oa->o_oi = lsm->lsm_oi;
+       oa->o_mode = body->mode & S_IFMT;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP;
+
+       if (body->valid & OBD_MD_FLCOOKIE) {
+               oa->o_valid |= OBD_MD_FLCOOKIE;
+               oti.oti_logcookies =
+                       req_capsule_server_sized_get(&request->rq_pill,
+                                                    &RMF_LOGCOOKIES,
+                                                  sizeof(struct llog_cookie) *
+                                                    lsm->lsm_stripe_count);
+               if (oti.oti_logcookies == NULL) {
+                       oa->o_valid &= ~OBD_MD_FLCOOKIE;
+                       body->valid &= ~OBD_MD_FLCOOKIE;
+               }
+       }
+
+       if (body->valid & OBD_MD_FLOSSCAPA) {
+               rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc);
+               if (rc)
+                       GOTO(out_free_memmd, rc);
+       }
+
+       rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti,
+                        ll_i2mdexp(dir), oc);
+       capa_put(oc);
+       if (rc)
+               CERROR("obd destroy objid "DOSTID" error %d\n",
+                      POSTID(&lsm->lsm_oi), rc);
+out_free_memmd:
+       obd_free_memmd(ll_i2dtexp(dir), &lsm);
+       OBDO_FREE(oa);
+out:
+       return rc;
+}
+
+/* ll_unlink_generic() doesn't update the inode with the new link count.
+ * Instead, ll_ddelete() and ll_d_iput() will update it based upon whether
+ * any lock still exists. They also recycle dentries and inodes based upon
+ * locks. b=20433 */
+static int ll_unlink_generic(struct inode *dir, struct dentry *dparent,
+                            struct dentry *dchild, struct qstr *name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct md_op_data *op_data;
+       int rc;
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
+              name->len, name->name, dir->i_ino, dir->i_generation, dir);
+
+       /*
+        * XXX: unlink bind mountpoint maybe call to here,
+        * just check it as vfs_unlink does.
+        */
+       if (unlikely(ll_d_mountpoint(dparent, dchild, name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name,
+                                    name->len, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(dir, name, &op_data->op_fid3);
+       op_data->op_fid2 = op_data->op_fid3;
+       rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request);
+       ll_finish_md_op_data(op_data);
+       if (rc)
+               GOTO(out, rc);
+
+       ll_update_times(request, dir);
+       ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1);
+
+       rc = ll_objects_destroy(request, dir);
+out:
+       ptlrpc_req_finished(request);
+       RETURN(rc);
+}
+
+static int ll_rename_generic(struct inode *src, struct dentry *src_dparent,
+                            struct dentry *src_dchild, struct qstr *src_name,
+                            struct inode *tgt, struct dentry *tgt_dparent,
+                            struct dentry *tgt_dchild, struct qstr *tgt_name)
+{
+       struct ptlrpc_request *request = NULL;
+       struct ll_sb_info *sbi = ll_i2sbi(src);
+       struct md_op_data *op_data;
+       int err;
+       ENTRY;
+       CDEBUG(D_VFSTRACE, "VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
+              "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+              src->i_ino, src->i_generation, src, tgt_name->len,
+              tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
+
+       if (unlikely(ll_d_mountpoint(src_dparent, src_dchild, src_name) ||
+           ll_d_mountpoint(tgt_dparent, tgt_dchild, tgt_name)))
+               RETURN(-EBUSY);
+
+       op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       ll_get_child_fid(src, src_name, &op_data->op_fid3);
+       ll_get_child_fid(tgt, tgt_name, &op_data->op_fid4);
+       err = md_rename(sbi->ll_md_exp, op_data,
+                       src_name->name, src_name->len,
+                       tgt_name->name, tgt_name->len, &request);
+       ll_finish_md_op_data(op_data);
+       if (!err) {
+               ll_update_times(request, src);
+               ll_update_times(request, tgt);
+               ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1);
+               err = ll_objects_destroy(request, src);
+       }
+
+       ptlrpc_req_finished(request);
+
+       RETURN(err);
+}
+
+static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode,
+                   dev_t rdev)
+{
+       return ll_mknod_generic(dir, &dchild->d_name, mode,
+                               old_encode_dev(rdev), dchild);
+}
+
+static int ll_unlink(struct inode *dir, struct dentry *dentry)
+{
+       return ll_unlink_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode)
+{
+       return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry);
+}
+
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       return ll_rmdir_generic(dir, NULL, dentry, &dentry->d_name);
+}
+
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+                     const char *oldname)
+{
+       return ll_symlink_generic(dir, &dentry->d_name, oldname, dentry);
+}
+
+static int ll_link(struct dentry *old_dentry, struct inode *dir,
+                  struct dentry *new_dentry)
+{
+       return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name,
+                              new_dentry);
+}
+
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+                    struct inode *new_dir, struct dentry *new_dentry)
+{
+       int err;
+       err = ll_rename_generic(old_dir, NULL,
+                                old_dentry, &old_dentry->d_name,
+                                new_dir, NULL, new_dentry,
+                                &new_dentry->d_name);
+       if (!err)
+               d_move(old_dentry, new_dentry);
+       return err;
+}
+
+struct inode_operations ll_dir_inode_operations = {
+       .mknod          = ll_mknod,
+       .atomic_open    = ll_atomic_open,
+       .lookup         = ll_lookup_nd,
+       .create         = ll_create_nd,
+       /* We need all these non-raw things for NFSD, to not patch it. */
+       .unlink         = ll_unlink,
+       .mkdir          = ll_mkdir,
+       .rmdir          = ll_rmdir,
+       .symlink        = ll_symlink,
+       .link           = ll_link,
+       .rename         = ll_rename,
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
+
+struct inode_operations ll_special_inode_operations = {
+       .setattr        = ll_setattr,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+       .get_acl        = ll_get_acl,
+};
diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c
new file mode 100644 (file)
index 0000000..68b2dc4
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/remote_perm.c
+ *
+ * Lustre Permission Cache for Remote Client
+ *
+ * Author: Lai Siyao <lsy@clusterfs.com>
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+#include "llite_internal.h"
+
+struct kmem_cache *ll_remote_perm_cachep;
+struct kmem_cache *ll_rmtperm_hash_cachep;
+
+static inline struct ll_remote_perm *alloc_ll_remote_perm(void)
+{
+       struct ll_remote_perm *lrp;
+
+       OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL);
+       if (lrp)
+               INIT_HLIST_NODE(&lrp->lrp_list);
+       return lrp;
+}
+
+static inline void free_ll_remote_perm(struct ll_remote_perm *lrp)
+{
+       if (!lrp)
+               return;
+
+       if (!hlist_unhashed(&lrp->lrp_list))
+               hlist_del(&lrp->lrp_list);
+       OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp));
+}
+
+struct hlist_head *alloc_rmtperm_hash(void)
+{
+       struct hlist_head *hash;
+       int i;
+
+       OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep,
+                          REMOTE_PERM_HASHSIZE * sizeof(*hash),
+                          GFP_IOFS);
+       if (!hash)
+               return NULL;
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+               INIT_HLIST_HEAD(hash + i);
+
+       return hash;
+}
+
+void free_rmtperm_hash(struct hlist_head *hash)
+{
+       int i;
+       struct ll_remote_perm *lrp;
+       struct hlist_node *next;
+
+       if (!hash)
+               return;
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++)
+               hlist_for_each_entry_safe(lrp, next, hash + i, lrp_list)
+                       free_ll_remote_perm(lrp);
+       OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep,
+                     REMOTE_PERM_HASHSIZE * sizeof(*hash));
+}
+
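+/* Hash a uid to a bucket index; REMOTE_PERM_HASHSIZE must be a power of
+ * two for the mask below to work.  E.g. with a (hypothetical) size of 16,
+ * uid 1000 maps to bucket 1000 & 15 == 8. */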
+static inline int remote_perm_hashfunc(uid_t uid)
+{
+       return uid & (REMOTE_PERM_HASHSIZE - 1);
+}
+
+/* NB: setxid permission is not checked here; instead it's done on the
+ * MDT when the client gets the remote permission. */
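+/* For example (illustrative values): with lrp_access_perm == (MAY_READ |
+ * MAY_EXEC), a MAY_READ check passes while a MAY_WRITE check fails with
+ * -EACCES. */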
+static int do_check_remote_perm(struct ll_inode_info *lli, int mask)
+{
+       struct hlist_head *head;
+       struct ll_remote_perm *lrp;
+       int found = 0, rc;
+       ENTRY;
+
+       if (!lli->lli_remote_perms)
+               RETURN(-ENOENT);
+
+       head = lli->lli_remote_perms + remote_perm_hashfunc(current_uid());
+
+       spin_lock(&lli->lli_lock);
+       hlist_for_each_entry(lrp, head, lrp_list) {
+               if (lrp->lrp_uid != current_uid())
+                       continue;
+               if (lrp->lrp_gid != current_gid())
+                       continue;
+               if (lrp->lrp_fsuid != current_fsuid())
+                       continue;
+               if (lrp->lrp_fsgid != current_fsgid())
+                       continue;
+               found = 1;
+               break;
+       }
+
+       if (!found)
+               GOTO(out, rc = -ENOENT);
+
+       CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n",
+              lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+              lrp->lrp_access_perm);
+       rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES;
+
+out:
+       spin_unlock(&lli->lli_lock);
+       return rc;
+}
+
+int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_remote_perm *lrp = NULL, *tmp = NULL;
+       struct hlist_head *head, *perm_hash = NULL;
+       ENTRY;
+
+       LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT);
+
+#if 0
+       if (perm->rp_uid != current->uid ||
+           perm->rp_gid != current->gid ||
+           perm->rp_fsuid != current->fsuid ||
+           perm->rp_fsgid != current->fsgid) {
+               /* user might setxid in this small period */
+               CDEBUG(D_SEC,
+                      "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n",
+                      perm->rp_uid, perm->rp_gid, perm->rp_fsuid,
+                      perm->rp_fsgid, current->uid, current->gid,
+                      current->fsuid, current->fsgid);
+               RETURN(-EAGAIN);
+       }
+#endif
+
+       if (!lli->lli_remote_perms) {
+               perm_hash = alloc_rmtperm_hash();
+               if (perm_hash == NULL) {
+                       CERROR("alloc lli_remote_perms failed!\n");
+                       RETURN(-ENOMEM);
+               }
+       }
+
+       spin_lock(&lli->lli_lock);
+
+       if (!lli->lli_remote_perms)
+               lli->lli_remote_perms = perm_hash;
+       else if (perm_hash)
+               free_rmtperm_hash(perm_hash);
+
+       head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid);
+
+again:
+       hlist_for_each_entry(tmp, head, lrp_list) {
+               if (tmp->lrp_uid != perm->rp_uid)
+                       continue;
+               if (tmp->lrp_gid != perm->rp_gid)
+                       continue;
+               if (tmp->lrp_fsuid != perm->rp_fsuid)
+                       continue;
+               if (tmp->lrp_fsgid != perm->rp_fsgid)
+                       continue;
+               if (lrp)
+                       free_ll_remote_perm(lrp);
+               lrp = tmp;
+               break;
+       }
+
+       if (!lrp) {
+               spin_unlock(&lli->lli_lock);
+               lrp = alloc_ll_remote_perm();
+               if (!lrp) {
+                       CERROR("alloc memory for ll_remote_perm failed!\n");
+                       RETURN(-ENOMEM);
+               }
+               spin_lock(&lli->lli_lock);
+               goto again;
+       }
+
+       lrp->lrp_access_perm = perm->rp_access_perm;
+       if (lrp != tmp) {
+               lrp->lrp_uid   = perm->rp_uid;
+               lrp->lrp_gid   = perm->rp_gid;
+               lrp->lrp_fsuid = perm->rp_fsuid;
+               lrp->lrp_fsgid = perm->rp_fsgid;
+               hlist_add_head(&lrp->lrp_list, head);
+       }
+       lli->lli_rmtperm_time = cfs_time_current();
+       spin_unlock(&lli->lli_lock);
+
+       CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n",
+              lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid,
+              lrp->lrp_access_perm);
+
+       RETURN(0);
+}
+
+int lustre_check_remote_perm(struct inode *inode, int mask)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       struct mdt_remote_perm *perm;
+       struct obd_capa *oc;
+       cfs_time_t save;
+       int i = 0, rc;
+       ENTRY;
+
+       do {
+               save = lli->lli_rmtperm_time;
+               rc = do_check_remote_perm(lli, mask);
+               if (!rc || (rc != -ENOENT && i))
+                       break;
+
+               might_sleep();
+
+               mutex_lock(&lli->lli_rmtperm_mutex);
+               /* check again */
+               if (save != lli->lli_rmtperm_time) {
+                       rc = do_check_remote_perm(lli, mask);
+                       if (!rc || (rc != -ENOENT && i)) {
+                               mutex_unlock(&lli->lli_rmtperm_mutex);
+                               break;
+                       }
+               }
+
+               if (i++ > 5) {
+                       CERROR("check remote perm falls in dead loop!\n");
+                       LBUG();
+               }
+
+               oc = ll_mdscapa_get(inode);
+               rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                                       ll_i2suppgid(inode), &req);
+               capa_put(oc);
+               if (rc) {
+                       mutex_unlock(&lli->lli_rmtperm_mutex);
+                       break;
+               }
+
+               perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL,
+                                                  lustre_swab_mdt_remote_perm);
+               if (unlikely(perm == NULL)) {
+                       mutex_unlock(&lli->lli_rmtperm_mutex);
+                       rc = -EPROTO;
+                       break;
+               }
+
+               rc = ll_update_remote_perm(inode, perm);
+               mutex_unlock(&lli->lli_rmtperm_mutex);
+               if (rc == -ENOMEM)
+                       break;
+
+               ptlrpc_req_finished(req);
+               req = NULL;
+       } while (1);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+#if 0  /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock,
+       * because it will fail sanity test 48.
+       */
+void ll_free_remote_perms(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct hlist_head *hash = lli->lli_remote_perms;
+       struct ll_remote_perm *lrp;
+       struct hlist_node *node, *next;
+       int i;
+
+       LASSERT(hash);
+
+       spin_lock(&lli->lli_lock);
+
+       for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) {
+               hlist_for_each_entry_safe(lrp, node, next, hash + i,
+                                             lrp_list)
+                       free_ll_remote_perm(lrp);
+       }
+
+       spin_unlock(&lli->lli_lock);
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
new file mode 100644 (file)
index 0000000..fac1178
--- /dev/null
@@ -0,0 +1,1314 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/rw.c
+ *
+ * Lustre Lite I/O page cache routines shared by different kernel revs
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/writeback.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include <obd_cksum.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Finalizes cl-data before exiting typical address_space operation. Dual to
+ * ll_cl_init().
+ */
+static void ll_cl_fini(struct ll_cl_context *lcc)
+{
+       struct lu_env  *env  = lcc->lcc_env;
+       struct cl_io   *io   = lcc->lcc_io;
+       struct cl_page *page = lcc->lcc_page;
+
+       LASSERT(lcc->lcc_cookie == current);
+       LASSERT(env != NULL);
+
+       if (page != NULL) {
+               lu_ref_del(&page->cp_reference, "cl_io", io);
+               cl_page_put(env, page);
+       }
+
+       if (io && lcc->lcc_created) {
+               cl_io_end(env, io);
+               cl_io_unlock(env, io);
+               cl_io_iter_fini(env, io);
+               cl_io_fini(env, io);
+       }
+       cl_env_put(env, &lcc->lcc_refcheck);
+}
+
+/**
+ * Initializes common cl-data at the typical address_space operation entry
+ * point.
+ */
+static struct ll_cl_context *ll_cl_init(struct file *file,
+                                       struct page *vmpage, int create)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env    *env;
+       struct cl_io     *io;
+       struct cl_object *clob;
+       struct ccc_io    *cio;
+
+       int refcheck;
+       int result = 0;
+
+       clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+       LASSERT(clob != NULL);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               return ERR_CAST(env);
+
+       lcc = &vvp_env_info(env)->vti_io_ctx;
+       memset(lcc, 0, sizeof(*lcc));
+       lcc->lcc_env = env;
+       lcc->lcc_refcheck = refcheck;
+       lcc->lcc_cookie = current;
+
+       cio = ccc_env_io(env);
+       io = cio->cui_cl.cis_io;
+       if (io == NULL && create) {
+               struct inode *inode = vmpage->mapping->host;
+               loff_t pos;
+
+               if (mutex_trylock(&inode->i_mutex)) {
+                       mutex_unlock(&inode->i_mutex);
+
+                       /* This is really bad: someone is trying to write the
+                        * page w/o holding the inode mutex, which means we
+                        * could add dirty pages into the cache during
+                        * truncate. */
+                       CERROR("Proc %s is dirtying page w/o inode lock, this "
+                              "will break truncate.\n", current->comm);
+                       libcfs_debug_dumpstack(NULL);
+                       LBUG();
+                       return ERR_PTR(-EIO);
+               }
+
+               /*
+                * Loop-back driver calls ->prepare_write() and ->sendfile()
+                * methods directly, bypassing file system ->write() operation,
+                * so cl_io has to be created here.
+                */
+               io = ccc_env_thread_io(env);
+               ll_io_init(io, file, 1);
+
+               /* No lock at all for this kind of IO - we can't do it because
+                * we have held page lock, it would cause deadlock.
+                * XXX: This causes poor performance to loop device - One page
+                *      per RPC.
+                *      In order to get better performance, users should use
+                *      lloop driver instead.
+                */
+               io->ci_lockreq = CILR_NEVER;
+
+               pos = (vmpage->index << PAGE_CACHE_SHIFT);
+
+               /* Create a temp IO to serve write. */
+               result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+               if (result == 0) {
+                       cio->cui_fd = LUSTRE_FPRIVATE(file);
+                       cio->cui_iov = NULL;
+                       cio->cui_nrsegs = 0;
+                       result = cl_io_iter_init(env, io);
+                       if (result == 0) {
+                               result = cl_io_lock(env, io);
+                               if (result == 0)
+                                       result = cl_io_start(env, io);
+                       }
+               } else {
+                       result = io->ci_result;
+               }
+               lcc->lcc_created = 1;
+       }
+
+       lcc->lcc_io = io;
+       if (io == NULL)
+               result = -EIO;
+       if (result == 0) {
+               struct cl_page   *page;
+
+               LASSERT(io != NULL);
+               LASSERT(io->ci_state == CIS_IO_GOING);
+               LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+               page = cl_page_find(env, clob, vmpage->index, vmpage,
+                                   CPT_CACHEABLE);
+               if (!IS_ERR(page)) {
+                       lcc->lcc_page = page;
+                       lu_ref_add(&page->cp_reference, "cl_io", io);
+                       result = 0;
+               } else {
+                       result = PTR_ERR(page);
+               }
+       }
+       if (result) {
+               ll_cl_fini(lcc);
+               lcc = ERR_PTR(result);
+       }
+
+       CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n",
+              vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+              env, io);
+       return lcc;
+}
+
+static struct ll_cl_context *ll_cl_get(void)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env *env;
+       int refcheck;
+
+       env = cl_env_get(&refcheck);
+       LASSERT(!IS_ERR(env));
+       lcc = &vvp_env_info(env)->vti_io_ctx;
+       LASSERT(env == lcc->lcc_env);
+       LASSERT(current == lcc->lcc_cookie);
+       cl_env_put(env, &refcheck);
+
+       /* env was acquired in ll_cl_init(), so it is still usable. */
+       return lcc;
+}
+
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+                    unsigned to)
+{
+       struct ll_cl_context *lcc;
+       int result;
+       ENTRY;
+
+       lcc = ll_cl_init(file, vmpage, 1);
+       if (!IS_ERR(lcc)) {
+               struct lu_env  *env = lcc->lcc_env;
+               struct cl_io   *io  = lcc->lcc_io;
+               struct cl_page *page = lcc->lcc_page;
+
+               cl_page_assume(env, io, page);
+
+               result = cl_io_prepare_write(env, io, page, from, to);
+               if (result == 0) {
+                       /*
+                        * Add a reference, so that page is not evicted from
+                        * the cache until ->commit_write() is called.
+                        */
+                       cl_page_get(page);
+                       lu_ref_add(&page->cp_reference, "prepare_write",
+                                  current);
+               } else {
+                       cl_page_unassume(env, io, page);
+                       ll_cl_fini(lcc);
+               }
+               /* returning 0 in prepare assumes commit must be called
+                * afterwards */
+       } else {
+               result = PTR_ERR(lcc);
+       }
+       RETURN(result);
+}
+
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
+                   unsigned to)
+{
+       struct ll_cl_context *lcc;
+       struct lu_env    *env;
+       struct cl_io     *io;
+       struct cl_page   *page;
+       int result = 0;
+       ENTRY;
+
+       lcc  = ll_cl_get();
+       env  = lcc->lcc_env;
+       page = lcc->lcc_page;
+       io   = lcc->lcc_io;
+
+       LASSERT(cl_page_is_owned(page, io));
+       LASSERT(from <= to);
+       if (from != to) /* handle short write case. */
+               result = cl_io_commit_write(env, io, page, from, to);
+       if (cl_page_is_owned(page, io))
+               cl_page_unassume(env, io, page);
+
+       /*
+        * Release reference acquired by ll_prepare_write().
+        */
+       lu_ref_del(&page->cp_reference, "prepare_write", current);
+       cl_page_put(env, page);
+       ll_cl_fini(lcc);
+       RETURN(result);
+}
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+       __u64 opc;
+
+       opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
+       return ll_osscapa_get(inode, opc);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
+
+/**
+ * Get readahead pages from the filesystem readahead pool of the client for a
+ * thread.
+ *
+ * \param sbi   superblock for filesystem readahead state ll_ra_info
+ * \param ria   per-thread readahead state
+ * \param pages number of pages requested for readahead for the thread.
+ *
+ * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
+ * It should work well if the ra_max_pages is much greater than the single
+ * file's read-ahead window, and not too many threads are contending for
+ * these readahead pages.
+ *
+ * TODO: There may be a 'global sync problem' if many threads are trying
+ * to get an ra budget that is larger than the remaining readahead pages
+ * and reach here at exactly the same time. They will compute \a ret to
+ * consume the remaining pages, but will fail at atomic_add_return() and
+ * get a zero ra window, although there is still ra space remaining. - Jay */
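+/*
+ * Worked example (illustrative numbers): with ra_max_pages == 1024,
+ * ra_cur_pages == 900 and a request for 512 pages, ret == min(124, 512)
+ * == 124.  Assuming PTLRPC_MAX_BRW_PAGES == 256 (1MB RPCs of 4KB pages),
+ * 124 < min(256, 512), so the budget is refused entirely rather than
+ * issuing a small read RPC.
+ */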
+
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
+                                    struct ra_io_arg *ria,
+                                    unsigned long pages)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       long ret;
+       ENTRY;
+
+       /* If the read-ahead pages left are less than 1M, do not do read-ahead,
+        * otherwise it will form small read RPCs (< 1M), which hurt server
+        * performance a lot. */
+       ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages);
+       if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+               GOTO(out, ret = 0);
+
+       /* If the non-strided (ria_pages == 0) readahead window
+        * (ria_start + ret) has grown across an RPC boundary, then trim
+        * readahead size by the amount beyond the RPC so it ends on an
+        * RPC boundary. If the readahead window is already ending on
+        * an RPC boundary (beyond_rpc == 0), or smaller than a full
+        * RPC (beyond_rpc < ret) the readahead size is unchanged.
+        * The (beyond_rpc != 0) check is skipped since the conditional
+        * branch is more expensive than subtracting zero from the result.
+        *
+        * Strided read is left unaligned to avoid small fragments beyond
+        * the RPC boundary from needing an extra read RPC. */
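+       /* Illustrative trim, assuming PTLRPC_MAX_BRW_PAGES == 256: with
+        * ria_start == 100 and ret == 300, beyond_rpc == (100 + 300) % 256
+        * == 144 < 300, so ret becomes 156 and the window ends at page 255,
+        * exactly on an RPC boundary. */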
+       if (ria->ria_pages == 0) {
+               long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+               if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
+                       ret -= beyond_rpc;
+       }
+
+       if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
+               atomic_sub(ret, &ra->ra_cur_pages);
+               ret = 0;
+       }
+
+out:
+       RETURN(ret);
+}
+
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       atomic_sub(len, &ra->ra_cur_pages);
+}
+
+static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
+{
+       LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
+       lprocfs_counter_incr(sbi->ll_ra_stats, which);
+}
+
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
+       ll_ra_stats_inc_sbi(sbi, which);
+}
+
+#define RAS_CDEBUG(ras) \
+       CDEBUG(D_READA,                                                      \
+              "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu "   \
+              "csr %lu sf %lu sp %lu sl %lu\n",                             \
+              ras->ras_last_readpage, ras->ras_consecutive_requests,        \
+              ras->ras_consecutive_pages, ras->ras_window_start,            \
+              ras->ras_window_len, ras->ras_next_readahead,                 \
+              ras->ras_requests, ras->ras_request_index,                    \
+              ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
+              ras->ras_stride_pages, ras->ras_stride_length)
+
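+/* Return whether @index lies in [point - before, point + after], clamping
+ * the interval to 0 on underflow and ~0 on overflow; e.g. point == 5,
+ * before == 10 clamps the window start to 0 instead of wrapping around. */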
+static int index_in_window(unsigned long index, unsigned long point,
+                          unsigned long before, unsigned long after)
+{
+       unsigned long start = point - before, end = point + after;
+
+       if (start > point)
+               start = 0;
+       if (end < point)
+               end = ~0;
+
+       return start <= index && index <= end;
+}
+
+static struct ll_readahead_state *ll_ras_get(struct file *f)
+{
+       struct ll_file_data *fd;
+
+       fd = LUSTRE_FPRIVATE(f);
+       return &fd->fd_ras;
+}
+
+void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
+{
+       struct ll_readahead_state *ras;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       ras->ras_requests++;
+       ras->ras_request_index = 0;
+       ras->ras_consecutive_requests++;
+       rar->lrr_reader = current;
+
+       list_add(&rar->lrr_linkage, &ras->ras_read_beads);
+       spin_unlock(&ras->ras_lock);
+}
+
+void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
+{
+       struct ll_readahead_state *ras;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       list_del_init(&rar->lrr_linkage);
+       spin_unlock(&ras->ras_lock);
+}
+
+static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
+{
+       struct ll_ra_read *scan;
+
+       list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
+               if (scan->lrr_reader == current)
+                       return scan;
+       }
+       return NULL;
+}
+
+struct ll_ra_read *ll_ra_read_get(struct file *f)
+{
+       struct ll_readahead_state *ras;
+       struct ll_ra_read        *bead;
+
+       ras = ll_ras_get(f);
+
+       spin_lock(&ras->ras_lock);
+       bead = ll_ra_read_get_locked(ras);
+       spin_unlock(&ras->ras_lock);
+       return bead;
+}
+
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page_list *queue, struct cl_page *page,
+                             struct page *vmpage)
+{
+       struct ccc_page *cp;
+       int           rc;
+
+       ENTRY;
+
+       rc = 0;
+       cl_page_assume(env, io, page);
+       lu_ref_add(&page->cp_reference, "ra", current);
+       cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+       if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) {
+               rc = cl_page_is_under_lock(env, io, page);
+               if (rc == -EBUSY) {
+                       cp->cpg_defer_uptodate = 1;
+                       cp->cpg_ra_used = 0;
+                       cl_page_list_add(queue, page);
+                       rc = 1;
+               } else {
+                       cl_page_delete(env, page);
+                       rc = -ENOLCK;
+               }
+       } else {
+               /* skip completed pages */
+               cl_page_unassume(env, io, page);
+       }
+       lu_ref_del(&page->cp_reference, "ra", current);
+       cl_page_put(env, page);
+       RETURN(rc);
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * \retval     +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ *               read-ahead.
+ *
+ * \retval  -ve, 0: page wasn't added to \a queue for other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                             struct cl_page_list *queue,
+                             pgoff_t index, struct address_space *mapping)
+{
+       struct page      *vmpage;
+       struct cl_object *clob  = ll_i2info(mapping->host)->lli_clob;
+       struct cl_page   *page;
+       enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
+       unsigned int      gfp_mask;
+       int            rc    = 0;
+       const char       *msg   = NULL;
+
+       ENTRY;
+
+       gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT;
+#ifdef __GFP_NOWARN
+       gfp_mask |= __GFP_NOWARN;
+#endif
+       vmpage = grab_cache_page_nowait(mapping, index);
+       if (vmpage != NULL) {
+               /* Check if vmpage was truncated or reclaimed */
+               if (vmpage->mapping == mapping) {
+                       page = cl_page_find(env, clob, vmpage->index,
+                                           vmpage, CPT_CACHEABLE);
+                       if (!IS_ERR(page)) {
+                               rc = cl_read_ahead_page(env, io, queue,
+                                                       page, vmpage);
+                               if (rc == -ENOLCK) {
+                                       which = RA_STAT_FAILED_MATCH;
+                                       msg   = "lock match failed";
+                               }
+                       } else {
+                               which = RA_STAT_FAILED_GRAB_PAGE;
+                               msg   = "cl_page_find failed";
+                       }
+               } else {
+                       which = RA_STAT_WRONG_GRAB_PAGE;
+                       msg   = "g_c_p_n returned invalid page";
+               }
+               if (rc != 1)
+                       unlock_page(vmpage);
+               page_cache_release(vmpage);
+       } else {
+               which = RA_STAT_FAILED_GRAB_PAGE;
+               msg   = "g_c_p_n failed";
+       }
+       if (msg != NULL) {
+               ll_ra_stats_inc(mapping, which);
+               CDEBUG(D_READA, "%s\n", msg);
+       }
+       RETURN(rc);
+}
+
+#define RIA_DEBUG(ria)                                                \
+       CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",       \
+       ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
+       ria->ria_pages)
+
+/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
+ * know what the actual RPC size is.  If this needs to change, it makes more
+ * sense to tune the i_blkbits value for the file based on the OSTs it is
+ * striped over, rather than having a constant value for all files here. */
+
+/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
+ * Temporarily set RAS_INCREASE_STEP to 1MB. After the 4MB RPC is enabled
+ * by default, this should be adjusted in line with max_read_ahead_mb
+ * and max_read_ahead_per_file_mb, otherwise the readahead budget can be used
+ * up quickly, which will affect read performance significantly. See LU-2816 */
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
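+/* E.g. with 4KB pages (PAGE_CACHE_SHIFT == 12) and a 1MB ONE_MB_BRW_SIZE,
+ * RAS_INCREASE_STEP() evaluates to 256 pages per increase step. */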
+
+static inline int stride_io_mode(struct ll_readahead_state *ras)
+{
+       return ras->ras_consecutive_stride_requests > 1;
+}
+/* The function calculates how many pages will be read in
+ * [off, off + length], in such a stride IO area,
+ * stride_offset = st_off, stride_length = st_len,
+ * stride_pages = st_pgs
+ *
+ *   |------------------|*****|------------------|*****|------------|*****|....
+ * st_off
+ *   |--- st_pgs     ---|
+ *   |-----     st_len   -----|
+ *
+ *           How many pages it should read in such pattern
+ *           |-------------------------------------------------------------|
+ *           off
+ *           |<------            length                      ------->|
+ *
+ *       =   |<----->|  +  |-------------------------------------| +   |---|
+ *          start_left          st_pgs * i                 end_left
+ */
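+/* Worked example (illustrative): st_off = 0, st_len = 8, st_pgs = 4,
+ * off = 2, length = 10.  Then start = 2, end = 12, start_left = 4 - 2 = 2,
+ * end_left = 12 % 8 = 4, and with start / 8 == 0 and end / 8 == 1 the
+ * result is 2 + 4 * 0 + 4 = 6 pages: pages 2-3 of the first stride plus
+ * pages 8-11 of the second. */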
+static unsigned long
+stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs,
+               unsigned long off, unsigned long length)
+{
+       __u64 start = off > st_off ? off - st_off : 0;
+       __u64 end = off + length > st_off ? off + length - st_off : 0;
+       unsigned long start_left = 0;
+       unsigned long end_left = 0;
+       unsigned long pg_count;
+
+       if (st_len == 0 || length == 0 || end == 0)
+               return length;
+
+       start_left = do_div(start, st_len);
+       if (start_left < st_pgs)
+               start_left = st_pgs - start_left;
+       else
+               start_left = 0;
+
+       end_left = do_div(end, st_len);
+       if (end_left > st_pgs)
+               end_left = st_pgs;
+
+       CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu\n",
+              start, end, start_left, end_left);
+
+       if (start == end)
+               pg_count = end_left - (st_pgs - start_left);
+       else
+               pg_count = start_left + st_pgs * (end - start - 1) + end_left;
+
+       CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu "
+              "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count);
+
+       return pg_count;
+}
+
+static int ria_page_count(struct ra_io_arg *ria)
+{
+       __u64 length = ria->ria_end >= ria->ria_start ?
+                      ria->ria_end - ria->ria_start + 1 : 0;
+
+       return stride_pg_count(ria->ria_stoff, ria->ria_length,
+                              ria->ria_pages, ria->ria_start,
+                              length);
+}
+
+/* Check whether the index is inside the defined read-ahead window */
+static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
+{
+       /* If ria_length == ria_pages, it means non-stride I/O mode;
+        * idx should always be inside the read-ahead window in this case.
+        * For stride I/O mode, just check whether the idx is inside
+        * the ria_pages. */
+       return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
+              (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
+               ria->ria_length < ria->ria_pages);
+}
+
+static int ll_read_ahead_pages(const struct lu_env *env,
+                              struct cl_io *io, struct cl_page_list *queue,
+                              struct ra_io_arg *ria,
+                              unsigned long *reserved_pages,
+                              struct address_space *mapping,
+                              unsigned long *ra_end)
+{
+       int rc, count = 0, stride_ria;
+       unsigned long page_idx;
+
+       LASSERT(ria != NULL);
+       RIA_DEBUG(ria);
+
+       stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
+       for (page_idx = ria->ria_start; page_idx <= ria->ria_end &&
+                       *reserved_pages > 0; page_idx++) {
+               if (ras_inside_ra_window(page_idx, ria)) {
+                       /* If the page is inside the read-ahead window */
+                       rc = ll_read_ahead_page(env, io, queue,
+                                               page_idx, mapping);
+                       if (rc == 1) {
+                               (*reserved_pages)--;
+                               count++;
+                       } else if (rc == -ENOLCK) {
+                               break;
+                       }
+               } else if (stride_ria) {
+                       /* If it is not in the read-ahead window and stride
+                        * read-ahead mode is in effect, check whether the
+                        * stride gap should be skipped */
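+                       /* E.g. (illustrative) ria_stoff = 0, ria_length = 8,
+                        * ria_pages = 4: page_idx 6 has offset 6 > 4, so
+                        * page_idx advances by 8 - 6 = 2, past the remaining
+                        * gap pages. */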
+                       pgoff_t offset;
+                       /* FIXME: This assertion only is valid when it is for
+                        * forward read-ahead, it will be fixed when backward
+                        * read-ahead is implemented */
+                       LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu "
+                                "rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx,
+                                ria->ria_start, ria->ria_end, ria->ria_stoff,
+                                ria->ria_length, ria->ria_pages);
+                       offset = page_idx - ria->ria_stoff;
+                       offset = offset % (ria->ria_length);
+                       if (offset > ria->ria_pages) {
+                               page_idx += ria->ria_length - offset;
+                               CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
+                                      ria->ria_length - offset);
+                               continue;
+                       }
+               }
+       }
+       *ra_end = page_idx;
+       return count;
+}
+
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+                struct ll_readahead_state *ras, struct address_space *mapping,
+                struct cl_page_list *queue, int flags)
+{
+       struct vvp_io *vio = vvp_env_io(env);
+       struct vvp_thread_info *vti = vvp_env_info(env);
+       struct cl_attr *attr = ccc_env_thread_attr(env);
+       unsigned long start = 0, end = 0, reserved;
+       unsigned long ra_end, len;
+       struct inode *inode;
+       struct ll_ra_read *bead;
+       struct ra_io_arg *ria = &vti->vti_ria;
+       struct ll_inode_info *lli;
+       struct cl_object *clob;
+       int ret = 0;
+       __u64 kms;
+       ENTRY;
+
+       inode = mapping->host;
+       lli = ll_i2info(inode);
+       clob = lli->lli_clob;
+
+       memset(ria, 0, sizeof(*ria));
+
+       cl_object_attr_lock(clob);
+       ret = cl_object_attr_get(env, clob, attr);
+       cl_object_attr_unlock(clob);
+
+       if (ret != 0)
+               RETURN(ret);
+       kms = attr->cat_kms;
+       if (kms == 0) {
+               ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
+               RETURN(0);
+       }
+
+       spin_lock(&ras->ras_lock);
+       if (vio->cui_ra_window_set)
+               bead = &vio->cui_bead;
+       else
+               bead = NULL;
+
+       /* Enlarge the RA window to encompass the full read */
+       if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+           bead->lrr_start + bead->lrr_count) {
+               ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+                                     ras->ras_window_start;
+       }
+       /* Reserve a part of the read-ahead window that we'll be issuing */
+       if (ras->ras_window_len) {
+               start = ras->ras_next_readahead;
+               end = ras->ras_window_start + ras->ras_window_len - 1;
+       }
+       if (end != 0) {
+               unsigned long rpc_boundary;
+               /*
+                * Align RA window to an optimal boundary.
+                *
+                * XXX This would be better to align to cl_max_pages_per_rpc
+                * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
+                * be aligned to the RAID stripe size in the future and that
+                * is more important than the RPC size.
+                */
+               /* Note: we only trim the RPC, instead of extending the RPC
+                * to the boundary, so as to avoid reading too many pages
+                * during random reads. */
+               rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)));
+               if (rpc_boundary > 0)
+                       rpc_boundary--;
+
+               if (rpc_boundary > start)
+                       end = rpc_boundary;
+
+               /* Truncate RA window to end of file */
+               end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+
+               ras->ras_next_readahead = max(end, end + 1);
+               RAS_CDEBUG(ras);
+       }
+       ria->ria_start = start;
+       ria->ria_end = end;
+       /* If stride I/O mode is detected, get the stride window */
+       if (stride_io_mode(ras)) {
+               ria->ria_stoff = ras->ras_stride_offset;
+               ria->ria_length = ras->ras_stride_length;
+               ria->ria_pages = ras->ras_stride_pages;
+       }
+       spin_unlock(&ras->ras_lock);
+
+       if (end == 0) {
+               ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
+               RETURN(0);
+       }
+       len = ria_page_count(ria);
+       if (len == 0)
+               RETURN(0);
+
+       reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len);
+       if (reserved < len)
+               ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
+
+       CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved,
+              atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
+              ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
+
+       ret = ll_read_ahead_pages(env, io, queue,
+                                 ria, &reserved, mapping, &ra_end);
+
+       LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
+       if (reserved != 0)
+               ll_ra_count_put(ll_i2sbi(inode), reserved);
+
+       if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+               ll_ra_stats_inc(mapping, RA_STAT_EOF);
+
+       /* If we didn't get to the end of the region we reserved from
+        * the ras we need to go back and update the ras so that the
+        * next read-ahead tries from where we left off.  We only do so
+        * if the region we failed to issue read-ahead on is still ahead
+        * of the app and behind the next index to start read-ahead from. */
+       CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
+              ra_end, end, ria->ria_end);
+
+       if (ra_end != end + 1) {
+               spin_lock(&ras->ras_lock);
+               if (ra_end < ras->ras_next_readahead &&
+                   index_in_window(ra_end, ras->ras_window_start, 0,
+                                   ras->ras_window_len)) {
+                       ras->ras_next_readahead = ra_end;
+                       RAS_CDEBUG(ras);
+               }
+               spin_unlock(&ras->ras_lock);
+       }
+
+       RETURN(ret);
+}
+
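+/* Round the window start down to a multiple of RAS_INCREASE_STEP(inode).
+ * The mask arithmetic below assumes the step is a power of two. */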
+static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
+                         unsigned long index)
+{
+       ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_reset(struct inode *inode, struct ll_readahead_state *ras,
+                     unsigned long index)
+{
+       ras->ras_last_readpage = index;
+       ras->ras_consecutive_requests = 0;
+       ras->ras_consecutive_pages = 0;
+       ras->ras_window_len = 0;
+       ras_set_start(inode, ras, index);
+       ras->ras_next_readahead = max(ras->ras_window_start, index);
+
+       RAS_CDEBUG(ras);
+}
+
+/* called with the ras_lock held or from places where it doesn't matter */
+static void ras_stride_reset(struct ll_readahead_state *ras)
+{
+       ras->ras_consecutive_stride_requests = 0;
+       ras->ras_stride_length = 0;
+       ras->ras_stride_pages = 0;
+       RAS_CDEBUG(ras);
+}
+
+void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
+{
+       spin_lock_init(&ras->ras_lock);
+       ras_reset(inode, ras, 0);
+       ras->ras_requests = 0;
+       INIT_LIST_HEAD(&ras->ras_read_beads);
+}
+
+/*
+ * Check whether the read request is in the stride window.
+ * If it is in the stride window, return 1, otherwise return 0.
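+ *
+ * Illustrative example (values assumed, not taken from the code): with
+ * ras_stride_offset = 0, ras_stride_length = 8 and ras_stride_pages = 2,
+ * a stride read touches pages 0-1, 8-9, 16-17, ...; only reads landing on
+ * such pages are considered inside the stride window.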
+ */
+static int index_in_stride_window(struct ll_readahead_state *ras,
+                                 unsigned long index)
+{
+       unsigned long stride_gap;
+
+       if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 ||
+           ras->ras_stride_pages == ras->ras_stride_length)
+               return 0;
+
+       stride_gap = index - ras->ras_last_readpage - 1;
+
+       /* If it is a contiguous read */
+       if (stride_gap == 0)
+               return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
+
+       /* Otherwise check the stride by itself */
+       return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
+               ras->ras_consecutive_pages == ras->ras_stride_pages;
+}
+
+static void ras_update_stride_detector(struct ll_readahead_state *ras,
+                                      unsigned long index)
+{
+       unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+
+       if (!stride_io_mode(ras) && (stride_gap != 0 ||
+            ras->ras_consecutive_stride_requests == 0)) {
+               ras->ras_stride_pages = ras->ras_consecutive_pages;
+               ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
+       }
+       LASSERT(ras->ras_request_index == 0);
+       LASSERT(ras->ras_consecutive_stride_requests == 0);
+
+       if (index <= ras->ras_last_readpage) {
+               /* Reset the stride window: stride detection only works for
+                * forward reads */
+               ras_stride_reset(ras);
+               return;
+       }
+
+       ras->ras_stride_pages = ras->ras_consecutive_pages;
+       ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages;
+
+       RAS_CDEBUG(ras);
+       return;
+}
+
+static unsigned long
+stride_page_count(struct ll_readahead_state *ras, unsigned long len)
+{
+       return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length,
+                              ras->ras_stride_pages, ras->ras_stride_offset,
+                              len);
+}
+
+/* The stride read-ahead window is increased by inc_len according to the
+ * stride I/O pattern */
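+/*
+ * Illustrative example (values assumed, not taken from the code): with
+ * stride_length = 8, stride_pages = 2 and inc_len = 8, the 8 extra data
+ * pages amount to 8 / 2 = 4 whole strides, so the window length grows by
+ * 4 * 8 = 32 pages.
+ */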
+static void ras_stride_increase_window(struct ll_readahead_state *ras,
+                                      struct ll_ra_info *ra,
+                                      unsigned long inc_len)
+{
+       unsigned long left, step, window_len;
+       unsigned long stride_len;
+
+       LASSERT(ras->ras_stride_length > 0);
+       LASSERTF(ras->ras_window_start + ras->ras_window_len
+                >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
+                " stride_offset %lu\n", ras->ras_window_start,
+                ras->ras_window_len, ras->ras_stride_offset);
+
+       stride_len = ras->ras_window_start + ras->ras_window_len -
+                    ras->ras_stride_offset;
+
+       left = stride_len % ras->ras_stride_length;
+       window_len = ras->ras_window_len - left;
+
+       if (left < ras->ras_stride_pages)
+               left += inc_len;
+       else
+               left = ras->ras_stride_pages + inc_len;
+
+       LASSERT(ras->ras_stride_pages != 0);
+
+       step = left / ras->ras_stride_pages;
+       left %= ras->ras_stride_pages;
+
+       window_len += step * ras->ras_stride_length + left;
+
+       if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
+               ras->ras_window_len = window_len;
+
+       RAS_CDEBUG(ras);
+}
+
+static void ras_increase_window(struct inode *inode,
+                               struct ll_readahead_state *ras,
+                               struct ll_ra_info *ra)
+{
+       /* The stretch of the RA window should be aligned with the max
+        * rpc_size, but the current clio architecture does not support
+        * retrieving such information from the lower layer.  FIXME later
+        */
+       if (stride_io_mode(ras))
+               ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
+       else
+               ras->ras_window_len = min(ras->ras_window_len +
+                                         RAS_INCREASE_STEP(inode),
+                                         ra->ra_max_pages_per_file);
+}
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+               struct ll_readahead_state *ras, unsigned long index,
+               unsigned hit)
+{
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       int zero = 0, stride_detect = 0, ra_miss = 0;
+       ENTRY;
+
+       spin_lock(&ras->ras_lock);
+
+       ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
+
+       /* Reset the read-ahead window in two cases.  First, when the app
+        * seeks or reads to some other part of the file.  Second, if we get
+        * a read-ahead miss on a page that we think we've previously
+        * issued.  This can be a symptom of there being so many read-ahead
+        * pages that the VM is reclaiming them before we get to them. */
+       if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
+               zero = 1;
+               ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE);
+       } else if (!hit && ras->ras_window_len &&
+                  index < ras->ras_next_readahead &&
+                  index_in_window(index, ras->ras_window_start, 0,
+                                  ras->ras_window_len)) {
+               ra_miss = 1;
+               ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
+       }
+
+       /* On the second access to a file smaller than the tunable
+        * ra_max_read_ahead_whole_pages, trigger RA on all pages in the
+        * file, up to ra_max_pages_per_file.  This is simply a best effort
+        * and only occurs once per open file.  Normal RA behavior is
+        * restored for subsequent IO.  The mmap case does not increment
+        * ras_requests and thus can never trigger this behavior. */
+       if (ras->ras_requests == 2 && !ras->ras_request_index) {
+               __u64 kms_pages;
+
+               kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT;
+
+               CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+                      ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
+
+               if (kms_pages &&
+                   kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+                       ras->ras_window_start = 0;
+                       ras->ras_last_readpage = 0;
+                       ras->ras_next_readahead = 0;
+                       ras->ras_window_len = min(ra->ra_max_pages_per_file,
+                               ra->ra_max_read_ahead_whole_pages);
+                       GOTO(out_unlock, 0);
+               }
+       }
+       if (zero) {
+               /* check whether it is in stride I/O mode */
+               if (!index_in_stride_window(ras, index)) {
+                       if (ras->ras_consecutive_stride_requests == 0 &&
+                           ras->ras_request_index == 0) {
+                               ras_update_stride_detector(ras, index);
+                               ras->ras_consecutive_stride_requests++;
+                       } else {
+                               ras_stride_reset(ras);
+                       }
+                       ras_reset(inode, ras, index);
+                       ras->ras_consecutive_pages++;
+                       GOTO(out_unlock, 0);
+               } else {
+                       ras->ras_consecutive_pages = 0;
+                       ras->ras_consecutive_requests = 0;
+                       if (++ras->ras_consecutive_stride_requests > 1)
+                               stride_detect = 1;
+                       RAS_CDEBUG(ras);
+               }
+       } else {
+               if (ra_miss) {
+                       if (index_in_stride_window(ras, index) &&
+                           stride_io_mode(ras)) {
+                               /* If stride-RA hit a cache miss, the stride
+                                * detector will not be reset, to avoid the
+                                * overhead of re-detecting the read-ahead
+                                * mode */
+                               if (index != ras->ras_last_readpage + 1)
+                                       ras->ras_consecutive_pages = 0;
+                               ras_reset(inode, ras, index);
+                               RAS_CDEBUG(ras);
+                       } else {
+                               /* Reset both stride window and normal RA
+                                * window */
+                               ras_reset(inode, ras, index);
+                               ras->ras_consecutive_pages++;
+                               ras_stride_reset(ras);
+                               GOTO(out_unlock, 0);
+                       }
+               } else if (stride_io_mode(ras)) {
+                       /* If this is a contiguous read but we are currently
+                        * in stride I/O mode, check whether the stride step
+                        * is still valid; if invalid, reset the stride RA
+                        * window */
+                       if (!index_in_stride_window(ras, index)) {
+                               /* Shrink stride read-ahead window to be zero */
+                               ras_stride_reset(ras);
+                               ras->ras_window_len = 0;
+                               ras->ras_next_readahead = index;
+                       }
+               }
+       }
+       ras->ras_consecutive_pages++;
+       ras->ras_last_readpage = index;
+       ras_set_start(inode, ras, index);
+
+       if (stride_io_mode(ras))
+               /* Stride read-ahead is sensitive to the read-ahead offset,
+                * so use the original index here instead of
+                * ras_window_start, which is RPC-aligned */
+               ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+       else
+               ras->ras_next_readahead = max(ras->ras_window_start,
+                                             ras->ras_next_readahead);
+       RAS_CDEBUG(ras);
+
+       /* Trigger RA in the mmap case where ras_consecutive_requests
+        * is not incremented and thus can't be used to trigger RA */
+       if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) {
+               ras->ras_window_len = RAS_INCREASE_STEP(inode);
+               GOTO(out_unlock, 0);
+       }
+
+       /* Initially reset the stride window offset to next_readahead */
+       if (ras->ras_consecutive_stride_requests == 2 && stride_detect) {
+               /**
+                * Once stride IO mode is detected, next_readahead should be
+                * reset to make sure next_readahead > stride offset
+                */
+               ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+               ras->ras_stride_offset = index;
+               ras->ras_window_len = RAS_INCREASE_STEP(inode);
+       }
+
+       /* The initial ras_window_len is set to the request size.  To avoid
+        * uselessly reading and discarding pages for random IO the window is
+        * only increased once per consecutive request received. */
+       if ((ras->ras_consecutive_requests > 1 || stride_detect) &&
+           !ras->ras_request_index)
+               ras_increase_window(inode, ras, ra);
+       EXIT;
+out_unlock:
+       RAS_CDEBUG(ras);
+       ras->ras_request_index++;
+       spin_unlock(&ras->ras_lock);
+       return;
+}
+
+int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
+{
+       struct inode           *inode = vmpage->mapping->host;
+       struct ll_inode_info   *lli   = ll_i2info(inode);
+       struct lu_env     *env;
+       struct cl_io       *io;
+       struct cl_page   *page;
+       struct cl_object       *clob;
+       struct cl_env_nest      nest;
+       bool redirtied = false;
+       bool unlocked = false;
+       int result;
+       ENTRY;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageWriteback(vmpage));
+
+       LASSERT(ll_i2dtexp(inode) != NULL);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               GOTO(out, result = PTR_ERR(env));
+
+       clob  = ll_i2info(inode)->lli_clob;
+       LASSERT(clob != NULL);
+
+       io = ccc_env_thread_io(env);
+       io->ci_obj = clob;
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, clob);
+       if (result == 0) {
+               page = cl_page_find(env, clob, vmpage->index,
+                                   vmpage, CPT_CACHEABLE);
+               if (!IS_ERR(page)) {
+                       lu_ref_add(&page->cp_reference, "writepage",
+                                  current);
+                       cl_page_assume(env, io, page);
+                       result = cl_page_flush(env, io, page);
+                       if (result != 0) {
+                               /*
+                                * Re-dirty page on error so it retries write,
+                                * but not in case when IO has actually
+                                * occurred and completed with an error.
+                                */
+                               if (!PageError(vmpage)) {
+                                       redirty_page_for_writepage(wbc, vmpage);
+                                       result = 0;
+                                       redirtied = true;
+                               }
+                       }
+                       cl_page_disown(env, io, page);
+                       unlocked = true;
+                       lu_ref_del(&page->cp_reference,
+                                  "writepage", current);
+                       cl_page_put(env, page);
+               } else {
+                       result = PTR_ERR(page);
+               }
+       }
+       cl_io_fini(env, io);
+
+       if (redirtied && wbc->sync_mode == WB_SYNC_ALL) {
+               loff_t offset = cl_offset(clob, vmpage->index);
+
+               /* Flush page failed because the extent is being written out.
+                * Wait for the write of extent to be finished to avoid
+                * breaking kernel which assumes ->writepage should mark
+                * PageWriteback or clean the page. */
+               result = cl_sync_file_range(inode, offset,
+                                           offset + PAGE_CACHE_SIZE - 1,
+                                           CL_FSYNC_LOCAL, 1);
+               if (result > 0) {
+                       /* We may actually have written more than one page;
+                        * decrease nr_to_write by the extra pages, because
+                        * the caller will count this page itself. */
+                       wbc->nr_to_write -= result - 1;
+                       result = 0;
+               }
+       }
+
+       cl_env_nested_put(&nest, env);
+       GOTO(out, result);
+
+out:
+       if (result < 0) {
+               if (!lli->lli_async_rc)
+                       lli->lli_async_rc = result;
+               SetPageError(vmpage);
+               if (!unlocked)
+                       unlock_page(vmpage);
+       }
+       return result;
+}
+
+int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+       struct inode *inode = mapping->host;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       loff_t start;
+       loff_t end;
+       enum cl_fsync_mode mode;
+       int range_whole = 0;
+       int result;
+       int ignore_layout = 0;
+       ENTRY;
+
+       if (wbc->range_cyclic) {
+               start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+               end = OBD_OBJECT_EOF;
+       } else {
+               start = wbc->range_start;
+               end = wbc->range_end;
+               if (end == LLONG_MAX) {
+                       end = OBD_OBJECT_EOF;
+                       range_whole = start == 0;
+               }
+       }
+
+       mode = CL_FSYNC_NONE;
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               mode = CL_FSYNC_LOCAL;
+
+       if (sbi->ll_umounting)
+               /* if the mountpoint is being umounted, all pages have to be
+                * evicted to avoid hitting LBUG when truncate_inode_pages()
+                * is called later on. */
+               ignore_layout = 1;
+       result = cl_sync_file_range(inode, start, end, mode, ignore_layout);
+       if (result > 0) {
+               wbc->nr_to_write -= result;
+               result = 0;
+       }
+
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
+               if (end == OBD_OBJECT_EOF)
+                       end = i_size_read(inode);
+               mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+       }
+       RETURN(result);
+}
+
+int ll_readpage(struct file *file, struct page *vmpage)
+{
+       struct ll_cl_context *lcc;
+       int result;
+       ENTRY;
+
+       lcc = ll_cl_init(file, vmpage, 0);
+       if (!IS_ERR(lcc)) {
+               struct lu_env  *env  = lcc->lcc_env;
+               struct cl_io   *io   = lcc->lcc_io;
+               struct cl_page *page = lcc->lcc_page;
+
+               LASSERT(page->cp_type == CPT_CACHEABLE);
+               if (likely(!PageUptodate(vmpage))) {
+                       cl_page_assume(env, io, page);
+                       result = cl_io_read_page(env, io, page);
+               } else {
+                       /* Page from a non-object file. */
+                       unlock_page(vmpage);
+                       result = 0;
+               }
+               ll_cl_fini(lcc);
+       } else {
+               unlock_page(vmpage);
+               result = PTR_ERR(lcc);
+       }
+       RETURN(result);
+}
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
new file mode 100644 (file)
index 0000000..27e4e64
--- /dev/null
@@ -0,0 +1,586 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lustre/llite/rw26.c
+ *
+ * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <asm/uaccess.h>
+
+#include <linux/migrate.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+#include <linux/lustre_compat25.h>
+
+/**
+ * Implements the Linux VM address_space::invalidatepage() method.  This
+ * method is called when the page is truncated from a file, either as a
+ * result of an explicit truncate, or when the inode is removed from memory
+ * (as a result of final iput(), umount, or memory-pressure-induced icache
+ * shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is for the case of a
+ * non-page-aligned truncate).  Lustre leaves the partially truncated page
+ * in the cache, relying on struct inode::i_size to limit further accesses.
+ */
+static void ll_invalidatepage(struct page *vmpage, unsigned long offset)
+{
+       struct inode     *inode;
+       struct lu_env    *env;
+       struct cl_page   *page;
+       struct cl_object *obj;
+
+       int refcheck;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageWriteback(vmpage));
+
+       /*
+        * It is safe not to check anything in invalidatepage/releasepage
+        * below because they run with the page locked, and all our I/O
+        * happens with the page locked too
+        */
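+       /* offset == 0 means the whole page is being invalidated; a partial
+        * truncate (offset != 0) deliberately leaves the page in the cache,
+        * as described in the function comment above. */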
+       if (offset == 0) {
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       inode = vmpage->mapping->host;
+                       obj = ll_i2info(inode)->lli_clob;
+                       if (obj != NULL) {
+                               page = cl_vmpage_page(vmpage, obj);
+                               if (page != NULL) {
+                                       lu_ref_add(&page->cp_reference,
+                                                  "delete", vmpage);
+                                       cl_page_delete(env, page);
+                                       lu_ref_del(&page->cp_reference,
+                                                  "delete", vmpage);
+                                       cl_page_put(env, page);
+                               }
+                       } else
+                               LASSERT(vmpage->private == 0);
+                       cl_env_put(env, &refcheck);
+               }
+       }
+}
+
+#ifdef HAVE_RELEASEPAGE_WITH_INT
+#define RELEASEPAGE_ARG_TYPE int
+#else
+#define RELEASEPAGE_ARG_TYPE gfp_t
+#endif
+static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask)
+{
+       struct cl_env_nest nest;
+       struct lu_env     *env;
+       struct cl_object  *obj;
+       struct cl_page    *page;
+       struct address_space *mapping;
+       int result;
+
+       LASSERT(PageLocked(vmpage));
+       if (PageWriteback(vmpage) || PageDirty(vmpage))
+               return 0;
+
+       mapping = vmpage->mapping;
+       if (mapping == NULL)
+               return 1;
+
+       obj = ll_i2info(mapping->host)->lli_clob;
+       if (obj == NULL)
+               return 1;
+
+       /* 1 for page allocator, 1 for cl_page and 1 for page cache */
+       if (page_count(vmpage) > 3)
+               return 0;
+
+       /* TODO: determine what gfp should be used by @gfp_mask. */
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               /* If we can't allocate an env we won't call cl_page_put()
+                * later on, which means it is impossible to drop the page
+                * refcount via cl_page, so ask the kernel not to free this
+                * page. */
+               return 0;
+
+       page = cl_vmpage_page(vmpage, obj);
+       result = page == NULL;
+       if (page != NULL) {
+               if (!cl_page_in_use(page)) {
+                       result = 1;
+                       cl_page_delete(env, page);
+               }
+               cl_page_put(env, page);
+       }
+       cl_env_nested_put(&nest, env);
+       return result;
+}
+
+static int ll_set_page_dirty(struct page *vmpage)
+{
+#if 0
+       struct cl_page    *page = vvp_vmpage_page_transient(vmpage);
+       struct vvp_object *obj  = cl_inode2vvp(vmpage->mapping->host);
+       struct vvp_page   *cpg;
+
+       /*
+        * XXX should page method be called here?
+        */
+       LASSERT(&obj->co_cl == page->cp_obj);
+       cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+       /*
+        * XXX cannot do much here, because page is possibly not locked:
+        * sys_munmap()->...
+        *     ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+        */
+       vvp_write_pending(obj, cpg);
+#endif
+       RETURN(__set_page_dirty_nobuffers(vmpage));
+}
+
+#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL)
+
+static inline int ll_get_user_pages(int rw, unsigned long user_addr,
+                                   size_t size, struct page ***pages,
+                                   int *max_pages)
+{
+       int result = -ENOMEM;
+
+       /* set an arbitrary limit to prevent arithmetic overflow */
+       if (size > MAX_DIRECTIO_SIZE) {
+               *pages = NULL;
+               return -EFBIG;
+       }
+
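+       /* Number of pages spanned by [user_addr, user_addr + size): the end
+        * is rounded up and the start rounded down to page boundaries. */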
+       *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       *max_pages -= user_addr >> PAGE_CACHE_SHIFT;
+
+       OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages));
+       if (*pages) {
+               down_read(&current->mm->mmap_sem);
+               result = get_user_pages(current, current->mm, user_addr,
+                                       *max_pages, (rw == READ), 0, *pages,
+                                       NULL);
+               up_read(&current->mm->mmap_sem);
+               if (unlikely(result <= 0))
+                       OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages));
+       }
+
+       return result;
+}
+
+/*  ll_free_user_pages - tear down page struct array
+ *  @pages: array of page struct pointers underlying target buffer */
+static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
+{
+       int i;
+
+       for (i = 0; i < npages; i++) {
+               if (pages[i] == NULL)
+                       break;
+               if (do_dirty)
+                       set_page_dirty_lock(pages[i]);
+               page_cache_release(pages[i]);
+       }
+
+       OBD_FREE_LARGE(pages, npages * sizeof(*pages));
+}
+
+ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
+                          int rw, struct inode *inode,
+                          struct ll_dio_pages *pv)
+{
+       struct cl_page    *clp;
+       struct cl_2queue  *queue;
+       struct cl_object  *obj = io->ci_obj;
+       int i;
+       ssize_t rc = 0;
+       loff_t file_offset  = pv->ldp_start_offset;
+       long size          = pv->ldp_size;
+       int page_count      = pv->ldp_nr;
+       struct page **pages = pv->ldp_pages;
+       long page_size      = cl_page_size(obj);
+       bool do_io;
+       int  io_pages       = 0;
+       ENTRY;
+
+       queue = &io->ci_queue;
+       cl_2queue_init(queue);
+       for (i = 0; i < page_count; i++) {
+               if (pv->ldp_offsets)
+                       file_offset = pv->ldp_offsets[i];
+
+               LASSERT(!(file_offset & (page_size - 1)));
+               clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+                                  pv->ldp_pages[i], CPT_TRANSIENT);
+               if (IS_ERR(clp)) {
+                       rc = PTR_ERR(clp);
+                       break;
+               }
+
+               rc = cl_page_own(env, io, clp);
+               if (rc) {
+                       LASSERT(clp->cp_state == CPS_FREEING);
+                       cl_page_put(env, clp);
+                       break;
+               }
+
+               do_io = true;
+
+               /* Check the page type: if the page is a cacheable host
+                * page, copy the data directly between the user page and
+                * the cache page */
+               if (clp->cp_type == CPT_CACHEABLE) {
+                       struct page *vmpage = cl_page_vmpage(env, clp);
+                       struct page *src_page;
+                       struct page *dst_page;
+                       void       *src;
+                       void       *dst;
+
+                       src_page = (rw == WRITE) ? pages[i] : vmpage;
+                       dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+                       src = ll_kmap_atomic(src_page, KM_USER0);
+                       dst = ll_kmap_atomic(dst_page, KM_USER1);
+                       memcpy(dst, src, min(page_size, size));
+                       ll_kunmap_atomic(dst, KM_USER1);
+                       ll_kunmap_atomic(src, KM_USER0);
+
+                       /* make sure page will be added to the transfer by
+                        * cl_io_submit()->...->vvp_page_prep_write(). */
+                       if (rw == WRITE)
+                               set_page_dirty(vmpage);
+
+                       if (rw == READ) {
+                               /* Do not issue the page for read, since it
+                                * may re-read a ra page which does NOT have
+                                * the uptodate bit set. */
+                               cl_page_disown(env, io, clp);
+                               do_io = false;
+                       }
+               }
+
+               if (likely(do_io)) {
+                       cl_2queue_add(queue, clp);
+
+                       /*
+                        * Set page clip to tell transfer formation engine
+                        * that page has to be sent even if it is beyond KMS.
+                        */
+                       cl_page_clip(env, clp, 0, min(size, page_size));
+
+                       ++io_pages;
+               }
+
+               /* drop the reference count for cl_page_find */
+               cl_page_put(env, clp);
+               size -= page_size;
+               file_offset += page_size;
+       }
+
+       if (rc == 0 && io_pages) {
+               rc = cl_io_submit_sync(env, io,
+                                      rw == READ ? CRT_READ : CRT_WRITE,
+                                      queue, 0);
+       }
+       if (rc == 0)
+               rc = pv->ldp_size;
+
+       cl_2queue_discard(env, io, queue);
+       cl_2queue_disown(env, io, queue);
+       cl_2queue_fini(env, queue);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ll_direct_rw_pages);
+
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+                                  int rw, struct inode *inode,
+                                  struct address_space *mapping,
+                                  size_t size, loff_t file_offset,
+                                  struct page **pages, int page_count)
+{
+       struct ll_dio_pages pvec = { .ldp_pages        = pages,
+                                    .ldp_nr           = page_count,
+                                    .ldp_size         = size,
+                                    .ldp_offsets      = NULL,
+                                    .ldp_start_offset = file_offset
+                                  };
+
+       return ll_direct_rw_pages(env, io, rw, inode, &pvec);
+}
+
+#ifdef KMALLOC_MAX_SIZE
+#define MAX_MALLOC KMALLOC_MAX_SIZE
+#else
+#define MAX_MALLOC (128 * 1024)
+#endif
+
+/* This is the maximum size of a single O_DIRECT request, based on the
+ * kmalloc limit.  We need to fit all of the brw_page structs, each one
+ * representing PAGE_SIZE worth of user data, into a single buffer, and
+ * then truncate this to be a full-sized RPC.  For 4kB PAGE_SIZE this is
+ * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
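+/* Worked arithmetic (assuming sizeof(struct brw_page) is 24 bytes on
+ * 64-bit): 128kB / 24 ~ 5461 brw_page structs, each covering a 4kB page,
+ * i.e. ~21.3MB of data, matching the "up to 22MB" figure above before
+ * rounding down to a DT_MAX_BRW_SIZE multiple. */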
+#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \
+                     ~(DT_MAX_BRW_SIZE - 1))
+static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
+                              const struct iovec *iov, loff_t file_offset,
+                              unsigned long nr_segs)
+{
+       struct lu_env *env;
+       struct cl_io *io;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct ccc_object *obj = cl_inode2ccc(inode);
+       long count = iov_length(iov, nr_segs);
+       long tot_bytes = 0, result = 0;
+       struct ll_inode_info *lli = ll_i2info(inode);
+       unsigned long seg = 0;
+       long size = MAX_DIO_SIZE;
+       int refcheck;
+       ENTRY;
+
+       if (!lli->lli_has_smd)
+               RETURN(-EBADF);
+
+       /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
+       if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), "
+              "offset=%lld=%llx, pages %lu (max %lu)\n",
+              inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
+              file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
+              MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+
+       /* Check that all user buffers are aligned as well */
+       for (seg = 0; seg < nr_segs; seg++) {
+               if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
+                   (iov[seg].iov_len & ~CFS_PAGE_MASK))
+                       RETURN(-EINVAL);
+       }
+
+       env = cl_env_get(&refcheck);
+       LASSERT(!IS_ERR(env));
+       io = ccc_env_io(env)->cui_cl.cis_io;
+       LASSERT(io != NULL);
+
+       /* 0. Need locking between buffered and direct access, and to guard
+        *    against races with size changes from concurrent truncates and
+        *    writes.
+        * 1. Need the inode mutex to operate on transient pages.
+        */
+       if (rw == READ)
+               mutex_lock(&inode->i_mutex);
+
+       LASSERT(obj->cob_transient_pages == 0);
+       for (seg = 0; seg < nr_segs; seg++) {
+               long iov_left = iov[seg].iov_len;
+               unsigned long user_addr = (unsigned long)iov[seg].iov_base;
+
+               if (rw == READ) {
+                       if (file_offset >= i_size_read(inode))
+                               break;
+                       if (file_offset + iov_left > i_size_read(inode))
+                               iov_left = i_size_read(inode) - file_offset;
+               }
+
+               while (iov_left > 0) {
+                       struct page **pages;
+                       int page_count, max_pages = 0;
+                       long bytes;
+
+                       bytes = min(size, iov_left);
+                       page_count = ll_get_user_pages(rw, user_addr, bytes,
+                                                      &pages, &max_pages);
+                       if (likely(page_count > 0)) {
+                               if (unlikely(page_count <  max_pages))
+                                       bytes = page_count << PAGE_CACHE_SHIFT;
+                               result = ll_direct_IO_26_seg(env, io, rw, inode,
+                                                            file->f_mapping,
+                                                            bytes, file_offset,
+                                                            pages, page_count);
+                               ll_free_user_pages(pages, max_pages, rw==READ);
+                       } else if (page_count == 0) {
+                               GOTO(out, result = -EFAULT);
+                       } else {
+                               result = page_count;
+                       }
+                       if (unlikely(result <= 0)) {
+                               /* If we can't allocate a large enough buffer
+                                * for the request, shrink it to a smaller
+                                * PAGE_SIZE multiple and try again.
+                                * We should always be able to kmalloc for a
+                                * page worth of page pointers = 4MB on i386. */
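+                               /* The lower bound below is (pointers per
+                                * page) * PAGE_SIZE: 4MB with 4kB pages and
+                                * 4-byte pointers, 2MB with 8-byte
+                                * pointers. */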
+                               if (result == -ENOMEM &&
+                                   size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
+                                          PAGE_CACHE_SIZE) {
+                                       size = ((((size / 2) - 1) |
+                                                ~CFS_PAGE_MASK) + 1) &
+                                               CFS_PAGE_MASK;
+                                       CDEBUG(D_VFSTRACE,"DIO size now %lu\n",
+                                              size);
+                                       continue;
+                               }
+
+                               GOTO(out, result);
+                       }
+
+                       tot_bytes += result;
+                       file_offset += result;
+                       iov_left -= result;
+                       user_addr += result;
+               }
+       }
+out:
+       LASSERT(obj->cob_transient_pages == 0);
+       if (rw == READ)
+               mutex_unlock(&inode->i_mutex);
+
+       if (tot_bytes > 0) {
+               if (rw == WRITE) {
+                       struct lov_stripe_md *lsm;
+
+                       lsm = ccc_inode_lsm_get(inode);
+                       LASSERT(lsm != NULL);
+                       lov_stripe_lock(lsm);
+                       obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
+                       lov_stripe_unlock(lsm);
+                       ccc_inode_lsm_put(inode, lsm);
+               }
+       }
+
+       cl_env_put(env, &refcheck);
+       RETURN(tot_bytes ? : result);
+}
+
+static int ll_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata)
+{
+       pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+       struct page *page;
+       int rc;
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       ENTRY;
+
+       page = grab_cache_page_write_begin(mapping, index, flags);
+       if (!page)
+               RETURN(-ENOMEM);
+
+       *pagep = page;
+
+       rc = ll_prepare_write(file, page, from, from + len);
+       if (rc) {
+               unlock_page(page);
+               page_cache_release(page);
+       }
+       RETURN(rc);
+}
+
+static int ll_write_end(struct file *file, struct address_space *mapping,
+                       loff_t pos, unsigned len, unsigned copied,
+                       struct page *page, void *fsdata)
+{
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       int rc;
+
+       rc = ll_commit_write(file, page, from, from + copied);
+       unlock_page(page);
+       page_cache_release(page);
+
+       return rc ?: copied;
+}
+
+#ifdef CONFIG_MIGRATION
+int ll_migratepage(struct address_space *mapping,
+                  struct page *newpage, struct page *page,
+                  enum migrate_mode mode)
+{
+       /* Always fail page migration until we have a proper implementation */
+       return -EIO;
+}
+#endif
+
+#ifndef MS_HAS_NEW_AOPS
+struct address_space_operations ll_aops = {
+       .readpage       = ll_readpage,
+//     .readpages      = ll_readpages,
+       .direct_IO      = ll_direct_IO_26,
+       .writepage      = ll_writepage,
+       .writepages     = ll_writepages,
+       .set_page_dirty = ll_set_page_dirty,
+       .write_begin    = ll_write_begin,
+       .write_end      = ll_write_end,
+       .invalidatepage = ll_invalidatepage,
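+       /* The cast presumably papers over the int vs gfp_t argument type
+        * selected by RELEASEPAGE_ARG_TYPE above. */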
+       .releasepage    = (void *)ll_releasepage,
+#ifdef CONFIG_MIGRATION
+       .migratepage    = ll_migratepage,
+#endif
+       .bmap      = NULL
+};
+#else
+struct address_space_operations_ext ll_aops = {
+       .orig_aops.readpage       = ll_readpage,
+//     .orig_aops.readpages      = ll_readpages,
+       .orig_aops.direct_IO      = ll_direct_IO_26,
+       .orig_aops.writepage      = ll_writepage,
+       .orig_aops.writepages     = ll_writepages,
+       .orig_aops.set_page_dirty = ll_set_page_dirty,
+       .orig_aops.prepare_write  = ll_prepare_write,
+       .orig_aops.commit_write   = ll_commit_write,
+       .orig_aops.invalidatepage = ll_invalidatepage,
+       .orig_aops.releasepage    = ll_releasepage,
+#ifdef CONFIG_MIGRATION
+       .orig_aops.migratepage    = ll_migratepage,
+#endif
+       .orig_aops.bmap    = NULL,
+       .write_begin    = ll_write_begin,
+       .write_end      = ll_write_end
+};
+#endif
diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
new file mode 100644 (file)
index 0000000..7747f8f
--- /dev/null
@@ -0,0 +1,1722 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include "llite_internal.h"
+
+#define SA_OMITTED_ENTRY_MAX 8ULL
+
+typedef enum {
+       /** negative values are for error cases */
+       SA_ENTRY_INIT = 0,      /** init entry */
+       SA_ENTRY_SUCC = 1,      /** stat succeed */
+       SA_ENTRY_INVA = 2,      /** invalid entry */
+       SA_ENTRY_DEST = 3,      /** entry to be destroyed */
+} se_stat_t;
+
+struct ll_sa_entry {
+       /* link into sai->sai_entries */
+       struct list_head              se_link;
+       /* link into sai->sai_entries_{received,stated} */
+       struct list_head              se_list;
+       /* link into sai hash table locally */
+       struct list_head              se_hash;
+       /* entry reference count */
+       atomic_t            se_refcount;
+       /* entry index in the sai */
+       __u64              se_index;
+       /* low layer ldlm lock handle */
+       __u64              se_handle;
+       /* entry status */
+       se_stat_t              se_stat;
+       /* entry size, contains name */
+       int                  se_size;
+       /* pointer to async getattr enqueue info */
+       struct md_enqueue_info *se_minfo;
+       /* pointer to the async getattr request */
+       struct ptlrpc_request  *se_req;
+       /* pointer to the target inode */
+       struct inode       *se_inode;
+       /* entry name */
+       struct qstr          se_qstr;
+};
+
+static unsigned int sai_generation;
+static DEFINE_SPINLOCK(sai_generation_lock);
+
+static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry)
+{
+       return list_empty(&entry->se_hash);
+}
+
+/*
+ * The entry can only be released by the caller; it is necessary to hold the
+ * lock.
+ */
+static inline int ll_sa_entry_stated(struct ll_sa_entry *entry)
+{
+       smp_rmb();
+       return (entry->se_stat != SA_ENTRY_INIT);
+}
+
+static inline int ll_sa_entry_hash(int val)
+{
+       return val & LL_SA_CACHE_MASK;
+}
+
+/*
+ * Insert entry to hash SA table.
+ */
+static inline void
+ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+       spin_lock(&sai->sai_cache_lock[i]);
+       list_add_tail(&entry->se_hash, &sai->sai_cache[i]);
+       spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+/*
+ * Remove entry from SA table.
+ */
+static inline void
+ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       int i = ll_sa_entry_hash(entry->se_qstr.hash);
+
+       spin_lock(&sai->sai_cache_lock[i]);
+       list_del_init(&entry->se_hash);
+       spin_unlock(&sai->sai_cache_lock[i]);
+}
+
+static inline int agl_should_run(struct ll_statahead_info *sai,
+                                struct inode *inode)
+{
+       return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid);
+}
+
+static inline struct ll_sa_entry *
+sa_first_received_entry(struct ll_statahead_info *sai)
+{
+       return list_entry(sai->sai_entries_received.next,
+                             struct ll_sa_entry, se_list);
+}
+
+static inline struct ll_inode_info *
+agl_first_entry(struct ll_statahead_info *sai)
+{
+       return list_entry(sai->sai_entries_agl.next,
+                             struct ll_inode_info, lli_agl_list);
+}
+
+static inline int sa_sent_full(struct ll_statahead_info *sai)
+{
+       return atomic_read(&sai->sai_cache_count) >= sai->sai_max;
+}
+
+static inline int sa_received_empty(struct ll_statahead_info *sai)
+{
+       return list_empty(&sai->sai_entries_received);
+}
+
+static inline int agl_list_empty(struct ll_statahead_info *sai)
+{
+       return list_empty(&sai->sai_entries_agl);
+}
+
+/**
+ * The hit rate is considered low when either
+ * (1) the hit ratio is less than 80%, or
+ * (2) there are more than 8 consecutive misses.
+ */
+static inline int sa_low_hit(struct ll_statahead_info *sai)
+{
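+       /* sai_hit < 4 * sai_miss is equivalent to
+        * sai_hit / (sai_hit + sai_miss) < 80%. */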
+       return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) ||
+               (sai->sai_consecutive_miss > 8));
+}
+
+/*
+ * If the given index is behind the statahead window by more than
+ * SA_OMITTED_ENTRY_MAX, then it is old.
+ */
+static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
+{
+       return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX <
+                sai->sai_index);
+}
+
+/*
+ * Insert the new entry at the tail of sai_entries at init time.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
+                 const char *name, int len)
+{
+       struct ll_inode_info *lli;
+       struct ll_sa_entry   *entry;
+       int                entry_size;
+       char             *dname;
+       ENTRY;
+
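+       /* (len & ~3) + 4 rounds len + 1 (the name plus its trailing NUL) up
+        * to the next 4-byte boundary. */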
+       entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
+       OBD_ALLOC(entry, entry_size);
+       if (unlikely(entry == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n",
+              len, name, entry, index);
+
+       entry->se_index = index;
+
+       /*
+        * Statahead entry reference rules:
+        *
+        * 1) When a statahead entry is initialized, its reference count is
+        *    set to 2.  One reference is used by the directory scanner: when
+        *    the scanner searches the statahead cache for the given name, it
+        *    can perform a lockless hash lookup (only the scanner can remove
+        *    an entry from the hash list), and once the entry is found, it
+        *    need not call "atomic_inc()" on the entry reference, which
+        *    improves performance.  After using the statahead entry, the
+        *    scanner calls "atomic_dec()" to drop the reference taken at
+        *    initialization.  If it is the last reference, the statahead
+        *    entry will be freed.
+        *
+        * 2) All other threads, including the statahead thread and ptlrpcd
+        *    threads, must hold a reference on the entry while processing
+        *    it, to guarantee that it will not be released by the directory
+        *    scanner.  After processing the entry, these threads drop their
+        *    entry reference.  If it is the last reference, the entry will
+        *    be freed.
+        *
+        *    The second reference, taken when the statahead entry is
+        *    initialized, is used by the statahead thread, following rule 2).
+        */
+       atomic_set(&entry->se_refcount, 2);
+       entry->se_stat = SA_ENTRY_INIT;
+       entry->se_size = entry_size;
+       dname = (char *)entry + sizeof(struct ll_sa_entry);
+       memcpy(dname, name, len);
+       dname[len] = 0;
+       entry->se_qstr.hash = full_name_hash(name, len);
+       entry->se_qstr.len = len;
+       entry->se_qstr.name = dname;
+
+       lli = ll_i2info(sai->sai_inode);
+       spin_lock(&lli->lli_sa_lock);
+       list_add_tail(&entry->se_link, &sai->sai_entries);
+       INIT_LIST_HEAD(&entry->se_list);
+       ll_sa_entry_enhash(sai, entry);
+       spin_unlock(&lli->lli_sa_lock);
+
+       atomic_inc(&sai->sai_cache_count);
+
+       RETURN(entry);
+}
+
+/*
+ * Used by the directory scanner to search entry with name.
+ *
+ * Only the caller can remove the entry from the hash, so it is unnecessary
+ * to hold the hash lock.  It is the caller's duty to release the init
+ * refcount on the entry, so it is also unnecessary to take an extra
+ * refcount on it.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr)
+{
+       struct ll_sa_entry *entry;
+       int i = ll_sa_entry_hash(qstr->hash);
+
+       list_for_each_entry(entry, &sai->sai_cache[i], se_hash) {
+               if (entry->se_qstr.hash == qstr->hash &&
+                   entry->se_qstr.len == qstr->len &&
+                   memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0)
+                       return entry;
+       }
+       return NULL;
+}
+
+/*
+ * Used by the async getattr request callback to find the entry with the
+ * given index.
+ *
+ * Called under lli_sa_lock to prevent others from changing the list during
+ * the search.  The entry refcount is increased before returning, to
+ * guarantee that the entry cannot be freed by others.
+ */
+static struct ll_sa_entry *
+ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index)
+{
+       struct ll_sa_entry *entry;
+
+       list_for_each_entry(entry, &sai->sai_entries, se_link) {
+               if (entry->se_index == index) {
+                       LASSERT(atomic_read(&entry->se_refcount) > 0);
+                       atomic_inc(&entry->se_refcount);
+                       return entry;
+               }
+               if (entry->se_index > index)
+                       break;
+       }
+       return NULL;
+}
+
+static void ll_sa_entry_cleanup(struct ll_statahead_info *sai,
+                                struct ll_sa_entry *entry)
+{
+       struct md_enqueue_info *minfo = entry->se_minfo;
+       struct ptlrpc_request  *req   = entry->se_req;
+
+       if (minfo) {
+               entry->se_minfo = NULL;
+               ll_intent_release(&minfo->mi_it);
+               iput(minfo->mi_dir);
+               OBD_FREE_PTR(minfo);
+       }
+
+       if (req) {
+               entry->se_req = NULL;
+               ptlrpc_req_finished(req);
+       }
+}
+
+static void ll_sa_entry_put(struct ll_statahead_info *sai,
+                            struct ll_sa_entry *entry)
+{
+       if (atomic_dec_and_test(&entry->se_refcount)) {
+               CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n",
+                      entry->se_qstr.len, entry->se_qstr.name, entry,
+                      entry->se_index);
+
+               LASSERT(list_empty(&entry->se_link));
+               LASSERT(list_empty(&entry->se_list));
+               LASSERT(ll_sa_entry_unhashed(entry));
+
+               ll_sa_entry_cleanup(sai, entry);
+               if (entry->se_inode)
+                       iput(entry->se_inode);
+
+               OBD_FREE(entry, entry->se_size);
+               atomic_dec(&sai->sai_cache_count);
+       }
+}
+
+static inline void
+do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+       LASSERT(!ll_sa_entry_unhashed(entry));
+       LASSERT(!list_empty(&entry->se_link));
+
+       ll_sa_entry_unhash(sai, entry);
+
+       spin_lock(&lli->lli_sa_lock);
+       entry->se_stat = SA_ENTRY_DEST;
+       list_del_init(&entry->se_link);
+       if (likely(!list_empty(&entry->se_list)))
+               list_del_init(&entry->se_list);
+       spin_unlock(&lli->lli_sa_lock);
+
+       ll_sa_entry_put(sai, entry);
+}
+
+/*
+ * Delete it from the sai_entries_stated list at fini time.
+ */
+static void
+ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ll_sa_entry *pos, *next;
+
+       if (entry)
+               do_sa_entry_fini(sai, entry);
+
+       /* drop old entry, only 'scanner' process does this, no need to lock */
+       list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) {
+               if (!is_omitted_entry(sai, pos->se_index))
+                       break;
+               do_sa_entry_fini(sai, pos);
+       }
+}
+
+/*
+ * Called with lli_sa_lock held.
+ */
+static void
+do_sa_entry_to_stated(struct ll_statahead_info *sai,
+                     struct ll_sa_entry *entry, se_stat_t stat)
+{
+       struct ll_sa_entry *se;
+       struct list_head         *pos = &sai->sai_entries_stated;
+
+       if (!list_empty(&entry->se_list))
+               list_del_init(&entry->se_list);
+
+       list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
+               if (se->se_index < entry->se_index) {
+                       pos = &se->se_list;
+                       break;
+               }
+       }
+
+       list_add(&entry->se_list, pos);
+       entry->se_stat = stat;
+}
+
+/*
+ * Move entry to sai_entries_stated and sort with the index.
+ * \retval 1    -- entry to be destroyed.
+ * \retval 0    -- entry is inserted into stated list.
+ */
+static int
+ll_sa_entry_to_stated(struct ll_statahead_info *sai,
+                     struct ll_sa_entry *entry, se_stat_t stat)
+{
+       struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+       int                ret = 1;
+
+       ll_sa_entry_cleanup(sai, entry);
+
+       spin_lock(&lli->lli_sa_lock);
+       if (likely(entry->se_stat != SA_ENTRY_DEST)) {
+               do_sa_entry_to_stated(sai, entry, stat);
+               ret = 0;
+       }
+       spin_unlock(&lli->lli_sa_lock);
+
+       return ret;
+}
+
+/*
+ * Insert inode into the list of sai_entries_agl.
+ */
+static void ll_agl_add(struct ll_statahead_info *sai,
+                      struct inode *inode, int index)
+{
+       struct ll_inode_info *child  = ll_i2info(inode);
+       struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
+       int                added  = 0;
+
+       spin_lock(&child->lli_agl_lock);
+       if (child->lli_agl_index == 0) {
+               child->lli_agl_index = index;
+               spin_unlock(&child->lli_agl_lock);
+
+               LASSERT(list_empty(&child->lli_agl_list));
+
+               igrab(inode);
+               spin_lock(&parent->lli_agl_lock);
+               if (agl_list_empty(sai))
+                       added = 1;
+               list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
+               spin_unlock(&parent->lli_agl_lock);
+       } else {
+               spin_unlock(&child->lli_agl_lock);
+       }
+
+       if (added > 0)
+               wake_up(&sai->sai_agl_thread.t_ctl_waitq);
+}
+
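+/*
+ * Allocate and initialize a statahead info structure: the refcount starts
+ * at 1, a non-zero generation is taken under sai_generation_lock, and the
+ * entry lists, wait queues and per-bucket cache locks are set up.
+ */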
+static struct ll_statahead_info *ll_sai_alloc(void)
+{
+       struct ll_statahead_info *sai;
+       int                    i;
+       ENTRY;
+
+       OBD_ALLOC_PTR(sai);
+       if (!sai)
+               RETURN(NULL);
+
+       atomic_set(&sai->sai_refcount, 1);
+
+       spin_lock(&sai_generation_lock);
+       sai->sai_generation = ++sai_generation;
+       if (unlikely(sai_generation == 0))
+               sai->sai_generation = ++sai_generation;
+       spin_unlock(&sai_generation_lock);
+
+       sai->sai_max = LL_SA_RPC_MIN;
+       sai->sai_index = 1;
+       init_waitqueue_head(&sai->sai_waitq);
+       init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
+       init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);
+
+       INIT_LIST_HEAD(&sai->sai_entries);
+       INIT_LIST_HEAD(&sai->sai_entries_received);
+       INIT_LIST_HEAD(&sai->sai_entries_stated);
+       INIT_LIST_HEAD(&sai->sai_entries_agl);
+
+       for (i = 0; i < LL_SA_CACHE_SIZE; i++) {
+               INIT_LIST_HEAD(&sai->sai_cache[i]);
+               spin_lock_init(&sai->sai_cache_lock[i]);
+       }
+       atomic_set(&sai->sai_cache_count, 0);
+
+       RETURN(sai);
+}
+
+static inline struct ll_statahead_info *
+ll_sai_get(struct ll_statahead_info *sai)
+{
+       atomic_inc(&sai->sai_refcount);
+       return sai;
+}
+
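+/*
+ * Drop a reference on the statahead info. The last put (detected under
+ * lli_sa_lock) detaches it from the directory inode, finalizes any
+ * remaining entries and frees the structure.
+ */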
+static void ll_sai_put(struct ll_statahead_info *sai)
+{
+       struct inode     *inode = sai->sai_inode;
+       struct ll_inode_info *lli   = ll_i2info(inode);
+       ENTRY;
+
+       if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
+               struct ll_sa_entry *entry, *next;
+
+               if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
+                       /* This is a race case: the interpret callback just
+                        * took a reference count. */
+                       spin_unlock(&lli->lli_sa_lock);
+                       RETURN_EXIT;
+               }
+
+               LASSERT(lli->lli_opendir_key == NULL);
+               LASSERT(thread_is_stopped(&sai->sai_thread));
+               LASSERT(thread_is_stopped(&sai->sai_agl_thread));
+
+               lli->lli_sai = NULL;
+               lli->lli_opendir_pid = 0;
+               spin_unlock(&lli->lli_sa_lock);
+
+               if (sai->sai_sent > sai->sai_replied)
+                       CDEBUG(D_READA, "statahead for dir "DFID" does not "
+                             "finish: [sent:"LPU64"] [replied:"LPU64"]\n",
+                             PFID(&lli->lli_fid),
+                             sai->sai_sent, sai->sai_replied);
+
+               list_for_each_entry_safe(entry, next,
+                                            &sai->sai_entries, se_link)
+                       do_sa_entry_fini(sai, entry);
+
+               LASSERT(list_empty(&sai->sai_entries));
+               LASSERT(sa_received_empty(sai));
+               LASSERT(list_empty(&sai->sai_entries_stated));
+
+               LASSERT(atomic_read(&sai->sai_cache_count) == 0);
+               LASSERT(agl_list_empty(sai));
+
+               iput(inode);
+               OBD_FREE_PTR(sai);
+       }
+
+       EXIT;
+}
+
+/* Do NOT forget to drop the inode refcount taken when the inode was put
+ * into sai_entries_agl. */
+static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
+{
+       struct ll_inode_info *lli   = ll_i2info(inode);
+       __u64            index = lli->lli_agl_index;
+       int                rc;
+       ENTRY;
+
+       LASSERT(list_empty(&lli->lli_agl_list));
+
+       /* AGL may fall behind statahead by one entry */
+       if (is_omitted_entry(sai, index + 1)) {
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       /* Someone is in glimpse (sync or async), do nothing. */
+       rc = down_write_trylock(&lli->lli_glimpse_sem);
+       if (rc == 0) {
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       /*
+        * Someone triggered a glimpse within the last second.
+        * 1) The former glimpse succeeded with a glimpse lock granted by the
+        *    OST; if the lock is still cached on the client, AGL needs to do
+        *    nothing. If it was cancelled by another client, AGL may not be
+        *    able to obtain a new lock, since no glimpse callback is
+        *    triggered by AGL.
+        * 2) The former glimpse succeeded, but the OST did not grant a
+        *    glimpse lock. In that case, it is quite possible that the OST
+        *    will not grant a glimpse lock for AGL either.
+        * 3) The former glimpse failed; compared with the other two cases,
+        *    this is relatively rare. AGL can ignore it without much effect
+        *    on performance.
+        */
+       if (lli->lli_glimpse_time != 0 &&
+           cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
+               up_write(&lli->lli_glimpse_sem);
+               lli->lli_agl_index = 0;
+               iput(inode);
+               RETURN_EXIT;
+       }
+
+       CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
+              DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index);
+
+       cl_agl(inode);
+       lli->lli_agl_index = 0;
+       lli->lli_glimpse_time = cfs_time_current();
+       up_write(&lli->lli_glimpse_sem);
+
+       CDEBUG(D_READA, "Handled (init) async glimpse: inode = "
+              DFID", idx = "LPU64", rc = %d\n",
+              PFID(&lli->lli_fid), index, rc);
+
+       iput(inode);
+
+       EXIT;
+}
+
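+/*
+ * Process one entry from sai_entries_received: parse the MDT reply body,
+ * instantiate or revalidate the child inode, queue it for AGL if needed,
+ * and finally move the entry to the stated list.
+ */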
+static void ll_post_statahead(struct ll_statahead_info *sai)
+{
+       struct inode       *dir   = sai->sai_inode;
+       struct inode       *child;
+       struct ll_inode_info   *lli   = ll_i2info(dir);
+       struct ll_sa_entry     *entry;
+       struct md_enqueue_info *minfo;
+       struct lookup_intent   *it;
+       struct ptlrpc_request  *req;
+       struct mdt_body *body;
+       int                  rc    = 0;
+       ENTRY;
+
+       spin_lock(&lli->lli_sa_lock);
+       if (unlikely(sa_received_empty(sai))) {
+               spin_unlock(&lli->lli_sa_lock);
+               RETURN_EXIT;
+       }
+       entry = sa_first_received_entry(sai);
+       atomic_inc(&entry->se_refcount);
+       list_del_init(&entry->se_list);
+       spin_unlock(&lli->lli_sa_lock);
+
+       LASSERT(entry->se_handle != 0);
+
+       minfo = entry->se_minfo;
+       it = &minfo->mi_it;
+       req = entry->se_req;
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       child = entry->se_inode;
+       if (child == NULL) {
+               /*
+                * lookup.
+                */
+               LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
+
+               /* XXX: No fid in the reply, this is probably a cross-ref
+                * case. SA can't handle it yet. */
+               if (body->valid & OBD_MD_MDS)
+                       GOTO(out, rc = -EAGAIN);
+       } else {
+               /*
+                * revalidate.
+                */
+               /* unlinked and re-created with the same name */
+               if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))) {
+                       entry->se_inode = NULL;
+                       iput(child);
+                       child = NULL;
+               }
+       }
+
+       it->d.lustre.it_lock_handle = entry->se_handle;
+       rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
+       if (rc != 1)
+               GOTO(out, rc = -EAGAIN);
+
+       rc = ll_prep_inode(&child, req, dir->i_sb, it);
+       if (rc)
+               GOTO(out, rc);
+
+       CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
+              child, child->i_ino, child->i_generation);
+       ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);
+
+       entry->se_inode = child;
+
+       if (agl_should_run(sai, child))
+               ll_agl_add(sai, child, entry->se_index);
+
+       EXIT;
+
+out:
+       /* "ll_sa_entry_to_stated()" drops the related ldlm ibits lock
+        * reference count by calling "ll_intent_drop_lock()", regardless of
+        * whether the above operations succeeded or failed. Do not worry
+        * about calling "ll_intent_drop_lock()" more than once. */
+       rc = ll_sa_entry_to_stated(sai, entry,
+                                  rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+       if (rc == 0 && entry->se_index == sai->sai_index_wait)
+               wake_up(&sai->sai_waitq);
+       ll_sa_entry_put(sai, entry);
+}
+
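+/*
+ * Completion callback for the async getattr RPC: under lli_sa_lock, match
+ * the reply against the current statahead generation; on success park the
+ * reply on sai_entries_received and wake the statahead thread, on failure
+ * mark the entry invalid.
+ */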
+static int ll_statahead_interpret(struct ptlrpc_request *req,
+                                 struct md_enqueue_info *minfo, int rc)
+{
+       struct lookup_intent     *it  = &minfo->mi_it;
+       struct inode         *dir = minfo->mi_dir;
+       struct ll_inode_info     *lli = ll_i2info(dir);
+       struct ll_statahead_info *sai = NULL;
+       struct ll_sa_entry       *entry;
+       int                    wakeup;
+       ENTRY;
+
+       if (it_disposition(it, DISP_LOOKUP_NEG))
+               rc = -ENOENT;
+
+       spin_lock(&lli->lli_sa_lock);
+       /* stale entry */
+       if (unlikely(lli->lli_sai == NULL ||
+                    lli->lli_sai->sai_generation != minfo->mi_generation)) {
+               spin_unlock(&lli->lli_sa_lock);
+               GOTO(out, rc = -ESTALE);
+       } else {
+               sai = ll_sai_get(lli->lli_sai);
+               if (unlikely(!thread_is_running(&sai->sai_thread))) {
+                       sai->sai_replied++;
+                       spin_unlock(&lli->lli_sa_lock);
+                       GOTO(out, rc = -EBADFD);
+               }
+
+               entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
+               if (entry == NULL) {
+                       sai->sai_replied++;
+                       spin_unlock(&lli->lli_sa_lock);
+                       GOTO(out, rc = -EIDRM);
+               }
+
+               if (rc != 0) {
+                       do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
+                       wakeup = (entry->se_index == sai->sai_index_wait);
+               } else {
+                       entry->se_minfo = minfo;
+                       entry->se_req = ptlrpc_request_addref(req);
+                       /* Release the async ibits lock ASAP to avoid deadlock
+                        * when the statahead thread enqueues a lock on the
+                        * parent for readpage while another thread enqueues a
+                        * lock on the child with the parent's lock held, for
+                        * example: unlink. */
+                       entry->se_handle = it->d.lustre.it_lock_handle;
+                       ll_intent_drop_lock(it);
+                       wakeup = sa_received_empty(sai);
+                       list_add_tail(&entry->se_list,
+                                         &sai->sai_entries_received);
+               }
+               sai->sai_replied++;
+               spin_unlock(&lli->lli_sa_lock);
+
+               ll_sa_entry_put(sai, entry);
+               if (wakeup)
+                       wake_up(&sai->sai_thread.t_ctl_waitq);
+       }
+
+       EXIT;
+
+out:
+       if (rc != 0) {
+               ll_intent_release(it);
+               iput(dir);
+               OBD_FREE_PTR(minfo);
+       }
+       if (sai != NULL)
+               ll_sai_put(sai);
+       return rc;
+}
+
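+/* Release the references and memory taken by sa_args_init(). */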
+static void sa_args_fini(struct md_enqueue_info *minfo,
+                        struct ldlm_enqueue_info *einfo)
+{
+       LASSERT(minfo && einfo);
+       iput(minfo->mi_dir);
+       capa_put(minfo->mi_data.op_capa1);
+       capa_put(minfo->mi_data.op_capa2);
+       OBD_FREE_PTR(minfo);
+       OBD_FREE_PTR(einfo);
+}
+
+/**
+ * There is a race condition between "capa_put" and "ll_statahead_interpret"
+ * when accessing "op_data.op_capa[1,2]": "capa_put" releases the
+ * "op_data.op_capa[1,2]" reference counts after calling
+ * "md_intent_getattr_async", but "ll_statahead_interpret" may run first and
+ * fill "op_data.op_capa[1,2]" with POISON, causing "capa_put" to access an
+ * invalid "ocapa". So reserve "op_data.op_capa[1,2]" in "pcapa" here before
+ * calling "md_intent_getattr_async".
+ */
+static int sa_args_init(struct inode *dir, struct inode *child,
+                       struct ll_sa_entry *entry, struct md_enqueue_info **pmi,
+                       struct ldlm_enqueue_info **pei,
+                       struct obd_capa **pcapa)
+{
+       struct qstr           *qstr = &entry->se_qstr;
+       struct ll_inode_info     *lli  = ll_i2info(dir);
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct md_op_data       *op_data;
+
+       OBD_ALLOC_PTR(einfo);
+       if (einfo == NULL)
+               return -ENOMEM;
+
+       OBD_ALLOC_PTR(minfo);
+       if (minfo == NULL) {
+               OBD_FREE_PTR(einfo);
+               return -ENOMEM;
+       }
+
+       op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name,
+                                    qstr->len, 0, LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data)) {
+               OBD_FREE_PTR(einfo);
+               OBD_FREE_PTR(minfo);
+               return PTR_ERR(op_data);
+       }
+
+       minfo->mi_it.it_op = IT_GETATTR;
+       minfo->mi_dir = igrab(dir);
+       minfo->mi_cb = ll_statahead_interpret;
+       minfo->mi_generation = lli->lli_sai->sai_generation;
+       minfo->mi_cbdata = entry->se_index;
+
+       einfo->ei_type   = LDLM_IBITS;
+       einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
+       einfo->ei_cb_bl  = ll_md_blocking_ast;
+       einfo->ei_cb_cp  = ldlm_completion_ast;
+       einfo->ei_cb_gl  = NULL;
+       einfo->ei_cbdata = NULL;
+
+       *pmi = minfo;
+       *pei = einfo;
+       pcapa[0] = op_data->op_capa1;
+       pcapa[1] = op_data->op_capa2;
+
+       return 0;
+}
+
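+/* Issue the async getattr for a name with no cached dentry (lookup case). */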
+static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry)
+{
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct obd_capa   *capas[2];
+       int                    rc;
+       ENTRY;
+
+       rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas);
+       if (rc)
+               RETURN(rc);
+
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+       if (!rc) {
+               capa_put(capas[0]);
+               capa_put(capas[1]);
+       } else {
+               sa_args_fini(minfo, einfo);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Similar to ll_revalidate_it().
+ * \retval      1 -- dentry valid
+ * \retval      0 -- will send stat-ahead request
+ * \retval others -- preparing the stat-ahead request failed
+ */
+static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
+                           struct dentry *dentry)
+{
+       struct inode         *inode = dentry->d_inode;
+       struct lookup_intent      it = { .it_op = IT_GETATTR,
+                                        .d.lustre.it_lock_handle = 0 };
+       struct md_enqueue_info   *minfo;
+       struct ldlm_enqueue_info *einfo;
+       struct obd_capa   *capas[2];
+       int rc;
+       ENTRY;
+
+       if (unlikely(inode == NULL))
+               RETURN(1);
+
+       if (d_mountpoint(dentry))
+               RETURN(1);
+
+       if (unlikely(dentry == dentry->d_sb->s_root))
+               RETURN(1);
+
+       entry->se_inode = igrab(inode);
+       rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),
+                               NULL);
+       if (rc == 1) {
+               entry->se_handle = it.d.lustre.it_lock_handle;
+               ll_intent_release(&it);
+               RETURN(1);
+       }
+
+       rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
+       if (rc) {
+               entry->se_inode = NULL;
+               iput(inode);
+               RETURN(rc);
+       }
+
+       rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+       if (!rc) {
+               capa_put(capas[0]);
+               capa_put(capas[1]);
+       } else {
+               entry->se_inode = NULL;
+               iput(inode);
+               sa_args_fini(minfo, einfo);
+       }
+
+       RETURN(rc);
+}
+
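+/*
+ * Prepare and send one statahead request for the given name: allocate the
+ * entry, then either do an async lookup (no cached dentry) or revalidate
+ * the cached one, and advance sai_index.
+ */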
+static void ll_statahead_one(struct dentry *parent, const char *entry_name,
+                            int entry_name_len)
+{
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *lli    = ll_i2info(dir);
+       struct ll_statahead_info *sai    = lli->lli_sai;
+       struct dentry       *dentry = NULL;
+       struct ll_sa_entry       *entry;
+       int                    rc;
+       int                    rc1;
+       ENTRY;
+
+       entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
+                                 entry_name_len);
+       if (IS_ERR(entry))
+               RETURN_EXIT;
+
+       dentry = d_lookup(parent, &entry->se_qstr);
+       if (!dentry) {
+               rc = do_sa_lookup(dir, entry);
+       } else {
+               rc = do_sa_revalidate(dir, entry, dentry);
+               if (rc == 1 && agl_should_run(sai, dentry->d_inode))
+                       ll_agl_add(sai, dentry->d_inode, entry->se_index);
+       }
+
+       if (dentry != NULL)
+               dput(dentry);
+
+       if (rc) {
+               rc1 = ll_sa_entry_to_stated(sai, entry,
+                                       rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
+               if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
+                       wake_up(&sai->sai_waitq);
+       } else {
+               sai->sai_sent++;
+       }
+
+       sai->sai_index++;
+       /* drop one refcount on entry by ll_sa_entry_alloc */
+       ll_sa_entry_put(sai, entry);
+
+       EXIT;
+}
+
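+/*
+ * The AGL (asynchronous glimpse lock) thread: pull inodes off
+ * sai_entries_agl and trigger a glimpse on each until told to stop, then
+ * drain the list, dropping the inode references.
+ */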
+static int ll_agl_thread(void *arg)
+{
+       struct dentry       *parent = (struct dentry *)arg;
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *plli   = ll_i2info(dir);
+       struct ll_inode_info     *clli;
+       struct ll_sb_info       *sbi    = ll_i2sbi(dir);
+       struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+       struct ptlrpc_thread     *thread = &sai->sai_agl_thread;
+       struct l_wait_info      lwi    = { 0 };
+       ENTRY;
+
+       CDEBUG(D_READA, "agl thread started: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       atomic_inc(&sbi->ll_agl_total);
+       spin_lock(&plli->lli_agl_lock);
+       sai->sai_agl_valid = 1;
+       thread_set_flags(thread, SVC_RUNNING);
+       spin_unlock(&plli->lli_agl_lock);
+       wake_up(&thread->t_ctl_waitq);
+
+       while (1) {
+               l_wait_event(thread->t_ctl_waitq,
+                            !agl_list_empty(sai) ||
+                            !thread_is_running(thread),
+                            &lwi);
+
+               if (!thread_is_running(thread))
+                       break;
+
+               spin_lock(&plli->lli_agl_lock);
+               /* The statahead thread may help process AGL entries,
+                * so check again whether the list is empty. */
+               if (!agl_list_empty(sai)) {
+                       clli = agl_first_entry(sai);
+                       list_del_init(&clli->lli_agl_list);
+                       spin_unlock(&plli->lli_agl_lock);
+                       ll_agl_trigger(&clli->lli_vfs_inode, sai);
+               } else {
+                       spin_unlock(&plli->lli_agl_lock);
+               }
+       }
+
+       spin_lock(&plli->lli_agl_lock);
+       sai->sai_agl_valid = 0;
+       while (!agl_list_empty(sai)) {
+               clli = agl_first_entry(sai);
+               list_del_init(&clli->lli_agl_list);
+               spin_unlock(&plli->lli_agl_lock);
+               clli->lli_agl_index = 0;
+               iput(&clli->lli_vfs_inode);
+               spin_lock(&plli->lli_agl_lock);
+       }
+       thread_set_flags(thread, SVC_STOPPED);
+       spin_unlock(&plli->lli_agl_lock);
+       wake_up(&thread->t_ctl_waitq);
+       ll_sai_put(sai);
+       CDEBUG(D_READA, "agl thread stopped: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+       RETURN(0);
+}
+
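+/* Start the AGL thread and wait until it is running (or failed to start). */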
+static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai)
+{
+       struct ptlrpc_thread *thread = &sai->sai_agl_thread;
+       struct l_wait_info    lwi    = { 0 };
+       struct ll_inode_info  *plli;
+       task_t        *task;
+       ENTRY;
+
+       CDEBUG(D_READA, "start agl thread: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       plli = ll_i2info(parent->d_inode);
+       task = kthread_run(ll_agl_thread, parent,
+                              "ll_agl_%u", plli->lli_opendir_pid);
+       if (IS_ERR(task)) {
+               CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task));
+               thread_set_flags(thread, SVC_STOPPED);
+               RETURN_EXIT;
+       }
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+       EXIT;
+}
+
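+/*
+ * The statahead thread: walk the directory pages, issuing one async getattr
+ * per entry (skipping ".", ".." and, unless sai_ls_all is set, hidden
+ * entries), interleaving reply processing and AGL work, until the end of
+ * the directory is reached or the thread is stopped.
+ */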
+static int ll_statahead_thread(void *arg)
+{
+       struct dentry       *parent = (struct dentry *)arg;
+       struct inode         *dir    = parent->d_inode;
+       struct ll_inode_info     *plli   = ll_i2info(dir);
+       struct ll_inode_info     *clli;
+       struct ll_sb_info       *sbi    = ll_i2sbi(dir);
+       struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
+       struct ptlrpc_thread     *thread = &sai->sai_thread;
+       struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
+       struct page           *page;
+       __u64                pos    = 0;
+       int                    first  = 0;
+       int                    rc     = 0;
+       struct ll_dir_chain       chain;
+       struct l_wait_info      lwi    = { 0 };
+       ENTRY;
+
+       CDEBUG(D_READA, "statahead thread started: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       if (sbi->ll_flags & LL_SBI_AGL_ENABLED)
+               ll_start_agl(parent, sai);
+
+       atomic_inc(&sbi->ll_sa_total);
+       spin_lock(&plli->lli_sa_lock);
+       thread_set_flags(thread, SVC_RUNNING);
+       spin_unlock(&plli->lli_sa_lock);
+       wake_up(&thread->t_ctl_waitq);
+
+       ll_dir_chain_init(&chain);
+       page = ll_get_dir_page(dir, pos, &chain);
+
+       while (1) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (IS_ERR(page)) {
+                       rc = PTR_ERR(page);
+                       CDEBUG(D_READA, "error reading dir "DFID" at "LPU64
+                              "/"LPU64": [rc %d] [parent %u]\n",
+                              PFID(ll_inode2fid(dir)), pos, sai->sai_index,
+                              rc, plli->lli_opendir_pid);
+                       GOTO(out, rc);
+               }
+
+               dp = page_address(page);
+               for (ent = lu_dirent_start(dp); ent != NULL;
+                    ent = lu_dirent_next(ent)) {
+                       __u64 hash;
+                       int namelen;
+                       char *name;
+
+                       hash = le64_to_cpu(ent->lde_hash);
+                       if (unlikely(hash < pos))
+                               /*
+                                * Skip until we find the target hash value.
+                                */
+                               continue;
+
+                       namelen = le16_to_cpu(ent->lde_namelen);
+                       if (unlikely(namelen == 0))
+                               /*
+                                * Skip dummy record.
+                                */
+                               continue;
+
+                       name = ent->lde_name;
+                       if (name[0] == '.') {
+                               if (namelen == 1) {
+                                       /*
+                                        * skip "."
+                                        */
+                                       continue;
+                               } else if (name[1] == '.' && namelen == 2) {
+                                       /*
+                                        * skip ".."
+                                        */
+                                       continue;
+                               } else if (!sai->sai_ls_all) {
+                                       /*
+                                        * skip hidden files.
+                                        */
+                                       sai->sai_skip_hidden++;
+                                       continue;
+                               }
+                       }
+
+                       /*
+                        * Don't stat-ahead the first entry.
+                        */
+                       if (unlikely(++first == 1))
+                               continue;
+
+keep_it:
+                       l_wait_event(thread->t_ctl_waitq,
+                                    !sa_sent_full(sai) ||
+                                    !sa_received_empty(sai) ||
+                                    !agl_list_empty(sai) ||
+                                    !thread_is_running(thread),
+                                    &lwi);
+
+interpret_it:
+                       while (!sa_received_empty(sai))
+                               ll_post_statahead(sai);
+
+                       if (unlikely(!thread_is_running(thread))) {
+                               ll_release_page(page, 0);
+                               GOTO(out, rc = 0);
+                       }
+
+                       /* If there is no window for metadata statahead, but
+                        * there are some AGL entries to be triggered, then
+                        * try to help process the AGL entries. */
+                       if (sa_sent_full(sai)) {
+                               spin_lock(&plli->lli_agl_lock);
+                               while (!agl_list_empty(sai)) {
+                                       clli = agl_first_entry(sai);
+                                       list_del_init(&clli->lli_agl_list);
+                                       spin_unlock(&plli->lli_agl_lock);
+                                       ll_agl_trigger(&clli->lli_vfs_inode,
+                                                      sai);
+
+                                       if (!sa_received_empty(sai))
+                                               goto interpret_it;
+
+                                       if (unlikely(
+                                               !thread_is_running(thread))) {
+                                               ll_release_page(page, 0);
+                                               GOTO(out, rc = 0);
+                                       }
+
+                                       if (!sa_sent_full(sai))
+                                               goto do_it;
+
+                                       spin_lock(&plli->lli_agl_lock);
+                               }
+                               spin_unlock(&plli->lli_agl_lock);
+
+                               goto keep_it;
+                       }
+
+do_it:
+                       ll_statahead_one(parent, name, namelen);
+               }
+               pos = le64_to_cpu(dp->ldp_hash_end);
+               if (pos == MDS_DIR_END_OFF) {
+                       /*
+                        * End of directory reached.
+                        */
+                       ll_release_page(page, 0);
+                       while (1) {
+                               l_wait_event(thread->t_ctl_waitq,
+                                            !sa_received_empty(sai) ||
+                                            sai->sai_sent == sai->sai_replied ||
+                                            !thread_is_running(thread),
+                                            &lwi);
+
+                               while (!sa_received_empty(sai))
+                                       ll_post_statahead(sai);
+
+                               if (unlikely(!thread_is_running(thread)))
+                                       GOTO(out, rc = 0);
+
+                               if (sai->sai_sent == sai->sai_replied &&
+                                   sa_received_empty(sai))
+                                       break;
+                       }
+
+                       spin_lock(&plli->lli_agl_lock);
+                       while (!agl_list_empty(sai) &&
+                              thread_is_running(thread)) {
+                               clli = agl_first_entry(sai);
+                               list_del_init(&clli->lli_agl_list);
+                               spin_unlock(&plli->lli_agl_lock);
+                               ll_agl_trigger(&clli->lli_vfs_inode, sai);
+                               spin_lock(&plli->lli_agl_lock);
+                       }
+                       spin_unlock(&plli->lli_agl_lock);
+
+                       GOTO(out, rc = 0);
+               } else if (1) {
+                       /*
+                        * chain is exhausted.
+                        * Normal case: continue to the next page.
+                        */
+                       ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+                                             LDF_COLLIDE);
+                       sai->sai_in_readpage = 1;
+                       page = ll_get_dir_page(dir, pos, &chain);
+                       sai->sai_in_readpage = 0;
+               } else {
+                       LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                       ll_release_page(page, 1);
+                       /*
+                        * go into overflow page.
+                        */
+               }
+       }
+       EXIT;
+
+out:
+       if (sai->sai_agl_valid) {
+               spin_lock(&plli->lli_agl_lock);
+               thread_set_flags(agl_thread, SVC_STOPPING);
+               spin_unlock(&plli->lli_agl_lock);
+               wake_up(&agl_thread->t_ctl_waitq);
+
+               CDEBUG(D_READA, "stop agl thread: [pid %d]\n",
+                      current_pid());
+               l_wait_event(agl_thread->t_ctl_waitq,
+                            thread_is_stopped(agl_thread),
+                            &lwi);
+       } else {
+               /* Set agl_thread flags anyway. */
+               thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+       }
+       ll_dir_chain_fini(&chain);
+       spin_lock(&plli->lli_sa_lock);
+       if (!sa_received_empty(sai)) {
+               thread_set_flags(thread, SVC_STOPPING);
+               spin_unlock(&plli->lli_sa_lock);
+
+               /* Release the resources held by received entries. */
+               while (!sa_received_empty(sai))
+                       ll_post_statahead(sai);
+
+               spin_lock(&plli->lli_sa_lock);
+       }
+       thread_set_flags(thread, SVC_STOPPED);
+       spin_unlock(&plli->lli_sa_lock);
+       wake_up(&sai->sai_waitq);
+       wake_up(&thread->t_ctl_waitq);
+       ll_sai_put(sai);
+       CDEBUG(D_READA, "statahead thread stopped: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+       dput(parent);
+       return rc;
+}
+
+/**
+ * called in ll_file_release().
+ */
+void ll_stop_statahead(struct inode *dir, void *key)
+{
+       struct ll_inode_info *lli = ll_i2info(dir);
+
+       if (unlikely(key == NULL))
+               return;
+
+       spin_lock(&lli->lli_sa_lock);
+       if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
+               spin_unlock(&lli->lli_sa_lock);
+               return;
+       }
+
+       lli->lli_opendir_key = NULL;
+
+       if (lli->lli_sai) {
+               struct l_wait_info lwi = { 0 };
+               struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;
+
+               if (!thread_is_stopped(thread)) {
+                       thread_set_flags(thread, SVC_STOPPING);
+                       spin_unlock(&lli->lli_sa_lock);
+                       wake_up(&thread->t_ctl_waitq);
+
+                       CDEBUG(D_READA, "stop statahead thread: [pid %d]\n",
+                              current_pid());
+                       l_wait_event(thread->t_ctl_waitq,
+                                    thread_is_stopped(thread),
+                                    &lwi);
+               } else {
+                       spin_unlock(&lli->lli_sa_lock);
+               }
+
+               /*
+                * Put the ref held since the first statahead_enter.
+                * It may not be the last ref, since some statahead
+                * requests may still be in flight.
+                */
+               ll_sai_put(lli->lli_sai);
+       } else {
+               lli->lli_opendir_pid = 0;
+               spin_unlock(&lli->lli_sa_lock);
+       }
+}
+
+enum {
+       /**
+        * not the first dirent, or it is "."
+        */
+       LS_NONE_FIRST_DE = 0,
+       /**
+        * the first non-hidden dirent
+        */
+       LS_FIRST_DE,
+       /**
+        * the first hidden dirent, i.e. one starting with "."
+        */
+       LS_FIRST_DOT_DE
+};
+
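+/*
+ * Scan the directory to decide whether the given dentry is its first
+ * entry; the result determines whether statahead is started at all and
+ * whether hidden entries are included (LS_FIRST_DOT_DE).
+ */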
+static int is_first_dirent(struct inode *dir, struct dentry *dentry)
+{
+       struct ll_dir_chain   chain;
+       struct qstr       *target = &dentry->d_name;
+       struct page       *page;
+       __u64            pos    = 0;
+       int                dot_de;
+       int                rc     = LS_NONE_FIRST_DE;
+       ENTRY;
+
+       ll_dir_chain_init(&chain);
+       page = ll_get_dir_page(dir, pos, &chain);
+
+       while (1) {
+               struct lu_dirpage *dp;
+               struct lu_dirent  *ent;
+
+               if (IS_ERR(page)) {
+                       struct ll_inode_info *lli = ll_i2info(dir);
+
+                       rc = PTR_ERR(page);
+                       CERROR("error reading dir "DFID" at "LPU64": "
+                              "[rc %d] [parent %u]\n",
+                              PFID(ll_inode2fid(dir)), pos,
+                              rc, lli->lli_opendir_pid);
+                       break;
+               }
+
+               dp = page_address(page);
+               for (ent = lu_dirent_start(dp); ent != NULL;
+                    ent = lu_dirent_next(ent)) {
+                       __u64 hash;
+                       int namelen;
+                       char *name;
+
+                       hash = le64_to_cpu(ent->lde_hash);
+                       /* ll_get_dir_page() can return any page containing
+                        * the given hash, which may not be the start hash. */
+                       if (unlikely(hash < pos))
+                               continue;
+
+                       namelen = le16_to_cpu(ent->lde_namelen);
+                       if (unlikely(namelen == 0))
+                               /*
+                                * skip dummy record.
+                                */
+                               continue;
+
+                       name = ent->lde_name;
+                       if (name[0] == '.') {
+                               if (namelen == 1)
+                                       /*
+                                        * skip "."
+                                        */
+                                       continue;
+                               else if (name[1] == '.' && namelen == 2)
+                                       /*
+                                        * skip ".."
+                                        */
+                                       continue;
+                               else
+                                       dot_de = 1;
+                       } else {
+                               dot_de = 0;
+                       }
+
+                       if (dot_de && target->name[0] != '.') {
+                               CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
+                                      target->len, target->name,
+                                      namelen, name);
+                               continue;
+                       }
+
+                       if (target->len != namelen ||
+                           memcmp(target->name, name, namelen) != 0)
+                               rc = LS_NONE_FIRST_DE;
+                       else if (!dot_de)
+                               rc = LS_FIRST_DE;
+                       else
+                               rc = LS_FIRST_DOT_DE;
+
+                       ll_release_page(page, 0);
+                       GOTO(out, rc);
+               }
+               pos = le64_to_cpu(dp->ldp_hash_end);
+               if (pos == MDS_DIR_END_OFF) {
+                       /*
+                        * End of directory reached.
+                        */
+                       ll_release_page(page, 0);
+                       break;
+               } else if (1) {
+                       /*
+                        * chain is exhausted
+                        * Normal case: continue to the next page.
+                        */
+                       ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
+                                             LDF_COLLIDE);
+                       page = ll_get_dir_page(dir, pos, &chain);
+               } else {
+                       /*
+                        * go into overflow page.
+                        */
+                       LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
+                       ll_release_page(page, 1);
+               }
+       }
+       EXIT;
+
+out:
+       ll_dir_chain_fini(&chain);
+       return rc;
+}
+
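+/*
+ * Account a statahead hit or miss for this entry: grow the statahead
+ * window on a hit, and stop the statahead thread when the hit ratio
+ * drops too low.
+ */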
+static void
+ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
+{
+       struct ptlrpc_thread *thread = &sai->sai_thread;
+       struct ll_sb_info    *sbi    = ll_i2sbi(sai->sai_inode);
+       int                hit;
+       ENTRY;
+
+       if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC)
+               hit = 1;
+       else
+               hit = 0;
+
+       ll_sa_entry_fini(sai, entry);
+       if (hit) {
+               sai->sai_hit++;
+               sai->sai_consecutive_miss = 0;
+               sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max);
+       } else {
+               struct ll_inode_info *lli = ll_i2info(sai->sai_inode);
+
+               sai->sai_miss++;
+               sai->sai_consecutive_miss++;
+               if (sa_low_hit(sai) && thread_is_running(thread)) {
+                       atomic_inc(&sbi->ll_sa_wrong);
+                       CDEBUG(D_READA, "Statahead for dir "DFID" hit "
+                              "ratio too low: hit/miss "LPU64"/"LPU64
+                              ", sent/replied "LPU64"/"LPU64", stopping "
+                              "statahead thread: pid %d\n",
+                              PFID(&lli->lli_fid), sai->sai_hit,
+                              sai->sai_miss, sai->sai_sent,
+                              sai->sai_replied, current_pid());
+                       spin_lock(&lli->lli_sa_lock);
+                       if (!thread_is_stopped(thread))
+                               thread_set_flags(thread, SVC_STOPPING);
+                       spin_unlock(&lli->lli_sa_lock);
+               }
+       }
+
+       if (!thread_is_stopped(thread))
+               wake_up(&thread->t_ctl_waitq);
+
+       EXIT;
+}
+
+/**
+ * Start the statahead thread if this is the first dir entry.
+ * Otherwise, if a thread was started already, wait until it is ahead of me.
+ * \retval 1       -- found entry with lock in cache, the caller needs to do
+ *                 nothing.
+ * \retval 0       -- found entry in cache but without lock, the caller needs
+ *                 to refresh it from the MDS.
+ * \retval others  -- the caller needs to process as non-statahead.
+ */
+int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
+                      int only_unplug)
+{
+       struct ll_inode_info     *lli   = ll_i2info(dir);
+       struct ll_statahead_info *sai   = lli->lli_sai;
+       struct dentry       *parent;
+       struct ll_sa_entry       *entry;
+       struct ptlrpc_thread     *thread;
+       struct l_wait_info      lwi   = { 0 };
+       int                    rc    = 0;
+       struct ll_inode_info     *plli;
+       ENTRY;
+
+       LASSERT(lli->lli_opendir_pid == current_pid());
+
+       if (sai) {
+               thread = &sai->sai_thread;
+               if (unlikely(thread_is_stopped(thread) &&
+                            list_empty(&sai->sai_entries_stated))) {
+                       /* to release resource */
+                       ll_stop_statahead(dir, lli->lli_opendir_key);
+                       RETURN(-EAGAIN);
+               }
+
+               if ((*dentryp)->d_name.name[0] == '.') {
+                       if (sai->sai_ls_all ||
+                           sai->sai_miss_hidden >= sai->sai_skip_hidden) {
+                               /*
+                                * The hidden dentry is the first one, or the
+                                * statahead thread did not skip this many
+                                * hidden dentries before "sai_ls_all" was
+                                * enabled as below.
+                                */
+                       } else {
+                               if (!sai->sai_ls_all)
+                                       /*
+                                        * Maybe the hidden dentry was not
+                                        * the first one and "sai_ls_all" was
+                                        * not set, so "ls -al" missed it.
+                                        * Enable "sai_ls_all" for this case.
+                                        */
+                                       sai->sai_ls_all = 1;
+
+                               /*
+                                * Such a "getattr" was skipped before
+                                * "sai_ls_all" was enabled, as above.
+                                */
+                               sai->sai_miss_hidden++;
+                               RETURN(-EAGAIN);
+                       }
+               }
+
+               entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
+               if (entry == NULL || only_unplug) {
+                       ll_sai_unplug(sai, entry);
+                       RETURN(entry ? 1 : -EAGAIN);
+               }
+
+               /* if statahead is busy in readdir, help it do post-work */
+               while (!ll_sa_entry_stated(entry) &&
+                      sai->sai_in_readpage &&
+                      !sa_received_empty(sai))
+                       ll_post_statahead(sai);
+
+               if (!ll_sa_entry_stated(entry)) {
+                       sai->sai_index_wait = entry->se_index;
+                       lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
+                                              LWI_ON_SIGNAL_NOOP, NULL);
+                       rc = l_wait_event(sai->sai_waitq,
+                                         ll_sa_entry_stated(entry) ||
+                                         thread_is_stopped(thread),
+                                         &lwi);
+                       if (rc < 0) {
+                               ll_sai_unplug(sai, entry);
+                               RETURN(-EAGAIN);
+                       }
+               }
+
+               if (entry->se_stat == SA_ENTRY_SUCC &&
+                   entry->se_inode != NULL) {
+                       struct inode *inode = entry->se_inode;
+                       struct lookup_intent it = { .it_op = IT_GETATTR,
+                                                   .d.lustre.it_lock_handle =
+                                                    entry->se_handle };
+                       __u64 bits;
+
+                       rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
+                                               ll_inode2fid(inode), &bits);
+                       if (rc == 1) {
+                               if ((*dentryp)->d_inode == NULL) {
+                                       *dentryp = ll_splice_alias(inode,
+                                                                  *dentryp);
+                               } else if ((*dentryp)->d_inode != inode) {
+                                       /* revalidate, but inode is recreated */
+                                       CDEBUG(D_READA,
+                                             "stale dentry %.*s inode %lu/%u, "
+                                             "statahead inode %lu/%u\n",
+                                             (*dentryp)->d_name.len,
+                                             (*dentryp)->d_name.name,
+                                             (*dentryp)->d_inode->i_ino,
+                                             (*dentryp)->d_inode->i_generation,
+                                             inode->i_ino,
+                                             inode->i_generation);
+                                       ll_sai_unplug(sai, entry);
+                                       RETURN(-ESTALE);
+                               } else {
+                                       iput(inode);
+                               }
+                               entry->se_inode = NULL;
+
+                               if ((bits & MDS_INODELOCK_LOOKUP) &&
+                                   d_lustre_invalid(*dentryp))
+                                       d_lustre_revalidate(*dentryp);
+                               ll_intent_release(&it);
+                       }
+               }
+
+               ll_sai_unplug(sai, entry);
+               RETURN(rc);
+       }
+
+       /* I am the "lli_opendir_pid" owner; only I can set "lli_sai". */
+       rc = is_first_dirent(dir, *dentryp);
+       if (rc == LS_NONE_FIRST_DE)
+               /* This is not an "ls -{a}l" operation, no need to statahead. */
+               GOTO(out, rc = -EAGAIN);
+
+       sai = ll_sai_alloc();
+       if (sai == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
+       sai->sai_inode = igrab(dir);
+       if (unlikely(sai->sai_inode == NULL)) {
+               CWARN("Do not start stat ahead on dying inode "DFID"\n",
+                     PFID(&lli->lli_fid));
+               GOTO(out, rc = -ESTALE);
+       }
+
+       /* Take a parent reference here; it is put in ll_statahead_thread(). */
+       parent = dget((*dentryp)->d_parent);
+       if (unlikely(sai->sai_inode != parent->d_inode)) {
+               struct ll_inode_info *nlli = ll_i2info(parent->d_inode);
+
+               CWARN("Race condition, someone changed %.*s just now: "
+                     "old parent "DFID", new parent "DFID"\n",
+                     (*dentryp)->d_name.len, (*dentryp)->d_name.name,
+                     PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
+               dput(parent);
+               iput(sai->sai_inode);
+               GOTO(out, rc = -EAGAIN);
+       }
+
+       CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n",
+              current_pid(), parent->d_name.len, parent->d_name.name);
+
+       lli->lli_sai = sai;
+
+       plli = ll_i2info(parent->d_inode);
+       rc = PTR_ERR(kthread_run(ll_statahead_thread, parent,
+                                "ll_sa_%u", plli->lli_opendir_pid));
+       thread = &sai->sai_thread;
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("can't start ll_sa thread, rc: %d\n", rc);
+               dput(parent);
+               lli->lli_opendir_key = NULL;
+               thread_set_flags(thread, SVC_STOPPED);
+               thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
+               ll_sai_put(sai);
+               LASSERT(lli->lli_sai == NULL);
+               RETURN(-EAGAIN);
+       }
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+
+       /*
+        * We don't stat-ahead for the first dirent since we are already in
+        * lookup.
+        */
+       RETURN(-EAGAIN);
+
+out:
+       if (sai != NULL)
+               OBD_FREE_PTR(sai);
+       spin_lock(&lli->lli_sa_lock);
+       lli->lli_opendir_key = NULL;
+       lli->lli_opendir_pid = 0;
+       spin_unlock(&lli->lli_sa_lock);
+       return rc;
+}
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
new file mode 100644 (file)
index 0000000..82c14a9
--- /dev/null
@@ -0,0 +1,226 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <lustre_lite.h>
+#include <lustre_ha.h>
+#include <lustre_dlm.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <lprocfs_status.h>
+#include "llite_internal.h"
+
+static struct kmem_cache *ll_inode_cachep;
+
+static struct inode *ll_alloc_inode(struct super_block *sb)
+{
+       struct ll_inode_info *lli;
+       ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1);
+       OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, __GFP_IO);
+       if (lli == NULL)
+               return NULL;
+
+       inode_init_once(&lli->lli_vfs_inode);
+       return &lli->lli_vfs_inode;
+}
+
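+/* Free the inode slab object only after an RCU grace period has passed. */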
+static void ll_inode_destroy_callback(struct rcu_head *head)
+{
+       struct inode *inode = container_of(head, struct inode, i_rcu);
+       struct ll_inode_info *ptr = ll_i2info(inode);
+       OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep);
+}
+
+static void ll_destroy_inode(struct inode *inode)
+{
+       call_rcu(&inode->i_rcu, ll_inode_destroy_callback);
+}
+
+int ll_init_inodecache(void)
+{
+       ll_inode_cachep = kmem_cache_create("lustre_inode_cache",
+                                              sizeof(struct ll_inode_info),
+                                              0, SLAB_HWCACHE_ALIGN, NULL);
+       if (ll_inode_cachep == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+void ll_destroy_inodecache(void)
+{
+       kmem_cache_destroy(ll_inode_cachep);
+}
+
+/* exported operations */
+struct super_operations lustre_super_operations =
+{
+       .alloc_inode   = ll_alloc_inode,
+       .destroy_inode = ll_destroy_inode,
+       .evict_inode   = ll_delete_inode,
+       .put_super     = ll_put_super,
+       .statfs = ll_statfs,
+       .umount_begin  = ll_umount_begin,
+       .remount_fs    = ll_remount_fs,
+       .show_options  = ll_show_options,
+};
+MODULE_ALIAS_FS("lustre");
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
+
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
+static int __init init_lustre_lite(void)
+{
+       int i, rc, seed[2];
+       struct timeval tv;
+       lnet_process_id_t lnet_id;
+
+       CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+
+       /* print an address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre client module (%p).\n",
+              &lustre_super_operations);
+
+       rc = ll_init_inodecache();
+       if (rc)
+               return -ENOMEM;
+       ll_file_data_slab = kmem_cache_create("ll_file_data",
+                                                sizeof(struct ll_file_data), 0,
+                                                SLAB_HWCACHE_ALIGN, NULL);
+       if (ll_file_data_slab == NULL) {
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache",
+                                                 sizeof(struct ll_remote_perm),
+                                                     0, 0, NULL);
+       if (ll_remote_perm_cachep == NULL) {
+               kmem_cache_destroy(ll_file_data_slab);
+               ll_file_data_slab = NULL;
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache",
+                                                  REMOTE_PERM_HASHSIZE *
+                                                  sizeof(struct list_head),
+                                                  0, 0, NULL);
+       if (ll_rmtperm_hash_cachep == NULL) {
+               kmem_cache_destroy(ll_remote_perm_cachep);
+               ll_remote_perm_cachep = NULL;
+               kmem_cache_destroy(ll_file_data_slab);
+               ll_file_data_slab = NULL;
+               ll_destroy_inodecache();
+               return -ENOMEM;
+       }
+
+       proc_lustre_fs_root = proc_lustre_root ?
+                             lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL;
+
+       lustre_register_client_fill_super(ll_fill_super);
+       lustre_register_kill_super_cb(ll_kill_super);
+
+       lustre_register_client_process_config(ll_process_config);
+
+       cfs_get_random_bytes(seed, sizeof(seed));
+
+       /* Nodes with small feet have little entropy;
+        * the NID for this node gives the most entropy in the low bits. */
+       for (i = 0; ; i++) {
+               if (LNetGetId(i, &lnet_id) == -ENOENT)
+                       break;
+
+               if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND)
+                       seed[0] ^= LNET_NIDADDR(lnet_id.nid);
+       }
+
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+       init_timer(&ll_capa_timer);
+       ll_capa_timer.function = ll_capa_timer_callback;
+       rc = ll_capa_thread_start();
+       /*
+        * XXX normal cleanup is needed here.
+        */
+       if (rc == 0)
+               rc = vvp_global_init();
+
+       return rc;
+}
+
+static void __exit exit_lustre_lite(void)
+{
+       vvp_global_fini();
+       del_timer(&ll_capa_timer);
+       ll_capa_thread_stop();
+       LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
+                "client remaining capa count %d\n",
+                capa_count[CAPA_SITE_CLIENT]);
+
+       lustre_register_client_fill_super(NULL);
+       lustre_register_kill_super_cb(NULL);
+
+       lustre_register_client_process_config(NULL);
+
+       ll_destroy_inodecache();
+
+       kmem_cache_destroy(ll_rmtperm_hash_cachep);
+       ll_rmtperm_hash_cachep = NULL;
+
+       kmem_cache_destroy(ll_remote_perm_cachep);
+       ll_remote_perm_cachep = NULL;
+
+       kmem_cache_destroy(ll_file_data_slab);
+       if (proc_lustre_fs_root)
+               lprocfs_remove(&proc_lustre_fs_root);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Lite Client File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_lustre_lite);
+module_exit(exit_lustre_lite);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
new file mode 100644 (file)
index 0000000..5260e98
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/version.h>
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <lustre_lite.h>
+#include "llite_internal.h"
+
+static int ll_readlink_internal(struct inode *inode,
+                               struct ptlrpc_request **request, char **symname)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       int rc, symlen = i_size_read(inode) + 1;
+       struct mdt_body *body;
+       struct md_op_data *op_data;
+       ENTRY;
+
+       *request = NULL;
+
+       if (lli->lli_symlink_name) {
+               int print_limit = min_t(int, PAGE_SIZE - 128, symlen);
+
+               *symname = lli->lli_symlink_name;
+               /* If the total CDEBUG() size is larger than a page, it
+                * will print a warning to the console, avoid this by
+                * printing just the last part of the symlink. */
+               CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n",
+                      print_limit < symlen ? "..." : "", print_limit,
+                      (*symname) + symlen - print_limit, symlen);
+               RETURN(0);
+       }
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       op_data->op_valid = OBD_MD_LINKNAME;
+       rc = md_getattr(sbi->ll_md_exp, op_data, request);
+       ll_finish_md_op_data(op_data);
+       if (rc) {
+               if (rc != -ENOENT)
+                       CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
+               GOTO(failed, rc);
+       }
+
+       body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+       if ((body->valid & OBD_MD_LINKNAME) == 0) {
+               CERROR("OBD_MD_LINKNAME not set on reply\n");
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       LASSERT(symlen != 0);
+       if (body->eadatasize != symlen) {
+               CERROR("inode %lu: symlink length %d not expected %d\n",
+                       inode->i_ino, body->eadatasize - 1, symlen - 1);
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD);
+       if (*symname == NULL ||
+           strnlen(*symname, symlen) != symlen - 1) {
+               /* not full/NULL terminated */
+               CERROR("inode %lu: symlink not NULL terminated string"
+                       "of length %d\n", inode->i_ino, symlen - 1);
+               GOTO(failed, rc = -EPROTO);
+       }
+
+       OBD_ALLOC(lli->lli_symlink_name, symlen);
+       /* do not return an error if we cannot cache the symlink locally */
+       if (lli->lli_symlink_name) {
+               memcpy(lli->lli_symlink_name, *symname, symlen);
+               *symname = lli->lli_symlink_name;
+       }
+       RETURN(0);
+
+failed:
+       RETURN(rc);
+}
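+
+/*
+ * On success, *symname points either at the cached copy in
+ * lli_symlink_name or into the reply buffer of *request; callers must
+ * therefore keep the request alive until they are done with the name.
+ */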
+
+static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *request;
+       char *symname;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op\n");
+
+       ll_inode_size_lock(inode);
+       rc = ll_readlink_internal(inode, &request, &symname);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = vfs_readlink(dentry, buffer, buflen, symname);
+ out:
+       ptlrpc_req_finished(request);
+       ll_inode_size_unlock(inode);
+       RETURN(rc);
+}
+
+static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+       struct inode *inode = dentry->d_inode;
+       struct ptlrpc_request *request = NULL;
+       int rc;
+       char *symname;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op\n");
+       /* Limit the recursive symlink depth to 5 instead of default
+        * 8 links when kernel has 4k stack to prevent stack overflow.
+        * For 8k stacks we need to limit it to 7 for local servers. */
+       if (THREAD_SIZE < 8192 && current->link_count >= 6) {
+               rc = -ELOOP;
+       } else if (THREAD_SIZE == 8192 && current->link_count >= 8) {
+               rc = -ELOOP;
+       } else {
+               ll_inode_size_lock(inode);
+               rc = ll_readlink_internal(inode, &request, &symname);
+               ll_inode_size_unlock(inode);
+       }
+       if (rc) {
+               ptlrpc_req_finished(request);
+               request = NULL;
+               symname = ERR_PTR(rc);
+       }
+
+       nd_set_link(nd, symname);
+       /* symname may contain a pointer to the request message buffer,
+        * we delay request releasing until ll_put_link then.
+        */
+       RETURN(request);
+}
+
+static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+       ptlrpc_req_finished(cookie);
+}
+
+struct inode_operations ll_fast_symlink_inode_operations = {
+       .readlink       = ll_readlink,
+       .setattr        = ll_setattr,
+       .follow_link    = ll_follow_link,
+       .put_link       = ll_put_link,
+       .getattr        = ll_getattr,
+       .permission     = ll_inode_permission,
+       .setxattr       = ll_setxattr,
+       .getxattr       = ll_getxattr,
+       .listxattr      = ll_listxattr,
+       .removexattr    = ll_removexattr,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
new file mode 100644 (file)
index 0000000..9254b99
--- /dev/null
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical
+ * "llite_" (var. "ll_") prefix.
+ */
+
+struct kmem_cache *vvp_thread_kmem;
+static struct kmem_cache *vvp_session_kmem;
+static struct lu_kmem_descr vvp_caches[] = {
+       {
+               .ckd_cache = &vvp_thread_kmem,
+               .ckd_name  = "vvp_thread_kmem",
+               .ckd_size  = sizeof(struct vvp_thread_info),
+       },
+       {
+               .ckd_cache = &vvp_session_kmem,
+               .ckd_name  = "vvp_session_kmem",
+               .ckd_size  = sizeof(struct vvp_session)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+static void *vvp_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct vvp_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void vvp_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct vvp_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, vvp_thread_kmem);
+}
+
+static void *vvp_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct vvp_session *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+static void vvp_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct vvp_session *session = data;
+       OBD_SLAB_FREE_PTR(session, vvp_session_kmem);
+}
+
+struct lu_context_key vvp_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = vvp_key_init,
+       .lct_fini = vvp_key_fini
+};
+
+struct lu_context_key vvp_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = vvp_session_key_init,
+       .lct_fini = vvp_session_key_fini
+};
+
+/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+static const struct lu_device_operations vvp_lu_ops = {
+       .ldo_object_alloc      = vvp_object_alloc
+};
+
+static const struct cl_device_operations vvp_cl_ops = {
+       .cdo_req_init = ccc_req_init
+};
+
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+}
+
+static const struct lu_device_type_operations vvp_device_type_ops = {
+       .ldto_init = vvp_type_init,
+       .ldto_fini = vvp_type_fini,
+
+       .ldto_start = vvp_type_start,
+       .ldto_stop  = vvp_type_stop,
+
+       .ldto_device_alloc = vvp_device_alloc,
+       .ldto_device_free  = ccc_device_free,
+       .ldto_device_init  = ccc_device_init,
+       .ldto_device_fini  = ccc_device_fini
+};
+
+struct lu_device_type vvp_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_VVP_NAME,
+       .ldt_ops      = &vvp_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/**
+ * Initialize global vvp state: the slab caches listed in vvp_caches and
+ * the vvp cl_device type.
+ */
+int vvp_global_init(void)
+{
+       int result;
+
+       result = lu_kmem_init(vvp_caches);
+       if (result == 0) {
+               result = ccc_global_init(&vvp_device_type);
+               if (result != 0)
+                       lu_kmem_fini(vvp_caches);
+       }
+       return result;
+}
+
+void vvp_global_fini(void)
+{
+       ccc_global_fini(&vvp_device_type);
+       lu_kmem_fini(vvp_caches);
+}
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+int cl_sb_init(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+       struct cl_device  *cl;
+       struct lu_env     *env;
+       int rc = 0;
+       int refcheck;
+
+       sbi  = ll_s2sbi(sb);
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               cl = cl_type_setup(env, NULL, &vvp_device_type,
+                                  sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+               if (!IS_ERR(cl)) {
+                       cl2ccc_dev(cl)->cdv_sb = sb;
+                       sbi->ll_cl = cl;
+                       sbi->ll_site = cl2lu_dev(cl)->ld_site;
+               }
+               cl_env_put(env, &refcheck);
+       } else
+               rc = PTR_ERR(env);
+       RETURN(rc);
+}
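+
+/*
+ * cl_sb_init() thus stacks a vvp cl_device on top of the data export's
+ * obd device and caches it in ll_sb_info; cl_sb_fini() below tears that
+ * stack down again.
+ */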
+
+int cl_sb_fini(struct super_block *sb)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       struct cl_device  *cld;
+       int             refcheck;
+       int             result;
+
+       ENTRY;
+       sbi = ll_s2sbi(sb);
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               cld = sbi->ll_cl;
+
+               if (cld != NULL) {
+                       cl_stack_fini(env, cld);
+                       sbi->ll_cl = NULL;
+                       sbi->ll_site = NULL;
+               }
+               cl_env_put(env, &refcheck);
+               result = 0;
+       } else {
+               CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+               result = PTR_ERR(env);
+       }
+       /*
+        * If the mount failed (sbi->ll_cl == NULL) and there are no other
+        * mounts, stop the device types manually (this usually happens
+        * automatically when the last device is destroyed).
+        */
+       lu_types_stop();
+       RETURN(result);
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent the contents of a page cache as a byte stream, the
+ * following information is encoded in the 64-bit offset:
+ *
+ *       - file hash bucket in lu_site::ls_hash[]       28 bits
+ *
+ *       - how far the file is from the bucket head      4 bits
+ *
+ *       - page index                                   32 bits
+ *
+ * The first two fields identify a file in the cache uniquely.
+ */
+
+#define PGC_OBJ_SHIFT (32 + 4)
+#define PGC_DEPTH_SHIFT (32)
+
+struct vvp_pgcache_id {
+       unsigned                 vpi_bucket;
+       unsigned                 vpi_depth;
+       uint32_t                 vpi_index;
+
+       unsigned                 vpi_curdep;
+       struct lu_object_header *vpi_obj;
+};
+
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+       CLASSERT(sizeof(pos) == sizeof(__u64));
+
+       id->vpi_index  = pos & 0xffffffff;
+       id->vpi_depth  = (pos >> PGC_DEPTH_SHIFT) & 0xf;
+       id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT);
+}
+
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+       return
+               ((__u64)id->vpi_index) |
+               ((__u64)id->vpi_depth  << PGC_DEPTH_SHIFT) |
+               ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT);
+}
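+
+/*
+ * A worked example (values are illustrative): for id = { .vpi_bucket = 3,
+ * .vpi_depth = 2, .vpi_index = 0x10 }, vvp_pgcache_id_pack() yields
+ * pos = (3ULL << 36) | (2ULL << 32) | 0x10 = 0x3200000010, and
+ * vvp_pgcache_id_unpack(0x3200000010, &id) recovers the same fields.
+ */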
+
+static int vvp_pgcache_obj_get(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                              struct hlist_node *hnode, void *data)
+{
+       struct vvp_pgcache_id   *id  = data;
+       struct lu_object_header *hdr = cfs_hash_object(hs, hnode);
+
+       if (id->vpi_curdep-- > 0)
+               return 0; /* continue */
+
+       if (lu_object_is_dying(hdr))
+               return 1;
+
+       cfs_hash_get(hs, hnode);
+       id->vpi_obj = hdr;
+       return 1;
+}
+
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+                                        struct lu_device *dev,
+                                        struct vvp_pgcache_id *id)
+{
+       LASSERT(lu_device_is_cl(dev));
+
+       id->vpi_depth &= 0xf;
+       id->vpi_obj    = NULL;
+       id->vpi_curdep = id->vpi_depth;
+
+       cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket,
+                               vvp_pgcache_obj_get, id);
+       if (id->vpi_obj != NULL) {
+               struct lu_object *lu_obj;
+
+               lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type);
+               if (lu_obj != NULL) {
+                       lu_object_ref_add(lu_obj, "dump", current);
+                       return lu2cl(lu_obj);
+               }
+               lu_object_put(env, lu_object_top(id->vpi_obj));
+
+       } else if (id->vpi_curdep > 0) {
+               id->vpi_depth = 0xf;
+       }
+       return NULL;
+}
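+
+/*
+ * Note that within a bucket, vpi_depth selects how far down the hash chain
+ * vvp_pgcache_obj_get() walks: depth 0 takes the first live object, depth 1
+ * the second, and so on, so the (bucket, depth) pair names a file uniquely
+ * as long as the chain is stable.
+ */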
+
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+                              struct lu_device *dev, loff_t pos)
+{
+       struct cl_object     *clob;
+       struct lu_site       *site;
+       struct vvp_pgcache_id id;
+
+       site = dev->ld_site;
+       vvp_pgcache_id_unpack(pos, &id);
+
+       while (1) {
+               if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
+                       return ~0ULL;
+               clob = vvp_pgcache_obj(env, dev, &id);
+               if (clob != NULL) {
+                       struct cl_object_header *hdr;
+                       int                   nr;
+                       struct cl_page    *pg;
+
+                       /* got an object. Find next page. */
+                       hdr = cl_object_header(clob);
+
+                       spin_lock(&hdr->coh_page_guard);
+                       nr = radix_tree_gang_lookup(&hdr->coh_tree,
+                                                   (void **)&pg,
+                                                   id.vpi_index, 1);
+                       if (nr > 0) {
+                               id.vpi_index = pg->cp_index;
+                               /* Can't support files over 16TB */
+                               nr = !(pg->cp_index > 0xffffffff);
+                       }
+                       spin_unlock(&hdr->coh_page_guard);
+
+                       lu_object_ref_del(&clob->co_lu, "dump", current);
+                       cl_object_put(env, clob);
+                       if (nr > 0)
+                               return vvp_pgcache_id_pack(&id);
+               }
+               /* to the next object. */
+               ++id.vpi_depth;
+               id.vpi_depth &= 0xf;
+               if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+                       return ~0ULL;
+               id.vpi_index = 0;
+       }
+}
+
+#define seq_page_flag(seq, page, flag, has_flags) do {           \
+       if (test_bit(PG_##flag, &(page)->flags)) {                \
+               seq_printf(seq, "%s"#flag, has_flags ? "|" : "");       \
+               has_flags = 1;                                    \
+       }                                                              \
+} while (0)
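+
+/*
+ * e.g. a locked, uptodate, dirty vmpage is rendered by the calls below as
+ * "locked|uptodate|dirty".
+ */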
+
+static void vvp_pgcache_page_show(const struct lu_env *env,
+                                 struct seq_file *seq, struct cl_page *page)
+{
+       struct ccc_page *cpg;
+       struct page      *vmpage;
+       int           has_flags;
+
+       cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+       vmpage = cpg->cpg_page;
+       seq_printf(seq," %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+                  0 /* gen */,
+                  cpg, page,
+                  "none",
+                  cpg->cpg_write_queued ? "wq" : "- ",
+                  cpg->cpg_defer_uptodate ? "du" : "- ",
+                  PageWriteback(vmpage) ? "wb" : "-",
+                  vmpage, vmpage->mapping->host->i_ino,
+                  vmpage->mapping->host->i_generation,
+                  vmpage->mapping->host, vmpage->index,
+                  page_count(vmpage));
+       has_flags = 0;
+       seq_page_flag(seq, vmpage, locked, has_flags);
+       seq_page_flag(seq, vmpage, error, has_flags);
+       seq_page_flag(seq, vmpage, referenced, has_flags);
+       seq_page_flag(seq, vmpage, uptodate, has_flags);
+       seq_page_flag(seq, vmpage, dirty, has_flags);
+       seq_page_flag(seq, vmpage, writeback, has_flags);
+       seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+       loff_t             pos;
+       struct ll_sb_info       *sbi;
+       struct cl_object        *clob;
+       struct lu_env      *env;
+       struct cl_page    *page;
+       struct cl_object_header *hdr;
+       struct vvp_pgcache_id    id;
+       int                   refcheck;
+       int                   result;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               pos = *(loff_t *) v;
+               vvp_pgcache_id_unpack(pos, &id);
+               sbi = f->private;
+               clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+               if (clob != NULL) {
+                       hdr = cl_object_header(clob);
+
+                       spin_lock(&hdr->coh_page_guard);
+                       page = cl_page_lookup(hdr, id.vpi_index);
+                       spin_unlock(&hdr->coh_page_guard);
+
+                       seq_printf(f, "%8x@"DFID": ",
+                                  id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+                       if (page != NULL) {
+                               vvp_pgcache_page_show(env, f, page);
+                               cl_page_put(env, page);
+                       } else
+                               seq_puts(f, "missing\n");
+                       lu_object_ref_del(&clob->co_lu, "dump", current);
+                       cl_object_put(env, clob);
+               } else
+                       seq_printf(f, "%llx missing\n", pos);
+               cl_env_put(env, &refcheck);
+               result = 0;
+       } else
+               result = PTR_ERR(env);
+       return result;
+}
+
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       int             refcheck;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               sbi = f->private;
+               if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT)
+                       pos = ERR_PTR(-EFBIG);
+               else {
+                       *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev,
+                                               *pos);
+                       if (*pos == ~0ULL)
+                               pos = NULL;
+               }
+               cl_env_put(env, &refcheck);
+       }
+       return pos;
+}
+
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+       struct ll_sb_info *sbi;
+       struct lu_env     *env;
+       int             refcheck;
+
+       env = cl_env_get(&refcheck);
+       if (!IS_ERR(env)) {
+               sbi = f->private;
+               *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+               if (*pos == ~0ULL)
+                       pos = NULL;
+               cl_env_put(env, &refcheck);
+       }
+       return pos;
+}
+
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+       /* Nothing to do */
+}
+
+static struct seq_operations vvp_pgcache_ops = {
+       .start = vvp_pgcache_start,
+       .next  = vvp_pgcache_next,
+       .stop  = vvp_pgcache_stop,
+       .show  = vvp_pgcache_show
+};
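+
+/*
+ * The seq_file position doubles as a packed vvp_pgcache_id: ->start() and
+ * ->next() advance it via vvp_pgcache_find(), while ->show() unpacks it to
+ * look up and print a single cached page.
+ */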
+
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+       struct ll_sb_info     *sbi = PDE_DATA(inode);
+       struct seq_file       *seq;
+       int                 result;
+
+       result = seq_open(filp, &vvp_pgcache_ops);
+       if (result == 0) {
+               seq = filp->private_data;
+               seq->private = sbi;
+       }
+       return result;
+}
+
+struct file_operations vvp_dump_pgcache_file_ops = {
+       .owner   = THIS_MODULE,
+       .open    = vvp_dump_pgcache_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
new file mode 100644 (file)
index 0000000..c82bf17
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+#include <cl_object.h>
+#include "llite_internal.h"
+
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_io *io);
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io);
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_page *page, struct page *vmpage);
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev);
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+extern struct kmem_cache *vvp_thread_kmem;
+
+#endif /* VVP_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
new file mode 100644 (file)
index 0000000..eb964ac
--- /dev/null
@@ -0,0 +1,1186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice);
+
+/**
+ * True if \a io is a normal io; false for sendfile() / splice_{read|write}.
+ */
+int cl_is_normalio(const struct lu_env *env, const struct cl_io *io)
+{
+       struct vvp_io *vio = vvp_env_io(env);
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       return vio->cui_io_subtype == IO_NORMAL;
+}
+
+/**
+ * For swapping layout. The file's layout may have changed.
+ * To avoid populating pages to a wrong stripe, we have to verify the
+ * correctness of layout. It works because swapping layout processes
+ * have to acquire group lock.
+ */
+static bool can_populate_pages(const struct lu_env *env, struct cl_io *io,
+                               struct inode *inode)
+{
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct ccc_io           *cio = ccc_env_io(env);
+       bool rc = true;
+
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               /* no lock is needed here to check lli_layout_gen, as we
+                * hold the extent lock and the GROUP lock must be held to
+                * swap the layout */
+               if (lli->lli_layout_gen != cio->cui_layout_gen) {
+                       io->ci_need_restart = 1;
+                       /* this will return a short read/write to the
+                        * application */
+                       io->ci_continue = 0;
+                       rc = false;
+               }
+               /* fall through */
+       case CIT_FAULT:
+               /* fault is okay because we've already had a page. */
+       default:
+               break;
+       }
+
+       return rc;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+                                 const struct cl_io_slice *ios)
+{
+       struct vvp_io *vio   = cl2vvp_io(env, ios);
+       struct inode  *inode = ccc_object_inode(ios->cis_obj);
+
+       LASSERT(inode ==
+               cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
+       vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+       return 0;
+}
+
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct cl_io     *io  = ios->cis_io;
+       struct cl_object *obj = io->ci_obj;
+       struct ccc_io    *cio = cl2ccc_io(env, ios);
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n",
+               io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen);
+
+       if (!io->ci_ignore_layout && io->ci_verify_layout) {
+               __u32 gen = 0;
+
+               /* check layout version */
+               ll_layout_refresh(ccc_object_inode(obj), &gen);
+               io->ci_need_restart = cio->cui_layout_gen != gen;
+               if (io->ci_need_restart)
+                       CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n",
+                               cio->cui_layout_gen, gen);
+       }
+}
+
+static void vvp_io_fault_fini(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_io   *io   = ios->cis_io;
+       struct cl_page *page = io->u.ci_fault.ft_page;
+
+       CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+
+       if (page != NULL) {
+               lu_ref_del(&page->cp_reference, "fault", io);
+               cl_page_put(env, page);
+               io->u.ci_fault.ft_page = NULL;
+       }
+       vvp_io_fini(env, ios);
+}
+
+enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+       /*
+        * we only want to hold PW locks if the mmap() can generate
+        * writes back to the file and that only happens in shared
+        * writable vmas
+        */
+       if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+               return CLM_WRITE;
+       return CLM_READ;
+}
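+
+/*
+ * For example, a MAP_SHARED mapping created with PROT_WRITE (so both
+ * VM_SHARED and VM_WRITE are set) gets CLM_WRITE, while private or
+ * read-only mappings fall back to CLM_READ.
+ */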
+
+static int vvp_mmap_locks(const struct lu_env *env,
+                         struct ccc_io *vio, struct cl_io *io)
+{
+       struct ccc_thread_info *cti = ccc_env_info(env);
+       struct mm_struct       *mm = current->mm;
+       struct vm_area_struct  *vma;
+       struct cl_lock_descr   *descr = &cti->cti_descr;
+       ldlm_policy_data_t      policy;
+       unsigned long      addr;
+       unsigned long      seg;
+       ssize_t          count;
+       int                  result;
+       ENTRY;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       if (!cl_is_normalio(env, io))
+               RETURN(0);
+
+       if (vio->cui_iov == NULL) /* nfs or loop back device write */
+               RETURN(0);
+
+       /* No MM (e.g. NFS)? No vmas too. */
+       if (mm == NULL)
+               RETURN(0);
+
+       for (seg = 0; seg < vio->cui_nrsegs; seg++) {
+               const struct iovec *iv = &vio->cui_iov[seg];
+
+               addr = (unsigned long)iv->iov_base;
+               count = iv->iov_len;
+               if (count == 0)
+                       continue;
+
+               count += addr & (~CFS_PAGE_MASK);
+               addr &= CFS_PAGE_MASK;
+
+               down_read(&mm->mmap_sem);
+               while ((vma = our_vma(mm, addr, count)) != NULL) {
+                       struct inode *inode = vma->vm_file->f_dentry->d_inode;
+                       int flags = CEF_MUST;
+
+                       if (ll_file_nolock(vma->vm_file)) {
+                               /*
+                                * For no lock case, a lockless lock will be
+                                * generated.
+                                */
+                               flags = CEF_NEVER;
+                       }
+
+                       /*
+                        * XXX: Required lock mode can be weakened: CIT_WRITE
+                        * io only ever reads user level buffer, and CIT_READ
+                        * only writes on it.
+                        */
+                       policy_from_vma(&policy, vma, addr, count);
+                       descr->cld_mode = vvp_mode_from_vma(vma);
+                       descr->cld_obj = ll_i2info(inode)->lli_clob;
+                       descr->cld_start = cl_index(descr->cld_obj,
+                                                   policy.l_extent.start);
+                       descr->cld_end = cl_index(descr->cld_obj,
+                                                 policy.l_extent.end);
+                       descr->cld_enq_flags = flags;
+                       result = cl_io_lock_alloc_add(env, io, descr);
+
+                       CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+                              descr->cld_mode, descr->cld_start,
+                              descr->cld_end);
+
+                       if (result < 0)
+                               RETURN(result);
+
+                       if (vma->vm_end - addr >= count)
+                               break;
+
+                       count -= vma->vm_end - addr;
+                       addr = vma->vm_end;
+               }
+               up_read(&mm->mmap_sem);
+       }
+       RETURN(0);
+}
+
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+                         enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+       struct ccc_io *cio = ccc_env_io(env);
+       int result;
+       int ast_flags = 0;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+       ENTRY;
+
+       ccc_io_update_iov(env, cio, io);
+
+       if (io->u.ci_rw.crw_nonblock)
+               ast_flags |= CEF_NONBLOCK;
+       result = vvp_mmap_locks(env, cio, io);
+       if (result == 0)
+               result = ccc_io_one_lock(env, io, ast_flags, mode, start, end);
+       RETURN(result);
+}
+
+static int vvp_io_read_lock(const struct lu_env *env,
+                           const struct cl_io_slice *ios)
+{
+       struct cl_io     *io  = ios->cis_io;
+       struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj));
+       int result;
+
+       ENTRY;
+       /* XXX: Layer violation, we shouldn't see lsm at llite level. */
+       if (lli->lli_has_smd) /* lsm-less file doesn't need to lock */
+               result = vvp_io_rw_lock(env, io, CLM_READ,
+                                       io->u.ci_rd.rd.crw_pos,
+                                       io->u.ci_rd.rd.crw_pos +
+                                       io->u.ci_rd.rd.crw_count - 1);
+       else
+               result = 0;
+       RETURN(result);
+}
+
+static int vvp_io_fault_lock(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct cl_io *io   = ios->cis_io;
+       struct vvp_io *vio = cl2vvp_io(env, ios);
+       /*
+        * XXX LDLM_FL_CBPENDING
+        */
+       return ccc_io_one_lock_index(env, io, 0,
+                                    vvp_mode_from_vma(vio->u.fault.ft_vma),
+                                    io->u.ci_fault.ft_index,
+                                    io->u.ci_fault.ft_index);
+}
+
+static int vvp_io_write_lock(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct cl_io *io = ios->cis_io;
+       loff_t start;
+       loff_t end;
+
+       if (io->u.ci_wr.wr_append) {
+               start = 0;
+               end   = OBD_OBJECT_EOF;
+       } else {
+               start = io->u.ci_wr.wr.crw_pos;
+               end   = start + io->u.ci_wr.wr.crw_count - 1;
+       }
+       return vvp_io_rw_lock(env, io, CLM_WRITE, start, end);
+}
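+
+/*
+ * Note that an append write locks the whole range [0, OBD_OBJECT_EOF]:
+ * the start offset is not known until i_size is sampled under the lock
+ * in vvp_io_write_start().
+ */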
+
+static int vvp_io_setattr_iter_init(const struct lu_env *env,
+                                   const struct cl_io_slice *ios)
+{
+       return 0;
+}
+
+/**
+ * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io.
+ *
+ * Handles "lockless io" mode when extent locking is done by server.
+ */
+static int vvp_io_setattr_lock(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct ccc_io *cio = ccc_env_io(env);
+       struct cl_io  *io  = ios->cis_io;
+       __u64 new_size;
+       __u32 enqflags = 0;
+
+       if (cl_io_is_trunc(io)) {
+               new_size = io->u.ci_setattr.sa_attr.lvb_size;
+               if (new_size == 0)
+                       enqflags = CEF_DISCARD_DATA;
+       } else {
+               if ((io->u.ci_setattr.sa_attr.lvb_mtime >=
+                    io->u.ci_setattr.sa_attr.lvb_ctime) ||
+                   (io->u.ci_setattr.sa_attr.lvb_atime >=
+                    io->u.ci_setattr.sa_attr.lvb_ctime))
+                       return 0;
+               new_size = 0;
+       }
+       cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK;
+       return ccc_io_one_lock(env, io, enqflags, CLM_WRITE,
+                              new_size, OBD_OBJECT_EOF);
+}
+
+static int vvp_do_vmtruncate(struct inode *inode, size_t size)
+{
+       int     result;
+       /*
+        * Only ll_inode_size_lock is taken at this level.
+        */
+       ll_inode_size_lock(inode);
+       result = inode_newsize_ok(inode, size);
+       if (result < 0) {
+               ll_inode_size_unlock(inode);
+               return result;
+       }
+       truncate_setsize(inode, size);
+       ll_inode_size_unlock(inode);
+       return result;
+}
+
+static int vvp_io_setattr_trunc(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               struct inode *inode, loff_t size)
+{
+       inode_dio_wait(inode);
+       return 0;
+}
+
+static int vvp_io_setattr_time(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct cl_io       *io    = ios->cis_io;
+       struct cl_object   *obj   = io->ci_obj;
+       struct cl_attr     *attr  = ccc_env_thread_attr(env);
+       int result;
+       unsigned valid = CAT_CTIME;
+
+       cl_object_attr_lock(obj);
+       attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+       if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) {
+               attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+               valid |= CAT_ATIME;
+       }
+       if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) {
+               attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+               valid |= CAT_MTIME;
+       }
+       result = cl_object_attr_set(env, obj, attr, valid);
+       cl_object_attr_unlock(obj);
+
+       return result;
+}
+
+static int vvp_io_setattr_start(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       struct cl_io    *io    = ios->cis_io;
+       struct inode    *inode = ccc_object_inode(io->ci_obj);
+
+       mutex_lock(&inode->i_mutex);
+       if (cl_io_is_trunc(io))
+               return vvp_io_setattr_trunc(env, ios, inode,
+                                           io->u.ci_setattr.sa_attr.lvb_size);
+       else
+               return vvp_io_setattr_time(env, ios);
+}
+
+static void vvp_io_setattr_end(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct cl_io *io    = ios->cis_io;
+       struct inode *inode = ccc_object_inode(io->ci_obj);
+
+       if (cl_io_is_trunc(io)) {
+               /* Truncate in memory pages - they must be clean pages
+                * because osc has already notified to destroy osc_extents. */
+               vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
+               inode_dio_write_done(inode);
+       }
+       mutex_unlock(&inode->i_mutex);
+}
+
+static void vvp_io_setattr_fini(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       vvp_io_fini(env, ios);
+}
+
+static ssize_t lustre_generic_file_read(struct file *file,
+                                       struct ccc_io *vio, loff_t *ppos)
+{
+       return generic_file_aio_read(vio->cui_iocb, vio->cui_iov,
+                                    vio->cui_nrsegs, *ppos);
+}
+
+static ssize_t lustre_generic_file_write(struct file *file,
+                                       struct ccc_io *vio, loff_t *ppos)
+{
+       return generic_file_aio_write(vio->cui_iocb, vio->cui_iov,
+                                     vio->cui_nrsegs, *ppos);
+}
+
+static int vvp_io_read_start(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct vvp_io     *vio   = cl2vvp_io(env, ios);
+       struct ccc_io     *cio   = cl2ccc_io(env, ios);
+       struct cl_io      *io    = ios->cis_io;
+       struct cl_object  *obj   = io->ci_obj;
+       struct inode      *inode = ccc_object_inode(obj);
+       struct ll_ra_read *bead  = &vio->cui_bead;
+       struct file       *file  = cio->cui_fd->fd_file;
+
+       int     result;
+       loff_t  pos = io->u.ci_rd.rd.crw_pos;
+       long    cnt = io->u.ci_rd.rd.crw_count;
+       long    tot = cio->cui_tot_count;
+       int     exceed = 0;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+       if (!can_populate_pages(env, io, inode))
+               return 0;
+
+       result = ccc_prep_size(env, obj, io, pos, tot, &exceed);
+       if (result != 0)
+               return result;
+       else if (exceed != 0)
+               goto out;
+
+       LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+                       "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+                       inode->i_ino, cnt, pos, i_size_read(inode));
+
+       /* turn off the kernel's read-ahead */
+       cio->cui_fd->fd_file->f_ra.ra_pages = 0;
+
+       /* initialize read-ahead window once per syscall */
+       if (!vio->cui_ra_window_set) {
+               vio->cui_ra_window_set = 1;
+               bead->lrr_start = cl_index(obj, pos);
+               /*
+                * XXX: explicit PAGE_CACHE_SIZE
+                */
+               bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+               ll_ra_read_in(file, bead);
+       }
+
+       /* BUG: 5972 */
+       file_accessed(file);
+       switch (vio->cui_io_subtype) {
+       case IO_NORMAL:
+               result = lustre_generic_file_read(file, cio, &pos);
+               break;
+       case IO_SPLICE:
+               result = generic_file_splice_read(file, &pos,
+                               vio->u.splice.cui_pipe, cnt,
+                               vio->u.splice.cui_flags);
+               /* LU-1109: do splice read stripe by stripe, otherwise it
+                * may make nfsd stuck if this read occupies all internal
+                * pipe buffers. */
+               io->ci_continue = 0;
+               break;
+       default:
+               CERROR("Wrong IO type %u\n", vio->cui_io_subtype);
+               LBUG();
+       }
+
+out:
+       if (result >= 0) {
+               if (result < cnt)
+                       io->ci_continue = 0;
+               io->ci_nob += result;
+               ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                 cio->cui_fd, pos, result, 0);
+               result = 0;
+       }
+       return result;
+}
+
+static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct vvp_io *vio = cl2vvp_io(env, ios);
+       struct ccc_io *cio = cl2ccc_io(env, ios);
+
+       if (vio->cui_ra_window_set)
+               ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead);
+
+       vvp_io_fini(env, ios);
+}
+
+static int vvp_io_write_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct ccc_io      *cio   = cl2ccc_io(env, ios);
+       struct cl_io       *io    = ios->cis_io;
+       struct cl_object   *obj   = io->ci_obj;
+       struct inode       *inode = ccc_object_inode(obj);
+       struct file     *file  = cio->cui_fd->fd_file;
+       ssize_t result = 0;
+       loff_t pos = io->u.ci_wr.wr.crw_pos;
+       size_t cnt = io->u.ci_wr.wr.crw_count;
+
+       ENTRY;
+
+       if (!can_populate_pages(env, io, inode))
+               return 0;
+
+       if (cl_io_is_append(io)) {
+               /*
+                * PARALLEL IO: this has to be changed for parallel IO
+                * doing out-of-order writes.
+                */
+               pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+               cio->cui_iocb->ki_pos = pos;
+       }
+
+       CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
+
+       if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */
+               result = 0;
+       else
+               result = lustre_generic_file_write(file, cio, &pos);
+
+       if (result > 0) {
+               if (result < cnt)
+                       io->ci_continue = 0;
+               io->ci_nob += result;
+               ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                 cio->cui_fd, pos, result, 0);
+               result = 0;
+       }
+       RETURN(result);
+}
+
+static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
+{
+       struct vm_fault *vmf = cfio->fault.ft_vmf;
+
+       cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf);
+
+       if (vmf->page) {
+               LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n",
+                              vmf->virtual_address);
+               if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) {
+                       lock_page(vmf->page);
+                       cfio->fault.ft_flags &= VM_FAULT_LOCKED;
+               }
+
+               cfio->ft_vmpage = vmf->page;
+               return 0;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) {
+               CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address);
+               return -EFAULT;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_OOM) {
+               CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address);
+               return -ENOMEM;
+       }
+
+       if (cfio->fault.ft_flags & VM_FAULT_RETRY)
+               return -EAGAIN;
+
+       CERROR("unknow error in page fault %d!\n", cfio->fault.ft_flags);
+       return -EINVAL;
+}
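+
+/*
+ * In short, vvp_io_kernel_fault() maps filemap_fault()'s VM_FAULT_* result
+ * onto errno values: SIGBUS -> -EFAULT, OOM -> -ENOMEM, RETRY -> -EAGAIN,
+ * and returns 0 with a locked page on success.
+ */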
+
+static int vvp_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct vvp_io       *vio     = cl2vvp_io(env, ios);
+       struct cl_io    *io      = ios->cis_io;
+       struct cl_object    *obj     = io->ci_obj;
+       struct inode    *inode   = ccc_object_inode(obj);
+       struct cl_fault_io  *fio     = &io->u.ci_fault;
+       struct vvp_fault_io *cfio    = &vio->u.fault;
+       loff_t         offset;
+       int               result  = 0;
+       struct page       *vmpage  = NULL;
+       struct cl_page      *page;
+       loff_t         size;
+       pgoff_t       last; /* last page in a file data region */
+
+       if (fio->ft_executable &&
+           LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+               CWARN("binary "DFID
+                     " changed while waiting for the page fault lock\n",
+                     PFID(lu_object_fid(&obj->co_lu)));
+
+       /* offset of the last byte on the page */
+       offset = cl_offset(obj, fio->ft_index + 1) - 1;
+       LASSERT(cl_index(obj, offset) == fio->ft_index);
+       result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL);
+       if (result != 0)
+               return result;
+
+       /* must return locked page */
+       if (fio->ft_mkwrite) {
+               LASSERT(cfio->ft_vmpage != NULL);
+               lock_page(cfio->ft_vmpage);
+       } else {
+               result = vvp_io_kernel_fault(cfio);
+               if (result != 0)
+                       return result;
+       }
+
+       vmpage = cfio->ft_vmpage;
+       LASSERT(PageLocked(vmpage));
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE))
+               ll_invalidate_page(vmpage);
+
+       size = i_size_read(inode);
+       /* Though we have already held a cl_lock upon this page, it
+        * can still be truncated locally. */
+       if (unlikely((vmpage->mapping != inode->i_mapping) ||
+                    (page_offset(vmpage) > size))) {
+               CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n");
+
+               /* return +1 to stop cl_io_loop() and ll_fault() will catch
+                * and retry. */
+               GOTO(out, result = +1);
+       }
+
+       if (fio->ft_mkwrite) {
+               pgoff_t last_index;
+               /*
+                * Capture the size while holding the lli_trunc_sem from above
+                * we want to make sure that we complete the mkwrite action
+                * while holding this lock. We need to make sure that we are
+                * not past the end of the file.
+                */
+               last_index = cl_index(obj, size - 1);
+               if (last_index < fio->ft_index) {
+                       CDEBUG(D_PAGE,
+                               "llite: mkwrite and truncate race happened: "
+                               "%p: 0x%lx 0x%lx\n",
+                               vmpage->mapping, fio->ft_index, last_index);
+                       /*
+                        * We need to return if we are past the end of the
+                        * file. This will propagate up the call stack to
+                        * ll_page_mkwrite, where we will return
+                        * VM_FAULT_NOPAGE. Any non-negative value returned
+                        * here will be silently converted to 0. If
+                        * vmpage->mapping is NULL, the error code is
+                        * converted back to ENODATA in ll_page_mkwrite0.
+                        * Thus we return -ENODATA to handle both cases.
+                        */
+                       GOTO(out, result = -ENODATA);
+               }
+       }
+
+       page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE);
+       if (IS_ERR(page))
+               GOTO(out, result = PTR_ERR(page));
+
+       /* if page is going to be written, we should add this page into cache
+        * earlier. */
+       if (fio->ft_mkwrite) {
+               wait_on_page_writeback(vmpage);
+               if (set_page_dirty(vmpage)) {
+                       struct ccc_page *cp;
+
+                       /* vvp_page_assume() calls wait_on_page_writeback(). */
+                       cl_page_assume(env, io, page);
+
+                       cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+                       vvp_write_pending(cl2ccc(obj), cp);
+
+                       /* Do not set Dirty bit here so that in case IO is
+                        * started before the page is really made dirty, we
+                        * still have chance to detect it. */
+                       result = cl_page_cache_add(env, io, page, CRT_WRITE);
+                       LASSERT(cl_page_is_owned(page, io));
+
+                       vmpage = NULL;
+                       if (result < 0) {
+                               cl_page_unmap(env, io, page);
+                               cl_page_discard(env, io, page);
+                               cl_page_disown(env, io, page);
+
+                               cl_page_put(env, page);
+
+                               /* we're in big trouble, what can we do now? */
+                               if (result == -EDQUOT)
+                                       result = -ENOSPC;
+                               GOTO(out, result);
+                       } else
+                               cl_page_disown(env, io, page);
+               }
+       }
+
+       last = cl_index(obj, size - 1);
+       /*
+        * The ft_index is only used in the case of a mkwrite action.
+        * We need to check that our assertions are correct, since we
+        * should have caught this above.
+        */
+       LASSERT(!fio->ft_mkwrite || fio->ft_index <= last);
+       if (fio->ft_index == last)
+               /*
+                * Last page is mapped partially.
+                */
+               fio->ft_nob = size - cl_offset(obj, fio->ft_index);
+       else
+               fio->ft_nob = cl_page_size(obj);
+
+       lu_ref_add(&page->cp_reference, "fault", io);
+       fio->ft_page = page;
+       EXIT;
+
+out:
+       /* return unlocked vmpage to avoid deadlocking */
+       if (vmpage != NULL)
+               unlock_page(vmpage);
+       cfio->fault.ft_flags &= ~VM_FAULT_LOCKED;
+       return result;
+}
+
+static int vvp_io_fsync_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       /* we should mark TOWRITE bit to each dirty page in radix tree to
+        * verify pages have been written, but this is difficult because of
+        * race. */
+       return 0;
+}
+
+static int vvp_io_read_page(const struct lu_env *env,
+                           const struct cl_io_slice *ios,
+                           const struct cl_page_slice *slice)
+{
+       struct cl_io          *io     = ios->cis_io;
+       struct cl_object          *obj    = slice->cpl_obj;
+       struct ccc_page    *cp     = cl2ccc_page(slice);
+       struct cl_page      *page   = slice->cpl_page;
+       struct inode          *inode  = ccc_object_inode(obj);
+       struct ll_sb_info        *sbi    = ll_i2sbi(inode);
+       struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
+       struct ll_readahead_state *ras    = &fd->fd_ras;
+       struct page             *vmpage = cp->cpg_page;
+       struct cl_2queue          *queue  = &io->ci_queue;
+       int rc;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       LASSERT(slice->cpl_obj == obj);
+
+       ENTRY;
+
+       if (sbi->ll_ra_info.ra_max_pages_per_file &&
+           sbi->ll_ra_info.ra_max_pages)
+               ras_update(sbi, inode, ras, page->cp_index,
+                          cp->cpg_defer_uptodate);
+
+       /* Sanity check whether the page is protected by a lock. */
+       rc = cl_page_is_under_lock(env, io, page);
+       if (rc != -EBUSY) {
+               CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n",
+                              rc == -ENODATA ? "without a lock" :
+                              "match failed", rc);
+               if (rc != -ENODATA)
+                       RETURN(rc);
+       }
+
+       if (cp->cpg_defer_uptodate) {
+               cp->cpg_ra_used = 1;
+               cl_page_export(env, page, 1);
+       }
+       /*
+        * Add the page into the queue even when it is marked uptodate
+        * above. This will unlock it automatically as part of
+        * cl_page_list_disown().
+        */
+       cl_2queue_add(queue, page);
+       if (sbi->ll_ra_info.ra_max_pages_per_file &&
+           sbi->ll_ra_info.ra_max_pages)
+               ll_readahead(env, io, ras,
+                            vmpage->mapping, &queue->c2_qin, fd->fd_flags);
+
+       RETURN(0);
+}
+
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+                           struct cl_page *page, struct ccc_page *cp,
+                           enum cl_req_type crt)
+{
+       struct cl_2queue  *queue;
+       int result;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+       queue = &io->ci_queue;
+       cl_2queue_init_page(queue, page);
+
+       result = cl_io_submit_sync(env, io, crt, queue, 0);
+       LASSERT(cl_page_is_owned(page, io));
+
+       if (crt == CRT_READ)
+               /*
+                * In the CRT_WRITE case the page is left locked even on
+                * error.
+                */
+               cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_2queue_fini(env, queue);
+
+       return result;
+}
+
+/**
+ * Prepare partially written-to page for a write.
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+                                 struct cl_object *obj, struct cl_page *pg,
+                                 struct ccc_page *cp,
+                                 unsigned from, unsigned to)
+{
+       struct cl_attr *attr   = ccc_env_thread_attr(env);
+       loff_t          offset = cl_offset(obj, pg->cp_index);
+       int             result;
+
+       cl_object_attr_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+       if (result == 0) {
+               /*
+                * If we are writing to a new page, no need to read old
+                * data. The extent locking will have updated the KMS, and
+                * for our purposes here we can treat it like i_size.
+                */
+               if (attr->cat_kms <= offset) {
+                       char *kaddr = ll_kmap_atomic(cp->cpg_page, KM_USER0);
+
+                       memset(kaddr, 0, cl_page_size(obj));
+                       ll_kunmap_atomic(kaddr, KM_USER0);
+               } else if (cp->cpg_defer_uptodate)
+                       cp->cpg_ra_used = 1;
+               else
+                       result = vvp_page_sync_io(env, io, pg, cp, CRT_READ);
+               /*
+                * In older implementations, obdo_refresh_inode was called
+                * here to update the inode because the write might modify
+                * the object info at the OST. However, this has proven
+                * useless, since the LVB functions will be called when a
+                * user space program tries to retrieve inode attributes.
+                * See bug 15909 for details. -jay
+                */
+               if (result == 0)
+                       cl_page_export(env, pg, 1);
+       }
+       return result;
+}
+
+static int vvp_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct cl_object *obj    = slice->cpl_obj;
+       struct ccc_page  *cp     = cl2ccc_page(slice);
+       struct cl_page   *pg     = slice->cpl_page;
+       struct page      *vmpage = cp->cpg_page;
+
+       int result;
+
+       ENTRY;
+
+       LINVRNT(cl_page_is_vmlocked(env, pg));
+       LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+       result = 0;
+
+       CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+       if (!PageUptodate(vmpage)) {
+               /*
+                * We're completely overwriting an existing page, so _don't_
+                * set it up to date until commit_write
+                */
+               if (from == 0 && to == PAGE_CACHE_SIZE) {
+                       CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+                       POISON_PAGE(vmpage, 0x11);
+               } else
+                       result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+                                                       pg, cp, from, to);
+       } else
+               CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+       RETURN(result);
+}
+
+static int vvp_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct cl_object     *obj    = slice->cpl_obj;
+       struct cl_io         *io     = ios->cis_io;
+       struct ccc_page      *cp     = cl2ccc_page(slice);
+       struct cl_page       *pg     = slice->cpl_page;
+       struct inode         *inode  = ccc_object_inode(obj);
+       struct ll_sb_info    *sbi    = ll_i2sbi(inode);
+       struct ll_inode_info *lli    = ll_i2info(inode);
+       struct page          *vmpage = cp->cpg_page;
+
+       int    result;
+       int    tallyop;
+       loff_t size;
+
+       ENTRY;
+
+       LINVRNT(cl_page_is_vmlocked(env, pg));
+       LASSERT(vmpage->mapping->host == inode);
+
+       LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n");
+       CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+       /*
+        * Queue a write for some time in the future the first time we
+        * dirty the page.
+        *
+        * This is different from what other file systems do: they usually
+        * just mark page (and some of its buffers) dirty and rely on
+        * balance_dirty_pages() to start a write-back. Lustre wants write-back
+        * to be started earlier for the following reasons:
+        *
+        *     (1) with a large number of clients we need to limit the amount
+        *     of cached data on the clients a lot;
+        *
+        *     (2) large compute jobs generally want compute-only then io-only
+        *     and the IO should complete as quickly as possible;
+        *
+        *     (3) IO is batched up to the RPC size and is async until the
+        *     client max cache is hit
+        *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+        *
+        */
+       if (!PageDirty(vmpage)) {
+               tallyop = LPROC_LL_DIRTY_MISSES;
+               result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+               if (result == 0) {
+                       /* page was added into cache successfully. */
+                       set_page_dirty(vmpage);
+                       vvp_write_pending(cl2ccc(obj), cp);
+               } else if (result == -EDQUOT) {
+                       pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+                       bool need_clip = true;
+
+                       /*
+                        * Client ran out of disk space grant. Possible
+                        * strategies are:
+                        *
+                        *     (a) do a sync write, renewing grant;
+                        *
+                        *     (b) stop writing on this stripe, switch to the
+                        *     next one.
+                        *
+                        * (b) is a part of "parallel io" design that is the
+                        * ultimate goal. (a) is what "old" client did, and
+                        * what the new code continues to do for the time
+                        * being.
+                        */
+                       if (last_index > pg->cp_index) {
+                               to = PAGE_CACHE_SIZE;
+                               need_clip = false;
+                       } else if (last_index == pg->cp_index) {
+                               int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
+
+                               if (to < size_to)
+                                       to = size_to;
+                       }
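+                       /*
+                        * Worked example (hypothetical numbers, 4 KiB
+                        * pages): with i_size = 6000 the last page is
+                        * index 1 and size_to = 6000 & ~CFS_PAGE_MASK =
+                        * 1904, so a write ending at to = 1000 on that
+                        * page is extended to to = 1904 and the sync
+                        * write below covers everything up to EOF.
+                        */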
+                       if (need_clip)
+                               cl_page_clip(env, pg, 0, to);
+                       result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE);
+                       if (result)
+                               CERROR("Write page %lu of inode %p failed %d\n",
+                                      pg->cp_index, inode, result);
+               }
+       } else {
+               tallyop = LPROC_LL_DIRTY_HITS;
+               result = 0;
+       }
+       ll_stats_ops_tally(sbi, tallyop, 1);
+
+       /* The inode should be marked DIRTY even if no new page was marked
+        * DIRTY, because the page could have gone unflushed between two
+        * modifications. It is important that the file is marked DIRTY as
+        * soon as the I/O is done: when the cache is flushed the file may
+        * already be closed, and then it is too late to warn the MDT.
+        * It is acceptable for the file to be marked DIRTY even if the I/O
+        * is dropped for some reason before being flushed to the OST.
+        */
+       if (result == 0) {
+               spin_lock(&lli->lli_lock);
+               lli->lli_flags |= LLIF_DATA_MODIFIED;
+               spin_unlock(&lli->lli_lock);
+       }
+
+       size = cl_offset(obj, pg->cp_index) + to;
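+       /* "size" is the file offset just past the last byte written on
+        * this page; e.g. (assuming 4 KiB pages) page index 3 with
+        * to = 100 gives size = 3 * 4096 + 100 = 12388. */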
+
+       ll_inode_size_lock(inode);
+       if (result == 0) {
+               if (size > i_size_read(inode)) {
+                       cl_isize_write_nolock(inode, size);
+                       CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n",
+                              PFID(lu_object_fid(&obj->co_lu)),
+                              (unsigned long)size);
+               }
+               cl_page_export(env, pg, 1);
+       } else {
+               if (size > i_size_read(inode))
+                       cl_page_discard(env, io, pg);
+       }
+       ll_inode_size_unlock(inode);
+       RETURN(result);
+}
+
+static const struct cl_io_operations vvp_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini      = vvp_io_read_fini,
+                       .cio_lock      = vvp_io_read_lock,
+                       .cio_start     = vvp_io_read_start,
+                       .cio_advance   = ccc_io_advance
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = vvp_io_fini,
+                       .cio_lock      = vvp_io_write_lock,
+                       .cio_start     = vvp_io_write_start,
+                       .cio_advance   = ccc_io_advance
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini       = vvp_io_setattr_fini,
+                       .cio_iter_init  = vvp_io_setattr_iter_init,
+                       .cio_lock       = vvp_io_setattr_lock,
+                       .cio_start      = vvp_io_setattr_start,
+                       .cio_end        = vvp_io_setattr_end
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = vvp_io_fault_fini,
+                       .cio_iter_init = vvp_io_fault_iter_init,
+                       .cio_lock      = vvp_io_fault_lock,
+                       .cio_start     = vvp_io_fault_start,
+                       .cio_end       = ccc_io_end
+               },
+               [CIT_FSYNC] = {
+                       .cio_start  = vvp_io_fsync_start,
+                       .cio_fini   = vvp_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = vvp_io_fini
+               }
+       },
+       .cio_read_page     = vvp_io_read_page,
+       .cio_prepare_write = vvp_io_prepare_write,
+       .cio_commit_write  = vvp_io_commit_write
+};
+
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_io *io)
+{
+       struct vvp_io *vio   = vvp_env_io(env);
+       struct ccc_io *cio   = ccc_env_io(env);
+       struct inode  *inode = ccc_object_inode(obj);
+       int            result;
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+       ENTRY;
+
+       CL_IO_SLICE_CLEAN(cio, cui_cl);
+       cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+       vio->cui_ra_window_set = 0;
+       result = 0;
+       if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
+               size_t count;
+               struct ll_inode_info *lli = ll_i2info(inode);
+
+               count = io->u.ci_rw.crw_count;
+               /* "If nbyte is 0, read() will return 0 and have no other
+                *  results."  -- Single Unix Spec */
+               if (count == 0)
+                       result = 1;
+               else {
+                       cio->cui_tot_count = count;
+                       cio->cui_tot_nrsegs = 0;
+               }
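+               /* (clio convention: a positive result from ->coo_io_init()
+                * makes cl_io_init() treat the io as already complete,
+                * which is how the 0-byte case above is short-circuited.) */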
+               /* For read/write, we store the jobid in the inode, and
+                * it will be fetched by osc when building the RPC.
+                *
+                * It is not accurate if the file is shared by different
+                * jobs.
+                */
+               lustre_get_jobid(lli->lli_jobid);
+       } else if (io->ci_type == CIT_SETATTR) {
+               if (!cl_io_is_trunc(io))
+                       io->ci_lockreq = CILR_MANDATORY;
+       }
+
+       /* Ignore layout changes for generic CIT_MISC, but not for glimpse:
+        * the io context for glimpse must set ci_verify_layout to true,
+        * see cl_glimpse_size0() for details. */
+       if (io->ci_type == CIT_MISC && !io->ci_verify_layout)
+               io->ci_ignore_layout = 1;
+
+       /* Enqueue the layout lock and get the layout version. We need to
+        * do this even for operations that require opening the file, such
+        * as read and write, because the layout lock might not be granted
+        * in IT_OPEN. */
+       if (result == 0 && !io->ci_ignore_layout) {
+               result = ll_layout_refresh(inode, &cio->cui_layout_gen);
+               if (result == -ENOENT)
+                       /* If the inode on the MDS has been removed, but the
+                        * objects on the OSTs haven't been destroyed (async
+                        * unlink), the layout fetch will return -ENOENT; we
+                        * ignore this error and continue with the dirty
+                        * flush. LU-3230. */
+                       result = 0;
+               if (result < 0)
+                       CERROR("%s: refresh file layout " DFID " error %d.\n",
+                               ll_get_fsname(inode->i_sb, NULL, 0),
+                               PFID(lu_object_fid(&obj->co_lu)), result);
+       }
+
+       RETURN(result);
+}
+
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       /* Calling just for the assertion */
+       cl2ccc_io(env, slice);
+       return vvp_env_io(env);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c
new file mode 100644 (file)
index 0000000..9b8712b
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Estimates lock value for the purpose of managing the lock cache during
+ * memory shortages.
+ *
+ * Locks for memory mapped files are almost infinitely precious, others are
+ * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are
+ * ordered within themselves by weights assigned from other layers.
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice)
+{
+       struct ccc_object *cob = cl2ccc(slice->cls_obj);
+
+       ENTRY;
+       RETURN(atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0);
+}
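+/*
+ * For example, on a 64-bit machine ~0UL >> 2 is 0x3fffffffffffffff: heavy
+ * enough that a mapped lock outweighs any unmapped one, while presumably
+ * leaving headroom so that weights assigned by other layers can still
+ * order mapped locks among themselves.
+ */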
+
+static const struct cl_lock_operations vvp_lock_ops = {
+       .clo_delete    = ccc_lock_delete,
+       .clo_fini      = ccc_lock_fini,
+       .clo_enqueue   = ccc_lock_enqueue,
+       .clo_wait      = ccc_lock_wait,
+       .clo_unuse     = ccc_lock_unuse,
+       .clo_fits_into = ccc_lock_fits_into,
+       .clo_state     = ccc_lock_state,
+       .clo_weigh     = vvp_lock_weigh
+};
+
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io)
+{
+       return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c
new file mode 100644 (file)
index 0000000..01edc5b
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       struct ccc_object    *obj   = lu2ccc(o);
+       struct inode         *inode = obj->cob_inode;
+       struct ll_inode_info *lli;
+
+       (*p)(env, cookie, "(%s %d %d) inode: %p ",
+            list_empty(&obj->cob_pending_list) ? "-" : "+",
+            obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt),
+            inode);
+       if (inode) {
+               lli = ll_i2info(inode);
+               (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID,
+                    inode->i_ino, inode->i_generation, inode->i_mode,
+                    inode->i_nlink, atomic_read(&inode->i_count),
+                    lli->lli_clob, PFID(&lli->lli_fid));
+       }
+       return 0;
+}
+
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       /*
+        * lov overwrites most of these fields in
+        * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+        * attributes are newer.
+        */
+
+       attr->cat_size = i_size_read(inode);
+       attr->cat_mtime = LTIME_S(inode->i_mtime);
+       attr->cat_atime = LTIME_S(inode->i_atime);
+       attr->cat_ctime = LTIME_S(inode->i_ctime);
+       attr->cat_blocks = inode->i_blocks;
+       attr->cat_uid = inode->i_uid;
+       attr->cat_gid = inode->i_gid;
+       /* KMS is not known by this layer */
+       return 0; /* layers below have to fill in the rest */
+}
+
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned valid)
+{
+       struct inode *inode = ccc_object_inode(obj);
+
+       if (valid & CAT_UID)
+               inode->i_uid = attr->cat_uid;
+       if (valid & CAT_GID)
+               inode->i_gid = attr->cat_gid;
+       if (valid & CAT_ATIME)
+               LTIME_S(inode->i_atime) = attr->cat_atime;
+       if (valid & CAT_MTIME)
+               LTIME_S(inode->i_mtime) = attr->cat_mtime;
+       if (valid & CAT_CTIME)
+               LTIME_S(inode->i_ctime) = attr->cat_ctime;
+       if (0 && valid & CAT_SIZE)
+               cl_isize_write_nolock(inode, attr->cat_size);
+       /* not currently necessary */
+       if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE))
+               mark_inode_dirty(inode);
+       return 0;
+}
+
+int vvp_conf_set(const struct lu_env *env, struct cl_object *obj,
+               const struct cl_object_conf *conf)
+{
+       struct ll_inode_info *lli = ll_i2info(conf->coc_inode);
+
+       if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+               lli->lli_layout_gen = LL_LAYOUT_GEN_NONE;
+               return 0;
+       }
+
+       if (conf->coc_opc != OBJECT_CONF_SET)
+               return 0;
+
+       if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) {
+               CDEBUG(D_VFSTRACE, "layout lock change: %u -> %u\n",
+                       lli->lli_layout_gen,
+                       conf->u.coc_md->lsm->lsm_layout_gen);
+
+               lli->lli_has_smd = true;
+               lli->lli_layout_gen = conf->u.coc_md->lsm->lsm_layout_gen;
+       } else {
+               CDEBUG(D_VFSTRACE, "layout lock destroyed: %u.\n",
+                       lli->lli_layout_gen);
+
+               lli->lli_has_smd = false;
+               lli->lli_layout_gen = LL_LAYOUT_GEN_EMPTY;
+       }
+       return 0;
+}
+
+static const struct cl_object_operations vvp_ops = {
+       .coo_page_init = vvp_page_init,
+       .coo_lock_init = vvp_lock_init,
+       .coo_io_init   = vvp_io_init,
+       .coo_attr_get  = vvp_attr_get,
+       .coo_attr_set  = vvp_attr_set,
+       .coo_conf_set  = vvp_conf_set,
+       .coo_glimpse   = ccc_object_glimpse
+};
+
+static const struct lu_object_operations vvp_lu_obj_ops = {
+       .loo_object_init  = ccc_object_init,
+       .loo_object_free  = ccc_object_free,
+       .loo_object_print = vvp_object_print
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+       struct cl_inode_info *lli = cl_i2info(inode);
+       struct cl_object     *obj = lli->lli_clob;
+       struct lu_object     *lu;
+
+       LASSERT(obj != NULL);
+       lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type);
+       LASSERT(lu != NULL);
+       return lu2ccc(lu);
+}
+
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev)
+{
+       return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
new file mode 100644 (file)
index 0000000..4568e69
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+       struct page *vmpage = cp->cpg_page;
+
+       LASSERT(vmpage != NULL);
+       page_cache_release(vmpage);
+}
+
+static void vvp_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct ccc_page *cp = cl2ccc_page(slice);
+       struct page *vmpage  = cp->cpg_page;
+
+       /*
+        * vmpage->private was already cleared when page was moved into
+        * VPG_FREEING state.
+        */
+       LASSERT((struct cl_page *)vmpage->private != slice->cpl_page);
+       vvp_page_fini_common(cp);
+}
+
+static int vvp_page_own(const struct lu_env *env,
+                       const struct cl_page_slice *slice, struct cl_io *io,
+                       int nonblock)
+{
+       struct ccc_page *vpg    = cl2ccc_page(slice);
+       struct page     *vmpage = vpg->cpg_page;
+
+       LASSERT(vmpage != NULL);
+       if (nonblock) {
+               if (!trylock_page(vmpage))
+                       return -EAGAIN;
+
+               if (unlikely(PageWriteback(vmpage))) {
+                       unlock_page(vmpage);
+                       return -EAGAIN;
+               }
+
+               return 0;
+       }
+
+       lock_page(vmpage);
+       wait_on_page_writeback(vmpage);
+       return 0;
+}
+
+static void vvp_page_assume(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+       wait_on_page_writeback(vmpage);
+}
+
+static void vvp_page_unassume(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+}
+
+static void vvp_page_disown(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       unlock_page(cl2vm_page(slice));
+}
+
+static void vvp_page_discard(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *unused)
+{
+       struct page          *vmpage  = cl2vm_page(slice);
+       struct address_space *mapping;
+       struct ccc_page      *cpg     = cl2ccc_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       mapping = vmpage->mapping;
+
+       if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used)
+               ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
+
+       /*
+        * truncate_complete_page() calls
+        * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+        */
+       truncate_complete_page(mapping, vmpage);
+}
+
+static int vvp_page_unmap(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+       __u64       offset;
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+
+       offset = vmpage->index << PAGE_CACHE_SHIFT;
+
+       /*
+        * XXX is it safe to call this with the page lock held?
+        */
+       ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+       return 0;
+}
+
+static void vvp_page_delete(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       struct page      *vmpage = cl2vm_page(slice);
+       struct inode     *inode  = vmpage->mapping->host;
+       struct cl_object *obj    = slice->cpl_obj;
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT((struct cl_page *)vmpage->private == slice->cpl_page);
+       LASSERT(inode == ccc_object_inode(obj));
+
+       vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice));
+       ClearPagePrivate(vmpage);
+       vmpage->private = 0;
+       /*
+        * Reference from vmpage to cl_page is removed, but the reference back
+        * is still here. It is removed later in vvp_page_fini().
+        */
+}
+
+static void vvp_page_export(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           int uptodate)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(vmpage != NULL);
+       LASSERT(PageLocked(vmpage));
+       if (uptodate)
+               SetPageUptodate(vmpage);
+       else
+               ClearPageUptodate(vmpage);
+}
+
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+                               const struct cl_page_slice *slice)
+{
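+       /* A tri-state answer rather than an error: -EBUSY means the vmpage
+        * is locked, -ENODATA means it is not. */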
+       return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA;
+}
+
+static int vvp_page_prep_read(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       ENTRY;
+       /* Skip the page already marked as PG_uptodate. */
+       RETURN(PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0);
+}
+
+static int vvp_page_prep_write(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *unused)
+{
+       struct page *vmpage = cl2vm_page(slice);
+
+       LASSERT(PageLocked(vmpage));
+       LASSERT(!PageDirty(vmpage));
+
+       set_page_writeback(vmpage);
+       vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
+
+       return 0;
+}
+
+/**
+ * Handles page transfer errors at VM level.
+ *
+ * This takes inode as a separate argument, because inode on which error is to
+ * be set can be different from \a vmpage inode in case of direct-io.
+ */
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+{
+       struct ccc_object *obj = cl_inode2ccc(inode);
+
+       if (ioret == 0) {
+               ClearPageError(vmpage);
+               obj->cob_discard_page_warned = 0;
+       } else {
+               SetPageError(vmpage);
+               if (ioret == -ENOSPC)
+                       set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+               else
+                       set_bit(AS_EIO, &inode->i_mapping->flags);
+
+               if ((ioret == -ESHUTDOWN || ioret == -EINTR) &&
+                    obj->cob_discard_page_warned == 0) {
+                       obj->cob_discard_page_warned = 1;
+                       ll_dirty_page_discard_warn(vmpage, ioret);
+               }
+       }
+}
+
+static void vvp_page_completion_read(const struct lu_env *env,
+                                    const struct cl_page_slice *slice,
+                                    int ioret)
+{
+       struct ccc_page *cp     = cl2ccc_page(slice);
+       struct page     *vmpage = cp->cpg_page;
+       struct cl_page  *page   = cl_page_top(slice->cpl_page);
+       struct inode    *inode  = ccc_object_inode(page->cp_obj);
+       ENTRY;
+
+       LASSERT(PageLocked(vmpage));
+       CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+       if (cp->cpg_defer_uptodate)
+               ll_ra_count_put(ll_i2sbi(inode), 1);
+
+       if (ioret == 0)  {
+               if (!cp->cpg_defer_uptodate)
+                       cl_page_export(env, page, 1);
+       } else
+               cp->cpg_defer_uptodate = 0;
+
+       if (page->cp_sync_io == NULL)
+               unlock_page(vmpage);
+
+       EXIT;
+}
+
+static void vvp_page_completion_write(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+       struct ccc_page *cp     = cl2ccc_page(slice);
+       struct cl_page  *pg     = slice->cpl_page;
+       struct page     *vmpage = cp->cpg_page;
+       ENTRY;
+
+       LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage)));
+       LASSERT(PageWriteback(vmpage));
+
+       CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+       /*
+        * TODO: Actually it makes sense to add the page back into the oap
+        * pending list, so that we don't need to take the page off the
+        * SoM write pending list if we just hit a recoverable error,
+        * -ENOMEM, etc.
+        * To implement this, we just need to return a non-zero value from
+        * the ->cpo_completion method. The underlying transfer would be
+        * notified and would then re-add the page into the pending
+        * transfer queue. -jay
+        */
+
+       cp->cpg_write_queued = 0;
+       vvp_write_complete(cl2ccc(slice->cpl_obj), cp);
+
+       /*
+        * Mark the page as in error only when it is an async write,
+        * because applications won't wait for the IO to finish.
+        */
+       if (pg->cp_sync_io == NULL)
+               vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret);
+
+       end_page_writeback(vmpage);
+       EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank a page from the transfer cache and to send it out as
+ * a part of transfer. This function try-locks the page. If the try-lock
+ * fails, the page is owned by some concurrent IO and should be skipped
+ * (this is bad, but hopefully a rare situation, as it usually results in
+ * the transfer being shorter than possible).
+ *
+ * \retval 0      success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ * truncated. Skip it.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+                              const struct cl_page_slice *slice)
+{
+       struct page *vmpage = cl2vm_page(slice);
+       struct cl_page *pg = slice->cpl_page;
+       int result = 0;
+
+       lock_page(vmpage);
+       if (clear_page_dirty_for_io(vmpage)) {
+               LASSERT(pg->cp_state == CPS_CACHED);
+               /* This actually clears the dirty bit in the radix
+                * tree. */
+               set_page_writeback(vmpage);
+               vvp_write_pending(cl2ccc(slice->cpl_obj),
+                               cl2ccc_page(slice));
+               CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+       } else if (pg->cp_state == CPS_PAGEOUT) {
+               /* is it possible for osc_flush_async_page() to already
+                * make it ready? */
+               result = -EALREADY;
+       } else {
+               CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpected page state %d.\n",
+                             pg->cp_state);
+               LBUG();
+       }
+       unlock_page(vmpage);
+       RETURN(result);
+}
+
+static int vvp_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct ccc_page *vp     = cl2ccc_page(slice);
+       struct page     *vmpage = vp->cpg_page;
+
+       (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) "
+                  "vm@%p ",
+                  vp, vp->cpg_defer_uptodate, vp->cpg_ra_used,
+                  vp->cpg_write_queued, vmpage);
+       if (vmpage != NULL) {
+               (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru",
+                          (long)vmpage->flags, page_count(vmpage),
+                          page_mapcount(vmpage), vmpage->private,
+                          page_index(vmpage),
+                          list_empty(&vmpage->lru) ? "not-" : "");
+       }
+       (*printer)(env, cookie, "\n");
+       return 0;
+}
+
+static const struct cl_page_operations vvp_page_ops = {
+       .cpo_own           = vvp_page_own,
+       .cpo_assume        = vvp_page_assume,
+       .cpo_unassume      = vvp_page_unassume,
+       .cpo_disown        = vvp_page_disown,
+       .cpo_vmpage        = ccc_page_vmpage,
+       .cpo_discard       = vvp_page_discard,
+       .cpo_delete        = vvp_page_delete,
+       .cpo_unmap         = vvp_page_unmap,
+       .cpo_export        = vvp_page_export,
+       .cpo_is_vmlocked   = vvp_page_is_vmlocked,
+       .cpo_fini          = vvp_page_fini,
+       .cpo_print         = vvp_page_print,
+       .cpo_is_under_lock = ccc_page_is_under_lock,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep        = vvp_page_prep_read,
+                       .cpo_completion  = vvp_page_completion_read,
+                       .cpo_make_ready  = ccc_fail,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep        = vvp_page_prep_write,
+                       .cpo_completion  = vvp_page_completion_write,
+                       .cpo_make_ready  = vvp_page_make_ready,
+               }
+       }
+};
+
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+       struct inode *inode = ccc_object_inode(page->cp_obj);
+
+       LASSERT(!mutex_trylock(&inode->i_mutex));
+}
+
+static int vvp_transient_page_own(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *unused, int nonblock)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+       return 0;
+}
+
+static void vvp_transient_page_assume(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_unassume(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_disown(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     struct cl_io *unused)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_discard(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *unused)
+{
+       struct cl_page *page = slice->cpl_page;
+
+       vvp_transient_page_verify(slice->cpl_page);
+
+       /*
+        * For transient pages, remove the page from the radix tree.
+        */
+       cl_page_delete(env, page);
+}
+
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+                                         const struct cl_page_slice *slice)
+{
+       struct inode *inode = ccc_object_inode(slice->cpl_obj);
+       int           locked;
+
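+       /* If mutex_trylock() succeeds, nobody held i_mutex, so the page is
+        * not "vmlocked" and the mutex is dropped again; if it fails,
+        * i_mutex is held (by the caller, per vvp_transient_page_verify()),
+        * which is what protects transient pages. */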
+       locked = !mutex_trylock(&inode->i_mutex);
+       if (!locked)
+               mutex_unlock(&inode->i_mutex);
+       return locked ? -EBUSY : -ENODATA;
+}
+
+static void
+vvp_transient_page_completion(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             int ioret)
+{
+       vvp_transient_page_verify(slice->cpl_page);
+}
+
+static void vvp_transient_page_fini(const struct lu_env *env,
+                                   struct cl_page_slice *slice)
+{
+       struct ccc_page *cp = cl2ccc_page(slice);
+       struct cl_page *clp = slice->cpl_page;
+       struct ccc_object *clobj = cl2ccc(clp->cp_obj);
+
+       vvp_page_fini_common(cp);
+       LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+       clobj->cob_transient_pages--;
+}
+
+static const struct cl_page_operations vvp_transient_page_ops = {
+       .cpo_own           = vvp_transient_page_own,
+       .cpo_assume        = vvp_transient_page_assume,
+       .cpo_unassume      = vvp_transient_page_unassume,
+       .cpo_disown        = vvp_transient_page_disown,
+       .cpo_discard       = vvp_transient_page_discard,
+       .cpo_vmpage        = ccc_page_vmpage,
+       .cpo_fini          = vvp_transient_page_fini,
+       .cpo_is_vmlocked   = vvp_transient_page_is_vmlocked,
+       .cpo_print         = vvp_page_print,
+       .cpo_is_under_lock = ccc_page_is_under_lock,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep        = ccc_transient_page_prep,
+                       .cpo_completion  = vvp_transient_page_completion,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep        = ccc_transient_page_prep,
+                       .cpo_completion  = vvp_transient_page_completion,
+               }
+       }
+};
+
+int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       struct ccc_page *cpg = cl_object_page_slice(obj, page);
+
+       CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+       cpg->cpg_page = vmpage;
+       page_cache_get(vmpage);
+
+       INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
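+       /*
+        * Cacheable pages live in the inode's page cache and get a
+        * back-pointer from the vmpage via vmpage->private; transient
+        * pages (e.g. for direct IO) are protected by i_mutex instead of
+        * the page lock and are only counted here.
+        */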
+       if (page->cp_type == CPT_CACHEABLE) {
+               SetPagePrivate(vmpage);
+               vmpage->private = (unsigned long)page;
+               cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                               &vvp_page_ops);
+       } else {
+               struct ccc_object *clobj = cl2ccc(obj);
+
+               LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex));
+               cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                               &vvp_transient_page_ops);
+               clobj->cob_transient_pages++;
+       }
+       return 0;
+}
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
new file mode 100644 (file)
index 0000000..4176264
--- /dev/null
@@ -0,0 +1,578 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/selinux.h>
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <obd_support.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_eacl.h>
+
+#include "llite_internal.h"
+
+#define XATTR_USER_T        (1)
+#define XATTR_TRUSTED_T     (2)
+#define XATTR_SECURITY_T    (3)
+#define XATTR_ACL_ACCESS_T  (4)
+#define XATTR_ACL_DEFAULT_T (5)
+#define XATTR_LUSTRE_T      (6)
+#define XATTR_OTHER_T       (7)
+
+static
+int get_xattr_type(const char *name)
+{
+       if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+               return XATTR_ACL_ACCESS_T;
+
+       if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+               return XATTR_ACL_DEFAULT_T;
+
+       if (!strncmp(name, XATTR_USER_PREFIX,
+                    sizeof(XATTR_USER_PREFIX) - 1))
+               return XATTR_USER_T;
+
+       if (!strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1))
+               return XATTR_TRUSTED_T;
+
+       if (!strncmp(name, XATTR_SECURITY_PREFIX,
+                    sizeof(XATTR_SECURITY_PREFIX) - 1))
+               return XATTR_SECURITY_T;
+
+       if (!strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1))
+               return XATTR_LUSTRE_T;
+
+       return XATTR_OTHER_T;
+}
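+/*
+ * For example, "user.foo" maps to XATTR_USER_T, "trusted.lov" to
+ * XATTR_TRUSTED_T, and a name with no recognized prefix falls through
+ * to XATTR_OTHER_T.
+ */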
+
+static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+       if ((xattr_type == XATTR_ACL_ACCESS_T ||
+            xattr_type == XATTR_ACL_DEFAULT_T) &&
+          !(sbi->ll_flags & LL_SBI_ACL))
+               return -EOPNOTSUPP;
+
+       if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
+               return -EOPNOTSUPP;
+       if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN))
+               return -EPERM;
+       if (xattr_type == XATTR_OTHER_T)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static
+int ll_setxattr_common(struct inode *inode, const char *name,
+                      const void *value, size_t size,
+                      int flags, __u64 valid)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req;
+       int xattr_type, rc;
+       struct obd_capa *oc;
+       posix_acl_xattr_header *new_value = NULL;
+       struct rmtacl_ctl_entry *rce = NULL;
+       ext_acl_xattr_header *acl = NULL;
+       const char *pv = value;
+       ENTRY;
+
+       xattr_type = get_xattr_type(name);
+       rc = xattr_type_filter(sbi, xattr_type);
+       if (rc)
+               RETURN(rc);
+
+       /* b10667: ignore lustre special xattr for now */
+       if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) ||
+           (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0))
+               RETURN(0);
+
+       /* b15587: ignore security.capability xattr for now */
+       if ((xattr_type == XATTR_SECURITY_T &&
+           strcmp(name, "security.capability") == 0))
+               RETURN(0);
+
+       /* LU-549:  Disable security.selinux when selinux is disabled */
+       if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+           strcmp(name, "security.selinux") == 0)
+               RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           (xattr_type == XATTR_ACL_ACCESS_T ||
+           xattr_type == XATTR_ACL_DEFAULT_T)) {
+               rce = rct_search(&sbi->ll_rct, current_pid());
+               if (rce == NULL ||
+                   (rce->rce_ops != RMT_LSETFACL &&
+                   rce->rce_ops != RMT_RSETFACL))
+                       RETURN(-EOPNOTSUPP);
+
+               if (rce->rce_ops == RMT_LSETFACL) {
+                       struct eacl_entry *ee;
+
+                       ee = et_search_del(&sbi->ll_et, current_pid(),
+                                          ll_inode2fid(inode), xattr_type);
+                       LASSERT(ee != NULL);
+                       if (valid & OBD_MD_FLXATTR) {
+                               acl = lustre_acl_xattr_merge2ext(
+                                               (posix_acl_xattr_header *)value,
+                                               size, ee->ee_acl);
+                               if (IS_ERR(acl)) {
+                                       ee_free(ee);
+                                       RETURN(PTR_ERR(acl));
+                               }
+                               size = CFS_ACL_XATTR_SIZE(
+                                               le32_to_cpu(acl->a_count),
+                                               ext_acl_xattr);
+                               pv = (const char *)acl;
+                       }
+                       ee_free(ee);
+               } else if (rce->rce_ops == RMT_RSETFACL) {
+                       /* lustre_posix_acl_xattr_filter() returns a count
+                        * or -errno; "size" is a size_t, so the result has
+                        * to land in a signed variable for the error check
+                        * to work. */
+                       int filtered = lustre_posix_acl_xattr_filter(
+                                               (posix_acl_xattr_header *)value,
+                                               size, &new_value);
+                       if (unlikely(filtered < 0))
+                               RETURN(filtered);
+                       size = filtered;
+
+                       pv = (const char *)new_value;
+               } else
+                       RETURN(-EOPNOTSUPP);
+
+               valid |= rce_ops2valid(rce->rce_ops);
+       }
+#endif
+       oc = ll_mdscapa_get(inode);
+       rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                        valid, name, pv, size, 0, flags, ll_i2suppgid(inode),
+                        &req);
+       capa_put(oc);
+#ifdef CONFIG_FS_POSIX_ACL
+       if (new_value != NULL)
+               lustre_posix_acl_xattr_free(new_value, size);
+       if (acl != NULL)
+               lustre_ext_acl_xattr_free(acl);
+#endif
+       if (rc) {
+               if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+                       LCONSOLE_INFO("Disabling user_xattr feature because "
+                                     "it is not supported on the server\n");
+                       sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+               }
+               RETURN(rc);
+       }
+
+       ptlrpc_req_finished(req);
+       RETURN(0);
+}
+
+int ll_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1);
+
+       if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+           (strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+               struct lov_user_md *lump = (struct lov_user_md *)value;
+               int rc = 0;
+
+               /* Attributes that are saved via getxattr will always have
+                * the stripe_offset as 0.  Instead, the MDS should be
+                * allowed to pick the starting OST index.   b=17846 */
+               if (lump != NULL && lump->lmm_stripe_offset == 0)
+                       lump->lmm_stripe_offset = -1;
+
+               if (lump != NULL && S_ISREG(inode->i_mode)) {
+                       struct file f;
+                       int flags = FMODE_WRITE;
+                       int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ?
+                               sizeof(*lump) : sizeof(struct lov_user_md_v3);
+
+                       f.f_dentry = dentry;
+                       rc = ll_lov_setstripe_ea_info(inode, &f, flags, lump,
+                                                     lum_size);
+                       /* b10667: rc is always 0 here for now */
+                       rc = 0;
+               } else if (S_ISDIR(inode->i_mode)) {
+                       rc = ll_dir_setstripe(inode, lump, 0);
+               }
+
+               return rc;
+
+       } else if (strcmp(name, XATTR_NAME_LMA) == 0 ||
+                  strcmp(name, XATTR_NAME_LINK) == 0)
+               return 0;
+
+       return ll_setxattr_common(inode, name, value, size, flags,
+                                 OBD_MD_FLXATTR);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1);
+       return ll_setxattr_common(inode, name, NULL, 0, 0,
+                                 OBD_MD_FLXATTRRM);
+}
+
+static
+int ll_getxattr_common(struct inode *inode, const char *name,
+                      void *buffer, size_t size, __u64 valid)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct ptlrpc_request *req = NULL;
+       struct mdt_body *body;
+       int xattr_type, rc;
+       void *xdata;
+       struct obd_capa *oc;
+       struct rmtacl_ctl_entry *rce = NULL;
+       ENTRY;
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       /* listxattr has slightly different behavior from ext3:
+        * without 'user_xattr' ext3 lists all xattr names but filters
+        * out "^user..*"; we list them all for simplicity.
+        */
+       if (!name) {
+               xattr_type = XATTR_OTHER_T;
+               goto do_getxattr;
+       }
+
+       xattr_type = get_xattr_type(name);
+       rc = xattr_type_filter(sbi, xattr_type);
+       if (rc)
+               RETURN(rc);
+
+       /* b15587: ignore security.capability xattr for now */
+       if ((xattr_type == XATTR_SECURITY_T &&
+           strcmp(name, "security.capability") == 0))
+               RETURN(-ENODATA);
+
+       /* LU-549:  Disable security.selinux when selinux is disabled */
+       if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() &&
+           strcmp(name, "security.selinux") == 0)
+               RETURN(-EOPNOTSUPP);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
+           (xattr_type == XATTR_ACL_ACCESS_T ||
+           xattr_type == XATTR_ACL_DEFAULT_T)) {
+               rce = rct_search(&sbi->ll_rct, current_pid());
+               if (rce == NULL ||
+                   (rce->rce_ops != RMT_LSETFACL &&
+                   rce->rce_ops != RMT_LGETFACL &&
+                   rce->rce_ops != RMT_RSETFACL &&
+                   rce->rce_ops != RMT_RGETFACL))
+                       RETURN(-EOPNOTSUPP);
+       }
+
+       /* A POSIX ACL is under the protection of the LOOKUP lock. By the
+        * time we get here, path resolution to the target inode has already
+        * been done, so there is a good chance the cached ACL is up to date.
+        */
+       if (xattr_type == XATTR_ACL_ACCESS_T &&
+           !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) {
+               struct ll_inode_info *lli = ll_i2info(inode);
+               struct posix_acl *acl;
+
+               spin_lock(&lli->lli_lock);
+               acl = posix_acl_dup(lli->lli_posix_acl);
+               spin_unlock(&lli->lli_lock);
+
+               if (!acl)
+                       RETURN(-ENODATA);
+
+               rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+               posix_acl_release(acl);
+               RETURN(rc);
+       }
+       if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode))
+               RETURN(-ENODATA);
+#endif
+
+do_getxattr:
+       oc = ll_mdscapa_get(inode);
+       rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
+                        valid | (rce ? rce_ops2valid(rce->rce_ops) : 0),
+                        name, NULL, 0, size, 0, &req);
+       capa_put(oc);
+       if (rc) {
+               if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
+                       LCONSOLE_INFO("Disabling user_xattr feature because "
+                                     "it is not supported on the server\n");
+                       sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+               }
+               RETURN(rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body);
+
+       /* only the xattr size was requested */
+       if (size == 0)
+               GOTO(out, rc = body->eadatasize);
+
+       if (size < body->eadatasize) {
+               CERROR("server bug: replied size %u > %u\n",
+                      body->eadatasize, (int)size);
+               GOTO(out, rc = -ERANGE);
+       }
+
+       if (body->eadatasize == 0)
+               GOTO(out, rc = -ENODATA);
+
+       /* no need to swab the xattr data */
+       xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA,
+                                            body->eadatasize);
+       if (!xdata)
+               GOTO(out, rc = -EFAULT);
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (rce != NULL && rce->rce_ops == RMT_LSETFACL) {
+               ext_acl_xattr_header *acl;
+
+               acl = lustre_posix_acl_xattr_2ext((posix_acl_xattr_header *)xdata,
+                                                 body->eadatasize);
+               if (IS_ERR(acl))
+                       GOTO(out, rc = PTR_ERR(acl));
+
+               rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode),
+                           xattr_type, acl);
+               if (unlikely(rc < 0)) {
+                       lustre_ext_acl_xattr_free(acl);
+                       GOTO(out, rc);
+               }
+       }
+#endif
+
+       if (body->eadatasize == 0) {
+               rc = -ENODATA;
+       } else {
+               LASSERT(buffer);
+               memcpy(buffer, xdata, body->eadatasize);
+               rc = body->eadatasize;
+       }
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
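+
+/*
+ * Sketch (assumes standard getxattr(2) semantics; not from this patch):
+ * the size == 0 probe above serves the usual two-call pattern, e.g.
+ *
+ *     ssize_t n = getxattr(path, "user.foo", NULL, 0);   // size probe
+ *     void *buf = malloc(n);
+ *     getxattr(path, "user.foo", buf, n);                // actual value
+ */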
+
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+                   void *buffer, size_t size)
+{
+       struct inode *inode = dentry->d_inode;
+
+       LASSERT(inode);
+       LASSERT(name);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n",
+              inode->i_ino, inode->i_generation, inode, name);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1);
+
+       if ((strncmp(name, XATTR_TRUSTED_PREFIX,
+                    sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) ||
+           (strncmp(name, XATTR_LUSTRE_PREFIX,
+                    sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 &&
+            strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) {
+               struct lov_stripe_md *lsm;
+               struct lov_user_md *lump;
+               struct lov_mds_md *lmm = NULL;
+               struct ptlrpc_request *request = NULL;
+               int rc = 0, lmmsize = 0;
+
+               if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+                       return -ENODATA;
+
+               if (size == 0 && S_ISDIR(inode->i_mode)) {
+                       /* XXX: directory EA size is fixed for now; optimize
+                        * later to save the RPC transfer */
+                       GOTO(out, rc = sizeof(struct lov_user_md));
+               }
+
+               lsm = ccc_inode_lsm_get(inode);
+               if (lsm == NULL) {
+                       if (S_ISDIR(inode->i_mode)) {
+                               rc = ll_dir_getstripe(inode, &lmm,
+                                                     &lmmsize, &request);
+                       } else {
+                               rc = -ENODATA;
+                       }
+               } else {
+                       /* The LSM is already present after the lookup/getattr
+                        * call; we will need to grab the layout lock here once
+                        * it is implemented */
+                       rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm);
+                       lmmsize = rc;
+               }
+               ccc_inode_lsm_put(inode, lsm);
+
+               if (rc < 0)
+                      GOTO(out, rc);
+
+               if (size == 0) {
+                       /* We used to call ll_get_max_mdsize() here to get
+                        * the maximum buffer size, but some apps (such as
+                        * rsync 3.0.x) care about the exact xattr value
+                        * size */
+                       rc = lmmsize;
+                       GOTO(out, rc);
+               }
+
+               if (size < lmmsize) {
+                       CERROR("server bug: replied size %d > %d for %s (%s)\n",
+                              lmmsize, (int)size, dentry->d_name.name, name);
+                       GOTO(out, rc = -ERANGE);
+               }
+
+               lump = (struct lov_user_md *)buffer;
+               memcpy(lump, lmm, lmmsize);
+               /* Do not return the layout gen for getxattr; otherwise it
+                * would confuse tar --xattrs into treating the layout gen
+                * as a stripe offset when the file is restored. See LU-2809. */
+               lump->lmm_layout_gen = 0;
+
+               rc = lmmsize;
+out:
+               if (request)
+                       ptlrpc_req_finished(request);
+               else if (lmm)
+                       obd_free_diskmd(ll_i2dtexp(inode), &lmm);
+               return rc;
+       }
+
+       return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
+}
+
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+       struct inode *inode = dentry->d_inode;
+       int rc = 0, rc2 = 0;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *request = NULL;
+       int lmmsize;
+
+       LASSERT(inode);
+
+       CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n",
+              inode->i_ino, inode->i_generation, inode);
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1);
+
+       rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       if (buffer != NULL) {
+               struct ll_sb_info *sbi = ll_i2sbi(inode);
+               char *xattr_name = buffer;
+               int xlen, rem = rc;
+
+               while (rem > 0) {
+                       xlen = strnlen(xattr_name, rem - 1) + 1;
+                       rem -= xlen;
+                       if (xattr_type_filter(sbi,
+                                       get_xattr_type(xattr_name)) == 0) {
+                               /* this xattr type is OK: skip it and
+                                * leave it in the buffer
+                                */
+                               xattr_name += xlen;
+                               continue;
+                       }
+                       /* move the remaining xattrs up in the buffer,
+                        * removing the xattr that is not OK
+                        */
+                       memmove(xattr_name, xattr_name + xlen, rem);
+                       rc -= xlen;
+               }
+       }
+       if (S_ISREG(inode->i_mode)) {
+               if (!ll_i2info(inode)->lli_has_smd)
+                       rc2 = -1;
+       } else if (S_ISDIR(inode->i_mode)) {
+               rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request);
+       }
+
+       if (rc2 < 0) {
+               GOTO(out, rc2 = 0);
+       } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+               const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1;
+               const size_t name_len   = sizeof("lov") - 1;
+               const size_t total_len  = prefix_len + name_len + 1;
+
+               if (buffer && (rc + total_len) <= size) {
+                       buffer += rc;
+                       memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len);
+                       memcpy(buffer + prefix_len, "lov", name_len);
+                       buffer[prefix_len + name_len] = '\0';
+               }
+               rc2 = total_len;
+       }
+out:
+       ptlrpc_req_finished(request);
+       rc = rc + rc2;
+
+       return rc;
+}
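+
+/*
+ * Note (illustrative): the loop above compacts the server-returned name
+ * list in place, dropping names whose type is filtered out, and the tail
+ * then appends the virtual "lustre.lov" name for objects that carry
+ * striping information, so listxattr(2) reports it even though the MDS
+ * never stores it as a regular xattr.
+ */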
diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile
new file mode 100644 (file)
index 0000000..8cc81ad
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += lmv.o
+lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c
new file mode 100644 (file)
index 0000000..a4805ae
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2013, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_fid.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+int lmv_fld_lookup(struct lmv_obd *lmv,
+                  const struct lu_fid *fid,
+                  mdsno_t *mds)
+{
+       int rc;
+       ENTRY;
+
+       /* FIXME: ZFS currently still uses a local seq for ROOT, unfortunately;
+        * this fid_seq_is_local_file() check should be removed once LU-2240 is
+        * fixed */
+       LASSERTF((fid_seq_in_fldb(fid_seq(fid)) ||
+                 fid_seq_is_local_file(fid_seq(fid))) &&
+                fid_is_sane(fid), DFID" is insane!\n", PFID(fid));
+
+       rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds,
+                              LU_SEQ_RANGE_MDT, NULL);
+       if (rc) {
+               CERROR("Error while looking for mds number. Seq "LPX64
+                      ", err = %d\n", fid_seq(fid), rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n",
+              *mds, PFID(fid));
+
+       if (*mds >= lmv->desc.ld_tgt_count) {
+               CERROR("FLD lookup got invalid mds #%x (max: %x) "
+                      "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count,
+                      PFID(fid));
+               rc = -EINVAL;
+       }
+       RETURN(rc);
+}
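+
+/*
+ * Illustrative call chain (an assumption for exposition): lmv_find_target()
+ * feeds fid_seq(fid) through this FLD lookup to obtain an MDT index and
+ * then resolves that index to a target via lmv_get_target(), which yields
+ * the MDC export to send the request to.
+ */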
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c
new file mode 100644 (file)
index 0000000..7eefab5
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+#include <linux/lustre_intent.h>
+
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre_dlm.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "lmv_internal.h"
+
+static int lmv_intent_remote(struct obd_export *exp, void *lmm,
+                            int lmmsize, struct lookup_intent *it,
+                            const struct lu_fid *parent_fid, int flags,
+                            struct ptlrpc_request **reqp,
+                            ldlm_blocking_callback cb_blocking,
+                            __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct ptlrpc_request   *req = NULL;
+       struct lustre_handle    plock;
+       struct md_op_data       *op_data;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body         *body;
+       int                     pmode;
+       int                     rc = 0;
+       ENTRY;
+
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       LASSERT((body->valid & OBD_MD_MDS));
+
+       /*
+        * Unfortunately, we have to lie to MDC/MDS to retrieve the
+        * attributes llite needs and to provide proper locking.
+        */
+       if (it->it_op & IT_LOOKUP)
+               it->it_op = IT_GETATTR;
+
+       /*
+        * We got LOOKUP lock, but we really need attrs.
+        */
+       pmode = it->d.lustre.it_lock_mode;
+       if (pmode) {
+               plock.cookie = it->d.lustre.it_lock_handle;
+               it->d.lustre.it_lock_mode = 0;
+               it->d.lustre.it_data = NULL;
+       }
+
+       LASSERT(fid_is_sane(&body->fid1));
+
+       tgt = lmv_find_target(lmv, &body->fid1);
+       if (IS_ERR(tgt))
+               GOTO(out, rc = PTR_ERR(tgt));
+
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       op_data->op_fid1 = body->fid1;
+       /* Send the parent FID to the remote MDT */
+       if (parent_fid != NULL) {
+               /* The parent FID is used only by remote open to
+                * check whether the open comes from the OBF;
+                * see mdt_cross_open */
+               LASSERT(it->it_op & IT_OPEN);
+               op_data->op_fid2 = *parent_fid;
+               /* Add the object FID to op_fid3, in case a staleness check
+                * (M_CHECK_STALE) is needed; see mdc_finish_intent_lock */
+               op_data->op_fid3 = body->fid1;
+       }
+
+       op_data->op_bias = MDS_CROSS_REF;
+       CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n",
+              PFID(&body->fid1), tgt->ltd_idx);
+
+       it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+                           flags, &req, cb_blocking, extra_lock_flags);
+       if (rc)
+               GOTO(out_free_op_data, rc);
+
+       /*
+        * LLite needs LOOKUP lock to track dentry revocation in order to
+        * maintain dcache consistency. Thus drop UPDATE|PERM lock here
+        * and put LOOKUP in request.
+        */
+       if (it->d.lustre.it_lock_mode != 0) {
+               it->d.lustre.it_remote_lock_handle =
+                                       it->d.lustre.it_lock_handle;
+               it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode;
+       }
+
+       it->d.lustre.it_lock_handle = plock.cookie;
+       it->d.lustre.it_lock_mode = pmode;
+
+       EXIT;
+out_free_op_data:
+       OBD_FREE_PTR(op_data);
+out:
+       if (rc && pmode)
+               ldlm_lock_decref(&plock, pmode);
+
+       ptlrpc_req_finished(*reqp);
+       *reqp = req;
+       return rc;
+}
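+
+/*
+ * Summary (illustrative): the lock returned by the originating MDT is
+ * parked in plock while the remote MDT is queried, the remote lock
+ * handles are stashed in it_remote_lock_{handle,mode}, and plock is then
+ * restored so llite still holds a LOOKUP lock for dcache revocation.
+ */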
+
+/*
+ * IT_OPEN is intended to open (and possibly create) an object. The parent
+ * (pid) may be a split directory.
+ */
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body         *body;
+       int                     rc;
+       ENTRY;
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /* If the file is going to be opened by FID, there is no need to
+        * allocate a FID at all; doing so would confuse the MDT */
+       if ((it->it_op & IT_CREAT) &&
+           !(it->it_flags & MDS_OPEN_BY_FID)) {
+               /*
+                * For open with IT_CREAT, allocate a new FID and set up
+                * the FLD entry for it.
+                */
+               op_data->op_fid3 = op_data->op_fid2;
+               rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+
+       CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
+              " name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+              PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags,
+                           reqp, cb_blocking, extra_lock_flags);
+       if (rc != 0)
+               RETURN(rc);
+       /*
+        * Nothing was found; do not access body->fid1, as it is zero and
+        * thus meaningless.
+        */
+       if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) &&
+           !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) &&
+           !(it->d.lustre.it_disposition & DISP_OPEN_OPEN))
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+       /*
+        * Not cross-ref case, just get out of here.
+        */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       /*
+        * Okay, the MDS has returned success. The name has probably been
+        * resolved to a remote inode.
+        */
+       rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags,
+                              reqp, cb_blocking, extra_lock_flags);
+       if (rc != 0) {
+               LASSERT(rc < 0);
+               /*
+                * It is possible that a userspace application will try to
+                * open a file as a directory, in which case we get -ENOTDIR
+                * here. As this is a normal situation, do not print an error,
+                * only debug info.
+                */
+               CDEBUG(D_INODE, "Can't handle remote %s: dir "DFID"("DFID"):"
+                      "%*s: %d\n", LL_IT2STR(it), PFID(&op_data->op_fid2),
+                      PFID(&op_data->op_fid1), op_data->op_namelen,
+                      op_data->op_name, rc);
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * Handler for: getattr, lookup and revalidate cases.
+ */
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+                     void *lmm, int lmmsize, struct lookup_intent *it,
+                     int flags, struct ptlrpc_request **reqp,
+                     ldlm_blocking_callback cb_blocking,
+                     __u64 extra_lock_flags)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt = NULL;
+       struct mdt_body *body;
+       int                  rc = 0;
+       ENTRY;
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (!fid_is_sane(&op_data->op_fid2))
+               fid_zero(&op_data->op_fid2);
+
+       CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID
+              ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1),
+              PFID(&op_data->op_fid2),
+              op_data->op_name ? op_data->op_name : "<NULL>",
+              tgt->ltd_idx);
+
+       op_data->op_bias &= ~MDS_CROSS_REF;
+
+       rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it,
+                            flags, reqp, cb_blocking, extra_lock_flags);
+
+       if (rc < 0 || *reqp == NULL)
+               RETURN(rc);
+
+       /*
+        * The MDS has returned success. The name has probably been resolved
+        * to a remote inode. Let's check this.
+        */
+       body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+       /* Not cross-ref case, just get out of here. */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp,
+                              cb_blocking, extra_lock_flags);
+
+       RETURN(rc);
+}
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct obd_device *obd = exp->exp_obd;
+       int             rc;
+       ENTRY;
+
+       LASSERT(it != NULL);
+       LASSERT(fid_is_sane(&op_data->op_fid1));
+
+       CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n",
+              LL_IT2STR(it), op_data->op_namelen, op_data->op_name,
+              PFID(&op_data->op_fid1));
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT))
+               rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it,
+                                      flags, reqp, cb_blocking,
+                                      extra_lock_flags);
+       else if (it->it_op & IT_OPEN)
+               rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it,
+                                    flags, reqp, cb_blocking,
+                                    extra_lock_flags);
+       else
+               LBUG();
+       RETURN(rc);
+}
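+
+/*
+ * Note: lmv_intent_lock() is the single entry point for intent handling;
+ * IT_LOOKUP/IT_GETATTR/IT_LAYOUT are routed to lmv_intent_lookup() and
+ * IT_OPEN to lmv_intent_open(), each of which may chase a cross-MDT
+ * reference via lmv_intent_remote().
+ */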
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h
new file mode 100644 (file)
index 0000000..f75b0a9
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _LMV_INTERNAL_H_
+#define _LMV_INTERNAL_H_
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+
+#define LMV_MAX_TGT_COUNT 128
+
+#define lmv_init_lock(lmv)   mutex_lock(&lmv->init_mutex)
+#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex)
+
+#define LL_IT2STR(it)                                  \
+       ((it) ? ldlm_it2str((it)->it_op) : "0")
+
+int lmv_check_connect(struct obd_device *obd);
+
+int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+
+int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data,
+                     void *lmm, int lmmsize, struct lookup_intent *it,
+                     int flags, struct ptlrpc_request **reqp,
+                     ldlm_blocking_callback cb_blocking,
+                     __u64 extra_lock_flags);
+
+int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+
+int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
+                    void *, int);
+int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid,
+                  mdsno_t *mds);
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+                   mdsno_t mds);
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data);
+
+static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req)
+{
+       struct mdt_body  *body;
+       struct lmv_stripe_md    *mea;
+
+       LASSERT(req != NULL);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+
+       if (!body || !S_ISDIR(body->mode) || !body->eadatasize)
+               return NULL;
+
+       mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD,
+                                          body->eadatasize);
+       LASSERT(mea != NULL);
+
+       if (mea->mea_count == 0)
+               return NULL;
+       if (mea->mea_magic != MEA_MAGIC_LAST_CHAR &&
+           mea->mea_magic != MEA_MAGIC_ALL_CHARS &&
+           mea->mea_magic != MEA_MAGIC_HASH_SEGMENT)
+               return NULL;
+
+       return mea;
+}
+
+static inline int lmv_get_easize(struct lmv_obd *lmv)
+{
+       return sizeof(struct lmv_stripe_md) +
+               lmv->desc.ld_tgt_count *
+               sizeof(struct lu_fid);
+}
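+
+/*
+ * Example (illustrative): with ld_tgt_count == 4 this yields
+ * sizeof(struct lmv_stripe_md) + 4 * sizeof(struct lu_fid); the exact
+ * byte count depends on the structure layout of the running kernel.
+ */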
+
+static inline struct lmv_tgt_desc *
+lmv_get_target(struct lmv_obd *lmv, mdsno_t mds)
+{
+       int count = lmv->desc.ld_tgt_count;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               if (lmv->tgts[i] == NULL)
+                       continue;
+
+               if (lmv->tgts[i]->ltd_idx == mds)
+                       return lmv->tgts[i];
+       }
+
+       return ERR_PTR(-ENODEV);
+}
+
+static inline struct lmv_tgt_desc *
+lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
+{
+       mdsno_t mds = 0;
+       int rc;
+
+       if (lmv->desc.ld_tgt_count > 1) {
+               rc = lmv_fld_lookup(lmv, fid, &mds);
+               if (rc)
+                       return ERR_PTR(rc);
+       }
+
+       return lmv_get_target(lmv, mds);
+}
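+
+/*
+ * Note: with a single configured MDT the FLD lookup is skipped entirely
+ * and target index 0 is assumed, so single-MDT setups pay no lookup cost.
+ */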
+
+struct lmv_tgt_desc
+*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+               struct lu_fid *fid);
+/* lproc_lmv.c */
+#ifdef LPROCFS
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+extern struct file_operations lmv_proc_target_fops;
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
new file mode 100644 (file)
index 0000000..1eebfbf
--- /dev/null
@@ -0,0 +1,2727 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <asm/div64.h>
+#include <linux/seq_file.h>
+#include <linux/namei.h>
+
+#include <lustre/lustre_idl.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre_lite.h>
+#include <lustre_fid.h>
+#include "lmv_internal.h"
+
+static void lmv_activate_target(struct lmv_obd *lmv,
+                               struct lmv_tgt_desc *tgt,
+                               int activate)
+{
+       if (tgt->ltd_active == activate)
+               return;
+
+       tgt->ltd_active = activate;
+       lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
+}
+
+/**
+ * Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LMV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD is of the wrong type (!)
+ */
+static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
+                             int activate)
+{
+       struct lmv_tgt_desc    *uninitialized_var(tgt);
+       struct obd_device      *obd;
+       int                  i;
+       int                  rc = 0;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
+              lmv, uuid->uuid, activate);
+
+       spin_lock(&lmv->lmv_lock);
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL)
+                       continue;
+
+               CDEBUG(D_INFO, "Target idx %d is %s conn "LPX64"\n", i,
+                      tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
+
+               if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+                       break;
+       }
+
+       if (i == lmv->desc.ld_tgt_count)
+               GOTO(out_lmv_lock, rc = -EINVAL);
+
+       obd = class_exp2obd(tgt->ltd_exp);
+       if (obd == NULL)
+               GOTO(out_lmv_lock, rc = -ENOTCONN);
+
+       CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
+              obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
+              obd->obd_type->typ_name, i);
+       LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
+
+       if (tgt->ltd_active == activate) {
+               CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
+                      activate ? "" : "in");
+               GOTO(out_lmv_lock, rc);
+       }
+
+       CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
+              activate ? "" : "in");
+       lmv_activate_target(lmv, tgt, activate);
+       EXIT;
+
+ out_lmv_lock:
+       spin_unlock(&lmv->lmv_lock);
+       return rc;
+}
+
+struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
+{
+       struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+       return obd_get_uuid(lmv->tgts[0]->ltd_exp);
+}
+
+static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data)
+{
+       struct obd_connect_data *conn_data;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct obd_uuid  *uuid;
+       int                   rc = 0;
+       ENTRY;
+
+       if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+               CERROR("unexpected notification of %s %s!\n",
+                      watched->obd_type->typ_name,
+                      watched->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       uuid = &watched->u.cli.cl_target_uuid;
+       if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
+               /*
+                * Set MDC as active before notifying the observer, so the
+                * observer can use the MDC normally.
+                */
+               rc = lmv_set_mdc_active(lmv, uuid,
+                                       ev == OBD_NOTIFY_ACTIVE);
+               if (rc) {
+                       CERROR("%sactivation of %s failed: %d\n",
+                              ev == OBD_NOTIFY_ACTIVE ? "" : "de",
+                              uuid->uuid, rc);
+                       RETURN(rc);
+               }
+       } else if (ev == OBD_NOTIFY_OCD) {
+               conn_data = &watched->u.cli.cl_import->imp_connect_data;
+               /*
+                * XXX: Make sure that ocd_connect_flags from all targets are
+                * the same. Otherwise one of MDTs runs wrong version or
+                * something like this.  --umka
+                */
+               obd->obd_self_export->exp_connect_data = *conn_data;
+       }
+#if 0
+       else if (ev == OBD_NOTIFY_DISCON) {
+               /*
+                * For disconnect event, flush fld cache for failout MDS case.
+                */
+               fld_client_flush(&lmv->lmv_fld);
+       }
+#endif
+       /*
+        * Pass the notification up the chain.
+        */
+       if (obd->obd_observer)
+               rc = obd_notify(obd->obd_observer, watched, ev, data);
+
+       RETURN(rc);
+}
+
+/**
+ * This is a fake connect function. Its purpose is to initialize the lmv
+ * structure and tell the caller that everything is okay. The real
+ * connection will be performed later.
+ */
+static int lmv_connect(const struct lu_env *env,
+                      struct obd_export **exp, struct obd_device *obd,
+                      struct obd_uuid *cluuid, struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct proc_dir_entry *lmv_proc_dir;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lustre_handle  conn = { 0 };
+       int                 rc = 0;
+       ENTRY;
+
+       /*
+        * We don't want to actually do the underlying connections more than
+        * once, so keep track.
+        */
+       lmv->refcount++;
+       if (lmv->refcount > 1) {
+               *exp = NULL;
+               RETURN(0);
+       }
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc) {
+               CERROR("class_connect() returned %d\n", rc);
+               RETURN(rc);
+       }
+
+       *exp = class_conn2export(&conn);
+       class_export_get(*exp);
+
+       lmv->exp = *exp;
+       lmv->connected = 0;
+       lmv->cluuid = *cluuid;
+
+       if (data)
+               lmv->conn_data = *data;
+
+       if (obd->obd_proc_private != NULL) {
+               lmv_proc_dir = obd->obd_proc_private;
+       } else {
+               lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry,
+                                               NULL, NULL);
+               if (IS_ERR(lmv_proc_dir)) {
+                       CERROR("could not register /proc/fs/lustre/%s/%s/target_obds\n",
+                              obd->obd_type->typ_name, obd->obd_name);
+                       lmv_proc_dir = NULL;
+               }
+               obd->obd_proc_private = lmv_proc_dir;
+       }
+
+       /*
+        * All real clients should perform the actual connection right away,
+        * because it is possible that LMV will not have an opportunity to
+        * connect targets and the MDC code will be called directly, for
+        * instance while reading the ../mdc/../kbytesfree procfs file, etc.
+        */
+       if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_REAL))
+               rc = lmv_check_connect(obd);
+
+       if (rc && lmv_proc_dir) {
+               lprocfs_remove(&lmv_proc_dir);
+               obd->obd_proc_private = NULL;
+       }
+
+       RETURN(rc);
+}
+
+static void lmv_set_timeouts(struct obd_device *obd)
+{
+       struct lmv_tgt_desc   *tgt;
+       struct lmv_obd  *lmv;
+       int                 i;
+
+       lmv = &obd->u.lmv;
+       if (lmv->server_timeout == 0)
+               return;
+
+       if (lmv->connected == 0)
+               return;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0)
+                       continue;
+
+               obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
+                                  KEY_INTERMDS, 0, NULL, NULL);
+       }
+}
+
+static int lmv_init_ea_size(struct obd_export *exp, int easize,
+                           int def_easize, int cookiesize)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc = 0;
+       int               change = 0;
+       ENTRY;
+
+       if (lmv->max_easize < easize) {
+               lmv->max_easize = easize;
+               change = 1;
+       }
+       if (lmv->max_def_easize < def_easize) {
+               lmv->max_def_easize = def_easize;
+               change = 1;
+       }
+       if (lmv->max_cookiesize < cookiesize) {
+               lmv->max_cookiesize = cookiesize;
+               change = 1;
+       }
+       if (change == 0)
+               RETURN(0);
+
+       if (lmv->connected == 0)
+               RETURN(0);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL ||
+                   lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0) {
+                       CWARN("%s: NULL export for %d\n", obd->obd_name, i);
+                       continue;
+               }
+
+               rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize,
+                                    cookiesize);
+               if (rc) {
+                       CERROR("%s: obd_init_ea_size() failed on MDT target %d:"
+                              " rc = %d.\n", obd->obd_name, i, rc);
+                       break;
+               }
+       }
+       RETURN(rc);
+}
+
+#define MAX_STRING_SIZE 128
+
+int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+       struct proc_dir_entry   *lmv_proc_dir;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct obd_uuid  *cluuid = &lmv->cluuid;
+       struct obd_uuid   lmv_mdc_uuid = { "LMV_MDC_UUID" };
+       struct obd_device       *mdc_obd;
+       struct obd_export       *mdc_exp;
+       struct lu_fld_target     target;
+       int                   rc;
+       ENTRY;
+
+       mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+                                       &obd->obd_uuid);
+       if (!mdc_obd) {
+               CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
+               mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+               tgt->ltd_uuid.uuid, obd->obd_uuid.uuid,
+               cluuid->uuid);
+
+       if (!mdc_obd->obd_set_up) {
+               CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
+
+       rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
+                        &lmv->conn_data, NULL);
+       if (rc) {
+               CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
+               RETURN(rc);
+       }
+
+       /*
+        * Init fid sequence client for this mdc and add new fld target.
+        */
+       rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
+       if (rc)
+               RETURN(rc);
+
+       target.ft_srv = NULL;
+       target.ft_exp = mdc_exp;
+       target.ft_idx = tgt->ltd_idx;
+
+       fld_client_add_target(&lmv->lmv_fld, &target);
+
+       rc = obd_register_observer(mdc_obd, obd);
+       if (rc) {
+               obd_disconnect(mdc_exp);
+               CERROR("target %s register_observer error %d\n",
+                      tgt->ltd_uuid.uuid, rc);
+               RETURN(rc);
+       }
+
+       if (obd->obd_observer) {
+               /*
+                * Tell the observer about the new target.
+                */
+               rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
+                               OBD_NOTIFY_ACTIVE,
+                               (void *)(tgt - lmv->tgts[0]));
+               if (rc) {
+                       obd_disconnect(mdc_exp);
+                       RETURN(rc);
+               }
+       }
+
+       tgt->ltd_active = 1;
+       tgt->ltd_exp = mdc_exp;
+       lmv->desc.ld_active_tgt_count++;
+
+       md_init_ea_size(tgt->ltd_exp, lmv->max_easize,
+                       lmv->max_def_easize, lmv->max_cookiesize);
+
+       CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
+               mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
+               atomic_read(&obd->obd_refcount));
+
+       lmv_proc_dir = obd->obd_proc_private;
+       if (lmv_proc_dir) {
+               struct proc_dir_entry *mdc_symlink;
+
+               LASSERT(mdc_obd->obd_type != NULL);
+               LASSERT(mdc_obd->obd_type->typ_name != NULL);
+               mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name,
+                                                 lmv_proc_dir,
+                                                 "../../../%s/%s",
+                                                 mdc_obd->obd_type->typ_name,
+                                                 mdc_obd->obd_name);
+               if (mdc_symlink == NULL) {
+                       CERROR("Could not register LMV target "
+                              "/proc/fs/lustre/%s/%s/target_obds/%s.",
+                              obd->obd_type->typ_name, obd->obd_name,
+                              mdc_obd->obd_name);
+                       lprocfs_remove(&lmv_proc_dir);
+                       obd->obd_proc_private = NULL;
+               }
+       }
+       RETURN(0);
+}
+
+static void lmv_del_target(struct lmv_obd *lmv, int index)
+{
+       if (lmv->tgts[index] == NULL)
+               return;
+
+       OBD_FREE_PTR(lmv->tgts[index]);
+       lmv->tgts[index] = NULL;
+}
+
+static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                          __u32 index, int gen)
+{
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt;
+       int               rc = 0;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
+
+       lmv_init_lock(lmv);
+
+       if (lmv->desc.ld_tgt_count == 0) {
+               struct obd_device *mdc_obd;
+
+               mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
+                                               &obd->obd_uuid);
+               if (!mdc_obd) {
+                       lmv_init_unlock(lmv);
+                       CERROR("%s: Target %s not attached: rc = %d\n",
+                              obd->obd_name, uuidp->uuid, -EINVAL);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) {
+               tgt = lmv->tgts[index];
+               CERROR("%s: UUID %s already assigned at LMV target index %d:"
+                      " rc = %d\n", obd->obd_name,
+                      obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
+               lmv_init_unlock(lmv);
+               RETURN(-EEXIST);
+       }
+
+       if (index >= lmv->tgts_size) {
+               /* We need to reallocate the lmv target array. */
+               struct lmv_tgt_desc **newtgts, **old = NULL;
+               __u32 newsize = 1;
+               __u32 oldsize = 0;
+
+               while (newsize < index + 1)
+                       newsize = newsize << 1;
+               OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+               if (newtgts == NULL) {
+                       lmv_init_unlock(lmv);
+                       RETURN(-ENOMEM);
+               }
+
+               if (lmv->tgts_size) {
+                       memcpy(newtgts, lmv->tgts,
+                              sizeof(*newtgts) * lmv->tgts_size);
+                       old = lmv->tgts;
+                       oldsize = lmv->tgts_size;
+               }
+
+               lmv->tgts = newtgts;
+               lmv->tgts_size = newsize;
+               smp_rmb();
+               if (old)
+                       OBD_FREE(old, sizeof(*old) * oldsize);
+
+               CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
+                      lmv->tgts_size);
+       }
+
+       OBD_ALLOC_PTR(tgt);
+       if (!tgt) {
+               lmv_init_unlock(lmv);
+               RETURN(-ENOMEM);
+       }
+
+       mutex_init(&tgt->ltd_fid_mutex);
+       tgt->ltd_idx = index;
+       tgt->ltd_uuid = *uuidp;
+       tgt->ltd_active = 0;
+       lmv->tgts[index] = tgt;
+       if (index >= lmv->desc.ld_tgt_count)
+               lmv->desc.ld_tgt_count = index + 1;
+
+       if (lmv->connected) {
+               rc = lmv_connect_mdc(obd, tgt);
+               if (rc) {
+                       spin_lock(&lmv->lmv_lock);
+                       lmv->desc.ld_tgt_count--;
+                       memset(tgt, 0, sizeof(*tgt));
+                       spin_unlock(&lmv->lmv_lock);
+               } else {
+                       int easize = sizeof(struct lmv_stripe_md) +
+                                    lmv->desc.ld_tgt_count *
+                                    sizeof(struct lu_fid);
+                       lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+               }
+       }
+
+       lmv_init_unlock(lmv);
+       RETURN(rc);
+}
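+
+/*
+ * Note (illustrative): the tgts[] array above grows to the next power of
+ * two >= index + 1, so repeatedly adding targets 0..9 reallocates it at
+ * sizes 1, 2, 4, 8 and 16 rather than once per target.
+ */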
+
+int lmv_check_connect(struct obd_device *obd)
+{
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc  *tgt;
+       int                i;
+       int                rc;
+       int                easize;
+       ENTRY;
+
+       if (lmv->connected)
+               RETURN(0);
+
+       lmv_init_lock(lmv);
+       if (lmv->connected) {
+               lmv_init_unlock(lmv);
+               RETURN(0);
+       }
+
+       if (lmv->desc.ld_tgt_count == 0) {
+               lmv_init_unlock(lmv);
+               CERROR("%s: no targets configured.\n", obd->obd_name);
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
+              lmv->cluuid.uuid, obd->obd_name);
+
+       LASSERT(lmv->tgts != NULL);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (tgt == NULL)
+                       continue;
+               rc = lmv_connect_mdc(obd, tgt);
+               if (rc)
+                       GOTO(out_disc, rc);
+       }
+
+       lmv_set_timeouts(obd);
+       class_export_put(lmv->exp);
+       lmv->connected = 1;
+       easize = lmv_get_easize(lmv);
+       lmv_init_ea_size(obd->obd_self_export, easize, 0, 0);
+       lmv_init_unlock(lmv);
+       RETURN(0);
+
+ out_disc:
+       while (i-- > 0) {
+               int rc2;
+               tgt = lmv->tgts[i];
+               if (tgt == NULL)
+                       continue;
+               tgt->ltd_active = 0;
+               if (tgt->ltd_exp) {
+                       --lmv->desc.ld_active_tgt_count;
+                       rc2 = obd_disconnect(tgt->ltd_exp);
+                       if (rc2) {
+                               CERROR("LMV target %s disconnect on "
+                                      "MDC idx %d: error %d\n",
+                                      tgt->ltd_uuid.uuid, i, rc2);
+                       }
+               }
+       }
+       class_disconnect(lmv->exp);
+       lmv_init_unlock(lmv);
+       RETURN(rc);
+}
+
+static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
+{
+       struct proc_dir_entry  *lmv_proc_dir;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct obd_device      *mdc_obd;
+       int                  rc;
+       ENTRY;
+
+       LASSERT(tgt != NULL);
+       LASSERT(obd != NULL);
+
+       mdc_obd = class_exp2obd(tgt->ltd_exp);
+
+       if (mdc_obd) {
+               mdc_obd->obd_force = obd->obd_force;
+               mdc_obd->obd_fail = obd->obd_fail;
+               mdc_obd->obd_no_recov = obd->obd_no_recov;
+       }
+
+       lmv_proc_dir = obd->obd_proc_private;
+       if (lmv_proc_dir)
+               lprocfs_remove_proc_entry(mdc_obd->obd_name, lmv_proc_dir);
+
+       rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
+       if (rc)
+               CERROR("Can't finalize fids factory\n");
+
+       CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
+              tgt->ltd_exp->exp_obd->obd_name,
+              tgt->ltd_exp->exp_obd->obd_uuid.uuid);
+
+       obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
+       rc = obd_disconnect(tgt->ltd_exp);
+       if (rc) {
+               if (tgt->ltd_active) {
+                       CERROR("Target %s disconnect error %d\n",
+                              tgt->ltd_uuid.uuid, rc);
+               }
+       }
+
+       lmv_activate_target(lmv, tgt, 0);
+       tgt->ltd_exp = NULL;
+       RETURN(0);
+}
+
+static int lmv_disconnect(struct obd_export *exp)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       int                 rc;
+       int                 i;
+       ENTRY;
+
+       if (!lmv->tgts)
+               goto out_local;
+
+       /*
+        * Only disconnect the underlying layers on the final disconnect.
+        */
+       lmv->refcount--;
+       if (lmv->refcount != 0)
+               goto out_local;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+
+               lmv_disconnect_mdc(obd, lmv->tgts[i]);
+       }
+
+       if (obd->obd_proc_private)
+               lprocfs_remove((proc_dir_entry_t **)&obd->obd_proc_private);
+       else
+               CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n",
+                      obd->obd_type->typ_name, obd->obd_name);
+
+out_local:
+       /*
+        * This is the case when no real connection is established by
+        * lmv_check_connect().
+        */
+       if (!lmv->connected)
+               class_export_put(exp);
+       rc = class_disconnect(exp);
+       if (lmv->refcount == 0)
+               lmv->connected = 0;
+       RETURN(rc);
+}
+
+static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg)
+{
+       struct obd_device       *obddev = class_exp2obd(exp);
+       struct lmv_obd          *lmv = &obddev->u.lmv;
+       struct getinfo_fid2path *gf;
+       struct lmv_tgt_desc     *tgt;
+       struct getinfo_fid2path *remote_gf = NULL;
+       int                     remote_gf_size = 0;
+       int                     rc;
+
+       gf = (struct getinfo_fid2path *)karg;
+       tgt = lmv_find_target(lmv, &gf->gf_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+repeat_fid2path:
+       rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
+       if (rc != 0 && rc != -EREMOTE)
+               GOTO(out_fid2path, rc);
+
+       /* If remote_gf != NULL, we have just built the path on the
+        * remote MDT; copy this path segment to gf */
+       if (remote_gf != NULL) {
+               struct getinfo_fid2path *ori_gf;
+               char *ptr;
+
+               ori_gf = (struct getinfo_fid2path *)karg;
+               if (strlen(ori_gf->gf_path) +
+                   strlen(gf->gf_path) > ori_gf->gf_pathlen)
+                       GOTO(out_fid2path, rc = -EOVERFLOW);
+
+               ptr = ori_gf->gf_path;
+
+               memmove(ptr + strlen(gf->gf_path) + 1, ptr,
+                       strlen(ori_gf->gf_path));
+
+               strncpy(ptr, gf->gf_path, strlen(gf->gf_path));
+               ptr += strlen(gf->gf_path);
+               *ptr = '/';
+       }
+
+       CDEBUG(D_INFO, "%s: get path %s "DFID" rec: "LPU64" ln: %u\n",
+              tgt->ltd_exp->exp_obd->obd_name,
+              gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
+              gf->gf_linkno);
+
+       if (rc == 0)
+               GOTO(out_fid2path, rc);
+
+       /* sigh, we have to go to another MDT to continue building the path */
+       if (remote_gf == NULL) {
+               remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
+               OBD_ALLOC(remote_gf, remote_gf_size);
+               if (remote_gf == NULL)
+                       GOTO(out_fid2path, rc = -ENOMEM);
+               remote_gf->gf_pathlen = PATH_MAX;
+       }
+
+       if (!fid_is_sane(&gf->gf_fid)) {
+               CERROR("%s: invalid FID "DFID": rc = %d\n",
+                      tgt->ltd_exp->exp_obd->obd_name,
+                      PFID(&gf->gf_fid), -EINVAL);
+               GOTO(out_fid2path, rc = -EINVAL);
+       }
+
+       tgt = lmv_find_target(lmv, &gf->gf_fid);
+       if (IS_ERR(tgt))
+               GOTO(out_fid2path, rc = -EINVAL);
+
+       remote_gf->gf_fid = gf->gf_fid;
+       remote_gf->gf_recno = -1;
+       remote_gf->gf_linkno = -1;
+       memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
+       gf = remote_gf;
+       goto repeat_fid2path;
+
+out_fid2path:
+       if (remote_gf != NULL)
+               OBD_FREE(remote_gf, remote_gf_size);
+       RETURN(rc);
+}
+
+static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
+                        int len, void *karg, void *uarg)
+{
+       struct obd_device    *obddev = class_exp2obd(exp);
+       struct lmv_obd       *lmv = &obddev->u.lmv;
+       int                i = 0;
+       int                rc = 0;
+       int                set = 0;
+       int                count = lmv->desc.ld_tgt_count;
+       ENTRY;
+
+       if (count == 0)
+               RETURN(-ENOTTY);
+
+       switch (cmd) {
+       case IOC_OBD_STATFS: {
+               struct obd_ioctl_data *data = karg;
+               struct obd_device *mdc_obd;
+               struct obd_statfs stat_buf = {0};
+               __u32 index;
+
+               memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+               if (index >= count)
+                       RETURN(-ENODEV);
+
+               if (lmv->tgts[index] == NULL ||
+                   lmv->tgts[index]->ltd_active == 0)
+                       RETURN(-ENODATA);
+
+               mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp);
+               if (!mdc_obd)
+                       RETURN(-EINVAL);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       RETURN(-EFAULT);
+
+               rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+               if (rc)
+                       RETURN(rc);
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       RETURN(-EFAULT);
+               break;
+       }
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct lmv_tgt_desc *tgt = NULL;
+               struct obd_quotactl *oqctl;
+
+               if (qctl->qc_valid == QC_MDTIDX) {
+                       if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+                               RETURN(-EINVAL);
+
+                       tgt = lmv->tgts[qctl->qc_idx];
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               RETURN(-EINVAL);
+               } else if (qctl->qc_valid == QC_UUID) {
+                       for (i = 0; i < count; i++) {
+                               tgt = lmv->tgts[i];
+                               if (tgt == NULL)
+                                       continue;
+                               if (!obd_uuid_equals(&tgt->ltd_uuid,
+                                                    &qctl->obd_uuid))
+                                       continue;
+
+                               if (tgt->ltd_exp == NULL)
+                                       RETURN(-EINVAL);
+
+                               break;
+                       }
+               } else {
+                       RETURN(-EINVAL);
+               }
+
+               if (i >= count)
+                       RETURN(-EAGAIN);
+
+               LASSERT(tgt && tgt->ltd_exp);
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_MDTIDX;
+                       qctl->obd_uuid = tgt->ltd_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       case OBD_IOC_CHANGELOG_SEND:
+       case OBD_IOC_CHANGELOG_CLEAR: {
+               struct ioc_changelog *icc = karg;
+
+               if (icc->icc_mdtindex >= count)
+                       RETURN(-ENODEV);
+
+               if (lmv->tgts[icc->icc_mdtindex] == NULL ||
+                   lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL ||
+                   lmv->tgts[icc->icc_mdtindex]->ltd_active == 0)
+                       RETURN(-ENODEV);
+               rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp,
+                                  sizeof(*icc), icc, NULL);
+               break;
+       }
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               if (lmv->tgts[0] == NULL)
+                       RETURN(-ENODATA);
+               rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
+               break;
+       }
+       case OBD_IOC_FID2PATH: {
+               rc = lmv_fid2path(exp, len, karg, uarg);
+               break;
+       }
+       case LL_IOC_HSM_STATE_GET:
+       case LL_IOC_HSM_STATE_SET:
+       case LL_IOC_HSM_ACTION:
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               struct md_op_data       *op_data = karg;
+               struct lmv_tgt_desc     *tgt1, *tgt2;
+
+               tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+               if (IS_ERR(tgt1))
+                       RETURN(PTR_ERR(tgt1));
+
+               tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+               if (IS_ERR(tgt2))
+                       RETURN(PTR_ERR(tgt2));
+
+               if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
+                       RETURN(-EINVAL);
+
+               /* only files on same MDT can have their layouts swapped */
+               if (tgt1->ltd_idx != tgt2->ltd_idx)
+                       RETURN(-EPERM);
+
+               rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
+               break;
+       }
+       default:
+               for (i = 0; i < count; i++) {
+                       struct obd_device *mdc_obd;
+                       int err;
+
+                       if (lmv->tgts[i] == NULL ||
+                           lmv->tgts[i]->ltd_exp == NULL)
+                               continue;
+                       /* ll_umount_begin() sets the force flag on lmv,
+                        * not on mdc; pass it through. */
+                       mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp);
+                       mdc_obd->obd_force = obddev->obd_force;
+                       err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len,
+                                           karg, uarg);
+                       if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+                               RETURN(err);
+                       } else if (err) {
+                               if (lmv->tgts[i]->ltd_active) {
+                                       CERROR("error: iocontrol MDC %s on MDT"
+                                              "idx %d cmd %x: err = %d\n",
+                                               lmv->tgts[i]->ltd_uuid.uuid,
+                                               i, cmd, err);
+                                       if (!rc)
+                                               rc = err;
+                               }
+                       } else
+                               set = 1;
+               }
+               if (!set && !rc)
+                       rc = -EIO;
+       }
+       RETURN(rc);
+}
+
+#if 0
+static int lmv_all_chars_policy(int count, const char *name,
+                               int len)
+{
+       unsigned int c = 0;
+
+       while (len > 0)
+               c += name[--len];
+       c = c % count;
+       return c;
+}
+
+static int lmv_nid_policy(struct lmv_obd *lmv)
+{
+       struct obd_import *imp;
+       __u32         id;
+
+       /*
+        * XXX: To get nid we assume that underlying obd device is mdc.
+        */
+       imp = class_exp2cliimp(lmv->tgts[0].ltd_exp);
+       id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32);
+       return id % lmv->desc.ld_tgt_count;
+}
+
+static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+                         placement_policy_t placement)
+{
+       switch (placement) {
+       case PLACEMENT_CHAR_POLICY:
+               return lmv_all_chars_policy(lmv->desc.ld_tgt_count,
+                                           op_data->op_name,
+                                           op_data->op_namelen);
+       case PLACEMENT_NID_POLICY:
+               return lmv_nid_policy(lmv);
+
+       default:
+               break;
+       }
+
+       CERROR("Unsupported placement policy %x\n", placement);
+       return -EINVAL;
+}
+#endif
+
+/**
+ * This is the _inode_ placement policy function (not name-based).
+ */
+static int lmv_placement_policy(struct obd_device *obd,
+                               struct md_op_data *op_data,
+                               mdsno_t *mds)
+{
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ENTRY;
+
+       LASSERT(mds != NULL);
+
+       if (lmv->desc.ld_tgt_count == 1) {
+               *mds = 0;
+               RETURN(0);
+       }
+
+       /**
+        * If a stripe_offset is provided during setdirstripe
+        * (setdirstripe -i xx), MDS xx will be chosen.
+        */
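+       /*
+        * For example, a request like "lfs setdirstripe -i 2 <dir>"
+        * arrives here with lum_stripe_offset == 2, so the new
+        * directory is placed on the MDT with index 2.
+        */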
+       if (op_data->op_cli_flags & CLI_SET_MEA) {
+               struct lmv_user_md *lum;
+
+               lum = (struct lmv_user_md *)op_data->op_data;
+               if (lum->lum_type == LMV_STRIPE_TYPE &&
+                   lum->lum_stripe_offset != -1) {
+                       if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
+                               CERROR("%s: Stripe_offset %d > MDT count %d:"
+                                      " rc = %d\n", obd->obd_name,
+                                      lum->lum_stripe_offset,
+                                      lmv->desc.ld_tgt_count, -ERANGE);
+                               RETURN(-ERANGE);
+                       }
+                       *mds = lum->lum_stripe_offset;
+                       RETURN(0);
+               }
+       }
+
+       /* Allocate the new fid on a target according to the operation type
+        * and the parent's home MDS. */
+       *mds = op_data->op_mds;
+       RETURN(0);
+}
+
+int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
+                   mdsno_t mds)
+{
+       struct lmv_tgt_desc     *tgt;
+       int                      rc;
+       ENTRY;
+
+       tgt = lmv_get_target(lmv, mds);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /*
+        * New seq alloc and FLD setup should be atomic. Otherwise we may find
+        * on server that seq in new allocated fid is not yet known.
+        */
+       mutex_lock(&tgt->ltd_fid_mutex);
+
+       if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       /*
+        * Ask the underlying tgt layer to allocate a new fid.
+        */
+       rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
+       if (rc > 0) {
+               LASSERT(fid_is_sane(fid));
+               rc = 0;
+       }
+
+       EXIT;
+out:
+       mutex_unlock(&tgt->ltd_fid_mutex);
+       return rc;
+}
+
+int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       mdsno_t         mds = 0;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+       LASSERT(fid != NULL);
+
+       rc = lmv_placement_policy(obd, op_data, &mds);
+       if (rc) {
+               CERROR("Can't get target for allocating fid, "
+                      "rc %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = __lmv_fid_alloc(lmv, fid, mds);
+       if (rc) {
+               CERROR("Can't alloc new fid, rc %d\n", rc);
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lprocfs_static_vars  lvars;
+       struct lmv_desc     *desc;
+       int                      rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("LMV setup requires a descriptor\n");
+               RETURN(-EINVAL);
+       }
+
+       desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
+       if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("Lmv descriptor size wrong: %d > %d\n",
+                      (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
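+       /*
+        * Start with room for 32 targets; lmv_add_target() is expected
+        * to grow this array if more MDTs are configured later.
+        */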
+       OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32);
+       if (lmv->tgts == NULL)
+               RETURN(-ENOMEM);
+       lmv->tgts_size = 32;
+
+       obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
+       lmv->desc.ld_tgt_count = 0;
+       lmv->desc.ld_active_tgt_count = 0;
+       lmv->max_cookiesize = 0;
+       lmv->max_def_easize = 0;
+       lmv->max_easize = 0;
+       lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
+
+       spin_lock_init(&lmv->lmv_lock);
+       mutex_init(&lmv->init_mutex);
+
+       lprocfs_lmv_init_vars(&lvars);
+
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+       {
+               rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+                                       0444, &lmv_proc_target_fops, obd);
+               if (rc)
+                       CWARN("%s: error adding LMV target_obd file: rc = %d\n",
+                              obd->obd_name, rc);
+       }
+#endif
+       rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
+                            LUSTRE_CLI_FLD_HASH_DHT);
+       if (rc) {
+               CERROR("Can't init FLD, err %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       RETURN(0);
+
+out:
+       return rc;
+}
+
+static int lmv_cleanup(struct obd_device *obd)
+{
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       ENTRY;
+
+       fld_client_fini(&lmv->lmv_fld);
+       if (lmv->tgts != NULL) {
+               int i;
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       if (lmv->tgts[i] == NULL)
+                               continue;
+                       lmv_del_target(lmv, i);
+               }
+               OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size);
+               lmv->tgts_size = 0;
+       }
+       RETURN(0);
+}
+
+static int lmv_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg       *lcfg = buf;
+       struct obd_uuid         obd_uuid;
+       int                     gen;
+       __u32                   index;
+       int                     rc;
+       ENTRY;
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_ADD_MDC:
+               /* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
+                * 2:0  3:1  4:lustre-MDT0000-mdc_UUID */
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+                       GOTO(out, rc = -EINVAL);
+
+               obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+               if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1)
+                       GOTO(out, rc = -EINVAL);
+               if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+                       GOTO(out, rc = -EINVAL);
+               rc = lmv_add_target(obd, &obd_uuid, index, gen);
+               GOTO(out, rc);
+       default:
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+       }
+out:
+       RETURN(rc);
+}
+
+static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct obd_statfs     *temp;
+       int                 rc = 0;
+       int                 i;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       OBD_ALLOC(temp, sizeof(*temp));
+       if (temp == NULL)
+               RETURN(-ENOMEM);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+
+               rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
+                               max_age, flags);
+               if (rc) {
+                       CERROR("can't stat MDS #%d (%s), error %d\n", i,
+                              lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
+                              rc);
+                       GOTO(out_free_temp, rc);
+               }
+
+               if (i == 0) {
+                       *osfs = *temp;
+                       /* If the statfs is from mount, it only needs to
+                        * retrieve the necessary information from MDT0,
+                        * i.e. mount does not need the merged osfs from
+                        * all of the MDTs. This also allows clients to
+                        * mount as long as MDT0 is in service. */
+                       if (flags & OBD_STATFS_FOR_MDT0)
+                               GOTO(out_free_temp, rc);
+               } else {
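+                       /* Sum only the capacity and file counters; the
+                        * remaining fields keep MDT0's values copied
+                        * above. */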
+                       osfs->os_bavail += temp->os_bavail;
+                       osfs->os_blocks += temp->os_blocks;
+                       osfs->os_ffree += temp->os_ffree;
+                       osfs->os_files += temp->os_files;
+               }
+       }
+
+       EXIT;
+out_free_temp:
+       OBD_FREE(temp, sizeof(*temp));
+       return rc;
+}
+
+static int lmv_getstatus(struct obd_export *exp,
+                        struct lu_fid *fid,
+                        struct obd_capa **pc)
+{
+       struct obd_device    *obd = exp->exp_obd;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       int                rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc);
+       RETURN(rc);
+}
+
+static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, obd_valid valid, const char *name,
+                       const char *input, int input_size, int output_size,
+                       int flags, struct ptlrpc_request **request)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+                        input_size, output_size, flags, request);
+
+       RETURN(rc);
+}
+
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, obd_valid valid, const char *name,
+                       const char *input, int input_size, int output_size,
+                       int flags, __u32 suppgid,
+                       struct ptlrpc_request **request)
+{
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input,
+                        input_size, output_size, flags, suppgid,
+                        request);
+
+       RETURN(rc);
+}
+
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+                      struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (op_data->op_flags & MF_GET_MDT_IDX) {
+               op_data->op_mds = tgt->ltd_idx;
+               RETURN(0);
+       }
+
+       rc = md_getattr(tgt->ltd_exp, op_data, request);
+
+       RETURN(rc);
+}
+
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+       /*
+        * With DNE every object can have two locks in different namespaces:
+        * lookup lock in space of MDT storing direntry and update/open lock in
+        * space of MDT storing inode.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+               md_null_inode(lmv->tgts[i]->ltd_exp, fid);
+       }
+
+       RETURN(0);
+}
+
+static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+                          ldlm_iterator_t it, void *data)
+{
+       struct obd_device   *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       int               i;
+       int               rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+       /*
+        * With DNE every object can have two locks in different namespaces:
+        * lookup lock in space of MDT storing direntry and update/open lock in
+        * space of MDT storing inode.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL)
+                       continue;
+               rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+                    struct md_open_data *mod, struct ptlrpc_request **request)
+{
+       struct obd_device     *obd = exp->exp_obd;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc   *tgt;
+       int                 rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+       rc = md_close(tgt->ltd_exp, op_data, mod, request);
+       RETURN(rc);
+}
+
+struct lmv_tgt_desc *
+lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
+              struct lu_fid *fid)
+{
+       struct lmv_tgt_desc *tgt;
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               return tgt;
+
+       op_data->op_mds = tgt->ltd_idx;
+
+       return tgt;
+}
+
+int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid,
+              __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
+              struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       if (!lmv->desc.ld_active_tgt_count)
+               RETURN(-EIO);
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              op_data->op_mds);
+
+       op_data->op_flags |= MF_MDC_CANCEL_FID1;
+       rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
+                      cap_effective, rdev, request);
+
+       if (rc == 0) {
+               if (*request == NULL)
+                       RETURN(rc);
+               CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
+       }
+       RETURN(rc);
+}
+
+static int lmv_done_writing(struct obd_export *exp,
+                           struct md_op_data *op_data,
+                           struct md_open_data *mod)
+{
+       struct obd_device     *obd = exp->exp_obd;
+       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc   *tgt;
+       int                 rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_done_writing(tgt->ltd_exp, op_data, mod);
+       RETURN(rc);
+}
+
+static int
+lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+                  struct lookup_intent *it, struct md_op_data *op_data,
+                  struct lustre_handle *lockh, void *lmm, int lmmsize,
+                  int extra_lock_flags)
+{
+       struct ptlrpc_request      *req = it->d.lustre.it_data;
+       struct obd_device         *obd = exp->exp_obd;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       struct lustre_handle    plock;
+       struct lmv_tgt_desc     *tgt;
+       struct md_op_data         *rdata;
+       struct lu_fid          fid1;
+       struct mdt_body     *body;
+       int                      rc = 0;
+       int                      pmode;
+       ENTRY;
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       if (!(body->valid & OBD_MD_MDS))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
+
+       /*
+        * We got LOOKUP lock, but we really need attrs.
+        */
+       pmode = it->d.lustre.it_lock_mode;
+       LASSERT(pmode != 0);
+       memcpy(&plock, lockh, sizeof(plock));
+       it->d.lustre.it_lock_mode = 0;
+       it->d.lustre.it_data = NULL;
+       fid1 = body->fid1;
+
+       it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+       ptlrpc_req_finished(req);
+
+       tgt = lmv_find_target(lmv, &fid1);
+       if (IS_ERR(tgt))
+               GOTO(out, rc = PTR_ERR(tgt));
+
+       OBD_ALLOC_PTR(rdata);
+       if (rdata == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rdata->op_fid1 = fid1;
+       rdata->op_bias = MDS_CROSS_REF;
+
+       rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
+                       lmm, lmmsize, NULL, extra_lock_flags);
+       OBD_FREE_PTR(rdata);
+       EXIT;
+out:
+       ldlm_lock_decref(&plock, pmode);
+       return rc;
+}
+
+static int
+lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+           struct lookup_intent *it, struct md_op_data *op_data,
+           struct lustre_handle *lockh, void *lmm, int lmmsize,
+           struct ptlrpc_request **req, __u64 extra_lock_flags)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd     *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc      *tgt;
+       int                    rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1));
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
+              LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
+
+       rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
+                       lmm, lmmsize, req, extra_lock_flags);
+
+       if (rc == 0 && it && it->it_op == IT_OPEN) {
+               rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
+                                       lmm, lmmsize, extra_lock_flags);
+       }
+       RETURN(rc);
+}
+
+static int
+lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+                struct ptlrpc_request **request)
+{
+       struct ptlrpc_request   *req = NULL;
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       struct mdt_body  *body;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              tgt->ltd_idx);
+
+       rc = md_getattr_name(tgt->ltd_exp, op_data, request);
+       if (rc != 0)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*request)->rq_pill,
+                                     &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       if (body->valid & OBD_MD_MDS) {
+               struct lu_fid rid = body->fid1;
+               CDEBUG(D_INODE, "Request attrs for "DFID"\n",
+                      PFID(&rid));
+
+               tgt = lmv_find_target(lmv, &rid);
+               if (IS_ERR(tgt)) {
+                       ptlrpc_req_finished(*request);
+                       RETURN(PTR_ERR(tgt));
+               }
+
+               op_data->op_fid1 = rid;
+               op_data->op_valid |= OBD_MD_FLCROSSREF;
+               op_data->op_namelen = 0;
+               op_data->op_name = NULL;
+               rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
+               ptlrpc_req_finished(*request);
+               *request = req;
+       }
+
+       RETURN(rc);
+}
+
+#define md_op_data_fid(op_data, fl)                 \
+       (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \
+        fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \
+        fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \
+        fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \
+        NULL)
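+
+/* For example, md_op_data_fid(op_data, MF_MDC_CANCEL_FID3) yields
+ * &op_data->op_fid3, while an unrecognized flag yields NULL. */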
+
+static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
+                           int op_tgt, ldlm_mode_t mode, int bits, int flag)
+{
+       struct lu_fid     *fid = md_op_data_fid(op_data, flag);
+       struct obd_device      *obd = exp->exp_obd;
+       struct lmv_obd   *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc    *tgt;
+       ldlm_policy_data_t      policy = {{0}};
+       int                  rc = 0;
+       ENTRY;
+
+       if (!fid_is_sane(fid))
+               RETURN(0);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       if (tgt->ltd_idx != op_tgt) {
+               CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
+               policy.l_inodebits.bits = bits;
+               rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
+                                     mode, LCF_ASYNC, NULL);
+       } else {
+               CDEBUG(D_INODE,
+                      "EARLY_CANCEL skip operation target %d on "DFID"\n",
+                      op_tgt, PFID(fid));
+               op_data->op_flags |= flag;
+               rc = 0;
+       }
+
+       RETURN(rc);
+}
+
+/*
+ * llite passes the fid of the target inode in op_data->op_fid1 and the
+ * fid of the directory in op_data->op_fid2.
+ */
+static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
+                   struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(op_data->op_namelen != 0);
+
+       CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
+              PFID(&op_data->op_fid2), op_data->op_namelen,
+              op_data->op_name, PFID(&op_data->op_fid1));
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       /*
+        * Cancel UPDATE lock on child (fid1).
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID2;
+       rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+                             MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
+       if (rc != 0)
+               RETURN(rc);
+
+       rc = md_link(tgt->ltd_exp, op_data, request);
+
+       RETURN(rc);
+}
+
+static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
+                     const char *old, int oldlen, const char *new, int newlen,
+                     struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *src_tgt;
+       struct lmv_tgt_desc     *tgt_tgt;
+       int                     rc;
+       ENTRY;
+
+       LASSERT(oldlen != 0);
+
+       CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
+              oldlen, old, PFID(&op_data->op_fid1),
+              newlen, new, PFID(&op_data->op_fid2));
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+       src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(src_tgt))
+               RETURN(PTR_ERR(src_tgt));
+
+       tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       if (IS_ERR(tgt_tgt))
+               RETURN(PTR_ERR(tgt_tgt));
+       /*
+        * LOOKUP lock on src child (fid3) should also be cancelled for
+        * src_tgt in mdc_rename.
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+       /*
+        * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
+        * own target.
+        */
+       rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                             LCK_EX, MDS_INODELOCK_UPDATE,
+                             MF_MDC_CANCEL_FID2);
+
+       /*
+        * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
+        */
+       if (rc == 0) {
+               rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                                     LCK_EX, MDS_INODELOCK_LOOKUP,
+                                     MF_MDC_CANCEL_FID4);
+       }
+
+       /*
+        * Cancel all the locks on tgt child (fid4).
+        */
+       if (rc == 0)
+               rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                                     LCK_EX, MDS_INODELOCK_FULL,
+                                     MF_MDC_CANCEL_FID4);
+
+       if (rc == 0)
+               rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
+                              new, newlen, request);
+       RETURN(rc);
+}
+
+static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
+                      void *ea, int ealen, void *ea2, int ea2len,
+                      struct ptlrpc_request **request,
+                      struct md_open_data **mod)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc = 0;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
+              PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
+
+       op_data->op_flags |= MF_MDC_CANCEL_FID1;
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
+                       ea2len, request, mod);
+
+       RETURN(rc);
+}
+
+static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
+                   struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       struct obd_device        *obd = exp->exp_obd;
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc       *tgt;
+       int                     rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_sync(tgt->ltd_exp, fid, oc, request);
+       RETURN(rc);
+}
+
+/*
+ * Adjust a set of pages, each page containing an array of lu_dirpages,
+ * so that each page can be used as a single logical lu_dirpage.
+ *
+ * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
+ * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
+ * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
+ * value is used as a cookie to request the next lu_dirpage in a
+ * directory listing that spans multiple pages (two in this example):
+ *   ________
+ *  |        |
+ * .|--------v-------   -----.
+ * |s|e|f|p|ent|ent| ... |ent|
+ * '--|--------------   -----'   Each CFS_PAGE contains a single
+ *    '------.                   lu_dirpage.
+ * .---------v-------   -----.
+ * |s|e|f|p|ent| 0 | ... | 0 |
+ * '-----------------   -----'
+ *
+ * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is
+ * larger than LU_PAGE_SIZE, a single host page may contain multiple
+ * lu_dirpages. After reading the lu_dirpages from the MDS, the
+ * ldp_hash_end of the first lu_dirpage refers to the one immediately
+ * after it in the same CFS_PAGE (arrows simplified for brevity, but
+ * in general e0==s1, e1==s2, etc.):
+ *
+ * .--------------------   -----.
+ * |s0|e0|f0|p|ent|ent| ... |ent|
+ * |---v----------------   -----|
+ * |s1|e1|f1|p|ent|ent| ... |ent|
+ * |---v----------------   -----|  Here, each CFS_PAGE contains
+ *             ...                 multiple lu_dirpages.
+ * |---v----------------   -----|
+ * |s'|e'|f'|p|ent|ent| ... |ent|
+ * '---|----------------   -----'
+ *     v
+ * .----------------------------.
+ * |          next CFS_PAGE     |
+ *
+ * This structure is transformed into a single logical lu_dirpage as follows:
+ *
+ * - Replace e0 with e' so the request for the next lu_dirpage gets the page
+ *   labeled 'next CFS_PAGE'.
+ *
+ * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
+ *   a hash collision with the next page exists.
+ *
+ * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
+ *   to the first entry of the next lu_dirpage.
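+ *
+ * For example, assuming LU_PAGE_SIZE is 4KB (LU_PAGE_SHIFT == 12), a
+ * host with 64KB VM pages packs up to 16 lu_dirpages into each
+ * CFS_PAGE, which the code below stitches into one logical lu_dirpage.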
+ */
+#if PAGE_CACHE_SIZE > LU_PAGE_SIZE
+static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
+{
+       int i;
+
+       for (i = 0; i < ncfspgs; i++) {
+               struct lu_dirpage       *dp = kmap(pages[i]);
+               struct lu_dirpage       *first = dp;
+               struct lu_dirent        *end_dirent = NULL;
+               struct lu_dirent        *ent;
+               __u64                   hash_end = dp->ldp_hash_end;
+               __u32                   flags = dp->ldp_flags;
+
+               for (; nlupgs > 1; nlupgs--) {
+                       ent = lu_dirent_start(dp);
+                       for (end_dirent = ent; ent != NULL;
+                            end_dirent = ent, ent = lu_dirent_next(ent));
+
+                       /* Advance dp to next lu_dirpage. */
+                       dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE);
+
+                       /* Check if we've reached the end of the CFS_PAGE. */
+                       if (!((unsigned long)dp & ~CFS_PAGE_MASK))
+                               break;
+
+                       /* Save the hash and flags of this lu_dirpage. */
+                       hash_end = dp->ldp_hash_end;
+                       flags = dp->ldp_flags;
+
+                       /* Check if lu_dirpage contains no entries. */
+                       if (!end_dirent)
+                               break;
+
+                       /* Enlarge the end entry's lde_reclen from 0 so it
+                        * spans to the first entry of the next lu_dirpage. */
+                       LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0);
+                       end_dirent->lde_reclen =
+                               cpu_to_le16((char *)(dp->ldp_entries) -
+                                           (char *)end_dirent);
+               }
+
+               first->ldp_hash_end = hash_end;
+               first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
+               first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE);
+
+               kunmap(pages[i]);
+       }
+}
+#else
+#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
+#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */
+
+static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
+                       struct page **pages, struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       __u64                   offset = op_data->op_offset;
+       int                     rc;
+       int                     ncfspgs; /* pages read in PAGE_CACHE_SIZE */
+       int                     nlupgs; /* pages read in LU_PAGE_SIZE */
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "READPAGE at "LPX64" from "DFID"\n",
+              offset, PFID(&op_data->op_fid1));
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
+       if (rc != 0)
+               RETURN(rc);
+
+       ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1)
+                >> PAGE_CACHE_SHIFT;
+       nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
+       LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
+       LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
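+       /*
+        * Illustration: a 32KB bulk transfer on a host with 4KB pages
+        * gives ncfspgs = 8 and nlupgs = 8; the same transfer with
+        * 64KB host pages gives ncfspgs = 1 and nlupgs = 8.
+        */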
+
+       CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
+              op_data->op_npages);
+
+       lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
+
+       RETURN(rc);
+}
+
+static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
+                     struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt = NULL;
+       struct mdt_body         *body;
+       int                  rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+retry:
+       /* Send unlink requests to the MDT where the child is located */
+       if (likely(!fid_is_zero(&op_data->op_fid2)))
+               tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
+       else
+               tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       op_data->op_fsuid = current_fsuid();
+       op_data->op_fsgid = current_fsgid();
+       op_data->op_cap = cfs_curproc_cap_pack();
+
+       /*
+        * If child's fid is given, cancel unused locks for it if it is from
+        * another export than parent.
+        *
+        * LOOKUP lock for child (fid3) should also be cancelled on parent
+        * tgt_tgt in mdc_unlink().
+        */
+       op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
+
+       /*
+        * Cancel FULL locks on child (fid3).
+        */
+       rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
+                             MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
+
+       if (rc != 0)
+               RETURN(rc);
+
+       CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
+              PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+
+       rc = md_unlink(tgt->ltd_exp, op_data, request);
+       if (rc != 0 && rc != -EREMOTE)
+               RETURN(rc);
+
+       body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       /* Not cross-ref case, just get out of here. */
+       if (likely(!(body->valid & OBD_MD_MDS)))
+               RETURN(0);
+
+       CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
+              exp->exp_obd->obd_name, PFID(&body->fid1));
+
+       /* This is a remote object; try the remote MDT. Note: it may
+        * retry more than once here. Consider the following case:
+        * /mnt/lustre is the root on MDT0, remote1 is on MDT1.
+        * 1. Initially client A does not know where remote1 is; it
+        *    sends the unlink RPC to MDT0, MDT0 returns -EREMOTE, and
+        *    A resends the unlink RPC to MDT1 (1st retry).
+        *
+        * 2. While that unlink RPC is in flight, client B does
+        *    mv /mnt/lustre/remote1 /mnt/lustre/remote2 and creates a
+        *    new remote1, but on MDT0.
+        *
+        * 3. MDT1 gets the unlink RPC (from A), takes a remote lock on
+        *    /mnt/lustre, looks up the fid of remote1, finds it is a
+        *    remote dir again, and replies -EREMOTE again.
+        *
+        * 4. A then resends the unlink RPC to MDT0 (2nd retry).
+        *
+        * In theory this could retry indefinitely, but it should be a
+        * very rare case. */
+       op_data->op_fid2 = body->fid1;
+       ptlrpc_req_finished(*request);
+       *request = NULL;
+
+       goto retry;
+}
+
+static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       struct lmv_obd *lmv = &obd->u.lmv;
+       int rc = 0;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               /* XXX: here should be calling obd_precleanup() down to
+                * stack. */
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               fld_client_proc_fini(&lmv->lmv_fld);
+               lprocfs_obd_cleanup(obd);
+               break;
+       default:
+               break;
+       }
+       RETURN(rc);
+}
+
+static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       struct obd_device       *obd;
+       struct lmv_obd    *lmv;
+       int                   rc = 0;
+       ENTRY;
+
+       obd = class_exp2obd(exp);
+       if (obd == NULL) {
+               CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       lmv = &obd->u.lmv;
+       if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
+               struct lmv_tgt_desc *tgt;
+               int i;
+
+               rc = lmv_check_connect(obd);
+               if (rc)
+                       RETURN(rc);
+
+               LASSERT(*vallen == sizeof(__u32));
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       tgt = lmv->tgts[i];
+                       /*
+                        * All tgts should be connected when this gets called.
+                        */
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               continue;
+
+                       if (!obd_get_info(env, tgt->ltd_exp, keylen, key,
+                                         vallen, val, NULL))
+                               RETURN(0);
+               }
+               RETURN(-EINVAL);
+       } else if (KEY_IS(KEY_MAX_EASIZE) || KEY_IS(KEY_CONN_DATA)) {
+               rc = lmv_check_connect(obd);
+               if (rc)
+                       RETURN(rc);
+
+               /*
+                * Forward this request to the first MDS; it should know
+                * the LOV desc.
+                */
+               rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
+                                 vallen, val, NULL);
+               if (!rc && KEY_IS(KEY_CONN_DATA))
+                       exp->exp_connect_data = *(struct obd_connect_data *)val;
+               RETURN(rc);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = lmv->desc.ld_tgt_count;
+               RETURN(0);
+       }
+
+       CDEBUG(D_IOCTL, "Invalid key\n");
+       RETURN(-EINVAL);
+}
+
+int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      obd_count keylen, void *key, obd_count vallen,
+                      void *val, struct ptlrpc_request_set *set)
+{
+       struct lmv_tgt_desc    *tgt;
+       struct obd_device      *obd;
+       struct lmv_obd   *lmv;
+       int rc = 0;
+       ENTRY;
+
+       obd = class_exp2obd(exp);
+       if (obd == NULL) {
+               CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+       lmv = &obd->u.lmv;
+
+       if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) {
+               int i, err = 0;
+
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       tgt = lmv->tgts[i];
+
+                       if (tgt == NULL || tgt->ltd_exp == NULL)
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                                keylen, key, vallen, val, set);
+                       if (err && rc == 0)
+                               rc = err;
+               }
+
+               RETURN(rc);
+       }
+
+       RETURN(-EINVAL);
+}
+
+int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+              struct lov_stripe_md *lsm)
+{
+       struct obd_device        *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_stripe_md      *meap;
+       struct lmv_stripe_md      *lsmp;
+       int                     mea_size;
+       int                     i;
+       ENTRY;
+
+       mea_size = lmv_get_easize(lmv);
+       if (!lmmp)
+               RETURN(mea_size);
+
+       if (*lmmp && !lsm) {
+               OBD_FREE_LARGE(*lmmp, mea_size);
+               *lmmp = NULL;
+               RETURN(0);
+       }
+
+       if (*lmmp == NULL) {
+               OBD_ALLOC_LARGE(*lmmp, mea_size);
+               if (*lmmp == NULL)
+                       RETURN(-ENOMEM);
+       }
+
+       if (!lsm)
+               RETURN(mea_size);
+
+       lsmp = (struct lmv_stripe_md *)lsm;
+       meap = (struct lmv_stripe_md *)*lmmp;
+
+       if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR &&
+           lsmp->mea_magic != MEA_MAGIC_ALL_CHARS)
+               RETURN(-EINVAL);
+
+       meap->mea_magic = cpu_to_le32(lsmp->mea_magic);
+       meap->mea_count = cpu_to_le32(lsmp->mea_count);
+       meap->mea_master = cpu_to_le32(lsmp->mea_master);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               meap->mea_ids[i] = lsmp->mea_ids[i];
+               fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]);
+       }
+
+       RETURN(mea_size);
+}
+
+int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_size)
+{
+       struct obd_device         *obd = class_exp2obd(exp);
+       struct lmv_stripe_md      **tmea = (struct lmv_stripe_md **)lsmp;
+       struct lmv_stripe_md       *mea = (struct lmv_stripe_md *)lmm;
+       struct lmv_obd       *lmv = &obd->u.lmv;
+       int                      mea_size;
+       int                      i;
+       __u32                  magic;
+       ENTRY;
+
+       mea_size = lmv_get_easize(lmv);
+       if (lsmp == NULL)
+               RETURN(mea_size);
+
+       if (*lsmp != NULL && lmm == NULL) {
+               OBD_FREE_LARGE(*tmea, mea_size);
+               *lsmp = NULL;
+               RETURN(0);
+       }
+
+       LASSERT(mea_size == lmm_size);
+
+       OBD_ALLOC_LARGE(*tmea, mea_size);
+       if (*tmea == NULL)
+               RETURN(-ENOMEM);
+
+       if (!lmm)
+               RETURN(mea_size);
+
+       if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
+           mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
+           mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) {
+               magic = le32_to_cpu(mea->mea_magic);
+       } else {
+               /*
+                * The old mea format is not handled here.
+                */
+               CERROR("Unsupported old EA format found\n");
+               LBUG();
+       }
+
+       (*tmea)->mea_magic = magic;
+       (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
+       (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
+
+       for (i = 0; i < (*tmea)->mea_count; i++) {
+               (*tmea)->mea_ids[i] = mea->mea_ids[i];
+               fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
+       }
+       RETURN(mea_size);
+}
+
+static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+                            ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                            ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       int                   rc = 0;
+       int                   err;
+       int                   i;
+       ENTRY;
+
+       LASSERT(fid != NULL);
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0)
+                       continue;
+
+               err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid,
+                                      policy, mode, flags, opaque);
+               if (!rc)
+                       rc = err;
+       }
+       RETURN(rc);
+}
+
+int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+                     __u64 *bits)
+{
+       struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
+       int                   rc;
+       ENTRY;
+
+       rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits);
+       RETURN(rc);
+}
+
+ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ldlm_mode_t           rc;
+       int                   i;
+       ENTRY;
+
+       CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
+
+       /*
+        * With CMD every object can have two locks in different namespaces:
+        * lookup lock in space of mds storing direntry and update/open lock in
+        * space of mds storing inode. Thus we check all targets, not only that
+        * one fid was created in.
+        */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               if (lmv->tgts[i] == NULL ||
+                   lmv->tgts[i]->ltd_exp == NULL ||
+                   lmv->tgts[i]->ltd_active == 0)
+                       continue;
+
+               rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid,
+                                  type, policy, mode, lockh);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       RETURN(0);
+}
+
+int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *md_exp,
+                     struct lustre_md *md)
+{
+       struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
+
+       return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md);
+}
+
+int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       ENTRY;
+
+       if (md->mea)
+               obd_free_memmd(exp, (void *)&md->mea);
+       RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md));
+}
+
+int lmv_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       tgt = lmv_find_target(lmv, &och->och_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req));
+}
+
+int lmv_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       ENTRY;
+
+       tgt = lmv_find_target(lmv, &och->och_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       RETURN(md_clear_open_replay_data(tgt->ltd_exp, och));
+}
+
+static int lmv_get_remote_perm(struct obd_export *exp,
+                              const struct lu_fid *fid,
+                              struct obd_capa *oc, __u32 suppgid,
+                              struct ptlrpc_request **request)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request);
+       RETURN(rc);
+}
+
+static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+                         renew_capa_cb_t cb)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_renew_capa(tgt->ltd_exp, oc, cb);
+       RETURN(rc);
+}
+
+int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+                   const struct req_msg_field *field, struct obd_capa **oc)
+{
+       struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+
+       return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc);
+}
+
+int lmv_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo)
+{
+       struct md_op_data       *op_data = &minfo->mi_data;
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt = NULL;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &op_data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo);
+       RETURN(rc);
+}
+
+int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd    *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                   rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, fid);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits);
+       RETURN(rc);
+}
+
+/**
+ * For lmv, we only need to send the request to the master MDT, which then
+ * coordinates with the other (slave) MDTs. The only exception is Q_GETOQUOTA,
+ * for which we fetch data directly from the slave MDTs and aggregate it.
+ */
+int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl)
+{
+       struct obd_device   *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt = lmv->tgts[0];
+       int               rc = 0, i;
+       __u64           curspace, curinodes;
+       ENTRY;
+
+       if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) {
+               CERROR("master lmv inactive\n");
+               RETURN(-EIO);
+       }
+
+       if (oqctl->qc_cmd != Q_GETOQUOTA) {
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               RETURN(rc);
+       }
+
+       curspace = curinodes = 0;
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               int err;
+               tgt = lmv->tgts[i];
+
+               if (tgt == NULL || tgt->ltd_exp == NULL ||
+                   tgt->ltd_active == 0) {
+                       CDEBUG(D_HA, "mdt %d is inactive.\n", i);
+                       continue;
+               }
+
+               err = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (err) {
+                       CERROR("getquota on mdt %d failed. %d\n", i, err);
+                       if (!rc)
+                               rc = err;
+               } else {
+                       curspace += oqctl->qc_dqblk.dqb_curspace;
+                       curinodes += oqctl->qc_dqblk.dqb_curinodes;
+               }
+       }
+       oqctl->qc_dqblk.dqb_curspace = curspace;
+       oqctl->qc_dqblk.dqb_curinodes = curinodes;
+
+       RETURN(rc);
+}
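+/*
+ * Example (illustrative only, not part of the original code): with three
+ * MDTs where tgts[0] is the master, Q_SETQUOTA is forwarded to the master
+ * only, while Q_GETOQUOTA is sent to every active target and the per-target
+ * dqb_curspace/dqb_curinodes values are summed, as implemented above.
+ */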
+
+int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl)
+{
+       struct obd_device   *obd = class_exp2obd(exp);
+       struct lmv_obd      *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt;
+       int               i, rc = 0;
+       ENTRY;
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               int err;
+               tgt = lmv->tgts[i];
+               if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) {
+                       CERROR("lmv idx %d inactive\n", i);
+                       RETURN(-EIO);
+               }
+
+               err = obd_quotacheck(tgt->ltd_exp, oqctl);
+               if (err && !rc)
+                       rc = err;
+       }
+
+       RETURN(rc);
+}
+
+struct obd_ops lmv_obd_ops = {
+       .o_owner                = THIS_MODULE,
+       .o_setup                = lmv_setup,
+       .o_cleanup              = lmv_cleanup,
+       .o_precleanup           = lmv_precleanup,
+       .o_process_config       = lmv_process_config,
+       .o_connect              = lmv_connect,
+       .o_disconnect           = lmv_disconnect,
+       .o_statfs               = lmv_statfs,
+       .o_get_info             = lmv_get_info,
+       .o_set_info_async       = lmv_set_info_async,
+       .o_packmd               = lmv_packmd,
+       .o_unpackmd             = lmv_unpackmd,
+       .o_notify               = lmv_notify,
+       .o_get_uuid             = lmv_get_uuid,
+       .o_iocontrol            = lmv_iocontrol,
+       .o_quotacheck           = lmv_quotacheck,
+       .o_quotactl             = lmv_quotactl,
+};
+
+struct md_ops lmv_md_ops = {
+       .m_getstatus            = lmv_getstatus,
+       .m_null_inode           = lmv_null_inode,
+       .m_find_cbdata          = lmv_find_cbdata,
+       .m_close                = lmv_close,
+       .m_create               = lmv_create,
+       .m_done_writing         = lmv_done_writing,
+       .m_enqueue              = lmv_enqueue,
+       .m_getattr              = lmv_getattr,
+       .m_getxattr             = lmv_getxattr,
+       .m_getattr_name         = lmv_getattr_name,
+       .m_intent_lock          = lmv_intent_lock,
+       .m_link                 = lmv_link,
+       .m_rename               = lmv_rename,
+       .m_setattr              = lmv_setattr,
+       .m_setxattr             = lmv_setxattr,
+       .m_sync                 = lmv_sync,
+       .m_readpage             = lmv_readpage,
+       .m_unlink               = lmv_unlink,
+       .m_init_ea_size         = lmv_init_ea_size,
+       .m_cancel_unused        = lmv_cancel_unused,
+       .m_set_lock_data        = lmv_set_lock_data,
+       .m_lock_match           = lmv_lock_match,
+       .m_get_lustre_md        = lmv_get_lustre_md,
+       .m_free_lustre_md       = lmv_free_lustre_md,
+       .m_set_open_replay_data = lmv_set_open_replay_data,
+       .m_clear_open_replay_data = lmv_clear_open_replay_data,
+       .m_renew_capa           = lmv_renew_capa,
+       .m_unpack_capa          = lmv_unpack_capa,
+       .m_get_remote_perm      = lmv_get_remote_perm,
+       .m_intent_getattr_async = lmv_intent_getattr_async,
+       .m_revalidate_lock      = lmv_revalidate_lock,
+};
+
+int __init lmv_init(void)
+{
+       struct lprocfs_static_vars lvars;
+       int                     rc;
+
+       lprocfs_lmv_init_vars(&lvars);
+
+       rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
+                                lvars.module_vars, LUSTRE_LMV_NAME, NULL);
+       return rc;
+}
+
+static void lmv_exit(void)
+{
+       class_unregister_type(LUSTRE_LMV_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+module_init(lmv_init);
+module_exit(lmv_exit);
diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c
new file mode 100644 (file)
index 0000000..d1c45b5
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/seq_file.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifndef LPROCFS
+static struct lprocfs_vars lprocfs_module_vars[] = { {0} };
+static struct lprocfs_vars lprocfs_obd_vars[] = { {0} };
+#else
+static int lmv_numobd_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device       *dev = (struct obd_device *)m->private;
+       struct lmv_desc  *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lmv.desc;
+       return seq_printf(m, "%u\n", desc->ld_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lmv_numobd);
+
+static const char *placement_name[] = {
+       [PLACEMENT_CHAR_POLICY]  = "CHAR",
+       [PLACEMENT_NID_POLICY]   = "NID",
+       [PLACEMENT_INVAL_POLICY] = "INVAL"
+};
+
+static placement_policy_t placement_name2policy(char *name, int len)
+{
+       int                  i;
+
+       for (i = 0; i < PLACEMENT_MAX_POLICY; i++) {
+               if (!strncmp(placement_name[i], name, len))
+                       return i;
+       }
+       return PLACEMENT_INVAL_POLICY;
+}
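+/*
+ * E.g. placement_name2policy("NID", 3) returns PLACEMENT_NID_POLICY, while
+ * an unrecognized name falls through to PLACEMENT_INVAL_POLICY.
+ */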
+
+static const char *placement_policy2name(placement_policy_t placement)
+{
+       LASSERT(placement < PLACEMENT_MAX_POLICY);
+       return placement_name[placement];
+}
+
+static int lmv_placement_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device       *dev = (struct obd_device *)m->private;
+       struct lmv_obd    *lmv;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+       return seq_printf(m, "%s\n", placement_policy2name(lmv->lmv_placement));
+}
+
+#define MAX_POLICY_STRING_SIZE 64
+
+static ssize_t lmv_placement_seq_write(struct file *file, const char *buffer,
+                                      size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       char               dummy[MAX_POLICY_STRING_SIZE + 1];
+       size_t             len = min_t(size_t, count, MAX_POLICY_STRING_SIZE);
+       placement_policy_t policy;
+       struct lmv_obd    *lmv;
+
+       /*
+        * Copy only what the user actually wrote; copying a fixed
+        * MAX_POLICY_STRING_SIZE bytes could read past the end of a short
+        * user buffer.
+        */
+       if (copy_from_user(dummy, buffer, len))
+               return -EFAULT;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+
+       if (len > 0 && dummy[len - 1] == '\n')
+               len--;
+       dummy[len] = '\0';
+
+       policy = placement_name2policy(dummy, len);
+       if (policy != PLACEMENT_INVAL_POLICY) {
+               spin_lock(&lmv->lmv_lock);
+               lmv->lmv_placement = policy;
+               spin_unlock(&lmv->lmv_lock);
+       } else {
+               CERROR("Invalid placement policy \"%s\"!\n", dummy);
+               return -EINVAL;
+       }
+       return count;
+}
+LPROC_SEQ_FOPS(lmv_placement);
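+/*
+ * Usage sketch (illustrative only): the "placement" file built from these
+ * ops can be driven from userspace. The path below is an assumption based
+ * on the usual lprocfs layout and the obd_vars table at the end of this
+ * file:
+ *
+ *     #include <fcntl.h>
+ *     #include <stdio.h>
+ *     #include <string.h>
+ *     #include <unistd.h>
+ *
+ *     int set_placement(const char *dev, const char *policy)
+ *     {
+ *             char path[256];
+ *             ssize_t rc;
+ *             int fd;
+ *
+ *             snprintf(path, sizeof(path),
+ *                      "/proc/fs/lustre/lmv/%s/placement", dev);
+ *             fd = open(path, O_WRONLY);
+ *             if (fd < 0)
+ *                     return -1;
+ *             rc = write(fd, policy, strlen(policy));  // "NID" or "CHAR"
+ *             close(fd);
+ *             return rc < 0 ? -1 : 0;
+ *     }
+ */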
+
+static int lmv_activeobd_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device       *dev = (struct obd_device *)m->private;
+       struct lmv_desc  *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lmv.desc;
+       return seq_printf(m, "%u\n", desc->ld_active_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lmv_activeobd);
+
+static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lmv_obd    *lmv;
+
+       LASSERT(dev != NULL);
+       lmv = &dev->u.lmv;
+       return seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(lmv_desc_uuid);
+
+static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_device       *dev = p->private;
+       struct lmv_obd    *lmv = &dev->u.lmv;
+       return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static void lmv_tgt_seq_stop(struct seq_file *p, void *v)
+{
+       return;
+}
+
+static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_device       *dev = p->private;
+       struct lmv_obd    *lmv = &dev->u.lmv;
+       ++*pos;
+       return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos];
+}
+
+static int lmv_tgt_seq_show(struct seq_file *p, void *v)
+{
+       struct lmv_tgt_desc     *tgt = v;
+
+       if (tgt == NULL)
+               return 0;
+       return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_idx,
+                         tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN");
+}
+
+struct seq_operations lmv_tgt_sops = {
+       .start = lmv_tgt_seq_start,
+       .stop  = lmv_tgt_seq_stop,
+       .next  = lmv_tgt_seq_next,
+       .show  = lmv_tgt_seq_show,
+};
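+/*
+ * Example output (hypothetical targets) of the target file built from these
+ * seq ops, one line per lmv target in the "%d: %s %sACTIVE" format used by
+ * lmv_tgt_seq_show() above:
+ *
+ *     0: lustre-MDT0000_UUID ACTIVE
+ *     1: lustre-MDT0001_UUID INACTIVE
+ */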
+
+static int lmv_target_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file  *seq;
+       int                  rc;
+
+       rc = seq_open(file, &lmv_tgt_sops);
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+
+       return 0;
+}
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, uuid);
+
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+       { "numobd",       &lmv_numobd_fops,       0, 0 },
+       { "placement",    &lmv_placement_fops,    0, 0 },
+       { "activeobd",    &lmv_activeobd_fops,    0, 0 },
+       { "uuid",         &lmv_uuid_fops,         0, 0 },
+       { "desc_uuid",    &lmv_desc_uuid_fops,    0, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(lmv, numrefs);
+
+static struct lprocfs_vars lprocfs_lmv_module_vars[] = {
+       { "num_refs",      &lmv_numrefs_fops, 0, 0 },
+       { 0 }
+};
+
+struct file_operations lmv_proc_target_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lmv_target_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+#endif /* LPROCFS */
+void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars    = lprocfs_lmv_module_vars;
+       lvars->obd_vars       = lprocfs_lmv_obd_vars;
+}
diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile
new file mode 100644 (file)
index 0000000..67eaec2
--- /dev/null
@@ -0,0 +1,9 @@
+obj-$(CONFIG_LUSTRE_FS) += lov.o
+lov-y := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o \
+        lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o  \
+        lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o      \
+        lovsub_lock.o lovsub_io.o lov_pool.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
new file mode 100644 (file)
index 0000000..28801b8
--- /dev/null
@@ -0,0 +1,820 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <cl_object.h>
+#include "lov_internal.h"
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-object, sub-page, and sub-io have a well-defined top-object, top-page,
+ * and top-io respectively, while a single sub-lock can be part of multiple
+ * top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ *     - top-object keeps a reference to its sub-objects, and destroys them
+ *       when it is destroyed.
+ *
+ *     - top-page keeps a reference to its sub-page, and destroys it when it
+ *       is destroyed.
+ *
+ *     - sub-lock keep a reference to its top-locks. Top-lock keeps a
+ *       reference (and a hold, see cl_lock_hold()) on its sub-locks when it
+ *       actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ *       moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ *       hold. From this moment top-lock has only a 'weak' reference to its
+ *       sub-locks. This reference is protected by top-lock
+ *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ *       when the latter is destroyed. When a sub-lock is canceled, a
+ *       reference to it is removed from the top-lock array, and top-lock is
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ *       while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ *     - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing the full set of cl-interfaces. For example, a top-object has
+ * vvp and lov layers, and its sub-object has lovsub and osc layers. The
+ * lovsub layer is used to track the child-parent relationship.
+ *
+ * @{
+ */
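+/*
+ * Concrete example (illustrative): for a file striped over 3 OSTs, the
+ * top-object is connected to 3 lovsub sub-objects, one per stripe; a write
+ * spanning two stripes becomes a top-io with 2 sub-io's; and a file-range
+ * top-lock maps onto per-stripe sub-locks covering the affected stripes.
+ */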
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+enum lov_device_flags {
+       LOV_DEV_INITIALIZED = 1 << 0
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in the memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emerg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+       /**
+        * Page list used to submit IO under memory pressure.
+        */
+       struct cl_page_list emrg_page_list;
+       /**
+        * sub-io's shared by all threads accessing this device when memory is
+        * too low to allocate sub-io's dynamically.
+        */
+       struct cl_io    emrg_subio;
+       /**
+        * Environments used by sub-io's in
+        * lov_device_emerg::emrg_subio.
+        */
+       struct lu_env      *emrg_env;
+       /**
+        * Refchecks for lov_device_emerg::emrg_env.
+        *
+        * \see cl_env_get()
+        */
+       int              emrg_refcheck;
+};
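+/*
+ * A minimal sketch of the intended borrow pattern (illustrative; the real
+ * user lives in lov_io.c): under memory pressure a thread takes
+ * lov_device::ld_mutex and borrows the preallocated sub-io and environment
+ * instead of allocating them:
+ *
+ *     mutex_lock(&ld->ld_mutex);
+ *     sub->sub_io       = &ld->ld_emrg[stripe]->emrg_subio;
+ *     sub->sub_env      = ld->ld_emrg[stripe]->emrg_env;
+ *     sub->sub_borrowed = 1;
+ *     mutex_unlock(&ld->ld_mutex);
+ */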
+
+struct lov_device {
+       /*
+        * XXX Locking of lov-private data is missing.
+        */
+       struct cl_device          ld_cl;
+       struct lov_obd     *ld_lov;
+       /** size of lov_device::ld_target[] array */
+       __u32                ld_target_nr;
+       struct lovsub_device    **ld_target;
+       __u32                ld_flags;
+
+       /** Emergency resources used in memory-cleansing paths. */
+       struct lov_device_emerg **ld_emrg;
+       /**
+        * Serializes access to lov_device::ld_emrg in low-memory
+        * conditions.
+        */
+       struct mutex              ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+       /** empty file without body */
+       LLT_EMPTY,
+       /** striped file */
+       LLT_RAID0,
+       LLT_NR
+};
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking the
+ * function that corresponds to the current layout type.
+ */
+struct lov_object {
+       struct cl_object       lo_cl;
+       /**
+        * Serializes object operations with transitions between layout types.
+        *
+        * This semaphore is taken in shared mode by all object methods, and
+        * is taken in exclusive mode when object type is changed.
+        *
+        * \see lov_object::lo_type
+        */
+       struct rw_semaphore     lo_type_guard;
+       /**
+        * Type of an object. Protected by lov_object::lo_type_guard.
+        */
+       enum lov_layout_type    lo_type;
+       /**
+        * True if layout is invalid. This bit is set when the layout lock
+        * is lost and cleared again once a valid layout is installed.
+        */
+       bool                    lo_layout_invalid;
+       /**
+        * How many IOs are in flight on this object. The layout can be
+        * changed only if there is no active IO.
+        */
+       atomic_t               lo_active_ios;
+       /**
+        * Wait queue used to wait until no one else is using lo_lsm.
+        */
+       wait_queue_head_t              lo_waitq;
+       /**
+        * Layout metadata. NULL if empty layout.
+        */
+       struct lov_stripe_md  *lo_lsm;
+
+       union lov_layout_state {
+               struct lov_layout_raid0 {
+                       unsigned               lo_nr;
+                       /**
+                        * When this is true, lov_object::lo_attr contains
+                        * valid up to date attributes for a top-level
+                        * object. This field is reset to 0 when attributes of
+                        * any sub-object change.
+                        */
+                       int                    lo_attr_valid;
+                       /**
+                        * Array of sub-objects. Allocated when top-object is
+                        * created (lov_init_raid0()).
+                        *
+                        * Top-object is a strict master of its sub-objects:
+                        * it is created before them, and outlives its
+                        * children (the latter is necessary so that basic
+                        * functions like cl_object_top() always
+                        * work). Top-object keeps a reference on every
+                        * sub-object.
+                        *
+                        * When top-object is destroyed (lov_delete_raid0())
+                        * it releases its reference to a sub-object and waits
+                        * until the latter is finally destroyed.
+                        */
+                       struct lovsub_object **lo_sub;
+                       /**
+                        * protect lo_sub
+                        */
+                       spinlock_t              lo_sub_lock;
+                       /**
+                        * Cached object attribute, built from sub-object
+                        * attributes.
+                        */
+                       struct cl_attr   lo_attr;
+               } raid0;
+               struct lov_layout_state_empty {
+               } empty;
+       } u;
+       /**
+        * Thread that acquired lov_object::lo_type_guard in an exclusive
+        * mode.
+        */
+       task_t      *lo_owner;
+};
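+/*
+ * A minimal sketch of the double-dispatch described above (illustrative
+ * only; the real per-layout operation tables live in lov_object.c).
+ * Behavior is selected once per call through a table indexed by
+ * lov_object::lo_type instead of scattered `if'/`switch' statements:
+ *
+ *     struct lov_layout_operations {
+ *             int (*llo_print)(const struct lu_env *env, void *cookie,
+ *                              lu_printer_t p, const struct lu_object *o);
+ *     };
+ *
+ *     static const struct lov_layout_operations lov_dispatch[LLT_NR] = {
+ *             [LLT_EMPTY] = { .llo_print = lov_print_empty },
+ *             [LLT_RAID0] = { .llo_print = lov_print_raid0 },
+ *     };
+ *
+ *     // Called with lov_object::lo_type_guard held in shared mode:
+ *     //   lov_dispatch[lov->lo_type].llo_print(env, cookie, p, obj);
+ */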
+
+/**
+ * Flags that top-lock can set on each of its sub-locks.
+ */
+enum lov_sub_flags {
+       /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */
+       LSF_HELD = 1 << 0
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+       /** sub-lock itself */
+       struct lovsub_lock  *sub_lock;
+       /** An array of per-sub-lock flags, taken from enum lov_sub_flags */
+       unsigned             sub_flags;
+       int               sub_stripe;
+       struct cl_lock_descr sub_descr;
+       struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+       struct cl_lock_slice   lls_cl;
+       /** Number of sub-locks in this lock */
+       int                 lls_nr;
+       /**
+        * Number of existing sub-locks.
+        */
+       unsigned               lls_nr_filled;
+       /**
+        * Set when a sub-lock was canceled while the top-lock was being
+        * used or unused.
+        */
+       unsigned int           lls_cancel_race:1;
+       /**
+        * An array of sub-locks
+        *
+        * There are two issues with managing sub-locks:
+        *
+        *     - sub-locks are concurrently canceled, and
+        *
+        *     - sub-locks are shared with other top-locks.
+        *
+        * To manage cancellation, top-lock acquires a hold on a sublock
+        * (lov_sublock_adopt()) when the latter is inserted into
+        * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+        * when top-lock is going into CLS_CACHED state or destroyed. Hold
+        * prevents sub-lock from cancellation.
+        *
+        * Sub-lock sharing means, among other things, that a top-lock that
+        * is in the process of creation (i.e., not yet inserted into the
+        * lock list) is already accessible to other threads once at least
+        * one of its sub-locks is created; see lov_lock_sub_init().
+        *
+        * Sub-lock can be in one of the following states:
+        *
+        *     - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+        *       sub-lock was either never created (top-lock is in CLS_NEW
+        *       state), or it was created, then canceled, then destroyed
+        *       (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+        *
+        *     - sub-lock exists and is on
+        *       hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+        *       normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+        *       of a top-lock.
+        *
+        *     - sub-lock exists, but is not held by the top-lock. This
+        *       happens after top-lock released a hold on sub-locks before
+        *       going into cache (lov_lock_unuse()).
+        *
+        * \todo To support wide-striping, array has to be replaced with a set
+        * of queues to avoid scanning.
+        */
+       struct lov_lock_sub   *lls_sub;
+       /**
+        * Original description with which lock was enqueued.
+        */
+       struct cl_lock_descr   lls_orig;
+};
+
+struct lov_page {
+       struct cl_page_slice lps_cl;
+       int               lps_invalid;
+};
+
+/*
+ * Bottom half.
+ */
+
+struct lovsub_device {
+       struct cl_device   acid_cl;
+       struct lov_device *acid_super;
+       int             acid_idx;
+       struct cl_device  *acid_next;
+};
+
+struct lovsub_object {
+       struct cl_object_header lso_header;
+       struct cl_object        lso_cl;
+       struct lov_object      *lso_super;
+       int                  lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. Separate data-structure is
+ * necessary, because top-locks and sub-locks are in an M:N relationship.
+ *
+ * \todo This can be optimized for a (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+       struct lov_lock *lll_super;
+       /** An index within parent lock. */
+       int           lll_idx;
+       /**
+        * A linkage into per sub-lock list of all corresponding top-locks,
+        * hanging off lovsub_lock::lss_parents.
+        */
+       struct list_head       lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+       struct cl_lock_slice  lss_cl;
+       /**
+        * List of top-locks that have given sub-lock as their part. Protected
+        * by cl_lock::cll_guard mutex.
+        */
+       struct list_head            lss_parents;
+       /**
+        * Top-lock that initiated current operation on this sub-lock. This is
+        * only set during top-to-bottom lock operations like enqueue, and is
+        * used to optimize state change notification. Protected by
+        * cl_lock::cll_guard mutex.
+        *
+        * \see lovsub_lock_state_one().
+        */
+       struct cl_lock       *lss_active;
+};
+
+/**
+ * Describe the environment settings for sublocks.
+ */
+struct lov_sublock_env {
+       const struct lu_env *lse_env;
+       struct cl_io    *lse_io;
+       struct lov_io_sub   *lse_sub;
+};
+
+struct lovsub_page {
+       struct cl_page_slice lsb_cl;
+};
+
+
+struct lov_thread_info {
+       struct cl_object_conf   lti_stripe_conf;
+       struct lu_fid      lti_fid;
+       struct cl_lock_descr    lti_ldescr;
+       struct ost_lvb    lti_lvb;
+       struct cl_2queue        lti_cl2q;
+       struct cl_lock_closure  lti_closure;
+       wait_queue_t      lti_waiter;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+       int               sub_stripe;
+       /**
+        * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+        * independently, with lov acting as a scheduler to maximize overall
+        * throughput.
+        */
+       struct cl_io    *sub_io;
+       /**
+        * Linkage into a list (hanging off lov_io::lis_active) of all
+        * sub-io's active for the current IO iteration.
+        */
+       struct list_head           sub_linkage;
+       /**
+        * true, iff cl_io_init() was successfully executed against
+        * lov_io_sub::sub_io.
+        */
+       int               sub_io_initialized;
+       /**
+        * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+        * allocated, but borrowed from a per-device emergency pool.
+        */
+       int               sub_borrowed;
+       /**
+        * environment, in which sub-io executes.
+        */
+       struct lu_env *sub_env;
+       /**
+        * environment's refcheck.
+        *
+        * \see cl_env_get()
+        */
+       int               sub_refcheck;
+       int               sub_refcheck2;
+       int               sub_reenter;
+       void            *sub_cookie;
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+       /** super-class */
+       struct cl_io_slice lis_cl;
+       /**
+        * Pointer to the object slice. This is a duplicate of
+        * lov_io::lis_cl::cis_object.
+        */
+       struct lov_object *lis_object;
+       /**
+        * Original end-of-io position for this IO, set by the upper layer as
+        * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+        * changes pos and count to fit IO into a single stripe and uses saved
+        * value to determine when IO iterations have to stop.
+        *
+        * This is used only for CIT_READ and CIT_WRITE io's.
+        */
+       loff_t       lis_io_endpos;
+
+       /**
+        * starting position within a file, for the current io loop iteration
+        * (stripe), used by ci_io_loop().
+        */
+       obd_off     lis_pos;
+       /**
+        * end position within a file, for the current stripe io. This is
+        * exclusive (i.e., next offset after last byte affected by io).
+        */
+       obd_off     lis_endpos;
+
+       int             lis_mem_frozen;
+       int             lis_stripe_count;
+       int             lis_active_subios;
+
+       /**
+        * the index of ls_single_subio in ls_subios array
+        */
+       int             lis_single_subio_index;
+       struct cl_io       lis_single_subio;
+
+       /**
+        * size of ls_subios array, actually the highest stripe #
+        */
+       int             lis_nr_subios;
+       struct lov_io_sub *lis_subs;
+       /**
+        * List of active sub-io's.
+        */
+       struct list_head         lis_active;
+};
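+/*
+ * Worked example (hypothetical numbers): with a 1 MB stripe size over two
+ * stripes, a 1.5 MB write starting at offset 0.5 MB sets lis_io_endpos to
+ * 2 MB. The first iteration runs with lis_pos = 0.5 MB and lis_endpos =
+ * 1 MB (stripe 0); the second with lis_pos = 1 MB and lis_endpos = 2 MB
+ * (stripe 1). Iteration stops once lis_pos reaches lis_io_endpos.
+ */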
+
+struct lov_session {
+       struct lov_io     ls_io;
+       struct lov_sublock_env ls_subenv;
+};
+
+/**
+ * State of transfer for lov.
+ */
+struct lov_req {
+       struct cl_req_slice lr_cl;
+};
+
+/**
+ * State of transfer for lovsub.
+ */
+struct lovsub_req {
+       struct cl_req_slice lsrq_cl;
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;
+extern struct lu_context_key lov_session_key;
+
+extern struct kmem_cache *lov_lock_kmem;
+extern struct kmem_cache *lov_object_kmem;
+extern struct kmem_cache *lov_thread_kmem;
+extern struct kmem_cache *lov_session_kmem;
+extern struct kmem_cache *lov_req_kmem;
+
+extern struct kmem_cache *lovsub_lock_kmem;
+extern struct kmem_cache *lovsub_object_kmem;
+extern struct kmem_cache *lovsub_req_kmem;
+
+extern struct kmem_cache *lov_lock_link_kmem;
+
+int   lov_object_init     (const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf);
+int   lovsub_object_init  (const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf);
+int   lov_lock_init       (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init       (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+int   lovsub_lock_init    (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+
+int   lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init_raid0   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+int   lov_io_init_empty   (const struct lu_env *env, struct cl_object *obj,
+                          struct cl_io *io);
+void  lov_lock_unlink     (const struct lu_env *env, struct lov_lock_link *link,
+                          struct lovsub_lock *sub);
+
+struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio,
+                              int stripe);
+void  lov_sub_put           (struct lov_io_sub *sub);
+int   lov_sublock_modify  (const struct lu_env *env, struct lov_lock *lov,
+                          struct lovsub_lock *sublock,
+                          const struct cl_lock_descr *d, int idx);
+
+
+int   lov_page_init       (const struct lu_env *env, struct cl_object *ob,
+                          struct cl_page *page, struct page *vmpage);
+int   lovsub_page_init    (const struct lu_env *env, struct cl_object *ob,
+                          struct cl_page *page, struct page *vmpage);
+
+int   lov_page_init_empty (const struct lu_env *env,
+                          struct cl_object *obj,
+                          struct cl_page *page, struct page *vmpage);
+int   lov_page_init_raid0 (const struct lu_env *env,
+                          struct cl_object *obj,
+                          struct cl_page *page, struct page *vmpage);
+struct lu_object *lov_object_alloc   (const struct lu_env *env,
+                                     const struct lu_object_header *hdr,
+                                     struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                     const struct lu_object_header *hdr,
+                                     struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                        struct lov_lock *lck,
+                                        struct lovsub_lock *sub);
+struct lov_io_sub    *lov_page_subio    (const struct lu_env *env,
+                                        struct lov_io *lio,
+                                        const struct cl_page_slice *slice);
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm);
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
+
+#define lov_foreach_target(lov, var)               \
+       for (var = 0; var < lov_targets_nr(lov); ++var)
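+/*
+ * Typical use of lov_foreach_target() (see lov_device_init() in lov_dev.c
+ * below):
+ *
+ *     int i;
+ *
+ *     lov_foreach_target(ld, i) {
+ *             struct lovsub_device *lsd = ld->ld_target[i];
+ *             // ... per-target work ...
+ *     }
+ */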
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+       struct lov_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &lov_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+       return &lov_env_session(env)->ls_io;
+}
+
+static inline int lov_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &lov_device_type;
+}
+
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &lovsub_device_type;
+}
+
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+       return &lov->ld_cl.cd_lu_dev;
+}
+
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &lov_device_type);
+       return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+       return &lovsub->acid_cl;
+}
+
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+       return &lovsub2cl_dev(lovsub)->cd_lu_dev;
+}
+
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &lovsub_device_type);
+       return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+       LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type);
+       return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+       return &lov->lo_cl.co_lu;
+}
+
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+       return &lov->lo_cl;
+}
+
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+       LINVRNT(lov_is_object(obj));
+       return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+       LINVRNT(lov_is_object(&obj->co_lu));
+       return container_of0(obj, struct lov_object, lo_cl);
+}
+
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+       return &los->lso_cl.co_lu;
+}
+
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+       return &los->lso_cl;
+}
+
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+       LINVRNT(lovsub_is_object(&obj->co_lu));
+       return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+       LINVRNT(lovsub_is_object(obj));
+       return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu));
+       return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+
+       slice = cl_lock_at(lock, &lovsub_device_type);
+       LASSERT(slice != NULL);
+       return cl2lovsub_lock(slice);
+}
+
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(lov_is_object(&slice->cls_obj->co_lu));
+       return container_of(slice, struct lov_lock, lls_cl);
+}
+
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct lov_page, lps_cl);
+}
+
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct lov_req, lr_cl);
+}
+
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+       return container_of0(slice, struct lovsub_req, lsrq_cl);
+}
+
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+       return slice->cpl_page->cp_child;
+}
+
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+       struct lov_io *lio;
+
+       lio = container_of(ios, struct lov_io, lis_cl);
+       LASSERT(lio == lov_env_io(env));
+       return lio;
+}
+
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+       return lov->ld_lov->desc.ld_tgt_count;
+}
+
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+       struct lov_thread_info *info;
+
+       info = lu_context_key_get(&env->le_ctx, &lov_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+       LASSERT(lov->lo_type == LLT_RAID0);
+       LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC ||
+               lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3);
+       return &lov->u.raid0;
+}
+
+/** @} lov */
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c
new file mode 100644 (file)
index 0000000..f94f8d9
--- /dev/null
@@ -0,0 +1,533 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "lov_cl_internal.h"
+
+struct kmem_cache *lov_lock_kmem;
+struct kmem_cache *lov_object_kmem;
+struct kmem_cache *lov_thread_kmem;
+struct kmem_cache *lov_session_kmem;
+struct kmem_cache *lov_req_kmem;
+
+struct kmem_cache *lovsub_lock_kmem;
+struct kmem_cache *lovsub_object_kmem;
+struct kmem_cache *lovsub_req_kmem;
+
+struct kmem_cache *lov_lock_link_kmem;
+
+/** Lock class of lov_device::ld_mutex. */
+struct lock_class_key cl_lov_device_mutex_class;
+
+struct lu_kmem_descr lov_caches[] = {
+       {
+               .ckd_cache = &lov_lock_kmem,
+               .ckd_name  = "lov_lock_kmem",
+               .ckd_size  = sizeof (struct lov_lock)
+       },
+       {
+               .ckd_cache = &lov_object_kmem,
+               .ckd_name  = "lov_object_kmem",
+               .ckd_size  = sizeof (struct lov_object)
+       },
+       {
+               .ckd_cache = &lov_thread_kmem,
+               .ckd_name  = "lov_thread_kmem",
+               .ckd_size  = sizeof (struct lov_thread_info)
+       },
+       {
+               .ckd_cache = &lov_session_kmem,
+               .ckd_name  = "lov_session_kmem",
+               .ckd_size  = sizeof (struct lov_session)
+       },
+       {
+               .ckd_cache = &lov_req_kmem,
+               .ckd_name  = "lov_req_kmem",
+               .ckd_size  = sizeof (struct lov_req)
+       },
+       {
+               .ckd_cache = &lovsub_lock_kmem,
+               .ckd_name  = "lovsub_lock_kmem",
+               .ckd_size  = sizeof (struct lovsub_lock)
+       },
+       {
+               .ckd_cache = &lovsub_object_kmem,
+               .ckd_name  = "lovsub_object_kmem",
+               .ckd_size  = sizeof (struct lovsub_object)
+       },
+       {
+               .ckd_cache = &lovsub_req_kmem,
+               .ckd_name  = "lovsub_req_kmem",
+               .ckd_size  = sizeof (struct lovsub_req)
+       },
+       {
+               .ckd_cache = &lov_lock_link_kmem,
+               .ckd_name  = "lov_lock_link_kmem",
+               .ckd_size  = sizeof (struct lov_lock_link)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
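+/*
+ * These caches are registered and torn down in one shot via the lu_kmem
+ * helpers; a minimal sketch, assuming the usual module init path in
+ * lov_obd.c:
+ *
+ *     rc = lu_kmem_init(lov_caches);
+ *     ...
+ *     lu_kmem_fini(lov_caches);
+ */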
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+static void lov_req_completion(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret)
+{
+       struct lov_req *lr;
+
+       ENTRY;
+       lr = cl2lov_req(slice);
+       OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+       EXIT;
+}
+
+static const struct cl_req_operations lov_req_ops = {
+       .cro_completion = lov_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
+static void *lov_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct lov_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, __GFP_IO);
+       if (info != NULL)
+               INIT_LIST_HEAD(&info->lti_closure.clc_list);
+       else
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void lov_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct lov_thread_info *info = data;
+       LINVRNT(list_empty(&info->lti_closure.clc_list));
+       OBD_SLAB_FREE_PTR(info, lov_thread_kmem);
+}
+
+struct lu_context_key lov_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = lov_key_init,
+       .lct_fini = lov_key_fini
+};
+
+static void *lov_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct lov_session *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void lov_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct lov_session *info = data;
+       OBD_SLAB_FREE_PTR(info, lov_session_kmem);
+}
+
+struct lu_context_key lov_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = lov_session_key_init,
+       .lct_fini = lov_session_key_fini
+};
+
+/* type constructor/destructor: lov_type_{init,fini,start,stop}() */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       int i;
+       struct lov_device *ld = lu2lov_dev(d);
+
+       LASSERT(ld->ld_lov != NULL);
+       if (ld->ld_target == NULL)
+               RETURN(NULL);
+
+       lov_foreach_target(ld, i) {
+               struct lovsub_device *lsd;
+
+               lsd = ld->ld_target[i];
+               if (lsd != NULL) {
+                       cl_stack_fini(env, lovsub2cl_dev(lsd));
+                       ld->ld_target[i] = NULL;
+               }
+       }
+       RETURN(NULL);
+}
+
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       struct lov_device *ld = lu2lov_dev(d);
+       int i;
+       int rc = 0;
+
+       LASSERT(d->ld_site != NULL);
+       if (ld->ld_target == NULL)
+               RETURN(rc);
+
+       lov_foreach_target(ld, i) {
+               struct lovsub_device *lsd;
+               struct cl_device     *cl;
+               struct lov_tgt_desc  *desc;
+
+               desc = ld->ld_lov->lov_tgts[i];
+               if (desc == NULL)
+                       continue;
+
+               cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+                                  desc->ltd_obd->obd_lu_dev);
+               if (IS_ERR(cl)) {
+                       rc = PTR_ERR(cl);
+                       break;
+               }
+               lsd = cl2lovsub_dev(cl);
+               lsd->acid_idx = i;
+               lsd->acid_super = ld;
+               ld->ld_target[i] = lsd;
+       }
+
+       if (rc)
+               lov_device_fini(env, d);
+       else
+               ld->ld_flags |= LOV_DEV_INITIALIZED;
+
+       RETURN(rc);
+}
+
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+                       struct cl_req *req)
+{
+       struct lov_req *lr;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, __GFP_IO);
+       if (lr != NULL) {
+               cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static const struct cl_device_operations lov_cl_ops = {
+       .cdo_req_init = lov_req_init
+};
+
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+       int i;
+
+       for (i = 0; i < nr; ++i) {
+               struct lov_device_emerg *em;
+
+               em = emrg[i];
+               if (em != NULL) {
+                       LASSERT(em->emrg_page_list.pl_nr == 0);
+                       if (em->emrg_env != NULL)
+                               cl_env_put(em->emrg_env, &em->emrg_refcheck);
+                       OBD_FREE_PTR(em);
+               }
+       }
+       OBD_FREE(emrg, nr * sizeof emrg[0]);
+}
+
+static struct lu_device *lov_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct lov_device *ld = lu2lov_dev(d);
+       const int         nr = ld->ld_target_nr;
+
+       cl_device_fini(lu2cl_dev(d));
+       if (ld->ld_target != NULL)
+               OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]);
+       if (ld->ld_emrg != NULL)
+               lov_emerg_free(ld->ld_emrg, nr);
+       OBD_FREE_PTR(ld);
+       return NULL;
+}
+
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+                             __u32 index)
+{
+       struct lov_device *ld = lu2lov_dev(dev);
+       ENTRY;
+
+       if (ld->ld_target[index] != NULL) {
+               cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index]));
+               ld->ld_target[index] = NULL;
+       }
+       EXIT;
+}
+
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+       struct lov_device_emerg **emerg;
+       int i;
+       int result;
+
+       OBD_ALLOC(emerg, nr * sizeof emerg[0]);
+       if (emerg == NULL)
+               return ERR_PTR(-ENOMEM);
+       for (result = i = 0; i < nr && result == 0; i++) {
+               struct lov_device_emerg *em;
+
+               OBD_ALLOC_PTR(em);
+               if (em != NULL) {
+                       emerg[i] = em;
+                       cl_page_list_init(&em->emrg_page_list);
+                       em->emrg_env = cl_env_alloc(&em->emrg_refcheck,
+                                                   LCT_REMEMBER|LCT_NOREF);
+                       if (!IS_ERR(em->emrg_env))
+                               em->emrg_env->le_ctx.lc_cookie = 0x2;
+                       else {
+                               result = PTR_ERR(em->emrg_env);
+                               em->emrg_env = NULL;
+                       }
+               } else
+                       result = -ENOMEM;
+       }
+       if (result != 0) {
+               lov_emerg_free(emerg, nr);
+               emerg = ERR_PTR(result);
+       }
+       return emerg;
+}
+
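+/*
+ * Grow lov_device::ld_target (and the matching emergency resources) to
+ * the current lov_tgt_size.  The old array is copied and freed under
+ * ld_mutex so concurrent users see a consistent view.
+ */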
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+       int   result;
+       __u32 tgt_size;
+       __u32 sub_size;
+
+       ENTRY;
+       result = 0;
+       tgt_size = dev->ld_lov->lov_tgt_size;
+       sub_size = dev->ld_target_nr;
+       if (sub_size < tgt_size) {
+               struct lovsub_device    **newd;
+               struct lov_device_emerg **emerg;
+               const size_t          sz   = sizeof newd[0];
+
+               emerg = lov_emerg_alloc(tgt_size);
+               if (IS_ERR(emerg))
+                       RETURN(PTR_ERR(emerg));
+
+               OBD_ALLOC(newd, tgt_size * sz);
+               if (newd != NULL) {
+                       mutex_lock(&dev->ld_mutex);
+                       if (sub_size > 0) {
+                               memcpy(newd, dev->ld_target, sub_size * sz);
+                               OBD_FREE(dev->ld_target, sub_size * sz);
+                       }
+                       dev->ld_target    = newd;
+                       dev->ld_target_nr = tgt_size;
+
+                       if (dev->ld_emrg != NULL)
+                               lov_emerg_free(dev->ld_emrg, sub_size);
+                       dev->ld_emrg = emerg;
+                       mutex_unlock(&dev->ld_mutex);
+               } else {
+                       lov_emerg_free(emerg, tgt_size);
+                       result = -ENOMEM;
+               }
+       }
+       RETURN(result);
+}
+
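+/*
+ * Plug a new OST target at @index into the CL device stack: make room in
+ * the target array, then set up a lovsub device on top of the target's
+ * lu_device once the LOV device itself has been initialized.
+ */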
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+                            __u32 index)
+{
+       struct obd_device    *obd = dev->ld_obd;
+       struct lov_device    *ld  = lu2lov_dev(dev);
+       struct lov_tgt_desc  *tgt;
+       struct lovsub_device *lsd;
+       struct cl_device     *cl;
+       int rc;
+       ENTRY;
+
+       obd_getref(obd);
+
+       tgt = obd->u.lov.lov_tgts[index];
+       LASSERT(tgt != NULL);
+       LASSERT(tgt->ltd_obd != NULL);
+
+       if (!tgt->ltd_obd->obd_set_up) {
+               CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid));
+               obd_putref(obd);
+               RETURN(-EINVAL);
+       }
+
+       rc = lov_expand_targets(env, ld);
+       if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) {
+               LASSERT(dev->ld_site != NULL);
+
+               cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+                                  tgt->ltd_obd->obd_lu_dev);
+               if (!IS_ERR(cl)) {
+                       lsd = cl2lovsub_dev(cl);
+                       lsd->acid_idx = index;
+                       lsd->acid_super = ld;
+                       ld->ld_target[index] = lsd;
+               } else {
+                       rc = PTR_ERR(cl);
+                       CERROR("add failed (%d), deleting %s\n", rc,
+                              obd_uuid2str(&tgt->ltd_uuid));
+                       lov_cl_del_target(env, dev, index);
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
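+/*
+ * Process a configuration command: the common work is done by
+ * lov_process_config_base(), target add/delete is then mirrored on the
+ * CL side of the device.
+ */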
+static int lov_process_config(const struct lu_env *env,
+                             struct lu_device *d, struct lustre_cfg *cfg)
+{
+       struct obd_device *obd = d->ld_obd;
+       int cmd;
+       int rc;
+       int gen;
+       __u32 index;
+
+       obd_getref(obd);
+
+       cmd = cfg->lcfg_command;
+       rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
+       if (rc == 0) {
+               switch (cmd) {
+               case LCFG_LOV_ADD_OBD:
+               case LCFG_LOV_ADD_INA:
+                       rc = lov_cl_add_target(env, d, index);
+                       if (rc != 0)
+                               lov_del_target(d->ld_obd, index, 0, 0);
+                       break;
+               case LCFG_LOV_DEL_OBD:
+                       lov_cl_del_target(env, d, index);
+                       break;
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
+static const struct lu_device_operations lov_lu_ops = {
+       .ldo_object_alloc      = lov_object_alloc,
+       .ldo_process_config    = lov_process_config,
+};
+
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       struct lu_device *d;
+       struct lov_device *ld;
+       struct obd_device *obd;
+       int rc;
+
+       OBD_ALLOC_PTR(ld);
+       if (ld == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       cl_device_init(&ld->ld_cl, t);
+       d = lov2lu_dev(ld);
+       d->ld_ops       = &lov_lu_ops;
+       ld->ld_cl.cd_ops = &lov_cl_ops;
+
+       mutex_init(&ld->ld_mutex);
+       lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+       /* setup the LOV OBD */
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       rc = lov_setup(obd, cfg);
+       if (rc) {
+               lov_device_free(env, d);
+               RETURN(ERR_PTR(rc));
+       }
+
+       ld->ld_lov = &obd->u.lov;
+       RETURN(d);
+}
+
+static const struct lu_device_type_operations lov_device_type_ops = {
+       .ldto_init = lov_type_init,
+       .ldto_fini = lov_type_fini,
+
+       .ldto_start = lov_type_start,
+       .ldto_stop  = lov_type_stop,
+
+       .ldto_device_alloc = lov_device_alloc,
+       .ldto_device_free  = lov_device_free,
+
+       .ldto_device_init    = lov_device_init,
+       .ldto_device_fini    = lov_device_fini
+};
+
+struct lu_device_type lov_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_LOV_NAME,
+       .ldt_ops      = &lov_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c
new file mode 100644 (file)
index 0000000..340dbcf
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_ea.c
+ *
+ * Author: Wang Di <wangdi@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <asm/div64.h>
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+struct lovea_unpack_args {
+       struct lov_stripe_md    *lsm;
+       int                      cursor;
+};
+
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+                                __u16 stripe_count)
+{
+       if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+               CERROR("bad stripe count %d\n", stripe_count);
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm_oi_id(&lmm->lmm_oi) == 0) {
+               CERROR("zero object id\n");
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+               CERROR("bad striping pattern\n");
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       if (lmm->lmm_stripe_size == 0 ||
+           (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) {
+               CERROR("bad stripe size %u\n",
+                      le32_to_cpu(lmm->lmm_stripe_size));
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+       return 0;
+}
+
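+/*
+ * Allocate a plain (RAID0) stripe descriptor together with one lov_oinfo
+ * per stripe; the total size of the descriptor is returned through *size
+ * for the caller's bookkeeping.
+ */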
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size)
+{
+       struct lov_stripe_md *lsm;
+       struct lov_oinfo     *loi;
+       int                   i, oinfo_ptrs_size;
+
+       LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT);
+
+       oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count;
+       *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size;
+
+       OBD_ALLOC_LARGE(lsm, *size);
+       if (!lsm)
+               return NULL;
+
+       for (i = 0; i < stripe_count; i++) {
+               OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, __GFP_IO);
+               if (loi == NULL)
+                       goto err;
+               lsm->lsm_oinfo[i] = loi;
+       }
+       lsm->lsm_stripe_count = stripe_count;
+       return lsm;
+
+err:
+       while (--i >= 0)
+               OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi));
+       OBD_FREE_LARGE(lsm, *size);
+       return NULL;
+}
+
+void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+       __u16 stripe_count = lsm->lsm_stripe_count;
+       int i;
+
+       for (i = 0; i < stripe_count; i++)
+               OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab,
+                             sizeof(struct lov_oinfo));
+       OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) +
+                      stripe_count * sizeof(struct lov_oinfo *));
+}
+
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+                               struct lov_mds_md *lmm)
+{
+       /*
+        * This supposes lov_mds_md_v1/v3 first fields are the same.
+        */
+       lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi);
+       lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+       lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+       lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
+       lsm->lsm_pool_name[0] = '\0';
+}
+
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+                          obd_off *lov_off, obd_off *swidth)
+{
+       if (swidth)
+               *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+                          obd_off *lov_off, obd_off *swidth)
+{
+       if (swidth)
+               *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+                            struct obd_export *md_exp)
+{
+       return 0;
+}
+
+/* Find minimum stripe maxbytes value.  For inactive or
+ * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. */
+static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes)
+{
+       struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import;
+
+       if (imp == NULL || !tgt->ltd_active) {
+               *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+               return;
+       }
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_FULL &&
+           (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) &&
+           imp->imp_connect_data.ocd_maxbytes > 0) {
+               if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes)
+                       *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+       } else {
+               *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+       }
+       spin_unlock(&imp->imp_lock);
+}
+
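+/*
+ * Validate an on-disk V1 LOV EA: the buffer must be large enough for the
+ * advertised stripe count before the common sanity checks are applied.
+ */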
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
+                            __u16 *stripe_count)
+{
+       if (lmm_bytes < sizeof(*lmm)) {
+               CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
+                      lmm_bytes, (int)sizeof(*lmm));
+               return -EINVAL;
+       }
+
+       *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+       if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+               CERROR("LOV EA V1 too small: %d, need %d\n",
+                      lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                   struct lov_mds_md_v1 *lmm)
+{
+       struct lov_oinfo *loi;
+       int i;
+       __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+
+       lsm_unpackmd_common(lsm, lmm);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               /* XXX LOV STACKING call down to osc_unpackmd() */
+               loi = lsm->lsm_oinfo[i];
+               ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+               loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+               loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+               if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                       CERROR("OST index %d more than OST count %d\n",
+                              loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                       lov_dump_lmm_v1(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CERROR("OST index %d missing\n", loi->loi_ost_idx);
+                       lov_dump_lmm_v1(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               /* calculate the minimum stripe max bytes */
+               lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+                                &stripe_maxbytes);
+       }
+
+       lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+
+       return 0;
+}
+
+const struct lsm_operations lsm_v1_ops = {
+       .lsm_free             = lsm_free_plain,
+       .lsm_destroy          = lsm_destroy_plain,
+       .lsm_stripe_by_index  = lsm_stripe_by_index_plain,
+       .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+       .lsm_lmm_verify       = lsm_lmm_verify_v1,
+       .lsm_unpackmd         = lsm_unpackmd_v1,
+};
+
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+                            __u16 *stripe_count)
+{
+       struct lov_mds_md_v3 *lmm;
+
+       lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+       if (lmm_bytes < sizeof(*lmm)) {
+               CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+                      lmm_bytes, (int)sizeof(*lmm));
+               return -EINVAL;
+       }
+
+       *stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
+
+       if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+               CERROR("LOV EA V3 too small: %d, need %d\n",
+                      lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+               lov_dump_lmm_common(D_WARNING, lmm);
+               return -EINVAL;
+       }
+
+       return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+                                    *stripe_count);
+}
+
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                   struct lov_mds_md *lmmv1)
+{
+       struct lov_mds_md_v3 *lmm;
+       struct lov_oinfo *loi;
+       int i;
+       __u64 stripe_maxbytes = OBD_OBJECT_EOF;
+       int cplen = 0;
+
+       lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+       lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm);
+       cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name,
+                       sizeof(lsm->lsm_pool_name));
+       if (cplen >= sizeof(lsm->lsm_pool_name))
+               return -E2BIG;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               /* XXX LOV STACKING call down to osc_unpackmd() */
+               loi = lsm->lsm_oinfo[i];
+               ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi);
+               loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+               loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+               if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                       CERROR("OST index %d more than OST count %d\n",
+                              loi->loi_ost_idx, lov->desc.ld_tgt_count);
+                       lov_dump_lmm_v3(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CERROR("OST index %d missing\n", loi->loi_ost_idx);
+                       lov_dump_lmm_v3(D_WARNING, lmm);
+                       return -EINVAL;
+               }
+               /* calculate the minimum stripe max bytes */
+               lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx],
+                                &stripe_maxbytes);
+       }
+
+       lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count;
+
+       return 0;
+}
+
+const struct lsm_operations lsm_v3_ops = {
+       .lsm_free             = lsm_free_plain,
+       .lsm_destroy          = lsm_destroy_plain,
+       .lsm_stripe_by_index  = lsm_stripe_by_index_plain,
+       .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+       .lsm_lmm_verify       = lsm_lmm_verify_v3,
+       .lsm_unpackmd         = lsm_unpackmd_v3,
+};
diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h
new file mode 100644 (file)
index 0000000..16770d1
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef LOV_INTERNAL_H
+#define LOV_INTERNAL_H
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_user.h>
+
+struct lov_lock_handles {
+       struct portals_handle   llh_handle;
+       atomic_t                llh_refcount;
+       int                     llh_stripe_count;
+       struct lustre_handle    llh_handles[0];
+};
+
+struct lov_request {
+       struct obd_info          rq_oi;
+       struct lov_request_set  *rq_rqset;
+
+       struct list_head         rq_link;
+
+       int                      rq_idx;        /* index in lov->tgts array */
+       int                      rq_stripe;     /* stripe number */
+       int                      rq_complete;
+       int                      rq_rc;
+       int                      rq_buflen;     /* length of sub_md */
+
+       obd_count                rq_oabufs;
+       obd_count                rq_pgaidx;
+};
+
+struct lov_request_set {
+       struct ldlm_enqueue_info        *set_ei;
+       struct obd_info                 *set_oi;
+       atomic_t                        set_refcount;
+       struct obd_export               *set_exp;
+       /* XXX: There is @set_exp already, however obd_statfs gets obd_device
+          only. */
+       struct obd_device               *set_obd;
+       int                             set_count;
+       atomic_t                        set_completes;
+       atomic_t                        set_success;
+       atomic_t                        set_finish_checked;
+       struct llog_cookie              *set_cookies;
+       int                             set_cookie_sent;
+       struct obd_trans_info           *set_oti;
+       obd_count                       set_oabufs;
+       struct brw_page                 *set_pga;
+       struct lov_lock_handles         *set_lockh;
+       struct list_head                set_list;
+       wait_queue_head_t               set_waitq;
+       spinlock_t                      set_lock;
+};
+
+extern struct kmem_cache *lov_oinfo_slab;
+
+void lov_finish_set(struct lov_request_set *set);
+
+static inline void lov_get_reqset(struct lov_request_set *set)
+{
+       LASSERT(set != NULL);
+       LASSERT(atomic_read(&set->set_refcount) > 0);
+       atomic_inc(&set->set_refcount);
+}
+
+static inline void lov_put_reqset(struct lov_request_set *set)
+{
+       if (atomic_dec_and_test(&set->set_refcount))
+               lov_finish_set(set);
+}
+
+static inline struct lov_lock_handles *
+lov_handle2llh(struct lustre_handle *handle)
+{
+       LASSERT(handle != NULL);
+       return class_handle2object(handle->cookie);
+}
+
+static inline void lov_llh_put(struct lov_lock_handles *llh)
+{
+       CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+              atomic_read(&llh->llh_refcount) - 1);
+       LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+               atomic_read(&llh->llh_refcount) < 0x5a5a);
+       if (atomic_dec_and_test(&llh->llh_refcount)) {
+               class_handle_unhash(&llh->llh_handle);
+               /* The structure may still be referenced by other threads
+                * because of RCU.   -jxiong */
+               if (atomic_read(&llh->llh_refcount))
+                       return;
+
+               OBD_FREE_RCU(llh, sizeof *llh +
+                            sizeof(*llh->llh_handles) * llh->llh_stripe_count,
+                            &llh->llh_handle);
+       }
+}
+
+#define lov_uuid2str(lv, index) \
+       (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid)
+
+/* lov_merge.c */
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+                    struct lov_stripe_md *lsm, int stripeno, int *set);
+int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
+                 struct ost_lvb *lvb, int kms_only);
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+                  obd_off size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                     struct ost_lvb *lvb, __u64 *kms_place);
+
+/* lov_offset.c */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+                        int stripeno);
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+                     int stripeno, obd_off *obd_off);
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+                          int stripeno);
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+                         obd_off start, obd_off end,
+                         obd_off *obd_start, obd_off *obd_end);
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off);
+
+/* lov_qos.c */
+#define LOV_USES_ASSIGNED_STRIPE       0
+#define LOV_USES_DEFAULT_STRIPE         1
+int qos_add_tgt(struct obd_device *obd, __u32 index);
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt);
+void qos_shrink_lsm(struct lov_request_set *set);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov);
+void qos_statfs_done(struct lov_obd *lov);
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait);
+int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
+
+/* lov_request.c */
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_set_finished(struct lov_request_set *set, int idempotent);
+void lov_update_set(struct lov_request_set *set,
+                   struct lov_request *req, int rc);
+int lov_update_common_set(struct lov_request_set *set,
+                         struct lov_request *req, int rc);
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx);
+int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
+                       struct lov_stripe_md **ea, struct obdo *src_oa,
+                       struct obd_trans_info *oti,
+                       struct lov_request_set **reqset);
+int cb_create_update(void *cookie, int rc);
+int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pga,
+                    struct obd_trans_info *oti,
+                    struct lov_request_set **reqset);
+int lov_fini_brw_set(struct lov_request_set *set);
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct lov_request_set **reqset);
+int lov_fini_getattr_set(struct lov_request_set *set);
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obdo *src_oa, struct lov_stripe_md *lsm,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset);
+int lov_update_destroy_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc);
+int lov_fini_destroy_set(struct lov_request_set *set);
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset);
+int lov_update_setattr_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc);
+int lov_fini_setattr_set(struct lov_request_set *set);
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct obd_trans_info *oti,
+                      struct lov_request_set **reqset);
+int lov_fini_punch_set(struct lov_request_set *set);
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info,
+                     obd_off start, obd_off end,
+                     struct lov_request_set **reqset);
+int lov_fini_sync_set(struct lov_request_set *set);
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct lov_request_set **reqset);
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+                        struct ptlrpc_request_set *rqset);
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct lov_stripe_md *lsm,
+                      ldlm_policy_data_t *policy, __u32 mode,
+                      struct lustre_handle *lockh,
+                      struct lov_request_set **reqset);
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags);
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+                       struct lov_stripe_md *lsm,
+                       __u32 mode, struct lustre_handle *lockh,
+                       struct lov_request_set **reqset);
+int lov_fini_cancel_set(struct lov_request_set *set);
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+                       struct lov_request_set **reqset);
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                      int success);
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+                   int success);
+int lov_fini_statfs_set(struct lov_request_set *set);
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc);
+
+/* lov_obd.c */
+void lov_fix_desc(struct lov_desc *desc);
+void lov_fix_desc_stripe_size(__u64 *val);
+void lov_fix_desc_stripe_count(__u32 *val);
+void lov_fix_desc_pattern(__u32 *val);
+void lov_fix_desc_qos_maxage(__u32 *val);
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count);
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                   struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                           __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+                  struct obd_uuid *uuidp, int gen);
+/* lov_log.c */
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *tgt, int *idx);
+int lov_llog_finish(struct obd_device *obd, int count);
+
+/* lov_pack.c */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
+              struct lov_stripe_md *lsm);
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_bytes);
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                 struct lov_stripe_md **lsmp, struct lov_user_md *lump);
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+             struct lov_user_md *lump);
+int lov_getstripe(struct obd_export *exp,
+                 struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+                   int pattern, int magic);
+int lov_free_memmd(struct lov_stripe_md **lsmp);
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm_common(int level, void *lmmp);
+void lov_dump_lmm(int level, void *lmm);
+
+/* lov_ea.c */
+struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size);
+void lsm_free_plain(struct lov_stripe_md *lsm);
+
+int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                        struct obdo *oa, void *data);
+/* lproc_lov.c */
+extern struct file_operations lov_proc_target_fops;
+#ifdef LPROCFS
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/* lov_cl.c */
+extern struct lu_device_type lov_device_type;
+
+/* pools */
+extern cfs_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+void lov_pool_putref(struct pool_desc *pool);
+
+static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm)
+{
+       LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+       atomic_inc(&lsm->lsm_refc);
+       return lsm;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c
new file mode 100644 (file)
index 0000000..1a87abd
--- /dev/null
@@ -0,0 +1,967 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static inline void lov_sub_enter(struct lov_io_sub *sub)
+{
+       sub->sub_reenter++;
+}
+
+static inline void lov_sub_exit(struct lov_io_sub *sub)
+{
+       sub->sub_reenter--;
+}
+
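+/*
+ * Finalize one sub-io: shut down the underlying cl_io and release the
+ * sub-environment, except for resources borrowed from the emergency
+ * pool, which stay with the device.
+ */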
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+                           struct lov_io_sub *sub)
+{
+       ENTRY;
+       if (sub->sub_io != NULL) {
+               if (sub->sub_io_initialized) {
+                       lov_sub_enter(sub);
+                       cl_io_fini(sub->sub_env, sub->sub_io);
+                       lov_sub_exit(sub);
+                       sub->sub_io_initialized = 0;
+                       lio->lis_active_subios--;
+               }
+               if (sub->sub_stripe == lio->lis_single_subio_index)
+                       lio->lis_single_subio_index = -1;
+               else if (!sub->sub_borrowed)
+                       OBD_FREE_PTR(sub->sub_io);
+               sub->sub_io = NULL;
+       }
+       if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) {
+               if (!sub->sub_borrowed)
+                       cl_env_put(sub->sub_env, &sub->sub_refcheck);
+               sub->sub_env = NULL;
+       }
+       EXIT;
+}
+
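+/*
+ * Seed a sub-io with the state of its parent io, translating file-level
+ * positions into per-stripe positions where the io type requires it.
+ */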
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+                              int stripe, loff_t start, loff_t end)
+{
+       struct lov_stripe_md *lsm    = lio->lis_object->lo_lsm;
+       struct cl_io         *parent = lio->lis_cl.cis_io;
+
+       switch (io->ci_type) {
+       case CIT_SETATTR: {
+               io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
+               io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid;
+               io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa;
+               if (cl_io_is_trunc(io)) {
+                       loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size;
+
+                       new_size = lov_size_to_stripe(lsm, new_size, stripe);
+                       io->u.ci_setattr.sa_attr.lvb_size = new_size;
+               }
+               break;
+       }
+       case CIT_FAULT: {
+               struct cl_object *obj = parent->ci_obj;
+               loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+               io->u.ci_fault = parent->u.ci_fault;
+               off = lov_size_to_stripe(lsm, off, stripe);
+               io->u.ci_fault.ft_index = cl_index(obj, off);
+               break;
+       }
+       case CIT_FSYNC: {
+               io->u.ci_fsync.fi_start = start;
+               io->u.ci_fsync.fi_end = end;
+               io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa;
+               io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid;
+               io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode;
+               break;
+       }
+       case CIT_READ:
+       case CIT_WRITE: {
+               io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+               if (cl_io_is_append(parent)) {
+                       io->u.ci_wr.wr_append = 1;
+               } else {
+                       io->u.ci_rw.crw_pos = start;
+                       io->u.ci_rw.crw_count = end - start;
+               }
+               break;
+       }
+       default:
+               break;
+       }
+}
+
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+                          struct lov_io_sub *sub)
+{
+       struct lov_object *lov = lio->lis_object;
+       struct lov_device *ld  = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+       struct cl_io      *sub_io;
+       struct cl_object  *sub_obj;
+       struct cl_io      *io  = lio->lis_cl.cis_io;
+
+       int stripe = sub->sub_stripe;
+       int result;
+
+       LASSERT(sub->sub_io == NULL);
+       LASSERT(sub->sub_env == NULL);
+       LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+       ENTRY;
+
+       result = 0;
+       sub->sub_io_initialized = 0;
+       sub->sub_borrowed = 0;
+
+       if (lio->lis_mem_frozen) {
+               LASSERT(mutex_is_locked(&ld->ld_mutex));
+               sub->sub_io  = &ld->ld_emrg[stripe]->emrg_subio;
+               sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+               sub->sub_borrowed = 1;
+       } else {
+               void *cookie;
+
+               /* obtain new environment */
+               cookie = cl_env_reenter();
+               sub->sub_env = cl_env_get(&sub->sub_refcheck);
+               cl_env_reexit(cookie);
+               if (IS_ERR(sub->sub_env))
+                       result = PTR_ERR(sub->sub_env);
+
+               if (result == 0) {
+                       /*
+                        * First sub-io. Use ->lis_single_subio to
+                        * avoid dynamic allocation.
+                        */
+                       if (lio->lis_active_subios == 0) {
+                               sub->sub_io = &lio->lis_single_subio;
+                               lio->lis_single_subio_index = stripe;
+                       } else {
+                               OBD_ALLOC_PTR(sub->sub_io);
+                               if (sub->sub_io == NULL)
+                                       result = -ENOMEM;
+                       }
+               }
+       }
+
+       if (result == 0) {
+               sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+               sub_io  = sub->sub_io;
+
+               sub_io->ci_obj    = sub_obj;
+               sub_io->ci_result = 0;
+
+               sub_io->ci_parent  = io;
+               sub_io->ci_lockreq = io->ci_lockreq;
+               sub_io->ci_type    = io->ci_type;
+               sub_io->ci_no_srvlock = io->ci_no_srvlock;
+
+               lov_sub_enter(sub);
+               result = cl_io_sub_init(sub->sub_env, sub_io,
+                                       io->ci_type, sub_obj);
+               lov_sub_exit(sub);
+               if (result >= 0) {
+                       lio->lis_active_subios++;
+                       sub->sub_io_initialized = 1;
+                       result = 0;
+               }
+       }
+       if (result != 0)
+               lov_io_sub_fini(env, lio, sub);
+       RETURN(result);
+}
+
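+/*
+ * Return the sub-io for @stripe, initializing it lazily on first use.
+ * On success the re-entrancy counter is raised; the caller must drop it
+ * with lov_sub_put().
+ */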
+struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+                              struct lov_io *lio, int stripe)
+{
+       int rc;
+       struct lov_io_sub *sub = &lio->lis_subs[stripe];
+
+       LASSERT(stripe < lio->lis_stripe_count);
+       ENTRY;
+
+       if (!sub->sub_io_initialized) {
+               sub->sub_stripe = stripe;
+               rc = lov_io_sub_init(env, lio, sub);
+       } else
+               rc = 0;
+       if (rc == 0)
+               lov_sub_enter(sub);
+       else
+               sub = ERR_PTR(rc);
+       RETURN(sub);
+}
+
+void lov_sub_put(struct lov_io_sub *sub)
+{
+       lov_sub_exit(sub);
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
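+/* Map a cl_page back to the index of the stripe (lovsub object) it lives on. */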
+static int lov_page_stripe(const struct cl_page *page)
+{
+       struct lovsub_object *subobj;
+
+       ENTRY;
+       subobj = lu2lovsub(
+               lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header,
+                                &lovsub_device_type));
+       LASSERT(subobj != NULL);
+       RETURN(subobj->lso_index);
+}
+
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+                                 const struct cl_page_slice *slice)
+{
+       struct lov_stripe_md *lsm  = lio->lis_object->lo_lsm;
+       struct cl_page       *page = slice->cpl_page;
+       int stripe;
+
+       LASSERT(lio->lis_cl.cis_io != NULL);
+       LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+       LASSERT(lsm != NULL);
+       LASSERT(lio->lis_nr_subios > 0);
+       ENTRY;
+
+       stripe = lov_page_stripe(page);
+       RETURN(lov_sub_get(env, lio, stripe));
+}
+
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+                            struct cl_io *io)
+{
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       int result;
+
+       LASSERT(lio->lis_object != NULL);
+       ENTRY;
+
+       /*
+        * Need to be optimized, we can't afford to allocate a piece of memory
+        * when writing a page. -jay
+        */
+       OBD_ALLOC_LARGE(lio->lis_subs,
+                       lsm->lsm_stripe_count * sizeof lio->lis_subs[0]);
+       if (lio->lis_subs != NULL) {
+               lio->lis_nr_subios = lio->lis_stripe_count;
+               lio->lis_single_subio_index = -1;
+               lio->lis_active_subios = 0;
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static void lov_io_slice_init(struct lov_io *lio,
+                             struct lov_object *obj, struct cl_io *io)
+{
+       ENTRY;
+
+       io->ci_result = 0;
+       lio->lis_object = obj;
+
+       LASSERT(obj->lo_lsm != NULL);
+       lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count;
+
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               lio->lis_pos = io->u.ci_rw.crw_pos;
+               lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+               lio->lis_io_endpos = lio->lis_endpos;
+               if (cl_io_is_append(io)) {
+                       LASSERT(io->ci_type == CIT_WRITE);
+                       lio->lis_pos = 0;
+                       lio->lis_endpos = OBD_OBJECT_EOF;
+               }
+               break;
+
+       case CIT_SETATTR:
+               if (cl_io_is_trunc(io))
+                       lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size;
+               else
+                       lio->lis_pos = 0;
+               lio->lis_endpos = OBD_OBJECT_EOF;
+               break;
+
+       case CIT_FAULT: {
+               pgoff_t index = io->u.ci_fault.ft_index;
+               lio->lis_pos = cl_offset(io->ci_obj, index);
+               lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+               break;
+       }
+
+       case CIT_FSYNC: {
+               lio->lis_pos = io->u.ci_fsync.fi_start;
+               lio->lis_endpos = io->u.ci_fsync.fi_end;
+               break;
+       }
+
+       case CIT_MISC:
+               lio->lis_pos = 0;
+               lio->lis_endpos = OBD_OBJECT_EOF;
+               break;
+
+       default:
+               LBUG();
+       }
+
+       EXIT;
+}
+
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       struct lov_object *lov = cl2lov(ios->cis_obj);
+       int i;
+
+       ENTRY;
+       if (lio->lis_subs != NULL) {
+               for (i = 0; i < lio->lis_nr_subios; i++)
+                       lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+               OBD_FREE_LARGE(lio->lis_subs,
+                        lio->lis_nr_subios * sizeof lio->lis_subs[0]);
+               lio->lis_nr_subios = 0;
+       }
+
+       LASSERT(atomic_read(&lov->lo_active_ios) > 0);
+       if (atomic_dec_and_test(&lov->lo_active_ios))
+               wake_up_all(&lov->lo_waitq);
+       EXIT;
+}
+
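+/* Shift an offset by @delta while preserving the OBD_OBJECT_EOF sentinel. */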
+static obd_off lov_offset_mod(obd_off val, int delta)
+{
+       if (val != OBD_OBJECT_EOF)
+               val += delta;
+       return val;
+}
+
+static int lov_io_iter_init(const struct lu_env *env,
+                           const struct cl_io_slice *ios)
+{
+       struct lov_io        *lio = cl2lov_io(env, ios);
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       struct lov_io_sub    *sub;
+       obd_off endpos;
+       obd_off start;
+       obd_off end;
+       int stripe;
+       int rc = 0;
+
+       ENTRY;
+       endpos = lov_offset_mod(lio->lis_endpos, -1);
+       for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+               if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+                                          endpos, &start, &end))
+                       continue;
+
+               end = lov_offset_mod(end, +1);
+               sub = lov_sub_get(env, lio, stripe);
+               if (!IS_ERR(sub)) {
+                       lov_io_sub_inherit(sub->sub_io, lio, stripe,
+                                          start, end);
+                       rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+                       lov_sub_put(sub);
+                       CDEBUG(D_VFSTRACE, "shrink: %d ["LPU64", "LPU64")\n",
+                              stripe, start, end);
+               } else
+                       rc = PTR_ERR(sub);
+
+               if (!rc)
+                       list_add_tail(&sub->sub_linkage, &lio->lis_active);
+               else
+                       break;
+       }
+       RETURN(rc);
+}
+
+static int lov_io_rw_iter_init(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+       struct lov_io        *lio = cl2lov_io(env, ios);
+       struct cl_io         *io  = ios->cis_io;
+       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
+       loff_t start = io->u.ci_rw.crw_pos;
+       loff_t next;
+       unsigned long ssize = lsm->lsm_stripe_size;
+
+       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+       ENTRY;
+
+       /* fast path for common case. */
+       if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+               lov_do_div64(start, ssize);
+               next = (start + 1) * ssize;
+               if (next <= start * ssize)
+                       next = ~0ull;
+
+               io->ci_continue = next < lio->lis_io_endpos;
+               io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+                                             next) - io->u.ci_rw.crw_pos;
+               lio->lis_pos    = io->u.ci_rw.crw_pos;
+               lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+               CDEBUG(D_VFSTRACE, "stripe: "LPU64" chunk: ["LPU64", "LPU64") "
+                      LPU64"\n", (__u64)start, lio->lis_pos, lio->lis_endpos,
+                      (__u64)lio->lis_io_endpos);
+       }
+       /*
+        * XXX The following call should be optimized: we know, that
+        * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+        */
+       RETURN(lov_io_iter_init(env, ios));
+}
+
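+/*
+ * Apply @iofunc to every active sub-io, propagating the first per-stripe
+ * ci_result to the parent io and stopping on the first error returned by
+ * @iofunc itself.
+ */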
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+                      int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+       struct cl_io *parent = lio->lis_cl.cis_io;
+       struct lov_io_sub *sub;
+       int rc = 0;
+
+       ENTRY;
+       list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+               lov_sub_enter(sub);
+               rc = iofunc(sub->sub_env, sub->sub_io);
+               lov_sub_exit(sub);
+               if (rc)
+                       break;
+
+               if (parent->ci_result == 0)
+                       parent->ci_result = sub->sub_io->ci_result;
+       }
+       RETURN(rc);
+}
+
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       ENTRY;
+       RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock));
+}
+
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       ENTRY;
+       RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start));
+}
+
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       ENTRY;
+       /*
+        * It's possible that lov_io_start() wasn't called against this
+        * sub-io, either because previous sub-io failed, or upper layer
+        * completed IO.
+        */
+       if (io->ci_state == CIS_IO_GOING)
+               cl_io_end(env, io);
+       else
+               io->ci_state = CIS_IO_FINISHED;
+       RETURN(0);
+}
+
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       cl_io_iter_fini(env, io);
+       RETURN(0);
+}
+
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+       cl_io_unlock(env, io);
+       RETURN(0);
+}
+
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+       int rc;
+
+       rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper);
+       LASSERT(rc == 0);
+}
+
+static void lov_io_iter_fini(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       int rc;
+
+       ENTRY;
+       rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+       LASSERT(rc == 0);
+       while (!list_empty(&lio->lis_active))
+               list_del_init(lio->lis_active.next);
+       EXIT;
+}
+
+static void lov_io_unlock(const struct lu_env *env,
+                         const struct cl_io_slice *ios)
+{
+       int rc;
+
+       ENTRY;
+       rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper);
+       LASSERT(rc == 0);
+       EXIT;
+}
+
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+                                             struct cl_page_list *qin,
+                                             int idx, int alloc)
+{
+       return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list;
+}
+
+/**
+ * lov implementation of cl_operations::cio_submit() method. It takes a list
+ * of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on underlying devices to submit sub-lists, and then splices
+ * everything back.
+ *
+ * Major complication of this function is a need to handle memory cleansing:
+ * cl_io_submit() is called to write out pages as a part of VM memory
+ * reclamation, and hence it may not fail due to memory shortages (system
+ * dead-locks otherwise). To deal with this, some resources (sub-lists,
+ * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a
+ * not-memory cleansing context), and in case of memory shortage, these
+ * pre-allocated resources are used by lov_io_submit() under
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+                        const struct cl_io_slice *ios,
+                        enum cl_req_type crt, struct cl_2queue *queue)
+{
+       struct lov_io       *lio  = cl2lov_io(env, ios);
+       struct lov_object   *obj  = lio->lis_object;
+       struct lov_device   *ld   = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+       struct cl_page_list *qin  = &queue->c2_qin;
+       struct cl_2queue    *cl2q = &lov_env_info(env)->lti_cl2q;
+       struct cl_page_list *stripes_qin = NULL;
+       struct cl_page *page;
+       struct cl_page *tmp;
+       int stripe;
+
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+       int rc = 0;
+       int alloc =
+               !(current->flags & PF_MEMALLOC);
+       ENTRY;
+       if (lio->lis_active_subios == 1) {
+               int idx = lio->lis_single_subio_index;
+               struct lov_io_sub *sub;
+
+               LASSERT(idx < lio->lis_nr_subios);
+               sub = lov_sub_get(env, lio, idx);
+               LASSERT(!IS_ERR(sub));
+               LASSERT(sub->sub_io == &lio->lis_single_subio);
+               rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+                                    crt, queue);
+               lov_sub_put(sub);
+               RETURN(rc);
+       }
+
+       LASSERT(lio->lis_subs != NULL);
+       if (alloc) {
+               OBD_ALLOC_LARGE(stripes_qin,
+                               sizeof(*stripes_qin) * lio->lis_nr_subios);
+               if (stripes_qin == NULL)
+                       RETURN(-ENOMEM);
+
+               for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+                       cl_page_list_init(&stripes_qin[stripe]);
+       } else {
+               /*
+                * If we get here, it means pageout & swap don't help.
+                * To avoid making things worse, don't even try to
+                * allocate memory with __GFP_NOWARN. -jay
+                */
+               mutex_lock(&ld->ld_mutex);
+               lio->lis_mem_frozen = 1;
+       }
+
+       cl_2queue_init(cl2q);
+       cl_page_list_for_each_safe(page, tmp, qin) {
+               stripe = lov_page_stripe(page);
+               cl_page_list_move(QIN(stripe), qin, page);
+       }
+
+       for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+               struct lov_io_sub   *sub;
+               struct cl_page_list *sub_qin = QIN(stripe);
+
+               if (list_empty(&sub_qin->pl_pages))
+                       continue;
+
+               cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+               sub = lov_sub_get(env, lio, stripe);
+               if (!IS_ERR(sub)) {
+                       rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+                                            crt, cl2q);
+                       lov_sub_put(sub);
+               } else
+                       rc = PTR_ERR(sub);
+               cl_page_list_splice(&cl2q->c2_qin,  &queue->c2_qin);
+               cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+               if (rc != 0)
+                       break;
+       }
+
+       for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+               struct cl_page_list *sub_qin = QIN(stripe);
+
+               if (list_empty(&sub_qin->pl_pages))
+                       continue;
+
+               cl_page_list_splice(sub_qin, qin);
+       }
+
+       if (alloc) {
+               OBD_FREE_LARGE(stripes_qin,
+                        sizeof(*stripes_qin) * lio->lis_nr_subios);
+       } else {
+               int i;
+
+               for (i = 0; i < lio->lis_nr_subios; i++) {
+                       struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+                       if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+                               lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+               }
+               lio->lis_mem_frozen = 0;
+               mutex_unlock(&ld->ld_mutex);
+       }
+
+       RETURN(rc);
+#undef QIN
+}
+
+static int lov_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct lov_io     *lio      = cl2lov_io(env, ios);
+       struct cl_page    *sub_page = lov_sub_page(slice);
+       struct lov_io_sub *sub;
+       int result;
+
+       ENTRY;
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               result = cl_io_prepare_write(sub->sub_env, sub->sub_io,
+                                            sub_page, from, to);
+               lov_sub_put(sub);
+       } else
+               result = PTR_ERR(sub);
+       RETURN(result);
+}
+
+static int lov_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct lov_io     *lio      = cl2lov_io(env, ios);
+       struct cl_page    *sub_page = lov_sub_page(slice);
+       struct lov_io_sub *sub;
+       int result;
+
+       ENTRY;
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               result = cl_io_commit_write(sub->sub_env, sub->sub_io,
+                                           sub_page, from, to);
+               lov_sub_put(sub);
+       } else
+               result = PTR_ERR(sub);
+       RETURN(result);
+}
+
+static int lov_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_fault_io *fio;
+       struct lov_io      *lio;
+       struct lov_io_sub  *sub;
+
+       ENTRY;
+       fio = &ios->cis_io->u.ci_fault;
+       lio = cl2lov_io(env, ios);
+       sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+       sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+       lov_sub_put(sub);
+       RETURN(lov_io_start(env, ios));
+}
+
+static void lov_io_fsync_end(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct lov_io *lio = cl2lov_io(env, ios);
+       struct lov_io_sub *sub;
+       unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written;
+       ENTRY;
+
+       *written = 0;
+       list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+               struct cl_io *subio = sub->sub_io;
+
+               lov_sub_enter(sub);
+               lov_io_end_wrapper(sub->sub_env, subio);
+               lov_sub_exit(sub);
+
+               if (subio->ci_result == 0)
+                       *written += subio->u.ci_fsync.fi_nr_written;
+       }
+       RETURN_EXIT;
+}
+
+static const struct cl_io_operations lov_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_rw_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_rw_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_fault_start,
+                       .cio_end       = lov_io_end
+               },
+               [CIT_FSYNC] = {
+                       .cio_fini      = lov_io_fini,
+                       .cio_iter_init = lov_io_iter_init,
+                       .cio_iter_fini = lov_io_iter_fini,
+                       .cio_lock      = lov_io_lock,
+                       .cio_unlock    = lov_io_unlock,
+                       .cio_start     = lov_io_start,
+                       .cio_end       = lov_io_fsync_end
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = lov_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = lov_io_submit
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = lov_io_submit
+                }
+        },
+       .cio_prepare_write = lov_io_prepare_write,
+       .cio_commit_write  = lov_io_commit_write
+};
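
The table above is a plain function-pointer vector indexed by io type, through which the generic cl_io code dispatches each phase. A tiny self-contained sketch of the same pattern (the types and handlers here are illustrative only):

#include <stdio.h>

enum io_type { IOT_READ, IOT_WRITE, IOT_NR };

struct io_ops {
	void (*start)(void);
	void (*end)(void);
};

static void generic_start(void) { printf("start\n"); }
static void generic_end(void)   { printf("end\n"); }

/* per-type vector, analogous to lov_io_ops.op[] above */
static const struct io_ops ops[IOT_NR] = {
	[IOT_READ]  = { .start = generic_start, .end = generic_end },
	[IOT_WRITE] = { .start = generic_start, .end = generic_end },
};

int main(void)
{
	enum io_type t = IOT_WRITE;

	ops[t].start();		/* generic code dispatches on io type */
	ops[t].end();
	return 0;
}
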
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+static void lov_empty_io_fini(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct lov_object *lov = cl2lov(ios->cis_obj);
+       ENTRY;
+
+       if (atomic_dec_and_test(&lov->lo_active_ios))
+               wake_up_all(&lov->lo_waitq);
+       EXIT;
+}
+
+static void lov_empty_impossible(const struct lu_env *env,
+                                struct cl_io_slice *ios)
+{
+       LBUG();
+}
+
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible)
+
+/**
+ * An io operation vector for files without stripes.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_fini       = lov_empty_io_fini,
+#if 0
+                       .cio_iter_init  = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock       = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end        = LOV_EMPTY_IMPOSSIBLE
+#endif
+               },
+               [CIT_WRITE] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_SETATTR] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_FAULT] = {
+                       .cio_fini      = lov_empty_io_fini,
+                       .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                       .cio_end       = LOV_EMPTY_IMPOSSIBLE
+               },
+               [CIT_FSYNC] = {
+                       .cio_fini   = lov_empty_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = lov_empty_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                }
+        },
+       .cio_commit_write = LOV_EMPTY_IMPOSSIBLE
+};
+
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                     struct cl_io *io)
+{
+       struct lov_io       *lio = lov_env_io(env);
+       struct lov_object   *lov = cl2lov(obj);
+
+       ENTRY;
+       INIT_LIST_HEAD(&lio->lis_active);
+       lov_io_slice_init(lio, lov, io);
+       if (io->ci_result == 0) {
+               io->ci_result = lov_io_subio_init(env, lio, io);
+               if (io->ci_result == 0) {
+                       cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+                       atomic_inc(&lov->lo_active_ios);
+               }
+       }
+       RETURN(io->ci_result);
+}
+
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+                     struct cl_io *io)
+{
+       struct lov_object *lov = cl2lov(obj);
+       struct lov_io *lio = lov_env_io(env);
+       int result;
+       ENTRY;
+
+       lio->lis_object = lov;
+       switch (io->ci_type) {
+       default:
+               LBUG();
+       case CIT_MISC:
+       case CIT_READ:
+               result = 0;
+               break;
+       case CIT_FSYNC:
+       case CIT_SETATTR:
+               result = +1;
+               break;
+       case CIT_WRITE:
+               result = -EBADF;
+               break;
+       case CIT_FAULT:
+               result = -EFAULT;
+               CERROR("Page fault on a file without stripes: "DFID"\n",
+                      PFID(lu_object_fid(&obj->co_lu)));
+               break;
+       }
+       if (result == 0) {
+               cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+               atomic_inc(&lov->lo_active_ios);
+       }
+
+       io->ci_result = result < 0 ? result : 0;
+       RETURN(result != 0);
+}
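
Note the return convention: a negative result is an error propagated through ci_result, a positive result means the io is trivially complete at this layer, and either way the non-zero return value stops further initialization. A small sketch of that convention, with made-up type codes and error value:

#include <stdio.h>

static int init_empty(int type, int *ci_result)
{
	int result;

	switch (type) {
	case 0:
		result = 0;	/* proceed with normal initialization */
		break;
	case 1:
		result = +1;	/* trivially complete, not an error */
		break;
	default:
		result = -9;	/* hard error, e.g. -EBADF */
		break;
	}
	*ci_result = result < 0 ? result : 0;
	return result != 0;	/* non-zero stops further init */
}

int main(void)
{
	int t;

	for (t = 0; t < 3; t++) {
		int ci;
		int stop = init_empty(t, &ci);

		printf("type %d: stop=%d ci_result=%d\n", t, stop, ci);
	}
	return 0;
}
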
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c
new file mode 100644 (file)
index 0000000..bdf3334
--- /dev/null
@@ -0,0 +1,1253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                              struct cl_lock *parent);
+
+static int lov_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env,
+                                                  struct cl_lock *parent,
+                                                  struct lov_lock_sub *lls)
+{
+       struct lov_sublock_env *subenv;
+       struct lov_io     *lio    = lov_env_io(env);
+       struct cl_io       *io     = lio->lis_cl.cis_io;
+       struct lov_io_sub      *sub;
+
+       subenv = &lov_env_session(env)->ls_subenv;
+
+       /*
+        * FIXME: We tend to use the subio's env & io to call the sublock
+        * lock operations because an osc lock sometimes stores some control
+        * variables in the thread's IO information (currently only lockless
+        * information). However, if the lock's host (object) is different
+        * from the object of the current IO, we have no way to get the
+        * subenv and subio, because they are not initialized at all. As a
+        * temporary fix, in this case we still borrow the parent's env to
+        * call the sublock operations.
+        */
+       if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) {
+               subenv->lse_env = env;
+               subenv->lse_io  = io;
+               subenv->lse_sub = NULL;
+       } else {
+               sub = lov_sub_get(env, lio, lls->sub_stripe);
+               if (!IS_ERR(sub)) {
+                       subenv->lse_env = sub->sub_env;
+                       subenv->lse_io  = sub->sub_io;
+                       subenv->lse_sub = sub;
+               } else {
+                       subenv = (void*)sub;
+               }
+       }
+       return subenv;
+}
+
+static void lov_sublock_env_put(struct lov_sublock_env *subenv)
+{
+       if (subenv && subenv->lse_sub)
+               lov_sub_put(subenv->lse_sub);
+}
+
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+                             struct cl_lock *sublock, int idx,
+                             struct lov_lock_link *link)
+{
+       struct lovsub_lock *lsl;
+       struct cl_lock     *parent = lck->lls_cl.cls_lock;
+       int              rc;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       LASSERT(cl_lock_is_mutexed(sublock));
+       ENTRY;
+
+       lsl = cl2sub_lock(sublock);
+       /*
+        * Check that the sub-lock doesn't already have a lock link to this
+        * top-lock.
+        */
+       LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+       LASSERT(idx < lck->lls_nr);
+
+       lck->lls_sub[idx].sub_lock = lsl;
+       lck->lls_nr_filled++;
+       LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+       list_add_tail(&link->lll_list, &lsl->lss_parents);
+       link->lll_idx = idx;
+       link->lll_super = lck;
+       cl_lock_get(parent);
+       lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+       lck->lls_sub[idx].sub_flags |= LSF_HELD;
+       cl_lock_user_add(env, sublock);
+
+       rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+       LASSERT(rc == 0); /* there is no way this can fail, currently */
+       EXIT;
+}
+
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+                                        const struct cl_io *io,
+                                        struct lov_lock *lck,
+                                        int idx, struct lov_lock_link **out)
+{
+       struct cl_lock       *sublock;
+       struct cl_lock       *parent;
+       struct lov_lock_link *link;
+
+       LASSERT(idx < lck->lls_nr);
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, __GFP_IO);
+       if (link != NULL) {
+               struct lov_sublock_env *subenv;
+               struct lov_lock_sub  *lls;
+               struct cl_lock_descr *descr;
+
+               parent = lck->lls_cl.cls_lock;
+               lls    = &lck->lls_sub[idx];
+               descr  = &lls->sub_got;
+
+               subenv = lov_sublock_env_get(env, parent, lls);
+               if (!IS_ERR(subenv)) {
+                       /* CAVEAT: Don't try to add a field in lov_lock_sub
+                        * to remember the subio. This is because a lock can
+                        * be cached, but an IO cannot. This further means a
+                        * sublock might be referenced in different IO
+                        * contexts. -jay */
+
+                       sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io,
+                                              descr, "lov-parent", parent);
+                       lov_sublock_env_put(subenv);
+               } else {
+                       /* an error occurred */
+                       sublock = (void*)subenv;
+               }
+
+               if (!IS_ERR(sublock))
+                       *out = link;
+               else
+                       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+       } else
+               sublock = ERR_PTR(-ENOMEM);
+       RETURN(sublock);
+}
+
+static void lov_sublock_unlock(const struct lu_env *env,
+                              struct lovsub_lock *lsl,
+                              struct cl_lock_closure *closure,
+                              struct lov_sublock_env *subenv)
+{
+       ENTRY;
+       lov_sublock_env_put(subenv);
+       lsl->lss_active = NULL;
+       cl_lock_disclosure(env, closure);
+       EXIT;
+}
+
+static int lov_sublock_lock(const struct lu_env *env,
+                           struct lov_lock *lck,
+                           struct lov_lock_sub *lls,
+                           struct cl_lock_closure *closure,
+                           struct lov_sublock_env **lsep)
+{
+       struct lovsub_lock *sublock;
+       struct cl_lock     *child;
+       int              result = 0;
+       ENTRY;
+
+       LASSERT(list_empty(&closure->clc_list));
+
+       sublock = lls->sub_lock;
+       child = sublock->lss_cl.cls_lock;
+       result = cl_lock_closure_build(env, child, closure);
+       if (result == 0) {
+               struct cl_lock *parent = closure->clc_origin;
+
+               LASSERT(cl_lock_is_mutexed(child));
+               sublock->lss_active = parent;
+
+               if (unlikely((child->cll_state == CLS_FREEING) ||
+                            (child->cll_flags & CLF_CANCELLED))) {
+                       struct lov_lock_link *link;
+                       /*
+                        * We could race with lock deletion, which temporarily
+                        * puts the lock into the freeing state; see bug 19080.
+                        */
+                       LASSERT(!(lls->sub_flags & LSF_HELD));
+
+                       link = lov_lock_link_find(env, lck, sublock);
+                       LASSERT(link != NULL);
+                       lov_lock_unlink(env, link, sublock);
+                       lov_sublock_unlock(env, sublock, closure, NULL);
+                       lck->lls_cancel_race = 1;
+                       result = CLO_REPEAT;
+               } else if (lsep) {
+                       struct lov_sublock_env *subenv;
+                       subenv = lov_sublock_env_get(env, parent, lls);
+                       if (IS_ERR(subenv)) {
+                               lov_sublock_unlock(env, sublock,
+                                                  closure, NULL);
+                               result = PTR_ERR(subenv);
+                       } else {
+                               *lsep = subenv;
+                       }
+               }
+       }
+       RETURN(result);
+}
+
+/**
+ * Updates the result of a top-lock operation from a result of sub-lock
+ * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate
+ * over sub-locks and lov_subresult() is used to calculate return value of a
+ * top-operation. To this end, possible return values of sub-operations are
+ * ordered as
+ *
+ *     - 0          success
+ *     - CLO_WAIT   wait for event
+ *     - CLO_REPEAT repeat top-operation
+ *     - -ve        fundamental error
+ *
+ * Top-level return code can only go down through this list. CLO_REPEAT
+ * overwrites CLO_WAIT, because lock mutex was released and sleeping condition
+ * has to be rechecked by the upper layer.
+ */
+static int lov_subresult(int result, int rc)
+{
+       int result_rank;
+       int rc_rank;
+
+       ENTRY;
+
+       LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT,
+                "result = %d\n", result);
+       LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT,
+                "rc = %d\n", rc);
+       CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+       /* calculate ranks in the ordering above */
+       result_rank = result < 0 ? 1 + CLO_REPEAT : result;
+       rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc;
+
+       if (result_rank < rc_rank)
+               result = rc;
+       RETURN(result);
+}
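
The ranking can be exercised in isolation. A minimal user-space sketch follows, where the concrete values of CLO_WAIT and CLO_REPEAT are assumed only to satisfy the ordering that the CLASSERT above enforces:

#include <assert.h>

#define CLO_WAIT   1	/* assumed value, only the ordering matters */
#define CLO_REPEAT 2	/* assumed value, only the ordering matters */

/* errors outrank CLO_REPEAT, which outranks CLO_WAIT, which outranks 0 */
static int rank(int v)
{
	return v < 0 ? 1 + CLO_REPEAT : v;
}

static int subresult(int result, int rc)
{
	return rank(result) < rank(rc) ? rc : result;
}

int main(void)
{
	assert(subresult(0, CLO_WAIT) == CLO_WAIT);
	assert(subresult(CLO_WAIT, CLO_REPEAT) == CLO_REPEAT);
	assert(subresult(CLO_REPEAT, -5) == -5);
	assert(subresult(-5, CLO_WAIT) == -5);	/* the error sticks */
	return 0;
}
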
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+                            struct lov_lock *lck, const struct cl_io *io)
+{
+       int result = 0;
+       int i;
+       int nr;
+       obd_off start;
+       obd_off end;
+       obd_off file_start;
+       obd_off file_end;
+
+       struct lov_object       *loo    = cl2lov(lck->lls_cl.cls_obj);
+       struct lov_layout_raid0 *r0     = lov_r0(loo);
+       struct cl_lock    *parent = lck->lls_cl.cls_lock;
+
+       ENTRY;
+
+       lck->lls_orig = parent->cll_descr;
+       file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+       file_end   = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+       for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+               /*
+                * XXX for wide striping a smarter algorithm is desirable,
+                * breaking out of the loop early.
+                */
+               if (lov_stripe_intersects(loo->lo_lsm, i,
+                                         file_start, file_end, &start, &end))
+                       nr++;
+       }
+       LASSERT(nr > 0);
+       OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof lck->lls_sub[0]);
+       if (lck->lls_sub == NULL)
+               RETURN(-ENOMEM);
+
+       lck->lls_nr = nr;
+       /*
+        * First, fill in sub-lock descriptions in
+        * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc()
+        * (called below in this function, and by lov_lock_enqueue()) to
+        * create sub-locks. At this moment, no other thread can access
+        * top-lock.
+        */
+       for (i = 0, nr = 0; i < r0->lo_nr; ++i) {
+               if (lov_stripe_intersects(loo->lo_lsm, i,
+                                         file_start, file_end, &start, &end)) {
+                       struct cl_lock_descr *descr;
+
+                       descr = &lck->lls_sub[nr].sub_descr;
+
+                       LASSERT(descr->cld_obj == NULL);
+                       descr->cld_obj   = lovsub2cl(r0->lo_sub[i]);
+                       descr->cld_start = cl_index(descr->cld_obj, start);
+                       descr->cld_end   = cl_index(descr->cld_obj, end);
+                       descr->cld_mode  = parent->cll_descr.cld_mode;
+                       descr->cld_gid   = parent->cll_descr.cld_gid;
+                       descr->cld_enq_flags   = parent->cll_descr.cld_enq_flags;
+                       /* XXX has no effect */
+                       lck->lls_sub[nr].sub_got = *descr;
+                       lck->lls_sub[nr].sub_stripe = i;
+                       nr++;
+               }
+       }
+       LASSERT(nr == lck->lls_nr);
+       /*
+        * Then, create sub-locks. Once at least one sub-lock was created,
+        * top-lock can be reached by other threads.
+        */
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct cl_lock       *sublock;
+               struct lov_lock_link *link;
+
+               if (lck->lls_sub[i].sub_lock == NULL) {
+                       sublock = lov_sublock_alloc(env, io, lck, i, &link);
+                       if (IS_ERR(sublock)) {
+                               result = PTR_ERR(sublock);
+                               break;
+                       }
+                       cl_lock_get_trust(sublock);
+                       cl_lock_mutex_get(env, sublock);
+                       cl_lock_mutex_get(env, parent);
+                       /*
+                        * recheck under mutex that sub-lock wasn't created
+                        * concurrently, and that top-lock is still alive.
+                        */
+                       if (lck->lls_sub[i].sub_lock == NULL &&
+                           parent->cll_state < CLS_FREEING) {
+                               lov_sublock_adopt(env, lck, sublock, i, link);
+                               cl_lock_mutex_put(env, parent);
+                       } else {
+                               OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+                               cl_lock_mutex_put(env, parent);
+                               cl_lock_unhold(env, sublock,
+                                              "lov-parent", parent);
+                       }
+                       cl_lock_mutex_put(env, sublock);
+                       cl_lock_put(env, sublock);
+               }
+       }
+       /*
+        * Some sub-locks can be missing at this point. This is not a problem,
+        * because enqueue will create them anyway. The main duty of this
+        * function is to fill in sub-lock descriptions in a race-free manner.
+        */
+       RETURN(result);
+}
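
The function uses a count-then-fill idiom: one pass over the stripes to size the sub-lock array exactly, then a second pass to fill in the descriptors. A simplified user-space sketch, with intersects() as a hypothetical stand-in for lov_stripe_intersects():

#include <stdio.h>
#include <stdlib.h>

struct sub_descr {
	int stripe;
};

/* hypothetical stand-in for lov_stripe_intersects() */
static int intersects(int stripe, long start, long end)
{
	return start <= end && (stripe % 2) == 0;	/* even stripes match */
}

int main(void)
{
	int stripes = 8, i, nr = 0;
	long start = 0, end = 100;
	struct sub_descr *sub;

	/* pass 1: count intersecting stripes to size the array exactly */
	for (i = 0; i < stripes; i++)
		if (intersects(i, start, end))
			nr++;

	sub = calloc(nr, sizeof(*sub));
	if (sub == NULL)
		return 1;

	/* pass 2: fill in one descriptor per intersecting stripe */
	for (i = 0, nr = 0; i < stripes; i++)
		if (intersects(i, start, end))
			sub[nr++].stripe = i;

	for (i = 0; i < nr; i++)
		printf("sub-lock %d -> stripe %d\n", i, sub[i].stripe);
	free(sub);
	return 0;
}
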
+
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+                              int i, int deluser, int rc)
+{
+       struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       ENTRY;
+
+       if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+               struct cl_lock    *sublock;
+               int dying;
+
+               LASSERT(lck->lls_sub[i].sub_lock != NULL);
+               sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+               LASSERT(cl_lock_is_mutexed(sublock));
+
+               lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+               if (deluser)
+                       cl_lock_user_del(env, sublock);
+               /*
+                * If the last hold is released, and cancellation is pending
+                * for a sub-lock, release parent mutex, to avoid keeping it
+                * while sub-lock is being paged out.
+                */
+               dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+                        sublock->cll_descr.cld_mode == CLM_GROUP ||
+                        (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+                       sublock->cll_holds == 1;
+               if (dying)
+                       cl_lock_mutex_put(env, parent);
+               cl_lock_unhold(env, sublock, "lov-parent", parent);
+               if (dying) {
+                       cl_lock_mutex_get(env, parent);
+                       rc = lov_subresult(rc, CLO_REPEAT);
+               }
+               /*
+                * From now on lck->lls_sub[i].sub_lock is a "weak" pointer,
+                * not backed by a reference on a
+                * sub-lock. lovsub_lock_delete() will clear
+                * lck->lls_sub[i].sub_lock under semaphores, just before
+                * sub-lock is destroyed.
+                */
+       }
+       RETURN(rc);
+}
+
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+                            int i)
+{
+       struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       ENTRY;
+
+       if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) {
+               struct cl_lock *sublock;
+
+               LASSERT(lck->lls_sub[i].sub_lock != NULL);
+               sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+               LASSERT(cl_lock_is_mutexed(sublock));
+               LASSERT(sublock->cll_state != CLS_FREEING);
+
+               lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+               cl_lock_get_trust(sublock);
+               cl_lock_hold_add(env, sublock, "lov-parent", parent);
+               cl_lock_user_add(env, sublock);
+               cl_lock_put(env, sublock);
+       }
+       EXIT;
+}
+
+static void lov_lock_fini(const struct lu_env *env,
+                         struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck;
+       int i;
+
+       ENTRY;
+       lck = cl2lov_lock(slice);
+       LASSERT(lck->lls_nr_filled == 0);
+       if (lck->lls_sub != NULL) {
+               for (i = 0; i < lck->lls_nr; ++i)
+                       /*
+                        * No sub-locks exist at this point, as a sub-lock
+                        * holds a reference on its parent.
+                        */
+                       LASSERT(lck->lls_sub[i].sub_lock == NULL);
+               OBD_FREE_LARGE(lck->lls_sub,
+                              lck->lls_nr * sizeof lck->lls_sub[0]);
+       }
+       OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+       EXIT;
+}
+
+static int lov_lock_enqueue_wait(const struct lu_env *env,
+                                struct lov_lock *lck,
+                                struct cl_lock *sublock)
+{
+       struct cl_lock *lock = lck->lls_cl.cls_lock;
+       int          result;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+
+       cl_lock_mutex_put(env, lock);
+       result = cl_lock_enqueue_wait(env, sublock, 0);
+       cl_lock_mutex_get(env, lock);
+       RETURN(result ?: CLO_REPEAT);
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval -ve otherwise.
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+                               struct cl_lock *sublock,
+                               struct cl_io *io, __u32 enqflags, int last)
+{
+       int result;
+       ENTRY;
+
+       /* first, try to enqueue a sub-lock ... */
+       result = cl_enqueue_try(env, sublock, io, enqflags);
+       if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) {
+               /* if it is enqueued, try to `wait' on it---maybe it's already
+                * granted */
+               result = cl_wait_try(env, sublock);
+               if (result == CLO_REENQUEUED)
+                       result = CLO_WAIT;
+       }
+       /*
+        * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+        * parallel, otherwise---enqueue has to wait until sub-lock is granted
+        * before proceeding to the next one.
+        */
+       if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) &&
+           (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL)))
+               result = 0;
+       RETURN(result);
+}
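
The CEF_ASYNC rule above means that, in an asynchronous enqueue, only the last sub-lock forces the caller to wait; each earlier CLO_WAIT is converted to success so the loop can continue. A reduced sketch of just that decision (the CEF_AGL and cll_state conditions are omitted for brevity, and the CLO_WAIT value is assumed):

#include <stdio.h>

#define CLO_WAIT 1	/* assumed value for illustration */

static int enqueue_one(int granted, int async, int last)
{
	int result = granted ? 0 : CLO_WAIT;

	/* with an async enqueue, only the last sub-lock forces a wait */
	if (result == CLO_WAIT && async && !last)
		result = 0;
	return result;
}

int main(void)
{
	int i, nr = 3;

	for (i = 0; i < nr; i++)
		printf("sub-lock %d (async): %d\n", i,
		       enqueue_one(0, 1, i == nr - 1));
	/* prints 0, 0, 1: only the last sub-lock reports CLO_WAIT */
	return 0;
}
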
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+                           struct cl_io *io, struct lov_lock *lck, int idx)
+{
+       struct lov_lock_link *link;
+       struct cl_lock       *sublock;
+       int                result;
+
+       LASSERT(parent->cll_depth == 1);
+       cl_lock_mutex_put(env, parent);
+       sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+       if (!IS_ERR(sublock))
+               cl_lock_mutex_get(env, sublock);
+       cl_lock_mutex_get(env, parent);
+
+       if (!IS_ERR(sublock)) {
+               cl_lock_get_trust(sublock);
+               if (parent->cll_state == CLS_QUEUING &&
+                   lck->lls_sub[idx].sub_lock == NULL) {
+                       lov_sublock_adopt(env, lck, sublock, idx, link);
+               } else {
+                       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+                       /* another thread allocated the sub-lock, or the
+                        * enqueue is no longer in progress */
+                       cl_lock_mutex_put(env, parent);
+                       cl_lock_unhold(env, sublock, "lov-parent", parent);
+                       cl_lock_mutex_get(env, parent);
+               }
+               cl_lock_mutex_put(env, sublock);
+               cl_lock_put(env, sublock);
+               result = CLO_REPEAT;
+       } else
+               result = PTR_ERR(sublock);
+       return result;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *io, __u32 enqflags)
+{
+       struct cl_lock   *lock    = slice->cls_lock;
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, lock);
+       int i;
+       int result;
+       enum cl_lock_state minstate;
+
+       ENTRY;
+
+       for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct lov_lock_sub    *lls;
+               struct cl_lock   *sublock;
+               struct lov_sublock_env *subenv;
+
+               if (lock->cll_state != CLS_QUEUING) {
+                       /*
+                        * The lock might have left the QUEUING state if a
+                        * previous iteration released its mutex. Stop
+                        * enqueuing in this case and let the upper layer
+                        * decide what to do.
+                        */
+                       LASSERT(i > 0 && result != 0);
+                       break;
+               }
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               /*
+                * Sub-lock might have been canceled, while top-lock was
+                * cached.
+                */
+               if (sub == NULL) {
+                       result = lov_sublock_fill(env, lock, io, lck, i);
+                       /* lov_sublock_fill() released @lock mutex,
+                        * restart. */
+                       break;
+               }
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       lov_sublock_hold(env, lck, i);
+                       rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
+                                                 subenv->lse_io, enqflags,
+                                                 i == lck->lls_nr - 1);
+                       minstate = min(minstate, sublock->cll_state);
+                       if (rc == CLO_WAIT) {
+                               switch (sublock->cll_state) {
+                               case CLS_QUEUING:
+                                       /* take the recursive mutex; the lock
+                                        * is released in
+                                        * lov_lock_enqueue_wait().
+                                        */
+                                       cl_lock_mutex_get(env, sublock);
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       rc = lov_lock_enqueue_wait(env, lck,
+                                                                  sublock);
+                                       break;
+                               case CLS_CACHED:
+                                       cl_lock_get(sublock);
+                                       /* take recursive mutex of sublock */
+                                       cl_lock_mutex_get(env, sublock);
+                                       /* need to release all locks in the
+                                        * closure, otherwise it may
+                                        * deadlock. LU-2683. */
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       /* sublock and parent are held. */
+                                       rc = lov_sublock_release(env, lck, i,
+                                                                1, rc);
+                                       cl_lock_mutex_put(env, sublock);
+                                       cl_lock_put(env, sublock);
+                                       break;
+                               default:
+                                       lov_sublock_unlock(env, sub, closure,
+                                                          subenv);
+                                       break;
+                               }
+                       } else {
+                               LASSERT(sublock->cll_conflict == NULL);
+                               lov_sublock_unlock(env, sub, closure, subenv);
+                       }
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT);
+}
+
+static int lov_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int i;
+       int result;
+
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               /* top-lock state cannot change concurrently, because the
+                * single thread (the one that released the last hold)
+                * carries the unlocking through to completion. */
+               LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL)
+                       continue;
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       if (lls->sub_flags & LSF_HELD) {
+                               LASSERT(sublock->cll_state == CLS_HELD ||
+                                       sublock->cll_state == CLS_ENQUEUED);
+                               rc = cl_unuse_try(subenv->lse_env, sublock);
+                               rc = lov_sublock_release(env, lck, i, 0, rc);
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               result = lov_subresult(result, rc);
+       }
+
+       if (result == 0 && lck->lls_cancel_race) {
+               lck->lls_cancel_race = 0;
+               result = -ESTALE;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result);
+}
+
+
+static void lov_lock_cancel(const struct lu_env *env,
+                          const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int i;
+       int result;
+
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               /* top-lock state cannot change concurrently, because the
+                * single thread (the one that released the last hold)
+                * carries the unlocking through to completion. */
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL)
+                       continue;
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       if (!(lls->sub_flags & LSF_HELD)) {
+                               lov_sublock_unlock(env, sub, closure, subenv);
+                               continue;
+                       }
+
+                       switch (sublock->cll_state) {
+                       case CLS_HELD:
+                               rc = cl_unuse_try(subenv->lse_env, sublock);
+                               lov_sublock_release(env, lck, i, 0, 0);
+                               break;
+                       default:
+                               lov_sublock_release(env, lck, i, 1, 0);
+                               break;
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+
+               if (rc == CLO_REPEAT) {
+                       --i;
+                       continue;
+               }
+
+               result = lov_subresult(result, rc);
+       }
+
+       if (result)
+               CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock,
+                             "lov_lock_cancel fails with %d.\n", result);
+
+       cl_lock_closure_fini(closure);
+}
+
+static int lov_lock_wait(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       enum cl_lock_state      minstate;
+       int                  reenqueued;
+       int                  result;
+       int                  i;
+
+       ENTRY;
+
+again:
+       for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0;
+            i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               LASSERT(sub != NULL);
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+                       if (sublock->cll_state < CLS_HELD)
+                               rc = cl_wait_try(env, sublock);
+
+                       minstate = min(minstate, sublock->cll_state);
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               if (rc == CLO_REENQUEUED) {
+                       reenqueued++;
+                       rc = 0;
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+       /* Each sublock can only be re-enqueued once, so this will not loop
+        * forever. */
+       if (result == 0 && reenqueued != 0)
+               goto again;
+       cl_lock_closure_fini(closure);
+       RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT);
+}
+
+static int lov_lock_use(const struct lu_env *env,
+                       const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       int                  result;
+       int                  i;
+
+       LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+       ENTRY;
+
+       for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+               int rc;
+               struct lovsub_lock     *sub;
+               struct cl_lock   *sublock;
+               struct lov_lock_sub    *lls;
+               struct lov_sublock_env *subenv;
+
+               LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT);
+
+               lls = &lck->lls_sub[i];
+               sub = lls->sub_lock;
+               if (sub == NULL) {
+                       /*
+                        * Sub-lock might have been canceled, while top-lock was
+                        * cached.
+                        */
+                       result = -ESTALE;
+                       break;
+               }
+
+               sublock = sub->lss_cl.cls_lock;
+               rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
+               if (rc == 0) {
+                       LASSERT(sublock->cll_state != CLS_FREEING);
+                       lov_sublock_hold(env, lck, i);
+                       if (sublock->cll_state == CLS_CACHED) {
+                               rc = cl_use_try(subenv->lse_env, sublock, 0);
+                               if (rc != 0)
+                                       rc = lov_sublock_release(env, lck,
+                                                                i, 1, rc);
+                       } else if (sublock->cll_state == CLS_NEW) {
+                               /* Sub-lock might have been canceled, while
+                                * top-lock was cached. */
+                               result = -ESTALE;
+                               lov_sublock_release(env, lck, i, 1, result);
+                       }
+                       lov_sublock_unlock(env, sub, closure, subenv);
+               }
+               result = lov_subresult(result, rc);
+               if (result != 0)
+                       break;
+       }
+
+       if (lck->lls_cancel_race) {
+               /*
+                * If unlocking happened at the same time, the sublock's
+                * state should be FREEING, and lov_sublock_lock() should
+                * return CLO_REPEAT. In this case it should return -ESTALE,
+                * and the upper layer should reset the lock state to NEW.
+                */
+               lck->lls_cancel_race = 0;
+               LASSERT(result != 0);
+               result = -ESTALE;
+       }
+       cl_lock_closure_fini(closure);
+       RETURN(result);
+}
+
+#if 0
+static int lock_lock_multi_match()
+{
+       struct cl_lock    *lock    = slice->cls_lock;
+       struct cl_lock_descr    *subneed = &lov_env_info(env)->lti_ldescr;
+       struct lov_object       *loo     = cl2lov(lov->lls_cl.cls_obj);
+       struct lov_layout_raid0 *r0      = lov_r0(loo);
+       struct lov_lock_sub     *sub;
+       struct cl_object        *subobj;
+       obd_off  fstart;
+       obd_off  fend;
+       obd_off  start;
+       obd_off  end;
+       int i;
+
+       fstart = cl_offset(need->cld_obj, need->cld_start);
+       fend   = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+       subneed->cld_mode = need->cld_mode;
+       cl_lock_mutex_get(env, lock);
+       for (i = 0; i < lov->lls_nr; ++i) {
+               sub = &lov->lls_sub[i];
+               if (sub->sub_lock == NULL)
+                       continue;
+               subobj = sub->sub_descr.cld_obj;
+               if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe,
+                                          fstart, fend, &start, &end))
+                       continue;
+               subneed->cld_start = cl_index(subobj, start);
+               subneed->cld_end   = cl_index(subobj, end);
+               subneed->cld_obj   = subobj;
+               if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+                       result = 0;
+                       break;
+               }
+       }
+       cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Check if the extent region \a descr is covered by \a child against the
+ * specific \a stripe.
+ */
+static int lov_lock_stripe_is_matching(const struct lu_env *env,
+                                      struct lov_object *lov, int stripe,
+                                      const struct cl_lock_descr *child,
+                                      const struct cl_lock_descr *descr)
+{
+       struct lov_stripe_md *lsm = lov->lo_lsm;
+       obd_off start;
+       obd_off end;
+       int result;
+
+       if (lov_r0(lov)->lo_nr == 1)
+               return cl_lock_ext_match(child, descr);
+
+       /*
+        * For a multi-stripes object:
+        * - make sure the descr only covers child's stripe, and
+        * - check if extent is matching.
+        */
+       start = cl_offset(&lov->lo_cl, descr->cld_start);
+       end   = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+       result = end - start <= lsm->lsm_stripe_size &&
+                stripe == lov_stripe_number(lsm, start) &&
+                stripe == lov_stripe_number(lsm, end);
+       if (result) {
+               struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr;
+               obd_off sub_start;
+               obd_off sub_end;
+
+               subd->cld_obj  = NULL;   /* don't need sub object at all */
+               subd->cld_mode = descr->cld_mode;
+               subd->cld_gid  = descr->cld_gid;
+               result = lov_stripe_intersects(lsm, stripe, start, end,
+                                              &sub_start, &sub_end);
+               LASSERT(result);
+               subd->cld_start = cl_index(child->cld_obj, sub_start);
+               subd->cld_end   = cl_index(child->cld_obj, sub_end);
+               result = cl_lock_ext_match(child, subd);
+       }
+       return result;
+}
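
The single-stripe test requires the extent to span at most one stripe unit with both endpoints mapping to the same stripe. A user-space sketch, using the classic RAID0 mapping (offset / stripe_size) % stripe_count as a stand-in for lov_stripe_number():

#include <stdio.h>

/* classic RAID0 mapping, a stand-in for lov_stripe_number() */
static unsigned stripe_number(unsigned long long off,
			      unsigned size, unsigned count)
{
	return (unsigned)((off / size) % count);
}

static int extent_on_stripe(unsigned long long start, unsigned long long end,
			    unsigned stripe, unsigned size, unsigned count)
{
	return end - start <= size &&
	       stripe_number(start, size, count) == stripe &&
	       stripe_number(end, size, count) == stripe;
}

int main(void)
{
	/* 1 MiB stripe size over 4 stripes */
	printf("%d\n", extent_on_stripe(0, 1000, 0, 1 << 20, 4));	/* 1 */
	printf("%d\n", extent_on_stripe(0, (1 << 20) + 10, 0,
					1 << 20, 4));			/* 0 */
	return 0;
}
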
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a
+ * io. Multi-stripe locks can be used only for "quick" io, like truncate, or
+ * O_APPEND write.
+ *
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io)
+{
+       struct lov_lock   *lov = cl2lov_lock(slice);
+       struct lov_object *obj = cl2lov(slice->cls_obj);
+       int result;
+
+       LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+       LASSERT(lov->lls_nr > 0);
+
+       ENTRY;
+
+       /* for a top lock it is necessary to match the enqueue flags;
+        * otherwise it will run into problems if a sublock is missing and
+        * must be re-enqueued. */
+       if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags)
+               return 0;
+
+       if (need->cld_mode == CLM_GROUP)
+               /*
+                * Always allow matching a group lock.
+                */
+               result = cl_lock_ext_match(&lov->lls_orig, need);
+       else if (lov->lls_nr == 1) {
+               struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
+               result = lov_lock_stripe_is_matching(env,
+                                                    cl2lov(slice->cls_obj),
+                                                    lov->lls_sub[0].sub_stripe,
+                                                    got, need);
+       } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC &&
+                  !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+               /*
+                * Multi-stripe locks are only suitable for `quick' IO and for
+                * glimpse.
+                */
+               result = 0;
+       else
+               /*
+                * Most general case: multi-stripe existing lock, and
+                * (potentially) multi-stripe @need lock. Check that @need is
+                * covered by @lov's sub-locks.
+                *
+                * For now, ignore lock expansions made by the server, and
+                * match against original lock extent.
+                */
+               result = cl_lock_ext_match(&lov->lls_orig, need);
+       CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n",
+              PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+              lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+              result);
+       RETURN(result);
+}
+
+void lov_lock_unlink(const struct lu_env *env,
+                    struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+       struct lov_lock *lck    = link->lll_super;
+       struct cl_lock  *parent = lck->lls_cl.cls_lock;
+
+       LASSERT(cl_lock_is_mutexed(parent));
+       LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+       ENTRY;
+
+       list_del_init(&link->lll_list);
+       LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+       /* yank this sub-lock from parent's array */
+       lck->lls_sub[link->lll_idx].sub_lock = NULL;
+       LASSERT(lck->lls_nr_filled > 0);
+       lck->lls_nr_filled--;
+       lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+       cl_lock_put(env, parent);
+       OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+       EXIT;
+}
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                        struct lov_lock *lck,
+                                        struct lovsub_lock *sub)
+{
+       struct lov_lock_link *scan;
+
+       LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+       ENTRY;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               if (scan->lll_super == lck)
+                       RETURN(scan);
+       }
+       RETURN(NULL);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there:
+ * this is done separately elsewhere:
+ *
+ *     - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ *       each sub-object, purging its locks;
+ *
+ *     - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ *       left in the cache.
+ */
+static void lov_lock_delete(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck     = cl2lov_lock(slice);
+       struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+       struct lov_lock_link   *link;
+       int                  rc;
+       int                  i;
+
+       LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+       ENTRY;
+
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct lov_lock_sub *lls = &lck->lls_sub[i];
+               struct lovsub_lock  *lsl = lls->sub_lock;
+
+               if (lsl == NULL) /* already removed */
+                       continue;
+
+               rc = lov_sublock_lock(env, lck, lls, closure, NULL);
+               if (rc == CLO_REPEAT) {
+                       --i;
+                       continue;
+               }
+
+               LASSERT(rc == 0);
+               LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING);
+
+               if (lls->sub_flags & LSF_HELD)
+                       lov_sublock_release(env, lck, i, 1, 0);
+
+               link = lov_lock_link_find(env, lck, lsl);
+               LASSERT(link != NULL);
+               lov_lock_unlink(env, link, lsl);
+               LASSERT(lck->lls_sub[i].sub_lock == NULL);
+
+               lov_sublock_unlock(env, lsl, closure, NULL);
+       }
+
+       cl_lock_closure_fini(closure);
+       EXIT;
+}
+
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck = cl2lov_lock(slice);
+       int           i;
+
+       (*p)(env, cookie, "%d\n", lck->lls_nr);
+       for (i = 0; i < lck->lls_nr; ++i) {
+               struct lov_lock_sub *sub;
+
+               sub = &lck->lls_sub[i];
+               (*p)(env, cookie, "    %d %x: ", i, sub->sub_flags);
+               if (sub->sub_lock != NULL)
+                       cl_lock_print(env, cookie, p,
+                                     sub->sub_lock->lss_cl.cls_lock);
+               else
+                       (*p)(env, cookie, "---\n");
+       }
+       return 0;
+}
+
+static const struct cl_lock_operations lov_lock_ops = {
+       .clo_fini      = lov_lock_fini,
+       .clo_enqueue   = lov_lock_enqueue,
+       .clo_wait      = lov_lock_wait,
+       .clo_use       = lov_lock_use,
+       .clo_unuse     = lov_lock_unuse,
+       .clo_cancel    = lov_lock_cancel,
+       .clo_fits_into = lov_lock_fits_into,
+       .clo_delete    = lov_lock_delete,
+       .clo_print     = lov_lock_print
+};
+
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lov_lock *lck;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO);
+       if (lck != NULL) {
+               cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops);
+               result = lov_lock_sub_init(env, lck, io);
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static void lov_empty_lock_fini(const struct lu_env *env,
+                               struct cl_lock_slice *slice)
+{
+       struct lov_lock *lck = cl2lov_lock(slice);
+       OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+}
+
+static int lov_empty_lock_print(const struct lu_env *env, void *cookie,
+                       lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       (*p)(env, cookie, "empty\n");
+       return 0;
+}
+
+/* XXX: more methods will be added later. */
+static const struct cl_lock_operations lov_empty_lock_ops = {
+       .clo_fini  = lov_empty_lock_fini,
+       .clo_print = lov_empty_lock_print
+};
+
+int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj,
+               struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lov_lock *lck;
+       int result = -ENOMEM;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO);
+       if (lck != NULL) {
+               cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops);
+               lck->lls_orig = lock->cll_descr;
+               result = 0;
+       }
+       RETURN(result);
+}
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                              struct cl_lock *parent)
+{
+       struct cl_lock_closure *closure;
+
+       closure = &lov_env_info(env)->lti_closure;
+       LASSERT(list_empty(&closure->clc_list));
+       cl_lock_closure_init(env, closure, parent, 1);
+       return closure;
+}
+
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_log.c b/drivers/staging/lustre/lustre/lov/lov_log.c
new file mode 100644 (file)
index 0000000..63b7f8d
--- /dev/null
@@ -0,0 +1,278 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_log.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+
+#include "lov_internal.h"
+
+/* Add log records for each OSC that this object is striped over, and return
+ * cookies for each one.  We _would_ have nice abstraction here, except that
+ * we need to keep cookies in stripe order, even if some are NULL, so that
+ * the right cookies are passed back to the right OSTs at the client side.
+ * Unset cookies should be all-zero (which will never occur naturally). */
+static int lov_llog_origin_add(const struct lu_env *env,
+                              struct llog_ctxt *ctxt,
+                              struct llog_rec_hdr *rec,
+                              struct lov_stripe_md *lsm,
+                              struct llog_cookie *logcookies, int numcookies)
+{
+       struct obd_device *obd = ctxt->loc_obd;
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc = 0, cookies = 0;
+       ENTRY;
+
+       LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count,
+                "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n",
+                logcookies, numcookies, lsm->lsm_stripe_count);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               struct obd_device *child =
+                       lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+               struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx);
+
+               /* fill mds unlink/setattr log record */
+               switch (rec->lrh_type) {
+               case MDS_UNLINK_REC: {
+                       struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+                       lur->lur_oid = ostid_id(&loi->loi_oi);
+                       lur->lur_oseq = (__u32)ostid_seq(&loi->loi_oi);
+                       break;
+               }
+               case MDS_SETATTR64_REC: {
+                       struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec;
+                       lsr->lsr_oi = loi->loi_oi;
+                       break;
+               }
+               default:
+                       break;
+               }
+
+               /* inject error in llog_obd_add() below */
+               if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) {
+                       llog_ctxt_put(cctxt);
+                       cctxt = NULL;
+               }
+               rc = llog_obd_add(env, cctxt, rec, NULL, logcookies + cookies,
+                                 numcookies - cookies);
+               llog_ctxt_put(cctxt);
+               if (rc < 0) {
+                       CERROR("Can't add llog (rc = %d) for stripe %d\n",
+                              rc, cookies);
+                       memset(logcookies + cookies, 0,
+                              sizeof(struct llog_cookie));
+                       rc = 1; /* skip this cookie */
+               }
+               /* Note that rc is always 1 if llog_obd_add was successful */
+               cookies += rc;
+       }
+       RETURN(cookies);
+}
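
The cookie bookkeeping above keeps one slot per stripe, in stripe order, zero-filling the slot for a failed stripe and still advancing past it, exactly as the function's comment demands. A standalone sketch of that accounting, where add_record() is a hypothetical stand-in for llog_obd_add() and its return-1-on-success convention:

/* Sketch of the per-stripe cookie accounting in lov_llog_origin_add():
 * cookies stay in stripe order, a failed stripe leaves an all-zero
 * cookie behind, and the slot is still consumed. */
#include <string.h>
#include <stdio.h>

struct cookie { unsigned long id; };

static int add_record(int stripe, struct cookie *c)
{
        if (stripe == 1)                /* simulate one failing stripe */
                return -5;
        c->id = 0x1000 + stripe;
        return 1;                       /* one cookie written */
}

int main(void)
{
        struct cookie cookies[3];
        int i, rc, used = 0;

        for (i = 0; i < 3; i++) {
                rc = add_record(i, &cookies[used]);
                if (rc < 0) {
                        memset(&cookies[used], 0, sizeof(cookies[used]));
                        rc = 1;         /* skip this cookie, keep order */
                }
                used += rc;
        }
        for (i = 0; i < used; i++)
                printf("stripe %d cookie %#lx\n", i, cookies[i].id);
        return 0;
}
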
+
+static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
+                                  struct llog_logid *logid,
+                                  struct llog_gen *gen,
+                                  struct obd_uuid *uuid)
+{
+       struct obd_device *obd = ctxt->loc_obd;
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc = 0, err = 0;
+       ENTRY;
+
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               struct obd_device *child;
+               struct llog_ctxt *cctxt;
+
+               if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                       continue;
+               if (uuid && !obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid))
+                       continue;
+               CDEBUG(D_CONFIG, "connect %d/%d\n", i, lov->desc.ld_tgt_count);
+               child = lov->lov_tgts[i]->ltd_exp->exp_obd;
+               cctxt = llog_get_context(child, ctxt->loc_idx);
+               rc = llog_connect(cctxt, logid, gen, uuid);
+               llog_ctxt_put(cctxt);
+
+               if (rc) {
+                       CERROR("error osc_llog_connect tgt %d (%d)\n", i, rc);
+                       if (!err)
+                               err = rc;
+               }
+       }
+       obd_putref(obd);
+
+       RETURN(err);
+}
+
+/* the replicators commit callback */
+static int lov_llog_repl_cancel(const struct lu_env *env,
+                               struct llog_ctxt *ctxt,
+                               struct lov_stripe_md *lsm,
+                               int count, struct llog_cookie *cookies,
+                               int flags)
+{
+       struct lov_obd *lov;
+       struct obd_device *obd = ctxt->loc_obd;
+       int rc = 0, i;
+       ENTRY;
+
+       LASSERT(lsm != NULL);
+       LASSERT(count == lsm->lsm_stripe_count);
+
+       lov = &obd->u.lov;
+       obd_getref(obd);
+       for (i = 0; i < count; i++, cookies++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               struct obd_device *child =
+                       lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd;
+               struct llog_ctxt *cctxt =
+                       llog_get_context(child, ctxt->loc_idx);
+               int err;
+
+               err = llog_cancel(env, cctxt, NULL, 1, cookies, flags);
+               llog_ctxt_put(cctxt);
+               if (err && lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
+                       CERROR("%s: objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&loi->loi_oi), loi->loi_ost_idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
+
+static struct llog_operations lov_mds_ost_orig_logops = {
+       .lop_obd_add    = lov_llog_origin_add,
+       .lop_connect    = lov_llog_origin_connect,
+};
+
+static struct llog_operations lov_size_repl_logops = {
+       .lop_cancel     = lov_llog_repl_cancel,
+};
+
+int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *index)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_device *child;
+       int i, rc = 0;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+       rc = llog_setup(NULL, obd, olg, LLOG_MDS_OST_ORIG_CTXT, disk_obd,
+                       &lov_mds_ost_orig_logops);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_SIZE_REPL_CTXT, disk_obd,
+                       &lov_size_repl_logops);
+       if (rc)
+               GOTO(err_cleanup, rc);
+
+       obd_getref(obd);
+       /* count may not match lov->desc.ld_tgt_count during dynamic ost add */
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (!lov->lov_tgts[i])
+                       continue;
+
+               if (index && i != *index)
+                       continue;
+
+               child = lov->lov_tgts[i]->ltd_obd;
+               rc = obd_llog_init(child, &child->obd_olg, disk_obd, &i);
+               if (rc)
+                       CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' "
+                              "(rc=%d)\n", i, child->obd_name,
+                              disk_obd->obd_name, rc);
+               rc = 0;
+       }
+       obd_putref(obd);
+       GOTO(err_cleanup, rc);
+err_cleanup:
+       if (rc) {
+               struct llog_ctxt *ctxt =
+                       llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+               if (ctxt)
+                       llog_cleanup(NULL, ctxt);
+               ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+               if (ctxt)
+                       llog_cleanup(NULL, ctxt);
+       }
+       return rc;
+}
+
+int lov_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       /* cleanup our llogs only if the ctxts have been set up
+        * (the client lov doesn't set them up, the mds lov does). */
+       ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       /* lov->tgt llogs are cleaned during osc_cleanup. */
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c
new file mode 100644 (file)
index 0000000..ddbac12
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/** Merge the lock value block(&lvb) attributes and KMS from each of the
+ * stripes in a file into a single lvb. It is expected that the caller
+ * initializes the current atime, mtime, ctime to avoid regressing a more
+ * uptodate time on the local client.
+ */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                     struct ost_lvb *lvb, __u64 *kms_place)
+{
+       __u64 size = 0;
+       __u64 kms = 0;
+       __u64 blocks = 0;
+       obd_time current_mtime = lvb->lvb_mtime;
+       obd_time current_atime = lvb->lvb_atime;
+       obd_time current_ctime = lvb->lvb_ctime;
+       int i;
+       int rc = 0;
+
+       LASSERT(spin_is_locked(&lsm->lsm_lock));
+       LASSERT(lsm->lsm_lock_owner == current_pid());
+
+       CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s="LPU64" m="LPU64
+              " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+              lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime,
+              lvb->lvb_blocks);
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               obd_size lov_size, tmpsize;
+
+               if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) {
+                       rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks);
+                       continue;
+               }
+
+               tmpsize = loi->loi_kms;
+               lov_size = lov_stripe_size(lsm, tmpsize, i);
+               if (lov_size > kms)
+                       kms = lov_size;
+
+               if (loi->loi_lvb.lvb_size > tmpsize)
+                       tmpsize = loi->loi_lvb.lvb_size;
+
+               lov_size = lov_stripe_size(lsm, tmpsize, i);
+               if (lov_size > size)
+                       size = lov_size;
+               /* merge blocks, mtime, atime */
+               blocks += loi->loi_lvb.lvb_blocks;
+               if (loi->loi_lvb.lvb_mtime > current_mtime)
+                       current_mtime = loi->loi_lvb.lvb_mtime;
+               if (loi->loi_lvb.lvb_atime > current_atime)
+                       current_atime = loi->loi_lvb.lvb_atime;
+               if (loi->loi_lvb.lvb_ctime > current_ctime)
+                       current_ctime = loi->loi_lvb.lvb_ctime;
+
+               CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s="LPU64" m="LPU64
+                      " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi),
+                      loi->loi_ost_idx, loi->loi_lvb.lvb_size,
+                      loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime,
+                      loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks);
+       }
+
+       *kms_place = kms;
+       lvb->lvb_size = size;
+       lvb->lvb_blocks = blocks;
+       lvb->lvb_mtime = current_mtime;
+       lvb->lvb_atime = current_atime;
+       lvb->lvb_ctime = current_ctime;
+       RETURN(rc);
+}
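
To make the merge above concrete: each stripe's local size is mapped back to a file offset and the maximum wins, while blocks are summed across stripes. The sketch below uses a simplified RAID0 mapping with an assumed stripe size and count, not the real lov_stripe_size():

/* Sketch of the size/blocks merge in lov_merge_lvb_kms(). */
#include <stdio.h>

#define S 1048576ULL    /* assumed stripe size */
#define N 3ULL          /* assumed stripe count */

static unsigned long long stripe_to_file(unsigned long long local,
                                         unsigned long long stripeno)
{
        if (local == 0)
                return 0;
        /* last byte of this stripe object, mapped into the file */
        return ((local - 1) / S) * (S * N) + stripeno * S +
               ((local - 1) % S) + 1;
}

int main(void)
{
        unsigned long long local[3] = { 2097152, 1048576, 524288 };
        unsigned long long size = 0, blocks = 0, f;
        int i;

        for (i = 0; i < 3; i++) {
                f = stripe_to_file(local[i], i);
                if (f > size)           /* file size: max of mappings */
                        size = f;
                blocks += local[i] / 512;       /* blocks: summed */
        }
        printf("merged size %llu, blocks %llu\n", size, blocks);
        return 0;
}
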
+
+/** Merge the lock value block(&lvb) attributes from each of the stripes in a
+ * file into a single lvb. It is expected that the caller initializes the
+ * current atime, mtime, ctime to avoid regressing a more uptodate time on
+ * the local client.
+ *
+ * If \a kms_only is set then we do not consider the recently seen size (rss)
+ * when updating the known minimum size (kms).  Even when merging RSS, we will
+ * take the KMS value if it's larger.  This prevents getattr from stomping on
+ * dirty cached pages which extend the file size. */
+int lov_merge_lvb(struct obd_export *exp,
+                 struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only)
+{
+       int   rc;
+       __u64 kms;
+
+       ENTRY;
+       lov_stripe_lock(lsm);
+       rc = lov_merge_lvb_kms(lsm, lvb, &kms);
+       lov_stripe_unlock(lsm);
+       if (kms_only)
+               lvb->lvb_size = kms;
+
+       CDEBUG(D_INODE, "merged for ID "DOSTID" s="LPU64" m="LPU64" a="LPU64
+              " c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), lvb->lvb_size,
+              lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks);
+       RETURN(rc);
+}
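
A few lines of plain C illustrating the kms_only rule documented above; the sizes are made up and no Lustre types are involved:

/* kms_only sketch: the reported size never drops below KMS, which
 * protects dirty cached pages that extend the file. */
#include <stdio.h>

int main(void)
{
        unsigned long long rss = 4096;   /* recently seen size */
        unsigned long long kms = 8192;   /* known minimum size */
        int kms_only = 0;

        unsigned long long size = rss > kms ? rss : kms;

        if (kms_only)
                size = kms;             /* ignore RSS entirely */
        printf("reported size %llu\n", size);
        return 0;
}
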
+
+/* Must be called under the lov_stripe_lock() */
+int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
+                  obd_off size, int shrink)
+{
+       struct lov_oinfo *loi;
+       int stripe = 0;
+       __u64 kms;
+       ENTRY;
+
+       LASSERT(spin_is_locked(&lsm->lsm_lock));
+       LASSERT(lsm->lsm_lock_owner == current_pid());
+
+       if (shrink) {
+               for (; stripe < lsm->lsm_stripe_count; stripe++) {
+                       struct lov_oinfo *loi = lsm->lsm_oinfo[stripe];
+                       kms = lov_size_to_stripe(lsm, size, stripe);
+                       CDEBUG(D_INODE,
+                              "stripe %d KMS %sing "LPU64"->"LPU64"\n",
+                              stripe, kms > loi->loi_kms ? "increas":"shrink",
+                              loi->loi_kms, kms);
+                       loi_kms_set(loi, loi->loi_lvb.lvb_size = kms);
+               }
+               RETURN(0);
+       }
+
+       if (size > 0)
+               stripe = lov_stripe_number(lsm, size - 1);
+       kms = lov_size_to_stripe(lsm, size, stripe);
+       loi = lsm->lsm_oinfo[stripe];
+
+       CDEBUG(D_INODE, "stripe %d KMS %sincreasing "LPU64"->"LPU64"\n",
+              stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms);
+       if (kms > loi->loi_kms)
+               loi_kms_set(loi, kms);
+
+       RETURN(0);
+}
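
On the grow path above, only the stripe holding byte (size - 1) can have its KMS raised. The sketch below redoes the stripe arithmetic in userspace with assumed stripe geometry; stripe_number() and size_to_stripe() are simplified stand-ins for lov_stripe_number() and lov_size_to_stripe():

/* Sketch of the grow path in lov_adjust_kms(). */
#include <stdio.h>

#define S 65536ULL      /* assumed stripe size */
#define N 4ULL          /* assumed stripe count */

static unsigned long long stripe_number(unsigned long long size)
{
        return ((size - 1) / S) % N;    /* stripe holding the last byte */
}

static unsigned long long size_to_stripe(unsigned long long size,
                                         unsigned long long i)
{
        unsigned long long full = size / (S * N), rem = size % (S * N);

        if (rem <= i * S)
                return full * S;
        if (rem >= (i + 1) * S)
                return (full + 1) * S;
        return full * S + rem - i * S;
}

int main(void)
{
        unsigned long long size = 300000, st = stripe_number(size);

        printf("byte %llu lives on stripe %llu; its kms becomes %llu\n",
               size - 1, st, size_to_stripe(size, st));
        return 0;
}
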
+
+void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid,
+                    struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+       valid &= src->o_valid;
+
+       if (*set) {
+               if (valid & OBD_MD_FLSIZE) {
+                       /* this handles sparse files properly */
+                       obd_size lov_size;
+
+                       lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
+                       if (lov_size > tgt->o_size)
+                               tgt->o_size = lov_size;
+               }
+               if (valid & OBD_MD_FLBLOCKS)
+                       tgt->o_blocks += src->o_blocks;
+               if (valid & OBD_MD_FLBLKSZ)
+                       tgt->o_blksize += src->o_blksize;
+               if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
+                       tgt->o_ctime = src->o_ctime;
+               if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
+                       tgt->o_mtime = src->o_mtime;
+               if (valid & OBD_MD_FLDATAVERSION)
+                       tgt->o_data_version += src->o_data_version;
+       } else {
+               memcpy(tgt, src, sizeof(*tgt));
+               tgt->o_oi = lsm->lsm_oi;
+               if (valid & OBD_MD_FLSIZE)
+                       tgt->o_size = lov_stripe_size(lsm, src->o_size,
+                                                     stripeno);
+       }
+
+       /* data_version needs to be valid on all stripes to be correct! */
+       if (!(valid & OBD_MD_FLDATAVERSION))
+               tgt->o_valid &= ~OBD_MD_FLDATAVERSION;
+
+       *set += 1;
+}
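
The data_version rule at the end of lov_merge_attrs() is easy to miss: a single stripe without the flag invalidates the merged value for the whole file. A self-contained sketch with illustrative flag bits (not the real OBD_MD_* values):

/* Sketch of the OBD_MD_FLDATAVERSION poisoning in lov_merge_attrs(). */
#include <stdio.h>

#define FL_SIZE         0x1u
#define FL_DATAVERSION  0x2u

struct attrs { unsigned valid; unsigned long long data_version; };

int main(void)
{
        struct attrs stripes[3] = {
                { FL_SIZE | FL_DATAVERSION, 7 },
                { FL_SIZE,                  0 },   /* no version here */
                { FL_SIZE | FL_DATAVERSION, 9 },
        };
        struct attrs tgt = { FL_SIZE | FL_DATAVERSION, 0 };
        int i;

        for (i = 0; i < 3; i++) {
                unsigned valid = stripes[i].valid & tgt.valid;

                if (valid & FL_DATAVERSION)
                        tgt.data_version += stripes[i].data_version;
                else
                        tgt.valid &= ~FL_DATAVERSION;   /* poisoned */
        }
        printf("data_version %s\n",
               tgt.valid & FL_DATAVERSION ? "valid" : "invalid");
        return 0;
}
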
diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c
new file mode 100644 (file)
index 0000000..ef7ff09
--- /dev/null
@@ -0,0 +1,2916 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_obd.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_lib.h>
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_dlm.h>
+#include <lustre_mds.h>
+#include <lustre_debug.h>
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <obd_ost.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <cl_object.h>
+#include <lclient.h> /* for cl_client_lru */
+#include <lustre/ll_fiemap.h>
+#include <lustre_log.h>
+#include <lustre_fid.h>
+
+#include "lov_internal.h"
+
+/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion.
+   Any function that expects lov_tgts to remain stationary must take a ref. */
+static void lov_getref(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+
+       /* nobody gets through here until lov_putref is done */
+       mutex_lock(&lov->lov_lock);
+       atomic_inc(&lov->lov_refcount);
+       mutex_unlock(&lov->lov_lock);
+       return;
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
+
+static void lov_putref(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+
+       mutex_lock(&lov->lov_lock);
+       /* ok to dec to 0 more than once -- ltd_exp's will be null */
+       if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) {
+               LIST_HEAD(kill);
+               int i;
+               struct lov_tgt_desc *tgt, *n;
+               CDEBUG(D_CONFIG, "destroying %d lov targets\n",
+                      lov->lov_death_row);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       tgt = lov->lov_tgts[i];
+
+                       if (!tgt || !tgt->ltd_reap)
+                               continue;
+                       list_add(&tgt->ltd_kill, &kill);
+                       /* XXX - right now there is a dependency on ld_tgt_count
+                        * being the maximum tgt index for computing the
+                        * mds_max_easize. So we can't shrink it. */
+                       lov_ost_pool_remove(&lov->lov_packed, i);
+                       lov->lov_tgts[i] = NULL;
+                       lov->lov_death_row--;
+               }
+               mutex_unlock(&lov->lov_lock);
+
+               list_for_each_entry_safe(tgt, n, &kill, ltd_kill) {
+                       list_del(&tgt->ltd_kill);
+                       /* Disconnect */
+                       __lov_del_obd(obd, tgt);
+               }
+       } else {
+               mutex_unlock(&lov->lov_lock);
+       }
+}
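
A userspace sketch of the getref/putref death-row scheme above: a target flagged for reaping stays in the array while any reference is held, and only the putref that drops the count to zero tears it down. Types and locking are simplified stand-ins for the lov_obd fields:

/* Sketch of refcount-deferred target deletion (lov_putref). */
#include <stdio.h>

struct tgt { int reap; const char *name; };

static struct tgt *tgts[2];
static int refcount, death_row;

static void putref(void)
{
        if (--refcount == 0 && death_row) {
                int i;

                for (i = 0; i < 2; i++) {
                        if (!tgts[i] || !tgts[i]->reap)
                                continue;
                        printf("reaping %s\n", tgts[i]->name);
                        tgts[i] = NULL;
                        death_row--;
                }
        }
}

int main(void)
{
        struct tgt a = { 0, "ost0" }, b = { 0, "ost1" };

        tgts[0] = &a;
        tgts[1] = &b;

        refcount++;             /* getref: array is now pinned */
        b.reap = 1;             /* schedule deletion (lov_del_target) */
        death_row++;
        refcount++;             /* a nested user; nothing is freed yet */
        putref();
        putref();               /* last ref gone: ost1 is reaped here */
        return 0;
}
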
+
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+                             enum obd_notify_event ev);
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data);
+
+
+#define MAX_STRING_SIZE 128
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                   struct obd_connect_data *data)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_uuid *tgt_uuid;
+       struct obd_device *tgt_obd;
+       static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
+       struct obd_import *imp;
+       proc_dir_entry_t *lov_proc_dir;
+       int rc;
+       ENTRY;
+
+       if (!lov->lov_tgts[index])
+               RETURN(-EINVAL);
+
+       tgt_uuid = &lov->lov_tgts[index]->ltd_uuid;
+       tgt_obd = lov->lov_tgts[index]->ltd_obd;
+
+       if (!tgt_obd->obd_set_up) {
+               CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid));
+               RETURN(-EINVAL);
+       }
+
+       /* override the sp_me from lov */
+       tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
+       if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
+               data->ocd_index = index;
+
+       /*
+        * Divine LOV knows that OBDs under it are OSCs.
+        */
+       imp = tgt_obd->u.cli.cl_import;
+
+       if (activate) {
+               tgt_obd->obd_no_recov = 0;
+               /* FIXME this is probably supposed to be
+                  ptlrpc_set_import_active.  Horrible naming. */
+               ptlrpc_activate_import(imp);
+       }
+
+       rc = obd_register_observer(tgt_obd, obd);
+       if (rc) {
+               CERROR("Target %s register_observer error %d\n",
+                      obd_uuid2str(tgt_uuid), rc);
+               RETURN(rc);
+       }
+
+
+       if (imp->imp_invalid) {
+               CDEBUG(D_CONFIG, "not connecting OSC %s; administratively "
+                      "disabled\n", obd_uuid2str(tgt_uuid));
+               RETURN(0);
+       }
+
+       rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd,
+                        &lov_osc_uuid, data, NULL);
+       if (rc || !lov->lov_tgts[index]->ltd_exp) {
+               CERROR("Target %s connect error %d\n",
+                      obd_uuid2str(tgt_uuid), rc);
+               RETURN(-ENODEV);
+       }
+
+       lov->lov_tgts[index]->ltd_reap = 0;
+
+       CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index,
+              obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in");
+
+       lov_proc_dir = obd->obd_proc_private;
+       if (lov_proc_dir) {
+               struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd;
+               proc_dir_entry_t *osc_symlink;
+
+               LASSERT(osc_obd != NULL);
+               LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC);
+               LASSERT(osc_obd->obd_type->typ_name != NULL);
+
+               osc_symlink = lprocfs_add_symlink(osc_obd->obd_name,
+                                                 lov_proc_dir,
+                                                 "../../../%s/%s",
+                                                 osc_obd->obd_type->typ_name,
+                                                 osc_obd->obd_name);
+               if (osc_symlink == NULL) {
+                       CERROR("could not register LOV target "
+                               "/proc/fs/lustre/%s/%s/target_obds/%s.",
+                               obd->obd_type->typ_name, obd->obd_name,
+                               osc_obd->obd_name);
+                       lprocfs_remove(&lov_proc_dir);
+                       obd->obd_proc_private = NULL;
+               }
+       }
+
+       RETURN(0);
+}
+
+static int lov_connect(const struct lu_env *env,
+                      struct obd_export **exp, struct obd_device *obd,
+                      struct obd_uuid *cluuid, struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       struct lustre_handle conn;
+       int i, rc;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects);
+
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc)
+               RETURN(rc);
+
+       *exp = class_conn2export(&conn);
+
+       /* Why should there ever be more than 1 connect? */
+       lov->lov_connects++;
+       LASSERT(lov->lov_connects == 1);
+
+       memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd));
+       if (data)
+               lov->lov_ocd = *data;
+
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               tgt = lov->lov_tgts[i];
+               if (!tgt || obd_uuid_empty(&tgt->ltd_uuid))
+                       continue;
+               /* Flags will be lowest common denominator */
+               rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd);
+               if (rc) {
+                       CERROR("%s: lov connect tgt %d failed: %d\n",
+                              obd->obd_name, i, rc);
+                       continue;
+               }
+               /* administratively disabled OST: no export, skip notify */
+               if (!lov->lov_tgts[i]->ltd_exp)
+                       continue;
+
+               rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
+                               OBD_NOTIFY_CONNECT, (void *)&i);
+               if (rc) {
+                       CERROR("%s error sending notify %d\n",
+                              obd->obd_name, rc);
+               }
+       }
+       obd_putref(obd);
+
+       RETURN(0);
+}
+
+static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+       proc_dir_entry_t *lov_proc_dir;
+       struct lov_obd *lov = &obd->u.lov;
+       struct obd_device *osc_obd;
+       int rc;
+       ENTRY;
+
+       osc_obd = class_exp2obd(tgt->ltd_exp);
+       CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
+              obd->obd_name, osc_obd->obd_name);
+
+       if (tgt->ltd_active) {
+               tgt->ltd_active = 0;
+               lov->desc.ld_active_tgt_count--;
+               tgt->ltd_exp->exp_obd->obd_inactive = 1;
+       }
+
+       lov_proc_dir = obd->obd_proc_private;
+       if (lov_proc_dir)
+               lprocfs_remove_proc_entry(osc_obd->obd_name, lov_proc_dir);
+
+       if (osc_obd) {
+               /* Pass it on to our clients.
+                * XXX This should be an argument to disconnect,
+                * XXX not a back-door flag on the OBD.  Ah well.
+                */
+               osc_obd->obd_force = obd->obd_force;
+               osc_obd->obd_fail = obd->obd_fail;
+               osc_obd->obd_no_recov = obd->obd_no_recov;
+       }
+
+       obd_register_observer(osc_obd, NULL);
+
+       rc = obd_disconnect(tgt->ltd_exp);
+       if (rc) {
+               CERROR("Target %s disconnect error %d\n",
+                      tgt->ltd_uuid.uuid, rc);
+               rc = 0;
+       }
+
+       tgt->ltd_exp = NULL;
+       RETURN(0);
+}
+
+static int lov_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       int i, rc;
+       ENTRY;
+
+       if (!lov->lov_tgts)
+               goto out;
+
+       /* Only disconnect the underlying layers on the final disconnect. */
+       lov->lov_connects--;
+       if (lov->lov_connects != 0) {
+               /* why should there be more than 1 connect? */
+               CERROR("disconnect #%d\n", lov->lov_connects);
+               goto out;
+       }
+
+       /* Let's hold another reference so lov_del_obd doesn't spin through
+          putref every time */
+       obd_getref(obd);
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) {
+                       /* Disconnection is the last we know about an obd */
+                       lov_del_target(obd, i, 0, lov->lov_tgts[i]->ltd_gen);
+               }
+       }
+       obd_putref(obd);
+
+out:
+       rc = class_disconnect(exp); /* bz 9811 */
+       RETURN(rc);
+}
+
+/* Error codes:
+ *
+ *  -EINVAL  : UUID can't be found in the LOV's target list
+ *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ *  -EBADF   : The UUID is found, but the OBD is the wrong type (!)
+ *  any >= 0 : is the lov target index
+ */
+static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
+                             enum obd_notify_event ev)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       int index, activate, active;
+       ENTRY;
+
+       CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n",
+              lov, uuid->uuid, ev);
+
+       obd_getref(obd);
+       for (index = 0; index < lov->desc.ld_tgt_count; index++) {
+               tgt = lov->lov_tgts[index];
+               if (!tgt)
+                       continue;
+               /*
+                * LU-642, initially inactive OSC could miss the obd_connect,
+                * we make up for it here.
+                */
+               if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL &&
+                   obd_uuid_equals(uuid, &tgt->ltd_uuid)) {
+                       struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"};
+
+                       obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd,
+                                   &lov_osc_uuid, &lov->lov_ocd, NULL);
+               }
+               if (!tgt->ltd_exp)
+                       continue;
+
+               CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
+                      index, obd_uuid2str(&tgt->ltd_uuid),
+                      tgt->ltd_exp->exp_handle.h_cookie);
+               if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
+                       break;
+       }
+
+       if (index == lov->desc.ld_tgt_count)
+               GOTO(out, index = -EINVAL);
+
+       if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) {
+               activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0;
+
+               if (lov->lov_tgts[index]->ltd_activate == activate) {
+                       CDEBUG(D_INFO, "OSC %s already %sactivate!\n",
+                              uuid->uuid, activate ? "" : "de");
+               } else {
+                       lov->lov_tgts[index]->ltd_activate = activate;
+                       CDEBUG(D_CONFIG, "%sactivate OSC %s\n",
+                              activate ? "" : "de", obd_uuid2str(uuid));
+               }
+
+       } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) {
+               active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0;
+
+               if (lov->lov_tgts[index]->ltd_active == active) {
+                       CDEBUG(D_INFO, "OSC %s already %sactive!\n",
+                              uuid->uuid, active ? "" : "in");
+                       GOTO(out, index);
+               } else {
+                       CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n",
+                              obd_uuid2str(uuid), active ? "" : "in");
+               }
+
+               lov->lov_tgts[index]->ltd_active = active;
+               if (active) {
+                       lov->desc.ld_active_tgt_count++;
+                       lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0;
+               } else {
+                       lov->desc.ld_active_tgt_count--;
+                       lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1;
+               }
+       } else {
+               CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid);
+       }
+
+ out:
+       obd_putref(obd);
+       RETURN(index);
+}
+
+static int lov_notify(struct obd_device *obd, struct obd_device *watched,
+                     enum obd_notify_event ev, void *data)
+{
+       int rc = 0;
+       struct lov_obd *lov = &obd->u.lov;
+       ENTRY;
+
+       down_read(&lov->lov_notify_lock);
+       if (!lov->lov_connects) {
+               up_read(&lov->lov_notify_lock);
+               RETURN(rc);
+       }
+
+       if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE ||
+           ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) {
+               struct obd_uuid *uuid;
+
+               LASSERT(watched);
+
+               if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+                       up_read(&lov->lov_notify_lock);
+                       CERROR("unexpected notification of %s %s!\n",
+                              watched->obd_type->typ_name,
+                              watched->obd_name);
+                       RETURN(-EINVAL);
+               }
+               uuid = &watched->u.cli.cl_target_uuid;
+
+               /* Set OSC as active before notifying the observer, so the
+                * observer can use the OSC normally.
+                */
+               rc = lov_set_osc_active(obd, uuid, ev);
+               if (rc < 0) {
+                       up_read(&lov->lov_notify_lock);
+                       CERROR("event(%d) of %s failed: %d\n", ev,
+                              obd_uuid2str(uuid), rc);
+                       RETURN(rc);
+               }
+               /* activate/deactivate events pass the lov target index as data */
+               data = &rc;
+       }
+
+       /* Pass the notification up the chain. */
+       if (watched) {
+               rc = obd_notify_observer(obd, watched, ev, data);
+       } else {
+               /* NULL watched means all osc's in the lov (only for syncs) */
+               /* sync events should send the lov target index as data */
+               struct lov_obd *lov = &obd->u.lov;
+               int i, is_sync;
+
+               data = &i;
+               is_sync = (ev == OBD_NOTIFY_SYNC) ||
+                         (ev == OBD_NOTIFY_SYNC_NONBLOCK);
+
+               obd_getref(obd);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+
+                       /* don't send sync event if target not
+                        * connected/activated */
+                       if (is_sync &&  !lov->lov_tgts[i]->ltd_active)
+                               continue;
+
+                       rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd,
+                                                ev, data);
+                       if (rc) {
+                               CERROR("%s: notify %s of %s failed %d\n",
+                                      obd->obd_name,
+                                      obd->obd_observer->obd_name,
+                                      lov->lov_tgts[i]->ltd_obd->obd_name,
+                                      rc);
+                       }
+               }
+               obd_putref(obd);
+       }
+
+       up_read(&lov->lov_notify_lock);
+       RETURN(rc);
+}
+
+static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                         __u32 index, int gen, int active)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       struct obd_device *tgt_obd;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n",
+              uuidp->uuid, index, gen, active);
+
+       if (gen <= 0) {
+               CERROR("request to add OBD %s with invalid generation: %d\n",
+                      uuidp->uuid, gen);
+               RETURN(-EINVAL);
+       }
+
+       tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME,
+                                       &obd->obd_uuid);
+       if (tgt_obd == NULL)
+               RETURN(-EINVAL);
+
+       mutex_lock(&lov->lov_lock);
+
+       if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) {
+               tgt = lov->lov_tgts[index];
+               CERROR("UUID %s already assigned at LOV target index %d\n",
+                      obd_uuid2str(&tgt->ltd_uuid), index);
+               mutex_unlock(&lov->lov_lock);
+               RETURN(-EEXIST);
+       }
+
+       if (index >= lov->lov_tgt_size) {
+               /* We need to reallocate the lov target array. */
+               struct lov_tgt_desc **newtgts, **old = NULL;
+               __u32 newsize, oldsize = 0;
+
+               newsize = max(lov->lov_tgt_size, (__u32)2);
+               while (newsize < index + 1)
+                       newsize = newsize << 1;
+               OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize);
+               if (newtgts == NULL) {
+                       mutex_unlock(&lov->lov_lock);
+                       RETURN(-ENOMEM);
+               }
+
+               if (lov->lov_tgt_size) {
+                       memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) *
+                              lov->lov_tgt_size);
+                       old = lov->lov_tgts;
+                       oldsize = lov->lov_tgt_size;
+               }
+
+               lov->lov_tgts = newtgts;
+               lov->lov_tgt_size = newsize;
+               smp_rmb();
+               if (old)
+                       OBD_FREE(old, sizeof(*old) * oldsize);
+
+               CDEBUG(D_CONFIG, "tgts: %p size: %d\n",
+                      lov->lov_tgts, lov->lov_tgt_size);
+       }
+
+       OBD_ALLOC_PTR(tgt);
+       if (!tgt) {
+               mutex_unlock(&lov->lov_lock);
+               RETURN(-ENOMEM);
+       }
+
+       rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+       if (rc) {
+               mutex_unlock(&lov->lov_lock);
+               OBD_FREE_PTR(tgt);
+               RETURN(rc);
+       }
+
+       tgt->ltd_uuid = *uuidp;
+       tgt->ltd_obd = tgt_obd;
+       /* XXX - add a sanity check on the generation number. */
+       tgt->ltd_gen = gen;
+       tgt->ltd_index = index;
+       tgt->ltd_activate = active;
+       lov->lov_tgts[index] = tgt;
+       if (index >= lov->desc.ld_tgt_count)
+               lov->desc.ld_tgt_count = index + 1;
+
+       mutex_unlock(&lov->lov_lock);
+
+       CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
+               index, tgt->ltd_gen, lov->desc.ld_tgt_count);
+
+       rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index);
+
+       if (lov->lov_connects == 0) {
+               /* lov_connect hasn't been called yet. We'll do the
+                  lov_connect_obd on this target when that fn first runs,
+                  because we don't know the connect flags yet. */
+               RETURN(0);
+       }
+
+       obd_getref(obd);
+
+       rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
+       if (rc)
+               GOTO(out, rc);
+
+       /* administratively disabled OST: no export to notify */
+       if (!tgt->ltd_exp)
+               GOTO(out, rc = 0);
+
+       if (lov->lov_cache != NULL) {
+               rc = obd_set_info_async(NULL, tgt->ltd_exp,
+                               sizeof(KEY_CACHE_SET), KEY_CACHE_SET,
+                               sizeof(struct cl_client_cache), lov->lov_cache,
+                               NULL);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
+                       active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE,
+                       (void *)&index);
+
+out:
+       if (rc) {
+               CERROR("add failed (%d), deleting %s\n", rc,
+                      obd_uuid2str(&tgt->ltd_uuid));
+               lov_del_target(obd, index, 0, 0);
+       }
+       obd_putref(obd);
+       RETURN(rc);
+}
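
The reallocation inside lov_add_target() grows the target array geometrically: start at two slots and double until the requested index fits, giving amortized O(1) growth over repeated additions. A standalone sketch of the same policy:

/* Sketch of the doubling growth of the lov target array. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        void **tgts = NULL;
        unsigned size = 0, index = 37;

        if (index >= size) {
                void **newtgts, **old = tgts;
                unsigned newsize = size > 2 ? size : 2;

                while (newsize < index + 1)
                        newsize <<= 1;          /* double until it fits */
                newtgts = calloc(newsize, sizeof(*newtgts));
                if (!newtgts)
                        return 1;
                if (size)
                        memcpy(newtgts, old, size * sizeof(*newtgts));
                tgts = newtgts;
                size = newsize;
                free(old);
        }
        printf("index %u fits in %u slots\n", index, size);
        free(tgts);
        return 0;
}
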
+
+/* Schedule a target for deletion */
+int lov_del_target(struct obd_device *obd, __u32 index,
+                  struct obd_uuid *uuidp, int gen)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       int count = lov->desc.ld_tgt_count;
+       int rc = 0;
+       ENTRY;
+
+       if (index >= count) {
+               CERROR("LOV target index %d >= number of LOV OBDs %d.\n",
+                      index, count);
+               RETURN(-EINVAL);
+       }
+
+       /* to make sure there's no ongoing lov_notify() now */
+       down_write(&lov->lov_notify_lock);
+       obd_getref(obd);
+
+       if (!lov->lov_tgts[index]) {
+               CERROR("LOV target at index %d is not setup.\n", index);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) {
+               CERROR("LOV target UUID %s at index %d doesn't match %s.\n",
+                      lov_uuid2str(lov, index), index,
+                      obd_uuid2str(uuidp));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n",
+              lov_uuid2str(lov, index), index,
+              lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp,
+              lov->lov_tgts[index]->ltd_active);
+
+       lov->lov_tgts[index]->ltd_reap = 1;
+       lov->lov_death_row++;
+       /* the actual deletion happens in obd_putref() */
+out:
+       obd_putref(obd);
+       up_write(&lov->lov_notify_lock);
+
+       RETURN(rc);
+}
+
+static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt)
+{
+       struct obd_device *osc_obd;
+
+       LASSERT(tgt);
+       LASSERT(tgt->ltd_reap);
+
+       osc_obd = class_exp2obd(tgt->ltd_exp);
+
+       CDEBUG(D_CONFIG, "Removing tgt %s : %s\n",
+              tgt->ltd_uuid.uuid,
+              osc_obd ? osc_obd->obd_name : "<no obd>");
+
+       if (tgt->ltd_exp)
+               lov_disconnect_obd(obd, tgt);
+
+       OBD_FREE_PTR(tgt);
+
+       /* Manual cleanup - no cleanup logs to clean up the osc's.  We must
+          do it ourselves. And we can't do it from lov_cleanup,
+          because we just lost our only reference to it. */
+       if (osc_obd)
+               class_manual_cleanup(osc_obd);
+}
+
+void lov_fix_desc_stripe_size(__u64 *val)
+{
+       if (*val < LOV_MIN_STRIPE_SIZE) {
+               if (*val != 0)
+                       LCONSOLE_INFO("Increasing default stripe size to "
+                                     "minimum %u\n",
+                                     LOV_DEFAULT_STRIPE_SIZE);
+               *val = LOV_DEFAULT_STRIPE_SIZE;
+       } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) {
+               *val &= ~(LOV_MIN_STRIPE_SIZE - 1);
+               LCONSOLE_WARN("Changing default stripe size to "LPU64" (a "
+                             "multiple of %u)\n",
+                             *val, LOV_MIN_STRIPE_SIZE);
+       }
+}
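
The masking in lov_fix_desc_stripe_size() relies on LOV_MIN_STRIPE_SIZE being a power of two, so val &= ~(min - 1) rounds val down to a multiple of it. A quick demonstration with an assumed 64 KiB minimum:

/* Power-of-two round-down, as used above. */
#include <stdio.h>

int main(void)
{
        unsigned long long min = 65536;         /* assumed minimum */
        unsigned long long val = 1000000;       /* not aligned */

        val &= ~(min - 1);                      /* round down */
        printf("%llu (= %llu * %llu)\n", val, val / min, min);
        return 0;
}
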
+
+void lov_fix_desc_stripe_count(__u32 *val)
+{
+       if (*val == 0)
+               *val = 1;
+}
+
+void lov_fix_desc_pattern(__u32 *val)
+{
+       /* from lov_setstripe */
+       if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) {
+               LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val);
+               *val = 0;
+       }
+}
+
+void lov_fix_desc_qos_maxage(__u32 *val)
+{
+       /* fix qos_maxage */
+       if (*val == 0)
+               *val = QOS_DEFAULT_MAXAGE;
+}
+
+void lov_fix_desc(struct lov_desc *desc)
+{
+       lov_fix_desc_stripe_size(&desc->ld_default_stripe_size);
+       lov_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+       lov_fix_desc_pattern(&desc->ld_pattern);
+       lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       struct lov_desc *desc;
+       struct lov_obd *lov = &obd->u.lov;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("LOV setup requires a descriptor\n");
+               RETURN(-EINVAL);
+       }
+
+       desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1);
+
+       if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("descriptor size wrong: %d > %d\n",
+                      (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       if (desc->ld_magic != LOV_DESC_MAGIC) {
+               if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) {
+                       CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n",
+                              obd->obd_name, desc);
+                       lustre_swab_lov_desc(desc);
+               } else {
+                       CERROR("%s: Bad lov desc magic: %#x\n",
+                              obd->obd_name, desc->ld_magic);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       lov_fix_desc(desc);
+
+       desc->ld_active_tgt_count = 0;
+       lov->desc = *desc;
+       lov->lov_tgt_size = 0;
+
+       mutex_init(&lov->lov_lock);
+       atomic_set(&lov->lov_refcount, 0);
+       lov->lov_sp_me = LUSTRE_SP_CLI;
+
+       init_rwsem(&lov->lov_notify_lock);
+
+       lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS,
+                                                  HASH_POOLS_MAX_BITS,
+                                                  HASH_POOLS_BKT_BITS, 0,
+                                                  CFS_HASH_MIN_THETA,
+                                                  CFS_HASH_MAX_THETA,
+                                                  &pool_hash_operations,
+                                                  CFS_HASH_DEFAULT);
+       INIT_LIST_HEAD(&lov->lov_pool_list);
+       lov->lov_pool_count = 0;
+       rc = lov_ost_pool_init(&lov->lov_packed, 0);
+       if (rc)
+               GOTO(out, rc);
+
+       lprocfs_lov_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+#ifdef LPROCFS
+       {
+               int rc;
+
+               rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
+                                       0444, &lov_proc_target_fops, obd);
+               if (rc)
+                       CWARN("Error adding the target_obd file\n");
+       }
+#endif
+       lov->lov_pool_proc_entry = lprocfs_register("pools",
+                                                   obd->obd_proc_entry,
+                                                   NULL, NULL);
+
+       RETURN(0);
+
+out:
+       return rc;
+}
+
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       struct lov_obd *lov = &obd->u.lov;
+
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY: {
+               int i;
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                               continue;
+                       obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp),
+                                      OBD_CLEANUP_EARLY);
+               }
+               break;
+       }
+       case OBD_CLEANUP_EXPORTS:
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int lov_cleanup(struct obd_device *obd)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       struct list_head *pos, *tmp;
+       struct pool_desc *pool;
+       ENTRY;
+
+       list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
+               pool = list_entry(pos, struct pool_desc, pool_list);
+               /* free pool structs */
+               CDEBUG(D_INFO, "delete pool %p\n", pool);
+               /* In the function below, .hs_keycmp resolves to
+                * pool_hashkey_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               lov_pool_del(obd, pool->pool_name);
+       }
+       cfs_hash_putref(lov->lov_pools_hash_body);
+       lov_ost_pool_free(&lov->lov_packed);
+
+       lprocfs_obd_cleanup(obd);
+       if (lov->lov_tgts) {
+               int i;
+               obd_getref(obd);
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+
+                       /* Inactive targets may never have connected */
+                       if (lov->lov_tgts[i]->ltd_active ||
+                           atomic_read(&lov->lov_refcount))
+                               /* We should never get here - these
+                                * should have been removed in the
+                                * disconnect. */
+                               CERROR("lov tgt %d not cleaned!"
+                                      " deathrow=%d, lovrc=%d\n",
+                                      i, lov->lov_death_row,
+                                      atomic_read(&lov->lov_refcount));
+                       lov_del_target(obd, i, 0, 0);
+               }
+               obd_putref(obd);
+               OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
+                        lov->lov_tgt_size);
+               lov->lov_tgt_size = 0;
+       }
+       RETURN(0);
+}
+
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                           __u32 *indexp, int *genp)
+{
+       struct obd_uuid obd_uuid;
+       int cmd;
+       int rc = 0;
+       ENTRY;
+
+       switch (cmd = lcfg->lcfg_command) {
+       case LCFG_LOV_ADD_OBD:
+       case LCFG_LOV_ADD_INA:
+       case LCFG_LOV_DEL_OBD: {
+               __u32 index;
+               int gen;
+               /* lov_modify_tgts add  0:lov_mdsA  1:ost1_UUID  2:0  3:1 */
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid))
+                       GOTO(out, rc = -EINVAL);
+
+               obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
+
+               if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1)
+                       GOTO(out, rc = -EINVAL);
+               if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
+                       GOTO(out, rc = -EINVAL);
+               index = *indexp;
+               gen = *genp;
+               if (cmd == LCFG_LOV_ADD_OBD)
+                       rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
+               else if (cmd == LCFG_LOV_ADD_INA)
+                       rc = lov_add_target(obd, &obd_uuid, index, gen, 0);
+               else
+                       rc = lov_del_target(obd, index, &obd_uuid, gen);
+               GOTO(out, rc);
+       }
+       case LCFG_PARAM: {
+               struct lprocfs_static_vars lvars = { 0 };
+               struct lov_desc *desc = &(obd->u.lov.desc);
+
+               if (!desc)
+                       GOTO(out, rc = -EINVAL);
+
+               lprocfs_lov_init_vars(&lvars);
+
+               rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+       case LCFG_POOL_NEW:
+       case LCFG_POOL_ADD:
+       case LCFG_POOL_DEL:
+       case LCFG_POOL_REM:
+               GOTO(out, rc);
+
+       default: {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+
+       }
+       }
+out:
+       RETURN(rc);
+}
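
A sketch of the LCFG_LOV_ADD_OBD argument parsing above: buffer 1 carries the OST UUID, buffer 2 the target index, buffer 3 the generation, matching the "lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1" layout noted in the code. The bufs[] array is a hypothetical stand-in for the lustre_cfg buffers:

/* Sketch of parsing the add-target config record. */
#include <stdio.h>

int main(void)
{
        const char *bufs[] = { "lov_mdsA", "ost1_UUID", "0", "1" };
        unsigned index;
        int gen;

        if (sscanf(bufs[2], "%u", &index) != 1)
                return 1;
        if (sscanf(bufs[3], "%d", &gen) != 1)
                return 1;
        printf("add %s at index %u, generation %d\n",
               bufs[1], index, gen);
        return 0;
}
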
+
+static int lov_recreate(struct obd_export *exp, struct obdo *src_oa,
+                       struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+       struct lov_stripe_md *obj_mdp, *lsm;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       unsigned ost_idx;
+       int rc, i;
+       ENTRY;
+
+       LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS &&
+               src_oa->o_flags & OBD_FL_RECREATE_OBJS);
+
+       OBD_ALLOC(obj_mdp, sizeof(*obj_mdp));
+       if (obj_mdp == NULL)
+               RETURN(-ENOMEM);
+
+       ost_idx = src_oa->o_nlink;
+       lsm = *ea;
+       if (lsm == NULL)
+               GOTO(out, rc = -EINVAL);
+       if (ost_idx >= lov->desc.ld_tgt_count ||
+           !lov->lov_tgts[ost_idx])
+               GOTO(out, rc = -EINVAL);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (lsm->lsm_oinfo[i]->loi_ost_idx == ost_idx) {
+                       if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) !=
+                                       ostid_id(&src_oa->o_oi))
+                               GOTO(out, rc = -EINVAL);
+                       break;
+               }
+       }
+       if (i == lsm->lsm_stripe_count)
+               GOTO(out, rc = -EINVAL);
+
+       rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp,
+                       src_oa, &obj_mdp, oti);
+out:
+       OBD_FREE(obj_mdp, sizeof(*obj_mdp));
+       RETURN(rc);
+}
+
+/* the LOV expects oa->o_id to be set to the LOV object id */
+static int lov_create(const struct lu_env *env, struct obd_export *exp,
+                     struct obdo *src_oa, struct lov_stripe_md **ea,
+                     struct obd_trans_info *oti)
+{
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ea != NULL);
+       if (exp == NULL)
+               RETURN(-EINVAL);
+
+       if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+           src_oa->o_flags == OBD_FL_DELORPHAN) {
+               /* should not be used with LOV anymore */
+               LBUG();
+       }
+
+       lov = &exp->exp_obd->u.lov;
+       if (!lov->desc.ld_active_tgt_count)
+               RETURN(-EIO);
+
+       obd_getref(exp->exp_obd);
+       /* Recreate a specific object id at the given OST index */
+       if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
+           (src_oa->o_flags & OBD_FL_RECREATE_OBJS))
+               rc = lov_recreate(exp, src_oa, ea, oti);
+
+       obd_putref(exp->exp_obd);
+       RETURN(rc);
+}
+
+#define ASSERT_LSM_MAGIC(lsmp)                                         \
+do {                                                                   \
+       LASSERT((lsmp) != NULL);                                        \
+       LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||                  \
+                 (lsmp)->lsm_magic == LOV_MAGIC_V3),                   \
+                "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic);      \
+} while (0)
+
+static int lov_destroy(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md *lsm,
+                      struct obd_trans_info *oti, struct obd_export *md_exp,
+                      void *capa)
+{
+       struct lov_request_set *set;
+       struct obd_info oinfo;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0, err = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       if (oa->o_valid & OBD_MD_FLCOOKIE) {
+               LASSERT(oti);
+               LASSERT(oti->oti_logcookies);
+       }
+
+       lov = &exp->exp_obd->u.lov;
+       obd_getref(exp->exp_obd);
+       rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
+       if (rc)
+               GOTO(out, rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (oa->o_valid & OBD_MD_FLCOOKIE)
+                       oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+               err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                 req->rq_oi.oi_oa, NULL, oti, NULL, capa);
+               err = lov_update_common_set(set, req, err);
+               if (err) {
+                       CERROR("%s: destroying objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+
+       if (rc == 0) {
+               LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+               rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+       }
+       err = lov_fini_destroy_set(set);
+out:
+       obd_putref(exp->exp_obd);
+       RETURN(rc ? rc : err);
+}
+
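+/*
+ * Fan a getattr out to the stripes of the object: one obd_getattr() per
+ * sub-request, with per-stripe results folded back into the set by
+ * lov_update_common_set() and finalized in lov_fini_getattr_set().
+ */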
+static int lov_getattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+
+       rc = lov_prep_getattr_set(exp, oinfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+               rc = obd_getattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi);
+               err = lov_update_common_set(set, req, rc);
+               if (err) {
+                       CERROR("%s: getattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&oinfo->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, err);
+                       break;
+               }
+       }
+
+       rc = lov_fini_getattr_set(set);
+       if (err)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       /* don't do the attribute merge if this async op failed */
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_getattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                             struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *lovset;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0, err;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+
+       rc = lov_prep_getattr_set(exp, oinfo, &lovset);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+              POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count,
+              oinfo->oi_md->lsm_stripe_size);
+
+       list_for_each(pos, &lovset->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      "%u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+               rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                      &req->rq_oi, rqset);
+               if (rc) {
+                       CERROR("%s: getattr objid "DOSTID" subobj"
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&oinfo->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, rc);
+                       GOTO(out, rc);
+               }
+       }
+
+       if (!list_empty(&rqset->set_requests)) {
+               LASSERT(rc == 0);
+               LASSERT(rqset->set_interpret == NULL);
+               rqset->set_interpret = lov_getattr_interpret;
+               rqset->set_arg = (void *)lovset;
+               RETURN(rc);
+       }
+out:
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_getattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+static int lov_setattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       /* for now, we only expect the following updates here */
+       LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE |
+                                           OBD_MD_FLMODE | OBD_MD_FLATIME |
+                                           OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                                           OBD_MD_FLFLAGS | OBD_MD_FLSIZE |
+                                           OBD_MD_FLGROUP | OBD_MD_FLUID |
+                                           OBD_MD_FLGID | OBD_MD_FLFID |
+                                           OBD_MD_FLGENER)));
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_setattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi, NULL);
+               err = lov_update_setattr_set(set, req, rc);
+               if (err) {
+                       CERROR("%s: setattr objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+                              err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       err = lov_fini_setattr_set(set);
+       if (!rc)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_setattr_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_setattr_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+/* If @oti is given, the request comes from the MDS and responses from the
+ * OSTs are not needed.  Otherwise, a client is waiting for responses. */
+static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct obd_trans_info *oti,
+                            struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+       if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) {
+               LASSERT(oti);
+               LASSERT(oti->oti_logcookies);
+       }
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_setattr_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n",
+              POSTID(&oinfo->oi_md->lsm_oi),
+              oinfo->oi_md->lsm_stripe_count,
+              oinfo->oi_md->lsm_stripe_size);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+                       oti->oti_logcookies = set->set_cookies + req->rq_stripe;
+
+               CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx"
+                      "%u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe,
+                      POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx);
+
+               rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                      &req->rq_oi, oti, rqset);
+               if (rc) {
+                       CERROR("error: setattr objid "DOSTID" subobj"
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi),
+                              req->rq_idx, rc);
+                       break;
+               }
+       }
+
+       /* If we are not waiting for responses on async requests, return. */
+       if (rc || !rqset || list_empty(&rqset->set_requests)) {
+               int err;
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               err = lov_fini_setattr_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_setattr_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
+static int lov_punch_interpret(struct ptlrpc_request_set *rqset,
+                              void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_punch_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
+/* FIXME: maybe we'll just make one node the authoritative attribute node, then
+ * we can send this 'punch' to just the authoritative node and the nodes
+ * that the punch will affect. */
+static int lov_punch(const struct lu_env *env, struct obd_export *exp,
+                    struct obd_info *oinfo, struct obd_trans_info *oti,
+                    struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_punch_set(exp, oinfo, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_punch(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                              &req->rq_oi, NULL, rqset);
+               if (rc) {
+                       CERROR("%s: punch objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, rc);
+                       break;
+               }
+       }
+
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err;
+               err = lov_fini_punch_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_punch_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
+static int lov_sync_interpret(struct ptlrpc_request_set *rqset,
+                             void *data, int rc)
+{
+       struct lov_request_set *lovset = data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+       err = lov_fini_sync_set(lovset);
+       RETURN(rc ?: err);
+}
+
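+/*
+ * Fan an fsync of the given byte range out to the affected stripes;
+ * completion of the async sub-requests is funnelled through
+ * lov_sync_interpret().
+ */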
+static int lov_sync(const struct lu_env *env, struct obd_export *exp,
+                   struct obd_info *oinfo, obd_off start, obd_off end,
+                   struct ptlrpc_request_set *rqset)
+{
+       struct lov_request_set *set = NULL;
+       struct lov_obd *lov;
+       struct list_head *pos;
+       struct lov_request *req;
+       int rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+       LASSERT(rqset != NULL);
+
+       if (!exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_sync_set(exp, oinfo, start, end, &set);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_INFO, "fsync objid "DOSTID" ["LPX64", "LPX64"]\n",
+              POSTID(&set->set_oi->oi_oa->o_oi), start, end);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_sync(env, lov->lov_tgts[req->rq_idx]->ltd_exp,
+                             &req->rq_oi, req->rq_oi.oi_policy.l_extent.start,
+                             req->rq_oi.oi_policy.l_extent.end, rqset);
+               if (rc) {
+                       CERROR("%s: fsync objid "DOSTID" subobj "DOSTID
+                              " on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name,
+                              POSTID(&set->set_oi->oi_oa->o_oi),
+                              POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx,
+                              rc);
+                       break;
+               }
+       }
+
+       /* If we are not waiting for responses on async requests, return. */
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err = lov_fini_sync_set(set);
+
+               RETURN(rc ?: err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_sync_interpret;
+       rqset->set_arg = (void *)set;
+
+       RETURN(0);
+}
+
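+/*
+ * OBD_BRW_CHECK preflight for lov_brw(): for each page, make sure the
+ * OST backing the stripe the page falls on is present and active, and
+ * let the target itself veto the I/O via obd_brw(OBD_BRW_CHECK).
+ */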
+static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
+                        obd_count oa_bufs, struct brw_page *pga)
+{
+       struct obd_info oinfo = { { { 0 } } };
+       int i, rc = 0;
+
+       oinfo.oi_oa = lov_oinfo->oi_oa;
+
+       /* The caller just wants to know if there's a chance that this
+        * I/O can succeed */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(lov_oinfo->oi_md, pga[i].off);
+               int ost = lov_oinfo->oi_md->lsm_oinfo[stripe]->loi_ost_idx;
+               obd_off start, end;
+
+               if (!lov_stripe_intersects(lov_oinfo->oi_md, stripe, pga[i].off,
+                                          pga[i].off + pga[i].count - 1,
+                                          &start, &end))
+                       continue;
+
+               if (!lov->lov_tgts[ost] || !lov->lov_tgts[ost]->ltd_active) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", ost);
+                       return -EIO;
+               }
+
+               rc = obd_brw(OBD_BRW_CHECK, lov->lov_tgts[ost]->ltd_exp, &oinfo,
+                            1, &pga[i], NULL);
+               if (rc)
+                       break;
+       }
+       return rc;
+}
+
+static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+                  obd_count oa_bufs, struct brw_page *pga,
+                  struct obd_trans_info *oti)
+{
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int err, rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
+
+       if (cmd == OBD_BRW_CHECK) {
+               rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
+               RETURN(rc);
+       }
+
+       rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               struct obd_export *sub_exp;
+               struct brw_page *sub_pga;
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp;
+               sub_pga = set->set_pga + req->rq_pgaidx;
+               rc = obd_brw(cmd, sub_exp, &req->rq_oi, req->rq_oabufs,
+                            sub_pga, oti);
+               if (rc)
+                       break;
+               lov_update_common_set(set, req, rc);
+       }
+
+       err = lov_fini_brw_set(set);
+       if (!rc)
+               rc = err;
+       RETURN(rc);
+}
+
+static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
+                                void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       ENTRY;
+       rc = lov_fini_enqueue_set(lovset, lovset->set_ei->ei_mode, rc, rqset);
+       RETURN(rc);
+}
+
+static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+                      struct ldlm_enqueue_info *einfo,
+                      struct ptlrpc_request_set *rqset)
+{
+       ldlm_mode_t mode = einfo->ei_mode;
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       ldlm_error_t rc;
+       ENTRY;
+
+       LASSERT(oinfo);
+       ASSERT_LSM_MAGIC(oinfo->oi_md);
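+       /* a single lock mode must be given: mode & -mode isolates the
+        * lowest set bit, so the equality holds only for powers of two */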
+       LASSERT(mode == (mode & -mode));
+
+       /* we should never be asked to replay a lock this way. */
+       LASSERT((oinfo->oi_flags & LDLM_FL_REPLAY) == 0);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               rc = obd_enqueue(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                &req->rq_oi, einfo, rqset);
+               if (rc != ELDLM_OK)
+                       GOTO(out, rc);
+       }
+
+       if (rqset && !list_empty(&rqset->set_requests)) {
+               LASSERT(rc == 0);
+               LASSERT(rqset->set_interpret == NULL);
+               rqset->set_interpret = lov_enqueue_interpret;
+               rqset->set_arg = (void *)set;
+               RETURN(rc);
+       }
+out:
+       rc = lov_fini_enqueue_set(set, mode, rc, rqset);
+       RETURN(rc);
+}
+
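+/*
+ * Walk the stripes and let @it update the callback data of each stripe's
+ * ldlm locks; submd is a zero-stripe dummy lsm describing the sub-object
+ * on that OST.
+ */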
+static int lov_change_cbdata(struct obd_export *exp,
+                            struct lov_stripe_md *lsm, ldlm_iterator_t it,
+                            void *data)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+                                      &submd, it, data);
+       }
+       RETURN(rc);
+}
+
+/* Find any ldlm lock of the inode in the lov.
+ * Return 0 if none is found, 1 if one is found, and < 0 on error. */
+static int lov_find_cbdata(struct obd_export *exp,
+                          struct lov_stripe_md *lsm, ldlm_iterator_t it,
+                          void *data)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               if (!lov->lov_tgts[loi->loi_ost_idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
+                       continue;
+               }
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
+                                    &submd, it, data);
+               if (rc != 0)
+                       RETURN(rc);
+       }
+       RETURN(rc);
+}
+
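+/*
+ * Cancel the per-stripe locks recorded in *lockh; each sub-lock handle
+ * sits in set->set_lockh at the stripe's offset.  The last sub-cancel
+ * error, if any, is what gets returned.
+ */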
+static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       struct lov_request_set *set;
+       struct obd_info oinfo;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       struct lustre_handle *lov_lockhp;
+       int err = 0, rc = 0;
+       ENTRY;
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       LASSERT(lockh);
+       lov = &exp->exp_obd->u.lov;
+       rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+               lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+
+               rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                               req->rq_oi.oi_md, mode, lov_lockhp);
+               rc = lov_update_common_set(set, req, rc);
+               if (rc) {
+                       CERROR("%s: cancel objid "DOSTID" subobj "
+                              DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&req->rq_oi.oi_md->lsm_oi),
+                              req->rq_idx, rc);
+                       err = rc;
+               }
+       }
+       lov_fini_cancel_set(set);
+       RETURN(err);
+}
+
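+/*
+ * With lsm == NULL, flush unused locks on every configured target;
+ * otherwise only the stripes of the given object are walked.
+ */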
+static int lov_cancel_unused(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            ldlm_cancel_flags_t flags, void *opaque)
+{
+       struct lov_obd *lov;
+       int rc = 0, i;
+       ENTRY;
+
+       if (!exp || !exp->exp_obd)
+               RETURN(-ENODEV);
+
+       lov = &exp->exp_obd->u.lov;
+       if (lsm == NULL) {
+               for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+                       int err;
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+                               continue;
+
+                       err = obd_cancel_unused(lov->lov_tgts[i]->ltd_exp, NULL,
+                                               flags, opaque);
+                       if (!rc)
+                               rc = err;
+               }
+               RETURN(rc);
+       }
+
+       ASSERT_LSM_MAGIC(lsm);
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_stripe_md submd;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               int idx = loi->loi_ost_idx;
+               int err;
+
+               if (!lov->lov_tgts[idx]) {
+                       CDEBUG(D_HA, "lov idx %d NULL\n", idx);
+                       continue;
+               }
+
+               if (!lov->lov_tgts[idx]->ltd_active)
+                       CDEBUG(D_HA, "lov idx %d inactive\n", idx);
+
+               submd.lsm_oi = loi->loi_oi;
+               submd.lsm_stripe_count = 0;
+               err = obd_cancel_unused(lov->lov_tgts[idx]->ltd_exp,
+                                       &submd, flags, opaque);
+               if (err && lov->lov_tgts[idx]->ltd_active) {
+                       CERROR("%s: cancel unused objid "DOSTID
+                              " subobj "DOSTID" on OST idx %d: rc = %d\n",
+                              exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi),
+                              POSTID(&loi->loi_oi), idx, err);
+                       if (!rc)
+                               rc = err;
+               }
+       }
+       RETURN(rc);
+}
+
+int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc)
+{
+       struct lov_request_set *lovset = (struct lov_request_set *)data;
+       int err;
+       ENTRY;
+
+       if (rc)
+               atomic_set(&lovset->set_completes, 0);
+
+       err = lov_fini_statfs_set(lovset);
+       RETURN(rc ? rc : err);
+}
+
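+/*
+ * Aggregate statfs: one obd_statfs_async() per target, with completion
+ * funnelled through lov_statfs_interpret(), which finalizes the set.
+ */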
+static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo,
+                           __u64 max_age, struct ptlrpc_request_set *rqset)
+{
+       struct obd_device      *obd = class_exp2obd(exp);
+       struct lov_request_set *set;
+       struct lov_request *req;
+       struct list_head *pos;
+       struct lov_obd *lov;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oinfo != NULL);
+       LASSERT(oinfo->oi_osfs != NULL);
+
+       lov = &obd->u.lov;
+       rc = lov_prep_statfs_set(obd, oinfo, &set);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each(pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+               rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                     &req->rq_oi, max_age, rqset);
+               if (rc)
+                       break;
+       }
+
+       if (rc || list_empty(&rqset->set_requests)) {
+               int err;
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               err = lov_fini_statfs_set(set);
+               RETURN(rc ? rc : err);
+       }
+
+       LASSERT(rqset->set_interpret == NULL);
+       rqset->set_interpret = lov_statfs_interpret;
+       rqset->set_arg = (void *)set;
+       RETURN(0);
+}
+
+static int lov_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct ptlrpc_request_set *set = NULL;
+       struct obd_info oinfo = { { { 0 } } };
+       int rc = 0;
+       ENTRY;
+
+       /* for obdclass we forbid using obd_statfs_rqset; issue async statfs
+        * requests through a private request set instead */
+       set = ptlrpc_prep_set();
+       if (set == NULL)
+               RETURN(-ENOMEM);
+
+       oinfo.oi_osfs = osfs;
+       oinfo.oi_flags = flags;
+       rc = lov_statfs_async(exp, &oinfo, max_age, set);
+       if (rc == 0)
+               rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+
+       RETURN(rc);
+}
+
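+/*
+ * Dispatch ioctls.  A few are decoded here (per-OST statfs, LOV config
+ * dump, stripe get/set, quotactl routed to a single target); everything
+ * else is broadcast to all connected targets, and -EIO is returned if
+ * no target handled the command.
+ */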
+static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       int i = 0, rc = 0, count = lov->desc.ld_tgt_count;
+       struct obd_uuid *uuidp;
+       ENTRY;
+
+       switch (cmd) {
+       case IOC_OBD_STATFS: {
+               struct obd_ioctl_data *data = karg;
+               struct obd_device *osc_obd;
+               struct obd_statfs stat_buf = {0};
+               __u32 index;
+               __u32 flags;
+
+               memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
+               if (index >= count)
+                       RETURN(-ENODEV);
+
+               if (!lov->lov_tgts[index])
+                       /* Try again with the next index */
+                       RETURN(-EAGAIN);
+               if (!lov->lov_tgts[index]->ltd_active)
+                       RETURN(-ENODATA);
+
+               osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp);
+               if (!osc_obd)
+                       RETURN(-EINVAL);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       RETURN(-EFAULT);
+
+               flags = uarg ? *(__u32 *)uarg : 0;
+               /* got statfs data */
+               rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               flags);
+               if (rc)
+                       RETURN(rc);
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       RETURN(-EFAULT);
+               break;
+       }
+       case OBD_IOC_LOV_GET_CONFIG: {
+               struct obd_ioctl_data *data;
+               struct lov_desc *desc;
+               char *buf = NULL;
+               __u32 *genp;
+
+               len = 0;
+               if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+                       RETURN(-EINVAL);
+
+               data = (struct obd_ioctl_data *)buf;
+
+               if (sizeof(*desc) > data->ioc_inllen1) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               if (sizeof(__u32) * count > data->ioc_inllen3) {
+                       obd_ioctl_freedata(buf, len);
+                       RETURN(-EINVAL);
+               }
+
+               desc = (struct lov_desc *)data->ioc_inlbuf1;
+               memcpy(desc, &(lov->desc), sizeof(*desc));
+
+               uuidp = (struct obd_uuid *)data->ioc_inlbuf2;
+               genp = (__u32 *)data->ioc_inlbuf3;
+               /* the uuid will be empty for deleted OSTs */
+               for (i = 0; i < count; i++, uuidp++, genp++) {
+                       if (!lov->lov_tgts[i])
+                               continue;
+                       *uuidp = lov->lov_tgts[i]->ltd_uuid;
+                       *genp = lov->lov_tgts[i]->ltd_gen;
+               }
+
+               if (copy_to_user((void *)uarg, buf, len))
+                       rc = -EFAULT;
+               obd_ioctl_freedata(buf, len);
+               break;
+       }
+       case LL_IOC_LOV_SETSTRIPE:
+               rc = lov_setstripe(exp, len, karg, uarg);
+               break;
+       case LL_IOC_LOV_GETSTRIPE:
+               rc = lov_getstripe(exp, karg, uarg);
+               break;
+       case LL_IOC_LOV_SETEA:
+               rc = lov_setea(exp, karg, uarg);
+               break;
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct lov_tgt_desc *tgt = NULL;
+               struct obd_quotactl *oqctl;
+
+               if (qctl->qc_valid == QC_OSTIDX) {
+                       if (qctl->qc_idx < 0 || count <= qctl->qc_idx)
+                               RETURN(-EINVAL);
+
+                       tgt = lov->lov_tgts[qctl->qc_idx];
+                       if (!tgt || !tgt->ltd_exp)
+                               RETURN(-EINVAL);
+               } else if (qctl->qc_valid == QC_UUID) {
+                       for (i = 0; i < count; i++) {
+                               tgt = lov->lov_tgts[i];
+                               if (!tgt ||
+                                   !obd_uuid_equals(&tgt->ltd_uuid,
+                                                    &qctl->obd_uuid))
+                                       continue;
+
+                               if (tgt->ltd_exp == NULL)
+                                       RETURN(-EINVAL);
+
+                               break;
+                       }
+               } else {
+                       RETURN(-EINVAL);
+               }
+
+               if (i >= count)
+                       RETURN(-EAGAIN);
+
+               LASSERT(tgt && tgt->ltd_exp);
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_OSTIDX;
+                       qctl->obd_uuid = tgt->ltd_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       default: {
+               int set = 0;
+
+               if (count == 0)
+                       RETURN(-ENOTTY);
+
+               for (i = 0; i < count; i++) {
+                       int err;
+                       struct obd_device *osc_obd;
+
+                       /* OST was disconnected */
+                       if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
+                               continue;
+
+                       /* ll_umount_begin() sets the force flag on the lov,
+                        * but not on the osc.  Pass it through. */
+                       osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+                       osc_obd->obd_force = obddev->obd_force;
+                       err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp,
+                                           len, karg, uarg);
+                       if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+                               RETURN(err);
+                       } else if (err) {
+                               if (lov->lov_tgts[i]->ltd_active) {
+                                       CDEBUG(err == -ENOTTY ?
+                                              D_IOCTL : D_WARNING,
+                                              "iocontrol OSC %s on OST "
+                                              "idx %d cmd %x: err = %d\n",
+                                              lov_uuid2str(lov, i),
+                                              i, cmd, err);
+                                       if (!rc)
+                                               rc = err;
+                               }
+                       } else {
+                               set = 1;
+                       }
+               }
+               if (!set && !rc)
+                       rc = -EIO;
+       }
+       }
+
+       RETURN(rc);
+}
+
+#define FIEMAP_BUFFER_SIZE 4096
+
+/**
+ * A non-zero fe_logical in the first extent signals a continuation FIEMAP
+ * call.  The local end offset and the device of the previous mapping are
+ * passed in through that first fm_extent; the stripe number on which
+ * mapping is to be restarted is derived from the OST index stored in
+ * fe_device.
+ *
+ * The return value fm_end_offset is the in-OST offset at which mapping
+ * should be restarted.  If fm_end_offset == 0 is returned then the caller
+ * will re-calculate the proper offset in the next stripe.
+ * Note that the first extent is passed to lov_get_info via the value field.
+ *
+ * \param fiemap fiemap request header
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe the stripe to restart the mapping from is returned
+ *       here
+ */
+obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap,
+                                  struct lov_stripe_md *lsm, obd_size fm_start,
+                                  obd_size fm_end, int *start_stripe)
+{
+       obd_size local_end = fiemap->fm_extents[0].fe_logical;
+       obd_off lun_start, lun_end;
+       obd_size fm_end_offset;
+       int stripe_no = -1, i;
+
+       if (fiemap->fm_extent_count == 0 ||
+           fiemap->fm_extents[0].fe_logical == 0)
+               return 0;
+
+       /* Find out stripe_no from ost_index saved in the fe_device */
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (lsm->lsm_oinfo[i]->loi_ost_idx ==
+                                       fiemap->fm_extents[0].fe_device) {
+                       stripe_no = i;
+                       break;
+               }
+       }
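+       /* note: -EINVAL travels back through the unsigned obd_size return
+        * value; the caller compares against -EINVAL explicitly */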
+       if (stripe_no == -1)
+               return -EINVAL;
+
+       /* If we have finished mapping on previous device, shift logical
+        * offset to start of next device */
+       if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
+                                  &lun_start, &lun_end)) != 0 &&
+                                  local_end < lun_end) {
+               fm_end_offset = local_end;
+               *start_stripe = stripe_no;
+       } else {
+               /* This is a special value to indicate that caller should
+                * calculate offset in next stripe. */
+               fm_end_offset = 0;
+               *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
+       }
+
+       return fm_end_offset;
+}
+
+/**
+ * We calculate on which OST the mapping will end.  If the length of the
+ * mapping is greater than (stripe_size * stripe_count) then last_stripe
+ * will be the one just before start_stripe.  Otherwise we check whether
+ * the mapping intersects each OST and find last_stripe.
+ * This function returns the last_stripe and also sets the stripe_count
+ * over which the mapping is spread.
+ *
+ * \param lsm striping information for the file
+ * \param fm_start logical start of mapping
+ * \param fm_end logical end of mapping
+ * \param start_stripe starting stripe of the mapping
+ * \param stripe_count the number of stripes spanned by the mapping is
+ *       returned here
+ *
+ * \retval last_stripe return the last stripe of the mapping
+ */
+int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start,
+                           obd_size fm_end, int start_stripe,
+                           int *stripe_count)
+{
+       int last_stripe;
+       obd_off obd_start, obd_end;
+       int i, j;
+
+       if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
+               last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
+                                                             start_stripe - 1);
+               *stripe_count = lsm->lsm_stripe_count;
+       } else {
+               for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
+                    i = (i + 1) % lsm->lsm_stripe_count, j++) {
+                       if ((lov_stripe_intersects(lsm, i, fm_start, fm_end,
+                                                  &obd_start, &obd_end)) == 0)
+                               break;
+               }
+               *stripe_count = j;
+               last_stripe = (start_stripe + j - 1) % lsm->lsm_stripe_count;
+       }
+
+       return last_stripe;
+}
+
+/**
+ * Set fe_device and copy extents from local buffer into main return buffer.
+ *
+ * \param fiemap fiemap request header
+ * \param lcl_fm_ext array of local fiemap extents to be copied
+ * \param ost_index OST index to be written into the fm_device field for
+ *       each extent
+ * \param ext_count number of extents to be copied
+ * \param current_extent where to start copying in main extent array
+ */
+void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap,
+                                 struct ll_fiemap_extent *lcl_fm_ext,
+                                 int ost_index, unsigned int ext_count,
+                                 int current_extent)
+{
+       char *to;
+       int ext;
+
+       for (ext = 0; ext < ext_count; ext++) {
+               lcl_fm_ext[ext].fe_device = ost_index;
+               lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
+       }
+
+       /* Copy fm_extent's from fm_local to return buffer */
+       to = (char *)fiemap + fiemap_count_to_size(current_extent);
+       memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent));
+}
+
+/**
+ * Break down the FIEMAP request and send appropriate calls to individual OSTs.
+ * This also handles the restarting of FIEMAP calls in case the mapping
+ * overflows the number of extents available in a single call.
+ */
+static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key,
+                     __u32 *vallen, void *val, struct lov_stripe_md *lsm)
+{
+       struct ll_fiemap_info_key *fm_key = key;
+       struct ll_user_fiemap *fiemap = val;
+       struct ll_user_fiemap *fm_local = NULL;
+       struct ll_fiemap_extent *lcl_fm_ext;
+       int count_local;
+       unsigned int get_num_extents = 0;
+       int ost_index = 0, actual_start_stripe, start_stripe;
+       obd_size fm_start, fm_end, fm_length, fm_end_offset;
+       obd_size curr_loc;
+       int current_extent = 0, rc = 0, i;
+       int ost_eof = 0; /* EOF for object */
+       int ost_done = 0; /* done with required mapping for this OST? */
+       int last_stripe;
+       int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count;
+       unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
+
+       /* nothing is allocated yet, so don't go through the out: cleanup */
+       if (lsm == NULL)
+               return 0;
+
+       if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size)
+               buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count);
+
+       OBD_ALLOC_LARGE(fm_local, buffer_size);
+       if (fm_local == NULL)
+               return -ENOMEM;
+       lcl_fm_ext = &fm_local->fm_extents[0];
+
+       count_local = fiemap_size_to_count(buffer_size);
+
+       memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap));
+       fm_start = fiemap->fm_start;
+       fm_length = fiemap->fm_length;
+       /* Calculate start stripe, last stripe and length of mapping */
+       actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start);
+       fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size :
+                                               fm_start + fm_length - 1);
+       /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */
+       if (fm_end > fm_key->oa.o_size)
+               fm_end = fm_key->oa.o_size;
+
+       last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end,
+                                           actual_start_stripe, &stripe_count);
+
+       fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start,
+                                                 fm_end, &start_stripe);
+       if (fm_end_offset == -EINVAL)
+               GOTO(out, rc = -EINVAL);
+
+       if (fiemap->fm_extent_count == 0) {
+               get_num_extents = 1;
+               count_local = 0;
+       }
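+       /* fm_extent_count == 0 means the caller only wants the number of
+        * extents, so ask each OST for a count-only mapping */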
+
+       /* Check each stripe */
+       for (cur_stripe = start_stripe, i = 0; i < stripe_count;
+            i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
+               obd_size req_fm_len; /* Stores length of required mapping */
+               obd_size len_mapped_single_call;
+               obd_off lun_start, lun_end, obd_object_end;
+               unsigned int ext_count;
+
+               cur_stripe_wrap = cur_stripe;
+
+               /* Find out range of mapping on this stripe */
+               if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end,
+                                          &lun_start, &obd_object_end)) == 0)
+                       continue;
+
+               /* If this is a continuation FIEMAP call and we are on
+                * starting stripe then lun_start needs to be set to
+                * fm_end_offset */
+               if (fm_end_offset != 0 && cur_stripe == start_stripe)
+                       lun_start = fm_end_offset;
+
+               if (fm_length != ~0ULL) {
+                       /* Handle fm_start + fm_length overflow */
+                       if (fm_start + fm_length < fm_start)
+                               fm_length = ~0ULL - fm_start;
+                       lun_end = lov_size_to_stripe(lsm, fm_start + fm_length,
+                                                    cur_stripe);
+               } else {
+                       lun_end = ~0ULL;
+               }
+
+               if (lun_start == lun_end)
+                       continue;
+
+               req_fm_len = obd_object_end - lun_start;
+               fm_local->fm_length = 0;
+               len_mapped_single_call = 0;
+
+               /* If the output buffer is very large and the objects have many
+                * extents we may need to loop on a single OST repeatedly */
+               ost_eof = 0;
+               ost_done = 0;
+               do {
+                       if (get_num_extents == 0) {
+                               /* Don't get too many extents. */
+                               if (current_extent + count_local >
+                                   fiemap->fm_extent_count)
+                                       count_local = fiemap->fm_extent_count -
+                                                                current_extent;
+                       }
+
+                       lun_start += len_mapped_single_call;
+                       fm_local->fm_length = req_fm_len - len_mapped_single_call;
+                       req_fm_len = fm_local->fm_length;
+                       fm_local->fm_extent_count = count_local;
+                       fm_local->fm_mapped_extents = 0;
+                       fm_local->fm_flags = fiemap->fm_flags;
+
+                       fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi;
+                       ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx;
+
+                       if (ost_index < 0 ||
+                           ost_index >= lov->desc.ld_tgt_count)
+                               GOTO(out, rc = -EINVAL);
+
+                       /* If OST is inactive, return extent with UNKNOWN flag */
+                       if (!lov->lov_tgts[ost_index]->ltd_active) {
+                               fm_local->fm_flags |= FIEMAP_EXTENT_LAST;
+                               fm_local->fm_mapped_extents = 1;
+
+                               lcl_fm_ext[0].fe_logical = lun_start;
+                               lcl_fm_ext[0].fe_length = obd_object_end -
+                                                                     lun_start;
+                               lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
+
+                               goto inactive_tgt;
+                       }
+
+                       fm_local->fm_start = lun_start;
+                       fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
+                       memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local));
+                       *vallen = fiemap_count_to_size(fm_local->fm_extent_count);
+                       rc = obd_get_info(NULL,
+                                         lov->lov_tgts[ost_index]->ltd_exp,
+                                         keylen, key, vallen, fm_local, lsm);
+                       if (rc != 0)
+                               GOTO(out, rc);
+
+inactive_tgt:
+                       ext_count = fm_local->fm_mapped_extents;
+                       if (ext_count == 0) {
+                               ost_done = 1;
+                               /* If last stripe has hole at the end,
+                                * then we need to return */
+                               if (cur_stripe_wrap == last_stripe) {
+                                       fiemap->fm_mapped_extents = 0;
+                                       goto finish;
+                               }
+                               break;
+                       }
+
+                       /* If we just need num of extents then go to next device */
+                       if (get_num_extents) {
+                               current_extent += ext_count;
+                               break;
+                       }
+
+                       len_mapped_single_call =
+                               lcl_fm_ext[ext_count - 1].fe_logical -
+                               lun_start + lcl_fm_ext[ext_count - 1].fe_length;
+
+                       /* Have we finished mapping on this device? */
+                       if (req_fm_len <= len_mapped_single_call)
+                               ost_done = 1;
+
+                       /* Clear the EXTENT_LAST flag which can be present on
+                        * last extent */
+                       if (lcl_fm_ext[ext_count - 1].fe_flags &
+                           FIEMAP_EXTENT_LAST)
+                               lcl_fm_ext[ext_count - 1].fe_flags &=
+                                                           ~FIEMAP_EXTENT_LAST;
+
+                       curr_loc = lov_stripe_size(lsm,
+                                          lcl_fm_ext[ext_count - 1].fe_logical+
+                                          lcl_fm_ext[ext_count - 1].fe_length,
+                                          cur_stripe);
+                       if (curr_loc >= fm_key->oa.o_size)
+                               ost_eof = 1;
+
+                       fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext,
+                                                    ost_index, ext_count,
+                                                    current_extent);
+
+                       current_extent += ext_count;
+
+                       /* Ran out of available extents? */
+                       if (current_extent >= fiemap->fm_extent_count)
+                               goto finish;
+               } while (ost_done == 0 && ost_eof == 0);
+
+               if (cur_stripe_wrap == last_stripe)
+                       goto finish;
+       }
+
+finish:
+       /* Indicate that we are returning device offsets unless file just has
+        * single stripe */
+       if (lsm->lsm_stripe_count > 1)
+               fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
+
+       if (get_num_extents)
+               goto skip_last_device_calc;
+
+       /* Check if we have reached the last stripe and whether mapping for that
+        * stripe is done. */
+       if (cur_stripe_wrap == last_stripe) {
+               if (ost_done || ost_eof)
+                       fiemap->fm_extents[current_extent - 1].fe_flags |=
+                                                            FIEMAP_EXTENT_LAST;
+       }
+
+skip_last_device_calc:
+       fiemap->fm_mapped_extents = current_extent;
+
+out:
+       OBD_FREE_LARGE(fm_local, buffer_size);
+       return rc;
+}
+
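+/*
+ * Generic get_info entry point.  KEY_LOCK_TO_STRIPE maps an ldlm lock
+ * back to the stripe index it was taken on; the remaining keys are
+ * answered from the lov descriptor or forwarded to a single target.
+ */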
+static int lov_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       int i, rc;
+       ENTRY;
+
+       if (!vallen || !val)
+               RETURN(-EFAULT);
+
+       obd_getref(obddev);
+
+       if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+               struct {
+                       char name[16];
+                       struct ldlm_lock *lock;
+               } *data = key;
+               struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
+               struct lov_oinfo *loi;
+               __u32 *stripe = val;
+
+               if (*vallen < sizeof(*stripe))
+                       GOTO(out, rc = -EFAULT);
+               *vallen = sizeof(*stripe);
+
+               /* XXX This is another one of those bits that will need to
+                * change if we ever actually support nested LOVs.  It uses
+                * the lock's export to find out which stripe it is. */
+               /* XXX - it's assumed all the locks for deleted OSTs have
+                * been cancelled. Also, the export for deleted OSTs will
+                * be NULL and won't match the lock's export. */
+               for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                       loi = lsm->lsm_oinfo[i];
+                       if (!lov->lov_tgts[loi->loi_ost_idx])
+                               continue;
+                       if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp ==
+                           data->lock->l_conn_export &&
+                           ostid_res_name_eq(&loi->loi_oi, res_id)) {
+                               *stripe = i;
+                               GOTO(out, rc = 0);
+                       }
+               }
+               LDLM_ERROR(data->lock, "lock on inode without such object");
+               dump_lsm(D_ERROR, lsm);
+               GOTO(out, rc = -ENXIO);
+       } else if (KEY_IS(KEY_LAST_ID)) {
+               struct obd_id_info *info = val;
+               __u32 size = sizeof(obd_id);
+               struct lov_tgt_desc *tgt;
+
+               LASSERT(*vallen == sizeof(struct obd_id_info));
+               tgt = lov->lov_tgts[info->idx];
+
+               if (!tgt || !tgt->ltd_active)
+                       GOTO(out, rc = -ESRCH);
+
+               rc = obd_get_info(env, tgt->ltd_exp, keylen, key,
+                                 &size, info->data, NULL);
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_LOVDESC)) {
+               struct lov_desc *desc_ret = val;
+               *desc_ret = lov->desc;
+
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_FIEMAP)) {
+               rc = lov_fiemap(lov, keylen, key, vallen, val, lsm);
+               GOTO(out, rc);
+       } else if (KEY_IS(KEY_CONNECT_FLAG)) {
+               struct lov_tgt_desc *tgt;
+               __u64 ost_idx = *((__u64*)val);
+
+               LASSERT(*vallen == sizeof(__u64));
+               LASSERT(ost_idx < lov->desc.ld_tgt_count);
+               tgt = lov->lov_tgts[ost_idx];
+
+               if (!tgt || !tgt->ltd_exp)
+                       GOTO(out, rc = -ESRCH);
+
+               *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp);
+               GOTO(out, rc = 0);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = lov->desc.ld_tgt_count;
+               GOTO(out, rc = 0);
+       }
+
+       rc = -EINVAL;
+
+out:
+       obd_putref(obddev);
+       RETURN(rc);
+}
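+
+/*
+ * Illustrative use only (hypothetical caller, not from the original
+ * source): a client of this interface queries one of the keys above
+ * through obd_get_info(), e.g. the target count:
+ *
+ *     int count;
+ *     __u32 vallen = sizeof(count);
+ *     rc = obd_get_info(env, exp, sizeof(KEY_TGT_COUNT), KEY_TGT_COUNT,
+ *                       &vallen, &count, NULL);
+ */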
+
+static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set)
+{
+       struct obd_device *obddev = class_exp2obd(exp);
+       struct lov_obd *lov = &obddev->u.lov;
+       obd_count count;
+       int i, rc = 0, err;
+       struct lov_tgt_desc *tgt;
+       unsigned incr, check_uuid, do_inactive, no_set;
+       unsigned next_id = 0, mds_con = 0, capa = 0;
+       ENTRY;
+
+       incr = check_uuid = do_inactive = no_set = 0;
+       if (set == NULL) {
+               no_set = 1;
+               set = ptlrpc_prep_set();
+               if (!set)
+                       RETURN(-ENOMEM);
+       }
+
+       obd_getref(obddev);
+       count = lov->desc.ld_tgt_count;
+
+       if (KEY_IS(KEY_NEXT_ID)) {
+               count = vallen / sizeof(struct obd_id_info);
+               vallen = sizeof(obd_id);
+               incr = sizeof(struct obd_id_info);
+               do_inactive = 1;
+               next_id = 1;
+       } else if (KEY_IS(KEY_CHECKSUM)) {
+               do_inactive = 1;
+       } else if (KEY_IS(KEY_EVICT_BY_NID)) {
+               /* use defaults:  do_inactive = incr = 0; */
+       } else if (KEY_IS(KEY_MDS_CONN)) {
+               mds_con = 1;
+       } else if (KEY_IS(KEY_CAPA_KEY)) {
+               capa = 1;
+       } else if (KEY_IS(KEY_CACHE_SET)) {
+               LASSERT(lov->lov_cache == NULL);
+               lov->lov_cache = val;
+               do_inactive = 1;
+       }
+
+       for (i = 0; i < count; i++, val = (char *)val + incr) {
+               if (next_id) {
+                       tgt = lov->lov_tgts[((struct obd_id_info*)val)->idx];
+               } else {
+                       tgt = lov->lov_tgts[i];
+               }
+               /* OST was disconnected */
+               if (!tgt || !tgt->ltd_exp)
+                       continue;
+
+               /* OST is inactive and we don't want inactive OSCs */
+               if (!tgt->ltd_active && !do_inactive)
+                       continue;
+
+               if (mds_con) {
+                       struct mds_group_info *mgi;
+
+                       LASSERT(vallen == sizeof(*mgi));
+                       mgi = (struct mds_group_info *)val;
+
+                       /* Only want a specific OSC */
+                       if (mgi->uuid && !obd_uuid_equals(mgi->uuid,
+                                               &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, sizeof(int),
+                                        &mgi->group, set);
+               } else if (next_id) {
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, vallen,
+                                        ((struct obd_id_info*)val)->data, set);
+               } else if (capa) {
+                       struct mds_capa_info *info = (struct mds_capa_info*)val;
+
+                       LASSERT(vallen == sizeof(*info));
+
+                        /* Only want a specific OSC */
+                       if (info->uuid &&
+                           !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp, keylen,
+                                                key, sizeof(*info->capa),
+                                                info->capa, set);
+               } else {
+                       /* Only want a specific OSC */
+                       if (check_uuid &&
+                           !obd_uuid_equals(val, &tgt->ltd_uuid))
+                               continue;
+
+                       err = obd_set_info_async(env, tgt->ltd_exp,
+                                        keylen, key, vallen, val, set);
+               }
+
+               if (!rc)
+                       rc = err;
+       }
+
+       obd_putref(obddev);
+       if (no_set) {
+               err = ptlrpc_set_wait(set);
+               if (!rc)
+                       rc = err;
+               ptlrpc_set_destroy(set);
+       }
+       RETURN(rc);
+}
+
+static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          int cmd, __u64 *offset)
+{
+       __u32 ssize = lsm->lsm_stripe_size;
+       __u64 start;
+
+       start = *offset;
+       lov_do_div64(start, ssize);
+       start = start * ssize;
+
+       CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64
+                          ", end "LPU64"\n", *offset, ssize, start,
+                          start + ssize - 1);
+       if (cmd == OBD_CALC_STRIPE_END) {
+               *offset = start + ssize - 1;
+       } else if (cmd == OBD_CALC_STRIPE_START) {
+               *offset = start;
+       } else {
+               LBUG();
+       }
+
+       RETURN(0);
+}
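+
+/*
+ * Illustrative example (not from the original source): with a 1MB
+ * stripe size and *offset = 2.5MB, start rounds down to 2MB, so
+ * OBD_CALC_STRIPE_START returns *offset = 2MB and OBD_CALC_STRIPE_END
+ * returns *offset = 3MB - 1, the inclusive bounds of the
+ * stripe-size-aligned extent containing the original offset.
+ */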
+
+void lov_stripe_lock(struct lov_stripe_md *md)
+{
+       LASSERT(md->lsm_lock_owner != current_pid());
+       spin_lock(&md->lsm_lock);
+       LASSERT(md->lsm_lock_owner == 0);
+       md->lsm_lock_owner = current_pid();
+}
+EXPORT_SYMBOL(lov_stripe_lock);
+
+void lov_stripe_unlock(struct lov_stripe_md *md)
+{
+       LASSERT(md->lsm_lock_owner == current_pid());
+       md->lsm_lock_owner = 0;
+       spin_unlock(&md->lsm_lock);
+}
+EXPORT_SYMBOL(lov_stripe_unlock);
+
+static int lov_quotactl(struct obd_device *obd, struct obd_export *exp,
+                       struct obd_quotactl *oqctl)
+{
+       struct lov_obd      *lov = &obd->u.lov;
+       struct lov_tgt_desc *tgt;
+       __u64                curspace = 0;
+       __u64                bhardlimit = 0;
+       int                  i, rc = 0;
+       ENTRY;
+
+       if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON &&
+           oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF &&
+           oqctl->qc_cmd != Q_GETOQUOTA &&
+           oqctl->qc_cmd != Q_INITQUOTA &&
+           oqctl->qc_cmd != LUSTRE_Q_SETQUOTA &&
+           oqctl->qc_cmd != Q_FINVALIDATE) {
+               CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd);
+               RETURN(-EFAULT);
+       }
+
+       /* for lov tgt */
+       obd_getref(obd);
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               int err;
+
+               tgt = lov->lov_tgts[i];
+
+               if (!tgt)
+                       continue;
+
+               if (!tgt->ltd_active || tgt->ltd_reap) {
+                       if (oqctl->qc_cmd == Q_GETOQUOTA &&
+                           lov->lov_tgts[i]->ltd_activate) {
+                               rc = -EREMOTEIO;
+                               CERROR("ost %d is inactive\n", i);
+                       } else {
+                               CDEBUG(D_HA, "ost %d is inactive\n", i);
+                       }
+                       continue;
+               }
+
+               err = obd_quotactl(tgt->ltd_exp, oqctl);
+               if (err) {
+                       if (tgt->ltd_active && !rc)
+                               rc = err;
+                       continue;
+               }
+
+               if (oqctl->qc_cmd == Q_GETOQUOTA) {
+                       curspace += oqctl->qc_dqblk.dqb_curspace;
+                       bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+               }
+       }
+       obd_putref(obd);
+
+       if (oqctl->qc_cmd == Q_GETOQUOTA) {
+               oqctl->qc_dqblk.dqb_curspace = curspace;
+               oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+       }
+       RETURN(rc);
+}
+
+static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp,
+                         struct obd_quotactl *oqctl)
+{
+       struct lov_obd *lov = &obd->u.lov;
+       int             i, rc = 0;
+       ENTRY;
+
+       obd_getref(obd);
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               if (!lov->lov_tgts[i])
+                       continue;
+
+               /* Skip quota check on the administratively disabled OSTs. */
+               if (!lov->lov_tgts[i]->ltd_activate) {
+                       CWARN("lov idx %d was administratively disabled, "
+                             "skip quotacheck on it.\n", i);
+                       continue;
+               }
+
+               if (!lov->lov_tgts[i]->ltd_active) {
+                       CERROR("lov idx %d inactive\n", i);
+                       rc = -EIO;
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               int err;
+
+               if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate)
+                       continue;
+
+               err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl);
+               if (err && !rc)
+                       rc = err;
+       }
+
+out:
+       obd_putref(obd);
+
+       RETURN(rc);
+}
+
+struct obd_ops lov_obd_ops = {
+       .o_owner           = THIS_MODULE,
+       .o_setup           = lov_setup,
+       .o_precleanup      = lov_precleanup,
+       .o_cleanup         = lov_cleanup,
+       /* .o_process_config = lov_process_config, */
+       .o_connect         = lov_connect,
+       .o_disconnect      = lov_disconnect,
+       .o_statfs          = lov_statfs,
+       .o_statfs_async    = lov_statfs_async,
+       .o_packmd          = lov_packmd,
+       .o_unpackmd        = lov_unpackmd,
+       .o_create          = lov_create,
+       .o_destroy         = lov_destroy,
+       .o_getattr         = lov_getattr,
+       .o_getattr_async   = lov_getattr_async,
+       .o_setattr         = lov_setattr,
+       .o_setattr_async   = lov_setattr_async,
+       .o_brw             = lov_brw,
+       .o_merge_lvb       = lov_merge_lvb,
+       .o_adjust_kms      = lov_adjust_kms,
+       .o_punch           = lov_punch,
+       .o_sync            = lov_sync,
+       .o_enqueue         = lov_enqueue,
+       .o_change_cbdata   = lov_change_cbdata,
+       .o_find_cbdata     = lov_find_cbdata,
+       .o_cancel          = lov_cancel,
+       .o_cancel_unused   = lov_cancel_unused,
+       .o_iocontrol       = lov_iocontrol,
+       .o_get_info        = lov_get_info,
+       .o_set_info_async  = lov_set_info_async,
+       .o_extent_calc     = lov_extent_calc,
+       .o_llog_init       = lov_llog_init,
+       .o_llog_finish     = lov_llog_finish,
+       .o_notify          = lov_notify,
+       .o_pool_new        = lov_pool_new,
+       .o_pool_rem        = lov_pool_remove,
+       .o_pool_add        = lov_pool_add,
+       .o_pool_del        = lov_pool_del,
+       .o_getref          = lov_getref,
+       .o_putref          = lov_putref,
+       .o_quotactl        = lov_quotactl,
+       .o_quotacheck      = lov_quotacheck,
+};
+
+struct kmem_cache *lov_oinfo_slab;
+
+extern struct lu_kmem_descr lov_caches[];
+
+int __init lov_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       /* print the address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with a gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches);
+
+       rc = lu_kmem_init(lov_caches);
+       if (rc)
+               return rc;
+
+       lov_oinfo_slab = kmem_cache_create("lov_oinfo",
+                                             sizeof(struct lov_oinfo),
+                                             0, SLAB_HWCACHE_ALIGN, NULL);
+       if (lov_oinfo_slab == NULL) {
+               lu_kmem_fini(lov_caches);
+               return -ENOMEM;
+       }
+       lprocfs_lov_init_vars(&lvars);
+
+       rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
+                                LUSTRE_LOV_NAME, &lov_device_type);
+
+       if (rc) {
+               kmem_cache_destroy(lov_oinfo_slab);
+               lu_kmem_fini(lov_caches);
+       }
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ lov_exit(void)
+{
+       class_unregister_type(LUSTRE_LOV_NAME);
+       kmem_cache_destroy(lov_oinfo_slab);
+
+       lu_kmem_fini(lov_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(lov, LUSTRE_VERSION_STRING, lov_init, lov_exit);
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
new file mode 100644 (file)
index 0000000..aa8ae80
--- /dev/null
@@ -0,0 +1,942 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+#include <lustre_debug.h>
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+struct lov_layout_operations {
+       int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+                       struct lov_object *lov,
+                       const struct cl_object_conf *conf,
+                       union lov_layout_state *state);
+       int (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state);
+       void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+                        union lov_layout_state *state);
+       void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state);
+       int  (*llo_print)(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct lu_object *o);
+       int  (*llo_page_init)(const struct lu_env *env, struct cl_object *obj,
+                               struct cl_page *page, struct page *vmpage);
+       int  (*llo_lock_init)(const struct lu_env *env,
+                             struct cl_object *obj, struct cl_lock *lock,
+                             const struct cl_io *io);
+       int  (*llo_io_init)(const struct lu_env *env,
+                           struct cl_object *obj, struct cl_io *io);
+       int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+                           struct cl_attr *attr);
+};
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov);
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+static void lov_install_empty(const struct lu_env *env,
+                             struct lov_object *lov,
+                             union  lov_layout_state *state)
+{
+       /*
+        * File without objects.
+        */
+}
+
+static int lov_init_empty(const struct lu_env *env,
+                         struct lov_device *dev, struct lov_object *lov,
+                         const struct cl_object_conf *conf,
+                         union  lov_layout_state *state)
+{
+       return 0;
+}
+
+static void lov_install_raid0(const struct lu_env *env,
+                             struct lov_object *lov,
+                             union  lov_layout_state *state)
+{
+}
+
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+                                     struct cl_device *dev,
+                                     const struct lu_fid *fid,
+                                     const struct cl_object_conf *conf)
+{
+       struct lu_object *o;
+
+       ENTRY;
+       o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu);
+       LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+       RETURN(lu2cl(o));
+}
+
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+                       struct cl_object *stripe,
+                       struct lov_layout_raid0 *r0, int idx)
+{
+       struct cl_object_header *hdr;
+       struct cl_object_header *subhdr;
+       struct cl_object_header *parent;
+       struct lov_oinfo        *oinfo;
+       int result;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) {
+               /* For sanity:test_206.
+                * Do not leave the object in cache to avoid accessing
+                * freed memory. This is because osc_object is referring to
+                * lov_oinfo of lsm_stripe_data which will be freed due to
+                * this failure. */
+               cl_object_kill(env, stripe);
+               cl_object_put(env, stripe);
+               return -EIO;
+       }
+
+       hdr    = cl_object_header(lov2cl(lov));
+       subhdr = cl_object_header(stripe);
+       parent = subhdr->coh_parent;
+
+       oinfo = lov->lo_lsm->lsm_oinfo[idx];
+       CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID
+              " idx: %d gen: %d\n",
+              PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+              PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi),
+              oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+       if (parent == NULL) {
+               subhdr->coh_parent = hdr;
+               subhdr->coh_nesting = hdr->coh_nesting + 1;
+               lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+               r0->lo_sub[idx] = cl2lovsub(stripe);
+               r0->lo_sub[idx]->lso_super = lov;
+               r0->lo_sub[idx]->lso_index = idx;
+               result = 0;
+       } else {
+               struct lu_object  *old_obj;
+               struct lov_object *old_lov;
+               unsigned int mask = D_INODE;
+
+               old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type);
+               LASSERT(old_obj != NULL);
+               old_lov = cl2lov(lu2cl(old_obj));
+               if (old_lov->lo_layout_invalid) {
+                       /* the object's layout has already changed but isn't
+                        * refreshed */
+                       lu_object_unhash(env, &stripe->co_lu);
+                       result = -EAGAIN;
+               } else {
+                       mask = D_ERROR;
+                       result = -EIO;
+               }
+
+               LU_OBJECT_DEBUG(mask, env, &stripe->co_lu,
+                               "stripe %d is already owned.\n", idx);
+               LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n");
+               LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n");
+               cl_object_put(env, stripe);
+       }
+       return result;
+}
+
+static int lov_init_raid0(const struct lu_env *env,
+                         struct lov_device *dev, struct lov_object *lov,
+                         const struct cl_object_conf *conf,
+                         union  lov_layout_state *state)
+{
+       int result;
+       int i;
+
+       struct cl_object        *stripe;
+       struct lov_thread_info  *lti     = lov_env_info(env);
+       struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
+       struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
+       struct lu_fid           *ofid    = &lti->lti_fid;
+       struct lov_layout_raid0 *r0      = &state->raid0;
+
+       ENTRY;
+
+       if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
+               dump_lsm(D_ERROR, lsm);
+               LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
+                        LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
+       }
+
+       LASSERT(lov->lo_lsm == NULL);
+       lov->lo_lsm = lsm_addref(lsm);
+       r0->lo_nr  = lsm->lsm_stripe_count;
+       LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+       OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+       if (r0->lo_sub != NULL) {
+               result = 0;
+               subconf->coc_inode = conf->coc_inode;
+               spin_lock_init(&r0->lo_sub_lock);
+               /*
+                * Create stripe cl_objects.
+                */
+               for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+                       struct cl_device *subdev;
+                       struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+                       int ost_idx = oinfo->loi_ost_idx;
+
+                       result = ostid_to_fid(ofid, &oinfo->loi_oi,
+                                             oinfo->loi_ost_idx);
+                       if (result != 0)
+                               GOTO(out, result);
+
+                       subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+                       subconf->u.coc_oinfo = oinfo;
+                       LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx);
+                       /* In the function below, .hs_keycmp resolves to
+                        * lu_obj_hop_keycmp() */
+                       /* coverity[overrun-buffer-val] */
+                       stripe = lov_sub_find(env, subdev, ofid, subconf);
+                       if (!IS_ERR(stripe)) {
+                               result = lov_init_sub(env, lov, stripe, r0, i);
+                               if (result == -EAGAIN) { /* try again */
+                                       --i;
+                                       result = 0;
+                               }
+                       } else {
+                               result = PTR_ERR(stripe);
+                       }
+               }
+       } else
+               result = -ENOMEM;
+out:
+       RETURN(result);
+}
+
+static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+       LASSERT(lov->lo_type == LLT_EMPTY);
+
+       lov_layout_wait(env, lov);
+
+       cl_object_prune(env, &lov->lo_cl);
+       return 0;
+}
+
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+                              struct lovsub_object *los, int idx)
+{
+       struct cl_object        *sub;
+       struct lov_layout_raid0 *r0;
+       struct lu_site          *site;
+       struct lu_site_bkt_data *bkt;
+       wait_queue_t            *waiter;
+
+       r0  = &lov->u.raid0;
+       LASSERT(r0->lo_sub[idx] == los);
+
+       sub  = lovsub2cl(los);
+       site = sub->co_lu.lo_dev->ld_site;
+       bkt  = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid);
+
+       cl_object_kill(env, sub);
+       /* release a reference to the sub-object and ... */
+       lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+       cl_object_put(env, sub);
+
+       /* ... wait until it is actually destroyed---sub-object clears its
+        * ->lo_sub[] slot in lovsub_object_fini() */
+       if (r0->lo_sub[idx] == los) {
+               waiter = &lov_env_info(env)->lti_waiter;
+               init_waitqueue_entry_current(waiter);
+               add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               while (1) {
+                       /* this wait-queue is signaled at the end of
+                        * lu_object_free(). */
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       spin_lock(&r0->lo_sub_lock);
+                       if (r0->lo_sub[idx] == los) {
+                               spin_unlock(&r0->lo_sub_lock);
+                               waitq_wait(waiter, TASK_UNINTERRUPTIBLE);
+                       } else {
+                               spin_unlock(&r0->lo_sub_lock);
+                               set_current_state(TASK_RUNNING);
+                               break;
+                       }
+               }
+               remove_wait_queue(&bkt->lsb_marche_funebre, waiter);
+       }
+       LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+       int i;
+
+       ENTRY;
+
+       dump_lsm(D_INODE, lsm);
+
+       lov_layout_wait(env, lov);
+       if (r0->lo_sub != NULL) {
+               for (i = 0; i < r0->lo_nr; ++i) {
+                       struct lovsub_object *los = r0->lo_sub[i];
+
+                       if (los != NULL) {
+                               cl_locks_prune(env, &los->lso_cl, 1);
+                               /*
+                                * If top-level object is to be evicted from
+                                * the cache, so are its sub-objects.
+                                */
+                               lov_subobject_kill(env, lov, los, i);
+                       }
+               }
+       }
+       cl_object_prune(env, &lov->lo_cl);
+       RETURN(0);
+}
+
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state)
+{
+       LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+                          union lov_layout_state *state)
+{
+       struct lov_layout_raid0 *r0 = &state->raid0;
+       ENTRY;
+
+       if (r0->lo_sub != NULL) {
+               OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+               r0->lo_sub = NULL;
+       }
+
+       dump_lsm(D_INODE, lov->lo_lsm);
+       lov_free_memmd(&lov->lo_lsm);
+
+       EXIT;
+}
+
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lu_object *o)
+{
+       (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid);
+       return 0;
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lu_object *o)
+{
+       struct lov_object       *lov = lu2lov(o);
+       struct lov_layout_raid0 *r0  = lov_r0(lov);
+       struct lov_stripe_md    *lsm = lov->lo_lsm;
+       int i;
+
+       (*p)(env, cookie, "stripes: %d, %svalid, lsm{%p 0x%08X %d %u %u}: \n",
+               r0->lo_nr, lov->lo_layout_invalid ? "in" : "", lsm,
+               lsm->lsm_magic, atomic_read(&lsm->lsm_refc),
+               lsm->lsm_stripe_count, lsm->lsm_layout_gen);
+       for (i = 0; i < r0->lo_nr; ++i) {
+               struct lu_object *sub;
+
+               if (r0->lo_sub[i] != NULL) {
+                       sub = lovsub2lu(r0->lo_sub[i]);
+                       lu_object_print(env, cookie, p, sub);
+               } else
+                       (*p)(env, cookie, "sub %d absent\n", i);
+       }
+       return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks, which is always 0.
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+                             struct cl_attr *attr)
+{
+       attr->cat_blocks = 0;
+       return 0;
+}
+
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+                             struct cl_attr *attr)
+{
+       struct lov_object       *lov = cl2lov(obj);
+       struct lov_layout_raid0 *r0 = lov_r0(lov);
+       struct cl_attr          *lov_attr = &r0->lo_attr;
+       int                      result = 0;
+
+       ENTRY;
+
+       /* this is called w/o holding the type guard mutex, so it must be
+        * inside an ongoing IO, otherwise the lsm may be replaced.
+        * LU-2117: it turns out there exists one exception. For mmapped files,
+        * a lock on such a file may be requested in another file's IO
+        * context, and when this function is then called in ccc_lock_state()
+        * it will hit this assertion.
+        * Anyway, it's still okay to call attr_get w/o the type guard, as the
+        * layout can't change while locks exist. */
+       /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */
+
+       if (!r0->lo_attr_valid) {
+               struct lov_stripe_md    *lsm = lov->lo_lsm;
+               struct ost_lvb          *lvb = &lov_env_info(env)->lti_lvb;
+               __u64                    kms = 0;
+
+               memset(lvb, 0, sizeof(*lvb));
+               /* XXX: timestamps can be negative by sanity:test_39m,
+                * how can it be? */
+               lvb->lvb_atime = LLONG_MIN;
+               lvb->lvb_ctime = LLONG_MIN;
+               lvb->lvb_mtime = LLONG_MIN;
+
+               /*
+                * XXX that should be replaced with a loop over sub-objects,
+                * doing cl_object_attr_get() on them. But for now, let's
+                * reuse old lov code.
+                */
+
+               /*
+                * XXX take lsm spin-lock to keep lov_merge_lvb_kms()
+                * happy. It's not needed, because new code uses
+                * ->coh_attr_guard spin-lock to protect consistency of
+                * sub-object attributes.
+                */
+               lov_stripe_lock(lsm);
+               result = lov_merge_lvb_kms(lsm, lvb, &kms);
+               lov_stripe_unlock(lsm);
+               if (result == 0) {
+                       cl_lvb2attr(lov_attr, lvb);
+                       lov_attr->cat_kms = kms;
+                       r0->lo_attr_valid = 1;
+               }
+       }
+       if (result == 0) { /* merge results */
+               attr->cat_blocks = lov_attr->cat_blocks;
+               attr->cat_size = lov_attr->cat_size;
+               attr->cat_kms = lov_attr->cat_kms;
+               if (attr->cat_atime < lov_attr->cat_atime)
+                       attr->cat_atime = lov_attr->cat_atime;
+               if (attr->cat_ctime < lov_attr->cat_ctime)
+                       attr->cat_ctime = lov_attr->cat_ctime;
+               if (attr->cat_mtime < lov_attr->cat_mtime)
+                       attr->cat_mtime = lov_attr->cat_mtime;
+       }
+       RETURN(result);
+}
+
+static const struct lov_layout_operations lov_dispatch[] = {
+       [LLT_EMPTY] = {
+               .llo_init      = lov_init_empty,
+               .llo_delete    = lov_delete_empty,
+               .llo_fini      = lov_fini_empty,
+               .llo_install   = lov_install_empty,
+               .llo_print     = lov_print_empty,
+               .llo_page_init = lov_page_init_empty,
+               .llo_lock_init = lov_lock_init_empty,
+               .llo_io_init   = lov_io_init_empty,
+               .llo_getattr   = lov_attr_get_empty
+       },
+       [LLT_RAID0] = {
+               .llo_init      = lov_init_raid0,
+               .llo_delete    = lov_delete_raid0,
+               .llo_fini      = lov_fini_raid0,
+               .llo_install   = lov_install_raid0,
+               .llo_print     = lov_print_raid0,
+               .llo_page_init = lov_page_init_raid0,
+               .llo_lock_init = lov_lock_init_raid0,
+               .llo_io_init   = lov_io_init_raid0,
+               .llo_getattr   = lov_attr_get_raid0
+       }
+};
+
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...)                             \
+({                                                                     \
+       struct lov_object *__obj = (obj);                               \
+       enum lov_layout_type __llt;                                     \
+                                                                       \
+       __llt = __obj->lo_type;                                         \
+       LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+       lov_dispatch[__llt].op(__VA_ARGS__);                            \
+})
+
+static inline void lov_conf_freeze(struct lov_object *lov)
+{
+       if (lov->lo_owner != current)
+               down_read(&lov->lo_type_guard);
+}
+
+static inline void lov_conf_thaw(struct lov_object *lov)
+{
+       if (lov->lo_owner != current)
+               up_read(&lov->lo_type_guard);
+}
+
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)                      \
+({                                                                     \
+       struct lov_object *__obj = (obj);                               \
+       int __lock = !!(lock);                                          \
+       typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;               \
+                                                                       \
+       if (__lock)                                                     \
+               lov_conf_freeze(__obj);                                 \
+       __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__);          \
+       if (__lock)                                                     \
+               lov_conf_thaw(__obj);                                   \
+       __result;                                                       \
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ */
+#define LOV_2DISPATCH(obj, op, ...)                                    \
+       LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+#define LOV_2DISPATCH_VOID(obj, op, ...)                               \
+do {                                                                   \
+       struct lov_object *__obj = (obj);                               \
+       enum lov_layout_type __llt;                                     \
+                                                                       \
+       lov_conf_freeze(__obj);                                         \
+       __llt = __obj->lo_type;                                         \
+       LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+       lov_dispatch[__llt].op(__VA_ARGS__);                            \
+       lov_conf_thaw(__obj);                                           \
+} while (0)
+
+static void lov_conf_lock(struct lov_object *lov)
+{
+       LASSERT(lov->lo_owner != current);
+       down_write(&lov->lo_type_guard);
+       LASSERT(lov->lo_owner == NULL);
+       lov->lo_owner = current;
+}
+
+static void lov_conf_unlock(struct lov_object *lov)
+{
+       lov->lo_owner = NULL;
+       up_write(&lov->lo_type_guard);
+}
+
+static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov)
+{
+       struct l_wait_info lwi = { 0 };
+       ENTRY;
+
+       while (atomic_read(&lov->lo_active_ios) > 0) {
+               CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n",
+                       PFID(lu_object_fid(lov2lu(lov))),
+                       atomic_read(&lov->lo_active_ios));
+
+               l_wait_event(lov->lo_waitq,
+                            atomic_read(&lov->lo_active_ios) == 0, &lwi);
+       }
+       RETURN(0);
+}
+
+static int lov_layout_change(const struct lu_env *unused,
+                            struct lov_object *lov,
+                            const struct cl_object_conf *conf)
+{
+       int result;
+       enum lov_layout_type llt = LLT_EMPTY;
+       union lov_layout_state *state = &lov->u;
+       const struct lov_layout_operations *old_ops;
+       const struct lov_layout_operations *new_ops;
+
+       struct cl_object_header *hdr = cl_object_header(&lov->lo_cl);
+       void *cookie;
+       struct lu_env *env;
+       int refcheck;
+       ENTRY;
+
+       LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch));
+
+       if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL)
+               llt = LLT_RAID0; /* only raid0 is supported. */
+       LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+
+       cookie = cl_env_reenter();
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env)) {
+               cl_env_reexit(cookie);
+               RETURN(PTR_ERR(env));
+       }
+
+       old_ops = &lov_dispatch[lov->lo_type];
+       new_ops = &lov_dispatch[llt];
+
+       result = old_ops->llo_delete(env, lov, &lov->u);
+       if (result == 0) {
+               old_ops->llo_fini(env, lov, &lov->u);
+
+               LASSERT(atomic_read(&lov->lo_active_ios) == 0);
+               LASSERT(hdr->coh_tree.rnode == NULL);
+               LASSERT(hdr->coh_pages == 0);
+
+               lov->lo_type = LLT_EMPTY;
+               result = new_ops->llo_init(env,
+                                       lu2lov_dev(lov->lo_cl.co_lu.lo_dev),
+                                       lov, conf, state);
+               if (result == 0) {
+                       new_ops->llo_install(env, lov, state);
+                       lov->lo_type = llt;
+               } else {
+                       new_ops->llo_delete(env, lov, state);
+                       new_ops->llo_fini(env, lov, state);
+                       /* this file becomes an EMPTY file. */
+               }
+       }
+
+       cl_env_put(env, &refcheck);
+       cl_env_reexit(cookie);
+       RETURN(result);
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+                   const struct lu_object_conf *conf)
+{
+       struct lov_device                  *dev   = lu2lov_dev(obj->lo_dev);
+       struct lov_object                  *lov   = lu2lov(obj);
+       const struct cl_object_conf        *cconf = lu2cl_conf(conf);
+       union lov_layout_state             *set   = &lov->u;
+       const struct lov_layout_operations *ops;
+       int result;
+
+       ENTRY;
+       init_rwsem(&lov->lo_type_guard);
+       atomic_set(&lov->lo_active_ios, 0);
+       init_waitqueue_head(&lov->lo_waitq);
+
+       cl_object_page_init(lu2cl(obj), sizeof(struct lov_page));
+
+       /* no locking is necessary, as object is being created */
+       lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY;
+       ops = &lov_dispatch[lov->lo_type];
+       result = ops->llo_init(env, dev, lov, cconf, set);
+       if (result == 0)
+               ops->llo_install(env, lov, set);
+       RETURN(result);
+}
+
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_object_conf *conf)
+{
+       struct lov_stripe_md *lsm = NULL;
+       struct lov_object *lov = cl2lov(obj);
+       int result = 0;
+       ENTRY;
+
+       lov_conf_lock(lov);
+       if (conf->coc_opc == OBJECT_CONF_INVALIDATE) {
+               lov->lo_layout_invalid = true;
+               GOTO(out, result = 0);
+       }
+
+       if (conf->coc_opc == OBJECT_CONF_WAIT) {
+               if (lov->lo_layout_invalid &&
+                   atomic_read(&lov->lo_active_ios) > 0) {
+                       lov_conf_unlock(lov);
+                       result = lov_layout_wait(env, lov);
+                       lov_conf_lock(lov);
+               }
+               GOTO(out, result);
+       }
+
+       LASSERT(conf->coc_opc == OBJECT_CONF_SET);
+
+       if (conf->u.coc_md != NULL)
+               lsm = conf->u.coc_md->lsm;
+       if ((lsm == NULL && lov->lo_lsm == NULL) ||
+           (lsm != NULL && lov->lo_lsm != NULL &&
+            lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) {
+               /* same version of layout */
+               lov->lo_layout_invalid = false;
+               GOTO(out, result = 0);
+       }
+
+       /* will change layout - check if there still exists active IO. */
+       if (atomic_read(&lov->lo_active_ios) > 0) {
+               lov->lo_layout_invalid = true;
+               GOTO(out, result = -EBUSY);
+       }
+
+       lov->lo_layout_invalid = lov_layout_change(env, lov, conf);
+       EXIT;
+
+out:
+       lov_conf_unlock(lov);
+       RETURN(result);
+}
+
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lov_object *lov = lu2lov(obj);
+
+       ENTRY;
+       LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u);
+       EXIT;
+}
+
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lov_object *lov = lu2lov(obj);
+
+       ENTRY;
+       LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u);
+       lu_object_fini(obj);
+       OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+       EXIT;
+}
+
+static int lov_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o);
+}
+
+int lov_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj),
+                                   llo_page_init, env, obj, page, vmpage);
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_io *io)
+{
+       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+       return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+                                    !io->ci_ignore_layout, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       /* do not take lock, as this function is called under a
+        * spin-lock. Layout is protected from changing by ongoing IO. */
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+}
+
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned valid)
+{
+       /*
+        * No dispatch is required here, as no layout implements this.
+        */
+       return 0;
+}
+
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_lock *lock, const struct cl_io *io)
+{
+       /* No need to lock because we hold a reference on the layout. */
+       return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
+                                   io);
+}
+
+static const struct cl_object_operations lov_ops = {
+       .coo_page_init = lov_page_init,
+       .coo_lock_init = lov_lock_init,
+       .coo_io_init   = lov_io_init,
+       .coo_attr_get  = lov_attr_get,
+       .coo_attr_set  = lov_attr_set,
+       .coo_conf_set  = lov_conf_set
+};
+
+static const struct lu_object_operations lov_lu_obj_ops = {
+       .loo_object_init      = lov_object_init,
+       .loo_object_delete    = lov_object_delete,
+       .loo_object_release   = NULL,
+       .loo_object_free      = lov_object_free,
+       .loo_object_print     = lov_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev)
+{
+       struct lov_object *lov;
+       struct lu_object  *obj;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, __GFP_IO);
+       if (lov != NULL) {
+               obj = lov2lu(lov);
+               lu_object_init(obj, NULL, dev);
+               lov->lo_cl.co_ops = &lov_ops;
+               lov->lo_type = -1; /* invalid, to catch uninitialized type */
+               /*
+                * object io operation vector (cl_object::co_iop) is installed
+                * later in lov_object_init(), as different vectors are used
+                * for object with different layouts.
+                */
+               obj->lo_ops = &lov_lu_obj_ops;
+       } else
+               obj = NULL;
+       RETURN(obj);
+}
+
+struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov)
+{
+       struct lov_stripe_md *lsm = NULL;
+
+       lov_conf_freeze(lov);
+       if (lov->lo_lsm != NULL) {
+               lsm = lsm_addref(lov->lo_lsm);
+               CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n",
+                       lsm, atomic_read(&lsm->lsm_refc),
+                       lov->lo_layout_invalid, current);
+       }
+       lov_conf_thaw(lov);
+       return lsm;
+}
+
+void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm)
+{
+       if (lsm == NULL)
+               return;
+
+       CDEBUG(D_INODE, "lsm %p decref %d by %p.\n",
+               lsm, atomic_read(&lsm->lsm_refc), current);
+
+       lov_free_memmd(&lsm);
+}
+
+struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj)
+{
+       struct lu_object *luobj;
+       struct lov_stripe_md *lsm = NULL;
+
+       if (clobj == NULL)
+               return NULL;
+
+       luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL)
+               lsm = lov_lsm_addref(lu2lov(luobj));
+       return lsm;
+}
+EXPORT_SYMBOL(lov_lsm_get);
+
+void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm)
+{
+       if (lsm != NULL)
+               lov_free_memmd(&lsm);
+}
+EXPORT_SYMBOL(lov_lsm_put);
+
+int lov_read_and_clear_async_rc(struct cl_object *clob)
+{
+       struct lu_object *luobj;
+       int rc = 0;
+       ENTRY;
+
+       luobj = lu_object_locate(&cl_object_header(clob)->coh_lu,
+                                &lov_device_type);
+       if (luobj != NULL) {
+               struct lov_object *lov = lu2lov(luobj);
+
+               lov_conf_freeze(lov);
+               switch (lov->lo_type) {
+               case LLT_RAID0: {
+                       struct lov_stripe_md *lsm;
+                       int i;
+
+                       lsm = lov->lo_lsm;
+                       LASSERT(lsm != NULL);
+                       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+                               if (loi->loi_ar.ar_rc && !rc)
+                                       rc = loi->loi_ar.ar_rc;
+                               loi->loi_ar.ar_rc = 0;
+                       }
+               }
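+                       /* fall through */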
+               case LLT_EMPTY:
+                       break;
+               default:
+                       LBUG();
+               }
+               lov_conf_thaw(lov);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lov_read_and_clear_async_rc);
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c
new file mode 100644 (file)
index 0000000..f62b7e5
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+
+#include "lov_internal.h"
+
+/* compute object size given "stripeno" and the ost size */
+obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+                        int stripeno)
+{
+       unsigned long ssize = lsm->lsm_stripe_size;
+       unsigned long stripe_size;
+       obd_off swidth;
+       obd_size lov_size;
+       int magic = lsm->lsm_magic;
+       ENTRY;
+
+       if (ost_size == 0)
+               RETURN(0);
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_size = lov_do_div64(ost_size, ssize);
+       if (stripe_size)
+               lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
+       else
+               lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
+
+       RETURN(lov_size);
+}
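+
+/*
+ * Illustrative example (not from the original source): take 3 stripes
+ * of 1MB each, so swidth = 3MB.  An object on stripe 1 with
+ * ost_size = 1.5MB gives stripe_size = 0.5MB and ost_size = 1 after
+ * the division, so lov_size = 1 * 3MB + 1 * 1MB + 0.5MB = 4.5MB, the
+ * file offset just past the object's last byte.
+ */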
+
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file.  for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe.  callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file.  end offsets similarly clamped to the
+ * nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ *          S                                        E
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ *          S   E
+ * -----------------------------------
+ * |    0    |     1     |     2     |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ *   S            E
+ * ---------------------------------------------------------------------
+ * |    1    |     1     |     1     |    1    |     1     |     1     |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct.  consider an E lov offset that lands
+ * on a 0 stripe: the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when in fact it wanted to be rounded back to the end
+ * of a previous 1 stripe.  this logic is handled by callers and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question;  0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
+int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+                     int stripeno, obd_off *obdoff)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, this_stripe, swidth;
+       int magic = lsm->lsm_magic;
+       int ret = 0;
+
+       if (lov_off == OBD_OBJECT_EOF) {
+               *obdoff = OBD_OBJECT_EOF;
+               return 0;
+       }
+
+       LASSERT(lsm_op_find(magic) != NULL);
+
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+                                               &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_off = lov_do_div64(lov_off, swidth);
+
+       this_stripe = (obd_off)stripeno * ssize;
+       if (stripe_off < this_stripe) {
+               stripe_off = 0;
+               ret = -1;
+       } else {
+               stripe_off -= this_stripe;
+
+               if (stripe_off >= ssize) {
+                       stripe_off = ssize;
+                       ret = 1;
+               }
+       }
+
+       *obdoff = lov_off * ssize + stripe_off;
+       return ret;
+}
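+
+/*
+ * Illustrative example (not from the original source): take 3 stripes
+ * of 1MB each, so swidth = 3MB.  For lov_off = 5MB and stripeno = 1,
+ * lov_do_div64() leaves lov_off = 1 and stripe_off = 2MB; this_stripe
+ * is 1MB, so stripe_off becomes 1MB, which is clamped to ssize with
+ * ret = 1.  *obdoff = 1 * 1MB + 1MB = 2MB, the final byte boundary of
+ * stripe 1's second object chunk, since 5MB lies past stripe 1 within
+ * its stripe set.
+ */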
+
+/* Given a whole-file size and a stripe number, give the file size which
+ * corresponds to the individual object of that stripe.
+ *
+ * This behaves basically in the same way as lov_stripe_offset, except that
+ * file sizes falling before the beginning of a stripe are clamped to the end
+ * of the previous stripe, not the beginning of the next:
+ *
+ *                                            S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * if clamped to stripe 2 becomes:
+ *
+ *                                S
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ */
+obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
+                          int stripeno)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, this_stripe, swidth;
+       int magic = lsm->lsm_magic;
+
+       if (file_size == OBD_OBJECT_EOF)
+               return OBD_OBJECT_EOF;
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+                                               &swidth);
+
+       /* lov_do_div64(a, b) returns a % b, and a = a / b */
+       stripe_off = lov_do_div64(file_size, swidth);
+
+       this_stripe = (obd_off)stripeno * ssize;
+       if (stripe_off < this_stripe) {
+               /* Move to end of previous stripe, or zero */
+               if (file_size > 0) {
+                       file_size--;
+                       stripe_off = ssize;
+               } else {
+                       stripe_off = 0;
+               }
+       } else {
+               stripe_off -= this_stripe;
+
+               if (stripe_off >= ssize) {
+                       /* Clamp to end of this stripe */
+                       stripe_off = ssize;
+               }
+       }
+
+       return (file_size * ssize + stripe_off);
+}
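+
+/*
+ * a worked example with the same hypothetical layout (ssize = 1M,
+ * 3 stripes, swidth = 3M): a whole-file size of 3M clamped to stripe 1
+ * divides into file_size = 1 and stripe_off = 0; 0 < this_stripe (1M),
+ * so file_size drops back to row 0 and stripe_off becomes ssize, giving
+ * 0 * 1M + 1M = 1M -- stripe 1's object holds exactly its 1M chunk
+ * from row 0.
+ */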
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent.  this returns true if the given
+ * stripe does intersect with the lov extent. */
+int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+                         obd_off start, obd_off end,
+                         obd_off *obd_start, obd_off *obd_end)
+{
+       int start_side, end_side;
+
+       start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+       end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+
+       CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n",
+              start, end, start_side, *obd_start, *obd_end, end_side);
+
+       /* this stripe doesn't intersect the file extent when neither
+        * the start nor the end intersected the stripe and obd_start and
+        * obd_end got rounded up to the same value. */
+       if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+               return 0;
+
+       /* as mentioned in the lov_stripe_offset commentary, end
+        * might have been shifted in the wrong direction.  This
+        * happens when an end offset is before the stripe when viewed
+        * through the "mod stripe size" math. we detect it being shifted
+        * in the wrong direction and touch it up.
+        * interestingly, this can't underflow since end must be > start
+        * if we passed through the previous check.
+        * (should we assert for that somewhere?) */
+       if (end_side != 0)
+               (*obd_end)--;
+
+       return 1;
+}
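+
+/*
+ * a worked example (ssize = 1M, 3 stripes, swidth = 3M, inclusive ends):
+ * the file extent [0, 5M - 1] on stripe 2 yields start_side = -1 with
+ * *obd_start = 0 and end_side = -1 with *obd_end = 1M, which the final
+ * fixup decrements to 1M - 1: stripe 2's object bytes [0, 1M - 1] back
+ * file bytes [2M, 3M - 1].  for [3M, 4M - 1] both ends round to the
+ * same object offset (1M) without touching the stripe, so the function
+ * returns 0.
+ */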
+
+/* compute which stripe number "lov_off" will be written into */
+int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+       unsigned long ssize  = lsm->lsm_stripe_size;
+       obd_off stripe_off, swidth;
+       int magic = lsm->lsm_magic;
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
+
+       stripe_off = lov_do_div64(lov_off, swidth);
+
+       /* Puts stripe_off/ssize result into stripe_off */
+       lov_do_div64(stripe_off, ssize);
+
+       return stripe_off;
+}
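+
+/*
+ * e.g., with the layout used in the examples above (ssize = 1M,
+ * 3 stripes, swidth = 3M): lov_off = 5M leaves stripe_off = 5M % 3M = 2M,
+ * and 2M / 1M = 2, so byte 5M lands in stripe 2 -- matching the diagrams
+ * at the top of this file.
+ */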
diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c
new file mode 100644
index 0000000..492948a
--- /dev/null
@@ -0,0 +1,678 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <lustre_net.h>
+#include <obd.h>
+#include <obd_lov.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_user.h>
+
+#include "lov_internal.h"
+
+void lov_dump_lmm_common(int level, void *lmmp)
+{
+       struct lov_mds_md *lmm = lmmp;
+       struct ost_id   oi;
+
+       lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi);
+       CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n",
+              POSTID(&oi), le32_to_cpu(lmm->lmm_magic),
+              le32_to_cpu(lmm->lmm_pattern));
+       CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n",
+              le32_to_cpu(lmm->lmm_stripe_size),
+              le16_to_cpu(lmm->lmm_stripe_count),
+              le16_to_cpu(lmm->lmm_layout_gen));
+}
+
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+                                int stripe_count)
+{
+       int i;
+
+       if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+               CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+                      stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+               return;
+       }
+
+       for (i = 0; i < stripe_count; ++i, ++lod) {
+               struct ost_id   oi;
+
+               ostid_le_to_cpu(&lod->l_ost_oi, &oi);
+               CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i,
+                      le32_to_cpu(lod->l_ost_idx), POSTID(&oi));
+       }
+}
+
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+{
+       lov_dump_lmm_common(level, lmm);
+       lov_dump_lmm_objects(level, lmm->lmm_objects,
+                            le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+       lov_dump_lmm_common(level, lmm);
+       CDEBUG(level, "pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
+       lov_dump_lmm_objects(level, lmm->lmm_objects,
+                            le16_to_cpu(lmm->lmm_stripe_count));
+}
+
+void lov_dump_lmm(int level, void *lmm)
+{
+       int magic;
+
+       magic = ((struct lov_mds_md_v1 *)(lmm))->lmm_magic;
+       switch (magic) {
+       case LOV_MAGIC_V1:
+               return lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm));
+       case LOV_MAGIC_V3:
+               return lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm));
+       default:
+               CERROR("Cannot recognize lmm_magic %x\n", magic);
+       }
+       return;
+}
+
+#define LMM_ASSERT(test)                                               \
+do {                                                                   \
+       if (!(test))                                                    \
+               lov_dump_lmm(D_ERROR, lmm);                             \
+       LASSERT(test); /* so we know what assertion failed */          \
+} while (0)
+
+/* Pack LOV object metadata for disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ *
+ * XXX In the future, this will be enhanced to get the EA size from the
+ *     underlying OSC device(s) to get their EA sizes so we can stack
+ *     LOVs properly.  For now lov_mds_md_size() just assumes one obd_id
+ *     per stripe.
+ */
+int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+              struct lov_stripe_md *lsm)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       struct lov_mds_md_v1 *lmmv1;
+       struct lov_mds_md_v3 *lmmv3;
+       __u16 stripe_count;
+       struct lov_ost_data_v1 *lmm_objects;
+       int lmm_size, lmm_magic;
+       int i;
+       int cplen = 0;
+       ENTRY;
+
+       if (lsm) {
+               lmm_magic = lsm->lsm_magic;
+       } else {
+               if (lmmp && *lmmp)
+                       lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+               else
+                       /* lsm == NULL and lmmp == NULL */
+                       lmm_magic = LOV_MAGIC;
+       }
+
+       if ((lmm_magic != LOV_MAGIC_V1) &&
+           (lmm_magic != LOV_MAGIC_V3)) {
+               CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+                       lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+               RETURN(-EINVAL);
+       }
+
+       if (lsm) {
+               /* If we are just sizing the EA, limit the stripe count
+                * to the actual number of OSTs in this filesystem. */
+               if (!lmmp) {
+                       stripe_count = lov_get_stripecnt(lov, lmm_magic,
+                                                        lsm->lsm_stripe_count);
+                       lsm->lsm_stripe_count = stripe_count;
+               } else {
+                       stripe_count = lsm->lsm_stripe_count;
+               }
+       } else {
+               /* No need to allocate more than maximum supported stripes.
+                * Anyway, this is pretty inaccurate since ld_tgt_count now
+                * represents max index and we should rely on the actual number
+                * of OSTs instead */
+               stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+                                                   lmm_magic);
+               if (stripe_count > lov->desc.ld_tgt_count)
+                       stripe_count = lov->desc.ld_tgt_count;
+       }
+
+       /* XXX LOV STACKING call into osc for sizes */
+       lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+
+       if (!lmmp)
+               RETURN(lmm_size);
+
+       if (*lmmp && !lsm) {
+               stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count);
+               lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+               OBD_FREE_LARGE(*lmmp, lmm_size);
+               *lmmp = NULL;
+               RETURN(0);
+       }
+
+       if (!*lmmp) {
+               OBD_ALLOC_LARGE(*lmmp, lmm_size);
+               if (!*lmmp)
+                       RETURN(-ENOMEM);
+       }
+
+       CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d\n",
+              lmm_magic, lmm_size);
+
+       lmmv1 = *lmmp;
+       lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+       if (lmm_magic == LOV_MAGIC_V3)
+               lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+       else
+               lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
+
+       if (!lsm)
+               RETURN(lmm_size);
+
+       /* lmmv1 and lmmv3 point to the same struct and have the
+        * same first fields
+        */
+       lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi);
+       lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+       lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count);
+       lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+       lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen);
+       if (lsm->lsm_magic == LOV_MAGIC_V3) {
+               cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name,
+                               sizeof(lmmv3->lmm_pool_name));
+               if (cplen >= sizeof(lmmv3->lmm_pool_name))
+                       RETURN(-E2BIG);
+               lmm_objects = lmmv3->lmm_objects;
+       } else {
+               lmm_objects = lmmv1->lmm_objects;
+       }
+
+       for (i = 0; i < stripe_count; i++) {
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+               /* XXX LOV STACKING call down to osc_packmd() to do packing */
+               LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID
+                        " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi),
+                        i, stripe_count, loi->loi_ost_idx);
+               ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi);
+               lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+               lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+       }
+
+       RETURN(lmm_size);
+}
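+
+/*
+ * a sketch of the calling convention encoded in the branches above:
+ *
+ *     sz = lov_packmd(exp, NULL, lsm);     size the EA only
+ *     rc = lov_packmd(exp, &lmm, lsm);     allocate *lmm and pack lsm
+ *     rc = lov_packmd(exp, &lmm, NULL);    free a previously packed *lmm
+ */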
+
+/* Find the max stripecount we should use */
+__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count)
+{
+       __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD;
+
+       if (!stripe_count)
+               stripe_count = lov->desc.ld_default_stripe_count;
+       if (stripe_count > lov->desc.ld_active_tgt_count)
+               stripe_count = lov->desc.ld_active_tgt_count;
+       if (!stripe_count)
+               stripe_count = 1;
+
+       /* stripe count is based on whether ldiskfs can handle
+        * larger EA sizes */
+       if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE &&
+           lov->lov_ocd.ocd_max_easize)
+               max_stripes = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize,
+                                                  magic);
+
+       if (stripe_count > max_stripes)
+               stripe_count = max_stripes;
+
+       return stripe_count;
+}
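+
+/*
+ * e.g., a request for stripe_count = 0 on a filesystem whose
+ * ld_default_stripe_count is 4 but which has only 2 active targets first
+ * becomes 4, is then clamped to 2 by ld_active_tgt_count, and could be
+ * clamped further if the server-advertised maximum EA size only holds
+ * fewer stripes.
+ */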
+
+static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count)
+{
+       int rc;
+
+       if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
+               char *buffer;
+               int sz;
+
+               CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+                      le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+               sz = lmm_bytes * 2 + 1;
+               OBD_ALLOC_LARGE(buffer, sz);
+               if (buffer != NULL) {
+                       int i;
+
+                       for (i = 0; i < lmm_bytes; i++)
+                               /* unsigned: avoid sign-extension through
+                                * %X overflowing the 2-char slot */
+                               sprintf(buffer + 2 * i, "%.2X",
+                                       ((unsigned char *)lmm)[i]);
+                       buffer[sz - 1] = '\0';
+                       CERROR("%s\n", buffer);
+                       OBD_FREE_LARGE(buffer, sz);
+               }
+               return -EINVAL;
+       }
+       rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
+                                    lmm_bytes, stripe_count);
+       return rc;
+}
+
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count,
+                   int pattern, int magic)
+{
+       int i, lsm_size;
+       ENTRY;
+
+       CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count);
+
+       *lsmp = lsm_alloc_plain(stripe_count, &lsm_size);
+       if (!*lsmp) {
+               CERROR("can't allocate lsmp stripe_count %d\n", stripe_count);
+               RETURN(-ENOMEM);
+       }
+
+       atomic_set(&(*lsmp)->lsm_refc, 1);
+       spin_lock_init(&(*lsmp)->lsm_lock);
+       (*lsmp)->lsm_magic = magic;
+       (*lsmp)->lsm_stripe_count = stripe_count;
+       (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+       (*lsmp)->lsm_pattern = pattern;
+       (*lsmp)->lsm_pool_name[0] = '\0';
+       (*lsmp)->lsm_layout_gen = 0;
+       (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
+
+       for (i = 0; i < stripe_count; i++)
+               loi_init((*lsmp)->lsm_oinfo[i]);
+
+       RETURN(lsm_size);
+}
+
+int lov_free_memmd(struct lov_stripe_md **lsmp)
+{
+       struct lov_stripe_md *lsm = *lsmp;
+       int refc;
+
+       *lsmp = NULL;
+       LASSERT(atomic_read(&lsm->lsm_refc) > 0);
+       refc = atomic_dec_return(&lsm->lsm_refc);
+       if (refc == 0) {
+               LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+               lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
+       }
+       return refc;
+}
+
+/* Unpack LOV object metadata from disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ */
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                struct lov_mds_md *lmm, int lmm_bytes)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       int rc = 0, lsm_size;
+       __u16 stripe_count;
+       __u32 magic;
+       ENTRY;
+
+       /* If passed an MDS struct use values from there, otherwise defaults */
+       if (lmm) {
+               rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
+               if (rc)
+                       RETURN(rc);
+               magic = le32_to_cpu(lmm->lmm_magic);
+       } else {
+               magic = LOV_MAGIC;
+               stripe_count = lov_get_stripecnt(lov, magic, 0);
+       }
+
+       /* If we aren't passed an lsmp struct, we just want the size */
+       if (!lsmp) {
+               /* XXX LOV STACKING call into osc for sizes */
+               LBUG();
+               RETURN(lov_stripe_md_size(stripe_count));
+       }
+       /* If we are passed an allocated struct but nothing to unpack, free */
+       if (*lsmp && !lmm) {
+               lov_free_memmd(lsmp);
+               RETURN(0);
+       }
+
+       lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
+                                  magic);
+       if (lsm_size < 0)
+               RETURN(lsm_size);
+
+       /* If we are passed a pointer but nothing to unpack, we only alloc */
+       if (!lmm)
+               RETURN(lsm_size);
+
+       LASSERT(lsm_op_find(magic) != NULL);
+       rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
+       if (rc) {
+               lov_free_memmd(lsmp);
+               RETURN(rc);
+       }
+
+       RETURN(lsm_size);
+}
+
+static int __lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                          struct lov_stripe_md **lsmp,
+                          struct lov_user_md *lump)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct lov_obd *lov = &obd->u.lov;
+       char buffer[sizeof(struct lov_user_md_v3)];
+       struct lov_user_md_v3 *lumv3 = (struct lov_user_md_v3 *)&buffer[0];
+       struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&buffer[0];
+       int lmm_magic;
+       __u16 stripe_count;
+       int rc;
+       int cplen = 0;
+       ENTRY;
+
+       rc = lov_lum_swab_if_needed(lumv3, &lmm_magic, lump);
+       if (rc)
+               RETURN(rc);
+
+       /* in the rest of the tests, as lumv1 and lumv3 share the same
+        * initial fields, we use lumv1 to avoid code duplication */
+
+       if (lumv1->lmm_pattern == 0) {
+               lumv1->lmm_pattern = lov->desc.ld_pattern ?
+                       lov->desc.ld_pattern : LOV_PATTERN_RAID0;
+       }
+
+       if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
+               CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
+                      lumv1->lmm_pattern);
+               RETURN(-EINVAL);
+       }
+
+       /* 64kB is the largest common page size we see (ia64), and matches the
+        * check in lfs */
+       if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+               CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
+                      lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+               lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+       }
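+
+       /* the test above relies on LOV_MIN_STRIPE_SIZE being a power of
+        * two; assuming the 64kB (0x10000) named in the comment, a 192kB
+        * (0x30000) request passes (0x30000 & 0xffff == 0) while a 100kB
+        * (0x19000) one fails (0x19000 & 0xffff == 0x9000) and is reset
+        * to the minimum. */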
+
+       if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+           (lumv1->lmm_stripe_offset !=
+            (typeof(lumv1->lmm_stripe_offset))(-1))) {
+               CDEBUG(D_IOCTL, "stripe offset %u >= number of OSTs %u\n",
+                      lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
+               RETURN(-EINVAL);
+       }
+       stripe_count = lov_get_stripecnt(lov, lmm_magic,
+                                        lumv1->lmm_stripe_count);
+
+       if (max_lmm_size) {
+               int max_stripes = (max_lmm_size -
+                                  lov_mds_md_size(0, lmm_magic)) /
+                                  sizeof(struct lov_ost_data_v1);
+               if (unlikely(max_stripes < stripe_count)) {
+                       CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n",
+                              stripe_count, max_stripes);
+                       stripe_count = max_stripes;
+               }
+       }
+
+       if (lmm_magic == LOV_USER_MAGIC_V3) {
+               struct pool_desc *pool;
+
+               /* In the function below, .hs_keycmp resolves to
+                * pool_hashkey_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               pool = lov_find_pool(lov, lumv3->lmm_pool_name);
+               if (pool != NULL) {
+                       if (lumv3->lmm_stripe_offset !=
+                           (typeof(lumv3->lmm_stripe_offset))(-1)) {
+                               rc = lov_check_index_in_pool(
+                                       lumv3->lmm_stripe_offset, pool);
+                               if (rc < 0) {
+                                       lov_pool_putref(pool);
+                                       RETURN(-EINVAL);
+                               }
+                       }
+
+                       if (stripe_count > pool_tgt_count(pool))
+                               stripe_count = pool_tgt_count(pool);
+
+                       lov_pool_putref(pool);
+               }
+       }
+
+       rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
+
+       if (rc >= 0) {
+               rc = 0;
+               (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+               (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+               if (lmm_magic == LOV_USER_MAGIC_V3) {
+                       cplen = strlcpy((*lsmp)->lsm_pool_name,
+                                       lumv3->lmm_pool_name,
+                                       sizeof((*lsmp)->lsm_pool_name));
+                       /* a truncated pool name is an error, not success */
+                       if (cplen >= sizeof((*lsmp)->lsm_pool_name))
+                               rc = -E2BIG;
+               }
+       }
+
+       RETURN(rc);
+}
+
+/* Configure object striping information on a new file.
+ *
+ * @lump is a pointer to a user struct with one or more of the fields set to
+ * indicate the application preference: lmm_stripe_count, lmm_stripe_size,
+ * lmm_stripe_offset, and lmm_pattern.  lmm_magic must be LOV_MAGIC.
+ * @lsmp is a pointer to an in-core stripe MD that needs to be filled in.
+ */
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                 struct lov_stripe_md **lsmp, struct lov_user_md *lump)
+{
+       int rc;
+       mm_segment_t seg;
+
+       seg = get_fs();
+       set_fs(KERNEL_DS);
+
+       rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump);
+       set_fs(seg);
+       RETURN(rc);
+}
+
+int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
+             struct lov_user_md *lump)
+{
+       int i;
+       int rc;
+       struct obd_export *oexp;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       obd_id last_id = 0;
+       struct lov_user_ost_data_v1 *lmm_objects;
+
+       ENTRY;
+
+       if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+               lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+       else
+               lmm_objects = lump->lmm_objects;
+
+       for (i = 0; i < lump->lmm_stripe_count; i++) {
+               __u32 len = sizeof(last_id);
+               oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp;
+               rc = obd_get_info(NULL, oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
+                                 &len, &last_id, NULL);
+               if (rc)
+                       RETURN(rc);
+               if (ostid_id(&lmm_objects[i].l_ost_oi) > last_id) {
+                       CERROR("Setting EA for object greater than last id on"
+                              " ost idx %d "DOSTID" > "LPD64"\n",
+                              lmm_objects[i].l_ost_idx,
+                              POSTID(&lmm_objects[i].l_ost_oi), last_id);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       rc = lov_setstripe(exp, 0, lsmp, lump);
+       if (rc)
+               RETURN(rc);
+
+       for (i = 0; i < lump->lmm_stripe_count; i++) {
+               (*lsmp)->lsm_oinfo[i]->loi_ost_idx =
+                       lmm_objects[i].l_ost_idx;
+               (*lsmp)->lsm_oinfo[i]->loi_oi = lmm_objects[i].l_ost_oi;
+       }
+       RETURN(0);
+}
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_stripe_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_USER_MAGIC.
+ */
+int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
+                 struct lov_user_md *lump)
+{
+       /*
+        * XXX huge struct allocated on stack.
+        */
+       /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+       struct lov_user_md_v3 lum;
+       struct lov_mds_md *lmmk = NULL;
+       int rc, lmm_size;
+       int lum_size;
+       mm_segment_t seg;
+       ENTRY;
+
+       if (!lsm)
+               RETURN(-ENODATA);
+
+       /*
+        * "Switch to kernel segment" to allow copying from kernel space by
+        * copy_{to,from}_user().
+        */
+       seg = get_fs();
+       set_fs(KERNEL_DS);
+
+       /* we only need the header part from user space to get lmm_magic and
+        * lmm_stripe_count (the header part is common to v1 and v3) */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(&lum, lump, lum_size))
+               GOTO(out_set, rc = -EFAULT);
+       else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+                (lum.lmm_magic != LOV_USER_MAGIC_V3))
+               GOTO(out_set, rc = -EINVAL);
+
+       if (lum.lmm_stripe_count &&
+           (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
+               /* Return right size of stripe to user */
+               lum.lmm_stripe_count = lsm->lsm_stripe_count;
+               rc = copy_to_user(lump, &lum, lum_size);
+               GOTO(out_set, rc = -EOVERFLOW);
+       }
+       rc = lov_packmd(exp, &lmmk, lsm);
+       if (rc < 0)
+               GOTO(out_set, rc);
+       lmm_size = rc;
+       rc = 0;
+
+       /* FIXME: Bug 1185 - copy fields properly when structs change */
+       /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+       CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
+       CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]);
+
+       if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) &&
+           ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
+           (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) {
+               lustre_swab_lov_mds_md(lmmk);
+               lustre_swab_lov_user_md_objects(
+                               (struct lov_user_ost_data*)lmmk->lmm_objects,
+                               lmmk->lmm_stripe_count);
+       }
+       if (lum.lmm_magic == LOV_USER_MAGIC) {
+               /* User request for v1, we need to skip lmm_pool_name */
+               if (lmmk->lmm_magic == LOV_MAGIC_V3) {
+                       memmove((char*)(&lmmk->lmm_stripe_count) +
+                               sizeof(lmmk->lmm_stripe_count),
+                               ((struct lov_mds_md_v3*)lmmk)->lmm_objects,
+                               lmmk->lmm_stripe_count *
+                               sizeof(struct lov_ost_data_v1));
+                       lmm_size -= LOV_MAXPOOLNAME;
+               }
+       } else {
+               /* if v3 we just have to update the lum_size */
+               lum_size = sizeof(struct lov_user_md_v3);
+       }
+
+       /* User wasn't expecting this many OST entries */
+       if (lum.lmm_stripe_count == 0)
+               lmm_size = lum_size;
+       else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count)
+               GOTO(out_set, rc = -EOVERFLOW);
+       /*
+        * lov_mds_md and lov_user_md differ in layout, so the data has to
+        * be re-ordered before being copied to user space.
+        */
+       lum.lmm_stripe_count = lmmk->lmm_stripe_count;
+       lum.lmm_layout_gen = lmmk->lmm_layout_gen;
+       ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen;
+       ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count;
+       if (copy_to_user(lump, lmmk, lmm_size))
+               rc = -EFAULT;
+
+       obd_free_diskmd(exp, &lmmk);
+out_set:
+       set_fs(seg);
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c
new file mode 100644
index 0000000..65790d6
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+       const struct cl_page  *page = slice->cpl_page;
+       const struct cl_page  *sub  = lov_sub_page(slice);
+
+       return ergo(sub != NULL,
+                   page->cp_child == sub &&
+                   sub->cp_parent == page &&
+                   page->cp_state == sub->cp_state);
+}
+
+static void lov_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct cl_page  *sub = lov_sub_page(slice);
+
+       LINVRNT(lov_page_invariant(slice));
+       ENTRY;
+
+       if (sub != NULL) {
+               LASSERT(sub->cp_state == CPS_FREEING);
+               lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent);
+               sub->cp_parent = NULL;
+               slice->cpl_page->cp_child = NULL;
+               cl_page_put(env, sub);
+       }
+       EXIT;
+}
+
+static int lov_page_own(const struct lu_env *env,
+                       const struct cl_page_slice *slice, struct cl_io *io,
+                       int nonblock)
+{
+       struct lov_io     *lio = lov_env_io(env);
+       struct lov_io_sub *sub;
+
+       LINVRNT(lov_page_invariant(slice));
+       LINVRNT(!cl2lov_page(slice)->lps_invalid);
+       ENTRY;
+
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               lov_sub_page(slice)->cp_owner = sub->sub_io;
+               lov_sub_put(sub);
+       } else
+               LBUG(); /* Arrgh */
+       RETURN(0);
+}
+
+static void lov_page_assume(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io)
+{
+       lov_page_own(env, slice, io, 0);
+}
+
+static int lov_page_cache_add(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io)
+{
+       struct lov_io     *lio = lov_env_io(env);
+       struct lov_io_sub *sub;
+       int rc = 0;
+
+       LINVRNT(lov_page_invariant(slice));
+       LINVRNT(!cl2lov_page(slice)->lps_invalid);
+       ENTRY;
+
+       sub = lov_page_subio(env, lio, slice);
+       if (!IS_ERR(sub)) {
+               rc = cl_page_cache_add(sub->sub_env, sub->sub_io,
+                                      slice->cpl_page->cp_child, CRT_WRITE);
+               lov_sub_put(sub);
+       } else {
+               rc = PTR_ERR(sub);
+               CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc);
+       }
+       RETURN(rc);
+}
+
+static int lov_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct lov_page *lp = cl2lov_page(slice);
+
+       return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp);
+}
+
+static const struct cl_page_operations lov_page_ops = {
+       .cpo_fini   = lov_page_fini,
+       .cpo_own    = lov_page_own,
+       .cpo_assume = lov_page_assume,
+       .io = {
+               [CRT_WRITE] = {
+                       .cpo_cache_add = lov_page_cache_add
+               }
+       },
+       .cpo_print  = lov_page_print
+};
+
+static void lov_empty_page_fini(const struct lu_env *env,
+                               struct cl_page_slice *slice)
+{
+       LASSERT(slice->cpl_page->cp_child == NULL);
+}
+
+int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct lov_object *loo = cl2lov(obj);
+       struct lov_layout_raid0 *r0 = lov_r0(loo);
+       struct lov_io     *lio = lov_env_io(env);
+       struct cl_page    *subpage;
+       struct cl_object  *subobj;
+       struct lov_io_sub *sub;
+       struct lov_page   *lpg = cl_object_page_slice(obj, page);
+       loff_t       offset;
+       obd_off     suboff;
+       int             stripe;
+       int             rc;
+       ENTRY;
+
+       offset = cl_offset(obj, page->cp_index);
+       stripe = lov_stripe_number(loo->lo_lsm, offset);
+       LASSERT(stripe < r0->lo_nr);
+       rc = lov_stripe_offset(loo->lo_lsm, offset, stripe,
+                                  &suboff);
+       LASSERT(rc == 0);
+
+       lpg->lps_invalid = 1;
+       cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+
+       sub = lov_sub_get(env, lio, stripe);
+       if (IS_ERR(sub))
+               GOTO(out, rc = PTR_ERR(sub));
+
+       subobj = lovsub2cl(r0->lo_sub[stripe]);
+       subpage = cl_page_find_sub(sub->sub_env, subobj,
+                                  cl_index(subobj, suboff), vmpage, page);
+       lov_sub_put(sub);
+       if (IS_ERR(subpage))
+               GOTO(out, rc = PTR_ERR(subpage));
+
+       if (likely(subpage->cp_parent == page)) {
+               lu_ref_add(&subpage->cp_reference, "lov", page);
+               lpg->lps_invalid = 0;
+               rc = 0;
+       } else {
+               CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n");
+               CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n");
+               LASSERT(0);
+       }
+
+       EXIT;
+out:
+       return rc;
+}
+
+static const struct cl_page_operations lov_empty_page_ops = {
+       .cpo_fini   = lov_empty_page_fini,
+       .cpo_print  = lov_page_print
+};
+
+int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct lov_page *lpg = cl_object_page_slice(obj, page);
+       void *addr;
+       ENTRY;
+
+       cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops);
+       addr = kmap(vmpage);
+       memset(addr, 0, cl_page_size(obj));
+       kunmap(vmpage);
+       cl_page_export(env, page, 1);
+       RETURN(0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c
new file mode 100644 (file)
index 0000000..a96f908
--- /dev/null
@@ -0,0 +1,681 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ * Author: Alex Lyashkov <Alexey.Lyashkov@Sun.COM>
+ * Author: Nathaniel Rutman <Nathan.Rutman@Sun.COM>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include "lov_internal.h"
+
+#define pool_tgt(_p, _i) \
+               _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]]
+
+static void lov_pool_getref(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       atomic_inc(&pool->pool_refcount);
+}
+
+void lov_pool_putref(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       if (atomic_dec_and_test(&pool->pool_refcount)) {
+               LASSERT(hlist_unhashed(&pool->pool_hash));
+               LASSERT(list_empty(&pool->pool_list));
+               LASSERT(pool->pool_proc_entry == NULL);
+               lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+               lov_ost_pool_free(&(pool->pool_obds));
+               OBD_FREE_PTR(pool);
+               EXIT;
+       }
+}
+
+void lov_pool_putref_locked(struct pool_desc *pool)
+{
+       CDEBUG(D_INFO, "pool %p\n", pool);
+       LASSERT(atomic_read(&pool->pool_refcount) > 1);
+
+       atomic_dec(&pool->pool_refcount);
+}
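+
+/*
+ * a sketch of the refcount lifecycle implied here: lov_pool_new() starts
+ * the pool at 1 (alive until lov_pool_del()) and takes one extra ref for
+ * the /proc entry; each hash lookup takes a ref through
+ * pool_hashrefcount_get(); lov_pool_del() drops the /proc ref and then
+ * the initial one, so the pool is freed only after the last lookup calls
+ * lov_pool_putref().
+ */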
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask)
+{
+       int i;
+       __u32 result;
+       char *poolname;
+
+       result = 0;
+       poolname = (char *)key;
+       for (i = 0; i < LOV_MAXPOOLNAME; i++) {
+               if (poolname[i] == '\0')
+                       break;
+               result = (result << 4) ^ (result >> 28) ^ poolname[i];
+       }
+       return (result % mask);
+}
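+
+/*
+ * a worked example: for the pool name "abc" the loop evolves result as
+ * 0x61 ('a'), then (0x61 << 4) ^ 0x62 = 0x672, then (0x672 << 4) ^ 0x63
+ * = 0x6743 (the result >> 28 terms are still zero this early); the final
+ * value is then reduced modulo the table mask.
+ */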
+
+static void *pool_key(struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       return (pool->pool_name);
+}
+
+static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode)
+{
+       char *pool_name;
+       struct pool_desc *pool;
+
+       pool_name = (char *)key;
+       pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+       return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME);
+}
+
+static void *pool_hashobject(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+static void pool_hashrefcount_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       lov_pool_getref(pool);
+}
+
+static void pool_hashrefcount_put_locked(cfs_hash_t *hs,
+                                        struct hlist_node *hnode)
+{
+       struct pool_desc *pool;
+
+       pool = hlist_entry(hnode, struct pool_desc, pool_hash);
+       lov_pool_putref_locked(pool);
+}
+
+cfs_hash_ops_t pool_hash_operations = {
+       .hs_hash        = pool_hashfn,
+       .hs_key         = pool_key,
+       .hs_keycmp      = pool_hashkey_keycmp,
+       .hs_object      = pool_hashobject,
+       .hs_get         = pool_hashrefcount_get,
+       .hs_put_locked  = pool_hashrefcount_put_locked,
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * an iterator is used to go through the target pool entries;
+ * idx is the current entry index in the pool's op_array[] and
+ * satisfies idx >= pos, where pos is the position handed back to
+ * the seq_file interface, from 0 to (pool->pool_obds.op_count - 1)
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+       int magic;
+       struct pool_desc *pool;
+       int idx;        /* from 0 to pool_tgt_size - 1 */
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)s->private;
+       int prev_idx;
+
+       LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+
+       /* test if end of file */
+       if (*pos >= pool_tgt_count(iter->pool))
+               return NULL;
+
+       /* iterate to find a non-empty entry */
+       prev_idx = iter->idx;
+       down_read(&pool_tgt_rw_sem(iter->pool));
+       iter->idx++;
+       if (iter->idx == pool_tgt_count(iter->pool)) {
+               iter->idx = prev_idx; /* we stay on the last entry */
+               up_read(&pool_tgt_rw_sem(iter->pool));
+               return NULL;
+       }
+       up_read(&pool_tgt_rw_sem(iter->pool));
+       (*pos)++;
+       /* return != NULL to continue */
+       return iter;
+}
+
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+       struct pool_desc *pool = (struct pool_desc *)s->private;
+       struct pool_iterator *iter;
+
+       lov_pool_getref(pool);
+       if ((pool_tgt_count(pool) == 0) ||
+           (*pos >= pool_tgt_count(pool))) {
+               /* no iter was created, so stop() has no way to find the
+                * pool and drop its ref */
+               lov_pool_putref(pool);
+               return NULL;
+       }
+
+       OBD_ALLOC_PTR(iter);
+       if (!iter)
+               return ERR_PTR(-ENOMEM);
+       iter->magic = POOL_IT_MAGIC;
+       iter->pool = pool;
+       iter->idx = 0;
+
+       /* we use the seq_file private field to remember the iterator so
+        * we can free it at stop() */
+       /* /!\ do not forget to restore it to the pool before freeing it */
+       s->private = iter;
+       if (*pos > 0) {
+               loff_t i;
+               void *ptr;
+
+               i = 0;
+               do {
+                       ptr = pool_proc_next(s, iter, &i);
+               } while ((i < *pos) && (ptr != NULL));
+               return ptr;
+       }
+       return iter;
+}
+
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)s->private;
+
+       /* in some cases the stop() method is called twice, without
+        * start() being called in between (see seq_read() in fs/seq_file.c),
+        * so we must free only if s->private is an iterator */
+       if ((iter) && (iter->magic == POOL_IT_MAGIC)) {
+               /* we restore s->private so next call to pool_proc_start()
+                * will work */
+               s->private = iter->pool;
+               lov_pool_putref(iter->pool);
+               OBD_FREE_PTR(iter);
+       }
+       return;
+}
+
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+       struct pool_iterator *iter = (struct pool_iterator *)v;
+       struct lov_tgt_desc *tgt;
+
+       LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic);
+       LASSERT(iter->pool != NULL);
+       LASSERT(iter->idx <= pool_tgt_count(iter->pool));
+
+       down_read(&pool_tgt_rw_sem(iter->pool));
+       tgt = pool_tgt(iter->pool, iter->idx);
+       up_read(&pool_tgt_rw_sem(iter->pool));
+       if (tgt)
+               seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid)));
+
+       return 0;
+}
+
+static struct seq_operations pool_proc_ops = {
+       .start    = pool_proc_start,
+       .next      = pool_proc_next,
+       .stop      = pool_proc_stop,
+       .show      = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+       int rc;
+
+       rc = seq_open(file, &pool_proc_ops);
+       if (!rc) {
+               struct seq_file *s = file->private_data;
+               s->private = PDE_DATA(inode);
+       }
+       return rc;
+}
+
+static struct file_operations pool_proc_operations = {
+       .open      = pool_proc_open,
+       .read      = seq_read,
+       .llseek  = seq_lseek,
+       .release        = seq_release,
+};
+#endif /* LPROCFS */
+
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+       int i;
+
+       lov_pool_getref(pool);
+
+       CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n",
+              pool->pool_name, pool->pool_obds.op_count);
+       down_read(&pool_tgt_rw_sem(pool));
+
+       for (i = 0; i < pool_tgt_count(pool); i++) {
+               if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp)
+                       continue;
+               CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n",
+                      pool->pool_name, i,
+                      obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid)));
+       }
+
+       up_read(&pool_tgt_rw_sem(pool));
+       lov_pool_putref(pool);
+}
+
+#define LOV_POOL_INIT_COUNT 2
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+       ENTRY;
+
+       if (count == 0)
+               count = LOV_POOL_INIT_COUNT;
+       op->op_array = NULL;
+       op->op_count = 0;
+       init_rwsem(&op->op_rw_sem);
+       op->op_size = count;
+       OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       if (op->op_array == NULL) {
+               op->op_size = 0;
+               RETURN(-ENOMEM);
+       }
+       EXIT;
+       return 0;
+}
+
+/* Caller must hold write op_rwlock */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count)
+{
+       __u32 *new;
+       int new_size;
+
+       LASSERT(min_count != 0);
+
+       if (op->op_count < op->op_size)
+               return 0;
+
+       new_size = max(min_count, 2 * op->op_size);
+       OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+       if (new == NULL)
+               return -ENOMEM;
+
+       /* copy old array to new one */
+       memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+       OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       op->op_array = new;
+       op->op_size = new_size;
+       return 0;
+}
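+
+/*
+ * since op_size at least doubles on each extension, the entries copied
+ * while growing a pool to N targets total 2 + 4 + ... + N/2 < N, i.e.
+ * O(1) amortized work per added target.
+ */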
+
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count)
+{
+       int rc = 0, i;
+       ENTRY;
+
+       down_write(&op->op_rw_sem);
+
+       rc = lov_ost_pool_extend(op, min_count);
+       if (rc)
+               GOTO(out, rc);
+
+       /* search ost in pool array */
+       for (i = 0; i < op->op_count; i++) {
+               if (op->op_array[i] == idx)
+                       GOTO(out, rc = -EEXIST);
+       }
+       /* ost not found, so add it */
+       op->op_array[op->op_count] = idx;
+       op->op_count++;
+       EXIT;
+out:
+       up_write(&op->op_rw_sem);
+       return rc;
+}
+
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+       int i;
+       ENTRY;
+
+       down_write(&op->op_rw_sem);
+
+       for (i = 0; i < op->op_count; i++) {
+               if (op->op_array[i] == idx) {
+                       memmove(&op->op_array[i], &op->op_array[i + 1],
+                               (op->op_count - i - 1) * sizeof(op->op_array[0]));
+                       op->op_count--;
+                       up_write(&op->op_rw_sem);
+                       EXIT;
+                       return 0;
+               }
+       }
+
+       up_write(&op->op_rw_sem);
+       RETURN(-EINVAL);
+}
+
+int lov_ost_pool_free(struct ost_pool *op)
+{
+       ENTRY;
+
+       if (op->op_size == 0)
+               RETURN(0);
+
+       down_write(&op->op_rw_sem);
+
+       OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+       op->op_array = NULL;
+       op->op_count = 0;
+       op->op_size = 0;
+
+       up_write(&op->op_rw_sem);
+       RETURN(0);
+}
+
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+       struct lov_obd *lov;
+       struct pool_desc *new_pool;
+       int rc;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       if (strlen(poolname) > LOV_MAXPOOLNAME)
+               RETURN(-ENAMETOOLONG);
+
+       OBD_ALLOC_PTR(new_pool);
+       if (new_pool == NULL)
+               RETURN(-ENOMEM);
+
+       strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME);
+       new_pool->pool_name[LOV_MAXPOOLNAME] = '\0';
+       new_pool->pool_lobd = obd;
+       /* the ref count is initialized to 1 because a newly created pool
+        * is always in use until it is deleted
+        */
+       atomic_set(&new_pool->pool_refcount, 1);
+       rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+       if (rc)
+              GOTO(out_err, rc);
+
+       memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+       rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+       if (rc)
+               GOTO(out_free_pool_obds, rc);
+
+       INIT_HLIST_NODE(&new_pool->pool_hash);
+
+#ifdef LPROCFS
+       /* this lives under LPROCFS because seq_file is not
+        * implemented for liblustre */
+       /* get ref for /proc file */
+       lov_pool_getref(new_pool);
+       new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+                                                      poolname, new_pool,
+                                                      &pool_proc_operations);
+       if (IS_ERR(new_pool->pool_proc_entry)) {
+               CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname);
+               new_pool->pool_proc_entry = NULL;
+               lov_pool_putref(new_pool);
+       }
+       CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry);
+#endif
+
+       spin_lock(&obd->obd_dev_lock);
+       list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+       lov->lov_pool_count++;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* add the pool to the hash only when it is fully ready */
+       rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname,
+                                &new_pool->pool_hash);
+       if (rc)
+               GOTO(out_err, rc = -EEXIST);
+
+       CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n",
+              poolname, lov->lov_pool_count);
+
+       RETURN(0);
+
+out_err:
+       spin_lock(&obd->obd_dev_lock);
+       list_del_init(&new_pool->pool_list);
+       lov->lov_pool_count--;
+       spin_unlock(&obd->obd_dev_lock);
+
+       lprocfs_remove(&new_pool->pool_proc_entry);
+
+       lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_pool_obds:
+       lov_ost_pool_free(&new_pool->pool_obds);
+       OBD_FREE_PTR(new_pool);
+       return rc;
+}
+
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       /* lookup and kill hash reference */
+       pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       if (pool->pool_proc_entry != NULL) {
+               CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry);
+               lprocfs_remove(&pool->pool_proc_entry);
+               lov_pool_putref(pool);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       list_del_init(&pool->pool_list);
+       lov->lov_pool_count--;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* release last reference */
+       lov_pool_putref(pool);
+
+       RETURN(0);
+}
+
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+       struct obd_uuid ost_uuid;
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       unsigned int lov_idx;
+       int rc;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       obd_str2uuid(&ost_uuid, ostname);
+
+       /* search ost in lov array */
+       obd_getref(obd);
+       for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+               if (!lov->lov_tgts[lov_idx])
+                       continue;
+               if (obd_uuid_equals(&ost_uuid,
+                                   &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                       break;
+       }
+       /* test if ost found in lov */
+       if (lov_idx == lov->desc.ld_tgt_count)
+               GOTO(out, rc = -EINVAL);
+
+       rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+       if (rc)
+               GOTO(out, rc);
+
+       pool->pool_rr.lqr_dirty = 1;
+
+       CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n",
+              ostname, poolname,  pool_tgt_count(pool));
+
+       EXIT;
+out:
+       obd_putref(obd);
+       lov_pool_putref(pool);
+       return rc;
+}
+
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+       struct obd_uuid ost_uuid;
+       struct lov_obd *lov;
+       struct pool_desc *pool;
+       unsigned int lov_idx;
+       int rc = 0;
+       ENTRY;
+
+       lov = &(obd->u.lov);
+
+       pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+       if (pool == NULL)
+               RETURN(-ENOENT);
+
+       obd_str2uuid(&ost_uuid, ostname);
+
+       obd_getref(obd);
+       /* search ost in lov array, to get index */
+       for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+               if (!lov->lov_tgts[lov_idx])
+                       continue;
+
+               if (obd_uuid_equals(&ost_uuid,
+                                   &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                       break;
+       }
+
+       /* test if ost found in lov */
+       if (lov_idx == lov->desc.ld_tgt_count)
+               GOTO(out, rc = -EINVAL);
+
+       lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+       pool->pool_rr.lqr_dirty = 1;
+
+       CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
+              poolname);
+
+       EXIT;
+out:
+       obd_putref(obd);
+       lov_pool_putref(pool);
+       return rc;
+}
+
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+       int i, rc;
+       ENTRY;
+
+       /* the caller may not have a ref on the pool if it got the pool
+        * without calling lov_find_pool() (e.g. by walking the lov pool
+        * list)
+        */
+       lov_pool_getref(pool);
+
+       down_read(&pool_tgt_rw_sem(pool));
+
+       for (i = 0; i < pool_tgt_count(pool); i++) {
+               if (pool_tgt_array(pool)[i] == idx)
+                       GOTO(out, rc = 0);
+       }
+       rc = -ENOENT;
+       EXIT;
+out:
+       up_read(&pool_tgt_rw_sem(pool));
+
+       lov_pool_putref(pool);
+       return rc;
+}
+
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+       struct pool_desc *pool;
+
+       pool = NULL;
+       if (poolname[0] != '\0') {
+               pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname);
+               if (pool == NULL)
+                       CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n",
+                             poolname);
+               if ((pool != NULL) && (pool_tgt_count(pool) == 0)) {
+                       CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n",
+                              poolname);
+                       /* pool is ignored, so we remove ref on it */
+                       lov_pool_putref(pool);
+                       pool = NULL;
+               }
+       }
+       return pool;
+}
diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c
new file mode 100644
index 0000000..13f1637
--- /dev/null
@@ -0,0 +1,1551 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <obd_lov.h>
+#include <lustre/lustre_idl.h>
+
+#include "lov_internal.h"
+
+static void lov_init_set(struct lov_request_set *set)
+{
+       set->set_count = 0;
+       atomic_set(&set->set_completes, 0);
+       atomic_set(&set->set_success, 0);
+       atomic_set(&set->set_finish_checked, 0);
+       set->set_cookies = NULL;
+       INIT_LIST_HEAD(&set->set_list);
+       atomic_set(&set->set_refcount, 1);
+       init_waitqueue_head(&set->set_waitq);
+       spin_lock_init(&set->set_lock);
+}
+
+void lov_finish_set(struct lov_request_set *set)
+{
+       struct list_head *pos, *n;
+       ENTRY;
+
+       LASSERT(set);
+       list_for_each_safe(pos, n, &set->set_list) {
+               struct lov_request *req = list_entry(pos,
+                                                        struct lov_request,
+                                                        rq_link);
+               list_del_init(&req->rq_link);
+
+               if (req->rq_oi.oi_oa)
+                       OBDO_FREE(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_md)
+                       OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_osfs)
+                       OBD_FREE(req->rq_oi.oi_osfs,
+                                sizeof(*req->rq_oi.oi_osfs));
+               OBD_FREE(req, sizeof(*req));
+       }
+
+       if (set->set_pga) {
+               int len = set->set_oabufs * sizeof(*set->set_pga);
+               OBD_FREE_LARGE(set->set_pga, len);
+       }
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       OBD_FREE(set, sizeof(*set));
+       EXIT;
+}
+
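+/* A set is finished once every request has completed.  For callers that
+ * must run the finish path exactly once, set_finish_checked ensures that
+ * only the first caller to observe completion reports the set finished. */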
+int lov_set_finished(struct lov_request_set *set, int idempotent)
+{
+       int completes = atomic_read(&set->set_completes);
+
+       CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count);
+
+       if (completes == set->set_count) {
+               if (idempotent)
+                       return 1;
+               if (atomic_inc_return(&set->set_finish_checked) == 1)
+                       return 1;
+       }
+       return 0;
+}
+
+void lov_update_set(struct lov_request_set *set,
+                   struct lov_request *req, int rc)
+{
+       req->rq_complete = 1;
+       req->rq_rc = rc;
+
+       atomic_inc(&set->set_completes);
+       if (rc == 0)
+               atomic_inc(&set->set_success);
+
+       wake_up(&set->set_waitq);
+}
+
+int lov_update_common_set(struct lov_request_set *set,
+                         struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* tolerate the error on an inactive OST */
+       if (rc && !(lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active))
+               rc = 0;
+
+       /* FIXME: in raid1 mode this should return 0 */
+       RETURN(rc);
+}
+
+void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
+{
+       list_add_tail(&req->rq_link, &set->set_list);
+       set->set_count++;
+       req->rq_rqset = set;
+}
+
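+/* Wait predicate for lov_check_and_wait_active(): the wait ends once the
+ * target is gone, has become active, or its first connect attempt has
+ * been tried. */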
+static int lov_check_set(struct lov_obd *lov, int idx)
+{
+       int rc = 0;
+       mutex_lock(&lov->lov_lock);
+
+       if (lov->lov_tgts[idx] == NULL ||
+           lov->lov_tgts[idx]->ltd_active ||
+           (lov->lov_tgts[idx]->ltd_exp != NULL &&
+            class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried))
+               rc = 1;
+
+       mutex_unlock(&lov->lov_lock);
+       return rc;
+}
+
+/* Check if the OSC connection exists and is active.
+ * If the OSC has not yet had a chance to connect to the OST the first time,
+ * wait once for it to connect instead of returning an error.
+ */
+int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx)
+{
+       wait_queue_head_t waitq;
+       struct l_wait_info lwi;
+       struct lov_tgt_desc *tgt;
+       int rc = 0;
+
+       mutex_lock(&lov->lov_lock);
+
+       tgt = lov->lov_tgts[ost_idx];
+
+       if (unlikely(tgt == NULL))
+               GOTO(out, rc = 0);
+
+       if (likely(tgt->ltd_active))
+               GOTO(out, rc = 1);
+
+       if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried)
+               GOTO(out, rc = 0);
+
+       mutex_unlock(&lov->lov_lock);
+
+       init_waitqueue_head(&waitq);
+       lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout),
+                                  cfs_time_seconds(1), NULL, NULL);
+
+       rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi);
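+       /* note: tgt is re-checked here without retaking lov_lock, so this
+        * may race with the target being deleted or deactivated */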
+       if (tgt != NULL && tgt->ltd_active)
+               return 1;
+
+       return 0;
+
+out:
+       mutex_unlock(&lov->lov_lock);
+       return rc;
+}
+
+extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                              struct lov_oinfo *loi, int flags,
+                              struct ost_lvb *lvb, __u32 mode, int rc);
+
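+/* On enqueue failure, clear the per-stripe lock handle and log the error
+ * if the target is still active (except for -EINTR and -EUSERS); on an
+ * inactive target the failure is downgraded to ELDLM_OK. */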
+static int lov_update_enqueue_lov(struct obd_export *exp,
+                                 struct lustre_handle *lov_lockhp,
+                                 struct lov_oinfo *loi, int flags, int idx,
+                                 struct ost_id *oi, int rc)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+
+       if (rc != ELDLM_OK &&
+           !(rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))) {
+               memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+               if (lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active) {
+                       /* -EUSERS used by OST to report file contention */
+                       if (rc != -EINTR && rc != -EUSERS)
+                               CERROR("%s: enqueue objid "DOSTID" subobj"
+                                      DOSTID" on OST idx %d: rc %d\n",
+                                      exp->exp_obd->obd_name,
+                                      POSTID(oi), POSTID(&loi->loi_oi),
+                                      loi->loi_ost_idx, rc);
+               } else
+                       rc = ELDLM_OK;
+       }
+       return rc;
+}
+
+int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
+{
+       struct lov_request_set *set = req->rq_rqset;
+       struct lustre_handle *lov_lockhp;
+       struct obd_info *oi = set->set_oi;
+       struct lov_oinfo *loi;
+       ENTRY;
+
+       LASSERT(oi != NULL);
+
+       lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+       loi = oi->oi_md->lsm_oinfo[req->rq_stripe];
+
+       /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
+        * and that copy can be arbitrarily out of date.
+        *
+        * The LOV API is due for a serious rewriting anyways, and this
+        * can be addressed then. */
+
+       lov_stripe_lock(oi->oi_md);
+       osc_update_enqueue(lov_lockhp, loi, oi->oi_flags,
+                          &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc);
+       if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT))
+               memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+       rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags,
+                                   req->rq_idx, &oi->oi_md->lsm_oi, rc);
+       lov_stripe_unlock(oi->oi_md);
+       lov_update_set(set, req, rc);
+       RETURN(rc);
+}
+
+/* The callback for osc_enqueue that updates lov info for every OSC request. */
+static int cb_update_enqueue(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct ldlm_enqueue_info *einfo;
+       struct lov_request *lovreq;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       einfo = lovreq->rq_rqset->set_ei;
+       return lov_update_enqueue_set(lovreq, einfo->ei_mode, rc);
+}
+
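+/* Called when an enqueue/match set is complete: unless every request
+ * succeeded, cancel the locks that were already granted so the set as a
+ * whole either succeeds or leaves nothing behind. */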
+static int enqueue_done(struct lov_request_set *set, __u32 mode)
+{
+       struct lov_request *req;
+       struct lov_obd *lov = &set->set_exp->exp_obd->u.lov;
+       int completes = atomic_read(&set->set_completes);
+       int rc = 0;
+       ENTRY;
+
+       /* enqueue/match success, just return */
+       if (completes && completes == atomic_read(&set->set_success))
+               RETURN(0);
+
+       /* cancel enqueued/matched locks */
+       list_for_each_entry(req, &set->set_list, rq_link) {
+               struct lustre_handle *lov_lockhp;
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+
+               lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
+               LASSERT(lov_lockhp);
+               if (!lustre_handle_is_used(lov_lockhp))
+                       continue;
+
+               rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                               req->rq_oi.oi_md, mode, lov_lockhp);
+               if (rc && lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active)
+                       CERROR("%s: cancelling objid "DOSTID" on OST "
+                              "idx %d error: rc = %d\n",
+                              set->set_exp->exp_obd->obd_name,
+                              POSTID(&req->rq_oi.oi_md->lsm_oi),
+                              req->rq_idx, rc);
+       }
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+       RETURN(rc);
+}
+
+int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc,
+                        struct ptlrpc_request_set *rqset)
+{
+       int ret = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       /* Do enqueue_done only for sync requests and if any request
+        * succeeded. */
+       if (!rqset) {
+               if (rc)
+                       atomic_set(&set->set_completes, 0);
+               ret = enqueue_done(set, mode);
+       } else if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc ? rc : ret);
+}
+
+static void lov_llh_addref(void *llhp)
+{
+       struct lov_lock_handles *llh = llhp;
+
+       atomic_inc(&llh->llh_refcount);
+       CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", llh,
+              atomic_read(&llh->llh_refcount));
+}
+
+static struct portals_handle_ops lov_handle_ops = {
+       .hop_addref = lov_llh_addref,
+       .hop_free   = NULL,
+};
+
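+/* Allocate a lock-handle array covering every stripe of the lsm.  The
+ * refcount starts at 2: one reference for the handle hash and one for
+ * the caller. */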
+static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm)
+{
+       struct lov_lock_handles *llh;
+
+       OBD_ALLOC(llh, sizeof(*llh) +
+                 sizeof(*llh->llh_handles) * lsm->lsm_stripe_count);
+       if (llh == NULL)
+               return NULL;
+
+       atomic_set(&llh->llh_refcount, 2);
+       llh->llh_stripe_count = lsm->lsm_stripe_count;
+       INIT_LIST_HEAD(&llh->llh_handle.h_link);
+       class_handle_hash(&llh->llh_handle, &lov_handle_ops);
+
+       return llh;
+}
+
+int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct ldlm_enqueue_info *einfo,
+                        struct lov_request_set **reqset)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_ei = einfo;
+       set->set_lockh = lov_llh_new(oinfo->oi_md);
+       if (set->set_lockh == NULL)
+               GOTO(out_set, rc = -ENOMEM);
+       oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+               obd_off start, end;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_stripe_intersects(oinfo->oi_md, i,
+                                          oinfo->oi_policy.l_extent.start,
+                                          oinfo->oi_policy.l_extent.end,
+                                          &start, &end))
+                       continue;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
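+               /* single allocation holding the sub-lsm, its one-entry
+                * oinfo pointer array and the lov_oinfo itself; the
+                * pointer is fixed up below to point at the tail of the
+                * buffer */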
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md) +
+                       sizeof(struct lov_oinfo *) +
+                       sizeof(struct lov_oinfo);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               req->rq_oi.oi_md->lsm_oinfo[0] =
+                       ((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) +
+                       sizeof(struct lov_oinfo *);
+
+               /* Set lov request specific parameters. */
+               req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
+               req->rq_oi.oi_cb_up = cb_update_enqueue;
+               req->rq_oi.oi_flags = oinfo->oi_flags;
+
+               LASSERT(req->rq_oi.oi_lockh);
+
+               req->rq_oi.oi_policy.l_extent.gid =
+                       oinfo->oi_policy.l_extent.gid;
+               req->rq_oi.oi_policy.l_extent.start = start;
+               req->rq_oi.oi_policy.l_extent.end = end;
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid =
+                       loi->loi_kms_valid;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms = loi->loi_kms;
+               req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb = loi->loi_lvb;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(0);
+out_set:
+       lov_fini_enqueue_set(set, einfo->ei_mode, rc, NULL);
+       RETURN(rc);
+}
+
+int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       rc = enqueue_done(set, mode);
+       if ((set->set_count == atomic_read(&set->set_success)) &&
+           (flags & LDLM_FL_TEST_LOCK))
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct lov_stripe_md *lsm, ldlm_policy_data_t *policy,
+                      __u32 mode, struct lustre_handle *lockh,
+                      struct lov_request_set **reqset)
+{
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_lockh = lov_llh_new(lsm);
+       if (set->set_lockh == NULL)
+               GOTO(out_set, rc = -ENOMEM);
+       lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+               obd_off start, end;
+
+               loi = lsm->lsm_oinfo[i];
+               if (!lov_stripe_intersects(lsm, i, policy->l_extent.start,
+                                          policy->l_extent.end, &start, &end))
+                       continue;
+
+               /* FIXME: in raid1 mode this error should be tolerated */
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out_set, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_oi.oi_policy.l_extent.start = start;
+               req->rq_oi.oi_policy.l_extent.end = end;
+               req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid;
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_match_set(set, mode, 0);
+       RETURN(rc);
+}
+
+int lov_fini_cancel_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+
+       LASSERT(set->set_exp);
+       if (set->set_lockh)
+               lov_llh_put(set->set_lockh);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo,
+                       struct lov_stripe_md *lsm, __u32 mode,
+                       struct lustre_handle *lockh,
+                       struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       int i, rc = 0;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_lockh = lov_handle2llh(lockh);
+       if (set->set_lockh == NULL) {
+               CERROR("LOV: invalid lov lock handle %p\n", lockh);
+               GOTO(out_set, rc = -EINVAL);
+       }
+       lockh->cookie = set->set_lockh->llh_handle.h_cookie;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_request *req;
+               struct lustre_handle *lov_lockhp;
+               struct lov_oinfo *loi = lsm->lsm_oinfo[i];
+
+               lov_lockhp = set->set_lockh->llh_handles + i;
+               if (!lustre_handle_is_used(lov_lockhp)) {
+                       CDEBUG(D_INFO, "lov idx %d subobj "DOSTID" no lock\n",
+                              loi->loi_ost_idx, POSTID(&loi->loi_oi));
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING: submd should be from the subobj */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oi.oi_md->lsm_stripe_count = 0;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_cancel_set(set);
+       RETURN(rc);
+}
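+
+/* Merge the per-stripe attributes of every successful request into one
+ * obdo and copy the result back into the set's oi_oa. */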
+static int common_attr_done(struct lov_request_set *set)
+{
+       struct list_head *pos;
+       struct lov_request *req;
+       struct obdo *tmp_oa;
+       int rc = 0, attrset = 0;
+       ENTRY;
+
+       LASSERT(set->set_oi != NULL);
+
+       if (set->set_oi->oi_oa == NULL)
+               RETURN(0);
+
+       if (!atomic_read(&set->set_success))
+               RETURN(-EIO);
+
+       OBDO_ALLOC(tmp_oa);
+       if (tmp_oa == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+               if (req->rq_oi.oi_oa->o_valid == 0)   /* inactive stripe */
+                       continue;
+               lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa,
+                               req->rq_oi.oi_oa->o_valid,
+                               set->set_oi->oi_md, req->rq_stripe, &attrset);
+       }
+       if (!attrset) {
+               CERROR("No stripes had valid attrs\n");
+               rc = -EIO;
+       }
+       if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) &&
+           (set->set_oi->oi_md->lsm_stripe_count != attrset)) {
+               /* When we take the attributes of some epoch, we require
+                * all the OSTs to be active. */
+               CERROR("Not all the stripes had valid attrs\n");
+               GOTO(out, rc = -EIO);
+       }
+
+       tmp_oa->o_oi = set->set_oi->oi_oa->o_oi;
+       memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa));
+out:
+       if (tmp_oa)
+               OBDO_FREE(tmp_oa);
+       RETURN(rc);
+}
+
+static int brw_done(struct lov_request_set *set)
+{
+       struct lov_stripe_md *lsm = set->set_oi->oi_md;
+       struct lov_oinfo     *loi = NULL;
+       struct list_head *pos;
+       struct lov_request *req;
+       ENTRY;
+
+       list_for_each (pos, &set->set_list) {
+               req = list_entry(pos, struct lov_request, rq_link);
+
+               if (!req->rq_complete || req->rq_rc)
+                       continue;
+
+               loi = lsm->lsm_oinfo[req->rq_stripe];
+
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS)
+                       loi->loi_lvb.lvb_blocks = req->rq_oi.oi_oa->o_blocks;
+       }
+
+       RETURN(0);
+}
+
+int lov_fini_brw_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = brw_done(set);
+               /* FIXME update qos data here */
+       }
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
+                    obd_count oa_bufs, struct brw_page *pga,
+                    struct obd_trans_info *oti,
+                    struct lov_request_set **reqset)
+{
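+       /* per-stripe bookkeeping for the counting sort below: how many
+        * pages land on each stripe, where the stripe's slot starts in
+        * the sorted array, and a running offset while placing pages */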
+       struct {
+               obd_count       index;
+               obd_count       count;
+               obd_count       off;
+       } *info = NULL;
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i, shift;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oti = oti;
+       set->set_oi = oinfo;
+       set->set_oabufs = oa_bufs;
+       OBD_ALLOC_LARGE(set->set_pga, oa_bufs * sizeof(*set->set_pga));
+       if (!set->set_pga)
+               GOTO(out, rc = -ENOMEM);
+
+       OBD_ALLOC_LARGE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+       if (!info)
+               GOTO(out, rc = -ENOMEM);
+
+       /* calculate the page count for each stripe */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+               info[stripe].count++;
+       }
+
+       /* alloc and initialize lov request */
+       shift = 0;
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = NULL;
+               struct lov_request *req;
+
+               if (info[i].count == 0)
+                       continue;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               if (oinfo->oi_oa) {
+                       memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                              sizeof(*req->rq_oi.oi_oa));
+               }
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+
+               req->rq_buflen = sizeof(*req->rq_oi.oi_md);
+               OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen);
+               if (req->rq_oi.oi_md == NULL) {
+                       OBDO_FREE(req->rq_oi.oi_oa);
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               req->rq_idx = loi->loi_ost_idx;
+               req->rq_stripe = i;
+
+               /* XXX LOV STACKING */
+               req->rq_oi.oi_md->lsm_oi = loi->loi_oi;
+               req->rq_oabufs = info[i].count;
+               req->rq_pgaidx = shift;
+               shift += req->rq_oabufs;
+
+               /* remember where this stripe's pages start in the
+                * sorted brw_page array */
+               info[i].index = req->rq_pgaidx;
+
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out, rc = -EIO);
+
+       /* scatter each page into its stripe's slot (a counting sort on
+        * stripe number) and convert its file offset to a per-stripe
+        * offset */
+       for (i = 0; i < oa_bufs; i++) {
+               int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off);
+
+               shift = info[stripe].index + info[stripe].off;
+               LASSERT(shift < oa_bufs);
+               set->set_pga[shift] = pga[i];
+               lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe,
+                                 &set->set_pga[shift].off);
+               info[stripe].off++;
+       }
+out:
+       if (info)
+               OBD_FREE_LARGE(info,
+                              sizeof(*info) * oinfo->oi_md->lsm_stripe_count);
+
+       if (rc == 0)
+               *reqset = set;
+       else
+               lov_fini_brw_set(set);
+
+       RETURN(rc);
+}
+
+int lov_fini_getattr_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes))
+               rc = common_attr_done(set);
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+/* The callback for osc_getattr_async that finalizes request info when
+ * a response is received. */
+static int cb_getattr_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+
+               loi = oinfo->oi_md->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH)
+                               /* SOM requires all the OSTs to be active. */
+                               GOTO(out_set, rc = -EIO);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_cb_up = cb_getattr_update;
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_getattr_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_destroy_set(struct lov_request_set *set)
+{
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(0);
+}
+
+int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obdo *src_oa, struct lov_stripe_md *lsm,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+       set->set_oi->oi_md = lsm;
+       set->set_oi->oi_oa = src_oa;
+       set->set_oti = oti;
+       if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE)
+               set->set_cookies = oti->oti_logcookies;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi;
+               struct lov_request *req;
+
+               loi = lsm->lsm_oinfo[i];
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_destroy_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_setattr_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = common_attr_done(set);
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+       RETURN(rc);
+}
+
+int lov_update_setattr_set(struct lov_request_set *set,
+                          struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+       struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* tolerate the error on an inactive OST */
+       if (rc && !(lov->lov_tgts[req->rq_idx] &&
+                   lov->lov_tgts[req->rq_idx]->ltd_active))
+               rc = 0;
+
+       if (rc == 0) {
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime =
+                               req->rq_oi.oi_oa->o_ctime;
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime =
+                               req->rq_oi.oi_oa->o_mtime;
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME)
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime =
+                               req->rq_oi.oi_oa->o_atime;
+       }
+
+       RETURN(rc);
+}
+
+/* The callback for osc_setattr_async that finalizes request info when
+ * a response is received. */
+static int cb_setattr_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
+                        struct obd_trans_info *oti,
+                        struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oti = oti;
+       set->set_oi = oinfo;
+       if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+               set->set_cookies = oti->oti_logcookies;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+               req->rq_oi.oi_cb_up = cb_setattr_update;
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
+                       int off = lov_stripe_offset(oinfo->oi_md,
+                                                   oinfo->oi_oa->o_size, i,
+                                                   &req->rq_oi.oi_oa->o_size);
+
+                       if (off < 0 && req->rq_oi.oi_oa->o_size)
+                               req->rq_oi.oi_oa->o_size--;
+
+                       CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
+                              i, req->rq_oi.oi_oa->o_size,
+                              oinfo->oi_oa->o_size);
+               }
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_setattr_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_punch_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               rc = -EIO;
+               /* FIXME update qos data here */
+               if (atomic_read(&set->set_success))
+                       rc = common_attr_done(set);
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+int lov_update_punch_set(struct lov_request_set *set,
+                        struct lov_request *req, int rc)
+{
+       struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov;
+       struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md;
+       ENTRY;
+
+       lov_update_set(set, req, rc);
+
+       /* tolerate the error on an inactive OST */
+       if (rc && !lov->lov_tgts[req->rq_idx]->ltd_active)
+               rc = 0;
+
+       if (rc == 0) {
+               lov_stripe_lock(lsm);
+               if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) {
+                       lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks =
+                               req->rq_oi.oi_oa->o_blocks;
+               }
+
+               lov_stripe_unlock(lsm);
+       }
+
+       RETURN(rc);
+}
+
+/* The callback for osc_punch that finalizes request info when a
+ * response is received. */
+static int cb_update_punch(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_punch_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
+                      struct obd_trans_info *oti,
+                      struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_oi = oinfo;
+       set->set_exp = exp;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+               obd_off rs, re;
+
+               if (!lov_stripe_intersects(oinfo->oi_md, i,
+                                          oinfo->oi_policy.l_extent.start,
+                                          oinfo->oi_policy.l_extent.end,
+                                          &rs, &re))
+                       continue;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       GOTO(out_set, rc = -EIO);
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
+                      sizeof(*req->rq_oi.oi_oa));
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_valid |= OBD_MD_FLGROUP;
+
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+               req->rq_oi.oi_cb_up = cb_update_punch;
+
+               req->rq_oi.oi_policy.l_extent.start = rs;
+               req->rq_oi.oi_policy.l_extent.end = re;
+               req->rq_oi.oi_policy.l_extent.gid = -1;
+
+               req->rq_oi.oi_capa = oinfo->oi_capa;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_punch_set(set);
+       RETURN(rc);
+}
+
+int lov_fini_sync_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+       LASSERT(set->set_exp);
+       if (atomic_read(&set->set_completes)) {
+               if (!atomic_read(&set->set_success))
+                       rc = -EIO;
+               /* FIXME update qos data here */
+       }
+
+       lov_put_reqset(set);
+
+       RETURN(rc);
+}
+
+/* The callback for osc_sync that finalizes request info when a
+ * response is received. */
+static int cb_sync_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
+                     obd_off start, obd_off end,
+                     struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &exp->exp_obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC_PTR(set);
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_exp = exp;
+       set->set_oi = oinfo;
+
+       for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) {
+               struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i];
+               struct lov_request *req;
+               obd_off rs, re;
+
+               if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                       continue;
+               }
+
+               if (!lov_stripe_intersects(oinfo->oi_md, i, start, end, &rs,
+                                          &re))
+                       continue;
+
+               OBD_ALLOC_PTR(req);
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+               req->rq_stripe = i;
+               req->rq_idx = loi->loi_ost_idx;
+
+               OBDO_ALLOC(req->rq_oi.oi_oa);
+               if (req->rq_oi.oi_oa == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+               *req->rq_oi.oi_oa = *oinfo->oi_oa;
+               req->rq_oi.oi_oa->o_oi = loi->loi_oi;
+               req->rq_oi.oi_oa->o_stripe_idx = i;
+
+               req->rq_oi.oi_policy.l_extent.start = rs;
+               req->rq_oi.oi_policy.l_extent.end = re;
+               req->rq_oi.oi_policy.l_extent.gid = -1;
+               req->rq_oi.oi_cb_up = cb_sync_update;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_sync_set(set);
+       RETURN(rc);
+}
+
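+/* Saturating addition: clamp at LOV_U64_MAX instead of wrapping when
+ * summing per-OST file counts. */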
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add)                                          \
+       do {                                                            \
+               if ((tot) + (add) < (tot))                              \
+                       (tot) = LOV_U64_MAX;                            \
+               else                                                    \
+                       (tot) += (add);                                 \
+       } while (0)
+
+int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,
+                   int success)
+{
+       ENTRY;
+
+       if (success) {
+               __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov,
+                                                          LOV_MAGIC, 0);
+               if (osfs->os_files != LOV_U64_MAX)
+                       lov_do_div64(osfs->os_files, expected_stripes);
+               if (osfs->os_ffree != LOV_U64_MAX)
+                       lov_do_div64(osfs->os_ffree, expected_stripes);
+
+               spin_lock(&obd->obd_osfs_lock);
+               memcpy(&obd->obd_osfs, osfs, sizeof(*osfs));
+               obd->obd_osfs_age = cfs_time_current_64();
+               spin_unlock(&obd->obd_osfs_lock);
+               RETURN(0);
+       }
+
+       RETURN(-EIO);
+}
+
+int lov_fini_statfs_set(struct lov_request_set *set)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (set == NULL)
+               RETURN(0);
+
+       if (atomic_read(&set->set_completes)) {
+               rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs,
+                                    atomic_read(&set->set_success));
+       }
+       lov_put_reqset(set);
+       RETURN(rc);
+}
+
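+/* Fold one OST's statfs reply into the aggregate: normalize both sides
+ * to the larger block size before summing the block counts, and add the
+ * file counts with saturation instead of overflow. */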
+void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
+                      int success)
+{
+       int shift = 0, quit = 0;
+       __u64 tmp;
+
+       if (success == 0) {
+               memcpy(osfs, lov_sfs, sizeof(*lov_sfs));
+       } else {
+               if (osfs->os_bsize != lov_sfs->os_bsize) {
+                       /* Assume all block sizes are powers of 2; tmp
+                        * then has exactly two bits set, and the loop
+                        * below computes the distance between them,
+                        * i.e. the difference of the two block-size
+                        * exponents. */
+                       tmp = osfs->os_bsize | lov_sfs->os_bsize;
+                       for (shift = 0; shift <= 64; ++shift) {
+                               if (tmp & 1) {
+                                       if (quit)
+                                               break;
+                                       else
+                                               quit = 1;
+                                       shift = 0;
+                               }
+                               tmp >>= 1;
+                       }
+               }
+
+               if (osfs->os_bsize < lov_sfs->os_bsize) {
+                       osfs->os_bsize = lov_sfs->os_bsize;
+
+                       osfs->os_bfree  >>= shift;
+                       osfs->os_bavail >>= shift;
+                       osfs->os_blocks >>= shift;
+               } else if (shift != 0) {
+                       lov_sfs->os_bfree  >>= shift;
+                       lov_sfs->os_bavail >>= shift;
+                       lov_sfs->os_blocks >>= shift;
+               }
+               osfs->os_bfree += lov_sfs->os_bfree;
+               osfs->os_bavail += lov_sfs->os_bavail;
+               osfs->os_blocks += lov_sfs->os_blocks;
+               /* XXX not sure about this one - depends on policy.
+                *   - could be minimum if we always stripe on all OBDs
+                *     (but that would be wrong for any other policy,
+                *     if one of the OBDs has no more objects left)
+                *   - could be sum if we stripe whole objects
+                *   - could be average, just to give a nice number
+                *
+                * To give a "reasonable" (if not wholly accurate)
+                * number, we divide the total number of free objects
+                * by expected stripe count (watch out for overflow).
+                */
+               LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files);
+               LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree);
+       }
+}
+
+/* The callback for osc_statfs_async that finalizes request info when
+ * a response is received. */
+static int cb_statfs_update(void *cookie, int rc)
+{
+       struct obd_info *oinfo = cookie;
+       struct lov_request *lovreq;
+       struct lov_request_set *set;
+       struct obd_statfs *osfs, *lov_sfs;
+       struct lov_obd *lov;
+       struct lov_tgt_desc *tgt;
+       struct obd_device *lovobd, *tgtobd;
+       int success;
+       ENTRY;
+
+       lovreq = container_of(oinfo, struct lov_request, rq_oi);
+       set = lovreq->rq_rqset;
+       lovobd = set->set_obd;
+       lov = &lovobd->u.lov;
+       osfs = set->set_oi->oi_osfs;
+       lov_sfs = oinfo->oi_osfs;
+       success = atomic_read(&set->set_success);
+       /* XXX: lov_update_common_set() does the same, but it cannot be
+        * used here because lovset->set_exp is not initialized. */
+       lov_update_set(set, lovreq, rc);
+       if (rc)
+               GOTO(out, rc);
+
+       obd_getref(lovobd);
+       tgt = lov->lov_tgts[lovreq->rq_idx];
+       if (!tgt || !tgt->ltd_active)
+               GOTO(out_update, rc);
+
+       tgtobd = class_exp2obd(tgt->ltd_exp);
+       spin_lock(&tgtobd->obd_osfs_lock);
+       memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs));
+       if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0)
+               tgtobd->obd_osfs_age = cfs_time_current_64();
+       spin_unlock(&tgtobd->obd_osfs_lock);
+
+out_update:
+       lov_update_statfs(osfs, lov_sfs, success);
+       obd_putref(lovobd);
+
+out:
+       if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
+           lov_set_finished(set, 0)) {
+               lov_statfs_interpret(NULL, set, set->set_count !=
+                                    atomic_read(&set->set_success));
+       }
+
+       RETURN(0);
+}
+
+int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
+                       struct lov_request_set **reqset)
+{
+       struct lov_request_set *set;
+       struct lov_obd *lov = &obd->u.lov;
+       int rc = 0, i;
+       ENTRY;
+
+       OBD_ALLOC(set, sizeof(*set));
+       if (set == NULL)
+               RETURN(-ENOMEM);
+       lov_init_set(set);
+
+       set->set_obd = obd;
+       set->set_oi = oinfo;
+
+       /* We only get block data from the OBD */
+       for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+               struct lov_request *req;
+
+               if (lov->lov_tgts[i] == NULL ||
+                   (!lov_check_and_wait_active(lov, i) &&
+                    (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
+                       CDEBUG(D_HA, "lov idx %d inactive\n", i);
+                       continue;
+               }
+
+               /* skip targets that have been explicitly disabled by
+                * the administrator */
+               if (!lov->lov_tgts[i]->ltd_exp) {
+                       CDEBUG(D_HA, "lov idx %d administratively disabled\n", i);
+                       continue;
+               }
+
+               OBD_ALLOC(req, sizeof(*req));
+               if (req == NULL)
+                       GOTO(out_set, rc = -ENOMEM);
+
+               OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs));
+               if (req->rq_oi.oi_osfs == NULL) {
+                       OBD_FREE(req, sizeof(*req));
+                       GOTO(out_set, rc = -ENOMEM);
+               }
+
+               req->rq_idx = i;
+               req->rq_oi.oi_cb_up = cb_statfs_update;
+               req->rq_oi.oi_flags = oinfo->oi_flags;
+
+               lov_set_add_req(req, set);
+       }
+       if (!set->set_count)
+               GOTO(out_set, rc = -EIO);
+       *reqset = set;
+       RETURN(rc);
+out_set:
+       lov_fini_statfs_set(set);
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c
new file mode 100644 (file)
index 0000000..204ecd0
--- /dev/null
@@ -0,0 +1,211 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+static void lovsub_req_completion(const struct lu_env *env,
+                                 const struct cl_req_slice *slice, int ioret)
+{
+       struct lovsub_req *lsr;
+
+       ENTRY;
+       lsr = cl2lovsub_req(slice);
+       OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem);
+       EXIT;
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for lovsub
+ * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx
+ * field, which is filled there.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+                               const struct cl_req_slice *slice,
+                               const struct cl_object *obj,
+                               struct cl_req_attr *attr, obd_valid flags)
+{
+       struct lovsub_object *subobj;
+
+       ENTRY;
+       subobj = cl2lovsub(obj);
+       /*
+        * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it
+        * unconditionally. It never changes anyway.
+        */
+       attr->cra_oa->o_stripe_idx = subobj->lso_index;
+       EXIT;
+}
+
+static const struct cl_req_operations lovsub_req_ops = {
+       .cro_attr_set   = lovsub_req_attr_set,
+       .cro_completion = lovsub_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
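+/* Wire a lovsub device on top of the next layer in the cl device stack:
+ * initialize the next device, take a reference on it and remember it as
+ * acid_next. */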
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+                             const char *name, struct lu_device *next)
+{
+       struct lovsub_device  *lsd = lu2lovsub_dev(d);
+       struct lu_device_type *ldt;
+       int rc;
+
+       ENTRY;
+       next->ld_site = d->ld_site;
+       ldt = next->ld_type;
+       LASSERT(ldt != NULL);
+       rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL);
+       if (rc) {
+               next->ld_site = NULL;
+               RETURN(rc);
+       }
+
+       lu_device_get(next);
+       lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+       lsd->acid_next = lu2cl_dev(next);
+       RETURN(rc);
+}
+
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+                                           struct lu_device *d)
+{
+       struct lu_device *next;
+       struct lovsub_device *lsd;
+
+       ENTRY;
+       lsd = lu2lovsub_dev(d);
+       next = cl2lu_dev(lsd->acid_next);
+       lsd->acid_super = NULL;
+       lsd->acid_next = NULL;
+       RETURN(next);
+}
+
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+                                           struct lu_device *d)
+{
+       struct lovsub_device *lsd  = lu2lovsub_dev(d);
+       struct lu_device     *next = cl2lu_dev(lsd->acid_next);
+
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(lsd);
+       return next;
+}
+
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+                          struct cl_req *req)
+{
+       struct lovsub_req *lsr;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, __GFP_IO);
+       if (lsr != NULL) {
+               cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+static const struct lu_device_operations lovsub_lu_ops = {
+       .ldo_object_alloc      = lovsub_object_alloc,
+       .ldo_process_config    = NULL,
+       .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations lovsub_cl_ops = {
+       .cdo_req_init = lovsub_req_init
+};
+
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+                                            struct lu_device_type *t,
+                                            struct lustre_cfg *cfg)
+{
+       struct lu_device     *d;
+       struct lovsub_device *lsd;
+
+       OBD_ALLOC_PTR(lsd);
+       if (lsd != NULL) {
+               int result;
+
+               result = cl_device_init(&lsd->acid_cl, t);
+               if (result == 0) {
+                       d = lovsub2lu_dev(lsd);
+                       d->ld_ops        = &lovsub_lu_ops;
+                       lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+               } else
+                       d = ERR_PTR(result);
+       } else
+               d = ERR_PTR(-ENOMEM);
+       return d;
+}
+
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+       .ldto_device_alloc = lovsub_device_alloc,
+       .ldto_device_free  = lovsub_device_free,
+
+       .ldto_device_init    = lovsub_device_init,
+       .ldto_device_fini    = lovsub_device_fini
+};
+
+#define LUSTRE_LOVSUB_NAME      "lovsub"
+
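+/*
+ * The device type the LOV layer instantiates beneath each striped object
+ * stack; LU_DEVICE_CL tags it as a client-side cl device.
+ */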
+struct lu_device_type lovsub_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_LOVSUB_NAME,
+       .ldt_ops      = &lovsub_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c
new file mode 100644 (file)
index 0000000..783ec68
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
+
+/* All trivial */
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c
new file mode 100644 (file)
index 0000000..03bab17
--- /dev/null
@@ -0,0 +1,485 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+static void lovsub_lock_fini(const struct lu_env *env,
+                            struct cl_lock_slice *slice)
+{
+       struct lovsub_lock   *lsl;
+
+       ENTRY;
+       lsl = cl2lovsub_lock(slice);
+       LASSERT(list_empty(&lsl->lss_parents));
+       OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem);
+       EXIT;
+}
+
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+
+       ENTRY;
+       parent = lov->lls_cl.cls_lock;
+       cl_lock_get(parent);
+       lu_ref_add(&parent->cll_reference, "lovsub-parent", current);
+       cl_lock_mutex_get(env, parent);
+       EXIT;
+}
+
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+
+       ENTRY;
+       parent = lov->lls_cl.cls_lock;
+       cl_lock_mutex_put(env, lov->lls_cl.cls_lock);
+       lu_ref_del(&parent->cll_reference, "lovsub-parent", current);
+       cl_lock_put(env, parent);
+       EXIT;
+}
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the lovsub
+ * layer; it is called whenever the sub-lock state changes, and propagates
+ * the state change to the top-locks.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             enum cl_lock_state state)
+{
+       struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+       struct lov_lock_link *scan;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+       ENTRY;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               struct lov_lock *lov    = scan->lll_super;
+               struct cl_lock  *parent = lov->lls_cl.cls_lock;
+
+               if (sub->lss_active != parent) {
+                       lovsub_parent_lock(env, lov);
+                       cl_lock_signal(env, parent);
+                       lovsub_parent_unlock(env, lov);
+               }
+       }
+       EXIT;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_weigh() estimating lock weight
+ * by asking the parent lock.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+                                      const struct cl_lock_slice *slice)
+{
+       struct lovsub_lock *lock = cl2lovsub_lock(slice);
+       struct lov_lock    *lov;
+       unsigned long       dumbbell;
+
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+       if (!list_empty(&lock->lss_parents)) {
+               /*
+                * It is not clear whether all parents have to be asked and
+                * their estimates summed, or whether it is enough to ask
+                * one. For the current usages, asking one is always enough.
+                */
+               lov = container_of(lock->lss_parents.next,
+                                  struct lov_lock_link, lll_list)->lll_super;
+
+               lovsub_parent_lock(env, lov);
+               dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock);
+               lovsub_parent_unlock(env, lov);
+       } else
+               dumbbell = 0;
+
+       RETURN(dumbbell);
+}
+
+/**
+ * Maps start/end offsets within a stripe to offsets within a file.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+                                 struct lov_object *lov,
+                                 int stripe, struct cl_lock_descr *out)
+{
+       pgoff_t size; /* stripe size in pages */
+       pgoff_t skip; /* how many pages in every stripe are occupied by
+                      * "other" stripes */
+       pgoff_t start;
+       pgoff_t end;
+
+       ENTRY;
+       start = in->cld_start;
+       end   = in->cld_end;
+
+       if (lov->lo_lsm->lsm_stripe_count > 1) {
+               size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size);
+               skip = (lov->lo_lsm->lsm_stripe_count - 1) * size;
+
+               /* XXX overflow check here? */
+               start += start/size * skip + stripe * size;
+
+               if (end != CL_PAGE_EOF) {
+                       end += end/size * skip + stripe * size;
+                       /*
+                        * And check for overflow...
+                        */
+                       if (end < in->cld_end)
+                               end = CL_PAGE_EOF;
+               }
+       }
+       out->cld_start = start;
+       out->cld_end   = end;
+       EXIT;
+}
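+
+/*
+ * A worked example of the mapping above, with hypothetical numbers: for
+ * lsm_stripe_count = 3 and a stripe size of 256 pages, skip = 2 * 256 = 512;
+ * an in-stripe offset start = 300 on stripe 1 then maps to
+ * 300 + (300 / 256) * 512 + 1 * 256 = 1068 pages into the file.
+ */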
+
+/**
+ * Adjusts parent lock extent when a sub-lock is attached to a parent. This is
+ * called in two ways:
+ *
+ *     - as part of receive call-back, when server returns granted extent to
+ *       the client, and
+ *
+ *     - when top-lock finds existing sub-lock in the cache.
+ *
+ * Note that lock mode is not propagated to the parent: i.e., if a CLM_READ
+ * top-lock matches a CLM_WRITE sub-lock, the top-lock stays CLM_READ.
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+                      struct lovsub_lock *sublock,
+                      const struct cl_lock_descr *d, int idx)
+{
+       struct cl_lock       *parent;
+       struct lovsub_object *subobj;
+       struct cl_lock_descr *pd;
+       struct cl_lock_descr *parent_descr;
+       int                result;
+
+       parent       = lov->lls_cl.cls_lock;
+       parent_descr = &parent->cll_descr;
+       LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode));
+
+       subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+       pd     = &lov_env_info(env)->lti_ldescr;
+
+       pd->cld_obj  = parent_descr->cld_obj;
+       pd->cld_mode = parent_descr->cld_mode;
+       pd->cld_gid  = parent_descr->cld_gid;
+       lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd);
+       lov->lls_sub[idx].sub_got = *d;
+       /*
+        * Notify top-lock about modification, if lock description changes
+        * materially.
+        */
+       if (!cl_lock_ext_match(parent_descr, pd))
+               result = cl_lock_modify(env, parent, pd);
+       else
+               result = 0;
+       return result;
+}
+
+static int lovsub_lock_modify(const struct lu_env *env,
+                             const struct cl_lock_slice *s,
+                             const struct cl_lock_descr *d)
+{
+       struct lovsub_lock   *lock   = cl2lovsub_lock(s);
+       struct lov_lock_link *scan;
+       struct lov_lock      *lov;
+       int                   result = 0;
+
+       ENTRY;
+
+       LASSERT(cl_lock_mode_match(d->cld_mode,
+                                  s->cls_lock->cll_descr.cld_mode));
+       list_for_each_entry(scan, &lock->lss_parents, lll_list) {
+               int rc;
+
+               lov = scan->lll_super;
+               lovsub_parent_lock(env, lov);
+               rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx);
+               lovsub_parent_unlock(env, lov);
+               result = result ?: rc;
+       }
+       RETURN(result);
+}
+
+static int lovsub_lock_closure(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              struct cl_lock_closure *closure)
+{
+       struct lovsub_lock   *sub;
+       struct cl_lock       *parent;
+       struct lov_lock_link *scan;
+       int                result;
+
+       LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+       ENTRY;
+
+       sub    = cl2lovsub_lock(slice);
+       result = 0;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               parent = scan->lll_super->lls_cl.cls_lock;
+               result = cl_lock_closure_build(env, parent, closure);
+               if (result != 0)
+                       break;
+       }
+       RETURN(result);
+}
+
+/**
+ * A helper function for lovsub_lock_delete() that deals with a given parent
+ * top-lock.
+ */
+static int lovsub_lock_delete_one(const struct lu_env *env,
+                                 struct cl_lock *child, struct lov_lock *lov)
+{
+       struct cl_lock *parent;
+       int          result;
+       ENTRY;
+
+       parent = lov->lls_cl.cls_lock;
+       if (parent->cll_error)
+               RETURN(0);
+
+       result = 0;
+       switch (parent->cll_state) {
+       case CLS_ENQUEUED:
+               /* See LU-1355 for the case where a glimpse lock is
+                * interrupted by a signal */
+               LASSERT(parent->cll_flags & CLF_CANCELLED);
+               break;
+       case CLS_QUEUING:
+       case CLS_FREEING:
+               cl_lock_signal(env, parent);
+               break;
+       case CLS_INTRANSIT:
+               /*
+                * Here lies a problem: a sub-lock is canceled while top-lock
+                * is being unlocked. Top-lock cannot be moved into CLS_NEW
+                * state, because unlocking has to succeed eventually by
+                * placing lock into CLS_CACHED (or failing it), see
+                * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED
+                * state, because lov maintains an invariant that all
+                * sub-locks exist in CLS_CACHED (this allows cached top-lock
+                * to be reused immediately). Nor can we wait for top-lock
+                * state to change, because this can be synchronous to the
+                * current thread.
+                *
+                * We know for sure that lov_lock_unuse() will be called at
+                * least one more time to finish un-using, so leave a mark on
+                * the top-lock, that will be seen by the next call to
+                * lov_lock_unuse().
+                */
+               if (cl_lock_is_intransit(parent))
+                       lov->lls_cancel_race = 1;
+               break;
+       case CLS_CACHED:
+               /*
+                * If a sub-lock is canceled, move its top-lock into CLS_NEW
+                * state to preserve the invariant that a top-lock in
+                * CLS_CACHED is immediately ready for re-use (i.e., has all
+                * its sub-locks), and so that the next attempt to re-use the
+                * top-lock enqueues the missing sub-lock.
+                */
+               cl_lock_state_set(env, parent, CLS_NEW);
+               /* fall through */
+       case CLS_NEW:
+               /*
+                * If the last sub-lock is canceled, destroy the top-lock
+                * (which is now `empty') proactively.
+                */
+               if (lov->lls_nr_filled == 0) {
+                       /* ... but unfortunately, this cannot be done easily,
+                        * as cancellation of a top-lock might acquire mutexes
+                        * of its other sub-locks, violating lock ordering,
+                        * see cl_lock_{cancel,delete}() preconditions.
+                        *
+                        * To work around this, the mutex of this sub-lock is
+                        * released, the top-lock is destroyed, and the
+                        * sub-lock mutex is acquired again. The list of
+                        * parents has to be re-scanned from the beginning
+                        * after this.
+                        *
+                        * Only do this if no mutexes other than on @child and
+                        * @parent are held by the current thread.
+                        *
+                        * TODO: The locking model here is too complex,
+                        * because the lock may be canceled and deleted
+                        * voluntarily:
+                        *    cl_lock_request
+                        *      -> osc_lock_enqueue_wait
+                        *      -> osc_lock_cancel_wait
+                        *        -> cl_lock_delete
+                        *          -> lovsub_lock_delete
+                        *            -> cl_lock_cancel/delete
+                        *              -> ...
+                        *
+                        * The better choice is to spawn a kernel thread for
+                        * this purpose. -jay
+                        */
+                       if (cl_lock_nr_mutexed(env) == 2) {
+                               cl_lock_mutex_put(env, child);
+                               cl_lock_cancel(env, parent);
+                               cl_lock_delete(env, parent);
+                               result = 1;
+                       }
+               }
+               break;
+       case CLS_HELD:
+               CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n");
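+               /* fall through: deleting a held lock is a bug */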
+       default:
+               CERROR("Impossible state: %d\n", parent->cll_state);
+               LBUG();
+               break;
+       }
+
+       RETURN(result);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g., as a result of the ldlm lock LRU policy).
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+                              const struct cl_lock_slice *slice)
+{
+       struct cl_lock     *child = slice->cls_lock;
+       struct lovsub_lock *sub   = cl2lovsub_lock(slice);
+       int restart;
+
+       LASSERT(cl_lock_is_mutexed(child));
+
+       ENTRY;
+       /*
+        * Destruction of a sub-lock might take multiple iterations, because
+        * when the last sub-lock of a given top-lock is deleted, the top-lock
+        * is canceled proactively, and this requires releasing the sub-lock
+        * mutex. Once the sub-lock mutex has been released, the list of its
+        * parents has to be re-scanned from the beginning.
+        */
+       do {
+               struct lov_lock      *lov;
+               struct lov_lock_link *scan;
+               struct lov_lock_link *temp;
+               struct lov_lock_sub  *subdata;
+
+               restart = 0;
+               list_for_each_entry_safe(scan, temp,
+                                            &sub->lss_parents, lll_list) {
+                       lov     = scan->lll_super;
+                       subdata = &lov->lls_sub[scan->lll_idx];
+                       lovsub_parent_lock(env, lov);
+                       subdata->sub_got = subdata->sub_descr;
+                       lov_lock_unlink(env, scan, sub);
+                       restart = lovsub_lock_delete_one(env, child, lov);
+                       lovsub_parent_unlock(env, lov);
+
+                       if (restart) {
+                               cl_lock_mutex_get(env, child);
+                               break;
+                       }
+               }
+       } while (restart);
+       EXIT;
+}
+
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+       struct lov_lock      *lov;
+       struct lov_lock_link *scan;
+
+       list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+               lov = scan->lll_super;
+               (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov);
+               if (lov != NULL)
+                       cl_lock_descr_print(env, cookie, p,
+                                           &lov->lls_cl.cls_lock->cll_descr);
+               (*p)(env, cookie, "] ");
+       }
+       return 0;
+}
+
+static const struct cl_lock_operations lovsub_lock_ops = {
+       .clo_fini    = lovsub_lock_fini,
+       .clo_state   = lovsub_lock_state,
+       .clo_delete  = lovsub_lock_delete,
+       .clo_modify  = lovsub_lock_modify,
+       .clo_closure = lovsub_lock_closure,
+       .clo_weigh   = lovsub_lock_weigh,
+       .clo_print   = lovsub_lock_print
+};
+
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+                    struct cl_lock *lock, const struct cl_io *io)
+{
+       struct lovsub_lock *lsk;
+       int result;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, __GFP_IO);
+       if (lsk != NULL) {
+               INIT_LIST_HEAD(&lsk->lss_parents);
+               cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c
new file mode 100644 (file)
index 0000000..1b83d90
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+                      const struct lu_object_conf *conf)
+{
+       struct lovsub_device  *dev   = lu2lovsub_dev(obj->lo_dev);
+       struct lu_object      *below;
+       struct lu_device      *under;
+       int result;
+
+       ENTRY;
+       under = &dev->acid_next->cd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+       if (below != NULL) {
+               lu_object_add(obj, below);
+               cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page));
+               result = 0;
+       } else
+               result = -ENOMEM;
+       RETURN(result);
+}
+
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct lovsub_object *los = lu2lovsub(obj);
+       struct lov_object    *lov = los->lso_super;
+       ENTRY;
+
+       /* We can't assume lov was assigned here, because of the shadow
+        * object handling in lu_object_find.
+        */
+       if (lov) {
+               LASSERT(lov->lo_type == LLT_RAID0);
+               LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los);
+               spin_lock(&lov->u.raid0.lo_sub_lock);
+               lov->u.raid0.lo_sub[los->lso_index] = NULL;
+               spin_unlock(&lov->u.raid0.lo_sub_lock);
+       }
+
+       lu_object_fini(obj);
+       lu_object_header_fini(&los->lso_header.coh_lu);
+       OBD_SLAB_FREE_PTR(los, lovsub_object_kmem);
+       EXIT;
+}
+
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+                              lu_printer_t p, const struct lu_object *obj)
+{
+       struct lovsub_object *los = lu2lovsub(obj);
+
+       return (*p)(env, cookie, "[%d]", los->lso_index);
+}
+
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+                          const struct cl_attr *attr, unsigned valid)
+{
+       struct lov_object *lov = cl2lovsub(obj)->lso_super;
+
+       ENTRY;
+       lov_r0(lov)->lo_attr_valid = 0;
+       RETURN(0);
+}
+
+static int lovsub_object_glimpse(const struct lu_env *env,
+                                const struct cl_object *obj,
+                                struct ost_lvb *lvb)
+{
+       struct lovsub_object *los = cl2lovsub(obj);
+
+       ENTRY;
+       RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb));
+}
+
+static const struct cl_object_operations lovsub_ops = {
+       .coo_page_init = lovsub_page_init,
+       .coo_lock_init = lovsub_lock_init,
+       .coo_attr_set  = lovsub_attr_set,
+       .coo_glimpse   = lovsub_object_glimpse
+};
+
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+       .loo_object_init      = lovsub_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = lovsub_object_free,
+       .loo_object_print     = lovsub_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                     const struct lu_object_header *unused,
+                                     struct lu_device *dev)
+{
+       struct lovsub_object *los;
+       struct lu_object     *obj;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, __GFP_IO);
+       if (los != NULL) {
+               struct cl_object_header *hdr;
+
+               obj = lovsub2lu(los);
+               hdr = &los->lso_header;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+               los->lso_cl.co_ops = &lovsub_ops;
+               obj->lo_ops = &lovsub_lu_obj_ops;
+       } else
+               obj = NULL;
+       RETURN(obj);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c
new file mode 100644 (file)
index 0000000..bc9e683
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+
+static void lovsub_page_fini(const struct lu_env *env,
+                            struct cl_page_slice *slice)
+{
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+       .cpo_fini   = lovsub_page_fini
+};
+
+int lovsub_page_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *unused)
+{
+       struct lovsub_page *lsb = cl_object_page_slice(obj, page);
+       ENTRY;
+
+       cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops);
+       RETURN(0);
+}
+
+/** @} lov */
diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c
new file mode 100644 (file)
index 0000000..5b2c0d8
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <lprocfs_status.h>
+#include <obd_class.h>
+#include <linux/seq_file.h>
+#include "lov_internal.h"
+
+#ifdef LPROCFS
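+/*
+ * lprocfs tunables for the LOV layer: the seq_file handlers below back the
+ * per-device files (stripesize, stripeoffset, stripecount, ...) wired up in
+ * lprocfs_lov_obd_vars at the end of this file.
+ */
+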
+static int lov_stripesize_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       return seq_printf(m, LPU64"\n", desc->ld_default_stripe_size);
+}
+
+static ssize_t lov_stripesize_seq_write(struct file *file, const char *buffer,
+                                   size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct lov_desc *desc;
+       __u64 val;
+       int rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_stripe_size(&val);
+       desc->ld_default_stripe_size = val;
+       return count;
+}
+LPROC_SEQ_FOPS(lov_stripesize);
+
+static int lov_stripeoffset_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       return seq_printf(m, LPU64"\n", desc->ld_default_stripe_offset);
+}
+
+static ssize_t lov_stripeoffset_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct lov_desc *desc;
+       __u64 val;
+       int rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       desc->ld_default_stripe_offset = val;
+       return count;
+}
+LPROC_SEQ_FOPS(lov_stripeoffset);
+
+static int lov_stripetype_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       return seq_printf(m, "%u\n", desc->ld_pattern);
+}
+
+static ssize_t lov_stripetype_seq_write(struct file *file, const char *buffer,
+                                   size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct lov_desc *desc;
+       int val, rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_pattern(&val);
+       desc->ld_pattern = val;
+       return count;
+}
+LPROC_SEQ_FOPS(lov_stripetype);
+
+static int lov_stripecount_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
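+       /* the +1/-1 round trip through __s16 is presumably here so that the
+        * special "stripe across all targets" value, (__u16)-1, prints as -1
+        * rather than 65535 */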
+       return seq_printf(m, "%d\n",
+                       (__s16)(desc->ld_default_stripe_count + 1) - 1);
+}
+
+static ssize_t lov_stripecount_seq_write(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct lov_desc *desc;
+       int val, rc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       lov_fix_desc_stripe_count(&val);
+       desc->ld_default_stripe_count = val;
+       return count;
+}
+LPROC_SEQ_FOPS(lov_stripecount);
+
+static int lov_numobd_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       return seq_printf(m, "%u\n", desc->ld_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lov_numobd);
+
+static int lov_activeobd_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_desc *desc;
+
+       LASSERT(dev != NULL);
+       desc = &dev->u.lov.desc;
+       return seq_printf(m, "%u\n", desc->ld_active_tgt_count);
+}
+LPROC_SEQ_FOPS_RO(lov_activeobd);
+
+static int lov_desc_uuid_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = (struct obd_device *)m->private;
+       struct lov_obd *lov;
+
+       LASSERT(dev != NULL);
+       lov = &dev->u.lov;
+       return seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid);
+}
+LPROC_SEQ_FOPS_RO(lov_desc_uuid);
+
+static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_device *dev = p->private;
+       struct lov_obd *lov = &dev->u.lov;
+
+       while (*pos < lov->desc.ld_tgt_count) {
+               if (lov->lov_tgts[*pos])
+                       return lov->lov_tgts[*pos];
+               ++*pos;
+       }
+       return NULL;
+}
+
+static void lov_tgt_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_device *dev = p->private;
+       struct lov_obd *lov = &dev->u.lov;
+
+       while (++*pos < lov->desc.ld_tgt_count) {
+               if (lov->lov_tgts[*pos])
+                       return lov->lov_tgts[*pos];
+       }
+       return NULL;
+}
+
+static int lov_tgt_seq_show(struct seq_file *p, void *v)
+{
+       struct lov_tgt_desc *tgt = v;
+       return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index,
+                         obd_uuid2str(&tgt->ltd_uuid),
+                         tgt->ltd_active ? "" : "IN");
+}
+
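+/*
+ * seq_file iteration over the LOV target array: ->start() positions at the
+ * first live target at or after *pos, ->next() skips holes left by removed
+ * targets, and ->show() prints one target per line ("%sACTIVE" above turns
+ * into "INACTIVE" for inactive targets).
+ */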
+struct seq_operations lov_tgt_sops = {
+       .start = lov_tgt_seq_start,
+       .stop = lov_tgt_seq_stop,
+       .next = lov_tgt_seq_next,
+       .show = lov_tgt_seq_show,
+};
+
+static int lov_target_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int rc;
+
+       rc = seq_open(file, &lov_tgt_sops);
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+       return 0;
+}
+
+LPROC_SEQ_FOPS_RO_TYPE(lov, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(lov, kbytesavail);
+
+struct lprocfs_vars lprocfs_lov_obd_vars[] = {
+       { "uuid",         &lov_uuid_fops,         0, 0 },
+       { "stripesize",   &lov_stripesize_fops,   0 },
+       { "stripeoffset", &lov_stripeoffset_fops, 0 },
+       { "stripecount",  &lov_stripecount_fops,  0 },
+       { "stripetype",   &lov_stripetype_fops,   0 },
+       { "numobd",       &lov_numobd_fops,       0, 0 },
+       { "activeobd",    &lov_activeobd_fops,    0, 0 },
+       { "filestotal",   &lov_filestotal_fops,   0, 0 },
+       { "filesfree",    &lov_filesfree_fops,    0, 0 },
+       /*{ "filegroups", lprocfs_rd_filegroups,  0, 0 },*/
+       { "blocksize",    &lov_blksize_fops,      0, 0 },
+       { "kbytestotal",  &lov_kbytestotal_fops,  0, 0 },
+       { "kbytesfree",   &lov_kbytesfree_fops,   0, 0 },
+       { "kbytesavail",  &lov_kbytesavail_fops,  0, 0 },
+       { "desc_uuid",    &lov_desc_uuid_fops,    0, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(lov, numrefs);
+
+static struct lprocfs_vars lprocfs_lov_module_vars[] = {
+       { "num_refs",     &lov_numrefs_fops,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_lov_module_vars;
+       lvars->obd_vars    = lprocfs_lov_obd_vars;
+}
+
+struct file_operations lov_proc_target_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lov_target_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/Makefile b/drivers/staging/lustre/lustre/lvfs/Makefile
new file mode 100644 (file)
index 0000000..f50b1c5
--- /dev/null
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LUSTRE_FS) += lvfs.o
+
+lvfs-y := lvfs_linux.o fsfilt.o lvfs_lib.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt.c b/drivers/staging/lustre/lustre/lvfs/fsfilt.c
new file mode 100644 (file)
index 0000000..064445c
--- /dev/null
@@ -0,0 +1,138 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+
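+/*
+ * Registry of fsfilt backends.  A backend (e.g. fsfilt_ext3 in this tree)
+ * registers its fsfilt_operations here on module load; fsfilt_get_ops()
+ * looks a backend up by name, loading the "fsfilt_<type>" module on demand.
+ */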
+LIST_HEAD(fsfilt_types);
+
+static struct fsfilt_operations *fsfilt_search_type(const char *type)
+{
+       struct fsfilt_operations *found;
+       struct list_head *p;
+
+       list_for_each(p, &fsfilt_types) {
+               found = list_entry(p, struct fsfilt_operations, fs_list);
+               if (!strcmp(found->fs_type, type)) {
+                       return found;
+               }
+       }
+       return NULL;
+}
+
+int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
+{
+       struct fsfilt_operations *found;
+
+       /* lock fsfilt_types list */
+       if ((found = fsfilt_search_type(fs_ops->fs_type))) {
+               if (found != fs_ops) {
+                       CERROR("different operations for type %s\n",
+                              fs_ops->fs_type);
+                       /* unlock fsfilt_types list */
+                       RETURN(-EEXIST);
+               }
+       } else {
+               try_module_get(THIS_MODULE);
+               list_add(&fs_ops->fs_list, &fsfilt_types);
+       }
+
+       /* unlock fsfilt_types list */
+       return 0;
+}
+EXPORT_SYMBOL(fsfilt_register_ops);
+
+void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
+{
+       struct list_head *p;
+
+       /* lock fsfilt_types list */
+       list_for_each(p, &fsfilt_types) {
+               struct fsfilt_operations *found;
+
+               found = list_entry(p, typeof(*found), fs_list);
+               if (found == fs_ops) {
+                       list_del(p);
+                       module_put(THIS_MODULE);
+                       break;
+               }
+       }
+       /* unlock fsfilt_types list */
+}
+EXPORT_SYMBOL(fsfilt_unregister_ops);
+
+struct fsfilt_operations *fsfilt_get_ops(const char *type)
+{
+       struct fsfilt_operations *fs_ops;
+
+       /* lock fsfilt_types list */
+       if (!(fs_ops = fsfilt_search_type(type))) {
+               char name[32];
+               int rc;
+
+               snprintf(name, sizeof(name) - 1, "fsfilt_%s", type);
+               name[sizeof(name) - 1] = '\0';
+
+               if (!(rc = request_module("%s", name))) {
+                       fs_ops = fsfilt_search_type(type);
+                       CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+                       if (!fs_ops)
+                               rc = -ENOENT;
+               }
+
+               if (rc) {
+                       CERROR("Can't find %s interface\n", name);
+                       RETURN(ERR_PTR(rc < 0 ? rc : -rc));
+                       /* unlock fsfilt_types list */
+               }
+       }
+       try_module_get(fs_ops->fs_owner);
+       /* unlock fsfilt_types list */
+
+       return fs_ops;
+}
+EXPORT_SYMBOL(fsfilt_get_ops);
+
+void fsfilt_put_ops(struct fsfilt_operations *fs_ops)
+{
+       module_put(fs_ops->fs_owner);
+}
+EXPORT_SYMBOL(fsfilt_put_ops);
diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c
new file mode 100644 (file)
index 0000000..c1e99b3
--- /dev/null
@@ -0,0 +1,761 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/fsfilt_ext3.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <ldiskfs/ldiskfs_config.h>
+#include <ext4/ext4.h>
+#include <ext4/ext4_jbd2.h>
+#include <linux/version.h>
+#include <linux/bitops.h>
+#include <linux/quota.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/lustre_compat25.h>
+#include <linux/lprocfs_status.h>
+
+#include <ext4/ext4_extents.h>
+
+#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */
+#define ext3_ext_pblock(ex) ext_pblock((ex))
+#endif
+
+/* for kernels 2.6.18 and later */
+#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
+
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+              ext3_ext_insert_extent(handle, inode, path, newext, flag)
+
+#define ext3_mb_discard_inode_preallocations(inode) \
+                ext3_discard_preallocations(inode)
+
+#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
+#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
+
+static struct kmem_cache *fcb_cache;
+
+struct fsfilt_cb_data {
+       struct ext4_journal_cb_entry cb_jcb; /* private data - MUST BE FIRST */
+       fsfilt_cb_t cb_func;        /* MDS/OBD completion function */
+       struct obd_device *cb_obd;      /* MDS/OBD completion device */
+       __u64 cb_last_rcvd;          /* MDS/OST last committed operation */
+       void *cb_data;            /* MDS/OST completion function data */
+};
+
+static char *fsfilt_ext3_get_label(struct super_block *sb)
+{
+       return EXT3_SB(sb)->s_es->s_volume_name;
+}
+
+/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
+# include <ext4/truncate.h>
+
+/*
+ * We don't currently need any additional blocks for rmdir and
+ * unlink transactions because we are storing the OST oa_id inside
+ * the inode (which we will be changing anyway as part of this
+ * transaction).
+ */
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                              int logs)
+{
+       /* For updates to the last received file */
+       int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
+       journal_t *journal;
+       void *handle;
+
+       if (current->journal_info) {
+               CDEBUG(D_INODE, "increasing refcount on %p\n",
+                      current->journal_info);
+               goto journal_start;
+       }
+
+       switch(op) {
+       case FSFILT_OP_UNLINK:
+               /* delete one file + create/update logs for each stripe */
+               nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
+               nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                           FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
+               break;
+       case FSFILT_OP_CANCEL_UNLINK:
+               LASSERT(logs == 1);
+
+               /* blocks for log header bitmap update OR
+                * blocks for catalog header bitmap update + unlink of logs +
+                * blocks to delete the inode (including blocks for truncation). */
+               nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+                         EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
+                         ext4_blocks_for_truncate(inode) + 3;
+               break;
+       default:
+               CERROR("unknown transaction start op %d\n", op);
+               LBUG();
+       }
+
+       LASSERT(current->journal_info == desc_private);
+       journal = EXT3_SB(inode->i_sb)->s_journal;
+       if (nblocks > journal->j_max_transaction_buffers) {
+               CWARN("too many credits %d for op %ux%u using %d instead\n",
+                      nblocks, op, logs, journal->j_max_transaction_buffers);
+               nblocks = journal->j_max_transaction_buffers;
+       }
+
+ journal_start:
+       LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
+       handle = ext3_journal_start(inode, nblocks);
+
+       if (!IS_ERR(handle))
+               LASSERT(current->journal_info == handle);
+       else
+               CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+                      op, nblocks, PTR_ERR(handle));
+       return handle;
+}
+
+static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
+{
+       int rc;
+       handle_t *handle = h;
+
+       LASSERT(current->journal_info == handle);
+       if (force_sync)
+               handle->h_sync = 1; /* recovery likes this */
+
+       rc = ext3_journal_stop(handle);
+
+       return rc;
+}
+
+#ifndef EXT3_EXTENTS_FL
+#define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
+#endif
+
+#ifndef EXT_ASSERT
+#define EXT_ASSERT(cond)  BUG_ON(!(cond))
+#endif
+
+#define EXT_GENERATION(inode)     (EXT4_I(inode)->i_ext_generation)
+#define ext3_ext_base             inode
+#define ext3_ext_base2inode(inode)      (inode)
+#define EXT_DEPTH(inode)               ext_depth(inode)
+#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
+                       ext3_ext_walk_space(inode, block, num, cb, cbdata);
+
+struct bpointers {
+       unsigned long *blocks;
+       unsigned long start;
+       int num;
+       int init_num;
+       int create;
+};
+
+static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
+                              unsigned long block, int *aflags)
+{
+       struct ext3_inode_info *ei = EXT3_I(inode);
+       unsigned long bg_start;
+       unsigned long colour;
+       int depth;
+
+       if (path) {
+               struct ext3_extent *ex;
+               depth = path->p_depth;
+
+               /* try to predict block placement */
+               if ((ex = path[depth].p_ext))
+                       return ext4_ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
+
+               /* it looks like the index is empty;
+                * try to find a goal starting from the index block itself */
+               if (path[depth].p_bh)
+                       return path[depth].p_bh->b_blocknr;
+       }
+
+       /* OK. use inode's group */
+       bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+               le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
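+       /* colour spreads allocations of concurrent processes across the
+        * block group by PID */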
+       colour = (current->pid % 16) *
+               (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+       return bg_start + colour + block;
+}
+
+#define ll_unmap_underlying_metadata(sb, blocknr) \
+       unmap_underlying_metadata((sb)->s_bdev, blocknr)
+
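+/*
+ * Two variants of new_blocks() follow: kernels without
+ * EXT3_MB_HINT_GROUP_ALLOC hand mballoc a plain goal block, while newer
+ * kernels describe the allocation with a struct ext3_allocation_request.
+ */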
+#ifndef EXT3_MB_HINT_GROUP_ALLOC
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+                               struct ext3_ext_path *path, unsigned long block,
+                               unsigned long *count, int *err)
+{
+       unsigned long pblock, goal;
+       int aflags = 0;
+       struct inode *inode = ext3_ext_base2inode(base);
+
+       goal = ext3_ext_find_goal(inode, path, block, &aflags);
+       aflags |= 2; /* blocks have already been reserved */
+       pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
+       return pblock;
+
+}
+#else
+static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
+                               struct ext3_ext_path *path, unsigned long block,
+                               unsigned long *count, int *err)
+{
+       struct inode *inode = ext3_ext_base2inode(base);
+       struct ext3_allocation_request ar;
+       unsigned long pblock;
+       int aflags;
+
+       /* find neighbouring allocated blocks */
+       ar.lleft = block;
+       *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
+       if (*err)
+               return 0;
+       ar.lright = block;
+       *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
+       if (*err)
+               return 0;
+
+       /* allocate new block */
+       ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
+       ar.inode = inode;
+       ar.logical = block;
+       ar.len = *count;
+       ar.flags = EXT3_MB_HINT_DATA;
+       pblock = ext3_mb_new_blocks(handle, &ar, err);
+       *count = ar.len;
+       return pblock;
+}
+#endif
+
+static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
+                                 struct ext3_ext_path *path,
+                                 struct ext3_ext_cache *cex,
+#ifdef HAVE_EXT_PREPARE_CB_EXTENT
+                                  struct ext3_extent *ex,
+#endif
+                                 void *cbdata)
+{
+       struct bpointers *bp = cbdata;
+       struct inode *inode = ext3_ext_base2inode(base);
+       struct ext3_extent nex;
+       unsigned long pblock;
+       unsigned long tgen;
+       int err, i;
+       unsigned long count;
+       handle_t *handle;
+
+#ifdef EXT3_EXT_CACHE_EXTENT
+       if (cex->ec_type == EXT3_EXT_CACHE_EXTENT)
+#else
+       if ((cex->ec_len != 0) && (cex->ec_start != 0))
+#endif
+                                                  {
+               err = EXT_CONTINUE;
+               goto map;
+       }
+
+       if (bp->create == 0) {
+               i = 0;
+               if (cex->ec_block < bp->start)
+                       i = bp->start - cex->ec_block;
+               if (i >= cex->ec_len)
+                       CERROR("nothing to do?! i = %d, e_num = %u\n",
+                                       i, cex->ec_len);
+               for (; i < cex->ec_len && bp->num; i++) {
+                       *(bp->blocks) = 0;
+                       bp->blocks++;
+                       bp->num--;
+                       bp->start++;
+               }
+
+               return EXT_CONTINUE;
+       }
+
+       tgen = EXT_GENERATION(base);
+       count = ext3_ext_calc_credits_for_insert(base, path);
+
+       handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
+       if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
+       }
+
+       if (tgen != EXT_GENERATION(base)) {
+               /* the tree has changed, so the path may be invalid at the moment */
+               ext3_journal_stop(handle);
+               return EXT_REPEAT;
+       }
+
+       /* In the 2.6.32 kernel, ext4_ext_walk_space()'s callback function is
+        * not protected by i_data_sem as a whole, so we patch it to store the
+        * generation in the path and verify here that the tree hasn't changed */
+       down_write((&EXT4_I(inode)->i_data_sem));
+
+       /* validate the extent, make sure the extent tree has not changed */
+       if (EXT_GENERATION(base) != path[0].p_generation) {
+               /* cex is invalid, try again */
+               up_write(&EXT4_I(inode)->i_data_sem);
+               ext3_journal_stop(handle);
+               return EXT_REPEAT;
+       }
+
+       count = cex->ec_len;
+       pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
+       if (!pblock)
+               goto out;
+       EXT_ASSERT(count <= cex->ec_len);
+
+       /* insert new extent */
+       nex.ee_block = cpu_to_le32(cex->ec_block);
+       ext3_ext_store_pblock(&nex, pblock);
+       nex.ee_len = cpu_to_le16(count);
+       err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
+       if (err) {
+               /* free data blocks we just allocated */
+               /* not a good idea to call discard here directly,
+                * but otherwise we'd need to call it on every free() */
+#ifdef EXT3_MB_HINT_GROUP_ALLOC
+               ext3_mb_discard_inode_preallocations(inode);
+#endif
+#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
+               ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex),
+                                cpu_to_le16(nex.ee_len), 0);
+#else
+               ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex),
+                                cpu_to_le16(nex.ee_len), 0);
+#endif
+               goto out;
+       }
+
+       /*
+        * By storing the length of the extent we just inserted,
+        * we ask ext3_ext_walk_space() to continue
+        * scanning after that block
+        */
+       cex->ec_len = le16_to_cpu(nex.ee_len);
+       cex->ec_start = ext4_ext_pblock(&nex);
+       BUG_ON(le16_to_cpu(nex.ee_len) == 0);
+       BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
+
+out:
+       up_write((&EXT4_I(inode)->i_data_sem));
+       ext3_journal_stop(handle);
+map:
+       if (err >= 0) {
+               /* map blocks */
+               if (bp->num == 0) {
+                       CERROR("hmm. why do we find this extent?\n");
+                       CERROR("initial space: %lu:%u\n",
+                               bp->start, bp->init_num);
+#ifdef EXT3_EXT_CACHE_EXTENT
+                       CERROR("current extent: %u/%u/%llu %d\n",
+                               cex->ec_block, cex->ec_len,
+                               (unsigned long long)cex->ec_start,
+                               cex->ec_type);
+#else
+                       CERROR("current extent: %u/%u/%llu\n",
+                               cex->ec_block, cex->ec_len,
+                               (unsigned long long)cex->ec_start);
+#endif
+               }
+               i = 0;
+               if (cex->ec_block < bp->start)
+                       i = bp->start - cex->ec_block;
+               if (i >= cex->ec_len)
+                       CERROR("nothing to do?! i = %d, e_num = %u\n",
+                                       i, cex->ec_len);
+               for (; i < cex->ec_len && bp->num; i++) {
+                       *(bp->blocks) = cex->ec_start + i;
+#ifdef EXT3_EXT_CACHE_EXTENT
+                       if (cex->ec_type != EXT3_EXT_CACHE_EXTENT)
+#else
+                       if ((cex->ec_len == 0) || (cex->ec_start == 0))
+#endif
+                       {
+                               /* unmap any possible underlying metadata from
+                                * the block device mapping.  bug 6998. */
+                               ll_unmap_underlying_metadata(inode->i_sb,
+                                                            *(bp->blocks));
+                       }
+                       bp->blocks++;
+                       bp->num--;
+                       bp->start++;
+               }
+       }
+       return err;
+}
+
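+/*
+ * Map 'num' logical blocks of 'inode' starting at 'block' into 'blocks',
+ * allocating them when 'create' is set, by walking the extent tree with
+ * ext3_ext_new_extent_cb() above.
+ */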
+int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
+                      unsigned long num, unsigned long *blocks,
+                      int create)
+{
+       struct ext3_ext_base *base = inode;
+       struct bpointers bp;
+       int err;
+
+       CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
+              block, block + num - 1, (unsigned) inode->i_ino);
+
+       bp.blocks = blocks;
+       bp.start = block;
+       bp.init_num = bp.num = num;
+       bp.create = create;
+
+       err = fsfilt_ext3_ext_walk_space(base, block, num,
+                                        ext3_ext_new_extent_cb, &bp);
+       ext3_ext_invalidate_cache(base);
+
+       return err;
+}
+
+int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
+                                   int pages, unsigned long *blocks,
+                                   int create)
+{
+       int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+       int rc = 0, i = 0;
+       struct page *fp = NULL;
+       int clen = 0;
+
+       CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
+               inode->i_ino, pages, (*page)->index);
+
+       /* the pages are already sorted, so we just have to find
+        * runs of contiguous pages and process them as extents */
+       while (i < pages) {
+               if (fp == NULL) {
+                       /* start new extent */
+                       fp = *page++;
+                       clen = 1;
+                       i++;
+                       continue;
+               } else if (fp->index + clen == (*page)->index) {
+                       /* continue the extent */
+                       page++;
+                       clen++;
+                       i++;
+                       continue;
+               }
+
+               /* process found extent */
+               rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                       clen * blocks_per_page, blocks,
+                                       create);
+               if (rc)
+                       GOTO(cleanup, rc);
+
+               /* look for next extent */
+               fp = NULL;
+               blocks += blocks_per_page * clen;
+       }
+
+       if (fp)
+               rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                       clen * blocks_per_page, blocks,
+                                       create);
+cleanup:
+       return rc;
+}
+
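+/* block-mapped (non-extent) case: map each page's blocks one page at a time */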
+int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
+                                  int pages, unsigned long *blocks,
+                                  int create)
+{
+       int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+       unsigned long *b;
+       int rc = 0, i;
+
+       for (i = 0, b = blocks; i < pages; i++, page++) {
+               rc = ext3_map_inode_page(inode, *page, b, create);
+               if (rc) {
+                       CERROR("ino %lu, blk %lu create %d: rc %d\n",
+                              inode->i_ino, *b, create, rc);
+                       break;
+               }
+
+               b += blocks_per_page;
+       }
+       return rc;
+}
+
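+/*
+ * Dispatch: extent-mapped inodes (EXT3_EXTENTS_FL) go through the extent
+ * mapper above; block-mapped inodes fall back to per-page mapping,
+ * optionally serialized by the caller-supplied mutex.
+ */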
+int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
+                               int pages, unsigned long *blocks,
+                               int create, struct mutex *optional_mutex)
+{
+       int rc;
+
+       if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
+               rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
+                                                    blocks, create);
+               return rc;
+       }
+       if (optional_mutex != NULL)
+               mutex_lock(optional_mutex);
+       rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create);
+       if (optional_mutex != NULL)
+               mutex_unlock(optional_mutex);
+
+       return rc;
+}
+
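+/*
+ * Read up to 'size' bytes at *offs from 'inode' via ext3_bread(),
+ * clamping the request at i_size; advances *offs and returns the
+ * originally requested size on success.
+ */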
+int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
+{
+       unsigned long block;
+       struct buffer_head *bh;
+       int err, blocksize, csize, boffs, osize = size;
+
+       /* prevent reading past EOF */
+       spin_lock(&inode->i_lock);
+       if (i_size_read(inode) < *offs + size) {
+               size = i_size_read(inode) - *offs;
+               spin_unlock(&inode->i_lock);
+               if (size < 0) {
+                       CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
+                              i_size_read(inode), *offs);
+                       return -EBADR;
+               } else if (size == 0) {
+                       return 0;
+               }
+       } else {
+               spin_unlock(&inode->i_lock);
+       }
+
+       blocksize = 1 << inode->i_blkbits;
+
+       while (size > 0) {
+               block = *offs >> inode->i_blkbits;
+               boffs = *offs & (blocksize - 1);
+               csize = min(blocksize - boffs, size);
+               bh = ext3_bread(NULL, inode, block, 0, &err);
+               if (!bh) {
+                       CERROR("can't read block: %d\n", err);
+                       return err;
+               }
+
+               memcpy(buf, bh->b_data + boffs, csize);
+               brelse(bh);
+
+               *offs += csize;
+               buf += csize;
+               size -= csize;
+       }
+       return osize;
+}
+EXPORT_SYMBOL(fsfilt_ext3_read);
+
+static int fsfilt_ext3_read_record(struct file * file, void *buf,
+                                  int size, loff_t *offs)
+{
+       int rc;
+       rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
+       if (rc > 0)
+               rc = 0;
+       return rc;
+}
+
+int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
+                               loff_t *offs, handle_t *handle)
+{
+       struct buffer_head *bh = NULL;
+       loff_t old_size = i_size_read(inode), offset = *offs;
+       loff_t new_size = i_size_read(inode);
+       unsigned long block;
+       int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;
+
+       while (bufsize > 0) {
+               if (bh != NULL)
+                       brelse(bh);
+
+               block = offset >> inode->i_blkbits;
+               boffs = offset & (blocksize - 1);
+               size = min(blocksize - boffs, bufsize);
+               bh = ext3_bread(handle, inode, block, 1, &err);
+               if (!bh) {
+                       CERROR("can't read/create block: %d\n", err);
+                       break;
+               }
+
+               err = ext3_journal_get_write_access(handle, bh);
+               if (err) {
+                       CERROR("journal_get_write_access() returned error %d\n",
+                              err);
+                       break;
+               }
+               LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
+               memcpy(bh->b_data + boffs, buf, size);
+               err = ext3_journal_dirty_metadata(handle, bh);
+               if (err) {
+                       CERROR("journal_dirty_metadata() returned error %d\n",
+                              err);
+                       break;
+               }
+               if (offset + size > new_size)
+                       new_size = offset + size;
+               offset += size;
+               bufsize -= size;
+               buf += size;
+       }
+       if (bh)
+               brelse(bh);
+
+       /* correct in-core and on-disk sizes */
+       if (new_size > i_size_read(inode)) {
+               spin_lock(&inode->i_lock);
+               if (new_size > i_size_read(inode))
+                       i_size_write(inode, new_size);
+               if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
+                       EXT3_I(inode)->i_disksize = i_size_read(inode);
+               if (i_size_read(inode) > old_size) {
+                       spin_unlock(&inode->i_lock);
+                       mark_inode_dirty(inode);
+               } else {
+                       spin_unlock(&inode->i_lock);
+               }
+       }
+
+       if (err == 0)
+               *offs = offset;
+       return err;
+}
+EXPORT_SYMBOL(fsfilt_ext3_write_handle);
+
+static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
+                                   loff_t *offs, int force_sync)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       handle_t *handle;
+       int err, block_count = 0, blocksize;
+
+       /* Determine how many transaction credits are needed */
+       blocksize = 1 << inode->i_blkbits;
+       block_count = (*offs & (blocksize - 1)) + bufsize;
+       block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
+
+       handle = ext3_journal_start(inode,
+                       block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
+       if (IS_ERR(handle)) {
+               CERROR("can't start transaction for %d blocks (%d bytes)\n",
+                      block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
+                      bufsize);
+               return PTR_ERR(handle);
+       }
+
+       err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
+
+       if (!err && force_sync)
+               handle->h_sync = 1; /* recovery likes this */
+
+       ext3_journal_stop(handle);
+
+       return err;
+}
+
+static int fsfilt_ext3_setup(struct super_block *sb)
+{
+       if (!EXT3_HAS_COMPAT_FEATURE(sb,
+                               EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+               CERROR("ext3 mounted without journal\n");
+               return -EINVAL;
+       }
+
+#ifdef S_PDIROPS
+       CWARN("Enabling PDIROPS\n");
+       set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
+       sb->s_flags |= S_PDIROPS;
+#endif
+       if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+               CWARN("filesystem doesn't have dir_index feature enabled\n");
+       return 0;
+}
+
+static struct fsfilt_operations fsfilt_ext3_ops = {
+       .fs_type                = "ext3",
+       .fs_owner               = THIS_MODULE,
+       .fs_getlabel            = fsfilt_ext3_get_label,
+       .fs_start               = fsfilt_ext3_start,
+       .fs_commit              = fsfilt_ext3_commit,
+       .fs_map_inode_pages     = fsfilt_ext3_map_inode_pages,
+       .fs_write_record        = fsfilt_ext3_write_record,
+       .fs_read_record         = fsfilt_ext3_read_record,
+       .fs_setup               = fsfilt_ext3_setup,
+};
+
+static int __init fsfilt_ext3_init(void)
+{
+       int rc;
+
+       fcb_cache = kmem_cache_create("fsfilt_ext3_fcb",
+                                     sizeof(struct fsfilt_cb_data),
+                                     0, 0, NULL);
+       if (!fcb_cache) {
+               CERROR("error allocating fsfilt journal callback cache\n");
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       rc = fsfilt_register_ops(&fsfilt_ext3_ops);
+
+       if (rc)
+               kmem_cache_destroy(fcb_cache);
+out:
+       return rc;
+}
+
+static void __exit fsfilt_ext3_exit(void)
+{
+       fsfilt_unregister_ops(&fsfilt_ext3_ops);
+       kmem_cache_destroy(fcb_cache);
+}
+
+module_init(fsfilt_ext3_init);
+module_exit(fsfilt_ext3_exit);
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c
new file mode 100644 (file)
index 0000000..97a8be2
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_lib.c
+ *
+ * Lustre filesystem abstraction routines
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+#include <linux/module.h>
+#include <lustre_lib.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             smp_id;
+       unsigned long                   flags = 0;
+
+       if (stats == NULL)
+               return;
+
+       /* With per-client stats, statistics are allocated only for a
+        * single CPU area, so smp_id should always be 0. */
+       smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+       if (smp_id < 0)
+               return;
+
+       header = &stats->ls_cnt_header[idx];
+       percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+       percpu_cntr->lc_count++;
+
+       if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+               /*
+                * lprocfs_counter_add() can be called in interrupt context,
+                * as a memory allocation can trigger the memory shrinker
+                * ldlm_pool_shrink(), which calls lprocfs_counter_add().
+                * LU-1727.
+                *
+                * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+                * flag, because it needs accurate counting lest the memory
+                * leak check report errors.
+                */
+               if (in_interrupt() &&
+                   (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq += amount;
+               else
+                       percpu_cntr->lc_sum += amount;
+
+               if (header->lc_config & LPROCFS_CNTR_STDDEV)
+                       percpu_cntr->lc_sumsquare += (__s64)amount * amount;
+               if (amount < percpu_cntr->lc_min)
+                       percpu_cntr->lc_min = amount;
+               if (amount > percpu_cntr->lc_max)
+                       percpu_cntr->lc_max = amount;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_add);
+
+void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             smp_id;
+       unsigned long                   flags = 0;
+
+       if (stats == NULL)
+               return;
+
+       /* With per-client stats, statistics are allocated only for a
+        * single CPU area, so smp_id should always be 0. */
+       smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags);
+       if (smp_id < 0)
+               return;
+
+       header = &stats->ls_cnt_header[idx];
+       percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx);
+       if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+               /*
+                * Sometimes we use RCU callbacks to free memory, which in
+                * turn call lprocfs_counter_sub(), and RCU callbacks may
+                * execute in softirq context - right now that is the only
+                * case where we run in softirq context here, so use a
+                * separate counter for it. bz20650.
+                *
+                * Only obd_memory uses the LPROCFS_STATS_FLAG_IRQ_SAFE
+                * flag, because it needs accurate counting lest the memory
+                * leak check report errors.
+                */
+               if (in_interrupt() &&
+                   (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq -= amount;
+               else
+                       percpu_cntr->lc_sum -= amount;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_sub);
+
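+/*
+ * Lazily allocate the per-CPU counter block for 'cpuid', update the
+ * highest-allocated-CPU watermark under ls_lock, and set each counter's
+ * lc_min to LC_MIN_INIT.
+ */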
+int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid)
+{
+       struct lprocfs_counter  *cntr;
+       unsigned int            percpusize;
+       int                     rc = -ENOMEM;
+       unsigned long           flags = 0;
+       int                     i;
+
+       LASSERT(stats->ls_percpu[cpuid] == NULL);
+       LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0);
+
+       percpusize = lprocfs_stats_counter_size(stats);
+       LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize);
+       if (stats->ls_percpu[cpuid] != NULL) {
+               rc = 0;
+               if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) {
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               spin_lock_irqsave(&stats->ls_lock, flags);
+                       else
+                               spin_lock(&stats->ls_lock);
+                       if (stats->ls_biggest_alloc_num <= cpuid)
+                               stats->ls_biggest_alloc_num = cpuid + 1;
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
+                               spin_unlock_irqrestore(&stats->ls_lock, flags);
+                       } else {
+                               spin_unlock(&stats->ls_lock);
+                       }
+               }
+               /* initialize the counters in ls_percpu[cpuid] that need a
+                * non-zero initial value */
+               for (i = 0; i < stats->ls_num; ++i) {
+                       cntr = lprocfs_stats_counter_get(stats, cpuid, i);
+                       cntr->lc_min = LC_MIN_INIT;
+               }
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_stats_alloc_one);
+#endif  /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c
new file mode 100644 (file)
index 0000000..1e6f32c
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lvfs/lvfs_linux.c
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/version.h>
+#include <linux/libcfs/libcfs.h>
+#include <lustre_fsfilt.h>
+#include <obd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/lustre_compat25.h>
+#include <lvfs.h>
+
+#include <obd.h>
+#include <lustre_lib.h>
+
+struct lprocfs_stats *obd_memory = NULL;
+EXPORT_SYMBOL(obd_memory);
+/* refine later and change to a seqlock or similar from libcfs */
+
+/* Debugging check only needed during development */
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\
+                                             msg)
+# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg)
+#else
+# define ASSERT_CTXT_MAGIC(magic) do {} while (0)
+# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while (0)
+# define ASSERT_KERNEL_CTXT(msg) do {} while (0)
+#endif
+
+static void push_group_info(struct lvfs_run_ctxt *save,
+                           struct group_info *ginfo)
+{
+       if (!ginfo) {
+               save->ngroups = current_ngroups;
+               current_ngroups = 0;
+       } else {
+               struct cred *cred;
+               task_lock(current);
+               save->group_info = current_cred()->group_info;
+               if ((cred = prepare_creds())) {
+                       cred->group_info = ginfo;
+                       commit_creds(cred);
+               }
+               task_unlock(current);
+       }
+}
+
+static void pop_group_info(struct lvfs_run_ctxt *save,
+                          struct group_info *ginfo)
+{
+       if (!ginfo) {
+               current_ngroups = save->ngroups;
+       } else {
+               struct cred *cred;
+               task_lock(current);
+               if ((cred = prepare_creds())) {
+                       cred->group_info = save->group_info;
+                       commit_creds(cred);
+               }
+               task_unlock(current);
+       }
+}
+
+/* push / pop to root of obd store */
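+/*
+ * push_ctxt() saves the caller's fs state (address limit, cwd and mount,
+ * umask, credentials) and switches to the obd store's root context;
+ * pop_ctxt() restores the saved state and checks that the context was
+ * left as expected.
+ */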
+void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx,
+              struct lvfs_ucred *uc)
+{
+       /* if there is an underlying dt_device then push_ctxt is not needed */
+       if (new_ctx->dt != NULL)
+               return;
+
+       //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n");
+       ASSERT_CTXT_MAGIC(new_ctx->magic);
+       OBD_SET_CTXT_MAGIC(save);
+
+       save->fs = get_fs();
+       LASSERT(d_refcount(cfs_fs_pwd(current->fs)));
+       LASSERT(d_refcount(new_ctx->pwd));
+       save->pwd = dget(cfs_fs_pwd(current->fs));
+       save->pwdmnt = mntget(cfs_fs_mnt(current->fs));
+       save->luc.luc_umask = current_umask();
+       save->ngroups = current_cred()->group_info->ngroups;
+
+       LASSERT(save->pwd);
+       LASSERT(save->pwdmnt);
+       LASSERT(new_ctx->pwd);
+       LASSERT(new_ctx->pwdmnt);
+
+       if (uc) {
+               struct cred *cred;
+               save->luc.luc_uid = current_uid();
+               save->luc.luc_gid = current_gid();
+               save->luc.luc_fsuid = current_fsuid();
+               save->luc.luc_fsgid = current_fsgid();
+               save->luc.luc_cap = current_cap();
+
+               if ((cred = prepare_creds())) {
+                       cred->uid = uc->luc_uid;
+                       cred->gid = uc->luc_gid;
+                       cred->fsuid = uc->luc_fsuid;
+                       cred->fsgid = uc->luc_fsgid;
+                       cred->cap_effective = uc->luc_cap;
+                       commit_creds(cred);
+               }
+
+               push_group_info(save,
+                               uc->luc_ginfo ?:
+                               uc->luc_identity ? uc->luc_identity->mi_ginfo :
+                                                  NULL);
+       }
+       current->fs->umask = 0; /* umask already applied on client */
+       set_fs(new_ctx->fs);
+       ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd);
+}
+EXPORT_SYMBOL(push_ctxt);
+
+void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx,
+             struct lvfs_ucred *uc)
+{
+       /* if there is an underlying dt_device then pop_ctxt is not needed */
+       if (new_ctx->dt != NULL)
+               return;
+
+       ASSERT_CTXT_MAGIC(saved->magic);
+       ASSERT_KERNEL_CTXT("popping non-kernel context!\n");
+
+       LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n",
+                cfs_fs_pwd(current->fs), new_ctx->pwd);
+       LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n",
+                cfs_fs_mnt(current->fs), new_ctx->pwdmnt);
+
+       set_fs(saved->fs);
+       ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
+
+       dput(saved->pwd);
+       mntput(saved->pwdmnt);
+       current->fs->umask = saved->luc.luc_umask;
+       if (uc) {
+               struct cred *cred;
+               if ((cred = prepare_creds())) {
+                       cred->uid = saved->luc.luc_uid;
+                       cred->gid = saved->luc.luc_gid;
+                       cred->fsuid = saved->luc.luc_fsuid;
+                       cred->fsgid = saved->luc.luc_fsgid;
+                       cred->cap_effective = saved->luc.luc_cap;
+                       commit_creds(cred);
+               }
+
+               pop_group_info(saved,
+                              uc->luc_ginfo ?:
+                              uc->luc_identity ? uc->luc_identity->mi_ginfo :
+                                                 NULL);
+       }
+}
+EXPORT_SYMBOL(pop_ctxt);
+
+/* utility to rename a file */
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
+                 char *oldname, char *newname)
+{
+       struct dentry *dchild_old, *dchild_new;
+       int err = 0;
+       ENTRY;
+
+       ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
+       CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
+              (int)strlen(oldname), oldname, (int)strlen(newname), newname);
+
+       dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
+       if (IS_ERR(dchild_old))
+               RETURN(PTR_ERR(dchild_old));
+
+       if (!dchild_old->d_inode)
+               GOTO(put_old, err = -ENOENT);
+
+       dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
+       if (IS_ERR(dchild_new))
+               GOTO(put_old, err = PTR_ERR(dchild_new));
+
+       err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
+                           dir->d_inode, dchild_new, mnt);
+
+       dput(dchild_new);
+put_old:
+       dput(dchild_old);
+       RETURN(err);
+}
+EXPORT_SYMBOL(lustre_rename);
+
+/* Note: dput(dchild) will *not* be called if there is an error */
+struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de,
+                            int flags)
+{
+       struct path path = {
+               .dentry = de,
+               .mnt = ctxt->pwdmnt,
+       };
+       return ll_dentry_open(&path, flags, current_cred());
+}
+EXPORT_SYMBOL(l_dentry_open);
+
+#ifdef LPROCFS
+__s64 lprocfs_read_helper(struct lprocfs_counter *lc,
+                         struct lprocfs_counter_header *header,
+                         enum lprocfs_stats_flags flags,
+                         enum lprocfs_fields_flags field)
+{
+       __s64 ret = 0;
+
+       if (lc == NULL || header == NULL)
+               RETURN(0);
+
+       switch (field) {
+               case LPROCFS_FIELDS_FLAGS_CONFIG:
+                       ret = header->lc_config;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_SUM:
+                       ret = lc->lc_sum;
+                       if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                               ret += lc->lc_sum_irq;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_MIN:
+                       ret = lc->lc_min;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_MAX:
+                       ret = lc->lc_max;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_AVG:
+                       ret = (lc->lc_max - lc->lc_min) / 2;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
+                       ret = lc->lc_sumsquare;
+                       break;
+               case LPROCFS_FIELDS_FLAGS_COUNT:
+                       ret = lc->lc_count;
+                       break;
+               default:
+                       break;
+       }
+
+       RETURN(ret);
+}
+EXPORT_SYMBOL(lprocfs_read_helper);
+#endif /* LPROCFS */
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile
new file mode 100644 (file)
index 0000000..93bae24
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mdc.o
+mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c
new file mode 100644 (file)
index 0000000..6592478
--- /dev/null
@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+#ifdef LPROCFS
+
+static int mdc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static ssize_t mdc_max_rpcs_in_flight_seq_write(struct file *file,
+                                               const char *buffer,
+                                               size_t count,
+                                               loff_t *off)
+{
+       struct obd_device *dev =
+                       ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &dev->u.cli;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 1 || val > MDC_MAX_RIF_MAX)
+               return -ERANGE;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_rpcs_in_flight = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(mdc_max_rpcs_in_flight);
+
+static int mdc_kuc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, NULL, PDE_DATA(inode));
+}
+
+/* temporary for testing */
+static ssize_t mdc_kuc_write(struct file *file, const char *buffer,
+                            size_t count, loff_t *off)
+{
+       struct obd_device *obd =
+                       ((struct seq_file *)file->private_data)->private;
+       struct kuc_hdr          *lh;
+       struct hsm_action_list  *hal;
+       struct hsm_action_item  *hai;
+       int                      len;
+       int                      fd, rc;
+       ENTRY;
+
+       rc = lprocfs_write_helper(buffer, count, &fd);
+       if (rc)
+               RETURN(rc);
+
+       if (fd < 0)
+               RETURN(-ERANGE);
+       CWARN("message to fd %d\n", fd);
+
+       len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN +
+               /* for mockup below */ 2 * cfs_size_round(sizeof(*hai));
+
+       OBD_ALLOC(lh, len);
+       if (lh == NULL)
+               RETURN(-ENOMEM);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = KUC_TRANSPORT_HSM;
+       lh->kuc_msgtype = HMT_ACTION_LIST;
+       lh->kuc_msglen = len;
+
+       hal = (struct hsm_action_list *)(lh + 1);
+       hal->hal_version = HAL_VERSION;
+       hal->hal_archive_id = 1;
+       hal->hal_flags = 0;
+       obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN);
+
+       /* mock up an action list */
+       hal->hal_count = 2;
+       hai = hai_zero(hal);
+       hai->hai_action = HSMA_ARCHIVE;
+       hai->hai_fid.f_oid = 5;
+       hai->hai_len = sizeof(*hai);
+       hai = hai_next(hai);
+       hai->hai_action = HSMA_RESTORE;
+       hai->hai_fid.f_oid = 10;
+       hai->hai_len = sizeof(*hai);
+
+       /* This works for either broadcast or unicast to a single fd */
+       if (fd == 0) {
+               rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+       } else {
+               struct file *fp = fget(fd);
+
+               if (fp == NULL) {
+                       /* bad or stale fd from userspace */
+                       rc = -EBADF;
+               } else {
+                       rc = libcfs_kkuc_msg_put(fp, lh);
+                       fput(fp);
+               }
+       }
+       OBD_FREE(lh, len);
+       if (rc < 0)
+               RETURN(rc);
+       RETURN(count);
+}
+
+struct file_operations mdc_kuc_fops = {
+       .open           = mdc_kuc_open,
+       .write          = mdc_kuc_write,
+       .release        = single_release,
+};
+
+LPROC_SEQ_FOPS_WR_ONLY(mdc, ping);
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(mdc, state);
+
+static int mdc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+       return lprocfs_obd_rd_max_pages_per_rpc(m, m->private);
+}
+LPROC_SEQ_FOPS_RO(mdc_obd_max_pages_per_rpc);
+
+LPROC_SEQ_FOPS_RW_TYPE(mdc, import);
+LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov);
+
+static struct lprocfs_vars lprocfs_mdc_obd_vars[] = {
+       { "uuid",           &mdc_uuid_fops,             0, 0 },
+       { "ping",           &mdc_ping_fops,             0, 0222 },
+       { "connect_flags",  &mdc_connect_flags_fops,    0, 0 },
+       { "blocksize",      &mdc_blksize_fops,          0, 0 },
+       { "kbytestotal",    &mdc_kbytestotal_fops,      0, 0 },
+       { "kbytesfree",     &mdc_kbytesfree_fops,       0, 0 },
+       { "kbytesavail",    &mdc_kbytesavail_fops,      0, 0 },
+       { "filestotal",     &mdc_filestotal_fops,       0, 0 },
+       { "filesfree",      &mdc_filesfree_fops,        0, 0 },
+       /*{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },*/
+       { "mds_server_uuid", &mdc_server_uuid_fops,     0, 0 },
+       { "mds_conn_uuid",  &mdc_conn_uuid_fops,        0, 0 },
+       /*
+        * FIXME: the proc entry below is provided but not used; instead
+        * sbi->sb_md_brw_size is used.  The per-obd variable should be
+        * used once CMD is enabled and dir pages are managed in the MDC
+        * layer.  Remember to enable the proc write function then.
+        */
+       { "max_pages_per_rpc",  &mdc_obd_max_pages_per_rpc_fops, 0, 0 },
+       { "max_rpcs_in_flight", &mdc_max_rpcs_in_flight_fops, 0, 0 },
+       { "timeouts",           &mdc_timeouts_fops,    0, 0 },
+       { "import",             &mdc_import_fops, 0 },
+       { "state",              &mdc_state_fops, 0, 0 },
+       { "hsm_nl",             &mdc_kuc_fops, 0, 0200 },
+       { "pinger_recov",       &mdc_pinger_recov_fops, 0, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(mdc, numrefs);
+
+static struct lprocfs_vars lprocfs_mdc_module_vars[] = {
+       { "num_refs",   &mdc_numrefs_fops,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars  = lprocfs_mdc_module_vars;
+       lvars->obd_vars     = lprocfs_mdc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h
new file mode 100644 (file)
index 0000000..2aeff0e
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MDC_INTERNAL_H
+#define _MDC_INTERNAL_H
+
+#include <lustre_mdc.h>
+#include <lustre_mds.h>
+
+#ifdef LPROCFS
+void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid,
+                  struct obd_capa *oc, __u64 valid, int ea_size,
+                  __u32 suppgid, int flags);
+void mdc_pack_capa(struct ptlrpc_request *req,
+                  const struct req_msg_field *field, struct obd_capa *oc);
+int mdc_pack_req(struct ptlrpc_request *req, int version, int opc);
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+                       const struct lu_fid *cfid, int flags);
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+                          struct md_op_data *op_data);
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size,
+                     const struct lu_fid *fid, struct obd_capa *oc);
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+                     struct md_op_data *data, int ea_size);
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    void *ea, int ealen, void *ea2, int ea2len);
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const void *data, int datalen, __u32 mode, __u32 uid,
+                    __u32 gid, cfs_cap_t capability, __u64 rdev);
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                  __u32 mode, __u64 rdev, __u32 flags, const void *data,
+                  int datalen);
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data);
+int mdc_enter_request(struct client_obd *cli);
+void mdc_exit_request(struct client_obd *cli);
+
+/* mdc/mdc_locks.c */
+int mdc_set_lock_data(struct obd_export *exp,
+                     __u64 *lockh, void *data, __u64 *bits);
+
+int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid);
+
+int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
+                   ldlm_iterator_t it, void *data);
+
+int mdc_intent_lock(struct obd_export *exp,
+                   struct md_op_data *,
+                   void *lmm, int lmmsize,
+                   struct lookup_intent *, int,
+                   struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags);
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+               struct lookup_intent *it, struct md_op_data *op_data,
+               struct lustre_handle *lockh, void *lmm, int lmmsize,
+               struct ptlrpc_request **req, __u64 extra_lock_flags);
+
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+                           struct list_head *cancels, ldlm_mode_t mode,
+                           __u64 bits);
+/* mdc/mdc_request.c */
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data);
+
+int mdc_open(struct obd_export *exp, obd_id ino, int type, int flags,
+            struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
+            struct ptlrpc_request **);
+
+struct obd_client_handle;
+
+int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *lmv_exp,
+                     struct lustre_md *md);
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req);
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och);
+void mdc_commit_open(struct ptlrpc_request *req);
+void mdc_replay_open(struct ptlrpc_request *req);
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+              cfs_cap_t capability, __u64 rdev,
+              struct ptlrpc_request **request);
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+            struct ptlrpc_request **request);
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+              const char *old, int oldlen, const char *new, int newlen,
+              struct ptlrpc_request **request);
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+               void *ea, int ealen, void *ea2, int ea2len,
+               struct ptlrpc_request **request, struct md_open_data **mod);
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+              struct ptlrpc_request **request);
+int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
+                     ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                     ldlm_cancel_flags_t flags, void *opaque);
+
+static inline void mdc_set_capa_size(struct ptlrpc_request *req,
+                                    const struct req_msg_field *field,
+                                    struct obd_capa *oc)
+{
+       if (oc == NULL)
+               req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+       /* otherwise the size is already calculated as sizeof(struct obd_capa) */
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits);
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo);
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh);
+
+static inline int mdc_prep_elc_req(struct obd_export *exp,
+                                  struct ptlrpc_request *req, int opc,
+                                  struct list_head *cancels, int count)
+{
+       return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels,
+                                count);
+}
+
+#endif
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c
new file mode 100644 (file)
index 0000000..e789aed
--- /dev/null
@@ -0,0 +1,564 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+#include <lustre_net.h>
+#include <lustre/lustre_idl.h>
+#include "mdc_internal.h"
+
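+/* stamp the request body with the caller's identity and capability mask */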
+static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid)
+{
+       LASSERT(b != NULL);
+
+       b->suppgid = suppgid;
+       b->uid = current_uid();
+       b->gid = current_gid();
+       b->fsuid = current_fsuid();
+       b->fsgid = current_fsgid();
+       b->capability = cfs_curproc_cap_pack();
+}
+
+void mdc_pack_capa(struct ptlrpc_request *req, const struct req_msg_field *field,
+                  struct obd_capa *oc)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       struct lustre_capa *c;
+
+       if (oc == NULL) {
+               LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0);
+               return;
+       }
+
+       c = req_capsule_client_get(pill, field);
+       LASSERT(c != NULL);
+       capa_cpy(c, oc);
+       DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid,
+                       const struct lu_fid *cfid, int flags)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       if (pfid) {
+               b->fid1 = *pfid;
+               b->valid = OBD_MD_FLID;
+       }
+       if (cfid)
+               b->fid2 = *cfid;
+       b->flags = flags;
+}
+
+void mdc_swap_layouts_pack(struct ptlrpc_request *req,
+                          struct md_op_data *op_data)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       __mdc_pack_body(b, op_data->op_suppgids[0]);
+       b->fid1 = op_data->op_fid1;
+       b->fid2 = op_data->op_fid2;
+       b->valid |= OBD_MD_FLID;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+}
+
+void mdc_pack_body(struct ptlrpc_request *req,
+                  const struct lu_fid *fid, struct obd_capa *oc,
+                  __u64 valid, int ea_size, __u32 suppgid, int flags)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+       LASSERT(b != NULL);
+       b->valid = valid;
+       b->eadatasize = ea_size;
+       b->flags = flags;
+       __mdc_pack_body(b, suppgid);
+       if (fid) {
+               b->fid1 = *fid;
+               b->valid |= OBD_MD_FLID;
+               mdc_pack_capa(req, &RMF_CAPA1, oc);
+       }
+}
+
+void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff,
+                     __u32 size, const struct lu_fid *fid, struct obd_capa *oc)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+       b->fid1 = *fid;
+       b->valid |= OBD_MD_FLID;
+       b->size = pgoff;                /* !! mdt_body field reused: page offset */
+       b->nlink = size;                /* !! mdt_body field reused: buffer size */
+       __mdc_pack_body(b, -1);
+       b->mode = LUDA_FID | LUDA_TYPE;
+
+       mdc_pack_capa(req, &RMF_CAPA1, oc);
+}
+
+/* packing of MDS records */
+void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const void *data, int datalen, __u32 mode,
+                    __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev)
+{
+       struct mdt_rec_create   *rec;
+       char                    *tmp;
+       __u64                    flags;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       rec->cr_opcode   = REINT_CREATE;
+       rec->cr_fsuid    = uid;
+       rec->cr_fsgid    = gid;
+       rec->cr_cap      = cap_effective;
+       rec->cr_fid1     = op_data->op_fid1;
+       rec->cr_fid2     = op_data->op_fid2;
+       rec->cr_mode     = mode;
+       rec->cr_rdev     = rdev;
+       rec->cr_time     = op_data->op_mod_time;
+       rec->cr_suppgid1 = op_data->op_suppgids[0];
+       rec->cr_suppgid2 = op_data->op_suppgids[1];
+       flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+       if (op_data->op_bias & MDS_CREATE_VOLATILE)
+               flags |= MDS_OPEN_VOLATILE;
+       set_mrc_cr_flags(rec, flags);
+       rec->cr_bias     = op_data->op_bias;
+       rec->cr_umask    = current_umask();
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+       if (data) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, data, datalen);
+       }
+}
+
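+/* translate VFS open flags into the on-the-wire MDS_OPEN_* flag set */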
+static __u64 mds_pack_open_flags(__u32 flags, __u32 mode)
+{
+       __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE |
+                                  MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |
+                                  MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |
+                                  MDS_OPEN_BY_FID));
+       if (flags & O_CREAT)
+               cr_flags |= MDS_OPEN_CREAT;
+       if (flags & O_EXCL)
+               cr_flags |= MDS_OPEN_EXCL;
+       if (flags & O_TRUNC)
+               cr_flags |= MDS_OPEN_TRUNC;
+       if (flags & O_APPEND)
+               cr_flags |= MDS_OPEN_APPEND;
+       if (flags & O_SYNC)
+               cr_flags |= MDS_OPEN_SYNC;
+       if (flags & O_DIRECTORY)
+               cr_flags |= MDS_OPEN_DIRECTORY;
+#ifdef FMODE_EXEC
+       if (flags & FMODE_EXEC)
+               cr_flags |= MDS_FMODE_EXEC;
+#endif
+       if (flags & O_LOV_DELAY_CREATE)
+               cr_flags |= MDS_OPEN_DELAY_CREATE;
+
+       if (flags & O_NONBLOCK)
+               cr_flags |= MDS_OPEN_NORESTORE;
+
+       return cr_flags;
+}
+
+/* packing of MDS records */
+void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                  __u32 mode, __u64 rdev, __u32 flags, const void *lmm,
+                  int lmmlen)
+{
+       struct mdt_rec_create *rec;
+       char *tmp;
+       __u64 cr_flags;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       /* XXX do something about time, uid, gid */
+       rec->cr_opcode   = REINT_OPEN;
+       rec->cr_fsuid    = current_fsuid();
+       rec->cr_fsgid    = current_fsgid();
+       rec->cr_cap      = cfs_curproc_cap_pack();
+       if (op_data != NULL) {
+               rec->cr_fid1 = op_data->op_fid1;
+               rec->cr_fid2 = op_data->op_fid2;
+       }
+       rec->cr_mode     = mode;
+       cr_flags = mds_pack_open_flags(flags, mode);
+       rec->cr_rdev     = rdev;
+       rec->cr_time     = op_data->op_mod_time;
+       rec->cr_suppgid1 = op_data->op_suppgids[0];
+       rec->cr_suppgid2 = op_data->op_suppgids[1];
+       rec->cr_bias     = op_data->op_bias;
+       rec->cr_umask    = current_umask();
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       /* the next buffer is the child capa, which is used for replay and
+        * will be packed from the data in the reply message. */
+
+       if (op_data->op_name) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+               if (op_data->op_bias & MDS_CREATE_VOLATILE)
+                       cr_flags |= MDS_OPEN_VOLATILE;
+       }
+
+       if (lmm) {
+               cr_flags |= MDS_OPEN_HAS_EA;
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, lmm, lmmlen);
+       }
+       set_mrc_cr_flags(rec, cr_flags);
+}
+
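+/* Map the kernel's ATTR_* iattr validity bits onto the MDS_ATTR_* bits
+ * carried in mdt_rec_setattr. */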
+static inline __u64 attr_pack(unsigned int ia_valid)
+{
+       __u64 sa_valid = 0;
+
+       if (ia_valid & ATTR_MODE)
+               sa_valid |= MDS_ATTR_MODE;
+       if (ia_valid & ATTR_UID)
+               sa_valid |= MDS_ATTR_UID;
+       if (ia_valid & ATTR_GID)
+               sa_valid |= MDS_ATTR_GID;
+       if (ia_valid & ATTR_SIZE)
+               sa_valid |= MDS_ATTR_SIZE;
+       if (ia_valid & ATTR_ATIME)
+               sa_valid |= MDS_ATTR_ATIME;
+       if (ia_valid & ATTR_MTIME)
+               sa_valid |= MDS_ATTR_MTIME;
+       if (ia_valid & ATTR_CTIME)
+               sa_valid |= MDS_ATTR_CTIME;
+       if (ia_valid & ATTR_ATIME_SET)
+               sa_valid |= MDS_ATTR_ATIME_SET;
+       if (ia_valid & ATTR_MTIME_SET)
+               sa_valid |= MDS_ATTR_MTIME_SET;
+       if (ia_valid & ATTR_FORCE)
+               sa_valid |= MDS_ATTR_FORCE;
+       if (ia_valid & ATTR_ATTR_FLAG)
+               sa_valid |= MDS_ATTR_ATTR_FLAG;
+       if (ia_valid & ATTR_KILL_SUID)
+               sa_valid |= MDS_ATTR_KILL_SUID;
+       if (ia_valid & ATTR_KILL_SGID)
+               sa_valid |= MDS_ATTR_KILL_SGID;
+       if (ia_valid & ATTR_CTIME_SET)
+               sa_valid |= MDS_ATTR_CTIME_SET;
+       if (ia_valid & ATTR_FROM_OPEN)
+               sa_valid |= MDS_ATTR_FROM_OPEN;
+       if (ia_valid & ATTR_BLOCKS)
+               sa_valid |= MDS_ATTR_BLOCKS;
+       if (ia_valid & MDS_OPEN_OWNEROVERRIDE)
+               /* NFSD hack (see bug 5781) */
+               sa_valid |= MDS_OPEN_OWNEROVERRIDE;
+       return sa_valid;
+}
+
+static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec,
+                                struct md_op_data *op_data)
+{
+       rec->sa_opcode  = REINT_SETATTR;
+       rec->sa_fsuid   = current_fsuid();
+       rec->sa_fsgid   = current_fsgid();
+       rec->sa_cap     = cfs_curproc_cap_pack();
+       rec->sa_suppgid = -1;
+
+       rec->sa_fid    = op_data->op_fid1;
+       rec->sa_valid  = attr_pack(op_data->op_attr.ia_valid);
+       rec->sa_mode   = op_data->op_attr.ia_mode;
+       rec->sa_uid    = op_data->op_attr.ia_uid;
+       rec->sa_gid    = op_data->op_attr.ia_gid;
+       rec->sa_size   = op_data->op_attr.ia_size;
+       rec->sa_blocks = op_data->op_attr_blocks;
+       rec->sa_atime  = LTIME_S(op_data->op_attr.ia_atime);
+       rec->sa_mtime  = LTIME_S(op_data->op_attr.ia_mtime);
+       rec->sa_ctime  = LTIME_S(op_data->op_attr.ia_ctime);
+       rec->sa_attr_flags =
+               ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+       if ((op_data->op_attr.ia_valid & ATTR_GID) &&
+           current_is_in_group(op_data->op_attr.ia_gid))
+               rec->sa_suppgid = op_data->op_attr.ia_gid;
+       else
+               rec->sa_suppgid = op_data->op_suppgids[0];
+
+       rec->sa_bias = op_data->op_bias;
+}
+
+static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch,
+                            struct md_op_data *op_data)
+{
+       memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle));
+       epoch->ioepoch = op_data->op_ioepoch;
+       epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS;
+}
+
+void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                     void *ea, int ealen, void *ea2, int ea2len)
+{
+       struct mdt_rec_setattr *rec;
+       struct mdt_ioepoch *epoch;
+       struct lov_user_md *lum = NULL;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) ==
+                sizeof(struct mdt_rec_setattr));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       mdc_setattr_pack_rec(rec, op_data);
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) {
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+               mdc_ioepoch_pack(epoch, op_data);
+       }
+
+       if (ealen == 0)
+               return;
+
+       lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+       if (ea == NULL) { /* Remove LOV EA */
+               lum->lmm_magic = LOV_USER_MAGIC_V1;
+               lum->lmm_stripe_size = 0;
+               lum->lmm_stripe_count = 0;
+               lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1);
+       } else {
+               memcpy(lum, ea, ealen);
+       }
+
+       if (ea2len == 0)
+               return;
+
+       memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2,
+              ea2len);
+}
+
+void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_rec_unlink *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       LASSERT(rec != NULL);
+
+       rec->ul_opcode  = op_data->op_cli_flags & CLI_RM_ENTRY ?
+                                       REINT_RMENTRY : REINT_UNLINK;
+       rec->ul_fsuid   = op_data->op_fsuid;
+       rec->ul_fsgid   = op_data->op_fsgid;
+       rec->ul_cap     = op_data->op_cap;
+       rec->ul_mode    = op_data->op_mode;
+       rec->ul_suppgid1 = op_data->op_suppgids[0];
+       rec->ul_suppgid2 = -1;
+       rec->ul_fid1    = op_data->op_fid1;
+       rec->ul_fid2    = op_data->op_fid2;
+       rec->ul_time    = op_data->op_mod_time;
+       rec->ul_bias    = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LASSERT(tmp != NULL);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_rec_link *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       LASSERT(rec != NULL);
+
+       rec->lk_opcode   = REINT_LINK;
+       rec->lk_fsuid    = op_data->op_fsuid;
+       rec->lk_fsgid    = op_data->op_fsgid;
+       rec->lk_cap      = op_data->op_cap;
+       rec->lk_suppgid1 = op_data->op_suppgids[0];
+       rec->lk_suppgid2 = op_data->op_suppgids[1];
+       rec->lk_fid1     = op_data->op_fid1;
+       rec->lk_fid2     = op_data->op_fid2;
+       rec->lk_time     = op_data->op_mod_time;
+       rec->lk_bias     = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+}
+
+void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
+                    const char *old, int oldlen, const char *new, int newlen)
+{
+       struct mdt_rec_rename *rec;
+       char *tmp;
+
+       CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       /* XXX do something about time, uid, gid */
+       rec->rn_opcode   = REINT_RENAME;
+       rec->rn_fsuid    = op_data->op_fsuid;
+       rec->rn_fsgid    = op_data->op_fsgid;
+       rec->rn_cap      = op_data->op_cap;
+       rec->rn_suppgid1 = op_data->op_suppgids[0];
+       rec->rn_suppgid2 = op_data->op_suppgids[1];
+       rec->rn_fid1     = op_data->op_fid1;
+       rec->rn_fid2     = op_data->op_fid2;
+       rec->rn_time     = op_data->op_mod_time;
+       rec->rn_mode     = op_data->op_mode;
+       rec->rn_bias     = op_data->op_bias;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2);
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+       LOGL0(old, oldlen, tmp);
+
+       if (new) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT);
+               LOGL0(new, newlen, tmp);
+       }
+}
+
+void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags,
+                     struct md_op_data *op_data, int ea_size)
+{
+       struct mdt_body *b = req_capsule_client_get(&req->rq_pill,
+                                                   &RMF_MDT_BODY);
+
+       b->valid = valid;
+       if (op_data->op_bias & MDS_CHECK_SPLIT)
+               b->valid |= OBD_MD_FLCKSPLIT;
+       if (op_data->op_bias & MDS_CROSS_REF)
+               b->valid |= OBD_MD_FLCROSSREF;
+       b->eadatasize = ea_size;
+       b->flags = flags;
+       __mdc_pack_body(b, op_data->op_suppgids[0]);
+
+       b->fid1 = op_data->op_fid1;
+       b->fid2 = op_data->op_fid2;
+       b->valid |= OBD_MD_FLID;
+
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+
+       if (op_data->op_name) {
+               char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LOGL0(op_data->op_name, op_data->op_namelen, tmp);
+
+       }
+}
+
+void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
+{
+       struct mdt_ioepoch *epoch;
+       struct mdt_rec_setattr *rec;
+
+       epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+
+       mdc_setattr_pack_rec(rec, op_data);
+       mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_ioepoch_pack(epoch, op_data);
+}
+
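+/* A waiter has been granted a request slot once mdc_exit_request() has
+ * removed it from cl_cache_waiters, so an empty mcw_entry means a slot
+ * is available. */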
+static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
+{
+       int rc;
+       ENTRY;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&mcw->mcw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       RETURN(rc);
+}
+
+/* We record requests in flight in cli->cl_r_in_flight here.
+ * There is only one write RPC possible in mdc anyway. If this changes
+ * in the future, the code may need to be revisited. */
+int mdc_enter_request(struct client_obd *cli)
+{
+       int rc = 0;
+       struct mdc_cache_waiter mcw;
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+               list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters);
+               init_waitqueue_head(&mcw.mcw_waitq);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), &lwi);
+               if (rc) {
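+                       /* Interrupted while waiting: if we were already
+                        * removed from the waiters list, a slot was granted
+                        * concurrently and must be released again. */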
+                       client_obd_list_lock(&cli->cl_loi_list_lock);
+                       if (list_empty(&mcw.mcw_entry))
+                               cli->cl_r_in_flight--;
+                       list_del_init(&mcw.mcw_entry);
+                       client_obd_list_unlock(&cli->cl_loi_list_lock);
+               }
+       } else {
+               cli->cl_r_in_flight++;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+       return rc;
+}
+
+void mdc_exit_request(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct mdc_cache_waiter *mcw;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_r_in_flight--;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+               if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
+                       /* No free request slots anymore */
+                       break;
+               }
+
+               mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry);
+               list_del_init(&mcw->mcw_entry);
+               cli->cl_r_in_flight++;
+               wake_up(&mcw->mcw_waitq);
+       }
+
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c
new file mode 100644 (file)
index 0000000..1cc90b6
--- /dev/null
@@ -0,0 +1,1229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_dlm.h>
+/* fid_res_name_eq() */
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include "mdc_internal.h"
+
+struct mdc_getattr_args {
+       struct obd_export          *ga_exp;
+       struct md_enqueue_info      *ga_minfo;
+       struct ldlm_enqueue_info    *ga_einfo;
+};
+
+int it_disposition(struct lookup_intent *it, int flag)
+{
+       return it->d.lustre.it_disposition & flag;
+}
+EXPORT_SYMBOL(it_disposition);
+
+void it_set_disposition(struct lookup_intent *it, int flag)
+{
+       it->d.lustre.it_disposition |= flag;
+}
+EXPORT_SYMBOL(it_set_disposition);
+
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{
+       it->d.lustre.it_disposition &= ~flag;
+}
+EXPORT_SYMBOL(it_clear_disposition);
+
+int it_open_error(int phase, struct lookup_intent *it)
+{
+       if (it_disposition(it, DISP_OPEN_OPEN)) {
+               if (phase >= DISP_OPEN_OPEN)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_OPEN_CREATE)) {
+               if (phase >= DISP_OPEN_CREATE)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_LOOKUP_EXECD)) {
+               if (phase >= DISP_LOOKUP_EXECD)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+
+       if (it_disposition(it, DISP_IT_EXECD)) {
+               if (phase >= DISP_IT_EXECD)
+                       return it->d.lustre.it_status;
+               else
+                       return 0;
+       }
+       CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
+              it->d.lustre.it_status);
+       LBUG();
+       return 0;
+}
+EXPORT_SYMBOL(it_open_error);
+
+/* this must be called on a lockh that is known to have a referenced lock */
+int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
+                     __u64 *bits)
+{
+       struct ldlm_lock *lock;
+       struct inode *new_inode = data;
+       ENTRY;
+
+       if (bits)
+               *bits = 0;
+
+       if (!*lockh)
+               RETURN(0);
+
+       lock = ldlm_handle2lock((struct lustre_handle *)lockh);
+
+       LASSERT(lock != NULL);
+       lock_res_and_lock(lock);
+       if (lock->l_resource->lr_lvb_inode &&
+           lock->l_resource->lr_lvb_inode != data) {
+               struct inode *old_inode = lock->l_resource->lr_lvb_inode;
+               LASSERTF(old_inode->i_state & I_FREEING,
+                        "Found existing inode %p/%lu/%u state %lu in lock: "
+                        "setting data to %p/%lu/%u\n", old_inode,
+                        old_inode->i_ino, old_inode->i_generation,
+                        old_inode->i_state,
+                        new_inode, new_inode->i_ino, new_inode->i_generation);
+       }
+       lock->l_resource->lr_lvb_inode = new_inode;
+       if (bits)
+               *bits = lock->l_policy_data.l_inodebits.bits;
+
+       unlock_res_and_lock(lock);
+       LDLM_LOCK_PUT(lock);
+
+       RETURN(0);
+}
+
+ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags,
+                          const struct lu_fid *fid, ldlm_type_t type,
+                          ldlm_policy_data_t *policy, ldlm_mode_t mode,
+                          struct lustre_handle *lockh)
+{
+       struct ldlm_res_id res_id;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       fid_build_reg_res_name(fid, &res_id);
+       rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
+                            &res_id, type, policy, mode, lockh, 0);
+       RETURN(rc);
+}
+
+int mdc_cancel_unused(struct obd_export *exp,
+                     const struct lu_fid *fid,
+                     ldlm_policy_data_t *policy,
+                     ldlm_mode_t mode,
+                     ldlm_cancel_flags_t flags,
+                     void *opaque)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+       int rc;
+
+       ENTRY;
+
+       fid_build_reg_res_name(fid, &res_id);
+       rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id,
+                                            policy, mode, flags, opaque);
+       RETURN(rc);
+}
+
+int mdc_null_inode(struct obd_export *exp,
+                  const struct lu_fid *fid)
+{
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace;
+       ENTRY;
+
+       LASSERTF(ns != NULL, "no namespace passed\n");
+
+       fid_build_reg_res_name(fid, &res_id);
+
+       res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       lock_res(res);
+       res->lr_lvb_inode = NULL;
+       unlock_res(res);
+
+       ldlm_resource_putref(res);
+       RETURN(0);
+}
+
+/* Find any ldlm lock of the inode in mdc.
+ * Return 0 if none was found,
+ *        1 if one was found,
+ *      < 0 on error. */
+int mdc_find_cbdata(struct obd_export *exp,
+                   const struct lu_fid *fid,
+                   ldlm_iterator_t it, void *data)
+{
+       struct ldlm_res_id res_id;
+       int rc = 0;
+       ENTRY;
+
+       fid_build_reg_res_name(fid, &res_id);
+       rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id,
+                                  it, data);
+       if (rc == LDLM_ITER_STOP)
+               RETURN(1);
+       else if (rc == LDLM_ITER_CONTINUE)
+               RETURN(0);
+       RETURN(rc);
+}
+
+static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc)
+{
+       /* Don't hold error requests for replay. */
+       if (req->rq_replay) {
+               spin_lock(&req->rq_lock);
+               req->rq_replay = 0;
+               spin_unlock(&req->rq_lock);
+       }
+       if (rc && req->rq_transno != 0) {
+               DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc);
+               LBUG();
+       }
+}
+
+/* Save a large LOV EA into the request buffer so that it is available
+ * for replay.  We don't do this in the initial request because the
+ * original request doesn't need this buffer (at most it sends just the
+ * lov_mds_md); it would be a waste of RAM/bandwidth to send the empty
+ * buffer, and it may also be difficult to allocate and save a very
+ * large request buffer for each open. (bug 5707)
+ *
+ * OOM here may cause recovery failure if lmm is needed (only for the
+ * original open if the MDS crashed just when this client also OOM'd),
+ * but this is incredibly unlikely, and it is questionable whether the
+ * client could do MDS recovery under OOM anyway... */
+static void mdc_realloc_openmsg(struct ptlrpc_request *req,
+                               struct mdt_body *body)
+{
+       int     rc;
+
+       /* FIXME: remove this explicit offset. */
+       rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4,
+                                       body->eadatasize);
+       if (rc) {
+               CERROR("Can't enlarge segment %d size to %d\n",
+                      DLM_INTENT_REC_OFF + 4, body->eadatasize);
+               body->valid &= ~OBD_MD_FLEASIZE;
+               body->eadatasize = 0;
+       }
+}
+
+static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
+                                                  struct lookup_intent *it,
+                                                  struct md_op_data *op_data,
+                                                  void *lmm, int lmmsize,
+                                                  void *cb_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ldlm_intent    *lit;
+       LIST_HEAD(cancels);
+       int                 count = 0;
+       int                 mode;
+       int                 rc;
+       ENTRY;
+
+       it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG;
+
+       /* XXX: openlock is not cancelled for cross-refs. */
+       /* If inode is known, cancel conflicting OPEN locks. */
+       if (fid_is_sane(&op_data->op_fid2)) {
+               if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
+                       mode = LCK_CW;
+#ifdef FMODE_EXEC
+               else if (it->it_flags & FMODE_EXEC)
+                       mode = LCK_PR;
+#endif
+               else
+                       mode = LCK_CR;
+               count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                               &cancels, mode,
+                                               MDS_INODELOCK_OPEN);
+       }
+
+       /* If CREATE, cancel parent's UPDATE lock. */
+       if (it->it_op & IT_CREAT)
+               mode = LCK_EX;
+       else
+               mode = LCK_CR;
+       count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                        &cancels, mode,
+                                        MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_OPEN);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       /* parent capability */
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       /* child capability, reserve the size according to the parent capa;
+        * it will be filled in after we get the reply */
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                            max(lmmsize, obddev->u.cli.cl_default_mds_easize));
+
+       rc = ldlm_prep_enqueue_req(exp, req, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       spin_lock(&req->rq_lock);
+       req->rq_replay = req->rq_import->imp_replayable;
+       spin_unlock(&req->rq_lock);
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm,
+                     lmmsize);
+
+       /* for remote client, fetch remote perm for current user */
+       if (client_is_remote(exp))
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       ptlrpc_request_set_replen(req);
+       return req;
+}
+
+static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp,
+                                                    struct lookup_intent *it,
+                                                    struct md_op_data *op_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ldlm_intent    *lit;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_UNLINK);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_unlink_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp,
+                                                     struct lookup_intent *it,
+                                                     struct md_op_data *op_data)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obddev = class_exp2obd(exp);
+       obd_valid             valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+                                      OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA |
+                                      OBD_MD_FLMDSCAPA | OBD_MD_MEA |
+                                      (client_is_remote(exp) ?
+                                              OBD_MD_FLRMTPERM : OBD_MD_FLACL);
+       struct ldlm_intent    *lit;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_LDLM_INTENT_GETATTR);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the intended request */
+       mdc_getattr_pack(req, valid, it->it_flags, op_data,
+                        obddev->u.cli.cl_max_mds_easize);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obddev->u.cli.cl_max_mds_easize);
+       if (client_is_remote(exp))
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp,
+                                                    struct lookup_intent *it,
+                                                    struct md_op_data *unused)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct ldlm_intent    *lit;
+       struct layout_intent  *layout;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                               &RQF_LDLM_INTENT_LAYOUT);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0);
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* pack the intent */
+       lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
+       lit->opc = (__u64)it->it_op;
+
+       /* pack the layout intent request */
+       layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT);
+       /* LAYOUT_INTENT_ACCESS is generic; a more specific operation will
+        * be set for replication */
+       layout->li_opc = LAYOUT_INTENT_ACCESS;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                       obd->u.cli.cl_max_mds_easize);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static struct ptlrpc_request *
+mdc_enqueue_pack(struct obd_export *exp, int lvb_len)
+{
+       struct ptlrpc_request *req;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(ERR_PTR(rc));
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
+       ptlrpc_request_set_replen(req);
+       RETURN(req);
+}
+
+static int mdc_finish_enqueue(struct obd_export *exp,
+                             struct ptlrpc_request *req,
+                             struct ldlm_enqueue_info *einfo,
+                             struct lookup_intent *it,
+                             struct lustre_handle *lockh,
+                             int rc)
+{
+       struct req_capsule  *pill = &req->rq_pill;
+       struct ldlm_request *lockreq;
+       struct ldlm_reply   *lockrep;
+       struct lustre_intent_data *intent = &it->d.lustre;
+       struct ldlm_lock    *lock;
+       void            *lvb_data = NULL;
+       int               lvb_len = 0;
+       ENTRY;
+
+       LASSERT(rc >= 0);
+       /* If we're going to replay this request, we don't want to
+        * actually get a lock, just perform the intent. */
+       if (req->rq_transno || req->rq_replay) {
+               lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ);
+               lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY);
+       }
+
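+       /* ELDLM_LOCK_ABORTED means the server executed the intent without
+        * granting a lock, so clear the lock handle and carry on. */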
+       if (rc == ELDLM_LOCK_ABORTED) {
+               einfo->ei_mode = 0;
+               memset(lockh, 0, sizeof(*lockh));
+               rc = 0;
+       } else { /* rc = 0 */
+               lock = ldlm_handle2lock(lockh);
+               LASSERT(lock != NULL);
+
+               /* If the server gave us back a different lock mode, we should
+                * fix up our variables. */
+               if (lock->l_req_mode != einfo->ei_mode) {
+                       ldlm_lock_addref(lockh, lock->l_req_mode);
+                       ldlm_lock_decref(lockh, einfo->ei_mode);
+                       einfo->ei_mode = lock->l_req_mode;
+               }
+               LDLM_LOCK_PUT(lock);
+       }
+
+       lockrep = req_capsule_server_get(pill, &RMF_DLM_REP);
+       LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */
+
+       intent->it_disposition = (int)lockrep->lock_policy_res1;
+       intent->it_status = (int)lockrep->lock_policy_res2;
+       intent->it_lock_mode = einfo->ei_mode;
+       intent->it_lock_handle = lockh->cookie;
+       intent->it_data = req;
+
+       /* Technically speaking rq_transno must already be zero if
+        * it_status is in error, so the check is a bit redundant */
+       if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay)
+               mdc_clear_replay_flag(req, intent->it_status);
+
+       /* If we're doing an IT_OPEN which did not result in an actual
+        * successful open, then we need to remove the bit which saves
+        * this request for unconditional replay.
+        *
+        * It's important that we do this first!  Otherwise we might exit the
+        * function without doing so, and try to replay a failed create
+        * (bug 3440) */
+       if (it->it_op & IT_OPEN && req->rq_replay &&
+           (!it_disposition(it, DISP_OPEN_OPEN) || intent->it_status != 0))
+               mdc_clear_replay_flag(req, intent->it_status);
+
+       DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d",
+                 it->it_op, intent->it_disposition, intent->it_status);
+
+       /* We know what to expect, so we do any byte flipping required here */
+       if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) {
+               struct mdt_body *body;
+
+               body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+               if (body == NULL) {
+                       CERROR ("Can't swab mdt_body\n");
+                       RETURN (-EPROTO);
+               }
+
+               if (it_disposition(it, DISP_OPEN_OPEN) &&
+                   !it_open_error(DISP_OPEN_OPEN, it)) {
+                       /*
+                        * If this is a successful OPEN request, we need to set
+                        * replay handler and data early, so that if replay
+                        * happens immediately after swabbing below, new reply
+                        * is swabbed by that handler correctly.
+                        */
+                       mdc_set_open_replay_data(NULL, NULL, req);
+               }
+
+               if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) {
+                       void *eadata;
+
+                       mdc_update_max_ea_from_body(exp, body);
+
+                       /*
+                        * The eadata is opaque; just check that it is there.
+                        * Eventually, obd_unpackmd() will check the contents.
+                        */
+                       eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                             body->eadatasize);
+                       if (eadata == NULL)
+                               RETURN(-EPROTO);
+
+                       /* save lvb data and length in case this is for layout
+                        * lock */
+                       lvb_data = eadata;
+                       lvb_len = body->eadatasize;
+
+                       /*
+                        * We save the reply LOV EA in case we have to replay a
+                        * create for recovery.  If we didn't allocate a large
+                        * enough request buffer above we need to reallocate it
+                        * here to hold the actual LOV EA.
+                        *
+                        * Do not save the LOV EA if the request is not going
+                        * to be replayed (for example, a failed one).
+                        */
+                       if ((it->it_op & IT_OPEN) && req->rq_replay) {
+                               void *lmm;
+                               if (req_capsule_get_size(pill, &RMF_EADATA,
+                                                        RCL_CLIENT) <
+                                   body->eadatasize)
+                                       mdc_realloc_openmsg(req, body);
+                               else
+                                       req_capsule_shrink(pill, &RMF_EADATA,
+                                                          body->eadatasize,
+                                                          RCL_CLIENT);
+
+                               req_capsule_set_size(pill, &RMF_EADATA,
+                                                    RCL_CLIENT,
+                                                    body->eadatasize);
+
+                               lmm = req_capsule_client_get(pill, &RMF_EADATA);
+                               if (lmm)
+                                       memcpy(lmm, eadata, body->eadatasize);
+                       }
+               }
+
+               if (body->valid & OBD_MD_FLRMTPERM) {
+                       struct mdt_remote_perm *perm;
+
+                       LASSERT(client_is_remote(exp));
+                       perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+                       if (perm == NULL)
+                               RETURN(-EPROTO);
+               }
+               if (body->valid & OBD_MD_FLMDSCAPA) {
+                       struct lustre_capa *capa, *p;
+
+                       capa = req_capsule_server_get(pill, &RMF_CAPA1);
+                       if (capa == NULL)
+                               RETURN(-EPROTO);
+
+                       if (it->it_op & IT_OPEN) {
+                               /* client fid capa will be checked in replay */
+                               p = req_capsule_client_get(pill, &RMF_CAPA2);
+                               LASSERT(p);
+                               *p = *capa;
+                       }
+               }
+               if (body->valid & OBD_MD_FLOSSCAPA) {
+                       struct lustre_capa *capa;
+
+                       capa = req_capsule_server_get(pill, &RMF_CAPA2);
+                       if (capa == NULL)
+                               RETURN(-EPROTO);
+               }
+       } else if (it->it_op & IT_LAYOUT) {
+               /* maybe the lock was granted right away and layout
+                * is packed into RMF_DLM_LVB of req */
+               lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER);
+               if (lvb_len > 0) {
+                       lvb_data = req_capsule_server_sized_get(pill,
+                                                       &RMF_DLM_LVB, lvb_len);
+                       if (lvb_data == NULL)
+                               RETURN(-EPROTO);
+               }
+       }
+
+       /* fill in stripe data for layout lock */
+       lock = ldlm_handle2lock(lockh);
+       if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) {
+               void *lmm;
+
+               LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n",
+                       ldlm_it2str(it->it_op), lvb_len);
+
+               OBD_ALLOC_LARGE(lmm, lvb_len);
+               if (lmm == NULL) {
+                       LDLM_LOCK_PUT(lock);
+                       RETURN(-ENOMEM);
+               }
+               memcpy(lmm, lvb_data, lvb_len);
+
+               /* install lvb_data */
+               lock_res_and_lock(lock);
+               if (lock->l_lvb_data == NULL) {
+                       lock->l_lvb_data = lmm;
+                       lock->l_lvb_len = lvb_len;
+                       lmm = NULL;
+               }
+               unlock_res_and_lock(lock);
+               if (lmm != NULL)
+                       OBD_FREE_LARGE(lmm, lvb_len);
+       }
+       if (lock != NULL)
+               LDLM_LOCK_PUT(lock);
+
+       RETURN(rc);
+}
+
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know in advance the file type. */
+int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
+               struct lookup_intent *it, struct md_op_data *op_data,
+               struct lustre_handle *lockh, void *lmm, int lmmsize,
+               struct ptlrpc_request **reqp, __u64 extra_lock_flags)
+{
+       struct obd_device     *obddev = class_exp2obd(exp);
+       struct ptlrpc_request *req = NULL;
+       __u64             flags, saved_flags = extra_lock_flags;
+       int                 rc;
+       struct ldlm_res_id res_id;
+       static const ldlm_policy_data_t lookup_policy =
+                           { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
+       static const ldlm_policy_data_t update_policy =
+                           { .l_inodebits = { MDS_INODELOCK_UPDATE } };
+       static const ldlm_policy_data_t layout_policy =
+                           { .l_inodebits = { MDS_INODELOCK_LAYOUT } };
+       ldlm_policy_data_t const *policy = &lookup_policy;
+       int                 generation, resends = 0;
+       struct ldlm_reply     *lockrep;
+       enum lvb_type          lvb_type = 0;
+       ENTRY;
+
+       LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n",
+                einfo->ei_type);
+
+       fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+
+       if (it) {
+               saved_flags |= LDLM_FL_HAS_INTENT;
+               if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR))
+                       policy = &update_policy;
+               else if (it->it_op & IT_LAYOUT)
+                       policy = &layout_policy;
+       }
+
+       LASSERT(reqp == NULL);
+
+       generation = obddev->u.cli.cl_import->imp_generation;
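+       /* Remember the import generation: it is bumped when the import
+        * reconnects, so a changed generation during a resend indicates an
+        * eviction happened in between. */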
+resend:
+       flags = saved_flags;
+       if (!it) {
+               /* The only way right now is FLOCK; in this case we hide the
+                  flock policy as lmm, but lmmsize is 0 */
+               LASSERT(lmm && lmmsize == 0);
+               LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n",
+                        einfo->ei_type);
+               policy = (ldlm_policy_data_t *)lmm;
+               res_id.name[3] = LDLM_FLOCK;
+       } else if (it->it_op & IT_OPEN) {
+               req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize,
+                                          einfo->ei_cbdata);
+               policy = &update_policy;
+               einfo->ei_cbdata = NULL;
+               lmm = NULL;
+       } else if (it->it_op & IT_UNLINK) {
+               req = mdc_intent_unlink_pack(exp, it, op_data);
+       } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
+               req = mdc_intent_getattr_pack(exp, it, op_data);
+       } else if (it->it_op & IT_READDIR) {
+               req = mdc_enqueue_pack(exp, 0);
+       } else if (it->it_op & IT_LAYOUT) {
+               if (!imp_connect_lvb_type(class_exp2cliimp(exp)))
+                       RETURN(-EOPNOTSUPP);
+
+               req = mdc_intent_layout_pack(exp, it, op_data);
+               lvb_type = LVB_T_LAYOUT;
+       } else {
+               LBUG();
+               RETURN(-EINVAL);
+       }
+
+       if (IS_ERR(req))
+               RETURN(PTR_ERR(req));
+
+       if (req != NULL && it && it->it_op & IT_CREAT)
+               /* ask ptlrpc not to resend on EINPROGRESS since we have our own
+                * retry logic */
+               req->rq_no_retry_einprogress = 1;
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+
+       /* It is important to obtain rpc_lock first (if applicable), so that
+        * threads that are serialised with rpc_lock are not polluting our
+        * rpcs in flight counter. We do not do flock request limiting, though. */
+       if (it) {
+               mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+               rc = mdc_enter_request(&obddev->u.cli);
+               if (rc != 0) {
+                       mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+                       mdc_clear_replay_flag(req, 0);
+                       ptlrpc_req_finished(req);
+                       RETURN(rc);
+               }
+       }
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL,
+                             0, lvb_type, lockh, 0);
+       if (!it) {
+               /* For flock requests we immediately return without further
+                  delay and let the caller deal with the rest, since the
+                  metadata processing in the rest of this function makes no
+                  sense for flock requests anyway */
+               RETURN(rc);
+       }
+
+       mdc_exit_request(&obddev->u.cli);
+       mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
+
+       if (rc < 0) {
+               CERROR("ldlm_cli_enqueue: %d\n", rc);
+               mdc_clear_replay_flag(req, rc);
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
+       LASSERT(lockrep != NULL);
+
+       /* Retry the create infinitely when we get -EINPROGRESS from
+        * server. This is required by the new quota design. */
+       if (it && it->it_op & IT_CREAT &&
+           (int)lockrep->lock_policy_res2 == -EINPROGRESS) {
+               mdc_clear_replay_flag(req, rc);
+               ptlrpc_req_finished(req);
+               resends++;
+
+               CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n",
+                      obddev->obd_name, resends, it->it_op,
+                      PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+               if (generation == obddev->u.cli.cl_import->imp_generation) {
+                       goto resend;
+               } else {
+                       CDEBUG(D_HA, "resend cross eviction\n");
+                       RETURN(-EIO);
+               }
+       }
+
+       rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+       if (rc < 0) {
+               if (lustre_handle_is_used(lockh)) {
+                       ldlm_lock_decref(lockh, einfo->ei_mode);
+                       memset(lockh, 0, sizeof(*lockh));
+               }
+               ptlrpc_req_finished(req);
+       }
+       RETURN(rc);
+}
+
+static int mdc_finish_intent_lock(struct obd_export *exp,
+                                 struct ptlrpc_request *request,
+                                 struct md_op_data *op_data,
+                                 struct lookup_intent *it,
+                                 struct lustre_handle *lockh)
+{
+       struct lustre_handle old_lock;
+       struct mdt_body *mdt_body;
+       struct ldlm_lock *lock;
+       int rc;
+
+
+       LASSERT(request != NULL);
+       LASSERT(request != LP_POISON);
+       LASSERT(request->rq_repmsg != LP_POISON);
+
+       if (!it_disposition(it, DISP_IT_EXECD)) {
+               /* The server failed before it even started executing the
+                * intent, e.g. because it could not unpack the request. */
+               LASSERT(it->d.lustre.it_status != 0);
+               RETURN(it->d.lustre.it_status);
+       }
+       rc = it_open_error(DISP_IT_EXECD, it);
+       if (rc)
+               RETURN(rc);
+
+       mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+       LASSERT(mdt_body != NULL);      /* mdc_enqueue checked */
+
+       /* If we were revalidating a fid/name pair, mark the intent in
+        * case we fail and get called again from lookup */
+       if (fid_is_sane(&op_data->op_fid2) &&
+           it->it_create_mode & M_CHECK_STALE &&
+           it->it_op != IT_GETATTR) {
+               it_set_disposition(it, DISP_ENQ_COMPLETE);
+
+               /* Also: did we find the same inode? */
+               /* sever can return one of two fids:
+                * op_fid2 - new allocated fid - if file is created.
+                * op_fid3 - existent fid - if file only open.
+                * op_fid3 is saved in lmv_intent_open */
+               if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) &&
+                   (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) {
+                       CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID
+                              "\n", PFID(&op_data->op_fid2),
+                              PFID(&op_data->op_fid2), PFID(&mdt_body->fid1));
+                       RETURN(-ESTALE);
+               }
+       }
+
+       rc = it_open_error(DISP_LOOKUP_EXECD, it);
+       if (rc)
+               RETURN(rc);
+
+       /* Keep requests around for the multiple phases of the call;
+        * this means the DISP_XX bits must guarantee we make it into
+        * the call.
+        */
+       if (!it_disposition(it, DISP_ENQ_CREATE_REF) &&
+           it_disposition(it, DISP_OPEN_CREATE) &&
+           !it_open_error(DISP_OPEN_CREATE, it)) {
+               it_set_disposition(it, DISP_ENQ_CREATE_REF);
+               ptlrpc_request_addref(request); /* balanced in ll_create_node */
+       }
+       if (!it_disposition(it, DISP_ENQ_OPEN_REF) &&
+           it_disposition(it, DISP_OPEN_OPEN) &&
+           !it_open_error(DISP_OPEN_OPEN, it)) {
+               it_set_disposition(it, DISP_ENQ_OPEN_REF);
+               ptlrpc_request_addref(request); /* balanced in ll_file_open */
+               /* BUG 11546 - eviction in the middle of open rpc processing */
+               OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout);
+       }
+
+       if (it->it_op & IT_CREAT) {
+               /* XXX this belongs in ll_create_it */
+       } else if (it->it_op == IT_OPEN) {
+               LASSERT(!it_disposition(it, DISP_OPEN_CREATE));
+       } else {
+               LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT));
+       }
+
+       /* If we already have a matching lock, then cancel the new
+        * one.  We have to set the data here instead of in
+        * mdc_enqueue, because we need to use the child's inode as
+        * the l_ast_data to match, and that's not available until
+        * intent_finish has performed the iget(). */
+       lock = ldlm_handle2lock(lockh);
+       if (lock) {
+               ldlm_policy_data_t policy = lock->l_policy_data;
+               LDLM_DEBUG(lock, "matching against this");
+
+               LASSERTF(fid_res_name_eq(&mdt_body->fid1,
+                                        &lock->l_resource->lr_name),
+                        "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n",
+                        (unsigned long)lock->l_resource->lr_name.name[0],
+                        (unsigned long)lock->l_resource->lr_name.name[1],
+                        (unsigned long)lock->l_resource->lr_name.name[2],
+                        (unsigned long)fid_seq(&mdt_body->fid1),
+                        (unsigned long)fid_oid(&mdt_body->fid1),
+                        (unsigned long)fid_ver(&mdt_body->fid1));
+               LDLM_LOCK_PUT(lock);
+
+               memcpy(&old_lock, lockh, sizeof(*lockh));
+               if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
+                                   LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
+                       ldlm_lock_decref_and_cancel(lockh,
+                                                   it->d.lustre.it_lock_mode);
+                       memcpy(lockh, &old_lock, sizeof(old_lock));
+                       it->d.lustre.it_lock_handle = lockh->cookie;
+               }
+       }
+       CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n",
+              op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op),
+              it->d.lustre.it_status, it->d.lustre.it_disposition, rc);
+       RETURN(rc);
+}
+
+int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
+                       struct lu_fid *fid, __u64 *bits)
+{
+       /* We could just return 1 immediately, but since we should only
+        * be called in revalidate_it if we already have a lock, let's
+        * verify that. */
+       struct ldlm_res_id res_id;
+       struct lustre_handle lockh;
+       ldlm_policy_data_t policy;
+       ldlm_mode_t mode;
+       ENTRY;
+
+       if (it->d.lustre.it_lock_handle) {
+               lockh.cookie = it->d.lustre.it_lock_handle;
+               mode = ldlm_revalidate_lock_handle(&lockh, bits);
+       } else {
+               fid_build_reg_res_name(fid, &res_id);
+               switch (it->it_op) {
+               case IT_GETATTR:
+                       policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+                       break;
+               case IT_LAYOUT:
+                       policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT;
+                       break;
+               default:
+                       policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP;
+                       break;
+               }
+               mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                      LDLM_FL_BLOCK_GRANTED, &res_id,
+                                      LDLM_IBITS, &policy,
+                                      LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0);
+       }
+
+       if (mode) {
+               it->d.lustre.it_lock_handle = lockh.cookie;
+               it->d.lustre.it_lock_mode = mode;
+       } else {
+               it->d.lustre.it_lock_handle = 0;
+               it->d.lustre.it_lock_mode = 0;
+       }
+
+       RETURN(!!mode);
+}
+
+/*
+ * This long block is all about fixing up the lock and request state
+ * so that it is correct as of the moment _before_ the operation was
+ * applied; that way, the VFS will think that everything is normal and
+ * call Lustre's regular VFS methods.
+ *
+ * If we're performing a creation, that means that unless the creation
+ * failed with EEXIST, we should fake up a negative dentry.
+ *
+ * For everything else, we want the lookup to succeed.
+ *
+ * One additional note: if CREATE or OPEN succeeded, we add an extra
+ * reference to the request because we need to keep it around until
+ * ll_create/ll_open gets called.
+ *
+ * The server will return to us, in it_disposition, an indication of
+ * exactly what d.lustre.it_status refers to.
+ *
+ * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * otherwise if DISP_OPEN_CREATE is set, then it status is the
+ * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
+ * DISP_LOOKUP_POS will be set, indicating whether the child lookup
+ * was successful.
+ *
+ * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * child lookup.
+ */
+int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
+                   void *lmm, int lmmsize, struct lookup_intent *it,
+                   int lookup_flags, struct ptlrpc_request **reqp,
+                   ldlm_blocking_callback cb_blocking,
+                   __u64 extra_lock_flags)
+{
+       struct lustre_handle lockh;
+       int rc = 0;
+       ENTRY;
+       LASSERT(it);
+
+       CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID
+              ", intent: %s flags %#o\n", op_data->op_namelen,
+              op_data->op_name, PFID(&op_data->op_fid2),
+              PFID(&op_data->op_fid1), ldlm_it2str(it->it_op),
+              it->it_flags);
+
+       lockh.cookie = 0;
+       if (fid_is_sane(&op_data->op_fid2) &&
+           (it->it_op & (IT_LOOKUP | IT_GETATTR))) {
+               /* We could just return 1 immediately, but since we should only
+                * be called in revalidate_it if we already have a lock, let's
+                * verify that. */
+               it->d.lustre.it_lock_handle = 0;
+               rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL);
+               /* Only return failure if it was not GETATTR by cfid
+                * (from inode_revalidate). */
+               if (rc || op_data->op_namelen != 0)
+                       RETURN(rc);
+       }
+
+       /* lookup_it may be called only after revalidate_it has run, because
+        * revalidate_it cannot return errors, only zero.  Returning zero causes
+        * this call to lookup, which *can* return an error.
+        *
+        * We only want to execute the request associated with the intent one
+        * time, however, so don't send the request again.  Instead, skip past
+        * this and use the request from revalidate.  In this case, revalidate
+        * never dropped its reference, so the refcounts are all OK */
+       if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
+               struct ldlm_enqueue_info einfo =
+                       { LDLM_IBITS, it_to_lock_mode(it), cb_blocking,
+                         ldlm_completion_ast, NULL, NULL, NULL };
+
+               /* In case the upper layer did not allocate the fid, do it now. */
+               if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) {
+                       rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+                       if (rc < 0) {
+                               CERROR("Can't alloc new fid, rc %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+               rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh,
+                                lmm, lmmsize, NULL, extra_lock_flags);
+               if (rc < 0)
+                       RETURN(rc);
+       } else if (!fid_is_sane(&op_data->op_fid2) ||
+                  !(it->it_create_mode & M_CHECK_STALE)) {
+               /* DISP_ENQ_COMPLETE set means there is extra reference on
+                * request referenced from this intent, saved for subsequent
+                * lookup.  This path is executed when we proceed to this
+                * lookup, so we clear DISP_ENQ_COMPLETE */
+               it_clear_disposition(it, DISP_ENQ_COMPLETE);
+       }
+       *reqp = it->d.lustre.it_data;
+       rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh);
+       RETURN(rc);
+}
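+
+/*
+ * Illustrative sketch, not part of the original patch (guarded out):
+ * reading the disposition bits described in the comment above
+ * mdc_intent_lock().  The helper name is hypothetical; it_disposition()
+ * is the helper used throughout this file.
+ */
+#if 0
+static int example_intent_rc(struct lookup_intent *it)
+{
+       if (it_disposition(it, DISP_OPEN_OPEN))
+               return it->d.lustre.it_status;  /* rc of the open() call */
+       if (it_disposition(it, DISP_OPEN_CREATE))
+               return it->d.lustre.it_status;  /* rc of the create */
+       if (it_disposition(it, DISP_LOOKUP_EXECD))
+               return it->d.lustre.it_status;  /* rc of the child lookup */
+       /* DISP_LOOKUP_NEG/DISP_LOOKUP_POS only say if the child exists. */
+       return 0;
+}
+#endif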
+
+static int mdc_intent_getattr_async_interpret(const struct lu_env *env,
+                                             struct ptlrpc_request *req,
+                                             void *args, int rc)
+{
+       struct mdc_getattr_args  *ga = args;
+       struct obd_export       *exp = ga->ga_exp;
+       struct md_enqueue_info   *minfo = ga->ga_minfo;
+       struct ldlm_enqueue_info *einfo = ga->ga_einfo;
+       struct lookup_intent     *it;
+       struct lustre_handle     *lockh;
+       struct obd_device       *obddev;
+       __u64                flags = LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       it    = &minfo->mi_it;
+       lockh = &minfo->mi_lockh;
+
+       obddev = class_exp2obd(exp);
+
+       mdc_exit_request(&obddev->u.cli);
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+               rc = -ETIMEDOUT;
+
+       rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
+                                  &flags, NULL, 0, lockh, rc);
+       if (rc < 0) {
+               CERROR("ldlm_cli_enqueue_fini: %d\n", rc);
+               mdc_clear_replay_flag(req, rc);
+               GOTO(out, rc);
+       }
+
+       rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh);
+       EXIT;
+
+out:
+       OBD_FREE_PTR(einfo);
+       minfo->mi_cb(req, minfo, rc);
+       return 0;
+}
+
+int mdc_intent_getattr_async(struct obd_export *exp,
+                            struct md_enqueue_info *minfo,
+                            struct ldlm_enqueue_info *einfo)
+{
+       struct md_op_data       *op_data = &minfo->mi_data;
+       struct lookup_intent    *it = &minfo->mi_it;
+       struct ptlrpc_request   *req;
+       struct mdc_getattr_args *ga;
+       struct obd_device       *obddev = class_exp2obd(exp);
+       struct ldlm_res_id       res_id;
+       /* XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed
+        *      for statahead currently.  With CMD in the future these two bits
+        *      may be managed by different MDSes and should be adjusted then. */
+       ldlm_policy_data_t       policy = {
+                                       .l_inodebits = { MDS_INODELOCK_LOOKUP |
+                                                        MDS_INODELOCK_UPDATE }
+                                };
+       int                   rc = 0;
+       __u64               flags = LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       CDEBUG(D_DLMTRACE,"name: %.*s in inode "DFID", intent: %s flags %#o\n",
+              op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
+              ldlm_it2str(it->it_op), it->it_flags);
+
+       fid_build_reg_res_name(&op_data->op_fid1, &res_id);
+       req = mdc_intent_getattr_pack(exp, it, op_data);
+       if (!req)
+               RETURN(-ENOMEM);
+
+       rc = mdc_enter_request(&obddev->u.cli);
+       if (rc != 0) {
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL,
+                             0, LVB_T_NONE, &minfo->mi_lockh, 1);
+       if (rc < 0) {
+               mdc_exit_request(&obddev->u.cli);
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args));
+       ga = ptlrpc_req_async_args(req);
+       ga->ga_exp = exp;
+       ga->ga_minfo = minfo;
+       ga->ga_einfo = einfo;
+
+       req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+
+       RETURN(0);
+}
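+
+/*
+ * Illustrative sketch, not part of the original patch (guarded out): how
+ * a statahead-style caller might drive mdc_intent_getattr_async().  The
+ * wrapper name is hypothetical; note that einfo must be allocated with
+ * OBD_ALLOC_PTR(), since the interpret callback above frees it with
+ * OBD_FREE_PTR().
+ */
+#if 0
+static int example_getattr_async(struct obd_export *exp,
+                                 struct md_enqueue_info *minfo,
+                                 struct ldlm_enqueue_info *einfo)
+{
+       minfo->mi_it.it_op = IT_GETATTR;
+       /* minfo->mi_data names the parent fid and child name, and
+        * minfo->mi_cb runs from the interpret callback above;
+        * einfo->ei_type/ei_mode are consumed there as well. */
+       return mdc_intent_getattr_async(exp, minfo, einfo);
+}
+#endif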
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c
new file mode 100644 (file)
index 0000000..5e25a07
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/kernel.h>
+
+#include <obd_class.h>
+#include "mdc_internal.h"
+#include <lustre_fid.h>
+
+/* mdc_setattr does its own semaphore handling */
+static int mdc_reint(struct ptlrpc_request *request,
+                    struct mdc_rpc_lock *rpc_lock,
+                    int level)
+{
+       int rc;
+
+       request->rq_send_state = level;
+
+       mdc_get_rpc_lock(rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(request);
+       mdc_put_rpc_lock(rpc_lock, NULL);
+       if (rc)
+               CDEBUG(D_INFO, "error in handling %d\n", rc);
+       else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY))
+               rc = -EPROTO;
+       return rc;
+}
+
+/* Find and cancel locally cached locks matching the inode @bits and @mode in
+ * the resource found by @fid.  Found locks are added to the @cancels list.
+ * Returns the number of locks added. */
+int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid,
+                           struct list_head *cancels, ldlm_mode_t mode,
+                           __u64 bits)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       ldlm_policy_data_t policy = {{0}};
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       int count;
+       ENTRY;
+
+       /* Return early, i.e. cancel nothing, only if ELC is supported (flag in
+        * the export) but disabled through procfs (flag in the namespace).
+        *
+        * This is distinct from the case where ELC is not supported at all:
+        * there we still want to cancel locks in advance and just cancel them
+        * locally, without sending any RPC. */
+       if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+               RETURN(0);
+
+       fid_build_reg_res_name(fid, &res_id);
+       res = ldlm_resource_get(exp->exp_obd->obd_namespace,
+                               NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+       LDLM_RESOURCE_ADDREF(res);
+       /* Initialize ibits lock policy. */
+       policy.l_inodebits.bits = bits;
+       count = ldlm_cancel_resource_local(res, cancels, &policy,
+                                          mode, 0, 0, NULL);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(count);
+}
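+
+/*
+ * Illustrative sketch, not part of the original patch (guarded out): the
+ * early lock cancellation (ELC) pattern all reint callers below follow:
+ * collect unused locks, then either pack them into the new request or
+ * give them back if allocation fails.  The variables are fragments of
+ * those callers.
+ */
+#if 0
+       count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+                                       LCK_EX, MDS_INODELOCK_UPDATE);
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+       if (req == NULL) {
+               /* nothing consumed the list; give the lock refs back */
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       /* packs the collected cancels into the reint request */
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+#endif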
+
+int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
+               void *ea, int ealen, void *ea2, int ea2len,
+               struct ptlrpc_request **request, struct md_open_data **mod)
+{
+       LIST_HEAD(cancels);
+       struct ptlrpc_request *req;
+       struct mdc_rpc_lock *rpc_lock;
+       struct obd_device *obd = exp->exp_obd;
+       int count = 0, rc;
+       __u64 bits;
+       ENTRY;
+
+       LASSERT(op_data != NULL);
+
+       bits = MDS_INODELOCK_UPDATE;
+       if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+               bits |= MDS_INODELOCK_LOOKUP;
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX, bits);
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_SETATTR);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0)
+               req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT,
+                                    0);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT,
+                            ea2len);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       rpc_lock = obd->u.cli.cl_rpc_lock;
+
+       if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME))
+               CDEBUG(D_INODE, "setting mtime "CFS_TIME_T
+                      ", ctime "CFS_TIME_T"\n",
+                      LTIME_S(op_data->op_attr.ia_mtime),
+                      LTIME_S(op_data->op_attr.ia_ctime));
+       mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len);
+
+       ptlrpc_request_set_replen(req);
+       if (mod && (op_data->op_flags & MF_EPOCH_OPEN) &&
+           req->rq_import->imp_replayable)
+       {
+               LASSERT(*mod == NULL);
+
+               *mod = obd_mod_alloc();
+               if (*mod == NULL) {
+                       DEBUG_REQ(D_ERROR, req, "Can't allocate md_open_data");
+               } else {
+                       req->rq_replay = 1;
+                       req->rq_cb_data = *mod;
+                       (*mod)->mod_open_req = req;
+                       req->rq_commit_cb = mdc_commit_open;
+                       /**
+                        * Take an extra reference on \var mod, it protects \var
+                        * mod from being freed on eviction (commit callback is
+                        * called despite rq_replay flag).
+                        * Will be put on mdc_done_writing().
+                        */
+                       obd_mod_get(*mod);
+               }
+       }
+
+       rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL);
+
+       /* Save the obtained info in the original RPC for the replay case. */
+       if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) {
+               struct mdt_ioepoch *epoch;
+               struct mdt_body  *body;
+
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               LASSERT(epoch != NULL);
+               LASSERT(body != NULL);
+               epoch->handle = body->handle;
+               epoch->ioepoch = body->ioepoch;
+               req->rq_replay_cb = mdc_replay_open;
+       /* bug 3633: the open may already be committed, in which case an
+        * -ESTALE answer is not an error */
+       } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) {
+               rc = 0;
+       } else if (rc == -ERESTARTSYS) {
+               rc = 0;
+       }
+       *request = req;
+       if (rc && req->rq_commit_cb) {
+               /* Put an extra reference on \var mod on error case. */
+               obd_mod_put(*mod);
+               req->rq_commit_cb(req);
+       }
+       RETURN(rc);
+}
+
+int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
+              const void *data, int datalen, int mode, __u32 uid, __u32 gid,
+              cfs_cap_t cap_effective, __u64 rdev,
+              struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int level, rc;
+       int count, resends = 0;
+       struct obd_import *import = exp->exp_obd->u.cli.cl_import;
+       int generation = import->imp_generation;
+       LIST_HEAD(cancels);
+       ENTRY;
+
+       /* For case if upper layer did not alloc fid, do it now. */
+       if (!fid_is_sane(&op_data->op_fid2)) {
+               /*
+                * mdc_fid_alloc() may return rc == 1 when switching to a new
+                * sequence; handle this case.
+                */
+               rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data);
+               if (rc < 0) {
+                       CERROR("Can't alloc new fid, rc %d\n", rc);
+                       RETURN(rc);
+               }
+       }
+
+rebuild:
+       count = 0;
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_CREATE_RMT_ACL);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                            data && datalen ? datalen : 0);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /*
+        * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with
+        * tgt, for symlinks or lov MD data.
+        */
+       mdc_create_pack(req, op_data, data, datalen, mode, uid,
+                       gid, cap_effective, rdev);
+
+       ptlrpc_request_set_replen(req);
+
+       /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry
+        * logic here */
+       req->rq_no_retry_einprogress = 1;
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+       level = LUSTRE_IMP_FULL;
+ resend:
+       rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
+
+       /* Resend if we were told to. */
+       if (rc == -ERESTARTSYS) {
+               level = LUSTRE_IMP_RECOVER;
+               goto resend;
+       } else if (rc == -EINPROGRESS) {
+               /* Retry the create indefinitely until it succeeds or fails
+                * with a different error code. */
+               ptlrpc_req_finished(req);
+               resends++;
+
+               CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n",
+                      exp->exp_obd->obd_name, resends,
+                      PFID(&op_data->op_fid1), PFID(&op_data->op_fid2));
+
+               if (generation == import->imp_generation) {
+                       goto rebuild;
+               } else {
+                       CDEBUG(D_HA, "resend cross eviction\n");
+                       RETURN(-EIO);
+               }
+       } else if (rc == 0) {
+               struct mdt_body *body;
+               struct lustre_capa *capa;
+
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               LASSERT(body);
+               if (body->valid & OBD_MD_FLMDSCAPA) {
+                       capa = req_capsule_server_get(&req->rq_pill,
+                                                     &RMF_CAPA1);
+                       if (capa == NULL)
+                               rc = -EPROTO;
+               }
+       }
+
+       *request = req;
+       RETURN(rc);
+}
+
+int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
+              struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req = *request;
+       int count = 0, rc;
+       ENTRY;
+
+       LASSERT(req == NULL);
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+           (fid_is_sane(&op_data->op_fid3)) &&
+           !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_FULL);
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_UNLINK);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_unlink_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+
+       *request = req;
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+       RETURN(rc);
+}
+
+int mdc_link(struct obd_export *exp, struct md_op_data *op_data,
+            struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req;
+       int count = 0, rc;
+       ENTRY;
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+           (fid_is_sane(&op_data->op_fid2)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_UPDATE);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_link_pack(req, op_data);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       *request = req;
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+
+       RETURN(rc);
+}
+
+int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
+              const char *old, int oldlen, const char *new, int newlen,
+              struct ptlrpc_request **request)
+{
+       LIST_HEAD(cancels);
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req;
+       int count = 0, rc;
+       ENTRY;
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           (fid_is_sane(&op_data->op_fid1)))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID2) &&
+           (fid_is_sane(&op_data->op_fid2)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid2,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_UPDATE);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
+           (fid_is_sane(&op_data->op_fid3)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid3,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_LOOKUP);
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID4) &&
+            (fid_is_sane(&op_data->op_fid4)))
+               count += mdc_resource_get_unused(exp, &op_data->op_fid4,
+                                                &cancels, LCK_EX,
+                                                MDS_INODELOCK_FULL);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_RENAME);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1);
+       req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (exp_connect_cancelset(exp) && req)
+               ldlm_cli_cancel_list(&cancels, count, req, 0);
+
+       mdc_rename_pack(req, op_data, old, oldlen, new, newlen);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
+       *request = req;
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
new file mode 100644 (file)
index 0000000..3cf9d8d
--- /dev/null
@@ -0,0 +1,2753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDC
+
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+# include <linux/utsname.h>
+
+#include <lustre_acl.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+#include <lustre_log.h>
+
+#include "mdc_internal.h"
+
+#define REQUEST_MINOR 244
+
+struct mdc_renew_capa_args {
+       struct obd_capa *ra_oc;
+       renew_capa_cb_t  ra_cb;
+};
+
+static int mdc_cleanup(struct obd_device *obd);
+
+int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req,
+                   const struct req_msg_field *field, struct obd_capa **oc)
+{
+       struct lustre_capa *capa;
+       struct obd_capa *c;
+       ENTRY;
+
+       /* swabbed already in mdc_enqueue */
+       capa = req_capsule_server_get(&req->rq_pill, field);
+       if (capa == NULL)
+               RETURN(-EPROTO);
+
+       c = alloc_capa(CAPA_SITE_CLIENT);
+       if (IS_ERR(c)) {
+               CDEBUG(D_INFO, "alloc capa failed!\n");
+               RETURN(PTR_ERR(c));
+       } else {
+               c->c_capa = *capa;
+               *oc = c;
+               RETURN(0);
+       }
+}
+
+static inline int mdc_queue_wait(struct ptlrpc_request *req)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+       int rc;
+
+       /* mdc_enter_request() ensures that this client has no more
+        * than cl_max_rpcs_in_flight RPCs simultaneously in flight
+        * against an MDT. */
+       rc = mdc_enter_request(cli);
+       if (rc != 0)
+               return rc;
+
+       rc = ptlrpc_queue_wait(req);
+       mdc_exit_request(cli);
+
+       return rc;
+}
+
+/* Helper that implements most of mdc_getstatus and signal_completed_replay. */
+/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */
+static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid,
+                         struct obd_capa **pc, int level, int msg_flags)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS,
+                                       LUSTRE_MDS_VERSION, MDS_GETSTATUS);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0);
+       lustre_msg_add_flags(req->rq_reqmsg, msg_flags);
+       req->rq_send_state = level;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       *rootfid = body->fid1;
+       CDEBUG(D_NET,
+              "root fid="DFID", last_committed="LPU64"\n",
+              PFID(rootfid),
+              lustre_msg_get_last_committed(req->rq_repmsg));
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* This should be mdc_get_info("rootfid") */
+int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
+                 struct obd_capa **pc)
+{
+       return send_getstatus(class_exp2cliimp(exp), rootfid, pc,
+                             LUSTRE_IMP_FULL, 0);
+}
+
+/*
+ * This function is known to always say that it will receive 4 buffers
+ * from the server.  Even when acl_size and md_size are zero, the RPC header
+ * will contain 4 fields and the RPC itself will contain zero-size fields.
+ * This is because mdt_getattr*() _always_ returns 4 fields, but when the acl
+ * is not needed (and thus zero), the server shrinks that field to zero size.
+ * The same goes for md_size.  This causes problems when the client waits for
+ * a smaller number of fields, and will be fixed later when the client
+ * becomes aware of RPC layouts.  --umka
+ */
+static int mdc_getattr_common(struct obd_export *exp,
+                             struct ptlrpc_request *req)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       struct mdt_body    *body;
+       void           *eadata;
+       int              rc;
+       ENTRY;
+
+       /* Request message already built. */
+       rc = ptlrpc_queue_wait(req);
+       if (rc != 0)
+               RETURN(rc);
+
+       /* sanity check for the reply */
+       body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               RETURN(-EPROTO);
+
+       CDEBUG(D_NET, "mode: %o\n", body->mode);
+
+       if (body->eadatasize != 0) {
+               mdc_update_max_ea_from_body(exp, body);
+
+               eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                     body->eadatasize);
+               if (eadata == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       if (body->valid & OBD_MD_FLRMTPERM) {
+               struct mdt_remote_perm *perm;
+
+               LASSERT(client_is_remote(exp));
+               perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+               if (perm == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       if (body->valid & OBD_MD_FLMDSCAPA) {
+               struct lustre_capa *capa;
+               capa = req_capsule_server_get(pill, &RMF_CAPA1);
+               if (capa == NULL)
+                       RETURN(-EPROTO);
+       }
+
+       RETURN(0);
+}
+
+int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data,
+               struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       /* Single MDS without an LMV case */
+       if (op_data->op_flags & MF_GET_MDT_IDX) {
+               op_data->op_mds = 0;
+               RETURN(0);
+       }
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     op_data->op_valid, op_data->op_mode, -1, 0);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            op_data->op_mode);
+       if (op_data->op_valid & OBD_MD_FLRMTPERM) {
+               LASSERT(client_is_remote(exp));
+               req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                                    sizeof(struct mdt_remote_perm));
+       }
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_getattr_common(exp, req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
+                    struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_GETATTR_NAME);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                            op_data->op_namelen + 1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     op_data->op_valid, op_data->op_mode,
+                     op_data->op_suppgids[0], 0);
+
+       if (op_data->op_name) {
+               char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               LASSERT(strnlen(op_data->op_name, op_data->op_namelen) ==
+                               op_data->op_namelen);
+               memcpy(name, op_data->op_name, op_data->op_namelen);
+       }
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            op_data->op_mode);
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_getattr_common(exp, req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+static int mdc_is_subdir(struct obd_export *exp,
+                        const struct lu_fid *pfid,
+                        const struct lu_fid *cfid,
+                        struct ptlrpc_request **request)
+{
+       struct ptlrpc_request  *req;
+       int                  rc;
+
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION,
+                                       MDS_IS_SUBDIR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_is_subdir_pack(req, pfid, cfid, 0);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc && rc != -EREMOTE)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+static int mdc_xattr_common(struct obd_export *exp,
+                           const struct req_format *fmt,
+                           const struct lu_fid *fid,
+                           struct obd_capa *oc, int opcode, obd_valid valid,
+                           const char *xattr_name, const char *input,
+                           int input_size, int output_size, int flags,
+                           __u32 suppgid, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int   xattr_namelen = 0;
+       char *tmp;
+       int   rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+       if (xattr_name) {
+               xattr_namelen = strlen(xattr_name) + 1;
+               req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                                    xattr_namelen);
+       }
+       if (input_size) {
+               LASSERT(input);
+               req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT,
+                                    input_size);
+       }
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (opcode == MDS_REINT) {
+               struct mdt_rec_setxattr *rec;
+
+               CLASSERT(sizeof(struct mdt_rec_setxattr) ==
+                        sizeof(struct mdt_rec_reint));
+               rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+               rec->sx_opcode = REINT_SETXATTR;
+               /* TODO:
+                *  cfs_curproc_fs{u,g}id() should replace
+                *  current->fs{u,g}id for portability.
+                */
+               rec->sx_fsuid  = current_fsuid();
+               rec->sx_fsgid  = current_fsgid();
+               rec->sx_cap    = cfs_curproc_cap_pack();
+               rec->sx_suppgid1 = suppgid;
+               rec->sx_suppgid2 = -1;
+               rec->sx_fid    = *fid;
+               rec->sx_valid  = valid | OBD_MD_FLCTIME;
+               rec->sx_time   = cfs_time_current_sec();
+               rec->sx_size   = output_size;
+               rec->sx_flags  = flags;
+
+               mdc_pack_capa(req, &RMF_CAPA1, oc);
+       } else {
+               mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags);
+       }
+
+       if (xattr_name) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               memcpy(tmp, xattr_name, xattr_namelen);
+       }
+       if (input_size) {
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA);
+               memcpy(tmp, input, input_size);
+       }
+
+       if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER))
+               req_capsule_set_size(&req->rq_pill, &RMF_EADATA,
+                                    RCL_SERVER, output_size);
+       ptlrpc_request_set_replen(req);
+
+       /* make rpc */
+       if (opcode == MDS_REINT)
+               mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       rc = ptlrpc_queue_wait(req);
+
+       if (opcode == MDS_REINT)
+               mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+                struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+                const char *input, int input_size, int output_size,
+                int flags, __u32 suppgid, struct ptlrpc_request **request)
+{
+       return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR,
+                               fid, oc, MDS_REINT, valid, xattr_name,
+                               input, input_size, output_size, flags,
+                               suppgid, request);
+}
+
+int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid,
+                struct obd_capa *oc, obd_valid valid, const char *xattr_name,
+                const char *input, int input_size, int output_size,
+                int flags, struct ptlrpc_request **request)
+{
+       return mdc_xattr_common(exp, &RQF_MDS_GETXATTR,
+                               fid, oc, MDS_GETXATTR, valid, xattr_name,
+                               input, input_size, output_size, flags,
+                               -1, request);
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
+{
+       struct req_capsule     *pill = &req->rq_pill;
+       struct mdt_body *body = md->body;
+       struct posix_acl       *acl;
+       void               *buf;
+       int                  rc;
+       ENTRY;
+
+       if (!body->aclsize)
+               RETURN(0);
+
+       buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize);
+
+       if (!buf)
+               RETURN(-EPROTO);
+
+       acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize);
+       if (IS_ERR(acl)) {
+               rc = PTR_ERR(acl);
+               CERROR("convert xattr to acl: %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = posix_acl_valid(acl);
+       if (rc) {
+               CERROR("validate acl: %d\n", rc);
+               posix_acl_release(acl);
+               RETURN(rc);
+       }
+
+       md->posix_acl = acl;
+       RETURN(0);
+}
+#else
+#define mdc_unpack_acl(req, md) 0
+#endif
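+
+/*
+ * Editor's note, not part of the original patch: the #else stub above
+ * lets callers such as mdc_get_lustre_md() below invoke mdc_unpack_acl()
+ * unconditionally; without CONFIG_FS_POSIX_ACL it evaluates to 0.
+ */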
+
+int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req,
+                     struct obd_export *dt_exp, struct obd_export *md_exp,
+                     struct lustre_md *md)
+{
+       struct req_capsule *pill = &req->rq_pill;
+       int rc;
+       ENTRY;
+
+       LASSERT(md);
+       memset(md, 0, sizeof(*md));
+
+       md->body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+       LASSERT(md->body != NULL);
+
+       if (md->body->valid & OBD_MD_FLEASIZE) {
+               int lmmsize;
+               struct lov_mds_md *lmm;
+
+               if (!S_ISREG(md->body->mode)) {
+                       CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a "
+                              "regular file, but is not\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+
+               if (md->body->eadatasize == 0) {
+                       CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, "
+                              "but eadatasize 0\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+               lmmsize = md->body->eadatasize;
+               lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize);
+               if (!lmm)
+                       GOTO(out, rc = -EPROTO);
+
+               rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize);
+               if (rc < 0)
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*md->lsm)) {
+                       CDEBUG(D_INFO, "lsm size too small: "
+                              "rc < sizeof (*md->lsm) (%d < %d)\n",
+                              rc, (int)sizeof(*md->lsm));
+                       GOTO(out, rc = -EPROTO);
+               }
+
+       } else if (md->body->valid & OBD_MD_FLDIREA) {
+               int lmvsize;
+               struct lov_mds_md *lmv;
+
+               if (!S_ISDIR(md->body->mode)) {
+                       CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a "
+                              "directory, but is not\n");
+                       GOTO(out, rc = -EPROTO);
+               }
+
+               if (md->body->eadatasize == 0) {
+                       CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, "
+                              "but eadatasize 0\n");
+                       RETURN(-EPROTO);
+               }
+               if (md->body->valid & OBD_MD_MEA) {
+                       lmvsize = md->body->eadatasize;
+                       lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD,
+                                                          lmvsize);
+                       if (!lmv)
+                               GOTO(out, rc = -EPROTO);
+
+                       rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv,
+                                         lmvsize);
+                       if (rc < 0)
+                               GOTO(out, rc);
+
+                       if (rc < sizeof(*md->mea)) {
+                               CDEBUG(D_INFO, "size too small:  "
+                                      "rc < sizeof(*md->mea) (%d < %d)\n",
+                                       rc, (int)sizeof(*md->mea));
+                               GOTO(out, rc = -EPROTO);
+                       }
+               }
+       }
+       rc = 0;
+
+       if (md->body->valid & OBD_MD_FLRMTPERM) {
+               /* remote permission */
+               LASSERT(client_is_remote(exp));
+               md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL,
+                                               lustre_swab_mdt_remote_perm);
+               if (!md->remote_perm)
+                       GOTO(out, rc = -EPROTO);
+       } else if (md->body->valid & OBD_MD_FLACL) {
+               /* For an ACL, it is possible that FLACL is set but aclsize is
+                * zero: only when aclsize != 0 is there an actual segment for
+                * the ACL in the reply buffer.
+                */
+               if (md->body->aclsize) {
+                       rc = mdc_unpack_acl(req, md);
+                       if (rc)
+                               GOTO(out, rc);
+#ifdef CONFIG_FS_POSIX_ACL
+               } else {
+                       md->posix_acl = NULL;
+#endif
+               }
+       }
+       if (md->body->valid & OBD_MD_FLMDSCAPA) {
+               struct obd_capa *oc = NULL;
+
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc);
+               if (rc)
+                       GOTO(out, rc);
+               md->mds_capa = oc;
+       }
+
+       if (md->body->valid & OBD_MD_FLOSSCAPA) {
+               struct obd_capa *oc = NULL;
+
+               rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc);
+               if (rc)
+                       GOTO(out, rc);
+               md->oss_capa = oc;
+       }
+
+       EXIT;
+out:
+       if (rc) {
+               if (md->oss_capa) {
+                       capa_put(md->oss_capa);
+                       md->oss_capa = NULL;
+               }
+               if (md->mds_capa) {
+                       capa_put(md->mds_capa);
+                       md->mds_capa = NULL;
+               }
+#ifdef CONFIG_FS_POSIX_ACL
+               posix_acl_release(md->posix_acl);
+#endif
+               if (md->lsm)
+                       obd_free_memmd(dt_exp, &md->lsm);
+       }
+       return rc;
+}
+
+int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+       ENTRY;
+       RETURN(0);
+}
+
+/**
+ * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING
+ * RPC chains.
+ */
+void mdc_replay_open(struct ptlrpc_request *req)
+{
+       struct md_open_data *mod = req->rq_cb_data;
+       struct ptlrpc_request *close_req;
+       struct obd_client_handle *och;
+       struct lustre_handle old;
+       struct mdt_body *body;
+       ENTRY;
+
+       if (mod == NULL) {
+               DEBUG_REQ(D_ERROR, req,
+                         "Can't properly replay without open data.");
+               EXIT;
+               return;
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL);
+
+       och = mod->mod_och;
+       if (och != NULL) {
+               struct lustre_handle *file_fh;
+
+               LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
+
+               file_fh = &och->och_fh;
+               CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n",
+                      file_fh->cookie, body->handle.cookie);
+               old = *file_fh;
+               *file_fh = body->handle;
+       }
+       close_req = mod->mod_close_req;
+       if (close_req != NULL) {
+               __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg);
+               struct mdt_ioepoch *epoch;
+
+               LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING);
+               epoch = req_capsule_client_get(&close_req->rq_pill,
+                                              &RMF_MDT_EPOCH);
+               LASSERT(epoch);
+
+               if (och != NULL)
+                       LASSERT(!memcmp(&old, &epoch->handle, sizeof(old)));
+               DEBUG_REQ(D_HA, close_req, "updating close body with new fh");
+               epoch->handle = body->handle;
+       }
+       EXIT;
+}
+
+void mdc_commit_open(struct ptlrpc_request *req)
+{
+       struct md_open_data *mod = req->rq_cb_data;
+       if (mod == NULL)
+               return;
+
+       /**
+        * No need to touch md_open_data::mod_och here: it holds a reference
+        * on \var mod, and the two zero their references to each other later;
+        * \var mod is freed once md_open_data::mod_och puts its reference.
+        */
+
+       /**
+        * Do not let the open request disappear, as it may still be needed
+        * for the close RPC to happen (that is possible on evict only;
+        * otherwise ptlrpc_request::rq_replay does not let mdc_commit_open()
+        * be called).  Just mark this RPC as committed to distinguish these
+        * two cases; see mdc_close() for details.  The open request
+        * reference will be put along with freeing \var mod.
+        */
+       ptlrpc_request_addref(req);
+       spin_lock(&req->rq_lock);
+       req->rq_committed = 1;
+       spin_unlock(&req->rq_lock);
+       req->rq_cb_data = NULL;
+       obd_mod_put(mod);
+}
+
+int mdc_set_open_replay_data(struct obd_export *exp,
+                            struct obd_client_handle *och,
+                            struct ptlrpc_request *open_req)
+{
+       struct md_open_data   *mod;
+       struct mdt_rec_create *rec;
+       struct mdt_body       *body;
+       struct obd_import     *imp = open_req->rq_import;
+       ENTRY;
+
+       if (!open_req->rq_replay)
+               RETURN(0);
+
+       rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT);
+       body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(rec != NULL);
+       /* Incoming message in my byte order (it's been swabbed). */
+       /* Outgoing messages always in my byte order. */
+       LASSERT(body != NULL);
+
+       /* Only if the import is replayable do we set the replay_open data. */
+       if (och && imp->imp_replayable) {
+               mod = obd_mod_alloc();
+               if (mod == NULL) {
+                       DEBUG_REQ(D_ERROR, open_req,
+                                 "Can't allocate md_open_data");
+                       RETURN(0);
+               }
+
+               /**
+                * Take a reference on \var mod, to be freed on mdc_close().
+                * It protects \var mod from being freed on eviction (commit
+                * callback is called despite rq_replay flag).
+                * Another reference for \var och.
+                */
+               obd_mod_get(mod);
+               obd_mod_get(mod);
+
+               spin_lock(&open_req->rq_lock);
+               och->och_mod = mod;
+               mod->mod_och = och;
+               mod->mod_open_req = open_req;
+               open_req->rq_cb_data = mod;
+               open_req->rq_commit_cb = mdc_commit_open;
+               spin_unlock(&open_req->rq_lock);
+       }
+
+       rec->cr_fid2 = body->fid1;
+       rec->cr_ioepoch = body->ioepoch;
+       rec->cr_old_handle.cookie = body->handle.cookie;
+       open_req->rq_replay_cb = mdc_replay_open;
+       if (!fid_is_sane(&body->fid1)) {
+               DEBUG_REQ(D_ERROR, open_req, "Saving replay request with insane fid");
+               LBUG();
+       }
+
+       DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data");
+       RETURN(0);
+}
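+
+/*
+ * Editor's note (summary, not in the original patch): assuming
+ * obd_mod_alloc() returns with one reference, the two obd_mod_get()
+ * calls above leave three references on \var mod; they are dropped by
+ * mdc_commit_open() (the commit callback), by mdc_close() or
+ * mdc_done_writing(), and by mdc_clear_open_replay_data() below.
+ */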
+
+int mdc_clear_open_replay_data(struct obd_export *exp,
+                              struct obd_client_handle *och)
+{
+       struct md_open_data *mod = och->och_mod;
+       ENTRY;
+
+       /**
+        * \var mod may be absent if an eviction happened between the lookup
+        * and ll_file_open().
+        */
+       if (mod == NULL)
+               RETURN(0);
+
+       LASSERT(mod != LP_POISON);
+
+       mod->mod_och = NULL;
+       och->och_mod = NULL;
+       obd_mod_put(mod);
+
+       RETURN(0);
+}
+
+/* Prepares the request for the replay by the given reply */
+static void mdc_close_handle_reply(struct ptlrpc_request *req,
+                                  struct md_op_data *op_data, int rc)
+{
+       struct mdt_body  *repbody;
+       struct mdt_ioepoch *epoch;
+
+       if (req && rc == -EAGAIN) {
+               repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH);
+
+               epoch->flags |= MF_SOM_AU;
+               if (repbody->valid & OBD_MD_FLGETATTRLOCK)
+                       op_data->op_flags |= MF_GETATTR_LOCK;
+       }
+}
+
+int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
+             struct md_open_data *mod, struct ptlrpc_request **request)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_CLOSE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
+        * portal whose threads are not taking any DLM locks and are therefore
+        * always progressing */
+       req->rq_request_portal = MDS_READPAGE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       /* Ensure that this close's handle is fixed up during replay. */
+       if (likely(mod != NULL)) {
+               LASSERTF(mod->mod_open_req != NULL &&
+                        mod->mod_open_req->rq_type != LI_POISON,
+                        "POISONED open %p!\n", mod->mod_open_req);
+
+               mod->mod_close_req = req;
+
+               DEBUG_REQ(D_HA, mod->mod_open_req, "matched open");
+               /* We no longer want to preserve this open for replay even
+                * though the open was committed. b=3632, b=3633 */
+               spin_lock(&mod->mod_open_req->rq_lock);
+               mod->mod_open_req->rq_replay = 0;
+               spin_unlock(&mod->mod_open_req->rq_lock);
+       } else {
+               CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
+       }
+
+       mdc_close_pack(req, op_data);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_easize);
+       req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER,
+                            obd->u.cli.cl_max_mds_cookiesize);
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+       if (req->rq_repmsg == NULL) {
+               CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req,
+                      req->rq_status);
+               if (rc == 0)
+                       rc = req->rq_status ?: -EIO;
+       } else if (rc == 0 || rc == -EAGAIN) {
+               struct mdt_body *body;
+
+               rc = lustre_msg_get_status(req->rq_repmsg);
+               if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+                       DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err "
+                                 "= %d", rc);
+                       if (rc > 0)
+                               rc = -rc;
+               }
+               body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+               if (body == NULL)
+                       rc = -EPROTO;
+       } else if (rc == -ESTALE) {
+               /*
+                * ESTALE is an allowed error after b=3633 if the open was
+                * committed and the server failed before the close was sent.
+                * Check whether mod exists and return no error in that case.
+                */
+               if (mod) {
+                       DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc);
+                       LASSERT(mod->mod_open_req != NULL);
+                       if (mod->mod_open_req->rq_committed)
+                               rc = 0;
+               }
+       }
+
+       if (mod) {
+               if (rc != 0)
+                       mod->mod_close_req = NULL;
+               /* From now on mod is accessed through the open req only,
+                * so the close req no longer holds a reference on mod. */
+               obd_mod_put(mod);
+       }
+       *request = req;
+       mdc_close_handle_reply(req, op_data, rc);
+       RETURN(rc);
+}
+
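+/* Tell the MDT that writing to the file is finished, via an
+ * MDS_DONE_WRITING RPC.  Like mdc_close(), the request is linked through
+ * md_open_data to the setattr request it completes, for replay handling. */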
+int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data,
+                    struct md_open_data *mod)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_DONE_WRITING);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (mod != NULL) {
+               LASSERTF(mod->mod_open_req != NULL &&
+                        mod->mod_open_req->rq_type != LI_POISON,
+                        "POISONED setattr %p!\n", mod->mod_open_req);
+
+               mod->mod_close_req = req;
+               DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr");
+               /* We no longer want to preserve this setattr for replay even
+                * though the open was committed. b=3632, b=3633 */
+               spin_lock(&mod->mod_open_req->rq_lock);
+               mod->mod_open_req->rq_replay = 0;
+               spin_unlock(&mod->mod_open_req->rq_lock);
+       }
+
+       mdc_close_pack(req, op_data);
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL);
+
+       if (rc == -ESTALE) {
+               /*
+                * ESTALE is an allowed error after b=3633 if the open or
+                * setattr was committed and the server failed before the
+                * close was sent.  Check whether mod exists and return no
+                * error in that case.
+                */
+               if (mod) {
+                       LASSERT(mod->mod_open_req != NULL);
+                       if (mod->mod_open_req->rq_committed)
+                               rc = 0;
+               }
+       }
+
+       if (mod) {
+               if (rc != 0)
+                       mod->mod_close_req = NULL;
+               /* From now on mod is accessed through the setattr req only,
+                * so the DONE_WRITING req no longer holds a reference on
+                * mod. */
+               obd_mod_put(mod);
+       }
+
+       mdc_close_handle_reply(req, op_data, rc);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+
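+/* Read directory pages from the MDT using a bulk transfer.  On -ETIMEDOUT
+ * the request is re-sent after an interruptible backoff until
+ * client_should_resend() gives up. */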
+int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data,
+                struct page **pages, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request   *req;
+       struct ptlrpc_bulk_desc *desc;
+       int                   i;
+       wait_queue_head_t             waitq;
+       int                   resends = 0;
+       struct l_wait_info       lwi;
+       int                   rc;
+       ENTRY;
+
+       *request = NULL;
+       init_waitqueue_head(&waitq);
+
+restart_bulk:
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       req->rq_request_portal = MDS_READPAGE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK,
+                                   MDS_BULK_PORTAL);
+       if (desc == NULL) {
+               ptlrpc_request_free(req);
+               RETURN(-ENOMEM);
+       }
+
+       /* NB req now owns desc and will free it when it gets freed */
+       for (i = 0; i < op_data->op_npages; i++)
+               ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+       mdc_readdir_pack(req, op_data->op_offset,
+                        PAGE_CACHE_SIZE * op_data->op_npages,
+                        &op_data->op_fid1, op_data->op_capa1);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               ptlrpc_req_finished(req);
+               if (rc != -ETIMEDOUT)
+                       RETURN(rc);
+
+               resends++;
+               if (!client_should_resend(resends, &exp->exp_obd->u.cli)) {
+                       CERROR("too many resend retries, returning error\n");
+                       RETURN(-EIO);
+               }
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL);
+               l_wait_event(waitq, 0, &lwi);
+
+               goto restart_bulk;
+       }
+
+       rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk,
+                                         req->rq_bulk->bd_nob_transferred);
+       if (rc < 0) {
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       }
+
+       if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
+               CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
+                       req->rq_bulk->bd_nob_transferred,
+                       PAGE_CACHE_SIZE * op_data->op_npages);
+               ptlrpc_req_finished(req);
+               RETURN(-EPROTO);
+       }
+
+       *request = req;
+       RETURN(0);
+}
+
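+/* Query filesystem usage from the MDT with an MDS_STATFS RPC. */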
+static int mdc_statfs(const struct lu_env *env,
+                     struct obd_export *exp, struct obd_statfs *osfs,
+                     __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct obd_statfs     *msfs;
+       struct obd_import     *imp = NULL;
+       int                 rc;
+       ENTRY;
+
+       /*
+        * The request might also come from lprocfs, so we need to
+        * synchronize with client_disconnect_export() (b=15684).
+        */
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import)
+               imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+       if (!imp)
+               RETURN(-ENODEV);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS,
+                                       LUSTRE_MDS_VERSION, MDS_STATFS);
+       if (req == NULL)
+               GOTO(output, rc = -ENOMEM);
+
+       ptlrpc_request_set_replen(req);
+
+       if (flags & OBD_STATFS_NODELAY) {
+               /* procfs requests should not wait, to avoid a deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               /* check connection error first */
+               if (imp->imp_connect_error)
+                       rc = imp->imp_connect_error;
+               GOTO(out, rc);
+       }
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *osfs = *msfs;
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+output:
+       class_import_put(imp);
+       return rc;
+}
+
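+/* Resolve a FID to a pathname: the key sent to the server is KEY_FID2PATH
+ * plus the getinfo_fid2path descriptor, and the value returned is the same
+ * descriptor with the path appended. */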
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+       __u32 keylen, vallen;
+       void *key;
+       int rc;
+
+       if (gf->gf_pathlen > PATH_MAX)
+               RETURN(-ENAMETOOLONG);
+       if (gf->gf_pathlen < 2)
+               RETURN(-EOVERFLOW);
+
+       /* Key is KEY_FID2PATH + getinfo_fid2path description */
+       keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf);
+       OBD_ALLOC(key, keylen);
+       if (key == NULL)
+               RETURN(-ENOMEM);
+       memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+       memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf));
+
+       CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+              PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+       if (!fid_is_sane(&gf->gf_fid))
+               GOTO(out, rc = -EINVAL);
+
+       /* Val is struct getinfo_fid2path result plus path */
+       vallen = sizeof(*gf) + gf->gf_pathlen;
+
+       rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL);
+       if (rc != 0 && rc != -EREMOTE)
+               GOTO(out, rc);
+
+       if (vallen <= sizeof(*gf))
+               GOTO(out, rc = -EPROTO);
+       else if (vallen > sizeof(*gf) + gf->gf_pathlen)
+               GOTO(out, rc = -EOVERFLOW);
+
+       CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n",
+              PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+       OBD_FREE(key, keylen);
+       return rc;
+}
+
+static int mdc_ioc_hsm_progress(struct obd_export *exp,
+                               struct hsm_progress_kernel *hpk)
+{
+       struct obd_import               *imp = class_exp2cliimp(exp);
+       struct hsm_progress_kernel      *req_hpk;
+       struct ptlrpc_request           *req;
+       int                              rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS,
+                                       LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy hsm_progress struct */
+       req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS);
+       if (req_hpk == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *req_hpk = *hpk;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives)
+{
+       __u32                   *archive_mask;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER,
+                                       LUSTRE_MDS_VERSION,
+                                       MDS_HSM_CT_REGISTER);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy hsm_progress struct */
+       archive_mask = req_capsule_client_get(&req->rq_pill,
+                                             &RMF_MDS_HSM_ARCHIVE);
+       if (archive_mask == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *archive_mask = archives;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_current_action(struct obd_export *exp,
+                                     struct md_op_data *op_data)
+{
+       struct hsm_current_action       *hca = op_data->op_data;
+       struct hsm_current_action       *req_hca;
+       struct ptlrpc_request           *req;
+       int                              rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_ACTION);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       req_hca = req_capsule_server_get(&req->rq_pill,
+                                        &RMF_MDS_HSM_CURRENT_ACTION);
+       if (req_hca == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *hca = *req_hca;
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp)
+{
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER,
+                                       LUSTRE_MDS_VERSION,
+                                       MDS_HSM_CT_UNREGISTER);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_state_get(struct obd_export *exp,
+                                struct md_op_data *op_data)
+{
+       struct hsm_user_state   *hus = op_data->op_data;
+       struct hsm_user_state   *req_hus;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_STATE_GET);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET);
+       if (rc != 0) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE);
+       if (req_hus == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *hus = *req_hus;
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_state_set(struct obd_export *exp,
+                                struct md_op_data *op_data)
+{
+       struct hsm_state_set    *hss = op_data->op_data;
+       struct hsm_state_set    *req_hss;
+       struct ptlrpc_request   *req;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_HSM_STATE_SET);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1,
+                     OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0);
+
+       /* Copy states */
+       req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET);
+       if (req_hss == NULL)
+               GOTO(out, rc = -EPROTO);
+       *req_hss = *hss;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int mdc_ioc_hsm_request(struct obd_export *exp,
+                              struct hsm_user_request *hur)
+{
+       struct obd_import       *imp = class_exp2cliimp(exp);
+       struct ptlrpc_request   *req;
+       struct hsm_request      *req_hr;
+       struct hsm_user_item    *req_hui;
+       char                    *req_opaque;
+       int                      rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT,
+                            hur->hur_request.hr_itemcount
+                            * sizeof(struct hsm_user_item));
+       req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT,
+                            hur->hur_request.hr_data_len);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0);
+
+       /* Copy hsm_request struct */
+       req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST);
+       if (req_hr == NULL)
+               GOTO(out, rc = -EPROTO);
+       *req_hr = hur->hur_request;
+
+       /* Copy hsm_user_item structs */
+       req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM);
+       if (req_hui == NULL)
+               GOTO(out, rc = -EPROTO);
+       memcpy(req_hui, hur->hur_user_item,
+              hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item));
+
+       /* Copy opaque field */
+       req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA);
+       if (req_opaque == NULL)
+               GOTO(out, rc = -EPROTO);
+       memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_queue_wait(req);
+       GOTO(out, rc);
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
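+/* Fill in a kuc (kernel-user communication) header at the start of buf for
+ * a changelog message of the given total length. */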
+static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags)
+{
+       struct kuc_hdr *lh = (struct kuc_hdr *)buf;
+
+       LASSERT(len <= CR_MAXSIZE);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = KUC_TRANSPORT_CHANGELOG;
+       lh->kuc_flags = flags;
+       lh->kuc_msgtype = CL_RECORD;
+       lh->kuc_msglen = len;
+       return lh;
+}
+
+#define D_CHANGELOG 0
+
+struct changelog_show {
+       __u64           cs_startrec;
+       __u32           cs_flags;
+       struct file     *cs_fp;
+       char            *cs_buf;
+       struct obd_device *cs_obd;
+};
+
+static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh,
+                            struct llog_rec_hdr *hdr, void *data)
+{
+       struct changelog_show *cs = data;
+       struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr;
+       struct kuc_hdr *lh;
+       int len, rc;
+       ENTRY;
+
+       if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
+               rc = -EINVAL;
+               CERROR("%s: not a changelog rec %x/%d: rc = %d\n",
+                      cs->cs_obd->obd_name, rec->cr_hdr.lrh_type,
+                      rec->cr.cr_type, rc);
+               RETURN(rc);
+       }
+
+       if (rec->cr.cr_index < cs->cs_startrec) {
+               /* Skip entries earlier than what we are interested in */
+               CDEBUG(D_CHANGELOG, "rec="LPU64" start="LPU64"\n",
+                      rec->cr.cr_index, cs->cs_startrec);
+               RETURN(0);
+       }
+
+       CDEBUG(D_CHANGELOG, LPU64" %02d%-5s "LPU64" 0x%x t="DFID" p="DFID
+               " %.*s\n", rec->cr.cr_index, rec->cr.cr_type,
+               changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
+               rec->cr.cr_flags & CLF_FLAGMASK,
+               PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
+               rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
+
+       len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
+
+       /* Set up the message */
+       lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags);
+       memcpy(lh + 1, &rec->cr, len - sizeof(*lh));
+
+       rc = libcfs_kkuc_msg_put(cs->cs_fp, lh);
+       CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len, rc);
+
+       RETURN(rc);
+}
+
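+/* Kernel thread that walks the replicated changelog catalog and streams
+ * each record to the userspace pipe as a kuc message, always terminating
+ * the stream with a CL_EOF marker. */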
+static int mdc_changelog_send_thread(void *csdata)
+{
+       struct changelog_show *cs = csdata;
+       struct llog_ctxt *ctxt = NULL;
+       struct llog_handle *llh = NULL;
+       struct kuc_hdr *kuch;
+       int rc;
+
+       CDEBUG(D_CHANGELOG, "changelog to fp=%p start "LPU64"\n",
+              cs->cs_fp, cs->cs_startrec);
+
+       OBD_ALLOC(cs->cs_buf, CR_MAXSIZE);
+       if (cs->cs_buf == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       /* Set up the remote catalog handle */
+       ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT);
+       if (ctxt == NULL)
+               GOTO(out, rc = -ENOENT);
+       rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG,
+                      LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("%s: fail to open changelog catalog: rc = %d\n",
+                      cs->cs_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL);
+       if (rc) {
+               CERROR("llog_init_handle failed %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0);
+
+       /* Send EOF no matter what our result */
+       if ((kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch),
+                                     cs->cs_flags))) {
+               kuch->kuc_msgtype = CL_EOF;
+               libcfs_kkuc_msg_put(cs->cs_fp, kuch);
+       }
+
+out:
+       fput(cs->cs_fp);
+       if (llh)
+               llog_cat_close(NULL, llh);
+       if (ctxt)
+               llog_ctxt_put(ctxt);
+       if (cs->cs_buf)
+               OBD_FREE(cs->cs_buf, CR_MAXSIZE);
+       OBD_FREE_PTR(cs);
+       return rc;
+}
+
+static int mdc_ioc_changelog_send(struct obd_device *obd,
+                                 struct ioc_changelog *icc)
+{
+       struct changelog_show *cs;
+       int rc;
+
+       /* Freed in mdc_changelog_send_thread */
+       OBD_ALLOC_PTR(cs);
+       if (!cs)
+               return -ENOMEM;
+
+       cs->cs_obd = obd;
+       cs->cs_startrec = icc->icc_recno;
+       /* matching fput in mdc_changelog_send_thread */
+       cs->cs_fp = fget(icc->icc_id);
+       if (cs->cs_fp == NULL) {
+               /* bad file descriptor: bail out before starting the thread */
+               OBD_FREE_PTR(cs);
+               return -EBADF;
+       }
+       cs->cs_flags = icc->icc_flags;
+
+       /*
+        * New thread because we should return to user app before
+        * writing into our pipe
+        */
+       rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs,
+                                "mdc_clg_send_thread"));
+       if (!IS_ERR_VALUE(rc)) {
+               CDEBUG(D_CHANGELOG, "start changelog thread\n");
+               return 0;
+       }
+
+       CERROR("Failed to start changelog thread: %d\n", rc);
+       OBD_FREE_PTR(cs);
+       return rc;
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+                               struct lustre_kernelcomm *lk);
+
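+/* Start a quotacheck on the MDT.  Completion is observed by polling
+ * cl_qchk_stat via mdc_quota_poll_check(). */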
+static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                         struct obd_quotactl *oqctl)
+{
+       struct client_obd       *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION,
+                                       MDS_QUOTACHECK);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *body = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+
+       /* the next poll will find -ENODATA, which means quotacheck is
+        * still in progress */
+       cli->cl_qchk_stat = -ENODATA;
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               cli->cl_qchk_stat = rc;
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mdc_quota_poll_check(struct obd_export *exp,
+                               struct if_quotacheck *qchk)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       int rc;
+       ENTRY;
+
+       qchk->obd_uuid = cli->cl_target_uuid;
+       memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME));
+
+       rc = cli->cl_qchk_stat;
+       /* this client is not the one that started the quotacheck */
+       if (rc == CL_NOT_QUOTACHECKED)
+               rc = -EINTR;
+       RETURN(rc);
+}
+
+static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                       struct obd_quotactl *oqctl)
+{
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *oqc;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
+                                       MDS_QUOTACTL);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *oqc = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+       ptlrpc_at_set_req_timeout(req);
+       req->rq_no_resend = 1;
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+       if (req->rq_repmsg &&
+           (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+               *oqctl = *oqc;
+       } else if (!rc) {
+               CERROR("Can't unpack obd_quotactl\n");
+               rc = -EPROTO;
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+static int mdc_ioc_swap_layouts(struct obd_export *exp,
+                               struct md_op_data *op_data)
+{
+       LIST_HEAD(cancels);
+       struct ptlrpc_request   *req;
+       int                      rc, count;
+       struct mdc_swap_layouts *msl, *payload;
+       ENTRY;
+
+       msl = op_data->op_data;
+
+       /* When the MDT receives the MDS_SWAP_LAYOUTS RPC, the first
+        * thing it does is cancel the two layout locks held by this
+        * client.  So the client must cancel its layout locks on the
+        * two FIDs along with the request RPC to avoid extra round
+        * trips.
+        */
+       count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels,
+                                       LCK_CR, MDS_INODELOCK_LAYOUT);
+       count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels,
+                                        LCK_CR, MDS_INODELOCK_LAYOUT);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_SWAP_LAYOUTS);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1);
+       mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2);
+
+       rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_swap_layouts_pack(req, op_data);
+
+       payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS);
+       LASSERT(payload);
+
+       *payload = *msl;
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+
+out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
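+/* Ioctl dispatcher for the MDC obd device.  A module reference is held for
+ * the duration of the call so the module cannot be unloaded mid-ioctl. */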
+static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct obd_ioctl_data *data = karg;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       if (!try_module_get(THIS_MODULE)) {
+               CERROR("Can't get module. Is it alive?");
+               return -EINVAL;
+       }
+       switch (cmd) {
+       case OBD_IOC_CHANGELOG_SEND:
+               rc = mdc_ioc_changelog_send(obd, karg);
+               GOTO(out, rc);
+       case OBD_IOC_CHANGELOG_CLEAR: {
+               struct ioc_changelog *icc = karg;
+               struct changelog_setinfo cs =
+                       {.cs_recno = icc->icc_recno, .cs_id = icc->icc_id};
+               rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR),
+                                       KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
+                                       NULL);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_FID2PATH:
+               rc = mdc_ioc_fid2path(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_CT_START:
+               rc = mdc_ioc_hsm_ct_start(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_PROGRESS:
+               rc = mdc_ioc_hsm_progress(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_STATE_GET:
+               rc = mdc_ioc_hsm_state_get(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_STATE_SET:
+               rc = mdc_ioc_hsm_state_set(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_ACTION:
+               rc = mdc_ioc_hsm_current_action(exp, karg);
+               GOTO(out, rc);
+       case LL_IOC_HSM_REQUEST:
+               rc = mdc_ioc_hsm_request(exp, karg);
+               GOTO(out, rc);
+       case OBD_IOC_CLIENT_RECOVER:
+               rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0);
+               if (rc < 0)
+                       GOTO(out, rc);
+               GOTO(out, rc = 0);
+       case IOC_OSC_SET_ACTIVE:
+               rc = ptlrpc_set_import_active(imp, data->ioc_offset);
+               GOTO(out, rc);
+       case OBD_IOC_PARSE: {
+               ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT);
+               rc = class_config_parse_llog(NULL, ctxt, data->ioc_inlbuf1,
+                                            NULL);
+               llog_ctxt_put(ctxt);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_LLOG_INFO:
+       case OBD_IOC_LLOG_PRINT: {
+               ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+               rc = llog_ioctl(NULL, ctxt, cmd, data);
+               llog_ctxt_put(ctxt);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_POLL_QUOTACHECK:
+               rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+               GOTO(out, rc);
+       case OBD_IOC_PING_TARGET:
+               rc = ptlrpc_obd_ping(obd);
+               GOTO(out, rc);
+       /*
+        * Normally the IOC_OBD_STATFS and OBD_IOC_QUOTACTL ioctls are
+        * handled by LMV instead of MDC.  But when a cluster is upgraded
+        * from 1.8 there may be no LMV layer, so we can be called here
+        * directly.  Eventually this code should be removed.
+        * bz20731, LU-592.
+        */
+       case IOC_OBD_STATFS: {
+               struct obd_statfs stat_buf = {0};
+
+               if (*((__u32 *) data->ioc_inlbuf2) != 0)
+                       GOTO(out, rc = -ENODEV);
+
+               /* copy UUID */
+               if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd),
+                                    min((int) data->ioc_plen2,
+                                        (int) sizeof(struct obd_uuid))))
+                       GOTO(out, rc = -EFAULT);
+
+               rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf,
+                               cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                               0);
+               if (rc != 0)
+                       GOTO(out, rc);
+
+               if (copy_to_user(data->ioc_pbuf1, &stat_buf,
+                                    min((int) data->ioc_plen1,
+                                        (int) sizeof(stat_buf))))
+                       GOTO(out, rc = -EFAULT);
+
+               GOTO(out, rc = 0);
+       }
+       case OBD_IOC_QUOTACTL: {
+               struct if_quotactl *qctl = karg;
+               struct obd_quotactl *oqctl;
+
+               OBD_ALLOC_PTR(oqctl);
+               if (!oqctl)
+                       RETURN(-ENOMEM);
+
+               QCTL_COPY(oqctl, qctl);
+               rc = obd_quotactl(exp, oqctl);
+               if (rc == 0) {
+                       QCTL_COPY(qctl, oqctl);
+                       qctl->qc_valid = QC_MDTIDX;
+                       qctl->obd_uuid = obd->u.cli.cl_target_uuid;
+               }
+               OBD_FREE_PTR(oqctl);
+               break;
+       }
+       case LL_IOC_GET_CONNECT_FLAGS: {
+               if (copy_to_user(uarg,
+                                    exp_connect_flags_ptr(exp),
+                                    sizeof(__u64)))
+                       GOTO(out, rc = -EFAULT);
+               else
+                       GOTO(out, rc = 0);
+       }
+       case LL_IOC_LOV_SWAP_LAYOUTS: {
+               rc = mdc_ioc_swap_layouts(exp, karg);
+               break;
+       }
+       default:
+               CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
+               GOTO(out, rc = -ENOTTY);
+       }
+out:
+       module_put(THIS_MODULE);
+
+       return rc;
+}
+
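+/* Generic MDS_GET_INFO RPC: send a key and the expected value length, then
+ * copy the returned value into val, swabbing it when the reply comes from
+ * a peer of different endianness. */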
+int mdc_get_info_rpc(struct obd_export *exp,
+                    obd_count keylen, void *key,
+                    int vallen, void *val)
+{
+       struct obd_import      *imp = class_exp2cliimp(exp);
+       struct ptlrpc_request  *req;
+       char               *tmp;
+       int                  rc = -EINVAL;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+                            RCL_CLIENT, sizeof(__u32));
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+       memcpy(tmp, &vallen, sizeof(__u32));
+
+       req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+                            RCL_SERVER, vallen);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       /* -EREMOTE means the get_info result is partial, and it needs to
+        * continue on another MDT, see fid2path part in lmv_iocontrol */
+       if (rc == 0 || rc == -EREMOTE) {
+               tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+               memcpy(val, tmp, vallen);
+               if (ptlrpc_rep_need_swab(req)) {
+                       if (KEY_IS(KEY_FID2PATH))
+                               lustre_swab_fid2path(val);
+               }
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
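+/* Byte-swap helpers for HSM and kuc structures received from a peer of
+ * the opposite endianness. */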
+static void lustre_swab_hai(struct hsm_action_item *h)
+{
+       __swab32s(&h->hai_len);
+       __swab32s(&h->hai_action);
+       lustre_swab_lu_fid(&h->hai_fid);
+       lustre_swab_lu_fid(&h->hai_dfid);
+       __swab64s(&h->hai_cookie);
+       __swab64s(&h->hai_extent.offset);
+       __swab64s(&h->hai_extent.length);
+       __swab64s(&h->hai_gid);
+}
+
+static void lustre_swab_hal(struct hsm_action_list *h)
+{
+       struct hsm_action_item  *hai;
+       int                      i;
+
+       __swab32s(&h->hal_version);
+       __swab32s(&h->hal_count);
+       __swab32s(&h->hal_archive_id);
+       __swab64s(&h->hal_flags);
+       hai = hai_zero(h);
+       for (i = 0; i < h->hal_count; i++) {
+               lustre_swab_hai(hai);
+               hai = hai_next(hai);
+       }
+}
+
+static void lustre_swab_kuch(struct kuc_hdr *l)
+{
+       __swab16s(&l->kuc_magic);
+       /* __u8 l->kuc_transport */
+       __swab16s(&l->kuc_msgtype);
+       __swab16s(&l->kuc_msglen);
+}
+
+static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
+                               struct lustre_kernelcomm *lk)
+{
+       struct obd_import  *imp = class_exp2cliimp(exp);
+       __u32               archive = lk->lk_data;
+       int                 rc = 0;
+
+       if (lk->lk_group != KUC_GRP_HSM) {
+               CERROR("Bad copytool group %d\n", lk->lk_group);
+               return -EINVAL;
+       }
+
+       CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd,
+              lk->lk_uid, lk->lk_group, lk->lk_flags);
+
+       if (lk->lk_flags & LK_FLG_STOP) {
+               rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
+               /* Unregister with the coordinator */
+               if (rc == 0)
+                       rc = mdc_ioc_hsm_ct_unregister(imp);
+       } else {
+               struct file *fp = fget(lk->lk_wfd);
+
+               rc = libcfs_kkuc_group_add(fp, lk->lk_uid, lk->lk_group,
+                                          lk->lk_data);
+               if (rc && fp)
+                       fput(fp);
+               if (rc == 0)
+                       rc = mdc_ioc_hsm_ct_register(imp, archive);
+       }
+
+       return rc;
+}
+
+/**
+ * Send a message to any listening copytools
+ * @param val KUC message (kuc_hdr + hsm_action_list)
+ * @param len total length of message
+ */
+static int mdc_hsm_copytool_send(int len, void *val)
+{
+       struct kuc_hdr          *lh = (struct kuc_hdr *)val;
+       struct hsm_action_list  *hal = (struct hsm_action_list *)(lh + 1);
+       int                      rc;
+       ENTRY;
+
+       if (len < sizeof(*lh) + sizeof(*hal)) {
+               CERROR("Short HSM message %d < %d\n", len,
+                      (int) (sizeof(*lh) + sizeof(*hal)));
+               RETURN(-EPROTO);
+       }
+       if (lh->kuc_magic == __swab16(KUC_MAGIC)) {
+               lustre_swab_kuch(lh);
+               lustre_swab_hal(hal);
+       } else if (lh->kuc_magic != KUC_MAGIC) {
+               CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC);
+               RETURN(-EPROTO);
+       }
+
+       CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d "
+              "on %s\n",
+              lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype,
+              lh->kuc_msglen, hal->hal_count, hal->hal_fsname);
+
+       /* Broadcast to HSM listeners */
+       rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh);
+
+       RETURN(rc);
+}
+
+/**
+ * callback function passed to kuc for re-registering each HSM copytool
+ * running on MDC, after MDT shutdown/recovery.
+ * @param data archive id served by the copytool
+ * @param cb_arg callback argument (obd_import)
+ */
+static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg)
+{
+       struct obd_import       *imp = (struct obd_import *)cb_arg;
+       __u32                    archive = data;
+       int                      rc;
+
+       CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n",
+              archive);
+       rc = mdc_ioc_hsm_ct_register(imp, archive);
+
+       /* ignore error if the copytool is already registered */
+       return ((rc != 0) && (rc != -EEXIST)) ? rc : 0;
+}
+
+/**
+ * Re-establish all kuc contexts with MDT
+ * after MDT shutdown/recovery.
+ */
+static int mdc_kuc_reregister(struct obd_import *imp)
+{
+       /* re-register HSM agents */
+       return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister,
+                                        (void *)imp);
+}
+
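+/* Handle "set info" keys on the MDC: most keys are forwarded to the MDS
+ * via an MDS_SET_INFO RPC, while sptlrpc configuration, context flushes
+ * and mds-mds connection setup are handled locally. */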
+int mdc_set_info_async(const struct lu_env *env,
+                      struct obd_export *exp,
+                      obd_count keylen, void *key,
+                      obd_count vallen, void *val,
+                      struct ptlrpc_request_set *set)
+{
+       struct obd_import       *imp = class_exp2cliimp(exp);
+       int                      rc;
+       ENTRY;
+
+       if (KEY_IS(KEY_READ_ONLY)) {
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+
+               spin_lock(&imp->imp_lock);
+               if (*((int *)val)) {
+                       imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY;
+                       imp->imp_connect_data.ocd_connect_flags |=
+                                                       OBD_CONNECT_RDONLY;
+               } else {
+                       imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY;
+                       imp->imp_connect_data.ocd_connect_flags &=
+                                                       ~OBD_CONNECT_RDONLY;
+               }
+               spin_unlock(&imp->imp_lock);
+
+               rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+                                      keylen, key, vallen, val, set);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SPTLRPC_CONF)) {
+               sptlrpc_conf_client_adapt(exp->exp_obd);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_FLUSH_CTX)) {
+               sptlrpc_import_flush_my_ctx(imp);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_MDS_CONN)) {
+               /* mds-mds import */
+               spin_lock(&imp->imp_lock);
+               imp->imp_server_timeout = 1;
+               spin_unlock(&imp->imp_lock);
+               imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+               CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
+               RETURN(0);
+       }
+       if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+               rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION,
+                                      keylen, key, vallen, val, set);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) {
+               rc = mdc_hsm_copytool_send(vallen, val);
+               RETURN(rc);
+       }
+
+       CERROR("Unknown key %s\n", (char *)key);
+       RETURN(-EINVAL);
+}
+
+int mdc_get_info(const struct lu_env *env, struct obd_export *exp,
+                __u32 keylen, void *key, __u32 *vallen, void *val,
+                struct lov_stripe_md *lsm)
+{
+       int rc = -EINVAL;
+
+       if (KEY_IS(KEY_MAX_EASIZE)) {
+               int mdsize, *max_easize;
+
+               if (*vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               mdsize = *(int*)val;
+               if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+                       exp->exp_obd->u.cli.cl_max_mds_easize = mdsize;
+               max_easize = val;
+               *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize;
+               RETURN(0);
+       } else if (KEY_IS(KEY_CONN_DATA)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               struct obd_connect_data *data = val;
+
+               if (*vallen != sizeof(*data))
+                       RETURN(-EINVAL);
+
+               *data = imp->imp_connect_data;
+               RETURN(0);
+       } else if (KEY_IS(KEY_TGT_COUNT)) {
+               *((int *)val) = 1;
+               RETURN(0);
+       }
+
+       rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
+       RETURN(rc);
+}
+
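+/* Pin a file handle on the MDT with an MDS_PIN RPC; the pin request is
+ * kept referenced in md_open_data until the matching mdc_unpin(). */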
+static int mdc_pin(struct obd_export *exp, const struct lu_fid *fid,
+                  struct obd_capa *oc, struct obd_client_handle *handle,
+                  int flags)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_PIN);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_PIN);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, 0, 0, -1, flags);
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       if (rc) {
+               CERROR("Pin failed: %d\n", rc);
+               GOTO(err_out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(err_out, rc = -EPROTO);
+
+       handle->och_fh = body->handle;
+       handle->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+
+       handle->och_mod = obd_mod_alloc();
+       if (handle->och_mod == NULL) {
+               DEBUG_REQ(D_ERROR, req, "can't allocate md_open_data");
+               GOTO(err_out, rc = -ENOMEM);
+       }
+       handle->och_mod->mod_open_req = req; /* will be dropped by unpin */
+
+       RETURN(0);
+
+err_out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mdc_unpin(struct obd_export *exp, struct obd_client_handle *handle,
+                    int flag)
+{
+       struct ptlrpc_request *req;
+       struct mdt_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_UNPIN,
+                                       LUSTRE_MDS_VERSION, MDS_UNPIN);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+       body->handle = handle->och_fh;
+       body->flags = flag;
+
+       ptlrpc_request_set_replen(req);
+
+       mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+       rc = ptlrpc_queue_wait(req);
+       mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+       if (rc != 0)
+               CERROR("Unpin failed: %d\n", rc);
+
+       ptlrpc_req_finished(req);
+       ptlrpc_req_finished(handle->och_mod->mod_open_req);
+
+       obd_mod_put(handle->och_mod);
+       RETURN(rc);
+}
+
+int mdc_sync(struct obd_export *exp, const struct lu_fid *fid,
+            struct obd_capa *oc, struct ptlrpc_request **request)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, 0, 0, -1, 0);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
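+/* React to import state changes: flush the FID sequence when the import
+ * goes inactive, clean up the lock namespace on invalidation, and
+ * re-register kuc/HSM listeners when the import becomes active again. */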
+static int mdc_import_event(struct obd_device *obd, struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       int rc = 0;
+
+       LASSERT(imp->imp_obd == obd);
+
+       switch (event) {
+       case IMP_EVENT_DISCON: {
+#if 0
+               /* XXX Pass event up to OBDs stack. used only for FLD now */
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL);
+#endif
+               break;
+       }
+       case IMP_EVENT_INACTIVE: {
+               struct client_obd *cli = &obd->u.cli;
+               /*
+                * Flush current sequence to make client obtain new one
+                * from server in case of disconnect/reconnect.
+                */
+               if (cli->cl_seq != NULL)
+                       seq_client_flush(cli->cl_seq);
+
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
+
+               ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+
+               break;
+       }
+       case IMP_EVENT_ACTIVE:
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+               /* re-establish kuc registration after reconnecting */
+               if (rc == 0)
+                       rc = mdc_kuc_reregister(imp);
+               break;
+       case IMP_EVENT_OCD:
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+               break;
+       case IMP_EVENT_DEACTIVATE:
+       case IMP_EVENT_ACTIVATE:
+               break;
+       default:
+               CERROR("Unknown import event %x\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
+                 struct md_op_data *op_data)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       struct lu_client_seq *seq = cli->cl_seq;
+       ENTRY;
+       RETURN(seq_client_alloc_fid(NULL, seq, fid));
+}
+
+struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       return &cli->cl_target_uuid;
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying it during
+ * recovery.  Returns non-zero if the lock can be canceled, zero otherwise.
+ */
+static int mdc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+       if (lock->l_resource->lr_type != LDLM_IBITS)
+               RETURN(0);
+
+       /* FIXME: if we ever get into a situation where there are too many
+        * open files with open locks on a single node, then we really
+        * should replay these open locks to re-obtain them */
+       if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
+               RETURN(0);
+
+       RETURN(1);
+}
+
+static int mdc_resource_inode_free(struct ldlm_resource *res)
+{
+       if (res->lr_lvb_inode)
+               res->lr_lvb_inode = NULL;
+
+       return 0;
+}
+
+struct ldlm_valblock_ops inode_lvbo = {
+       .lvbo_free = mdc_resource_inode_free,
+};
+
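+/* Obd setup method for the MDC: allocate the RPC and close locks, set up
+ * the client obd, register lprocfs and llog, and hook the lock-cancel and
+ * LVB callbacks into the namespace. */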
+static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
+{
+       struct client_obd *cli = &obd->u.cli;
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       if (!cli->cl_rpc_lock)
+               RETURN(-ENOMEM);
+       mdc_init_rpc_lock(cli->cl_rpc_lock);
+
+       ptlrpcd_addref();
+
+       OBD_ALLOC(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+       if (!cli->cl_close_lock)
+               GOTO(err_rpc_lock, rc = -ENOMEM);
+       mdc_init_rpc_lock(cli->cl_close_lock);
+
+       rc = client_obd_setup(obd, cfg);
+       if (rc)
+               GOTO(err_close_lock, rc);
+       lprocfs_mdc_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+       sptlrpc_lprocfs_cliobd_attach(obd);
+       ptlrpc_lprocfs_register_obd(obd);
+
+       ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery);
+
+       obd->obd_namespace->ns_lvbo = &inode_lvbo;
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               mdc_cleanup(obd);
+               CERROR("failed to setup llogging subsystems\n");
+       }
+
+       RETURN(rc);
+
+err_close_lock:
+       OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+err_rpc_lock:
+       OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+/* Initialize the default and maximum LOV EA and cookie sizes.  This allows
+ * us to make MDS RPCs with large enough reply buffers to hold the
+ * maximum-sized (= maximum striped) EA and cookie without having to
+ * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
+static int mdc_init_ea_size(struct obd_export *exp, int easize,
+                    int def_easize, int cookiesize)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct client_obd *cli = &obd->u.cli;
+       ENTRY;
+
+       if (cli->cl_max_mds_easize < easize)
+               cli->cl_max_mds_easize = easize;
+
+       if (cli->cl_default_mds_easize < def_easize)
+               cli->cl_default_mds_easize = def_easize;
+
+       if (cli->cl_max_mds_cookiesize < cookiesize)
+               cli->cl_max_mds_cookiesize = cookiesize;
+
+       RETURN(0);
+}
+
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               /* Failsafe, ok if racy */
+               if (obd->obd_type->typ_refcnt <= 1)
+                       libcfs_kkuc_group_rem(0, KUC_GRP_HSM);
+
+               obd_cleanup_client_import(obd);
+               ptlrpc_lprocfs_unregister_obd(obd);
+               lprocfs_obd_cleanup(obd);
+
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mdc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+
+       OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock));
+       OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock));
+
+       ptlrpcd_decref();
+
+       return client_obd_cleanup(obd);
+}
+
+
+static int mdc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc)
+               RETURN(rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(0);
+}
+
+static int mdc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       RETURN(0);
+}
+
+static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg *lcfg = buf;
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc = 0;
+
+       lprocfs_mdc_init_vars(&lvars);
+       switch (lcfg->lcfg_command) {
+       default:
+               rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               break;
+       }
+       return rc;
+}
+
+
+/* get remote permission for current user on fid */
+int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid,
+                       struct obd_capa *oc, __u32 suppgid,
+                       struct ptlrpc_request **request)
+{
+       struct ptlrpc_request  *req;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(client_is_remote(exp));
+
+       *request = NULL;
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       mdc_set_capa_size(req, &RMF_CAPA1, oc);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER,
+                            sizeof(struct mdt_remote_perm));
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               ptlrpc_req_finished(req);
+       else
+               *request = req;
+       RETURN(rc);
+}
+
+static int mdc_interpret_renew_capa(const struct lu_env *env,
+                                   struct ptlrpc_request *req, void *args,
+                                   int status)
+{
+       struct mdc_renew_capa_args *ra = args;
+       struct mdt_body *body = NULL;
+       struct lustre_capa *capa;
+       ENTRY;
+
+       if (status)
+               GOTO(out, capa = ERR_PTR(status));
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       if (body == NULL)
+               GOTO(out, capa = ERR_PTR(-EFAULT));
+
+       if ((body->valid & OBD_MD_FLOSSCAPA) == 0)
+               GOTO(out, capa = ERR_PTR(-ENOENT));
+
+       capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2);
+       if (!capa)
+               GOTO(out, capa = ERR_PTR(-EFAULT));
+       EXIT;
+out:
+       ra->ra_cb(ra->ra_oc, capa);
+       return 0;
+}
+
+static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
+                         renew_capa_cb_t cb)
+{
+       struct ptlrpc_request *req;
+       struct mdc_renew_capa_args *ra;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR,
+                                       LUSTRE_MDS_VERSION, MDS_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       /* NB: OBD_MD_FLOSSCAPA is set here, but that does not necessarily
+        * mean the capa being renewed is an OSS capa.
+        */
+       mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0);
+       ptlrpc_request_set_replen(req);
+
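+       /* The callback args are stored in the request's embedded async-args
+        * area; CLASSERT verifies at compile time that they fit. */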
+       CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args));
+       ra = ptlrpc_req_async_args(req);
+       ra->ra_oc = oc;
+       ra->ra_cb = cb;
+       req->rq_interpret_reply = mdc_interpret_renew_capa;
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+       RETURN(0);
+}
+
+static int mdc_connect(const struct lu_env *env,
+                      struct obd_export **exp,
+                      struct obd_device *obd, struct obd_uuid *cluuid,
+                      struct obd_connect_data *data,
+                      void *localdata)
+{
+       struct obd_import *imp = obd->u.cli.cl_import;
+
+       /* mds-mds import features */
+       if (data && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) {
+               spin_lock(&imp->imp_lock);
+               imp->imp_server_timeout = 1;
+               spin_unlock(&imp->imp_lock);
+               imp->imp_client->cli_request_portal = MDS_MDS_PORTAL;
+               CDEBUG(D_OTHER, "%s: Set 'mds' portal and timeout\n",
+                      obd->obd_name);
+       }
+
+       return client_connect_import(env, exp, obd, cluuid, data, NULL);
+}
+
+struct obd_ops mdc_obd_ops = {
+       .o_owner            = THIS_MODULE,
+       .o_setup            = mdc_setup,
+       .o_precleanup       = mdc_precleanup,
+       .o_cleanup          = mdc_cleanup,
+       .o_add_conn         = client_import_add_conn,
+       .o_del_conn         = client_import_del_conn,
+       .o_connect          = mdc_connect,
+       .o_disconnect       = client_disconnect_export,
+       .o_iocontrol        = mdc_iocontrol,
+       .o_set_info_async   = mdc_set_info_async,
+       .o_statfs           = mdc_statfs,
+       .o_pin              = mdc_pin,
+       .o_unpin            = mdc_unpin,
+       .o_fid_init         = client_fid_init,
+       .o_fid_fini         = client_fid_fini,
+       .o_fid_alloc        = mdc_fid_alloc,
+       .o_import_event     = mdc_import_event,
+       .o_llog_init        = mdc_llog_init,
+       .o_llog_finish      = mdc_llog_finish,
+       .o_get_info         = mdc_get_info,
+       .o_process_config   = mdc_process_config,
+       .o_get_uuid         = mdc_get_uuid,
+       .o_quotactl         = mdc_quotactl,
+       .o_quotacheck       = mdc_quotacheck,
+};
+
+struct md_ops mdc_md_ops = {
+       .m_getstatus              = mdc_getstatus,
+       .m_null_inode             = mdc_null_inode,
+       .m_find_cbdata            = mdc_find_cbdata,
+       .m_close                  = mdc_close,
+       .m_create                 = mdc_create,
+       .m_done_writing           = mdc_done_writing,
+       .m_enqueue                = mdc_enqueue,
+       .m_getattr                = mdc_getattr,
+       .m_getattr_name           = mdc_getattr_name,
+       .m_intent_lock            = mdc_intent_lock,
+       .m_link                   = mdc_link,
+       .m_is_subdir              = mdc_is_subdir,
+       .m_rename                 = mdc_rename,
+       .m_setattr                = mdc_setattr,
+       .m_setxattr               = mdc_setxattr,
+       .m_getxattr               = mdc_getxattr,
+       .m_sync                   = mdc_sync,
+       .m_readpage               = mdc_readpage,
+       .m_unlink                 = mdc_unlink,
+       .m_cancel_unused          = mdc_cancel_unused,
+       .m_init_ea_size           = mdc_init_ea_size,
+       .m_set_lock_data          = mdc_set_lock_data,
+       .m_lock_match             = mdc_lock_match,
+       .m_get_lustre_md          = mdc_get_lustre_md,
+       .m_free_lustre_md         = mdc_free_lustre_md,
+       .m_set_open_replay_data   = mdc_set_open_replay_data,
+       .m_clear_open_replay_data = mdc_clear_open_replay_data,
+       .m_renew_capa             = mdc_renew_capa,
+       .m_unpack_capa            = mdc_unpack_capa,
+       .m_get_remote_perm        = mdc_get_remote_perm,
+       .m_intent_getattr_async   = mdc_intent_getattr_async,
+       .m_revalidate_lock        = mdc_revalidate_lock,
+};
+
+int __init mdc_init(void)
+{
+       int rc;
+       struct lprocfs_static_vars lvars = { 0 };
+       lprocfs_mdc_init_vars(&lvars);
+
+       rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars,
+                                LUSTRE_MDC_NAME, NULL);
+       RETURN(rc);
+}
+
+static void /*__exit*/ mdc_exit(void)
+{
+       class_unregister_type(LUSTRE_MDC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Metadata Client");
+MODULE_LICENSE("GPL");
+
+module_init(mdc_init);
+module_exit(mdc_exit);
diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile
new file mode 100644 (file)
index 0000000..2672463
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += mgc.o
+mgc-y := mgc_request.o lproc_mgc.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/mgc/libmgc.c b/drivers/staging/lustre/lustre/mgc/libmgc.c
new file mode 100644 (file)
index 0000000..442146c
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/libmgc.c
+ *
+ * Lustre Management Client
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+/* Minimal MGC for liblustre: only used to read the config log from the MGS
+   at setup time, no updates. */
+
+#define DEBUG_SUBSYSTEM S_MGC
+
+#include <liblustre.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+
+
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int rc;
+       ENTRY;
+
+       ptlrpcd_addref();
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(err_decref, rc);
+
+       /* liblustre only supports the null sptlrpc flavor to the MGS */
+       obd->u.cli.cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_NULL;
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               CERROR("failed to setup llogging subsystems\n");
+               GOTO(err_cleanup, rc);
+       }
+
+       RETURN(rc);
+
+err_cleanup:
+       client_obd_cleanup(obd);
+err_decref:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+       case OBD_CLEANUP_EXPORTS:
+               obd_cleanup_client_import(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+       ptlrpcd_decref();
+
+       rc = client_obd_cleanup(obd);
+       RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+       rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc < 0)
+               RETURN(rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       RETURN(0);
+}
+
+struct obd_ops mgc_obd_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_setup        = mgc_setup,
+       .o_precleanup   = mgc_precleanup,
+       .o_cleanup      = mgc_cleanup,
+       .o_add_conn     = client_import_add_conn,
+       .o_del_conn     = client_import_del_conn,
+       .o_connect      = client_connect_import,
+       .o_disconnect   = client_disconnect_export,
+       .o_llog_init    = mgc_llog_init,
+       .o_llog_finish  = mgc_llog_finish,
+};
+
+int __init mgc_init(void)
+{
+       return class_register_type(&mgc_obd_ops, NULL,
+                                  NULL, LUSTRE_MGC_NAME, NULL);
+}
diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c
new file mode 100644 (file)
index 0000000..1105eaa
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include "mgc_internal.h"
+
+#ifdef LPROCFS
+
+LPROC_SEQ_FOPS_RO_TYPE(mgc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, import);
+LPROC_SEQ_FOPS_RO_TYPE(mgc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(mgc, ping);
+
+static int mgc_ir_state_seq_show(struct seq_file *m, void *v)
+{
+       return lprocfs_mgc_rd_ir_state(m, m->private);
+}
+LPROC_SEQ_FOPS_RO(mgc_ir_state);
+
+static struct lprocfs_vars lprocfs_mgc_obd_vars[] = {
+       { "uuid",            &mgc_uuid_fops,      0, 0 },
+       { "ping",            &mgc_ping_fops,      0, 0222 },
+       { "connect_flags",   &mgc_connect_flags_fops, 0, 0 },
+       { "mgs_server_uuid", &mgc_server_uuid_fops,   0, 0 },
+       { "mgs_conn_uuid",   &mgc_conn_uuid_fops,     0, 0 },
+       { "import",          &mgc_import_fops,  0, 0 },
+       { "state",           &mgc_state_fops,    0, 0 },
+       { "ir_state",        &mgc_ir_state_fops,  0, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(mgc, numrefs);
+static struct lprocfs_vars lprocfs_mgc_module_vars[] = {
+       { "num_refs",   &mgc_numrefs_fops,       0, 0 },
+       { 0 }
+};
+
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_mgc_module_vars;
+       lvars->obd_vars    = lprocfs_mgc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h
new file mode 100644 (file)
index 0000000..dbd6982
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef _MGC_INTERNAL_H
+#define _MGC_INTERNAL_H
+
+#include <linux/libcfs/libcfs.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_lib.h>
+#include <lustre_dlm.h>
+#include <lustre_log.h>
+#include <lustre_export.h>
+
+#ifdef LPROCFS
+void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars);
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data);
+#else
+static inline void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+static inline int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+       return 0;
+}
+#endif  /* LPROCFS */
+
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
+
+static inline int cld_is_sptlrpc(struct config_llog_data *cld)
+{
+       return cld->cld_type == CONFIG_T_SPTLRPC;
+}
+
+static inline int cld_is_recover(struct config_llog_data *cld)
+{
+       return cld->cld_type == CONFIG_T_RECOVER;
+}
+
+#endif  /* _MGC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
new file mode 100644 (file)
index 0000000..c6c84d9
--- /dev/null
@@ -0,0 +1,1860 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgc/mgc_request.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MGC
+#define D_MGC D_CONFIG /*|D_WARNING*/
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "mgc_internal.h"
+
+static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id,
+                         int type)
+{
+       __u64 resname = 0;
+
+       if (len > 8) {
+               CERROR("name too long: %s\n", name);
+               return -EINVAL;
+       }
+       if (len <= 0) {
+               CERROR("missing name: %s\n", name);
+               return -EINVAL;
+       }
+       memcpy(&resname, name, len);
+
+       /* Always use the same endianness for the resid */
+       memset(res_id, 0, sizeof(*res_id));
+       res_id->name[0] = cpu_to_le64(resname);
+       /* XXX: unfortunately, sptlprc and config llog share one lock */
+       switch (type) {
+       case CONFIG_T_CONFIG:
+       case CONFIG_T_SPTLRPC:
+               resname = 0;
+               break;
+       case CONFIG_T_RECOVER:
+               resname = type;
+               break;
+       default:
+               LBUG();
+       }
+       res_id->name[1] = cpu_to_le64(resname);
+       CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", name,
+              res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
+       return 0;
+}
+
+int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type)
+{
+       /* fsname is at most 8 chars long and may contain "-",
+        * e.g. "lustre", "SUN-000" */
+       return mgc_name2resid(fsname, strlen(fsname), res_id, type);
+}
+EXPORT_SYMBOL(mgc_fsname2resid);
+
+int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type)
+{
+       char *name_end;
+       int len;
+
+       /* logname consists of "fsname-nodetype".
+        * e.g. "lustre-MDT0001", "SUN-000-client" */
+       name_end = strrchr(logname, '-');
+       LASSERT(name_end);
+       len = name_end - logname;
+       return mgc_name2resid(logname, len, res_id, type);
+}
+
+/********************** config llog list **********************/
+static LIST_HEAD(config_llog_list);
+static DEFINE_SPINLOCK(config_list_lock);
+
+/* Take a reference to a config log */
+static int config_log_get(struct config_llog_data *cld)
+{
+       ENTRY;
+       atomic_inc(&cld->cld_refcount);
+       CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+              atomic_read(&cld->cld_refcount));
+       RETURN(0);
+}
+
+/* Drop a reference to a config log.  When no longer referenced,
+   we can free the config log data */
+static void config_log_put(struct config_llog_data *cld)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname,
+              atomic_read(&cld->cld_refcount));
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       /* take the spinlock to make sure no item with a zero refcount
+        * remains on the list */
+       if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) {
+               list_del(&cld->cld_list_chain);
+               spin_unlock(&config_list_lock);
+
+               CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname);
+
+               if (cld->cld_recover)
+                       config_log_put(cld->cld_recover);
+               if (cld->cld_sptlrpc)
+                       config_log_put(cld->cld_sptlrpc);
+               if (cld_is_sptlrpc(cld))
+                       sptlrpc_conf_log_stop(cld->cld_logname);
+
+               class_export_put(cld->cld_mgcexp);
+               OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1);
+       }
+
+       EXIT;
+}
+
+/* Find a config log by name */
+static
+struct config_llog_data *config_log_find(char *logname,
+                                        struct config_llog_instance *cfg)
+{
+       struct config_llog_data *cld;
+       struct config_llog_data *found = NULL;
+       void                    *instance;
+       ENTRY;
+
+       LASSERT(logname != NULL);
+
+       instance = cfg ? cfg->cfg_instance : NULL;
+       spin_lock(&config_list_lock);
+       list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+               /* check if instance equals */
+               if (instance != cld->cld_cfg.cfg_instance)
+                       continue;
+
+               /* instance may be NULL, should check name */
+               if (strcmp(logname, cld->cld_logname) == 0) {
+                       found = cld;
+                       break;
+               }
+       }
+       if (found) {
+               atomic_inc(&found->cld_refcount);
+               LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0);
+       }
+       spin_unlock(&config_list_lock);
+       RETURN(found);
+}
+
+static
+struct config_llog_data *do_config_log_add(struct obd_device *obd,
+                                          char *logname,
+                                          int type,
+                                          struct config_llog_instance *cfg,
+                                          struct super_block *sb)
+{
+       struct config_llog_data *cld;
+       int                      rc;
+       ENTRY;
+
+       CDEBUG(D_MGC, "do adding config log %s:%p\n", logname,
+              cfg ? cfg->cfg_instance : NULL);
+
+       OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1);
+       if (!cld)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       strcpy(cld->cld_logname, logname);
+       if (cfg)
+               cld->cld_cfg = *cfg;
+       else
+               cld->cld_cfg.cfg_callback = class_config_llog_handler;
+       mutex_init(&cld->cld_lock);
+       cld->cld_cfg.cfg_last_idx = 0;
+       cld->cld_cfg.cfg_flags = 0;
+       cld->cld_cfg.cfg_sb = sb;
+       cld->cld_type = type;
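+       /* One initial reference, owned by the caller and dropped through
+        * config_log_put(). */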
+       atomic_set(&cld->cld_refcount, 1);
+
+       /* Keep the mgc around until we are done */
+       cld->cld_mgcexp = class_export_get(obd->obd_self_export);
+
+       if (cld_is_sptlrpc(cld)) {
+               sptlrpc_conf_log_start(logname);
+               cld->cld_cfg.cfg_obdname = obd->obd_name;
+       }
+
+       rc = mgc_logname2resid(logname, &cld->cld_resid, type);
+
+       spin_lock(&config_list_lock);
+       list_add(&cld->cld_list_chain, &config_llog_list);
+       spin_unlock(&config_list_lock);
+
+       if (rc) {
+               config_log_put(cld);
+               RETURN(ERR_PTR(rc));
+       }
+
+       if (cld_is_sptlrpc(cld)) {
+               rc = mgc_process_log(obd, cld);
+               if (rc && rc != -ENOENT)
+                       CERROR("failed processing sptlrpc log: %d\n", rc);
+       }
+
+       RETURN(cld);
+}
+
+static struct config_llog_data *config_recover_log_add(struct obd_device *obd,
+       char *fsname,
+       struct config_llog_instance *cfg,
+       struct super_block *sb)
+{
+       struct config_llog_instance lcfg = *cfg;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct config_llog_data *cld;
+       char logname[32];
+
+       if (IS_OST(lsi))
+               return NULL;
+
+       /* for osp-on-ost, see lustre_start_osp() */
+       if (IS_MDT(lsi) && lcfg.cfg_instance)
+               return NULL;
+
+       /* We have to use a different llog for clients and MDTs for CMD,
+        * where only clients are notified if one of the CMD servers restarts. */
+       LASSERT(strlen(fsname) < sizeof(logname) / 2);
+       strcpy(logname, fsname);
+       if (IS_SERVER(lsi)) { /* mdt */
+               LASSERT(lcfg.cfg_instance == NULL);
+               lcfg.cfg_instance = sb;
+               strcat(logname, "-mdtir");
+       } else {
+               LASSERT(lcfg.cfg_instance != NULL);
+               strcat(logname, "-cliir");
+       }
+
+       cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb);
+       return cld;
+}
+
+/** Add this log to the list of active logs watched by an MGC.
+ * Active means we're watching for updates.
+ * We have one active log per "mount" - client instance or servername.
+ * Each instance may be at a different point in the log.
+ */
+static int config_log_add(struct obd_device *obd, char *logname,
+                         struct config_llog_instance *cfg,
+                         struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct config_llog_data *cld;
+       struct config_llog_data *sptlrpc_cld;
+       char                     seclogname[32];
+       char                    *ptr;
+       ENTRY;
+
+       CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance);
+
+       /*
+        * For each regular log, the corresponding sptlrpc log is named
+        * <fsname>-sptlrpc; multiple regular logs may share one sptlrpc log.
+        */
+       ptr = strrchr(logname, '-');
+       if (ptr == NULL || ptr - logname > 8) {
+               CERROR("logname %s is too long\n", logname);
+               RETURN(-EINVAL);
+       }
+
+       memcpy(seclogname, logname, ptr - logname);
+       strcpy(seclogname + (ptr - logname), "-sptlrpc");
+
+       sptlrpc_cld = config_log_find(seclogname, NULL);
+       if (sptlrpc_cld == NULL) {
+               sptlrpc_cld = do_config_log_add(obd, seclogname,
+                                               CONFIG_T_SPTLRPC, NULL, NULL);
+               if (IS_ERR(sptlrpc_cld)) {
+                       CERROR("can't create sptlrpc log: %s\n", seclogname);
+                       RETURN(PTR_ERR(sptlrpc_cld));
+               }
+       }
+
+       cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb);
+       if (IS_ERR(cld)) {
+               CERROR("can't create log: %s\n", logname);
+               config_log_put(sptlrpc_cld);
+               RETURN(PTR_ERR(cld));
+       }
+
+       cld->cld_sptlrpc = sptlrpc_cld;
+
+       LASSERT(lsi->lsi_lmd);
+       if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) {
+               struct config_llog_data *recover_cld;
+               *strrchr(seclogname, '-') = 0;
+               recover_cld = config_recover_log_add(obd, seclogname, cfg, sb);
+               if (IS_ERR(recover_cld)) {
+                       config_log_put(cld);
+                       RETURN(PTR_ERR(recover_cld));
+               }
+               cld->cld_recover = recover_cld;
+       }
+
+       RETURN(0);
+}
+
+DEFINE_MUTEX(llog_process_lock);
+
+/** Stop watching for updates on this log.
+ */
+static int config_log_end(char *logname, struct config_llog_instance *cfg)
+{
+       struct config_llog_data *cld;
+       struct config_llog_data *cld_sptlrpc = NULL;
+       struct config_llog_data *cld_recover = NULL;
+       int rc = 0;
+       ENTRY;
+
+       cld = config_log_find(logname, cfg);
+       if (cld == NULL)
+               RETURN(-ENOENT);
+
+       mutex_lock(&cld->cld_lock);
+       /*
+        * If cld_stopping is set, we never started this log and thus do not
+        * own the start reference. This can happen after a previous umount:
+        * the cld is still hanging around waiting for the lock cancel, then
+        * we remount, fail partway through, and call log_end without ever
+        * calling start_log.
+        */
+       if (unlikely(cld->cld_stopping)) {
+               mutex_unlock(&cld->cld_lock);
+               /* drop the ref from the find */
+               config_log_put(cld);
+               RETURN(rc);
+       }
+
+       cld->cld_stopping = 1;
+
+       cld_recover = cld->cld_recover;
+       cld->cld_recover = NULL;
+       mutex_unlock(&cld->cld_lock);
+
+       if (cld_recover) {
+               mutex_lock(&cld_recover->cld_lock);
+               cld_recover->cld_stopping = 1;
+               mutex_unlock(&cld_recover->cld_lock);
+               config_log_put(cld_recover);
+       }
+
+       spin_lock(&config_list_lock);
+       cld_sptlrpc = cld->cld_sptlrpc;
+       cld->cld_sptlrpc = NULL;
+       spin_unlock(&config_list_lock);
+
+       if (cld_sptlrpc)
+               config_log_put(cld_sptlrpc);
+
+       /* drop the ref from the find */
+       config_log_put(cld);
+       /* drop the start ref */
+       config_log_put(cld);
+
+       CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client",
+              rc);
+       RETURN(rc);
+}
+
+int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data)
+{
+       struct obd_device       *obd = data;
+       struct obd_import       *imp = obd->u.cli.cl_import;
+       struct obd_connect_data *ocd = &imp->imp_connect_data;
+       struct config_llog_data *cld;
+       ENTRY;
+
+       seq_printf(m, "imperative_recovery: %s\n",
+                     OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED");
+       seq_printf(m, "client_state:\n");
+
+       spin_lock(&config_list_lock);
+       list_for_each_entry(cld, &config_llog_list, cld_list_chain) {
+               if (cld->cld_recover == NULL)
+                       continue;
+               seq_printf(m,  "    - { client: %s, nidtbl_version: %u }\n",
+                              cld->cld_logname,
+                              cld->cld_recover->cld_cfg.cfg_last_idx);
+       }
+       spin_unlock(&config_list_lock);
+
+       RETURN(0);
+}
+
+/* reenqueue any lost locks */
+#define RQ_RUNNING 0x1
+#define RQ_NOW     0x2
+#define RQ_LATER   0x4
+#define RQ_STOP    0x8
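+/* rq_state tracks the requeue thread: RQ_RUNNING while the thread is alive,
+ * RQ_NOW and RQ_LATER while work is pending, RQ_STOP on shutdown. All
+ * transitions are made under config_list_lock. */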
+static int               rq_state;
+static wait_queue_head_t rq_waitq;
+static DECLARE_COMPLETION(rq_exit);
+
+static void do_requeue(struct config_llog_data *cld)
+{
+       ENTRY;
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       /* Do not run mgc_process_log on a disconnected export or an
+          export which is being disconnected. Take the client
+          semaphore to make the check non-racy. */
+       down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+       if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) {
+               CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname);
+               mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
+       } else {
+               CDEBUG(D_MGC, "disconnecting, won't update log %s\n",
+                      cld->cld_logname);
+       }
+       up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
+
+       EXIT;
+}
+
+/* This timeout controls how many seconds the MGC waits before requeueing
+ * the config and recovery locks to the MGS. It is randomized so that
+ * clients do not all flood the MGS at once.
+ */
+#define MGC_TIMEOUT_MIN_SECONDS   5
+#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */
+
+static int mgc_requeue_thread(void *data)
+{
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_MGC, "Starting requeue thread\n");
+
+       /* Keep trying failed locks periodically */
+       spin_lock(&config_list_lock);
+       rq_state |= RQ_RUNNING;
+       while (1) {
+               struct l_wait_info lwi;
+               struct config_llog_data *cld, *cld_prev;
+               int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC;
+               int stopped = !!(rq_state & RQ_STOP);
+               int to;
+
+               /* Any new or requeued lostlocks will change the state */
+               rq_state &= ~(RQ_NOW | RQ_LATER);
+               spin_unlock(&config_list_lock);
+
+               /* Always wait a few seconds to allow the server that
+                  caused the lock revocation to finish its setup, plus
+                  some random delay so everyone doesn't reconnect at once. */
+               to = MGC_TIMEOUT_MIN_SECONDS * HZ;
+               to += rand * HZ / 100; /* rand is centi-seconds */
+               lwi = LWI_TIMEOUT(to, NULL, NULL);
+               l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi);
+
+               /*
+                * Iterate through the list; for each cld, first process its
+                * dependent sptlrpc cld (if any), then the cld itself.
+                *
+                * Any item on the list is guaranteed to hold a reference > 0,
+                * and if cld_lostlock is set, at least one reference was
+                * taken by the previous enqueue.
+                */
+               cld_prev = NULL;
+
+               spin_lock(&config_list_lock);
+               list_for_each_entry(cld, &config_llog_list,
+                                       cld_list_chain) {
+                       if (!cld->cld_lostlock)
+                               continue;
+
+                       spin_unlock(&config_list_lock);
+
+                       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+                       /* Whether we enqueued again or not in mgc_process_log,
+                        * we're done with the ref from the old enqueue */
+                       if (cld_prev)
+                               config_log_put(cld_prev);
+                       cld_prev = cld;
+
+                       cld->cld_lostlock = 0;
+                       if (likely(!stopped))
+                               do_requeue(cld);
+
+                       spin_lock(&config_list_lock);
+               }
+               spin_unlock(&config_list_lock);
+               if (cld_prev)
+                       config_log_put(cld_prev);
+
+               /* Break only after scanning the list so that we can drop the
+                * refcount on clds whose lock was lost */
+               if (unlikely(stopped)) {
+                       spin_lock(&config_list_lock);
+                       break;
+               }
+
+               /* Wait a bit to see if anyone else needs a requeue */
+               lwi = (struct l_wait_info) { 0 };
+               l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP),
+                            &lwi);
+               spin_lock(&config_list_lock);
+       }
+       /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */
+       rq_state &= ~RQ_RUNNING;
+       spin_unlock(&config_list_lock);
+
+       complete(&rq_exit);
+
+       CDEBUG(D_MGC, "Ending requeue thread\n");
+       RETURN(rc);
+}
+
+/* Add a cld to the list to requeue.  Start the requeue thread if needed.
+   We are responsible for dropping the config log reference from here on out. */
+static void mgc_requeue_add(struct config_llog_data *cld)
+{
+       ENTRY;
+
+       CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n",
+              cld->cld_logname, atomic_read(&cld->cld_refcount),
+              cld->cld_stopping, rq_state);
+       LASSERT(atomic_read(&cld->cld_refcount) > 0);
+
+       mutex_lock(&cld->cld_lock);
+       if (cld->cld_stopping || cld->cld_lostlock) {
+               mutex_unlock(&cld->cld_lock);
+               RETURN_EXIT;
+       }
+       /* this refcount will be released in mgc_requeue_thread. */
+       config_log_get(cld);
+       cld->cld_lostlock = 1;
+       mutex_unlock(&cld->cld_lock);
+
+       /* Hold lock for rq_state */
+       spin_lock(&config_list_lock);
+       if (rq_state & RQ_STOP) {
+               spin_unlock(&config_list_lock);
+               cld->cld_lostlock = 0;
+               config_log_put(cld);
+       } else {
+               rq_state |= RQ_NOW;
+               spin_unlock(&config_list_lock);
+               wake_up(&rq_waitq);
+       }
+       EXIT;
+}
+
+/********************** class fns **********************/
+
+static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb,
+                       struct vfsmount *mnt)
+{
+       struct lvfs_run_ctxt saved;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct client_obd *cli = &obd->u.cli;
+       struct dentry *dentry;
+       char *label;
+       int err = 0;
+       ENTRY;
+
+       LASSERT(lsi);
+       LASSERT(lsi->lsi_srv_mnt == mnt);
+
+       /* The mgc fs exclusion sem.  Only one fs can be set up at a time. */
+       down(&cli->cl_mgc_sem);
+
+       cfs_cleanup_group_info();
+
+       obd->obd_fsops = fsfilt_get_ops(lsi->lsi_fstype);
+       if (IS_ERR(obd->obd_fsops)) {
+               up(&cli->cl_mgc_sem);
+               CERROR("%s: No fstype %s: rc = %ld\n", lsi->lsi_fstype,
+                      obd->obd_name, PTR_ERR(obd->obd_fsops));
+               RETURN(PTR_ERR(obd->obd_fsops));
+       }
+
+       cli->cl_mgc_vfsmnt = mnt;
+       err = fsfilt_setup(obd, mnt->mnt_sb);
+       if (err)
+               GOTO(err_ops, err);
+
+       OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+       obd->obd_lvfs_ctxt.pwdmnt = mnt;
+       obd->obd_lvfs_ctxt.pwd = mnt->mnt_root;
+       obd->obd_lvfs_ctxt.fs = get_ds();
+
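+       /* push_ctxt/pop_ctxt temporarily switch to the server's fs context
+        * so the lookup below runs against the backing filesystem. */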
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+                                  strlen(MOUNT_CONFIGS_DIR));
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       if (IS_ERR(dentry)) {
+               err = PTR_ERR(dentry);
+               CERROR("cannot lookup %s directory: rc = %d\n",
+                      MOUNT_CONFIGS_DIR, err);
+               GOTO(err_ops, err);
+       }
+       cli->cl_mgc_configs_dir = dentry;
+
+       /* We take an obd ref to ensure that we can't get to mgc_cleanup
+          without calling mgc_fs_cleanup first. */
+       class_incref(obd, "mgc_fs", obd);
+
+       label = fsfilt_get_label(obd, mnt->mnt_sb);
+       if (label)
+               CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label);
+
+       /* We keep the cl_mgc_sem until mgc_fs_cleanup */
+       RETURN(0);
+
+err_ops:
+       fsfilt_put_ops(obd->obd_fsops);
+       obd->obd_fsops = NULL;
+       cli->cl_mgc_vfsmnt = NULL;
+       up(&cli->cl_mgc_sem);
+       RETURN(err);
+}
+
+static int mgc_fs_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt != NULL);
+
+       if (cli->cl_mgc_configs_dir != NULL) {
+               struct lvfs_run_ctxt saved;
+               push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               l_dput(cli->cl_mgc_configs_dir);
+               cli->cl_mgc_configs_dir = NULL;
+               pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               class_decref(obd, "mgc_fs", obd);
+       }
+
+       cli->cl_mgc_vfsmnt = NULL;
+       if (obd->obd_fsops)
+               fsfilt_put_ops(obd->obd_fsops);
+
+       up(&cli->cl_mgc_sem);
+
+       RETURN(rc);
+}
+
+static atomic_t mgc_count = ATOMIC_INIT(0);
+static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY:
+               break;
+       case OBD_CLEANUP_EXPORTS:
+               if (atomic_dec_and_test(&mgc_count)) {
+                       int running;
+                       /* stop requeue thread */
+                       spin_lock(&config_list_lock);
+                       running = rq_state & RQ_RUNNING;
+                       if (running)
+                               rq_state |= RQ_STOP;
+                       spin_unlock(&config_list_lock);
+                       if (running) {
+                               wake_up(&rq_waitq);
+                               wait_for_completion(&rq_exit);
+                       }
+               }
+               obd_cleanup_client_import(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+       }
+       RETURN(rc);
+}
+
+static int mgc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+       ENTRY;
+
+       LASSERT(cli->cl_mgc_vfsmnt == NULL);
+
+       /* COMPAT_146 - old config logs may have added profiles we don't
+          know about */
+       if (obd->obd_type->typ_refcnt <= 1)
+               /* Only for the last mgc */
+               class_del_profiles();
+
+       lprocfs_obd_cleanup(obd);
+       ptlrpcd_decref();
+
+       rc = client_obd_cleanup(obd);
+       RETURN(rc);
+}
+
+static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars;
+       int rc;
+       ENTRY;
+
+       ptlrpcd_addref();
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(err_decref, rc);
+
+       rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
+       if (rc) {
+               CERROR("failed to setup llogging subsystems\n");
+               GOTO(err_cleanup, rc);
+       }
+
+       lprocfs_mgc_init_vars(&lvars);
+       lprocfs_obd_setup(obd, lvars.obd_vars);
+       sptlrpc_lprocfs_cliobd_attach(obd);
+
+       if (atomic_inc_return(&mgc_count) == 1) {
+               rq_state = 0;
+               init_waitqueue_head(&rq_waitq);
+
+               /* start requeue thread */
+               rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL,
+                                            "ll_cfg_requeue"));
+               if (IS_ERR_VALUE(rc)) {
+                       CERROR("%s: Cannot start requeue thread (%d),"
+                              "no more log updates!\n",
+                              obd->obd_name, rc);
+                       GOTO(err_cleanup, rc);
+               }
+               /* On success rc holds the value of the mgc_requeue_thread
+                * task pointer; reset it to 0. */
+               rc = 0;
+       }
+
+       RETURN(rc);
+
+err_cleanup:
+       client_obd_cleanup(obd);
+err_decref:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+/* based on ll_mdc_blocking_ast */
+static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                           void *data, int flag)
+{
+       struct lustre_handle lockh;
+       struct config_llog_data *cld = (struct config_llog_data *)data;
+       int rc = 0;
+       ENTRY;
+
+       switch (flag) {
+       case LDLM_CB_BLOCKING:
+               /* mgs wants the lock, give it up... */
+               LDLM_DEBUG(lock, "MGC blocking CB");
+               ldlm_lock2handle(lock, &lockh);
+               rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
+               break;
+       case LDLM_CB_CANCELING:
+               /* We've given up the lock, prepare ourselves to update. */
+               LDLM_DEBUG(lock, "MGC cancel CB");
+
+               CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n",
+                      lock->l_resource->lr_name.name[0],
+                      (char *)&lock->l_resource->lr_name.name[0]);
+
+               if (!cld) {
+                       CDEBUG(D_INFO, "missing data, won't requeue\n");
+                       break;
+               }
+
+               /* held at mgc_process_log(). */
+               LASSERT(atomic_read(&cld->cld_refcount) > 0);
+               /* Are we done with this log? */
+               if (cld->cld_stopping) {
+                       CDEBUG(D_MGC, "log %s: stopping, won't requeue\n",
+                              cld->cld_logname);
+                       config_log_put(cld);
+                       break;
+               }
+               /* Make sure not to re-enqueue when the mgc is stopping
+                  (we get called from client_disconnect_export) */
+               if (!lock->l_conn_export ||
+                   !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) {
+                       CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n",
+                              cld->cld_logname);
+                       config_log_put(cld);
+                       break;
+               }
+
+               /* Re-enqueue now */
+               mgc_requeue_add(cld);
+               config_log_put(cld);
+               break;
+       default:
+               LBUG();
+       }
+
+       RETURN(rc);
+}
+
+/* Not sure where this should go... */
+#define  MGC_ENQUEUE_LIMIT 50
+#define  MGC_TARGET_REG_LIMIT 10
+#define  MGC_SEND_PARAM_LIMIT 10
+
+/* Send a parameter to the MGS */
+static int mgc_set_mgs_param(struct obd_export *exp,
+                            struct mgs_send_param *msp)
+{
+       struct ptlrpc_request *req;
+       struct mgs_send_param *req_msp, *rep_msp;
+       int rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION,
+                                       MGS_SET_INFO);
+       if (!req)
+               RETURN(-ENOMEM);
+
+       req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM);
+       if (!req_msp) {
+               ptlrpc_req_finished(req);
+               RETURN(-ENOMEM);
+       }
+
+       memcpy(req_msp, msp, sizeof(*req_msp));
+       ptlrpc_request_set_replen(req);
+
+       /* Limit how long we will wait for the enqueue to complete */
+       req->rq_delay_limit = MGC_SEND_PARAM_LIMIT;
+       rc = ptlrpc_queue_wait(req);
+       if (!rc) {
+               rep_msp = req_capsule_server_get(&req->rq_pill,
+                                                &RMF_MGS_SEND_PARAM);
+               memcpy(msp, rep_msp, sizeof(*rep_msp));
+       }
+
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+/* Take a config lock so we can get cancel notifications */
+static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
+                      __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                      __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb,
+                      void *data, __u32 lvb_len, void *lvb_swabber,
+                      struct lustre_handle *lockh)
+{
+       struct config_llog_data *cld = (struct config_llog_data *)data;
+       struct ldlm_enqueue_info einfo = { type, mode, mgc_blocking_ast,
+                                          ldlm_completion_ast, NULL, NULL,
+                                          NULL };
+       struct ptlrpc_request *req;
+       int short_limit = cld_is_sptlrpc(cld);
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
+              cld->cld_resid.name[0]);
+
+       /* We need a callback for every lockholder, so don't try to
+          ldlm_lock_match (see rev 1.1.2.11.2.47) */
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION,
+                                       LDLM_ENQUEUE);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
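+       /* Config locks carry no LVB data, so the server-side LVB buffer is
+        * sized to zero. */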
+       req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0);
+       ptlrpc_request_set_replen(req);
+
+       /* check if this is server or client */
+       if (cld->cld_cfg.cfg_sb) {
+               struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb);
+               if (lsi && IS_SERVER(lsi))
+                       short_limit = 1;
+       }
+       /* Limit how long we will wait for the enqueue to complete */
+       req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT;
+       rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags,
+                             NULL, 0, LVB_T_NONE, lockh, 0);
+       /* A failed enqueue should still call the mgc_blocking_ast,
+          where it will be requeued if needed ("grant failed"). */
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+
+       ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+static void mgc_notify_active(struct obd_device *unused)
+{
+       /* wakeup mgc_requeue_thread to requeue mgc lock */
+       spin_lock(&config_list_lock);
+       rq_state |= RQ_NOW;
+       spin_unlock(&config_list_lock);
+       wake_up(&rq_waitq);
+
+       /* TODO: Help the MGS rebuild nidtbl. -jay */
+}
+
+/* Send target_reg message to MGS */
+static int mgc_target_register(struct obd_export *exp,
+                              struct mgs_target_info *mti)
+{
+       struct ptlrpc_request  *req;
+       struct mgs_target_info *req_mti, *rep_mti;
+       int                     rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION,
+                                       MGS_TARGET_REG);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO);
+       if (!req_mti) {
+               ptlrpc_req_finished(req);
+               RETURN(-ENOMEM);
+       }
+
+       memcpy(req_mti, mti, sizeof(*req_mti));
+       ptlrpc_request_set_replen(req);
+       CDEBUG(D_MGC, "register %s\n", mti->mti_svname);
+       /* Limit how long we will wait for the enqueue to complete */
+       req->rq_delay_limit = MGC_TARGET_REG_LIMIT;
+
+       rc = ptlrpc_queue_wait(req);
+       if (!rc) {
+               rep_mti = req_capsule_server_get(&req->rq_pill,
+                                                &RMF_MGS_TARGET_INFO);
+               memcpy(mti, rep_mti, sizeof(*rep_mti));
+               CDEBUG(D_MGC, "register %s got index = %d\n",
+                      mti->mti_svname, mti->mti_stripe_index);
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                      obd_count keylen, void *key, obd_count vallen,
+                      void *val, struct ptlrpc_request_set *set)
+{
+       int rc = -EINVAL;
+       ENTRY;
+
+       /* Turn off initial_recov after we try all backup servers once */
+       if (KEY_IS(KEY_INIT_RECOV_BACKUP)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               int value;
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               value = *(int *)val;
+               CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n",
+                      imp->imp_obd->obd_name, value,
+                      imp->imp_deactive, imp->imp_invalid,
+                      imp->imp_replayable, imp->imp_obd->obd_replayable,
+                      ptlrpc_import_state_name(imp->imp_state));
+               /* Resurrect if we previously died */
+               if ((imp->imp_state != LUSTRE_IMP_FULL &&
+                    imp->imp_state != LUSTRE_IMP_NEW) || value > 1)
+                       ptlrpc_reconnect_import(imp);
+               RETURN(0);
+       }
+       /* FIXME move this to mgc_process_config */
+       if (KEY_IS(KEY_REGISTER_TARGET)) {
+               struct mgs_target_info *mti;
+               if (vallen != sizeof(struct mgs_target_info))
+                       RETURN(-EINVAL);
+               mti = (struct mgs_target_info *)val;
+               CDEBUG(D_MGC, "register_target %s %#x\n",
+                      mti->mti_svname, mti->mti_flags);
+               rc =  mgc_target_register(exp, mti);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SET_FS)) {
+               struct super_block *sb = (struct super_block *)val;
+               struct lustre_sb_info *lsi;
+               if (vallen != sizeof(struct super_block))
+                       RETURN(-EINVAL);
+               lsi = s2lsi(sb);
+               rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt);
+               if (rc) {
+                       CERROR("set_fs got %d\n", rc);
+               }
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_CLEAR_FS)) {
+               if (vallen != 0)
+                       RETURN(-EINVAL);
+               rc = mgc_fs_cleanup(exp->exp_obd);
+               if (rc) {
+                       CERROR("clear_fs got %d\n", rc);
+               }
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_SET_INFO)) {
+               struct mgs_send_param *msp;
+
+               msp = (struct mgs_send_param *)val;
+               rc =  mgc_set_mgs_param(exp, msp);
+               RETURN(rc);
+       }
+       if (KEY_IS(KEY_MGSSEC)) {
+               struct client_obd     *cli = &exp->exp_obd->u.cli;
+               struct sptlrpc_flavor  flvr;
+
+               /*
+                * An empty string means "use the current flavor"; if none has
+                * been set yet, set it to null.
+                *
+                * If a flavor has been set previously, the requested flavor
+                * must match the existing one.
+                */
+               if (vallen == 0) {
+                       if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID)
+                               RETURN(0);
+                       val = "null";
+                       vallen = 4;
+               }
+
+               rc = sptlrpc_parse_flavor(val, &flvr);
+               if (rc) {
+                       CERROR("invalid sptlrpc flavor %s to MGS\n",
+                              (char *) val);
+                       RETURN(rc);
+               }
+
+               /*
+                * The caller already holds a mutex.
+                */
+               if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) {
+                       cli->cl_flvr_mgc = flvr;
+               } else if (memcmp(&cli->cl_flvr_mgc, &flvr,
+                                 sizeof(flvr)) != 0) {
+                       char    str[20];
+
+                       sptlrpc_flavor2name(&cli->cl_flvr_mgc,
+                                           str, sizeof(str));
+                       LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but "
+                                      "currently %s is in use\n",
+                                      (char *) val, str);
+                       rc = -EPERM;
+               }
+               RETURN(rc);
+       }
+
+       RETURN(rc);
+}
+
+static int mgc_get_info(const struct lu_env *env, struct obd_export *exp,
+                       __u32 keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *unused)
+{
+       int rc = -EINVAL;
+
+       if (KEY_IS(KEY_CONN_DATA)) {
+               struct obd_import *imp = class_exp2cliimp(exp);
+               struct obd_connect_data *data = val;
+
+               if (*vallen == sizeof(*data)) {
+                       *data = imp->imp_connect_data;
+                       rc = 0;
+               }
+       }
+
+       return rc;
+}
+
+static int mgc_import_event(struct obd_device *obd,
+                           struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       int rc = 0;
+
+       LASSERT(imp->imp_obd == obd);
+       CDEBUG(D_MGC, "import event %#x\n", event);
+
+       switch (event) {
+       case IMP_EVENT_DISCON:
+               /* MGC imports should not wait for recovery */
+               if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+                       ptlrpc_pinger_ir_down();
+               break;
+       case IMP_EVENT_INACTIVE:
+               break;
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
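+               /* LDLM_FL_LOCAL_ONLY drops the cached locks locally without
+                * sending cancels over the now-invalid import. */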
+               ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+               break;
+       }
+       case IMP_EVENT_ACTIVE:
+               CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name);
+               /* Clearing obd_no_recov allows us to continue pinging */
+               obd->obd_no_recov = 0;
+               mgc_notify_active(obd);
+               if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV))
+                       ptlrpc_pinger_ir_up();
+               break;
+       case IMP_EVENT_OCD:
+               break;
+       case IMP_EVENT_DEACTIVATE:
+       case IMP_EVENT_ACTIVATE:
+               break;
+       default:
+               CERROR("Unknown import event %#x\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *tgt, int *index)
+{
+       struct llog_ctxt *ctxt;
+       int rc;
+       ENTRY;
+
+       LASSERT(olg == &obd->obd_olg);
+
+       rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt,
+                       &llog_client_ops);
+       if (rc)
+               GOTO(out, rc);
+
+       ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT);
+       if (!ctxt)
+               GOTO(out, rc = -ENODEV);
+
+       llog_initiator_connect(ctxt);
+       llog_ctxt_put(ctxt);
+
+       RETURN(0);
+out:
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(rc);
+}
+
+static int mgc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(0);
+}
+
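+/* Pages per CONFIG_READ bulk transfer: the initial read fetches up to
+ * 1 MB (1 << 20 bytes) of log data; later incremental reads use 4 pages. */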
+enum {
+       CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT),
+       CONFIG_READ_NRPAGES      = 4
+};
+
+static int mgc_apply_recover_logs(struct obd_device *mgc,
+                                 struct config_llog_data *cld,
+                                 __u64 max_version,
+                                 void *data, int datalen, bool mne_swab)
+{
+       struct config_llog_instance *cfg = &cld->cld_cfg;
+       struct lustre_sb_info       *lsi = s2lsi(cfg->cfg_sb);
+       struct mgs_nidtbl_entry *entry;
+       struct lustre_cfg       *lcfg;
+       struct lustre_cfg_bufs   bufs;
+       u64   prev_version = 0;
+       char *inst;
+       char *buf;
+       int   bufsz;
+       int   pos;
+       int   rc  = 0;
+       int   off = 0;
+       ENTRY;
+
+       LASSERT(cfg->cfg_instance != NULL);
+       LASSERT(cfg->cfg_sb == cfg->cfg_instance);
+
+       OBD_ALLOC(inst, PAGE_CACHE_SIZE);
+       if (inst == NULL)
+               RETURN(-ENOMEM);
+
+       if (!IS_SERVER(lsi)) {
+               pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
+               if (pos >= PAGE_CACHE_SIZE) {
+                       OBD_FREE(inst, PAGE_CACHE_SIZE);
+                       RETURN(-E2BIG);
+               }
+       } else {
+               LASSERT(IS_MDT(lsi));
+               rc = server_name2svname(lsi->lsi_svname, inst, NULL,
+                                       PAGE_CACHE_SIZE);
+               if (rc) {
+                       OBD_FREE(inst, PAGE_CACHE_SIZE);
+                       RETURN(-EINVAL);
+               }
+               pos = strlen(inst);
+       }
+
+       ++pos;
+       buf   = inst + pos;
+       bufsz = PAGE_CACHE_SIZE - pos;
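+       /* the rest of the page after the instance string is scratch space
+        * for building the obdname and import parameter strings */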
+
+       while (datalen > 0) {
+               int   entry_len = sizeof(*entry);
+               int   is_ost;
+               struct obd_device *obd;
+               char *obdname;
+               char *cname;
+               char *params;
+               char *uuid;
+
+               rc = -EINVAL;
+               if (datalen < sizeof(*entry))
+                       break;
+
+               entry = (typeof(entry))(data + off);
+
+               /* sanity check */
+               if (entry->mne_nid_type != 0) /* only type 0 (ipv4) supported */
+                       break;
+               if (entry->mne_nid_count == 0) /* at least one nid entry */
+                       break;
+               if (entry->mne_nid_size != sizeof(lnet_nid_t))
+                       break;
+
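+               /* each record is a struct mgs_nidtbl_entry followed by
+                * mne_nid_count NIDs of mne_nid_size bytes each */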
+               entry_len += entry->mne_nid_count * entry->mne_nid_size;
+               if (datalen < entry_len) /* need at least entry_len bytes */
+                       break;
+
+               /* Keep this swab for normal mixed endian handling. LU-1644 */
+               if (mne_swab)
+                       lustre_swab_mgs_nidtbl_entry(entry);
+               if (entry->mne_length > PAGE_CACHE_SIZE) {
+                       CERROR("MNE too large (%u)\n", entry->mne_length);
+                       break;
+               }
+
+               if (entry->mne_length < entry_len)
+                       break;
+
+               off     += entry->mne_length;
+               datalen -= entry->mne_length;
+               if (datalen < 0)
+                       break;
+
+               if (entry->mne_version > max_version) {
+                       CERROR("entry index(%lld) is over max_index(%lld)\n",
+                              entry->mne_version, max_version);
+                       break;
+               }
+
+               if (prev_version >= entry->mne_version) {
+                       CERROR("index unsorted, prev %lld, now %lld\n",
+                              prev_version, entry->mne_version);
+                       break;
+               }
+               prev_version = entry->mne_version;
+
+               /*
+                * Write a string with format "nid::instance" to
+                * lustre/<osc|mdc>/<target>-<osc|mdc>-<instance>/import.
+                */
+
+               is_ost = entry->mne_type == LDD_F_SV_TYPE_OST;
+               memset(buf, 0, bufsz);
+               obdname = buf;
+               pos = 0;
+
+               /* lustre-OST0001-osc-<instance #> */
+               strcpy(obdname, cld->cld_logname);
+               cname = strrchr(obdname, '-');
+               if (cname == NULL) {
+                       CERROR("mgc %s: invalid logname %s\n",
+                              mgc->obd_name, obdname);
+                       break;
+               }
+
+               pos = cname - obdname;
+               obdname[pos] = 0;
+               pos += sprintf(obdname + pos, "-%s%04x",
+                                 is_ost ? "OST" : "MDT", entry->mne_index);
+
+               cname = is_ost ? "osc" : "mdc";
+               pos += sprintf(obdname + pos, "-%s-%s", cname, inst);
+               lustre_cfg_bufs_reset(&bufs, obdname);
+
+               /* find the obd by obdname */
+               obd = class_name2obd(obdname);
+               if (obd == NULL) {
+                       CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n",
+                              mgc->obd_name, obdname);
+                       rc = 0;
+                       /* this is a safe race, when the ost is starting up... */
+                       continue;
+               }
+
+               /* osc.import = "connection=<Conn UUID>::<target instance>" */
+               ++pos;
+               params = buf + pos;
+               pos += sprintf(params, "%s.import=%s", cname, "connection=");
+               uuid = buf + pos;
+
+               down_read(&obd->u.cli.cl_sem);
+               if (obd->u.cli.cl_import == NULL) {
+                       /* client does not connect to the OST yet */
+                       up_read(&obd->u.cli.cl_sem);
+                       rc = 0;
+                       continue;
+               }
+
+               /* TODO: iterate all nids to find one */
+               /* find uuid by nid */
+               rc = client_import_find_conn(obd->u.cli.cl_import,
+                                            entry->u.nids[0],
+                                            (struct obd_uuid *)uuid);
+               up_read(&obd->u.cli.cl_sem);
+               if (rc < 0) {
+                       CERROR("mgc: cannot find uuid by nid %s\n",
+                              libcfs_nid2str(entry->u.nids[0]));
+                       break;
+               }
+
+               CDEBUG(D_INFO, "Found uuid %s by nid %s\n",
+                      uuid, libcfs_nid2str(entry->u.nids[0]));
+
+               pos += strlen(uuid);
+               pos += sprintf(buf + pos, "::%u", entry->mne_instance);
+               LASSERT(pos < bufsz);
+
+               lustre_cfg_bufs_set_string(&bufs, 1, params);
+
+               rc = -ENOMEM;
+               lcfg = lustre_cfg_new(LCFG_PARAM, &bufs);
+               if (lcfg == NULL) {
+                       CERROR("mgc: cannot allocate memory\n");
+                       break;
+               }
+
+               CDEBUG(D_INFO, "ir apply logs "LPD64"/"LPD64" for %s -> %s\n",
+                      prev_version, max_version, obdname, params);
+
+               rc = class_process_config(lcfg);
+               lustre_cfg_free(lcfg);
+               if (rc)
+                       CDEBUG(D_INFO, "process config for %s error %d\n",
+                              obdname, rc);
+
+               /* continue, even one with error */
+       }
+
+       OBD_FREE(inst, PAGE_CACHE_SIZE);
+       RETURN(rc);
+}
+
+/**
+ * This function is called when the MGS notifies this client that a target
+ * has restarted. A CONFIG_READ RPC is sent to fetch the recovery logs.
+ */
+static int mgc_process_recover_log(struct obd_device *obd,
+                                  struct config_llog_data *cld)
+{
+       struct ptlrpc_request *req = NULL;
+       struct config_llog_instance *cfg = &cld->cld_cfg;
+       struct mgs_config_body *body;
+       struct mgs_config_res  *res;
+       struct ptlrpc_bulk_desc *desc;
+       struct page **pages;
+       int nrpages;
+       bool eof = true;
+       bool mne_swab = false;
+       int i;
+       int ealen;
+       int rc;
+       ENTRY;
+
+       /* Allocate a buffer for the bulk transfer.
+        * If this is the first time this MGC reads logs,
+        * CONFIG_READ_NRPAGES_INIT is used, since all logs are read at once;
+        * otherwise only the increment of the logs is read, which should be
+        * small, so CONFIG_READ_NRPAGES is used.
+        */
+       nrpages = CONFIG_READ_NRPAGES;
+       if (cfg->cfg_last_idx == 0) /* the first time */
+               nrpages = CONFIG_READ_NRPAGES_INIT;
+
+       OBD_ALLOC(pages, sizeof(*pages) * nrpages);
+       if (pages == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < nrpages; i++) {
+               pages[i] = alloc_page(GFP_IOFS);
+               if (pages[i] == NULL)
+                       GOTO(out, rc = -ENOMEM);
+       }
+
+again:
+       LASSERT(cld_is_recover(cld));
+       LASSERT(mutex_is_locked(&cld->cld_lock));
+       req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp),
+                                  &RQF_MGS_CONFIG_READ);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ);
+       if (rc)
+               GOTO(out, rc);
+
+       /* pack request */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY);
+       LASSERT(body != NULL);
+       LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname));
+       if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name))
+           >= sizeof(body->mcb_name))
+               GOTO(out, rc = -E2BIG);
+       body->mcb_offset = cfg->cfg_last_idx + 1;
+       body->mcb_type   = cld->cld_type;
+       body->mcb_bits   = PAGE_CACHE_SHIFT;
+       body->mcb_units  = nrpages;
+
+       /* allocate bulk transfer descriptor */
+       desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK,
+                                   MGS_BULK_PORTAL);
+       if (desc == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < nrpages; i++)
+               ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES);
+       if (res->mcr_size < res->mcr_offset)
+               GOTO(out, rc = -EINVAL);
+
+       /* always update the index, even if errors occurred while
+        * handling the recovery logs */
+       cfg->cfg_last_idx = res->mcr_offset;
+       eof = res->mcr_offset == res->mcr_size;
+
+       CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n",
+              res->mcr_offset, eof == false);
+
+       ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0);
+       if (ealen < 0)
+               GOTO(out, rc = ealen);
+
+       if (ealen > nrpages << PAGE_CACHE_SHIFT)
+               GOTO(out, rc = -EINVAL);
+
+       if (ealen == 0) { /* no logs transferred */
+               if (!eof)
+                       rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       mne_swab = !!ptlrpc_rep_need_swab(req);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+       /* This import flag means the server did an extra swab of IR MNE
+        * records (fixed in LU-1252), reverse it here if needed. LU-1644 */
+       if (unlikely(req->rq_import->imp_need_mne_swab))
+               mne_swab = !mne_swab;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+       for (i = 0; i < nrpages && ealen > 0; i++) {
+               int rc2;
+               void *ptr;
+
+               ptr = kmap(pages[i]);
+               rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
+                                            min_t(int, ealen, PAGE_CACHE_SIZE),
+                                            mne_swab);
+               kunmap(pages[i]);
+               if (rc2 < 0) {
+                       CWARN("Process recover log %s error %d\n",
+                             cld->cld_logname, rc2);
+                       break;
+               }
+
+               ealen -= PAGE_CACHE_SIZE;
+       }
+
+out:
+       if (req)
+               ptlrpc_req_finished(req);
+
+       if (rc == 0 && !eof)
+               goto again;
+
+       if (pages) {
+               for (i = 0; i < nrpages; i++) {
+                       if (pages[i] == NULL)
+                               break;
+                       __free_page(pages[i]);
+               }
+               OBD_FREE(pages, sizeof(*pages) * nrpages);
+       }
+       return rc;
+}
+
+/* local_only means it cannot get remote llogs */
+static int mgc_process_cfg_log(struct obd_device *mgc,
+                              struct config_llog_data *cld,
+                              int local_only)
+{
+       struct llog_ctxt *ctxt, *lctxt = NULL;
+       struct lvfs_run_ctxt *saved_ctxt;
+       struct lustre_sb_info *lsi = NULL;
+       int rc = 0, must_pop = 0;
+       bool sptlrpc_started = false;
+
+       ENTRY;
+
+       LASSERT(cld);
+       LASSERT(mutex_is_locked(&cld->cld_lock));
+
+       /*
+        * local copy of sptlrpc log is controlled elsewhere, don't try to
+        * read it up here.
+        */
+       if (cld_is_sptlrpc(cld) && local_only)
+               RETURN(0);
+
+       if (cld->cld_cfg.cfg_sb)
+               lsi = s2lsi(cld->cld_cfg.cfg_sb);
+
+       ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT);
+       if (!ctxt) {
+               CERROR("missing llog context\n");
+               RETURN(-EINVAL);
+       }
+
+       OBD_ALLOC_PTR(saved_ctxt);
+       if (saved_ctxt == NULL)
+               RETURN(-ENOMEM);
+
+       lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT);
+
+       if (local_only) { /* no local log at client side */
+               GOTO(out_pop, rc = -EIO);
+       }
+
+       if (cld_is_sptlrpc(cld)) {
+               sptlrpc_conf_log_update_begin(cld->cld_logname);
+               sptlrpc_started = true;
+       }
+
+       /* logname and instance info should be the same, so use our
+        * copy of the instance for the update.  The cfg_last_idx will
+        * be updated here. */
+       rc = class_config_parse_llog(NULL, ctxt, cld->cld_logname,
+                                    &cld->cld_cfg);
+       EXIT;
+
+out_pop:
+       llog_ctxt_put(ctxt);
+       if (lctxt)
+               llog_ctxt_put(lctxt);
+       if (must_pop)
+               pop_ctxt(saved_ctxt, &mgc->obd_lvfs_ctxt, NULL);
+
+       OBD_FREE_PTR(saved_ctxt);
+       /*
+        * Update settings on existing OBDs.  This is done inside the
+        * llog_process_lock so that no device is attaching/detaching
+        * in parallel.  The logname must be <fsname>-sptlrpc.
+        */
+       if (sptlrpc_started) {
+               LASSERT(cld_is_sptlrpc(cld));
+               sptlrpc_conf_log_update_end(cld->cld_logname);
+               class_notify_sptlrpc_conf(cld->cld_logname,
+                                         strlen(cld->cld_logname) -
+                                         strlen("-sptlrpc"));
+       }
+
+       RETURN(rc);
+}
+
+/** Get a config log from the MGS and process it.
+ * This function is called for both clients and servers.
+ * Copy the log locally before parsing it if appropriate (non-MGS server).
+ */
+int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
+{
+       struct lustre_handle lockh = { 0 };
+       __u64 flags = LDLM_FL_NO_LRU;
+       int rc = 0, rcl;
+       ENTRY;
+
+       LASSERT(cld);
+
+       /* I don't want multiple processes running process_log at once --
+        * sounds like badness.  It actually might be fine, as long as
+        * we're not trying to update from the same log
+        * simultaneously (in which case we should use a per-log sem.) */
+       mutex_lock(&cld->cld_lock);
+       if (cld->cld_stopping) {
+               mutex_unlock(&cld->cld_lock);
+               RETURN(0);
+       }
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
+
+       CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
+              cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
+
+       /* Get the cfg lock on the llog */
+       rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL,
+                         LCK_CR, &flags, NULL, NULL, NULL,
+                         cld, 0, NULL, &lockh);
+       if (rcl == 0) {
+               /* Get the cld, it will be released in mgc_blocking_ast. */
+               config_log_get(cld);
+               rc = ldlm_lock_set_data(&lockh, (void *)cld);
+               LASSERT(rc == 0);
+       } else {
+               CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
+
+               /* mark cld_lostlock so that it will requeue
+                * after MGC becomes available. */
+               cld->cld_lostlock = 1;
+               /* Get extra reference, it will be put in requeue thread */
+               config_log_get(cld);
+       }
+
+       if (cld_is_recover(cld)) {
+               rc = 0; /* this is not a fatal error for recover log */
+               if (rcl == 0)
+                       rc = mgc_process_recover_log(mgc, cld);
+       } else {
+               rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
+       }
+
+       CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
+              mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
+
+       mutex_unlock(&cld->cld_lock);
+
+       /* Now drop the lock so MGS can revoke it */
+       if (!rcl) {
+               rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL,
+                                LCK_CR, &lockh);
+               if (rcl)
+                       CERROR("Can't drop cfg lock: %d\n", rcl);
+       }
+
+       RETURN(rc);
+}
+
+/** Called from lustre_process_log.
+ * LCFG_LOG_START gets the config log from the MGS, processes it to start
+ * any services, and adds it to the list of logs to watch (follow).
+ */
+static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       struct lustre_cfg *lcfg = buf;
+       struct config_llog_instance *cfg = NULL;
+       char *logname;
+       int rc = 0;
+       ENTRY;
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_LOV_ADD_OBD: {
+               /* Overloading this cfg command: register a new target */
+               struct mgs_target_info *mti;
+
+               if (LUSTRE_CFG_BUFLEN(lcfg, 1) !=
+                   sizeof(struct mgs_target_info))
+                       GOTO(out, rc = -EINVAL);
+
+               mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1);
+               CDEBUG(D_MGC, "add_target %s %#x\n",
+                      mti->mti_svname, mti->mti_flags);
+               rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti);
+               break;
+       }
+       case LCFG_LOV_DEL_OBD:
+               /* Unregister has no meaning at the moment. */
+               CERROR("lov_del_obd unimplemented\n");
+               rc = -ENOSYS;
+               break;
+       case LCFG_SPTLRPC_CONF: {
+               rc = sptlrpc_process_config(lcfg);
+               break;
+       }
+       case LCFG_LOG_START: {
+               struct config_llog_data *cld;
+               struct super_block *sb;
+
+               logname = lustre_cfg_string(lcfg, 1);
+               cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2);
+               sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3);
+
+               CDEBUG(D_MGC, "parse_log %s from %d\n", logname,
+                      cfg->cfg_last_idx);
+
+               /* We're only called through here on the initial mount */
+               rc = config_log_add(obd, logname, cfg, sb);
+               if (rc)
+                       break;
+               cld = config_log_find(logname, cfg);
+               if (cld == NULL) {
+                       rc = -ENOENT;
+                       break;
+               }
+
+               /* COMPAT_146 */
+               /* FIXME only set this for old logs!  Right now this forces
+                * us to always skip the "inside markers" check */
+               cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146;
+
+               rc = mgc_process_log(obd, cld);
+               if (rc == 0 && cld->cld_recover != NULL) {
+                       if (OCD_HAS_FLAG(&obd->u.cli.cl_import->
+                                        imp_connect_data, IMP_RECOV)) {
+                               rc = mgc_process_log(obd, cld->cld_recover);
+                       } else {
+                               struct config_llog_data *cir = cld->cld_recover;
+                               cld->cld_recover = NULL;
+                               config_log_put(cir);
+                       }
+                       if (rc)
+                               CERROR("Cannot process recover llog %d\n", rc);
+               }
+               config_log_put(cld);
+
+               break;
+       }
+       case LCFG_LOG_END: {
+               logname = lustre_cfg_string(lcfg, 1);
+
+               if (lcfg->lcfg_bufcount >= 2)
+                       cfg = (struct config_llog_instance *)lustre_cfg_buf(
+                               lcfg, 2);
+               rc = config_log_end(logname, cfg);
+               break;
+       }
+       default: {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               GOTO(out, rc = -EINVAL);
+       }
+       }
+out:
+       RETURN(rc);
+}
+
+struct obd_ops mgc_obd_ops = {
+       .o_owner        = THIS_MODULE,
+       .o_setup        = mgc_setup,
+       .o_precleanup   = mgc_precleanup,
+       .o_cleanup      = mgc_cleanup,
+       .o_add_conn     = client_import_add_conn,
+       .o_del_conn     = client_import_del_conn,
+       .o_connect      = client_connect_import,
+       .o_disconnect   = client_disconnect_export,
+       /* .o_enqueue     = mgc_enqueue, */
+       .o_cancel       = mgc_cancel,
+       /* .o_iocontrol   = mgc_iocontrol, */
+       .o_set_info_async = mgc_set_info_async,
+       .o_get_info       = mgc_get_info,
+       .o_import_event = mgc_import_event,
+       .o_llog_init    = mgc_llog_init,
+       .o_llog_finish  = mgc_llog_finish,
+       .o_process_config = mgc_process_config,
+};
+
+int __init mgc_init(void)
+{
+       return class_register_type(&mgc_obd_ops, NULL, NULL,
+                                  LUSTRE_MGC_NAME, NULL);
+}
+
+static void /*__exit*/ mgc_exit(void)
+{
+       class_unregister_type(LUSTRE_MGC_NAME);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Management Client");
+MODULE_LICENSE("GPL");
+
+module_init(mgc_init);
+module_exit(mgc_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644 (file)
index 0000000..b80c13c
--- /dev/null
@@ -0,0 +1,13 @@
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o
+
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+             llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+             genops.o uuid.o llog_ioctl.o lprocfs_status.o                \
+             lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \
+             local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
+             mea.o lu_object.o dt_object.o capa.o cl_object.o   \
+             cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o           \
+             lu_ucred.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644 (file)
index 0000000..c2a6702
--- /dev/null
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <lu_object.h>
+#include <lustre_acl.h>
+#include <lustre_eacl.h>
+#include <obd_support.h>
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+
+enum {
+       ES_UNK  = 0,    /* unknown stat */
+       ES_UNC  = 1,    /* ACL entry is not changed */
+       ES_MOD  = 2,    /* ACL entry is modified */
+       ES_ADD  = 3,    /* ACL entry is added */
+       ES_DEL  = 4     /* ACL entry is deleted */
+};
+
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+                                           ext_acl_xattr_entry *s)
+{
+       d->e_tag  = le16_to_cpu(s->e_tag);
+       d->e_perm = le16_to_cpu(s->e_perm);
+       d->e_id   = le32_to_cpu(s->e_id);
+       d->e_stat = le32_to_cpu(s->e_stat);
+}
+
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+                                           ext_acl_xattr_entry *s)
+{
+       d->e_tag  = cpu_to_le16(s->e_tag);
+       d->e_perm = cpu_to_le16(s->e_perm);
+       d->e_id   = cpu_to_le32(s->e_id);
+       d->e_stat = cpu_to_le32(s->e_stat);
+}
+
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+                                             posix_acl_xattr_entry *s)
+{
+       d->e_tag  = le16_to_cpu(s->e_tag);
+       d->e_perm = le16_to_cpu(s->e_perm);
+       d->e_id   = le32_to_cpu(s->e_id);
+}
+
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+                                             posix_acl_xattr_entry *s)
+{
+       d->e_tag  = cpu_to_le16(s->e_tag);
+       d->e_perm = cpu_to_le16(s->e_perm);
+       d->e_id   = cpu_to_le32(s->e_id);
+}
+
+/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+                                              int old_count, int new_count)
+{
+       int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+       int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+       posix_acl_xattr_header *new;
+
+       if (unlikely(old_count <= new_count))
+               return old_size;
+
+       OBD_ALLOC(new, new_size);
+       if (unlikely(new == NULL))
+               return -ENOMEM;
+
+       memcpy(new, *header, new_size);
+       OBD_FREE(*header, old_size);
+       *header = new;
+       return new_size;
+}
+
+/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+                                            int old_count)
+{
+       int ext_count = le32_to_cpu((*header)->a_count);
+       int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+       int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr);
+       ext_acl_xattr_header *new;
+
+       if (unlikely(old_count <= ext_count))
+               return 0;
+
+       OBD_ALLOC(new, ext_size);
+       if (unlikely(new == NULL))
+               return -ENOMEM;
+
+       memcpy(new, *header, ext_size);
+       OBD_FREE(*header, old_size);
+       *header = new;
+       return 0;
+}
+
+/*
+ * Generate new extended ACL based on the posix ACL.
+ */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+       int count, i, esize;
+       ext_acl_xattr_header *new;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(ERR_PTR(-EINVAL));
+       else if (!size)
+               count = 0;
+       else
+               count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr);
+       OBD_ALLOC(new, esize);
+       if (unlikely(new == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       new->a_count = cpu_to_le32(count);
+       for (i = 0; i < count; i++) {
+               new->a_entries[i].e_tag  = header->a_entries[i].e_tag;
+               new->a_entries[i].e_perm = header->a_entries[i].e_perm;
+               new->a_entries[i].e_id   = header->a_entries[i].e_id;
+               new->a_entries[i].e_stat = cpu_to_le32(ES_UNK);
+       }
+
+       RETURN(new);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/*
+ * Filter out the "nobody" entries in the posix ACL.
+ */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+                                 posix_acl_xattr_header **out)
+{
+       int count, i, j, rc = 0;
+       __u32 id;
+       posix_acl_xattr_header *new;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(-EINVAL);
+       else if (!size)
+               RETURN(0);
+
+       OBD_ALLOC(new, size);
+       if (unlikely(new == NULL))
+               RETURN(-ENOMEM);
+
+       new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+       count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       for (i = 0, j = 0; i < count; i++) {
+               id = le32_to_cpu(header->a_entries[i].e_id);
+               switch (le16_to_cpu(header->a_entries[i].e_tag)) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       if (id != ACL_UNDEFINED_ID)
+                               GOTO(_out, rc = -EIO);
+
+                       memcpy(&new->a_entries[j++], &header->a_entries[i],
+                              sizeof(posix_acl_xattr_entry));
+                       break;
+               case ACL_USER:
+                       if (id != NOBODY_UID)
+                               memcpy(&new->a_entries[j++],
+                                      &header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+                       break;
+               case ACL_GROUP:
+                       if (id != NOBODY_GID)
+                               memcpy(&new->a_entries[j++],
+                                      &header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+                       break;
+               default:
+                       GOTO(_out, rc = -EIO);
+               }
+       }
+
+       /* free unused space. */
+       rc = lustre_posix_acl_xattr_reduce_space(&new, count, j);
+       if (rc >= 0) {
+               size = rc;
+               *out = new;
+               rc = 0;
+       }
+       EXIT;
+
+_out:
+       if (rc) {
+               OBD_FREE(new, size);
+               size = rc;
+       }
+       return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space.
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+       OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/*
+ * Release the extended ACL space.
+ */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+       OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \
+                                           ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+
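+/*
+ * Search the extended ACL for an entry matching (e_tag, e_id).  The scan
+ * starts at *pos and wraps around once; on a hit *pos is advanced past the
+ * match, so lookups that walk the ACL in order stay cheap.
+ */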
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+                           posix_acl_xattr_entry *entry, int *pos)
+{
+       int once, start, end, i, j, count = le32_to_cpu(header->a_count);
+
+       once = 0;
+       start = *pos;
+       end = count;
+
+again:
+       for (i = start; i < end; i++) {
+               if (header->a_entries[i].e_tag == entry->e_tag &&
+                   header->a_entries[i].e_id == entry->e_id) {
+                       j = i;
+                       if (++i >= count)
+                               i = 0;
+                       *pos = i;
+                       return &header->a_entries[j];
+               }
+       }
+
+       if (!once) {
+               once = 1;
+               start = 0;
+               end = *pos;
+               goto again;
+       }
+
+       return NULL;
+}
+
+/*
+ * Merge the posix ACL and the extended ACL into new posix ACL.
+ */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+                                ext_acl_xattr_header *ext_header,
+                                posix_acl_xattr_header **out)
+{
+       int posix_count, posix_size, i, j;
+       int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+       posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+       posix_acl_xattr_header *new;
+       ext_acl_xattr_entry *ee, ae;
+       ENTRY;
+
+       lustre_posix_acl_cpu_to_le(&pe, &pe);
+       ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+       if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+               /* at most the base ACL entries remain */
+               posix_count = 3;
+               posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+               OBD_ALLOC(new, posix_size);
+               if (unlikely(new == NULL))
+                       RETURN(-ENOMEM);
+
+               new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+               for (i = 0, j = 0; i < ext_count; i++) {
+                       lustre_ext_acl_le_to_cpu(&ae,
+                                                &ext_header->a_entries[i]);
+                       switch (ae.e_tag) {
+                       case ACL_USER_OBJ:
+                       case ACL_GROUP_OBJ:
+                       case ACL_OTHER:
+                               if (ae.e_id != ACL_UNDEFINED_ID)
+                                       GOTO(_out, rc = -EIO);
+
+                               if (ae.e_stat != ES_DEL) {
+                                       new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                                       new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                                       new->a_entries[j++].e_id =
+                                               ext_header->a_entries[i].e_id;
+                               }
+                               break;
+                       case ACL_MASK:
+                       case ACL_USER:
+                       case ACL_GROUP:
+                               if (ae.e_stat == ES_DEL)
+                                       break;
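+                               /* fall through: a non-deleted MASK, USER or
+                                * GROUP entry is invalid once the mask has
+                                * been deleted */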
+                       default:
+                               GOTO(_out, rc = -EIO);
+                       }
+               }
+       } else {
+               /* there may be valid ACL_USER or ACL_GROUP entries in the
+                * original server-side ACL; they are regarded as ES_UNC stat. */
+               int ori_posix_count;
+
+               if (unlikely(size < 0))
+                       RETURN(-EINVAL);
+               else if (!size)
+                       ori_posix_count = 0;
+               else
+                       ori_posix_count =
+                               CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+               posix_count = ori_posix_count + ext_count;
+               posix_size =
+                       CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+               OBD_ALLOC(new, posix_size);
+               if (unlikely(new == NULL))
+                       RETURN(-ENOMEM);
+
+               new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+               /* 1. process the unchanged ACL entries
+                *    in the original server-side ACL. */
+               pos = 0;
+               for (i = 0, j = 0; i < ori_posix_count; i++) {
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee == NULL)
+                               memcpy(&new->a_entries[j++],
+                                      &posix_header->a_entries[i],
+                                      sizeof(posix_acl_xattr_entry));
+               }
+
+               /* 2. process the non-deleted entries
+                *    from client-side extended ACL. */
+               for (i = 0; i < ext_count; i++) {
+                       if (le32_to_cpu(ext_header->a_entries[i].e_stat) !=
+                           ES_DEL) {
+                               new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                               new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                               new->a_entries[j++].e_id =
+                                               ext_header->a_entries[i].e_id;
+                       }
+               }
+       }
+
+       /* free unused space. */
+       rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+       if (rc >= 0) {
+               posix_size = rc;
+               *out = new;
+               rc = 0;
+       }
+       EXIT;
+
+_out:
+       if (rc) {
+               OBD_FREE(new, posix_size);
+               posix_size = rc;
+       }
+       return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/*
+ * Merge the posix ACL and the extended ACL into new extended ACL.
+ */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+                          ext_acl_xattr_header *ext_header)
+{
+       int ori_ext_count, posix_count, ext_count, ext_size;
+       int i, j, pos = 0, rc = 0;
+       posix_acl_xattr_entry pae;
+       ext_acl_xattr_header *new;
+       ext_acl_xattr_entry *ee, eae;
+       ENTRY;
+
+       if (unlikely(size < 0))
+               RETURN(ERR_PTR(-EINVAL));
+       else if (!size)
+               posix_count = 0;
+       else
+               posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+       ori_ext_count = le32_to_cpu(ext_header->a_count);
+       ext_count = posix_count + ori_ext_count;
+       ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+       OBD_ALLOC(new, ext_size);
+       if (unlikely(new == NULL))
+               RETURN(ERR_PTR(-ENOMEM));
+
+       for (i = 0, j = 0; i < posix_count; i++) {
+               lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+               switch (pae.e_tag) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       if (pae.e_id != ACL_UNDEFINED_ID)
+                               GOTO(out, rc = -EIO);
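+                       /* fall through: base entries are handled like
+                        * ACL_USER below; their id (ACL_UNDEFINED_ID)
+                        * never matches NOBODY_UID */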
+               case ACL_USER:
+                       /* ignore "nobody" entry. */
+                       if (pae.e_id == NOBODY_UID)
+                               break;
+
+                       new->a_entries[j].e_tag =
+                                       posix_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                       posix_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id =
+                                       posix_header->a_entries[i].e_id;
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee) {
+                               if (posix_header->a_entries[i].e_perm !=
+                                                               ee->e_perm)
+                                       /* entry modified. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_MOD);
+                               else
+                                       /* entry unchanged. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_UNC);
+                       } else {
+                               /* new entry. */
+                               new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_ADD);
+                       }
+                       break;
+               case ACL_GROUP:
+                       /* ignore "nobody" entry. */
+                       if (pae.e_id == NOBODY_GID)
+                               break;
+                       new->a_entries[j].e_tag =
+                                       posix_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                       posix_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id =
+                                       posix_header->a_entries[i].e_id;
+                       ee = lustre_ext_acl_xattr_search(ext_header,
+                                       &posix_header->a_entries[i], &pos);
+                       if (ee) {
+                               if (posix_header->a_entries[i].e_perm !=
+                                                               ee->e_perm)
+                                       /* entry modified. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_MOD);
+                               else
+                                       /* entry unchanged. */
+                                       ee->e_stat =
+                                       new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_UNC);
+                       } else {
+                               /* new entry. */
+                               new->a_entries[j++].e_stat =
+                                                       cpu_to_le32(ES_ADD);
+                       }
+                       break;
+               default:
+                       GOTO(out, rc = -EIO);
+               }
+       }
+
+       /* process deleted entries. */
+       for (i = 0; i < ori_ext_count; i++) {
+               lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+               if (eae.e_stat == ES_UNK) {
+                       /* ignore "nobody" entry. */
+                       if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+                           (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+                               continue;
+
+                       new->a_entries[j].e_tag =
+                                               ext_header->a_entries[i].e_tag;
+                       new->a_entries[j].e_perm =
+                                               ext_header->a_entries[i].e_perm;
+                       new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+                       new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+               }
+       }
+
+       new->a_count = cpu_to_le32(j);
+       /* free unused space. */
+       rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+       EXIT;
+
+out:
+       if (rc) {
+               OBD_FREE(new, ext_size);
+               new = ERR_PTR(rc);
+       }
+       return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644 (file)
index 0000000..3e532f5
--- /dev/null
@@ -0,0 +1,401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao<lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/list.h>
+#include <lustre_capa.h>
+
+#define NR_CAPAHASH 32
+#define CAPA_HASH_SIZE 3000          /* for MDS & OSS */
+
+struct kmem_cache *capa_cachep;
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+struct list_head capa_list[CAPA_SITE_MAX];
+
+static struct capa_hmac_alg capa_hmac_algs[] = {
+       DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+};
+/* capa count */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+struct hlist_head *init_capa_hash(void)
+{
+       struct hlist_head *hash;
+       int nr_hash, i;
+
+       OBD_ALLOC(hash, PAGE_CACHE_SIZE);
+       if (!hash)
+               return NULL;
+
+       nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+       LASSERT(nr_hash > NR_CAPAHASH);
+
+       for (i = 0; i < NR_CAPAHASH; i++)
+               INIT_HLIST_HEAD(hash + i);
+       return hash;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+       return ocapa->c_site == CAPA_SITE_SERVER;
+}
+
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+       LASSERT(capa_on_server(ocapa));
+       hlist_del_init(&ocapa->u.tgt.c_hash);
+       list_del_init(&ocapa->c_list);
+       capa_count[ocapa->c_site]--;
+       /* release the ref taken at allocation */
+       capa_put(ocapa);
+}
+
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+       int i;
+       struct hlist_node *next;
+       struct obd_capa *oc;
+
+       spin_lock(&capa_lock);
+       for (i = 0; i < NR_CAPAHASH; i++) {
+               hlist_for_each_entry_safe(oc, next, hash + i,
+                                             u.tgt.c_hash)
+                       capa_delete(oc);
+       }
+       spin_unlock(&capa_lock);
+
+       OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
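+/* Map a fid to one of the NR_CAPAHASH buckets, mixing object id, version
+ * and sequence so capabilities spread across the hash. */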
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+       return (fid_oid(fid) ^ fid_ver(fid)) *
+              (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH;
+}
+
+/* The renewal time check here is earlier than on the client, to prevent
+ * the client from renewing a capa right after obtaining it. */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+       return cfs_time_before(cfs_time_sub(oc->c_expiry,
+                                  cfs_time_seconds(oc->c_capa.lc_timeout)*2/3),
+                              cfs_time_current());
+}
+
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+                                 struct hlist_head *head, int alive)
+{
+       struct obd_capa *ocapa;
+       int len = alive ? offsetof(struct lustre_capa, lc_keyid) :
+                         sizeof(*capa);
+
+       hlist_for_each_entry(ocapa, head, u.tgt.c_hash) {
+               if (memcmp(&ocapa->c_capa, capa, len))
+                       continue;
+               /* don't return one that will expire soon in this case */
+               if (alive && capa_is_to_expire(ocapa))
+                       continue;
+
+               LASSERT(capa_on_server(ocapa));
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found");
+               return ocapa;
+       }
+
+       return NULL;
+}
+
+#define LRU_CAPA_DELETE_COUNT 12
+static inline void capa_delete_lru(struct list_head *head)
+{
+       struct obd_capa *ocapa;
+       struct list_head *node = head->next;
+       int count = 0;
+
+       /* free LRU_CAPA_DELETE_COUNT unused capa from head */
+       while (count++ < LRU_CAPA_DELETE_COUNT) {
+               ocapa = list_entry(node, struct obd_capa, c_list);
+               node = node->next;
+               if (atomic_read(&ocapa->c_refc))
+                       continue;
+
+               DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+               capa_delete(ocapa);
+       }
+}
+
+/* add or update */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+       struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid);
+       struct obd_capa *ocapa, *old = NULL;
+       struct list_head *list = &capa_list[CAPA_SITE_SERVER];
+
+       ocapa = alloc_capa(CAPA_SITE_SERVER);
+       if (IS_ERR(ocapa))
+               return NULL;
+
+       spin_lock(&capa_lock);
+       old = find_capa(capa, head, 0);
+       if (!old) {
+               ocapa->c_capa = *capa;
+               set_capa_expiry(ocapa);
+               hlist_add_head(&ocapa->u.tgt.c_hash, head);
+               list_add_tail(&ocapa->c_list, list);
+               capa_get(ocapa);
+               capa_count[CAPA_SITE_SERVER]++;
+               if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+                       capa_delete_lru(list);
+               spin_unlock(&capa_lock);
+               return ocapa;
+       } else {
+               capa_get(old);
+               spin_unlock(&capa_lock);
+               capa_put(ocapa);
+               return old;
+       }
+}
+EXPORT_SYMBOL(capa_add);
+
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+                            int alive)
+{
+       struct obd_capa *ocapa;
+
+       spin_lock(&capa_lock);
+       ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive);
+       if (ocapa) {
+               list_move_tail(&ocapa->c_list,
+                                  &capa_list[CAPA_SITE_SERVER]);
+               capa_get(ocapa);
+       }
+       spin_unlock(&capa_lock);
+
+       return ocapa;
+}
+EXPORT_SYMBOL(capa_lookup);
+
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+       struct ll_crypto_hash *tfm;
+       struct capa_hmac_alg  *alg;
+       int keylen;
+       struct scatterlist sl;
+
+       if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+               CERROR("unknown capability hmac algorithm!\n");
+               return -EFAULT;
+       }
+
+       alg = &capa_hmac_algs[capa_alg(capa)];
+
+       tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0);
+       if (!tfm) {
+               CERROR("crypto_alloc_tfm failed, check whether your kernel "
+                      "has crypto support!\n");
+               return -ENOMEM;
+       }
+       keylen = alg->ha_keylen;
+
+       sg_set_page(&sl, virt_to_page(capa),
+                   offsetof(struct lustre_capa, lc_hmac),
+                   (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+       ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+       ll_crypto_free_hash(tfm);
+
+       return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
+
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+       struct ll_crypto_cipher *tfm;
+       struct scatterlist sd;
+       struct scatterlist ss;
+       struct blkcipher_desc desc;
+       unsigned int min;
+       int rc;
+       char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+       ENTRY;
+
+       /* passing "aes" in a variable instead of a constant string keeps gcc
+        * 4.3.2 happy */
+       tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+       if (IS_ERR(tfm)) {
+               CERROR("failed to load transform for aes\n");
+               RETURN(PTR_ERR(tfm));
+       }
+
+       min = ll_crypto_tfm_alg_min_keysize(tfm);
+       if (keylen < min) {
+               CERROR("keylen must be at least %d bits for aes\n", min * 8);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+       if (rc) {
+               CERROR("failed to set key for aes\n");
+               GOTO(out, rc);
+       }
+
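+       /* d and s each hold a single 16-byte AES block (four __u32 words) */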
+       sg_set_page(&sd, virt_to_page(d), 16,
+                   (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+       sg_set_page(&ss, virt_to_page(s), 16,
+                   (unsigned long)(s) % PAGE_CACHE_SIZE);
+       desc.tfm   = tfm;
+       desc.info  = NULL;
+       desc.flags = 0;
+       rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+       if (rc) {
+               CERROR("failed to encrypt with aes\n");
+               GOTO(out, rc);
+       }
+
+       EXIT;
+
+out:
+       ll_crypto_free_blkcipher(tfm);
+       return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+       struct ll_crypto_cipher *tfm;
+       struct scatterlist sd;
+       struct scatterlist ss;
+       struct blkcipher_desc desc;
+       unsigned int min;
+       int rc;
+       char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+       ENTRY;
+
+       /* passing "aes" in a variable instead of a constant string keeps gcc
+        * 4.3.2 happy */
+       tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+       if (IS_ERR(tfm)) {
+               CERROR("failed to load transform for aes\n");
+               RETURN(PTR_ERR(tfm));
+       }
+
+       min = ll_crypto_tfm_alg_min_keysize(tfm);
+       if (keylen < min) {
+               CERROR("keylen must be at least %d bits for aes\n", min * 8);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+       if (rc) {
+               CERROR("failed to setting key for aes\n");
+               GOTO(out, rc);
+       }
+
+       sg_set_page(&sd, virt_to_page(d), 16,
+                   (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+       sg_set_page(&ss, virt_to_page(s), 16,
+                   (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+       desc.tfm   = tfm;
+       desc.info  = NULL;
+       desc.flags = 0;
+       rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+       if (rc) {
+               CERROR("failed to decrypt for aes\n");
+               GOTO(out, rc);
+       }
+
+       EXIT;
+
+out:
+       ll_crypto_free_blkcipher(tfm);
+       return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
+
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+       spin_lock(&ocapa->c_lock);
+       *(struct lustre_capa *)capa = ocapa->c_capa;
+       spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+void _debug_capa(struct lustre_capa *c,
+                struct libcfs_debug_msg_data *msgdata,
+                const char *fmt, ... )
+{
+       va_list args;
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " capability@%p fid "DFID" opc "LPX64" uid "LPU64
+                          " gid "LPU64" flags %u alg %d keyid %u timeout %u "
+                          "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c),
+                          capa_uid(c), capa_gid(c), capa_flags(c),
+                          capa_alg(c), capa_keyid(c), capa_timeout(c),
+                          capa_expiry(c));
+       va_end(args);
+}
+EXPORT_SYMBOL(_debug_capa);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644 (file)
index 0000000..7eb0ad7
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
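+/* Number of pages batched in the per-thread page vector (see clt_pvec). */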
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Possible levels of nesting. Currently there are two: "top" entities
+ * (files, extent locks) and "sub" entities (stripes and stripe locks).
+ * This is used only for debugging counters right now.
+ */
+enum clt_nesting_level {
+       CNL_TOP,
+       CNL_SUB,
+       CNL_NR
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ */
+struct cl_thread_counters {
+       /**
+        * Number of outstanding calls to cl_lock_mutex_get() made by the
+        * current thread. For debugging.
+        */
+       int        ctc_nr_locks_locked;
+       /** List of locked locks. */
+       struct lu_ref ctc_locks_locked;
+       /** Number of outstanding holds on locks. */
+       int        ctc_nr_held;
+       /** Number of outstanding uses on locks. */
+       int        ctc_nr_used;
+       /** Number of held extent locks. */
+       int        ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ */
+struct cl_thread_info {
+       /*
+        * Common fields.
+        */
+       struct cl_io     clt_io;
+       struct cl_2queue     clt_queue;
+
+       /*
+        * Fields used by cl_lock.c
+        */
+       struct cl_lock_descr clt_descr;
+       struct cl_page_list  clt_list;
+       /**
+        * Counters for every level of lock nesting.
+        */
+       struct cl_thread_counters clt_counters[CNL_NR];
+       /** @} debugging */
+
+       /*
+        * Fields used by cl_page.c
+        */
+       struct cl_page      *clt_pvec[CLT_PVEC_SIZE];
+
+       /*
+        * Fields used by cl_io.c
+        */
+       /**
+        * Pointer to the topmost ongoing IO in this thread.
+        */
+       struct cl_io    *clt_current_io;
+       /**
+        * Used for submitting a sync io.
+        */
+       struct cl_sync_io    clt_anchor;
+       /**
+        * Fields used by cl_lock_discard_pages().
+        */
+       pgoff_t       clt_next_index;
+       pgoff_t       clt_fn_index; /* first non-overlapped index */
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644 (file)
index 0000000..75c9be8
--- /dev/null
@@ -0,0 +1,1753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
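+/* Iterate over per-layer io slices, top-to-bottom or bottom-to-top. */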
+#define cl_io_for_each(slice, io) \
+       list_for_each_entry((slice), &io->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io)               \
+       list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage)
+
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+       return CIT_READ <= type && type < CIT_OP_NR;
+}
+
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+       return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC;
+}
+
+/**
+ * Returns true iff there is an IO ongoing in the given environment.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+       return cl_env_info(env)->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * cl_io invariant that holds at all times when exported cl_io_*() functions
+ * are entered and left.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+       struct cl_io *up;
+
+       up = io->ci_parent;
+       return
+               /*
+                * io can own pages only when it is ongoing. Sub-io might
+                * still be in CIS_LOCKED state when top-io is in
+                * CIS_IO_GOING.
+                */
+               ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING ||
+                    (io->ci_state == CIS_LOCKED && up != NULL));
+}
+
+/**
+ * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+       struct cl_io_slice    *slice;
+       struct cl_thread_info *info;
+
+       LINVRNT(cl_io_type_is_valid(io->ci_type));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       while (!list_empty(&io->ci_layers)) {
+               slice = container_of(io->ci_layers.prev, struct cl_io_slice,
+                                    cis_linkage);
+               list_del_init(&slice->cis_linkage);
+               if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+                       slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+               /*
+                * Invalidate slice to catch use after free. This assumes that
+                * slices are allocated within session and can be touched
+                * after ->cio_fini() returns.
+                */
+               slice->cis_io = NULL;
+       }
+       io->ci_state = CIS_FINI;
+       info = cl_env_info(env);
+       if (info->clt_current_io == io)
+               info->clt_current_io = NULL;
+
+       /* sanity check for layout change */
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               break;
+       case CIT_FAULT:
+       case CIT_FSYNC:
+               LASSERT(!io->ci_need_restart);
+               break;
+       case CIT_SETATTR:
+       case CIT_MISC:
+               /* Check ignore layout change conf */
+               LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+                               !io->ci_need_restart));
+               break;
+       default:
+               LBUG();
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+                      enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_object *scan;
+       int result;
+
+       LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+       LINVRNT(cl_io_type_is_valid(iot));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       io->ci_type = iot;
+       INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+       INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+       INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+       INIT_LIST_HEAD(&io->ci_layers);
+
+       result = 0;
+       cl_object_for_each(scan, obj) {
+               if (scan->co_ops->coo_io_init != NULL) {
+                       result = scan->co_ops->coo_io_init(env, scan, io);
+                       if (result != 0)
+                               break;
+               }
+       }
+       if (result == 0)
+               io->ci_state = CIS_INIT;
+       RETURN(result);
+}
+
+/**
+ * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+                  enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+
+       LASSERT(obj != cl_object_top(obj));
+       if (info->clt_current_io == NULL)
+               info->clt_current_io = io;
+       return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom.
+ *
+ * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter
+ * what the latter returned.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+              enum cl_io_type iot, struct cl_object *obj)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+
+       LASSERT(obj == cl_object_top(obj));
+       LASSERT(info->clt_current_io == NULL);
+
+       info->clt_current_io = io;
+       return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initialize read or write io.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+                 enum cl_io_type iot, loff_t pos, size_t count)
+{
+       LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+       LINVRNT(io->ci_obj != NULL);
+       ENTRY;
+
+       LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+                        "io range: %u ["LPU64", "LPU64") %u %u\n",
+                        iot, (__u64)pos, (__u64)pos + count,
+                        io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+       io->u.ci_rw.crw_pos    = pos;
+       io->u.ci_rw.crw_count  = count;
+       RETURN(cl_io_init(env, io, iot, io->ci_obj));
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+       return lu_object_fid(&descr->cld_obj->co_lu);
+}
+
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+                             const struct cl_lock_descr *d1)
+{
+       return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?:
+               __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
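+/*
+ * Unlike cl_lock_descr_sort(), this returns 0 only when the two extents
+ * overlap on the same object, so a non-zero result means "disjoint".
+ */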
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+                            const struct cl_lock_descr *d1)
+{
+       int ret;
+
+       ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+       if (ret)
+               return ret;
+       if (d0->cld_end < d1->cld_start)
+               return -1;
+       if (d0->cld_start > d1->cld_end)
+               return 1;
+       return 0;
+}
+
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+                               const struct cl_lock_descr *d1)
+{
+       d0->cld_start = min(d0->cld_start, d1->cld_start);
+       d0->cld_end = max(d0->cld_end, d1->cld_end);
+
+       if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE)
+               d0->cld_mode = CLM_WRITE;
+
+       if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP)
+               d0->cld_mode = CLM_GROUP;
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
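+ * Taking locks in a single global order is what prevents deadlocks between
+ * concurrent ios acquiring overlapping sets of locks.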
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+       int done = 0;
+
+       ENTRY;
+       /* hidden treasure: bubble sort for now. */
+       do {
+               struct cl_io_lock_link *curr;
+               struct cl_io_lock_link *prev;
+               struct cl_io_lock_link *temp;
+
+               done = 1;
+               prev = NULL;
+
+               list_for_each_entry_safe(curr, temp,
+                                            &io->ci_lockset.cls_todo,
+                                            cill_linkage) {
+                       if (prev != NULL) {
+                               switch (cl_lock_descr_sort(&prev->cill_descr,
+                                                         &curr->cill_descr)) {
+                               case 0:
+                                       /*
+                                        * IMPOSSIBLE: Identical locks are
+                                        * already removed at this point.
+                                        */
+                               default:
+                                       LBUG();
+                               case +1:
+                                       list_move_tail(&curr->cill_linkage,
+                                                          &prev->cill_linkage);
+                                       done = 0;
+                                       continue; /* don't change prev: it's
+                                                  * still "previous" */
+                               case -1: /* already in order */
+                                       break;
+                               }
+                       }
+                       prev = curr;
+               }
+       } while (!done);
+       EXIT;
+}
+
+/**
+ * Check whether \a queue contains locks matching \a need.
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval   0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+                  const struct cl_lock_descr *need)
+{
+       struct cl_io_lock_link *scan;
+
+       ENTRY;
+       list_for_each_entry(scan, queue, cill_linkage) {
+               if (cl_lock_descr_match(&scan->cill_descr, need))
+                       RETURN(+1);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
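+/**
+ * Try to absorb \a need into an existing lock description in \a queue.
+ *
+ * \retval +1 \a need was merged into an overlapping descriptor
+ * \retval   0 no overlapping descriptor exists; \a need must be enqueued
+ */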
+static int cl_queue_merge(const struct list_head *queue,
+                         const struct cl_lock_descr *need)
+{
+       struct cl_io_lock_link *scan;
+
+       ENTRY;
+       list_for_each_entry(scan, queue, cill_linkage) {
+               if (cl_lock_descr_cmp(&scan->cill_descr, need))
+                       continue;
+               cl_lock_descr_merge(&scan->cill_descr, need);
+               CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+                      scan->cill_descr.cld_mode, scan->cill_descr.cld_start,
+                      scan->cill_descr.cld_end);
+               RETURN(+1);
+       }
+       RETURN(0);
+}
+
+static int cl_lockset_match(const struct cl_lockset *set,
+                           const struct cl_lock_descr *need)
+{
+       return cl_queue_match(&set->cls_curr, need) ||
+              cl_queue_match(&set->cls_done, need);
+}
+
+static int cl_lockset_merge(const struct cl_lockset *set,
+                           const struct cl_lock_descr *need)
+{
+       return cl_queue_merge(&set->cls_todo, need) ||
+              cl_lockset_match(set, need);
+}
+
+static int cl_lockset_lock_one(const struct lu_env *env,
+                              struct cl_io *io, struct cl_lockset *set,
+                              struct cl_io_lock_link *link)
+{
+       struct cl_lock *lock;
+       int          result;
+
+       ENTRY;
+
+       lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+
+       if (!IS_ERR(lock)) {
+               link->cill_lock = lock;
+               list_move(&link->cill_linkage, &set->cls_curr);
+               if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) {
+                       result = cl_wait(env, lock);
+                       if (result == 0)
+                               list_move(&link->cill_linkage,
+                                             &set->cls_done);
+               } else
+                       result = 0;
+       } else
+               result = PTR_ERR(lock);
+       RETURN(result);
+}
+
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+                             struct cl_io_lock_link *link)
+{
+       struct cl_lock *lock = link->cill_lock;
+
+       ENTRY;
+       list_del_init(&link->cill_linkage);
+       if (lock != NULL) {
+               cl_lock_release(env, lock, "io", io);
+               link->cill_lock = NULL;
+       }
+       if (link->cill_fini != NULL)
+               link->cill_fini(env, link);
+       EXIT;
+}
+
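+/**
+ * Acquires all locks on the todo list of \a set: each descriptor not already
+ * covered by a held lock is enqueued, and every lock still outstanding
+ * (e.g., enqueued with CEF_ASYNC) is then waited upon until granted.
+ */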
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+                          struct cl_lockset *set)
+{
+       struct cl_io_lock_link *link;
+       struct cl_io_lock_link *temp;
+       struct cl_lock   *lock;
+       int result;
+
+       ENTRY;
+       result = 0;
+       list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+               if (!cl_lockset_match(set, &link->cill_descr)) {
+                       /* XXX some locking to guarantee that locks aren't
+                        * expanded in between. */
+                       result = cl_lockset_lock_one(env, io, set, link);
+                       if (result != 0)
+                               break;
+               } else
+                       cl_lock_link_fini(env, io, link);
+       }
+       if (result == 0) {
+               list_for_each_entry_safe(link, temp,
+                                            &set->cls_curr, cill_linkage) {
+                       lock = link->cill_lock;
+                       result = cl_wait(env, lock);
+                       if (result == 0)
+                               list_move(&link->cill_linkage,
+                                             &set->cls_done);
+                       else
+                               break;
+               }
+       }
+       RETURN(result);
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
+ * by layers for the current iteration. Then sort locks (to avoid dead-locks),
+ * and acquire them.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_IT_STARTED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_lock == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan);
+               if (result != 0)
+                       break;
+       }
+       if (result == 0) {
+               cl_io_locks_sort(io);
+               result = cl_lockset_lock(env, io, &io->ci_lockset);
+       }
+       if (result != 0)
+               cl_io_unlock(env, io);
+       else
+               io->ci_state = CIS_LOCKED;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Release locks taken by io.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+       struct cl_lockset       *set;
+       struct cl_io_lock_link   *link;
+       struct cl_io_lock_link   *temp;
+       const struct cl_io_slice *scan;
+
+       LASSERT(cl_io_is_loopable(io));
+       LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       set = &io->ci_lockset;
+
+       list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage)
+               cl_lock_link_fini(env, io, link);
+
+       list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage)
+               cl_lock_link_fini(env, io, link);
+
+       list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+               cl_unuse(env, link->cill_lock);
+               cl_lock_link_fini(env, io, link);
+       }
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+       }
+       io->ci_state = CIS_UNLOCKED;
+       LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       result = 0;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_iter_init(env,
+                                                                     scan);
+               if (result != 0)
+                       break;
+       }
+       if (result == 0)
+               io->ci_state = CIS_IT_STARTED;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_UNLOCKED);
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+       }
+       io->ci_state = CIS_IT_ENDED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that read or write io progressed \a nob bytes forward.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+               nob == 0);
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(cl_io_invariant(io));
+
+       ENTRY;
+
+       io->u.ci_rw.crw_pos   += nob;
+       io->u.ci_rw.crw_count -= nob;
+
+       /* layers have to be notified. */
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_advance != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_advance(env, scan,
+                                                                  nob);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to a lockset.
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+                  struct cl_io_lock_link *link)
+{
+       int result;
+
+       ENTRY;
+       if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+               result = +1;
+       else {
+               list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+               result = 0;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+static void cl_free_io_lock_link(const struct lu_env *env,
+                                struct cl_io_lock_link *link)
+{
+       OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates new lock link, and uses it to add a lock to a lockset.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                        struct cl_lock_descr *descr)
+{
+       struct cl_io_lock_link *link;
+       int result;
+
+       ENTRY;
+       OBD_ALLOC_PTR(link);
+       if (link != NULL) {
+               link->cill_descr     = *descr;
+               link->cill_fini      = cl_free_io_lock_link;
+               result = cl_io_lock_add(env, io, link);
+               if (result) /* lock match */
+                       link->cill_fini(env, link);
+       } else
+               result = -ENOMEM;
+
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       io->ci_state = CIS_IO_GOING;
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_start == NULL)
+                       continue;
+               result = scan->cis_iop->op[io->ci_type].cio_start(env, scan);
+               if (result != 0)
+                       break;
+       }
+       if (result >= 0)
+               result = 0;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Wait until current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+       const struct cl_io_slice *scan;
+
+       LINVRNT(cl_io_is_loopable(io));
+       LINVRNT(io->ci_state == CIS_IO_GOING);
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->op[io->ci_type].cio_end != NULL)
+                       scan->cis_iop->op[io->ci_type].cio_end(env, scan);
+               /* TODO: error handling. */
+       }
+       io->ci_state = CIS_IO_FINISHED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+       LINVRNT(slice != NULL);
+       return slice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+       int     result = 1;
+       loff_t  start;
+       loff_t  end;
+       pgoff_t idx;
+
+       idx = page->cp_index;
+       switch (io->ci_type) {
+       case CIT_READ:
+       case CIT_WRITE:
+               /*
+                * check that [start, end) and [pos, pos + count) extents
+                * overlap.
+                */
+               if (!cl_io_is_append(io)) {
+                       const struct cl_io_rw_common *crw = &(io->u.ci_rw);
+                       start = cl_offset(page->cp_obj, idx);
+                       end   = cl_offset(page->cp_obj, idx + 1);
+                       result = crw->crw_pos < end &&
+                                start < crw->crw_pos + crw->crw_count;
+               }
+               break;
+       case CIT_FAULT:
+               result = io->u.ci_fault.ft_index == idx;
+               break;
+       default:
+               LBUG();
+       }
+       return result;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page *page)
+{
+       const struct cl_io_slice *scan;
+       struct cl_2queue         *queue;
+       int                    result = 0;
+
+       LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+       LINVRNT(cl_page_is_owned(page, io));
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_page_in_io(page, io));
+       LINVRNT(cl_io_invariant(io));
+       ENTRY;
+
+       queue = &io->ci_queue;
+
+       cl_2queue_init(queue);
+       /*
+        * ->cio_read_page() methods called in the loop below are supposed to
+        * never block waiting for network (the only subtle point is the
+        * creation of new pages for read-ahead that might result in cache
+        * shrinking, but currently only clean pages are shrunk and this
+        * requires no network io).
+        *
+        * Should this ever start blocking, a retry loop would be needed
+        * for "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+        */
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->cio_read_page != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       LINVRNT(slice != NULL);
+                       result = scan->cis_iop->cio_read_page(env, scan, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       if (result == 0)
+               result = cl_io_submit_rw(env, io, CRT_READ, queue);
+       /*
+        * Unlock unsent pages in case of error.
+        */
+       cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_2queue_fini(env, queue);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page *page, unsigned from, unsigned to)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(io->ci_type == CIT_WRITE);
+       LINVRNT(cl_page_is_owned(page, io));
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       LASSERT(cl_page_in_io(page, io));
+       ENTRY;
+
+       cl_io_for_each_reverse(scan, io) {
+               if (scan->cis_iop->cio_prepare_write != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       result = scan->cis_iop->cio_prepare_write(env, scan,
+                                                                 slice,
+                                                                 from, to);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+                      struct cl_page *page, unsigned from, unsigned to)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(io->ci_type == CIT_WRITE);
+       LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+       LINVRNT(cl_io_invariant(io));
+       /*
+        * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+        * already called cl_page_cache_add(), moving page into CPS_CACHED
+        * state. Better (and more general) way of dealing with such situation
+        * is needed.
+        */
+       LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+       LASSERT(cl_page_in_io(page, io));
+       ENTRY;
+
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->cio_commit_write != NULL) {
+                       const struct cl_page_slice *slice;
+
+                       slice = cl_io_slice_page(scan, page);
+                       result = scan->cis_iop->cio_commit_write(env, scan,
+                                                                slice,
+                                                                from, to);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LINVRNT(result <= 0);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages are moved to the
+ * queue->c2_qout queue, while queue->c2_qin contains both the pages that
+ * did not need to be submitted and the pages that failed to submit.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+                   enum cl_req_type crt, struct cl_2queue *queue)
+{
+       const struct cl_io_slice *scan;
+       int result = 0;
+
+       LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+       ENTRY;
+
+       cl_io_for_each(scan, io) {
+               if (scan->cis_iop->req_op[crt].cio_submit == NULL)
+                       continue;
+               result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt,
+                                                              queue);
+               if (result != 0)
+                       break;
+       }
+       /*
+        * If ->cio_submit() failed, no pages were sent.
+        */
+       LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submits sync io and waits until the IO finishes or an error happens.
+ * If \a timeout is zero, wait for the IO unconditionally.
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+                     enum cl_req_type iot, struct cl_2queue *queue,
+                     long timeout)
+{
+       struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+       struct cl_page *pg;
+       int rc;
+
+       cl_page_list_for_each(pg, &queue->c2_qin) {
+               LASSERT(pg->cp_sync_io == NULL);
+               pg->cp_sync_io = anchor;
+       }
+
+       cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+       rc = cl_io_submit_rw(env, io, iot, queue);
+       if (rc == 0) {
+               /*
+                * If some pages weren't sent for any reason (e.g.,
+                * read found up-to-date pages in the cache, or write found
+                * clean pages), count them as completed to avoid infinite
+                * wait.
+                */
+               cl_page_list_for_each(pg, &queue->c2_qin) {
+                       pg->cp_sync_io = NULL;
+                       cl_sync_io_note(anchor, +1);
+               }
+
+               /* wait for the IO to be finished. */
+               rc = cl_sync_io_wait(env, io, &queue->c2_qout,
+                                    anchor, timeout);
+       } else {
+               LASSERT(list_empty(&queue->c2_qout.pl_pages));
+               cl_page_list_for_each(pg, &queue->c2_qin)
+                       pg->cp_sync_io = NULL;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+                struct cl_page_list *queue)
+{
+       struct cl_page *page;
+       int result = 0;
+
+       CERROR("Canceling ongoing page trasmission\n");
+       cl_page_list_for_each(page, queue) {
+               int rc;
+
+               LINVRNT(cl_page_in_io(page, io));
+               rc = cl_page_cancel(env, page);
+               result = result ?: rc;
+       }
+       return result;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+       int result   = 0;
+
+       LINVRNT(cl_io_is_loopable(io));
+       ENTRY;
+
+       do {
+               size_t nob;
+
+               io->ci_continue = 0;
+               result = cl_io_iter_init(env, io);
+               if (result == 0) {
+                       nob    = io->ci_nob;
+                       result = cl_io_lock(env, io);
+                       if (result == 0) {
+                               /*
+                                * Notify layers that locks have been taken,
+                                * and do the actual i/o.
+                                *
+                                *   - llite: kms, short read;
+                                *   - llite: generic_file_read();
+                                */
+                               result = cl_io_start(env, io);
+                               /*
+                                * Send any remaining pending
+                                * io, etc.
+                                *
+                                *   - llite: ll_rw_stats_tally.
+                                */
+                               cl_io_end(env, io);
+                               cl_io_unlock(env, io);
+                               cl_io_rw_advance(env, io, io->ci_nob - nob);
+                       }
+               }
+               cl_io_iter_fini(env, io);
+       } while (result == 0 && io->ci_continue);
+       if (result == 0)
+               result = io->ci_result;
+       RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                    struct cl_object *obj,
+                    const struct cl_io_operations *ops)
+{
+       struct list_head *linkage = &slice->cis_linkage;
+
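+       /* The slice may come from zeroed memory or may already have been
+        * list-initialized; both states are accepted here. */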
+       LASSERT((linkage->prev == NULL && linkage->next == NULL) ||
+               list_empty(linkage));
+       ENTRY;
+
+       list_add_tail(linkage, &io->ci_layers);
+       slice->cis_io  = io;
+       slice->cis_obj = obj;
+       slice->cis_iop = ops;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list.
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+       ENTRY;
+       plist->pl_nr = 0;
+       INIT_LIST_HEAD(&plist->pl_pages);
+       plist->pl_owner = current;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+       ENTRY;
+       /* it would be better to check that page is owned by "current" io, but
+        * it is not passed here. */
+       LASSERT(page->cp_owner != NULL);
+       LINVRNT(plist->pl_owner == current);
+
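+       /* Many cp_mutex'es may be held at once while a queue is populated,
+        * which lockdep cannot track, so it is disabled around the lock. */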
+       lockdep_off();
+       mutex_lock(&page->cp_mutex);
+       lockdep_on();
+       LASSERT(list_empty(&page->cp_batch));
+       list_add_tail(&page->cp_batch, &plist->pl_pages);
+       ++plist->pl_nr;
+       page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+       cl_page_get(page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ */
+void cl_page_list_del(const struct lu_env *env,
+                     struct cl_page_list *plist, struct cl_page *page)
+{
+       LASSERT(plist->pl_nr > 0);
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       list_del_init(&page->cp_batch);
+       lockdep_off();
+       mutex_unlock(&page->cp_mutex);
+       lockdep_on();
+       --plist->pl_nr;
+       lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+       cl_page_put(env, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+                      struct cl_page *page)
+{
+       LASSERT(src->pl_nr > 0);
+       LINVRNT(dst->pl_owner == current);
+       LINVRNT(src->pl_owner == current);
+
+       ENTRY;
+       list_move_tail(&page->cp_batch, &dst->pl_pages);
+       --src->pl_nr;
+       ++dst->pl_nr;
+       lu_ref_set_at(&page->cp_reference,
+                     page->cp_queue_ref, "queue", src, dst);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splices all pages of \a list onto the tail of \a head, emptying \a list.
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+       struct cl_page *page;
+       struct cl_page *tmp;
+
+       LINVRNT(list->pl_owner == current);
+       LINVRNT(head->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, tmp, list)
+               cl_page_list_move(head, list, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+                        struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, temp, plist) {
+               LASSERT(plist->pl_nr > 0);
+
+               list_del_init(&page->cp_batch);
+               lockdep_off();
+               mutex_unlock(&page->cp_mutex);
+               lockdep_on();
+               --plist->pl_nr;
+               /*
+                * cl_page_disown0 rather than usual cl_page_disown() is used,
+                * because pages are possibly in CPS_FREEING state already due
+                * to the call to cl_page_list_discard().
+                */
+               /*
+                * XXX cl_page_disown0() will fail if page is not locked.
+                */
+               cl_page_disown0(env, io, page);
+               lu_ref_del(&page->cp_reference, "queue", plist);
+               cl_page_put(env, page);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       cl_page_list_for_each_safe(page, temp, plist)
+               cl_page_list_del(env, plist, page);
+       LASSERT(plist->pl_nr == 0);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ */
+int cl_page_list_own(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       struct cl_page *temp;
+       pgoff_t index = 0;
+       int result;
+
+       LINVRNT(plist->pl_owner == current);
+
+       ENTRY;
+       result = 0;
+       cl_page_list_for_each_safe(page, temp, plist) {
+               LASSERT(index <= page->cp_index);
+               index = page->cp_index;
+               if (cl_page_own(env, io, page) == 0)
+                       result = result ?: page->cp_error;
+               else
+                       cl_page_list_del(env, plist, page);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+                        struct cl_io *io, struct cl_page_list *plist)
+{
+       struct cl_page *page;
+
+       LINVRNT(plist->pl_owner == current);
+
+       cl_page_list_for_each(page, plist)
+               cl_page_assume(env, io, page);
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page_list *plist)
+{
+       struct cl_page *page;
+
+       LINVRNT(plist->pl_owner == current);
+       ENTRY;
+       cl_page_list_for_each(page, plist)
+               cl_page_discard(env, io, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page_list *plist)
+{
+       struct cl_page *page;
+       int result;
+
+       LINVRNT(plist->pl_owner == current);
+       ENTRY;
+       result = 0;
+       cl_page_list_for_each(page, plist) {
+               result = cl_page_unmap(env, io, page);
+               if (result != 0)
+                       break;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_init(&queue->c2_qin);
+       cl_page_list_init(&queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+       ENTRY;
+       cl_page_list_add(&queue->c2_qin, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+                     struct cl_io *io, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_disown(env, io, &queue->c2_qin);
+       cl_page_list_disown(env, io, &queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+                      struct cl_io *io, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_discard(env, io, &queue->c2_qin);
+       cl_page_list_discard(env, io, &queue->c2_qout);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assumes ownership of the pages in both lists of a 2-queue.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+                     struct cl_io *io, struct cl_2queue *queue)
+{
+       cl_page_list_assume(env, io, &queue->c2_qin);
+       cl_page_list_assume(env, io, &queue->c2_qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+       ENTRY;
+       cl_page_list_fini(env, &queue->c2_qout);
+       cl_page_list_fini(env, &queue->c2_qin);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+       ENTRY;
+       cl_2queue_init(queue);
+       cl_2queue_add(queue, page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
+
+/**
+ * Returns top-level io.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+       ENTRY;
+       while (io->ci_parent != NULL)
+               io = io->ci_parent;
+       RETURN(io);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Prints human readable representation of \a io to the \a f.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+                lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                     struct cl_device *dev,
+                     const struct cl_req_operations *ops)
+{
+       ENTRY;
+       list_add_tail(&slice->crs_linkage, &req->crq_layers);
+       slice->crs_dev = dev;
+       slice->crs_ops = ops;
+       slice->crs_req = req;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+       unsigned i;
+
+       LASSERT(list_empty(&req->crq_pages));
+       LASSERT(req->crq_nrpages == 0);
+       LINVRNT(list_empty(&req->crq_layers));
+       LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+       ENTRY;
+
+       if (req->crq_o != NULL) {
+               for (i = 0; i < req->crq_nrobjs; ++i) {
+                       struct cl_object *obj = req->crq_o[i].ro_obj;
+                       if (obj != NULL) {
+                               lu_object_ref_del_at(&obj->co_lu,
+                                                    req->crq_o[i].ro_obj_ref,
+                                                    "cl_req", req);
+                               cl_object_put(env, obj);
+                       }
+               }
+               OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+       }
+       OBD_FREE_PTR(req);
+       EXIT;
+}
+
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+                      struct cl_page *page)
+{
+       struct cl_device     *dev;
+       struct cl_page_slice *slice;
+       int result;
+
+       ENTRY;
+       result = 0;
+       page = cl_page_top(page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev);
+                       if (dev->cd_ops->cdo_req_init != NULL) {
+                               result = dev->cd_ops->cdo_req_init(env,
+                                                                  dev, req);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               page = page->cp_child;
+       } while (page != NULL && result == 0);
+       RETURN(result);
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+       struct cl_req_slice *slice;
+
+       ENTRY;
+       /*
+        * for the lack of list_for_each_entry_reverse_safe()...
+        */
+       while (!list_empty(&req->crq_layers)) {
+               slice = list_entry(req->crq_layers.prev,
+                                      struct cl_req_slice, crs_linkage);
+               list_del_init(&slice->crs_linkage);
+               if (slice->crs_ops->cro_completion != NULL)
+                       slice->crs_ops->cro_completion(env, slice, rc);
+       }
+       cl_req_free(env, req);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                           enum cl_req_type crt, int nr_objects)
+{
+       struct cl_req *req;
+
+       LINVRNT(nr_objects > 0);
+       ENTRY;
+
+       OBD_ALLOC_PTR(req);
+       if (req != NULL) {
+               int result;
+
+               OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+               if (req->crq_o != NULL) {
+                       req->crq_nrobjs = nr_objects;
+                       req->crq_type = crt;
+                       INIT_LIST_HEAD(&req->crq_pages);
+                       INIT_LIST_HEAD(&req->crq_layers);
+                       result = cl_req_init(env, req, page);
+               } else
+                       result = -ENOMEM;
+               if (result != 0) {
+                       cl_req_completion(env, req, result);
+                       req = ERR_PTR(result);
+               }
+       } else
+               req = ERR_PTR(-ENOMEM);
+       RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ */
+void cl_req_page_add(const struct lu_env *env,
+                    struct cl_req *req, struct cl_page *page)
+{
+       struct cl_object  *obj;
+       struct cl_req_obj *rqo;
+       int i;
+
+       ENTRY;
+       page = cl_page_top(page);
+
+       LASSERT(list_empty(&page->cp_flight));
+       LASSERT(page->cp_req == NULL);
+
+       CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+                     req, req->crq_type, req->crq_nrpages);
+
+       list_add_tail(&page->cp_flight, &req->crq_pages);
+       ++req->crq_nrpages;
+       page->cp_req = req;
+       obj = cl_object_top(page->cp_obj);
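+       /* Find this object's slot in the request's object array, claiming
+        * the first empty slot if the object is not referenced yet. */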
+       for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) {
+               if (rqo->ro_obj == NULL) {
+                       rqo->ro_obj = obj;
+                       cl_object_get(obj);
+                       rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+                                                           "cl_req", req);
+                       break;
+               }
+       }
+       LASSERT(i < req->crq_nrobjs);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+       struct cl_req *req = page->cp_req;
+
+       ENTRY;
+       page = cl_page_top(page);
+
+       LASSERT(!list_empty(&page->cp_flight));
+       LASSERT(req->crq_nrpages > 0);
+
+       list_del_init(&page->cp_flight);
+       --req->crq_nrpages;
+       page->cp_req = NULL;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+       int i;
+       int result;
+       const struct cl_req_slice *slice;
+
+       ENTRY;
+       /*
+        * Check that the caller of cl_req_alloc() didn't lie about the number
+        * of objects.
+        */
+       for (i = 0; i < req->crq_nrobjs; ++i)
+               LASSERT(req->crq_o[i].ro_obj != NULL);
+
+       result = 0;
+       list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+               if (slice->crs_ops->cro_prep != NULL) {
+                       result = slice->crs_ops->cro_prep(env, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_req_prep);
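+
+/*
+ * Editor's sketch (illustrative only, not part of the interface contract):
+ * a typical transfer request lifecycle built from the helpers above,
+ * assuming an initialized "env" and "page" and a single-object request:
+ *
+ *     struct cl_req *req;
+ *     int rc;
+ *
+ *     req = cl_req_alloc(env, page, CRT_WRITE, 1);
+ *     if (IS_ERR(req))
+ *             return PTR_ERR(req);    (allocation failure already ran
+ *                                      cl_req_completion() internally)
+ *     cl_req_page_add(env, req, page);
+ *     rc = cl_req_prep(env, req);
+ *     ... submit the transfer ...
+ *     cl_req_completion(env, req, rc);
+ */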
+
+/**
+ * Fills in attributes that are passed to the server together with the
+ * transfer. Only attributes from \a flags may be touched. This can be called
+ * multiple times for the same request.
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+                    struct cl_req_attr *attr, obd_valid flags)
+{
+       const struct cl_req_slice *slice;
+       struct cl_page      *page;
+       int i;
+
+       LASSERT(!list_empty(&req->crq_pages));
+       ENTRY;
+
+       /* Take any page to use as a model. */
+       page = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+       for (i = 0; i < req->crq_nrobjs; ++i) {
+               list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+                       const struct cl_page_slice *scan;
+                       const struct cl_object     *obj;
+
+                       scan = cl_page_at(page,
+                                         slice->crs_dev->cd_lu_dev.ld_type);
+                       LASSERT(scan != NULL);
+                       obj = scan->cpl_obj;
+                       if (slice->crs_ops->cro_attr_set != NULL)
+                               slice->crs_ops->cro_attr_set(env, slice, obj,
+                                                            attr + i, flags);
+               }
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX Use complete(), init_completion(), and wait_for_completion() directly
+ * until they are implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Initializes a synchronous IO wait anchor for a transfer of \a nrpages pages.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+       ENTRY;
+       init_waitqueue_head(&anchor->csi_waitq);
+       atomic_set(&anchor->csi_sync_nr, nrpages);
+       atomic_set(&anchor->csi_barrier, nrpages > 0);
+       anchor->csi_sync_rc = 0;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Waits until all transfers complete. The transfer completion routine has to
+ * call cl_sync_io_note() for every page.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page_list *queue, struct cl_sync_io *anchor,
+                   long timeout)
+{
+       struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+                                                 NULL, NULL, NULL);
+       int rc;
+       ENTRY;
+
+       LASSERT(timeout >= 0);
+
+       rc = l_wait_event(anchor->csi_waitq,
+                         atomic_read(&anchor->csi_sync_nr) == 0,
+                         &lwi);
+       if (rc < 0) {
+               CERROR("SYNC IO failed with error: %d, try to cancel "
+                      "%d remaining pages\n",
+                      rc, atomic_read(&anchor->csi_sync_nr));
+
+               (void)cl_io_cancel(env, io, queue);
+
+               lwi = (struct l_wait_info) { 0 };
+               (void)l_wait_event(anchor->csi_waitq,
+                                  atomic_read(&anchor->csi_sync_nr) == 0,
+                                  &lwi);
+       } else {
+               rc = anchor->csi_sync_rc;
+       }
+       LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+       cl_page_list_assume(env, io, queue);
+
+       /* wait until cl_sync_io_note() has done wakeup */
+       while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+               cpu_relax();
+       }
+
+       POISON(anchor, 0x5a, sizeof *anchor);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicates that the transfer of a single page completed.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+       ENTRY;
+       if (anchor->csi_sync_rc == 0 && ioret < 0)
+               anchor->csi_sync_rc = ioret;
+       /*
+        * Synchronous IO done without releasing page lock (e.g., as a part of
+        * ->{prepare,commit}_write()). Completion is used to signal the end of
+        * IO.
+        */
+       LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+       if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+               wake_up_all(&anchor->csi_waitq);
+               /* it's safe to nuke or reuse anchor now */
+               atomic_set(&anchor->csi_barrier, 0);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
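+
+/*
+ * Editor's sketch (illustrative only): how the cl_sync_io anchor above is
+ * meant to be used, assuming "queue" holds nrpages pages being transferred:
+ *
+ *     struct cl_sync_io anchor;
+ *
+ *     cl_sync_io_init(&anchor, nrpages);
+ *     ... submit pages; each transfer completion calls
+ *         cl_sync_io_note(&anchor, ioret) ...
+ *     rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
+ *
+ * cl_sync_io_wait() returns only after csi_barrier is cleared by the last
+ * cl_sync_io_note(), so the anchor can safely live on the caller's stack.
+ */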
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644 (file)
index 0000000..d34e044
--- /dev/null
@@ -0,0 +1,2304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem;
+
+static struct lu_kmem_descr cl_lock_caches[] = {
+       {
+               .ckd_cache = &cl_lock_kmem,
+               .ckd_name  = "cl_lock_kmem",
+               .ckd_size  = sizeof (struct cl_lock)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+#define CS_LOCK_INC(o, item)
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant that is maintained at all times. Caller either has a
+ * reference to \a lock, or somehow assures that \a lock cannot be freed.
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+                                    const struct cl_lock *lock)
+{
+       return  ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
+               atomic_read(&lock->cll_ref) >= lock->cll_holds &&
+               lock->cll_holds >= lock->cll_users &&
+               lock->cll_holds >= 0 &&
+               lock->cll_users >= 0 &&
+               lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant, checking that caller has a reference on a lock.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+                            const struct cl_lock *lock)
+{
+       int result;
+
+       result = atomic_read(&lock->cll_ref) > 0 &&
+               cl_lock_invariant_trusted(env, lock);
+       if (!result && env != NULL)
+               CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+       return result;
+}
+
+/**
+ * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+       return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
+}
+
+/**
+ * Returns a set of counters for this lock, depending on its nesting level.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+                                                  const struct cl_lock *lock)
+{
+       struct cl_thread_info *info;
+       enum clt_nesting_level nesting;
+
+       info = cl_env_info(env);
+       nesting = cl_lock_nesting(lock);
+       LASSERT(nesting < ARRAY_SIZE(info->clt_counters));
+       return &info->clt_counters[nesting];
+}
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+                          const char *prefix, const struct cl_lock *lock,
+                          const char *func, const int line)
+{
+       struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+       CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)"
+                     "(%p/%d/%d) at %s():%d\n",
+              prefix, lock, atomic_read(&lock->cll_ref),
+              lock->cll_guarder, lock->cll_depth,
+              lock->cll_state, lock->cll_error, lock->cll_holds,
+              lock->cll_users, lock->cll_flags,
+              env, h->coh_nesting, cl_lock_nr_mutexed(env),
+              func, line);
+}
+#define cl_lock_trace(level, env, prefix, lock)                         \
+       cl_lock_trace0(level, env, prefix, lock, __func__, __LINE__)
+
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key cl_lock_key;
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+       lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                   struct cl_lock *lock, __u32 enqflags)
+{
+       cl_lock_counters(env, lock)->ctc_nr_locks_acquired++;
+       lock_map_acquire(&lock->dep_map);
+}
+
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                   struct cl_lock *lock)
+{
+       cl_lock_counters(env, lock)->ctc_nr_locks_acquired--;
+       lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                   struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                   struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Adds lock slice to the compound lock.
+ *
+ * This is called by cl_object_operations::coo_lock_init() methods to add a
+ * per-layer state to the lock. New state is added at the end of
+ * cl_lock::cll_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_lock_operations *ops)
+{
+       ENTRY;
+       slice->cls_lock = lock;
+       list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+       slice->cls_obj = obj;
+       slice->cls_ops = ops;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock with the mode \a has provides at least the same
+ * guarantees as a lock with the mode \a need.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+       LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+               need == CLM_PHANTOM || need == CLM_GROUP);
+       LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+               has == CLM_PHANTOM || has == CLM_GROUP);
+       CLASSERT(CLM_PHANTOM < CLM_READ);
+       CLASSERT(CLM_READ < CLM_WRITE);
+       CLASSERT(CLM_WRITE < CLM_GROUP);
+
+       if (has != CLM_GROUP)
+               return need <= has;
+       else
+               return need == has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Returns true iff extent portions of lock descriptions match.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+                     const struct cl_lock_descr *need)
+{
+       return
+               has->cld_start <= need->cld_start &&
+               has->cld_end >= need->cld_end &&
+               cl_lock_mode_match(has->cld_mode, need->cld_mode) &&
+               (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid);
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock with the description \a has provides at least the
+ * same guarantees as a lock with the description \a need.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+                       const struct cl_lock_descr *need)
+{
+       return
+               cl_object_same(has->cld_obj, need->cld_obj) &&
+               cl_lock_ext_match(has, need);
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
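+
+/*
+ * Editor's example of the matching rules above: given the mode ordering
+ * CLM_PHANTOM < CLM_READ < CLM_WRITE < CLM_GROUP, a CLM_WRITE lock on
+ * [0, 99] satisfies a CLM_READ need on [10, 20] of the same object:
+ *
+ *     has  = { .cld_mode = CLM_WRITE, .cld_start = 0,  .cld_end = 99 };
+ *     need = { .cld_mode = CLM_READ,  .cld_start = 10, .cld_end = 20 };
+ *     cl_lock_ext_match(&has, &need);  returns 1
+ *
+ * A CLM_GROUP lock, by contrast, only matches a CLM_GROUP need with the
+ * same cld_gid.
+ */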
+
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object *obj = lock->cll_descr.cld_obj;
+
+       LINVRNT(!cl_lock_is_mutexed(lock));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+       might_sleep();
+       while (!list_empty(&lock->cll_layers)) {
+               struct cl_lock_slice *slice;
+
+               slice = list_entry(lock->cll_layers.next,
+                                      struct cl_lock_slice, cls_linkage);
+               list_del_init(lock->cll_layers.next);
+               slice->cls_ops->clo_fini(env, slice);
+       }
+       CS_LOCK_DEC(obj, total);
+       CS_LOCKSTATE_DEC(obj, lock->cll_state);
+       lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+       cl_object_put(env, obj);
+       lu_ref_fini(&lock->cll_reference);
+       lu_ref_fini(&lock->cll_holders);
+       mutex_destroy(&lock->cll_guard);
+       OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+       EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When last reference is released, lock is returned to the cache, unless it
+ * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
+ * immediately.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object        *obj;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       obj = lock->cll_descr.cld_obj;
+       LINVRNT(obj != NULL);
+
+       CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+
+       if (atomic_dec_and_test(&lock->cll_ref)) {
+               if (lock->cll_state == CLS_FREEING) {
+                       LASSERT(list_empty(&lock->cll_linkage));
+                       cl_lock_free(env, lock);
+               }
+               CS_LOCK_DEC(obj, busy);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * lock.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_invariant(NULL, lock));
+       CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+       atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * This is much like cl_lock_get(), except that this function can be used to
+ * acquire initial reference to the cached lock. Caller has to deal with all
+ * possible races. Use with care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+       CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+              atomic_read(&lock->cll_ref), lock, RETIP);
+       if (atomic_inc_return(&lock->cll_ref) == 1)
+               CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function destroying a lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+       cl_lock_mutex_get(env, lock);
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+       cl_lock_mutex_put(env, lock);
+       cl_lock_put(env, lock);
+}
+
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+                                    struct cl_object *obj,
+                                    const struct cl_io *io,
+                                    const struct cl_lock_descr *descr)
+{
+       struct cl_lock    *lock;
+       struct lu_object_header *head;
+
+       ENTRY;
+       OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
+       if (lock != NULL) {
+               atomic_set(&lock->cll_ref, 1);
+               lock->cll_descr = *descr;
+               lock->cll_state = CLS_NEW;
+               cl_object_get(obj);
+               lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu,
+                                                     "cl_lock", lock);
+               INIT_LIST_HEAD(&lock->cll_layers);
+               INIT_LIST_HEAD(&lock->cll_linkage);
+               INIT_LIST_HEAD(&lock->cll_inclosure);
+               lu_ref_init(&lock->cll_reference);
+               lu_ref_init(&lock->cll_holders);
+               mutex_init(&lock->cll_guard);
+               lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+               init_waitqueue_head(&lock->cll_wq);
+               head = obj->co_lu.lo_header;
+               CS_LOCKSTATE_INC(obj, CLS_NEW);
+               CS_LOCK_INC(obj, total);
+               CS_LOCK_INC(obj, create);
+               cl_lock_lockdep_init(lock);
+               list_for_each_entry(obj, &head->loh_layers,
+                                       co_lu.lo_linkage) {
+                       int err;
+
+                       err = obj->co_ops->coo_lock_init(env, obj, lock, io);
+                       if (err != 0) {
+                               cl_lock_finish(env, lock);
+                               lock = ERR_PTR(err);
+                               break;
+                       }
+               }
+       } else
+               lock = ERR_PTR(-ENOMEM);
+       RETURN(lock);
+}
+
+/**
+ * Transfers the lock into the INTRANSIT state and returns the original state.
+ *
+ * \pre  state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+                                    struct cl_lock *lock)
+{
+       enum cl_lock_state state = lock->cll_state;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(state != CLS_INTRANSIT);
+       LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+                "Malformed lock state %d.\n", state);
+
+       cl_lock_state_set(env, lock, CLS_INTRANSIT);
+       lock->cll_intransit_owner = current;
+       cl_lock_hold_add(env, lock, "intransit", current);
+       return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the INTRANSIT state and restores the lock to its original state.
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state)
+{
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(lock->cll_state == CLS_INTRANSIT);
+       LASSERT(state != CLS_INTRANSIT);
+       LASSERT(lock->cll_intransit_owner == current);
+
+       lock->cll_intransit_owner = NULL;
+       cl_lock_state_set(env, lock, state);
+       cl_lock_unhold(env, lock, "intransit", current);
+}
+EXPORT_SYMBOL(cl_lock_extransit);
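+
+/*
+ * Editor's sketch of the intended intransit pairing (cl_use_try() and
+ * cl_unuse_try() below are the real users): a transition is marked "in
+ * transit" while layers do potentially blocking work, then restored:
+ *
+ *     enum cl_lock_state state;
+ *
+ *     state = cl_lock_intransit(env, lock);   e.g. CLS_CACHED -> INTRANSIT
+ *     ... perform the transition work ...
+ *     cl_lock_extransit(env, lock, state);    back to a stable state
+ *
+ * Only the owning thread may leave the INTRANSIT state; other threads see
+ * cl_lock_is_intransit() return true and have to wait.
+ */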
+
+/**
+ * Checks whether the lock is in the INTRANSIT state and owned by another
+ * thread.
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+       LASSERT(cl_lock_is_mutexed(lock));
+       return lock->cll_state == CLS_INTRANSIT &&
+              lock->cll_intransit_owner != current;
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+/**
+ * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
+ * truncate and O_APPEND cannot be reused for read/non-append-write, as they
+ * cover multiple stripes and can trigger cascading timeouts.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+                            const struct cl_lock *lock,
+                            const struct cl_lock_descr *need,
+                            const struct cl_io *io)
+{
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_invariant_trusted(env, lock));
+       ENTRY;
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_fits_into != NULL &&
+                   !slice->cls_ops->clo_fits_into(env, slice, need, io))
+                       RETURN(0);
+       }
+       RETURN(1);
+}
+
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     const struct cl_io *io,
+                                     const struct cl_lock_descr *need)
+{
+       struct cl_lock    *lock;
+       struct cl_object_header *head;
+
+       ENTRY;
+
+       head = cl_object_header(obj);
+       LINVRNT(spin_is_locked(&head->coh_lock_guard));
+       CS_LOCK_INC(obj, lookup);
+       list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
+               int matched;
+
+               matched = cl_lock_ext_match(&lock->cll_descr, need) &&
+                         lock->cll_state < CLS_FREEING &&
+                         lock->cll_error == 0 &&
+                         !(lock->cll_flags & CLF_CANCELLED) &&
+                         cl_lock_fits_into(env, lock, need, io);
+               CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+                      PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
+                      matched);
+               if (matched) {
+                       cl_lock_get_trust(lock);
+                       CS_LOCK_INC(obj, hit);
+                       RETURN(lock);
+               }
+       }
+       RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If a lock is
+ * found there, it is returned immediately. Otherwise a new lock is allocated
+ * and returned. In either case, an additional reference to the lock is
+ * acquired.
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+                                   const struct cl_io *io,
+                                   const struct cl_lock_descr *need)
+{
+       struct cl_object_header *head;
+       struct cl_object        *obj;
+       struct cl_lock    *lock;
+
+       ENTRY;
+
+       obj  = need->cld_obj;
+       head = cl_object_header(obj);
+
+       spin_lock(&head->coh_lock_guard);
+       lock = cl_lock_lookup(env, obj, io, need);
+       spin_unlock(&head->coh_lock_guard);
+
+       if (lock == NULL) {
+               lock = cl_lock_alloc(env, obj, io, need);
+               if (!IS_ERR(lock)) {
+                       struct cl_lock *ghost;
+
+                       spin_lock(&head->coh_lock_guard);
+                       ghost = cl_lock_lookup(env, obj, io, need);
+                       if (ghost == NULL) {
+                               list_add_tail(&lock->cll_linkage,
+                                                 &head->coh_locks);
+                               spin_unlock(&head->coh_lock_guard);
+                               CS_LOCK_INC(obj, busy);
+                       } else {
+                               spin_unlock(&head->coh_lock_guard);
+                               /*
+                                * Other threads can acquire references to the
+                                * top-lock through its sub-locks. Hence, it
+                                * cannot be cl_lock_free()-ed immediately.
+                                */
+                               cl_lock_finish(env, lock);
+                               lock = ghost;
+                       }
+               }
+       }
+       RETURN(lock);
+}
+
+/**
+ * Returns an existing lock matching the given description. This is similar
+ * to cl_lock_find() except that no new lock is created, and the returned
+ * lock is guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source)
+{
+       struct cl_object_header *head;
+       struct cl_object        *obj;
+       struct cl_lock    *lock;
+
+       obj  = need->cld_obj;
+       head = cl_object_header(obj);
+
+       do {
+               spin_lock(&head->coh_lock_guard);
+               lock = cl_lock_lookup(env, obj, io, need);
+               spin_unlock(&head->coh_lock_guard);
+               if (lock == NULL)
+                       return NULL;
+
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state == CLS_INTRANSIT)
+                       /* The return value is deliberately ignored. */
+                       cl_lock_state_wait(env, lock);
+               if (lock->cll_state == CLS_FREEING) {
+                       cl_lock_mutex_put(env, lock);
+                       cl_lock_put(env, lock);
+                       lock = NULL;
+               }
+       } while (lock == NULL);
+
+       cl_lock_hold_add(env, lock, scope, source);
+       cl_lock_user_add(env, lock);
+       if (lock->cll_state == CLS_CACHED)
+               cl_use_try(env, lock, 1);
+       if (lock->cll_state == CLS_HELD) {
+               cl_lock_mutex_put(env, lock);
+               cl_lock_lockdep_acquire(env, lock, 0);
+               cl_lock_put(env, lock);
+       } else {
+               cl_unuse_try(env, lock);
+               cl_lock_unhold(env, lock, scope, source);
+               cl_lock_mutex_put(env, lock);
+               cl_lock_put(env, lock);
+               lock = NULL;
+       }
+
+       return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns a slice within a lock, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                      const struct lu_device_type *dtype)
+{
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+       ENTRY;
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype)
+                       RETURN(slice);
+       }
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_counters *counters;
+
+       counters = cl_lock_counters(env, lock);
+       lock->cll_depth++;
+       counters->ctc_nr_locks_locked++;
+       lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock);
+       cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       if (lock->cll_guarder == current) {
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(lock->cll_depth > 0);
+       } else {
+               struct cl_object_header *hdr;
+               struct cl_thread_info   *info;
+               int i;
+
+               LINVRNT(lock->cll_guarder != current);
+               hdr = cl_object_header(lock->cll_descr.cld_obj);
+               /*
+                * Check that mutexes are taken in the bottom-to-top order.
+                */
+               info = cl_env_info(env);
+               for (i = 0; i < hdr->coh_nesting; ++i)
+                       LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+               mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+               lock->cll_guarder = current;
+               LINVRNT(lock->cll_depth == 0);
+       }
+       cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+
+       LINVRNT(cl_lock_invariant_trusted(env, lock));
+       ENTRY;
+
+       result = 0;
+       if (lock->cll_guarder == current) {
+               LINVRNT(lock->cll_depth > 0);
+               cl_lock_mutex_tail(env, lock);
+       } else if (mutex_trylock(&lock->cll_guard)) {
+               LINVRNT(lock->cll_depth == 0);
+               lock->cll_guarder = current;
+               cl_lock_mutex_tail(env, lock);
+       } else
+               result = -EBUSY;
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_counters *counters;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(lock->cll_guarder == current);
+       LINVRNT(lock->cll_depth > 0);
+
+       counters = cl_lock_counters(env, lock);
+       LINVRNT(counters->ctc_nr_locks_locked > 0);
+
+       cl_lock_trace(D_TRACE, env, "put mutex", lock);
+       lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
+       counters->ctc_nr_locks_locked--;
+       if (--lock->cll_depth == 0) {
+               lock->cll_guarder = NULL;
+               mutex_unlock(&lock->cll_guard);
+       }
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
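+
+/*
+ * Editor's note (illustrative only): cl_lock mutexes are recursive within
+ * a thread, so nested get/put pairs are legal as long as they balance:
+ *
+ *     cl_lock_mutex_get(env, lock);   depth 1, takes cll_guard
+ *     cl_lock_mutex_get(env, lock);   depth 2, no blocking
+ *     cl_lock_mutex_put(env, lock);   depth 1
+ *     cl_lock_mutex_put(env, lock);   depth 0, releases cll_guard
+ *
+ * cl_lock_mutex_try() follows the same rule and returns -EBUSY only when
+ * a different thread holds the guard.
+ */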
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+       return lock->cll_guarder == current;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Returns the number of cl_lock mutexes held by the current thread
+ * (environment).
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+       struct cl_thread_info *info;
+       int i;
+       int locked;
+
+       /*
+        * NOTE: if summation across all nesting levels (currently 2) proves
+        *       too expensive, a summary counter can be added to
+        *       struct cl_thread_info.
+        */
+       info = cl_env_info(env);
+       for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+               locked += info->clt_counters[i].ctc_nr_locks_locked;
+       return locked;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       if (!(lock->cll_flags & CLF_CANCELLED)) {
+               const struct cl_lock_slice *slice;
+
+               lock->cll_flags |= CLF_CANCELLED;
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_cancel != NULL)
+                               slice->cls_ops->clo_cancel(env, slice);
+               }
+       }
+       EXIT;
+}
+
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_object_header    *head;
+       const struct cl_lock_slice *slice;
+
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       if (lock->cll_state < CLS_FREEING) {
+               LASSERT(lock->cll_state != CLS_INTRANSIT);
+               cl_lock_state_set(env, lock, CLS_FREEING);
+
+               head = cl_object_header(lock->cll_descr.cld_obj);
+
+               spin_lock(&head->coh_lock_guard);
+               list_del_init(&lock->cll_linkage);
+               spin_unlock(&head->coh_lock_guard);
+
+               /*
+                * From now on, no new references to this lock can be acquired
+                * by cl_lock_lookup().
+                */
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_delete != NULL)
+                               slice->cls_ops->clo_delete(env, slice);
+               }
+               /*
+                * From now on, no new references to this lock can be acquired
+                * by layer-specific means (like a pointer from struct
+                * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+                * lov).
+                *
+                * Lock will be finally freed in cl_lock_put() when last of
+                * existing references goes away.
+                */
+       }
+       EXIT;
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a
+ * top-lock (nesting == 0) accounts for this modification in the per-thread
+ * debugging counters. Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+                            int delta)
+{
+       struct cl_thread_counters *counters;
+       enum clt_nesting_level     nesting;
+
+       lock->cll_holds += delta;
+       nesting = cl_lock_nesting(lock);
+       if (nesting == CNL_TOP) {
+               counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+               counters->ctc_nr_held += delta;
+               LASSERT(counters->ctc_nr_held >= 0);
+       }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+                            int delta)
+{
+       struct cl_thread_counters *counters;
+       enum clt_nesting_level     nesting;
+
+       lock->cll_users += delta;
+       nesting = cl_lock_nesting(lock);
+       if (nesting == CNL_TOP) {
+               counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+               counters->ctc_nr_used += delta;
+               LASSERT(counters->ctc_nr_used >= 0);
+       }
+}
+
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_holds > 0);
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+       lu_ref_del(&lock->cll_holders, scope, source);
+       cl_lock_hold_mod(env, lock, -1);
+       if (lock->cll_holds == 0) {
+               CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+               if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+                   lock->cll_descr.cld_mode == CLM_GROUP ||
+                   lock->cll_state != CLS_CACHED)
+                       /*
+                        * If the lock is still a phantom or group lock when
+                        * the user is done with it, destroy the lock.
+                        */
+                       lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+               if (lock->cll_flags & CLF_CANCELPEND) {
+                       lock->cll_flags &= ~CLF_CANCELPEND;
+                       cl_lock_cancel0(env, lock);
+               }
+               if (lock->cll_flags & CLF_DOOMED) {
+                       /* no longer doomed: it's dead... Jim. */
+                       lock->cll_flags &= ~CLF_DOOMED;
+                       cl_lock_delete0(env, lock);
+               }
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
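+
+/*
+ * Editor's note on the counter hierarchy, restating the invariant checked
+ * by cl_lock_invariant_trusted() above:
+ *
+ *     cll_ref >= cll_holds >= cll_users >= 0
+ *
+ * References keep the lock memory alive, holds keep the lock out of
+ * CLS_FREEING, and users mark it as actively used. A hold taken with
+ * cl_lock_hold_add(env, lock, "scope", cookie) must be dropped with a
+ * matching cl_lock_hold_release(env, lock, "scope", cookie).
+ */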
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until the lock state machine makes some
+ * progress and to emulate synchronous operations on top of the asynchronous
+ * lock interface.
+ *
+ * \retval -EINTR wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+       wait_queue_t waiter;
+       sigset_t blocked;
+       int result;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_depth == 1);
+       LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+       cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+       result = lock->cll_error;
+       if (result == 0) {
+               /* To avoid being interrupted by 'non-fatal' signals
+                * (SIGCHLD, for instance), we block them temporarily.
+                * LU-305 */
+               blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+               init_waitqueue_entry_current(&waiter);
+               add_wait_queue(&lock->cll_wq, &waiter);
+               set_current_state(TASK_INTERRUPTIBLE);
+               cl_lock_mutex_put(env, lock);
+
+               LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+               /* Return ERESTARTSYS instead of EINTR so that syscalls
+                * can be restarted if a signal is pending here */
+               result = -ERESTARTSYS;
+               if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+                       waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+                       if (!cfs_signal_pending())
+                               result = 0;
+               }
+
+               cl_lock_mutex_get(env, lock);
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&lock->cll_wq, &waiter);
+
+               /* Restore old blocked signals */
+               cfs_restore_sigs(blocked);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
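+
+/*
+ * Editor's sketch of the retry idiom built on cl_lock_state_wait(), as
+ * used by cl_enqueue_locked() and cl_wait() below: the non-blocking
+ * *_try() helpers return CLO_WAIT, and the caller sleeps for a change:
+ *
+ *     do {
+ *             result = cl_enqueue_try(env, lock, io, enqflags);
+ *             if (result == CLO_WAIT) {
+ *                     result = cl_lock_state_wait(env, lock);
+ *                     if (result == 0)
+ *                             continue;        state changed, retry
+ *             }
+ *             break;
+ *     } while (1);
+ */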
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+                                enum cl_lock_state state)
+{
+       const struct cl_lock_slice *slice;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
+               if (slice->cls_ops->clo_state != NULL)
+                       slice->cls_ops->clo_state(env, slice, state);
+       wake_up_all(&lock->cll_wq);
+       EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+       cl_lock_state_signal(env, lock, lock->cll_state);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that the lock state changed,
+ * possibly as a result of an asynchronous event such as call-back reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+                      enum cl_lock_state state)
+{
+       ENTRY;
+       LASSERT(lock->cll_state <= state ||
+               (lock->cll_state == CLS_CACHED &&
+                (state == CLS_HELD || /* lock found in cache */
+                 state == CLS_NEW  ||   /* sub-lock canceled */
+                 state == CLS_INTRANSIT)) ||
+               /* lock is in transit state */
+               lock->cll_state == CLS_INTRANSIT);
+
+       if (lock->cll_state != state) {
+               CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+               CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+
+               cl_lock_state_signal(env, lock, state);
+               lock->cll_state = state;
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+
+       do {
+               result = 0;
+
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(cl_lock_invariant(env, lock));
+               LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+               result = -ENOSYS;
+               list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                               cls_linkage) {
+                       if (slice->cls_ops->clo_unuse != NULL) {
+                               result = slice->cls_ops->clo_unuse(env, slice);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               LASSERT(result != -ENOSYS);
+       } while (result == CLO_REPEAT);
+
+       return result;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If \a atomic is 1, the lock is unused again on failure so that the whole
+ * use operation stays atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+       enum cl_lock_state state;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+       LASSERT(lock->cll_state == CLS_CACHED);
+       if (lock->cll_error)
+               RETURN(lock->cll_error);
+
+       result = -ENOSYS;
+       state = cl_lock_intransit(env, lock);
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_use != NULL) {
+                       result = slice->cls_ops->clo_use(env, slice);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LASSERT(result != -ENOSYS);
+
+       LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+                lock->cll_state);
+
+       if (result == 0) {
+               state = CLS_HELD;
+       } else {
+               if (result == -ESTALE) {
+                       /*
+                        * -ESTALE means the sublock is being cancelled at
+                        * this moment. Set the lock state to NEW here and
+                        * ask the caller to repeat.
+                        */
+                       state = CLS_NEW;
+                       result = CLO_REPEAT;
+               }
+
+               /* @atomic means back-off-on-failure. */
+               if (atomic) {
+                       int rc;
+                       rc = cl_unuse_try_internal(env, lock);
+                       /* Vet the results. */
+                       if (rc < 0 && result > 0)
+                               result = rc;
+               }
+       }
+       cl_lock_extransit(env, lock, state);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+                          struct cl_lock *lock,
+                          struct cl_io *io, __u32 flags)
+{
+       int result;
+       const struct cl_lock_slice *slice;
+
+       ENTRY;
+       result = -ENOSYS;
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_enqueue != NULL) {
+                       result = slice->cls_ops->clo_enqueue(env,
+                                                            slice, io, flags);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LASSERT(result != -ENOSYS);
+       RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either the lock
+ * is enqueued or an error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                      lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                  struct cl_io *io, __u32 flags)
+{
+       int result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+       do {
+               LINVRNT(cl_lock_is_mutexed(lock));
+
+               result = lock->cll_error;
+               if (result != 0)
+                       break;
+
+               switch (lock->cll_state) {
+               case CLS_NEW:
+                       cl_lock_state_set(env, lock, CLS_QUEUING);
+                       /* fall-through */
+               case CLS_QUEUING:
+                       /* kick layers. */
+                       result = cl_enqueue_kick(env, lock, io, flags);
+                       /* In the AGL case, cl_lock::cll_state may
+                        * already have become CLS_HELD. */
+                       if (result == 0 && lock->cll_state == CLS_QUEUING)
+                               cl_lock_state_set(env, lock, CLS_ENQUEUED);
+                       break;
+               case CLS_INTRANSIT:
+                       LASSERT(cl_lock_is_intransit(lock));
+                       result = CLO_WAIT;
+                       break;
+               case CLS_CACHED:
+                       /* yank lock from the cache. */
+                       result = cl_use_try(env, lock, 0);
+                       break;
+               case CLS_ENQUEUED:
+               case CLS_HELD:
+                       result = 0;
+                       break;
+               default:
+               case CLS_FREEING:
+                       /*
+                        * impossible, only held locks with increased
+                        * ->cll_holds can be enqueued, and they cannot be
+                        * freed.
+                        */
+                       LBUG();
+               }
+       } while (result == CLO_REPEAT);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+                        struct cl_lock *lock,
+                        int keep_mutex)
+{
+       struct cl_lock  *conflict;
+       int           rc = 0;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERT(lock->cll_state == CLS_QUEUING);
+       LASSERT(lock->cll_conflict != NULL);
+
+       conflict = lock->cll_conflict;
+       lock->cll_conflict = NULL;
+
+       cl_lock_mutex_put(env, lock);
+       LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+       cl_lock_mutex_get(env, conflict);
+       cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+       cl_lock_cancel(env, conflict);
+       cl_lock_delete(env, conflict);
+
+       while (conflict->cll_state != CLS_FREEING) {
+               rc = cl_lock_state_wait(env, conflict);
+               if (rc != 0)
+                       break;
+       }
+       cl_lock_mutex_put(env, conflict);
+       lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+       cl_lock_put(env, conflict);
+
+       if (keep_mutex)
+               cl_lock_mutex_get(env, lock);
+
+       LASSERT(rc <= 0);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+                            struct cl_io *io, __u32 enqflags)
+{
+       int result;
+
+       ENTRY;
+
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_holds > 0);
+
+       cl_lock_user_add(env, lock);
+       do {
+               result = cl_enqueue_try(env, lock, io, enqflags);
+               if (result == CLO_WAIT) {
+                       if (lock->cll_conflict != NULL)
+                               result = cl_lock_enqueue_wait(env, lock, 1);
+                       else
+                               result = cl_lock_state_wait(env, lock);
+                       if (result == 0)
+                               continue;
+               }
+               break;
+       } while (1);
+       if (result != 0)
+               cl_unuse_try(env, lock);
+       LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+                    lock->cll_state == CLS_ENQUEUED ||
+                    lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on the lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                      lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+              struct cl_io *io, __u32 enqflags)
+{
+       int result;
+
+       ENTRY;
+
+       cl_lock_lockdep_acquire(env, lock, enqflags);
+       cl_lock_mutex_get(env, lock);
+       result = cl_enqueue_locked(env, lock, io, enqflags);
+       cl_lock_mutex_put(env, lock);
+       if (result != 0)
+               cl_lock_lockdep_release(env, lock);
+       LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+                    lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release the underlying resource:
+ * 1. for a top lock, the resource is the sublocks it holds;
+ * 2. for a sublock, the resource is the reference to the dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       int                      result;
+       enum cl_lock_state        state = CLS_NEW;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+       if (lock->cll_users > 1) {
+               cl_lock_user_del(env, lock);
+               RETURN(0);
+       }
+
+       /* Only a lock in CLS_HELD or CLS_ENQUEUED state can hold
+        * underlying resources. */
+       if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+               cl_lock_user_del(env, lock);
+               RETURN(0);
+       }
+
+       /*
+        * New lock users (->cll_users) do not prevent unlocking from
+        * proceeding. From this point, the lock eventually reaches
+        * CLS_CACHED, is reinitialized to CLS_NEW, or fails into
+        * CLS_FREEING.
+        */
+       state = cl_lock_intransit(env, lock);
+
+       result = cl_unuse_try_internal(env, lock);
+       LASSERT(lock->cll_state == CLS_INTRANSIT);
+       LASSERT(result != CLO_WAIT);
+       cl_lock_user_del(env, lock);
+       if (result == 0 || result == -ESTALE) {
+               /*
+                * Return lock back to the cache. This is the only
+                * place where lock is moved into CLS_CACHED state.
+                *
+                * If one of ->clo_unuse() methods returned -ESTALE, lock
+                * cannot be placed into cache and has to be
+                * re-initialized. This happens e.g., when a sub-lock was
+                * canceled while unlocking was in progress.
+                */
+               if (state == CLS_HELD && result == 0)
+                       state = CLS_CACHED;
+               else
+                       state = CLS_NEW;
+               cl_lock_extransit(env, lock, state);
+
+               /*
+                * Hide the -ESTALE error. Suppose the lock is a glimpse
+                * lock covering multiple stripes, one of its sublocks
+                * returned -ENAVAIL, and the other sublocks matched
+                * existing write locks. In this case we cannot set this
+                * lock to error, because otherwise some of its sublocks
+                * might never be cancelled, and some dirty pages would
+                * never be written to the OSTs. -jay
+                */
+               result = 0;
+       } else {
+               CERROR("result = %d, this is unlikely!\n", result);
+               state = CLS_NEW;
+               cl_lock_extransit(env, lock, state);
+       }
+       RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+       ENTRY;
+
+       result = cl_unuse_try(env, lock);
+       if (result)
+               CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result);
+
+       EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+       ENTRY;
+       cl_lock_mutex_get(env, lock);
+       cl_unuse_locked(env, lock);
+       cl_lock_mutex_put(env, lock);
+       cl_lock_lockdep_release(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either the lock is
+ * granted or an error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       int                      result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+       do {
+               LINVRNT(cl_lock_is_mutexed(lock));
+               LINVRNT(cl_lock_invariant(env, lock));
+               LASSERTF(lock->cll_state == CLS_QUEUING ||
+                        lock->cll_state == CLS_ENQUEUED ||
+                        lock->cll_state == CLS_HELD ||
+                        lock->cll_state == CLS_INTRANSIT,
+                        "lock state: %d\n", lock->cll_state);
+               LASSERT(lock->cll_users > 0);
+               LASSERT(lock->cll_holds > 0);
+
+               result = lock->cll_error;
+               if (result != 0)
+                       break;
+
+               if (cl_lock_is_intransit(lock)) {
+                       result = CLO_WAIT;
+                       break;
+               }
+
+               if (lock->cll_state == CLS_HELD)
+                       /* nothing to do */
+                       break;
+
+               result = -ENOSYS;
+               list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                       if (slice->cls_ops->clo_wait != NULL) {
+                               result = slice->cls_ops->clo_wait(env, slice);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+               LASSERT(result != -ENOSYS);
+               if (result == 0) {
+                       LASSERT(lock->cll_state != CLS_INTRANSIT);
+                       cl_lock_state_set(env, lock, CLS_HELD);
+               }
+       } while (result == CLO_REPEAT);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                     lock->cll_state == CLS_HELD)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+       int result;
+
+       ENTRY;
+       cl_lock_mutex_get(env, lock);
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+                "Wrong state %d\n", lock->cll_state);
+       LASSERT(lock->cll_holds > 0);
+
+       do {
+               result = cl_wait_try(env, lock);
+               if (result == CLO_WAIT) {
+                       result = cl_lock_state_wait(env, lock);
+                       if (result == 0)
+                               continue;
+               }
+               break;
+       } while (1);
+       if (result < 0) {
+               cl_unuse_try(env, lock);
+               cl_lock_lockdep_release(env, lock);
+       }
+       cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+       cl_lock_mutex_put(env, lock);
+       LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
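+
+/*
+ * Illustrative sketch, not code from this patch: a typical caller obtains
+ * a lock with cl_lock_request(), waits for it to be granted with
+ * cl_wait(), and drops it again via cl_unuse() and cl_lock_release().
+ * "env", "io", "need" and "cookie" below are hypothetical placeholders.
+ *
+ *     struct cl_lock *lock;
+ *
+ *     lock = cl_lock_request(env, io, need, "sketch", cookie);
+ *     if (IS_ERR(lock))
+ *             return PTR_ERR(lock);
+ *     if (cl_wait(env, lock) == 0) {
+ *             // lock is in CLS_HELD here; do the protected work
+ *             cl_unuse(env, lock);
+ *     }
+ *     cl_lock_release(env, lock, "sketch", cookie);
+ */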
+
+/**
+ * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock
+ * value.
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       unsigned long pound;
+       unsigned long ounce;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       pound = 0;
+       list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_weigh != NULL) {
+                       ounce = slice->cls_ops->clo_weigh(env, slice);
+                       pound += ounce;
+                       if (pound < ounce) /* over-weight^Wflow */
+                               pound = ~0UL;
+               }
+       }
+       RETURN(pound);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant the client a lock different from the one that was
+ * requested (e.g., larger in extent). This method is called once the actually
+ * granted lock description becomes known, to let layers accommodate the
+ * changed lock description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+                  const struct cl_lock_descr *desc)
+{
+       const struct cl_lock_slice *slice;
+       struct cl_object           *obj = lock->cll_descr.cld_obj;
+       struct cl_object_header    *hdr = cl_object_header(obj);
+       int result;
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+       /* don't allow object to change */
+       LASSERT(obj == desc->cld_obj);
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+               if (slice->cls_ops->clo_modify != NULL) {
+                       result = slice->cls_ops->clo_modify(env, slice, desc);
+                       if (result != 0)
+                               RETURN(result);
+               }
+       }
+       CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+                     PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+       /*
+        * Just replace description in place. Nothing more is needed for
+        * now. If locks were indexed according to their extent and/or mode,
+        * that index would have to be updated here.
+        */
+       spin_lock(&hdr->coh_lock_guard);
+       lock->cll_descr = *desc;
+       spin_unlock(&hdr->coh_lock_guard);
+       RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+                         struct cl_lock_closure *closure,
+                         struct cl_lock *origin, int wait)
+{
+       LINVRNT(cl_lock_is_mutexed(origin));
+       LINVRNT(cl_lock_invariant(env, origin));
+
+       INIT_LIST_HEAD(&closure->clc_list);
+       closure->clc_origin = origin;
+       closure->clc_wait   = wait;
+       closure->clc_nr     = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                         struct cl_lock_closure *closure)
+{
+       const struct cl_lock_slice *slice;
+       int result;
+
+       ENTRY;
+       LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+       LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+       result = cl_lock_enclosure(env, lock, closure);
+       if (result == 0) {
+               list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                       if (slice->cls_ops->clo_closure != NULL) {
+                               result = slice->cls_ops->clo_closure(env, slice,
+                                                                    closure);
+                               if (result != 0)
+                                       break;
+                       }
+               }
+       }
+       if (result != 0)
+               cl_lock_disclosure(env, closure);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and, if successful, adds it to the closure (never more
+ * than once). If the try-lock fails, returns CLO_REPEAT, after optionally
+ * waiting until the next try-lock is likely to succeed.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+                     struct cl_lock_closure *closure)
+{
+       int result = 0;
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+       if (!cl_lock_mutex_try(env, lock)) {
+               /*
+                * If lock->cll_inclosure is not empty, lock is already in
+                * this closure.
+                */
+               if (list_empty(&lock->cll_inclosure)) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add(&lock->cll_reference, "closure", closure);
+                       list_add(&lock->cll_inclosure, &closure->clc_list);
+                       closure->clc_nr++;
+               } else
+                       cl_lock_mutex_put(env, lock);
+               result = 0;
+       } else {
+               cl_lock_disclosure(env, closure);
+               if (closure->clc_wait) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add(&lock->cll_reference, "closure-w", closure);
+                       cl_lock_mutex_put(env, closure->clc_origin);
+
+                       LASSERT(cl_lock_nr_mutexed(env) == 0);
+                       cl_lock_mutex_get(env, lock);
+                       cl_lock_mutex_put(env, lock);
+
+                       cl_lock_mutex_get(env, closure->clc_origin);
+                       lu_ref_del(&lock->cll_reference, "closure-w", closure);
+                       cl_lock_put(env, lock);
+               }
+               result = CLO_REPEAT;
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Releases mutices of enclosed locks. */
+void cl_lock_disclosure(const struct lu_env *env,
+                       struct cl_lock_closure *closure)
+{
+       struct cl_lock *scan;
+       struct cl_lock *temp;
+
+       cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+       list_for_each_entry_safe(scan, temp, &closure->clc_list,
+                                cll_inclosure) {
+               list_del_init(&scan->cll_inclosure);
+               cl_lock_mutex_put(env, scan);
+               lu_ref_del(&scan->cll_reference, "closure", closure);
+               cl_lock_put(env, scan);
+               closure->clc_nr--;
+       }
+       LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+       LASSERT(closure->clc_nr == 0);
+       LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
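+
+/*
+ * Illustrative closure life-cycle, a sketch rather than code from this
+ * patch; "origin" and "sub" are hypothetical locks, with "origin" mutexed
+ * by the caller:
+ *
+ *     struct cl_lock_closure closure;
+ *
+ *     cl_lock_closure_init(env, &closure, origin, 1);
+ *     if (cl_lock_closure_build(env, sub, &closure) == 0) {
+ *             // mutices of all enclosed locks are held at this point;
+ *             // on failure cl_lock_closure_build() discloses by itself
+ *             cl_lock_disclosure(env, &closure);
+ *     }
+ *     cl_lock_closure_fini(&closure);
+ */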
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that the lock is being
+ * destroyed, then destroys it. If there are holds on the lock, destruction is
+ * postponed until all holds are released. This is called when a decision is
+ * made to destroy the lock in the future, e.g., when a blocking AST is
+ * received on it, or a fatal communication error happens.
+ *
+ * The caller must hold a reference on this lock to prevent a deleted lock
+ * from lingering in memory indefinitely because nobody calls cl_lock_put()
+ * to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ *        cl_lock_nr_mutexed(env) == 1)
+ *      [i.e., if a top-lock is deleted, mutices of no other locks can be
+ *      held, as deletion of sub-locks might require releasing a top-lock
+ *      mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+                    cl_lock_nr_mutexed(env) == 1));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+       if (lock->cll_holds == 0)
+               cl_lock_delete0(env, lock);
+       else
+               lock->cll_flags |= CLF_DOOMED;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Marks the lock as irrecoverably failed and marks it for destruction. This
+ * happens when, e.g., the server fails to grant us a lock, or a network
+ * time-out occurs.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see clo_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       if (lock->cll_error == 0 && error != 0) {
+               cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+               lock->cll_error = error;
+               cl_lock_signal(env, lock);
+               cl_lock_cancel(env, lock);
+               cl_lock_delete(env, lock);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers (bottom-to-top) that the lock is being
+ * cancelled, then destroys it. If there are holds on the lock, cancellation
+ * is postponed until all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+       if (lock->cll_holds == 0)
+               cl_lock_cancel0(env, lock);
+       else
+               lock->cll_flags |= CLF_CANCELPEND;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering a given index, optionally different from a
+ * given \a except lock.
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+                                struct cl_object *obj, pgoff_t index,
+                                struct cl_lock *except,
+                                int pending, int canceld)
+{
+       struct cl_object_header *head;
+       struct cl_lock    *scan;
+       struct cl_lock    *lock;
+       struct cl_lock_descr    *need;
+
+       ENTRY;
+
+       head = cl_object_header(obj);
+       need = &cl_env_info(env)->clt_descr;
+       lock = NULL;
+
+       need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+                                   * not PHANTOM */
+       need->cld_start = need->cld_end = index;
+       need->cld_enq_flags = 0;
+
+       spin_lock(&head->coh_lock_guard);
+       /* It is fine to match any group lock since there could be only one
+        * with a unique gid and it conflicts with all other lock modes too */
+       list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+               if (scan != except &&
+                   (scan->cll_descr.cld_mode == CLM_GROUP ||
+                   cl_lock_ext_match(&scan->cll_descr, need)) &&
+                   scan->cll_state >= CLS_HELD &&
+                   scan->cll_state < CLS_FREEING &&
+                   /*
+                    * This check is racy as the lock can be canceled right
+                    * after it is done, but this is fine, because page exists
+                    * already.
+                    */
+                   (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+                   (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+                       /* Don't increase cs_hit here since this
+                        * is just a helper function. */
+                       cl_lock_get_trust(scan);
+                       lock = scan;
+                       break;
+               }
+       }
+       spin_unlock(&head->coh_lock_guard);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Calculates the page offset at the layer of @lock.
+ * At the time of this writing, @page is the top page and @lock is a sub-lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+       struct lu_device_type *dtype;
+       const struct cl_page_slice *slice;
+
+       dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type;
+       slice = cl_page_at(page, dtype);
+       LASSERT(slice != NULL);
+       return slice->cpl_page->cp_index;
+}
+
+/**
+ * Check if page @page is covered by an extra lock or discard it.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+                               struct cl_page *page, void *cbdata)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+       struct cl_lock *lock = cbdata;
+       pgoff_t index = pgoff_at_lock(page, lock);
+
+       if (index >= info->clt_fn_index) {
+               struct cl_lock *tmp;
+
+               /* refresh non-overlapped index */
+               tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+                                       lock, 1, 0);
+               if (tmp != NULL) {
+                       /* Cache the first non-overlapped index so as to skip
+                        * all pages within [index, clt_fn_index). This
+                        * is safe because if tmp lock is canceled, it will
+                        * discard these pages. */
+                       info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+                       if (tmp->cll_descr.cld_end == CL_PAGE_EOF)
+                               info->clt_fn_index = CL_PAGE_EOF;
+                       cl_lock_put(env, tmp);
+               } else if (cl_page_own(env, io, page) == 0) {
+                       /* discard the page */
+                       cl_page_unmap(env, io, page);
+                       cl_page_discard(env, io, page);
+                       cl_page_disown(env, io, page);
+               } else {
+                       LASSERT(page->cp_state == CPS_FREEING);
+               }
+       }
+
+       info->clt_next_index = index + 1;
+       return CLP_GANG_OKAY;
+}
+
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+                     struct cl_page *page, void *cbdata)
+{
+       struct cl_thread_info *info = cl_env_info(env);
+       struct cl_lock *lock   = cbdata;
+
+       LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+       KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+                     !PageWriteback(cl_page_vmpage(env, page))));
+       KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+                     !PageDirty(cl_page_vmpage(env, page))));
+
+       info->clt_next_index = pgoff_at_lock(page, lock) + 1;
+       if (cl_page_own(env, io, page) == 0) {
+               /* discard the page */
+               cl_page_unmap(env, io, page);
+               cl_page_discard(env, io, page);
+               cl_page_disown(env, io, page);
+       } else {
+               LASSERT(page->cp_state == CPS_FREEING);
+       }
+
+       return CLP_GANG_OKAY;
+}
+
+/**
+ * Discards pages protected by the given lock. This function traverses the
+ * radix tree to find all covered pages and discards them. If a page is
+ * covered by another lock as well, it remains in the cache.
+ *
+ * If an error occurs at any step, the process continues anyway (the
+ * reasoning behind this being that lock cancellation cannot be delayed
+ * indefinitely).
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+       struct cl_thread_info *info  = cl_env_info(env);
+       struct cl_io      *io    = &info->clt_io;
+       struct cl_lock_descr  *descr = &lock->cll_descr;
+       cl_page_gang_cb_t      cb;
+       int res;
+       int result;
+
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+
+       io->ci_obj = cl_object_top(descr->cld_obj);
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (result != 0)
+               GOTO(out, result);
+
+       cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+       info->clt_fn_index = info->clt_next_index = descr->cld_start;
+       do {
+               res = cl_page_gang_lookup(env, descr->cld_obj, io,
+                                         info->clt_next_index, descr->cld_end,
+                                         cb, (void *)lock);
+               if (info->clt_next_index > descr->cld_end)
+                       break;
+
+               if (res == CLP_GANG_RESCHED)
+                       cond_resched();
+       } while (res != CLP_GANG_OKAY);
+out:
+       cl_io_fini(env, io);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ *            destroying them.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+       struct cl_object_header *head;
+       struct cl_lock    *lock;
+
+       ENTRY;
+       head = cl_object_header(obj);
+       /*
+        * If locks are destroyed without cancellation, all pages must be
+        * already destroyed (as otherwise they will be left unprotected).
+        */
+       LASSERT(ergo(!cancel,
+                    head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+       spin_lock(&head->coh_lock_guard);
+       while (!list_empty(&head->coh_locks)) {
+               lock = container_of(head->coh_locks.next,
+                                   struct cl_lock, cll_linkage);
+               cl_lock_get_trust(lock);
+               spin_unlock(&head->coh_lock_guard);
+               lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state < CLS_FREEING) {
+                       LASSERT(lock->cll_users <= 1);
+                       if (unlikely(lock->cll_users == 1)) {
+                               struct l_wait_info lwi = { 0 };
+
+                               cl_lock_mutex_put(env, lock);
+                               l_wait_event(lock->cll_wq,
+                                            lock->cll_users == 0,
+                                            &lwi);
+                               goto again;
+                       }
+
+                       if (cancel)
+                               cl_lock_cancel(env, lock);
+                       cl_lock_delete(env, lock);
+               }
+               cl_lock_mutex_put(env, lock);
+               lu_ref_del(&lock->cll_reference, "prune", current);
+               cl_lock_put(env, lock);
+               spin_lock(&head->coh_lock_guard);
+       }
+       spin_unlock(&head->coh_lock_guard);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+                                         const struct cl_io *io,
+                                         const struct cl_lock_descr *need,
+                                         const char *scope, const void *source)
+{
+       struct cl_lock *lock;
+
+       ENTRY;
+
+       while (1) {
+               lock = cl_lock_find(env, io, need);
+               if (IS_ERR(lock))
+                       break;
+               cl_lock_mutex_get(env, lock);
+               if (lock->cll_state < CLS_FREEING &&
+                   !(lock->cll_flags & CLF_CANCELLED)) {
+                       cl_lock_hold_mod(env, lock, +1);
+                       lu_ref_add(&lock->cll_holders, scope, source);
+                       lu_ref_add(&lock->cll_reference, scope, source);
+                       break;
+               }
+               cl_lock_mutex_put(env, lock);
+               cl_lock_put(env, lock);
+       }
+       RETURN(lock);
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that lock is not in the CLS_FREEING state on return.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                            const struct cl_lock_descr *need,
+                            const char *scope, const void *source)
+{
+       struct cl_lock *lock;
+
+       ENTRY;
+
+       lock = cl_lock_hold_mutex(env, io, need, scope, source);
+       if (!IS_ERR(lock))
+               cl_lock_mutex_put(env, lock);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description.
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                               const struct cl_lock_descr *need,
+                               const char *scope, const void *source)
+{
+       struct cl_lock       *lock;
+       int                rc;
+       __u32            enqflags = need->cld_enq_flags;
+
+       ENTRY;
+       do {
+               lock = cl_lock_hold_mutex(env, io, need, scope, source);
+               if (IS_ERR(lock))
+                       break;
+
+               rc = cl_enqueue_locked(env, lock, io, enqflags);
+               if (rc == 0) {
+                       if (cl_lock_fits_into(env, lock, need, io)) {
+                               if (!(enqflags & CEF_AGL)) {
+                                       cl_lock_mutex_put(env, lock);
+                                       cl_lock_lockdep_acquire(env, lock,
+                                                               enqflags);
+                                       break;
+                               }
+                               rc = 1;
+                       }
+                       cl_unuse_locked(env, lock);
+               }
+               cl_lock_trace(D_DLMTRACE, env,
+                             rc <= 0 ? "enqueue failed" : "agl succeed", lock);
+               cl_lock_hold_release(env, lock, scope, source);
+               cl_lock_mutex_put(env, lock);
+               lu_ref_del(&lock->cll_reference, scope, source);
+               cl_lock_put(env, lock);
+               if (rc > 0) {
+                       LASSERT(enqflags & CEF_AGL);
+                       lock = NULL;
+               } else if (rc != 0) {
+                       lock = ERR_PTR(rc);
+               }
+       } while (rc == 0);
+       RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
+
+/**
+ * Adds a hold to a known lock.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+                     const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_state != CLS_FREEING);
+
+       ENTRY;
+       cl_lock_hold_mod(env, lock, +1);
+       cl_lock_get(lock);
+       lu_ref_add(&lock->cll_holders, scope, source);
+       lu_ref_add(&lock->cll_reference, scope, source);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+                   const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       cl_lock_hold_release(env, lock, scope, source);
+       lu_ref_del(&lock->cll_reference, scope, source);
+       cl_lock_put(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+                    const char *scope, const void *source)
+{
+       LINVRNT(cl_lock_invariant(env, lock));
+       ENTRY;
+       cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+       cl_lock_mutex_get(env, lock);
+       cl_lock_hold_release(env, lock, scope, source);
+       cl_lock_mutex_put(env, lock);
+       lu_ref_del(&lock->cll_reference, scope, source);
+       cl_lock_put(env, lock);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
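+
+/*
+ * Sketch of the hold/release pairing (an illustration, not from this
+ * patch): a hold taken with cl_lock_hold() is dropped with
+ * cl_lock_release() using the same scope/source pair, so that the lu_ref
+ * accounting of cll_holders and cll_reference stays balanced. "my_cookie"
+ * is a hypothetical placeholder.
+ *
+ *     lock = cl_lock_hold(env, io, need, "my-scope", my_cookie);
+ *     if (!IS_ERR(lock)) {
+ *             // ... inspect or prepare the lock ...
+ *             cl_lock_release(env, lock, "my-scope", my_cookie);
+ *     }
+ */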
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+
+       ENTRY;
+       cl_lock_used_mod(env, lock, +1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+       LINVRNT(cl_lock_is_mutexed(lock));
+       LINVRNT(cl_lock_invariant(env, lock));
+       LASSERT(lock->cll_users > 0);
+
+       ENTRY;
+       cl_lock_used_mod(env, lock, -1);
+       if (lock->cll_users == 0)
+               wake_up_all(&lock->cll_wq);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+       static const char *names[] = {
+               [CLM_PHANTOM] = "P",
+               [CLM_READ]    = "R",
+               [CLM_WRITE]   = "W",
+               [CLM_GROUP]   = "G"
+       };
+       if (0 <= mode && mode < ARRAY_SIZE(names))
+               return names[mode];
+       else
+               return "U";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                      lu_printer_t printer,
+                      const struct cl_lock_descr *descr)
+{
+       const struct lu_fid  *fid;
+
+       fid = lu_object_fid(&descr->cld_obj->co_lu);
+       (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock to the \a f.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t printer, const struct cl_lock *lock)
+{
+       const struct cl_lock_slice *slice;
+       (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+                  lock, atomic_read(&lock->cll_ref),
+                  lock->cll_state, lock->cll_error, lock->cll_holds,
+                  lock->cll_users, lock->cll_flags);
+       cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+       (*printer)(env, cookie, " {\n");
+
+       list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+               (*printer)(env, cookie, "    %s@%p: ",
+                          slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+                          slice);
+               if (slice->cls_ops->clo_print != NULL)
+                       slice->cls_ops->clo_print(env, cookie, printer, slice);
+               (*printer)(env, cookie, "\n");
+       }
+       (*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+       return lu_kmem_init(cl_lock_caches);
+}
+
+void cl_lock_fini(void)
+{
+       lu_kmem_fini(cl_lock_caches);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644 (file)
index 0000000..cdb5fba
--- /dev/null
@@ -0,0 +1,1148 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *       ->coh_page_guard
+ *       ->coh_lock_guard
+ *       ->coh_attr_guard
+ *       ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+       int result;
+
+       ENTRY;
+       result = lu_object_header_init(&h->coh_lu);
+       if (result == 0) {
+               spin_lock_init(&h->coh_page_guard);
+               spin_lock_init(&h->coh_lock_guard);
+               spin_lock_init(&h->coh_attr_guard);
+               lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+               lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+               lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+               h->coh_pages = 0;
+               /* XXX hard coded GFP_* mask. */
+               INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+               INIT_LIST_HEAD(&h->coh_locks);
+               h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8);
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+       LASSERT(list_empty(&h->coh_locks));
+       lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+                                struct cl_device *cd, const struct lu_fid *fid,
+                                const struct cl_object_conf *c)
+{
+       might_sleep();
+       return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When last reference is released object is returned to the cache, unless
+ * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+       lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+       lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+       struct cl_object_header *hdr = cl_object_header(o);
+       struct cl_object *top;
+
+       while (hdr->coh_parent != NULL)
+               hdr = hdr->coh_parent;
+
+       top = lu2cl(lu_object_top(&hdr->coh_lu));
+       CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+       return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+       return &cl_object_header(cl_object_top(o))->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+       spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+       spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+                      struct cl_attr *attr)
+{
+       struct lu_object_header *top;
+       int result;
+
+       LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+       ENTRY;
+
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+               if (obj->co_ops->coo_attr_get != NULL) {
+                       result = obj->co_ops->coo_attr_get(env, obj, attr);
+                       if (result != 0) {
+                               if (result > 0)
+                                       result = 0;
+                               break;
+                       }
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes, mentioned in a validness bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom
+ * to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+                      const struct cl_attr *attr, unsigned v)
+{
+       struct lu_object_header *top;
+       int result;
+
+       LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+       ENTRY;
+
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry_reverse(obj, &top->loh_layers,
+                                       co_lu.lo_linkage) {
+               if (obj->co_ops->coo_attr_set != NULL) {
+                       result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+                       if (result != 0) {
+                               if (result > 0)
+                                       result = 0;
+                               break;
+                       }
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
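+
+/*
+ * Illustrative sketch of the attribute protocol (not part of this patch):
+ * both cl_object_attr_get() and cl_object_attr_set() must run under
+ * cl_object_attr_lock(). "attr", "new_size" and the CAT_SIZE validness bit
+ * below are assumptions for the example.
+ *
+ *     cl_object_attr_lock(obj);
+ *     rc = cl_object_attr_get(env, obj, attr);
+ *     if (rc == 0) {
+ *             attr->cat_size = new_size;
+ *             rc = cl_object_attr_set(env, obj, attr, CAT_SIZE);
+ *     }
+ *     cl_object_attr_unlock(obj);
+ */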
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+                     struct ost_lvb *lvb)
+{
+       struct lu_object_header *top;
+       int result;
+
+       ENTRY;
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry_reverse(obj, &top->loh_layers,
+                                       co_lu.lo_linkage) {
+               if (obj->co_ops->coo_glimpse != NULL) {
+                       result = obj->co_ops->coo_glimpse(env, obj, lvb);
+                       if (result != 0)
+                               break;
+               }
+       }
+       LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+                        "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                        "ctime: "LPU64" blocks: "LPU64"\n",
+                        lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                        lvb->lvb_ctime, lvb->lvb_blocks);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+               const struct cl_object_conf *conf)
+{
+       struct lu_object_header *top;
+       int result;
+
+       ENTRY;
+       top = obj->co_lu.lo_header;
+       result = 0;
+       list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+               if (obj->co_ops->coo_conf_set != NULL) {
+                       result = obj->co_ops->coo_conf_set(env, obj, conf);
+                       if (result != 0)
+                               break;
+               }
+       }
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+       struct cl_object_header *hdr;
+
+       hdr = cl_object_header(obj);
+       LASSERT(hdr->coh_tree.rnode == NULL);
+       LASSERT(hdr->coh_pages == 0);
+
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+       /*
+        * Destroy all locks. Object destruction (including cl_inode_fini())
+        * cannot cancel the locks, because in the case of a local client,
+        * where client and server share the same thread running
+        * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+        * waiting on __wait_on_freeing_inode().
+        */
+       cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+       ENTRY;
+       cl_pages_prune(env, obj);
+       cl_locks_prune(env, obj, 1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks.
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+       struct cl_object_header *head = cl_object_header(obj);
+       int has;
+
+       spin_lock(&head->coh_lock_guard);
+       has = list_empty(&head->coh_locks);
+       spin_unlock(&head->coh_lock_guard);
+
+       return (has == 0);
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+       int i;
+
+       cs->cs_name = name;
+       for (i = 0; i < CS_NR; i++)
+               atomic_set(&cs->cs_stats[i], 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h)
+{
+       int i;
+       /*
+        *   lookup    hit    total  cached create
+        * env: ...... ...... ...... ...... ......
+        */
+       if (h) {
+               const char *names[CS_NR] = CS_NAMES;
+
+               seq_printf(m, "%6s", " ");
+               for (i = 0; i < CS_NR; i++)
+                       seq_printf(m, "%8s", names[i]);
+               seq_printf(m, "\n");
+       }
+
+       seq_printf(m, "%5.5s:", cs->cs_name);
+       for (i = 0; i < CS_NR; i++)
+               seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i]));
+       return 0;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+       int i;
+       int result;
+
+       result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+       if (result == 0) {
+               cache_stats_init(&s->cs_pages, "pages");
+               cache_stats_init(&s->cs_locks, "locks");
+               for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+                       atomic_set(&s->cs_pages_state[i], 0);
+               for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+                       atomic_set(&s->cs_locks_state[i], 0);
+       }
+       return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+       lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+       .cs_name    = "envs",
+       .cs_stats = { ATOMIC_INIT(0), }
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m)
+{
+       int i;
+       static const char *pstate[] = {
+               [CPS_CACHED]  = "c",
+               [CPS_OWNED]   = "o",
+               [CPS_PAGEOUT] = "w",
+               [CPS_PAGEIN]  = "r",
+               [CPS_FREEING] = "f"
+       };
+       static const char *lstate[] = {
+               [CLS_NEW]       = "n",
+               [CLS_QUEUING]   = "q",
+               [CLS_ENQUEUED]  = "e",
+               [CLS_HELD]      = "h",
+               [CLS_INTRANSIT] = "t",
+               [CLS_CACHED]    = "c",
+               [CLS_FREEING]   = "f"
+       };
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+       lu_site_stats_print(&site->cs_lu, m);
+       cache_stats_print(&site->cs_pages, m, 1);
+       seq_printf(m, " [");
+       for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+               seq_printf(m, "%s: %u ", pstate[i],
+                               atomic_read(&site->cs_pages_state[i]));
+       seq_printf(m, "]\n");
+       cache_stats_print(&site->cs_locks, m, 0);
+       seq_printf(m, " [");
+       for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+               seq_printf(m, "%s: %u ", lstate[i],
+                               atomic_read(&site->cs_locks_state[i]));
+       seq_printf(m, "]\n");
+       cache_stats_print(&cl_env_stats, m, 0);
+       seq_printf(m, "\n");
+       return 0;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store a cl_env pointer in task-specific
+ * structures. On Linux, it won't be easy to use task_struct->journal_info,
+ * because Lustre code may call into other file systems, which have their own
+ * assumptions about journal_info. Currently the following fields in
+ * task_struct have been identified as usable for this purpose:
+ *  - cl_env: for liblustre.
+ *  - tux_info: only on the RedHat kernel.
+ *  - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * called into Lustre, we'll never call into other parts of the kernel
+ * which use those fields in task_struct without explicitly exiting
+ * Lustre.
+ *
+ * If no space in task_struct is available, a hash table is used instead.
+ * bz20044, bz22683.
+ */
+
+struct cl_env {
+       void         *ce_magic;
+       struct lu_env     ce_lu;
+       struct lu_context ce_ses;
+
+       /**
+        * This allows cl_env to be entered into cl_env_hash which implements
+        * the current thread -> client environment lookup.
+        */
+       struct hlist_node  ce_node;
+       /**
+        * Owner for the current cl_env.
+        *
+        * If LL_TASK_CL_ENV is defined, this points to the owning current,
+        * for debugging purposes only; otherwise a hash table is used, and
+        * this is the key for cfs_hash. The current thread pid is stored,
+        * because using the thread pointer would lead to an unbalanced hash:
+        * its allocation locality is specific, and varies across platforms
+        * and OSes, even across OS versions.
+        */
+       void         *ce_owner;
+
+       /*
+        * Linkage into global list of all client environments. Used for
+        * garbage collection.
+        */
+       struct list_head        ce_linkage;
+       /* Reference counter. */
+       int            ce_ref;
+       /*
+        * Debugging field: address of the caller who made original
+        * allocation.
+        */
+       void         *ce_debug;
+};
+
+#define CL_ENV_INC(counter)
+#define CL_ENV_DEC(counter)
+
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+       LASSERT(cle->ce_ref == 0);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+       LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+
+       cle->ce_ref = 1;
+       cle->ce_debug = debug;
+       CL_ENV_INC(busy);
+}
+
+
+/*
+ * The implementation of using hash table to connect cl_env and thread
+ */
+
+static cfs_hash_t *cl_env_hash;
+
+static unsigned cl_env_hops_hash(cfs_hash_t *lh,
+                                const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64
+       return cfs_hash_u64_hash((__u64)key, mask);
+#else
+       return cfs_hash_u32_hash((__u32)key, mask);
+#endif
+}
+
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+       struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+       return (void *)cle;
+}
+
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+       struct cl_env *cle = cl_env_hops_obj(hn);
+
+       LASSERT(cle->ce_owner != NULL);
+       return (key == cle->ce_owner);
+}
+
+static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn)
+{
+       struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+       LASSERT(cle->ce_magic == &cl_env_init0);
+}
+
+static cfs_hash_ops_t cl_env_hops = {
+       .hs_hash        = cl_env_hops_hash,
+       .hs_key  = cl_env_hops_obj,
+       .hs_keycmp      = cl_env_hops_keycmp,
+       .hs_object      = cl_env_hops_obj,
+       .hs_get  = cl_env_hops_noop,
+       .hs_put_locked  = cl_env_hops_noop,
+};
+
+static inline struct cl_env *cl_env_fetch(void)
+{
+       struct cl_env *cle;
+
+       cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid);
+       LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+       return cle;
+}
+
+static inline void cl_env_attach(struct cl_env *cle)
+{
+       if (cle) {
+               int rc;
+
+               LASSERT(cle->ce_owner == NULL);
+               cle->ce_owner = (void *) (long) current->pid;
+               rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner,
+                                        &cle->ce_node);
+               LASSERT(rc == 0);
+       }
+}
+
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+       void *cookie;
+
+       LASSERT(cle->ce_owner == (void *) (long) current->pid);
+       cookie = cfs_hash_del(cl_env_hash, cle->ce_owner,
+                             &cle->ce_node);
+       LASSERT(cookie == cle);
+       cle->ce_owner = NULL;
+}
+
+static int cl_env_store_init(void)
+{
+       cl_env_hash = cfs_hash_create("cl_env",
+                                     HASH_CL_ENV_BITS, HASH_CL_ENV_BITS,
+                                     HASH_CL_ENV_BKT_BITS, 0,
+                                     CFS_HASH_MIN_THETA,
+                                     CFS_HASH_MAX_THETA,
+                                     &cl_env_hops,
+                                     CFS_HASH_RW_BKTLOCK);
+       return cl_env_hash != NULL ? 0 : -ENOMEM;
+}
+
+static void cl_env_store_fini(void)
+{
+       cfs_hash_putref(cl_env_hash);
+}
+
+
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+       if (cle == NULL)
+               cle = cl_env_fetch();
+
+       if (cle && cle->ce_owner)
+               cl_env_do_detach(cle);
+
+       return cle;
+}
+
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+       struct lu_env *env;
+       struct cl_env *cle;
+
+       OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO);
+       if (cle != NULL) {
+               int rc;
+
+               INIT_LIST_HEAD(&cle->ce_linkage);
+               cle->ce_magic = &cl_env_init0;
+               env = &cle->ce_lu;
+               rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+               if (rc == 0) {
+                       rc = lu_context_init(&cle->ce_ses,
+                                            LCT_SESSION | ses_tags);
+                       if (rc == 0) {
+                               lu_context_enter(&cle->ce_ses);
+                               env->le_ses = &cle->ce_ses;
+                               cl_env_init0(cle, debug);
+                       } else
+                               lu_env_fini(env);
+               }
+               if (rc != 0) {
+                       OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+                       env = ERR_PTR(rc);
+               } else {
+                       CL_ENV_INC(create);
+                       CL_ENV_INC(total);
+               }
+       } else
+               env = ERR_PTR(-ENOMEM);
+       return env;
+}
+
+static void cl_env_fini(struct cl_env *cle)
+{
+       CL_ENV_DEC(total);
+       lu_context_fini(&cle->ce_lu.le_ctx);
+       lu_context_fini(&cle->ce_ses);
+       OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+}
+
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+       return container_of(env, struct cl_env, ce_lu);
+}
+
+struct lu_env *cl_env_peek(int *refcheck)
+{
+       struct lu_env *env;
+       struct cl_env *cle;
+
+       CL_ENV_INC(lookup);
+
+       /* check that we don't go far from untrusted pointer */
+       CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+       env = NULL;
+       cle = cl_env_fetch();
+       if (cle != NULL) {
+               CL_ENV_INC(hit);
+               env = &cle->ce_lu;
+               *refcheck = ++cle->ce_ref;
+       }
+       CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+       return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned; otherwise, a new environment is allocated.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+       struct lu_env *env;
+
+       env = cl_env_peek(refcheck);
+       if (env == NULL) {
+               env = cl_env_new(lu_context_tags_default,
+                                lu_session_tags_default,
+                                __builtin_return_address(0));
+
+               if (!IS_ERR(env)) {
+                       struct cl_env *cle;
+
+                       cle = cl_env_container(env);
+                       cl_env_attach(cle);
+                       *refcheck = cle->ce_ref;
+                       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+               }
+       }
+       return env;
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+       struct lu_env *env;
+
+       LASSERT(cl_env_peek(refcheck) == NULL);
+       env = cl_env_new(tags, tags, __builtin_return_address(0));
+       if (!IS_ERR(env)) {
+               struct cl_env *cle;
+
+               cle = cl_env_container(env);
+               *refcheck = cle->ce_ref;
+               CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+       }
+       return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+static void cl_env_exit(struct cl_env *cle)
+{
+       LASSERT(cle->ce_owner == NULL);
+       lu_context_exit(&cle->ce_lu.le_ctx);
+       lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrements the \a env reference counter. When the counter drops to 0,
+ * nothing in this thread is using the environment, and it is returned to the
+ * allocation cache, or freed straight away if the cache is large enough.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle;
+
+       cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 0);
+       LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+       if (--cle->ce_ref == 0) {
+               CL_ENV_DEC(busy);
+               cl_env_detach(cle);
+               cle->ce_debug = NULL;
+               cl_env_exit(cle);
+               cl_env_fini(cle);
+       }
+}
+EXPORT_SYMBOL(cl_env_put);
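+
+/*
+ * Illustrative sketch of the refcheck protocol (not part of this patch):
+ * cl_env_get() and cl_env_put() are called in the same lexical scope with
+ * a pointer to the same integer, so a missed put is caught by the
+ * assertions above.
+ *
+ *     struct lu_env *env;
+ *     int refcheck;
+ *
+ *     env = cl_env_get(&refcheck);
+ *     if (IS_ERR(env))
+ *             return PTR_ERR(env);
+ *     // ... use env ...
+ *     cl_env_put(env, &refcheck);
+ */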
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+       return cl_env_detach(NULL);
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ */
+void cl_env_reexit(void *cookie)
+{
+       cl_env_detach(NULL);
+       cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Sets up the user-supplied \a env as the current environment. This can be
+ * used to guarantee that an environment exists even when cl_env_get() fails.
+ * It is up to the user to ensure proper concurrency control.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 0);
+
+       cl_env_attach(cle);
+       cl_env_get(refcheck);
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+       struct cl_env *cle = cl_env_container(env);
+
+       LASSERT(cle->ce_ref > 1);
+
+       CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+       cl_env_detach(cle);
+       cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+       struct lu_env *env;
+
+       nest->cen_cookie = NULL;
+       env = cl_env_peek(&nest->cen_refcheck);
+       if (env != NULL) {
+               if (!cl_io_is_going(env))
+                       return env;
+               else {
+                       cl_env_put(env, &nest->cen_refcheck);
+                       nest->cen_cookie = cl_env_reenter();
+               }
+       }
+       env = cl_env_get(&nest->cen_refcheck);
+       if (IS_ERR(env)) {
+               cl_env_reexit(nest->cen_cookie);
+               return env;
+       }
+
+       LASSERT(!cl_io_is_going(env));
+       return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+       cl_env_put(env, &nest->cen_refcheck);
+       cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
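+
+/*
+ * Illustrative sketch (not from the original patch): cl_env_nested_get()
+ * and cl_env_nested_put() wrap the reenter/get/put/reexit sequence above
+ * into a single pair, with the bookkeeping kept in struct cl_env_nest.
+ *
+ *     struct cl_env_nest nest;
+ *     struct lu_env *env = cl_env_nested_get(&nest);
+ *
+ *     if (!IS_ERR(env)) {
+ *             ... nested work ...
+ *             cl_env_nested_put(&nest, env);
+ *     }
+ */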
+
+/**
+ * Converts struct cl_attr to struct ost_lvb.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+       ENTRY;
+       lvb->lvb_size   = attr->cat_size;
+       lvb->lvb_mtime  = attr->cat_mtime;
+       lvb->lvb_atime  = attr->cat_atime;
+       lvb->lvb_ctime  = attr->cat_ctime;
+       lvb->lvb_blocks = attr->cat_blocks;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+       ENTRY;
+       attr->cat_size   = lvb->lvb_size;
+       attr->cat_mtime  = lvb->lvb_mtime;
+       attr->cat_atime  = lvb->lvb_atime;
+       attr->cat_ctime  = lvb->lvb_ctime;
+       attr->cat_blocks = lvb->lvb_blocks;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                               struct lu_device_type *ldt,
+                               struct lu_device *next)
+{
+       const char       *typename;
+       struct lu_device *d;
+
+       LASSERT(ldt != NULL);
+
+       typename = ldt->ldt_name;
+       d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+       if (!IS_ERR(d)) {
+               int rc;
+
+               if (site != NULL)
+                       d->ld_site = site;
+               rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next);
+               if (rc == 0) {
+                       lu_device_get(d);
+                       lu_ref_add(&d->ld_reference,
+                                  "lu-stack", &lu_site_init);
+               } else {
+                       ldt->ldt_ops->ldto_device_free(env, d);
+                       CERROR("can't init device '%s', %d\n", typename, rc);
+                       d = ERR_PTR(rc);
+               }
+       } else
+               CERROR("Cannot allocate device: '%s'\n", typename);
+       return lu2cl_dev(d);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini().
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+       lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+int  cl_lock_init(void);
+void cl_lock_fini(void);
+
+int  cl_page_init(void);
+void cl_page_fini(void);
+
+static struct lu_context_key cl_key;
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+       return lu_context_key_get(&env->le_ctx, &cl_key);
+}
+
+/* defines cl0_key_{init,fini}() */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+static void *cl_key_init(const struct lu_context *ctx,
+                        struct lu_context_key *key)
+{
+       struct cl_thread_info *info;
+
+       info = cl0_key_init(ctx, key);
+       if (!IS_ERR(info)) {
+               int i;
+
+               for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+                       lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+       }
+       return info;
+}
+
+static void cl_key_fini(const struct lu_context *ctx,
+                       struct lu_context_key *key, void *data)
+{
+       struct cl_thread_info *info;
+       int i;
+
+       info = data;
+       for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+               lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+       cl0_key_fini(ctx, key, data);
+}
+
+static void cl_key_exit(const struct lu_context *ctx,
+                       struct lu_context_key *key, void *data)
+{
+       struct cl_thread_info *info = data;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) {
+               LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+               LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+               lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+               lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+       }
+}
+
+static struct lu_context_key cl_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = cl_key_init,
+       .lct_fini = cl_key_fini,
+       .lct_exit = cl_key_exit
+};
+
+static struct lu_kmem_descr cl_object_caches[] = {
+       {
+               .ckd_cache = &cl_env_kmem,
+               .ckd_name  = "cl_env_kmem",
+               .ckd_size  = sizeof (struct cl_env)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/**
+ * Global initialization of cl-data. Create kmem caches, register
+ * lu_context_key's, etc.
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+       int result;
+
+       result = cl_env_store_init();
+       if (result)
+               return result;
+
+       result = lu_kmem_init(cl_object_caches);
+       if (result)
+               goto out_store;
+
+       LU_CONTEXT_KEY_INIT(&cl_key);
+       result = lu_context_key_register(&cl_key);
+       if (result)
+               goto out_kmem;
+
+       result = cl_lock_init();
+       if (result)
+               goto out_context;
+
+       result = cl_page_init();
+       if (result)
+               goto out_lock;
+
+       return 0;
+out_lock:
+       cl_lock_fini();
+out_context:
+       lu_context_key_degister(&cl_key);
+out_kmem:
+       lu_kmem_fini(cl_object_caches);
+out_store:
+       cl_env_store_fini();
+       return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ */
+void cl_global_fini(void)
+{
+       cl_lock_fini();
+       cl_page_fini();
+       lu_context_key_degister(&cl_key);
+       lu_kmem_fini(cl_object_caches);
+       cl_env_store_fini();
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644 (file)
index 0000000..bb93359
--- /dev/null
@@ -0,0 +1,1605 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <linux/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                           int radix);
+
+# define PASSERT(env, page, expr)                                     \
+  do {                                                             \
+         if (unlikely(!(expr))) {                                    \
+                 CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+                 LASSERT(0);                                      \
+         }                                                          \
+  } while (0)
+
+# define PINVRNT(env, page, exp) \
+       ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+
+/* Disable page statistics by default due to the huge performance penalty. */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(); it should only be called if the page
+ * is known not to be freed, e.g., with the page referenced, the radix tree
+ * lock held, or the page owned.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+       while (page->cp_parent != NULL)
+               page = page->cp_parent;
+       return page;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain an initial reference to a previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking the page radix-tree
+ * (cl_object_header::coh_page_guard), or by keeping a lock on the VM page
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+       LASSERT(atomic_read(&page->cp_ref) > 0);
+       atomic_inc(&page->cp_ref);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+                  const struct lu_device_type *dtype)
+{
+       const struct cl_page_slice *slice;
+       ENTRY;
+
+       page = cl_page_top_trusted((struct cl_page *)page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+                               RETURN(slice);
+               }
+               page = page->cp_child;
+       } while (page != NULL);
+       RETURN(NULL);
+}
+
+/**
+ * Returns the page with the given index in the given object, or NULL if no
+ * page is found. Acquires a reference on the returned page.
+ *
+ * Locking: called under the cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+       struct cl_page *page;
+
+       LASSERT(spin_is_locked(&hdr->coh_page_guard));
+
+       page = radix_tree_lookup(&hdr->coh_tree, index);
+       if (page != NULL)
+               cl_page_get_trust(page);
+       return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
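+
+/*
+ * Illustrative sketch (an assumption, not from the original patch):
+ * cl_page_lookup() must run under coh_page_guard, and the reference it
+ * takes is dropped with cl_page_put() once the lock has been released.
+ *
+ *     spin_lock(&hdr->coh_page_guard);
+ *     page = cl_page_lookup(hdr, index);
+ *     spin_unlock(&hdr->coh_page_guard);
+ *     if (page != NULL) {
+ *             ... use page ...
+ *             cl_page_put(env, page);
+ *     }
+ */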
+
+/**
+ * Walks pages of \a obj within the given [start, end] index range, invoking
+ * \a cb on each page found.
+ *
+ * If the walk starts hogging the CPU for too long, it gives up and returns
+ * CLP_GANG_RESCHED; in that case the caller should implement retry logic.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * At least one page is processed unless there is no covered page.
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io, pgoff_t start, pgoff_t end,
+                       cl_page_gang_cb_t cb, void *cbdata)
+{
+       struct cl_object_header *hdr;
+       struct cl_page    *page;
+       struct cl_page   **pvec;
+       const struct cl_page_slice  *slice;
+       const struct lu_device_type *dtype;
+       pgoff_t           idx;
+       unsigned int         nr;
+       unsigned int         i;
+       unsigned int         j;
+       int                   res = CLP_GANG_OKAY;
+       int                   tree_lock = 1;
+       ENTRY;
+
+       idx = start;
+       hdr = cl_object_header(obj);
+       pvec = cl_env_info(env)->clt_pvec;
+       dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+       spin_lock(&hdr->coh_page_guard);
+       while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+                                           idx, CLT_PVEC_SIZE)) > 0) {
+               int end_of_region = 0;
+               idx = pvec[nr - 1]->cp_index + 1;
+               for (i = 0, j = 0; i < nr; ++i) {
+                       page = pvec[i];
+                       pvec[i] = NULL;
+
+                       LASSERT(page->cp_type == CPT_CACHEABLE);
+                       if (page->cp_index > end) {
+                               end_of_region = 1;
+                               break;
+                       }
+                       if (page->cp_state == CPS_FREEING)
+                               continue;
+
+                       slice = cl_page_at_trusted(page, dtype);
+                       /*
+                        * Pages for an lsm-less file have no underlying
+                        * sub-page for osc, in case of ...
+                        */
+                       PASSERT(env, page, slice != NULL);
+
+                       page = slice->cpl_page;
+                       /*
+                        * Can safely call cl_page_get_trust() under the
+                        * radix-tree spin-lock.
+                        *
+                        * XXX not true, because @page is from an object
+                        * other than @hdr and protected by a different
+                        * tree lock.
+                        */
+                       cl_page_get_trust(page);
+                       lu_ref_add_atomic(&page->cp_reference,
+                                         "gang_lookup", current);
+                       pvec[j++] = page;
+               }
+
+               /*
+                * Here a delicate locking dance is performed. The current
+                * thread holds a reference to a page, but has to own it
+                * before it can be placed into a queue. Owning implies
+                * waiting, so the radix-tree lock is to be released. After
+                * the wait, one has to check that the pages weren't
+                * truncated (cl_page_own() returns an error in that case).
+                */
+               spin_unlock(&hdr->coh_page_guard);
+               tree_lock = 0;
+
+               for (i = 0; i < j; ++i) {
+                       page = pvec[i];
+                       if (res == CLP_GANG_OKAY)
+                               res = (*cb)(env, io, page, cbdata);
+                       lu_ref_del(&page->cp_reference,
+                                  "gang_lookup", current);
+                       cl_page_put(env, page);
+               }
+               if (nr < CLT_PVEC_SIZE || end_of_region)
+                       break;
+
+               if (res == CLP_GANG_OKAY && need_resched())
+                       res = CLP_GANG_RESCHED;
+               if (res != CLP_GANG_OKAY)
+                       break;
+
+               spin_lock(&hdr->coh_page_guard);
+               tree_lock = 1;
+       }
+       if (tree_lock)
+               spin_unlock(&hdr->coh_page_guard);
+       RETURN(res);
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
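+
+/*
+ * Illustrative sketch (not from the original patch): a minimal
+ * cl_page_gang_cb_t callback; page_prune_cb() further below in this file
+ * is a real in-tree example.
+ *
+ *     static int count_cb(const struct lu_env *env, struct cl_io *io,
+ *                         struct cl_page *page, void *cbdata)
+ *     {
+ *             (*(int *)cbdata)++;
+ *             return CLP_GANG_OKAY;
+ *     }
+ */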
+
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+       struct cl_object *obj  = page->cp_obj;
+       int pagesize = cl_object_header(obj)->coh_page_bufsize;
+
+       PASSERT(env, page, list_empty(&page->cp_batch));
+       PASSERT(env, page, page->cp_owner == NULL);
+       PASSERT(env, page, page->cp_req == NULL);
+       PASSERT(env, page, page->cp_parent == NULL);
+       PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+       ENTRY;
+       might_sleep();
+       while (!list_empty(&page->cp_layers)) {
+               struct cl_page_slice *slice;
+
+               slice = list_entry(page->cp_layers.next,
+                                      struct cl_page_slice, cpl_linkage);
+               list_del_init(page->cp_layers.next);
+               slice->cpl_ops->cpo_fini(env, slice);
+       }
+       CS_PAGE_DEC(obj, total);
+       CS_PAGESTATE_DEC(obj, page->cp_state);
+       lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
+       cl_object_put(env, obj);
+       lu_ref_fini(&page->cp_reference);
+       OBD_FREE(page, pagesize);
+       EXIT;
+}
+
+/**
+ * Helper function updating page state. This is the only place in the code
+ * where cl_page::cp_state field is mutated.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+                                          enum cl_page_state state)
+{
+       /* bypass const. */
+       *(enum cl_page_state *)&page->cp_state = state;
+}
+
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+               struct cl_object *o, pgoff_t ind, struct page *vmpage,
+               enum cl_page_type type)
+{
+       struct cl_page    *page;
+       struct lu_object_header *head;
+
+       ENTRY;
+       OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+                       __GFP_IO);
+       if (page != NULL) {
+               int result = 0;
+               atomic_set(&page->cp_ref, 1);
+               if (type == CPT_CACHEABLE) /* for radix tree */
+                       atomic_inc(&page->cp_ref);
+               page->cp_obj = o;
+               cl_object_get(o);
+               page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page", page);
+               page->cp_index = ind;
+               cl_page_state_set_trust(page, CPS_CACHED);
+               page->cp_type = type;
+               INIT_LIST_HEAD(&page->cp_layers);
+               INIT_LIST_HEAD(&page->cp_batch);
+               INIT_LIST_HEAD(&page->cp_flight);
+               mutex_init(&page->cp_mutex);
+               lu_ref_init(&page->cp_reference);
+               head = o->co_lu.lo_header;
+               list_for_each_entry(o, &head->loh_layers,
+                                       co_lu.lo_linkage) {
+                       if (o->co_ops->coo_page_init != NULL) {
+                               result = o->co_ops->coo_page_init(env, o,
+                                                                 page, vmpage);
+                               if (result != 0) {
+                                       cl_page_delete0(env, page, 0);
+                                       cl_page_free(env, page);
+                                       page = ERR_PTR(result);
+                                       break;
+                               }
+                       }
+               }
+               if (result == 0) {
+                       CS_PAGE_INC(o, total);
+                       CS_PAGE_INC(o, create);
+                       CS_PAGESTATE_DEC(o, CPS_CACHED);
+               }
+       } else {
+               page = ERR_PTR(-ENOMEM);
+       }
+       RETURN(page);
+}
+
+/**
+ * Returns the cl_page with index \a idx at the object \a o, associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. First, a
+ * cache (implemented as a per-object radix tree) is consulted. If the page
+ * is found there, it is returned immediately. Otherwise a new page is
+ * allocated and returned. In any case, an additional reference to the page
+ * is acquired.
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+                                    struct cl_object *o,
+                                    pgoff_t idx, struct page *vmpage,
+                                    enum cl_page_type type,
+                                    struct cl_page *parent)
+{
+       struct cl_page    *page = NULL;
+       struct cl_page    *ghost = NULL;
+       struct cl_object_header *hdr;
+       int err;
+
+       LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+       might_sleep();
+
+       ENTRY;
+
+       hdr = cl_object_header(o);
+       CS_PAGE_INC(o, lookup);
+
+       CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+              idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+       /* fast path. */
+       if (type == CPT_CACHEABLE) {
+               /* vmpage lock is used to protect the child/parent
+                * relationship */
+               KLASSERT(PageLocked(vmpage));
+               /*
+                * cl_vmpage_page() can be called here without any locks as
+                *
+                *     - "vmpage" is locked (which prevents ->private from
+                *       concurrent updates), and
+                *
+                *     - "o" cannot be destroyed while current thread holds a
+                *       reference on it.
+                */
+               page = cl_vmpage_page(vmpage, o);
+               PINVRNT(env, page,
+                       ergo(page != NULL,
+                            cl_page_vmpage(env, page) == vmpage &&
+                            (void *)radix_tree_lookup(&hdr->coh_tree,
+                                                      idx) == page));
+       }
+
+       if (page != NULL) {
+               CS_PAGE_INC(o, hit);
+               RETURN(page);
+       }
+
+       /* allocate and initialize cl_page */
+       page = cl_page_alloc(env, o, idx, vmpage, type);
+       if (IS_ERR(page))
+               RETURN(page);
+
+       if (type == CPT_TRANSIENT) {
+               if (parent) {
+                       LASSERT(page->cp_parent == NULL);
+                       page->cp_parent = parent;
+                       parent->cp_child = page;
+               }
+               RETURN(page);
+       }
+
+       /*
+        * XXX optimization: use radix_tree_preload() here, and change tree
+        * gfp mask to GFP_KERNEL in cl_object_header_init().
+        */
+       spin_lock(&hdr->coh_page_guard);
+       err = radix_tree_insert(&hdr->coh_tree, idx, page);
+       if (err != 0) {
+               ghost = page;
+               /*
+                * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+                * from this race, but
+                *
+                *     0. it's better to have cl_page interface "locally
+                *     consistent" so that its correctness can be reasoned
+                *     about without appealing to the (obscure world of) VM
+                *     locking.
+                *
+                *     1. handling this race allows ->coh_tree to remain
+                *     consistent even when VM locking is somehow busted,
+                *     which is very useful during diagnosing and debugging.
+                */
+               page = ERR_PTR(err);
+               CL_PAGE_DEBUG(D_ERROR, env, ghost,
+                             "fail to insert into radix tree: %d\n", err);
+       } else {
+               if (parent) {
+                       LASSERT(page->cp_parent == NULL);
+                       page->cp_parent = parent;
+                       parent->cp_child = page;
+               }
+               hdr->coh_pages++;
+       }
+       spin_unlock(&hdr->coh_page_guard);
+
+       if (unlikely(ghost != NULL)) {
+               cl_page_delete0(env, ghost, 0);
+               cl_page_free(env, ghost);
+       }
+       RETURN(page);
+}
+
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+                            pgoff_t idx, struct page *vmpage,
+                            enum cl_page_type type)
+{
+       return cl_page_find0(env, o, idx, vmpage, type, NULL);
+}
+EXPORT_SYMBOL(cl_page_find);
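+
+/*
+ * Illustrative sketch (an assumption, not from the original patch): for
+ * CPT_CACHEABLE pages the VM page must be locked across cl_page_find(),
+ * since the vmpage lock protects vmpage->private and the parent/child
+ * linkage.
+ *
+ *     lock_page(vmpage);
+ *     page = cl_page_find(env, obj, vmpage->index, vmpage, CPT_CACHEABLE);
+ *     unlock_page(vmpage);
+ *     if (!IS_ERR(page)) {
+ *             ... use page ...
+ *             cl_page_put(env, page);
+ *     }
+ */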
+
+
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+                                pgoff_t idx, struct page *vmpage,
+                                struct cl_page *parent)
+{
+       return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+       struct cl_object_header *header;
+       struct cl_page    *parent;
+       struct cl_page    *child;
+       struct cl_io        *owner;
+
+       /*
+        * Page invariant is protected by a VM lock.
+        */
+       LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+       header = cl_object_header(pg->cp_obj);
+       parent = pg->cp_parent;
+       child  = pg->cp_child;
+       owner  = pg->cp_owner;
+
+       return cl_page_in_use(pg) &&
+               ergo(parent != NULL, parent->cp_child == pg) &&
+               ergo(child != NULL, child->cp_parent == pg) &&
+               ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+               ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+               ergo(owner != NULL && parent != NULL,
+                    parent->cp_owner == pg->cp_owner->ci_parent) &&
+               ergo(owner != NULL && child != NULL,
+                    child->cp_owner->ci_parent == owner) &&
+               /*
+                * Either page is early in initialization (has neither child
+                * nor parent yet), or it is in the object radix tree.
+                */
+               ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
+                    (void *)radix_tree_lookup(&header->coh_tree,
+                                              pg->cp_index) == pg ||
+                    (child == NULL && parent == NULL));
+}
+
+static void cl_page_state_set0(const struct lu_env *env,
+                              struct cl_page *page, enum cl_page_state state)
+{
+       enum cl_page_state old;
+
+       /*
+        * Matrix of allowed state transitions [old][new], for sanity
+        * checking.
+        */
+       static const int allowed_transitions[CPS_NR][CPS_NR] = {
+               [CPS_CACHED] = {
+                       [CPS_CACHED]  = 0,
+                       [CPS_OWNED]   = 1, /* io finds existing cached page */
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 1, /* write-out from the cache */
+                       [CPS_FREEING] = 1, /* eviction on the memory pressure */
+               },
+               [CPS_OWNED] = {
+                       [CPS_CACHED]  = 1, /* release to the cache */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 1, /* start read immediately */
+                       [CPS_PAGEOUT] = 1, /* start write immediately */
+                       [CPS_FREEING] = 1, /* lock invalidation or truncate */
+               },
+               [CPS_PAGEIN] = {
+                       [CPS_CACHED]  = 1, /* io completion */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               },
+               [CPS_PAGEOUT] = {
+                       [CPS_CACHED]  = 1, /* io completion */
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               },
+               [CPS_FREEING] = {
+                       [CPS_CACHED]  = 0,
+                       [CPS_OWNED]   = 0,
+                       [CPS_PAGEIN]  = 0,
+                       [CPS_PAGEOUT] = 0,
+                       [CPS_FREEING] = 0,
+               }
+       };
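+       /*
+        * Reading the matrix (illustrative note, not in the original
+        * patch): a typical write-back lifecycle is CACHED -> OWNED ->
+        * PAGEOUT -> CACHED -> ... -> FREEING; PAGEIN and PAGEOUT can
+        * only complete back into CACHED.
+        */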
+
+       ENTRY;
+       old = page->cp_state;
+       PASSERT(env, page, allowed_transitions[old][state]);
+       CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+       for (; page != NULL; page = page->cp_child) {
+               PASSERT(env, page, page->cp_state == old);
+               PASSERT(env, page,
+                       equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+               CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+               CS_PAGESTATE_INC(page->cp_obj, state);
+               cl_page_state_set_trust(page, state);
+       }
+       EXIT;
+}
+
+static void cl_page_state_set(const struct lu_env *env,
+                             struct cl_page *page, enum cl_page_state state)
+{
+       cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by a caller already possessing a reference to
+ * \a page.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+       ENTRY;
+       cl_page_get_trust(page);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When the last reference is released, the page is returned to the cache,
+ * unless it is in the cl_page_state::CPS_FREEING state, in which case it is
+ * immediately destroyed.
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+       PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+       ENTRY;
+       CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+                      atomic_read(&page->cp_ref));
+
+       if (atomic_dec_and_test(&page->cp_ref)) {
+               LASSERT(page->cp_state == CPS_FREEING);
+
+               LASSERT(atomic_read(&page->cp_ref) == 0);
+               PASSERT(env, page, page->cp_owner == NULL);
+               PASSERT(env, page, list_empty(&page->cp_batch));
+               /*
+                * Page is no longer reachable by other threads. Tear
+                * it down.
+                */
+               cl_page_free(env, page);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
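+
+/*
+ * Illustrative sketch (not from the original patch): cl_page_get() and
+ * cl_page_put() pair in the usual take/drop fashion; the environment is
+ * only needed on the put side, where the page may be torn down.
+ *
+ *     cl_page_get(page);
+ *     ... hand "page" to another context ...
+ *     cl_page_put(env, page);
+ */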
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       /*
+        * Find uppermost layer with ->cpo_vmpage() method, and return its
+        * result.
+        */
+       page = cl_page_top(page);
+       do {
+               list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                       if (slice->cpl_ops->cpo_vmpage != NULL)
+                               RETURN(slice->cpl_ops->cpo_vmpage(env, slice));
+               }
+               page = page->cp_child;
+       } while (page != NULL);
+       LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Returns a cl_page associated with a VM page, and given cl_object.
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+       struct cl_page *top;
+       struct cl_page *page;
+
+       ENTRY;
+       KLASSERT(PageLocked(vmpage));
+
+       /*
+        * NOTE: absence of races and liveness of data are guaranteed by the
+        *       page lock on the "vmpage". That works because object
+        *       destruction proceeds bottom-to-top.
+        */
+
+       /*
+        * This loop assumes that ->private points to the top-most page. This
+        * can be rectified easily.
+        */
+       top = (struct cl_page *)vmpage->private;
+       if (top == NULL)
+               RETURN(NULL);
+
+       for (page = top; page != NULL; page = page->cp_child) {
+               if (cl_object_same(page->cp_obj, obj)) {
+                       cl_page_get_trust(page);
+                       break;
+               }
+       }
+       LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+       RETURN(page);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page.
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+       return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                      const struct lu_device_type *dtype)
+{
+       return cl_page_at_trusted(page, dtype);
+}
+EXPORT_SYMBOL(cl_page_at);
+
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...)             \
+({                                                                   \
+       const struct lu_env     *__env  = (_env);                   \
+       struct cl_page       *__page = (_page);            \
+       const struct cl_page_slice *__scan;                          \
+       int                      __result;                         \
+       ptrdiff_t                  __op   = (_op);                   \
+       int                    (*__method)_proto;                   \
+                                                                       \
+       __result = 0;                                              \
+       __page = cl_page_top(__page);                              \
+       do {                                                        \
+               list_for_each_entry(__scan, &__page->cp_layers,     \
+                                       cpl_linkage) {            \
+                       __method = *(void **)((char *)__scan->cpl_ops + \
+                                             __op);                \
+                       if (__method != NULL) {                  \
+                               __result = (*__method)(__env, __scan,   \
+                                                      ## __VA_ARGS__); \
+                               if (__result != 0)                    \
+                                       break;                    \
+                       }                                              \
+               }                                                      \
+               __page = __page->cp_child;                            \
+       } while (__page != NULL && __result == 0);                    \
+       if (__result > 0)                                              \
+               __result = 0;                                      \
+       __result;                                                      \
+})
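+
+/*
+ * Illustrative note (a sketch, not in the original patch): CL_PAGE_INVOKE()
+ * dispatches through cl_page_operations by byte offset. For example,
+ *
+ *     CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_own),
+ *                    (const struct lu_env *,
+ *                     const struct cl_page_slice *, struct cl_io *, int),
+ *                    io, nonblock);
+ *
+ * loads each slice's cpl_ops->cpo_own pointer via the offsetof()-derived
+ * _op and calls it, layer by layer, until one returns non-zero.
+ */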
+
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...)             \
+do {                                                               \
+       const struct lu_env     *__env  = (_env);                   \
+       struct cl_page       *__page = (_page);            \
+       const struct cl_page_slice *__scan;                          \
+       ptrdiff_t                  __op   = (_op);                   \
+       void                  (*__method)_proto;                    \
+                                                                       \
+       __page = cl_page_top(__page);                              \
+       do {                                                        \
+               list_for_each_entry(__scan, &__page->cp_layers,     \
+                                       cpl_linkage) {            \
+                       __method = *(void **)((char *)__scan->cpl_ops + \
+                                             __op);                \
+                       if (__method != NULL)                      \
+                               (*__method)(__env, __scan,            \
+                                           ## __VA_ARGS__);        \
+               }                                                      \
+               __page = __page->cp_child;                            \
+       } while (__page != NULL);                                      \
+} while (0)
+
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...)         \
+do {                                                                   \
+       const struct lu_env     *__env  = (_env);                       \
+       struct cl_page       *__page = (_page);                \
+       const struct cl_page_slice *__scan;                              \
+       ptrdiff_t                  __op   = (_op);                       \
+       void                  (*__method)_proto;                        \
+                                                                           \
+       /* get to the bottom page. */                                  \
+       while (__page->cp_child != NULL)                                    \
+               __page = __page->cp_child;                                \
+       do {                                                            \
+               list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+                                               cpl_linkage) {        \
+                       __method = *(void **)((char *)__scan->cpl_ops +     \
+                                             __op);                    \
+                       if (__method != NULL)                          \
+                               (*__method)(__env, __scan,                \
+                                           ## __VA_ARGS__);            \
+               }                                                          \
+               __page = __page->cp_parent;                              \
+       } while (__page != NULL);                                          \
+} while (0)
+
+static int cl_page_invoke(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+       PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+       ENTRY;
+       RETURN(CL_PAGE_INVOKE(env, page, op,
+                             (const struct lu_env *,
+                              const struct cl_page_slice *, struct cl_io *),
+                             io));
+}
+
+static void cl_page_invoid(const struct lu_env *env,
+                          struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+       PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+       ENTRY;
+       CL_PAGE_INVOID(env, page, op,
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, struct cl_io *), io);
+       EXIT;
+}
+
+static void cl_page_owner_clear(struct cl_page *page)
+{
+       ENTRY;
+       for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+               if (page->cp_owner != NULL) {
+                       LASSERT(page->cp_owner->ci_owned_nr > 0);
+                       page->cp_owner->ci_owned_nr--;
+                       page->cp_owner = NULL;
+                       page->cp_task = NULL;
+               }
+       }
+       EXIT;
+}
+
+static void cl_page_owner_set(struct cl_page *page)
+{
+       ENTRY;
+       for (page = cl_page_top(page); page != NULL; page = page->cp_child) {
+               LASSERT(page->cp_owner != NULL);
+               page->cp_owner->ci_owned_nr++;
+       }
+       EXIT;
+}
+
+void cl_page_disown0(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+       enum cl_page_state state;
+
+       ENTRY;
+       state = pg->cp_state;
+       PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       cl_page_owner_clear(pg);
+
+       if (state == CPS_OWNED)
+               cl_page_state_set(env, pg, CPS_CACHED);
+       /*
+        * Completion call-backs are executed in the bottom-up order, so that
+        * uppermost layer (llite), responsible for VFS/VM interaction runs
+        * last and can release locks safely.
+        */
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, struct cl_io *),
+                              io);
+       EXIT;
+}
+
+/**
+ * Returns true iff the page is owned by the given io.
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+       LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+       ENTRY;
+       RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page on behalf of an IO.
+ *
+ * Waits until the page is in the cl_page_state::CPS_CACHED state, and then
+ * switches it into the cl_page_state::CPS_OWNED state.
+ *
+ * \pre  !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0   success
+ *
+ * \retval -ve failure, e.g., the page was destroyed (and landed in
+ *          cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED),
+ *          or the page was owned by another thread, or is in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page *pg, int nonblock)
+{
+       int result;
+
+       PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+
+       if (pg->cp_state == CPS_FREEING) {
+               result = -ENOENT;
+       } else {
+               result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+                                       (const struct lu_env *,
+                                        const struct cl_page_slice *,
+                                        struct cl_io *, int),
+                                       io, nonblock);
+               if (result == 0) {
+                       PASSERT(env, pg, pg->cp_owner == NULL);
+                       PASSERT(env, pg, pg->cp_req == NULL);
+                       pg->cp_owner = io;
+                       pg->cp_task  = current;
+                       cl_page_owner_set(pg);
+                       if (pg->cp_state != CPS_FREEING) {
+                               cl_page_state_set(env, pg, CPS_OWNED);
+                       } else {
+                               cl_page_disown0(env, io, pg);
+                               result = -ENOENT;
+                       }
+               }
+       }
+       PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+       RETURN(result);
+}
+
+/**
+ * Own a page, might be blocked.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+       return cl_page_own0(env, io, pg, 0);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Nonblock version of cl_page_own().
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+                   struct cl_page *pg)
+{
+       return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
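+
+/*
+ * Illustrative sketch (an assumption, not from the original patch): the
+ * own/disown bracket around operating on a page within an IO context.
+ *
+ *     if (cl_page_own(env, io, pg) == 0) {
+ *             ... pg is in CPS_OWNED, exclusive to this io ...
+ *             cl_page_disown(env, io, pg);
+ *     }
+ */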
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+                   struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+
+       cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+       PASSERT(env, pg, pg->cp_owner == NULL);
+       pg->cp_owner = io;
+       pg->cp_task = current;
+       cl_page_owner_set(pg);
+       cl_page_state_set(env, pg, CPS_OWNED);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+       cl_page_owner_clear(pg);
+       cl_page_state_set(env, pg, CPS_CACHED);
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, struct cl_io *),
+                              io);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
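+
+/*
+ * Illustrative sketch (an assumption, not from the original patch):
+ * assume/unassume is the ownership bracket used when the VM has already
+ * locked the vmpage, e.g., in a writepage-like path.
+ *
+ *     ... vmpage is locked by the VM at this point ...
+ *     cl_page_assume(env, io, pg);
+ *     ... operate on pg ...
+ *     cl_page_unassume(env, io, pg);
+ */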
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+                   struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+       ENTRY;
+       pg = cl_page_top(pg);
+       io = cl_io_top(io);
+       cl_page_disown0(env, io, pg);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check the page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                           int radix)
+{
+       struct cl_page *tmp = pg;
+       ENTRY;
+
+       PASSERT(env, pg, pg == cl_page_top(pg));
+       PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+       /*
+        * Sever all ways to obtain new pointers to @pg.
+        */
+       cl_page_owner_clear(pg);
+
+       /*
+        * Unexport the page before freeing it, so that its content is
+        * considered invalid. We have to do this because a CPS_FREEING
+        * cl_page may NOT be under the protection of a cl_lock.
+        * Afterwards, if this page is found by other threads, it will be
+        * forced to be re-read.
+        */
+       cl_page_export(env, pg, 0);
+       cl_page_state_set0(env, pg, CPS_FREEING);
+
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+                      (const struct lu_env *, const struct cl_page_slice *));
+
+       if (tmp->cp_type == CPT_CACHEABLE) {
+               if (!radix)
+                       /* !radix means that @pg is not yet in the radix tree,
+                        * skip removing it.
+                        */
+                       tmp = pg->cp_child;
+               for (; tmp != NULL; tmp = tmp->cp_child) {
+                       void                *value;
+                       struct cl_object_header *hdr;
+
+                       hdr = cl_object_header(tmp->cp_obj);
+                       spin_lock(&hdr->coh_page_guard);
+                       value = radix_tree_delete(&hdr->coh_tree,
+                                                 tmp->cp_index);
+                       PASSERT(env, tmp, value == tmp);
+                       PASSERT(env, tmp, hdr->coh_pages > 0);
+                       hdr->coh_pages--;
+                       spin_unlock(&hdr->coh_page_guard);
+                       cl_page_put(env, tmp);
+               }
+       }
+
+       EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once the page reaches cl_page_state::CPS_FREEING, all remaining references
+ * will drain after some time, at which point the page will be recycled.
+ *
+ * \pre  pg == cl_page_top(pg)
+ * \pre  VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       ENTRY;
+       cl_page_delete0(env, pg, 1);
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+                 struct cl_io *io, struct cl_page *pg)
+{
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark/clear page as up-to-date
+ * by the \a uptodate argument.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true iff \a pg is VM-locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+       int result;
+       const struct cl_page_slice *slice;
+
+       ENTRY;
+       pg = cl_page_top_trusted((struct cl_page *)pg);
+       slice = container_of(pg->cp_layers.next,
+                            const struct cl_page_slice, cpl_linkage);
+       PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+       /*
+        * Call ->cpo_is_vmlocked() directly instead of going through
+        * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+        * cl_page_invariant().
+        */
+       result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+       PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+       RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+       ENTRY;
+       RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN);
+}
+
+static void cl_page_io_start(const struct lu_env *env,
+                            struct cl_page *pg, enum cl_req_type crt)
+{
+       /*
+        * Page is queued for IO, change its state.
+        */
+       ENTRY;
+       cl_page_owner_clear(pg);
+       cl_page_state_set(env, pg, cl_req_type_state(crt));
+       EXIT;
+}
+
+/**
+ * Prepares a page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). The
+ * layer handling interactions with the VM also has to inform the VM that
+ * the page is now under transfer.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+                struct cl_page *pg, enum cl_req_type crt)
+{
+       int result;
+
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+       PINVRNT(env, pg, crt < CRT_NR);
+
+       /*
+        * XXX this has to be called bottom-to-top, so that llite can set up
+        * PG_writeback without risking other layers deciding to skip this
+        * page.
+        */
+       if (crt >= CRT_NR)
+               return -EINVAL;
+       result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+       if (result == 0)
+               cl_page_io_start(env, pg, crt);
+
+       KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+                     equi(result == 0,
+                          PageWriteback(cl_page_vmpage(env, pg)))));
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by the transfer sub-system (which is a part of osc) to notify
+ * layers that a transfer of which this page is a part has completed.
+ *
+ * Completion call-backs are executed in bottom-up order, so that the
+ * uppermost layer (llite), responsible for the VFS/VM interaction, runs
+ * last and can release locks safely.
+ *
+ * \pre  pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+                       struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+       struct cl_sync_io *anchor = pg->cp_sync_io;
+
+       PASSERT(env, pg, crt < CRT_NR);
+       /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+       PASSERT(env, pg, pg->cp_req == NULL);
+       PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+       ENTRY;
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+       if (crt == CRT_READ && ioret == 0) {
+               PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+               pg->cp_flags |= CPF_READ_COMPLETED;
+       }
+
+       cl_page_state_set(env, pg, CPS_CACHED);
+       if (crt >= CRT_NR)
+               return;
+       CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, int), ioret);
+       if (anchor) {
+               LASSERT(cl_page_is_vmlocked(env, pg));
+               LASSERT(pg->cp_sync_io == anchor);
+               pg->cp_sync_io = NULL;
+       }
+       /*
+        * As page->cp_obj is pinned by a reference from page->cp_req, it is
+        * safe to call cl_page_put() without risking object destruction in a
+        * non-blocking context.
+        */
+       cl_page_put(env, pg);
+
+       if (anchor)
+               cl_sync_io_note(anchor, ioret);
+
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that the transfer formation engine decided to yank this
+ * page from the cache and to make it a part of a transfer.
+ *
+ * \pre  pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+                      enum cl_req_type crt)
+{
+       int result;
+
+       PINVRNT(env, pg, crt < CRT_NR);
+
+       ENTRY;
+       if (crt >= CRT_NR)
+               RETURN(-EINVAL);
+       result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *));
+       if (result == 0) {
+               PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+               cl_page_io_start(env, pg, crt);
+       }
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that a high-level IO decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+                     struct cl_page *pg, enum cl_req_type crt)
+{
+       const struct cl_page_slice *scan;
+       int result = 0;
+
+       PINVRNT(env, pg, crt < CRT_NR);
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+
+       if (crt >= CRT_NR)
+               RETURN(-EINVAL);
+
+       list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) {
+               if (scan->cpl_ops->io[crt].cpo_cache_add == NULL)
+                       continue;
+
+               result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io);
+               if (result != 0)
+                       break;
+       }
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called when a page is being written back at the kernel's request.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *pg)
+{
+       int result;
+
+       PINVRNT(env, pg, cl_page_is_owned(pg, io));
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       ENTRY;
+
+       result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether the page is protected by an extent lock of at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page)
+{
+       int rc;
+
+       PINVRNT(env, page, cl_page_invariant(page));
+
+       ENTRY;
+       rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+                           (const struct lu_env *,
+                            const struct cl_page_slice *, struct cl_io *),
+                           io);
+       PASSERT(env, page, rc != 0);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *page, void *cbdata)
+{
+       cl_page_own(env, io, page);
+       cl_page_unmap(env, io, page);
+       cl_page_discard(env, io, page);
+       cl_page_disown(env, io, page);
+       return CLP_GANG_OKAY;
+}
+
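+/*
+ * Note the ownership protocol in the callback above: a page must be owned
+ * before it can be unmapped or discarded, and is disowned afterwards so
+ * that the gang lookup can move past it.
+ */
+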
+/**
+ * Purges all cached pages belonging to the object \a clobj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+       struct cl_thread_info   *info;
+       struct cl_object        *obj = cl_object_top(clobj);
+       struct cl_io        *io;
+       int                   result;
+
+       ENTRY;
+       info  = cl_env_info(env);
+       io    = &info->clt_io;
+
+       /*
+        * Initialize the io. This is ugly since we never do IO in this
+        * function; we just make the cl_page_list functions happy. -jay
+        */
+       io->ci_obj = obj;
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, obj);
+       if (result != 0) {
+               cl_io_fini(env, io);
+               RETURN(io->ci_result);
+       }
+
+       do {
+               result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+                                            page_prune_cb, NULL);
+               if (result == CLP_GANG_RESCHED)
+                       cond_resched();
+       } while (result != CLP_GANG_OKAY);
+
+       cl_io_fini(env, io);
+       RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells the transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+                 int from, int to)
+{
+       PINVRNT(env, pg, cl_page_invariant(pg));
+
+       CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+       CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+                      (const struct lu_env *,
+                       const struct cl_page_slice *, int, int),
+                      from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a human-readable representation of \a pg through \a printer.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t printer, const struct cl_page *pg)
+{
+       (*printer)(env, cookie,
+                  "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+                  pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+                  pg->cp_index, pg->cp_parent, pg->cp_child,
+                  pg->cp_state, pg->cp_error, pg->cp_type,
+                  pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints a human-readable representation of \a pg through \a printer.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t printer, const struct cl_page *pg)
+{
+       struct cl_page *scan;
+
+       for (scan = cl_page_top((struct cl_page *)pg);
+            scan != NULL; scan = scan->cp_child)
+               cl_page_header_print(env, cookie, printer, scan);
+       CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+                      (const struct lu_env *env,
+                       const struct cl_page_slice *slice,
+                       void *cookie, lu_printer_t p), cookie, printer);
+       (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+       return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+                             (const struct lu_env *,
+                              const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+       /*
+        * XXX for now.
+        */
+       return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+       /*
+        * XXX for now.
+        */
+       return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+int cl_page_size(const struct cl_object *obj)
+{
+       return 1 << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
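+/*
+ * Illustrative example (no caller assumed): with the current 1:1 mapping to
+ * the VM page cache, cl_index() and cl_offset() are plain shifts.  On a 4K
+ * page system (PAGE_CACHE_SHIFT == 12):
+ *
+ *     cl_index(obj, 8192)  == 2      byte 8192 falls into page 2
+ *     cl_offset(obj, 2)    == 8192   page 2 starts at byte 8192
+ *     cl_page_size(obj)    == 4096
+ *
+ * The \a obj argument is unused for now (see the XXX comments above) but is
+ * kept so that a per-object page size could be introduced later.
+ */
+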
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                      struct cl_object *obj,
+                      const struct cl_page_operations *ops)
+{
+       ENTRY;
+       list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+       slice->cpl_obj  = obj;
+       slice->cpl_ops  = ops;
+       slice->cpl_page = page;
+       EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
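+/*
+ * Sketch of a hypothetical layer's cl_object_operations::coo_page_init()
+ * method (all "foo" names are invented; see the osc/lov page code for real
+ * callers): the layer embeds a cl_page_slice in its per-page state and
+ * stacks it on the page:
+ *
+ *     static int foo_page_init(const struct lu_env *env,
+ *                              struct cl_object *obj, struct cl_page *page,
+ *                              struct page *vmpage)
+ *     {
+ *             struct foo_page *fp = foo_page_alloc(env);
+ *
+ *             cl_page_slice_add(page, &fp->fp_cl, obj, &foo_page_ops);
+ *             return 0;
+ *     }
+ *
+ * (allocation and error handling elided)
+ */
+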
+int cl_page_init(void)
+{
+       return 0;
+}
+
+void cl_page_fini(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644 (file)
index 0000000..af1c2d0
--- /dev/null
@@ -0,0 +1,689 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <linux/atomic.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_build_version.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "llog_internal.h"
+
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+struct list_head obd_types;
+DEFINE_RWLOCK(obd_dev_lock);
+
+__u64 obd_max_pages;
+__u64 obd_max_alloc;
+DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+unsigned int obd_alloc_fail_rate;
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE;
+EXPORT_SYMBOL(obd_jobid_var);
+
+/* Get the jobid of the current process by reading the environment variable
+ * stored between "env_start" and "env_end" in the task struct.
+ *
+ * TODO:
+ * It would be better to cache the jobid for later use if there is an
+ * efficient way to do so; the cl_env code could probably be reused for
+ * this purpose.
+ *
+ * If some job scheduler doesn't store the jobid between "env_start/end",
+ * an upcall could be issued here to get the jobid via the userspace
+ * tools/API; the jobid would then have to be cached.
+ */
+int lustre_get_jobid(char *jobid)
+{
+       int jobid_len = JOBSTATS_JOBID_SIZE;
+       int rc = 0;
+       ENTRY;
+
+       memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+       /* Jobstats isn't enabled */
+       if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
+               RETURN(0);
+
+       /* Use process name + fsuid as jobid */
+       if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+               snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+                        current_comm(), current_fsuid());
+               RETURN(0);
+       }
+
+       rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
+       if (rc) {
+               if (rc == -EOVERFLOW) {
+                       /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
+                        * variable length strings instead of just numbers), it
+                        * might make sense to keep the unique parts for JobID,
+                        * instead of just returning an error.  That means a
+                        * larger temp buffer for cfs_get_environ(), then
+                        * truncating the string at some separator to fit into
+                        * the specified jobid_len.  Fix later if needed. */
+                       static bool printed;
+                       if (unlikely(!printed)) {
+                               LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+                                                  "for JobID buffer (%d)\n",
+                                                  obd_jobid_var, jobid_len);
+                               printed = true;
+                       }
+               } else {
+                       CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+                               rc == -EDEADLK) ? D_INFO : D_ERROR,
+                              "Get jobid for (%s) failed: rc = %d\n",
+                              obd_jobid_var, rc);
+               }
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
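+/*
+ * Sketch of a hypothetical caller: the buffer handed to lustre_get_jobid()
+ * must be at least JOBSTATS_JOBID_SIZE bytes, since the function zeroes and
+ * fills exactly that much:
+ *
+ *     char jobid[JOBSTATS_JOBID_SIZE];
+ *
+ *     if (lustre_get_jobid(jobid) == 0 && jobid[0] != '\0')
+ *             CDEBUG(D_INFO, "jobid: %s\n", jobid);
+ */
+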
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+                  size_t size, const char *file, int line)
+{
+       if (ptr == NULL ||
+           (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+               CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
+                      ptr ? "force " :"", type, name, (__u64)size, file,
+                      line);
+               CERROR(LPU64" total bytes and "LPU64" total pages "
+                      "("LPU64" bytes) allocated by Lustre, "
+                      "%d total bytes by LNET\n",
+                      obd_memory_sum(),
+                      obd_pages_sum() << PAGE_CACHE_SHIFT,
+                      obd_pages_sum(),
+                      atomic_read(&libcfs_kmemory));
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+static inline void obd_data2conn(struct lustre_handle *conn,
+                                struct obd_ioctl_data *data)
+{
+       memset(conn, 0, sizeof *conn);
+       conn->cookie = data->ioc_cookie;
+}
+
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+                                struct lustre_handle *conn)
+{
+       data->ioc_cookie = conn->cookie;
+}
+
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+       int rc;
+       int dev;
+
+       ENTRY;
+       if (!len || !name) {
+               CERROR("No name passed,!\n");
+               GOTO(out, rc = -EINVAL);
+       }
+       if (name[len - 1] != 0) {
+               CERROR("Name not nul terminated!\n");
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_IOCTL, "device name %s\n", name);
+       dev = class_name2dev(name);
+       if (dev == -1) {
+               CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev);
+       rc = dev;
+
+out:
+       RETURN(rc);
+}
+
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+       char *buf = NULL;
+       struct obd_ioctl_data *data;
+       struct libcfs_debug_ioctl_data *debug_data;
+       struct obd_device *obd = NULL;
+       int err = 0, len = 0;
+       ENTRY;
+
+       /* only for debugging */
+       if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+               debug_data = (struct libcfs_debug_ioctl_data *)arg;
+               libcfs_subsystem_debug = debug_data->subs;
+               libcfs_debug = debug_data->debug;
+               return 0;
+       }
+
+       CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+       if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+               CERROR("OBD ioctl: data error\n");
+               RETURN(-EINVAL);
+       }
+       data = (struct obd_ioctl_data *)buf;
+
+       switch (cmd) {
+       case OBD_IOC_PROCESS_CFG: {
+               struct lustre_cfg *lcfg;
+
+               if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+                       CERROR("No config buffer passed!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               OBD_ALLOC(lcfg, data->ioc_plen1);
+               if (lcfg == NULL)
+                       GOTO(out, err = -ENOMEM);
+               if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+                       err = -EFAULT;
+               if (!err)
+                       err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+               if (!err)
+                       err = class_process_config(lcfg);
+
+               OBD_FREE(lcfg, data->ioc_plen1);
+               GOTO(out, err);
+       }
+
+       case OBD_GET_VERSION:
+               if (!data->ioc_inlbuf1) {
+                       CERROR("No buffer passed in ioctl\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+                       CERROR("ioctl buffer too small to hold version\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               memcpy(data->ioc_bulk, BUILD_VERSION,
+                      strlen(BUILD_VERSION) + 1);
+
+               err = obd_ioctl_popdata((void *)arg, data, len);
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+
+       case OBD_IOC_NAME2DEV: {
+               /* Resolve a device name.  This does not change the
+                * currently selected device.
+                */
+               int dev;
+
+               dev = class_resolve_dev_name(data->ioc_inllen1,
+                                            data->ioc_inlbuf1);
+               data->ioc_dev = dev;
+               if (dev < 0)
+                       GOTO(out, err = -EINVAL);
+
+               err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+
+       case OBD_IOC_UUID2DEV: {
+               /* Resolve a device uuid.  This does not change the
+                * currently selected device.
+                */
+               int dev;
+               struct obd_uuid uuid;
+
+               if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+                       CERROR("No UUID passed!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+                       CERROR("UUID not NUL terminated!\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+               obd_str2uuid(&uuid, data->ioc_inlbuf1);
+               dev = class_uuid2dev(&uuid);
+               data->ioc_dev = dev;
+               if (dev == -1) {
+                       CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+                              data->ioc_inlbuf1);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+                      dev);
+               err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+
+       case OBD_IOC_CLOSE_UUID: {
+               CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+                      data->ioc_inlbuf1);
+               GOTO(out, err = 0);
+       }
+
+       case OBD_IOC_GETDEVICE: {
+               int     index = data->ioc_count;
+               char    *status, *str;
+
+               if (!data->ioc_inlbuf1) {
+                       CERROR("No buffer passed in ioctl\n");
+                       GOTO(out, err = -EINVAL);
+               }
+               if (data->ioc_inllen1 < 128) {
+                       CERROR("ioctl buffer too small to hold version\n");
+                       GOTO(out, err = -EINVAL);
+               }
+
+               obd = class_num2obd(index);
+               if (!obd)
+                       GOTO(out, err = -ENOENT);
+
+               if (obd->obd_stopping)
+                       status = "ST";
+               else if (obd->obd_set_up)
+                       status = "UP";
+               else if (obd->obd_attached)
+                       status = "AT";
+               else
+                       status = "--";
+               str = (char *)data->ioc_bulk;
+               snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+                        (int)index, status, obd->obd_type->typ_name,
+                        obd->obd_name, obd->obd_uuid.uuid,
+                        atomic_read(&obd->obd_refcount));
+               err = obd_ioctl_popdata((void *)arg, data, len);
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+
+       }
+
+       if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+               if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+                       GOTO(out, err = -EINVAL);
+               if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+                       GOTO(out, err = -EINVAL);
+               obd = class_name2obd(data->ioc_inlbuf4);
+       } else if (data->ioc_dev < class_devno_max()) {
+               obd = class_num2obd(data->ioc_dev);
+       } else {
+               CERROR("OBD ioctl: No device\n");
+               GOTO(out, err = -EINVAL);
+       }
+
+       if (obd == NULL) {
+               CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+               GOTO(out, err = -EINVAL);
+       }
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+       if (!obd->obd_set_up || obd->obd_stopping) {
+               CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev);
+               GOTO(out, err = -EINVAL);
+       }
+
+       switch (cmd) {
+       case OBD_IOC_NO_TRANSNO: {
+               if (!obd->obd_attached) {
+                       CERROR("Device %d not attached\n", obd->obd_minor);
+                       GOTO(out, err = -ENODEV);
+               }
+               CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+                      obd->obd_name);
+               obd->obd_no_transno = 1;
+               GOTO(out, err = 0);
+       }
+
+       default: {
+               err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+               if (err)
+                       GOTO(out, err);
+
+               err = obd_ioctl_popdata((void *)arg, data, len);
+               if (err)
+                       err = -EFAULT;
+               GOTO(out, err);
+       }
+       }
+
+ out:
+       if (buf)
+               obd_ioctl_freedata(buf, len);
+       RETURN(err);
+} /* class_handle_ioctl */
+
+extern psdev_t obd_psdev;
+
+#define OBD_INIT_CHECK
+int obd_init_checks(void)
+{
+       __u64 u64val, div64val;
+       char buf[64];
+       int len, ret = 0;
+
+       CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+
+       CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+
+       u64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPX64, u64val);
+       if (len != 18) {
+               CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
+
+       div64val = OBD_OBJECT_EOF;
+       CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+       if (u64val != OBD_OBJECT_EOF) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               ret = -EOVERFLOW;
+       }
+       if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+               CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+                      u64val, (int)sizeof(u64val));
+               return -EOVERFLOW;
+       }
+       if (do_div(div64val, 256) != (u64val & 255)) {
+               CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
+               return -EOVERFLOW;
+       }
+       if (u64val >> 8 != div64val) {
+               CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
+                      u64val, div64val, u64val >> 8);
+               return -EOVERFLOW;
+       }
+       len = snprintf(buf, sizeof(buf), LPX64, u64val);
+       if (len != 18) {
+               CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPU64, u64val);
+       if (len != 20) {
+               CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+               ret = -EINVAL;
+       }
+       len = snprintf(buf, sizeof(buf), LPD64, u64val);
+       if (len != 2) {
+               CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+               ret = -EINVAL;
+       }
+       if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+               CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
+                     (__u64)PAGE_CACHE_SIZE);
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+extern spinlock_t obd_types_lock;
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+
+static int __init init_obdclass(void)
+{
+       int i, err;
+       int lustre_register_fs(void);
+
+       for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+               INIT_LIST_HEAD(&capa_list[i]);
+
+       LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+       spin_lock_init(&obd_types_lock);
+       obd_zombie_impexp_init();
+#ifdef LPROCFS
+       obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+                                        LPROCFS_STATS_FLAG_NONE |
+                                        LPROCFS_STATS_FLAG_IRQ_SAFE);
+       if (obd_memory == NULL) {
+               CERROR("kmalloc of 'obd_memory' failed\n");
+               RETURN(-ENOMEM);
+       }
+
+       lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+                            LPROCFS_CNTR_AVGMINMAX,
+                            "memused", "bytes");
+       lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+                            LPROCFS_CNTR_AVGMINMAX,
+                            "pagesused", "pages");
+#endif
+       err = obd_init_checks();
+       if (err == -EOVERFLOW)
+               return err;
+
+       class_init_uuidlist();
+       err = class_handle_init();
+       if (err)
+               return err;
+
+       INIT_LIST_HEAD(&obd_types);
+
+       err = misc_register(&obd_psdev);
+       if (err) {
+               CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+               return err;
+       }
+
+       /* This struct is already zeroed for us (static global) */
+       for (i = 0; i < class_devno_max(); i++)
+               obd_devs[i] = NULL;
+
+       /* Default the dirty page cache cap to 1/2 of system memory.
+        * For clients with less memory, a larger fraction is needed
+        * for other purposes (mostly for BGL). */
+       if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT))
+               obd_max_dirty_pages = num_physpages / 4;
+       else
+               obd_max_dirty_pages = num_physpages / 2;
+
+       err = obd_init_caches();
+       if (err)
+               return err;
+       err = class_procfs_init();
+       if (err)
+               return err;
+
+       err = lu_global_init();
+       if (err)
+               return err;
+
+       err = cl_global_init();
+       if (err != 0)
+               return err;
+
+       err = llog_info_init();
+       if (err)
+               return err;
+
+       err = lustre_register_fs();
+
+       return err;
+}
+
+void obd_update_maxusage(void)
+{
+       __u64 max1, max2;
+
+       max1 = obd_pages_sum();
+       max2 = obd_memory_sum();
+
+       spin_lock(&obd_updatemax_lock);
+       if (max1 > obd_max_pages)
+               obd_max_pages = max1;
+       if (max2 > obd_max_alloc)
+               obd_max_alloc = max2;
+       spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#ifdef LPROCFS
+__u64 obd_memory_max(void)
+{
+       __u64 ret;
+
+       spin_lock(&obd_updatemax_lock);
+       ret = obd_max_alloc;
+       spin_unlock(&obd_updatemax_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+__u64 obd_pages_max(void)
+{
+       __u64 ret;
+
+       spin_lock(&obd_updatemax_lock);
+       ret = obd_max_pages;
+       spin_unlock(&obd_updatemax_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass(), apparently.  We carry on in this
+ * ifdef to the end of the file to cover module and versioning goo. */
+static void cleanup_obdclass(void)
+{
+       int i;
+       int lustre_unregister_fs(void);
+       __u64 memory_leaked, pages_leaked;
+       __u64 memory_max, pages_max;
+       ENTRY;
+
+       lustre_unregister_fs();
+
+       misc_deregister(&obd_psdev);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+               if (obd && obd->obd_set_up &&
+                   OBT(obd) && OBP(obd, detach)) {
+                       /* XXX should this call generic detach otherwise? */
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       OBP(obd, detach)(obd);
+               }
+       }
+       llog_info_fini();
+       cl_global_fini();
+       lu_global_fini();
+
+       obd_cleanup_caches();
+       obd_sysctl_clean();
+
+       class_procfs_clean();
+
+       class_handle_cleanup();
+       class_exit_uuidlist();
+       obd_zombie_impexp_stop();
+
+       memory_leaked = obd_memory_sum();
+       pages_leaked = obd_pages_sum();
+
+       memory_max = obd_memory_max();
+       pages_max = obd_pages_max();
+
+       lprocfs_free_stats(&obd_memory);
+       CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+              "obd_memory max: "LPU64", leaked: "LPU64"\n",
+              memory_max, memory_leaked);
+       CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+              "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
+              pages_max, pages_leaked);
+
+       EXIT;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+
+cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);
diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644 (file)
index 0000000..15f71bb
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+
+#include <obd_ost.h>
+#include <obd_support.h>
+#include <lustre_debug.h>
+#include <lustre_net.h>
+
+void dump_lniobuf(struct niobuf_local *nb)
+{
+       CDEBUG(D_RPCTRACE,
+              "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n",
+              nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+       CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n",
+                       nb->page ? page_index(nb->page) : -1);
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{
+       CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X,"
+              " stripe_size %u, stripe_count %u, refc: %d,"
+              " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm,
+              POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+              lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+              atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen,
+              lsm->lsm_pool_name);
+}
+EXPORT_SYMBOL(dump_lsm);
+
+#define LPDS sizeof(__u64)
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+       LASSERT(addr);
+
+       off = cpu_to_le64(off);
+       id = cpu_to_le64(id);
+       memcpy(addr, (char *)&off, LPDS);
+       memcpy(addr + LPDS, (char *)&id, LPDS);
+
+       addr += len - LPDS - LPDS;
+       memcpy(addr, (char *)&off, LPDS);
+       memcpy(addr + LPDS, (char *)&id, LPDS);
+
+       return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
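+/*
+ * Resulting buffer layout (LPDS == 8): block_debug_setup() stamps the
+ * 16-byte [off][id] pattern at both ends of the buffer, so that
+ * block_debug_check() below can detect corruption at either end:
+ *
+ *     addr:  [off|id] ........................... [off|id]
+ *     byte:   0    8                          len-16   len-8
+ */
+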
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+       __u64 ne_off;
+       int err = 0;
+
+       LASSERT(addr);
+
+       ne_off = le64_to_cpu(off);
+       id = le64_to_cpu(id);
+       if (memcmp(addr, (char *)&ne_off, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+               err = -EINVAL;
+       }
+       if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+                      who, id, off, *(__u64 *)(addr + LPDS), id);
+               err = -EINVAL;
+       }
+
+       addr += end - LPDS - LPDS;
+       if (memcmp(addr, (char *)&ne_off, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+               err = -EINVAL;
+       }
+       if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+               CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+                      LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id);
+               err = -EINVAL;
+       }
+
+       return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS
diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644 (file)
index 0000000..1c962dd
--- /dev/null
@@ -0,0 +1,1055 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd.h>
+#include <dt_object.h>
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+
+#include <lustre_quota.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+struct lu_context_key dt_key = {
+       .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+       .lct_init = dt_global_key_init,
+       .lct_fini = dt_global_key_fini
+};
+EXPORT_SYMBOL(dt_key);
+
+/* no lock is necessary to protect the list, because call-backs
+ * are added during system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+       list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+       list_del_init(&cb->dtc_linkage);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+int dt_txn_hook_start(const struct lu_env *env,
+                     struct dt_device *dev, struct thandle *th)
+{
+       int rc = 0;
+       struct dt_txn_callback *cb;
+
+       if (th->th_local)
+               return 0;
+
+       list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+               if (cb->dtc_txn_start == NULL ||
+                   !(cb->dtc_tag & env->le_ctx.lc_tags))
+                       continue;
+               rc = cb->dtc_txn_start(env, th, cb->dtc_cookie);
+               if (rc < 0)
+                       break;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+       struct dt_device       *dev = txn->th_dev;
+       struct dt_txn_callback *cb;
+       int                  rc = 0;
+
+       if (txn->th_local)
+               return 0;
+
+       list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) {
+               if (cb->dtc_txn_stop == NULL ||
+                   !(cb->dtc_tag & env->le_ctx.lc_tags))
+                       continue;
+               rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie);
+               if (rc < 0)
+                       break;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+void dt_txn_hook_commit(struct thandle *txn)
+{
+       struct dt_txn_callback *cb;
+
+       if (txn->th_local)
+               return;
+
+       list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks,
+                               dtc_linkage) {
+               if (cb->dtc_txn_commit)
+                       cb->dtc_txn_commit(txn, cb->dtc_cookie);
+       }
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
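+/*
+ * Sketch of a hypothetical consumer ("foo" names invented): a callback is
+ * registered once at setup time and the hooks above then invoke it around
+ * every non-local transaction whose context tags match dtc_tag:
+ *
+ *     static int foo_txn_start(const struct lu_env *env,
+ *                              struct thandle *th, void *cookie)
+ *     { ... }
+ *
+ *     static struct dt_txn_callback foo_cb = {
+ *             .dtc_txn_start = foo_txn_start,
+ *             .dtc_tag       = LCT_DT_THREAD,
+ *             .dtc_cookie    = &foo_state,
+ *     };
+ *
+ *     dt_txn_callback_add(dt_dev, &foo_cb);
+ */
+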
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+       INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+       return lu_device_init(&dev->dd_lu_dev, t);
+}
+EXPORT_SYMBOL(dt_device_init);
+
+void dt_device_fini(struct dt_device *dev)
+{
+       lu_device_fini(&dev->dd_lu_dev);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+int dt_object_init(struct dt_object *obj,
+                  struct lu_object_header *h, struct lu_device *d)
+{
+       return lu_object_init(&obj->do_lu, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+void dt_object_fini(struct dt_object *obj)
+{
+       lu_object_fini(&obj->do_lu);
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+       if (obj->do_index_ops == NULL)
+               obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+       return obj->do_index_ops != NULL;
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+       enum dt_format_type result;
+
+       switch (mode & S_IFMT) {
+       case S_IFDIR:
+               result = DFT_DIR;
+               break;
+       case S_IFREG:
+               result = DFT_REGULAR;
+               break;
+       case S_IFLNK:
+               result = DFT_SYM;
+               break;
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFIFO:
+       case S_IFSOCK:
+               result = DFT_NODE;
+               break;
+       default:
+               LBUG();
+               break;
+       }
+       return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
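+/*
+ * For example, dt_mode_to_dft(S_IFREG | 0644) == DFT_REGULAR and
+ * dt_mode_to_dft(S_IFDIR | 0755) == DFT_DIR; special files map to DFT_NODE
+ * and a mode without a recognized S_IFMT type triggers LBUG().
+ */
+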
+/**
+ * Lookup the fid for an object named \a name in directory \a dir.
+ */
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+                 const char *name, struct lu_fid *fid)
+{
+       if (dt_try_as_dir(env, dir))
+               return dt_lookup(env, dir, (struct dt_rec *)fid,
+                                (const struct dt_key *)name, BYPASS_CAPA);
+       return -ENOTDIR;
+}
+EXPORT_SYMBOL(dt_lookup_dir);
+
+/* This differs from dt_locate() in that it takes top_dev as a parameter
+ * instead of deriving it from the lu_site. */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+                              struct dt_device *dev, const struct lu_fid *fid,
+                              struct lu_device *top_dev)
+{
+       struct lu_object *lo, *n;
+       ENTRY;
+
+       lo = lu_object_find_at(env, top_dev, fid, NULL);
+       if (IS_ERR(lo))
+               return (void *)lo;
+
+       LASSERT(lo != NULL);
+
+       list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+               if (n->lo_dev == &dev->dd_lu_dev)
+                       return container_of0(n, struct dt_object, do_lu);
+       }
+       return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Find an object named \a entry in the given \a dfh->dfh_o directory.
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+       struct dt_find_hint  *dfh = data;
+       struct dt_device     *dt = dfh->dfh_dt;
+       struct lu_fid   *fid = dfh->dfh_fid;
+       struct dt_object     *obj = dfh->dfh_o;
+       int                result;
+
+       result = dt_lookup_dir(env, obj, entry, fid);
+       lu_object_put(env, &obj->do_lu);
+       if (result == 0) {
+               obj = dt_locate(env, dt, fid);
+               if (IS_ERR(obj))
+                       result = PTR_ERR(obj);
+       }
+       dfh->dfh_o = obj;
+       return result;
+}
+
+/**
+ * Abstract function that parses a path name, feeding each path
+ * component to \a entry_func.
+ */
+int dt_path_parser(const struct lu_env *env,
+                  char *path, dt_entry_func_t entry_func,
+                  void *data)
+{
+       char *e;
+       int rc = 0;
+
+       while (1) {
+               e = strsep(&path, "/");
+               if (e == NULL)
+                       break;
+
+               if (e[0] == 0) {
+                       if (!path || path[0] == '\0')
+                               break;
+                       continue;
+               }
+               rc = entry_func(env, e, data);
+               if (rc)
+                       break;
+       }
+
+       return rc;
+}
+
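+/*
+ * Example walk (illustrative only): for the path "lustre//mdd/seq",
+ * dt_path_parser() feeds the components "lustre", "mdd" and "seq" to
+ * \a entry_func in turn; empty components produced by doubled or
+ * trailing slashes are skipped.
+ */
+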
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+                const char *path, struct lu_fid *fid)
+{
+       struct dt_thread_info *info = dt_info(env);
+       struct dt_find_hint   *dfh = &info->dti_dfh;
+       struct dt_object      *obj;
+       char                  *local = info->dti_buf;
+       int                    result;
+
+       dfh->dfh_dt = dt;
+       dfh->dfh_fid = fid;
+
+       strncpy(local, path, DT_MAX_PATH);
+       local[DT_MAX_PATH - 1] = '\0';
+
+       result = dt->dd_ops->dt_root_get(env, dt, fid);
+       if (result == 0) {
+               obj = dt_locate(env, dt, fid);
+               if (!IS_ERR(obj)) {
+                       dfh->dfh_o = obj;
+                       result = dt_path_parser(env, local, dt_find_entry, dfh);
+                       if (result != 0)
+                               obj = ERR_PTR(result);
+                       else
+                               obj = dfh->dfh_o;
+               }
+       } else {
+               obj = ERR_PTR(result);
+       }
+       return obj;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+                                    struct dt_device *dt,
+                                    struct dt_object *p,
+                                    const char *name,
+                                    struct lu_fid *fid)
+{
+       struct dt_object *o;
+       int result;
+
+       result = dt_lookup_dir(env, p, name, fid);
+       if (result == 0)
+               o = dt_locate(env, dt, fid);
+       else
+               o = ERR_PTR(result);
+
+       return o;
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
+ *      \param  dt      dt device
+ *      \param  fid     on success, object fid is stored in *fid
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+                               struct dt_device *dt,
+                               const char *dirname,
+                               const char *filename,
+                               struct lu_fid *fid)
+{
+       struct dt_object *file;
+       struct dt_object *dir;
+
+       dir = dt_store_resolve(env, dt, dirname, fid);
+       if (!IS_ERR(dir)) {
+               file = dt_reg_open(env, dt, dir,
+                                  filename, fid);
+               lu_object_put(env, &dir->do_lu);
+       } else {
+               file = dir;
+       }
+       return file;
+}
+EXPORT_SYMBOL(dt_store_open);
+
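+/*
+ * Hypothetical usage (directory and file names are only for illustration):
+ *
+ *     struct lu_fid fid;
+ *     struct dt_object *o;
+ *
+ *     o = dt_store_open(env, dt, "CONFIGS", "foo-client", &fid);
+ *     if (!IS_ERR(o)) {
+ *             ... use o ...
+ *             lu_object_put(env, &o->do_lu);
+ *     }
+ */
+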
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object_format *dof,
+                                   struct lu_attr *at)
+{
+       struct dt_object *dto;
+       struct thandle *th;
+       int rc;
+
+       ENTRY;
+
+       dto = dt_locate(env, dt, fid);
+       if (IS_ERR(dto))
+               RETURN(dto);
+
+       LASSERT(dto != NULL);
+       if (dt_object_exists(dto))
+               RETURN(dto);
+
+       th = dt_trans_create(env, dt);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = dt_declare_create(env, dto, at, NULL, dof, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       rc = dt_trans_start_local(env, dt, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       if (dt_object_exists(dto))
+               GOTO(unlock, rc = 0);
+
+       CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+       rc = dt_create(env, dto, at, NULL, dof, th);
+       if (rc)
+               GOTO(unlock, rc);
+       LASSERT(dt_object_exists(dto));
+unlock:
+       dt_write_unlock(env, dto);
+trans_stop:
+       dt_trans_stop(env, dt, th);
+out:
+       if (rc) {
+               lu_object_put(env, &dto->do_lu);
+               RETURN(ERR_PTR(rc));
+       }
+       RETURN(dto);
+}
+EXPORT_SYMBOL(dt_find_or_create);
+
+/* dt class init function. */
+int dt_global_init(void)
+{
+       int result;
+
+       LU_CONTEXT_KEY_INIT(&dt_key);
+       result = lu_context_key_register(&dt_key);
+       return result;
+}
+
+void dt_global_fini(void)
+{
+       lu_context_key_degister(&dt_key);
+}
+
+/**
+ * Generic read helper. Unlike dt_record_read(), a partial read is not an
+ * error; the number of bytes actually read is returned.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+           struct lu_buf *buf, loff_t *pos)
+{
+       LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+       return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read structures of fixed size from storage.  Unlike dt_read(), using
+ * dt_record_read() will return an error for partial reads.
+ *
+ * \param env  lustre environment
+ * \param dt   object to be read
+ * \param buf  lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+                  struct lu_buf *buf, loff_t *pos)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+       rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+
+       if (rc == buf->lb_len)
+               rc = 0;
+       else if (rc >= 0)
+               rc = -EFAULT;
+       return rc;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+                   const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+       int rc;
+
+       LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+       LASSERT(th != NULL);
+       LASSERT(dt->do_body_ops);
+       LASSERT(dt->do_body_ops->dbo_write);
+       rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1);
+       if (rc == buf->lb_len)
+               rc = 0;
+       else if (rc >= 0)
+               rc = -EFAULT;
+       return rc;
+}
+EXPORT_SYMBOL(dt_record_write);
+
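+/*
+ * Fixed-size record round trip (sketch; "struct foo_rec" is invented and
+ * transaction declaration/start is elided):
+ *
+ *     struct foo_rec rec;
+ *     struct lu_buf buf = { .lb_buf = &rec, .lb_len = sizeof(rec) };
+ *     loff_t pos = 0;
+ *
+ *     rc = dt_record_write(env, o, &buf, &pos, th);   pos advances by
+ *                                                     sizeof(rec) on success
+ *     pos = 0;
+ *     rc = dt_record_read(env, o, &buf, &pos);        0, -EFAULT on a short
+ *                                                     read, or -ve errno
+ */
+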
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+                          struct thandle *th)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+
+       LASSERT(o);
+       vbuf.lb_buf = NULL;
+       vbuf.lb_len = sizeof(dt_obj_version_t);
+       return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th);
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+                   dt_obj_version_t version, struct thandle *th)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+       int rc;
+
+       LASSERT(o);
+       vbuf.lb_buf = &version;
+       vbuf.lb_len = sizeof(version);
+
+       rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA);
+       if (rc < 0)
+               CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+       return;
+}
+EXPORT_SYMBOL(dt_version_set);
+
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+       struct lu_buf vbuf;
+       char *xname = XATTR_NAME_VERSION;
+       dt_obj_version_t version;
+       int rc;
+
+       LASSERT(o);
+       vbuf.lb_buf = &version;
+       vbuf.lb_len = sizeof(version);
+       rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA);
+       if (rc != sizeof(version)) {
+               CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+               version = 0;
+       }
+       return version;
+}
+EXPORT_SYMBOL(dt_version_get);
+
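+/*
+ * The version is kept in the XATTR_NAME_VERSION extended attribute, so the
+ * pair above round-trips through dt_xattr_set()/dt_xattr_get(); callers of
+ * dt_version_set() must have reserved the xattr update in the transaction
+ * via dt_declare_version_set().  dt_version_get() returns 0 when the xattr
+ * is missing or of unexpected size.
+ */
+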
+/* list of all supported index types */
+
+/* directories */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck */
+const struct dt_index_features dt_lfsck_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       .dif_keysize_min        = sizeof(struct lu_fid),
+       .dif_keysize_max        = sizeof(struct lu_fid),
+       .dif_recsize_min        = sizeof(__u8),
+       .dif_recsize_max        = sizeof(__u8),
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes */
+const struct dt_index_features dt_acct_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_acct_rec), /* 16 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_acct_rec), /* 16 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files */
+const struct dt_index_features dt_quota_glb_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       /* a different key would have to be used for per-directory quota */
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_glb_rec), /* 32 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_glb_rec), /* 32 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files */
+const struct dt_index_features dt_quota_slv_features = {
+       .dif_flags              = DT_IND_UPDATE,
+       /* a different key would have to be used for per-directory quota */
+       .dif_keysize_min        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_keysize_max        = sizeof(__u64), /* 64-bit uid/gid */
+       .dif_recsize_min        = sizeof(struct lquota_slv_rec), /* 8 bytes */
+       .dif_recsize_max        = sizeof(struct lquota_slv_rec), /* 8 bytes */
+       .dif_ptrsize            = 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/* helper function returning what dt_index_features structure should be used
+ * based on the FID sequence. This is used by OBD_IDX_READ RPC */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+                                                                  __u32 mode)
+{
+       if (seq == FID_SEQ_QUOTA_GLB) {
+               /* global quota index */
+               if (!S_ISREG(mode))
+                       /* global quota index should be a regular file */
+                       return ERR_PTR(-ENOENT);
+               return &dt_quota_glb_features;
+       } else if (seq == FID_SEQ_QUOTA) {
+               /* quota slave index */
+               if (!S_ISREG(mode))
+                       /* slave index should be a regular file */
+                       return ERR_PTR(-ENOENT);
+               return &dt_quota_slv_features;
+       } else if (seq >= FID_SEQ_NORMAL) {
+               /* object is part of the namespace, verify that it is a
+                * directory */
+               if (!S_ISDIR(mode))
+                       /* sorry, we can only deal with directory */
+                       return ERR_PTR(-ENOTDIR);
+               return &dt_directory_features;
+       }
+
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
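+/*
+ * For example, a FID in FID_SEQ_QUOTA_GLB must name a regular file and
+ * selects dt_quota_glb_features, while any sequence >= FID_SEQ_NORMAL must
+ * name a directory and selects dt_directory_features.
+ */
+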
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp  - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it   - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg  - is a pointer to the idx_info structure
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+                              int nob, const struct dt_it_ops *iops,
+                              struct dt_it *it, __u32 attr, void *arg)
+{
+       struct idx_info         *ii = (struct idx_info *)arg;
+       struct lu_idxpage       *lip = &lp->lp_idx;
+       char                    *entry;
+       int                      rc, size;
+       ENTRY;
+
+       /* no support for variable key & record size for now */
+       LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+       LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+       /* initialize the header of the new container */
+       memset(lip, 0, LIP_HDR_SIZE);
+       lip->lip_magic = LIP_MAGIC;
+       nob -= LIP_HDR_SIZE;
+
+       /* compute size needed to store a key/record pair */
+       size = ii->ii_recsize + ii->ii_keysize;
+       if ((ii->ii_flags & II_FL_NOHASH) == 0)
+               /* add hash if the client wants it */
+               size += sizeof(__u64);
+
+       entry = lip->lip_entries;
+       do {
+               char            *tmp_entry = entry;
+               struct dt_key   *key;
+               __u64            hash;
+
+               /* fetch 64-bit hash value */
+               hash = iops->store(env, it);
+               ii->ii_hash_end = hash;
+
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+                       if (lip->lip_nr != 0)
+                               GOTO(out, rc = 0);
+               }
+
+               if (nob < size) {
+                       if (lip->lip_nr == 0)
+                               GOTO(out, rc = -EINVAL);
+                       GOTO(out, rc = 0);
+               }
+
+               if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+                       /* client wants the 64-bit hash value associated
+                        * with each record */
+                       memcpy(tmp_entry, &hash, sizeof(hash));
+                       tmp_entry += sizeof(hash);
+               }
+
+               /* then the key value */
+               LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+               key = iops->key(env, it);
+               memcpy(tmp_entry, key, ii->ii_keysize);
+               tmp_entry += ii->ii_keysize;
+
+               /* and finally the record */
+               rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+               if (rc != -ESTALE) {
+                       if (rc != 0)
+                               GOTO(out, rc);
+
+                       /* hash/key/record successfully copied! */
+                       lip->lip_nr++;
+                       if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+                               ii->ii_hash_start = hash;
+                       entry = tmp_entry + ii->ii_recsize;
+                       nob -= size;
+               }
+
+               /* move on to the next record */
+               do {
+                       rc = iops->next(env, it);
+               } while (rc == -ESTALE);
+
+       } while (rc == 0);
+
+       GOTO(out, rc);
+out:
+       if (rc >= 0 && lip->lip_nr > 0)
+               /* one more container */
+               ii->ii_count++;
+       if (rc > 0)
+               /* no more entries */
+               ii->ii_hash_end = II_END_OFF;
+       return rc;
+}
+
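+/*
+ * Layout of the lu_idxpage filled by dt_index_page_build() above, with
+ * fixed-size keys and records (the only case supported so far); the
+ * leading hash of each entry is omitted when II_FL_NOHASH is set:
+ *
+ *   | LIP_HDR_SIZE header | hash | key | rec | hash | key | rec | ...
+ *
+ * lip_nr counts the packed entries, while ii_hash_start/ii_hash_end
+ * delimit the hash range covered by the transfer.
+ */
+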
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ *              with key/record pairs in the format wanted by the caller
+ * \param arg    - is an opaque argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+                 const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+                 void *arg)
+{
+       struct dt_it            *it;
+       const struct dt_it_ops  *iops;
+       unsigned int             pageidx, nob, nlupgs = 0;
+       int                      rc;
+       ENTRY;
+
+       LASSERT(rdpg->rp_pages != NULL);
+       LASSERT(obj->do_index_ops != NULL);
+
+       nob = rdpg->rp_count;
+       if (nob == 0)
+               RETURN(-EFAULT);
+
+       /* Iterate through index and fill containers from @rdpg */
+       iops = &obj->do_index_ops->dio_it;
+       LASSERT(iops != NULL);
+       it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+       if (IS_ERR(it))
+               RETURN(PTR_ERR(it));
+
+       rc = iops->load(env, it, rdpg->rp_hash);
+       if (rc == 0) {
+               /*
+                * The iterator didn't find a record with exactly the key
+                * requested.
+                *
+                * It is currently either
+                *
+                *     - positioned above a record with a key less than
+                *     the requested one---skip it;
+                *     - or not positioned at all (in the IAM_IT_SKEWED
+                *     state)---position it on the next item.
+                */
+               rc = iops->next(env, it);
+       } else if (rc > 0) {
+               rc = 0;
+       }
+
+       /* Fill containers one after the other. There might be multiple
+        * containers per physical page.
+        *
+        * At this point and across for-loop:
+        *  rc == 0 -> ok, proceed.
+        *  rc >  0 -> end of index.
+        *  rc <  0 -> error. */
+       for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+               union lu_page   *lp;
+               int              i;
+
+               LASSERT(pageidx < rdpg->rp_npages);
+               lp = kmap(rdpg->rp_pages[pageidx]);
+
+               /* fill lu pages */
+               for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+                       rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+                                   iops, it, rdpg->rp_attrs, arg);
+                       if (rc < 0)
+                               break;
+                       /* one more lu_page */
+                       nlupgs++;
+                       if (rc > 0)
+                               /* end of index */
+                               break;
+               }
+               kunmap(rdpg->rp_pages[pageidx]);
+       }
+
+       iops->put(env, it);
+       iops->fini(env, it);
+
+       if (rc >= 0)
+               rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(dt_index_walk);
+
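+/*
+ * Usage note: dt_index_read() below is the caller in this file; it
+ * locates the index object, negotiates key/record sizes through the
+ * dt_index_features, and then invokes dt_index_walk() with
+ * dt_index_page_build() as the filler.
+ */
+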
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii  - is the idx_info structure packed by the client in the
+ *           OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+                 struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+       const struct dt_index_features  *feat;
+       struct dt_object                *obj;
+       int                              rc;
+       ENTRY;
+
+       /* rp_count shouldn't be zero and should be a multiple of the container
+        * size */
+       if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+               RETURN(-EFAULT);
+
+       if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+               /* we don't support directory transfer via OBD_IDX_READ for the
+                * time being */
+               RETURN(-EOPNOTSUPP);
+
+       if (!fid_is_quota(&ii->ii_fid))
+               /* block access to all local files except quota files */
+               RETURN(-EPERM);
+
+       /* lookup index object subject to the transfer */
+       obj = dt_locate(env, dev, &ii->ii_fid);
+       if (IS_ERR(obj))
+               RETURN(PTR_ERR(obj));
+       if (dt_object_exists(obj) == 0)
+               GOTO(out, rc = -ENOENT);
+
+       /* fetch index features associated with index object */
+       feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+                                   lu_object_attr(&obj->do_lu));
+       if (IS_ERR(feat))
+               GOTO(out, rc = PTR_ERR(feat));
+
+       /* load index feature if not done already */
+       if (obj->do_index_ops == NULL) {
+               rc = obj->do_ops->do_index_try(env, obj, feat);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       /* fill ii_flags with supported index features */
+       ii->ii_flags &= II_FL_NOHASH;
+
+       ii->ii_keysize = feat->dif_keysize_max;
+       if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+               /* key size is variable */
+               ii->ii_flags |= II_FL_VARKEY;
+               /* we don't support variable key size for the time being */
+               GOTO(out, rc = -EOPNOTSUPP);
+       }
+
+       ii->ii_recsize = feat->dif_recsize_max;
+       if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+               /* record size is variable */
+               ii->ii_flags |= II_FL_VARREC;
+               /* we don't support variable record size for the time being */
+               GOTO(out, rc = -EOPNOTSUPP);
+       }
+
+       if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+               /* key isn't necessarily unique */
+               ii->ii_flags |= II_FL_NONUNQ;
+
+       dt_read_lock(env, obj, 0);
+       /* fetch object version before walking the index */
+       ii->ii_version = dt_version_get(env, obj);
+
+       /* walk the index and fill lu_idxpages with key/record pairs */
+       rc = dt_index_walk(env, obj, rdpg, dt_index_page_build, ii);
+       dt_read_unlock(env, obj);
+
+       if (rc == 0) {
+               /* index is empty */
+               LASSERT(ii->ii_count == 0);
+               ii->ii_hash_end = II_END_OFF;
+       }
+
+       GOTO(out, rc);
+out:
+       lu_object_put(env, &obj->do_lu);
+       return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
+
+#ifdef LPROCFS
+
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+                         int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, "%u\n",
+                               (unsigned) osfs.os_bsize);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
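+/*
+ * Worked example for the shift loop above: with os_bsize = 4096,
+ * blk_size = 4096 >> 10 = 4, so the loop shifts result left twice and
+ * result becomes os_blocks * 4, i.e. the total size in kilobytes.
+ * Block sizes are powers of two, so the shifts stand in for a 64-bit
+ * division. The same conversion is reused by the kbytesfree and
+ * kbytesavail handlers below.
+ */
+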
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", result);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_files);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+                           int count, int *eof, void *data)
+{
+       struct dt_device *dt = data;
+       struct obd_statfs osfs;
+
+       int rc = dt_statfs(NULL, dt, &osfs);
+       if (rc == 0) {
+               *eof = 1;
+               rc = snprintf(page, count, LPU64"\n", osfs.os_ffree);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644 (file)
index 0000000..d96876e
--- /dev/null
@@ -0,0 +1,1853 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+extern struct list_head obd_types;
+spinlock_t obd_types_lock;
+
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+struct kmem_cache *import_cachep;
+
+struct list_head      obd_zombie_imports;
+struct list_head      obd_zombie_exports;
+spinlock_t  obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+                             const char *status, int locks);
+
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OSes
+ */
+static struct obd_device *obd_device_alloc(void)
+{
+       struct obd_device *obd;
+
+       OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, __GFP_IO);
+       if (obd != NULL)
+               obd->obd_magic = OBD_DEVICE_MAGIC;
+       return obd;
+}
+
+static void obd_device_free(struct obd_device *obd)
+{
+       LASSERT(obd != NULL);
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       if (obd->obd_namespace != NULL) {
+               CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+                      obd, obd->obd_namespace, obd->obd_force);
+               LBUG();
+       }
+       lu_ref_fini(&obd->obd_reference);
+       OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+struct obd_type *class_search_type(const char *name)
+{
+       struct list_head *tmp;
+       struct obd_type *type;
+
+       spin_lock(&obd_types_lock);
+       list_for_each(tmp, &obd_types) {
+               type = list_entry(tmp, struct obd_type, typ_chain);
+               if (strcmp(type->typ_name, name) == 0) {
+                       spin_unlock(&obd_types_lock);
+                       return type;
+               }
+       }
+       spin_unlock(&obd_types_lock);
+       return NULL;
+}
+EXPORT_SYMBOL(class_search_type);
+
+struct obd_type *class_get_type(const char *name)
+{
+       struct obd_type *type = class_search_type(name);
+
+       if (!type) {
+               const char *modname = name;
+
+               if (strcmp(modname, "obdfilter") == 0)
+                       modname = "ofd";
+
+               if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+                       modname = LUSTRE_OSP_NAME;
+
+               if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)))
+                       modname = LUSTRE_MDT_NAME;
+
+               if (!request_module("%s", modname)) {
+                       CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+                       type = class_search_type(name);
+               } else {
+                       LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+                                          modname);
+               }
+       }
+       if (type) {
+               spin_lock(&type->obd_type_lock);
+               type->typ_refcnt++;
+               try_module_get(type->typ_dt_ops->o_owner);
+               spin_unlock(&type->obd_type_lock);
+       }
+       return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+void class_put_type(struct obd_type *type)
+{
+       LASSERT(type);
+       spin_lock(&type->obd_type_lock);
+       type->typ_refcnt--;
+       module_put(type->typ_dt_ops->o_owner);
+       spin_unlock(&type->obd_type_lock);
+}
+EXPORT_SYMBOL(class_put_type);
+
+#define CLASS_MAX_NAME 1024
+
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+                       struct lprocfs_vars *vars, const char *name,
+                       struct lu_device_type *ldt)
+{
+       struct obd_type *type;
+       int rc = 0;
+       ENTRY;
+
+       /* sanity check */
+       LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+       if (class_search_type(name)) {
+               CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+               RETURN(-EEXIST);
+       }
+
+       rc = -ENOMEM;
+       OBD_ALLOC(type, sizeof(*type));
+       if (type == NULL)
+               RETURN(rc);
+
+       OBD_ALLOC_PTR(type->typ_dt_ops);
+       OBD_ALLOC_PTR(type->typ_md_ops);
+       OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+       if (type->typ_dt_ops == NULL ||
+           type->typ_md_ops == NULL ||
+           type->typ_name == NULL)
+               GOTO(failed, rc);
+
+       *(type->typ_dt_ops) = *dt_ops;
+       /* md_ops is optional */
+       if (md_ops)
+               *(type->typ_md_ops) = *md_ops;
+       strcpy(type->typ_name, name);
+       spin_lock_init(&type->obd_type_lock);
+
+#ifdef LPROCFS
+       type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+                                             vars, type);
+       if (IS_ERR(type->typ_procroot)) {
+               rc = PTR_ERR(type->typ_procroot);
+               type->typ_procroot = NULL;
+               GOTO(failed, rc);
+       }
+#endif
+       if (ldt != NULL) {
+               type->typ_lu = ldt;
+               rc = lu_device_type_init(ldt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       spin_lock(&obd_types_lock);
+       list_add(&type->typ_chain, &obd_types);
+       spin_unlock(&obd_types_lock);
+
+       RETURN(0);
+
+ failed:
+       if (type->typ_name != NULL)
+               OBD_FREE(type->typ_name, strlen(name) + 1);
+       if (type->typ_md_ops != NULL)
+               OBD_FREE_PTR(type->typ_md_ops);
+       if (type->typ_dt_ops != NULL)
+               OBD_FREE_PTR(type->typ_dt_ops);
+       OBD_FREE(type, sizeof(*type));
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_register_type);
+
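+/*
+ * Usage sketch (hypothetical "foo" driver; the foo_* names are invented
+ * for illustration, not part of this file): an OBD module typically
+ * registers its type from module init and tears it down on exit:
+ *
+ *     static int __init foo_init(void)
+ *     {
+ *             return class_register_type(&foo_obd_ops, NULL,
+ *                                        foo_lprocfs_vars, "foo", NULL);
+ *     }
+ *
+ *     static void __exit foo_exit(void)
+ *     {
+ *             class_unregister_type("foo");
+ *     }
+ */
+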
+int class_unregister_type(const char *name)
+{
+       struct obd_type *type = class_search_type(name);
+       ENTRY;
+
+       if (!type) {
+               CERROR("unknown obd type\n");
+               RETURN(-EINVAL);
+       }
+
+       if (type->typ_refcnt) {
+               CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+               /* This is a bad situation, let's make the best of it */
+               /* Remove ops, but leave the name for debugging */
+               OBD_FREE_PTR(type->typ_dt_ops);
+               OBD_FREE_PTR(type->typ_md_ops);
+               RETURN(-EBUSY);
+       }
+
+       if (type->typ_procroot)
+               lprocfs_remove(&type->typ_procroot);
+
+       if (type->typ_lu)
+               lu_device_type_fini(type->typ_lu);
+
+       spin_lock(&obd_types_lock);
+       list_del(&type->typ_chain);
+       spin_unlock(&obd_types_lock);
+       OBD_FREE(type->typ_name, strlen(name) + 1);
+       if (type->typ_dt_ops != NULL)
+               OBD_FREE_PTR(type->typ_dt_ops);
+       if (type->typ_md_ops != NULL)
+               OBD_FREE_PTR(type->typ_md_ops);
+       OBD_FREE(type, sizeof(*type));
+       RETURN(0);
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name      obd device name.
+ *
+ * \retval ERR_PTR(-errno) if create fails, otherwise return the obd
+ *      device pointer created.
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+       struct obd_device *result = NULL;
+       struct obd_device *newdev;
+       struct obd_type *type = NULL;
+       int i;
+       int new_obd_minor = 0;
+       ENTRY;
+
+       if (strlen(name) >= MAX_OBD_NAME) {
+               CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+               RETURN(ERR_PTR(-EINVAL));
+       }
+
+       type = class_get_type(type_name);
+       if (type == NULL) {
+               CERROR("OBD: unknown type: %s\n", type_name);
+               RETURN(ERR_PTR(-ENODEV));
+       }
+
+       newdev = obd_device_alloc();
+       if (newdev == NULL)
+               GOTO(out_type, result = ERR_PTR(-ENOMEM));
+
+       LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+       write_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && (strcmp(name, obd->obd_name) == 0)) {
+                       CERROR("Device %s already exists at %d, won't add\n",
+                              name, i);
+                       if (result) {
+                               LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+                                        "%p obd_magic %08x != %08x\n", result,
+                                        result->obd_magic, OBD_DEVICE_MAGIC);
+                               LASSERTF(result->obd_minor == new_obd_minor,
+                                        "%p obd_minor %d != %d\n", result,
+                                        result->obd_minor, new_obd_minor);
+
+                               obd_devs[result->obd_minor] = NULL;
+                               result->obd_name[0] = '\0';
+                       }
+                       result = ERR_PTR(-EEXIST);
+                       break;
+               }
+               if (!result && !obd) {
+                       result = newdev;
+                       result->obd_minor = i;
+                       new_obd_minor = i;
+                       result->obd_type = type;
+                       strncpy(result->obd_name, name,
+                               sizeof(result->obd_name) - 1);
+                       obd_devs[i] = result;
+               }
+       }
+       write_unlock(&obd_dev_lock);
+
+       if (result == NULL && i >= class_devno_max()) {
+               CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+                      class_devno_max());
+               GOTO(out, result = ERR_PTR(-EOVERFLOW));
+       }
+
+       if (IS_ERR(result))
+               GOTO(out, result);
+
+       CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+              result->obd_name, result);
+
+       RETURN(result);
+out:
+       obd_device_free(newdev);
+out_type:
+       class_put_type(type);
+       return result;
+}
+
+void class_release_dev(struct obd_device *obd)
+{
+       struct obd_type *obd_type = obd->obd_type;
+
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n",
+                obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+       LASSERT(obd_type != NULL);
+
+       CDEBUG(D_INFO, "Release obd device %s at %d obd_type name = %s\n",
+              obd->obd_name, obd->obd_minor, obd->obd_type->typ_name);
+
+       write_lock(&obd_dev_lock);
+       obd_devs[obd->obd_minor] = NULL;
+       write_unlock(&obd_dev_lock);
+       obd_device_free(obd);
+
+       class_put_type(obd_type);
+}
+
+int class_name2dev(const char *name)
+{
+       int i;
+
+       if (!name)
+               return -1;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && strcmp(name, obd->obd_name) == 0) {
+                       /* Make sure we finished attaching before we give
+                          out any references */
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       if (obd->obd_attached) {
+                               read_unlock(&obd_dev_lock);
+                               return i;
+                       }
+                       break;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return -1;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+struct obd_device *class_name2obd(const char *name)
+{
+       int dev = class_name2dev(name);
+
+       if (dev < 0 || dev >= class_devno_max())
+               return NULL;
+       return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) {
+                       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+                       read_unlock(&obd_dev_lock);
+                       return i;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return -1;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+       int dev = class_uuid2dev(uuid);
+       if (dev < 0)
+               return NULL;
+       return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if ::obd_devs[\a num] does not contain an obd device
+ *      otherwise return the obd device there.
+ */
+struct obd_device *class_num2obd(int num)
+{
+       struct obd_device *obd = NULL;
+
+       if (num < class_devno_max()) {
+               obd = obd_devs[num];
+               if (obd == NULL)
+                       return NULL;
+
+               LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                        "%p obd_magic %08x != %08x\n",
+                        obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+               LASSERTF(obd->obd_minor == num,
+                        "%p obd_minor %0d != %0d\n",
+                        obd, obd->obd_minor, num);
+       }
+
+       return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get the obd device count. Devices in any state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+       int index, max_index = class_devno_max(), dev_count = 0;
+
+       read_lock(&obd_dev_lock);
+       for (index = 0; index < max_index; index++) {
+               struct obd_device *obd = class_num2obd(index);
+               if (obd != NULL)
+                       dev_count++;
+       }
+       read_unlock(&obd_dev_lock);
+
+       return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+void class_obd_list(void)
+{
+       char *status;
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if (obd->obd_stopping)
+                       status = "ST";
+               else if (obd->obd_set_up)
+                       status = "UP";
+               else if (obd->obd_attached)
+                       status = "AT";
+               else
+                       status = "--";
+               LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+                        i, status, obd->obd_type->typ_name,
+                        obd->obd_name, obd->obd_uuid.uuid,
+                        atomic_read(&obd->obd_refcount));
+       }
+       read_unlock(&obd_dev_lock);
+       return;
+}
+
+/* Search for a client OBD connected to tgt_uuid.  If grp_uuid is
+   specified, then only the client with that uuid is returned,
+   otherwise any client connected to the tgt is returned. */
+struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid,
+                                         const char *typ_name,
+                                         struct obd_uuid *grp_uuid)
+{
+       int i;
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if ((strncmp(obd->obd_type->typ_name, typ_name,
+                            strlen(typ_name)) == 0)) {
+                       if (obd_uuid_equals(tgt_uuid,
+                                           &obd->u.cli.cl_target_uuid) &&
+                           (grp_uuid ? obd_uuid_equals(grp_uuid,
+                                                       &obd->obd_uuid) : 1)) {
+                               read_unlock(&obd_dev_lock);
+                               return obd;
+                       }
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices that have grp_uuid. Start
+   searching at *next, and if a device is found, the next index to look
+   at is saved in *next. If next is NULL, then the first matching device
+   will always be returned. */
+struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+       int i;
+
+       if (next == NULL)
+               i = 0;
+       else if (*next >= 0 && *next < class_devno_max())
+               i = *next;
+       else
+               return NULL;
+
+       read_lock(&obd_dev_lock);
+       for (; i < class_devno_max(); i++) {
+               struct obd_device *obd = class_num2obd(i);
+
+               if (obd == NULL)
+                       continue;
+               if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) {
+                       if (next != NULL)
+                               *next = i + 1;
+                       read_unlock(&obd_dev_lock);
+                       return obd;
+               }
+       }
+       read_unlock(&obd_dev_lock);
+
+       return NULL;
+}
+EXPORT_SYMBOL(class_devices_in_group);
+
+/**
+ * Notify every relevant OBD that the sptlrpc log for \a fsname has
+ * changed, so that it can adjust its sptlrpc settings accordingly.
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+       struct obd_device  *obd;
+       const char       *type;
+       int              i, rc = 0, rc2;
+
+       LASSERT(namelen > 0);
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               obd = class_num2obd(i);
+
+               if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+                       continue;
+
+               /* only notify mdc, osc, mdt, ost */
+               type = obd->obd_type->typ_name;
+               if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+                   strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+                   strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+                   strcmp(type, LUSTRE_OST_NAME) != 0)
+                       continue;
+
+               if (strncmp(obd->obd_name, fsname, namelen))
+                       continue;
+
+               class_incref(obd, __func__, obd);
+               read_unlock(&obd_dev_lock);
+               rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+                                        sizeof(KEY_SPTLRPC_CONF),
+                                        KEY_SPTLRPC_CONF, 0, NULL, NULL);
+               rc = rc ? rc : rc2;
+               class_decref(obd, __func__, obd);
+               read_lock(&obd_dev_lock);
+       }
+       read_unlock(&obd_dev_lock);
+       return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+void obd_cleanup_caches(void)
+{
+       ENTRY;
+       if (obd_device_cachep) {
+               kmem_cache_destroy(obd_device_cachep);
+               obd_device_cachep = NULL;
+       }
+       if (obdo_cachep) {
+               kmem_cache_destroy(obdo_cachep);
+               obdo_cachep = NULL;
+       }
+       if (import_cachep) {
+               kmem_cache_destroy(import_cachep);
+               import_cachep = NULL;
+       }
+       if (capa_cachep) {
+               kmem_cache_destroy(capa_cachep);
+               capa_cachep = NULL;
+       }
+       EXIT;
+}
+
+int obd_init_caches(void)
+{
+       ENTRY;
+
+       LASSERT(obd_device_cachep == NULL);
+       obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+                                                sizeof(struct obd_device),
+                                                0, 0, NULL);
+       if (!obd_device_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(obdo_cachep == NULL);
+       obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo),
+                                          0, 0, NULL);
+       if (!obdo_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(import_cachep == NULL);
+       import_cachep = kmem_cache_create("ll_import_cache",
+                                            sizeof(struct obd_import),
+                                            0, 0, NULL);
+       if (!import_cachep)
+               GOTO(out, -ENOMEM);
+
+       LASSERT(capa_cachep == NULL);
+       capa_cachep = kmem_cache_create("capa_cache",
+                                          sizeof(struct obd_capa), 0, 0, NULL);
+       if (!capa_cachep)
+               GOTO(out, -ENOMEM);
+
+       RETURN(0);
+ out:
+       obd_cleanup_caches();
+       RETURN(-ENOMEM);
+
+}
+
+/* map connection to client */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+       struct obd_export *export;
+       ENTRY;
+
+       if (!conn) {
+               CDEBUG(D_CACHE, "looking for null handle\n");
+               RETURN(NULL);
+       }
+
+       if (conn->cookie == -1) {  /* this means assign a new connection */
+               CDEBUG(D_CACHE, "want a new connection\n");
+               RETURN(NULL);
+       }
+
+       CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie);
+       export = class_handle2object(conn->cookie);
+       RETURN(export);
+}
+EXPORT_SYMBOL(class_conn2export);
+
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+       if (exp)
+               return exp->exp_obd;
+       return NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+       struct obd_export *export;
+       export = class_conn2export(conn);
+       if (export) {
+               struct obd_device *obd = export->exp_obd;
+               class_export_put(export);
+               return obd;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       if (obd == NULL)
+               return NULL;
+       return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+       struct obd_device *obd = class_conn2obd(conn);
+       if (obd == NULL)
+               return NULL;
+       return obd->u.cli.cl_import;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+static void class_export_destroy(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+       ENTRY;
+
+       LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+       LASSERT(obd != NULL);
+
+       CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+              exp->exp_client_uuid.uuid, obd->obd_name);
+
+       /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+       if (exp->exp_connection)
+               ptlrpc_put_connection_superhack(exp->exp_connection);
+
+       LASSERT(list_empty(&exp->exp_outstanding_replies));
+       LASSERT(list_empty(&exp->exp_uncommitted_replies));
+       LASSERT(list_empty(&exp->exp_req_replay_queue));
+       LASSERT(list_empty(&exp->exp_hp_rpcs));
+       obd_destroy_export(exp);
+       class_decref(obd, "export", exp);
+
+       OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+       EXIT;
+}
+
+static void export_handle_addref(void *export)
+{
+       class_export_get(export);
+}
+
+static struct portals_handle_ops export_handle_ops = {
+       .hop_addref = export_handle_addref,
+       .hop_free   = NULL,
+};
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+       atomic_inc(&exp->exp_refcount);
+       CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp,
+              atomic_read(&exp->exp_refcount));
+       return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+void class_export_put(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+       LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+       CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+              atomic_read(&exp->exp_refcount) - 1);
+
+       if (atomic_dec_and_test(&exp->exp_refcount)) {
+               LASSERT(!list_empty(&exp->exp_obd_chain));
+               CDEBUG(D_IOCTL, "final put %p/%s\n",
+                      exp, exp->exp_client_uuid.uuid);
+
+               /* release nid stat reference */
+               lprocfs_exp_cleanup(exp);
+
+               obd_zombie_export_add(exp);
+       }
+}
+EXPORT_SYMBOL(class_export_put);
+
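+/*
+ * Note on the final put above: the export is not freed synchronously.
+ * It is queued through obd_zombie_export_add() for deferred cleanup;
+ * class_export_destroy() (above) performs the actual teardown once the
+ * refcount has dropped to zero.
+ */
+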
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function. */
+struct obd_export *class_new_export(struct obd_device *obd,
+                                   struct obd_uuid *cluuid)
+{
+       struct obd_export *export;
+       cfs_hash_t *hash = NULL;
+       int rc = 0;
+       ENTRY;
+
+       OBD_ALLOC_PTR(export);
+       if (!export)
+               return ERR_PTR(-ENOMEM);
+
+       export->exp_conn_cnt = 0;
+       export->exp_lock_hash = NULL;
+       export->exp_flock_hash = NULL;
+       atomic_set(&export->exp_refcount, 2);
+       atomic_set(&export->exp_rpc_count, 0);
+       atomic_set(&export->exp_cb_count, 0);
+       atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       INIT_LIST_HEAD(&export->exp_locks_list);
+       spin_lock_init(&export->exp_locks_list_guard);
+#endif
+       atomic_set(&export->exp_replay_count, 0);
+       export->exp_obd = obd;
+       INIT_LIST_HEAD(&export->exp_outstanding_replies);
+       spin_lock_init(&export->exp_uncommitted_replies_lock);
+       INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+       INIT_LIST_HEAD(&export->exp_req_replay_queue);
+       INIT_LIST_HEAD(&export->exp_handle.h_link);
+       INIT_LIST_HEAD(&export->exp_hp_rpcs);
+       class_handle_hash(&export->exp_handle, &export_handle_ops);
+       export->exp_last_request_time = cfs_time_current_sec();
+       spin_lock_init(&export->exp_lock);
+       spin_lock_init(&export->exp_rpc_lock);
+       INIT_HLIST_NODE(&export->exp_uuid_hash);
+       INIT_HLIST_NODE(&export->exp_nid_hash);
+       spin_lock_init(&export->exp_bl_list_lock);
+       INIT_LIST_HEAD(&export->exp_bl_list);
+
+       export->exp_sp_peer = LUSTRE_SP_ANY;
+       export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+       export->exp_client_uuid = *cluuid;
+       obd_init_export(export);
+
+       spin_lock(&obd->obd_dev_lock);
+       /* shouldn't happen, but might race */
+       if (obd->obd_stopping)
+               GOTO(exit_unlock, rc = -ENODEV);
+
+       hash = cfs_hash_getref(obd->obd_uuid_hash);
+       if (hash == NULL)
+               GOTO(exit_unlock, rc = -ENODEV);
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+               rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+               if (rc != 0) {
+                       LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+                                     obd->obd_name, cluuid->uuid, rc);
+                       GOTO(exit_err, rc = -EALREADY);
+               }
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+               GOTO(exit_unlock, rc = -ENODEV);
+       }
+
+       class_incref(obd, "export", export);
+       list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+       list_add_tail(&export->exp_obd_chain_timed,
+                         &export->exp_obd->obd_exports_timed);
+       export->exp_obd->obd_num_exports++;
+       spin_unlock(&obd->obd_dev_lock);
+       cfs_hash_putref(hash);
+       RETURN(export);
+
+exit_unlock:
+       spin_unlock(&obd->obd_dev_lock);
+exit_err:
+       if (hash)
+               cfs_hash_putref(hash);
+       class_handle_unhash(&export->exp_handle);
+       LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+       obd_destroy_export(export);
+       OBD_FREE_PTR(export);
+       return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+void class_unlink_export(struct obd_export *exp)
+{
+       class_handle_unhash(&exp->exp_handle);
+
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+       /* delete the uuid-export hash item from the hash table */
+       if (!hlist_unhashed(&exp->exp_uuid_hash))
+               cfs_hash_del(exp->exp_obd->obd_uuid_hash,
+                            &exp->exp_client_uuid,
+                            &exp->exp_uuid_hash);
+
+       list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
+       list_del_init(&exp->exp_obd_chain_timed);
+       exp->exp_obd->obd_num_exports--;
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+       class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+void class_import_destroy(struct obd_import *imp)
+{
+       ENTRY;
+
+       CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+               imp->imp_obd->obd_name);
+
+       LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+       ptlrpc_put_connection_superhack(imp->imp_connection);
+
+       while (!list_empty(&imp->imp_conn_list)) {
+               struct obd_import_conn *imp_conn;
+
+               imp_conn = list_entry(imp->imp_conn_list.next,
+                                         struct obd_import_conn, oic_item);
+               list_del_init(&imp_conn->oic_item);
+               ptlrpc_put_connection_superhack(imp_conn->oic_conn);
+               OBD_FREE(imp_conn, sizeof(*imp_conn));
+       }
+
+       LASSERT(imp->imp_sec == NULL);
+       class_decref(imp->imp_obd, "import", imp);
+       OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+       EXIT;
+}
+
+static void import_handle_addref(void *import)
+{
+       class_import_get(import);
+}
+
+static struct portals_handle_ops import_handle_ops = {
+       .hop_addref = import_handle_addref,
+       .hop_free   = NULL,
+};
+
+struct obd_import *class_import_get(struct obd_import *import)
+{
+       atomic_inc(&import->imp_refcount);
+       CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import,
+              atomic_read(&import->imp_refcount),
+              import->imp_obd->obd_name);
+       return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+void class_import_put(struct obd_import *imp)
+{
+       ENTRY;
+
+       LASSERT(list_empty(&imp->imp_zombie_chain));
+       LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+       CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+              atomic_read(&imp->imp_refcount) - 1,
+              imp->imp_obd->obd_name);
+
+       if (atomic_dec_and_test(&imp->imp_refcount)) {
+               CDEBUG(D_INFO, "final put import %p\n", imp);
+               obd_zombie_import_add(imp);
+       }
+
+       /* catch possible import put race */
+       LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
+       EXIT;
+}
+EXPORT_SYMBOL(class_import_put);
+
+static void init_imp_at(struct imp_at *at)
+{
+       int i;
+       at_init(&at->iat_net_latency, 0, 0);
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               /* max service estimates are tracked on the server side, so
+                  don't use the AT history here, just use the last reported
+                  value. (But keep the history for the proc histogram,
+                  worst_ever) */
+               at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT,
+                       AT_FLG_NOHIST);
+       }
+}
+
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+       struct obd_import *imp;
+
+       OBD_ALLOC(imp, sizeof(*imp));
+       if (imp == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&imp->imp_pinger_chain);
+       INIT_LIST_HEAD(&imp->imp_zombie_chain);
+       INIT_LIST_HEAD(&imp->imp_replay_list);
+       INIT_LIST_HEAD(&imp->imp_sending_list);
+       INIT_LIST_HEAD(&imp->imp_delayed_list);
+       spin_lock_init(&imp->imp_lock);
+       imp->imp_last_success_conn = 0;
+       imp->imp_state = LUSTRE_IMP_NEW;
+       imp->imp_obd = class_incref(obd, "import", imp);
+       mutex_init(&imp->imp_sec_mutex);
+       init_waitqueue_head(&imp->imp_recovery_waitq);
+
+       atomic_set(&imp->imp_refcount, 2);
+       atomic_set(&imp->imp_unregistering, 0);
+       atomic_set(&imp->imp_inflight, 0);
+       atomic_set(&imp->imp_replay_inflight, 0);
+       atomic_set(&imp->imp_inval_count, 0);
+       INIT_LIST_HEAD(&imp->imp_conn_list);
+       INIT_LIST_HEAD(&imp->imp_handle.h_link);
+       class_handle_hash(&imp->imp_handle, &import_handle_ops);
+       init_imp_at(&imp->imp_at);
+
+       /* the default magic is V2, will be used in connect RPC, and
+        * then adjusted according to the flags in request/reply. */
+       imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+       return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+void class_destroy_import(struct obd_import *import)
+{
+       LASSERT(import != NULL);
+       LASSERT(import != LP_POISON);
+
+       class_handle_unhash(&import->imp_handle);
+
+       spin_lock(&import->imp_lock);
+       import->imp_generation++;
+       spin_unlock(&import->imp_lock);
+       class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+
+       LASSERT(lock->l_exp_refs_nr >= 0);
+
+       if (lock->l_exp_refs_target != NULL &&
+           lock->l_exp_refs_target != exp) {
+               LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+                             exp, lock, lock->l_exp_refs_target);
+       }
+       if ((lock->l_exp_refs_nr++) == 0) {
+               list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+               lock->l_exp_refs_target = exp;
+       }
+       CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+              lock, exp, lock->l_exp_refs_nr);
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+       spin_lock(&exp->exp_locks_list_guard);
+       LASSERT(lock->l_exp_refs_nr > 0);
+       if (lock->l_exp_refs_target != exp) {
+               LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n",
+                             lock, lock->l_exp_refs_target, exp);
+       }
+       if (--lock->l_exp_refs_nr == 0) {
+               list_del_init(&lock->l_exp_refs_link);
+               lock->l_exp_refs_target = NULL;
+       }
+       CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+              lock, exp, lock->l_exp_refs_nr);
+       spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/* A connection defines an export context in which preallocation can
+   be managed. This releases the export pointer reference, and returns
+   the export handle, so the export refcount is 1 when this function
+   returns. */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+                 struct obd_uuid *cluuid)
+{
+       struct obd_export *export;
+       LASSERT(conn != NULL);
+       LASSERT(obd != NULL);
+       LASSERT(cluuid != NULL);
+       ENTRY;
+
+       export = class_new_export(obd, cluuid);
+       if (IS_ERR(export))
+               RETURN(PTR_ERR(export));
+
+       conn->cookie = export->exp_handle.h_cookie;
+       class_export_put(export);
+
+       CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+              cluuid->uuid, conn->cookie);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_connect);
+
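+/*
+ * Expected calling pattern (a sketch based on the refcounting above):
+ * a connect handler calls class_connect() to fill @conn with a handle
+ * cookie; the handle is later resolved back with class_conn2export(),
+ * which returns a new reference that the caller must drop with
+ * class_export_put().
+ */
+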
+/* if export is involved in recovery then clean up related things */
+void class_export_recovery_cleanup(struct obd_export *exp)
+{
+       struct obd_device *obd = exp->exp_obd;
+
+       spin_lock(&obd->obd_recovery_task_lock);
+       if (exp->exp_delayed)
+               obd->obd_delayed_clients--;
+       if (obd->obd_recovering) {
+               if (exp->exp_in_recovery) {
+                       spin_lock(&exp->exp_lock);
+                       exp->exp_in_recovery = 0;
+                       spin_unlock(&exp->exp_lock);
+                       LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+                       atomic_dec(&obd->obd_connected_clients);
+               }
+
+               /* if called during recovery then we should update the
+                * obd_stale_clients counter; lightweight exports are not
+                * counted */
+               if (exp->exp_failed &&
+                   (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+                       exp->exp_obd->obd_stale_clients++;
+       }
+       spin_unlock(&obd->obd_recovery_task_lock);
+       /** Cleanup req replay fields */
+       if (exp->exp_req_replay_needed) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_req_replay_needed = 0;
+               spin_unlock(&exp->exp_lock);
+               LASSERT(atomic_read(&obd->obd_req_replay_clients));
+               atomic_dec(&obd->obd_req_replay_clients);
+       }
+       /** Cleanup lock replay data */
+       if (exp->exp_lock_replay_needed) {
+               spin_lock(&exp->exp_lock);
+               exp->exp_lock_replay_needed = 0;
+               spin_unlock(&exp->exp_lock);
+               LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+               atomic_dec(&obd->obd_lock_replay_clients);
+       }
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in,
+ * and, if the disconnect is really needed:
+ * 2 - for the removal from the hash table,
+ * 3 - in class_unlink_export().
+ * The export pointer passed to this function can be destroyed. */
+int class_disconnect(struct obd_export *export)
+{
+       int already_disconnected;
+       ENTRY;
+
+       if (export == NULL) {
+               CWARN("attempting to free NULL export %p\n", export);
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&export->exp_lock);
+       already_disconnected = export->exp_disconnected;
+       export->exp_disconnected = 1;
+       spin_unlock(&export->exp_lock);
+
+       /* class_cleanup(), abort_recovery(), and class_fail_export()
+        * all end up in here, and if any of them race we shouldn't
+        * call extra class_export_puts(). */
+       if (already_disconnected) {
+               LASSERT(hlist_unhashed(&export->exp_nid_hash));
+               GOTO(no_disconn, already_disconnected);
+       }
+
+       CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
+              export->exp_handle.h_cookie);
+
+       if (!hlist_unhashed(&export->exp_nid_hash))
+               cfs_hash_del(export->exp_obd->obd_nid_hash,
+                            &export->exp_connection->c_peer.nid,
+                            &export->exp_nid_hash);
+
+       class_export_recovery_cleanup(export);
+       class_unlink_export(export);
+no_disconn:
+       class_export_put(export);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/* Return non-zero for a fully connected export */
+int class_connected_export(struct obd_export *exp)
+{
+       if (exp) {
+               int connected;
+               spin_lock(&exp->exp_lock);
+               connected = (exp->exp_conn_cnt > 0);
+               spin_unlock(&exp->exp_lock);
+               return connected;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+static void class_disconnect_export_list(struct list_head *list,
+                                        enum obd_option flags)
+{
+       int rc;
+       struct obd_export *exp;
+       ENTRY;
+
+       /* It's possible that an export may disconnect itself, but
+        * nothing else will be added to this list. */
+       while (!list_empty(list)) {
+               exp = list_entry(list->next, struct obd_export,
+                                    exp_obd_chain);
+               /* take a reference so CDEBUG remains safe after obd_disconnect */
+               class_export_get(exp);
+
+               spin_lock(&exp->exp_lock);
+               exp->exp_flags = flags;
+               spin_unlock(&exp->exp_lock);
+
+               if (obd_uuid_equals(&exp->exp_client_uuid,
+                                   &exp->exp_obd->obd_uuid)) {
+                       CDEBUG(D_HA,
+                              "exp %p export uuid == obd uuid, don't discon\n",
+                              exp);
+                       /* Need to delete this now so we don't end up pointing
+                        * to work_list later when this export is cleaned up. */
+                       list_del_init(&exp->exp_obd_chain);
+                       class_export_put(exp);
+                       continue;
+               }
+
+               class_export_get(exp);
+               CDEBUG(D_HA, "%s: disconnecting export at %s (%p), last request at "CFS_TIME_T"\n",
+                      exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                      exp, exp->exp_last_request_time);
+               /* release one export reference anyway */
+               rc = obd_disconnect(exp);
+
+               CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+                      obd_export_nid2str(exp), exp, rc);
+               class_export_put(exp);
+       }
+       EXIT;
+}
+
+void class_disconnect_exports(struct obd_device *obd)
+{
+       struct list_head work_list;
+       ENTRY;
+
+       /* Move all of the exports from obd_exports to a work list, en masse. */
+       INIT_LIST_HEAD(&work_list);
+       spin_lock(&obd->obd_dev_lock);
+       list_splice_init(&obd->obd_exports, &work_list);
+       list_splice_init(&obd->obd_delayed_exports, &work_list);
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (!list_empty(&work_list)) {
+               CDEBUG(D_HA, "OBD device %d (%p) has exports, disconnecting them\n",
+                      obd->obd_minor, obd);
+               class_disconnect_export_list(&work_list,
+                                            exp_flags_from_obd(obd));
+       } else {
+               CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+                      obd->obd_minor, obd);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Remove exports that have not completed recovery.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd,
+                                   int (*test_export)(struct obd_export *))
+{
+       struct list_head work_list;
+       struct obd_export *exp, *n;
+       int evicted = 0;
+       ENTRY;
+
+       INIT_LIST_HEAD(&work_list);
+       spin_lock(&obd->obd_dev_lock);
+       list_for_each_entry_safe(exp, n, &obd->obd_exports,
+                                    exp_obd_chain) {
+               /* don't count self-export as client */
+               if (obd_uuid_equals(&exp->exp_client_uuid,
+                                   &exp->exp_obd->obd_uuid))
+                       continue;
+
+               /* don't evict clients which have no slot in last_rcvd
+                * (e.g. lightweight connection) */
+               if (exp->exp_target_data.ted_lr_idx == -1)
+                       continue;
+
+               spin_lock(&exp->exp_lock);
+               if (exp->exp_failed || test_export(exp)) {
+                       spin_unlock(&exp->exp_lock);
+                       continue;
+               }
+               exp->exp_failed = 1;
+               spin_unlock(&exp->exp_lock);
+
+               list_move(&exp->exp_obd_chain, &work_list);
+               evicted++;
+               CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+                      obd->obd_name, exp->exp_client_uuid.uuid,
+                      exp->exp_connection == NULL ? "<unknown>" :
+                      libcfs_nid2str(exp->exp_connection->c_peer.nid));
+               print_export_data(exp, "EVICTING", 0);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (evicted)
+               LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+                             obd->obd_name, evicted);
+
+       class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+                                                OBD_OPT_ABORT_RECOV);
+       EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+void class_fail_export(struct obd_export *exp)
+{
+       int rc, already_failed;
+
+       spin_lock(&exp->exp_lock);
+       already_failed = exp->exp_failed;
+       exp->exp_failed = 1;
+       spin_unlock(&exp->exp_lock);
+
+       if (already_failed) {
+               CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+                      exp, exp->exp_client_uuid.uuid);
+               return;
+       }
+
+       CDEBUG(D_HA, "disconnecting export %p/%s\n",
+              exp, exp->exp_client_uuid.uuid);
+
+       if (obd_dump_on_timeout)
+               libcfs_debug_dumplog();
+
+       /* take a reference so CDEBUG is safe after obd_disconnect */
+       class_export_get(exp);
+
+       /* Most callers into obd_disconnect are removing their own reference
+        * (request, for example) in addition to the one from the hash table.
+        * We don't have such a reference here, so make one. */
+       class_export_get(exp);
+       rc = obd_disconnect(exp);
+       if (rc)
+               CERROR("disconnecting export %p failed: %d\n", exp, rc);
+       else
+               CDEBUG(D_HA, "disconnected export %p/%s\n",
+                      exp, exp->exp_client_uuid.uuid);
+       class_export_put(exp);
+}
+EXPORT_SYMBOL(class_fail_export);
+
+char *obd_export_nid2str(struct obd_export *exp)
+{
+       if (exp->exp_connection != NULL)
+               return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+
+       return "(no nid)";
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+       cfs_hash_t *nid_hash;
+       struct obd_export *doomed_exp = NULL;
+       int exports_evicted = 0;
+
+       lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+       spin_lock(&obd->obd_dev_lock);
+       /* umount has already run, so the evict thread should leave
+        * this work to the umount thread now */
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               return exports_evicted;
+       }
+       nid_hash = obd->obd_nid_hash;
+       cfs_hash_getref(nid_hash);
+       spin_unlock(&obd->obd_dev_lock);
+
+       do {
+               doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+               if (doomed_exp == NULL)
+                       break;
+
+               LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+                        "nid %s found, wanted nid %s, requested nid %s\n",
+                        obd_export_nid2str(doomed_exp),
+                        libcfs_nid2str(nid_key), nid);
+               LASSERTF(doomed_exp != obd->obd_self_export,
+                        "self-export is hashed by NID?\n");
+               exports_evicted++;
+               LCONSOLE_WARN("%s: evicting %s (at %s) by administrative "
+                             "request\n", obd->obd_name,
+                             obd_uuid2str(&doomed_exp->exp_client_uuid),
+                             obd_export_nid2str(doomed_exp));
+               class_fail_export(doomed_exp);
+               class_export_put(doomed_exp);
+       } while (1);
+
+       cfs_hash_putref(nid_hash);
+
+       if (!exports_evicted)
+               CDEBUG(D_HA, "%s: can't disconnect NID '%s': no exports found\n",
+                      obd->obd_name, nid);
+       return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+       cfs_hash_t *uuid_hash;
+       struct obd_export *doomed_exp = NULL;
+       struct obd_uuid doomed_uuid;
+       int exports_evicted = 0;
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               return exports_evicted;
+       }
+       uuid_hash = obd->obd_uuid_hash;
+       cfs_hash_getref(uuid_hash);
+       spin_unlock(&obd->obd_dev_lock);
+
+       obd_str2uuid(&doomed_uuid, uuid);
+       if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+               CERROR("%s: can't evict myself\n", obd->obd_name);
+               cfs_hash_putref(uuid_hash);
+               return exports_evicted;
+       }
+
+       doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+       if (doomed_exp == NULL) {
+               CERROR("%s: can't disconnect %s: no exports found\n",
+                      obd->obd_name, uuid);
+       } else {
+               CWARN("%s: evicting %s at administrative request\n",
+                      obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+               class_fail_export(doomed_exp);
+               class_export_put(doomed_exp);
+               exports_evicted++;
+       }
+       cfs_hash_putref(uuid_hash);
+
+       return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
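+
+/*
+ * Illustrative note (a sketch, not part of the code above): both evict
+ * helpers are normally reached through a server target's "evict_client"
+ * interface, e.g.
+ *
+ *     lctl set_param obdfilter.<target>.evict_client=nid:<nid>
+ *
+ * The exact parameter path varies by target type and is an assumption here.
+ */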
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+static void print_export_data(struct obd_export *exp, const char *status,
+                             int locks)
+{
+       struct ptlrpc_reply_state *rs;
+       struct ptlrpc_reply_state *first_reply = NULL;
+       int nreplies = 0;
+
+       spin_lock(&exp->exp_lock);
+       list_for_each_entry(rs, &exp->exp_outstanding_replies,
+                               rs_exp_list) {
+               if (nreplies == 0)
+                       first_reply = rs;
+               nreplies++;
+       }
+       spin_unlock(&exp->exp_lock);
+
+       CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+              exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+              obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+              atomic_read(&exp->exp_rpc_count),
+              atomic_read(&exp->exp_cb_count),
+              atomic_read(&exp->exp_locks_count),
+              exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+              nreplies, first_reply, nreplies > 3 ? "..." : "",
+              exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+       if (locks && class_export_dump_hook != NULL)
+               class_export_dump_hook(exp);
+#endif
+}
+
+void dump_exports(struct obd_device *obd, int locks)
+{
+       struct obd_export *exp;
+
+       spin_lock(&obd->obd_dev_lock);
+       list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+               print_export_data(exp, "ACTIVE", locks);
+       list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+               print_export_data(exp, "UNLINKED", locks);
+       list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+               print_export_data(exp, "DELAYED", locks);
+       spin_unlock(&obd->obd_dev_lock);
+       spin_lock(&obd_zombie_impexp_lock);
+       list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+               print_export_data(exp, "ZOMBIE", locks);
+       spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+       int waited = 2;
+       LASSERT(list_empty(&obd->obd_exports));
+       spin_lock(&obd->obd_dev_lock);
+       while (!list_empty(&obd->obd_unlinked_exports)) {
+               spin_unlock(&obd->obd_dev_lock);
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(waited));
+               if (waited > 5 && IS_PO2(waited)) {
+                       LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+                                     "more than %d seconds. "
+                                     "The obd refcount = %d. Is it stuck?\n",
+                                     obd->obd_name, waited,
+                                     atomic_read(&obd->obd_refcount));
+                       dump_exports(obd, 1);
+               }
+               waited *= 2;
+               spin_lock(&obd->obd_dev_lock);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Total number of zombies to be destroyed */
+static int zombies_count = 0;
+
+/**
+ * kill zombie imports and exports
+ */
+void obd_zombie_impexp_cull(void)
+{
+       struct obd_import *import;
+       struct obd_export *export;
+       ENTRY;
+
+       do {
+               spin_lock(&obd_zombie_impexp_lock);
+
+               import = NULL;
+               if (!list_empty(&obd_zombie_imports)) {
+                       import = list_entry(obd_zombie_imports.next,
+                                               struct obd_import,
+                                               imp_zombie_chain);
+                       list_del_init(&import->imp_zombie_chain);
+               }
+
+               export = NULL;
+               if (!list_empty(&obd_zombie_exports)) {
+                       export = list_entry(obd_zombie_exports.next,
+                                               struct obd_export,
+                                               exp_obd_chain);
+                       list_del_init(&export->exp_obd_chain);
+               }
+
+               spin_unlock(&obd_zombie_impexp_lock);
+
+               if (import != NULL) {
+                       class_import_destroy(import);
+                       spin_lock(&obd_zombie_impexp_lock);
+                       zombies_count--;
+                       spin_unlock(&obd_zombie_impexp_lock);
+               }
+
+               if (export != NULL) {
+                       class_export_destroy(export);
+                       spin_lock(&obd_zombie_impexp_lock);
+                       zombies_count--;
+                       spin_unlock(&obd_zombie_impexp_lock);
+               }
+
+               cond_resched();
+       } while (import != NULL || export != NULL);
+       EXIT;
+}
+
+static struct completion       obd_zombie_start;
+static struct completion       obd_zombie_stop;
+static unsigned long           obd_zombie_flags;
+static wait_queue_head_t               obd_zombie_waitq;
+static pid_t                   obd_zombie_pid;
+
+enum {
+       OBD_ZOMBIE_STOP         = 0x0001,
+};
+
+/**
+ * Check whether there is work for the zombie import/export destroy thread.
+ */
+static int obd_zombie_impexp_check(void *arg)
+{
+       int rc;
+
+       spin_lock(&obd_zombie_impexp_lock);
+       rc = (zombies_count == 0) &&
+            !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Hand the export over to the obd_zombie thread and notify it.
+ */
+static void obd_zombie_export_add(struct obd_export *exp)
+{
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+       LASSERT(!list_empty(&exp->exp_obd_chain));
+       list_del_init(&exp->exp_obd_chain);
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+       spin_lock(&obd_zombie_impexp_lock);
+       zombies_count++;
+       list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       obd_zombie_impexp_notify();
+}
+
+/**
+ * Hand the import over to the obd_zombie thread and notify it.
+ */
+static void obd_zombie_import_add(struct obd_import *imp)
+{
+       LASSERT(imp->imp_sec == NULL);
+       LASSERT(imp->imp_rq_pool == NULL);
+       spin_lock(&obd_zombie_impexp_lock);
+       LASSERT(list_empty(&imp->imp_zombie_chain));
+       zombies_count++;
+       list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+       spin_unlock(&obd_zombie_impexp_lock);
+
+       obd_zombie_impexp_notify();
+}
+
+/**
+ * notify import/export destroy thread about new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+       /*
+        * Make sure obd_zombie_impexp_thread gets this notification.
+        * Otherwise the wakeup may be consumed by obd_zombie_barrier alone,
+        * which swallows the notification while the thread stays asleep,
+        * and a hang ensues.
+        */
+       wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * Check whether the obd_zombie queues are idle.
+ */
+static int obd_zombie_is_idle(void)
+{
+       int rc;
+
+       LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+       spin_lock(&obd_zombie_impexp_lock);
+       rc = (zombies_count == 0);
+       spin_unlock(&obd_zombie_impexp_lock);
+       return rc;
+}
+
+/**
+ * Wait until the obd_zombie import/export queues are empty.
+ */
+void obd_zombie_barrier(void)
+{
+       struct l_wait_info lwi = { 0 };
+
+       if (obd_zombie_pid == current_pid())
+               /* don't wait for myself */
+               return;
+       l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+
+/**
+ * Thread that destroys zombie imports and exports.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+       unshare_fs_struct();
+       complete(&obd_zombie_start);
+
+       obd_zombie_pid = current_pid();
+
+       while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+               struct l_wait_info lwi = { 0 };
+
+               l_wait_event(obd_zombie_waitq,
+                            !obd_zombie_impexp_check(NULL), &lwi);
+               obd_zombie_impexp_cull();
+
+               /*
+                * Notify obd_zombie_barrier callers that queues
+                * may be empty.
+                */
+               wake_up(&obd_zombie_waitq);
+       }
+
+       complete(&obd_zombie_stop);
+
+       RETURN(0);
+}
+
+
+/**
+ * Start the zombie import/export destroy thread.
+ */
+int obd_zombie_impexp_init(void)
+{
+       task_t *task;
+
+       INIT_LIST_HEAD(&obd_zombie_imports);
+       INIT_LIST_HEAD(&obd_zombie_exports);
+       spin_lock_init(&obd_zombie_impexp_lock);
+       init_completion(&obd_zombie_start);
+       init_completion(&obd_zombie_stop);
+       init_waitqueue_head(&obd_zombie_waitq);
+       obd_zombie_pid = 0;
+
+       task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+       if (IS_ERR(task))
+               RETURN(PTR_ERR(task));
+
+       wait_for_completion(&obd_zombie_start);
+       RETURN(0);
+}
+/**
+ * Stop the zombie import/export destroy thread.
+ */
+void obd_zombie_impexp_stop(void)
+{
+       set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+       obd_zombie_impexp_notify();
+       wait_for_completion(&obd_zombie_stop);
+}
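+
+/*
+ * Lifecycle sketch for the zombie machinery above (illustrative only; the
+ * call sites named here are assumptions, not part of this file):
+ *
+ *     obd_zombie_impexp_init();   (module init: spawns "obd_zombid")
+ *     ...                         (dying imports/exports are queued via
+ *                                  obd_zombie_import_add()/_export_add())
+ *     obd_zombie_barrier();       (wait for the queues to drain)
+ *     obd_zombie_impexp_stop();   (module exit: stops the thread)
+ */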
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Get length of entire message, including header */
+int kuc_len(int payload_len)
+{
+       return sizeof(struct kuc_hdr) + payload_len;
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Get a pointer to kuc header, given a ptr to the payload
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr *kuc_ptr(void *p)
+{
+       struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1;
+       LASSERT(lh->kuc_magic == KUC_MAGIC);
+       return lh;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Test if payload is part of kuc message
+ * @param p Pointer to payload area
+ * @returns boolean
+ */
+int kuc_ispayload(void *p)
+{
+       struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1;
+
+       if (kh->kuc_magic == KUC_MAGIC)
+               return 1;
+       else
+               return 0;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Alloc space for a message, and fill in header
+ * @return Pointer to payload area
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+       struct kuc_hdr *lh;
+       int len = kuc_len(payload_len);
+
+       OBD_ALLOC(lh, len);
+       if (lh == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       lh->kuc_magic = KUC_MAGIC;
+       lh->kuc_transport = transport;
+       lh->kuc_msgtype = type;
+       lh->kuc_msglen = len;
+
+       return (void *)(lh + 1);
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Takes pointer to payload area */
+void kuc_free(void *p, int payload_len)
+{
+       struct kuc_hdr *lh = kuc_ptr(p);
+       OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
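+
+/*
+ * Usage sketch for the kuc helpers (illustrative; "struct my_payload" and
+ * the transport/type arguments are placeholders, not real definitions):
+ *
+ *     struct my_payload *p = kuc_alloc(sizeof(*p), transport, type);
+ *     if (IS_ERR(p))
+ *             return PTR_ERR(p);
+ *     ... fill *p, hand kuc_ptr(p) (header + payload) to the transport ...
+ *     kuc_free(p, sizeof(*p));
+ */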
diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c
new file mode 100644 (file)
index 0000000..622f8d1
--- /dev/null
@@ -0,0 +1,474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/idmap.c
+ *
+ * Lustre user identity mapping.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <lustre_idmap.h>
+#include <md_object.h>
+#include <obd_support.h>
+
+#define lustre_get_group_info(group_info) do {      \
+       atomic_inc(&(group_info)->usage);             \
+} while (0)
+
+#define lustre_put_group_info(group_info) do {      \
+       if (atomic_dec_and_test(&(group_info)->usage)) \
+               groups_free(group_info);               \
+} while (0)
+
+/*
+ * groups_search() is copied from the linux kernel!
+ * A simple bsearch.
+ */
+static int lustre_groups_search(group_info_t *group_info,
+                               gid_t grp)
+{
+       int left, right;
+
+       if (!group_info)
+               return 0;
+
+       left = 0;
+       right = group_info->ngroups;
+       while (left < right) {
+               int mid = (left + right) / 2;
+               int cmp = grp - CFS_GROUP_AT(group_info, mid);
+
+               if (cmp > 0)
+                       left = mid + 1;
+               else if (cmp < 0)
+                       right = mid;
+               else
+                       return 1;
+       }
+       return 0;
+}
+
+void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist)
+{
+       int i;
+       int count = ginfo->ngroups;
+
+       /* fill group_info from gid array */
+       for (i = 0; i < ginfo->nblocks && count > 0; i++) {
+               int cp_count = min(CFS_NGROUPS_PER_BLOCK, count);
+               int off = i * CFS_NGROUPS_PER_BLOCK;
+               int len = cp_count * sizeof(*glist);
+
+               memcpy(ginfo->blocks[i], glist + off, len);
+               count -= cp_count;
+       }
+}
+EXPORT_SYMBOL(lustre_groups_from_list);
+
+/* groups_sort() is copied from the linux kernel! */
+/* a simple Shell-Metzner sort */
+void lustre_groups_sort(group_info_t *group_info)
+{
+       int base, max, stride;
+       int gidsetsize = group_info->ngroups;
+
+       for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+               ; /* nothing */
+       stride /= 3;
+
+       while (stride) {
+               max = gidsetsize - stride;
+               for (base = 0; base < max; base++) {
+                       int left = base;
+                       int right = left + stride;
+                       gid_t tmp = CFS_GROUP_AT(group_info, right);
+
+                       while (left >= 0 &&
+                              CFS_GROUP_AT(group_info, left) > tmp) {
+                               CFS_GROUP_AT(group_info, right) =
+                                   CFS_GROUP_AT(group_info, left);
+                               right = left;
+                               left -= stride;
+                       }
+                       CFS_GROUP_AT(group_info, right) = tmp;
+               }
+               stride /= 3;
+       }
+}
+EXPORT_SYMBOL(lustre_groups_sort);
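+
+/*
+ * Typical use of the two helpers above (a sketch; groups_alloc() from the
+ * kernel is assumed as the allocator, "glist"/"ngroups" are caller values):
+ *
+ *     group_info_t *ginfo = groups_alloc(ngroups);
+ *     lustre_groups_from_list(ginfo, glist);
+ *     lustre_groups_sort(ginfo);  (lustre_groups_search() needs sorted order)
+ */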
+
+int lustre_in_group_p(struct lu_ucred *mu, gid_t grp)
+{
+       int rc = 1;
+
+       if (grp != mu->uc_fsgid) {
+               group_info_t *group_info = NULL;
+
+               if (mu->uc_ginfo || !mu->uc_identity ||
+                   mu->uc_valid == UCRED_OLD)
+                       if (grp == mu->uc_suppgids[0] ||
+                           grp == mu->uc_suppgids[1])
+                               return 1;
+
+               if (mu->uc_ginfo)
+                       group_info = mu->uc_ginfo;
+               else if (mu->uc_identity)
+                       group_info = mu->uc_identity->mi_ginfo;
+
+               if (!group_info)
+                       return 0;
+
+               lustre_get_group_info(group_info);
+               rc = lustre_groups_search(group_info, grp);
+               lustre_put_group_info(group_info);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lustre_in_group_p);
+
+struct lustre_idmap_entry {
+       struct list_head       lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */
+       struct list_head       lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */
+       struct list_head       lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */
+       struct list_head       lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */
+       uid_t       lie_rmt_uid;      /* remote uid */
+       uid_t       lie_lcl_uid;      /* local uid */
+       gid_t       lie_rmt_gid;      /* remote gid */
+       gid_t       lie_lcl_gid;      /* local gid */
+};
+
+static inline __u32 lustre_idmap_hashfunc(__u32 id)
+{
+       return id & (CFS_IDMAP_HASHSIZE - 1);
+}
+
+static
+struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid,
+                                            gid_t rmt_gid, gid_t lcl_gid)
+{
+       struct lustre_idmap_entry *e;
+
+       OBD_ALLOC_PTR(e);
+       if (e == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&e->lie_rmt_uid_hash);
+       INIT_LIST_HEAD(&e->lie_lcl_uid_hash);
+       INIT_LIST_HEAD(&e->lie_rmt_gid_hash);
+       INIT_LIST_HEAD(&e->lie_lcl_gid_hash);
+       e->lie_rmt_uid = rmt_uid;
+       e->lie_lcl_uid = lcl_uid;
+       e->lie_rmt_gid = rmt_gid;
+       e->lie_lcl_gid = lcl_gid;
+
+       return e;
+}
+
+static void idmap_entry_free(struct lustre_idmap_entry *e)
+{
+       if (!list_empty(&e->lie_rmt_uid_hash))
+               list_del(&e->lie_rmt_uid_hash);
+       if (!list_empty(&e->lie_lcl_uid_hash))
+               list_del(&e->lie_lcl_uid_hash);
+       if (!list_empty(&e->lie_rmt_gid_hash))
+               list_del(&e->lie_rmt_gid_hash);
+       if (!list_empty(&e->lie_lcl_gid_hash))
+               list_del(&e->lie_lcl_gid_hash);
+       OBD_FREE_PTR(e);
+}
+
+/*
+ * return value
+ * NULL: not found entry
+ * ERR_PTR(-EACCES): found 1(remote):N(local) mapped entry
+ * others: found normal entry
+ */
+static
+struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t,
+                                             uid_t rmt_uid, uid_t lcl_uid,
+                                             gid_t rmt_gid, gid_t lcl_gid)
+{
+       struct list_head *head;
+       struct lustre_idmap_entry *e;
+
+       head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)];
+       list_for_each_entry(e, head, lie_rmt_uid_hash)
+               if (e->lie_rmt_uid == rmt_uid) {
+                       if (e->lie_lcl_uid == lcl_uid) {
+                               if (e->lie_rmt_gid == rmt_gid &&
+                                   e->lie_lcl_gid == lcl_gid)
+                                       /* all four ids match */
+                                       return e;
+                       } else {
+                               /* 1:N uid mapping */
+                               CERROR("rmt uid %u is already mapped to %u"
+                                      " (new %u)\n", e->lie_rmt_uid,
+                                      e->lie_lcl_uid, lcl_uid);
+                               return ERR_PTR(-EACCES);
+                       }
+               }
+
+       head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)];
+       list_for_each_entry(e, head, lie_rmt_gid_hash)
+               if (e->lie_rmt_gid == rmt_gid) {
+                       if (e->lie_lcl_gid == lcl_gid) {
+                               if (unlikely(e->lie_rmt_uid == rmt_uid &&
+                                   e->lie_lcl_uid == lcl_uid))
+                                       /* after uid mapping search above,
+                                        * we should never come here */
+                                       LBUG();
+                       } else {
+                               /* 1:N gid mapping */
+                               CERROR("rmt gid %u is already mapped to %u"
+                                      " (new %u)\n", e->lie_rmt_gid,
+                                      e->lie_lcl_gid, lcl_gid);
+                               return ERR_PTR(-EACCES);
+                       }
+               }
+
+       return NULL;
+}
+
+static __u32 idmap_lookup_uid(struct list_head *hash, int reverse,
+                             __u32 uid)
+{
+       struct list_head *head = &hash[lustre_idmap_hashfunc(uid)];
+       struct lustre_idmap_entry *e;
+
+       if (!reverse) {
+               list_for_each_entry(e, head, lie_rmt_uid_hash)
+                       if (e->lie_rmt_uid == uid)
+                               return e->lie_lcl_uid;
+       } else {
+               list_for_each_entry(e, head, lie_lcl_uid_hash)
+                       if (e->lie_lcl_uid == uid)
+                               return e->lie_rmt_uid;
+       }
+
+       return CFS_IDMAP_NOTFOUND;
+}
+
+static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid)
+{
+       struct list_head *head = &hash[lustre_idmap_hashfunc(gid)];
+       struct lustre_idmap_entry *e;
+
+       if (!reverse) {
+               list_for_each_entry(e, head, lie_rmt_gid_hash)
+                       if (e->lie_rmt_gid == gid)
+                               return e->lie_lcl_gid;
+       } else {
+               list_for_each_entry(e, head, lie_lcl_gid_hash)
+                       if (e->lie_lcl_gid == gid)
+                               return e->lie_rmt_gid;
+       }
+
+       return CFS_IDMAP_NOTFOUND;
+}
+
+int lustre_idmap_add(struct lustre_idmap_table *t,
+                    uid_t ruid, uid_t luid,
+                    gid_t rgid, gid_t lgid)
+{
+       struct lustre_idmap_entry *e0, *e1;
+
+       LASSERT(t);
+
+       spin_lock(&t->lit_lock);
+       e0 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+       spin_unlock(&t->lit_lock);
+       if (!e0) {
+               e0 = idmap_entry_alloc(ruid, luid, rgid, lgid);
+               if (!e0)
+                       return -ENOMEM;
+
+               spin_lock(&t->lit_lock);
+               e1 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+               if (e1 == NULL) {
+                       list_add_tail(&e0->lie_rmt_uid_hash,
+                                         &t->lit_idmaps[RMT_UIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(ruid)]);
+                       list_add_tail(&e0->lie_lcl_uid_hash,
+                                         &t->lit_idmaps[LCL_UIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(luid)]);
+                       list_add_tail(&e0->lie_rmt_gid_hash,
+                                         &t->lit_idmaps[RMT_GIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(rgid)]);
+                       list_add_tail(&e0->lie_lcl_gid_hash,
+                                         &t->lit_idmaps[LCL_GIDMAP_IDX]
+                                         [lustre_idmap_hashfunc(lgid)]);
+               }
+               spin_unlock(&t->lit_lock);
+               if (e1 != NULL) {
+                       idmap_entry_free(e0);
+                       if (IS_ERR(e1))
+                               return PTR_ERR(e1);
+               }
+       } else if (IS_ERR(e0)) {
+               return PTR_ERR(e0);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(lustre_idmap_add);
+
+int lustre_idmap_del(struct lustre_idmap_table *t,
+                   uid_t ruid, uid_t luid,
+                   gid_t rgid, gid_t lgid)
+{
+       struct lustre_idmap_entry *e;
+       int rc = 0;
+
+       LASSERT(t);
+
+       spin_lock(&t->lit_lock);
+       e = idmap_search_entry(t, ruid, luid, rgid, lgid);
+       if (IS_ERR(e))
+               rc = PTR_ERR(e);
+       else if (e)
+               idmap_entry_free(e);
+       spin_unlock(&t->lit_lock);
+
+       return rc;
+}
+EXPORT_SYMBOL(lustre_idmap_del);
+
+int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+                           struct lustre_idmap_table *t,
+                           int reverse, uid_t uid)
+{
+       struct list_head *hash;
+
+       if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+               if (!reverse) {
+                       if (uid == mu->uc_o_uid)
+                               return mu->uc_uid;
+                       else if (uid == mu->uc_o_fsuid)
+                               return mu->uc_fsuid;
+               } else {
+                       if (uid == mu->uc_uid)
+                               return mu->uc_o_uid;
+                       else if (uid == mu->uc_fsuid)
+                               return mu->uc_o_fsuid;
+               }
+       }
+
+       if (t == NULL)
+               return CFS_IDMAP_NOTFOUND;
+
+       hash = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX];
+
+       spin_lock(&t->lit_lock);
+       uid = idmap_lookup_uid(hash, reverse, uid);
+       spin_unlock(&t->lit_lock);
+
+       return uid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_uid);
+
+int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t,
+                           int reverse, gid_t gid)
+{
+       struct list_head *hash;
+
+       if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+               if (!reverse) {
+                       if (gid == mu->uc_o_gid)
+                               return mu->uc_gid;
+                       else if (gid == mu->uc_o_fsgid)
+                               return mu->uc_fsgid;
+               } else {
+                       if (gid == mu->uc_gid)
+                               return mu->uc_o_gid;
+                       else if (gid == mu->uc_fsgid)
+                               return mu->uc_o_fsgid;
+               }
+       }
+
+       if (t == NULL)
+               return CFS_IDMAP_NOTFOUND;
+
+       hash = t->lit_idmaps[reverse ? LCL_GIDMAP_IDX : RMT_GIDMAP_IDX];
+
+       spin_lock(&t->lit_lock);
+       gid = idmap_lookup_gid(hash, reverse, gid);
+       spin_unlock(&t->lit_lock);
+
+       return gid;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_gid);
+
+struct lustre_idmap_table *lustre_idmap_init(void)
+{
+       struct lustre_idmap_table *t;
+       int i, j;
+
+       OBD_ALLOC_PTR(t);
+       if (unlikely(t == NULL))
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock_init(&t->lit_lock);
+       for (i = 0; i < ARRAY_SIZE(t->lit_idmaps); i++)
+               for (j = 0; j < ARRAY_SIZE(t->lit_idmaps[i]); j++)
+                       INIT_LIST_HEAD(&t->lit_idmaps[i][j]);
+
+       return t;
+}
+EXPORT_SYMBOL(lustre_idmap_init);
+
+void lustre_idmap_fini(struct lustre_idmap_table *t)
+{
+       struct list_head *list;
+       struct lustre_idmap_entry *e;
+       int i;
+       LASSERT(t);
+
+       list = t->lit_idmaps[RMT_UIDMAP_IDX];
+       spin_lock(&t->lit_lock);
+       for (i = 0; i < CFS_IDMAP_HASHSIZE; i++)
+               while (!list_empty(&list[i])) {
+                       e = list_entry(list[i].next,
+                                          struct lustre_idmap_entry,
+                                          lie_rmt_uid_hash);
+                       idmap_entry_free(e);
+               }
+       spin_unlock(&t->lit_lock);
+
+       OBD_FREE_PTR(t);
+}
+EXPORT_SYMBOL(lustre_idmap_fini);
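+
+/*
+ * End-to-end sketch of the idmap table API (illustrative; the id values
+ * are arbitrary examples):
+ *
+ *     struct lustre_idmap_table *t = lustre_idmap_init();
+ *     if (!IS_ERR(t)) {
+ *             lustre_idmap_add(t, 500, 1000, 500, 1000);
+ *             uid = lustre_idmap_lookup_uid(NULL, t, 0, 500);  (-> 1000)
+ *             uid = lustre_idmap_lookup_uid(NULL, t, 1, 1000); (-> 500)
+ *             lustre_idmap_fini(t);
+ *     }
+ */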
diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c
new file mode 100644 (file)
index 0000000..b5c19ac
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <lustre_linkea.h>
+
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf)
+{
+       ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE);
+       if (ldata->ld_buf->lb_buf == NULL)
+               return -ENOMEM;
+       ldata->ld_leh = ldata->ld_buf->lb_buf;
+       ldata->ld_leh->leh_magic = LINK_EA_MAGIC;
+       ldata->ld_leh->leh_len = sizeof(struct link_ea_header);
+       ldata->ld_leh->leh_reccount = 0;
+       return 0;
+}
+EXPORT_SYMBOL(linkea_data_new);
+
+int linkea_init(struct linkea_data *ldata)
+{
+       struct link_ea_header *leh;
+
+       LASSERT(ldata->ld_buf != NULL);
+       leh = ldata->ld_buf->lb_buf;
+       if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) {
+               leh->leh_magic = LINK_EA_MAGIC;
+               leh->leh_reccount = __swab32(leh->leh_reccount);
+               leh->leh_len = __swab64(leh->leh_len);
+               /* entries are swabbed by linkea_entry_unpack */
+       }
+       if (leh->leh_magic != LINK_EA_MAGIC)
+               return -EINVAL;
+       if (leh->leh_reccount == 0)
+               return -ENODATA;
+
+       ldata->ld_leh = leh;
+       return 0;
+}
+EXPORT_SYMBOL(linkea_init);
+
+/**
+ * Pack a link_ea_entry.
+ * All elements are stored as chars to avoid alignment issues.
+ * Numbers are always big-endian
+ * \retval record length
+ */
+static int linkea_entry_pack(struct link_ea_entry *lee,
+                            const struct lu_name *lname,
+                            const struct lu_fid *pfid)
+{
+       struct lu_fid   tmpfid;
+       int          reclen;
+
+       fid_cpu_to_be(&tmpfid, pfid);
+       if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH))
+               tmpfid.f_ver = ~0;
+       memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid));
+       memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen);
+       reclen = sizeof(struct link_ea_entry) + lname->ln_namelen;
+
+       lee->lee_reclen[0] = (reclen >> 8) & 0xff;
+       lee->lee_reclen[1] = reclen & 0xff;
+       return reclen;
+}
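+
+/*
+ * Resulting on-disk record layout (reading the pack routine above):
+ *
+ *     lee_reclen[0..1]   total record length, 16-bit big-endian
+ *     lee_parent_fid     parent fid, stored big-endian
+ *     lee_name           ln_namelen bytes of name, no NUL terminator
+ */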
+
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+                        struct lu_name *lname, struct lu_fid *pfid)
+{
+       *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
+       memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid));
+       fid_be_to_cpu(pfid, pfid);
+       lname->ln_name = lee->lee_name;
+       lname->ln_namelen = *reclen - sizeof(struct link_ea_entry);
+}
+EXPORT_SYMBOL(linkea_entry_unpack);
+
+/**
+ * Add a record to the end of link ea buf
+ **/
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+                  const struct lu_fid *pfid)
+{
+       LASSERT(ldata->ld_leh != NULL);
+
+       if (lname == NULL || pfid == NULL)
+               return -EINVAL;
+
+       ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry);
+       if (ldata->ld_leh->leh_len + ldata->ld_reclen >
+           ldata->ld_buf->lb_len) {
+               if (lu_buf_check_and_grow(ldata->ld_buf,
+                                         ldata->ld_leh->leh_len +
+                                         ldata->ld_reclen) < 0)
+                       return -ENOMEM;
+       }
+
+       ldata->ld_leh = ldata->ld_buf->lb_buf;
+       ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len;
+       ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+       ldata->ld_leh->leh_len += ldata->ld_reclen;
+       ldata->ld_leh->leh_reccount++;
+       CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n",
+              lname->ln_namelen, lname->ln_name);
+       return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/** Del the current record from the link ea buf */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname)
+{
+       LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+
+       ldata->ld_leh->leh_reccount--;
+       ldata->ld_leh->leh_len -= ldata->ld_reclen;
+       memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen,
+               (char *)ldata->ld_leh + ldata->ld_leh->leh_len -
+               (char *)ldata->ld_lee);
+       CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+              lname->ln_namelen, lname->ln_name);
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+/**
+ * Check if such a link exists in linkEA.
+ *
+ * \param ldata link data the search to be done on
+ * \param lname name in the parent's directory entry pointing to this object
+ * \param pfid parent fid the link to be found for
+ *
+ * \retval   0 success
+ * \retval -ENOENT link does not exist
+ * \retval -ve on error
+ */
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+                     const struct lu_fid  *pfid)
+{
+       struct lu_name tmpname;
+       struct lu_fid  tmpfid;
+       int count;
+
+       LASSERT(ldata->ld_leh != NULL);
+
+       /* link #0 */
+       ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+
+       for (count = 0; count < ldata->ld_leh->leh_reccount; count++) {
+               linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
+                                   &tmpname, &tmpfid);
+               if (tmpname.ln_namelen == lname->ln_namelen &&
+                   lu_fid_eq(&tmpfid, pfid) &&
+                   (strncmp(tmpname.ln_name, lname->ln_name,
+                            tmpname.ln_namelen) == 0))
+                       break;
+               ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee +
+                                                        ldata->ld_reclen);
+       }
+
+       if (count == ldata->ld_leh->leh_reccount) {
+               CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n",
+                      lname->ln_namelen, lname->ln_name);
+               ldata->ld_lee = NULL;
+               return -ENOENT;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(linkea_links_find);
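+
+/*
+ * Putting the linkea helpers together (a sketch; "buf", "lname" and "pfid"
+ * are caller-supplied values assumed for the example):
+ *
+ *     struct linkea_data ldata = { NULL };
+ *
+ *     rc = linkea_data_new(&ldata, buf);       (fresh, empty link EA)
+ *     if (rc == 0)
+ *             rc = linkea_add_buf(&ldata, lname, pfid);
+ *     if (rc == 0 && linkea_links_find(&ldata, lname, pfid) == 0)
+ *             linkea_del_buf(&ldata, lname);   (removes the record found)
+ */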
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644 (file)
index 0000000..d2c3072
--- /dev/null
@@ -0,0 +1,408 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lprocfs_status.h>
+#include <lustre_ver.h>
+#include <lustre/lustre_build_version.h>
+
+int proc_version;
+
+/* buffer MUST be at least the size of obd_ioctl_hdr */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+       struct obd_ioctl_hdr hdr;
+       struct obd_ioctl_data *data;
+       int err;
+       int offset = 0;
+       ENTRY;
+
+       /* copy_from_user() returns the number of bytes left uncopied,
+        * not an errno, so don't pass its result back to the caller */
+       if (copy_from_user(&hdr, (void *)arg, sizeof(hdr)))
+               RETURN(-EFAULT);
+
+       if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+               CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+                      OBD_IOCTL_VERSION, hdr.ioc_version);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+               CERROR("User buffer len %d exceeds %d max buffer\n",
+                      hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+               RETURN(-EINVAL);
+       }
+
+       if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+               CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       /* When many processes call vmalloc on a multi-core system, the high
+        * lock contention hurts performance badly; obdfilter-survey, which
+        * relies on ioctl, is one example. So we avoid vmalloc on the ioctl
+        * path. LU-66 */
+       OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+       if (*buf == NULL) {
+               CERROR("Cannot allocate control buffer of len %d\n",
+                      hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+       *len = hdr.ioc_len;
+       data = (struct obd_ioctl_data *)*buf;
+
+       if (copy_from_user(*buf, (void *)arg, hdr.ioc_len)) {
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EFAULT);
+       }
+
+       if (obd_ioctl_is_invalid(data)) {
+               CERROR("ioctl not correctly formatted\n");
+               OBD_FREE_LARGE(*buf, hdr.ioc_len);
+               RETURN(-EINVAL);
+       }
+
+       if (data->ioc_inllen1) {
+               data->ioc_inlbuf1 = &data->ioc_bulk[0];
+               offset += cfs_size_round(data->ioc_inllen1);
+       }
+
+       if (data->ioc_inllen2) {
+               data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen2);
+       }
+
+       if (data->ioc_inllen3) {
+               data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+               offset += cfs_size_round(data->ioc_inllen3);
+       }
+
+       if (data->ioc_inllen4) {
+               data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+       }
+
+       EXIT;
+       return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+       int err;
+
+       err = copy_to_user(arg, data, len);
+       if (err)
+               err = -EFAULT;
+       return err;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
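+
+/*
+ * Caller-side pattern for the two helpers above (a sketch; the command
+ * handling in the middle and obd_ioctl_freedata() as the matching free
+ * are assumptions):
+ *
+ *     char *buf = NULL;
+ *     int len = 0;
+ *
+ *     if (obd_ioctl_getdata(&buf, &len, (void *)arg))
+ *             return -EINVAL;
+ *     data = (struct obd_ioctl_data *)buf;
+ *     ... handle the command, optionally fill data ...
+ *     obd_ioctl_popdata((void *)arg, data, len);
+ *     obd_ioctl_freedata(buf, len);
+ */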
+
+/*  opening /dev/obd */
+static int obd_class_open(struct inode * inode, struct file * file)
+{
+       ENTRY;
+
+       try_module_get(THIS_MODULE);
+       RETURN(0);
+}
+
+/*  closing /dev/obd */
+static int obd_class_release(struct inode * inode, struct file * file)
+{
+       ENTRY;
+
+       module_put(THIS_MODULE);
+       RETURN(0);
+}
+
+/* to control /dev/obd */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+                           unsigned long arg)
+{
+       int err = 0;
+       ENTRY;
+
+       /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+       if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET))
+               RETURN(err = -EACCES);
+       if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */
+               RETURN(err = -ENOTTY);
+
+       err = class_handle_ioctl(cmd, (unsigned long)arg);
+
+       RETURN(err);
+}
+
+/* declare character device */
+static struct file_operations obd_psdev_fops = {
+       .owner    = THIS_MODULE,
+       .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */
+       .open      = obd_class_open,      /* open */
+       .release        = obd_class_release,   /* release */
+};
+
+/* modules setup */
+psdev_t obd_psdev = {
+       .minor = OBD_DEV_MINOR,
+       .name  = OBD_DEV_NAME,
+       .fops  = &obd_psdev_fops,
+};
+
+
+#ifdef LPROCFS
+int obd_proc_version_seq_show(struct seq_file *m, void *v)
+{
+       return seq_printf(m, "lustre: %s\nkernel: %s\nbuild:  %s\n",
+                       LUSTRE_VERSION_STRING, "patchless_client",
+                       BUILD_VERSION);
+}
+LPROC_SEQ_FOPS_RO(obd_proc_version);
+
+int obd_proc_pinger_seq_show(struct seq_file *m, void *v)
+{
+       return seq_printf(m, "%s\n", "on");
+}
+LPROC_SEQ_FOPS_RO(obd_proc_pinger);
+
+static int obd_proc_health_seq_show(struct seq_file *m, void *v)
+{
+       int rc = 0, i;
+
+       if (libcfs_catastrophe)
+               seq_printf(m, "LBUG\n");
+
+       read_lock(&obd_dev_lock);
+       for (i = 0; i < class_devno_max(); i++) {
+               struct obd_device *obd;
+
+               obd = class_num2obd(i);
+               if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+                       continue;
+
+               LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+               if (obd->obd_stopping)
+                       continue;
+
+               class_incref(obd, __FUNCTION__, current);
+               read_unlock(&obd_dev_lock);
+
+               if (obd_health_check(NULL, obd)) {
+                       seq_printf(m, "device %s reported unhealthy\n",
+                                     obd->obd_name);
+                       rc++;
+               }
+               class_decref(obd, __FUNCTION__, current);
+               read_lock(&obd_dev_lock);
+       }
+       read_unlock(&obd_dev_lock);
+
+       if (rc == 0)
+               return seq_printf(m, "healthy\n");
+
+       seq_printf(m, "NOT HEALTHY\n");
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_health);
+
+static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v)
+{
+       return seq_printf(m, "%s\n", obd_jobid_var);
+}
+
+static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffer,
+                                       size_t count, loff_t *off)
+{
+       if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+               return -EINVAL;
+
+       memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+       /* "buffer" is a user-space pointer, so copy it safely */
+       if (copy_from_user(obd_jobid_var, buffer, count))
+               return -EFAULT;
+       /* Trim the trailing '\n' if any */
+       if (obd_jobid_var[count - 1] == '\n')
+               obd_jobid_var[count - 1] = 0;
+       return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_var);
+
+/* Root for /proc/fs/lustre */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+struct lprocfs_vars lprocfs_base[] = {
+       { "version", &obd_proc_version_fops },
+       { "pinger", &obd_proc_pinger_fops },
+       { "health_check", &obd_proc_health_fops },
+       { "jobid_var", &obd_proc_jobid_var_fops },
+       { 0 }
+};
+#else
+#define lprocfs_base NULL
+#endif /* LPROCFS */
+
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+       if (*pos >= class_devno_max())
+               return NULL;
+
+       return pos;
+}
+
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= class_devno_max())
+               return NULL;
+
+       return pos;
+}
+
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+       loff_t index = *(loff_t *)v;
+       struct obd_device *obd = class_num2obd((int)index);
+       char *status;
+
+       if (obd == NULL)
+               return 0;
+
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       if (obd->obd_stopping)
+               status = "ST";
+       else if (obd->obd_inactive)
+               status = "IN";
+       else if (obd->obd_set_up)
+               status = "UP";
+       else if (obd->obd_attached)
+               status = "AT";
+       else
+               status = "--";
+
+       return seq_printf(p, "%3d %s %s %s %s %d\n",
+                         (int)index, status, obd->obd_type->typ_name,
+                         obd->obd_name, obd->obd_uuid.uuid,
+                         atomic_read(&obd->obd_refcount));
+}
+
+struct seq_operations obd_device_list_sops = {
+       .start = obd_device_list_seq_start,
+       .stop = obd_device_list_seq_stop,
+       .next = obd_device_list_seq_next,
+       .show = obd_device_list_seq_show,
+};
+
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int rc = seq_open(file, &obd_device_list_sops);
+
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+
+       return 0;
+}
+
+struct file_operations obd_device_list_fops = {
+       .owner   = THIS_MODULE,
+       .open    = obd_device_list_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+int class_procfs_init(void)
+{
+       int rc;
+       ENTRY;
+
+       obd_sysctl_init();
+       proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+                                           lprocfs_base, NULL);
+       rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+                               &obd_device_list_fops, NULL);
+       if (rc)
+               CERROR("error adding /proc/fs/lustre/devices file\n");
+       RETURN(0);
+}
+
+int class_procfs_clean(void)
+{
+       ENTRY;
+       if (proc_lustre_root) {
+               lprocfs_remove(&proc_lustre_root);
+       }
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644 (file)
index 0000000..6ee3471
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/* FIXME: just copied from obdo_from_inode */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+       obd_flag newvalid = 0;
+
+       if (valid & LA_ATIME) {
+               dst->o_atime = la->la_atime;
+               newvalid |= OBD_MD_FLATIME;
+       }
+       if (valid & LA_MTIME) {
+               dst->o_mtime = la->la_mtime;
+               newvalid |= OBD_MD_FLMTIME;
+       }
+       if (valid & LA_CTIME) {
+               dst->o_ctime = la->la_ctime;
+               newvalid |= OBD_MD_FLCTIME;
+       }
+       if (valid & LA_SIZE) {
+               dst->o_size = la->la_size;
+               newvalid |= OBD_MD_FLSIZE;
+       }
+       if (valid & LA_BLOCKS) {  /* allocation of space (x512 bytes) */
+               dst->o_blocks = la->la_blocks;
+               newvalid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & LA_TYPE) {
+               dst->o_mode = (dst->o_mode & S_IALLUGO) |
+                             (la->la_mode & S_IFMT);
+               newvalid |= OBD_MD_FLTYPE;
+       }
+       if (valid & LA_MODE) {
+               dst->o_mode = (dst->o_mode & S_IFMT) |
+                             (la->la_mode & S_IALLUGO);
+               newvalid |= OBD_MD_FLMODE;
+       }
+       if (valid & LA_UID) {
+               dst->o_uid = la->la_uid;
+               newvalid |= OBD_MD_FLUID;
+       }
+       if (valid & LA_GID) {
+               dst->o_gid = la->la_gid;
+               newvalid |= OBD_MD_FLGID;
+       }
+       dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/* FIXME: just copied from obdo_from_inode */
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid)
+{
+       __u64 newvalid = 0;
+
+       valid &= obdo->o_valid;
+
+       if (valid & OBD_MD_FLATIME) {
+               dst->la_atime = obdo->o_atime;
+               newvalid |= LA_ATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               dst->la_mtime = obdo->o_mtime;
+               newvalid |= LA_MTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               dst->la_ctime = obdo->o_ctime;
+               newvalid |= LA_CTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               dst->la_size = obdo->o_size;
+               newvalid |= LA_SIZE;
+       }
+       if (valid & OBD_MD_FLBLOCKS) {
+               dst->la_blocks = obdo->o_blocks;
+               newvalid |= LA_BLOCKS;
+       }
+       if (valid & OBD_MD_FLTYPE) {
+               dst->la_mode = (dst->la_mode & S_IALLUGO) |
+                              (obdo->o_mode & S_IFMT);
+               newvalid |= LA_TYPE;
+       }
+       if (valid & OBD_MD_FLMODE) {
+               dst->la_mode = (dst->la_mode & S_IFMT) |
+                              (obdo->o_mode & S_IALLUGO);
+               newvalid |= LA_MODE;
+       }
+       if (valid & OBD_MD_FLUID) {
+               dst->la_uid = obdo->o_uid;
+               newvalid |= LA_UID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               dst->la_gid = obdo->o_gid;
+               newvalid |= LA_GID;
+       }
+       dst->la_valid = newvalid;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+       valid &= src->o_valid;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE,
+                      "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+                      src->o_valid, LTIME_S(dst->i_mtime),
+                      LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+       if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
+               LTIME_S(dst->i_atime) = src->o_atime;
+       if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+               LTIME_S(dst->i_mtime) = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+               LTIME_S(dst->i_ctime) = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               i_size_write(dst, src->o_size);
+       /* optimum IO size */
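+       /* (ffs(blksize) - 1 equals log2(blksize) for power-of-two block
+        * sizes, which is what i_blkbits stores.) */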
+       if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits))
+               dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+       if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+               dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+       /* allocation of space */
+       if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+               /*
+                * XXX: shouldn't overflow be checked here, like in
+                * obdo_to_inode()?
+                */
+               dst->i_blocks = src->o_blocks;
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+       valid &= src->o_valid;
+
+       LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+                           OBD_MD_FLID | OBD_MD_FLGROUP)),
+                "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE,
+                      "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+                      src->o_valid, LTIME_S(dst->i_mtime),
+                      LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+
+       if (valid & OBD_MD_FLATIME)
+               LTIME_S(dst->i_atime) = src->o_atime;
+       if (valid & OBD_MD_FLMTIME)
+               LTIME_S(dst->i_mtime) = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+               LTIME_S(dst->i_ctime) = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               i_size_write(dst, src->o_size);
+       if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
+               dst->i_blocks = src->o_blocks;
+               if (dst->i_blocks < src->o_blocks) /* overflow */
+                       dst->i_blocks = -1;
+
+       }
+       if (valid & OBD_MD_FLBLKSZ)
+               dst->i_blkbits = ffs(src->o_blksize) - 1;
+       if (valid & OBD_MD_FLMODE)
+               dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+       if (valid & OBD_MD_FLUID)
+               dst->i_uid = src->o_uid;
+       if (valid & OBD_MD_FLGID)
+               dst->i_gid = src->o_gid;
+       if (valid & OBD_MD_FLFLAGS)
+               dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644 (file)
index 0000000..46aad68
--- /dev/null
@@ -0,0 +1,445 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_SYSCTL
+ctl_table_header_t *obd_table_header = NULL;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+enum {
+       OBD_TIMEOUT = 3,                /* RPC timeout before recovery/intr */
+       OBD_DUMP_ON_TIMEOUT,            /* dump kernel debug log upon eviction */
+       OBD_MEMUSED,                    /* bytes currently OBD_ALLOCated */
+       OBD_PAGESUSED,                  /* pages currently OBD_PAGE_ALLOCated */
+       OBD_MAXMEMUSED,                 /* maximum bytes OBD_ALLOCated concurrently */
+       OBD_MAXPAGESUSED,               /* maximum pages OBD_PAGE_ALLOCated concurrently */
+       OBD_SYNCFILTER,                 /* XXX temporary, as we play with sync osts.. */
+       OBD_LDLM_TIMEOUT,               /* LDLM timeout for ASTs before client eviction */
+       OBD_DUMP_ON_EVICTION,           /* dump kernel debug log upon eviction */
+       OBD_DEBUG_PEER_ON_TIMEOUT,      /* dump peer debug when RPC times out */
+       OBD_ALLOC_FAIL_RATE,            /* memory allocation random failure rate */
+       OBD_MAX_DIRTY_PAGES,            /* maximum dirty pages */
+       OBD_AT_MIN,                     /* adaptive timeouts params */
+       OBD_AT_MAX,
+       OBD_AT_EXTRA,
+       OBD_AT_EARLY_MARGIN,
+       OBD_AT_HISTORY,
+};
+
+
+int LL_PROC_PROTO(proc_set_timeout)
+{
+       int rc;
+
+       rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+       if (ldlm_timeout >= obd_timeout)
+               ldlm_timeout = max(obd_timeout / 3, 1U);
+       return rc;
+}
+
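+/*
+ * The read-only handlers below share one pattern: format the counter into
+ * a small stack buffer, clamp the length to what the reader asked for,
+ * copy it out to userspace and advance *ppos; writes are rejected with
+ * -EINVAL.
+ */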
+int LL_PROC_PROTO(proc_memory_alloc)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_alloc)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_sum());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_mem_max)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_max());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_pages_max)
+{
+       char buf[22];
+       int len;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write)
+               return -EINVAL;
+
+       len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max());
+       if (len > *lenp)
+               len = *lenp;
+       buf[len] = '\0';
+       if (copy_to_user(buffer, buf, len))
+               return -EFAULT;
+       *lenp = len;
+       *ppos += *lenp;
+       return 0;
+}
+
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+       int rc = 0;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
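+       /* The proc file is expressed in megabytes while obd_max_dirty_pages
+        * counts pages; 1 << (20 - PAGE_CACHE_SHIFT) is the pages-per-MB
+        * conversion factor used in both directions. */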
+       if (write) {
+               rc = lprocfs_write_frac_helper(buffer, *lenp,
+                                              (unsigned int *)table->data,
+                                              1 << (20 - PAGE_CACHE_SHIFT));
+               /* Don't allow dirty pages to exceed 90% of system memory,
+                * and enforce a hard minimum of 4MB. */
+               if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+                       CERROR("Refusing to set max dirty pages to %u, which "
+                              "is more than 90%% of available RAM; setting "
+                              "to %lu\n", obd_max_dirty_pages,
+                              ((num_physpages / 10) * 9));
+                       obd_max_dirty_pages = ((num_physpages / 10) * 9);
+               } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+                       obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+               }
+       } else {
+               char buf[21];
+               int len;
+
+               len = lprocfs_read_frac_helper(buf, sizeof(buf),
+                                              *(unsigned int *)table->data,
+                                              1 << (20 - PAGE_CACHE_SHIFT));
+               if (len > *lenp)
+                       len = *lenp;
+               buf[len] = '\0';
+               if (copy_to_user(buffer, buf, len))
+                       return -EFAULT;
+               *lenp = len;
+       }
+       *ppos += *lenp;
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_alloc_fail_rate)
+{
+       int rc = 0;
+       DECLARE_LL_PROC_PPOS_DECL;
+
+       if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+               *lenp = 0;
+               return 0;
+       }
+       if (write) {
+               rc = lprocfs_write_frac_helper(buffer, *lenp,
+                                              (unsigned int *)table->data,
+                                              OBD_ALLOC_FAIL_MULT);
+       } else {
+               char buf[21];
+               int  len;
+
+               len = lprocfs_read_frac_helper(buf, sizeof(buf),
+                                              *(unsigned int *)table->data,
+                                              OBD_ALLOC_FAIL_MULT);
+               if (len > *lenp)
+                       len = *lenp;
+               buf[len] = '\0';
+               if (copy_to_user(buffer, buf, len))
+                       return -EFAULT;
+               *lenp = len;
+       }
+       *ppos += *lenp;
+       return rc;
+}
+
+int LL_PROC_PROTO(proc_at_min)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{
+       return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_SYSCTL
+static ctl_table_t obd_table[] = {
+       {
+               INIT_CTL_NAME(OBD_TIMEOUT)
+               .procname = "timeout",
+               .data     = &obd_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_set_timeout
+       },
+       {
+               INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT)
+               .procname = "debug_peer_on_timeout",
+               .data     = &obd_debug_peer_on_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT)
+               .procname = "dump_on_timeout",
+               .data     = &obd_dump_on_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_DUMP_ON_EVICTION)
+               .procname = "dump_on_eviction",
+               .data     = &obd_dump_on_eviction,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               INIT_CTL_NAME(OBD_MEMUSED)
+               .procname = "memused",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_memory_alloc
+       },
+       {
+               INIT_CTL_NAME(OBD_PAGESUSED)
+               .procname = "pagesused",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_pages_alloc
+       },
+       {
+               INIT_CTL_NAME(OBD_MAXMEMUSED)
+               .procname = "memused_max",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_mem_max
+       },
+       {
+               INIT_CTL_NAME(OBD_MAXPAGESUSED)
+               .procname = "pagesused_max",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0444,
+               .proc_handler = &proc_pages_max
+       },
+       {
+               INIT_CTL_NAME(OBD_LDLM_TIMEOUT)
+               .procname = "ldlm_timeout",
+               .data     = &ldlm_timeout,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_set_timeout
+       },
+       {
+               INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE)
+               .procname = "alloc_fail_rate",
+               .data     = &obd_alloc_fail_rate,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_alloc_fail_rate
+       },
+       {
+               INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES)
+               .procname = "max_dirty_mb",
+               .data     = &obd_max_dirty_pages,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_max_dirty_pages_in_mb
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_MIN)
+               .procname = "at_min",
+               .data     = &at_min,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_min
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_MAX)
+               .procname = "at_max",
+               .data     = &at_max,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_max
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_EXTRA)
+               .procname = "at_extra",
+               .data     = &at_extra,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_extra
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_EARLY_MARGIN)
+               .procname = "at_early_margin",
+               .data     = &at_early_margin,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_early_margin
+       },
+       {
+               INIT_CTL_NAME(OBD_AT_HISTORY)
+               .procname = "at_history",
+               .data     = &at_history,
+               .maxlen   = sizeof(int),
+               .mode     = 0644,
+               .proc_handler = &proc_at_history
+       },
+       {       INIT_CTL_NAME(0)    }
+};
+
+static ctl_table_t parent_table[] = {
+       {
+               INIT_CTL_NAME(OBD_SYSCTL)
+               .procname = "lustre",
+               .data     = NULL,
+               .maxlen   = 0,
+               .mode     = 0555,
+               .child    = obd_table
+       },
+       {       INIT_CTL_NAME(0)   }
+};
+#endif
+
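+/* Register/unregister the tables above; parent_table makes the obd_table
+ * entries appear under /proc/sys/lustre/. */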
+void obd_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (!obd_table_header)
+               obd_table_header = cfs_register_sysctl_table(parent_table, 0);
+#endif
+}
+
+void obd_sysctl_clean(void)
+{
+#ifdef CONFIG_SYSCTL
+       if (obd_table_header)
+               unregister_sysctl_table(obd_table_header);
+       obd_table_header = NULL;
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644 (file)
index 0000000..b1d215e
--- /dev/null
@@ -0,0 +1,966 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/*
+ * Allocate a new log or catalog handle
+ * Used inside llog_open().
+ */
+struct llog_handle *llog_alloc_handle(void)
+{
+       struct llog_handle *loghandle;
+
+       OBD_ALLOC_PTR(loghandle);
+       if (loghandle == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       init_rwsem(&loghandle->lgh_lock);
+       spin_lock_init(&loghandle->lgh_hdr_lock);
+       INIT_LIST_HEAD(&loghandle->u.phd.phd_entry);
+       atomic_set(&loghandle->lgh_refcount, 1);
+
+       return loghandle;
+}
+
+/*
+ * Free the llog handle and its header data, if any. Used only in llog_close().
+ */
+void llog_free_handle(struct llog_handle *loghandle)
+{
+       LASSERT(loghandle != NULL);
+
+       /* failed llog_init_handle */
+       if (!loghandle->lgh_hdr)
+               goto out;
+
+       if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)
+               LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+       else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               LASSERT(list_empty(&loghandle->u.chd.chd_head));
+       LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE);
+       OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+out:
+       OBD_FREE_PTR(loghandle);
+}
+
+void llog_handle_get(struct llog_handle *loghandle)
+{
+       atomic_inc(&loghandle->lgh_refcount);
+}
+
+void llog_handle_put(struct llog_handle *loghandle)
+{
+       LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+       if (atomic_dec_and_test(&loghandle->lgh_refcount))
+               llog_free_handle(loghandle);
+}
+
+/* returns negative on error; 0 if success; 1 if success & log destroyed */
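+/* A record is cancelled by clearing its bit in the header bitmap and
+ * rewriting the header.  If LLOG_F_ZAP_WHEN_EMPTY is set, only the header
+ * record remains, and the last index has reached the end of the bitmap,
+ * the whole log is destroyed instead. */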
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+                   int index)
+{
+       struct llog_log_hdr *llh = loghandle->lgh_hdr;
+       int rc = 0;
+       ENTRY;
+
+       CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+              index, POSTID(&loghandle->lgh_id.lgl_oi));
+
+       if (index == 0) {
+               CERROR("Can't cancel index 0 which is header\n");
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index);
+               RETURN(-ENOENT);
+       }
+
+       llh->llh_count--;
+
+       if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+           (llh->llh_count == 1) &&
+           (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               rc = llog_destroy(env, loghandle);
+               if (rc < 0) {
+                       CERROR("%s: can't destroy empty llog #"DOSTID
+                              "#%08x: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, rc);
+                       GOTO(out_err, rc);
+               }
+               RETURN(1);
+       }
+       spin_unlock(&loghandle->lgh_hdr_lock);
+
+       rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+       if (rc < 0) {
+               CERROR("%s: fail to write header for llog #"DOSTID
+                      "#%08x: rc = %d\n",
+                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&loghandle->lgh_id.lgl_oi),
+                      loghandle->lgh_id.lgl_ogen, rc);
+               GOTO(out_err, rc);
+       }
+       RETURN(0);
+out_err:
+       spin_lock(&loghandle->lgh_hdr_lock);
+       ext2_set_bit(index, llh->llh_bitmap);
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       return rc;
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
+static int llog_read_header(const struct lu_env *env,
+                           struct llog_handle *handle,
+                           struct obd_uuid *uuid)
+{
+       struct llog_operations *lop;
+       int rc;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+
+       if (lop->lop_read_header == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_read_header(env, handle);
+       if (rc == LLOG_EEMPTY) {
+               struct llog_log_hdr *llh = handle->lgh_hdr;
+
+               handle->lgh_last_idx = 0; /* header is record with index 0 */
+               llh->llh_count = 1;      /* for the header record */
+               llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+               llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+               llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0;
+               llh->llh_timestamp = cfs_time_current_sec();
+               if (uuid)
+                       memcpy(&llh->llh_tgtuuid, uuid,
+                              sizeof(llh->llh_tgtuuid));
+               llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap);
+               ext2_set_bit(0, llh->llh_bitmap);
+               rc = 0;
+       }
+       return rc;
+}
+
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+                    int flags, struct obd_uuid *uuid)
+{
+       struct llog_log_hdr     *llh;
+       int                      rc;
+
+       ENTRY;
+       LASSERT(handle->lgh_hdr == NULL);
+
+       OBD_ALLOC_PTR(llh);
+       if (llh == NULL)
+               RETURN(-ENOMEM);
+       handle->lgh_hdr = llh;
+       /* first assign flags to use llog_client_ops */
+       llh->llh_flags = flags;
+       rc = llog_read_header(env, handle, uuid);
+       if (rc == 0) {
+               if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+                             flags & LLOG_F_IS_CAT) ||
+                            (llh->llh_flags & LLOG_F_IS_CAT &&
+                             flags & LLOG_F_IS_PLAIN))) {
+                       CERROR("%s: llog type is %s but initializing %s\n",
+                              handle->lgh_ctxt->loc_obd->obd_name,
+                              llh->llh_flags & LLOG_F_IS_CAT ?
+                              "catalog" : "plain",
+                              flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+                       GOTO(out, rc = -EINVAL);
+               } else if (llh->llh_flags &
+                          (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+                       /*
+                        * it is possible to open a llog without specifying its
+                        * type, in which case the type is taken from llh_flags
+                        */
+                       flags = llh->llh_flags;
+               } else {
+                       /* for some reason llh_flags has no type set */
+                       CERROR("llog type is not specified!\n");
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (unlikely(uuid &&
+                            !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+                       CERROR("%s: llog uuid mismatch: %s/%s\n",
+                              handle->lgh_ctxt->loc_obd->obd_name,
+                              (char *)uuid->uuid,
+                              (char *)llh->llh_tgtuuid.uuid);
+                       GOTO(out, rc = -EEXIST);
+               }
+       }
+       if (flags & LLOG_F_IS_CAT) {
+               LASSERT(list_empty(&handle->u.chd.chd_head));
+               INIT_LIST_HEAD(&handle->u.chd.chd_head);
+               llh->llh_size = sizeof(struct llog_logid_rec);
+       } else if (!(flags & LLOG_F_IS_PLAIN)) {
+               CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+                      handle->lgh_ctxt->loc_obd->obd_name,
+                      flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+               rc = -EINVAL;
+       }
+out:
+       if (rc) {
+               OBD_FREE_PTR(llh);
+               handle->lgh_hdr = NULL;
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_init_handle);
+
+int llog_copy_handler(const struct lu_env *env,
+                     struct llog_handle *llh,
+                     struct llog_rec_hdr *rec,
+                     void *data)
+{
+       struct llog_rec_hdr local_rec = *rec;
+       struct llog_handle *local_llh = (struct llog_handle *)data;
+       char *cfg_buf = (char *)(rec + 1);
+       struct lustre_cfg *lcfg;
+       int rc = 0;
+       ENTRY;
+
+       /* Append all records */
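+       /* lrh_len covers header, payload and tail; the header and tail
+        * sizes are stripped here because llog_write_rec() adds them back
+        * when a record is written with an external buffer. */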
+       local_rec.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail);
+       rc = llog_write(env, local_llh, &local_rec, NULL, 0,
+                       (void *)cfg_buf, -1);
+
+       lcfg = (struct lustre_cfg *)cfg_buf;
+       CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n",
+              rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command,
+              lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+static int llog_process_thread(void *arg)
+{
+       struct llog_process_info        *lpi = arg;
+       struct llog_handle              *loghandle = lpi->lpi_loghandle;
+       struct llog_log_hdr             *llh = loghandle->lgh_hdr;
+       struct llog_process_cat_data    *cd  = lpi->lpi_catdata;
+       char                            *buf;
+       __u64                            cur_offset = LLOG_CHUNK_SIZE;
+       __u64                            last_offset;
+       int                              rc = 0, index = 1, last_index;
+       int                              saved_index = 0;
+       int                              last_called_index = 0;
+
+       ENTRY;
+
+       LASSERT(llh);
+
+       OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+       if (!buf) {
+               lpi->lpi_rc = -ENOMEM;
+               RETURN(0);
+       }
+
+       if (cd != NULL) {
+               last_called_index = cd->lpcd_first_idx;
+               index = cd->lpcd_first_idx + 1;
+       }
+       if (cd != NULL && cd->lpcd_last_idx)
+               last_index = cd->lpcd_last_idx;
+       else
+               last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
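+       /*
+        * Main scan: read the log in LLOG_CHUNK_SIZE blocks, swab each
+        * record header as needed, and run the callback on every index
+        * whose bit is set in the header bitmap.  The callback may stop
+        * the scan (LLOG_PROC_BREAK) or cancel the record it was handed
+        * (LLOG_DEL_RECORD).
+        */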
+       while (rc == 0) {
+               struct llog_rec_hdr *rec;
+
+               /* skip records not set in bitmap */
+               while (index <= last_index &&
+                      !ext2_test_bit(index, llh->llh_bitmap))
+                       ++index;
+
+               LASSERT(index <= last_index + 1);
+               if (index == last_index + 1)
+                       break;
+repeat:
+               CDEBUG(D_OTHER, "index: %d last_index %d\n",
+                      index, last_index);
+
+               /* get the buf with our target record; avoid old garbage */
+               memset(buf, 0, LLOG_CHUNK_SIZE);
+               last_offset = cur_offset;
+               rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+                                    index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* NB: when rec->lrh_len is accessed it is already swabbed
+                * since it is used at the "end" of the loop and the rec
+                * swabbing is done at the beginning of the loop. */
+               for (rec = (struct llog_rec_hdr *)buf;
+                    (char *)rec < buf + LLOG_CHUNK_SIZE;
+                    rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)) {
+
+                       CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+                              rec, rec->lrh_type);
+
+                       if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                               lustre_swab_llog_rec(rec);
+
+                       CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+                              rec->lrh_type, rec->lrh_index);
+
+                       if (rec->lrh_index == 0) {
+                               /* probably another rec just got added? */
+                               if (index <= loghandle->lgh_last_idx)
+                                       GOTO(repeat, rc = 0);
+                               GOTO(out, rc = 0); /* no more records */
+                       }
+                       if (rec->lrh_len == 0 ||
+                           rec->lrh_len > LLOG_CHUNK_SIZE) {
+                               CWARN("invalid length %d in llog record for "
+                                     "index %d/%d\n", rec->lrh_len,
+                                     rec->lrh_index, index);
+                               GOTO(out, rc = -EINVAL);
+                       }
+
+                       if (rec->lrh_index < index) {
+                               CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+                                      rec->lrh_index);
+                               continue;
+                       }
+
+                       CDEBUG(D_OTHER,
+                              "lrh_index: %d lrh_len: %d (%d remains)\n",
+                              rec->lrh_index, rec->lrh_len,
+                              (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+                       loghandle->lgh_cur_idx = rec->lrh_index;
+                       loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+                                                   last_offset;
+
+                       /* if set, process the callback on this record */
+                       if (ext2_test_bit(index, llh->llh_bitmap)) {
+                               rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+                                                lpi->lpi_cbdata);
+                               last_called_index = index;
+                               if (rc == LLOG_PROC_BREAK) {
+                                       GOTO(out, rc);
+                               } else if (rc == LLOG_DEL_RECORD) {
+                                       llog_cancel_rec(lpi->lpi_env,
+                                                       loghandle,
+                                                       rec->lrh_index);
+                                       rc = 0;
+                               }
+                               if (rc)
+                                       GOTO(out, rc);
+                       } else {
+                               CDEBUG(D_OTHER, "Skipped index %d\n", index);
+                       }
+
+                       /* next record, still in buffer? */
+                       ++index;
+                       if (index > last_index)
+                               GOTO(out, rc = 0);
+               }
+       }
+
+out:
+       if (cd != NULL)
+               cd->lpcd_last_idx = last_called_index;
+
+       OBD_FREE(buf, LLOG_CHUNK_SIZE);
+       lpi->lpi_rc = rc;
+       return 0;
+}
+
+static int llog_process_thread_daemonize(void *arg)
+{
+       struct llog_process_info        *lpi = arg;
+       struct lu_env                    env;
+       int                              rc;
+
+       unshare_fs_struct();
+
+       /* client env has no keys, tags is just 0 */
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               goto out;
+       lpi->lpi_env = &env;
+
+       rc = llog_process_thread(arg);
+
+       lu_env_fini(&env);
+out:
+       complete(&lpi->lpi_completion);
+       return rc;
+}
+
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+       struct llog_process_info *lpi;
+       int                   rc;
+
+       ENTRY;
+
+       OBD_ALLOC_PTR(lpi);
+       if (lpi == NULL) {
+               CERROR("cannot alloc pointer\n");
+               RETURN(-ENOMEM);
+       }
+       lpi->lpi_loghandle = loghandle;
+       lpi->lpi_cb        = cb;
+       lpi->lpi_cbdata    = data;
+       lpi->lpi_catdata   = catdata;
+
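+       /* Run the scan either in a freshly spawned kernel thread, waiting
+        * for its completion so the scan gets its own stack and env, or
+        * synchronously with the caller's env. */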
+       if (fork) {
+               /* The new thread can't use the parent's env; a new one is
+                * initialized in llog_process_thread_daemonize(). */
+               lpi->lpi_env = NULL;
+               init_completion(&lpi->lpi_completion);
+               rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi,
+                                            "llog_process_thread"));
+               if (IS_ERR_VALUE(rc)) {
+                       CERROR("%s: cannot start thread: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+                       OBD_FREE_PTR(lpi);
+                       RETURN(rc);
+               }
+               wait_for_completion(&lpi->lpi_completion);
+       } else {
+               lpi->lpi_env = env;
+               llog_process_thread(lpi);
+       }
+       rc = lpi->lpi_rc;
+       OBD_FREE_PTR(lpi);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_process_or_fork);
+
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+                llog_cb_t cb, void *data, void *catdata)
+{
+       return llog_process_or_fork(env, loghandle, cb, data, catdata, true);
+}
+EXPORT_SYMBOL(llog_process);
+
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+       if (loghandle && loghandle->lgh_hdr)
+               return loghandle->lgh_hdr->llh_count;
+       return 0;
+}
+EXPORT_SYMBOL(llog_get_size);
+
+int llog_reverse_process(const struct lu_env *env,
+                        struct llog_handle *loghandle, llog_cb_t cb,
+                        void *data, void *catdata)
+{
+       struct llog_log_hdr *llh = loghandle->lgh_hdr;
+       struct llog_process_cat_data *cd = catdata;
+       void *buf;
+       int rc = 0, first_index = 1, index, idx;
+       ENTRY;
+
+       OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+       if (!buf)
+               RETURN(-ENOMEM);
+
+       if (cd != NULL)
+               first_index = cd->lpcd_first_idx + 1;
+       if (cd != NULL && cd->lpcd_last_idx)
+               index = cd->lpcd_last_idx;
+       else
+               index = LLOG_BITMAP_BYTES * 8 - 1;
+
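+       /*
+        * Walk backwards: fetch the block holding the target index with
+        * llog_prev_block(), locate the record inside it, then step from
+        * record to record through their tails, newest to oldest.
+        */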
+       while (rc == 0) {
+               struct llog_rec_hdr *rec;
+               struct llog_rec_tail *tail;
+
+               /* skip records not set in bitmap */
+               while (index >= first_index &&
+                      !ext2_test_bit(index, llh->llh_bitmap))
+                       --index;
+
+               LASSERT(index >= first_index - 1);
+               if (index == first_index - 1)
+                       break;
+
+               /* get the buf with our target record; avoid old garbage */
+               memset(buf, 0, LLOG_CHUNK_SIZE);
+               rc = llog_prev_block(env, loghandle, index, buf,
+                                    LLOG_CHUNK_SIZE);
+               if (rc)
+                       GOTO(out, rc);
+
+               rec = buf;
+               idx = rec->lrh_index;
+               CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+               while (idx < index) {
+                       rec = (void *)rec + rec->lrh_len;
+                       if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                               lustre_swab_llog_rec(rec);
+                       idx++;
+               }
+               LASSERT(idx == index);
+               tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+               /* process records in buffer, starting where we found one */
+               while ((void *)tail > buf) {
+                       if (tail->lrt_index == 0)
+                               GOTO(out, rc = 0); /* no more records */
+
+                       /* if set, process the callback on this record */
+                       if (ext2_test_bit(index, llh->llh_bitmap)) {
+                               rec = (void *)tail - tail->lrt_len +
+                                     sizeof(*tail);
+
+                               rc = cb(env, loghandle, rec, data);
+                               if (rc == LLOG_PROC_BREAK) {
+                                       GOTO(out, rc);
+                               } else if (rc == LLOG_DEL_RECORD) {
+                                       llog_cancel_rec(env, loghandle,
+                                                       tail->lrt_index);
+                                       rc = 0;
+                               }
+                               if (rc)
+                                       GOTO(out, rc);
+                       }
+
+                       /* previous record, still in buffer? */
+                       --index;
+                       if (index < first_index)
+                               GOTO(out, rc = 0);
+                       tail = (void *)tail - tail->lrt_len;
+               }
+       }
+
+out:
+       if (buf)
+               OBD_FREE(buf, LLOG_CHUNK_SIZE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * new llog API
+ *
+ * API functions:
+ *      llog_open - open llog, may not exist
+ *      llog_exist - check if llog exists
+ *      llog_close - close opened llog, pair for open, frees llog_handle
+ *      llog_declare_create - declare llog creation
+ *      llog_create - create new llog on disk, need transaction handle
+ *      llog_declare_write_rec - declaration of llog write
+ *      llog_write_rec - write llog record on disk, need transaction handle
+ *      llog_declare_add - declare llog catalog record addition
+ *      llog_add - add llog record in catalog, need transaction handle
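+ *
+ * A minimal usage sketch of this API (an editor's illustration, not part
+ * of the original code; "env", "ctxt" and a filled record header "rec"
+ * are assumed to be set up by the caller):
+ *
+ *     struct llog_handle *lgh;
+ *     int rc;
+ *
+ *     rc = llog_open_create(env, ctxt, &lgh, NULL, "my_log");
+ *     if (rc == 0) {
+ *             rc = llog_init_handle(env, lgh, LLOG_F_IS_PLAIN, NULL);
+ *             if (rc == 0)
+ *                     rc = llog_write(env, lgh, &rec, NULL, 0, NULL, -1);
+ *             llog_close(env, lgh);
+ *     }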
+ */
+int llog_exist(struct llog_handle *loghandle)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_exist == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       rc = lop->lop_exist(loghandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_exist);
+
+int llog_declare_create(const struct lu_env *env,
+                       struct llog_handle *loghandle, struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_declare_create == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_declare_create(env, loghandle, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+               struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       if (lop->lop_create == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_create(env, handle, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_create);
+
+int llog_declare_write_rec(const struct lu_env *env,
+                          struct llog_handle *handle,
+                          struct llog_rec_hdr *rec, int idx,
+                          struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+       LASSERT(lop);
+       if (lop->lop_declare_write_rec == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_declare_write_rec(env, handle, rec, idx, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+                  struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+                  int numcookies, void *buf, int idx, struct thandle *th)
+{
+       struct llog_operations  *lop;
+       int                      raised, rc, buflen;
+
+       ENTRY;
+
+       rc = llog_handle2ops(handle, &lop);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(lop);
+       if (lop->lop_write_rec == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       if (buf)
+               buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) +
+                        sizeof(struct llog_rec_tail);
+       else
+               buflen = rec->lrh_len;
+       LASSERT(cfs_size_round(buflen) == buflen);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies,
+                               buf, idx, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+            struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+            void *buf, struct thandle *th)
+{
+       int raised, rc;
+
+       ENTRY;
+
+       if (lgh->lgh_logops->lop_add == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_add);
+
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+                    struct llog_rec_hdr *rec, struct thandle *th)
+{
+       int raised, rc;
+
+       ENTRY;
+
+       if (lgh->lgh_logops->lop_declare_add == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open a llog, creating it if it doesn't exist.
+ * It hides all transaction handling from the caller.
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+                    struct llog_handle **res, struct llog_logid *logid,
+                    char *name)
+{
+       struct thandle  *th;
+       int              rc;
+
+       ENTRY;
+
+       rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+       if (rc)
+               RETURN(rc);
+
+       if (llog_exist(*res))
+               RETURN(0);
+
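+       /* OSD-backed llog: follow the dt transaction discipline of
+        * declare, start, execute, stop. */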
+       if ((*res)->lgh_obj != NULL) {
+               struct dt_device *d;
+
+               d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+               th = dt_trans_create(env, d);
+               if (IS_ERR(th))
+                       GOTO(out, rc = PTR_ERR(th));
+
+               rc = llog_declare_create(env, *res, th);
+               if (rc == 0) {
+                       rc = dt_trans_start_local(env, d, th);
+                       if (rc == 0)
+                               rc = llog_create(env, *res, th);
+               }
+               dt_trans_stop(env, d, th);
+       } else {
+               /* lvfs compat code */
+               LASSERT((*res)->lgh_file == NULL);
+               rc = llog_create(env, *res, NULL);
+       }
+out:
+       if (rc)
+               llog_close(env, *res);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete existent llog.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+              struct llog_logid *logid, char *name)
+{
+       struct llog_handle      *handle;
+       int                      rc = 0, rc2;
+
+       ENTRY;
+
+       /* nothing to erase */
+       if (name == NULL && logid == NULL)
+               RETURN(0);
+
+       rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+       if (rc < 0)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+       if (rc == 0)
+               rc = llog_destroy(env, handle);
+
+       rc2 = llog_close(env, handle);
+       if (rc == 0)
+               rc = rc2;
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record into a llog.
+ * It hides all transaction handling from the caller.
+ * Valid only for local llogs.
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+              struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+              int cookiecount, void *buf, int idx)
+{
+       int rc;
+
+       ENTRY;
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       if (loghandle->lgh_obj != NULL) {
+               struct dt_device        *dt;
+               struct thandle          *th;
+
+               dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+               th = dt_trans_create(env, dt);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
+
+               rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dt, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               down_write(&loghandle->lgh_lock);
+               rc = llog_write_rec(env, loghandle, rec, reccookie,
+                                   cookiecount, buf, idx, th);
+               up_write(&loghandle->lgh_lock);
+out_trans:
+               dt_trans_stop(env, dt, th);
+       } else { /* lvfs compatibility */
+               down_write(&loghandle->lgh_lock);
+               rc = llog_write_rec(env, loghandle, rec, reccookie,
+                                   cookiecount, buf, idx, NULL);
+               up_write(&loghandle->lgh_lock);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write);
+
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+             struct llog_handle **lgh, struct llog_logid *logid,
+             char *name, enum llog_open_param open_param)
+{
+       int      raised;
+       int      rc;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_logops);
+
+       if (ctxt->loc_logops->lop_open == NULL) {
+               *lgh = NULL;
+               RETURN(-EOPNOTSUPP);
+       }
+
+       *lgh = llog_alloc_handle();
+       if (*lgh == NULL)
+               RETURN(-ENOMEM);
+       (*lgh)->lgh_ctxt = ctxt;
+       (*lgh)->lgh_logops = ctxt->loc_logops;
+
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       if (rc) {
+               llog_free_handle(*lgh);
+               *lgh = NULL;
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open);
+
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+       struct llog_operations  *lop;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_handle2ops(loghandle, &lop);
+       if (rc)
+               GOTO(out, rc);
+       if (lop->lop_close == NULL)
+               GOTO(out, rc = -EOPNOTSUPP);
+       rc = lop->lop_close(env, loghandle);
+out:
+       llog_handle_put(loghandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_close);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644 (file)
index 0000000..cf00b2f
--- /dev/null
@@ -0,0 +1,833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+
+#include "llog_internal.h"
+
+/* Create a new log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+                           struct llog_handle *cathandle,
+                           struct llog_handle *loghandle,
+                           struct thandle *th)
+{
+       struct llog_log_hdr *llh;
+       struct llog_logid_rec rec = { { 0 }, };
+       int rc, index, bitmap_size;
+       ENTRY;
+
+       llh = cathandle->lgh_hdr;
+       bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+       index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+       /* maximum number of available slots in the catalog is bitmap_size - 2 */
+       if (llh->llh_cat_idx == index) {
+               CERROR("no free catalog slots for log...\n");
+               RETURN(-ENOSPC);
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+               RETURN(-ENOSPC);
+
+       rc = llog_create(env, loghandle, th);
+       /* if llog is already created, no need to initialize it */
+       if (rc == -EEXIST) {
+               RETURN(0);
+       } else if (rc != 0) {
+               CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+                      loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, loghandle,
+                             LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+                             &cathandle->lgh_hdr->llh_tgtuuid);
+       if (rc)
+               GOTO(out_destroy, rc);
+
+       if (index == 0)
+               index = 1;
+
+       spin_lock(&loghandle->lgh_hdr_lock);
+       llh->llh_count++;
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("argh, index %u already set in log bitmap?\n",
+                      index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       spin_unlock(&loghandle->lgh_hdr_lock);
+
+       cathandle->lgh_last_idx = index;
+       llh->llh_tail.lrt_index = index;
+
+       CDEBUG(D_RPCTRACE, "new recovery log "DOSTID":%x for index %u of catalog "
+              DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+              loghandle->lgh_id.lgl_ogen, index,
+              POSTID(&cathandle->lgh_id.lgl_oi));
+       /* build the record for this log in the catalog */
+       rec.lid_hdr.lrh_len = sizeof(rec);
+       rec.lid_hdr.lrh_index = index;
+       rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+       rec.lid_id = loghandle->lgh_id;
+       rec.lid_tail.lrt_len = sizeof(rec);
+       rec.lid_tail.lrt_index = index;
+
+       /* update the catalog: header and record */
+       rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+                           &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+       if (rc < 0)
+               GOTO(out_destroy, rc);
+
+       loghandle->lgh_hdr->llh_cat_idx = index;
+       RETURN(0);
+out_destroy:
+       llog_destroy(env, loghandle);
+       RETURN(rc);
+}
+
+/* Open an existing log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ * We return a lock on the handle to ensure nobody yanks it from us.
+ *
+ * This takes an extra reference on the llog_handle via llog_handle_get() and
+ * requires the caller to release it with llog_handle_put().
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+                      struct llog_handle **res, struct llog_logid *logid)
+{
+       struct llog_handle      *loghandle;
+       int                      rc = 0;
+
+       ENTRY;
+
+       if (cathandle == NULL)
+               RETURN(-EBADF);
+
+       down_write(&cathandle->lgh_lock);
+       list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+                               u.phd.phd_entry) {
+               struct llog_logid *cgl = &loghandle->lgh_id;
+
+               if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+                   ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+                       if (cgl->lgl_ogen != logid->lgl_ogen) {
+                               CERROR("%s: log "DOSTID" generation %x != %x\n",
+                                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                                      POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+                                      logid->lgl_ogen);
+                               continue;
+                       }
+                       loghandle->u.phd.phd_cat_handle = cathandle;
+                       up_write(&cathandle->lgh_lock);
+                       GOTO(out, rc = 0);
+               }
+       }
+       up_write(&cathandle->lgh_lock);
+
+       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+                      LLOG_OPEN_EXISTS);
+       if (rc < 0) {
+               CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+       if (rc < 0) {
+               llog_close(env, loghandle);
+               loghandle = NULL;
+               RETURN(rc);
+       }
+
+       down_write(&cathandle->lgh_lock);
+       list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+       up_write(&cathandle->lgh_lock);
+
+       loghandle->u.phd.phd_cat_handle = cathandle;
+       loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+       loghandle->u.phd.phd_cookie.lgc_index =
+                               loghandle->lgh_hdr->llh_cat_idx;
+       EXIT;
+out:
+       llog_handle_get(loghandle);
+       *res = loghandle;
+       return 0;
+}
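+
+/*
+ * A minimal lookup sketch (hypothetical caller; assumes env, cathandle and
+ * logid are already set up):
+ *
+ *     struct llog_handle *plain;
+ *     int rc;
+ *
+ *     rc = llog_cat_id2handle(env, cathandle, &plain, logid);
+ *     if (rc == 0) {
+ *             ... read records from plain ...
+ *             llog_handle_put(plain);
+ *     }
+ */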
+
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+       struct llog_handle      *loghandle, *n;
+       int                      rc;
+
+       ENTRY;
+
+       list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head,
+                                    u.phd.phd_entry) {
+               struct llog_log_hdr     *llh = loghandle->lgh_hdr;
+               int                      index;
+
+               /* unlink open-not-created llogs */
+               list_del_init(&loghandle->u.phd.phd_entry);
+               if (loghandle->lgh_obj != NULL && llh != NULL &&
+                   (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+                   (llh->llh_count == 1)) {
+                       rc = llog_destroy(env, loghandle);
+                       if (rc)
+                               CERROR("%s: failure destroying log during cleanup: rc = %d\n",
+                                      loghandle->lgh_ctxt->loc_obd->obd_name,
+                                      rc);
+
+                       index = loghandle->u.phd.phd_cookie.lgc_index;
+                       llog_cat_cleanup(env, cathandle, NULL, index);
+               }
+               llog_close(env, loghandle);
+       }
+       /* if handle was stored in ctxt, remove it too */
+       if (cathandle->lgh_ctxt->loc_handle == cathandle)
+               cathandle->lgh_ctxt->loc_handle = NULL;
+       rc = llog_close(env, cathandle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ */
+enum {
+       LLOGH_CAT,
+       LLOGH_LOG
+};
+
+/** Return the currently active log handle.  If the current log handle doesn't
+ * have enough space left for the current record, start a new one.
+ *
+ * The returned log is write-locked so that nobody can steal our space
+ * while the caller writes the current record into it.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * NOTE: loghandle is write-locked upon successful return
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+                                               struct thandle *th)
+{
+       struct llog_handle *loghandle = NULL;
+       ENTRY;
+
+       down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+       loghandle = cathandle->u.chd.chd_current_log;
+       if (loghandle) {
+               struct llog_log_hdr *llh;
+
+               down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+               llh = loghandle->lgh_hdr;
+               if (llh == NULL ||
+                   loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+                       up_read(&cathandle->lgh_lock);
+                       RETURN(loghandle);
+               } else {
+                       up_write(&loghandle->lgh_lock);
+               }
+       }
+       up_read(&cathandle->lgh_lock);
+
+       /* time to use next log */
+
+       /* first, we have to make sure the state hasn't changed */
+       down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+       loghandle = cathandle->u.chd.chd_current_log;
+       if (loghandle) {
+               struct llog_log_hdr *llh;
+
+               down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+               llh = loghandle->lgh_hdr;
+               LASSERT(llh);
+               if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+                       up_write(&cathandle->lgh_lock);
+                       RETURN(loghandle);
+               } else {
+                       up_write(&loghandle->lgh_lock);
+               }
+       }
+
+       CDEBUG(D_INODE, "use next log\n");
+
+       loghandle = cathandle->u.chd.chd_next_log;
+       cathandle->u.chd.chd_current_log = loghandle;
+       cathandle->u.chd.chd_next_log = NULL;
+       down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+       up_write(&cathandle->lgh_lock);
+       LASSERT(loghandle);
+       RETURN(loghandle);
+}
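+
+/*
+ * Note on the locking above: llog_cat_current_log() first checks
+ * chd_current_log under a read lock, then re-checks it under the write
+ * lock, since another thread may have switched to the next log in
+ * between.  The LLOGH_CAT/LLOGH_LOG lockdep classes order the nested
+ * lgh_lock acquisitions (catalog first, then plain log).
+ */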
+
+/* Add a single record to the recovery log(s) using a catalog.
+ * Returns the same values as llog_write_rec().
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                    void *buf, struct thandle *th)
+{
+       struct llog_handle *loghandle;
+       int rc;
+       ENTRY;
+
+       LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+       loghandle = llog_cat_current_log(cathandle, th);
+       LASSERT(!IS_ERR(loghandle));
+
+       /* loghandle is already locked by llog_cat_current_log() for us */
+       if (!llog_exist(loghandle)) {
+               rc = llog_cat_new_log(env, cathandle, loghandle, th);
+               if (rc < 0) {
+                       up_write(&loghandle->lgh_lock);
+                       RETURN(rc);
+               }
+       }
+       /* now let's try to add the record */
+       rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+       if (rc < 0)
+               CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+                            "llog_write_rec %d: lh=%p\n", rc, loghandle);
+       up_write(&loghandle->lgh_lock);
+       if (rc == -ENOSPC) {
+               /* try to use next log */
+               loghandle = llog_cat_current_log(cathandle, th);
+               LASSERT(!IS_ERR(loghandle));
+               /* new llog can be created concurrently */
+               if (!llog_exist(loghandle)) {
+                       rc = llog_cat_new_log(env, cathandle, loghandle, th);
+                       if (rc < 0) {
+                               up_write(&loghandle->lgh_lock);
+                               RETURN(rc);
+                       }
+               }
+               /* now let's try to add the record */
+               rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+                                   -1, th);
+               if (rc < 0)
+                       CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+               up_write(&loghandle->lgh_lock);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+int llog_cat_declare_add_rec(const struct lu_env *env,
+                            struct llog_handle *cathandle,
+                            struct llog_rec_hdr *rec, struct thandle *th)
+{
+       struct llog_handle      *loghandle, *next;
+       int                      rc = 0;
+
+       ENTRY;
+
+       if (cathandle->u.chd.chd_current_log == NULL) {
+               /* declare new plain llog */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_current_log == NULL) {
+                       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+                                      NULL, NULL, LLOG_OPEN_NEW);
+                       if (rc == 0) {
+                               cathandle->u.chd.chd_current_log = loghandle;
+                               list_add_tail(&loghandle->u.phd.phd_entry,
+                                                 &cathandle->u.chd.chd_head);
+                       }
+               }
+               up_write(&cathandle->lgh_lock);
+       } else if (cathandle->u.chd.chd_next_log == NULL) {
+               /* declare next plain llog */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_next_log == NULL) {
+                       rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+                                      NULL, NULL, LLOG_OPEN_NEW);
+                       if (rc == 0) {
+                               cathandle->u.chd.chd_next_log = loghandle;
+                               list_add_tail(&loghandle->u.phd.phd_entry,
+                                                 &cathandle->u.chd.chd_head);
+                       }
+               }
+               up_write(&cathandle->lgh_lock);
+       }
+       if (rc)
+               GOTO(out, rc);
+
+       if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+               rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+                                        th);
+               if (rc)
+                       GOTO(out, rc);
+               llog_declare_write_rec(env, cathandle, NULL, -1, th);
+       }
+       /* declare records in the llogs */
+       rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+                                   rec, -1, th);
+       if (rc)
+               GOTO(out, rc);
+
+       next = cathandle->u.chd.chd_next_log;
+       if (next) {
+               if (!llog_exist(next)) {
+                       rc = llog_declare_create(env, next, th);
+                       llog_declare_write_rec(env, cathandle, NULL, -1, th);
+               }
+               llog_declare_write_rec(env, next, rec, -1, th);
+       }
+out:
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+                struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+                void *buf)
+{
+       struct llog_ctxt        *ctxt;
+       struct dt_device        *dt;
+       struct thandle          *th = NULL;
+       int                      rc;
+
+       ctxt = cathandle->lgh_ctxt;
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+
+       if (cathandle->lgh_obj != NULL) {
+               dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+               LASSERT(dt);
+
+               th = dt_trans_create(env, dt);
+               if (IS_ERR(th))
+                       RETURN(PTR_ERR(th));
+
+               rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dt, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+               rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+               dt_trans_stop(env, dt, th);
+       } else { /* lvfs compat code */
+               LASSERT(cathandle->lgh_file != NULL);
+               rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+               if (rc == 0)
+                       rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+                                             buf, th);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
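+
+/*
+ * A minimal usage sketch (hypothetical caller; assumes "ctxt" holds the
+ * open catalog in loc_handle and "rec" is a prepared record header with
+ * lrh_len/lrh_type filled in):
+ *
+ *     struct llog_cookie cookie;
+ *     int rc;
+ *
+ *     rc = llog_cat_add(env, ctxt->loc_handle, rec, &cookie, NULL);
+ *
+ * The returned cookie identifies the plain llog and index that received
+ * the record and can later be passed to llog_cat_cancel_records().
+ */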
+
+/* For each cookie in the cookie array, clear the record's in-use bit, then:
+ * - if the log is now empty, mark it free in the catalog header and delete it
+ * - otherwise, just write out the updated log header
+ *
+ * The cookies may be in different log files, so we need to get new logs
+ * each time.
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+                           struct llog_handle *cathandle, int count,
+                           struct llog_cookie *cookies)
+{
+       int i, index, rc = 0, failed = 0;
+
+       ENTRY;
+
+       for (i = 0; i < count; i++, cookies++) {
+               struct llog_handle      *loghandle;
+               struct llog_logid       *lgl = &cookies->lgc_lgl;
+               int                      lrc;
+
+               rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+               if (rc) {
+                       CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                              cathandle->lgh_ctxt->loc_obd->obd_name,
+                              POSTID(&lgl->lgl_oi), rc);
+                       failed++;
+                       continue;
+               }
+
+               lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+               if (lrc == 1) {   /* log has been destroyed */
+                       index = loghandle->u.phd.phd_cookie.lgc_index;
+                       rc = llog_cat_cleanup(env, cathandle, loghandle,
+                                             index);
+               } else if (lrc == -ENOENT) {
+                       if (rc == 0) /* ENOENT shouldn't rewrite any error */
+                               rc = lrc;
+               } else if (lrc < 0) {
+                       failed++;
+                       rc = lrc;
+               }
+               llog_handle_put(loghandle);
+       }
+       if (rc)
+               CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+                      rc);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
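+
+/*
+ * A minimal cancellation sketch (hypothetical; assumes "cookie" was filled
+ * in by an earlier llog_cat_add() against the same catalog):
+ *
+ *     rc = llog_cat_cancel_records(env, cathandle, 1, &cookie);
+ *
+ * If cancelling the record empties a LLOG_F_ZAP_WHEN_EMPTY plain llog, the
+ * plain llog is destroyed and its slot in the catalog is freed.
+ */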
+
+int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
+                       struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_process_data *d = data;
+       struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle *llh;
+       int rc;
+
+       ENTRY;
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cat_llh->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               RETURN(rc);
+       }
+
+       if (rec->lrh_index < d->lpd_startcat)
+               /* Skip processing of the logs until startcat */
+               RETURN(0);
+
+       if (d->lpd_startidx > 0) {
+               struct llog_process_cat_data cd;
+
+               cd.lpcd_first_idx = d->lpd_startidx;
+               cd.lpcd_last_idx = 0;
+               rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+                                         &cd, false);
+               /* Continue processing the next log from idx 0 */
+               d->lpd_startidx = 0;
+       } else {
+               rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+                                         NULL, false);
+       }
+       llog_handle_put(llh);
+
+       RETURN(rc);
+}
+
+int llog_cat_process_or_fork(const struct lu_env *env,
+                            struct llog_handle *cat_llh,
+                            llog_cb_t cb, void *data, int startcat,
+                            int startidx, bool fork)
+{
+       struct llog_process_data d;
+       struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+       int rc;
+       ENTRY;
+
+       LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+       d.lpd_data = data;
+       d.lpd_cb = cb;
+       d.lpd_startcat = startcat;
+       d.lpd_startidx = startidx;
+
+       if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+               struct llog_process_cat_data cd;
+
+               CWARN("catlog "DOSTID" crosses index zero\n",
+                     POSTID(&cat_llh->lgh_id.lgl_oi));
+
+               cd.lpcd_first_idx = llh->llh_cat_idx;
+               cd.lpcd_last_idx = 0;
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, &cd, fork);
+               if (rc != 0)
+                       RETURN(rc);
+
+               cd.lpcd_first_idx = 0;
+               cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, &cd, fork);
+       } else {
+               rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+                                         &d, NULL, fork);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+                    llog_cb_t cb, void *data, int startcat, int startidx)
+{
+       return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+                                       startidx, false);
+}
+EXPORT_SYMBOL(llog_cat_process);
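+
+/*
+ * A minimal processing sketch (hypothetical callback; any function matching
+ * llog_cb_t can be used):
+ *
+ *     static int count_rec_cb(const struct lu_env *env,
+ *                             struct llog_handle *llh,
+ *                             struct llog_rec_hdr *rec, void *data)
+ *     {
+ *             (*(int *)data)++;
+ *             return 0;
+ *     }
+ *
+ *     int count = 0;
+ *
+ *     rc = llog_cat_process(env, cat_llh, count_rec_cb, &count, 0, 0);
+ *
+ * With startcat/startidx of 0 every record of every plain llog is visited.
+ */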
+
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+                                      struct llog_handle *cat_llh,
+                                      struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_process_data *d = data;
+       struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle *llh;
+       int rc;
+
+       if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cat_llh->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               RETURN(rc);
+       }
+
+       rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
+       llog_handle_put(llh);
+       RETURN(rc);
+}
+
+int llog_cat_reverse_process(const struct lu_env *env,
+                            struct llog_handle *cat_llh,
+                            llog_cb_t cb, void *data)
+{
+       struct llog_process_data d;
+       struct llog_process_cat_data cd;
+       struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+       int rc;
+       ENTRY;
+
+       LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
+       d.lpd_data = data;
+       d.lpd_cb = cb;
+
+       if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+               CWARN("catalog "DOSTID" crosses index zero\n",
+                     POSTID(&cat_llh->lgh_id.lgl_oi));
+
+               cd.lpcd_first_idx = 0;
+               cd.lpcd_last_idx = cat_llh->lgh_last_idx;
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, &cd);
+               if (rc != 0)
+                       RETURN(rc);
+
+               cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
+               cd.lpcd_last_idx = 0;
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, &cd);
+       } else {
+               rc = llog_reverse_process(env, cat_llh,
+                                         llog_cat_reverse_process_cb,
+                                         &d, NULL);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
+int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+       struct llog_log_hdr *llh = cathandle->lgh_hdr;
+       int i, bitmap_size, idx;
+       ENTRY;
+
+       bitmap_size = LLOG_BITMAP_SIZE(llh);
+       if (llh->llh_cat_idx == (index - 1)) {
+               idx = llh->llh_cat_idx + 1;
+               llh->llh_cat_idx = idx;
+               if (idx == cathandle->lgh_last_idx)
+                       goto out;
+               for (i = (index + 1) % bitmap_size;
+                    i != cathandle->lgh_last_idx;
+                    i = (i + 1) % bitmap_size) {
+                       if (!ext2_test_bit(i, llh->llh_bitmap)) {
+                               idx = llh->llh_cat_idx + 1;
+                               llh->llh_cat_idx = idx;
+                       } else if (i == 0) {
+                               llh->llh_cat_idx = 0;
+                       } else {
+                               break;
+                       }
+               }
+out:
+               CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+                      POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+       }
+
+       RETURN(0);
+}
+
+/* Cleanup deleted plain llog traces from catalog */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_handle *loghandle, int index)
+{
+       int rc;
+
+       LASSERT(index);
+       if (loghandle != NULL) {
+               /* remove destroyed llog from catalog list and
+                * chd_current_log variable */
+               down_write(&cathandle->lgh_lock);
+               if (cathandle->u.chd.chd_current_log == loghandle)
+                       cathandle->u.chd.chd_current_log = NULL;
+               list_del_init(&loghandle->u.phd.phd_entry);
+               up_write(&cathandle->lgh_lock);
+               LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+               /* llog was opened and kept in a list, close it now */
+               llog_close(env, loghandle);
+       }
+       /* remove plain llog entry from catalog by index */
+       llog_cat_set_first_idx(cathandle, index);
+       rc = llog_cancel_rec(env, cathandle, index);
+       if (rc == 0)
+               CDEBUG(D_HA, "cancel plain log at index %u of catalog "DOSTID"\n",
+                      index, POSTID(&cathandle->lgh_id.lgl_oi));
+       return rc;
+}
+
+int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+                 struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       struct llog_handle      *loghandle;
+       struct llog_log_hdr     *llh;
+       int                      rc;
+
+       ENTRY;
+
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+              DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+              rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+       rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
+       if (rc) {
+               CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+                      cathandle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&lir->lid_id.lgl_oi), rc);
+               if (rc == -ENOENT || rc == -ESTALE) {
+                       /* remove index from catalog */
+                       llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+               }
+               RETURN(rc);
+       }
+
+       llh = loghandle->lgh_hdr;
+       if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+           (llh->llh_count == 1)) {
+               rc = llog_destroy(env, loghandle);
+               if (rc)
+                       CERROR("%s: fail to destroy empty log: rc = %d\n",
+                              loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+
+               llog_cat_cleanup(env, cathandle, loghandle,
+                                loghandle->u.phd.phd_cookie.lgc_index);
+       }
+       llog_handle_put(loghandle);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(cat_cancel_cb);
+
+/* Helper to initialize a catalog llog handle and process it once with
+ * cat_cancel_cb() to clean up empty or stale plain llogs. */
+int llog_cat_init_and_process(const struct lu_env *env,
+                             struct llog_handle *llh)
+{
+       int rc;
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+       if (rc)
+               CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
+                      "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);
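+
+/*
+ * A minimal setup sketch (hypothetical; assumes "cathandle" came from a
+ * prior llog_open() of the catalog object):
+ *
+ *     rc = llog_cat_init_and_process(env, cathandle);
+ *
+ * This initializes the header with LLOG_F_IS_CAT and walks the catalog once
+ * with cat_cancel_cb() so empty LLOG_F_ZAP_WHEN_EMPTY plain llogs are
+ * destroyed and stale catalog entries are reclaimed.
+ */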
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644 (file)
index 0000000..539e1d4
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include <lustre_log.h>
+
+struct llog_process_info {
+       struct llog_handle      *lpi_loghandle;
+       llog_cb_t                lpi_cb;
+       void                    *lpi_cbdata;
+       void                    *lpi_catdata;
+       int                      lpi_rc;
+       struct completion        lpi_completion;
+       const struct lu_env     *lpi_env;
+};
+
+struct llog_thread_info {
+       struct lu_attr                   lgi_attr;
+       struct lu_fid                    lgi_fid;
+       struct dt_object_format          lgi_dof;
+       struct lu_buf                    lgi_buf;
+       loff_t                           lgi_off;
+       struct llog_rec_hdr              lgi_lrh;
+       struct llog_rec_tail             lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+       struct llog_thread_info *lgi;
+
+       lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
+       LASSERT(lgi);
+       return lgi;
+}
+
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+       ostid_set_seq_llog(&logid->lgl_oi);
+       ostid_set_id(&logid->lgl_oi, ino);
+       logid->lgl_ogen = gen;
+}
+
+int llog_info_init(void);
+void llog_info_fini(void);
+
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+                      struct llog_handle **res, struct llog_logid *logid);
+int class_config_dump_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+int llog_process_or_fork(const struct lu_env *env,
+                        struct llog_handle *loghandle,
+                        llog_cb_t cb, void *data, void *catdata, bool fork);
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+                    struct llog_handle *loghandle, int index);
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644 (file)
index 0000000..0732874
--- /dev/null
@@ -0,0 +1,427 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+static int str2logid(struct llog_logid *logid, char *str, int len)
+{
+       char *start, *end, *endp;
+       __u64 id, seq;
+
+       ENTRY;
+       start = str;
+       if (*start != '#')
+               RETURN(-EINVAL);
+
+       start++;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       end = strchr(start, '#');
+       if (end == NULL || end == start)
+               RETURN(-EINVAL);
+
+       *end = '\0';
+       id = simple_strtoull(start, &endp, 0);
+       if (endp != end)
+               RETURN(-EINVAL);
+
+       start = ++end;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       end = strchr(start, '#');
+       if (end == NULL || end == start)
+               RETURN(-EINVAL);
+
+       *end = '\0';
+       seq = simple_strtoull(start, &endp, 0);
+       if (endp != end)
+               RETURN(-EINVAL);
+
+       ostid_set_seq(&logid->lgl_oi, seq);
+       ostid_set_id(&logid->lgl_oi, id);
+
+       start = ++end;
+       if (start - str >= len - 1)
+               RETURN(-EINVAL);
+       logid->lgl_ogen = simple_strtoul(start, &endp, 16);
+       if (*endp != '\0')
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
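+
+/*
+ * str2logid() accepts ids of the form "#<id>#<seq>#<ogen>", where <id> and
+ * <seq> are parsed with base auto-detection (0/0x prefixes allowed) and
+ * <ogen> is hexadecimal; e.g. (hypothetical values):
+ *
+ *     "#0x5#0x1#a"  ->  id 0x5, seq 0x1, ogen 0xa
+ */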
+
+static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_rec_hdr *rec, void *data)
+{
+       struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
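+       /* Note: these statics carry the output cursor and the index window
+        * across successive llog_process() callback invocations; they are
+        * (re)initialized only while ioc_inllen1 > 0, i.e. on the first
+        * record of a given ioctl request. */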
+       static int l, remains, from, to;
+       static char *out;
+       char *endp;
+       int cur_index, rc = 0;
+
+       ENTRY;
+
+       if (ioc_data && ioc_data->ioc_inllen1 > 0) {
+               l = 0;
+               remains = ioc_data->ioc_inllen4 +
+                       cfs_size_round(ioc_data->ioc_inllen1) +
+                       cfs_size_round(ioc_data->ioc_inllen2) +
+                       cfs_size_round(ioc_data->ioc_inllen3);
+               from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               ioc_data->ioc_inllen1 = 0;
+               out = ioc_data->ioc_bulk;
+       }
+
+       cur_index = rec->lrh_index;
+       if (cur_index < from)
+               RETURN(0);
+       if (to > 0 && cur_index > to)
+               RETURN(-LLOG_EEMPTY);
+
+       if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+               struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+               struct llog_handle      *loghandle;
+
+               if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+                       l = snprintf(out, remains,
+                                    "[index]: %05d  [type]: %02x  [len]: %04d failed\n",
+                                    cur_index, rec->lrh_type, rec->lrh_len);
+               }
+               if (handle->lgh_ctxt == NULL)
+                       RETURN(-EOPNOTSUPP);
+               rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
+               if (rc) {
+                       CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+                              POSTID(&lir->lid_id.lgl_oi),
+                              lir->lid_id.lgl_ogen);
+                       RETURN(rc);
+               }
+               rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
+               llog_handle_put(loghandle);
+       } else {
+               bool ok;
+
+               switch (rec->lrh_type) {
+               case OST_SZ_REC:
+               case MDS_UNLINK_REC:
+               case MDS_UNLINK64_REC:
+               case MDS_SETATTR64_REC:
+               case OBD_CFG_REC:
+               case LLOG_GEN_REC:
+               case LLOG_HDR_MAGIC:
+                       ok = true;
+                       break;
+               default:
+                       ok = false;
+               }
+
+               l = snprintf(out, remains,
+                            "[index]: %05d  [type]: %02x  [len]: %04d %s\n",
+                            cur_index, rec->lrh_type, rec->lrh_len,
+                            ok ? "ok" : "failed");
+               out += l;
+               remains -= l;
+               if (remains <= 0) {
+                       CERROR("%s: no space to print log records\n",
+                              handle->lgh_ctxt->loc_obd->obd_name);
+                       RETURN(-LLOG_EEMPTY);
+               }
+       }
+       RETURN(rc);
+}
+
+static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_rec_hdr *rec, void *data)
+{
+       struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+       static int l, remains, from, to;
+       static char *out;
+       char *endp;
+       int cur_index;
+
+       ENTRY;
+       if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
+               l = 0;
+               remains = ioc_data->ioc_inllen4 +
+                       cfs_size_round(ioc_data->ioc_inllen1) +
+                       cfs_size_round(ioc_data->ioc_inllen2) +
+                       cfs_size_round(ioc_data->ioc_inllen3);
+               from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       RETURN(-EINVAL);
+               out = ioc_data->ioc_bulk;
+               ioc_data->ioc_inllen1 = 0;
+       }
+
+       cur_index = rec->lrh_index;
+       if (cur_index < from)
+               RETURN(0);
+       if (to > 0 && cur_index > to)
+               RETURN(-LLOG_EEMPTY);
+
+       if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+               struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+
+               if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+                       CERROR("invalid record in catalog\n");
+                       RETURN(-EINVAL);
+               }
+
+               l = snprintf(out, remains,
+                            "[index]: %05d  [logid]: #"DOSTID"#%08x\n",
+                            cur_index, POSTID(&lir->lid_id.lgl_oi),
+                            lir->lid_id.lgl_ogen);
+       } else if (rec->lrh_type == OBD_CFG_REC) {
+               int rc;
+
+               rc = class_config_parse_rec(rec, out, remains);
+               if (rc < 0)
+                       RETURN(rc);
+               l = rc;
+       } else {
+               l = snprintf(out, remains,
+                            "[index]: %05d  [type]: %02x  [len]: %04d\n",
+                            cur_index, rec->lrh_type, rec->lrh_len);
+       }
+       out += l;
+       remains -= l;
+       if (remains <= 0) {
+               CERROR("not enough space for print log records\n");
+               RETURN(-LLOG_EEMPTY);
+       }
+
+       RETURN(0);
+}
+
+static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
+                          struct llog_logid *logid)
+{
+       struct llog_handle      *log;
+       int                      rc;
+
+       ENTRY;
+
+       rc = llog_cat_id2handle(env, cat, &log, logid);
+       if (rc) {
+               CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+                      POSTID(&logid->lgl_oi), logid->lgl_ogen);
+               RETURN(-ENOENT);
+       }
+
+       rc = llog_destroy(env, log);
+       if (rc) {
+               CDEBUG(D_IOCTL, "cannot destroy log\n");
+               GOTO(out, rc);
+       }
+       llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index);
+out:
+       llog_handle_put(log);
+       RETURN(rc);
+}
+
+static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
+                         struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       int                      rc;
+
+       ENTRY;
+       if (rec->lrh_type != LLOG_LOGID_MAGIC)
+               RETURN(-EINVAL);
+       rc = llog_remove_log(env, handle, &lir->lid_id);
+
+       RETURN(rc);
+}
+
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+              struct obd_ioctl_data *data)
+{
+       struct llog_logid        logid;
+       int                      rc = 0;
+       struct llog_handle      *handle = NULL;
+
+       ENTRY;
+
+       if (*data->ioc_inlbuf1 == '#') {
+               rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
+               if (rc)
+                       RETURN(rc);
+               rc = llog_open(env, ctxt, &handle, &logid, NULL,
+                              LLOG_OPEN_EXISTS);
+               if (rc)
+                       RETURN(rc);
+       } else if (*data->ioc_inlbuf1 == '$') {
+               char *name = data->ioc_inlbuf1 + 1;
+
+               rc = llog_open(env, ctxt, &handle, NULL, name,
+                              LLOG_OPEN_EXISTS);
+               if (rc)
+                       RETURN(rc);
+       } else {
+               RETURN(-EINVAL);
+       }
+
+       rc = llog_init_handle(env, handle, 0, NULL);
+       if (rc)
+               GOTO(out_close, rc = -ENOENT);
+
+       switch (cmd) {
+       case OBD_IOC_LLOG_INFO: {
+               int      l;
+               int      remains = data->ioc_inllen2 +
+                                  cfs_size_round(data->ioc_inllen1);
+               char    *out = data->ioc_bulk;
+
+               l = snprintf(out, remains,
+                            "logid:        #"DOSTID"#%08x\n"
+                            "flags:        %x (%s)\n"
+                            "records count:    %d\n"
+                            "last index:       %d\n",
+                            POSTID(&handle->lgh_id.lgl_oi),
+                            handle->lgh_id.lgl_ogen,
+                            handle->lgh_hdr->llh_flags,
+                            handle->lgh_hdr->llh_flags &
+                            LLOG_F_IS_CAT ? "cat" : "plain",
+                            handle->lgh_hdr->llh_count,
+                            handle->lgh_last_idx);
+               out += l;
+               remains -= l;
+               if (remains <= 0) {
+                       CERROR("%s: not enough space for log header info\n",
+                              ctxt->loc_obd->obd_name);
+                       rc = -ENOSPC;
+               }
+               break;
+       }
+       case OBD_IOC_LLOG_CHECK:
+               LASSERT(data->ioc_inllen1 > 0);
+               rc = llog_process(env, handle, llog_check_cb, data, NULL);
+               if (rc == -LLOG_EEMPTY)
+                       rc = 0;
+               else if (rc)
+                       GOTO(out_close, rc);
+               break;
+       case OBD_IOC_LLOG_PRINT:
+               LASSERT(data->ioc_inllen1 > 0);
+               rc = llog_process(env, handle, llog_print_cb, data, NULL);
+               if (rc == -LLOG_EEMPTY)
+                       rc = 0;
+               else if (rc)
+                       GOTO(out_close, rc);
+               break;
+       case OBD_IOC_LLOG_CANCEL: {
+               struct llog_cookie cookie;
+               struct llog_logid plain;
+               char *endp;
+
+               cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0);
+               if (*endp != '\0')
+                       GOTO(out_close, rc = -EINVAL);
+
+               if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+                       rc = llog_cancel_rec(NULL, handle, cookie.lgc_index);
+                       GOTO(out_close, rc);
+               } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+                       GOTO(out_close, rc = -EINVAL);
+               }
+
+               if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */
+                       GOTO(out_close, rc = -ENOTTY);
+
+               rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2);
+               if (rc)
+                       GOTO(out_close, rc);
+               cookie.lgc_lgl = plain;
+               rc = llog_cat_cancel_records(env, handle, 1, &cookie);
+               if (rc)
+                       GOTO(out_close, rc);
+               break;
+       }
+       case OBD_IOC_LLOG_REMOVE: {
+               struct llog_logid plain;
+
+               if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+                       rc = llog_destroy(env, handle);
+                       GOTO(out_close, rc);
+               } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+                       GOTO(out_close, rc = -EINVAL);
+               }
+
+               if (data->ioc_inlbuf2 != NULL) {
+                       /* remove the indicated log from the catalog */
+                       rc = str2logid(&plain, data->ioc_inlbuf2,
+                                      data->ioc_inllen2);
+                       if (rc)
+                               GOTO(out_close, rc);
+                       rc = llog_remove_log(env, handle, &plain);
+               } else {
+                       /* remove all logs from the catalog */
+                       rc = llog_process(env, handle, llog_delete_cb, NULL,
+                                         NULL);
+                       if (rc)
+                               GOTO(out_close, rc);
+               }
+               break;
+       }
+       default:
+               CERROR("%s: Unknown ioctl cmd %#x\n",
+                      ctxt->loc_obd->obd_name, cmd);
+               GOTO(out_close, rc = -ENOTTY);
+       }
+
+out_close:
+       if (handle->lgh_hdr &&
+           handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               llog_cat_close(env, handle);
+       else
+               llog_close(env, handle);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644 (file)
index 0000000..7e12dc6
--- /dev/null
@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "llog_internal.h"
+
+#if defined(LLOG_LVFS)
+
+static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
+                               int len, int index)
+{
+       struct llog_rec_hdr rec = { 0 };
+       struct llog_rec_tail tail;
+       int rc;
+       ENTRY;
+
+       LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+       tail.lrt_len = rec.lrh_len = len;
+       tail.lrt_index = rec.lrh_index = index;
+       rec.lrh_type = LLOG_PAD_MAGIC;
+
+       rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing padding record: rc %d\n", rc);
+               goto out;
+       }
+
+       file->f_pos += len - sizeof(rec) - sizeof(tail);
+       rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),
+                                &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing padding record: rc %d\n", rc);
+               goto out;
+       }
+
+ out:
+       RETURN(rc);
+}
+
+static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
+                               struct llog_rec_hdr *rec, void *buf, loff_t off)
+{
+       int rc;
+       struct llog_rec_tail end;
+       loff_t saved_off = file->f_pos;
+       int buflen = rec->lrh_len;
+
+       ENTRY;
+
+       file->f_pos = off;
+
+       if (buflen == 0)
+               CWARN("0-length record\n");
+
+       if (!buf) {
+               rc = fsfilt_write_record(obd, file, rec, buflen,
+                                        &file->f_pos, 0);
+               if (rc) {
+                       CERROR("error writing log record: rc %d\n", rc);
+                       goto out;
+               }
+               GOTO(out, rc = 0);
+       }
+
+       /* the buf case */
+       rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
+       rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log hdr: rc %d\n", rc);
+               goto out;
+       }
+
+       rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log buffer: rc %d\n", rc);
+               goto out;
+       }
+
+       end.lrt_len = rec->lrh_len;
+       end.lrt_index = rec->lrh_index;
+       rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
+       if (rc) {
+               CERROR("error writing log tail: rc %d\n", rc);
+               goto out;
+       }
+
+       rc = 0;
+ out:
+       if (saved_off > file->f_pos)
+               file->f_pos = saved_off;
+       LASSERT(rc <= 0);
+       RETURN(rc);
+}
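+
+/*
+ * On-disk layout produced by llog_lvfs_write_blob() when buf is non-NULL:
+ *
+ *     [llog_rec_hdr][buf][llog_rec_tail]
+ *
+ * with lrh_len/lrt_len covering all three pieces.  With a NULL buf the
+ * caller's rec is assumed to embed its own tail and is written verbatim.
+ */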
+
+static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
+                               void *buf, int size, loff_t off)
+{
+       loff_t offset = off;
+       int rc;
+       ENTRY;
+
+       rc = fsfilt_read_record(obd, file, buf, size, &offset);
+       if (rc) {
+               CERROR("error reading log record: rc %d\n", rc);
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static int llog_lvfs_read_header(const struct lu_env *env,
+                                struct llog_handle *handle)
+{
+       struct obd_device *obd;
+       int rc;
+       ENTRY;
+
+       LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+       obd = handle->lgh_ctxt->loc_exp->exp_obd;
+
+       if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
+               CDEBUG(D_HA, "not reading header from 0-byte log\n");
+               RETURN(LLOG_EEMPTY);
+       }
+
+       rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
+                                LLOG_CHUNK_SIZE, 0);
+       if (rc) {
+               CERROR("error reading log header from %.*s\n",
+                      handle->lgh_file->f_dentry->d_name.len,
+                      handle->lgh_file->f_dentry->d_name.name);
+       } else {
+               struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+                       lustre_swab_llog_hdr(handle->lgh_hdr);
+
+               if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+                       CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
+                              handle->lgh_file->f_dentry->d_name.len,
+                              handle->lgh_file->f_dentry->d_name.name,
+                              llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+                       rc = -EIO;
+               } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+                       CERROR("incorrectly sized log %.*s header: %#x "
+                              "(expected %#x)\n",
+                              handle->lgh_file->f_dentry->d_name.len,
+                              handle->lgh_file->f_dentry->d_name.name,
+                              llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+                       CERROR("you may need to re-run lconf --write_conf.\n");
+                       rc = -EIO;
+               }
+       }
+
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+       handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode);
+
+       RETURN(rc);
+}
+
+/* returns negative on error; 0 on success when reccookie == NULL; 1 otherwise */
+/* appends if idx == -1, otherwise overwrites record idx. */
+static int llog_lvfs_write_rec(const struct lu_env *env,
+                              struct llog_handle *loghandle,
+                              struct llog_rec_hdr *rec,
+                              struct llog_cookie *reccookie, int cookiecount,
+                              void *buf, int idx, struct thandle *th)
+{
+       struct llog_log_hdr *llh;
+       int reclen = rec->lrh_len, index, rc;
+       struct llog_rec_tail *lrt;
+       struct obd_device *obd;
+       struct file *file;
+       size_t left;
+       ENTRY;
+
+       llh = loghandle->lgh_hdr;
+       file = loghandle->lgh_file;
+       obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
+
+       /* record length should not be bigger than LLOG_CHUNK_SIZE */
+       if (buf)
+               rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                     sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+       else
+               rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+       if (rc)
+               RETURN(rc);
+
+       if (buf)
+               /* write_blob adds header and tail to lrh_len. */
+               reclen = sizeof(*rec) + rec->lrh_len +
+                        sizeof(struct llog_rec_tail);
+
+       if (idx != -1) {
+               loff_t saved_offset;
+
+               /* no header: only allowed to insert record 1 */
+               if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
+                       CERROR("idx != -1 in empty log\n");
+                       LBUG();
+               }
+
+               if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+                       RETURN(-EINVAL);
+
+               if (!ext2_test_bit(idx, llh->llh_bitmap))
+                       CERROR("Modify unset record %u\n", idx);
+               if (idx != rec->lrh_index)
+                       CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
+
+               rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+               /* we are done if we only write the header or on error */
+               if (rc || idx == 0)
+                       RETURN(rc);
+
+               if (buf) {
+                       /* We assume that caller has set lgh_cur_* */
+                       saved_offset = loghandle->lgh_cur_offset;
+                       CDEBUG(D_OTHER,
+                              "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+                              "offset %llu\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index,
+                              loghandle->lgh_cur_idx, rec->lrh_len,
+                              (long long)(saved_offset - sizeof(*llh)));
+                       if (rec->lrh_index != loghandle->lgh_cur_idx) {
+                               CERROR("modify idx mismatch %u/%d\n",
+                                      idx, loghandle->lgh_cur_idx);
+                               RETURN(-EFAULT);
+                       }
+               } else {
+                       /* Assumes constant lrh_len */
+                       saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+               }
+
+               rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
+               if (rc == 0 && reccookie) {
+                       reccookie->lgc_lgl = loghandle->lgh_id;
+                       reccookie->lgc_index = idx;
+                       rc = 1;
+               }
+               RETURN(rc);
+       }
+
+       /* Make sure that records don't cross a chunk boundary, so we can
+        * process them page-at-a-time if needed.  If it will cross a chunk
+        * boundary, write in a fake (but referenced) entry to pad the chunk.
+        *
+        * We know that llog_current_log() will return a loghandle that is
+        * big enough to hold reclen, so all we care about is padding here.
+        */
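+       /*
+        * Worked example (editor's sketch with assumed numbers): with an
+        * 8192-byte chunk and f_pos == 8000, left == 192.  A 300-byte
+        * record does not fit (192 < 300 + LLOG_MIN_REC_SIZE), so a
+        * 192-byte pad record is written first and the real record then
+        * starts at the next chunk boundary.
+        */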
+       left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
+
+       /* NOTE: padding is a record, but no bit is set */
+       if (left != 0 && left != reclen &&
+           left < (reclen + LLOG_MIN_REC_SIZE)) {
+               index = loghandle->lgh_last_idx + 1;
+               rc = llog_lvfs_pad(obd, file, left, index);
+               if (rc)
+                       RETURN(rc);
+               loghandle->lgh_last_idx++; /* for pad rec */
+       }
+       /* if it's the last idx in log file, then return -ENOSPC */
+       if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+               RETURN(-ENOSPC);
+       loghandle->lgh_last_idx++;
+       index = loghandle->lgh_last_idx;
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       rec->lrh_index = index;
+       if (buf == NULL) {
+               lrt = (struct llog_rec_tail *)
+                       ((char *)rec + rec->lrh_len - sizeof(*lrt));
+               lrt->lrt_len = rec->lrh_len;
+               lrt->lrt_index = rec->lrh_index;
+       }
+       /* The caller should make sure only one process accesses lgh_last_idx;
+        * otherwise the assert below might be hit. */
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("argh, index %u already set in log bitmap?\n", index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       llh->llh_tail.lrt_index = index;
+
+       rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
+       if (rc)
+               RETURN(rc);
+
+       CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+              POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+       if (rc == 0 && reccookie) {
+               reccookie->lgc_lgl = loghandle->lgh_id;
+               reccookie->lgc_index = index;
+               if ((rec->lrh_type == MDS_UNLINK_REC) ||
+                   (rec->lrh_type == MDS_SETATTR64_REC))
+                       reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+               else if (rec->lrh_type == OST_SZ_REC)
+                       reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+               else
+                       reccookie->lgc_subsys = -1;
+               rc = 1;
+       }
+       if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
+               rc = 1;
+
+       RETURN(rc);
+}
+
+/* We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping.  If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records. */
+
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+       if (goal <= curr)
+               return;
+       *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
+               ~(LLOG_CHUNK_SIZE - 1);
+}
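+
+/*
+ * Worked example for llog_skip_over() (editor's sketch; the constants are
+ * assumed, e.g. LLOG_MIN_REC_SIZE == 32 and LLOG_CHUNK_SIZE == 8192):
+ * going from curr == 1 to goal == 1000 with *off == 0 advances *off by
+ * 998 * 32 == 31936 bytes and then rounds down to the chunk boundary at
+ * 24576, so three whole chunks are skipped without being read.
+ */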
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ */
+static int llog_lvfs_next_block(const struct lu_env *env,
+                               struct llog_handle *loghandle, int *cur_idx,
+                               int next_idx, __u64 *cur_offset, void *buf,
+                               int len)
+{
+       int rc;
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+              next_idx, *cur_idx, *cur_offset);
+
+       while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos;
+               int llen;
+
+               llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+               /* read up to next LLOG_CHUNK_SIZE block */
+               ppos = *cur_offset;
+               llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, llen,
+                                       cur_offset);
+               if (rc < 0) {
+                       CERROR("Cant read llog block at log id "DOSTID
+                              "/%u offset "LPU64"\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen,
+                              *cur_offset);
+                       RETURN(rc);
+               }
+
+               /* put number of bytes read into rc to make code simpler */
+               rc = *cur_offset - ppos;
+               if (rc < len) {
+                       /* signal the end of the valid buffer to llog_process */
+                       memset(buf + rc, 0, len - rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       RETURN(0);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       RETURN(-EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               *cur_idx = tail->lrt_index;
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("Invalid llog tail at log id "DOSTID"/%u offset "
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       RETURN(-EINVAL);
+               }
+               if (tail->lrt_index < next_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > next_idx) {
+                       CERROR("missed desired record? %u > %u\n",
+                              rec->lrh_index, next_idx);
+                       RETURN(-ENOENT);
+               }
+               RETURN(0);
+       }
+       RETURN(-EIO);
+}
+
+static int llog_lvfs_prev_block(const struct lu_env *env,
+                               struct llog_handle *loghandle,
+                               int prev_idx, void *buf, int len)
+{
+       __u64 cur_offset;
+       int rc;
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+       cur_offset = LLOG_CHUNK_SIZE;
+       llog_skip_over(&cur_offset, 0, prev_idx);
+
+       while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+               struct llog_rec_hdr *rec, *last_rec;
+               struct llog_rec_tail *tail;
+               loff_t ppos = cur_offset;
+
+               rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+                                       loghandle->lgh_file, buf, len,
+                                       &cur_offset);
+               if (rc < 0) {
+                       CERROR("Cant read llog block at log id "DOSTID
+                              "/%u offset "LPU64"\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen,
+                              cur_offset);
+                       RETURN(rc);
+               }
+
+               /* put number of bytes read into rc to make code simpler */
+               rc = cur_offset - ppos;
+
+               if (rc == 0) /* end of file, nothing to do */
+                       RETURN(0);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       RETURN(-EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)(buf + rc -
+                                               sizeof(struct llog_rec_tail));
+
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)(buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("Invalid llog tail at log id "DOSTID"/%u offset"
+                              LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       RETURN(-EINVAL);
+               }
+               if (tail->lrt_index < prev_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > prev_idx) {
+                       CERROR("missed desired record? %u > %u\n",
+                              rec->lrh_index, prev_idx);
+                       RETURN(-ENOENT);
+               }
+               RETURN(0);
+       }
+       RETURN(-EIO);
+}
+
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
+{
+       char *logname;
+       struct file *filp;
+       int len;
+
+       OBD_ALLOC(logname, PATH_MAX);
+       if (logname == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       len = snprintf(logname, PATH_MAX, "%s/%s", dir, name);
+       if (len >= PATH_MAX - 1) {
+               filp = ERR_PTR(-ENAMETOOLONG);
+       } else {
+               filp = l_filp_open(logname, flags, mode);
+               if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT)
+                       CERROR("logfile creation %s: %ld\n", logname,
+                              PTR_ERR(filp));
+       }
+       OBD_FREE(logname, PATH_MAX);
+       return filp;
+}
+
+static int llog_lvfs_open(const struct lu_env *env,  struct llog_handle *handle,
+                         struct llog_logid *logid, char *name,
+                         enum llog_open_param open_param)
+{
+       struct llog_ctxt        *ctxt = handle->lgh_ctxt;
+       struct l_dentry         *dchild = NULL;
+       struct obd_device       *obd;
+       int                      rc = 0;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       LASSERT(ctxt->loc_exp->exp_obd);
+       obd = ctxt->loc_exp->exp_obd;
+
+       LASSERT(handle);
+       if (logid != NULL) {
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi,
+                                            logid->lgl_ogen);
+               if (IS_ERR(dchild)) {
+                       rc = PTR_ERR(dchild);
+                       CERROR("%s: error looking up logfile #"DOSTID "#%08x:"
+                              " rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               if (dchild->d_inode == NULL) {
+                       l_dput(dchild);
+                       rc = -ENOENT;
+                       CERROR("%s: nonexistent llog #"DOSTID"#%08x:"
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+                                                O_RDWR | O_LARGEFILE);
+               l_dput(dchild);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       handle->lgh_file = NULL;
+                       CERROR("%s: error opening llog #"DOSTID"#%08x:"
+                              "rc = %d\n", ctxt->loc_obd->obd_name,
+                              POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+                       GOTO(out, rc);
+               }
+               handle->lgh_id = *logid;
+       } else if (name) {
+               handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+                                                 O_RDWR | O_LARGEFILE, 0644);
+               if (IS_ERR(handle->lgh_file)) {
+                       rc = PTR_ERR(handle->lgh_file);
+                       handle->lgh_file = NULL;
+                       if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+                               OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+                               if (handle->lgh_name)
+                                       strcpy(handle->lgh_name, name);
+                               else
+                                       GOTO(out, rc = -ENOMEM);
+                               rc = 0;
+                       } else {
+                               GOTO(out, rc);
+                       }
+               } else {
+                       lustre_build_llog_lvfs_oid(&handle->lgh_id,
+                           handle->lgh_file->f_dentry->d_inode->i_ino,
+                           handle->lgh_file->f_dentry->d_inode->i_generation);
+               }
+       } else {
+               LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+               handle->lgh_file = NULL;
+       }
+
+       /* the llog was expected to exist but does not */
+       if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+               GOTO(out_name, rc = -ENOENT);
+
+       RETURN(0);
+out_name:
+       if (handle->lgh_name != NULL)
+               OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+       RETURN(rc);
+}
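+
+/*
+ * Summary of the three open modes above (editor's note):
+ *  - logid != NULL: look up and open an existing log object by id;
+ *  - name != NULL:  open a named log under MOUNT_CONFIGS_DIR; with
+ *                   LLOG_OPEN_NEW a missing file is tolerated and the name
+ *                   is stashed in lgh_name for a later llog_lvfs_create();
+ *  - neither:       LLOG_OPEN_NEW only, the object is created later.
+ */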
+
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+       return (handle->lgh_file != NULL);
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_lvfs_create(const struct lu_env *env,
+                           struct llog_handle *handle,
+                           struct thandle *th)
+{
+       struct llog_ctxt        *ctxt = handle->lgh_ctxt;
+       struct obd_device       *obd;
+       struct l_dentry         *dchild = NULL;
+       struct file             *file;
+       struct obdo             *oa = NULL;
+       int                      rc = 0;
+       int                      open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+       ENTRY;
+
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       obd = ctxt->loc_exp->exp_obd;
+       LASSERT(handle->lgh_file == NULL);
+
+       if (handle->lgh_name) {
+               file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+                                     open_flags, 0644);
+               if (IS_ERR(file))
+                       RETURN(PTR_ERR(file));
+
+               lustre_build_llog_lvfs_oid(&handle->lgh_id,
+                               file->f_dentry->d_inode->i_ino,
+                               file->f_dentry->d_inode->i_generation);
+               handle->lgh_file = file;
+       } else {
+               OBDO_ALLOC(oa);
+               if (oa == NULL)
+                       RETURN(-ENOMEM);
+
+               ostid_set_seq_llog(&oa->o_oi);
+               oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+               rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* FIXME: rationalize the misuse of o_generation in
+                *      this API along with mds_obd_{create,destroy}.
+                *      Hopefully it is only an internal API issue. */
+#define o_generation o_parent_oid
+               dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi,
+                                            oa->o_generation);
+               if (IS_ERR(dchild))
+                       GOTO(out, rc = PTR_ERR(dchild));
+
+               file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+               l_dput(dchild);
+               if (IS_ERR(file))
+                       GOTO(out, rc = PTR_ERR(file));
+               handle->lgh_id.lgl_oi = oa->o_oi;
+               handle->lgh_id.lgl_ogen = oa->o_generation;
+               handle->lgh_file = file;
+out:
+               OBDO_FREE(oa);
+       }
+       RETURN(rc);
+}
+
+static int llog_lvfs_close(const struct lu_env *env,
+                          struct llog_handle *handle)
+{
+       int rc;
+
+       ENTRY;
+
+       if (handle->lgh_file == NULL)
+               RETURN(0);
+       rc = filp_close(handle->lgh_file, 0);
+       if (rc)
+               CERROR("%s: error closing llog #"DOSTID"#%08x: "
+                      "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+                      POSTID(&handle->lgh_id.lgl_oi),
+                      handle->lgh_id.lgl_ogen, rc);
+       handle->lgh_file = NULL;
+       if (handle->lgh_name) {
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+               handle->lgh_name = NULL;
+       }
+       RETURN(rc);
+}
+
+static int llog_lvfs_destroy(const struct lu_env *env,
+                            struct llog_handle *handle)
+{
+       struct dentry *fdentry;
+       struct obdo *oa;
+       struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+       char *dir;
+       void *th;
+       struct inode *inode;
+       int rc, rc1;
+       ENTRY;
+
+       dir = MOUNT_CONFIGS_DIR;
+
+       LASSERT(handle->lgh_file);
+       fdentry = handle->lgh_file->f_dentry;
+       inode = fdentry->d_parent->d_inode;
+       if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
+               struct lvfs_run_ctxt saved;
+               struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
+
+               push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               dget(fdentry);
+               rc = llog_lvfs_close(env, handle);
+               if (rc == 0) {
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+                       rc = ll_vfs_unlink(inode, fdentry, mnt);
+                       mutex_unlock(&inode->i_mutex);
+               }
+               mntput(mnt);
+
+               dput(fdentry);
+               pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+               RETURN(rc);
+       }
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               RETURN(-ENOMEM);
+
+       oa->o_oi = handle->lgh_id.lgl_oi;
+       oa->o_generation = handle->lgh_id.lgl_ogen;
+#undef o_generation
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
+
+       rc = llog_lvfs_close(env, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+       if (IS_ERR(th)) {
+               CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+               GOTO(out, rc = PTR_ERR(th));
+       }
+
+       rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+                        NULL, NULL, NULL, NULL);
+
+       rc1 = fsfilt_commit(obd, inode, th, 0);
+       if (rc == 0 && rc1 != 0)
+               rc = rc1;
+ out:
+       OBDO_FREE(oa);
+       RETURN(rc);
+}
+
+static int llog_lvfs_declare_create(const struct lu_env *env,
+                                   struct llog_handle *res,
+                                   struct thandle *th)
+{
+       return 0;
+}
+
+static int llog_lvfs_declare_write_rec(const struct lu_env *env,
+                                      struct llog_handle *loghandle,
+                                      struct llog_rec_hdr *rec,
+                                      int idx, struct thandle *th)
+{
+       return 0;
+}
+
+struct llog_operations llog_lvfs_ops = {
+       .lop_write_rec          = llog_lvfs_write_rec,
+       .lop_next_block         = llog_lvfs_next_block,
+       .lop_prev_block         = llog_lvfs_prev_block,
+       .lop_read_header        = llog_lvfs_read_header,
+       .lop_create             = llog_lvfs_create,
+       .lop_destroy            = llog_lvfs_destroy,
+       .lop_close              = llog_lvfs_close,
+       .lop_open               = llog_lvfs_open,
+       .lop_exist              = llog_lvfs_exist,
+       .lop_declare_create     = llog_lvfs_declare_create,
+       .lop_declare_write_rec  = llog_lvfs_declare_write_rec,
+};
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !__KERNEL__ */
+struct llog_operations llog_lvfs_ops = {};
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644 (file)
index 0000000..7e22907
--- /dev/null
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd)
+{
+       struct llog_ctxt *ctxt;
+
+       OBD_ALLOC_PTR(ctxt);
+       if (!ctxt)
+               return NULL;
+
+       ctxt->loc_obd = obd;
+       atomic_set(&ctxt->loc_refcount, 1);
+
+       return ctxt;
+}
+
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+       if (ctxt->loc_exp) {
+               class_export_put(ctxt->loc_exp);
+               ctxt->loc_exp = NULL;
+       }
+       if (ctxt->loc_imp) {
+               class_import_put(ctxt->loc_imp);
+               ctxt->loc_imp = NULL;
+       }
+       OBD_FREE_PTR(ctxt);
+}
+
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct obd_llog_group *olg = ctxt->loc_olg;
+       struct obd_device *obd;
+       int rc = 0;
+
+       spin_lock(&olg->olg_lock);
+       if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+               spin_unlock(&olg->olg_lock);
+               return rc;
+       }
+       olg->olg_ctxts[ctxt->loc_idx] = NULL;
+       spin_unlock(&olg->olg_lock);
+
+       obd = ctxt->loc_obd;
+       spin_lock(&obd->obd_dev_lock);
+       /* sync with llog ctxt user thread */
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* obd->obd_starting is needed to handle cleanup in the error
+        * case while the obd is still starting up. */
+       LASSERTF(obd->obd_starting == 1 ||
+                obd->obd_stopping == 1 || obd->obd_set_up == 0,
+                "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+                !!obd->obd_stopping, !!obd->obd_set_up);
+
+       /* cleanup the llog ctxt here */
+       if (CTXTP(ctxt, cleanup))
+               rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+       llog_ctxt_destroy(ctxt);
+       wake_up(&olg->olg_waitq);
+       return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       struct obd_llog_group *olg;
+       int rc, idx;
+       ENTRY;
+
+       LASSERT(ctxt != NULL);
+       LASSERT(ctxt != LP_POISON);
+
+       olg = ctxt->loc_olg;
+       LASSERT(olg != NULL);
+       LASSERT(olg != LP_POISON);
+
+       idx = ctxt->loc_idx;
+
+       /*
+        * Balance the ctxt get taken by the caller of llog_cleanup()
+        */
+       LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+       LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+       llog_ctxt_put(ctxt);
+
+       /*
+        * Try to free the ctxt.
+        */
+       rc = __llog_ctxt_put(env, ctxt);
+       if (rc)
+               CERROR("Error %d while cleaning up ctxt %p\n",
+                      rc, ctxt);
+
+       l_wait_event(olg->olg_waitq,
+                    llog_group_ctxt_null(olg, idx), &lwi);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+              struct obd_llog_group *olg, int index,
+              struct obd_device *disk_obd, struct llog_operations *op)
+{
+       struct llog_ctxt *ctxt;
+       int rc = 0;
+       ENTRY;
+
+       if (index < 0 || index >= LLOG_MAX_CTXTS)
+               RETURN(-EINVAL);
+
+       LASSERT(olg != NULL);
+
+       ctxt = llog_new_ctxt(obd);
+       if (!ctxt)
+               RETURN(-ENOMEM);
+
+       ctxt->loc_obd = obd;
+       ctxt->loc_olg = olg;
+       ctxt->loc_idx = index;
+       ctxt->loc_logops = op;
+       mutex_init(&ctxt->loc_mutex);
+       ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+       ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+       rc = llog_group_set_ctxt(olg, ctxt, index);
+       if (rc) {
+               llog_ctxt_destroy(ctxt);
+               if (rc == -EEXIST) {
+                       ctxt = llog_group_get_ctxt(olg, index);
+                       if (ctxt) {
+                               /*
+                                * mds_lov_update_desc() might call us here
+                                * multiple times, so if the llog is already
+                                * set up then don't do it again.
+                                */
+                               CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+                                      obd->obd_name, index);
+                               LASSERT(ctxt->loc_olg == olg);
+                               LASSERT(ctxt->loc_obd == obd);
+                               LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+                               LASSERT(ctxt->loc_logops == op);
+                               llog_ctxt_put(ctxt);
+                       }
+                       rc = 0;
+               }
+               RETURN(rc);
+       }
+
+       if (op->lop_setup) {
+               if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+                       rc = -EOPNOTSUPP;
+               else
+                       rc = op->lop_setup(env, obd, olg, index, disk_obd);
+       }
+
+       if (rc) {
+               CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+                      obd->obd_name, index, op->lop_setup, rc);
+               llog_group_clear_ctxt(olg, index);
+               llog_ctxt_destroy(ctxt);
+       } else {
+               CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+                      obd->obd_name, index);
+               ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_setup);
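+
+/*
+ * Typical setup/cleanup pairing (editor's sketch; the context index and
+ * device variables are illustrative, error handling elided):
+ *
+ *     rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT,
+ *                     disk_obd, &llog_lvfs_ops);
+ *     ...
+ *     ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+ *     rc = llog_cleanup(env, ctxt);
+ */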
+
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!ctxt)
+               RETURN(0);
+
+       if (CTXTP(ctxt, sync))
+               rc = CTXTP(ctxt, sync)(ctxt, exp, flags);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_sync);
+
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+                struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+                struct llog_cookie *logcookies, int numcookies)
+{
+       int raised, rc;
+       ENTRY;
+
+       if (!ctxt) {
+               CERROR("No ctxt\n");
+               RETURN(-ENODEV);
+       }
+
+       if (ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED)
+               RETURN(-ENXIO);
+
+       CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP);
+       raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+       if (!raised)
+               cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+       rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies,
+                                 numcookies);
+       if (!raised)
+               cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_add);
+
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+               struct lov_stripe_md *lsm, int count,
+               struct llog_cookie *cookies, int flags)
+{
+       int rc;
+       ENTRY;
+
+       if (!ctxt) {
+               CERROR("No ctxt\n");
+               RETURN(-ENODEV);
+       }
+
+       CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+       rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cancel);
+
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                 struct obd_device *disk_obd, int *index)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DT_OP(obd, llog_init, 0);
+       OBD_COUNTER_INCREMENT(obd, llog_init);
+
+       rc = OBP(obd, llog_init)(obd, olg, disk_obd, index);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_init);
+
+int obd_llog_finish(struct obd_device *obd, int count)
+{
+       int rc;
+       ENTRY;
+       OBD_CHECK_DT_OP(obd, llog_finish, 0);
+       OBD_COUNTER_INCREMENT(obd, llog_finish);
+
+       rc = OBP(obd, llog_finish)(obd, count);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(obd_llog_finish);
+
+/* context key constructor/destructor: llog_key_init, llog_key_fini */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+LU_KEY_INIT_GENERIC(llog);
+EXPORT_SYMBOL(llog_thread_key);
+
+int llog_info_init(void)
+{
+       llog_key_init_generic(&llog_thread_key, NULL);
+       lu_context_key_register(&llog_thread_key);
+       return 0;
+}
+
+void llog_info_fini(void)
+{
+       lu_context_key_degister(&llog_thread_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
new file mode 100644 (file)
index 0000000..6dbd21a
--- /dev/null
@@ -0,0 +1,1323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API
+ *
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <dt_object.h>
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/*
+ * TODO:
+ * - multi-chunks or big-declaration approach
+ * - use a unique sequence instead of llog sb tracking unique ids
+ * - re-use the existing environment
+ * - named llog support (at present usable for testing only)
+ * - make llog_origin_connect() work with the OSD API
+ */
+
+static int llog_osd_declare_new_object(const struct lu_env *env,
+                                      struct local_oid_storage *los,
+                                      struct dt_object *o,
+                                      struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+
+       lgi->lgi_attr.la_valid = LA_MODE;
+       lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+       lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+       return local_object_declare_create(env, los, o, &lgi->lgi_attr,
+                                          &lgi->lgi_dof, th);
+}
+
+static int llog_osd_create_new_object(const struct lu_env *env,
+                                     struct local_oid_storage *los,
+                                     struct dt_object *o,
+                                     struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+
+       lgi->lgi_attr.la_valid = LA_MODE;
+       lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+       lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+       return local_object_create(env, los, o, &lgi->lgi_attr,
+                                  &lgi->lgi_dof, th);
+}
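+
+/*
+ * Editor's note: the two helpers above are intentionally mirror images.
+ * The declare variant reserves transaction credits for creating the llog
+ * object (a regular file, mode 0644 = S_IRUGO | S_IWUSR), and the create
+ * variant must then perform exactly what was declared within the same
+ * thandle.
+ */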
+
+static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
+                       loff_t *off, int len, int index, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(th);
+       LASSERT(off);
+       LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+       lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len;
+       lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index;
+       lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC;
+
+       lgi->lgi_buf.lb_buf = &lgi->lgi_lrh;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh);
+       dt_write_lock(env, o, 0);
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing padding record: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
+       lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+       *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail);
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc)
+               CERROR("%s: error writing padding record: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+out:
+       dt_write_unlock(env, o);
+       RETURN(rc);
+}
+
+static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o,
+                              struct llog_rec_hdr *rec, void *buf,
+                              loff_t *off, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       int                      buflen = rec->lrh_len;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(o);
+
+       if (buflen == 0)
+               CWARN("0-length record\n");
+
+       CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n",
+              rec->lrh_type, buf, buflen, *off);
+
+       lgi->lgi_attr.la_valid = LA_SIZE;
+       lgi->lgi_attr.la_size = *off;
+
+       if (!buf) {
+               lgi->lgi_buf.lb_len = buflen;
+               lgi->lgi_buf.lb_buf = rec;
+               rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+               if (rc)
+                       CERROR("%s: error writing log record: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
+       /* the buf case: protect the following 3 writes from concurrent reads */
+       dt_write_lock(env, o, 0);
+       rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail);
+       lgi->lgi_buf.lb_len = sizeof(*rec);
+       lgi->lgi_buf.lb_buf = rec;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing log hdr: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out_unlock, rc);
+       }
+
+       lgi->lgi_buf.lb_len = buflen;
+       lgi->lgi_buf.lb_buf = buf;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc) {
+               CERROR("%s: error writing log buffer: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,  rc);
+               GOTO(out_unlock, rc);
+       }
+
+       lgi->lgi_tail.lrt_len = rec->lrh_len;
+       lgi->lgi_tail.lrt_index = rec->lrh_index;
+       lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+       lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+       if (rc)
+               CERROR("%s: error writing log tail: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+
+out_unlock:
+       dt_write_unlock(env, o);
+
+out:
+       /* cleanup the content written above */
+       if (rc) {
+               dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th,
+                        BYPASS_CAPA);
+               dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA);
+       }
+
+       RETURN(rc);
+}
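+
+/*
+ * On-disk record layout produced by the "buf" case above (editor's note;
+ * widths not to scale):
+ *
+ *   | llog_rec_hdr | payload (buflen bytes) | llog_rec_tail |
+ *
+ * with rec->lrh_len == lrt_len == sizeof(hdr) + buflen + sizeof(tail),
+ * so readers can walk forwards via lrh_len and backwards via lrt_len.
+ */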
+
+static int llog_osd_read_header(const struct lu_env *env,
+                               struct llog_handle *handle)
+{
+       struct llog_rec_hdr     *llh_hdr;
+       struct dt_object        *o;
+       struct llog_thread_info *lgi;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+       o = handle->lgh_obj;
+       LASSERT(o);
+
+       lgi = llog_info(env);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               RETURN(rc);
+
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+
+       if (lgi->lgi_attr.la_size == 0) {
+               CDEBUG(D_HA, "not reading header from 0-byte log\n");
+               RETURN(LLOG_EEMPTY);
+       }
+
+       lgi->lgi_off = 0;
+       lgi->lgi_buf.lb_buf = handle->lgh_hdr;
+       lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
+
+       rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+       if (rc) {
+               CERROR("%s: error reading log header from "DFID": rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      PFID(lu_object_fid(&o->do_lu)), rc);
+               RETURN(rc);
+       }
+
+       llh_hdr = &handle->lgh_hdr->llh_hdr;
+       if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+               lustre_swab_llog_hdr(handle->lgh_hdr);
+
+       if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+               CERROR("%s: bad log %s "DFID" header magic: %#x "
+                      "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name,
+                      handle->lgh_name ? handle->lgh_name : "",
+                      PFID(lu_object_fid(&o->do_lu)),
+                      llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+               RETURN(-EIO);
+       } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+               CERROR("%s: incorrectly sized log %s "DFID" header: "
+                      "%#x (expected %#x)\n"
+                      "you may need to re-run lconf --write_conf.\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      handle->lgh_name ? handle->lgh_name : "",
+                      PFID(lu_object_fid(&o->do_lu)),
+                      llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+               RETURN(-EIO);
+       }
+
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+       RETURN(0);
+}
+
+static int llog_osd_declare_write_rec(const struct lu_env *env,
+                                     struct llog_handle *loghandle,
+                                     struct llog_rec_hdr *rec,
+                                     int idx, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(th);
+       LASSERT(loghandle);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+
+       /* the header is rewritten on every update, so always declare it */
+       rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
+                                    th);
+       if (rc || idx == 0) /* if error or just header */
+               RETURN(rc);
+
+       if (dt_object_exists(o)) {
+               rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+               lgi->lgi_off = lgi->lgi_attr.la_size;
+               LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
+               if (rc)
+                       RETURN(rc);
+
+               rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
+               if (rc)
+                       RETURN(rc);
+       } else {
+               lgi->lgi_off = 0;
+       }
+
+       /* XXX: implement declared window or multi-chunks approach */
+       rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);
+
+       RETURN(rc);
+}
+
+/* returns negative on error; 0 on success if reccookie == NULL; 1 otherwise */
+/* appends if idx == -1, otherwise overwrites record idx. */
+static int llog_osd_write_rec(const struct lu_env *env,
+                             struct llog_handle *loghandle,
+                             struct llog_rec_hdr *rec,
+                             struct llog_cookie *reccookie, int cookiecount,
+                             void *buf, int idx, struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct llog_log_hdr     *llh;
+       int                      reclen = rec->lrh_len;
+       int                      index, rc, old_tail_idx;
+       struct llog_rec_tail    *lrt;
+       struct dt_object        *o;
+       size_t                   left;
+
+       ENTRY;
+
+       LASSERT(env);
+       llh = loghandle->lgh_hdr;
+       LASSERT(llh);
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(th);
+
+       CDEBUG(D_OTHER, "new record %x to "DFID"\n",
+              rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));
+
+       /* record length should not be bigger than LLOG_CHUNK_SIZE */
+       if (buf)
+               rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                     sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+       else
+               rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+       if (rc)
+               RETURN(rc);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               RETURN(rc);
+
+       if (buf)
+               /* write_blob adds header and tail to lrh_len. */
+               reclen = sizeof(*rec) + rec->lrh_len +
+                        sizeof(struct llog_rec_tail);
+
+       if (idx != -1) {
+               /* no header: only allowed to insert record 1 */
+               if (idx != 1 && lgi->lgi_attr.la_size == 0)
+                       LBUG();
+
+               if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+                       RETURN(-EINVAL);
+
+               if (!ext2_test_bit(idx, llh->llh_bitmap))
+                       CERROR("%s: modify unset record %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, idx);
+               if (idx != rec->lrh_index)
+                       CERROR("%s: index mismatch %d %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name, idx,
+                              rec->lrh_index);
+
+               lgi->lgi_off = 0;
+               rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+                                        &lgi->lgi_off, th);
+               /* we are done if we only write the header or on error */
+               if (rc || idx == 0)
+                       RETURN(rc);
+
+               if (buf) {
+                       /* We assume that caller has set lgh_cur_* */
+                       lgi->lgi_off = loghandle->lgh_cur_offset;
+                       CDEBUG(D_OTHER,
+                              "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+                              "offset %llu\n",
+                              POSTID(&loghandle->lgh_id.lgl_oi), idx,
+                              rec->lrh_index,
+                              loghandle->lgh_cur_idx, rec->lrh_len,
+                              (long long)(lgi->lgi_off - sizeof(*llh)));
+                       if (rec->lrh_index != loghandle->lgh_cur_idx) {
+                               CERROR("%s: modify idx mismatch %u/%d\n",
+                                      o->do_lu.lo_dev->ld_obd->obd_name, idx,
+                                      loghandle->lgh_cur_idx);
+                               RETURN(-EFAULT);
+                       }
+               } else {
+                       /* Assumes constant lrh_len */
+                       lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
+               }
+
+               rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+               if (rc == 0 && reccookie) {
+                       reccookie->lgc_lgl = loghandle->lgh_id;
+                       reccookie->lgc_index = idx;
+                       rc = 1;
+               }
+               RETURN(rc);
+       }
+
+       /* Make sure that records don't cross a chunk boundary, so we can
+        * process them page-at-a-time if needed.  If it will cross a chunk
+        * boundary, write in a fake (but referenced) entry to pad the chunk.
+        *
+        * We know that llog_current_log() will return a loghandle that is
+        * big enough to hold reclen, so all we care about is padding here.
+        */
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+       lgi->lgi_off = lgi->lgi_attr.la_size;
+       left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
+       /* NOTE: padding is a record, but no bit is set */
+       if (left != 0 && left != reclen &&
+           left < (reclen + LLOG_MIN_REC_SIZE)) {
+               index = loghandle->lgh_last_idx + 1;
+               rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
+               if (rc)
+                       RETURN(rc);
+               loghandle->lgh_last_idx++; /*for pad rec*/
+       }
+       /* if it's the last idx in log file, then return -ENOSPC */
+       if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+               RETURN(-ENOSPC);
+
+       loghandle->lgh_last_idx++;
+       index = loghandle->lgh_last_idx;
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       rec->lrh_index = index;
+       if (buf == NULL) {
+               lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len -
+                                              sizeof(*lrt));
+               lrt->lrt_len = rec->lrh_len;
+               lrt->lrt_index = rec->lrh_index;
+       }
+       /* The caller should make sure only one process accesses lgh_last_idx;
+        * otherwise the assert below might be hit. */
+       LASSERT(index < LLOG_BITMAP_SIZE(llh));
+       spin_lock(&loghandle->lgh_hdr_lock);
+       if (ext2_set_bit(index, llh->llh_bitmap)) {
+               CERROR("%s: index %u already set in log bitmap\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, index);
+               spin_unlock(&loghandle->lgh_hdr_lock);
+               LBUG(); /* should never happen */
+       }
+       llh->llh_count++;
+       spin_unlock(&loghandle->lgh_hdr_lock);
+       old_tail_idx = llh->llh_tail.lrt_index;
+       llh->llh_tail.lrt_index = index;
+
+       lgi->lgi_off = 0;
+       rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off,
+                                th);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+       if (rc)
+               GOTO(out, rc);
+
+       LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+       lgi->lgi_off = lgi->lgi_attr.la_size;
+
+       rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+
+out:
+       /* cleanup llog for error case */
+       if (rc) {
+               spin_lock(&loghandle->lgh_hdr_lock);
+               ext2_clear_bit(index, llh->llh_bitmap);
+               llh->llh_count--;
+               spin_unlock(&loghandle->lgh_hdr_lock);
+
+               /* restore the header */
+               loghandle->lgh_last_idx--;
+               llh->llh_tail.lrt_index = old_tail_idx;
+               lgi->lgi_off = 0;
+               llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+                                   &lgi->lgi_off, th);
+       }
+
+       CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+              POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+       if (rc == 0 && reccookie) {
+               reccookie->lgc_lgl = loghandle->lgh_id;
+               reccookie->lgc_index = index;
+               if ((rec->lrh_type == MDS_UNLINK_REC) ||
+                   (rec->lrh_type == MDS_SETATTR64_REC))
+                       reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+               else if (rec->lrh_type == OST_SZ_REC)
+                       reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+               else
+                       reccookie->lgc_subsys = -1;
+               rc = 1;
+       }
+       RETURN(rc);
+}
+
+/* We can skip reading at least as many log blocks as the number of
+ * minimum sized log records we are skipping.  If it turns out
+ * that we are not far enough along the log (because the
+ * actual records are larger than minimum size) we just skip
+ * some more records.
+ */
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+       if (goal <= curr)
+               return;
+       *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) &
+               ~(LLOG_CHUNK_SIZE - 1);
+}
+
+/* sets:
+ *  - cur_offset to the furthest point read in the log file
+ *  - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ */
+static int llog_osd_next_block(const struct lu_env *env,
+                              struct llog_handle *loghandle, int *cur_idx,
+                              int next_idx, __u64 *cur_offset, void *buf,
+                              int len)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       struct dt_device        *dt;
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(lgi);
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+              next_idx, *cur_idx, *cur_offset);
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(dt_object_exists(o));
+       dt = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(dt);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       while (*cur_offset < lgi->lgi_attr.la_size) {
+               struct llog_rec_hdr     *rec, *last_rec;
+               struct llog_rec_tail    *tail;
+
+               llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+               /* read up to next LLOG_CHUNK_SIZE block */
+               lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE -
+                                     (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+               lgi->lgi_buf.lb_buf = buf;
+
+               /* Note: a read lock is not needed around the la_size read
+                * done by dt_attr_get() above.  Only two cases matter:
+                * either la_size == cur_offset, in which case the entire
+                * read is skipped, or la_size > cur_offset, in which case
+                * the loop is entered and this thread blocks at
+                * dt_read_lock() until the write completes; dt_read() is
+                * then done with the full length and sees the full data.
+                */
+               dt_read_lock(env, o, 0);
+               rc = dt_read(env, o, &lgi->lgi_buf, cur_offset);
+               dt_read_unlock(env, o);
+               if (rc < 0) {
+                       CERROR("%s: can't read llog block from log "DFID
+                              " offset "LPU64": rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              PFID(lu_object_fid(&o->do_lu)), *cur_offset,
+                              rc);
+                       GOTO(out, rc);
+               }
+
+               if (rc < len) {
+                       /* signal the end of the valid buffer to
+                        * llog_process */
+                       memset(buf + rc, 0, len - rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)((char *)buf + rc -
+                                               sizeof(struct llog_rec_tail));
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               *cur_idx = tail->lrt_index;
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, *cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (tail->lrt_index < next_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > next_idx) {
+                       CERROR("%s: missed desired record? %u > %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              rec->lrh_index, next_idx);
+                       GOTO(out, rc = -ENOENT);
+               }
+               GOTO(out, rc = 0);
+       }
+       GOTO(out, rc = -EIO);
+out:
+       return rc;
+}
+
+static int llog_osd_prev_block(const struct lu_env *env,
+                              struct llog_handle *loghandle,
+                              int prev_idx, void *buf, int len)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o;
+       struct dt_device        *dt;
+       loff_t                   cur_offset;
+       int                      rc;
+
+       ENTRY;
+
+       if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+               RETURN(-EINVAL);
+
+       CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+       LASSERT(loghandle);
+       LASSERT(loghandle->lgh_ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+       LASSERT(dt_object_exists(o));
+       dt = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(dt);
+
+       cur_offset = LLOG_CHUNK_SIZE;
+       llog_skip_over(&cur_offset, 0, prev_idx);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       while (cur_offset < lgi->lgi_attr.la_size) {
+               struct llog_rec_hdr     *rec, *last_rec;
+               struct llog_rec_tail    *tail;
+
+               lgi->lgi_buf.lb_len = len;
+               lgi->lgi_buf.lb_buf = buf;
+               /* It is OK to have locking around dt_read() only, see
+                * comment in llog_osd_next_block for details
+                */
+               dt_read_lock(env, o, 0);
+               rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset);
+               dt_read_unlock(env, o);
+               if (rc < 0) {
+                       CERROR("%s: can't read llog block from log "DFID
+                              " offset "LPU64": rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              PFID(lu_object_fid(&o->do_lu)), cur_offset, rc);
+                       GOTO(out, rc);
+               }
+
+               if (rc == 0) /* end of file, nothing to do */
+                       GOTO(out, rc);
+
+               if (rc < sizeof(*tail)) {
+                       CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               rec = buf;
+               if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+                       lustre_swab_llog_rec(rec);
+
+               tail = (struct llog_rec_tail *)((char *)buf + rc -
+                                               sizeof(struct llog_rec_tail));
+               /* get the last record in block */
+               last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+                                                  le32_to_cpu(tail->lrt_len));
+
+               if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+                       lustre_swab_llog_rec(last_rec);
+               LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+               /* this shouldn't happen */
+               if (tail->lrt_index == 0) {
+                       CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+                              "offset "LPU64"\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              POSTID(&loghandle->lgh_id.lgl_oi),
+                              loghandle->lgh_id.lgl_ogen, cur_offset);
+                       GOTO(out, rc = -EINVAL);
+               }
+               if (tail->lrt_index < prev_idx)
+                       continue;
+
+               /* sanity check that the start of the new buffer is no farther
+                * than the record that we wanted.  This shouldn't happen. */
+               if (rec->lrh_index > prev_idx) {
+                       CERROR("%s: missed desired record? %u > %u\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              rec->lrh_index, prev_idx);
+                       GOTO(out, rc = -ENOENT);
+               }
+               GOTO(out, rc = 0);
+       }
+       GOTO(out, rc = -EIO);
+out:
+       return rc;
+}
+
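+/* Return the llog directory object for @ctxt with a reference held:
+ * either the directory cached in ctxt->loc_dir or, failing that, the
+ * root of the underlying dt device.  The caller must drop the reference
+ * with lu_object_put() when done.
+ */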
+struct dt_object *llog_osd_dir_get(const struct lu_env *env,
+                                  struct llog_ctxt *ctxt)
+{
+       struct dt_device        *dt;
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dir;
+       int                      rc;
+
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       if (ctxt->loc_dir == NULL) {
+               rc = dt_root_get(env, dt, &dti->dti_fid);
+               if (rc)
+                       return ERR_PTR(rc);
+               dir = dt_locate(env, dt, &dti->dti_fid);
+       } else {
+               lu_object_get(&ctxt->loc_dir->do_lu);
+               dir = ctxt->loc_dir;
+       }
+
+       return dir;
+}
+
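+/* Open an llog object in one of three ways: by an explicit @logid, by
+ * @name (looked up in the llog directory, generating a fresh FID if the
+ * name is new and LLOG_OPEN_NEW was passed), or anonymously, in which
+ * case a new FID is always generated.  The object itself is created
+ * later by llog_osd_create().
+ */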
+static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle,
+                        struct llog_logid *logid, char *name,
+                        enum llog_open_param open_param)
+{
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct llog_ctxt                *ctxt = handle->lgh_ctxt;
+       struct dt_object                *o;
+       struct dt_device                *dt;
+       struct ls_device                *ls;
+       struct local_oid_storage        *los;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(env);
+       LASSERT(ctxt);
+       LASSERT(ctxt->loc_exp);
+       LASSERT(ctxt->loc_exp->exp_obd);
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       LASSERT(dt);
+
+       ls = ls_device_get(dt);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG);
+       mutex_unlock(&ls->ls_los_mutex);
+       LASSERT(los);
+       ls_device_put(env, ls);
+
+       LASSERT(handle);
+
+       if (logid != NULL) {
+               logid_to_fid(logid, &lgi->lgi_fid);
+       } else if (name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, ctxt);
+               if (IS_ERR(llog_dir))
+                       GOTO(out, rc = PTR_ERR(llog_dir));
+               dt_read_lock(env, llog_dir, 0);
+               rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid);
+               dt_read_unlock(env, llog_dir);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+                       /* generate fid for new llog */
+                       rc = local_object_fid_generate(env, los,
+                                                      &lgi->lgi_fid);
+               }
+               if (rc < 0)
+                       GOTO(out, rc);
+               OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+               if (handle->lgh_name)
+                       strcpy(handle->lgh_name, name);
+               else
+                       GOTO(out, rc = -ENOMEM);
+       } else {
+               LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param);
+               /* generate fid for new llog */
+               rc = local_object_fid_generate(env, los, &lgi->lgi_fid);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       o = ls_locate(env, ls, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               GOTO(out_name, rc = PTR_ERR(o));
+
+       /* No new llog was requested, so the object must already exist */
+       if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o))
+               GOTO(out_put, rc = -ENOENT);
+
+       fid_to_logid(&lgi->lgi_fid, &handle->lgh_id);
+       handle->lgh_obj = o;
+       handle->private_data = los;
+       LASSERT(handle->lgh_ctxt);
+
+       RETURN(rc);
+
+out_put:
+       lu_object_put(env, &o->do_lu);
+out_name:
+       if (handle->lgh_name != NULL)
+               OBD_FREE(handle->lgh_name, strlen(name) + 1);
+out:
+       dt_los_put(los);
+       RETURN(rc);
+}
+
+static int llog_osd_exist(struct llog_handle *handle)
+{
+       LASSERT(handle->lgh_obj);
+       return (dt_object_exists(handle->lgh_obj) &&
+               !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header));
+}
+
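+/* Declare the changes needed to create the llog object: the new local
+ * object itself, the first chunk of log data and, for a named llog,
+ * the directory entry that makes it visible by name.
+ */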
+static int llog_osd_declare_create(const struct lu_env *env,
+                                  struct llog_handle *res, struct thandle *th)
+{
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct local_oid_storage        *los;
+       struct dt_object                *o;
+       int                              rc;
+
+       ENTRY;
+
+       LASSERT(res->lgh_obj);
+       LASSERT(th);
+
+       /* the object may already have been created by another thread */
+       o = res->lgh_obj;
+       if (dt_object_exists(o))
+               RETURN(0);
+
+       los = res->private_data;
+       LASSERT(los);
+
+       rc = llog_osd_declare_new_object(env, los, o, th);
+       if (rc)
+               RETURN(rc);
+
+       rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th);
+       if (rc)
+               RETURN(rc);
+
+       if (res->lgh_name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+               if (IS_ERR(llog_dir))
+                       RETURN(PTR_ERR(llog_dir));
+               logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+               rc = dt_declare_insert(env, llog_dir,
+                                      (struct dt_rec *)&lgi->lgi_fid,
+                                      (struct dt_key *)res->lgh_name, th);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc)
+                       CERROR("%s: can't declare named llog %s: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              res->lgh_name, rc);
+       }
+       RETURN(rc);
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_osd_create(const struct lu_env *env, struct llog_handle *res,
+                          struct thandle *th)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct local_oid_storage *los;
+       struct dt_object        *o;
+       int                   rc = 0;
+
+       ENTRY;
+
+       LASSERT(env);
+       o = res->lgh_obj;
+       LASSERT(o);
+
+       /* the llog may already have been created */
+       if (dt_object_exists(o))
+               RETURN(-EEXIST);
+
+       los = res->private_data;
+       LASSERT(los);
+
+       dt_write_lock(env, o, 0);
+       if (!dt_object_exists(o))
+               rc = llog_osd_create_new_object(env, los, o, th);
+       else
+               rc = -EEXIST;
+
+       dt_write_unlock(env, o);
+       if (rc)
+               RETURN(rc);
+
+       if (res->lgh_name) {
+               struct dt_object *llog_dir;
+
+               llog_dir = llog_osd_dir_get(env, res->lgh_ctxt);
+               if (IS_ERR(llog_dir))
+                       RETURN(PTR_ERR(llog_dir));
+
+               logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+               dt_read_lock(env, llog_dir, 0);
+               rc = dt_insert(env, llog_dir,
+                              (struct dt_rec *)&lgi->lgi_fid,
+                              (struct dt_key *)res->lgh_name,
+                              th, BYPASS_CAPA, 1);
+               dt_read_unlock(env, llog_dir);
+               lu_object_put(env, &llog_dir->do_lu);
+               if (rc)
+                       CERROR("%s: can't create named llog %s: rc = %d\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              res->lgh_name, rc);
+       }
+       RETURN(rc);
+}
+
+static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle)
+{
+       struct local_oid_storage        *los;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(handle->lgh_obj);
+
+       lu_object_put(env, &handle->lgh_obj->do_lu);
+
+       los = handle->private_data;
+       LASSERT(los);
+       dt_los_put(los);
+
+       if (handle->lgh_name)
+               OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+
+       RETURN(rc);
+}
+
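+/* Destroy the llog object and, for a named llog, delete its directory
+ * entry as well, all within a single local transaction so the name and
+ * the object cannot get out of sync.
+ */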
+static int llog_osd_destroy(const struct lu_env *env,
+                           struct llog_handle *loghandle)
+{
+       struct llog_ctxt        *ctxt;
+       struct dt_object        *o, *llog_dir = NULL;
+       struct dt_device        *d;
+       struct thandle          *th;
+       char                    *name = NULL;
+       int                      rc;
+
+       ENTRY;
+
+       ctxt = loghandle->lgh_ctxt;
+       LASSERT(ctxt);
+
+       o = loghandle->lgh_obj;
+       LASSERT(o);
+
+       d = lu2dt_dev(o->do_lu.lo_dev);
+       LASSERT(d);
+       LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+
+       th = dt_trans_create(env, d);
+       if (IS_ERR(th))
+               RETURN(PTR_ERR(th));
+
+       if (loghandle->lgh_name) {
+               llog_dir = llog_osd_dir_get(env, ctxt);
+               if (IS_ERR(llog_dir))
+                       GOTO(out_trans, rc = PTR_ERR(llog_dir));
+
+               name = loghandle->lgh_name;
+               rc = dt_declare_delete(env, llog_dir,
+                                      (struct dt_key *)name, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+       }
+
+       dt_declare_ref_del(env, o, th);
+
+       rc = dt_declare_destroy(env, o, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       rc = dt_trans_start_local(env, d, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       dt_write_lock(env, o, 0);
+       if (dt_object_exists(o)) {
+               if (name) {
+                       dt_read_lock(env, llog_dir, 0);
+                       rc = dt_delete(env, llog_dir,
+                                      (struct dt_key *) name,
+                                      th, BYPASS_CAPA);
+                       dt_read_unlock(env, llog_dir);
+                       if (rc) {
+                               CERROR("%s: can't remove llog %s: rc = %d\n",
+                                      o->do_lu.lo_dev->ld_obd->obd_name,
+                                      name, rc);
+                               GOTO(out_unlock, rc);
+                       }
+               }
+               dt_ref_del(env, o, th);
+               rc = dt_destroy(env, o, th);
+               if (rc)
+                       GOTO(out_unlock, rc);
+       }
+out_unlock:
+       dt_write_unlock(env, o);
+out_trans:
+       dt_trans_stop(env, d, th);
+       if (llog_dir != NULL)
+               lu_object_put(env, &llog_dir->do_lu);
+       RETURN(rc);
+}
+
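+/* Set up llog-over-OSD for this context: initialize two local OID
+ * storages so new FIDs can be generated, one sequence for anonymous
+ * llogs (FID_SEQ_LLOG) and one for named llogs (FID_SEQ_LLOG_NAME).
+ */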
+static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd,
+                         struct obd_llog_group *olg, int ctxt_idx,
+                         struct obd_device *disk_obd)
+{
+       struct local_oid_storage        *los;
+       struct llog_thread_info         *lgi = llog_info(env);
+       struct llog_ctxt                *ctxt;
+       int                              rc = 0;
+
+       ENTRY;
+
+       LASSERT(obd);
+       LASSERT(olg->olg_ctxts[ctxt_idx]);
+
+       ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]);
+       LASSERT(ctxt);
+
+       /* initialize the data needed to generate new FIDs;
+        * specifically, we need a sequence */
+       lgi->lgi_fid.f_seq = FID_SEQ_LLOG;
+       lgi->lgi_fid.f_oid = 1;
+       lgi->lgi_fid.f_ver = 0;
+       rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+                                   &lgi->lgi_fid, &los);
+       if (rc < 0)
+               return rc;
+
+       lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME;
+       lgi->lgi_fid.f_oid = 1;
+       lgi->lgi_fid.f_ver = 0;
+       rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+                                   &lgi->lgi_fid, &los);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+
+static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct dt_device                *dt;
+       struct ls_device                *ls;
+       struct local_oid_storage        *los, *nlos;
+
+       LASSERT(ctxt->loc_exp->exp_obd);
+       dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+       ls = ls_device_get(dt);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       los = dt_los_find(ls, FID_SEQ_LLOG);
+       nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME);
+       mutex_unlock(&ls->ls_los_mutex);
+       if (los != NULL) {
+               dt_los_put(los);
+               local_oid_storage_fini(env, los);
+       }
+       if (nlos != NULL) {
+               dt_los_put(nlos);
+               local_oid_storage_fini(env, nlos);
+       }
+       ls_device_put(env, ls);
+       return 0;
+}
+
+struct llog_operations llog_osd_ops = {
+       .lop_next_block         = llog_osd_next_block,
+       .lop_prev_block         = llog_osd_prev_block,
+       .lop_read_header        = llog_osd_read_header,
+       .lop_destroy            = llog_osd_destroy,
+       .lop_setup              = llog_osd_setup,
+       .lop_cleanup            = llog_osd_cleanup,
+       .lop_open               = llog_osd_open,
+       .lop_exist              = llog_osd_exist,
+       .lop_declare_create     = llog_osd_declare_create,
+       .lop_create             = llog_osd_create,
+       .lop_declare_write_rec  = llog_osd_declare_write_rec,
+       .lop_write_rec          = llog_osd_write_rec,
+       .lop_close              = llog_osd_close,
+};
+EXPORT_SYMBOL(llog_osd_ops);
+
+/* Read the list of catalog log IDs from the CATALOGS file, creating the
+ * file first if it does not exist yet.  If @idarray is NULL, only the
+ * number of llog IDs stored in the file is returned.
+ */
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count, struct llog_catid *idarray)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       int                      rc, size;
+
+       ENTRY;
+
+       LASSERT(d);
+
+       size = sizeof(*idarray) * count;
+       lgi->lgi_off = idx * sizeof(*idarray);
+
+       lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+       o = dt_locate(env, d, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               RETURN(PTR_ERR(o));
+
+       if (!dt_object_exists(o)) {
+               th = dt_trans_create(env, d);
+               if (IS_ERR(th))
+                       GOTO(out, rc = PTR_ERR(th));
+
+               lgi->lgi_attr.la_valid = LA_MODE;
+               lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+               lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+               rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+                                      &lgi->lgi_dof, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, d, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               dt_write_lock(env, o, 0);
+               if (!dt_object_exists(o))
+                       rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+                                      &lgi->lgi_dof, th);
+               dt_write_unlock(env, o);
+out_trans:
+               dt_trans_stop(env, d, th);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+               CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      lgi->lgi_attr.la_mode);
+               GOTO(out, rc = -ENOENT);
+       }
+
+       CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+              (int)lgi->lgi_attr.la_size, size);
+
+       /* return just number of llogs */
+       if (idarray == NULL) {
+               rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+               GOTO(out, rc);
+       }
+
+       /* read for new ost index or for empty file */
+       memset(idarray, 0, size);
+       if (lgi->lgi_attr.la_size <= lgi->lgi_off)
+               GOTO(out, rc = 0);
+       if (lgi->lgi_attr.la_size < lgi->lgi_off + size)
+               size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+       lgi->lgi_buf.lb_buf = idarray;
+       lgi->lgi_buf.lb_len = size;
+       rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+       if (rc) {
+               CERROR("%s: error reading CATALOGS: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
+       EXIT;
+out:
+       lu_object_put(env, &o->do_lu);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* writes the cat list */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+                         int idx, int count, struct llog_catid *idarray)
+{
+       struct llog_thread_info *lgi = llog_info(env);
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       int                      rc, size;
+
+       if (!count)
+               RETURN(0);
+
+       LASSERT(d);
+
+       size = sizeof(*idarray) * count;
+       lgi->lgi_off = idx * sizeof(*idarray);
+
+       lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+       o = dt_locate(env, d, &lgi->lgi_fid);
+       if (IS_ERR(o))
+               RETURN(PTR_ERR(o));
+
+       if (!dt_object_exists(o))
+               GOTO(out, rc = -ENOENT);
+
+       rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+               CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name,
+                      lgi->lgi_attr.la_mode);
+               GOTO(out, rc = -ENOENT);
+       }
+
+       th = dt_trans_create(env, d);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       rc = dt_trans_start_local(env, d, th);
+       if (rc)
+               GOTO(out_trans, rc);
+
+       lgi->lgi_buf.lb_buf = idarray;
+       lgi->lgi_buf.lb_len = size;
+       rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+       if (rc)
+               CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc);
+out_trans:
+       dt_trans_stop(env, d, th);
+out:
+       lu_object_put(env, &o->do_lu);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644 (file)
index 0000000..dedfecf
--- /dev/null
@@ -0,0 +1,407 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman  <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <lustre_log.h>
+
+static void print_llogd_body(struct llogd_body *d)
+{
+       CDEBUG(D_OTHER, "llogd body: %p\n", d);
+       CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+              POSTID(&d->lgd_logid.lgl_oi));
+       CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+       CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+       CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+       CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+       CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+       CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+       CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+       __swab64s (&fid->f_seq);
+       __swab32s (&fid->f_oid);
+       __swab32s (&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+       if (fid_seq_is_mdt0(oid->oi.oi_seq)) {
+               __swab64s(&oid->oi.oi_id);
+               __swab64s(&oid->oi.oi_seq);
+       } else {
+               lustre_swab_lu_fid(&oid->oi_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+void lustre_swab_llog_id(struct llog_logid *log_id)
+{
+       __swab64s(&log_id->lgl_oi.oi.oi_id);
+       __swab64s(&log_id->lgl_oi.oi.oi_seq);
+        __swab32s(&log_id->lgl_ogen);
+}
+EXPORT_SYMBOL(lustre_swab_llog_id);
+
+void lustre_swab_llogd_body(struct llogd_body *d)
+{
+       ENTRY;
+       print_llogd_body(d);
+       lustre_swab_llog_id(&d->lgd_logid);
+       __swab32s (&d->lgd_ctxt_idx);
+       __swab32s (&d->lgd_llh_flags);
+       __swab32s (&d->lgd_index);
+       __swab32s (&d->lgd_saved_index);
+       __swab32s (&d->lgd_len);
+       __swab64s (&d->lgd_cur_offset);
+       print_llogd_body(d);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d)
+{
+       __swab64s(&d->lgdc_gen.mnt_cnt);
+       __swab64s(&d->lgdc_gen.conn_cnt);
+       lustre_swab_llog_id(&d->lgdc_logid);
+       __swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+       __swab64s (&fid->id);
+       __swab32s (&fid->generation);
+       __swab32s (&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+       __swab64s (&range->lsr_start);
+       __swab64s (&range->lsr_end);
+       __swab32s (&range->lsr_index);
+       __swab32s (&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+       struct llog_rec_tail *tail = NULL;
+
+       __swab32s(&rec->lrh_len);
+       __swab32s(&rec->lrh_index);
+       __swab32s(&rec->lrh_type);
+       __swab32s(&rec->lrh_id);
+
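+       /* The payload depends on the record type: swab the type-specific
+        * fields and remember where the tail lives so it can be swabbed
+        * once at the end.
+        */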
+       switch (rec->lrh_type) {
+       case OST_SZ_REC:
+       {
+               struct llog_size_change_rec *lsc =
+                       (struct llog_size_change_rec *)rec;
+
+               lustre_swab_ll_fid(&lsc->lsc_fid);
+               __swab32s(&lsc->lsc_ioepoch);
+               tail = &lsc->lsc_tail;
+               break;
+       }
+       case MDS_UNLINK_REC:
+       {
+               struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+               __swab64s(&lur->lur_oid);
+               __swab32s(&lur->lur_oseq);
+               __swab32s(&lur->lur_count);
+               tail = &lur->lur_tail;
+               break;
+       }
+       case MDS_UNLINK64_REC:
+       {
+               struct llog_unlink64_rec *lur =
+                       (struct llog_unlink64_rec *)rec;
+
+               lustre_swab_lu_fid(&lur->lur_fid);
+               __swab32s(&lur->lur_count);
+               tail = &lur->lur_tail;
+               break;
+       }
+       case CHANGELOG_REC:
+       {
+               struct llog_changelog_rec *cr =
+                       (struct llog_changelog_rec *)rec;
+
+               __swab16s(&cr->cr.cr_namelen);
+               __swab16s(&cr->cr.cr_flags);
+               __swab32s(&cr->cr.cr_type);
+               __swab64s(&cr->cr.cr_index);
+               __swab64s(&cr->cr.cr_prev);
+               __swab64s(&cr->cr.cr_time);
+               lustre_swab_lu_fid(&cr->cr.cr_tfid);
+               lustre_swab_lu_fid(&cr->cr.cr_pfid);
+               if (CHANGELOG_REC_EXTENDED(&cr->cr)) {
+                       struct llog_changelog_ext_rec *ext =
+                               (struct llog_changelog_ext_rec *)rec;
+
+                       lustre_swab_lu_fid(&ext->cr.cr_sfid);
+                       lustre_swab_lu_fid(&ext->cr.cr_spfid);
+                       tail = &ext->cr_tail;
+               } else {
+                       tail = &cr->cr_tail;
+               }
+               break;
+       }
+       case CHANGELOG_USER_REC:
+       {
+               struct llog_changelog_user_rec *cur =
+                       (struct llog_changelog_user_rec *)rec;
+
+               __swab32s(&cur->cur_id);
+               __swab64s(&cur->cur_endrec);
+               tail = &cur->cur_tail;
+               break;
+       }
+
+       case MDS_SETATTR64_REC:
+       {
+               struct llog_setattr64_rec *lsr =
+                       (struct llog_setattr64_rec *)rec;
+
+               lustre_swab_ost_id(&lsr->lsr_oi);
+               __swab32s(&lsr->lsr_uid);
+               __swab32s(&lsr->lsr_uid_h);
+               __swab32s(&lsr->lsr_gid);
+               __swab32s(&lsr->lsr_gid_h);
+               tail = &lsr->lsr_tail;
+               break;
+       }
+       case OBD_CFG_REC:
+               /* these are swabbed as they are consumed */
+               break;
+       case LLOG_HDR_MAGIC:
+       {
+               struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+               __swab64s(&llh->llh_timestamp);
+               __swab32s(&llh->llh_count);
+               __swab32s(&llh->llh_bitmap_offset);
+               __swab32s(&llh->llh_flags);
+               __swab32s(&llh->llh_size);
+               __swab32s(&llh->llh_cat_idx);
+               tail = &llh->llh_tail;
+               break;
+       }
+       case LLOG_LOGID_MAGIC:
+       {
+               struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+               lustre_swab_llog_id(&lid->lid_id);
+               tail = &lid->lid_tail;
+               break;
+       }
+       case LLOG_GEN_REC:
+       {
+               struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+               __swab64s(&lgr->lgr_gen.mnt_cnt);
+               __swab64s(&lgr->lgr_gen.conn_cnt);
+               tail = &lgr->lgr_tail;
+               break;
+       }
+       case LLOG_PAD_MAGIC:
+               break;
+       default:
+               CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+                      rec->lrh_type, rec);
+       }
+
+       if (tail) {
+               __swab32s(&tail->lrt_len);
+               __swab32s(&tail->lrt_index);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+       CDEBUG(D_OTHER, "llog header: %p\n", h);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+       CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+       CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+       CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+       CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+       CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+       CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+       CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+       CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+       CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+void lustre_swab_llog_hdr(struct llog_log_hdr *h)
+{
+       ENTRY;
+       print_llog_hdr(h);
+
+       lustre_swab_llog_rec(&h->llh_hdr);
+
+       print_llog_hdr(h);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+       int i;
+       ENTRY;
+
+       if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */
+               return;
+       CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+       CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+       if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+               for (i = 0; i < lcfg->lcfg_bufcount; i++)
+                       CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+                              i, lcfg->lcfg_buflens[i]);
+       EXIT;
+}
+
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+       int i;
+       ENTRY;
+
+       __swab32s(&lcfg->lcfg_version);
+
+       if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+               CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+                      lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+               EXIT;
+               return;
+       }
+
+       __swab32s(&lcfg->lcfg_command);
+       __swab32s(&lcfg->lcfg_num);
+       __swab32s(&lcfg->lcfg_flags);
+       __swab64s(&lcfg->lcfg_nid);
+       __swab32s(&lcfg->lcfg_bufcount);
+       for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++)
+               __swab32s(&lcfg->lcfg_buflens[i]);
+
+       print_lustre_cfg(lcfg);
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data */
+struct cfg_marker32 {
+       __u32   cm_step;
+       __u32   cm_flags;
+       __u32   cm_vers;
+       __u32   padding;
+       __u32   cm_createtime;
+       __u32   cm_canceltime;
+       char    cm_tgtname[MTI_NAME_MAXLEN];
+       char    cm_comment[MTI_NAME_MAXLEN];
+};
+
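+/* Bytes of cm_comment that survive conversion from the 32-bit layout:
+ * the comment is truncated by however much the 64-bit timestamps grew
+ * struct cfg_marker relative to struct cfg_marker32.
+ */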
+#define MTI_NAMELEN32    (MTI_NAME_MAXLEN - \
+       (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+       struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker;
+       ENTRY;
+
+       if (swab) {
+               __swab32s(&marker->cm_step);
+               __swab32s(&marker->cm_flags);
+               __swab32s(&marker->cm_vers);
+       }
+       if (size == sizeof(*cm32)) {
+               __u32 createtime, canceltime;
+               /* There was a problem with the original declaration of
+                * cfg_marker on 32-bit systems because it used time_t as
+                * a wire protocol structure, and didn't verify this in
+                * wirecheck.  We now have to convert the offsets of the
+                * later fields in order to work on 32- and 64-bit systems.
+                *
+                * Fortunately, the cm_comment field has no functional use
+                * so can be sacrificed when converting the timestamp size.
+                *
+                * Overwrite fields from the end first, so they are not
+                * clobbered, and use memmove() instead of memcpy() because
+                * the source and target buffers overlap.  bug 16771 */
+               createtime = cm32->cm_createtime;
+               canceltime = cm32->cm_canceltime;
+               memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+               marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+               memmove(marker->cm_tgtname, cm32->cm_tgtname,
+                       sizeof(marker->cm_tgtname));
+               if (swab) {
+                       __swab32s(&createtime);
+                       __swab32s(&canceltime);
+               }
+               marker->cm_createtime = createtime;
+               marker->cm_canceltime = canceltime;
+               CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+                      "for target %s, converting\n",
+                      marker->cm_tgtname);
+       } else if (swab) {
+               __swab64s(&marker->cm_createtime);
+               __swab64s(&marker->cm_canceltime);
+       }
+
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c
new file mode 100644 (file)
index 0000000..d397f78
--- /dev/null
@@ -0,0 +1,1087 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lustre_log.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap. */
+#define LLOG_TEST_RECNUM  (LLOG_CHUNK_SIZE * 8)
+
+static int llog_test_rand;
+static struct obd_uuid uuid = { .uuid = "test_uuid" };
+static struct llog_logid cat_logid;
+
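+/* The smallest legal llog record: just a header and a tail.  The tests
+ * use it to fill logs quickly without caring about the payload.
+ */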
+struct llog_mini_rec {
+       struct llog_rec_hdr     lmr_hdr;
+       struct llog_rec_tail    lmr_tail;
+} __attribute__((packed));
+
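+/* Walk the header bitmap and check that the number of set bits, the
+ * header's record count and the handle's last index are all consistent
+ * with the expected @num_recs.
+ */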
+static int verify_handle(char *test, struct llog_handle *llh, int num_recs)
+{
+       int i;
+       int last_idx = 0;
+       int active_recs = 0;
+
+       for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) {
+               if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) {
+                       last_idx = i;
+                       active_recs++;
+               }
+       }
+
+       if (active_recs != num_recs) {
+               CERROR("%s: expected %d active recs after write, found %d\n",
+                      test, num_recs, active_recs);
+               RETURN(-ERANGE);
+       }
+
+       if (llh->lgh_hdr->llh_count != num_recs) {
+               CERROR("%s: handle->count is %d, expected %d after write\n",
+                      test, llh->lgh_hdr->llh_count, num_recs);
+               RETURN(-ERANGE);
+       }
+
+       if (llh->lgh_last_idx < last_idx) {
+               CERROR("%s: handle->last_idx is %d, expected %d after write\n",
+                      test, llh->lgh_last_idx, last_idx);
+               RETURN(-ERANGE);
+       }
+
+       RETURN(0);
+}
+
+/* Test named-log create/open, close */
+static int llog_test_1(const struct lu_env *env,
+                      struct obd_device *obd, char *name)
+{
+       struct llog_handle      *llh;
+       struct llog_ctxt        *ctxt;
+       int rc;
+       int rc2;
+
+       ENTRY;
+
+       CWARN("1a: create a log with name: %s\n", name);
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       rc = llog_open_create(env, ctxt, &llh, NULL, name);
+       if (rc) {
+               CERROR("1a: llog_create with name %s failed: %d\n", name, rc);
+               GOTO(out, rc);
+       }
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("1a: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       rc = verify_handle("1", llh, 1);
+
+       CWARN("1b: close newly-created log\n");
+out_close:
+       rc2 = llog_close(env, llh);
+       if (rc2) {
+               CERROR("1b: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+out:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+/* Test named-log reopen; returns opened log on success */
+static int llog_test_2(const struct lu_env *env, struct obd_device *obd,
+                      char *name, struct llog_handle **llh)
+{
+       struct llog_ctxt        *ctxt;
+       struct llog_handle      *loghandle;
+       struct llog_logid        logid;
+       int                      rc;
+
+       ENTRY;
+
+       CWARN("2a: re-open a log with name: %s\n", name);
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("2a: re-open log with name %s failed: %d\n", name, rc);
+               GOTO(out_put, rc);
+       }
+
+       rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2a: can't init llog handle: %d\n", rc);
+               GOTO(out_close_llh, rc);
+       }
+
+       rc = verify_handle("2", *llh, 1);
+       if (rc)
+               GOTO(out_close_llh, rc);
+
+       /* XXX: there is a known issue with test 2b: the MGS is not able to
+        * create an anonymous llog, so exit now to let the following tests
+        * run. This is fixed in the upcoming llog-over-OSD code. */
+       GOTO(out_put, rc);
+
+       CWARN("2b: create a log without specified NAME & LOGID\n");
+       rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL);
+       if (rc) {
+               CERROR("2b: create log failed\n");
+               GOTO(out_close_llh, rc);
+       }
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2b: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       logid = loghandle->lgh_id;
+       llog_close(env, loghandle);
+
+       CWARN("2c: re-open the log by LOGID\n");
+       rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("2c: re-open log by LOGID failed\n");
+               GOTO(out_close_llh, rc);
+       }
+
+       rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+       if (rc) {
+               CERROR("2c: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+
+       CWARN("2b: destroy this log\n");
+       rc = llog_destroy(env, loghandle);
+       if (rc)
+               CERROR("2d: destroy log failed\n");
+out_close:
+       llog_close(env, loghandle);
+out_close_llh:
+       if (rc)
+               llog_close(env, *llh);
+out_put:
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+/* Test record writing, single and in bulk */
+static int llog_test_3(const struct lu_env *env, struct obd_device *obd,
+                      struct llog_handle *llh)
+{
+       struct llog_gen_rec      lgr;
+       int                      rc, i;
+       int                      num_recs = 1; /* 1 for the header */
+
+       ENTRY;
+
+       lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr);
+       lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+       CWARN("3a: write one create_rec\n");
+       rc = llog_write(env, llh,  &lgr.lgr_hdr, NULL, 0, NULL, -1);
+       num_recs++;
+       if (rc < 0) {
+               CERROR("3a: write one log record failed: %d\n", rc);
+               RETURN(rc);
+       }
+
+       rc = verify_handle("3a", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3b: write 10 cfg log records with 8 bytes bufs\n");
+       for (i = 0; i < 10; i++) {
+               struct llog_rec_hdr     hdr;
+               char                    buf[8];
+
+               hdr.lrh_len = 8;
+               hdr.lrh_type = OBD_CFG_REC;
+               memset(buf, 0, sizeof(buf));
+               rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1);
+               if (rc < 0) {
+                       CERROR("3b: write 10 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+
+       rc = verify_handle("3b", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3c: write 1000 more log records\n");
+       for (i = 0; i < 1000; i++) {
+               rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+               if (rc < 0) {
+                       CERROR("3c: write 1000 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+
+       rc = verify_handle("3c", llh, num_recs);
+       if (rc)
+               RETURN(rc);
+
+       CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+       for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+               struct llog_rec_hdr     hdr;
+               char                    buf_even[24];
+               char                    buf_odd[32];
+
+               memset(buf_odd, 0, sizeof(buf_odd));
+               memset(buf_even, 0, sizeof(buf_even));
+               if ((i % 2) == 0) {
+                       hdr.lrh_len = 24;
+                       hdr.lrh_type = OBD_CFG_REC;
+                       rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1);
+               } else {
+                       hdr.lrh_len = 32;
+                       hdr.lrh_type = OBD_CFG_REC;
+                       rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1);
+               }
+               if (rc == -ENOSPC) {
+                       break;
+               } else if (rc < 0) {
+                       CERROR("3d: write recs failed at #%d: %d\n",
+                              i + 1, rc);
+                       RETURN(rc);
+               }
+               num_recs++;
+       }
+       if (rc != -ENOSPC) {
+               CWARN("3d: writing more records than the bitmap holds did not fail with -ENOSPC!\n");
+               RETURN(-EINVAL);
+       }
+       CWARN("3d: wrote %d records before the end of the llog was reached\n",
+             num_recs);
+
+       rc = verify_handle("3d", llh, num_recs);
+
+       RETURN(rc);
+}
+
+/* Test catalogue additions */
+static int llog_test_4(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *cath;
+       char                     name[10];
+       int                      rc, rc2, i, buflen;
+       struct llog_mini_rec     lmr;
+       struct llog_cookie       cookie;
+       struct llog_ctxt        *ctxt;
+       int                      num_recs = 0;
+       char                    *buf;
+       struct llog_rec_hdr      rec;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+       lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+       sprintf(name, "%x", llog_test_rand + 1);
+       CWARN("4a: create a catalog log with name: %s\n", name);
+       rc = llog_open_create(env, ctxt, &cath, NULL, name);
+       if (rc) {
+               CERROR("4a: llog_create with name %s failed: %d\n", name, rc);
+               GOTO(ctxt_release, rc);
+       }
+       rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid);
+       if (rc) {
+               CERROR("4a: can't init llog handle: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       num_recs++;
+       cat_logid = cath->lgh_id;
+
+       CWARN("4b: write 1 record into the catalog\n");
+       rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL);
+       if (rc != 1) {
+               CERROR("4b: write 1 catalog record failed at: %d\n", rc);
+               GOTO(out, rc);
+       }
+       num_recs++;
+       rc = verify_handle("4b", cath, 2);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4c: cancel 1 log record\n");
+       rc = llog_cat_cancel_records(env, cath, 1, &cookie);
+       if (rc) {
+               CERROR("4c: cancel 1 catalog based record failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       num_recs--;
+
+       rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM);
+       for (i = 0; i < LLOG_TEST_RECNUM; i++) {
+               rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL);
+               if (rc) {
+                       CERROR("4d: write %d records failed at #%d: %d\n",
+                              LLOG_TEST_RECNUM, i + 1, rc);
+                       GOTO(out, rc);
+               }
+               num_recs++;
+       }
+
+       /* make sure new plain llog appears */
+       rc = verify_handle("4d", cath, 3);
+       if (rc)
+               GOTO(out, rc);
+
+       CWARN("4e: add 5 large records, one record per block\n");
+       buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+                sizeof(struct llog_rec_tail);
+       OBD_ALLOC(buf, buflen);
+       if (buf == NULL)
+               GOTO(out, rc = -ENOMEM);
+       for (i = 0; i < 5; i++) {
+               rec.lrh_len = buflen;
+               rec.lrh_type = OBD_CFG_REC;
+               rc = llog_cat_add(env, cath, &rec, NULL, buf);
+               if (rc) {
+                       CERROR("4e: write 5 records failed at #%d: %d\n",
+                              i + 1, rc);
+                       GOTO(out_free, rc);
+               }
+               num_recs++;
+       }
+out_free:
+       OBD_FREE(buf, buflen);
+out:
+       CWARN("4f: put newly-created catalog\n");
+       rc2 = llog_cat_close(env, cath);
+       if (rc2) {
+               CERROR("4: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+ctxt_release:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+static int cat_counter;
+
+static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                       struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_logid_rec   *lir = (struct llog_logid_rec *)rec;
+       struct lu_fid            fid = {0};
+
+       if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+               CERROR("invalid record in catalog\n");
+               RETURN(-EINVAL);
+       }
+
+       logid_to_fid(&lir->lid_id, &fid);
+
+       CWARN("seeing record at index %d - "DFID" in log "DFID"\n",
+             rec->lrh_index, PFID(&fid),
+             PFID(lu_object_fid(&llh->lgh_obj->do_lu)));
+
+       cat_counter++;
+
+       RETURN(0);
+}
+
+static int plain_counter;
+
+static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                         struct llog_rec_hdr *rec, void *data)
+{
+       struct lu_fid fid = {0};
+
+       if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+               CERROR("log is not plain\n");
+               RETURN(-EINVAL);
+       }
+
+       logid_to_fid(&llh->lgh_id, &fid);
+
+       CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n",
+              rec->lrh_index, PFID(&fid));
+
+       plain_counter++;
+
+       RETURN(0);
+}
+
+static int cancel_count;
+
+static int llog_cancel_rec_cb(const struct lu_env *env,
+                             struct llog_handle *llh,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct llog_cookie cookie;
+
+       if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+               CERROR("log is not plain\n");
+               RETURN(-EINVAL);
+       }
+
+       cookie.lgc_lgl = llh->lgh_id;
+       cookie.lgc_index = rec->lrh_index;
+
+       llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie);
+       cancel_count++;
+       if (cancel_count == LLOG_TEST_RECNUM)
+               RETURN(-LLOG_EEMPTY);
+       RETURN(0);
+}
+
+/* Test log and catalogue processing */
+static int llog_test_5(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *llh = NULL;
+       int                      rc, rc2;
+       struct llog_mini_rec     lmr;
+       struct llog_ctxt        *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+       lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+       CWARN("5a: re-open catalog by id\n");
+       rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("5a: llog_create with logid failed: %d\n", rc);
+               GOTO(out_put, rc);
+       }
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+       if (rc) {
+               CERROR("5a: can't init llog handle: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       CWARN("5b: print the catalog entries.. we expect 2\n");
+       cat_counter = 0;
+       rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+       if (rc) {
+               CERROR("5b: process with cat_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (cat_counter != 2) {
+               CERROR("5b: %d entries in catalog\n", cat_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM);
+       cancel_count = 0;
+       rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0);
+       if (rc != -LLOG_EEMPTY) {
+               CERROR("5c: process with cat_cancel_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       CWARN("5c: print the catalog entries.. we expect 1\n");
+       cat_counter = 0;
+       rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+       if (rc) {
+               CERROR("5c: process with cat_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (cat_counter != 1) {
+               CERROR("5c: %d entries in catalog\n", cat_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5d: add 1 record to the log with many canceled empty pages\n");
+       rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL);
+       if (rc) {
+               CERROR("5d: add record to the log with many canceled empty "
+                      "pages failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("5e: print plain log entries.. expect 6\n");
+       plain_counter = 0;
+       rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0);
+       if (rc) {
+               CERROR("5e: process with plain_print_cb failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+       if (plain_counter != 6) {
+               CERROR("5e: found %d records\n", plain_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CWARN("5f: print plain log entries reversely.. expect 6\n");
+       plain_counter = 0;
+       rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar");
+       if (rc) {
+               CERROR("5f: reversely process with plain_print_cb failed:"
+                      "%d\n", rc);
+               GOTO(out, rc);
+       }
+       if (plain_counter != 6) {
+               CERROR("5f: found %d records\n", plain_counter);
+               GOTO(out, rc = -EINVAL);
+       }
+
+out:
+       CWARN("5g: close re-opened catalog\n");
+       rc2 = llog_cat_close(env, llh);
+       if (rc2) {
+               CERROR("5g: close log %s failed: %d\n", name, rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+out_put:
+       llog_ctxt_put(ctxt);
+
+       RETURN(rc);
+}
+
+/* Test client api; open log by name and process */
+static int llog_test_6(const struct lu_env *env, struct obd_device *obd,
+                      char *name)
+{
+       struct obd_device       *mgc_obd;
+       struct llog_ctxt        *ctxt;
+       struct obd_uuid         *mgs_uuid;
+       struct obd_export       *exp;
+       struct obd_uuid          uuid = { "LLOG_TEST6_UUID" };
+       struct llog_handle      *llh = NULL;
+       struct llog_ctxt        *nctxt;
+       int                      rc, rc2;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+       mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
+
+       CWARN("6a: re-open log %s using client API\n", name);
+       mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL);
+       if (mgc_obd == NULL) {
+               CERROR("6a: no MGC devices connected to %s found.\n",
+                      mgs_uuid->uuid);
+               GOTO(ctxt_release, rc = -ENOENT);
+       }
+
+       rc = obd_connect(NULL, &exp, mgc_obd, &uuid,
+                        NULL /* obd_connect_data */, NULL);
+       if (rc != -EALREADY) {
+               CERROR("6a: connect on connected MGC (%s) failed to return"
+                      " -EALREADY", mgc_obd->obd_name);
+               if (rc == 0)
+                       obd_disconnect(exp);
+               GOTO(ctxt_release, rc = -EINVAL);
+       }
+
+       nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
+       rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc) {
+               CERROR("6a: llog_open failed %d\n", rc);
+               GOTO(nctxt_put, rc);
+       }
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc) {
+               CERROR("6a: llog_init_handle failed %d\n", rc);
+               GOTO(parse_out, rc);
+       }
+
+       plain_counter = 1; /* llog header is first record */
+       CWARN("6b: process log %s using client API\n", name);
+       rc = llog_process(env, llh, plain_print_cb, NULL, NULL);
+       if (rc)
+               CERROR("6b: llog_process failed %d\n", rc);
+       CWARN("6b: processed %d records\n", plain_counter);
+
+       rc = verify_handle("6b", llh, plain_counter);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       plain_counter = 1; /* llog header is first record */
+       CWARN("6c: process log %s reversely using client API\n", name);
+       rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL);
+       if (rc)
+               CERROR("6c: llog_reverse_process failed %d\n", rc);
+       CWARN("6c: processed %d records\n", plain_counter);
+
+       rc = verify_handle("6c", llh, plain_counter);
+       if (rc)
+               GOTO(parse_out, rc);
+
+parse_out:
+       rc2 = llog_close(env, llh);
+       if (rc2) {
+               CERROR("6: llog_close failed: rc = %d\n", rc2);
+               if (rc == 0)
+                       rc = rc2;
+       }
+nctxt_put:
+       llog_ctxt_put(nctxt);
+ctxt_release:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+static union {
+       struct llog_rec_hdr             lrh;   /* common header */
+       struct llog_logid_rec           llr;   /* LLOG_LOGID_MAGIC */
+       struct llog_unlink64_rec        lur;   /* MDS_UNLINK64_REC */
+       struct llog_setattr64_rec       lsr64; /* MDS_SETATTR64_REC */
+       struct llog_size_change_rec     lscr;  /* OST_SZ_REC */
+       struct llog_changelog_rec       lcr;   /* CHANGELOG_REC */
+       struct llog_changelog_user_rec  lcur;  /* CHANGELOG_USER_REC */
+       struct llog_gen_rec             lgr;   /* LLOG_GEN_REC */
+} llog_records;
+
+static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh,
+                          struct llog_rec_hdr *rec, void *data)
+{
+       struct lu_fid fid = {0};
+
+       logid_to_fid(&llh->lgh_id, &fid);
+
+       CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n",
+              rec->lrh_type, rec->lrh_index, PFID(&fid));
+
+       plain_counter++;
+       return 0;
+}
+
+static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh,
+                           struct llog_rec_hdr *rec, void *data)
+{
+       plain_counter++;
+       /* test LLOG_DEL_RECORD is working */
+       return LLOG_DEL_RECORD;
+}
+
+static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+       struct llog_handle      *llh;
+       int                      rc = 0, i, process_count;
+       int                      num_recs = 0;
+
+       ENTRY;
+
+       rc = llog_open_create(env, ctxt, &llh, NULL, NULL);
+       if (rc) {
+               CERROR("7_sub: create log failed\n");
+               RETURN(rc);
+       }
+
+       rc = llog_init_handle(env, llh,
+                             LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+                             &uuid);
+       if (rc) {
+               CERROR("7_sub: can't init llog handle: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) {
+               rc = llog_write(env, llh, &llog_records.lrh, NULL, 0,
+                               NULL, -1);
+               if (rc == -ENOSPC) {
+                       break;
+               } else if (rc < 0) {
+                       CERROR("7_sub: write recs failed at #%d: %d\n",
+                              i + 1, rc);
+                       GOTO(out_close, rc);
+               }
+               num_recs++;
+       }
+       if (rc != -ENOSPC) {
+               CWARN("7_sub: write record more than BITMAP size!\n");
+               GOTO(out_close, rc = -EINVAL);
+       }
+
+       rc = verify_handle("7_sub", llh, num_recs + 1);
+       if (rc) {
+               CERROR("7_sub: verify handle failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1)
+               CWARN("7_sub: records are not aligned, written %d from %u\n",
+                     num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1);
+
+       plain_counter = 0;
+       rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL);
+       if (rc) {
+               CERROR("7_sub: llog process failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       process_count = plain_counter;
+       if (process_count != num_recs) {
+               CERROR("7_sub: processed %d records from %d total\n",
+                      process_count, num_recs);
+               GOTO(out_close, rc = -EINVAL);
+       }
+
+       plain_counter = 0;
+       rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+       if (rc) {
+               CERROR("7_sub: reverse llog process failed: %d\n", rc);
+               GOTO(out_close, rc);
+       }
+       if (process_count != plain_counter) {
+               CERROR("7_sub: Reverse/direct processing found different"
+                      "number of records: %d/%d\n",
+                      plain_counter, process_count);
+               GOTO(out_close, rc = -EINVAL);
+       }
+       if (llog_exist(llh)) {
+               CERROR("7_sub: llog exists but should be zapped\n");
+               GOTO(out_close, rc = -EEXIST);
+       }
+
+       rc = verify_handle("7_sub", llh, 1);
+out_close:
+       if (rc)
+               llog_destroy(env, llh);
+       llog_close(env, llh);
+       RETURN(rc);
+}
+
+/* Test all llog records writing and processing */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+
+       CWARN("7a: test llog_logid_rec\n");
+       llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+       llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+       llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7a: llog_logid_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7b: test llog_unlink64_rec\n");
+       llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+       llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+       llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7b: llog_unlink_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7c: test llog_setattr64_rec\n");
+       llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+       llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+       llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7c: llog_setattr64_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7d: test llog_size_change_rec\n");
+       llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+       llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+       llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7d: llog_size_change_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7e: test llog_changelog_rec\n");
+       llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+       llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr);
+       llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7e: llog_changelog_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7f: test llog_changelog_user_rec\n");
+       llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+       llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+       llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7f: llog_changelog_user_rec test failed\n");
+               GOTO(out, rc);
+       }
+
+       CWARN("7g: test llog_gen_rec\n");
+       llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+       llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+       llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+       rc = llog_test_7_sub(env, ctxt);
+       if (rc) {
+               CERROR("7g: llog_size_change_rec test failed\n");
+               GOTO(out, rc);
+       }
+out:
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+
+/* -------------------------------------------------------------------------
+ * Tests above, boring obd functions below
+ * ------------------------------------------------------------------------- */
+static int llog_run_tests(const struct lu_env *env, struct obd_device *obd)
+{
+       struct llog_handle      *llh = NULL;
+       struct llog_ctxt        *ctxt;
+       int                      rc, err;
+       char                     name[10];
+
+       ENTRY;
+       ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+
+       sprintf(name, "%x", llog_test_rand);
+
+       rc = llog_test_1(env, obd, name);
+       if (rc)
+               GOTO(cleanup_ctxt, rc);
+
+       rc = llog_test_2(env, obd, name, &llh);
+       if (rc)
+               GOTO(cleanup_ctxt, rc);
+
+       rc = llog_test_3(env, obd, llh);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_4(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_5(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_6(env, obd, name);
+       if (rc)
+               GOTO(cleanup, rc);
+
+       rc = llog_test_7(env, obd);
+       if (rc)
+               GOTO(cleanup, rc);
+
+cleanup:
+       err = llog_destroy(env, llh);
+       if (err)
+               CERROR("cleanup: llog_destroy failed: %d\n", err);
+       llog_close(env, llh);
+       if (rc == 0)
+               rc = err;
+cleanup_ctxt:
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+
+#ifdef LPROCFS
+static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} };
+static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} };
+static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_llog_test_module_vars;
+       lvars->obd_vars    = lprocfs_llog_test_obd_vars;
+}
+#endif
+
+static int llog_test_cleanup(struct obd_device *obd)
+{
+       struct obd_device       *tgt;
+       struct lu_env            env;
+       int                      rc;
+
+       ENTRY;
+
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               RETURN(rc);
+
+       tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd;
+       rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT));
+       if (rc)
+               CERROR("failed to llog_test_llog_finish: %d\n", rc);
+       lu_env_fini(&env);
+       RETURN(rc);
+}
+
+static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_device       *tgt;
+       struct llog_ctxt        *ctxt;
+       struct dt_object        *o;
+       struct lu_env            env;
+       struct lu_context        test_session;
+       int                      rc;
+
+       ENTRY;
+
+       if (lcfg->lcfg_bufcount < 2) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       if (lcfg->lcfg_buflens[1] < 1) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       /* disk obd */
+       tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+       if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+               CERROR("target device not attached or not set up (%s)\n",
+                      lustre_cfg_string(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+       if (rc)
+               RETURN(rc);
+
+       rc = lu_context_init(&test_session, LCT_SESSION);
+       if (rc)
+               GOTO(cleanup_env, rc);
+       test_session.lc_thread = (struct ptlrpc_thread *)current;
+       lu_context_enter(&test_session);
+       env.le_ses = &test_session;
+
+       CWARN("Setup llog-test device over %s device\n",
+             lustre_cfg_string(lcfg, 1));
+
+       OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+       obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev);
+
+       rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt,
+                       &llog_osd_ops);
+       if (rc)
+               GOTO(cleanup_session, rc);
+
+       /* use MGS llog dir for tests */
+       ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT);
+       LASSERT(ctxt);
+       o = ctxt->loc_dir;
+       llog_ctxt_put(ctxt);
+
+       ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT);
+       LASSERT(ctxt);
+       ctxt->loc_dir = o;
+       llog_ctxt_put(ctxt);
+
+       llog_test_rand = cfs_rand();
+
+       rc = llog_run_tests(&env, tgt);
+       if (rc)
+               llog_test_cleanup(obd);
+cleanup_session:
+       lu_context_exit(&test_session);
+       lu_context_fini(&test_session);
+cleanup_env:
+       lu_env_fini(&env);
+       RETURN(rc);
+}
+
+static struct obd_ops llog_obd_ops = {
+       .o_owner       = THIS_MODULE,
+       .o_setup       = llog_test_setup,
+       .o_cleanup     = llog_test_cleanup,
+};
+
+static int __init llog_test_init(void)
+{
+       struct lprocfs_static_vars lvars;
+
+       lprocfs_llog_test_init_vars(&lvars);
+       return class_register_type(&llog_obd_ops, NULL,
+                                  lvars.module_vars, "llog_test", NULL);
+}
+
+static void __exit llog_test_exit(void)
+{
+       class_unregister_type("llog_test");
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("llog test module");
+MODULE_LICENSE("GPL");
+
+module_init(llog_test_init);
+module_exit(llog_test_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c
new file mode 100644 (file)
index 0000000..3be35a8
--- /dev/null
@@ -0,0 +1,903 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.c
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "local_storage.h"
+
+/* all initialized local storages on this node are linked on this list */
+static LIST_HEAD(ls_list_head);
+static DEFINE_MUTEX(ls_list_mutex);
+
+static int ls_object_init(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_object_conf *unused)
+{
+       struct ls_device        *ls;
+       struct lu_object        *below;
+       struct lu_device        *under;
+
+       ENTRY;
+
+       ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev);
+       under = &ls->ls_osd->dd_lu_dev;
+       below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
+       if (below == NULL)
+               RETURN(-ENOMEM);
+
+       lu_object_add(o, below);
+
+       RETURN(0);
+}
+
+static void ls_object_free(const struct lu_env *env, struct lu_object *o)
+{
+       struct ls_object        *obj = lu2ls_obj(o);
+       struct lu_object_header *h = o->lo_header;
+
+       dt_object_fini(&obj->ls_obj);
+       lu_object_header_fini(h);
+       OBD_FREE_PTR(obj);
+}
+
+struct lu_object_operations ls_lu_obj_ops = {
+       .loo_object_init  = ls_object_init,
+       .loo_object_free  = ls_object_free,
+};
+
+struct lu_object *ls_object_alloc(const struct lu_env *env,
+                                 const struct lu_object_header *_h,
+                                 struct lu_device *d)
+{
+       struct lu_object_header *h;
+       struct ls_object        *o;
+       struct lu_object        *l;
+
+       LASSERT(_h == NULL);
+
+       OBD_ALLOC_PTR(o);
+       if (o != NULL) {
+               l = &o->ls_obj.do_lu;
+               h = &o->ls_header;
+
+               lu_object_header_init(h);
+               dt_object_init(&o->ls_obj, h, d);
+               lu_object_add_top(h, l);
+
+               l->lo_ops = &ls_lu_obj_ops;
+
+               return l;
+       } else {
+               return NULL;
+       }
+}
+
+static struct lu_device_operations ls_lu_dev_ops = {
+       .ldo_object_alloc =     ls_object_alloc
+};
+
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+       struct ls_device *ls, *ret = NULL;
+
+       list_for_each_entry(ls, &ls_list_head, ls_linkage) {
+               if (ls->ls_osd == dev) {
+                       atomic_inc(&ls->ls_refcount);
+                       ret = ls;
+                       break;
+               }
+       }
+       return ret;
+}
+
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+       struct ls_device *ls;
+
+       mutex_lock(&ls_list_mutex);
+       ls = __ls_find_dev(dev);
+       mutex_unlock(&ls_list_mutex);
+
+       return ls;
+}
+
+static struct lu_device_type_operations ls_device_type_ops = {
+       .ldto_start = NULL,
+       .ldto_stop  = NULL,
+};
+
+static struct lu_device_type ls_lu_type = {
+       .ldt_name = "local_storage",
+       .ldt_ops  = &ls_device_type_ops,
+};
+
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+       struct ls_device *ls;
+
+       ENTRY;
+
+       mutex_lock(&ls_list_mutex);
+       ls = __ls_find_dev(dev);
+       if (ls)
+               GOTO(out_ls, ls);
+
+       /* not found, then create */
+       OBD_ALLOC_PTR(ls);
+       if (ls == NULL)
+               GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+       atomic_set(&ls->ls_refcount, 1);
+       INIT_LIST_HEAD(&ls->ls_los_list);
+       mutex_init(&ls->ls_los_mutex);
+
+       ls->ls_osd = dev;
+
+       LASSERT(dev->dd_lu_dev.ld_site);
+       lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type);
+       ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops;
+       ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site;
+
+       /* finally add ls to the list */
+       list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+       mutex_unlock(&ls_list_mutex);
+       RETURN(ls);
+}
+
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+       LASSERT(env);
+       if (!atomic_dec_and_test(&ls->ls_refcount))
+               return;
+
+       mutex_lock(&ls_list_mutex);
+       if (atomic_read(&ls->ls_refcount) == 0) {
+               LASSERT(list_empty(&ls->ls_los_list));
+               list_del(&ls->ls_linkage);
+               lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0);
+               lu_device_fini(&ls->ls_top_dev.dd_lu_dev);
+               OBD_FREE_PTR(ls);
+       }
+       mutex_unlock(&ls_list_mutex);
+}
+
+/**
+ * local file fid generation
+ */
+int local_object_fid_generate(const struct lu_env *env,
+                             struct local_oid_storage *los,
+                             struct lu_fid *fid)
+{
+       LASSERT(los->los_dev);
+       LASSERT(los->los_obj);
+
+       /* Take the next OID.  To keep FIDs unique across reboots we store
+        * the latest generated FID atomically with object creation,
+        * see local_object_create(). */
+
+       mutex_lock(&los->los_id_lock);
+       fid->f_seq = los->los_seq;
+       fid->f_oid = ++los->los_last_oid;
+       fid->f_ver = 0;
+       mutex_unlock(&los->los_id_lock);
+
+       return 0;
+}
+
+int local_object_declare_create(const struct lu_env *env,
+                               struct local_oid_storage *los,
+                               struct dt_object *o, struct lu_attr *attr,
+                               struct dt_object_format *dof,
+                               struct thandle *th)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       /* update fid generation file */
+       if (los != NULL) {
+               LASSERT(dt_object_exists(los->los_obj));
+               rc = dt_declare_record_write(env, los->los_obj,
+                                            sizeof(struct los_ondisk), 0, th);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       rc = dt_declare_create(env, o, attr, NULL, dof, th);
+       if (rc)
+               RETURN(rc);
+
+       dti->dti_lb.lb_buf = NULL;
+       dti->dti_lb.lb_len = sizeof(dti->dti_lma);
+       rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th);
+
+       RETURN(rc);
+}
+
+int local_object_create(const struct lu_env *env,
+                       struct local_oid_storage *los,
+                       struct dt_object *o, struct lu_attr *attr,
+                       struct dt_object_format *dof, struct thandle *th)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       obd_id                   lastid;
+       int                      rc;
+
+       ENTRY;
+
+       rc = dt_create(env, o, attr, NULL, dof, th);
+       if (rc)
+               RETURN(rc);
+
+       if (los == NULL)
+               RETURN(rc);
+
+       LASSERT(los->los_obj);
+       LASSERT(dt_object_exists(los->los_obj));
+
+       /* Many threads can update this concurrently; serialize them here
+        * to avoid the race where one thread takes the value first but
+        * writes it last. */
+       mutex_lock(&los->los_id_lock);
+
+       /* update local oid number on disk so that
+        * we know the last one used after reboot */
+       lastid = cpu_to_le64(los->los_last_oid);
+
+       dti->dti_off = 0;
+       dti->dti_lb.lb_buf = &lastid;
+       dti->dti_lb.lb_len = sizeof(lastid);
+       rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off,
+                            th);
+       mutex_unlock(&los->los_id_lock);
+
+       RETURN(rc);
+}
+
+/*
+ * Create local named object (file, directory or index) in parent directory.
+ */
+struct dt_object *__local_file_create(const struct lu_env *env,
+                                     const struct lu_fid *fid,
+                                     struct local_oid_storage *los,
+                                     struct ls_device *ls,
+                                     struct dt_object *parent,
+                                     const char *name, struct lu_attr *attr,
+                                     struct dt_object_format *dof)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       struct thandle          *th;
+       int                      rc;
+
+       dto = ls_locate(env, ls, fid);
+       if (unlikely(IS_ERR(dto)))
+               RETURN(dto);
+
+       LASSERT(dto != NULL);
+       if (dt_object_exists(dto))
+               GOTO(out, rc = -EEXIST);
+
+       th = dt_trans_create(env, ls->ls_osd);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = local_object_declare_create(env, los, dto, attr, dof, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       if (dti->dti_dof.dof_type == DFT_DIR) {
+               dt_declare_ref_add(env, dto, th);
+               dt_declare_ref_add(env, parent, th);
+       }
+
+       rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       rc = dt_trans_start_local(env, ls->ls_osd, th);
+       if (rc)
+               GOTO(trans_stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       if (dt_object_exists(dto))
+               GOTO(unlock, rc = 0);
+
+       CDEBUG(D_OTHER, "create new object "DFID"\n",
+              PFID(lu_object_fid(&dto->do_lu)));
+       rc = local_object_create(env, los, dto, attr, dof, th);
+       if (rc)
+               GOTO(unlock, rc);
+       LASSERT(dt_object_exists(dto));
+
+       if (dti->dti_dof.dof_type == DFT_DIR) {
+               if (!dt_try_as_dir(env, dto))
+                       GOTO(destroy, rc = -ENOTDIR);
+               /* Add "." and ".." for newly created dir */
+               rc = dt_insert(env, dto, (void *)fid, (void *)".", th,
+                              BYPASS_CAPA, 1);
+               if (rc)
+                       GOTO(destroy, rc);
+               dt_ref_add(env, dto, th);
+               rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu),
+                              (void *)"..", th, BYPASS_CAPA, 1);
+               if (rc)
+                       GOTO(destroy, rc);
+       }
+
+       dt_write_lock(env, parent, 0);
+       rc = dt_insert(env, parent, (const struct dt_rec *)fid,
+                      (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+       if (dti->dti_dof.dof_type == DFT_DIR)
+               dt_ref_add(env, parent, th);
+       dt_write_unlock(env, parent);
+       if (rc)
+               GOTO(destroy, rc);
+destroy:
+       if (rc)
+               dt_destroy(env, dto, th);
+unlock:
+       dt_write_unlock(env, dto);
+trans_stop:
+       dt_trans_stop(env, ls->ls_osd, th);
+out:
+       if (rc) {
+               lu_object_put_nocache(env, &dto->do_lu);
+               dto = ERR_PTR(rc);
+       }
+       RETURN(dto);
+}
+
+/*
+ * Look up and create (if it does not exist) a local named file or directory in
+ * parent directory.
+ */
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+                                           struct local_oid_storage *los,
+                                           struct dt_object *parent,
+                                           const char *name, __u32 mode)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0)
+               /* name is found, get the object */
+               dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+       else if (rc != -ENOENT)
+               dto = ERR_PTR(rc);
+       else {
+               rc = local_object_fid_generate(env, los, &dti->dti_fid);
+               if (rc < 0) {
+                       dto = ERR_PTR(rc);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid  = LA_MODE;
+                       dti->dti_attr.la_mode   = mode;
+                       dti->dti_dof.dof_type   = dt_mode_to_dft(mode & S_IFMT);
+                       dto = __local_file_create(env, &dti->dti_fid, los,
+                                                 dt2ls_dev(los->los_dev),
+                                                 parent, name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_file_find_or_create);
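+
+/*
+ * Illustrative usage sketch (compiled out; not part of this patch's API):
+ * how a caller with an already-initialized local OID storage might
+ * create-or-open a private directory.  "example_get_dir" and the name
+ * "mydir" are hypothetical; "los" and "parent" are assumed to be set up
+ * by the caller.
+ */
+#if 0
+static struct dt_object *example_get_dir(const struct lu_env *env,
+                                        struct local_oid_storage *los,
+                                        struct dt_object *parent)
+{
+       /* creates "mydir" under parent on first use, else just looks it up */
+       return local_file_find_or_create(env, los, parent, "mydir",
+                                        S_IFDIR | S_IRUGO | S_IWUSR |
+                                        S_IXUGO);
+}
+#endif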
+
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+                                                    struct dt_device *dt,
+                                                    const struct lu_fid *fid,
+                                                    struct dt_object *parent,
+                                                    const char *name,
+                                                    __u32 mode)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               dto = dt_locate(env, dt, &dti->dti_fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               struct ls_device *ls;
+
+               ls = ls_device_get(dt);
+               if (IS_ERR(ls)) {
+                       dto = ERR_PTR(PTR_ERR(ls));
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid  = LA_MODE;
+                       dti->dti_attr.la_mode   = mode;
+                       dti->dti_dof.dof_type   = dt_mode_to_dft(mode & S_IFMT);
+                       dto = __local_file_create(env, fid, NULL, ls, parent,
+                                                 name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+                       /* ls_device_put() may finalize the ls device, so
+                        * re-open the object in the original device stack */
+                       if (!IS_ERR(dto)) {
+                               dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+                               lu_object_put_nocache(env, &dto->do_lu);
+                               dto = dt_locate(env, dt, &dti->dti_fid);
+                       }
+                       ls_device_put(env, ls);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_file_find_or_create_with_fid);
+
+/*
+ * Look up and create (if it does not exist) a local named index file in parent
+ * directory.
+ */
+struct dt_object *local_index_find_or_create(const struct lu_env *env,
+                                            struct local_oid_storage *los,
+                                            struct dt_object *parent,
+                                            const char *name, __u32 mode,
+                                            const struct dt_index_features *ft)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               /* name is found, get the object */
+               dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               rc = local_object_fid_generate(env, los, &dti->dti_fid);
+               if (rc < 0) {
+                       dto = ERR_PTR(rc);
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid          = LA_MODE;
+                       dti->dti_attr.la_mode           = mode;
+                       dti->dti_dof.dof_type           = DFT_INDEX;
+                       dti->dti_dof.u.dof_idx.di_feat  = ft;
+                       dto = __local_file_create(env, &dti->dti_fid, los,
+                                                 dt2ls_dev(los->los_dev),
+                                                 parent, name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_index_find_or_create);
+
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+                                   struct dt_device *dt,
+                                   const struct lu_fid *fid,
+                                   struct dt_object *parent,
+                                   const char *name, __u32 mode,
+                                   const struct dt_index_features *ft)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       int                      rc;
+
+       LASSERT(parent);
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == 0) {
+               /* name is found, get the object */
+               if (!lu_fid_eq(fid, &dti->dti_fid))
+                       dto = ERR_PTR(-EINVAL);
+               else
+                       dto = dt_locate(env, dt, fid);
+       } else if (rc != -ENOENT) {
+               dto = ERR_PTR(rc);
+       } else {
+               struct ls_device *ls;
+
+               ls = ls_device_get(dt);
+               if (IS_ERR(ls)) {
+                       dto = ERR_PTR(PTR_ERR(ls));
+               } else {
+                       /* create the object */
+                       dti->dti_attr.la_valid          = LA_MODE;
+                       dti->dti_attr.la_mode           = mode;
+                       dti->dti_dof.dof_type           = DFT_INDEX;
+                       dti->dti_dof.u.dof_idx.di_feat  = ft;
+                       dto = __local_file_create(env, fid, NULL, ls, parent,
+                                                 name, &dti->dti_attr,
+                                                 &dti->dti_dof);
+                       /* ls_device_put() may finalize the ls device, so
+                        * re-open the object in the original device stack */
+                       if (!IS_ERR(dto)) {
+                               dti->dti_fid = dto->do_lu.lo_header->loh_fid;
+                               lu_object_put_nocache(env, &dto->do_lu);
+                               dto = dt_locate(env, dt, &dti->dti_fid);
+                       }
+                       ls_device_put(env, ls);
+               }
+       }
+       return dto;
+}
+EXPORT_SYMBOL(local_index_find_or_create_with_fid);
+
+static int local_object_declare_unlink(const struct lu_env *env,
+                                      struct dt_device *dt,
+                                      struct dt_object *p,
+                                      struct dt_object *c, const char *name,
+                                      struct thandle *th)
+{
+       int rc;
+
+       rc = dt_declare_delete(env, p, (const struct dt_key *)name, th);
+       if (rc < 0)
+               return rc;
+
+       rc = dt_declare_ref_del(env, c, th);
+       if (rc < 0)
+               return rc;
+
+       return dt_declare_destroy(env, c, th);
+}
+
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+                       struct dt_object *parent, const char *name)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *dto;
+       struct thandle          *th;
+       int                      rc;
+
+       ENTRY;
+
+       rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+       if (rc == -ENOENT)
+               RETURN(0);
+       else if (rc < 0)
+               RETURN(rc);
+
+       dto = dt_locate(env, dt, &dti->dti_fid);
+       if (unlikely(IS_ERR(dto)))
+               RETURN(PTR_ERR(dto));
+
+       th = dt_trans_create(env, dt);
+       if (IS_ERR(th))
+               GOTO(out, rc = PTR_ERR(th));
+
+       rc = local_object_declare_unlink(env, dt, parent, dto, name, th);
+       if (rc < 0)
+               GOTO(stop, rc);
+
+       rc = dt_trans_start_local(env, dt, th);
+       if (rc < 0)
+               GOTO(stop, rc);
+
+       dt_write_lock(env, dto, 0);
+       rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA);
+       if (rc < 0)
+               GOTO(unlock, rc);
+
+       rc = dt_ref_del(env, dto, th);
+       if (rc < 0) {
+               rc = dt_insert(env, parent,
+                              (const struct dt_rec *)&dti->dti_fid,
+                              (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+               GOTO(unlock, rc);
+       }
+
+       rc = dt_destroy(env, dto, th);
+unlock:
+       dt_write_unlock(env, dto);
+stop:
+       dt_trans_stop(env, dt, th);
+out:
+       lu_object_put_nocache(env, &dto->do_lu);
+       return rc;
+}
+EXPORT_SYMBOL(local_object_unlink);
+
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq)
+{
+       struct local_oid_storage *los, *ret = NULL;
+
+       list_for_each_entry(los, &ls->ls_los_list, los_list) {
+               if (los->los_seq == seq) {
+                       atomic_inc(&los->los_refcount);
+                       ret = los;
+                       break;
+               }
+       }
+       return ret;
+}
+
+void dt_los_put(struct local_oid_storage *los)
+{
+       if (atomic_dec_and_test(&los->los_refcount))
+               /* should never happen, only local_oid_storage_fini should
+                * drop refcount to zero */
+               LBUG();
+}
+
+/* After the Lustre 2.3 release there may be an old file storing the last
+ * generated FID.  If such a file exists we have to read its content.
+ */
+int lastid_compat_check(const struct lu_env *env, struct dt_device *dev,
+                       __u64 lastid_seq, __u32 *first_oid, struct ls_device *ls)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct dt_object        *root = NULL;
+       struct los_ondisk        losd;
+       struct dt_object        *o = NULL;
+       int                      rc = 0;
+
+       rc = dt_root_get(env, dev, &dti->dti_fid);
+       if (rc)
+               return rc;
+
+       root = ls_locate(env, ls, &dti->dti_fid);
+       if (IS_ERR(root))
+               return PTR_ERR(root);
+
+       /* find old last_id file */
+       snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-"LPX64"-lastid",
+                lastid_seq);
+       rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid);
+       lu_object_put_nocache(env, &root->do_lu);
+       if (rc == -ENOENT) {
+               /* old llog lastid accessed by FID only */
+               if (lastid_seq != FID_SEQ_LLOG)
+                       return 0;
+               dti->dti_fid.f_seq = FID_SEQ_LLOG;
+               dti->dti_fid.f_oid = 1;
+               dti->dti_fid.f_ver = 0;
+               o = ls_locate(env, ls, &dti->dti_fid);
+               if (IS_ERR(o))
+                       return PTR_ERR(o);
+
+               if (!dt_object_exists(o)) {
+                       lu_object_put_nocache(env, &o->do_lu);
+                       return 0;
+               }
+               CDEBUG(D_INFO, "Found old llog lastid file\n");
+       } else if (rc < 0) {
+               return rc;
+       } else {
+               CDEBUG(D_INFO, "Found old lastid file for sequence "LPX64"\n",
+                      lastid_seq);
+               o = ls_locate(env, ls, &dti->dti_fid);
+               if (IS_ERR(o))
+                       return PTR_ERR(o);
+       }
+       /* let's read seq-NNNNNN-lastid file value */
+       LASSERT(dt_object_exists(o));
+       dti->dti_off = 0;
+       dti->dti_lb.lb_buf = &losd;
+       dti->dti_lb.lb_len = sizeof(losd);
+       dt_read_lock(env, o, 0);
+       rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+       dt_read_unlock(env, o);
+       if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) {
+               CERROR("%s: wrong content of seq-"LPX64"-lastid file, magic %x\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq,
+                      le32_to_cpu(losd.lso_magic));
+               rc = -EINVAL;
+       } else if (rc < 0) {
+               CERROR("%s: failed to read seq-"LPX64"-lastid: rc = %d\n",
+                      o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc);
+       }
+       /* drop the reference only after the last use of "o" above */
+       lu_object_put_nocache(env, &o->do_lu);
+       if (rc < 0)
+               return rc;
+       *first_oid = le32_to_cpu(losd.lso_next_oid);
+       return rc;
+}
+
+/**
+ * Initialize local OID storage for the requested sequence.
+ * This may be needed by services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * For each sequence we keep an object with the 'first_fid' identifier
+ * containing the counter for OIDs of locally created files with that
+ * sequence.
+ *
+ * It is currently used by the llog subsystem and by the MGS for NID tables.
+ *
+ * The function takes first_fid to create the counter object.
+ * All dynamic FIDs will be generated with the same sequence and
+ * incrementing OIDs.
+ *
+ * The returned local_oid_storage is the in-memory representation of the
+ * OID storage.
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+                          const struct lu_fid *first_fid,
+                          struct local_oid_storage **los)
+{
+       struct dt_thread_info   *dti = dt_info(env);
+       struct ls_device        *ls;
+       obd_id                   lastid;
+       struct dt_object        *o = NULL;
+       struct thandle          *th;
+       __u32                    first_oid = fid_oid(first_fid);
+       int                      rc = 0;
+
+       ENTRY;
+
+       ls = ls_device_get(dev);
+       if (IS_ERR(ls))
+               RETURN(PTR_ERR(ls));
+
+       mutex_lock(&ls->ls_los_mutex);
+       *los = dt_los_find(ls, fid_seq(first_fid));
+       if (*los != NULL)
+               GOTO(out, rc = 0);
+
+       /* not found, then create */
+       OBD_ALLOC_PTR(*los);
+       if (*los == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       atomic_set(&(*los)->los_refcount, 1);
+       mutex_init(&(*los)->los_id_lock);
+       (*los)->los_dev = &ls->ls_top_dev;
+       atomic_inc(&ls->ls_refcount);
+       list_add(&(*los)->los_list, &ls->ls_los_list);
+
+       /* Use {seq, 0, 0} to create the LAST_ID file for every
+        * sequence.  OIDs start at LUSTRE_FID_INIT_OID.
+        */
+       dti->dti_fid.f_seq = fid_seq(first_fid);
+       dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID;
+       dti->dti_fid.f_ver = 0;
+       o = ls_locate(env, ls, &dti->dti_fid);
+       if (IS_ERR(o))
+               GOTO(out_los, rc = PTR_ERR(o));
+
+       if (!dt_object_exists(o)) {
+               rc = lastid_compat_check(env, dev, fid_seq(first_fid),
+                                        &first_oid, ls);
+               if (rc < 0)
+                       GOTO(out_los, rc);
+
+               th = dt_trans_create(env, dev);
+               if (IS_ERR(th))
+                       GOTO(out_los, rc = PTR_ERR(th));
+
+               dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+               dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+               dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+               rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+                                      &dti->dti_dof, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_declare_record_write(env, o, sizeof(lastid), 0, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               rc = dt_trans_start_local(env, dev, th);
+               if (rc)
+                       GOTO(out_trans, rc);
+
+               dt_write_lock(env, o, 0);
+               if (dt_object_exists(o))
+                       GOTO(out_lock, rc = 0);
+
+               rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+                              th);
+               if (rc)
+                       GOTO(out_lock, rc);
+
+               lastid = cpu_to_le64(first_oid);
+
+               dti->dti_off = 0;
+               dti->dti_lb.lb_buf = &lastid;
+               dti->dti_lb.lb_len = sizeof(lastid);
+               rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+               if (rc)
+                       GOTO(out_lock, rc);
+out_lock:
+               dt_write_unlock(env, o);
+out_trans:
+               dt_trans_stop(env, dev, th);
+       } else {
+               dti->dti_off = 0;
+               dti->dti_lb.lb_buf = &lastid;
+               dti->dti_lb.lb_len = sizeof(lastid);
+               dt_read_lock(env, o, 0);
+               rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+               dt_read_unlock(env, o);
+               if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) {
+                       CERROR("%s: bad oid "LPU64" is read from LAST_ID\n",
+                              o->do_lu.lo_dev->ld_obd->obd_name,
+                              le64_to_cpu(lastid));
+                       rc = -EINVAL;
+               }
+       }
+out_los:
+       if (rc != 0) {
+               list_del(&(*los)->los_list);
+               atomic_dec(&ls->ls_refcount);
+               OBD_FREE_PTR(*los);
+               *los = NULL;
+               if (o != NULL && !IS_ERR(o))
+                       lu_object_put_nocache(env, &o->do_lu);
+       } else {
+               (*los)->los_seq = fid_seq(first_fid);
+               (*los)->los_last_oid = le64_to_cpu(lastid);
+               (*los)->los_obj = o;
+               /* read value should not be less than initial one */
+               LASSERTF((*los)->los_last_oid >= first_oid, "%u < %u\n",
+                        (*los)->los_last_oid, first_oid);
+       }
+out:
+       mutex_unlock(&ls->ls_los_mutex);
+       ls_device_put(env, ls);
+       return rc;
+}
+EXPORT_SYMBOL(local_oid_storage_init);
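+
+/*
+ * Minimal lifecycle sketch (compiled out, hypothetical caller): every
+ * successful local_oid_storage_init() must be balanced by a call to
+ * local_oid_storage_fini().  "dev" and "first_fid" are assumed to be
+ * supplied by the service using the storage.
+ */
+#if 0
+static int example_los_lifecycle(const struct lu_env *env,
+                                struct dt_device *dev,
+                                const struct lu_fid *first_fid)
+{
+       struct local_oid_storage *los = NULL;
+       struct lu_fid             fid;
+       int                       rc;
+
+       rc = local_oid_storage_init(env, dev, first_fid, &los);
+       if (rc)
+               return rc;
+
+       /* allocate the next FID in the sequence of first_fid */
+       rc = local_object_fid_generate(env, los, &fid);
+
+       local_oid_storage_fini(env, los);
+       return rc;
+}
+#endif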
+
+void local_oid_storage_fini(const struct lu_env *env,
+                           struct local_oid_storage *los)
+{
+       struct ls_device *ls;
+
+       if (!atomic_dec_and_test(&los->los_refcount))
+               return;
+
+       LASSERT(env);
+       LASSERT(los->los_dev);
+       ls = dt2ls_dev(los->los_dev);
+
+       mutex_lock(&ls->ls_los_mutex);
+       if (atomic_read(&los->los_refcount) == 0) {
+               if (los->los_obj)
+                       lu_object_put_nocache(env, &los->los_obj->do_lu);
+               list_del(&los->los_list);
+               OBD_FREE_PTR(los);
+       }
+       mutex_unlock(&ls->ls_los_mutex);
+       ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h
new file mode 100644 (file)
index 0000000..d553c37
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+struct ls_device {
+       struct dt_device         ls_top_dev;
+       /* all initialized ls_devices on this node are linked on this list */
+       struct list_head                 ls_linkage;
+       /* how many handle's reference this local storage */
+       atomic_t                 ls_refcount;
+       /* underlying OSD device */
+       struct dt_device        *ls_osd;
+       /* list of all local OID storages */
+       struct list_head                 ls_los_list;
+       struct mutex             ls_los_mutex;
+};
+
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+       return container_of0(d, struct ls_device, ls_top_dev);
+}
+
+struct ls_object {
+       struct lu_object_header  ls_header;
+       struct dt_object         ls_obj;
+};
+
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+       return container_of0(o, struct ls_object, ls_obj.do_lu);
+}
+
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+                                         struct ls_device *ls,
+                                         const struct lu_fid *fid)
+{
+       return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing local object OID storage;
+ * the structure is used with any sequence managed by the local object
+ * library.  Obsolete since 2.4 but kept for compatibility reasons,
+ * see lastid_compat_check() in obdclass/local_storage.c */
+struct los_ondisk {
+       __u32 lso_magic;
+       __u32 lso_next_oid;
+};
+
+#define LOS_MAGIC      0xdecafbee
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
new file mode 100644 (file)
index 0000000..e2d57fe
--- /dev/null
@@ -0,0 +1,562 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Niu Yawei <niu@whamcloud.com>
+ */
+/*
+ * lustre/obdclass/lprocfs_jobstats.c
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+/*
+ * JobID formats & JobID environment variable names for supported
+ * job schedulers:
+ *
+ * SLURM:
+ *   JobID format:  32 bit integer.
+ *   JobID env var: SLURM_JOB_ID.
+ * SGE:
+ *   JobID format:  Decimal integer range to 99999.
+ *   JobID env var: JOB_ID.
+ * LSF:
+ *   JobID format:  6-digit integer by default (up to 999999); can be
+ *                  increased to 10 digits (up to 2147483646).
+ *   JobID env var: LSB_JOBID.
+ * Loadleveler:
+ *   JobID format:  String of machine_name.cluster_id.process_id, for
+ *               example: fr2n02.32.0
+ *   JobID env var: LOADL_STEP_ID.
+ * PBS:
+ *   JobID format:  String of sequence_number[.server_name][@server].
+ *   JobID env var: PBS_JOBID.
+ * Maui/MOAB:
+ *   JobID format:  Same as PBS.
+ *   JobID env var: Same as PBS.
+ */
+
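+/*
+ * Purely illustrative sketch (compiled out) of the scheduler-to-variable
+ * mapping enumerated above; the table below is hypothetical and is not
+ * used anywhere in this file.
+ */
+#if 0
+static const struct {
+       const char *scheduler;
+       const char *jobid_env_var;
+} jobid_vars_example[] = {
+       { "SLURM",       "SLURM_JOB_ID"  },
+       { "SGE",         "JOB_ID"        },
+       { "LSF",         "LSB_JOBID"     },
+       { "Loadleveler", "LOADL_STEP_ID" },
+       { "PBS",         "PBS_JOBID"     },
+       { "Maui/MOAB",   "PBS_JOBID"     },
+};
+#endif
+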
+struct job_stat {
+       struct hlist_node        js_hash;
+       struct list_head         js_list;
+       atomic_t                 js_refcount;
+       char                     js_jobid[JOBSTATS_JOBID_SIZE];
+       time_t                   js_timestamp; /* seconds */
+       struct lprocfs_stats    *js_stats;
+       struct obd_job_stats    *js_jobstats;
+};
+
+static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, strlen(key), mask);
+}
+
+static void *job_stat_key(struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       return job->js_jobid;
+}
+
+static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       return (strlen(job->js_jobid) == strlen(key)) &&
+              !strncmp(job->js_jobid, key, strlen(key));
+}
+
+static void *job_stat_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct job_stat, js_hash);
+}
+
+static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       atomic_inc(&job->js_refcount);
+}
+
+static void job_free(struct job_stat *job)
+{
+       LASSERT(atomic_read(&job->js_refcount) == 0);
+       LASSERT(job->js_jobstats);
+
+       write_lock(&job->js_jobstats->ojs_lock);
+       list_del_init(&job->js_list);
+       write_unlock(&job->js_jobstats->ojs_lock);
+
+       lprocfs_free_stats(&job->js_stats);
+       OBD_FREE_PTR(job);
+}
+
+static void job_putref(struct job_stat *job)
+{
+       LASSERT(atomic_read(&job->js_refcount) > 0);
+       if (atomic_dec_and_test(&job->js_refcount))
+               job_free(job);
+}
+
+static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct job_stat *job;
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       job_putref(job);
+}
+
+static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       CERROR("Should not have any items!");
+}
+
+static cfs_hash_ops_t job_stats_hash_ops = {
+       .hs_hash       = job_stat_hash,
+       .hs_key        = job_stat_key,
+       .hs_keycmp     = job_stat_keycmp,
+       .hs_object     = job_stat_object,
+       .hs_get        = job_stat_get,
+       .hs_put_locked = job_stat_put_locked,
+       .hs_exit       = job_stat_exit,
+};
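+
+/*
+ * Lifetime note: cfs_hash_lookup() takes a reference on the entry via
+ * .hs_get above, so every successful lookup must be balanced with
+ * job_putref(), e.g.:
+ *
+ *     job = cfs_hash_lookup(stats->ojs_hash, jobid);
+ *     if (job != NULL) {
+ *             job->js_timestamp = cfs_time_current_sec();
+ *             job_putref(job);
+ *     }
+ */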
+
+static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                            struct hlist_node *hnode, void *data)
+{
+       time_t oldest = *((time_t *)data);
+       struct job_stat *job;
+
+       job = hlist_entry(hnode, struct job_stat, js_hash);
+       if (!oldest || job->js_timestamp < oldest)
+               cfs_hash_bd_del_locked(hs, bd, hnode);
+
+       return 0;
+}
+
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+{
+       time_t oldest, now;
+
+       if (stats->ojs_cleanup_interval == 0)
+               return;
+
+       now = cfs_time_current_sec();
+       if (!force && now < stats->ojs_last_cleanup +
+                           stats->ojs_cleanup_interval)
+               return;
+
+       oldest = now - stats->ojs_cleanup_interval;
+       cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+                              &oldest);
+       stats->ojs_last_cleanup = cfs_time_current_sec();
+}
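+
+/*
+ * Worked example, assuming the default 600 second cleanup interval: a job
+ * whose js_timestamp was last refreshed at t = 1000 is dropped by the first
+ * cleanup pass running at t >= 1601, since oldest = now - 600 then exceeds
+ * the job's timestamp and job_iter_callback() deletes the entry.
+ */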
+
+static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
+{
+       struct job_stat *job;
+
+       LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
+
+       OBD_ALLOC_PTR(job);
+       if (job == NULL)
+               return NULL;
+
+       job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0);
+       if (job->js_stats == NULL) {
+               OBD_FREE_PTR(job);
+               return NULL;
+       }
+
+       jobs->ojs_cntr_init_fn(job->js_stats);
+
+       memcpy(job->js_jobid, jobid, JOBSTATS_JOBID_SIZE);
+       job->js_timestamp = cfs_time_current_sec();
+       job->js_jobstats = jobs;
+       INIT_HLIST_NODE(&job->js_hash);
+       INIT_LIST_HEAD(&job->js_list);
+       atomic_set(&job->js_refcount, 1);
+
+       return job;
+}
+
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+                         int event, long amount)
+{
+       struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+       struct job_stat *job, *job2;
+       ENTRY;
+
+       LASSERT(stats && stats->ojs_hash);
+
+       lprocfs_job_cleanup(stats, false);
+
+       if (!jobid || !strlen(jobid))
+               RETURN(-EINVAL);
+
+       if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) {
+               CERROR("Invalid jobid size (%lu), expect(%d)\n",
+                      (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE);
+               RETURN(-EINVAL);
+       }
+
+       job = cfs_hash_lookup(stats->ojs_hash, jobid);
+       if (job)
+               goto found;
+
+       job = job_alloc(jobid, stats);
+       if (job == NULL)
+               RETURN(-ENOMEM);
+
+       job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
+                                      &job->js_hash);
+       if (job2 != job) {
+               job_putref(job);
+               job = job2;
+               /* We cannot LASSERT(!list_empty(&job->js_list)) here,
+                * since we just lost the race to insert "job" into
+                * ojs_list and some other thread is doing it _right now_.
+                * Instead, be content that the other thread does the
+                * insertion, since "job2" was already initialized in
+                * job_alloc(). LU-2163 */
+       } else {
+               LASSERT(list_empty(&job->js_list));
+               write_lock(&stats->ojs_lock);
+               list_add_tail(&job->js_list, &stats->ojs_list);
+               write_unlock(&stats->ojs_lock);
+       }
+
+found:
+       LASSERT(stats == job->js_jobstats);
+       LASSERT(stats->ojs_cntr_num > event);
+       job->js_timestamp = cfs_time_current_sec();
+       lprocfs_counter_add(job->js_stats, event, amount);
+
+       job_putref(job);
+       RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_log);
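+
+/*
+ * Usage sketch (hypothetical event index, for illustration only): a server
+ * side request handler would account one "open" request against the
+ * sender's jobid roughly like this, assuming LPROC_MDT_OPEN was registered
+ * by the cntr_init_fn passed to lprocfs_job_stats_init():
+ *
+ *     rc = lprocfs_job_stats_log(obd, jobid, LPROC_MDT_OPEN, 1);
+ */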
+
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{
+       struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+       time_t oldest = 0;
+
+       if (stats->ojs_hash == NULL)
+               return;
+       cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest);
+       cfs_hash_putref(stats->ojs_hash);
+       stats->ojs_hash = NULL;
+       LASSERT(list_empty(&stats->ojs_list));
+}
+EXPORT_SYMBOL(lprocfs_job_stats_fini);
+
+static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct obd_job_stats *stats = p->private;
+       loff_t off = *pos;
+       struct job_stat *job;
+
+       read_lock(&stats->ojs_lock);
+       if (off == 0)
+               return SEQ_START_TOKEN;
+       off--;
+       list_for_each_entry(job, &stats->ojs_list, js_list) {
+               if (!off--)
+                       return job;
+       }
+       return NULL;
+}
+
+static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
+{
+       struct obd_job_stats *stats = p->private;
+
+       read_unlock(&stats->ojs_lock);
+}
+
+static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       struct obd_job_stats *stats = p->private;
+       struct job_stat *job;
+       struct list_head *next;
+
+       ++*pos;
+       if (v == SEQ_START_TOKEN) {
+               next = stats->ojs_list.next;
+       } else {
+               job = (struct job_stat *)v;
+               next = job->js_list.next;
+       }
+
+       return next == &stats->ojs_list ? NULL :
+               list_entry(next, struct job_stat, js_list);
+}
+
+/*
+ * Example of output on MDT:
+ *
+ * job_stats:
+ * - job_id:        test_id.222.25844
+ *   snapshot_time: 1322494486
+ *   open:     { samples:        3, unit: reqs }
+ *   close:    { samples:        3, unit: reqs }
+ *   mknod:    { samples:        0, unit: reqs }
+ *   link:     { samples:        0, unit: reqs }
+ *   unlink:   { samples:        0, unit: reqs }
+ *   mkdir:    { samples:        0, unit: reqs }
+ *   rmdir:    { samples:        0, unit: reqs }
+ *   rename:   { samples:        1, unit: reqs }
+ *   getattr:  { samples:        7, unit: reqs }
+ *   setattr:  { samples:        0, unit: reqs }
+ *   getxattr: { samples:        0, unit: reqs }
+ *   setxattr: { samples:        0, unit: reqs }
+ *   statfs:   { samples:        0, unit: reqs }
+ *   sync:     { samples:        0, unit: reqs }
+ *
+ * Example of output on OST:
+ *
+ * job_stats:
+ * - job_id:        4854
+ *   snapshot_time: 1322494602
+ *   read:     { samples:  0, unit: bytes, min:  0, max:  0, sum:  0 }
+ *   write:    { samples:  1, unit: bytes, min: 10, max: 10, sum: 10 }
+ *   setattr:  { samples:  0, unit: reqs }
+ *   punch:    { samples:  0, unit: reqs }
+ *   sync:     { samples:  0, unit: reqs }
+ */
+
+static const char spaces[] = "             ";
+
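+/* padding needed to align a counter name to "len" columns; the name length
+ * is capped at 15 characters when computing the pad */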
+static inline int width(const char *str, int len)
+{
+       return len - min((int)strlen(str), 15);
+}
+
+static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
+{
+       struct job_stat                 *job = v;
+       struct lprocfs_stats            *s;
+       struct lprocfs_counter          ret;
+       struct lprocfs_counter          *cntr;
+       struct lprocfs_counter_header   *cntr_header;
+       int                             i;
+
+       if (v == SEQ_START_TOKEN) {
+               seq_printf(p, "job_stats:\n");
+               return 0;
+       }
+
+       seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid);
+       seq_printf(p, "  %-16s %ld\n", "snapshot_time:", job->js_timestamp);
+
+       s = job->js_stats;
+       for (i = 0; i < s->ls_num; i++) {
+               cntr = lprocfs_stats_counter_get(s, 0, i);
+               cntr_header = &s->ls_cnt_header[i];
+               lprocfs_stats_collect(s, i, &ret);
+
+               seq_printf(p, "  %s:%.*s { samples: %11"LPF64"u",
+                          cntr_header->lc_name,
+                          width(cntr_header->lc_name, 15), spaces,
+                          ret.lc_count);
+               if (cntr_header->lc_units[0] != '\0')
+                       seq_printf(p, ", unit: %5s", cntr_header->lc_units);
+
+               if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+                       seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u,"
+                                  " sum:%16"LPF64"u",
+                                  ret.lc_count ? ret.lc_min : 0,
+                                  ret.lc_count ? ret.lc_max : 0,
+                                  ret.lc_count ? ret.lc_sum : 0);
+               }
+               if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) {
+                       seq_printf(p, ", sumsq: %18"LPF64"u",
+                                  ret.lc_count ? ret.lc_sumsquare : 0);
+               }
+
+               seq_printf(p, " }\n");
+
+       }
+       return 0;
+}
+
+static const struct seq_operations lprocfs_jobstats_seq_sops = {
+       .start = lprocfs_jobstats_seq_start,
+       .stop  = lprocfs_jobstats_seq_stop,
+       .next  = lprocfs_jobstats_seq_next,
+       .show  = lprocfs_jobstats_seq_show,
+};
+
+static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int rc;
+
+       rc = seq_open(file, &lprocfs_jobstats_seq_sops);
+       if (rc)
+               return rc;
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+       return 0;
+}
+
+static ssize_t lprocfs_jobstats_seq_write(struct file *file,
+                                         const char __user *buf,
+                                         size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_job_stats *stats = seq->private;
+       char jobid[JOBSTATS_JOBID_SIZE];
+       int all = 0;
+       struct job_stat *job;
+
+       if (len == 0 || len >= JOBSTATS_JOBID_SIZE)
+               return -EINVAL;
+
+       /* copy from userspace before inspecting the buffer */
+       memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+       if (copy_from_user(jobid, buf, len))
+               return -EFAULT;
+
+       /* Trim '\n' if any */
+       if (jobid[len - 1] == '\n')
+               jobid[len - 1] = '\0';
+
+       if (!strcmp(jobid, "clear"))
+               all = 1;
+
+       LASSERT(stats->ojs_hash);
+       if (all) {
+               time_t oldest = 0;
+               cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+                                      &oldest);
+               return len;
+       }
+
+       if (!strlen(jobid))
+               return -EINVAL;
+
+       job = cfs_hash_lookup(stats->ojs_hash, jobid);
+       if (!job)
+               return -EINVAL;
+
+       cfs_hash_del_key(stats->ojs_hash, jobid);
+
+       job_putref(job);
+       return len;
+}
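+
+/*
+ * From userspace the write side accepts either the literal string "clear"
+ * (drop every job) or a single jobid (drop one entry), e.g. (the exact
+ * /proc path depends on the obd type and device name):
+ *
+ *     echo clear > /proc/fs/lustre/mdt/<dev>/job_stats
+ */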
+
+static const struct file_operations lprocfs_jobstats_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lprocfs_jobstats_seq_open,
+       .read    = seq_read,
+       .write   = lprocfs_jobstats_seq_write,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+                          cntr_init_callback init_fn)
+{
+       struct proc_dir_entry *entry;
+       struct obd_job_stats *stats;
+       ENTRY;
+
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->obd_type->typ_name);
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) {
+               CERROR("Invalid obd device type.\n");
+               RETURN(-EINVAL);
+       }
+       stats = &obd->u.obt.obt_jobstats;
+
+       LASSERT(stats->ojs_hash == NULL);
+       stats->ojs_hash = cfs_hash_create("JOB_STATS",
+                                         HASH_JOB_STATS_CUR_BITS,
+                                         HASH_JOB_STATS_MAX_BITS,
+                                         HASH_JOB_STATS_BKT_BITS, 0,
+                                         CFS_HASH_MIN_THETA,
+                                         CFS_HASH_MAX_THETA,
+                                         &job_stats_hash_ops,
+                                         CFS_HASH_DEFAULT);
+       if (stats->ojs_hash == NULL)
+               RETURN(-ENOMEM);
+
+       INIT_LIST_HEAD(&stats->ojs_list);
+       rwlock_init(&stats->ojs_lock);
+       stats->ojs_cntr_num = cntr_num;
+       stats->ojs_cntr_init_fn = init_fn;
+       stats->ojs_cleanup_interval = 600; /* 10 mins by default */
+       stats->ojs_last_cleanup = cfs_time_current_sec();
+
+       entry = proc_create_data("job_stats", 0644, obd->obd_proc_entry,
+                                &lprocfs_jobstats_seq_fops, stats);
+       if (entry == NULL)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_init);
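+
+/*
+ * Usage sketch (hypothetical names, for illustration only): an MDT would
+ * wire up its per-job counters at setup time roughly like this, where
+ * mdt_stats_counter_init() is an example cntr_init_fn and LPROC_MDT_OPEN /
+ * LPROC_MDT_LAST are assumed event indices:
+ *
+ *     static void mdt_stats_counter_init(struct lprocfs_stats *stats)
+ *     {
+ *             lprocfs_counter_init(stats, LPROC_MDT_OPEN, 0, "open", "reqs");
+ *             ... one lprocfs_counter_init() call per event index ...
+ *     }
+ *
+ *     rc = lprocfs_job_stats_init(obd, LPROC_MDT_LAST,
+ *                                 mdt_stats_counter_init);
+ */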
+
+int lprocfs_rd_job_interval(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_job_stats *stats;
+
+       LASSERT(obd != NULL);
+       stats = &obd->u.obt.obt_jobstats;
+       return seq_printf(m, "%d\n", stats->ojs_cleanup_interval);
+}
+EXPORT_SYMBOL(lprocfs_rd_job_interval);
+
+int lprocfs_wr_job_interval(struct file *file, const char __user *buffer,
+                           unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_job_stats *stats;
+       int val, rc;
+
+       LASSERT(obd != NULL);
+       stats = &obd->u.obt.obt_jobstats;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       stats->ojs_cleanup_interval = val;
+       lprocfs_job_cleanup(stats, true);
+
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_job_interval);
+
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644 (file)
index 0000000..3b157f8
--- /dev/null
@@ -0,0 +1,1985 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/seq_file.h>
+
+#if defined(LPROCFS)
+
+static int lprocfs_no_percpu_stats = 0;
+CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644,
+               "Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128
+
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+       return single_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+       return seq_release(inode, file);
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+/* lprocfs API calls */
+
+proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                    char *name, void *data,
+                                    struct file_operations *fops)
+{
+       proc_dir_entry_t *proc;
+       mode_t mode = 0;
+
+       if (root == NULL || name == NULL || fops == NULL)
+               return ERR_PTR(-EINVAL);
+
+       if (fops->read)
+               mode = 0444;
+       if (fops->write)
+               mode |= 0200;
+       proc = proc_create_data(name, mode, root, fops, data);
+       if (!proc) {
+               CERROR("LprocFS: No memory to create /proc entry %s", name);
+               return ERR_PTR(-ENOMEM);
+       }
+       return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+                       struct proc_dir_entry *parent, const char *format, ...)
+{
+       struct proc_dir_entry *entry;
+       char *dest;
+       va_list ap;
+
+       if (parent == NULL || format == NULL)
+               return NULL;
+
+       OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+       if (dest == NULL)
+               return NULL;
+
+       va_start(ap, format);
+       vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+       va_end(ap);
+
+       entry = proc_symlink(name, parent, dest);
+       if (entry == NULL)
+               CERROR("LprocFS: Could not create symbolic link from %s to %s",
+                       name, dest);
+
+       OBD_FREE(dest, MAX_STRING_SIZE + 1);
+       return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+static struct file_operations lprocfs_generic_fops = { };
+
+/**
+ * Add /proc entries.
+ *
+ * \param root [in]  The parent proc entry to which the new entries are added.
+ * \param list [in]  Array of proc entries to be added.
+ * \param data [in]  The argument passed to the entries' read/write routines
+ *                   when they are invoked through the /proc files.
+ *
+ * \retval 0   on success
+ * \retval < 0 on error
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+                    void *data)
+{
+       if (root == NULL || list == NULL)
+               return -EINVAL;
+
+       while (list->name != NULL) {
+               struct proc_dir_entry *proc;
+               mode_t mode = 0;
+
+               if (list->proc_mode != 0000) {
+                       mode = list->proc_mode;
+               } else if (list->fops) {
+                       if (list->fops->read)
+                               mode = 0444;
+                       if (list->fops->write)
+                               mode |= 0200;
+               }
+               proc = proc_create_data(list->name, mode, root,
+                                       list->fops ?: &lprocfs_generic_fops,
+                                       list->data ?: data);
+               if (proc == NULL)
+                       return -ENOMEM;
+               list++;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
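+
+/*
+ * Illustrative table (not part of this patch): a NULL-terminated array as
+ * consumed above, where my_uuid_fops stands for a caller-supplied
+ * file_operations:
+ *
+ *     static struct lprocfs_vars obd_vars[] = {
+ *             { .name = "uuid", .fops = &my_uuid_fops },
+ *             { .name = NULL }
+ *     };
+ *
+ *     rc = lprocfs_add_vars(root, obd_vars, obd);
+ */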
+
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+       proc_remove(*rooth);
+       *rooth = NULL;
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+       LASSERT(parent != NULL);
+       remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+struct proc_dir_entry *lprocfs_register(const char *name,
+                                       struct proc_dir_entry *parent,
+                                       struct lprocfs_vars *list, void *data)
+{
+       struct proc_dir_entry *newchild;
+
+       newchild = proc_mkdir(name, parent);
+       if (newchild != NULL && list != NULL) {
+               int rc = lprocfs_add_vars(newchild, list, data);
+               if (rc) {
+                       lprocfs_remove(&newchild);
+                       return ERR_PTR(rc);
+               }
+       }
+       return newchild;
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+int lprocfs_rd_uint(struct seq_file *m, void *data)
+{
+       return seq_printf(m, "%u\n", *(unsigned int *)data);
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+int lprocfs_wr_uint(struct file *file, const char __user *buffer,
+                   unsigned long count, void *data)
+{
+       unsigned *p = data;
+       char dummy[MAX_STRING_SIZE + 1], *end;
+       unsigned long tmp;
+
+       if (count >= sizeof(dummy))
+               return -EINVAL;
+       /* copy only what the caller actually wrote and NUL-terminate it */
+       if (copy_from_user(dummy, buffer, count))
+               return -EFAULT;
+       dummy[count] = '\0';
+
+       tmp = simple_strtoul(dummy, &end, 0);
+       if (dummy == end)
+               return -EINVAL;
+
+       *p = (unsigned int)tmp;
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+int lprocfs_rd_u64(struct seq_file *m, void *data)
+{
+       return seq_printf(m, LPU64"\n", *(__u64 *)data);
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+int lprocfs_rd_atomic(struct seq_file *m, void *data)
+{
+       atomic_t *atom = data;
+       LASSERT(atom != NULL);
+       return seq_printf(m, "%d\n", atomic_read(atom));
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+int lprocfs_wr_atomic(struct file *file, const char __user *buffer,
+                     unsigned long count, void *data)
+{
+       atomic_t *atm = data;
+       int val = 0;
+       int rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val <= 0)
+               return -ERANGE;
+
+       atomic_set(atm, val);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+
+       LASSERT(obd != NULL);
+       return seq_printf(m, "%s\n", obd->obd_uuid.uuid);
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+int lprocfs_rd_name(struct seq_file *m, void *data)
+{
+       struct obd_device *dev = data;
+
+       LASSERT(dev != NULL);
+       return seq_printf(m, "%s\n", dev->obd_name);
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc)
+               rc = seq_printf(m, "%u\n", osfs.os_bsize);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_blocks;
+
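+               /* scale os_blocks (units of os_bsize bytes) to kbytes, e.g.
+                * os_bsize = 4096 -> blk_size = 4 -> result = blocks << 2 */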
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bfree;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc) {
+               __u32 blk_size = osfs.os_bsize >> 10;
+               __u64 result = osfs.os_bavail;
+
+               while (blk_size >>= 1)
+                       result <<= 1;
+
+               rc = seq_printf(m, LPU64"\n", result);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc)
+               rc = seq_printf(m, LPU64"\n", osfs.os_files);
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_statfs  osfs;
+       int rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+                           cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+                           OBD_STATFS_NODELAY);
+       if (!rc)
+               rc = seq_printf(m, LPU64"\n", osfs.os_ffree);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct obd_import *imp;
+       char *imp_state_name = NULL;
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+       imp_state_name = ptlrpc_import_state_name(imp->imp_state);
+       rc = seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name,
+                       imp->imp_deactive ? "\tDEACTIVATED" : "");
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       struct ptlrpc_connection *conn;
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+
+       LPROCFS_CLIMP_CHECK(obd);
+       conn = obd->u.cli.cl_import->imp_connection;
+       /* cl_import was already dereferenced above; only conn can be NULL */
+       if (conn != NULL)
+               rc = seq_printf(m, "%s\n", conn->c_remote_uuid.uuid);
+       else
+               rc = seq_printf(m, "%s\n", "<none>");
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/** add up per-cpu counters */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+                          struct lprocfs_counter *cnt)
+{
+       unsigned int                    num_entry;
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *cntr_header;
+       int                             i;
+       unsigned long                   flags = 0;
+
+       memset(cnt, 0, sizeof(*cnt));
+
+       if (stats == NULL) {
+               /* set count to 1 to avoid divide-by-zero errs in callers */
+               cnt->lc_count = 1;
+               return;
+       }
+
+       cnt->lc_min = LC_MIN_INIT;
+
+       num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+       for (i = 0; i < num_entry; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               cntr_header = &stats->ls_cnt_header[idx];
+               percpu_cntr = lprocfs_stats_counter_get(stats, i, idx);
+
+               cnt->lc_count += percpu_cntr->lc_count;
+               cnt->lc_sum += percpu_cntr->lc_sum;
+               if (percpu_cntr->lc_min < cnt->lc_min)
+                       cnt->lc_min = percpu_cntr->lc_min;
+               if (percpu_cntr->lc_max > cnt->lc_max)
+                       cnt->lc_max = percpu_cntr->lc_max;
+               cnt->lc_sumsquare += percpu_cntr->lc_sumsquare;
+       }
+
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Append a comma-separated list of the currently set import flags to m.
+ */
+#define flag2str(flag)                                                 \
+       do {                                                            \
+               if (imp->imp_##flag) {                                  \
+                       seq_printf(m, "%s" #flag, first ? "" : ", ");   \
+                       first = false;                                  \
+               }                                                       \
+       } while (0)
+static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
+{
+       bool first = true;
+
+       if (imp->imp_obd->obd_no_recov) {
+               seq_printf(m, "no_recov");
+               first = false;
+       }
+
+       flag2str(invalid);
+       flag2str(deactive);
+       flag2str(replayable);
+       flag2str(pingable);
+       return 0;
+}
+#undef flag2str
+
+static const char *obd_connect_names[] = {
+       "read_only",
+       "lov_index",
+       "unused",
+       "write_grant",
+       "server_lock",
+       "version",
+       "request_portal",
+       "acl",
+       "xattr",
+       "create_on_write",
+       "truncate_lock",
+       "initial_transno",
+       "inode_bit_locks",
+       "join_file(obsolete)",
+       "getattr_by_fid",
+       "no_oh_for_devices",
+       "remote_client",
+       "remote_client_by_force",
+       "max_byte_per_rpc",
+       "64bit_qdata",
+       "mds_capability",
+       "oss_capability",
+       "early_lock_cancel",
+       "som",
+       "adaptive_timeouts",
+       "lru_resize",
+       "mds_mds_connection",
+       "real_conn",
+       "change_qunit_size",
+       "alt_checksum_algorithm",
+       "fid_is_enabled",
+       "version_recovery",
+       "pools",
+       "grant_shrink",
+       "skip_orphan",
+       "large_ea",
+       "full20",
+       "layout_lock",
+       "64bithash",
+       "object_max_bytes",
+       "imp_recov",
+       "jobstats",
+       "umask",
+       "einprogress",
+       "grant_param",
+       "flock_owner",
+       "lvb_type",
+       "nanoseconds_times",
+       "lightweight_conn",
+       "short_io",
+       "pingless",
+       "unknown",
+       NULL
+};
+
+static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep)
+{
+       __u64 mask = 1;
+       int i;
+       bool first = true;
+
+       for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+               if (flags & mask) {
+                       seq_printf(m, "%s%s",
+                                       first ? sep : "", obd_connect_names[i]);
+                       first = false;
+               }
+       }
+       if (flags & ~(mask - 1))
+               seq_printf(m, "%sunknown flags "LPX64,
+                               first ? sep : "", flags & ~(mask - 1));
+}
+
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+       __u64 mask = 1;
+       int i, ret = 0;
+
+       for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+               if (flags & mask)
+                       ret += snprintf(page + ret, count - ret, "%s%s",
+                                       ret ? sep : "", obd_connect_names[i]);
+       }
+       if (flags & ~(mask - 1))
+               ret += snprintf(page + ret, count - ret,
+                               "%sunknown flags "LPX64,
+                               ret ? sep : "", flags & ~(mask - 1));
+       return ret;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
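+
+/*
+ * Example: obd_connect_flags2str(page, count, 0x9, ", ") decodes bits 0
+ * and 3 of the table above and renders "read_only, write_grant".
+ */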
+
+int lprocfs_rd_import(struct seq_file *m, void *data)
+{
+       struct lprocfs_counter          ret;
+       struct lprocfs_counter_header   *header;
+       struct obd_device               *obd    = (struct obd_device *)data;
+       struct obd_import               *imp;
+       struct obd_import_conn          *conn;
+       int                             j;
+       int                             k;
+       int                             rw      = 0;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+
+       seq_printf(m,
+                    "import:\n"
+                    "    name: %s\n"
+                    "    target: %s\n"
+                    "    state: %s\n"
+                    "    instance: %u\n"
+                    "    connect_flags: [",
+                    obd->obd_name,
+                    obd2cli_tgt(obd),
+                    ptlrpc_import_state_name(imp->imp_state),
+                    imp->imp_connect_data.ocd_instance);
+       obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", ");
+       seq_printf(m,
+                     "]\n"
+                     "    import_flags: [");
+       obd_import_flags2str(imp, m);
+
+       seq_printf(m,
+                     "]\n"
+                     "    connection:\n"
+                     "       failover_nids: [");
+       spin_lock(&imp->imp_lock);
+       j = 0;
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               seq_printf(m, "%s%s", j ? ", " : "",
+                          libcfs_nid2str(conn->oic_conn->c_peer.nid));
+               j++;
+       }
+       seq_printf(m,
+                     "]\n"
+                     "       current_connection: %s\n"
+                     "       connection_attempts: %u\n"
+                     "       generation: %u\n"
+                     "       in-progress_invalidations: %u\n",
+                     imp->imp_connection == NULL ? "<none>" :
+                             libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                     imp->imp_conn_cnt,
+                     imp->imp_generation,
+                     atomic_read(&imp->imp_inval_count));
+       spin_unlock(&imp->imp_lock);
+
+       if (obd->obd_svc_stats == NULL)
+               goto out_climp;
+
+       header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+       lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+       if (ret.lc_count != 0) {
+               /* first argument to do_div MUST be __u64 */
+               __u64 sum = ret.lc_sum;
+               do_div(sum, ret.lc_count);
+               ret.lc_sum = sum;
+       } else {
+               ret.lc_sum = 0;
+       }
+       seq_printf(m,
+                     "    rpcs:\n"
+                     "       inflight: %u\n"
+                     "       unregistering: %u\n"
+                     "       timeouts: %u\n"
+                     "       avg_waittime: "LPU64" %s\n",
+                     atomic_read(&imp->imp_inflight),
+                     atomic_read(&imp->imp_unregistering),
+                     atomic_read(&imp->imp_timeouts),
+                     ret.lc_sum, header->lc_units);
+
+       k = 0;
+       for (j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+               if (imp->imp_at.iat_portal[j] == 0)
+                       break;
+               k = max_t(unsigned int, k,
+                         at_get(&imp->imp_at.iat_service_estimate[j]));
+       }
+       seq_printf(m,
+                     "    service_estimates:\n"
+                     "       services: %u sec\n"
+                     "       network: %u sec\n",
+                     k,
+                     at_get(&imp->imp_at.iat_net_latency));
+
+       seq_printf(m,
+                     "    transactions:\n"
+                     "       last_replay: "LPU64"\n"
+                     "       peer_committed: "LPU64"\n"
+                     "       last_checked: "LPU64"\n",
+                     imp->imp_last_replay_transno,
+                     imp->imp_peer_committed_transno,
+                     imp->imp_last_transno_checked);
+
+       /* avg data rates */
+       for (rw = 0; rw <= 1; rw++) {
+               lprocfs_stats_collect(obd->obd_svc_stats,
+                                     PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+                                     &ret);
+               if (ret.lc_sum > 0 && ret.lc_count > 0) {
+                       /* first argument to do_div MUST be __u64 */
+                       __u64 sum = ret.lc_sum;
+                       do_div(sum, ret.lc_count);
+                       ret.lc_sum = sum;
+                       seq_printf(m,
+                                     "    %s_data_averages:\n"
+                                     "       bytes_per_rpc: "LPU64"\n",
+                                     rw ? "write" : "read",
+                                     ret.lc_sum);
+               }
+               k = (int)ret.lc_sum;
+               j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+               header = &obd->obd_svc_stats->ls_cnt_header[j];
+               lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+               if (ret.lc_sum > 0 && ret.lc_count != 0) {
+                       /* first argument to do_div MUST be __u64 */
+                       __u64 sum = ret.lc_sum;
+                       do_div(sum, ret.lc_count);
+                       ret.lc_sum = sum;
+                       seq_printf(m,
+                                     "       %s_per_rpc: "LPU64"\n",
+                                     header->lc_units, ret.lc_sum);
+                       j = (int)ret.lc_sum;
+                       if (j > 0)
+                               seq_printf(m,
+                                             "       MB_per_sec: %u.%.2u\n",
+                                             k / j, (100 * k / j) % 100);
+               }
+       }
+
+out_climp:
+       LPROCFS_CLIMP_EXIT(obd);
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
+
+int lprocfs_rd_state(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_import *imp;
+       int j, k;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+
+       seq_printf(m, "current_state: %s\n",
+                    ptlrpc_import_state_name(imp->imp_state));
+       seq_printf(m, "state_history:\n");
+       k = imp->imp_state_hist_idx;
+       for (j = 0; j < IMP_STATE_HIST_LEN; j++) {
+               struct import_state_hist *ish =
+                       &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN];
+               if (ish->ish_state == 0)
+                       continue;
+               seq_printf(m, " - ["CFS_TIME_T", %s]\n",
+                             ish->ish_time,
+                             ptlrpc_import_state_name(ish->ish_state));
+       }
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at)
+{
+       int i;
+       for (i = 0; i < AT_BINS; i++)
+               seq_printf(m, "%3u ", at->at_hist[i]);
+       seq_printf(m, "\n");
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+
+/* See also ptlrpc_lprocfs_rd_timeouts */
+int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct obd_import *imp;
+       unsigned int cur, worst;
+       time_t now, worstt;
+       struct dhms ts;
+       int i;
+
+       LASSERT(obd != NULL);
+       LPROCFS_CLIMP_CHECK(obd);
+       imp = obd->u.cli.cl_import;
+
+       now = cfs_time_current_sec();
+
+       /* Some network health info for kicks */
+       s2dhms(&ts, now - imp->imp_last_reply_time);
+       seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n",
+                      "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+       cur = at_get(&imp->imp_at.iat_net_latency);
+       worst = imp->imp_at.iat_net_latency.at_worst_ever;
+       worstt = imp->imp_at.iat_net_latency.at_worst_time;
+       s2dhms(&ts, now - worstt);
+       seq_printf(m, "%-10s : cur %3u  worst %3u (at %ld, "DHMS_FMT" ago) ",
+                      "network", cur, worst, worstt, DHMS_VARS(&ts));
+       lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency);
+
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               if (imp->imp_at.iat_portal[i] == 0)
+                       break;
+               cur = at_get(&imp->imp_at.iat_service_estimate[i]);
+               worst = imp->imp_at.iat_service_estimate[i].at_worst_ever;
+               worstt = imp->imp_at.iat_service_estimate[i].at_worst_time;
+               s2dhms(&ts, now - worstt);
+               seq_printf(m, "portal %-2d  : cur %3u  worst %3u (at %ld, "
+                              DHMS_FMT" ago) ", imp->imp_at.iat_portal[i],
+                              cur, worst, worstt, DHMS_VARS(&ts));
+               lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]);
+       }
+
+       LPROCFS_CLIMP_EXIT(obd);
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+       __u64 flags;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+       seq_printf(m, "flags="LPX64"\n", flags);
+       obd_connect_seq_flags2str(m, flags, "\n");
+       seq_printf(m, "\n");
+       LPROCFS_CLIMP_EXIT(obd);
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{
+       struct obd_device *obd = data;
+
+       LASSERT(obd != NULL);
+       return seq_printf(m, "%u\n", obd->obd_num_exports);
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{
+       struct obd_type *class = (struct obd_type*) data;
+
+       LASSERT(class != NULL);
+       return seq_printf(m, "%d\n", class->typ_refcnt);
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+       int rc = 0;
+
+       LASSERT(obd != NULL);
+       LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+       LASSERT(obd->obd_type->typ_procroot != NULL);
+
+       obd->obd_proc_entry = lprocfs_register(obd->obd_name,
+                                              obd->obd_type->typ_procroot,
+                                              list, obd);
+       if (IS_ERR(obd->obd_proc_entry)) {
+               rc = PTR_ERR(obd->obd_proc_entry);
+               CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name);
+               obd->obd_proc_entry = NULL;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+       if (!obd)
+               return -EINVAL;
+       if (obd->obd_proc_exports_entry) {
+               /* Should be no exports left */
+               lprocfs_remove(&obd->obd_proc_exports_entry);
+               obd->obd_proc_exports_entry = NULL;
+       }
+       if (obd->obd_proc_entry) {
+               lprocfs_remove(&obd->obd_proc_entry);
+               obd->obd_proc_entry = NULL;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+       CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+              client_stat->nid_proc, client_stat->nid_stats);
+
+       LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+                "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+                atomic_read(&client_stat->nid_exp_ref_count));
+
+       if (client_stat->nid_proc)
+               lprocfs_remove(&client_stat->nid_proc);
+
+       if (client_stat->nid_stats)
+               lprocfs_free_stats(&client_stat->nid_stats);
+
+       if (client_stat->nid_ldlm_stats)
+               lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+       OBD_FREE_PTR(client_stat);
+}
+
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+       cfs_hash_t *hash = obd->obd_nid_stats_hash;
+       struct nid_stat *stat;
+       ENTRY;
+
+       /* we need the extra list because hash_exit is called too early */
+       /* no locking needed: all clients have died by this point */
+       while (!list_empty(&obd->obd_nid_stats)) {
+               stat = list_entry(obd->obd_nid_stats.next,
+                                     struct nid_stat, nid_list);
+               list_del_init(&stat->nid_list);
+               cfs_hash_del(hash, &stat->nid, &stat->nid_hash);
+               lprocfs_free_client_stats(stat);
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+                                         enum lprocfs_stats_flags flags)
+{
+       struct lprocfs_stats    *stats;
+       unsigned int            num_entry;
+       unsigned int            percpusize = 0;
+       int                     i;
+
+       if (num == 0)
+               return NULL;
+
+       if (lprocfs_no_percpu_stats != 0)
+               flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+       if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
+               num_entry = 1;
+       else
+               num_entry = num_possible_cpus();
+
+       /* alloc percpu pointers for all possible cpu slots */
+       LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+       if (stats == NULL)
+               return NULL;
+
+       stats->ls_num = num;
+       stats->ls_flags = flags;
+       spin_lock_init(&stats->ls_lock);
+
+       /* alloc num of counter headers */
+       LIBCFS_ALLOC(stats->ls_cnt_header,
+                    stats->ls_num * sizeof(struct lprocfs_counter_header));
+       if (stats->ls_cnt_header == NULL)
+               goto fail;
+
+       if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+               /* contains only one set of counters */
+               percpusize = lprocfs_stats_counter_size(stats);
+               LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+               if (stats->ls_percpu[0] == NULL)
+                       goto fail;
+               stats->ls_biggest_alloc_num = 1;
+       } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+               /* alloc all percpu data, currently only obd_memory use this */
+               for (i = 0; i < num_entry; ++i)
+                       if (lprocfs_stats_alloc_one(stats, i) < 0)
+                               goto fail;
+       }
+
+       return stats;
+
+fail:
+       lprocfs_free_stats(&stats);
+       return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
+
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+       struct lprocfs_stats *stats = *statsh;
+       unsigned int num_entry;
+       unsigned int percpusize;
+       unsigned int i;
+
+       if (stats == NULL || stats->ls_num == 0)
+               return;
+       *statsh = NULL;
+
+       if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
+               num_entry = 1;
+       else
+               num_entry = num_possible_cpus();
+
+       percpusize = lprocfs_stats_counter_size(stats);
+       for (i = 0; i < num_entry; i++)
+               if (stats->ls_percpu[i] != NULL)
+                       LIBCFS_FREE(stats->ls_percpu[i], percpusize);
+       if (stats->ls_cnt_header != NULL)
+               LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+                                       sizeof(struct lprocfs_counter_header));
+       LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+       struct lprocfs_counter          *percpu_cntr;
+       struct lprocfs_counter_header   *header;
+       int                             i;
+       int                             j;
+       unsigned int                    num_entry;
+       unsigned long                   flags = 0;
+
+       num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+       for (i = 0; i < num_entry; i++) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               for (j = 0; j < stats->ls_num; j++) {
+                       header = &stats->ls_cnt_header[j];
+                       percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+                       percpu_cntr->lc_count           = 0;
+                       percpu_cntr->lc_min             = LC_MIN_INIT;
+                       percpu_cntr->lc_max             = 0;
+                       percpu_cntr->lc_sumsquare       = 0;
+                       percpu_cntr->lc_sum             = 0;
+                       if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+                               percpu_cntr->lc_sum_irq = 0;
+               }
+       }
+
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+static ssize_t lprocfs_stats_seq_write(struct file *file,
+                                      const char __user *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct lprocfs_stats *stats = seq->private;
+
+       lprocfs_clear_stats(stats);
+
+       return len;
+}
+
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+       struct lprocfs_stats *stats = p->private;
+
+       return (*pos < stats->ls_num) ? pos : NULL;
+}
+
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       (*pos)++;
+       return lprocfs_stats_seq_start(p, pos);
+}
+
+/* seq file export of one lprocfs counter */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+       struct lprocfs_stats            *stats  = p->private;
+       struct lprocfs_counter_header   *hdr;
+       struct lprocfs_counter           ctr;
+       int                              idx    = *(loff_t *)v;
+       int                              rc     = 0;
+
+       if (idx == 0) {
+               struct timeval now;
+               do_gettimeofday(&now);
+               rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+                               "snapshot_time", now.tv_sec, now.tv_usec);
+               if (rc < 0)
+                       return rc;
+       }
+       hdr = &stats->ls_cnt_header[idx];
+       lprocfs_stats_collect(stats, idx, &ctr);
+
+       if (ctr.lc_count == 0)
+               goto out;
+
+       rc = seq_printf(p, "%-25s "LPD64" samples [%s]", hdr->lc_name,
+                       ctr.lc_count, hdr->lc_units);
+
+       if (rc < 0)
+               goto out;
+
+       if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ctr.lc_count > 0)) {
+               rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
+                               ctr.lc_min, ctr.lc_max, ctr.lc_sum);
+               if (rc < 0)
+                       goto out;
+               if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+                       rc = seq_printf(p, " "LPD64, ctr.lc_sumsquare);
+               if (rc < 0)
+                       goto out;
+       }
+       rc = seq_printf(p, "\n");
+out:
+       return (rc < 0) ? rc : 0;
+}
+
+static const struct seq_operations lprocfs_stats_seq_sops = {
+       .start  = lprocfs_stats_seq_start,
+       .stop   = lprocfs_stats_seq_stop,
+       .next   = lprocfs_stats_seq_next,
+       .show   = lprocfs_stats_seq_show,
+};
+
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int rc;
+
+       rc = seq_open(file, &lprocfs_stats_seq_sops);
+       if (rc)
+               return rc;
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+       return 0;
+}
+
+struct file_operations lprocfs_stats_seq_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lprocfs_stats_seq_open,
+       .read    = seq_read,
+       .write   = lprocfs_stats_seq_write,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+                          struct lprocfs_stats *stats)
+{
+       struct proc_dir_entry *entry;
+       LASSERT(root != NULL);
+
+       entry = proc_create_data(name, 0644, root,
+                                &lprocfs_stats_seq_fops, stats);
+       if (entry == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+                         unsigned conf, const char *name, const char *units)
+{
+       struct lprocfs_counter_header   *header;
+       struct lprocfs_counter          *percpu_cntr;
+       unsigned long                   flags = 0;
+       unsigned int                    i;
+       unsigned int                    num_cpu;
+
+       LASSERT(stats != NULL);
+
+       header = &stats->ls_cnt_header[index];
+       LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n",
+                index, name, units);
+
+       header->lc_config = conf;
+       header->lc_name   = name;
+       header->lc_units  = units;
+
+       num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+       for (i = 0; i < num_cpu; ++i) {
+               if (stats->ls_percpu[i] == NULL)
+                       continue;
+               percpu_cntr = lprocfs_stats_counter_get(stats, i, index);
+               percpu_cntr->lc_count           = 0;
+               percpu_cntr->lc_min             = LC_MIN_INIT;
+               percpu_cntr->lc_max             = 0;
+               percpu_cntr->lc_sumsquare       = 0;
+               percpu_cntr->lc_sum             = 0;
+               if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+                       percpu_cntr->lc_sum_irq = 0;
+       }
+       lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
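+
+/*
+ * Typical call sequence (illustrative only): allocate the stats block,
+ * describe each counter slot, then publish it under /proc:
+ *
+ *     stats = lprocfs_alloc_stats(2, 0);
+ *     lprocfs_counter_init(stats, 0, LPROCFS_CNTR_AVGMINMAX, "read", "bytes");
+ *     lprocfs_counter_init(stats, 1, LPROCFS_CNTR_AVGMINMAX, "write", "bytes");
+ *     rc = lprocfs_register_stats(root, "stats", stats);
+ */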
+
+#define LPROCFS_OBD_OP_INIT(base, stats, op)                           \
+do {                                                                   \
+       unsigned int coffset = base + OBD_COUNTER_OFFSET(op);           \
+       LASSERT(coffset < stats->ls_num);                               \
+       lprocfs_counter_init(stats, coffset, 0, #op, "reqs");           \
+} while (0)
+
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+       LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+       struct lprocfs_stats *stats;
+       unsigned int num_stats;
+       int rc, i;
+
+       LASSERT(obd->obd_stats == NULL);
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->obd_cntr_base == 0);
+
+       num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+               num_private_stats - 1 /* o_owner */;
+       stats = lprocfs_alloc_stats(num_stats, 0);
+       if (stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_ops_stats(num_private_stats, stats);
+
+       for (i = num_private_stats; i < num_stats; i++) {
+               /* If this LBUGs, it is likely that an obd
+                * operation was added to struct obd_ops in
+                * <obd.h>, and that the corresponding line item
+                * LPROCFS_OBD_OP_INIT(.., .., opname)
+                * is missing from the list above. */
+               LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+                        "Missing obd_stat initializer obd_op "
+                        "operation at offset %d.\n", i - num_private_stats);
+       }
+       rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+       if (rc < 0) {
+               lprocfs_free_stats(&stats);
+       } else {
+               obd->obd_stats  = stats;
+               obd->obd_cntr_base = num_private_stats;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+       if (obd->obd_stats)
+               lprocfs_free_stats(&obd->obd_stats);
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+#define LPROCFS_MD_OP_INIT(base, stats, op)                           \
+do {                                                                  \
+       unsigned int coffset = base + MD_COUNTER_OFFSET(op);          \
+       LASSERT(coffset < stats->ls_num);                             \
+       lprocfs_counter_init(stats, coffset, 0, #op, "reqs");         \
+} while (0)
+
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+       LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+                          unsigned num_private_stats)
+{
+       struct lprocfs_stats *stats;
+       unsigned int num_stats;
+       int rc, i;
+
+       LASSERT(obd->md_stats == NULL);
+       LASSERT(obd->obd_proc_entry != NULL);
+       LASSERT(obd->md_cntr_base == 0);
+
+       num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+                   num_private_stats;
+       stats = lprocfs_alloc_stats(num_stats, 0);
+       if (stats == NULL)
+               return -ENOMEM;
+
+       lprocfs_init_mps_stats(num_private_stats, stats);
+
+       for (i = num_private_stats; i < num_stats; i++) {
+               if (stats->ls_cnt_header[i].lc_name == NULL) {
+                       CERROR("Missing md_stat initializer md_op "
+                              "operation at offset %d. Aborting.\n",
+                              i - num_private_stats);
+                       LBUG();
+               }
+       }
+       rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+       if (rc < 0) {
+               lprocfs_free_stats(&stats);
+       } else {
+               obd->md_stats  = stats;
+               obd->md_cntr_base = num_private_stats;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+       struct lprocfs_stats *stats = obd->md_stats;
+
+       if (stats != NULL) {
+               obd->md_stats = NULL;
+               obd->md_cntr_base = 0;
+               lprocfs_free_stats(&stats);
+       }
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_ENQUEUE - LDLM_FIRST_OPC,
+                            0, "ldlm_enqueue", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CONVERT - LDLM_FIRST_OPC,
+                            0, "ldlm_convert", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CANCEL - LDLM_FIRST_OPC,
+                            0, "ldlm_cancel", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_bl_callback", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_cp_callback", "reqs");
+       lprocfs_counter_init(ldlm_stats,
+                            LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
+                            0, "ldlm_gl_callback", "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          struct hlist_node *hnode, void *data)
+{
+       struct obd_export *exp = cfs_hash_object(hs, hnode);
+       struct seq_file *m = (struct seq_file *)data;
+
+       if (exp->exp_nid_stats)
+               seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid));
+
+       return 0;
+}
+
+static int
+lproc_exp_uuid_seq_show(struct seq_file *m, void *unused)
+{
+       struct nid_stat *stats = (struct nid_stat *)m->private;
+       struct obd_device *obd = stats->nid_obd;
+
+       cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                             lprocfs_exp_print_uuid, m);
+       return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_uuid);
+
+struct exp_hash_cb_data {
+       struct seq_file *m;
+       bool            first;
+};
+
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                          struct hlist_node *hnode, void *cb_data)
+{
+       struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data;
+       struct obd_export       *exp = cfs_hash_object(hs, hnode);
+
+       if (exp->exp_lock_hash != NULL) {
+               if (data->first) {
+                       cfs_hash_debug_header(data->m);
+                       data->first = false;
+               }
+               cfs_hash_debug_str(hs, data->m);
+       }
+
+       return 0;
+}
+
+static int
+lproc_exp_hash_seq_show(struct seq_file *m, void *unused)
+{
+       struct nid_stat *stats = (struct nid_stat *)m->private;
+       struct obd_device *obd = stats->nid_obd;
+       struct exp_hash_cb_data cb_data = {m, true};
+
+       cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid,
+                             lprocfs_exp_print_hash, &cb_data);
+       return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_hash);
+
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{
+       return seq_printf(m, "%s\n",
+                       "Write into this file to clear all nid stats and "
+                       "stale nid entries");
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+       struct nid_stat *stat = obj;
+       ENTRY;
+
+       CDEBUG(D_INFO, "refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+       if (atomic_read(&stat->nid_exp_ref_count) == 1) {
+               /* object has only hash references. */
+               spin_lock(&stat->nid_obd->obd_nid_lock);
+               list_move(&stat->nid_list, data);
+               spin_unlock(&stat->nid_obd->obd_nid_lock);
+               RETURN(1);
+       }
+       /* we hold a reference to the object - only clear its data */
+       if (stat->nid_stats)
+               lprocfs_clear_stats(stat->nid_stats);
+
+       RETURN(0);
+}
+
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct nid_stat *client_stat;
+       LIST_HEAD(free_list);
+
+       cfs_hash_cond_del(obd->obd_nid_stats_hash,
+                         lprocfs_nid_stats_clear_write_cb, &free_list);
+
+       while (!list_empty(&free_list)) {
+               client_stat = list_entry(free_list.next, struct nid_stat,
+                                            nid_list);
+               list_del_init(&client_stat->nid_list);
+               lprocfs_free_client_stats(client_stat);
+       }
+
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
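+/*
+ * Illustrative use from userspace (the proc path is hypothetical): any
+ * write triggers the sweep above, e.g.
+ *
+ *     # echo clear > /proc/fs/lustre/<type>/<obd>/clear_client_stats
+ *
+ * Stale nid entries (hash references only) are freed outright; entries
+ * still referenced by live exports merely have their counters zeroed.
+ */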
+
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+       struct nid_stat *new_stat, *old_stat;
+       struct obd_device *obd = NULL;
+       proc_dir_entry_t *entry;
+       char *buffer = NULL;
+       int rc = 0;
+       ENTRY;
+
+       *newnid = 0;
+
+       if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+           !exp->exp_obd->obd_nid_stats_hash)
+               RETURN(-EINVAL);
+
+       /* do not test against zero, because Eric says:
+        * You may only test nid against another nid, or LNET_NID_ANY.
+        * Anything else is nonsense. */
+       if (!nid || *nid == LNET_NID_ANY)
+               RETURN(0);
+
+       obd = exp->exp_obd;
+
+       CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+       OBD_ALLOC_PTR(new_stat);
+       if (new_stat == NULL)
+               RETURN(-ENOMEM);
+
+       new_stat->nid     = *nid;
+       new_stat->nid_obd = exp->exp_obd;
+       /* we need to set the default refcount to 1 to balance obd_disconnect */
+       atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+       old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+                                          nid, &new_stat->nid_hash);
+       CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+              old_stat, libcfs_nid2str(*nid),
+              atomic_read(&new_stat->nid_exp_ref_count));
+
+       /* We need to release the old stats because lprocfs_exp_cleanup()
+        * hasn't been called and never will be. */
+       if (exp->exp_nid_stats) {
+               nidstat_putref(exp->exp_nid_stats);
+               exp->exp_nid_stats = NULL;
+       }
+
+       /* Return -EALREADY here so that we know that the /proc
+        * entry has already been created */
+       if (old_stat != new_stat) {
+               exp->exp_nid_stats = old_stat;
+               GOTO(destroy_new, rc = -EALREADY);
+       }
+       /* not found - create */
+       OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+       if (buffer == NULL)
+               GOTO(destroy_new, rc = -ENOMEM);
+
+       memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+       new_stat->nid_proc = lprocfs_register(buffer,
+                                             obd->obd_proc_exports_entry,
+                                             NULL, NULL);
+       OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+       if (new_stat->nid_proc == NULL) {
+               CERROR("Error making export directory for nid %s\n",
+                      libcfs_nid2str(*nid));
+               GOTO(destroy_new_ns, rc = -ENOMEM);
+       }
+
+       entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+                                  new_stat, &lproc_exp_uuid_fops);
+       if (IS_ERR(entry)) {
+               CWARN("Error adding the NID stats file\n");
+               rc = PTR_ERR(entry);
+               GOTO(destroy_new_ns, rc);
+       }
+
+       entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+                                  new_stat, &lproc_exp_hash_fops);
+       if (IS_ERR(entry)) {
+               CWARN("Error adding the hash file\n");
+               rc = PTR_ERR(entry);
+               GOTO(destroy_new_ns, rc);
+       }
+
+       exp->exp_nid_stats = new_stat;
+       *newnid = 1;
+       /* protect concurrent adds to the list; no locking needed on destroy */
+       spin_lock(&obd->obd_nid_lock);
+       list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+       spin_unlock(&obd->obd_nid_lock);
+
+       RETURN(rc);
+
+destroy_new_ns:
+       if (new_stat->nid_proc != NULL)
+               lprocfs_remove(&new_stat->nid_proc);
+       cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+destroy_new:
+       nidstat_putref(new_stat);
+       OBD_FREE_PTR(new_stat);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+       struct nid_stat *stat = exp->exp_nid_stats;
+
+       if (!stat || !exp->exp_obd)
+               RETURN(0);
+
+       nidstat_putref(exp->exp_nid_stats);
+       exp->exp_nid_stats = NULL;
+
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+int lprocfs_write_helper(const char *buffer, unsigned long count,
+                        int *val)
+{
+       return lprocfs_write_frac_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+                             int *val, int mult)
+{
+       char kernbuf[20], *end, *pbuf;
+
+       if (count > (sizeof(kernbuf) - 1))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = '\0';
+       pbuf = kernbuf;
+       if (*pbuf == '-') {
+               mult = -mult;
+               pbuf++;
+       }
+
+       *val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+       if (pbuf == end)
+               return -EINVAL;
+
+       if (end != NULL && *end == '.') {
+               int temp_val, pow = 1;
+               int i;
+
+               pbuf = end + 1;
+               if (strlen(pbuf) > 5)
+                       pbuf[5] = '\0'; /* only allow 5 fractional digits */
+
+               temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+
+               if (pbuf < end) {
+                       for (i = 0; i < (end - pbuf); i++)
+                               pow *= 10;
+
+                       *val += temp_val / pow;
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
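+/*
+ * Worked example (illustrative): with mult = 1000, a user write of "1.25"
+ * parses to *val = 1 * 1000 + 250 = 1250, and a leading '-' flips the
+ * multiplier first, so "-1.25" yields -1250. At most five fractional
+ * digits are honoured; anything beyond that is truncated.
+ */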
+
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+                            int mult)
+{
+       long decimal_val, frac_val;
+       int prtn;
+
+       if (count < 10)
+               return -EINVAL;
+
+       decimal_val = val / mult;
+       prtn = snprintf(buffer, count, "%ld", decimal_val);
+       frac_val = val % mult;
+
+       if (prtn < (count - 4) && frac_val > 0) {
+               long temp_frac;
+               int i, temp_mult = 1, frac_bits = 0;
+
+               temp_frac = frac_val * 10;
+               buffer[prtn++] = '.';
+               while (frac_bits < 2 && (temp_frac / mult) < 1) {
+                       /* only two fractional digits are reserved */
+                       buffer[prtn++] = '0';
+                       temp_frac *= 10;
+                       frac_bits++;
+               }
+               /*
+                * Cases to consider:
+                *      1. #echo x.00 > /proc/xxx       output: x
+                *      2. #echo x.0x > /proc/xxx       output: x.0x
+                *      3. #echo x.x0 > /proc/xxx       output: x.x
+                *      4. #echo x.xx > /proc/xxx       output: x.xx
+                *      Only two fractional digits are reserved.
+                */
+               for (i = 0; i < (5 - prtn); i++)
+                       temp_mult *= 10;
+
+               frac_bits = min((int)count - prtn, 3 - frac_bits);
+               prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+                                frac_val * temp_mult / mult);
+
+               prtn--;
+               while (buffer[prtn] < '1' || buffer[prtn] > '9') {
+                       prtn--;
+                       if (buffer[prtn] == '.') {
+                               prtn--;
+                               break;
+                       }
+               }
+               prtn++;
+       }
+       buffer[prtn++] = '\n';
+       return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
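+/*
+ * Worked examples (illustrative), all with mult = 1000: val = 1250
+ * prints "1.25\n", val = 1205 prints "1.2\n" (trailing zero stripped),
+ * and val = 1005 prints "1\n", since only two fractional digits are
+ * kept and a bare trailing '.' is dropped as well.
+ */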
+
+int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult)
+{
+       long decimal_val, frac_val;
+
+       decimal_val = val / mult;
+       seq_printf(m, "%ld", decimal_val);
+       frac_val = val % mult;
+
+       if (frac_val > 0) {
+               frac_val *= 100;
+               frac_val /= mult;
+       }
+       if (frac_val > 0) {
+               /* Three cases: x0, xx, 0x */
+               if ((frac_val % 10) != 0)
+                       seq_printf(m, ".%ld", frac_val);
+               else
+                       seq_printf(m, ".%ld", frac_val / 10);
+       }
+
+       seq_printf(m, "\n");
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_seq_read_frac_helper);
+
+int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
+{
+       return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+                             __u64 *val, int mult)
+{
+       char kernbuf[22], *end, *pbuf;
+       __u64 whole, frac = 0, units;
+       unsigned frac_d = 1;
+
+       if (count > (sizeof(kernbuf) - 1))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = '\0';
+       pbuf = kernbuf;
+       if (*pbuf == '-') {
+               mult = -mult;
+               pbuf++;
+       }
+
+       whole = simple_strtoull(pbuf, &end, 10);
+       if (pbuf == end)
+               return -EINVAL;
+
+       if (end != NULL && *end == '.') {
+               int i;
+               pbuf = end + 1;
+
+               /* need to limit frac_d to a __u32 */
+               if (strlen(pbuf) > 10)
+                       pbuf[10] = '\0';
+
+               frac = simple_strtoull(pbuf, &end, 10);
+               /* count decimal places */
+               for (i = 0; i < (end - pbuf); i++)
+                       frac_d *= 10;
+       }
+
+       units = 1;
+       switch (*end) {
+       case 'p': case 'P':
+               units <<= 10;
+               /* fall through */
+       case 't': case 'T':
+               units <<= 10;
+               /* fall through */
+       case 'g': case 'G':
+               units <<= 10;
+               /* fall through */
+       case 'm': case 'M':
+               units <<= 10;
+               /* fall through */
+       case 'k': case 'K':
+               units <<= 10;
+       }
+       /* Specified units override the multiplier; units == 1 means no
+        * suffix was given, so the caller's mult is kept. */
+       if (units > 1)
+               mult = mult < 0 ? -units : units;
+
+       frac *= mult;
+       do_div(frac, frac_d);
+       *val = whole * mult + frac;
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
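+/*
+ * Worked example (illustrative): a write of "1.5M" parses to whole = 1,
+ * frac = 5, frac_d = 10; the 'M' suffix falls through two shifts, so
+ * units = 1 << 20 and *val = 1 * 1048576 + 524288 = 1572864. Likewise
+ * 'G' yields 1 << 30 and 'T' 1 << 40.
+ */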
+
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+       size_t l2;
+
+       l2 = strlen(s2);
+       if (!l2)
+               return (char *)s1;
+       while (len >= l2) {
+               len--;
+               if (!memcmp(s1, s2, l2))
+                       return (char *)s1;
+               s1++;
+       }
+       return NULL;
+}
+
+/**
+ * Find the string \a name in the input \a buffer, and return a pointer to the
+ * value immediately following \a name, reducing \a count appropriately.
+ * If \a name is not found the original \a buffer is returned.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+                               unsigned long *count)
+{
+       char *val;
+       size_t buflen = *count;
+
+       /* there is no strnstr() in RHEL5 and Ubuntu kernels */
+       val = lprocfs_strnstr(buffer, name, buflen);
+       if (val == NULL)
+               return (char *)buffer;
+
+       val += strlen(name);                         /* skip prefix */
+       while (val < buffer + buflen && isspace(*val)) /* skip separator */
+               val++;
+
+       *count = 0;
+       while (val < buffer + buflen && isalnum(*val)) {
+               ++*count;
+               ++val;
+       }
+
+       return val - *count;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
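+/*
+ * Example (illustrative): with buffer = "max_dirty_mb 256 foo 1" and
+ * name = "max_dirty_mb", the returned pointer addresses "256..." and
+ * *count is set to 3, the length of the alphanumeric value. If the name
+ * is absent, the original buffer is returned and *count is untouched.
+ */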
+
+int lprocfs_seq_create(proc_dir_entry_t *parent,
+                      const char *name,
+                      mode_t mode,
+                      const struct file_operations *seq_fops,
+                      void *data)
+{
+       struct proc_dir_entry *entry;
+       ENTRY;
+
+       /* Disallow secretly (un)writable entries. */
+       LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+       entry = proc_create_data(name, mode, parent, seq_fops, data);
+
+       if (entry == NULL)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+int lprocfs_obd_seq_create(struct obd_device *dev,
+                          const char *name,
+                          mode_t mode,
+                          const struct file_operations *seq_fops,
+                          void *data)
+{
+       return (lprocfs_seq_create(dev->obd_proc_entry, name,
+                                  mode, seq_fops, data));
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
+
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+       if (value >= OBD_HIST_MAX)
+               value = OBD_HIST_MAX - 1;
+
+       spin_lock(&oh->oh_lock);
+       oh->oh_buckets[value]++;
+       spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+       unsigned int val;
+
+       for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++)
+               ;
+
+       lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
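+/*
+ * Example (illustrative): lprocfs_oh_tally_log2(oh, 1000) finds the
+ * smallest val with (1 << val) >= 1000, i.e. val = 10, and bumps bucket
+ * 10; a value of 1 lands in bucket 0 because the loop never iterates.
+ */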
+
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+       unsigned long ret = 0;
+       int i;
+
+       for (i = 0; i < OBD_HIST_MAX; i++)
+               ret +=  oh->oh_buckets[i];
+       return ret;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+       spin_lock(&oh->oh_lock);
+       memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets));
+       spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data)
+{
+       struct obd_device *dev = data;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644 (file)
index 0000000..fdf0ed3
--- /dev/null
@@ -0,0 +1,2185 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include <linux/libcfs/libcfs_hash.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <lu_ref.h>
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Decrease reference counter on object. If last reference is freed, return
+ * object to the cache, unless lu_object_is_dying(o) holds. In the latter
+ * case, free object immediately.
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object_header *top;
+       struct lu_site    *site;
+       struct lu_object        *orig;
+       cfs_hash_bd_t       bd;
+       const struct lu_fid     *fid;
+
+       top  = o->lo_header;
+       site = o->lo_dev->ld_site;
+       orig = o;
+
+       /*
+        * Until we have full fids-on-OST implemented, anonymous objects
+        * are possible in OSP. Such an object isn't listed in the site
+        * hash, so we should not remove it from the site.
+        */
+       fid = lu_object_fid(o);
+       if (fid_is_zero(fid)) {
+               LASSERT(top->loh_hash.next == NULL
+                       && top->loh_hash.pprev == NULL);
+               LASSERT(list_empty(&top->loh_lru));
+               if (!atomic_dec_and_test(&top->loh_ref))
+                       return;
+               list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+                       if (o->lo_ops->loo_object_release != NULL)
+                               o->lo_ops->loo_object_release(env, o);
+               }
+               lu_object_free(env, orig);
+               return;
+       }
+
+       cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+       bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+       if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+               if (lu_object_is_dying(top)) {
+                       /*
+                        * somebody may be waiting for this, currently only
+                        * used for cl_object, see cl_object_put_last().
+                        */
+                       wake_up_all(&bkt->lsb_marche_funebre);
+               }
+               return;
+       }
+
+       LASSERT(bkt->lsb_busy > 0);
+       bkt->lsb_busy--;
+       /*
+        * When last reference is released, iterate over object
+        * layers, and notify them that object is no longer busy.
+        */
+       list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+               if (o->lo_ops->loo_object_release != NULL)
+                       o->lo_ops->loo_object_release(env, o);
+       }
+
+       if (!lu_object_is_dying(top)) {
+               LASSERT(list_empty(&top->loh_lru));
+               list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+               cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+               return;
+       }
+
+       /*
+        * If the object is dying (will not be cached), remove it
+        * from the hash table and LRU.
+        *
+        * This is done with the hash table and LRU lists locked. As the
+        * only way to acquire the first reference to a previously
+        * unreferenced object is through hash-table lookup
+        * (lu_object_find()) or LRU scanning (lu_site_purge()), both of
+        * which are done under the hash-table and LRU lock, no race with
+        * a concurrent object lookup is possible and we can safely
+        * destroy the object below.
+        */
+       if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+               cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+       cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+       /*
+        * The object was already removed from the hash and LRU above,
+        * so it can be killed.
+        */
+       lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
+
+/**
+ * Put an object without keeping it in the cache. This is a temporary
+ * solution for multi-site objects whose layering is not constant.
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+       lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_object_header *top;
+
+       top = o->lo_header;
+       set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
+       if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
+               cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+               cfs_hash_bd_t bd;
+
+               cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
+               list_del_init(&top->loh_lru);
+               cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
+               cfs_hash_bd_unlock(obj_hash, &bd, 1);
+       }
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate new object.
+ *
+ * This follows object creation protocol, described in the comment within
+ * struct lu_device_operations definition.
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+                                        struct lu_device *dev,
+                                        const struct lu_fid *f,
+                                        const struct lu_object_conf *conf)
+{
+       struct lu_object *scan;
+       struct lu_object *top;
+       struct list_head *layers;
+       int clean;
+       int result;
+       ENTRY;
+
+       /*
+        * Create top-level object slice. This will also create
+        * lu_object_header.
+        */
+       top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+       if (top == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+       if (IS_ERR(top))
+               RETURN(top);
+       /*
+        * This is the only place where object fid is assigned. It's constant
+        * after this point.
+        */
+       top->lo_header->loh_fid = *f;
+       layers = &top->lo_header->loh_layers;
+       do {
+               /*
+                * Call ->loo_object_init() repeatedly, until no more new
+                * object slices are created.
+                */
+               clean = 1;
+               list_for_each_entry(scan, layers, lo_linkage) {
+                       if (scan->lo_flags & LU_OBJECT_ALLOCATED)
+                               continue;
+                       clean = 0;
+                       scan->lo_header = top->lo_header;
+                       result = scan->lo_ops->loo_object_init(env, scan, conf);
+                       if (result != 0) {
+                               lu_object_free(env, top);
+                               RETURN(ERR_PTR(result));
+                       }
+                       scan->lo_flags |= LU_OBJECT_ALLOCATED;
+               }
+       } while (!clean);
+
+       list_for_each_entry_reverse(scan, layers, lo_linkage) {
+               if (scan->lo_ops->loo_object_start != NULL) {
+                       result = scan->lo_ops->loo_object_start(env, scan);
+                       if (result != 0) {
+                               lu_object_free(env, top);
+                               RETURN(ERR_PTR(result));
+                       }
+               }
+       }
+
+       lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+       RETURN(top);
+}
+
+/**
+ * Free an object.
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_site    *site;
+       struct lu_object        *scan;
+       struct list_head              *layers;
+       struct list_head               splice;
+
+       site   = o->lo_dev->ld_site;
+       layers = &o->lo_header->loh_layers;
+       bkt    = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+       /*
+        * First call ->loo_object_delete() method to release all resources.
+        */
+       list_for_each_entry_reverse(scan, layers, lo_linkage) {
+               if (scan->lo_ops->loo_object_delete != NULL)
+                       scan->lo_ops->loo_object_delete(env, scan);
+       }
+
+       /*
+        * Then, splice object layers into stand-alone list, and call
+        * ->loo_object_free() on all layers to free memory. Splice is
+        * necessary, because lu_object_header is freed together with the
+        * top-level slice.
+        */
+       INIT_LIST_HEAD(&splice);
+       list_splice_init(layers, &splice);
+       while (!list_empty(&splice)) {
+               /*
+                * Free layers in bottom-to-top order, so that object header
+                * lives as long as possible and ->loo_object_free() methods
+                * can look at its contents.
+                */
+               o = container_of0(splice.prev, struct lu_object, lo_linkage);
+               list_del_init(&o->lo_linkage);
+               LASSERT(o->lo_ops->loo_object_free != NULL);
+               o->lo_ops->loo_object_free(env, o);
+       }
+
+       if (waitqueue_active(&bkt->lsb_marche_funebre))
+               wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free \a nr objects from the cold end of the site LRU list.
+ */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+       struct lu_object_header *h;
+       struct lu_object_header *temp;
+       struct lu_site_bkt_data *bkt;
+       cfs_hash_bd_t       bd;
+       cfs_hash_bd_t       bd2;
+       struct list_head               dispose;
+       int                   did_sth;
+       int                   start;
+       int                   count;
+       int                   bnr;
+       int                   i;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+               RETURN(0);
+
+       INIT_LIST_HEAD(&dispose);
+       /*
+        * Under LRU list lock, scan LRU list and move unreferenced objects to
+        * the dispose list, removing them from LRU and hash table.
+        */
+       start = s->ls_purge_start;
+       bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+       did_sth = 0;
+       cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+               if (i < start)
+                       continue;
+               count = bnr;
+               cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+               bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+               list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+                       LASSERT(atomic_read(&h->loh_ref) == 0);
+
+                       cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+                       LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+                       cfs_hash_bd_del_locked(s->ls_obj_hash,
+                                              &bd2, &h->loh_hash);
+                       list_move(&h->loh_lru, &dispose);
+                       if (did_sth == 0)
+                               did_sth = 1;
+
+                       if (nr != ~0 && --nr == 0)
+                               break;
+
+                       if (count > 0 && --count == 0)
+                               break;
+
+               }
+               cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+               cond_resched();
+               /*
+                * Free everything on the dispose list. This is safe against
+                * races due to the reasons described in lu_object_put().
+                */
+               while (!list_empty(&dispose)) {
+                       h = container_of0(dispose.next,
+                                         struct lu_object_header, loh_lru);
+                       list_del_init(&h->loh_lru);
+                       lu_object_free(env, lu_object_top(h));
+                       lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+               }
+
+               if (nr == 0)
+                       break;
+       }
+
+       if (nr != 0 && did_sth && start != 0) {
+               start = 0; /* restart from the first bucket */
+               goto again;
+       }
+       /* race on s->ls_purge_start, but nobody cares */
+       s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+       return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
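+/*
+ * Usage note (illustrative): lu_site_purge(env, s, ~0) drains every
+ * unreferenced object from the site cache, while a finite nr frees at
+ * most nr cold objects and returns how many of the requested nr were
+ * left unfreed.
+ */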
+
+/*
+ * Object printing.
+ *
+ * The code below has to jump through certain hoops to output an object
+ * description into a libcfs_debug_msg-based log. The problem is that
+ * lu_object_print() composes the object description from strings that
+ * are parts of _lines_ of output (i.e., strings that are not terminated
+ * by a newline). This doesn't fit very well into the libcfs_debug_msg()
+ * interface, which assumes that each message supplied to it is a
+ * self-contained output line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (the value of lu_global_key, below) until a terminating newline
+ * character is detected.
+ *
+ */
+
+enum {
+       /**
+        * Maximal line size.
+        *
+        * XXX overflow is not handled correctly.
+        */
+       LU_CDEBUG_LINE = 512
+};
+
+struct lu_cdebug_data {
+       /**
+        * Temporary buffer.
+        */
+       char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key, holding temporary buffer. This key is registered very early by
+ * lu_global_init().
+ */
+struct lu_context_key lu_global_key = {
+       .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+                   LCT_MG_THREAD | LCT_CL_THREAD,
+       .lct_init = lu_global_key_init,
+       .lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+                     void *cookie, const char *format, ...)
+{
+       struct libcfs_debug_msg_data *msgdata = cookie;
+       struct lu_cdebug_data   *key;
+       int used;
+       int complete;
+       va_list args;
+
+       va_start(args, format);
+
+       key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+       LASSERT(key != NULL);
+
+       used = strlen(key->lck_area);
+       complete = format[strlen(format) - 1] == '\n';
+       /*
+        * Append new chunk to the buffer.
+        */
+       vsnprintf(key->lck_area + used,
+                 ARRAY_SIZE(key->lck_area) - used, format, args);
+       if (complete) {
+               if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+                       libcfs_debug_msg(msgdata, "%s", key->lck_area);
+               key->lck_area[0] = 0;
+       }
+       va_end(args);
+       return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
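+/*
+ * Example (illustrative): three consecutive calls printing "a", "b" and
+ * "c\n" emit one "abc" log line; chunks accumulate in the lu_global_key
+ * buffer until the chunk ending in a newline arrives.
+ */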
+
+/**
+ * Print object header.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t printer,
+                           const struct lu_object_header *hdr)
+{
+       (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+                  hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+                  PFID(&hdr->loh_fid),
+                  hlist_unhashed(&hdr->loh_hash) ? "" : " hash",
+                  list_empty((struct list_head *)&hdr->loh_lru) ?
+                  "" : " lru",
+                  hdr->loh_attr & LOHA_EXISTS ? " exist":"");
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print human readable representation of the \a o to the \a printer.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+                    lu_printer_t printer, const struct lu_object *o)
+{
+       static const char ruler[] = "........................................";
+       struct lu_object_header *top;
+       int depth;
+
+       top = o->lo_header;
+       lu_object_header_print(env, cookie, printer, top);
+       (*printer)(env, cookie, "{ \n");
+       list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+               depth = o->lo_depth + 4;
+
+               /*
+                * print `.' \a depth times followed by type name and address
+                */
+               (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
+                          o->lo_dev->ld_type->ldt_name, o);
+               if (o->lo_ops->loo_object_print != NULL)
+                       o->lo_ops->loo_object_print(env, cookie, printer, o);
+               (*printer)(env, cookie, "\n");
+       }
+       (*printer)(env, cookie, "} header@%p\n", top);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+       struct lu_object_header *top;
+
+       top = o->lo_header;
+       list_for_each_entry(o, &top->loh_layers, lo_linkage) {
+               if (o->lo_ops->loo_object_invariant != NULL &&
+                   !o->lo_ops->loo_object_invariant(o))
+                       return 0;
+       }
+       return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+static struct lu_object *htable_lookup(struct lu_site *s,
+                                      cfs_hash_bd_t *bd,
+                                      const struct lu_fid *f,
+                                      wait_queue_t *waiter,
+                                      __u64 *version)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object_header *h;
+       struct hlist_node       *hnode;
+       __u64  ver = cfs_hash_bd_version_get(bd);
+
+       if (*version == ver)
+               return NULL;
+
+       *version = ver;
+       bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+       /* cfs_hash_bd_peek_locked() is somewhat of an "internal" cfs_hash
+        * function; it doesn't take a refcount on the object. */
+       hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+       if (hnode == NULL) {
+               lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+               return NULL;
+       }
+
+       h = container_of0(hnode, struct lu_object_header, loh_hash);
+       if (likely(!lu_object_is_dying(h))) {
+               cfs_hash_get(s->ls_obj_hash, hnode);
+               lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+               list_del_init(&h->loh_lru);
+               return lu_object_top(h);
+       }
+
+       /*
+        * Lookup found an object that is being destroyed. Such an object
+        * cannot be returned (to ensure that references to dying objects
+        * are eventually drained); moreover, the lookup has to wait until
+        * the object is freed.
+        */
+
+       init_waitqueue_entry_current(waiter);
+       add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+       return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+                                struct lu_device *dev, const struct lu_fid *f,
+                                const struct lu_object_conf *conf)
+{
+       return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
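+/*
+ * A minimal usage sketch (hypothetical caller): every successful lookup
+ * must eventually be balanced by lu_object_put():
+ *
+ *     o = lu_object_find(env, dev, fid, NULL);
+ *     if (IS_ERR(o))
+ *             return PTR_ERR(o);
+ *     ...use the object...
+ *     lu_object_put(env, o);
+ */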
+
+static struct lu_object *lu_object_new(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf)
+{
+       struct lu_object        *o;
+       cfs_hash_t            *hs;
+       cfs_hash_bd_t       bd;
+       struct lu_site_bkt_data *bkt;
+
+       o = lu_object_alloc(env, dev, f, conf);
+       if (unlikely(IS_ERR(o)))
+               return o;
+
+       hs = dev->ld_site->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+       bkt = cfs_hash_bd_extra_get(hs, &bd);
+       cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+       bkt->lsb_busy++;
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       return o;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+                                           struct lu_device *dev,
+                                           const struct lu_fid *f,
+                                           const struct lu_object_conf *conf,
+                                           wait_queue_t *waiter)
+{
+       struct lu_object      *o;
+       struct lu_object      *shadow;
+       struct lu_site  *s;
+       cfs_hash_t          *hs;
+       cfs_hash_bd_t     bd;
+       __u64             version = 0;
+
+       /*
+        * This uses standard index maintenance protocol:
+        *
+        *     - search index under lock, and return object if found;
+        *     - otherwise, unlock index, allocate new object;
+        *     - lock index and search again;
+        *     - if nothing is found (usual case), insert newly created
+        *       object into index;
+        *     - otherwise (race: other thread inserted object), free
+        *       object just allocated.
+        *     - unlock index;
+        *     - return object.
+        *
+        * For "LOC_F_NEW" case, we are sure the object is new established.
+        * It is unnecessary to perform lookup-alloc-lookup-insert, instead,
+        * just alloc and insert directly.
+        *
+        * If dying object is found during index search, add @waiter to the
+        * site wait-queue and return ERR_PTR(-EAGAIN).
+        */
+       if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+               return lu_object_new(env, dev, f, conf);
+
+       s  = dev->ld_site;
+       hs = s->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+       o = htable_lookup(s, &bd, f, waiter, &version);
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       if (o != NULL)
+               return o;
+
+       /*
+        * Allocate new object. This may result in rather complicated
+        * operations, including fld queries, inode loading, etc.
+        */
+       o = lu_object_alloc(env, dev, f, conf);
+       if (unlikely(IS_ERR(o)))
+               return o;
+
+       LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+       cfs_hash_bd_lock(hs, &bd, 1);
+
+       shadow = htable_lookup(s, &bd, f, waiter, &version);
+       if (likely(shadow == NULL)) {
+               struct lu_site_bkt_data *bkt;
+
+               bkt = cfs_hash_bd_extra_get(hs, &bd);
+               cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+               bkt->lsb_busy++;
+               cfs_hash_bd_unlock(hs, &bd, 1);
+               return o;
+       }
+
+       lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+       cfs_hash_bd_unlock(hs, &bd, 1);
+       lu_object_free(env, o);
+       return shadow;
+}
+
+/**
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+                                   struct lu_device *dev,
+                                   const struct lu_fid *f,
+                                   const struct lu_object_conf *conf)
+{
+       struct lu_site_bkt_data *bkt;
+       struct lu_object        *obj;
+       wait_queue_t       wait;
+
+       while (1) {
+               obj = lu_object_find_try(env, dev, f, conf, &wait);
+               if (obj != ERR_PTR(-EAGAIN))
+                       return obj;
+               /*
+                * lu_object_find_try() already added waiter into the
+                * wait queue.
+                */
+               waitq_wait(&wait, TASK_UNINTERRUPTIBLE);
+               bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+               remove_wait_queue(&bkt->lsb_marche_funebre, &wait);
+       }
+}
+EXPORT_SYMBOL(lu_object_find_at);
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+                                      struct lu_device *dev,
+                                      const struct lu_fid *f,
+                                      const struct lu_object_conf *conf)
+{
+       struct lu_object *top;
+       struct lu_object *obj;
+
+       top = lu_object_find(env, dev, f, conf);
+       if (!IS_ERR(top)) {
+               obj = lu_object_locate(top->lo_header, dev->ld_type);
+               if (obj == NULL)
+                       lu_object_put(env, top);
+       } else
+               obj = top;
+       return obj;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ */
+static LIST_HEAD(lu_device_types);
+
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+       int result = 0;
+
+       INIT_LIST_HEAD(&ldt->ldt_linkage);
+       if (ldt->ldt_ops->ldto_init)
+               result = ldt->ldt_ops->ldto_init(ldt);
+       if (result == 0)
+               list_add(&ldt->ldt_linkage, &lu_device_types);
+       return result;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+       list_del_init(&ldt->ldt_linkage);
+       if (ldt->ldt_ops->ldto_fini)
+               ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+void lu_types_stop(void)
+{
+       struct lu_device_type *ldt;
+
+       list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+               if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop)
+                       ldt->ldt_ops->ldto_stop(ldt);
+       }
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker.
+ */
+static struct lu_env lu_shrink_env;
+
+struct lu_site_print_arg {
+       struct lu_env   *lsp_env;
+       void        *lsp_cookie;
+       lu_printer_t     lsp_printer;
+};
+
+static int
+lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                 struct hlist_node *hnode, void *data)
+{
+       struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data;
+       struct lu_object_header  *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       if (!list_empty(&h->loh_layers)) {
+               const struct lu_object *o;
+
+               o = lu_object_top(h);
+               lu_object_print(arg->lsp_env, arg->lsp_cookie,
+                               arg->lsp_printer, o);
+       } else {
+               lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+                                      arg->lsp_printer, h);
+       }
+       return 0;
+}
+
+/**
+ * Print all objects in \a s.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+                  lu_printer_t printer)
+{
+       struct lu_site_print_arg arg = {
+               .lsp_env     = (struct lu_env *)env,
+               .lsp_cookie  = cookie,
+               .lsp_printer = printer,
+       };
+
+       cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+enum {
+       LU_CACHE_PERCENT_MAX     = 50,
+       LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+               "Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ */
+static int lu_htable_order(void)
+{
+       unsigned long cache_size;
+       int bits;
+
+       /*
+        * Calculate hash table size, assuming that we want reasonable
+        * performance when 20% of total memory is occupied by the cache of
+        * lu_objects.
+        *
+        * The size of an lu_object is (arbitrarily) taken as 1K (together
+        * with the inode).
+        */
+       cache_size = num_physpages;
+
+#if BITS_PER_LONG == 32
+       /* limit hashtable size for lowmem systems to low RAM */
+       if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
+               cache_size = (1 << (30 - PAGE_CACHE_SHIFT)) * 3 / 4;
+#endif
+
+       /* clear off unreasonable cache setting. */
+       if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+               CWARN("obdclass: invalid lu_cache_percent: %u, it must be in"
+                     " the range of (0, %u]. Will use default value: %u.\n",
+                     lu_cache_percent, LU_CACHE_PERCENT_MAX,
+                     LU_CACHE_PERCENT_DEFAULT);
+
+               lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+       }
+       cache_size = cache_size / 100 * lu_cache_percent *
+               (PAGE_CACHE_SIZE / 1024);
+
+       for (bits = 1; (1 << bits) < cache_size; ++bits) {
+               ;
+       }
+       return bits;
+}
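+
+/*
+ * A worked example of the sizing above (assuming 4K pages, a 64-bit kernel
+ * and the default lu_cache_percent of 20): with num_physpages = 2^20 (4GB
+ * of RAM), cache_size = 2^20 / 100 * 20 * (4096 / 1024) = 838800 objects,
+ * and the loop returns bits = 20, since 2^19 < 838800 <= 2^20.
+ */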
+
+static unsigned lu_obj_hop_hash(cfs_hash_t *hs,
+                               const void *key, unsigned mask)
+{
+       struct lu_fid  *fid = (struct lu_fid *)key;
+       __u32      hash;
+
+       hash = fid_flatten32(fid);
+       hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */
+       hash = cfs_hash_long(hash, hs->hs_bkt_bits);
+
+       /* give me another random factor */
+       hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+       hash <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+       hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+       return hash & mask;
+}
+
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct lu_object_header, loh_hash);
+}
+
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       return &h->loh_fid;
+}
+
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key);
+}
+
+static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct lu_object_header *h;
+
+       h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+       if (atomic_add_return(1, &h->loh_ref) == 1) {
+               struct lu_site_bkt_data *bkt;
+               cfs_hash_bd_t       bd;
+
+               cfs_hash_bd_get(hs, &h->loh_fid, &bd);
+               bkt = cfs_hash_bd_extra_get(hs, &bd);
+               bkt->lsb_busy++;
+       }
+}
+
+static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       LBUG(); /* we should never call it */
+}
+
+cfs_hash_ops_t lu_site_hash_ops = {
+       .hs_hash        = lu_obj_hop_hash,
+       .hs_key  = lu_obj_hop_key,
+       .hs_keycmp      = lu_obj_hop_keycmp,
+       .hs_object      = lu_obj_hop_object,
+       .hs_get  = lu_obj_hop_get,
+       .hs_put_locked  = lu_obj_hop_put_locked,
+};
+
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+       spin_lock(&s->ls_ld_lock);
+       if (list_empty(&d->ld_linkage))
+               list_add(&d->ld_linkage, &s->ls_ld_linkage);
+       spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+       spin_lock(&s->ls_ld_lock);
+       list_del_init(&d->ld_linkage);
+       spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/**
+ * Initialize site \a s, with \a d as the top level device.
+ */
+#define LU_SITE_BITS_MIN    12
+#define LU_SITE_BITS_MAX    24
+/**
+ * 256 buckets in total; we don't want too many buckets because that would:
+ * - consume too much memory
+ * - lead to unbalanced LRU lists
+ */
+#define LU_SITE_BKT_BITS    8
+
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+       struct lu_site_bkt_data *bkt;
+       cfs_hash_bd_t bd;
+       char name[16];
+       int bits;
+       int i;
+       ENTRY;
+
+       memset(s, 0, sizeof *s);
+       bits = lu_htable_order();
+       snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name);
+       for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+            bits >= LU_SITE_BITS_MIN; bits--) {
+               s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+                                                bits - LU_SITE_BKT_BITS,
+                                                sizeof(*bkt), 0, 0,
+                                                &lu_site_hash_ops,
+                                                CFS_HASH_SPIN_BKTLOCK |
+                                                CFS_HASH_NO_ITEMREF |
+                                                CFS_HASH_DEPTH |
+                                                CFS_HASH_ASSERT_EMPTY);
+               if (s->ls_obj_hash != NULL)
+                       break;
+       }
+
+       if (s->ls_obj_hash == NULL) {
+               CERROR("failed to create lu_site hash with bits: %d\n", bits);
+               return -ENOMEM;
+       }
+
+       cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+               bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+               INIT_LIST_HEAD(&bkt->lsb_lru);
+               init_waitqueue_head(&bkt->lsb_marche_funebre);
+       }
+
+       s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+       if (s->ls_stats == NULL) {
+               cfs_hash_putref(s->ls_obj_hash);
+               s->ls_obj_hash = NULL;
+               return -ENOMEM;
+       }
+
+       lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+                            0, "created", "created");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+                            0, "cache_hit", "cache_hit");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+                            0, "cache_miss", "cache_miss");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+                            0, "cache_race", "cache_race");
+       lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+                            0, "cache_death_race", "cache_death_race");
+       lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+                            0, "lru_purged", "lru_purged");
+
+       INIT_LIST_HEAD(&s->ls_linkage);
+       s->ls_top_dev = top;
+       top->ld_site = s;
+       lu_device_get(top);
+       lu_ref_add(&top->ld_reference, "site-top", s);
+
+       INIT_LIST_HEAD(&s->ls_ld_linkage);
+       spin_lock_init(&s->ls_ld_lock);
+
+       lu_dev_add_linkage(s, top);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lu_site_init);
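+
+/*
+ * A hypothetical init/teardown pairing (variable names assumed): once the
+ * device stack is assembled, lu_site_init() is followed by
+ * lu_site_init_finish() below, and lu_site_fini() undoes both:
+ *
+ *	struct lu_site mysite;
+ *
+ *	rc = lu_site_init(&mysite, top);
+ *	if (rc == 0)
+ *		rc = lu_site_init_finish(&mysite);
+ *	...
+ *	lu_site_fini(&mysite);
+ */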
+
+/**
+ * Finalize \a s and release its resources.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+       mutex_lock(&lu_sites_guard);
+       list_del_init(&s->ls_linkage);
+       mutex_unlock(&lu_sites_guard);
+
+       if (s->ls_obj_hash != NULL) {
+               cfs_hash_putref(s->ls_obj_hash);
+               s->ls_obj_hash = NULL;
+       }
+
+       if (s->ls_top_dev != NULL) {
+               s->ls_top_dev->ld_site = NULL;
+               lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s);
+               lu_device_put(s->ls_top_dev);
+               s->ls_top_dev = NULL;
+       }
+
+       if (s->ls_stats != NULL)
+               lprocfs_free_stats(&s->ls_stats);
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+       int result;
+       mutex_lock(&lu_sites_guard);
+       result = lu_context_refill(&lu_shrink_env.le_ctx);
+       if (result == 0)
+               list_add(&s->ls_linkage, &lu_sites);
+       mutex_unlock(&lu_sites_guard);
+       return result;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Acquire an additional reference on device \a d.
+ */
+void lu_device_get(struct lu_device *d)
+{
+       atomic_inc(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Release reference on device \a d.
+ */
+void lu_device_put(struct lu_device *d)
+{
+       LASSERT(atomic_read(&d->ld_ref) > 0);
+       atomic_dec(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Initialize device \a d of type \a t.
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+       if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL)
+               t->ldt_ops->ldto_start(t);
+       memset(d, 0, sizeof *d);
+       atomic_set(&d->ld_ref, 0);
+       d->ld_type = t;
+       lu_ref_init(&d->ld_reference);
+       INIT_LIST_HEAD(&d->ld_linkage);
+       return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+       struct lu_device_type *t;
+
+       t = d->ld_type;
+       if (d->ld_obd != NULL) {
+               d->ld_obd->obd_lu_dev = NULL;
+               d->ld_obd = NULL;
+       }
+
+       lu_ref_fini(&d->ld_reference);
+       LASSERTF(atomic_read(&d->ld_ref) == 0,
+                "Refcount is %u\n", atomic_read(&d->ld_ref));
+       LASSERT(t->ldt_device_nr > 0);
+       if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL)
+               t->ldt_ops->ldto_stop(t);
+}
+EXPORT_SYMBOL(lu_device_fini);
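+
+/*
+ * Note on device lifetime (sketch; "mydev" is an assumed name): the first
+ * lu_device_init() for a given type triggers ldto_start(), and the last
+ * lu_device_fini() triggers ldto_stop():
+ *
+ *	rc = lu_device_init(&mydev->ld, &mydev_device_type);
+ *	...
+ *	lu_device_fini(&mydev->ld);
+ */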
+
+/**
+ * Initialize object \a o that is part of compound object \a h and was created
+ * by device \a d.
+ */
+int lu_object_init(struct lu_object *o,
+                  struct lu_object_header *h, struct lu_device *d)
+{
+       memset(o, 0, sizeof *o);
+       o->lo_header = h;
+       o->lo_dev    = d;
+       lu_device_get(d);
+       o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o);
+       INIT_LIST_HEAD(&o->lo_linkage);
+       return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object and release its resources.
+ */
+void lu_object_fini(struct lu_object *o)
+{
+       struct lu_device *dev = o->lo_dev;
+
+       LASSERT(list_empty(&o->lo_linkage));
+
+       if (dev != NULL) {
+               lu_ref_del_at(&dev->ld_reference,
+                             o->lo_dev_ref , "lu_object", o);
+               lu_device_put(dev);
+               o->lo_dev = NULL;
+       }
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Add object \a o as first layer of compound object \a h
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+       list_move(&o->lo_linkage, &h->loh_layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Add object \a o as a layer of compound object, going after \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+       list_move(&o->lo_linkage, &before->lo_linkage);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object.
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+       memset(h, 0, sizeof *h);
+       atomic_set(&h->loh_ref, 1);
+       INIT_HLIST_NODE(&h->loh_hash);
+       INIT_LIST_HEAD(&h->loh_lru);
+       INIT_LIST_HEAD(&h->loh_layers);
+       lu_ref_init(&h->loh_reference);
+       return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+       LASSERT(list_empty(&h->loh_layers));
+       LASSERT(list_empty(&h->loh_lru));
+       LASSERT(hlist_unhashed(&h->loh_hash));
+       lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+                                  const struct lu_device_type *dtype)
+{
+       struct lu_object *o;
+
+       list_for_each_entry(o, &h->loh_layers, lo_linkage) {
+               if (o->lo_dev->ld_type == dtype)
+                       return o;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * Finalize device stack by purging object cache, and calling
+ * lu_device_type_operations::ldto_device_fini() and
+ * lu_device_type_operations::ldto_device_free() on all devices in the stack.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+       struct lu_site   *site = top->ld_site;
+       struct lu_device *scan;
+       struct lu_device *next;
+
+       lu_site_purge(env, site, ~0);
+       for (scan = top; scan != NULL; scan = next) {
+               next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan);
+               lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init);
+               lu_device_put(scan);
+       }
+
+       /* purge again. */
+       lu_site_purge(env, site, ~0);
+
+       for (scan = top; scan != NULL; scan = next) {
+               const struct lu_device_type *ldt = scan->ld_type;
+               struct obd_type      *type;
+
+               next = ldt->ldt_ops->ldto_device_free(env, scan);
+               type = ldt->ldt_obd_type;
+               if (type != NULL) {
+                       type->typ_refcnt--;
+                       class_put_type(type);
+               }
+       }
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+       /**
+        * Maximal number of TLD (thread-local data) slots.
+        */
+       LU_CONTEXT_KEY_NR = 40
+};
+
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, };
+
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever a key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version = 0;
+
+/**
+ * Register new key.
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+       int result;
+       int i;
+
+       LASSERT(key->lct_init != NULL);
+       LASSERT(key->lct_fini != NULL);
+       LASSERT(key->lct_tags != 0);
+       LASSERT(key->lct_owner != NULL);
+
+       result = -ENFILE;
+       spin_lock(&lu_keys_guard);
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+               if (lu_keys[i] == NULL) {
+                       key->lct_index = i;
+                       atomic_set(&key->lct_used, 1);
+                       lu_keys[i] = key;
+                       lu_ref_init(&key->lct_reference);
+                       result = 0;
+                       ++key_set_version;
+                       break;
+               }
+       }
+       spin_unlock(&lu_keys_guard);
+       return result;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+static void key_fini(struct lu_context *ctx, int index)
+{
+       if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) {
+               struct lu_context_key *key;
+
+               key = lu_keys[index];
+               LASSERT(key != NULL);
+               LASSERT(key->lct_fini != NULL);
+               LASSERT(atomic_read(&key->lct_used) > 1);
+
+               key->lct_fini(ctx, key, ctx->lc_value[index]);
+               lu_ref_del(&key->lct_reference, "ctx", ctx);
+               atomic_dec(&key->lct_used);
+
+               LASSERT(key->lct_owner != NULL);
+               if ((ctx->lc_tags & LCT_NOREF) == 0) {
+#ifdef CONFIG_MODULE_UNLOAD
+                       LINVRNT(module_refcount(key->lct_owner) > 0);
+#endif
+                       module_put(key->lct_owner);
+               }
+               ctx->lc_value[index] = NULL;
+       }
+}
+
+/**
+ * Deregister key.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+       LASSERT(atomic_read(&key->lct_used) >= 1);
+       LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+       lu_context_key_quiesce(key);
+
+       ++key_set_version;
+       spin_lock(&lu_keys_guard);
+       key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+       if (lu_keys[key->lct_index]) {
+               lu_keys[key->lct_index] = NULL;
+               lu_ref_fini(&key->lct_reference);
+       }
+       spin_unlock(&lu_keys_guard);
+
+       LASSERTF(atomic_read(&key->lct_used) == 1,
+                "key has instances: %d\n",
+                atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a number of keys. This has to be called after all keys have been
+ * initialized by a call to LU_CONTEXT_KEY_INIT().
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+       struct lu_context_key *key = k;
+       va_list args;
+       int result;
+
+       va_start(args, k);
+       do {
+               result = lu_context_key_register(key);
+               if (result)
+                       break;
+               key = va_arg(args, struct lu_context_key *);
+       } while (key != NULL);
+       va_end(args);
+
+       if (result != 0) {
+               va_start(args, k);
+               while (k != key) {
+                       lu_context_key_degister(k);
+                       k = va_arg(args, struct lu_context_key *);
+               }
+               va_end(args);
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
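+
+/*
+ * Hypothetical usage (key names assumed): the argument list must be
+ * NULL-terminated, and on failure all keys registered so far are rolled
+ * back automatically:
+ *
+ *	LU_CONTEXT_KEY_INIT(&key_a);
+ *	LU_CONTEXT_KEY_INIT(&key_b);
+ *	rc = lu_context_key_register_many(&key_a, &key_b, NULL);
+ */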
+
+/**
+ * De-register a number of keys. This is a dual to
+ * lu_context_key_register_many().
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_degister(k);
+               k = va_arg(args, struct lu_context_key*);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a number of keys.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_revive(k);
+               k = va_arg(args, struct lu_context_key*);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a number of keys.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+       va_list args;
+
+       va_start(args, k);
+       do {
+               lu_context_key_quiesce(k);
+               k = va_arg(args, struct lu_context_key*);
+       } while (k != NULL);
+       va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Return value associated with key \a key in context \a ctx.
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+                        const struct lu_context_key *key)
+{
+       LINVRNT(ctx->lc_state == LCS_ENTERED);
+       LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+       LASSERT(lu_keys[key->lct_index] == key);
+       return ctx->lc_value[key->lct_index];
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts. XXX document me.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+       struct lu_context *ctx;
+
+       if (!(key->lct_tags & LCT_QUIESCENT)) {
+               /*
+                * XXX layering violation.
+                */
+               key->lct_tags |= LCT_QUIESCENT;
+               /*
+                * XXX memory barrier has to go here.
+                */
+               spin_lock(&lu_keys_guard);
+               list_for_each_entry(ctx, &lu_context_remembered,
+                                       lc_remember)
+                       key_fini(ctx, key->lct_index);
+               spin_unlock(&lu_keys_guard);
+               ++key_set_version;
+       }
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+void lu_context_key_revive(struct lu_context_key *key)
+{
+       key->lct_tags &= ~LCT_QUIESCENT;
+       ++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+static void keys_fini(struct lu_context *ctx)
+{
+       int     i;
+
+       if (ctx->lc_value == NULL)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i)
+               key_fini(ctx, i);
+
+       OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+       ctx->lc_value = NULL;
+}
+
+static int keys_fill(struct lu_context *ctx)
+{
+       int i;
+
+       LINVRNT(ctx->lc_value != NULL);
+       for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+               struct lu_context_key *key;
+
+               key = lu_keys[i];
+               if (ctx->lc_value[i] == NULL && key != NULL &&
+                   (key->lct_tags & ctx->lc_tags) &&
+                   /*
+                    * Don't create values for a LCT_QUIESCENT key, as this
+                    * would pin the module owning the key.
+                    */
+                   !(key->lct_tags & LCT_QUIESCENT)) {
+                       void *value;
+
+                       LINVRNT(key->lct_init != NULL);
+                       LINVRNT(key->lct_index == i);
+
+                       value = key->lct_init(ctx, key);
+                       if (unlikely(IS_ERR(value)))
+                               return PTR_ERR(value);
+
+                       LASSERT(key->lct_owner != NULL);
+                       if (!(ctx->lc_tags & LCT_NOREF))
+                               try_module_get(key->lct_owner);
+                       lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+                       atomic_inc(&key->lct_used);
+                       /*
+                        * This is the only place in the code where an
+                        * element of the ctx->lc_value[] array is set to a
+                        * non-NULL value.
+                        */
+                       ctx->lc_value[i] = value;
+                       if (key->lct_exit != NULL)
+                               ctx->lc_tags |= LCT_HAS_EXIT;
+               }
+               ctx->lc_version = key_set_version;
+       }
+       return 0;
+}
+
+static int keys_init(struct lu_context *ctx)
+{
+       OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+       if (likely(ctx->lc_value != NULL))
+               return keys_fill(ctx);
+
+       return -ENOMEM;
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+       int     rc;
+
+       memset(ctx, 0, sizeof *ctx);
+       ctx->lc_state = LCS_INITIALIZED;
+       ctx->lc_tags = tags;
+       if (tags & LCT_REMEMBER) {
+               spin_lock(&lu_keys_guard);
+               list_add(&ctx->lc_remember, &lu_context_remembered);
+               spin_unlock(&lu_keys_guard);
+       } else {
+               INIT_LIST_HEAD(&ctx->lc_remember);
+       }
+
+       rc = keys_init(ctx);
+       if (rc != 0)
+               lu_context_fini(ctx);
+
+       return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+       LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+       ctx->lc_state = LCS_FINALIZED;
+
+       if ((ctx->lc_tags & LCT_REMEMBER) == 0) {
+               LASSERT(list_empty(&ctx->lc_remember));
+               keys_fini(ctx);
+
+       } else { /* could race with key degister */
+               spin_lock(&lu_keys_guard);
+               keys_fini(ctx);
+               list_del_init(&ctx->lc_remember);
+               spin_unlock(&lu_keys_guard);
+       }
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+       LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+       ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after exiting from \a ctx
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+       int i;
+
+       LINVRNT(ctx->lc_state == LCS_ENTERED);
+       ctx->lc_state = LCS_LEFT;
+       if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) {
+               for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+                       if (ctx->lc_value[i] != NULL) {
+                               struct lu_context_key *key;
+
+                               key = lu_keys[i];
+                               LASSERT(key != NULL);
+                               if (key->lct_exit != NULL)
+                                       key->lct_exit(ctx,
+                                                     key, ctx->lc_value[i]);
+                       }
+               }
+       }
+}
+EXPORT_SYMBOL(lu_context_exit);
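+
+/*
+ * The usual context lifecycle, as a hypothetical sketch (the LCT_LOCAL tag
+ * is an assumed choice): a context is initialized once, entered and exited
+ * around each usage, and finalized from LCS_LEFT (or LCS_INITIALIZED):
+ *
+ *	struct lu_context ctx;
+ *
+ *	rc = lu_context_init(&ctx, LCT_LOCAL);
+ *	if (rc != 0)
+ *		return rc;
+ *	lu_context_enter(&ctx);
+ *	... lu_context_key_get(&ctx, &some_key) ...
+ *	lu_context_exit(&ctx);
+ *	lu_context_fini(&ctx);
+ */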
+
+/**
+ * Allocate for context all missing keys that were registered after context
+ * creation. key_set_version is only changed in rare cases when modules
+ * are loaded and removed.
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+       return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_ctx_tags/lu_ses_tags are updated when new types of obd devices are
+ * added. Currently this is only used on the client side, specifically by the
+ * echo device client; for other stacks (like ptlrpc threads), contexts are
+ * predefined when the lu_device type is registered, during the module probe
+ * phase.
+ */
+__u32 lu_context_tags_default = 0;
+__u32 lu_session_tags_default = 0;
+
+void lu_context_tags_update(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_context_tags_default |= tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+void lu_context_tags_clear(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_context_tags_default &= ~tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+void lu_session_tags_update(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_session_tags_default |= tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+void lu_session_tags_clear(__u32 tags)
+{
+       spin_lock(&lu_keys_guard);
+       lu_session_tags_default &= ~tags;
+       key_set_version++;
+       spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+       int result;
+
+       env->le_ses = NULL;
+       result = lu_context_init(&env->le_ctx, tags);
+       if (likely(result == 0))
+               lu_context_enter(&env->le_ctx);
+       return result;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+void lu_env_fini(struct lu_env *env)
+{
+       lu_context_exit(&env->le_ctx);
+       lu_context_fini(&env->le_ctx);
+       env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
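+
+/*
+ * lu_env wraps the per-thread context; a hypothetical caller (tag assumed)
+ * brackets its work with lu_env_init()/lu_env_fini():
+ *
+ *	struct lu_env env;
+ *
+ *	rc = lu_env_init(&env, LCT_LOCAL);
+ *	if (rc != 0)
+ *		return rc;
+ *	... pass &env down the stack ...
+ *	lu_env_fini(&env);
+ */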
+
+int lu_env_refill(struct lu_env *env)
+{
+       int result;
+
+       result = lu_context_refill(&env->le_ctx);
+       if (result == 0 && env->le_ses != NULL)
+               result = lu_context_refill(env->le_ses);
+       return result;
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently this API is only used by the echo client, because the echo
+ * client and the normal Lustre client share the same cl_env cache, so the
+ * echo client needs to refresh the env context after getting one from the
+ * cache, especially when a normal client and an echo client co-exist on
+ * the same node.
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags,
+                         __u32 stags)
+{
+       int    result;
+
+       if ((env->le_ctx.lc_tags & ctags) != ctags) {
+               env->le_ctx.lc_version = 0;
+               env->le_ctx.lc_tags |= ctags;
+       }
+
+       if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) {
+               env->le_ses->lc_version = 0;
+               env->le_ses->lc_tags |= stags;
+       }
+
+       result = lu_env_refill(env);
+
+       return result;
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+static struct shrinker *lu_site_shrinker = NULL;
+
+typedef struct lu_site_stats {
+       unsigned        lss_populated;
+       unsigned        lss_max_search;
+       unsigned        lss_total;
+       unsigned        lss_busy;
+} lu_site_stats_t;
+
+static void lu_site_stats_get(cfs_hash_t *hs,
+                             lu_site_stats_t *stats, int populated)
+{
+       cfs_hash_bd_t bd;
+       int        i;
+
+       cfs_hash_for_each_bucket(hs, &bd, i) {
+               struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd);
+               struct hlist_head       *hhead;
+
+               cfs_hash_bd_lock(hs, &bd, 1);
+               stats->lss_busy  += bkt->lsb_busy;
+               stats->lss_total += cfs_hash_bd_count_get(&bd);
+               stats->lss_max_search = max((int)stats->lss_max_search,
+                                           cfs_hash_bd_depmax_get(&bd));
+               if (!populated) {
+                       cfs_hash_bd_unlock(hs, &bd, 1);
+                       continue;
+               }
+
+               cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+                       if (!hlist_empty(hhead))
+                               stats->lss_populated++;
+               }
+               cfs_hash_bd_unlock(hs, &bd, 1);
+       }
+}
+
+
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock locks and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously, neither thread will wake and drop its respective hold
+ * on its lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
+static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       lu_site_stats_t stats;
+       struct lu_site *s;
+       struct lu_site *tmp;
+       int cached = 0;
+       int remain = shrink_param(sc, nr_to_scan);
+       LIST_HEAD(splice);
+
+       if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+               if (remain != 0)
+                       return -1;
+               else
+                       /* We must not take the lu_sites_guard lock when
+                        * __GFP_FS is *not* set because of the deadlock
+                        * possibility detailed above. Additionally,
+                        * since we cannot determine the number of
+                        * objects in the cache without taking this
+                        * lock, we're in a particularly tough spot. As
+                        * a result, we'll just lie and say our cache is
+                        * empty. This _should_ be ok, as we can't
+                        * reclaim objects when __GFP_FS is *not* set
+                        * anyway.
+                        */
+                       return 0;
+       }
+
+       CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+       mutex_lock(&lu_sites_guard);
+       list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+               if (shrink_param(sc, nr_to_scan) != 0) {
+                       remain = lu_site_purge(&lu_shrink_env, s, remain);
+                       /*
+                        * Move just shrunk site to the tail of site list to
+                        * assure shrinking fairness.
+                        */
+                       list_move_tail(&s->ls_linkage, &splice);
+               }
+
+               memset(&stats, 0, sizeof(stats));
+               lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+               cached += stats.lss_total - stats.lss_busy;
+               if (shrink_param(sc, nr_to_scan) && remain <= 0)
+                       break;
+       }
+       list_splice(&splice, lu_sites.prev);
+       mutex_unlock(&lu_sites_guard);
+
+       cached = (cached / 100) * sysctl_vfs_cache_pressure;
+       if (shrink_param(sc, nr_to_scan) == 0)
+               CDEBUG(D_INODE, "%d objects cached\n", cached);
+       return cached;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk().
+ */
+int lu_printk_printer(const struct lu_env *env,
+                     void *unused, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       vprintk(format, args);
+       va_end(args);
+       return 0;
+}
+
+/**
+ * Initialization of global lu_* data.
+ */
+int lu_global_init(void)
+{
+       int result;
+
+       CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+       result = lu_ref_global_init();
+       if (result != 0)
+               return result;
+
+       LU_CONTEXT_KEY_INIT(&lu_global_key);
+       result = lu_context_key_register(&lu_global_key);
+       if (result != 0)
+               return result;
+
+       /*
+        * At this level, we don't know what tags are needed, so allocate them
+        * conservatively. This should not be too bad, because this
+        * environment is global.
+        */
+       mutex_lock(&lu_sites_guard);
+       result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+       mutex_unlock(&lu_sites_guard);
+       if (result != 0)
+               return result;
+
+       /*
+        * seeks estimation: 3 seeks to read a record from oi, one to read
+        * inode, one for ea. Unfortunately setting this high value results in
+        * lu_object/inode cache consuming all the memory.
+        */
+       lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink);
+       if (lu_site_shrinker == NULL)
+               return -ENOMEM;
+
+       return result;
+}
+
+/**
+ * Dual to lu_global_init().
+ */
+void lu_global_fini(void)
+{
+       if (lu_site_shrinker != NULL) {
+               remove_shrinker(lu_site_shrinker);
+               lu_site_shrinker = NULL;
+       }
+
+       lu_context_key_degister(&lu_global_key);
+
+       /*
+        * Tear shrinker environment down _after_ de-registering
+        * lu_global_key, because the latter has a value in the former.
+        */
+       mutex_lock(&lu_sites_guard);
+       lu_env_fini(&lu_shrink_env);
+       mutex_unlock(&lu_sites_guard);
+
+       lu_ref_global_fini();
+}
+
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+#ifdef LPROCFS
+       struct lprocfs_counter ret;
+
+       lprocfs_stats_collect(stats, idx, &ret);
+       return (__u32)ret.lc_count;
+#else
+       return 0;
+#endif
+}
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * lprocfs_rd_*()-style functions.
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
+{
+       lu_site_stats_t stats;
+
+       memset(&stats, 0, sizeof(stats));
+       lu_site_stats_get(s->ls_obj_hash, &stats, 1);
+
+       return seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+                       stats.lss_busy,
+                       stats.lss_total,
+                       stats.lss_populated,
+                       CFS_HASH_NHLIST(s->ls_obj_hash),
+                       stats.lss_max_search,
+                       ls_stats_read(s->ls_stats, LU_SS_CREATED),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
+                       ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
+                       ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+}
+EXPORT_SYMBOL(lu_site_stats_print);
+
+/**
+ * Helper function to initialize a number of kmem slab caches at once.
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+       int result;
+       struct lu_kmem_descr *iter = caches;
+
+       for (result = 0; iter->ckd_cache != NULL; ++iter) {
+               *iter->ckd_cache = kmem_cache_create(iter->ckd_name,
+                                                       iter->ckd_size,
+                                                       0, 0, NULL);
+               if (*iter->ckd_cache == NULL) {
+                       result = -ENOMEM;
+                       /* free all previously allocated caches */
+                       lu_kmem_fini(caches);
+                       break;
+               }
+       }
+       return result;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Helper function to finalize a number of kmem slab caches at once. Dual to
+ * lu_kmem_init().
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+       for (; caches->ckd_cache != NULL; ++caches) {
+               if (*caches->ckd_cache != NULL) {
+                       kmem_cache_destroy(*caches->ckd_cache);
+                       *caches->ckd_cache = NULL;
+               }
+       }
+}
+EXPORT_SYMBOL(lu_kmem_fini);
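+
+/*
+ * A hypothetical cache table (all "myobj" names assumed): the array is
+ * terminated by a NULL ckd_cache, and lu_kmem_init() frees the caches
+ * created so far if any allocation fails:
+ *
+ *	static struct kmem_cache *myobj_kmem;
+ *
+ *	static struct lu_kmem_descr myobj_caches[] = {
+ *		{
+ *			.ckd_cache = &myobj_kmem,
+ *			.ckd_name  = "myobj_kmem",
+ *			.ckd_size  = sizeof(struct myobj)
+ *		},
+ *		{
+ *			.ckd_cache = NULL
+ *		}
+ *	};
+ *
+ *	rc = lu_kmem_init(myobj_caches);
+ *	...
+ *	lu_kmem_fini(myobj_caches);
+ */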
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+                         const struct lu_fid *fid)
+{
+       struct lu_site          *s = o->lo_dev->ld_site;
+       struct lu_fid           *old = &o->lo_header->loh_fid;
+       struct lu_site_bkt_data *bkt;
+       struct lu_object        *shadow;
+       wait_queue_t             waiter;
+       cfs_hash_t              *hs;
+       cfs_hash_bd_t            bd;
+       __u64                    version = 0;
+
+       LASSERT(fid_is_zero(old));
+
+       hs = s->ls_obj_hash;
+       cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1);
+       shadow = htable_lookup(s, &bd, fid, &waiter, &version);
+       /* supposed to be unique */
+       LASSERT(shadow == NULL);
+       *old = *fid;
+       bkt = cfs_hash_bd_extra_get(hs, &bd);
+       cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+       bkt->lsb_busy++;
+       cfs_hash_bd_unlock(hs, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocates an object with a zero (non-assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ *      till we have fully-functional OST fids
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+                                struct lu_device *dev,
+                                const struct lu_object_conf *conf)
+{
+       struct lu_fid     fid;
+       struct lu_object *o;
+
+       fid_zero(&fid);
+       o = lu_object_alloc(env, dev, &fid, conf);
+
+       return o;
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+struct lu_buf LU_BUF_NULL = {
+       .lb_buf = NULL,
+       .lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+void lu_buf_free(struct lu_buf *buf)
+{
+       LASSERT(buf);
+       if (buf->lb_buf) {
+               LASSERT(buf->lb_len > 0);
+               OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+               buf->lb_buf = NULL;
+               buf->lb_len = 0;
+       }
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+       LASSERT(buf);
+       LASSERT(buf->lb_buf == NULL);
+       LASSERT(buf->lb_len == 0);
+       OBD_ALLOC_LARGE(buf->lb_buf, size);
+       if (likely(buf->lb_buf))
+               buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+       lu_buf_free(buf);
+       lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+       if (buf->lb_buf == NULL && buf->lb_len == 0)
+               lu_buf_alloc(buf, len);
+
+       if ((len > buf->lb_len) && (buf->lb_buf != NULL))
+               lu_buf_realloc(buf, len);
+
+       return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Increase the size of \a buf.
+ * Preserves the old data in the buffer;
+ * the old buffer remains unchanged on error.
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+       char *ptr;
+
+       if (len <= buf->lb_len)
+               return 0;
+
+       OBD_ALLOC_LARGE(ptr, len);
+       if (ptr == NULL)
+               return -ENOMEM;
+
+       /* Free the old buf */
+       if (buf->lb_buf != NULL) {
+               memcpy(ptr, buf->lb_buf, buf->lb_len);
+               OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+       }
+
+       buf->lb_buf = ptr;
+       buf->lb_len = len;
+       return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);
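+
+/*
+ * A minimal lu_buf round trip (hypothetical; sizes are arbitrary): start
+ * from a zeroed buffer, grow on demand preserving contents, free when done:
+ *
+ *	struct lu_buf buf = LU_BUF_NULL;
+ *
+ *	if (lu_buf_check_and_grow(&buf, 4096) != 0)
+ *		return -ENOMEM;
+ *	... fill buf.lb_buf ...
+ *	rc = lu_buf_check_and_grow(&buf, 8192);	/* old 4096 bytes kept */
+ *	...
+ *	lu_buf_free(&buf);
+ */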
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644 (file)
index 0000000..23a76f1
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lu_ref.h>
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
new file mode 100644 (file)
index 0000000..229db6c
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ucred.c
+ *
+ * Lustre user credentials, kept in the session context of a lu_env.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <lu_object.h>
+#include <md_object.h>
+
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+static struct lu_context_key lu_ucred_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = lu_ucred_key_init,
+       .lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Get ucred key if session exists and ucred key is allocated on it.
+ * Return NULL otherwise.
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+       if (!env->le_ses)
+               return NULL;
+       return lu_context_key_get(env->le_ses, &lu_ucred_key);
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Get ucred key and check if it is properly initialized.
+ * Return NULL otherwise.
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+       struct lu_ucred *uc = lu_ucred(env);
+       if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW)
+               return NULL;
+       return uc;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Get ucred key, which must exist and must be properly initialized.
+ * Assert otherwise.
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+       struct lu_ucred *uc = lu_ucred_check(env);
+       LASSERT(uc != NULL);
+       return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
+
+int lu_ucred_global_init(void)
+{
+       LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+       return lu_context_key_register(&lu_ucred_key);
+}
+
+void lu_ucred_global_fini(void)
+{
+       lu_context_key_degister(&lu_ucred_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644 (file)
index 0000000..69d6499
--- /dev/null
@@ -0,0 +1,263 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_handles.h>
+#include <lustre_lib.h>
+
+
+static __u64 handle_base;
+#define HANDLE_INCR 7
+static spinlock_t handle_base_lock;
+
+static struct handle_bucket {
+       spinlock_t      lock;
+       struct list_head        head;
+} *handle_hash;
+
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table.
+ */
+void class_handle_hash(struct portals_handle *h,
+                      struct portals_handle_ops *ops)
+{
+       struct handle_bucket *bucket;
+       ENTRY;
+
+       LASSERT(h != NULL);
+       LASSERT(list_empty(&h->h_link));
+
+       /*
+        * This is a fast but simplistic cookie-generation algorithm; it will
+        * need a redo at some point in the future for security.
+        */
+       spin_lock(&handle_base_lock);
+       handle_base += HANDLE_INCR;
+
+       if (unlikely(handle_base == 0)) {
+               /*
+                * Cookie of zero is "dangerous", because in many places it's
+                * assumed that 0 means "unassigned" handle, not bound to any
+                * object.
+                */
+               CWARN("The universe has been exhausted: cookie wrap-around.\n");
+               handle_base += HANDLE_INCR;
+       }
+       h->h_cookie = handle_base;
+       spin_unlock(&handle_base_lock);
+
+       h->h_ops = ops;
+       spin_lock_init(&h->h_lock);
+
+       bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+       spin_lock(&bucket->lock);
+       list_add_rcu(&h->h_link, &bucket->head);
+       h->h_in = 1;
+       spin_unlock(&bucket->lock);
+
+       CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+              h, h->h_cookie);
+       EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash);
+
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+       if (list_empty(&h->h_link)) {
+               CERROR("removing an already-removed handle ("LPX64")\n",
+                      h->h_cookie);
+               return;
+       }
+
+       CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+              h, h->h_cookie);
+
+       spin_lock(&h->h_lock);
+       if (h->h_in == 0) {
+               spin_unlock(&h->h_lock);
+               return;
+       }
+       h->h_in = 0;
+       spin_unlock(&h->h_lock);
+       list_del_rcu(&h->h_link);
+}
+
+void class_handle_unhash(struct portals_handle *h)
+{
+       struct handle_bucket *bucket;
+       bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+       spin_lock(&bucket->lock);
+       class_handle_unhash_nolock(h);
+       spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+void class_handle_hash_back(struct portals_handle *h)
+{
+       struct handle_bucket *bucket;
+       ENTRY;
+
+       bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+       spin_lock(&bucket->lock);
+       list_add_rcu(&h->h_link, &bucket->head);
+       h->h_in = 1;
+       spin_unlock(&bucket->lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+void *class_handle2object(__u64 cookie)
+{
+       struct handle_bucket *bucket;
+       struct portals_handle *h;
+       void *retval = NULL;
+       ENTRY;
+
+       LASSERT(handle_hash != NULL);
+
+       /* Be careful when you want to change this code. See the
+        * rcu_read_lock() definition at the top of this file. - jxiong */
+       bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(h, &bucket->head, h_link) {
+               if (h->h_cookie != cookie)
+                       continue;
+
+               spin_lock(&h->h_lock);
+               if (likely(h->h_in != 0)) {
+                       h->h_ops->hop_addref(h);
+                       retval = h;
+               }
+               spin_unlock(&h->h_lock);
+               break;
+       }
+       rcu_read_unlock();
+
+       RETURN(retval);
+}
+EXPORT_SYMBOL(class_handle2object);
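+
+/*
+ * A hypothetical handle round trip (all "myobj" names assumed): the owner
+ * embeds a struct portals_handle, hashes it with ops providing
+ * hop_addref()/hop_free(), hands out h_cookie, and later resolves the
+ * cookie back to the object:
+ *
+ *	class_handle_hash(&obj->h, &myobj_handle_ops);
+ *	cookie = obj->h.h_cookie;
+ *	...
+ *	obj = class_handle2object(cookie);
+ *	if (obj != NULL)
+ *		... a reference was taken via hop_addref() ...
+ *	...
+ *	class_handle_unhash(&obj->h);
+ */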
+
+void class_handle_free_cb(cfs_rcu_head_t *rcu)
+{
+       struct portals_handle *h = RCU2HANDLE(rcu);
+       void *ptr = (void *)(unsigned long)h->h_cookie;
+
+       if (h->h_ops->hop_free != NULL)
+               h->h_ops->hop_free(ptr, h->h_size);
+       else
+               OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+int class_handle_init(void)
+{
+       struct handle_bucket *bucket;
+       struct timeval tv;
+       int seed[2];
+
+       LASSERT(handle_hash == NULL);
+
+       OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE);
+       if (handle_hash == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&handle_base_lock);
+       for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash;
+            bucket--) {
+               INIT_LIST_HEAD(&bucket->head);
+               spin_lock_init(&bucket->lock);
+       }
+
+       /** bug 21430: add randomness to the initial base */
+       cfs_get_random_bytes(seed, sizeof(seed));
+       do_gettimeofday(&tv);
+       cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+       cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+       LASSERT(handle_base != 0ULL);
+
+       return 0;
+}
+
+static int cleanup_all_handles(void)
+{
+       int rc;
+       int i;
+
+       for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+               struct portals_handle *h;
+
+               spin_lock(&handle_hash[i].lock);
+               list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+                       CERROR("force clean handle "LPX64" addr %p ops %p\n",
+                              h->h_cookie, h, h->h_ops);
+
+                       class_handle_unhash_nolock(h);
+                       rc++;
+               }
+               spin_unlock(&handle_hash[i].lock);
+       }
+
+       return rc;
+}
+
+void class_handle_cleanup(void)
+{
+       int count;
+       LASSERT(handle_hash != NULL);
+
+       count = cleanup_all_handles();
+
+       OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+       handle_hash = NULL;
+
+       if (count != 0)
+               CERROR("handle_count at cleanup: %d\n", count);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644 (file)
index 0000000..2fa2589
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+#define NIDS_MAX       32
+
+struct uuid_nid_data {
+       struct list_head       un_list;
+       struct obd_uuid  un_uuid;
+       int           un_nid_count;
+       lnet_nid_t       un_nids[NIDS_MAX];
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head        g_uuid_list;
+static spinlock_t      g_uuid_lock;
+
+void class_init_uuidlist(void)
+{
+       INIT_LIST_HEAD(&g_uuid_list);
+       spin_lock_init(&g_uuid_lock);
+}
+
+void class_exit_uuidlist(void)
+{
+       /* delete all */
+       class_del_uuid(NULL);
+}
+
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+       struct uuid_nid_data *data;
+       struct obd_uuid tmp;
+       int rc = -ENOENT;
+
+       obd_str2uuid(&tmp, uuid);
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(data, &g_uuid_list, un_list) {
+               if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+                       if (index >= data->un_nid_count)
+                               break;
+
+                       rc = 0;
+                       *peer_nid = data->un_nids[index];
+                       break;
+               }
+       }
+       spin_unlock(&g_uuid_lock);
+       return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
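+
+/*
+ * Usage sketch (hypothetical caller; the uuid string is illustrative):
+ * the index argument lets a caller walk every NID registered for a
+ * uuid until -ENOENT comes back:
+ *
+ *     lnet_nid_t nid;
+ *     int i;
+ *
+ *     for (i = 0; lustre_uuid_to_peer("mds1_UUID", &nid, i) == 0; i++)
+ *             CDEBUG(D_INFO, "nid[%d]: %s\n", i, libcfs_nid2str(nid));
+ */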
+
+/* Add a nid to a niduuid.  Multiple nids can be added to a single uuid;
+   LNET will choose the best one. */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+       struct uuid_nid_data *data, *entry;
+       int found = 0;
+
+       LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+       if (strlen(uuid) > UUID_MAX - 1)
+               return -EOVERFLOW;
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               return -ENOMEM;
+
+       obd_str2uuid(&data->un_uuid, uuid);
+       data->un_nids[0] = nid;
+       data->un_nid_count = 1;
+
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(entry, &g_uuid_list, un_list) {
+               if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+                       int i;
+
+                       found = 1;
+                       for (i = 0; i < entry->un_nid_count; i++)
+                               if (nid == entry->un_nids[i])
+                                       break;
+
+                       if (i == entry->un_nid_count) {
+                               LASSERT(entry->un_nid_count < NIDS_MAX);
+                               entry->un_nids[entry->un_nid_count++] = nid;
+                       }
+                       break;
+               }
+       }
+       if (!found)
+               list_add(&data->un_list, &g_uuid_list);
+       spin_unlock(&g_uuid_lock);
+
+       if (found) {
+               CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+                      libcfs_nid2str(nid), entry->un_nid_count);
+               OBD_FREE(data, sizeof(*data));
+       } else {
+               CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+       }
+       return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
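+
+/*
+ * Behaviour sketch (values hypothetical): calling class_add_uuid()
+ * twice with the same uuid but different NIDs extends the existing
+ * entry rather than adding a duplicate, up to NIDS_MAX nids:
+ *
+ *     class_add_uuid("mds1_UUID", nid1);     creates the entry
+ *     class_add_uuid("mds1_UUID", nid2);     appends to un_nids[]
+ */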
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+       LIST_HEAD(deathrow);
+       struct uuid_nid_data *data;
+
+       spin_lock(&g_uuid_lock);
+       if (uuid != NULL) {
+               struct obd_uuid tmp;
+
+               obd_str2uuid(&tmp, uuid);
+               list_for_each_entry(data, &g_uuid_list, un_list) {
+                       if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+                               list_move(&data->un_list, &deathrow);
+                               break;
+                       }
+               }
+       } else
+               list_splice_init(&g_uuid_list, &deathrow);
+       spin_unlock(&g_uuid_lock);
+
+       if (uuid != NULL && list_empty(&deathrow)) {
+               CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+               return -EINVAL;
+       }
+
+       while (!list_empty(&deathrow)) {
+               data = list_entry(deathrow.next, struct uuid_nid_data,
+                                     un_list);
+               list_del(&data->un_list);
+
+               CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+                      obd_uuid2str(&data->un_uuid),
+                      libcfs_nid2str(data->un_nids[0]),
+                      data->un_nid_count);
+
+               OBD_FREE(data, sizeof(*data));
+       }
+
+       return 0;
+}
+
+/* check if @nid exists in nid list of @uuid */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+       struct uuid_nid_data *entry;
+       int found = 0;
+       ENTRY;
+
+       CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+              obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+       spin_lock(&g_uuid_lock);
+       list_for_each_entry(entry, &g_uuid_list, un_list) {
+               int i;
+
+               if (!obd_uuid_equals(&entry->un_uuid, uuid))
+                       continue;
+
+               /* found the uuid, check if it has @nid */
+               for (i = 0; i < entry->un_nid_count; i++) {
+                       if (entry->un_nids[i] == nid) {
+                               found = 1;
+                               break;
+                       }
+               }
+               break;
+       }
+       spin_unlock(&g_uuid_lock);
+       RETURN(found);
+}
+EXPORT_SYMBOL(class_check_uuid);
diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
new file mode 100644 (file)
index 0000000..b71344a
--- /dev/null
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma. Only fid is stored.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+                    __u32 incompat)
+{
+       lma->lma_compat   = 0;
+       lma->lma_incompat = incompat;
+       lma->lma_self_fid = *fid;
+
+       /* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+        * and change the test below. */
+       LASSERT(sizeof(*lma) ==
+               (offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+                sizeof(lma->lma_self_fid)));
+}
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, the LMA structure, which is stored on disk in
+ * little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&lma->lma_compat);
+               __swab32s(&lma->lma_incompat);
+               lustre_swab_lu_fid(&lma->lma_self_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_lma_swab);
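+
+/*
+ * Note on the endianness test above: cpu_to_le32() is an identity on a
+ * little-endian host, so the comparison is false and the on-disk
+ * (little-endian) fields are left alone; on a big-endian host the two
+ * values differ and every field gets byte-swapped.  The same idiom is
+ * reused by lustre_som_swab() and lustre_hsm_swab() below.
+ */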
+
+/**
+ * Swab, if needed, the SOM structure, which is stored on disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&attrs->som_compat);
+               __swab32s(&attrs->som_incompat);
+               __swab64s(&attrs->som_ioepoch);
+               __swab64s(&attrs->som_size);
+               __swab64s(&attrs->som_blocks);
+               __swab64s(&attrs->som_mountid);
+       }
+}
+EXPORT_SYMBOL(lustre_som_swab);
+
+/**
+ * Swab and extract SOM attributes from an on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk SOM extended attribute.
+ * \param rc  - is the length of the SOM xattr in \a buf, or the error code
+ *              from fetching it
+ * \param msd - is the md_som_data structure where SOM attributes are
+ *              extracted to.
+ */
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd)
+{
+       struct som_attrs *attrs = (struct som_attrs *)buf;
+       ENTRY;
+
+       if (rc == 0 || rc == -ENODATA)
+               /* no SOM attributes */
+               RETURN(-ENODATA);
+
+       if (rc < 0)
+               /* error hit while fetching xattr */
+               RETURN(rc);
+
+       /* check SOM compatibility */
+       if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP))
+               RETURN(-ENODATA);
+
+       /* unpack SOM attributes */
+       lustre_som_swab(attrs);
+
+       /* fill in-memory msd structure */
+       msd->msd_compat   = attrs->som_compat;
+       msd->msd_incompat = attrs->som_incompat;
+       msd->msd_ioepoch  = attrs->som_ioepoch;
+       msd->msd_size     = attrs->som_size;
+       msd->msd_blocks   = attrs->som_blocks;
+       msd->msd_mountid  = attrs->som_mountid;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2som);
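+
+/*
+ * Caller sketch (helper name hypothetical): \a rc is meant to be the
+ * return value of the xattr fetch that filled \a buf, so errors and
+ * the missing-attribute case propagate naturally:
+ *
+ *     rc = my_fetch_xattr(obj, XATTR_NAME_SOM, buf, buflen);
+ *     rc = lustre_buf2som(buf, rc, &msd);
+ */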
+
+/**
+ * Swab, if needed, the HSM structure, which is stored on disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the HSM structure to be swabbed.
+ */
+void lustre_hsm_swab(struct hsm_attrs *attrs)
+{
+       /* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+       if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+               __swab32s(&attrs->hsm_compat);
+               __swab32s(&attrs->hsm_flags);
+               __swab64s(&attrs->hsm_arch_id);
+               __swab64s(&attrs->hsm_arch_ver);
+       }
+}
+EXPORT_SYMBOL(lustre_hsm_swab);
+
+/**
+ * Swab and extract HSM attributes from an on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk HSM extended attribute.
+ * \param rc  - is the length of the HSM xattr in \a buf, or the error code
+ *              from fetching it
+ * \param mh  - is the md_hsm structure where HSM attributes are
+ *              extracted to.
+ */
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh)
+{
+       struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+       ENTRY;
+
+       if (rc == 0 || rc == -ENODATA)
+               /* no HSM attributes */
+               RETURN(-ENODATA);
+
+       if (rc < 0)
+               /* error hit while fetching xattr */
+               RETURN(rc);
+
+       /* unpack HSM attributes */
+       lustre_hsm_swab(attrs);
+
+       /* fill md_hsm structure */
+       mh->mh_compat   = attrs->hsm_compat;
+       mh->mh_flags    = attrs->hsm_flags;
+       mh->mh_arch_id  = attrs->hsm_arch_id;
+       mh->mh_arch_ver = attrs->hsm_arch_ver;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2hsm);
+
+/*
+ * Pack HSM attributes.
+ *
+ * \param buf - is the output buffer into which the on-disk HSM xattr is
+ *             packed.
+ * \param mh  - is the md_hsm structure to pack.
+ */
+void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+{
+       struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+       ENTRY;
+
+       /* copy HSM attributes */
+       attrs->hsm_compat   = mh->mh_compat;
+       attrs->hsm_flags    = mh->mh_flags;
+       attrs->hsm_arch_id  = mh->mh_arch_id;
+       attrs->hsm_arch_ver = mh->mh_arch_ver;
+
+       /* pack xattr */
+       lustre_hsm_swab(attrs);
+}
+EXPORT_SYMBOL(lustre_hsm2buf);
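+
+/*
+ * Round-trip sketch (illustrative): lustre_hsm2buf() packs to the
+ * on-disk little-endian layout and lustre_buf2hsm() converts back, so
+ * for any positive \a rc covering the buffer the fields survive intact:
+ *
+ *     char buf[sizeof(struct hsm_attrs)];
+ *
+ *     lustre_hsm2buf(buf, &mh);
+ *     lustre_buf2hsm(buf, sizeof(buf), &mh2);     mh2 == mh
+ */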
diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c
new file mode 100644 (file)
index 0000000..c4f0dbc
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+       unsigned int c;
+
+       c = name[namelen - 1];
+       if (c == 0)
+               CWARN("looks like wrong len is passed\n");
+       c = c % count;
+       return c;
+}
+
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+       unsigned int c = 0;
+
+       while (--namelen >= 0)
+               c += name[namelen];
+       c = c % count;
+       return c;
+}
+
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+       unsigned int    c = 0;
+       int             idx;
+
+       LASSERT(namelen > 0);
+
+       if (filename_is_volatile(name, namelen, &idx)) {
+               if ((idx >= 0) && (idx < count))
+                       return idx;
+               goto hashchoice;
+       }
+
+       if (count <= 1)
+               return 0;
+
+hashchoice:
+       switch (hashtype) {
+       case MEA_MAGIC_LAST_CHAR:
+               c = mea_last_char_hash(count, (char *)name, namelen);
+               break;
+       case MEA_MAGIC_ALL_CHARS:
+               c = mea_all_chars_hash(count, (char *)name, namelen);
+               break;
+       case MEA_MAGIC_HASH_SEGMENT:
+               CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+               break;
+       default:
+               CERROR("Unknown hash type 0x%x\n", hashtype);
+       }
+
+       LASSERT(c < count);
+       return c;
+}
+EXPORT_SYMBOL(raw_name2idx);
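+
+/*
+ * Worked example (values illustrative): with MEA_MAGIC_ALL_CHARS and
+ * count = 4, the name "foo" hashes to
+ * ('f' + 'o' + 'o') % 4 = (102 + 111 + 111) % 4 = 324 % 4 = 0,
+ * so stripe 0 would be chosen.
+ */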
+
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen)
+{
+       unsigned int c;
+
+       LASSERT(mea && mea->mea_count);
+
+       c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen);
+
+       LASSERT(c < mea->mea_count);
+       return c;
+}
+EXPORT_SYMBOL(mea_name2idx);
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644 (file)
index 0000000..bbf06d0
--- /dev/null
@@ -0,0 +1,1904 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/string.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/* returns 0 if we find this key in the buffer, else 1 */
+int class_find_param(char *buf, char *key, char **valp)
+{
+       char *ptr;
+
+       if (!buf)
+               return 1;
+
+       ptr = strstr(buf, key);
+       if (ptr == NULL)
+               return 1;
+
+       if (valp)
+               *valp = ptr + strlen(key);
+
+       return 0;
+}
+EXPORT_SYMBOL(class_find_param);
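+
+/*
+ * Usage sketch (buffer contents hypothetical): for
+ * buf = "failover.node=192.168.0.1@tcp", a successful
+ * class_find_param(buf, "failover.node=", &val) leaves val pointing at
+ * "192.168.0.1@tcp".  Note this is a substring search; the key may sit
+ * anywhere in the buffer, unlike class_match_param() below, which
+ * anchors the key at the start.
+ */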
+
+/**
+ * Check whether the proc parameter \a param is an old parameter or not from
+ * the array \a ptr which contains the mapping from old parameters to new ones.
+ * If it's an old one, then return the pointer to the cfg_interop_param
+ * structure which contains both the old and new parameters.
+ *
+ * \param param                        proc parameter
+ * \param ptr                  an array which contains the mapping from
+ *                             old parameters to new ones
+ *
+ * \retval valid-pointer       pointer to the cfg_interop_param structure
+ *                             which contains the old and new parameters
+ * \retval NULL                        \a param or \a ptr is NULL,
+ *                             or \a param is not an old parameter
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+                                              struct cfg_interop_param *ptr)
+{
+       char *value = NULL;
+       int   name_len = 0;
+
+       if (param == NULL || ptr == NULL)
+               RETURN(NULL);
+
+       value = strchr(param, '=');
+       if (value == NULL)
+               name_len = strlen(param);
+       else
+               name_len = value - param;
+
+       while (ptr->old_param != NULL) {
+               if (strncmp(param, ptr->old_param, name_len) == 0 &&
+                   name_len == strlen(ptr->old_param))
+                       RETURN(ptr);
+               ptr++;
+       }
+
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. Next space or end of string is the
+ * parameter terminator with the exception that spaces inside single or double
+ * quotes get included into a parameter. The parameter is copied into \a copy
+ * which has to be allocated big enough by a caller, quotes are stripped in
+ * the copy and the copy is terminated by 0.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.
+ *
+ * \retval 0 if parameter is returned in \a copy
+ * \retval 1 otherwise
+ * \retval -EINVAL if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+       char *q1, *q2, *str;
+       int len;
+
+       str = *params;
+       while (*str == ' ')
+               str++;
+
+       if (*str == '\0') {
+               *params = NULL;
+               return 1;
+       }
+
+       while (1) {
+               q1 = strpbrk(str, " '\"");
+               if (q1 == NULL) {
+                       len = strlen(str);
+                       memcpy(copy, str, len);
+                       copy[len] = '\0';
+                       *params = NULL;
+                       return 0;
+               }
+               len = q1 - str;
+               if (*q1 == ' ') {
+                       memcpy(copy, str, len);
+                       copy[len] = '\0';
+                       *params = str + len;
+                       return 0;
+               }
+
+               memcpy(copy, str, len);
+               copy += len;
+
+               /* search for the matching closing quote */
+               str = q1 + 1;
+               q2 = strchr(str, *q1);
+               if (q2 == NULL) {
+                       CERROR("Unbalanced quota in parameters: \"%s\"\n",
+                              *params);
+                       return -EINVAL;
+               }
+               len = q2 - str;
+               memcpy(copy, str, len);
+               copy += len;
+               str = q2 + 1;
+       }
+       return 1;
+}
+EXPORT_SYMBOL(class_get_next_param);
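+
+/*
+ * Parsing sketch (input hypothetical): for
+ * params = a=1 b="x y" c=3, successive calls copy out "a=1", then
+ * "b=x y" (quotes stripped, the embedded space preserved), then "c=3",
+ * after which *params is NULL.
+ */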
+
+/* returns 0 if this is the first key in the buffer, else 1.
+   valp points to first char after key. */
+int class_match_param(char *buf, char *key, char **valp)
+{
+       if (!buf)
+               return 1;
+
+       if (memcmp(buf, key, strlen(key)) != 0)
+               return 1;
+
+       if (valp)
+               *valp = buf + strlen(key);
+
+       return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+static int parse_nid(char *buf, void *value, int quiet)
+{
+       lnet_nid_t *nid = (lnet_nid_t *)value;
+
+       *nid = libcfs_str2nid(buf);
+       if (*nid != LNET_NID_ANY)
+               return 0;
+
+       if (!quiet)
+               LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf);
+       return -EINVAL;
+}
+
+static int parse_net(char *buf, void *value)
+{
+       __u32 *net = (__u32 *)value;
+
+       *net = libcfs_str2net(buf);
+       CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net));
+       return 0;
+}
+
+enum {
+       CLASS_PARSE_NID = 1,
+       CLASS_PARSE_NET,
+};
+
+/*
+ * Returns 0 on a good nid/net, 1 if none was found, < 0 on error;
+ * if non-NULL, *endh is set to the next separator.
+ */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+                            int quiet)
+{
+       char *endp;
+       char  tmp;
+       int   rc = 0;
+
+       if (!buf)
+               return 1;
+       while (*buf == ',' || *buf == ':')
+               buf++;
+       if (*buf == ' ' || *buf == '/' || *buf == '\0')
+               return 1;
+
+       /* nid separators or end of nids */
+       endp = strpbrk(buf, ",: /");
+       if (endp == NULL)
+               endp = buf + strlen(buf);
+
+       tmp = *endp;
+       *endp = '\0';
+       switch (opc) {
+       default:
+               LBUG();
+       case CLASS_PARSE_NID:
+               rc = parse_nid(buf, value, quiet);
+               break;
+       case CLASS_PARSE_NET:
+               rc = parse_net(buf, value);
+               break;
+       }
+       *endp = tmp;
+       if (rc != 0)
+               return rc;
+       if (endh)
+               *endh = endp;
+       return 0;
+}
+
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+       return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
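+
+/*
+ * Parsing sketch (nids hypothetical): for
+ * buf = "192.168.0.1@tcp,192.168.0.2@tcp", repeated
+ * class_parse_nid(buf, &nid, &buf) calls consume one nid per call,
+ * leaving buf on the separator, and return 1 once the end of the
+ * string (or a ' ' / '/' delimiter) is reached.
+ */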
+
+/*
+ * Returns:
+ *   1 - param contains the key and a matching nid
+ *   0 - param contains the key but no matching nid
+ *  -1 - param does not contain the key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+       lnet_nid_t tmp;
+       int   rc = -1;
+
+       while (class_find_param(buf, key, &buf) == 0) {
+               /* check every nid listed after this key against
+                * the nid we were asked to match */
+               while (class_parse_nid(buf, &tmp, &buf) == 0) {
+                       if (tmp == nid)
+                               return 1;
+               }
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+int class_match_net(char *buf, char *key, __u32 net)
+{
+       __u32 tmp;
+       int   rc = -1;
+
+       while (class_find_param(buf, key, &buf) == 0) {
+               /* check every net listed after this key against
+                * the net we were asked to match */
+               while (class_parse_net(buf, &tmp, &buf) == 0) {
+                       if (tmp == net)
+                               return 1;
+               }
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid.  If successful,
+ * the new device can be accessed by either name or uuid.
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+       struct obd_device *obd = NULL;
+       char *typename, *name, *uuid;
+       int rc, len;
+       ENTRY;
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+               CERROR("No type passed!\n");
+               RETURN(-EINVAL);
+       }
+       typename = lustre_cfg_string(lcfg, 1);
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+               CERROR("No name passed!\n");
+               RETURN(-EINVAL);
+       }
+       name = lustre_cfg_string(lcfg, 0);
+
+       if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+               CERROR("No UUID passed!\n");
+               RETURN(-EINVAL);
+       }
+       uuid = lustre_cfg_string(lcfg, 2);
+
+       CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+              MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+       obd = class_newdev(typename, name);
+       if (IS_ERR(obd)) {
+               /* Already exists or out of obds */
+               rc = PTR_ERR(obd);
+               obd = NULL;
+               CERROR("Cannot create device %s of type %s : %d\n",
+                      name, typename, rc);
+               GOTO(out, rc);
+       }
+       LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+                name, typename);
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                "obd %p obd_magic %08X != %08X\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+       LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+                "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+       rwlock_init(&obd->obd_pool_lock);
+       obd->obd_pool_limit = 0;
+       obd->obd_pool_slv = 0;
+
+       INIT_LIST_HEAD(&obd->obd_exports);
+       INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+       INIT_LIST_HEAD(&obd->obd_delayed_exports);
+       INIT_LIST_HEAD(&obd->obd_exports_timed);
+       INIT_LIST_HEAD(&obd->obd_nid_stats);
+       spin_lock_init(&obd->obd_nid_lock);
+       spin_lock_init(&obd->obd_dev_lock);
+       mutex_init(&obd->obd_dev_mutex);
+       spin_lock_init(&obd->obd_osfs_lock);
+       /* obd->obd_osfs_age must be set to a value in the distant
+        * past to guarantee a fresh statfs is fetched on mount. */
+       obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+       /* XXX belongs in setup not attach  */
+       init_rwsem(&obd->obd_observer_link_sem);
+       /* recovery data */
+       cfs_init_timer(&obd->obd_recovery_timer);
+       spin_lock_init(&obd->obd_recovery_task_lock);
+       init_waitqueue_head(&obd->obd_next_transno_waitq);
+       init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+       INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+       INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+       INIT_LIST_HEAD(&obd->obd_final_req_queue);
+       INIT_LIST_HEAD(&obd->obd_evict_list);
+
+       llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+       obd->obd_conn_inprogress = 0;
+
+       len = strlen(uuid);
+       if (len >= sizeof(obd->obd_uuid)) {
+               CERROR("uuid must be < %d bytes long\n",
+                      (int)sizeof(obd->obd_uuid));
+               GOTO(out, rc = -EINVAL);
+       }
+       memcpy(obd->obd_uuid.uuid, uuid, len);
+
+       /* do the attach */
+       if (OBP(obd, attach)) {
+               rc = OBP(obd, attach)(obd, sizeof(*lcfg), lcfg);
+               if (rc)
+                       GOTO(out, rc = -EINVAL);
+       }
+
+       /* Detach drops this */
+       spin_lock(&obd->obd_dev_lock);
+       atomic_set(&obd->obd_refcount, 1);
+       spin_unlock(&obd->obd_dev_lock);
+       lu_ref_init(&obd->obd_reference);
+       lu_ref_add(&obd->obd_reference, "attach", obd);
+
+       obd->obd_attached = 1;
+       CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+              obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+       RETURN(0);
+ out:
+       if (obd != NULL) {
+               class_release_dev(obd);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(class_attach);
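+
+/*
+ * Record layout assumed by class_attach() (per the checks above):
+ * buf 0 = device name, buf 1 = type name, buf 2 = uuid.  A config-log
+ * attach line for an osc would therefore look roughly like this
+ * (names illustrative):
+ *
+ *     attach  0:lustre-OST0000-osc  1:osc  2:lustre-clilov_UUID
+ */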
+
+/** Create hashes, self-export, and call type-specific setup.
+ * Setup is effectively the "start this obd" call.
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int err = 0;
+       struct obd_export *exp;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+       LASSERTF(obd == class_num2obd(obd->obd_minor),
+                "obd %p != obd_devs[%d] %p\n",
+                obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+       LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+                "obd %p obd_magic %08x != %08x\n",
+                obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+       /* have we attached a type to this device? */
+       if (!obd->obd_attached) {
+               CERROR("Device %d not attached\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+
+       if (obd->obd_set_up) {
+               CERROR("Device %d already setup (type %s)\n",
+                      obd->obd_minor, obd->obd_type->typ_name);
+               RETURN(-EEXIST);
+       }
+
+       /* is someone else setting us up right now? (attach inits spinlock) */
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_starting) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("Device %d setup in progress (type %s)\n",
+                      obd->obd_minor, obd->obd_type->typ_name);
+               RETURN(-EEXIST);
+       }
+       /* just leave this on forever.  I can't use obd_set_up here because
+          other fns check that status, and we're not actually set up yet. */
+       obd->obd_starting = 1;
+       obd->obd_uuid_hash = NULL;
+       obd->obd_nid_hash = NULL;
+       obd->obd_nid_stats_hash = NULL;
+       spin_unlock(&obd->obd_dev_lock);
+
+       /* create an uuid-export lustre hash */
+       obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+                                            HASH_UUID_CUR_BITS,
+                                            HASH_UUID_MAX_BITS,
+                                            HASH_UUID_BKT_BITS, 0,
+                                            CFS_HASH_MIN_THETA,
+                                            CFS_HASH_MAX_THETA,
+                                            &uuid_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_uuid_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       /* create a nid-export lustre hash */
+       obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+                                           HASH_NID_CUR_BITS,
+                                           HASH_NID_MAX_BITS,
+                                           HASH_NID_BKT_BITS, 0,
+                                           CFS_HASH_MIN_THETA,
+                                           CFS_HASH_MAX_THETA,
+                                           &nid_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_nid_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       /* create a nid-stats lustre hash */
+       obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+                                                 HASH_NID_STATS_CUR_BITS,
+                                                 HASH_NID_STATS_MAX_BITS,
+                                                 HASH_NID_STATS_BKT_BITS, 0,
+                                                 CFS_HASH_MIN_THETA,
+                                                 CFS_HASH_MAX_THETA,
+                                                 &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+       if (!obd->obd_nid_stats_hash)
+               GOTO(err_hash, err = -ENOMEM);
+
+       exp = class_new_export(obd, &obd->obd_uuid);
+       if (IS_ERR(exp))
+               GOTO(err_hash, err = PTR_ERR(exp));
+
+       obd->obd_self_export = exp;
+       list_del_init(&exp->exp_obd_chain_timed);
+       class_export_put(exp);
+
+       err = obd_setup(obd, lcfg);
+       if (err)
+               GOTO(err_exp, err);
+
+       obd->obd_set_up = 1;
+
+       spin_lock(&obd->obd_dev_lock);
+       /* cleanup drops this */
+       class_incref(obd, "setup", obd);
+       spin_unlock(&obd->obd_dev_lock);
+
+       CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+              obd->obd_name, obd->obd_uuid.uuid);
+
+       RETURN(0);
+err_exp:
+       if (obd->obd_self_export) {
+               class_unlink_export(obd->obd_self_export);
+               obd->obd_self_export = NULL;
+       }
+err_hash:
+       if (obd->obd_uuid_hash) {
+               cfs_hash_putref(obd->obd_uuid_hash);
+               obd->obd_uuid_hash = NULL;
+       }
+       if (obd->obd_nid_hash) {
+               cfs_hash_putref(obd->obd_nid_hash);
+               obd->obd_nid_hash = NULL;
+       }
+       if (obd->obd_nid_stats_hash) {
+               cfs_hash_putref(obd->obd_nid_stats_hash);
+               obd->obd_nid_stats_hash = NULL;
+       }
+       obd->obd_starting = 0;
+       CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+       return err;
+}
+EXPORT_SYMBOL(class_setup);
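+
+/*
+ * Lifecycle note: the expected per-device order is
+ * attach -> setup -> ... -> cleanup -> detach.  class_setup() refuses
+ * to run twice (the obd_set_up / obd_starting checks above) and takes
+ * the "setup" reference that class_cleanup() drops later.
+ */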
+
+/** We have finished using this obd and are ready to destroy it.
+ * There can be no more references to this obd.
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       ENTRY;
+
+       if (obd->obd_set_up) {
+               CERROR("OBD device %d still set up\n", obd->obd_minor);
+               RETURN(-EBUSY);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (!obd->obd_attached) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("OBD device %d not attached\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+       obd->obd_attached = 0;
+       spin_unlock(&obd->obd_dev_lock);
+
+       CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+              obd->obd_name, obd->obd_uuid.uuid);
+
+       class_decref(obd, "attach", obd);
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd.  There may be in-progress ops when
+ * this is called.  We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       int err = 0;
+       char *flag;
+       ENTRY;
+
+       OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+       if (!obd->obd_set_up) {
+               CERROR("Device %d not setup\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               spin_unlock(&obd->obd_dev_lock);
+               CERROR("OBD %d already stopping\n", obd->obd_minor);
+               RETURN(-ENODEV);
+       }
+       /* Leave this on forever */
+       obd->obd_stopping = 1;
+
+       /* Wait for already-arrived connections to finish. */
+       while (obd->obd_conn_inprogress > 0) {
+               spin_unlock(&obd->obd_dev_lock);
+
+               cond_resched();
+
+               spin_lock(&obd->obd_dev_lock);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+               for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+                       switch (*flag) {
+                       case 'F':
+                               obd->obd_force = 1;
+                               break;
+                       case 'A':
+                               LCONSOLE_WARN("Failing over %s\n",
+                                             obd->obd_name);
+                               obd->obd_fail = 1;
+                               obd->obd_no_transno = 1;
+                               obd->obd_no_recov = 1;
+                               if (OBP(obd, iocontrol)) {
+                                       obd_iocontrol(OBD_IOC_SYNC,
+                                                     obd->obd_self_export,
+                                                     0, NULL, NULL);
+                               }
+                               break;
+                       default:
+                               CERROR("Unrecognised flag '%c'\n", *flag);
+                       }
+       }
+
+       LASSERT(obd->obd_self_export);
+
+       /* The three references that should be remaining are the
+        * obd_self_export and the attach and setup references. */
+       if (atomic_read(&obd->obd_refcount) > 3) {
+               /* refcount - 3 might be the number of real exports
+                  (excluding self export). But class_incref is called
+                  by other things as well, so don't count on it. */
+               CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+                      obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+               dump_exports(obd, 0);
+               class_disconnect_exports(obd);
+       }
+
+       /* Precleanup, we must make sure all exports get destroyed. */
+       err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+       if (err)
+               CERROR("Precleanup %s returned %d\n",
+                      obd->obd_name, err);
+
+       /* destroy an uuid-export hash body */
+       if (obd->obd_uuid_hash) {
+               cfs_hash_putref(obd->obd_uuid_hash);
+               obd->obd_uuid_hash = NULL;
+       }
+
+       /* destroy a nid-export hash body */
+       if (obd->obd_nid_hash) {
+               cfs_hash_putref(obd->obd_nid_hash);
+               obd->obd_nid_hash = NULL;
+       }
+
+       /* destroy a nid-stats hash body */
+       if (obd->obd_nid_stats_hash) {
+               cfs_hash_putref(obd->obd_nid_stats_hash);
+               obd->obd_nid_stats_hash = NULL;
+       }
+
+       class_decref(obd, "setup", obd);
+       obd->obd_set_up = 0;
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(class_cleanup);
+
+struct obd_device *class_incref(struct obd_device *obd,
+                               const char *scope, const void *source)
+{
+       lu_ref_add_atomic(&obd->obd_reference, scope, source);
+       atomic_inc(&obd->obd_refcount);
+       CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+              atomic_read(&obd->obd_refcount));
+
+       return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+       int err;
+       int refs;
+
+       spin_lock(&obd->obd_dev_lock);
+       atomic_dec(&obd->obd_refcount);
+       refs = atomic_read(&obd->obd_refcount);
+       spin_unlock(&obd->obd_dev_lock);
+       lu_ref_del(&obd->obd_reference, scope, source);
+
+       CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+       if ((refs == 1) && obd->obd_stopping) {
+               /* All exports have been destroyed; there should
+                  be no more in-progress ops by this point. */
+
+               spin_lock(&obd->obd_self_export->exp_lock);
+               obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+               spin_unlock(&obd->obd_self_export->exp_lock);
+
+               /* note that we'll recurse into class_decref again */
+               class_unlink_export(obd->obd_self_export);
+               return;
+       }
+
+       if (refs == 0) {
+               CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+                      obd->obd_name, obd->obd_uuid.uuid);
+               LASSERT(!obd->obd_attached);
+               if (obd->obd_stopping) {
+                       /* If we're not stopping, we were never set up */
+                       err = obd_cleanup(obd);
+                       if (err)
+                               CERROR("Cleanup %s returned %d\n",
+                                      obd->obd_name, err);
+               }
+               if (OBP(obd, detach)) {
+                       err = OBP(obd, detach)(obd);
+                       if (err)
+                               CERROR("Detach returned %d\n", err);
+               }
+               class_release_dev(obd);
+       }
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location.
+ * Client obd types contact server obd types using this nid list.
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_import *imp;
+       struct obd_uuid uuid;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+           LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+               CERROR("invalid conn_uuid\n");
+               RETURN(-EINVAL);
+       }
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+               CERROR("can't add connection on non-client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       imp = obd->u.cli.cl_import;
+       if (!imp) {
+               CERROR("try to add conn on immature client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+       rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location.
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct obd_import *imp;
+       struct obd_uuid uuid;
+       int rc;
+       ENTRY;
+
+       if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+           LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+               CERROR("invalid conn_uuid\n");
+               RETURN(-EINVAL);
+       }
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+               CERROR("can't del connection on non-client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       imp = obd->u.cli.cl_import;
+       if (!imp) {
+               CERROR("try to del conn on immature client dev\n");
+               RETURN(-EINVAL);
+       }
+
+       obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+       rc = obd_del_conn(imp, &uuid);
+
+       RETURN(rc);
+}
+
+LIST_HEAD(lustre_profile_list);
+
+struct lustre_profile *class_get_profile(const char *prof)
+{
+       struct lustre_profile *lprof;
+
+       ENTRY;
+       list_for_each_entry(lprof, &lustre_profile_list, lp_list) {
+               if (!strcmp(lprof->lp_profile, prof)) {
+                       RETURN(lprof);
+               }
+       }
+       RETURN(NULL);
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile".
+ * This defines the mdc and osc names to use for a client.
+ * It is also used to define the lov to be used by an mdt.
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+                     int mdclen, char *mdc)
+{
+       struct lustre_profile *lprof;
+       int err = 0;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+       OBD_ALLOC(lprof, sizeof(*lprof));
+       if (lprof == NULL)
+               RETURN(-ENOMEM);
+       INIT_LIST_HEAD(&lprof->lp_list);
+
+       LASSERT(proflen == (strlen(prof) + 1));
+       OBD_ALLOC(lprof->lp_profile, proflen);
+       if (lprof->lp_profile == NULL)
+               GOTO(out, err = -ENOMEM);
+       memcpy(lprof->lp_profile, prof, proflen);
+
+       LASSERT(osclen == (strlen(osc) + 1));
+       OBD_ALLOC(lprof->lp_dt, osclen);
+       if (lprof->lp_dt == NULL)
+               GOTO(out, err = -ENOMEM);
+       memcpy(lprof->lp_dt, osc, osclen);
+
+       if (mdclen > 0) {
+               LASSERT(mdclen == (strlen(mdc) + 1));
+               OBD_ALLOC(lprof->lp_md, mdclen);
+               if (lprof->lp_md == NULL)
+                       GOTO(out, err = -ENOMEM);
+               memcpy(lprof->lp_md, mdc, mdclen);
+       }
+
+       list_add(&lprof->lp_list, &lustre_profile_list);
+       RETURN(err);
+
+out:
+       if (lprof->lp_md)
+               OBD_FREE(lprof->lp_md, mdclen);
+       if (lprof->lp_dt)
+               OBD_FREE(lprof->lp_dt, osclen);
+       if (lprof->lp_profile)
+               OBD_FREE(lprof->lp_profile, proflen);
+       OBD_FREE(lprof, sizeof(*lprof));
+       RETURN(err);
+}
+
+void class_del_profile(const char *prof)
+{
+       struct lustre_profile *lprof;
+       ENTRY;
+
+       CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+
+       lprof = class_get_profile(prof);
+       if (lprof) {
+               list_del(&lprof->lp_list);
+               OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+               OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+               if (lprof->lp_md)
+                       OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+               OBD_FREE(lprof, sizeof(*lprof));
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(class_del_profile);
+
+/* COMPAT_146 */
+void class_del_profiles(void)
+{
+       struct lustre_profile *lprof, *n;
+       ENTRY;
+
+       list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) {
+               list_del(&lprof->lp_list);
+               OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+               OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+               if (lprof->lp_md)
+                       OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+               OBD_FREE(lprof, sizeof(*lprof));
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+       ENTRY;
+       if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0)
+               at_min = val;
+       else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0)
+               at_max = val;
+       else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0)
+               at_extra = val;
+       else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0)
+               at_early_margin = val;
+       else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0)
+               at_history = val;
+       else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0)
+               strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+                       JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+       else
+               RETURN(-EINVAL);
+
+       CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+       RETURN(0);
+}
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * they live in modules that must be loaded after this one. */
+static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
+static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL;
+
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+       client_process_config = cpc;
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * \param cfg     config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer    pointer to the newly-allocated config structure
+ *                         which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ *                         not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+                                    const char *new_name)
+{
+       struct lustre_cfg_bufs  *bufs = NULL;
+       struct lustre_cfg       *new_cfg = NULL;
+       char                    *param = NULL;
+       char                    *new_param = NULL;
+       char                    *value = NULL;
+       int                      name_len = 0;
+       int                      new_len = 0;
+       ENTRY;
+
+       if (cfg == NULL || new_name == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       param = lustre_cfg_string(cfg, 1);
+       if (param == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       value = strchr(param, '=');
+       if (value == NULL)
+               name_len = strlen(param);
+       else
+               name_len = value - param;
+
+       new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+       OBD_ALLOC(new_param, new_len);
+       if (new_param == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       strcpy(new_param, new_name);
+       if (value != NULL)
+               strcat(new_param, value);
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL) {
+               OBD_FREE(new_param, new_len);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       lustre_cfg_bufs_reset(bufs, NULL);
+       lustre_cfg_bufs_init(bufs, cfg);
+       lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+       new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+
+       OBD_FREE(new_param, new_len);
+       OBD_FREE_PTR(bufs);
+       if (new_cfg == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       new_cfg->lcfg_num = cfg->lcfg_num;
+       new_cfg->lcfg_flags = cfg->lcfg_flags;
+       new_cfg->lcfg_nid = cfg->lcfg_nid;
+       new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+       RETURN(new_cfg);
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
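+
+/*
+ * Renaming sketch (parameter names hypothetical): if buf 1 of \a cfg is
+ * "mdt.group_upcall=/usr/sbin/foo" and \a new_name is
+ * "mdt.identity_upcall", the returned config carries
+ * "mdt.identity_upcall=/usr/sbin/foo"; the "=value" tail is preserved
+ * verbatim.
+ */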
+
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+       quota_process_config = qpc;
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup)
+ * or processing the config llog, or ioctl from lctl.
+ */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+       struct obd_device *obd;
+       int err;
+
+       LASSERT(lcfg && !IS_ERR(lcfg));
+       CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+       /* Commands that don't need a device */
+       switch (lcfg->lcfg_command) {
+       case LCFG_ATTACH: {
+               err = class_attach(lcfg);
+               GOTO(out, err);
+       }
+       case LCFG_ADD_UUID: {
+               CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+                      " (%s)\n", lustre_cfg_string(lcfg, 1),
+                      lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid));
+
+               err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+               GOTO(out, err);
+       }
+       case LCFG_DEL_UUID: {
+               CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+                      (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+                      ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+               err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+               GOTO(out, err);
+       }
+       case LCFG_MOUNTOPT: {
+               CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+                      lustre_cfg_string(lcfg, 1),
+                      lustre_cfg_string(lcfg, 2),
+                      lustre_cfg_string(lcfg, 3));
+               /* set these mount options somewhere, so ll_fill_super
+                * can find them. */
+               err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+                                       lustre_cfg_string(lcfg, 1),
+                                       LUSTRE_CFG_BUFLEN(lcfg, 2),
+                                       lustre_cfg_string(lcfg, 2),
+                                       LUSTRE_CFG_BUFLEN(lcfg, 3),
+                                       lustre_cfg_string(lcfg, 3));
+               GOTO(out, err);
+       }
+       case LCFG_DEL_MOUNTOPT: {
+               CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               class_del_profile(lustre_cfg_string(lcfg, 1));
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_TIMEOUT: {
+               CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+                      obd_timeout, lcfg->lcfg_num);
+               obd_timeout = max(lcfg->lcfg_num, 1U);
+               obd_timeout_set = 1;
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_LDLM_TIMEOUT: {
+               CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+                      ldlm_timeout, lcfg->lcfg_num);
+               ldlm_timeout = max(lcfg->lcfg_num, 1U);
+               if (ldlm_timeout >= obd_timeout)
+                       ldlm_timeout = max(obd_timeout / 3, 1U);
+               ldlm_timeout_set = 1;
+               GOTO(out, err = 0);
+       }
+       case LCFG_SET_UPCALL: {
+               LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+               /* COMPAT_146 Don't fail on old configs */
+               GOTO(out, err = 0);
+       }
+       case LCFG_MARKER: {
+               struct cfg_marker *marker;
+               marker = lustre_cfg_buf(lcfg, 1);
+               CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+                      marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+               GOTO(out, err = 0);
+       }
+       case LCFG_PARAM: {
+               char *tmp;
+               /* llite has no obd */
+               if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                      PARAM_LLITE, NULL) == 0) &&
+                   client_process_config) {
+                       err = (*client_process_config)(lcfg);
+                       GOTO(out, err);
+               } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                             PARAM_SYS, &tmp) == 0)) {
+                       /* Global param settings */
+                       err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+                       /*
+                        * Client or server should not fail to mount if
+                        * it hits an unknown configuration parameter.
+                        */
+                       if (err != 0)
+                               CWARN("Ignoring unknown param %s\n", tmp);
+
+                       GOTO(out, err = 0);
+               } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                             PARAM_QUOTA, &tmp) == 0) &&
+                          quota_process_config) {
+                       err = (*quota_process_config)(lcfg);
+                       GOTO(out, err);
+               }
+               /* Fall through */
+               break;
+       }
+       }
+
+       /* Commands that require a device */
+       obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+       if (obd == NULL) {
+               if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+                       CERROR("this lcfg command requires a device name\n");
+               else
+                       CERROR("no device for: %s\n",
+                              lustre_cfg_string(lcfg, 0));
+
+               GOTO(out, err = -EINVAL);
+       }
+
+       switch (lcfg->lcfg_command) {
+       case LCFG_SETUP: {
+               err = class_setup(obd, lcfg);
+               GOTO(out, err);
+       }
+       case LCFG_DETACH: {
+               err = class_detach(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_CLEANUP: {
+               err = class_cleanup(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_ADD_CONN: {
+               err = class_add_conn(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_DEL_CONN: {
+               err = class_del_conn(obd, lcfg);
+               GOTO(out, err = 0);
+       }
+       case LCFG_POOL_NEW: {
+               err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+               GOTO(out, err = 0);
+       }
+       case LCFG_POOL_ADD: {
+               err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+                                  lustre_cfg_string(lcfg, 3));
+               GOTO(out, err = 0);
+       }
+       case LCFG_POOL_REM: {
+               err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+                                  lustre_cfg_string(lcfg, 3));
+               GOTO(out, err = 0);
+       }
+       case LCFG_POOL_DEL: {
+               err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+               GOTO(out, err = 0);
+       }
+       default: {
+               err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+               GOTO(out, err);
+       }
+       }
+out:
+       if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+               CWARN("Ignoring error %d on optional command %#x\n", err,
+                     lcfg->lcfg_command);
+               err = 0;
+       }
+       return err;
+}
+EXPORT_SYMBOL(class_process_config);
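+
+/* Example (illustrative, not a call site in this patch): config
+ * commands normally arrive from llog records or lctl, but a kernel
+ * caller can build one directly; "testobd" is a made-up device name.
+ *
+ *     struct lustre_cfg_bufs bufs;
+ *     struct lustre_cfg *lcfg;
+ *     int rc;
+ *
+ *     lustre_cfg_bufs_reset(&bufs, "testobd");
+ *     lustre_cfg_bufs_set_string(&bufs, 1, "F");
+ *     lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+ *     rc = class_process_config(lcfg);
+ *     lustre_cfg_free(lcfg);
+ *
+ * do_lcfg() in obd_mount.c and class_manual_cleanup() below wrap
+ * exactly this pattern.
+ */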
+
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+                            struct lustre_cfg *lcfg, void *data)
+{
+       struct lprocfs_vars *var;
+       struct file fakefile;
+       struct seq_file fake_seqfile;
+       char *key, *sval;
+       int i, keylen, vallen;
+       int matched = 0, j = 0;
+       int rc = 0;
+       int skip = 0;
+       ENTRY;
+
+       if (lcfg->lcfg_command != LCFG_PARAM) {
+               CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+               RETURN(-EINVAL);
+       }
+
+       /* fake a seq file so that var->fops->write can work... */
+       fakefile.private_data = &fake_seqfile;
+       fake_seqfile.private = data;
+       /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+          or   lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+          or   lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+       for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+               key = lustre_cfg_buf(lcfg, i);
+               /* Strip off prefix */
+               class_match_param(key, prefix, &key);
+               sval = strchr(key, '=');
+               if (!sval || (*(sval + 1) == 0)) {
+                       CERROR("Can't parse param %s (missing '=')\n", key);
+                       /* rc = -EINVAL;        continue parsing other params */
+                       continue;
+               }
+               keylen = sval - key;
+               sval++;
+               vallen = strlen(sval);
+               matched = 0;
+               j = 0;
+               /* Search proc entries */
+               while (lvars[j].name) {
+                       var = &lvars[j];
+                       if (class_match_param(key, (char *)var->name, 0) == 0 &&
+                           keylen == strlen(var->name)) {
+                               matched++;
+                               rc = -EROFS;
+                               if (var->fops && var->fops->write) {
+                                       mm_segment_t oldfs;
+                                       oldfs = get_fs();
+                                       set_fs(KERNEL_DS);
+                                       rc = (var->fops->write)(&fakefile, sval,
+                                                               vallen, NULL);
+                                       set_fs(oldfs);
+                               }
+                               break;
+                       }
+                       j++;
+               }
+               if (!matched) {
+                       /* If the prefix doesn't match, return error so we
+                          can pass it down the stack */
+                       if (strnchr(key, keylen, '.'))
+                               RETURN(-ENOSYS);
+                       CERROR("%s: unknown param %s\n",
+                              (char *)lustre_cfg_string(lcfg, 0), key);
+                       /* rc = -EINVAL;        continue parsing other params */
+                       skip++;
+               } else if (rc < 0) {
+                       CERROR("writing proc entry %s err %d\n",
+                              var->name, rc);
+                       rc = 0;
+               } else {
+                       CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+                                        lustre_cfg_string(lcfg, 0),
+                                        (int)strlen(prefix) - 1, prefix,
+                                        (int)(sval - key - 1), key, sval);
+               }
+       }
+
+       if (rc > 0)
+               rc = 0;
+       if (!rc && skip)
+               rc = skip;
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_process_proc_param);
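+
+/* Worked example (illustrative): with prefix "osc." and an lcfg built
+ * from "lctl conf_param lustre-OST0000.osc.max_dirty_mb=36", the loop
+ * above ends up with
+ *
+ *     key    = "max_dirty_mb=36"  (after the prefix is stripped)
+ *     keylen = 12, sval = "36"
+ *
+ * and writes "36" through the matching lprocfs_vars entry's
+ * fops->write as if it had come from /proc.
+ */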
+
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** Parse a configuration llog, doing various manipulations on the
+ * records for various reasons (modifications for compatibility, skipping
+ * obsolete records, changing uuids, etc), then pass the resulting
+ * records to class_process_config().
+ */
+int class_config_llog_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct config_llog_instance *clli = data;
+       int cfg_len = rec->lrh_len;
+       char *cfg_buf = (char *)(rec + 1);
+       int rc = 0;
+       ENTRY;
+
+       /* class_config_dump_handler(handle, rec, data); */
+
+       switch (rec->lrh_type) {
+       case OBD_CFG_REC: {
+               struct lustre_cfg *lcfg, *lcfg_new;
+               struct lustre_cfg_bufs bufs;
+               char *inst_name = NULL;
+               int inst_len = 0;
+               int inst = 0, swab = 0;
+
+               lcfg = (struct lustre_cfg *)cfg_buf;
+               if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+                       lustre_swab_lustre_cfg(lcfg);
+                       swab = 1;
+               }
+
+               rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+               if (rc)
+                       GOTO(out, rc);
+
+               /* Figure out config state info */
+               if (lcfg->lcfg_command == LCFG_MARKER) {
+                       struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+                       lustre_swab_cfg_marker(marker, swab,
+                                              LUSTRE_CFG_BUFLEN(lcfg, 1));
+                       CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+                              clli->cfg_flags, marker->cm_flags);
+                       if (marker->cm_flags & CM_START) {
+                               /* all previous flags off */
+                               clli->cfg_flags = CFG_F_MARKER;
+                               if (marker->cm_flags & CM_SKIP) {
+                                       clli->cfg_flags |= CFG_F_SKIP;
+                                       CDEBUG(D_CONFIG, "SKIP #%d\n",
+                                              marker->cm_step);
+                               } else if ((marker->cm_flags & CM_EXCLUDE) ||
+                                          (clli->cfg_sb &&
+                                           lustre_check_exclusion(clli->cfg_sb,
+                                                        marker->cm_tgtname))) {
+                                       clli->cfg_flags |= CFG_F_EXCLUDE;
+                                       CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+                                              marker->cm_step);
+                               }
+                       } else if (marker->cm_flags & CM_END) {
+                               clli->cfg_flags = 0;
+                       }
+               }
+               /* A config command without a start marker before it is
+                  illegal (post 146) */
+               if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+                   !(clli->cfg_flags & CFG_F_MARKER) &&
+                   (lcfg->lcfg_command != LCFG_MARKER)) {
+                       CWARN("Config not inside markers, ignoring! "
+                             "(inst: %p, uuid: %s, flags: %#x)\n",
+                             clli->cfg_instance,
+                             clli->cfg_uuid.uuid, clli->cfg_flags);
+                       clli->cfg_flags |= CFG_F_SKIP;
+               }
+               if (clli->cfg_flags & CFG_F_SKIP) {
+                       CDEBUG(D_CONFIG, "skipping %#x\n",
+                              clli->cfg_flags);
+                       rc = 0;
+                       /* No processing! */
+                       break;
+               }
+
+               /*
+                * For interoperability between 1.8 and 2.0,
+                * rename "mds" obd device type to "mdt".
+                */
+               {
+                       char *typename = lustre_cfg_string(lcfg, 1);
+                       char *index = lustre_cfg_string(lcfg, 2);
+
+                       if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+                            strcmp(typename, "mds") == 0)) {
+                               CWARN("For 1.8 interoperability, rename obd "
+                                      "type from mds to mdt\n");
+                               typename[2] = 't';
+                       }
+                       if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+                            strcmp(index, "type") == 0)) {
+                               CDEBUG(D_INFO, "For 1.8 interoperability, "
+                                      "set this index to '0'\n");
+                               index[0] = '0';
+                               index[1] = 0;
+                       }
+               }
+
+               if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
+                   (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+                       /* Add inactive instead */
+                       lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
+               lustre_cfg_bufs_init(&bufs, lcfg);
+
+               if (clli && clli->cfg_instance &&
+                   LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) {
+                       inst = 1;
+                       inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+                                  sizeof(clli->cfg_instance) * 2 + 4;
+                       OBD_ALLOC(inst_name, inst_len);
+                       if (inst_name == NULL)
+                               GOTO(out, rc = -ENOMEM);
+                       sprintf(inst_name, "%s-%p",
+                               lustre_cfg_string(lcfg, 0),
+                               clli->cfg_instance);
+                       lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+                       CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+                              lcfg->lcfg_command, inst_name);
+               }
+
+               /* we override the llog's uuid for clients, to ensure
+                * they are unique */
+               if (clli && clli->cfg_instance != NULL &&
+                   lcfg->lcfg_command == LCFG_ATTACH) {
+                       lustre_cfg_bufs_set_string(&bufs, 2,
+                                                  clli->cfg_uuid.uuid);
+               }
+               /*
+                * sptlrpc config record, we expect 2 data segments:
+                *  [0]: fs_name/target_name,
+                *  [1]: rule string
+                * moving them to index [1] and [2], and insert MGC's
+                * obdname at index [0].
+                */
+               if (clli && clli->cfg_instance == NULL &&
+                   lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+                       lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+                                           bufs.lcfg_buflen[1]);
+                       lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+                                           bufs.lcfg_buflen[0]);
+                       lustre_cfg_bufs_set_string(&bufs, 0,
+                                                  clli->cfg_obdname);
+               }
+
+               lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+               if (lcfg_new == NULL) {
+                       if (inst)
+                               OBD_FREE(inst_name, inst_len);
+                       GOTO(out, rc = -ENOMEM);
+               }
+
+               lcfg_new->lcfg_num   = lcfg->lcfg_num;
+               lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+               /* XXX Hack to try to remain binary compatible with
+                * pre-newconfig logs */
+               if (lcfg->lcfg_nal != 0 &&      /* pre-newconfig log? */
+                   (lcfg->lcfg_nid >> 32) == 0) {
+                       __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+                       lcfg_new->lcfg_nid =
+                               LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+                       CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+                             lcfg->lcfg_nal, addr,
+                             libcfs_nid2str(lcfg_new->lcfg_nid));
+               } else {
+                       lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+               }
+
+               lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+               rc = class_process_config(lcfg_new);
+               lustre_cfg_free(lcfg_new);
+
+               if (inst)
+                       OBD_FREE(inst_name, inst_len);
+               break;
+       }
+       default:
+               CERROR("Unknown llog record type %#x encountered\n",
+                      rec->lrh_type);
+               break;
+       }
+out:
+       if (rc) {
+               CERROR("%s: cfg command failed: rc = %d\n",
+                      handle->lgh_ctxt->loc_obd->obd_name, rc);
+               class_config_dump_handler(NULL, handle, rec, data);
+       }
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                           char *name, struct config_llog_instance *cfg)
+{
+       struct llog_process_cat_data     cd = {0, 0};
+       struct llog_handle              *llh;
+       llog_cb_t                        callback;
+       int                              rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "looking up llog %s\n", name);
+       rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       /* continue processing from where we last stopped to end-of-log */
+       if (cfg) {
+               cd.lpcd_first_idx = cfg->cfg_last_idx;
+               callback = cfg->cfg_callback;
+               LASSERT(callback != NULL);
+       } else {
+               callback = class_config_llog_handler;
+       }
+
+       cd.lpcd_last_idx = 0;
+
+       rc = llog_process(env, llh, callback, cfg, &cd);
+
+       CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+              cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc);
+       if (cfg)
+               cfg->cfg_last_idx = cd.lpcd_last_idx;
+
+parse_out:
+       llog_close(env, llh);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_parse_llog);
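+
+/* Example (illustrative; env/ctxt come from the caller's scope): a
+ * caller that keeps its config_llog_instance can resume where the
+ * last pass stopped.
+ *
+ *     struct config_llog_instance cfg = { 0 };
+ *     int rc;
+ *
+ *     cfg.cfg_callback = class_config_llog_handler;
+ *     rc = class_config_parse_llog(env, ctxt, "lustre-client", &cfg);
+ *
+ * A second call with the same cfg starts at cfg.cfg_last_idx, so only
+ * records appended since the first pass are reprocessed.
+ */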
+
+/**
+ * Parse a config record and write a text dump of it into the supplied
+ * buffer.  This is separate from class_config_dump_handler() so it can
+ * also be used for ioctl needs.
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+       struct lustre_cfg       *lcfg = (struct lustre_cfg *)(rec + 1);
+       char                    *ptr = buf;
+       char                    *end = buf + size;
+       int                      rc = 0;
+
+       ENTRY;
+
+       LASSERT(rec->lrh_type == OBD_CFG_REC);
+       rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+       if (rc < 0)
+               RETURN(rc);
+
+       ptr += snprintf(ptr, end - ptr, "cmd=%05x ", lcfg->lcfg_command);
+       if (lcfg->lcfg_flags)
+               ptr += snprintf(ptr, end - ptr, "flags=%#08x ",
+                               lcfg->lcfg_flags);
+
+       if (lcfg->lcfg_num)
+               ptr += snprintf(ptr, end - ptr, "num=%#08x ", lcfg->lcfg_num);
+
+       if (lcfg->lcfg_nid)
+               ptr += snprintf(ptr, end - ptr, "nid=%s("LPX64")\n     ",
+                               libcfs_nid2str(lcfg->lcfg_nid),
+                               lcfg->lcfg_nid);
+
+       if (lcfg->lcfg_command == LCFG_MARKER) {
+               struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+               ptr += snprintf(ptr, end - ptr, "marker=%d(%#x)%s '%s'",
+                               marker->cm_step, marker->cm_flags,
+                               marker->cm_tgtname, marker->cm_comment);
+       } else {
+               int i;
+
+               for (i = 0; i < lcfg->lcfg_bufcount; i++) {
+                       ptr += snprintf(ptr, end - ptr, "%d:%s  ", i,
+                                       lustre_cfg_string(lcfg, i));
+               }
+       }
+       /* return consumed bytes */
+       rc = ptr - buf;
+       RETURN(rc);
+}
+
+int class_config_dump_handler(const struct lu_env *env,
+                             struct llog_handle *handle,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       char    *outstr;
+       int      rc = 0;
+
+       ENTRY;
+
+       OBD_ALLOC(outstr, 256);
+       if (outstr == NULL)
+               RETURN(-ENOMEM);
+
+       if (rec->lrh_type == OBD_CFG_REC) {
+               class_config_parse_rec(rec, outstr, 256);
+               LCONSOLE(D_WARNING, "   %s\n", outstr);
+       } else {
+               LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+               rc = -EINVAL;
+       }
+
+       OBD_FREE(outstr, 256);
+       RETURN(rc);
+}
+
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+                          char *name, struct config_llog_instance *cfg)
+{
+       struct llog_handle      *llh;
+       int                      rc;
+
+       ENTRY;
+
+       LCONSOLE_INFO("Dumping config log %s\n", name);
+
+       rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+       if (rc)
+               RETURN(rc);
+
+       rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(parse_out, rc);
+
+       rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL);
+parse_out:
+       llog_close(env, llh);
+
+       LCONSOLE_INFO("End config log %s\n", name);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_dump_llog);
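+
+/* Example (illustrative, values made up): each OBD_CFG_REC is printed
+ * by class_config_parse_rec() as one line of the form
+ *
+ *     cmd=cf001 0:lustre-MDT0000  1:mdt  2:MDT0000_UUID
+ *
+ * i.e. the command word followed by the numbered config bufs.
+ */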
+
+/** Call class_cleanup and class_detach.
+ * "Manual" only in the sense that we're faking lcfg commands.
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+       char                    flags[3] = "";
+       struct lustre_cfg      *lcfg;
+       struct lustre_cfg_bufs  bufs;
+       int                     rc;
+       ENTRY;
+
+       if (!obd) {
+               CERROR("empty cleanup\n");
+               RETURN(-EALREADY);
+       }
+
+       if (obd->obd_force)
+               strcat(flags, "F");
+       if (obd->obd_fail)
+               strcat(flags, "A");
+
+       CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+              obd->obd_name, flags);
+
+       lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+       lustre_cfg_bufs_set_string(&bufs, 1, flags);
+       lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+       if (!lcfg)
+               RETURN(-ENOMEM);
+
+       rc = class_process_config(lcfg);
+       if (rc) {
+               CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+               GOTO(out, rc);
+       }
+
+       /* the lcfg is almost the same for both ops */
+       lcfg->lcfg_command = LCFG_DETACH;
+       rc = class_process_config(lcfg);
+       if (rc)
+               CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+       lustre_cfg_free(lcfg);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ */
+
+static unsigned
+uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid,
+                                 sizeof(((struct obd_uuid *)key)->uuid), mask);
+}
+
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+       return &exp->exp_client_uuid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       LASSERT(key);
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+       return obd_uuid_equals(key, &exp->exp_client_uuid) &&
+              !exp->exp_failed;
+}
+
+static void *
+uuid_export_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+}
+
+static void
+uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+       class_export_get(exp);
+}
+
+static void
+uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+       class_export_put(exp);
+}
+
+static cfs_hash_ops_t uuid_hash_ops = {
+       .hs_hash        = uuid_hash,
+       .hs_key         = uuid_key,
+       .hs_keycmp      = uuid_keycmp,
+       .hs_object      = uuid_export_object,
+       .hs_get         = uuid_export_get,
+       .hs_put_locked  = uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ */
+
+static unsigned
+nid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+static void *
+nid_key(struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+       return &exp->exp_connection->c_peer.nid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ *       state with this function
+ */
+static int
+nid_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       LASSERT(key);
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+       return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+              !exp->exp_failed;
+}
+
+static void *
+nid_export_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct obd_export, exp_nid_hash);
+}
+
+static void
+nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+       class_export_get(exp);
+}
+
+static void
+nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct obd_export *exp;
+
+       exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+       class_export_put(exp);
+}
+
+static cfs_hash_ops_t nid_hash_ops = {
+       .hs_hash        = nid_hash,
+       .hs_key         = nid_key,
+       .hs_keycmp      = nid_keycmp,
+       .hs_object      = nid_export_object,
+       .hs_get         = nid_export_get,
+       .hs_put_locked  = nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ */
+
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+       return &ns->nid;
+}
+
+static int
+nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+       return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key;
+}
+
+static void *
+nidstats_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct nid_stat, nid_hash);
+}
+
+static void
+nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+       nidstat_getref(ns);
+}
+
+static void
+nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct nid_stat *ns;
+
+       ns = hlist_entry(hnode, struct nid_stat, nid_hash);
+       nidstat_putref(ns);
+}
+
+static cfs_hash_ops_t nid_stat_hash_ops = {
+       .hs_hash        = nid_hash,
+       .hs_key         = nidstats_key,
+       .hs_keycmp      = nidstats_keycmp,
+       .hs_object      = nidstats_object,
+       .hs_get         = nidstats_get,
+       .hs_put_locked  = nidstats_put_locked,
+};
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644 (file)
index 0000000..99adad9
--- /dev/null
@@ -0,0 +1,1321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+#define PRINT_CMD CDEBUG
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+static int (*client_fill_super)(struct super_block *sb,
+                               struct vfsmount *mnt);
+
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Continue to process new statements appended to the logs
+ * (whenever the config lock is revoked) until lustre_end_log
+ * is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ *   the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ *   (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ *   this log, and is added to the mgc's list of logs to follow.
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+                    struct config_llog_instance *cfg)
+{
+       struct lustre_cfg *lcfg;
+       struct lustre_cfg_bufs *bufs;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int rc;
+       ENTRY;
+
+       LASSERT(mgc);
+       LASSERT(cfg);
+
+       OBD_ALLOC_PTR(bufs);
+       if (bufs == NULL)
+               RETURN(-ENOMEM);
+
+       /* mgc_process_config */
+       lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+       lustre_cfg_bufs_set_string(bufs, 1, logname);
+       lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+       lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+       lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+       rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+       lustre_cfg_free(lcfg);
+
+       OBD_FREE_PTR(bufs);
+
+       if (rc == -EINVAL)
+               LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
+                                  "failed from the MGS (%d).  Make sure this "
+                                  "client and the MGS are running compatible "
+                                  "versions of Lustre.\n",
+                                  mgc->obd_name, logname, rc);
+
+       if (rc)
+               LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
+                                  "failed (%d). This may be the result of "
+                                  "communication errors between this node and "
+                                  "the MGS, a bad configuration, or other "
+                                  "errors. See the syslog for more "
+                                  "information.\n", mgc->obd_name, logname,
+                                  rc);
+
+       /* class_obd_list(); */
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates */
+int lustre_end_log(struct super_block *sb, char *logname,
+                      struct config_llog_instance *cfg)
+{
+       struct lustre_cfg *lcfg;
+       struct lustre_cfg_bufs bufs;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *mgc = lsi->lsi_mgc;
+       int rc;
+       ENTRY;
+
+       if (!mgc)
+               RETURN(-ENOENT);
+
+       /* mgc_process_config */
+       lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+       lustre_cfg_bufs_set_string(&bufs, 1, logname);
+       if (cfg)
+               lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+       lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+       rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+       lustre_cfg_free(lcfg);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
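+
+/* Example (illustrative, made-up log name): the usual pairing on a
+ * mount path is
+ *
+ *     rc = lustre_process_log(sb, "lustre-client", cfg);
+ *     ...
+ *     rc = lustre_end_log(sb, "lustre-client", cfg);
+ *
+ * with the MGC following the named log, and replaying appended
+ * records whenever the config lock is revoked, between the two calls.
+ */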
+
+/**************** obd start *******************/
+
+/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do, for echo cli/srv).
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+           char *s1, char *s2, char *s3, char *s4)
+{
+       struct lustre_cfg_bufs bufs;
+       struct lustre_cfg *lcfg = NULL;
+       int rc;
+
+       CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+              cmd, s1, s2, s3, s4);
+
+       lustre_cfg_bufs_reset(&bufs, cfgname);
+       if (s1)
+               lustre_cfg_bufs_set_string(&bufs, 1, s1);
+       if (s2)
+               lustre_cfg_bufs_set_string(&bufs, 2, s2);
+       if (s3)
+               lustre_cfg_bufs_set_string(&bufs, 3, s3);
+       if (s4)
+               lustre_cfg_bufs_set_string(&bufs, 4, s4);
+
+       lcfg = lustre_cfg_new(cmd, &bufs);
+       if (lcfg == NULL)
+               return -ENOMEM;
+       lcfg->lcfg_nid = nid;
+       rc = class_process_config(lcfg);
+       lustre_cfg_free(lcfg);
+       return rc;
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Call class_attach and class_setup.  These methods in turn call
+ * obd type-specific methods.
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+                       char *s1, char *s2, char *s3, char *s4)
+{
+       int rc;
+       CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+       rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0);
+       if (rc) {
+               CERROR("%s attach error %d\n", obdname, rc);
+               return rc;
+       }
+       rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+       if (rc) {
+               CERROR("%s setup error %d\n", obdname, rc);
+               do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0);
+       }
+       return rc;
+}
+
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+       struct obd_connect_data *data = NULL;
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *obd;
+       struct obd_export *exp;
+       struct obd_uuid *uuid;
+       class_uuid_t uuidc;
+       lnet_nid_t nid;
+       char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+       char *ptr;
+       int recov_bk;
+       int rc = 0, i = 0, j, len;
+       ENTRY;
+
+       LASSERT(lsi->lsi_lmd);
+
+       /* Find the first non-lo MGS nid for our MGC name */
+       if (IS_SERVER(lsi)) {
+               /* mount -o mgsnode=nid */
+               ptr = lsi->lsi_lmd->lmd_mgs;
+               if (lsi->lsi_lmd->lmd_mgs &&
+                   (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+                       i++;
+               } else if (IS_MGS(lsi)) {
+                       lnet_process_id_t id;
+                       while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+                               if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+                                       continue;
+                               nid = id.nid;
+                               i++;
+                               break;
+                       }
+               }
+       } else { /* client */
+               /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+               ptr = lsi->lsi_lmd->lmd_dev;
+               if (class_parse_nid(ptr, &nid, &ptr) == 0)
+                       i++;
+       }
+       if (i == 0) {
+               CERROR("No valid MGS nids found.\n");
+               RETURN(-EINVAL);
+       }
+
+       mutex_lock(&mgc_start_lock);
+
+       len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+       OBD_ALLOC(mgcname, len);
+       OBD_ALLOC(niduuid, len + 2);
+       if (!mgcname || !niduuid)
+               GOTO(out_free, rc = -ENOMEM);
+       sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+       mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+       OBD_ALLOC_PTR(data);
+       if (data == NULL)
+               GOTO(out_free, rc = -ENOMEM);
+
+       obd = class_name2obd(mgcname);
+       if (obd && !obd->obd_stopping) {
+               rc = obd_set_info_async(NULL, obd->obd_self_export,
+                                       strlen(KEY_MGSSEC), KEY_MGSSEC,
+                                       strlen(mgssec), mgssec, NULL);
+               if (rc)
+                       GOTO(out_free, rc);
+
+               /* Re-using an existing MGC */
+               atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+               /* IR compatibility check, only for clients */
+               if (lmd_is_client(lsi->lsi_lmd)) {
+                       int has_ir;
+                       int vallen = sizeof(*data);
+                       __u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+                       rc = obd_get_info(NULL, obd->obd_self_export,
+                                         strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+                                         &vallen, data, NULL);
+                       LASSERT(rc == 0);
+                       has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+                       if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+                               /* LMD_FLG_NOIR is for test purpose only */
+                               LCONSOLE_WARN(
+                                   "Trying to mount a client with IR setting "
+                                   "not compatible with current mgc. "
+                                   "Force to use current mgc setting that is "
+                                   "IR %s.\n",
+                                   has_ir ? "enabled" : "disabled");
+                               if (has_ir)
+                                       *flags &= ~LMD_FLG_NOIR;
+                               else
+                                       *flags |= LMD_FLG_NOIR;
+                       }
+               }
+
+               recov_bk = 0;
+               /* If we are restarting the MGS, don't try to keep the MGC's
+                  old connection, or registration will fail. */
+               if (IS_MGS(lsi)) {
+                       CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+                       recov_bk = 1;
+               }
+
+               /* Try all connections, but only once (again).
+                  We don't want to block another target from starting
+                  (using its local copy of the log), but we do want to connect
+                  if at all possible. */
+               recov_bk++;
+               CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
+               rc = obd_set_info_async(NULL, obd->obd_self_export,
+                                       sizeof(KEY_INIT_RECOV_BACKUP),
+                                       KEY_INIT_RECOV_BACKUP,
+                                       sizeof(recov_bk), &recov_bk, NULL);
+               GOTO(out, rc = 0);
+       }
+
+       CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+       /* Add the primary nids for the MGS */
+       i = 0;
+       sprintf(niduuid, "%s_%x", mgcname, i);
+       if (IS_SERVER(lsi)) {
+               ptr = lsi->lsi_lmd->lmd_mgs;
+               if (IS_MGS(lsi)) {
+                       /* Use local nids (including LO) */
+                       lnet_process_id_t id;
+                       while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+                               rc = do_lcfg(mgcname, id.nid,
+                                            LCFG_ADD_UUID, niduuid, 0, 0, 0);
+                       }
+               } else {
+                       /* Use mgsnode= nids */
+                       /* mount -o mgsnode=nid */
+                       if (lsi->lsi_lmd->lmd_mgs) {
+                               ptr = lsi->lsi_lmd->lmd_mgs;
+                       } else if (class_find_param(ptr, PARAM_MGSNODE,
+                                                   &ptr) != 0) {
+                               CERROR("No MGS nids given.\n");
+                               GOTO(out_free, rc = -EINVAL);
+                       }
+                       while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                               rc = do_lcfg(mgcname, nid,
+                                            LCFG_ADD_UUID, niduuid, 0, 0, 0);
+                               i++;
+                       }
+               }
+       } else { /* client */
+               /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+               ptr = lsi->lsi_lmd->lmd_dev;
+               while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+                       rc = do_lcfg(mgcname, nid,
+                                    LCFG_ADD_UUID, niduuid, 0, 0, 0);
+                       i++;
+                       /* Stop at the first failover nid */
+                       if (*ptr == ':')
+                               break;
+               }
+       }
+       if (i == 0) {
+               CERROR("No valid MGS nids found.\n");
+               GOTO(out_free, rc = -EINVAL);
+       }
+       lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+       /* Random uuid for MGC allows easier reconnects */
+       OBD_ALLOC_PTR(uuid);
+       if (uuid == NULL)
+               GOTO(out_free, rc = -ENOMEM);
+       ll_generate_random_uuid(uuidc);
+       class_uuid_unparse(uuidc, uuid);
+
+       /* Start the MGC */
+       rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+                                (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+                                niduuid, 0, 0);
+       OBD_FREE_PTR(uuid);
+       if (rc)
+               GOTO(out_free, rc);
+
+       /* Add any failover MGS nids */
+       i = 1;
+       while (ptr && ((*ptr == ':' ||
+              class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+               /* New failover node */
+               sprintf(niduuid, "%s_%x", mgcname, i);
+               j = 0;
+               while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+                       j++;
+                       rc = do_lcfg(mgcname, nid,
+                                    LCFG_ADD_UUID, niduuid, 0, 0, 0);
+                       if (*ptr == ':')
+                               break;
+               }
+               if (j > 0) {
+                       rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+                                    niduuid, 0, 0, 0);
+                       i++;
+               } else {
+                       /* at ":/fsname" */
+                       break;
+               }
+       }
+       lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+       obd = class_name2obd(mgcname);
+       if (!obd) {
+               CERROR("Can't find mgcobd %s\n", mgcname);
+               GOTO(out_free, rc = -ENOTCONN);
+       }
+
+       rc = obd_set_info_async(NULL, obd->obd_self_export,
+                               strlen(KEY_MGSSEC), KEY_MGSSEC,
+                               strlen(mgssec), mgssec, NULL);
+       if (rc)
+               GOTO(out_free, rc);
+
+       /* Keep a refcount of servers/clients who started with "mount",
+          so we know when we can get rid of the mgc. */
+       atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+       /* Try all connections, but only once. */
+       recov_bk = 1;
+       rc = obd_set_info_async(NULL, obd->obd_self_export,
+                               sizeof(KEY_INIT_RECOV_BACKUP),
+                               KEY_INIT_RECOV_BACKUP,
+                               sizeof(recov_bk), &recov_bk, NULL);
+       if (rc)
+               /* nonfatal */
+               CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+
+       /* We connect to the MGS at setup, and don't disconnect until cleanup */
+       data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+                                 OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+                                 OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+       data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+       if (lmd_is_client(lsi->lsi_lmd) &&
+           lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+               data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+       data->ocd_version = LUSTRE_VERSION_CODE;
+       rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+       if (rc) {
+               CERROR("connect failed %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+       /* Keep the mgc info in the sb. Note that many lsi's can point
+          to the same mgc. */
+       lsi->lsi_mgc = obd;
+out_free:
+       mutex_unlock(&mgc_start_lock);
+
+       if (data)
+               OBD_FREE_PTR(data);
+       if (mgcname)
+               OBD_FREE(mgcname, len);
+       if (niduuid)
+               OBD_FREE(niduuid, len + 2);
+       RETURN(rc);
+}
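+
+/* Example (illustrative, made-up nid): for a first MGS nid of
+ * 192.168.0.1@tcp the code above produces mgcname
+ * "MGC192.168.0.1@tcp" and nid uuids "MGC192.168.0.1@tcp_0",
+ * "MGC192.168.0.1@tcp_1", ... (one per failover MGS node), which
+ * lustre_stop_mgc() below deletes again via LCFG_DEL_UUID.
+ */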
+
+static int lustre_stop_mgc(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct obd_device *obd;
+       char *niduuid = NULL, *ptr = NULL;
+       int i, rc = 0, len = 0;
+       ENTRY;
+
+       if (!lsi)
+               RETURN(-ENOENT);
+       obd = lsi->lsi_mgc;
+       if (!obd)
+               RETURN(-ENOENT);
+       lsi->lsi_mgc = NULL;
+
+       mutex_lock(&mgc_start_lock);
+       LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+       if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+               /* This is not fatal, every client that stops
+                  will call in here. */
+               CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+                      atomic_read(&obd->u.cli.cl_mgc_refcount));
+               GOTO(out, rc = -EBUSY);
+       }
+
+       /* The MGC has no recoverable data in any case.
+        * Force shutdown set in umount_begin. */
+       obd->obd_no_recov = 1;
+
+       if (obd->u.cli.cl_mgc_mgsexp) {
+               /* An error is not fatal; if we are unable to send the
+                  disconnect, the mgs ping evictor cleans up the export */
+               rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+               if (rc)
+                       CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+       }
+
+       /* Save the obdname for cleaning the nid uuids, which are
+          obdname_XX */
+       len = strlen(obd->obd_name) + 6;
+       OBD_ALLOC(niduuid, len);
+       if (niduuid) {
+               strcpy(niduuid, obd->obd_name);
+               ptr = niduuid + strlen(niduuid);
+       }
+
+       rc = class_manual_cleanup(obd);
+       if (rc)
+               GOTO(out, rc);
+
+       /* Clean the nid uuids */
+       if (!niduuid)
+               GOTO(out, rc = -ENOMEM);
+
+       for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+               sprintf(ptr, "_%x", i);
+               rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+                            niduuid, 0, 0, 0);
+               if (rc)
+                       CERROR("del MDC UUID %s failed: rc = %d\n",
+                              niduuid, rc);
+       }
+out:
+       if (niduuid)
+               OBD_FREE(niduuid, len);
+
+       /* class_import_put will get rid of the additional connections */
+       mutex_unlock(&mgc_start_lock);
+       RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi;
+       ENTRY;
+
+       OBD_ALLOC_PTR(lsi);
+       if (!lsi)
+               RETURN(NULL);
+       OBD_ALLOC_PTR(lsi->lsi_lmd);
+       if (!lsi->lsi_lmd) {
+               OBD_FREE_PTR(lsi);
+               RETURN(NULL);
+       }
+
+       lsi->lsi_lmd->lmd_exclude_count = 0;
+       lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+       lsi->lsi_lmd->lmd_recovery_time_hard = 0;
+       s2lsi_nocast(sb) = lsi;
+       /* we take 1 extra ref for our setup */
+       atomic_set(&lsi->lsi_mounts, 1);
+
+       /* Default umount style */
+       lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+
+       RETURN(lsi);
+}
+
+static int lustre_free_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       ENTRY;
+
+       LASSERT(lsi != NULL);
+       CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+       /* someone didn't call server_put_mount. */
+       LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+       if (lsi->lsi_lmd != NULL) {
+               if (lsi->lsi_lmd->lmd_dev != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_dev,
+                                strlen(lsi->lsi_lmd->lmd_dev) + 1);
+               if (lsi->lsi_lmd->lmd_profile != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_profile,
+                                strlen(lsi->lsi_lmd->lmd_profile) + 1);
+               if (lsi->lsi_lmd->lmd_mgssec != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_mgssec,
+                                strlen(lsi->lsi_lmd->lmd_mgssec) + 1);
+               if (lsi->lsi_lmd->lmd_opts != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_opts,
+                                strlen(lsi->lsi_lmd->lmd_opts) + 1);
+               if (lsi->lsi_lmd->lmd_exclude_count)
+                       OBD_FREE(lsi->lsi_lmd->lmd_exclude,
+                                sizeof(lsi->lsi_lmd->lmd_exclude[0]) *
+                                lsi->lsi_lmd->lmd_exclude_count);
+               if (lsi->lsi_lmd->lmd_mgs != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_mgs,
+                                strlen(lsi->lsi_lmd->lmd_mgs) + 1);
+               if (lsi->lsi_lmd->lmd_osd_type != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_osd_type,
+                                strlen(lsi->lsi_lmd->lmd_osd_type) + 1);
+               if (lsi->lsi_lmd->lmd_params != NULL)
+                       OBD_FREE(lsi->lsi_lmd->lmd_params, 4096);
+
+               OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+       }
+
+       LASSERT(lsi->lsi_llsbi == NULL);
+       OBD_FREE(lsi, sizeof(*lsi));
+       s2lsi_nocast(sb) = NULL;
+
+       RETURN(0);
+}
+
+/* The lsi has one reference for every server that is using the disk -
+   e.g. MDT, MGS, and potentially MGC */
+int lustre_put_lsi(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       ENTRY;
+
+       LASSERT(lsi != NULL);
+
+       CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+       if (atomic_dec_and_test(&lsi->lsi_mounts)) {
+               if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+                       obd_disconnect(lsi->lsi_osd_exp);
+                       /* wait till OSD is gone */
+                       obd_zombie_barrier();
+               }
+               lustre_free_lsi(sb);
+               RETURN(1);
+       }
+       RETURN(0);
+}
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into.
+ *  Must have room for the fsname plus a terminating NUL.
+ * @param [out] endptr if endptr isn't NULL it is set to the end of fsname
+ * rc < 0  on error
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+       const char *dash = strrchr(svname, '-');
+       if (!dash) {
+               dash = strrchr(svname, ':');
+               if (!dash)
+                       return -EINVAL;
+       }
+
+       /* interpret <fsname>-MDTXXXXX-mdc as an mdt; a better way would be
+        * to pass in the fsname, then determine the server index */
+       if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
+               dash--;
+               for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+                       ;
+               if (dash == svname)
+                       return -EINVAL;
+       }
+
+       if (fsname != NULL) {
+               strncpy(fsname, svname, dash - svname);
+               fsname[dash - svname] = '\0';
+       }
+
+       if (endptr != NULL)
+               *endptr = dash;
+
+       return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get the service name (svname) from a string
+ * rc < 0 on error
+ * if endptr isn't NULL it is set to the end of the fsname
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+                      size_t svsize)
+{
+       int rc;
+       const char *dash;
+
+       /* We use server_name2fsname() just for parsing */
+       rc = server_name2fsname(label, NULL, &dash);
+       if (rc != 0)
+               return rc;
+
+       if (*dash != '-')
+               return -EINVAL;
+
+       if (strlcpy(svname, dash + 1, svsize) >= svsize)
+               return -E2BIG;
+
+       return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+
+/* Get the index from the obd name.
+   rc = server type, or
+   rc < 0  on error
+   if endptr isn't NULL it is set to end of name */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+       unsigned long index;
+       int rc;
+       const char *dash;
+
+       /* We use server_name2fsname() just for parsing */
+       rc = server_name2fsname(svname, NULL, &dash);
+       if (rc != 0)
+               return rc;
+
+       if (*dash != '-')
+               return -EINVAL;
+
+       dash++;
+
+       if (strncmp(dash, "MDT", 3) == 0)
+               rc = LDD_F_SV_TYPE_MDT;
+       else if (strncmp(dash, "OST", 3) == 0)
+               rc = LDD_F_SV_TYPE_OST;
+       else
+               return -EINVAL;
+
+       dash += 3;
+
+       if (strcmp(dash, "all") == 0)
+               return rc | LDD_F_SV_ALL;
+
+       index = simple_strtoul(dash, (char **)endptr, 16);
+       *idx = index;
+
+       return rc;
+}
+EXPORT_SYMBOL(server_name2index);
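+
+/* Worked example (illustrative) for the three parsers above, with the
+ * made-up label "lustre-OST003F":
+ *
+ *     __u32 idx;
+ *     char fsname[16], svname[16];
+ *
+ *     server_name2fsname("lustre-OST003F", fsname, NULL);
+ *             fsname == "lustre"
+ *     server_name2svname("lustre-OST003F", svname, NULL, sizeof(svname));
+ *             svname == "OST003F"
+ *     server_name2index("lustre-OST003F", &idx, NULL);
+ *             returns LDD_F_SV_TYPE_OST, idx == 0x3f
+ */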
+
+/*************** mount common between server and client ***************/
+
+/* Common umount */
+int lustre_common_put_super(struct super_block *sb)
+{
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+       /* Drop a ref to the MGC */
+       rc = lustre_stop_mgc(sb);
+       if (rc && (rc != -ENOENT)) {
+               if (rc != -EBUSY) {
+                       CERROR("Can't stop MGC: %d\n", rc);
+                       RETURN(rc);
+               }
+               /* BUSY just means that there's some other obd that
+                  needs the mgc.  Let him clean it up. */
+               CDEBUG(D_MOUNT, "MGC still in use\n");
+       }
+       /* Drop a ref to the mounted disk */
+       lustre_put_lsi(sb);
+       lu_types_stop();
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+       int i;
+
+       PRINT_CMD(D_MOUNT, "  mount data:\n");
+       if (lmd_is_client(lmd))
+               PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+       PRINT_CMD(D_MOUNT, "device:  %s\n", lmd->lmd_dev);
+       PRINT_CMD(D_MOUNT, "flags:   %x\n", lmd->lmd_flags);
+
+       if (lmd->lmd_opts)
+               PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+
+       if (lmd->lmd_recovery_time_soft)
+               PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+                         lmd->lmd_recovery_time_soft);
+
+       if (lmd->lmd_recovery_time_hard)
+               PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+                         lmd->lmd_recovery_time_hard);
+
+       for (i = 0; i < lmd->lmd_exclude_count; i++) {
+               PRINT_CMD(D_MOUNT, "exclude %d:  OST%04x\n", i,
+                         lmd->lmd_exclude[i]);
+       }
+}
+
+/* Is this server on the exclusion list */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+       struct lustre_mount_data *lmd = lsi->lsi_lmd;
+       __u32 index;
+       int i, rc;
+       ENTRY;
+
+       rc = server_name2index(svname, &index, NULL);
+       if (rc != LDD_F_SV_TYPE_OST)
+               /* Only exclude OSTs */
+               RETURN(0);
+
+       CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+              index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+       for (i = 0; i < lmd->lmd_exclude_count; i++) {
+               if (index == lmd->lmd_exclude[i]) {
+                       CWARN("Excluding %s (on exclusion list)\n", svname);
+                       RETURN(1);
+               }
+       }
+       RETURN(0);
+}
+
+/* mount -v  -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+       const char *s1 = ptr, *s2;
+       __u32 index, *exclude_list;
+       int rc = 0, devmax;
+       ENTRY;
+
+       /* The shortest an ost name can be is 8 chars: -OST0000.
+          We don't actually know the fsname at this time, so in fact
+          a user could specify any fsname. */
+       devmax = strlen(ptr) / 8 + 1;
+
+       /* temp storage until we figure out how many we have */
+       OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+       if (!exclude_list)
+               RETURN(-ENOMEM);
+
+       /* we enter this fn pointing at the '=' */
+       while (*s1 && *s1 != ' ' && *s1 != ',') {
+               s1++;
+               rc = server_name2index(s1, &index, &s2);
+               if (rc < 0) {
+                       CERROR("Can't parse server name '%s'\n", s1);
+                       break;
+               }
+               if (rc == LDD_F_SV_TYPE_OST)
+                       exclude_list[lmd->lmd_exclude_count++] = index;
+               else
+                       CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+               s1 = s2;
+               /* now we are pointing at ':' (next exclude)
+                  or ',' (end of excludes) */
+               if (lmd->lmd_exclude_count >= devmax)
+                       break;
+       }
+       if (rc >= 0) /* non-err */
+               rc = 0;
+
+       if (lmd->lmd_exclude_count) {
+               /* permanent, freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
+                         lmd->lmd_exclude_count);
+               if (lmd->lmd_exclude) {
+                       memcpy(lmd->lmd_exclude, exclude_list,
+                              sizeof(index) * lmd->lmd_exclude_count);
+               } else {
+                       rc = -ENOMEM;
+                       lmd->lmd_exclude_count = 0;
+               }
+       }
+       OBD_FREE(exclude_list, sizeof(index) * devmax);
+       RETURN(rc);
+}
+
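+/*
+ * Worked example (names are illustrative): for
+ *     -o exclude=lustre-OST0001:lustre-OST0002
+ * lmd_make_exclusion() is handed the '=' and records indices 0x0001 and
+ * 0x0002 in lmd->lmd_exclude, so a later
+ * lustre_check_exclusion(sb, "lustre-OST0002") returns 1 and that OST
+ * is skipped at setup time.
+ */
+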
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+       char   *tail;
+       int     length;
+
+       if (lmd->lmd_mgssec != NULL) {
+               OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+               lmd->lmd_mgssec = NULL;
+       }
+
+       tail = strchr(ptr, ',');
+       if (tail == NULL)
+               length = strlen(ptr);
+       else
+               length = tail - ptr;
+
+       OBD_ALLOC(lmd->lmd_mgssec, length + 1);
+       if (lmd->lmd_mgssec == NULL)
+               return -ENOMEM;
+
+       memcpy(lmd->lmd_mgssec, ptr, length);
+       lmd->lmd_mgssec[length] = '\0';
+       return 0;
+}
+
+static int lmd_parse_string(char **handle, char *ptr)
+{
+       char   *tail;
+       int     length;
+
+       if ((handle == NULL) || (ptr == NULL))
+               return -EINVAL;
+
+       if (*handle != NULL) {
+               OBD_FREE(*handle, strlen(*handle) + 1);
+               *handle = NULL;
+       }
+
+       tail = strchr(ptr, ',');
+       if (tail == NULL)
+               length = strlen(ptr);
+       else
+               length = tail - ptr;
+
+       OBD_ALLOC(*handle, length + 1);
+       if (*handle == NULL)
+               return -ENOMEM;
+
+       memcpy(*handle, ptr, length);
+       (*handle)[length] = '\0';
+
+       return 0;
+}
+
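+/*
+ * For instance (hypothetical value), with ptr pointing into
+ * "lustre-MDT0000,flock", lmd_parse_string(&lmd->lmd_profile, ptr)
+ * copies just "lustre-MDT0000" (up to the ',') into a freshly
+ * allocated, NUL-terminated buffer.
+ */
+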
+/* Collect multiple values for mgsnid specifiers */
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+       lnet_nid_t nid;
+       char *tail = *ptr;
+       char *mgsnid;
+       int   length;
+       int   oldlen = 0;
+
+       /* Find end of nidlist */
+       while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {}
+       length = tail - *ptr;
+       if (length == 0) {
+               LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+               return -EINVAL;
+       }
+
+       if (lmd->lmd_mgs != NULL)
+               oldlen = strlen(lmd->lmd_mgs) + 1;
+
+       OBD_ALLOC(mgsnid, oldlen + length + 1);
+       if (mgsnid == NULL)
+               return -ENOMEM;
+
+       if (lmd->lmd_mgs != NULL) {
+               /* Multiple mgsnid= are taken to mean failover locations */
+               memcpy(mgsnid, lmd->lmd_mgs, oldlen);
+               mgsnid[oldlen - 1] = ':';
+               OBD_FREE(lmd->lmd_mgs, oldlen);
+       }
+       memcpy(mgsnid + oldlen, *ptr, length);
+       mgsnid[oldlen + length] = '\0';
+       lmd->lmd_mgs = mgsnid;
+       *ptr = tail;
+
+       return 0;
+}
+
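+/*
+ * E.g. (hypothetical NIDs) the two options "mgsnid=10.0.0.1@tcp" and
+ * "mgsnid=10.0.0.2@tcp" accumulate into
+ * lmd->lmd_mgs = "10.0.0.1@tcp:10.0.0.2@tcp", the ':' separating
+ * failover locations.
+ */
+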
+/** Parse mount line options
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * dev is passed as device=uml1:/lustre by mount.lustre
+ */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+       char *s1, *s2, *devname = NULL;
+       struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(lmd);
+       if (!options) {
+               LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
+                                  "/sbin/mount.lustre is installed.\n");
+               RETURN(-EINVAL);
+       }
+
+       /* Options should be a string - try to detect old lmd data */
+       if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+               LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
+                                  "/sbin/mount.lustre.  Please install "
+                                  "version %s\n", LUSTRE_VERSION_STRING);
+               RETURN(-EINVAL);
+       }
+       lmd->lmd_magic = LMD_MAGIC;
+
+       OBD_ALLOC(lmd->lmd_params, 4096);
+       if (lmd->lmd_params == NULL)
+               RETURN(-ENOMEM);
+       lmd->lmd_params[0] = '\0';
+
+       /* Set default flags here */
+
+       s1 = options;
+       while (*s1) {
+               int clear = 0;
+               int time_min = OBD_RECOVERY_TIME_MIN;
+
+               /* Skip whitespace and extra commas */
+               while (*s1 == ' ' || *s1 == ',')
+                       s1++;
+
+               /* Client options are parsed in ll_options: eg. flock,
+                  user_xattr, acl */
+
+               /* Parse non-ldiskfs options here. Rather than modifying
+                  ldiskfs, we just zero these out here */
+               if (strncmp(s1, "abort_recov", 11) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+                       clear++;
+               } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+                       lmd->lmd_recovery_time_soft = max_t(int,
+                               simple_strtoul(s1 + 19, NULL, 10), time_min);
+                       clear++;
+               } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+                       lmd->lmd_recovery_time_hard = max_t(int,
+                               simple_strtoul(s1 + 19, NULL, 10), time_min);
+                       clear++;
+               } else if (strncmp(s1, "noir", 4) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+                       clear++;
+               } else if (strncmp(s1, "nosvc", 5) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOSVC;
+                       clear++;
+               } else if (strncmp(s1, "nomgs", 5) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOMGS;
+                       clear++;
+               } else if (strncmp(s1, "noscrub", 7) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+                       clear++;
+               } else if (strncmp(s1, PARAM_MGSNODE,
+                                  sizeof(PARAM_MGSNODE) - 1) == 0) {
+                       s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+                       /* Assume the next mount opt is the first
+                          invalid nid we get to. */
+                       rc = lmd_parse_mgs(lmd, &s2);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "writeconf", 9) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_WRITECONF;
+                       clear++;
+               } else if (strncmp(s1, "update", 6) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_UPDATE;
+                       clear++;
+               } else if (strncmp(s1, "virgin", 6) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_VIRGIN;
+                       clear++;
+               } else if (strncmp(s1, "noprimnode", 10) == 0) {
+                       lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+                       clear++;
+               } else if (strncmp(s1, "mgssec=", 7) == 0) {
+                       rc = lmd_parse_mgssec(lmd, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               /* ost exclusion list */
+               } else if (strncmp(s1, "exclude=", 8) == 0) {
+                       rc = lmd_make_exclusion(lmd, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "mgs", 3) == 0) {
+                       /* We are an MGS */
+                       lmd->lmd_flags |= LMD_FLG_MGS;
+                       clear++;
+               } else if (strncmp(s1, "svname=", 7) == 0) {
+                       rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               } else if (strncmp(s1, "param=", 6) == 0) {
+                       int length;
+                       char *tail = strchr(s1 + 6, ',');
+                       if (tail == NULL)
+                               length = strlen(s1);
+                       else
+                               length = tail - s1;
+                       length -= 6;
+                       strncat(lmd->lmd_params, s1 + 6, length);
+                       strcat(lmd->lmd_params, " ");
+                       clear++;
+               } else if (strncmp(s1, "osd=", 4) == 0) {
+                       rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+                       if (rc)
+                               goto invalid;
+                       clear++;
+               }
+               /* Linux 2.4 doesn't pass the device, so we stick it at
+                  the end of the options. */
+               else if (strncmp(s1, "device=", 7) == 0) {
+                       devname = s1 + 7;
+                       /* terminate options right before device.  device
+                          must be the last one. */
+                       *s1 = '\0';
+                       break;
+               }
+
+               /* Find next opt */
+               s2 = strchr(s1, ',');
+               if (s2 == NULL) {
+                       if (clear)
+                               *s1 = '\0';
+                       break;
+               }
+               s2++;
+               if (clear)
+                       memmove(s1, s2, strlen(s2) + 1);
+               else
+                       s1 = s2;
+       }
+
+       if (!devname) {
+               LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
+                                  "(need mount option 'device=...')\n");
+               goto invalid;
+       }
+
+       s1 = strstr(devname, ":/");
+       if (s1) {
+               ++s1;
+               lmd->lmd_flags |= LMD_FLG_CLIENT;
+               /* Remove leading /s from fsname */
+               while (*++s1 == '/') ;
+               /* Freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+               if (!lmd->lmd_profile)
+                       RETURN(-ENOMEM);
+               sprintf(lmd->lmd_profile, "%s-client", s1);
+       }
+
+       /* Freed in lustre_free_lsi */
+       OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+       if (!lmd->lmd_dev)
+               RETURN(-ENOMEM);
+       strcpy(lmd->lmd_dev, devname);
+
+       /* Save mount options */
+       s1 = options + strlen(options) - 1;
+       while (s1 >= options && (*s1 == ',' || *s1 == ' '))
+               *s1-- = 0;
+       if (*options != 0) {
+               /* Freed in lustre_free_lsi */
+               OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+               if (!lmd->lmd_opts)
+                       RETURN(-ENOMEM);
+               strcpy(lmd->lmd_opts, options);
+       }
+
+       lmd_print(lmd);
+       lmd->lmd_magic = LMD_MAGIC;
+
+       RETURN(rc);
+
+invalid:
+       CERROR("Bad mount options %s\n", options);
+       RETURN(-EINVAL);
+}
+
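+/*
+ * Sketch of the transformation (device name is hypothetical): parsing
+ *     "abort_recov,flock,device=mgs@tcp:/lustre"
+ * sets LMD_FLG_ABORT_RECOV and LMD_FLG_CLIENT in lmd_flags and leaves
+ * lmd_dev = "mgs@tcp:/lustre", lmd_profile = "lustre-client" and
+ * lmd_opts = "flock" (consumed server-side options are stripped).
+ */
+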
+struct lustre_mount_data2 {
+       void *lmd2_data;
+       struct vfsmount *lmd2_mnt;
+};
+
+/** Entry point for the mount call into Lustre.
+ * Called when a server or client is mounted; this is where setup begins.
+ * @param data Mount options (e.g. -o flock,abort_recov)
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+       struct lustre_mount_data *lmd;
+       struct lustre_mount_data2 *lmd2 = data;
+       struct lustre_sb_info *lsi;
+       int rc;
+       ENTRY;
+
+       CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+       lsi = lustre_init_lsi(sb);
+       if (!lsi)
+               RETURN(-ENOMEM);
+       lmd = lsi->lsi_lmd;
+
+       /*
+        * Disable lockdep during mount, because mount locking patterns are
+        * `special'.
+        */
+       lockdep_off();
+
+       /*
+        * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+        */
+       obd_zombie_barrier();
+
+       /* Figure out the lmd from the mount options */
+       if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+               lustre_put_lsi(sb);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (lmd_is_client(lmd)) {
+               CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+               if (!client_fill_super) {
+                       LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
+                                          "client mount! Is the 'lustre' "
+                                          "module loaded?\n");
+                       lustre_put_lsi(sb);
+                       rc = -ENODEV;
+               } else {
+                       rc = lustre_start_mgc(sb);
+                       if (rc) {
+                               lustre_put_lsi(sb);
+                               GOTO(out, rc);
+                       }
+                       /* Connect and start */
+                       /* (should always be ll_fill_super) */
+                       rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+                       /* c_f_s will call lustre_common_put_super on failure */
+               }
+       } else {
+               CERROR("This is client-side-only module, "
+                      "cannot handle server mount.\n");
+               rc = -EINVAL;
+       }
+
+       /* If an error happens in the fill_super() call, @lsi will be
+        * freed there; this is why we do not put it here. */
+       GOTO(out, rc);
+out:
+       if (rc) {
+               CERROR("Unable to mount %s (%d)\n",
+                      s2lsi(sb) ? lmd->lmd_dev : "", rc);
+       } else {
+               CDEBUG(D_SUPER, "Mount %s complete\n",
+                      lmd->lmd_dev);
+       }
+       lockdep_on();
+       return rc;
+}
+
+/* We can't call ll_fill_super by name because it lives in a module that
+   must be loaded after this one. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+                                                 struct vfsmount *mnt))
+{
+       client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+       kill_super_cb = cfs;
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+                               const char *devname, void *data)
+{
+       struct lustre_mount_data2 lmd2 = { data, NULL };
+
+       return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+void lustre_kill_super(struct super_block *sb)
+{
+       struct lustre_sb_info *lsi = s2lsi(sb);
+
+       if (kill_super_cb && lsi && !IS_SERVER(lsi))
+               (*kill_super_cb)(sb);
+
+       kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ */
+struct file_system_type lustre_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "lustre",
+       .mount          = lustre_mount,
+       .kill_sb        = lustre_kill_super,
+       .fs_flags       = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+                         FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+
+int lustre_register_fs(void)
+{
+       return register_filesystem(&lustre_fs_type);
+}
+
+int lustre_unregister_fs(void)
+{
+       return unregister_filesystem(&lustre_fs_type);
+}
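+
+/*
+ * Usage sketch (the callers are assumed, they are not in this file):
+ * obdclass calls lustre_register_fs() at module load and
+ * lustre_unregister_fs() at unload; once registered, "mount -t lustre"
+ * resolves to lustre_mount() above.
+ */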
diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644 (file)
index 0000000..01a0e1f
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+       dst->o_parent_oid = fid_oid(parent);
+       dst->o_parent_seq = fid_seq(parent);
+       dst->o_parent_ver = fid_ver(parent);
+       dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID;
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/* WARNING: the file systems must take care not to tinker with
+   attributes they don't manage (such as blocks). */
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+       obd_flag newvalid = 0;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+                      valid, LTIME_S(src->i_mtime),
+                      LTIME_S(src->i_ctime));
+
+       if (valid & OBD_MD_FLATIME) {
+               dst->o_atime = LTIME_S(src->i_atime);
+               newvalid |= OBD_MD_FLATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               dst->o_mtime = LTIME_S(src->i_mtime);
+               newvalid |= OBD_MD_FLMTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               dst->o_ctime = LTIME_S(src->i_ctime);
+               newvalid |= OBD_MD_FLCTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               dst->o_size = i_size_read(src);
+               newvalid |= OBD_MD_FLSIZE;
+       }
+       if (valid & OBD_MD_FLBLOCKS) {  /* allocation of space (x512 bytes) */
+               dst->o_blocks = src->i_blocks;
+               newvalid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & OBD_MD_FLBLKSZ) {   /* optimal block size */
+               dst->o_blksize = ll_inode_blksize(src);
+               newvalid |= OBD_MD_FLBLKSZ;
+       }
+       if (valid & OBD_MD_FLTYPE) {
+               dst->o_mode = (dst->o_mode & S_IALLUGO) |
+                             (src->i_mode & S_IFMT);
+               newvalid |= OBD_MD_FLTYPE;
+       }
+       if (valid & OBD_MD_FLMODE) {
+               dst->o_mode = (dst->o_mode & S_IFMT) |
+                             (src->i_mode & S_IALLUGO);
+               newvalid |= OBD_MD_FLMODE;
+       }
+       if (valid & OBD_MD_FLUID) {
+               dst->o_uid = src->i_uid;
+               newvalid |= OBD_MD_FLUID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               dst->o_gid = src->i_gid;
+               newvalid |= OBD_MD_FLGID;
+       }
+       if (valid & OBD_MD_FLFLAGS) {
+               dst->o_flags = ll_inode_flags(src);
+               newvalid |= OBD_MD_FLFLAGS;
+       }
+       dst->o_valid |= newvalid;
+}
+EXPORT_SYMBOL(obdo_from_inode);
+
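+/*
+ * Typical call (the mask below is illustrative):
+ *     obdo_from_inode(oa, inode, OBD_MD_FLMTIME | OBD_MD_FLSIZE);
+ * copies only mtime and size and ORs exactly those flags into
+ * oa->o_valid.
+ */
+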
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
+{
+       CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n",
+              POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+       if (valid & OBD_MD_FLATIME)
+               dst->o_atime = src->o_atime;
+       if (valid & OBD_MD_FLMTIME)
+               dst->o_mtime = src->o_mtime;
+       if (valid & OBD_MD_FLCTIME)
+               dst->o_ctime = src->o_ctime;
+       if (valid & OBD_MD_FLSIZE)
+               dst->o_size = src->o_size;
+       if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+               dst->o_blocks = src->o_blocks;
+       if (valid & OBD_MD_FLBLKSZ)
+               dst->o_blksize = src->o_blksize;
+       if (valid & OBD_MD_FLTYPE)
+               dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT);
+       if (valid & OBD_MD_FLMODE)
+               dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+       if (valid & OBD_MD_FLUID)
+               dst->o_uid = src->o_uid;
+       if (valid & OBD_MD_FLGID)
+               dst->o_gid = src->o_gid;
+       if (valid & OBD_MD_FLFLAGS)
+               dst->o_flags = src->o_flags;
+       if (valid & OBD_MD_FLFID) {
+               dst->o_parent_seq = src->o_parent_seq;
+               dst->o_parent_ver = src->o_parent_ver;
+       }
+       if (valid & OBD_MD_FLGENER)
+               dst->o_parent_oid = src->o_parent_oid;
+       if (valid & OBD_MD_FLHANDLE)
+               dst->o_handle = src->o_handle;
+       if (valid & OBD_MD_FLCOOKIE)
+               dst->o_lcookie = src->o_lcookie;
+
+       dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/* Returns FALSE if the fields selected by @compare match, TRUE if any differ */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare)
+{
+       int res = 0;
+
+       if (compare & OBD_MD_FLATIME)
+               res = (res || (dst->o_atime != src->o_atime));
+       if (compare & OBD_MD_FLMTIME)
+               res = (res || (dst->o_mtime != src->o_mtime));
+       if (compare & OBD_MD_FLCTIME)
+               res = (res || (dst->o_ctime != src->o_ctime));
+       if (compare & OBD_MD_FLSIZE)
+               res = (res || (dst->o_size != src->o_size));
+       if (compare & OBD_MD_FLBLOCKS) /* allocation of space */
+               res = (res || (dst->o_blocks != src->o_blocks));
+       if (compare & OBD_MD_FLBLKSZ)
+               res = (res || (dst->o_blksize != src->o_blksize));
+       if (compare & OBD_MD_FLTYPE)
+               res = (res || (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0));
+       if (compare & OBD_MD_FLMODE)
+               res = (res || (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0));
+       if (compare & OBD_MD_FLUID)
+               res = (res || (dst->o_uid != src->o_uid));
+       if (compare & OBD_MD_FLGID)
+               res = (res || (dst->o_gid != src->o_gid));
+       if (compare & OBD_MD_FLFLAGS)
+               res = (res || (dst->o_flags != src->o_flags));
+       if (compare & OBD_MD_FLNLINK)
+               res = (res || (dst->o_nlink != src->o_nlink));
+       if (compare & OBD_MD_FLFID) {
+               res = (res || (dst->o_parent_seq != src->o_parent_seq));
+               res = (res || (dst->o_parent_ver != src->o_parent_ver));
+       }
+       if (compare & OBD_MD_FLGENER)
+               res = (res || (dst->o_parent_oid != src->o_parent_oid));
+       /* XXX Don't know if these should be included here - wasn't previously
+       if (compare & OBD_MD_FLINLINE)
+               res = (res || memcmp(dst->o_inline, src->o_inline));
+       */
+       return res;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+       ioobj->ioo_oid = oa->o_oi;
+       if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP)))
+               ostid_set_seq_mdt0(&ioobj->ioo_oid);
+
+       /* Since 2.4 this does not contain o_mode in the low 16 bits.
+        * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+       ioobj->ioo_max_brw = 0;
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+       if (ia_valid & ATTR_ATIME) {
+               oa->o_atime = LTIME_S(attr->ia_atime);
+               oa->o_valid |= OBD_MD_FLATIME;
+       }
+       if (ia_valid & ATTR_MTIME) {
+               oa->o_mtime = LTIME_S(attr->ia_mtime);
+               oa->o_valid |= OBD_MD_FLMTIME;
+       }
+       if (ia_valid & ATTR_CTIME) {
+               oa->o_ctime = LTIME_S(attr->ia_ctime);
+               oa->o_valid |= OBD_MD_FLCTIME;
+       }
+       if (ia_valid & ATTR_SIZE) {
+               oa->o_size = attr->ia_size;
+               oa->o_valid |= OBD_MD_FLSIZE;
+       }
+       if (ia_valid & ATTR_MODE) {
+               oa->o_mode = attr->ia_mode;
+               oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+               if (!current_is_in_group(oa->o_gid) &&
+                   !cfs_capable(CFS_CAP_FSETID))
+                       oa->o_mode &= ~S_ISGID;
+       }
+       if (ia_valid & ATTR_UID) {
+               oa->o_uid = attr->ia_uid;
+               oa->o_valid |= OBD_MD_FLUID;
+       }
+       if (ia_valid & ATTR_GID) {
+               oa->o_gid = attr->ia_gid;
+               oa->o_valid |= OBD_MD_FLGID;
+       }
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
+{
+       valid &= oa->o_valid;
+
+       if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+               CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n",
+                      oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+       attr->ia_valid = 0;
+       if (valid & OBD_MD_FLATIME) {
+               LTIME_S(attr->ia_atime) = oa->o_atime;
+               attr->ia_valid |= ATTR_ATIME;
+       }
+       if (valid & OBD_MD_FLMTIME) {
+               LTIME_S(attr->ia_mtime) = oa->o_mtime;
+               attr->ia_valid |= ATTR_MTIME;
+       }
+       if (valid & OBD_MD_FLCTIME) {
+               LTIME_S(attr->ia_ctime) = oa->o_ctime;
+               attr->ia_valid |= ATTR_CTIME;
+       }
+       if (valid & OBD_MD_FLSIZE) {
+               attr->ia_size = oa->o_size;
+               attr->ia_valid |= ATTR_SIZE;
+       }
+#if 0   /* you shouldn't be able to change a file's type with setattr */
+       if (valid & OBD_MD_FLTYPE) {
+               attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT);
+               attr->ia_valid |= ATTR_MODE;
+       }
+#endif
+       if (valid & OBD_MD_FLMODE) {
+               attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT);
+               attr->ia_valid |= ATTR_MODE;
+               if (!current_is_in_group(oa->o_gid) &&
+                   !cfs_capable(CFS_CAP_FSETID))
+                       attr->ia_mode &= ~S_ISGID;
+       }
+       if (valid & OBD_MD_FLUID) {
+               attr->ia_uid = oa->o_uid;
+               attr->ia_valid |= ATTR_UID;
+       }
+       if (valid & OBD_MD_FLGID) {
+               attr->ia_gid = oa->o_gid;
+               attr->ia_valid |= ATTR_GID;
+       }
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid)
+{
+       iattr_from_obdo(&op_data->op_attr, oa, valid);
+       if (valid & OBD_MD_FLBLOCKS) {
+               op_data->op_attr_blocks = oa->o_blocks;
+               op_data->op_attr.ia_valid |= ATTR_BLOCKS;
+       }
+       if (valid & OBD_MD_FLFLAGS) {
+               ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
+                       oa->o_flags;
+               op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+       }
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+                 unsigned int valid)
+{
+       obdo_from_iattr(oa, &op_data->op_attr, valid);
+       if (valid & ATTR_BLOCKS) {
+               oa->o_blocks = op_data->op_attr_blocks;
+               oa->o_valid |= OBD_MD_FLBLOCKS;
+       }
+       if (valid & ATTR_ATTR_FLAG) {
+               oa->o_flags =
+                       ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags;
+               oa->o_valid |= OBD_MD_FLFLAGS;
+       }
+}
+EXPORT_SYMBOL(obdo_from_md);
+
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+       dobdo->o_size = cpu_to_le64(sobdo->o_size);
+       dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime);
+       dobdo->o_atime = cpu_to_le64(sobdo->o_atime);
+       dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime);
+       dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+       dobdo->o_mode = cpu_to_le32(sobdo->o_mode);
+       dobdo->o_uid = cpu_to_le32(sobdo->o_uid);
+       dobdo->o_gid = cpu_to_le32(sobdo->o_gid);
+       dobdo->o_flags = cpu_to_le32(sobdo->o_flags);
+       dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink);
+       dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize);
+       dobdo->o_valid = cpu_to_le64(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+       dobdo->o_size = le64_to_cpu(sobdo->o_size);
+       dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime);
+       dobdo->o_atime = le64_to_cpu(sobdo->o_atime);
+       dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime);
+       dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+       dobdo->o_mode = le32_to_cpu(sobdo->o_mode);
+       dobdo->o_uid = le32_to_cpu(sobdo->o_uid);
+       dobdo->o_gid = le32_to_cpu(sobdo->o_gid);
+       dobdo->o_flags = le32_to_cpu(sobdo->o_flags);
+       dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink);
+       dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize);
+       dobdo->o_valid = le64_to_cpu(sobdo->o_valid);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);
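+
+/*
+ * The two helpers above are inverses for the fields they cover, so for
+ * any obdo o (the wire format being little-endian):
+ *     obdo_cpu_to_le(&wire, &o);
+ *     obdo_le_to_cpu(&back, &wire);
+ * leaves back.o_size == o.o_size, back.o_mode == o.o_mode, and so on.
+ */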
diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644 (file)
index 0000000..c3b7a78
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <lustre_export.h>
+#include <lustre_net.h>
+#include <obd_support.h>
+#include <obd_class.h>
+
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+       memset(osfs, 0, sizeof(*osfs));
+       osfs->os_type = sfs->f_type;
+       osfs->os_blocks = sfs->f_blocks;
+       osfs->os_bfree = sfs->f_bfree;
+       osfs->os_bavail = sfs->f_bavail;
+       osfs->os_files = sfs->f_files;
+       osfs->os_ffree = sfs->f_ffree;
+       osfs->os_bsize = sfs->f_bsize;
+       osfs->os_namelen = sfs->f_namelen;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+       memset(sfs, 0, sizeof(*sfs));
+       sfs->f_type = osfs->os_type;
+       sfs->f_blocks = osfs->os_blocks;
+       sfs->f_bfree = osfs->os_bfree;
+       sfs->f_bavail = osfs->os_bavail;
+       sfs->f_files = osfs->os_files;
+       sfs->f_ffree = osfs->os_ffree;
+       sfs->f_bsize = osfs->os_bsize;
+       sfs->f_namelen = osfs->os_namelen;
+}
+EXPORT_SYMBOL(statfs_unpack);
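+
+/*
+ * statfs_pack() and statfs_unpack() are field-for-field inverses: a
+ * kstatfs packed on one node and unpacked on another reports the same
+ * f_blocks/f_bfree/f_files/... values as the original.
+ */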
diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644 (file)
index 0000000..af5f27f
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * Public include file for the UUID library
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+       __u32 value;
+
+       LASSERT(nob <= sizeof value);
+
+       for (value = 0; nob > 0; --nob)
+               value = (value << 8) | *((*ptr)++);
+       return value;
+}
+
+#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr))
+
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+       __u8 *ptr = in;
+
+       LASSERT(nr * sizeof *uu == sizeof(class_uuid_t));
+
+       while (nr-- > 0)
+               CONSUME(uu[nr], &ptr);
+}
+
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+       /* uu as an array of __u16's */
+       __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)];
+
+       CLASSERT(ARRAY_SIZE(uuid) == 8);
+
+       uuid_unpack(uu, uuid, ARRAY_SIZE(uuid));
+       sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+               uuid[0], uuid[1], uuid[2], uuid[3],
+               uuid[4], uuid[5], uuid[6], uuid[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);
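+
+/*
+ * Worked example: unpacking the byte sequence 00 01 02 ... 0f fills
+ * uuid[7] = 0x0001 down to uuid[0] = 0x0e0f, which prints as
+ * "0e0f0c0d-0a0b-0809-0607-040502030001".
+ */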
diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile
new file mode 100644 (file)
index 0000000..4c48e24
--- /dev/null
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LUSTRE_FS) += obdecho.o
+obdecho-y := echo_client.o lproc_echo.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdecho/echo.c b/drivers/staging/lustre/lustre/obdecho/echo.c
new file mode 100644 (file)
index 0000000..9e64939
--- /dev/null
@@ -0,0 +1,679 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre_dlm.h>
+#include <lprocfs_status.h>
+
+#include "echo_internal.h"
+
+/* The echo objid needs to be below 2^32, because regular FID numbers are
+ * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. b=23335 */
+#define ECHO_INIT_OID  0x10000000ULL
+#define ECHO_HANDLE_MAGIC    0xabcd0123fedc9876ULL
+
+#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_CACHE_SHIFT)
+static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES];
+
+enum {
+       LPROC_ECHO_READ_BYTES = 1,
+       LPROC_ECHO_WRITE_BYTES = 2,
+       LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES + 1
+};
+
+static int echo_connect(const struct lu_env *env,
+                       struct obd_export **exp, struct obd_device *obd,
+                       struct obd_uuid *cluuid, struct obd_connect_data *data,
+                       void *localdata)
+{
+       struct lustre_handle conn = { 0 };
+       int rc;
+
+       data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED;
+       rc = class_connect(&conn, obd, cluuid);
+       if (rc) {
+               CERROR("can't connect %d\n", rc);
+               return rc;
+       }
+       *exp = class_conn2export(&conn);
+
+       return 0;
+}
+
+static int echo_disconnect(struct obd_export *exp)
+{
+       LASSERT(exp != NULL);
+
+       return server_disconnect_export(exp);
+}
+
+static int echo_init_export(struct obd_export *exp)
+{
+       return ldlm_init_export(exp);
+}
+
+static int echo_destroy_export(struct obd_export *exp)
+{
+       ENTRY;
+
+       target_destroy_export(exp);
+       ldlm_destroy_export(exp);
+
+       RETURN(0);
+}
+
+static __u64 echo_next_id(struct obd_device *obddev)
+{
+       obd_id id;
+
+       spin_lock(&obddev->u.echo.eo_lock);
+       id = ++obddev->u.echo.eo_lastino;
+       spin_unlock(&obddev->u.echo.eo_lock);
+
+       return id;
+}
+
+static int echo_create(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md **ea,
+                      struct obd_trans_info *oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               return -EINVAL;
+       }
+
+       if (!(oa->o_mode & S_IFMT)) {
+               CERROR("echo obd: no type!\n");
+               return -ENOENT;
+       }
+
+       if (!(oa->o_valid & OBD_MD_FLTYPE)) {
+               CERROR("invalid o_valid "LPX64"\n", oa->o_valid);
+               return -EINVAL;
+       }
+
+       ostid_set_seq_echo(&oa->o_oi);
+       ostid_set_id(&oa->o_oi, echo_next_id(obd));
+       oa->o_valid = OBD_MD_FLID;
+
+       return 0;
+}
+
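+/*
+ * E.g. the first successful create on a freshly set-up echo device
+ * returns id ECHO_INIT_OID + 1 in the FID_SEQ_ECHO sequence (see
+ * echo_next_id() above).
+ */
+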
+static int echo_destroy(const struct lu_env *env, struct obd_export *exp,
+                       struct obdo *oa, struct lov_stripe_md *ea,
+                       struct obd_trans_info *oti, struct obd_export *md_exp,
+                       void *capa)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino ||
+           ostid_id(&oa->o_oi) < ECHO_INIT_OID) {
+               CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi));
+               RETURN(-EINVAL);
+       }
+
+       RETURN(0);
+}
+
+static int echo_getattr(const struct lu_env *env, struct obd_export *exp,
+                       struct obd_info *oinfo)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       obd_id id = ostid_id(&oinfo->oi_oa->o_oi);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n",
+                      oinfo->oi_oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid);
+       ostid_set_seq_echo(&oinfo->oi_oa->o_oi);
+       ostid_set_id(&oinfo->oi_oa->o_oi, id);
+
+       RETURN(0);
+}
+
+static int echo_setattr(const struct lu_env *env, struct obd_export *exp,
+                       struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ENTRY;
+       if (!obd) {
+               CERROR("invalid client cookie "LPX64"\n",
+                      exp->exp_handle.h_cookie);
+               RETURN(-EINVAL);
+       }
+
+       if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) {
+               CERROR("obdo missing FLID valid flag: "LPX64"\n",
+                      oinfo->oi_oa->o_valid);
+               RETURN(-EINVAL);
+       }
+
+       memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
+
+       if (ostid_id(&oinfo->oi_oa->o_oi) & 4) {
+               /* Save lock to force ACKed reply */
+               ldlm_lock_addref(&obd->u.echo.eo_nl_lock, LCK_NL);
+               oti->oti_ack_locks[0].mode = LCK_NL;
+               oti->oti_ack_locks[0].lock = obd->u.echo.eo_nl_lock;
+       }
+
+       RETURN(0);
+}
+
+static void
+echo_page_debug_setup(struct page *page, int rw, obd_id id,
+                     __u64 offset, int len)
+{
+       int   page_offset = offset & ~CFS_PAGE_MASK;
+       char *addr      = ((char *)kmap(page)) + page_offset;
+
+       if (len % OBD_ECHO_BLOCK_SIZE != 0)
+               CERROR("Unexpected block size %d\n", len);
+
+       while (len > 0) {
+               if (rw & OBD_BRW_READ)
+                       block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+                                         offset, id);
+               else
+                       block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE,
+                                         0xecc0ecc0ecc0ecc0ULL,
+                                         0xecc0ecc0ecc0ecc0ULL);
+
+               addr   += OBD_ECHO_BLOCK_SIZE;
+               offset += OBD_ECHO_BLOCK_SIZE;
+               len    -= OBD_ECHO_BLOCK_SIZE;
+       }
+
+       kunmap(page);
+}
+
+static int
+echo_page_debug_check(struct page *page, obd_id id,
+                     __u64 offset, int len)
+{
+       int   page_offset = offset & ~CFS_PAGE_MASK;
+       char *addr      = ((char *)kmap(page)) + page_offset;
+       int   rc          = 0;
+       int   rc2;
+
+       if (len % OBD_ECHO_BLOCK_SIZE != 0)
+               CERROR("Unexpected block size %d\n", len);
+
+       while (len > 0) {
+               rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE,
+                                       offset, id);
+
+               if (rc2 != 0 && rc == 0)
+                       rc = rc2;
+
+               addr   += OBD_ECHO_BLOCK_SIZE;
+               offset += OBD_ECHO_BLOCK_SIZE;
+               len    -= OBD_ECHO_BLOCK_SIZE;
+       }
+
+       kunmap(page);
+
+       return rc;
+}
+
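+/*
+ * Together, echo_page_debug_setup() and echo_page_debug_check() stamp
+ * every OBD_ECHO_BLOCK_SIZE block with its (offset, id) pair on the way
+ * out and verify the stamp on the way back, so a misplaced block shows
+ * up as a block_debug_check() failure instead of silent corruption.
+ */
+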
+/* This allows us to verify that desc_private is passed unmolested */
+#define DESC_PRIV 0x10293847
+
+static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
+                            struct niobuf_remote *nb, int *pages,
+                            struct niobuf_local *lb, int cmd, int *left)
+{
+       int gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ?
+                       GFP_HIGHUSER : GFP_IOFS;
+       int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID;
+       int debug_setup = (!ispersistent &&
+                          (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                          (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+       struct niobuf_local *res = lb;
+       obd_off offset = nb->offset;
+       int len = nb->len;
+
+       while (len > 0) {
+               int plen = PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1));
+               if (len < plen)
+                       plen = len;
+
+               /* check for local buf overflow */
+               if (*left == 0)
+                       return -EINVAL;
+
+               res->lnb_file_offset = offset;
+               res->len = plen;
+               LASSERT((res->lnb_file_offset & ~CFS_PAGE_MASK) + res->len <=
+                       PAGE_CACHE_SIZE);
+
+               if (ispersistent &&
+                   ((res->lnb_file_offset >> PAGE_CACHE_SHIFT) <
+                     ECHO_PERSISTENT_PAGES)) {
+                       res->page =
+                               echo_persistent_pages[res->lnb_file_offset >>
+                                                     PAGE_CACHE_SHIFT];
+                       /* Take extra ref so __free_pages() can be called OK */
+                       get_page(res->page);
+               } else {
+                       OBD_PAGE_ALLOC(res->page, gfp_mask);
+                       if (res->page == NULL) {
+                               CERROR("can't get page for id " DOSTID"\n",
+                                      POSTID(&obj->ioo_oid));
+                               return -ENOMEM;
+                       }
+               }
+
+               CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+                      res->page, res->lnb_file_offset, res->len);
+
+               if (cmd & OBD_BRW_READ)
+                       res->rc = res->len;
+
+               if (debug_setup)
+                       echo_page_debug_setup(res->page, cmd,
+                                             ostid_id(&obj->ioo_oid),
+                                             res->lnb_file_offset, res->len);
+
+               offset += plen;
+               len -= plen;
+               res++;
+
+               (*left)--;
+               (*pages)++;
+       }
+
+       return 0;
+}
+
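+/*
+ * Example: a niobuf of three pages at offset 0 on the persistent object
+ * (ECHO_PERSISTENT_OBJID) maps to echo_persistent_pages[0..2], each
+ * with an extra page reference; any other object gets freshly allocated
+ * pages instead (assuming ECHO_PERSISTENT_PAGES >= 3).
+ */
+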
+static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj,
+                           struct niobuf_remote *rb, int *pgs,
+                           struct niobuf_local *lb, int verify)
+{
+       struct niobuf_local *res = lb;
+       obd_off start  = rb->offset >> PAGE_CACHE_SHIFT;
+       obd_off end    = (rb->offset + rb->len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       int     count  = (int)(end - start);
+       int     rc     = 0;
+       int     i;
+
+       for (i = 0; i < count; i++, (*pgs)++, res++) {
+               struct page *page = res->page;
+               void       *addr;
+
+               if (page == NULL) {
+                       CERROR("null page objid "LPU64":%p, buf %d/%d\n",
+                              ostid_id(&obj->ioo_oid), page, i,
+                              obj->ioo_bufcnt);
+                       return -EFAULT;
+               }
+
+               addr = kmap(page);
+
+               CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
+                      res->page, addr, res->lnb_file_offset);
+
+               if (verify) {
+                       int vrc = echo_page_debug_check(page,
+                                                       ostid_id(&obj->ioo_oid),
+                                                       res->lnb_file_offset,
+                                                       res->len);
+                       /* check all the pages always */
+                       if (vrc != 0 && rc == 0)
+                               rc = vrc;
+               }
+
+               kunmap(page);
+               /* NB see comment above regarding persistent pages */
+               OBD_PAGE_FREE(page);
+       }
+
+       return rc;
+}
+
+static int echo_preprw(const struct lu_env *env, int cmd,
+                      struct obd_export *export, struct obdo *oa,
+                      int objcount, struct obd_ioobj *obj,
+                      struct niobuf_remote *nb, int *pages,
+                      struct niobuf_local *res, struct obd_trans_info *oti,
+                      struct lustre_capa *unused)
+{
+       struct obd_device *obd;
+       int tot_bytes = 0;
+       int rc = 0;
+       int i, left;
+       ENTRY;
+
+       obd = export->exp_obd;
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       /* Temp fix to stop falling foul of osc_announce_cached() */
+       oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT);
+
+       memset(res, 0, sizeof(*res) * *pages);
+
+       CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n",
+              cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages);
+
+       if (oti)
+               oti->oti_handle = (void *)DESC_PRIV;
+
+       left = *pages;
+       *pages = 0;
+
+       for (i = 0; i < objcount; i++, obj++) {
+               int j;
+
+               for (j = 0; j < obj->ioo_bufcnt; j++, nb++) {
+                       rc = echo_map_nb_to_lb(oa, obj, nb, pages,
+                                              res + *pages, cmd, &left);
+                       if (rc)
+                               GOTO(preprw_cleanup, rc);
+
+                       tot_bytes += nb->len;
+               }
+       }
+
+       atomic_add(*pages, &obd->u.echo.eo_prep);
+
+       if (cmd & OBD_BRW_READ)
+               lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+                                   tot_bytes);
+       else
+               lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+                                   tot_bytes);
+
+       CDEBUG(D_PAGE, "%d pages allocated after prep\n",
+              atomic_read(&obd->u.echo.eo_prep));
+
+       RETURN(0);
+
+preprw_cleanup:
+       /* It is possible that we would rather handle errors by allowing
+        * any already-set-up pages to complete, rather than tearing them
+        * all down again.  I believe that this is what the in-kernel
+        * prep/commit operations do.
+        */
+       CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount);
+       for (i = 0; i < *pages; i++) {
+               kunmap(res[i].page);
+               /* NB if this is a persistent page, __free_pages will just
+                * lose the extra ref gained above */
+               OBD_PAGE_FREE(res[i].page);
+               res[i].page = NULL;
+               atomic_dec(&obd->u.echo.eo_prep);
+       }
+
+       return rc;
+}
+
+static int echo_commitrw(const struct lu_env *env, int cmd,
+                        struct obd_export *export, struct obdo *oa,
+                        int objcount, struct obd_ioobj *obj,
+                        struct niobuf_remote *rb, int niocount,
+                        struct niobuf_local *res, struct obd_trans_info *oti,
+                        int rc)
+{
+       struct obd_device *obd;
+       int pgs = 0;
+       int i;
+       ENTRY;
+
+       obd = export->exp_obd;
+       if (obd == NULL)
+               RETURN(-EINVAL);
+
+       if (rc)
+               GOTO(commitrw_cleanup, rc);
+
+       if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) {
+               CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n",
+                      objcount, niocount);
+       } else {
+               CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n",
+                      objcount, niocount);
+       }
+
+       if (niocount && res == NULL) {
+               CERROR("NULL res niobuf with niocount %d\n", niocount);
+               RETURN(-EINVAL);
+       }
+
+       LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV);
+
+       for (i = 0; i < objcount; i++, obj++) {
+               int verify = (rc == 0 &&
+                            ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID &&
+                             (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                             (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+               int j;
+
+               for (j = 0; j < obj->ioo_bufcnt; j++, rb++) {
+                       int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs],
+                                                  verify);
+                       if (vrc == 0)
+                               continue;
+
+                       if (vrc == -EFAULT)
+                               GOTO(commitrw_cleanup, rc = vrc);
+
+                       if (rc == 0)
+                               rc = vrc;
+               }
+
+       }
+
+       atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+       CDEBUG(D_PAGE, "%d pages remain after commit\n",
+              atomic_read(&obd->u.echo.eo_prep));
+       RETURN(rc);
+
+commitrw_cleanup:
+       atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+       CERROR("cleaning up %d pages (%d obdos)\n",
+              niocount - pgs - 1, objcount);
+
+       while (pgs < niocount) {
+               struct page *page = res[pgs++].page;
+
+               if (page == NULL)
+                       continue;
+
+               /* NB see comment above regarding persistent pages */
+               OBD_PAGE_FREE(page);
+               atomic_dec(&obd->u.echo.eo_prep);
+       }
+       return rc;
+}
+
+static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars;
+       int                     rc;
+       __u64                 lock_flags = 0;
+       struct ldlm_res_id       res_id = {.name = {1}};
+       char                   ns_name[48];
+       ENTRY;
+
+       obd->u.echo.eo_obt.obt_magic = OBT_MAGIC;
+       spin_lock_init(&obd->u.echo.eo_lock);
+       obd->u.echo.eo_lastino = ECHO_INIT_OID;
+
+       sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid);
+       obd->obd_namespace = ldlm_namespace_new(obd, ns_name,
+                                               LDLM_NAMESPACE_SERVER,
+                                               LDLM_NAMESPACE_MODEST,
+                                               LDLM_NS_TYPE_OST);
+       if (obd->obd_namespace == NULL) {
+               LBUG();
+               RETURN(-ENOMEM);
+       }
+
+       rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN,
+                                   NULL, LCK_NL, &lock_flags, NULL,
+                                   ldlm_completion_ast, NULL, NULL, 0,
+                                   LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock);
+       LASSERT(rc == ELDLM_OK);
+
+       lprocfs_echo_init_vars(&lvars);
+       if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0 &&
+           lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) {
+               lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES,
+                                    LPROCFS_CNTR_AVGMINMAX,
+                                    "read_bytes", "bytes");
+               lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES,
+                                    LPROCFS_CNTR_AVGMINMAX,
+                                    "write_bytes", "bytes");
+       }
+
+       ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
+                          "echo_ldlm_cb_client", &obd->obd_ldlm_client);
+       RETURN(0);
+}
+
+static int echo_cleanup(struct obd_device *obd)
+{
+       int leaked;
+       ENTRY;
+
+       lprocfs_obd_cleanup(obd);
+       lprocfs_free_obd_stats(obd);
+
+       ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL);
+
+       /* XXX Bug 3413; wait for a bit to ensure the BL callback has
+        * happened before calling ldlm_namespace_free() */
+       schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                      cfs_time_seconds(1));
+
+       ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force);
+       obd->obd_namespace = NULL;
+
+       leaked = atomic_read(&obd->u.echo.eo_prep);
+       if (leaked != 0)
+               CERROR("%d prep/commitrw pages leaked\n", leaked);
+
+       RETURN(0);
+}
+
+struct obd_ops echo_obd_ops = {
+       .o_owner           = THIS_MODULE,
+       .o_connect       = echo_connect,
+       .o_disconnect      = echo_disconnect,
+       .o_init_export     = echo_init_export,
+       .o_destroy_export  = echo_destroy_export,
+       .o_create         = echo_create,
+       .o_destroy       = echo_destroy,
+       .o_getattr       = echo_getattr,
+       .o_setattr       = echo_setattr,
+       .o_preprw         = echo_preprw,
+       .o_commitrw     = echo_commitrw,
+       .o_setup           = echo_setup,
+       .o_cleanup       = echo_cleanup
+};
+
+void echo_persistent_pages_fini(void)
+{
+       int     i;
+
+       for (i = 0; i < ECHO_PERSISTENT_PAGES; i++)
+               if (echo_persistent_pages[i] != NULL) {
+                       OBD_PAGE_FREE(echo_persistent_pages[i]);
+                       echo_persistent_pages[i] = NULL;
+               }
+}
+
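+/*
+ * Pre-allocate the pool of zero-filled persistent pages referenced from
+ * the prep/commitrw path; half the pool is allocated with GFP_IOFS and
+ * half with GFP_HIGHUSER.
+ */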
+int echo_persistent_pages_init(void)
+{
+       struct page *pg;
+       int       i;
+
+       for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) {
+               int gfp_mask = (i < ECHO_PERSISTENT_PAGES/2) ?
+                       GFP_IOFS : GFP_HIGHUSER;
+
+               OBD_PAGE_ALLOC(pg, gfp_mask);
+               if (pg == NULL) {
+                       echo_persistent_pages_fini();
+                       return -ENOMEM;
+               }
+
+               memset(kmap(pg), 0, PAGE_CACHE_SIZE);
+               kunmap(pg);
+
+               echo_persistent_pages[i] = pg;
+       }
+
+       return 0;
+}
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
new file mode 100644 (file)
index 0000000..184195f
--- /dev/null
@@ -0,0 +1,3223 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_ECHO
+#include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <cl_object.h>
+#include <lustre_fid.h>
+#include <lustre_acl.h>
+#include <lustre_net.h>
+#include <obd_lov.h>
+
+#include "echo_internal.h"
+
+/** \defgroup echo_client Echo Client
+ * @{
+ */
+
+struct echo_device {
+       struct cl_device        ed_cl;
+       struct echo_client_obd *ed_ec;
+
+       struct cl_site    ed_site_myself;
+       struct cl_site   *ed_site;
+       struct lu_device       *ed_next;
+       int                  ed_next_islov;
+       int                  ed_next_ismd;
+       struct lu_client_seq   *ed_cl_seq;
+};
+
+struct echo_object {
+       struct cl_object        eo_cl;
+       struct cl_object_header eo_hdr;
+
+       struct echo_device     *eo_dev;
+       struct list_head              eo_obj_chain;
+       struct lov_stripe_md   *eo_lsm;
+       atomic_t            eo_npages;
+       int                  eo_deleted;
+};
+
+struct echo_object_conf {
+       struct cl_object_conf  eoc_cl;
+       struct lov_stripe_md **eoc_md;
+};
+
+struct echo_page {
+       struct cl_page_slice   ep_cl;
+       struct mutex            ep_lock;
+       struct page         *ep_vmpage;
+};
+
+struct echo_lock {
+       struct cl_lock_slice   el_cl;
+       struct list_head             el_chain;
+       struct echo_object    *el_object;
+       __u64             el_cookie;
+       atomic_t           el_refcount;
+};
+
+struct echo_io {
+       struct cl_io_slice     ei_cl;
+};
+
+#if 0
+struct echo_req {
+       struct cl_req_slice er_cl;
+};
+#endif
+
+static int echo_client_setup(const struct lu_env *env,
+                            struct obd_device *obddev,
+                            struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers Helper functions
+ * @{
+ */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
+{
+       return container_of0(dev, struct echo_device, ed_cl);
+}
+
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+       return &d->ed_cl;
+}
+
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+       return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
+}
+
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+       return &eco->eo_cl;
+}
+
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+       return container_of(o, struct echo_object, eo_cl);
+}
+
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+       return container_of(s, struct echo_page, ep_cl);
+}
+
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+       return container_of(s, struct echo_lock, el_cl);
+}
+
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+       return ecl->el_cl.cls_lock;
+}
+
+static struct lu_context_key echo_thread_key;
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+       struct echo_thread_info *info;
+       info = lu_context_key_get(&env->le_ctx, &echo_thread_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+       return container_of(c, struct echo_object_conf, eoc_cl);
+}
+
+/** @} echo_helpers */
+
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                              struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start,
+                          obd_off end, int mode, __u64 *cookie);
+static int cl_echo_cancel(struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                             struct page **pages, int npages, int async);
+
+struct echo_thread_info {
+       struct echo_object_conf eti_conf;
+       struct lustre_md        eti_md;
+
+       struct cl_2queue        eti_queue;
+       struct cl_io        eti_io;
+       struct cl_lock_descr    eti_descr;
+       struct lu_fid      eti_fid;
+       struct lu_fid           eti_fid2;
+       struct md_op_spec       eti_spec;
+       struct lov_mds_md_v3    eti_lmm;
+       struct lov_user_md_v3   eti_lum;
+       struct md_attr    eti_ma;
+       struct lu_name    eti_lname;
+       /* per-thread values, can be re-used */
+       void                    *eti_big_lmm;
+       int                     eti_big_lmmsize;
+       char                eti_name[20];
+       struct lu_buf      eti_buf;
+       char                eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE];
+};
+
+/* No session used right now */
+struct echo_session_info {
+       unsigned long dummy;
+};
+
+static struct kmem_cache *echo_lock_kmem;
+static struct kmem_cache *echo_object_kmem;
+static struct kmem_cache *echo_thread_kmem;
+static struct kmem_cache *echo_session_kmem;
+//static struct kmem_cache *echo_req_kmem;
+
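+/* Slab caches for the lock/object/thread/session state above, set up
+ * and torn down in bulk through this lu_kmem_descr table. */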
+static struct lu_kmem_descr echo_caches[] = {
+       {
+               .ckd_cache = &echo_lock_kmem,
+               .ckd_name  = "echo_lock_kmem",
+               .ckd_size  = sizeof(struct echo_lock)
+       },
+       {
+               .ckd_cache = &echo_object_kmem,
+               .ckd_name  = "echo_object_kmem",
+               .ckd_size  = sizeof(struct echo_object)
+       },
+       {
+               .ckd_cache = &echo_thread_kmem,
+               .ckd_name  = "echo_thread_kmem",
+               .ckd_size  = sizeof(struct echo_thread_info)
+       },
+       {
+               .ckd_cache = &echo_session_kmem,
+               .ckd_name  = "echo_session_kmem",
+               .ckd_size  = sizeof(struct echo_session_info)
+       },
+#if 0
+       {
+               .ckd_cache = &echo_req_kmem,
+               .ckd_name  = "echo_req_kmem",
+               .ckd_size  = sizeof(struct echo_req)
+       },
+#endif
+       {
+               .ckd_cache = NULL
+       }
+};
+
+/** \defgroup echo_page Page operations
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+static struct page *echo_page_vmpage(const struct lu_env *env,
+                                   const struct cl_page_slice *slice)
+{
+       return cl2echo_page(slice)->ep_vmpage;
+}
+
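+/*
+ * Page ownership is modelled by the per-page ep_lock mutex: owning the
+ * page means holding the mutex, and cpo_is_vmlocked reports -EBUSY
+ * while it is held.
+ */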
+static int echo_page_own(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io, int nonblock)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       if (!nonblock)
+               mutex_lock(&ep->ep_lock);
+       else if (!mutex_trylock(&ep->ep_lock))
+               return -EAGAIN;
+       return 0;
+}
+
+static void echo_page_disown(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       LASSERT(mutex_is_locked(&ep->ep_lock));
+       mutex_unlock(&ep->ep_lock);
+}
+
+static void echo_page_discard(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
+{
+       cl_page_delete(env, slice->cpl_page);
+}
+
+static int echo_page_is_vmlocked(const struct lu_env *env,
+                                const struct cl_page_slice *slice)
+{
+       if (mutex_is_locked(&cl2echo_page(slice)->ep_lock))
+               return -EBUSY;
+       return -ENODATA;
+}
+
+static void echo_page_completion(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                int ioret)
+{
+       LASSERT(slice->cpl_page->cp_sync_io != NULL);
+}
+
+static void echo_page_fini(const struct lu_env *env,
+                          struct cl_page_slice *slice)
+{
+       struct echo_page *ep    = cl2echo_page(slice);
+       struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+       struct page *vmpage      = ep->ep_vmpage;
+       ENTRY;
+
+       atomic_dec(&eco->eo_npages);
+       page_cache_release(vmpage);
+       EXIT;
+}
+
+static int echo_page_prep(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *unused)
+{
+       return 0;
+}
+
+static int echo_page_print(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          void *cookie, lu_printer_t printer)
+{
+       struct echo_page *ep = cl2echo_page(slice);
+
+       (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n",
+                  ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage);
+       return 0;
+}
+
+static const struct cl_page_operations echo_page_ops = {
+       .cpo_own           = echo_page_own,
+       .cpo_disown     = echo_page_disown,
+       .cpo_discard       = echo_page_discard,
+       .cpo_vmpage     = echo_page_vmpage,
+       .cpo_fini         = echo_page_fini,
+       .cpo_print       = echo_page_print,
+       .cpo_is_vmlocked   = echo_page_is_vmlocked,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_prep       = echo_page_prep,
+                       .cpo_completion  = echo_page_completion,
+               },
+               [CRT_WRITE] = {
+                       .cpo_prep       = echo_page_prep,
+                       .cpo_completion  = echo_page_completion,
+               }
+       }
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock Locking
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+static void echo_lock_fini(const struct lu_env *env,
+                          struct cl_lock_slice *slice)
+{
+       struct echo_lock *ecl = cl2echo_lock(slice);
+
+       LASSERT(list_empty(&ecl->el_chain));
+       OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem);
+}
+
+static void echo_lock_delete(const struct lu_env *env,
+                            const struct cl_lock_slice *slice)
+{
+       struct echo_lock *ecl      = cl2echo_lock(slice);
+
+       LASSERT(list_empty(&ecl->el_chain));
+}
+
+static int echo_lock_fits_into(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              const struct cl_lock_descr *need,
+                              const struct cl_io *unused)
+{
+       return 1;
+}
+
+static struct cl_lock_operations echo_lock_ops = {
+       .clo_fini      = echo_lock_fini,
+       .clo_delete    = echo_lock_delete,
+       .clo_fits_into = echo_lock_fits_into
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops cl_object operations
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_page *page, struct page *vmpage)
+{
+       struct echo_page *ep = cl_object_page_slice(obj, page);
+       struct echo_object *eco = cl2echo_obj(obj);
+       ENTRY;
+
+       ep->ep_vmpage = vmpage;
+       page_cache_get(vmpage);
+       mutex_init(&ep->ep_lock);
+       cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
+       atomic_inc(&eco->eo_npages);
+       RETURN(0);
+}
+
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io)
+{
+       return 0;
+}
+
+static int echo_lock_init(const struct lu_env *env,
+                         struct cl_object *obj, struct cl_lock *lock,
+                         const struct cl_io *unused)
+{
+       struct echo_lock *el;
+       ENTRY;
+
+       OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, __GFP_IO);
+       if (el != NULL) {
+               cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops);
+               el->el_object = cl2echo_obj(obj);
+               INIT_LIST_HEAD(&el->el_chain);
+               atomic_set(&el->el_refcount, 0);
+       }
+       RETURN(el == NULL ? -ENOMEM : 0);
+}
+
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_object_conf *conf)
+{
+       return 0;
+}
+
+static const struct cl_object_operations echo_cl_obj_ops = {
+       .coo_page_init = echo_page_init,
+       .coo_lock_init = echo_lock_init,
+       .coo_io_init   = echo_io_init,
+       .coo_conf_set  = echo_conf_set
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops lu_object operations
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf)
+{
+       struct echo_device *ed   = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+       struct echo_client_obd *ec     = ed->ed_ec;
+       struct echo_object *eco = cl2echo_obj(lu2cl(obj));
+       ENTRY;
+
+       if (ed->ed_next) {
+               struct lu_object  *below;
+               struct lu_device  *under;
+
+               under = ed->ed_next;
+               below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+                                                       under);
+               if (below == NULL)
+                       RETURN(-ENOMEM);
+               lu_object_add(obj, below);
+       }
+
+       if (!ed->ed_next_ismd) {
+               const struct cl_object_conf *cconf = lu2cl_conf(conf);
+               struct echo_object_conf *econf = cl2echo_conf(cconf);
+
+               LASSERT(econf->eoc_md);
+               eco->eo_lsm = *econf->eoc_md;
+               /* clear the lsm pointer so that it won't get freed. */
+               *econf->eoc_md = NULL;
+       } else {
+               eco->eo_lsm = NULL;
+       }
+
+       eco->eo_dev = ed;
+       atomic_set(&eco->eo_npages, 0);
+       cl_object_page_init(lu2cl(obj), sizeof(struct echo_page));
+
+       spin_lock(&ec->ec_lock);
+       list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+       spin_unlock(&ec->ec_lock);
+
+       RETURN(0);
+}
+
+/* taken from osc_unpackmd() */
+static int echo_alloc_memmd(struct echo_device *ed,
+                           struct lov_stripe_md **lsmp)
+{
+       int lsm_size;
+
+       ENTRY;
+
+       /* If export is lov/osc then use their obd method */
+       if (ed->ed_next != NULL)
+               return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp);
+       /* OFD has no unpackmd method, do everything here */
+       lsm_size = lov_stripe_md_size(1);
+
+       LASSERT(*lsmp == NULL);
+       OBD_ALLOC(*lsmp, lsm_size);
+       if (*lsmp == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+       if ((*lsmp)->lsm_oinfo[0] == NULL) {
+               OBD_FREE(*lsmp, lsm_size);
+               RETURN(-ENOMEM);
+       }
+
+       loi_init((*lsmp)->lsm_oinfo[0]);
+       (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+       ostid_set_seq_echo(&(*lsmp)->lsm_oi);
+
+       RETURN(lsm_size);
+}
+
+static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp)
+{
+       int lsm_size;
+
+       ENTRY;
+
+       /* If export is lov/osc then use their obd method */
+       if (ed->ed_next != NULL)
+               return obd_free_memmd(ed->ed_ec->ec_exp, lsmp);
+       /* OFD has no unpackmd method, do everything here */
+       lsm_size = lov_stripe_md_size(1);
+
+       LASSERT(*lsmp != NULL);
+       OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+       OBD_FREE(*lsmp, lsm_size);
+       *lsmp = NULL;
+       RETURN(0);
+}
+
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct echo_object *eco    = cl2echo_obj(lu2cl(obj));
+       struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+       ENTRY;
+
+       LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+       spin_lock(&ec->ec_lock);
+       list_del_init(&eco->eo_obj_chain);
+       spin_unlock(&ec->ec_lock);
+
+       lu_object_fini(obj);
+       lu_object_header_fini(obj->lo_header);
+
+       if (eco->eo_lsm)
+               echo_free_memmd(eco->eo_dev, &eco->eo_lsm);
+       OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+       EXIT;
+}
+
+static int echo_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+       struct echo_object *obj = cl2echo_obj(lu2cl(o));
+
+       return (*p)(env, cookie, "echoclient-object@%p", obj);
+}
+
+static const struct lu_object_operations echo_lu_obj_ops = {
+       .loo_object_init      = echo_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = echo_object_free,
+       .loo_object_print     = echo_object_print,
+       .loo_object_invariant = NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops  lu_device operations
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+                                          const struct lu_object_header *hdr,
+                                          struct lu_device *dev)
+{
+       struct echo_object *eco;
+       struct lu_object *obj = NULL;
+       ENTRY;
+
+       /* we're the top dev. */
+       LASSERT(hdr == NULL);
+       OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, __GFP_IO);
+       if (eco != NULL) {
+               struct cl_object_header *hdr = &eco->eo_hdr;
+
+               obj = &echo_obj2cl(eco)->co_lu;
+               cl_object_header_init(hdr);
+               lu_object_init(obj, &hdr->coh_lu, dev);
+               lu_object_add_top(&hdr->coh_lu, obj);
+
+               eco->eo_cl.co_ops = &echo_cl_obj_ops;
+               obj->lo_ops       = &echo_lu_obj_ops;
+       }
+       RETURN(obj);
+}
+
+static struct lu_device_operations echo_device_lu_ops = {
+       .ldo_object_alloc   = echo_object_alloc,
+};
+
+/** @} echo_lu_dev_ops */
+
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init Setup and teardown
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+       struct cl_site *site = &ed->ed_site_myself;
+       int rc;
+
+       /* initialize site */
+       rc = cl_site_init(site, &ed->ed_cl);
+       if (rc) {
+               CERROR("Cannot initialize site for echo client (%d)\n", rc);
+               return rc;
+       }
+
+       rc = lu_site_init_finish(&site->cs_lu);
+       if (rc)
+               return rc;
+
+       ed->ed_site = site;
+       return 0;
+}
+
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
+{
+       if (ed->ed_site) {
+               if (!ed->ed_next_ismd)
+                       cl_site_fini(ed->ed_site);
+               ed->ed_site = NULL;
+       }
+}
+
+static void *echo_thread_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+       struct echo_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void echo_thread_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct echo_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+static void echo_thread_key_exit(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_thread_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = echo_thread_key_init,
+       .lct_fini = echo_thread_key_fini,
+       .lct_exit = echo_thread_key_exit
+};
+
+static void *echo_session_key_init(const struct lu_context *ctx,
+                                 struct lu_context_key *key)
+{
+       struct echo_session_info *session;
+
+       OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, __GFP_IO);
+       if (session == NULL)
+               session = ERR_PTR(-ENOMEM);
+       return session;
+}
+
+static void echo_session_key_fini(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+       struct echo_session_info *session = data;
+       OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+static void echo_session_key_exit(const struct lu_context *ctx,
+                                struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = echo_session_key_init,
+       .lct_fini = echo_session_key_fini,
+       .lct_exit = echo_session_key_exit
+};
+
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
+
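+/*
+ * FID handling for the MD echo client: set up a client-side sequence
+ * manager ("srv-<obd_name>") on top of the server's sequence site; the
+ * very wide ECHO_SEQ_WIDTH presumably keeps fid allocation from ever
+ * needing a new sequence.
+ */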
+#define ECHO_SEQ_WIDTH 0xffffffff
+static int echo_fid_init(struct echo_device *ed, char *obd_name,
+                        struct seq_server_site *ss)
+{
+       char *prefix;
+       int rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(ed->ed_cl_seq);
+       if (ed->ed_cl_seq == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(prefix, MAX_OBD_NAME + 5);
+       if (prefix == NULL)
+               GOTO(out_free_seq, rc = -ENOMEM);
+
+       snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name);
+
+       /* Init client side sequence-manager */
+       rc = seq_client_init(ed->ed_cl_seq, NULL,
+                            LUSTRE_SEQ_METADATA,
+                            prefix, ss->ss_server_seq);
+       ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH;
+       OBD_FREE(prefix, MAX_OBD_NAME + 5);
+       if (rc)
+               GOTO(out_free_seq, rc);
+
+       RETURN(0);
+
+out_free_seq:
+       OBD_FREE_PTR(ed->ed_cl_seq);
+       ed->ed_cl_seq = NULL;
+       RETURN(rc);
+}
+
+static int echo_fid_fini(struct obd_device *obddev)
+{
+       struct echo_device *ed = obd2echo_dev(obddev);
+       ENTRY;
+
+       if (ed->ed_cl_seq != NULL) {
+               seq_client_fini(ed->ed_cl_seq);
+               OBD_FREE_PTR(ed->ed_cl_seq);
+               ed->ed_cl_seq = NULL;
+       }
+
+       RETURN(0);
+}
+
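+/*
+ * Allocate and wire up an echo device.  Error handling uses a staged
+ * "cleanup" counter: each case in the switch at the out: label falls
+ * through to undo every step that completed before the failure.
+ */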
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+       struct lu_device   *next;
+       struct echo_device *ed;
+       struct cl_device   *cd;
+       struct obd_device  *obd = NULL; /* to keep compiler happy */
+       struct obd_device  *tgt;
+       const char *tgt_type_name;
+       int rc;
+       int cleanup = 0;
+       ENTRY;
+
+       OBD_ALLOC_PTR(ed);
+       if (ed == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       cleanup = 1;
+       cd = &ed->ed_cl;
+       rc = cl_device_init(cd, t);
+       if (rc)
+               GOTO(out, rc);
+
+       cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+       cd->cd_ops = &echo_device_cl_ops;
+
+       cleanup = 2;
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       LASSERT(env != NULL);
+
+       tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+       if (tgt == NULL) {
+               CERROR("Cannot find tgt device %s\n",
+                      lustre_cfg_string(cfg, 1));
+               GOTO(out, rc = -ENODEV);
+       }
+
+       next = tgt->obd_lu_dev;
+       if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+               ed->ed_next_ismd = 1;
+       } else {
+               ed->ed_next_ismd = 0;
+               rc = echo_site_init(env, ed);
+               if (rc)
+                       GOTO(out, rc);
+       }
+       cleanup = 3;
+
+       rc = echo_client_setup(env, obd, cfg);
+       if (rc)
+               GOTO(out, rc);
+
+       ed->ed_ec = &obd->u.echo_client;
+       cleanup = 4;
+
+       if (ed->ed_next_ismd) {
+               /* Supposed to connect to some metadata layer */
+               struct lu_site *ls;
+               struct lu_device *ld;
+               int    found = 0;
+
+               if (next == NULL) {
+                       CERROR("%s is not a lu device type!\n",
+                              lustre_cfg_string(cfg, 1));
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               tgt_type_name = lustre_cfg_string(cfg, 2);
+               if (!tgt_type_name) {
+                       CERROR("%s: no type name for echo %s setup\n",
+                               lustre_cfg_string(cfg, 1),
+                               tgt->obd_type->typ_name);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               ls = next->ld_site;
+
+               spin_lock(&ls->ls_ld_lock);
+               list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) {
+                       if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) {
+                               found = 1;
+                               break;
+                       }
+               }
+               spin_unlock(&ls->ls_ld_lock);
+
+               if (found == 0) {
+                       CERROR("%s: no device of type %s found on site\n",
+                              lustre_cfg_string(cfg, 1), tgt_type_name);
+                       GOTO(out, rc = -EINVAL);
+               }
+
+               next = ld;
+               /* For MD echo client, it will use the site in MDS stack */
+               ed->ed_site_myself.cs_lu = *ls;
+               ed->ed_site = &ed->ed_site_myself;
+               ed->ed_cl.cd_lu_dev.ld_site = &ed->ed_site_myself.cs_lu;
+               rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls));
+               if (rc) {
+                       CERROR("echo fid init error %d\n", rc);
+                       GOTO(out, rc);
+               }
+       } else {
+               /* if the echo client is to be stacked upon an ost device,
+                * next is NULL since ost is not a clio device so far */
+               if (next != NULL && !lu_device_is_cl(next))
+                       next = NULL;
+
+               tgt_type_name = tgt->obd_type->typ_name;
+               if (next != NULL) {
+                       if (next->ld_site != NULL)
+                               GOTO(out, rc = -EBUSY);
+
+                       next->ld_site = &ed->ed_site->cs_lu;
+                       rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+                                                    next->ld_type->ldt_name,
+                                                    NULL);
+                       if (rc)
+                               GOTO(out, rc);
+
+                       /* Tricky case: we have to determine the obd type
+                        * since CLIO uses different parameters to
+                        * initialize objects for lov & osc. */
+                       if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+                               ed->ed_next_islov = 1;
+                       else
+                               LASSERT(strcmp(tgt_type_name,
+                                              LUSTRE_OSC_NAME) == 0);
+               } else {
+                       LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+               }
+       }
+
+       ed->ed_next = next;
+       RETURN(&cd->cd_lu_dev);
+out:
+       switch (cleanup) {
+       case 4: {
+               int rc2;
+               rc2 = echo_client_cleanup(obd);
+               if (rc2)
+                       CERROR("Cleanup obd device %s error(%d)\n",
+                              obd->obd_name, rc2);
+       }
+
+       case 3:
+               echo_site_fini(env, ed);
+       case 2:
+               cl_device_fini(&ed->ed_cl);
+       case 1:
+               OBD_FREE_PTR(ed);
+       case 0:
+       default:
+               break;
+       }
+       return(ERR_PTR(rc));
+}
+
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+                         const char *name, struct lu_device *next)
+{
+       LBUG();
+       return 0;
+}
+
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+       struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+       struct lu_device *next = ed->ed_next;
+
+       while (next && !ed->ed_next_ismd)
+               next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+       return NULL;
+}
+
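+/*
+ * Drop this caller's use of an echo lock; when the lock is no longer
+ * referenced through any cookie (!still_used) it is also cancelled and
+ * deleted outright.
+ */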
+static void echo_lock_release(const struct lu_env *env,
+                             struct echo_lock *ecl,
+                             int still_used)
+{
+       struct cl_lock *clk = echo_lock2cl(ecl);
+
+       cl_lock_get(clk);
+       cl_unuse(env, clk);
+       cl_lock_release(env, clk, "ec enqueue", ecl->el_object);
+       if (!still_used) {
+               cl_lock_mutex_get(env, clk);
+               cl_lock_cancel(env, clk);
+               cl_lock_delete(env, clk);
+               cl_lock_mutex_put(env, clk);
+       }
+       cl_lock_put(env, clk);
+}
+
+static struct lu_device *echo_device_free(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+       struct echo_device     *ed   = cl2echo_dev(lu2cl_dev(d));
+       struct echo_client_obd *ec   = ed->ed_ec;
+       struct echo_object     *eco;
+       struct lu_device       *next = ed->ed_next;
+
+       CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
+              ed, next);
+
+       lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+       /* Check if there are objects still alive.  There shouldn't be
+        * any, because lu_site_purge() cleans up all cached objects; if
+        * there are, the echo device is probably being accessed in
+        * parallel.
+        */
+       spin_lock(&ec->ec_lock);
+       list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
+               eco->eo_deleted = 1;
+       spin_unlock(&ec->ec_lock);
+
+       /* purge again */
+       lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+       CDEBUG(D_INFO,
+              "Waiting for the reference of echo object to be dropped\n");
+
+       /* Wait for the last reference to be dropped. */
+       spin_lock(&ec->ec_lock);
+       while (!list_empty(&ec->ec_objects)) {
+               spin_unlock(&ec->ec_lock);
+               CERROR("echo_client still has objects at cleanup time, wait for 1 second\n");
+               schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+                                                  cfs_time_seconds(1));
+               lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+               spin_lock(&ec->ec_lock);
+       }
+       spin_unlock(&ec->ec_lock);
+
+       LASSERT(list_empty(&ec->ec_locks));
+
+       CDEBUG(D_INFO, "No object exists, exiting...\n");
+
+       echo_client_cleanup(d->ld_obd);
+       echo_fid_fini(d->ld_obd);
+       while (next && !ed->ed_next_ismd)
+               next = next->ld_type->ldt_ops->ldto_device_free(env, next);
+
+       LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+       echo_site_fini(env, ed);
+       cl_device_fini(&ed->ed_cl);
+       OBD_FREE_PTR(ed);
+
+       return NULL;
+}
+
+static const struct lu_device_type_operations echo_device_type_ops = {
+       .ldto_init = echo_type_init,
+       .ldto_fini = echo_type_fini,
+
+       .ldto_start = echo_type_start,
+       .ldto_stop  = echo_type_stop,
+
+       .ldto_device_alloc = echo_device_alloc,
+       .ldto_device_free  = echo_device_free,
+       .ldto_device_init  = echo_device_init,
+       .ldto_device_fini  = echo_device_fini
+};
+
+static struct lu_device_type echo_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_ECHO_CLIENT_NAME,
+       .ldt_ops      = &echo_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD,
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports Exported operations
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                              struct lov_stripe_md **lsmp)
+{
+       struct lu_env *env;
+       struct echo_thread_info *info;
+       struct echo_object_conf *conf;
+       struct lov_stripe_md    *lsm;
+       struct echo_object *eco;
+       struct cl_object   *obj;
+       struct lu_fid *fid;
+       int refcheck;
+       int rc;
+       ENTRY;
+
+       LASSERT(lsmp);
+       lsm = *lsmp;
+       LASSERT(lsm);
+       LASSERTF(ostid_id(&lsm->lsm_oi) != 0, DOSTID"\n", POSTID(&lsm->lsm_oi));
+       LASSERTF(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO, DOSTID"\n",
+                POSTID(&lsm->lsm_oi));
+
+       /* Never return an object if the obd is to be freed. */
+       if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping)
+               RETURN(ERR_PTR(-ENODEV));
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN((void *)env);
+
+       info = echo_env_info(env);
+       conf = &info->eti_conf;
+       if (d->ed_next) {
+               if (!d->ed_next_islov) {
+                       struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+                       LASSERT(oinfo != NULL);
+                       oinfo->loi_oi = lsm->lsm_oi;
+                       conf->eoc_cl.u.coc_oinfo = oinfo;
+               } else {
+                       struct lustre_md *md;
+                       md = &info->eti_md;
+                       memset(md, 0, sizeof(*md));
+                       md->lsm = lsm;
+                       conf->eoc_cl.u.coc_md = md;
+               }
+       }
+       conf->eoc_md = lsmp;
+
+       fid  = &info->eti_fid;
+       rc = ostid_to_fid(fid, &lsm->lsm_oi, 0);
+       if (rc != 0)
+               GOTO(out, eco = ERR_PTR(rc));
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+       if (IS_ERR(obj))
+               GOTO(out, eco = (void*)obj);
+
+       eco = cl2echo_obj(obj);
+       if (eco->eo_deleted) {
+               cl_object_put(env, obj);
+               eco = ERR_PTR(-EAGAIN);
+       }
+
+out:
+       cl_env_put(env, &refcheck);
+       RETURN(eco);
+}
+
+static int cl_echo_object_put(struct echo_object *eco)
+{
+       struct lu_env *env;
+       struct cl_object *obj = echo_obj2cl(eco);
+       int refcheck;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       /* an external function to kill an object? */
+       if (eco->eo_deleted) {
+               struct lu_object_header *loh = obj->co_lu.lo_header;
+               LASSERT(&eco->eo_hdr == luh2coh(loh));
+               set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags);
+       }
+
+       cl_object_put(env, obj);
+       cl_env_put(env, &refcheck);
+       RETURN(0);
+}
+
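+/*
+ * Enqueue a cl_lock over [start, end]: request and wait for the lock,
+ * link it onto ec_locks and hand back an ec_unique cookie by which
+ * cl_echo_cancel0() can find and release it later.
+ */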
+static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco,
+                           obd_off start, obd_off end, int mode,
+                           __u64 *cookie, __u32 enqflags)
+{
+       struct cl_io *io;
+       struct cl_lock *lck;
+       struct cl_object *obj;
+       struct cl_lock_descr *descr;
+       struct echo_thread_info *info;
+       int rc = -ENOMEM;
+       ENTRY;
+
+       info = echo_env_info(env);
+       io = &info->eti_io;
+       descr = &info->eti_descr;
+       obj = echo_obj2cl(eco);
+
+       descr->cld_obj   = obj;
+       descr->cld_start = cl_index(obj, start);
+       descr->cld_end   = cl_index(obj, end);
+       descr->cld_mode  = mode == LCK_PW ? CLM_WRITE : CLM_READ;
+       descr->cld_enq_flags = enqflags;
+       io->ci_obj = obj;
+
+       lck = cl_lock_request(env, io, descr, "ec enqueue", eco);
+       if (lck) {
+               struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+               struct echo_lock *el;
+
+               rc = cl_wait(env, lck);
+               if (rc == 0) {
+                       el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+                       spin_lock(&ec->ec_lock);
+                       if (list_empty(&el->el_chain)) {
+                               list_add(&el->el_chain, &ec->ec_locks);
+                               el->el_cookie = ++ec->ec_unique;
+                       }
+                       atomic_inc(&el->el_refcount);
+                       *cookie = el->el_cookie;
+                       spin_unlock(&ec->ec_lock);
+               } else {
+                       cl_lock_release(env, lck, "ec enqueue", current);
+               }
+       }
+       RETURN(rc);
+}
+
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end,
+                          int mode, __u64 *cookie)
+{
+       struct echo_thread_info *info;
+       struct lu_env *env;
+       struct cl_io *io;
+       int refcheck;
+       int result;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       info = echo_env_info(env);
+       io = &info->eti_io;
+
+       io->ci_ignore_layout = 1;
+       result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco));
+       if (result < 0)
+               GOTO(out, result);
+       LASSERT(result == 0);
+
+       result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0);
+       cl_io_fini(env, io);
+
+       EXIT;
+out:
+       cl_env_put(env, &refcheck);
+       return result;
+}
+
+static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed,
+                          __u64 cookie)
+{
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct echo_lock       *ecl = NULL;
+       struct list_head             *el;
+       int found = 0, still_used = 0;
+       ENTRY;
+
+       LASSERT(ec != NULL);
+       spin_lock(&ec->ec_lock);
+       list_for_each(el, &ec->ec_locks) {
+               ecl = list_entry(el, struct echo_lock, el_chain);
+               CDEBUG(D_INFO, "ecl: %p, cookie: "LPX64"\n",
+                      ecl, ecl->el_cookie);
+               found = (ecl->el_cookie == cookie);
+               if (found) {
+                       if (atomic_dec_and_test(&ecl->el_refcount))
+                               list_del_init(&ecl->el_chain);
+                       else
+                               still_used = 1;
+                       break;
+               }
+       }
+       spin_unlock(&ec->ec_lock);
+
+       if (!found)
+               RETURN(-ENOENT);
+
+       echo_lock_release(env, ecl, still_used);
+       RETURN(0);
+}
+
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+       struct lu_env *env;
+       int refcheck;
+       int rc;
+       ENTRY;
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = cl_echo_cancel0(env, ed, cookie);
+
+       cl_env_put(env, &refcheck);
+       RETURN(rc);
+}
+
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+                            enum cl_req_type unused, struct cl_2queue *queue)
+{
+       struct cl_page *clp;
+       struct cl_page *temp;
+       int result = 0;
+       ENTRY;
+
+       cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) {
+               int rc;
+               rc = cl_page_cache_add(env, io, clp, CRT_WRITE);
+               if (rc == 0)
+                       continue;
+               result = result ?: rc;
+       }
+       RETURN(result);
+}
+
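+/*
+ * Bulk I/O path of the echo client: wrap the caller's pages in
+ * transient cl_pages, take a PR/PW extent lock covering all of them,
+ * and submit the queue synchronously, or via cl_page_cache_add() for
+ * asynchronous writes.
+ */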
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                             struct page **pages, int npages, int async)
+{
+       struct lu_env      *env;
+       struct echo_thread_info *info;
+       struct cl_object        *obj = echo_obj2cl(eco);
+       struct echo_device      *ed  = eco->eo_dev;
+       struct cl_2queue        *queue;
+       struct cl_io        *io;
+       struct cl_page    *clp;
+       struct lustre_handle    lh = { 0 };
+       int page_size = cl_page_size(obj);
+       int refcheck;
+       int rc;
+       int i;
+       ENTRY;
+
+       LASSERT((offset & ~CFS_PAGE_MASK) == 0);
+       LASSERT(ed->ed_next != NULL);
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       info    = echo_env_info(env);
+       io      = &info->eti_io;
+       queue   = &info->eti_queue;
+
+       cl_2queue_init(queue);
+
+       io->ci_ignore_layout = 1;
+       rc = cl_io_init(env, io, CIT_MISC, obj);
+       if (rc < 0)
+               GOTO(out, rc);
+       LASSERT(rc == 0);
+
+       rc = cl_echo_enqueue0(env, eco, offset,
+                             offset + npages * PAGE_CACHE_SIZE - 1,
+                             rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
+                             CEF_NEVER);
+       if (rc < 0)
+               GOTO(error_lock, rc);
+
+       for (i = 0; i < npages; i++) {
+               LASSERT(pages[i]);
+               clp = cl_page_find(env, obj, cl_index(obj, offset),
+                                  pages[i], CPT_TRANSIENT);
+               if (IS_ERR(clp)) {
+                       rc = PTR_ERR(clp);
+                       break;
+               }
+               LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+               rc = cl_page_own(env, io, clp);
+               if (rc) {
+                       LASSERT(clp->cp_state == CPS_FREEING);
+                       cl_page_put(env, clp);
+                       break;
+               }
+
+               cl_2queue_add(queue, clp);
+
+               /* drop the reference count for cl_page_find, so that the page
+                * will be freed in cl_2queue_fini. */
+               cl_page_put(env, clp);
+               cl_page_clip(env, clp, 0, page_size);
+
+               offset += page_size;
+       }
+
+       if (rc == 0) {
+               enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+               async = async && (typ == CRT_WRITE);
+               if (async)
+                       rc = cl_echo_async_brw(env, io, typ, queue);
+               else
+                       rc = cl_io_submit_sync(env, io, typ, queue, 0);
+               CDEBUG(D_INFO, "echo_client %s write returns %d\n",
+                      async ? "async" : "sync", rc);
+       }
+
+       cl_echo_cancel0(env, ed, lh.cookie);
+       EXIT;
+error_lock:
+       cl_2queue_discard(env, io, queue);
+       cl_2queue_disown(env, io, queue);
+       cl_2queue_fini(env, queue);
+       cl_io_fini(env, io);
+out:
+       cl_env_put(env, &refcheck);
+       return rc;
+}
+/** @} echo_exports */
+
+
+static obd_id last_object_id;
+
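+/*
+ * Helpers to copy a lov_stripe_md between kernel and user space; the
+ * lsm header and each per-stripe lov_oinfo are copied separately since
+ * lsm_oinfo is an array of pointers.
+ */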
+static int
+echo_copyout_lsm(struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
+{
+       struct lov_stripe_md *ulsm = _ulsm;
+       int nob, i;
+
+       nob = offsetof(struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]);
+       if (nob > ulsm_nob)
+               return -EINVAL;
+
+       /* note sizeof(*ulsm), not sizeof(ulsm): copy the whole header
+        * rather than just a pointer's worth of it */
+       if (copy_to_user(ulsm, lsm, sizeof(*ulsm)))
+               return -EFAULT;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (copy_to_user(ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i],
+                                sizeof(lsm->lsm_oinfo[0])))
+                       return -EFAULT;
+       }
+       return 0;
+}
+
+static int
+echo_copyin_lsm(struct echo_device *ed, struct lov_stripe_md *lsm,
+               void *ulsm, int ulsm_nob)
+{
+       struct echo_client_obd *ec = ed->ed_ec;
+       int                     i;
+
+       if (ulsm_nob < sizeof(*lsm))
+               return -EINVAL;
+
+       if (copy_from_user(lsm, ulsm, sizeof(*lsm)))
+               return -EFAULT;
+
+       if (lsm->lsm_stripe_count > ec->ec_nstripes ||
+           lsm->lsm_magic != LOV_MAGIC ||
+           (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
+           ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
+               return -EINVAL;
+
+       for (i = 0; i < lsm->lsm_stripe_count; i++) {
+               if (copy_from_user(lsm->lsm_oinfo[i],
+                                  ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i],
+                                  sizeof(lsm->lsm_oinfo[0])))
+                       return -EFAULT;
+       }
+       return 0;
+}
+
+static inline void echo_md_build_name(struct lu_name *lname, char *name,
+                                     __u64 id)
+{
+       sprintf(name, LPU64, id);
+       lname->ln_name = name;
+       lname->ln_namelen = strlen(name);
+}
+
+/* similar to mdt_attr_get_complex */
+static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o,
+                           struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       int                      rc;
+
+       ENTRY;
+
+       LASSERT(ma->ma_lmm_size > 0);
+
+       rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* big_lmm may need to be grown */
+       if (info->eti_big_lmmsize < rc) {
+               int size = size_roundup_power2(rc);
+
+               if (info->eti_big_lmmsize > 0) {
+                       /* free old buffer */
+                       LASSERT(info->eti_big_lmm);
+                       OBD_FREE_LARGE(info->eti_big_lmm,
+                                      info->eti_big_lmmsize);
+                       info->eti_big_lmm = NULL;
+                       info->eti_big_lmmsize = 0;
+               }
+
+               OBD_ALLOC_LARGE(info->eti_big_lmm, size);
+               if (info->eti_big_lmm == NULL)
+                       RETURN(-ENOMEM);
+               info->eti_big_lmmsize = size;
+       }
+       LASSERT(info->eti_big_lmmsize >= rc);
+
+       info->eti_buf.lb_buf = info->eti_big_lmm;
+       info->eti_buf.lb_len = info->eti_big_lmmsize;
+       rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV);
+       if (rc < 0)
+               RETURN(rc);
+
+       ma->ma_valid |= MA_LOV;
+       ma->ma_lmm = info->eti_big_lmm;
+       ma->ma_lmm_size = rc;
+
+       RETURN(0);
+}
+
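+/*
+ * Collect the attributes requested in ma_need: plain inode attributes,
+ * the LOV EA (retried via echo_big_lmm_get() when the preallocated
+ * buffer is too small), and the default ACL of directories when
+ * CONFIG_FS_POSIX_ACL is enabled.
+ */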
+int echo_attr_get_complex(const struct lu_env *env, struct md_object *next,
+                         struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_buf           *buf = &info->eti_buf;
+       umode_t          mode = lu_object_attr(&next->mo_lu);
+       int                      need = ma->ma_need;
+       int                      rc = 0, rc2;
+
+       ENTRY;
+
+       ma->ma_valid = 0;
+
+       if (need & MA_INODE) {
+               ma->ma_need = MA_INODE;
+               rc = mo_attr_get(env, next, ma);
+               if (rc)
+                       GOTO(out, rc);
+               ma->ma_valid |= MA_INODE;
+       }
+
+       if (need & MA_LOV) {
+               if (S_ISREG(mode) || S_ISDIR(mode)) {
+                       LASSERT(ma->ma_lmm_size > 0);
+                       buf->lb_buf = ma->ma_lmm;
+                       buf->lb_len = ma->ma_lmm_size;
+                       rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV);
+                       if (rc2 > 0) {
+                               ma->ma_lmm_size = rc2;
+                               ma->ma_valid |= MA_LOV;
+                       } else if (rc2 == -ENODATA) {
+                               /* no LOV EA */
+                               ma->ma_lmm_size = 0;
+                       } else if (rc2 == -ERANGE) {
+                               rc2 = echo_big_lmm_get(env, next, ma);
+                               if (rc2 < 0)
+                                       GOTO(out, rc = rc2);
+                       } else {
+                               GOTO(out, rc = rc2);
+                       }
+               }
+       }
+
+#ifdef CONFIG_FS_POSIX_ACL
+       if (need & MA_ACL_DEF && S_ISDIR(mode)) {
+               buf->lb_buf = ma->ma_acl;
+               buf->lb_len = ma->ma_acl_size;
+               rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT);
+               if (rc2 > 0) {
+                       ma->ma_acl_size = rc2;
+                       ma->ma_valid |= MA_ACL_DEF;
+               } else if (rc2 == -ENODATA) {
+                       /* no ACLs */
+                       ma->ma_acl_size = 0;
+               } else {
+                       GOTO(out, rc = rc2);
+               }
+       }
+#endif
+out:
+       ma->ma_need = need;
+       CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = "LPX64" ma_lmm=%p\n",
+              rc, ma->ma_valid, ma->ma_lmm);
+       RETURN(rc);
+}
+
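+/*
+ * Create a child under `parent': fail with -EEXIST if the name is
+ * already there, allocate the child object in the echo stack, then call
+ * mdo_create() with lookup sanity checking disabled.
+ */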
+static int
+echo_md_create_internal(const struct lu_env *env, struct echo_device *ed,
+                       struct md_object *parent, struct lu_fid *fid,
+                       struct lu_name *lname, struct md_op_spec *spec,
+                       struct md_attr *ma)
+{
+       struct lu_object        *ec_child, *child;
+       struct lu_device        *ld = ed->ed_next;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid           *fid2 = &info->eti_fid2;
+       struct lu_object_conf    conf = { .loc_flags = LOC_F_NEW };
+       int                      rc;
+
+       ENTRY;
+
+       rc = mdo_lookup(env, parent, lname, fid2, spec);
+       if (rc == 0)
+               return -EEXIST;
+       else if (rc != -ENOENT)
+               return rc;
+
+       ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev,
+                                    fid, &conf);
+       if (IS_ERR(ec_child)) {
+               CERROR("Cannot find the child "DFID": rc = %ld\n", PFID(fid),
+                       PTR_ERR(ec_child));
+               RETURN(PTR_ERR(ec_child));
+       }
+
+       child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+       if (child == NULL) {
+               CERROR("Cannot locate the child "DFID"\n", PFID(fid));
+               GOTO(out_put, rc = -EINVAL);
+       }
+
+       CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+       /*
+        * Do not perform lookup sanity check. We know that name does not exist.
+        */
+       spec->sp_cr_lookup = 0;
+       rc = mdo_create(env, parent, lname, lu2md(child), spec, ma);
+       if (rc) {
+               CERROR("Cannot create child "DFID": rc = %d\n", PFID(fid), rc);
+               GOTO(out_put, rc);
+       }
+       CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc  = %d\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc);
+       EXIT;
+out_put:
+       lu_object_put(env, ec_child);
+       return rc;
+}
+
+static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld,
+                            struct md_attr *ma)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+
+       if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+               ma->ma_lmm = (void *)&info->eti_lmm;
+               ma->ma_lmm_size = sizeof(info->eti_lmm);
+       } else {
+               LASSERT(info->eti_big_lmmsize);
+               ma->ma_lmm = info->eti_big_lmm;
+               ma->ma_lmm_size = info->eti_big_lmmsize;
+       }
+
+       return 0;
+}
+
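+/*
+ * Create one object by name, or `count' objects named after consecutive
+ * ids, under ec_parent.  A non-zero stripe_count enables striping; any
+ * value other than -1 is passed down as a LOV_USER_MAGIC_V3 EA.
+ */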
+static int echo_create_md_object(const struct lu_env *env,
+                                struct echo_device *ed,
+                                struct lu_object *ec_parent,
+                                struct lu_fid *fid,
+                                char *name, int namelen,
+                                __u64 id, __u32 mode, int count,
+                                int stripe_count, int stripe_offset)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name    *lname = &info->eti_lname;
+       struct md_op_spec       *spec = &info->eti_spec;
+       struct md_attr    *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       int                   rc = 0;
+       int                   i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       memset(ma, 0, sizeof(*ma));
+       memset(spec, 0, sizeof(*spec));
+       if (stripe_count != 0) {
+               spec->sp_cr_flags |= FMODE_WRITE;
+               echo_set_lmm_size(env, ld, ma);
+               if (stripe_count != -1) {
+                       struct lov_user_md_v3 *lum = &info->eti_lum;
+
+                       lum->lmm_magic = LOV_USER_MAGIC_V3;
+                       lum->lmm_stripe_count = stripe_count;
+                       lum->lmm_stripe_offset = stripe_offset;
+                       lum->lmm_pattern = 0;
+                       spec->u.sp_ea.eadata = lum;
+                       spec->u.sp_ea.eadatalen = sizeof(*lum);
+                       spec->sp_cr_flags |= MDS_OPEN_HAS_EA;
+               }
+       }
+
+       ma->ma_attr.la_mode = mode;
+       ma->ma_attr.la_valid = LA_CTIME | LA_MODE;
+       ma->ma_attr.la_ctime = cfs_time_current_64();
+
+       if (name != NULL) {
+               lname->ln_name = name;
+               lname->ln_namelen = namelen;
+               /* If name is specified, only create one object by name */
+               rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+                                            spec, ma);
+               RETURN(rc);
+       }
+
+       /* Create multiple object sequenced by id */
+       for (i = 0; i < count; i++) {
+               char *tmp_name = info->eti_name;
+
+               echo_md_build_name(lname, tmp_name, id);
+
+               rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname,
+                                            spec, ma);
+               if (rc) {
+                       CERROR("Can not create child %s: rc = %d\n", tmp_name,
+                               rc);
+                       break;
+               }
+               id++;
+               fid->f_oid++;
+       }
+
+       RETURN(rc);
+}
+
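+/* Look @lname up under @parent and return the corresponding lu_object. */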
+static struct lu_object *echo_md_lookup(const struct lu_env *env,
+                                       struct echo_device *ed,
+                                       struct md_object *parent,
+                                       struct lu_name *lname)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid           *fid = &info->eti_fid;
+       struct lu_object        *child;
+       int                      rc;
+       ENTRY;
+
+       CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name,
+              PFID(fid), parent);
+       rc = mdo_lookup(env, parent, lname, fid, NULL);
+       if (rc) {
+               CERROR("lookup %s: rc = %d\n", lname->ln_name, rc);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+
+       RETURN(child);
+}
+
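+/*
+ * For each of @count names generated from @id, look the child up and set
+ * a test xattr in the user namespace on it, exercising the setattr path.
+ */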
+static int echo_setattr_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name          *lname = &info->eti_lname;
+       char                    *name = info->eti_name;
+       struct lu_device        *ld = ed->ed_next;
+       struct lu_buf           *buf = &info->eti_buf;
+       int                      rc = 0;
+       int                      i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               RETURN(-1);
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       for (i = 0; i < count; i++) {
+               struct lu_object *ec_child, *child;
+
+               echo_md_build_name(lname, name, id);
+
+               ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+               if (IS_ERR(ec_child)) {
+                       CERROR("Can't find child %s: rc = %ld\n",
+                               lname->ln_name, PTR_ERR(ec_child));
+                       RETURN(PTR_ERR(ec_child));
+               }
+
+               child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+               if (child == NULL) {
+                       CERROR("Can not locate the child %s\n", lname->ln_name);
+                       lu_object_put(env, ec_child);
+                       rc = -EINVAL;
+                       break;
+               }
+
+               CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+
+               buf->lb_buf = info->eti_xattr_buf;
+               buf->lb_len = sizeof(info->eti_xattr_buf);
+
+               sprintf(name, "%s.test1", XATTR_USER_PREFIX);
+               rc = mo_xattr_set(env, lu2md(child), buf, name,
+                                 LU_XATTR_CREATE);
+               if (rc < 0) {
+                       CERROR("Can not setattr child "DFID": rc = %d\n",
+                               PFID(lu_object_fid(child)), rc);
+                       lu_object_put(env, ec_child);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               id++;
+               lu_object_put(env, ec_child);
+       }
+       RETURN(rc);
+}
+
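+/*
+ * For each of @count names generated from @id, look the child up and
+ * fetch its inode, layout, parent FID, HSM and default ACL attributes.
+ */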
+static int echo_getattr_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name          *lname = &info->eti_lname;
+       char                    *name = info->eti_name;
+       struct md_attr          *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       int                      rc = 0;
+       int                      i;
+
+       ENTRY;
+
+       if (ec_parent == NULL)
+               RETURN(-1);
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-ENXIO);
+
+       memset(ma, 0, sizeof(*ma));
+       ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF;
+       ma->ma_acl = info->eti_xattr_buf;
+       ma->ma_acl_size = sizeof(info->eti_xattr_buf);
+
+       for (i = 0; i < count; i++) {
+               struct lu_object *ec_child, *child;
+
+               ma->ma_valid = 0;
+               echo_md_build_name(lname, name, id);
+               echo_set_lmm_size(env, ld, ma);
+
+               ec_child = echo_md_lookup(env, ed, lu2md(parent), lname);
+               if (IS_ERR(ec_child)) {
+                       CERROR("Can't find child %s: rc = %ld\n",
+                              lname->ln_name, PTR_ERR(ec_child));
+                       RETURN(PTR_ERR(ec_child));
+               }
+
+               child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+               if (child == NULL) {
+                       CERROR("Can not locate the child %s\n", lname->ln_name);
+                       lu_object_put(env, ec_child);
+                       RETURN(-EINVAL);
+               }
+
+               CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               rc = echo_attr_get_complex(env, lu2md(child), ma);
+               if (rc) {
+                       CERROR("Can not getattr child "DFID": rc = %d\n",
+                               PFID(lu_object_fid(child)), rc);
+                       lu_object_put(env, ec_child);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n",
+                      PFID(lu_object_fid(child)));
+               id++;
+               lu_object_put(env, ec_child);
+       }
+
+       RETURN(rc);
+}
+
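+/* Run mdo_lookup() on @count names generated from @id under @ec_parent. */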
+static int echo_lookup_object(const struct lu_env *env,
+                             struct echo_device *ed,
+                             struct lu_object *ec_parent,
+                             __u64 id, int count)
+{
+       struct lu_object        *parent;
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name          *lname = &info->eti_lname;
+       char                    *name = info->eti_name;
+       struct lu_fid           *fid = &info->eti_fid;
+       struct lu_device        *ld = ed->ed_next;
+       int                      rc = 0;
+       int                      i;
+
+       if (ec_parent == NULL)
+               return -1;
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               return -ENXIO;
+
+       /* Prepare the requests */
+       for (i = 0; i < count; i++) {
+               echo_md_build_name(lname, name, id);
+
+               CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n",
+                      PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+               rc = mdo_lookup(env, lu2md(parent), lname, fid, NULL);
+               if (rc) {
+                       CERROR("Can not lookup child %s: rc = %d\n", name, rc);
+                       break;
+               }
+               CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n",
+                      PFID(lu_object_fid(parent)), lname->ln_name, parent);
+
+               id++;
+       }
+       return rc;
+}
+
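+/* Look one child up by @lname and unlink it from @parent. */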
+static int echo_md_destroy_internal(const struct lu_env *env,
+                                   struct echo_device *ed,
+                                   struct md_object *parent,
+                                   struct lu_name *lname,
+                                   struct md_attr *ma)
+{
+       struct lu_device *ld = ed->ed_next;
+       struct lu_object *ec_child;
+       struct lu_object *child;
+       int               rc;
+
+       ENTRY;
+
+       ec_child = echo_md_lookup(env, ed, parent, lname);
+       if (IS_ERR(ec_child)) {
+               CERROR("Can't find child %s: rc = %ld\n", lname->ln_name,
+                       PTR_ERR(ec_child));
+               RETURN(PTR_ERR(ec_child));
+       }
+
+       child = lu_object_locate(ec_child->lo_header, ld->ld_type);
+       if (child == NULL) {
+               CERROR("Can not locate the child %s\n", lname->ln_name);
+               GOTO(out_put, rc = -EINVAL);
+       }
+
+       CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+
+       rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0);
+       if (rc) {
+               CERROR("Can not unlink child %s: rc = %d\n",
+                       lname->ln_name, rc);
+               GOTO(out_put, rc);
+       }
+       CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n",
+              PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent);
+out_put:
+       lu_object_put(env, ec_child);
+       return rc;
+}
+
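+/*
+ * Unlink one child named @name, or @count children with names generated
+ * from @id, under @ec_parent.
+ */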
+static int echo_destroy_object(const struct lu_env *env,
+                              struct echo_device *ed,
+                              struct lu_object *ec_parent,
+                              char *name, int namelen,
+                              __u64 id, __u32 mode,
+                              int count)
+{
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_name          *lname = &info->eti_lname;
+       struct md_attr          *ma = &info->eti_ma;
+       struct lu_device        *ld = ed->ed_next;
+       struct lu_object        *parent;
+       int                      rc = 0;
+       int                      i;
+       ENTRY;
+
+       parent = lu_object_locate(ec_parent->lo_header, ld->ld_type);
+       if (parent == NULL)
+               RETURN(-EINVAL);
+
+       memset(ma, 0, sizeof(*ma));
+       ma->ma_attr.la_mode = mode;
+       ma->ma_attr.la_valid = LA_CTIME;
+       ma->ma_attr.la_ctime = cfs_time_current_64();
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+
+       if (name != NULL) {
+               lname->ln_name = name;
+               lname->ln_namelen = namelen;
+               rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+                                             ma);
+               RETURN(rc);
+       }
+
+       /* Prepare the requests */
+       for (i = 0; i < count; i++) {
+               char *tmp_name = info->eti_name;
+
+               ma->ma_valid = 0;
+               echo_md_build_name(lname, tmp_name, id);
+
+               rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname,
+                                             ma);
+               if (rc) {
+                       CERROR("Can not unlink child %s: rc = %d\n", name, rc);
+                       break;
+               }
+               id++;
+       }
+
+       RETURN(rc);
+}
+
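+/*
+ * Walk the '/'-separated @path component by component, starting at the
+ * root of the underlying MDD device, and return the final object found.
+ */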
+static struct lu_object *echo_resolve_path(const struct lu_env *env,
+                                          struct echo_device *ed, char *path,
+                                          int path_len)
+{
+       struct lu_device        *ld = ed->ed_next;
+       struct md_device        *md = lu2md_dev(ld);
+       struct echo_thread_info *info = echo_env_info(env);
+       struct lu_fid           *fid = &info->eti_fid;
+       struct lu_name          *lname = &info->eti_lname;
+       struct lu_object        *parent = NULL;
+       struct lu_object        *child = NULL;
+       int rc = 0;
+       ENTRY;
+
+       /* Only the MDD layer is supported right now */
+       rc = md->md_ops->mdo_root_get(env, md, fid);
+       if (rc) {
+               CERROR("get root error: rc = %d\n", rc);
+               RETURN(ERR_PTR(rc));
+       }
+
+       /* In the function below, .hs_keycmp resolves to
+        * lu_obj_hop_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL);
+       if (IS_ERR(parent)) {
+               CERROR("Can not find the parent "DFID": rc = %ld\n",
+                       PFID(fid), PTR_ERR(parent));
+               RETURN(parent);
+       }
+
+       while (1) {
+               struct lu_object *ld_parent;
+               char *e;
+
+               e = strsep(&path, "/");
+               if (e == NULL)
+                       break;
+
+               if (e[0] == '\0') {
+                       if (!path || path[0] == '\0')
+                               break;
+                       continue;
+               }
+
+               lname->ln_name = e;
+               lname->ln_namelen = strlen(e);
+
+               ld_parent = lu_object_locate(parent->lo_header, ld->ld_type);
+               if (ld_parent == NULL) {
+                       lu_object_put(env, parent);
+                       rc = -EINVAL;
+                       break;
+               }
+
+               child = echo_md_lookup(env, ed, lu2md(ld_parent), lname);
+               if (IS_ERR(child)) {
+                       rc = (int)PTR_ERR(child);
+                       /* report before dropping parent; ld_parent points
+                        * into it */
+                       CERROR("lookup %s under parent "DFID": rc = %d\n",
+                              lname->ln_name, PFID(lu_object_fid(ld_parent)),
+                              rc);
+                       lu_object_put(env, parent);
+                       break;
+               }
+               lu_object_put(env, parent);
+               parent = child;
+       }
+       if (rc)
+               RETURN(ERR_PTR(rc));
+
+       RETURN(parent);
+}
+
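+/*
+ * Fill the env credentials from the current task so MD operations run
+ * with the caller's identity; fs capabilities are dropped for non-root
+ * users.
+ */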
+static void echo_ucred_init(struct lu_env *env)
+{
+       struct lu_ucred *ucred = lu_ucred(env);
+
+       ucred->uc_valid = UCRED_INVALID;
+
+       ucred->uc_suppgids[0] = -1;
+       ucred->uc_suppgids[1] = -1;
+
+       ucred->uc_uid   = ucred->uc_o_uid   = current_uid();
+       ucred->uc_gid   = ucred->uc_o_gid   = current_gid();
+       ucred->uc_fsuid = ucred->uc_o_fsuid = current_fsuid();
+       ucred->uc_fsgid = ucred->uc_o_fsgid = current_fsgid();
+       ucred->uc_cap   = cfs_curproc_cap_pack();
+
+       /* remove fs privilege for non-root user. */
+       if (ucred->uc_fsuid)
+               ucred->uc_cap &= ~CFS_CAP_FS_MASK;
+       ucred->uc_valid = UCRED_NEW;
+}
+
+static void echo_ucred_fini(struct lu_env *env)
+{
+       struct lu_ucred *ucred = lu_ucred(env);
+       ucred->uc_valid = UCRED_INIT;
+}
+
+#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD)
+#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION)
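+
+/*
+ * Entry point for OBD_IOC_ECHO_MD: resolve @path to the parent object,
+ * then dispatch the create/destroy/lookup/getattr/setattr test command.
+ */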
+static int echo_md_handler(struct echo_device *ed, int command,
+                          char *path, int path_len, __u64 id, int count,
+                          struct obd_ioctl_data *data)
+{
+       struct echo_thread_info *info;
+       struct lu_device        *ld = ed->ed_next;
+       struct lu_env           *env;
+       int                      refcheck;
+       struct lu_object        *parent;
+       char                    *name = NULL;
+       int                      namelen = data->ioc_plen2;
+       int                      rc = 0;
+       ENTRY;
+
+       if (ld == NULL) {
+               CERROR("MD echo client is not being initialized properly\n");
+               RETURN(-EINVAL);
+       }
+
+       if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) {
+               CERROR("Only support MDD layer right now!\n");
+               RETURN(-EINVAL);
+       }
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG);
+       if (rc != 0)
+               GOTO(out_env, rc);
+
+       /* init big_lmm buffer */
+       info = echo_env_info(env);
+       LASSERT(info->eti_big_lmm == NULL);
+       OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE);
+       if (info->eti_big_lmm == NULL)
+               GOTO(out_env, rc = -ENOMEM);
+       info->eti_big_lmmsize = MIN_MD_SIZE;
+
+       parent = echo_resolve_path(env, ed, path, path_len);
+       if (IS_ERR(parent)) {
+               CERROR("Can not resolve the path %s: rc = %ld\n", path,
+                       PTR_ERR(parent));
+               GOTO(out_free, rc = PTR_ERR(parent));
+       }
+
+       if (namelen > 0) {
+               OBD_ALLOC(name, namelen + 1);
+               if (name == NULL)
+                       GOTO(out_put, rc = -ENOMEM);
+               if (copy_from_user(name, data->ioc_pbuf2, namelen))
+                       GOTO(out_name, rc = -EFAULT);
+       }
+
+       echo_ucred_init(env);
+
+       switch (command) {
+       case ECHO_MD_CREATE:
+       case ECHO_MD_MKDIR: {
+               struct echo_thread_info *info = echo_env_info(env);
+               __u32 mode = data->ioc_obdo2.o_mode;
+               struct lu_fid *fid = &info->eti_fid;
+               int stripe_count = (int)data->ioc_obdo2.o_misc;
+               int stripe_index = (int)data->ioc_obdo2.o_stripe_idx;
+
+               rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0);
+               if (rc != 0)
+                       break;
+
+               /* In the function below, .hs_keycmp resolves to
+                * lu_obj_hop_keycmp() */
+               /* coverity[overrun-buffer-val] */
+               rc = echo_create_md_object(env, ed, parent, fid, name, namelen,
+                                          id, mode, count, stripe_count,
+                                          stripe_index);
+               break;
+       }
+       case ECHO_MD_DESTROY:
+       case ECHO_MD_RMDIR: {
+               __u32 mode = data->ioc_obdo2.o_mode;
+
+               rc = echo_destroy_object(env, ed, parent, name, namelen,
+                                        id, mode, count);
+               break;
+       }
+       case ECHO_MD_LOOKUP:
+               rc = echo_lookup_object(env, ed, parent, id, count);
+               break;
+       case ECHO_MD_GETATTR:
+               rc = echo_getattr_object(env, ed, parent, id, count);
+               break;
+       case ECHO_MD_SETATTR:
+               rc = echo_setattr_object(env, ed, parent, id, count);
+               break;
+       default:
+               CERROR("unknown command %d\n", command);
+               rc = -EINVAL;
+               break;
+       }
+       echo_ucred_fini(env);
+
+out_name:
+       if (name != NULL)
+               OBD_FREE(name, namelen + 1);
+out_put:
+       lu_object_put(env, parent);
+out_free:
+       LASSERT(info->eti_big_lmm);
+       OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize);
+       info->eti_big_lmm = NULL;
+       info->eti_big_lmmsize = 0;
+out_env:
+       cl_env_put(env, &refcheck);
+       return rc;
+}
+
+static int echo_create_object(const struct lu_env *env, struct echo_device *ed,
+                             int on_target, struct obdo *oa, void *ulsm,
+                             int ulsm_nob, struct obd_trans_info *oti)
+{
+       struct echo_object     *eco;
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct lov_stripe_md   *lsm = NULL;
+       int                     rc;
+       int                     created = 0;
+       ENTRY;
+
+       if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
+           (on_target ||                      /* set_stripe */
+            ec->ec_nstripes != 0)) {      /* LOV */
+               CERROR ("No valid oid\n");
+               RETURN(-EINVAL);
+       }
+
+       rc = echo_alloc_memmd(ed, &lsm);
+       if (rc < 0) {
+               CERROR("Cannot allocate md: rc = %d\n", rc);
+               GOTO(failed, rc);
+       }
+
+       if (ulsm != NULL) {
+               int i, idx;
+
+               rc = echo_copyin_lsm(ed, lsm, ulsm, ulsm_nob);
+               if (rc != 0)
+                       GOTO(failed, rc);
+
+               if (lsm->lsm_stripe_count == 0)
+                       lsm->lsm_stripe_count = ec->ec_nstripes;
+
+               if (lsm->lsm_stripe_size == 0)
+                       lsm->lsm_stripe_size = PAGE_CACHE_SIZE;
+
+               idx = cfs_rand();
+
+               /* setup stripes: indices + default ids if required */
+               for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                       if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0)
+                               lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi;
+
+                       lsm->lsm_oinfo[i]->loi_ost_idx =
+                               (idx + i) % ec->ec_nstripes;
+               }
+       }
+
+       /* setup object ID here for !on_target and LOV hint */
+       if (oa->o_valid & OBD_MD_FLID) {
+               LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+               lsm->lsm_oi = oa->o_oi;
+       }
+
+       if (ostid_id(&lsm->lsm_oi) == 0)
+               ostid_set_id(&lsm->lsm_oi, ++last_object_id);
+
+       rc = 0;
+       if (on_target) {
+               /* Only echo objects are allowed to be created */
+               LASSERT((oa->o_valid & OBD_MD_FLGROUP) &&
+                       (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO));
+               rc = obd_create(env, ec->ec_exp, oa, &lsm, oti);
+               if (rc != 0) {
+                       CERROR("Cannot create objects: rc = %d\n", rc);
+                       GOTO(failed, rc);
+               }
+               created = 1;
+       }
+
+       /* See what object ID we were given */
+       oa->o_oi = lsm->lsm_oi;
+       oa->o_valid |= OBD_MD_FLID;
+
+       eco = cl_echo_object_find(ed, &lsm);
+       if (IS_ERR(eco))
+               GOTO(failed, rc = PTR_ERR(eco));
+       cl_echo_object_put(eco);
+
+       CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi));
+       EXIT;
+
+ failed:
+       if (created && rc)
+               obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL);
+       if (lsm)
+               echo_free_memmd(ed, &lsm);
+       if (rc)
+               CERROR("create object failed with: rc = %d\n", rc);
+       return rc;
+}
+
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+                          struct obdo *oa)
+{
+       struct lov_stripe_md *lsm = NULL;
+       struct echo_object   *eco;
+       int                   rc;
+       ENTRY;
+
+       if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) {
+               /* disallow use of object id 0 */
+               CERROR ("No valid oid\n");
+               RETURN(-EINVAL);
+       }
+
+       rc = echo_alloc_memmd(ed, &lsm);
+       if (rc < 0)
+               RETURN(rc);
+
+       lsm->lsm_oi = oa->o_oi;
+       if (!(oa->o_valid & OBD_MD_FLGROUP))
+               ostid_set_seq_echo(&lsm->lsm_oi);
+
+       rc = 0;
+       eco = cl_echo_object_find(ed, &lsm);
+       if (!IS_ERR(eco))
+               *ecop = eco;
+       else
+               rc = PTR_ERR(eco);
+       if (lsm)
+               echo_free_memmd(ed, &lsm);
+       RETURN(rc);
+}
+
+static void echo_put_object(struct echo_object *eco)
+{
+       if (cl_echo_object_put(eco))
+               CERROR("echo client: drop an object failed");
+}
+
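+/*
+ * Map a logical file offset to the object id and in-object offset of the
+ * stripe it lands on.  For example, with stripe_size = 1MB and
+ * stripe_count = 4 (width = 4MB), offset 5MB is one whole width plus
+ * woffset = 1MB, so stripe_index = 1 and the in-stripe offset becomes
+ * 1 * 1MB + 0 = 1MB.
+ */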
+static void
+echo_get_stripe_off_id(struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp)
+{
+       unsigned long stripe_count;
+       unsigned long stripe_size;
+       unsigned long width;
+       unsigned long woffset;
+       int           stripe_index;
+       obd_off       offset;
+
+       if (lsm->lsm_stripe_count <= 1)
+               return;
+
+       offset       = *offp;
+       stripe_size  = lsm->lsm_stripe_size;
+       stripe_count = lsm->lsm_stripe_count;
+
+       /* width = # bytes in all stripes */
+       width = stripe_size * stripe_count;
+
+       /* woffset = offset within a width; offset = whole number of widths */
+       woffset = do_div(offset, width);
+
+       stripe_index = woffset / stripe_size;
+
+       *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi);
+       *offp = offset * stripe_size + woffset % stripe_size;
+}
+
+static void
+echo_client_page_debug_setup(struct lov_stripe_md *lsm,
+                            struct page *page, int rw, obd_id id,
+                            obd_off offset, obd_off count)
+{
+       char    *addr;
+       obd_off  stripe_off;
+       obd_id   stripe_id;
+       int      delta;
+
+       /* no partial pages on the client */
+       LASSERT(count == PAGE_CACHE_SIZE);
+
+       addr = kmap(page);
+
+       for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+               if (rw == OBD_BRW_WRITE) {
+                       stripe_off = offset + delta;
+                       stripe_id = id;
+                       echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+               } else {
+                       stripe_off = 0xdeadbeef00c0ffeeULL;
+                       stripe_id = 0xdeadbeef00c0ffeeULL;
+               }
+               block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE,
+                                 stripe_off, stripe_id);
+       }
+
+       kunmap(page);
+}
+
+static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
+                                       struct page *page, obd_id id,
+                                       obd_off offset, obd_off count)
+{
+       obd_off stripe_off;
+       obd_id  stripe_id;
+       char   *addr;
+       int     delta;
+       int     rc;
+       int     rc2;
+
+       /* no partial pages on the client */
+       LASSERT(count == PAGE_CACHE_SIZE);
+
+       addr = kmap(page);
+
+       for (rc = delta = 0; delta < PAGE_CACHE_SIZE;
+            delta += OBD_ECHO_BLOCK_SIZE) {
+               stripe_off = offset + delta;
+               stripe_id = id;
+               echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id);
+
+               rc2 = block_debug_check("test_brw",
+                                       addr + delta, OBD_ECHO_BLOCK_SIZE,
+                                       stripe_off, stripe_id);
+               if (rc2 != 0) {
+                       CERROR ("Error in echo object "LPX64"\n", id);
+                       rc = rc2;
+               }
+       }
+
+       kunmap(page);
+       return rc;
+}
+
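+/*
+ * Bulk I/O through the client stack: allocate @count bytes worth of
+ * pages, stamp (write) or verify (read) a debug pattern per block when
+ * requested, and submit everything via cl_echo_object_brw().
+ */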
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+                           struct echo_object *eco, obd_off offset,
+                           obd_size count, int async,
+                           struct obd_trans_info *oti)
+{
+       struct lov_stripe_md  *lsm = eco->eo_lsm;
+       obd_count              npages;
+       struct brw_page       *pga;
+       struct brw_page       *pgp;
+       struct page          **pages;
+       obd_off                off;
+       int                    i;
+       int                    rc;
+       int                    verify;
+       int                    gfp_mask;
+       int                    brw_flags = 0;
+       ENTRY;
+
+       verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID &&
+                 (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
+                 (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
+
+       gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER;
+
+       LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
+       LASSERT(lsm != NULL);
+       LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi));
+
+       if (count <= 0 ||
+           (count & (~CFS_PAGE_MASK)) != 0)
+               RETURN(-EINVAL);
+
+       /* XXX think again with misaligned I/O */
+       npages = count >> PAGE_CACHE_SHIFT;
+
+       if (rw == OBD_BRW_WRITE)
+               brw_flags = OBD_BRW_ASYNC;
+
+       OBD_ALLOC(pga, npages * sizeof(*pga));
+       if (pga == NULL)
+               RETURN(-ENOMEM);
+
+       OBD_ALLOC(pages, npages * sizeof(*pages));
+       if (pages == NULL) {
+               OBD_FREE(pga, npages * sizeof(*pga));
+               RETURN(-ENOMEM);
+       }
+
+       for (i = 0, pgp = pga, off = offset;
+            i < npages;
+            i++, pgp++, off += PAGE_CACHE_SIZE) {
+
+               LASSERT(pgp->pg == NULL);       /* for cleanup */
+
+               rc = -ENOMEM;
+               OBD_PAGE_ALLOC(pgp->pg, gfp_mask);
+               if (pgp->pg == NULL)
+                       goto out;
+
+               pages[i] = pgp->pg;
+               pgp->count = PAGE_CACHE_SIZE;
+               pgp->off = off;
+               pgp->flag = brw_flags;
+
+               if (verify)
+                       echo_client_page_debug_setup(lsm, pgp->pg, rw,
+                                                    ostid_id(&oa->o_oi), off,
+                                                    pgp->count);
+       }
+
+       /* brw mode can only be used at client */
+       LASSERT(ed->ed_next != NULL);
+       rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
+
+ out:
+       if (rc != 0 || rw != OBD_BRW_READ)
+               verify = 0;
+
+       for (i = 0, pgp = pga; i < npages; i++, pgp++) {
+               if (pgp->pg == NULL)
+                       continue;
+
+               if (verify) {
+                       int vrc;
+                       vrc = echo_client_page_debug_check(lsm, pgp->pg,
+                                                          ostid_id(&oa->o_oi),
+                                                          pgp->off, pgp->count);
+                       if (vrc != 0 && rc == 0)
+                               rc = vrc;
+               }
+               OBD_PAGE_FREE(pgp->pg);
+       }
+       OBD_FREE(pga, npages * sizeof(*pga));
+       OBD_FREE(pages, npages * sizeof(*pages));
+       RETURN(rc);
+}
+
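+/*
+ * Bulk I/O through the server-side interface: the transfer is split into
+ * batches of at most @batch bytes, and each batch goes through
+ * obd_preprw()/obd_commitrw() just as a real OST request would.
+ */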
+static int echo_client_prep_commit(const struct lu_env *env,
+                                  struct obd_export *exp, int rw,
+                                  struct obdo *oa, struct echo_object *eco,
+                                  obd_off offset, obd_size count,
+                                  obd_size batch, struct obd_trans_info *oti,
+                                  int async)
+{
+       struct lov_stripe_md *lsm = eco->eo_lsm;
+       struct obd_ioobj ioo;
+       struct niobuf_local *lnb;
+       struct niobuf_remote *rnb;
+       obd_off off;
+       obd_size npages, tot_pages;
+       int i, ret = 0, brw_flags = 0;
+
+       ENTRY;
+
+       if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 ||
+           (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi)))
+               RETURN(-EINVAL);
+
+       npages = batch >> PAGE_CACHE_SHIFT;
+       tot_pages = count >> PAGE_CACHE_SHIFT;
+
+       OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local));
+       OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote));
+
+       if (lnb == NULL || rnb == NULL)
+               GOTO(out, ret = -ENOMEM);
+
+       if (rw == OBD_BRW_WRITE && async)
+               brw_flags |= OBD_BRW_ASYNC;
+
+       obdo_to_ioobj(oa, &ioo);
+
+       off = offset;
+
+       for (; tot_pages; tot_pages -= npages) {
+               int lpages;
+
+               if (tot_pages < npages)
+                       npages = tot_pages;
+
+               for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+                       rnb[i].offset = off;
+                       rnb[i].len = PAGE_CACHE_SIZE;
+                       rnb[i].flags = brw_flags;
+               }
+
+               ioo.ioo_bufcnt = npages;
+               oti->oti_transno = 0;
+
+               lpages = npages;
+               ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages,
+                                lnb, oti, NULL);
+               if (ret != 0)
+                       GOTO(out, ret);
+               LASSERT(lpages == npages);
+
+               for (i = 0; i < lpages; i++) {
+                       struct page *page = lnb[i].page;
+
+                       /* read past eof? */
+                       if (page == NULL && lnb[i].rc == 0)
+                               continue;
+
+                       if (async)
+                               lnb[i].flags |= OBD_BRW_ASYNC;
+
+                       if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID ||
+                           (oa->o_valid & OBD_MD_FLFLAGS) == 0 ||
+                           (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0)
+                               continue;
+
+                       if (rw == OBD_BRW_WRITE)
+                               echo_client_page_debug_setup(lsm, page, rw,
+                                                            ostid_id(&oa->o_oi),
+                                                            rnb[i].offset,
+                                                            rnb[i].len);
+                       else
+                               echo_client_page_debug_check(lsm, page,
+                                                            ostid_id(&oa->o_oi),
+                                                            rnb[i].offset,
+                                                            rnb[i].len);
+               }
+
+               ret = obd_commitrw(env, rw, exp, oa, 1, &ioo,
+                                  rnb, npages, lnb, oti, ret);
+               if (ret != 0)
+                       GOTO(out, ret);
+
+               /* Reset oti otherwise it would confuse ldiskfs. */
+               memset(oti, 0, sizeof(*oti));
+
+               /* Reuse env context. */
+               lu_context_exit((struct lu_context *)&env->le_ctx);
+               lu_context_enter((struct lu_context *)&env->le_ctx);
+       }
+
+out:
+       /* npages may have been clamped to tot_pages inside the loop, so
+        * free with the originally allocated size */
+       if (lnb)
+               OBD_FREE(lnb, (batch >> PAGE_CACHE_SHIFT) * sizeof(*lnb));
+       if (rnb)
+               OBD_FREE(rnb, (batch >> PAGE_CACHE_SHIFT) * sizeof(*rnb));
+       RETURN(ret);
+}
+
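+/*
+ * Dispatch OBD_IOC_BRW_{READ,WRITE}: test modes 1 and 2 use the client
+ * brw path (mode 1 synchronously), mode 3 the prep/commit path; without
+ * a lower device only mode 3 is possible.
+ */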
+static int echo_client_brw_ioctl(const struct lu_env *env, int rw,
+                                struct obd_export *exp,
+                                struct obd_ioctl_data *data,
+                                struct obd_trans_info *dummy_oti)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct echo_device *ed = obd2echo_dev(obd);
+       struct echo_client_obd *ec = ed->ed_ec;
+       struct obdo *oa = &data->ioc_obdo1;
+       struct echo_object *eco;
+       int rc;
+       int async = 1;
+       long test_mode;
+       ENTRY;
+
+       LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+       rc = echo_get_object(&eco, ed, oa);
+       if (rc)
+               RETURN(rc);
+
+       oa->o_valid &= ~OBD_MD_FLHANDLE;
+
+       /* OFD/obdfilter works only via prep/commit */
+       test_mode = (long)data->ioc_pbuf1;
+       if (test_mode == 1)
+               async = 0;
+
+       if (ed->ed_next == NULL && test_mode != 3) {
+               test_mode = 3;
+               data->ioc_plen1 = data->ioc_count;
+       }
+
+       /* Truncate batch size to maximum */
+       if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE)
+               data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE;
+
+       switch (test_mode) {
+       case 1:
+               /* fall through */
+       case 2:
+               rc = echo_client_kbrw(ed, rw, oa,
+                                     eco, data->ioc_offset,
+                                     data->ioc_count, async, dummy_oti);
+               break;
+       case 3:
+               rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa,
+                                            eco, data->ioc_offset,
+                                            data->ioc_count, data->ioc_plen1,
+                                            dummy_oti, async);
+               break;
+       default:
+               rc = -EINVAL;
+       }
+       echo_put_object(eco);
+       RETURN(rc);
+}
+
+static int
+echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
+                   int mode, obd_off offset, obd_size nob)
+{
+       struct echo_device   *ed = obd2echo_dev(exp->exp_obd);
+       struct lustre_handle *ulh = &oa->o_handle;
+       struct echo_object   *eco;
+       obd_off               end;
+       int                   rc;
+       ENTRY;
+
+       if (ed->ed_next == NULL)
+               RETURN(-EOPNOTSUPP);
+
+       if (!(mode == LCK_PR || mode == LCK_PW))
+               RETURN(-EINVAL);
+
+       if ((offset & (~CFS_PAGE_MASK)) != 0 ||
+           (nob & (~CFS_PAGE_MASK)) != 0)
+               RETURN(-EINVAL);
+
+       rc = echo_get_object (&eco, ed, oa);
+       if (rc != 0)
+               RETURN(rc);
+
+       end = (nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
+       rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie);
+       if (rc == 0) {
+               oa->o_valid |= OBD_MD_FLHANDLE;
+               CDEBUG(D_INFO, "Cookie is "LPX64"\n", ulh->cookie);
+       }
+       echo_put_object(eco);
+       RETURN(rc);
+}
+
+static int
+echo_client_cancel(struct obd_export *exp, struct obdo *oa)
+{
+       struct echo_device *ed = obd2echo_dev(exp->exp_obd);
+       __u64               cookie = oa->o_handle.cookie;
+
+       if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
+               return -EINVAL;
+
+       CDEBUG(D_INFO, "Cookie is "LPX64"\n", cookie);
+       return cl_echo_cancel(ed, cookie);
+}
+
+static int
+echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                     void *karg, void *uarg)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct echo_device      *ed = obd2echo_dev(obd);
+       struct echo_client_obd  *ec = ed->ed_ec;
+       struct echo_object      *eco;
+       struct obd_ioctl_data   *data = karg;
+       struct obd_trans_info    dummy_oti;
+       struct lu_env           *env;
+       struct oti_req_ack_lock *ack_lock;
+       struct obdo             *oa;
+       struct lu_fid            fid;
+       int                      rw = OBD_BRW_READ;
+       int                      rc = 0;
+       int                      i;
+       ENTRY;
+
+       memset(&dummy_oti, 0, sizeof(dummy_oti));
+
+       oa = &data->ioc_obdo1;
+       if (!(oa->o_valid & OBD_MD_FLGROUP)) {
+               oa->o_valid |= OBD_MD_FLGROUP;
+               ostid_set_seq_echo(&oa->o_oi);
+       }
+
+       /* This FID is unpacked just for validation at this point */
+       rc = ostid_to_fid(&fid, &oa->o_oi, 0);
+       if (rc < 0)
+               RETURN(rc);
+
+       OBD_ALLOC_PTR(env);
+       if (env == NULL)
+               RETURN(-ENOMEM);
+
+       rc = lu_env_init(env, LCT_DT_THREAD);
+       if (rc)
+               GOTO(out, rc = -ENOMEM);
+
+       switch (cmd) {
+       case OBD_IOC_CREATE:                /* may create echo object */
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1,
+                                       data->ioc_plen1, &dummy_oti);
+               GOTO(out, rc);
+
+       case OBD_IOC_ECHO_MD: {
+               int count;
+               int cmd;
+               char *dir = NULL;
+               int dirlen;
+               __u64 id;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               count = data->ioc_count;
+               cmd = data->ioc_command;
+
+               id = ostid_id(&data->ioc_obdo2.o_oi);
+
+               dirlen = data->ioc_plen1;
+               OBD_ALLOC(dir, dirlen + 1);
+               if (dir == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) {
+                       OBD_FREE(dir, dirlen + 1);
+                       GOTO(out, rc = -EFAULT);
+               }
+
+               rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data);
+               OBD_FREE(dir, dirlen + 1);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_ECHO_ALLOC_SEQ: {
+               struct lu_env *cl_env;
+               int            refcheck;
+               __u64          seq;
+               int            max_count;
+
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               cl_env = cl_env_get(&refcheck);
+               if (IS_ERR(cl_env))
+                       GOTO(out, rc = PTR_ERR(cl_env));
+
+               rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG,
+                                           ECHO_MD_SES_TAG);
+               if (rc != 0) {
+                       cl_env_put(cl_env, &refcheck);
+                       GOTO(out, rc);
+               }
+
+               rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq);
+               cl_env_put(cl_env, &refcheck);
+               if (rc < 0) {
+                       CERROR("%s: Can not alloc seq: rc = %d\n",
+                              obd->obd_name, rc);
+                       GOTO(out, rc);
+               }
+
+               if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1))
+                       GOTO(out, rc = -EFAULT);
+
+               max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH;
+               if (copy_to_user(data->ioc_pbuf2, &max_count,
+                                data->ioc_plen2))
+                       GOTO(out, rc = -EFAULT);
+               GOTO(out, rc);
+       }
+       case OBD_IOC_DESTROY:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm,
+                                        &dummy_oti, NULL, NULL);
+                       if (rc == 0)
+                               eco->eo_deleted = 1;
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_GETATTR:
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       struct obd_info oinfo = { { { 0 } } };
+                       oinfo.oi_md = eco->eo_lsm;
+                       oinfo.oi_oa = oa;
+                       rc = obd_getattr(env, ec->ec_exp, &oinfo);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_SETATTR:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       struct obd_info oinfo = { { { 0 } } };
+                       oinfo.oi_oa = oa;
+                       oinfo.oi_md = eco->eo_lsm;
+
+                       rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case OBD_IOC_BRW_WRITE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               rw = OBD_BRW_WRITE;
+               /* fall through */
+       case OBD_IOC_BRW_READ:
+               rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti);
+               GOTO(out, rc);
+
+       case ECHO_IOC_GET_STRIPE:
+               rc = echo_get_object(&eco, ed, oa);
+               if (rc == 0) {
+                       rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
+                                             data->ioc_plen1);
+                       echo_put_object(eco);
+               }
+               GOTO(out, rc);
+
+       case ECHO_IOC_SET_STRIPE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               if (data->ioc_pbuf1 == NULL) {  /* unset */
+                       rc = echo_get_object(&eco, ed, oa);
+                       if (rc == 0) {
+                               eco->eo_deleted = 1;
+                               echo_put_object(eco);
+                       }
+               } else {
+                       rc = echo_create_object(env, ed, 0, oa,
+                                               data->ioc_pbuf1,
+                                               data->ioc_plen1, &dummy_oti);
+               }
+               GOTO(out, rc);
+
+       case ECHO_IOC_ENQUEUE:
+               if (!cfs_capable(CFS_CAP_SYS_ADMIN))
+                       GOTO(out, rc = -EPERM);
+
+               rc = echo_client_enqueue(exp, oa,
+                                        data->ioc_conn1, /* lock mode */
+                                        data->ioc_offset,
+                                        data->ioc_count);/*extent*/
+               GOTO(out, rc);
+
+       case ECHO_IOC_CANCEL:
+               rc = echo_client_cancel(exp, oa);
+               GOTO(out, rc);
+
+       default:
+               CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
+               GOTO (out, rc = -ENOTTY);
+       }
+
+       EXIT;
+out:
+       lu_env_fini(env);
+       OBD_FREE_PTR(env);
+
+       /* XXX this should be in a helper also called by target_send_reply */
+       for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4;
+            i++, ack_lock++) {
+               if (!ack_lock->mode)
+                       break;
+               ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+       }
+
+       return rc;
+}
+
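+/*
+ * Attach the echo client to its target OBD.  An MDT target only needs
+ * the MD context/session tags updated; other targets get a full
+ * obd_connect() with echo-specific connect data.
+ */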
+static int echo_client_setup(const struct lu_env *env,
+                            struct obd_device *obddev, struct lustre_cfg *lcfg)
+{
+       struct echo_client_obd *ec = &obddev->u.echo_client;
+       struct obd_device *tgt;
+       struct obd_uuid echo_uuid = { "ECHO_UUID" };
+       struct obd_connect_data *ocd = NULL;
+       int rc;
+       ENTRY;
+
+       if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
+               CERROR("requires a TARGET OBD name\n");
+               RETURN(-EINVAL);
+       }
+
+       tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+       if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+               CERROR("device not attached or not set up (%s)\n",
+                      lustre_cfg_string(lcfg, 1));
+               RETURN(-EINVAL);
+       }
+
+       spin_lock_init(&ec->ec_lock);
+       INIT_LIST_HEAD (&ec->ec_objects);
+       INIT_LIST_HEAD (&ec->ec_locks);
+       ec->ec_unique = 0;
+       ec->ec_nstripes = 0;
+
+       if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) {
+               lu_context_tags_update(ECHO_MD_CTX_TAG);
+               lu_session_tags_update(ECHO_MD_SES_TAG);
+               RETURN(0);
+       }
+
+       OBD_ALLOC(ocd, sizeof(*ocd));
+       if (ocd == NULL) {
+               CERROR("Can't alloc ocd connecting to %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               RETURN(-ENOMEM);
+       }
+
+       ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+                                OBD_CONNECT_BRW_SIZE |
+                                OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 |
+                                OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE |
+                                OBD_CONNECT_FID;
+       ocd->ocd_brw_size = DT_MAX_BRW_SIZE;
+       ocd->ocd_version = LUSTRE_VERSION_CODE;
+       ocd->ocd_group = FID_SEQ_ECHO;
+
+       rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL);
+       if (rc == 0) {
+               /* Turn off pinger because it connects to tgt obd directly. */
+               spin_lock(&tgt->obd_dev_lock);
+               list_del_init(&ec->ec_exp->exp_obd_chain_timed);
+               spin_unlock(&tgt->obd_dev_lock);
+       }
+
+       OBD_FREE(ocd, sizeof(*ocd));
+
+       if (rc != 0) {
+               CERROR("fail to connect to device %s\n",
+                      lustre_cfg_string(lcfg, 1));
+               return (rc);
+       }
+
+       RETURN(rc);
+}
+
+static int echo_client_cleanup(struct obd_device *obddev)
+{
+       struct echo_device *ed = obd2echo_dev(obddev);
+       struct echo_client_obd *ec = &obddev->u.echo_client;
+       int rc;
+       ENTRY;
+
+       /*Do nothing for Metadata echo client*/
+       if (ed == NULL)
+               RETURN(0);
+
+       if (ed->ed_next_ismd) {
+               lu_context_tags_clear(ECHO_MD_CTX_TAG);
+               lu_session_tags_clear(ECHO_MD_SES_TAG);
+               RETURN(0);
+       }
+
+       if (!list_empty(&obddev->obd_exports)) {
+               CERROR("still has clients!\n");
+               RETURN(-EBUSY);
+       }
+
+       LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0);
+       rc = obd_disconnect(ec->ec_exp);
+       if (rc != 0)
+               CERROR("fail to disconnect device: %d\n", rc);
+
+       RETURN(rc);
+}
+
+static int echo_client_connect(const struct lu_env *env,
+                              struct obd_export **exp,
+                              struct obd_device *src, struct obd_uuid *cluuid,
+                              struct obd_connect_data *data, void *localdata)
+{
+       int             rc;
+       struct lustre_handle conn = { 0 };
+
+       ENTRY;
+       rc = class_connect(&conn, src, cluuid);
+       if (rc == 0)
+               *exp = class_conn2export(&conn);
+
+       RETURN(rc);
+}
+
+static int echo_client_disconnect(struct obd_export *exp)
+{
+#if 0
+       struct obd_device      *obd;
+       struct echo_client_obd *ec;
+       struct ec_lock   *ecl;
+#endif
+       int                  rc;
+       ENTRY;
+
+       if (exp == NULL)
+               GOTO(out, rc = -EINVAL);
+
+#if 0
+       obd = exp->exp_obd;
+       ec = &obd->u.echo_client;
+
+       /* no more contention on export's lock list */
+       while (!list_empty (&exp->exp_ec_data.eced_locks)) {
+               ecl = list_entry (exp->exp_ec_data.eced_locks.next,
+                                     struct ec_lock, ecl_exp_chain);
+               list_del (&ecl->ecl_exp_chain);
+
+               rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm,
+                                ecl->ecl_mode, &ecl->ecl_lock_handle);
+
+               CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect "
+                       "(%d)\n", ecl->ecl_object->eco_id, rc);
+
+               echo_put_object (ecl->ecl_object);
+               OBD_FREE (ecl, sizeof (*ecl));
+       }
+#endif
+
+       rc = class_disconnect(exp);
+       GOTO(out, rc);
+ out:
+       return rc;
+}
+
+static struct obd_ops echo_client_obd_ops = {
+       .o_owner       = THIS_MODULE,
+
+#if 0
+       .o_setup       = echo_client_setup,
+       .o_cleanup     = echo_client_cleanup,
+#endif
+
+       .o_iocontrol   = echo_client_iocontrol,
+       .o_connect     = echo_client_connect,
+       .o_disconnect  = echo_client_disconnect
+};
+
+int echo_client_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+
+       lprocfs_echo_init_vars(&lvars);
+
+       rc = lu_kmem_init(echo_caches);
+       if (rc == 0) {
+               rc = class_register_type(&echo_client_obd_ops, NULL,
+                                        lvars.module_vars,
+                                        LUSTRE_ECHO_CLIENT_NAME,
+                                        &echo_device_type);
+               if (rc)
+                       lu_kmem_fini(echo_caches);
+       }
+       return rc;
+}
+
+void echo_client_exit(void)
+{
+       class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);
+       lu_kmem_fini(echo_caches);
+}
+
+static int __init obdecho_init(void)
+{
+       struct lprocfs_static_vars lvars;
+       int rc;
+
+       ENTRY;
+       LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
+
+       LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+
+       lprocfs_echo_init_vars(&lvars);
+
+       rc = echo_client_init();
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ obdecho_exit(void)
+{
+       echo_client_exit();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Testing Echo OBD driver");
+MODULE_LICENSE("GPL");
+
+cfs_module(obdecho, LUSTRE_VERSION_STRING, obdecho_init, obdecho_exit);
+
+/** @} echo_client */
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h
new file mode 100644 (file)
index 0000000..8e9dbc2
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Whamcloud, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdecho/echo_internal.h
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/* The persistent object (i.e. actually stores stuff!) */
+#define ECHO_PERSISTENT_OBJID    1ULL
+#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))
+
+/* block size to use for data verification */
+#define OBD_ECHO_BLOCK_SIZE    (4<<10)
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c
new file mode 100644 (file)
index 0000000..b9abac1
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_ECHO
+
+#include <lprocfs_status.h>
+#include <obd_class.h>
+
+#ifdef LPROCFS
+LPROC_SEQ_FOPS_RO_TYPE(echo, uuid);
+static struct lprocfs_vars lprocfs_echo_obd_vars[] = {
+       { "uuid",        &echo_uuid_fops,       0, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(echo, numrefs);
+static struct lprocfs_vars lprocfs_echo_module_vars[] = {
+       { "num_refs",     &echo_numrefs_fops,     0, 0 },
+       { 0 }
+};
+
+void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_echo_module_vars;
+       lvars->obd_vars    = lprocfs_echo_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile
new file mode 100644 (file)
index 0000000..bbd2f77
--- /dev/null
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LUSTRE_FS) += osc.o
+osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o \
+        osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
new file mode 100644 (file)
index 0000000..198cf3b
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/version.h>
+#include <asm/statfs.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <linux/seq_file.h>
+#include "osc_internal.h"
+
+#ifdef LPROCFS
+static int osc_active_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       int rc;
+
+       LPROCFS_CLIMP_CHECK(dev);
+       rc = seq_printf(m, "%d\n", !dev->u.cli.cl_import->imp_deactive);
+       LPROCFS_CLIMP_EXIT(dev);
+       return rc;
+}
+
+static ssize_t osc_active_seq_write(struct file *file, const char *buffer,
+                                   size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+       if (val < 0 || val > 1)
+               return -ERANGE;
+
+       /* imp_deactive and "active" have opposite senses, so equality
+        * here means the requested state differs from the current one. */
+       if (dev->u.cli.cl_import->imp_deactive == val) {
+               rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val);
+               if (rc)
+                       return rc;
+       } else {
+               CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val);
+       }
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_active);
+
+static int osc_max_rpcs_in_flight_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%u\n", cli->cl_max_rpcs_in_flight);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static ssize_t osc_max_rpcs_in_flight_seq_write(struct file *file,
+                       const char *buffer, size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 1 || val > OSC_MAX_RIF_MAX)
+               return -ERANGE;
+
+       LPROCFS_CLIMP_CHECK(dev);
+       if (pool && val > cli->cl_max_rpcs_in_flight)
+               pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_rpcs_in_flight = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_EXIT(dev);
+       return count;
+}
+LPROC_SEQ_FOPS(osc_max_rpcs_in_flight);
+
+static int osc_max_dirty_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       long val;
+       int mult;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       val = cli->cl_dirty_max;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       mult = 1 << 20;
+       return lprocfs_seq_read_frac_helper(m, val, mult);
+}
+
+static ssize_t osc_max_dirty_mb_seq_write(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &dev->u.cli;
+       int pages_number, mult, rc;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number <= 0 ||
+           pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) ||
+           pages_number > num_physpages / 4) /* 1/4 of RAM */
+               return -ERANGE;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_dirty_max = (obd_count)(pages_number << PAGE_CACHE_SHIFT);
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_max_dirty_mb);
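+
+/* Unit bookkeeping: assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12),
+ * mult == 1 << (20 - 12) == 256 pages per MiB, so writing "32" sets
+ * pages_number to 8192 and cl_dirty_max to 32 MiB worth of bytes. */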
+
+static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int shift = 20 - PAGE_CACHE_SHIFT;
+       int rc;
+
+       rc = seq_printf(m,
+                     "used_mb: %d\n"
+                     "busy_cnt: %d\n",
+                     (atomic_read(&cli->cl_lru_in_list) +
+                       atomic_read(&cli->cl_lru_busy)) >> shift,
+                     atomic_read(&cli->cl_lru_busy));
+
+       return rc;
+}
+
+/* shrink the number of caching pages to a specific number */
+static ssize_t osc_cached_mb_seq_write(struct file *file, const char *buffer,
+                                  size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &dev->u.cli;
+       int pages_number, mult, rc;
+
+       mult = 1 << (20 - PAGE_CACHE_SHIFT);
+       buffer = lprocfs_find_named_value(buffer, "used_mb:", &count);
+       rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+       if (rc)
+               return rc;
+
+       if (pages_number < 0)
+               return -ERANGE;
+
+       rc = atomic_read(&cli->cl_lru_in_list) - pages_number;
+       if (rc > 0)
+               (void)osc_lru_shrink(cli, rc);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_cached_mb);
+
+static int osc_cur_dirty_bytes_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%lu\n", cli->cl_dirty);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_dirty_bytes);
+
+static int osc_cur_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%lu\n", cli->cl_avail_grant);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+static ssize_t osc_cur_grant_bytes_seq_write(struct file *file, const char *buffer,
+                                 size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli;
+       int rc;
+       __u64 val;
+
+       /* check the device before touching any of its fields */
+       if (obd == NULL)
+               return 0;
+       cli = &obd->u.cli;
+
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       /* this is only for shrinking grant */
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (val >= cli->cl_avail_grant) {
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               return 0;
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_CHECK(obd);
+       if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
+               rc = osc_shrink_grant_to_target(cli, val);
+       LPROCFS_CLIMP_EXIT(obd);
+       if (rc)
+               return rc;
+       return count;
+}
+LPROC_SEQ_FOPS(osc_cur_grant_bytes);
+
+static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *dev = m->private;
+       struct client_obd *cli = &dev->u.cli;
+       int rc;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = seq_printf(m, "%lu\n", cli->cl_lost_grant);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes);
+
+static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+
+       if (obd == NULL)
+               return 0;
+       return seq_printf(m, "%d\n",
+                       obd->u.cli.cl_grant_shrink_interval);
+}
+
+static ssize_t osc_grant_shrink_interval_seq_write(struct file *file,
+                               const char *buffer, size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       int val, rc;
+
+       if (obd == NULL)
+               return 0;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val <= 0)
+               return -ERANGE;
+
+       obd->u.cli.cl_grant_shrink_interval = val;
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_grant_shrink_interval);
+
+static int osc_checksum_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+
+       if (obd == NULL)
+               return 0;
+
+       return seq_printf(m, "%d\n",
+                       obd->u.cli.cl_checksum ? 1 : 0);
+}
+
+static ssize_t osc_checksum_seq_write(struct file *file, const char *buffer,
+                          size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       int val, rc;
+
+       if (obd == NULL)
+               return 0;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       obd->u.cli.cl_checksum = (val ? 1 : 0);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_checksum);
+
+static int osc_checksum_type_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       int i;
+       DECLARE_CKSUM_NAME;
+
+       if (obd == NULL)
+               return 0;
+
+       for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
+               if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+                       continue;
+               if (obd->u.cli.cl_cksum_type == (1 << i))
+                       seq_printf(m, "[%s] ", cksum_name[i]);
+               else
+                       seq_printf(m, "%s ", cksum_name[i]);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
+static ssize_t osc_checksum_type_seq_write(struct file *file, const char *buffer,
+                               size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       int i;
+       DECLARE_CKSUM_NAME;
+       char kernbuf[10];
+
+       if (obd == NULL)
+               return 0;
+
+       if (count > sizeof(kernbuf) - 1)
+               return -EINVAL;
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+       if (count > 0 && kernbuf[count - 1] == '\n')
+               kernbuf[count - 1] = '\0';
+       else
+               kernbuf[count] = '\0';
+
+       for (i = 0; i < ARRAY_SIZE(cksum_name); i++) {
+               if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0)
+                       continue;
+               if (!strcmp(kernbuf, cksum_name[i])) {
+                       obd->u.cli.cl_cksum_type = 1 << i;
+                       return count;
+               }
+       }
+       return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_checksum_type);
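+
+/* Usage sketch (algorithm names are illustrative; the actual set comes
+ * from DECLARE_CKSUM_NAME, filtered by cl_supp_cksum_types):
+ *
+ *   $ cat  .../checksum_type          # e.g. "crc32 [adler]"
+ *   $ echo crc32 > .../checksum_type  # selects crc32
+ *
+ * The show handler brackets the active type; the write handler sets
+ * cl_cksum_type to the matching 1 << i bit. */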
+
+static int osc_resend_count_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+
+       return seq_printf(m, "%u\n", atomic_read(&obd->u.cli.cl_resends));
+}
+
+static ssize_t osc_resend_count_seq_write(struct file *file, const char *buffer,
+                              size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       int val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 0)
+               return -EINVAL;
+
+       atomic_set(&obd->u.cli.cl_resends, val);
+
+       return count;
+}
+LPROC_SEQ_FOPS(osc_resend_count);
+
+static int osc_contention_seconds_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return seq_printf(m, "%u\n", od->od_contention_time);
+}
+
+static ssize_t osc_contention_seconds_seq_write(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?:
+               count;
+}
+LPROC_SEQ_FOPS(osc_contention_seconds);
+
+static int osc_lockless_truncate_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return seq_printf(m, "%u\n", od->od_lockless_truncate);
+}
+
+static ssize_t osc_lockless_truncate_seq_write(struct file *file, const char *buffer,
+                                   size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct osc_device *od  = obd2osc_dev(obd);
+
+       return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?:
+               count;
+}
+LPROC_SEQ_FOPS(osc_lockless_truncate);
+
+static int osc_destroys_in_flight_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       return seq_printf(m, "%u\n",
+                       atomic_read(&obd->u.cli.cl_destroy_in_flight));
+}
+LPROC_SEQ_FOPS_RO(osc_destroys_in_flight);
+
+static int osc_obd_max_pages_per_rpc_seq_show(struct seq_file *m, void *v)
+{
+       return lprocfs_obd_rd_max_pages_per_rpc(m, m->private);
+}
+
+static ssize_t osc_obd_max_pages_per_rpc_seq_write(struct file *file,
+                               const char *buffer, size_t count, loff_t *off)
+{
+       struct obd_device *dev = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data;
+       int chunk_mask, rc;
+       __u64 val;
+
+       rc = lprocfs_write_u64_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       /* if the max_pages is specified in bytes, convert to pages */
+       if (val >= ONE_MB_BRW_SIZE)
+               val >>= PAGE_CACHE_SHIFT;
+
+       LPROCFS_CLIMP_CHECK(dev);
+
+       chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+       /* max_pages_per_rpc must be chunk aligned */
+       val = (val + ~chunk_mask) & chunk_mask;
+       if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+               LPROCFS_CLIMP_EXIT(dev);
+               return -ERANGE;
+       }
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_max_pages_per_rpc = val;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       LPROCFS_CLIMP_EXIT(dev);
+       return count;
+}
+LPROC_SEQ_FOPS(osc_obd_max_pages_per_rpc);
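+
+/* Alignment example (assuming cl_chunkbits - PAGE_CACHE_SHIFT == 2, i.e.
+ * 4 pages per chunk): chunk_mask == ~3, so a requested val of 9 pages
+ * rounds up via (9 + 3) & ~3 to 12 pages before the range check. */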
+
+LPROC_SEQ_FOPS_RO_TYPE(osc, uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags);
+LPROC_SEQ_FOPS_RO_TYPE(osc, blksize);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, kbytesavail);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filestotal);
+LPROC_SEQ_FOPS_RO_TYPE(osc, filesfree);
+LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid);
+LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts);
+LPROC_SEQ_FOPS_RO_TYPE(osc, state);
+
+LPROC_SEQ_FOPS_WR_ONLY(osc, ping);
+
+LPROC_SEQ_FOPS_RW_TYPE(osc, import);
+LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov);
+
+static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
+       { "uuid",            &osc_uuid_fops,    0, 0 },
+       { "ping",            &osc_ping_fops,    0, 0222 },
+       { "connect_flags",   &osc_connect_flags_fops, 0, 0 },
+       { "blocksize",       &osc_blksize_fops,     0, 0 },
+       { "kbytestotal",     &osc_kbytestotal_fops, 0, 0 },
+       { "kbytesfree",      &osc_kbytesfree_fops,  0, 0 },
+       { "kbytesavail",     &osc_kbytesavail_fops, 0, 0 },
+       { "filestotal",      &osc_filestotal_fops,  0, 0 },
+       { "filesfree",       &osc_filesfree_fops,   0, 0 },
+       //{ "filegroups",      lprocfs_rd_filegroups,  0, 0 },
+       { "ost_server_uuid", &osc_server_uuid_fops, 0, 0 },
+       { "ost_conn_uuid",   &osc_conn_uuid_fops, 0, 0 },
+       { "active",          &osc_active_fops, 0 },
+       { "max_pages_per_rpc", &osc_obd_max_pages_per_rpc_fops, 0 },
+       { "max_rpcs_in_flight", &osc_max_rpcs_in_flight_fops, 0 },
+       { "destroys_in_flight", &osc_destroys_in_flight_fops, 0, 0 },
+       { "max_dirty_mb",    &osc_max_dirty_mb_fops, 0 },
+       { "osc_cached_mb",   &osc_cached_mb_fops, 0 },
+       { "cur_dirty_bytes", &osc_cur_dirty_bytes_fops, 0, 0 },
+       { "cur_grant_bytes", &osc_cur_grant_bytes_fops, 0 },
+       { "cur_lost_grant_bytes", &osc_cur_lost_grant_bytes_fops, 0, 0},
+       { "grant_shrink_interval", &osc_grant_shrink_interval_fops, 0 },
+       { "checksums",       &osc_checksum_fops, 0 },
+       { "checksum_type",   &osc_checksum_type_fops, 0 },
+       { "resend_count",    &osc_resend_count_fops, 0},
+       { "timeouts",        &osc_timeouts_fops, 0, 0 },
+       { "contention_seconds", &osc_contention_seconds_fops, 0 },
+       { "lockless_truncate",  &osc_lockless_truncate_fops, 0 },
+       { "import",             &osc_import_fops, 0 },
+       { "state",              &osc_state_fops, 0, 0 },
+       { "pinger_recov",       &osc_pinger_recov_fops, 0 },
+       { 0 }
+};
+
+LPROC_SEQ_FOPS_RO_TYPE(osc, numrefs);
+static struct lprocfs_vars lprocfs_osc_module_vars[] = {
+       { "num_refs",   &osc_numrefs_fops,     0, 0 },
+       { 0 }
+};
+
+#define pct(a, b) ((b) ? (a) * 100 / (b) : 0)
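+/* e.g. pct(50, 200) == 25; a zero total yields 0 instead of dividing by
+ * zero. */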
+
+static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum;
+       int i;
+
+       do_gettimeofday(&now);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "read RPCs in flight:  %d\n",
+                  cli->cl_r_in_flight);
+       seq_printf(seq, "write RPCs in flight: %d\n",
+                  cli->cl_w_in_flight);
+       seq_printf(seq, "pending write pages:  %d\n",
+                  atomic_read(&cli->cl_pending_w_pages));
+       seq_printf(seq, "pending read pages:   %d\n",
+                  atomic_read(&cli->cl_pending_r_pages));
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "pages per rpc   rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_page_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_page_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                                1 << i, r, pct(r, read_tot),
+                                pct(read_cum, read_tot), w,
+                                pct(w, write_tot),
+                                pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "rpcs in flight rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                                i, r, pct(r, read_tot),
+                                pct(read_cum, read_tot), w,
+                                pct(w, write_tot),
+                                pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       seq_printf(seq, "\n\t\t\tread\t\t\twrite\n");
+       seq_printf(seq, "offset         rpcs   %% cum %% |");
+       seq_printf(seq, "       rpcs   %% cum %%\n");
+
+       read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist);
+       write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist);
+
+       read_cum = 0;
+       write_cum = 0;
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               unsigned long r = cli->cl_read_offset_hist.oh_buckets[i];
+               unsigned long w = cli->cl_write_offset_hist.oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               seq_printf(seq, "%d:\t\t%10lu %3lu %3lu   | %10lu %3lu %3lu\n",
+                          (i == 0) ? 0 : 1 << (i - 1),
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return 0;
+}
+#undef pct
+
+static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+
+       lprocfs_oh_clear(&cli->cl_read_rpc_hist);
+       lprocfs_oh_clear(&cli->cl_write_rpc_hist);
+       lprocfs_oh_clear(&cli->cl_read_page_hist);
+       lprocfs_oh_clear(&cli->cl_write_page_hist);
+       lprocfs_oh_clear(&cli->cl_read_offset_hist);
+       lprocfs_oh_clear(&cli->cl_write_offset_hist);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(osc_rpc_stats);
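+
+/* Writing any value to rpc_stats (e.g. "echo 0 > .../rpc_stats") resets
+ * all six histograms at once; there is no per-histogram clear. */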
+
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct timeval now;
+       struct obd_device *dev = seq->private;
+       struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+       do_gettimeofday(&now);
+
+       seq_printf(seq, "snapshot_time:  %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+       seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n",
+                  stats->os_lockless_writes);
+       seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n",
+                  stats->os_lockless_reads);
+       seq_printf(seq, "lockless_truncate\t\t"LPU64"\n",
+                  stats->os_lockless_truncates);
+       return 0;
+}
+
+static ssize_t osc_stats_seq_write(struct file *file, const char *buf,
+                                  size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct obd_device *dev = seq->private;
+       struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+       memset(stats, 0, sizeof(*stats));
+       return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
+int lproc_osc_attach_seqstat(struct obd_device *dev)
+{
+       int rc;
+
+       rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644,
+                               &osc_stats_fops, dev);
+       if (rc == 0)
+               rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644,
+                                           &osc_rpc_stats_fops, dev);
+
+       return rc;
+}
+
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       lvars->module_vars = lprocfs_osc_module_vars;
+       lvars->obd_vars    = lprocfs_osc_obd_vars;
+}
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
new file mode 100644 (file)
index 0000000..0a0ec6f
--- /dev/null
@@ -0,0 +1,2916 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * osc cache management.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+#include "osc_internal.h"
+
+static int extent_debug; /* set non-zero for extra debugging checks */
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+                          int state);
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+                             struct osc_async_page *oap, int sent, int rc);
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd);
+static int osc_refresh_count(const struct lu_env *env,
+                            struct osc_async_page *oap, int cmd);
+static int osc_io_unplug_async(const struct lu_env *env,
+                              struct client_obd *cli, struct osc_object *osc);
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+                          unsigned int lost_grant);
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+                                 const char *func, int line);
+#define osc_extent_tree_dump(lvl, obj) \
+       osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/* ------------------ osc extent ------------------ */
+static inline char *ext_flags(struct osc_extent *ext, char *flags)
+{
+       char *buf = flags;
+       *buf++ = ext->oe_rw ? 'r' : 'w';
+       if (ext->oe_intree)
+               *buf++ = 'i';
+       if (ext->oe_srvlock)
+               *buf++ = 's';
+       if (ext->oe_hp)
+               *buf++ = 'h';
+       if (ext->oe_urgent)
+               *buf++ = 'u';
+       if (ext->oe_memalloc)
+               *buf++ = 'm';
+       if (ext->oe_trunc_pending)
+               *buf++ = 't';
+       if (ext->oe_fsync_wait)
+               *buf++ = 'Y';
+       *buf = 0;
+       return flags;
+}
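+
+/* For example, ext_flags() yields "wiu" for a write extent that is linked
+ * into the object's rb-tree (oe_intree) and marked urgent; a plain read
+ * extent with no other state set is just "r". */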
+
+static inline char list_empty_marker(struct list_head *list)
+{
+       return list_empty(list) ? '-' : '+';
+}
+
+#define EXTSTR       "[%lu -> %lu/%lu]"
+#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end
+static const char *oes_strings[] = {
+       "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };
+
+#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {                          \
+       struct osc_extent *__ext = (extent);                                  \
+       char __buf[16];                                                       \
+                                                                             \
+       CDEBUG(lvl,                                                           \
+               "extent %p@{" EXTSTR ", "                                     \
+               "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,          \
+               /* ----- extent part 0 ----- */                               \
+               __ext, EXTPARA(__ext),                                        \
+               /* ----- part 1 ----- */                                      \
+               atomic_read(&__ext->oe_refc),                         \
+               atomic_read(&__ext->oe_users),                        \
+               list_empty_marker(&__ext->oe_link),                           \
+               oes_strings[__ext->oe_state], ext_flags(__ext, __buf),        \
+               __ext->oe_obj,                                                \
+               /* ----- part 2 ----- */                                      \
+               __ext->oe_grants, __ext->oe_nr_pages,                         \
+               list_empty_marker(&__ext->oe_pages),                          \
+               waitqueue_active(&__ext->oe_waitq) ? '+' : '-',               \
+               __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,           \
+               /* ----- part 3 ----- */                                      \
+               ## __VA_ARGS__);                                              \
+} while (0)
+
+#undef EASSERTF
+#define EASSERTF(expr, ext, fmt, args...) do {                         \
+       if (!(expr)) {                                                  \
+               OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);           \
+               osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);           \
+               LASSERT(expr);                                          \
+       }                                                               \
+} while (0)
+
+#undef EASSERT
+#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
+
+static inline struct osc_extent *rb_extent(struct rb_node *n)
+{
+       if (n == NULL)
+               return NULL;
+
+       return container_of(n, struct osc_extent, oe_node);
+}
+
+static inline struct osc_extent *next_extent(struct osc_extent *ext)
+{
+       if (ext == NULL)
+               return NULL;
+
+       LASSERT(ext->oe_intree);
+       return rb_extent(rb_next(&ext->oe_node));
+}
+
+static inline struct osc_extent *prev_extent(struct osc_extent *ext)
+{
+       if (ext == NULL)
+               return NULL;
+
+       LASSERT(ext->oe_intree);
+       return rb_extent(rb_prev(&ext->oe_node));
+}
+
+static inline struct osc_extent *first_extent(struct osc_object *obj)
+{
+       return rb_extent(rb_first(&obj->oo_root));
+}
+
+/* object must be locked by caller. */
+static int osc_extent_sanity_check0(struct osc_extent *ext,
+                                   const char *func, const int line)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct osc_async_page *oap;
+       int page_count;
+       int rc = 0;
+
+       if (!osc_object_is_locked(obj))
+               GOTO(out, rc = 9);
+
+       if (ext->oe_state >= OES_STATE_MAX)
+               GOTO(out, rc = 10);
+
+       if (atomic_read(&ext->oe_refc) <= 0)
+               GOTO(out, rc = 20);
+
+       if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users))
+               GOTO(out, rc = 30);
+
+       switch (ext->oe_state) {
+       case OES_INV:
+               if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
+                       GOTO(out, rc = 35);
+               GOTO(out, rc = 0);
+       case OES_ACTIVE:
+               if (atomic_read(&ext->oe_users) == 0)
+                       GOTO(out, rc = 40);
+               if (ext->oe_hp)
+                       GOTO(out, rc = 50);
+               if (ext->oe_fsync_wait && !ext->oe_urgent)
+                       GOTO(out, rc = 55);
+               break;
+       case OES_CACHE:
+               if (ext->oe_grants == 0)
+                       GOTO(out, rc = 60);
+               if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp)
+                       GOTO(out, rc = 65);
+               /* fall through: the default checks apply to OES_CACHE too */
+       default:
+               if (atomic_read(&ext->oe_users) > 0)
+                       GOTO(out, rc = 70);
+       }
+
+       if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
+               GOTO(out, rc = 80);
+
+       if (ext->oe_osclock == NULL && ext->oe_grants > 0)
+               GOTO(out, rc = 90);
+
+       if (ext->oe_osclock) {
+               struct cl_lock_descr *descr;
+               descr = &ext->oe_osclock->cll_descr;
+               if (!(descr->cld_start <= ext->oe_start &&
+                     descr->cld_end >= ext->oe_max_end))
+                       GOTO(out, rc = 100);
+       }
+
+       if (ext->oe_nr_pages > ext->oe_mppr)
+               GOTO(out, rc = 105);
+
+       /* Do not verify page list if extent is in RPC. This is because an
+        * in-RPC extent is supposed to be exclusively accessible w/o lock. */
+       if (ext->oe_state > OES_CACHE)
+               GOTO(out, rc = 0);
+
+       if (!extent_debug)
+               GOTO(out, rc = 0);
+
+       page_count = 0;
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               pgoff_t index = oap2cl_page(oap)->cp_index;
+               ++page_count;
+               if (index > ext->oe_end || index < ext->oe_start)
+                       GOTO(out, rc = 110);
+       }
+       if (page_count != ext->oe_nr_pages)
+               GOTO(out, rc = 120);
+
+out:
+       if (rc != 0)
+               OSC_EXTENT_DUMP(D_ERROR, ext,
+                               "%s:%d sanity check %p failed with rc = %d\n",
+                               func, line, ext, rc);
+       return rc;
+}
+
+#define sanity_check_nolock(ext) \
+       osc_extent_sanity_check0(ext, __func__, __LINE__)
+
+#define sanity_check(ext) ({                                              \
+       int __res;                                                           \
+       osc_object_lock((ext)->oe_obj);                                 \
+       __res = sanity_check_nolock(ext);                                     \
+       osc_object_unlock((ext)->oe_obj);                                     \
+       __res;                                                           \
+})
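+
+/* Each failed check above returns a distinct code (9..120), so the
+ * OSC_EXTENT_DUMP(D_ERROR, ...) at the out: label pinpoints exactly which
+ * invariant was violated. */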
+
+
+/**
+ * sanity check - to make sure there is no overlapped extent in the tree.
+ */
+static int osc_extent_is_overlapped(struct osc_object *obj,
+                                   struct osc_extent *ext)
+{
+       struct osc_extent *tmp;
+
+       LASSERT(osc_object_is_locked(obj));
+
+       if (!extent_debug)
+               return 0;
+
+       for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) {
+               if (tmp == ext)
+                       continue;
+               if (tmp->oe_end >= ext->oe_start &&
+                   tmp->oe_start <= ext->oe_end)
+                       return 1;
+       }
+       return 0;
+}
+
+static void osc_extent_state_set(struct osc_extent *ext, int state)
+{
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       LASSERT(state >= OES_INV && state < OES_STATE_MAX);
+
+       /* Never try to sanity check a state changing extent :-) */
+       /* LASSERT(sanity_check_nolock(ext) == 0); */
+
+       /* TODO: validate the state machine */
+       ext->oe_state = state;
+       wake_up_all(&ext->oe_waitq);
+}
+
+static struct osc_extent *osc_extent_alloc(struct osc_object *obj)
+{
+       struct osc_extent *ext;
+
+       OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS);
+       if (ext == NULL)
+               return NULL;
+
+       RB_CLEAR_NODE(&ext->oe_node);
+       ext->oe_obj = obj;
+       atomic_set(&ext->oe_refc, 1);
+       atomic_set(&ext->oe_users, 0);
+       INIT_LIST_HEAD(&ext->oe_link);
+       ext->oe_state = OES_INV;
+       INIT_LIST_HEAD(&ext->oe_pages);
+       init_waitqueue_head(&ext->oe_waitq);
+       ext->oe_osclock = NULL;
+
+       return ext;
+}
+
+static void osc_extent_free(struct osc_extent *ext)
+{
+       OBD_SLAB_FREE_PTR(ext, osc_extent_kmem);
+}
+
+static struct osc_extent *osc_extent_get(struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) >= 0);
+       atomic_inc(&ext->oe_refc);
+       return ext;
+}
+
+static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) > 0);
+       if (atomic_dec_and_test(&ext->oe_refc)) {
+               LASSERT(list_empty(&ext->oe_link));
+               LASSERT(atomic_read(&ext->oe_users) == 0);
+               LASSERT(ext->oe_state == OES_INV);
+               LASSERT(!ext->oe_intree);
+
+               if (ext->oe_osclock) {
+                       cl_lock_put(env, ext->oe_osclock);
+                       ext->oe_osclock = NULL;
+               }
+               osc_extent_free(ext);
+       }
+}
+
+/**
+ * osc_extent_put_trust() is a special version of osc_extent_put() when
+ * it's known that the caller is not the last user. This is to address the
+ * problem of not having an lu_env at hand ;-).
+ */
+static void osc_extent_put_trust(struct osc_extent *ext)
+{
+       LASSERT(atomic_read(&ext->oe_refc) > 1);
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       atomic_dec(&ext->oe_refc);
+}
+
+/**
+ * Return the extent which includes pgoff @index, or return the greatest
+ * previous extent in the tree.
+ */
+static struct osc_extent *osc_extent_search(struct osc_object *obj,
+                                           pgoff_t index)
+{
+       struct rb_node    *n = obj->oo_root.rb_node;
+       struct osc_extent *tmp, *p = NULL;
+
+       LASSERT(osc_object_is_locked(obj));
+       while (n != NULL) {
+               tmp = rb_extent(n);
+               if (index < tmp->oe_start) {
+                       n = n->rb_left;
+               } else if (index > tmp->oe_end) {
+                       p = rb_extent(n);
+                       n = n->rb_right;
+               } else {
+                       return tmp;
+               }
+       }
+       return p;
+}
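+
+/* Example: with extents [0 -> 3] and [8 -> 11] in the tree,
+ * osc_extent_search(obj, 5) returns [0 -> 3] (the greatest predecessor),
+ * while osc_extent_search(obj, 9) returns [8 -> 11] (the covering
+ * extent). */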
+
+/*
+ * Return the extent covering @index, or NULL if none does.
+ * The caller must hold the object lock.
+ */
+static struct osc_extent *osc_extent_lookup(struct osc_object *obj,
+                                           pgoff_t index)
+{
+       struct osc_extent *ext;
+
+       ext = osc_extent_search(obj, index);
+       if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end)
+               return osc_extent_get(ext);
+       return NULL;
+}
+
+/* caller must have held object lock. */
+static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext)
+{
+       struct rb_node   **n      = &obj->oo_root.rb_node;
+       struct rb_node    *parent = NULL;
+       struct osc_extent *tmp;
+
+       LASSERT(ext->oe_intree == 0);
+       LASSERT(ext->oe_obj == obj);
+       LASSERT(osc_object_is_locked(obj));
+       while (*n != NULL) {
+               tmp = rb_extent(*n);
+               parent = *n;
+
+               if (ext->oe_end < tmp->oe_start)
+                       n = &(*n)->rb_left;
+               else if (ext->oe_start > tmp->oe_end)
+                       n = &(*n)->rb_right;
+               else
+                       EASSERTF(0, tmp, EXTSTR, EXTPARA(ext));
+       }
+       rb_link_node(&ext->oe_node, parent, n);
+       rb_insert_color(&ext->oe_node, &obj->oo_root);
+       osc_extent_get(ext);
+       ext->oe_intree = 1;
+}
+
+/* caller must have held object lock. */
+static void osc_extent_erase(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+       LASSERT(osc_object_is_locked(obj));
+       if (ext->oe_intree) {
+               rb_erase(&ext->oe_node, &obj->oo_root);
+               ext->oe_intree = 0;
+               /* rbtree held a refcount */
+               osc_extent_put_trust(ext);
+       }
+}
+
+static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+
+       LASSERT(osc_object_is_locked(obj));
+       LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
+       if (ext->oe_state == OES_CACHE) {
+               osc_extent_state_set(ext, OES_ACTIVE);
+               osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
+       }
+       atomic_inc(&ext->oe_users);
+       list_del_init(&ext->oe_link);
+       return osc_extent_get(ext);
+}
+
+static void __osc_extent_remove(struct osc_extent *ext)
+{
+       LASSERT(osc_object_is_locked(ext->oe_obj));
+       LASSERT(list_empty(&ext->oe_pages));
+       osc_extent_erase(ext);
+       list_del_init(&ext->oe_link);
+       osc_extent_state_set(ext, OES_INV);
+       OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
+}
+
+static void osc_extent_remove(struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+
+       osc_object_lock(obj);
+       __osc_extent_remove(ext);
+       osc_object_unlock(obj);
+}
+
+/**
+ * This function is used to merge extents to get better performance. It checks
+ * if @cur and @victim are contiguous at chunk level.
+ */
+static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
+                           struct osc_extent *victim)
+{
+       struct osc_object *obj = cur->oe_obj;
+       pgoff_t chunk_start;
+       pgoff_t chunk_end;
+       int ppc_bits;
+
+       LASSERT(cur->oe_state == OES_CACHE);
+       LASSERT(osc_object_is_locked(obj));
+       if (victim == NULL)
+               return -EINVAL;
+
+       if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
+               return -EBUSY;
+
+       if (cur->oe_max_end != victim->oe_max_end)
+               return -ERANGE;
+
+       LASSERT(cur->oe_osclock == victim->oe_osclock);
+       ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+       chunk_start = cur->oe_start >> ppc_bits;
+       chunk_end   = cur->oe_end   >> ppc_bits;
+       if (chunk_start   != (victim->oe_end >> ppc_bits) + 1 &&
+           chunk_end + 1 != victim->oe_start >> ppc_bits)
+               return -ERANGE;
+
+       OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);
+
+       cur->oe_start     = min(cur->oe_start, victim->oe_start);
+       cur->oe_end       = max(cur->oe_end,   victim->oe_end);
+       cur->oe_grants   += victim->oe_grants;
+       cur->oe_nr_pages += victim->oe_nr_pages;
+       /* only the following bits are needed to merge */
+       cur->oe_urgent   |= victim->oe_urgent;
+       cur->oe_memalloc |= victim->oe_memalloc;
+       list_splice_init(&victim->oe_pages, &cur->oe_pages);
+       list_del_init(&victim->oe_link);
+       victim->oe_nr_pages = 0;
+
+       osc_extent_get(victim);
+       __osc_extent_remove(victim);
+       osc_extent_put(env, victim);
+
+       OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
+       return 0;
+}
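+
+/* Merge example (assuming ppc_bits == 2, i.e. 4 pages per chunk, and that
+ * both extents share the same lock and oe_max_end): cur == [4 -> 7] and
+ * victim == [0 -> 3] are contiguous at chunk level, since victim's end
+ * chunk 0 + 1 == cur's start chunk 1, so they collapse into [0 -> 7] and
+ * victim's grants and pages move over to cur. */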
+
+/**
+ * Drop user count of osc_extent, and unplug IO asynchronously.
+ */
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
+{
+       struct osc_object *obj = ext->oe_obj;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(atomic_read(&ext->oe_users) > 0);
+       LASSERT(sanity_check(ext) == 0);
+       LASSERT(ext->oe_grants > 0);
+
+       if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
+               LASSERT(ext->oe_state == OES_ACTIVE);
+               if (ext->oe_trunc_pending) {
+                       /* a truncate process is waiting for this extent.
+                        * This may happen due to a race, check
+                        * osc_cache_truncate_start(). */
+                       osc_extent_state_set(ext, OES_TRUNC);
+                       ext->oe_trunc_pending = 0;
+               } else {
+                       osc_extent_state_set(ext, OES_CACHE);
+                       osc_update_pending(obj, OBD_BRW_WRITE,
+                                          ext->oe_nr_pages);
+
+                       /* try to merge the previous and next extent. */
+                       osc_extent_merge(env, ext, prev_extent(ext));
+                       osc_extent_merge(env, ext, next_extent(ext));
+
+                       if (ext->oe_urgent)
+                               list_move_tail(&ext->oe_link,
+                                                  &obj->oo_urgent_exts);
+               }
+               osc_object_unlock(obj);
+
+               osc_io_unplug_async(env, osc_cli(obj), obj);
+       }
+       osc_extent_put(env, ext);
+       RETURN(rc);
+}
+
+static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+{
+       return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
+}
+
+/**
+ * Find or create an extent which includes @index, core function to manage
+ * extent tree.
+ */
+struct osc_extent *osc_extent_find(const struct lu_env *env,
+                                  struct osc_object *obj, pgoff_t index,
+                                  int *grants)
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct cl_lock    *lock;
+       struct osc_extent *cur;
+       struct osc_extent *ext;
+       struct osc_extent *conflict = NULL;
+       struct osc_extent *found = NULL;
+       pgoff_t    chunk;
+       pgoff_t    max_end;
+       int     max_pages; /* max_pages_per_rpc */
+       int     chunksize;
+       int     ppc_bits; /* pages per chunk bits */
+       int     chunk_mask;
+       int     rc;
+       ENTRY;
+
+       cur = osc_extent_alloc(obj);
+       if (cur == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
+       LASSERT(lock != NULL);
+       LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+
+       LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
+       ppc_bits   = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       chunk_mask = ~((1 << ppc_bits) - 1);
+       chunksize  = 1 << cli->cl_chunkbits;
+       chunk      = index >> ppc_bits;
+
+       /* align end to RPC edge; the RPC size need not be a power of two. */
+       max_pages = cli->cl_max_pages_per_rpc;
+       LASSERT((max_pages & ~chunk_mask) == 0);
+       max_end = index - (index % max_pages) + max_pages - 1;
+       max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);
+
+       /* initialize new extent by parameters so far */
+       cur->oe_max_end = max_end;
+       cur->oe_start   = index & chunk_mask;
+       cur->oe_end     = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
+       if (cur->oe_start < lock->cll_descr.cld_start)
+               cur->oe_start = lock->cll_descr.cld_start;
+       if (cur->oe_end > max_end)
+               cur->oe_end = max_end;
+       cur->oe_osclock = lock;
+       cur->oe_grants  = 0;
+       cur->oe_mppr    = max_pages;
+
+       /* grants have been allocated by the caller */
+       LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
+                "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
+       LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));
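+
+       /* Worked example (assuming ppc_bits == 2, so chunk_mask == ~3):
+        * index == 5 gives oe_start == 4 and
+        * oe_end == ((5 + 4) & ~3) - 1 == 7, i.e. cur initially spans
+        * exactly the chunk containing @index. */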
+
+restart:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, cur->oe_start);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       while (ext != NULL) {
+               loff_t ext_chk_start = ext->oe_start >> ppc_bits;
+               loff_t ext_chk_end   = ext->oe_end   >> ppc_bits;
+
+               LASSERT(sanity_check_nolock(ext) == 0);
+               if (chunk > ext_chk_end + 1)
+                       break;
+
+               /* if covering by different locks, no chance to match */
+               if (lock != ext->oe_osclock) {
+                       EASSERTF(!overlapped(ext, cur), ext,
+                                EXTSTR, EXTPARA(cur));
+
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* discontiguous chunks? */
+               if (chunk + 1 < ext_chk_start) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* ok, from now on, ext and cur have these attrs:
+                * 1. covered by the same lock
+                * 2. contiguous at chunk level or overlapping. */
+
+               if (overlapped(ext, cur)) {
+                       /* cur is the minimum unit, so overlapping means
+                        * full containment. */
+                       EASSERTF((ext->oe_start <= cur->oe_start &&
+                                 ext->oe_end >= cur->oe_end),
+                                ext, EXTSTR, EXTPARA(cur));
+
+                       if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
+                               /* for simplicity, we wait for this extent to
+                                * finish before going forward. */
+                               conflict = osc_extent_get(ext);
+                               break;
+                       }
+
+                       found = osc_extent_hold(ext);
+                       break;
+               }
+
+               /* non-overlapped extent */
+               if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
+                       /* we can't do anything for a non OES_CACHE extent, or
+                        * if there is someone waiting for this extent to be
+                        * flushed, try next one. */
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* check if they belong to the same rpc slot before trying to
+                * merge. the extents are not overlapped and contiguous at
+                * chunk level to get here. */
+               if (ext->oe_max_end != max_end) {
+                       /* if they don't belong to the same RPC slot or
+                        * max_pages_per_rpc has ever changed, do not merge. */
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               /* it's required that an extent must be contiguous at chunk
+                * level so that we know the whole extent is covered by grant
+                * (the pages in the extent are NOT required to be contiguous).
+                * Otherwise, it would be too difficult to know which
+                * chunks have grants allocated. */
+
+               /* try to do front merge - extend ext's start */
+               if (chunk + 1 == ext_chk_start) {
+                       /* ext must be chunk size aligned */
+                       EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
+
+                       /* pull ext's start back to cover cur */
+                       ext->oe_start   = cur->oe_start;
+                       ext->oe_grants += chunksize;
+                       *grants -= chunksize;
+
+                       found = osc_extent_hold(ext);
+               } else if (chunk == ext_chk_end + 1) {
+                       /* rear merge */
+                       ext->oe_end     = cur->oe_end;
+                       ext->oe_grants += chunksize;
+                       *grants -= chunksize;
+
+                       /* try to merge with the next one because we just
+                        * filled in a gap */
+                       if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
+                               /* we can save extent tax from next extent */
+                               *grants += cli->cl_extent_tax;
+
+                       found = osc_extent_hold(ext);
+               }
+               if (found != NULL)
+                       break;
+
+               ext = next_extent(ext);
+       }
+
+       osc_extent_tree_dump(D_CACHE, obj);
+       if (found != NULL) {
+               LASSERT(conflict == NULL);
+               if (!IS_ERR(found)) {
+                       LASSERT(found->oe_osclock == cur->oe_osclock);
+                       OSC_EXTENT_DUMP(D_CACHE, found,
+                                       "found caching ext for %lu.\n", index);
+               }
+       } else if (conflict == NULL) {
+               /* create a new extent */
+               EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
+               cur->oe_grants = chunksize + cli->cl_extent_tax;
+               *grants -= cur->oe_grants;
+               LASSERT(*grants >= 0);
+
+               cur->oe_state = OES_CACHE;
+               found = osc_extent_hold(cur);
+               osc_extent_insert(obj, cur);
+               OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
+                               index, lock->cll_descr.cld_end);
+       }
+       osc_object_unlock(obj);
+
+       if (conflict != NULL) {
+               LASSERT(found == NULL);
+
+               /* wait for IO to finish.  Note that the conflicting extent
+                * cannot be an OES_TRUNC extent here. */
+               rc = osc_extent_wait(env, conflict, OES_INV);
+               osc_extent_put(env, conflict);
+               conflict = NULL;
+               if (rc < 0)
+                       GOTO(out, found = ERR_PTR(rc));
+
+               goto restart;
+       }
+       EXIT;
+
+out:
+       osc_extent_put(env, cur);
+       LASSERT(*grants >= 0);
+       return found;
+}
+
+/**
+ * Called when IO is finished to an extent.
+ */
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+                     int sent, int rc)
+{
+       struct client_obd *cli = osc_cli(ext->oe_obj);
+       struct osc_async_page *oap;
+       struct osc_async_page *tmp;
+       int nr_pages = ext->oe_nr_pages;
+       int lost_grant = 0;
+       int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
+       __u64 last_off = 0;
+       int last_count = -1;
+       ENTRY;
+
+       OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
+
+       ext->oe_rc = rc ?: ext->oe_nr_pages;
+       EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
+       list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+                                    oap_pending_item) {
+               list_del_init(&oap->oap_rpc_item);
+               list_del_init(&oap->oap_pending_item);
+               if (last_off <= oap->oap_obj_off) {
+                       last_off = oap->oap_obj_off;
+                       last_count = oap->oap_count;
+               }
+
+               --ext->oe_nr_pages;
+               osc_ap_completion(env, cli, oap, sent, rc);
+       }
+       EASSERT(ext->oe_nr_pages == 0, ext);
+
+       if (!sent) {
+               lost_grant = ext->oe_grants;
+       } else if (blocksize < PAGE_CACHE_SIZE &&
+                  last_count != PAGE_CACHE_SIZE) {
+               /* For short writes we shouldn't count parts of pages that
+                * span a whole chunk on the OST side, or our accounting goes
+                * wrong.  Should match the code in filter_grant_check. */
+               /* @oap is a stale list cursor after the loop above; use the
+                * captured last_off/last_count of the highest page instead. */
+               int offset = last_off & ~CFS_PAGE_MASK;
+               int count = last_count + (offset & (blocksize - 1));
+               int end = (offset + last_count) & (blocksize - 1);
+               if (end)
+                       count += blocksize - end;
+
+               lost_grant = PAGE_CACHE_SIZE - count;
+       }
+       if (ext->oe_grants > 0)
+               osc_free_grant(cli, nr_pages, lost_grant);
+
+       osc_extent_remove(ext);
+       /* put the refcount for RPC */
+       osc_extent_put(env, ext);
+       RETURN(0);
+}
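+
+/* Short-write grant example (assuming a 1 KiB server blocksize and 4 KiB
+ * pages): if the extent was sent and its last page carried 512 bytes at
+ * in-page offset 0, count rounds up to one full 1 KiB block, so
+ * lost_grant == 4096 - 1024 == 3072; the grant covering the three
+ * untouched blocks of that page is handed back through osc_free_grant(). */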
+
+static int extent_wait_cb(struct osc_extent *ext, int state)
+{
+       int ret;
+
+       osc_object_lock(ext->oe_obj);
+       ret = ext->oe_state == state;
+       osc_object_unlock(ext->oe_obj);
+
+       return ret;
+}
+
+/**
+ * Wait for the extent's state to become @state.
+ */
+static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
+                          int state)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
+                                                 LWI_ON_SIGNAL_NOOP, NULL);
+       int rc = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       LASSERT(sanity_check_nolock(ext) == 0);
+       /* `Kick' this extent only if the caller is waiting for it to be
+        * written out. */
+       if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) {
+               if (ext->oe_state == OES_ACTIVE) {
+                       ext->oe_urgent = 1;
+               } else if (ext->oe_state == OES_CACHE) {
+                       ext->oe_urgent = 1;
+                       osc_extent_hold(ext);
+                       rc = 1;
+               }
+       }
+       osc_object_unlock(obj);
+       if (rc == 1)
+               osc_extent_release(env, ext);
+
+       /* wait for the extent until its state becomes @state */
+       rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
+       if (rc == -ETIMEDOUT) {
+               OSC_EXTENT_DUMP(D_ERROR, ext,
+                       "%s: wait extent state %d timed out, recovery in progress?\n",
+                       osc_export(obj)->exp_obd->obd_name, state);
+
+               lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
+                                 &lwi);
+       }
+       if (rc == 0 && ext->oe_rc < 0)
+               rc = ext->oe_rc;
+       RETURN(rc);
+}
+
+/**
+ * Discard pages with index greater than @trunc_index. If @ext overlaps
+ * @trunc_index, then a partial truncate happens.
+ */
+static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
+                               bool partial)
+{
+       struct cl_env_nest     nest;
+       struct lu_env    *env;
+       struct cl_io      *io;
+       struct osc_object     *obj = ext->oe_obj;
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_async_page *oap;
+       struct osc_async_page *tmp;
+       int                 pages_in_chunk = 0;
+       int                 ppc_bits    = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       __u64             trunc_chunk = trunc_index >> ppc_bits;
+       int                 grants   = 0;
+       int                 nr_pages = 0;
+       int                 rc       = 0;
+       ENTRY;
+
+       LASSERT(sanity_check(ext) == 0);
+       LASSERT(ext->oe_state == OES_TRUNC);
+       LASSERT(!ext->oe_urgent);
+
+       /* Request a new lu_env.
+        * We can't use the env from osc_cache_truncate_start() because
+        * it comes from lov_io_sub and is not fully initialized. */
+       env = cl_env_nested_get(&nest);
+       io  = &osc_env_info(env)->oti_io;
+       io->ci_obj = cl_object_top(osc2cl(obj));
+       rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* discard all pages with index greater than trunc_index */
+       list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
+                                    oap_pending_item) {
+               struct cl_page  *sub  = oap2cl_page(oap);
+               struct cl_page  *page = cl_page_top(sub);
+
+               LASSERT(list_empty(&oap->oap_rpc_item));
+
+               /* only discard pages whose index is greater than trunc_index;
+                * for a partial truncate the page at trunc_index itself is
+                * kept as well */
+               if (sub->cp_index < trunc_index ||
+                   (sub->cp_index == trunc_index && partial)) {
+                       /* count how many pages remain in the chunk so that
+                        * grants can be calculated correctly. */
+                       if (sub->cp_index >> ppc_bits == trunc_chunk)
+                               ++pages_in_chunk;
+                       continue;
+               }
+
+               list_del_init(&oap->oap_pending_item);
+
+               cl_page_get(page);
+               lu_ref_add(&page->cp_reference, "truncate", current);
+
+               if (cl_page_own(env, io, page) == 0) {
+                       cl_page_unmap(env, io, page);
+                       cl_page_discard(env, io, page);
+                       cl_page_disown(env, io, page);
+               } else {
+                       LASSERT(page->cp_state == CPS_FREEING);
+                       LASSERT(0);
+               }
+
+               lu_ref_del(&page->cp_reference, "truncate", current);
+               cl_page_put(env, page);
+
+               --ext->oe_nr_pages;
+               ++nr_pages;
+       }
+       EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
+                     ext->oe_nr_pages == 0),
+               ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
+
+       osc_object_lock(obj);
+       if (ext->oe_nr_pages == 0) {
+               LASSERT(pages_in_chunk == 0);
+               grants = ext->oe_grants;
+               ext->oe_grants = 0;
+       } else { /* calculate how many grants we can free */
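+               /* Illustrative example (hypothetical sizes): with 4K pages
+                * and cl_chunkbits = 16, ppc_bits = 4 (16 pages per chunk).
+                * For an extent covering chunks 0-3 truncated within chunk 1
+                * while pages remain in that chunk, chunks = 3 - 1 = 2, so
+                * two chunks (128K) of grant are freed and oe_end shrinks to
+                * the last page of chunk 1. */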
+               int     chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
+               pgoff_t last_index;
+
+               /* if there are no pages left in this chunk, we can also free
+                * grants for the last chunk */
+               if (pages_in_chunk == 0) {
+                       /* if this were the first chunk and it had no pages,
+                        * ext->oe_nr_pages would be zero and we would have
+                        * taken the other branch above. */
+                       LASSERT(trunc_chunk > 0);
+                       --trunc_chunk;
+                       ++chunks;
+               }
+
+               /* this is what we can free from this extent */
+               grants    = chunks << cli->cl_chunkbits;
+               ext->oe_grants -= grants;
+               last_index      = ((trunc_chunk + 1) << ppc_bits) - 1;
+               ext->oe_end     = min(last_index, ext->oe_max_end);
+               LASSERT(ext->oe_end >= ext->oe_start);
+               LASSERT(ext->oe_grants > 0);
+       }
+       osc_object_unlock(obj);
+
+       if (grants > 0 || nr_pages > 0)
+               osc_free_grant(cli, nr_pages, grants);
+
+out:
+       cl_io_fini(env, io);
+       cl_env_nested_put(&nest, env);
+       RETURN(rc);
+}
+
+/**
+ * Make the extent ready for transfer.
+ * A race with page flushing - ll_writepage() - has to be handled cautiously.
+ */
+static int osc_extent_make_ready(const struct lu_env *env,
+                                struct osc_extent *ext)
+{
+       struct osc_async_page *oap;
+       struct osc_async_page *last = NULL;
+       struct osc_object *obj = ext->oe_obj;
+       int page_count = 0;
+       int rc;
+       ENTRY;
+
+       /* we're going to grab page lock, so object lock must not be taken. */
+       LASSERT(sanity_check(ext) == 0);
+       /* while in OES_LOCKING state, no other process should touch this
+        * extent. */
+       EASSERT(ext->oe_state == OES_LOCKING, ext);
+       EASSERT(ext->oe_owner != NULL, ext);
+
+       OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
+
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               ++page_count;
+               if (last == NULL || last->oap_obj_off < oap->oap_obj_off)
+                       last = oap;
+
+               /* checking ASYNC_READY is race safe */
+               if ((oap->oap_async_flags & ASYNC_READY) != 0)
+                       continue;
+
+               rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
+               switch (rc) {
+               case 0:
+                       spin_lock(&oap->oap_lock);
+                       oap->oap_async_flags |= ASYNC_READY;
+                       spin_unlock(&oap->oap_lock);
+                       break;
+               case -EALREADY:
+                       LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
+                       break;
+               default:
+                       LASSERTF(0, "unknown return code: %d\n", rc);
+               }
+       }
+
+       LASSERT(page_count == ext->oe_nr_pages);
+       LASSERT(last != NULL);
+       /* the last page is the only one whose count may need to be refreshed
+        * against the file size. */
+       if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
+               last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
+               LASSERT(last->oap_count > 0);
+               LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+               last->oap_async_flags |= ASYNC_COUNT_STABLE;
+       }
+
+       /* for the remaining pages, we don't need to call osc_refresh_count()
+        * because they are known not to be the last page */
+       list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+               if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
+                       oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+                       oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+               }
+       }
+
+       osc_object_lock(obj);
+       osc_extent_state_set(ext, OES_RPC);
+       osc_object_unlock(obj);
+       /* get a refcount for RPC. */
+       osc_extent_get(ext);
+
+       RETURN(0);
+}
+
+/**
+ * Quick and simple version of osc_extent_find(). This function is frequently
+ * called to expand an extent for the same IO. To expand the extent, the page
+ * index must be in the same chunk as ext->oe_end or in the chunk right after
+ * it.
+ */
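+/* Illustrative example (hypothetical sizes): with 4K pages and
+ * cl_chunkbits = 16, ppc_bits = 4 (16 pages per chunk). An extent ending at
+ * page 15 (chunk 0) can expand to cover an index in chunk 1 for one chunk
+ * (64K) of extra grant; an index in chunk 2 yields -ERANGE and the caller
+ * falls back to osc_extent_find(). */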
+static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
+{
+       struct osc_object *obj = ext->oe_obj;
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *next;
+       int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+       pgoff_t chunk = index >> ppc_bits;
+       pgoff_t end_chunk;
+       pgoff_t end_index;
+       int chunksize = 1 << cli->cl_chunkbits;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ext->oe_max_end >= index && ext->oe_start <= index);
+       osc_object_lock(obj);
+       LASSERT(sanity_check_nolock(ext) == 0);
+       end_chunk = ext->oe_end >> ppc_bits;
+       if (chunk > end_chunk + 1)
+               GOTO(out, rc = -ERANGE);
+
+       if (end_chunk >= chunk)
+               GOTO(out, rc = 0);
+
+       LASSERT(end_chunk + 1 == chunk);
+       /* try to expand this extent to cover @index */
+       end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1);
+
+       next = next_extent(ext);
+       if (next != NULL && next->oe_start <= end_index)
+               /* complex mode - overlapped with the next extent,
+                * this case will be handled by osc_extent_find() */
+               GOTO(out, rc = -EAGAIN);
+
+       ext->oe_end = end_index;
+       ext->oe_grants += chunksize;
+       *grants -= chunksize;
+       LASSERT(*grants >= 0);
+       EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext,
+                "overlapped after expanding for %lu.\n", index);
+       EXIT;
+
+out:
+       osc_object_unlock(obj);
+       RETURN(rc);
+}
+
+static void osc_extent_tree_dump0(int level, struct osc_object *obj,
+                                 const char *func, int line)
+{
+       struct osc_extent *ext;
+       int cnt;
+
+       CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n",
+              obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc);
+
+       /* osc_object_lock(obj); */
+       cnt = 1;
+       for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext))
+               OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_hp_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++);
+
+       cnt = 1;
+       list_for_each_entry(ext, &obj->oo_reading_exts, oe_link)
+               OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++);
+       /* osc_object_unlock(obj); */
+}
+
+/* ------------------ osc extent end ------------------ */
+
+static inline int osc_is_ready(struct osc_object *osc)
+{
+       return !list_empty(&osc->oo_ready_item) ||
+              !list_empty(&osc->oo_hp_ready_item);
+}
+
+#define OSC_IO_DEBUG(OSC, STR, args...)                                               \
+       CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,     \
+              (OSC), osc_is_ready(OSC),                                       \
+              list_empty_marker(&(OSC)->oo_hp_ready_item),                    \
+              list_empty_marker(&(OSC)->oo_ready_item),                       \
+              atomic_read(&(OSC)->oo_nr_writes),                              \
+              list_empty_marker(&(OSC)->oo_hp_exts),                          \
+              list_empty_marker(&(OSC)->oo_urgent_exts),                      \
+              atomic_read(&(OSC)->oo_nr_reads),                               \
+              list_empty_marker(&(OSC)->oo_reading_exts),                     \
+              ##args)
+
+static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd)
+{
+       struct osc_page *opg  = oap2osc_page(oap);
+       struct cl_page  *page = cl_page_top(oap2cl_page(oap));
+       int result;
+
+       LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+
+       ENTRY;
+       result = cl_page_make_ready(env, page, CRT_WRITE);
+       if (result == 0)
+               opg->ops_submit_time = cfs_time_current();
+       RETURN(result);
+}
+
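+/* Return how many bytes of @oap's page should be written, clipped by kms.
+ * Illustrative example (hypothetical sizes): with 4K pages and kms = 10000,
+ * a page fully below kms returns 4096, the page straddling kms returns
+ * 10000 % 4096 = 1808, and a page at or beyond kms returns 0 (raced with
+ * truncate). */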
+static int osc_refresh_count(const struct lu_env *env,
+                            struct osc_async_page *oap, int cmd)
+{
+       struct osc_page  *opg = oap2osc_page(oap);
+       struct cl_page   *page = oap2cl_page(oap);
+       struct cl_object *obj;
+       struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+
+       int result;
+       loff_t kms;
+
+       /* readpage queues with _COUNT_STABLE, shouldn't get here. */
+       LASSERT(!(cmd & OBD_BRW_READ));
+       LASSERT(opg != NULL);
+       obj = opg->ops_cl.cpl_obj;
+
+       cl_object_attr_lock(obj);
+       result = cl_object_attr_get(env, obj, attr);
+       cl_object_attr_unlock(obj);
+       if (result < 0)
+               return result;
+       kms = attr->cat_kms;
+       if (cl_offset(obj, page->cp_index) >= kms)
+               /* catch race with truncate */
+               return 0;
+       else if (cl_offset(obj, page->cp_index + 1) > kms)
+               /* catch sub-page write at end of file */
+               return kms % PAGE_CACHE_SIZE;
+       else
+               return PAGE_CACHE_SIZE;
+}
+
+static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
+                         int cmd, int rc)
+{
+       struct osc_page   *opg  = oap2osc_page(oap);
+       struct cl_page    *page = cl_page_top(oap2cl_page(oap));
+       struct osc_object *obj  = cl2osc(opg->ops_cl.cpl_obj);
+       enum cl_req_type   crt;
+       int srvlock;
+
+       ENTRY;
+
+       cmd &= ~OBD_BRW_NOQUOTA;
+       LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
+       LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+       LASSERT(opg->ops_transfer_pinned);
+
+       /*
+        * page->cp_req can be NULL if io submission failed before
+        * cl_req was allocated.
+        */
+       if (page->cp_req != NULL)
+               cl_req_page_done(env, page);
+       LASSERT(page->cp_req == NULL);
+
+       crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+       /* Clear opg->ops_transfer_pinned before VM lock is released. */
+       opg->ops_transfer_pinned = 0;
+
+       spin_lock(&obj->oo_seatbelt);
+       LASSERT(opg->ops_submitter != NULL);
+       LASSERT(!list_empty(&opg->ops_inflight));
+       list_del_init(&opg->ops_inflight);
+       opg->ops_submitter = NULL;
+       spin_unlock(&obj->oo_seatbelt);
+
+       opg->ops_submit_time = 0;
+       srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;
+
+       /* statistic */
+       if (rc == 0 && srvlock) {
+               struct lu_device *ld    = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+               struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+               int bytes = oap->oap_count;
+
+               if (crt == CRT_READ)
+                       stats->os_lockless_reads += bytes;
+               else
+                       stats->os_lockless_writes += bytes;
+       }
+
+       /*
+        * This has to be the last operation with the page, as locks are
+        * released in cl_page_completion() and nothing except for the
+        * reference counter protects page from concurrent reclaim.
+        */
+       lu_ref_del(&page->cp_reference, "transfer", page);
+
+       cl_page_completion(env, page, crt, rc);
+
+       RETURN(0);
+}
+
+#define OSC_DUMP_GRANT(cli, fmt, args...) do {                               \
+       struct client_obd *__tmp = (cli);                                     \
+       CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "            \
+              "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt,   \
+              __tmp->cl_import->imp_obd->obd_name,                           \
+              __tmp->cl_dirty, __tmp->cl_dirty_max,                          \
+              atomic_read(&obd_dirty_pages), obd_max_dirty_pages,            \
+              __tmp->cl_lost_grant, __tmp->cl_avail_grant,                   \
+              __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);      \
+} while (0)
+
+/* caller must hold loi_list_lock */
+static void osc_consume_write_grant(struct client_obd *cli,
+                                   struct brw_page *pga)
+{
+       LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+       LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
+       atomic_inc(&obd_dirty_pages);
+       cli->cl_dirty += PAGE_CACHE_SIZE;
+       pga->flag |= OBD_BRW_FROM_GRANT;
+       CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
+              PAGE_CACHE_SIZE, pga, pga->pg);
+       osc_update_next_shrink(cli);
+}
+
+/* the companion to osc_consume_write_grant, called when a brw has completed.
+ * must be called with the loi lock held. */
+static void osc_release_write_grant(struct client_obd *cli,
+                                   struct brw_page *pga)
+{
+       ENTRY;
+
+       LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock));
+       if (!(pga->flag & OBD_BRW_FROM_GRANT)) {
+               EXIT;
+               return;
+       }
+
+       pga->flag &= ~OBD_BRW_FROM_GRANT;
+       atomic_dec(&obd_dirty_pages);
+       cli->cl_dirty -= PAGE_CACHE_SIZE;
+       if (pga->flag & OBD_BRW_NOCACHE) {
+               pga->flag &= ~OBD_BRW_NOCACHE;
+               atomic_dec(&obd_dirty_transit_pages);
+               cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+       }
+       EXIT;
+}
+
+/**
+ * To avoid sleeping with the object lock held, it's good for us to allocate
+ * enough grant before entering the critical section.
+ *
+ * client_obd_list_lock held by caller
+ */
+static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
+{
+       int rc = -EDQUOT;
+
+       if (cli->cl_avail_grant >= bytes) {
+               cli->cl_avail_grant    -= bytes;
+               cli->cl_reserved_grant += bytes;
+               rc = 0;
+       }
+       return rc;
+}
+
+static void __osc_unreserve_grant(struct client_obd *cli,
+                                 unsigned int reserved, unsigned int unused)
+{
+       /* It's quite normal for us to get back more grant than was reserved.
+        * Consider the case where two extents are merged by adding a new
+        * chunk: one extent tax is saved. If the extent tax is greater than
+        * one chunk, adding a new chunk saves even more grant. */
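+       /* Illustrative example (hypothetical sizes): if one chunk plus a 4K
+        * extent tax was reserved for a page but merging two extents saved
+        * one tax, @unused exceeds @reserved by 4K; the surplus is accounted
+        * as lost grant to be returned to the OST later. */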
+       cli->cl_reserved_grant -= reserved;
+       if (unused > reserved) {
+               cli->cl_avail_grant += reserved;
+               cli->cl_lost_grant  += unused - reserved;
+       } else {
+               cli->cl_avail_grant += unused;
+       }
+}
+
+void osc_unreserve_grant(struct client_obd *cli,
+                        unsigned int reserved, unsigned int unused)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       __osc_unreserve_grant(cli, reserved, unused);
+       if (unused > 0)
+               osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Free grant after IO is finished or canceled.
+ *
+ * @lost_grant is used to remember how much grant we have allocated but not
+ * used; we should return this grant to the OST. There are two cases where
+ * grant can be lost:
+ * 1. truncate;
+ * 2. the blocksize at the OST is less than PAGE_CACHE_SIZE and a partial
+ *    page was written. In this case the OST may use fewer chunks to serve
+ *    the partial write, and since OSTs don't know the client's page size,
+ *    clients have to calculate the lost grant by the blocksize on the OST.
+ *    See filter_grant_check() for details.
+ */
+static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
+                          unsigned int lost_grant)
+{
+       int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       atomic_sub(nr_pages, &obd_dirty_pages);
+       cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+       cli->cl_lost_grant += lost_grant;
+       if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
+               /* borrow some grant from truncate to avoid the case where
+                * truncate uses up all the available grant */
+               cli->cl_lost_grant -= grant;
+               cli->cl_avail_grant += grant;
+       }
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
+              lost_grant, cli->cl_lost_grant,
+              cli->cl_avail_grant, cli->cl_dirty);
+}
+
+/**
+ * The companion to osc_enter_cache(), called when @oap is no longer part of
+ * the dirty accounting due to error.
+ */
+static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       osc_release_write_grant(cli, &oap->oap_brw_page);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * Non-blocking version of osc_enter_cache() that consumes grant only when it
+ * is available. Returns 1 if the grant was consumed and 0 otherwise.
+ */
+static int osc_enter_cache_try(struct client_obd *cli,
+                              struct osc_async_page *oap,
+                              int bytes, int transient)
+{
+       int rc;
+
+       OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+       rc = osc_reserve_grant(cli, bytes);
+       if (rc < 0)
+               return 0;
+
+       if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
+           atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
+               osc_consume_write_grant(cli, &oap->oap_brw_page);
+               if (transient) {
+                       cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+                       atomic_inc(&obd_dirty_transit_pages);
+                       oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+               }
+               rc = 1;
+       } else {
+               __osc_unreserve_grant(cli, bytes, bytes);
+               rc = 0;
+       }
+       return rc;
+}
+
+static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
+{
+       int rc;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       rc = list_empty(&ocw->ocw_entry);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       return rc;
+}
+
+/**
+ * The main entry to reserve dirty page accounting. Usually the grant reserved
+ * in this function will be freed in bulk in osc_free_grant() unless it fails
+ * to add the page to the osc cache, in which case it will be freed in
+ * osc_exit_cache().
+ *
+ * The process will be put to sleep if it has already run out of grant.
+ */
+static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
+                          struct osc_async_page *oap, int bytes)
+{
+       struct osc_object *osc = oap->oap_obj;
+       struct lov_oinfo  *loi = osc->oo_oinfo;
+       struct osc_cache_waiter ocw;
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       int rc = -EDQUOT;
+       ENTRY;
+
+       OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+
+       /* force the caller to try sync io.  this can jump the list
+        * of queued writes and create a discontiguous rpc stream */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
+           cli->cl_dirty_max < PAGE_CACHE_SIZE     ||
+           cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync)
+               GOTO(out, rc = -EDQUOT);
+
+       /* Hopefully normal case - cache space and write credits available */
+       if (osc_enter_cache_try(cli, oap, bytes, 0))
+               GOTO(out, rc = 0);
+
+       /* We can get here for two reasons: too many dirty pages in cache, or
+        * we have run out of grant. In both cases we should write dirty pages
+        * out. Adding a cache waiter will trigger urgent write-out no matter
+        * what the RPC size will be.
+        * The exit condition is no available grant and no dirty pages caching,
+        * which really means there is no space on the OST. */
+       init_waitqueue_head(&ocw.ocw_waitq);
+       ocw.ocw_oap   = oap;
+       ocw.ocw_grant = bytes;
+       while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
+               list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
+               ocw.ocw_rc = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+               osc_io_unplug_async(env, cli, NULL);
+
+               CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
+                      cli->cl_import->imp_obd->obd_name, &ocw, oap);
+
+               rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+
+               /* l_wait_event is interrupted by signal */
+               if (rc < 0) {
+                       list_del_init(&ocw.ocw_entry);
+                       GOTO(out, rc);
+               }
+
+               LASSERT(list_empty(&ocw.ocw_entry));
+               rc = ocw.ocw_rc;
+
+               if (rc != -EDQUOT)
+                       GOTO(out, rc);
+               if (osc_enter_cache_try(cli, oap, bytes, 0))
+                       GOTO(out, rc = 0);
+       }
+       EXIT;
+out:
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
+       RETURN(rc);
+}
+
+/* caller must hold loi_list_lock */
+void osc_wake_cache_waiters(struct client_obd *cli)
+{
+       struct list_head *l, *tmp;
+       struct osc_cache_waiter *ocw;
+
+       ENTRY;
+       list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
+               ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
+               list_del_init(&ocw->ocw_entry);
+
+               ocw->ocw_rc = -EDQUOT;
+               /* we can't dirty more */
+               if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) ||
+                   (atomic_read(&obd_dirty_pages) + 1 >
+                    obd_max_dirty_pages)) {
+                       CDEBUG(D_CACHE, "no dirty room: dirty: %ld "
+                              "osc max %ld, sys max %d\n", cli->cl_dirty,
+                              cli->cl_dirty_max, obd_max_dirty_pages);
+                       goto wakeup;
+               }
+
+               ocw->ocw_rc = 0;
+               if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
+                       ocw->ocw_rc = -EDQUOT;
+
+wakeup:
+               CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
+                      ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
+
+               wake_up(&ocw->ocw_waitq);
+       }
+
+       EXIT;
+}
+
+static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
+{
+       int hprpc = !!list_empty(&osc->oo_hp_exts);
+       return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
+}
+
+/* This maintains the lists of pending pages to read/write for a given object.
+ * It is used by osc_check_rpcs()->osc_next_obj() and osc_list_maint() to
+ * quickly find objects that are ready to send an RPC. */
+static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
+                        int cmd)
+{
+       int invalid_import = 0;
+       ENTRY;
+
+       /* if we have an invalid import we want to drain the queued pages
+        * by forcing them through rpcs that immediately fail and complete
+        * the pages.  recovery relies on this to empty the queued pages
+        * before canceling the locks and evicting the llite pages */
+       if ((cli->cl_import == NULL || cli->cl_import->imp_invalid))
+               invalid_import = 1;
+
+       if (cmd & OBD_BRW_WRITE) {
+               if (atomic_read(&osc->oo_nr_writes) == 0)
+                       RETURN(0);
+               if (invalid_import) {
+                       CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+                       RETURN(1);
+               }
+               if (!list_empty(&osc->oo_hp_exts)) {
+                       CDEBUG(D_CACHE, "high prio request forcing RPC\n");
+                       RETURN(1);
+               }
+               if (!list_empty(&osc->oo_urgent_exts)) {
+                       CDEBUG(D_CACHE, "urgent request forcing RPC\n");
+                       RETURN(1);
+               }
+               /* trigger a write rpc stream as long as there are dirtiers
+                * waiting for space.  As they're waiting, they're not going to
+                * create more pages to coalesce with what's already waiting. */
+               if (!list_empty(&cli->cl_cache_waiters)) {
+                       CDEBUG(D_CACHE, "cache waiters forcing RPC\n");
+                       RETURN(1);
+               }
+               if (atomic_read(&osc->oo_nr_writes) >=
+                   cli->cl_max_pages_per_rpc)
+                       RETURN(1);
+       } else {
+               if (atomic_read(&osc->oo_nr_reads) == 0)
+                       RETURN(0);
+               if (invalid_import) {
+                       CDEBUG(D_CACHE, "invalid import forcing RPC\n");
+                       RETURN(1);
+               }
+               /* all reads are urgent. */
+               if (!list_empty(&osc->oo_reading_exts))
+                       RETURN(1);
+       }
+
+       RETURN(0);
+}
+
+static void osc_update_pending(struct osc_object *obj, int cmd, int delta)
+{
+       struct client_obd *cli = osc_cli(obj);
+       if (cmd & OBD_BRW_WRITE) {
+               atomic_add(delta, &obj->oo_nr_writes);
+               atomic_add(delta, &cli->cl_pending_w_pages);
+               LASSERT(atomic_read(&obj->oo_nr_writes) >= 0);
+       } else {
+               atomic_add(delta, &obj->oo_nr_reads);
+               atomic_add(delta, &cli->cl_pending_r_pages);
+               LASSERT(atomic_read(&obj->oo_nr_reads) >= 0);
+       }
+       OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta);
+}
+
+static int osc_makes_hprpc(struct osc_object *obj)
+{
+       return !list_empty(&obj->oo_hp_exts);
+}
+
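+/* Add @item to @list if @should_be_on and it is not on a list yet; remove it
+ * if it is on a list but should not be. Otherwise do nothing. */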
+static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
+{
+       if (list_empty(item) && should_be_on)
+               list_add_tail(item, list);
+       else if (!list_empty(item) && !should_be_on)
+               list_del_init(item);
+}
+
+/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc
+ * can find pages to build into rpcs quickly */
+static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+       if (osc_makes_hprpc(osc)) {
+               /* HP rpc */
+               on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0);
+               on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1);
+       } else {
+               on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0);
+               on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list,
+                       osc_makes_rpc(cli, osc, OBD_BRW_WRITE) ||
+                       osc_makes_rpc(cli, osc, OBD_BRW_READ));
+       }
+
+       on_list(&osc->oo_write_item, &cli->cl_loi_write_list,
+               atomic_read(&osc->oo_nr_writes) > 0);
+
+       on_list(&osc->oo_read_item, &cli->cl_loi_read_list,
+               atomic_read(&osc->oo_nr_reads) > 0);
+
+       return osc_is_ready(osc);
+}
+
+static int osc_list_maint(struct client_obd *cli, struct osc_object *osc)
+{
+       int is_ready;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       is_ready = __osc_list_maint(cli, osc);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return is_ready;
+}
+
+/* This is trying to propagate async writeback errors back up to the
+ * application.  As an async write fails we record the error code for later if
+ * the app does an fsync.  As long as errors persist we force future rpcs to be
+ * sync so that the app can get a sync error and break the cycle of queueing
+ * pages for which writeback will fail. */
+static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
+                          int rc)
+{
+       if (rc) {
+               if (!ar->ar_rc)
+                       ar->ar_rc = rc;
+
+               ar->ar_force_sync = 1;
+               ar->ar_min_xid = ptlrpc_sample_next_xid();
+               return;
+       }
+
+       if (ar->ar_force_sync && (xid >= ar->ar_min_xid))
+               ar->ar_force_sync = 0;
+}
+
+/* this must be called holding the loi list lock to give coverage to exit_cache,
+ * async_flag maintenance, and oap_request */
+static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli,
+                             struct osc_async_page *oap, int sent, int rc)
+{
+       struct osc_object *osc = oap->oap_obj;
+       struct lov_oinfo  *loi = osc->oo_oinfo;
+       __u64 xid = 0;
+
+       ENTRY;
+       if (oap->oap_request != NULL) {
+               xid = ptlrpc_req_xid(oap->oap_request);
+               ptlrpc_req_finished(oap->oap_request);
+               oap->oap_request = NULL;
+       }
+
+       /* As the transfer for this page is being done, clear the flags */
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags = 0;
+       spin_unlock(&oap->oap_lock);
+       oap->oap_interrupted = 0;
+
+       if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) {
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               osc_process_ar(&cli->cl_ar, xid, rc);
+               osc_process_ar(&loi->loi_ar, xid, rc);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+       }
+
+       rc = osc_completion(env, oap, oap->oap_cmd, rc);
+       if (rc)
+               CERROR("completion on oap %p obj %p returns %d.\n",
+                      oap, osc, rc);
+
+       EXIT;
+}
+
+/**
+ * Try to add extent to one RPC. We need to think about the following things:
+ * - # of pages must not be over max_pages_per_rpc
+ * - extent must be compatible with previous ones
+ */
+static int try_to_add_extent_for_io(struct client_obd *cli,
+                                   struct osc_extent *ext, struct list_head *rpclist,
+                                   int *pc, unsigned int *max_pages)
+{
+       struct osc_extent *tmp;
+       ENTRY;
+
+       EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
+               ext);
+
+       *max_pages = max(ext->oe_mppr, *max_pages);
+       if (*pc + ext->oe_nr_pages > *max_pages)
+               RETURN(0);
+
+       list_for_each_entry(tmp, rpclist, oe_link) {
+               EASSERT(tmp->oe_owner == current, tmp);
+#if 0
+               if (overlapped(tmp, ext)) {
+                       OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
+                       EASSERT(0, ext);
+               }
+#endif
+
+               if (tmp->oe_srvlock != ext->oe_srvlock ||
+                   !tmp->oe_grants != !ext->oe_grants)
+                       RETURN(0);
+
+               /* remove break for strict check */
+               break;
+       }
+
+       *pc += ext->oe_nr_pages;
+       list_move_tail(&ext->oe_link, rpclist);
+       ext->oe_owner = current;
+       RETURN(1);
+}
+
+/**
+ * In order to prevent multiple ptlrpcd threads from breaking contiguous
+ * extents, get_write_extents() grabs all appropriate extents atomically.
+ *
+ * The following policy is used to collect extents for IO:
+ * 1. Add as many HP extents as possible;
+ * 2. Add the first urgent extent in the urgent extent list and take it out of
+ *    the urgent list;
+ * 3. Add subsequent extents of this urgent extent;
+ * 4. If the urgent list is not empty, goto 2;
+ * 5. Traverse the extent tree from the 1st extent;
+ * 6. Each of the above steps exits early once there is no space left in this
+ *    RPC.
+ */
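+/* (The first loop below implements step 1, the second loop steps 2-4, and
+ * the final tree walk step 5; each returns early once the RPC is full.) */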
+static int get_write_extents(struct osc_object *obj, struct list_head *rpclist)
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *ext;
+       int page_count = 0;
+       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+
+       LASSERT(osc_object_is_locked(obj));
+       while (!list_empty(&obj->oo_hp_exts)) {
+               ext = list_entry(obj->oo_hp_exts.next, struct osc_extent,
+                                    oe_link);
+               LASSERT(ext->oe_state == OES_CACHE);
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+       }
+       if (page_count == max_pages)
+               return page_count;
+
+       while (!list_empty(&obj->oo_urgent_exts)) {
+               ext = list_entry(obj->oo_urgent_exts.next,
+                                    struct osc_extent, oe_link);
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+
+               if (!ext->oe_intree)
+                       continue;
+
+               while ((ext = next_extent(ext)) != NULL) {
+                       if ((ext->oe_state != OES_CACHE) ||
+                           (!list_empty(&ext->oe_link) &&
+                            ext->oe_owner != NULL))
+                               continue;
+
+                       if (!try_to_add_extent_for_io(cli, ext, rpclist,
+                                                     &page_count, &max_pages))
+                               return page_count;
+               }
+       }
+       if (page_count == max_pages)
+               return page_count;
+
+       ext = first_extent(obj);
+       while (ext != NULL) {
+               if ((ext->oe_state != OES_CACHE) ||
+                   /* this extent may be already in current rpclist */
+                   (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count,
+                                             &max_pages))
+                       return page_count;
+
+               ext = next_extent(ext);
+       }
+       return page_count;
+}
+
+static int
+osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol)
+{
+       LIST_HEAD(rpclist);
+       struct osc_extent *ext;
+       struct osc_extent *tmp;
+       struct osc_extent *first = NULL;
+       obd_count page_count = 0;
+       int srvlock = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(osc_object_is_locked(osc));
+
+       page_count = get_write_extents(osc, &rpclist);
+       LASSERT(equi(page_count == 0, list_empty(&rpclist)));
+
+       if (list_empty(&rpclist))
+               RETURN(0);
+
+       osc_update_pending(osc, OBD_BRW_WRITE, -page_count);
+
+       list_for_each_entry(ext, &rpclist, oe_link) {
+               LASSERT(ext->oe_state == OES_CACHE ||
+                       ext->oe_state == OES_LOCK_DONE);
+               if (ext->oe_state == OES_CACHE)
+                       osc_extent_state_set(ext, OES_LOCKING);
+               else
+                       osc_extent_state_set(ext, OES_RPC);
+       }
+
+       /* we're going to grab page lock, so release object lock because
+        * lock order is page lock -> object lock. */
+       osc_object_unlock(osc);
+
+       list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
+               if (ext->oe_state == OES_LOCKING) {
+                       rc = osc_extent_make_ready(env, ext);
+                       if (unlikely(rc < 0)) {
+                               list_del_init(&ext->oe_link);
+                               osc_extent_finish(env, ext, 0, rc);
+                               continue;
+                       }
+               }
+               if (first == NULL) {
+                       first = ext;
+                       srvlock = ext->oe_srvlock;
+               } else {
+                       LASSERT(srvlock == ext->oe_srvlock);
+               }
+       }
+
+       if (!list_empty(&rpclist)) {
+               LASSERT(page_count > 0);
+               rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol);
+               LASSERT(list_empty(&rpclist));
+       }
+
+       osc_object_lock(osc);
+       RETURN(rc);
+}
+
+/**
+ * Prepare pages for ASYNC io and put them in the send queue.
+ *
+ * \param cmd OBD_BRW_* macros
+ * \param lop pending pages
+ *
+ * \return zero if no page added to send queue.
+ * \return 1 if pages successfully added to send queue.
+ * \return negative on errors.
+ */
+static int
+osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct osc_object *osc, pdl_policy_t pol)
+{
+       struct osc_extent *ext;
+       struct osc_extent *next;
+       LIST_HEAD(rpclist);
+       int page_count = 0;
+       unsigned int max_pages = cli->cl_max_pages_per_rpc;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(osc_object_is_locked(osc));
+       list_for_each_entry_safe(ext, next,
+                                    &osc->oo_reading_exts, oe_link) {
+               EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
+               if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count,
+                                             &max_pages))
+                       break;
+               osc_extent_state_set(ext, OES_RPC);
+               EASSERT(ext->oe_nr_pages <= max_pages, ext);
+       }
+       LASSERT(page_count <= max_pages);
+
+       osc_update_pending(osc, OBD_BRW_READ, -page_count);
+
+       if (!list_empty(&rpclist)) {
+               osc_object_unlock(osc);
+
+               LASSERT(page_count > 0);
+               rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol);
+               LASSERT(list_empty(&rpclist));
+
+               osc_object_lock(osc);
+       }
+       RETURN(rc);
+}
+
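+/* Detach the first entry of @list and return the osc_object that embeds it
+ * via its oo_##item member. */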
+#define list_to_obj(list, item) ({                                           \
+       struct list_head *__tmp = (list)->next;                               \
+       list_del_init(__tmp);                                         \
+       list_entry(__tmp, struct osc_object, oo_##item);                      \
+})
+
+/* This is called by osc_check_rpcs() to find which objects have pages that
+ * we could be sending.  These lists are maintained by osc_makes_rpc(). */
+static struct osc_object *osc_next_obj(struct client_obd *cli)
+{
+       ENTRY;
+
+       /* First return objects that have blocked locks so that they
+        * will be flushed quickly and other clients can get the lock,
+        * then objects which have pages ready to be stuffed into RPCs */
+       if (!list_empty(&cli->cl_loi_hp_ready_list))
+               RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item));
+       if (!list_empty(&cli->cl_loi_ready_list))
+               RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item));
+
+       /* then if we have cache waiters, return all objects with queued
+        * writes.  This is especially important when many small files
+        * have filled up the cache and not been fired into rpcs because
+        * they don't pass the nr_pending/object threshold */
+       if (!list_empty(&cli->cl_cache_waiters) &&
+           !list_empty(&cli->cl_loi_write_list))
+               RETURN(list_to_obj(&cli->cl_loi_write_list, write_item));
+
+       /* then return all queued objects when we have an invalid import
+        * so that they get flushed */
+       if (cli->cl_import == NULL || cli->cl_import->imp_invalid) {
+               if (!list_empty(&cli->cl_loi_write_list))
+                       RETURN(list_to_obj(&cli->cl_loi_write_list,
+                                          write_item));
+               if (!list_empty(&cli->cl_loi_read_list))
+                       RETURN(list_to_obj(&cli->cl_loi_read_list,
+                                          read_item));
+       }
+       RETURN(NULL);
+}
+
+/* called with the loi list lock held */
+static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli,
+                          pdl_policy_t pol)
+{
+       struct osc_object *osc;
+       int rc = 0;
+       ENTRY;
+
+       while ((osc = osc_next_obj(cli)) != NULL) {
+               struct cl_object *obj = osc2cl(osc);
+               struct lu_ref_link *link;
+
+               OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli));
+
+               if (osc_max_rpc_in_flight(cli, osc)) {
+                       __osc_list_maint(cli, osc);
+                       break;
+               }
+
+               cl_object_get(obj);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               link = lu_object_ref_add(&obj->co_lu, "check", current);
+
+               /* attempt some read/write balancing by alternating between
+                * reads and writes in an object.  The makes_rpc checks here
+                * would be redundant if we were getting read/write work items
+                * instead of objects.  we don't want send_oap_rpc to drain a
+                * partial read pending queue when we're given this object to
+                * do io on writes while there are cache waiters */
+               osc_object_lock(osc);
+               if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) {
+                       rc = osc_send_write_rpc(env, cli, osc, pol);
+                       if (rc < 0) {
+                               CERROR("Write request failed with %d\n", rc);
+
+                               /* osc_send_write_rpc failed, mostly because of
+                                * memory pressure.
+                                *
+                                * We can't break out here, because if:
+                                *  - a page was submitted by osc_io_submit,
+                                *    so the page is locked;
+                                *  - no request is in flight; and
+                                *  - no subsequent request will be issued;
+                                * then the system ends up in a live-lock
+                                * state, because there is no chance to call
+                                * osc_io_unplug() and osc_check_rpcs() any
+                                * more. pdflush can't help in this case,
+                                * because it might be blocked grabbing the
+                                * page lock as mentioned above.
+                                *
+                                * Anyway, continue to drain pages. */
+                               /* break; */
+                       }
+               }
+               if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) {
+                       rc = osc_send_read_rpc(env, cli, osc, pol);
+                       if (rc < 0)
+                               CERROR("Read request failed with %d\n", rc);
+               }
+               osc_object_unlock(osc);
+
+               osc_list_maint(cli, osc);
+               lu_object_ref_del_at(&obj->co_lu, link, "check", current);
+               cl_object_put(env, obj);
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+       }
+}
+
+static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli,
+                         struct osc_object *osc, pdl_policy_t pol, int async)
+{
+       int rc = 0;
+
+       if (osc != NULL && osc_list_maint(cli, osc) == 0)
+               return 0;
+
+       if (!async) {
+               /* disable osc_lru_shrink() temporarily to avoid
+                * potential stack overrun problem. LU-2859 */
+               atomic_inc(&cli->cl_lru_shrinkers);
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               osc_check_rpcs(env, cli, pol);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               atomic_dec(&cli->cl_lru_shrinkers);
+       } else {
+               CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli);
+               LASSERT(cli->cl_writeback_work != NULL);
+               rc = ptlrpcd_queue_work(cli->cl_writeback_work);
+       }
+       return rc;
+}
+
+static int osc_io_unplug_async(const struct lu_env *env,
+                               struct client_obd *cli, struct osc_object *osc)
+{
+       /* XXX: the policy argument is actually unused. */
+       return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1);
+}
+
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol)
+{
+       (void)osc_io_unplug0(env, cli, osc, pol, 0);
+}
+
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+                       struct page *page, loff_t offset)
+{
+       struct obd_export     *exp = osc_export(osc);
+       struct osc_async_page *oap = &ops->ops_oap;
+       ENTRY;
+
+       if (!page)
+               return cfs_size_round(sizeof(*oap));
+
+       oap->oap_magic = OAP_MAGIC;
+       oap->oap_cli = &exp->exp_obd->u.cli;
+       oap->oap_obj = osc;
+
+       oap->oap_page = page;
+       oap->oap_obj_off = offset;
+       LASSERT(!(offset & ~CFS_PAGE_MASK));
+
+       if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE))
+               oap->oap_brw_flags = OBD_BRW_NOQUOTA;
+
+       INIT_LIST_HEAD(&oap->oap_pending_item);
+       INIT_LIST_HEAD(&oap->oap_rpc_item);
+
+       spin_lock_init(&oap->oap_lock);
+       CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n",
+              oap, page, oap->oap_obj_off);
+       RETURN(0);
+}
+
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+                      struct osc_page *ops)
+{
+       struct osc_io *oio = osc_env_io(env);
+       struct osc_extent     *ext = NULL;
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct client_obd     *cli = oap->oap_cli;
+       struct osc_object     *osc = oap->oap_obj;
+       pgoff_t index;
+       int    grants = 0;
+       int    brw_flags = OBD_BRW_ASYNC;
+       int    cmd = OBD_BRW_WRITE;
+       int    need_release = 0;
+       int    rc = 0;
+       ENTRY;
+
+       if (oap->oap_magic != OAP_MAGIC)
+               RETURN(-EINVAL);
+
+       if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
+               RETURN(-EIO);
+
+       if (!list_empty(&oap->oap_pending_item) ||
+           !list_empty(&oap->oap_rpc_item))
+               RETURN(-EBUSY);
+
+       /* Set the OBD_BRW_SRVLOCK before the page is queued. */
+       brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
+       if (!client_is_remote(osc_export(osc)) &&
+           cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+               brw_flags |= OBD_BRW_NOQUOTA;
+               cmd |= OBD_BRW_NOQUOTA;
+       }
+
+       /* check if the file's owner/group is over quota */
+       if (!(cmd & OBD_BRW_NOQUOTA)) {
+               struct cl_object *obj;
+               struct cl_attr   *attr;
+               unsigned int qid[MAXQUOTAS];
+
+               obj = cl_object_top(&osc->oo_cl);
+               attr = &osc_env_info(env)->oti_attr;
+
+               cl_object_attr_lock(obj);
+               rc = cl_object_attr_get(env, obj, attr);
+               cl_object_attr_unlock(obj);
+
+               qid[USRQUOTA] = attr->cat_uid;
+               qid[GRPQUOTA] = attr->cat_gid;
+               if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA)
+                       rc = -EDQUOT;
+               if (rc)
+                       RETURN(rc);
+       }
+
+       oap->oap_cmd = cmd;
+       oap->oap_page_off = ops->ops_from;
+       oap->oap_count = ops->ops_to - ops->ops_from;
+       oap->oap_async_flags = 0;
+       oap->oap_brw_flags = brw_flags;
+
+       OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n",
+                    oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK);
+
+       index = oap2cl_page(oap)->cp_index;
+
+       /* Add this page into the extent by the following steps:
+        * 1. if there exists an active extent for this IO, this page can
+        *    usually be added to it, though sometimes we need to expand the
+        *    extent to accommodate this page;
+        * 2. otherwise, a new extent will be allocated. */
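+       /* (The active-extent branch below implements step 1; the ext == NULL
+        * branch further down implements step 2.) */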
+
+       ext = oio->oi_active;
+       if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) {
+               /* one chunk plus extent overhead must be enough to write this
+                * page */
+               grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+               if (ext->oe_end >= index)
+                       grants = 0;
+
+               /* it doesn't need any grant to dirty this page */
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               rc = osc_enter_cache_try(cli, oap, grants, 0);
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               if (rc == 0) { /* try failed */
+                       grants = 0;
+                       need_release = 1;
+               } else if (ext->oe_end < index) {
+                       int tmp = grants;
+                       /* try to expand this extent */
+                       rc = osc_extent_expand(ext, index, &tmp);
+                       if (rc < 0) {
+                               need_release = 1;
+                               /* don't free reserved grant */
+                       } else {
+                               OSC_EXTENT_DUMP(D_CACHE, ext,
+                                               "expanded for %lu.\n", index);
+                               osc_unreserve_grant(cli, grants, tmp);
+                               grants = 0;
+                       }
+               }
+               rc = 0;
+       } else if (ext != NULL) {
+               /* index is located outside of active extent */
+               need_release = 1;
+       }
+       if (need_release) {
+               osc_extent_release(env, ext);
+               oio->oi_active = NULL;
+               ext = NULL;
+       }
+
+       if (ext == NULL) {
+               int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
+
+               /* try to find new extent to cover this page */
+               LASSERT(oio->oi_active == NULL);
+               /* we may have allocated grant for this page if we failed
+                * to expand the previous active extent. */
+               LASSERT(ergo(grants > 0, grants >= tmp));
+
+               rc = 0;
+               if (grants == 0) {
+                       /* we haven't allocated grant for this page. */
+                       rc = osc_enter_cache(env, cli, oap, tmp);
+                       if (rc == 0)
+                               grants = tmp;
+               }
+
+               tmp = grants;
+               if (rc == 0) {
+                       ext = osc_extent_find(env, osc, index, &tmp);
+                       if (IS_ERR(ext)) {
+                               LASSERT(tmp == grants);
+                               osc_exit_cache(cli, oap);
+                               rc = PTR_ERR(ext);
+                               ext = NULL;
+                       } else {
+                               oio->oi_active = ext;
+                       }
+               }
+               if (grants > 0)
+                       osc_unreserve_grant(cli, grants, tmp);
+       }
+
+       LASSERT(ergo(rc == 0, ext != NULL));
+       if (ext != NULL) {
+               EASSERTF(ext->oe_end >= index && ext->oe_start <= index,
+                        ext, "index = %lu.\n", index);
+               LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0);
+
+               osc_object_lock(osc);
+               if (ext->oe_nr_pages == 0)
+                       ext->oe_srvlock = ops->ops_srvlock;
+               else
+                       LASSERT(ext->oe_srvlock == ops->ops_srvlock);
+               ++ext->oe_nr_pages;
+               list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
+               osc_object_unlock(osc);
+       }
+       RETURN(rc);
+}
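+
+/*
+ * A summary of the outcomes above (a reading aid, not new behaviour): the
+ * page joins oio->oi_active when the active extent covers, or can be
+ * expanded to cover, its index; otherwise the active extent is released
+ * and osc_extent_find() supplies a covering extent; if neither grant
+ * reservation nor extent lookup succeeds, rc carries the error and the
+ * page is not queued.
+ */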
+
+int osc_teardown_async_page(const struct lu_env *env,
+                           struct osc_object *obj, struct osc_page *ops)
+{
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct osc_extent     *ext = NULL;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oap->oap_magic == OAP_MAGIC);
+
+       CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n",
+              oap, ops, oap2cl_page(oap)->cp_index);
+
+       osc_object_lock(obj);
+       if (!list_empty(&oap->oap_rpc_item)) {
+               CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap);
+               rc = -EBUSY;
+       } else if (!list_empty(&oap->oap_pending_item)) {
+               ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index);
+               /* only truncated pages are allowed to be taken out.
+                * See osc_extent_truncate() and osc_cache_truncate_start()
+                * for details. */
+               if (ext != NULL && ext->oe_state != OES_TRUNC) {
+                       OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n",
+                                       oap2cl_page(oap)->cp_index);
+                       rc = -EBUSY;
+               }
+       }
+       osc_object_unlock(obj);
+       if (ext != NULL)
+               osc_extent_put(env, ext);
+       RETURN(rc);
+}
+
+/**
+ * This is called when a page is picked up by the kernel to write out.
+ *
+ * We should find the corresponding extent and add the whole extent to the
+ * urgent list. The extent may be being truncated or in use; handle it
+ * carefully.
+ */
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+                        struct osc_page *ops)
+{
+       struct osc_extent *ext   = NULL;
+       struct osc_object *obj   = cl2osc(ops->ops_cl.cpl_obj);
+       struct cl_page    *cp    = ops->ops_cl.cpl_page;
+       pgoff_t     index = cp->cp_index;
+       struct osc_async_page *oap = &ops->ops_oap;
+       bool unplug = false;
+       int rc = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       ext = osc_extent_lookup(obj, index);
+       if (ext == NULL) {
+               osc_extent_tree_dump(D_ERROR, obj);
+               LASSERTF(0, "page index %lu is NOT covered.\n", index);
+       }
+
+       switch (ext->oe_state) {
+       case OES_RPC:
+       case OES_LOCK_DONE:
+               CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp),
+                             "flush an in-rpc page?\n");
+               LASSERT(0);
+               break;
+       case OES_LOCKING:
+               /* If we know this extent is being written out, we should abort
+                * so that the writer can make this page ready. Otherwise, there
+                * exists a deadlock: another process can wait for the page
+                * writeback bit while holding the page lock, and meanwhile
+                * vvp_page_make_ready() needs to grab the page lock before
+                * really sending the RPC. */
+       case OES_TRUNC:
+               /* race with truncate, page will be redirtied */
+               GOTO(out, rc = -EAGAIN);
+       default:
+               break;
+       }
+
+       rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE);
+       if (rc)
+               GOTO(out, rc);
+
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT;
+       spin_unlock(&oap->oap_lock);
+
+       if (memory_pressure_get())
+               ext->oe_memalloc = 1;
+
+       ext->oe_urgent = 1;
+       if (ext->oe_state == OES_CACHE) {
+               OSC_EXTENT_DUMP(D_CACHE, ext,
+                               "flush page %p make it urgent.\n", oap);
+               if (list_empty(&ext->oe_link))
+                       list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+               unplug = true;
+       }
+       rc = 0;
+       EXIT;
+
+out:
+       osc_object_unlock(obj);
+       osc_extent_put(env, ext);
+       if (unplug)
+               osc_io_unplug_async(env, osc_cli(obj), obj);
+       return rc;
+}
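+
+/*
+ * Likely caller of the function above (a sketch - the osc slice of the
+ * cl_page flush operation is assumed to live in osc_page.c and is not
+ * part of this hunk):
+ *
+ *     static int osc_page_flush(const struct lu_env *env,
+ *                               const struct cl_page_slice *slice,
+ *                               struct cl_io *io)
+ *     {
+ *             return osc_flush_async_page(env, io, cl2osc_page(slice));
+ *     }
+ */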
+
+/**
+ * This is called when a sync waiter receives an interruption.  Its job is to
+ * get the caller woken as soon as possible.  If its page hasn't been put in an
+ * rpc yet it can dequeue immediately.  Otherwise it has to mark the rpc as
+ * desiring interruption, which will forcefully complete the rpc once the rpc
+ * has timed out.
+ */
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
+{
+       struct osc_async_page *oap = &ops->ops_oap;
+       struct osc_object     *obj = oap->oap_obj;
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_extent     *ext;
+       struct osc_extent     *found = NULL;
+       struct list_head            *plist;
+       pgoff_t index = oap2cl_page(oap)->cp_index;
+       int     rc = -EBUSY;
+       int     cmd;
+       ENTRY;
+
+       LASSERT(!oap->oap_interrupted);
+       oap->oap_interrupted = 1;
+
+       /* Find out the caching extent */
+       osc_object_lock(obj);
+       if (oap->oap_cmd & OBD_BRW_WRITE) {
+               plist = &obj->oo_urgent_exts;
+               cmd   = OBD_BRW_WRITE;
+       } else {
+               plist = &obj->oo_reading_exts;
+               cmd   = OBD_BRW_READ;
+       }
+       list_for_each_entry(ext, plist, oe_link) {
+               if (ext->oe_start <= index && ext->oe_end >= index) {
+                       LASSERT(ext->oe_state == OES_LOCK_DONE);
+                       /* For OES_LOCK_DONE state extent, it has already held
+                        * a refcount for RPC. */
+                       found = osc_extent_get(ext);
+                       break;
+               }
+       }
+       if (found != NULL) {
+               list_del_init(&found->oe_link);
+               osc_update_pending(obj, cmd, -found->oe_nr_pages);
+               osc_object_unlock(obj);
+
+               osc_extent_finish(env, found, 0, -EINTR);
+               osc_extent_put(env, found);
+               rc = 0;
+       } else {
+               osc_object_unlock(obj);
+               /* ok, it's been put in an rpc. only one oap gets a request
+                * reference */
+               if (oap->oap_request != NULL) {
+                       ptlrpc_mark_interrupted(oap->oap_request);
+                       ptlrpcd_wake(oap->oap_request);
+                       ptlrpc_req_finished(oap->oap_request);
+                       oap->oap_request = NULL;
+               }
+       }
+
+       osc_list_maint(cli, obj);
+       RETURN(rc);
+}
+
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+                        struct list_head *list, int cmd, int brw_flags)
+{
+       struct client_obd     *cli = osc_cli(obj);
+       struct osc_extent     *ext;
+       struct osc_async_page *oap, *tmp;
+       int     page_count = 0;
+       int     mppr       = cli->cl_max_pages_per_rpc;
+       pgoff_t start      = CL_PAGE_EOF;
+       pgoff_t end     = 0;
+       ENTRY;
+
+       list_for_each_entry(oap, list, oap_pending_item) {
+               struct cl_page *cp = oap2cl_page(oap);
+               if (cp->cp_index > end)
+                       end = cp->cp_index;
+               if (cp->cp_index < start)
+                       start = cp->cp_index;
+               ++page_count;
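+               /* the shift amount is the boolean (page_count > mppr), i.e.
+                * 0 or 1: double the max-pages-per-rpc estimate whenever the
+                * page count outgrows it, so one RPC can carry every page */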
+               mppr <<= (page_count > mppr);
+       }
+
+       ext = osc_extent_alloc(obj);
+       if (ext == NULL) {
+               list_for_each_entry_safe(oap, tmp, list, oap_pending_item) {
+                       list_del_init(&oap->oap_pending_item);
+                       osc_ap_completion(env, cli, oap, 0, -ENOMEM);
+               }
+               RETURN(-ENOMEM);
+       }
+
+       ext->oe_rw = !!(cmd & OBD_BRW_READ);
+       ext->oe_urgent = 1;
+       ext->oe_start = start;
+       ext->oe_end = ext->oe_max_end = end;
+       ext->oe_obj = obj;
+       ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+       ext->oe_nr_pages = page_count;
+       ext->oe_mppr = mppr;
+       list_splice_init(list, &ext->oe_pages);
+
+       osc_object_lock(obj);
+       /* Reuse the initial refcount for RPC, don't drop it */
+       osc_extent_state_set(ext, OES_LOCK_DONE);
+       if (cmd & OBD_BRW_WRITE) {
+               list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
+               osc_update_pending(obj, OBD_BRW_WRITE, page_count);
+       } else {
+               list_add_tail(&ext->oe_link, &obj->oo_reading_exts);
+               osc_update_pending(obj, OBD_BRW_READ, page_count);
+       }
+       osc_object_unlock(obj);
+
+       osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND);
+       RETURN(0);
+}
+
+/**
+ * Called by osc_io_setattr_start() to freeze and destroy covering extents.
+ */
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+                            struct osc_object *obj, __u64 size)
+{
+       struct client_obd *cli = osc_cli(obj);
+       struct osc_extent *ext;
+       struct osc_extent *waiting = NULL;
+       pgoff_t index;
+       LIST_HEAD(list);
+       int result = 0;
+       bool partial;
+       ENTRY;
+
+       /* pages with index greater than or equal to index will be truncated. */
+       index = cl_index(osc2cl(obj), size);
+       partial = size > cl_offset(osc2cl(obj), index);
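+       /* Worked example (illustrative, assuming 4KiB pages): size = 10000
+        * gives index = 2 since cl_offset(obj, 2) = 8192 <= 10000 < 12288;
+        * partial is true because 10000 > 8192, so page 2 is truncated
+        * partially and pages with index >= 3 are dropped entirely. */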
+
+again:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, index);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < index)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               EASSERT(ext->oe_state != OES_TRUNC, ext);
+
+               if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
+                       /* if ext is in urgent state, there must be a page
+                        * that has already been flushed by write_page().
+                        * We have to wait for this extent because we can't
+                        * truncate that page. */
+                       LASSERT(!ext->oe_hp);
+                       OSC_EXTENT_DUMP(D_CACHE, ext,
+                                       "waiting for busy extent\n");
+                       waiting = osc_extent_get(ext);
+                       break;
+               }
+
+               OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size);
+
+               osc_extent_get(ext);
+               if (ext->oe_state == OES_ACTIVE) {
+                       /* we grab the inode mutex for the write path, but
+                        * release it before releasing the extent (in
+                        * osc_io_end()), so there is a race window in which an
+                        * extent is still in OES_ACTIVE when truncate starts. */
+                       LASSERT(!ext->oe_trunc_pending);
+                       ext->oe_trunc_pending = 1;
+               } else {
+                       EASSERT(ext->oe_state == OES_CACHE, ext);
+                       osc_extent_state_set(ext, OES_TRUNC);
+                       osc_update_pending(obj, OBD_BRW_WRITE,
+                                          -ext->oe_nr_pages);
+               }
+               EASSERT(list_empty(&ext->oe_link), ext);
+               list_add_tail(&ext->oe_link, &list);
+
+               ext = next_extent(ext);
+       }
+       osc_object_unlock(obj);
+
+       osc_list_maint(cli, obj);
+
+       while (!list_empty(&list)) {
+               int rc;
+
+               ext = list_entry(list.next, struct osc_extent, oe_link);
+               list_del_init(&ext->oe_link);
+
+               /* the extent may be in OES_ACTIVE state because the inode
+                * mutex is released before osc_io_end() in the file write case */
+               if (ext->oe_state != OES_TRUNC)
+                       osc_extent_wait(env, ext, OES_TRUNC);
+
+               rc = osc_extent_truncate(ext, index, partial);
+               if (rc < 0) {
+                       if (result == 0)
+                               result = rc;
+
+                       OSC_EXTENT_DUMP(D_ERROR, ext,
+                                       "truncate error %d\n", rc);
+               } else if (ext->oe_nr_pages == 0) {
+                       osc_extent_remove(ext);
+               } else {
+                       /* this must be an overlapped extent, which means only
+                        * part of the pages in this extent have been truncated.
+                        */
+                       EASSERTF(ext->oe_start <= index, ext,
+                                "trunc index = %lu/%d.\n", index, partial);
+                       /* fix index to skip this partially truncated extent */
+                       index = ext->oe_end + 1;
+                       partial = false;
+
+                       /* we need to hold this extent in OES_TRUNC state so
+                        * that no writeback will happen. This is to avoid
+                        * BUG 17397. */
+                       LASSERT(oio->oi_trunc == NULL);
+                       oio->oi_trunc = osc_extent_get(ext);
+                       OSC_EXTENT_DUMP(D_CACHE, ext,
+                                       "trunc at "LPU64"\n", size);
+               }
+               osc_extent_put(env, ext);
+       }
+       if (waiting != NULL) {
+               int rc;
+
+               /* ignore the result of osc_extent_wait; the write initiator
+                * should take care of it. */
+               rc = osc_extent_wait(env, waiting, OES_INV);
+               if (rc < 0)
+                       OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc);
+
+               osc_extent_put(env, waiting);
+               waiting = NULL;
+               goto again;
+       }
+       RETURN(result);
+}
+
+/**
+ * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
+ */
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+                           struct osc_object *obj)
+{
+       struct osc_extent *ext = oio->oi_trunc;
+
+       oio->oi_trunc = NULL;
+       if (ext != NULL) {
+               bool unplug = false;
+
+               EASSERT(ext->oe_nr_pages > 0, ext);
+               EASSERT(ext->oe_state == OES_TRUNC, ext);
+               EASSERT(!ext->oe_urgent, ext);
+
+               OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
+               osc_object_lock(obj);
+               osc_extent_state_set(ext, OES_CACHE);
+               if (ext->oe_fsync_wait && !ext->oe_urgent) {
+                       ext->oe_urgent = 1;
+                       list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
+                       unplug = true;
+               }
+               osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
+               osc_object_unlock(obj);
+               osc_extent_put(env, ext);
+
+               if (unplug)
+                       osc_io_unplug_async(env, osc_cli(obj), obj);
+       }
+}
+
+/**
+ * Wait for extents in a specific range to be written out.
+ * The caller must have called osc_cache_writeback_range() to issue IO;
+ * otherwise it will take a long time for this function to finish.
+ *
+ * Caller must hold inode_mutex, or cancel the exclusive dlm lock, so that
+ * nobody else can dirty this range of file while we're waiting for
+ * extents to be written.
+ */
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+                        pgoff_t start, pgoff_t end)
+{
+       struct osc_extent *ext;
+       pgoff_t index = start;
+       int     result = 0;
+       ENTRY;
+
+again:
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, index);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < index)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               int rc;
+
+               if (ext->oe_start > end)
+                       break;
+
+               if (!ext->oe_fsync_wait) {
+                       ext = next_extent(ext);
+                       continue;
+               }
+
+               EASSERT(ergo(ext->oe_state == OES_CACHE,
+                            ext->oe_hp || ext->oe_urgent), ext);
+               EASSERT(ergo(ext->oe_state == OES_ACTIVE,
+                            !ext->oe_hp && ext->oe_urgent), ext);
+
+               index = ext->oe_end + 1;
+               osc_extent_get(ext);
+               osc_object_unlock(obj);
+
+               rc = osc_extent_wait(env, ext, OES_INV);
+               if (result == 0)
+                       result = rc;
+               osc_extent_put(env, ext);
+               goto again;
+       }
+       osc_object_unlock(obj);
+
+       OSC_IO_DEBUG(obj, "sync file range.\n");
+       RETURN(result);
+}
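+
+/*
+ * osc_cache_writeback_range() below pairs with this function: when invoked
+ * with hp or discard set, it issues the IO and then waits on the same
+ * [start, end] range via osc_cache_wait_range().
+ */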
+
+/**
+ * Called to write out a range of osc object.
+ *
+ * @hp     : should be set if this is caused by lock cancel;
+ * @discard: is set if dirty pages should be dropped - file will be deleted or
+ *        truncated; this implies there are no partially discarded extents.
+ *
+ * Return how many pages will be issued, or error code if error occurred.
+ */
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+                             pgoff_t start, pgoff_t end, int hp, int discard)
+{
+       struct osc_extent *ext;
+       LIST_HEAD(discard_list);
+       bool unplug = false;
+       int result = 0;
+       ENTRY;
+
+       osc_object_lock(obj);
+       ext = osc_extent_search(obj, start);
+       if (ext == NULL)
+               ext = first_extent(obj);
+       else if (ext->oe_end < start)
+               ext = next_extent(ext);
+       while (ext != NULL) {
+               if (ext->oe_start > end)
+                       break;
+
+               ext->oe_fsync_wait = 1;
+               switch (ext->oe_state) {
+               case OES_CACHE:
+                       result += ext->oe_nr_pages;
+                       if (!discard) {
+                               struct list_head *list = NULL;
+                               if (hp) {
+                                       EASSERT(!ext->oe_hp, ext);
+                                       ext->oe_hp = 1;
+                                       list = &obj->oo_hp_exts;
+                               } else if (!ext->oe_urgent) {
+                                       ext->oe_urgent = 1;
+                                       list = &obj->oo_urgent_exts;
+                               }
+                               if (list != NULL)
+                                       list_move_tail(&ext->oe_link, list);
+                               unplug = true;
+                       } else {
+                               /* the only discarder is lock cancelling, so
+                                * [start, end] must contain this extent */
+                               EASSERT(ext->oe_start >= start &&
+                                       ext->oe_max_end <= end, ext);
+                               osc_extent_state_set(ext, OES_LOCKING);
+                               ext->oe_owner = current;
+                               list_move_tail(&ext->oe_link,
+                                                  &discard_list);
+                               osc_update_pending(obj, OBD_BRW_WRITE,
+                                                  -ext->oe_nr_pages);
+                       }
+                       break;
+               case OES_ACTIVE:
+                       /* It's pretty bad to wait for ACTIVE extents, because
+                        * we don't know how long we will have to wait for them
+                        * to be flushed, since they may be blocked awaiting more
+                        * grants. We do this for the correctness of fsync. */
+                       LASSERT(hp == 0 && discard == 0);
+                       ext->oe_urgent = 1;
+                       break;
+               case OES_TRUNC:
+                       /* this extent is being truncated, so nothing can be
+                        * done for it now; it will be set to urgent after
+                        * truncate is finished in osc_cache_truncate_end(). */
+               default:
+                       break;
+               }
+               ext = next_extent(ext);
+       }
+       osc_object_unlock(obj);
+
+       LASSERT(ergo(!discard, list_empty(&discard_list)));
+       if (!list_empty(&discard_list)) {
+               struct osc_extent *tmp;
+               int rc;
+
+               osc_list_maint(osc_cli(obj), obj);
+               list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) {
+                       list_del_init(&ext->oe_link);
+                       EASSERT(ext->oe_state == OES_LOCKING, ext);
+
+                       /* Discard caching pages. We don't actually write this
+                        * extent out but we complete it as if we did. */
+                       rc = osc_extent_make_ready(env, ext);
+                       if (unlikely(rc < 0)) {
+                               OSC_EXTENT_DUMP(D_ERROR, ext,
+                                               "make_ready returned %d\n", rc);
+                               if (result >= 0)
+                                       result = rc;
+                       }
+
+                       /* finish the extent as if the pages were sent */
+                       osc_extent_finish(env, ext, 0, 0);
+               }
+       }
+
+       if (unplug)
+               osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND);
+
+       if (hp || discard) {
+               int rc;
+               rc = osc_cache_wait_range(env, obj, start, end);
+               if (result >= 0 && rc < 0)
+                       result = rc;
+       }
+
+       OSC_IO_DEBUG(obj, "cache page out.\n");
+       RETURN(result);
+}
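+
+/*
+ * Flag combinations for the function above, as implied by its assertions
+ * (a sketch; the callers themselves are outside this hunk):
+ *
+ *     fsync-style flush:        osc_cache_writeback_range(.., 0, 0);
+ *     lock cancel, keep data:   osc_cache_writeback_range(.., 1, 0);
+ *     lock cancel, drop data:   osc_cache_writeback_range(.., 1, 1);
+ *
+ * LASSERT(hp == 0 && discard == 0) in the OES_ACTIVE branch means active
+ * extents can only be encountered on the first, fsync-style path.
+ */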
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h
new file mode 100644 (file)
index 0000000..158e8ff
--- /dev/null
@@ -0,0 +1,677 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+/* osc_build_res_name() */
+#include <obd_ost.h>
+#include <cl_object.h>
+#include <lclient.h>
+#include "osc_internal.h"
+
+/** \defgroup osc osc
+ *  @{
+ */
+
+struct osc_extent;
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+       /** super class */
+       struct cl_io_slice oi_cl;
+       /** true if this io is lockless. */
+       int             oi_lockless;
+       /** active extents, we know how many bytes are going to be written,
+        * so having an active extent will prevent it from being fragmented */
+       struct osc_extent *oi_active;
+       /** partially truncated extent, we need to hold this extent to prevent
+        * page writeback from happening. */
+       struct osc_extent *oi_trunc;
+
+       struct obd_info    oi_info;
+       struct obdo     oi_oa;
+       struct osc_async_cbargs {
+               bool              opc_rpc_sent;
+               int            opc_rc;
+               struct completion       opc_sync;
+       } oi_cbarg;
+};
+
+/**
+ * State of transfer for osc.
+ */
+struct osc_req {
+       struct cl_req_slice    or_cl;
+};
+
+/**
+ * State maintained by osc layer for the duration of a system call.
+ */
+struct osc_session {
+       struct osc_io       os_io;
+};
+
+#define OTI_PVEC_SIZE 64
+struct osc_thread_info {
+       struct ldlm_res_id      oti_resname;
+       ldlm_policy_data_t      oti_policy;
+       struct cl_lock_descr    oti_descr;
+       struct cl_attr    oti_attr;
+       struct lustre_handle    oti_handle;
+       struct cl_page_list     oti_plist;
+       struct cl_io            oti_io;
+       struct cl_page         *oti_pvec[OTI_PVEC_SIZE];
+};
+
+struct osc_object {
+       struct cl_object   oo_cl;
+       struct lov_oinfo  *oo_oinfo;
+       /**
+        * True if locking against this stripe got -EUSERS.
+        */
+       int             oo_contended;
+       cfs_time_t       oo_contention_time;
+       /**
+        * List of pages in transfer.
+        */
+       struct list_head         oo_inflight[CRT_NR];
+       /**
+        * Lock, protecting osc_object::oo_inflight, because a seat-belt is
+        * locked during take-off and landing.
+        */
+       spinlock_t         oo_seatbelt;
+
+       /**
+        * used by the osc to keep track of what objects to build into rpcs.
+        * Protected by client_obd->cl_loi_list_lock.
+        */
+       struct list_head           oo_ready_item;
+       struct list_head           oo_hp_ready_item;
+       struct list_head           oo_write_item;
+       struct list_head           oo_read_item;
+
+       /**
+        * extents form a red-black tree to manage (async) dirty pages.
+        */
+       struct rb_root       oo_root;
+       /**
+        * Manage write(dirty) extents.
+        */
+       struct list_head           oo_hp_exts; /* list of hp extents */
+       struct list_head           oo_urgent_exts; /* list of writeback extents */
+       struct list_head           oo_rpc_exts;
+
+       struct list_head           oo_reading_exts;
+
+       atomic_t         oo_nr_reads;
+       atomic_t         oo_nr_writes;
+
+       /** Protect extent tree. Will be used to protect
+        * oo_{read|write}_pages soon. */
+       spinlock_t          oo_lock;
+};
+
+static inline void osc_object_lock(struct osc_object *obj)
+{
+       spin_lock(&obj->oo_lock);
+}
+
+static inline int osc_object_trylock(struct osc_object *obj)
+{
+       return spin_trylock(&obj->oo_lock);
+}
+
+static inline void osc_object_unlock(struct osc_object *obj)
+{
+       spin_unlock(&obj->oo_lock);
+}
+
+static inline int osc_object_is_locked(struct osc_object *obj)
+{
+       return spin_is_locked(&obj->oo_lock);
+}
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+       OLS_NEW,
+       OLS_ENQUEUED,
+       OLS_UPCALL_RECEIVED,
+       OLS_GRANTED,
+       OLS_RELEASED,
+       OLS_BLOCKED,
+       OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *       - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *         the caller (released when reply from the server is received, or on
+ *         error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *       - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *         ldlm_lock_new().
+ *       - if (rc != 0)
+ *             ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *       - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *         ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach(), which is called
+ * either when the lock is cancelled (osc_lock_blocking()), or when the lock
+ * is deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+       struct cl_lock_slice     ols_cl;
+       /** underlying DLM lock */
+       struct ldlm_lock        *ols_lock;
+       /** lock value block */
+       struct ost_lvb     ols_lvb;
+       /** DLM flags with which osc_lock::ols_lock was enqueued */
+       __u64               ols_flags;
+       /** osc_lock::ols_lock handle */
+       struct lustre_handle     ols_handle;
+       struct ldlm_enqueue_info ols_einfo;
+       enum osc_lock_state      ols_state;
+
+       /**
+        * How many pages are using this lock for io, currently only used by
+        * read-ahead. If non-zero, the underlying dlm lock won't be cancelled
+        * during recovery to avoid deadlock. see bz16774.
+        *
+        * \see osc_page::ops_lock
+        * \see osc_page_addref_lock(), osc_page_putref_lock()
+        */
+       atomic_t             ols_pageref;
+
+       /**
+        * true, if ldlm_lock_addref() was called against
+        * osc_lock::ols_lock. This is used for sanity checking.
+        *
+        * \see osc_lock::ols_has_ref
+        */
+       unsigned                  ols_hold :1,
+       /**
+        * this is much like osc_lock::ols_hold, except that this bit is
+        * cleared _after_ the reference is released in osc_lock_unuse(). This
+        * fine distinction is needed because:
+        *
+        *     - if ldlm lock still has a reference, osc_ast_data_get() needs
+        *       to return associated cl_lock (so that a flag is needed that is
+        *       cleared after ldlm_lock_decref() returned), and
+        *
+        *     - ldlm_lock_decref() can invoke blocking ast (for a
+        *       LDLM_FL_CBPENDING lock), and osc_lock functions like
+        *       osc_lock_cancel() called from there need to know whether to
+        *       release lock reference (so that a flag is needed that is
+        *       cleared before ldlm_lock_decref() is called).
+        */
+                                ols_has_ref:1,
+       /**
+        * inherit the lockless attribute from top level cl_io.
+        * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+        */
+                                ols_locklessable:1,
+       /**
+        * set by osc_lock_use() to wait until blocking AST enters into
+        * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+        * further synchronization.
+        */
+                                ols_ast_wait:1,
+       /**
+        * If the data of this lock has been flushed to server side.
+        */
+                                ols_flush:1,
+       /**
+        * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+        * the EVAVAIL error as tolerable; this makes the upper logic happy
+        * to wait for all glimpse locks to each OST to be completed.
+        * Glimpse lock converts to normal lock if the server lock is
+        * granted.
+        * Glimpse lock should be destroyed immediately after use.
+        */
+                                ols_glimpse:1,
+       /**
+        * For async glimpse lock.
+        */
+                                ols_agl:1;
+       /**
+        * IO that owns this lock. This field is used for dead-lock
+        * avoidance by osc_lock_enqueue_wait().
+        *
+        * XXX: unfortunately, the owner of an osc_lock is not unique,
+        * the lock may have multiple users, if the lock is granted and
+        * then matched.
+        */
+       struct osc_io      *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+       struct cl_page_slice  ops_cl;
+       /**
+        * Page queues used by osc to detect when RPC can be formed.
+        */
+       struct osc_async_page ops_oap;
+       /**
+        * An offset within page from which next transfer starts. This is used
+        * by cl_page_clip() to submit partial page transfers.
+        */
+       int                ops_from;
+       /**
+        * An offset within page at which next transfer ends.
+        *
+        * \see osc_page::ops_from.
+        */
+       int                ops_to;
+       /**
+        * Boolean, true iff page is under transfer. Used for sanity checking.
+        */
+       unsigned              ops_transfer_pinned:1,
+       /**
+        * True for a `temporary page' created by read-ahead code, probably
+        * outside of any DLM lock.
+        */
+                             ops_temp:1,
+       /**
+        * in LRU?
+        */
+                             ops_in_lru:1,
+       /**
+        * Set if the page must be transferred with OBD_BRW_SRVLOCK.
+        */
+                             ops_srvlock:1;
+       union {
+               /**
+                * lru page list. ops_inflight and ops_lru are exclusive so
+                * that they can share the same data.
+                */
+               struct list_head              ops_lru;
+               /**
+                * Linkage into a per-osc_object list of pages in flight. For
+                * debugging.
+                */
+               struct list_head            ops_inflight;
+       };
+       /**
+        * Thread that submitted this page for transfer. For debugging.
+        */
+       task_t     *ops_submitter;
+       /**
+        * Submit time - the time when the page is starting RPC. For debugging.
+        */
+       cfs_time_t          ops_submit_time;
+
+       /**
+        * A lock of which we hold a reference covers this page. Only used by
+        * read-ahead: for a readahead page, we hold its covering lock to
+        * prevent it from being canceled during recovery.
+        *
+        * \see osc_lock::ols_pageref
+        * \see osc_page_addref_lock(), osc_page_putref_lock().
+        */
+       struct cl_lock       *ops_lock;
+};
+
+extern struct kmem_cache *osc_lock_kmem;
+extern struct kmem_cache *osc_object_kmem;
+extern struct kmem_cache *osc_thread_kmem;
+extern struct kmem_cache *osc_session_kmem;
+extern struct kmem_cache *osc_req_kmem;
+extern struct kmem_cache *osc_extent_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *io);
+int osc_io_init  (const struct lu_env *env,
+                 struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+                 struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *hdr,
+                                  struct lu_device *dev);
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+                 struct cl_page *page, struct page *vmpage);
+
+void osc_index2policy  (ldlm_policy_data_t *policy, const struct cl_object *obj,
+                       pgoff_t start, pgoff_t end);
+int  osc_lvb_print     (const struct lu_env *env, void *cookie,
+                       lu_printer_t p, const struct ost_lvb *lvb);
+
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+                    enum cl_req_type crt, int brw_flags);
+int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops);
+int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg,
+                       obd_flag async_flags);
+int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops,
+                       struct page *page, loff_t offset);
+int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
+                      struct osc_page *ops);
+int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
+                           struct osc_page *ops);
+int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
+                        struct osc_page *ops);
+int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
+                        struct list_head *list, int cmd, int brw_flags);
+int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio,
+                            struct osc_object *obj, __u64 size);
+void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio,
+                           struct osc_object *obj);
+int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj,
+                             pgoff_t start, pgoff_t end, int hp, int discard);
+int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
+                        pgoff_t start, pgoff_t end);
+void osc_io_unplug(const struct lu_env *env, struct client_obd *cli,
+                  struct osc_object *osc, pdl_policy_t pol);
+
+void osc_object_set_contended  (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int  osc_object_is_contended   (struct osc_object *obj);
+
+int  osc_lock_is_lockless      (const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+       struct osc_thread_info *info;
+
+       info = lu_context_key_get(&env->le_ctx, &osc_key);
+       LASSERT(info != NULL);
+       return info;
+}
+
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+       struct osc_session *ses;
+
+       ses = lu_context_key_get(env->le_ses, &osc_session_key);
+       LASSERT(ses != NULL);
+       return ses;
+}
+
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+       return &osc_env_session(env)->os_io;
+}
+
+static inline int osc_is_object(const struct lu_object *obj)
+{
+       return obj->lo_dev->ld_type == &osc_device_type;
+}
+
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+       LINVRNT(d->ld_type == &osc_device_type);
+       return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+}
+
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{
+       return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
+}
+
+static inline struct client_obd *osc_cli(const struct osc_object *obj)
+{
+       return &osc_export(obj)->exp_obd->u.cli;
+}
+
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+       LINVRNT(osc_is_object(&obj->co_lu));
+       return container_of0(obj, struct osc_object, oo_cl);
+}
+
+static inline struct cl_object *osc2cl(const struct osc_object *obj)
+{
+       return (struct cl_object *)&obj->oo_cl;
+}
+
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+       LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+       if (mode == CLM_READ)
+               return LCK_PR;
+       else if (mode == CLM_WRITE)
+               return LCK_PW;
+       else
+               return LCK_GROUP;
+}
+
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+       LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+       if (mode == LCK_PR)
+               return CLM_READ;
+       else if (mode == LCK_PW)
+               return CLM_WRITE;
+       else
+               return CLM_GROUP;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+       LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+       return container_of0(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_page *oap2osc(struct osc_async_page *oap)
+{
+       return container_of0(oap, struct osc_page, ops_oap);
+}
+
+static inline struct cl_page *oap2cl_page(struct osc_async_page *oap)
+{
+       return oap2osc(oap)->ops_cl.cpl_page;
+}
+
+static inline struct osc_page *oap2osc_page(struct osc_async_page *oap)
+{
+       return (struct osc_page *)container_of(oap, struct osc_page, ops_oap);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+       LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+       return container_of0(slice, struct osc_lock, ols_cl);
+}
+
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{
+       return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
+}
+
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+       return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
+}
+
+enum osc_extent_state {
+       OES_INV       = 0, /** extent is just initialized or destroyed */
+       OES_ACTIVE    = 1, /** process is using this extent */
+       OES_CACHE     = 2, /** extent is ready for IO */
+       OES_LOCKING   = 3, /** locking page to prepare IO */
+       OES_LOCK_DONE = 4, /** locking finished, ready to send */
+       OES_RPC       = 5, /** in RPC */
+       OES_TRUNC     = 6, /** being truncated */
+       OES_STATE_MAX
+};
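+
+/*
+ * A sketch of the usual write-back life cycle implied by the states above
+ * (transitions as implemented in osc_cache.c in this patch):
+ *
+ *   OES_INV -> OES_ACTIVE -> OES_CACHE -> OES_LOCKING ->
+ *   OES_LOCK_DONE -> OES_RPC -> OES_INV
+ *
+ * with the detour OES_CACHE -> OES_TRUNC -> OES_CACHE taken by
+ * osc_cache_truncate_start() and osc_cache_truncate_end().
+ */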
+
+/**
+ * osc_extent data to manage dirty pages.
+ * osc_extent has the following attributes:
+ * 1. all pages in the same extent must be in one RPC in write back;
+ * 2. # of pages must be less than max_pages_per_rpc - implied by 1;
+ * 3. must be covered by only 1 osc_lock;
+ * 4. exclusive. It's impossible to have overlapped osc_extent.
+ *
+ * The lifetime of an extent is from when the 1st page is dirtied to when
+ * all pages inside it are written out.
+ *
+ * LOCKING ORDER
+ * =============
+ * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock)
+ */
+struct osc_extent {
+       /** red-black tree node */
+       struct rb_node     oe_node;
+       /** osc_object of this extent */
+       struct osc_object *oe_obj;
+       /** refcount, removed from red-black tree if reaches zero. */
+       atomic_t       oe_refc;
+       /** busy if non-zero */
+       atomic_t       oe_users;
+       /** linkage into one of osc_object's oo_{hp|urgent|locking}_exts lists. */
+       struct list_head         oe_link;
+       /** state of this extent */
+       unsigned int       oe_state;
+       /** flags for this extent. */
+       unsigned int       oe_intree:1,
+       /** 0 is write, 1 is read */
+                          oe_rw:1,
+                          oe_srvlock:1,
+                          oe_memalloc:1,
+       /** an ACTIVE extent is going to be truncated, so when this extent
+        * is released, it will turn into TRUNC state instead of CACHE. */
+                          oe_trunc_pending:1,
+       /** this extent should be written asap and someone may wait for the
+        * write to finish. This bit is usually set along with urgent if
+        * the extent was in CACHE state.
+        * fsync_wait extent can't be merged because new extent region may
+        * exceed fsync range. */
+                          oe_fsync_wait:1,
+       /** covering lock is being canceled */
+                          oe_hp:1,
+       /** this extent should be written back asap. set if one of the pages
+        * is picked up by the page WB daemon, or for sync write or read
+        * requests. */
+                          oe_urgent:1;
+       /** how many grants allocated for this extent. There is no grant
+        *  allocated for reading extents and sync write extents. */
+       unsigned int       oe_grants;
+       /** # of dirty pages in this extent */
+       unsigned int       oe_nr_pages;
+       /** list of pending oap pages. Pages in this list are NOT sorted. */
+       struct list_head         oe_pages;
+       /** Since an extent has to be written out atomically, this is used to
+        * remember the next page that needs to be locked to write this extent
+        * out. Not used right now.
+        */
+       struct osc_page   *oe_next_page;
+       /** start and end index of this extent, both inclusive. Page offset
+        * here is the page index of osc_pages.
+        * oe_start is used as the key for the red-black tree. */
+       pgoff_t     oe_start;
+       pgoff_t     oe_end;
+       /** maximum ending index of this extent, this is limited by
+        * max_pages_per_rpc, lock extent and chunk size. */
+       pgoff_t     oe_max_end;
+       /** waitqueue - for those who want to be notified if this extent's
+        * state has changed. */
+       wait_queue_head_t       oe_waitq;
+       /** lock covering this extent */
+       struct cl_lock    *oe_osclock;
+       /** the task terminating this extent; must be set if this extent is in IO. */
+       task_t  *oe_owner;
+       /** return value of writeback. If somebody is waiting for this extent,
+        * this value can be known by the outside world. */
+       int             oe_rc;
+       /** max pages per rpc when this extent was created */
+       unsigned int       oe_mppr;
+};
+
+int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
+                     int sent, int rc);
+int osc_extent_release(const struct lu_env *env, struct osc_extent *ext);
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c
new file mode 100644 (file)
index 0000000..4208ddf
--- /dev/null
@@ -0,0 +1,261 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ * @{
+ */
+
+struct kmem_cache *osc_lock_kmem;
+struct kmem_cache *osc_object_kmem;
+struct kmem_cache *osc_thread_kmem;
+struct kmem_cache *osc_session_kmem;
+struct kmem_cache *osc_req_kmem;
+struct kmem_cache *osc_extent_kmem;
+struct kmem_cache *osc_quota_kmem;
+
+struct lu_kmem_descr osc_caches[] = {
+       {
+               .ckd_cache = &osc_lock_kmem,
+               .ckd_name  = "osc_lock_kmem",
+               .ckd_size  = sizeof (struct osc_lock)
+       },
+       {
+               .ckd_cache = &osc_object_kmem,
+               .ckd_name  = "osc_object_kmem",
+               .ckd_size  = sizeof (struct osc_object)
+       },
+       {
+               .ckd_cache = &osc_thread_kmem,
+               .ckd_name  = "osc_thread_kmem",
+               .ckd_size  = sizeof (struct osc_thread_info)
+       },
+       {
+               .ckd_cache = &osc_session_kmem,
+               .ckd_name  = "osc_session_kmem",
+               .ckd_size  = sizeof (struct osc_session)
+       },
+       {
+               .ckd_cache = &osc_req_kmem,
+               .ckd_name  = "osc_req_kmem",
+               .ckd_size  = sizeof (struct osc_req)
+       },
+       {
+               .ckd_cache = &osc_extent_kmem,
+               .ckd_name  = "osc_extent_kmem",
+               .ckd_size  = sizeof (struct osc_extent)
+       },
+       {
+               .ckd_cache = &osc_quota_kmem,
+               .ckd_name  = "osc_quota_kmem",
+               .ckd_size  = sizeof(struct osc_quota_info)
+       },
+       {
+               .ckd_cache = NULL
+       }
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{
+       return &osc->od_cl.cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+static void *osc_key_init(const struct lu_context *ctx,
+                        struct lu_context_key *key)
+{
+       struct osc_thread_info *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void osc_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+       struct osc_thread_info *info = data;
+       OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {
+       .lct_tags = LCT_CL_THREAD,
+       .lct_init = osc_key_init,
+       .lct_fini = osc_key_fini
+};
+
+static void *osc_session_init(const struct lu_context *ctx,
+                             struct lu_context_key *key)
+{
+       struct osc_session *info;
+
+       OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, __GFP_IO);
+       if (info == NULL)
+               info = ERR_PTR(-ENOMEM);
+       return info;
+}
+
+static void osc_session_fini(const struct lu_context *ctx,
+                            struct lu_context_key *key, void *data)
+{
+       struct osc_session *info = data;
+       OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = {
+       .lct_tags = LCT_SESSION,
+       .lct_init = osc_session_init,
+       .lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+static int osc_cl_process_config(const struct lu_env *env,
+                                struct lu_device *d, struct lustre_cfg *cfg)
+{
+       ENTRY;
+       RETURN(osc_process_config_base(d->ld_obd, cfg));
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+       .ldo_object_alloc      = osc_object_alloc,
+       .ldo_process_config    = osc_cl_process_config,
+       .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+       .cdo_req_init = osc_req_init
+};
+
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+       RETURN(0);
+}
+
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       return NULL;
+}
+
+static struct lu_device *osc_device_free(const struct lu_env *env,
+                                        struct lu_device *d)
+{
+       struct osc_device *od = lu2osc_dev(d);
+
+       cl_device_fini(lu2cl_dev(d));
+       OBD_FREE_PTR(od);
+       return NULL;
+}
+
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+                                         struct lu_device_type *t,
+                                         struct lustre_cfg *cfg)
+{
+       struct lu_device *d;
+       struct osc_device *od;
+       struct obd_device *obd;
+       int rc;
+
+       OBD_ALLOC_PTR(od);
+       if (od == NULL)
+               RETURN(ERR_PTR(-ENOMEM));
+
+       cl_device_init(&od->od_cl, t);
+       d = osc2lu_dev(od);
+       d->ld_ops = &osc_lu_ops;
+       od->od_cl.cd_ops = &osc_cl_ops;
+
+       /* Setup OSC OBD */
+       obd = class_name2obd(lustre_cfg_string(cfg, 0));
+       LASSERT(obd != NULL);
+       rc = osc_setup(obd, cfg);
+       if (rc) {
+               osc_device_free(env, d);
+               RETURN(ERR_PTR(rc));
+       }
+       od->od_exp = obd->obd_self_export;
+       RETURN(d);
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+       .ldto_init = osc_type_init,
+       .ldto_fini = osc_type_fini,
+
+       .ldto_start = osc_type_start,
+       .ldto_stop  = osc_type_stop,
+
+       .ldto_device_alloc = osc_device_alloc,
+       .ldto_device_free  = osc_device_free,
+
+       .ldto_device_init    = osc_device_init,
+       .ldto_device_fini    = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+       .ldt_tags     = LU_DEVICE_CL,
+       .ldt_name     = LUSTRE_OSC_NAME,
+       .ldt_ops      = &osc_device_type_ops,
+       .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h
new file mode 100644 (file)
index 0000000..efc5db4
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef OSC_INTERNAL_H
+#define OSC_INTERNAL_H
+
+#define OAP_MAGIC 8675309
+
+struct lu_env;
+
+enum async_flags {
+       ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+                             page is added to an rpc */
+       ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+       ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+                                    to give the caller a chance to update
+                                    or cancel the size of the io */
+       ASYNC_HP = 0x10, /* high priority page */
+};
+
+struct osc_async_page {
+       int                  oap_magic;
+       unsigned short    oap_cmd;
+       unsigned short    oap_interrupted:1;
+
+       struct list_head              oap_pending_item;
+       struct list_head              oap_rpc_item;
+
+       obd_off          oap_obj_off;
+       unsigned                oap_page_off;
+       enum async_flags        oap_async_flags;
+
+       struct brw_page  oap_brw_page;
+
+       struct ptlrpc_request   *oap_request;
+       struct client_obd       *oap_cli;
+       struct osc_object       *oap_obj;
+
+       struct ldlm_lock        *oap_ldlm_lock;
+       spinlock_t               oap_lock;
+};
+
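+/* convenience accessors for fields of the embedded brw_page */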
+#define oap_page       oap_brw_page.pg
+#define oap_count       oap_brw_page.count
+#define oap_brw_flags   oap_brw_page.flag
+
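+/* tracks a thread waiting until enough cache space (grant) is available;
+ * woken via osc_wake_cache_waiters() */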
+struct osc_cache_waiter {
+       struct list_head              ocw_entry;
+       wait_queue_head_t            ocw_waitq;
+       struct osc_async_page  *ocw_oap;
+       int                  ocw_grant;
+       int                  ocw_rc;
+};
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+              struct obdo *oa, struct lov_stripe_md **ea,
+              struct obd_trans_info *oti);
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+                   struct lov_stripe_md **ea, struct obd_trans_info *oti);
+void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes);
+void osc_update_next_shrink(struct client_obd *cli);
+
+/*
+ * cl integration.
+ */
+#include <cl_object.h>
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                    __u64 *flags, ldlm_policy_data_t *policy,
+                    struct ost_lvb *lvb, int kms_valid,
+                    obd_enqueue_update_f upcall,
+                    void *cookie, struct ldlm_enqueue_info *einfo,
+                    struct lustre_handle *lockh,
+                    struct ptlrpc_request_set *rqset, int async, int agl);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                  __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                  int *flags, void *data, struct lustre_handle *lockh,
+                  int unref);
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+                          struct obd_trans_info *oti,
+                          obd_enqueue_update_f upcall, void *cookie,
+                          struct ptlrpc_request_set *rqset);
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+                  obd_enqueue_update_f upcall, void *cookie,
+                  struct ptlrpc_request_set *rqset);
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+                 obd_enqueue_update_f upcall, void *cookie,
+                 struct ptlrpc_request_set *rqset);
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct list_head *ext_list, int cmd, pdl_policy_t p);
+int osc_lru_shrink(struct client_obd *cli, int target);
+
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
+#ifdef LPROCFS
+int lproc_osc_attach_seqstat(struct obd_device *dev);
+void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
+#else
+static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
+static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
+{
+       memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+extern struct lu_device_type osc_device_type;
+
+static inline int osc_recoverable_error(int rc)
+{
+       return (rc == -EIO || rc == -EROFS || rc == -ENOMEM ||
+               rc == -EAGAIN || rc == -EINPROGRESS);
+}
+
+static inline unsigned long rpcs_in_flight(struct client_obd *cli)
+{
+       return cli->cl_r_in_flight + cli->cl_w_in_flight;
+}
+
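+/* fallback in case min_t is not already provided by the kernel headers */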
+#ifndef min_t
+#define min_t(type, x, y) \
+       ({ type __x = (x); type __y = (y); __x < __y ? __x : __y; })
+#endif
+
+struct osc_device {
+       struct cl_device    od_cl;
+       struct obd_export  *od_exp;
+
+       /* Write stats are actually protected by client_obd's lock. */
+       struct osc_stats {
+               uint64_t     os_lockless_writes;          /* by bytes */
+               uint64_t     os_lockless_reads;    /* by bytes */
+               uint64_t     os_lockless_truncates;       /* by times */
+       } od_stats;
+
+       /* configuration item(s) */
+       int              od_contention_time;
+       int              od_lockless_truncate;
+};
+
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+       return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm);
+
+extern struct kmem_cache *osc_quota_kmem;
+struct osc_quota_info {
+       /** linkage for quota hash table */
+       struct hlist_node oqi_hash;
+       obd_uid   oqi_id;
+};
+int osc_quota_setup(struct obd_device *obd);
+int osc_quota_cleanup(struct obd_device *obd);
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+                   obd_flag valid, obd_flag flags);
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]);
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl);
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl);
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+#endif /* OSC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c
new file mode 100644 (file)
index 0000000..1b27704
--- /dev/null
@@ -0,0 +1,836 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Jinshan Xiong <jinshan.xiong@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+       LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+       return container_of0(slice, struct osc_req, or_cl);
+}
+
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl);
+       LINVRNT(oio == osc_env_io(env));
+       return oio;
+}
+
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+       const struct cl_page_slice *slice;
+
+       slice = cl_page_at(page, &osc_device_type);
+       LASSERT(slice != NULL);
+
+       return cl2osc_page(slice);
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
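+       /* nothing to do at the osc layer */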
+}
+
+/**
+ * An implementation of the cl_io_operations::cio_io_submit() method for the
+ * osc layer. Iterates over pages in the in-queue, prepares each page for io
+ * by calling cl_page_prep(), and then either submits it through
+ * osc_io_submit_page() or, if the page is already submitted, changes osc
+ * flags through osc_set_async_flags().
+ */
+static int osc_io_submit(const struct lu_env *env,
+                        const struct cl_io_slice *ios,
+                        enum cl_req_type crt, struct cl_2queue *queue)
+{
+       struct cl_page    *page;
+       struct cl_page    *tmp;
+       struct client_obd *cli  = NULL;
+       struct osc_object *osc  = NULL; /* to keep gcc happy */
+       struct osc_page   *opg;
+       struct cl_io      *io;
+       LIST_HEAD(list);
+
+       struct cl_page_list *qin      = &queue->c2_qin;
+       struct cl_page_list *qout     = &queue->c2_qout;
+       int queued = 0;
+       int result = 0;
+       int cmd;
+       int brw_flags;
+       int max_pages;
+
+       LASSERT(qin->pl_nr > 0);
+
+       CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+
+       osc = cl2osc(ios->cis_obj);
+       cli = osc_cli(osc);
+       max_pages = cli->cl_max_pages_per_rpc;
+
+       cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+       brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+
+       /*
+        * NOTE: here @page is a top-level page. This is done to avoid
+        *       creation of sub-page-list.
+        */
+       cl_page_list_for_each_safe(page, tmp, qin) {
+               struct osc_async_page *oap;
+
+               /* Top level IO. */
+               io = page->cp_owner;
+               LASSERT(io != NULL);
+
+               opg = osc_cl_page_osc(page);
+               oap = &opg->ops_oap;
+               LASSERT(osc == oap->oap_obj);
+
+               if (!list_empty(&oap->oap_pending_item) ||
+                   !list_empty(&oap->oap_rpc_item)) {
+                       CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
+                              oap, opg);
+                       result = -EBUSY;
+                       break;
+               }
+
+               result = cl_page_prep(env, io, page, crt);
+               if (result != 0) {
+                       LASSERT(result < 0);
+                       if (result != -EALREADY)
+                               break;
+                       /*
+                        * Handle -EALREADY error: for read case, the page is
+                        * already in UPTODATE state; for write, the page
+                        * is not dirty.
+                        */
+                       result = 0;
+                       continue;
+               }
+
+               cl_page_list_move(qout, qin, page);
+               oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY;
+               oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+
+               osc_page_submit(env, opg, crt, brw_flags);
+               list_add_tail(&oap->oap_pending_item, &list);
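+               /* submit the batch once a full RPC's worth of pages has
+                * accumulated */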
+               if (++queued == max_pages) {
+                       queued = 0;
+                       result = osc_queue_sync_pages(env, osc, &list, cmd,
+                                                     brw_flags);
+                       if (result < 0)
+                               break;
+               }
+       }
+
+       if (queued > 0)
+               result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+
+       CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
+       return qout->pl_nr > 0 ? 0 : result;
+}
+
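+/*
+ * Update the stripe's known minimum size (kms), and its recorded size if
+ * needed, once bytes up to @to of page @idx are known to be written.
+ */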
+static void osc_page_touch_at(const struct lu_env *env,
+                             struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+       struct lov_oinfo  *loi  = cl2osc(obj)->oo_oinfo;
+       struct cl_attr    *attr = &osc_env_info(env)->oti_attr;
+       int valid;
+       __u64 kms;
+
+       /* offset within stripe */
+       kms = cl_offset(obj, idx) + to;
+
+       cl_object_attr_lock(obj);
+       /*
+        * XXX old code used
+        *
+        *       ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+        *
+        * here
+        */
+       CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+              kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+              loi->loi_lvb.lvb_size);
+
+       valid = 0;
+       if (kms > loi->loi_kms) {
+               attr->cat_kms = kms;
+               valid |= CAT_KMS;
+       }
+       if (kms > loi->loi_lvb.lvb_size) {
+               attr->cat_size = kms;
+               valid |= CAT_SIZE;
+       }
+       cl_object_attr_set(env, obj, attr, valid);
+       cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within a file in a way that creates
+ * a new page, if one was missing (i.e., if there was a hole at that place in
+ * the file, or the accessed page is beyond the current file size). Examples:
+ * the ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+                          struct osc_page *opage, unsigned to)
+{
+       struct cl_page    *page = opage->ops_cl.cpl_page;
+       struct cl_object  *obj  = opage->ops_cl.cpl_obj;
+
+       osc_page_touch_at(env, obj, page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0    transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to immediately return an error to the caller
+ * in the case of a deactivated import. Note that the import can be
+ * deactivated later, while pages dirtied by this IO are still in the cache;
+ * this is irrelevant because an error would still be returned to the
+ * application (if it does fsync), but many applications skip fsync for
+ * performance reasons, so we want to return -EIO at write time to notify
+ * the application.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+       struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+       struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+       struct osc_io     *oio = cl2osc_io(env, ios);
+       int result = 0;
+       ENTRY;
+
+       /*
+        * This implements OBD_BRW_CHECK logic from old client.
+        */
+
+       if (imp == NULL || imp->imp_invalid)
+               result = -EIO;
+       if (result == 0 && oio->oi_lockless)
+               /* this page contains `invalid' data, but who cares?
+                * nobody can access the invalid data.
+                * In osc_io_commit_write(), we're going to write the exact
+                * [from, to) bytes of this page to the OST. -jay */
+               cl_page_export(env, slice->cpl_page, 1);
+
+       RETURN(result);
+}
+
+static int osc_io_commit_write(const struct lu_env *env,
+                              const struct cl_io_slice *ios,
+                              const struct cl_page_slice *slice,
+                              unsigned from, unsigned to)
+{
+       struct osc_io    *oio = cl2osc_io(env, ios);
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+       struct osc_async_page *oap = &opg->ops_oap;
+       ENTRY;
+
+       LASSERT(to > 0);
+       /*
+        * XXX instead of calling osc_page_touch() here and in
+        * osc_io_fault_start() it might be more logical to introduce
+        * cl_page_touch() method, that generic cl_io_commit_write() and page
+        * fault code calls.
+        */
+       osc_page_touch(env, cl2osc_page(slice), to);
+       if (!client_is_remote(osc_export(obj)) &&
+           cfs_capable(CFS_CAP_SYS_RESOURCE))
+               oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+
+       if (oio->oi_lockless)
+               /* see osc_io_prepare_write() for lockless io handling. */
+               cl_page_clip(env, slice->cpl_page, from, to);
+
+       RETURN(0);
+}
+
+static int osc_io_fault_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+       struct cl_io       *io;
+       struct cl_fault_io *fio;
+
+       ENTRY;
+
+       io  = ios->cis_io;
+       fio = &io->u.ci_fault;
+       CDEBUG(D_INFO, "%lu %d %d\n",
+              fio->ft_index, fio->ft_writable, fio->ft_nob);
+       /*
+        * If mapping is writeable, adjust kms to cover this page,
+        * but do not extend kms beyond actual file size.
+        * See bug 10919.
+        */
+       if (fio->ft_writable)
+               osc_page_touch_at(env, ios->cis_obj,
+                                 fio->ft_index, fio->ft_nob);
+       RETURN(0);
+}
+
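+/*
+ * Completion upcall shared by the async setattr/punch/sync requests below:
+ * record the RPC result and wake up the waiting thread.
+ */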
+static int osc_async_upcall(void *a, int rc)
+{
+       struct osc_async_cbargs *args = a;
+
+       args->opc_rc = rc;
+       complete(&args->opc_sync);
+       return 0;
+}
+
+/**
+ * Checks that there are no pages being written in the extent being truncated.
+ */
+static int trunc_check_cb(const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *page, void *cbdata)
+{
+       const struct cl_page_slice *slice;
+       struct osc_page *ops;
+       struct osc_async_page *oap;
+       __u64 start = *(__u64 *)cbdata;
+
+       slice = cl_page_at(page, &osc_device_type);
+       LASSERT(slice != NULL);
+       ops = cl2osc_page(slice);
+       oap = &ops->ops_oap;
+
+       if (oap->oap_cmd & OBD_BRW_WRITE &&
+           !list_empty(&oap->oap_pending_item))
+               CL_PAGE_DEBUG(D_ERROR, env, page, "exists " LPU64 "/%s.\n",
+                               start, current->comm);
+
+       {
+               struct page *vmpage = cl_page_vmpage(env, page);
+               if (PageLocked(vmpage))
+                       CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n",
+                              ops, page->cp_index,
+                              (oap->oap_cmd & OBD_BRW_RWMASK));
+       }
+
+       return CLP_GANG_OKAY;
+}
+
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+                           struct osc_io *oio, __u64 size)
+{
+       struct cl_object *clob;
+       int     partial;
+       pgoff_t start;
+
+       clob    = oio->oi_cl.cis_obj;
+       start   = cl_index(clob, size);
+       partial = cl_offset(clob, start) < size;
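+       /* a page containing the truncate point survives truncation;
+        * start checking from the following index */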
+
+       /*
+        * Complain if there are pages in the truncated region.
+        */
+       cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF,
+                           trunc_check_cb, (void *)&size);
+}
+
+static int osc_io_setattr_start(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
+{
+       struct cl_io        *io     = slice->cis_io;
+       struct osc_io      *oio    = cl2osc_io(env, slice);
+       struct cl_object        *obj    = slice->cis_obj;
+       struct lov_oinfo        *loi    = cl2osc(obj)->oo_oinfo;
+       struct cl_attr    *attr   = &osc_env_info(env)->oti_attr;
+       struct obdo          *oa     = &oio->oi_oa;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       __u64               size   = io->u.ci_setattr.sa_attr.lvb_size;
+       unsigned int         ia_valid = io->u.ci_setattr.sa_valid;
+       int                   result = 0;
+       struct obd_info   oinfo = { { { 0 } } };
+
+       /* truncate cache dirty pages first */
+       if (cl_io_is_trunc(io))
+               result = osc_cache_truncate_start(env, oio, cl2osc(obj), size);
+
+       if (result == 0 && oio->oi_lockless == 0) {
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr;
+                       unsigned int cl_valid = 0;
+
+                       if (ia_valid & ATTR_SIZE) {
+                               attr->cat_size = attr->cat_kms = size;
+                               cl_valid = (CAT_SIZE | CAT_KMS);
+                       }
+                       if (ia_valid & ATTR_MTIME_SET) {
+                               attr->cat_mtime = lvb->lvb_mtime;
+                               cl_valid |= CAT_MTIME;
+                       }
+                       if (ia_valid & ATTR_ATIME_SET) {
+                               attr->cat_atime = lvb->lvb_atime;
+                               cl_valid |= CAT_ATIME;
+                       }
+                       if (ia_valid & ATTR_CTIME_SET) {
+                               attr->cat_ctime = lvb->lvb_ctime;
+                               cl_valid |= CAT_CTIME;
+                       }
+                       result = cl_object_attr_set(env, obj, attr, cl_valid);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       memset(oa, 0, sizeof(*oa));
+       if (result == 0) {
+               oa->o_oi = loi->loi_oi;
+               oa->o_mtime = attr->cat_mtime;
+               oa->o_atime = attr->cat_atime;
+               oa->o_ctime = attr->cat_ctime;
+               oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+                       OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+               if (ia_valid & ATTR_SIZE) {
+                       oa->o_size = size;
+                       oa->o_blocks = OBD_OBJECT_EOF;
+                       oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+                       if (oio->oi_lockless) {
+                               oa->o_flags = OBD_FL_SRVLOCK;
+                               oa->o_valid |= OBD_MD_FLFLAGS;
+                       }
+               } else {
+                       LASSERT(oio->oi_lockless == 0);
+               }
+
+               oinfo.oi_oa = oa;
+               oinfo.oi_capa = io->u.ci_setattr.sa_capa;
+               init_completion(&cbargs->opc_sync);
+
+               if (ia_valid & ATTR_SIZE)
+                       result = osc_punch_base(osc_export(cl2osc(obj)),
+                                               &oinfo, osc_async_upcall,
+                                               cbargs, PTLRPCD_SET);
+               else
+                       result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+                                                       &oinfo, NULL,
+                                                       osc_async_upcall,
+                                                       cbargs, PTLRPCD_SET);
+               cbargs->opc_rpc_sent = result == 0;
+       }
+       return result;
+}
+
+static void osc_io_setattr_end(const struct lu_env *env,
+                              const struct cl_io_slice *slice)
+{
+       struct cl_io     *io  = slice->cis_io;
+       struct osc_io    *oio = cl2osc_io(env, slice);
+       struct cl_object *obj = slice->cis_obj;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       int result = 0;
+
+       if (cbargs->opc_rpc_sent) {
+               wait_for_completion(&cbargs->opc_sync);
+               result = io->ci_result = cbargs->opc_rc;
+       }
+       if (result == 0) {
+               if (oio->oi_lockless) {
+                       /* lockless truncate */
+                       struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+                       LASSERT(cl_io_is_trunc(io));
+                       /* XXX: Need a lock. */
+                       osd->od_stats.os_lockless_truncates++;
+               }
+       }
+
+       if (cl_io_is_trunc(io)) {
+               __u64 size = io->u.ci_setattr.sa_attr.lvb_size;
+               osc_trunc_check(env, io, oio, size);
+               if (oio->oi_trunc != NULL) {
+                       osc_cache_truncate_end(env, oio, cl2osc(obj));
+                       oio->oi_trunc = NULL;
+               }
+       }
+}
+
+static int osc_io_read_start(const struct lu_env *env,
+                            const struct cl_io_slice *slice)
+{
+       struct osc_io    *oio   = cl2osc_io(env, slice);
+       struct cl_object *obj   = slice->cis_obj;
+       struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+       int           result = 0;
+       ENTRY;
+
+       if (oio->oi_lockless == 0) {
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       attr->cat_atime = LTIME_S(CFS_CURRENT_TIME);
+                       result = cl_object_attr_set(env, obj, attr,
+                                                   CAT_ATIME);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       RETURN(result);
+}
+
+static int osc_io_write_start(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+       struct osc_io    *oio   = cl2osc_io(env, slice);
+       struct cl_object *obj   = slice->cis_obj;
+       struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+       int           result = 0;
+       ENTRY;
+
+       if (oio->oi_lockless == 0) {
+               OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1);
+               cl_object_attr_lock(obj);
+               result = cl_object_attr_get(env, obj, attr);
+               if (result == 0) {
+                       attr->cat_mtime = attr->cat_ctime =
+                               LTIME_S(CFS_CURRENT_TIME);
+                       result = cl_object_attr_set(env, obj, attr,
+                                                   CAT_MTIME | CAT_CTIME);
+               }
+               cl_object_attr_unlock(obj);
+       }
+       RETURN(result);
+}
+
+static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj,
+                        struct cl_fsync_io *fio)
+{
+       struct osc_io    *oio   = osc_env_io(env);
+       struct obdo      *oa    = &oio->oi_oa;
+       struct obd_info  *oinfo = &oio->oi_info;
+       struct lov_oinfo *loi   = obj->oo_oinfo;
+       struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+       int rc = 0;
+       ENTRY;
+
+       memset(oa, 0, sizeof(*oa));
+       oa->o_oi = loi->loi_oi;
+       oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+       /* reload size and blocks for start and end of sync range */
+       oa->o_size = fio->fi_start;
+       oa->o_blocks = fio->fi_end;
+       oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+       obdo_set_parent_fid(oa, fio->fi_fid);
+
+       memset(oinfo, 0, sizeof(*oinfo));
+       oinfo->oi_oa = oa;
+       oinfo->oi_capa = fio->fi_capa;
+       init_completion(&cbargs->opc_sync);
+
+       rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs,
+                          PTLRPCD_SET);
+       RETURN(rc);
+}
+
+static int osc_io_fsync_start(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+       struct cl_io       *io  = slice->cis_io;
+       struct cl_fsync_io *fio = &io->u.ci_fsync;
+       struct cl_object   *obj = slice->cis_obj;
+       struct osc_object  *osc = cl2osc(obj);
+       pgoff_t start  = cl_index(obj, fio->fi_start);
+       pgoff_t end    = cl_index(obj, fio->fi_end);
+       int     result = 0;
+       ENTRY;
+
+       if (fio->fi_end == OBD_OBJECT_EOF)
+               end = CL_PAGE_EOF;
+
+       result = osc_cache_writeback_range(env, osc, start, end, 0,
+                                          fio->fi_mode == CL_FSYNC_DISCARD);
+       if (result > 0) {
+               fio->fi_nr_written += result;
+               result = 0;
+       }
+       if (fio->fi_mode == CL_FSYNC_ALL) {
+               int rc;
+
+               /* we have to wait for writeback to finish before we can
+                * send the OST_SYNC RPC. This is bad because it causes
+                * extents to be written osc by osc. However, we usually
+                * start writeback before CL_FSYNC_ALL, so this is rarely a
+                * real problem. */
+               rc = osc_cache_wait_range(env, osc, start, end);
+               if (result == 0)
+                       result = rc;
+               rc = osc_fsync_ost(env, osc, fio);
+               if (result == 0)
+                       result = rc;
+       }
+
+       RETURN(result);
+}
+
+static void osc_io_fsync_end(const struct lu_env *env,
+                            const struct cl_io_slice *slice)
+{
+       struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync;
+       struct cl_object   *obj = slice->cis_obj;
+       pgoff_t start = cl_index(obj, fio->fi_start);
+       pgoff_t end   = cl_index(obj, fio->fi_end);
+       int result = 0;
+
+       if (fio->fi_mode == CL_FSYNC_LOCAL) {
+               result = osc_cache_wait_range(env, cl2osc(obj), start, end);
+       } else if (fio->fi_mode == CL_FSYNC_ALL) {
+               struct osc_io      *oio    = cl2osc_io(env, slice);
+               struct osc_async_cbargs *cbargs = &oio->oi_cbarg;
+
+               wait_for_completion(&cbargs->opc_sync);
+               if (result == 0)
+                       result = cbargs->opc_rc;
+       }
+       slice->cis_io->ci_result = result;
+}
+
+static void osc_io_end(const struct lu_env *env,
+                      const struct cl_io_slice *slice)
+{
+       struct osc_io *oio = cl2osc_io(env, slice);
+
+       if (oio->oi_active) {
+               osc_extent_release(env, oio->oi_active);
+               oio->oi_active = NULL;
+       }
+}
+
+static const struct cl_io_operations osc_io_ops = {
+       .op = {
+               [CIT_READ] = {
+                       .cio_start  = osc_io_read_start,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_WRITE] = {
+                       .cio_start  = osc_io_write_start,
+                       .cio_end    = osc_io_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_SETATTR] = {
+                       .cio_start  = osc_io_setattr_start,
+                       .cio_end    = osc_io_setattr_end
+               },
+               [CIT_FAULT] = {
+                       .cio_start  = osc_io_fault_start,
+                       .cio_end    = osc_io_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_FSYNC] = {
+                       .cio_start  = osc_io_fsync_start,
+                       .cio_end    = osc_io_fsync_end,
+                       .cio_fini   = osc_io_fini
+               },
+               [CIT_MISC] = {
+                       .cio_fini   = osc_io_fini
+               }
+       },
+       .req_op = {
+                [CRT_READ] = {
+                        .cio_submit    = osc_io_submit
+                },
+                [CRT_WRITE] = {
+                        .cio_submit    = osc_io_submit
+                }
+        },
+       .cio_prepare_write = osc_io_prepare_write,
+       .cio_commit_write  = osc_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+static int osc_req_prep(const struct lu_env *env,
+                       const struct cl_req_slice *slice)
+{
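+       /* nothing to prepare for an osc request */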
+       return 0;
+}
+
+static void osc_req_completion(const struct lu_env *env,
+                              const struct cl_req_slice *slice, int ioret)
+{
+       struct osc_req *or;
+
+       or = cl2osc_req(slice);
+       OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
+ * fields.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+                            const struct cl_req_slice *slice,
+                            const struct cl_object *obj,
+                            struct cl_req_attr *attr, obd_valid flags)
+{
+       struct lov_oinfo *oinfo;
+       struct cl_req    *clerq;
+       struct cl_page   *apage; /* _some_ page in @clerq */
+       struct cl_lock   *lock;  /* _some_ lock protecting @apage */
+       struct osc_lock  *olck;
+       struct osc_page  *opg;
+       struct obdo      *oa;
+       struct ost_lvb   *lvb;
+
+       oinfo   = cl2osc(obj)->oo_oinfo;
+       lvb     = &oinfo->loi_lvb;
+       oa      = attr->cra_oa;
+
+       if ((flags & OBD_MD_FLMTIME) != 0) {
+               oa->o_mtime = lvb->lvb_mtime;
+               oa->o_valid |= OBD_MD_FLMTIME;
+       }
+       if ((flags & OBD_MD_FLATIME) != 0) {
+               oa->o_atime = lvb->lvb_atime;
+               oa->o_valid |= OBD_MD_FLATIME;
+       }
+       if ((flags & OBD_MD_FLCTIME) != 0) {
+               oa->o_ctime = lvb->lvb_ctime;
+               oa->o_valid |= OBD_MD_FLCTIME;
+       }
+       if (flags & OBD_MD_FLGROUP) {
+               ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi));
+               oa->o_valid |= OBD_MD_FLGROUP;
+       }
+       if (flags & OBD_MD_FLID) {
+               ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi));
+               oa->o_valid |= OBD_MD_FLID;
+       }
+       if (flags & OBD_MD_FLHANDLE) {
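+               /* find a lock covering one of the request's pages so that
+                * its remote handle can be packed into the obdo */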
+               clerq = slice->crs_req;
+               LASSERT(!list_empty(&clerq->crq_pages));
+               apage = container_of(clerq->crq_pages.next,
+                                    struct cl_page, cp_flight);
+               opg = osc_cl_page_osc(apage);
+               apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+               lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+               if (lock == NULL) {
+                       struct cl_object_header *head;
+                       struct cl_lock    *scan;
+
+                       head = cl_object_header(apage->cp_obj);
+                       list_for_each_entry(scan, &head->coh_locks,
+                                               cll_linkage)
+                               CL_LOCK_DEBUG(D_ERROR, env, scan,
+                                             "no cover page!\n");
+                       CL_PAGE_DEBUG(D_ERROR, env, apage,
+                                     "dump uncover page!\n");
+                       libcfs_debug_dumpstack(NULL);
+                       LBUG();
+               }
+
+               olck = osc_lock_at(lock);
+               LASSERT(olck != NULL);
+               LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL));
+               /* check for lockless io. */
+               if (olck->ols_lock != NULL) {
+                       oa->o_handle = olck->ols_lock->l_remote_handle;
+                       oa->o_valid |= OBD_MD_FLHANDLE;
+               }
+               cl_lock_put(env, lock);
+       }
+}
+
+static const struct cl_req_operations osc_req_ops = {
+       .cro_prep       = osc_req_prep,
+       .cro_attr_set   = osc_req_attr_set,
+       .cro_completion = osc_req_completion
+};
+
+int osc_io_init(const struct lu_env *env,
+               struct cl_object *obj, struct cl_io *io)
+{
+       struct osc_io *oio = osc_env_io(env);
+
+       CL_IO_SLICE_CLEAN(oio, oi_cl);
+       cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+       return 0;
+}
+
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+                struct cl_req *req)
+{
+       struct osc_req *or;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, __GFP_IO);
+       if (or != NULL) {
+               cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c
new file mode 100644 (file)
index 0000000..640bc3d
--- /dev/null
@@ -0,0 +1,1663 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+# include <linux/libcfs/libcfs.h>
+/* fid_build_reg_res_name() */
+#include <lustre_fid.h>
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
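+/* sentinel value for osc_lock::ols_pageref; see the assertion in
+ * osc_lock_fini() */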
+#define _PAGEREF_MAGIC  (-10000000)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+static void osc_lock_to_lockless(const struct lu_env *env,
+                                struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
+
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+       return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops);
+}
+
+/**
+ * Returns a weak pointer to the ldlm lock identified by a handle. The
+ * returned pointer cannot be dereferenced, as the lock is not protected
+ * from concurrent reclaim. This function is a helper for osc_lock_invariant().
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+       struct ldlm_lock *lock;
+
+       lock = ldlm_handle2lock(handle);
+       if (lock != NULL)
+               LDLM_LOCK_PUT(lock);
+       return lock;
+}
+
+/**
+ * Invariant that has to be true all of the time.
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+       struct ldlm_lock *lock  = osc_handle_ptr(&ols->ols_handle);
+       struct ldlm_lock *olock       = ols->ols_lock;
+       int            handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+       return
+               ergo(osc_lock_is_lockless(ols),
+                    ols->ols_locklessable && ols->ols_lock == NULL)  ||
+               (ergo(olock != NULL, handle_used) &&
+                ergo(olock != NULL,
+                     olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
+                /*
+                 * Check that ->ols_handle and ->ols_lock are consistent, but
+                 * take into account that they are set at different times.
+                 */
+                ergo(handle_used,
+                     ergo(lock != NULL && olock != NULL, lock == olock) &&
+                     ergo(lock == NULL, olock == NULL)) &&
+                ergo(ols->ols_state == OLS_CANCELLED,
+                     olock == NULL && !handle_used) &&
+                /*
+                 * DLM lock is destroyed only after we have seen cancellation
+                 * ast.
+                 */
+                ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+                     !olock->l_destroyed) &&
+                ergo(ols->ols_state == OLS_GRANTED,
+                     olock != NULL &&
+                     olock->l_req_mode == olock->l_granted_mode &&
+                     ols->ols_hold));
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks a link between osc_lock and dlm_lock.
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+       struct ldlm_lock *dlmlock;
+
+       spin_lock(&osc_ast_guard);
+       dlmlock = olck->ols_lock;
+       if (dlmlock == NULL) {
+               spin_unlock(&osc_ast_guard);
+               return;
+       }
+
+       olck->ols_lock = NULL;
+       /* wb(); --- for all who check (ols->ols_lock != NULL) before
+        * calling osc_lock_detach() */
+       dlmlock->l_ast_data = NULL;
+       olck->ols_handle.cookie = 0ULL;
+       spin_unlock(&osc_ast_guard);
+
+       lock_res_and_lock(dlmlock);
+       if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
+               struct cl_object *obj = olck->ols_cl.cls_obj;
+               struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+               __u64 old_kms;
+
+               cl_object_attr_lock(obj);
+               /* Must get the value under the lock to avoid possible races. */
+               old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+               /* Update the kms. Need to loop over all granted locks.
+                * Not a problem for the client */
+               attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+
+               cl_object_attr_set(env, obj, attr, CAT_KMS);
+               cl_object_attr_unlock(obj);
+       }
+       unlock_res_and_lock(dlmlock);
+
+       /* release a reference taken in osc_lock_upcall0(). */
+       LASSERT(olck->ols_has_ref);
+       lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+       LDLM_LOCK_RELEASE(dlmlock);
+       olck->ols_has_ref = 0;
+}
+
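+/* release the mode reference taken when the lock was enqueued, if still held */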
+static int osc_lock_unhold(struct osc_lock *ols)
+{
+       int result = 0;
+
+       if (ols->ols_hold) {
+               ols->ols_hold = 0;
+               result = osc_cancel_base(&ols->ols_handle,
+                                        ols->ols_einfo.ei_mode);
+       }
+       return result;
+}
+
+static int osc_lock_unuse(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(ols));
+
+       switch (ols->ols_state) {
+       case OLS_NEW:
+               LASSERT(!ols->ols_hold);
+               LASSERT(ols->ols_agl);
+               return 0;
+       case OLS_UPCALL_RECEIVED:
+               osc_lock_unhold(ols);
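+               /* fall through */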
+       case OLS_ENQUEUED:
+               LASSERT(!ols->ols_hold);
+               osc_lock_detach(env, ols);
+               ols->ols_state = OLS_NEW;
+               return 0;
+       case OLS_GRANTED:
+               LASSERT(!ols->ols_glimpse);
+               LASSERT(ols->ols_hold);
+               /*
+                * Move lock into OLS_RELEASED state before calling
+                * osc_cancel_base() so that possible synchronous cancellation
+                * (which always happens, e.g., for liblustre) sees that the
+                * lock is released.
+                */
+               ols->ols_state = OLS_RELEASED;
+               return osc_lock_unhold(ols);
+       default:
+               CERROR("Impossible state: %d\n", ols->ols_state);
+               LBUG();
+       }
+}
+
+static void osc_lock_fini(const struct lu_env *env,
+                         struct cl_lock_slice *slice)
+{
+       struct osc_lock  *ols = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(ols));
+       /*
+        * ->ols_hold can still be true at this point if, for example, a
+        * thread that requested a lock was killed (and released a reference
+        * to the lock), before reply from a server was received. In this case
+        * lock is destroyed immediately after upcall.
+        */
+       osc_lock_unhold(ols);
+       LASSERT(ols->ols_lock == NULL);
+       LASSERT(atomic_read(&ols->ols_pageref) == 0 ||
+               atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC);
+
+       OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
+}
+
+static void osc_lock_build_policy(const struct lu_env *env,
+                                 const struct cl_lock *lock,
+                                 ldlm_policy_data_t *policy)
+{
+       const struct cl_lock_descr *d = &lock->cll_descr;
+
+       osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end);
+       policy->l_extent.gid = d->cld_gid;
+}
+
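+/* translate cl_lock enqueue flags (CEF_*) into ldlm flags */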
+static __u64 osc_enq2ldlm_flags(__u32 enqflags)
+{
+       __u64 result = 0;
+
+       LASSERT((enqflags & ~CEF_MASK) == 0);
+
+       if (enqflags & CEF_NONBLOCK)
+               result |= LDLM_FL_BLOCK_NOWAIT;
+       if (enqflags & CEF_ASYNC)
+               result |= LDLM_FL_HAS_INTENT;
+       if (enqflags & CEF_DISCARD_DATA)
+               result |= LDLM_AST_DISCARD_DATA;
+       return result;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data
+ * pointers. Initialized in osc_init().
+ */
+spinlock_t osc_ast_guard;
+
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+       struct osc_lock *olck;
+
+       lock_res_and_lock(dlm_lock);
+       spin_lock(&osc_ast_guard);
+       olck = dlm_lock->l_ast_data;
+       if (olck != NULL) {
+               struct cl_lock *lock = olck->ols_cl.cls_lock;
+               /*
+                * If osc_lock holds a reference on ldlm lock, return it even
+                * when cl_lock is in CLS_FREEING state. This way
+                *
+                *       osc_ast_data_get(dlmlock) == NULL
+                *
+                * guarantees that all osc references on dlmlock were
+                * released. osc_dlm_blocking_ast0() relies on that.
+                */
+               if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) {
+                       cl_lock_get_trust(lock);
+                       lu_ref_add_atomic(&lock->cll_reference,
+                                         "ast", current);
+               } else
+                       olck = NULL;
+       }
+       spin_unlock(&osc_ast_guard);
+       unlock_res_and_lock(dlm_lock);
+       return olck;
+}
+
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+       struct cl_lock *lock;
+
+       lock = olck->ols_cl.cls_lock;
+       lu_ref_del(&lock->cll_reference, "ast", current);
+       cl_lock_put(env, lock);
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic.
+ *
+ * This can be optimized to not update attributes when the lock is a result
+ * of a local match.
+ *
+ * Called under lock and resource spin-locks.
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+                               int rc)
+{
+       struct ost_lvb    *lvb;
+       struct cl_object  *obj;
+       struct lov_oinfo  *oinfo;
+       struct cl_attr    *attr;
+       unsigned           valid;
+
+       ENTRY;
+
+       if (!(olck->ols_flags & LDLM_FL_LVB_READY))
+               RETURN_EXIT;
+
+       lvb   = &olck->ols_lvb;
+       obj   = olck->ols_cl.cls_obj;
+       oinfo = cl2osc(obj)->oo_oinfo;
+       attr  = &osc_env_info(env)->oti_attr;
+       valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+       cl_lvb2attr(attr, lvb);
+
+       cl_object_attr_lock(obj);
+       if (rc == 0) {
+               struct ldlm_lock  *dlmlock;
+               __u64 size;
+
+               dlmlock = olck->ols_lock;
+               LASSERT(dlmlock != NULL);
+
+               /* re-grab LVB from a dlm lock under DLM spin-locks. */
+               *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+               size = lvb->lvb_size;
+               /* Extend KMS up to the end of this lock and no further.
+                * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+               if (size > dlmlock->l_policy_data.l_extent.end)
+                       size = dlmlock->l_policy_data.l_extent.end + 1;
+               if (size >= oinfo->loi_kms) {
+                       LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64
+                                  ", kms="LPU64, lvb->lvb_size, size);
+                       valid |= CAT_KMS;
+                       attr->cat_kms = size;
+               } else {
+                       LDLM_DEBUG(dlmlock, "lock acquired, setting rss="
+                                  LPU64"; leaving kms="LPU64", end="LPU64,
+                                  lvb->lvb_size, oinfo->loi_kms,
+                                  dlmlock->l_policy_data.l_extent.end);
+               }
+               ldlm_lock_allow_match_locked(dlmlock);
+       } else if (rc == -ENAVAIL && olck->ols_glimpse) {
+               CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                      " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
+       } else
+               valid = 0;
+
+       if (valid != 0)
+               cl_object_attr_set(env, obj, attr, valid);
+
+       cl_object_attr_unlock(obj);
+
+       EXIT;
+}
+
+/**
+ * Called when a lock is granted, from an upcall (when the server returned a
+ * granted lock), or from a completion AST, when the server returned a
+ * blocked lock.
+ *
+ * Called under lock and resource spin-locks, which are released temporarily
+ * here.
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+                            struct ldlm_lock *dlmlock, int rc)
+{
+       struct ldlm_extent   *ext;
+       struct cl_lock       *lock;
+       struct cl_lock_descr *descr;
+
+       LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+       ENTRY;
+       if (olck->ols_state < OLS_GRANTED) {
+               lock  = olck->ols_cl.cls_lock;
+               ext   = &dlmlock->l_policy_data.l_extent;
+               descr = &osc_env_info(env)->oti_descr;
+               descr->cld_obj = lock->cll_descr.cld_obj;
+
+               /* XXX check that ->l_granted_mode is valid. */
+               descr->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+               descr->cld_start = cl_index(descr->cld_obj, ext->start);
+               descr->cld_end   = cl_index(descr->cld_obj, ext->end);
+               descr->cld_gid   = ext->gid;
+               /*
+                * tell upper layers the extent of the lock that was actually
+                * granted
+                */
+               olck->ols_state = OLS_GRANTED;
+               osc_lock_lvb_update(env, olck, rc);
+
+               /* release DLM spin-locks to allow cl_lock_{modify,signal}()
+                * to take a semaphore on a parent lock. This is safe, because
+                * spin-locks are needed to protect consistency of
+                * dlmlock->l_*_mode and LVB, and we have finished processing
+                * them. */
+               unlock_res_and_lock(dlmlock);
+               cl_lock_modify(env, lock, descr);
+               cl_lock_signal(env, lock);
+               LINVRNT(osc_lock_invariant(olck));
+               lock_res_and_lock(dlmlock);
+       }
+       EXIT;
+}
+
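+/*
+ * Attach the ldlm lock received from the server to its osc_lock, and take
+ * the references that osc_lock_detach() will later drop.
+ */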
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+{
+       struct ldlm_lock *dlmlock;
+
+       ENTRY;
+
+       dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+       LASSERT(dlmlock != NULL);
+
+       lock_res_and_lock(dlmlock);
+       spin_lock(&osc_ast_guard);
+       LASSERT(dlmlock->l_ast_data == olck);
+       LASSERT(olck->ols_lock == NULL);
+       olck->ols_lock = dlmlock;
+       spin_unlock(&osc_ast_guard);
+
+       /*
+        * Lock might not be granted yet. In this case, the completion ast
+        * (osc_ldlm_completion_ast()) comes later and finishes lock
+        * granting.
+        */
+       if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+               osc_lock_granted(env, olck, dlmlock, 0);
+       unlock_res_and_lock(dlmlock);
+
+       /*
+        * osc_enqueue_interpret() decrefs asynchronous locks, counter
+        * this.
+        */
+       ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+       olck->ols_hold = 1;
+
+       /* lock reference taken by ldlm_handle2lock_long() is owned by
+        * osc_lock and released in osc_lock_detach() */
+       lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+       olck->ols_has_ref = 1;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to an ENQUEUE
+ * rpc is received from a server, or after osc_enqueue_base() matched a local
+ * DLM lock.
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+       struct osc_lock  *olck  = cookie;
+       struct cl_lock_slice    *slice = &olck->ols_cl;
+       struct cl_lock    *lock  = slice->cls_lock;
+       struct lu_env      *env;
+       struct cl_env_nest       nest;
+
+       ENTRY;
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               int rc;
+
+               cl_lock_mutex_get(env, lock);
+
+               LASSERT(lock->cll_state >= CLS_QUEUING);
+               if (olck->ols_state == OLS_ENQUEUED) {
+                       olck->ols_state = OLS_UPCALL_RECEIVED;
+                       rc = ldlm_error2errno(errcode);
+               } else if (olck->ols_state == OLS_CANCELLED) {
+                       rc = -EIO;
+               } else {
+                       CERROR("Impossible state: %d\n", olck->ols_state);
+                       LBUG();
+               }
+               if (rc) {
+                       struct ldlm_lock *dlmlock;
+
+                       dlmlock = ldlm_handle2lock(&olck->ols_handle);
+                       if (dlmlock != NULL) {
+                               lock_res_and_lock(dlmlock);
+                               spin_lock(&osc_ast_guard);
+                               LASSERT(olck->ols_lock == NULL);
+                               dlmlock->l_ast_data = NULL;
+                               olck->ols_handle.cookie = 0ULL;
+                               spin_unlock(&osc_ast_guard);
+                               ldlm_lock_fail_match_locked(dlmlock);
+                               unlock_res_and_lock(dlmlock);
+                               LDLM_LOCK_PUT(dlmlock);
+                       }
+               } else {
+                       if (olck->ols_glimpse)
+                               olck->ols_glimpse = 0;
+                       osc_lock_upcall0(env, olck);
+               }
+
+               /* Error handling, some errors are tolerable. */
+               if (olck->ols_locklessable && rc == -EUSERS) {
+                       /* This is a tolerable error, turn this lock into a
+                        * lockless lock.
+                        */
+                       osc_object_set_contended(cl2osc(slice->cls_obj));
+                       LASSERT(slice->cls_ops == &osc_lock_ops);
+
+                       /* Change this lock to an ldlmlock-less lock. */
+                       osc_lock_to_lockless(env, olck, 1);
+                       olck->ols_state = OLS_GRANTED;
+                       rc = 0;
+               } else if (olck->ols_glimpse && rc == -ENAVAIL) {
+                       osc_lock_lvb_update(env, olck, rc);
+                       cl_lock_delete(env, lock);
+                       /* Hide the error. */
+                       rc = 0;
+               }
+
+               if (rc == 0) {
+                       /* For AGL case, the RPC sponsor may exits the cl_lock
+                       *  processing without wait() called before related OSC
+                       *  lock upcall(). So update the lock status according
+                       *  to the enqueue result inside AGL upcall(). */
+                       if (olck->ols_agl) {
+                               lock->cll_flags |= CLF_FROM_UPCALL;
+                               cl_wait_try(env, lock);
+                               lock->cll_flags &= ~CLF_FROM_UPCALL;
+                               if (!olck->ols_glimpse)
+                                       olck->ols_agl = 0;
+                       }
+                       cl_lock_signal(env, lock);
+                       /* del user for lock upcall cookie */
+                       cl_unuse_try(env, lock);
+               } else {
+                       /* del user for lock upcall cookie */
+                       cl_lock_user_del(env, lock);
+                       cl_lock_error(env, lock, rc);
+               }
+
+               /* release cookie reference, acquired by osc_lock_enqueue() */
+               cl_lock_hold_release(env, lock, "upcall", lock);
+               cl_lock_mutex_put(env, lock);
+
+               lu_ref_del(&lock->cll_reference, "upcall", lock);
+               /* This may be the last reference, so it must be dropped
+                * after cl_lock_mutex_put(). */
+               cl_lock_put(env, lock);
+
+               cl_env_nested_put(&nest, env);
+       } else {
+               /* should never happen, similar to osc_ldlm_blocking_ast(). */
+               LBUG();
+       }
+       RETURN(errcode);
+}
+
+/**
+ * Core of osc_dlm_blocking_ast() logic.
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+                             struct ldlm_lock *dlmlock,
+                             struct osc_lock *olck, int blocking)
+{
+       struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+       LASSERT(olck->ols_lock == dlmlock);
+       CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+       LASSERT(!osc_lock_is_lockless(olck));
+
+       /*
+        * Lock might still be addref-ed here if, e.g., a blocking ast
+        * is sent for a failed lock.
+        */
+       osc_lock_unhold(olck);
+
+       if (blocking && olck->ols_state < OLS_BLOCKED)
+               /*
+                * Move osc_lock into OLS_BLOCKED before canceling the lock,
+                * because it recursively re-enters osc_lock_blocking(), with
+                * the state set to OLS_CANCELLED.
+                */
+               olck->ols_state = OLS_BLOCKED;
+       /*
+        * cancel and destroy the lock at least once no matter how the
+        * blocking ast is entered (see comment above osc_ldlm_blocking_ast()
+        * for use cases). cl_lock_cancel() and cl_lock_delete() are
+        * idempotent.
+        */
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+}
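+
+/*
+ * A minimal sketch (hypothetical helper, compiled out) of the recursion the
+ * OLS_BLOCKED transition above guards against: cancellation re-enters
+ * osc_lock_blocking() through the CANCELING ast, and the second pass finds
+ * the state already advanced while cancel/delete are idempotent.
+ */
+#if 0
+static void osc_lock_blocking_example(const struct lu_env *env,
+                                      struct ldlm_lock *dlmlock,
+                                      struct osc_lock *olck)
+{
+       /* blocking ast: moves olck to OLS_BLOCKED, then cancels ... */
+       osc_lock_blocking(env, dlmlock, olck, 1);
+       /*
+        * ... cl_lock_cancel() -> ldlm_cli_cancel() ->
+        * osc_ldlm_blocking_ast(..., LDLM_CB_CANCELING) ->
+        * osc_lock_blocking(env, dlmlock, olck, 0): re-entry is harmless.
+        */
+}
+#endif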
+
+/**
+ * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock
+ * and ldlm_lock caches.
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+                                struct ldlm_lock *dlmlock,
+                                void *data, int flag)
+{
+       struct osc_lock *olck;
+       struct cl_lock  *lock;
+       int result;
+       int cancel;
+
+       LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+       cancel = 0;
+       olck = osc_ast_data_get(dlmlock);
+       if (olck != NULL) {
+               lock = olck->ols_cl.cls_lock;
+               cl_lock_mutex_get(env, lock);
+               LINVRNT(osc_lock_invariant(olck));
+               if (olck->ols_ast_wait) {
+                       /* wake up osc_lock_use() */
+                       cl_lock_signal(env, lock);
+                       olck->ols_ast_wait = 0;
+               }
+               /*
+                * Lock might have been canceled while this thread was
+                * sleeping for lock mutex, but olck is pinned in memory.
+                */
+               if (olck == dlmlock->l_ast_data) {
+                       /*
+                        * NOTE: DLM sends blocking AST's for failed locks
+                        *       (that are still in pre-OLS_GRANTED state)
+                        *       too, and they have to be canceled otherwise
+                        *       DLM lock is never destroyed and stuck in
+                        *       the memory.
+                        *
+                        *       Alternatively, ldlm_cli_cancel() can be
+                        *       called here directly for osc_locks with
+                        *       ols_state < OLS_GRANTED to maintain an
+                        *       invariant that ->clo_cancel() is only called
+                        *       for locks that were granted.
+                        */
+                       LASSERT(data == olck);
+                       osc_lock_blocking(env, dlmlock,
+                                         olck, flag == LDLM_CB_BLOCKING);
+               } else
+                       cancel = 1;
+               cl_lock_mutex_put(env, lock);
+               osc_ast_data_put(env, olck);
+       } else
+               /*
+                * DLM lock exists, but there is no cl_lock attached to it.
+                * This is a `normal' race. cl_object and its cl_lock's can be
+                * removed by memory pressure, together with all pages.
+                */
+               cancel = (flag == LDLM_CB_BLOCKING);
+
+       if (cancel) {
+               struct lustre_handle *lockh;
+
+               lockh = &osc_env_info(env)->oti_handle;
+               ldlm_lock2handle(dlmlock, lockh);
+               result = ldlm_cli_cancel(lockh, LCF_ASYNC);
+       } else
+               result = 0;
+       return result;
+}
+
+/**
+ * Blocking ast invoked by ldlm when dlm lock is either blocking progress of
+ * some other lock, or is canceled. This function is installed as a
+ * ldlm_lock::l_blocking_ast() for client extent locks.
+ *
+ * Control flow is tricky, because ldlm uses the same call-back
+ * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
+ *
+ * \param dlmlock lock for which ast occurred.
+ *
+ * \param new description of a conflicting lock in case of blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
+ *          cancellation and blocking ast's.
+ *
+ * Possible use cases:
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ *       lock due to lock lru pressure, or explicit user request to purge
+ *       locks.
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
+ *       us that dlmlock conflicts with another lock that some client is
+ *       enqueuing. Lock is canceled.
+ *
+ *        - cl_lock_cancel() is called. osc_lock_cancel() calls
+ *          ldlm_cli_cancel() that calls
+ *
+ *               dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ *          recursively entering osc_ldlm_blocking_ast().
+ *
+ *     - client cancels lock voluntarily (e.g., as part of early cancellation):
+ *
+ *        cl_lock_cancel()->
+ *          osc_lock_cancel()->
+ *            ldlm_cli_cancel()->
+ *              dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+                                struct ldlm_lock_desc *new, void *data,
+                                int flag)
+{
+       struct lu_env     *env;
+       struct cl_env_nest nest;
+       int             result;
+
+       /*
+        * This can be called in the context of outer IO, e.g.,
+        *
+        *     cl_enqueue()->...
+        *       ->osc_enqueue_base()->...
+        *       ->ldlm_prep_elc_req()->...
+        *         ->ldlm_cancel_callback()->...
+        *           ->osc_ldlm_blocking_ast()
+        *
+        * A new environment has to be created so as not to corrupt the
+        * outer context.
+        */
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+               cl_env_nested_put(&nest, env);
+       } else {
+               result = PTR_ERR(env);
+               /*
+                * XXX This should never happen, as cl_lock is
+                * stuck. Pre-allocated environment a la vvp_inode_fini_env
+                * should be used.
+                */
+               LBUG();
+       }
+       if (result != 0) {
+               if (result == -ENODATA)
+                       result = 0;
+               else
+                       CERROR("BAST failed: %d\n", result);
+       }
+       return result;
+}
+
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+                                  __u64 flags, void *data)
+{
+       struct cl_env_nest nest;
+       struct lu_env     *env;
+       struct osc_lock   *olck;
+       struct cl_lock    *lock;
+       int result;
+       int dlmrc;
+
+       /* first, do dlm part of the work */
+       dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+       /* then, notify cl_lock */
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               olck = osc_ast_data_get(dlmlock);
+               if (olck != NULL) {
+                       lock = olck->ols_cl.cls_lock;
+                       cl_lock_mutex_get(env, lock);
+                       /*
+                        * ldlm_handle_cp_callback() copied LVB from request
+                        * to lock->l_lvb_data, store it in osc_lock.
+                        */
+                       LASSERT(dlmlock->l_lvb_data != NULL);
+                       lock_res_and_lock(dlmlock);
+                       olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+                       if (olck->ols_lock == NULL) {
+                               /*
+                                * upcall (osc_lock_upcall()) hasn't yet been
+                                * called. Do nothing now, upcall will bind
+                                * olck to dlmlock and signal the waiters.
+                                *
+                                * This maintains an invariant that osc_lock
+                                * and ldlm_lock are always bound when
+                                * osc_lock is in OLS_GRANTED state.
+                                */
+                       } else if (dlmlock->l_granted_mode ==
+                                  dlmlock->l_req_mode) {
+                               osc_lock_granted(env, olck, dlmlock, dlmrc);
+                       }
+                       unlock_res_and_lock(dlmlock);
+
+                       if (dlmrc != 0) {
+                               CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                             "dlmlock returned %d\n", dlmrc);
+                               cl_lock_error(env, lock, dlmrc);
+                       }
+                       cl_lock_mutex_put(env, lock);
+                       osc_ast_data_put(env, olck);
+                       result = 0;
+               } else
+                       result = -ELDLM_NO_LOCK_DATA;
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       return dlmrc ?: result;
+}
+
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+       struct ptlrpc_request  *req  = data;
+       struct osc_lock *olck;
+       struct cl_lock   *lock;
+       struct cl_object       *obj;
+       struct cl_env_nest      nest;
+       struct lu_env     *env;
+       struct ost_lvb   *lvb;
+       struct req_capsule     *cap;
+       int                  result;
+
+       LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               /* osc_ast_data_get() has to go after the environment is
+                * allocated, because osc_ast_data_get() acquires a
+                * reference to a lock, and that reference can only be
+                * released with an environment.
+                */
+               olck = osc_ast_data_get(dlmlock);
+               if (olck != NULL) {
+                       lock = olck->ols_cl.cls_lock;
+                       /* Do not grab the mutex of cl_lock for glimpse.
+                        * See LU-1274 for details.
+                        * BTW, it's okay for cl_lock to be cancelled during
+                        * this period because server can handle this race.
+                        * See ldlm_server_glimpse_ast() for details.
+                        * cl_lock_mutex_get(env, lock); */
+                       cap = &req->rq_pill;
+                       req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK);
+                       req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER,
+                                            sizeof *lvb);
+                       result = req_capsule_server_pack(cap);
+                       if (result == 0) {
+                               lvb = req_capsule_server_get(cap, &RMF_DLM_LVB);
+                               obj = lock->cll_descr.cld_obj;
+                               result = cl_object_glimpse(env, obj, lvb);
+                       }
+                       if (!exp_connect_lvb_type(req->rq_export))
+                               req_capsule_shrink(&req->rq_pill,
+                                                  &RMF_DLM_LVB,
+                                                  sizeof(struct ost_lvb_v1),
+                                                  RCL_SERVER);
+                       osc_ast_data_put(env, olck);
+               } else {
+                       /*
+                        * These errors are normal races, so we don't want to
+                        * fill the console with messages by calling
+                        * ptlrpc_error()
+                        */
+                       lustre_pack_reply(req, 1, NULL, NULL);
+                       result = -ELDLM_NO_LOCK_DATA;
+               }
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       req->rq_status = result;
+       return result;
+}
+
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice)
+{
+       /*
+        * No need to grab coh_page_guard since we don't care about the
+        * exact number of pages.
+        */
+       return cl_object_header(slice->cls_obj)->coh_pages;
+}
+
+/**
+ * Get the weight of dlm lock for early cancellation.
+ *
+ * XXX: it should return the pages covered by this \a dlmlock.
+ */
+static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+       struct cl_env_nest       nest;
+       struct lu_env      *env;
+       struct osc_lock  *lock;
+       struct cl_lock    *cll;
+       unsigned long       weight;
+       ENTRY;
+
+       might_sleep();
+       /*
+        * osc_ldlm_weigh_ast() has a complex context since it might be
+        * called because of lock canceling, or from user input. We have to
+        * make a new environment for it. It is probably safe to reuse the
+        * upper context, because cl_lock_put() doesn't modify environment
+        * variables, but create a new one just in case.
+        */
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               /* Mostly because of lack of memory; tend to eliminate
+                * this lock. */
+               RETURN(0);
+
+       LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+       lock = osc_ast_data_get(dlmlock);
+       if (lock == NULL) {
+               /* cl_lock was destroyed because of memory pressure.
+                * It is reasonable to assign this type of lock
+                * a lower cost.
+                */
+               GOTO(out, weight = 0);
+       }
+
+       cll = lock->ols_cl.cls_lock;
+       cl_lock_mutex_get(env, cll);
+       weight = cl_lock_weigh(env, cll);
+       cl_lock_mutex_put(env, cll);
+       osc_ast_data_put(env, lock);
+       EXIT;
+
+out:
+       cl_env_nested_put(&nest, env);
+       return weight;
+}
+
+static void osc_lock_build_einfo(const struct lu_env *env,
+                                const struct cl_lock *clock,
+                                struct osc_lock *lock,
+                                struct ldlm_enqueue_info *einfo)
+{
+       enum cl_lock_mode mode;
+
+       mode = clock->cll_descr.cld_mode;
+       if (mode == CLM_PHANTOM)
+               /*
+                * For now, enqueue all glimpse locks in read mode. In the
+                * future, client might choose to enqueue LCK_PW lock for
+                * glimpse on a file opened for write.
+                */
+               mode = CLM_READ;
+
+       einfo->ei_type   = LDLM_EXTENT;
+       einfo->ei_mode   = osc_cl_lock2ldlm(mode);
+       einfo->ei_cb_bl  = osc_ldlm_blocking_ast;
+       einfo->ei_cb_cp  = osc_ldlm_completion_ast;
+       einfo->ei_cb_gl  = osc_ldlm_glimpse_ast;
+       einfo->ei_cb_wg  = osc_ldlm_weigh_ast;
+       einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */
+}
+
+/**
+ * Determine if the lock should be converted into a lockless lock.
+ *
+ * Steps to check:
+ * - whether the lock has an explicit requirement for a non-lockless lock;
+ * - the io lock request type ci_lockreq;
+ * - send the enqueue rpc to ost to make the final decision;
+ * - special treatment for lockless truncate.
+ *
+ *  Additional policy can be implemented here, e.g., never do lockless-io
+ *  for large extents.
+ */
+static void osc_lock_to_lockless(const struct lu_env *env,
+                                struct osc_lock *ols, int force)
+{
+       struct cl_lock_slice *slice = &ols->ols_cl;
+
+       LASSERT(ols->ols_state == OLS_NEW ||
+               ols->ols_state == OLS_UPCALL_RECEIVED);
+
+       if (force) {
+               ols->ols_locklessable = 1;
+               slice->cls_ops = &osc_lock_lockless_ops;
+       } else {
+               struct osc_io *oio     = osc_env_io(env);
+               struct cl_io  *io      = oio->oi_cl.cis_io;
+               struct cl_object *obj  = slice->cls_obj;
+               struct osc_object *oob = cl2osc(obj);
+               const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+               struct obd_connect_data *ocd;
+
+               LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+                       io->ci_lockreq == CILR_MAYBE ||
+                       io->ci_lockreq == CILR_NEVER);
+
+               ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+               ols->ols_locklessable = (io->ci_type != CIT_SETATTR) &&
+                               (io->ci_lockreq == CILR_MAYBE) &&
+                               (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+               if (io->ci_lockreq == CILR_NEVER ||
+                       /* lockless IO */
+                   (ols->ols_locklessable && osc_object_is_contended(oob)) ||
+                       /* lockless truncate */
+                   (cl_io_is_trunc(io) &&
+                    (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+                     osd->od_lockless_truncate)) {
+                       ols->ols_locklessable = 1;
+                       slice->cls_ops = &osc_lock_lockless_ops;
+               }
+       }
+       LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+}
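+
+/*
+ * A minimal sketch (hypothetical helper, compiled out) of the policy above:
+ * with force != 0 (the -EUSERS path in osc_lock_upcall()) the conversion is
+ * unconditional; with force == 0 it additionally requires CILR_MAYBE, an
+ * OBD_CONNECT_SRVLOCK-capable server, and a contended object.
+ */
+#if 0
+static void osc_lockless_policy_example(const struct lu_env *env,
+                                        struct osc_lock *ols)
+{
+       /* mirror of the -EUSERS path in osc_lock_upcall() */
+       osc_object_set_contended(cl2osc(ols->ols_cl.cls_obj));
+       osc_lock_to_lockless(env, ols, 1);      /* cls_ops now lockless */
+}
+#endif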
+
+static int osc_lock_compatible(const struct osc_lock *qing,
+                              const struct osc_lock *qed)
+{
+       enum cl_lock_mode qing_mode;
+       enum cl_lock_mode qed_mode;
+
+       qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode;
+       if (qed->ols_glimpse &&
+           (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ))
+               return 1;
+
+       qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode;
+       return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ));
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *
+ *     - early cancel all conflicting locks before starting IO, and
+ *
+ *     - guarantee that pages added to the page cache by lockless IO are never
+ *       covered by locks other than lockless IO lock, and, hence, are not
+ *       visible to other threads.
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+                                const struct osc_lock *olck)
+{
+       struct cl_lock    *lock    = olck->ols_cl.cls_lock;
+       struct cl_lock_descr    *descr   = &lock->cll_descr;
+       struct cl_object_header *hdr     = cl_object_header(descr->cld_obj);
+       struct cl_lock    *scan;
+       struct cl_lock    *conflict = NULL;
+       int lockless                 = osc_lock_is_lockless(olck);
+       int rc                     = 0;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+
+       /* let a glimpse lock enqueue anyway, because we actually
+        * don't need to cancel any conflicting locks. */
+       if (olck->ols_glimpse)
+               return 0;
+
+       spin_lock(&hdr->coh_lock_guard);
+       list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+               struct cl_lock_descr *cld = &scan->cll_descr;
+               const struct osc_lock *scan_ols;
+
+               if (scan == lock)
+                       break;
+
+               if (scan->cll_state < CLS_QUEUING ||
+                   scan->cll_state == CLS_FREEING ||
+                   cld->cld_start > descr->cld_end ||
+                   cld->cld_end < descr->cld_start)
+                       continue;
+
+               /* overlapped and living locks. */
+
+               /* We're not supposed to give up a group lock. */
+               if (scan->cll_descr.cld_mode == CLM_GROUP) {
+                       LASSERT(descr->cld_mode != CLM_GROUP ||
+                               descr->cld_gid != scan->cll_descr.cld_gid);
+                       continue;
+               }
+
+               scan_ols = osc_lock_at(scan);
+
+               /* We need to cancel the compatible locks if we're enqueuing
+                * a lockless lock, for example:
+                * imagine that client has PR lock on [0, 1000], and thread T0
+                * is doing lockless IO in [500, 1500] region. Concurrent
+                * thread T1 can see lockless data in [500, 1000], which is
+                * wrong, because that data is possibly stale. */
+               if (!lockless && osc_lock_compatible(olck, scan_ols))
+                       continue;
+
+               cl_lock_get_trust(scan);
+               conflict = scan;
+               break;
+       }
+       spin_unlock(&hdr->coh_lock_guard);
+
+       if (conflict) {
+               if (lock->cll_descr.cld_mode == CLM_GROUP) {
+                       /* we want a group lock but a previous lock request
+                        * conflicts, so we do not wait but return 0 so the
+                        * request is sent to the server
+                        */
+                       CDEBUG(D_DLMTRACE, "group lock %p is conflicted "
+                                          "with %p, no wait, send to server\n",
+                              lock, conflict);
+                       cl_lock_put(env, conflict);
+                       rc = 0;
+               } else {
+                       CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, "
+                                          "will wait\n",
+                              lock, conflict);
+                       LASSERT(lock->cll_conflict == NULL);
+                       lu_ref_add(&conflict->cll_reference, "cancel-wait",
+                                  lock);
+                       lock->cll_conflict = conflict;
+                       rc = CLO_WAIT;
+               }
+       }
+       RETURN(rc);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ *     - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *
+ *     - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received.
+ *
+ * This function does not wait for the network communication to complete.
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *unused, __u32 enqflags)
+{
+       struct osc_lock   *ols     = cl2osc_lock(slice);
+       struct cl_lock     *lock    = ols->ols_cl.cls_lock;
+       int result;
+       ENTRY;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LASSERTF(ols->ols_state == OLS_NEW,
+                "Impossible state: %d\n", ols->ols_state);
+
+       LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ),
+               "lock = %p, ols = %p\n", lock, ols);
+
+       result = osc_lock_enqueue_wait(env, ols);
+       if (result == 0) {
+               if (!osc_lock_is_lockless(ols)) {
+                       struct osc_object       *obj = cl2osc(slice->cls_obj);
+                       struct osc_thread_info   *info = osc_env_info(env);
+                       struct ldlm_res_id       *resname = &info->oti_resname;
+                       ldlm_policy_data_t       *policy = &info->oti_policy;
+                       struct ldlm_enqueue_info *einfo = &ols->ols_einfo;
+
+                       /* lock will be passed as upcall cookie,
+                        * hold a ref to prevent it from being released. */
+                       cl_lock_hold_add(env, lock, "upcall", lock);
+                       /* a user for lock also */
+                       cl_lock_user_add(env, lock);
+                       ols->ols_state = OLS_ENQUEUED;
+
+                       /*
+                        * XXX: this is a possible blocking point as
+                        * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+                        * LDLM_CP_CALLBACK.
+                        */
+                       ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname);
+                       osc_lock_build_policy(env, lock, policy);
+                       result = osc_enqueue_base(osc_export(obj), resname,
+                                         &ols->ols_flags, policy,
+                                         &ols->ols_lvb,
+                                         obj->oo_oinfo->loi_kms_valid,
+                                         osc_lock_upcall,
+                                         ols, einfo, &ols->ols_handle,
+                                         PTLRPCD_SET, 1, ols->ols_agl);
+                       if (result != 0) {
+                               cl_lock_user_del(env, lock);
+                               cl_lock_unhold(env, lock, "upcall", lock);
+                               if (unlikely(result == -ECANCELED)) {
+                                       ols->ols_state = OLS_NEW;
+                                       result = 0;
+                               }
+                       }
+               } else {
+                       ols->ols_state = OLS_GRANTED;
+                       ols->ols_owner = osc_env_io(env);
+               }
+       }
+       LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols)));
+       RETURN(result);
+}
+
+static int osc_lock_wait(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       struct cl_lock  *lock = olck->ols_cl.cls_lock;
+
+       LINVRNT(osc_lock_invariant(olck));
+
+       if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) {
+               if (olck->ols_flags & LDLM_FL_LVB_READY) {
+                       return 0;
+               } else if (olck->ols_agl) {
+                       if (lock->cll_flags & CLF_FROM_UPCALL)
+                               /* This is from the enqueue RPC reply upcall
+                                * for updating state. Do not re-enqueue. */
+                               return -ENAVAIL;
+                       else
+                               olck->ols_state = OLS_NEW;
+               } else {
+                       LASSERT(lock->cll_error);
+                       return lock->cll_error;
+               }
+       }
+
+       if (olck->ols_state == OLS_NEW) {
+               int rc;
+
+               LASSERT(olck->ols_agl);
+               olck->ols_agl = 0;
+               rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST);
+               if (rc != 0)
+                       return rc;
+               else
+                       return CLO_REENQUEUED;
+       }
+
+       LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+                    lock->cll_error == 0, olck->ols_lock != NULL));
+
+       return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_use() method that pins cached
+ * lock.
+ */
+static int osc_lock_use(const struct lu_env *env,
+                       const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       int rc;
+
+       LASSERT(!olck->ols_hold);
+
+       /*
+        * Atomically check for LDLM_FL_CBPENDING and addref a lock if this
+        * flag is not set. This protects us from a concurrent blocking ast.
+        */
+       rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode);
+       if (rc == 0) {
+               olck->ols_hold = 1;
+               olck->ols_state = OLS_GRANTED;
+       } else {
+               struct cl_lock *lock;
+
+               /*
+                * Lock is being cancelled somewhere within
+                * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already
+                * set, but osc_ldlm_blocking_ast() hasn't yet acquired
+                * cl_lock mutex.
+                */
+               lock = slice->cls_lock;
+               LASSERT(lock->cll_state == CLS_INTRANSIT);
+               LASSERT(lock->cll_users > 0);
+               /* set a flag for osc_dlm_blocking_ast0() to signal the
+                * lock. */
+               olck->ols_ast_wait = 1;
+               rc = CLO_WAIT;
+       }
+       return rc;
+}
+
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+       struct cl_lock       *lock  = ols->ols_cl.cls_lock;
+       struct cl_env_nest    nest;
+       struct lu_env   *env;
+       int result = 0;
+       ENTRY;
+
+       env = cl_env_nested_get(&nest);
+       if (!IS_ERR(env)) {
+               struct osc_object    *obj   = cl2osc(ols->ols_cl.cls_obj);
+               struct cl_lock_descr *descr = &lock->cll_descr;
+               int rc = 0;
+
+               if (descr->cld_mode >= CLM_WRITE) {
+                       result = osc_cache_writeback_range(env, obj,
+                                       descr->cld_start, descr->cld_end,
+                                       1, discard);
+                       LDLM_DEBUG(ols->ols_lock,
+                               "lock %p: %d pages were %s.\n", lock, result,
+                               discard ? "discarded" : "written");
+                       if (result > 0)
+                               result = 0;
+               }
+
+               rc = cl_lock_discard_pages(env, lock);
+               if (result == 0 && rc < 0)
+                       result = rc;
+
+               cl_env_nested_put(&nest, env);
+       } else
+               result = PTR_ERR(env);
+       if (result == 0) {
+               ols->ols_flush = 1;
+               LINVRNT(!osc_lock_has_pages(ols));
+       }
+       RETURN(result);
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when a lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
+ * conflict with some other lock somewhere in the cluster. This function does
+ * the following:
+ *
+ *     - invalidates all pages protected by this lock (after sending dirty
+ *       ones to the server, as necessary);
+ *
+ *     - decref's underlying ldlm lock;
+ *
+ *     - cancels ldlm lock (ldlm_cli_cancel()).
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct cl_lock   *lock    = slice->cls_lock;
+       struct osc_lock  *olck    = cl2osc_lock(slice);
+       struct ldlm_lock *dlmlock = olck->ols_lock;
+       int            result  = 0;
+       int            discard;
+
+       LASSERT(cl_lock_is_mutexed(lock));
+       LINVRNT(osc_lock_invariant(olck));
+
+       if (dlmlock != NULL) {
+               int do_cancel;
+
+               discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA);
+               if (olck->ols_state >= OLS_GRANTED)
+                       result = osc_lock_flush(olck, discard);
+               osc_lock_unhold(olck);
+
+               lock_res_and_lock(dlmlock);
+               /* Now that we're the only user of the dlm read/write
+                * reference, ->l_readers + ->l_writers should usually be
+                * zero. However, there is a corner case.
+                * See bug 18829 for details. */
+               do_cancel = (dlmlock->l_readers == 0 &&
+                            dlmlock->l_writers == 0);
+               dlmlock->l_flags |= LDLM_FL_CBPENDING;
+               unlock_res_and_lock(dlmlock);
+               if (do_cancel)
+                       result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC);
+               if (result < 0)
+                       CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                     "lock %p cancel failure with error(%d)\n",
+                                     lock, result);
+       }
+       olck->ols_state = OLS_CANCELLED;
+       olck->ols_flags &= ~LDLM_FL_LVB_READY;
+       osc_lock_detach(env, olck);
+}
+
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+       return 0;
+}
+
+static void osc_lock_delete(const struct lu_env *env,
+                           const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck;
+
+       olck = cl2osc_lock(slice);
+       if (olck->ols_glimpse) {
+               LASSERT(!olck->ols_hold);
+               LASSERT(!olck->ols_lock);
+               return;
+       }
+
+       LINVRNT(osc_lock_invariant(olck));
+       LINVRNT(!osc_lock_has_pages(olck));
+
+       osc_lock_unhold(olck);
+       osc_lock_detach(env, olck);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for osc layer.
+ *
+ * Maintains osc_lock::ols_owner field.
+ *
+ * This assumes that lock always enters CLS_HELD (from some other state) in
+ * the same IO context as one that requested the lock. This should not be a
+ * problem, because context is by definition shared by all activity pertaining
+ * to the same high-level IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          enum cl_lock_state state)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       /*
+        * XXX multiple io contexts can use the lock at the same time.
+        */
+       LINVRNT(osc_lock_invariant(lock));
+       if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) {
+               struct osc_io *oio = osc_env_io(env);
+
+               LASSERT(lock->ols_owner == NULL);
+               lock->ols_owner = oio;
+       } else if (state != CLS_HELD)
+               lock->ols_owner = NULL;
+}
+
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_lock_slice *slice)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       /*
+        * XXX print ldlm lock and einfo properly.
+        */
+       (*p)(env, cookie, "%p %#16llx "LPX64" %d %p ",
+            lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie,
+            lock->ols_state, lock->ols_owner);
+       osc_lvb_print(env, cookie, p, &lock->ols_lvb);
+       return 0;
+}
+
+static int osc_lock_fits_into(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+
+       if (need->cld_enq_flags & CEF_NEVER)
+               return 0;
+
+       if (ols->ols_state >= OLS_CANCELLED)
+               return 0;
+
+       if (need->cld_mode == CLM_PHANTOM) {
+               if (ols->ols_agl)
+                       return !(ols->ols_state > OLS_RELEASED);
+
+               /*
+                * Note: the QUEUED lock can't be matched here, otherwise
+                * it might cause the deadlocks.
+                * In read_process,
+                * P1: enqueued read lock, create sublock1
+                * P2: enqueued write lock, create sublock2(conflicted
+                *     with sublock1).
+                * P1: Grant read lock.
+                * P1: enqueued glimpse lock(with holding sublock1_read),
+                *     matched with sublock2, waiting sublock2 to be granted.
+                *     But sublock2 can not be granted, because P1
+                *     will not release sublock1. Bang!
+                */
+               if (ols->ols_state < OLS_GRANTED ||
+                   ols->ols_state > OLS_RELEASED)
+                       return 0;
+       } else if (need->cld_enq_flags & CEF_MUST) {
+               /*
+                * If the lock has never been enqueued, it can't be matched,
+                * because the enqueue process brings in information
+                * that is used to determine things such as lockless,
+                * CEF_MUST, etc.
+                */
+               if (ols->ols_state < OLS_UPCALL_RECEIVED &&
+                   ols->ols_locklessable)
+                       return 0;
+       }
+       return 1;
+}
+
+static const struct cl_lock_operations osc_lock_ops = {
+       .clo_fini    = osc_lock_fini,
+       .clo_enqueue = osc_lock_enqueue,
+       .clo_wait    = osc_lock_wait,
+       .clo_unuse   = osc_lock_unuse,
+       .clo_use     = osc_lock_use,
+       .clo_delete  = osc_lock_delete,
+       .clo_state   = osc_lock_state,
+       .clo_cancel  = osc_lock_cancel,
+       .clo_weigh   = osc_lock_weigh,
+       .clo_print   = osc_lock_print,
+       .clo_fits_into = osc_lock_fits_into,
+};
+
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+                                  const struct cl_lock_slice *slice)
+{
+       struct osc_lock *ols = cl2osc_lock(slice);
+       struct cl_lock *lock = slice->cls_lock;
+
+       LASSERT(ols->ols_state == OLS_GRANTED);
+       LINVRNT(osc_lock_invariant(ols));
+
+       cl_lock_cancel(env, lock);
+       cl_lock_delete(env, lock);
+       return 0;
+}
+
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+                                    const struct cl_lock_slice *slice)
+{
+       struct osc_lock   *ols  = cl2osc_lock(slice);
+       int result;
+
+       result = osc_lock_flush(ols, 0);
+       if (result)
+               CERROR("Pages for lockless lock %p were not purged(%d)\n",
+                      ols, result);
+       ols->ols_state = OLS_CANCELLED;
+}
+
+static int osc_lock_lockless_wait(const struct lu_env *env,
+                                 const struct cl_lock_slice *slice)
+{
+       struct osc_lock *olck = cl2osc_lock(slice);
+       struct cl_lock  *lock = olck->ols_cl.cls_lock;
+
+       LINVRNT(osc_lock_invariant(olck));
+       LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+       return lock->cll_error;
+}
+
+static void osc_lock_lockless_state(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice,
+                                   enum cl_lock_state state)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       LINVRNT(osc_lock_invariant(lock));
+       if (state == CLS_HELD) {
+               struct osc_io *oio  = osc_env_io(env);
+
+               LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio));
+               lock->ols_owner = oio;
+
+               /* set the io to be lockless if this lock is for io's
+                * host object */
+               if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj))
+                       oio->oi_lockless = 1;
+       }
+}
+
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+                                      const struct cl_lock_slice *slice,
+                                      const struct cl_lock_descr *need,
+                                      const struct cl_io *io)
+{
+       struct osc_lock *lock = cl2osc_lock(slice);
+
+       if (!(need->cld_enq_flags & CEF_NEVER))
+               return 0;
+
+       /* lockless lock should only be used by its owning io. b22147 */
+       return (lock->ols_owner == osc_env_io(env));
+}
+
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+       .clo_fini      = osc_lock_fini,
+       .clo_enqueue   = osc_lock_enqueue,
+       .clo_wait      = osc_lock_lockless_wait,
+       .clo_unuse     = osc_lock_lockless_unuse,
+       .clo_state     = osc_lock_lockless_state,
+       .clo_fits_into = osc_lock_lockless_fits_into,
+       .clo_cancel    = osc_lock_lockless_cancel,
+       .clo_print     = osc_lock_print
+};
+
+int osc_lock_init(const struct lu_env *env,
+                 struct cl_object *obj, struct cl_lock *lock,
+                 const struct cl_io *unused)
+{
+       struct osc_lock *clk;
+       int result;
+
+       OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, __GFP_IO);
+       if (clk != NULL) {
+               __u32 enqflags = lock->cll_descr.cld_enq_flags;
+
+               osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+               atomic_set(&clk->ols_pageref, 0);
+               clk->ols_state = OLS_NEW;
+
+               clk->ols_flags = osc_enq2ldlm_flags(enqflags);
+               clk->ols_agl = !!(enqflags & CEF_AGL);
+               if (clk->ols_agl)
+                       clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT;
+               if (clk->ols_flags & LDLM_FL_HAS_INTENT)
+                       clk->ols_glimpse = 1;
+
+               cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops);
+
+               if (!(enqflags & CEF_MUST))
+                       /* try to convert this lock to a lockless lock */
+                       osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER));
+               if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA))
+                       clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+
+               LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n",
+                               lock, clk, clk->ols_flags);
+
+               result = 0;
+       } else
+               result = -ENOMEM;
+       return result;
+}
+
+int osc_dlm_lock_pageref(struct ldlm_lock *dlm)
+{
+       struct osc_lock *olock;
+       int           rc = 0;
+
+       spin_lock(&osc_ast_guard);
+       olock = dlm->l_ast_data;
+       /*
+        * there's a very rare race with osc_page_addref_lock(), but that
+        * doesn't matter because in the worst case we don't cancel a lock
+        * which we actually can, that's no harm.
+        */
+       if (olock != NULL &&
+           atomic_add_return(_PAGEREF_MAGIC,
+                                 &olock->ols_pageref) != _PAGEREF_MAGIC) {
+               atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref);
+               rc = 1;
+       }
+       spin_unlock(&osc_ast_guard);
+       return rc;
+}
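+
+/*
+ * A minimal sketch (hypothetical helper, compiled out) of the arithmetic
+ * behind the check above: with no page references ols_pageref is 0 and
+ * atomic_add_return() yields exactly _PAGEREF_MAGIC, so the lock is
+ * cancellable; any concurrent page reference shifts the sum and the cancel
+ * is skipped.
+ */
+#if 0
+static void osc_pageref_example(void)
+{
+       atomic_t ref = ATOMIC_INIT(0);          /* no page references held */
+
+       LASSERT(atomic_add_return(_PAGEREF_MAGIC, &ref) == _PAGEREF_MAGIC);
+       atomic_sub(_PAGEREF_MAGIC, &ref);
+
+       atomic_inc(&ref);                       /* one page pins the lock */
+       LASSERT(atomic_add_return(_PAGEREF_MAGIC, &ref) != _PAGEREF_MAGIC);
+}
+#endif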
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c
new file mode 100644 (file)
index 0000000..ca94e63
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+       return &osc->oo_cl.co_lu;
+}
+
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+       LINVRNT(osc_is_object(obj));
+       return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+                          const struct lu_object_conf *conf)
+{
+       struct osc_object          *osc   = lu2osc(obj);
+       const struct cl_object_conf *cconf = lu2cl_conf(conf);
+       int i;
+
+       osc->oo_oinfo = cconf->u.coc_oinfo;
+       spin_lock_init(&osc->oo_seatbelt);
+       for (i = 0; i < CRT_NR; ++i)
+               INIT_LIST_HEAD(&osc->oo_inflight[i]);
+
+       INIT_LIST_HEAD(&osc->oo_ready_item);
+       INIT_LIST_HEAD(&osc->oo_hp_ready_item);
+       INIT_LIST_HEAD(&osc->oo_write_item);
+       INIT_LIST_HEAD(&osc->oo_read_item);
+
+       osc->oo_root.rb_node = NULL;
+       INIT_LIST_HEAD(&osc->oo_hp_exts);
+       INIT_LIST_HEAD(&osc->oo_urgent_exts);
+       INIT_LIST_HEAD(&osc->oo_rpc_exts);
+       INIT_LIST_HEAD(&osc->oo_reading_exts);
+       atomic_set(&osc->oo_nr_reads, 0);
+       atomic_set(&osc->oo_nr_writes, 0);
+       spin_lock_init(&osc->oo_lock);
+
+       cl_object_page_init(lu2cl(obj), sizeof(struct osc_page));
+
+       return 0;
+}
+
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+       struct osc_object *osc = lu2osc(obj);
+       int i;
+
+       for (i = 0; i < CRT_NR; ++i)
+               LASSERT(list_empty(&osc->oo_inflight[i]));
+
+       LASSERT(list_empty(&osc->oo_ready_item));
+       LASSERT(list_empty(&osc->oo_hp_ready_item));
+       LASSERT(list_empty(&osc->oo_write_item));
+       LASSERT(list_empty(&osc->oo_read_item));
+
+       LASSERT(osc->oo_root.rb_node == NULL);
+       LASSERT(list_empty(&osc->oo_hp_exts));
+       LASSERT(list_empty(&osc->oo_urgent_exts));
+       LASSERT(list_empty(&osc->oo_rpc_exts));
+       LASSERT(list_empty(&osc->oo_reading_exts));
+       LASSERT(atomic_read(&osc->oo_nr_reads) == 0);
+       LASSERT(atomic_read(&osc->oo_nr_writes) == 0);
+
+       lu_object_fini(obj);
+       OBD_SLAB_FREE_PTR(osc, osc_object_kmem);
+}
+
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t p, const struct ost_lvb *lvb)
+{
+       return (*p)(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                   "ctime: "LPU64" blocks: "LPU64,
+                   lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                   lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+static int osc_object_print(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *obj)
+{
+       struct osc_object   *osc   = lu2osc(obj);
+       struct lov_oinfo    *oinfo = osc->oo_oinfo;
+       struct osc_async_rc *ar    = &oinfo->loi_ar;
+
+       (*p)(env, cookie, "id: "DOSTID" "
+            "idx: %d gen: %d kms_valid: %u kms "LPU64" "
+            "rc: %d force_sync: %d min_xid: "LPU64" ",
+            POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx,
+            oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
+            ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
+       osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+       return 0;
+}
+
+
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+       cl_lvb2attr(attr, &oinfo->loi_lvb);
+       attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0;
+       return 0;
+}
+
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_attr *attr, unsigned valid)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+       struct ost_lvb   *lvb   = &oinfo->loi_lvb;
+
+       if (valid & CAT_SIZE)
+               lvb->lvb_size = attr->cat_size;
+       if (valid & CAT_MTIME)
+               lvb->lvb_mtime = attr->cat_mtime;
+       if (valid & CAT_ATIME)
+               lvb->lvb_atime = attr->cat_atime;
+       if (valid & CAT_CTIME)
+               lvb->lvb_ctime = attr->cat_ctime;
+       if (valid & CAT_BLOCKS)
+               lvb->lvb_blocks = attr->cat_blocks;
+       if (valid & CAT_KMS) {
+               CDEBUG(D_CACHE, "set kms from "LPU64" to "LPU64"\n",
+                      oinfo->loi_kms, (__u64)attr->cat_kms);
+               loi_kms_set(oinfo, attr->cat_kms);
+       }
+       return 0;
+}
+
+static int osc_object_glimpse(const struct lu_env *env,
+                             const struct cl_object *obj, struct ost_lvb *lvb)
+{
+       struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+       ENTRY;
+       lvb->lvb_size   = oinfo->loi_kms;
+       lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+       RETURN(0);
+}
+
+
+void osc_object_set_contended(struct osc_object *obj)
+{
+       obj->oo_contention_time = cfs_time_current();
+       /* mb(); */
+       obj->oo_contended = 1;
+}
+
+void osc_object_clear_contended(struct osc_object *obj)
+{
+       obj->oo_contended = 0;
+}
+
+int osc_object_is_contended(struct osc_object *obj)
+{
+       struct osc_device *dev  = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+       int osc_contention_time = dev->od_contention_time;
+       cfs_time_t cur_time     = cfs_time_current();
+       cfs_time_t retry_time;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+               return 1;
+
+       if (!obj->oo_contended)
+               return 0;
+
+       /*
+        * This code is copied from
+        * ll_file_is_contended().
+        */
+       retry_time = cfs_time_add(obj->oo_contention_time,
+                                 cfs_time_seconds(osc_contention_time));
+       if (cfs_time_after(cur_time, retry_time)) {
+               osc_object_clear_contended(obj);
+               return 0;
+       }
+       return 1;
+}
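+
+/*
+ * A minimal sketch (hypothetical timeline, compiled out) of the contention
+ * window implemented above: the flag set by osc_object_set_contended()
+ * only counts for od_contention_time seconds and is cleared lazily by the
+ * first query after the window expires.
+ */
+#if 0
+static void osc_contention_example(struct osc_object *obj)
+{
+       osc_object_set_contended(obj);          /* t = 0 */
+       LASSERT(osc_object_is_contended(obj));  /* t < od_contention_time */
+       /* once t >= od_contention_time, osc_object_is_contended() clears
+        * the flag itself and returns 0 */
+}
+#endif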
+
+static const struct cl_object_operations osc_ops = {
+       .coo_page_init = osc_page_init,
+       .coo_lock_init = osc_lock_init,
+       .coo_io_init   = osc_io_init,
+       .coo_attr_get  = osc_attr_get,
+       .coo_attr_set  = osc_attr_set,
+       .coo_glimpse   = osc_object_glimpse
+};
+
+static const struct lu_object_operations osc_lu_obj_ops = {
+       .loo_object_init      = osc_object_init,
+       .loo_object_delete    = NULL,
+       .loo_object_release   = NULL,
+       .loo_object_free      = osc_object_free,
+       .loo_object_print     = osc_object_print,
+       .loo_object_invariant = NULL
+};
+
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                  const struct lu_object_header *unused,
+                                  struct lu_device *dev)
+{
+       struct osc_object *osc;
+       struct lu_object  *obj;
+
+       OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, __GFP_IO);
+       if (osc != NULL) {
+               obj = osc2lu(osc);
+               lu_object_init(obj, NULL, dev);
+               osc->oo_cl.co_ops = &osc_ops;
+               obj->lo_ops = &osc_lu_obj_ops;
+       } else
+               obj = NULL;
+       return obj;
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
new file mode 100644 (file)
index 0000000..baba959
--- /dev/null
@@ -0,0 +1,927 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del);
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg);
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+                          struct osc_page *opg);
+
+/** \addtogroup osc
+ *  @{
+ */
+
+/*
+ * Comment out osc_page_protected because it may sleep inside the
+ * client_obd_list_lock.
+ * client_obd_list_lock -> osc_ap_completion -> osc_completion ->
+ *   -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base
+ *   -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep.
+ */
+#if 0
+static int osc_page_is_dlocked(const struct lu_env *env,
+                              const struct osc_page *opg,
+                              enum cl_lock_mode mode, int pending, int unref)
+{
+       struct cl_page   *page;
+       struct osc_object      *obj;
+       struct osc_thread_info *info;
+       struct ldlm_res_id     *resname;
+       struct lustre_handle   *lockh;
+       ldlm_policy_data_t     *policy;
+       ldlm_mode_t          dlmmode;
+       int                  flags;
+
+       might_sleep();
+
+       info = osc_env_info(env);
+       resname = &info->oti_resname;
+       policy = &info->oti_policy;
+       lockh = &info->oti_handle;
+       page = opg->ops_cl.cpl_page;
+       obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+       if (pending)
+               flags |= LDLM_FL_CBPENDING;
+
+       dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+       osc_lock_build_res(env, obj, resname);
+       osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+       return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+                             dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Checks an invariant that a page in the cache is covered by a lock, as
+ * needed.
+ */
+static int osc_page_protected(const struct lu_env *env,
+                             const struct osc_page *opg,
+                             enum cl_lock_mode mode, int unref)
+{
+       struct cl_object_header *hdr;
+       struct cl_lock    *scan;
+       struct cl_page    *page;
+       struct cl_lock_descr    *descr;
+       int result;
+
+       LINVRNT(!opg->ops_temp);
+
+       page = opg->ops_cl.cpl_page;
+       if (page->cp_owner != NULL &&
+           cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER)
+               /*
+                * If IO is done without locks (liblustre, or lloop), lock is
+                * not required.
+                */
+               result = 1;
+       else
+               /* otherwise check for a DLM lock */
+               result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+       if (result == 0) {
+               /* maybe this page is a part of a lockless io? */
+               hdr = cl_object_header(opg->ops_cl.cpl_obj);
+               descr = &osc_env_info(env)->oti_descr;
+               descr->cld_mode = mode;
+               descr->cld_start = page->cp_index;
+               descr->cld_end   = page->cp_index;
+               spin_lock(&hdr->coh_lock_guard);
+               list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+                       /*
+                        * Lock-less sub-lock has to be either in HELD state
+                        * (when io is actively going on), or in CACHED state,
+                        * when top-lock is being unlocked:
+                        * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+                        */
+                       if ((scan->cll_state == CLS_HELD ||
+                            scan->cll_state == CLS_CACHED) &&
+                           cl_lock_ext_match(&scan->cll_descr, descr)) {
+                               struct osc_lock *olck;
+
+                               olck = osc_lock_at(scan);
+                               result = osc_lock_is_lockless(olck);
+                               break;
+                       }
+               }
+               spin_unlock(&hdr->coh_lock_guard);
+       }
+       return result;
+}
+#else
+static int osc_page_protected(const struct lu_env *env,
+                             const struct osc_page *opg,
+                             enum cl_lock_mode mode, int unref)
+{
+       return 1;
+}
+#endif
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+static void osc_page_fini(const struct lu_env *env,
+                         struct cl_page_slice *slice)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       CDEBUG(D_TRACE, "%p\n", opg);
+       LASSERT(opg->ops_lock == NULL);
+}
+
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+       struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+       LASSERT(!opg->ops_transfer_pinned);
+       cl_page_get(page);
+       lu_ref_add_atomic(&page->cp_reference, label, page);
+       opg->ops_transfer_pinned = 1;
+}
+
+static void osc_page_transfer_put(const struct lu_env *env,
+                                 struct osc_page *opg)
+{
+       struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+       if (opg->ops_transfer_pinned) {
+               lu_ref_del(&page->cp_reference, "transfer", page);
+               opg->ops_transfer_pinned = 0;
+               cl_page_put(env, page);
+       }
+}
+
+/**
+ * This is called once for every page when it is submitted for a transfer
+ * either opportunistic (osc_page_cache_add()), or immediate
+ * (osc_page_submit()).
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+                                 struct osc_page *opg, enum cl_req_type crt)
+{
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       /* ops_lru and ops_inflight share the same field, so take it from LRU
+        * first and then use it as inflight. */
+       osc_lru_del(osc_cli(obj), opg, false);
+
+       spin_lock(&obj->oo_seatbelt);
+       list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+       opg->ops_submitter = current;
+       spin_unlock(&obj->oo_seatbelt);
+}
+
+static int osc_page_cache_add(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io)
+{
+       struct osc_io   *oio = osc_env_io(env);
+       struct osc_page *opg = cl2osc_page(slice);
+       int result;
+       ENTRY;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+
+       osc_page_transfer_get(opg, "transfer\0cache");
+       result = osc_queue_async_io(env, io, opg);
+       if (result != 0)
+               osc_page_transfer_put(env, opg);
+       else
+               osc_page_transfer_add(env, opg, CRT_WRITE);
+
+       /* for sync write, the kernel will wait for this page to be flushed
+        * before osc_io_end() is called, so release it earlier.
+        * for mkwrite(), it's known there are no further pages. */
+       if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) {
+               if (oio->oi_active != NULL) {
+                       osc_extent_release(env, oio->oi_active);
+                       oio->oi_active = NULL;
+               }
+       }
+
+       RETURN(result);
+}
+
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+                     pgoff_t start, pgoff_t end)
+{
+       memset(policy, 0, sizeof *policy);
+       policy->l_extent.start = cl_offset(obj, start);
+       policy->l_extent.end   = cl_offset(obj, end + 1) - 1;
+}
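+/* Editor's example: for a single page (start == end == i), and assuming
+ * cl_offset() converts a page index to a byte offset, this yields the
+ * extent [cl_offset(obj, i), cl_offset(obj, i + 1) - 1], i.e. exactly the
+ * bytes of page i. */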
+
+static int osc_page_addref_lock(const struct lu_env *env,
+                               struct osc_page *opg,
+                               struct cl_lock *lock)
+{
+       struct osc_lock *olock;
+       int           rc;
+
+       LASSERT(opg->ops_lock == NULL);
+
+       olock = osc_lock_at(lock);
+       if (atomic_inc_return(&olock->ols_pageref) <= 0) {
+               atomic_dec(&olock->ols_pageref);
+               rc = -ENODATA;
+       } else {
+               cl_lock_get(lock);
+               opg->ops_lock = lock;
+               rc = 0;
+       }
+       return rc;
+}
+
+static void osc_page_putref_lock(const struct lu_env *env,
+                                struct osc_page *opg)
+{
+       struct cl_lock  *lock = opg->ops_lock;
+       struct osc_lock *olock;
+
+       LASSERT(lock != NULL);
+       olock = osc_lock_at(lock);
+
+       atomic_dec(&olock->ols_pageref);
+       opg->ops_lock = NULL;
+
+       cl_lock_put(env, lock);
+}
+
+static int osc_page_is_under_lock(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *unused)
+{
+       struct cl_lock *lock;
+       int          result = -ENODATA;
+
+       ENTRY;
+       lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+                              NULL, 1, 0);
+       if (lock != NULL) {
+               if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0)
+                       result = -EBUSY;
+               cl_lock_put(env, lock);
+       }
+       RETURN(result);
+}
+
+static void osc_page_disown(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+
+       if (unlikely(opg->ops_lock))
+               osc_page_putref_lock(env, opg);
+}
+
+static void osc_page_completion_read(const struct lu_env *env,
+                                    const struct cl_page_slice *slice,
+                                    int ioret)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+
+       if (likely(opg->ops_lock))
+               osc_page_putref_lock(env, opg);
+       osc_lru_add(osc_cli(obj), opg);
+}
+
+static void osc_page_completion_write(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(slice->cpl_obj);
+
+       osc_lru_add(osc_cli(obj), opg);
+}
+
+static int osc_page_fail(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *unused)
+{
+       /*
+        * Cached read?
+        */
+       LBUG();
+       return 0;
+}
+
+static const char *osc_list(struct list_head *head)
+{
+       return list_empty(head) ? "-" : "+";
+}
+
+static inline cfs_time_t osc_submit_duration(struct osc_page *opg)
+{
+       if (opg->ops_submit_time == 0)
+               return 0;
+
+       return (cfs_time_current() - opg->ops_submit_time);
+}
+
+static int osc_page_print(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t printer)
+{
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_async_page *oap = &opg->ops_oap;
+       struct osc_object     *obj = cl2osc(slice->cpl_obj);
+       struct client_obd     *cli = &osc_export(obj)->exp_obd->u.cli;
+
+       return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
+                         "1< %#x %d %u %s %s > "
+                         "2< "LPU64" %u %u %#x %#x | %p %p %p > "
+                         "3< %s %p %d %lu %d > "
+                         "4< %d %d %d %lu %s | %s %s %s %s > "
+                         "5< %s %s %s %s | %d %s | %d %s %s>\n",
+                         opg,
+                         /* 1 */
+                         oap->oap_magic, oap->oap_cmd,
+                         oap->oap_interrupted,
+                         osc_list(&oap->oap_pending_item),
+                         osc_list(&oap->oap_rpc_item),
+                         /* 2 */
+                         oap->oap_obj_off, oap->oap_page_off, oap->oap_count,
+                         oap->oap_async_flags, oap->oap_brw_flags,
+                         oap->oap_request, oap->oap_cli, obj,
+                         /* 3 */
+                         osc_list(&opg->ops_inflight),
+                         opg->ops_submitter, opg->ops_transfer_pinned,
+                         osc_submit_duration(opg), opg->ops_srvlock,
+                         /* 4 */
+                         cli->cl_r_in_flight, cli->cl_w_in_flight,
+                         cli->cl_max_rpcs_in_flight,
+                         cli->cl_avail_grant,
+                         osc_list(&cli->cl_cache_waiters),
+                         osc_list(&cli->cl_loi_ready_list),
+                         osc_list(&cli->cl_loi_hp_ready_list),
+                         osc_list(&cli->cl_loi_write_list),
+                         osc_list(&cli->cl_loi_read_list),
+                         /* 5 */
+                         osc_list(&obj->oo_ready_item),
+                         osc_list(&obj->oo_hp_ready_item),
+                         osc_list(&obj->oo_write_item),
+                         osc_list(&obj->oo_read_item),
+                         atomic_read(&obj->oo_nr_reads),
+                         osc_list(&obj->oo_reading_exts),
+                         atomic_read(&obj->oo_nr_writes),
+                         osc_list(&obj->oo_hp_exts),
+                         osc_list(&obj->oo_urgent_exts));
+}
+
+static void osc_page_delete(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+       struct osc_page   *opg = cl2osc_page(slice);
+       struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+       int rc;
+
+       LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+       ENTRY;
+       CDEBUG(D_TRACE, "%p\n", opg);
+       osc_page_transfer_put(env, opg);
+       rc = osc_teardown_async_page(env, obj, opg);
+       if (rc) {
+               CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+                             "Trying to teardown failed: %d\n", rc);
+               LASSERT(0);
+       }
+
+       spin_lock(&obj->oo_seatbelt);
+       if (opg->ops_submitter != NULL) {
+               LASSERT(!list_empty(&opg->ops_inflight));
+               list_del_init(&opg->ops_inflight);
+               opg->ops_submitter = NULL;
+       }
+       spin_unlock(&obj->oo_seatbelt);
+
+       osc_lru_del(osc_cli(obj), opg, true);
+       EXIT;
+}
+
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+                  int from, int to)
+{
+       struct osc_page       *opg = cl2osc_page(slice);
+       struct osc_async_page *oap = &opg->ops_oap;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+       opg->ops_from = from;
+       opg->ops_to   = to;
+       spin_lock(&oap->oap_lock);
+       oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+       spin_unlock(&oap->oap_lock);
+}
+
+static int osc_page_cancel(const struct lu_env *env,
+                          const struct cl_page_slice *slice)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       int rc = 0;
+
+       LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+       /* Check if the transfer against this page has completed,
+        * or was never even queued. */
+       if (opg->ops_transfer_pinned)
+               /* FIXME: may not be interrupted.. */
+               rc = osc_cancel_async_page(env, opg);
+       LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+       return rc;
+}
+
+static int osc_page_flush(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         struct cl_io *io)
+{
+       struct osc_page *opg = cl2osc_page(slice);
+       int rc = 0;
+       ENTRY;
+       rc = osc_flush_async_page(env, io, opg);
+       RETURN(rc);
+}
+
+static const struct cl_page_operations osc_page_ops = {
+       .cpo_fini         = osc_page_fini,
+       .cpo_print       = osc_page_print,
+       .cpo_delete     = osc_page_delete,
+       .cpo_is_under_lock = osc_page_is_under_lock,
+       .cpo_disown     = osc_page_disown,
+       .io = {
+               [CRT_READ] = {
+                       .cpo_cache_add  = osc_page_fail,
+                       .cpo_completion = osc_page_completion_read
+               },
+               [CRT_WRITE] = {
+                       .cpo_cache_add  = osc_page_cache_add,
+                       .cpo_completion = osc_page_completion_write
+               }
+       },
+       .cpo_clip          = osc_page_clip,
+       .cpo_cancel      = osc_page_cancel,
+       .cpo_flush        = osc_page_flush
+};
+
+int osc_page_init(const struct lu_env *env, struct cl_object *obj,
+               struct cl_page *page, struct page *vmpage)
+{
+       struct osc_object *osc = cl2osc(obj);
+       struct osc_page   *opg = cl_object_page_slice(obj, page);
+       int result;
+
+       opg->ops_from = 0;
+       opg->ops_to   = PAGE_CACHE_SIZE;
+
+       result = osc_prep_async_page(osc, opg, vmpage,
+                                       cl_offset(obj, page->cp_index));
+       if (result == 0) {
+               struct osc_io *oio = osc_env_io(env);
+               opg->ops_srvlock = osc_io_srvlock(oio);
+               cl_page_slice_add(page, &opg->ops_cl, obj,
+                               &osc_page_ops);
+       }
+       /*
+        * Cannot assert osc_page_protected() here as read-ahead
+        * creates temporary pages outside of a lock.
+        */
+       /* ops_inflight and ops_lru are the same field, but it doesn't
+        * hurt to initialize it twice :-) */
+       INIT_LIST_HEAD(&opg->ops_inflight);
+       INIT_LIST_HEAD(&opg->ops_lru);
+
+       /* reserve an LRU space for this page */
+       if (page->cp_type == CPT_CACHEABLE && result == 0)
+               result = osc_lru_reserve(env, osc, opg);
+
+       return result;
+}
+
+/**
+ * Helper function called by osc_io_submit() for every page in an immediate
+ * transfer (i.e., transferred synchronously).
+ */
+void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
+                    enum cl_req_type crt, int brw_flags)
+{
+       struct osc_async_page *oap = &opg->ops_oap;
+       struct osc_object     *obj = oap->oap_obj;
+
+       LINVRNT(osc_page_protected(env, opg,
+                                  crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+       LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, "
+                "magic 0x%x\n", oap, oap->oap_magic);
+       LASSERT(oap->oap_async_flags & ASYNC_READY);
+       LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE);
+
+       oap->oap_cmd       = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+       oap->oap_page_off  = opg->ops_from;
+       oap->oap_count     = opg->ops_to - opg->ops_from;
+       oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
+
+       if (!client_is_remote(osc_export(obj)) &&
+                       cfs_capable(CFS_CAP_SYS_RESOURCE)) {
+               oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
+               oap->oap_cmd |= OBD_BRW_NOQUOTA;
+       }
+
+       opg->ops_submit_time = cfs_time_current();
+       osc_page_transfer_get(opg, "transfer\0imm");
+       osc_page_transfer_add(env, opg, crt);
+}
+
+/* --------------- LRU page management ------------------ */
+
+/* The OSC is a natural place to manage LRU pages, as applications write
+ * OSC by OSC. Ideally, an OSC that is used more frequently should occupy
+ * more LRU slots. On the other hand, we should avoid using up all LRU slots
+ * (client_obd::cl_lru_left), since a process would then have to sleep
+ * waiting for a free slot - which would be very bad. The algorithm therefore
+ * requires each OSC to free slots voluntarily, so that a reasonable number
+ * of free slots is maintained at all times.
+ */
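+/*
+ * Editor's sketch of the accounting implemented below (a summary, not part
+ * of the original comment):
+ *
+ *   osc_lru_reserve() - takes a slot from cl_lru_left; the page starts out
+ *                       "busy" (cl_lru_busy);
+ *   osc_lru_add()     - puts the page on cl_lru_list once its IO completes;
+ *   osc_lru_del()     - takes it off the list again, either back to busy
+ *                       (pinned for transfer) or, on delete, returning the
+ *                       slot to cl_lru_left;
+ *   osc_lru_shrink()  - discards idle pages in batches to keep slots free.
+ */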
+
+static CFS_DECL_WAITQ(osc_lru_waitq);
+static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
+/* LRU pages are freed in batch mode. An OSC should free at least this many
+ * pages to avoid running out of LRU budget... */
+static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT);  /* 2M */
+/* ...and at most this many, otherwise shrinking would take too long
+ * to finish. */
+static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
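+/* Editor's arithmetic: with 4KiB pages (PAGE_CACHE_SHIFT == 12) these work
+ * out to 2 << 8 == 512 pages (2MiB) and 32 << 8 == 8192 pages (32MiB). */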
+
+/* Check if we can free LRU slots from this OSC. If there are LRU waiters,
+ * we should free slots aggressively. This way, slots are freed at a steady
+ * pace to maintain fairness among OSCs.
+ *
+ * Return how many LRU pages should be freed. */
+static int osc_cache_too_much(struct client_obd *cli)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       int pages = atomic_read(&cli->cl_lru_in_list) >> 1;
+
+       if (atomic_read(&osc_lru_waiters) > 0 &&
+           atomic_read(cli->cl_lru_left) < lru_shrink_max)
+               /* drop lru pages aggressively */
+               return min(pages, lru_shrink_max);
+
+       /* if it's about to run out of LRU slots, we should free some, but
+        * not too many, to maintain fairness among OSCs. */
+       if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) {
+               unsigned long tmp;
+
+               tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users);
+               if (pages > tmp)
+                       return min(pages, lru_shrink_max);
+
+               return pages > lru_shrink_min ? lru_shrink_min : 0;
+       }
+
+       return 0;
+}
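+/* Editor's worked example (made-up numbers): with cl_lru_in_list == 8192,
+ * pages == 4096; if there are waiters and cl_lru_left is below
+ * lru_shrink_max, the function asks for min(4096, lru_shrink_max) pages to
+ * be freed; with plenty of slots left, it returns 0 instead. */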
+
+/* Return the number of pages in @pvec that were not discarded. */
+static int discard_pagevec(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page **pvec, int max_index)
+{
+       int count;
+       int i;
+
+       for (count = 0, i = 0; i < max_index; i++) {
+               struct cl_page *page = pvec[i];
+               if (cl_page_own_try(env, io, page) == 0) {
+                       /* free LRU page only if nobody is using it.
+                        * This check is necessary to avoid freeing the pages
+                        * having already been removed from LRU and pinned
+                        * for IO. */
+                       if (!cl_page_in_use(page)) {
+                               cl_page_unmap(env, io, page);
+                               cl_page_discard(env, io, page);
+                               ++count;
+                       }
+                       cl_page_disown(env, io, page);
+               }
+               cl_page_put(env, page);
+               pvec[i] = NULL;
+       }
+       return max_index - count;
+}
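+/* Editor's note: since the return value is the number of pages *not*
+ * discarded, callers accumulate a count of collected pages and then do
+ * "count -= discard_pagevec(...)", so that count ends up as the number of
+ * pages actually discarded. */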
+
+/**
+ * Drop at most @target pages from the LRU.
+ */
+int osc_lru_shrink(struct client_obd *cli, int target)
+{
+       struct cl_env_nest nest;
+       struct lu_env *env;
+       struct cl_io *io;
+       struct cl_object *clobj = NULL;
+       struct cl_page **pvec;
+       struct osc_page *opg;
+       int maxscan = 0;
+       int count = 0;
+       int index = 0;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0);
+       if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+               RETURN(0);
+
+       env = cl_env_nested_get(&nest);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       pvec = osc_env_info(env)->oti_pvec;
+       io = &osc_env_info(env)->oti_io;
+
+       client_obd_list_lock(&cli->cl_lru_list_lock);
+       atomic_inc(&cli->cl_lru_shrinkers);
+       maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list));
+       while (!list_empty(&cli->cl_lru_list)) {
+               struct cl_page *page;
+
+               if (--maxscan < 0)
+                       break;
+
+               opg = list_entry(cli->cl_lru_list.next, struct osc_page,
+                                    ops_lru);
+               page = cl_page_top(opg->ops_cl.cpl_page);
+               if (cl_page_in_use_noref(page)) {
+                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+                       continue;
+               }
+
+               LASSERT(page->cp_obj != NULL);
+               if (clobj != page->cp_obj) {
+                       struct cl_object *tmp = page->cp_obj;
+
+                       cl_object_get(tmp);
+                       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+                       if (clobj != NULL) {
+                               count -= discard_pagevec(env, io, pvec, index);
+                               index = 0;
+
+                               cl_io_fini(env, io);
+                               cl_object_put(env, clobj);
+                               clobj = NULL;
+                       }
+
+                       clobj = tmp;
+                       io->ci_obj = clobj;
+                       io->ci_ignore_layout = 1;
+                       rc = cl_io_init(env, io, CIT_MISC, clobj);
+
+                       client_obd_list_lock(&cli->cl_lru_list_lock);
+
+                       if (rc != 0)
+                               break;
+
+                       ++maxscan;
+                       continue;
+               }
+
+               /* move this page to the end of the list as it will be
+                * discarded soon. The page will be finally removed from the
+                * LRU list in osc_page_delete(). */
+               list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+               /* it's okay to grab a refcount here without holding the lock,
+                * because anyone deleting the page has to grab
+                * cl_lru_list_lock first. */
+               cl_page_get(page);
+               pvec[index++] = page;
+               if (++count >= target)
+                       break;
+
+               if (unlikely(index == OTI_PVEC_SIZE)) {
+                       client_obd_list_unlock(&cli->cl_lru_list_lock);
+                       count -= discard_pagevec(env, io, pvec, index);
+                       index = 0;
+
+                       client_obd_list_lock(&cli->cl_lru_list_lock);
+               }
+       }
+       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+       if (clobj != NULL) {
+               count -= discard_pagevec(env, io, pvec, index);
+
+               cl_io_fini(env, io);
+               cl_object_put(env, clobj);
+       }
+       cl_env_nested_put(&nest, env);
+
+       atomic_dec(&cli->cl_lru_shrinkers);
+       RETURN(count > 0 ? count : rc);
+}
+
+static void osc_lru_add(struct client_obd *cli, struct osc_page *opg)
+{
+       bool wakeup = false;
+
+       if (!opg->ops_in_lru)
+               return;
+
+       atomic_dec(&cli->cl_lru_busy);
+       client_obd_list_lock(&cli->cl_lru_list_lock);
+       if (list_empty(&opg->ops_lru)) {
+               list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               atomic_inc_return(&cli->cl_lru_in_list);
+               wakeup = atomic_read(&osc_lru_waiters) > 0;
+       }
+       client_obd_list_unlock(&cli->cl_lru_list_lock);
+
+       if (wakeup) {
+               osc_lru_shrink(cli, osc_cache_too_much(cli));
+               wake_up_all(&osc_lru_waitq);
+       }
+}
+
+/* Delete a page from the LRU list. A page can be removed from the LRU list
+ * for two reasons: it was redirtied, or it was deleted from the page cache. */
+static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del)
+{
+       if (opg->ops_in_lru) {
+               client_obd_list_lock(&cli->cl_lru_list_lock);
+               if (!list_empty(&opg->ops_lru)) {
+                       LASSERT(atomic_read(&cli->cl_lru_in_list) > 0);
+                       list_del_init(&opg->ops_lru);
+                       atomic_dec(&cli->cl_lru_in_list);
+                       if (!del)
+                               atomic_inc(&cli->cl_lru_busy);
+               } else if (del) {
+                       LASSERT(atomic_read(&cli->cl_lru_busy) > 0);
+                       atomic_dec(&cli->cl_lru_busy);
+               }
+               client_obd_list_unlock(&cli->cl_lru_list_lock);
+               if (del) {
+                       atomic_inc(cli->cl_lru_left);
+                       /* this is a great place to release more LRU pages if
+                        * this osc occupies too many LRU pages and the kernel
+                        * is stealing one of them.
+                        * cl_lru_shrinkers avoids a recursive call in case
+                        * we're already in the context of osc_lru_shrink(). */
+                       if (atomic_read(&cli->cl_lru_shrinkers) == 0 &&
+                           !memory_pressure_get())
+                               osc_lru_shrink(cli, osc_cache_too_much(cli));
+                       wake_up(&osc_lru_waitq);
+               }
+       } else {
+               LASSERT(list_empty(&opg->ops_lru));
+       }
+}
+
+static inline int max_to_shrink(struct client_obd *cli)
+{
+       return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max);
+}
+
+static int osc_lru_reclaim(struct client_obd *cli)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       int max_scans;
+       int rc;
+
+       LASSERT(cache != NULL);
+       LASSERT(!list_empty(&cache->ccc_lru));
+
+       rc = osc_lru_shrink(cli, lru_shrink_min);
+       if (rc != 0) {
+               CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n",
+                       cli->cl_import->imp_obd->obd_name, rc, cli);
+               return rc;
+       }
+
+       CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n",
+               cli->cl_import->imp_obd->obd_name, cli,
+               atomic_read(&cli->cl_lru_in_list),
+               atomic_read(&cli->cl_lru_busy));
+
+       /* Reclaim LRU slots from other client_obds, since this one can't free
+        * enough of its own. This should rarely happen. */
+       spin_lock(&cache->ccc_lru_lock);
+       cache->ccc_lru_shrinkers++;
+       list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+
+       max_scans = atomic_read(&cache->ccc_users);
+       while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) {
+               cli = list_entry(cache->ccc_lru.next, struct client_obd,
+                                       cl_lru_osc);
+
+               CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n",
+                       cli->cl_import->imp_obd->obd_name, cli,
+                       atomic_read(&cli->cl_lru_in_list),
+                       atomic_read(&cli->cl_lru_busy));
+
+               list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru);
+               if (atomic_read(&cli->cl_lru_in_list) > 0) {
+                       spin_unlock(&cache->ccc_lru_lock);
+
+                       rc = osc_lru_shrink(cli, max_to_shrink(cli));
+                       spin_lock(&cache->ccc_lru_lock);
+                       if (rc != 0)
+                               break;
+               }
+       }
+       spin_unlock(&cache->ccc_lru_lock);
+
+       CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n",
+               cli->cl_import->imp_obd->obd_name, cli, rc);
+       return rc;
+}
+
+static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj,
+                          struct osc_page *opg)
+{
+       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+       struct client_obd *cli = osc_cli(obj);
+       int rc = 0;
+       ENTRY;
+
+       if (cli->cl_cache == NULL) /* shall not be in LRU */
+               RETURN(0);
+
+       LASSERT(atomic_read(cli->cl_lru_left) >= 0);
+       while (!cfs_atomic_add_unless(cli->cl_lru_left, -1, 0)) {
+               int gen;
+
+               /* ran out of LRU slots; try to drop some ourselves */
+               rc = osc_lru_reclaim(cli);
+               if (rc < 0)
+                       break;
+               if (rc > 0)
+                       continue;
+
+               cond_resched();
+
+               /* slowest case: all cached pages are busy, so notify other
+                * OSCs that we are short of LRU slots. */
+               atomic_inc(&osc_lru_waiters);
+
+               gen = atomic_read(&cli->cl_lru_in_list);
+               rc = l_wait_event(osc_lru_waitq,
+                               atomic_read(cli->cl_lru_left) > 0 ||
+                               (atomic_read(&cli->cl_lru_in_list) > 0 &&
+                                gen != atomic_read(&cli->cl_lru_in_list)),
+                               &lwi);
+
+               atomic_dec(&osc_lru_waiters);
+               if (rc < 0)
+                       break;
+       }
+
+       if (rc >= 0) {
+               atomic_inc(&cli->cl_lru_busy);
+               opg->ops_in_lru = 1;
+               rc = 0;
+       }
+
+       RETURN(rc);
+}
+
+/** @} osc */
diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c
new file mode 100644 (file)
index 0000000..69caab7
--- /dev/null
@@ -0,0 +1,332 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Code originally extracted from quota directory
+ */
+
+#include <obd_ost.h>
+#include "osc_internal.h"
+
+static inline struct osc_quota_info *osc_oqi_alloc(obd_uid id)
+{
+       struct osc_quota_info *oqi;
+
+       OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem);
+       if (oqi != NULL)
+               oqi->oqi_id = id;
+
+       return oqi;
+}
+
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
+{
+       int type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               struct osc_quota_info *oqi;
+
+               oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+               if (oqi) {
+                       obd_uid id = oqi->oqi_id;
+
+                       LASSERTF(id == qid[type],
+                                "The ids don't match %u != %u\n",
+                                id, qid[type]);
+
+                       /* the slot is busy, the user is about to run out of
+                        * quota space on this OST */
+                       CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n",
+                              type == USRQUOTA ? "user" : "group", qid[type]);
+                       RETURN(NO_QUOTA);
+               }
+       }
+
+       RETURN(QUOTA_OK);
+}
+
+#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \
+                                               : OBD_MD_FLGRPQUOTA)
+#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_FL_NO_USRQUOTA \
+                                               : OBD_FL_NO_GRPQUOTA)
+
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
+                   obd_flag valid, obd_flag flags)
+{
+       int type;
+       int rc = 0;
+       ENTRY;
+
+       if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0)
+               RETURN(0);
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               struct osc_quota_info *oqi;
+
+               if ((valid & MD_QUOTA_FLAG(type)) == 0)
+                       continue;
+
+               /* lookup the ID in the per-type hash table */
+               oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]);
+               if ((flags & FL_QUOTA_FLAG(type)) != 0) {
+                       /* This ID is getting close to its quota limit, let's
+                        * switch to sync I/O */
+                       if (oqi != NULL)
+                               continue;
+
+                       oqi = osc_oqi_alloc(qid[type]);
+                       if (oqi == NULL) {
+                               rc = -ENOMEM;
+                               break;
+                       }
+
+                       rc = cfs_hash_add_unique(cli->cl_quota_hash[type],
+                                                &qid[type], &oqi->oqi_hash);
+                       /* race with others? */
+                       if (rc == -EALREADY) {
+                               rc = 0;
+                               OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+                       }
+
+                       CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n",
+                              cli->cl_import->imp_obd->obd_name,
+                              type == USRQUOTA ? "user" : "group",
+                              qid[type], rc);
+               } else {
+                       /* This ID is now off the hook, let's remove it from
+                        * the hash table */
+                       if (oqi == NULL)
+                               continue;
+
+                       oqi = cfs_hash_del_key(cli->cl_quota_hash[type],
+                                              &qid[type]);
+                       if (oqi)
+                               OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+
+                       CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n",
+                              cli->cl_import->imp_obd->obd_name,
+                              type == USRQUOTA ? "user" : "group",
+                              qid[type], oqi);
+               }
+       }
+
+       RETURN(rc);
+}
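+/*
+ * Editor's sketch of a hypothetical caller (not part of this patch): a
+ * cached-write path could consult the per-type hashes before queueing
+ * pages, e.g.
+ *
+ *     unsigned int qid[MAXQUOTAS] = { oa->o_uid, oa->o_gid };
+ *
+ *     if (osc_quota_chkdq(cli, qid) == NO_QUOTA)
+ *             rc = -EDQUOT;   (i.e. fall back to sync I/O)
+ *
+ * while reply handling feeds the server's quota flags back in through
+ * osc_quota_setdq().
+ */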
+
+/*
+ * Hash operations for uid/gid <-> osc_quota_info
+ */
+static unsigned
+oqi_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_u32_hash(*((__u32*)key), mask);
+}
+
+static int
+oqi_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+       obd_uid uid;
+
+       LASSERT(key != NULL);
+       uid = *((obd_uid*)key);
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+       return uid == oqi->oqi_id;
+}
+
+static void *
+oqi_key(struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+       return &oqi->oqi_id;
+}
+
+static void *
+oqi_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+}
+
+static void
+oqi_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+}
+
+static void
+oqi_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct osc_quota_info *oqi;
+
+       oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash);
+
+       OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem);
+}
+
+#define HASH_QUOTA_BKT_BITS 5
+#define HASH_QUOTA_CUR_BITS 5
+#define HASH_QUOTA_MAX_BITS 15
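+/* Editor's arithmetic: assuming the usual cfs_hash semantics, the table
+ * starts at 2^5 == 32 buckets and may grow up to 2^15 == 32768. */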
+
+static cfs_hash_ops_t quota_hash_ops = {
+       .hs_hash        = oqi_hashfn,
+       .hs_keycmp      = oqi_keycmp,
+       .hs_key         = oqi_key,
+       .hs_object      = oqi_object,
+       .hs_get         = oqi_get,
+       .hs_put_locked  = oqi_put_locked,
+       .hs_exit        = oqi_exit,
+};
+
+int osc_quota_setup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int i, type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++) {
+               cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH",
+                                                          HASH_QUOTA_CUR_BITS,
+                                                          HASH_QUOTA_MAX_BITS,
+                                                          HASH_QUOTA_BKT_BITS,
+                                                          0,
+                                                          CFS_HASH_MIN_THETA,
+                                                          CFS_HASH_MAX_THETA,
+                                                          &quota_hash_ops,
+                                                          CFS_HASH_DEFAULT);
+               if (cli->cl_quota_hash[type] == NULL)
+                       break;
+       }
+
+       if (type == MAXQUOTAS)
+               RETURN(0);
+
+       for (i = 0; i < type; i++)
+               cfs_hash_putref(cli->cl_quota_hash[i]);
+
+       RETURN(-ENOMEM);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+       struct client_obd     *cli = &obd->u.cli;
+       int type;
+       ENTRY;
+
+       for (type = 0; type < MAXQUOTAS; type++)
+               cfs_hash_putref(cli->cl_quota_hash[type]);
+
+       RETURN(0);
+}
+
+int osc_quotactl(struct obd_device *unused, struct obd_export *exp,
+                struct obd_quotactl *oqctl)
+{
+       struct ptlrpc_request *req;
+       struct obd_quotactl   *oqc;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION,
+                                       OST_QUOTACTL);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *oqc = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+       ptlrpc_at_set_req_timeout(req);
+       req->rq_no_resend = 1;
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
+
+       if (req->rq_repmsg &&
+           (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) {
+               *oqctl = *oqc;
+       } else if (!rc) {
+               CERROR ("Can't unpack obd_quotactl\n");
+               rc = -EPROTO;
+       }
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+int osc_quotacheck(struct obd_device *unused, struct obd_export *exp,
+                  struct obd_quotactl *oqctl)
+{
+       struct client_obd       *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request   *req;
+       struct obd_quotactl     *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
+                                       &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION,
+                                       OST_QUOTACHECK);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
+       *body = *oqctl;
+
+       ptlrpc_request_set_replen(req);
+
+       /* the next poll will find -ENODATA, which means a quotacheck is
+        * in progress */
+       cli->cl_qchk_stat = -ENODATA;
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               cli->cl_qchk_stat = rc;
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+       struct client_obd *cli = &exp->exp_obd->u.cli;
+       int rc;
+       ENTRY;
+
+       qchk->obd_uuid = cli->cl_target_uuid;
+       memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME));
+
+       rc = cli->cl_qchk_stat;
+       /* the client is not the previous one */
+       if (rc == CL_NOT_QUOTACHECKED)
+               rc = -EINTR;
+       RETURN(rc);
+}
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
new file mode 100644 (file)
index 0000000..53d6a35
--- /dev/null
@@ -0,0 +1,3708 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_dlm.h>
+#include <lustre_net.h>
+#include <lustre/lustre_user.h>
+#include <obd_cksum.h>
+#include <obd_ost.h>
+#include <obd_lov.h>
+
+#ifdef  __CYGWIN__
+# include <ctype.h>
+#endif
+
+#include <lustre_ha.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include <lustre_debug.h>
+#include <lustre_param.h>
+#include <lustre_fid.h>
+#include "osc_internal.h"
+#include "osc_cl_internal.h"
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(const struct lu_env *env,
+                        struct ptlrpc_request *req, void *data, int rc);
+int osc_cleanup(struct obd_device *obd);
+
+/* Pack OSC object metadata for disk storage (LE byte order). */
+static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
+                     struct lov_stripe_md *lsm)
+{
+       int lmm_size;
+       ENTRY;
+
+       lmm_size = sizeof(**lmmp);
+       if (lmmp == NULL)
+               RETURN(lmm_size);
+
+       if (*lmmp != NULL && lsm == NULL) {
+               OBD_FREE(*lmmp, lmm_size);
+               *lmmp = NULL;
+               RETURN(0);
+       } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) {
+               RETURN(-EBADF);
+       }
+
+       if (*lmmp == NULL) {
+               OBD_ALLOC(*lmmp, lmm_size);
+               if (*lmmp == NULL)
+                       RETURN(-ENOMEM);
+       }
+
+       if (lsm)
+               ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi);
+
+       RETURN(lmm_size);
+}
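+/* Editor's note on the calling convention above (inferred from the code):
+ * passing lmmp == NULL just returns the wire size to allocate; a non-NULL
+ * *lmmp with lsm == NULL frees the buffer; otherwise the object id is
+ * byte-swapped into *lmmp. osc_unpackmd() below mirrors the same
+ * size-query/free/fill convention for the lsm buffer. */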
+
+/* Unpack OSC object metadata from disk storage (LE byte order). */
+static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+                       struct lov_mds_md *lmm, int lmm_bytes)
+{
+       int lsm_size;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       ENTRY;
+
+       if (lmm != NULL) {
+               if (lmm_bytes < sizeof(*lmm)) {
+                       CERROR("%s: lov_mds_md too small: %d, need %d\n",
+                              exp->exp_obd->obd_name, lmm_bytes,
+                              (int)sizeof(*lmm));
+                       RETURN(-EINVAL);
+               }
+               /* XXX LOV_MAGIC etc check? */
+
+               if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) {
+                       CERROR("%s: zero lmm_object_id: rc = %d\n",
+                              exp->exp_obd->obd_name, -EINVAL);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       lsm_size = lov_stripe_md_size(1);
+       if (lsmp == NULL)
+               RETURN(lsm_size);
+
+       if (*lsmp != NULL && lmm == NULL) {
+               OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+               OBD_FREE(*lsmp, lsm_size);
+               *lsmp = NULL;
+               RETURN(0);
+       }
+
+       if (*lsmp == NULL) {
+               OBD_ALLOC(*lsmp, lsm_size);
+               if (unlikely(*lsmp == NULL))
+                       RETURN(-ENOMEM);
+               OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo));
+               if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) {
+                       OBD_FREE(*lsmp, lsm_size);
+                       RETURN(-ENOMEM);
+               }
+               loi_init((*lsmp)->lsm_oinfo[0]);
+       } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) {
+               RETURN(-EBADF);
+       }
+
+       if (lmm != NULL)
+               /* XXX zero *lsmp? */
+               ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi);
+
+       if (imp != NULL &&
+           (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES))
+               (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes;
+       else
+               (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
+
+       RETURN(lsm_size);
+}
+
+static inline void osc_pack_capa(struct ptlrpc_request *req,
+                                struct ost_body *body, void *capa)
+{
+       struct obd_capa *oc = (struct obd_capa *)capa;
+       struct lustre_capa *c;
+
+       if (!capa)
+               return;
+
+       c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1);
+       LASSERT(c);
+       capa_cpy(c, oc);
+       body->oa.o_valid |= OBD_MD_FLOSSCAPA;
+       DEBUG_CAPA(D_SEC, c, "pack");
+}
+
+static inline void osc_pack_req_body(struct ptlrpc_request *req,
+                                    struct obd_info *oinfo)
+{
+       struct ost_body *body;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+                            oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+}
+
+static inline void osc_set_capa_size(struct ptlrpc_request *req,
+                                    const struct req_msg_field *field,
+                                    struct obd_capa *oc)
+{
+       if (oc == NULL)
+               req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0);
+       else
+               /* it is already calculated as sizeof struct obd_capa */
+               ;
+}
+
+static int osc_getattr_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_async_args *aa, int rc)
+{
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body) {
+               CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+               lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+                                    aa->aa_oi->oi_oa, &body->oa);
+
+               /* This should really be sent by the OST */
+               aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE;
+               aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+       } else {
+               CDEBUG(D_INFO, "can't unpack ost_body\n");
+               rc = -EPROTO;
+               aa->aa_oi->oi_oa->o_valid = 0;
+       }
+out:
+       rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+       RETURN(rc);
+}
+
+static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       struct osc_async_args *aa;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret;
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oi = oinfo;
+
+       ptlrpc_set_add_req(set, req);
+       RETURN(0);
+}
+
+static int osc_getattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+       lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+                            &body->oa);
+
+       oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd);
+       oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ;
+
+       EXIT;
+ out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int osc_setattr(const struct lu_env *env, struct obd_export *exp,
+                      struct obd_info *oinfo, struct obd_trans_info *oti)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oinfo->oi_oa,
+                            &body->oa);
+
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int osc_setattr_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_setattr_args *sa, int rc)
+{
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa,
+                            &body->oa);
+out:
+       rc = sa->sa_upcall(sa->sa_cookie, rc);
+       RETURN(rc);
+}
+
+int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo,
+                          struct obd_trans_info *oti,
+                          obd_enqueue_update_f upcall, void *cookie,
+                          struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request   *req;
+       struct osc_setattr_args *sa;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE)
+               oinfo->oi_oa->o_lcookie = *oti->oti_logcookies;
+
+       osc_pack_req_body(req, oinfo);
+
+       ptlrpc_request_set_replen(req);
+
+       /* do mds to ost setattr asynchronously */
+       if (!rqset) {
+               /* Do not wait for response. */
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       } else {
+               req->rq_interpret_reply =
+                       (ptlrpc_interpterer_t)osc_setattr_interpret;
+
+               CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+               sa = ptlrpc_req_async_args(req);
+               sa->sa_oa = oinfo->oi_oa;
+               sa->sa_upcall = upcall;
+               sa->sa_cookie = cookie;
+
+               if (rqset == PTLRPCD_SET)
+                       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+               else
+                       ptlrpc_set_add_req(rqset, req);
+       }
+
+       RETURN(0);
+}
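+/* Editor's note: the rqset argument above selects the completion model --
+ * NULL fires the request without waiting for a reply, PTLRPCD_SET hands it
+ * to the ptlrpcd daemon, and any other set is left for the caller to
+ * drive. */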
+
+static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
+                            struct obd_trans_info *oti,
+                            struct ptlrpc_request_set *rqset)
+{
+       return osc_setattr_async_base(exp, oinfo, oti,
+                                     oinfo->oi_cb_up, oinfo, rqset);
+}
+
+int osc_real_create(struct obd_export *exp, struct obdo *oa,
+                   struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       struct lov_stripe_md  *lsm;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oa);
+       LASSERT(ea);
+
+       lsm = *ea;
+       if (!lsm) {
+               rc = obd_alloc_memmd(exp, &lsm);
+               if (rc < 0)
+                       RETURN(rc);
+       }
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+       ptlrpc_request_set_replen(req);
+
+       if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+           oa->o_flags == OBD_FL_DELORPHAN) {
+               DEBUG_REQ(D_HA, req,
+                         "delorphan from OST integration");
+               /* Don't resend the delorphan req */
+               req->rq_no_resend = req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out_req, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL)
+               GOTO(out_req, rc = -EPROTO);
+
+       CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags);
+       lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa);
+
+       oa->o_blksize = cli_brw_size(exp->exp_obd);
+       oa->o_valid |= OBD_MD_FLBLKSZ;
+
+       /* XXX LOV STACKING: the lsm that is passed to us from LOV does not
+        * have valid lsm_oinfo data structs, so don't go touching that.
+        * This needs to be fixed in a big way.
+        */
+       lsm->lsm_oi = oa->o_oi;
+       *ea = lsm;
+
+       if (oti != NULL) {
+               oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg);
+
+               if (oa->o_valid & OBD_MD_FLCOOKIE) {
+                       if (!oti->oti_logcookies)
+                               oti_alloc_cookies(oti, 1);
+                       *oti->oti_logcookies = oa->o_lcookie;
+               }
+       }
+
+       CDEBUG(D_HA, "transno: "LPD64"\n",
+              lustre_msg_get_transno(req->rq_repmsg));
+out_req:
+       ptlrpc_req_finished(req);
+out:
+       if (rc && !*ea)
+               obd_free_memmd(exp, &lsm);
+       RETURN(rc);
+}
+
+int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo,
+                  obd_enqueue_update_f upcall, void *cookie,
+                  struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request   *req;
+       struct osc_setattr_args *sa;
+       struct ost_body  *body;
+       int                   rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+                            oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+
+       ptlrpc_request_set_replen(req);
+
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+       CLASSERT(sizeof(*sa) <= sizeof(req->rq_async_args));
+       sa = ptlrpc_req_async_args(req);
+       sa->sa_oa     = oinfo->oi_oa;
+       sa->sa_upcall = upcall;
+       sa->sa_cookie = cookie;
+       if (rqset == PTLRPCD_SET)
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       else
+               ptlrpc_set_add_req(rqset, req);
+
+       RETURN(0);
+}
+
+static int osc_punch(const struct lu_env *env, struct obd_export *exp,
+                    struct obd_info *oinfo, struct obd_trans_info *oti,
+                    struct ptlrpc_request_set *rqset)
+{
+       oinfo->oi_oa->o_size   = oinfo->oi_policy.l_extent.start;
+       oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end;
+       oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+       return osc_punch_base(exp, oinfo,
+                             oinfo->oi_cb_up, oinfo, rqset);
+}
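+
+/* Note on the overload above: osc_punch() packs the punch extent into the
+ * obdo by reusing o_size for l_extent.start and o_blocks for l_extent.end,
+ * the same start/end overload that osc_sync_base() documents below. */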
+
+static int osc_sync_interpret(const struct lu_env *env,
+                             struct ptlrpc_request *req,
+                             void *arg, int rc)
+{
+       struct osc_fsync_args *fa = arg;
+       struct ost_body *body;
+       ENTRY;
+
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL) {
+               CERROR("can't unpack ost_body\n");
+               GOTO(out, rc = -EPROTO);
+       }
+
+       *fa->fa_oi->oi_oa = body->oa;
+out:
+       rc = fa->fa_upcall(fa->fa_cookie, rc);
+       RETURN(rc);
+}
+
+int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo,
+                 obd_enqueue_update_f upcall, void *cookie,
+                 struct ptlrpc_request_set *rqset)
+{
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       struct osc_fsync_args *fa;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /* overload the size and blocks fields in the oa with start/end */
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa,
+                            oinfo->oi_oa);
+       osc_pack_capa(req, body, oinfo->oi_capa);
+
+       ptlrpc_request_set_replen(req);
+       req->rq_interpret_reply = osc_sync_interpret;
+
+       CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args));
+       fa = ptlrpc_req_async_args(req);
+       fa->fa_oi = oinfo;
+       fa->fa_upcall = upcall;
+       fa->fa_cookie = cookie;
+
+       if (rqset == PTLRPCD_SET)
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       else
+               ptlrpc_set_add_req(rqset, req);
+
+       RETURN(0);
+}
+
+static int osc_sync(const struct lu_env *env, struct obd_export *exp,
+                   struct obd_info *oinfo, obd_size start, obd_size end,
+                   struct ptlrpc_request_set *set)
+{
+       ENTRY;
+
+       if (!oinfo->oi_oa) {
+               CDEBUG(D_INFO, "oa NULL\n");
+               RETURN(-EINVAL);
+       }
+
+       oinfo->oi_oa->o_size = start;
+       oinfo->oi_oa->o_blocks = end;
+       oinfo->oi_oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+
+       RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set));
+}
+
+/* Find and cancel locally all locks matched by @mode in the resource found
+ * by @oa.  Found locks are added to the @cancels list.  Returns the number
+ * of locks added to the @cancels list. */
+static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa,
+                                  struct list_head *cancels,
+                                  ldlm_mode_t mode, int lock_flags)
+{
+       struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
+       struct ldlm_res_id res_id;
+       struct ldlm_resource *res;
+       int count;
+       ENTRY;
+
+       /* Return, i.e. cancel nothing, only if ELC is supported (flag in
+        * export) but disabled through procfs (flag in NS).
+        *
+        * This distinguishes from a case when ELC is not supported originally,
+        * when we still want to cancel locks in advance and just cancel them
+        * locally, without sending any RPC. */
+       if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns))
+               RETURN(0);
+
+       ostid_build_res_name(&oa->o_oi, &res_id);
+       res = ldlm_resource_get(ns, NULL, &res_id, 0, 0);
+       if (res == NULL)
+               RETURN(0);
+
+       LDLM_RESOURCE_ADDREF(res);
+       count = ldlm_cancel_resource_local(res, cancels, NULL, mode,
+                                          lock_flags, 0, NULL);
+       LDLM_RESOURCE_DELREF(res);
+       ldlm_resource_putref(res);
+       RETURN(count);
+}
+
+static int osc_destroy_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req, void *data,
+                                int rc)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+
+       atomic_dec(&cli->cl_destroy_in_flight);
+       wake_up(&cli->cl_destroy_waitq);
+       return 0;
+}
+
+static int osc_can_send_destroy(struct client_obd *cli)
+{
+       if (atomic_inc_return(&cli->cl_destroy_in_flight) <=
+           cli->cl_max_rpcs_in_flight) {
+               /* The destroy request can be sent */
+               return 1;
+       }
+       if (atomic_dec_return(&cli->cl_destroy_in_flight) <
+           cli->cl_max_rpcs_in_flight) {
+               /*
+                * The counter has been modified between the two atomic
+                * operations.
+                */
+               wake_up(&cli->cl_destroy_waitq);
+       }
+       return 0;
+}
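+
+/* Note on the inc/dec pair in osc_can_send_destroy(): if the counter is
+ * changed between the two atomic operations (e.g. a destroy completes in
+ * osc_destroy_interpret()), our decrement may be the one that brings it
+ * back under cl_max_rpcs_in_flight, so the wakeup is re-issued here lest a
+ * waiter on cl_destroy_waitq sleep past a newly freed slot. */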
+
+int osc_create(const struct lu_env *env, struct obd_export *exp,
+              struct obdo *oa, struct lov_stripe_md **ea,
+              struct obd_trans_info *oti)
+{
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(oa);
+       LASSERT(ea);
+       LASSERT(oa->o_valid & OBD_MD_FLGROUP);
+
+       if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+           oa->o_flags == OBD_FL_RECREATE_OBJS) {
+               RETURN(osc_real_create(exp, oa, ea, oti));
+       }
+
+       if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi)))
+               RETURN(osc_real_create(exp, oa, ea, oti));
+
+       /* we should not get here anymore */
+       LBUG();
+
+       RETURN(rc);
+}
+
+/* Destroy requests can always be async on the client, and we don't really
+ * care about the return code, since the client cannot do anything at all
+ * about a destroy failure.
+ * When the MDS is unlinking a filename, it saves the file objects into a
+ * recovery llog, and these object records are cancelled when the OST
+ * reports they were destroyed and synced to disk (i.e. the transaction
+ * committed).  If the client dies, or the OST is down when the object
+ * should be destroyed, the records are not cancelled, and when the OST
+ * next reconnects to the MDS, it will retrieve the llog unlink logs and
+ * then send the log cancellation cookies to the MDS after committing the
+ * destroy transactions. */
+static int osc_destroy(const struct lu_env *env, struct obd_export *exp,
+                      struct obdo *oa, struct lov_stripe_md *ea,
+                      struct obd_trans_info *oti, struct obd_export *md_export,
+                      void *capa)
+{
+       struct client_obd     *cli = &exp->exp_obd->u.cli;
+       struct ptlrpc_request *req;
+       struct ost_body       *body;
+       LIST_HEAD(cancels);
+       int rc, count;
+       ENTRY;
+
+       if (!oa) {
+               CDEBUG(D_INFO, "oa NULL\n");
+               RETURN(-EINVAL);
+       }
+
+       count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW,
+                                       LDLM_FL_DISCARD_DATA);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa);
+       rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY,
+                              0, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+
+       if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE)
+               oa->o_lcookie = *oti->oti_logcookies;
+       body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+       osc_pack_capa(req, body, (struct obd_capa *)capa);
+       ptlrpc_request_set_replen(req);
+
+       /* If osc_destroy is called to destroy an unlink orphan (sent from
+        * MDT to OST), it must not block here, because the caller might be
+        * a ptlrpcd thread, and it is not good to block a ptlrpcd thread
+        * (b=16006). */
+       if (!(oa->o_flags & OBD_FL_DELORPHAN)) {
+               req->rq_interpret_reply = osc_destroy_interpret;
+               if (!osc_can_send_destroy(cli)) {
+                       struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+                                                         NULL);
+
+                       /*
+                        * Wait until the number of on-going destroy RPCs drops
+                        * under max_rpc_in_flight
+                        */
+                       l_wait_event_exclusive(cli->cl_destroy_waitq,
+                                              osc_can_send_destroy(cli), &lwi);
+               }
+       }
+
+       /* Do not wait for response */
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       RETURN(0);
+}
+
+static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
+                               long writing_bytes)
+{
+       obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT;
+
+       LASSERT(!(oa->o_valid & bits));
+
+       oa->o_valid |= bits;
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       oa->o_dirty = cli->cl_dirty;
+       if (unlikely(cli->cl_dirty - cli->cl_dirty_transit >
+                    cli->cl_dirty_max)) {
+               CERROR("dirty %lu - %lu > dirty_max %lu\n",
+                      cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
+               oa->o_undirty = 0;
+       } else if (unlikely(atomic_read(&obd_dirty_pages) -
+                           atomic_read(&obd_dirty_transit_pages) >
+                           (long)(obd_max_dirty_pages + 1))) {
+               /* The atomic_read() allowing the atomic_inc() are
+                * not covered by a lock thus they may safely race and trip
+                * this CERROR() unless we add in a small fudge factor (+1). */
+               CERROR("dirty %d - %d > system dirty_max %d\n",
+                      atomic_read(&obd_dirty_pages),
+                      atomic_read(&obd_dirty_transit_pages),
+                      obd_max_dirty_pages);
+               oa->o_undirty = 0;
+       } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) {
+               CERROR("dirty %lu - dirty_max %lu too big???\n",
+                      cli->cl_dirty, cli->cl_dirty_max);
+               oa->o_undirty = 0;
+       } else {
+               long max_in_flight = (cli->cl_max_pages_per_rpc <<
+                                     PAGE_CACHE_SHIFT)*
+                                    (cli->cl_max_rpcs_in_flight + 1);
+               oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
+       }
+       oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant;
+       oa->o_dropped = cli->cl_lost_grant;
+       cli->cl_lost_grant = 0;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n",
+              oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant);
+
+}
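+
+/* A worked example of the o_undirty arithmetic above, with assumed values:
+ * 4 KiB pages, cl_max_pages_per_rpc = 256 and cl_max_rpcs_in_flight = 8
+ * give max_in_flight = (256 << 12) * (8 + 1) = 9 MiB, so with an assumed
+ * cl_dirty_max of 32 MiB the client advertises
+ * o_undirty = max(32 MiB, 9 MiB) = 32 MiB of extra dirty headroom. */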
+
+void osc_update_next_shrink(struct client_obd *cli)
+{
+       cli->cl_next_shrink_grant =
+               cfs_time_shift(cli->cl_grant_shrink_interval);
+       CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
+              cli->cl_next_shrink_grant);
+}
+
+static void __osc_update_grant(struct client_obd *cli, obd_size grant)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       cli->cl_avail_grant += grant;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{
+       if (body->oa.o_valid & OBD_MD_FLGRANT) {
+               CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
+               __osc_update_grant(cli, body->oa.o_grant);
+       }
+}
+
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set);
+
+static int osc_shrink_grant_interpret(const struct lu_env *env,
+                                     struct ptlrpc_request *req,
+                                     void *aa, int rc)
+{
+       struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
+       struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa;
+       struct ost_body *body;
+
+       if (rc != 0) {
+               __osc_update_grant(cli, oa->o_grant);
+               GOTO(out, rc);
+       }
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       LASSERT(body);
+       osc_update_grant(cli, body);
+out:
+       OBDO_FREE(oa);
+       return rc;
+}
+
+static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
+{
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       oa->o_grant = cli->cl_avail_grant / 4;
+       cli->cl_avail_grant -= oa->o_grant;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
+               oa->o_valid |= OBD_MD_FLFLAGS;
+               oa->o_flags = 0;
+       }
+       oa->o_flags |= OBD_FL_SHRINK_GRANT;
+       osc_update_next_shrink(cli);
+}
+
+/* Shrink the current grant, either from some large amount to enough for a
+ * full set of in-flight RPCs, or if we have already shrunk to that limit
+ * then to enough for a single RPC.  This avoids keeping more grant than
+ * needed, and avoids shrinking the grant piecemeal. */
+static int osc_shrink_grant(struct client_obd *cli)
+{
+       __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
+                            (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_avail_grant <= target_bytes)
+               target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       return osc_shrink_grant_to_target(cli, target_bytes);
+}
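+
+/* Shrink-target sketch for osc_shrink_grant(), with assumed values (4 KiB
+ * pages, 256 pages per RPC, 8 RPCs in flight): the first-stage target is
+ * (8 + 1) * 1 MiB = 9 MiB; once cl_avail_grant is already at or below
+ * that, the target drops to a single RPC's worth, 1 MiB. */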
+
+int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
+{
+       int                     rc = 0;
+       struct ost_body *body;
+       ENTRY;
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       /* Don't shrink if we are already above or below the desired limit.
+        * We don't want to shrink below a single RPC, as that will negatively
+        * impact block allocation and long-term performance. */
+       if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
+               target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+       if (target_bytes >= cli->cl_avail_grant) {
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               RETURN(0);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       OBD_ALLOC_PTR(body);
+       if (!body)
+               RETURN(-ENOMEM);
+
+       osc_announce_cached(cli, &body->oa, 0);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       body->oa.o_grant = cli->cl_avail_grant - target_bytes;
+       cli->cl_avail_grant = target_bytes;
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+       if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) {
+               body->oa.o_valid |= OBD_MD_FLFLAGS;
+               body->oa.o_flags = 0;
+       }
+       body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+       osc_update_next_shrink(cli);
+
+       rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export,
+                               sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
+                               sizeof(*body), body, NULL);
+       if (rc != 0)
+               __osc_update_grant(cli, body->oa.o_grant);
+       OBD_FREE_PTR(body);
+       RETURN(rc);
+}
+
+static int osc_should_shrink_grant(struct client_obd *client)
+{
+       cfs_time_t time = cfs_time_current();
+       cfs_time_t next_shrink = client->cl_next_shrink_grant;
+
+       if ((client->cl_import->imp_connect_data.ocd_connect_flags &
+            OBD_CONNECT_GRANT_SHRINK) == 0)
+               return 0;
+
+       if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) {
+               /* Get the current RPC size directly, instead of going via:
+                * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
+                * Keep comment here so that it can be found by searching. */
+               int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+
+               if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
+                   client->cl_avail_grant > brw_size)
+                       return 1;
+               else
+                       osc_update_next_shrink(client);
+       }
+       return 0;
+}
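+
+/* Timing note: the "next_shrink - 5 * CFS_TICK" test above fires slightly
+ * before cl_next_shrink_grant, which appears intended to keep the periodic
+ * timeout callback from consistently missing the deadline by a tick; if
+ * the import is not FULL or no grant is spare, the deadline is simply
+ * pushed back via osc_update_next_shrink(). */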
+
+static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data)
+{
+       struct client_obd *client;
+
+       list_for_each_entry(client, &item->ti_obd_list,
+                               cl_grant_shrink_list) {
+               if (osc_should_shrink_grant(client))
+                       osc_shrink_grant(client);
+       }
+       return 0;
+}
+
+static int osc_add_shrink_grant(struct client_obd *client)
+{
+       int rc;
+
+       rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+                                      TIMEOUT_GRANT,
+                                      osc_grant_shrink_grant_cb, NULL,
+                                      &client->cl_grant_shrink_list);
+       if (rc) {
+               CERROR("add grant client %s error %d\n",
+                       client->cl_import->imp_obd->obd_name, rc);
+               return rc;
+       }
+       CDEBUG(D_CACHE, "add grant client %s \n",
+              client->cl_import->imp_obd->obd_name);
+       osc_update_next_shrink(client);
+       return 0;
+}
+
+static int osc_del_shrink_grant(struct client_obd *client)
+{
+       return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
+                                        TIMEOUT_GRANT);
+}
+
+static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
+{
+       /*
+        * ocd_grant is the total grant amount we expect to hold: if we have
+        * been evicted, it is the new avail_grant amount, and cl_dirty will
+        * drop to 0 as in-flight RPCs fail out; otherwise, it is
+        * avail_grant + dirty.
+        *
+        * A race is tolerable here: if we are evicted but imp_state has
+        * already left the EVICTED state, then cl_dirty must already be 0.
+        */
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED)
+               cli->cl_avail_grant = ocd->ocd_grant;
+       else
+               cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty;
+
+       if (cli->cl_avail_grant < 0) {
+               CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n",
+                     cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant,
+                     ocd->ocd_grant, cli->cl_dirty);
+               /* workaround for servers which do not have the patch from
+                * LU-2679 */
+               cli->cl_avail_grant = ocd->ocd_grant;
+       }
+
+       /* determine the appropriate chunk size used by osc_extent. */
+       cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld."
+               "chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name,
+               cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits);
+
+       if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK &&
+           list_empty(&cli->cl_grant_shrink_list))
+               osc_add_shrink_grant(cli);
+}
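+
+/* Example of the grant accounting above, with assumed numbers: if the
+ * server granted ocd_grant = 2 MiB and the client still counts
+ * cl_dirty = 512 KiB of cached writes against an earlier grant,
+ * cl_avail_grant becomes 1.5 MiB; after an eviction cl_dirty is bound to
+ * drop to 0 as in-flight RPCs fail out, so the full 2 MiB is taken. */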
+
+/* We assume that the reason this OSC got a short read is that it read
+ * beyond the end of a stripe file; i.e. Lustre is reading a sparse file
+ * via the LOV, and it _knows_ it's reading inside the file; it's just that
+ * this stripe never got written at or beyond this stripe offset yet. */
+static void handle_short_read(int nob_read, obd_count page_count,
+                             struct brw_page **pga)
+{
+       char *ptr;
+       int i = 0;
+
+       /* skip bytes read OK */
+       while (nob_read > 0) {
+               LASSERT(page_count > 0);
+
+               if (pga[i]->count > nob_read) {
+                       /* EOF inside this page */
+                       ptr = kmap(pga[i]->pg) +
+                               (pga[i]->off & ~CFS_PAGE_MASK);
+                       memset(ptr + nob_read, 0, pga[i]->count - nob_read);
+                       kunmap(pga[i]->pg);
+                       page_count--;
+                       i++;
+                       break;
+               }
+
+               nob_read -= pga[i]->count;
+               page_count--;
+               i++;
+       }
+
+       /* zero remaining pages */
+       while (page_count-- > 0) {
+               ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK);
+               memset(ptr, 0, pga[i]->count);
+               kunmap(pga[i]->pg);
+               i++;
+       }
+}
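+
+/* A worked example for handle_short_read(), with assumed numbers: three
+ * 4 KiB pages and nob_read = 6144.  Page 0 (count 4096) was read in full;
+ * EOF lands inside page 1, so its last 2048 bytes are zeroed; page 2 is
+ * zeroed entirely by the second loop. */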
+
+static int check_write_rcs(struct ptlrpc_request *req,
+                          int requested_nob, int niocount,
+                          obd_count page_count, struct brw_page **pga)
+{
+       int     i;
+       __u32   *remote_rcs;
+
+       remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS,
+                                                 sizeof(*remote_rcs) *
+                                                 niocount);
+       if (remote_rcs == NULL) {
+               CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n");
+               return -EPROTO;
+       }
+
+       /* return error if any niobuf was in error */
+       for (i = 0; i < niocount; i++) {
+               if ((int)remote_rcs[i] < 0)
+                       return remote_rcs[i];
+
+               if (remote_rcs[i] != 0) {
+                       CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n",
+                               i, remote_rcs[i], req);
+                       return -EPROTO;
+               }
+       }
+
+       if (req->rq_bulk->bd_nob_transferred != requested_nob) {
+               CERROR("Unexpected # bytes transferred: %d (requested %d)\n",
+                      req->rq_bulk->bd_nob_transferred, requested_nob);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
+static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
+{
+       if (p1->flag != p2->flag) {
+               unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
+                                 OBD_BRW_SYNC | OBD_BRW_ASYNC |
+                                 OBD_BRW_NOQUOTA);
+
+               /* warn if we try to combine flags that we don't know to be
+                * safe to combine */
+               if (unlikely((p1->flag & mask) != (p2->flag & mask))) {
+                       CWARN("Saw flags 0x%x and 0x%x in the same brw, please "
+                             "report this at http://bugs.whamcloud.com/\n",
+                             p1->flag, p2->flag);
+               }
+               return 0;
+       }
+
+       return (p1->off + p1->count == p2->off);
+}
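+
+/* Merge example: with matching flags, p1->off = 0, p1->count = 4096 and
+ * p2->off = 4096 are contiguous, so can_merge_pages() returns true and
+ * osc_brw_prep_request() collapses the pair into a single niobuf; any gap
+ * or flag difference keeps them separate (with a warning if the differing
+ * flags are not known-safe to combine). */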
+
+static obd_count osc_checksum_bulk(int nob, obd_count pg_count,
+                                  struct brw_page **pga, int opc,
+                                  cksum_type_t cksum_type)
+{
+       __u32                           cksum;
+       int                             i = 0;
+       struct cfs_crypto_hash_desc     *hdesc;
+       unsigned int                    bufsize;
+       int                             err;
+       unsigned char                   cfs_alg = cksum_obd2cfs(cksum_type);
+
+       LASSERT(pg_count > 0);
+
+       hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0);
+       if (IS_ERR(hdesc)) {
+               CERROR("Unable to initialize checksum hash %s\n",
+                      cfs_crypto_hash_name(cfs_alg));
+               return PTR_ERR(hdesc);
+       }
+
+       while (nob > 0 && pg_count > 0) {
+               int count = pga[i]->count > nob ? nob : pga[i]->count;
+
+               /* corrupt the data before we compute the checksum, to
+                * simulate an OST->client data error */
+               if (i == 0 && opc == OST_READ &&
+                   OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
+                       unsigned char *ptr = kmap(pga[i]->pg);
+                       int off = pga[i]->off & ~CFS_PAGE_MASK;
+                       memcpy(ptr + off, "bad1", min(4, nob));
+                       kunmap(pga[i]->pg);
+               }
+               cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
+                                 pga[i]->off & ~CFS_PAGE_MASK,
+                                 count);
+               LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n",
+                              (int)(pga[i]->off & ~CFS_PAGE_MASK));
+
+               nob -= pga[i]->count;
+               pg_count--;
+               i++;
+       }
+
+       bufsize = 4;
+       err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
+
+       if (err)
+               cfs_crypto_hash_final(hdesc, NULL, NULL);
+
+       /* For sends we only compute a wrong checksum instead of corrupting
+        * the data, so it is still correct on a resend */
+       if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
+               cksum++;
+
+       return cksum;
+}
+
+static int osc_brw_prep_request(int cmd, struct client_obd *cli,
+                               struct obdo *oa, struct lov_stripe_md *lsm,
+                               obd_count page_count, struct brw_page **pga,
+                               struct ptlrpc_request **reqp,
+                               struct obd_capa *ocapa, int reserve,
+                               int resend)
+{
+       struct ptlrpc_request   *req;
+       struct ptlrpc_bulk_desc *desc;
+       struct ost_body  *body;
+       struct obd_ioobj        *ioobj;
+       struct niobuf_remote    *niobuf;
+       int niocount, i, requested_nob, opc, rc;
+       struct osc_brw_async_args *aa;
+       struct req_capsule      *pill;
+       struct brw_page *pg_prev;
+
+       ENTRY;
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
+               RETURN(-ENOMEM); /* Recoverable */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
+               RETURN(-EINVAL); /* Fatal */
+
+       if ((cmd & OBD_BRW_WRITE) != 0) {
+               opc = OST_WRITE;
+               req = ptlrpc_request_alloc_pool(cli->cl_import,
+                                               cli->cl_import->imp_rq_pool,
+                                               &RQF_OST_BRW_WRITE);
+       } else {
+               opc = OST_READ;
+               req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
+       }
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       for (niocount = i = 1; i < page_count; i++) {
+               if (!can_merge_pages(pga[i - 1], pga[i]))
+                       niocount++;
+       }
+
+       pill = &req->rq_pill;
+       req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
+                            sizeof(*ioobj));
+       req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
+                            niocount * sizeof(*niobuf));
+       osc_set_capa_size(req, &RMF_CAPA1, ocapa);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
+       ptlrpc_at_set_req_timeout(req);
+       /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
+        * retry logic */
+       req->rq_no_retry_einprogress = 1;
+
+       desc = ptlrpc_prep_bulk_imp(req, page_count,
+               cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
+               opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK,
+               OST_BULK_PORTAL);
+
+       if (desc == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /* NB request now owns desc and will free it when it gets freed */
+
+       body = req_capsule_client_get(pill, &RMF_OST_BODY);
+       ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
+       niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
+       LASSERT(body != NULL && ioobj != NULL && niobuf != NULL);
+
+       lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
+
+       obdo_to_ioobj(oa, ioobj);
+       ioobj->ioo_bufcnt = niocount;
+       /* The high bits of ioo_max_brw tell the server the _maximum_ number
+        * of bulks that might be sent for this request.  The actual number
+        * is decided when the RPC is finally sent in ptlrpc_register_bulk().
+        * It sends "max - 1" for compatibility with old clients that send
+        * "0", and also so that the actual maximum is a power-of-two number,
+        * not one less. LU-1431 */
+       ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
+       osc_pack_capa(req, body, ocapa);
+       LASSERT(page_count > 0);
+       pg_prev = pga[0];
+       for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+               struct brw_page *pg = pga[i];
+               int poff = pg->off & ~CFS_PAGE_MASK;
+
+               LASSERT(pg->count > 0);
+               /* make sure there is no gap in the middle of page array */
+               LASSERTF(page_count == 1 ||
+                        (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+                         ergo(i > 0 && i < page_count - 1,
+                              poff == 0 && pg->count == PAGE_CACHE_SIZE)   &&
+                         ergo(i == page_count - 1, poff == 0)),
+                        "i: %d/%d pg: %p off: "LPU64", count: %u\n",
+                        i, page_count, pg, pg->off, pg->count);
+               LASSERTF(i == 0 || pg->off > pg_prev->off,
+                        "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
+                        " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
+                        i, page_count,
+                        pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+                        pg_prev->pg, page_private(pg_prev->pg),
+                        pg_prev->pg->index, pg_prev->off);
+               LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
+                       (pg->flag & OBD_BRW_SRVLOCK));
+
+               ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count);
+               requested_nob += pg->count;
+
+               if (i > 0 && can_merge_pages(pg_prev, pg)) {
+                       niobuf--;
+                       niobuf->len += pg->count;
+               } else {
+                       niobuf->offset = pg->off;
+                       niobuf->len    = pg->count;
+                       niobuf->flags  = pg->flag;
+               }
+               pg_prev = pg;
+       }
+
+       LASSERTF((void *)(niobuf - niocount) ==
+               req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
+               "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
+               &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
+
+       osc_announce_cached(cli, &body->oa,
+                           opc == OST_WRITE ? requested_nob : 0);
+       if (resend) {
+               if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+                       body->oa.o_valid |= OBD_MD_FLFLAGS;
+                       body->oa.o_flags = 0;
+               }
+               body->oa.o_flags |= OBD_FL_RECOV_RESEND;
+       }
+
+       if (osc_should_shrink_grant(cli))
+               osc_shrink_grant_local(cli, &body->oa);
+
+       /* size[REQ_REC_OFF] is still sizeof(*body) */
+       if (opc == OST_WRITE) {
+               if (cli->cl_checksum &&
+                   !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+                       /* store cl_cksum_type in a local variable since
+                        * it can be changed via lprocfs */
+                       cksum_type_t cksum_type = cli->cl_cksum_type;
+
+                       if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+                               oa->o_flags &= OBD_FL_LOCAL_MASK;
+                               body->oa.o_flags = 0;
+                       }
+                       body->oa.o_flags |= cksum_type_pack(cksum_type);
+                       body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+                       body->oa.o_cksum = osc_checksum_bulk(requested_nob,
+                                                            page_count, pga,
+                                                            OST_WRITE,
+                                                            cksum_type);
+                       CDEBUG(D_PAGE, "checksum at write origin: %x\n",
+                              body->oa.o_cksum);
+                       /* save this in 'oa', too, for later checking */
+                       oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+                       oa->o_flags |= cksum_type_pack(cksum_type);
+               } else {
+                       /* clear out the checksum flag, in case this is a
+                        * resend but cl_checksum is no longer set. b=11238 */
+                       oa->o_valid &= ~OBD_MD_FLCKSUM;
+               }
+               oa->o_cksum = body->oa.o_cksum;
+               /* 1 RC per niobuf */
+               req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
+                                    sizeof(__u32) * niocount);
+       } else {
+               if (cli->cl_checksum &&
+                   !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+                       if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
+                               body->oa.o_flags = 0;
+                       body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type);
+                       body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+               }
+       }
+       ptlrpc_request_set_replen(req);
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oa = oa;
+       aa->aa_requested_nob = requested_nob;
+       aa->aa_nio_count = niocount;
+       aa->aa_page_count = page_count;
+       aa->aa_resends = 0;
+       aa->aa_ppga = pga;
+       aa->aa_cli = cli;
+       INIT_LIST_HEAD(&aa->aa_oaps);
+       if (ocapa && reserve)
+               aa->aa_ocapa = capa_get(ocapa);
+
+       *reqp = req;
+       RETURN(0);
+
+ out:
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer,
+                               __u32 client_cksum, __u32 server_cksum, int nob,
+                               obd_count page_count, struct brw_page **pga,
+                               cksum_type_t client_cksum_type)
+{
+       __u32 new_cksum;
+       char *msg;
+       cksum_type_t cksum_type;
+
+       if (server_cksum == client_cksum) {
+               CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+               return 0;
+       }
+
+       cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ?
+                                      oa->o_flags : 0);
+       new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE,
+                                     cksum_type);
+
+       if (cksum_type != client_cksum_type)
+               msg = "the server did not use the checksum type specified in "
+                     "the original request - likely a protocol problem";
+       else if (new_cksum == server_cksum)
+               msg = "changed on the client after we checksummed it - "
+                     "likely false positive due to mmap IO (bug 11742)";
+       else if (new_cksum == client_cksum)
+               msg = "changed in transit before arrival at OST";
+       else
+               msg = "changed in transit AND doesn't match the original - "
+                     "likely false positive due to mmap IO (bug 11742)";
+
+       LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID
+                          " object "DOSTID" extent ["LPU64"-"LPU64"]\n",
+                          msg, libcfs_nid2str(peer->nid),
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0,
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
+                          oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
+                          POSTID(&oa->o_oi), pga[0]->off,
+                          pga[page_count-1]->off + pga[page_count-1]->count - 1);
+       CERROR("original client csum %x (type %x), server csum %x (type %x), "
+              "client csum now %x\n", client_cksum, client_cksum_type,
+              server_cksum, cksum_type, new_cksum);
+       return 1;
+}
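+
+/* The diagnosis table in check_write_checksum(), for quick reference
+ * (new_cksum is recomputed locally from the pages still in memory):
+ *   cksum_type != client_cksum_type -> server used a different algorithm;
+ *   new_cksum == server_cksum       -> pages changed after the client
+ *                                      checksummed them (likely mmap IO);
+ *   new_cksum == client_cksum       -> corrupted in transit to the OST;
+ *   matches neither                 -> changed in transit and after the
+ *                                      client checksummed them. */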
+
+/* Note rc enters this function as number of bytes transferred */
+static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
+{
+       struct osc_brw_async_args *aa = (void *)&req->rq_async_args;
+       const lnet_process_id_t *peer =
+                       &req->rq_import->imp_connection->c_peer;
+       struct client_obd *cli = aa->aa_cli;
+       struct ost_body *body;
+       __u32 client_cksum = 0;
+       ENTRY;
+
+       if (rc < 0 && rc != -EDQUOT) {
+               DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc);
+               RETURN(rc);
+       }
+
+       LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
+       body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY);
+       if (body == NULL) {
+               DEBUG_REQ(D_INFO, req, "Can't unpack body\n");
+               RETURN(-EPROTO);
+       }
+
+       /* set/clear over quota flag for a uid/gid */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
+           body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+               unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+               CDEBUG(D_QUOTA, "setdq for [%u %u] with valid "LPX64", flags %x\n",
+                      body->oa.o_uid, body->oa.o_gid, body->oa.o_valid,
+                      body->oa.o_flags);
+               osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags);
+       }
+
+       osc_update_grant(cli, body);
+
+       if (rc < 0)
+               RETURN(rc);
+
+       if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM)
+               client_cksum = aa->aa_oa->o_cksum; /* save for later */
+
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) {
+               if (rc > 0) {
+                       CERROR("Unexpected +ve rc %d\n", rc);
+                       RETURN(-EPROTO);
+               }
+               LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob);
+
+               if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk))
+                       RETURN(-EAGAIN);
+
+               if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum &&
+                   check_write_checksum(&body->oa, peer, client_cksum,
+                                        body->oa.o_cksum, aa->aa_requested_nob,
+                                        aa->aa_page_count, aa->aa_ppga,
+                                        cksum_type_unpack(aa->aa_oa->o_flags)))
+                       RETURN(-EAGAIN);
+
+               rc = check_write_rcs(req, aa->aa_requested_nob,
+                                    aa->aa_nio_count, aa->aa_page_count,
+                                    aa->aa_ppga);
+               GOTO(out, rc);
+       }
+
+       /* The rest of this function executes only for OST_READs */
+
+       /* if unwrap_bulk failed, return -EAGAIN to retry */
+       rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
+       if (rc < 0)
+               GOTO(out, rc = -EAGAIN);
+
+       if (rc > aa->aa_requested_nob) {
+               CERROR("Unexpected rc %d (%d requested)\n", rc,
+                      aa->aa_requested_nob);
+               RETURN(-EPROTO);
+       }
+
+       if (rc != req->rq_bulk->bd_nob_transferred) {
+               CERROR("Unexpected rc %d (%d transferred)\n",
+                      rc, req->rq_bulk->bd_nob_transferred);
+               RETURN(-EPROTO);
+       }
+
+       if (rc < aa->aa_requested_nob)
+               handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+
+       if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+               static int cksum_counter;
+               __u32      server_cksum = body->oa.o_cksum;
+               char      *via;
+               char      *router;
+               cksum_type_t cksum_type;
+
+               cksum_type = cksum_type_unpack(body->oa.o_valid & OBD_MD_FLFLAGS ?
+                                              body->oa.o_flags : 0);
+               client_cksum = osc_checksum_bulk(rc, aa->aa_page_count,
+                                                aa->aa_ppga, OST_READ,
+                                                cksum_type);
+
+               if (peer->nid == req->rq_bulk->bd_sender) {
+                       via = router = "";
+               } else {
+                       via = " via ";
+                       router = libcfs_nid2str(req->rq_bulk->bd_sender);
+               }
+
+               if (server_cksum == ~0 && rc > 0) {
+                       CERROR("Protocol error: server %s set the 'checksum' "
+                              "bit, but didn't send a checksum.  Not fatal, "
+                              "but please notify on http://bugs.whamcloud.com/\n",
+                              libcfs_nid2str(peer->nid));
+               } else if (server_cksum != client_cksum) {
+                       LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
+                                          "%s%s%s inode "DFID" object "DOSTID
+                                          " extent ["LPU64"-"LPU64"]\n",
+                                          req->rq_import->imp_obd->obd_name,
+                                          libcfs_nid2str(peer->nid),
+                                          via, router,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_seq : (__u64)0,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_oid : 0,
+                                          body->oa.o_valid & OBD_MD_FLFID ?
+                                               body->oa.o_parent_ver : 0,
+                                          POSTID(&body->oa.o_oi),
+                                          aa->aa_ppga[0]->off,
+                                          aa->aa_ppga[aa->aa_page_count-1]->off +
+                                          aa->aa_ppga[aa->aa_page_count-1]->count -
+                                                                       1);
+                       CERROR("client %x, server %x, cksum_type %x\n",
+                              client_cksum, server_cksum, cksum_type);
+                       cksum_counter = 0;
+                       aa->aa_oa->o_cksum = client_cksum;
+                       rc = -EAGAIN;
+               } else {
+                       cksum_counter++;
+                       CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum);
+                       rc = 0;
+               }
+       } else if (unlikely(client_cksum)) {
+               static int cksum_missed;
+
+               cksum_missed++;
+               if ((cksum_missed & (-cksum_missed)) == cksum_missed)
+                       CERROR("Checksum %u requested from %s but not sent\n",
+                              cksum_missed, libcfs_nid2str(peer->nid));
+       } else {
+               rc = 0;
+       }
+out:
+       if (rc >= 0)
+               lustre_get_wire_obdo(&req->rq_import->imp_connect_data,
+                                    aa->aa_oa, &body->oa);
+
+       RETURN(rc);
+}
+
+static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa,
+                           struct lov_stripe_md *lsm,
+                           obd_count page_count, struct brw_page **pga,
+                           struct obd_capa *ocapa)
+{
+       struct ptlrpc_request *req;
+       int                 rc;
+       wait_queue_head_t           waitq;
+       int                 generation, resends = 0;
+       struct l_wait_info     lwi;
+
+       ENTRY;
+
+       init_waitqueue_head(&waitq);
+       generation = exp->exp_obd->u.cli.cl_import->imp_generation;
+
+restart_bulk:
+       rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm,
+                                 page_count, pga, &req, ocapa, 0, resends);
+       if (rc != 0)
+               RETURN(rc);
+
+       if (resends) {
+               req->rq_generation_set = 1;
+               req->rq_import_generation = generation;
+               req->rq_sent = cfs_time_current_sec() + resends;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+
+       if (rc == -ETIMEDOUT && req->rq_resend) {
+               DEBUG_REQ(D_HA, req,  "BULK TIMEOUT");
+               ptlrpc_req_finished(req);
+               goto restart_bulk;
+       }
+
+       rc = osc_brw_fini_request(req, rc);
+
+       ptlrpc_req_finished(req);
+       /* When the server returns -EINPROGRESS, the client should always
+        * retry regardless of how many times the bulk was already resent. */
+       if (osc_recoverable_error(rc)) {
+               resends++;
+               if (rc != -EINPROGRESS &&
+                   !client_should_resend(resends, &exp->exp_obd->u.cli)) {
+                       CERROR("%s: too many resend retries for object: "
+                              ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+                              POSTID(&oa->o_oi), rc);
+                       goto out;
+               }
+               if (generation !=
+                   exp->exp_obd->u.cli.cl_import->imp_generation) {
+                       CDEBUG(D_HA, "%s: resend cross eviction for object: "
+                              ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name,
+                              POSTID(&oa->o_oi), rc);
+                       goto out;
+               }
+
+               lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL,
+                                      NULL);
+               l_wait_event(waitq, 0, &lwi);
+
+               goto restart_bulk;
+       }
+out:
+       if (rc == -EAGAIN || rc == -EINPROGRESS)
+               rc = -EIO;
+       RETURN(rc);
+}
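+
+/* Resend behaviour of osc_brw_internal(), in brief: each retry sleeps
+ * `resends` seconds via the LWI_TIMEOUT_INTR wait, so repeated failures
+ * back off linearly; -EINPROGRESS retries indefinitely, other recoverable
+ * errors are capped by client_should_resend(), and an import generation
+ * change (eviction) aborts the resend loop. */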
+
+static int osc_brw_redo_request(struct ptlrpc_request *request,
+                               struct osc_brw_async_args *aa, int rc)
+{
+       struct ptlrpc_request *new_req;
+       struct osc_brw_async_args *new_aa;
+       struct osc_async_page *oap;
+       ENTRY;
+
+       DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request,
+                 "redo for recoverable error %d", rc);
+
+       rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) ==
+                                       OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ,
+                                 aa->aa_cli, aa->aa_oa,
+                                 NULL /* lsm unused by osc currently */,
+                                 aa->aa_page_count, aa->aa_ppga,
+                                 &new_req, aa->aa_ocapa, 0, 1);
+       if (rc)
+               RETURN(rc);
+
+       list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+               if (oap->oap_request != NULL) {
+                       LASSERTF(request == oap->oap_request,
+                                "request %p != oap_request %p\n",
+                                request, oap->oap_request);
+                       if (oap->oap_interrupted) {
+                               ptlrpc_req_finished(new_req);
+                               RETURN(-EINTR);
+                       }
+               }
+       }
+       /* New request takes over pga and oaps from old request.
+        * Note that copying a list_head doesn't work, need to move it... */
+       aa->aa_resends++;
+       new_req->rq_interpret_reply = request->rq_interpret_reply;
+       new_req->rq_async_args = request->rq_async_args;
+       /* cap resend delay to the current request timeout, this is similar to
+        * what ptlrpc does (see after_reply()) */
+       if (aa->aa_resends > new_req->rq_timeout)
+               new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout;
+       else
+               new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends;
+       new_req->rq_generation_set = 1;
+       new_req->rq_import_generation = request->rq_import_generation;
+
+       new_aa = ptlrpc_req_async_args(new_req);
+
+       INIT_LIST_HEAD(&new_aa->aa_oaps);
+       list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
+       INIT_LIST_HEAD(&new_aa->aa_exts);
+       list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
+       new_aa->aa_resends = aa->aa_resends;
+
+       list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
+               if (oap->oap_request) {
+                       ptlrpc_req_finished(oap->oap_request);
+                       oap->oap_request = ptlrpc_request_addref(new_req);
+               }
+       }
+
+       new_aa->aa_ocapa = aa->aa_ocapa;
+       aa->aa_ocapa = NULL;
+
+       /* XXX: This code will run into problems if we're going to support
+        * adding a series of BRW RPCs into a self-defined ptlrpc_request_set
+        * and wait for all of them to be finished.  We should inherit the
+        * request set from the old request. */
+       ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1);
+
+       DEBUG_REQ(D_INFO, new_req, "new request");
+       RETURN(0);
+}
+
+/*
+ * Ugh, we want disk allocation on the target to happen in offset order.
+ * We'll follow Sedgewick's advice and stick to the dead-simple shellsort --
+ * it'll do fine for our small page arrays and doesn't require allocation.
+ * It's an insertion sort that swaps elements that are strides apart,
+ * shrinking the stride down until it's 1 and the array is sorted.
+ */
+static void sort_brw_pages(struct brw_page **array, int num)
+{
+       int stride, i, j;
+       struct brw_page *tmp;
+
+       if (num == 1)
+               return;
+       for (stride = 1; stride < num ; stride = (stride * 3) + 1)
+               ;
+
+       do {
+               stride /= 3;
+               for (i = stride ; i < num ; i++) {
+                       tmp = array[i];
+                       j = i;
+                       while (j >= stride && array[j - stride]->off > tmp->off) {
+                               array[j] = array[j - stride];
+                               j -= stride;
+                       }
+                       array[j] = tmp;
+               }
+       } while (stride > 1);
+}
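+
+/* The stride sequence above follows h = 3h + 1: 1, 4, 13, 40, 121, ...
+ * For example, with num = 100 the first loop stops at stride = 121 and
+ * the do/while then sorts with strides 40, 13, 4 and finally 1. */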
+
+static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages)
+{
+       int count = 1;
+       int offset;
+       int i = 0;
+
+       LASSERT(pages > 0);
+       offset = pg[i]->off & ~CFS_PAGE_MASK;
+
+       for (;;) {
+               pages--;
+               if (pages == 0)  /* that's all */
+                       return count;
+
+               if (offset + pg[i]->count < PAGE_CACHE_SIZE)
+                       return count;   /* doesn't end on page boundary */
+
+               i++;
+               offset = pg[i]->off & ~CFS_PAGE_MASK;
+               if (offset != 0)        /* doesn't start on page boundary */
+                       return count;
+
+               count++;
+       }
+}
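+
+/* Example for max_unfragmented_pages(), with an assumed layout: pga[0]
+ * covers the last 2048 bytes of its page, pga[1] a full page, pga[2] the
+ * first 2048 bytes of its page.  Fragments are allowed only at the start
+ * and end of the run, so the function returns 3; had pga[1] ended
+ * mid-page, it would have returned 2. */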
+
+static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count)
+{
+       struct brw_page **ppga;
+       int i;
+
+       OBD_ALLOC(ppga, sizeof(*ppga) * count);
+       if (ppga == NULL)
+               return NULL;
+
+       for (i = 0; i < count; i++)
+               ppga[i] = pga + i;
+       return ppga;
+}
+
+static void osc_release_ppga(struct brw_page **ppga, obd_count count)
+{
+       LASSERT(ppga != NULL);
+       OBD_FREE(ppga, sizeof(*ppga) * count);
+}
+
+static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
+                  obd_count page_count, struct brw_page *pga,
+                  struct obd_trans_info *oti)
+{
+       struct obdo *saved_oa = NULL;
+       struct brw_page **ppga, **orig;
+       struct obd_import *imp = class_exp2cliimp(exp);
+       struct client_obd *cli;
+       int rc, page_count_orig;
+       ENTRY;
+
+       LASSERT((imp != NULL) && (imp->imp_obd != NULL));
+       cli = &imp->imp_obd->u.cli;
+
+       if (cmd & OBD_BRW_CHECK) {
+               /* The caller just wants to know if there's a chance that this
+                * I/O can succeed */
+
+               if (imp->imp_invalid)
+                       RETURN(-EIO);
+               RETURN(0);
+       }
+
+       /* test_brw with a failed create can trip this, maybe others. */
+       LASSERT(cli->cl_max_pages_per_rpc);
+
+       rc = 0;
+
+       orig = ppga = osc_build_ppga(pga, page_count);
+       if (ppga == NULL)
+               RETURN(-ENOMEM);
+       page_count_orig = page_count;
+
+       sort_brw_pages(ppga, page_count);
+       while (page_count) {
+               obd_count pages_per_brw;
+
+               if (page_count > cli->cl_max_pages_per_rpc)
+                       pages_per_brw = cli->cl_max_pages_per_rpc;
+               else
+                       pages_per_brw = page_count;
+
+               pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
+
+               if (saved_oa != NULL) {
+                       /* restore previously saved oa */
+                       *oinfo->oi_oa = *saved_oa;
+               } else if (page_count > pages_per_brw) {
+                       /* save a copy of oa (brw will clobber it) */
+                       OBDO_ALLOC(saved_oa);
+                       if (saved_oa == NULL)
+                               GOTO(out, rc = -ENOMEM);
+                       *saved_oa = *oinfo->oi_oa;
+               }
+
+               rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
+                                     pages_per_brw, ppga, oinfo->oi_capa);
+
+               if (rc != 0)
+                       break;
+
+               page_count -= pages_per_brw;
+               ppga += pages_per_brw;
+       }
+
+out:
+       osc_release_ppga(orig, page_count_orig);
+
+       if (saved_oa != NULL)
+               OBDO_FREE(saved_oa);
+
+       RETURN(rc);
+}
+
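+/* Interpret callback for an async BRW RPC: resend on recoverable errors,
+ * finish the extents, propagate size/time attributes to the cl_object on
+ * success, then drop the in-flight count and re-plug the IO queue. */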
+static int brw_interpret(const struct lu_env *env,
+                        struct ptlrpc_request *req, void *data, int rc)
+{
+       struct osc_brw_async_args *aa = data;
+       struct osc_extent *ext;
+       struct osc_extent *tmp;
+       struct cl_object  *obj = NULL;
+       struct client_obd *cli = aa->aa_cli;
+       ENTRY;
+
+       rc = osc_brw_fini_request(req, rc);
+       CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
+       /* When the server returns -EINPROGRESS, the client should always
+        * retry regardless of how many times the bulk was resent already. */
+       if (osc_recoverable_error(rc)) {
+               if (req->rq_import_generation !=
+                   req->rq_import->imp_generation) {
+                       CDEBUG(D_HA, "%s: resend cross eviction for object: "
+                              ""DOSTID", rc = %d.\n",
+                              req->rq_import->imp_obd->obd_name,
+                              POSTID(&aa->aa_oa->o_oi), rc);
+               } else if (rc == -EINPROGRESS ||
+                   client_should_resend(aa->aa_resends, aa->aa_cli)) {
+                       rc = osc_brw_redo_request(req, aa, rc);
+               } else {
+                       CERROR("%s: too many resent retries for object: "
+                              ""LPU64":"LPU64", rc = %d.\n",
+                              req->rq_import->imp_obd->obd_name,
+                              POSTID(&aa->aa_oa->o_oi), rc);
+               }
+
+               if (rc == 0)
+                       RETURN(0);
+               else if (rc == -EAGAIN || rc == -EINPROGRESS)
+                       rc = -EIO;
+       }
+
+       if (aa->aa_ocapa) {
+               capa_put(aa->aa_ocapa);
+               aa->aa_ocapa = NULL;
+       }
+
+       list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
+               if (obj == NULL && rc == 0) {
+                       obj = osc2cl(ext->oe_obj);
+                       cl_object_get(obj);
+               }
+
+               list_del_init(&ext->oe_link);
+               osc_extent_finish(env, ext, 1, rc);
+       }
+       LASSERT(list_empty(&aa->aa_exts));
+       LASSERT(list_empty(&aa->aa_oaps));
+
+       if (obj != NULL) {
+               struct obdo *oa = aa->aa_oa;
+               struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+               unsigned long valid = 0;
+
+               LASSERT(rc == 0);
+               if (oa->o_valid & OBD_MD_FLBLOCKS) {
+                       attr->cat_blocks = oa->o_blocks;
+                       valid |= CAT_BLOCKS;
+               }
+               if (oa->o_valid & OBD_MD_FLMTIME) {
+                       attr->cat_mtime = oa->o_mtime;
+                       valid |= CAT_MTIME;
+               }
+               if (oa->o_valid & OBD_MD_FLATIME) {
+                       attr->cat_atime = oa->o_atime;
+                       valid |= CAT_ATIME;
+               }
+               if (oa->o_valid & OBD_MD_FLCTIME) {
+                       attr->cat_ctime = oa->o_ctime;
+                       valid |= CAT_CTIME;
+               }
+               if (valid != 0) {
+                       cl_object_attr_lock(obj);
+                       cl_object_attr_set(env, obj, attr, valid);
+                       cl_object_attr_unlock(obj);
+               }
+               cl_object_put(env, obj);
+       }
+       OBDO_FREE(aa->aa_oa);
+
+       cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc :
+                         req->rq_bulk->bd_nob_transferred);
+       osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+       ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters
+        * is called so we know whether to go to sync BRWs or wait for more
+        * RPCs to complete */
+       if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
+               cli->cl_w_in_flight--;
+       else
+               cli->cl_r_in_flight--;
+       osc_wake_cache_waiters(cli);
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+       RETURN(rc);
+}
+
+/**
+ * Build an RPC by the list of extent @ext_list. The caller must ensure
+ * that the total pages in this list are NOT over max pages per RPC.
+ * Extents in the list must be in OES_RPC state.
+ */
+int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct list_head *ext_list, int cmd, pdl_policy_t pol)
+{
+       struct ptlrpc_request           *req = NULL;
+       struct osc_extent               *ext;
+       struct brw_page                 **pga = NULL;
+       struct osc_brw_async_args       *aa = NULL;
+       struct obdo                     *oa = NULL;
+       struct osc_async_page           *oap;
+       struct osc_async_page           *tmp;
+       struct cl_req                   *clerq = NULL;
+       enum cl_req_type                crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE :
+                                                                     CRT_READ;
+       struct ldlm_lock                *lock = NULL;
+       struct cl_req_attr              *crattr = NULL;
+       obd_off                         starting_offset = OBD_OBJECT_EOF;
+       obd_off                         ending_offset = 0;
+       int                             mpflag = 0;
+       int                             mem_tight = 0;
+       int                             page_count = 0;
+       int                             i;
+       int                             rc;
+       LIST_HEAD(rpc_list);
+
+       ENTRY;
+       LASSERT(!list_empty(ext_list));
+
+       /* add pages into rpc_list to build BRW rpc */
+       list_for_each_entry(ext, ext_list, oe_link) {
+               LASSERT(ext->oe_state == OES_RPC);
+               mem_tight |= ext->oe_memalloc;
+               list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
+                       ++page_count;
+                       list_add_tail(&oap->oap_rpc_item, &rpc_list);
+                       if (starting_offset > oap->oap_obj_off)
+                               starting_offset = oap->oap_obj_off;
+                       else
+                               LASSERT(oap->oap_page_off == 0);
+                       if (ending_offset < oap->oap_obj_off + oap->oap_count)
+                               ending_offset = oap->oap_obj_off +
+                                               oap->oap_count;
+                       else
+                               LASSERT(oap->oap_page_off + oap->oap_count ==
+                                       PAGE_CACHE_SIZE);
+               }
+       }
+
+       if (mem_tight)
+               mpflag = cfs_memory_pressure_get_and_set();
+
+       OBD_ALLOC(crattr, sizeof(*crattr));
+       if (crattr == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       OBD_ALLOC(pga, sizeof(*pga) * page_count);
+       if (pga == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       OBDO_ALLOC(oa);
+       if (oa == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       i = 0;
+       list_for_each_entry(oap, &rpc_list, oap_rpc_item) {
+               struct cl_page *page = oap2cl_page(oap);
+               if (clerq == NULL) {
+                       clerq = cl_req_alloc(env, page, crt,
+                                            1 /* only 1-object rpcs for now */);
+                       if (IS_ERR(clerq))
+                               GOTO(out, rc = PTR_ERR(clerq));
+                       lock = oap->oap_ldlm_lock;
+               }
+               if (mem_tight)
+                       oap->oap_brw_flags |= OBD_BRW_MEMALLOC;
+               pga[i] = &oap->oap_brw_page;
+               pga[i]->off = oap->oap_obj_off + oap->oap_page_off;
+               CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
+                      pga[i]->pg, page_index(oap->oap_page), oap,
+                      pga[i]->flag);
+               i++;
+               cl_req_page_add(env, clerq, page);
+       }
+
+       /* always get the data for the obdo for the rpc */
+       LASSERT(clerq != NULL);
+       crattr->cra_oa = oa;
+       cl_req_attr_set(env, clerq, crattr, ~0ULL);
+       if (lock) {
+               oa->o_handle = lock->l_remote_handle;
+               oa->o_valid |= OBD_MD_FLHANDLE;
+       }
+
+       rc = cl_req_prep(env, clerq);
+       if (rc != 0) {
+               CERROR("cl_req_prep failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       sort_brw_pages(pga, page_count);
+       rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
+                       pga, &req, crattr->cra_capa, 1, 0);
+       if (rc != 0) {
+               CERROR("prep_req failed: %d\n", rc);
+               GOTO(out, rc);
+       }
+
+       req->rq_interpret_reply = brw_interpret;
+
+       if (mem_tight != 0)
+               req->rq_memalloc = 1;
+
+       /* Need to update the timestamps after the request is built in case
+        * we race with setattr (locally or in queue at OST).  If OST gets
+        * later setattr before earlier BRW (as determined by the request xid),
+        * the OST will not use BRW timestamps.  Sadly, there is no obvious
+        * way to do this in a single call.  bug 10150 */
+       cl_req_attr_set(env, clerq, crattr,
+                       OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
+
+       lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid);
+
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       INIT_LIST_HEAD(&aa->aa_oaps);
+       list_splice_init(&rpc_list, &aa->aa_oaps);
+       INIT_LIST_HEAD(&aa->aa_exts);
+       list_splice_init(ext_list, &aa->aa_exts);
+       aa->aa_clerq = clerq;
+
+       /* queued sync pages can be torn down while the pages
+        * were between the pending list and the rpc */
+       tmp = NULL;
+       list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) {
+               /* only one oap gets a request reference */
+               if (tmp == NULL)
+                       tmp = oap;
+               if (oap->oap_interrupted && !req->rq_intr) {
+                       CDEBUG(D_INODE, "oap %p in req %p interrupted\n",
+                                       oap, req);
+                       ptlrpc_mark_interrupted(req);
+               }
+       }
+       if (tmp != NULL)
+               tmp->oap_request = ptlrpc_request_addref(req);
+
+       client_obd_list_lock(&cli->cl_loi_list_lock);
+       starting_offset >>= PAGE_CACHE_SHIFT;
+       if (cmd == OBD_BRW_READ) {
+               cli->cl_r_in_flight++;
+               lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
+               lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
+               lprocfs_oh_tally_log2(&cli->cl_read_offset_hist,
+                                     starting_offset + 1);
+       } else {
+               cli->cl_w_in_flight++;
+               lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
+               lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight);
+               lprocfs_oh_tally_log2(&cli->cl_write_offset_hist,
+                                     starting_offset + 1);
+       }
+       client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+       DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
+                 page_count, aa, cli->cl_r_in_flight,
+                 cli->cl_w_in_flight);
+
+       /* XXX: Maybe the caller can check the RPC bulk descriptor to
+        * see which CPU/NUMA node the majority of pages were allocated
+        * on, and try to assign the async RPC to the CPU core
+        * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic.
+        *
+        * But on the other hand, we expect that multiple ptlrpcd
+        * threads and the initial write sponsor can run in parallel,
+        * especially when data checksum is enabled, which is CPU-bound
+        * operation and single ptlrpcd thread cannot process in time.
+        * So more ptlrpcd threads sharing BRW load
+        * (with PDL_POLICY_ROUND) seems better.
+        */
+       ptlrpcd_add_req(req, pol, -1);
+       rc = 0;
+       EXIT;
+
+out:
+       if (mem_tight != 0)
+               cfs_memory_pressure_restore(mpflag);
+
+       if (crattr != NULL) {
+               capa_put(crattr->cra_capa);
+               OBD_FREE(crattr, sizeof(*crattr));
+       }
+
+       if (rc != 0) {
+               LASSERT(req == NULL);
+
+               if (oa)
+                       OBDO_FREE(oa);
+               if (pga)
+                       OBD_FREE(pga, sizeof(*pga) * page_count);
+               /* this should happen rarely and is pretty bad, it makes the
+                * pending list not follow the dirty order */
+               while (!list_empty(ext_list)) {
+                       ext = list_entry(ext_list->next, struct osc_extent,
+                                            oe_link);
+                       list_del_init(&ext->oe_link);
+                       osc_extent_finish(env, ext, 0, rc);
+               }
+               if (clerq && !IS_ERR(clerq))
+                       cl_req_completion(env, clerq, rc);
+       }
+       RETURN(rc);
+}
+
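+/* Atomically set lock->l_ast_data to einfo->ei_cbdata if it is unset;
+ * return 1 if the lock ends up pointing at our data, 0 otherwise. */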
+static int osc_set_lock_data_with_check(struct ldlm_lock *lock,
+                                       struct ldlm_enqueue_info *einfo)
+{
+       void *data = einfo->ei_cbdata;
+       int set = 0;
+
+       LASSERT(lock != NULL);
+       LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+       LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+       LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+       LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
+
+       lock_res_and_lock(lock);
+       spin_lock(&osc_ast_guard);
+
+       if (lock->l_ast_data == NULL)
+               lock->l_ast_data = data;
+       if (lock->l_ast_data == data)
+               set = 1;
+
+       spin_unlock(&osc_ast_guard);
+       unlock_res_and_lock(lock);
+
+       return set;
+}
+
+static int osc_set_data_with_check(struct lustre_handle *lockh,
+                                  struct ldlm_enqueue_info *einfo)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+       int set = 0;
+
+       if (lock != NULL) {
+               set = osc_set_lock_data_with_check(lock, einfo);
+               LDLM_LOCK_PUT(lock);
+       } else {
+               CERROR("lockh %p, data %p - client evicted?\n",
+                      lockh, einfo->ei_cbdata);
+       }
+       return set;
+}
+
+static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+                            ldlm_iterator_t replace, void *data)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+
+       ostid_build_res_name(&lsm->lsm_oi, &res_id);
+       ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
+       return 0;
+}
+
+/* Find any ldlm lock of the inode in osc.
+ * Return: 0 if no lock was found
+ *        1 if a lock was found
+ *      < 0 on error */
+static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
+                          ldlm_iterator_t replace, void *data)
+{
+       struct ldlm_res_id res_id;
+       struct obd_device *obd = class_exp2obd(exp);
+       int rc = 0;
+
+       ostid_build_res_name(&lsm->lsm_oi, &res_id);
+       rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data);
+       if (rc == LDLM_ITER_STOP)
+               return 1;
+       if (rc == LDLM_ITER_CONTINUE)
+               return 0;
+       return rc;
+}
+
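+/* Post-process an enqueue reply: for intent enqueues pick up the result
+ * from the DLM reply, mark the LVB ready when the reply carried usable
+ * LVB data, then invoke the caller's upcall with the final status. */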
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+                           obd_enqueue_update_f upcall, void *cookie,
+                           __u64 *flags, int agl, int rc)
+{
+       int intent = *flags & LDLM_FL_HAS_INTENT;
+       ENTRY;
+
+       if (intent) {
+               /* The request was created before ldlm_cli_enqueue call. */
+               if (rc == ELDLM_LOCK_ABORTED) {
+                       struct ldlm_reply *rep;
+                       rep = req_capsule_server_get(&req->rq_pill,
+                                                    &RMF_DLM_REP);
+
+                       LASSERT(rep != NULL);
+                       if (rep->lock_policy_res1)
+                               rc = rep->lock_policy_res1;
+               }
+       }
+
+       if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) ||
+           (rc == 0)) {
+               *flags |= LDLM_FL_LVB_READY;
+               CDEBUG(D_INODE, "got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
+                      lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
+       }
+
+       /* Call the update callback. */
+       rc = (*upcall)(cookie, rc);
+       RETURN(rc);
+}
+
+static int osc_enqueue_interpret(const struct lu_env *env,
+                                struct ptlrpc_request *req,
+                                struct osc_enqueue_args *aa, int rc)
+{
+       struct ldlm_lock *lock;
+       struct lustre_handle handle;
+       __u32 mode;
+       struct ost_lvb *lvb;
+       __u32 lvb_len;
+       __u64 *flags = aa->oa_flags;
+
+       /* Make a local copy of a lock handle and a mode, because aa->oa_*
+        * might be freed anytime after lock upcall has been called. */
+       lustre_handle_copy(&handle, aa->oa_lockh);
+       mode = aa->oa_ei->ei_mode;
+
+       /* ldlm_cli_enqueue is holding a reference on the lock, so it must
+        * be valid. */
+       lock = ldlm_handle2lock(&handle);
+
+       /* Take an additional reference so that a blocking AST that
+        * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+        * to arrive after an upcall has been executed by
+        * osc_enqueue_fini(). */
+       ldlm_lock_addref(&handle, mode);
+
+       /* Let the CP AST grant the lock first. */
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
+
+       if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) {
+               lvb = NULL;
+               lvb_len = 0;
+       } else {
+               lvb = aa->oa_lvb;
+               lvb_len = sizeof(*aa->oa_lvb);
+       }
+
+       /* Finish obtaining the lock. */
+       rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
+                                  mode, flags, lvb, lvb_len, &handle, rc);
+       /* Complete osc stuff. */
+       rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie,
+                             flags, aa->oa_agl, rc);
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
+
+       /* Release the lock for async request. */
+       if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+               /*
+                * Releases a reference taken by ldlm_cli_enqueue(), if it is
+                * not already released by
+                * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+                */
+               ldlm_lock_decref(&handle, mode);
+
+       LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
+                aa->oa_lockh, req, aa);
+       ldlm_lock_decref(&handle, mode);
+       LDLM_LOCK_PUT(lock);
+       return rc;
+}
+
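+/* Record the LVB returned by an enqueue (or glimpse) in the stripe's
+ * lov_oinfo and extend the known minimum size (kms), capped at the end
+ * of the granted extent plus one byte. */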
+void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                       struct lov_oinfo *loi, int flags,
+                       struct ost_lvb *lvb, __u32 mode, int rc)
+{
+       struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
+
+       if (rc == ELDLM_OK) {
+               __u64 tmp;
+
+               LASSERT(lock != NULL);
+               loi->loi_lvb = *lvb;
+               tmp = loi->loi_lvb.lvb_size;
+               /* Extend KMS up to the end of this lock and no further.
+                * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+               if (tmp > lock->l_policy_data.l_extent.end)
+                       tmp = lock->l_policy_data.l_extent.end + 1;
+               if (tmp >= loi->loi_kms) {
+                       LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+                                  ", kms="LPU64, loi->loi_lvb.lvb_size, tmp);
+                       loi_kms_set(loi, tmp);
+               } else {
+                       LDLM_DEBUG(lock, "lock acquired, setting rss="
+                                  LPU64"; leaving kms="LPU64", end="LPU64,
+                                  loi->loi_lvb.lvb_size, loi->loi_kms,
+                                  lock->l_policy_data.l_extent.end);
+               }
+               ldlm_lock_allow_match(lock);
+       } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) {
+               LASSERT(lock != NULL);
+               loi->loi_lvb = *lvb;
+               ldlm_lock_allow_match(lock);
+               CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                      " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
+               rc = ELDLM_OK;
+       }
+
+       if (lock != NULL) {
+               if (rc != ELDLM_OK)
+                       ldlm_lock_fail_match(lock);
+
+               LDLM_LOCK_PUT(lock);
+       }
+}
+EXPORT_SYMBOL(osc_update_enqueue);
+
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
+
+/* When enqueuing asynchronously, locks are not ordered: we can obtain a lock
+ * from the 2nd OSC before a lock from the 1st one. This does not deadlock
+ * with other synchronous requests, but holding some locks while trying to
+ * obtain others may take a considerable amount of time if an OST fails; and
+ * a client that does not release a lock needed by other sync requests is
+ * excluded from the cluster -- such scenarios make life difficult, so
+ * release locks just after they are obtained. */
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                    __u64 *flags, ldlm_policy_data_t *policy,
+                    struct ost_lvb *lvb, int kms_valid,
+                    obd_enqueue_update_f upcall, void *cookie,
+                    struct ldlm_enqueue_info *einfo,
+                    struct lustre_handle *lockh,
+                    struct ptlrpc_request_set *rqset, int async, int agl)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct ptlrpc_request *req = NULL;
+       int intent = *flags & LDLM_FL_HAS_INTENT;
+       int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY);
+       ldlm_mode_t mode;
+       int rc;
+       ENTRY;
+
+       /* Filesystem lock extents are extended to page boundaries so that
+        * dealing with the page cache is a little smoother.  */
+       policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+       policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+       /*
+        * kms is not valid when either object is completely fresh (so that no
+        * locks are cached), or object was evicted. In the latter case cached
+        * lock cannot be used, because it would prime inode state with
+        * potentially stale LVB.
+        */
+       if (!kms_valid)
+               goto no_match;
+
+       /* Next, search for already existing extent locks that will cover us */
+       /* If we're trying to read, we also search for an existing PW lock.  The
+        * VFS and page cache already protect us locally, so lots of readers/
+        * writers can share a single PW lock.
+        *
+        * There are problems with conversion deadlocks, so instead of
+        * converting a read lock to a write lock, we'll just enqueue a new
+        * one.
+        *
+        * At some point we should cancel the read lock instead of making them
+        * send us a blocking callback, but there are problems with canceling
+        * locks out from other users right now, too. */
+       mode = einfo->ei_mode;
+       if (einfo->ei_mode == LCK_PR)
+               mode |= LCK_PW;
+       mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id,
+                              einfo->ei_type, policy, mode, lockh, 0);
+       if (mode) {
+               struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+               if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) {
+                       /* For AGL, if the enqueue RPC was sent but the lock
+                        * was not granted, skip processing this stripe.
+                        * Return -ECANCELED to tell the caller. */
+                       ldlm_lock_decref(lockh, mode);
+                       LDLM_LOCK_PUT(matched);
+                       RETURN(-ECANCELED);
+               } else if (osc_set_lock_data_with_check(matched, einfo)) {
+                       *flags |= LDLM_FL_LVB_READY;
+                       /* addref the lock only for sync requests, when a PW
+                        * lock was matched although we asked for PR. */
+                       if (!rqset && einfo->ei_mode != mode)
+                               ldlm_lock_addref(lockh, LCK_PR);
+                       if (intent) {
+                               /* I would like to be able to ASSERT here that
+                                * rss <= kms, but I can't, for reasons which
+                                * are explained in lov_enqueue() */
+                       }
+
+                       /* We already have a lock, and it's referenced.
+                        *
+                        * At this point, the cl_lock::cll_state is CLS_QUEUING,
+                        * AGL upcall may change it to CLS_HELD directly. */
+                       (*upcall)(cookie, ELDLM_OK);
+
+                       if (einfo->ei_mode != mode)
+                               ldlm_lock_decref(lockh, LCK_PW);
+                       else if (rqset)
+                               /* For async requests, decref the lock. */
+                               ldlm_lock_decref(lockh, einfo->ei_mode);
+                       LDLM_LOCK_PUT(matched);
+                       RETURN(ELDLM_OK);
+               } else {
+                       ldlm_lock_decref(lockh, mode);
+                       LDLM_LOCK_PUT(matched);
+               }
+       }
+
+ no_match:
+       if (intent) {
+               LIST_HEAD(cancels);
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_LDLM_ENQUEUE_LVB);
+               if (req == NULL)
+                       RETURN(-ENOMEM);
+
+               rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       RETURN(rc);
+               }
+
+               req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
+                                    sizeof(*lvb));
+               ptlrpc_request_set_replen(req);
+       }
+
+       /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+       *flags &= ~LDLM_FL_BLOCK_GRANTED;
+
+       rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+                             sizeof(*lvb), LVB_T_OST, lockh, async);
+       if (rqset) {
+               if (!rc) {
+                       struct osc_enqueue_args *aa;
+                       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+                       aa = ptlrpc_req_async_args(req);
+                       aa->oa_ei = einfo;
+                       aa->oa_exp = exp;
+                       aa->oa_flags  = flags;
+                       aa->oa_upcall = upcall;
+                       aa->oa_cookie = cookie;
+                       aa->oa_lvb    = lvb;
+                       aa->oa_lockh  = lockh;
+                       aa->oa_agl    = !!agl;
+
+                       req->rq_interpret_reply =
+                               (ptlrpc_interpterer_t)osc_enqueue_interpret;
+                       if (rqset == PTLRPCD_SET)
+                               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+                       else
+                               ptlrpc_set_add_req(rqset, req);
+               } else if (intent) {
+                       ptlrpc_req_finished(req);
+               }
+               RETURN(rc);
+       }
+
+       rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc);
+       if (intent)
+               ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+                      struct ldlm_enqueue_info *einfo,
+                      struct ptlrpc_request_set *rqset)
+{
+       struct ldlm_res_id res_id;
+       int rc;
+       ENTRY;
+
+       ostid_build_res_name(&oinfo->oi_md->lsm_oi, &res_id);
+       rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy,
+                             &oinfo->oi_md->lsm_oinfo[0]->loi_lvb,
+                             oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid,
+                             oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh,
+                             rqset, rqset != NULL, 0);
+       RETURN(rc);
+}
+
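+/* Match an already-granted extent lock covering the given range; a PR
+ * request may match a PW lock, in which case the PW reference is swapped
+ * for a PR one unless we are only testing for the lock. */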
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                  __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                  int *flags, void *data, struct lustre_handle *lockh,
+                  int unref)
+{
+       struct obd_device *obd = exp->exp_obd;
+       int lflags = *flags;
+       ldlm_mode_t rc;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
+               RETURN(-EIO);
+
+       /* Filesystem lock extents are extended to page boundaries so that
+        * dealing with the page cache is a little smoother */
+       policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+       policy->l_extent.end |= ~CFS_PAGE_MASK;
+
+       /* Next, search for already existing extent locks that will cover us */
+       /* If we're trying to read, we also search for an existing PW lock.  The
+        * VFS and page cache already protect us locally, so lots of readers/
+        * writers can share a single PW lock. */
+       rc = mode;
+       if (mode == LCK_PR)
+               rc |= LCK_PW;
+       rc = ldlm_lock_match(obd->obd_namespace, lflags,
+                            res_id, type, policy, rc, lockh, unref);
+       if (rc) {
+               if (data != NULL) {
+                       if (!osc_set_data_with_check(lockh, data)) {
+                               if (!(lflags & LDLM_FL_TEST_LOCK))
+                                       ldlm_lock_decref(lockh, rc);
+                               RETURN(0);
+                       }
+               }
+               if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) {
+                       ldlm_lock_addref(lockh, LCK_PR);
+                       ldlm_lock_decref(lockh, LCK_PW);
+               }
+               RETURN(rc);
+       }
+       RETURN(rc);
+}
+
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
+{
+       ENTRY;
+
+       if (unlikely(mode == LCK_GROUP))
+               ldlm_lock_decref_and_cancel(lockh, mode);
+       else
+               ldlm_lock_decref(lockh, mode);
+
+       RETURN(0);
+}
+
+static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                     __u32 mode, struct lustre_handle *lockh)
+{
+       ENTRY;
+       RETURN(osc_cancel_base(lockh, mode));
+}
+
+static int osc_cancel_unused(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            ldlm_cancel_flags_t flags,
+                            void *opaque)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct ldlm_res_id res_id, *resp = NULL;
+
+       if (lsm != NULL) {
+               ostid_build_res_name(&lsm->lsm_oi, &res_id);
+               resp = &res_id;
+       }
+
+       return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque);
+}
+
+static int osc_statfs_interpret(const struct lu_env *env,
+                               struct ptlrpc_request *req,
+                               struct osc_async_args *aa, int rc)
+{
+       struct obd_statfs *msfs;
+       ENTRY;
+
+       if (rc == -EBADR)
+               /* The request has in fact never been sent
+                * due to issues at a higher level (LOV).
+                * Exit immediately since the caller is
+                * aware of the problem and takes care
+                * of the clean up */
+                RETURN(rc);
+
+       if ((rc == -ENOTCONN || rc == -EAGAIN) &&
+           (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
+               GOTO(out, rc = 0);
+
+       if (rc != 0)
+               GOTO(out, rc);
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *aa->aa_oi->oi_osfs = *msfs;
+out:
+       rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+       RETURN(rc);
+}
+
+static int osc_statfs_async(struct obd_export *exp,
+                           struct obd_info *oinfo, __u64 max_age,
+                           struct ptlrpc_request_set *rqset)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct ptlrpc_request *req;
+       struct osc_async_args *aa;
+       int                 rc;
+       ENTRY;
+
+       /* We could possibly pass max_age in the request (as an absolute
+        * timestamp or a "seconds.usec ago") so the target can avoid doing
+        * extra calls into the filesystem if that isn't necessary (e.g.
+        * during mount that would help a bit).  Having relative timestamps
+        * is not so great if request processing is slow, while absolute
+        * timestamps are not ideal because they need time synchronization. */
+       req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = OST_CREATE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
+               /* procfs requests must not wait or resend, to avoid deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
+       CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       aa->aa_oi = oinfo;
+
+       ptlrpc_set_add_req(rqset, req);
+       RETURN(0);
+}
+
+static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
+                     struct obd_statfs *osfs, __u64 max_age, __u32 flags)
+{
+       struct obd_device     *obd = class_exp2obd(exp);
+       struct obd_statfs     *msfs;
+       struct ptlrpc_request *req;
+       struct obd_import     *imp = NULL;
+       int rc;
+       ENTRY;
+
+       /* Since the request might also come from lprocfs, we need to sync
+        * this with client_disconnect_export() (bug 15684). */
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import)
+               imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+       if (!imp)
+               RETURN(-ENODEV);
+
+       /* We could possibly pass max_age in the request (as an absolute
+        * timestamp or a "seconds.usec ago") so the target can avoid doing
+        * extra calls into the filesystem if that isn't necessary (e.g.
+        * during mount that would help a bit).  Having relative timestamps
+        * is not so great if request processing is slow, while absolute
+        * timestamps are not ideal because they need time synchronization. */
+       req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
+
+       class_import_put(imp);
+
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+       ptlrpc_request_set_replen(req);
+       req->rq_request_portal = OST_CREATE_PORTAL;
+       ptlrpc_at_set_req_timeout(req);
+
+       if (flags & OBD_STATFS_NODELAY) {
+               /* procfs requests must not wait or resend, to avoid deadlock */
+               req->rq_no_resend = 1;
+               req->rq_no_delay = 1;
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
+       if (msfs == NULL)
+               GOTO(out, rc = -EPROTO);
+
+       *osfs = *msfs;
+
+       EXIT;
+ out:
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+/* Retrieve object striping information.
+ *
+ * @lump is a pointer to an in-core struct with lmm_stripe_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_USER_MAGIC_V1 or LOV_USER_MAGIC_V3 (we only fill
+ * one stripe slot here).
+ */
+static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
+{
+       /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+       struct lov_user_md_v3 lum, *lumk;
+       struct lov_user_ost_data_v1 *lmm_objects;
+       int rc = 0, lum_size;
+       ENTRY;
+
+       if (!lsm)
+               RETURN(-ENODATA);
+
+       /* we only need the header part from user space to get lmm_magic and
+        * lmm_stripe_count, (the header part is common to v1 and v3) */
+       lum_size = sizeof(struct lov_user_md_v1);
+       if (copy_from_user(&lum, lump, lum_size))
+               RETURN(-EFAULT);
+
+       if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+           (lum.lmm_magic != LOV_USER_MAGIC_V3))
+               RETURN(-EINVAL);
+
+       /* lov_user_md_vX and lov_mds_md_vX must have the same size */
+       LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+       LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+       LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+       /* we can use lov_mds_md_size() to compute lum_size
+        * because lov_user_md_vX and lov_mds_md_vX have the same size */
+       if (lum.lmm_stripe_count > 0) {
+               lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
+               OBD_ALLOC(lumk, lum_size);
+               if (!lumk)
+                       RETURN(-ENOMEM);
+
+               if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+                       lmm_objects =
+                           &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+               else
+                       lmm_objects = &(lumk->lmm_objects[0]);
+               lmm_objects->l_ost_oi = lsm->lsm_oi;
+       } else {
+               lum_size = lov_mds_md_size(0, lum.lmm_magic);
+               lumk = &lum;
+       }
+
+       lumk->lmm_oi = lsm->lsm_oi;
+       lumk->lmm_stripe_count = 1;
+
+       if (copy_to_user(lump, lumk, lum_size))
+               rc = -EFAULT;
+
+       if (lumk != &lum)
+               OBD_FREE(lumk, lum_size);
+
+       RETURN(rc);
+}
+
+
+static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
+                        void *karg, void *uarg)
+{
+       struct obd_device *obd = exp->exp_obd;
+       struct obd_ioctl_data *data = karg;
+       int err = 0;
+       ENTRY;
+
+       if (!try_module_get(THIS_MODULE)) {
+               CERROR("Can't get module. Is it alive?");
+               return -EINVAL;
+       }
+       switch (cmd) {
+       case OBD_IOC_LOV_GET_CONFIG: {
+               char *buf;
+               struct lov_desc *desc;
+               struct obd_uuid uuid;
+
+               buf = NULL;
+               len = 0;
+               if (obd_ioctl_getdata(&buf, &len, (void *)uarg))
+                       GOTO(out, err = -EINVAL);
+
+               data = (struct obd_ioctl_data *)buf;
+
+               if (sizeof(*desc) > data->ioc_inllen1) {
+                       obd_ioctl_freedata(buf, len);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               if (data->ioc_inllen2 < sizeof(uuid)) {
+                       obd_ioctl_freedata(buf, len);
+                       GOTO(out, err = -EINVAL);
+               }
+
+               desc = (struct lov_desc *)data->ioc_inlbuf1;
+               desc->ld_tgt_count = 1;
+               desc->ld_active_tgt_count = 1;
+               desc->ld_default_stripe_count = 1;
+               desc->ld_default_stripe_size = 0;
+               desc->ld_default_stripe_offset = 0;
+               desc->ld_pattern = 0;
+               memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid));
+
+               memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid));
+
+               err = copy_to_user((void *)uarg, buf, len);
+               if (err)
+                       err = -EFAULT;
+               obd_ioctl_freedata(buf, len);
+               GOTO(out, err);
+       }
+       case LL_IOC_LOV_SETSTRIPE:
+               err = obd_alloc_memmd(exp, karg);
+               if (err > 0)
+                       err = 0;
+               GOTO(out, err);
+       case LL_IOC_LOV_GETSTRIPE:
+               err = osc_getstripe(karg, uarg);
+               GOTO(out, err);
+       case OBD_IOC_CLIENT_RECOVER:
+               err = ptlrpc_recover_import(obd->u.cli.cl_import,
+                                           data->ioc_inlbuf1, 0);
+               if (err > 0)
+                       err = 0;
+               GOTO(out, err);
+       case IOC_OSC_SET_ACTIVE:
+               err = ptlrpc_set_import_active(obd->u.cli.cl_import,
+                                              data->ioc_offset);
+               GOTO(out, err);
+       case OBD_IOC_POLL_QUOTACHECK:
+               err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg);
+               GOTO(out, err);
+       case OBD_IOC_PING_TARGET:
+               err = ptlrpc_obd_ping(obd);
+               GOTO(out, err);
+       default:
+               CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
+                      cmd, current_comm());
+               GOTO(out, err = -ENOTTY);
+       }
+out:
+       module_put(THIS_MODULE);
+       return err;
+}
+
+static int osc_get_info(const struct lu_env *env, struct obd_export *exp,
+                       obd_count keylen, void *key, __u32 *vallen, void *val,
+                       struct lov_stripe_md *lsm)
+{
+       ENTRY;
+       if (!vallen || !val)
+               RETURN(-EFAULT);
+
+       if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
+               __u32 *stripe = val;
+               *vallen = sizeof(*stripe);
+               *stripe = 0;
+               RETURN(0);
+       } else if (KEY_IS(KEY_LAST_ID)) {
+               struct ptlrpc_request *req;
+               obd_id          *reply;
+               char              *tmp;
+               int                 rc;
+
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_OST_GET_INFO_LAST_ID);
+               if (req == NULL)
+                       RETURN(-ENOMEM);
+
+               req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                                    RCL_CLIENT, keylen);
+               rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       RETURN(rc);
+               }
+
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+               memcpy(tmp, key, keylen);
+
+               req->rq_no_delay = req->rq_no_resend = 1;
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               if (rc)
+                       GOTO(out, rc);
+
+               reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID);
+               if (reply == NULL)
+                       GOTO(out, rc = -EPROTO);
+
+               *((obd_id *)val) = *reply;
+       out:
+               ptlrpc_req_finished(req);
+               RETURN(rc);
+       } else if (KEY_IS(KEY_FIEMAP)) {
+               struct ll_fiemap_info_key *fm_key =
+                               (struct ll_fiemap_info_key *)key;
+               struct ldlm_res_id       res_id;
+               ldlm_policy_data_t       policy;
+               struct lustre_handle     lockh;
+               ldlm_mode_t              mode = 0;
+               struct ptlrpc_request   *req;
+               struct ll_user_fiemap   *reply;
+               char                    *tmp;
+               int                      rc;
+
+               if (!(fm_key->fiemap.fm_flags & FIEMAP_FLAG_SYNC))
+                       goto skip_locking;
+
+               policy.l_extent.start = fm_key->fiemap.fm_start &
+                                               CFS_PAGE_MASK;
+
+               if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <=
+                   fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1)
+                       policy.l_extent.end = OBD_OBJECT_EOF;
+               else
+                       policy.l_extent.end = (fm_key->fiemap.fm_start +
+                               fm_key->fiemap.fm_length +
+                               PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK;
+
+               ostid_build_res_name(&fm_key->oa.o_oi, &res_id);
+               mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
+                                      LDLM_FL_BLOCK_GRANTED |
+                                      LDLM_FL_LVB_READY,
+                                      &res_id, LDLM_EXTENT, &policy,
+                                      LCK_PR | LCK_PW, &lockh, 0);
+               if (mode) { /* lock is cached on client */
+                       if (mode != LCK_PR) {
+                               ldlm_lock_addref(&lockh, LCK_PR);
+                               ldlm_lock_decref(&lockh, LCK_PW);
+                       }
+               } else { /* no cached lock, need to acquire one on the server side */
+                       fm_key->oa.o_valid |= OBD_MD_FLFLAGS;
+                       fm_key->oa.o_flags |= OBD_FL_SRVLOCK;
+               }
+
+skip_locking:
+               req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                          &RQF_OST_GET_INFO_FIEMAP);
+               if (req == NULL)
+                       GOTO(drop_lock, rc = -ENOMEM);
+
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY,
+                                    RCL_CLIENT, keylen);
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+                                    RCL_CLIENT, *vallen);
+               req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL,
+                                    RCL_SERVER, *vallen);
+
+               rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       GOTO(drop_lock, rc);
+               }
+
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY);
+               memcpy(tmp, key, keylen);
+               tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+               memcpy(tmp, val, *vallen);
+
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               if (rc)
+                       GOTO(fini_req, rc);
+
+               reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL);
+               if (reply == NULL)
+                       GOTO(fini_req, rc = -EPROTO);
+
+               memcpy(val, reply, *vallen);
+fini_req:
+               ptlrpc_req_finished(req);
+drop_lock:
+               if (mode)
+                       ldlm_lock_decref(&lockh, LCK_PR);
+               RETURN(rc);
+       }
+
+       RETURN(-EINVAL);
+}
+
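+/* Handle set_info keys: checksum, sptlrpc and cache keys are applied
+ * locally; everything else is packed into an OST_SET_INFO (or grant
+ * shrink) RPC and sent to the OST. */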
+static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
+                             obd_count keylen, void *key, obd_count vallen,
+                             void *val, struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       struct obd_device     *obd = exp->exp_obd;
+       struct obd_import     *imp = class_exp2cliimp(exp);
+       char              *tmp;
+       int                 rc;
+       ENTRY;
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
+
+       if (KEY_IS(KEY_CHECKSUM)) {
+               if (vallen != sizeof(int))
+                       RETURN(-EINVAL);
+               exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_SPTLRPC_CONF)) {
+               sptlrpc_conf_client_adapt(obd);
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_FLUSH_CTX)) {
+               sptlrpc_import_flush_my_ctx(imp);
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_CACHE_SET)) {
+               struct client_obd *cli = &obd->u.cli;
+
+               LASSERT(cli->cl_cache == NULL); /* only once */
+               cli->cl_cache = (struct cl_client_cache *)val;
+               atomic_inc(&cli->cl_cache->ccc_users);
+               cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
+
+               /* add this osc into entity list */
+               LASSERT(list_empty(&cli->cl_lru_osc));
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+
+               RETURN(0);
+       }
+
+       if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
+               struct client_obd *cli = &obd->u.cli;
+               int nr = atomic_read(&cli->cl_lru_in_list) >> 1;
+               int target = *(int *)val;
+
+               nr = osc_lru_shrink(cli, min(nr, target));
+               *(int *)val -= nr;
+               RETURN(0);
+       }
+
+       if (!set && !KEY_IS(KEY_GRANT_SHRINK))
+               RETURN(-EINVAL);
+
+       /* We pass all other commands directly to OST. Since nobody calls
+        * osc methods directly and everybody is supposed to go through LOV,
+        * we assume lov checked invalid values for us.
+        * The only recognised values so far are evict_by_nid and mds_conn.
+        * Even if something bad goes through, we'd get a -EINVAL from OST
+        * anyway. */
+
+       req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
+                                               &RQF_OST_SET_GRANT_INFO :
+                                               &RQF_OBD_SET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       if (!KEY_IS(KEY_GRANT_SHRINK))
+               req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                                    RCL_CLIENT, vallen);
+       rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
+                                                       &RMF_OST_BODY :
+                                                       &RMF_SETINFO_VAL);
+       memcpy(tmp, val, vallen);
+
+       if (KEY_IS(KEY_GRANT_SHRINK)) {
+               struct osc_grant_args *aa;
+               struct obdo *oa;
+
+               CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+               aa = ptlrpc_req_async_args(req);
+               OBDO_ALLOC(oa);
+               if (!oa) {
+                       ptlrpc_req_finished(req);
+                       RETURN(-ENOMEM);
+               }
+               *oa = ((struct ost_body *)val)->oa;
+               aa->aa_oa = oa;
+               req->rq_interpret_reply = osc_shrink_grant_interpret;
+       }
+
+       ptlrpc_request_set_replen(req);
+       if (!KEY_IS(KEY_GRANT_SHRINK)) {
+               LASSERT(set != NULL);
+               ptlrpc_set_add_req(set, req);
+               ptlrpc_check_set(NULL, set);
+       } else {
+               ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       }
+
+       RETURN(0);
+}
+
+
+static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+                        struct obd_device *disk_obd, int *index)
+{
+       /* This code is not supposed to be used with LOD/OSP;
+        * to be removed soon. */
+       LBUG();
+       return 0;
+}
+
+static int osc_llog_finish(struct obd_device *obd, int count)
+{
+       struct llog_ctxt *ctxt;
+
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
+       if (ctxt) {
+               llog_cat_close(NULL, ctxt->loc_handle);
+               llog_cleanup(NULL, ctxt);
+       }
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt)
+               llog_cleanup(NULL, ctxt);
+       RETURN(0);
+}
+
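+/* On reconnect, request enough grant to cover what the client already
+ * holds (or two full BRW RPCs worth) and reset lost-grant accounting. */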
+static int osc_reconnect(const struct lu_env *env,
+                        struct obd_export *exp, struct obd_device *obd,
+                        struct obd_uuid *cluuid,
+                        struct obd_connect_data *data,
+                        void *localdata)
+{
+       struct client_obd *cli = &obd->u.cli;
+
+       if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+               long lost_grant;
+
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?:
+                               2 * cli_brw_size(obd);
+               lost_grant = cli->cl_lost_grant;
+               cli->cl_lost_grant = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+               CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d"
+                      " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags,
+                      data->ocd_version, data->ocd_grant, lost_grant);
+       }
+
+       RETURN(0);
+}
+
+static int osc_disconnect(struct obd_export *exp)
+{
+       struct obd_device *obd = class_exp2obd(exp);
+       struct llog_ctxt  *ctxt;
+       int rc;
+
+       ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT);
+       if (ctxt) {
+               if (obd->u.cli.cl_conn_count == 1) {
+                       /* Flush any remaining cancel messages out to the
+                        * target */
+                       llog_sync(ctxt, exp, 0);
+               }
+               llog_ctxt_put(ctxt);
+       } else {
+               CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n",
+                      obd);
+       }
+
+       rc = client_disconnect_export(exp);
+       /**
+        * Initially we put del_shrink_grant before disconnect_export, but it
+        * causes the following problem if setup (connect) and cleanup
+        * (disconnect) are tangled together.
+        *      connect p1                   disconnect p2
+        *   ptlrpc_connect_import
+        *     ...............         class_manual_cleanup
+        *                                   osc_disconnect
+        *                                   del_shrink_grant
+        *   ptlrpc_connect_interrupt
+        *     init_grant_shrink
+        *   add this client to shrink list
+        *                                    cleanup_osc
+        * Bang! The pinger triggers the shrink.
+        * So the osc should be disconnected from the shrink list, after we
+        * are sure the import has been destroyed. BUG18662
+        */
+       if (obd->u.cli.cl_import == NULL)
+               osc_del_shrink_grant(&obd->u.cli);
+       return rc;
+}
+
+static int osc_import_event(struct obd_device *obd,
+                           struct obd_import *imp,
+                           enum obd_import_event event)
+{
+       struct client_obd *cli;
+       int rc = 0;
+
+       ENTRY;
+       LASSERT(imp->imp_obd == obd);
+
+       switch (event) {
+       case IMP_EVENT_DISCON: {
+               cli = &obd->u.cli;
+               client_obd_list_lock(&cli->cl_loi_list_lock);
+               cli->cl_avail_grant = 0;
+               cli->cl_lost_grant = 0;
+               client_obd_list_unlock(&cli->cl_loi_list_lock);
+               break;
+       }
+       case IMP_EVENT_INACTIVE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_INVALIDATE: {
+               struct ldlm_namespace *ns = obd->obd_namespace;
+               struct lu_env    *env;
+               int                 refcheck;
+
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       /* Reset grants */
+                       cli = &obd->u.cli;
+                       /* all pages go to failing rpcs due to the invalid
+                        * import */
+                       osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND);
+
+                       ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+                       cl_env_put(env, &refcheck);
+               } else
+                       rc = PTR_ERR(env);
+               break;
+       }
+       case IMP_EVENT_ACTIVE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
+               break;
+       }
+       case IMP_EVENT_OCD: {
+               struct obd_connect_data *ocd = &imp->imp_connect_data;
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT)
+                       osc_init_grant(&obd->u.cli, ocd);
+
+               /* See bug 7198 */
+               if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL)
+                       imp->imp_client->cli_request_portal =
+                               OST_REQUEST_PORTAL;
+
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
+               break;
+       }
+       case IMP_EVENT_DEACTIVATE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL);
+               break;
+       }
+       case IMP_EVENT_ACTIVATE: {
+               rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL);
+               break;
+       }
+       default:
+               CERROR("Unknown import event %d\n", event);
+               LBUG();
+       }
+       RETURN(rc);
+}
+
+/**
+ * Determine whether the lock can be canceled before replaying the lock
+ * during recovery, see bug16774 for detailed information.
+ *
+ * \retval zero the lock can't be canceled
+ * \retval other ok to cancel
+ */
+static int osc_cancel_for_recovery(struct ldlm_lock *lock)
+{
+       check_res_locked(lock->l_resource);
+
+       /*
+        * Cancel all unused extent locks in granted mode LCK_PR or LCK_CR.
+        *
+        * XXX as a future improvement, we can also cancel unused write locks
+        * if they have no dirty data and no active mmaps.
+        */
+       if (lock->l_resource->lr_type == LDLM_EXTENT &&
+           (lock->l_granted_mode == LCK_PR ||
+            lock->l_granted_mode == LCK_CR) &&
+           (osc_dlm_lock_pageref(lock) == 0))
+               RETURN(1);
+
+       RETURN(0);
+}
+
+static int brw_queue_work(const struct lu_env *env, void *data)
+{
+       struct client_obd *cli = data;
+
+       CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli);
+
+       osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME);
+       RETURN(0);
+}
+
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       struct client_obd         *cli = &obd->u.cli;
+       void                   *handler;
+       int                     rc;
+       ENTRY;
+
+       rc = ptlrpcd_addref();
+       if (rc)
+               RETURN(rc);
+
+       rc = client_obd_setup(obd, lcfg);
+       if (rc)
+               GOTO(out_ptlrpcd, rc);
+
+       handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli);
+       if (IS_ERR(handler))
+               GOTO(out_client_setup, rc = PTR_ERR(handler));
+       cli->cl_writeback_work = handler;
+
+       rc = osc_quota_setup(obd);
+       if (rc)
+               GOTO(out_ptlrpcd_work, rc);
+
+       cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+       lprocfs_osc_init_vars(&lvars);
+       if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+               lproc_osc_attach_seqstat(obd);
+               sptlrpc_lprocfs_cliobd_attach(obd);
+               ptlrpc_lprocfs_register_obd(obd);
+       }
+
+       /* We need to allocate a few extra requests, because
+        * brw_interpret tries to create new requests before freeing
+        * previous ones. Ideally we want to have 2x max_rpcs_in_flight
+        * reserved, but that might be too much wasted RAM in practice,
+        * so 2 is just a guess and should still work. */
+       cli->cl_import->imp_rq_pool =
+               ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+                                   OST_MAXREQSIZE,
+                                   ptlrpc_add_rqs_to_pool);
+
+       INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
+       ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery);
+       RETURN(rc);
+
+out_ptlrpcd_work:
+       ptlrpcd_destroy_work(handler);
+out_client_setup:
+       client_obd_cleanup(obd);
+out_ptlrpcd:
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
+{
+       int rc = 0;
+       ENTRY;
+
+       switch (stage) {
+       case OBD_CLEANUP_EARLY: {
+               struct obd_import *imp;
+               imp = obd->u.cli.cl_import;
+               CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name);
+               /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */
+               ptlrpc_deactivate_import(imp);
+               spin_lock(&imp->imp_lock);
+               imp->imp_pingable = 0;
+               spin_unlock(&imp->imp_lock);
+               break;
+       }
+       case OBD_CLEANUP_EXPORTS: {
+               struct client_obd *cli = &obd->u.cli;
+               /* LU-464
+                * for echo client, export may be on zombie list, wait for
+                * zombie thread to cull it, because cli.cl_import will be
+                * cleared in client_disconnect_export():
+                *   class_export_destroy() -> obd_cleanup() ->
+                *   echo_device_free() -> echo_client_cleanup() ->
+                *   obd_disconnect() -> osc_disconnect() ->
+                *   client_disconnect_export()
+                */
+               obd_zombie_barrier();
+               if (cli->cl_writeback_work) {
+                       ptlrpcd_destroy_work(cli->cl_writeback_work);
+                       cli->cl_writeback_work = NULL;
+               }
+               obd_cleanup_client_import(obd);
+               ptlrpc_lprocfs_unregister_obd(obd);
+               lprocfs_obd_cleanup(obd);
+               rc = obd_llog_finish(obd, 0);
+               if (rc != 0)
+                       CERROR("failed to cleanup llogging subsystems\n");
+               break;
+               }
+       }
+       RETURN(rc);
+}
+
+int osc_cleanup(struct obd_device *obd)
+{
+       struct client_obd *cli = &obd->u.cli;
+       int rc;
+
+       ENTRY;
+
+       /* lru cleanup */
+       if (cli->cl_cache != NULL) {
+               LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0);
+               spin_lock(&cli->cl_cache->ccc_lru_lock);
+               list_del_init(&cli->cl_lru_osc);
+               spin_unlock(&cli->cl_cache->ccc_lru_lock);
+               cli->cl_lru_left = NULL;
+               atomic_dec(&cli->cl_cache->ccc_users);
+               cli->cl_cache = NULL;
+       }
+
+       /* free memory of osc quota cache */
+       osc_quota_cleanup(obd);
+
+       rc = client_obd_cleanup(obd);
+
+       ptlrpcd_decref();
+       RETURN(rc);
+}
+
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc = 0;
+
+       lprocfs_osc_init_vars(&lvars);
+
+       switch (lcfg->lcfg_command) {
+       default:
+               rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
+                                             lcfg, obd);
+               if (rc > 0)
+                       rc = 0;
+               break;
+       }
+
+       return(rc);
+}
+
+static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+       return osc_process_config_base(obd, buf);
+}
+
+struct obd_ops osc_obd_ops = {
+       .o_owner                = THIS_MODULE,
+       .o_setup                = osc_setup,
+       .o_precleanup           = osc_precleanup,
+       .o_cleanup              = osc_cleanup,
+       .o_add_conn             = client_import_add_conn,
+       .o_del_conn             = client_import_del_conn,
+       .o_connect              = client_connect_import,
+       .o_reconnect            = osc_reconnect,
+       .o_disconnect           = osc_disconnect,
+       .o_statfs               = osc_statfs,
+       .o_statfs_async         = osc_statfs_async,
+       .o_packmd               = osc_packmd,
+       .o_unpackmd             = osc_unpackmd,
+       .o_create               = osc_create,
+       .o_destroy              = osc_destroy,
+       .o_getattr              = osc_getattr,
+       .o_getattr_async        = osc_getattr_async,
+       .o_setattr              = osc_setattr,
+       .o_setattr_async        = osc_setattr_async,
+       .o_brw                  = osc_brw,
+       .o_punch                = osc_punch,
+       .o_sync                 = osc_sync,
+       .o_enqueue              = osc_enqueue,
+       .o_change_cbdata        = osc_change_cbdata,
+       .o_find_cbdata          = osc_find_cbdata,
+       .o_cancel               = osc_cancel,
+       .o_cancel_unused        = osc_cancel_unused,
+       .o_iocontrol            = osc_iocontrol,
+       .o_get_info             = osc_get_info,
+       .o_set_info_async       = osc_set_info_async,
+       .o_import_event         = osc_import_event,
+       .o_llog_init            = osc_llog_init,
+       .o_llog_finish          = osc_llog_finish,
+       .o_process_config       = osc_process_config,
+       .o_quotactl             = osc_quotactl,
+       .o_quotacheck           = osc_quotacheck,
+};
+
+extern struct lu_kmem_descr osc_caches[];
+extern spinlock_t osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
+int __init osc_init(void)
+{
+       struct lprocfs_static_vars lvars = { 0 };
+       int rc;
+       ENTRY;
+
+       /* print an address of _any_ initialized kernel symbol from this
+        * module, to allow debugging with a gdb that doesn't support data
+        * symbols from modules. */
+       CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches);
+
+       rc = lu_kmem_init(osc_caches);
+       if (rc)
+               RETURN(rc);
+
+       lprocfs_osc_init_vars(&lvars);
+
+       rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
+                                LUSTRE_OSC_NAME, &osc_device_type);
+       if (rc) {
+               lu_kmem_fini(osc_caches);
+               RETURN(rc);
+       }
+
+       spin_lock_init(&osc_ast_guard);
+       lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+
+       RETURN(rc);
+}
+
+static void /*__exit*/ osc_exit(void)
+{
+       class_unregister_type(LUSTRE_OSC_NAME);
+       lu_kmem_fini(osc_caches);
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)");
+MODULE_LICENSE("GPL");
+
+cfs_module(osc, LUSTRE_VERSION_STRING, osc_init, osc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile
new file mode 100644 (file)
index 0000000..983eb66
--- /dev/null
@@ -0,0 +1,23 @@
+obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o
+LDLM := ../../lustre/ldlm/
+
+ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o
+ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
+ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
+ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
+ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
+ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
+ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o
+ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
+ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o
+ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o
+ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o
+
+ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs)
+
+obj-$(CONFIG_PTLRPC_GSS) += gss/
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
new file mode 100644 (file)
index 0000000..22f7e65
--- /dev/null
@@ -0,0 +1,3059 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/** Implementation of client-side PortalRPC interfaces */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+static int ptlrpc_send_new_req(struct ptlrpc_request *req);
+
+/**
+ * Initialize passed in client structure \a cl.
+ */
+void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
+                       struct ptlrpc_client *cl)
+{
+       cl->cli_request_portal = req_portal;
+       cl->cli_reply_portal   = rep_portal;
+       cl->cli_name           = name;
+}
+EXPORT_SYMBOL(ptlrpc_init_client);
+
+/**
+ * Return PortalRPC connection for remote uuid \a uuid
+ */
+struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
+{
+       struct ptlrpc_connection *c;
+       lnet_nid_t              self;
+       lnet_process_id_t        peer;
+       int                    err;
+
+       /* ptlrpc_uuid_to_peer() initializes its 2nd parameter
+        * before accessing its values. */
+       /* coverity[uninit_use_in_call] */
+       err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
+       if (err != 0) {
+               CNETERR("cannot find peer %s!\n", uuid->uuid);
+               return NULL;
+       }
+
+       c = ptlrpc_connection_get(peer, self, uuid);
+       if (c) {
+               memcpy(c->c_remote_uuid.uuid,
+                      uuid->uuid, sizeof(c->c_remote_uuid.uuid));
+       }
+
+       CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);
+
+       return c;
+}
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+
+/**
+ * Allocate and initialize new bulk descriptor on the sender.
+ * Returns pointer to the descriptor or NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+                                        unsigned type, unsigned portal)
+{
+       struct ptlrpc_bulk_desc *desc;
+       int i;
+
+       OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]));
+       if (!desc)
+               return NULL;
+
+       spin_lock_init(&desc->bd_lock);
+       init_waitqueue_head(&desc->bd_waitq);
+       desc->bd_max_iov = npages;
+       desc->bd_iov_count = 0;
+       desc->bd_portal = portal;
+       desc->bd_type = type;
+       desc->bd_md_count = 0;
+       LASSERT(max_brw > 0);
+       desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
+       /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
+        * node. Negotiated ocd_brw_size will always be <= this number. */
+       for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
+               LNetInvalidateHandle(&desc->bd_mds[i]);
+
+       return desc;
+}
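+
+/* Allocation note (a worked example with an illustrative page count):
+ * the descriptor is sized with offsetof(struct ptlrpc_bulk_desc,
+ * bd_iov[npages]), so for npages == 256 one allocation covers the
+ * header plus all 256 bd_iov slots; __ptlrpc_free_bulk() later frees it
+ * with the matching bd_iov[bd_max_iov] size.
+ */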
+
+/**
+ * Prepare bulk descriptor for specified outgoing request \a req that
+ * can fit \a npages pages. \a type is the bulk type. \a portal is where
+ * the bulk is to be sent. Used on the client side.
+ * Returns pointer to newly allocated and initialized bulk descriptor or
+ * NULL on error.
+ */
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
+                                             unsigned npages, unsigned max_brw,
+                                             unsigned type, unsigned portal)
+{
+       struct obd_import *imp = req->rq_import;
+       struct ptlrpc_bulk_desc *desc;
+
+       ENTRY;
+       LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+       desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
+       if (desc == NULL)
+               RETURN(NULL);
+
+       desc->bd_import_generation = req->rq_import_generation;
+       desc->bd_import = class_import_get(imp);
+       desc->bd_req = req;
+
+       desc->bd_cbid.cbid_fn  = client_bulk_callback;
+       desc->bd_cbid.cbid_arg = desc;
+
+       /* This makes req own desc; desc is freed when req itself is freed */
+       req->rq_bulk = desc;
+
+       return desc;
+}
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+
+/**
+ * Add a page \a page to the bulk descriptor \a desc.
+ * Data to transfer in the page starts at offset \a pageoffset and
+ * amount of data to transfer from the page is \a len
+ */
+void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                            struct page *page, int pageoffset, int len, int pin)
+{
+       LASSERT(desc->bd_iov_count < desc->bd_max_iov);
+       LASSERT(page != NULL);
+       LASSERT(pageoffset >= 0);
+       LASSERT(len > 0);
+       LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+
+       desc->bd_nob += len;
+
+       if (pin)
+               page_cache_get(page);
+
+       ptlrpc_add_bulk_page(desc, page, pageoffset, len);
+}
+EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
+
+/**
+ * Uninitialize and free bulk descriptor \a desc.
+ * Works on bulk descriptors both from server and client side.
+ */
+void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
+{
+       int i;
+       ENTRY;
+
+       LASSERT(desc != NULL);
+       LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
+       LASSERT(desc->bd_md_count == 0);         /* network hands off */
+       LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+       sptlrpc_enc_pool_put_pages(desc);
+
+       if (desc->bd_export)
+               class_export_put(desc->bd_export);
+       else
+               class_import_put(desc->bd_import);
+
+       if (unpin) {
+               for (i = 0; i < desc->bd_iov_count ; i++)
+                       page_cache_release(desc->bd_iov[i].kiov_page);
+       }
+
+       OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
+                               bd_iov[desc->bd_max_iov]));
+       EXIT;
+}
+EXPORT_SYMBOL(__ptlrpc_free_bulk);
+
+/**
+ * Set server timelimit for this req, i.e. how long we are willing to wait
+ * for the reply before timing out this request.
+ */
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
+{
+       __u32 serv_est;
+       int idx;
+       struct imp_at *at;
+
+       LASSERT(req->rq_import);
+
+       if (AT_OFF) {
+               /* non-AT settings */
+               /**
+                * \a imp_server_timeout means this is reverse import and
+                * we send (currently only) ASTs to the client and cannot afford
+                * to wait too long for the reply, otherwise the other client
+                * (because of which we are sending this request) would
+                * time out waiting for us
+                */
+               req->rq_timeout = req->rq_import->imp_server_timeout ?
+                                 obd_timeout / 2 : obd_timeout;
+       } else {
+               at = &req->rq_import->imp_at;
+               idx = import_at_get_index(req->rq_import,
+                                         req->rq_request_portal);
+               serv_est = at_get(&at->iat_service_estimate[idx]);
+               req->rq_timeout = at_est2timeout(serv_est);
+       }
+       /* We could get even fancier here, using history to predict increased
+          loading... */
+
+       /* Let the server know what this RPC timeout is by putting it in the
+          reqmsg */
+       lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
+}
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
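+
+/* A worked example of the adaptive branch above (numbers illustrative):
+ * if the tracked service estimate for this portal is 10s,
+ * at_est2timeout() pads it to roughly est * 1.25 + 5s, as noted later
+ * in this file, so:
+ *
+ *     serv_est = 10;
+ *     req->rq_timeout = at_est2timeout(serv_est);  // 10 + 10/4 + 5 = 17s
+ *
+ * and that padded value is what the server learns via
+ * lustre_msg_set_timeout().
+ */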
+
+/* Adjust max service estimate based on server value */
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+                                 unsigned int serv_est)
+{
+       int idx;
+       unsigned int oldse;
+       struct imp_at *at;
+
+       LASSERT(req->rq_import);
+       at = &req->rq_import->imp_at;
+
+       idx = import_at_get_index(req->rq_import, req->rq_request_portal);
+       /* max service estimates are tracked on the server side,
+          so just keep minimal history here */
+       oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
+       if (oldse != 0)
+               CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d "
+                      "has changed from %d to %d\n",
+                      req->rq_import->imp_obd->obd_name, req->rq_request_portal,
+                      oldse, at_get(&at->iat_service_estimate[idx]));
+}
+
+/* Expected network latency per remote node (secs) */
+int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
+{
+       return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
+}
+
+/* Adjust expected network latency */
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+                                     unsigned int service_time)
+{
+       unsigned int nl, oldnl;
+       struct imp_at *at;
+       time_t now = cfs_time_current_sec();
+
+       LASSERT(req->rq_import);
+       at = &req->rq_import->imp_at;
+
+       /* Network latency is total time less server processing time */
+       nl = max_t(int, now - req->rq_sent - service_time, 0) + 1 /* st rounding */;
+       if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+               CWARN("Reported service time %u > total measured time "
+                     CFS_DURATION_T"\n", service_time,
+                     cfs_time_sub(now, req->rq_sent));
+
+       oldnl = at_measured(&at->iat_net_latency, nl);
+       if (oldnl != 0)
+               CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) "
+                      "has changed from %d to %d\n",
+                      req->rq_import->imp_obd->obd_name,
+                      obd_uuid2str(
+                              &req->rq_import->imp_connection->c_remote_uuid),
+                      oldnl, at_get(&at->iat_net_latency));
+}
+
+static int unpack_reply(struct ptlrpc_request *req)
+{
+       int rc;
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+               rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
+               if (rc) {
+                       DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
+                       return(-EPROTO);
+               }
+       }
+
+       rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+       if (rc) {
+               DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
+               return(-EPROTO);
+       }
+       return 0;
+}
+
+/**
+ * Handle an early reply message, called with the rq_lock held.
+ * If anything goes wrong just ignore it - same as if it never happened
+ */
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request *early_req;
+       time_t           olddl;
+       int                 rc;
+       ENTRY;
+
+       req->rq_early = 0;
+       spin_unlock(&req->rq_lock);
+
+       rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+       if (rc) {
+               spin_lock(&req->rq_lock);
+               RETURN(rc);
+       }
+
+       rc = unpack_reply(early_req);
+       if (rc == 0) {
+               /* Expecting to increase the service time estimate here */
+               ptlrpc_at_adj_service(req,
+                       lustre_msg_get_timeout(early_req->rq_repmsg));
+               ptlrpc_at_adj_net_latency(req,
+                       lustre_msg_get_service_time(early_req->rq_repmsg));
+       }
+
+       sptlrpc_cli_finish_early_reply(early_req);
+
+       if (rc != 0) {
+               spin_lock(&req->rq_lock);
+               RETURN(rc);
+       }
+
+       /* Adjust the local timeout for this req */
+       ptlrpc_at_set_req_timeout(req);
+
+       spin_lock(&req->rq_lock);
+       olddl = req->rq_deadline;
+       /* server assumes it now has rq_timeout from when it sent the
+        * early reply, so the client should give it at least that long. */
+       req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+                          ptlrpc_at_get_net_latency(req);
+
+       DEBUG_REQ(D_ADAPTTO, req,
+                 "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+                 "("CFS_DURATION_T"s)", req->rq_early_count,
+                 cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
+                 cfs_time_sub(req->rq_deadline, olddl));
+
+       RETURN(rc);
+}
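+
+/* Deadline arithmetic for the early-reply path above (numbers
+ * illustrative): with rq_timeout == 17s and a measured net latency of
+ * 1s, an early reply processed at time T moves the deadline to
+ *
+ *     req->rq_deadline = T + 17 + 1;
+ *
+ * and the DEBUG_REQ line reports both the new time remaining and the
+ * shift from the old deadline (olddl).
+ */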
+
+/**
+ * Wind down request pool \a pool.
+ * Frees all requests from the pool too
+ */
+void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
+{
+       struct list_head *l, *tmp;
+       struct ptlrpc_request *req;
+
+       LASSERT(pool != NULL);
+
+       spin_lock(&pool->prp_lock);
+       list_for_each_safe(l, tmp, &pool->prp_req_list) {
+               req = list_entry(l, struct ptlrpc_request, rq_list);
+               list_del(&req->rq_list);
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
+               OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size);
+               OBD_FREE(req, sizeof(*req));
+       }
+       spin_unlock(&pool->prp_lock);
+       OBD_FREE(pool, sizeof(*pool));
+}
+EXPORT_SYMBOL(ptlrpc_free_rq_pool);
+
+/**
+ * Allocates, initializes and adds \a num_rq requests to the pool \a pool
+ */
+void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
+{
+       int i;
+       int size = 1;
+
+       while (size < pool->prp_rq_size)
+               size <<= 1;
+
+       LASSERTF(list_empty(&pool->prp_req_list) ||
+                size == pool->prp_rq_size,
+                "Trying to change pool size with nonempty pool "
+                "from %d to %d bytes\n", pool->prp_rq_size, size);
+
+       spin_lock(&pool->prp_lock);
+       pool->prp_rq_size = size;
+       for (i = 0; i < num_rq; i++) {
+               struct ptlrpc_request *req;
+               struct lustre_msg *msg;
+
+               spin_unlock(&pool->prp_lock);
+               OBD_ALLOC(req, sizeof(struct ptlrpc_request));
+               if (!req)
+                       return;
+               OBD_ALLOC_LARGE(msg, size);
+               if (!msg) {
+                       OBD_FREE(req, sizeof(struct ptlrpc_request));
+                       return;
+               }
+               req->rq_reqbuf = msg;
+               req->rq_reqbuf_len = size;
+               req->rq_pool = pool;
+               spin_lock(&pool->prp_lock);
+               list_add_tail(&req->rq_list, &pool->prp_req_list);
+       }
+       spin_unlock(&pool->prp_lock);
+       return;
+}
+EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
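+
+/* Rounding example for the size loop above (number illustrative): with
+ * pool->prp_rq_size == 5000, "size" doubles 1 -> 2 -> 4 -> ... -> 8192,
+ * so every request buffer in the pool is allocated at the next power of
+ * two, matching what the kernel allocator would round the allocation up
+ * to internally anyway.
+ */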
+
+/**
+ * Create and initialize new request pool with given attributes:
+ * \a num_rq - initial number of requests to create for the pool
+ * \a msgsize - maximum message size possible for requests in this pool
+ * \a populate_pool - function to be called when more requests need to be added
+ *                 to the pool
+ * Returns pointer to newly created pool or NULL on error.
+ */
+struct ptlrpc_request_pool *
+ptlrpc_init_rq_pool(int num_rq, int msgsize,
+                   void (*populate_pool)(struct ptlrpc_request_pool *, int))
+{
+       struct ptlrpc_request_pool *pool;
+
+       OBD_ALLOC(pool, sizeof (struct ptlrpc_request_pool));
+       if (!pool)
+               return NULL;
+
+       /* Request the next power of two for the allocation, because the
+          kernel would internally do exactly this anyway */
+
+       spin_lock_init(&pool->prp_lock);
+       INIT_LIST_HEAD(&pool->prp_req_list);
+       pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
+       pool->prp_populate = populate_pool;
+
+       populate_pool(pool, num_rq);
+
+       if (list_empty(&pool->prp_req_list)) {
+               /* have not allocated a single request for the pool */
+               OBD_FREE(pool, sizeof (struct ptlrpc_request_pool));
+               pool = NULL;
+       }
+       return pool;
+}
+EXPORT_SYMBOL(ptlrpc_init_rq_pool);
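+
+/* Usage sketch: this mirrors how osc_setup() earlier in this patch
+ * wires up the OST request pool (error handling elided; consumers
+ * tolerate a NULL pool):
+ *
+ *     imp->imp_rq_pool = ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+ *                                            OST_MAXREQSIZE,
+ *                                            ptlrpc_add_rqs_to_pool);
+ */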
+
+/**
+ * Fetches one request from pool \a pool
+ */
+static struct ptlrpc_request *
+ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request;
+       struct lustre_msg *reqbuf;
+
+       if (!pool)
+               return NULL;
+
+       spin_lock(&pool->prp_lock);
+
+       /* See if we have anything in the pool, and bail out if there is
+        * nothing. In the writeout path, where this matters, this is safe
+        * because nothing is lost in that case: when some in-flight requests
+        * complete, this code will be called again. */
+       if (unlikely(list_empty(&pool->prp_req_list))) {
+               spin_unlock(&pool->prp_lock);
+               return NULL;
+       }
+
+       request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
+                                rq_list);
+       list_del_init(&request->rq_list);
+       spin_unlock(&pool->prp_lock);
+
+       LASSERT(request->rq_reqbuf);
+       LASSERT(request->rq_pool);
+
+       reqbuf = request->rq_reqbuf;
+       memset(request, 0, sizeof(*request));
+       request->rq_reqbuf = reqbuf;
+       request->rq_reqbuf_len = pool->prp_rq_size;
+       request->rq_pool = pool;
+
+       return request;
+}
+
+/**
+ * Returns freed \a request to pool.
+ */
+static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
+{
+       struct ptlrpc_request_pool *pool = request->rq_pool;
+
+       spin_lock(&pool->prp_lock);
+       LASSERT(list_empty(&request->rq_list));
+       LASSERT(!request->rq_receiving_reply);
+       list_add_tail(&request->rq_list, &pool->prp_req_list);
+       spin_unlock(&pool->prp_lock);
+}
+
+static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                                     __u32 version, int opcode,
+                                     int count, __u32 *lengths, char **bufs,
+                                     struct ptlrpc_cli_ctx *ctx)
+{
+       struct obd_import  *imp = request->rq_import;
+       int              rc;
+       ENTRY;
+
+       if (unlikely(ctx))
+               request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
+       else {
+               rc = sptlrpc_req_get_ctx(request);
+               if (rc)
+                       GOTO(out_free, rc);
+       }
+
+       sptlrpc_req_set_flavor(request, opcode);
+
+       rc = lustre_pack_request(request, imp->imp_msg_magic, count,
+                                lengths, bufs);
+       if (rc) {
+               LASSERT(!request->rq_pool);
+               GOTO(out_ctx, rc);
+       }
+
+       lustre_msg_add_version(request->rq_reqmsg, version);
+       request->rq_send_state = LUSTRE_IMP_FULL;
+       request->rq_type = PTL_RPC_MSG_REQUEST;
+       request->rq_export = NULL;
+
+       request->rq_req_cbid.cbid_fn  = request_out_callback;
+       request->rq_req_cbid.cbid_arg = request;
+
+       request->rq_reply_cbid.cbid_fn  = reply_in_callback;
+       request->rq_reply_cbid.cbid_arg = request;
+
+       request->rq_reply_deadline = 0;
+       request->rq_phase = RQ_PHASE_NEW;
+       request->rq_next_phase = RQ_PHASE_UNDEFINED;
+
+       request->rq_request_portal = imp->imp_client->cli_request_portal;
+       request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+
+       ptlrpc_at_set_req_timeout(request);
+
+       spin_lock_init(&request->rq_lock);
+       INIT_LIST_HEAD(&request->rq_list);
+       INIT_LIST_HEAD(&request->rq_timed_list);
+       INIT_LIST_HEAD(&request->rq_replay_list);
+       INIT_LIST_HEAD(&request->rq_ctx_chain);
+       INIT_LIST_HEAD(&request->rq_set_chain);
+       INIT_LIST_HEAD(&request->rq_history_list);
+       INIT_LIST_HEAD(&request->rq_exp_list);
+       init_waitqueue_head(&request->rq_reply_waitq);
+       init_waitqueue_head(&request->rq_set_waitq);
+       request->rq_xid = ptlrpc_next_xid();
+       atomic_set(&request->rq_refcount, 1);
+
+       lustre_msg_set_opc(request->rq_reqmsg, opcode);
+
+       RETURN(0);
+out_ctx:
+       sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
+out_free:
+       class_import_put(imp);
+       return rc;
+}
+
+int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
+                            __u32 version, int opcode, char **bufs,
+                            struct ptlrpc_cli_ctx *ctx)
+{
+       int count;
+
+       count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
+       return __ptlrpc_request_bufs_pack(request, version, opcode, count,
+                                         request->rq_pill.rc_area[RCL_CLIENT],
+                                         bufs, ctx);
+}
+EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
+
+/**
+ * Pack request buffers for network transfer, performing necessary encryption
+ * steps if necessary.
+ */
+int ptlrpc_request_pack(struct ptlrpc_request *request,
+                       __u32 version, int opcode)
+{
+       int rc;
+       rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
+       if (rc)
+               return rc;
+
+       /* Some old 1.8 clients (< 1.8.7) LASSERT that the size of the
+        * ptlrpc_body sent from the server equals the local ptlrpc_body
+        * size, so we have to send the old ptlrpc_body to keep
+        * interoperability with these clients.
+        *
+        * Only three kinds of server->client RPCs so far:
+        *  - LDLM_BL_CALLBACK
+        *  - LDLM_CP_CALLBACK
+        *  - LDLM_GL_CALLBACK
+        *
+        * XXX This should be removed whenever we drop interoperability with
+        *     these old clients.
+        */
+       if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
+           opcode == LDLM_GL_CALLBACK)
+               req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
+                                  sizeof(struct ptlrpc_body_v2), RCL_CLIENT);
+
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_request_pack);
+
+/**
+ * Helper function to allocate new request on import \a imp
+ * and possibly using existing request from pool \a pool if provided.
+ * Returns allocated request structure with import field filled or
+ * NULL on error.
+ */
+static inline
+struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
+                                             struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request = NULL;
+
+       if (pool)
+               request = ptlrpc_prep_req_from_pool(pool);
+
+       if (!request)
+               OBD_ALLOC_PTR(request);
+
+       if (request) {
+               LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
+               LASSERT(imp != LP_POISON);
+               LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+                       imp->imp_client);
+               LASSERT(imp->imp_client != LP_POISON);
+
+               request->rq_import = class_import_get(imp);
+       } else {
+               CERROR("request allocation out of memory\n");
+       }
+
+       return request;
+}
+
+/**
+ * Helper function for creating a request.
+ * Calls __ptlrpc_request_alloc to allocate a new request structure and inits
+ * buffer structures according to capsule template \a format.
+ * Returns allocated request structure pointer or NULL on error.
+ */
+static struct ptlrpc_request *
+ptlrpc_request_alloc_internal(struct obd_import *imp,
+                             struct ptlrpc_request_pool *pool,
+                             const struct req_format *format)
+{
+       struct ptlrpc_request *request;
+
+       request = __ptlrpc_request_alloc(imp, pool);
+       if (request == NULL)
+               return NULL;
+
+       req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
+       req_capsule_set(&request->rq_pill, format);
+       return request;
+}
+
+/**
+ * Allocate new request structure for import \a imp and initialize its
+ * buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
+                                           const struct req_format *format)
+{
+       return ptlrpc_request_alloc_internal(imp, NULL, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc);
+
+/**
+ * Allocate new request structure for import \a imp from pool \a pool and
+ * initialize its buffer structure according to capsule template \a format.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
+                                           struct ptlrpc_request_pool *pool,
+                                           const struct req_format *format)
+{
+       return ptlrpc_request_alloc_internal(imp, pool, format);
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
+
+/**
+ * For requests not from pool, free memory of the request structure.
+ * For requests obtained from a pool earlier, return request back to pool.
+ */
+void ptlrpc_request_free(struct ptlrpc_request *request)
+{
+       if (request->rq_pool)
+               __ptlrpc_free_req_to_pool(request);
+       else
+               OBD_FREE_PTR(request);
+}
+EXPORT_SYMBOL(ptlrpc_request_free);
+
+/**
+ * Allocate new request for operation \a opcode and immediately pack it for
+ * network transfer.
+ * Only used for simple requests like OBD_PING where the only important
+ * part of the request is the operation itself.
+ * Returns allocated request or NULL on error.
+ */
+struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
+                                               const struct req_format *format,
+                                               __u32 version, int opcode)
+{
+       struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
+       int                 rc;
+
+       if (req) {
+               rc = ptlrpc_request_pack(req, version, opcode);
+               if (rc) {
+                       ptlrpc_request_free(req);
+                       req = NULL;
+               }
+       }
+       return req;
+}
+EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
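+
+/* Usage sketch (assuming the RQF_OBD_PING capsule format and
+ * LUSTRE_OBD_VERSION, which is how a ping is typically built):
+ *
+ *     req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+ *                                     LUSTRE_OBD_VERSION, OBD_PING);
+ *     if (req == NULL)
+ *             return -ENOMEM;
+ */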
+
+/**
+ * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
+ * for operation \a opcode. Request would contain \a count buffers.
+ * Sizes of buffers are described in array \a lengths and buffers themselves
+ * are provided by a pointer \a bufs.
+ * Returns prepared request structure pointer or NULL on error.
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp,
+                    __u32 version, int opcode,
+                    int count, __u32 *lengths, char **bufs,
+                    struct ptlrpc_request_pool *pool)
+{
+       struct ptlrpc_request *request;
+       int                 rc;
+
+       request = __ptlrpc_request_alloc(imp, pool);
+       if (!request)
+               return NULL;
+
+       rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
+                                       lengths, bufs, NULL);
+       if (rc) {
+               ptlrpc_request_free(request);
+               request = NULL;
+       }
+       return request;
+}
+EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+
+/**
+ * Same as ptlrpc_prep_req_pool, but without pool
+ */
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
+               __u32 *lengths, char **bufs)
+{
+       return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
+                                   NULL);
+}
+EXPORT_SYMBOL(ptlrpc_prep_req);
+
+/**
+ * Allocate and initialize new request set structure.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+       struct ptlrpc_request_set *set;
+
+       ENTRY;
+       OBD_ALLOC(set, sizeof *set);
+       if (!set)
+               RETURN(NULL);
+       atomic_set(&set->set_refcount, 1);
+       INIT_LIST_HEAD(&set->set_requests);
+       init_waitqueue_head(&set->set_waitq);
+       atomic_set(&set->set_new_count, 0);
+       atomic_set(&set->set_remaining, 0);
+       spin_lock_init(&set->set_new_req_lock);
+       INIT_LIST_HEAD(&set->set_new_requests);
+       INIT_LIST_HEAD(&set->set_cblist);
+       set->set_max_inflight = UINT_MAX;
+       set->set_producer     = NULL;
+       set->set_producer_arg = NULL;
+       set->set_rc           = 0;
+
+       RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_set);
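+
+/* Usage sketch for a synchronous caller (assuming ptlrpc_set_wait() as
+ * the completion call and a request "req" prepared elsewhere; error
+ * handling elided):
+ *
+ *     struct ptlrpc_request_set *set = ptlrpc_prep_set();
+ *
+ *     if (set != NULL) {
+ *             ptlrpc_set_add_req(set, req);
+ *             rc = ptlrpc_set_wait(set);
+ *             ptlrpc_set_destroy(set);
+ *     }
+ */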
+
+/**
+ * Allocate and initialize new request set structure with flow control
+ * extension. This extension makes it possible to control the number of
+ * requests in flight for the whole set. A callback function to generate
+ * requests must be provided and the request set will cap the number of
+ * requests sent over the wire at \a max.
+ * Returns a pointer to the newly allocated set structure or NULL on error.
+ */
+struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
+                                            void *arg)
+
+{
+       struct ptlrpc_request_set *set;
+
+       set = ptlrpc_prep_set();
+       if (!set)
+               RETURN(NULL);
+
+       set->set_max_inflight  = max;
+       set->set_producer      = func;
+       set->set_producer_arg  = arg;
+
+       RETURN(set);
+}
+EXPORT_SYMBOL(ptlrpc_prep_fcset);
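+
+/* Producer sketch (a hypothetical callback, for illustration only): the
+ * set machinery invokes it while fewer than \a max requests are in
+ * flight; the callback queues one more request, or returns -ENOENT once
+ * it has nothing left to produce. build_next_req() is assumed here:
+ *
+ *     static int my_producer(struct ptlrpc_request_set *set, void *arg)
+ *     {
+ *             struct ptlrpc_request *req = build_next_req(arg);
+ *
+ *             if (req == NULL)
+ *                     return -ENOENT;
+ *             ptlrpc_set_add_req(set, req);
+ *             return 0;
+ *     }
+ */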
+
+/**
+ * Wind down and free request set structure previously allocated with
+ * ptlrpc_prep_set.
+ * Ensures that all requests on the set have completed and removes
+ * all requests from the request list in a set.
+ * If any unsent requests happen to be on the list, pretends that they got
+ * an error in flight and calls their completion handler.
+ */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp;
+       struct list_head *next;
+       int expected_phase;
+       int n = 0;
+       ENTRY;
+
+       /* Requests on the set should either all be completed, or all be new */
+       expected_phase = (atomic_read(&set->set_remaining) == 0) ?
+                        RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
+       list_for_each(tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               LASSERT(req->rq_phase == expected_phase);
+               n++;
+       }
+
+       LASSERTF(atomic_read(&set->set_remaining) == 0 ||
+                atomic_read(&set->set_remaining) == n, "%d / %d\n",
+                atomic_read(&set->set_remaining), n);
+
+       list_for_each_safe(tmp, next, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+               list_del_init(&req->rq_set_chain);
+
+               LASSERT(req->rq_phase == expected_phase);
+
+               if (req->rq_phase == RQ_PHASE_NEW) {
+                       ptlrpc_req_interpret(NULL, req, -EBADR);
+                       atomic_dec(&set->set_remaining);
+               }
+
+               spin_lock(&req->rq_lock);
+               req->rq_set = NULL;
+               req->rq_invalid_rqset = 0;
+               spin_unlock(&req->rq_lock);
+
+               ptlrpc_req_finished(req);
+       }
+
+       LASSERT(atomic_read(&set->set_remaining) == 0);
+
+       ptlrpc_reqset_put(set);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+
+/**
+ * Add a callback function \a fn to the set.
+ * This function will be called when all requests on this set are completed.
+ * The function will be passed \a data argument.
+ */
+int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
+                     set_interpreter_func fn, void *data)
+{
+       struct ptlrpc_set_cbdata *cbdata;
+
+       OBD_ALLOC_PTR(cbdata);
+       if (cbdata == NULL)
+               RETURN(-ENOMEM);
+
+       cbdata->psc_interpret = fn;
+       cbdata->psc_data = data;
+       list_add_tail(&cbdata->psc_item, &set->set_cblist);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_cb);
+
+/**
+ * Add a new request to the general purpose request set.
+ * Assumes request reference from the caller.
+ */
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+                       struct ptlrpc_request *req)
+{
+       LASSERT(list_empty(&req->rq_set_chain));
+
+       /* The set takes over the caller's request reference */
+       list_add_tail(&req->rq_set_chain, &set->set_requests);
+       req->rq_set = set;
+       atomic_inc(&set->set_remaining);
+       req->rq_queued_time = cfs_time_current();
+
+       if (req->rq_reqmsg != NULL)
+               lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+       if (set->set_producer != NULL)
+               /* If the request set has a producer callback, the RPC must be
+                * sent straight away */
+               ptlrpc_send_new_req(req);
+}
+EXPORT_SYMBOL(ptlrpc_set_add_req);
+
+/**
+ * Add a request to a request set with a dedicated server thread
+ * and wake the thread for any necessary processing.
+ * Currently only used for ptlrpcd.
+ */
+void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+                          struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *set = pc->pc_set;
+       int count, i;
+
+       LASSERT(req->rq_set == NULL);
+       LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
+
+       spin_lock(&set->set_new_req_lock);
+       /*
+        * The set takes over the caller's request reference.
+        */
+       req->rq_set = set;
+       req->rq_queued_time = cfs_time_current();
+       list_add_tail(&req->rq_set_chain, &set->set_new_requests);
+       count = atomic_inc_return(&set->set_new_count);
+       spin_unlock(&set->set_new_req_lock);
+
+       /* Only need to call wakeup once for the first entry. */
+       if (count == 1) {
+               wake_up(&set->set_waitq);
+
+               /* XXX: It may be unnecessary to wake up all the partners. But
+                *      to guarantee the async RPC can be processed ASAP, we
+                *      have no better choice. It may be fixed in the future. */
+               for (i = 0; i < pc->pc_npartners; i++)
+                       wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+       }
+}
+EXPORT_SYMBOL(ptlrpc_set_add_new_req);
+
+/**
+ * Based on the current state of the import, determine if the request
+ * can be sent, is an error, or should be delayed.
+ *
+ * Returns true if this request should be delayed. If false and
+ * *status is set, then the request cannot be sent and *status is the
+ * error code.  If false and *status is 0, then the request can be sent.
+ *
+ * The imp->imp_lock must be held.
+ */
+static int ptlrpc_import_delay_req(struct obd_import *imp,
+                                  struct ptlrpc_request *req, int *status)
+{
+       int delay = 0;
+       ENTRY;
+
+       LASSERT(status != NULL);
+       *status = 0;
+
+       if (req->rq_ctx_init || req->rq_ctx_fini) {
+               /* always allow ctx init/fini rpc go through */
+       } else if (imp->imp_state == LUSTRE_IMP_NEW) {
+               DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
+               *status = -EIO;
+       } else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               /* pings may safely race with umount */
+               DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
+                         D_HA : D_ERROR, req, "IMP_CLOSED ");
+               *status = -EIO;
+       } else if (ptlrpc_send_limit_expired(req)) {
+               /* probably doesn't need to be a D_ERROR after initial testing */
+               DEBUG_REQ(D_ERROR, req, "send limit expired ");
+               *status = -EIO;
+       } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
+                  imp->imp_state == LUSTRE_IMP_CONNECTING) {
+               /* allow CONNECT even if import is invalid */
+               if (atomic_read(&imp->imp_inval_count) != 0) {
+                       DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+                       *status = -EIO;
+               }
+       } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
+               if (!imp->imp_deactive)
+                       DEBUG_REQ(D_NET, req, "IMP_INVALID");
+               *status = -ESHUTDOWN; /* bz 12940 */
+       } else if (req->rq_import_generation != imp->imp_generation) {
+               DEBUG_REQ(D_ERROR, req, "req wrong generation:");
+               *status = -EIO;
+       } else if (req->rq_send_state != imp->imp_state) {
+               /* invalidate in progress - any requests should be dropped */
+               if (atomic_read(&imp->imp_inval_count) != 0) {
+                       DEBUG_REQ(D_ERROR, req, "invalidate in flight");
+                       *status = -EIO;
+               } else if (imp->imp_dlm_fake || req->rq_no_delay) {
+                       *status = -EWOULDBLOCK;
+               } else if (req->rq_allow_replay &&
+                         (imp->imp_state == LUSTRE_IMP_REPLAY ||
+                          imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
+                          imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
+                          imp->imp_state == LUSTRE_IMP_RECOVER)) {
+                       DEBUG_REQ(D_HA, req, "allow during recovery.\n");
+               } else {
+                       delay = 1;
+               }
+       }
+
+       RETURN(delay);
+}
+
+/**
+ * Decide if the error message regarding provided request \a req
+ * should be printed to the console or not.
+ * Makes its decision based on the request status and other properties.
+ * Returns 1 to print error on the system console or 0 if not.
+ */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+       __u32 opc;
+       int err;
+
+       LASSERT(req->rq_reqmsg != NULL);
+       opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+       /* Suppress particular reconnect errors which are to be expected.  No
+        * errors are suppressed for the initial connection on an import */
+       if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+           (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+               /* Suppress timed out reconnect requests */
+               if (req->rq_timedout)
+                       return 0;
+
+               /* Suppress unavailable/again reconnect requests */
+               err = lustre_msg_get_status(req->rq_repmsg);
+               if (err == -ENODEV || err == -EAGAIN)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/**
+ * Check request processing status.
+ * Returns the status.
+ */
+static int ptlrpc_check_status(struct ptlrpc_request *req)
+{
+       int err;
+       ENTRY;
+
+       err = lustre_msg_get_status(req->rq_repmsg);
+       if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+               struct obd_import *imp = req->rq_import;
+               __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+               if (ptlrpc_console_allow(req))
+                       LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s,"
+                                          " operation %s failed with %d.\n",
+                                          imp->imp_obd->obd_name,
+                                          libcfs_nid2str(
+                                          imp->imp_connection->c_peer.nid),
+                                          ll_opcode2str(opc), err);
+               RETURN(err < 0 ? err : -EINVAL);
+       }
+
+       if (err < 0) {
+               DEBUG_REQ(D_INFO, req, "status is %d", err);
+       } else if (err > 0) {
+               /* XXX: translate this error from net to host */
+               DEBUG_REQ(D_INFO, req, "status is %d", err);
+       }
+
+       RETURN(err);
+}
+
+/**
+ * Save pre-versions of objects into request for replay.
+ * Versions are obtained from the server reply.
+ * Used for VBR (version-based recovery).
+ */
+static void ptlrpc_save_versions(struct ptlrpc_request *req)
+{
+       struct lustre_msg *repmsg = req->rq_repmsg;
+       struct lustre_msg *reqmsg = req->rq_reqmsg;
+       __u64 *versions = lustre_msg_get_versions(repmsg);
+       ENTRY;
+
+       if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
+               return;
+
+       LASSERT(versions);
+       lustre_msg_set_versions(reqmsg, versions);
+       CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n",
+              versions[0], versions[1]);
+
+       EXIT;
+}
+
+/**
+ * Callback function called when client receives RPC reply for \a req.
+ * Returns 0 on success or error code.
+ * The return value will be assigned to req->rq_status by the caller
+ * as request processing status.
+ * This function also decides if the request needs to be saved for later replay.
+ */
+static int after_reply(struct ptlrpc_request *req)
+{
+       struct obd_import *imp = req->rq_import;
+       struct obd_device *obd = req->rq_import->imp_obd;
+       int rc;
+       struct timeval work_start;
+       long timediff;
+       ENTRY;
+
+       LASSERT(obd != NULL);
+       /* repbuf must be unlinked */
+       LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink);
+
+       if (req->rq_reply_truncate) {
+               if (ptlrpc_no_resend(req)) {
+                       DEBUG_REQ(D_ERROR, req, "reply buffer overflow,"
+                                 " expected: %d, actual size: %d",
+                                 req->rq_nob_received, req->rq_repbuf_len);
+                       RETURN(-EOVERFLOW);
+               }
+
+               sptlrpc_cli_free_repbuf(req);
+               /* Pass the required reply buffer size (include
+                * space for early reply).
+                * NB: no need to roundup because alloc_repbuf
+                * will roundup it */
+               req->rq_replen       = req->rq_nob_received;
+               req->rq_nob_received = 0;
+               req->rq_resend       = 1;
+               RETURN(0);
+       }
+
+       /*
+        * NB Until this point, the whole of the incoming message,
+        * including buflens, status etc is in the sender's byte order.
+        */
+       rc = sptlrpc_cli_unwrap_reply(req);
+       if (rc) {
+               DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
+               RETURN(rc);
+       }
+
+       /*
+        * Security layer unwrap might ask to resend this request.
+        */
+       if (req->rq_resend)
+               RETURN(0);
+
+       rc = unpack_reply(req);
+       if (rc)
+               RETURN(rc);
+
+       /* retry indefinitely on EINPROGRESS */
+       if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
+           ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
+               time_t  now = cfs_time_current_sec();
+
+               DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
+               req->rq_resend = 1;
+               req->rq_nr_resend++;
+
+               /* allocate new xid to avoid reply reconstruction */
+               if (!req->rq_bulk) {
+                       /* new xid is already allocated for bulk in
+                        * ptlrpc_check_set() */
+                       req->rq_xid = ptlrpc_next_xid();
+                       DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for "
+                                 "resend on EINPROGRESS");
+               }
+
+               /* Readjust the timeout for current conditions */
+               ptlrpc_at_set_req_timeout(req);
+               /* delay resend to give a chance to the server to get ready.
+                * The delay is increased by 1s on every resend and is capped to
+                * the current request timeout (i.e. obd_timeout if AT is off,
+                * or AT service time x 125% + 5s, see at_est2timeout) */
+               if (req->rq_nr_resend > req->rq_timeout)
+                       req->rq_sent = now + req->rq_timeout;
+               else
+                       req->rq_sent = now + req->rq_nr_resend;
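+               /* e.g. with rq_timeout == 10 the delay above grows 1s,
+                * 2s, ... up to 10s and then stays capped at 10s. */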
+
+               RETURN(0);
+       }
+
+       do_gettimeofday(&work_start);
+       timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
+       if (obd->obd_svc_stats != NULL) {
+               lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
+                                   timediff);
+               ptlrpc_lprocfs_rpc_sent(req, timediff);
+       }
+
+       if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
+           lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
+               DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
+                         lustre_msg_get_type(req->rq_repmsg));
+               RETURN(-EPROTO);
+       }
+
+       if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
+               CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
+       ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+       ptlrpc_at_adj_net_latency(req,
+                                 lustre_msg_get_service_time(req->rq_repmsg));
+
+       rc = ptlrpc_check_status(req);
+       imp->imp_connect_error = rc;
+
+       if (rc) {
+               /*
+                * Either we've been evicted, or the server has failed for
+                * some reason. Try to reconnect, and if that fails, punt to
+                * the upcall.
+                */
+               if (ll_rpc_recoverable_error(rc)) {
+                       if (req->rq_send_state != LUSTRE_IMP_FULL ||
+                           imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+                               RETURN(rc);
+                       }
+                       ptlrpc_request_handle_notconn(req);
+                       RETURN(rc);
+               }
+       } else {
+               /*
+                * Let's look if server sent slv. Do it only for RPC with
+                * rc == 0.
+                */
+               ldlm_cli_update_pool(req);
+       }
+
+       /*
+        * Store transno in reqmsg for replay.
+        */
+       if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
+               req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
+               lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
+       }
+
+       if (imp->imp_replayable) {
+               spin_lock(&imp->imp_lock);
+               /*
+                * No point in adding already-committed requests to the replay
+                * list, we will just remove them immediately. b=9829
+                */
+               if (req->rq_transno != 0 &&
+                   (req->rq_transno >
+                    lustre_msg_get_last_committed(req->rq_repmsg) ||
+                    req->rq_replay)) {
+                       /** version recovery */
+                       ptlrpc_save_versions(req);
+                       ptlrpc_retain_replayable_request(req, imp);
+               } else if (req->rq_commit_cb != NULL) {
+                       spin_unlock(&imp->imp_lock);
+                       req->rq_commit_cb(req);
+                       spin_lock(&imp->imp_lock);
+               }
+
+               /*
+                * Replay-enabled imports return commit-status information.
+                */
+               if (lustre_msg_get_last_committed(req->rq_repmsg)) {
+                       imp->imp_peer_committed_transno =
+                               lustre_msg_get_last_committed(req->rq_repmsg);
+               }
+
+               ptlrpc_free_committed(imp);
+
+               if (!list_empty(&imp->imp_replay_list)) {
+                       struct ptlrpc_request *last;
+
+                       last = list_entry(imp->imp_replay_list.prev,
+                                             struct ptlrpc_request,
+                                             rq_replay_list);
+                       /*
+                        * Requests with rq_replay stay on the list even if no
+                        * commit is expected.
+                        */
+                       if (last->rq_transno > imp->imp_peer_committed_transno)
+                               ptlrpc_pinger_commit_expected(imp);
+               }
+
+               spin_unlock(&imp->imp_lock);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Helper function to send request \a req over the network for the first time.
+ * Also adjusts the request phase.
+ * Returns 0 on success; non-zero on failure (which the caller uses as a cue
+ * to recalculate timeouts).
+ */
+static int ptlrpc_send_new_req(struct ptlrpc_request *req)
+{
+       struct obd_import *imp = req->rq_import;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_phase == RQ_PHASE_NEW);
+       if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) &&
+           (!req->rq_generation_set ||
+            req->rq_import_generation == imp->imp_generation))
+               RETURN(0);
+
+       ptlrpc_rqphase_move(req, RQ_PHASE_RPC);
+
+       spin_lock(&imp->imp_lock);
+
+       if (!req->rq_generation_set)
+               req->rq_import_generation = imp->imp_generation;
+
+       if (ptlrpc_import_delay_req(imp, req, &rc)) {
+               spin_lock(&req->rq_lock);
+               req->rq_waiting = 1;
+               spin_unlock(&req->rq_lock);
+
+               DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: "
+                         "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg),
+                         ptlrpc_import_state_name(req->rq_send_state),
+                         ptlrpc_import_state_name(imp->imp_state));
+               LASSERT(list_empty(&req->rq_list));
+               list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+               atomic_inc(&req->rq_import->imp_inflight);
+               spin_unlock(&imp->imp_lock);
+               RETURN(0);
+       }
+
+       if (rc != 0) {
+               spin_unlock(&imp->imp_lock);
+               req->rq_status = rc;
+               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+               RETURN(rc);
+       }
+
+       LASSERT(list_empty(&req->rq_list));
+       list_add_tail(&req->rq_list, &imp->imp_sending_list);
+       atomic_inc(&req->rq_import->imp_inflight);
+       spin_unlock(&imp->imp_lock);
+
+       lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+       rc = sptlrpc_req_refresh_ctx(req, -1);
+       if (rc) {
+               if (req->rq_err) {
+                       req->rq_status = rc;
+                       RETURN(1);
+               } else {
+                       req->rq_wait_ctx = 1;
+                       RETURN(0);
+               }
+       }
+
+       CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc"
+              " %s:%s:%d:"LPU64":%s:%d\n", current_comm(),
+              imp->imp_obd->obd_uuid.uuid,
+              lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+              libcfs_nid2str(imp->imp_connection->c_peer.nid),
+              lustre_msg_get_opc(req->rq_reqmsg));
+
+       rc = ptl_send_rpc(req, 0);
+       if (rc) {
+               DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
+               req->rq_net_err = 1;
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
+{
+       int remaining, rc;
+       ENTRY;
+
+       LASSERT(set->set_producer != NULL);
+
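+       /* The producer callback queues new requests on the set (which
+        * raises set_remaining) and returns -ENOENT once it has nothing
+        * more to produce; the value returned below is the number of
+        * requests added by this call. */
+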
+       remaining = atomic_read(&set->set_remaining);
+
+       /* populate the ->set_requests list with requests until we
+        * reach the maximum number of RPCs in flight for this set */
+       while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
+               rc = set->set_producer(set, set->set_producer_arg);
+               if (rc == -ENOENT) {
+                       /* no more RPC to produce */
+                       set->set_producer     = NULL;
+                       set->set_producer_arg = NULL;
+                       RETURN(0);
+               }
+       }
+
+       RETURN((atomic_read(&set->set_remaining) - remaining));
+}
+
+/**
+ * This sends any unsent RPCs in \a set and returns 1 if all are sent
+ * and no more replies are expected.
+ * (it is possible to get fewer replies than requests sent, e.g. due to
+ * timed-out requests or requests that we had trouble sending out)
+ */
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *next;
+       int force_timer_recalc = 0;
+       ENTRY;
+
+       if (atomic_read(&set->set_remaining) == 0)
+               RETURN(1);
+
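+       /* A request normally progresses NEW -> RPC -> BULK -> INTERPRET
+        * -> COMPLETE, with a detour through UNREGISTERING whenever reply
+        * or bulk buffers are still linked into LNet and must be unlinked
+        * before interpretation. */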
+       list_for_each_safe(tmp, next, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+               struct obd_import *imp = req->rq_import;
+               int unregistered = 0;
+               int rc = 0;
+
+               if (req->rq_phase == RQ_PHASE_NEW &&
+                   ptlrpc_send_new_req(req)) {
+                       force_timer_recalc = 1;
+               }
+
+               /* delayed send - skip */
+               if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
+                       continue;
+
+               /* delayed resend - skip */
+               if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
+                   req->rq_sent > cfs_time_current_sec())
+                       continue;
+
+               if (!(req->rq_phase == RQ_PHASE_RPC ||
+                     req->rq_phase == RQ_PHASE_BULK ||
+                     req->rq_phase == RQ_PHASE_INTERPRET ||
+                     req->rq_phase == RQ_PHASE_UNREGISTERING ||
+                     req->rq_phase == RQ_PHASE_COMPLETE)) {
+                       DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
+                       LBUG();
+               }
+
+               if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
+                       LASSERT(req->rq_next_phase != req->rq_phase);
+                       LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);
+
+                       /*
+                        * Skip processing until reply is unlinked. We
+                        * can't return to pool before that and we can't
+                        * call interpret before that. We need to make
+                        * sure that all rdma transfers finished and will
+                        * not corrupt any data.
+                        */
+                       if (ptlrpc_client_recv_or_unlink(req) ||
+                           ptlrpc_client_bulk_active(req))
+                               continue;
+
+                       /*
+                        * Turn fail_loc off to prevent it from looping
+                        * forever.
+                        */
+                       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+                               OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
+                                                    OBD_FAIL_ONCE);
+                       }
+                       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
+                               OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
+                                                    OBD_FAIL_ONCE);
+                       }
+
+                       /*
+                        * Move to next phase if reply was successfully
+                        * unlinked.
+                        */
+                       ptlrpc_rqphase_move(req, req->rq_next_phase);
+               }
+
+               if (req->rq_phase == RQ_PHASE_COMPLETE)
+                       continue;
+
+               if (req->rq_phase == RQ_PHASE_INTERPRET)
+                       GOTO(interpret, req->rq_status);
+
+               /*
+                * Note that this also will start async reply unlink.
+                */
+               if (req->rq_net_err && !req->rq_timedout) {
+                       ptlrpc_expire_one_request(req, 1);
+
+                       /*
+                        * Check if we still need to wait for unlink.
+                        */
+                       if (ptlrpc_client_recv_or_unlink(req) ||
+                           ptlrpc_client_bulk_active(req))
+                               continue;
+                       /* If there is no need to resend, fail it now. */
+                       if (req->rq_no_resend) {
+                               if (req->rq_status == 0)
+                                       req->rq_status = -EIO;
+                               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                               GOTO(interpret, req->rq_status);
+                       } else {
+                               continue;
+                       }
+               }
+
+               if (req->rq_err) {
+                       spin_lock(&req->rq_lock);
+                       req->rq_replied = 0;
+                       spin_unlock(&req->rq_lock);
+                       if (req->rq_status == 0)
+                               req->rq_status = -EIO;
+                       ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                       GOTO(interpret, req->rq_status);
+               }
+
+               /* ptlrpc_set_wait()'s l_wait_event() sets lwi_allow_intr,
+                * so it sets rq_intr regardless of individual RPC
+                * timeouts. The synchronous I/O waiting path likewise
+                * sets rq_intr irrespective of whether ptlrpcd has seen
+                * a timeout. Our policy is to interpret interrupted RPCs
+                * only after they have timed out, so we enforce that
+                * here.
+                */
+
+               if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
+                                    req->rq_wait_ctx)) {
+                       req->rq_status = -EINTR;
+                       ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                       GOTO(interpret, req->rq_status);
+               }
+
+               if (req->rq_phase == RQ_PHASE_RPC) {
+                       if (req->rq_timedout || req->rq_resend ||
+                           req->rq_waiting || req->rq_wait_ctx) {
+                               int status;
+
+                               if (!ptlrpc_unregister_reply(req, 1))
+                                       continue;
+
+                               spin_lock(&imp->imp_lock);
+                               if (ptlrpc_import_delay_req(imp, req, &status)) {
+                                       /* put it on the delayed list: we
+                                        * must wait for recovery to finish
+                                        * before sending */
+                                       list_del_init(&req->rq_list);
+                                       list_add_tail(&req->rq_list,
+                                                     &imp->imp_delayed_list);
+                                       spin_unlock(&imp->imp_lock);
+                                       continue;
+                               }
+
+                               if (status != 0)  {
+                                       req->rq_status = status;
+                                       ptlrpc_rqphase_move(req,
+                                               RQ_PHASE_INTERPRET);
+                                       spin_unlock(&imp->imp_lock);
+                                       GOTO(interpret, req->rq_status);
+                               }
+                               if (ptlrpc_no_resend(req) &&
+                                   !req->rq_wait_ctx) {
+                                       req->rq_status = -ENOTCONN;
+                                       ptlrpc_rqphase_move(req,
+                                                           RQ_PHASE_INTERPRET);
+                                       spin_unlock(&imp->imp_lock);
+                                       GOTO(interpret, req->rq_status);
+                               }
+
+                               list_del_init(&req->rq_list);
+                               list_add_tail(&req->rq_list,
+                                             &imp->imp_sending_list);
+
+                               spin_unlock(&imp->imp_lock);
+
+                               spin_lock(&req->rq_lock);
+                               req->rq_waiting = 0;
+                               spin_unlock(&req->rq_lock);
+
+                               if (req->rq_timedout || req->rq_resend) {
+                                       /* This is being re-sent anyway,
+                                        * so mark the request for resend. */
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_resend = 1;
+                                       spin_unlock(&req->rq_lock);
+                                       if (req->rq_bulk) {
+                                               __u64 old_xid;
+
+                                               if (!ptlrpc_unregister_bulk(req, 1))
+                                                       continue;
+
+                                               /* ensure previous bulk fails */
+                                               old_xid = req->rq_xid;
+                                               req->rq_xid = ptlrpc_next_xid();
+                                               CDEBUG(D_HA, "resend bulk "
+                                                      "old x"LPU64
+                                                      " new x"LPU64"\n",
+                                                      old_xid, req->rq_xid);
+                                       }
+                               }
+                               /*
+                                * rq_wait_ctx is only touched by ptlrpcd,
+                                * so no lock is needed here.
+                                */
+                               status = sptlrpc_req_refresh_ctx(req, -1);
+                               if (status) {
+                                       if (req->rq_err) {
+                                               req->rq_status = status;
+                                               spin_lock(&req->rq_lock);
+                                               req->rq_wait_ctx = 0;
+                                               spin_unlock(&req->rq_lock);
+                                               force_timer_recalc = 1;
+                                       } else {
+                                               spin_lock(&req->rq_lock);
+                                               req->rq_wait_ctx = 1;
+                                               spin_unlock(&req->rq_lock);
+                                       }
+
+                                       continue;
+                               } else {
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_wait_ctx = 0;
+                                       spin_unlock(&req->rq_lock);
+                               }
+
+                               rc = ptl_send_rpc(req, 0);
+                               if (rc) {
+                                       DEBUG_REQ(D_HA, req,
+                                                 "send failed: rc = %d", rc);
+                                       force_timer_recalc = 1;
+                                       spin_lock(&req->rq_lock);
+                                       req->rq_net_err = 1;
+                                       spin_unlock(&req->rq_lock);
+                               }
+                               /* need to reset the timeout */
+                               force_timer_recalc = 1;
+                       }
+
+                       spin_lock(&req->rq_lock);
+
+                       if (ptlrpc_client_early(req)) {
+                               ptlrpc_at_recv_early_reply(req);
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       /* Still waiting for a reply? */
+                       if (ptlrpc_client_recv(req)) {
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       /* Did we actually receive a reply? */
+                       if (!ptlrpc_client_replied(req)) {
+                               spin_unlock(&req->rq_lock);
+                               continue;
+                       }
+
+                       spin_unlock(&req->rq_lock);
+
+                       /* unlink from net because we are going to swab
+                        * the reply buffer in place */
+                       unregistered = ptlrpc_unregister_reply(req, 1);
+                       if (!unregistered)
+                               continue;
+
+                       req->rq_status = after_reply(req);
+                       if (req->rq_resend)
+                               continue;
+
+                       /* If there is no bulk associated with this request,
+                        * then we're done and should let the interpreter
+                        * process the reply. Similarly if the RPC returned
+                        * an error, and therefore the bulk will never arrive.
+                        */
+                       if (req->rq_bulk == NULL || req->rq_status < 0) {
+                               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+                               GOTO(interpret, req->rq_status);
+                       }
+
+                       ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
+               }
+
+               LASSERT(req->rq_phase == RQ_PHASE_BULK);
+               if (ptlrpc_client_bulk_active(req))
+                       continue;
+
+               if (req->rq_bulk->bd_failure) {
+                       /* The RPC reply arrived OK, but the bulk screwed
+                        * up!  Quite weird, since the server told us the
+                        * RPC was good after getting the REPLY for its
+                        * GET or the ACK for its PUT. */
+                       DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
+                       req->rq_status = -EIO;
+               }
+
+               ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
+
+       interpret:
+               LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);
+
+               /* This moves the request to the "unregistering" phase when
+                * we still need to wait for the reply to be unlinked. */
+               if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
+                       /* start async bulk unlink too */
+                       ptlrpc_unregister_bulk(req, 1);
+                       continue;
+               }
+
+               if (!ptlrpc_unregister_bulk(req, 1))
+                       continue;
+
+               /* Receiving must already be finished by the time we
+                * call interpret. */
+               LASSERT(!req->rq_receiving_reply);
+
+               ptlrpc_req_interpret(env, req, req->rq_status);
+
+               ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+
+               CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
+                       "Completed RPC pname:cluuid:pid:xid:nid:"
+                       "opc %s:%s:%d:"LPU64":%s:%d\n",
+                       current_comm(), imp->imp_obd->obd_uuid.uuid,
+                       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+                       libcfs_nid2str(imp->imp_connection->c_peer.nid),
+                       lustre_msg_get_opc(req->rq_reqmsg));
+
+               spin_lock(&imp->imp_lock);
+               /* The request may no longer be on the sending or delayed
+                * list. This happens when it is marked erroneous because
+                * ptlrpc_import_delay_req() found it impossible to allow
+                * sending this RPC and returned *status != 0. */
+               if (!list_empty(&req->rq_list)) {
+                       list_del_init(&req->rq_list);
+                       atomic_dec(&imp->imp_inflight);
+               }
+               spin_unlock(&imp->imp_lock);
+
+               atomic_dec(&set->set_remaining);
+               wake_up_all(&imp->imp_recovery_waitq);
+
+               if (set->set_producer) {
+                       /* produce a new request if possible */
+                       if (ptlrpc_set_producer(set) > 0)
+                               force_timer_recalc = 1;
+
+                       /* free the request that has just been completed
+                        * in order not to pollute set->set_requests */
+                       list_del_init(&req->rq_set_chain);
+                       spin_lock(&req->rq_lock);
+                       req->rq_set = NULL;
+                       req->rq_invalid_rqset = 0;
+                       spin_unlock(&req->rq_lock);
+
+                       /* record rq_status to compute the final status later */
+                       if (req->rq_status != 0)
+                               set->set_rc = req->rq_status;
+                       ptlrpc_req_finished(req);
+               }
+       }
+
+       /* If we hit an error, we want to recover promptly. */
+       RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc);
+}
+EXPORT_SYMBOL(ptlrpc_check_set);
+
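+/*
+ * Illustrative sketch (not part of the code): callers drive a set by
+ * re-checking it after every wakeup, roughly
+ *
+ *     do {
+ *             timeout = ptlrpc_set_next_timeout(set);
+ *             l_wait_event(set->set_waitq,
+ *                          ptlrpc_check_set(NULL, set), &lwi);
+ *     } while (atomic_read(&set->set_remaining) != 0);
+ *
+ * as ptlrpc_set_wait() below does.
+ */
+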
+/**
+ * Time out request \a req. If \a async_unlink is set, do not wait
+ * until LNet actually confirms network buffer unlinking.
+ * Return 1 if we should give up further retrying attempts or 0 otherwise.
+ */
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
+{
+       struct obd_import *imp = req->rq_import;
+       int rc = 0;
+       ENTRY;
+
+       spin_lock(&req->rq_lock);
+       req->rq_timedout = 1;
+       spin_unlock(&req->rq_lock);
+
+       DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
+                 "/real "CFS_DURATION_T"]",
+                 req->rq_net_err ? "failed due to network error" :
+                    ((req->rq_real_sent == 0 ||
+                      cfs_time_before(req->rq_real_sent, req->rq_sent) ||
+                      cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
+                     "timed out for sent delay" : "timed out for slow reply"),
+                 req->rq_sent, req->rq_real_sent);
+
+       if (imp != NULL && obd_debug_peer_on_timeout)
+               LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
+
+       ptlrpc_unregister_reply(req, async_unlink);
+       ptlrpc_unregister_bulk(req, async_unlink);
+
+       if (obd_dump_on_timeout)
+               libcfs_debug_dumplog();
+
+       if (imp == NULL) {
+               DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+               RETURN(1);
+       }
+
+       atomic_inc(&imp->imp_timeouts);
+
+       /* The DLM server doesn't want recovery run on its imports. */
+       if (imp->imp_dlm_fake)
+               RETURN(1);
+
+       /* If this request is for recovery or other primordial tasks,
+        * then error it out here. */
+       if (req->rq_ctx_init || req->rq_ctx_fini ||
+           req->rq_send_state != LUSTRE_IMP_FULL ||
+           imp->imp_obd->obd_no_recov) {
+               DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
+                         ptlrpc_import_state_name(req->rq_send_state),
+                         ptlrpc_import_state_name(imp->imp_state));
+               spin_lock(&req->rq_lock);
+               req->rq_status = -ETIMEDOUT;
+               req->rq_err = 1;
+               spin_unlock(&req->rq_lock);
+               RETURN(1);
+       }
+
+       /* if a request can't be resent we can't wait for an answer after
+        * the timeout */
+       if (ptlrpc_no_resend(req)) {
+               DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
+               rc = 1;
+       }
+
+       ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));
+
+       RETURN(rc);
+}
+
+/**
+ * Time out all uncompleted requests in the request set pointed to by \a data.
+ * Callback used when waiting on sets with l_wait_event.
+ * Always returns 1.
+ */
+int ptlrpc_expired_set(void *data)
+{
+       struct ptlrpc_request_set *set = data;
+       struct list_head *tmp;
+       time_t now = cfs_time_current_sec();
+       ENTRY;
+
+       LASSERT(set != NULL);
+
+       /*
+        * A timeout expired. See which reqs it applies to...
+        */
+       list_for_each (tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               /* don't expire request waiting for context */
+               if (req->rq_wait_ctx)
+                       continue;
+
+               /* Request in-flight? */
+               if (!((req->rq_phase == RQ_PHASE_RPC &&
+                      !req->rq_waiting && !req->rq_resend) ||
+                     (req->rq_phase == RQ_PHASE_BULK)))
+                       continue;
+
+               if (req->rq_timedout ||     /* already dealt with */
+                   req->rq_deadline > now) /* not expired */
+                       continue;
+
+               /* Deal with this guy. Do it asynchronously to not block
+                * ptlrpcd thread. */
+               ptlrpc_expire_one_request(req, 1);
+       }
+
+       /*
+        * When waiting for a whole set, we always break out of the
+        * sleep so we can recalculate the timeout, or enable interrupts
+        * if everyone's timed out.
+        */
+       RETURN(1);
+}
+EXPORT_SYMBOL(ptlrpc_expired_set);
+
+/**
+ * Sets rq_intr flag in \a req under spinlock.
+ */
+void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       req->rq_intr = 1;
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_mark_interrupted);
+
+/**
+ * Interrupts (sets interrupted flag) all uncompleted requests in
+ * a set \a data. Callback for l_wait_event for interruptible waits.
+ */
+void ptlrpc_interrupted_set(void *data)
+{
+       struct ptlrpc_request_set *set = data;
+       struct list_head *tmp;
+
+       LASSERT(set != NULL);
+       CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);
+
+       list_for_each(tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               if (req->rq_phase != RQ_PHASE_RPC &&
+                   req->rq_phase != RQ_PHASE_UNREGISTERING)
+                       continue;
+
+               ptlrpc_mark_interrupted(req);
+       }
+}
+EXPORT_SYMBOL(ptlrpc_interrupted_set);
+
+/**
+ * Get the smallest timeout in the set; this does NOT set a timeout.
+ */
+int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp;
+       time_t now = cfs_time_current_sec();
+       int timeout = 0;
+       struct ptlrpc_request *req;
+       int deadline;
+       ENTRY;
+
+       SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
+
+       list_for_each(tmp, &set->set_requests) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+               /*
+                * Request in-flight?
+                */
+               if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+                     (req->rq_phase == RQ_PHASE_BULK) ||
+                     (req->rq_phase == RQ_PHASE_NEW)))
+                       continue;
+
+               /*
+                * Already timed out.
+                */
+               if (req->rq_timedout)
+                       continue;
+
+               /*
+                * Waiting for ctx.
+                */
+               if (req->rq_wait_ctx)
+                       continue;
+
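+               /* For delayed new requests and delayed resends, rq_sent
+                * holds the future time at which the request should be
+                * (re)sent, so treat it as the deadline. */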
+               if (req->rq_phase == RQ_PHASE_NEW)
+                       deadline = req->rq_sent;
+               else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
+                       deadline = req->rq_sent;
+               else
+                       deadline = req->rq_sent + req->rq_timeout;
+
+               if (deadline <= now)    /* actually expired already */
+                       timeout = 1;    /* ASAP */
+               else if (timeout == 0 || timeout > deadline - now)
+                       timeout = deadline - now;
+       }
+       RETURN(timeout);
+}
+EXPORT_SYMBOL(ptlrpc_set_next_timeout);
+
+/**
+ * Send all unsent requests from the set and then wait until all
+ * requests in the set complete (each either gets a reply, times out,
+ * gets an error, or is otherwise interrupted).
+ * Returns 0 on success or an error code otherwise.
+ */
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp;
+       struct ptlrpc_request *req;
+       struct l_wait_info lwi;
+       int rc, timeout;
+       ENTRY;
+
+       if (set->set_producer)
+               (void)ptlrpc_set_producer(set);
+       else
+               list_for_each(tmp, &set->set_requests) {
+                       req = list_entry(tmp, struct ptlrpc_request,
+                                            rq_set_chain);
+                       if (req->rq_phase == RQ_PHASE_NEW)
+                               (void)ptlrpc_send_new_req(req);
+               }
+
+       if (list_empty(&set->set_requests))
+               RETURN(0);
+
+       do {
+               timeout = ptlrpc_set_next_timeout(set);
+
+               /* wait until all complete, interrupted, or an in-flight
+                * req times out */
+               CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
+                      set, timeout);
+
+               if (timeout == 0 && !cfs_signal_pending())
+                       /*
+                        * No requests are in-flight (either timed out
+                        * or delayed), so we can allow interrupts.
+                        * We still want to block for a limited time,
+                        * so we allow interrupts during the timeout.
+                        */
+                       lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
+                                                  ptlrpc_expired_set,
+                                                  ptlrpc_interrupted_set, set);
+               else
+                       /*
+                        * At least one request is in flight, so no
+                        * interrupts are allowed. Wait until all
+                        * complete, or an in-flight req times out.
+                        */
+                       lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+                                         ptlrpc_expired_set, set);
+
+               rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi);
+
+               /* LU-769 - if we ignored the signal because it was already
+                * pending when we started, we need to handle it now or we risk
+                * it being ignored forever */
+               if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
+                   cfs_signal_pending()) {
+                       sigset_t blocked_sigs =
+                                          cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+                       /* In fact we only interrupt for the "fatal" signals
+                        * like SIGINT or SIGKILL. We still ignore less
+                        * important signals since the ptlrpc set is not
+                        * easily re-entered from userspace again. */
+                       if (cfs_signal_pending())
+                               ptlrpc_interrupted_set(set);
+                       cfs_restore_sigs(blocked_sigs);
+               }
+
+               LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+               /* -EINTR => all requests have been flagged rq_intr so next
+                * check completes.
+                * -ETIMEDOUT => someone timed out.  When all reqs have
+                * timed out, signals are enabled allowing completion with
+                * EINTR.
+                * I don't really care if we go once more round the loop in
+                * the error cases -eeb. */
+               if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
+                       list_for_each(tmp, &set->set_requests) {
+                               req = list_entry(tmp, struct ptlrpc_request,
+                                                    rq_set_chain);
+                               spin_lock(&req->rq_lock);
+                               req->rq_invalid_rqset = 1;
+                               spin_unlock(&req->rq_lock);
+                       }
+               }
+       } while (rc != 0 || atomic_read(&set->set_remaining) != 0);
+
+       LASSERT(atomic_read(&set->set_remaining) == 0);
+
+       rc = set->set_rc; /* rq_status of already freed requests if any */
+       list_for_each(tmp, &set->set_requests) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+               LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
+               if (req->rq_status != 0)
+                       rc = req->rq_status;
+       }
+
+       if (set->set_interpret != NULL) {
+               int (*interpreter)(struct ptlrpc_request_set *, void *, int) =
+                       set->set_interpret;
+               rc = interpreter(set, set->set_arg, rc);
+       } else {
+               struct ptlrpc_set_cbdata *cbdata, *n;
+               int err;
+
+               list_for_each_entry_safe(cbdata, n,
+                                        &set->set_cblist, psc_item) {
+                       list_del_init(&cbdata->psc_item);
+                       err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
+                       if (err && !rc)
+                               rc = err;
+                       OBD_FREE_PTR(cbdata);
+               }
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
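+/*
+ * Illustrative sketch (my_set_interpret and my_cookie are hypothetical,
+ * caller-defined names): a per-set completion hook can be installed
+ * before waiting:
+ *
+ *     set = ptlrpc_prep_set();
+ *     set->set_interpret = my_set_interpret;
+ *     set->set_arg       = my_cookie;
+ *     ptlrpc_set_add_req(set, req);
+ *     rc = ptlrpc_set_wait(set);
+ *     ptlrpc_set_destroy(set);
+ */
+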
+/**
+ * Helper function for request freeing.
+ * Called when the request refcount reaches zero and the request needs to be
+ * freed. Removes the request from any sending/replay lists it might be on
+ * and frees network buffers if any are present.
+ * If \a locked is set, the caller already holds the import's imp_lock, so we
+ * need not take it again (for certain list manipulations).
+ */
+static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
+{
+       ENTRY;
+       if (request == NULL) {
+               EXIT;
+               return;
+       }
+
+       LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
+       /* client-side */
+       LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);
+       LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
+       LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+       LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
+       LASSERTF(!request->rq_replay, "req %p\n", request);
+
+       req_capsule_fini(&request->rq_pill);
+
+       /* We must take it off the imp_replay_list first.  Otherwise, we'll set
+        * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
+       if (request->rq_import != NULL) {
+               if (!locked)
+                       spin_lock(&request->rq_import->imp_lock);
+               list_del_init(&request->rq_replay_list);
+               if (!locked)
+                       spin_unlock(&request->rq_import->imp_lock);
+       }
+       LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);
+
+       if (atomic_read(&request->rq_refcount) != 0) {
+               DEBUG_REQ(D_ERROR, request,
+                         "freeing request with nonzero refcount");
+               LBUG();
+       }
+
+       if (request->rq_repbuf != NULL)
+               sptlrpc_cli_free_repbuf(request);
+       if (request->rq_export != NULL) {
+               class_export_put(request->rq_export);
+               request->rq_export = NULL;
+       }
+       if (request->rq_import != NULL) {
+               class_import_put(request->rq_import);
+               request->rq_import = NULL;
+       }
+       if (request->rq_bulk != NULL)
+               ptlrpc_free_bulk_pin(request->rq_bulk);
+
+       if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
+               sptlrpc_cli_free_reqbuf(request);
+
+       if (request->rq_cli_ctx)
+               sptlrpc_req_put_ctx(request, !locked);
+
+       if (request->rq_pool)
+               __ptlrpc_free_req_to_pool(request);
+       else
+               OBD_FREE(request, sizeof(*request));
+       EXIT;
+}
+
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
+/**
+ * Drop one request reference. Must be called with import imp_lock held.
+ * When reference count drops to zero, request is freed.
+ */
+void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
+{
+       LASSERT(spin_is_locked(&request->rq_import->imp_lock));
+       (void)__ptlrpc_req_finished(request, 1);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);
+
+/**
+ * Helper function.
+ * Drops one reference count for request \a request.
+ * \a locked set indicates that the caller holds the import's imp_lock.
+ * Frees the request when the reference count reaches zero.
+ */
+static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
+{
+       ENTRY;
+       if (request == NULL)
+               RETURN(1);
+
+       if (request == LP_POISON ||
+           request->rq_reqmsg == LP_POISON) {
+               CERROR("dereferencing freed request (bug 575)\n");
+               LBUG();
+               RETURN(1);
+       }
+
+       DEBUG_REQ(D_INFO, request, "refcount now %u",
+                 atomic_read(&request->rq_refcount) - 1);
+
+       if (atomic_dec_and_test(&request->rq_refcount)) {
+               __ptlrpc_free_req(request, locked);
+               RETURN(1);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Drops one reference count for a request.
+ */
+void ptlrpc_req_finished(struct ptlrpc_request *request)
+{
+       __ptlrpc_req_finished(request, 0);
+}
+EXPORT_SYMBOL(ptlrpc_req_finished);
+
+/**
+ * Returns the xid of \a request.
+ */
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+       return request->rq_xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
+/**
+ * Disengage the client's reply buffer from the network
+ * NB does _NOT_ unregister any client-side bulk.
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ * Returns 1 when the reply is fully disengaged, or 0 when unlinking is
+ * still in progress (async mode).
+ */
+int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
+{
+       int rc;
+       wait_queue_head_t *wq;
+       struct l_wait_info lwi;
+
+       /*
+        * Might sleep.
+        */
+       LASSERT(!in_interrupt());
+
+       /*
+        * Let's setup deadline for reply unlink.
+        */
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
+           async && request->rq_reply_deadline == 0)
+               request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK;
+
+       /*
+        * Nothing left to do.
+        */
+       if (!ptlrpc_client_recv_or_unlink(request))
+               RETURN(1);
+
+       LNetMDUnlink(request->rq_reply_md_h);
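+       /* NB: LNetMDUnlink() is asynchronous - the MD may still be busy,
+        * which is why we check again below. */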
+
+       /*
+        * Let's check it once again.
+        */
+       if (!ptlrpc_client_recv_or_unlink(request))
+               RETURN(1);
+
+       /*
+        * Move to "Unregistering" phase as reply was not unlinked yet.
+        */
+       ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);
+
+       /*
+        * Do not wait for unlink to finish.
+        */
+       if (async)
+               RETURN(0);
+
+       /*
+        * We have to l_wait_event() whatever the result, to give liblustre
+        * a chance to run reply_in_callback(), and to make sure we've
+        * unlinked before returning a req to the pool.
+        */
+       if (request->rq_set != NULL)
+               wq = &request->rq_set->set_waitq;
+       else
+               wq = &request->rq_reply_waitq;
+
+       for (;;) {
+               /* Network access will complete in finite time but the HUGE
+                * timeout lets us CWARN for visibility of sluggish NALs */
+               lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                          cfs_time_seconds(1), NULL, NULL);
+               rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
+                                 &lwi);
+               if (rc == 0) {
+                       ptlrpc_rqphase_move(request, request->rq_next_phase);
+                       RETURN(1);
+               }
+
+               LASSERT(rc == -ETIMEDOUT);
+               DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout "
+                         "rvcng=%d unlnk=%d", request->rq_receiving_reply,
+                         request->rq_must_unlink);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+
+/**
+ * Iterates through the replay_list on the import and prunes
+ * all requests that have a transno smaller than last_committed for the
+ * import and do not have rq_replay set.
+ * Since requests are sorted in transno order, it stops on meeting the first
+ * transno bigger than last_committed.
+ * The caller must hold imp->imp_lock.
+ */
+void ptlrpc_free_committed(struct obd_import *imp)
+{
+       struct list_head *tmp, *saved;
+       struct ptlrpc_request *req;
+       struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+       ENTRY;
+
+       LASSERT(imp != NULL);
+
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+       if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+           imp->imp_generation == imp->imp_last_generation_checked) {
+               CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n",
+                      imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+               EXIT;
+               return;
+       }
+       CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
+              imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+              imp->imp_generation);
+       imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+       imp->imp_last_generation_checked = imp->imp_generation;
+
+       list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
+               req = list_entry(tmp, struct ptlrpc_request,
+                                    rq_replay_list);
+
+               /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
+               LASSERT(req != last_req);
+               last_req = req;
+
+               if (req->rq_transno == 0) {
+                       DEBUG_REQ(D_EMERG, req, "zero transno during replay");
+                       LBUG();
+               }
+               if (req->rq_import_generation < imp->imp_generation) {
+                       DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
+                       GOTO(free_req, 0);
+               }
+
+               if (req->rq_replay) {
+                       DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
+                       continue;
+               }
+
+               /* not yet committed */
+               if (req->rq_transno > imp->imp_peer_committed_transno) {
+                       DEBUG_REQ(D_RPCTRACE, req, "stopping search");
+                       break;
+               }
+
+               DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")",
+                         imp->imp_peer_committed_transno);
+free_req:
+               spin_lock(&req->rq_lock);
+               req->rq_replay = 0;
+               spin_unlock(&req->rq_lock);
+               if (req->rq_commit_cb != NULL)
+                       req->rq_commit_cb(req);
+               list_del_init(&req->rq_replay_list);
+               __ptlrpc_req_finished(req, 1);
+       }
+
+       EXIT;
+       return;
+}
+
+void ptlrpc_cleanup_client(struct obd_import *imp)
+{
+       ENTRY;
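+       /* currently a no-op */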
+       EXIT;
+       return;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+
+/**
+ * Schedule previously sent request for resend.
+ * For bulk requests we assign a new xid (to avoid problems with
+ * lost replies and therefore several transfers landing in the same
+ * buffer from different sending attempts).
+ */
+void ptlrpc_resend_req(struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "going to resend");
+       lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
+       req->rq_status = -EAGAIN;
+
+       spin_lock(&req->rq_lock);
+       req->rq_resend = 1;
+       req->rq_net_err = 0;
+       req->rq_timedout = 0;
+       if (req->rq_bulk) {
+               __u64 old_xid = req->rq_xid;
+
+               /* ensure previous bulk fails */
+               req->rq_xid = ptlrpc_next_xid();
+               CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n",
+                      old_xid, req->rq_xid);
+       }
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_resend_req);
+
+/* XXX: this function and rq_status are currently unused */
+void ptlrpc_restart_req(struct ptlrpc_request *req)
+{
+       DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
+       req->rq_status = -ERESTARTSYS;
+
+       spin_lock(&req->rq_lock);
+       req->rq_restart = 1;
+       req->rq_timedout = 0;
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+EXPORT_SYMBOL(ptlrpc_restart_req);
+
+/**
+ * Grab an additional reference on request \a req.
+ */
+struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
+{
+       ENTRY;
+       atomic_inc(&req->rq_refcount);
+       RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpc_request_addref);
+
+/**
+ * Add a request to import replay_list.
+ * Must be called under imp_lock
+ */
+void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
+                                     struct obd_import *imp)
+{
+       struct list_head *tmp;
+
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+       if (req->rq_transno == 0) {
+               DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
+               LBUG();
+       }
+
+       /* clear this for new requests that were resent as well
+        * as for resent replayed requests */
+       lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);
+
+       /* don't re-add requests that have been replayed */
+       if (!list_empty(&req->rq_replay_list))
+               return;
+
+       lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);
+
+       LASSERT(imp->imp_replayable);
+       /* Balanced in ptlrpc_free_committed, usually. */
+       ptlrpc_request_addref(req);
+       list_for_each_prev(tmp, &imp->imp_replay_list) {
+               struct ptlrpc_request *iter =
+                       list_entry(tmp, struct ptlrpc_request,
+                                      rq_replay_list);
+
+               /* We may have duplicate transnos if we create and then
+                * open a file, or for closes retained to match creating
+                * opens, so use req->rq_xid as a secondary key.
+                * (See bugs 684, 685, and 428.)
+                * XXX no longer needed, but all opens need transnos!
+                */
+               if (iter->rq_transno > req->rq_transno)
+                       continue;
+
+               if (iter->rq_transno == req->rq_transno) {
+                       LASSERT(iter->rq_xid != req->rq_xid);
+                       if (iter->rq_xid > req->rq_xid)
+                               continue;
+               }
+
+               list_add(&req->rq_replay_list, &iter->rq_replay_list);
+               return;
+       }
+
+       list_add(&req->rq_replay_list, &imp->imp_replay_list);
+}
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+
+/**
+ * Send request and wait until it completes.
+ * Returns request processing status.
+ */
+int ptlrpc_queue_wait(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *set;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_set == NULL);
+       LASSERT(!req->rq_receiving_reply);
+
+       set = ptlrpc_prep_set();
+       if (set == NULL) {
+               CERROR("Unable to allocate ptlrpc set\n");
+               RETURN(-ENOMEM);
+       }
+
+       /* for distributed debugging */
+       lustre_msg_set_status(req->rq_reqmsg, current_pid());
+
+       /* add a ref for the set (see comment in ptlrpc_set_add_req) */
+       ptlrpc_request_addref(req);
+       ptlrpc_set_add_req(set, req);
+       rc = ptlrpc_set_wait(set);
+       ptlrpc_set_destroy(set);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+
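+/**
+ * State preserved across an asynchronous replay: the pre-replay
+ * rq_send_state and the original reply status, saved here and restored
+ * by ptlrpc_replay_interpret() below.
+ */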
+struct ptlrpc_replay_async_args {
+       int praa_old_state;
+       int praa_old_status;
+};
+
+/**
+ * Callback used for replayed requests reply processing.
+ * In case of a successful reply it calls the registered request replay
+ * callback. In case of error it restarts the replay process.
+ */
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+                                  struct ptlrpc_request *req,
+                                  void *data, int rc)
+{
+       struct ptlrpc_replay_async_args *aa = data;
+       struct obd_import *imp = req->rq_import;
+
+       ENTRY;
+       atomic_dec(&imp->imp_replay_inflight);
+
+       if (!ptlrpc_client_replied(req)) {
+               CERROR("request replay timed out, restarting recovery\n");
+               GOTO(out, rc = -ETIMEDOUT);
+       }
+
+       if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
+           (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
+            lustre_msg_get_status(req->rq_repmsg) == -ENODEV))
+               GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg));
+
+       /** VBR: check version failure */
+       if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
+               /** replay was failed due to version mismatch */
+               DEBUG_REQ(D_WARNING, req, "Version mismatch during replay");
+               spin_lock(&imp->imp_lock);
+               imp->imp_vbr_failed = 1;
+               imp->imp_no_lock_replay = 1;
+               spin_unlock(&imp->imp_lock);
+               lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+       } else {
+               /** The transno had better not change over replay. */
+               LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
+                        lustre_msg_get_transno(req->rq_repmsg) ||
+                        lustre_msg_get_transno(req->rq_repmsg) == 0,
+                        LPX64"/"LPX64"\n",
+                        lustre_msg_get_transno(req->rq_reqmsg),
+                        lustre_msg_get_transno(req->rq_repmsg));
+       }
+
+       spin_lock(&imp->imp_lock);
+       /** if replay was done by version then a gap occurred on the
+        *  server, so locks cannot be trusted */
+       if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
+               imp->imp_no_lock_replay = 1;
+       imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
+       spin_unlock(&imp->imp_lock);
+       LASSERT(imp->imp_last_replay_transno);
+
+       /* transaction number shouldn't be bigger than the latest replayed */
+       if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
+               DEBUG_REQ(D_ERROR, req,
+                         "Reported transno "LPU64" is bigger than the "
+                         "replayed one: "LPU64, req->rq_transno,
+                         lustre_msg_get_transno(req->rq_reqmsg));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       DEBUG_REQ(D_HA, req, "got rep");
+
+       /* let the callback do fixups, possibly including in the request */
+       if (req->rq_replay_cb)
+               req->rq_replay_cb(req);
+
+       if (ptlrpc_client_replied(req) &&
+           lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) {
+               DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
+                         lustre_msg_get_status(req->rq_repmsg),
+                         aa->praa_old_status);
+       } else {
+               /* Put it back for re-replay. */
+               lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
+       }
+
+       /*
+        * Errors during replay can set transno to 0, but
+        * imp_last_replay_transno shouldn't be set to 0 anyway
+        */
+       if (req->rq_transno == 0)
+               CERROR("Transno is 0 during replay!\n");
+
+       /* continue with recovery */
+       rc = ptlrpc_import_recovery_state_machine(imp);
+ out:
+       req->rq_send_state = aa->praa_old_state;
+
+       if (rc != 0)
+               /* this replay failed, so restart recovery */
+               ptlrpc_connect_import(imp);
+
+       RETURN(rc);
+}
+
+/**
+ * Prepares and queues request for replay.
+ * Adds it to ptlrpcd queue for actual sending.
+ * Returns 0 on success.
+ */
+int ptlrpc_replay_req(struct ptlrpc_request *req)
+{
+       struct ptlrpc_replay_async_args *aa;
+       ENTRY;
+
+       LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
+
+       LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args));
+       aa = ptlrpc_req_async_args(req);
+       memset(aa, 0, sizeof *aa);
+
+       /* Prepare request to be resent with ptlrpcd */
+       aa->praa_old_state = req->rq_send_state;
+       req->rq_send_state = LUSTRE_IMP_REPLAY;
+       req->rq_phase = RQ_PHASE_NEW;
+       req->rq_next_phase = RQ_PHASE_UNDEFINED;
+       if (req->rq_repmsg)
+               aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
+       req->rq_status = 0;
+       req->rq_interpret_reply = ptlrpc_replay_interpret;
+       /* Readjust the timeout for current conditions */
+       ptlrpc_at_set_req_timeout(req);
+
+       /* Tell server the net_latency, so the server can calculate how long
+        * it should wait for the next replay */
+       lustre_msg_set_service_time(req->rq_reqmsg,
+                                   ptlrpc_at_get_net_latency(req));
+       DEBUG_REQ(D_HA, req, "REPLAY");
+
+       atomic_inc(&req->rq_import->imp_replay_inflight);
+       ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
+
+       ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_replay_req);
+
+/**
+ * Aborts all in-flight requests on the sending and delayed lists of import \a imp
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+       struct list_head *tmp, *n;
+       ENTRY;
+
+       /* Make sure that no new requests get processed for this import.
+        * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+        * this flag and then putting requests on sending_list or delayed_list.
+        */
+       spin_lock(&imp->imp_lock);
+
+       /* XXX locking?  Maybe we should remove each request with the list
+        * locked?  Also, how do we know if the requests on the list are
+        * being freed at this time?
+        */
+       list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_RPCTRACE, req, "inflight");
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_import_generation < imp->imp_generation) {
+                       req->rq_err = 1;
+                       req->rq_status = -EIO;
+                       ptlrpc_client_wake_req(req);
+               }
+               spin_unlock(&req->rq_lock);
+       }
+
+       list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
+               struct ptlrpc_request *req =
+                       list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_import_generation < imp->imp_generation) {
+                       req->rq_err = 1;
+                       req->rq_status = -EIO;
+                       ptlrpc_client_wake_req(req);
+               }
+               spin_unlock(&req->rq_lock);
+       }
+
+       /* Last chance to free reqs left on the replay list, but we
+        * will still leak reqs that haven't committed.  */
+       if (imp->imp_replayable)
+               ptlrpc_free_committed(imp);
+
+       spin_unlock(&imp->imp_lock);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+
+/**
+ * Abort all uncompleted requests in request set \a set
+ */
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *pos;
+
+       LASSERT(set != NULL);
+
+       list_for_each_safe(pos, tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(pos, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               spin_lock(&req->rq_lock);
+               if (req->rq_phase != RQ_PHASE_RPC) {
+                       spin_unlock(&req->rq_lock);
+                       continue;
+               }
+
+               req->rq_err = 1;
+               req->rq_status = -EINTR;
+               ptlrpc_client_wake_req(req);
+               spin_unlock(&req->rq_lock);
+       }
+}
+
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/**
+ * Initialize the XID for the node.  This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing.  It does not need to be sequential.  Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would deliver old data into the wrong RDMA buffer), initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+       time_t now = cfs_time_current_sec();
+
+       spin_lock_init(&ptlrpc_last_xid_lock);
+       if (now < YEAR_2004) {
+               cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+               ptlrpc_last_xid >>= 2;
+               ptlrpc_last_xid |= (1ULL << 61);
+       } else {
+               ptlrpc_last_xid = (__u64)now << 20;
+       }
+
+       /* Need to always be aligned to a power-of-two for multi-bulk BRW */
+       CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
+       ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
+}
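
The time-based branch above gives roughly one XID of headroom per microsecond of wall-clock time. A minimal standalone sketch of that branch (BULK_OPS_COUNT is an assumed stand-in for PTLRPC_BULK_OPS_COUNT; the real code additionally falls back to a 62-bit random value when the clock reads earlier than 2004):

#include <stdint.h>
#include <time.h>

#define BULK_OPS_COUNT 4ULL                    /* assumed power of two */
#define BULK_OPS_MASK  (~(BULK_OPS_COUNT - 1))

static uint64_t init_xid(void)
{
	/* << 20 leaves room for ~1M XIDs per second since the epoch */
	uint64_t xid = (uint64_t)time(NULL) << 20;

	/* round down so the very first xid starts a full bulk block */
	return xid & BULK_OPS_MASK;
}
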
+
+/**
+ * Increases the xid and returns the resulting new value to the caller.
+ *
+ * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
+ * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
+ * itself uses the last bulk xid needed, so the server can determine the
+ * number of bulk transfers from the RPC XID and a bitmask.  The starting
+ * xid must align to a power-of-two value.
+ *
+ * This is assumed to be true due to the initial ptlrpc_last_xid
+ * value also being initialized to a power-of-two value. LU-1431
+ */
+__u64 ptlrpc_next_xid(void)
+{
+       __u64 next;
+
+       spin_lock(&ptlrpc_last_xid_lock);
+       next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+       ptlrpc_last_xid = next;
+       spin_unlock(&ptlrpc_last_xid_lock);
+
+       return next;
+}
+EXPORT_SYMBOL(ptlrpc_next_xid);
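
Because every value handed out above is a multiple of PTLRPC_BULK_OPS_COUNT, the low bits of the RPC XID alone tell the server how many bulk transfers the request used. A sketch of both sides, assuming a count of 4:

#include <stdint.h>

#define BULK_OPS_COUNT 4u   /* assumed value of PTLRPC_BULK_OPS_COUNT */

/* Client: a BRW assigned starting xid N uses N .. N + nbulk - 1 as
 * match bits and sends the last of those as its RPC XID. */
static uint64_t brw_rpc_xid(uint64_t start_xid, unsigned int nbulk)
{
	return start_xid + nbulk - 1;
}

/* Server: recover the transfer count from the RPC XID alone; this
 * works only because start_xid is BULK_OPS_COUNT-aligned. */
static unsigned int brw_bulk_count(uint64_t rpc_xid)
{
	return (unsigned int)(rpc_xid & (BULK_OPS_COUNT - 1)) + 1;
}
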
+
+/**
+ * Get a glimpse at what the next xid value might be.
+ * Returns the possible next xid.
+ */
+__u64 ptlrpc_sample_next_xid(void)
+{
+#if BITS_PER_LONG == 32
+       /* need to avoid possible word tearing on 32-bit systems */
+       __u64 next;
+
+       spin_lock(&ptlrpc_last_xid_lock);
+       next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+       spin_unlock(&ptlrpc_last_xid_lock);
+
+       return next;
+#else
+       /* No need to lock, since returned value is racy anyway */
+       return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
+#endif
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
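
The 32-bit special case above exists because a 64-bit load may compile to two 32-bit loads, and a concurrent ptlrpc_next_xid() between the two halves can produce a value that was never stored. A sketch of the hazard (illustrative only):

#include <stdint.h>

/* On a 32-bit CPU this read can tear: if a writer moves the counter
 * from 0x00000000ffffffff to 0x0000000100000000 between the two
 * half-word loads, the reader may observe 0x0000000000000000 or
 * 0x00000001ffffffff, values that were never written. */
static uint64_t racy_read(const volatile uint64_t *p)
{
	return *p;
}
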
+
+/**
+ * Functions for operating ptlrpc workers.
+ *
+ * A ptlrpc work is a function which runs inside ptlrpc context.
+ * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
+ *
+ * 1. after a work is created, it can be used many times, that is:
+ *      handler = ptlrpcd_alloc_work();
+ *      ptlrpcd_queue_work();
+ *
+ *    queue it again when necessary:
+ *      ptlrpcd_queue_work();
+ *      ptlrpcd_destroy_work();
+ * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
+ *    but it will only be queued once at any given time. Also, as its name
+ *    implies, there may be a delay before it actually runs on a ptlrpcd thread.
+ */
+struct ptlrpc_work_async_args {
+       __u64   magic;
+       int   (*cb)(const struct lu_env *, void *);
+       void   *cbdata;
+};
+
+#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */
+
+static int work_interpreter(const struct lu_env *env,
+                           struct ptlrpc_request *req, void *data, int rc)
+{
+       struct ptlrpc_work_async_args *arg = data;
+
+       LASSERT(arg->magic == PTLRPC_WORK_MAGIC);
+       LASSERT(arg->cb != NULL);
+
+       return arg->cb(env, arg->cbdata);
+}
+
+/**
+ * Create a work for ptlrpc.
+ */
+void *ptlrpcd_alloc_work(struct obd_import *imp,
+                        int (*cb)(const struct lu_env *, void *), void *cbdata)
+{
+       struct ptlrpc_request    *req = NULL;
+       struct ptlrpc_work_async_args *args;
+       ENTRY;
+
+       might_sleep();
+
+       if (cb == NULL)
+               RETURN(ERR_PTR(-EINVAL));
+
+       /* copy some code from deprecated fakereq. */
+       OBD_ALLOC_PTR(req);
+       if (req == NULL) {
+               CERROR("ptlrpc: run out of memory!\n");
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+       req->rq_type = PTL_RPC_MSG_REQUEST;
+       req->rq_import = class_import_get(imp);
+       req->rq_export = NULL;
+       req->rq_interpret_reply = work_interpreter;
+       /* don't want reply */
+       req->rq_receiving_reply = 0;
+       req->rq_must_unlink = 0;
+       req->rq_no_delay = req->rq_no_resend = 1;
+
+       spin_lock_init(&req->rq_lock);
+       INIT_LIST_HEAD(&req->rq_list);
+       INIT_LIST_HEAD(&req->rq_replay_list);
+       INIT_LIST_HEAD(&req->rq_set_chain);
+       INIT_LIST_HEAD(&req->rq_history_list);
+       INIT_LIST_HEAD(&req->rq_exp_list);
+       init_waitqueue_head(&req->rq_reply_waitq);
+       init_waitqueue_head(&req->rq_set_waitq);
+       atomic_set(&req->rq_refcount, 1);
+
+       CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args));
+       args = ptlrpc_req_async_args(req);
+       args->magic  = PTLRPC_WORK_MAGIC;
+       args->cb     = cb;
+       args->cbdata = cbdata;
+
+       RETURN(req);
+}
+EXPORT_SYMBOL(ptlrpcd_alloc_work);
+
+void ptlrpcd_destroy_work(void *handler)
+{
+       struct ptlrpc_request *req = handler;
+
+       if (req)
+               ptlrpc_req_finished(req);
+}
+EXPORT_SYMBOL(ptlrpcd_destroy_work);
+
+int ptlrpcd_queue_work(void *handler)
+{
+       struct ptlrpc_request *req = handler;
+
+       /*
+        * Check if the req is already being queued.
+        *
+        * Here comes a trick: ptlrpc lacks a reliable way of checking whether
+        * a req is being processed, so I have to use the req's refcount for
+        * this purpose. This is okay because the caller should use this
+        * req as opaque data. - Jinshan
+        */
+       LASSERT(atomic_read(&req->rq_refcount) > 0);
+       if (atomic_read(&req->rq_refcount) > 1)
+               return -EBUSY;
+
+       if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */
+               atomic_dec(&req->rq_refcount);
+               return -EBUSY;
+       }
+
+       /* re-initialize the req */
+       req->rq_timeout = obd_timeout;
+       req->rq_sent       = cfs_time_current_sec();
+       req->rq_deadline       = req->rq_sent + req->rq_timeout;
+       req->rq_reply_deadline = req->rq_deadline;
+       req->rq_phase     = RQ_PHASE_INTERPRET;
+       req->rq_next_phase     = RQ_PHASE_COMPLETE;
+       req->rq_xid         = ptlrpc_next_xid();
+       req->rq_import_generation = req->rq_import->imp_generation;
+
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpcd_queue_work);
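
A minimal usage sketch of the work API defined above, following the outline in its header comment; my_flush_cb and my_setup are hypothetical names, and the callback must not sleep since it runs in ptlrpcd context:

/* hypothetical callback, runs in ptlrpcd context */
static int my_flush_cb(const struct lu_env *env, void *data)
{
	/* short, non-blocking work only */
	return 0;
}

static void my_setup(struct obd_import *imp)
{
	void *work = ptlrpcd_alloc_work(imp, my_flush_cb, NULL);

	if (IS_ERR(work))
		return;

	ptlrpcd_queue_work(work);   /* returns -EBUSY if already queued */
	/* ... re-queue the same handler whenever needed ... */
	ptlrpcd_destroy_work(work); /* drops the allocation reference */
}
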
diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c
new file mode 100644 (file)
index 0000000..a0757f3
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+static cfs_hash_t *conn_hash = NULL;
+static cfs_hash_ops_t conn_hash_ops;
+
+struct ptlrpc_connection *
+ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
+                     struct obd_uuid *uuid)
+{
+       struct ptlrpc_connection *conn, *conn2;
+       ENTRY;
+
+       conn = cfs_hash_lookup(conn_hash, &peer);
+       if (conn)
+               GOTO(out, conn);
+
+       OBD_ALLOC_PTR(conn);
+       if (!conn)
+               RETURN(NULL);
+
+       conn->c_peer = peer;
+       conn->c_self = self;
+       INIT_HLIST_NODE(&conn->c_hash);
+       atomic_set(&conn->c_refcount, 1);
+       if (uuid)
+               obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
+
+       /*
+        * Add the newly created conn to the hash; on key collision we
+        * lost a racing addition and must destroy our newly allocated
+        * connection.  The object which exists in the hash will be
+        * returned and may be compared against our object.
+        */
+       /* In the function below, .hs_keycmp resolves to
+        * conn_keycmp() */
+       /* coverity[overrun-buffer-val] */
+       conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
+       if (conn != conn2) {
+               OBD_FREE_PTR(conn);
+               conn = conn2;
+       }
+       EXIT;
+out:
+       CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+       return conn;
+}
+EXPORT_SYMBOL(ptlrpc_connection_get);
+
+int ptlrpc_connection_put(struct ptlrpc_connection *conn)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!conn)
+               RETURN(rc);
+
+       LASSERT(atomic_read(&conn->c_refcount) > 1);
+
+       /*
+        * We do not remove the connection from the hashtable and
+        * do not free it even if the last caller released its ref,
+        * as we want to have it cached in case it is
+        * needed again.
+        *
+        * Deallocating it and later creating a new connection
+        * again would be wasteful. This way we also avoid
+        * expensive locking to protect things from a get/put
+        * race when a found cached connection is freed by
+        * ptlrpc_connection_put().
+        *
+        * It will be freed later at module unload time,
+        * when the ptlrpc_connection_fini()->lh_exit->conn_exit()
+        * path is called.
+        */
+       if (atomic_dec_return(&conn->c_refcount) == 1)
+               rc = 1;
+
+       CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connection_put);
+
+struct ptlrpc_connection *
+ptlrpc_connection_addref(struct ptlrpc_connection *conn)
+{
+       ENTRY;
+
+       atomic_inc(&conn->c_refcount);
+       CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
+              conn, atomic_read(&conn->c_refcount),
+              libcfs_nid2str(conn->c_peer.nid));
+
+       RETURN(conn);
+}
+EXPORT_SYMBOL(ptlrpc_connection_addref);
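
A sketch of the lifecycle these three functions imply (peer and self are placeholders): the hash itself keeps one reference, so ptlrpc_connection_put() returning 1 means the last user is gone, not that the object was freed; the connection stays cached in conn_hash until module unload.

static void conn_lifecycle(lnet_process_id_t peer, lnet_nid_t self)
{
	struct ptlrpc_connection *conn;

	conn = ptlrpc_connection_get(peer, self, NULL); /* refs: hash + us */
	if (!conn)
		return;

	ptlrpc_connection_addref(conn);          /* hand to a second user */

	ptlrpc_connection_put(conn);             /* second user done */
	if (ptlrpc_connection_put(conn))         /* last user done */
		CDEBUG(D_INFO, "connection stays cached for reuse\n");
}
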
+
+int ptlrpc_connection_init(void)
+{
+       ENTRY;
+
+       conn_hash = cfs_hash_create("CONN_HASH",
+                                   HASH_CONN_CUR_BITS,
+                                   HASH_CONN_MAX_BITS,
+                                   HASH_CONN_BKT_BITS, 0,
+                                   CFS_HASH_MIN_THETA,
+                                   CFS_HASH_MAX_THETA,
+                                   &conn_hash_ops, CFS_HASH_DEFAULT);
+       if (!conn_hash)
+               RETURN(-ENOMEM);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_connection_init);
+
+void ptlrpc_connection_fini(void)
+{
+       ENTRY;
+       cfs_hash_putref(conn_hash);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_connection_fini);
+
+/*
+ * Hash operations for net_peer<->connection
+ */
+static unsigned
+conn_hashfn(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+       return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask);
+}
+
+static int
+conn_keycmp(const void *key, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+       const lnet_process_id_t *conn_key;
+
+       LASSERT(key != NULL);
+       conn_key = (lnet_process_id_t*)key;
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+
+       return conn_key->nid == conn->c_peer.nid &&
+              conn_key->pid == conn->c_peer.pid;
+}
+
+static void *
+conn_key(struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       return &conn->c_peer;
+}
+
+static void *
+conn_object(struct hlist_node *hnode)
+{
+       return hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+}
+
+static void
+conn_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       atomic_inc(&conn->c_refcount);
+}
+
+static void
+conn_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       atomic_dec(&conn->c_refcount);
+}
+
+static void
+conn_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+       struct ptlrpc_connection *conn;
+
+       conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
+       /*
+        * Nothing should be left. The connection user has put it, and
+        * the connection was also deleted from the table by this time,
+        * so we should have 0 refs.
+        */
+       LASSERTF(atomic_read(&conn->c_refcount) == 0,
+                "Busy connection with %d refs\n",
+                atomic_read(&conn->c_refcount));
+       OBD_FREE_PTR(conn);
+}
+
+static cfs_hash_ops_t conn_hash_ops = {
+       .hs_hash        = conn_hashfn,
+       .hs_keycmp      = conn_keycmp,
+       .hs_key  = conn_key,
+       .hs_object      = conn_object,
+       .hs_get  = conn_get,
+       .hs_put_locked  = conn_put_locked,
+       .hs_exit        = conn_exit,
+};
diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c
new file mode 100644 (file)
index 0000000..0264c10
--- /dev/null
@@ -0,0 +1,595 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# include <linux/libcfs/libcfs.h>
+# ifdef __mips64__
+#  include <linux/kernel.h>
+# endif
+
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+#include "ptlrpc_internal.h"
+
+lnet_handle_eq_t   ptlrpc_eq_h;
+
+/*
+ *  Client's outgoing request callback
+ */
+void request_out_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+       struct ptlrpc_request *req = cbid->cbid_arg;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_SEND ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->unlinked);
+
+       DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+       sptlrpc_request_out_callback(req);
+       req->rq_real_sent = cfs_time_current_sec();
+
+       if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) {
+
+               /* Failed send: make it seem like the reply timed out, just
+                * like failing sends in client.c does currently...  */
+
+               spin_lock(&req->rq_lock);
+               req->rq_net_err = 1;
+               spin_unlock(&req->rq_lock);
+
+               ptlrpc_client_wake_req(req);
+       }
+
+       ptlrpc_req_finished(req);
+
+       EXIT;
+}
+
+/*
+ * Client's incoming reply callback
+ */
+void reply_in_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id   *cbid = ev->md.user_ptr;
+       struct ptlrpc_request *req = cbid->cbid_arg;
+       ENTRY;
+
+       DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status);
+
+       LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->md.start == req->rq_repbuf);
+       LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len);
+       /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests
+          for adaptive timeouts' early reply. */
+       LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0);
+
+       spin_lock(&req->rq_lock);
+
+       req->rq_receiving_reply = 0;
+       req->rq_early = 0;
+       if (ev->unlinked)
+               req->rq_must_unlink = 0;
+
+       if (ev->status)
+               goto out_wake;
+
+       if (ev->type == LNET_EVENT_UNLINK) {
+               LASSERT(ev->unlinked);
+               DEBUG_REQ(D_NET, req, "unlink");
+               goto out_wake;
+       }
+
+       if (ev->mlength < ev->rlength ) {
+               CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req,
+                      req->rq_replen, ev->rlength, ev->offset);
+               req->rq_reply_truncate = 1;
+               req->rq_replied = 1;
+               req->rq_status = -EOVERFLOW;
+               req->rq_nob_received = ev->rlength + ev->offset;
+               goto out_wake;
+       }
+
+       if ((ev->offset == 0) &&
+           ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) {
+               /* Early reply */
+               DEBUG_REQ(D_ADAPTTO, req,
+                         "Early reply received: mlen=%u offset=%d replen=%d "
+                         "replied=%d unlinked=%d", ev->mlength, ev->offset,
+                         req->rq_replen, req->rq_replied, ev->unlinked);
+
+               req->rq_early_count++; /* number received, client side */
+
+               if (req->rq_replied)   /* already got the real reply */
+                       goto out_wake;
+
+               req->rq_early = 1;
+               req->rq_reply_off = ev->offset;
+               req->rq_nob_received = ev->mlength;
+               /* And we're still receiving */
+               req->rq_receiving_reply = 1;
+       } else {
+               /* Real reply */
+               req->rq_rep_swab_mask = 0;
+               req->rq_replied = 1;
+               req->rq_reply_off = ev->offset;
+               req->rq_nob_received = ev->mlength;
+               /* LNetMDUnlink can't be called under the LNET_LOCK,
+                  so we must unlink in ptlrpc_unregister_reply */
+               DEBUG_REQ(D_INFO, req,
+                         "reply in flags=%x mlen=%u offset=%d replen=%d",
+                         lustre_msg_get_flags(req->rq_reqmsg),
+                         ev->mlength, ev->offset, req->rq_replen);
+       }
+
+       req->rq_import->imp_last_reply_time = cfs_time_current_sec();
+
+out_wake:
+       /* NB don't unlock till after wakeup; req can disappear under us
+        * since we don't have our own ref */
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+       EXIT;
+}
+
+/*
+ * Client's bulk has been written/read
+ */
+void client_bulk_callback (lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id     *cbid = ev->md.user_ptr;
+       struct ptlrpc_bulk_desc *desc = cbid->cbid_arg;
+       struct ptlrpc_request   *req;
+       ENTRY;
+
+       LASSERT ((desc->bd_type == BULK_PUT_SINK &&
+                 ev->type == LNET_EVENT_PUT) ||
+                (desc->bd_type == BULK_GET_SOURCE &&
+                 ev->type == LNET_EVENT_GET) ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT (ev->unlinked);
+
+       if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+               ev->status = -EIO;
+
+       if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+               ev->status = -EIO;
+
+       CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+              "event type %d, status %d, desc %p\n",
+              ev->type, ev->status, desc);
+
+       spin_lock(&desc->bd_lock);
+       req = desc->bd_req;
+       LASSERT(desc->bd_md_count > 0);
+       desc->bd_md_count--;
+
+       if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) {
+               desc->bd_nob_transferred += ev->mlength;
+               desc->bd_sender = ev->sender;
+       } else {
+               /* start reconnect and resend if network error hit */
+               spin_lock(&req->rq_lock);
+               req->rq_net_err = 1;
+               spin_unlock(&req->rq_lock);
+       }
+
+       if (ev->status != 0)
+               desc->bd_failure = 1;
+
+       /* NB don't unlock till after wakeup; desc can disappear under us
+        * otherwise */
+       if (desc->bd_md_count == 0)
+               ptlrpc_client_wake_req(desc->bd_req);
+
+       spin_unlock(&desc->bd_lock);
+       EXIT;
+}
+
+/*
+ * We will have a per-CPT request history list for each ptlrpc service in
+ * upcoming patches, because we don't want to be serialized by the current
+ * per-service history operations. So we require that a history ID can
+ * (somehow) show arrival order without grabbing a global lock, and that
+ * users can sort them in userspace.
+ *
+ * This is how we generate history ID for ptlrpc_request:
+ * ----------------------------------------------------
+ * |  32 bits  |  16 bits  | (16 - X)bits  |  X bits  |
+ * ----------------------------------------------------
+ * |  seconds  | usec / 16 |   sequence    | CPT id   |
+ * ----------------------------------------------------
+ *
+ * it might not be precise but should be good enough.
+ */
+
+#define REQS_CPT_BITS(svcpt)   ((svcpt)->scp_service->srv_cpt_bits)
+
+#define REQS_SEC_SHIFT         32
+#define REQS_USEC_SHIFT                16
+#define REQS_SEQ_SHIFT(svcpt)  REQS_CPT_BITS(svcpt)
+
+static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt,
+                                  struct ptlrpc_request *req)
+{
+       __u64   sec = req->rq_arrival_time.tv_sec;
+       __u32   usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */
+       __u64   new_seq;
+
+       /* set the sequence ID for the request and add it to the history list;
+        * this must be called with svcpt::scp_lock held */
+
+       new_seq = (sec << REQS_SEC_SHIFT) |
+                 (usec << REQS_USEC_SHIFT) |
+                 (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt);
+
+       if (new_seq > svcpt->scp_hist_seq) {
+               /* This handles the initial case of scp_hist_seq == 0 or
+                * we just jumped into a new time window */
+               svcpt->scp_hist_seq = new_seq;
+       } else {
+               LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT);
+               /* NB: increase sequence number in current usec bucket,
+                * however, it's possible that we used up all bits for
+                * sequence and jumped into the next usec bucket (future time),
+                * then we hope there will be fewer RPCs per bucket at some
+                * point, and the sequence will catch up again */
+               svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt));
+               new_seq = svcpt->scp_hist_seq;
+       }
+
+       req->rq_history_seq = new_seq;
+
+       list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs);
+}
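
A standalone sketch of the packing described above, with cpt_bits playing the role of X (the example values are illustrative):

#include <stdint.h>

static uint64_t hist_id(uint64_t sec, uint32_t usec,
			uint32_t seq, uint32_t cpt, unsigned int cpt_bits)
{
	return (sec << 32) |                    /* 32 bits of seconds */
	       ((uint64_t)(usec >> 4) << 16) |  /* 16 bits of usec / 16 */
	       ((uint64_t)seq << cpt_bits) |    /* (16 - X)-bit sequence */
	       cpt;                             /* X-bit CPT id */
}

/* e.g. hist_id(1371495420, 8000, 3, 2, 4): numeric comparison of such
 * ids orders requests by arrival second, then by usec bucket, then by
 * per-bucket sequence, matching the diagram above. */
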
+
+/*
+ * Server's incoming request callback
+ */
+void request_in_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id               *cbid = ev->md.user_ptr;
+       struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg;
+       struct ptlrpc_service_part        *svcpt = rqbd->rqbd_svcpt;
+       struct ptlrpc_service        *service = svcpt->scp_service;
+       struct ptlrpc_request        *req;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_PUT ||
+                ev->type == LNET_EVENT_UNLINK);
+       LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer);
+       LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <=
+                rqbd->rqbd_buffer + service->srv_buf_size);
+
+       CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
+              "event type %d, status %d, service %s\n",
+              ev->type, ev->status, service->srv_name);
+
+       if (ev->unlinked) {
+               /* If this is the last request message to fit in the
+                * request buffer we can use the request object embedded in
+                * rqbd.  Note that if we failed to allocate a request,
+                * we'd have to re-post the rqbd, which we can't do in this
+                * context. */
+               req = &rqbd->rqbd_req;
+               memset(req, 0, sizeof (*req));
+       } else {
+               LASSERT (ev->type == LNET_EVENT_PUT);
+               if (ev->status != 0) {
+                       /* We moaned above already... */
+                       return;
+               }
+               OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY);
+               if (req == NULL) {
+                       CERROR("Can't allocate incoming request descriptor: "
+                              "Dropping %s RPC from %s\n",
+                              service->srv_name,
+                              libcfs_id2str(ev->initiator));
+                       return;
+               }
+       }
+
+       /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL,
+        * flags are reset and scalars are zero.  We only set the message
+        * size to non-zero if this was a successful receive. */
+       req->rq_xid = ev->match_bits;
+       req->rq_reqbuf = ev->md.start + ev->offset;
+       if (ev->type == LNET_EVENT_PUT && ev->status == 0)
+               req->rq_reqdata_len = ev->mlength;
+       do_gettimeofday(&req->rq_arrival_time);
+       req->rq_peer = ev->initiator;
+       req->rq_self = ev->target.nid;
+       req->rq_rqbd = rqbd;
+       req->rq_phase = RQ_PHASE_NEW;
+       spin_lock_init(&req->rq_lock);
+       INIT_LIST_HEAD(&req->rq_timed_list);
+       INIT_LIST_HEAD(&req->rq_exp_list);
+       atomic_set(&req->rq_refcount, 1);
+       if (ev->type == LNET_EVENT_PUT)
+               CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n",
+                      req, req->rq_xid, ev->mlength);
+
+       CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+
+       spin_lock(&svcpt->scp_lock);
+
+       ptlrpc_req_add_history(svcpt, req);
+
+       if (ev->unlinked) {
+               svcpt->scp_nrqbds_posted--;
+               CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n",
+                      svcpt->scp_nrqbds_posted);
+
+               /* Normally, don't complain about 0 buffers posted; LNET won't
+                * drop incoming reqs since we set the portal lazy */
+               if (test_req_buffer_pressure &&
+                   ev->type != LNET_EVENT_UNLINK &&
+                   svcpt->scp_nrqbds_posted == 0)
+                       CWARN("All %s request buffers busy\n",
+                             service->srv_name);
+
+               /* req takes over the network's ref on rqbd */
+       } else {
+               /* req takes a ref on rqbd */
+               rqbd->rqbd_refcount++;
+       }
+
+       list_add_tail(&req->rq_list, &svcpt->scp_req_incoming);
+       svcpt->scp_nreqs_incoming++;
+
+       /* NB everything can disappear under us once the request
+        * has been queued and we unlock, so do the wake now... */
+       wake_up(&svcpt->scp_waitq);
+
+       spin_unlock(&svcpt->scp_lock);
+       EXIT;
+}
+
+/*
+ *  Server's outgoing reply callback
+ */
+void reply_out_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id       *cbid = ev->md.user_ptr;
+       struct ptlrpc_reply_state *rs = cbid->cbid_arg;
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+       ENTRY;
+
+       LASSERT (ev->type == LNET_EVENT_SEND ||
+                ev->type == LNET_EVENT_ACK ||
+                ev->type == LNET_EVENT_UNLINK);
+
+       if (!rs->rs_difficult) {
+               /* 'Easy' replies have no further processing so I drop the
+                * net's ref on 'rs' */
+               LASSERT (ev->unlinked);
+               ptlrpc_rs_decref(rs);
+               EXIT;
+               return;
+       }
+
+       LASSERT (rs->rs_on_net);
+
+       if (ev->unlinked) {
+               /* Last network callback. The net's ref on 'rs' stays put
+                * until ptlrpc_handle_rs() is done with it */
+               spin_lock(&svcpt->scp_rep_lock);
+               spin_lock(&rs->rs_lock);
+
+               rs->rs_on_net = 0;
+               if (!rs->rs_no_ack ||
+                   rs->rs_transno <=
+                   rs->rs_export->exp_obd->obd_last_committed)
+                       ptlrpc_schedule_difficult_reply(rs);
+
+               spin_unlock(&rs->rs_lock);
+               spin_unlock(&svcpt->scp_rep_lock);
+       }
+       EXIT;
+}
+
+
+static void ptlrpc_master_callback(lnet_event_t *ev)
+{
+       struct ptlrpc_cb_id *cbid = ev->md.user_ptr;
+       void (*callback)(lnet_event_t *ev) = cbid->cbid_fn;
+
+       /* Honestly, it's best to find out early. */
+       LASSERT (cbid->cbid_arg != LP_POISON);
+       LASSERT (callback == request_out_callback ||
+                callback == reply_in_callback ||
+                callback == client_bulk_callback ||
+                callback == request_in_callback ||
+                callback == reply_out_callback
+                );
+
+       callback (ev);
+}
+
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid,
+                        lnet_process_id_t *peer, lnet_nid_t *self)
+{
+       int            best_dist = 0;
+       __u32        best_order = 0;
+       int            count = 0;
+       int            rc = -ENOENT;
+       int            portals_compatibility;
+       int            dist;
+       __u32        order;
+       lnet_nid_t      dst_nid;
+       lnet_nid_t      src_nid;
+
+       portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL);
+
+       peer->pid = LUSTRE_SRV_LNET_PID;
+
+       /* Choose the matching UUID that's closest */
+       while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) {
+               dist = LNetDist(dst_nid, &src_nid, &order);
+               if (dist < 0)
+                       continue;
+
+               if (dist == 0) {                /* local! use loopback LND */
+                       peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0);
+                       rc = 0;
+                       break;
+               }
+
+               if (rc < 0 ||
+                   dist < best_dist ||
+                   (dist == best_dist && order < best_order)) {
+                       best_dist = dist;
+                       best_order = order;
+
+                       if (portals_compatibility > 1) {
+                               /* Strong portals compatibility: Zero the nid's
+                                * NET, so if I'm reading new config logs, or
+                                * getting configured by (new) lconf I can
+                                * still talk to old servers. */
+                               dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid));
+                               src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid));
+                       }
+                       peer->nid = dst_nid;
+                       *self = src_nid;
+                       rc = 0;
+               }
+       }
+
+       CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+       return rc;
+}
+
+void ptlrpc_ni_fini(void)
+{
+       wait_queue_head_t        waitq;
+       struct l_wait_info  lwi;
+       int              rc;
+       int              retries;
+
+       /* Wait for the event queue to become idle since there may still be
+        * messages in flight with pending events (i.e. the fire-and-forget
+        * messages == client requests and "non-difficult" server
+        * replies) */
+
+       for (retries = 0;; retries++) {
+               rc = LNetEQFree(ptlrpc_eq_h);
+               switch (rc) {
+               default:
+                       LBUG();
+
+               case 0:
+                       LNetNIFini();
+                       return;
+
+               case -EBUSY:
+                       if (retries != 0)
+                               CWARN("Event queue still busy\n");
+
+                       /* Wait for a bit */
+                       init_waitqueue_head(&waitq);
+                       lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL);
+                       l_wait_event(waitq, 0, &lwi);
+                       break;
+               }
+       }
+       /* notreached */
+}
+
+lnet_pid_t ptl_get_pid(void)
+{
+       lnet_pid_t      pid;
+
+       pid = LUSTRE_SRV_LNET_PID;
+       return pid;
+}
+
+int ptlrpc_ni_init(void)
+{
+       int           rc;
+       lnet_pid_t       pid;
+
+       pid = ptl_get_pid();
+       CDEBUG(D_NET, "My pid is: %x\n", pid);
+
+       /* We're not passing any limits yet... */
+       rc = LNetNIInit(pid);
+       if (rc < 0) {
+               CDEBUG (D_NET, "Can't init network interface: %d\n", rc);
+               return (-ENOENT);
+       }
+
+       /* CAVEAT EMPTOR: how we process portals events is _radically_
+        * different depending on... */
+       /* kernel LNet calls our master callback when there are new events;
+        * because we are guaranteed to get every event via callback,
+        * we just set the EQ size to 0 to avoid the overhead of serializing
+        * enqueue/dequeue operations in LNet. */
+       rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h);
+       if (rc == 0)
+               return 0;
+
+       CERROR ("Failed to allocate event queue: %d\n", rc);
+       LNetNIFini();
+
+       return (-ENOMEM);
+}
+
+
+int ptlrpc_init_portals(void)
+{
+       int   rc = ptlrpc_ni_init();
+
+       if (rc != 0) {
+               CERROR("network initialisation failed\n");
+               return -EIO;
+       }
+       rc = ptlrpcd_addref();
+       if (rc == 0)
+               return 0;
+
+       CERROR("rpcd initialisation failed\n");
+       ptlrpc_ni_fini();
+       return rc;
+}
+
+void ptlrpc_exit_portals(void)
+{
+       ptlrpcd_decref();
+       ptlrpc_ni_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile
new file mode 100644 (file)
index 0000000..8cdfbee
--- /dev/null
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LUSTRE_FS) := ptlrpc_gss.o
+
+ptlrpc_gss-y := sec_gss.o gss_bulk.o gss_cli_upcall.o gss_svc_upcall.o \
+               gss_rawobj.o lproc_gss.o gss_generic_token.o            \
+               gss_mech_switch.o gss_krb5_mech.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h
new file mode 100644 (file)
index 0000000..feac604
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ */
+
+#ifndef __PTLRPC_GSS_GSS_API_H_
+#define __PTLRPC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+       struct gss_api_mech    *mech_type;
+       void               *internal_ctx_id;
+};
+
+#define GSS_C_NO_BUFFER         ((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT       ((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID   ((rawobj_t) 0)
+
+/*
+ * gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744.
+ */
+__u32 lgss_import_sec_context(
+               rawobj_t                *input_token,
+               struct gss_api_mech     *mech,
+               struct gss_ctx   **ctx);
+__u32 lgss_copy_reverse_context(
+               struct gss_ctx    *ctx,
+               struct gss_ctx   **ctx_new);
+__u32 lgss_inquire_context(
+               struct gss_ctx    *ctx,
+               unsigned long      *endtime);
+__u32 lgss_get_mic(
+               struct gss_ctx    *ctx,
+               int                   msgcnt,
+               rawobj_t                *msgs,
+               int                   iovcnt,
+               lnet_kiov_t          *iovs,
+               rawobj_t                *mic_token);
+__u32 lgss_verify_mic(
+               struct gss_ctx    *ctx,
+               int                   msgcnt,
+               rawobj_t                *msgs,
+               int                   iovcnt,
+               lnet_kiov_t          *iovs,
+               rawobj_t                *mic_token);
+__u32 lgss_wrap(
+               struct gss_ctx    *ctx,
+               rawobj_t                *gsshdr,
+               rawobj_t                *msg,
+               int                   msg_buflen,
+               rawobj_t                *out_token);
+__u32 lgss_unwrap(
+               struct gss_ctx    *ctx,
+               rawobj_t                *gsshdr,
+               rawobj_t                *token,
+               rawobj_t                *out_msg);
+__u32 lgss_prep_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc);
+__u32 lgss_wrap_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc,
+               rawobj_t                *token,
+               int                   adj_nob);
+__u32 lgss_unwrap_bulk(
+               struct gss_ctx    *gctx,
+               struct ptlrpc_bulk_desc *desc,
+               rawobj_t                *token,
+               int                   adj_nob);
+__u32 lgss_delete_sec_context(
+               struct gss_ctx   **ctx);
+int lgss_display(
+               struct gss_ctx    *ctx,
+               char                *buf,
+               int                   bufsize);
+
+struct subflavor_desc {
+       __u32      sf_subflavor;
+       __u32      sf_qop;
+       __u32      sf_service;
+       char       *sf_name;
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+       struct list_head              gm_list;
+       module_t           *gm_owner;
+       char               *gm_name;
+       rawobj_t                gm_oid;
+       atomic_t            gm_count;
+       struct gss_api_ops     *gm_ops;
+       int                  gm_sf_num;
+       struct subflavor_desc  *gm_sfs;
+};
+
+/* and must provide the following operations: */
+struct gss_api_ops {
+       __u32 (*gss_import_sec_context)(
+                       rawobj_t               *input_token,
+                       struct gss_ctx   *ctx);
+       __u32 (*gss_copy_reverse_context)(
+                       struct gss_ctx   *ctx,
+                       struct gss_ctx   *ctx_new);
+       __u32 (*gss_inquire_context)(
+                       struct gss_ctx   *ctx,
+                       unsigned long     *endtime);
+       __u32 (*gss_get_mic)(
+                       struct gss_ctx   *ctx,
+                       int                  msgcnt,
+                       rawobj_t               *msgs,
+                       int                  iovcnt,
+                       lnet_kiov_t         *iovs,
+                       rawobj_t               *mic_token);
+       __u32 (*gss_verify_mic)(
+                       struct gss_ctx   *ctx,
+                       int                  msgcnt,
+                       rawobj_t               *msgs,
+                       int                  iovcnt,
+                       lnet_kiov_t         *iovs,
+                       rawobj_t               *mic_token);
+       __u32 (*gss_wrap)(
+                       struct gss_ctx   *ctx,
+                       rawobj_t               *gsshdr,
+                       rawobj_t               *msg,
+                       int                  msg_buflen,
+                       rawobj_t               *out_token);
+       __u32 (*gss_unwrap)(
+                       struct gss_ctx   *ctx,
+                       rawobj_t               *gsshdr,
+                       rawobj_t               *token,
+                       rawobj_t               *out_msg);
+       __u32 (*gss_prep_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc);
+       __u32 (*gss_wrap_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc,
+                       rawobj_t               *token,
+                       int                  adj_nob);
+       __u32 (*gss_unwrap_bulk)(
+                       struct gss_ctx   *gctx,
+                       struct ptlrpc_bulk_desc *desc,
+                       rawobj_t               *token,
+                       int                  adj_nob);
+       void (*gss_delete_sec_context)(
+                       void               *ctx);
+       int  (*gss_display)(
+                       struct gss_ctx   *ctx,
+                       char               *buf,
+                       int                  bufsize);
+};
+
+int lgss_mech_register(struct gss_api_mech *mech);
+void lgss_mech_unregister(struct gss_api_mech *mech);
+
+struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid);
+struct gss_api_mech * lgss_name_to_mech(char *name);
+struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor);
+
+struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech);
+void lgss_mech_put(struct gss_api_mech *mech);
+
+#endif /* __PTLRPC_GSS_GSS_API_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h
new file mode 100644 (file)
index 0000000..c70eb00
--- /dev/null
@@ -0,0 +1,84 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME                    (-2045022976L)
+#define G_BAD_STRING_UID                        (-2045022975L)
+#define G_NOUSER                                (-2045022974L)
+#define G_VALIDATE_FAILED                      (-2045022973L)
+#define G_BUFFER_ALLOC                    (-2045022972L)
+#define G_BAD_MSG_CTX                      (-2045022971L)
+#define G_WRONG_SIZE                        (-2045022970L)
+#define G_BAD_USAGE                          (-2045022969L)
+#define G_UNKNOWN_QOP                      (-2045022968L)
+#define G_NO_HOSTNAME                      (-2045022967L)
+#define G_BAD_HOSTNAME                    (-2045022966L)
+#define G_WRONG_MECH                        (-2045022965L)
+#define G_BAD_TOK_HEADER                        (-2045022964L)
+#define G_BAD_DIRECTION                          (-2045022963L)
+#define G_TOK_TRUNC                          (-2045022962L)
+#define G_REFLECT                              (-2045022961L)
+#define G_WRONG_TOKID                      (-2045022960L)
+
+#define g_OID_equal(o1,o2) \
+   (((o1)->len == (o2)->len) && \
+    (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+__u32 g_verify_token_header(rawobj_t *mech,
+                           int *body_size,
+                           unsigned char **buf_in,
+                           int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech,
+                    rawobj_t *in_buf);
+
+int g_token_size(rawobj_t *mech,
+                unsigned int body_size);
+
+void g_make_token_header(rawobj_t *mech,
+                        int body_size,
+                        unsigned char **buf);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c
new file mode 100644 (file)
index 0000000..ed95bbb
--- /dev/null
@@ -0,0 +1,512 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_bulk.c
+ *
+ * Author: Eric Mei <eric.mei@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_cli_ctx            *gctx;
+       struct lustre_msg              *msg;
+       struct ptlrpc_bulk_sec_desc     *bsd;
+       rawobj_t                         token;
+       __u32                       maj;
+       int                           offset;
+       int                           rc;
+       ENTRY;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+       LASSERT(gctx->gc_mechctx);
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+               LASSERT(req->rq_reqbuf->lm_bufcount >= 3);
+               msg = req->rq_reqbuf;
+               offset = msg->lm_bufcount - 1;
+               break;
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(req->rq_reqbuf->lm_bufcount >= 4);
+               msg = req->rq_reqbuf;
+               offset = msg->lm_bufcount - 2;
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(req->rq_clrbuf->lm_bufcount >= 2);
+               msg = req->rq_clrbuf;
+               offset = msg->lm_bufcount - 1;
+               break;
+       default:
+               LBUG();
+       }
+
+       bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+       bsd->bsd_version = 0;
+       bsd->bsd_flags = 0;
+       bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+       if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               RETURN(0);
+
+       LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+               bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+       if (req->rq_bulk_read) {
+               /*
+                * bulk read: prepare receiving pages only for privacy mode.
+                */
+               if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                       return gss_cli_prep_bulk(req, desc);
+       } else {
+               /*
+                * bulk write: sign or encrypt bulk pages.
+                */
+               bsd->bsd_nob = desc->bd_nob;
+
+               if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                       /* integrity mode */
+                       token.data = bsd->bsd_data;
+                       token.len = lustre_msg_buflen(msg, offset) -
+                                   sizeof(*bsd);
+
+                       maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL,
+                                          desc->bd_iov_count, desc->bd_iov,
+                                          &token);
+                       if (maj != GSS_S_COMPLETE) {
+                               CWARN("failed to sign bulk data: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               } else {
+                       /* privacy mode */
+                       if (desc->bd_iov_count == 0)
+                               RETURN(0);
+
+                       rc = sptlrpc_enc_pool_get_pages(desc);
+                       if (rc) {
+                               CERROR("bulk write: failed to allocate "
+                                      "encryption pages: %d\n", rc);
+                               RETURN(rc);
+                       }
+
+                       token.data = bsd->bsd_data;
+                       token.len = lustre_msg_buflen(msg, offset) -
+                                   sizeof(*bsd);
+
+                       maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0);
+                       if (maj != GSS_S_COMPLETE) {
+                               CWARN("fail to encrypt bulk data: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               }
+       }
+
+       RETURN(0);
+}
+
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_request *req,
+                           struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_cli_ctx              *gctx;
+       struct lustre_msg               *rmsg, *vmsg;
+       struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+       rawobj_t                         token;
+       __u32                            maj;
+       int                              roff, voff;
+       ENTRY;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 3);
+               voff = vmsg->lm_bufcount - 1;
+
+               rmsg = req->rq_reqbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 3);
+               roff = rmsg->lm_bufcount - 1; /* last segment */
+               break;
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 4);
+               voff = vmsg->lm_bufcount - 2;
+
+               rmsg = req->rq_reqbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 4);
+               roff = rmsg->lm_bufcount - 2; /* second last segment */
+               break;
+       case SPTLRPC_SVC_PRIV:
+               vmsg = req->rq_repdata;
+               LASSERT(vmsg && vmsg->lm_bufcount >= 2);
+               voff = vmsg->lm_bufcount - 1;
+
+               rmsg = req->rq_clrbuf;
+               LASSERT(rmsg && rmsg->lm_bufcount >= 2);
+               roff = rmsg->lm_bufcount - 1; /* last segment */
+               break;
+       default:
+               LBUG();
+       }
+
+       bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr));
+       bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv));
+       LASSERT(bsdr && bsdv);
+
+       if (bsdr->bsd_version != bsdv->bsd_version ||
+           bsdr->bsd_type != bsdv->bsd_type ||
+           bsdr->bsd_svc != bsdv->bsd_svc) {
+               CERROR("bulk security descriptor mismatch: "
+                      "(%u,%u,%u) != (%u,%u,%u)\n",
+                      bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc,
+                      bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc);
+               RETURN(-EPROTO);
+       }
+
+       LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL ||
+               bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG ||
+               bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV);
+
+       /*
+        * in privacy mode, upon successful return, make sure
+        * bd_nob_transferred is the actual size of the clear text;
+        * otherwise the upper layer may be surprised.
+        */
+       if (req->rq_bulk_write) {
+               if (bsdv->bsd_flags & BSD_FL_ERR) {
+                       CERROR("server reported bulk i/o failure\n");
+                       RETURN(-EIO);
+               }
+
+               if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV)
+                       desc->bd_nob_transferred = desc->bd_nob;
+       } else {
+               /*
+                * bulk read: upon successful return, bd_nob_transferred
+                * is the size of the plain text actually received.
+                */
+               gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+               LASSERT(gctx->gc_mechctx);
+
+               if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) {
+                       int i, nob;
+
+                       /* fix the actual data size */
+                       for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+                               if (desc->bd_iov[i].kiov_len + nob >
+                                   desc->bd_nob_transferred) {
+                                       desc->bd_iov[i].kiov_len =
+                                               desc->bd_nob_transferred - nob;
+                               }
+                               nob += desc->bd_iov[i].kiov_len;
+                       }
+
+                       token.data = bsdv->bsd_data;
+                       token.len = lustre_msg_buflen(vmsg, voff) -
+                                   sizeof(*bsdv);
+
+                       maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL,
+                                             desc->bd_iov_count, desc->bd_iov,
+                                             &token);
+                       if (maj != GSS_S_COMPLETE) {
+                               CERROR("failed to verify bulk read: %x\n", maj);
+                               RETURN(-EACCES);
+                       }
+               } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) {
+                       desc->bd_nob = bsdv->bsd_nob;
+                       if (desc->bd_nob == 0)
+                               RETURN(0);
+
+                       token.data = bsdv->bsd_data;
+                       token.len = lustre_msg_buflen(vmsg, voff) -
+                                   sizeof(*bsdv);
+
+                       maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc,
+                                              &token, 1);
+                       if (maj != GSS_S_COMPLETE) {
+                               CERROR("failed to decrypt bulk read: %x\n",
+                                      maj);
+                               RETURN(-EACCES);
+                       }
+
+                       desc->bd_nob_transferred = desc->bd_nob;
+               }
+       }
+
+       RETURN(0);
+}
+
+static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc,
+                        struct gss_ctx *mechctx)
+{
+       int     rc;
+
+       if (desc->bd_iov_count == 0)
+               return 0;
+
+       rc = sptlrpc_enc_pool_get_pages(desc);
+       if (rc)
+               return rc;
+
+       if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE)
+               return -EACCES;
+
+       return 0;
+}
+
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       int          rc;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read);
+
+       if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV)
+               RETURN(0);
+
+       rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx);
+       if (rc)
+               CERROR("bulk read: failed to prepare encryption "
+                      "pages: %d\n", rc);
+
+       RETURN(rc);
+}
+
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx           *grctx;
+       struct ptlrpc_bulk_sec_desc     *bsd;
+       int                              rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_write);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsd = grctx->src_reqbsd;
+       if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)
+               RETURN(0);
+
+       rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx);
+       if (rc)
+               CERROR("bulk write: failed to prepare encryption "
+                      "pages: %d\n", rc);
+
+       RETURN(rc);
+}
+
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx           *grctx;
+       struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+       rawobj_t                         token;
+       __u32                            maj;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_write);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsdr = grctx->src_reqbsd;
+       bsdv = grctx->src_repbsd;
+
+       /* bsdr has been sanity checked during unpacking */
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       switch (bsdv->bsd_svc) {
+       case SPTLRPC_BULK_SVC_INTG:
+               token.data = bsdr->bsd_data;
+               token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+               maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                     desc->bd_iov_count, desc->bd_iov, &token);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to verify bulk signature: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       case SPTLRPC_BULK_SVC_PRIV:
+               if (bsdr->bsd_nob != desc->bd_nob) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("prepared nob %d doesn't match the actual "
+                              "nob %d\n", desc->bd_nob, bsdr->bsd_nob);
+                       RETURN(-EPROTO);
+               }
+
+               if (desc->bd_iov_count == 0) {
+                       LASSERT(desc->bd_nob == 0);
+                       break;
+               }
+
+               token.data = bsdr->bsd_data;
+               token.len = grctx->src_reqbsd_size - sizeof(*bsdr);
+
+               maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                      desc, &token, 0);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed decrypt bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       }
+
+       RETURN(0);
+}
+
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc)
+{
+       struct gss_svc_reqctx           *grctx;
+       struct ptlrpc_bulk_sec_desc     *bsdr, *bsdv;
+       rawobj_t                         token;
+       __u32                            maj;
+       int                              rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_bulk_read);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+
+       LASSERT(grctx->src_reqbsd);
+       LASSERT(grctx->src_repbsd);
+       LASSERT(grctx->src_ctx);
+       LASSERT(grctx->src_ctx->gsc_mechctx);
+
+       bsdr = grctx->src_reqbsd;
+       bsdv = grctx->src_repbsd;
+
+       /* bsdr has been sanity checked during unpacking */
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       switch (bsdv->bsd_svc) {
+       case SPTLRPC_BULK_SVC_INTG:
+               token.data = bsdv->bsd_data;
+               token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+               maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL,
+                                  desc->bd_iov_count, desc->bd_iov, &token);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to sign bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       case SPTLRPC_BULK_SVC_PRIV:
+               bsdv->bsd_nob = desc->bd_nob;
+
+               if (desc->bd_iov_count == 0) {
+                       LASSERT(desc->bd_nob == 0);
+                       break;
+               }
+
+               rc = sptlrpc_enc_pool_get_pages(desc);
+               if (rc) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("bulk read: failed to allocate encryption "
+                              "pages: %d\n", rc);
+                       RETURN(rc);
+               }
+
+               token.data = bsdv->bsd_data;
+               token.len = grctx->src_repbsd_size - sizeof(*bsdv);
+
+               maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx,
+                                    desc, &token, 1);
+               if (maj != GSS_S_COMPLETE) {
+                       bsdv->bsd_flags |= BSD_FL_ERR;
+                       CERROR("failed to encrypt bulk data: %x\n", maj);
+                       RETURN(-EACCES);
+               }
+               break;
+       }
+
+       RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c
new file mode 100644 (file)
index 0000000..142c789
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_cli_upcall.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+/**********************************************
+ * gss context init/fini helper               *
+ **********************************************/
+
+static
+int ctx_init_pack_request(struct obd_import *imp,
+                         struct ptlrpc_request *req,
+                         int lustre_srv,
+                         uid_t uid, gid_t gid,
+                         long token_size,
+                         char __user *token)
+{
+       struct lustre_msg       *msg = req->rq_reqbuf;
+       struct gss_sec    *gsec;
+       struct gss_header       *ghdr;
+       struct ptlrpc_user_desc *pud;
+       __u32              *p, size, offset = 2;
+       rawobj_t                 obj;
+
+       LASSERT(msg->lm_bufcount <= 4);
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_cli_ctx->cc_sec);
+
+       /* gss hdr */
+       ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr));
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) imp->imp_sec->ps_part;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_INIT;
+       ghdr->gh_seq = 0;
+       ghdr->gh_svc = SPTLRPC_SVC_NULL;
+       ghdr->gh_handle.len = 0;
+
+       /* fix the user desc */
+       if (req->rq_pack_udesc) {
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+               pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+               LASSERT(pud);
+               pud->pud_uid = pud->pud_fsuid = uid;
+               pud->pud_gid = pud->pud_fsgid = gid;
+               pud->pud_cap = 0;
+               pud->pud_ngroups = 0;
+               offset++;
+       }
+
+       /* security payload */
+       p = lustre_msg_buf(msg, offset, 0);
+       size = msg->lm_buflens[offset];
+       LASSERT(p);
+
+       /* 1. lustre svc type */
+       LASSERT(size > 4);
+       *p++ = cpu_to_le32(lustre_srv);
+       size -= 4;
+
+       /* 2. target uuid */
+       obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1;
+       obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid;
+       if (rawobj_serialize(&obj, &p, &size))
+               LBUG();
+
+       /* 3. reverse context handle. actually only needed by the root
+        *    user, but we send it anyway. */
+       gsec = sec2gsec(req->rq_cli_ctx->cc_sec);
+       obj.len = sizeof(gsec->gs_rvs_hdl);
+       obj.data = (__u8 *) &gsec->gs_rvs_hdl;
+       if (rawobj_serialize(&obj, &p, &size))
+               LBUG();
+
+       /* 4. now the token */
+       LASSERT(size >= (sizeof(__u32) + token_size));
+       *p++ = cpu_to_le32((__u32) token_size);
+       if (copy_from_user(p, token, token_size)) {
+               CERROR("can't copy token\n");
+               return -EFAULT;
+       }
+       size -= sizeof(__u32) + cfs_size_round4(token_size);
+
+       req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset,
+                                               msg->lm_buflens[offset] - size, 0);
+       return 0;
+}
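+
+/*
+ * Illustration of the security payload packed above (an assumption,
+ * not from the original code: this presumes rawobj_serialize() writes
+ * a 4-byte length followed by the data rounded up to a multiple of 4):
+ *
+ *     [ lustre svc type (4) ]
+ *     [ uuid len (4) ][ target uuid, rounded ]
+ *     [ hdl len (4) ][ reverse ctx handle, rounded ]
+ *     [ token len (4) ][ token, rounded ]
+ */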
+
+static
+int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed,
+                        char __user *outbuf, long outlen)
+{
+       struct gss_rep_header   *ghdr;
+       __u32                    obj_len, round_len;
+       __u32                    status, effective = 0;
+
+       if (msg->lm_bufcount != 3) {
+               CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+               return -EPROTO;
+       }
+
+       ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("unable to extract gss reply header\n");
+               return -EPROTO;
+       }
+
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("invalid gss version %u\n", ghdr->gh_version);
+               return -EPROTO;
+       }
+
+       if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) +
+                    cfs_size_round4(msg->lm_buflens[2])) {
+               CERROR("output buffer size %ld too small\n", outlen);
+               return -EFAULT;
+       }
+
+       status = 0;
+       effective = 0;
+
+       if (copy_to_user(outbuf, &status, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_major, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_minor, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4))
+               return -EFAULT;
+       outbuf += 4;
+       effective += 4 * 4;
+
+       /* handle */
+       obj_len = ghdr->gh_handle.len;
+       round_len = (obj_len + 3) & ~ 3;
+       if (copy_to_user(outbuf, &obj_len, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len))
+               return -EFAULT;
+       outbuf += round_len;
+       effective += 4 + round_len;
+
+       /* out token */
+       obj_len = msg->lm_buflens[2];
+       round_len = (obj_len + 3) & ~ 3;
+       if (copy_to_user(outbuf, &obj_len, 4))
+               return -EFAULT;
+       outbuf += 4;
+       if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len))
+               return -EFAULT;
+       outbuf += round_len;
+       effective += 4 + round_len;
+
+       return effective;
+}
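+
+/*
+ * For reference, the user-space buffer filled in above ends up laid
+ * out as:
+ *
+ *     [ status (4) ][ gh_major (4) ][ gh_minor (4) ][ gh_seqwin (4) ]
+ *     [ handle len (4) ][ handle, rounded to 4 ]
+ *     [ token len (4) ][ out token, rounded to 4 ]
+ *
+ * and the return value ("effective") is the number of bytes written.
+ */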
+
+/* XXX move to where lgssd could see */
+struct lgssd_ioctl_param {
+       int             version;        /* in   */
+       int             secid;          /* in   */
+       char           *uuid;           /* in   */
+       int             lustre_svc;     /* in   */
+       uid_t           uid;            /* in   */
+       gid_t           gid;            /* in   */
+       long            send_token_size;/* in   */
+       char           *send_token;     /* in   */
+       long            reply_buf_size; /* in   */
+       char           *reply_buf;      /* in   */
+       long            status;         /* out  */
+       long            reply_length;   /* out  */
+};
+
+int gss_do_ctx_init_rpc(char __user *buffer, unsigned long count)
+{
+       struct obd_import        *imp;
+       struct ptlrpc_request    *req;
+       struct lgssd_ioctl_param  param;
+       struct obd_device        *obd;
+       char                      obdname[64];
+       long                      lsize;
+       int                       rc;
+
+       if (count != sizeof(param)) {
+               CERROR("ioctl size %lu, expect %lu, please check lgss_keyring "
+                      "version\n", count, (unsigned long) sizeof(param));
+               RETURN(-EINVAL);
+       }
+       if (copy_from_user(&param, buffer, sizeof(param))) {
+               CERROR("failed copy data from lgssd\n");
+               RETURN(-EFAULT);
+       }
+
+       if (param.version != GSSD_INTERFACE_VERSION) {
+               CERROR("gssd interface version %d (expect %d)\n",
+                       param.version, GSSD_INTERFACE_VERSION);
+               RETURN(-EINVAL);
+       }
+
+       /* take name */
+       if (strncpy_from_user(obdname, param.uuid, sizeof(obdname)) <= 0) {
+               CERROR("Invalid obdname pointer\n");
+               RETURN(-EFAULT);
+       }
+
+       obd = class_name2obd(obdname);
+       if (!obd) {
+               CERROR("no such obd %s\n", obdname);
+               RETURN(-EINVAL);
+       }
+
+       if (unlikely(!obd->obd_set_up)) {
+               CERROR("obd %s not setup\n", obdname);
+               RETURN(-EINVAL);
+       }
+
+       spin_lock(&obd->obd_dev_lock);
+       if (obd->obd_stopping) {
+               CERROR("obd %s has stopped\n", obdname);
+               spin_unlock(&obd->obd_dev_lock);
+               RETURN(-EINVAL);
+       }
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) &&
+           strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) {
+               CERROR("obd %s is not a client device\n", obdname);
+               spin_unlock(&obd->obd_dev_lock);
+               RETURN(-EINVAL);
+       }
+       spin_unlock(&obd->obd_dev_lock);
+
+       down_read(&obd->u.cli.cl_sem);
+       if (obd->u.cli.cl_import == NULL) {
+               CERROR("obd %s: import has gone\n", obd->obd_name);
+               up_read(&obd->u.cli.cl_sem);
+               RETURN(-EINVAL);
+       }
+       imp = class_import_get(obd->u.cli.cl_import);
+       up_read(&obd->u.cli.cl_sem);
+
+       if (imp->imp_deactive) {
+               CERROR("import has been deactivated\n");
+               class_import_put(imp);
+               RETURN(-EINVAL);
+       }
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION,
+                                       SEC_CTX_INIT);
+       if (req == NULL) {
+               param.status = -ENOMEM;
+               goto out_copy;
+       }
+
+       if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) {
+               CWARN("original secid %d, now has changed to %d, "
+                     "cancel this negotiation\n", param.secid,
+                     req->rq_cli_ctx->cc_sec->ps_id);
+               param.status = -EINVAL;
+               goto out_copy;
+       }
+
+       /* get token */
+       rc = ctx_init_pack_request(imp, req,
+                                  param.lustre_svc,
+                                  param.uid, param.gid,
+                                  param.send_token_size,
+                                  param.send_token);
+       if (rc) {
+               param.status = rc;
+               goto out_copy;
+       }
+
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc) {
+               /* If any _real_ denial is made, we expect the server to
+                * return an -EACCES reply, or to return success but
+                * indicate a gss error inside the reply message. All
+                * other errors are treated as timeouts; the caller might
+                * retry the negotiation repeatedly, leaving recovery
+                * decisions to the general ptlrpc layer.
+                *
+                * FIXME maybe some other error codes shouldn't be
+                * treated as timeouts. */
+               param.status = rc;
+               if (rc != -EACCES)
+                       param.status = -ETIMEDOUT;
+               goto out_copy;
+       }
+
+       LASSERT(req->rq_repdata);
+       lsize = ctx_init_parse_reply(req->rq_repdata,
+                                    ptlrpc_rep_need_swab(req),
+                                    param.reply_buf, param.reply_buf_size);
+       if (lsize < 0) {
+               param.status = (int) lsize;
+               goto out_copy;
+       }
+
+       param.status = 0;
+       param.reply_length = lsize;
+
+out_copy:
+       if (copy_to_user(buffer, &param, sizeof(param)))
+               rc = -EFAULT;
+       else
+               rc = 0;
+
+       class_import_put(imp);
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
+
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx)
+{
+       struct ptlrpc_cli_ctx   *ctx = &gctx->gc_base;
+       struct obd_import       *imp = ctx->cc_sec->ps_import;
+       struct ptlrpc_request   *req;
+       struct ptlrpc_user_desc *pud;
+       int                      rc;
+       ENTRY;
+
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) {
+               CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, "
+                      "don't send destroy rpc\n", ctx,
+                      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+               RETURN(0);
+       }
+
+       might_sleep();
+
+       CWARN("%s ctx %p idx "LPX64" (%u->%s)\n",
+             sec_is_reverse(ctx->cc_sec) ?
+             "server finishing reverse" : "client finishing forward",
+             ctx, gss_handle_to_u64(&gctx->gc_handle),
+             ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+       gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX);
+       if (req == NULL) {
+               CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n",
+                     ctx, ctx->cc_vcred.vc_uid);
+               GOTO(out, rc = -ENOMEM);
+       }
+
+       rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI,
+                                     NULL, ctx);
+       if (rc) {
+               ptlrpc_request_free(req);
+               GOTO(out_ref, rc);
+       }
+
+       /* fix the user desc */
+       if (req->rq_pack_udesc) {
+               /* we rely on the fact that this request is in AUTH mode,
+                * with the user_desc at offset 2. */
+               pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud));
+               LASSERT(pud);
+               pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid;
+               pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid;
+               pud->pud_cap = 0;
+               pud->pud_ngroups = 0;
+       }
+
+       req->rq_phase = RQ_PHASE_RPC;
+       rc = ptl_send_rpc(req, 1);
+       if (rc)
+               CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx,
+                     ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc);
+
+out_ref:
+       ptlrpc_req_finished(req);
+out:
+       RETURN(rc);
+}
+
+int __init gss_init_cli_upcall(void)
+{
+       return 0;
+}
+
+void __exit gss_exit_cli_upcall(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h
new file mode 100644 (file)
index 0000000..1342579
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __PTLRPC_GSS_GSS_ERR_H_
+#define __PTLRPC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG        (1)
+#define GSS_C_MUTUAL_FLAG       (2)
+#define GSS_C_REPLAY_FLAG       (4)
+#define GSS_C_SEQUENCE_FLAG     (8)
+#define GSS_C_CONF_FLAG         (16)
+#define GSS_C_INTEG_FLAG        (32)
+#define GSS_C_ANON_FLAG         (64)
+#define GSS_C_PROT_READY_FLAG   (128)
+#define GSS_C_TRANS_FLAG        (256)
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH              (0)
+#define GSS_C_INITIATE          (1)
+#define GSS_C_ACCEPT            (2)
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE          (1)
+#define GSS_C_MECH_CODE         (2)
+
+
+/*
+ * Define the default Quality of Protection for per-message services.  Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value.  However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT       (0)
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE       ((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE   (0)
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ */
+#define GSS_C_CALLING_ERROR_OFFSET      (24)
+#define GSS_C_ROUTINE_ERROR_OFFSET      (16)
+#define GSS_C_SUPPLEMENTARY_OFFSET      (0)
+#define GSS_C_CALLING_ERROR_MASK        ((OM_uint32) 0377ul)
+#define GSS_C_ROUTINE_ERROR_MASK        ((OM_uint32) 0377ul)
+#define GSS_C_SUPPLEMENTARY_MASK        ((OM_uint32) 0177777ul)
+
+/*
+ * The macros that test status codes for error conditions.  Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+  ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+  ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+  ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+  ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+         (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
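+
+/*
+ * A worked example (illustrative): GSS_S_FAILURE below is routine
+ * error 13, i.e. ((OM_uint32) 13ul) << 16 == 0x000d0000, so for
+ * x == GSS_S_FAILURE:
+ *
+ *     GSS_ERROR(x)               -> nonzero (an error)
+ *     GSS_ROUTINE_ERROR_FIELD(x) -> 13
+ *     GSS_CALLING_ERROR_FIELD(x) -> 0
+ *     GSS_SUPPLEMENTARY_INFO(x)  -> 0
+ */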
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+       (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+       (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+       (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH \
+       (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME \
+       (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE \
+       (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS \
+       (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS \
+       (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG \
+       (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED \
+       (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT \
+       (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN \
+       (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+       (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+       (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+       (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE \
+       (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP \
+       (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED \
+       (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE \
+       (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+       (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+       (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN   (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN         (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN       (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN         (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings!  (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+       (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+       (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+       (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __PTLRPC_GSS_GSS_ERR_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c
new file mode 100644 (file)
index 0000000..20b1638
--- /dev/null
@@ -0,0 +1,285 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len)                      \
+       do {                                            \
+               memcpy((ptr), (char *) (str), (len));   \
+               (ptr) += (len);                         \
+       } while (0)
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60                           tag for APPLICATION 0, SEQUENCE
+                                       (constructed, definite-length)
+       <length>                possible multiple bytes, need to parse/generate
+       0x06                    tag for OBJECT IDENTIFIER
+               <moid_length>   compile-time constant string (assume 1 byte)
+               <moid_bytes>    compile-time constant string
+       <inner_bytes>           the ANY containing the application token
+                                       bytes 0,1 are the token type
+                                       bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
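+
+/* An illustrative instance (not from the original code): with a 9 byte
+ * mech OID and a body_size of 100, g_make_token_header() below calls
+ * der_write_length() with 4 + 9 + 100 = 113 (< 128, one length octet),
+ * so the emitted header begins 0x60 0x71 0x06 0x09 <9 OID bytes>.
+ */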
+
+static
+int der_length_size(int length)
+{
+       if (length < (1 << 7))
+               return 1;
+       else if (length < (1 << 8))
+               return 2;
+#if (SIZEOF_INT == 2)
+       else
+               return 3;
+#else
+       else if (length < (1 << 16))
+               return 3;
+       else if (length < (1 << 24))
+               return 4;
+       else
+               return 5;
+#endif
+}
+
+static
+void der_write_length(unsigned char **buf, int length)
+{
+       if (length < (1 << 7)) {
+               *(*buf)++ = (unsigned char) length;
+       } else {
+               *(*buf)++ = (unsigned char) (der_length_size(length) + 127);
+#if (SIZEOF_INT > 2)
+               if (length >= (1 << 24))
+                       *(*buf)++ = (unsigned char) (length >> 24);
+               if (length >= (1 << 16))
+                       *(*buf)++ = (unsigned char) ((length >> 16) & 0xff);
+#endif
+               if (length >= (1 << 8))
+                       *(*buf)++ = (unsigned char) ((length >> 8) & 0xff);
+               *(*buf)++ = (unsigned char) (length & 0xff);
+       }
+}
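+
+/*
+ * Illustrative encodings of the above: a length of 0x45 fits the short
+ * form and is written as the single octet 0x45; a length of 0x1234
+ * needs two content octets, so der_length_size() returns 3 and
+ * der_write_length() emits 0x82 0x12 0x34 (0x82 == 0x80 | 2, the long
+ * form announcing 2 length octets).
+ */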
+
+/*
+ * returns decoded length, or < 0 on failure.  Advances buf and
+ * decrements bufsize
+ */
+static
+int der_read_length(unsigned char **buf, int *bufsize)
+{
+       unsigned char sf;
+       int ret;
+
+       if (*bufsize < 1)
+               return -1;
+       sf = *(*buf)++;
+       (*bufsize)--;
+       if (sf & 0x80) {
+               if ((sf &= 0x7f) > ((*bufsize) - 1))
+                       return -1;
+               if (sf > SIZEOF_INT)
+                       return -1;
+               ret = 0;
+               for (; sf; sf--) {
+                       ret = (ret << 8) + (*(*buf)++);
+                       (*bufsize)--;
+               }
+       } else {
+               ret = sf;
+       }
+
+       return ret;
+}
+
+/*
+ * returns the length of a token, given the mech oid and the body size
+ */
+int g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+       /* set body_size to sequence contents size */
+       body_size += 4 + (int) mech->len; /* NEED overflow check */
+       return (1 + der_length_size(body_size) + body_size);
+}
+
+/*
+ * fills in a buffer with the token header.  The buffer is assumed to
+ * be the right size.  buf is advanced past the token header
+ */
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+       *(*buf)++ = 0x60;
+       der_write_length(buf, 4 + mech->len + body_size);
+       *(*buf)++ = 0x06;
+       *(*buf)++ = (unsigned char) mech->len;
+       TWRITE_STR(*buf, mech->data, ((int) mech->len));
+}
+
+/*
+ * Given a buffer containing a token, reads and verifies the token,
+ * leaving buf advanced past the token header, and setting body_size
+ * to the number of remaining bytes.  Returns 0 on success,
+ * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
+ * mechanism in the token does not match the mech argument.  buf and
+ * *body_size are left unmodified on error.
+ */
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+                           unsigned char **buf_in, int toksize)
+{
+       unsigned char *buf = *buf_in;
+       int seqsize;
+       rawobj_t toid;
+       int ret = 0;
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x60)
+               return (G_BAD_TOK_HEADER);
+
+       if ((seqsize = der_read_length(&buf, &toksize)) < 0)
+               return(G_BAD_TOK_HEADER);
+
+       if (seqsize != toksize)
+               return (G_BAD_TOK_HEADER);
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x06)
+               return (G_BAD_TOK_HEADER);
+
+       if ((toksize -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       toid.len = *buf++;
+
+       if ((toksize -= toid.len) < 0)
+               return (G_BAD_TOK_HEADER);
+       toid.data = buf;
+       buf += toid.len;
+
+       if (!g_OID_equal(&toid, mech))
+               ret = G_WRONG_MECH;
+
+       /* G_WRONG_MECH is not returned immediately because it's more
+        * important to return G_BAD_TOK_HEADER if the token header is
+        * in fact bad
+        */
+       if ((toksize -= 2) < 0)
+               return (G_BAD_TOK_HEADER);
+
+       if (ret)
+               return (ret);
+
+       *buf_in = buf;
+       *body_size = toksize;
+
+       return (0);
+}
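+
+/*
+ * A minimal usage sketch (illustrative only, not part of the original
+ * code):
+ *
+ *     unsigned char *ptr = token->data;
+ *     int body_size;
+ *
+ *     if (g_verify_token_header(&mech_oid, &body_size, &ptr,
+ *                               token->len))
+ *             return GSS_S_DEFECTIVE_TOKEN;
+ *
+ * On success ptr points at the two token-type bytes and body_size
+ * counts the bytes that follow them.
+ */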
+
+/*
+ * Given a buffer containing a token, returns a copy of the mech oid in
+ * the parameter mech.
+ */
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf)
+{
+       unsigned char *buf = in_buf->data;
+       int len = in_buf->len;
+       int ret = 0;
+       int seqsize;
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x60)
+               return (G_BAD_TOK_HEADER);
+
+       if ((seqsize = der_read_length(&buf, &len)) < 0)
+               return (G_BAD_TOK_HEADER);
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       if (*buf++ != 0x06)
+               return (G_BAD_TOK_HEADER);
+
+       if ((len -= 1) < 0)
+               return (G_BAD_TOK_HEADER);
+       mech->len = *buf++;
+
+       if ((len -= mech->len) < 0)
+               return (G_BAD_TOK_HEADER);
+       OBD_ALLOC_LARGE(mech->data, mech->len);
+       if (!mech->data)
+               return (G_BUFFER_ALLOC);
+       memcpy(mech->data, buf, mech->len);
+
+       return ret;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h
new file mode 100644 (file)
index 0000000..cbfc47c
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * Modified from NFSv4 project for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_
+#define __PTLRPC_GSS_GSS_INTERNAL_H_
+
+#include <lustre_sec.h>
+
+/*
+ * rawobj stuff
+ */
+typedef struct netobj_s {
+       __u32   len;
+       __u8    data[0];
+} netobj_t;
+
+#define NETOBJ_EMPTY   ((netobj_t) { 0 })
+
+typedef struct rawobj_s {
+       __u32   len;
+       __u8   *data;
+} rawobj_t;
+
+#define RAWOBJ_EMPTY   ((rawobj_t) { 0, NULL })
+
+typedef struct rawobj_buf_s {
+       __u32   dataoff;
+       __u32   datalen;
+       __u32   buflen;
+       __u8   *buf;
+} rawobj_buf_t;
+
+int rawobj_empty(rawobj_t *obj);
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj);
+int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj);
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+                        void *res, __u32 reslen);
+
+/*
+ * several timeout values. for the client refresh upcall timeout we use
+ * the default from the pipefs implementation.
+ */
+#define __TIMEOUT_DELTA                 (10)
+
+#define GSS_SECINIT_RPC_TIMEOUT                                         \
+       (obd_timeout < __TIMEOUT_DELTA ?                                \
+        __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA)
+
+#define GSS_SECFINI_RPC_TIMEOUT         (__TIMEOUT_DELTA)
+#define GSS_SECSVC_UPCALL_TIMEOUT       (GSS_SECINIT_RPC_TIMEOUT)
+
+/*
+ * default gc interval
+ */
+#define GSS_GC_INTERVAL                 (60 * 60) /* 60 minutes */
+
+static inline
+unsigned long gss_round_ctx_expiry(unsigned long expiry,
+                                  unsigned long sec_flags)
+{
+       if (sec_flags & PTLRPC_SEC_FL_REVERSE)
+               return expiry;
+
+       if (get_seconds() + __TIMEOUT_DELTA <= expiry)
+               return expiry - __TIMEOUT_DELTA;
+
+       return expiry;
+}
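+
+/*
+ * E.g. with __TIMEOUT_DELTA of 10 seconds, a forward context expiring
+ * at t = 1000 is reported above as expiring at t = 990, so a refresh
+ * can start before the context actually dies; reverse contexts (and
+ * contexts already within the delta) are returned unchanged.
+ */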
+
+/*
+ * Max encryption element in block cipher algorithms.
+ */
+#define GSS_MAX_CIPHER_BLOCK          (16)
+
+/*
+ * XXX make it visible to both the kernel and lgssd/lsvcgssd
+ */
+#define GSSD_INTERFACE_VERSION         (1)
+
+#define PTLRPC_GSS_VERSION             (1)
+
+
+enum ptlrpc_gss_proc {
+       PTLRPC_GSS_PROC_DATA            = 0,
+       PTLRPC_GSS_PROC_INIT            = 1,
+       PTLRPC_GSS_PROC_CONTINUE_INIT   = 2,
+       PTLRPC_GSS_PROC_DESTROY         = 3,
+       PTLRPC_GSS_PROC_ERR             = 4,
+};
+
+enum ptlrpc_gss_tgt {
+       LUSTRE_GSS_TGT_MGS              = 0,
+       LUSTRE_GSS_TGT_MDS              = 1,
+       LUSTRE_GSS_TGT_OSS              = 2,
+};
+
+enum ptlrpc_gss_header_flags {
+       LUSTRE_GSS_PACK_BULK            = 1,
+       LUSTRE_GSS_PACK_USER            = 2,
+};
+
+static inline
+__u32 import_to_gss_svc(struct obd_import *imp)
+{
+       const char *name = imp->imp_obd->obd_type->typ_name;
+
+       if (!strcmp(name, LUSTRE_MGC_NAME))
+               return LUSTRE_GSS_TGT_MGS;
+       if (!strcmp(name, LUSTRE_MDC_NAME))
+               return LUSTRE_GSS_TGT_MDS;
+       if (!strcmp(name, LUSTRE_OSC_NAME))
+               return LUSTRE_GSS_TGT_OSS;
+       LBUG();
+       return 0;
+}
+
+/*
+ * the following 3 headers must have the same size and field offsets
+ */
+struct gss_header {
+       __u8            gh_version;     /* gss version */
+       __u8            gh_sp;          /* sec part */
+       __u16           gh_pad0;
+       __u32           gh_flags;       /* wrap flags */
+       __u32           gh_proc;        /* proc */
+       __u32           gh_seq;         /* sequence */
+       __u32           gh_svc;         /* service */
+       __u32           gh_pad1;
+       __u32           gh_pad2;
+       __u32           gh_pad3;
+       netobj_t        gh_handle;      /* context handle */
+};
+
+struct gss_rep_header {
+       __u8            gh_version;
+       __u8            gh_sp;
+       __u16           gh_pad0;
+       __u32           gh_flags;
+       __u32           gh_proc;
+       __u32           gh_major;
+       __u32           gh_minor;
+       __u32           gh_seqwin;
+       __u32           gh_pad2;
+       __u32           gh_pad3;
+       netobj_t        gh_handle;
+};
+
+struct gss_err_header {
+       __u8            gh_version;
+       __u8            gh_sp;
+       __u16           gh_pad0;
+       __u32           gh_flags;
+       __u32           gh_proc;
+       __u32           gh_major;
+       __u32           gh_minor;
+       __u32           gh_pad1;
+       __u32           gh_pad2;
+       __u32           gh_pad3;
+       netobj_t        gh_handle;
+};
+
+/*
+ * part of the wire context information sent from the client, which is
+ * saved and used later by the server.
+ */
+struct gss_wire_ctx {
+       __u32           gw_flags;
+       __u32           gw_proc;
+       __u32           gw_seq;
+       __u32           gw_svc;
+       rawobj_t        gw_handle;
+};
+
+#define PTLRPC_GSS_MAX_HANDLE_SIZE      (8)
+#define PTLRPC_GSS_HEADER_SIZE          (sizeof(struct gss_header) + \
+                                         PTLRPC_GSS_MAX_HANDLE_SIZE)
+
+
+static inline __u64 gss_handle_to_u64(rawobj_t *handle)
+{
+       if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE)
+               return -1;
+       return *((__u64 *) handle->data);
+}
+
+#define GSS_SEQ_WIN                    (2048)
+#define GSS_SEQ_WIN_MAIN               GSS_SEQ_WIN
+#define GSS_SEQ_WIN_BACK               (128)
+#define GSS_SEQ_REPACK_THRESHOLD       (GSS_SEQ_WIN_MAIN / 2 + \
+                                        GSS_SEQ_WIN_MAIN / 4)
+
+struct gss_svc_seq_data {
+       spinlock_t      ssd_lock;
+       /*
+        * highest sequence number seen so far, for main and back window
+        */
+       __u32           ssd_max_main;
+       __u32           ssd_max_back;
+       /*
+        * main and back window
+        * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit
+        * of ssd_win is nonzero iff sequence number i has been seen already.
+        */
+       unsigned long   ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG];
+       unsigned long   ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG];
+};
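+
+/*
+ * Illustrative reading of the windows above: with GSS_SEQ_WIN_MAIN of
+ * 2048, only sequence numbers in (ssd_max_main - 2048, ssd_max_main]
+ * still have a window bit to consult for replay detection; anything
+ * older has fallen out of the window.
+ */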
+
+struct gss_svc_ctx {
+       struct gss_ctx          *gsc_mechctx;
+       struct gss_svc_seq_data  gsc_seqdata;
+       rawobj_t                 gsc_rvs_hdl;
+       __u32                    gsc_rvs_seq;
+       uid_t                    gsc_uid;
+       gid_t                    gsc_gid;
+       uid_t                    gsc_mapped_uid;
+       unsigned int             gsc_usr_root:1,
+                                gsc_usr_mds:1,
+                                gsc_usr_oss:1,
+                                gsc_remote:1,
+                                gsc_reverse:1;
+};
+
+struct gss_svc_reqctx {
+       struct ptlrpc_svc_ctx           src_base;
+       /*
+        * context
+        */
+       struct gss_wire_ctx             src_wirectx;
+       struct gss_svc_ctx              *src_ctx;
+       /*
+        * record place of bulk_sec_desc in request/reply buffer
+        */
+       struct ptlrpc_bulk_sec_desc     *src_reqbsd;
+       int                             src_reqbsd_size;
+       struct ptlrpc_bulk_sec_desc     *src_repbsd;
+       int                             src_repbsd_size;
+       /*
+        * flags
+        */
+       unsigned int                    src_init:1,
+                                       src_init_continue:1,
+                                       src_err_notify:1;
+       int                             src_reserve_len;
+};
+
+struct gss_cli_ctx {
+       struct ptlrpc_cli_ctx   gc_base;
+       __u32                   gc_flavor;
+       __u32                   gc_proc;
+       __u32                   gc_win;
+       atomic_t                gc_seq;
+       rawobj_t                gc_handle;
+       struct gss_ctx          *gc_mechctx;
+       /* handle for the buddy svc ctx */
+       rawobj_t                gc_svc_handle;
+};
+
+struct gss_cli_ctx_keyring {
+       struct gss_cli_ctx       gck_base;
+       struct key              *gck_key;
+       struct timer_list       *gck_timer;
+};
+
+struct gss_sec {
+       struct ptlrpc_sec       gs_base;
+       struct gss_api_mech     *gs_mech;
+       spinlock_t              gs_lock;
+       __u64                   gs_rvs_hdl;
+};
+
+struct gss_sec_pipefs {
+       struct gss_sec          gsp_base;
+       int                     gsp_chash_size; /* must be 2^n */
+       struct hlist_head       gsp_chash[0];
+};
+
+/*
+ * FIXME cleanup the keyring upcall mutexes
+ */
+#define HAVE_KEYRING_UPCALL_SERIALIZED  1
+
+struct gss_sec_keyring {
+       struct gss_sec          gsk_base;
+       /*
+        * all contexts listed here. access is protected by sec spinlock.
+        */
+       struct hlist_head       gsk_clist;
+       /*
+        * points to the root ctx (only one at a time). access is
+        * protected by sec spinlock.
+        */
+       struct ptlrpc_cli_ctx   *gsk_root_ctx;
+       /*
+        * serializes upcalls for the root context.
+        */
+       struct mutex            gsk_root_uc_lock;
+
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       struct mutex            gsk_uc_lock;    /* serialize upcalls */
+#endif
+};
+
+static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx)
+{
+       return container_of(ctx, struct gss_cli_ctx, gc_base);
+}
+
+static inline
+struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx)
+{
+       return container_of(ctx2gctx(ctx),
+                           struct gss_cli_ctx_keyring, gck_base);
+}
+
+static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec)
+{
+       return container_of(sec, struct gss_sec, gs_base);
+}
+
+static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec)
+{
+       return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base);
+}
+
+static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec)
+{
+       return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base);
+}
+
+
+#define GSS_CTX_INIT_MAX_LEN       (1024)
+
+/*
+ * This is only guaranteed to be enough for the current krb5
+ * des-cbc-crc. We might need to adjust it when a new enc type or mech
+ * is added.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN  (32)
+#define GSS_PRIVBUF_SUFFIX_LEN  (32)
+
+static inline
+struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(ctx);
+       return container_of(ctx, struct gss_svc_reqctx, src_base);
+}
+
+static inline
+struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(ctx);
+       return gss_svc_ctx2reqctx(ctx)->src_ctx;
+}
+
+/* sec_gss.c */
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred);
+int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize);
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req);
+
+int  gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec,
+                         struct ptlrpc_cli_ctx *ctx);
+int  gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                     int msgsize);
+void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                     int msgsize);
+void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+int  gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                       int segment, int newsize);
+
+int  gss_svc_accept(struct ptlrpc_sec_policy *policy,
+                   struct ptlrpc_request *req);
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx);
+int  gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen);
+int  gss_svc_authorize(struct ptlrpc_request *req);
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs);
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx);
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx);
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx);
+
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+                        struct ptlrpc_svc_ctx *svc_ctx);
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+                                  int swabbed);
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment);
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx);
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor);
+int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set);
+
+int gss_sec_create_common(struct gss_sec *gsec,
+                         struct ptlrpc_sec_policy *policy,
+                         struct obd_import *imp,
+                         struct ptlrpc_svc_ctx *ctx,
+                         struct sptlrpc_flavor *sf);
+void gss_sec_destroy_common(struct gss_sec *gsec);
+void gss_sec_kill(struct ptlrpc_sec *sec);
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_ctx_ops *ctxops,
+                           struct vfs_cred *vcred);
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx);
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize);
+
+/* gss_keyring.c */
+int  __init gss_init_keyring(void);
+void __exit gss_exit_keyring(void);
+
+/* gss_pipefs.c */
+int  __init gss_init_pipefs(void);
+void __exit gss_exit_pipefs(void);
+
+/* gss_bulk.c */
+int gss_cli_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc);
+int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_request *req,
+                           struct ptlrpc_bulk_desc *desc);
+int gss_svc_prep_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+int gss_svc_unwrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc);
+int gss_svc_wrap_bulk(struct ptlrpc_request *req,
+                     struct ptlrpc_bulk_desc *desc);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+                           unsigned char **buf_in, int toksize);
+
+
+/* gss_cli_upcall.c */
+int gss_do_ctx_init_rpc(char *buffer, unsigned long count);
+int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx);
+
+int  __init gss_init_cli_upcall(void);
+void __exit gss_exit_cli_upcall(void);
+
+/* gss_svc_upcall.c */
+__u64 gss_get_next_ctx_index(void);
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+                                  struct gss_sec *gsec,
+                                  struct gss_cli_ctx *gctx);
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle);
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx);
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq);
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+                              struct gss_svc_reqctx *grctx,
+                              struct gss_wire_ctx *gw,
+                              struct obd_device *target,
+                              __u32 lustre_svc,
+                              rawobj_t *rvs_hdl,
+                              rawobj_t *in_token);
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+                                          struct gss_wire_ctx *gw);
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx);
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx);
+
+int  __init gss_init_svc_upcall(void);
+void __exit gss_exit_svc_upcall(void);
+
+/* lproc_gss.c */
+void gss_stat_oos_record_cli(int behind);
+void gss_stat_oos_record_svc(int phase, int replay);
+
+int  __init gss_init_lproc(void);
+void __exit gss_exit_lproc(void);
+
+/* gss_krb5_mech.c */
+int __init init_kerberos_module(void);
+void __exit cleanup_kerberos_module(void);
+
+
+/* debug */
+static inline
+void __dbg_memdump(char *name, void *ptr, int size)
+{
+       char *buf, *p = (char *) ptr;
+       int bufsize = size * 2 + 1, i;
+
+       OBD_ALLOC(buf, bufsize);
+       if (!buf) {
+               CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize);
+               return;
+       }
+
+       for (i = 0; i < size; i++)
+               sprintf(&buf[i+i], "%02x", (__u8) p[i]);
+       buf[size + size] = '\0';
+       LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf);
+       OBD_FREE(buf, bufsize);
+}
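
A typical call site for this helper might look like the following (a
hypothetical example; rawobj_t carries a len/data pair):

    /* dump a raw GSS handle while debugging */
    __dbg_memdump("gc_handle", gctx->gc_handle.data, gctx->gc_handle.len);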
+
+#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c
new file mode 100644 (file)
index 0000000..bb571ae
--- /dev/null
@@ -0,0 +1,1424 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_keyring.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_keyring;
+static struct ptlrpc_ctx_ops gss_keyring_ctxops;
+static struct key_type gss_key_type;
+
+static int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+                              struct ptlrpc_svc_ctx *svc_ctx);
+
+/*
+ * the timeout (2 * obd_timeout) only covers the case where the upcall child
+ * process dies abnormally; in all other cases the upcall should eventually
+ * update the kernel key.
+ *
+ * FIXME we had better incorporate the client & server side upcall timeouts
+ * into the Adaptive Timeouts framework, but we need to figure out how to
+ * let the kernel know whether an upcall process is still in progress or has
+ * died unexpectedly.
+ */
+#define KEYRING_UPCALL_TIMEOUT  (obd_timeout + obd_timeout)
+
+/****************************************
+ * internal helpers                 *
+ ****************************************/
+
+#define DUMP_PROCESS_KEYRINGS(tsk)                                     \
+{                                                                      \
+       CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): "                 \
+             "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n",           \
+             tsk->comm, tsk->pid, tsk->uid, tsk->fsuid,                \
+             tsk->parent->comm, tsk->parent->pid,                      \
+             tsk->parent->uid, tsk->parent->fsuid,                     \
+             tsk->request_key_auth ?                                   \
+             tsk->request_key_auth->serial : 0,                        \
+             key_cred(tsk)->thread_keyring ?                           \
+             key_cred(tsk)->thread_keyring->serial : 0,                \
+             key_tgcred(tsk)->process_keyring ?                        \
+             key_tgcred(tsk)->process_keyring->serial : 0,             \
+             key_tgcred(tsk)->session_keyring ?                        \
+             key_tgcred(tsk)->session_keyring->serial : 0,             \
+             key_cred(tsk)->user->uid_keyring ?                        \
+             key_cred(tsk)->user->uid_keyring->serial : 0,             \
+             key_cred(tsk)->user->session_keyring ?                    \
+             key_cred(tsk)->user->session_keyring->serial : 0,         \
+             key_cred(tsk)->jit_keyring                                \
+            );                                                         \
+}
+
+#define DUMP_KEY(key)                                             \
+{                                                                     \
+       CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n",            \
+             key, key->serial, atomic_read(&key->usage),              \
+             key->uid, key->gid,                                      \
+             key->description ? key->description : "n/a"              \
+            );                                                  \
+}
+
+#define key_cred(tsk)   ((tsk)->cred)
+#define key_tgcred(tsk) ((tsk)->cred->tgcred)
+
+static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_lock(&gsec_kr->gsk_uc_lock);
+#endif
+}
+
+static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr)
+{
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_unlock(&gsec_kr->gsk_uc_lock);
+#endif
+}
+
+static inline void key_revoke_locked(struct key *key)
+{
+       set_bit(KEY_FLAG_REVOKED, &key->flags);
+}
+
+static void ctx_upcall_timeout_kr(unsigned long data)
+{
+       struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data;
+       struct key          *key = ctx2gctx_keyring(ctx)->gck_key;
+
+       CWARN("ctx %p, key %p\n", ctx, key);
+
+       LASSERT(key);
+
+       cli_ctx_expire(ctx);
+       key_revoke_locked(key);
+}
+
+static
+void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout)
+{
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+       struct timer_list         *timer = gctx_kr->gck_timer;
+
+       LASSERT(timer);
+
+       CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout);
+       timeout = timeout * HZ + cfs_time_current();
+
+       init_timer(timer);
+       timer->expires = timeout;
+       timer->data = (unsigned long) ctx;
+       timer->function = ctx_upcall_timeout_kr;
+
+       add_timer(timer);
+}
+
+/*
+ * the caller must ensure there is no race with other threads
+ */
+static
+void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+       struct timer_list         *timer = gctx_kr->gck_timer;
+
+       if (timer == NULL)
+               return;
+
+       CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key);
+
+       gctx_kr->gck_timer = NULL;
+
+       del_singleshot_timer_sync(timer);
+
+       OBD_FREE_PTR(timer);
+}
+
+static
+struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec,
+                                    struct vfs_cred *vcred)
+{
+       struct ptlrpc_cli_ctx      *ctx;
+       struct gss_cli_ctx_keyring *gctx_kr;
+
+       OBD_ALLOC_PTR(gctx_kr);
+       if (gctx_kr == NULL)
+               return NULL;
+
+       OBD_ALLOC_PTR(gctx_kr->gck_timer);
+       if (gctx_kr->gck_timer == NULL) {
+               OBD_FREE_PTR(gctx_kr);
+               return NULL;
+       }
+       init_timer(gctx_kr->gck_timer);
+
+       ctx = &gctx_kr->gck_base.gc_base;
+
+       if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) {
+               OBD_FREE_PTR(gctx_kr->gck_timer);
+               OBD_FREE_PTR(gctx_kr);
+               return NULL;
+       }
+
+       ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT;
+       clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags);
+       atomic_inc(&ctx->cc_refcount); /* for the caller */
+
+       return ctx;
+}
+
+static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_sec         *sec = ctx->cc_sec;
+       struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx);
+
+       CDEBUG(D_SEC, "destroying ctx %p\n", ctx);
+
+       /* at this time the association with key has been broken. */
+       LASSERT(sec);
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+       LASSERT(gctx_kr->gck_key == NULL);
+
+       ctx_clear_timer_kr(ctx);
+       LASSERT(gctx_kr->gck_timer == NULL);
+
+       if (gss_cli_ctx_fini_common(sec, ctx))
+               return;
+
+       OBD_FREE_PTR(gctx_kr);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       if (sync) {
+               ctx_destroy_kr(ctx);
+       } else {
+               atomic_inc(&ctx->cc_refcount);
+               sptlrpc_gc_add_ctx(ctx);
+       }
+}
+
+static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       if (atomic_dec_and_test(&ctx->cc_refcount))
+               ctx_release_kr(ctx, sync);
+}
+
+/*
+ * key <-> ctx association and rules:
+ * - a ctx need not be bound to any key
+ * - key/ctx binding is protected by the key semaphore (if the key is present)
+ * - key and ctx each take a reference on the other
+ * - ctx enlist/unlist is protected by the ctx spinlock
+ * - never enlist a ctx after it has been unlisted
+ * - whoever enlists must also bind, and must lock the key before enlisting:
+ *   - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key
+ * - whoever unlists must also unbind, in either order:
+ *   - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key
+ *   - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key
+ */
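
As a concrete instance of the enlist/bind rule above, the lookup path later in
this file (gss_sec_lookup_ctx_kr()) effectively does:

    down_write(&key->sem);          /* lock key */
    ctx_enlist_kr(ctx, is_root, 0); /* takes and drops sec->ps_lock inside */
    bind_key_ctx(key, ctx);         /* bind while key->sem is still held */
    up_write(&key->sem);            /* unlock key */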
+
+static inline void spin_lock_if(spinlock_t *lock, int condition)
+{
+       if (condition)
+               spin_lock(lock);
+}
+
+static inline void spin_unlock_if(spinlock_t *lock, int condition)
+{
+       if (condition)
+               spin_unlock(lock);
+}
+
+static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked)
+{
+       struct ptlrpc_sec      *sec = ctx->cc_sec;
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+
+       LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       spin_lock_if(&sec->ps_lock, !locked);
+
+       atomic_inc(&ctx->cc_refcount);
+       set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+       hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist);
+       if (is_root)
+               gsec_kr->gsk_root_ctx = ctx;
+
+       spin_unlock_if(&sec->ps_lock, !locked);
+}
+
+/*
+ * Note that after this gets called, the caller must not access the ctx
+ * again because it might have been freed, unless the caller holds at least
+ * one reference on the ctx.
+ *
+ * return non-zero if we indeed unlist this ctx.
+ */
+static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked)
+{
+       struct ptlrpc_sec       *sec = ctx->cc_sec;
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+       /* if the cached bit is already cleared, leave the job to whoever
+        * is doing it */
+       if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0)
+               return 0;
+
+       /* drop ref inside spin lock to prevent race with other operations */
+       spin_lock_if(&sec->ps_lock, !locked);
+
+       if (gsec_kr->gsk_root_ctx == ctx)
+               gsec_kr->gsk_root_ctx = NULL;
+       hlist_del_init(&ctx->cc_cache);
+       atomic_dec(&ctx->cc_refcount);
+
+       spin_unlock_if(&sec->ps_lock, !locked);
+
+       return 1;
+}
+
+/*
+ * bind a key with a ctx together.
+ * caller must hold write lock of the key, as well as ref on key & ctx.
+ */
+static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(atomic_read(&key->usage) > 0);
+       LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL);
+       LASSERT(key->payload.data == NULL);
+
+       /* at this time the context may or may not be in the list. */
+       key_get(key);
+       atomic_inc(&ctx->cc_refcount);
+       ctx2gctx_keyring(ctx)->gck_key = key;
+       key->payload.data = ctx;
+}
+
+/*
+ * unbind a key and a ctx.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(key->payload.data == ctx);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+
+       /* must revoke the key, or others may treat it as newly created */
+       key_revoke_locked(key);
+
+       key->payload.data = NULL;
+       ctx2gctx_keyring(ctx)->gck_key = NULL;
+
+       /* once the ctx is split from the key, the timer is meaningless */
+       ctx_clear_timer_kr(ctx);
+
+       ctx_put_kr(ctx, 1);
+       key_put(key);
+}
+
+/*
+ * given a ctx, unbind with its coupled key, if any.
+ * unbind can only be called once, so we need not worry about the key
+ * being released by someone else.
+ */
+static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       struct key      *key = ctx2gctx_keyring(ctx)->gck_key;
+
+       if (key) {
+               LASSERT(key->payload.data == ctx);
+
+               key_get(key);
+               down_write(&key->sem);
+               unbind_key_ctx(key, ctx);
+               up_write(&key->sem);
+               key_put(key);
+       }
+}
+
+/*
+ * given a key, unbind with its coupled ctx, if any.
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void unbind_key_locked(struct key *key)
+{
+       struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+
+       if (ctx)
+               unbind_key_ctx(key, ctx);
+}
+
+/*
+ * unlist a ctx, and unbind from coupled key
+ */
+static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       if (ctx_unlist_kr(ctx, 0))
+               unbind_ctx_kr(ctx);
+}
+
+/*
+ * given a key, unlist and unbind with the coupled ctx (if any).
+ * caller must hold write lock, as well as a ref of the key.
+ */
+static void kill_key_locked(struct key *key)
+{
+       struct ptlrpc_cli_ctx *ctx = key->payload.data;
+
+       if (ctx && ctx_unlist_kr(ctx, 0))
+               unbind_key_locked(key);
+}
+
+/*
+ * caller should hold one ref on contexts in freelist.
+ */
+static void dispose_ctx_list_kr(struct hlist_head *freelist)
+{
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       struct gss_cli_ctx     *gctx;
+
+       hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) {
+               hlist_del_init(&ctx->cc_cache);
+
+               /* reverse ctx: update the current seq in the buddy svcctx,
+                * if one exists. ideally this should be done in
+                * gss_cli_ctx_finalize(), but ctx destruction could be
+                * delayed because:
+                *  1) the ctx still has references;
+                *  2) ctx destruction is asynchronous;
+                * and the reverse import's call to inval_all_ctx() requires
+                * this be done _immediately_, otherwise a newly created
+                * reverse ctx might copy a very old sequence number from
+                * the svcctx. */
+               gctx = ctx2gctx(ctx);
+               if (!rawobj_empty(&gctx->gc_svc_handle) &&
+                   sec_is_reverse(gctx->gc_base.cc_sec)) {
+                       gss_svc_upcall_update_sequence(&gctx->gc_svc_handle,
+                                       (__u32) atomic_read(&gctx->gc_seq));
+               }
+
+               /* we need to wake up waiting reqs here. the context might
+                * have been force-released before the upcall finished, in
+                * which case the late-arriving downcall cannot find the ctx
+                * at all. */
+               sptlrpc_cli_ctx_wakeup(ctx);
+
+               unbind_ctx_kr(ctx);
+               ctx_put_kr(ctx, 0);
+       }
+}
+
+/*
+ * lookup a root context directly in a sec, return root ctx with a
+ * reference taken or NULL.
+ */
+static
+struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx   *ctx = NULL;
+
+       spin_lock(&sec->ps_lock);
+
+       ctx = gsec_kr->gsk_root_ctx;
+
+       if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+               struct ptlrpc_cli_ctx  *tmp;
+
+               /* reverse ctx: search for a root ctx in the list and choose
+                * the one with the shortest expiry, which most likely has an
+                * established peer ctx on the client side. */
+               hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
+                       if (ctx == NULL || ctx->cc_expire == 0 ||
+                           ctx->cc_expire > tmp->cc_expire) {
+                               ctx = tmp;
+                               /* promote to be root_ctx */
+                               gsec_kr->gsk_root_ctx = ctx;
+                       }
+               }
+       }
+
+       if (ctx) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+               LASSERT(!hlist_empty(&gsec_kr->gsk_clist));
+               atomic_inc(&ctx->cc_refcount);
+       }
+
+       spin_unlock(&sec->ps_lock);
+
+       return ctx;
+}
+
+#define RVS_CTX_EXPIRE_NICE    (10)
+
+static
+void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec,
+                                struct ptlrpc_cli_ctx *new_ctx,
+                                struct key *key)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       cfs_time_t            now;
+       ENTRY;
+
+       LASSERT(sec_is_reverse(sec));
+
+       spin_lock(&sec->ps_lock);
+
+       now = cfs_time_current_sec();
+
+       /* set all existing ctxs short expiry */
+       hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) {
+               if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) {
+                       ctx->cc_early_expire = 1;
+                       ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE;
+               }
+       }
+
+       /* if a root_ctx is already there, instead of obsoleting it
+        * immediately, we let it keep operating for a little while.
+        * hopefully by the time the first backward rpc with the newest ctx
+        * is sent out, the client side will already have the peer ctx well
+        * established. */
+       ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 0 : 1, 1);
+
+       if (key)
+               bind_key_ctx(key, new_ctx);
+
+       spin_unlock(&sec->ps_lock);
+}
+
+static void construct_key_desc(void *buf, int bufsize,
+                              struct ptlrpc_sec *sec, uid_t uid)
+{
+       snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id);
+       ((char *)buf)[bufsize - 1] = '\0';
+}
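
For example, uid 500 on a sec whose ps_id happens to be 0x2a would yield the
key description "500@2a" (values here are hypothetical):

    char desc[24];

    construct_key_desc(desc, sizeof(desc), sec, 500);  /* -> "500@2a" */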
+
+/****************************************
+ * sec apis                         *
+ ****************************************/
+
+static
+struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp,
+                                     struct ptlrpc_svc_ctx *svcctx,
+                                     struct sptlrpc_flavor *sf)
+{
+       struct gss_sec_keyring  *gsec_kr;
+       ENTRY;
+
+       OBD_ALLOC(gsec_kr, sizeof(*gsec_kr));
+       if (gsec_kr == NULL)
+               RETURN(NULL);
+
+       INIT_HLIST_HEAD(&gsec_kr->gsk_clist);
+       gsec_kr->gsk_root_ctx = NULL;
+       mutex_init(&gsec_kr->gsk_root_uc_lock);
+#ifdef HAVE_KEYRING_UPCALL_SERIALIZED
+       mutex_init(&gsec_kr->gsk_uc_lock);
+#endif
+
+       if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring,
+                                 imp, svcctx, sf))
+               goto err_free;
+
+       if (svcctx != NULL &&
+           sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) {
+               gss_sec_destroy_common(&gsec_kr->gsk_base);
+               goto err_free;
+       }
+
+       RETURN(&gsec_kr->gsk_base.gs_base);
+
+err_free:
+       OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+       RETURN(NULL);
+}
+
+static
+void gss_sec_destroy_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec    *gsec = sec2gsec(sec);
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+
+       CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       LASSERT(hlist_empty(&gsec_kr->gsk_clist));
+       LASSERT(gsec_kr->gsk_root_ctx == NULL);
+
+       gss_sec_destroy_common(gsec);
+
+       OBD_FREE(gsec_kr, sizeof(*gsec_kr));
+}
+
+static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred)
+{
+       /* apart from the ROOTONLY flag, treat it as the root user only if
+        * the real uid is 0; euid/fsuid being 0 is handled as a setuid
+        * scenario */
+       if (sec_is_rootonly(sec) || (vcred->vc_uid == 0))
+               return 1;
+       else
+               return 0;
+}
+
+/*
+ * unlink the request key from its ring, to which it was linked during
+ * request_key(). sadly, we have to 'guess' which keyring it is linked to.
+ *
+ * FIXME this code is fragile; it depends on how request_key_link() is
+ * implemented.
+ */
+static void request_key_unlink(struct key *key)
+{
+       struct task_struct *tsk = current;
+       struct key *ring;
+
+       switch (key_cred(tsk)->jit_keyring) {
+       case KEY_REQKEY_DEFL_DEFAULT:
+       case KEY_REQKEY_DEFL_THREAD_KEYRING:
+               ring = key_get(key_cred(tsk)->thread_keyring);
+               if (ring)
+                       break;
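+               /* fall through: no thread keyring */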
+       case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+               ring = key_get(key_tgcred(tsk)->process_keyring);
+               if (ring)
+                       break;
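+               /* fall through: no process keyring */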
+       case KEY_REQKEY_DEFL_SESSION_KEYRING:
+               rcu_read_lock();
+               ring = key_get(rcu_dereference(key_tgcred(tsk)
+                                              ->session_keyring));
+               rcu_read_unlock();
+               if (ring)
+                       break;
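+               /* fall through: no session keyring */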
+       case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+               ring = key_get(key_cred(tsk)->user->session_keyring);
+               break;
+       case KEY_REQKEY_DEFL_USER_KEYRING:
+               ring = key_get(key_cred(tsk)->user->uid_keyring);
+               break;
+       case KEY_REQKEY_DEFL_GROUP_KEYRING:
+       default:
+               LBUG();
+       }
+
+       LASSERT(ring);
+       key_unlink(ring, key);
+       key_put(ring);
+}
+
+static
+struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec,
+                                             struct vfs_cred *vcred,
+                                             int create, int remove_dead)
+{
+       struct obd_import       *imp = sec->ps_import;
+       struct gss_sec_keyring  *gsec_kr = sec2gsec_keyring(sec);
+       struct ptlrpc_cli_ctx   *ctx = NULL;
+       unsigned int         is_root = 0, create_new = 0;
+       struct key            *key;
+       char                 desc[24];
+       char                *coinfo;
+       int                   coinfo_size;
+       char                *co_flags = "";
+       ENTRY;
+
+       LASSERT(imp != NULL);
+
+       is_root = user_is_root(sec, vcred);
+
+       /* a small optimization for the root context */
+       if (is_root) {
+               ctx = sec_lookup_root_ctx_kr(sec);
+               /*
+                * Only lookup directly for REVERSE sec, which should
+                * always succeed.
+                */
+               if (ctx || sec_is_reverse(sec))
+                       RETURN(ctx);
+       }
+
+       LASSERT(create != 0);
+
+       /* for the root context, take the lock and check again, this time
+        * holding the root upcall lock, to make sure nobody else populated
+        * a new root context after the last check. */
+       if (is_root) {
+               mutex_lock(&gsec_kr->gsk_root_uc_lock);
+
+               ctx = sec_lookup_root_ctx_kr(sec);
+               if (ctx)
+                       goto out;
+
+               /* update reverse handle for root user */
+               sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index();
+
+               switch (sec->ps_part) {
+               case LUSTRE_SP_MDT:
+                       co_flags = "m";
+                       break;
+               case LUSTRE_SP_OST:
+                       co_flags = "o";
+                       break;
+               case LUSTRE_SP_MGC:
+                       co_flags = "rmo";
+                       break;
+               case LUSTRE_SP_CLI:
+                       co_flags = "r";
+                       break;
+               case LUSTRE_SP_MGS:
+               default:
+                       LBUG();
+               }
+       }
+
+       /* in the setuid case the key will be constructed with fsuid/fsgid as
+        * the owner, but we authenticate based on the real uid/gid. the key
+        * permission bits will be exactly POS_ALL, so only processes that
+        * possess this key have access, although the quota might be counted
+        * against others (fsuid/fsgid).
+        *
+        * the keyring will use fsuid/fsgid as upcall parameters, so we have
+        * to encode the real uid/gid into the callout info.
+        */
+
+       construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid);
+
+       /* callout info format:
+        * secid:mech:uid:gid:flags:svc_type:peer_nid:target_uuid
+        */
+       coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64;
+       OBD_ALLOC(coinfo, coinfo_size);
+       if (coinfo == NULL)
+               goto out;
+
+       snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%d:"LPX64":%s",
+                sec->ps_id, sec2gsec(sec)->gs_mech->gm_name,
+                vcred->vc_uid, vcred->vc_gid,
+                co_flags, import_to_gss_svc(imp),
+                imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name);
+
+       CDEBUG(D_SEC, "requesting key for %s\n", desc);
+
+       keyring_upcall_lock(gsec_kr);
+       key = request_key(&gss_key_type, desc, coinfo);
+       keyring_upcall_unlock(gsec_kr);
+
+       OBD_FREE(coinfo, coinfo_size);
+
+       if (IS_ERR(key)) {
+               CERROR("failed request key: %ld\n", PTR_ERR(key));
+               goto out;
+       }
+       CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc);
+
+       /* once payload.data points to a ctx, it never changes until we
+        * de-associate them; but a parallel request_key() may return a key
+        * with payload.data == NULL at the same time, so we still need the
+        * write lock on key->sem to serialize them. */
+       down_write(&key->sem);
+
+       if (likely(key->payload.data != NULL)) {
+               ctx = key->payload.data;
+
+               LASSERT(atomic_read(&ctx->cc_refcount) >= 1);
+               LASSERT(ctx2gctx_keyring(ctx)->gck_key == key);
+               LASSERT(atomic_read(&key->usage) >= 2);
+
+               /* simply take a ref and return. it's the upper layer's
+                * responsibility to detect & replace a dead ctx. */
+               atomic_inc(&ctx->cc_refcount);
+       } else {
+               /* pre-initialize with a cli_ctx. this can't be done in
+                * key_instantiate() because we don't have enough
+                * information there. */
+               ctx = ctx_create_kr(sec, vcred);
+               if (ctx != NULL) {
+                       ctx_enlist_kr(ctx, is_root, 0);
+                       bind_key_ctx(key, ctx);
+
+                       ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT);
+
+                       CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n",
+                              key, ctx, sec);
+               } else {
+                       /* we'd prefer to call key_revoke(), but we'd rather
+                        * revoke it while this key->sem is still locked. */
+                       key_revoke_locked(key);
+               }
+
+               create_new = 1;
+       }
+
+       up_write(&key->sem);
+
+       if (is_root && create_new)
+               request_key_unlink(key);
+
+       key_put(key);
+out:
+       if (is_root)
+               mutex_unlock(&gsec_kr->gsk_root_uc_lock);
+       RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           int sync)
+{
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       ctx_release_kr(ctx, sync);
+}
+
+/*
+ * flush the context of a normal user. we must resort to the keyring itself
+ * to find the contexts that belong to us.
+ *
+ * Note we are only supposed to flush _our own_ context, so the "uid" is
+ * ignored in the search.
+ */
+static
+void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec,
+                            uid_t uid,
+                            int grace, int force)
+{
+       struct key            *key;
+       char                 desc[24];
+
+       /* nothing to do for reverse or rootonly sec */
+       if (sec_is_reverse(sec) || sec_is_rootonly(sec))
+               return;
+
+       construct_key_desc(desc, sizeof(desc), sec, uid);
+
+       /* there should be only one valid key, but we loop anyway to
+        * handle any weird cases */
+       for (;;) {
+               key = request_key(&gss_key_type, desc, NULL);
+               if (IS_ERR(key)) {
+                       CDEBUG(D_SEC, "No more key found for current user\n");
+                       break;
+               }
+
+               down_write(&key->sem);
+
+               kill_key_locked(key);
+
+               /* kill_key_locked() should usually revoke the key, but we
+                * revoke it again to make sure, e.g. in some cases the key
+                * may not be properly coupled with a context. */
+               key_revoke_locked(key);
+
+               up_write(&key->sem);
+
+               key_put(key);
+       }
+}
+
+/*
+ * flush context of root or all, we iterate through the list.
+ */
+static
+void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec,
+                            uid_t uid,
+                            int grace, int force)
+{
+       struct gss_sec_keyring *gsec_kr;
+       struct hlist_head       freelist = HLIST_HEAD_INIT;
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       gsec_kr = sec2gsec_keyring(sec);
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_kr->gsk_clist, cc_cache) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+               if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+                       continue;
+
+               /* at this moment there are at least 2 base references:
+                * the key association and the list membership. */
+               if (atomic_read(&ctx->cc_refcount) > 2) {
+                       if (!force)
+                               continue;
+                       CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n",
+                             ctx, ctx->cc_vcred.vc_uid,
+                             sec2target_str(ctx->cc_sec),
+                             atomic_read(&ctx->cc_refcount) - 2);
+               }
+
+               set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+               if (!grace)
+                       clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+               atomic_inc(&ctx->cc_refcount);
+
+               if (ctx_unlist_kr(ctx, 1)) {
+                       hlist_add_head(&ctx->cc_cache, &freelist);
+               } else {
+                       LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+                       atomic_dec(&ctx->cc_refcount);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       dispose_ctx_list_kr(&freelist);
+       EXIT;
+}
+
+static
+int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec,
+                              uid_t uid, int grace, int force)
+{
+       ENTRY;
+
+       CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n",
+              sec, atomic_read(&sec->ps_refcount),
+              atomic_read(&sec->ps_nctx),
+              uid, grace, force);
+
+       if (uid != -1 && uid != 0)
+               flush_user_ctx_cache_kr(sec, uid, grace, force);
+       else
+               flush_spec_ctx_cache_kr(sec, uid, grace, force);
+
+       RETURN(0);
+}
+
+static
+void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct hlist_head       freelist = HLIST_HEAD_INIT;
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       CWARN("running gc\n");
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_kr->gsk_clist, cc_cache) {
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+               atomic_inc(&ctx->cc_refcount);
+
+               if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) {
+                       hlist_add_head(&ctx->cc_cache, &freelist);
+                       CWARN("unhashed ctx %p\n", ctx);
+               } else {
+                       LASSERT(atomic_read(&ctx->cc_refcount) >= 2);
+                       atomic_dec(&ctx->cc_refcount);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       dispose_ctx_list_kr(&freelist);
+       EXIT;
+       return;
+}
+
+static
+int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq)
+{
+       struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
+       struct hlist_node      *next;
+       struct ptlrpc_cli_ctx  *ctx;
+       struct gss_cli_ctx     *gctx;
+       time_t            now = cfs_time_current_sec();
+       ENTRY;
+
+       spin_lock(&sec->ps_lock);
+       hlist_for_each_entry_safe(ctx, next,
+                                 &gsec_kr->gsk_clist, cc_cache) {
+               struct key           *key;
+               char                flags_str[40];
+               char                mech[40];
+
+               gctx = ctx2gctx(ctx);
+               key = ctx2gctx_keyring(ctx)->gck_key;
+
+               gss_cli_ctx_flags2str(ctx->cc_flags,
+                                     flags_str, sizeof(flags_str));
+
+               if (gctx->gc_mechctx)
+                       lgss_display(gctx->gc_mechctx, mech, sizeof(mech));
+               else
+                       snprintf(mech, sizeof(mech), "N/A");
+               mech[sizeof(mech) - 1] = '\0';
+
+               seq_printf(seq, "%p: uid %u, ref %d, expire %ld(%+ld), fl %s, "
+                          "seq %d, win %u, key %08x(ref %d), "
+                          "hdl "LPX64":"LPX64", mech: %s\n",
+                          ctx, ctx->cc_vcred.vc_uid,
+                          atomic_read(&ctx->cc_refcount),
+                          ctx->cc_expire,
+                          ctx->cc_expire ?  ctx->cc_expire - now : 0,
+                          flags_str,
+                          atomic_read(&gctx->gc_seq),
+                          gctx->gc_win,
+                          key ? key->serial : 0,
+                          key ? atomic_read(&key->usage) : 0,
+                          gss_handle_to_u64(&gctx->gc_handle),
+                          gss_handle_to_u64(&gctx->gc_svc_handle),
+                          mech);
+       }
+       spin_unlock(&sec->ps_lock);
+
+       RETURN(0);
+}
+
+/****************************************
+ * cli_ctx apis                         *
+ ****************************************/
+
+static
+int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       /* upcall is already on the way */
+       return 0;
+}
+
+static
+int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       if (cli_ctx_check_death(ctx)) {
+               kill_ctx_kr(ctx);
+               return 1;
+       }
+
+       if (cli_ctx_is_ready(ctx))
+               return 0;
+       return 1;
+}
+
+static
+void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       cli_ctx_expire(ctx);
+       kill_ctx_kr(ctx);
+}
+
+/****************************************
+ * (reverse) service               *
+ ****************************************/
+
+/*
+ * a reverse context need have nothing to do with keyrings. here we still
+ * keep the version that binds to a key, for future reference.
+ */
+#define HAVE_REVERSE_CTX_NOKEY
+
+
+static
+int sec_install_rctx_kr(struct ptlrpc_sec *sec,
+                       struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct ptlrpc_cli_ctx   *cli_ctx;
+       struct vfs_cred   vcred = { 0, 0 };
+       int                   rc;
+
+       LASSERT(sec);
+       LASSERT(svc_ctx);
+
+       cli_ctx = ctx_create_kr(sec, &vcred);
+       if (cli_ctx == NULL)
+               return -ENOMEM;
+
+       rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+       if (rc) {
+               CERROR("failed copy reverse cli ctx: %d\n", rc);
+
+               ctx_put_kr(cli_ctx, 1);
+               return rc;
+       }
+
+       rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL);
+
+       ctx_put_kr(cli_ctx, 1);
+
+       return 0;
+}
+
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
+static
+int gss_svc_accept_kr(struct ptlrpc_request *req)
+{
+       return gss_svc_accept(&gss_policy_keyring, req);
+}
+
+static
+int gss_svc_install_rctx_kr(struct obd_import *imp,
+                           struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct ptlrpc_sec *sec;
+       int             rc;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       LASSERT(sec);
+
+       rc = sec_install_rctx_kr(sec, svc_ctx);
+       sptlrpc_sec_put(sec);
+
+       return rc;
+}
+
+/****************************************
+ * key apis                         *
+ ****************************************/
+
+static
+int gss_kt_instantiate(struct key *key, const void *data, size_t datalen)
+{
+       int          rc;
+       ENTRY;
+
+       if (data != NULL || datalen != 0) {
+               CERROR("invalid: data %p, len %lu\n", data, (long)datalen);
+               RETURN(-EINVAL);
+       }
+
+       if (key->payload.data != NULL) {
+               CERROR("key already has a payload\n");
+               RETURN(-EINVAL);
+       }
+
+       /* link the key to the session keyring, so the context negotiation
+        * rpc subsequently fired from user space can find this key. it will
+        * be unlinked automatically when the upcall process dies.
+        *
+        * we can't do this through keyctl from userspace, because the upcall
+        * might be neither possessor nor owner of the key (setuid).
+        *
+        * the session keyring is created upon upcall and doesn't change
+        * until the upcall finishes, so the rcu lock is not needed here.
+        */
+       LASSERT(key_tgcred(current)->session_keyring);
+
+       lockdep_off();
+       rc = key_link(key_tgcred(current)->session_keyring, key);
+       lockdep_on();
+       if (unlikely(rc)) {
+               CERROR("failed to link key %08x to keyring %08x: %d\n",
+                      key->serial,
+                      key_tgcred(current)->session_keyring->serial, rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, key->payload.data);
+       RETURN(0);
+}
+
+/*
+ * called with the key semaphore write-locked, which means we can operate
+ * on the context without fear of losing the refcount.
+ */
+static
+int gss_kt_update(struct key *key, const void *data, size_t datalen)
+{
+       struct ptlrpc_cli_ctx   *ctx = key->payload.data;
+       struct gss_cli_ctx      *gctx;
+       rawobj_t                 tmpobj = RAWOBJ_EMPTY;
+       __u32               datalen32 = (__u32) datalen;
+       int                   rc;
+       ENTRY;
+
+       if (data == NULL || datalen == 0) {
+               CWARN("invalid: data %p, len %lu\n", data, (long)datalen);
+               RETURN(-EINVAL);
+       }
+
+       /* if the upcall finished negotiation too fast (most likely because
+        * a local error happened) and called kt_update(), the ctx might
+        * still be NULL. but the key will eventually be associated with a
+        * context, or be revoked. if the key status is fine, return -EAGAIN
+        * to let userspace sleep a while and call again. */
+       if (ctx == NULL) {
+               CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n",
+                     key, key->serial, key->flags);
+
+               rc = key_validate(key);
+               if (rc == 0)
+                       RETURN(-EAGAIN);
+               else
+                       RETURN(rc);
+       }
+
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(ctx->cc_sec);
+
+       ctx_clear_timer_kr(ctx);
+
+       /* don't proceed if already refreshed */
+       if (cli_ctx_is_refreshed(ctx)) {
+               CWARN("ctx already done refresh\n");
+               RETURN(0);
+       }
+
+       sptlrpc_cli_ctx_get(ctx);
+       gctx = ctx2gctx(ctx);
+
+       rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win,
+                                 sizeof(gctx->gc_win));
+       if (rc) {
+               CERROR("failed extract seq_win\n");
+               goto out;
+       }
+
+       if (gctx->gc_win == 0) {
+               __u32   nego_rpc_err, nego_gss_err;
+
+               rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err,
+                                         sizeof(nego_rpc_err));
+               if (rc) {
+                       CERROR("failed to extrace rpc rc\n");
+                       goto out;
+               }
+
+               rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err,
+                                         sizeof(nego_gss_err));
+               if (rc) {
+                       CERROR("failed to extrace gss rc\n");
+                       goto out;
+               }
+
+               CERROR("negotiation: rpc err %d, gss err %x\n",
+                      nego_rpc_err, nego_gss_err);
+
+               rc = nego_rpc_err ? nego_rpc_err : -EACCES;
+       } else {
+               rc = rawobj_extract_local_alloc(&gctx->gc_handle,
+                                               (__u32 **) &data, &datalen32);
+               if (rc) {
+                       CERROR("failed extract handle\n");
+                       goto out;
+               }
+
+               rc = rawobj_extract_local(&tmpobj, (__u32 **) &data,
+                                         &datalen32);
+               if (rc) {
+                       CERROR("failed to extract mech\n");
+                       goto out;
+               }
+
+               rc = lgss_import_sec_context(&tmpobj,
+                                            sec2gsec(ctx->cc_sec)->gs_mech,
+                                            &gctx->gc_mechctx);
+               if (rc != GSS_S_COMPLETE)
+                       CERROR("failed import context\n");
+               else
+                       rc = 0;
+       }
+out:
+       /* we don't care about the current status of this ctx, even if
+        * someone else is operating on it at the same time. we just add
+        * our own opinion here. */
+       if (rc == 0) {
+               gss_cli_ctx_uptodate(gctx);
+       } else {
+               /* this will also revoke the key. it has to be done before
+                * waking up waiters, otherwise they could find the stale
+                * key */
+               kill_key_locked(key);
+
+               cli_ctx_expire(ctx);
+
+               if (rc != -ERESTART)
+                       set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+       }
+
+       /* let user space think it's a success */
+       sptlrpc_cli_ctx_put(ctx, 1);
+       RETURN(0);
+}
+
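For orientation, the matching userspace half of this instantiate/update pair
could be sketched with keyutils as follows. This is a hypothetical sketch,
not the actual lgss_keyring tool; error handling is trimmed:

    #include <keyutils.h>

    /* run by the upcall program once GSS negotiation has finished */
    static int lgss_finish_upcall(key_serial_t key,
                                  const void *nego_reply, size_t len)
    {
            /* empty payload: the only form gss_kt_instantiate() accepts;
             * ringid 0 requests no extra keyring link (the kernel side
             * links the key to the session keyring itself) */
            if (keyctl_instantiate(key, NULL, 0, 0) < 0)
                    return -1;

            /* negotiation result: parsed by gss_kt_update() above */
            if (keyctl_update(key, nego_reply, len) < 0)
                    return -1;

            return 0;
    }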
+static
+int gss_kt_match(const struct key *key, const void *desc)
+{
+       return (strcmp(key->description, (const char *) desc) == 0);
+}
+
+static
+void gss_kt_destroy(struct key *key)
+{
+       ENTRY;
+       LASSERT(key->payload.data == NULL);
+       CDEBUG(D_SEC, "destroy key %p\n", key);
+       EXIT;
+}
+
+static
+void gss_kt_describe(const struct key *key, struct seq_file *s)
+{
+       if (key->description == NULL)
+               seq_puts(s, "[null]");
+       else
+               seq_puts(s, key->description);
+}
+
+static struct key_type gss_key_type =
+{
+       .name      = "lgssc",
+       .def_datalen    = 0,
+       .instantiate    = gss_kt_instantiate,
+       .update  = gss_kt_update,
+       .match    = gss_kt_match,
+       .destroy        = gss_kt_destroy,
+       .describe       = gss_kt_describe,
+};
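
For the request_key() upcall above to reach userspace, a handler must be
registered for the "lgssc" key type; a hypothetical /etc/request-key.conf
entry (program path and argument list are assumptions) would look like:

    create  lgssc  *  *  /usr/sbin/lgss_keyring %o %k %t %d %c %u %g %T %P %S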
+
+/****************************************
+ * lustre gss keyring policy       *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_keyring_ctxops = {
+       .match            = gss_cli_ctx_match,
+       .refresh                = gss_cli_ctx_refresh_kr,
+       .validate              = gss_cli_ctx_validate_kr,
+       .die                = gss_cli_ctx_die_kr,
+       .sign              = gss_cli_ctx_sign,
+       .verify          = gss_cli_ctx_verify,
+       .seal              = gss_cli_ctx_seal,
+       .unseal          = gss_cli_ctx_unseal,
+       .wrap_bulk            = gss_cli_ctx_wrap_bulk,
+       .unwrap_bulk        = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_keyring_cops = {
+       .create_sec          = gss_sec_create_kr,
+       .destroy_sec        = gss_sec_destroy_kr,
+       .kill_sec              = gss_sec_kill,
+       .lookup_ctx          = gss_sec_lookup_ctx_kr,
+       .release_ctx        = gss_sec_release_ctx_kr,
+       .flush_ctx_cache        = gss_sec_flush_ctx_cache_kr,
+       .gc_ctx          = gss_sec_gc_ctx_kr,
+       .install_rctx      = gss_sec_install_rctx,
+       .alloc_reqbuf      = gss_alloc_reqbuf,
+       .free_reqbuf        = gss_free_reqbuf,
+       .alloc_repbuf      = gss_alloc_repbuf,
+       .free_repbuf        = gss_free_repbuf,
+       .enlarge_reqbuf  = gss_enlarge_reqbuf,
+       .display                = gss_sec_display_kr,
+};
+
+static struct ptlrpc_sec_sops gss_sec_keyring_sops = {
+       .accept          = gss_svc_accept_kr,
+       .invalidate_ctx  = gss_svc_invalidate_ctx,
+       .alloc_rs              = gss_svc_alloc_rs,
+       .authorize            = gss_svc_authorize,
+       .free_rs                = gss_svc_free_rs,
+       .free_ctx              = gss_svc_free_ctx,
+       .prep_bulk            = gss_svc_prep_bulk,
+       .unwrap_bulk        = gss_svc_unwrap_bulk,
+       .wrap_bulk            = gss_svc_wrap_bulk,
+       .install_rctx      = gss_svc_install_rctx_kr,
+};
+
+static struct ptlrpc_sec_policy gss_policy_keyring = {
+       .sp_owner              = THIS_MODULE,
+       .sp_name                = "gss.keyring",
+       .sp_policy            = SPTLRPC_POLICY_GSS,
+       .sp_cops                = &gss_sec_keyring_cops,
+       .sp_sops                = &gss_sec_keyring_sops,
+};
+
+
+int __init gss_init_keyring(void)
+{
+       int rc;
+
+       rc = register_key_type(&gss_key_type);
+       if (rc) {
+               CERROR("failed to register keyring type: %d\n", rc);
+               return rc;
+       }
+
+       rc = sptlrpc_register_policy(&gss_policy_keyring);
+       if (rc) {
+               unregister_key_type(&gss_key_type);
+               return rc;
+       }
+
+       return 0;
+}
+
+void __exit gss_exit_keyring(void)
+{
+       unregister_key_type(&gss_key_type);
+       sptlrpc_unregister_policy(&gss_policy_keyring);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h
new file mode 100644 (file)
index 0000000..676d4b9
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#ifndef PTLRPC_GSS_KRB5_H
+#define PTLRPC_GSS_KRB5_H
+
+/*
+ * RFC 4121
+ */
+
+#define KG_USAGE_ACCEPTOR_SEAL    22
+#define KG_USAGE_ACCEPTOR_SIGN    23
+#define KG_USAGE_INITIATOR_SEAL   24
+#define KG_USAGE_INITIATOR_SIGN   25
+
+#define KG_TOK_MIC_MSG            0x0404
+#define KG_TOK_WRAP_MSG           0x0504
+
+#define FLAG_SENDER_IS_ACCEPTOR   0x01
+#define FLAG_WRAP_CONFIDENTIAL    0x02
+#define FLAG_ACCEPTOR_SUBKEY      0x04
+
+struct krb5_header {
+       __u16   kh_tok_id;      /* token id */
+       __u8    kh_flags;       /* acceptor flags */
+       __u8    kh_filler;      /* 0xff */
+       __u16   kh_ec;          /* extra count */
+       __u16   kh_rrc;         /* right rotation count */
+       __u64   kh_seq;         /* sequence number */
+       __u8    kh_cksum[0];    /* checksum */
+};
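+
+/*
+ * For reference (a sketch, not from the original sources): assuming the
+ * compiler inserts no padding between the members above, the 16-byte
+ * RFC 4121 token header lays out on the wire as
+ *
+ *   bytes  0-1   kh_tok_id   0x0404 (MIC) or 0x0504 (wrap), big-endian
+ *   byte   2     kh_flags    FLAG_* bits
+ *   byte   3     kh_filler   always 0xff
+ *   bytes  4-5   kh_ec       extra count
+ *   bytes  6-7   kh_rrc      right rotation count
+ *   bytes  8-15  kh_seq      64-bit sequence number, big-endian
+ *
+ * gss_wrap_bulk_kerberos() asserts sizeof(struct krb5_header) is a
+ * multiple of the cipher blocksize, which relies on this 16-byte layout.
+ */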
+
+struct krb5_keyblock {
+       rawobj_t                 kb_key;
+       struct ll_crypto_cipher *kb_tfm;
+};
+
+struct krb5_ctx {
+       unsigned int            kc_initiate:1,
+                               kc_cfx:1,
+                               kc_seed_init:1,
+                               kc_have_acceptor_subkey:1;
+       __s32                   kc_endtime;
+       __u8                    kc_seed[16];
+       __u64                   kc_seq_send;
+       __u64                   kc_seq_recv;
+       __u32                   kc_enctype;
+       struct krb5_keyblock    kc_keye;        /* encryption */
+       struct krb5_keyblock    kc_keyi;        /* integrity */
+       struct krb5_keyblock    kc_keyc;        /* checksum */
+       rawobj_t                kc_mech_used;
+};
+
+enum sgn_alg {
+       SGN_ALG_DES_MAC_MD5       = 0x0000,
+       SGN_ALG_MD2_5             = 0x0001,
+       SGN_ALG_DES_MAC           = 0x0002,
+       SGN_ALG_3                 = 0x0003, /* not published */
+       SGN_ALG_HMAC_MD5          = 0x0011, /* microsoft w2k; no support */
+       SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004
+};
+
+enum seal_alg {
+       SEAL_ALG_NONE          = 0xffff,
+       SEAL_ALG_DES           = 0x0000,
+       SEAL_ALG_1             = 0x0001, /* not published */
+       SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */
+       SEAL_ALG_DES3KD        = 0x0002
+};
+
+#define CKSUMTYPE_CRC32                 0x0001
+#define CKSUMTYPE_RSA_MD4               0x0002
+#define CKSUMTYPE_RSA_MD4_DES           0x0003
+#define CKSUMTYPE_DESCBC                0x0004
+/* des-mac-k */
+/* rsa-md4-des-k */
+#define CKSUMTYPE_RSA_MD5               0x0007
+#define CKSUMTYPE_RSA_MD5_DES           0x0008
+#define CKSUMTYPE_NIST_SHA              0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3        0x000c
+#define CKSUMTYPE_HMAC_SHA1_96_AES128   0x000f
+#define CKSUMTYPE_HMAC_SHA1_96_AES256   0x0010
+#define CKSUMTYPE_HMAC_MD5_ARCFOUR      -138
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH       (39756032L)
+#define KG_KEYTAB_NOMATCH       (39756033L)
+#define KG_TGT_MISSING          (39756034L)
+#define KG_NO_SUBKEY            (39756035L)
+#define KG_CONTEXT_ESTABLISHED  (39756036L)
+#define KG_BAD_SIGN_TYPE        (39756037L)
+#define KG_BAD_LENGTH           (39756038L)
+#define KG_CTX_INCOMPLETE       (39756039L)
+#define KG_CONTEXT              (39756040L)
+#define KG_CRED                 (39756041L)
+#define KG_ENC_DESC             (39756042L)
+#define KG_BAD_SEQ              (39756043L)
+#define KG_EMPTY_CCACHE         (39756044L)
+#define KG_NO_CTYPES            (39756045L)
+
+/* crypto types from the wire, per the Kerberos v5 protocol spec.
+ * these get mapped to Linux kernel crypto routines.
+ */
+#define ENCTYPE_NULL                    0x0000
+#define ENCTYPE_DES_CBC_CRC             0x0001 /* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4             0x0002 /* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5             0x0003 /* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW             0x0004 /* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA            0x0005 /* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW            0x0006 /* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1           0x0008
+#define ENCTYPE_DES3_CBC_SHA1           0x0010
+#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011
+#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012
+#define ENCTYPE_ARCFOUR_HMAC            0x0017
+#define ENCTYPE_ARCFOUR_HMAC_EXP        0x0018
+#define ENCTYPE_UNKNOWN                 0x01ff
+
+#endif /* PTLRPC_GSS_KRB5_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c
new file mode 100644 (file)
index 0000000..4b28931
--- /dev/null
@@ -0,0 +1,1786 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_asn1.h"
+#include "gss_krb5.h"
+
+static spinlock_t krb5_seq_lock;
+
+struct krb5_enctype {
+       char            *ke_dispname;
+       char            *ke_enc_name;    /* linux tfm name */
+       char            *ke_hash_name;   /* linux tfm name */
+       int              ke_enc_mode;    /* linux tfm mode */
+       int              ke_hash_size;   /* checksum size */
+       int              ke_conf_size;   /* confounder size */
+       unsigned int     ke_hash_hmac:1; /* is hmac? */
+};
+
+/*
+ * NOTE: for aes128-cts and aes256-cts, the MIT implementation uses CTS
+ * encryption, but for now we simply do CBC with padding, because Linux
+ * doesn't support CTS yet. This needs to be fixed in the future.
+ */
+static struct krb5_enctype enctypes[] = {
+       [ENCTYPE_DES_CBC_RAW] = {               /* des-cbc-md5 */
+               .ke_dispname    = "des-cbc-md5",
+               .ke_enc_name    = "cbc(des)",
+               .ke_hash_name   = "md5",
+               .ke_enc_mode    = 0,
+               .ke_hash_size   = 16,
+               .ke_conf_size   = 8,
+               .ke_hash_hmac   = 0,
+       },
+       [ENCTYPE_DES3_CBC_RAW] = {              /* des3-hmac-sha1 */
+               .ke_dispname    = "des3-hmac-sha1",
+               .ke_enc_name    = "cbc(des3_ede)",
+               .ke_hash_name   = "hmac(sha1)",
+               .ke_enc_mode    = 0,
+               .ke_hash_size   = 20,
+               .ke_conf_size   = 8,
+               .ke_hash_hmac   = 1,
+       },
+       [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = {   /* aes128-cts */
+               .ke_dispname    = "aes128-cts-hmac-sha1-96",
+               .ke_enc_name    = "cbc(aes)",
+               .ke_hash_name   = "hmac(sha1)",
+               .ke_enc_mode    = 0,
+               .ke_hash_size   = 12,
+               .ke_conf_size   = 16,
+               .ke_hash_hmac   = 1,
+       },
+       [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = {   /* aes256-cts */
+               .ke_dispname    = "aes256-cts-hmac-sha1-96",
+               .ke_enc_name    = "cbc(aes)",
+               .ke_hash_name   = "hmac(sha1)",
+               .ke_enc_mode    = 0,
+               .ke_hash_size   = 12,
+               .ke_conf_size   = 16,
+               .ke_hash_hmac   = 1,
+       },
+       [ENCTYPE_ARCFOUR_HMAC] = {              /* arcfour-hmac-md5 */
+               .ke_dispname    = "arcfour-hmac-md5",
+               .ke_enc_name    = "ecb(arc4)",
+               .ke_hash_name   = "hmac(md5)",
+               .ke_enc_mode    = 0,
+               .ke_hash_size   = 16,
+               .ke_conf_size   = 8,
+               .ke_hash_hmac   = 1,
+       },
+};
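+
+/*
+ * Note (an observation on the table above): it is indexed directly by the
+ * wire enctype value, so it is sparse; lookups must skip the unpopulated
+ * slots by checking ke_dispname (see enctype2str()) or ke_hash_size (see
+ * krb5_init_keys()).
+ */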
+
+#define MAX_ENCTYPES   (sizeof(enctypes) / sizeof(struct krb5_enctype))
+
+static const char *enctype2str(__u32 enctype)
+{
+       if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname)
+               return enctypes[enctype].ke_dispname;
+
+       return "unknown";
+}
+
+static
+int keyblock_init(struct krb5_keyblock *kb, char *alg_name, int alg_mode)
+{
+       kb->kb_tfm = ll_crypto_alloc_blkcipher(alg_name, alg_mode, 0);
+       if (IS_ERR(kb->kb_tfm)) {
+               CERROR("failed to alloc tfm: %s, mode %d\n",
+                      alg_name, alg_mode);
+               /* don't leave an ERR_PTR behind for keyblock_free() to free */
+               kb->kb_tfm = NULL;
+               return -1;
+       }
+
+       if (ll_crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data,
+                                      kb->kb_key.len)) {
+               CERROR("failed to set %s key, len %d\n",
+                      alg_name, kb->kb_key.len);
+               return -1;
+       }
+
+       return 0;
+}
+
+static
+int krb5_init_keys(struct krb5_ctx *kctx)
+{
+       struct krb5_enctype *ke;
+
+       if (kctx->kc_enctype >= MAX_ENCTYPES ||
+           enctypes[kctx->kc_enctype].ke_hash_size == 0) {
+               CERROR("unsupported enctype %x\n", kctx->kc_enctype);
+               return -1;
+       }
+
+       ke = &enctypes[kctx->kc_enctype];
+
+       /* the arc4 tfm is stateful; the caller must alloc/use/free it on its own */
+       if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC &&
+           keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+
+       /* the hmac tfm is stateful; the caller must alloc/use/free it on its own */
+       if (ke->ke_hash_hmac == 0 &&
+           keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+       if (ke->ke_hash_hmac == 0 &&
+           keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode))
+               return -1;
+
+       return 0;
+}
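+
+/*
+ * To summarize the above (an observation): kc_keye gets a persistent tfm
+ * for every enctype except arcfour-hmac, while kc_keyi/kc_keyc only get
+ * tfms for the non-hmac (keyed checksum) enctypes; for hmac enctypes
+ * krb5_make_checksum() allocates a hash tfm per call instead.
+ */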
+
+static
+void keyblock_free(struct krb5_keyblock *kb)
+{
+       rawobj_free(&kb->kb_key);
+       if (kb->kb_tfm)
+               ll_crypto_free_blkcipher(kb->kb_tfm);
+}
+
+static
+int keyblock_dup(struct krb5_keyblock *new, struct krb5_keyblock *kb)
+{
+       return rawobj_dup(&new->kb_key, &kb->kb_key);
+}
+
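+/*
+ * Helpers for parsing the on-wire context blob handed down from user space.
+ * Each helper advances *ptr past what it consumed; the "q < p" tests guard
+ * against pointer wrap-around caused by a corrupt (huge) length field.
+ */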
+static
+int get_bytes(char **ptr, const char *end, void *res, int len)
+{
+       char *p, *q;
+
+       p = *ptr;
+       q = p + len;
+       if (q > end || q < p)
+               return -1;
+       memcpy(res, p, len);
+       *ptr = q;
+       return 0;
+}
+
+static
+int get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+       char   *p, *q;
+       __u32   len;
+
+       p = *ptr;
+       if (get_bytes(&p, end, &len, sizeof(len)))
+               return -1;
+
+       q = p + len;
+       if (q > end || q < p)
+               return -1;
+
+       OBD_ALLOC_LARGE(res->data, len);
+       if (!res->data)
+               return -1;
+
+       res->len = len;
+       memcpy(res->data, p, len);
+       *ptr = q;
+       return 0;
+}
+
+static
+int get_keyblock(char **ptr, const char *end,
+                struct krb5_keyblock *kb, __u32 keysize)
+{
+       char *buf;
+
+       OBD_ALLOC_LARGE(buf, keysize);
+       if (buf == NULL)
+               return -1;
+
+       if (get_bytes(ptr, end, buf, keysize)) {
+               OBD_FREE_LARGE(buf, keysize);
+               return -1;
+       }
+
+       kb->kb_key.len = keysize;
+       kb->kb_key.data = buf;
+       return 0;
+}
+
+static
+void delete_context_kerberos(struct krb5_ctx *kctx)
+{
+       rawobj_free(&kctx->kc_mech_used);
+
+       keyblock_free(&kctx->kc_keye);
+       keyblock_free(&kctx->kc_keyi);
+       keyblock_free(&kctx->kc_keyc);
+}
+
+static
+__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end)
+{
+       unsigned int    tmp_uint, keysize;
+
+       /* seed_init flag */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+       kctx->kc_seed_init = (tmp_uint != 0);
+
+       /* seed */
+       if (get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed)))
+               goto out_err;
+
+       /* sign/seal algorithm, not really used now */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       /* end time */
+       if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+               goto out_err;
+
+       /* seq send */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+       kctx->kc_seq_send = tmp_uint;
+
+       /* mech oid */
+       if (get_rawobj(&p, end, &kctx->kc_mech_used))
+               goto out_err;
+
+       /* old-style enc/seq keys come in the format:
+        *   - enctype (u32)
+        *   - keysize (u32)
+        *   - keydata
+        * we decompose them to fit into the new context
+        */
+
+       /* enc key */
+       if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+               goto out_err;
+
+       if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+               goto out_err;
+
+       if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+               goto out_err;
+
+       /* seq key */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           tmp_uint != kctx->kc_enctype)
+               goto out_err;
+
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) ||
+           tmp_uint != keysize)
+               goto out_err;
+
+       if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+               goto out_err;
+
+       /* old style fallback */
+       if (keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc))
+               goto out_err;
+
+       if (p != end)
+               goto out_err;
+
+       CDEBUG(D_SEC, "succesfully imported rfc1964 context\n");
+       return 0;
+out_err:
+       return GSS_S_FAILURE;
+}
+
+/* Flags for version 2 context flags */
+#define KRB5_CTX_FLAG_INITIATOR                0x00000001
+#define KRB5_CTX_FLAG_CFX              0x00000002
+#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY  0x00000004
+
+static
+__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end)
+{
+       unsigned int    tmp_uint, keysize;
+
+       /* end time */
+       if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime)))
+               goto out_err;
+
+       /* flags */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       if (tmp_uint & KRB5_CTX_FLAG_INITIATOR)
+               kctx->kc_initiate = 1;
+       if (tmp_uint & KRB5_CTX_FLAG_CFX)
+               kctx->kc_cfx = 1;
+       if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+               kctx->kc_have_acceptor_subkey = 1;
+
+       /* seq send */
+       if (get_bytes(&p, end, &kctx->kc_seq_send, sizeof(kctx->kc_seq_send)))
+               goto out_err;
+
+       /* enctype */
+       if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype)))
+               goto out_err;
+
+       /* size of each key */
+       if (get_bytes(&p, end, &keysize, sizeof(keysize)))
+               goto out_err;
+
+       /* number of keys - should always be 3 */
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)))
+               goto out_err;
+
+       if (tmp_uint != 3) {
+               CERROR("Invalid number of keys: %u\n", tmp_uint);
+               goto out_err;
+       }
+
+       /* ke */
+       if (get_keyblock(&p, end, &kctx->kc_keye, keysize))
+               goto out_err;
+       /* ki */
+       if (get_keyblock(&p, end, &kctx->kc_keyi, keysize))
+               goto out_err;
+       /* kc */
+       if (get_keyblock(&p, end, &kctx->kc_keyc, keysize))
+               goto out_err;
+
+       CDEBUG(D_SEC, "succesfully imported v2 context\n");
+       return 0;
+out_err:
+       return GSS_S_FAILURE;
+}
+
+/*
+ * The whole purpose here is to keep the user-level gss context parsing from
+ * nfs-utils as unchanged as we can; it is not quite mature yet, and a lot is
+ * still unclear (heimdal support, etc.).
+ */
+static
+__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf,
+                                     struct gss_ctx *gctx)
+{
+       struct krb5_ctx *kctx;
+       char            *p = (char *) inbuf->data;
+       char            *end = (char *) (inbuf->data + inbuf->len);
+       unsigned int     tmp_uint, rc;
+
+       if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) {
+               CERROR("Fail to read version\n");
+               return GSS_S_FAILURE;
+       }
+
+       /* only support versions 0, 1 and 2 for the moment */
+       if (tmp_uint > 2) {
+               CERROR("Invalid version %u\n", tmp_uint);
+               return GSS_S_FAILURE;
+       }
+
+       OBD_ALLOC_PTR(kctx);
+       if (!kctx)
+               return GSS_S_FAILURE;
+
+       if (tmp_uint == 0 || tmp_uint == 1) {
+               kctx->kc_initiate = tmp_uint;
+               rc = import_context_rfc1964(kctx, p, end);
+       } else {
+               rc = import_context_rfc4121(kctx, p, end);
+       }
+
+       if (rc == 0)
+               rc = krb5_init_keys(kctx);
+
+       if (rc) {
+               delete_context_kerberos(kctx);
+               OBD_FREE_PTR(kctx);
+
+               return GSS_S_FAILURE;
+       }
+
+       gctx->internal_ctx_id = kctx;
+       return GSS_S_COMPLETE;
+}
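+
+/*
+ * Ownership note (an observation): on success the krb5_ctx becomes owned by
+ * gctx->internal_ctx_id and is released later via
+ * gss_delete_sec_context_kerberos(); on failure it is torn down right here.
+ */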
+
+static
+__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx,
+                                       struct gss_ctx *gctx_new)
+{
+       struct krb5_ctx *kctx = gctx->internal_ctx_id;
+       struct krb5_ctx *knew;
+
+       OBD_ALLOC_PTR(knew);
+       if (!knew)
+               return GSS_S_FAILURE;
+
+       knew->kc_initiate = kctx->kc_initiate ? 0 : 1;
+       knew->kc_cfx = kctx->kc_cfx;
+       knew->kc_seed_init = kctx->kc_seed_init;
+       knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey;
+       knew->kc_endtime = kctx->kc_endtime;
+
+       memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed));
+       knew->kc_seq_send = kctx->kc_seq_recv;
+       knew->kc_seq_recv = kctx->kc_seq_send;
+       knew->kc_enctype = kctx->kc_enctype;
+
+       if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used))
+               goto out_err;
+
+       if (keyblock_dup(&knew->kc_keye, &kctx->kc_keye))
+               goto out_err;
+       if (keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi))
+               goto out_err;
+       if (keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc))
+               goto out_err;
+       if (krb5_init_keys(knew))
+               goto out_err;
+
+       gctx_new->internal_ctx_id = knew;
+       CDEBUG(D_SEC, "succesfully copied reverse context\n");
+       return GSS_S_COMPLETE;
+
+out_err:
+       delete_context_kerberos(knew);
+       OBD_FREE_PTR(knew);
+       return GSS_S_FAILURE;
+}
+
+static
+__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx,
+                                  unsigned long  *endtime)
+{
+       struct krb5_ctx *kctx = gctx->internal_ctx_id;
+
+       *endtime = (unsigned long) ((__u32) kctx->kc_endtime);
+       return GSS_S_COMPLETE;
+}
+
+static
+void gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+       struct krb5_ctx *kctx = internal_ctx;
+
+       delete_context_kerberos(kctx);
+       OBD_FREE_PTR(kctx);
+}
+
+static
+void buf_to_sg(struct scatterlist *sg, void *ptr, int len)
+{
+       sg_set_buf(sg, ptr, len);
+}
+
+static
+__u32 krb5_encrypt(struct ll_crypto_cipher *tfm,
+                  int decrypt,
+                  void *iv,
+                  void *in,
+                  void *out,
+                  int length)
+{
+       struct blkcipher_desc desc;
+       struct scatterlist    sg;
+       __u8 local_iv[16] = {0};
+       __u32 ret = -EINVAL;
+
+       LASSERT(tfm);
+       desc.tfm  = tfm;
+       desc.info = local_iv;
+       desc.flags = 0;
+
+       if (length % ll_crypto_blkcipher_blocksize(tfm) != 0) {
+               CERROR("output length %d mismatch blocksize %d\n",
+                      length, ll_crypto_blkcipher_blocksize(tfm));
+               goto out;
+       }
+
+       if (ll_crypto_blkcipher_ivsize(tfm) > 16) {
+               CERROR("iv size too large %d\n", ll_crypto_blkcipher_ivsize(tfm));
+               goto out;
+       }
+
+       if (iv)
+               memcpy(local_iv, iv, ll_crypto_blkcipher_ivsize(tfm));
+
+       memcpy(out, in, length);
+       buf_to_sg(&sg, out, length);
+
+       if (decrypt)
+               ret = ll_crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length);
+       else
+               ret = ll_crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length);
+
+out:
+       return ret;
+}
+
+static inline
+int krb5_digest_hmac(struct ll_crypto_hash *tfm,
+                    rawobj_t *key,
+                    struct krb5_header *khdr,
+                    int msgcnt, rawobj_t *msgs,
+                    int iovcnt, lnet_kiov_t *iovs,
+                    rawobj_t *cksum)
+{
+       struct hash_desc   desc;
+       struct scatterlist sg[1];
+       int                i;
+
+       ll_crypto_hash_setkey(tfm, key->data, key->len);
+       desc.tfm  = tfm;
+       desc.flags = 0;
+
+       ll_crypto_hash_init(&desc);
+
+       for (i = 0; i < msgcnt; i++) {
+               if (msgs[i].len == 0)
+                       continue;
+               buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+               ll_crypto_hash_update(&desc, sg, msgs[i].len);
+       }
+
+       for (i = 0; i < iovcnt; i++) {
+               if (iovs[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+                           iovs[i].kiov_offset);
+               ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+       }
+
+       if (khdr) {
+               buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+               ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+       }
+
+       return ll_crypto_hash_final(&desc, cksum->data);
+}
+
+static inline
+int krb5_digest_norm(struct ll_crypto_hash *tfm,
+                    struct krb5_keyblock *kb,
+                    struct krb5_header *khdr,
+                    int msgcnt, rawobj_t *msgs,
+                    int iovcnt, lnet_kiov_t *iovs,
+                    rawobj_t *cksum)
+{
+       struct hash_desc   desc;
+       struct scatterlist sg[1];
+       int                i;
+
+       LASSERT(kb->kb_tfm);
+       desc.tfm  = tfm;
+       desc.flags = 0;
+
+       ll_crypto_hash_init(&desc);
+
+       for (i = 0; i < msgcnt; i++) {
+               if (msgs[i].len == 0)
+                       continue;
+               buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len);
+               ll_crypto_hash_update(&desc, sg, msgs[i].len);
+       }
+
+       for (i = 0; i < iovcnt; i++) {
+               if (iovs[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len,
+                           iovs[i].kiov_offset);
+               ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len);
+       }
+
+       if (khdr) {
+               buf_to_sg(sg, (char *) khdr, sizeof(*khdr));
+               ll_crypto_hash_update(&desc, sg, sizeof(*khdr));
+       }
+
+       ll_crypto_hash_final(&desc, cksum->data);
+
+       return krb5_encrypt(kb->kb_tfm, 0, NULL, cksum->data,
+                           cksum->data, cksum->len);
+}
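+
+/*
+ * A note on the two digest paths (an observation): krb5_digest_hmac() keys
+ * the hash itself (HMAC), while krb5_digest_norm() computes a plain digest
+ * and then encrypts it with the checksum key's tfm, i.e. the old
+ * rfc1964-style keyed checksum.
+ */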
+
+/*
+ * compute a (keyed or keyless) checksum over the plain text with the
+ * krb5 wire token header appended.
+ */
+static
+__s32 krb5_make_checksum(__u32 enctype,
+                        struct krb5_keyblock *kb,
+                        struct krb5_header *khdr,
+                        int msgcnt, rawobj_t *msgs,
+                        int iovcnt, lnet_kiov_t *iovs,
+                        rawobj_t *cksum)
+{
+       struct krb5_enctype   *ke = &enctypes[enctype];
+       struct ll_crypto_hash *tfm;
+       __u32                  code = GSS_S_FAILURE;
+       int                    rc;
+
+       tfm = ll_crypto_alloc_hash(ke->ke_hash_name, 0, 0);
+       if (tfm == NULL) {
+               CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name);
+               return GSS_S_FAILURE;
+       }
+
+       cksum->len = ll_crypto_hash_digestsize(tfm);
+       OBD_ALLOC_LARGE(cksum->data, cksum->len);
+       if (!cksum->data) {
+               cksum->len = 0;
+               goto out_tfm;
+       }
+
+       if (ke->ke_hash_hmac)
+               rc = krb5_digest_hmac(tfm, &kb->kb_key,
+                                     khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+       else
+               rc = krb5_digest_norm(tfm, kb,
+                                     khdr, msgcnt, msgs, iovcnt, iovs, cksum);
+
+       if (rc == 0)
+               code = GSS_S_COMPLETE;
+out_tfm:
+       ll_crypto_free_hash(tfm);
+       return code;
+}
+
+static void fill_krb5_header(struct krb5_ctx *kctx,
+                            struct krb5_header *khdr,
+                            int privacy)
+{
+       unsigned char acceptor_flag;
+
+       acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR;
+
+       if (privacy) {
+               khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG);
+               khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL;
+               khdr->kh_ec = cpu_to_be16(0);
+               khdr->kh_rrc = cpu_to_be16(0);
+       } else {
+               khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG);
+               khdr->kh_flags = acceptor_flag;
+               khdr->kh_ec = cpu_to_be16(0xffff);
+               khdr->kh_rrc = cpu_to_be16(0xffff);
+       }
+
+       khdr->kh_filler = 0xff;
+       spin_lock(&krb5_seq_lock);
+       khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++);
+       spin_unlock(&krb5_seq_lock);
+}
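+
+/*
+ * A note on the 0xffff values above (an observation): RFC 4121 defines
+ * bytes 4..7 of a MIC token as four 0xff filler bytes; writing
+ * kh_ec = kh_rrc = 0xffff produces exactly that on the wire, which lets
+ * the MIC and wrap headers share one struct.
+ */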
+
+static __u32 verify_krb5_header(struct krb5_ctx *kctx,
+                               struct krb5_header *khdr,
+                               int privacy)
+{
+       unsigned char acceptor_flag;
+       __u16    tok_id, ec_rrc;
+
+       acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0;
+
+       if (privacy) {
+               tok_id = KG_TOK_WRAP_MSG;
+               ec_rrc = 0x0;
+       } else {
+               tok_id = KG_TOK_MIC_MSG;
+               ec_rrc = 0xffff;
+       }
+
+       /* sanity checks */
+       if (be16_to_cpu(khdr->kh_tok_id) != tok_id) {
+               CERROR("bad token id\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) {
+               CERROR("bad direction flag\n");
+               return GSS_S_BAD_SIG;
+       }
+       if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) {
+               CERROR("missing confidential flag\n");
+               return GSS_S_BAD_SIG;
+       }
+       if (khdr->kh_filler != 0xff) {
+               CERROR("bad filler\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       if (be16_to_cpu(khdr->kh_ec) != ec_rrc ||
+           be16_to_cpu(khdr->kh_rrc) != ec_rrc) {
+               CERROR("bad EC or RRC\n");
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_get_mic_kerberos(struct gss_ctx *gctx,
+                          int msgcnt,
+                          rawobj_t *msgs,
+                          int iovcnt,
+                          lnet_kiov_t *iovs,
+                          rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 0);
+
+       /* checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+                              khdr, msgcnt, msgs, iovcnt, iovs, &cksum))
+               return GSS_S_FAILURE;
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size);
+       memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+
+       token->len = sizeof(*khdr) + ke->ke_hash_size;
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
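+
+/*
+ * Truncation note (an observation): both here and in
+ * gss_verify_mic_kerberos() only the trailing ke_hash_size bytes of the
+ * computed digest are kept and compared, so both sides of this
+ * implementation agree on the slice.
+ */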
+
+static
+__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx,
+                             int msgcnt,
+                             rawobj_t *msgs,
+                             int iovcnt,
+                             lnet_kiov_t *iovs,
+                             rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       __u32                major;
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 0);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       if (token->len < sizeof(*khdr) + ke->ke_hash_size) {
+               CERROR("short signature: %u, require %d\n",
+                      token->len, (int) sizeof(*khdr) + ke->ke_hash_size);
+               return GSS_S_FAILURE;
+       }
+
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc,
+                              khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) {
+               CERROR("failed to make checksum\n");
+               return GSS_S_FAILURE;
+       }
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               rawobj_free(&cksum);
+               return GSS_S_BAD_SIG;
+       }
+
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
+
+static
+int add_padding(rawobj_t *msg, int msg_buflen, int blocksize)
+{
+       int padding;
+
+       padding = (blocksize - (msg->len & (blocksize - 1))) &
+                 (blocksize - 1);
+       if (!padding)
+               return 0;
+
+       if (msg->len + padding > msg_buflen) {
+               CERROR("bufsize %u too small: datalen %u, padding %u\n",
+                       msg_buflen, msg->len, padding);
+               return -EINVAL;
+       }
+
+       memset(msg->data + msg->len, padding, padding);
+       msg->len += padding;
+       return 0;
+}
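+
+/*
+ * Worked example for add_padding() (illustrative): with blocksize 16 and
+ * msg->len 20, padding = (16 - (20 & 15)) & 15 = 12, so twelve 0x0c bytes
+ * are appended and msg->len becomes 32.  The bit-masking assumes blocksize
+ * is a power of two, which holds for every tfm used here.
+ */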
+
+static
+int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm,
+                        int mode_ecb,
+                        int inobj_cnt,
+                        rawobj_t *inobjs,
+                        rawobj_t *outobj,
+                        int enc)
+{
+       struct blkcipher_desc desc;
+       struct scatterlist    src, dst;
+       __u8                  local_iv[16] = {0}, *buf;
+       __u32                 datalen = 0;
+       int                   i, rc;
+       ENTRY;
+
+       buf = outobj->data;
+       desc.tfm  = tfm;
+       desc.info = local_iv;
+       desc.flags = 0;
+
+       for (i = 0; i < inobj_cnt; i++) {
+               LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len);
+
+               buf_to_sg(&src, inobjs[i].data, inobjs[i].len);
+               buf_to_sg(&dst, buf, outobj->len - datalen);
+
+               if (mode_ecb) {
+                       if (enc)
+                               rc = ll_crypto_blkcipher_encrypt(
+                                       &desc, &dst, &src, src.length);
+                       else
+                               rc = ll_crypto_blkcipher_decrypt(
+                                       &desc, &dst, &src, src.length);
+               } else {
+                       if (enc)
+                               rc = ll_crypto_blkcipher_encrypt_iv(
+                                       &desc, &dst, &src, src.length);
+                       else
+                               rc = ll_crypto_blkcipher_decrypt_iv(
+                                       &desc, &dst, &src, src.length);
+               }
+
+               if (rc) {
+                       CERROR("encrypt error %d\n", rc);
+                       RETURN(rc);
+               }
+
+               datalen += inobjs[i].len;
+               buf += inobjs[i].len;
+       }
+
+       outobj->len = datalen;
+       RETURN(0);
+}
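+
+/*
+ * A note on IV handling above (an observation): desc.info points at
+ * local_iv for the whole loop and the _iv encrypt/decrypt variants update
+ * it in place, so consecutive rawobjs are chained as one continuous CBC
+ * stream rather than each starting from a zero IV.
+ */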
+
+/*
+ * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size.
+ */
+static
+int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm,
+                     struct krb5_header *khdr,
+                     char *confounder,
+                     struct ptlrpc_bulk_desc *desc,
+                     rawobj_t *cipher,
+                     int adj_nob)
+{
+       struct blkcipher_desc   ciph_desc;
+       __u8                    local_iv[16] = {0};
+       struct scatterlist      src, dst;
+       int                     blocksize, i, rc, nob = 0;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+
+       blocksize = ll_crypto_blkcipher_blocksize(tfm);
+       LASSERT(blocksize > 1);
+       LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+       ciph_desc.tfm  = tfm;
+       ciph_desc.info = local_iv;
+       ciph_desc.flags = 0;
+
+       /* encrypt confounder */
+       buf_to_sg(&src, confounder, blocksize);
+       buf_to_sg(&dst, cipher->data, blocksize);
+
+       rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize);
+       if (rc) {
+               CERROR("error to encrypt confounder: %d\n", rc);
+               return rc;
+       }
+
+       /* encrypt clear pages */
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               sg_set_page(&src, desc->bd_iov[i].kiov_page,
+                           (desc->bd_iov[i].kiov_len + blocksize - 1) &
+                           (~(blocksize - 1)),
+                           desc->bd_iov[i].kiov_offset);
+               if (adj_nob)
+                       nob += src.length;
+               sg_set_page(&dst, desc->bd_enc_iov[i].kiov_page, src.length,
+                           src.offset);
+
+               desc->bd_enc_iov[i].kiov_offset = dst.offset;
+               desc->bd_enc_iov[i].kiov_len = dst.length;
+
+               rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src,
+                                                   src.length);
+               if (rc) {
+                       CERROR("error to encrypt page: %d\n", rc);
+                       return rc;
+               }
+       }
+
+       /* encrypt krb5 header */
+       buf_to_sg(&src, khdr, sizeof(*khdr));
+       buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+       rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc,
+                                           &dst, &src, sizeof(*khdr));
+       if (rc) {
+               CERROR("error to encrypt krb5 header: %d\n", rc);
+               return rc;
+       }
+
+       if (adj_nob)
+               desc->bd_nob = nob;
+
+       return 0;
+}
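+
+/*
+ * Resulting layout (an observation): cipher->data holds the encrypted
+ * confounder followed by the encrypted krb5 header, while the page payload
+ * lands in bd_enc_iov[]; local_iv is carried through all three passes, so
+ * the whole token is one continuous CBC stream.
+ */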
+
+/*
+ * desc->bd_nob_transferred is the size of the cipher text received.
+ * desc->bd_nob is the target size of the plain text supposed to be.
+ *
+ * if adj_nob != 0, we adjust each page's kiov_len to the actual
+ * plain text size.
+ * - for client read: we don't know the data size of each page, so
+ *   bd_iov[]->kiov_len is set to PAGE_SIZE, but the actual data received
+ *   might be smaller, so we need to adjust it according to
+ *   bd_enc_iov[]->kiov_len. this means we DO NOT support the situation
+ *   where the server sends odd-sized data in a page which is not the last
+ *   one.
+ * - for server write: we know the exact data size expected for each page,
+ *   thus kiov_len is already accurate and we should not adjust it at all.
+ *   bd_enc_iov[]->kiov_len should be round_up(bd_iov[]->kiov_len), which
+ *   should have been done by prep_bulk().
+ */
+static
+int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm,
+                     struct krb5_header *khdr,
+                     struct ptlrpc_bulk_desc *desc,
+                     rawobj_t *cipher,
+                     rawobj_t *plain,
+                     int adj_nob)
+{
+       struct blkcipher_desc   ciph_desc;
+       __u8                    local_iv[16] = {0};
+       struct scatterlist      src, dst;
+       int                     ct_nob = 0, pt_nob = 0;
+       int                     blocksize, i, rc;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+       LASSERT(desc->bd_nob_transferred);
+
+       blocksize = ll_crypto_blkcipher_blocksize(tfm);
+       LASSERT(blocksize > 1);
+       LASSERT(cipher->len == blocksize + sizeof(*khdr));
+
+       ciph_desc.tfm  = tfm;
+       ciph_desc.info = local_iv;
+       ciph_desc.flags = 0;
+
+       if (desc->bd_nob_transferred % blocksize) {
+               CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred);
+               return -EPROTO;
+       }
+
+       /* decrypt head (confounder) */
+       buf_to_sg(&src, cipher->data, blocksize);
+       buf_to_sg(&dst, plain->data, blocksize);
+
+       rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize);
+       if (rc) {
+               CERROR("error to decrypt confounder: %d\n", rc);
+               return rc;
+       }
+
+       for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred;
+            i++) {
+               if (desc->bd_enc_iov[i].kiov_offset % blocksize != 0 ||
+                   desc->bd_enc_iov[i].kiov_len % blocksize != 0) {
+                       CERROR("page %d: odd offset %u len %u, blocksize %d\n",
+                              i, desc->bd_enc_iov[i].kiov_offset,
+                              desc->bd_enc_iov[i].kiov_len, blocksize);
+                       return -EFAULT;
+               }
+
+               if (adj_nob) {
+                       if (ct_nob + desc->bd_enc_iov[i].kiov_len >
+                           desc->bd_nob_transferred)
+                               desc->bd_enc_iov[i].kiov_len =
+                                       desc->bd_nob_transferred - ct_nob;
+
+                       desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len;
+                       if (pt_nob + desc->bd_enc_iov[i].kiov_len > desc->bd_nob)
+                               desc->bd_iov[i].kiov_len = desc->bd_nob - pt_nob;
+               } else {
+                       /* this should be guaranteed by LNET */
+                       LASSERT(ct_nob + desc->bd_enc_iov[i].kiov_len <=
+                               desc->bd_nob_transferred);
+                       LASSERT(desc->bd_iov[i].kiov_len <=
+                               desc->bd_enc_iov[i].kiov_len);
+               }
+
+               if (desc->bd_enc_iov[i].kiov_len == 0)
+                       continue;
+
+               sg_set_page(&src, desc->bd_enc_iov[i].kiov_page,
+                           desc->bd_enc_iov[i].kiov_len,
+                           desc->bd_enc_iov[i].kiov_offset);
+               dst = src;
+               if (desc->bd_iov[i].kiov_len % blocksize == 0)
+                       sg_assign_page(&dst, desc->bd_iov[i].kiov_page);
+
+               rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src,
+                                                   src.length);
+               if (rc) {
+                       CERROR("error to decrypt page: %d\n", rc);
+                       return rc;
+               }
+
+               if (desc->bd_iov[i].kiov_len % blocksize != 0) {
+                       memcpy(page_address(desc->bd_iov[i].kiov_page) +
+                              desc->bd_iov[i].kiov_offset,
+                              page_address(desc->bd_enc_iov[i].kiov_page) +
+                              desc->bd_iov[i].kiov_offset,
+                              desc->bd_iov[i].kiov_len);
+               }
+
+               ct_nob += desc->bd_enc_iov[i].kiov_len;
+               pt_nob += desc->bd_iov[i].kiov_len;
+       }
+
+       if (unlikely(ct_nob != desc->bd_nob_transferred)) {
+               CERROR("%d cipher text transferred but only %d decrypted\n",
+                      desc->bd_nob_transferred, ct_nob);
+               return -EFAULT;
+       }
+
+       if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) {
+               CERROR("%d plain text expected but only %d received\n",
+                      desc->bd_nob, pt_nob);
+               return -EFAULT;
+       }
+
+       /* if needed, clear up the rest unused iovs */
+       if (adj_nob)
+               while (i < desc->bd_iov_count)
+                       desc->bd_iov[i++].kiov_len = 0;
+
+       /* decrypt tail (krb5 header) */
+       buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr));
+       buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr));
+
+       rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc,
+                                           &dst, &src, sizeof(*khdr));
+       if (rc) {
+               CERROR("error to decrypt tail: %d\n", rc);
+               return rc;
+       }
+
+       if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) {
+               CERROR("krb5 header doesn't match\n");
+               return -EACCES;
+       }
+
+       return 0;
+}
+
+static
+__u32 gss_wrap_kerberos(struct gss_ctx *gctx,
+                       rawobj_t *gsshdr,
+                       rawobj_t *msg,
+                       int msg_buflen,
+                       rawobj_t *token)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int                  blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             data_desc[3], cipher;
+       __u8                 conf[GSS_MAX_CIPHER_BLOCK];
+       int                  rc = 0;
+
+       LASSERT(ke);
+       LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+       LASSERT(kctx->kc_keye.kb_tfm == NULL ||
+               ke->ke_conf_size >=
+               ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm));
+
+       /*
+        * final token format:
+        * ---------------------------------------------------
+        * | krb5 header | cipher text | checksum (16 bytes) |
+        * ---------------------------------------------------
+        */
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 1);
+
+       /* generate confounder */
+       cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+       /* get the encryption blocksize. note kc_keye might not be associated
+        * with a tfm; currently that is only the case for arcfour-hmac */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+       LASSERT(blocksize <= ke->ke_conf_size);
+
+       /* padding the message */
+       if (add_padding(msg, msg_buflen, blocksize))
+               return GSS_S_FAILURE;
+
+       /*
+        * clear text layout for checksum:
+        * ------------------------------------------------------
+        * | confounder | gss header | clear msgs | krb5 header |
+        * ------------------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+       data_desc[1].data = gsshdr->data;
+       data_desc[1].len = gsshdr->len;
+       data_desc[2].data = msg->data;
+       data_desc[2].len = msg->len;
+
+       /* compute checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 3, data_desc, 0, NULL, &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       /*
+        * clear text layout for encryption:
+        * -----------------------------------------
+        * | confounder | clear msgs | krb5 header |
+        * -----------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+       data_desc[1].data = msg->data;
+       data_desc[1].len = msg->len;
+       data_desc[2].data = (__u8 *) khdr;
+       data_desc[2].len = sizeof(*khdr);
+
+       /* cipher text will be directly inplace */
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = token->len - sizeof(*khdr);
+       LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr));
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               rawobj_t                 arc4_keye;
+               struct ll_crypto_cipher *arc4_tfm;
+
+               if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+                                      NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+                       CERROR("failed to obtain arc4 enc key\n");
+                       GOTO(arc4_out, rc = -EACCES);
+               }
+
+               arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+               if (IS_ERR(arc4_tfm)) {
+                       CERROR("failed to alloc tfm arc4 in ECB mode\n");
+                       GOTO(arc4_out_key, rc = -EACCES);
+               }
+
+               if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data,
+                                              arc4_keye.len)) {
+                       CERROR("failed to set arc4 key, len %d\n",
+                              arc4_keye.len);
+                       GOTO(arc4_out_tfm, rc = -EACCES);
+               }
+
+               rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                         3, data_desc, &cipher, 1);
+arc4_out_tfm:
+               ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+               rawobj_free(&arc4_keye);
+arc4_out:
+               do {} while (0); /* just to avoid a compile warning */
+       } else {
+               rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                         3, data_desc, &cipher, 1);
+       }
+
+       if (rc != 0) {
+               rawobj_free(&cksum);
+               return GSS_S_FAILURE;
+       }
+
+       /* fill in checksum */
+       LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+       memcpy((char *)(khdr + 1) + cipher.len,
+              cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+       rawobj_free(&cksum);
+
+       /* final token length */
+       token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx,
+                            struct ptlrpc_bulk_desc *desc)
+{
+       struct krb5_ctx *kctx = gctx->internal_ctx_id;
+       int              blocksize, i;
+
+       LASSERT(desc->bd_iov_count);
+       LASSERT(desc->bd_enc_iov);
+       LASSERT(kctx->kc_keye.kb_tfm);
+
+       blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(desc->bd_enc_iov[i].kiov_page);
+               /*
+                * offset should always start at page boundary of either
+                * client or server side.
+                */
+               if (desc->bd_iov[i].kiov_offset & (blocksize - 1)) {
+                       CERROR("odd offset %d in page %d\n",
+                              desc->bd_iov[i].kiov_offset, i);
+                       return GSS_S_FAILURE;
+               }
+
+               desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset;
+               desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len +
+                                               blocksize - 1) & (~(blocksize - 1));
+       }
+
+       return GSS_S_COMPLETE;
+}
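+
+/*
+ * A note on the rounding above (an observation): kiov_len is rounded up to
+ * a multiple of blocksize so that krb5_encrypt_bulk()/krb5_decrypt_bulk()
+ * can run the block cipher over whole blocks per page; the mask assumes
+ * blocksize is a power of two.
+ */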
+
+static
+__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx,
+                            struct ptlrpc_bulk_desc *desc,
+                            rawobj_t *token, int adj_nob)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int                  blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             data_desc[1], cipher;
+       __u8                 conf[GSS_MAX_CIPHER_BLOCK];
+       int                  rc = 0;
+
+       LASSERT(ke);
+       LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK);
+
+       /*
+        * final token format:
+        * --------------------------------------------------
+        * | krb5 header | head/tail cipher text | checksum |
+        * --------------------------------------------------
+        */
+
+       /* fill krb5 header */
+       LASSERT(token->len >= sizeof(*khdr));
+       khdr = (struct krb5_header *) token->data;
+       fill_krb5_header(kctx, khdr, 1);
+
+       /* generate confounder */
+       cfs_get_random_bytes(conf, ke->ke_conf_size);
+
+       /* get the encryption blocksize. note kc_keye might not be associated
+        * with a tfm; currently that is only the case for arcfour-hmac */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+
+       /*
+        * we assume sizeof(krb5_header) (16 bytes) is a multiple of the
+        * blocksize, so the bulk token size is exactly sizeof(krb5_header) +
+        * blocksize + sizeof(krb5_header) + hashsize.
+        */
+       LASSERT(blocksize <= ke->ke_conf_size);
+       LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+       LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16);
+
+       /*
+        * clear text layout for checksum:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+
+       /* compute checksum */
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 1, data_desc,
+                              desc->bd_iov_count, desc->bd_iov,
+                              &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       /*
+        * clear text layout for encryption:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        *      |             |      |
+        *      ----------  (cipher pages)   |
+        * result token:   |               |
+        * -------------------------------------------
+        * | krb5 header | cipher text | cipher text |
+        * -------------------------------------------
+        */
+       data_desc[0].data = conf;
+       data_desc[0].len = ke->ke_conf_size;
+
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = blocksize + sizeof(*khdr);
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LBUG();
+               rc = 0;
+       } else {
+               rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                                      conf, desc, &cipher, adj_nob);
+       }
+
+       if (rc != 0) {
+               rawobj_free(&cksum);
+               return GSS_S_FAILURE;
+       }
+
+       /* fill in checksum */
+       LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size);
+       memcpy((char *)(khdr + 1) + cipher.len,
+              cksum.data + cksum.len - ke->ke_hash_size,
+              ke->ke_hash_size);
+       rawobj_free(&cksum);
+
+       /* final token length */
+       token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size;
+       return GSS_S_COMPLETE;
+}
+
+static
+__u32 gss_unwrap_kerberos(struct gss_ctx  *gctx,
+                         rawobj_t      *gsshdr,
+                         rawobj_t      *token,
+                         rawobj_t      *msg)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       unsigned char       *tmpbuf;
+       int                  blocksize, bodysize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             cipher_in, plain_out;
+       rawobj_t             hash_objs[3];
+       int                  rc = 0;
+       __u32                major;
+
+       LASSERT(ke);
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 1);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       /* block size */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
+
+       /* expected token layout:
+        * ----------------------------------------
+        * | krb5 header | cipher text | checksum |
+        * ----------------------------------------
+        */
+       bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size;
+
+       if (bodysize % blocksize) {
+               CERROR("odd bodysize %d\n", bodysize);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) {
+               CERROR("incomplete token: bodysize %d\n", bodysize);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) {
+               CERROR("buffer too small: %u, require %d\n",
+                      msg->len, bodysize - ke->ke_conf_size);
+               return GSS_S_FAILURE;
+       }
+
+       /* decrypting */
+       OBD_ALLOC_LARGE(tmpbuf, bodysize);
+       if (!tmpbuf)
+               return GSS_S_FAILURE;
+
+       major = GSS_S_FAILURE;
+
+       cipher_in.data = (__u8 *) (khdr + 1);
+       cipher_in.len = bodysize;
+       plain_out.data = tmpbuf;
+       plain_out.len = bodysize;
+
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               rawobj_t                 arc4_keye;
+               struct ll_crypto_cipher *arc4_tfm;
+
+               cksum.data = token->data + token->len - ke->ke_hash_size;
+               cksum.len = ke->ke_hash_size;
+
+               if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi,
+                                      NULL, 1, &cksum, 0, NULL, &arc4_keye)) {
+                       CERROR("failed to obtain arc4 enc key\n");
+                       GOTO(arc4_out, rc = -EACCES);
+               }
+
+               arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0);
+               if (IS_ERR(arc4_tfm)) {
+                       CERROR("failed to alloc tfm arc4 in ECB mode\n");
+                       GOTO(arc4_out_key, rc = -EACCES);
+               }
+
+               if (ll_crypto_blkcipher_setkey(arc4_tfm,
+                                        arc4_keye.data, arc4_keye.len)) {
+                       CERROR("failed to set arc4 key, len %d\n",
+                              arc4_keye.len);
+                       GOTO(arc4_out_tfm, rc = -EACCES);
+               }
+
+               rc = krb5_encrypt_rawobjs(arc4_tfm, 1,
+                                         1, &cipher_in, &plain_out, 0);
+arc4_out_tfm:
+               ll_crypto_free_blkcipher(arc4_tfm);
+arc4_out_key:
+               rawobj_free(&arc4_keye);
+arc4_out:
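+               /* cksum above only pointed into the token data, so reset
+                * it before the common checksum verification below
+                * allocates its own buffer */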
+               cksum = RAWOBJ_EMPTY;
+       } else {
+               rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0,
+                                         1, &cipher_in, &plain_out, 0);
+       }
+
+       if (rc != 0) {
+               CERROR("error decrypt\n");
+               goto out_free;
+       }
+       LASSERT(plain_out.len == bodysize);
+
+       /* expected clear text layout:
+        * -----------------------------------------
+        * | confounder | clear msgs | krb5 header |
+        * -----------------------------------------
+        */
+
+       /* verify krb5 header in token is not modified */
+       if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr),
+                  sizeof(*khdr))) {
+               CERROR("decrypted krb5 header mismatch\n");
+               goto out_free;
+       }
+
+       /* verify checksum, compose clear text as layout:
+        * ------------------------------------------------------
+        * | confounder | gss header | clear msgs | krb5 header |
+        * ------------------------------------------------------
+        */
+       hash_objs[0].len = ke->ke_conf_size;
+       hash_objs[0].data = plain_out.data;
+       hash_objs[1].len = gsshdr->len;
+       hash_objs[1].data = gsshdr->data;
+       hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr);
+       hash_objs[2].data = plain_out.data + ke->ke_conf_size;
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 3, hash_objs, 0, NULL, &cksum))
+               goto out_free;
+
+       LASSERT(cksum.len >= ke->ke_hash_size);
+       if (memcmp((char *)(khdr + 1) + bodysize,
+                  cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               goto out_free;
+       }
+
+       msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr);
+       memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len);
+
+       major = GSS_S_COMPLETE;
+out_free:
+       OBD_FREE_LARGE(tmpbuf, bodysize);
+       rawobj_free(&cksum);
+       return major;
+}
+
+static
+__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx,
+                              struct ptlrpc_bulk_desc *desc,
+                              rawobj_t *token, int adj_nob)
+{
+       struct krb5_ctx     *kctx = gctx->internal_ctx_id;
+       struct krb5_enctype *ke = &enctypes[kctx->kc_enctype];
+       struct krb5_header  *khdr;
+       int                  blocksize;
+       rawobj_t             cksum = RAWOBJ_EMPTY;
+       rawobj_t             cipher, plain;
+       rawobj_t             data_desc[1];
+       int                  rc;
+       __u32                major;
+
+       LASSERT(ke);
+
+       if (token->len < sizeof(*khdr)) {
+               CERROR("short signature: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       khdr = (struct krb5_header *) token->data;
+
+       major = verify_krb5_header(kctx, khdr, 1);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("bad krb5 header\n");
+               return major;
+       }
+
+       /* block size */
+       if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) {
+               LASSERT(kctx->kc_keye.kb_tfm == NULL);
+               blocksize = 1;
+               LBUG(); /* arcfour is never used for bulk data */
+       } else {
+               LASSERT(kctx->kc_keye.kb_tfm);
+               blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm);
+       }
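+       /* the krb5 header is carried as the trailing portion of the
+        * head/tail cipher text, so it must span a whole number of
+        * cipher blocks */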
+       LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0);
+
+       /*
+        * token format is expected as:
+        * -----------------------------------------------
+        * | krb5 header | head/tail cipher text | cksum |
+        * -----------------------------------------------
+        */
+       if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) +
+                        ke->ke_hash_size) {
+               CERROR("short token size: %u\n", token->len);
+               return GSS_S_DEFECTIVE_TOKEN;
+       }
+
+       cipher.data = (__u8 *) (khdr + 1);
+       cipher.len = blocksize + sizeof(*khdr);
+       plain.data = cipher.data;
+       plain.len = cipher.len;
+
+       rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr,
+                              desc, &cipher, &plain, adj_nob);
+       if (rc)
+               return GSS_S_DEFECTIVE_TOKEN;
+
+       /*
+        * verify checksum, compose clear text as layout:
+        * ------------------------------------------
+        * | confounder | clear pages | krb5 header |
+        * ------------------------------------------
+        */
+       data_desc[0].data = plain.data;
+       data_desc[0].len = blocksize;
+
+       if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi,
+                              khdr, 1, data_desc,
+                              desc->bd_iov_count, desc->bd_iov,
+                              &cksum))
+               return GSS_S_FAILURE;
+       LASSERT(cksum.len >= ke->ke_hash_size);
+
+       if (memcmp(plain.data + blocksize + sizeof(*khdr),
+                  cksum.data + cksum.len - ke->ke_hash_size,
+                  ke->ke_hash_size)) {
+               CERROR("checksum mismatch\n");
+               rawobj_free(&cksum);
+               return GSS_S_BAD_SIG;
+       }
+
+       rawobj_free(&cksum);
+       return GSS_S_COMPLETE;
+}
+
+int gss_display_kerberos(struct gss_ctx *ctx,
+                        char *buf,
+                        int bufsize)
+{
+       struct krb5_ctx *kctx = ctx->internal_ctx_id;
+       int written;
+
+       written = snprintf(buf, bufsize, "krb5 (%s)",
+                          enctype2str(kctx->kc_enctype));
+       return written;
+}
+
+static struct gss_api_ops gss_kerberos_ops = {
+       .gss_import_sec_context   = gss_import_sec_context_kerberos,
+       .gss_copy_reverse_context = gss_copy_reverse_context_kerberos,
+       .gss_inquire_context      = gss_inquire_context_kerberos,
+       .gss_get_mic              = gss_get_mic_kerberos,
+       .gss_verify_mic           = gss_verify_mic_kerberos,
+       .gss_wrap                 = gss_wrap_kerberos,
+       .gss_unwrap               = gss_unwrap_kerberos,
+       .gss_prep_bulk            = gss_prep_bulk_kerberos,
+       .gss_wrap_bulk            = gss_wrap_bulk_kerberos,
+       .gss_unwrap_bulk          = gss_unwrap_bulk_kerberos,
+       .gss_delete_sec_context   = gss_delete_sec_context_kerberos,
+       .gss_display              = gss_display_kerberos,
+};
+
+static struct subflavor_desc gss_kerberos_sfs[] = {
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5N,
+               .sf_qop         = 0,
+               .sf_service     = SPTLRPC_SVC_NULL,
+               .sf_name        = "krb5n"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5A,
+               .sf_qop         = 0,
+               .sf_service     = SPTLRPC_SVC_AUTH,
+               .sf_name        = "krb5a"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5I,
+               .sf_qop         = 0,
+               .sf_service     = SPTLRPC_SVC_INTG,
+               .sf_name        = "krb5i"
+       },
+       {
+               .sf_subflavor   = SPTLRPC_SUBFLVR_KRB5P,
+               .sf_qop         = 0,
+               .sf_service     = SPTLRPC_SVC_PRIV,
+               .sf_name        = "krb5p"
+       },
+};
+
+/*
+ * currently we leave module owner NULL
+ */
+static struct gss_api_mech gss_kerberos_mech = {
+       .gm_owner       = NULL, /*THIS_MODULE, */
+       .gm_name        = "krb5",
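+       /* DER body of the Kerberos 5 mechanism OID 1.2.840.113554.1.2.2 */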
+       .gm_oid         = (rawobj_t)
+                         {9, "\052\206\110\206\367\022\001\002\002"},
+       .gm_ops         = &gss_kerberos_ops,
+       .gm_sf_num      = 4,
+       .gm_sfs         = gss_kerberos_sfs,
+};
+
+int __init init_kerberos_module(void)
+{
+       int status;
+
+       spin_lock_init(&krb5_seq_lock);
+
+       status = lgss_mech_register(&gss_kerberos_mech);
+       if (status)
+               CERROR("Failed to register kerberos gss mechanism!\n");
+       return status;
+}
+
+void __exit cleanup_kerberos_module(void)
+{
+       lgss_mech_unregister(&gss_kerberos_mech);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c
new file mode 100644 (file)
index 0000000..8cdad80
--- /dev/null
@@ -0,0 +1,359 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static LIST_HEAD(registered_mechs);
+static DEFINE_SPINLOCK(registered_mechs_lock);
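+/* every registered mechanism sits on this list under the spinlock;
+ * the lookup helpers take a module reference before returning a mech */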
+
+int lgss_mech_register(struct gss_api_mech *gm)
+{
+       spin_lock(&registered_mechs_lock);
+       list_add(&gm->gm_list, &registered_mechs);
+       spin_unlock(&registered_mechs_lock);
+       CWARN("Register %s mechanism\n", gm->gm_name);
+       return 0;
+}
+
+void lgss_mech_unregister(struct gss_api_mech *gm)
+{
+       spin_lock(&registered_mechs_lock);
+       list_del(&gm->gm_list);
+       spin_unlock(&registered_mechs_lock);
+       CWARN("Unregister %s mechanism\n", gm->gm_name);
+}
+
+struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm)
+{
+       __module_get(gm->gm_owner);
+       return gm;
+}
+
+struct gss_api_mech *lgss_name_to_mech(char *name)
+{
+       struct gss_api_mech *pos, *gm = NULL;
+
+       spin_lock(&registered_mechs_lock);
+       list_for_each_entry(pos, &registered_mechs, gm_list) {
+               if (0 == strcmp(name, pos->gm_name)) {
+                       if (!try_module_get(pos->gm_owner))
+                               continue;
+                       gm = pos;
+                       break;
+               }
+       }
+       spin_unlock(&registered_mechs_lock);
+       return gm;
+}
+
+static inline
+int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+       int i;
+
+       for (i = 0; i < gm->gm_sf_num; i++) {
+               if (gm->gm_sfs[i].sf_subflavor == subflavor)
+                       return 1;
+       }
+       return 0;
+}
+
+struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor)
+{
+       struct gss_api_mech *pos, *gm = NULL;
+
+       spin_lock(&registered_mechs_lock);
+       list_for_each_entry(pos, &registered_mechs, gm_list) {
+               if (!try_module_get(pos->gm_owner))
+                       continue;
+               if (!mech_supports_subflavor(pos, subflavor)) {
+                       module_put(pos->gm_owner);
+                       continue;
+               }
+               gm = pos;
+               break;
+       }
+       spin_unlock(&registered_mechs_lock);
+       return gm;
+}
+
+void lgss_mech_put(struct gss_api_mech *gm)
+{
+       module_put(gm->gm_owner);
+}
+
+/* The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in. */
+__u32 lgss_import_sec_context(rawobj_t *input_token,
+                             struct gss_api_mech *mech,
+                             struct gss_ctx **ctx_id)
+{
+       LASSERT(mech);
+       LASSERT(mech->gm_ops);
+       LASSERT(mech->gm_ops->gss_import_sec_context);
+
+       OBD_ALLOC_PTR(*ctx_id);
+       if (*ctx_id == NULL)
+               return GSS_S_FAILURE;
+
+       (*ctx_id)->mech_type = lgss_mech_get(mech);
+
+       return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id,
+                               struct gss_ctx **ctx_id_new)
+{
+       struct gss_api_mech *mech = ctx_id->mech_type;
+       __u32           major;
+
+       LASSERT(mech);
+
+       OBD_ALLOC_PTR(*ctx_id_new);
+       if (*ctx_id_new == NULL)
+               return GSS_S_FAILURE;
+
+       (*ctx_id_new)->mech_type = lgss_mech_get(mech);
+
+       LASSERT(mech->gm_ops);
+       LASSERT(mech->gm_ops->gss_copy_reverse_context);
+
+       major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new);
+       if (major != GSS_S_COMPLETE) {
+               lgss_mech_put(mech);
+               OBD_FREE_PTR(*ctx_id_new);
+               *ctx_id_new = NULL;
+       }
+       return major;
+}
+
+/*
+ * this interface is much simplified, currently we only need endtime.
+ */
+__u32 lgss_inquire_context(struct gss_ctx *context_handle,
+                          unsigned long  *endtime)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_inquire_context(context_handle,
+                                     endtime);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+__u32 lgss_get_mic(struct gss_ctx *context_handle,
+                  int msgcnt,
+                  rawobj_t *msg,
+                  int iovcnt,
+                  lnet_kiov_t *iovs,
+                  rawobj_t *mic_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_get_mic(context_handle,
+                             msgcnt,
+                             msg,
+                             iovcnt,
+                             iovs,
+                             mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+__u32 lgss_verify_mic(struct gss_ctx *context_handle,
+                     int msgcnt,
+                     rawobj_t *msg,
+                     int iovcnt,
+                     lnet_kiov_t *iovs,
+                     rawobj_t *mic_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_verify_mic(context_handle,
+                                msgcnt,
+                                msg,
+                                iovcnt,
+                                iovs,
+                                mic_token);
+}
+
+__u32 lgss_wrap(struct gss_ctx *context_handle,
+               rawobj_t *gsshdr,
+               rawobj_t *msg,
+               int msg_buflen,
+               rawobj_t *out_token)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token);
+}
+
+__u32 lgss_unwrap(struct gss_ctx *context_handle,
+                 rawobj_t *gsshdr,
+                 rawobj_t *token,
+                 rawobj_t *out_msg)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_unwrap(context_handle, gsshdr, token, out_msg);
+}
+
+__u32 lgss_prep_bulk(struct gss_ctx *context_handle,
+                    struct ptlrpc_bulk_desc *desc)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_prep_bulk(context_handle, desc);
+}
+
+__u32 lgss_wrap_bulk(struct gss_ctx *context_handle,
+                    struct ptlrpc_bulk_desc *desc,
+                    rawobj_t *token,
+                    int adj_nob)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_wrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle,
+                      struct ptlrpc_bulk_desc *desc,
+                      rawobj_t *token,
+                      int adj_nob)
+{
+       LASSERT(context_handle);
+       LASSERT(context_handle->mech_type);
+       LASSERT(context_handle->mech_type->gm_ops);
+       LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk);
+
+       return context_handle->mech_type->gm_ops
+               ->gss_unwrap_bulk(context_handle, desc, token, adj_nob);
+}
+
+/* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+
+__u32 lgss_delete_sec_context(struct gss_ctx **context_handle)
+{
+       struct gss_api_mech *mech;
+
+       CDEBUG(D_SEC, "deleting %p\n", *context_handle);
+
+       if (!*context_handle)
+               return GSS_S_NO_CONTEXT;
+
+       mech = (*context_handle)->mech_type;
+       if ((*context_handle)->internal_ctx_id != 0) {
+               LASSERT(mech);
+               LASSERT(mech->gm_ops);
+               LASSERT(mech->gm_ops->gss_delete_sec_context);
+               mech->gm_ops->gss_delete_sec_context(
+                                       (*context_handle)->internal_ctx_id);
+       }
+       if (mech)
+               lgss_mech_put(mech);
+
+       OBD_FREE_PTR(*context_handle);
+       *context_handle = NULL;
+       return GSS_S_COMPLETE;
+}
+
+int lgss_display(struct gss_ctx *ctx,
+                char      *buf,
+                int         bufsize)
+{
+       LASSERT(ctx);
+       LASSERT(ctx->mech_type);
+       LASSERT(ctx->mech_type->gm_ops);
+       LASSERT(ctx->mech_type->gm_ops->gss_display);
+
+       return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c
new file mode 100644 (file)
index 0000000..3df7257
--- /dev/null
@@ -0,0 +1,1252 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/crypto.h>
+#include <linux/atomic.h>
+struct rpc_clnt; /* for rpc_pipefs */
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_sec.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct ptlrpc_sec_policy gss_policy_pipefs;
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops;
+
+static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx);
+
+static int gss_sec_pipe_upcall_init(struct gss_sec *gsec)
+{
+       return 0;
+}
+
+static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec)
+{
+}
+
+/****************************************
+ * internal context helpers             *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec,
+                                    struct vfs_cred *vcred)
+{
+       struct gss_cli_ctx *gctx;
+       int              rc;
+
+       OBD_ALLOC_PTR(gctx);
+       if (gctx == NULL)
+               return NULL;
+
+       rc = gss_cli_ctx_init_common(sec, &gctx->gc_base,
+                                    &gss_pipefs_ctxops, vcred);
+       if (rc) {
+               OBD_FREE_PTR(gctx);
+               return NULL;
+       }
+
+       return &gctx->gc_base;
+}
+
+static
+void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+       if (gss_cli_ctx_fini_common(sec, ctx))
+               return;
+
+       OBD_FREE_PTR(gctx);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static
+void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash)
+{
+       set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+       atomic_inc(&ctx->cc_refcount);
+       hlist_add_head(&ctx->cc_cache, hash);
+}
+
+/*
+ * caller must hold the sec's ps_lock; a context whose last reference
+ * drops here is parked on @freelist so it can be destroyed after the
+ * lock is released
+ */
+static
+void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist)
+{
+       LASSERT(spin_is_locked(&ctx->cc_sec->ps_lock));
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+       LASSERT(!hlist_unhashed(&ctx->cc_cache));
+
+       clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags);
+
+       if (atomic_dec_and_test(&ctx->cc_refcount)) {
+               __hlist_del(&ctx->cc_cache);
+               hlist_add_head(&ctx->cc_cache, freelist);
+       } else {
+               hlist_del_init(&ctx->cc_cache);
+       }
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+static
+int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx,
+                      struct hlist_head *freelist)
+{
+       if (cli_ctx_check_death(ctx)) {
+               if (freelist)
+                       ctx_unhash_pf(ctx, freelist);
+               return 1;
+       }
+
+       return 0;
+}
+
+static inline
+int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx,
+                             struct hlist_head *freelist)
+{
+       LASSERT(ctx->cc_sec);
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags));
+
+       return ctx_check_death_pf(ctx, freelist);
+}
+
+static inline
+int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+       /* a small optimization: policies without a match method always match */
+       if (!ctx->cc_ops->match)
+               return 1;
+
+       return ctx->cc_ops->match(ctx, vcred);
+}
+
+static
+void ctx_list_destroy_pf(struct hlist_head *head)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       while (!hlist_empty(head)) {
+               ctx = hlist_entry(head->first, struct ptlrpc_cli_ctx,
+                                     cc_cache);
+
+               LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+               LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT,
+                                    &ctx->cc_flags) == 0);
+
+               hlist_del_init(&ctx->cc_cache);
+               ctx_destroy_pf(ctx->cc_sec, ctx);
+       }
+}
+
+/****************************************
+ * context apis                         *
+ ****************************************/
+
+static
+int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       if (ctx_check_death_pf(ctx, NULL))
+               return 1;
+       if (cli_ctx_is_ready(ctx))
+               return 0;
+       return 1;
+}
+
+static
+void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace)
+{
+       LASSERT(ctx->cc_sec);
+       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+       cli_ctx_expire(ctx);
+
+       spin_lock(&ctx->cc_sec->ps_lock);
+
+       if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) {
+               LASSERT(!hlist_unhashed(&ctx->cc_cache));
+               LASSERT(atomic_read(&ctx->cc_refcount) > 1);
+
+               hlist_del_init(&ctx->cc_cache);
+               if (atomic_dec_and_test(&ctx->cc_refcount))
+                       LBUG();
+       }
+
+       spin_unlock(&ctx->cc_sec->ps_lock);
+}
+
+/****************************************
+ * reverse context installation         *
+ ****************************************/
+
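+/* note: assumes hashsize is a power of two, since the bucket index is
+ * derived by masking rather than by a modulo */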
+static inline
+unsigned int ctx_hash_index(int hashsize, __u64 key)
+{
+       return (unsigned int) (key & ((__u64) hashsize - 1));
+}
+
+static
+void gss_sec_ctx_replace_pf(struct gss_sec *gsec,
+                           struct ptlrpc_cli_ctx *new)
+{
+       struct gss_sec_pipefs *gsec_pf;
+       struct ptlrpc_cli_ctx *ctx;
+       struct hlist_node     *next;
+       HLIST_HEAD(freelist);
+       unsigned int hash;
+       ENTRY;
+
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+                             (__u64) new->cc_vcred.vc_uid);
+       LASSERT(hash < gsec_pf->gsp_chash_size);
+
+       spin_lock(&gsec->gs_base.ps_lock);
+
+       hlist_for_each_entry_safe(ctx, next,
+                                     &gsec_pf->gsp_chash[hash], cc_cache) {
+               if (!ctx_match_pf(ctx, &new->cc_vcred))
+                       continue;
+
+               cli_ctx_expire(ctx);
+               ctx_unhash_pf(ctx, &freelist);
+               break;
+       }
+
+       ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]);
+
+       spin_unlock(&gsec->gs_base.ps_lock);
+
+       ctx_list_destroy_pf(&freelist);
+       EXIT;
+}
+
+static
+int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec,
+                              struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct vfs_cred          vcred;
+       struct ptlrpc_cli_ctx   *cli_ctx;
+       int                      rc;
+       ENTRY;
+
+       vcred.vc_uid = 0;
+       vcred.vc_gid = 0;
+
+       cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred);
+       if (!cli_ctx)
+               RETURN(-ENOMEM);
+
+       rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx);
+       if (rc) {
+               ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx);
+               RETURN(rc);
+       }
+
+       gss_sec_ctx_replace_pf(gsec, cli_ctx);
+       RETURN(0);
+}
+
+static
+void gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf,
+                        struct hlist_head *freelist)
+{
+       struct ptlrpc_sec       *sec;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct hlist_node       *next;
+       int i;
+       ENTRY;
+
+       sec = &gsec_pf->gsp_base.gs_base;
+
+       CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+               hlist_for_each_entry_safe(ctx, next,
+                                             &gsec_pf->gsp_chash[i], cc_cache)
+                       ctx_check_death_locked_pf(ctx, freelist);
+       }
+
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+       EXIT;
+}
+
+static
+struct ptlrpc_sec *gss_sec_create_pf(struct obd_import *imp,
+                                    struct ptlrpc_svc_ctx *ctx,
+                                    struct sptlrpc_flavor *sf)
+{
+       struct gss_sec_pipefs   *gsec_pf;
+       int                      alloc_size, hash_size, i;
+       ENTRY;
+
+#define GSS_SEC_PIPEFS_CTX_HASH_SIZE    (32)
+
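+       /* a reverse or root-only sec only ever caches a single (root)
+        * context, so one hash bucket is enough for it */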
+       if (ctx ||
+           sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE))
+               hash_size = 1;
+       else
+               hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE;
+
+       alloc_size = sizeof(*gsec_pf) +
+                    sizeof(struct hlist_head) * hash_size;
+
+       OBD_ALLOC(gsec_pf, alloc_size);
+       if (!gsec_pf)
+               RETURN(NULL);
+
+       gsec_pf->gsp_chash_size = hash_size;
+       for (i = 0; i < hash_size; i++)
+               INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]);
+
+       if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs,
+                                 imp, ctx, sf))
+               goto err_free;
+
+       if (ctx == NULL) {
+               if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base))
+                       goto err_destroy;
+       } else {
+               if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx))
+                       goto err_destroy;
+       }
+
+       RETURN(&gsec_pf->gsp_base.gs_base);
+
+err_destroy:
+       gss_sec_destroy_common(&gsec_pf->gsp_base);
+err_free:
+       OBD_FREE(gsec_pf, alloc_size);
+       RETURN(NULL);
+}
+
+static
+void gss_sec_destroy_pf(struct ptlrpc_sec *sec)
+{
+       struct gss_sec_pipefs   *gsec_pf;
+       struct gss_sec          *gsec;
+
+       CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec);
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       LASSERT(gsec_pf->gsp_chash);
+       LASSERT(gsec_pf->gsp_chash_size);
+
+       gss_sec_pipe_upcall_fini(gsec);
+
+       gss_sec_destroy_common(gsec);
+
+       OBD_FREE(gsec, sizeof(*gsec_pf) +
+                      sizeof(struct hlist_head) * gsec_pf->gsp_chash_size);
+}
+
+static
+struct ptlrpc_cli_ctx *gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec,
+                                             struct vfs_cred *vcred,
+                                             int create, int remove_dead)
+{
+       struct gss_sec          *gsec;
+       struct gss_sec_pipefs   *gsec_pf;
+       struct ptlrpc_cli_ctx   *ctx = NULL, *new = NULL;
+       struct hlist_head       *hash_head;
+       struct hlist_node       *next;
+       HLIST_HEAD(freelist);
+       unsigned int             hash, gc = 0, found = 0;
+       ENTRY;
+
+       might_sleep();
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       hash = ctx_hash_index(gsec_pf->gsp_chash_size,
+                             (__u64) vcred->vc_uid);
+       hash_head = &gsec_pf->gsp_chash[hash];
+       LASSERT(hash < gsec_pf->gsp_chash_size);
+
+retry:
+       spin_lock(&sec->ps_lock);
+
+       /* gc_next == 0 means never do gc */
+       if (remove_dead && sec->ps_gc_next &&
+           cfs_time_after(cfs_time_current_sec(), sec->ps_gc_next)) {
+               gss_ctx_cache_gc_pf(gsec_pf, &freelist);
+               gc = 1;
+       }
+
+       hlist_for_each_entry_safe(ctx, next, hash_head, cc_cache) {
+               if (gc == 0 &&
+                   ctx_check_death_locked_pf(ctx,
+                                             remove_dead ? &freelist : NULL))
+                       continue;
+
+               if (ctx_match_pf(ctx, vcred)) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (found) {
+               if (new && new != ctx) {
+                       /* lost the race, just free it */
+                       hlist_add_head(&new->cc_cache, &freelist);
+                       new = NULL;
+               }
+
+               /* hot node, move to head */
+               if (hash_head->first != &ctx->cc_cache) {
+                       __hlist_del(&ctx->cc_cache);
+                       hlist_add_head(&ctx->cc_cache, hash_head);
+               }
+       } else {
+               /* don't allocate for reverse sec */
+               if (sec_is_reverse(sec)) {
+                       spin_unlock(&sec->ps_lock);
+                       RETURN(NULL);
+               }
+
+               if (new) {
+                       ctx_enhash_pf(new, hash_head);
+                       ctx = new;
+               } else if (create) {
+                       spin_unlock(&sec->ps_lock);
+                       new = ctx_create_pf(sec, vcred);
+                       if (new) {
+                               clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags);
+                               goto retry;
+                       }
+               } else {
+                       ctx = NULL;
+               }
+       }
+
+       /* hold a ref */
+       if (ctx)
+               atomic_inc(&ctx->cc_refcount);
+
+       spin_unlock(&sec->ps_lock);
+
+       /* the allocator of the context must give the first push to refresh */
+       if (new) {
+               LASSERT(new == ctx);
+               gss_cli_ctx_refresh_pf(new);
+       }
+
+       ctx_list_destroy_pf(&freelist);
+       RETURN(ctx);
+}
+
+static
+void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           int sync)
+{
+       LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0);
+       LASSERT(hlist_unhashed(&ctx->cc_cache));
+
+       /* if async is required, we must clear the UPTODATE bit to
+        * prevent extra rpcs during the destroy procedure. */
+       if (!sync)
+               clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+       /* destroy this context */
+       ctx_destroy_pf(sec, ctx);
+}
+
+/*
+ * @uid: which user. "-1" means flush all.
+ * @grace: mark contexts DEAD, allowing a graceful destroy (e.g.
+ *      notifying the server side).
+ * @force: also flush busy entries.
+ *
+ * return the number of busy contexts encountered.
+ *
+ * In any case, never touch "eternal" contexts.
+ */
+static
+int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec,
+                              uid_t uid,
+                              int grace, int force)
+{
+       struct gss_sec          *gsec;
+       struct gss_sec_pipefs   *gsec_pf;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct hlist_node       *next;
+       HLIST_HEAD(freelist);
+       int i, busy = 0;
+       ENTRY;
+
+       might_sleep_if(grace);
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base);
+
+       spin_lock(&sec->ps_lock);
+       for (i = 0; i < gsec_pf->gsp_chash_size; i++) {
+               hlist_for_each_entry_safe(ctx, next,
+                                             &gsec_pf->gsp_chash[i],
+                                             cc_cache) {
+                       LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+
+                       if (uid != -1 && uid != ctx->cc_vcred.vc_uid)
+                               continue;
+
+                       if (atomic_read(&ctx->cc_refcount) > 1) {
+                               busy++;
+                               if (!force)
+                                       continue;
+
+                               CWARN("flush busy(%d) ctx %p(%u->%s) by force, "
+                                     "grace %d\n",
+                                     atomic_read(&ctx->cc_refcount),
+                                     ctx, ctx->cc_vcred.vc_uid,
+                                     sec2target_str(ctx->cc_sec), grace);
+                       }
+                       ctx_unhash_pf(ctx, &freelist);
+
+                       set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags);
+                       if (!grace)
+                               clear_bit(PTLRPC_CTX_UPTODATE_BIT,
+                                         &ctx->cc_flags);
+               }
+       }
+       spin_unlock(&sec->ps_lock);
+
+       ctx_list_destroy_pf(&freelist);
+       RETURN(busy);
+}
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
+static
+int gss_svc_accept_pf(struct ptlrpc_request *req)
+{
+       return gss_svc_accept(&gss_policy_pipefs, req);
+}
+
+static
+int gss_svc_install_rctx_pf(struct obd_import *imp,
+                           struct ptlrpc_svc_ctx *ctx)
+{
+       struct ptlrpc_sec *sec;
+       int             rc;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       LASSERT(sec);
+       rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx);
+
+       sptlrpc_sec_put(sec);
+       return rc;
+}
+
+/****************************************
+ * rpc_pipefs definitions               *
+ ****************************************/
+
+#define LUSTRE_PIPE_ROOT       "/lustre"
+#define LUSTRE_PIPE_KRB5       LUSTRE_PIPE_ROOT"/krb5"
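+
+/* both entries live under the rpc_pipefs mount point (commonly
+ * /var/lib/nfs/rpc_pipefs, though the location may vary), so the krb5
+ * upcall pipe appears as <rpc_pipefs>/lustre/krb5 */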
+
+struct gss_upcall_msg_data {
+       __u32                   gum_seq;
+       __u32                   gum_uid;
+       __u32                   gum_gid;
+       __u32                   gum_svc;        /* MDS/OSS... */
+       __u64                   gum_nid;        /* peer NID */
+       __u8                    gum_obd[64];    /* client obd name */
+};
+
+struct gss_upcall_msg {
+       struct rpc_pipe_msg             gum_base;
+       atomic_t                        gum_refcount;
+       struct list_head                gum_list;
+       __u32                           gum_mechidx;
+       struct gss_sec                  *gum_gsec;
+       struct gss_cli_ctx              *gum_gctx;
+       struct gss_upcall_msg_data      gum_data;
+};
+
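+/* each upcall message carries a unique sequence number, used to pair
+ * the eventual downcall from lgssd with the waiting upcall */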
+static atomic_t upcall_seq = ATOMIC_INIT(0);
+
+static inline
+__u32 upcall_get_sequence(void)
+{
+       return (__u32) atomic_inc_return(&upcall_seq);
+}
+
+enum mech_idx_t {
+       MECH_KRB5   = 0,
+       MECH_MAX
+};
+
+static inline
+__u32 mech_name2idx(const char *name)
+{
+       LASSERT(!strcmp(name, "krb5"));
+       return MECH_KRB5;
+}
+
+/* pipefs dentry for each mechanism */
+static struct dentry *de_pipes[MECH_MAX] = { NULL, };
+/* all upcall messages are linked here */
+static struct list_head upcall_lists[MECH_MAX];
+/* and protected by this */
+static spinlock_t upcall_locks[MECH_MAX];
+
+static inline
+void upcall_list_lock(int idx)
+{
+       spin_lock(&upcall_locks[idx]);
+}
+
+static inline
+void upcall_list_unlock(int idx)
+{
+       spin_unlock(&upcall_locks[idx]);
+}
+
+static
+void upcall_msg_enlist(struct gss_upcall_msg *msg)
+{
+       __u32 idx = msg->gum_mechidx;
+
+       upcall_list_lock(idx);
+       list_add(&msg->gum_list, &upcall_lists[idx]);
+       upcall_list_unlock(idx);
+}
+
+static
+void upcall_msg_delist(struct gss_upcall_msg *msg)
+{
+       __u32 idx = msg->gum_mechidx;
+
+       upcall_list_lock(idx);
+       list_del_init(&msg->gum_list);
+       upcall_list_unlock(idx);
+}
+
+/****************************************
+ * rpc_pipefs upcall helpers            *
+ ****************************************/
+
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+       ENTRY;
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+       if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+               EXIT;
+               return;
+       }
+
+       if (gmsg->gum_gctx) {
+               sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base);
+               sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1);
+               gmsg->gum_gctx = NULL;
+       }
+
+       LASSERT(list_empty(&gmsg->gum_list));
+       LASSERT(list_empty(&gmsg->gum_base.list));
+       OBD_FREE_PTR(gmsg);
+       EXIT;
+}
+
+static
+void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+       __u32 idx = gmsg->gum_mechidx;
+
+       LASSERT(idx < MECH_MAX);
+       LASSERT(spin_is_locked(&upcall_locks[idx]));
+
+       if (list_empty(&gmsg->gum_list))
+               return;
+
+       list_del_init(&gmsg->gum_list);
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 1);
+       atomic_dec(&gmsg->gum_refcount);
+}
+
+static
+void gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+       __u32 idx = gmsg->gum_mechidx;
+
+       LASSERT(idx < MECH_MAX);
+       upcall_list_lock(idx);
+       gss_unhash_msg_nolock(gmsg);
+       upcall_list_unlock(idx);
+}
+
+static
+void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg)
+{
+       if (gmsg->gum_gctx) {
+               struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base;
+
+               LASSERT(atomic_read(&ctx->cc_refcount) > 0);
+               sptlrpc_cli_ctx_expire(ctx);
+               set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+       }
+}
+
+static
+struct gss_upcall_msg *gss_find_upcall(__u32 mechidx, __u32 seq)
+{
+       struct gss_upcall_msg *gmsg;
+
+       upcall_list_lock(mechidx);
+       list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) {
+               if (gmsg->gum_data.gum_seq != seq)
+                       continue;
+
+               LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+               LASSERT(gmsg->gum_mechidx == mechidx);
+
+               atomic_inc(&gmsg->gum_refcount);
+               upcall_list_unlock(mechidx);
+               return gmsg;
+       }
+       upcall_list_unlock(mechidx);
+       return NULL;
+}
+
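+/* pull a fixed-size field off the front of the downcall data buffer,
+ * advancing the cursor and shrinking the remaining length */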
+static
+int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+       if (*buflen < reslen) {
+               CERROR("buflen %u < %u\n", *buflen, reslen);
+               return -EINVAL;
+       }
+
+       memcpy(res, *buf, reslen);
+       *buf += reslen;
+       *buflen -= reslen;
+       return 0;
+}
+
+/****************************************
+ * rpc_pipefs apis                      *
+ ****************************************/
+
+static
+ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+                       char *dst, size_t buflen)
+{
+       char *data = (char *)msg->data + msg->copied;
+       ssize_t mlen = msg->len;
+       ssize_t left;
+       ENTRY;
+
+       if (mlen > buflen)
+               mlen = buflen;
+       /* note: copy_to_user() returns the number of bytes that could
+        * not be copied, never a negative value */
+       left = copy_to_user(dst, data, mlen);
+       if (left) {
+               msg->errno = -EFAULT;
+               RETURN(-EFAULT);
+       }
+       msg->copied += mlen;
+       msg->errno = 0;
+       RETURN(mlen);
+}
+
+static
+ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+       struct rpc_inode        *rpci = RPC_I(filp->f_dentry->d_inode);
+       struct gss_upcall_msg   *gss_msg;
+       struct ptlrpc_cli_ctx   *ctx;
+       struct gss_cli_ctx      *gctx = NULL;
+       char                    *buf, *data;
+       int                      datalen;
+       int                      timeout, rc;
+       __u32                    mechidx, seq, gss_err;
+       ENTRY;
+
+       mechidx = (__u32) (long) rpci->private;
+       LASSERT(mechidx < MECH_MAX);
+
+       OBD_ALLOC(buf, mlen);
+       if (!buf)
+               RETURN(-ENOMEM);
+
+       if (copy_from_user(buf, src, mlen)) {
+               CERROR("failed copy user space data\n");
+               GOTO(out_free, rc = -EFAULT);
+       }
+       data = buf;
+       datalen = mlen;
+
+       /* data passed down format:
+        *  - seq
+        *  - timeout
+        *  - gc_win / error
+        *  - wire_ctx (rawobj)
+        *  - mech_ctx (rawobj)
+        */
+       if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) {
+               CERROR("fail to get seq\n");
+               GOTO(out_free, rc = -EFAULT);
+       }
+
+       gss_msg = gss_find_upcall(mechidx, seq);
+       if (!gss_msg) {
+               CERROR("upcall %u has aborted earlier\n", seq);
+               GOTO(out_free, rc = -EINVAL);
+       }
+
+       gss_unhash_msg(gss_msg);
+       gctx = gss_msg->gum_gctx;
+       LASSERT(gctx);
+       LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0);
+
+       /* timeout is not in use for now */
+       if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout)))
+               GOTO(out_msg, rc = -EFAULT);
+
+       /* lgssd signals an error via gc_win == 0 */
+       if (simple_get_bytes(&data, &datalen, &gctx->gc_win,
+                            sizeof(gctx->gc_win)))
+               GOTO(out_msg, rc = -EFAULT);
+
+       if (gctx->gc_win == 0) {
+               /* followed by:
+                * - rpc error
+                * - gss error
+                */
+               if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc)))
+                       GOTO(out_msg, rc = -EFAULT);
+               if (simple_get_bytes(&data, &datalen, &gss_err,
+                                    sizeof(gss_err)))
+                       GOTO(out_msg, rc = -EFAULT);
+
+               if (rc == 0 && gss_err == GSS_S_COMPLETE) {
+                       CWARN("both rpc & gss error code not set\n");
+                       rc = -EPERM;
+               }
+       } else {
+               rawobj_t tmpobj;
+
+               /* handle */
+               if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+                       GOTO(out_msg, rc = -EFAULT);
+               if (rawobj_dup(&gctx->gc_handle, &tmpobj))
+                       GOTO(out_msg, rc = -ENOMEM);
+
+               /* mechctx */
+               if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen))
+                       GOTO(out_msg, rc = -EFAULT);
+               gss_err = lgss_import_sec_context(&tmpobj,
+                                                 gss_msg->gum_gsec->gs_mech,
+                                                 &gctx->gc_mechctx);
+               rc = 0;
+       }
+
+       if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) {
+               gss_cli_ctx_uptodate(gctx);
+       } else {
+               ctx = &gctx->gc_base;
+               sptlrpc_cli_ctx_expire(ctx);
+               if (rc != -ERESTART || gss_err != GSS_S_COMPLETE)
+                       set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags);
+
+               CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n",
+                      ctx, ctx->cc_vcred.vc_uid, rc, gss_err,
+                      test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ?
+                      "fatal error" : "non-fatal");
+       }
+
+       rc = mlen;
+
+out_msg:
+       gss_release_msg(gss_msg);
+
+out_free:
+       OBD_FREE(buf, mlen);
+       /* FIXME
+        * hack pipefs: always return the asked length, otherwise all
+        * following downcalls might be messed up. */
+       rc = mlen;
+       RETURN(rc);
+}
+
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+       struct gss_upcall_msg           *gmsg;
+       struct gss_upcall_msg_data      *gumd;
+       static cfs_time_t                ratelimit = 0;
+       ENTRY;
+
+       LASSERT(list_empty(&msg->list));
+
+       /* normally errno is >= 0 */
+       if (msg->errno >= 0) {
+               EXIT;
+               return;
+       }
+
+       gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+       gumd = &gmsg->gum_data;
+       LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+       CERROR("failed msg %p (seq %u, uid %u, svc %u, nid "LPX64", obd %.*s): "
+              "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+              gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+              gumd->gum_obd, msg->errno);
+
+       atomic_inc(&gmsg->gum_refcount);
+       gss_unhash_msg(gmsg);
+       if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+               cfs_time_t now = cfs_time_current_sec();
+
+               if (cfs_time_after(now, ratelimit)) {
+                       CWARN("upcall timed out, is lgssd running?\n");
+                       ratelimit = now + 15;
+               }
+       }
+       gss_msg_fail_ctx(gmsg);
+       gss_release_msg(gmsg);
+       EXIT;
+}
+
+static
+void gss_pipe_release(struct inode *inode)
+{
+       struct rpc_inode        *rpci = RPC_I(inode);
+       __u32                    idx;
+       ENTRY;
+
+       idx = (__u32) (long) rpci->private;
+       LASSERT(idx < MECH_MAX);
+
+       upcall_list_lock(idx);
+       while (!list_empty(&upcall_lists[idx])) {
+               struct gss_upcall_msg      *gmsg;
+               struct gss_upcall_msg_data *gumd;
+
+               gmsg = list_entry(upcall_lists[idx].next,
+                                     struct gss_upcall_msg, gum_list);
+               gumd = &gmsg->gum_data;
+               LASSERT(list_empty(&gmsg->gum_base.list));
+
+               CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, "
+                      "nid "LPX64", obd %.*s\n", gmsg,
+                      gumd->gum_seq, gumd->gum_uid, gumd->gum_svc,
+                      gumd->gum_nid, (int) sizeof(gumd->gum_obd),
+                      gumd->gum_obd);
+
+               gmsg->gum_base.errno = -EPIPE;
+               atomic_inc(&gmsg->gum_refcount);
+               gss_unhash_msg_nolock(gmsg);
+
+               gss_msg_fail_ctx(gmsg);
+
+               upcall_list_unlock(idx);
+               gss_release_msg(gmsg);
+               upcall_list_lock(idx);
+       }
+       upcall_list_unlock(idx);
+       EXIT;
+}
+
+static struct rpc_pipe_ops gss_upcall_ops = {
+       .upcall         = gss_pipe_upcall,
+       .downcall       = gss_pipe_downcall,
+       .destroy_msg    = gss_pipe_destroy_msg,
+       .release_pipe   = gss_pipe_release,
+};
+
+/****************************************
+ * upcall helper functions              *
+ ****************************************/
+
+static
+int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       struct obd_import       *imp;
+       struct gss_sec          *gsec;
+       struct gss_upcall_msg   *gmsg;
+       int                      rc = 0;
+       ENTRY;
+
+       might_sleep();
+
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_import);
+       LASSERT(ctx->cc_sec->ps_import->imp_obd);
+
+       imp = ctx->cc_sec->ps_import;
+       if (!imp->imp_connection) {
+               CERROR("import has no connection set\n");
+               RETURN(-EINVAL);
+       }
+
+       gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+
+       OBD_ALLOC_PTR(gmsg);
+       if (!gmsg)
+               RETURN(-ENOMEM);
+
+       /* initialize pipefs base msg */
+       INIT_LIST_HEAD(&gmsg->gum_base.list);
+       gmsg->gum_base.data = &gmsg->gum_data;
+       gmsg->gum_base.len = sizeof(gmsg->gum_data);
+       gmsg->gum_base.copied = 0;
+       gmsg->gum_base.errno = 0;
+
+       /* init upcall msg */
+       atomic_set(&gmsg->gum_refcount, 1);
+       gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name);
+       gmsg->gum_gsec = gsec;
+       gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx),
+                                     struct gss_cli_ctx, gc_base);
+       gmsg->gum_data.gum_seq = upcall_get_sequence();
+       gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid;
+       gmsg->gum_data.gum_gid = 0; /* not used for now */
+       gmsg->gum_data.gum_svc = import_to_gss_svc(imp);
+       gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid;
+       strncpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name,
+               sizeof(gmsg->gum_data.gum_obd));
+
+       /* this could only happen if the sysadmin has forcibly set the
+        * context dead/expired via lctl. */
+       if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) {
+               CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n",
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_flags);
+
+               LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE));
+               ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR;
+
+               rc = -EIO;
+               goto err_free;
+       }
+
+       upcall_msg_enlist(gmsg);
+
+       rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode,
+                             &gmsg->gum_base);
+       if (rc) {
+               CERROR("rpc_queue_upcall failed: %d\n", rc);
+
+               upcall_msg_delist(gmsg);
+               goto err_free;
+       }
+
+       RETURN(0);
+err_free:
+       OBD_FREE_PTR(gmsg);
+       RETURN(rc);
+}
+
+static
+int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx)
+{
+       /* if we are refreshing for root, also update the reverse
+        * handle index so that reverse contexts are not confused. */
+       if (ctx->cc_vcred.vc_uid == 0) {
+               struct gss_sec *gsec;
+
+               gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base);
+               gsec->gs_rvs_hdl = gss_get_next_ctx_index();
+       }
+
+       return gss_ctx_refresh_pf(ctx);
+}
+
+/****************************************
+ * lustre gss pipefs policy             *
+ ****************************************/
+
+static struct ptlrpc_ctx_ops gss_pipefs_ctxops = {
+       .match          = gss_cli_ctx_match,
+       .refresh        = gss_cli_ctx_refresh_pf,
+       .validate       = gss_cli_ctx_validate_pf,
+       .die            = gss_cli_ctx_die_pf,
+       .sign           = gss_cli_ctx_sign,
+       .verify         = gss_cli_ctx_verify,
+       .seal           = gss_cli_ctx_seal,
+       .unseal         = gss_cli_ctx_unseal,
+       .wrap_bulk      = gss_cli_ctx_wrap_bulk,
+       .unwrap_bulk    = gss_cli_ctx_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops gss_sec_pipefs_cops = {
+       .create_sec             = gss_sec_create_pf,
+       .destroy_sec            = gss_sec_destroy_pf,
+       .kill_sec               = gss_sec_kill,
+       .lookup_ctx             = gss_sec_lookup_ctx_pf,
+       .release_ctx            = gss_sec_release_ctx_pf,
+       .flush_ctx_cache        = gss_sec_flush_ctx_cache_pf,
+       .install_rctx           = gss_sec_install_rctx,
+       .alloc_reqbuf           = gss_alloc_reqbuf,
+       .free_reqbuf            = gss_free_reqbuf,
+       .alloc_repbuf           = gss_alloc_repbuf,
+       .free_repbuf            = gss_free_repbuf,
+       .enlarge_reqbuf         = gss_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops gss_sec_pipefs_sops = {
+       .accept                 = gss_svc_accept_pf,
+       .invalidate_ctx         = gss_svc_invalidate_ctx,
+       .alloc_rs               = gss_svc_alloc_rs,
+       .authorize              = gss_svc_authorize,
+       .free_rs                = gss_svc_free_rs,
+       .free_ctx               = gss_svc_free_ctx,
+       .unwrap_bulk            = gss_svc_unwrap_bulk,
+       .wrap_bulk              = gss_svc_wrap_bulk,
+       .install_rctx           = gss_svc_install_rctx_pf,
+};
+
+static struct ptlrpc_sec_policy gss_policy_pipefs = {
+       .sp_owner               = THIS_MODULE,
+       .sp_name                = "gss.pipefs",
+       .sp_policy              = SPTLRPC_POLICY_GSS_PIPEFS,
+       .sp_cops                = &gss_sec_pipefs_cops,
+       .sp_sops                = &gss_sec_pipefs_sops,
+};
+
+static
+int __init gss_init_pipefs_upcall(void)
+{
+       struct dentry   *de;
+
+       /* pipe dir */
+       de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL);
+       if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) {
+               CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de));
+               return PTR_ERR(de);
+       }
+
+       /* FIXME hack pipefs: dput will sometimes cause an oops during
+        * module unload, after lgssd closes the pipe fds. */
+
+       /* krb5 mechanism */
+       de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops,
+                       RPC_PIPE_WAIT_FOR_OPEN);
+       if (!de || IS_ERR(de)) {
+               CERROR("failed to make rpc_pipe %s: %ld\n",
+                      LUSTRE_PIPE_KRB5, PTR_ERR(de));
+               rpc_rmdir(LUSTRE_PIPE_ROOT);
+               return PTR_ERR(de);
+       }
+
+       de_pipes[MECH_KRB5] = de;
+       INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]);
+       spin_lock_init(&upcall_locks[MECH_KRB5]);
+
+       return 0;
+}
+
+static
+void __exit gss_exit_pipefs_upcall(void)
+{
+       __u32   i;
+
+       for (i = 0; i < MECH_MAX; i++) {
+               LASSERT(list_empty(&upcall_lists[i]));
+
+               /* dput pipe dentry here might cause lgssd oops. */
+               de_pipes[i] = NULL;
+       }
+
+       rpc_unlink(LUSTRE_PIPE_KRB5);
+       rpc_rmdir(LUSTRE_PIPE_ROOT);
+}
+
+int __init gss_init_pipefs(void)
+{
+       int rc;
+
+       rc = gss_init_pipefs_upcall();
+       if (rc)
+               return rc;
+
+       rc = sptlrpc_register_policy(&gss_policy_pipefs);
+       if (rc) {
+               gss_exit_pipefs_upcall();
+               return rc;
+       }
+
+       return 0;
+}
+
+void __exit gss_exit_pipefs(void)
+{
+       gss_exit_pipefs_upcall();
+       sptlrpc_unregister_policy(&gss_policy_pipefs);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c
new file mode 100644 (file)
index 0000000..474ecf8
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/gss/gss_rawobj.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_sec.h>
+
+#include "gss_internal.h"
+
+int rawobj_empty(rawobj_t *obj)
+{
+       LASSERT(equi(obj->len, obj->data));
+       return (obj->len == 0);
+}
+
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+       LASSERT(obj);
+       LASSERT(len >= 0);
+
+       obj->len = len;
+       if (len) {
+               OBD_ALLOC_LARGE(obj->data, len);
+               if (!obj->data) {
+                       obj->len = 0;
+                       RETURN(-ENOMEM);
+               }
+               memcpy(obj->data, buf, len);
+       } else
+               obj->data = NULL;
+       return 0;
+}
+
+void rawobj_free(rawobj_t *obj)
+{
+       LASSERT(obj);
+
+       if (obj->len) {
+               LASSERT(obj->data);
+               OBD_FREE_LARGE(obj->data, obj->len);
+               obj->len = 0;
+               obj->data = NULL;
+       } else
+               LASSERT(!obj->data);
+}
+
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+       LASSERT(a && b);
+
+       return (a->len == b->len &&
+               (!a->len || !memcmp(a->data, b->data, a->len)));
+}
+
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+       LASSERT(src && dest);
+
+       dest->len = src->len;
+       if (dest->len) {
+               OBD_ALLOC_LARGE(dest->data, dest->len);
+               if (!dest->data) {
+                       dest->len = 0;
+                       return -ENOMEM;
+               }
+               memcpy(dest->data, src->data, dest->len);
+       } else
+               dest->data = NULL;
+       return 0;
+}
+
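+/*
+ * Wire format written below: a 4-byte little-endian length word followed by
+ * the data, padded up to the next 4-byte boundary; e.g. a 5-byte object
+ * consumes 4 + 8 = 12 bytes of the buffer.
+ */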
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       __u32 len;
+
+       LASSERT(obj);
+       LASSERT(buf);
+       LASSERT(buflen);
+
+       len = cfs_size_round4(obj->len);
+
+       if (*buflen < 4 + len) {
+               CERROR("buflen %u <  %u\n", *buflen, 4 + len);
+               return -EINVAL;
+       }
+
+       *(*buf)++ = cpu_to_le32(obj->len);
+       memcpy(*buf, obj->data, obj->len);
+       *buf += (len >> 2);
+       *buflen -= (4 + len);
+
+       return 0;
+}
+
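+/*
+ * Common extraction helper: with 'alloc' set, the data is copied into a
+ * newly allocated buffer instead of pointing into *buf; with 'local' set,
+ * the length word is host-endian and the data is not padded to a 4-byte
+ * boundary.
+ */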
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+                           int alloc, int local)
+{
+       __u32 len;
+
+       if (*buflen < sizeof(__u32)) {
+               CERROR("buflen %u\n", *buflen);
+               return -EINVAL;
+       }
+
+       obj->len = *(*buf)++;
+       if (!local)
+               obj->len = le32_to_cpu(obj->len);
+       *buflen -= sizeof(__u32);
+
+       if (!obj->len) {
+               obj->data = NULL;
+               return 0;
+       }
+
+       len = local ? obj->len : cfs_size_round4(obj->len);
+       if (*buflen < len) {
+               CERROR("buflen %u < %u\n", *buflen, len);
+               obj->len = 0;
+               return -EINVAL;
+       }
+
+       if (!alloc)
+               obj->data = (__u8 *) *buf;
+       else {
+               OBD_ALLOC_LARGE(obj->data, obj->len);
+               if (!obj->data) {
+                       CERROR("fail to alloc %u bytes\n", obj->len);
+                       obj->len = 0;
+                       return -ENOMEM;
+               }
+               memcpy(obj->data, *buf, obj->len);
+       }
+
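+       /* advance by 'len' bytes; the cast avoids stepping in __u32 units */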
+       *((char **)buf) += len;
+       *buflen -= len;
+
+       return 0;
+}
+
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 1, 0);
+}
+
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
+
+int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+       return __rawobj_extract(obj, buf, buflen, 1, 1);
+}
+
+int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj)
+{
+       rawobj->len = netobj->len;
+       rawobj->data = netobj->data;
+       return 0;
+}
+
+int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj)
+{
+       rawobj->len = 0;
+       rawobj->data = NULL;
+
+       if (netobj->len == 0)
+               return 0;
+
+       OBD_ALLOC_LARGE(rawobj->data, netobj->len);
+       if (rawobj->data == NULL)
+               return -ENOMEM;
+
+       rawobj->len = netobj->len;
+       memcpy(rawobj->data, netobj->data, netobj->len);
+       return 0;
+}
+
+/****************************************
+ * misc helpers                         *
+ ****************************************/
+
+int buffer_extract_bytes(const void **buf, __u32 *buflen,
+                        void *res, __u32 reslen)
+{
+       if (*buflen < reslen) {
+               CERROR("buflen %u < %u\n", *buflen, reslen);
+               return -EINVAL;
+       }
+
+       memcpy(res, *buf, reslen);
+       *buf += reslen;
+       *buflen -= reslen;
+       return 0;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c
new file mode 100644 (file)
index 0000000..31b50ea
--- /dev/null
@@ -0,0 +1,1099 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall.
+ * Data exchange is handled entirely within the kernel.
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal and GSS_Unseal are
+ *  in-kernel.
+ * Context destruction is handled in-kernel.
+ *  GSS_Delete_sec_context is in-kernel.
+ *
+ * Context creation is initiated by the arrival of an RPCSEC_GSS_INIT
+ * request. The context handle and gss_token are used as a key into the
+ * rpcsec_init cache. The content of this cache includes some of the outputs
+ * of GSS_Accept_sec_context, namely major_status, minor_status,
+ * context_handle and reply_token. These are sent back to the client.
+ * Sequence window management is handled by the kernel; the window size is
+ * currently a compile-time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/mutex.h>
+#include <linux/sunrpc/cache.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_SVC_UPCALL_TIMEOUT  (20)
+
+static spinlock_t __ctx_index_lock;
+static __u64 __ctx_index;
+
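+/*
+ * Context indices are drawn from a 64-bit counter seeded with random bytes
+ * in gss_init_svc_upcall(), so indices are unlikely to repeat across
+ * server reboots.
+ */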
+__u64 gss_get_next_ctx_index(void)
+{
+       __u64 idx;
+
+       spin_lock(&__ctx_index_lock);
+       idx = __ctx_index++;
+       spin_unlock(&__ctx_index_lock);
+
+       return idx;
+}
+
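+/*
+ * Fold the buffer into a long, BITS_PER_LONG/8 bytes at a time, mixing with
+ * cfs_hash_long() whenever a full long has been packed; a trailing
+ * pseudo-byte carrying the length makes the buffer length part of the hash.
+ */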
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+       unsigned long hash = 0;
+       unsigned long l = 0;
+       int len = 0;
+       unsigned char c;
+
+       do {
+               if (len == length) {
+                       c = (char) len;
+                       len = -1;
+               } else
+                       c = *buf++;
+
+               l = (l << 8) | c;
+               len++;
+
+               if ((len & (BITS_PER_LONG/8-1)) == 0)
+                       hash = cfs_hash_long(hash^l, BITS_PER_LONG);
+       } while (len);
+
+       return hash >> (BITS_PER_LONG - bits);
+}
+
+/****************************************
+ * rsi cache                            *
+ ****************************************/
+
+#define RSI_HASHBITS    (6)
+#define RSI_HASHMAX     (1 << RSI_HASHBITS)
+#define RSI_HASHMASK    (RSI_HASHMAX - 1)
+
+struct rsi {
+       struct cache_head       h;
+       __u32                   lustre_svc;
+       __u64                   nid;
+       wait_queue_head_t       waitq;
+       rawobj_t                in_handle, in_token;
+       rawobj_t                out_handle, out_token;
+       int                     major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+static inline int rsi_hash(struct rsi *item)
+{
+       return hash_mem((char *)item->in_handle.data, item->in_handle.len,
+                       RSI_HASHBITS) ^
+              hash_mem((char *)item->in_token.data, item->in_token.len,
+                       RSI_HASHBITS);
+}
+
+static inline int __rsi_match(struct rsi *item, struct rsi *tmp)
+{
+       return (rawobj_equal(&item->in_handle, &tmp->in_handle) &&
+               rawobj_equal(&item->in_token, &tmp->in_token));
+}
+
+static void rsi_free(struct rsi *rsi)
+{
+       rawobj_free(&rsi->in_handle);
+       rawobj_free(&rsi->in_token);
+       rawobj_free(&rsi->out_handle);
+       rawobj_free(&rsi->out_token);
+}
+
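+/*
+ * Format of the upcall request written to the cache channel: the fields
+ * below (lustre_svc, nid, a suggested context index, in_handle, in_token)
+ * are hex-encoded, space-separated and newline-terminated, to be parsed by
+ * the user-space daemon (lsvcgssd).
+ */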
+static void rsi_request(struct cache_detail *cd,
+                       struct cache_head *h,
+                       char **bpp, int *blen)
+{
+       struct rsi *rsi = container_of(h, struct rsi, h);
+       __u64 index = 0;
+
+       /* if in_handle is null, provide kernel suggestion */
+       if (rsi->in_handle.len == 0)
+               index = gss_get_next_ctx_index();
+
+       qword_addhex(bpp, blen, (char *) &rsi->lustre_svc,
+                    sizeof(rsi->lustre_svc));
+       qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid));
+       qword_addhex(bpp, blen, (char *) &index, sizeof(index));
+       qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len);
+       qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len);
+       (*bpp)[-1] = '\n';
+}
+
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+       return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
+static inline void __rsi_init(struct rsi *new, struct rsi *item)
+{
+       new->out_handle = RAWOBJ_EMPTY;
+       new->out_token = RAWOBJ_EMPTY;
+
+       new->in_handle = item->in_handle;
+       item->in_handle = RAWOBJ_EMPTY;
+       new->in_token = item->in_token;
+       item->in_token = RAWOBJ_EMPTY;
+
+       new->lustre_svc = item->lustre_svc;
+       new->nid = item->nid;
+       init_waitqueue_head(&new->waitq);
+}
+
+static inline void __rsi_update(struct rsi *new, struct rsi *item)
+{
+       LASSERT(new->out_handle.len == 0);
+       LASSERT(new->out_token.len == 0);
+
+       new->out_handle = item->out_handle;
+       item->out_handle = RAWOBJ_EMPTY;
+       new->out_token = item->out_token;
+       item->out_token = RAWOBJ_EMPTY;
+
+       new->major_status = item->major_status;
+       new->minor_status = item->minor_status;
+}
+
+static void rsi_put(struct kref *ref)
+{
+       struct rsi *rsi = container_of(ref, struct rsi, h.ref);
+
+       LASSERT(rsi->h.next == NULL);
+       rsi_free(rsi);
+       OBD_FREE_PTR(rsi);
+}
+
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+       struct rsi *item = container_of(a, struct rsi, h);
+       struct rsi *tmp = container_of(b, struct rsi, h);
+
+       return __rsi_match(item, tmp);
+}
+
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+       struct rsi *new = container_of(cnew, struct rsi, h);
+       struct rsi *item = container_of(citem, struct rsi, h);
+
+       __rsi_init(new, item);
+}
+
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+       struct rsi *new = container_of(cnew, struct rsi, h);
+       struct rsi *item = container_of(citem, struct rsi, h);
+
+       __rsi_update(new, item);
+}
+
+static struct cache_head *rsi_alloc(void)
+{
+       struct rsi *rsi;
+
+       OBD_ALLOC_PTR(rsi);
+       if (rsi)
+               return &rsi->h;
+       else
+               return NULL;
+}
+
+static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+       char            *buf = mesg;
+       char            *ep;
+       int             len;
+       struct rsi      rsii, *rsip = NULL;
+       time_t          expiry;
+       int             status = -EINVAL;
+       ENTRY;
+
+       memset(&rsii, 0, sizeof(rsii));
+
+       /* handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.in_handle, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       /* token */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.in_token, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       rsip = rsi_lookup(&rsii);
+       if (!rsip)
+               goto out;
+
+       rsii.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       if (expiry == 0)
+               goto out;
+
+       len = qword_get(&mesg, buf, mlen);
+       if (len <= 0)
+               goto out;
+
+       /* major */
+       rsii.major_status = simple_strtol(buf, &ep, 10);
+       if (*ep)
+               goto out;
+
+       /* minor */
+       len = qword_get(&mesg, buf, mlen);
+       if (len <= 0)
+               goto out;
+       rsii.minor_status = simple_strtol(buf, &ep, 10);
+       if (*ep)
+               goto out;
+
+       /* out_handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.out_handle, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       /* out_token */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       if (rawobj_alloc(&rsii.out_token, buf, len)) {
+               status = -ENOMEM;
+               goto out;
+       }
+
+       rsii.h.expiry_time = expiry;
+       rsip = rsi_update(&rsii, rsip);
+       status = 0;
+out:
+       rsi_free(&rsii);
+       if (rsip) {
+               wake_up_all(&rsip->waitq);
+               cache_put(&rsip->h, &rsi_cache);
+       } else {
+               status = -ENOMEM;
+       }
+
+       if (status)
+               CERROR("rsi parse error %d\n", status);
+       RETURN(status);
+}
+
+static struct cache_detail rsi_cache = {
+       .hash_size      = RSI_HASHMAX,
+       .hash_table     = rsi_table,
+       .name           = "auth.sptlrpc.init",
+       .cache_put      = rsi_put,
+       .cache_upcall   = rsi_upcall,
+       .cache_parse    = rsi_parse,
+       .match          = rsi_match,
+       .init           = rsi_init,
+       .update         = update_rsi,
+       .alloc          = rsi_alloc,
+};
+
+static struct rsi *rsi_lookup(struct rsi *item)
+{
+       struct cache_head *ch;
+       int hash = rsi_hash(item);
+
+       ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash);
+       if (ch)
+               return container_of(ch, struct rsi, h);
+       else
+               return NULL;
+}
+
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old)
+{
+       struct cache_head *ch;
+       int hash = rsi_hash(new);
+
+       ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash);
+       if (ch)
+               return container_of(ch, struct rsi, h);
+       else
+               return NULL;
+}
+
+/****************************************
+ * rsc cache                            *
+ ****************************************/
+
+#define RSC_HASHBITS    (10)
+#define RSC_HASHMAX     (1 << RSC_HASHBITS)
+#define RSC_HASHMASK    (RSC_HASHMAX - 1)
+
+struct rsc {
+       struct cache_head       h;
+       struct obd_device       *target;
+       rawobj_t                handle;
+       struct gss_svc_ctx      ctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct rsc *item);
+
+static void rsc_free(struct rsc *rsci)
+{
+       rawobj_free(&rsci->handle);
+       rawobj_free(&rsci->ctx.gsc_rvs_hdl);
+       lgss_delete_sec_context(&rsci->ctx.gsc_mechctx);
+}
+
+static inline int rsc_hash(struct rsc *rsci)
+{
+       return hash_mem((char *)rsci->handle.data,
+                       rsci->handle.len, RSC_HASHBITS);
+}
+
+static inline int __rsc_match(struct rsc *new, struct rsc *tmp)
+{
+       return rawobj_equal(&new->handle, &tmp->handle);
+}
+
+static inline void __rsc_init(struct rsc *new, struct rsc *tmp)
+{
+       new->handle = tmp->handle;
+       tmp->handle = RAWOBJ_EMPTY;
+
+       new->target = NULL;
+       memset(&new->ctx, 0, sizeof(new->ctx));
+       new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+}
+
+static inline void __rsc_update(struct rsc *new, struct rsc *tmp)
+{
+       new->ctx = tmp->ctx;
+       tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY;
+       tmp->ctx.gsc_mechctx = NULL;
+
+       memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata));
+       spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock);
+}
+
+static void rsc_put(struct kref *ref)
+{
+       struct rsc *rsci = container_of(ref, struct rsc, h.ref);
+
+       LASSERT(rsci->h.next == NULL);
+       rsc_free(rsci);
+       OBD_FREE_PTR(rsci);
+}
+
+static int rsc_match(struct cache_head *a, struct cache_head *b)
+{
+       struct rsc *new = container_of(a, struct rsc, h);
+       struct rsc *tmp = container_of(b, struct rsc, h);
+
+       return __rsc_match(new, tmp);
+}
+
+static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+       struct rsc *new = container_of(cnew, struct rsc, h);
+       struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+       __rsc_init(new, tmp);
+}
+
+static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+       struct rsc *new = container_of(cnew, struct rsc, h);
+       struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+       __rsc_update(new, tmp);
+}
+
+static struct cache_head *rsc_alloc(void)
+{
+       struct rsc *rsc;
+
+       OBD_ALLOC_PTR(rsc);
+       if (rsc)
+               return &rsc->h;
+       else
+               return NULL;
+}
+
+static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen)
+{
+       char                    *buf = mesg;
+       int                     len, rv, tmp_int;
+       struct rsc              rsci, *rscp = NULL;
+       time_t                  expiry;
+       int                     status = -EINVAL;
+       struct gss_api_mech     *gm = NULL;
+
+       memset(&rsci, 0, sizeof(rsci));
+
+       /* context handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       status = -ENOMEM;
+       if (rawobj_alloc(&rsci.handle, buf, len))
+               goto out;
+
+       rsci.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       status = -EINVAL;
+       if (expiry == 0)
+               goto out;
+
+       /* remote flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get remote flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_remote = (tmp_int != 0);
+
+       /* root user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get oss user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_root = (tmp_int != 0);
+
+       /* mds user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get mds user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_mds = (tmp_int != 0);
+
+       /* oss user flag */
+       rv = get_int(&mesg, &tmp_int);
+       if (rv) {
+               CERROR("fail to get oss user flag\n");
+               goto out;
+       }
+       rsci.ctx.gsc_usr_oss = (tmp_int != 0);
+
+       /* mapped uid */
+       rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid);
+       if (rv) {
+               CERROR("fail to get mapped uid\n");
+               goto out;
+       }
+
+       rscp = rsc_lookup(&rsci);
+       if (!rscp)
+               goto out;
+
+       /* uid, or NEGATIVE */
+       rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid);
+       if (rv == -EINVAL)
+               goto out;
+       if (rv == -ENOENT) {
+               CERROR("NOENT? set rsc entry negative\n");
+               set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+       } else {
+               rawobj_t tmp_buf;
+               unsigned long ctx_expiry;
+
+               /* gid */
+               if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid))
+                       goto out;
+
+               /* mech name */
+               len = qword_get(&mesg, buf, mlen);
+               if (len < 0)
+                       goto out;
+               gm = lgss_name_to_mech(buf);
+               status = -EOPNOTSUPP;
+               if (!gm)
+                       goto out;
+
+               status = -EINVAL;
+               /* mech-specific data: */
+               len = qword_get(&mesg, buf, mlen);
+               if (len < 0)
+                       goto out;
+
+               tmp_buf.len = len;
+               tmp_buf.data = (unsigned char *)buf;
+               if (lgss_import_sec_context(&tmp_buf, gm,
+                                           &rsci.ctx.gsc_mechctx))
+                       goto out;
+
+               /* currently the expiry time passed down from user-space
+                * is invalid, so we retrieve it from the mech instead. */
+               if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+                       CERROR("unable to get expire time, drop it\n");
+                       goto out;
+               }
+               expiry = (time_t) ctx_expiry;
+       }
+
+       rsci.h.expiry_time = expiry;
+       rscp = rsc_update(&rsci, rscp);
+       status = 0;
+out:
+       if (gm)
+               lgss_mech_put(gm);
+       rsc_free(&rsci);
+       if (rscp)
+               cache_put(&rscp->h, &rsc_cache);
+       else
+               status = -ENOMEM;
+
+       if (status)
+               CERROR("parse rsc error %d\n", status);
+       return status;
+}
+
+static struct cache_detail rsc_cache = {
+       .hash_size      = RSC_HASHMAX,
+       .hash_table     = rsc_table,
+       .name           = "auth.sptlrpc.context",
+       .cache_put      = rsc_put,
+       .cache_parse    = rsc_parse,
+       .match          = rsc_match,
+       .init           = rsc_init,
+       .update         = update_rsc,
+       .alloc          = rsc_alloc,
+};
+
+static struct rsc *rsc_lookup(struct rsc *item)
+{
+       struct cache_head *ch;
+       int             hash = rsc_hash(item);
+
+       ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash);
+       if (ch)
+               return container_of(ch, struct rsc, h);
+       else
+               return NULL;
+}
+
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old)
+{
+       struct cache_head *ch;
+       int             hash = rsc_hash(new);
+
+       ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash);
+       if (ch)
+               return container_of(ch, struct rsc, h);
+       else
+               return NULL;
+}
+
+#define COMPAT_RSC_PUT(item, cd)       cache_put((item), (cd))
+
+/****************************************
+ * rsc cache flush                      *
+ ****************************************/
+
+typedef int rsc_entry_match(struct rsc *rscp, long data);
+
+static void rsc_flush(rsc_entry_match *match, long data)
+{
+       struct cache_head **ch;
+       struct rsc *rscp;
+       int n;
+       ENTRY;
+
+       write_lock(&rsc_cache.hash_lock);
+       for (n = 0; n < RSC_HASHMAX; n++) {
+               for (ch = &rsc_cache.hash_table[n]; *ch;) {
+                       rscp = container_of(*ch, struct rsc, h);
+
+                       if (!match(rscp, data)) {
+                               ch = &((*ch)->next);
+                               continue;
+                       }
+
+                       /* it seems that simply setting NEGATIVE doesn't work */
+                       *ch = (*ch)->next;
+                       rscp->h.next = NULL;
+                       cache_get(&rscp->h);
+                       set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+                       COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+                       rsc_cache.entries--;
+               }
+       }
+       write_unlock(&rsc_cache.hash_lock);
+       EXIT;
+}
+
+static int match_uid(struct rsc *rscp, long uid)
+{
+       if ((int) uid == -1)
+               return 1;
+       return ((int) rscp->ctx.gsc_uid == (int) uid);
+}
+
+static int match_target(struct rsc *rscp, long target)
+{
+       return (rscp->target == (struct obd_device *) target);
+}
+
+static inline void rsc_flush_uid(int uid)
+{
+       if (uid == -1)
+               CWARN("flush all gss contexts...\n");
+
+       rsc_flush(match_uid, (long) uid);
+}
+
+static inline void rsc_flush_target(struct obd_device *target)
+{
+       rsc_flush(match_target, (long) target);
+}
+
+void gss_secsvc_flush(struct obd_device *target)
+{
+       rsc_flush_target(target);
+}
+EXPORT_SYMBOL(gss_secsvc_flush);
+
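+/*
+ * Look up a context by handle. On success the rsc is returned with a cache
+ * reference held (after validation via cache_check()); callers release it
+ * with COMPAT_RSC_PUT().
+ */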
+static struct rsc *gss_svc_searchbyctx(rawobj_t *handle)
+{
+       struct rsc  rsci;
+       struct rsc *found;
+
+       memset(&rsci, 0, sizeof(rsci));
+       if (rawobj_dup(&rsci.handle, handle))
+               return NULL;
+
+       found = rsc_lookup(&rsci);
+       rsc_free(&rsci);
+       if (!found)
+               return NULL;
+       if (cache_check(&rsc_cache, &found->h, NULL))
+               return NULL;
+       return found;
+}
+
+int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp,
+                                  struct gss_sec *gsec,
+                                  struct gss_cli_ctx *gctx)
+{
+       struct rsc      rsci, *rscp = NULL;
+       unsigned long   ctx_expiry;
+       __u32           major;
+       int             rc;
+       ENTRY;
+
+       memset(&rsci, 0, sizeof(rsci));
+
+       if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl,
+                        sizeof(gsec->gs_rvs_hdl)))
+               GOTO(out, rc = -ENOMEM);
+
+       rscp = rsc_lookup(&rsci);
+       if (rscp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       major = lgss_copy_reverse_context(gctx->gc_mechctx,
+                                         &rsci.ctx.gsc_mechctx);
+       if (major != GSS_S_COMPLETE)
+               GOTO(out, rc = -ENOMEM);
+
+       if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) {
+               CERROR("unable to get expire time, drop it\n");
+               GOTO(out, rc = -EINVAL);
+       }
+       rsci.h.expiry_time = (time_t) ctx_expiry;
+
+       if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)
+               rsci.ctx.gsc_usr_mds = 1;
+       else if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0)
+               rsci.ctx.gsc_usr_oss = 1;
+       else
+               rsci.ctx.gsc_usr_root = 1;
+
+       rscp = rsc_update(&rsci, rscp);
+       if (rscp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rscp->target = imp->imp_obd;
+       rawobj_dup(&gctx->gc_svc_handle, &rscp->handle);
+
+       CWARN("create reverse svc ctx %p to %s: idx "LPX64"\n",
+             &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl);
+       rc = 0;
+out:
+       if (rscp)
+               cache_put(&rscp->h, &rsc_cache);
+       rsc_free(&rsci);
+
+       if (rc)
+               CERROR("create reverse svc ctx: idx "LPX64", rc %d\n",
+                      gsec->gs_rvs_hdl, rc);
+       RETURN(rc);
+}
+
+int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle)
+{
+       const cfs_time_t        expire = 20;
+       struct rsc              *rscp;
+
+       rscp = gss_svc_searchbyctx(handle);
+       if (rscp) {
+               CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n",
+                      &rscp->ctx, rscp);
+
+               rscp->h.expiry_time = cfs_time_current_sec() + expire;
+               COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+       }
+       return 0;
+}
+
+int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx)
+{
+       struct rsc *rscp = container_of(ctx, struct rsc, ctx);
+
+       return rawobj_dup(handle, &rscp->handle);
+}
+
+int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq)
+{
+       struct rsc *rscp;
+
+       rscp = gss_svc_searchbyctx(handle);
+       if (rscp) {
+               CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n",
+                      &rscp->ctx, rscp, seq + 1);
+
+               rscp->ctx.gsc_rvs_seq = seq + 1;
+               COMPAT_RSC_PUT(&rscp->h, &rsc_cache);
+       }
+       return 0;
+}
+
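+/*
+ * cache_check() needs a cache_req to decide whether a request can be
+ * deferred while an upcall is pending; this dummy handle never defers, so
+ * the caller below waits on its own waitqueue and retries cache_check()
+ * instead.
+ */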
+static struct cache_deferred_req *cache_upcall_defer(struct cache_req *req)
+{
+       return NULL;
+}
+static struct cache_req cache_upcall_chandle = { cache_upcall_defer };
+
+int gss_svc_upcall_handle_init(struct ptlrpc_request *req,
+                              struct gss_svc_reqctx *grctx,
+                              struct gss_wire_ctx *gw,
+                              struct obd_device *target,
+                              __u32 lustre_svc,
+                              rawobj_t *rvs_hdl,
+                              rawobj_t *in_token)
+{
+       struct ptlrpc_reply_state       *rs;
+       struct rsc                      *rsci = NULL;
+       struct rsi                      *rsip = NULL, rsikey;
+       wait_queue_t                    wait;
+       int                             replen = sizeof(struct ptlrpc_body);
+       struct gss_rep_header           *rephdr;
+       int                             first_check = 1;
+       int                             rc = SECSVC_DROP;
+       ENTRY;
+
+       memset(&rsikey, 0, sizeof(rsikey));
+       rsikey.lustre_svc = lustre_svc;
+       rsikey.nid = (__u64) req->rq_peer.nid;
+
+       /* duplicate the context handle; for INIT it is always 0 */
+       if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) {
+               CERROR("fail to dup context handle\n");
+               GOTO(out, rc);
+       }
+
+       if (rawobj_dup(&rsikey.in_token, in_token)) {
+               CERROR("can't duplicate token\n");
+               rawobj_free(&rsikey.in_handle);
+               GOTO(out, rc);
+       }
+
+       rsip = rsi_lookup(&rsikey);
+       rsi_free(&rsikey);
+       if (!rsip) {
+               CERROR("error in rsi_lookup.\n");
+
+               if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+                       rc = SECSVC_COMPLETE;
+
+               GOTO(out, rc);
+       }
+
+       cache_get(&rsip->h); /* take an extra ref */
+       init_waitqueue_head(&rsip->waitq);
+       init_waitqueue_entry_current(&wait);
+       add_wait_queue(&rsip->waitq, &wait);
+
+cache_check:
+       /* Note that each call to cache_check() drops a reference when it
+        * returns non-zero. We hold an extra reference on the initial rsip,
+        * but must take care of the subsequent calls. */
+       rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle);
+       switch (rc) {
+       case -EAGAIN: {
+               int valid;
+
+               if (first_check) {
+                       first_check = 0;
+
+                       read_lock(&rsi_cache.hash_lock);
+                       valid = test_bit(CACHE_VALID, &rsip->h.flags);
+                       if (valid == 0)
+                               set_current_state(TASK_INTERRUPTIBLE);
+                       read_unlock(&rsi_cache.hash_lock);
+
+                       if (valid == 0)
+                               schedule_timeout(GSS_SVC_UPCALL_TIMEOUT * HZ);
+
+                       cache_get(&rsip->h);
+                       goto cache_check;
+               }
+               CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT);
+               break;
+       }
+       case -ENOENT:
+               CWARN("cache_check return ENOENT, drop\n");
+               break;
+       case 0:
+               /* if not the first check, we have to release the extra
+                * reference we just added on it. */
+               if (!first_check)
+                       cache_put(&rsip->h, &rsi_cache);
+               CDEBUG(D_SEC, "cache_check is good\n");
+               break;
+       }
+
+       remove_wait_queue(&rsip->waitq, &wait);
+       cache_put(&rsip->h, &rsi_cache);
+
+       if (rc)
+               GOTO(out, rc = SECSVC_DROP);
+
+       rc = SECSVC_DROP;
+       rsci = gss_svc_searchbyctx(&rsip->out_handle);
+       if (!rsci) {
+               CERROR("authentication failed\n");
+
+               if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0))
+                       rc = SECSVC_COMPLETE;
+
+               GOTO(out, rc);
+       } else {
+               cache_get(&rsci->h);
+               grctx->src_ctx = &rsci->ctx;
+       }
+
+       if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) {
+               CERROR("failed duplicate reverse handle\n");
+               GOTO(out, rc);
+       }
+
+       rsci->target = target;
+
+       CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n",
+              rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+       if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) {
+               CERROR("handle size %u too large\n", rsip->out_handle.len);
+               GOTO(out, rc = SECSVC_DROP);
+       }
+
+       grctx->src_init = 1;
+       grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len);
+
+       rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+       if (rc) {
+               CERROR("failed to pack reply: %d\n", rc);
+               GOTO(out, rc = SECSVC_DROP);
+       }
+
+       rs = req->rq_reply_state;
+       LASSERT(rs->rs_repbuf->lm_bufcount == 3);
+       LASSERT(rs->rs_repbuf->lm_buflens[0] >=
+               sizeof(*rephdr) + rsip->out_handle.len);
+       LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len);
+
+       rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+       rephdr->gh_version = PTLRPC_GSS_VERSION;
+       rephdr->gh_flags = 0;
+       rephdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+       rephdr->gh_major = rsip->major_status;
+       rephdr->gh_minor = rsip->minor_status;
+       rephdr->gh_seqwin = GSS_SEQ_WIN;
+       rephdr->gh_handle.len = rsip->out_handle.len;
+       memcpy(rephdr->gh_handle.data, rsip->out_handle.data,
+              rsip->out_handle.len);
+
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data,
+              rsip->out_token.len);
+
+       rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2,
+                                              rsip->out_token.len, 0);
+
+       rc = SECSVC_OK;
+
+out:
+       /* it looks like we should put rsip here as well, but that messes
+        * up the NFS cache mgmt code... FIXME */
+#if 0
+       if (rsip)
+               rsi_put(&rsip->h, &rsi_cache);
+#endif
+
+       if (rsci) {
+               /* if anything went wrong, we don't keep the context either */
+               if (rc != SECSVC_OK)
+                       set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+               else
+                       CDEBUG(D_SEC, "create rsc with idx "LPX64"\n",
+                              gss_handle_to_u64(&rsci->handle));
+
+               COMPAT_RSC_PUT(&rsci->h, &rsc_cache);
+       }
+       RETURN(rc);
+}
+
+struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req,
+                                          struct gss_wire_ctx *gw)
+{
+       struct rsc *rsc;
+
+       rsc = gss_svc_searchbyctx(&gw->gw_handle);
+       if (!rsc) {
+               CWARN("Invalid gss ctx idx "LPX64" from %s\n",
+                     gss_handle_to_u64(&gw->gw_handle),
+                     libcfs_nid2str(req->rq_peer.nid));
+               return NULL;
+       }
+
+       return &rsc->ctx;
+}
+
+void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx)
+{
+       struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+       COMPAT_RSC_PUT(&rsc->h, &rsc_cache);
+}
+
+void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx)
+{
+       struct rsc *rsc = container_of(ctx, struct rsc, ctx);
+
+       /* can't be found */
+       set_bit(CACHE_NEGATIVE, &rsc->h.flags);
+       /* to be removed at next scan */
+       rsc->h.expiry_time = 1;
+}
+
+int __init gss_init_svc_upcall(void)
+{
+       int     i;
+
+       spin_lock_init(&__ctx_index_lock);
+       /*
+        * this helps reduce context index collisions. after a server reboot,
+        * conflicting requests from clients might be filtered out by the
+        * initial sequence number check, leaving no chance to send an error
+        * notification back to the clients.
+        */
+       cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index));
+
+       cache_register(&rsi_cache);
+       cache_register(&rsc_cache);
+
+       /* FIXME this looks stupid: we intend to give lsvcgssd a chance to
+        * open the init upcall channel, otherwise there's a big chance that
+        * the first upcall is issued before the channel is opened, in which
+        * case the nfsv4 cache code will drop the request directly, leading
+        * to unnecessary recovery time. here we wait at most 1.5 seconds. */
+       for (i = 0; i < 6; i++) {
+               if (atomic_read(&rsi_cache.readers) > 0)
+                       break;
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               LASSERT(HZ >= 4);
+               schedule_timeout(HZ / 4);
+       }
+
+       if (atomic_read(&rsi_cache.readers) == 0)
+               CWARN("Init channel is not opened by lsvcgssd, following "
+                     "request might be dropped until lsvcgssd is active\n");
+
+       return 0;
+}
+
+void __exit gss_exit_svc_upcall(void)
+{
+       cache_purge(&rsi_cache);
+       cache_unregister(&rsi_cache);
+
+       cache_purge(&rsc_cache);
+       cache_unregister(&rsc_cache);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c
new file mode 100644 (file)
index 0000000..3404000
--- /dev/null
@@ -0,0 +1,219 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lprocfs_status.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static struct proc_dir_entry *gss_proc_root = NULL;
+static struct proc_dir_entry *gss_proc_lk = NULL;
+
+/*
+ * statistics of "out-of-sequence-window" events
+ */
+static struct {
+       spinlock_t      oos_lock;
+       atomic_t        oos_cli_count;          /* client occurrence */
+       int             oos_cli_behind;         /* client max seqs behind */
+       atomic_t        oos_svc_replay[3];      /* server replay detected */
+       atomic_t        oos_svc_pass[3];        /* server verified ok */
+} gss_stat_oos = {
+       .oos_cli_count  = ATOMIC_INIT(0),
+       .oos_cli_behind = 0,
+       .oos_svc_replay = { ATOMIC_INIT(0), },
+       .oos_svc_pass   = { ATOMIC_INIT(0), },
+};
+
+void gss_stat_oos_record_cli(int behind)
+{
+       atomic_inc(&gss_stat_oos.oos_cli_count);
+
+       spin_lock(&gss_stat_oos.oos_lock);
+       if (behind > gss_stat_oos.oos_cli_behind)
+               gss_stat_oos.oos_cli_behind = behind;
+       spin_unlock(&gss_stat_oos.oos_lock);
+}
+
+void gss_stat_oos_record_svc(int phase, int replay)
+{
+       LASSERT(phase >= 0 && phase <= 2);
+
+       if (replay)
+               atomic_inc(&gss_stat_oos.oos_svc_replay[phase]);
+       else
+               atomic_inc(&gss_stat_oos.oos_svc_pass[phase]);
+}
+
+static int gss_proc_oos_seq_show(struct seq_file *m, void *v)
+{
+       return seq_printf(m,
+                         "seqwin:              %u\n"
+                         "backwin:             %u\n"
+                         "client fall behind seqwin\n"
+                         "  occurrence:        %d\n"
+                         "  max seq behind:    %d\n"
+                         "server replay detected:\n"
+                         "  phase 0:           %d\n"
+                         "  phase 1:           %d\n"
+                         "  phase 2:           %d\n"
+                         "server verify ok:\n"
+                         "  phase 2:           %d\n",
+                         GSS_SEQ_WIN_MAIN,
+                         GSS_SEQ_WIN_BACK,
+                         atomic_read(&gss_stat_oos.oos_cli_count),
+                         gss_stat_oos.oos_cli_behind,
+                         atomic_read(&gss_stat_oos.oos_svc_replay[0]),
+                         atomic_read(&gss_stat_oos.oos_svc_replay[1]),
+                         atomic_read(&gss_stat_oos.oos_svc_replay[2]),
+                         atomic_read(&gss_stat_oos.oos_svc_pass[2]));
+}
+LPROC_SEQ_FOPS_RO(gss_proc_oos);
+
+static ssize_t gss_proc_write_secinit(struct file *file, const char *buffer,
+                                     size_t count, loff_t *off)
+{
+       int rc;
+
+       rc = gss_do_ctx_init_rpc((char *) buffer, count);
+       if (rc) {
+               LASSERT(rc < 0);
+               return rc;
+       }
+
+       return count;
+}
+
+static const struct file_operations gss_proc_secinit = {
+       .write = gss_proc_write_secinit,
+};
+
+static struct lprocfs_vars gss_lprocfs_vars[] = {
+       { "replays", &gss_proc_oos_fops },
+       { "init_channel", &gss_proc_secinit, NULL, 0222 },
+       { NULL }
+};
+
+/*
+ * for userspace helper lgss_keyring.
+ *
+ * debug_level: [0, 4], defined in utils/gss/lgss_utils.h
+ */
+static int gss_lk_debug_level = 1;
+
+static int gss_lk_proc_dl_seq_show(struct seq_file *m, void *v)
+{
+       return seq_printf(m, "%u\n", gss_lk_debug_level);
+}
+
+static ssize_t gss_lk_proc_dl_seq_write(struct file *file, const char *buffer,
+                                       size_t count, loff_t *off)
+{
+       int     val, rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0 || val > 4)
+               return -ERANGE;
+
+       gss_lk_debug_level = val;
+       return count;
+}
+LPROC_SEQ_FOPS(gss_lk_proc_dl);
+
+static struct lprocfs_vars gss_lk_lprocfs_vars[] = {
+       { "debug_level", &gss_lk_proc_dl_fops },
+       { NULL }
+};
+
+void gss_exit_lproc(void)
+{
+       if (gss_proc_lk) {
+               lprocfs_remove(&gss_proc_lk);
+               gss_proc_lk = NULL;
+       }
+
+       if (gss_proc_root) {
+               lprocfs_remove(&gss_proc_root);
+               gss_proc_root = NULL;
+       }
+}
+
+int gss_init_lproc(void)
+{
+       int     rc;
+
+       spin_lock_init(&gss_stat_oos.oos_lock);
+
+       gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root,
+                                        gss_lprocfs_vars, NULL);
+       if (IS_ERR(gss_proc_root)) {
+               rc = PTR_ERR(gss_proc_root);
+               gss_proc_root = NULL;
+               GOTO(err_out, rc);
+       }
+
+       gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root,
+                                      gss_lk_lprocfs_vars, NULL);
+       if (IS_ERR(gss_proc_lk)) {
+               rc = PTR_ERR(gss_proc_lk);
+               gss_proc_lk = NULL;
+               GOTO(err_out, rc);
+       }
+
+       return 0;
+
+err_out:
+       CERROR("failed to initialize gss lproc entries: %d\n", rc);
+       gss_exit_lproc();
+       return rc;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c
new file mode 100644 (file)
index 0000000..ebca858
--- /dev/null
@@ -0,0 +1,2916 @@
+/*
+ * Modifications for Lustre
+ *
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+/*
+ * early replies have a fixed size in privacy and integrity modes
+ * respectively, so we calculate them only once.
+ */
+static int gss_at_reply_off_integ;
+static int gss_at_reply_off_priv;
+
+
+static inline int msg_last_segidx(struct lustre_msg *msg)
+{
+       LASSERT(msg->lm_bufcount > 0);
+       return msg->lm_bufcount - 1;
+}
+static inline int msg_last_seglen(struct lustre_msg *msg)
+{
+       return msg->lm_buflens[msg_last_segidx(msg)];
+}
+
+/********************************************
+ * wire data swabber                        *
+ ********************************************/
+
+static
+void gss_header_swabber(struct gss_header *ghdr)
+{
+       __swab32s(&ghdr->gh_flags);
+       __swab32s(&ghdr->gh_proc);
+       __swab32s(&ghdr->gh_seq);
+       __swab32s(&ghdr->gh_svc);
+       __swab32s(&ghdr->gh_pad1);
+       __swab32s(&ghdr->gh_handle.len);
+}
+
+struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment,
+                                  int swabbed)
+{
+       struct gss_header *ghdr;
+
+       ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr));
+       if (ghdr == NULL)
+               return NULL;
+
+       if (swabbed)
+               gss_header_swabber(ghdr);
+
+       if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) {
+               CERROR("gss header has length %d, now %u received\n",
+                      (int) sizeof(*ghdr) + ghdr->gh_handle.len,
+                      msg->lm_buflens[segment]);
+               return NULL;
+       }
+
+       return ghdr;
+}
+
+#if 0
+static
+void gss_netobj_swabber(netobj_t *obj)
+{
+       __swab32s(&obj->len);
+}
+
+netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment)
+{
+       netobj_t  *obj;
+
+       obj = lustre_swab_buf(msg, segment, sizeof(*obj), gss_netobj_swabber);
+       if (obj && sizeof(*obj) + obj->len > msg->lm_buflens[segment]) {
+               CERROR("netobj require length %u but only %u received\n",
+                      (unsigned int) sizeof(*obj) + obj->len,
+                      msg->lm_buflens[segment]);
+               return NULL;
+       }
+
+       return obj;
+}
+#endif
+
+/*
+ * payload sizes should be obtained from the mechanism, but since we
+ * currently only support kerberos we can simply use fixed values.
+ * krb5 "meta" data:
+ *  - krb5 header:      16
+ *  - krb5 checksum:    20
+ *
+ * for privacy mode, the payload also includes the cipher text, which has the
+ * same size as the plain text, plus a possible confounder and padding, each
+ * at most one cipher block in size.
+ */
+#define GSS_KRB5_INTEG_MAX_PAYLOAD      (40)
+
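+/*
+ * A worked example of the estimate below (an illustration, not taken from
+ * the mechanism): sealing a 1024-byte message in privacy mode reserves
+ * 40 + 16 + 16 + 16 + 1024 = 1112 bytes -- krb5 meta data, plus confounder,
+ * padding and alignment slack at the maximum cipher block size.
+ */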
+static inline
+int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy)
+{
+       if (privacy)
+               return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize;
+       else
+               return GSS_KRB5_INTEG_MAX_PAYLOAD;
+}
+
+/*
+ * return signature size, otherwise < 0 to indicate error
+ */
+static int gss_sign_msg(struct lustre_msg *msg,
+                       struct gss_ctx *mechctx,
+                       enum lustre_sec_part sp,
+                       __u32 flags, __u32 proc, __u32 seq, __u32 svc,
+                       rawobj_t *handle)
+{
+       struct gss_header       *ghdr;
+       rawobj_t                text[4], mic;
+       int                     textcnt, max_textcnt, mic_idx;
+       __u32                   major;
+
+       LASSERT(msg->lm_bufcount >= 2);
+
+       /* gss hdr */
+       LASSERT(msg->lm_buflens[0] >=
+               sizeof(*ghdr) + (handle ? handle->len : 0));
+       ghdr = lustre_msg_buf(msg, 0, 0);
+
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) sp;
+       ghdr->gh_flags = flags;
+       ghdr->gh_proc = proc;
+       ghdr->gh_seq = seq;
+       ghdr->gh_svc = svc;
+       if (!handle) {
+               /* fill in a fake one */
+               ghdr->gh_handle.len = 0;
+       } else {
+               ghdr->gh_handle.len = handle->len;
+               memcpy(ghdr->gh_handle.data, handle->data, handle->len);
+       }
+
+       /* no actual signature for null mode */
+       if (svc == SPTLRPC_SVC_NULL)
+               return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+       /* MIC */
+       mic_idx = msg_last_segidx(msg);
+       max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+
+       for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+               text[textcnt].len = msg->lm_buflens[textcnt];
+               text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+       }
+
+       mic.len = msg->lm_buflens[mic_idx];
+       mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+       major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("fail to generate MIC: %08x\n", major);
+               return -EPERM;
+       }
+       LASSERT(mic.len <= msg->lm_buflens[mic_idx]);
+
+       return lustre_shrink_msg(msg, mic_idx, mic.len, 0);
+}
+
+/*
+ * return gss error
+ */
+static
+__u32 gss_verify_msg(struct lustre_msg *msg,
+                    struct gss_ctx *mechctx,
+                    __u32 svc)
+{
+       rawobj_t        text[4], mic;
+       int             textcnt, max_textcnt;
+       int             mic_idx;
+       __u32           major;
+
+       LASSERT(msg->lm_bufcount >= 2);
+
+       if (svc == SPTLRPC_SVC_NULL)
+               return GSS_S_COMPLETE;
+
+       mic_idx = msg_last_segidx(msg);
+       max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx;
+
+       for (textcnt = 0; textcnt < max_textcnt; textcnt++) {
+               text[textcnt].len = msg->lm_buflens[textcnt];
+               text[textcnt].data = lustre_msg_buf(msg, textcnt, 0);
+       }
+
+       mic.len = msg->lm_buflens[mic_idx];
+       mic.data = lustre_msg_buf(msg, mic_idx, 0);
+
+       major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic);
+       if (major != GSS_S_COMPLETE)
+               CERROR("mic verify error: %08x\n", major);
+
+       return major;
+}
+
+/*
+ * return gss error code
+ */
+static
+__u32 gss_unseal_msg(struct gss_ctx *mechctx,
+                  struct lustre_msg *msgbuf,
+                  int *msg_len, int msgbuf_len)
+{
+       rawobj_t        clear_obj, hdrobj, token;
+       __u8           *clear_buf;
+       int             clear_buflen;
+       __u32           major;
+       ENTRY;
+
+       if (msgbuf->lm_bufcount != 2) {
+               CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount);
+               RETURN(GSS_S_FAILURE);
+       }
+
+       /* allocate a temporary clear-text buffer of the same size as the
+        * token; we assume the final clear text size <= token size */
+       clear_buflen = lustre_msg_buflen(msgbuf, 1);
+       OBD_ALLOC_LARGE(clear_buf, clear_buflen);
+       if (!clear_buf)
+               RETURN(GSS_S_FAILURE);
+
+       /* buffer objects */
+       hdrobj.len = lustre_msg_buflen(msgbuf, 0);
+       hdrobj.data = lustre_msg_buf(msgbuf, 0, 0);
+       token.len = lustre_msg_buflen(msgbuf, 1);
+       token.data = lustre_msg_buf(msgbuf, 1, 0);
+       clear_obj.len = clear_buflen;
+       clear_obj.data = clear_buf;
+
+       major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("unwrap message error: %08x\n", major);
+               GOTO(out_free, major = GSS_S_FAILURE);
+       }
+       LASSERT(clear_obj.len <= clear_buflen);
+       LASSERT(clear_obj.len <= msgbuf_len);
+
+       /* now the decrypted message */
+       memcpy(msgbuf, clear_obj.data, clear_obj.len);
+       *msg_len = clear_obj.len;
+
+       major = GSS_S_COMPLETE;
+out_free:
+       OBD_FREE_LARGE(clear_buf, clear_buflen);
+       RETURN(major);
+}
+
+/********************************************
+ * gss client context manipulation helpers  *
+ ********************************************/
+
+int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->cc_refcount));
+
+       if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) {
+               if (!ctx->cc_early_expire)
+                       clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+               CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n",
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_expire,
+                     ctx->cc_expire == 0 ? 0 :
+                     cfs_time_sub(ctx->cc_expire, cfs_time_current_sec()));
+
+               sptlrpc_cli_ctx_wakeup(ctx);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * return 1 if the context is dead.
+ */
+int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx)
+{
+       if (unlikely(cli_ctx_is_dead(ctx)))
+               return 1;
+
+       /* cc_expire == 0 means it never expires. A newly created gss
+        * context may have 0 expiration while the upcall is in progress */
+       if (ctx->cc_expire == 0)
+               return 0;
+
+       /* check real expiration */
+       if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec()))
+               return 0;
+
+       cli_ctx_expire(ctx);
+       return 1;
+}
+
+void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx)
+{
+       struct ptlrpc_cli_ctx  *ctx = &gctx->gc_base;
+       unsigned long      ctx_expiry;
+
+       if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) {
+               CERROR("ctx %p(%u): unable to inquire, expire it now\n",
+                      gctx, ctx->cc_vcred.vc_uid);
+               ctx_expiry = 1; /* make it expired now */
+       }
+
+       ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry,
+                                             ctx->cc_sec->ps_flvr.sf_flags);
+
+       /* At this point the ctx might have been marked as dead by
+        * someone else, in which case nobody will make further use
+        * of it. We don't care; marking it UPTODATE helps destroy
+        * the server side context when it is destroyed. */
+       set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags);
+
+       if (sec_is_reverse(ctx->cc_sec)) {
+               CWARN("server installed reverse ctx %p idx "LPX64", "
+                     "expiry %lu(%+lds)\n", ctx,
+                     gss_handle_to_u64(&gctx->gc_handle),
+                     ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+       } else {
+               CWARN("client refreshed ctx %p idx "LPX64" (%u->%s), "
+                     "expiry %lu(%+lds)\n", ctx,
+                     gss_handle_to_u64(&gctx->gc_handle),
+                     ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+                     ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec());
+
+               /* install reverse svc ctx for root context */
+               if (ctx->cc_vcred.vc_uid == 0)
+                       gss_sec_install_rctx(ctx->cc_sec->ps_import,
+                                            ctx->cc_sec, ctx);
+       }
+
+       sptlrpc_cli_ctx_wakeup(ctx);
+}
+
+static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx)
+{
+       LASSERT(gctx->gc_base.cc_sec);
+
+       if (gctx->gc_mechctx) {
+               lgss_delete_sec_context(&gctx->gc_mechctx);
+               gctx->gc_mechctx = NULL;
+       }
+
+       if (!rawobj_empty(&gctx->gc_svc_handle)) {
+               /* forward ctx: mark the buddy reverse svcctx to expire soon */
+               if (!sec_is_reverse(gctx->gc_base.cc_sec))
+                       gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle);
+
+               rawobj_free(&gctx->gc_svc_handle);
+       }
+
+       rawobj_free(&gctx->gc_handle);
+}
+
+/*
+ * Based on the sequence number algorithm specified in RFC 2203.
+ *
+ * Modified for our own problem: an arriving request has a valid sequence
+ * number, but unwrapping the request might take a long time, after which
+ * its sequence number is no longer valid (it has fallen behind the
+ * window). This rarely happens, mostly under extreme load.
+ *
+ * Note we should not check the sequence number before verifying the
+ * integrity of the incoming request, because a single attacking request
+ * with a high sequence number might cause all following requests to be
+ * dropped.
+ *
+ * So here we use a multi-phase approach: prepare 2 sequence windows, a
+ * "main window" for normal sequence numbers and a "back window" for those
+ * that have fallen behind, with a 3-phase checking mechanism:
+ *  0 - before integrity verification, perform an initial sequence check in
+ *      the main window, which only tests and doesn't actually set any
+ *      bits. If the sequence is above the window, or fits in the window
+ *      and the bit is 0, accept it and proceed to integrity verification;
+ *      otherwise reject this sequence.
+ *  1 - after integrity verification, check the main window again. If this
+ *      sequence is above the window, or fits in the window and the bit is
+ *      0, set the bit and accept; if it fits in the window but the bit is
+ *      already set, reject; if it has fallen behind the window, proceed
+ *      to phase 2.
+ *  2 - check the back window. If it is above the window, or fits in the
+ *      window and the bit is 0, set the bit and accept; otherwise reject.
+ *
+ * return value:
+ *   1: looks like a replay
+ *   0: is ok
+ *  -1: is a replay
+ *
+ * Note phase 0 is necessary, because otherwise a replayed request whose
+ * sequence number lies between the 2 windows couldn't be detected.
+ *
+ * This mechanism can't totally solve the problem, but it helps far fewer
+ * valid requests be dropped.
+ */
+static
+int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq,
+                    __u32 seq_num, int phase)
+{
+       LASSERT(phase >= 0 && phase <= 2);
+
+       if (seq_num > *max_seq) {
+               /*
+                * 1. high above the window
+                */
+               if (phase == 0)
+                       return 0;
+
+               if (seq_num >= *max_seq + win_size) {
+                       memset(window, 0, win_size / 8);
+                       *max_seq = seq_num;
+               } else {
+                       while (*max_seq < seq_num) {
+                               (*max_seq)++;
+                               __clear_bit((*max_seq) % win_size, window);
+                       }
+               }
+               __set_bit(seq_num % win_size, window);
+       } else if (seq_num + win_size <= *max_seq) {
+               /*
+                * 2. low behind the window
+                */
+               if (phase == 0 || phase == 2)
+                       goto replay;
+
+               CWARN("seq %u is %u behind (size %d), check backup window\n",
+                     seq_num, *max_seq - win_size - seq_num, win_size);
+               return 1;
+       } else {
+               /*
+                * 3. fit into the window
+                */
+               switch (phase) {
+               case 0:
+                       if (test_bit(seq_num % win_size, window))
+                               goto replay;
+                       break;
+               case 1:
+               case 2:
+                       if (__test_and_set_bit(seq_num % win_size, window))
+                               goto replay;
+                       break;
+               }
+       }
+
+       return 0;
+
+replay:
+       CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n",
+              seq_num,
+              seq_num + win_size > *max_seq ? "in" : "behind",
+              phase == 2 ? "backup " : "main",
+              *max_seq, win_size);
+       return -1;
+}
+
+/*
+ * Based on the sequence number algorithm specified in RFC 2203.
+ *
+ * if @set == 0: initial check, don't set any bit in the window
+ * if @set == 1: final check, set the bit in the window
+ */
+int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set)
+{
+       int rc = 0;
+
+       spin_lock(&ssd->ssd_lock);
+
+       if (set == 0) {
+               /*
+                * phase 0 testing
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+                                     &ssd->ssd_max_main, seq_num, 0);
+               if (unlikely(rc))
+                       gss_stat_oos_record_svc(0, 1);
+       } else {
+               /*
+                * phase 1 checking main window
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN,
+                                     &ssd->ssd_max_main, seq_num, 1);
+               switch (rc) {
+               case -1:
+                       gss_stat_oos_record_svc(1, 1);
+                       /* fall through */
+               case 0:
+                       goto exit;
+               }
+               /*
+                * phase 2 checking back window
+                */
+               rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK,
+                                     &ssd->ssd_max_back, seq_num, 2);
+               if (rc)
+                       gss_stat_oos_record_svc(2, 1);
+               else
+                       gss_stat_oos_record_svc(2, 0);
+       }
+exit:
+       spin_unlock(&ssd->ssd_lock);
+       return rc;
+}
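+
+/*
+ * Hedged usage sketch, kept out of the build like the #if 0 block earlier
+ * in this file: how a caller is expected to drive the two-phase check.
+ * The helper name and the sequence number 100 are hypothetical example
+ * values, not part of the actual call sites.
+ */
+#if 0
+static void gss_check_seq_num_example(struct gss_svc_seq_data *ssd)
+{
+       /* phase 0: peek only, no window bit is set yet */
+       if (gss_check_seq_num(ssd, 100, 0) != 0)
+               return; /* rejected before doing expensive verification */
+
+       /* ... verify the integrity of the request here ... */
+
+       /* final check: sets the window bit; replaying the same
+        * sequence number afterwards would return -1 */
+       if (gss_check_seq_num(ssd, 100, 1) < 0)
+               CERROR("request is a replay\n");
+}
+#endif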
+
+/***************************************
+ * cred APIs                           *
+ ***************************************/
+
+static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx,
+                                 int msgsize, int privacy)
+{
+       return gss_mech_payload(NULL, msgsize, privacy);
+}
+
+static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx,
+                               struct sptlrpc_flavor *flvr,
+                               int reply, int read)
+{
+       int     payload = sizeof(struct ptlrpc_bulk_sec_desc);
+
+       LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT);
+
+       if ((!reply && !read) || (reply && read)) {
+               switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+               case SPTLRPC_BULK_SVC_NULL:
+                       break;
+               case SPTLRPC_BULK_SVC_INTG:
+                       payload += gss_cli_payload(ctx, 0, 0);
+                       break;
+               case SPTLRPC_BULK_SVC_PRIV:
+                       payload += gss_cli_payload(ctx, 0, 1);
+                       break;
+               case SPTLRPC_BULK_SVC_AUTH:
+               default:
+                       LBUG();
+               }
+       }
+
+       return payload;
+}
+
+int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred)
+{
+       return (ctx->cc_vcred.vc_uid == vcred->vc_uid);
+}
+
+void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_CTX_NEW)
+               strncat(buf, "new,", bufsize);
+       if (flags & PTLRPC_CTX_UPTODATE)
+               strncat(buf, "uptodate,", bufsize);
+       if (flags & PTLRPC_CTX_DEAD)
+               strncat(buf, "dead,", bufsize);
+       if (flags & PTLRPC_CTX_ERROR)
+               strncat(buf, "error,", bufsize);
+       if (flags & PTLRPC_CTX_CACHED)
+               strncat(buf, "cached,", bufsize);
+       if (flags & PTLRPC_CTX_ETERNAL)
+               strncat(buf, "eternal,", bufsize);
+       if (buf[0] == '\0')
+               strncat(buf, "-,", bufsize);
+
+       buf[strlen(buf) - 1] = '\0';
+}
+
+int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx,
+                    struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx = ctx2gctx(ctx);
+       __u32                    flags = 0, seq, svc;
+       int                      rc;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+       LASSERT(req->rq_cli_ctx == ctx);
+
+       /* nothing to do for context negotiation RPCs */
+       if (req->rq_ctx_init)
+               RETURN(0);
+
+       svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       if (req->rq_pack_bulk)
+               flags |= LUSTRE_GSS_PACK_BULK;
+       if (req->rq_pack_udesc)
+               flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+       seq = atomic_inc_return(&gctx->gc_seq);
+
+       rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx,
+                         ctx->cc_sec->ps_part,
+                         flags, gctx->gc_proc, seq, svc,
+                         &gctx->gc_handle);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* gss_sign_msg() might take a long time to finish, during which
+        * more rpcs could be wrapped up and sent out. If we find too many
+        * of them we should repack this rpc, because sending it too late
+        * might cause its sequence number to fall behind the window on the
+        * server and the rpc to be dropped. The same applies to
+        * gss_cli_ctx_seal().
+        *
+        * Note: null mode doesn't check the sequence number. */
+       if (svc != SPTLRPC_SVC_NULL &&
+           atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) {
+               int behind = atomic_read(&gctx->gc_seq) - seq;
+
+               gss_stat_oos_record_cli(behind);
+               CWARN("req %p: %u behind, retry signing\n", req, behind);
+               goto redo;
+       }
+
+       req->rq_reqdata_len = rc;
+       RETURN(0);
+}
+
+static
+int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+                                 struct ptlrpc_request *req,
+                                 struct gss_header *ghdr)
+{
+       struct gss_err_header *errhdr;
+       int rc;
+
+       LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+       errhdr = (struct gss_err_header *) ghdr;
+
+       CWARN("req x"LPU64"/t"LPU64", ctx %p idx "LPX64"(%u->%s): "
+             "%sserver respond (%08x/%08x)\n",
+             req->rq_xid, req->rq_transno, ctx,
+             gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+             ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+             sec_is_reverse(ctx->cc_sec) ? "reverse" : "",
+             errhdr->gh_major, errhdr->gh_minor);
+
+       /* context fini rpc, let it fail */
+       if (req->rq_ctx_fini) {
+               CWARN("context fini rpc failed\n");
+               return -EINVAL;
+       }
+
+       /* reverse sec: just return the error; don't expire this ctx because
+        * it's crucial to callback rpcs. Note that if the callback rpc
+        * failed because of a bit flip during network transfer, the client
+        * will be evicted directly. So, to be more graceful, we probably
+        * want to let it retry a number of times. */
+       if (sec_is_reverse(ctx->cc_sec))
+               return -EINVAL;
+
+       if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+           errhdr->gh_major != GSS_S_BAD_SIG)
+               return -EACCES;
+
+       /* A server returning NO_CONTEXT might be caused by context expiry
+        * or a server reboot/failover. We try to refresh a new ctx, which
+        * is transparent to the upper layer.
+        *
+        * In some cases our gss handle may happen to be identical to
+        * another handle, since the handle itself is not fully random.
+        * In the krb5 case GSS_S_BAD_SIG will be returned; other
+        * mechanisms may return other gss errors.
+        *
+        * If we add a new mechanism, make sure the correct error is
+        * returned in this case. */
+       CWARN("%s: server might have lost the context, retrying\n",
+             errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG");
+
+       sptlrpc_cli_ctx_expire(ctx);
+
+       /* we need to replace the ctx right here; otherwise during
+        * resend we'll hit the logic in sptlrpc_req_refresh_ctx()
+        * which keeps the ctx with the RESEND flag, and thus we'll
+        * never get rid of this ctx. */
+       rc = sptlrpc_req_replace_dead_ctx(req);
+       if (rc == 0)
+               req->rq_resend = 1;
+
+       return rc;
+}
+
+int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
+                      struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx;
+       struct gss_header       *ghdr, *reqhdr;
+       struct lustre_msg       *msg = req->rq_repdata;
+       __u32                    major;
+       int                      pack_bulk, swabbed, rc = 0;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(msg);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       /* special case for context negotiation: rq_repmsg/rq_replen are not
+        * actually used currently, but an early reply is always treated
+        * normally */
+       if (req->rq_ctx_init && !req->rq_early) {
+               req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+               req->rq_replen = msg->lm_buflens[1];
+               RETURN(0);
+       }
+
+       if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) {
+               CERROR("unexpected bufcount %u\n", msg->lm_bufcount);
+               RETURN(-EPROTO);
+       }
+
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       ghdr = gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(-EPROTO);
+       }
+
+       /* sanity checks */
+       reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr));
+       LASSERT(reqhdr);
+
+       if (ghdr->gh_version != reqhdr->gh_version) {
+               CERROR("gss version %u mismatch, expect %u\n",
+                      ghdr->gh_version, reqhdr->gh_version);
+               RETURN(-EPROTO);
+       }
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_DATA:
+               pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+               if (!req->rq_early &&
+                   !equi(req->rq_pack_bulk == 1, pack_bulk)) {
+                       CERROR("%s bulk flag in reply\n",
+                              req->rq_pack_bulk ? "missing" : "unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (ghdr->gh_seq != reqhdr->gh_seq) {
+                       CERROR("seqnum %u mismatch, expect %u\n",
+                              ghdr->gh_seq, reqhdr->gh_seq);
+                       RETURN(-EPROTO);
+               }
+
+               if (ghdr->gh_svc != reqhdr->gh_svc) {
+                       CERROR("svc %u mismatch, expect %u\n",
+                              ghdr->gh_svc, reqhdr->gh_svc);
+                       RETURN(-EPROTO);
+               }
+
+               if (swabbed)
+                       gss_header_swabber(ghdr);
+
+               major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc);
+               if (major != GSS_S_COMPLETE) {
+                       CERROR("failed to verify reply: %x\n", major);
+                       RETURN(-EPERM);
+               }
+
+               if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) {
+                       __u32 cksum;
+
+                       cksum = crc32_le(!(__u32) 0,
+                                        lustre_msg_buf(msg, 1, 0),
+                                        lustre_msg_buflen(msg, 1));
+                       if (cksum != msg->lm_cksum) {
+                               CWARN("early reply checksum mismatch: "
+                                     "%08x != %08x\n", cksum, msg->lm_cksum);
+                               RETURN(-EPROTO);
+                       }
+               }
+
+               if (pack_bulk) {
+                       /* bulk checksum is right after the lustre msg */
+                       if (msg->lm_bufcount < 3) {
+                               CERROR("Invalid reply bufcount %u\n",
+                                      msg->lm_bufcount);
+                               RETURN(-EPROTO);
+                       }
+
+                       rc = bulk_sec_desc_unpack(msg, 2, swabbed);
+                       if (rc) {
+                               CERROR("unpack bulk desc: %d\n", rc);
+                               RETURN(rc);
+                       }
+               }
+
+               req->rq_repmsg = lustre_msg_buf(msg, 1, 0);
+               req->rq_replen = msg->lm_buflens[1];
+               break;
+       case PTLRPC_GSS_PROC_ERR:
+               if (req->rq_early) {
+                       CERROR("server return error with early reply\n");
+                       rc = -EPROTO;
+               } else {
+                       rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+               }
+               break;
+       default:
+               CERROR("unknown gss proc %d\n", ghdr->gh_proc);
+               rc = -EPROTO;
+       }
+
+       RETURN(rc);
+}
+
+int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx,
+                    struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx;
+       rawobj_t                 hdrobj, msgobj, token;
+       struct gss_header       *ghdr;
+       __u32                    buflens[2], major;
+       int                      wiresize, rc;
+       ENTRY;
+
+       LASSERT(req->rq_clrbuf);
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(req->rq_reqlen);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       /* final clear data length */
+       req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount,
+                                                req->rq_clrbuf->lm_buflens);
+
+       /* calculate wire data length */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1);
+       wiresize = lustre_msg_size_v2(2, buflens);
+
+       /* allocate wire buffer */
+       if (req->rq_pool) {
+               /* pre-allocated */
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf != req->rq_clrbuf);
+               LASSERT(req->rq_reqbuf_len >= wiresize);
+       } else {
+               OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+               req->rq_reqbuf_len = wiresize;
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL);
+       req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       /* gss header */
+       ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = gctx->gc_proc;
+       ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+       ghdr->gh_handle.len = gctx->gc_handle.len;
+       memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len);
+       if (req->rq_pack_bulk)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+       if (req->rq_pack_udesc)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_USER;
+
+redo:
+       ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+
+       /* buffer objects */
+       hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+       hdrobj.data = (__u8 *) ghdr;
+       msgobj.len = req->rq_clrdata_len;
+       msgobj.data = (__u8 *) req->rq_clrbuf;
+       token.len = lustre_msg_buflen(req->rq_reqbuf, 1);
+       token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+
+       major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj,
+                         req->rq_clrbuf_len, &token);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("priv: wrap message error: %08x\n", major);
+               GOTO(err_free, rc = -EPERM);
+       }
+       LASSERT(token.len <= buflens[1]);
+
+       /* see the explanation in gss_cli_ctx_sign() */
+       if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq >
+                    GSS_SEQ_REPACK_THRESHOLD)) {
+               int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq;
+
+               gss_stat_oos_record_cli(behind);
+               CWARN("req %p: %u behind, retry sealing\n", req, behind);
+
+               ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq);
+               goto redo;
+       }
+
+       /* now set the final wire data length */
+       req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1,
+                                               token.len, 0);
+       RETURN(0);
+
+err_free:
+       if (!req->rq_pool) {
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+       RETURN(rc);
+}
+
+int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx,
+                      struct ptlrpc_request *req)
+{
+       struct gss_cli_ctx      *gctx;
+       struct gss_header       *ghdr;
+       struct lustre_msg       *msg = req->rq_repdata;
+       int                      msglen, pack_bulk, swabbed, rc;
+       __u32                    major;
+       ENTRY;
+
+       LASSERT(req->rq_cli_ctx == ctx);
+       LASSERT(req->rq_ctx_init == 0);
+       LASSERT(msg);
+
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       ghdr = gss_swab_header(msg, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(-EPROTO);
+       }
+
+       /* sanity checks */
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("gss version %u mismatch, expect %u\n",
+                      ghdr->gh_version, PTLRPC_GSS_VERSION);
+               RETURN(-EPROTO);
+       }
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_DATA:
+               pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK;
+
+               if (!req->rq_early &&
+                   !equi(req->rq_pack_bulk == 1, pack_bulk)) {
+                       CERROR("%s bulk flag in reply\n",
+                              req->rq_pack_bulk ? "missing" : "unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (swabbed)
+                       gss_header_swabber(ghdr);
+
+               /* use rq_repdata_len as the buffer size, which assumes
+                * unsealing doesn't need extra memory space. for precise
+                * control, we'd better calculate the actual buffer size as
+                * (repbuf_len - offset - repdata_len) */
+               major = gss_unseal_msg(gctx->gc_mechctx, msg,
+                                      &msglen, req->rq_repdata_len);
+               if (major != GSS_S_COMPLETE) {
+                       CERROR("failed to unwrap reply: %x\n", major);
+                       rc = -EPERM;
+                       break;
+               }
+
+               swabbed = __lustre_unpack_msg(msg, msglen);
+               if (swabbed < 0) {
+                       CERROR("Failed to unpack after decryption\n");
+                       RETURN(-EPROTO);
+               }
+
+               if (msg->lm_bufcount < 1) {
+                       CERROR("Invalid reply buffer: empty\n");
+                       RETURN(-EPROTO);
+               }
+
+               if (pack_bulk) {
+                       if (msg->lm_bufcount < 2) {
+                               CERROR("bufcount %u: missing bulk sec desc\n",
+                                      msg->lm_bufcount);
+                               RETURN(-EPROTO);
+                       }
+
+                       /* bulk checksum is the last segment */
+                       if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1,
+                                                swabbed))
+                               RETURN(-EPROTO);
+               }
+
+               req->rq_repmsg = lustre_msg_buf(msg, 0, 0);
+               req->rq_replen = msg->lm_buflens[0];
+
+               rc = 0;
+               break;
+       case PTLRPC_GSS_PROC_ERR:
+               if (req->rq_early) {
+                       CERROR("server return error with early reply\n");
+                       rc = -EPROTO;
+               } else {
+                       rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr);
+               }
+               break;
+       default:
+               CERROR("unexpected proc %d\n", ghdr->gh_proc);
+               rc = -EPERM;
+       }
+
+       RETURN(rc);
+}
+
+/*********************************************
+ * reverse context installation              *
+ *********************************************/
+
+static inline
+int gss_install_rvs_svc_ctx(struct obd_import *imp,
+                           struct gss_sec *gsec,
+                           struct gss_cli_ctx *gctx)
+{
+       return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx);
+}
+
+/*********************************************
+ * GSS security APIs                         *
+ *********************************************/
+int gss_sec_create_common(struct gss_sec *gsec,
+                         struct ptlrpc_sec_policy *policy,
+                         struct obd_import *imp,
+                         struct ptlrpc_svc_ctx *svcctx,
+                         struct sptlrpc_flavor *sf)
+{
+       struct ptlrpc_sec   *sec;
+
+       LASSERT(imp);
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS);
+
+       gsec->gs_mech = lgss_subflavor_to_mech(
+                               SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+       if (!gsec->gs_mech) {
+               CERROR("gss backend 0x%x not found\n",
+                      SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc));
+               return -EOPNOTSUPP;
+       }
+
+       spin_lock_init(&gsec->gs_lock);
+       gsec->gs_rvs_hdl = 0ULL;
+
+       /* initialize upper ptlrpc_sec */
+       sec = &gsec->gs_base;
+       sec->ps_policy = policy;
+       atomic_set(&sec->ps_refcount, 0);
+       atomic_set(&sec->ps_nctx, 0);
+       sec->ps_id = sptlrpc_get_next_secid();
+       sec->ps_flvr = *sf;
+       sec->ps_import = class_import_get(imp);
+       spin_lock_init(&sec->ps_lock);
+       INIT_LIST_HEAD(&sec->ps_gc_list);
+
+       if (!svcctx) {
+               sec->ps_gc_interval = GSS_GC_INTERVAL;
+       } else {
+               LASSERT(sec_is_reverse(sec));
+
+               /* never do gc on reverse sec */
+               sec->ps_gc_interval = 0;
+       }
+
+       if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+               sptlrpc_enc_pool_add_user();
+
+       CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""),
+              policy->sp_name, gsec);
+       return 0;
+}
+
+void gss_sec_destroy_common(struct gss_sec *gsec)
+{
+       struct ptlrpc_sec      *sec = &gsec->gs_base;
+       ENTRY;
+
+       LASSERT(sec->ps_import);
+       LASSERT(atomic_read(&sec->ps_refcount) == 0);
+       LASSERT(atomic_read(&sec->ps_nctx) == 0);
+
+       if (gsec->gs_mech) {
+               lgss_mech_put(gsec->gs_mech);
+               gsec->gs_mech = NULL;
+       }
+
+       class_import_put(sec->ps_import);
+
+       if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV)
+               sptlrpc_enc_pool_del_user();
+
+       EXIT;
+}
+
+void gss_sec_kill(struct ptlrpc_sec *sec)
+{
+       sec->ps_dying = 1;
+}
+
+int gss_cli_ctx_init_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx,
+                           struct ptlrpc_ctx_ops *ctxops,
+                           struct vfs_cred *vcred)
+{
+       struct gss_cli_ctx    *gctx = ctx2gctx(ctx);
+
+       gctx->gc_win = 0;
+       atomic_set(&gctx->gc_seq, 0);
+
+       INIT_HLIST_NODE(&ctx->cc_cache);
+       atomic_set(&ctx->cc_refcount, 0);
+       ctx->cc_sec = sec;
+       ctx->cc_ops = ctxops;
+       ctx->cc_expire = 0;
+       ctx->cc_flags = PTLRPC_CTX_NEW;
+       ctx->cc_vcred = *vcred;
+       spin_lock_init(&ctx->cc_lock);
+       INIT_LIST_HEAD(&ctx->cc_req_list);
+       INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+       /* take a ref on the owning sec; balanced when the ctx is destroyed */
+       atomic_inc(&sec->ps_refcount);
+       /* statistics only */
+       atomic_inc(&sec->ps_nctx);
+
+       CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n",
+              sec->ps_policy->sp_name, ctx->cc_sec,
+              ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+       return 0;
+}
+
+/*
+ * return value:
+ *   1: the context has been taken care of by someone else
+ *   0: proceed to really destroy the context locally
+ */
+int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec,
+                           struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_cli_ctx *gctx = ctx2gctx(ctx);
+
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       LASSERT(ctx->cc_sec == sec);
+
+       /*
+        * remove the UPTODATE flag of a reverse ctx so we won't send a fini
+        * rpc; this is to avoid potential problems with the client side
+        * reverse svc ctx being mis-destroyed in various recovery scenarios.
+        * Anyway, the client can manage its reverse ctx well by associating
+        * it with its buddy ctx.
+        */
+       if (sec_is_reverse(sec))
+               ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE;
+
+       if (gctx->gc_mechctx) {
+               /* the final context fini rpc will use this ctx too, and it's
+                * asynchronous, finished by request_out_callback(). So we
+                * take a refcount; whoever finally drops the refcount to 0
+                * is responsible for the rest of the destroy. */
+               atomic_inc(&ctx->cc_refcount);
+
+               gss_do_ctx_fini_rpc(gctx);
+               gss_cli_ctx_finalize(gctx);
+
+               if (!atomic_dec_and_test(&ctx->cc_refcount))
+                       return 1;
+       }
+
+       if (sec_is_reverse(sec))
+               CWARN("reverse sec %p: destroy ctx %p\n",
+                     ctx->cc_sec, ctx);
+       else
+               CWARN("%s@%p: destroy ctx %p(%u->%s)\n",
+                     sec->ps_policy->sp_name, ctx->cc_sec,
+                     ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+
+       return 0;
+}
+
+static
+int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int svc, int msgsize)
+{
+       int             bufsize, txtsize;
+       int             bufcnt = 2;
+       __u32           buflens[5];
+       ENTRY;
+
+       /*
+        * on-wire data layout:
+        * - gss header
+        * - lustre message
+        * - user descriptor (optional)
+        * - bulk sec descriptor (optional)
+        * - signature (optional)
+        *   - svc == NULL: NULL
+        *   - svc == AUTH: signature of gss header
+        *   - svc == INTG: signature of all above
+        *
+        * if this is context negotiation, reserve fixed space
+        * at the last (signature) segment regardless of svc mode.
+        */
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       txtsize = buflens[0];
+
+       buflens[1] = msgsize;
+       if (svc == SPTLRPC_SVC_INTG)
+               txtsize += buflens[1];
+
+       if (req->rq_pack_udesc) {
+               buflens[bufcnt] = sptlrpc_current_user_desc_size();
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_pack_bulk) {
+               buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                      &req->rq_flvr,
+                                                      0, req->rq_bulk_read);
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_ctx_init)
+               buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+       else if (svc != SPTLRPC_SVC_NULL)
+               buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx,
+                                                   txtsize, 0);
+
+       bufsize = lustre_msg_size_v2(bufcnt, buflens);
+
+       if (!req->rq_reqbuf) {
+               bufsize = size_roundup_power2(bufsize);
+
+               OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+
+               req->rq_reqbuf_len = bufsize;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= bufsize);
+               memset(req->rq_reqbuf, 0, bufsize);
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL);
+       req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize);
+       LASSERT(req->rq_reqmsg);
+
+       /* pack the user desc here, since later we might leave the
+        * current user's process */
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_reqbuf, 2);
+
+       RETURN(0);
+}
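+
+/*
+ * Worked example (illustrative, derived from the layout comment above):
+ * an INTG request carrying a user descriptor but no bulk descriptor ends
+ * up with bufcnt == 4:
+ *   seg 0: gss header  (PTLRPC_GSS_HEADER_SIZE)
+ *   seg 1: lustre msg  (msgsize)
+ *   seg 2: user desc   (sptlrpc_current_user_desc_size())
+ *   seg 3: signature   (gss_cli_payload() over the sizes of segs 0-2)
+ */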
+
+static
+int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int msgsize)
+{
+       __u32           ibuflens[3], wbuflens[2];
+       int             ibufcnt;
+       int             clearsize, wiresize;
+       ENTRY;
+
+       LASSERT(req->rq_clrbuf == NULL);
+       LASSERT(req->rq_clrbuf_len == 0);
+
+       /* Inner (clear) buffers
+        *  - lustre message
+        *  - user descriptor (optional)
+        *  - bulk checksum (optional)
+        */
+       ibufcnt = 1;
+       ibuflens[0] = msgsize;
+
+       if (req->rq_pack_udesc)
+               ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size();
+       if (req->rq_pack_bulk)
+               ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                          &req->rq_flvr, 0,
+                                                          req->rq_bulk_read);
+
+       clearsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+       /* to allow appending padding during encryption */
+       clearsize += GSS_MAX_CIPHER_BLOCK;
+
+       /* Wrapper (wire) buffers
+        *  - gss header
+        *  - cipher text
+        */
+       wbuflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1);
+       wiresize = lustre_msg_size_v2(2, wbuflens);
+
+       if (req->rq_pool) {
+               /* rq_reqbuf is preallocated */
+               LASSERT(req->rq_reqbuf);
+               LASSERT(req->rq_reqbuf_len >= wiresize);
+
+               memset(req->rq_reqbuf, 0, req->rq_reqbuf_len);
+
+               /* if the pre-allocated buffer is big enough, we just pack
+                * both the clear buf and the request buf into it to avoid
+                * another allocation */
+               if (clearsize + wiresize <= req->rq_reqbuf_len) {
+                       req->rq_clrbuf =
+                               (void *) (((char *) req->rq_reqbuf) + wiresize);
+               } else {
+                       CWARN("pre-allocated buf size %d is not enough for "
+                             "both clear (%d) and cipher (%d) text, proceed "
+                             "with extra allocation\n", req->rq_reqbuf_len,
+                             clearsize, wiresize);
+               }
+       }
+
+       if (!req->rq_clrbuf) {
+               clearsize = size_roundup_power2(clearsize);
+
+               OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize);
+               if (!req->rq_clrbuf)
+                       RETURN(-ENOMEM);
+       }
+       req->rq_clrbuf_len = clearsize;
+
+       lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL);
+       req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize);
+
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_clrbuf, 1);
+
+       RETURN(0);
+}
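+
+/*
+ * Illustrative note on the shape built above: in PRIV mode the request is
+ * nested. The clear buffer holds [lustre msg][user desc?][bulk desc?] and
+ * is encrypted as a whole into the cipher-text segment of the wire buffer
+ * [gss header][cipher text], which is why the embedded message always
+ * sits at seg 0 of rq_clrbuf.
+ */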
+
+/*
+ * NOTE: any change of request buffer allocation should also consider
+ * changing the enlarge_reqbuf() series of functions.
+ */
+int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req,
+                    int msgsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+       LASSERT(!req->rq_pack_bulk ||
+               (req->rq_bulk_read || req->rq_bulk_write));
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_alloc_reqbuf_intg(sec, req, svc, msgsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_alloc_reqbuf_priv(sec, req, msgsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+void gss_free_reqbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req)
+{
+       int     privacy;
+       ENTRY;
+
+       LASSERT(!req->rq_pool || req->rq_reqbuf);
+       privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV;
+
+       if (!req->rq_clrbuf)
+               goto release_reqbuf;
+
+       /* release clear buffer */
+       LASSERT(privacy);
+       LASSERT(req->rq_clrbuf_len);
+
+       if (req->rq_pool == NULL ||
+           req->rq_clrbuf < req->rq_reqbuf ||
+           (char *) req->rq_clrbuf >=
+           (char *) req->rq_reqbuf + req->rq_reqbuf_len)
+               OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+
+       req->rq_clrbuf = NULL;
+       req->rq_clrbuf_len = 0;
+
+release_reqbuf:
+       if (!req->rq_pool && req->rq_reqbuf) {
+               LASSERT(req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+
+       EXIT;
+}
+
+static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize)
+{
+       bufsize = size_roundup_power2(bufsize);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, bufsize);
+       if (!req->rq_repbuf)
+               return -ENOMEM;
+
+       req->rq_repbuf_len = bufsize;
+       return 0;
+}
+
+static
+int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int svc, int msgsize)
+{
+       int             txtsize;
+       __u32           buflens[4];
+       int             bufcnt = 2;
+       int             alloc_size;
+
+       /*
+        * on-wire data layout:
+        * - gss header
+        * - lustre message
+        * - bulk sec descriptor (optional)
+        * - signature (optional)
+        *   - svc == NULL: NULL
+        *   - svc == AUTH: signature of gss header
+        *   - svc == INTG: signature of all above
+        *
+        * if this is context negotiation, reserve fixed space
+        * at the last (signature) segment regardless of svc mode.
+        */
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       txtsize = buflens[0];
+
+       buflens[1] = msgsize;
+       if (svc == SPTLRPC_SVC_INTG)
+               txtsize += buflens[1];
+
+       if (req->rq_pack_bulk) {
+               buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                      &req->rq_flvr,
+                                                      1, req->rq_bulk_read);
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[bufcnt];
+               bufcnt++;
+       }
+
+       if (req->rq_ctx_init)
+               buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN;
+       else if (svc != SPTLRPC_SVC_NULL)
+               buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx,
+                                                   txtsize, 0);
+
+       alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+
+       /* add space for early reply */
+       alloc_size += gss_at_reply_off_integ;
+
+       return do_alloc_repbuf(req, alloc_size);
+}
+
+static
+int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec,
+                         struct ptlrpc_request *req,
+                         int msgsize)
+{
+       int             txtsize;
+       __u32           buflens[2];
+       int             bufcnt;
+       int             alloc_size;
+
+       /* inner buffers */
+       bufcnt = 1;
+       buflens[0] = msgsize;
+
+       if (req->rq_pack_bulk)
+               buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx,
+                                                        &req->rq_flvr,
+                                                        1, req->rq_bulk_read);
+       txtsize = lustre_msg_size_v2(bufcnt, buflens);
+       txtsize += GSS_MAX_CIPHER_BLOCK;
+
+       /* wrapper buffers */
+       bufcnt = 2;
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1);
+
+       alloc_size = lustre_msg_size_v2(bufcnt, buflens);
+       /* add space for early reply */
+       alloc_size += gss_at_reply_off_priv;
+
+       return do_alloc_repbuf(req, alloc_size);
+}
+
+int gss_alloc_repbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req,
+                    int msgsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       ENTRY;
+
+       LASSERT(!req->rq_pack_bulk ||
+               (req->rq_bulk_read || req->rq_bulk_write));
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_alloc_repbuf_intg(sec, req, svc, msgsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_alloc_repbuf_priv(sec, req, msgsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+void gss_free_repbuf(struct ptlrpc_sec *sec,
+                    struct ptlrpc_request *req)
+{
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+       req->rq_repdata = NULL;
+       req->rq_repdata_len = 0;
+}
+
+static int get_enlarged_msgsize(struct lustre_msg *msg,
+                               int segment, int newsize)
+{
+       int save, newmsg_size;
+
+       LASSERT(newsize >= msg->lm_buflens[segment]);
+
+       save = msg->lm_buflens[segment];
+       msg->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       msg->lm_buflens[segment] = save;
+
+       return newmsg_size;
+}
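+
+/*
+ * Illustrative note: the buflens slot is swapped to the new size just long
+ * enough for lustre_msg_size_v2() to recompute the total, then restored,
+ * so the message itself is left untouched. E.g. growing a 100-byte segment
+ * to 200 bytes returns the old total plus the difference, subject to
+ * per-segment rounding.
+ */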
+
+static int get_enlarged_msgsize2(struct lustre_msg *msg,
+                                int segment1, int newsize1,
+                                int segment2, int newsize2)
+{
+       int save1, save2, newmsg_size;
+
+       LASSERT(newsize1 >= msg->lm_buflens[segment1]);
+       LASSERT(newsize2 >= msg->lm_buflens[segment2]);
+
+       save1 = msg->lm_buflens[segment1];
+       save2 = msg->lm_buflens[segment2];
+       msg->lm_buflens[segment1] = newsize1;
+       msg->lm_buflens[segment2] = newsize2;
+       newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       msg->lm_buflens[segment1] = save1;
+       msg->lm_buflens[segment2] = save2;
+
+       return newmsg_size;
+}
+
+static
+int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec,
+                           struct ptlrpc_request *req,
+                           int svc,
+                           int segment, int newsize)
+{
+       struct lustre_msg       *newbuf;
+       int                      txtsize, sigsize = 0, i;
+       int                      newmsg_size, newbuf_size;
+
+       /*
+        * gss header is at seg 0;
+        * embedded msg is at seg 1;
+        * signature (if any) is at the last seg
+        */
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf_len > req->rq_reqlen);
+       LASSERT(req->rq_reqbuf->lm_bufcount >= 2);
+       LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg);
+
+       /* 1. compute new embedded msg size */
+       newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+       LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]);
+
+       /* 2. compute new wrapper msg size */
+       if (svc == SPTLRPC_SVC_NULL) {
+               /* no signature, get size directly */
+               newbuf_size = get_enlarged_msgsize(req->rq_reqbuf,
+                                                  1, newmsg_size);
+       } else {
+               txtsize = req->rq_reqbuf->lm_buflens[0];
+
+               if (svc == SPTLRPC_SVC_INTG) {
+                       for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++)
+                               txtsize += req->rq_reqbuf->lm_buflens[i];
+                       txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1];
+               }
+
+               sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0);
+               LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf));
+
+               newbuf_size = get_enlarged_msgsize2(
+                                       req->rq_reqbuf,
+                                       1, newmsg_size,
+                                       msg_last_segidx(req->rq_reqbuf),
+                                       sigsize);
+       }
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+       if (req->rq_reqbuf_len < newbuf_size) {
+               newbuf_size = size_roundup_power2(newbuf_size);
+
+               OBD_ALLOC_LARGE(newbuf, newbuf_size);
+               if (newbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = newbuf;
+               req->rq_reqbuf_len = newbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0);
+       }
+
+       /* do enlargement, from wrapper to embedded, from end to beginning */
+       if (svc != SPTLRPC_SVC_NULL)
+               _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf,
+                                            msg_last_segidx(req->rq_reqbuf),
+                                            sigsize);
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+       req->rq_reqlen = newmsg_size;
+       RETURN(0);
+}
+
+static
+int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec,
+                           struct ptlrpc_request *req,
+                           int segment, int newsize)
+{
+       struct lustre_msg       *newclrbuf;
+       int                      newmsg_size, newclrbuf_size, newcipbuf_size;
+       __u32                    buflens[3];
+
+       /*
+        * embedded msg is at seg 0 of clear buffer;
+        * cipher text is at seg 2 of cipher buffer;
+        */
+       LASSERT(req->rq_pool ||
+               (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0));
+       LASSERT(req->rq_reqbuf == NULL ||
+               (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3));
+       LASSERT(req->rq_clrbuf);
+       LASSERT(req->rq_clrbuf_len > req->rq_reqlen);
+       LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg);
+
+       /* compute new embedded msg size */
+       newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize);
+
+       /* compute new clear buffer size */
+       newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size);
+       newclrbuf_size += GSS_MAX_CIPHER_BLOCK;
+
+       /* compute new cipher buffer size */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0);
+       buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1);
+       newcipbuf_size = lustre_msg_size_v2(3, buflens);
+
+       /* handle the case where we put both the clear buf and the cipher
+        * buf into a single pre-allocated buffer */
+       if (unlikely(req->rq_pool) &&
+           req->rq_clrbuf >= req->rq_reqbuf &&
+           (char *) req->rq_clrbuf <
+           (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+               /* best case: we still fit into the pre-allocated buffer */
+               if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) {
+                       void *src, *dst;
+
+                       /* move the clear text towards the end of the buffer */
+                       src = req->rq_clrbuf;
+                       dst = (char *) req->rq_reqbuf + newcipbuf_size;
+
+                       memmove(dst, src, req->rq_clrbuf_len);
+
+                       req->rq_clrbuf = (struct lustre_msg *) dst;
+                       req->rq_clrbuf_len = newclrbuf_size;
+                       req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+               } else {
+                       /* sadly we have to split out the clear buffer */
+                       LASSERT(req->rq_reqbuf_len >= newcipbuf_size);
+                       LASSERT(req->rq_clrbuf_len < newclrbuf_size);
+               }
+       }
+
+       if (req->rq_clrbuf_len < newclrbuf_size) {
+               newclrbuf_size = size_roundup_power2(newclrbuf_size);
+
+               OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size);
+               if (newclrbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len);
+
+               if (req->rq_reqbuf == NULL ||
+                   req->rq_clrbuf < req->rq_reqbuf ||
+                   (char *) req->rq_clrbuf >=
+                   (char *) req->rq_reqbuf + req->rq_reqbuf_len) {
+                       OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len);
+               }
+
+               req->rq_clrbuf = newclrbuf;
+               req->rq_clrbuf_len = newclrbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0);
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+       req->rq_reqlen = newmsg_size;
+
+       RETURN(0);
+}
+
+int gss_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int segment, int newsize)
+{
+       int     svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+
+       LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini);
+
+       switch (svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize);
+       case SPTLRPC_SVC_PRIV:
+               return gss_enlarge_reqbuf_priv(sec, req, segment, newsize);
+       default:
+               LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               return 0;
+       }
+}
+
+int gss_sec_install_rctx(struct obd_import *imp,
+                        struct ptlrpc_sec *sec,
+                        struct ptlrpc_cli_ctx *ctx)
+{
+       struct gss_sec     *gsec;
+       struct gss_cli_ctx *gctx;
+       int              rc;
+
+       gsec = container_of(sec, struct gss_sec, gs_base);
+       gctx = container_of(ctx, struct gss_cli_ctx, gc_base);
+
+       rc = gss_install_rvs_svc_ctx(imp, gsec, gctx);
+       return rc;
+}
+
+/*****************************************
+ * server side API                       *
+ *****************************************/
+
+static inline
+int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(grctx);
+       return (grctx->src_init || grctx->src_init_continue ||
+               grctx->src_err_notify);
+}
+
+static
+void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx)
+{
+       if (grctx->src_ctx)
+               gss_svc_upcall_put_ctx(grctx->src_ctx);
+
+       sptlrpc_policy_put(grctx->src_base.sc_policy);
+       OBD_FREE_PTR(grctx);
+}
+
+static inline
+void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+       atomic_inc(&grctx->src_base.sc_refcount);
+}
+
+static inline
+void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx)
+{
+       LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0);
+
+       if (atomic_dec_and_test(&grctx->src_base.sc_refcount))
+               gss_svc_reqctx_free(grctx);
+}
+
+static
+int gss_svc_sign(struct ptlrpc_request *req,
+                struct ptlrpc_reply_state *rs,
+                struct gss_svc_reqctx *grctx,
+                __u32 svc)
+{
+       __u32   flags = 0;
+       int     rc;
+       ENTRY;
+
+       LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0));
+
+       /* the embedded lustre_msg might have been shrunk */
+       if (req->rq_replen != rs->rs_repbuf->lm_buflens[1])
+               lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1);
+
+       if (req->rq_pack_bulk)
+               flags |= LUSTRE_GSS_PACK_BULK;
+
+       rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx,
+                         LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA,
+                         grctx->src_wirectx.gw_seq, svc, NULL);
+       if (rc < 0)
+               RETURN(rc);
+
+       rs->rs_repdata_len = rc;
+
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = gss_at_reply_off_integ;
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               if (svc == SPTLRPC_SVC_NULL)
+                       rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0,
+                                       lustre_msg_buf(rs->rs_repbuf, 1, 0),
+                                       lustre_msg_buflen(rs->rs_repbuf, 1));
+               req->rq_reply_off = 0;
+       }
+
+       RETURN(0);
+}
+
+int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor)
+{
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct ptlrpc_reply_state *rs;
+       struct gss_err_header     *ghdr;
+       int                     replen = sizeof(struct ptlrpc_body);
+       int                     rc;
+       ENTRY;
+
+       //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE))
+       //      RETURN(-EINVAL);
+
+       grctx->src_err_notify = 1;
+       grctx->src_reserve_len = 0;
+
+       rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0);
+       if (rc) {
+               CERROR("could not pack reply, err %d\n", rc);
+               RETURN(rc);
+       }
+
+       /* gss hdr */
+       rs = req->rq_reply_state;
+       LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr));
+       ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_ERR;
+       ghdr->gh_major = major;
+       ghdr->gh_minor = minor;
+       ghdr->gh_handle.len = 0; /* fake context handle */
+
+       rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+                                               rs->rs_repbuf->lm_buflens);
+
+       CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n",
+              major, minor, libcfs_nid2str(req->rq_peer.nid));
+       RETURN(0);
+}
+
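+/*
+ * Request layout assumed by the init handler below, inferred from its
+ * unpacking code (segment 2 is only present with LUSTRE_GSS_PACK_USER):
+ *
+ *   seg 0:    gss header
+ *   seg 1:    embedded lustre_msg
+ *   seg 2:    user descriptor (optional)
+ *   last seg: | lustre_svc | target uuid | reverse handle | token |
+ */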
+static
+int gss_svc_handle_init(struct ptlrpc_request *req,
+                       struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct lustre_msg        *reqbuf = req->rq_reqbuf;
+       struct obd_uuid    *uuid;
+       struct obd_device        *target;
+       rawobj_t                   uuid_obj, rvs_hdl, in_token;
+       __u32                 lustre_svc;
+       __u32                *secdata, seclen;
+       int                     swabbed, rc;
+       ENTRY;
+
+       CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc,
+              libcfs_nid2str(req->rq_peer.nid));
+
+       req->rq_ctx_init = 1;
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               CERROR("unexpected bulk flag\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) {
+               CERROR("proc %u: invalid handle length %u\n",
+                      gw->gw_proc, gw->gw_handle.len);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4) {
+               CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       /* ctx initiate payload is in last segment */
+       secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0);
+       seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1];
+
+       if (seclen < 4 + 4) {
+               CERROR("sec size %d too small\n", seclen);
+               RETURN(SECSVC_DROP);
+       }
+
+       /* lustre svc type */
+       lustre_svc = le32_to_cpu(*secdata++);
+       seclen -= 4;
+
+       /* extract target uuid; note this code is somewhat fragile
+        * because it touches the internal structure of obd_uuid */
+       if (rawobj_extract(&uuid_obj, &secdata, &seclen)) {
+               CERROR("failed to extract target uuid\n");
+               RETURN(SECSVC_DROP);
+       }
+       uuid_obj.data[uuid_obj.len - 1] = '\0';
+
+       uuid = (struct obd_uuid *) uuid_obj.data;
+       target = class_uuid2obd(uuid);
+       if (!target || target->obd_stopping || !target->obd_set_up) {
+               CERROR("target '%s' is not available for context init (%s)\n",
+                      uuid->uuid, target == NULL ? "no target" :
+                      (target->obd_stopping ? "stopping" : "not set up"));
+               RETURN(SECSVC_DROP);
+       }
+
+       /* extract reverse handle */
+       if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) {
+               CERROR("failed extract reverse handle\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       /* extract token */
+       if (rawobj_extract(&in_token, &secdata, &seclen)) {
+               CERROR("can't extract token\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc,
+                                       &rvs_hdl, &in_token);
+       if (rc != SECSVC_OK)
+               RETURN(rc);
+
+       if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss ||
+           grctx->src_ctx->gsc_usr_root)
+               CWARN("create svc ctx %p: user from %s authenticated as %s\n",
+                     grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid),
+                     grctx->src_ctx->gsc_usr_mds ? "mds" :
+                       (grctx->src_ctx->gsc_usr_oss ? "oss" : "root"));
+       else
+               CWARN("create svc ctx %p: accept user %u from %s\n",
+                     grctx->src_ctx, grctx->src_ctx->gsc_uid,
+                     libcfs_nid2str(req->rq_peer.nid));
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (reqbuf->lm_bufcount < 4) {
+                       CERROR("missing user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+               if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0);
+       req->rq_reqlen = lustre_msg_buflen(reqbuf, 1);
+
+       RETURN(rc);
+}
+
+/*
+ * last segment must be the gss signature.
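+ * The full layout handled here, inferred from the unpacking below, is
+ * | gss header | msg | user desc? | bulk sec desc? | signature |.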
+ */
+static
+int gss_svc_verify_request(struct ptlrpc_request *req,
+                          struct gss_svc_reqctx *grctx,
+                          struct gss_wire_ctx *gw,
+                          __u32 *major)
+{
+       struct gss_svc_ctx *gctx = grctx->src_ctx;
+       struct lustre_msg  *msg = req->rq_reqbuf;
+       int              offset = 2;
+       int              swabbed;
+       ENTRY;
+
+       *major = GSS_S_COMPLETE;
+
+       if (msg->lm_bufcount < 2) {
+               CERROR("Too few segments (%u) in request\n", msg->lm_bufcount);
+               RETURN(-EINVAL);
+       }
+
+       if (gw->gw_svc == SPTLRPC_SVC_NULL)
+               goto verified;
+
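+       /* replay detection is two-phase: phase 0 is a cheap pre-check made
+        * before the signature is verified; phase 1+ re-checks (and,
+        * presumably, records) the sequence once the signature is known
+        * good; reverse ctxs skip phase 1+. */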
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+               CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc);
+       if (*major != GSS_S_COMPLETE) {
+               CERROR("failed to verify request: %x\n", *major);
+               RETURN(-EACCES);
+       }
+
+       if (gctx->gsc_reverse == 0 &&
+           gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+               CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+verified:
+       swabbed = ptlrpc_req_need_swab(req);
+
+       /* user descriptor */
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (msg->lm_bufcount < (offset + 1)) {
+                       CERROR("no user desc included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+               offset++;
+       }
+
+       /* check bulk_sec_desc data */
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               if (msg->lm_bufcount < (offset + 1)) {
+                       CERROR("missing bulk sec descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (bulk_sec_desc_unpack(msg, offset, swabbed))
+                       RETURN(-EINVAL);
+
+               req->rq_pack_bulk = 1;
+               grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+               grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(msg, 1, 0);
+       req->rq_reqlen = msg->lm_buflens[1];
+       RETURN(0);
+}
+
+static
+int gss_svc_unseal_request(struct ptlrpc_request *req,
+                          struct gss_svc_reqctx *grctx,
+                          struct gss_wire_ctx *gw,
+                          __u32 *major)
+{
+       struct gss_svc_ctx *gctx = grctx->src_ctx;
+       struct lustre_msg  *msg = req->rq_reqbuf;
+       int              swabbed, msglen, offset = 1;
+       ENTRY;
+
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) {
+               CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       *major = gss_unseal_msg(gctx->gsc_mechctx, msg,
+                              &msglen, req->rq_reqdata_len);
+       if (*major != GSS_S_COMPLETE) {
+               CERROR("failed to unwrap request: %x\n", *major);
+               RETURN(-EACCES);
+       }
+
+       if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) {
+               CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq);
+               *major = GSS_S_DUPLICATE_TOKEN;
+               RETURN(-EACCES);
+       }
+
+       swabbed = __lustre_unpack_msg(msg, msglen);
+       if (swabbed < 0) {
+               CERROR("Failed to unpack after decryption\n");
+               RETURN(-EINVAL);
+       }
+       req->rq_reqdata_len = msglen;
+
+       if (msg->lm_bufcount < 1) {
+               CERROR("Invalid buffer: is empty\n");
+               RETURN(-EINVAL);
+       }
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (msg->lm_bufcount < offset + 1) {
+                       CERROR("no user descriptor included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(-EINVAL);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, offset, 0);
+               offset++;
+       }
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) {
+               if (msg->lm_bufcount < offset + 1) {
+                       CERROR("no bulk checksum included\n");
+                       RETURN(-EINVAL);
+               }
+
+               if (bulk_sec_desc_unpack(msg, offset, swabbed))
+                       RETURN(-EINVAL);
+
+               req->rq_pack_bulk = 1;
+               grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0);
+               grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset);
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0);
+       req->rq_reqlen = req->rq_reqbuf->lm_buflens[0];
+       RETURN(0);
+}
+
+static
+int gss_svc_handle_data(struct ptlrpc_request *req,
+                       struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       __u32             major = 0;
+       int                 rc = 0;
+       ENTRY;
+
+       grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+       if (!grctx->src_ctx) {
+               major = GSS_S_NO_CONTEXT;
+               goto error;
+       }
+
+       switch (gw->gw_svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               rc = gss_svc_verify_request(req, grctx, gw, &major);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               rc = gss_svc_unseal_request(req, grctx, gw, &major);
+               break;
+       default:
+               CERROR("unsupported gss service %d\n", gw->gw_svc);
+               rc = -EINVAL;
+       }
+
+       if (rc == 0)
+               RETURN(SECSVC_OK);
+
+       CERROR("svc %u failed: major 0x%08x: req xid "LPU64" ctx %p idx "
+              LPX64"(%u->%s)\n", gw->gw_svc, major, req->rq_xid,
+              grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+              grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+error:
+       /* we only notify the client in case of NO_CONTEXT/BAD_SIG, which
+        * might happen after a server reboot, to allow recovery. */
+       if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) &&
+           gss_pack_err_notify(req, major, 0) == 0)
+               RETURN(SECSVC_COMPLETE);
+
+       RETURN(SECSVC_DROP);
+}
+
+static
+int gss_svc_handle_destroy(struct ptlrpc_request *req,
+                          struct gss_wire_ctx *gw)
+{
+       struct gss_svc_reqctx  *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       __u32              major;
+       ENTRY;
+
+       req->rq_ctx_fini = 1;
+       req->rq_no_reply = 1;
+
+       grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw);
+       if (!grctx->src_ctx) {
+               CDEBUG(D_SEC, "invalid gss context handle for destroy.\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gw->gw_svc != SPTLRPC_SVC_INTG) {
+               CERROR("svc %u is not supported in destroy.\n", gw->gw_svc);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (gss_svc_verify_request(req, grctx, gw, &major))
+               RETURN(SECSVC_DROP);
+
+       CWARN("destroy svc ctx %p idx "LPX64" (%u->%s)\n",
+             grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle),
+             grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid));
+
+       gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+       if (gw->gw_flags & LUSTRE_GSS_PACK_USER) {
+               if (req->rq_reqbuf->lm_bufcount < 4) {
+                       CERROR("missing user descriptor, ignore it\n");
+                       RETURN(SECSVC_OK);
+               }
+               if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2,
+                                            ptlrpc_req_need_swab(req))) {
+                       CERROR("Mal-formed user descriptor, ignore it\n");
+                       RETURN(SECSVC_OK);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0);
+       }
+
+       RETURN(SECSVC_OK);
+}
+
+int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req)
+{
+       struct gss_header      *ghdr;
+       struct gss_svc_reqctx  *grctx;
+       struct gss_wire_ctx    *gw;
+       int                  swabbed, rc;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_svc_ctx == NULL);
+
+       if (req->rq_reqbuf->lm_bufcount < 2) {
+               CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed);
+       if (ghdr == NULL) {
+               CERROR("can't decode gss header\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       /* sanity checks */
+       if (ghdr->gh_version != PTLRPC_GSS_VERSION) {
+               CERROR("gss version %u, expect %u\n", ghdr->gh_version,
+                      PTLRPC_GSS_VERSION);
+               RETURN(SECSVC_DROP);
+       }
+
+       req->rq_sp_from = ghdr->gh_sp;
+
+       /* alloc grctx data */
+       OBD_ALLOC_PTR(grctx);
+       if (!grctx)
+               RETURN(SECSVC_DROP);
+
+       grctx->src_base.sc_policy = sptlrpc_policy_get(policy);
+       atomic_set(&grctx->src_base.sc_refcount, 1);
+       req->rq_svc_ctx = &grctx->src_base;
+       gw = &grctx->src_wirectx;
+
+       /* save wire context */
+       gw->gw_flags = ghdr->gh_flags;
+       gw->gw_proc = ghdr->gh_proc;
+       gw->gw_seq = ghdr->gh_seq;
+       gw->gw_svc = ghdr->gh_svc;
+       rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle);
+
+       /* keep the original wire header, which is subject to checksum verification */
+       if (swabbed)
+               gss_header_swabber(ghdr);
+
+       switch (ghdr->gh_proc) {
+       case PTLRPC_GSS_PROC_INIT:
+       case PTLRPC_GSS_PROC_CONTINUE_INIT:
+               rc = gss_svc_handle_init(req, gw);
+               break;
+       case PTLRPC_GSS_PROC_DATA:
+               rc = gss_svc_handle_data(req, gw);
+               break;
+       case PTLRPC_GSS_PROC_DESTROY:
+               rc = gss_svc_handle_destroy(req, gw);
+               break;
+       default:
+               CERROR("unknown proc %u\n", gw->gw_proc);
+               rc = SECSVC_DROP;
+               break;
+       }
+
+       switch (rc) {
+       case SECSVC_OK:
+               LASSERT(grctx->src_ctx);
+
+               req->rq_auth_gss = 1;
+               req->rq_auth_remote = grctx->src_ctx->gsc_remote;
+               req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds;
+               req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss;
+               req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root;
+               req->rq_auth_uid = grctx->src_ctx->gsc_uid;
+               req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid;
+               break;
+       case SECSVC_COMPLETE:
+               break;
+       case SECSVC_DROP:
+               gss_svc_reqctx_free(grctx);
+               req->rq_svc_ctx = NULL;
+               break;
+       }
+
+       RETURN(rc);
+}
+
+void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct gss_svc_reqctx  *grctx;
+       ENTRY;
+
+       if (svc_ctx == NULL) {
+               EXIT;
+               return;
+       }
+
+       grctx = gss_svc_ctx2reqctx(svc_ctx);
+
+       CWARN("gss svc invalidate ctx %p(%u)\n",
+             grctx->src_ctx, grctx->src_ctx->gsc_uid);
+       gss_svc_upcall_destroy_ctx(grctx->src_ctx);
+
+       EXIT;
+}
+
+static inline
+int gss_svc_payload(struct gss_svc_reqctx *grctx, int early,
+                   int msgsize, int privacy)
+{
+       /* an early reply should be treated normally, but it actually shares
+        * the same ctx with the original request, so in this case we
+        * ignore the special ctx's special flags */
+       if (early == 0 && gss_svc_reqctx_is_special(grctx))
+               return grctx->src_reserve_len;
+
+       return gss_mech_payload(NULL, msgsize, privacy);
+}
+
+static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx,
+                               struct sptlrpc_flavor *flvr,
+                               int read)
+{
+       int     payload = sizeof(struct ptlrpc_bulk_sec_desc);
+
+       if (read) {
+               switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+               case SPTLRPC_BULK_SVC_NULL:
+                       break;
+               case SPTLRPC_BULK_SVC_INTG:
+                       payload += gss_mech_payload(NULL, 0, 0);
+                       break;
+               case SPTLRPC_BULK_SVC_PRIV:
+                       payload += gss_mech_payload(NULL, 0, 1);
+                       break;
+               case SPTLRPC_BULK_SVC_AUTH:
+               default:
+                       LBUG();
+               }
+       }
+
+       return payload;
+}
+
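+/*
+ * Reply buffer shapes sized below (a sketch inferred from the sizing code,
+ * with "?" marking optional segments):
+ *
+ *   privacy:   inner | msg | bulk sec desc? |, later wrapped by
+ *              gss_svc_seal() into | gss header | cipher text |
+ *   otherwise:       | gss header | msg | bulk sec desc? | signature? |
+ */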
+int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+       struct gss_svc_reqctx       *grctx;
+       struct ptlrpc_reply_state   *rs;
+       int                       early, privacy, svc, bsd_off = 0;
+       __u32                   ibuflens[2], buflens[4];
+       int                       ibufcnt = 0, bufcnt;
+       int                       txtsize, wmsg_size, rs_size;
+       ENTRY;
+
+       LASSERT(msglen % 8 == 0);
+
+       if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) {
+               CERROR("client request bulk sec on non-bulk rpc\n");
+               RETURN(-EPROTO);
+       }
+
+       svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc);
+       early = (req->rq_packed_final == 0);
+
+       grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       if (!early && gss_svc_reqctx_is_special(grctx))
+               privacy = 0;
+       else
+               privacy = (svc == SPTLRPC_SVC_PRIV);
+
+       if (privacy) {
+               /* inner clear buffers */
+               ibufcnt = 1;
+               ibuflens[0] = msglen;
+
+               if (req->rq_pack_bulk) {
+                       LASSERT(grctx->src_reqbsd);
+
+                       bsd_off = ibufcnt;
+                       ibuflens[ibufcnt++] = gss_svc_bulk_payload(
+                                                       grctx->src_ctx,
+                                                       &req->rq_flvr,
+                                                       req->rq_bulk_read);
+               }
+
+               txtsize = lustre_msg_size_v2(ibufcnt, ibuflens);
+               txtsize += GSS_MAX_CIPHER_BLOCK;
+
+               /* wrapper buffer */
+               bufcnt = 2;
+               buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+               buflens[1] = gss_svc_payload(grctx, early, txtsize, 1);
+       } else {
+               bufcnt = 2;
+               buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+               buflens[1] = msglen;
+
+               txtsize = buflens[0];
+               if (svc == SPTLRPC_SVC_INTG)
+                       txtsize += buflens[1];
+
+               if (req->rq_pack_bulk) {
+                       LASSERT(grctx->src_reqbsd);
+
+                       bsd_off = bufcnt;
+                       buflens[bufcnt] = gss_svc_bulk_payload(
+                                                       grctx->src_ctx,
+                                                       &req->rq_flvr,
+                                                       req->rq_bulk_read);
+                       if (svc == SPTLRPC_SVC_INTG)
+                               txtsize += buflens[bufcnt];
+                       bufcnt++;
+               }
+
+               if ((!early && gss_svc_reqctx_is_special(grctx)) ||
+                   svc != SPTLRPC_SVC_NULL)
+                       buflens[bufcnt++] = gss_svc_payload(grctx, early,
+                                                           txtsize, 0);
+       }
+
+       wmsg_size = lustre_msg_size_v2(bufcnt, buflens);
+
+       rs_size = sizeof(*rs) + wmsg_size;
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = wmsg_size;
+
+       /* initialize the buffer */
+       if (privacy) {
+               lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL);
+               rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen);
+       } else {
+               lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL);
+               rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+               rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0);
+       }
+
+       if (bsd_off) {
+               grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0);
+               grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf,
+                                                          bsd_off);
+       }
+
+       gss_svc_reqctx_addref(grctx);
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+
+       LASSERT(rs->rs_msg);
+       req->rq_reply_state = rs;
+       RETURN(0);
+}
+
+static int gss_svc_seal(struct ptlrpc_request *req,
+                       struct ptlrpc_reply_state *rs,
+                       struct gss_svc_reqctx *grctx)
+{
+       struct gss_svc_ctx      *gctx = grctx->src_ctx;
+       rawobj_t                 hdrobj, msgobj, token;
+       struct gss_header       *ghdr;
+       __u8                *token_buf;
+       int                   token_buflen;
+       __u32               buflens[2], major;
+       int                   msglen, rc;
+       ENTRY;
+
+       /* get clear data length. note the embedded lustre_msg might
+        * have been shrunk */
+       if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0))
+               msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1);
+       else
+               msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount,
+                                           rs->rs_repbuf->lm_buflens);
+
+       /* temporarily use tail of buffer to hold gss header data */
+       LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len);
+       ghdr = (struct gss_header *) ((char *) rs->rs_repbuf +
+                               rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE);
+       ghdr->gh_version = PTLRPC_GSS_VERSION;
+       ghdr->gh_sp = LUSTRE_SP_ANY;
+       ghdr->gh_flags = 0;
+       ghdr->gh_proc = PTLRPC_GSS_PROC_DATA;
+       ghdr->gh_seq = grctx->src_wirectx.gw_seq;
+       ghdr->gh_svc = SPTLRPC_SVC_PRIV;
+       ghdr->gh_handle.len = 0;
+       if (req->rq_pack_bulk)
+               ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK;
+
+       /* allocate temporary cipher buffer */
+       token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1);
+       OBD_ALLOC_LARGE(token_buf, token_buflen);
+       if (token_buf == NULL)
+               RETURN(-ENOMEM);
+
+       hdrobj.len = PTLRPC_GSS_HEADER_SIZE;
+       hdrobj.data = (__u8 *) ghdr;
+       msgobj.len = msglen;
+       msgobj.data = (__u8 *) rs->rs_repbuf;
+       token.len = token_buflen;
+       token.data = token_buf;
+
+       major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj,
+                         rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token);
+       if (major != GSS_S_COMPLETE) {
+               CERROR("wrap message error: %08x\n", major);
+               GOTO(out_free, rc = -EPERM);
+       }
+       LASSERT(token.len <= token_buflen);
+
+       /* we are about to overwrite data at rs->rs_repbuf; nullify pointers
+        * to it to catch further illegal usage. */
+       if (req->rq_pack_bulk) {
+               grctx->src_repbsd = NULL;
+               grctx->src_repbsd_size = 0;
+       }
+
+       /* now fill the actual wire data
+        * - gss header
+        * - gss token
+        */
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = token.len;
+
+       rs->rs_repdata_len = lustre_msg_size_v2(2, buflens);
+       LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len);
+
+       lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL);
+       rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr,
+              PTLRPC_GSS_HEADER_SIZE);
+       memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len);
+
+       /* reply offset */
+       if (req->rq_packed_final &&
+           (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))
+               req->rq_reply_off = gss_at_reply_off_priv;
+       else
+               req->rq_reply_off = 0;
+
+       /* to catch upper layer's further access */
+       rs->rs_msg = NULL;
+       req->rq_repmsg = NULL;
+       req->rq_replen = 0;
+
+       rc = 0;
+out_free:
+       OBD_FREE_LARGE(token_buf, token_buflen);
+       RETURN(rc);
+}
+
+int gss_svc_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct gss_svc_reqctx     *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx);
+       struct gss_wire_ctx       *gw = &grctx->src_wirectx;
+       int                     early, rc;
+       ENTRY;
+
+       early = (req->rq_packed_final == 0);
+
+       if (!early && gss_svc_reqctx_is_special(grctx)) {
+               LASSERT(rs->rs_repdata_len != 0);
+
+               req->rq_reply_off = gss_at_reply_off_integ;
+               RETURN(0);
+       }
+
+       /* early reply could happen in many cases */
+       if (!early &&
+           gw->gw_proc != PTLRPC_GSS_PROC_DATA &&
+           gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) {
+               CERROR("proc %d not support\n", gw->gw_proc);
+               RETURN(-EINVAL);
+       }
+
+       LASSERT(grctx->src_ctx);
+
+       switch (gw->gw_svc) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               rc = gss_svc_sign(req, rs, grctx, gw->gw_svc);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               rc = gss_svc_seal(req, rs, grctx);
+               break;
+       default:
+               CERROR("Unknown service %d\n", gw->gw_svc);
+               GOTO(out, rc = -EINVAL);
+       }
+
+out:
+       RETURN(rc);
+}
+
+void gss_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+       struct gss_svc_reqctx *grctx;
+
+       LASSERT(rs->rs_svc_ctx);
+       grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base);
+
+       gss_svc_reqctx_decref(grctx);
+       rs->rs_svc_ctx = NULL;
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx)
+{
+       LASSERT(atomic_read(&ctx->sc_refcount) == 0);
+       gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx));
+}
+
+int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx,
+                        struct ptlrpc_svc_ctx *svc_ctx)
+{
+       struct gss_cli_ctx     *cli_gctx = ctx2gctx(cli_ctx);
+       struct gss_svc_ctx     *svc_gctx = gss_svc_ctx2gssctx(svc_ctx);
+       struct gss_ctx   *mechctx = NULL;
+
+       LASSERT(cli_gctx);
+       LASSERT(svc_gctx && svc_gctx->gsc_mechctx);
+
+       cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA;
+       cli_gctx->gc_win = GSS_SEQ_WIN;
+
+       /* The problem is that the reverse ctx might get lost in some recovery
+        * situations, and the same svc_ctx will be used to re-create it.
+        * If a callback was sent out before that, a new reverse ctx starting
+        * with sequence 0 would cause future callback rpcs to be treated as
+        * replays.
+        *
+        * Each reverse root ctx records its latest sequence number on its
+        * buddy svcctx before being destroyed, so here we continue to use it.
+        */
+       atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq);
+
+       if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) {
+               CERROR("failed to dup svc handle\n");
+               goto err_out;
+       }
+
+       if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) !=
+           GSS_S_COMPLETE) {
+               CERROR("failed to copy mech context\n");
+               goto err_svc_handle;
+       }
+
+       if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) {
+               CERROR("failed to dup reverse handle\n");
+               goto err_ctx;
+       }
+
+       cli_gctx->gc_mechctx = mechctx;
+       gss_cli_ctx_uptodate(cli_gctx);
+
+       return 0;
+
+err_ctx:
+       lgss_delete_sec_context(&mechctx);
+err_svc_handle:
+       rawobj_free(&cli_gctx->gc_svc_handle);
+err_out:
+       return -ENOMEM;
+}
+
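+/*
+ * Precompute the reply offsets used with adaptive-timeout early replies;
+ * gss_svc_sign() and gss_svc_seal() above pass these to the client via
+ * rq_reply_off, presumably so it can locate the final reply behind the
+ * early-reply buffers.
+ */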
+static void gss_init_at_reply_offset(void)
+{
+       __u32 buflens[3];
+       int clearsize;
+
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = lustre_msg_early_size();
+       buflens[2] = gss_cli_payload(NULL, buflens[1], 0);
+       gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens);
+
+       buflens[0] = lustre_msg_early_size();
+       clearsize = lustre_msg_size_v2(1, buflens);
+       buflens[0] = PTLRPC_GSS_HEADER_SIZE;
+       buflens[1] = gss_cli_payload(NULL, clearsize, 0);
+       buflens[2] = gss_cli_payload(NULL, clearsize, 1);
+       gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens);
+}
+
+int __init sptlrpc_gss_init(void)
+{
+       int rc;
+
+       rc = gss_init_lproc();
+       if (rc)
+               return rc;
+
+       rc = gss_init_cli_upcall();
+       if (rc)
+               goto out_lproc;
+
+       rc = gss_init_svc_upcall();
+       if (rc)
+               goto out_cli_upcall;
+
+       rc = init_kerberos_module();
+       if (rc)
+               goto out_svc_upcall;
+
+       /* register the policy after everything else is initialized, because
+        * it might be in use immediately after the registration. */
+
+       rc = gss_init_keyring();
+       if (rc)
+               goto out_kerberos;
+
+#ifdef HAVE_GSS_PIPEFS
+       rc = gss_init_pipefs();
+       if (rc)
+               goto out_keyring;
+#endif
+
+       gss_init_at_reply_offset();
+
+       return 0;
+
+#ifdef HAVE_GSS_PIPEFS
+out_keyring:
+       gss_exit_keyring();
+#endif
+
+out_kerberos:
+       cleanup_kerberos_module();
+out_svc_upcall:
+       gss_exit_svc_upcall();
+out_cli_upcall:
+       gss_exit_cli_upcall();
+out_lproc:
+       gss_exit_lproc();
+       return rc;
+}
+
+static void __exit sptlrpc_gss_exit(void)
+{
+       gss_exit_keyring();
+#ifdef HAVE_GSS_PIPEFS
+       gss_exit_pipefs();
+#endif
+       cleanup_kerberos_module();
+       gss_exit_svc_upcall();
+       gss_exit_cli_upcall();
+       gss_exit_lproc();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("GSS security policy for Lustre");
+MODULE_LICENSE("GPL");
+
+module_init(sptlrpc_gss_init);
+module_exit(sptlrpc_gss_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
new file mode 100644 (file)
index 0000000..47a3c05
--- /dev/null
@@ -0,0 +1,1613 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/import.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpc_connect_async_args {
+       __u64 pcaa_peer_committed;
+       int pcaa_initial_connect;
+};
+
+/**
+ * Update the current state of import \a imp to the provided \a state value.
+ * Helper function; must be called under imp_lock.
+ */
+static void __import_set_state(struct obd_import *imp,
+                              enum lustre_imp_state state)
+{
+       imp->imp_state = state;
+       imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
+       imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
+               cfs_time_current_sec();
+       imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
+               IMP_STATE_HIST_LEN;
+}
+
+/* A CLOSED import should remain so. */
+#define IMPORT_SET_STATE_NOLOCK(imp, state)                                \
+do {                                                                      \
+       if (imp->imp_state != LUSTRE_IMP_CLOSED) {                           \
+              CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",    \
+                     imp, obd2cli_tgt(imp->imp_obd),                     \
+                     ptlrpc_import_state_name(imp->imp_state),         \
+                     ptlrpc_import_state_name(state));                 \
+              __import_set_state(imp, state);                           \
+       }                                                                     \
+} while (0)
+
+#define IMPORT_SET_STATE(imp, state)                                   \
+do {                                                                   \
+       spin_lock(&imp->imp_lock);                                      \
+       IMPORT_SET_STATE_NOLOCK(imp, state);                            \
+       spin_unlock(&imp->imp_lock);                                    \
+} while (0)
+
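+/*
+ * An illustrative use of the macros above, as in the recovery state
+ * machine later in this file:
+ *
+ *     IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+ *
+ * which takes imp_lock, logs the transition, and records it in
+ * imp_state_hist.
+ */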
+
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+                                   struct ptlrpc_request *request,
+                                   void *data, int rc);
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
+
+/* Only this function is allowed to change the import state when it is
+ * CLOSED. I would rather refcount the import and free it after
+ * disconnection like we do with exports. To do that, the client_obd
+ * will need to save the peer info somewhere other than in the import,
+ * though. */
+int ptlrpc_init_import(struct obd_import *imp)
+{
+       spin_lock(&imp->imp_lock);
+
+       imp->imp_generation++;
+       imp->imp_state = LUSTRE_IMP_NEW;
+
+       spin_unlock(&imp->imp_lock);
+
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_init_import);
+
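+/*
+ * Strip an optional \a prefix and a trailing "_UUID" suffix from \a uuid,
+ * returning the interesting part by reference. An illustrative example:
+ * deuuidify("lustre-OST0000_UUID", NULL, &s, &len) leaves s pointing at
+ * the full string with len = 14, i.e. covering just "lustre-OST0000".
+ */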
+#define UUID_STR "_UUID"
+void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len)
+{
+       *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
+               ? uuid : uuid + strlen(prefix);
+
+       *uuid_len = strlen(*uuid_start);
+
+       if (*uuid_len < strlen(UUID_STR))
+               return;
+
+       if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
+                   UUID_STR, strlen(UUID_STR)))
+               *uuid_len -= strlen(UUID_STR);
+}
+EXPORT_SYMBOL(deuuidify);
+
+/**
+ * Returns true if import was FULL, false if import was already not
+ * connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ *          and caused the disconnection.  In some cases, multiple
+ *          inflight requests can fail to a single target (e.g. OST
+ *          bulk requests) and if one has already caused a reconnection
+ *          (increasing the import->conn_cnt) the older failure should
+ *          not also cause a reconnection.  If zero it forces a reconnect.
+ */
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
+{
+       int rc = 0;
+
+       spin_lock(&imp->imp_lock);
+
+       if (imp->imp_state == LUSTRE_IMP_FULL &&
+           (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
+               char *target_start;
+               int   target_len;
+
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+
+               if (imp->imp_replayable) {
+                       LCONSOLE_WARN("%s: Connection to %.*s (at %s) was "
+                              "lost; in progress operations using this "
+                              "service will wait for recovery to complete\n",
+                              imp->imp_obd->obd_name, target_len, target_start,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid));
+               } else {
+                       LCONSOLE_ERROR_MSG(0x166, "%s: Connection to "
+                              "%.*s (at %s) was lost; in progress "
+                              "operations using this service will fail\n",
+                              imp->imp_obd->obd_name,
+                              target_len, target_start,
+                              libcfs_nid2str(imp->imp_connection->c_peer.nid));
+               }
+               ptlrpc_deactivate_timeouts(imp);
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+               spin_unlock(&imp->imp_lock);
+
+               if (obd_dump_on_timeout)
+                       libcfs_debug_dumplog();
+
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
+               rc = 1;
+       } else {
+               spin_unlock(&imp->imp_lock);
+               CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+                      imp->imp_client->cli_name, imp,
+                      (imp->imp_state == LUSTRE_IMP_FULL &&
+                       imp->imp_conn_cnt > conn_cnt) ?
+                      "reconnected" : "not connected", imp->imp_conn_cnt,
+                      conn_cnt, ptlrpc_import_state_name(imp->imp_state));
+       }
+
+       return rc;
+}
+
+/* Must be called with imp_lock held! */
+static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp)
+{
+       ENTRY;
+       LASSERT(spin_is_locked(&imp->imp_lock));
+
+       CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
+       imp->imp_invalid = 1;
+       imp->imp_generation++;
+       spin_unlock(&imp->imp_lock);
+
+       ptlrpc_abort_inflight(imp);
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
+
+       EXIT;
+}
+
+/*
+ * This acts as a barrier; all existing requests are rejected, and
+ * no new requests will be accepted until the import is valid again.
+ */
+void ptlrpc_deactivate_import(struct obd_import *imp)
+{
+       spin_lock(&imp->imp_lock);
+       ptlrpc_deactivate_and_unlock_import(imp);
+}
+EXPORT_SYMBOL(ptlrpc_deactivate_import);
+
+static unsigned int
+ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now)
+{
+       long dl;
+
+       if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
+             (req->rq_phase == RQ_PHASE_BULK) ||
+             (req->rq_phase == RQ_PHASE_NEW)))
+               return 0;
+
+       if (req->rq_timedout)
+               return 0;
+
+       if (req->rq_phase == RQ_PHASE_NEW)
+               dl = req->rq_sent;
+       else
+               dl = req->rq_deadline;
+
+       if (dl <= now)
+               return 0;
+
+       return dl - now;
+}
+
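+/*
+ * Return the longest remaining deadline, in seconds, among requests still
+ * on the sending list; ptlrpc_invalidate_import() below uses this to size
+ * its wait for inflight rpcs to error out.
+ */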
+static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
+{
+       time_t now = cfs_time_current_sec();
+       struct list_head *tmp, *n;
+       struct ptlrpc_request *req;
+       unsigned int timeout = 0;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_safe(tmp, n, &imp->imp_sending_list) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_list);
+               timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
+       }
+       spin_unlock(&imp->imp_lock);
+       return timeout;
+}
+
+/**
+ * This function will invalidate the import, if necessary, then block
+ * for all the RPC completions, and finally notify the obd to
+ * invalidate its state (i.e. cancel locks, clear pending requests,
+ * etc).
+ */
+void ptlrpc_invalidate_import(struct obd_import *imp)
+{
+       struct list_head *tmp, *n;
+       struct ptlrpc_request *req;
+       struct l_wait_info lwi;
+       unsigned int timeout;
+       int rc;
+
+       atomic_inc(&imp->imp_inval_count);
+
+       if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
+               ptlrpc_deactivate_import(imp);
+
+       LASSERT(imp->imp_invalid);
+
+       /* Wait forever until inflight == 0. We really can't do it any other
+        * way because in some cases we need to wait for a very long reply
+        * unlink. We can't do anything before that because there is really
+        * no guarantee that some rdma transfer is not in progress right now. */
+       do {
+               /* Calculate max timeout for waiting on rpcs to error
+                * out. Use obd_timeout if calculated value is smaller
+                * than it. */
+               if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
+                       timeout = ptlrpc_inflight_timeout(imp);
+                       timeout += timeout / 3;
+
+                       if (timeout == 0)
+                               timeout = obd_timeout;
+               } else {
+                       /* decrease the interval to increase race condition */
+                       timeout = 1;
+               }
+
+               CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n",
+                      timeout);
+
+               /* Wait for all requests to error out and call completion
+                * callbacks. Cap it at obd_timeout -- these should all
+                * have been locally cancelled by ptlrpc_abort_inflight. */
+               lwi = LWI_TIMEOUT_INTERVAL(
+                       cfs_timeout_cap(cfs_time_seconds(timeout)),
+                       (timeout > 1) ? cfs_time_seconds(1) : cfs_time_seconds(1) / 2,
+                       NULL, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 (atomic_read(&imp->imp_inflight) == 0),
+                                 &lwi);
+               if (rc) {
+                       const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
+
+                       CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
+                              cli_tgt, rc,
+                              atomic_read(&imp->imp_inflight));
+
+                       spin_lock(&imp->imp_lock);
+                       if (atomic_read(&imp->imp_inflight) == 0) {
+                               int count = atomic_read(&imp->imp_unregistering);
+
+                               /* We know that "unregistering" rpcs can only
+                                * survive on the sending or delayed lists
+                                * (they may be waiting for a long reply unlink
+                                * on sluggish nets). Let's check this. If
+                                * there are no inflight rpcs and
+                                * unregistering != 0, this is a bug. */
+                               LASSERTF(count == 0, "Some RPCs are still "
+                                        "unregistering: %d\n", count);
+
+                               /* Let's save one loop as soon as inflight has
+                                * dropped to zero. No new inflights possible at
+                                * this point. */
+                               rc = 0;
+                       } else {
+                               list_for_each_safe(tmp, n,
+                                                      &imp->imp_sending_list) {
+                                       req = list_entry(tmp,
+                                                            struct ptlrpc_request,
+                                                            rq_list);
+                                       DEBUG_REQ(D_ERROR, req,
+                                                 "still on sending list");
+                               }
+                               list_for_each_safe(tmp, n,
+                                                      &imp->imp_delayed_list) {
+                                       req = list_entry(tmp,
+                                                            struct ptlrpc_request,
+                                                            rq_list);
+                                       DEBUG_REQ(D_ERROR, req,
+                                                 "still on delayed list");
+                               }
+
+                               CERROR("%s: RPCs in \"%s\" phase found (%d). "
+                                      "Network is sluggish? Waiting them "
+                                      "to error out.\n", cli_tgt,
+                                      ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
+                                      atomic_read(&imp->
+                                                      imp_unregistering));
+                       }
+                       spin_unlock(&imp->imp_lock);
+               }
+       } while (rc != 0);
+
+       /*
+        * Additionally check that no new rpcs were added to the import
+        * while it was in the "invalidate" state.
+        */
+       LASSERT(atomic_read(&imp->imp_inflight) == 0);
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
+       sptlrpc_import_flush_all_ctx(imp);
+
+       atomic_dec(&imp->imp_inval_count);
+       wake_up_all(&imp->imp_recovery_waitq);
+}
+EXPORT_SYMBOL(ptlrpc_invalidate_import);
+
+/* unset imp_invalid */
+void ptlrpc_activate_import(struct obd_import *imp)
+{
+       struct obd_device *obd = imp->imp_obd;
+
+       spin_lock(&imp->imp_lock);
+       imp->imp_invalid = 0;
+       ptlrpc_activate_timeouts(imp);
+       spin_unlock(&imp->imp_lock);
+       obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
+}
+EXPORT_SYMBOL(ptlrpc_activate_import);
+
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
+{
+       ENTRY;
+
+       LASSERT(!imp->imp_dlm_fake);
+
+       if (ptlrpc_set_import_discon(imp, conn_cnt)) {
+               if (!imp->imp_replayable) {
+                       CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+                              "auto-deactivating\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid,
+                              imp->imp_obd->obd_name);
+                       ptlrpc_deactivate_import(imp);
+               }
+
+               CDEBUG(D_HA, "%s: waking up pinger\n",
+                      obd2cli_tgt(imp->imp_obd));
+
+               spin_lock(&imp->imp_lock);
+               imp->imp_force_verify = 1;
+               spin_unlock(&imp->imp_lock);
+
+               ptlrpc_pinger_wake_up();
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_fail_import);
+
+int ptlrpc_reconnect_import(struct obd_import *imp)
+{
+       ptlrpc_set_import_discon(imp, 0);
+       /* Force a new connect attempt */
+       ptlrpc_invalidate_import(imp);
+       /* Do a fresh connect next time by zeroing the handle */
+       ptlrpc_disconnect_import(imp, 1);
+       /* Wait for all invalidate calls to finish */
+       if (atomic_read(&imp->imp_inval_count) > 0) {
+               int rc;
+               struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 (atomic_read(&imp->imp_inval_count) == 0),
+                                 &lwi);
+               if (rc)
+                       CERROR("Interrupted, inval=%d\n",
+                              atomic_read(&imp->imp_inval_count));
+       }
+
+       /* Allow reconnect attempts */
+       imp->imp_obd->obd_no_recov = 0;
+       /* Remove 'invalid' flag */
+       ptlrpc_activate_import(imp);
+       /* Attempt a new connect */
+       ptlrpc_recover_import(imp, NULL, 0);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+
+/**
+ * Connection on import \a imp is changed to another one (if more than one is
+ * present). We typically choose the connection that we have not tried to
+ * connect to for the longest time.
+ */
+static int import_select_connection(struct obd_import *imp)
+{
+       struct obd_import_conn *imp_conn = NULL, *conn;
+       struct obd_export *dlmexp;
+       char *target_start;
+       int target_len, tried_all = 1;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+
+       if (list_empty(&imp->imp_conn_list)) {
+               CERROR("%s: no connections available\n",
+                      imp->imp_obd->obd_name);
+               spin_unlock(&imp->imp_lock);
+               RETURN(-EINVAL);
+       }
+
+       list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+               CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n",
+                      imp->imp_obd->obd_name,
+                      libcfs_nid2str(conn->oic_conn->c_peer.nid),
+                      conn->oic_last_attempt);
+
+               /* If we have not tried this connection since
+                  the last successful attempt, go with this one */
+               if ((conn->oic_last_attempt == 0) ||
+                   cfs_time_beforeq_64(conn->oic_last_attempt,
+                                      imp->imp_last_success_conn)) {
+                       imp_conn = conn;
+                       tried_all = 0;
+                       break;
+               }
+
+               /* If all of the connections have already been tried
+                  since the last successful connection, just choose the
+                  least recently used */
+               if (!imp_conn)
+                       imp_conn = conn;
+               else if (cfs_time_before_64(conn->oic_last_attempt,
+                                           imp_conn->oic_last_attempt))
+                       imp_conn = conn;
+       }
+
+       /* if not found, simply choose the current one */
+       if (!imp_conn || imp->imp_force_reconnect) {
+               LASSERT(imp->imp_conn_current);
+               imp_conn = imp->imp_conn_current;
+               tried_all = 0;
+       }
+       LASSERT(imp_conn->oic_conn);
+
+       /* If we've tried everything, and we're back to the beginning of the
+          list, increase our timeout and try again. It will be reset when
+          we do finally connect. (FIXME: really we should wait for all network
+          state associated with the last connection attempt to drain before
+          trying to reconnect on it.) */
+       if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
+               struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+               if (at_get(at) < CONNECTION_SWITCH_MAX) {
+                       at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+                       if (at_get(at) > CONNECTION_SWITCH_MAX)
+                               at_reset(at, CONNECTION_SWITCH_MAX);
+               }
+               LASSERT(imp_conn->oic_last_attempt);
+               CDEBUG(D_HA, "%s: tried all connections, increasing latency "
+                       "to %ds\n", imp->imp_obd->obd_name, at_get(at));
+       }
+
+       imp_conn->oic_last_attempt = cfs_time_current_64();
+
+       /* switch connection; don't mind if it's the same as the current one */
+       if (imp->imp_connection)
+               ptlrpc_connection_put(imp->imp_connection);
+       imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+
+       dlmexp = class_conn2export(&imp->imp_dlm_handle);
+       LASSERT(dlmexp != NULL);
+       if (dlmexp->exp_connection)
+               ptlrpc_connection_put(dlmexp->exp_connection);
+       dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+       class_export_put(dlmexp);
+
+       if (imp->imp_conn_current != imp_conn) {
+               if (imp->imp_conn_current) {
+                       deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                                 &target_start, &target_len);
+
+                       CDEBUG(D_HA, "%s: Connection changing to"
+                              " %.*s (at %s)\n",
+                              imp->imp_obd->obd_name,
+                              target_len, target_start,
+                              libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+               }
+
+               imp->imp_conn_current = imp_conn;
+       }
+
+       CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
+              imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
+              libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(0);
+}
+
+/*
+ * must be called under imp_lock
+ */
+static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
+{
+       struct ptlrpc_request *req;
+       struct list_head *tmp;
+
+       if (list_empty(&imp->imp_replay_list))
+               return 0;
+       tmp = imp->imp_replay_list.next;
+       req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
+       *transno = req->rq_transno;
+       if (req->rq_transno == 0) {
+               DEBUG_REQ(D_ERROR, req, "zero transno in replay");
+               LBUG();
+       }
+
+       return 1;
+}
+
+/**
+ * Attempt to (re)connect import \a imp. This includes all preparations,
+ * initializing the CONNECT RPC request and passing it to ptlrpcd for
+ * actual sending.
+ * Returns 0 on success or an error code.
+ */
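+/*
+ * Rough outline of the steps below: select a connection
+ * (import_select_connection()), adapt the security flavor
+ * (sptlrpc_import_sec_adapt()), pack the CONNECT request with the target
+ * name, client UUID, DLM handle and connect data, then hand it to ptlrpcd;
+ * the reply is processed asynchronously by ptlrpc_connect_interpret().
+ */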
+int ptlrpc_connect_import(struct obd_import *imp)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int initial_connect = 0;
+       int set_transno = 0;
+       __u64 committed_before_reconnect = 0;
+       struct ptlrpc_request *request;
+       char *bufs[] = { NULL,
+                        obd2cli_tgt(imp->imp_obd),
+                        obd->obd_uuid.uuid,
+                        (char *)&imp->imp_dlm_handle,
+                        (char *)&imp->imp_connect_data };
+       struct ptlrpc_connect_async_args *aa;
+       int rc;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("can't connect to a closed import\n");
+               RETURN(-EINVAL);
+       } else if (imp->imp_state == LUSTRE_IMP_FULL) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("already connected\n");
+               RETURN(0);
+       } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
+               spin_unlock(&imp->imp_lock);
+               CERROR("already connecting\n");
+               RETURN(-EALREADY);
+       }
+
+       IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
+
+       imp->imp_conn_cnt++;
+       imp->imp_resend_replay = 0;
+
+       if (!lustre_handle_is_used(&imp->imp_remote_handle))
+               initial_connect = 1;
+       else
+               committed_before_reconnect = imp->imp_peer_committed_transno;
+
+       set_transno = ptlrpc_first_transno(imp,
+                                          &imp->imp_connect_data.ocd_transno);
+       spin_unlock(&imp->imp_lock);
+
+       rc = import_select_connection(imp);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = sptlrpc_import_sec_adapt(imp, NULL, 0);
+       if (rc)
+               GOTO(out, rc);
+
+       /* Reset connect flags to the originally requested flags, so that if
+        * the server is updated on-the-fly we will get the new features. */
+       imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
+       /* Reset ocd_version each time so the server knows the exact version */
+       imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
+       imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+       imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+       rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
+                          &obd->obd_uuid, &imp->imp_connect_data, NULL);
+       if (rc)
+               GOTO(out, rc);
+
+       request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
+       if (request == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
+                                     imp->imp_connect_op, bufs, NULL);
+       if (rc) {
+               ptlrpc_request_free(request);
+               GOTO(out, rc);
+       }
+
+       /* Report the rpc service time to the server so that it knows how long
+        * to wait for clients to join recovery */
+       lustre_msg_set_service_time(request->rq_reqmsg,
+                                   at_timeout2est(request->rq_timeout));
+
+       /* The amount of time we give the server to process the connect req.
+        * import_select_connection will increase the net latency on
+        * repeated reconnect attempts to cover slow networks.
+        * We override/ignore the server rpc completion estimate here,
+        * which may be large if this is a reconnect attempt */
+       request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+       lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
+       lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
+
+       request->rq_no_resend = request->rq_no_delay = 1;
+       request->rq_send_state = LUSTRE_IMP_CONNECTING;
+       /* Allow a slightly larger reply for future growth compatibility */
+       req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
+                            sizeof(struct obd_connect_data)+16*sizeof(__u64));
+       ptlrpc_request_set_replen(request);
+       request->rq_interpret_reply = ptlrpc_connect_interpret;
+
+       CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
+       aa = ptlrpc_req_async_args(request);
+       memset(aa, 0, sizeof(*aa));
+
+       aa->pcaa_peer_committed = committed_before_reconnect;
+       aa->pcaa_initial_connect = initial_connect;
+
+       if (aa->pcaa_initial_connect) {
+               spin_lock(&imp->imp_lock);
+               imp->imp_replayable = 1;
+               spin_unlock(&imp->imp_lock);
+               lustre_msg_add_op_flags(request->rq_reqmsg,
+                                       MSG_CONNECT_INITIAL);
+       }
+
+       if (set_transno)
+               lustre_msg_add_op_flags(request->rq_reqmsg,
+                                       MSG_CONNECT_TRANSNO);
+
+       DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
+                 request->rq_timeout);
+       ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1);
+       rc = 0;
+out:
+       if (rc != 0) {
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_connect_import);
+
+static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
+{
+       int force_verify;
+
+       spin_lock(&imp->imp_lock);
+       force_verify = imp->imp_force_verify != 0;
+       spin_unlock(&imp->imp_lock);
+
+       if (force_verify)
+               ptlrpc_pinger_wake_up();
+}
+
+static int ptlrpc_busy_reconnect(int rc)
+{
+       return (rc == -EBUSY) || (rc == -EAGAIN);
+}
+
+/**
+ * interpret_reply callback for connect RPCs.
+ * Looks into the returned status of the connect operation and decides
+ * what to do with the import - i.e. enter recovery, promote it to
+ * full state for normal operation, or disconnect it due to an error.
+ */
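+/*
+ * Summary of the outcomes implemented below: an initial connect ends in
+ * REPLAY_LOCKS (server in recovery) or FULL; a reconnect with an unchanged
+ * server handle resumes recovery (REPLAY or RECOVER) unless the import is
+ * invalid; a handle change outside server recovery, or absent
+ * RECONNECT/RECOVERING flags, means the client was evicted (EVICTED).
+ */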
+static int ptlrpc_connect_interpret(const struct lu_env *env,
+                                   struct ptlrpc_request *request,
+                                   void *data, int rc)
+{
+       struct ptlrpc_connect_async_args *aa = data;
+       struct obd_import *imp = request->rq_import;
+       struct client_obd *cli = &imp->imp_obd->u.cli;
+       struct lustre_handle old_hdl;
+       __u64 old_connect_flags;
+       int msg_flags;
+       struct obd_connect_data *ocd;
+       struct obd_export *exp;
+       int ret;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+               imp->imp_connect_tried = 1;
+               spin_unlock(&imp->imp_lock);
+               RETURN(0);
+       }
+
+       if (rc) {
+               /* If this is a reconnect to a busy export, there is no
+                * need to select a new target for connecting. */
+               imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
+               spin_unlock(&imp->imp_lock);
+               ptlrpc_maybe_ping_import_soon(imp);
+               GOTO(out, rc);
+       }
+       spin_unlock(&imp->imp_lock);
+
+       LASSERT(imp->imp_conn_current);
+
+       msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+
+       ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
+                                  RCL_SERVER);
+       /* the server-replied obd_connect_data is always bigger */
+       ocd = req_capsule_server_sized_get(&request->rq_pill,
+                                          &RMF_CONNECT_DATA, ret);
+
+       if (ocd == NULL) {
+               CERROR("%s: no connect data from server\n",
+                      imp->imp_obd->obd_name);
+               rc = -EPROTO;
+               GOTO(out, rc);
+       }
+
+       spin_lock(&imp->imp_lock);
+
+       /* All imports are pingable */
+       imp->imp_pingable = 1;
+       imp->imp_force_reconnect = 0;
+       imp->imp_force_verify = 0;
+
+       imp->imp_connect_data = *ocd;
+
+       CDEBUG(D_HA, "%s: connect to target with instance %u\n",
+              imp->imp_obd->obd_name, ocd->ocd_instance);
+       exp = class_conn2export(&imp->imp_dlm_handle);
+
+       spin_unlock(&imp->imp_lock);
+
+       /* Check that the server granted the subset of flags we asked for. */
+       if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
+           ocd->ocd_connect_flags) {
+               CERROR("%s: server did not grant the requested subset of "
+                      "flags: asked="LPX64" granted="LPX64"\n",
+                      imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
+                      ocd->ocd_connect_flags);
+               GOTO(out, rc = -EPROTO);
+       }
+
+       if (!exp) {
+               /* This could happen if the export is cleaned up during
+                  the connect attempt */
+               CERROR("%s: missing export after connect\n",
+                      imp->imp_obd->obd_name);
+               GOTO(out, rc = -ENODEV);
+       }
+       old_connect_flags = exp_connect_flags(exp);
+       exp->exp_connect_data = *ocd;
+       imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
+       class_export_put(exp);
+
+       obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
+
+       if (aa->pcaa_initial_connect) {
+               spin_lock(&imp->imp_lock);
+               if (msg_flags & MSG_CONNECT_REPLAYABLE) {
+                       imp->imp_replayable = 1;
+                       spin_unlock(&imp->imp_lock);
+                       CDEBUG(D_HA, "connected to replayable target: %s\n",
+                              obd2cli_tgt(imp->imp_obd));
+               } else {
+                       imp->imp_replayable = 0;
+                       spin_unlock(&imp->imp_lock);
+               }
+
+               /* If applicable, adjust imp->imp_msg_magic here
+                * according to the reply flags */
+
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+
+               /* Initial connects are allowed for clients with non-random
+                * uuids when servers are in recovery.  Simply signal the
+                * servers that replay is complete and wait in REPLAY_WAIT. */
+               if (msg_flags & MSG_CONNECT_RECOVERING) {
+                       CDEBUG(D_HA, "connect to %s during recovery\n",
+                              obd2cli_tgt(imp->imp_obd));
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+               } else {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+                       ptlrpc_activate_import(imp);
+               }
+
+               GOTO(finish, rc = 0);
+       }
+
+       /* Determine what recovery state to move the import to. */
+       if (MSG_CONNECT_RECONNECT & msg_flags) {
+               memset(&old_hdl, 0, sizeof(old_hdl));
+               if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
+                           sizeof (old_hdl))) {
+                       LCONSOLE_WARN("Reconnect to %s (at %s) failed due "
+                                     "to bad handle "LPX64"\n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     imp->imp_connection->c_remote_uuid.uuid,
+                                     imp->imp_dlm_handle.cookie);
+                       GOTO(out, rc = -ENOTCONN);
+               }
+
+               if (memcmp(&imp->imp_remote_handle,
+                          lustre_msg_get_handle(request->rq_repmsg),
+                          sizeof(imp->imp_remote_handle))) {
+                       int level = msg_flags & MSG_CONNECT_RECOVERING ?
+                               D_HA : D_WARNING;
+
+                       /* Bug 16611/14775: if the server handle has changed,
+                        * that means some sort of disconnection happened.
+                        * If the server is not in recovery, that also means it
+                        * already erased all of our state because of previous
+                        * eviction. If it is in recovery - we are safe to
+                        * participate since we can reestablish all of our
+                        * state with the server again */
+                       if ((MSG_CONNECT_RECOVERING & msg_flags)) {
+                               CDEBUG(level,"%s@%s changed server handle from "
+                                      LPX64" to "LPX64
+                                      " but is still in recovery\n",
+                                      obd2cli_tgt(imp->imp_obd),
+                                      imp->imp_connection->c_remote_uuid.uuid,
+                                      imp->imp_remote_handle.cookie,
+                                      lustre_msg_get_handle(
+                                      request->rq_repmsg)->cookie);
+                       } else {
+                               LCONSOLE_WARN("Evicted from %s (at %s) "
+                                             "after server handle changed from "
+                                             LPX64" to "LPX64"\n",
+                                             obd2cli_tgt(imp->imp_obd),
+                                             imp->imp_connection->
+                                             c_remote_uuid.uuid,
+                                             imp->imp_remote_handle.cookie,
+                                             lustre_msg_get_handle(
+                                             request->rq_repmsg)->cookie);
+                       }
+
+                       imp->imp_remote_handle =
+                                    *lustre_msg_get_handle(request->rq_repmsg);
+
+                       if (!(MSG_CONNECT_RECOVERING & msg_flags)) {
+                               IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+                               GOTO(finish, rc = 0);
+                       }
+
+               } else {
+                       CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid);
+               }
+
+               if (imp->imp_invalid) {
+                       CDEBUG(D_HA, "%s: reconnected but import is invalid; "
+                              "marking evicted\n", imp->imp_obd->obd_name);
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+               } else if (MSG_CONNECT_RECOVERING & msg_flags) {
+                       CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
+                              imp->imp_obd->obd_name,
+                              obd2cli_tgt(imp->imp_obd));
+
+                       spin_lock(&imp->imp_lock);
+                       imp->imp_resend_replay = 1;
+                       spin_unlock(&imp->imp_lock);
+
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+               } else {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+               }
+       } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
+               LASSERT(imp->imp_replayable);
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+               imp->imp_last_replay_transno = 0;
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
+       } else {
+               DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags"
+                         " not set: %x)", imp->imp_obd->obd_name, msg_flags);
+               imp->imp_remote_handle =
+                               *lustre_msg_get_handle(request->rq_repmsg);
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
+       }
+
+       /* Sanity checks for a reconnected import. */
+       if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
+               CERROR("imp_replayable flag does not match server "
+                      "after reconnect. We should LBUG right here.\n");
+       }
+
+       if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
+           lustre_msg_get_last_committed(request->rq_repmsg) <
+           aa->pcaa_peer_committed) {
+               CERROR("%s went back in time (transno "LPD64
+                      " was previously committed, server now claims "LPD64
+                      ")!  See https://bugzilla.lustre.org/show_bug.cgi?"
+                      "id=9646\n",
+                      obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
+                      lustre_msg_get_last_committed(request->rq_repmsg));
+       }
+
+finish:
+       rc = ptlrpc_import_recovery_state_machine(imp);
+       if (rc != 0) {
+               if (rc == -ENOTCONN) {
+                       CDEBUG(D_HA, "evicted/aborted by %s@%s during "
+                              "recovery; invalidating and reconnecting\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid);
+                       ptlrpc_connect_import(imp);
+                       imp->imp_connect_tried = 1;
+                       RETURN(0);
+               }
+       } else {
+               spin_lock(&imp->imp_lock);
+               list_move(&imp->imp_conn_current->oic_item,
+                         &imp->imp_conn_list);
+               imp->imp_last_success_conn =
+                       imp->imp_conn_current->oic_last_attempt;
+
+               spin_unlock(&imp->imp_lock);
+
+               if (!ocd->ocd_ibits_known &&
+                   ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+                       CERROR("Inodebits aware server returned zero compatible"
+                              " bits?\n");
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                   (ocd->ocd_version > LUSTRE_VERSION_CODE +
+                                       LUSTRE_VERSION_OFFSET_WARN ||
+                    ocd->ocd_version < LUSTRE_VERSION_CODE -
+                                       LUSTRE_VERSION_OFFSET_WARN)) {
+                       /* Sigh, some compilers do not like #ifdef in the middle
+                          of macro arguments */
+                       const char *older = "older. Consider upgrading server "
+                                           "or downgrading client";
+                       const char *newer = "newer than client version. "
+                                           "Consider upgrading client";
+
+                       LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) "
+                                     "is much %s (%s)\n",
+                                     obd2cli_tgt(imp->imp_obd),
+                                     OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                                     OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                                     OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                                     OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                                     ocd->ocd_version > LUSTRE_VERSION_CODE ?
+                                     newer : older, LUSTRE_VERSION_STRING);
+               }
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+               /* Check if server has LU-1252 fix applied to not always swab
+                * the IR MNE entries. Do this only once per connection.  This
+                * fixup is version-limited, because we don't want to carry the
+                * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
+                * need interop with unpatched 2.2 servers.  For newer servers,
+                * the client will do MNE swabbing only as needed.  LU-1644 */
+               if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                            !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
+                            OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
+                            OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
+                            OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
+                            strcmp(imp->imp_obd->obd_type->typ_name,
+                                   LUSTRE_MGC_NAME) == 0))
+                       imp->imp_need_mne_swab = 1;
+               else /* clear if server was upgraded since last connect */
+                       imp->imp_need_mne_swab = 0;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
+                       /* We sent to the server ocd_cksum_types with bits set
+                        * for algorithms we understand. The server masked off
+                        * the checksum types it doesn't support */
+                       if ((ocd->ocd_cksum_types &
+                            cksum_types_supported_client()) == 0) {
+                               LCONSOLE_WARN("The negotiation of the checksum "
+                                             "algorithm to use with server %s "
+                                             "failed (%x/%x), disabling "
+                                             "checksums\n",
+                                             obd2cli_tgt(imp->imp_obd),
+                                             ocd->ocd_cksum_types,
+                                             cksum_types_supported_client());
+                               cli->cl_checksum = 0;
+                               cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+                       } else {
+                               cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
+                       }
+               } else {
+                       /* The server does not support OBD_CONNECT_CKSUM.
+                        * Enforce ADLER for backward compatibility */
+                       cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
+               }
+               cli->cl_cksum_type =
+                       cksum_type_select(cli->cl_supp_cksum_types);
+
+               if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
+                       cli->cl_max_pages_per_rpc =
+                               min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+                                   cli->cl_max_pages_per_rpc);
+               else if (imp->imp_connect_op == MDS_CONNECT ||
+                        imp->imp_connect_op == MGS_CONNECT)
+                       cli->cl_max_pages_per_rpc = 1;
+
+               /* Reset ns_connect_flags only for the initial connect.  It
+                * might be changed while the fs is in use, and if we reset it
+                * on reconnect we lose user settings made before then, such
+                * as disabling lru_resize. */
+               if (old_connect_flags != exp_connect_flags(exp) ||
+                   aa->pcaa_initial_connect) {
+                       CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server "
+                              "flags: "LPX64"\n", imp->imp_obd->obd_name,
+                              ocd->ocd_connect_flags);
+                       imp->imp_obd->obd_namespace->ns_connect_flags =
+                               ocd->ocd_connect_flags;
+                       imp->imp_obd->obd_namespace->ns_orig_connect_flags =
+                               ocd->ocd_connect_flags;
+               }
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
+                   (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+                       /* We need a per-message support flag, because
+                          a. we don't know if the incoming connect reply
+                             supports AT or not (in reply_in_callback)
+                             until we unpack it.
+                          b. a failed-over server means the export and flags
+                             are gone (in ptlrpc_send_reply).
+                          This can only be set when we know AT is supported
+                          at both ends */
+                       imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
+               else
+                       imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
+
+               if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
+                   (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
+                       imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
+               else
+                       imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
+
+               LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
+                       (cli->cl_max_pages_per_rpc > 0));
+       }
+
+out:
+       imp->imp_connect_tried = 1;
+
+       if (rc != 0) {
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
+               if (rc == -EACCES) {
+                       /*
+                        * Give up trying to reconnect:
+                        * EACCES means the client has no permission to connect.
+                        */
+                       imp->imp_obd->obd_no_recov = 1;
+                       ptlrpc_deactivate_import(imp);
+               }
+
+               if (rc == -EPROTO) {
+                       struct obd_connect_data *ocd;
+
+                       /* reply message might not be ready */
+                       if (request->rq_repmsg == NULL)
+                               RETURN(-EPROTO);
+
+                       ocd = req_capsule_server_get(&request->rq_pill,
+                                                    &RMF_CONNECT_DATA);
+                       if (ocd &&
+                           (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+                           (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
+                          /* Actually servers are only supposed to refuse
+                             connection from liblustre clients, so we should
+                             never see this from VFS context */
+                               LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
+                                       "(%d.%d.%d.%d)"
+                                       " refused connection from this client "
+                                       "with an incompatible version (%s).  "
+                                       "Client must be recompiled\n",
+                                       obd2cli_tgt(imp->imp_obd),
+                                       OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
+                                       OBD_OCD_VERSION_MINOR(ocd->ocd_version),
+                                       OBD_OCD_VERSION_PATCH(ocd->ocd_version),
+                                       OBD_OCD_VERSION_FIX(ocd->ocd_version),
+                                       LUSTRE_VERSION_STRING);
+                               ptlrpc_deactivate_import(imp);
+                               IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
+                       }
+                       RETURN(-EPROTO);
+               }
+
+               ptlrpc_maybe_ping_import_soon(imp);
+
+               CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
+       }
+
+       wake_up_all(&imp->imp_recovery_waitq);
+       RETURN(rc);
+}
+
+/**
+ * Interpret callback for "completed replay" RPCs.
+ * \see signal_completed_replay
+ */
+static int completed_replay_interpret(const struct lu_env *env,
+                                     struct ptlrpc_request *req,
+                                     void *data, int rc)
+{
+       ENTRY;
+       atomic_dec(&req->rq_import->imp_replay_inflight);
+       if (req->rq_status == 0 &&
+           !req->rq_import->imp_vbr_failed) {
+               ptlrpc_import_recovery_state_machine(req->rq_import);
+       } else {
+               if (req->rq_import->imp_vbr_failed) {
+                       CDEBUG(D_WARNING,
+                              "%s: version recovery failed, reconnecting\n",
+                              req->rq_import->imp_obd->obd_name);
+               } else {
+                       CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
+                                    "reconnecting\n",
+                              req->rq_import->imp_obd->obd_name,
+                              req->rq_status);
+               }
+               ptlrpc_connect_import(req->rq_import);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Let the server know that we have no requests to replay anymore.
+ * Achieved by just sending a PING request
+ */
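+/*
+ * Note: the PING packed below carries MSG_REQ_REPLAY_DONE and
+ * MSG_LOCK_REPLAY_DONE in its message flags; that is how the server
+ * learns that replay has finished.
+ */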
+static int signal_completed_replay(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
+               RETURN(0);
+
+       LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+       atomic_inc(&imp->imp_replay_inflight);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
+                                       OBD_PING);
+       if (req == NULL) {
+               atomic_dec(&imp->imp_replay_inflight);
+               RETURN(-ENOMEM);
+       }
+
+       ptlrpc_request_set_replen(req);
+       req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
+       lustre_msg_add_flags(req->rq_reqmsg,
+                            MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
+       if (AT_OFF)
+               req->rq_timeout *= 3;
+       req->rq_interpret_reply = completed_replay_interpret;
+
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+       RETURN(0);
+}
+
+/**
+ * In kernel code all import invalidation happens in its own
+ * separate thread, so that whatever application happened to encounter
+ * the problem can still be killed or otherwise continue.
+ */
+static int ptlrpc_invalidate_import_thread(void *data)
+{
+       struct obd_import *imp = data;
+
+       ENTRY;
+
+       unshare_fs_struct();
+
+       CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
+              imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+              imp->imp_connection->c_remote_uuid.uuid);
+
+       ptlrpc_invalidate_import(imp);
+
+       if (obd_dump_on_eviction) {
+               CERROR("dumping log upon eviction\n");
+               libcfs_debug_dumplog();
+       }
+
+       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+       ptlrpc_import_recovery_state_machine(imp);
+
+       class_import_put(imp);
+       RETURN(0);
+}
+
+/**
+ * This is the state machine for client-side recovery on import.
+ *
+ * Typically there are two possible paths. If we come to a server that is
+ * not in recovery, we just enter the IMP_EVICTED state, invalidate our
+ * import state and reconnect from scratch.
+ * If we come to a server that is in recovery, we enter the IMP_REPLAY
+ * import state. We go through our list of requests to replay and send them
+ * to the server one by one.
+ * After sending all requests from the list we change the import state to
+ * IMP_REPLAY_LOCKS and re-request from the server all the locks we believe
+ * we have, and also all the locks we don't yet have, and wait for the
+ * server to grant them.
+ * After that we send a special "replay completed" request and change the
+ * import state to IMP_REPLAY_WAIT.
+ * Upon receiving the reply to that "replay completed" RPC we enter the
+ * IMP_RECOVER state and resend all requests from the sending list.
+ * After that we promote the import to the FULL state, send all delayed
+ * requests, and the import is fully operational.
+ *
+ */
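+/*
+ * Compact sketch of the flow described above:
+ *
+ *   EVICTED --(invalidate thread)--> RECOVER --> FULL
+ *   REPLAY --> REPLAY_LOCKS --> REPLAY_WAIT --> RECOVER --> FULL
+ *
+ * The REPLAY transitions are taken only once the in-flight replay
+ * requests have drained.
+ */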
+int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
+{
+       int rc = 0;
+       int inflight;
+       char *target_start;
+       int target_len;
+
+       ENTRY;
+       if (imp->imp_state == LUSTRE_IMP_EVICTED) {
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+               /* Don't care about MGC eviction */
+               if (strcmp(imp->imp_obd->obd_type->typ_name,
+                          LUSTRE_MGC_NAME) != 0) {
+                       LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted "
+                                          "by %.*s; in-progress operations "
+                                          "using this service will fail.\n",
+                                          imp->imp_obd->obd_name, target_len,
+                                          target_start);
+               }
+               CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      imp->imp_connection->c_remote_uuid.uuid);
+               /* reset vbr_failed flag upon eviction */
+               spin_lock(&imp->imp_lock);
+               imp->imp_vbr_failed = 0;
+               spin_unlock(&imp->imp_lock);
+
+               {
+               task_t *task;
+               /* bug 17802:  XXX client_disconnect_export vs connect request
+                * race. If the client is evicted at this time, we start the
+                * invalidate thread without a reference to the import and the
+                * import can be freed at the same time. */
+               class_import_get(imp);
+               task = kthread_run(ptlrpc_invalidate_import_thread, imp,
+                                    "ll_imp_inval");
+               if (IS_ERR(task)) {
+                       /* Take the error code before reporting it, so the
+                        * CDEBUG/CERROR below prints the real reason. */
+                       rc = PTR_ERR(task);
+                       class_import_put(imp);
+                       CERROR("error starting invalidate thread: %d\n", rc);
+               } else {
+                       rc = 0;
+               }
+               RETURN(rc);
+               }
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY) {
+               CDEBUG(D_HA, "replay requested by %s\n",
+                      obd2cli_tgt(imp->imp_obd));
+               rc = ptlrpc_replay_next(imp, &inflight);
+               if (inflight == 0 &&
+                   atomic_read(&imp->imp_replay_inflight) == 0) {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
+                       rc = ldlm_replay_locks(imp);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+               rc = 0;
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
+               if (atomic_read(&imp->imp_replay_inflight) == 0) {
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
+                       rc = signal_completed_replay(imp);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
+               if (atomic_read(&imp->imp_replay_inflight) == 0)
+                       IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_RECOVER) {
+               CDEBUG(D_HA, "reconnected to %s@%s\n",
+                      obd2cli_tgt(imp->imp_obd),
+                      imp->imp_connection->c_remote_uuid.uuid);
+
+               rc = ptlrpc_resend(imp);
+               if (rc)
+                       GOTO(out, rc);
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
+               ptlrpc_activate_import(imp);
+
+               deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
+                         &target_start, &target_len);
+               LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
+                             imp->imp_obd->obd_name,
+                             target_len, target_start,
+                             libcfs_nid2str(imp->imp_connection->c_peer.nid));
+       }
+
+       if (imp->imp_state == LUSTRE_IMP_FULL) {
+               wake_up_all(&imp->imp_recovery_waitq);
+               ptlrpc_wake_delayed(imp);
+       }
+
+out:
+       RETURN(rc);
+}
+
+int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
+{
+       struct ptlrpc_request *req;
+       int rq_opc, rc = 0;
+       int nowait = imp->imp_obd->obd_force;
+       ENTRY;
+
+       if (nowait)
+               GOTO(set_state, rc);
+
+       switch (imp->imp_connect_op) {
+       case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
+       case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
+       case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
+       default:
+               CERROR("don't know how to disconnect from %s (connect_op %d)\n",
+                      obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
+               RETURN(-EINVAL);
+       }
+
+       if (ptlrpc_import_in_recovery(imp)) {
+               struct l_wait_info lwi;
+               cfs_duration_t timeout;
+
+               if (AT_OFF) {
+                       if (imp->imp_server_timeout)
+                               timeout = cfs_time_seconds(obd_timeout / 2);
+                       else
+                               timeout = cfs_time_seconds(obd_timeout);
+               } else {
+                       int idx = import_at_get_index(imp,
+                               imp->imp_client->cli_request_portal);
+                       timeout = cfs_time_seconds(
+                               at_get(&imp->imp_at.iat_service_estimate[idx]));
+               }
+
+               lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
+                                      back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 !ptlrpc_import_in_recovery(imp), &lwi);
+       }
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_FULL)
+               GOTO(out, 0);
+
+       spin_unlock(&imp->imp_lock);
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
+                                       LUSTRE_OBD_VERSION, rq_opc);
+       if (req) {
+               /* We are disconnecting; do not retry the DISCONNECT rpc if
+                * it fails.  We can get through the above even with the
+                * server down if the client doesn't yet know it is gone. */
+               req->rq_no_resend = 1;
+
+               /* We want client umounts to happen quickly, no matter the
+                  server state... */
+               req->rq_timeout = min_t(int, req->rq_timeout,
+                                       INITIAL_CONNECT_TIMEOUT);
+
+               IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+               req->rq_send_state = LUSTRE_IMP_CONNECTING;
+               ptlrpc_request_set_replen(req);
+               rc = ptlrpc_queue_wait(req);
+               ptlrpc_req_finished(req);
+       }
+
+set_state:
+       spin_lock(&imp->imp_lock);
+out:
+       if (noclose)
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+       else
+               IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+       memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_disconnect_import);
+
+void ptlrpc_cleanup_imp(struct obd_import *imp)
+{
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
+       imp->imp_generation++;
+       spin_unlock(&imp->imp_lock);
+       ptlrpc_abort_inflight(imp);
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_cleanup_imp);
+
+/* Adaptive Timeout utils */
+extern unsigned int at_min, at_max, at_history;
+
+/* Bin into timeslices using AT_BINS bins.
+   This gives us the max over the last binlimit*AT_BINS seconds without
+   storing every value, while still smoothing out the return to normalcy
+   after a slow response.
+   (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
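+/*
+ * Worked example (assumed defaults): with at_history = 600 and AT_BINS = 4,
+ * binlimit is 150s, so at_current tracks the maximum value observed over
+ * roughly the last 600s; each time "now" moves past the current 150s slice,
+ * the bins are shifted and old maxima age out one slice at a time.
+ */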
+int at_measured(struct adaptive_timeout *at, unsigned int val)
+{
+       unsigned int old;
+       time_t now = cfs_time_current_sec();
+       time_t binlimit = max_t(time_t, at_history / AT_BINS, 1);
+
+       /* Assert before the first dereference of "at" */
+       LASSERT(at);
+       old = at->at_current;
+       CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
+              val, at, now - at->at_binstart, at->at_current,
+              at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
+
+       if (val == 0)
+               /* 0's don't count, because we never want our timeout to
+                  drop to 0, and because 0 could mean an error */
+               return 0;
+
+       spin_lock(&at->at_lock);
+
+       if (unlikely(at->at_binstart == 0)) {
+               /* Special case to remove default from history */
+               at->at_current = val;
+               at->at_worst_ever = val;
+               at->at_worst_time = now;
+               at->at_hist[0] = val;
+               at->at_binstart = now;
+       } else if (now - at->at_binstart < binlimit) {
+               /* in bin 0 */
+               at->at_hist[0] = max(val, at->at_hist[0]);
+               at->at_current = max(val, at->at_current);
+       } else {
+               int i, shift;
+               unsigned int maxv = val;
+               /* move bins over */
+               shift = (now - at->at_binstart) / binlimit;
+               LASSERT(shift > 0);
+               for (i = AT_BINS - 1; i >= 0; i--) {
+                       if (i >= shift) {
+                               at->at_hist[i] = at->at_hist[i - shift];
+                               maxv = max(maxv, at->at_hist[i]);
+                       } else {
+                               at->at_hist[i] = 0;
+                       }
+               }
+               at->at_hist[0] = val;
+               at->at_current = maxv;
+               at->at_binstart += shift * binlimit;
+       }
+
+       if (at->at_current > at->at_worst_ever) {
+               at->at_worst_ever = at->at_current;
+               at->at_worst_time = now;
+       }
+
+       if (at->at_flags & AT_FLG_NOHIST)
+               /* Only keep the last reported val; the rest of the
+                  history is kept for proc only */
+               at->at_current = val;
+
+       if (at_max > 0)
+               at->at_current = min(at->at_current, at_max);
+       at->at_current = max(at->at_current, at_min);
+
+       if (at->at_current != old)
+               CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d "
+                      "(val=%u) hist %u %u %u %u\n", at,
+                      old, at->at_current, at->at_current - old, val,
+                      at->at_hist[0], at->at_hist[1], at->at_hist[2],
+                      at->at_hist[3]);
+
+       /* if we changed, report the old value */
+       old = (at->at_current != old) ? old : 0;
+
+       spin_unlock(&at->at_lock);
+       return old;
+}
+
+/* Find the imp_at index for a given portal; assign if space available */
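+/*
+ * The lookup below is a double-checked pattern: a lockless scan serves as
+ * the fast path for portals that are already assigned; the scan is then
+ * repeated under imp_lock before a new slot is actually claimed.
+ */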
+int import_at_get_index(struct obd_import *imp, int portal)
+{
+       struct imp_at *at = &imp->imp_at;
+       int i;
+
+       for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
+               if (at->iat_portal[i] == portal)
+                       return i;
+               if (at->iat_portal[i] == 0)
+                       /* unused */
+                       break;
+       }
+
+       /* Not found in list, add it under a lock */
+       spin_lock(&imp->imp_lock);
+
+       /* Check unused under lock */
+       for (; i < IMP_AT_MAX_PORTALS; i++) {
+               if (at->iat_portal[i] == portal)
+                       goto out;
+               if (at->iat_portal[i] == 0)
+                       /* unused */
+                       break;
+       }
+
+       /* Not enough portals? */
+       LASSERT(i < IMP_AT_MAX_PORTALS);
+
+       at->iat_portal[i] = portal;
+out:
+       spin_unlock(&imp->imp_lock);
+       return i;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c
new file mode 100644 (file)
index 0000000..2f55ce2
--- /dev/null
@@ -0,0 +1,2396 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/layout.c
+ *
+ * Lustre Metadata Target (mdt) request handler
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+/*
+ * This file contains the "capsule/pill" abstraction layered above PTLRPC.
+ *
+ * Every struct ptlrpc_request contains a "pill", which points to a description
+ * of the format that the request conforms to.
+ */
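+/*
+ * A minimal usage sketch (illustrative only; the field chosen below is an
+ * arbitrary example): the client packs a request against one of the RQF_*
+ * formats defined in this file, and the receiving side then accesses
+ * fields through the same pill, e.g.
+ *
+ *     body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+ */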
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/module.h>
+
+/* LUSTRE_VERSION_CODE */
+#include <lustre_ver.h>
+
+#include <obd_support.h>
+/* lustre_swab_mdt_body */
+#include <lustre/lustre_idl.h>
+/* obd2cli_tgt() (required by DEBUG_REQ()) */
+#include <obd.h>
+
+/* __REQ_LAYOUT_USER__ */
+#endif
+/* struct ptlrpc_request, lustre_msg* */
+#include <lustre_req_layout.h>
+#include <lustre_update.h>
+#include <lustre_acl.h>
+#include <lustre_debug.h>
+
+/*
+ * RQFs (see below) refer to two struct req_msg_field arrays describing the
+ * client request and server reply, respectively.
+ */
+/* empty set of fields... for suitable definition of emptiness. */
+static const struct req_msg_field *empty[] = {
+       &RMF_PTLRPC_BODY
+};
+
+static const struct req_msg_field *mgs_target_info_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_TARGET_INFO
+};
+
+static const struct req_msg_field *mgs_set_info[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_SEND_PARAM
+};
+
+static const struct req_msg_field *mgs_config_read_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_CONFIG_BODY
+};
+
+static const struct req_msg_field *mgs_config_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MGS_CONFIG_RES
+};
+
+static const struct req_msg_field *log_cancel_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LOGCOOKIES
+};
+
+static const struct req_msg_field *mdt_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY
+};
+
+static const struct req_msg_field *mdt_body_capa[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *quotactl_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_QUOTACTL
+};
+
+static const struct req_msg_field *quota_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *ldlm_intent_quota_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_DLM_LVB,
+       &RMF_QUOTA_BODY
+};
+
+static const struct req_msg_field *mdt_close_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_EPOCH,
+       &RMF_REC_REINT,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *obd_statfs_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_STATFS
+};
+
+static const struct req_msg_field *seq_query_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SEQ_OPC,
+       &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *seq_query_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SEQ_RANGE
+};
+
+static const struct req_msg_field *fld_query_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FLD_OPC,
+       &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *fld_query_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FLD_MDFLD
+};
+
+static const struct req_msg_field *mds_getattr_name_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT
+};
+
+static const struct req_msg_field *mds_reint_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_reint_create_slave_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_create_sym_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_SYMTGT,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_open_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_reint_open_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_unlink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_link_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_rename_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_SYMTGT,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_last_unlink_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_LOGCOOKIES,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_reint_setattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_MDT_EPOCH,
+       &RMF_EADATA,
+       &RMF_LOGCOOKIES,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *mds_reint_setxattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mdt_swap_layouts[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_SWAP_LAYOUTS,
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *obd_connect_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_TGTUUID,
+       &RMF_CLUUID,
+       &RMF_CONN,
+       &RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_connect_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_CONNECT_DATA
+};
+
+static const struct req_msg_field *obd_set_info_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY,
+       &RMF_SETINFO_VAL
+};
+
+static const struct req_msg_field *ost_grant_shrink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *mds_getinfo_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GETINFO_KEY,
+       &RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GETINFO_VAL,
+};
+
+static const struct req_msg_field *ldlm_enqueue_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ
+};
+
+static const struct req_msg_field *ldlm_enqueue_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP
+};
+
+static const struct req_msg_field *ldlm_enqueue_lvb_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_cp_callback_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_gl_callback_desc_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_DLM_GL_DESC
+};
+
+static const struct req_msg_field *ldlm_gl_callback_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_LVB
+};
+
+static const struct req_msg_field *ldlm_intent_basic_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+};
+
+static const struct req_msg_field *ldlm_intent_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT
+};
+
+static const struct req_msg_field *ldlm_intent_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL
+};
+
+static const struct req_msg_field *ldlm_intent_layout_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_LAYOUT_INTENT,
+       &RMF_EADATA /* for new layout to be set up */
+};
+
+static const struct req_msg_field *ldlm_intent_open_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_MDT_BODY,     /* coincides with mds_getattr_name_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *ldlm_intent_getattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REP,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ldlm_intent_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_create_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_open_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_open_client[] */
+       &RMF_CAPA1,
+       &RMF_CAPA2,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *ldlm_intent_unlink_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_DLM_REQ,
+       &RMF_LDLM_INTENT,
+       &RMF_REC_REINT,    /* coincides with mds_reint_unlink_client[] */
+       &RMF_CAPA1,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *mds_getxattr_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_NAME,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getxattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *mds_getattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_setattr_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDT_MD,
+       &RMF_ACL,
+       &RMF_CAPA1,
+       &RMF_CAPA2
+};
+
+static const struct req_msg_field *mds_update_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_UPDATE,
+};
+
+static const struct req_msg_field *mds_update_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_UPDATE_REPLY,
+};
+
+static const struct req_msg_field *llog_origin_handle_create_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY,
+       &RMF_NAME
+};
+
+static const struct req_msg_field *llogd_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY
+};
+
+static const struct req_msg_field *llog_log_hdr_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOG_LOG_HDR
+};
+
+static const struct req_msg_field *llogd_conn_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_CONN_BODY
+};
+
+static const struct req_msg_field *llog_origin_handle_next_block_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_LLOGD_BODY,
+       &RMF_EADATA
+};
+
+static const struct req_msg_field *obd_idx_read_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *obd_idx_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_IDX_INFO
+};
+
+static const struct req_msg_field *ost_body_only[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_body_capa[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_destroy_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_DLM_REQ,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_brw_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_OBD_IOOBJ,
+       &RMF_NIOBUF_REMOTE,
+       &RMF_CAPA1
+};
+
+static const struct req_msg_field *ost_brw_read_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY
+};
+
+static const struct req_msg_field *ost_brw_write_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OST_BODY,
+       &RMF_RCS
+};
+
+static const struct req_msg_field *ost_get_info_generic_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_GENERIC_DATA,
+};
+
+static const struct req_msg_field *ost_get_info_generic_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_SETINFO_KEY
+};
+
+static const struct req_msg_field *ost_get_last_id_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_OBD_ID
+};
+
+static const struct req_msg_field *ost_get_last_fid_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FID,
+};
+
+static const struct req_msg_field *ost_get_fiemap_client[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FIEMAP_KEY,
+       &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *ost_get_fiemap_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_FIEMAP_VAL
+};
+
+static const struct req_msg_field *mdt_hsm_progress[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_PROGRESS,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_register[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_ARCHIVE,
+};
+
+static const struct req_msg_field *mdt_hsm_ct_unregister[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+};
+
+static const struct req_msg_field *mdt_hsm_action_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_CURRENT_ACTION,
+};
+
+static const struct req_msg_field *mdt_hsm_state_get_server[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_HSM_USER_STATE,
+};
+
+static const struct req_msg_field *mdt_hsm_state_set[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_CAPA1,
+       &RMF_HSM_STATE_SET,
+};
+
+static const struct req_msg_field *mdt_hsm_request[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_MDT_BODY,
+       &RMF_MDS_HSM_REQUEST,
+       &RMF_MDS_HSM_USER_ITEM,
+       &RMF_GENERIC_DATA,
+};
+
+static struct req_format *req_formats[] = {
+       &RQF_OBD_PING,
+       &RQF_OBD_SET_INFO,
+       &RQF_OBD_IDX_READ,
+       &RQF_SEC_CTX,
+       &RQF_MGS_TARGET_REG,
+       &RQF_MGS_SET_INFO,
+       &RQF_MGS_CONFIG_READ,
+       &RQF_SEQ_QUERY,
+       &RQF_FLD_QUERY,
+       &RQF_MDS_CONNECT,
+       &RQF_MDS_DISCONNECT,
+       &RQF_MDS_GET_INFO,
+       &RQF_MDS_GETSTATUS,
+       &RQF_MDS_STATFS,
+       &RQF_MDS_GETATTR,
+       &RQF_MDS_GETATTR_NAME,
+       &RQF_MDS_GETXATTR,
+       &RQF_MDS_SYNC,
+       &RQF_MDS_CLOSE,
+       &RQF_MDS_PIN,
+       &RQF_MDS_UNPIN,
+       &RQF_MDS_READPAGE,
+       &RQF_MDS_WRITEPAGE,
+       &RQF_MDS_IS_SUBDIR,
+       &RQF_MDS_DONE_WRITING,
+       &RQF_MDS_REINT,
+       &RQF_MDS_REINT_CREATE,
+       &RQF_MDS_REINT_CREATE_RMT_ACL,
+       &RQF_MDS_REINT_CREATE_SLAVE,
+       &RQF_MDS_REINT_CREATE_SYM,
+       &RQF_MDS_REINT_OPEN,
+       &RQF_MDS_REINT_UNLINK,
+       &RQF_MDS_REINT_LINK,
+       &RQF_MDS_REINT_RENAME,
+       &RQF_MDS_REINT_SETATTR,
+       &RQF_MDS_REINT_SETXATTR,
+       &RQF_MDS_QUOTACHECK,
+       &RQF_MDS_QUOTACTL,
+       &RQF_MDS_HSM_PROGRESS,
+       &RQF_MDS_HSM_CT_REGISTER,
+       &RQF_MDS_HSM_CT_UNREGISTER,
+       &RQF_MDS_HSM_STATE_GET,
+       &RQF_MDS_HSM_STATE_SET,
+       &RQF_MDS_HSM_ACTION,
+       &RQF_MDS_HSM_REQUEST,
+       &RQF_MDS_SWAP_LAYOUTS,
+       &RQF_UPDATE_OBJ,
+       &RQF_QC_CALLBACK,
+       &RQF_OST_CONNECT,
+       &RQF_OST_DISCONNECT,
+       &RQF_OST_QUOTACHECK,
+       &RQF_OST_QUOTACTL,
+       &RQF_OST_GETATTR,
+       &RQF_OST_SETATTR,
+       &RQF_OST_CREATE,
+       &RQF_OST_PUNCH,
+       &RQF_OST_SYNC,
+       &RQF_OST_DESTROY,
+       &RQF_OST_BRW_READ,
+       &RQF_OST_BRW_WRITE,
+       &RQF_OST_STATFS,
+       &RQF_OST_SET_GRANT_INFO,
+       &RQF_OST_GET_INFO_GENERIC,
+       &RQF_OST_GET_INFO_LAST_ID,
+       &RQF_OST_GET_INFO_LAST_FID,
+       &RQF_OST_SET_INFO_LAST_FID,
+       &RQF_OST_GET_INFO_FIEMAP,
+       &RQF_LDLM_ENQUEUE,
+       &RQF_LDLM_ENQUEUE_LVB,
+       &RQF_LDLM_CONVERT,
+       &RQF_LDLM_CANCEL,
+       &RQF_LDLM_CALLBACK,
+       &RQF_LDLM_CP_CALLBACK,
+       &RQF_LDLM_BL_CALLBACK,
+       &RQF_LDLM_GL_CALLBACK,
+       &RQF_LDLM_GL_DESC_CALLBACK,
+       &RQF_LDLM_INTENT,
+       &RQF_LDLM_INTENT_BASIC,
+       &RQF_LDLM_INTENT_LAYOUT,
+       &RQF_LDLM_INTENT_GETATTR,
+       &RQF_LDLM_INTENT_OPEN,
+       &RQF_LDLM_INTENT_CREATE,
+       &RQF_LDLM_INTENT_UNLINK,
+       &RQF_LDLM_INTENT_QUOTA,
+       &RQF_QUOTA_DQACQ,
+       &RQF_LOG_CANCEL,
+       &RQF_LLOG_ORIGIN_HANDLE_CREATE,
+       &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+       &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+       &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+       &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+       &RQF_LLOG_ORIGIN_CONNECT
+};
+
+struct req_msg_field {
+       const __u32 rmf_flags;
+       const char  *rmf_name;
+       /**
+        * Field length. (-1) means "variable length".  If the
+        * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length,
+        * but the actual size must be a whole multiple of \a rmf_size.
+        */
+       const int   rmf_size;
+       void    (*rmf_swabber)(void *);
+       void    (*rmf_dumper)(void *);
+       int      rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR];
+};
+
+enum rmf_flags {
+       /**
+        * The field is a string, must be NUL-terminated.
+        */
+       RMF_F_STRING = 1 << 0,
+       /**
+        * The field's buffer size need not match the declared \a rmf_size.
+        */
+       RMF_F_NO_SIZE_CHECK = 1 << 1,
+       /**
+        * The field's buffer size must be a whole multiple of the declared \a
+        * rmf_size and the \a rmf_swabber function must work on the declared \a
+        * rmf_size worth of bytes.
+        */
+       RMF_F_STRUCT_ARRAY = 1 << 2
+};
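+
+/*
+ * For example, RMF_NIOBUF_REMOTE below is declared with RMF_F_STRUCT_ARRAY:
+ * its buffer must hold a whole number of struct niobuf_remote entries, and
+ * swabber_dumper_helper() applies lustre_swab_niobuf_remote() to each
+ * element in turn.
+ */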
+
+struct req_capsule;
+
+/*
+ * Request fields.
+ */
+#define DEFINE_MSGF(name, flags, size, swabber, dumper) {       \
+       .rmf_name    = (name),                            \
+       .rmf_flags   = (flags),                          \
+       .rmf_size    = (size),                            \
+       .rmf_swabber = (void (*)(void *))(swabber),           \
+       .rmf_dumper  = (void (*)(void *))(dumper)             \
+}
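+
+/*
+ * DEFINE_MSGF() fills in only the static parts of a field; rmf_offset is
+ * left zeroed here and is computed per-format by req_layout_init().  A
+ * variable-length string field with no swabber or dumper, for instance,
+ * is simply DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL), as used
+ * for RMF_NAME below.
+ */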
+
+struct req_msg_field RMF_GENERIC_DATA =
+       DEFINE_MSGF("generic_data", 0,
+                   -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GENERIC_DATA);
+
+struct req_msg_field RMF_MGS_TARGET_INFO =
+       DEFINE_MSGF("mgs_target_info", 0,
+                   sizeof(struct mgs_target_info),
+                   lustre_swab_mgs_target_info, NULL);
+EXPORT_SYMBOL(RMF_MGS_TARGET_INFO);
+
+struct req_msg_field RMF_MGS_SEND_PARAM =
+       DEFINE_MSGF("mgs_send_param", 0,
+                   sizeof(struct mgs_send_param),
+                   NULL, NULL);
+EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
+
+struct req_msg_field RMF_MGS_CONFIG_BODY =
+       DEFINE_MSGF("mgs_config_read request", 0,
+                   sizeof(struct mgs_config_body),
+                   lustre_swab_mgs_config_body, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY);
+
+struct req_msg_field RMF_MGS_CONFIG_RES =
+       DEFINE_MSGF("mgs_config_read reply ", 0,
+                   sizeof(struct mgs_config_res),
+                   lustre_swab_mgs_config_res, NULL);
+EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
+
+struct req_msg_field RMF_U32 =
+       DEFINE_MSGF("generic u32", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_U32);
+
+struct req_msg_field RMF_SETINFO_VAL =
+       DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_VAL);
+
+struct req_msg_field RMF_GETINFO_KEY =
+       DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+struct req_msg_field RMF_GETINFO_VALLEN =
+       DEFINE_MSGF("getinfo_vallen", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+struct req_msg_field RMF_GETINFO_VAL =
+       DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
+struct req_msg_field RMF_SEQ_OPC =
+       DEFINE_MSGF("seq_query_opc", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_SEQ_OPC);
+
+struct req_msg_field RMF_SEQ_RANGE =
+       DEFINE_MSGF("seq_query_range", 0,
+                   sizeof(struct lu_seq_range),
+                   lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_SEQ_RANGE);
+
+struct req_msg_field RMF_FLD_OPC =
+       DEFINE_MSGF("fld_query_opc", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_FLD_OPC);
+
+struct req_msg_field RMF_FLD_MDFLD =
+       DEFINE_MSGF("fld_query_mdfld", 0,
+                   sizeof(struct lu_seq_range),
+                   lustre_swab_lu_seq_range, NULL);
+EXPORT_SYMBOL(RMF_FLD_MDFLD);
+
+struct req_msg_field RMF_MDT_BODY =
+       DEFINE_MSGF("mdt_body", 0,
+                   sizeof(struct mdt_body), lustre_swab_mdt_body, NULL);
+EXPORT_SYMBOL(RMF_MDT_BODY);
+
+struct req_msg_field RMF_OBD_QUOTACTL =
+       DEFINE_MSGF("obd_quotactl", 0,
+                   sizeof(struct obd_quotactl),
+                   lustre_swab_obd_quotactl, NULL);
+EXPORT_SYMBOL(RMF_OBD_QUOTACTL);
+
+struct req_msg_field RMF_QUOTA_BODY =
+       DEFINE_MSGF("quota_body", 0,
+                   sizeof(struct quota_body), lustre_swab_quota_body, NULL);
+EXPORT_SYMBOL(RMF_QUOTA_BODY);
+
+struct req_msg_field RMF_MDT_EPOCH =
+       DEFINE_MSGF("mdt_ioepoch", 0,
+                   sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL);
+EXPORT_SYMBOL(RMF_MDT_EPOCH);
+
+struct req_msg_field RMF_PTLRPC_BODY =
+       DEFINE_MSGF("ptlrpc_body", 0,
+                   sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL);
+EXPORT_SYMBOL(RMF_PTLRPC_BODY);
+
+struct req_msg_field RMF_OBD_STATFS =
+       DEFINE_MSGF("obd_statfs", 0,
+                   sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL);
+EXPORT_SYMBOL(RMF_OBD_STATFS);
+
+struct req_msg_field RMF_SETINFO_KEY =
+       DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SETINFO_KEY);
+
+struct req_msg_field RMF_NAME =
+       DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_NAME);
+
+struct req_msg_field RMF_SYMTGT =
+       DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_SYMTGT);
+
+struct req_msg_field RMF_TGTUUID =
+       DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+       NULL);
+EXPORT_SYMBOL(RMF_TGTUUID);
+
+struct req_msg_field RMF_CLUUID =
+       DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL,
+       NULL);
+EXPORT_SYMBOL(RMF_CLUUID);
+
+struct req_msg_field RMF_STRING =
+       DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_STRING);
+
+struct req_msg_field RMF_LLOGD_BODY =
+       DEFINE_MSGF("llogd_body", 0,
+                   sizeof(struct llogd_body), lustre_swab_llogd_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_BODY);
+
+struct req_msg_field RMF_LLOG_LOG_HDR =
+       DEFINE_MSGF("llog_log_hdr", 0,
+                   sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL);
+EXPORT_SYMBOL(RMF_LLOG_LOG_HDR);
+
+struct req_msg_field RMF_LLOGD_CONN_BODY =
+       DEFINE_MSGF("llogd_conn_body", 0,
+                   sizeof(struct llogd_conn_body),
+                   lustre_swab_llogd_conn_body, NULL);
+EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY);
+
+/*
+ * connection handle received in MDS_CONNECT request.
+ *
+ * No swabbing needed because struct lustre_handle contains only a 64-bit cookie
+ * that the client does not interpret at all.
+ */
+struct req_msg_field RMF_CONN =
+       DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL);
+EXPORT_SYMBOL(RMF_CONN);
+
+struct req_msg_field RMF_CONNECT_DATA =
+       DEFINE_MSGF("cdata",
+                   RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
+#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 7, 50, 0)
+                   sizeof(struct obd_connect_data),
+#else
+/* For interoperability with 1.8 and 2.0 clients/servers.
+ * The RPC verification code allows larger RPC buffers, but not
+ * smaller buffers.  Until we no longer need to keep compatibility
+ * with older servers/clients we can only check that the buffer
+ * size is at least as large as obd_connect_data_v1.  That is not
+ * in itself harmful, since the chance of just corrupting this
+ * field is low.  See JIRA LU-16 for details. */
+                   sizeof(struct obd_connect_data_v1),
+#endif
+                   lustre_swab_connect, NULL);
+EXPORT_SYMBOL(RMF_CONNECT_DATA);
+
+struct req_msg_field RMF_DLM_REQ =
+       DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
+                   sizeof(struct ldlm_request),
+                   lustre_swab_ldlm_request, NULL);
+EXPORT_SYMBOL(RMF_DLM_REQ);
+
+struct req_msg_field RMF_DLM_REP =
+       DEFINE_MSGF("dlm_rep", 0,
+                   sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL);
+EXPORT_SYMBOL(RMF_DLM_REP);
+
+struct req_msg_field RMF_LDLM_INTENT =
+       DEFINE_MSGF("ldlm_intent", 0,
+                   sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL);
+EXPORT_SYMBOL(RMF_LDLM_INTENT);
+
+struct req_msg_field RMF_DLM_LVB =
+       DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL);
+EXPORT_SYMBOL(RMF_DLM_LVB);
+
+struct req_msg_field RMF_DLM_GL_DESC =
+       DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc),
+                   lustre_swab_gl_desc, NULL);
+EXPORT_SYMBOL(RMF_DLM_GL_DESC);
+
+struct req_msg_field RMF_MDT_MD =
+       DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_MDT_MD);
+
+struct req_msg_field RMF_REC_REINT =
+       DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint),
+                   lustre_swab_mdt_rec_reint, NULL);
+EXPORT_SYMBOL(RMF_REC_REINT);
+
+/* FIXME: this length should be defined as a macro */
+struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1,
+                                                   NULL, NULL);
+EXPORT_SYMBOL(RMF_EADATA);
+
+struct req_msg_field RMF_ACL =
+       DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+                   LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL);
+EXPORT_SYMBOL(RMF_ACL);
+
+/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */
+struct req_msg_field RMF_LOGCOOKIES =
+       DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+                   sizeof(struct llog_cookie), NULL, NULL);
+EXPORT_SYMBOL(RMF_LOGCOOKIES);
+
+struct req_msg_field RMF_CAPA1 =
+       DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+                   lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA1);
+
+struct req_msg_field RMF_CAPA2 =
+       DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa),
+                   lustre_swab_lustre_capa, NULL);
+EXPORT_SYMBOL(RMF_CAPA2);
+
+struct req_msg_field RMF_LAYOUT_INTENT =
+       DEFINE_MSGF("layout_intent", 0,
+                   sizeof(struct layout_intent), lustre_swab_layout_intent,
+                   NULL);
+EXPORT_SYMBOL(RMF_LAYOUT_INTENT);
+
+/*
+ * OST request field.
+ */
+struct req_msg_field RMF_OST_BODY =
+       DEFINE_MSGF("ost_body", 0,
+                   sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body);
+EXPORT_SYMBOL(RMF_OST_BODY);
+
+struct req_msg_field RMF_OBD_IOOBJ =
+       DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo);
+EXPORT_SYMBOL(RMF_OBD_IOOBJ);
+
+struct req_msg_field RMF_NIOBUF_REMOTE =
+       DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct niobuf_remote), lustre_swab_niobuf_remote,
+                   dump_rniobuf);
+EXPORT_SYMBOL(RMF_NIOBUF_REMOTE);
+
+struct req_msg_field RMF_RCS =
+       DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32),
+                   lustre_swab_generic_32s, dump_rcs);
+EXPORT_SYMBOL(RMF_RCS);
+
+struct req_msg_field RMF_OBD_ID =
+       DEFINE_MSGF("obd_id", 0,
+                   sizeof(obd_id), lustre_swab_ost_last_id, NULL);
+EXPORT_SYMBOL(RMF_OBD_ID);
+
+struct req_msg_field RMF_FID =
+       DEFINE_MSGF("fid", 0,
+                   sizeof(struct lu_fid), lustre_swab_lu_fid, NULL);
+EXPORT_SYMBOL(RMF_FID);
+
+struct req_msg_field RMF_OST_ID =
+       DEFINE_MSGF("ost_id", 0,
+                   sizeof(struct ost_id), lustre_swab_ost_id, NULL);
+EXPORT_SYMBOL(RMF_OST_ID);
+
+struct req_msg_field RMF_FIEMAP_KEY =
+       DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key),
+                   lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_KEY);
+
+struct req_msg_field RMF_FIEMAP_VAL =
+       DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL);
+EXPORT_SYMBOL(RMF_FIEMAP_VAL);
+
+struct req_msg_field RMF_IDX_INFO =
+       DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info),
+                   lustre_swab_idx_info, NULL);
+EXPORT_SYMBOL(RMF_IDX_INFO);
+
+struct req_msg_field RMF_HSM_USER_STATE =
+       DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state),
+                   lustre_swab_hsm_user_state, NULL);
+EXPORT_SYMBOL(RMF_HSM_USER_STATE);
+
+struct req_msg_field RMF_HSM_STATE_SET =
+       DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set),
+                   lustre_swab_hsm_state_set, NULL);
+EXPORT_SYMBOL(RMF_HSM_STATE_SET);
+
+struct req_msg_field RMF_MDS_HSM_PROGRESS =
+       DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel),
+                   lustre_swab_hsm_progress_kernel, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS);
+
+struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION =
+       DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action),
+                   lustre_swab_hsm_current_action, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION);
+
+struct req_msg_field RMF_MDS_HSM_USER_ITEM =
+       DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY,
+                   sizeof(struct hsm_user_item), lustre_swab_hsm_user_item,
+                   NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM);
+
+struct req_msg_field RMF_MDS_HSM_ARCHIVE =
+       DEFINE_MSGF("hsm_archive", 0,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE);
+
+struct req_msg_field RMF_MDS_HSM_REQUEST =
+       DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request),
+                   lustre_swab_hsm_request, NULL);
+EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST);
+
+struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1,
+                                             lustre_swab_update_buf, NULL);
+EXPORT_SYMBOL(RMF_UPDATE);
+
+struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1,
+                                               lustre_swab_update_reply_buf,
+                                                   NULL);
+EXPORT_SYMBOL(RMF_UPDATE_REPLY);
+
+struct req_msg_field RMF_SWAP_LAYOUTS =
+       DEFINE_MSGF("swap_layouts", 0, sizeof(struct  mdc_swap_layouts),
+                   lustre_swab_swap_layouts, NULL);
+EXPORT_SYMBOL(RMF_SWAP_LAYOUTS);
+
+/*
+ * Request formats.
+ */
+
+struct req_format {
+       const char *rf_name;
+       int      rf_idx;
+       struct {
+               int                       nr;
+               const struct req_msg_field **d;
+       } rf_fields[RCL_NR];
+};
+
+#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) {    \
+       .rf_name   = name,                                            \
+       .rf_fields = {                                            \
+               [RCL_CLIENT] = {                                        \
+                       .nr = client_nr,                                \
+                       .d  = client                                \
+               },                                                    \
+               [RCL_SERVER] = {                                        \
+                       .nr = server_nr,                                \
+                       .d  = server                                \
+               }                                                      \
+       }                                                              \
+}
+
+#define DEFINE_REQ_FMT0(name, client, server)                            \
+DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
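+
+/*
+ * So, for instance, DEFINE_REQ_FMT0("OBD_PING", empty, empty) below
+ * describes an RPC whose request and reply both carry only the fields in
+ * the empty[] array (just the ptlrpc_body), with the field counts taken
+ * from ARRAY_SIZE() of each array.
+ */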
+
+struct req_format RQF_OBD_PING =
+       DEFINE_REQ_FMT0("OBD_PING", empty, empty);
+EXPORT_SYMBOL(RQF_OBD_PING);
+
+struct req_format RQF_OBD_SET_INFO =
+       DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty);
+EXPORT_SYMBOL(RQF_OBD_SET_INFO);
+
+/* Read index file through the network */
+struct req_format RQF_OBD_IDX_READ =
+       DEFINE_REQ_FMT0("OBD_IDX_READ",
+                       obd_idx_read_client, obd_idx_read_server);
+EXPORT_SYMBOL(RQF_OBD_IDX_READ);
+
+struct req_format RQF_SEC_CTX =
+       DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
+EXPORT_SYMBOL(RQF_SEC_CTX);
+
+struct req_format RQF_MGS_TARGET_REG =
+       DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
+                        mgs_target_info_only);
+EXPORT_SYMBOL(RQF_MGS_TARGET_REG);
+
+struct req_format RQF_MGS_SET_INFO =
+       DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info,
+                        mgs_set_info);
+EXPORT_SYMBOL(RQF_MGS_SET_INFO);
+
+struct req_format RQF_MGS_CONFIG_READ =
+       DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client,
+                        mgs_config_read_server);
+EXPORT_SYMBOL(RQF_MGS_CONFIG_READ);
+
+struct req_format RQF_SEQ_QUERY =
+       DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server);
+EXPORT_SYMBOL(RQF_SEQ_QUERY);
+
+struct req_format RQF_FLD_QUERY =
+       DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server);
+EXPORT_SYMBOL(RQF_FLD_QUERY);
+
+struct req_format RQF_LOG_CANCEL =
+       DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty);
+EXPORT_SYMBOL(RQF_LOG_CANCEL);
+
+struct req_format RQF_MDS_QUOTACHECK =
+       DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_MDS_QUOTACHECK);
+
+struct req_format RQF_OST_QUOTACHECK =
+       DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_OST_QUOTACHECK);
+
+struct req_format RQF_MDS_QUOTACTL =
+       DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_MDS_QUOTACTL);
+
+struct req_format RQF_OST_QUOTACTL =
+       DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only);
+EXPORT_SYMBOL(RQF_OST_QUOTACTL);
+
+struct req_format RQF_QC_CALLBACK =
+       DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty);
+EXPORT_SYMBOL(RQF_QC_CALLBACK);
+
+struct req_format RQF_QUOTA_DQACQ =
+       DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only);
+EXPORT_SYMBOL(RQF_QUOTA_DQACQ);
+
+struct req_format RQF_LDLM_INTENT_QUOTA =
+       DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA",
+                       ldlm_intent_quota_client,
+                       ldlm_intent_quota_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA);
+
+struct req_format RQF_MDS_GETSTATUS =
+       DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_GETSTATUS);
+
+struct req_format RQF_MDS_STATFS =
+       DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_MDS_STATFS);
+
+struct req_format RQF_MDS_SYNC =
+       DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_SYNC);
+
+struct req_format RQF_MDS_GETATTR =
+       DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR);
+
+struct req_format RQF_MDS_GETXATTR =
+       DEFINE_REQ_FMT0("MDS_GETXATTR",
+                       mds_getxattr_client, mds_getxattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETXATTR);
+
+struct req_format RQF_MDS_GETATTR_NAME =
+       DEFINE_REQ_FMT0("MDS_GETATTR_NAME",
+                       mds_getattr_name_client, mds_getattr_server);
+EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME);
+
+struct req_format RQF_MDS_REINT =
+       DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT);
+
+struct req_format RQF_MDS_REINT_CREATE =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE",
+                       mds_reint_create_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE);
+
+struct req_format RQF_MDS_REINT_CREATE_RMT_ACL =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL",
+                       mds_reint_create_rmt_acl_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL);
+
+struct req_format RQF_MDS_REINT_CREATE_SLAVE =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA",
+                       mds_reint_create_slave_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE);
+
+struct req_format RQF_MDS_REINT_CREATE_SYM =
+       DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM",
+                       mds_reint_create_sym_client, mdt_body_capa);
+EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM);
+
+struct req_format RQF_MDS_REINT_OPEN =
+       DEFINE_REQ_FMT0("MDS_REINT_OPEN",
+                       mds_reint_open_client, mds_reint_open_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_OPEN);
+
+struct req_format RQF_MDS_REINT_UNLINK =
+       DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client,
+                       mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK);
+
+struct req_format RQF_MDS_REINT_LINK =
+       DEFINE_REQ_FMT0("MDS_REINT_LINK",
+                       mds_reint_link_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_LINK);
+
+struct req_format RQF_MDS_REINT_RENAME =
+       DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client,
+                       mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_RENAME);
+
+struct req_format RQF_MDS_REINT_SETATTR =
+       DEFINE_REQ_FMT0("MDS_REINT_SETATTR",
+                       mds_reint_setattr_client, mds_setattr_server);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR);
+
+struct req_format RQF_MDS_REINT_SETXATTR =
+       DEFINE_REQ_FMT0("MDS_REINT_SETXATTR",
+                       mds_reint_setxattr_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+
+struct req_format RQF_MDS_CONNECT =
+       DEFINE_REQ_FMT0("MDS_CONNECT",
+                       obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_MDS_CONNECT);
+
+struct req_format RQF_MDS_DISCONNECT =
+       DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
+
+struct req_format RQF_MDS_GET_INFO =
+       DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+                       mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
+struct req_format RQF_UPDATE_OBJ =
+       DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client,
+                       mds_update_server);
+EXPORT_SYMBOL(RQF_UPDATE_OBJ);
+
+struct req_format RQF_LDLM_ENQUEUE =
+       DEFINE_REQ_FMT0("LDLM_ENQUEUE",
+                       ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE);
+
+struct req_format RQF_LDLM_ENQUEUE_LVB =
+       DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB",
+                       ldlm_enqueue_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB);
+
+struct req_format RQF_LDLM_CONVERT =
+       DEFINE_REQ_FMT0("LDLM_CONVERT",
+                       ldlm_enqueue_client, ldlm_enqueue_server);
+EXPORT_SYMBOL(RQF_LDLM_CONVERT);
+
+struct req_format RQF_LDLM_CANCEL =
+       DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CANCEL);
+
+struct req_format RQF_LDLM_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CALLBACK);
+
+struct req_format RQF_LDLM_CP_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK);
+
+struct req_format RQF_LDLM_BL_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty);
+EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client,
+                       ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK);
+
+struct req_format RQF_LDLM_GL_DESC_CALLBACK =
+       DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client,
+                       ldlm_gl_callback_server);
+EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK);
+
+struct req_format RQF_LDLM_INTENT_BASIC =
+       DEFINE_REQ_FMT0("LDLM_INTENT_BASIC",
+                       ldlm_intent_basic_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC);
+
+struct req_format RQF_LDLM_INTENT =
+       DEFINE_REQ_FMT0("LDLM_INTENT",
+                       ldlm_intent_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT);
+
+struct req_format RQF_LDLM_INTENT_LAYOUT =
+       DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ",
+                       ldlm_intent_layout_client, ldlm_enqueue_lvb_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT);
+
+struct req_format RQF_LDLM_INTENT_GETATTR =
+       DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR",
+                       ldlm_intent_getattr_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR);
+
+struct req_format RQF_LDLM_INTENT_OPEN =
+       DEFINE_REQ_FMT0("LDLM_INTENT_OPEN",
+                       ldlm_intent_open_client, ldlm_intent_open_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN);
+
+struct req_format RQF_LDLM_INTENT_CREATE =
+       DEFINE_REQ_FMT0("LDLM_INTENT_CREATE",
+                       ldlm_intent_create_client, ldlm_intent_getattr_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE);
+
+struct req_format RQF_LDLM_INTENT_UNLINK =
+       DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK",
+                       ldlm_intent_unlink_client, ldlm_intent_server);
+EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK);
+
+struct req_format RQF_MDS_CLOSE =
+       DEFINE_REQ_FMT0("MDS_CLOSE",
+                       mdt_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_CLOSE);
+
+struct req_format RQF_MDS_PIN =
+       DEFINE_REQ_FMT0("MDS_PIN",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_PIN);
+
+struct req_format RQF_MDS_UNPIN =
+       DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty);
+EXPORT_SYMBOL(RQF_MDS_UNPIN);
+
+struct req_format RQF_MDS_DONE_WRITING =
+       DEFINE_REQ_FMT0("MDS_DONE_WRITING",
+                       mdt_close_client, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_DONE_WRITING);
+
+struct req_format RQF_MDS_READPAGE =
+       DEFINE_REQ_FMT0("MDS_READPAGE",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_READPAGE);
+
+struct req_format RQF_MDS_HSM_ACTION =
+       DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_ACTION);
+
+struct req_format RQF_MDS_HSM_PROGRESS =
+       DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS);
+
+struct req_format RQF_MDS_HSM_CT_REGISTER =
+       DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER);
+
+struct req_format RQF_MDS_HSM_CT_UNREGISTER =
+       DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER);
+
+struct req_format RQF_MDS_HSM_STATE_GET =
+       DEFINE_REQ_FMT0("MDS_HSM_STATE_GET",
+                       mdt_body_capa, mdt_hsm_state_get_server);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET);
+
+struct req_format RQF_MDS_HSM_STATE_SET =
+       DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET);
+
+struct req_format RQF_MDS_HSM_REQUEST =
+       DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty);
+EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST);
+
+struct req_format RQF_MDS_SWAP_LAYOUTS =
+       DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS",
+                       mdt_swap_layouts, empty);
+EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS);
+
+/* This is for split */
+struct req_format RQF_MDS_WRITEPAGE =
+       DEFINE_REQ_FMT0("MDS_WRITEPAGE",
+                       mdt_body_capa, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_WRITEPAGE);
+
+struct req_format RQF_MDS_IS_SUBDIR =
+       DEFINE_REQ_FMT0("MDS_IS_SUBDIR",
+                       mdt_body_only, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE",
+                       llog_origin_handle_create_client, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY",
+                       llogd_body_only, llogd_body_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK",
+                       llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK",
+                       llogd_body_only, llog_origin_handle_next_block_server);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+
+struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER",
+                       llogd_body_only, llog_log_hdr_only);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER);
+
+struct req_format RQF_LLOG_ORIGIN_CONNECT =
+       DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty);
+EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT);
+
+struct req_format RQF_OST_CONNECT =
+       DEFINE_REQ_FMT0("OST_CONNECT",
+                       obd_connect_client, obd_connect_server);
+EXPORT_SYMBOL(RQF_OST_CONNECT);
+
+struct req_format RQF_OST_DISCONNECT =
+       DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty);
+EXPORT_SYMBOL(RQF_OST_DISCONNECT);
+
+struct req_format RQF_OST_GETATTR =
+       DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_GETATTR);
+
+struct req_format RQF_OST_SETATTR =
+       DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SETATTR);
+
+struct req_format RQF_OST_CREATE =
+       DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_CREATE);
+
+struct req_format RQF_OST_PUNCH =
+       DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_PUNCH);
+
+struct req_format RQF_OST_SYNC =
+       DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SYNC);
+
+struct req_format RQF_OST_DESTROY =
+       DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only);
+EXPORT_SYMBOL(RQF_OST_DESTROY);
+
+struct req_format RQF_OST_BRW_READ =
+       DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server);
+EXPORT_SYMBOL(RQF_OST_BRW_READ);
+
+struct req_format RQF_OST_BRW_WRITE =
+       DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server);
+EXPORT_SYMBOL(RQF_OST_BRW_WRITE);
+
+struct req_format RQF_OST_STATFS =
+       DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server);
+EXPORT_SYMBOL(RQF_OST_STATFS);
+
+struct req_format RQF_OST_SET_GRANT_INFO =
+       DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client,
+                        ost_body_only);
+EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
+
+struct req_format RQF_OST_GET_INFO_GENERIC =
+       DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client,
+                                       ost_get_info_generic_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC);
+
+struct req_format RQF_OST_GET_INFO_LAST_ID =
+       DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client,
+                                               ost_get_last_id_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID);
+
+struct req_format RQF_OST_GET_INFO_LAST_FID =
+       DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client,
+                                                ost_get_last_fid_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID);
+
+struct req_format RQF_OST_SET_INFO_LAST_FID =
+       DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client,
+                                                empty);
+EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID);
+
+struct req_format RQF_OST_GET_INFO_FIEMAP =
+       DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client,
+                                              ost_get_fiemap_server);
+EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP);
+
+#if !defined(__REQ_LAYOUT_USER__)
+
+/* Convenience macro */
+#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)]
+
+/**
+ * Initializes the capsule abstraction by computing and setting the \a rf_idx
+ * field of RQFs and the \a rmf_offset field of RMFs.
+ */
+int req_layout_init(void)
+{
+       int i;
+       int j;
+       int k;
+       struct req_format *rf = NULL;
+
+       for (i = 0; i < ARRAY_SIZE(req_formats); ++i) {
+               rf = req_formats[i];
+               rf->rf_idx = i;
+               for (j = 0; j < RCL_NR; ++j) {
+                       LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR);
+                       for (k = 0; k < rf->rf_fields[j].nr; ++k) {
+                               struct req_msg_field *field;
+
+                               field = (typeof(field))rf->rf_fields[j].d[k];
+                               LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY)
+                                       || field->rmf_size > 0);
+                               LASSERT(field->rmf_offset[i][j] == 0);
+                               /*
+                                * k + 1 to detect unused format/field
+                                * combinations.
+                                */
+                               field->rmf_offset[i][j] = k + 1;
+                       }
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(req_layout_init);
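+
+/*
+ * Note on the encoding above: if a field is the second entry (k == 1) of
+ * a format's field array for some location, its rmf_offset slot for that
+ * format/location stores 2; __req_capsule_offset() later subtracts one to
+ * recover the real buffer index, so a stored 0 means "field unused by
+ * this format".
+ */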
+
+void req_layout_fini(void)
+{
+}
+EXPORT_SYMBOL(req_layout_fini);
+
+/**
+ * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1.
+ *
+ * Actual/expected field sizes are set elsewhere in functions in this file:
+ * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and
+ * req_capsule_msg_size().  The \a rc_area information is used by
+ * ptlrpc_request_set_replen().
+ */
+void req_capsule_init_area(struct req_capsule *pill)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) {
+               pill->rc_area[RCL_CLIENT][i] = -1;
+               pill->rc_area[RCL_SERVER][i] = -1;
+       }
+}
+EXPORT_SYMBOL(req_capsule_init_area);
+
+/**
+ * Initialize a pill.
+ *
+ * The \a location indicates whether the caller is executing on the client side
+ * (RCL_CLIENT) or server side (RCL_SERVER).
+ */
+void req_capsule_init(struct req_capsule *pill,
+                     struct ptlrpc_request *req,
+                     enum req_location location)
+{
+       LASSERT(location == RCL_SERVER || location == RCL_CLIENT);
+
+       /*
+        * Today all capsules are embedded in ptlrpc_request structs,
+        * but just in case that ever isn't the case, we don't reach
+        * into req unless req != NULL and pill is the one embedded in
+        * the req.
+        *
+        * The req->rq_pill_init flag makes it safe to initialize a pill
+        * twice, which might happen in the OST paths as a result of the
+        * high-priority RPC queue getting peeked at before ost_handle()
+        * handles an OST RPC.
+        */
+       if (req != NULL && pill == &req->rq_pill && req->rq_pill_init)
+               return;
+
+       memset(pill, 0, sizeof(*pill));
+       pill->rc_req = req;
+       pill->rc_loc = location;
+       req_capsule_init_area(pill);
+
+       if (req != NULL && pill == &req->rq_pill)
+               req->rq_pill_init = 1;
+}
+EXPORT_SYMBOL(req_capsule_init);
+
+void req_capsule_fini(struct req_capsule *pill)
+{
+}
+EXPORT_SYMBOL(req_capsule_fini);
+
+static int __req_format_is_sane(const struct req_format *fmt)
+{
+       return
+               0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) &&
+               req_formats[fmt->rf_idx] == fmt;
+}
+
+static struct lustre_msg *__req_msg(const struct req_capsule *pill,
+                                   enum req_location loc)
+{
+       struct ptlrpc_request *req;
+
+       req = pill->rc_req;
+       return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg;
+}
+
+/**
+ * Set the format (\a fmt) of a \a pill; format changes are not allowed here
+ * (see req_capsule_extend()).
+ */
+void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt)
+{
+       LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt);
+       LASSERT(__req_format_is_sane(fmt));
+
+       pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_set);
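+
+/*
+ * A typical client-side sequence (illustrative sketch only; real call
+ * sites also allocate and pack the request via the usual ptlrpc helpers):
+ *
+ *     req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);
+ *     body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
+ */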
+
+/**
+ * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in
+ * yet.
+ *
+ * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of
+ * variable-sized fields.  The field sizes come from the declared \a rmf_size
+ * field of a \a pill's \a rc_fmt's RMF's.
+ */
+int req_capsule_filled_sizes(struct req_capsule *pill,
+                          enum req_location loc)
+{
+       const struct req_format *fmt = pill->rc_fmt;
+       int                   i;
+
+       LASSERT(fmt != NULL);
+
+       for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+               if (pill->rc_area[loc][i] == -1) {
+                       pill->rc_area[loc][i] =
+                                           fmt->rf_fields[loc].d[i]->rmf_size;
+                       if (pill->rc_area[loc][i] == -1) {
+                               /*
+                                * Skip the following fields.
+                                *
+                                * If this LASSERT() trips then you're missing a
+                                * call to req_capsule_set_size().
+                                */
+                               LASSERT(loc != RCL_SERVER);
+                               break;
+                       }
+               }
+       }
+       return i;
+}
+EXPORT_SYMBOL(req_capsule_filled_sizes);
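+
+/*
+ * For example, a client format of { ptlrpc_body, rec_reint, eadata } with
+ * eadata's rmf_size == -1 yields a count of 2 here unless the caller has
+ * first set the eadata size; on the server side an unset variable-sized
+ * field trips the LASSERT() above instead.
+ */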
+
+/**
+ * Capsule equivalent of lustre_pack_request() and lustre_pack_reply().
+ *
+ * This function uses the \a pill's \a rc_area as filled in by
+ * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by
+ * this function).
+ */
+int req_capsule_server_pack(struct req_capsule *pill)
+{
+       const struct req_format *fmt;
+       int                   count;
+       int                   rc;
+
+       LASSERT(pill->rc_loc == RCL_SERVER);
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+
+       count = req_capsule_filled_sizes(pill, RCL_SERVER);
+       rc = lustre_pack_reply(pill->rc_req, count,
+                              pill->rc_area[RCL_SERVER], NULL);
+       if (rc != 0) {
+               DEBUG_REQ(D_ERROR, pill->rc_req,
+                      "Cannot pack %d fields in format `%s': ",
+                      count, fmt->rf_name);
+       }
+       return rc;
+}
+EXPORT_SYMBOL(req_capsule_server_pack);
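+
+/*
+ * Sketch of the usual server-side reply construction; md_size stands for
+ * a hypothetical caller-computed length of the variable-sized RMF_MDT_MD
+ * field:
+ *
+ *     req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, md_size);
+ *     rc = req_capsule_server_pack(pill);
+ *     if (rc == 0)
+ *             body = req_capsule_server_get(pill, &RMF_MDT_BODY);
+ */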
+
+/**
+ * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill
+ * corresponding to the given RMF (\a field).
+ */
+static int __req_capsule_offset(const struct req_capsule *pill,
+                               const struct req_msg_field *field,
+                               enum req_location loc)
+{
+       int offset;
+
+       offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+       LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n",
+                           pill->rc_fmt->rf_name,
+                           field->rmf_name, offset, loc);
+       offset--;
+
+       LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR);
+       return offset;
+}
+
+/**
+ * Helper for __req_capsule_get(); swabs value / array of values and/or dumps
+ * them if desired.
+ */
+static void swabber_dumper_helper(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 enum req_location loc, int offset,
+                                 void *value, int len, int dump,
+                                 void (*swabber)(void *))
+{
+       void    *p;
+       int     i;
+       int     n;
+       int     do_swab;
+       int     inout = loc == RCL_CLIENT;
+
+       swabber = swabber ?: field->rmf_swabber;
+
+       if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) &&
+           swabber != NULL && value != NULL)
+               do_swab = 1;
+       else
+               do_swab = 0;
+
+       if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) {
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n",
+                              do_swab ? "unswabbed " : "", field->rmf_name);
+                       field->rmf_dumper(value);
+               }
+               if (!do_swab)
+                       return;
+               swabber(value);
+               ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+               if (dump) {
+                       CDEBUG(D_RPCTRACE, "Dump of swabbed field %s "
+                              "follows\n", field->rmf_name);
+                       field->rmf_dumper(value);
+               }
+
+               return;
+       }
+
+       /*
+        * We're swabbing an array; swabber() swabs a single array element, so
+        * swab every element.
+        */
+       LASSERT((len % field->rmf_size) == 0);
+       for (p = value, i = 0, n = len / field->rmf_size;
+            i < n;
+            i++, p += field->rmf_size) {
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, "
+                              "element %d follows\n",
+                              do_swab ? "unswabbed " : "", field->rmf_name, i);
+                       field->rmf_dumper(p);
+               }
+               if (!do_swab)
+                       continue;
+               swabber(p);
+               if (dump && field->rmf_dumper) {
+                       CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, "
+                              "element %d follows\n", field->rmf_name, i);
+                       field->rmf_dumper(p);
+               }
+       }
+       if (do_swab)
+               ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset);
+}
+
+/**
+ * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill
+ * corresponding to the given RMF (\a field).
+ *
+ * The buffer will be swabbed using the given \a swabber.  If \a swabber == NULL
+ * then the \a rmf_swabber from the RMF will be used.  Soon there will be no
+ * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then
+ * be removed.  Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each
+ * element of the array swabbed.
+ */
+static void *__req_capsule_get(struct req_capsule *pill,
+                              const struct req_msg_field *field,
+                              enum req_location loc,
+                              void (*swabber)(void *),
+                              int dump)
+{
+       const struct req_format *fmt;
+       struct lustre_msg       *msg;
+       void                *value;
+       int                   len;
+       int                   offset;
+
+       void *(*getter)(struct lustre_msg *m, int n, int minlen);
+
+       static const char *rcl_names[RCL_NR] = {
+               [RCL_CLIENT] = "client",
+               [RCL_SERVER] = "server"
+       };
+
+       LASSERT(pill != NULL);
+       LASSERT(pill != LP_POISON);
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+       LASSERT(fmt != LP_POISON);
+       LASSERT(__req_format_is_sane(fmt));
+
+       offset = __req_capsule_offset(pill, field, loc);
+
+       msg = __req_msg(pill, loc);
+       LASSERT(msg != NULL);
+
+       getter = (field->rmf_flags & RMF_F_STRING) ?
+               (typeof(getter))lustre_msg_string : lustre_msg_buf;
+
+       if (field->rmf_flags & RMF_F_STRUCT_ARRAY) {
+               /*
+                * We've already asserted that field->rmf_size > 0 in
+                * req_layout_init().
+                */
+               len = lustre_msg_buflen(msg, offset);
+               if ((len % field->rmf_size) != 0) {
+                       CERROR("%s: array field size mismatch "
+                              "%d modulo %d != 0 (%d)\n",
+                              field->rmf_name, len, field->rmf_size, loc);
+                       return NULL;
+               }
+       } else if (pill->rc_area[loc][offset] != -1) {
+               len = pill->rc_area[loc][offset];
+       } else {
+               len = max(field->rmf_size, 0);
+       }
+       value = getter(msg, offset, len);
+
+       if (value == NULL) {
+               DEBUG_REQ(D_ERROR, pill->rc_req,
+                         "Wrong buffer for field `%s' (%d of %d) "
+                         "in format `%s': %d vs. %d (%s)\n",
+                         field->rmf_name, offset, lustre_msg_bufcount(msg),
+                         fmt->rf_name, lustre_msg_buflen(msg, offset), len,
+                         rcl_names[loc]);
+       } else {
+               swabber_dumper_helper(pill, field, loc, offset, value, len,
+                                     dump, swabber);
+       }
+
+       return value;
+}
+
+/**
+ * Dump a request and/or reply
+ */
+void __req_capsule_dump(struct req_capsule *pill, enum req_location loc)
+{
+       const struct    req_format *fmt;
+       const struct    req_msg_field *field;
+       int          len;
+       int          i;
+
+       fmt = pill->rc_fmt;
+
+       DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n");
+       for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
+               field = FMT_FIELD(fmt, loc, i);
+               if (field->rmf_dumper == NULL) {
+                       /*
+                        * FIXME Add a default hex dumper for fields that don't
+                        * have a specific dumper
+                        */
+                       len = req_capsule_get_size(pill, field, loc);
+                       CDEBUG(D_RPCTRACE, "Field %s has no dumper function;"
+                              "field size is %d\n", field->rmf_name, len);
+               } else {
+                       /* It's the dumping side-effect that we're interested in */
+                       (void) __req_capsule_get(pill, field, loc, NULL, 1);
+               }
+       }
+       CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n");
+}
+
+/**
+ * Dump a request.
+ */
+void req_capsule_client_dump(struct req_capsule *pill)
+{
+       __req_capsule_dump(pill, RCL_CLIENT);
+}
+EXPORT_SYMBOL(req_capsule_client_dump);
+
+/**
+ * Dump a reply.
+ */
+void req_capsule_server_dump(struct req_capsule *pill)
+{
+       __req_capsule_dump(pill, RCL_SERVER);
+}
+EXPORT_SYMBOL(req_capsule_server_dump);
+
+/**
+ * Trivial wrapper around __req_capsule_get() that returns the PTLRPC request
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_client_get(struct req_capsule *pill,
+                            const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_get);
+
+/**
+ * Same as req_capsule_client_get(), but with a \a swabber argument.
+ *
+ * Currently unused; will be removed when req_capsule_server_swab_get() is
+ * unused too.
+ */
+void *req_capsule_client_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber)
+{
+       return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_client_get().
+ *
+ * First the \a pill's request \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_client_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len)
+{
+       req_capsule_set_size(pill, field, RCL_CLIENT, len);
+       return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_client_sized_get);
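+
+/*
+ * Illustrative sketch (not part of the original patch): a client packing a
+ * variable-sized string field sizes the field before fetching its buffer,
+ * as llog_client_open() below does for RMF_NAME:
+ *
+ *	req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+ *			     strlen(name) + 1);
+ *	rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+ *				 LLOG_ORIGIN_HANDLE_CREATE);
+ *	...
+ *	tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+ *					   strlen(name) + 1);
+ *	if (tmp != NULL)
+ *		strcpy(tmp, name);
+ */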
+
+/**
+ * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply
+ * buffer corresponding to the given RMF (\a field) of a \a pill.
+ */
+void *req_capsule_server_get(struct req_capsule *pill,
+                            const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_get);
+
+/**
+ * Same as req_capsule_server_get(), but with a \a swabber argument.
+ *
+ * Ideally all swabbing should be done pursuant to RMF definitions, with no
+ * swabbing done outside this capsule abstraction.
+ */
+void *req_capsule_server_swab_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field,
+                                 void *swabber)
+{
+       return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_swab_get);
+
+/**
+ * Utility that combines req_capsule_set_size() and req_capsule_server_get().
+ *
+ * First the \a pill's reply \a field's size is set (\a rc_area) using
+ * req_capsule_set_size() with the given \a len.  Then the actual buffer is
+ * returned.
+ */
+void *req_capsule_server_sized_get(struct req_capsule *pill,
+                                  const struct req_msg_field *field,
+                                  int len)
+{
+       req_capsule_set_size(pill, field, RCL_SERVER, len);
+       return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_get);
+
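+/**
+ * Same as req_capsule_server_sized_get(), but with a \a swabber argument.
+ */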
+void *req_capsule_server_sized_swab_get(struct req_capsule *pill,
+                                       const struct req_msg_field *field,
+                                       int len, void *swabber)
+{
+       req_capsule_set_size(pill, field, RCL_SERVER, len);
+       return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0);
+}
+EXPORT_SYMBOL(req_capsule_server_sized_swab_get);
+
+/**
+ * Returns the buffer of a \a pill corresponding to the given \a field from the
+ * request (if the caller is executing on the server-side) or reply (if the
+ * caller is executing on the client-side).
+ *
+ * This function is convenient for use in code that could be executed on the
+ * client and server alike.
+ */
+const void *req_capsule_other_get(struct req_capsule *pill,
+                                 const struct req_msg_field *field)
+{
+       return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0);
+}
+EXPORT_SYMBOL(req_capsule_other_get);
+
+/**
+ * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a
+ * field of the given \a pill.
+ *
+ * This function must be used when constructing variable sized fields of a
+ * request or reply.
+ */
+void req_capsule_set_size(struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc, int size)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       if ((size != field->rmf_size) &&
+           (field->rmf_size != -1) &&
+           !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+           (size > 0)) {
+               if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                   (size % field->rmf_size != 0)) {
+                       CERROR("%s: array field size mismatch "
+                              "%d %% %d != 0 (%d)\n",
+                              field->rmf_name, size, field->rmf_size, loc);
+                       LBUG();
+               } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) &&
+                   size < field->rmf_size) {
+                       CERROR("%s: field size mismatch %d != %d (%d)\n",
+                              field->rmf_name, size, field->rmf_size, loc);
+                       LBUG();
+               }
+       }
+
+       pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
+}
+EXPORT_SYMBOL(req_capsule_set_size);
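+
+/*
+ * Illustrative sketch (not part of the original patch): a server handler
+ * sizes a variable reply buffer before packing the reply, as the llog
+ * handlers below do for RMF_EADATA:
+ *
+ *	req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+ *	rc = req_capsule_server_pack(&req->rq_pill);
+ */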
+
+/**
+ * Return the actual PTLRPC buffer length of a request or reply (\a loc)
+ * for the given \a pill's given \a field.
+ *
+ * NB: this function is not the counterpart of req_capsule_set_size(), which
+ * sets the size in pill.rc_area[loc][offset]; this function returns the
+ * message buflen[offset] instead, so perhaps it deserves another name.
+ */
+int req_capsule_get_size(const struct req_capsule *pill,
+                        const struct req_msg_field *field,
+                        enum req_location loc)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       return lustre_msg_buflen(__req_msg(pill, loc),
+                                __req_capsule_offset(pill, field, loc));
+}
+EXPORT_SYMBOL(req_capsule_get_size);
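+
+/*
+ * Illustrative sketch (not part of the original patch): for an
+ * RMF_F_STRUCT_ARRAY field, dividing the buffer length by the element size
+ * yields the element count, as llog_origin_handle_cancel() below does:
+ *
+ *	num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES,
+ *					   RCL_CLIENT) / sizeof(*logcookies);
+ */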
+
+/**
+ * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the
+ * given \a pill's request or reply (\a loc) given the field size recorded in
+ * the \a pill's rc_area.
+ *
+ * See also req_capsule_set_size().
+ */
+int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc)
+{
+       return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic,
+                              pill->rc_fmt->rf_fields[loc].nr,
+                              pill->rc_area[loc]);
+}
+
+/**
+ * While req_capsule_msg_size() computes the size of a PTLRPC request or reply
+ * (\a loc) given a \a pill's \a rc_area, this function computes the size of a
+ * PTLRPC request or reply given only an RQF (\a fmt).
+ *
+ * This function should not be used for formats which contain variable size
+ * fields.
+ */
+int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt,
+                        enum req_location loc)
+{
+       int size, i = 0;
+
+       /*
+        * This function should probably LASSERT() that fmt has no fields with
+        * RMF_F_STRUCT_ARRAY in rmf_flags, since there is no way to know here
+        * how many array elements there will ultimately be.  Instead we assume
+        * that there will be at least one element, which is what the loop
+        * below effectively does.
+        */
+       size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr);
+       if (size < 0)
+               return size;
+
+       for (; i < fmt->rf_fields[loc].nr; ++i)
+               if (fmt->rf_fields[loc].d[i]->rmf_size != -1)
+                       size += cfs_size_round(fmt->rf_fields[loc].d[i]->
+                                              rmf_size);
+       return size;
+}
+
+/**
+ * Changes the format of an RPC.
+ *
+ * The pill must already have been initialized, which means that it already has
+ * a request format.  The new format \a fmt must be an extension of the pill's
+ * old format.  Specifically: the new format must have as many request and reply
+ * fields as the old one, and all fields shared by the old and new format must
+ * be at least as large in the new format.
+ *
+ * The new format's fields may be of different "type" than the old format, but
+ * only for fields that are "opaque" blobs: fields which a) have no
+ * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a
+ * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK.  For example,
+ * OBD_SET_INFO has a key field and an opaque value field that gets interpreted
+ * according to the key field.  When the value, according to the key, contains a
+ * structure (or array thereof) to be swabbed, the format should be changed to
+ * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set
+ * accordingly.
+ */
+void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt)
+{
+       int i;
+       int j;
+
+       const struct req_format *old;
+
+       LASSERT(pill->rc_fmt != NULL);
+       LASSERT(__req_format_is_sane(fmt));
+
+       old = pill->rc_fmt;
+       /*
+        * Sanity checking...
+        */
+       for (i = 0; i < RCL_NR; ++i) {
+               LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr);
+               for (j = 0; j < old->rf_fields[i].nr - 1; ++j) {
+                       const struct req_msg_field *ofield = FMT_FIELD(old, i, j);
+
+                       /* "opaque" fields can be transmogrified */
+                       if (ofield->rmf_swabber == NULL &&
+                           (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 &&
+                           (ofield->rmf_size == -1 ||
+                           ofield->rmf_flags == RMF_F_NO_SIZE_CHECK))
+                               continue;
+                       LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j));
+               }
+               /*
+                * Last field in old format can be shorter than in new.
+                */
+               LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >=
+                       FMT_FIELD(old, i, j)->rmf_size);
+       }
+
+       pill->rc_fmt = fmt;
+}
+EXPORT_SYMBOL(req_capsule_extend);
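+
+/*
+ * Illustrative sketch (not part of the original patch): following the
+ * OBD_SET_INFO example above, once a handler has inspected the key it may
+ * switch to a format whose value field carries a swabber.  Both
+ * key_needs_swabbing() and RQF_OBD_SET_INFO_STRUCT are hypothetical names:
+ *
+ *	if (key_needs_swabbing(key))
+ *		req_capsule_extend(&req->rq_pill, &RQF_OBD_SET_INFO_STRUCT);
+ */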
+
+/**
+ * This function returns a non-zero value if the given \a field is present in
+ * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it
+ * returns 0.
+ */
+int req_capsule_has_field(const struct req_capsule *pill,
+                         const struct req_msg_field *field,
+                         enum req_location loc)
+{
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+
+       return field->rmf_offset[pill->rc_fmt->rf_idx][loc];
+}
+EXPORT_SYMBOL(req_capsule_has_field);
+
+/**
+ * Returns a non-zero value if the given \a field is present in the given \a
+ * pill's PTLRPC request or reply (\a loc), else it returns 0.
+ */
+int req_capsule_field_present(const struct req_capsule *pill,
+                             const struct req_msg_field *field,
+                             enum req_location loc)
+{
+       int offset;
+
+       LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
+       LASSERT(req_capsule_has_field(pill, field, loc));
+
+       offset = __req_capsule_offset(pill, field, loc);
+       return lustre_msg_bufcount(__req_msg(pill, loc)) > offset;
+}
+EXPORT_SYMBOL(req_capsule_field_present);
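+
+/*
+ * Illustrative sketch (not part of the original patch): optional fields are
+ * guarded with this check before being fetched, as llog_origin_handle_open()
+ * below does for RMF_NAME:
+ *
+ *	if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT))
+ *		name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+ */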
+
+/**
+ * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC
+ * request or reply (\a loc).
+ *
+ * This is not the opposite of req_capsule_extend().
+ */
+void req_capsule_shrink(struct req_capsule *pill,
+                       const struct req_msg_field *field,
+                       unsigned int newlen,
+                       enum req_location loc)
+{
+       const struct req_format *fmt;
+       struct lustre_msg       *msg;
+       int                   len;
+       int                   offset;
+
+       fmt = pill->rc_fmt;
+       LASSERT(fmt != NULL);
+       LASSERT(__req_format_is_sane(fmt));
+       LASSERT(req_capsule_has_field(pill, field, loc));
+       LASSERT(req_capsule_field_present(pill, field, loc));
+
+       offset = __req_capsule_offset(pill, field, loc);
+
+       msg = __req_msg(pill, loc);
+       len = lustre_msg_buflen(msg, offset);
+       LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n",
+                               fmt->rf_name, field->rmf_name, len, newlen);
+
+       if (loc == RCL_CLIENT)
+               pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen,
+                                                           1);
+       else
+               pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen,
+                                                           1);
+}
+EXPORT_SYMBOL(req_capsule_shrink);
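+
+/*
+ * Illustrative sketch (not part of the original patch): a server that packed
+ * a maximally-sized reply buffer can trim it to the bytes actually produced
+ * before sending; "used" is a hypothetical byte count:
+ *
+ *	req_capsule_shrink(&req->rq_pill, &RMF_EADATA, used, RCL_SERVER);
+ */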
+
+int req_capsule_server_grow(struct req_capsule *pill,
+                           const struct req_msg_field *field,
+                           unsigned int newlen)
+{
+       struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs;
+       char *from, *to;
+       int offset, len, rc;
+
+       LASSERT(pill->rc_fmt != NULL);
+       LASSERT(__req_format_is_sane(pill->rc_fmt));
+       LASSERT(req_capsule_has_field(pill, field, RCL_SERVER));
+       LASSERT(req_capsule_field_present(pill, field, RCL_SERVER));
+
+       len = req_capsule_get_size(pill, field, RCL_SERVER);
+       offset = __req_capsule_offset(pill, field, RCL_SERVER);
+       if (pill->rc_req->rq_repbuf_len >=
+           lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen)
+               CERROR("In-place repack might be done\n");
+
+       pill->rc_req->rq_reply_state = NULL;
+       req_capsule_set_size(pill, field, RCL_SERVER, newlen);
+       rc = req_capsule_server_pack(pill);
+       if (rc) {
+               /* put old rs back, the caller will decide what to do */
+               pill->rc_req->rq_reply_state = rs;
+               return rc;
+       }
+       nrs = pill->rc_req->rq_reply_state;
+       /* Now we need only buffers, copy first chunk */
+       to = lustre_msg_buf(nrs->rs_msg, 0, 0);
+       from = lustre_msg_buf(rs->rs_msg, 0, 0);
+       len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from;
+       memcpy(to, from, len);
+       /* check if we have tail and copy it too */
+       if (rs->rs_msg->lm_bufcount > offset + 1) {
+               to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0);
+               from = lustre_msg_buf(rs->rs_msg, offset + 1, 0);
+               offset = rs->rs_msg->lm_bufcount - 1;
+               len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) +
+                     cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from;
+               memcpy(to, from, len);
+       }
+       /* drop old reply if everything is fine */
+       if (rs->rs_difficult) {
+               /* copy rs data */
+               int i;
+
+               nrs->rs_difficult = 1;
+               nrs->rs_no_ack = rs->rs_no_ack;
+               for (i = 0; i < rs->rs_nlocks; i++) {
+                       nrs->rs_locks[i] = rs->rs_locks[i];
+                       nrs->rs_modes[i] = rs->rs_modes[i];
+                       nrs->rs_nlocks++;
+               }
+               rs->rs_nlocks = 0;
+               rs->rs_difficult = 0;
+               rs->rs_no_ack = 0;
+       }
+       ptlrpc_rs_decref(rs);
+       return 0;
+}
+EXPORT_SYMBOL(req_capsule_server_grow);
+/* __REQ_LAYOUT_USER__ */
+#endif
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c
new file mode 100644 (file)
index 0000000..367ca8e
--- /dev/null
@@ -0,0 +1,354 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_client.c
+ *
+ * remote api for llog - client side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <linux/list.h>
+
+#define LLOG_CLIENT_ENTRY(ctxt, imp) do {                           \
+       mutex_lock(&ctxt->loc_mutex);                        \
+       if (ctxt->loc_imp) {                                      \
+               imp = class_import_get(ctxt->loc_imp);          \
+       } else {                                                      \
+               CERROR("ctxt->loc_imp == NULL for context idx %d. "    \
+                      "Unable to complete MDS/OSS recovery, "        \
+                      "but I'll try again next time.  Not fatal.\n", \
+                      ctxt->loc_idx);                          \
+               imp = NULL;                                        \
+               mutex_unlock(&ctxt->loc_mutex);            \
+               return (-EINVAL);                                    \
+       }                                                            \
+       mutex_unlock(&ctxt->loc_mutex);                    \
+} while (0)
+
+#define LLOG_CLIENT_EXIT(ctxt, imp) do {                             \
+       mutex_lock(&ctxt->loc_mutex);                        \
+       if (ctxt->loc_imp != imp)                                    \
+               CWARN("loc_imp has changed from %p to %p\n",      \
+                      ctxt->loc_imp, imp);                        \
+       class_import_put(imp);                                  \
+       mutex_unlock(&ctxt->loc_mutex);                    \
+} while (0)
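+
+/*
+ * Illustrative sketch (not part of the original patch): every handler below
+ * brackets its use of the import with this pair, which pins ctxt->loc_imp
+ * across the RPC and warns if it changed meanwhile:
+ *
+ *	LLOG_CLIENT_ENTRY(ctxt, imp);
+ *	... allocate, pack and send the request via imp ...
+ *	LLOG_CLIENT_EXIT(ctxt, imp);
+ */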
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context. */
+static int llog_client_open(const struct lu_env *env,
+                           struct llog_handle *lgh, struct llog_logid *logid,
+                           char *name, enum llog_open_param open_param)
+{
+       struct obd_import     *imp;
+       struct llogd_body     *body;
+       struct llog_ctxt      *ctxt = lgh->lgh_ctxt;
+       struct ptlrpc_request *req = NULL;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(ctxt, imp);
+
+       /* client cannot create llog */
+       LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param);
+       LASSERT(lgh);
+
+       req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE);
+       if (req == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       if (name)
+               req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT,
+                                    strlen(name) + 1);
+
+       rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION,
+                                LLOG_ORIGIN_HANDLE_CREATE);
+       if (rc) {
+               ptlrpc_request_free(req);
+               req = NULL;
+               GOTO(out, rc);
+       }
+       ptlrpc_request_set_replen(req);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (logid)
+               body->lgd_logid = *logid;
+       body->lgd_ctxt_idx = ctxt->loc_idx - 1;
+
+       if (name) {
+               char *tmp;
+               tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME,
+                                                  strlen(name) + 1);
+               LASSERT(tmp);
+               strcpy(tmp, name);
+       }
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       lgh->lgh_id = body->lgd_logid;
+       lgh->lgh_ctxt = ctxt;
+       EXIT;
+out:
+       LLOG_CLIENT_EXIT(ctxt, imp);
+       ptlrpc_req_finished(req);
+       return rc;
+}
+
+static int llog_client_destroy(const struct lu_env *env,
+                              struct llog_handle *loghandle)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_DESTROY);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+       if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+               CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name,
+                      body->lgd_llh_flags);
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       RETURN(rc);
+}
+
+
+static int llog_client_next_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int *cur_idx, int next_idx,
+                                 __u64 *cur_offset, void *buf, int len)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       void              *ptr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_NEXT_BLOCK);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+       body->lgd_index = next_idx;
+       body->lgd_saved_index = *cur_idx;
+       body->lgd_len = len;
+       body->lgd_cur_offset = *cur_offset;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       /* The log records are swabbed as they are processed */
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       if (ptr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       *cur_idx = body->lgd_saved_index;
+       *cur_offset = body->lgd_cur_offset;
+
+       memcpy(buf, ptr, len);
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_prev_block(const struct lu_env *env,
+                                 struct llog_handle *loghandle,
+                                 int prev_idx, void *buf, int len)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       void              *ptr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_PREV_BLOCK);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+       body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+       body->lgd_index = prev_idx;
+       body->lgd_len = len;
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len);
+       ptlrpc_request_set_replen(req);
+
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       if (ptr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       memcpy(buf, ptr, len);
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_read_header(const struct lu_env *env,
+                                  struct llog_handle *handle)
+{
+       struct obd_import     *imp;
+       struct ptlrpc_request *req = NULL;
+       struct llogd_body     *body;
+       struct llog_log_hdr   *hdr;
+       struct llog_rec_hdr   *llh_hdr;
+       int                 rc;
+       ENTRY;
+
+       LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp);
+       req = ptlrpc_request_alloc_pack(imp,
+                                       &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER,
+                                       LUSTRE_LOG_VERSION,
+                                       LLOG_ORIGIN_HANDLE_READ_HEADER);
+       if (req == NULL)
+               GOTO(err_exit, rc = -ENOMEM);
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = handle->lgh_id;
+       body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1;
+       body->lgd_llh_flags = handle->lgh_hdr->llh_flags;
+
+       ptlrpc_request_set_replen(req);
+       rc = ptlrpc_queue_wait(req);
+       if (rc)
+               GOTO(out, rc);
+
+       hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+       if (hdr == NULL)
+               GOTO(out, rc = -EFAULT);
+
+       memcpy(handle->lgh_hdr, hdr, sizeof(*hdr));
+       handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+       /* sanity checks */
+       llh_hdr = &handle->lgh_hdr->llh_hdr;
+       if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+               CERROR("bad log header magic: %#x (expecting %#x)\n",
+                      llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+               rc = -EIO;
+       } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+               CERROR("incorrectly sized log header: %#x "
+                      "(expecting %#x)\n",
+                      llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+               CERROR("you may need to re-run lconf --write_conf.\n");
+               rc = -EIO;
+       }
+       EXIT;
+out:
+       ptlrpc_req_finished(req);
+err_exit:
+       LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp);
+       return rc;
+}
+
+static int llog_client_close(const struct lu_env *env,
+                            struct llog_handle *handle)
+{
+       /* This doesn't call LLOG_ORIGIN_HANDLE_CLOSE because the servers
+        * all close the file at the end of every other LLOG_ RPC. */
+       return 0;
+}
+
+struct llog_operations llog_client_ops = {
+       .lop_next_block         = llog_client_next_block,
+       .lop_prev_block         = llog_client_prev_block,
+       .lop_read_header        = llog_client_read_header,
+       .lop_open               = llog_client_open,
+       .lop_destroy            = llog_client_destroy,
+       .lop_close              = llog_client_close,
+};
+EXPORT_SYMBOL(llog_client_ops);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c
new file mode 100644 (file)
index 0000000..a81f557
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_net.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ *   if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+
+int llog_initiator_connect(struct llog_ctxt *ctxt)
+{
+       struct obd_import *new_imp;
+       ENTRY;
+
+       LASSERT(ctxt);
+       new_imp = ctxt->loc_obd->u.cli.cl_import;
+       LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp,
+                "%p - %p\n", ctxt->loc_imp, new_imp);
+       mutex_lock(&ctxt->loc_mutex);
+       if (ctxt->loc_imp != new_imp) {
+               if (ctxt->loc_imp)
+                       class_import_put(ctxt->loc_imp);
+               ctxt->loc_imp = class_import_get(new_imp);
+       }
+       mutex_unlock(&ctxt->loc_mutex);
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_initiator_connect);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_server.c b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c
new file mode 100644 (file)
index 0000000..bc1fcd8
--- /dev/null
@@ -0,0 +1,466 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/llog_server.c
+ *
+ * remote api for llog - server side
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <lustre_net.h>
+#include <lustre_fsfilt.h>
+
+#if defined(LUSTRE_LOG_SERVER)
+static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh)
+{
+       if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+               return llog_cat_close(env, lgh);
+       else
+               return llog_close(env, lgh);
+}
+
+/* Only open is supported, no new llog can be created remotely */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+       struct obd_export       *exp = req->rq_export;
+       struct obd_device       *obd = exp->exp_obd;
+       struct obd_device       *disk_obd;
+       struct lvfs_run_ctxt     saved;
+       struct llog_handle      *loghandle;
+       struct llogd_body       *body;
+       struct llog_logid       *logid = NULL;
+       struct llog_ctxt        *ctxt;
+       char                    *name = NULL;
+       int                      rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+               logid = &body->lgd_logid;
+
+       if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) {
+               name = req_capsule_client_get(&req->rq_pill, &RMF_NAME);
+               if (name == NULL)
+                       RETURN(-EFAULT);
+               CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name);
+       }
+
+       ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL) {
+               CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n",
+                      obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name);
+               RETURN(-ENODEV);
+       }
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid,
+                      name, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       body->lgd_logid = loghandle->lgh_id;
+
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_open);
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+       struct obd_device       *disk_obd;
+       struct lvfs_run_ctxt     saved;
+       struct llogd_body       *body;
+       struct llog_logid       *logid = NULL;
+       struct llog_ctxt        *ctxt;
+       int                      rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       if (ostid_id(&body->lgd_logid.lgl_oi) > 0)
+               logid = &body->lgd_logid;
+
+       if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN))
+               CERROR("%s: wrong llog flags %x\n",
+                      req->rq_export->exp_obd->obd_name, body->lgd_llh_flags);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       /* erase only if no error and logid is valid */
+       if (rc == 0)
+               rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL);
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(llog_origin_handle_destroy);
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+       struct obd_device   *disk_obd;
+       struct llog_handle  *loghandle;
+       struct llogd_body   *body;
+       struct llogd_body   *repbody;
+       struct lvfs_run_ctxt saved;
+       struct llog_ctxt    *ctxt;
+       __u32           flags;
+       void            *ptr;
+       int               rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                      &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+                            LLOG_CHUNK_SIZE);
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       *repbody = *body;
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       rc = llog_next_block(req->rq_svc_thread->t_env, loghandle,
+                            &repbody->lgd_saved_index, repbody->lgd_index,
+                            &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE);
+       if (rc)
+               GOTO(out_close, rc);
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_next_block);
+
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+       struct llog_handle   *loghandle;
+       struct llogd_body    *body;
+       struct llogd_body    *repbody;
+       struct obd_device    *disk_obd;
+       struct lvfs_run_ctxt  saved;
+       struct llog_ctxt     *ctxt;
+       __u32            flags;
+       void             *ptr;
+       int                rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                        &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER,
+                            LLOG_CHUNK_SIZE);
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       *repbody = *body;
+
+       ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA);
+       rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle,
+                            body->lgd_index, ptr, LLOG_CHUNK_SIZE);
+       if (rc)
+               GOTO(out_close, rc);
+
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_prev_block);
+
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+       struct obd_device    *disk_obd;
+       struct llog_handle   *loghandle;
+       struct llogd_body    *body;
+       struct llog_log_hdr  *hdr;
+       struct lvfs_run_ctxt  saved;
+       struct llog_ctxt     *ctxt;
+       __u32            flags;
+       int                rc;
+
+       ENTRY;
+
+       body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY);
+       if (body == NULL)
+               RETURN(-EFAULT);
+
+       ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle,
+                      &body->lgd_logid, NULL, LLOG_OPEN_EXISTS);
+       if (rc)
+               GOTO(out_pop, rc);
+
+       /*
+        * llog_init_handle() reads the llog header
+        */
+       flags = body->lgd_llh_flags;
+       rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags,
+                             NULL);
+       if (rc)
+               GOTO(out_close, rc);
+       flags = loghandle->lgh_hdr->llh_flags;
+
+       rc = req_capsule_server_pack(&req->rq_pill);
+       if (rc)
+               GOTO(out_close, rc = -ENOMEM);
+
+       hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR);
+       *hdr = *loghandle->lgh_hdr;
+       EXIT;
+out_close:
+       llog_origin_close(req->rq_svc_thread->t_env, loghandle);
+out_pop:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_read_header);
+
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+       ENTRY;
+       /* Nothing to do */
+       RETURN(0);
+}
+EXPORT_SYMBOL(llog_origin_handle_close);
+
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+       int num_cookies, rc = 0, err, i, failed = 0;
+       struct obd_device *disk_obd;
+       struct llog_cookie *logcookies;
+       struct llog_ctxt *ctxt = NULL;
+       struct lvfs_run_ctxt saved;
+       struct llog_handle *cathandle;
+       struct inode *inode;
+       void *handle;
+       ENTRY;
+
+       logcookies = req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES);
+       num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES,
+                                          RCL_CLIENT) / sizeof(*logcookies);
+       if (logcookies == NULL || num_cookies == 0) {
+               DEBUG_REQ(D_HA, req, "No llog cookies sent");
+               RETURN(-EFAULT);
+       }
+
+       ctxt = llog_get_context(req->rq_export->exp_obd,
+                               logcookies->lgc_subsys);
+       if (ctxt == NULL)
+               RETURN(-ENODEV);
+
+       disk_obd = ctxt->loc_exp->exp_obd;
+       push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       for (i = 0; i < num_cookies; i++, logcookies++) {
+               cathandle = ctxt->loc_handle;
+               LASSERT(cathandle != NULL);
+               inode = cathandle->lgh_file->f_dentry->d_inode;
+
+               handle = fsfilt_start_log(disk_obd, inode,
+                                         FSFILT_OP_CANCEL_UNLINK, NULL, 1);
+               if (IS_ERR(handle)) {
+                       CERROR("fsfilt_start_log() failed: %ld\n",
+                              PTR_ERR(handle));
+                       GOTO(pop_ctxt, rc = PTR_ERR(handle));
+               }
+
+               rc = llog_cat_cancel_records(req->rq_svc_thread->t_env,
+                                            cathandle, 1, logcookies);
+
+               /*
+                * Do not raise -ENOENT errors for resent RPCs; this record
+                * might already have been killed.
+                */
+               if (rc == -ENOENT &&
+                   (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) {
+                       /*
+                        * Do not change this message, reply-single.sh test_59b
+                        * expects to find this in log.
+                        */
+                       CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n",
+                              req);
+                       rc = 0;
+               } else if (rc == 0) {
+                       CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n",
+                              num_cookies);
+               }
+
+               err = fsfilt_commit(disk_obd, inode, handle, 0);
+               if (err) {
+                       CERROR("Error committing transaction: %d\n", err);
+                       if (!rc)
+                               rc = err;
+                       failed++;
+                       GOTO(pop_ctxt, rc);
+               } else if (rc)
+                       failed++;
+       }
+       GOTO(pop_ctxt, rc);
+pop_ctxt:
+       pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+       if (rc)
+               CERROR("Cancel %d of %d llog-records failed: %d\n",
+                      failed, num_cookies, rc);
+
+       llog_ctxt_put(ctxt);
+       return rc;
+}
+EXPORT_SYMBOL(llog_origin_handle_cancel);
+
+#else /* !__KERNEL__ */
+int llog_origin_handle_open(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+
+int llog_origin_handle_next_block(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_read_header(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_close(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+int llog_origin_handle_cancel(struct ptlrpc_request *req)
+{
+       LBUG();
+       return 0;
+}
+#endif
diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
new file mode 100644 (file)
index 0000000..3e73254
--- /dev/null
@@ -0,0 +1,1345 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_support.h>
+#include <obd.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <lustre_net.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+
+struct ll_rpc_opcode {
+     __u32       opcode;
+     const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = {
+       { OST_REPLY,    "ost_reply" },
+       { OST_GETATTR,      "ost_getattr" },
+       { OST_SETATTR,      "ost_setattr" },
+       { OST_READ,      "ost_read" },
+       { OST_WRITE,    "ost_write" },
+       { OST_CREATE,       "ost_create" },
+       { OST_DESTROY,      "ost_destroy" },
+       { OST_GET_INFO,     "ost_get_info" },
+       { OST_CONNECT,      "ost_connect" },
+       { OST_DISCONNECT,   "ost_disconnect" },
+       { OST_PUNCH,    "ost_punch" },
+       { OST_OPEN,      "ost_open" },
+       { OST_CLOSE,    "ost_close" },
+       { OST_STATFS,       "ost_statfs" },
+       { 14,           NULL },    /* formerly OST_SAN_READ */
+       { 15,           NULL },    /* formerly OST_SAN_WRITE */
+       { OST_SYNC,      "ost_sync" },
+       { OST_SET_INFO,     "ost_set_info" },
+       { OST_QUOTACHECK,   "ost_quotacheck" },
+       { OST_QUOTACTL,     "ost_quotactl" },
+       { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" },
+       { MDS_GETATTR,      "mds_getattr" },
+       { MDS_GETATTR_NAME, "mds_getattr_lock" },
+       { MDS_CLOSE,    "mds_close" },
+       { MDS_REINT,    "mds_reint" },
+       { MDS_READPAGE,     "mds_readpage" },
+       { MDS_CONNECT,      "mds_connect" },
+       { MDS_DISCONNECT,   "mds_disconnect" },
+       { MDS_GETSTATUS,    "mds_getstatus" },
+       { MDS_STATFS,       "mds_statfs" },
+       { MDS_PIN,        "mds_pin" },
+       { MDS_UNPIN,    "mds_unpin" },
+       { MDS_SYNC,      "mds_sync" },
+       { MDS_DONE_WRITING, "mds_done_writing" },
+       { MDS_SET_INFO,     "mds_set_info" },
+       { MDS_QUOTACHECK,   "mds_quotacheck" },
+       { MDS_QUOTACTL,     "mds_quotactl" },
+       { MDS_GETXATTR,     "mds_getxattr" },
+       { MDS_SETXATTR,     "mds_setxattr" },
+       { MDS_WRITEPAGE,    "mds_writepage" },
+       { MDS_IS_SUBDIR,    "mds_is_subdir" },
+       { MDS_GET_INFO,     "mds_get_info" },
+       { MDS_HSM_STATE_GET, "mds_hsm_state_get" },
+       { MDS_HSM_STATE_SET, "mds_hsm_state_set" },
+       { MDS_HSM_ACTION,   "mds_hsm_action" },
+       { MDS_HSM_PROGRESS, "mds_hsm_progress" },
+       { MDS_HSM_REQUEST,  "mds_hsm_request" },
+       { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" },
+       { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" },
+       { MDS_SWAP_LAYOUTS,     "mds_swap_layouts" },
+       { LDLM_ENQUEUE,     "ldlm_enqueue" },
+       { LDLM_CONVERT,     "ldlm_convert" },
+       { LDLM_CANCEL,      "ldlm_cancel" },
+       { LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+       { LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+       { LDLM_GL_CALLBACK, "ldlm_gl_callback" },
+       { LDLM_SET_INFO,    "ldlm_set_info" },
+       { MGS_CONNECT,      "mgs_connect" },
+       { MGS_DISCONNECT,   "mgs_disconnect" },
+       { MGS_EXCEPTION,    "mgs_exception" },
+       { MGS_TARGET_REG,   "mgs_target_reg" },
+       { MGS_TARGET_DEL,   "mgs_target_del" },
+       { MGS_SET_INFO,     "mgs_set_info" },
+       { MGS_CONFIG_READ,  "mgs_config_read" },
+       { OBD_PING,      "obd_ping" },
+       { OBD_LOG_CANCEL,   "llog_origin_handle_cancel" },
+       { OBD_QC_CALLBACK,  "obd_quota_callback" },
+       { OBD_IDX_READ,     "dt_index_read" },
+       { LLOG_ORIGIN_HANDLE_CREATE,     "llog_origin_handle_create" },
+       { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" },
+       { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" },
+       { LLOG_ORIGIN_HANDLE_WRITE_REC,  "llog_origin_handle_write_rec" },
+       { LLOG_ORIGIN_HANDLE_CLOSE,      "llog_origin_handle_close" },
+       { LLOG_ORIGIN_CONNECT,     "llog_origin_connect" },
+       { LLOG_CATINFO,           "llog_catinfo" },
+       { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" },
+       { LLOG_ORIGIN_HANDLE_DESTROY,    "llog_origin_handle_destroy" },
+       { QUOTA_DQACQ,      "quota_acquire" },
+       { QUOTA_DQREL,      "quota_release" },
+       { SEQ_QUERY,    "seq_query" },
+       { SEC_CTX_INIT,     "sec_ctx_init" },
+       { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" },
+       { SEC_CTX_FINI,     "sec_ctx_fini" },
+       { FLD_QUERY,    "fld_query" },
+       { UPDATE_OBJ,       "update_obj" },
+};
+
+struct ll_eopcode {
+     __u32       opcode;
+     const char *opname;
+} ll_eopcode_table[EXTRA_LAST_OPC] = {
+       { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+       { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+       { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+       { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+       { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+       { MDS_REINT_SETATTR,    "mds_reint_setattr" },
+       { MDS_REINT_CREATE,     "mds_reint_create" },
+       { MDS_REINT_LINK,       "mds_reint_link" },
+       { MDS_REINT_UNLINK,     "mds_reint_unlink" },
+       { MDS_REINT_RENAME,     "mds_reint_rename" },
+       { MDS_REINT_OPEN,       "mds_reint_open" },
+       { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+       { BRW_READ_BYTES,       "read_bytes" },
+       { BRW_WRITE_BYTES,      "write_bytes" },
+};
+
+const char *ll_opcode2str(__u32 opcode)
+{
+       /* When one of the assertions below fail, chances are that:
+        *     1) A new opcode was added in include/lustre/lustre_idl.h,
+        *      but is missing from the table above.
+        * or  2) The opcode space was renumbered or rearranged,
+        *      and the opcode_offset() function in
+        *      ptlrpc_internal.h needs to be modified.
+        */
+       __u32 offset = opcode_offset(opcode);
+       LASSERTF(offset < LUSTRE_MAX_OPCODES,
+                "offset %u >= LUSTRE_MAX_OPCODES %u\n",
+                offset, LUSTRE_MAX_OPCODES);
+       LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode,
+                "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n",
+                offset, ll_rpc_opcode_table[offset].opcode, opcode);
+       return ll_rpc_opcode_table[offset].opname;
+}
+
+const char *ll_eopcode2str(__u32 opcode)
+{
+       LASSERT(ll_eopcode_table[opcode].opcode == opcode);
+       return ll_eopcode_table[opcode].opname;
+}
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir,
+                            char *name, struct proc_dir_entry **procroot_ret,
+                            struct lprocfs_stats **stats_ret)
+{
+       struct proc_dir_entry *svc_procroot;
+       struct lprocfs_stats *svc_stats;
+       int i, rc;
+       unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX |
+                                         LPROCFS_CNTR_STDDEV;
+
+       LASSERT(*procroot_ret == NULL);
+       LASSERT(*stats_ret == NULL);
+
+       svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES,
+                                       0);
+       if (svc_stats == NULL)
+               return;
+
+       if (dir) {
+               svc_procroot = lprocfs_register(dir, root, NULL, NULL);
+               if (IS_ERR(svc_procroot)) {
+                       lprocfs_free_stats(&svc_stats);
+                       return;
+               }
+       } else {
+               svc_procroot = root;
+       }
+
+       lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR,
+                            svc_counter_config, "req_waittime", "usec");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR,
+                            svc_counter_config, "req_qdepth", "reqs");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR,
+                            svc_counter_config, "req_active", "reqs");
+       lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT,
+                            svc_counter_config, "req_timeout", "sec");
+       lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR,
+                            svc_counter_config, "reqbuf_avail", "bufs");
+       for (i = 0; i < EXTRA_LAST_OPC; i++) {
+               char *units;
+
+               switch (i) {
+               case BRW_WRITE_BYTES:
+               case BRW_READ_BYTES:
+                       units = "bytes";
+                       break;
+               default:
+                       units = "reqs";
+                       break;
+               }
+               lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i,
+                                    svc_counter_config,
+                                    ll_eopcode2str(i), units);
+       }
+       for (i = 0; i < LUSTRE_MAX_OPCODES; i++) {
+               __u32 opcode = ll_rpc_opcode_table[i].opcode;
+               lprocfs_counter_init(svc_stats,
+                                    EXTRA_MAX_OPCODES + i, svc_counter_config,
+                                    ll_opcode2str(opcode), "usec");
+       }
+
+       rc = lprocfs_register_stats(svc_procroot, name, svc_stats);
+       if (rc < 0) {
+               if (dir)
+                       lprocfs_remove(&svc_procroot);
+               lprocfs_free_stats(&svc_stats);
+       } else {
+               if (dir)
+                       *procroot_ret = svc_procroot;
+               *stats_ret = svc_stats;
+       }
+}
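+
+/*
+ * Illustrative sketch (not part of the original patch): a service would
+ * register its stats under an existing proc root roughly like this;
+ * "proc_root" and the srv_procroot/srv_stats destination fields are
+ * assumed names, not taken from this patch:
+ *
+ *	ptlrpc_lprocfs_register(proc_root, svc->srv_name, "stats",
+ *				&svc->srv_procroot, &svc->srv_stats);
+ */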
+
+static int
+ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, void *v)
+{
+       struct ptlrpc_service *svc = m->private;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svcpt->scp_hist_nrqbds;
+
+       return seq_printf(m, "%d\n", total);
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len);
+
+static int
+ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service *svc = m->private;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svc->srv_hist_nrqbds_cpt_max;
+
+       return seq_printf(m, "%d\n", total);
+}
+
+static ssize_t
+ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, const char *buffer,
+                                        size_t count, loff_t *off)
+{
+       struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+       int                         bufpages;
+       int                         val;
+       int                         rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0)
+               return -ERANGE;
+
+       /* This sanity check is more of an insanity check; we can still
+        * hose a kernel by allowing the request history to grow too
+        * far. */
+       bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       if (val > num_physpages/(2 * bufpages))
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+
+       if (val == 0)
+               svc->srv_hist_nrqbds_cpt_max = 0;
+       else
+               svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts));
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max);
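+
+/*
+ * Usage sketch (path and values illustrative): on a service with
+ * srv_ncpts == 4,
+ *
+ *     echo 128 > /proc/fs/lustre/ptlrpc/<svc>/req_buffer_history_max
+ *
+ * stores max(1, 128 / 4) == 32 request buffers per service partition, while
+ * reading the file reports the per-CPT maximum summed over all CPTs.
+ */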
+
+static int
+ptlrpc_lprocfs_threads_min_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service *svc = m->private;
+
+       return seq_printf(m, "%d\n",
+                       svc->srv_nthrs_cpt_init * svc->srv_ncpts);
+}
+
+static ssize_t
+ptlrpc_lprocfs_threads_min_seq_write(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off)
+{
+       struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+       int     val;
+       int     rc = lprocfs_write_helper(buffer, count, &val);
+
+       if (rc < 0)
+               return rc;
+
+       if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) {
+               spin_unlock(&svc->srv_lock);
+               return -ERANGE;
+       }
+
+       svc->srv_nthrs_cpt_init = val / svc->srv_ncpts;
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_min);
+
+static int
+ptlrpc_lprocfs_threads_started_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service *svc = m->private;
+       struct ptlrpc_service_part *svcpt;
+       int     total = 0;
+       int     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               total += svcpt->scp_nthrs_running;
+
+       return seq_printf(m, "%d\n", total);
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_threads_started);
+
+static int
+ptlrpc_lprocfs_threads_max_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service *svc = m->private;
+
+       return seq_printf(m, "%d\n",
+                       svc->srv_nthrs_cpt_limit * svc->srv_ncpts);
+}
+
+static ssize_t
+ptlrpc_lprocfs_threads_max_seq_write(struct file *file, const char *buffer,
+                                    size_t count, loff_t *off)
+{
+       struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+       int     val;
+       int     rc = lprocfs_write_helper(buffer, count, &val);
+
+       if (rc < 0)
+               return rc;
+
+       if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) {
+               spin_unlock(&svc->srv_lock);
+               return -ERANGE;
+       }
+
+       svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts;
+
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_threads_max);
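+
+/*
+ * Note on semantics (a sketch, not part of the original interface docs):
+ * threads_min and threads_max expose aggregate thread counts, i.e. the
+ * per-CPT value multiplied by srv_ncpts. A write of N stores N / srv_ncpts
+ * per partition and fails with -ERANGE if the per-CPT value would drop
+ * below PTLRPC_NTHRS_INIT or cross the opposite limit.
+ */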
+
+/**
+ * \addtogroup nrs
+ * @{
+ */
+extern struct nrs_core nrs_core;
+
+/**
+ * Translates \e ptlrpc_nrs_pol_state values to human-readable strings.
+ *
+ * \param[in] state The policy state
+ */
+static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state)
+{
+       switch (state) {
+       default:
+               LBUG();
+       case NRS_POL_STATE_INVALID:
+               return "invalid";
+       case NRS_POL_STATE_STOPPED:
+               return "stopped";
+       case NRS_POL_STATE_STOPPING:
+               return "stopping";
+       case NRS_POL_STATE_STARTING:
+               return "starting";
+       case NRS_POL_STATE_STARTED:
+               return "started";
+       }
+}
+
+/**
+ * Obtains status information for \a policy.
+ *
+ * Information is copied in \a info.
+ *
+ * \param[in] policy The policy
+ * \param[out] info  Holds returned status information
+ */
+void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy,
+                               struct ptlrpc_nrs_pol_info *info)
+{
+       LASSERT(policy != NULL);
+       LASSERT(info != NULL);
+       LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock));
+
+       memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX);
+
+       info->pi_fallback    = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK);
+       info->pi_state       = policy->pol_state;
+       /**
+        * XXX: These are accessed without holding
+        * ptlrpc_service_part::scp_req_lock.
+        */
+       info->pi_req_queued  = policy->pol_req_queued;
+       info->pi_req_started = policy->pol_req_started;
+}
+
+/**
+ * Reads and prints policy status information for all policies of a PTLRPC
+ * service.
+ */
+static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service          *svc = m->private;
+       struct ptlrpc_service_part     *svcpt;
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_pol_info     *infos;
+       struct ptlrpc_nrs_pol_info      tmp;
+       unsigned                        num_pols;
+       unsigned                        pol_idx = 0;
+       bool                            hp = false;
+       int                             i;
+       int                             rc = 0;
+       ENTRY;
+
+       /**
+        * Serialize NRS core lprocfs operations with policy registration/
+        * unregistration.
+        */
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Use the first service partition's regular NRS head in order to obtain
+        * the number of policies registered with NRS heads of this service. All
+        * service partitions will have the same number of policies.
+        */
+       nrs = nrs_svcpt2nrs(svc->srv_parts[0], false);
+
+       spin_lock(&nrs->nrs_lock);
+       num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols;
+       spin_unlock(&nrs->nrs_lock);
+
+       OBD_ALLOC(infos, num_pols * sizeof(*infos));
+       if (infos == NULL)
+               GOTO(out, rc = -ENOMEM);
+again:
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               nrs = nrs_svcpt2nrs(svcpt, hp);
+               spin_lock(&nrs->nrs_lock);
+
+               pol_idx = 0;
+
+               list_for_each_entry(policy, &nrs->nrs_policy_list,
+                                       pol_list) {
+                       LASSERT(pol_idx < num_pols);
+
+                       nrs_policy_get_info_locked(policy, &tmp);
+                       /**
+                        * Copy values when handling the first service
+                        * partition.
+                        */
+                       if (i == 0) {
+                               memcpy(infos[pol_idx].pi_name, tmp.pi_name,
+                                      NRS_POL_NAME_MAX);
+                               memcpy(&infos[pol_idx].pi_state, &tmp.pi_state,
+                                      sizeof(tmp.pi_state));
+                               infos[pol_idx].pi_fallback = tmp.pi_fallback;
+                               /**
+                                * For the rest of the service partitions
+                                * sanity-check the values we get.
+                                */
+                       } else {
+                               LASSERT(strncmp(infos[pol_idx].pi_name,
+                                               tmp.pi_name,
+                                               NRS_POL_NAME_MAX) == 0);
+                               /**
+                                * Not asserting ptlrpc_nrs_pol_info::pi_state,
+                                * because it may be different between
+                                * instances of the same policy in different
+                                * service partitions.
+                                */
+                               LASSERT(infos[pol_idx].pi_fallback ==
+                                       tmp.pi_fallback);
+                       }
+
+                       infos[pol_idx].pi_req_queued += tmp.pi_req_queued;
+                       infos[pol_idx].pi_req_started += tmp.pi_req_started;
+
+                       pol_idx++;
+               }
+               spin_unlock(&nrs->nrs_lock);
+       }
+
+       /**
+        * Policy status information output is in YAML format.
+        * For example:
+        *
+        *      regular_requests:
+        *        - name: fifo
+        *          state: started
+        *          fallback: yes
+        *          queued: 0
+        *          active: 0
+        *
+        *        - name: crrn
+        *          state: started
+        *          fallback: no
+        *          queued: 2015
+        *          active: 384
+        *
+        *      high_priority_requests:
+        *        - name: fifo
+        *          state: started
+        *          fallback: yes
+        *          queued: 0
+        *          active: 2
+        *
+        *        - name: crrn
+        *          state: stopped
+        *          fallback: no
+        *          queued: 0
+        *          active: 0
+        */
+       seq_printf(m, "%s\n",
+                     !hp ? "\nregular_requests:" : "high_priority_requests:");
+
+       for (pol_idx = 0; pol_idx < num_pols; pol_idx++) {
+               seq_printf(m,  "  - name: %s\n"
+                              "    state: %s\n"
+                              "    fallback: %s\n"
+                              "    queued: %-20d\n"
+                              "    active: %-20d\n\n",
+                              infos[pol_idx].pi_name,
+                              nrs_state2str(infos[pol_idx].pi_state),
+                              infos[pol_idx].pi_fallback ? "yes" : "no",
+                              (int)infos[pol_idx].pi_req_queued,
+                              (int)infos[pol_idx].pi_req_started);
+       }
+
+       if (!hp && nrs_svc_has_hp(svc)) {
+               memset(infos, 0, num_pols * sizeof(*infos));
+
+               /**
+                * Redo the processing for the service's HP NRS heads' policies.
+                */
+               hp = true;
+               goto again;
+       }
+
+out:
+       if (infos)
+               OBD_FREE(infos, num_pols * sizeof(*infos));
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+
+/**
+ * The longest valid command string is the maximum policy name size, plus the
+ * length of the " reg" substring.
+ */
+#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1)
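+
+/*
+ * For example (sizes illustrative): with NRS_POL_NAME_MAX == 16 this allows
+ * a 15-character policy name, the 4-character " reg" suffix and the
+ * terminating NUL.
+ */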
+
+/**
+ * Starts and stops a given policy on a PTLRPC service.
+ *
+ * Commands consist of the policy name, followed by an optional [reg|hp] token;
+ * if the optional token is omitted, the operation is performed on both the
+ * regular and high-priority (if the service has one) NRS heads.
+ */
+static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file, const char *buffer,
+                                       size_t count, loff_t *off)
+{
+       struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+       enum ptlrpc_nrs_queue_type      queue = PTLRPC_NRS_QUEUE_BOTH;
+       char                           *cmd;
+       char                           *cmd_copy = NULL;
+       char                           *token;
+       int                             rc = 0;
+       ENTRY;
+
+       if (count >= LPROCFS_NRS_WR_MAX_CMD)
+               GOTO(out, rc = -EINVAL);
+
+       OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD);
+       if (cmd == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /**
+        * strsep() modifies its argument, so keep a copy
+        */
+       cmd_copy = cmd;
+
+       if (copy_from_user(cmd, buffer, count))
+               GOTO(out, rc = -EFAULT);
+
+       cmd[count] = '\0';
+
+       token = strsep(&cmd, " ");
+
+       if (strlen(token) > NRS_POL_NAME_MAX - 1)
+               GOTO(out, rc = -EINVAL);
+
+       /**
+        * No [reg|hp] token has been specified
+        */
+       if (cmd == NULL)
+               goto default_queue;
+
+       /**
+        * The second token is either NULL, or an optional [reg|hp] string
+        */
+       if (strcmp(cmd, "reg") == 0)
+               queue = PTLRPC_NRS_QUEUE_REG;
+       else if (strcmp(cmd, "hp") == 0)
+               queue = PTLRPC_NRS_QUEUE_HP;
+       else
+               GOTO(out, rc = -EINVAL);
+
+default_queue:
+
+       if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc))
+               GOTO(out, rc = -ENODEV);
+       else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc))
+               queue = PTLRPC_NRS_QUEUE_REG;
+
+       /**
+        * Serialize NRS core lprocfs operations with policy registration/
+        * unregistration.
+        */
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START,
+                                      false, NULL);
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+out:
+       if (cmd_copy)
+               OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD);
+
+       RETURN(rc < 0 ? rc : count);
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs);
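+
+/*
+ * Usage sketch (paths and policy names are illustrative):
+ *
+ *     cat .../<svc>/nrs_policies                  # YAML status, as above
+ *     echo "crrn"     > .../<svc>/nrs_policies    # start crrn on both heads
+ *     echo "crrn hp"  > .../<svc>/nrs_policies    # HP head only
+ *     echo "crrn reg" > .../<svc>/nrs_policies    # regular head only
+ */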
+
+/** @} nrs */
+
+struct ptlrpc_srh_iterator {
+       int                     srhi_idx;
+       __u64                   srhi_seq;
+       struct ptlrpc_request   *srhi_req;
+};
+
+int
+ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt,
+                                   struct ptlrpc_srh_iterator *srhi,
+                                   __u64 seq)
+{
+       struct list_head                *e;
+       struct ptlrpc_request   *req;
+
+       if (srhi->srhi_req != NULL &&
+           srhi->srhi_seq > svcpt->scp_hist_seq_culled &&
+           srhi->srhi_seq <= seq) {
+               /* If srhi_req was set previously, hasn't been culled and
+                * we're searching for a seq on or after it (i.e. more
+                * recent), search from it onwards.
+                * Since the service history is LRU (i.e. culled reqs will
+                * be near the head), we shouldn't have to do long
+                * re-scans */
+               LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq,
+                        "%s:%d: seek seq "LPU64", request seq "LPU64"\n",
+                        svcpt->scp_service->srv_name, svcpt->scp_cpt,
+                        srhi->srhi_seq, srhi->srhi_req->rq_history_seq);
+               LASSERTF(!list_empty(&svcpt->scp_hist_reqs),
+                        "%s:%d: seek offset "LPU64", request seq "LPU64", "
+                        "last culled "LPU64"\n",
+                        svcpt->scp_service->srv_name, svcpt->scp_cpt,
+                        seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled);
+               e = &srhi->srhi_req->rq_history_list;
+       } else {
+               /* search from start */
+               e = svcpt->scp_hist_reqs.next;
+       }
+
+       while (e != &svcpt->scp_hist_reqs) {
+               req = list_entry(e, struct ptlrpc_request, rq_history_list);
+
+               if (req->rq_history_seq >= seq) {
+                       srhi->srhi_seq = req->rq_history_seq;
+                       srhi->srhi_req = req;
+                       return 0;
+               }
+               e = e->next;
+       }
+
+       return -ENOENT;
+}
+
+/*
+ * The ptlrpc history sequence is used as the "position" of the seq_file; in
+ * some cases seq_read() will increase the position to indicate reading the
+ * next element. However, the low bits of the history sequence are reserved
+ * for the CPT id (see the comments before ptlrpc_req_add_history), which
+ * means seq_read() might change the CPT id of the history sequence and never
+ * finish reading the requests on a CPT. To make this work, we shift the CPT
+ * id to the high bits and the timestamp to the low bits, so seq_read() only
+ * increases the timestamp, which correctly indicates the next position.
+ */
+
+/* convert seq_file pos to cpt */
+#define PTLRPC_REQ_POS2CPT(svc, pos)                   \
+       ((svc)->srv_cpt_bits == 0 ? 0 :                 \
+        (__u64)(pos) >> (64 - (svc)->srv_cpt_bits))
+
+/* make up seq_file pos from cpt */
+#define PTLRPC_REQ_CPT2POS(svc, cpt)                   \
+       ((svc)->srv_cpt_bits == 0 ? 0 :                 \
+        (cpt) << (64 - (svc)->srv_cpt_bits))
+
+/* convert sequence to position */
+#define PTLRPC_REQ_SEQ2POS(svc, seq)                   \
+       ((svc)->srv_cpt_bits == 0 ? (seq) :             \
+        ((seq) >> (svc)->srv_cpt_bits) |               \
+        ((seq) << (64 - (svc)->srv_cpt_bits)))
+
+/* convert position to sequence */
+#define PTLRPC_REQ_POS2SEQ(svc, pos)                   \
+       ((svc)->srv_cpt_bits == 0 ? (pos) :             \
+        ((__u64)(pos) << (svc)->srv_cpt_bits) |        \
+        ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits)))
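+
+/*
+ * Worked example (values illustrative): with srv_cpt_bits == 2, a request
+ * on CPT 1 with per-CPT timestamp 5 has history sequence
+ * seq = (5 << 2) | 1 = 21.  PTLRPC_REQ_SEQ2POS() then gives
+ * pos = (21 >> 2) | (21 << 62) = (1ULL << 62) | 5, i.e. the CPT id in the
+ * top bits; PTLRPC_REQ_POS2CPT() recovers 1 and PTLRPC_REQ_POS2SEQ()
+ * recovers 21, so incrementing pos only advances the timestamp.
+ */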
+
+static void *
+ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_srh_iterator      *srhi;
+       unsigned int                    cpt;
+       int                             rc;
+       int                             i;
+
+       if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */
+               CWARN("Failed to read request history because size of loff_t "
+                     "%d can't match size of u64\n", (int)sizeof(loff_t));
+               return NULL;
+       }
+
+       OBD_ALLOC(srhi, sizeof(*srhi));
+       if (srhi == NULL)
+               return NULL;
+
+       srhi->srhi_seq = 0;
+       srhi->srhi_req = NULL;
+
+       cpt = PTLRPC_REQ_POS2CPT(svc, *pos);
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (i < cpt) /* skip */
+                       continue;
+               if (i > cpt) /* make up the lowest position for this CPT */
+                       *pos = PTLRPC_REQ_CPT2POS(svc, i);
+
+               spin_lock(&svcpt->scp_lock);
+               rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi,
+                               PTLRPC_REQ_POS2SEQ(svc, *pos));
+               spin_unlock(&svcpt->scp_lock);
+               if (rc == 0) {
+                       *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+                       srhi->srhi_idx = i;
+                       return srhi;
+               }
+       }
+
+       OBD_FREE(srhi, sizeof(*srhi));
+       return NULL;
+}
+
+static void
+ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter)
+{
+       struct ptlrpc_srh_iterator *srhi = iter;
+
+       if (srhi != NULL)
+               OBD_FREE(srhi, sizeof(*srhi));
+}
+
+static void *
+ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s,
+                                   void *iter, loff_t *pos)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_srh_iterator      *srhi = iter;
+       struct ptlrpc_service_part      *svcpt;
+       __u64                           seq;
+       int                             rc;
+       int                             i;
+
+       for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) {
+               svcpt = svc->srv_parts[i];
+
+               if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */
+                       srhi->srhi_req = NULL;
+                       seq = srhi->srhi_seq = 0;
+               } else { /* the next sequence */
+                       seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits);
+               }
+
+               spin_lock(&svcpt->scp_lock);
+               rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq);
+               spin_unlock(&svcpt->scp_lock);
+               if (rc == 0) {
+                       *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq);
+                       srhi->srhi_idx = i;
+                       return srhi;
+               }
+       }
+
+       OBD_FREE(srhi, sizeof(*srhi));
+       return NULL;
+}
+
+/* common ost/mdt so_req_printer */
+void target_print_req(void *seq_file, struct ptlrpc_request *req)
+{
+       /* Called holding srv_lock with irqs disabled.
+        * Print specific req contents and a newline.
+        * CAVEAT EMPTOR: check request message length before printing!!!
+        * You might have received any old crap so you must be just as
+        * careful here as the service's request parser!!! */
+       struct seq_file *sf = seq_file;
+
+       switch (req->rq_phase) {
+       case RQ_PHASE_NEW:
+               /* still awaiting a service thread's attention, or rejected
+                * because the generic request message didn't unpack */
+               seq_printf(sf, "<not swabbed>\n");
+               break;
+       case RQ_PHASE_INTERPRET:
+               /* being handled, so basic msg swabbed, and opc is valid
+                * but racing with mds_handle() */
+       case RQ_PHASE_COMPLETE:
+               /* been handled by mds_handle() reply state possibly still
+                * volatile */
+               seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg));
+               break;
+       default:
+               DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase);
+       }
+}
+EXPORT_SYMBOL(target_print_req);
+
+static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
+{
+       struct ptlrpc_service           *svc = s->private;
+       struct ptlrpc_srh_iterator      *srhi = iter;
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_request           *req;
+       int                             rc;
+
+       LASSERT(srhi->srhi_idx < svc->srv_ncpts);
+
+       svcpt = svc->srv_parts[srhi->srhi_idx];
+
+       spin_lock(&svcpt->scp_lock);
+
+       rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq);
+
+       if (rc == 0) {
+               req = srhi->srhi_req;
+
+               /* Print common req fields.
+                * CAVEAT EMPTOR: we're racing with the service handler
+                * here.  The request could contain any old crap, so you
+                * must be just as careful as the service's request
+                * parser. Currently I only print stuff here I know is OK
+                * to look at because it was set up in request_in_callback()!!! */
+               seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ",
+                          req->rq_history_seq, libcfs_nid2str(req->rq_self),
+                          libcfs_id2str(req->rq_peer), req->rq_xid,
+                          req->rq_reqlen, ptlrpc_rqphase2str(req),
+                          req->rq_arrival_time.tv_sec,
+                          req->rq_sent - req->rq_arrival_time.tv_sec,
+                          req->rq_sent - req->rq_deadline);
+               if (svc->srv_ops.so_req_printer == NULL)
+                       seq_printf(s, "\n");
+               else
+                       svc->srv_ops.so_req_printer(s, srhi->srhi_req);
+       }
+
+       spin_unlock(&svcpt->scp_lock);
+       return rc;
+}
+
+static int
+ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file)
+{
+       static struct seq_operations sops = {
+               .start = ptlrpc_lprocfs_svc_req_history_start,
+               .stop  = ptlrpc_lprocfs_svc_req_history_stop,
+               .next  = ptlrpc_lprocfs_svc_req_history_next,
+               .show  = ptlrpc_lprocfs_svc_req_history_show,
+       };
+       struct seq_file       *seqf;
+       int                 rc;
+
+       rc = seq_open(file, &sops);
+       if (rc)
+               return rc;
+
+       seqf = file->private_data;
+       seqf->private = PDE_DATA(inode);
+       return 0;
+}
+
+/* See also lprocfs_rd_timeouts */
+static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n)
+{
+       struct ptlrpc_service           *svc = m->private;
+       struct ptlrpc_service_part      *svcpt;
+       struct dhms                     ts;
+       time_t                          worstt;
+       unsigned int                    cur;
+       unsigned int                    worst;
+       int                             i;
+
+       if (AT_OFF) {
+               seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n",
+                              obd_timeout);
+               return 0;
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               cur     = at_get(&svcpt->scp_at_estimate);
+               worst   = svcpt->scp_at_estimate.at_worst_ever;
+               worstt  = svcpt->scp_at_estimate.at_worst_time;
+               s2dhms(&ts, cfs_time_current_sec() - worstt);
+
+               seq_printf(m, "%10s : cur %3u  worst %3u (at %ld, "
+                             DHMS_FMT" ago) ", "service",
+                             cur, worst, worstt, DHMS_VARS(&ts));
+
+               lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate);
+       }
+
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts);
+
+static int ptlrpc_lprocfs_hp_ratio_seq_show(struct seq_file *m, void *v)
+{
+       struct ptlrpc_service *svc = m->private;
+       return seq_printf(m, "%d", svc->srv_hpreq_ratio);
+}
+
+static ssize_t ptlrpc_lprocfs_hp_ratio_seq_write(struct file *file,
+                                            const char *buffer,
+                                            size_t count,
+                                            loff_t *off)
+{
+       struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private;
+       int     rc;
+       int     val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val < 0)
+               return -ERANGE;
+
+       spin_lock(&svc->srv_lock);
+       svc->srv_hpreq_ratio = val;
+       spin_unlock(&svc->srv_lock);
+
+       return count;
+}
+LPROC_SEQ_FOPS(ptlrpc_lprocfs_hp_ratio);
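+
+/*
+ * Usage sketch (value illustrative): echo 10 > .../high_priority_ratio
+ * simply stores 10 in srv_hpreq_ratio under srv_lock; how the ratio is
+ * applied is up to the service request scheduler.
+ */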
+
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
+                                    struct ptlrpc_service *svc)
+{
+       struct lprocfs_vars lproc_vars[] = {
+               {.name       = "high_priority_ratio",
+                .fops       = &ptlrpc_lprocfs_hp_ratio_fops,
+                .data       = svc},
+               {.name       = "req_buffer_history_len",
+                .fops       = &ptlrpc_lprocfs_req_history_len_fops,
+                .data       = svc},
+               {.name       = "req_buffer_history_max",
+                .fops       = &ptlrpc_lprocfs_req_history_max_fops,
+                .data       = svc},
+               {.name       = "threads_min",
+                .fops       = &ptlrpc_lprocfs_threads_min_fops,
+                .data       = svc},
+               {.name       = "threads_max",
+                .fops       = &ptlrpc_lprocfs_threads_max_fops,
+                .data       = svc},
+               {.name       = "threads_started",
+                .fops       = &ptlrpc_lprocfs_threads_started_fops,
+                .data       = svc},
+               {.name       = "timeouts",
+                .fops       = &ptlrpc_lprocfs_timeouts_fops,
+                .data       = svc},
+               {.name       = "nrs_policies",
+                .fops       = &ptlrpc_lprocfs_nrs_fops,
+                .data       = svc},
+               {NULL}
+       };
+       static struct file_operations req_history_fops = {
+               .owner       = THIS_MODULE,
+               .open   = ptlrpc_lprocfs_svc_req_history_open,
+               .read   = seq_read,
+               .llseek      = seq_lseek,
+               .release     = lprocfs_seq_release,
+       };
+
+       int rc;
+
+       ptlrpc_lprocfs_register(entry, svc->srv_name,
+                               "stats", &svc->srv_procroot,
+                               &svc->srv_stats);
+
+       if (svc->srv_procroot == NULL)
+               return;
+
+       lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL);
+
+       rc = lprocfs_seq_create(svc->srv_procroot, "req_history",
+                               0400, &req_history_fops, svc);
+       if (rc)
+               CWARN("Error adding the req_history file\n");
+}
+
+void ptlrpc_lprocfs_register_obd(struct obd_device *obddev)
+{
+       ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats",
+                               &obddev->obd_svc_procroot,
+                               &obddev->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd);
+
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount)
+{
+       struct lprocfs_stats *svc_stats;
+       __u32 op = lustre_msg_get_opc(req->rq_reqmsg);
+       int opc = opcode_offset(op);
+
+       svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+       if (svc_stats == NULL || opc <= 0)
+               return;
+       LASSERT(opc < LUSTRE_MAX_OPCODES);
+       if (!(op == LDLM_ENQUEUE || op == MDS_REINT))
+               lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount);
+}
+
+void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes)
+{
+       struct lprocfs_stats *svc_stats;
+       int idx;
+
+       if (!req->rq_import)
+               return;
+       svc_stats = req->rq_import->imp_obd->obd_svc_stats;
+       if (!svc_stats)
+               return;
+       idx = lustre_msg_get_opc(req->rq_reqmsg);
+       switch (idx) {
+       case OST_READ:
+               idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR;
+               break;
+       case OST_WRITE:
+               idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR;
+               break;
+       default:
+               LASSERTF(0, "unsupported opcode %u\n", idx);
+               break;
+       }
+
+       lprocfs_counter_add(svc_stats, idx, bytes);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_brw);
+
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+       if (svc->srv_procroot != NULL)
+               lprocfs_remove(&svc->srv_procroot);
+
+       if (svc->srv_stats)
+               lprocfs_free_stats(&svc->srv_stats);
+}
+
+void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd)
+{
+       if (obd->obd_svc_procroot)
+               lprocfs_remove(&obd->obd_svc_procroot);
+
+       if (obd->obd_svc_stats)
+               lprocfs_free_stats(&obd->obd_svc_stats);
+}
+EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd);
+
+
+#define BUFLEN (UUID_MAX + 5)
+
+int lprocfs_wr_evict_client(struct file *file, const char *buffer,
+                           size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       char          *kbuf;
+       char          *tmpbuf;
+
+       OBD_ALLOC(kbuf, BUFLEN);
+       if (kbuf == NULL)
+               return -ENOMEM;
+
+       /*
+        * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1
+        * bytes into kbuf, to ensure that the string is NUL-terminated.
+        * UUID_MAX should include a trailing NUL already.
+        */
+       if (copy_from_user(kbuf, buffer,
+                              min_t(unsigned long, BUFLEN - 1, count))) {
+               count = -EFAULT;
+               goto out;
+       }
+       tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count));
+       /* Kludge to avoid a deadlock: the lprocfs lock is already held,
+        * since the client is evicted by writing its uuid/nid to the
+        * procfs "evict_client" entry. However,
+        * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy
+        * the proc entries under the export being destroyed, so we have
+        * to drop the lock first here.
+        * - jay, jxiong@clusterfs.com */
+       class_incref(obd, __FUNCTION__, current);
+
+       if (strncmp(tmpbuf, "nid:", 4) == 0)
+               obd_export_evict_by_nid(obd, tmpbuf + 4);
+       else if (strncmp(tmpbuf, "uuid:", 5) == 0)
+               obd_export_evict_by_uuid(obd, tmpbuf + 5);
+       else
+               obd_export_evict_by_uuid(obd, tmpbuf);
+
+       class_decref(obd, __FUNCTION__, current);
+
+out:
+       OBD_FREE(kbuf, BUFLEN);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_evict_client);
+
+#undef BUFLEN
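+
+/*
+ * Usage sketch for evict_client (values illustrative):
+ *
+ *     echo "nid:192.168.0.1@tcp" > .../evict_client   # evict by NID
+ *     echo "uuid:<client_uuid>"  > .../evict_client   # evict by UUID
+ *
+ * A string without a "nid:" or "uuid:" prefix is treated as a UUID.
+ */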
+
+int lprocfs_wr_ping(struct file *file, const char *buffer,
+                   size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct ptlrpc_request *req;
+       int                 rc;
+       ENTRY;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+       LPROCFS_CLIMP_EXIT(obd);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+       if (rc >= 0)
+               RETURN(count);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_wr_ping);
+
+/* Write the connection UUID to this file to attempt to connect to that node.
+ * The connection UUID is a node's primary NID. For example,
+ * "echo connection=192.168.0.1@tcp0::instance > .../import".
+ */
+int lprocfs_wr_import(struct file *file, const char *buffer,
+                     size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       char *kbuf = NULL;
+       char *uuid;
+       char *ptr;
+       int do_reconn = 1;
+       const char prefix[] = "connection=";
+       const int prefix_len = sizeof(prefix) - 1;
+
+       if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+               return -EINVAL;
+
+       OBD_ALLOC(kbuf, count + 1);
+       if (kbuf == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(kbuf, buffer, count))
+               GOTO(out, count = -EFAULT);
+
+       kbuf[count] = 0;
+
+       /* only support connection=uuid::instance now */
+       if (strncmp(prefix, kbuf, prefix_len) != 0)
+               GOTO(out, count = -EINVAL);
+
+       uuid = kbuf + prefix_len;
+       ptr = strstr(uuid, "::");
+       if (ptr) {
+               __u32 inst;
+               char *endptr;
+
+               *ptr = 0;
+               do_reconn = 0;
+               ptr += strlen("::");
+               inst = simple_strtol(ptr, &endptr, 10);
+               if (*endptr) {
+                       CERROR("config: wrong instance # %s\n", ptr);
+               } else if (inst != imp->imp_connect_data.ocd_instance) {
+                       CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted "
+                              "target(%u/%u), reconnecting...\n",
+                              imp->imp_obd->obd_name,
+                              imp->imp_connect_data.ocd_instance, inst);
+                       do_reconn = 1;
+               } else {
+                       CDEBUG(D_INFO, "IR: %s has already been connecting to "
+                              "new target(%u)\n",
+                              imp->imp_obd->obd_name, inst);
+               }
+       }
+
+       if (do_reconn)
+               ptlrpc_recover_import(imp, uuid, 1);
+
+out:
+       OBD_FREE(kbuf, count + 1);
+       return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_import);
+
+int lprocfs_rd_pinger_recov(struct seq_file *m, void *n)
+{
+       struct obd_device *obd = m->private;
+       struct obd_import *imp = obd->u.cli.cl_import;
+       int rc;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       rc = seq_printf(m, "%d\n", !imp->imp_no_pinger_recover);
+       LPROCFS_CLIMP_EXIT(obd);
+
+       return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_pinger_recov);
+
+int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
+                     size_t count, loff_t *off)
+{
+       struct obd_device *obd = ((struct seq_file *)file->private_data)->private;
+       struct client_obd *cli = &obd->u.cli;
+       struct obd_import *imp = cli->cl_import;
+       int rc, val;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc < 0)
+               return rc;
+
+       if (val != 0 && val != 1)
+               return -ERANGE;
+
+       LPROCFS_CLIMP_CHECK(obd);
+       spin_lock(&imp->imp_lock);
+       imp->imp_no_pinger_recover = !val;
+       spin_unlock(&imp->imp_lock);
+       LPROCFS_CLIMP_EXIT(obd);
+
+       return count;
+
+}
+EXPORT_SYMBOL(lprocfs_wr_pinger_recov);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c
new file mode 100644 (file)
index 0000000..de3f0db
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_lib.h>
+#include <obd.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * Helper function. Sends \a len bytes from \a base at offset \a offset
+ * over \a conn connection to portal \a portal.
+ * Returns 0 on success or error code.
+ */
+static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
+                        lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+                        struct ptlrpc_connection *conn, int portal, __u64 xid,
+                        unsigned int offset)
+{
+       int           rc;
+       lnet_md_t        md;
+       ENTRY;
+
+       LASSERT (portal != 0);
+       LASSERT (conn != NULL);
+       CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+       md.start     = base;
+       md.length    = len;
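+       /* threshold counts LNet events on this MD: one for the send
+        * completion, plus one for the ACK when one is requested */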
+       md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
+       md.options   = PTLRPC_MD_OPTIONS;
+       md.user_ptr  = cbid;
+       md.eq_handle = ptlrpc_eq_h;
+
+       if (unlikely(ack == LNET_ACK_REQ &&
+                    OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
+               /* don't ask for the ack to simulate failing client */
+               ack = LNET_NOACK_REQ;
+       }
+
+       rc = LNetMDBind (md, LNET_UNLINK, mdh);
+       if (unlikely(rc != 0)) {
+               CERROR ("LNetMDBind failed: %d\n", rc);
+               LASSERT (rc == -ENOMEM);
+               RETURN (-ENOMEM);
+       }
+
+       CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n",
+              len, portal, xid, offset);
+
+       rc = LNetPut (conn->c_self, *mdh, ack,
+                     conn->c_peer, portal, xid, offset, 0);
+       if (unlikely(rc != 0)) {
+               int rc2;
+               /* We're going to get an UNLINK event when I unlink below,
+                * which will complete just like any other failed send, so
+                * I fall through and return success here! */
+               CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n",
+                      libcfs_id2str(conn->c_peer), portal, xid, rc);
+               rc2 = LNetMDUnlink(*mdh);
+               LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+       }
+
+       RETURN (0);
+}
+
+static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
+{
+       int i;
+
+       for (i = 0; i < count; i++)
+               LNetMDUnlink(bd_mds[i]);
+}
+
+
+/**
+ * Register bulk at the sender for later transfer.
+ * Returns 0 on success or error code.
+ */
+int ptlrpc_register_bulk(struct ptlrpc_request *req)
+{
+       struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+       lnet_process_id_t peer;
+       int rc = 0;
+       int rc2;
+       int posted_md;
+       int total_md;
+       __u64 xid;
+       lnet_handle_me_t  me_h;
+       lnet_md_t        md;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET))
+               RETURN(0);
+
+       /* NB no locking required until desc is on the network */
+       LASSERT(desc->bd_nob > 0);
+       LASSERT(desc->bd_md_count == 0);
+       LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT);
+       LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+       LASSERT(desc->bd_req != NULL);
+       LASSERT(desc->bd_type == BULK_PUT_SINK ||
+               desc->bd_type == BULK_GET_SOURCE);
+
+       /* cleanup the state of the bulk for it will be reused */
+       if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY)
+               desc->bd_nob_transferred = 0;
+       else
+               LASSERT(desc->bd_nob_transferred == 0);
+
+       desc->bd_failure = 0;
+
+       peer = desc->bd_import->imp_connection->c_peer;
+
+       LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback);
+       LASSERT(desc->bd_cbid.cbid_arg == desc);
+
+       /* An XID is only used for a single request from the client.
+        * For retried bulk transfers, a new XID will be allocated in
+        * ptlrpc_check_set() if it needs to be resent, so it is not
+        * using the same RDMA match bits after an error.
+        *
+        * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The
+        * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */
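+       /* e.g. (illustrative) with bd_md_max_brw == 4, rq_xid == 0x1007 and
+        * four MDs to post, the MDs use XIDs 0x1004..0x1007 and rq_xid is
+        * reset to the final XID below. */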
+       xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1);
+       LASSERTF(!(desc->bd_registered &&
+                  req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+                xid != desc->bd_last_xid,
+                "registered: %d  rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
+                desc->bd_registered, xid, desc->bd_last_xid);
+
+       total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV;
+       desc->bd_registered = 1;
+       desc->bd_last_xid = xid;
+       desc->bd_md_count = total_md;
+       md.user_ptr = &desc->bd_cbid;
+       md.eq_handle = ptlrpc_eq_h;
+       md.threshold = 1;                      /* PUT or GET */
+
+       for (posted_md = 0; posted_md < total_md; posted_md++, xid++) {
+               md.options = PTLRPC_MD_OPTIONS |
+                            ((desc->bd_type == BULK_GET_SOURCE) ?
+                             LNET_MD_OP_GET : LNET_MD_OP_PUT);
+               ptlrpc_fill_bulk_md(&md, desc, posted_md);
+
+               rc = LNetMEAttach(desc->bd_portal, peer, xid, 0,
+                                 LNET_UNLINK, LNET_INS_AFTER, &me_h);
+               if (rc != 0) {
+                       CERROR("%s: LNetMEAttach failed x"LPU64"/%d: rc = %d\n",
+                              desc->bd_export->exp_obd->obd_name, xid,
+                              posted_md, rc);
+                       break;
+               }
+
+               /* About to let the network at it... */
+               rc = LNetMDAttach(me_h, md, LNET_UNLINK,
+                                 &desc->bd_mds[posted_md]);
+               if (rc != 0) {
+                       CERROR("%s: LNetMDAttach failed x"LPU64"/%d: rc = %d\n",
+                              desc->bd_export->exp_obd->obd_name, xid,
+                              posted_md, rc);
+                       rc2 = LNetMEUnlink(me_h);
+                       LASSERT(rc2 == 0);
+                       break;
+               }
+       }
+
+       if (rc != 0) {
+               LASSERT(rc == -ENOMEM);
+               spin_lock(&desc->bd_lock);
+               desc->bd_md_count -= total_md - posted_md;
+               spin_unlock(&desc->bd_lock);
+               LASSERT(desc->bd_md_count >= 0);
+               mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+               req->rq_status = -ENOMEM;
+               RETURN(-ENOMEM);
+       }
+
+       /* Set rq_xid to matchbits of the final bulk so that server can
+        * infer the number of bulks that were prepared */
+       req->rq_xid = --xid;
+       LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK),
+                "bd_last_xid = x"LPU64", rq_xid = x"LPU64"\n",
+                desc->bd_last_xid, req->rq_xid);
+
+       spin_lock(&desc->bd_lock);
+       /* Holler if peer manages to touch buffers before he knows the xid */
+       if (desc->bd_md_count != total_md)
+               CWARN("%s: Peer %s touched %d buffers while I registered\n",
+                     desc->bd_export->exp_obd->obd_name, libcfs_id2str(peer),
+                     total_md - desc->bd_md_count);
+       spin_unlock(&desc->bd_lock);
+
+       CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, "
+              "xid x"LPX64"-"LPX64", portal %u\n", desc->bd_md_count,
+              desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+              desc->bd_iov_count, desc->bd_nob,
+              desc->bd_last_xid, req->rq_xid, desc->bd_portal);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+
+/**
+ * Disconnect a bulk desc from the network. Idempotent. Not
+ * thread-safe (i.e. only interlocks with completion callback).
+ * Returns 1 on success or 0 if network unregistration failed for whatever
+ * reason.
+ */
+int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async)
+{
+       struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+       wait_queue_head_t            *wq;
+       struct l_wait_info       lwi;
+       int                   rc;
+       ENTRY;
+
+       LASSERT(!in_interrupt());     /* might sleep */
+
+       /* Let's setup deadline for reply unlink. */
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) &&
+           async && req->rq_bulk_deadline == 0)
+               req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK;
+
+       if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
+               RETURN(1);                              /* never registered */
+
+       LASSERT(desc->bd_req == req);  /* bd_req NULL until registered */
+
+       /* the unlink ensures the callback happens ASAP and is the last
+        * one.  If it fails, it must be because completion just happened,
+        * but we must still l_wait_event() in this case to give liblustre
+        * a chance to run client_bulk_callback() */
+       mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw);
+
+       if (ptlrpc_client_bulk_active(req) == 0)        /* completed or */
+               RETURN(1);                              /* never registered */
+
+       /* Move to "Unregistering" phase as bulk was not unlinked yet. */
+       ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
+
+       /* Do not wait for unlink to finish. */
+       if (async)
+               RETURN(0);
+
+       if (req->rq_set != NULL)
+               wq = &req->rq_set->set_waitq;
+       else
+               wq = &req->rq_reply_waitq;
+
+       for (;;) {
+               /* Network access will complete in finite time but the HUGE
+                * timeout lets us CWARN for visibility of sluggish NALs */
+               lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
+                                          cfs_time_seconds(1), NULL, NULL);
+               rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi);
+               if (rc == 0) {
+                       ptlrpc_rqphase_move(req, req->rq_next_phase);
+                       RETURN(1);
+               }
+
+               LASSERT(rc == -ETIMEDOUT);
+               DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p",
+                         desc);
+       }
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+
+static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags)
+{
+       struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_service           *svc = svcpt->scp_service;
+       int service_time = max_t(int, cfs_time_current_sec() -
+                                req->rq_arrival_time.tv_sec, 1);
+
+       if (!(flags & PTLRPC_REPLY_EARLY) &&
+           (req->rq_type != PTL_RPC_MSG_ERR) &&
+           (req->rq_reqmsg != NULL) &&
+           !(lustre_msg_get_flags(req->rq_reqmsg) &
+             (MSG_RESENT | MSG_REPLAY |
+              MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) {
+               /* early replies, errors and recovery requests don't count
+                * toward our service time estimate */
+               int oldse = at_measured(&svcpt->scp_at_estimate, service_time);
+
+               if (oldse != 0) {
+                       DEBUG_REQ(D_ADAPTTO, req,
+                                 "svc %s changed estimate from %d to %d",
+                                 svc->srv_name, oldse,
+                                 at_get(&svcpt->scp_at_estimate));
+               }
+       }
+       /* Report actual service time for client latency calc */
+       lustre_msg_set_service_time(req->rq_repmsg, service_time);
+       /* Report service time estimate for future client reqs, but report 0
+        * (to be ignored by client) if it's an error reply during recovery.
+        * (bz15815) */
+       if (req->rq_type == PTL_RPC_MSG_ERR &&
+           (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering))
+               lustre_msg_set_timeout(req->rq_repmsg, 0);
+       else
+               lustre_msg_set_timeout(req->rq_repmsg,
+                                      at_get(&svcpt->scp_at_estimate));
+
+       if (req->rq_reqmsg &&
+           !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+               CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x "
+                      "req_flags=%#x magic=%d:%x/%x len=%d\n",
+                      flags, lustre_msg_get_flags(req->rq_reqmsg),
+                      lustre_msg_is_v1(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_reqmsg),
+                      lustre_msg_get_magic(req->rq_repmsg), req->rq_replen);
+       }
+}
+
+/**
+ * Send the reply for request \a req from its reply buffer.
+ * \a flags defines the reply type.
+ * Returns 0 on success or an error code.
+ */
+int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct ptlrpc_connection  *conn;
+       int                     rc;
+
+       /* We must already have a reply buffer (only ptlrpc_error() may be
+        * called without one). The reply generated by the sptlrpc layer (e.g.
+        * error notify, etc.) might have a NULL req->rq_reqmsg; otherwise we
+        * must have a request buffer which is either the actual (swabbed)
+        * incoming request, or a saved copy if this is a req saved in
+        * target_queue_final_reply().
+        */
+       LASSERT (req->rq_no_reply == 0);
+       LASSERT (req->rq_reqbuf != NULL);
+       LASSERT (rs != NULL);
+       LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult);
+       LASSERT (req->rq_repmsg != NULL);
+       LASSERT (req->rq_repmsg == rs->rs_msg);
+       LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
+       LASSERT (rs->rs_cb_id.cbid_arg == rs);
+
+       /* There may be no rq_export during failover */
+
+       if (unlikely(req->rq_export && req->rq_export->exp_obd &&
+                    req->rq_export->exp_obd->obd_fail)) {
+               /* Failed obd's only send ENODEV */
+               req->rq_type = PTL_RPC_MSG_ERR;
+               req->rq_status = -ENODEV;
+               CDEBUG(D_HA, "sending ENODEV from failed obd %d\n",
+                      req->rq_export->exp_obd->obd_minor);
+       }
+
+       /* In order to keep interoperability with clients (< 2.3) which
+        * don't have pb_jobid in ptlrpc_body, we have to shrink the
+        * ptlrpc_body in the reply buffer to ptlrpc_body_v2; otherwise the
+        * reply buffer on the client will overflow.
+        *
+        * XXX Remove this whenever we drop interoperability with such
+        * clients.
+        */
+       req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0,
+                                          sizeof(struct ptlrpc_body_v2), 1);
+
+       if (req->rq_type != PTL_RPC_MSG_ERR)
+               req->rq_type = PTL_RPC_MSG_REPLY;
+
+       lustre_msg_set_type(req->rq_repmsg, req->rq_type);
+       lustre_msg_set_status(req->rq_repmsg, req->rq_status);
+       lustre_msg_set_opc(req->rq_repmsg,
+               req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0);
+
+       target_pack_pool_reply(req);
+
+       ptlrpc_at_set_reply(req, flags);
+
+       if (req->rq_export == NULL || req->rq_export->exp_connection == NULL)
+               conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL);
+       else
+               conn = ptlrpc_connection_addref(req->rq_export->exp_connection);
+
+       if (unlikely(conn == NULL)) {
+               CERROR("not replying on NULL connection\n"); /* bug 9635 */
+               return -ENOTCONN;
+       }
+       ptlrpc_rs_addref(rs);              /* +1 ref for the network */
+
+       rc = sptlrpc_svc_wrap_reply(req);
+       if (unlikely(rc))
+               goto out;
+
+       req->rq_sent = cfs_time_current_sec();
+
+       rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+                          (rs->rs_difficult && !rs->rs_no_ack) ?
+                          LNET_ACK_REQ : LNET_NOACK_REQ,
+                          &rs->rs_cb_id, conn,
+                          ptlrpc_req2svc(req)->srv_rep_portal,
+                          req->rq_xid, req->rq_reply_off);
+out:
+       if (unlikely(rc != 0))
+               ptlrpc_req_drop_rs(req);
+       ptlrpc_connection_put(conn);
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_send_reply);
+
+int ptlrpc_reply (struct ptlrpc_request *req)
+{
+       if (req->rq_no_reply)
+               return 0;
+       else
+               return (ptlrpc_send_reply(req, 0));
+}
+EXPORT_SYMBOL(ptlrpc_reply);
+
+/**
+ * For request \a req send an error reply back. Create empty
+ * reply buffers if necessary.
+ */
+int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult)
+{
+       int rc;
+       ENTRY;
+
+       if (req->rq_no_reply)
+               RETURN(0);
+
+       if (!req->rq_repmsg) {
+               rc = lustre_pack_reply(req, 1, NULL, NULL);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       if (req->rq_status != -ENOSPC && req->rq_status != -EACCES &&
+           req->rq_status != -EPERM && req->rq_status != -ENOENT &&
+           req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT)
+               req->rq_type = PTL_RPC_MSG_ERR;
+
+       rc = ptlrpc_send_reply(req, may_be_difficult);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_send_error);
+
+int ptlrpc_error(struct ptlrpc_request *req)
+{
+       return ptlrpc_send_error(req, 0);
+}
+EXPORT_SYMBOL(ptlrpc_error);
+
+/**
+ * Send request \a request.
+ * if \a noreply is set, don't expect any reply back and don't set up
+ * reply buffers.
+ * Returns 0 on success or error code.
+ */
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
+{
+       int rc;
+       int rc2;
+       int mpflag = 0;
+       struct ptlrpc_connection *connection;
+       lnet_handle_me_t  reply_me_h;
+       lnet_md_t        reply_md;
+       struct obd_device *obd = request->rq_import->imp_obd;
+       ENTRY;
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
+               RETURN(0);
+
+       LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST);
+       LASSERT(request->rq_wait_ctx == 0);
+
+       /* If this is a re-transmit, we're required to have disengaged
+        * cleanly from the previous attempt */
+       LASSERT(!request->rq_receiving_reply);
+
+       if (request->rq_import->imp_obd &&
+           request->rq_import->imp_obd->obd_fail) {
+               CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
+                      request->rq_import->imp_obd->obd_name);
+               /* this prevents us from waiting in ptlrpc_queue_wait */
+               request->rq_err = 1;
+               request->rq_status = -ENODEV;
+               RETURN(-ENODEV);
+       }
+
+       connection = request->rq_import->imp_connection;
+
+       lustre_msg_set_handle(request->rq_reqmsg,
+                             &request->rq_import->imp_remote_handle);
+       lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST);
+       lustre_msg_set_conn_cnt(request->rq_reqmsg,
+                               request->rq_import->imp_conn_cnt);
+       lustre_msghdr_set_flags(request->rq_reqmsg,
+                               request->rq_import->imp_msghdr_flags);
+
+       if (request->rq_resend)
+               lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
+
+       if (request->rq_memalloc)
+               mpflag = cfs_memory_pressure_get_and_set();
+
+       rc = sptlrpc_cli_wrap_request(request);
+       if (rc)
+               GOTO(out, rc);
+
+       /* bulk register should be done after wrap_request() */
+       if (request->rq_bulk != NULL) {
+               rc = ptlrpc_register_bulk(request);
+               if (rc != 0)
+                       GOTO(out, rc);
+       }
+
+       if (!noreply) {
+               LASSERT(request->rq_replen != 0);
+               if (request->rq_repbuf == NULL) {
+                       LASSERT(request->rq_repdata == NULL);
+                       LASSERT(request->rq_repmsg == NULL);
+                       rc = sptlrpc_cli_alloc_repbuf(request,
+                                                     request->rq_replen);
+                       if (rc) {
+                               /* this prevents us from looping in
+                                * ptlrpc_queue_wait */
+                               request->rq_err = 1;
+                               request->rq_status = rc;
+                               GOTO(cleanup_bulk, rc);
+                       }
+               } else {
+                       request->rq_repdata = NULL;
+                       request->rq_repmsg = NULL;
+               }
+
+               rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+                                 connection->c_peer, request->rq_xid, 0,
+                                 LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+               if (rc != 0) {
+                       CERROR("LNetMEAttach failed: %d\n", rc);
+                       LASSERT(rc == -ENOMEM);
+                       GOTO(cleanup_bulk, rc = -ENOMEM);
+               }
+       }
+
+       spin_lock(&request->rq_lock);
+       /* If the MD attach succeeds, there _will_ be a reply_in callback */
+       request->rq_receiving_reply = !noreply;
+       /* We are responsible for unlinking the reply buffer */
+       request->rq_must_unlink = !noreply;
+       /* Clear any flags that may be present from previous sends. */
+       request->rq_replied = 0;
+       request->rq_err = 0;
+       request->rq_timedout = 0;
+       request->rq_net_err = 0;
+       request->rq_resend = 0;
+       request->rq_restart = 0;
+       request->rq_reply_truncate = 0;
+       spin_unlock(&request->rq_lock);
+
+       if (!noreply) {
+               reply_md.start     = request->rq_repbuf;
+               reply_md.length    = request->rq_repbuf_len;
+               /* Allow multiple early replies */
+               reply_md.threshold = LNET_MD_THRESH_INF;
+               /* Manage remote for early replies */
+               reply_md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT |
+                       LNET_MD_MANAGE_REMOTE |
+                       LNET_MD_TRUNCATE; /* allow truncation to surface as EOVERFLOW */
+               reply_md.user_ptr  = &request->rq_reply_cbid;
+               reply_md.eq_handle = ptlrpc_eq_h;
+
+               /* We must see the unlink callback to unset rq_must_unlink,
+                  so we can't auto-unlink */
+               rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN,
+                                 &request->rq_reply_md_h);
+               if (rc != 0) {
+                       CERROR("LNetMDAttach failed: %d\n", rc);
+                       LASSERT(rc == -ENOMEM);
+                       spin_lock(&request->rq_lock);
+                       /* ...but the MD attach didn't succeed... */
+                       request->rq_receiving_reply = 0;
+                       spin_unlock(&request->rq_lock);
+                       GOTO(cleanup_me, rc = -ENOMEM);
+               }
+
+               CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+                      ", portal %u\n",
+                      request->rq_repbuf_len, request->rq_xid,
+                      request->rq_reply_portal);
+       }
+
+       /* add references on request for request_out_callback */
+       ptlrpc_request_addref(request);
+       if (obd->obd_svc_stats != NULL)
+               lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR,
+                       atomic_read(&request->rq_import->imp_inflight));
+
+       OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
+
+       do_gettimeofday(&request->rq_arrival_time);
+       request->rq_sent = cfs_time_current_sec();
+       /* We give the server rq_timeout secs to process the req, and
+          add the network latency for our local timeout. */
+       request->rq_deadline = request->rq_sent + request->rq_timeout +
+               ptlrpc_at_get_net_latency(request);
+
+       ptlrpc_pinger_sending_on_import(request->rq_import);
+
+       DEBUG_REQ(D_INFO, request, "send flg=%x",
+                 lustre_msg_get_flags(request->rq_reqmsg));
+       rc = ptl_send_buf(&request->rq_req_md_h,
+                         request->rq_reqbuf, request->rq_reqdata_len,
+                         LNET_NOACK_REQ, &request->rq_req_cbid,
+                         connection,
+                         request->rq_request_portal,
+                         request->rq_xid, 0);
+       if (rc == 0)
+               GOTO(out, rc);
+
+       ptlrpc_req_finished(request);
+       if (noreply)
+               GOTO(out, rc);
+
+ cleanup_me:
+       /* MEUnlink is safe; the PUT didn't even get off the ground, and
+        * nobody apart from the PUT's target has the right nid+XID to
+        * access the reply buffer. */
+       rc2 = LNetMEUnlink(reply_me_h);
+       LASSERT(rc2 == 0);
+       /* UNLINKED callback called synchronously */
+       LASSERT(!request->rq_receiving_reply);
+
+ cleanup_bulk:
+       /* We do a sync unlink here as no real transfer was started, so the
+        * chance of a long unlink due to a sluggish network is smaller. */
+       ptlrpc_unregister_bulk(request, 0);
+ out:
+       if (request->rq_memalloc)
+               cfs_memory_pressure_restore(mpflag);
+       return rc;
+}
+EXPORT_SYMBOL(ptl_send_rpc);
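+
+/*
+ * Usage sketch (illustrative, not part of the API contract): a one-way
+ * request that expects no reply is sent with \a noreply set, which skips
+ * the reply ME/MD registration above:
+ *
+ *     rc = ptl_send_rpc(request, 1);
+ *
+ * Ordinary requests pass noreply = 0 and rely on the reply_in callback
+ * firing once the peer PUTs into the registered reply buffer.
+ */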
+
+/**
+ * Register request buffer descriptor for request receiving.
+ */
+int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+       struct ptlrpc_service     *service = rqbd->rqbd_svcpt->scp_service;
+       static lnet_process_id_t  match_id = {LNET_NID_ANY, LNET_PID_ANY};
+       int                       rc;
+       lnet_md_t                md;
+       lnet_handle_me_t          me_h;
+
+       CDEBUG(D_NET, "LNetMEAttach: portal %d\n",
+              service->srv_req_portal);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD))
+               return -ENOMEM;
+
+       /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL,
+        * which means buffer can only be attached on local CPT, and LND
+        * threads can find it by grabbing a local lock */
+       rc = LNetMEAttach(service->srv_req_portal,
+                         match_id, 0, ~0, LNET_UNLINK,
+                         rqbd->rqbd_svcpt->scp_cpt >= 0 ?
+                         LNET_INS_LOCAL : LNET_INS_AFTER, &me_h);
+       if (rc != 0) {
+               CERROR("LNetMEAttach failed: %d\n", rc);
+               return -ENOMEM;
+       }
+
+       LASSERT(rqbd->rqbd_refcount == 0);
+       rqbd->rqbd_refcount = 1;
+
+       md.start     = rqbd->rqbd_buffer;
+       md.length    = service->srv_buf_size;
+       md.max_size  = service->srv_max_req_size;
+       md.threshold = LNET_MD_THRESH_INF;
+       md.options   = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE;
+       md.user_ptr  = &rqbd->rqbd_cbid;
+       md.eq_handle = ptlrpc_eq_h;
+
+       rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h);
+       if (rc == 0)
+               return 0;
+
+       CERROR("LNetMDAttach failed: %d; \n", rc);
+       LASSERT (rc == -ENOMEM);
+       rc = LNetMEUnlink (me_h);
+       LASSERT (rc == 0);
+       rqbd->rqbd_refcount = 0;
+
+       return -ENOMEM;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c
new file mode 100644 (file)
index 0000000..1996431
--- /dev/null
@@ -0,0 +1,1790 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs.c
+ *
+ * Network Request Scheduler (NRS)
+ *
+ * Allows to reorder the handling of RPCs at servers.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/* XXX: This is just for liblustre. Remove the #if defined directive when the
+ * "cfs_" prefix is dropped from cfs_list_head. */
+extern struct list_head ptlrpc_all_services;
+
+/**
+ * NRS core object.
+ */
+struct nrs_core nrs_core;
+
+static int nrs_policy_init(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_desc->pd_ops->op_policy_init != NULL ?
+              policy->pol_desc->pd_ops->op_policy_init(policy) : 0;
+}
+
+static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_ref == 0);
+       LASSERT(policy->pol_req_queued == 0);
+
+       if (policy->pol_desc->pd_ops->op_policy_fini != NULL)
+               policy->pol_desc->pd_ops->op_policy_fini(policy);
+}
+
+static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy,
+                                enum ptlrpc_nrs_ctl opc, void *arg)
+{
+       /**
+        * The policy may be stopped, but the lprocfs files and
+        * ptlrpc_nrs_policy instances remain present until unregistration time.
+        * Do not perform the ctl operation if the policy is stopped, as
+        * policy->pol_private will be NULL in such a case.
+        */
+       if (policy->pol_state == NRS_POL_STATE_STOPPED)
+               RETURN(-ENODEV);
+
+       RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ?
+              policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) :
+              -ENOSYS);
+}
+
+static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs *nrs = policy->pol_nrs;
+       ENTRY;
+
+       if (policy->pol_desc->pd_ops->op_policy_stop != NULL) {
+               spin_unlock(&nrs->nrs_lock);
+
+               policy->pol_desc->pd_ops->op_policy_stop(policy);
+
+               spin_lock(&nrs->nrs_lock);
+       }
+
+       LASSERT(list_empty(&policy->pol_list_queued));
+       LASSERT(policy->pol_req_queued == 0 &&
+               policy->pol_req_started == 0);
+
+       policy->pol_private = NULL;
+
+       policy->pol_state = NRS_POL_STATE_STOPPED;
+
+       if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+               module_put(policy->pol_desc->pd_owner);
+
+       EXIT;
+}
+
+static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs *nrs = policy->pol_nrs;
+       ENTRY;
+
+       if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping)
+               RETURN(-EPERM);
+
+       if (policy->pol_state == NRS_POL_STATE_STARTING)
+               RETURN(-EAGAIN);
+
+       /* In progress or already stopped */
+       if (policy->pol_state != NRS_POL_STATE_STARTED)
+               RETURN(0);
+
+       policy->pol_state = NRS_POL_STATE_STOPPING;
+
+       /* Immediately make it invisible */
+       if (nrs->nrs_policy_primary == policy) {
+               nrs->nrs_policy_primary = NULL;
+
+       } else {
+               LASSERT(nrs->nrs_policy_fallback == policy);
+               nrs->nrs_policy_fallback = NULL;
+       }
+
+       /* I have the only refcount */
+       if (policy->pol_ref == 1)
+               nrs_policy_stop0(policy);
+
+       RETURN(0);
+}
+
+/**
+ * Transitions the \a nrs NRS head's primary policy to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no
+ * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * \param[in] nrs the NRS head to carry out this operation on
+ */
+static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs)
+{
+       struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary;
+       ENTRY;
+
+       if (tmp == NULL) {
+               /**
+                * XXX: This should really be RETURN_EXIT, but the latter does
+                * not currently print anything out, and possibly should be
+                * fixed to do so.
+                */
+               EXIT;
+               return;
+       }
+
+       nrs->nrs_policy_primary = NULL;
+
+       LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED);
+       tmp->pol_state = NRS_POL_STATE_STOPPING;
+
+       if (tmp->pol_ref == 0)
+               nrs_policy_stop0(tmp);
+       EXIT;
+}
+
+/**
+ * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in
+ * response to an lprocfs command to start a policy.
+ *
+ * If a primary policy different to the current one is specified, this function
+ * will transition the new policy to the
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition
+ * the old primary policy (if there is one) to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
+ *
+ * If the fallback policy is specified, this is taken to indicate an instruction
+ * to stop the current primary policy, without substituting it with another
+ * primary policy, so the primary policy (if any) is transitioned to
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding
+ * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In
+ * this case, the fallback policy is only left active in the NRS head.
+ */
+static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy)
+{
+       struct ptlrpc_nrs      *nrs = policy->pol_nrs;
+       int                     rc = 0;
+       ENTRY;
+
+       /**
+        * Don't allow multiple policies to be starting at the same time;
+        * that would be needlessly complex, with no real benefit.
+        */
+       if (nrs->nrs_policy_starting)
+               RETURN(-EAGAIN);
+
+       LASSERT(policy->pol_state != NRS_POL_STATE_STARTING);
+
+       if (policy->pol_state == NRS_POL_STATE_STOPPING)
+               RETURN(-EAGAIN);
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+               /**
+                * This is for cases in which the user sets the policy to the
+                * fallback policy (currently fifo for all services); i.e. the
+                * user is resetting the policy to the default; so we stop the
+                * primary policy, if any.
+                */
+               if (policy == nrs->nrs_policy_fallback) {
+                       nrs_policy_stop_primary(nrs);
+                       RETURN(0);
+               }
+
+               /**
+                * If we reach here, we must be setting up the fallback policy
+                * at service startup time, and only a single policy with the
+                * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can
+                * register with NRS core.
+                */
+               LASSERT(nrs->nrs_policy_fallback == NULL);
+       } else {
+               /**
+                * A primary policy shouldn't be started without a fallback
+                * policy in place.
+                */
+               if (nrs->nrs_policy_fallback == NULL)
+                       RETURN(-EPERM);
+
+               if (policy->pol_state == NRS_POL_STATE_STARTED)
+                       RETURN(0);
+       }
+
+       /**
+        * Increase the module usage count for policies registering from other
+        * modules.
+        */
+       if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 &&
+           !try_module_get(policy->pol_desc->pd_owner)) {
+               atomic_dec(&policy->pol_desc->pd_refs);
+               CERROR("NRS: cannot get module for policy %s; is it alive?\n",
+                      policy->pol_desc->pd_name);
+               RETURN(-ENODEV);
+       }
+
+       /**
+        * Serialize policy starting across the NRS head
+        */
+       nrs->nrs_policy_starting = 1;
+
+       policy->pol_state = NRS_POL_STATE_STARTING;
+
+       if (policy->pol_desc->pd_ops->op_policy_start) {
+               spin_unlock(&nrs->nrs_lock);
+
+               rc = policy->pol_desc->pd_ops->op_policy_start(policy);
+
+               spin_lock(&nrs->nrs_lock);
+               if (rc != 0) {
+                       if (atomic_dec_and_test(&policy->pol_desc->pd_refs))
+                               module_put(policy->pol_desc->pd_owner);
+
+                       policy->pol_state = NRS_POL_STATE_STOPPED;
+                       GOTO(out, rc);
+               }
+       }
+
+       policy->pol_state = NRS_POL_STATE_STARTED;
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) {
+               /**
+                * This path is only used at PTLRPC service setup time.
+                */
+               nrs->nrs_policy_fallback = policy;
+       } else {
+               /*
+                * Try to stop the current primary policy if there is one.
+                */
+               nrs_policy_stop_primary(nrs);
+
+               /**
+                * And set the newly-started policy as the primary one.
+                */
+               nrs->nrs_policy_primary = policy;
+       }
+
+out:
+       nrs->nrs_policy_starting = 0;
+
+       RETURN(rc);
+}
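+
+/*
+ * Summary of the policy state machine driven by nrs_policy_start_locked()
+ * and nrs_policy_stop_locked() above:
+ *
+ *     STOPPED -> STARTING -> STARTED -> STOPPING -> STOPPED
+ *
+ * A failed start falls back from STARTING to STOPPED directly, and
+ * STOPPING only completes (via nrs_policy_stop0()) once the last usage
+ * reference on the policy is dropped.
+ */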
+
+/**
+ * Increases the policy's usage reference count.
+ */
+static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy)
+{
+       policy->pol_ref++;
+}
+
+/**
+ * Decreases the policy's usage reference count, and stops the policy if it
+ * was already stopping and has no more outstanding usage references (which
+ * indicates it has no more queued or started requests, and can be safely
+ * stopped).
+ */
+static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_ref > 0);
+
+       policy->pol_ref--;
+       if (unlikely(policy->pol_ref == 0 &&
+           policy->pol_state == NRS_POL_STATE_STOPPING))
+               nrs_policy_stop0(policy);
+}
+
+static void nrs_policy_put(struct ptlrpc_nrs_policy *policy)
+{
+       spin_lock(&policy->pol_nrs->nrs_lock);
+       nrs_policy_put_locked(policy);
+       spin_unlock(&policy->pol_nrs->nrs_lock);
+}
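+
+/*
+ * Usage sketch (hypothetical caller): references are taken and dropped
+ * under nrs_lock, typically around a lookup:
+ *
+ *     spin_lock(&nrs->nrs_lock);
+ *     policy = nrs_policy_find_locked(nrs, name);  (takes a reference)
+ *     ... use policy ...
+ *     nrs_policy_put_locked(policy);
+ *     spin_unlock(&nrs->nrs_lock);
+ *
+ * nrs_policy_put() is the unlocked convenience wrapper for callers that
+ * do not already hold nrs_lock.
+ */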
+
+/**
+ * Find and return a policy by name.
+ */
+static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs,
+                                                       char *name)
+{
+       struct ptlrpc_nrs_policy *tmp;
+
+       list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) {
+               if (strncmp(tmp->pol_desc->pd_name, name,
+                           NRS_POL_NAME_MAX) == 0) {
+                       nrs_policy_get_locked(tmp);
+                       return tmp;
+               }
+       }
+       return NULL;
+}
+
+/**
+ * Release references for the resource hierarchy moving upwards towards the
+ * policy instance resource.
+ */
+static void nrs_resource_put(struct ptlrpc_nrs_resource *res)
+{
+       struct ptlrpc_nrs_policy *policy = res->res_policy;
+
+       if (policy->pol_desc->pd_ops->op_res_put != NULL) {
+               struct ptlrpc_nrs_resource *parent;
+
+               for (; res != NULL; res = parent) {
+                       parent = res->res_parent;
+                       policy->pol_desc->pd_ops->op_res_put(policy, res);
+               }
+       }
+}
+
+/**
+ * Obtains references for each resource in the resource hierarchy for request
+ * \a nrq if it is to be handled by \a policy.
+ *
+ * \param[in] policy     the policy
+ * \param[in] nrq        the request
+ * \param[in] moving_req  denotes whether this is a call to the function by
+ *                       ldlm_lock_reorder_req(), in order to move \a nrq to
+ *                       the high-priority NRS head; we should not sleep when
+ *                       set.
+ *
+ * \retval NULL                  resource hierarchy references not obtained
+ * \retval valid-pointer  the bottom level of the resource hierarchy
+ *
+ * \see ptlrpc_nrs_pol_ops::op_res_get()
+ */
+static
+struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy,
+                                            struct ptlrpc_nrs_request *nrq,
+                                            bool moving_req)
+{
+       /**
+        * Set to NULL to traverse the resource hierarchy from the top.
+        */
+       struct ptlrpc_nrs_resource *res = NULL;
+       struct ptlrpc_nrs_resource *tmp = NULL;
+       int                         rc;
+
+       while (1) {
+               rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res,
+                                                         &tmp, moving_req);
+               if (rc < 0) {
+                       if (res != NULL)
+                               nrs_resource_put(res);
+                       return NULL;
+               }
+
+               LASSERT(tmp != NULL);
+               tmp->res_parent = res;
+               tmp->res_policy = policy;
+               res = tmp;
+               tmp = NULL;
+               /**
+                * Return once we have obtained a reference to the bottom level
+                * of the resource hierarchy.
+                */
+               if (rc > 0)
+                       return res;
+       }
+}
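+
+/*
+ * For illustration: with a two-level resource hierarchy, op_res_get() is
+ * called twice in the loop above; the first call returns 0 and yields the
+ * top-level resource, the second returns > 0 and yields the bottom level,
+ * which is what the caller receives:
+ *
+ *     res = nrs_resource_get(policy, nrq, false);
+ *     res->res_parent             (top-level resource)
+ *     res->res_parent->res_parent (NULL)
+ */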
+
+/**
+ * Obtains resources for the resource hierarchies and policy references for
+ * the fallback and current primary policy (if any), that will later be used
+ * to handle request \a nrq.
+ *
+ * \param[in]  nrs  the NRS head instance that will be handling request \a nrq.
+ * \param[in]  nrq  the request that is being handled.
+ * \param[out] resp the array where references to the resource hierarchy are
+ *                 stored.
+ * \param[in]  moving_req  is set when obtaining resources while moving a
+ *                        request from a policy on the regular NRS head to a
+ *                        policy on the HP NRS head (via
+ *                        ldlm_lock_reorder_req()). It signifies that
+ *                        allocations to get resources should be atomic; for
+ *                        a full explanation, see comment in
+ *                        ptlrpc_nrs_pol_ops::op_res_get().
+ */
+static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs,
+                                 struct ptlrpc_nrs_request *nrq,
+                                 struct ptlrpc_nrs_resource **resp,
+                                 bool moving_req)
+{
+       struct ptlrpc_nrs_policy   *primary = NULL;
+       struct ptlrpc_nrs_policy   *fallback = NULL;
+
+       memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX);
+
+       /**
+        * Obtain policy references.
+        */
+       spin_lock(&nrs->nrs_lock);
+
+       fallback = nrs->nrs_policy_fallback;
+       nrs_policy_get_locked(fallback);
+
+       primary = nrs->nrs_policy_primary;
+       if (primary != NULL)
+               nrs_policy_get_locked(primary);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       /**
+        * Obtain resource hierarchy references.
+        */
+       resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req);
+       LASSERT(resp[NRS_RES_FALLBACK] != NULL);
+
+       if (primary != NULL) {
+               resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq,
+                                                        moving_req);
+               /**
+                * The primary policy may decline to serve a particular
+                * request for various reasons; in that case, release the
+                * reference on the policy, as it will not be used for this
+                * request.
+                */
+               if (resp[NRS_RES_PRIMARY] == NULL)
+                       nrs_policy_put(primary);
+       }
+}
+
+/**
+ * Releases references to resource hierarchies and policies, because they are no
+ * longer required; used when request handling has been completed, or the
+ * request is moving to the high priority NRS head.
+ *
+ * \param resp the resource hierarchy that is being released
+ *
+ * \see ptlrpcnrs_req_hp_move()
+ * \see ptlrpc_nrs_req_finalize()
+ */
+static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp)
+{
+       struct ptlrpc_nrs_policy *pols[NRS_RES_MAX];
+       struct ptlrpc_nrs        *nrs = NULL;
+       int                       i;
+
+       for (i = 0; i < NRS_RES_MAX; i++) {
+               if (resp[i] != NULL) {
+                       pols[i] = resp[i]->res_policy;
+                       nrs_resource_put(resp[i]);
+                       resp[i] = NULL;
+               } else {
+                       pols[i] = NULL;
+               }
+       }
+
+       for (i = 0; i < NRS_RES_MAX; i++) {
+               if (pols[i] == NULL)
+                       continue;
+
+               if (nrs == NULL) {
+                       nrs = pols[i]->pol_nrs;
+                       spin_lock(&nrs->nrs_lock);
+               }
+               nrs_policy_put_locked(pols[i]);
+       }
+
+       if (nrs != NULL)
+               spin_unlock(&nrs->nrs_lock);
+}
+
+/**
+ * Obtains an NRS request from \a policy for handling or examination; the
+ * request should be removed in the 'handling' case.
+ *
+ * Calling into this function implies we already know the policy has a request
+ * waiting to be handled.
+ *
+ * \param[in] policy the policy from which a request is to be obtained
+ * \param[in] peek   when set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  when set, it will force a policy to return a request if it
+ *                  has one pending
+ *
+ * \retval the NRS request to be handled
+ */
+static inline
+struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy,
+                                          bool peek, bool force)
+{
+       struct ptlrpc_nrs_request *nrq;
+
+       LASSERT(policy->pol_req_queued > 0);
+
+       nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force);
+
+       LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy));
+
+       return nrq;
+}
+
+/**
+ * Enqueues request \a nrq for later handling, via one of the policies for
+ * which resources were earlier obtained via nrs_resource_get_safe(). The
+ * function attempts to enqueue the request first on the primary policy
+ * (if any), since this is the preferred choice.
+ *
+ * \param nrq the request being enqueued
+ *
+ * \see nrs_resource_get_safe()
+ */
+static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_nrs_policy *policy;
+       int                       rc;
+       int                       i;
+
+       /**
+        * Try in descending order, because the primary policy (if any) is
+        * the preferred choice.
+        */
+       for (i = NRS_RES_MAX - 1; i >= 0; i--) {
+               if (nrq->nr_res_ptrs[i] == NULL)
+                       continue;
+
+               nrq->nr_res_idx = i;
+               policy = nrq->nr_res_ptrs[i]->res_policy;
+
+               rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq);
+               if (rc == 0) {
+                       policy->pol_nrs->nrs_req_queued++;
+                       policy->pol_req_queued++;
+                       return;
+               }
+       }
+       /**
+        * Should never get here, as at least the primary policy's
+        * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always
+        * succeed.
+        */
+       LBUG();
+}
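+
+/*
+ * Note on ordering: because the loop above walks nr_res_ptrs[] in
+ * descending order and the primary resource (NRS_RES_PRIMARY) has the
+ * higher index, a request is always offered to the primary policy before
+ * the fallback one; nr_res_idx records which policy accepted it, so that
+ * nrs_request_policy() can later resolve the owning policy.
+ */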
+
+/**
+ * Called when a request has been handled
+ *
+ * \param[in] nrq the request that has been handled; can be used for
+ *               job/resource control.
+ *
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq);
+
+       if (policy->pol_desc->pd_ops->op_req_stop)
+               policy->pol_desc->pd_ops->op_req_stop(policy, nrq);
+
+       LASSERT(policy->pol_nrs->nrs_req_started > 0);
+       LASSERT(policy->pol_req_started > 0);
+
+       policy->pol_nrs->nrs_req_started--;
+       policy->pol_req_started--;
+}
+
+/**
+ * Handler for operations that can be carried out on policies.
+ *
+ * Handles opcodes that are common to all policy types within NRS core, and
+ * passes any unknown opcodes to the policy-specific control function.
+ *
+ * \param[in]    nrs  the NRS head this policy belongs to.
+ * \param[in]    name the human-readable policy name; should be the same as
+ *                    ptlrpc_nrs_pol_desc::pd_name.
+ * \param[in]    opc  the opcode of the operation being carried out.
+ * \param[in,out] arg  can be used to pass information in and out between when
+ *                    carrying an operation; usually data that is private to
+ *                    the policy at some level, or generic policy status
+ *                    information.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name,
+                         enum ptlrpc_nrs_ctl opc, void *arg)
+{
+       struct ptlrpc_nrs_policy       *policy;
+       int                             rc = 0;
+       ENTRY;
+
+       spin_lock(&nrs->nrs_lock);
+
+       policy = nrs_policy_find_locked(nrs, name);
+       if (policy == NULL)
+               GOTO(out, rc = -ENOENT);
+
+       switch (opc) {
+               /**
+                * Unknown opcode, pass it down to the policy-specific control
+                * function for handling.
+                */
+       default:
+               rc = nrs_policy_ctl_locked(policy, opc, arg);
+               break;
+
+               /**
+                * Start \e policy
+                */
+       case PTLRPC_NRS_CTL_START:
+               rc = nrs_policy_start_locked(policy);
+               break;
+       }
+out:
+       if (policy != NULL)
+               nrs_policy_put_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       RETURN(rc);
+}
+
+/**
+ * Unregisters a policy by name.
+ *
+ * \param[in] nrs  the NRS head this policy belongs to.
+ * \param[in] name the human-readable policy name; should be the same as
+ *                ptlrpc_nrs_pol_desc::pd_name
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name)
+{
+       struct ptlrpc_nrs_policy *policy = NULL;
+       ENTRY;
+
+       spin_lock(&nrs->nrs_lock);
+
+       policy = nrs_policy_find_locked(nrs, name);
+       if (policy == NULL) {
+               spin_unlock(&nrs->nrs_lock);
+
+               CERROR("Can't find NRS policy %s\n", name);
+               RETURN(-ENOENT);
+       }
+
+       if (policy->pol_ref > 1) {
+               CERROR("Policy %s is busy with %d references\n", name,
+                      (int)policy->pol_ref);
+               nrs_policy_put_locked(policy);
+
+               spin_unlock(&nrs->nrs_lock);
+               RETURN(-EBUSY);
+       }
+
+       LASSERT(policy->pol_req_queued == 0);
+       LASSERT(policy->pol_req_started == 0);
+
+       if (policy->pol_state != NRS_POL_STATE_STOPPED) {
+               nrs_policy_stop_locked(policy);
+               LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED);
+       }
+
+       list_del(&policy->pol_list);
+       nrs->nrs_num_pols--;
+
+       nrs_policy_put_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       nrs_policy_fini(policy);
+
+       LASSERT(policy->pol_private == NULL);
+       OBD_FREE_PTR(policy);
+
+       RETURN(0);
+}
+
+/**
+ * Registers a policy from the policy descriptor \a desc with NRS head \a nrs.
+ *
+ * \param[in] nrs   the NRS head on which the policy will be registered.
+ * \param[in] desc  the policy descriptor from which the information will be
+ *                 obtained to register the policy.
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+static int nrs_policy_register(struct ptlrpc_nrs *nrs,
+                              struct ptlrpc_nrs_pol_desc *desc)
+{
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_policy       *tmp;
+       struct ptlrpc_service_part     *svcpt = nrs->nrs_svcpt;
+       int                             rc;
+       ENTRY;
+
+       LASSERT(svcpt != NULL);
+       LASSERT(desc->pd_ops != NULL);
+       LASSERT(desc->pd_ops->op_res_get != NULL);
+       LASSERT(desc->pd_ops->op_req_get != NULL);
+       LASSERT(desc->pd_ops->op_req_enqueue != NULL);
+       LASSERT(desc->pd_ops->op_req_dequeue != NULL);
+       LASSERT(desc->pd_compat != NULL);
+
+       OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable,
+                         svcpt->scp_cpt, sizeof(*policy), __GFP_IO);
+       if (policy == NULL)
+               RETURN(-ENOMEM);
+
+       policy->pol_nrs     = nrs;
+       policy->pol_desc    = desc;
+       policy->pol_state   = NRS_POL_STATE_STOPPED;
+       policy->pol_flags   = desc->pd_flags;
+
+       INIT_LIST_HEAD(&policy->pol_list);
+       INIT_LIST_HEAD(&policy->pol_list_queued);
+
+       rc = nrs_policy_init(policy);
+       if (rc != 0) {
+               OBD_FREE_PTR(policy);
+               RETURN(rc);
+       }
+
+       spin_lock(&nrs->nrs_lock);
+
+       tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name);
+       if (tmp != NULL) {
+               CERROR("NRS policy %s has been registered, can't register it "
+                      "for %s\n", policy->pol_desc->pd_name,
+                      svcpt->scp_service->srv_name);
+               nrs_policy_put_locked(tmp);
+
+               spin_unlock(&nrs->nrs_lock);
+               nrs_policy_fini(policy);
+               OBD_FREE_PTR(policy);
+
+               RETURN(-EEXIST);
+       }
+
+       list_add_tail(&policy->pol_list, &nrs->nrs_policy_list);
+       nrs->nrs_num_pols++;
+
+       if (policy->pol_flags & PTLRPC_NRS_FL_REG_START)
+               rc = nrs_policy_start_locked(policy);
+
+       spin_unlock(&nrs->nrs_lock);
+
+       if (rc != 0)
+               (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+
+       RETURN(rc);
+}
+
+/**
+ * Enqueue request \a req using one of the policies its resources are referring
+ * to.
+ *
+ * \param[in] req the request to enqueue.
+ */
+static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_policy       *policy;
+
+       LASSERT(req->rq_nrq.nr_initialized);
+       LASSERT(!req->rq_nrq.nr_enqueued);
+
+       nrs_request_enqueue(&req->rq_nrq);
+       req->rq_nrq.nr_enqueued = 1;
+
+       policy = nrs_request_policy(&req->rq_nrq);
+       /**
+        * Add the policy to the NRS head's list of policies with enqueued
+        * requests, if it has not been added there.
+        */
+       if (unlikely(list_empty(&policy->pol_list_queued)))
+               list_add_tail(&policy->pol_list_queued,
+                                 &policy->pol_nrs->nrs_policy_queued);
+}
+
+/**
+ * Enqueue a request on the high priority NRS head.
+ *
+ * \param req the request to enqueue.
+ */
+static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req)
+{
+       int     opc = lustre_msg_get_opc(req->rq_reqmsg);
+       ENTRY;
+
+       spin_lock(&req->rq_lock);
+       req->rq_hp = 1;
+       ptlrpc_nrs_req_add_nolock(req);
+       if (opc != OBD_PING)
+               DEBUG_REQ(D_NET, req, "high priority req");
+       spin_unlock(&req->rq_lock);
+       EXIT;
+}
+
+/**
+ * Returns a boolean predicate indicating whether the policy described by
+ * \a desc is adequate for use with service \a svc.
+ *
+ * \param[in] svc  the service
+ * \param[in] desc the policy descriptor
+ *
+ * \retval false the policy is not compatible with the service
+ * \retval true         the policy is compatible with the service
+ */
+static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc,
+                                        const struct ptlrpc_nrs_pol_desc *desc)
+{
+       return desc->pd_compat(svc, desc);
+}
+
+/**
+ * Registers all compatible policies in nrs_core.nrs_policies, for NRS head
+ * \a nrs.
+ *
+ * \param[in] nrs the NRS head
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ *
+ * \see ptlrpc_service_nrs_setup()
+ */
+static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs)
+{
+       struct ptlrpc_nrs_pol_desc *desc;
+       /* for convenience */
+       struct ptlrpc_service_part       *svcpt = nrs->nrs_svcpt;
+       struct ptlrpc_service            *svc = svcpt->scp_service;
+       int                               rc = -EINVAL;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (nrs_policy_compatible(svc, desc)) {
+                       rc = nrs_policy_register(nrs, desc);
+                       if (rc != 0) {
+                               CERROR("Failed to register NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svc->srv_name, rc);
+                               /**
+                                * Fail registration if any of the policies'
+                                * registration fails.
+                                */
+                               break;
+                       }
+               }
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Initializes NRS head \a nrs of service partition \a svcpt, and registers all
+ * compatible policies in NRS core, with the NRS head.
+ *
+ * \param[in] nrs   the NRS head
+ * \param[in] svcpt the PTLRPC service partition to setup
+ *
+ * \retval -ve error
+ * \retval   0 success
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs,
+                                  struct ptlrpc_service_part *svcpt)
+{
+       int                             rc;
+       enum ptlrpc_nrs_queue_type      queue;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+       if (nrs == &svcpt->scp_nrs_reg)
+               queue = PTLRPC_NRS_QUEUE_REG;
+       else if (nrs == svcpt->scp_nrs_hp)
+               queue = PTLRPC_NRS_QUEUE_HP;
+       else
+               LBUG();
+
+       nrs->nrs_svcpt = svcpt;
+       nrs->nrs_queue_type = queue;
+       spin_lock_init(&nrs->nrs_lock);
+       INIT_LIST_HEAD(&nrs->nrs_policy_list);
+       INIT_LIST_HEAD(&nrs->nrs_policy_queued);
+
+       rc = nrs_register_policies_locked(nrs);
+
+       RETURN(rc);
+}
+
+/**
+ * Allocates a regular and optionally a high-priority NRS head (if the service
+ * handles high-priority RPCs), and then registers all available compatible
+ * policies on those NRS heads.
+ *
+ * \param[in,out] svcpt the PTLRPC service partition to setup
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_nrs              *nrs;
+       int                             rc;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+       /**
+        * Initialize the regular NRS head.
+        */
+       nrs = nrs_svcpt2nrs(svcpt, false);
+       rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /**
+        * Optionally allocate a high-priority NRS head.
+        */
+       if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL)
+               GOTO(out, rc);
+
+       OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp,
+                         svcpt->scp_service->srv_cptable,
+                         svcpt->scp_cpt);
+       if (svcpt->scp_nrs_hp == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       nrs = nrs_svcpt2nrs(svcpt, true);
+       rc = nrs_svcpt_setup_locked0(nrs, svcpt);
+
+out:
+       RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all available NRS heads in a service partition;
+ * called at PTLRPC service unregistration time.
+ *
+ * \param[in] svcpt the PTLRPC service partition
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ */
+static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_nrs_policy       *policy;
+       struct ptlrpc_nrs_policy       *tmp;
+       int                             rc;
+       bool                            hp = false;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+
+again:
+       nrs = nrs_svcpt2nrs(svcpt, hp);
+       nrs->nrs_stopping = 1;
+
+       list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list,
+                                    pol_list) {
+               rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name);
+               LASSERT(rc == 0);
+       }
+
+       /**
+        * If the service partition has an HP NRS head, clean that up as well.
+        */
+       if (!hp && nrs_svcpt_has_hp(svcpt)) {
+               hp = true;
+               goto again;
+       }
+
+       if (hp)
+               OBD_FREE_PTR(nrs);
+
+       EXIT;
+}
+
+/**
+ * Returns the descriptor for a policy as identified by \a name.
+ *
+ * \param[in] name the policy name
+ *
+ * \retval the policy descriptor
+ * \retval NULL if no policy with the given name has been registered
+ */
+static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name)
+{
+       struct ptlrpc_nrs_pol_desc     *tmp;
+       ENTRY;
+
+       list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) {
+               if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0)
+                       RETURN(tmp);
+       }
+       RETURN(NULL);
+}
+
+/**
+ * Removes the policy from all supported NRS heads of all partitions of all
+ * PTLRPC services.
+ *
+ * \param[in] desc the policy descriptor to unregister
+ *
+ * \retval -ve error
+ * \retval  0  successfully unregistered policy on all supported NRS heads
+ *
+ * \pre mutex_is_locked(&nrs_core.nrs_mutex)
+ * \pre mutex_is_locked(&ptlrpc_all_services_mutex)
+ */
+static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc)
+{
+       struct ptlrpc_nrs              *nrs;
+       struct ptlrpc_service          *svc;
+       struct ptlrpc_service_part     *svcpt;
+       int                             i;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(mutex_is_locked(&nrs_core.nrs_mutex));
+       LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex));
+
+       list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+
+               if (!nrs_policy_compatible(svc, desc) ||
+                   unlikely(svc->srv_is_stopping))
+                       continue;
+
+               ptlrpc_service_for_each_part(svcpt, i, svc) {
+                       bool hp = false;
+
+again:
+                       nrs = nrs_svcpt2nrs(svcpt, hp);
+                       rc = nrs_policy_unregister(nrs, desc->pd_name);
+                       /**
+                        * Ignore -ENOENT as the policy may not have registered
+                        * successfully on all service partitions.
+                        */
+                       if (rc == -ENOENT) {
+                               rc = 0;
+                       } else if (rc != 0) {
+                               CERROR("Failed to unregister NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svcpt->scp_service->srv_name, rc);
+                               RETURN(rc);
+                       }
+
+                       if (!hp && nrs_svc_has_hp(svc)) {
+                               hp = true;
+                               goto again;
+                       }
+               }
+
+               if (desc->pd_ops->op_lprocfs_fini != NULL)
+                       desc->pd_ops->op_lprocfs_fini(svc);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Registers a new policy with NRS core.
+ *
+ * The function will only succeed if policy registration with all compatible
+ * service partitions (if any) is successful.
+ *
+ * N.B. This function should be called either at ptlrpc module initialization
+ *     time when registering a policy that ships with NRS core, or in a
+ *     module's init() function for policies registering from other modules.
+ *
+ * \param[in] conf configuration information for the new policy to register
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf)
+{
+       struct ptlrpc_service          *svc;
+       struct ptlrpc_nrs_pol_desc     *desc;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(conf != NULL);
+       LASSERT(conf->nc_ops != NULL);
+       LASSERT(conf->nc_compat != NULL);
+       LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one,
+               conf->nc_compat_svc_name != NULL));
+       LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0,
+                    conf->nc_owner != NULL));
+
+       conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+       /**
+        * External policies are not allowed to start immediately upon
+        * registration, as there is a relatively higher chance that their
+        * registration might fail. In such a case, some policy instances may
+        * already have requests queued when unregistration needs to happen as
+        * part of cleanup; since there is currently no way to drain requests
+        * from a policy unless the service is unregistering, we just disallow
+        * this.
+        */
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) &&
+           (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK |
+                              PTLRPC_NRS_FL_REG_START))) {
+               CERROR("NRS: failing to register policy %s. Please check "
+                      "policy flags; external policies cannot act as fallback "
+                      "policies, or be started immediately upon registration "
+                      "without interaction with lprocfs\n", conf->nc_name);
+               RETURN(-EINVAL);
+       }
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) {
+               CERROR("NRS: failing to register policy %s which has already "
+                      "been registered with NRS core!\n",
+                      conf->nc_name);
+               GOTO(fail, rc = -EEXIST);
+       }
+
+       OBD_ALLOC_PTR(desc);
+       if (desc == NULL)
+               GOTO(fail, rc = -ENOMEM);
+
+       strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX);
+       desc->pd_ops             = conf->nc_ops;
+       desc->pd_compat          = conf->nc_compat;
+       desc->pd_compat_svc_name = conf->nc_compat_svc_name;
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0)
+               desc->pd_owner   = conf->nc_owner;
+       desc->pd_flags           = conf->nc_flags;
+       atomic_set(&desc->pd_refs, 0);
+
+       /**
+        * For policies that are held in the same module as NRS (currently
+        * ptlrpc), do not register the policy with all compatible services,
+        * as the services will not have started at this point, since we are
+        * calling from ptlrpc module initialization code. In such cases each
+        * service will register all compatible policies later, via
+        * ptlrpc_service_nrs_setup().
+        */
+       if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0)
+               goto internal;
+
+       /**
+        * Register the new policy on all compatible services
+        */
+       mutex_lock(&ptlrpc_all_services_mutex);
+
+       list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
+               struct ptlrpc_service_part     *svcpt;
+               int                             i;
+               int                             rc2;
+
+               if (!nrs_policy_compatible(svc, desc) ||
+                   unlikely(svc->srv_is_stopping))
+                       continue;
+
+               ptlrpc_service_for_each_part(svcpt, i, svc) {
+                       struct ptlrpc_nrs      *nrs;
+                       bool                    hp = false;
+again:
+                       nrs = nrs_svcpt2nrs(svcpt, hp);
+                       rc = nrs_policy_register(nrs, desc);
+                       if (rc != 0) {
+                               CERROR("Failed to register NRS policy %s for "
+                                      "partition %d of service %s: %d\n",
+                                      desc->pd_name, svcpt->scp_cpt,
+                                      svcpt->scp_service->srv_name, rc);
+
+                               rc2 = nrs_policy_unregister_locked(desc);
+                               /**
+                                * Should not fail at this point
+                                */
+                               LASSERT(rc2 == 0);
+                               mutex_unlock(&ptlrpc_all_services_mutex);
+                               OBD_FREE_PTR(desc);
+                               GOTO(fail, rc);
+                       }
+
+                       if (!hp && nrs_svc_has_hp(svc)) {
+                               hp = true;
+                               goto again;
+                       }
+               }
+
+               /**
+                * No need to take a reference to other modules here, as we
+                * will be calling from the module's init() function.
+                */
+               if (desc->pd_ops->op_lprocfs_init != NULL) {
+                       rc = desc->pd_ops->op_lprocfs_init(svc);
+                       if (rc != 0) {
+                               rc2 = nrs_policy_unregister_locked(desc);
+                               /**
+                                * Should not fail at this point
+                                */
+                               LASSERT(rc2 == 0);
+                               mutex_unlock(&ptlrpc_all_services_mutex);
+                               OBD_FREE_PTR(desc);
+                               GOTO(fail, rc);
+                       }
+               }
+       }
+
+       mutex_unlock(&ptlrpc_all_services_mutex);
+internal:
+       list_add_tail(&desc->pd_list, &nrs_core.nrs_policies);
+fail:
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_register);
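+
+/*
+ * Registration sketch for an external policy module (hypothetical names:
+ * "mypolicy", my_pol_ops, "ost_io"; only the conf fields consumed by this
+ * function are shown):
+ *
+ *     static struct ptlrpc_nrs_pol_conf my_pol_conf = {
+ *             .nc_name            = "mypolicy",
+ *             .nc_ops             = &my_pol_ops,
+ *             .nc_compat          = nrs_policy_compat_one,
+ *             .nc_compat_svc_name = "ost_io",
+ *             .nc_owner           = THIS_MODULE,
+ *             .nc_flags           = PTLRPC_NRS_FL_REG_EXTERN,
+ *     };
+ *
+ * The module's init() would call ptlrpc_nrs_policy_register(&my_pol_conf),
+ * and its exit() ptlrpc_nrs_policy_unregister(&my_pol_conf).
+ */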
+
+/**
+ * Unregisters a previously registered policy with NRS core. All instances of
+ * the policy on all NRS heads of all supported services are removed.
+ *
+ * N.B. This function should only be called from a module's exit() function.
+ *     Although it can be used for policies that ship alongside NRS core, the
+ *     function is primarily intended for policies that register externally,
+ *     from other modules.
+ *
+ * \param[in] conf configuration information for the policy to unregister
+ *
+ * \retval -ve error
+ * \retval   0 success
+ */
+int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf)
+{
+       struct ptlrpc_nrs_pol_desc      *desc;
+       int                              rc;
+       ENTRY;
+
+       LASSERT(conf != NULL);
+
+       if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) {
+               CERROR("Unable to unregister a fallback policy, unless the "
+                      "PTLRPC service is stopping.\n");
+               RETURN(-EPERM);
+       }
+
+       conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0';
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       desc = nrs_policy_find_desc_locked(conf->nc_name);
+       if (desc == NULL) {
+               CERROR("Failing to unregister NRS policy %s which has "
+                      "not been registered with NRS core!\n",
+                      conf->nc_name);
+               GOTO(not_exist, rc = -ENOENT);
+       }
+
+       mutex_lock(&ptlrpc_all_services_mutex);
+
+       rc = nrs_policy_unregister_locked(desc);
+       if (rc < 0) {
+               if (rc == -EBUSY)
+                       CERROR("Please first stop policy %s on all service "
+                              "partitions and then retry to unregister the "
+                              "policy.\n", conf->nc_name);
+               GOTO(fail, rc);
+       }
+
+       CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n",
+              conf->nc_name);
+
+       list_del(&desc->pd_list);
+       OBD_FREE_PTR(desc);
+
+fail:
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+not_exist:
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister);
+
+/**
+ * Setup NRS heads on all service partitions of service \a svc, and register
+ * all compatible policies on those NRS heads.
+ *
+ * To be called from within ptlrpc_register_service().
+ * \param[in] svc the service to setup
+ *
+ * \retval -ve error, the calling logic should eventually call
+ *                   ptlrpc_service_nrs_cleanup() to undo any work performed
+ *                   by this function.
+ *
+ * \see ptlrpc_register_service()
+ * \see ptlrpc_service_nrs_cleanup()
+ */
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part             *svcpt;
+       const struct ptlrpc_nrs_pol_desc       *desc;
+       int                                     i;
+       int                                     rc = 0;
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Initialize NRS heads on all service CPTs.
+        */
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               rc = nrs_svcpt_setup_locked(svcpt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       /**
+        * Set up lprocfs interfaces for all supported policies for the
+        * service.
+        */
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (!nrs_policy_compatible(svc, desc))
+                       continue;
+
+               if (desc->pd_ops->op_lprocfs_init != NULL) {
+                       rc = desc->pd_ops->op_lprocfs_init(svc);
+                       if (rc != 0)
+                               GOTO(failed, rc);
+               }
+       }
+
+failed:
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+
+       RETURN(rc);
+}
+
+/**
+ * Unregisters all policies on all service partitions of service \a svc.
+ *
+ * \param[in] svc the PTLRPC service to unregister
+ */
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part           *svcpt;
+       const struct ptlrpc_nrs_pol_desc     *desc;
+       int                                   i;
+
+       mutex_lock(&nrs_core.nrs_mutex);
+
+       /**
+        * Clean up NRS heads on all service partitions
+        */
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               nrs_svcpt_cleanup_locked(svcpt);
+
+       /**
+        * Clean up lprocfs interfaces for all supported policies for the
+        * service.
+        */
+       list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) {
+               if (!nrs_policy_compatible(svc, desc))
+                       continue;
+
+               if (desc->pd_ops->op_lprocfs_fini != NULL)
+                       desc->pd_ops->op_lprocfs_fini(svc);
+       }
+
+       mutex_unlock(&nrs_core.nrs_mutex);
+}
+
+/**
+ * Obtains NRS head resources for request \a req.
+ *
+ * These could be either on the regular or HP NRS head of \a svcpt; resources
+ * taken on the regular head can later be swapped for HP head resources by
+ * ldlm_lock_reorder_req().
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request
+ * \param[in] hp    which NRS head of \a svcpt to use
+ */
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+                              struct ptlrpc_request *req, bool hp)
+{
+       struct ptlrpc_nrs       *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+       memset(&req->rq_nrq, 0, sizeof(req->rq_nrq));
+       nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs,
+                             false);
+
+       /**
+        * It is fine to access \e nr_initialized without locking as there is
+        * no contention at this early stage.
+        */
+       req->rq_nrq.nr_initialized = 1;
+}
+
+/**
+ * Releases the resources of request \a req; called after the request has
+ * been handled.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req)
+{
+       if (req->rq_nrq.nr_initialized) {
+               nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs);
+               /* no protection on bit nr_initialized because no
+                * contention at this late stage */
+               req->rq_nrq.nr_finalized = 1;
+       }
+}
+
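+/**
+ * Notifies the policy that was handling request \a req that the request is
+ * stopping; a no-op if the request had not started being handled.
+ *
+ * \param[in] req the request
+ *
+ * \see ptlrpc_server_finish_request()
+ */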
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req)
+{
+       if (req->rq_nrq.nr_started)
+               nrs_request_stop(&req->rq_nrq);
+}
+
+/**
+ * Enqueues request \a req on either the regular or high-priority NRS head
+ * of service partition \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] req   the request to be enqueued
+ * \param[in] hp    whether to enqueue the request on the regular or
+ *                 high-priority NRS head.
+ */
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+                       struct ptlrpc_request *req, bool hp)
+{
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (hp)
+               ptlrpc_nrs_hpreq_add_nolock(req);
+       else
+               ptlrpc_nrs_req_add_nolock(req);
+
+       spin_unlock(&svcpt->scp_req_lock);
+}
+
+static void nrs_request_removed(struct ptlrpc_nrs_policy *policy)
+{
+       LASSERT(policy->pol_nrs->nrs_req_queued > 0);
+       LASSERT(policy->pol_req_queued > 0);
+
+       policy->pol_nrs->nrs_req_queued--;
+       policy->pol_req_queued--;
+
+       /**
+        * If the policy has no more requests queued, remove it from
+        * ptlrpc_nrs::nrs_policy_queued.
+        */
+       if (unlikely(policy->pol_req_queued == 0)) {
+               list_del_init(&policy->pol_list_queued);
+
+               /**
+                * If there are other policies with queued requests, move the
+                * current policy to the end so that we can round robin over
+                * all policies and drain the requests.
+                */
+       } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) {
+               LASSERT(policy->pol_req_queued <
+                       policy->pol_nrs->nrs_req_queued);
+
+               list_move_tail(&policy->pol_list_queued,
+                                  &policy->pol_nrs->nrs_policy_queued);
+       }
+}
+
+/**
+ * Obtains a request for handling from an NRS head of service partition
+ * \a svcpt.
+ *
+ * \param[in] svcpt the service partition
+ * \param[in] hp    whether to obtain a request from the regular or
+ *                 high-priority NRS head.
+ * \param[in] peek  when set, signifies that we just want to examine the
+ *                 request, and not handle it, so the request is not removed
+ *                 from the policy.
+ * \param[in] force when set, it will force a policy to return a request if it
+ *                 has one pending
+ *
+ * \retval the request to be handled
+ * \retval NULL the head has no requests to serve
+ */
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+                          bool peek, bool force)
+{
+       struct ptlrpc_nrs         *nrs = nrs_svcpt2nrs(svcpt, hp);
+       struct ptlrpc_nrs_policy  *policy;
+       struct ptlrpc_nrs_request *nrq;
+
+       /**
+        * Always try to drain requests from all NRS policies even if they are
+        * inactive, because the user can change policy status at runtime.
+        */
+       list_for_each_entry(policy, &nrs->nrs_policy_queued,
+                               pol_list_queued) {
+               nrq = nrs_request_get(policy, peek, force);
+               if (nrq != NULL) {
+                       if (likely(!peek)) {
+                               nrq->nr_started = 1;
+
+                               policy->pol_req_started++;
+                               policy->pol_nrs->nrs_req_started++;
+
+                               nrs_request_removed(policy);
+                       }
+
+                       return container_of(nrq, struct ptlrpc_request, rq_nrq);
+               }
+       }
+
+       return NULL;
+}
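+
+/*
+ * Illustrative call sequence (a sketch; the locking follows the
+ * ptlrpc_nrs_req_pending_nolock() documentation further below): a service
+ * thread takes a queued request for handling under scp_req_lock:
+ *
+ *        spin_lock(&svcpt->scp_req_lock);
+ *        if (ptlrpc_nrs_req_pending_nolock(svcpt, hp))
+ *                req = ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, false);
+ *        spin_unlock(&svcpt->scp_req_lock);
+ */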
+
+/**
+ * Dequeues request \a req from the policy it has been enqueued on.
+ *
+ * \param[in] req the request
+ */
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req)
+{
+       struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq);
+
+       policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq);
+
+       req->rq_nrq.nr_enqueued = 0;
+
+       nrs_request_removed(policy);
+}
+
+/**
+ * Returns whether there are any requests currently enqueued on any of the
+ * policies of the NRS head of service partition \a svcpt specified by \a hp.
+ * Should be called while holding ptlrpc_service_part::scp_req_lock to get a
+ * reliable result.
+ *
+ * \param[in] svcpt the service partition to enquire.
+ * \param[in] hp    whether the regular or high-priority NRS head is to be
+ *                 enquired.
+ *
+ * \retval false the indicated NRS head has no enqueued requests.
+ * \retval true         the indicated NRS head has some enqueued requests.
+ */
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp);
+
+       return nrs->nrs_req_queued > 0;
+}
+
+/**
+ * Moves request \a req from the regular to the high-priority NRS head.
+ *
+ * \param[in] req the request to move
+ */
+void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part      *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_nrs_request       *nrq = &req->rq_nrq;
+       struct ptlrpc_nrs_resource      *res1[NRS_RES_MAX];
+       struct ptlrpc_nrs_resource      *res2[NRS_RES_MAX];
+       ENTRY;
+
+       /**
+        * Obtain the high-priority NRS head resources.
+        */
+       nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true);
+
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (!ptlrpc_nrs_req_can_move(req))
+               goto out;
+
+       ptlrpc_nrs_req_del_nolock(req);
+
+       memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0]));
+       memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0]));
+
+       ptlrpc_nrs_hpreq_add_nolock(req);
+
+       memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0]));
+out:
+       spin_unlock(&svcpt->scp_req_lock);
+
+       /**
+        * Release either the regular NRS head resources if we moved the
+        * request, or the high-priority NRS head resources if we took a
+        * reference earlier in this function and ptlrpc_nrs_req_can_move()
+        * returned false.
+        */
+       nrs_resource_put_safe(res1);
+       EXIT;
+}
+
+/**
+ * Carries out a control operation \a opc on the policy identified by the
+ * human-readable \a name, on either all partitions, or only on the first
+ * partition of service \a svc.
+ *
+ * \param[in]    svc    the service the policy belongs to.
+ * \param[in]    queue  whether to carry out the command on the policy which
+ *                      belongs to the regular, high-priority, or both NRS
+ *                      heads of service partitions of \a svc.
+ * \param[in]    name   the policy to act upon, by human-readable name
+ * \param[in]    opc    the opcode of the operation to carry out
+ * \param[in]    single when set, the operation will only be carried out on the
+ *                      NRS heads of the first service partition of \a svc.
+ *                      This is useful for some policies which e.g. share
+ *                      identical values on the same parameters of different
+ *                      service partitions; when reading these parameters via
+ *                      lprocfs, these policies may just want to obtain and
+ *                      print out the values from the first service partition.
+ *                      Storing these values centrally elsewhere would be an
+ *                      alternative solution.
+ * \param[in,out] arg   can be used as a generic in/out buffer between control
+ *                      operations and the user environment.
+ *
+ * \retval -ve error condition
+ * \retval   0 operation was carried out successfully
+ */
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+                             enum ptlrpc_nrs_queue_type queue, char *name,
+                             enum ptlrpc_nrs_ctl opc, bool single, void *arg)
+{
+       struct ptlrpc_service_part     *svcpt;
+       int                             i;
+       int                             rc = 0;
+       ENTRY;
+
+       LASSERT(opc != PTLRPC_NRS_CTL_INVALID);
+
+       if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0)
+               return -EINVAL;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) {
+                       rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name,
+                                           opc, arg);
+                       if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG &&
+                                       single))
+                               GOTO(out, rc);
+               }
+
+               if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) {
+                       /**
+                        * XXX: We could optionally check for
+                        * nrs_svc_has_hp(svc) here, and return an error if it
+                        * is false. Right now we rely on the policies' lprocfs
+                        * handlers that call the present function to make this
+                        * check; if they fail to do so, they might hit the
+                        * assertion inside nrs_svcpt2nrs() below.
+                        */
+                       rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name,
+                                           opc, arg);
+                       if (rc != 0 || single)
+                               GOTO(out, rc);
+               }
+       }
+out:
+       RETURN(rc);
+}
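+
+/*
+ * Example invocation (a sketch): an lprocfs write handler starting the
+ * "fifo" policy on both NRS heads of every partition of \a svc might call:
+ *
+ *        rc = ptlrpc_nrs_policy_control(svc, PTLRPC_NRS_QUEUE_BOTH, "fifo",
+ *                                       PTLRPC_NRS_CTL_START, false, NULL);
+ *
+ * PTLRPC_NRS_CTL_START is an assumed opcode of enum ptlrpc_nrs_ctl; only
+ * PTLRPC_NRS_CTL_INVALID is referenced in this file.
+ */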
+
+/* ptlrpc/nrs_fifo.c */
+extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo;
+
+/**
+ * Adds all policies that ship with the ptlrpc module, to NRS core's list of
+ * policies \e nrs_core.nrs_policies.
+ *
+ * \retval 0 all policies have been registered successfully
+ * \retval -ve error
+ */
+int ptlrpc_nrs_init(void)
+{
+       int     rc;
+       ENTRY;
+
+       mutex_init(&nrs_core.nrs_mutex);
+       INIT_LIST_HEAD(&nrs_core.nrs_policies);
+
+       rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo);
+       if (rc != 0)
+               GOTO(fail, rc);
+
+       RETURN(rc);
+fail:
+       /**
+        * Since no PTLRPC services have been started at this point, all we need
+        * to do for cleanup is to free the descriptors.
+        */
+       ptlrpc_nrs_fini();
+
+       RETURN(rc);
+}
+
+/**
+ * Removes all policy descriptors from nrs_core::nrs_policies, and frees the
+ * policy descriptors.
+ *
+ * Since all PTLRPC services are stopped at this point, there are no more
+ * instances of any policies, because each service will have stopped its policy
+ * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the
+ * descriptors here.
+ */
+void ptlrpc_nrs_fini(void)
+{
+       struct ptlrpc_nrs_pol_desc *desc;
+       struct ptlrpc_nrs_pol_desc *tmp;
+
+       list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies,
+                                    pd_list) {
+               list_del_init(&desc->pd_list);
+               OBD_FREE_PTR(desc);
+       }
+}
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c
new file mode 100644 (file)
index 0000000..ddfb510
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_crr.c
+ *
+ * Network Request Scheduler (NRS) CRR-N policy
+ *
+ * Request ordering in a batched Round-Robin manner over client NIDs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c
new file mode 100644 (file)
index 0000000..7d3ee97
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Intel Corporation
+ *
+ * Copyright 2012 Xyratex Technology Limited
+ */
+/*
+ * lustre/ptlrpc/nrs_fifo.c
+ *
+ * Network Request Scheduler (NRS) FIFO policy
+ *
+ * Handles RPCs in a FIFO manner, as received from the network. This policy is
+ * a logical wrapper around previous, non-NRS functionality. It is used as the
+ * default and fallback policy for all types of RPCs on all PTLRPC service
+ * partitions, for both regular and high-priority NRS heads. Default here means
+ * the policy is the one enabled at PTLRPC service partition startup time, and
+ * fallback means the policy is used to handle RPCs that are not handled
+ * successfully or are not handled at all by any primary policy that may be
+ * enabled on a given NRS head.
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ * Author: Nikitas Angelinas <nikitas_angelinas@xyratex.com>
+ */
+/**
+ * \addtogroup nrs
+ * @{
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/libcfs/libcfs.h>
+#include "ptlrpc_internal.h"
+
+/**
+ * \name fifo
+ *
+ * The FIFO policy is a logical wrapper around previous, non-NRS functionality.
+ * It schedules RPCs in the same order as they are queued from LNet.
+ *
+ * @{
+ */
+
+#define NRS_POL_NAME_FIFO      "fifo"
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a
+ * policy-specific private data structure.
+ *
+ * \param[in] policy The policy to start
+ *
+ * \retval -ENOMEM OOM error
+ * \retval  0     success
+ *
+ * \see nrs_policy_register()
+ * \see nrs_policy_ctl()
+ */
+static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_fifo_head *head;
+
+       OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy));
+       if (head == NULL)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&head->fh_list);
+       policy->pol_private = head;
+       return 0;
+}
+
+/**
+ * Is called before the policy transitions into
+ * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific
+ * private data structure.
+ *
+ * \param[in] policy The policy to stop
+ *
+ * \see nrs_policy_stop0()
+ */
+static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy)
+{
+       struct nrs_fifo_head *head = policy->pol_private;
+
+       LASSERT(head != NULL);
+       LASSERT(list_empty(&head->fh_list));
+
+       OBD_FREE_PTR(head);
+}
+
+/**
+ * Is called for obtaining a FIFO policy resource.
+ *
+ * \param[in]  policy    The policy on which the request is being asked for
+ * \param[in]  nrq       The request for which resources are being taken
+ * \param[in]  parent    Parent resource, unused in this policy
+ * \param[out] resp      Resources references are placed in this array
+ * \param[in]  moving_req Signifies limited caller context; unused in this
+ *                       policy
+ *
+ * \retval 1 The FIFO policy only has a one-level resource hierarchy; since
+ *          it implements a simple scheduling algorithm in which request
+ *          priority is determined by the request arrival order, it does not
+ *          need to maintain a set of resources that would otherwise be used
+ *          to calculate a request's priority.
+ *
+ * \see nrs_resource_get_safe()
+ */
+static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy,
+                           struct ptlrpc_nrs_request *nrq,
+                           const struct ptlrpc_nrs_resource *parent,
+                           struct ptlrpc_nrs_resource **resp, bool moving_req)
+{
+       /**
+        * Just return the resource embedded inside nrs_fifo_head, and end this
+        * resource hierarchy reference request.
+        */
+       *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res;
+       return 1;
+}
+
+/**
+ * Called when getting a request from the FIFO policy for handling, or just
+ * peeking; removes the request from the policy when it is to be handled.
+ *
+ * \param[in] policy The policy
+ * \param[in] peek   When set, signifies that we just want to examine the
+ *                  request, and not handle it, so the request is not removed
+ *                  from the policy.
+ * \param[in] force  Force the policy to return a request; unused in this
+ *                  policy
+ *
+ * \retval The request to be handled; this is the next request in the FIFO
+ *        queue
+ *
+ * \see ptlrpc_nrs_req_get_nolock()
+ * \see nrs_request_get()
+ */
+static
+struct ptlrpc_nrs_request *nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy,
+                                           bool peek, bool force)
+{
+       struct nrs_fifo_head      *head = policy->pol_private;
+       struct ptlrpc_nrs_request *nrq;
+
+       nrq = unlikely(list_empty(&head->fh_list)) ? NULL :
+             list_entry(head->fh_list.next, struct ptlrpc_nrs_request,
+                            nr_u.fifo.fr_list);
+
+       if (likely(!peek && nrq != NULL)) {
+               struct ptlrpc_request *req = container_of(nrq,
+                                                         struct ptlrpc_request,
+                                                         rq_nrq);
+
+               list_del_init(&nrq->nr_u.fifo.fr_list);
+
+               CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: "LPU64
+                      "\n", policy->pol_desc->pd_name,
+                      libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence);
+       }
+
+       return nrq;
+}
+
+/**
+ * Adds request \a nrq to \a policy's list of queued requests
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to add
+ *
+ * \retval 0 success; nrs_request_enqueue() assumes this function will always
+ *                   succeed
+ */
+static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy,
+                           struct ptlrpc_nrs_request *nrq)
+{
+       struct nrs_fifo_head *head;
+
+       head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head,
+                           fh_res);
+       /**
+        * Only used for debugging
+        */
+       nrq->nr_u.fifo.fr_sequence = head->fh_sequence++;
+       list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list);
+
+       return 0;
+}
+
+/**
+ * Removes request \a nrq from \a policy's list of queued requests.
+ *
+ * \param[in] policy The policy
+ * \param[in] nrq    The request to remove
+ */
+static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy,
+                            struct ptlrpc_nrs_request *nrq)
+{
+       LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list));
+       list_del_init(&nrq->nr_u.fifo.fr_list);
+}
+
+/**
+ * Prints a debug statement right before the request \a nrq stops being
+ * handled.
+ *
+ * \param[in] policy The policy handling the request
+ * \param[in] nrq    The request being handled
+ *
+ * \see ptlrpc_server_finish_request()
+ * \see ptlrpc_nrs_req_stop_nolock()
+ */
+static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy,
+                             struct ptlrpc_nrs_request *nrq)
+{
+       struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request,
+                                                 rq_nrq);
+
+       CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: "LPU64"\n",
+              policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer),
+              nrq->nr_u.fifo.fr_sequence);
+}
+
+/**
+ * FIFO policy operations
+ */
+static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = {
+       .op_policy_start        = nrs_fifo_start,
+       .op_policy_stop         = nrs_fifo_stop,
+       .op_res_get             = nrs_fifo_res_get,
+       .op_req_get             = nrs_fifo_req_get,
+       .op_req_enqueue         = nrs_fifo_req_add,
+       .op_req_dequeue         = nrs_fifo_req_del,
+       .op_req_stop            = nrs_fifo_req_stop,
+};
+
+/**
+ * FIFO policy configuration
+ */
+struct ptlrpc_nrs_pol_conf nrs_conf_fifo = {
+       .nc_name                = NRS_POL_NAME_FIFO,
+       .nc_ops                 = &nrs_fifo_ops,
+       .nc_compat              = nrs_policy_compat_all,
+       .nc_flags               = PTLRPC_NRS_FL_FALLBACK |
+                                 PTLRPC_NRS_FL_REG_START
+};
+
+/** @} fifo */
+
+/** @} nrs */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c
new file mode 100644 (file)
index 0000000..1437636
--- /dev/null
@@ -0,0 +1,2575 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pack_generic.c
+ *
+ * (Un)packing of OST requests
+ *
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eeb@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <obd_cksum.h>
+#include <lustre/ll_fiemap.h>
+
+static inline int lustre_msg_hdr_size_v2(int count)
+{
+       return cfs_size_round(offsetof(struct lustre_msg_v2,
+                                      lm_buflens[count]));
+}
+
+int lustre_msg_hdr_size(__u32 magic, int count)
+{
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_hdr_size_v2(count);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_hdr_size);
+
+void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout,
+                           int index)
+{
+       if (inout)
+               lustre_set_req_swabbed(req, index);
+       else
+               lustre_set_rep_swabbed(req, index);
+}
+EXPORT_SYMBOL(ptlrpc_buf_set_swabbed);
+
+int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout,
+                        int index)
+{
+       if (inout)
+               return (ptlrpc_req_need_swab(req) &&
+                       !lustre_req_swabbed(req, index));
+       else
+               return (ptlrpc_rep_need_swab(req) &&
+                       !lustre_rep_swabbed(req, index));
+}
+EXPORT_SYMBOL(ptlrpc_buf_need_swab);
+
+static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg,
+                                             __u32 version)
+{
+       __u32 ver = lustre_msg_get_version(msg);
+       return (ver & LUSTRE_VERSION_MASK) != version;
+}
+
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               CERROR("msg v1 not supported - please upgrade your system\n");
+               return -EINVAL;
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_check_version_v2(msg, version);
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_check_version);
+
+/* early reply size */
+int lustre_msg_early_size(void)
+{
+       static int size = 0;
+
+       if (!size) {
+               /* Always reply with the old ptlrpc_body_v2 to keep
+                * interoperability with old clients (< 2.3), which do not
+                * have pb_jobid in the ptlrpc_body.
+                *
+                * XXX Remove this whenever we drop interoperability with
+                *     such clients.
+                */
+               __u32 pblen = sizeof(struct ptlrpc_body_v2);
+               size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen);
+       }
+       return size;
+}
+EXPORT_SYMBOL(lustre_msg_early_size);
+
+int lustre_msg_size_v2(int count, __u32 *lengths)
+{
+       int size;
+       int i;
+
+       size = lustre_msg_hdr_size_v2(count);
+       for (i = 0; i < count; i++)
+               size += cfs_size_round(lengths[i]);
+
+       return size;
+}
+EXPORT_SYMBOL(lustre_msg_size_v2);
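+
+/*
+ * Worked example, assuming cfs_size_round() rounds up to the next 8-byte
+ * boundary and that the fixed v2 header consists of the eight 32-bit fields
+ * swabbed in lustre_unpack_msg_v2() below: for two buffers of 184 and 13
+ * bytes,
+ *
+ *        lustre_msg_hdr_size_v2(2) = round(32 + 2 * 4) = 40
+ *        lustre_msg_size_v2(2, lens) = 40 + round(184) + round(13)
+ *                                    = 40 + 184 + 16 = 240
+ */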
+
+/* This returns the size of the buffer that is required to hold a lustre_msg
+ * with the given sub-buffer lengths.
+ * NOTE: this should only be used for NEW requests, and should always be
+ *       in the form of a v2 request.  If this is a connection to a v1
+ *       target then the first buffer will be stripped because the ptlrpc
+ *       data is part of the lustre_msg_v1 header. b=14043 */
+int lustre_msg_size(__u32 magic, int count, __u32 *lens)
+{
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2));
+
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_size_v2(count, lens);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_size);
+
+/* This is used to determine the size of a buffer that was already packed
+ * and will correctly handle the different message formats. */
+int lustre_packed_msg_size(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_packed_msg_size);
+
+void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens,
+                       char **bufs)
+{
+       char *ptr;
+       int i;
+
+       msg->lm_bufcount = count;
+       /* XXX: lm_secflvr uninitialized here */
+       msg->lm_magic = LUSTRE_MSG_MAGIC_V2;
+
+       for (i = 0; i < count; i++)
+               msg->lm_buflens[i] = lens[i];
+
+       if (bufs == NULL)
+               return;
+
+       ptr = (char *)msg + lustre_msg_hdr_size_v2(count);
+       for (i = 0; i < count; i++) {
+               char *tmp = bufs[i];
+               LOGL(tmp, lens[i], ptr);
+       }
+}
+EXPORT_SYMBOL(lustre_init_msg_v2);
+
+static int lustre_pack_request_v2(struct ptlrpc_request *req,
+                                 int count, __u32 *lens, char **bufs)
+{
+       int reqlen, rc;
+
+       reqlen = lustre_msg_size_v2(count, lens);
+
+       rc = sptlrpc_cli_alloc_reqbuf(req, reqlen);
+       if (rc)
+               return rc;
+
+       req->rq_reqlen = reqlen;
+
+       lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs);
+       lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION);
+       return 0;
+}
+
+int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count,
+                       __u32 *lens, char **bufs)
+{
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+       /* only use new format, we don't need to be compatible with 1.4 */
+       magic = LUSTRE_MSG_MAGIC_V2;
+
+       switch (magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_pack_request_v2(req, count, lens, bufs);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_pack_request);
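+
+/*
+ * Usage sketch: pack a request with the mandatory ptlrpc_body at index
+ * MSG_PTLRPC_BODY_OFF (assumed to be 0 here) plus one 128-byte payload
+ * buffer; the payload length is illustrative. Buffer contents can be filled
+ * in afterwards via lustre_msg_buf().
+ *
+ *        __u32 lens[2] = { sizeof(struct ptlrpc_body), 128 };
+ *        int rc;
+ *
+ *        rc = lustre_pack_request(req, LUSTRE_MSG_MAGIC_V2, 2, lens, NULL);
+ *        if (rc != 0)
+ *                return rc;
+ */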
+
+#if RS_DEBUG
+LIST_HEAD(ptlrpc_rs_debug_lru);
+spinlock_t ptlrpc_rs_debug_lock;
+
+#define PTLRPC_RS_DEBUG_LRU_ADD(rs)                                    \
+do {                                                                   \
+       spin_lock(&ptlrpc_rs_debug_lock);                               \
+       list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru);      \
+       spin_unlock(&ptlrpc_rs_debug_lock);                             \
+} while (0)
+
+#define PTLRPC_RS_DEBUG_LRU_DEL(rs)                                    \
+do {                                                                   \
+       spin_lock(&ptlrpc_rs_debug_lock);                               \
+       list_del(&(rs)->rs_debug_list);                         \
+       spin_unlock(&ptlrpc_rs_debug_lock);                             \
+} while (0)
+#else
+# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0)
+# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0)
+#endif
+
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_reply_state *rs = NULL;
+
+       spin_lock(&svcpt->scp_rep_lock);
+
+       /* See if we have anything in a pool, and wait if nothing */
+       while (list_empty(&svcpt->scp_rep_idle)) {
+               struct l_wait_info      lwi;
+               int                     rc;
+
+               spin_unlock(&svcpt->scp_rep_lock);
+               /* If we cannot get anything for a long time, we had better
+                * bail out instead of waiting indefinitely */
+               lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL);
+               rc = l_wait_event(svcpt->scp_rep_waitq,
+                                 !list_empty(&svcpt->scp_rep_idle), &lwi);
+               if (rc != 0)
+                       goto out;
+               spin_lock(&svcpt->scp_rep_lock);
+       }
+
+       rs = list_entry(svcpt->scp_rep_idle.next,
+                           struct ptlrpc_reply_state, rs_list);
+       list_del(&rs->rs_list);
+
+       spin_unlock(&svcpt->scp_rep_lock);
+
+       memset(rs, 0, svcpt->scp_service->srv_max_reply_size);
+       rs->rs_svcpt = svcpt;
+       rs->rs_prealloc = 1;
+out:
+       return rs;
+}
+
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+       spin_lock(&svcpt->scp_rep_lock);
+       list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+       spin_unlock(&svcpt->scp_rep_lock);
+       wake_up(&svcpt->scp_rep_waitq);
+}
+
+int lustre_pack_reply_v2(struct ptlrpc_request *req, int count,
+                        __u32 *lens, char **bufs, int flags)
+{
+       struct ptlrpc_reply_state *rs;
+       int                     msg_len, rc;
+       ENTRY;
+
+       LASSERT(req->rq_reply_state == NULL);
+
+       if ((flags & LPRFL_EARLY_REPLY) == 0) {
+               spin_lock(&req->rq_lock);
+               req->rq_packed_final = 1;
+               spin_unlock(&req->rq_lock);
+       }
+
+       msg_len = lustre_msg_size_v2(count, lens);
+       rc = sptlrpc_svc_alloc_rs(req, msg_len);
+       if (rc)
+               RETURN(rc);
+
+       rs = req->rq_reply_state;
+       atomic_set(&rs->rs_refcount, 1);    /* 1 ref for rq_reply_state */
+       rs->rs_cb_id.cbid_fn = reply_out_callback;
+       rs->rs_cb_id.cbid_arg = rs;
+       rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt;
+       INIT_LIST_HEAD(&rs->rs_exp_list);
+       INIT_LIST_HEAD(&rs->rs_obd_list);
+       INIT_LIST_HEAD(&rs->rs_list);
+       spin_lock_init(&rs->rs_lock);
+
+       req->rq_replen = msg_len;
+       req->rq_reply_state = rs;
+       req->rq_repmsg = rs->rs_msg;
+
+       lustre_init_msg_v2(rs->rs_msg, count, lens, bufs);
+       lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION);
+
+       PTLRPC_RS_DEBUG_LRU_ADD(rs);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(lustre_pack_reply_v2);
+
+int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens,
+                           char **bufs, int flags)
+{
+       int rc = 0;
+       __u32 size[] = { sizeof(struct ptlrpc_body) };
+
+       if (!lens) {
+               LASSERT(count == 1);
+               lens = size;
+       }
+
+       LASSERT(count > 0);
+       LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body));
+
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               rc = lustre_pack_reply_v2(req, count, lens, bufs, flags);
+               break;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n",
+                        req->rq_reqmsg->lm_magic);
+               rc = -EINVAL;
+       }
+       if (rc != 0)
+               CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc,
+                      lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens));
+       return rc;
+}
+EXPORT_SYMBOL(lustre_pack_reply_flags);
+
+int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens,
+                     char **bufs)
+{
+       return lustre_pack_reply_flags(req, count, lens, bufs, 0);
+}
+EXPORT_SYMBOL(lustre_pack_reply);
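+
+/*
+ * Usage sketch: a request handler reserving a reply that carries only the
+ * ptlrpc_body; with count == 1, lens may be NULL and the default
+ * sizeof(struct ptlrpc_body) is used, as shown above:
+ *
+ *        rc = lustre_pack_reply(req, 1, NULL, NULL);
+ *        if (rc != 0)
+ *                return rc;
+ */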
+
+void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size)
+{
+       int i, offset, buflen, bufcount;
+
+       LASSERT(m != NULL);
+       LASSERT(n >= 0);
+
+       bufcount = m->lm_bufcount;
+       if (unlikely(n >= bufcount)) {
+               CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+                      m, n, bufcount);
+               return NULL;
+       }
+
+       buflen = m->lm_buflens[n];
+       if (unlikely(buflen < min_size)) {
+               CERROR("msg %p buffer[%d] size %d too small "
+                      "(required %d, opc=%d)\n", m, n, buflen, min_size,
+                      n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m));
+               return NULL;
+       }
+
+       offset = lustre_msg_hdr_size_v2(bufcount);
+       for (i = 0; i < n; i++)
+               offset += cfs_size_round(m->lm_buflens[i]);
+
+       return (char *)m + offset;
+}
+
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_buf_v2(m, n, min_size);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_buf);
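+
+/*
+ * Usage sketch: retrieve the 128-byte payload packed at index 1 in the
+ * lustre_pack_request() example above; NULL is returned if the buffer is
+ * absent or smaller than requested.
+ *
+ *        char *payload = lustre_msg_buf(req->rq_reqmsg, 1, 128);
+ *        if (payload == NULL)
+ *                return -EPROTO;
+ */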
+
+int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment,
+                        unsigned int newlen, int move_data)
+{
+       char   *tail = NULL, *newpos;
+       int     tail_len = 0, n;
+
+       LASSERT(msg);
+       LASSERT(msg->lm_bufcount > segment);
+       LASSERT(msg->lm_buflens[segment] >= newlen);
+
+       if (msg->lm_buflens[segment] == newlen)
+               goto out;
+
+       if (move_data && msg->lm_bufcount > segment + 1) {
+               tail = lustre_msg_buf_v2(msg, segment + 1, 0);
+               for (n = segment + 1; n < msg->lm_bufcount; n++)
+                       tail_len += cfs_size_round(msg->lm_buflens[n]);
+       }
+
+       msg->lm_buflens[segment] = newlen;
+
+       if (tail && tail_len) {
+               newpos = lustre_msg_buf_v2(msg, segment + 1, 0);
+               LASSERT(newpos <= tail);
+               if (newpos != tail)
+                       memmove(newpos, tail, tail_len);
+       }
+out:
+       return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+}
+
+/*
+ * For @msg, shrink @segment to size @newlen. If @move_data is non-zero,
+ * we also move data forward from @segment + 1.
+ *
+ * If @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to save possible data moving. This will leave an
+ * unused segment with size 0 at the tail, but that's OK.
+ *
+ * Returns the new msg size after shrinking.
+ *
+ * CAUTION:
+ * + if any buffer higher than @segment has been filled in, shrink must be
+ *   called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than @segment
+ *   after calling shrink.
+ */
+int lustre_shrink_msg(struct lustre_msg *msg, int segment,
+                     unsigned int newlen, int move_data)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_shrink_msg_v2(msg, segment, newlen, move_data);
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_shrink_msg);
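+
+/*
+ * Usage sketch: if a reply segment reserved 128 bytes but only 64 were
+ * filled in, trim it before sending; the return value is the new overall
+ * message size. Segment 1 is the last filled segment here, so no data needs
+ * moving (@move_data == 0).
+ *
+ *        req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 1, 64, 0);
+ */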
+
+void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+       PTLRPC_RS_DEBUG_LRU_DEL(rs);
+
+       LASSERT(atomic_read(&rs->rs_refcount) == 0);
+       LASSERT(!rs->rs_difficult || rs->rs_handled);
+       LASSERT(!rs->rs_on_net);
+       LASSERT(!rs->rs_scheduled);
+       LASSERT(rs->rs_export == NULL);
+       LASSERT(rs->rs_nlocks == 0);
+       LASSERT(list_empty(&rs->rs_exp_list));
+       LASSERT(list_empty(&rs->rs_obd_list));
+
+       sptlrpc_svc_free_rs(rs);
+}
+EXPORT_SYMBOL(lustre_free_reply_state);
+
+static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
+{
+       int swabbed, required_len, i;
+
+       /* Now we know the sender speaks my language. */
+       required_len = lustre_msg_hdr_size_v2(0);
+       if (len < required_len) {
+               /* can't even look inside the message */
+               CERROR("message length %d too small for lustre_msg\n", len);
+               return -EINVAL;
+       }
+
+       swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+       if (swabbed) {
+               __swab32s(&m->lm_magic);
+               __swab32s(&m->lm_bufcount);
+               __swab32s(&m->lm_secflvr);
+               __swab32s(&m->lm_repsize);
+               __swab32s(&m->lm_cksum);
+               __swab32s(&m->lm_flags);
+               CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0);
+               CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0);
+       }
+
+       required_len = lustre_msg_hdr_size_v2(m->lm_bufcount);
+       if (len < required_len) {
+               /* didn't receive all the buffer lengths */
+               CERROR("message length %d too small for %d buflens\n",
+                      len, m->lm_bufcount);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < m->lm_bufcount; i++) {
+               if (swabbed)
+                       __swab32s(&m->lm_buflens[i]);
+               required_len += cfs_size_round(m->lm_buflens[i]);
+       }
+
+       if (len < required_len) {
+               CERROR("len: %d, required_len %d\n", len, required_len);
+               CERROR("bufcount: %d\n", m->lm_bufcount);
+               for (i = 0; i < m->lm_bufcount; i++)
+                       CERROR("buffer %d length %d\n", i, m->lm_buflens[i]);
+               return -EINVAL;
+       }
+
+       return swabbed;
+}
+
+int __lustre_unpack_msg(struct lustre_msg *m, int len)
+{
+       int required_len, rc;
+       ENTRY;
+
+       /* We can provide a slightly better error log, if we check the
+        * message magic and version first.  In the future, struct
+        * lustre_msg may grow, and we'd like to log a version mismatch,
+        * rather than a short message.
+        */
+       required_len = offsetof(struct lustre_msg, lm_magic) +
+                      sizeof(m->lm_magic);
+       if (len < required_len) {
+               /* can't even look inside the message */
+               CERROR("message length %d too small for magic/version check\n",
+                      len);
+               RETURN(-EINVAL);
+       }
+
+       rc = lustre_unpack_msg_v2(m, len);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(__lustre_unpack_msg);
+
+int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len)
+{
+       int rc;
+       rc = __lustre_unpack_msg(req->rq_reqmsg, len);
+       if (rc == 1) {
+               lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_req_msg);
+
+int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len)
+{
+       int rc;
+       rc = __lustre_unpack_msg(req->rq_repmsg, len);
+       if (rc == 1) {
+               lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
+               rc = 0;
+       }
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_unpack_rep_msg);
+
+static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req,
+                                              const int inout, int offset)
+{
+       struct ptlrpc_body *pb;
+       struct lustre_msg_v2 *m = inout ? req->rq_reqmsg : req->rq_repmsg;
+
+       pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2));
+       if (!pb) {
+               CERROR("error unpacking ptlrpc body\n");
+               return -EFAULT;
+       }
+       if (ptlrpc_buf_need_swab(req, inout, offset)) {
+               lustre_swab_ptlrpc_body(pb);
+               ptlrpc_buf_set_swabbed(req, inout, offset);
+       }
+
+       if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
+                CERROR("wrong lustre_msg version %08x\n", pb->pb_version);
+                return -EINVAL;
+       }
+
+       return 0;
+}
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_unpack_ptlrpc_body_v2(req, 1, offset);
+       default:
+               CERROR("bad lustre msg magic: %08x\n",
+                      req->rq_reqmsg->lm_magic);
+               return -EINVAL;
+       }
+}
+
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset)
+{
+       switch (req->rq_repmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_unpack_ptlrpc_body_v2(req, 0, offset);
+       default:
+               CERROR("bad lustre msg magic: %08x\n",
+                      req->rq_repmsg->lm_magic);
+               return -EINVAL;
+       }
+}
+
+static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n)
+{
+       if (n >= m->lm_bufcount)
+               return 0;
+
+       return m->lm_buflens[n];
+}
+
+/**
+ * lustre_msg_buflen - return the length of buffer \a n in message \a m
+ * \param m lustre_msg (request or reply) to look at
+ * \param n message index (base 0)
+ *
+ * returns zero for non-existent message indices
+ */
+int lustre_msg_buflen(struct lustre_msg *m, int n)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_msg_buflen_v2(m, n);
+       default:
+               CERROR("incorrect message magic: %08x\n", m->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_buflen);
+
+static inline void
+lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len)
+{
+       if (n >= m->lm_bufcount)
+               LBUG();
+
+       m->lm_buflens[n] = len;
+}
+
+void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               lustre_msg_set_buflen_v2(m, n, len);
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+       }
+}
+
+EXPORT_SYMBOL(lustre_msg_set_buflen);
+
+/* NB: return the bufcount for the lustre_msg_v2 format; if the message is
+ * packed in V1 format, the result is one bigger, to account for the added
+ * struct ptlrpc_body. */
+int lustre_msg_bufcount(struct lustre_msg *m)
+{
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return m->lm_bufcount;
+       default:
+               CERROR("incorrect message magic: %08x\n", m->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_bufcount);
+
+char *lustre_msg_string(struct lustre_msg *m, int index, int max_len)
+{
+       /* max_len == 0 means the string should fill the buffer */
+       char *str;
+       int slen, blen;
+
+       switch (m->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               str = lustre_msg_buf_v2(m, index, 0);
+               blen = lustre_msg_buflen_v2(m, index);
+               break;
+       default:
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic);
+               return NULL;
+       }
+       if (str == NULL) {
+               CERROR("can't unpack string in msg %p buffer[%d]\n", m, index);
+               return NULL;
+       }
+
+       slen = strnlen(str, blen);
+
+       if (slen == blen) {                  /* not NULL terminated */
+               CERROR("can't unpack non-NULL terminated string in "
+                       "msg %p buffer[%d] len %d\n", m, index, blen);
+               return NULL;
+       }
+
+       if (max_len == 0) {
+               if (slen != blen - 1) {
+                       CERROR("can't unpack short string in msg %p "
+                              "buffer[%d] len %d: strlen %d\n",
+                              m, index, blen, slen);
+                       return NULL;
+               }
+       } else if (slen > max_len) {
+               CERROR("can't unpack oversized string in msg %p "
+                      "buffer[%d] len %d strlen %d: max %d expected\n",
+                      m, index, blen, slen, max_len);
+               return NULL;
+       }
+
+       return str;
+}
+EXPORT_SYMBOL(lustre_msg_string);
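+
+/*
+ * Usage sketch: extract a NUL-terminated string packed at an illustrative
+ * index 1; with max_len == 0 the string must fill its buffer exactly, as
+ * described above.
+ *
+ *        char *name = lustre_msg_string(msg, 1, 0);
+ *        if (name == NULL)
+ *                return -EPROTO;
+ */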
+
+/* Wrap up the normal fixed length cases */
+static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index,
+                                     int min_size, void *swabber)
+{
+       void *ptr = NULL;
+
+       LASSERT(msg != NULL);
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               ptr = lustre_msg_buf_v2(msg, index, min_size);
+               break;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+       }
+
+       if (ptr && swabber)
+               ((void (*)(void *))swabber)(ptr);
+
+       return ptr;
+}
+
+static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg)
+{
+       return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                sizeof(struct ptlrpc_body_v2));
+}
+
+__u32 lustre_msghdr_get_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2:
+               /* already in host endian */
+               return msg->lm_flags;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msghdr_get_flags);
+
+void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2:
+               msg->lm_flags = flags;
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+__u32 lustre_msg_get_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_flags;
+       }
+       default:
+               /* flags might be printed in debug code while message
+                * uninitialized */
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_flags);
+
+void lustre_msg_add_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags |= flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_flags);
+
+void lustre_msg_set_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags = flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_flags);
+
+void lustre_msg_clear_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags);
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_clear_flags);
+
+__u32 lustre_msg_get_op_flags(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_op_flags;
+       }
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_op_flags);
+
+void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_op_flags |= flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_op_flags);
+
+void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_op_flags = flags;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_op_flags);
+
+struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return NULL;
+               }
+               return &pb->pb_handle;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_handle);
+
+__u32 lustre_msg_get_type(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return PTL_RPC_MSG_ERR;
+               }
+               return pb->pb_type;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return PTL_RPC_MSG_ERR;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_type);
+
+__u32 lustre_msg_get_version(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_version;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_version);
+
+void lustre_msg_add_version(struct lustre_msg *msg, int version)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_version |= version;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_add_version);
+
+__u32 lustre_msg_get_opc(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_opc;
+       }
+       default:
+               CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg);
+               LBUG();
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_opc);
+
+__u64 lustre_msg_get_last_xid(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_last_xid;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_xid);
+
+__u64 lustre_msg_get_last_committed(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_last_committed;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_last_committed);
+
+__u64 *lustre_msg_get_versions(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return NULL;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return NULL;
+               }
+               return pb->pb_pre_versions;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_versions);
+
+__u64 lustre_msg_get_transno(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_transno;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_transno);
+
+int lustre_msg_get_status(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_status;
+       }
+       default:
+               /* The status may be printed from debug code while the
+                * message is still uninitialized, so no CERROR here. */
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_status);
+
+__u64 lustre_msg_get_slv(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_slv;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_slv);
+
+void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return;
+               }
+               pb->pb_slv = slv;
+               return;
+       }
+       default:
+               CERROR("invalid msg magic %x\n", msg->lm_magic);
+               return;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_slv);
+
+__u32 lustre_msg_get_limit(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return -EINVAL;
+               }
+               return pb->pb_limit;
+       }
+       default:
+               CERROR("invalid msg magic %x\n", msg->lm_magic);
+               return -EINVAL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_limit);
+
+void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return;
+               }
+               pb->pb_limit = limit;
+               return;
+       }
+       default:
+               CERROR("invalid msg magic %08x\n", msg->lm_magic);
+               return;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_limit);
+
+__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_conn_cnt;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_conn_cnt);
+
+int lustre_msg_is_v1(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 1;
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_is_v1);
+
+__u32 lustre_msg_get_magic(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return msg->lm_magic;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_magic);
+
+__u32 lustre_msg_get_timeout(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_timeout;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+__u32 lustre_msg_get_service_time(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return 0;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               if (!pb) {
+                       CERROR("invalid msg %p: no ptlrpc body!\n", msg);
+                       return 0;
+               }
+               return pb->pb_service_time;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
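+/* pb_jobid lives at the tail of ptlrpc_body; peers speaking the smaller
+ * pre-2.3 ptlrpc_body_v2 do not send it, so a short body is not an
+ * error here and we simply return NULL. */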
+char *lustre_msg_get_jobid(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+       case LUSTRE_MSG_MAGIC_V1_SWABBED:
+               return NULL;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb =
+                       lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                         sizeof(struct ptlrpc_body));
+               if (!pb)
+                       return NULL;
+
+               return pb->pb_jobid;
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return NULL;
+       }
+}
+EXPORT_SYMBOL(lustre_msg_get_jobid);
+
+__u32 lustre_msg_get_cksum(struct lustre_msg *msg)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return msg->lm_cksum;
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+/*
+ * In 1.6 and 1.8 the checksum was computed only over struct ptlrpc_body
+ * as it was defined in 1.6 (88 bytes, smaller than the full 1.8 size).
+ * It makes more sense to checksum the full ptlrpc_body regardless of its
+ * size, but to stay interoperable with 1.8 the caller can optionally
+ * checksum only the first 88 bytes. */
+# define ptlrpc_body_cksum_size_compat18        88
+
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18)
+#else
+# warning "remove checksum compatibility support for b1_8"
+__u32 lustre_msg_calc_cksum(struct lustre_msg *msg)
+#endif
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               __u32 crc;
+               unsigned int hsize = 4;
+               __u32 len = compat18 ? ptlrpc_body_cksum_size_compat18 :
+                           lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+                                      len, NULL, 0, (unsigned char *)&crc,
+                                      &hsize);
+               return crc;
+#else
+# warning "remove checksum compatibility support for b1_8"
+               __u32 crc;
+               unsigned int hsize = 4;
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb,
+                                  lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF),
+                                  NULL, 0, (unsigned char *)&crc, &hsize);
+               return crc;
+#endif
+       }
+       default:
+               CERROR("incorrect message magic: %08x\n", msg->lm_magic);
+               return 0;
+       }
+}
+
+void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_handle = *handle;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_handle);
+
+void lustre_msg_set_type(struct lustre_msg *msg, __u32 type)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_type = type;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_type);
+
+void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_opc = opc;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_opc);
+
+void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_last_xid = last_xid;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_xid);
+
+void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_last_committed = last_committed;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_last_committed);
+
+void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_pre_versions[0] = versions[0];
+               pb->pb_pre_versions[1] = versions[1];
+               pb->pb_pre_versions[2] = versions[2];
+               pb->pb_pre_versions[3] = versions[3];
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_versions);
+
+void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_transno = transno;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_transno);
+
+void lustre_msg_set_status(struct lustre_msg *msg, __u32 status)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_status = status;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_status);
+
+void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_conn_cnt = conn_cnt;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_conn_cnt);
+
+void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_timeout = timeout;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg);
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+               pb->pb_service_time = service_time;
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
+void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2: {
+               __u32 opc = lustre_msg_get_opc(msg);
+               struct ptlrpc_body *pb;
+
+               /* Don't set a jobid for LDLM AST RPCs: their ptlrpc_body
+                * has been shrunk.  See the comment in
+                * ptlrpc_request_pack(). */
+               if (!opc || opc == LDLM_BL_CALLBACK ||
+                   opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK)
+                       return;
+
+               pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF,
+                                      sizeof(struct ptlrpc_body));
+               LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg);
+
+               if (jobid != NULL)
+                       memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE);
+               else if (pb->pb_jobid[0] == '\0')
+                       lustre_get_jobid(pb->pb_jobid);
+               return;
+       }
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+EXPORT_SYMBOL(lustre_msg_set_jobid);
+
+void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum)
+{
+       switch (msg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V1:
+               return;
+       case LUSTRE_MSG_MAGIC_V2:
+               msg->lm_cksum = cksum;
+               return;
+       default:
+               LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic);
+       }
+}
+
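+/* Compute the expected reply length from the reply-side buffer sizes the
+ * caller filled in on the request capsule, and advertise it to the peer
+ * via lm_repsize so the reply buffer can be sized up front. */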
+void ptlrpc_request_set_replen(struct ptlrpc_request *req)
+{
+       int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER);
+
+       req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count,
+                                        req->rq_pill.rc_area[RCL_SERVER]);
+       if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+               req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_request_set_replen);
+
+void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens)
+{
+       req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens);
+       if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2)
+               req->rq_reqmsg->lm_repsize = req->rq_replen;
+}
+EXPORT_SYMBOL(ptlrpc_req_set_repsize);
+
+/**
+ * Send a remote set_info_async.
+ *
+ * This may go from client to server or server to client.
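+ *
+ * A minimal, hypothetical caller (the opcode and version below are
+ * illustrative, not mandated by this function; passing a NULL @set makes
+ * the call synchronous):
+ *
+ *     rc = do_set_info_async(imp, OST_SET_INFO, LUSTRE_OST_VERSION,
+ *                            keylen, key, vallen, val, NULL);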
+ */
+int do_set_info_async(struct obd_import *imp,
+                     int opcode, int version,
+                     obd_count keylen, void *key,
+                     obd_count vallen, void *val,
+                     struct ptlrpc_request_set *set)
+{
+       struct ptlrpc_request *req;
+       char              *tmp;
+       int                 rc;
+       ENTRY;
+
+       req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
+                            RCL_CLIENT, keylen);
+       req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
+                            RCL_CLIENT, vallen);
+       rc = ptlrpc_request_pack(req, version, opcode);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
+       memcpy(tmp, key, keylen);
+       tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL);
+       memcpy(tmp, val, vallen);
+
+       ptlrpc_request_set_replen(req);
+
+       if (set) {
+               ptlrpc_set_add_req(set, req);
+               ptlrpc_check_set(NULL, set);
+       } else {
+               rc = ptlrpc_queue_wait(req);
+               ptlrpc_req_finished(req);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(do_set_info_async);
+
+/* Byte-flipping (swab) routines for all wire types declared in
+ * lustre_idl.h are implemented here.
+ */
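+/*
+ * Convention sketch: each __swabNNs() helper byte-reverses its argument
+ * in place, e.g. a __u32 holding 0x12345678 becomes 0x78563412.  Opaque
+ * fields (handles, uuids, fsids) are deliberately left untouched.
+ */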
+void lustre_swab_ptlrpc_body(struct ptlrpc_body *b)
+{
+       __swab32s (&b->pb_type);
+       __swab32s (&b->pb_version);
+       __swab32s (&b->pb_opc);
+       __swab32s (&b->pb_status);
+       __swab64s (&b->pb_last_xid);
+       __swab64s (&b->pb_last_seen);
+       __swab64s (&b->pb_last_committed);
+       __swab64s (&b->pb_transno);
+       __swab32s (&b->pb_flags);
+       __swab32s (&b->pb_op_flags);
+       __swab32s (&b->pb_conn_cnt);
+       __swab32s (&b->pb_timeout);
+       __swab32s (&b->pb_service_time);
+       __swab32s (&b->pb_limit);
+       __swab64s (&b->pb_slv);
+       __swab64s (&b->pb_pre_versions[0]);
+       __swab64s (&b->pb_pre_versions[1]);
+       __swab64s (&b->pb_pre_versions[2]);
+       __swab64s (&b->pb_pre_versions[3]);
+       CLASSERT(offsetof(typeof(*b), pb_padding) != 0);
+       /* While we still need to maintain compatibility with clients and
+        * servers that lack ptlrpc_body_v2 (< 2.3), do not swab any
+        * fields beyond pb_jobid, as this swab function is used for both
+        * ptlrpc_body and ptlrpc_body_v2. */
+       CLASSERT(offsetof(typeof(*b), pb_jobid) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_ptlrpc_body);
+
+void lustre_swab_connect(struct obd_connect_data *ocd)
+{
+       __swab64s(&ocd->ocd_connect_flags);
+       __swab32s(&ocd->ocd_version);
+       __swab32s(&ocd->ocd_grant);
+       __swab64s(&ocd->ocd_ibits_known);
+       __swab32s(&ocd->ocd_index);
+       __swab32s(&ocd->ocd_brw_size);
+       /* ocd_blocksize and ocd_inodespace don't need to be swabbed
+        * because they are single-byte (__u8) values */
+       __swab16s(&ocd->ocd_grant_extent);
+       __swab32s(&ocd->ocd_unused);
+       __swab64s(&ocd->ocd_transno);
+       __swab32s(&ocd->ocd_group);
+       __swab32s(&ocd->ocd_cksum_types);
+       __swab32s(&ocd->ocd_instance);
+       /* Fields after ocd_cksum_types are only accessible by the receiver
+        * if the corresponding flag in ocd_connect_flags is set. Accessing
+        * any field after ocd_maxbytes on the receiver without a valid flag
+        * may result in out-of-bound memory access and kernel oops. */
+       if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)
+               __swab32s(&ocd->ocd_max_easize);
+       if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES)
+               __swab64s(&ocd->ocd_maxbytes);
+       CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding5) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding6) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding7) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding8) != 0);
+       CLASSERT(offsetof(typeof(*ocd), padding9) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingA) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingB) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingC) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingD) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingE) != 0);
+       CLASSERT(offsetof(typeof(*ocd), paddingF) != 0);
+}
+
+void lustre_swab_obdo (struct obdo *o)
+{
+       __swab64s (&o->o_valid);
+       lustre_swab_ost_id(&o->o_oi);
+       __swab64s (&o->o_parent_seq);
+       __swab64s (&o->o_size);
+       __swab64s (&o->o_mtime);
+       __swab64s (&o->o_atime);
+       __swab64s (&o->o_ctime);
+       __swab64s (&o->o_blocks);
+       __swab64s (&o->o_grant);
+       __swab32s (&o->o_blksize);
+       __swab32s (&o->o_mode);
+       __swab32s (&o->o_uid);
+       __swab32s (&o->o_gid);
+       __swab32s (&o->o_flags);
+       __swab32s (&o->o_nlink);
+       __swab32s (&o->o_parent_oid);
+       __swab32s (&o->o_misc);
+       __swab64s (&o->o_ioepoch);
+       __swab32s (&o->o_stripe_idx);
+       __swab32s (&o->o_parent_ver);
+       /* o_handle is opaque */
+       /* o_lcookie is swabbed elsewhere */
+       __swab32s (&o->o_uid_h);
+       __swab32s (&o->o_gid_h);
+       __swab64s (&o->o_data_version);
+       CLASSERT(offsetof(typeof(*o), o_padding_4) != 0);
+       CLASSERT(offsetof(typeof(*o), o_padding_5) != 0);
+       CLASSERT(offsetof(typeof(*o), o_padding_6) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obdo);
+
+void lustre_swab_obd_statfs (struct obd_statfs *os)
+{
+       __swab64s (&os->os_type);
+       __swab64s (&os->os_blocks);
+       __swab64s (&os->os_bfree);
+       __swab64s (&os->os_bavail);
+       __swab64s (&os->os_files);
+       __swab64s (&os->os_ffree);
+       /* no need to swab os_fsid */
+       __swab32s (&os->os_bsize);
+       __swab32s (&os->os_namelen);
+       __swab64s (&os->os_maxbytes);
+       __swab32s (&os->os_state);
+       CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare2) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare3) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare4) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare5) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare6) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare7) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare8) != 0);
+       CLASSERT(offsetof(typeof(*os), os_spare9) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+
+void lustre_swab_obd_ioobj(struct obd_ioobj *ioo)
+{
+       lustre_swab_ost_id(&ioo->ioo_oid);
+       __swab32s(&ioo->ioo_max_brw);
+       __swab32s(&ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+
+void lustre_swab_niobuf_remote (struct niobuf_remote *nbr)
+{
+       __swab64s (&nbr->offset);
+       __swab32s (&nbr->len);
+       __swab32s (&nbr->flags);
+}
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+
+void lustre_swab_ost_body (struct ost_body *b)
+{
+       lustre_swab_obdo (&b->oa);
+}
+EXPORT_SYMBOL(lustre_swab_ost_body);
+
+void lustre_swab_ost_last_id(obd_id *id)
+{
+       __swab64s(id);
+}
+EXPORT_SYMBOL(lustre_swab_ost_last_id);
+
+void lustre_swab_generic_32s(__u32 *val)
+{
+       __swab32s(val);
+}
+EXPORT_SYMBOL(lustre_swab_generic_32s);
+
+void lustre_swab_gl_desc(union ldlm_gl_desc *desc)
+{
+       lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid);
+       __swab64s(&desc->lquota_desc.gl_flags);
+       __swab64s(&desc->lquota_desc.gl_ver);
+       __swab64s(&desc->lquota_desc.gl_hardlimit);
+       __swab64s(&desc->lquota_desc.gl_softlimit);
+       __swab64s(&desc->lquota_desc.gl_time);
+       CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0);
+}
+
+void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb)
+{
+       __swab64s(&lvb->lvb_size);
+       __swab64s(&lvb->lvb_mtime);
+       __swab64s(&lvb->lvb_atime);
+       __swab64s(&lvb->lvb_ctime);
+       __swab64s(&lvb->lvb_blocks);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb_v1);
+
+void lustre_swab_ost_lvb(struct ost_lvb *lvb)
+{
+       __swab64s(&lvb->lvb_size);
+       __swab64s(&lvb->lvb_mtime);
+       __swab64s(&lvb->lvb_atime);
+       __swab64s(&lvb->lvb_ctime);
+       __swab64s(&lvb->lvb_blocks);
+       __swab32s(&lvb->lvb_mtime_ns);
+       __swab32s(&lvb->lvb_atime_ns);
+       __swab32s(&lvb->lvb_ctime_ns);
+       __swab32s(&lvb->lvb_padding);
+}
+EXPORT_SYMBOL(lustre_swab_ost_lvb);
+
+void lustre_swab_lquota_lvb(struct lquota_lvb *lvb)
+{
+       __swab64s(&lvb->lvb_flags);
+       __swab64s(&lvb->lvb_id_may_rel);
+       __swab64s(&lvb->lvb_id_rel);
+       __swab64s(&lvb->lvb_id_qunit);
+       __swab64s(&lvb->lvb_pad1);
+}
+EXPORT_SYMBOL(lustre_swab_lquota_lvb);
+
+void lustre_swab_mdt_body (struct mdt_body *b)
+{
+       lustre_swab_lu_fid (&b->fid1);
+       lustre_swab_lu_fid (&b->fid2);
+       /* handle is opaque */
+       __swab64s (&b->valid);
+       __swab64s (&b->size);
+       __swab64s (&b->mtime);
+       __swab64s (&b->atime);
+       __swab64s (&b->ctime);
+       __swab64s (&b->blocks);
+       __swab64s (&b->ioepoch);
+       CLASSERT(offsetof(typeof(*b), unused1) != 0);
+       __swab32s (&b->fsuid);
+       __swab32s (&b->fsgid);
+       __swab32s (&b->capability);
+       __swab32s (&b->mode);
+       __swab32s (&b->uid);
+       __swab32s (&b->gid);
+       __swab32s (&b->flags);
+       __swab32s (&b->rdev);
+       __swab32s (&b->nlink);
+       CLASSERT(offsetof(typeof(*b), unused2) != 0);
+       __swab32s (&b->suppgid);
+       __swab32s (&b->eadatasize);
+       __swab32s (&b->aclsize);
+       __swab32s (&b->max_mdsize);
+       __swab32s (&b->max_cookiesize);
+       __swab32s (&b->uid_h);
+       __swab32s (&b->gid_h);
+       CLASSERT(offsetof(typeof(*b), padding_5) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_body);
+
+void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b)
+{
+       /* handle is opaque */
+       __swab64s (&b->ioepoch);
+       __swab32s (&b->flags);
+       CLASSERT(offsetof(typeof(*b), padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_ioepoch);
+
+void lustre_swab_mgs_target_info(struct mgs_target_info *mti)
+{
+       int i;
+       __swab32s(&mti->mti_lustre_ver);
+       __swab32s(&mti->mti_stripe_index);
+       __swab32s(&mti->mti_config_ver);
+       __swab32s(&mti->mti_flags);
+       __swab32s(&mti->mti_instance);
+       __swab32s(&mti->mti_nid_count);
+       CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+       for (i = 0; i < MTI_NIDS_MAX; i++)
+               __swab64s(&mti->mti_nids[i]);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_target_info);
+
+void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry)
+{
+       int i;
+
+       __swab64s(&entry->mne_version);
+       __swab32s(&entry->mne_instance);
+       __swab32s(&entry->mne_index);
+       __swab32s(&entry->mne_length);
+
+       /* mne_nid_count and mne_nid_type must each be one byte in size
+        * because we access them without byte swapping. */
+       CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8));
+       CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8));
+
+       /* remove this assertion if ipv6 is supported. */
+       LASSERT(entry->mne_nid_type == 0);
+       for (i = 0; i < entry->mne_nid_count; i++) {
+               CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
+               __swab64s(&entry->u.nids[i]);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry);
+
+void lustre_swab_mgs_config_body(struct mgs_config_body *body)
+{
+       __swab64s(&body->mcb_offset);
+       __swab32s(&body->mcb_units);
+       __swab16s(&body->mcb_type);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_body);
+
+void lustre_swab_mgs_config_res(struct mgs_config_res *body)
+{
+       __swab64s(&body->mcr_offset);
+       __swab64s(&body->mcr_size);
+}
+EXPORT_SYMBOL(lustre_swab_mgs_config_res);
+
+static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i)
+{
+       __swab64s (&i->dqi_bgrace);
+       __swab64s (&i->dqi_igrace);
+       __swab32s (&i->dqi_flags);
+       __swab32s (&i->dqi_valid);
+}
+
+static void lustre_swab_obd_dqblk (struct obd_dqblk *b)
+{
+       __swab64s (&b->dqb_ihardlimit);
+       __swab64s (&b->dqb_isoftlimit);
+       __swab64s (&b->dqb_curinodes);
+       __swab64s (&b->dqb_bhardlimit);
+       __swab64s (&b->dqb_bsoftlimit);
+       __swab64s (&b->dqb_curspace);
+       __swab64s (&b->dqb_btime);
+       __swab64s (&b->dqb_itime);
+       __swab32s (&b->dqb_valid);
+       CLASSERT(offsetof(typeof(*b), dqb_padding) != 0);
+}
+
+void lustre_swab_obd_quotactl (struct obd_quotactl *q)
+{
+       __swab32s (&q->qc_cmd);
+       __swab32s (&q->qc_type);
+       __swab32s (&q->qc_id);
+       __swab32s (&q->qc_stat);
+       lustre_swab_obd_dqinfo (&q->qc_dqinfo);
+       lustre_swab_obd_dqblk (&q->qc_dqblk);
+}
+EXPORT_SYMBOL(lustre_swab_obd_quotactl);
+
+void lustre_swab_mdt_remote_perm (struct mdt_remote_perm *p)
+{
+       __swab32s (&p->rp_uid);
+       __swab32s (&p->rp_gid);
+       __swab32s (&p->rp_fsuid);
+       __swab32s (&p->rp_fsuid_h);
+       __swab32s (&p->rp_fsgid);
+       __swab32s (&p->rp_fsgid_h);
+       __swab32s (&p->rp_access_perm);
+       __swab32s (&p->rp_padding);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_remote_perm);
+
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+       lustre_swab_lu_fid(&gf->gf_fid);
+       __swab64s(&gf->gf_recno);
+       __swab32s(&gf->gf_linkno);
+       __swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
+void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent)
+{
+       __swab64s(&fm_extent->fe_logical);
+       __swab64s(&fm_extent->fe_physical);
+       __swab64s(&fm_extent->fe_length);
+       __swab32s(&fm_extent->fe_flags);
+       __swab32s(&fm_extent->fe_device);
+}
+
+void lustre_swab_fiemap(struct ll_user_fiemap *fiemap)
+{
+       int i;
+
+       __swab64s(&fiemap->fm_start);
+       __swab64s(&fiemap->fm_length);
+       __swab32s(&fiemap->fm_flags);
+       __swab32s(&fiemap->fm_mapped_extents);
+       __swab32s(&fiemap->fm_extent_count);
+       __swab32s(&fiemap->fm_reserved);
+
+       for (i = 0; i < fiemap->fm_mapped_extents; i++)
+               lustre_swab_fiemap_extent(&fiemap->fm_extents[i]);
+}
+EXPORT_SYMBOL(lustre_swab_fiemap);
+
+void lustre_swab_idx_info(struct idx_info *ii)
+{
+       __swab32s(&ii->ii_magic);
+       __swab32s(&ii->ii_flags);
+       __swab16s(&ii->ii_count);
+       __swab32s(&ii->ii_attrs);
+       lustre_swab_lu_fid(&ii->ii_fid);
+       __swab64s(&ii->ii_version);
+       __swab64s(&ii->ii_hash_start);
+       __swab64s(&ii->ii_hash_end);
+       __swab16s(&ii->ii_keysize);
+       __swab16s(&ii->ii_recsize);
+}
+
+void lustre_swab_lip_header(struct lu_idxpage *lip)
+{
+       /* swab header */
+       __swab32s(&lip->lip_magic);
+       __swab16s(&lip->lip_flags);
+       __swab16s(&lip->lip_nr);
+}
+EXPORT_SYMBOL(lustre_swab_lip_header);
+
+void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr)
+{
+       __swab32s(&rr->rr_opcode);
+       __swab32s(&rr->rr_cap);
+       __swab32s(&rr->rr_fsuid);
+       /* rr_fsuid_h is unused */
+       __swab32s(&rr->rr_fsgid);
+       /* rr_fsgid_h is unused */
+       __swab32s(&rr->rr_suppgid1);
+       /* rr_suppgid1_h is unused */
+       __swab32s(&rr->rr_suppgid2);
+       /* rr_suppgid2_h is unused */
+       lustre_swab_lu_fid(&rr->rr_fid1);
+       lustre_swab_lu_fid(&rr->rr_fid2);
+       __swab64s(&rr->rr_mtime);
+       __swab64s(&rr->rr_atime);
+       __swab64s(&rr->rr_ctime);
+       __swab64s(&rr->rr_size);
+       __swab64s(&rr->rr_blocks);
+       __swab32s(&rr->rr_bias);
+       __swab32s(&rr->rr_mode);
+       __swab32s(&rr->rr_flags);
+       __swab32s(&rr->rr_flags_h);
+       __swab32s(&rr->rr_umask);
+
+       CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
+
+void lustre_swab_lov_desc (struct lov_desc *ld)
+{
+       __swab32s (&ld->ld_tgt_count);
+       __swab32s (&ld->ld_active_tgt_count);
+       __swab32s (&ld->ld_default_stripe_count);
+       __swab32s (&ld->ld_pattern);
+       __swab64s (&ld->ld_default_stripe_size);
+       __swab64s (&ld->ld_default_stripe_offset);
+       __swab32s (&ld->ld_qos_maxage);
+       /* uuid endian insensitive */
+}
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+
+void lustre_swab_lmv_desc (struct lmv_desc *ld)
+{
+       __swab32s (&ld->ld_tgt_count);
+       __swab32s (&ld->ld_active_tgt_count);
+       __swab32s (&ld->ld_default_stripe_count);
+       __swab32s (&ld->ld_pattern);
+       __swab64s (&ld->ld_default_hash_size);
+       __swab32s (&ld->ld_qos_maxage);
+       /* uuid endian insensitive */
+}
+
+void lustre_swab_lmv_stripe_md (struct lmv_stripe_md *mea)
+{
+       __swab32s(&mea->mea_magic);
+       __swab32s(&mea->mea_count);
+       __swab32s(&mea->mea_master);
+       CLASSERT(offsetof(typeof(*mea), mea_padding) != 0);
+}
+
+void lustre_swab_lmv_user_md(struct lmv_user_md *lum)
+{
+       int i;
+
+       __swab32s(&lum->lum_magic);
+       __swab32s(&lum->lum_stripe_count);
+       __swab32s(&lum->lum_stripe_offset);
+       __swab32s(&lum->lum_hash_type);
+       __swab32s(&lum->lum_type);
+       CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0);
+       CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0);
+       CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0);
+
+       for (i = 0; i < lum->lum_stripe_count; i++) {
+               __swab32s(&lum->lum_objects[i].lum_mds);
+               lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_lmv_user_md);
+
+static void print_lum (struct lov_user_md *lum)
+{
+       CDEBUG(D_OTHER, "lov_user_md %p:\n", lum);
+       CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic);
+       CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern);
+       CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lmm_oi_id(&lum->lmm_oi));
+       CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lmm_oi_seq(&lum->lmm_oi));
+       CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size);
+       CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count);
+       CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n",
+                       lum->lmm_stripe_offset);
+}
+
+static void lustre_swab_lmm_oi(struct ost_id *oi)
+{
+       __swab64s(&oi->oi.oi_id);
+       __swab64s(&oi->oi.oi_seq);
+}
+
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
+{
+       ENTRY;
+       __swab32s(&lum->lmm_magic);
+       __swab32s(&lum->lmm_pattern);
+       lustre_swab_lmm_oi(&lum->lmm_oi);
+       __swab32s(&lum->lmm_stripe_size);
+       __swab16s(&lum->lmm_stripe_count);
+       __swab16s(&lum->lmm_stripe_offset);
+       print_lum(lum);
+       EXIT;
+}
+
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+       lustre_swab_lov_user_md_common(lum);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+       lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+       /* lmm_pool_name is a char array and needs no byte swapping */
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
+
+void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
+{
+       ENTRY;
+       CDEBUG(D_IOCTL, "swabbing lov_mds_md\n");
+       __swab32s(&lmm->lmm_magic);
+       __swab32s(&lmm->lmm_pattern);
+       lustre_swab_lmm_oi(&lmm->lmm_oi);
+       __swab32s(&lmm->lmm_stripe_size);
+       __swab16s(&lmm->lmm_stripe_count);
+       __swab16s(&lmm->lmm_layout_gen);
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                    int stripe_count)
+{
+       int i;
+       ENTRY;
+       for (i = 0; i < stripe_count; i++) {
+               lustre_swab_ost_id(&(lod[i].l_ost_oi));
+               __swab32s(&(lod[i].l_ost_gen));
+               __swab32s(&(lod[i].l_ost_idx));
+       }
+       EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+
+void lustre_swab_ldlm_res_id (struct ldlm_res_id *id)
+{
+       int  i;
+
+       for (i = 0; i < RES_NAME_SIZE; i++)
+               __swab64s (&id->name[i]);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+
+void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d)
+{
+       /* The lock data is a union whose first two fields always form an
+        * extent, so it is safe to process LDLM_EXTENT and LDLM_FLOCK
+        * lock data the same way. */
+       __swab64s(&d->l_extent.start);
+       __swab64s(&d->l_extent.end);
+       __swab64s(&d->l_extent.gid);
+       __swab64s(&d->l_flock.lfw_owner);
+       __swab32s(&d->l_flock.lfw_pid);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
+
+void lustre_swab_ldlm_intent (struct ldlm_intent *i)
+{
+       __swab64s (&i->opc);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+
+void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r)
+{
+       __swab32s (&r->lr_type);
+       CLASSERT(offsetof(typeof(*r), lr_padding) != 0);
+       lustre_swab_ldlm_res_id (&r->lr_name);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+
+void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l)
+{
+       lustre_swab_ldlm_resource_desc (&l->l_resource);
+       __swab32s (&l->l_req_mode);
+       __swab32s (&l->l_granted_mode);
+       lustre_swab_ldlm_policy_data (&l->l_policy_data);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+
+void lustre_swab_ldlm_request (struct ldlm_request *rq)
+{
+       __swab32s (&rq->lock_flags);
+       lustre_swab_ldlm_lock_desc (&rq->lock_desc);
+       __swab32s (&rq->lock_count);
+       /* lock_handle[] opaque */
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+
+void lustre_swab_ldlm_reply (struct ldlm_reply *r)
+{
+       __swab32s (&r->lock_flags);
+       CLASSERT(offsetof(typeof(*r), lock_padding) != 0);
+       lustre_swab_ldlm_lock_desc (&r->lock_desc);
+       /* lock_handle opaque */
+       __swab64s (&r->lock_policy_res1);
+       __swab64s (&r->lock_policy_res2);
+}
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+
+void lustre_swab_quota_body(struct quota_body *b)
+{
+       lustre_swab_lu_fid(&b->qb_fid);
+       lustre_swab_lu_fid((struct lu_fid *)&b->qb_id);
+       __swab32s(&b->qb_flags);
+       __swab64s(&b->qb_count);
+       __swab64s(&b->qb_usage);
+       __swab64s(&b->qb_slv_ver);
+}
+
+/* Dump functions */
+void dump_ioo(struct obd_ioobj *ioo)
+{
+       CDEBUG(D_RPCTRACE,
+              "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, "
+              "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw,
+              ioo->ioo_bufcnt);
+}
+EXPORT_SYMBOL(dump_ioo);
+
+void dump_rniobuf(struct niobuf_remote *nb)
+{
+       CDEBUG(D_RPCTRACE, "niobuf_remote: offset="LPU64", len=%d, flags=%x\n",
+              nb->offset, nb->len, nb->flags);
+}
+EXPORT_SYMBOL(dump_rniobuf);
+
+void dump_obdo(struct obdo *oa)
+{
+       __u32 valid = oa->o_valid;
+
+       CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid);
+       if (valid & OBD_MD_FLID)
+               CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi));
+       if (valid & OBD_MD_FLFID)
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = "LPX64"\n",
+                      oa->o_parent_seq);
+       if (valid & OBD_MD_FLSIZE)
+               CDEBUG(D_RPCTRACE, "obdo: o_size = "LPD64"\n", oa->o_size);
+       if (valid & OBD_MD_FLMTIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_mtime = "LPD64"\n", oa->o_mtime);
+       if (valid & OBD_MD_FLATIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_atime = "LPD64"\n", oa->o_atime);
+       if (valid & OBD_MD_FLCTIME)
+               CDEBUG(D_RPCTRACE, "obdo: o_ctime = "LPD64"\n", oa->o_ctime);
+       if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
+               CDEBUG(D_RPCTRACE, "obdo: o_blocks = "LPD64"\n", oa->o_blocks);
+       if (valid & OBD_MD_FLGRANT)
+               CDEBUG(D_RPCTRACE, "obdo: o_grant = "LPD64"\n", oa->o_grant);
+       if (valid & OBD_MD_FLBLKSZ)
+               CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize);
+       if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE))
+               CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n",
+                      oa->o_mode & ((valid & OBD_MD_FLTYPE ?  S_IFMT : 0) |
+                                    (valid & OBD_MD_FLMODE ? ~S_IFMT : 0)));
+       if (valid & OBD_MD_FLUID)
+               CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid);
+       if (valid & OBD_MD_FLUID)
+               CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h);
+       if (valid & OBD_MD_FLGID)
+               CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid);
+       if (valid & OBD_MD_FLGID)
+               CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h);
+       if (valid & OBD_MD_FLFLAGS)
+               CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags);
+       if (valid & OBD_MD_FLNLINK)
+               CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink);
+       else if (valid & OBD_MD_FLCKSUM)
+               CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n",
+                      oa->o_nlink);
+       if (valid & OBD_MD_FLGENER)
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n",
+                      oa->o_parent_oid);
+       if (valid & OBD_MD_FLEPOCH)
+               CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = "LPD64"\n",
+                      oa->o_ioepoch);
+       if (valid & OBD_MD_FLFID) {
+               CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n",
+                      oa->o_stripe_idx);
+               CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n",
+                      oa->o_parent_ver);
+       }
+       if (valid & OBD_MD_FLHANDLE)
+               CDEBUG(D_RPCTRACE, "obdo: o_handle = "LPD64"\n",
+                      oa->o_handle.cookie);
+       if (valid & OBD_MD_FLCOOKIE)
+               CDEBUG(D_RPCTRACE, "obdo: o_lcookie = "
+                      "(llog_cookie dumping not yet implemented)\n");
+}
+EXPORT_SYMBOL(dump_obdo);
+
+void dump_ost_body(struct ost_body *ob)
+{
+       dump_obdo(&ob->oa);
+}
+EXPORT_SYMBOL(dump_ost_body);
+
+void dump_rcs(__u32 *rc)
+{
+       CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc);
+}
+EXPORT_SYMBOL(dump_rcs);
+
+static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_reqmsg);
+
+       switch (req->rq_reqmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF);
+       default:
+               CERROR("bad lustre msg magic: %#08X\n",
+                      req->rq_reqmsg->lm_magic);
+       }
+       return 0;
+}
+
+static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repmsg);
+
+       switch (req->rq_repmsg->lm_magic) {
+       case LUSTRE_MSG_MAGIC_V2:
+               return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF);
+       default:
+               /* uninitialized yet */
+               return 0;
+       }
+}
+
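+/* Format one request for the debug log, tolerating half-constructed
+ * requests: request/reply message fields are only dereferenced when the
+ * message exists and its ptlrpc_body is already usable (swabbed). */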
+void _debug_req(struct ptlrpc_request *req,
+               struct libcfs_debug_msg_data *msgdata,
+               const char *fmt, ... )
+{
+       int req_ok = req->rq_reqmsg != NULL;
+       int rep_ok = req->rq_repmsg != NULL;
+       lnet_nid_t nid = LNET_NID_ANY;
+       va_list args;
+
+       if (ptlrpc_req_need_swab(req)) {
+               req_ok = req_ok && req_ptlrpc_body_swabbed(req);
+               rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req);
+       }
+
+       if (req->rq_import && req->rq_import->imp_connection)
+               nid = req->rq_import->imp_connection->c_peer.nid;
+       else if (req->rq_export && req->rq_export->exp_connection)
+               nid = req->rq_export->exp_connection->c_peer.nid;
+
+       va_start(args, fmt);
+       libcfs_debug_vmsg2(msgdata, fmt, args,
+                          " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+                          " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d "
+                          "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
+                          req, req->rq_xid, req->rq_transno,
+                          req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0,
+                          req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
+                          req->rq_import ?
+                               req->rq_import->imp_obd->obd_name :
+                               req->rq_export ?
+                                    req->rq_export->exp_client_uuid.uuid :
+                                    "<?>",
+                          libcfs_nid2str(nid),
+                          req->rq_request_portal, req->rq_reply_portal,
+                          req->rq_reqlen, req->rq_replen,
+                          req->rq_early_count, req->rq_timedout,
+                          req->rq_deadline,
+                          atomic_read(&req->rq_refcount),
+                          DEBUG_REQ_FLAGS(req),
+                          req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1,
+                          rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1,
+                          req->rq_status,
+                          rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1);
+}
+EXPORT_SYMBOL(_debug_req);
+
+void lustre_swab_lustre_capa(struct lustre_capa *c)
+{
+       lustre_swab_lu_fid(&c->lc_fid);
+       __swab64s (&c->lc_opc);
+       __swab64s (&c->lc_uid);
+       __swab64s (&c->lc_gid);
+       __swab32s (&c->lc_flags);
+       __swab32s (&c->lc_keyid);
+       __swab32s (&c->lc_timeout);
+       __swab32s (&c->lc_expiry);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa);
+
+void lustre_swab_lustre_capa_key(struct lustre_capa_key *k)
+{
+       __swab64s (&k->lk_seq);
+       __swab32s (&k->lk_keyid);
+       CLASSERT(offsetof(typeof(*k), lk_padding) != 0);
+}
+EXPORT_SYMBOL(lustre_swab_lustre_capa_key);
+
+void lustre_swab_hsm_user_state(struct hsm_user_state *state)
+{
+       __swab32s(&state->hus_states);
+       __swab32s(&state->hus_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_state);
+
+void lustre_swab_hsm_state_set(struct hsm_state_set *hss)
+{
+       __swab32s(&hss->hss_valid);
+       __swab64s(&hss->hss_setmask);
+       __swab64s(&hss->hss_clearmask);
+       __swab32s(&hss->hss_archive_id);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_state_set);
+
+void lustre_swab_hsm_extent(struct hsm_extent *extent)
+{
+       __swab64s(&extent->offset);
+       __swab64s(&extent->length);
+}
+
+void lustre_swab_hsm_current_action(struct hsm_current_action *action)
+{
+       __swab32s(&action->hca_state);
+       __swab32s(&action->hca_action);
+       lustre_swab_hsm_extent(&action->hca_location);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_current_action);
+
+void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
+{
+       lustre_swab_lu_fid(&hui->hui_fid);
+       lustre_swab_hsm_extent(&hui->hui_extent);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_user_item);
+
+void lustre_swab_layout_intent(struct layout_intent *li)
+{
+       __swab32s(&li->li_opc);
+       __swab32s(&li->li_flags);
+       __swab64s(&li->li_start);
+       __swab64s(&li->li_end);
+}
+EXPORT_SYMBOL(lustre_swab_layout_intent);
+
+void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
+{
+       lustre_swab_lu_fid(&hpk->hpk_fid);
+       __swab64s(&hpk->hpk_cookie);
+       __swab64s(&hpk->hpk_extent.offset);
+       __swab64s(&hpk->hpk_extent.length);
+       __swab16s(&hpk->hpk_flags);
+       __swab16s(&hpk->hpk_errval);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel);
+
+void lustre_swab_hsm_request(struct hsm_request *hr)
+{
+       __swab32s(&hr->hr_action);
+       __swab32s(&hr->hr_archive_id);
+       __swab64s(&hr->hr_flags);
+       __swab32s(&hr->hr_itemcount);
+       __swab32s(&hr->hr_data_len);
+}
+EXPORT_SYMBOL(lustre_swab_hsm_request);
+
+void lustre_swab_update_buf(struct update_buf *ub)
+{
+       __swab32s(&ub->ub_magic);
+       __swab32s(&ub->ub_count);
+}
+EXPORT_SYMBOL(lustre_swab_update_buf);
+
+void lustre_swab_update_reply_buf(struct update_reply *ur)
+{
+       int i;
+
+       __swab32s(&ur->ur_version);
+       __swab32s(&ur->ur_count);
+       for (i = 0; i < ur->ur_count; i++)
+               __swab32s(&ur->ur_lens[i]);
+}
+EXPORT_SYMBOL(lustre_swab_update_reply_buf);
+
+void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl)
+{
+       __swab64s(&msl->msl_flags);
+}
+EXPORT_SYMBOL(lustre_swab_swap_layouts);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c
new file mode 100644 (file)
index 0000000..d926d2b
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_import.h>
+
+#include "ptlrpc_internal.h"
+
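+/*
+ * Fill one LNet memory descriptor for a bulk transfer.  A bulk descriptor
+ * may span several MDs; MD number @mdidx covers at most LNET_MAX_IOV kiov
+ * entries, starting at offset mdidx * LNET_MAX_IOV into the page array.
+ */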
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+                        int mdidx)
+{
+       CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON);
+
+       LASSERT(mdidx < desc->bd_md_max_brw);
+       LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES);
+       LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV |
+                                LNET_MD_PHYS)));
+
+       md->options |= LNET_MD_KIOV;
+       md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV);
+       md->length = min_t(unsigned int, LNET_MAX_IOV, md->length);
+       if (desc->bd_enc_iov)
+               md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV];
+       else
+               md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV];
+}
+
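+/* Append one page fragment to the descriptor's kiov array; the caller
+ * guarantees there is room for another entry. */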
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+                         int pageoffset, int len)
+{
+       lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count];
+
+       kiov->kiov_page = page;
+       kiov->kiov_offset = pageoffset;
+       kiov->kiov_len = len;
+
+       desc->bd_iov_count++;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
new file mode 100644 (file)
index 0000000..ef5269a
--- /dev/null
@@ -0,0 +1,763 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/pinger.c
+ *
+ * Portal-RPC reconnection and replay operations, for use in recovery.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include "ptlrpc_internal.h"
+
+static int suppress_pings;
+CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings");
+
+struct mutex pinger_mutex;
+static LIST_HEAD(pinger_imports);
+static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list);
+
+int ptlrpc_pinger_suppress_pings(void)
+{
+       return suppress_pings;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings);
+
+struct ptlrpc_request *
+ptlrpc_prep_ping(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+
+       req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
+                                       LUSTRE_OBD_VERSION, OBD_PING);
+       if (req) {
+               ptlrpc_request_set_replen(req);
+               req->rq_no_resend = req->rq_no_delay = 1;
+       }
+       return req;
+}
+
+int ptlrpc_obd_ping(struct obd_device *obd)
+{
+       int rc;
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       req = ptlrpc_prep_ping(obd->u.cli.cl_import);
+       if (req == NULL)
+               RETURN(-ENOMEM);
+
+       req->rq_send_state = LUSTRE_IMP_FULL;
+
+       rc = ptlrpc_queue_wait(req);
+
+       ptlrpc_req_finished(req);
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_obd_ping);
+
+int ptlrpc_ping(struct obd_import *imp)
+{
+       struct ptlrpc_request *req;
+       ENTRY;
+
+       req = ptlrpc_prep_ping(imp);
+       if (req == NULL) {
+               CERROR("OOM trying to ping %s->%s\n",
+                      imp->imp_obd->obd_uuid.uuid,
+                      obd2cli_tgt(imp->imp_obd));
+               RETURN(-ENOMEM);
+       }
+
+       DEBUG_REQ(D_INFO, req, "pinging %s->%s",
+                 imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+
+       RETURN(0);
+}
+
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
+{
+       int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
+
+       if (imp->imp_state == LUSTRE_IMP_DISCON) {
+               int dtime = max_t(int, CONNECTION_SWITCH_MIN,
+                                 AT_OFF ? 0 :
+                                 at_get(&imp->imp_at.iat_net_latency));
+               time = min(time, dtime);
+       }
+       imp->imp_next_ping = cfs_time_shift(time);
+}
+
+void ptlrpc_ping_import_soon(struct obd_import *imp)
+{
+       imp->imp_next_ping = cfs_time_current();
+}
+
+static inline int imp_is_deactive(struct obd_import *imp)
+{
+       return (imp->imp_deactive ||
+               OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE));
+}
+
+static inline int ptlrpc_next_reconnect(struct obd_import *imp)
+{
+       if (imp->imp_server_timeout)
+               return cfs_time_shift(obd_timeout / 2);
+       else
+               return cfs_time_shift(obd_timeout);
+}
+
+static atomic_t suspend_timeouts = ATOMIC_INIT(0);
+static cfs_time_t suspend_wakeup_time = 0;
+
+cfs_duration_t pinger_check_timeout(cfs_time_t time)
+{
+       struct timeout_item *item;
+       cfs_time_t timeout = PING_INTERVAL;
+
+       /* The timeout list is sorted by increasing timeout, so the first
+        * entry, if any, carries the smallest timeout; it is all we need. */
+       mutex_lock(&pinger_mutex);
+       list_for_each_entry(item, &timeout_list, ti_chain) {
+               int ti_timeout = item->ti_timeout;
+               if (timeout > ti_timeout)
+                       timeout = ti_timeout;
+               break;
+       }
+       mutex_unlock(&pinger_mutex);
+
+       return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)),
+                                        cfs_time_current());
+}
+
+static wait_queue_head_t suspend_timeouts_waitq;
+
+cfs_time_t ptlrpc_suspend_wakeup_time(void)
+{
+       return suspend_wakeup_time;
+}
+
+void ptlrpc_deactivate_timeouts(struct obd_import *imp)
+{
+       /* XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+       if (imp->imp_no_timeout)
+               return;
+       imp->imp_no_timeout = 1;
+       atomic_inc(&suspend_timeouts);
+       CDEBUG(D_HA|D_WARNING, "deactivate timeouts %u\n",
+              atomic_read(&suspend_timeouts));
+#endif
+}
+
+void ptlrpc_activate_timeouts(struct obd_import *imp)
+{
+       /* XXX: disabled for now, will be replaced by adaptive timeouts */
+#if 0
+       if (!imp->imp_no_timeout)
+               return;
+       imp->imp_no_timeout = 0;
+       LASSERT(atomic_read(&suspend_timeouts) > 0);
+       if (atomic_dec_and_test(&suspend_timeouts)) {
+               suspend_wakeup_time = cfs_time_current();
+               wake_up(&suspend_timeouts_waitq);
+       }
+       CDEBUG(D_HA|D_WARNING, "activate timeouts %u\n",
+              atomic_read(&suspend_timeouts));
+#endif
+}
+
+int ptlrpc_check_suspend(void)
+{
+       if (atomic_read(&suspend_timeouts))
+               return 1;
+       return 0;
+}
+
+int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req)
+{
+       struct l_wait_info lwi;
+
+       if (atomic_read(&suspend_timeouts)) {
+               DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout",
+                         atomic_read(&suspend_timeouts));
+               lwi = LWI_INTR(NULL, NULL);
+               l_wait_event(suspend_timeouts_waitq,
+                            atomic_read(&suspend_timeouts) == 0, &lwi);
+               DEBUG_REQ(D_NET, req, "-- recharge regular timeout");
+               return 1;
+       }
+       return 0;
+}
+
+
+static bool ir_up;
+
+void ptlrpc_pinger_ir_up(void)
+{
+       CDEBUG(D_HA, "IR up\n");
+       ir_up = true;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_up);
+
+void ptlrpc_pinger_ir_down(void)
+{
+       CDEBUG(D_HA, "IR down\n");
+       ir_up = false;
+}
+EXPORT_SYMBOL(ptlrpc_pinger_ir_down);
+
+static void ptlrpc_pinger_process_import(struct obd_import *imp,
+                                        unsigned long this_ping)
+{
+       int level;
+       int force;
+       int force_next;
+       int suppress;
+
+       spin_lock(&imp->imp_lock);
+
+       level = imp->imp_state;
+       force = imp->imp_force_verify;
+       force_next = imp->imp_force_next_verify;
+       /*
+        * This will be used below only if the import is "FULL".
+        */
+       suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS);
+
+       imp->imp_force_verify = 0;
+
+       if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) &&
+           !force) {
+               spin_unlock(&imp->imp_lock);
+               return;
+       }
+
+       imp->imp_force_next_verify = 0;
+
+       spin_unlock(&imp->imp_lock);
+
+       CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u "
+              "force %u force_next %u deactive %u pingable %u suppress %u\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+              ptlrpc_import_state_name(level), level, force, force_next,
+              imp->imp_deactive, imp->imp_pingable, suppress);
+
+       if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) {
+               /* wait for a while before trying recovery again */
+               imp->imp_next_ping = ptlrpc_next_reconnect(imp);
+               if (!imp->imp_no_pinger_recover)
+                       ptlrpc_initiate_recovery(imp);
+       } else if (level != LUSTRE_IMP_FULL ||
+                  imp->imp_obd->obd_no_recov ||
+                  imp_is_deactive(imp)) {
+               CDEBUG(D_HA, "%s->%s: not pinging (in recovery "
+                      "or recovery disabled: %s)\n",
+                      imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd),
+                      ptlrpc_import_state_name(level));
+       } else if ((imp->imp_pingable && !suppress) || force_next || force) {
+               ptlrpc_ping(imp);
+       }
+}
+
+static int ptlrpc_pinger_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg;
+       ENTRY;
+
+       /* Record that the thread is running */
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       /* And now, loop forever, pinging as needed. */
+       while (1) {
+               cfs_time_t this_ping = cfs_time_current();
+               struct l_wait_info lwi;
+               cfs_duration_t time_to_next_wake;
+               struct timeout_item *item;
+               struct list_head *iter;
+
+               mutex_lock(&pinger_mutex);
+               list_for_each_entry(item, &timeout_list, ti_chain) {
+                       item->ti_cb(item, item->ti_cb_data);
+               }
+               list_for_each(iter, &pinger_imports) {
+                       struct obd_import *imp =
+                               list_entry(iter, struct obd_import,
+                                              imp_pinger_chain);
+
+                       ptlrpc_pinger_process_import(imp, this_ping);
+                       /* obd_timeout might have changed */
+                       if (imp->imp_pingable && imp->imp_next_ping &&
+                           cfs_time_after(imp->imp_next_ping,
+                                          cfs_time_add(this_ping,
+                                                       cfs_time_seconds(PING_INTERVAL))))
+                               ptlrpc_update_next_ping(imp, 0);
+               }
+               mutex_unlock(&pinger_mutex);
+               /* update memory usage info */
+               obd_update_maxusage();
+
+               /* Wait until the next ping time, or until we're stopped. */
+               time_to_next_wake = pinger_check_timeout(this_ping);
+               /* The ping sent by ptlrpc_send_rpc may get sent out
+                * say .01 second after this.
+                * ptlrpc_pinger_sending_on_import will then set the
+                * next ping time to next_ping + .01 sec, which means
+                * we will SKIP the next ping at next_ping, and the
+                * ping will get sent 2 timeouts from now!  Beware. */
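+               /*
+                * Worked example (illustrative figures, not from the
+                * original patch): with next_ping = T and a 25s interval,
+                * a ping actually sent at T + 0.01s re-arms imp_next_ping
+                * to ~T + 25.01s, so the wakeup at T + 25s finds nothing
+                * to send and the next ping only goes out around T + 50s.
+                */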
+               CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" ("
+                      CFS_TIME_T")\n", time_to_next_wake,
+                      cfs_time_add(this_ping, cfs_time_seconds(PING_INTERVAL)));
+               if (time_to_next_wake > 0) {
+                       lwi = LWI_TIMEOUT(max_t(cfs_duration_t,
+                                               time_to_next_wake,
+                                               cfs_time_seconds(1)),
+                                         NULL, NULL);
+                       l_wait_event(thread->t_ctl_waitq,
+                                    thread_is_stopping(thread) ||
+                                    thread_is_event(thread),
+                                    &lwi);
+                       if (thread_test_and_clear_flags(thread, SVC_STOPPING)) {
+                               EXIT;
+                               break;
+                       } else {
+                               /* woken after adding import to reset timer */
+                               thread_test_and_clear_flags(thread, SVC_EVENT);
+                       }
+               }
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+
+       CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid());
+       return 0;
+}
+
+static struct ptlrpc_thread *pinger_thread = NULL;
+
+int ptlrpc_start_pinger(void)
+{
+       struct l_wait_info lwi = { 0 };
+       int rc;
+       ENTRY;
+
+       if (pinger_thread != NULL)
+               RETURN(-EALREADY);
+
+       OBD_ALLOC_PTR(pinger_thread);
+       if (pinger_thread == NULL)
+               RETURN(-ENOMEM);
+       init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+       init_waitqueue_head(&suspend_timeouts_waitq);
+
+       strcpy(pinger_thread->t_name, "ll_ping");
+
+       /* Spawn the pinger thread with kthread_run() and wait below until
+        * it signals that it is running. */
+       rc = PTR_ERR(kthread_run(ptlrpc_pinger_main,
+                                pinger_thread, pinger_thread->t_name));
+       if (IS_ERR_VALUE(rc)) {
+               CERROR("cannot start thread: %d\n", rc);
+               OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+               pinger_thread = NULL;
+               RETURN(rc);
+       }
+       l_wait_event(pinger_thread->t_ctl_waitq,
+                    thread_is_running(pinger_thread), &lwi);
+
+       if (suppress_pings)
+               CWARN("Pings will be suppressed at the request of the "
+                     "administrator.  The configuration shall meet the "
+                     "additional requirements described in the manual.  "
+                     "(Search for the \"suppress_pings\" kernel module "
+                     "parameter.)\n");
+
+       RETURN(0);
+}
+
+int ptlrpc_pinger_remove_timeouts(void);
+
+int ptlrpc_stop_pinger(void)
+{
+       struct l_wait_info lwi = { 0 };
+       int rc = 0;
+       ENTRY;
+
+       if (pinger_thread == NULL)
+               RETURN(-EALREADY);
+
+       ptlrpc_pinger_remove_timeouts();
+       mutex_lock(&pinger_mutex);
+       thread_set_flags(pinger_thread, SVC_STOPPING);
+       wake_up(&pinger_thread->t_ctl_waitq);
+       mutex_unlock(&pinger_mutex);
+
+       l_wait_event(pinger_thread->t_ctl_waitq,
+                    thread_is_stopped(pinger_thread), &lwi);
+
+       OBD_FREE_PTR(pinger_thread);
+       pinger_thread = NULL;
+       RETURN(rc);
+}
+
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
+{
+       ptlrpc_update_next_ping(imp, 0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import);
+
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+       ptlrpc_update_next_ping(imp, 1);
+       LASSERT(spin_is_locked(&imp->imp_lock));
+       /*
+        * Avoid reading stale imp_connect_data.  When not sure if pings are
+        * expected or not on next connection, we assume they are not and force
+        * one anyway to guarantee the chance of updating
+        * imp_peer_committed_transno.
+        */
+       if (imp->imp_state != LUSTRE_IMP_FULL ||
+           OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS))
+               imp->imp_force_next_verify = 1;
+}
+
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+       ENTRY;
+       if (!list_empty(&imp->imp_pinger_chain))
+               RETURN(-EALREADY);
+
+       mutex_lock(&pinger_mutex);
+       CDEBUG(D_HA, "adding pingable import %s->%s\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       /* if we add to pinger we want recovery on this import */
+       imp->imp_obd->obd_no_recov = 0;
+       ptlrpc_update_next_ping(imp, 0);
+       /* XXX: the import list is not kept sorted by next ping time */
+       list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
+       class_import_get(imp);
+
+       ptlrpc_pinger_wake_up();
+       mutex_unlock(&pinger_mutex);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_add_import);
+
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+       ENTRY;
+       if (list_empty(&imp->imp_pinger_chain))
+               RETURN(-ENOENT);
+
+       mutex_lock(&pinger_mutex);
+       list_del_init(&imp->imp_pinger_chain);
+       CDEBUG(D_HA, "removing pingable import %s->%s\n",
+              imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
+       /* if we remove from pinger we don't want recovery on this import */
+       imp->imp_obd->obd_no_recov = 1;
+       class_import_put(imp);
+       mutex_unlock(&pinger_mutex);
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_pinger_del_import);
+
+/**
+ * Allocate a new timeout item; once registered on the pinger's timeout
+ * list, its callback is invoked when the timeout expires.
+ */
+struct timeout_item *ptlrpc_new_timeout(int time, enum timeout_event event,
+                                       timeout_cb_t cb, void *data)
+{
+       struct timeout_item *ti;
+
+       OBD_ALLOC_PTR(ti);
+       if (!ti)
+               return NULL;
+
+       INIT_LIST_HEAD(&ti->ti_obd_list);
+       INIT_LIST_HEAD(&ti->ti_chain);
+       ti->ti_timeout = time;
+       ti->ti_event = event;
+       ti->ti_cb = cb;
+       ti->ti_cb_data = data;
+
+       return ti;
+}
+
+/**
+ * Register a timeout event with the pinger thread.
+ * Note: the timeout list is sorted in order of increasing timeout value.
+ */
+static struct timeout_item *
+ptlrpc_pinger_register_timeout(int time, enum timeout_event event,
+                              timeout_cb_t cb, void *data)
+{
+       struct timeout_item *item, *tmp;
+
+       LASSERT(mutex_is_locked(&pinger_mutex));
+
+       list_for_each_entry(item, &timeout_list, ti_chain)
+               if (item->ti_event == event)
+                       goto out;
+
+       item = ptlrpc_new_timeout(time, event, cb, data);
+       if (item) {
+               list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) {
+                       if (tmp->ti_timeout < time) {
+                               list_add(&item->ti_chain, &tmp->ti_chain);
+                               goto out;
+                       }
+               }
+               list_add(&item->ti_chain, &timeout_list);
+       }
+out:
+       return item;
+}
+
+/* Add a client_obd to the timeout event list; when the timeout (@time)
+ * expires, the callback (@cb) will be called.
+ */
+int ptlrpc_add_timeout_client(int time, enum timeout_event event,
+                             timeout_cb_t cb, void *data,
+                             struct list_head *obd_list)
+{
+       struct timeout_item *ti;
+
+       mutex_lock(&pinger_mutex);
+       ti = ptlrpc_pinger_register_timeout(time, event, cb, data);
+       if (!ti) {
+               mutex_unlock(&pinger_mutex);
+               return -EINVAL;
+       }
+       list_add(obd_list, &ti->ti_obd_list);
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_add_timeout_client);
+
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
+                             enum timeout_event event)
+{
+       struct timeout_item *ti = NULL, *item;
+
+       if (list_empty(obd_list))
+               return 0;
+       mutex_lock(&pinger_mutex);
+       list_del_init(obd_list);
+       /*
+        * If no obds remain attached to the timeout event,
+        * remove this timeout event from the pinger.
+        */
+       list_for_each_entry(item, &timeout_list, ti_chain) {
+               if (item->ti_event == event) {
+                       ti = item;
+                       break;
+               }
+       }
+       LASSERTF(ti != NULL, "ti is NULL!\n");
+       if (list_empty(&ti->ti_obd_list)) {
+               list_del(&ti->ti_chain);
+               OBD_FREE_PTR(ti);
+       }
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_del_timeout_client);
+
+int ptlrpc_pinger_remove_timeouts(void)
+{
+       struct timeout_item *item, *tmp;
+
+       mutex_lock(&pinger_mutex);
+       list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) {
+               LASSERT(list_empty(&item->ti_obd_list));
+               list_del(&item->ti_chain);
+               OBD_FREE_PTR(item);
+       }
+       mutex_unlock(&pinger_mutex);
+       return 0;
+}
+
+void ptlrpc_pinger_wake_up(void)
+{
+       thread_add_flags(pinger_thread, SVC_EVENT);
+       wake_up(&pinger_thread->t_ctl_waitq);
+}
+
+/* Ping evictor thread */
+#define PET_READY     1
+#define PET_TERMINATE 2
+
+static int            pet_refcount = 0;
+static int            pet_state;
+static wait_queue_head_t       pet_waitq;
+LIST_HEAD(pet_list);
+static DEFINE_SPINLOCK(pet_lock);
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+       struct obd_device *obd;
+
+       spin_lock(&pet_lock);
+       if (pet_state != PET_READY) {
+               /* eventually the new obd will call here again. */
+               spin_unlock(&pet_lock);
+               return 1;
+       }
+
+       obd = class_exp2obd(exp);
+       if (list_empty(&obd->obd_evict_list)) {
+               class_incref(obd, "evictor", obd);
+               list_add(&obd->obd_evict_list, &pet_list);
+       }
+       spin_unlock(&pet_lock);
+
+       wake_up(&pet_waitq);
+       return 0;
+}
+
+static int ping_evictor_main(void *arg)
+{
+       struct obd_device *obd;
+       struct obd_export *exp;
+       struct l_wait_info lwi = { 0 };
+       time_t expire_time;
+       ENTRY;
+
+       unshare_fs_struct();
+
+       CDEBUG(D_HA, "Starting Ping Evictor\n");
+       pet_state = PET_READY;
+       while (1) {
+               l_wait_event(pet_waitq, (!list_empty(&pet_list)) ||
+                            (pet_state == PET_TERMINATE), &lwi);
+
+               /* loop until all obds have been removed */
+               if ((pet_state == PET_TERMINATE) && list_empty(&pet_list))
+                       break;
+
+               /* We only get here if pet_list is non-empty; the bottom of
+                * this loop is the only place entries are removed, so the
+                * lock is arguably not strictly necessary. */
+               spin_lock(&pet_lock);
+               obd = list_entry(pet_list.next, struct obd_device,
+                                    obd_evict_list);
+               spin_unlock(&pet_lock);
+
+               expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT;
+
+               CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+                      obd->obd_name, expire_time);
+
+               /* Exports can't be deleted out of the list while we hold
+                * the obd lock (class_unlink_export), which means we can't
+                * lose the last ref on the export.  If they've already been
+                * removed from the list, we won't find them here. */
+               spin_lock(&obd->obd_dev_lock);
+               while (!list_empty(&obd->obd_exports_timed)) {
+                       exp = list_entry(obd->obd_exports_timed.next,
+                                            struct obd_export,
+                                            exp_obd_chain_timed);
+                       if (expire_time > exp->exp_last_request_time) {
+                               class_export_get(exp);
+                               spin_unlock(&obd->obd_dev_lock);
+                               LCONSOLE_WARN("%s: haven't heard from client %s"
+                                             " (at %s) in %ld seconds. I think"
+                                             " it's dead, and I am evicting"
+                                             " it. exp %p, cur %ld expire %ld"
+                                             " last %ld\n",
+                                             obd->obd_name,
+                                             obd_uuid2str(&exp->exp_client_uuid),
+                                             obd_export_nid2str(exp),
+                                             (long)(cfs_time_current_sec() -
+                                                    exp->exp_last_request_time),
+                                             exp, (long)cfs_time_current_sec(),
+                                             (long)expire_time,
+                                             (long)exp->exp_last_request_time);
+                               CDEBUG(D_HA, "Last request was at %ld\n",
+                                      exp->exp_last_request_time);
+                               class_fail_export(exp);
+                               class_export_put(exp);
+                               spin_lock(&obd->obd_dev_lock);
+                       } else {
+                               /* List is sorted, so everyone below is ok */
+                               break;
+                       }
+               }
+               spin_unlock(&obd->obd_dev_lock);
+
+               spin_lock(&pet_lock);
+               list_del_init(&obd->obd_evict_list);
+               spin_unlock(&pet_lock);
+
+               class_decref(obd, "evictor", obd);
+       }
+       CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+       RETURN(0);
+}
+
+void ping_evictor_start(void)
+{
+       struct task_struct *task;
+
+       if (++pet_refcount > 1)
+               return;
+
+       init_waitqueue_head(&pet_waitq);
+
+       task = kthread_run(ping_evictor_main, NULL, "ll_evictor");
+       if (IS_ERR(task)) {
+               pet_refcount--;
+               CERROR("Cannot start ping evictor thread: %ld\n",
+                       PTR_ERR(task));
+       }
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+       if (--pet_refcount > 0)
+               return;
+
+       pet_state = PET_TERMINATE;
+       wake_up(&pet_waitq);
+}
+EXPORT_SYMBOL(ping_evictor_stop);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644 (file)
index 0000000..ab36347
--- /dev/null
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+#include "../ldlm/ldlm_internal.h"
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+struct ptlrpc_request_set;
+extern int test_req_buffer_pressure;
+extern struct mutex ptlrpc_all_services_mutex;
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait);
+/* ptlrpcd.c */
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc);
+
+/* client.c */
+struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
+                                        unsigned type, unsigned portal);
+void ptlrpc_init_xid(void);
+
+/* events.c */
+int ptlrpc_init_portals(void);
+void ptlrpc_exit_portals(void);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
+void lustre_assert_wire_constants(void);
+int ptlrpc_import_in_recovery(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
+void ptlrpc_handle_failed_import(struct obd_import *imp);
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
+void ptlrpc_initiate_recovery(struct obd_import *imp);
+
+int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset);
+int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset);
+
+#ifdef LPROCFS
+void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry,
+                                    struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount);
+void ptlrpc_lprocfs_do_request_stat(struct ptlrpc_request *req,
+                                   long q_usec, long work_usec);
+#else
+#define ptlrpc_lprocfs_register_service(params...) do {} while (0)
+#define ptlrpc_lprocfs_unregister_service(params...) do {} while (0)
+#define ptlrpc_lprocfs_rpc_sent(params...) do {} while (0)
+#define ptlrpc_lprocfs_do_request_stat(params...) do {} while (0)
+#endif /* LPROCFS */
+
+/* NRS */
+
+/**
+ * NRS core object.
+ *
+ * Holds NRS core fields.
+ */
+struct nrs_core {
+       /**
+        * Protects nrs_core::nrs_policies, serializes external policy
+        * registration/unregistration, and NRS core lprocfs operations.
+        */
+       struct mutex nrs_mutex;
+       /**
+        * List of all policy descriptors registered with NRS core; protected
+        * by nrs_core::nrs_mutex.
+        */
+       struct list_head nrs_policies;
+
+};
+
+int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc);
+void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc);
+
+void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt,
+                              struct ptlrpc_request *req, bool hp);
+void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req);
+void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt,
+                       struct ptlrpc_request *req, bool hp);
+
+struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp,
+                          bool peek, bool force);
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp,
+                         bool force)
+{
+       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force);
+}
+
+static inline struct ptlrpc_request *
+ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false);
+}
+
+void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req);
+bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp);
+
+int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc,
+                             enum ptlrpc_nrs_queue_type queue, char *name,
+                             enum ptlrpc_nrs_ctl opc, bool single, void *arg);
+
+int ptlrpc_nrs_init(void);
+void ptlrpc_nrs_fini(void);
+
+static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nrs_hp != NULL;
+}
+
+static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc)
+{
+       /**
+        * If the first service partition has an HP NRS head, all service
+        * partitions will.
+        */
+       return nrs_svcpt_has_hp(svc->srv_parts[0]);
+}
+
+static inline
+struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp)
+{
+       LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt)));
+       return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg;
+}
+
+static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt->scp_cpt;
+}
+
+static inline
+struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt->scp_service;
+}
+
+static inline
+struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy)
+{
+       return policy->pol_nrs->nrs_svcpt;
+}
+
+static inline
+struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy)
+{
+       return nrs_pol2svc(policy)->srv_cptable;
+}
+
+static inline struct ptlrpc_nrs_resource *
+nrs_request_resource(struct ptlrpc_nrs_request *nrq)
+{
+       LASSERT(nrq->nr_initialized);
+       LASSERT(!nrq->nr_finalized);
+
+       return nrq->nr_res_ptrs[nrq->nr_res_idx];
+}
+
+static inline
+struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq)
+{
+       return nrs_request_resource(nrq)->res_policy;
+}
+
+#define NRS_LPROCFS_QUANTUM_NAME_REG   "reg_quantum:"
+#define NRS_LPROCFS_QUANTUM_NAME_HP    "hp_quantum:"
+
+/**
+ * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum.
+ */
+#define LPROCFS_NRS_QUANTUM_MAX                65535
+
+/**
+ * Max valid command string is the size of the labels, plus "65535" twice, plus
+ * a separating space character.
+ */
+#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD                                        \
+ sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " "  \
+       NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX))
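+/* e.g. the longest valid command string, as sized by the macro above, is
+ * "reg_quantum:65535 hp_quantum:65535" */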
+
+/* recovd_thread.c */
+
+int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink);
+
+/* pers.c */
+void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc,
+                        int mdcnt);
+void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
+                         int pageoffset, int len);
+
+/* pack_generic.c */
+struct ptlrpc_reply_state *
+lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt);
+void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs);
+
+/* pinger.c */
+int ptlrpc_start_pinger(void);
+int ptlrpc_stop_pinger(void);
+void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
+void ptlrpc_pinger_wake_up(void);
+void ptlrpc_ping_import_soon(struct obd_import *imp);
+int ping_evictor_wake(struct obd_export *exp);
+
+/* sec_null.c */
+int  sptlrpc_null_init(void);
+void sptlrpc_null_fini(void);
+
+/* sec_plain.c */
+int  sptlrpc_plain_init(void);
+void sptlrpc_plain_fini(void);
+
+/* sec_bulk.c */
+int  sptlrpc_enc_pool_init(void);
+void sptlrpc_enc_pool_fini(void);
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v);
+
+/* sec_lproc.c */
+int  sptlrpc_lproc_init(void);
+void sptlrpc_lproc_fini(void);
+
+/* sec_gc.c */
+int sptlrpc_gc_init(void);
+void sptlrpc_gc_fini(void);
+
+/* sec_config.c */
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+                               enum lustre_sec_part to,
+                               struct obd_uuid *target,
+                               lnet_nid_t nid,
+                               struct sptlrpc_flavor *sf);
+int  sptlrpc_conf_init(void);
+void sptlrpc_conf_fini(void);
+
+/* sec.c */
+int  sptlrpc_init(void);
+void sptlrpc_fini(void);
+
+static inline int ll_rpc_recoverable_error(int rc)
+{
+       return (rc == -ENOTCONN || rc == -ENODEV);
+}
+
+static inline int tgt_mod_init(void)
+{
+       return 0;
+}
+
+static inline void tgt_mod_exit(void)
+{
+       return;
+}
+
+static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set)
+{
+       if (atomic_dec_and_test(&set->set_refcount))
+               OBD_FREE_PTR(set);
+}
+#endif /* PTLRPC_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644 (file)
index 0000000..f6ea80f
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_req_layout.h>
+
+#include "ptlrpc_internal.h"
+
+extern spinlock_t ptlrpc_last_xid_lock;
+#if RS_DEBUG
+extern spinlock_t ptlrpc_rs_debug_lock;
+#endif
+extern struct mutex pinger_mutex;
+extern struct mutex ptlrpcd_mutex;
+
+int __init ptlrpc_init(void)
+{
+       int rc, cleanup_phase = 0;
+       ENTRY;
+
+       lustre_assert_wire_constants();
+#if RS_DEBUG
+       spin_lock_init(&ptlrpc_rs_debug_lock);
+#endif
+       mutex_init(&ptlrpc_all_services_mutex);
+       mutex_init(&pinger_mutex);
+       mutex_init(&ptlrpcd_mutex);
+       ptlrpc_init_xid();
+
+       rc = req_layout_init();
+       if (rc)
+               RETURN(rc);
+
+       rc = ptlrpc_hr_init();
+       if (rc)
+               RETURN(rc);
+
+       cleanup_phase = 1;
+
+       rc = ptlrpc_init_portals();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 2;
+
+       rc = ptlrpc_connection_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 3;
+
+       ptlrpc_put_connection_superhack = ptlrpc_connection_put;
+
+       rc = ptlrpc_start_pinger();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 4;
+
+       rc = ldlm_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       cleanup_phase = 5;
+
+       rc = sptlrpc_init();
+       if (rc)
+               GOTO(cleanup, rc);
+
+       cleanup_phase = 7;
+       rc = ptlrpc_nrs_init();
+       if (rc)
+               GOTO(cleanup, rc);
+
+       cleanup_phase = 8;
+       rc = tgt_mod_init();
+       if (rc)
+               GOTO(cleanup, rc);
+       RETURN(0);
+
+cleanup:
+       switch (cleanup_phase) {
+       case 8:
+               ptlrpc_nrs_fini();
+       case 7:
+               sptlrpc_fini();
+       case 5:
+               ldlm_exit();
+       case 4:
+               ptlrpc_stop_pinger();
+       case 3:
+               ptlrpc_connection_fini();
+       case 2:
+               ptlrpc_exit_portals();
+       case 1:
+               ptlrpc_hr_fini();
+               req_layout_fini();
+       default: ;
+       }
+
+       return rc;
+}
+
+static void __exit ptlrpc_exit(void)
+{
+       tgt_mod_exit();
+       ptlrpc_nrs_fini();
+       sptlrpc_fini();
+       ldlm_exit();
+       ptlrpc_stop_pinger();
+       ptlrpc_exit_portals();
+       ptlrpc_hr_fini();
+       ptlrpc_connection_fini();
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Request Processor and Lock Management");
+MODULE_LICENSE("GPL");
+
+cfs_module(ptlrpc, "1.0.0", ptlrpc_init, ptlrpc_exit);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c
new file mode 100644 (file)
index 0000000..5a66a1b
--- /dev/null
@@ -0,0 +1,827 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/ptlrpcd.c
+ */
+
+/** \defgroup ptlrpcd PortalRPC daemon
+ *
+ * ptlrpcd is a special thread with its own request set, to which other
+ * users may add requests when they do not want to wait for completion.
+ * ptlrpcd takes care of sending such requests, processing their replies,
+ * and calling completion callbacks as necessary.
+ * The callbacks are called directly from ptlrpcd context.
+ * It is important never to block significantly (especially on RPCs!)
+ * within such a completion handler, or a deadlock can occur: ptlrpcd
+ * enters a callback that sends another RPC and waits for it to return;
+ * while it waits, ptlrpcd is completely blocked, so e.g. if an import
+ * fails, recovery cannot progress because connection requests are also
+ * sent by ptlrpcd.
+ *
+ * @{
+ */
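+
+/*
+ * Illustrative usage sketch (not part of the original patch): a caller
+ * typically queues an RPC on ptlrpcd and consumes the reply from a
+ * non-blocking completion callback, along the lines of:
+ *
+ *     static int my_interpret(const struct lu_env *env,
+ *                             struct ptlrpc_request *req,
+ *                             void *args, int rc)
+ *     {
+ *             // Runs in ptlrpcd context: must not block or
+ *             // send-and-wait on another RPC here.
+ *             return rc;
+ *     }
+ *
+ *     req->rq_interpret_reply = my_interpret;
+ *     ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
+ *
+ * The callback signature is assumed from contemporary ptlrpc usage and
+ * may differ in detail.
+ */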
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <lustre_net.h>
+#include <lustre_lib.h>
+
+#include <lustre_ha.h>
+#include <obd_class.h>   /* for obd_zombie */
+#include <obd_support.h> /* for OBD_FAIL_CHECK */
+#include <cl_object.h> /* cl_env_{get,put}() */
+#include <lprocfs_status.h>
+
+#include "ptlrpc_internal.h"
+
+struct ptlrpcd {
+       int             pd_size;
+       int             pd_index;
+       int             pd_nthreads;
+       struct ptlrpcd_ctl pd_thread_rcv;
+       struct ptlrpcd_ctl pd_threads[0];
+};
+
+static int max_ptlrpcds;
+CFS_MODULE_PARM(max_ptlrpcds, "i", int, 0644,
+               "Max ptlrpcd thread count to be started.");
+
+static int ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+CFS_MODULE_PARM(ptlrpcd_bind_policy, "i", int, 0644,
+               "Ptlrpcd threads binding mode.");
+static struct ptlrpcd *ptlrpcds;
+
+struct mutex ptlrpcd_mutex;
+static int ptlrpcd_users = 0;
+
+void ptlrpcd_wake(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_set *rq_set = req->rq_set;
+
+       LASSERT(rq_set != NULL);
+
+       wake_up(&rq_set->set_waitq);
+}
+EXPORT_SYMBOL(ptlrpcd_wake);
+
+static struct ptlrpcd_ctl *
+ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index)
+{
+       int idx = 0;
+
+       if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL)
+               return &ptlrpcds->pd_thread_rcv;
+
+       switch (policy) {
+       case PDL_POLICY_SAME:
+               idx = smp_processor_id() % ptlrpcds->pd_nthreads;
+               break;
+       case PDL_POLICY_LOCAL:
+               /* Until the CPU partition patches are available, process it
+                * the same as "PDL_POLICY_ROUND". */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix this code to use new CPU partition APIs"
+# endif
+               /* Fall through to PDL_POLICY_ROUND until the CPU
+                * partition patches are available. */
+               index = -1;
+       case PDL_POLICY_PREFERRED:
+               if (index >= 0 && index < num_online_cpus()) {
+                       idx = index % ptlrpcds->pd_nthreads;
+                       break;
+               }
+               /* Fall through to PDL_POLICY_ROUND for bad index. */
+       default:
+               /* Fall through to PDL_POLICY_ROUND for unknown policy. */
+       case PDL_POLICY_ROUND:
+               /* We do not care whether it is strict load balance. */
+               idx = ptlrpcds->pd_index + 1;
+               if (idx == smp_processor_id())
+                       idx++;
+               idx %= ptlrpcds->pd_nthreads;
+               ptlrpcds->pd_index = idx;
+               break;
+       }
+
+       return &ptlrpcds->pd_threads[idx];
+}
+
+/**
+ * Move all requests from an existing request set to the ptlrpcd queue.
+ * All requests from the set must be in phase RQ_PHASE_NEW.
+ */
+void ptlrpcd_add_rqset(struct ptlrpc_request_set *set)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpcd_ctl *pc;
+       struct ptlrpc_request_set *new;
+       int count, i;
+
+       pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1);
+       new = pc->pc_set;
+
+       list_for_each_safe(pos, tmp, &set->set_requests) {
+               struct ptlrpc_request *req =
+                       list_entry(pos, struct ptlrpc_request,
+                                      rq_set_chain);
+
+               LASSERT(req->rq_phase == RQ_PHASE_NEW);
+               req->rq_set = new;
+               req->rq_queued_time = cfs_time_current();
+       }
+
+       spin_lock(&new->set_new_req_lock);
+       list_splice_init(&set->set_requests, &new->set_new_requests);
+       i = atomic_read(&set->set_remaining);
+       count = atomic_add_return(i, &new->set_new_count);
+       atomic_set(&set->set_remaining, 0);
+       spin_unlock(&new->set_new_req_lock);
+       if (count == i) {
+               wake_up(&new->set_waitq);
+
+               /* XXX: It may be unnecessary to wake up all the partners.
+                *      But to guarantee the async RPCs are processed ASAP,
+                *      we have no better choice. It may be fixed in future. */
+               for (i = 0; i < pc->pc_npartners; i++)
+                       wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
+       }
+}
+EXPORT_SYMBOL(ptlrpcd_add_rqset);
+
+/**
+ * Return transferred RPCs count.
+ */
+static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des,
+                              struct ptlrpc_request_set *src)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+       int rc = 0;
+
+       spin_lock(&src->set_new_req_lock);
+       if (likely(!list_empty(&src->set_new_requests))) {
+               list_for_each_safe(pos, tmp, &src->set_new_requests) {
+                       req = list_entry(pos, struct ptlrpc_request,
+                                            rq_set_chain);
+                       req->rq_set = des;
+               }
+               list_splice_init(&src->set_new_requests,
+                                    &des->set_requests);
+               rc = atomic_read(&src->set_new_count);
+               atomic_add(rc, &des->set_remaining);
+               atomic_set(&src->set_new_count, 0);
+       }
+       spin_unlock(&src->set_new_req_lock);
+       return rc;
+}
+
+/**
+ * Requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set().
+ */
+void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx)
+{
+       struct ptlrpcd_ctl *pc;
+
+       if (req->rq_reqmsg)
+               lustre_msg_set_jobid(req->rq_reqmsg, NULL);
+
+       spin_lock(&req->rq_lock);
+       if (req->rq_invalid_rqset) {
+               struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5),
+                                                    back_to_sleep, NULL);
+
+               req->rq_invalid_rqset = 0;
+               spin_unlock(&req->rq_lock);
+               l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi);
+       } else if (req->rq_set) {
+               /* If we have a valid "rq_set", just reuse it to avoid
+                * double-linking the request. */
+               LASSERT(req->rq_phase == RQ_PHASE_NEW);
+               LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY);
+
+               /* ptlrpc_check_set will decrease the count */
+               atomic_inc(&req->rq_set->set_remaining);
+               spin_unlock(&req->rq_lock);
+               wake_up(&req->rq_set->set_waitq);
+               return;
+       } else {
+               spin_unlock(&req->rq_lock);
+       }
+
+       pc = ptlrpcd_select_pc(req, policy, idx);
+
+       DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]",
+                 req, pc->pc_name, pc->pc_index);
+
+       ptlrpc_set_add_new_req(pc, req);
+}
+EXPORT_SYMBOL(ptlrpcd_add_req);
+
+static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set)
+{
+       atomic_inc(&set->set_refcount);
+}
+
+/**
+ * Check if there is more work to do on ptlrpcd set.
+ * Returns 1 if yes.
+ */
+static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+       struct ptlrpc_request_set *set = pc->pc_set;
+       int rc = 0;
+       int rc2;
+       ENTRY;
+
+       if (atomic_read(&set->set_new_count)) {
+               spin_lock(&set->set_new_req_lock);
+               if (likely(!list_empty(&set->set_new_requests))) {
+                       list_splice_init(&set->set_new_requests,
+                                            &set->set_requests);
+                       atomic_add(atomic_read(&set->set_new_count),
+                                      &set->set_remaining);
+                       atomic_set(&set->set_new_count, 0);
+                       /*
+                        * Need to calculate its timeout.
+                        */
+                       rc = 1;
+               }
+               spin_unlock(&set->set_new_req_lock);
+       }
+
+       /* We should call lu_env_refill() before handling new requests to
+        * make sure that the env keys the requests depend on really exist.
+        */
+       rc2 = lu_env_refill(env);
+       if (rc2 != 0) {
+               /*
+                * XXX This is very awkward situation, because
+                * execution can neither continue (request
+                * interpreters assume that env is set up), nor repeat
+                * the loop (as this potentially results in a tight
+                * loop of -ENOMEM's).
+                *
+                * Fortunately, refill only ever does something when
+                * new modules are loaded, i.e., early during boot up.
+                */
+               CERROR("Failure to refill session: %d\n", rc2);
+               RETURN(rc);
+       }
+
+       if (atomic_read(&set->set_remaining))
+               rc |= ptlrpc_check_set(env, set);
+
+       if (!list_empty(&set->set_requests)) {
+               /*
+                * XXX: our set never completes, so we prune the completed
+                * reqs after each iteration. boy could this be smarter.
+                */
+               list_for_each_safe(pos, tmp, &set->set_requests) {
+                       req = list_entry(pos, struct ptlrpc_request,
+                                            rq_set_chain);
+                       if (req->rq_phase != RQ_PHASE_COMPLETE)
+                               continue;
+
+                       list_del_init(&req->rq_set_chain);
+                       req->rq_set = NULL;
+                       ptlrpc_req_finished(req);
+               }
+       }
+
+       if (rc == 0) {
+               /*
+                * If new requests have been added, make sure to wake up.
+                */
+               rc = atomic_read(&set->set_new_count);
+
+               /* If we have nothing to do, check whether we can take some
+                * work from our partner threads. */
+               if (rc == 0 && pc->pc_npartners > 0) {
+                       struct ptlrpcd_ctl *partner;
+                       struct ptlrpc_request_set *ps;
+                       int first = pc->pc_cursor;
+
+                       do {
+                               partner = pc->pc_partners[pc->pc_cursor++];
+                               if (pc->pc_cursor >= pc->pc_npartners)
+                                       pc->pc_cursor = 0;
+                               if (partner == NULL)
+                                       continue;
+
+                               spin_lock(&partner->pc_lock);
+                               ps = partner->pc_set;
+                               if (ps == NULL) {
+                                       spin_unlock(&partner->pc_lock);
+                                       continue;
+                               }
+
+                               ptlrpc_reqset_get(ps);
+                               spin_unlock(&partner->pc_lock);
+
+                               if (atomic_read(&ps->set_new_count)) {
+                                       rc = ptlrpcd_steal_rqset(set, ps);
+                                       if (rc > 0)
+                                               CDEBUG(D_RPCTRACE, "transfer %d"
+                                                      " async RPCs [%d->%d]\n",
+                                                       rc, partner->pc_index,
+                                                       pc->pc_index);
+                               }
+                               ptlrpc_reqset_put(ps);
+                       } while (rc == 0 && pc->pc_cursor != first);
+               }
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * Main ptlrpcd thread.
+ * ptlrpc's code paths like to execute in process context, so we have this
+ * thread which spins on a set which contains the rpcs and sends them.
+ *
+ */
+static int ptlrpcd(void *arg)
+{
+       struct ptlrpcd_ctl *pc = arg;
+       struct ptlrpc_request_set *set = pc->pc_set;
+       struct lu_env env = { .le_ses = NULL };
+       int rc, exit = 0;
+       ENTRY;
+
+       unshare_fs_struct();
+#if defined(CONFIG_SMP)
+       if (test_bit(LIOD_BIND, &pc->pc_flags)) {
+               int index = pc->pc_index;
+
+               if (index >= 0 && index < num_possible_cpus()) {
+                       while (!cpu_online(index)) {
+                               if (++index >= num_possible_cpus())
+                                       index = 0;
+                       }
+                       set_cpus_allowed_ptr(current,
+                                       cpumask_of_node(cpu_to_node(index)));
+               }
+       }
+#endif
+       /*
+        * XXX So far only "client" ptlrpcd uses an environment. In
+        * the future, the ptlrpcd thread (or a thread-set) has to be given
+        * an argument, describing its "scope".
+        */
+       rc = lu_context_init(&env.le_ctx,
+                            LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
+       complete(&pc->pc_starting);
+
+       if (rc != 0)
+               RETURN(rc);
+
+       /*
+        * This mainloop strongly resembles ptlrpc_set_wait() except that our
+        * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
+        * there are requests in the set. New requests come in on the set's
+        * new_req_list and ptlrpcd_check() moves them into the set.
+        */
+       do {
+               struct l_wait_info lwi;
+               int timeout;
+
+               timeout = ptlrpc_set_next_timeout(set);
+               lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
+                                 ptlrpc_expired_set, set);
+
+               lu_context_enter(&env.le_ctx);
+               l_wait_event(set->set_waitq,
+                            ptlrpcd_check(&env, pc), &lwi);
+               lu_context_exit(&env.le_ctx);
+
+               /*
+                * Abort inflight rpcs for forced stop case.
+                */
+               if (test_bit(LIOD_STOP, &pc->pc_flags)) {
+                       if (test_bit(LIOD_FORCE, &pc->pc_flags))
+                               ptlrpc_abort_set(set);
+                       exit++;
+               }
+
+               /*
+                * Let's make one more loop to make sure that ptlrpcd_check()
+                * copied all raced new rpcs into the set so we can kill them.
+                */
+       } while (exit < 2);
+
+       /*
+        * Wait for inflight requests to drain.
+        */
+       if (!list_empty(&set->set_requests))
+               ptlrpc_set_wait(set);
+       lu_context_fini(&env.le_ctx);
+
+       complete(&pc->pc_finishing);
+
+       return 0;
+}
+
+/* XXX: We want multiple CPU cores to share the async RPC load, so we start
+ *      many ptlrpcd threads. We also want to reduce the ptlrpcd overhead
+ *      caused by data transfers across CPU cores, so we bind each ptlrpcd
+ *      thread to a specific CPU core. But binding all ptlrpcd threads may
+ *      cause response delays when some CPU core(s) are busy with other load.
+ *
+ *      For example, during "ls -l" some async RPCs for statahead are
+ *      assigned to ptlrpcd_0, which is bound to CPU_0; but CPU_0 may be
+ *      quite busy with non-ptlrpcd work, such as the "ls -l" thread itself
+ *      (we want the "ls -l" thread, the statahead thread, and the ptlrpcd
+ *      thread to run in parallel). In that case the statahead async RPCs
+ *      cannot be processed in time, which is undesirable. It might be
+ *      better if ptlrpcd_0 could be re-scheduled on another CPU core, but
+ *      that breaks the former data transfer policy.
+ *
+ *      So we should not blindly avoid the data transfers. We compromise and
+ *      divide the ptlrpcd thread pool into two parts: one part is for bound
+ *      mode, where each ptlrpcd thread is bound to some CPU core; the other
+ *      part is for free mode, where the ptlrpcd threads can be scheduled on
+ *      any CPU core. We specify a partnership between bound mode ptlrpcd
+ *      thread(s) and free mode ptlrpcd thread(s), and the async RPC load is
+ *      shared among the partners.
+ *
+ *      This partly avoids cross-CPU data transfers (as long as the bound
+ *      mode ptlrpcd thread can be scheduled in time) while still trying to
+ *      guarantee that async RPCs are processed ASAP (since the free mode
+ *      ptlrpcd thread can be scheduled on any CPU core).
+ *
+ *      As for how to specify the partnership between bound mode ptlrpcd
+ *      thread(s) and free mode ptlrpcd thread(s), the simplest way is the
+ *      <free, bound> pair. In the future, we can specify a more complex
+ *      partnership based on the patches for CPU partitions; but before such
+ *      patches are available, we prefer the simplest form.
+ */
+# ifdef CFS_CPU_MODE_NUMA
+# warning "fix ptlrpcd_bind() to use new CPU partition APIs"
+# endif
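+/*
+ * Illustrative sketch (not part of the original code): under PDB_POLICY_PAIR
+ * with max = 4 threads, the <free, bound> pairs come out as <0, 1> and
+ * <2, 3>; the odd-indexed threads (1 and 3) get LIOD_BIND set, and each is
+ * partnered with its even-indexed neighbour, which stays free to run on any
+ * CPU core.
+ */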
+static int ptlrpcd_bind(int index, int max)
+{
+       struct ptlrpcd_ctl *pc;
+       int rc = 0;
+#if defined(CONFIG_NUMA)
+       cpumask_t mask;
+#endif
+       ENTRY;
+
+       LASSERT(index <= max - 1);
+       pc = &ptlrpcds->pd_threads[index];
+       switch (ptlrpcd_bind_policy) {
+       case PDB_POLICY_NONE:
+               pc->pc_npartners = -1;
+               break;
+       case PDB_POLICY_FULL:
+               pc->pc_npartners = 0;
+               set_bit(LIOD_BIND, &pc->pc_flags);
+               break;
+       case PDB_POLICY_PAIR:
+               LASSERT(max % 2 == 0);
+               pc->pc_npartners = 1;
+               break;
+       case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+       {
+               int i;
+               mask = *cpumask_of_node(cpu_to_node(index));
+               for (i = max; i < num_online_cpus(); i++)
+                       cpu_clear(i, mask);
+               pc->pc_npartners = cpus_weight(mask) - 1;
+               set_bit(LIOD_BIND, &pc->pc_flags);
+       }
+#else
+               LASSERT(max >= 3);
+               pc->pc_npartners = 2;
+#endif
+               break;
+       default:
+               CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy);
+               rc = -EINVAL;
+       }
+
+       if (rc == 0 && pc->pc_npartners > 0) {
+               OBD_ALLOC(pc->pc_partners,
+                         sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+               if (pc->pc_partners == NULL) {
+                       pc->pc_npartners = 0;
+                       rc = -ENOMEM;
+               } else {
+                       switch (ptlrpcd_bind_policy) {
+                       case PDB_POLICY_PAIR:
+                               if (index & 0x1) {
+                                       set_bit(LIOD_BIND, &pc->pc_flags);
+                                       pc->pc_partners[0] = &ptlrpcds->
+                                               pd_threads[index - 1];
+                                       ptlrpcds->pd_threads[index - 1].
+                                               pc_partners[0] = pc;
+                               }
+                               break;
+                       case PDB_POLICY_NEIGHBOR:
+#if defined(CONFIG_NUMA)
+                       {
+                               struct ptlrpcd_ctl *ppc;
+                               int i, pidx;
+                               /* partners are cores in the same NUMA node.
+                                * setup partnership only with ptlrpcd threads
+                                * that are already initialized
+                                */
+                               for (pidx = 0, i = 0; i < index; i++) {
+                                       if (cpu_isset(i, mask)) {
+                                               ppc = &ptlrpcds->pd_threads[i];
+                                               pc->pc_partners[pidx++] = ppc;
+                                               ppc->pc_partners[ppc->
+                                                         pc_npartners++] = pc;
+                                       }
+                               }
+                               /* adjust number of partners to the number
+                                * of partnership really setup */
+                               pc->pc_npartners = pidx;
+                       }
+#else
+                               if (index & 0x1)
+                                       set_bit(LIOD_BIND, &pc->pc_flags);
+                               if (index > 0) {
+                                       pc->pc_partners[0] = &ptlrpcds->
+                                               pd_threads[index - 1];
+                                       ptlrpcds->pd_threads[index - 1].
+                                               pc_partners[1] = pc;
+                                       if (index == max - 1) {
+                                               pc->pc_partners[1] =
+                                               &ptlrpcds->pd_threads[0];
+                                               ptlrpcds->pd_threads[0].
+                                               pc_partners[0] = pc;
+                                       }
+                               }
+#endif
+                               break;
+                       }
+               }
+       }
+
+       RETURN(rc);
+}
+
+
+int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc)
+{
+       int rc;
+       int env = 0;
+       ENTRY;
+
+       /*
+        * Do not allow start second thread for one pc.
+        */
+       if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Starting second thread (%s) for same pc %p\n",
+                     name, pc);
+               RETURN(0);
+       }
+
+       pc->pc_index = index;
+       init_completion(&pc->pc_starting);
+       init_completion(&pc->pc_finishing);
+       spin_lock_init(&pc->pc_lock);
+       strncpy(pc->pc_name, name, sizeof(pc->pc_name) - 1);
+       pc->pc_set = ptlrpc_prep_set();
+       if (pc->pc_set == NULL)
+               GOTO(out, rc = -ENOMEM);
+       /*
+        * So far only "client" ptlrpcd uses an environment. In the future,
+        * ptlrpcd thread (or a thread-set) has to be given an argument,
+        * describing its "scope".
+        */
+       rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       env = 1;
+       {
+               task_t *task;
+               if (index >= 0) {
+                       rc = ptlrpcd_bind(index, max);
+                       if (rc < 0)
+                               GOTO(out, rc);
+               }
+
+               task = kthread_run(ptlrpcd, pc, pc->pc_name);
+               if (IS_ERR(task))
+                       GOTO(out, rc = PTR_ERR(task));
+
+               rc = 0;
+               wait_for_completion(&pc->pc_starting);
+       }
+out:
+       if (rc) {
+               if (pc->pc_set != NULL) {
+                       struct ptlrpc_request_set *set = pc->pc_set;
+
+                       spin_lock(&pc->pc_lock);
+                       pc->pc_set = NULL;
+                       spin_unlock(&pc->pc_lock);
+                       ptlrpc_set_destroy(set);
+               }
+               if (env != 0)
+                       lu_context_fini(&pc->pc_env.le_ctx);
+               clear_bit(LIOD_BIND, &pc->pc_flags);
+               clear_bit(LIOD_START, &pc->pc_flags);
+       }
+       RETURN(rc);
+}
+
+void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
+{
+       ENTRY;
+
+       if (!test_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Thread for pc %p was not started\n", pc);
+               goto out;
+       }
+
+       set_bit(LIOD_STOP, &pc->pc_flags);
+       if (force)
+               set_bit(LIOD_FORCE, &pc->pc_flags);
+       wake_up(&pc->pc_set->set_waitq);
+
+out:
+       EXIT;
+}
+
+void ptlrpcd_free(struct ptlrpcd_ctl *pc)
+{
+       struct ptlrpc_request_set *set = pc->pc_set;
+       ENTRY;
+
+       if (!test_bit(LIOD_START, &pc->pc_flags)) {
+               CWARN("Thread for pc %p was not started\n", pc);
+               goto out;
+       }
+
+       wait_for_completion(&pc->pc_finishing);
+       lu_context_fini(&pc->pc_env.le_ctx);
+
+       spin_lock(&pc->pc_lock);
+       pc->pc_set = NULL;
+       spin_unlock(&pc->pc_lock);
+       ptlrpc_set_destroy(set);
+
+       clear_bit(LIOD_START, &pc->pc_flags);
+       clear_bit(LIOD_STOP, &pc->pc_flags);
+       clear_bit(LIOD_FORCE, &pc->pc_flags);
+       clear_bit(LIOD_BIND, &pc->pc_flags);
+
+out:
+       if (pc->pc_npartners > 0) {
+               LASSERT(pc->pc_partners != NULL);
+
+               OBD_FREE(pc->pc_partners,
+                        sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners);
+               pc->pc_partners = NULL;
+       }
+       pc->pc_npartners = 0;
+       EXIT;
+}
+
+static void ptlrpcd_fini(void)
+{
+       int i;
+       ENTRY;
+
+       if (ptlrpcds != NULL) {
+               for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+                       ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0);
+               for (i = 0; i < ptlrpcds->pd_nthreads; i++)
+                       ptlrpcd_free(&ptlrpcds->pd_threads[i]);
+               ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+               ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+               OBD_FREE(ptlrpcds, ptlrpcds->pd_size);
+               ptlrpcds = NULL;
+       }
+
+       EXIT;
+}
+
+static int ptlrpcd_init(void)
+{
+       int nthreads = num_online_cpus();
+       char name[16];
+       int size, i = -1, j, rc = 0;
+       ENTRY;
+
+       if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads)
+               nthreads = max_ptlrpcds;
+       if (nthreads < 2)
+               nthreads = 2;
+       if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR)
+               ptlrpcd_bind_policy = PDB_POLICY_PAIR;
+       else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR)
+               nthreads &= ~1; /* make sure it is even */
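+       /* For example, with 5 online CPUs and PDB_POLICY_PAIR, nthreads is
+        * rounded down from 5 to 4 here so that every thread can be paired. */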
+
+       size = offsetof(struct ptlrpcd, pd_threads[nthreads]);
+       OBD_ALLOC(ptlrpcds, size);
+       if (ptlrpcds == NULL)
+               GOTO(out, rc = -ENOMEM);
+
+       snprintf(name, 15, "ptlrpcd_rcv");
+       set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags);
+       rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* XXX: We start nthreads ptlrpc daemons. Each of them can process any
+        *      non-recovery async RPC to improve overall async RPC efficiency.
+        *
+        *      But there are issues when async I/O RPCs and async non-I/O
+        *      RPCs are processed in the same set in some cases: the ptlrpcd
+        *      may be blocked by some async I/O RPC(s), which then prevents
+        *      other async non-I/O RPC(s) from being processed in time.
+        *
+        *      Maybe we should distinguish blocking async RPCs from
+        *      non-blocking async RPCs and process them in different ptlrpcd
+        *      sets to avoid unnecessary dependencies. But then how to
+        *      distribute the async RPC load among all the ptlrpc daemons
+        *      becomes another problem. */
+       for (i = 0; i < nthreads; i++) {
+               snprintf(name, 15, "ptlrpcd_%d", i);
+               rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]);
+               if (rc < 0)
+                       GOTO(out, rc);
+       }
+
+       ptlrpcds->pd_size = size;
+       ptlrpcds->pd_index = 0;
+       ptlrpcds->pd_nthreads = nthreads;
+
+out:
+       if (rc != 0 && ptlrpcds != NULL) {
+               for (j = 0; j <= i; j++)
+                       ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0);
+               for (j = 0; j <= i; j++)
+                       ptlrpcd_free(&ptlrpcds->pd_threads[j]);
+               ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0);
+               ptlrpcd_free(&ptlrpcds->pd_thread_rcv);
+               OBD_FREE(ptlrpcds, size);
+               ptlrpcds = NULL;
+       }
+
+       RETURN(rc);
+}
+
+int ptlrpcd_addref(void)
+{
+       int rc = 0;
+       ENTRY;
+
+       mutex_lock(&ptlrpcd_mutex);
+       if (++ptlrpcd_users == 1)
+               rc = ptlrpcd_init();
+       mutex_unlock(&ptlrpcd_mutex);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpcd_addref);
+
+void ptlrpcd_decref(void)
+{
+       mutex_lock(&ptlrpcd_mutex);
+       if (--ptlrpcd_users == 0)
+               ptlrpcd_fini();
+       mutex_unlock(&ptlrpcd_mutex);
+}
+EXPORT_SYMBOL(ptlrpcd_decref);
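+/*
+ * Usage sketch (illustrative only): a client that wants its RPCs processed
+ * asynchronously by the ptlrpcd pool brackets its use with the refcounting
+ * calls above, e.g.:
+ *
+ *     rc = ptlrpcd_addref();          // first caller spawns the threads
+ *     if (rc == 0) {
+ *             ... queue async RPCs (e.g. via ptlrpcd_add_req()) ...
+ *             ptlrpcd_decref();       // last caller tears the pool down
+ *     }
+ */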
+/** @} ptlrpcd */
diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
new file mode 100644 (file)
index 0000000..2960889
--- /dev/null
@@ -0,0 +1,357 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/recover.c
+ *
+ * Author: Mike Shaver <shaver@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
+#include <linux/list.h>
+
+#include "ptlrpc_internal.h"
+
+/**
+ * Start recovery on disconnected import.
+ * This is done by just attempting a connect
+ */
+void ptlrpc_initiate_recovery(struct obd_import *imp)
+{
+       ENTRY;
+
+       CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd));
+       ptlrpc_connect_import(imp);
+
+       EXIT;
+}
+
+/**
+ * Identify what request from replay list needs to be replayed next
+ * (based on what we have already replayed) and send it to server.
+ */
+int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
+{
+       int rc = 0;
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req = NULL;
+       __u64 last_transno;
+       ENTRY;
+
+       *inflight = 0;
+
+       /* It might have committed some requests after we last spoke, so make
+        * sure we get rid of them now.
+        */
+       spin_lock(&imp->imp_lock);
+       imp->imp_last_transno_checked = 0;
+       ptlrpc_free_committed(imp);
+       last_transno = imp->imp_last_replay_transno;
+       spin_unlock(&imp->imp_lock);
+
+       CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
+              imp, obd2cli_tgt(imp->imp_obd),
+              imp->imp_peer_committed_transno, last_transno);
+
+       /* Do I need to hold a lock across this iteration?  We shouldn't be
+        * racing with any additions to the list, because we're in recovery
+        * and are therefore not processing additional requests to add.  Calls
+        * to ptlrpc_free_committed might commit requests, but nothing "newer"
+        * than the one we're replaying (it can't be committed until it's
+        * replayed, and we're doing that here).  l_f_e_safe protects against
+        * problems with the current request being committed, in the unlikely
+        * event of that race.  So, in conclusion, I think that it's safe to
+        * perform this list-walk without the imp_lock held.
+        *
+        * But, the {mdc,osc}_replay_open callbacks both iterate
+        * request lists, and have comments saying they assume the
+        * imp_lock is being held by ptlrpc_replay, but it's not. It's
+        * just a little race...
+        */
+       list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
+               req = list_entry(tmp, struct ptlrpc_request,
+                                    rq_replay_list);
+
+               /* If we need to resend the last sent transno (because a
+                * reconnect has occurred), then stop on the matching
+                * req and send it again. If, however, the last sent
+                * transno has been committed, then we continue replay
+                * from the next request. */
+               if (req->rq_transno > last_transno) {
+                       if (imp->imp_resend_replay)
+                               lustre_msg_add_flags(req->rq_reqmsg,
+                                                    MSG_RESENT);
+                       break;
+               }
+               req = NULL;
+       }
+
+       spin_lock(&imp->imp_lock);
+       imp->imp_resend_replay = 0;
+       spin_unlock(&imp->imp_lock);
+
+       if (req != NULL) {
+               rc = ptlrpc_replay_req(req);
+               if (rc) {
+                       CERROR("recovery replay error %d for req "
+                              LPU64"\n", rc, req->rq_xid);
+                       RETURN(rc);
+               }
+               *inflight = 1;
+       }
+       RETURN(rc);
+}
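+/*
+ * Worked example (illustrative): with replay-list transnos {5, 7, 9} and
+ * imp_last_replay_transno == 7, the loop above skips 5 and 7 and stops on
+ * 9, the first request newer than the last one replayed; with
+ * imp_resend_replay set, that request is additionally flagged MSG_RESENT.
+ */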
+
+/**
+ * Schedule resending of request on sending_list. This is done after
+ * we completed replaying of requests and locks.
+ */
+int ptlrpc_resend(struct obd_import *imp)
+{
+       struct ptlrpc_request *req, *next;
+
+       ENTRY;
+
+       /* As long as we're in recovery, nothing should be added to the sending
+        * list, so we don't need to hold the lock during this iteration and
+        * resend process. (Well... what if lctl recover is called twice at the
+        * same time?)
+        */
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_RECOVER) {
+               spin_unlock(&imp->imp_lock);
+               RETURN(-1);
+       }
+
+       list_for_each_entry_safe(req, next, &imp->imp_sending_list,
+                                    rq_list) {
+               LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON,
+                        "req %p bad\n", req);
+               LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req);
+               if (!ptlrpc_no_resend(req))
+                       ptlrpc_resend_req(req);
+       }
+       spin_unlock(&imp->imp_lock);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_resend);
+
+/**
+ * Go through all requests in delayed list and wake their threads
+ * for resending
+ */
+void ptlrpc_wake_delayed(struct obd_import *imp)
+{
+       struct list_head *tmp, *pos;
+       struct ptlrpc_request *req;
+
+       spin_lock(&imp->imp_lock);
+       list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
+               req = list_entry(tmp, struct ptlrpc_request, rq_list);
+
+               DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+               ptlrpc_client_wake_req(req);
+       }
+       spin_unlock(&imp->imp_lock);
+}
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+
+void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
+{
+       struct obd_import *imp = failed_req->rq_import;
+       ENTRY;
+
+       CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
+              imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
+              imp->imp_connection->c_remote_uuid.uuid);
+
+       if (ptlrpc_set_import_discon(imp,
+                             lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) {
+               if (!imp->imp_replayable) {
+                       CDEBUG(D_HA, "import %s@%s for %s not replayable, "
+                              "auto-deactivating\n",
+                              obd2cli_tgt(imp->imp_obd),
+                              imp->imp_connection->c_remote_uuid.uuid,
+                              imp->imp_obd->obd_name);
+                       ptlrpc_deactivate_import(imp);
+               }
+               /* to control recovery via lctl {disable|enable}_recovery */
+               if (imp->imp_deactive == 0)
+                       ptlrpc_connect_import(imp);
+       }
+
+       /* Wait for recovery to complete and resend. If evicted, then
+        * this request will be errored out later. */
+       spin_lock(&failed_req->rq_lock);
+       if (!failed_req->rq_no_resend)
+               failed_req->rq_resend = 1;
+       spin_unlock(&failed_req->rq_lock);
+
+       EXIT;
+}
+
+/**
+ * Administratively activate/deactivate a client.
+ * This should only be called by the ioctl interface, currently
+ *  - the lctl deactivate and activate commands
+ *  - echo 0/1 >> /proc/osc/XXX/active
+ *  - client umount -f (ll_umount_begin)
+ */
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
+{
+       struct obd_device *obd = imp->imp_obd;
+       int rc = 0;
+
+       ENTRY;
+       LASSERT(obd);
+
+       /* When deactivating, mark import invalid, and abort in-flight
+        * requests. */
+       if (!active) {
+               LCONSOLE_WARN("setting import %s INACTIVE by administrator "
+                             "request\n", obd2cli_tgt(imp->imp_obd));
+
+               /* set before invalidate to avoid messages about imp_inval
+                * set without imp_deactive in ptlrpc_import_delay_req */
+               spin_lock(&imp->imp_lock);
+               imp->imp_deactive = 1;
+               spin_unlock(&imp->imp_lock);
+
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE);
+
+               ptlrpc_invalidate_import(imp);
+       }
+
+       /* When activating, mark import valid, and attempt recovery */
+       if (active) {
+               CDEBUG(D_HA, "setting import %s VALID\n",
+                      obd2cli_tgt(imp->imp_obd));
+
+               spin_lock(&imp->imp_lock);
+               imp->imp_deactive = 0;
+               spin_unlock(&imp->imp_lock);
+               obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE);
+
+               rc = ptlrpc_recover_import(imp, NULL, 0);
+       }
+
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_set_import_active);
+
+/* Attempt to reconnect an import */
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async)
+{
+       int rc = 0;
+       ENTRY;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive ||
+           atomic_read(&imp->imp_inval_count))
+               rc = -EINVAL;
+       spin_unlock(&imp->imp_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       /* force import to be disconnected. */
+       ptlrpc_set_import_discon(imp, 0);
+
+       if (new_uuid) {
+               struct obd_uuid uuid;
+
+               /* instruct the import to use the new uuid */
+               obd_str2uuid(&uuid, new_uuid);
+               rc = import_set_conn_priority(imp, &uuid);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       /* Check if reconnect is already in progress */
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state != LUSTRE_IMP_DISCON) {
+               imp->imp_force_verify = 1;
+               rc = -EALREADY;
+       }
+       spin_unlock(&imp->imp_lock);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = ptlrpc_connect_import(imp);
+       if (rc)
+               GOTO(out, rc);
+
+       if (!async) {
+               struct l_wait_info lwi;
+               int secs = cfs_time_seconds(obd_timeout);
+
+               CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
+                      obd2cli_tgt(imp->imp_obd), secs);
+
+               lwi = LWI_TIMEOUT(secs, NULL, NULL);
+               rc = l_wait_event(imp->imp_recovery_waitq,
+                                 !ptlrpc_import_in_recovery(imp), &lwi);
+               CDEBUG(D_HA, "%s: recovery finished\n",
+                      obd2cli_tgt(imp->imp_obd));
+       }
+       EXIT;
+
+out:
+       return rc;
+}
+EXPORT_SYMBOL(ptlrpc_recover_import);
+
+int ptlrpc_import_in_recovery(struct obd_import *imp)
+{
+       int in_recovery = 1;
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_state == LUSTRE_IMP_FULL ||
+           imp->imp_state == LUSTRE_IMP_CLOSED ||
+           imp->imp_state == LUSTRE_IMP_DISCON)
+               in_recovery = 0;
+       spin_unlock(&imp->imp_lock);
+       return in_recovery;
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c
new file mode 100644 (file)
index 0000000..36e8bed
--- /dev/null
@@ -0,0 +1,2465 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/***********************************************
+ * policy registration                         *
+ ***********************************************/
+
+static rwlock_t policy_lock;
+static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = {
+       NULL,
+};
+
+int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy)
+{
+       __u16 number = policy->sp_policy;
+
+       LASSERT(policy->sp_name);
+       LASSERT(policy->sp_cops);
+       LASSERT(policy->sp_sops);
+
+       if (number >= SPTLRPC_POLICY_MAX)
+               return -EINVAL;
+
+       write_lock(&policy_lock);
+       if (unlikely(policies[number])) {
+               write_unlock(&policy_lock);
+               return -EALREADY;
+       }
+       policies[number] = policy;
+       write_unlock(&policy_lock);
+
+       CDEBUG(D_SEC, "%s: registered\n", policy->sp_name);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_register_policy);
+
+int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy)
+{
+       __u16 number = policy->sp_policy;
+
+       LASSERT(number < SPTLRPC_POLICY_MAX);
+
+       write_lock(&policy_lock);
+       if (unlikely(policies[number] == NULL)) {
+               write_unlock(&policy_lock);
+               CERROR("%s: already unregistered\n", policy->sp_name);
+               return -EINVAL;
+       }
+
+       LASSERT(policies[number] == policy);
+       policies[number] = NULL;
+       write_unlock(&policy_lock);
+
+       CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unregister_policy);
+
+static
+struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor)
+{
+       static DEFINE_MUTEX(load_mutex);
+       static atomic_t       loaded = ATOMIC_INIT(0);
+       struct ptlrpc_sec_policy *policy;
+       __u16                number = SPTLRPC_FLVR_POLICY(flavor);
+       __u16                flag = 0;
+
+       if (number >= SPTLRPC_POLICY_MAX)
+               return NULL;
+
+       while (1) {
+               read_lock(&policy_lock);
+               policy = policies[number];
+               if (policy && !try_module_get(policy->sp_owner))
+                       policy = NULL;
+               if (policy == NULL)
+                       flag = atomic_read(&loaded);
+               read_unlock(&policy_lock);
+
+               if (policy != NULL || flag != 0 ||
+                   number != SPTLRPC_POLICY_GSS)
+                       break;
+
+               /* try to load gss module, once */
+               mutex_lock(&load_mutex);
+               if (atomic_read(&loaded) == 0) {
+                       if (request_module("ptlrpc_gss") == 0)
+                               CDEBUG(D_SEC,
+                                      "module ptlrpc_gss loaded on demand\n");
+                       else
+                               CERROR("Unable to load module ptlrpc_gss\n");
+
+                       atomic_set(&loaded, 1);
+               }
+               mutex_unlock(&load_mutex);
+       }
+
+       return policy;
+}
+
+__u32 sptlrpc_name2flavor_base(const char *name)
+{
+       if (!strcmp(name, "null"))
+               return SPTLRPC_FLVR_NULL;
+       if (!strcmp(name, "plain"))
+               return SPTLRPC_FLVR_PLAIN;
+       if (!strcmp(name, "krb5n"))
+               return SPTLRPC_FLVR_KRB5N;
+       if (!strcmp(name, "krb5a"))
+               return SPTLRPC_FLVR_KRB5A;
+       if (!strcmp(name, "krb5i"))
+               return SPTLRPC_FLVR_KRB5I;
+       if (!strcmp(name, "krb5p"))
+               return SPTLRPC_FLVR_KRB5P;
+
+       return SPTLRPC_FLVR_INVALID;
+}
+EXPORT_SYMBOL(sptlrpc_name2flavor_base);
+
+const char *sptlrpc_flavor2name_base(__u32 flvr)
+{
+       __u32   base = SPTLRPC_FLVR_BASE(flvr);
+
+       if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL))
+               return "null";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN))
+               return "plain";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N))
+               return "krb5n";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A))
+               return "krb5a";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I))
+               return "krb5i";
+       else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P))
+               return "krb5p";
+
+       CERROR("invalid wire flavor 0x%x\n", flvr);
+       return "invalid";
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_base);
+
+char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf,
+                              char *buf, int bufsize)
+{
+       if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN)
+               snprintf(buf, bufsize, "hash:%s",
+                        sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg));
+       else
+               snprintf(buf, bufsize, "%s",
+                        sptlrpc_flavor2name_base(sf->sf_rpc));
+
+       buf[bufsize - 1] = '\0';
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name_bulk);
+
+char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize)
+{
+       snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc));
+
+       /*
+        * currently we don't support customized bulk specification for
+        * flavors other than plain
+        */
+       if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) {
+               char bspec[16];
+
+               bspec[0] = '-';
+               sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1);
+               strlcat(buf, bspec, bufsize);
+       }
+
+       buf[bufsize - 1] = '\0';
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_flavor2name);
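+/*
+ * Example (derived from the code above): a plain flavor with a customized
+ * bulk hash yields a name of the form "plain-hash:<alg>", while any other
+ * flavor yields just its base name, e.g. "krb5i".
+ */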
+
+char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_SEC_FL_REVERSE)
+               strlcat(buf, "reverse,", bufsize);
+       if (flags & PTLRPC_SEC_FL_ROOTONLY)
+               strlcat(buf, "rootonly,", bufsize);
+       if (flags & PTLRPC_SEC_FL_UDESC)
+               strlcat(buf, "udesc,", bufsize);
+       if (flags & PTLRPC_SEC_FL_BULK)
+               strlcat(buf, "bulk,", bufsize);
+       if (buf[0] == '\0')
+               strlcat(buf, "-,", bufsize);
+
+       return buf;
+}
+EXPORT_SYMBOL(sptlrpc_secflags2str);
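+/*
+ * Example (derived from the code above): flags PTLRPC_SEC_FL_ROOTONLY |
+ * PTLRPC_SEC_FL_BULK produce "rootonly,bulk,", while a zero flags word
+ * produces "-,".
+ */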
+
+/**************************************************
+ * client context APIs                            *
+ **************************************************/
+
+static
+struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec)
+{
+       struct vfs_cred vcred;
+       int create = 1, remove_dead = 1;
+
+       LASSERT(sec);
+       LASSERT(sec->ps_policy->sp_cops->lookup_ctx);
+
+       if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE |
+                                    PTLRPC_SEC_FL_ROOTONLY)) {
+               vcred.vc_uid = 0;
+               vcred.vc_gid = 0;
+               if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) {
+                       create = 0;
+                       remove_dead = 0;
+               }
+       } else {
+               vcred.vc_uid = current_uid();
+               vcred.vc_gid = current_gid();
+       }
+
+       return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred,
+                                                  create, remove_dead);
+}
+
+struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx)
+{
+       atomic_inc(&ctx->cc_refcount);
+       return ctx;
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_get);
+
+void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       struct ptlrpc_sec *sec = ctx->cc_sec;
+
+       LASSERT(sec);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (!atomic_dec_and_test(&ctx->cc_refcount))
+               return;
+
+       sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_put);
+
+/**
+ * Expire the client context immediately.
+ *
+ * \pre Caller must hold at least 1 reference on the \a ctx.
+ */
+void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(ctx->cc_ops->die);
+       ctx->cc_ops->die(ctx, 0);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_expire);
+
+/**
+ * Wake up the threads that are waiting for this client context. Called
+ * after some status change has happened on \a ctx.
+ */
+void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_request *req, *next;
+
+       spin_lock(&ctx->cc_lock);
+       list_for_each_entry_safe(req, next, &ctx->cc_req_list,
+                                    rq_ctx_chain) {
+               list_del_init(&req->rq_ctx_chain);
+               ptlrpc_client_wake_req(req);
+       }
+       spin_unlock(&ctx->cc_lock);
+}
+EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup);
+
+int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize)
+{
+       LASSERT(ctx->cc_ops);
+
+       if (ctx->cc_ops->display == NULL)
+               return 0;
+
+       return ctx->cc_ops->display(ctx, buf, bufsize);
+}
+
+static int import_sec_check_expire(struct obd_import *imp)
+{
+       int     adapt = 0;
+
+       spin_lock(&imp->imp_lock);
+       if (imp->imp_sec_expire &&
+           imp->imp_sec_expire < cfs_time_current_sec()) {
+               adapt = 1;
+               imp->imp_sec_expire = 0;
+       }
+       spin_unlock(&imp->imp_lock);
+
+       if (!adapt)
+               return 0;
+
+       CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n");
+       return sptlrpc_import_sec_adapt(imp, NULL, 0);
+}
+
+static int import_sec_validate_get(struct obd_import *imp,
+                                  struct ptlrpc_sec **sec)
+{
+       int     rc;
+
+       if (unlikely(imp->imp_sec_expire)) {
+               rc = import_sec_check_expire(imp);
+               if (rc)
+                       return rc;
+       }
+
+       *sec = sptlrpc_import_sec_ref(imp);
+       if (*sec == NULL) {
+               CERROR("import %p (%s) with no sec\n",
+                      imp, ptlrpc_import_state_name(imp->imp_state));
+               return -EACCES;
+       }
+
+       if (unlikely((*sec)->ps_dying)) {
+               CERROR("attempt to use dying sec %p\n", *sec);
+               sptlrpc_sec_put(*sec);
+               return -EACCES;
+       }
+
+       return 0;
+}
+
+/**
+ * Given a \a req, find or allocate an appropriate context for it.
+ * \pre req->rq_cli_ctx == NULL.
+ *
+ * \retval 0 succeed, and req->rq_cli_ctx is set.
+ * \retval -ev error number, and req->rq_cli_ctx == NULL.
+ */
+int sptlrpc_req_get_ctx(struct ptlrpc_request *req)
+{
+       struct obd_import *imp = req->rq_import;
+       struct ptlrpc_sec *sec;
+       int             rc;
+       ENTRY;
+
+       LASSERT(!req->rq_cli_ctx);
+       LASSERT(imp);
+
+       rc = import_sec_validate_get(imp, &sec);
+       if (rc)
+               RETURN(rc);
+
+       req->rq_cli_ctx = get_my_ctx(sec);
+
+       sptlrpc_sec_put(sec);
+
+       if (!req->rq_cli_ctx) {
+               CERROR("req %p: fail to get context\n", req);
+               RETURN(-ENOMEM);
+       }
+
+       RETURN(0);
+}
+
+/**
+ * Drop the context for \a req.
+ * \pre req->rq_cli_ctx != NULL.
+ * \post req->rq_cli_ctx == NULL.
+ *
+ * If \a sync == 0, this function should return quickly without sleeping;
+ * otherwise it might trigger and wait for the whole process of sending
+ * a context-destroying rpc to the server.
+ */
+void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync)
+{
+       ENTRY;
+
+       LASSERT(req);
+       LASSERT(req->rq_cli_ctx);
+
+       /* The request might be asked to release its context early, while
+        * still on the context waiting list.
+        */
+       if (!list_empty(&req->rq_ctx_chain)) {
+               spin_lock(&req->rq_cli_ctx->cc_lock);
+               list_del_init(&req->rq_ctx_chain);
+               spin_unlock(&req->rq_cli_ctx->cc_lock);
+       }
+
+       sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync);
+       req->rq_cli_ctx = NULL;
+       EXIT;
+}
+
+static
+int sptlrpc_req_ctx_switch(struct ptlrpc_request *req,
+                          struct ptlrpc_cli_ctx *oldctx,
+                          struct ptlrpc_cli_ctx *newctx)
+{
+       struct sptlrpc_flavor   old_flvr;
+       char               *reqmsg = NULL; /* to work around old gcc */
+       int                  reqmsg_size;
+       int                  rc = 0;
+
+       LASSERT(req->rq_reqmsg);
+       LASSERT(req->rq_reqlen);
+       LASSERT(req->rq_replen);
+
+       CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), "
+              "switch sec %p(%s) -> %p(%s)\n", req,
+              oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec),
+              newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec),
+              oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name,
+              newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name);
+
+       /* save flavor */
+       old_flvr = req->rq_flvr;
+
+       /* save request message */
+       reqmsg_size = req->rq_reqlen;
+       if (reqmsg_size != 0) {
+               OBD_ALLOC_LARGE(reqmsg, reqmsg_size);
+               if (reqmsg == NULL)
+                       return -ENOMEM;
+               memcpy(reqmsg, req->rq_reqmsg, reqmsg_size);
+       }
+
+       /* release old req/rep buf */
+       req->rq_cli_ctx = oldctx;
+       sptlrpc_cli_free_reqbuf(req);
+       sptlrpc_cli_free_repbuf(req);
+       req->rq_cli_ctx = newctx;
+
+       /* recalculate the flavor */
+       sptlrpc_req_set_flavor(req, 0);
+
+       /* Allocate a new request buffer. We don't need to allocate the
+        * reply buffer here; leave that to the rest of the ptlrpc
+        * procedure. */
+       if (reqmsg_size != 0) {
+               rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size);
+               if (!rc) {
+                       LASSERT(req->rq_reqmsg);
+                       memcpy(req->rq_reqmsg, reqmsg, reqmsg_size);
+               } else {
+                       CWARN("failed to alloc reqbuf: %d\n", rc);
+                       req->rq_flvr = old_flvr;
+               }
+
+               OBD_FREE_LARGE(reqmsg, reqmsg_size);
+       }
+       return rc;
+}
+
+/**
+ * If the current context of \a req is dead somehow, e.g. we just switched
+ * flavor and thus marked the original contexts dead, we'll find a new
+ * context for it. If no switch is needed, \a req ends up with the same
+ * context.
+ *
+ * \note A request must have a context, to keep other parts of the code
+ * happy. On any failure during the switch, we must restore the old one.
+ */
+int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx;
+       struct ptlrpc_cli_ctx *newctx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(oldctx);
+
+       sptlrpc_cli_ctx_get(oldctx);
+       sptlrpc_req_put_ctx(req, 0);
+
+       rc = sptlrpc_req_get_ctx(req);
+       if (unlikely(rc)) {
+               LASSERT(!req->rq_cli_ctx);
+
+               /* restore old ctx */
+               req->rq_cli_ctx = oldctx;
+               RETURN(rc);
+       }
+
+       newctx = req->rq_cli_ctx;
+       LASSERT(newctx);
+
+       if (unlikely(newctx == oldctx &&
+                    test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) {
+               /*
+                * We still got the old dead ctx; this usually means the
+                * system is too busy.
+                */
+               CDEBUG(D_SEC,
+                      "ctx (%p, fl %lx) doesn't switch, relax a little bit\n",
+                      newctx, newctx->cc_flags);
+
+               schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, HZ);
+       } else {
+               /*
+                * it's possible newctx == oldctx if we're switching
+                * subflavor with the same sec.
+                */
+               rc = sptlrpc_req_ctx_switch(req, oldctx, newctx);
+               if (rc) {
+                       /* restore old ctx */
+                       sptlrpc_req_put_ctx(req, 0);
+                       req->rq_cli_ctx = oldctx;
+                       RETURN(rc);
+               }
+
+               LASSERT(req->rq_cli_ctx == newctx);
+       }
+
+       sptlrpc_cli_ctx_put(oldctx, 1);
+       RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx);
+
+static
+int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       if (cli_ctx_is_refreshed(ctx))
+               return 1;
+       return 0;
+}
+
+static
+int ctx_refresh_timeout(void *data)
+{
+       struct ptlrpc_request *req = data;
+       int rc;
+
+       /* conn_cnt is needed in expire_one_request */
+       lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
+
+       rc = ptlrpc_expire_one_request(req, 1);
+       /* If we started recovery, we should mark this ctx dead; otherwise,
+        * in case lgssd died, nobody would retire this ctx and subsequent
+        * connects would keep finding the same ctx, causing a deadlock.
+        * There's an assumption that the expire time of the request should
+        * be later than the context refresh expire time.
+        */
+       if (rc == 0)
+               req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0);
+       return rc;
+}
+
+static
+void ctx_refresh_interrupt(void *data)
+{
+       struct ptlrpc_request *req = data;
+
+       spin_lock(&req->rq_lock);
+       req->rq_intr = 1;
+       spin_unlock(&req->rq_lock);
+}
+
+static
+void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
+{
+       spin_lock(&ctx->cc_lock);
+       if (!list_empty(&req->rq_ctx_chain))
+               list_del_init(&req->rq_ctx_chain);
+       spin_unlock(&ctx->cc_lock);
+}
+
+/**
+ * Refresh the context of \a req, if it's not up to date.
+ * \param timeout
+ * - < 0: don't wait
+ * - = 0: wait until success or fatal error occurs
+ * - > 0: timeout value (in seconds)
+ *
+ * The status of the context may be changed by other threads at any time.
+ * We allow this race, but once we return with 0, the caller will assume
+ * it's up to date and keep using it until the owning rpc is done.
+ *
+ * \retval 0 only if the context is up to date.
+ * \retval -ev error number.
+ */
+int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
+{
+       struct ptlrpc_cli_ctx  *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec      *sec;
+       struct l_wait_info      lwi;
+       int                  rc;
+       ENTRY;
+
+       LASSERT(ctx);
+
+       if (req->rq_ctx_init || req->rq_ctx_fini)
+               RETURN(0);
+
+       /*
+        * During the process, a request's context might even change type
+        * (e.g. from a gss ctx to a null ctx), so on each loop we need to
+        * re-check everything.
+        */
+again:
+       rc = import_sec_validate_get(req->rq_import, &sec);
+       if (rc)
+               RETURN(rc);
+
+       if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+               CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n",
+                     req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc);
+               req_off_ctx_list(req, ctx);
+               sptlrpc_req_replace_dead_ctx(req);
+               ctx = req->rq_cli_ctx;
+       }
+       sptlrpc_sec_put(sec);
+
+       if (cli_ctx_is_eternal(ctx))
+               RETURN(0);
+
+       if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) {
+               LASSERT(ctx->cc_ops->refresh);
+               ctx->cc_ops->refresh(ctx);
+       }
+       LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0);
+
+       LASSERT(ctx->cc_ops->validate);
+       if (ctx->cc_ops->validate(ctx) == 0) {
+               req_off_ctx_list(req, ctx);
+               RETURN(0);
+       }
+
+       if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) {
+               spin_lock(&req->rq_lock);
+               req->rq_err = 1;
+               spin_unlock(&req->rq_lock);
+               req_off_ctx_list(req, ctx);
+               RETURN(-EPERM);
+       }
+
+       /*
+        * There's a subtle issue when resending RPCs; consider the following
+        * situation:
+        *  1. the request was sent to the server.
+        *  2. recovery was kicked off; after it finished, the request was
+        *     marked as resent.
+        *  3. the request is resent.
+        *  4. an old reply from the server is received; we accept and verify
+        *     the reply. This has to succeed, otherwise the error would
+        *     become visible to the application.
+        *  5. a new reply from the server is received, and dropped by LNet.
+        *
+        * Note the xid of the old & new request is the same. We can't simply
+        * change the xid for the resent request because the server relies on
+        * it for reply reconstruction.
+        *
+        * Commonly the original context should be up to date because we
+        * have a nice expiry time; the server will keep its context because
+        * we at least hold a ref on the old context, which prevents the
+        * context-destroying RPC from being sent. So the server can still
+        * accept the request and finish the RPC. But if that's not the case:
+        *  1. If the server-side context has been trimmed, a NO_CONTEXT will
+        *     be returned, and gss_cli_ctx_verify/unseal will switch to a
+        *     new context by force.
+        *  2. If the current context was never refreshed, then we are fine:
+        *     we never really sent a request with the old context before.
+        */
+       if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) &&
+           unlikely(req->rq_reqmsg) &&
+           lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
+               req_off_ctx_list(req, ctx);
+               RETURN(0);
+       }
+
+       if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) {
+               req_off_ctx_list(req, ctx);
+               /*
+                * don't switch ctx if import was deactivated
+                */
+               if (req->rq_import->imp_deactive) {
+                       spin_lock(&req->rq_lock);
+                       req->rq_err = 1;
+                       spin_unlock(&req->rq_lock);
+                       RETURN(-EINTR);
+               }
+
+               rc = sptlrpc_req_replace_dead_ctx(req);
+               if (rc) {
+                       LASSERT(ctx == req->rq_cli_ctx);
+                       CERROR("req %p: failed to replace dead ctx %p: %d\n",
+                              req, ctx, rc);
+                       spin_lock(&req->rq_lock);
+                       req->rq_err = 1;
+                       spin_unlock(&req->rq_lock);
+                       RETURN(rc);
+               }
+
+               ctx = req->rq_cli_ctx;
+               goto again;
+       }
+
+       /*
+        * Now we're sure this context is during upcall, add myself into
+        * waiting list
+        */
+       spin_lock(&ctx->cc_lock);
+       if (list_empty(&req->rq_ctx_chain))
+               list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
+       spin_unlock(&ctx->cc_lock);
+
+       if (timeout < 0)
+               RETURN(-EWOULDBLOCK);
+
+       /* Clear any flags that may be present from previous sends */
+       LASSERT(req->rq_receiving_reply == 0);
+       spin_lock(&req->rq_lock);
+       req->rq_err = 0;
+       req->rq_timedout = 0;
+       req->rq_resend = 0;
+       req->rq_restart = 0;
+       spin_unlock(&req->rq_lock);
+
+       lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout,
+                              ctx_refresh_interrupt, req);
+       rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi);
+
+       /*
+        * The following cases could lead us here:
+        * - successfully refreshed;
+        * - interrupted;
+        * - timed out, and we don't want to recover from the failure;
+        * - timed out, and woken up when recovery finished;
+        * - someone else marked this ctx dead by force;
+        * - someone invalidated the req and called ptlrpc_client_wake_req(),
+        *   e.g. ptlrpc_abort_inflight();
+        */
+       if (!cli_ctx_is_refreshed(ctx)) {
+               /* timed out or interrupted */
+               req_off_ctx_list(req, ctx);
+
+               LASSERT(rc != 0);
+               RETURN(rc);
+       }
+
+       goto again;
+}
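+/*
+ * Usage note (illustrative): a caller that must not sleep passes a negative
+ * timeout and gets -EWOULDBLOCK while the context is still being refreshed;
+ * sptlrpc_import_check_ctx() below passes 0 to wait until the refresh either
+ * succeeds or fails fatally.
+ */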
+
+/**
+ * Initialize flavor settings for \a req, according to \a opcode.
+ *
+ * \note This could be called in two situations:
+ * - a new request from ptlrpc_pre_req(), with a proper @opcode
+ * - an old request that changed ctx in the middle, with @opcode == 0
+ */
+void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode)
+{
+       struct ptlrpc_sec *sec;
+
+       LASSERT(req->rq_import);
+       LASSERT(req->rq_cli_ctx);
+       LASSERT(req->rq_cli_ctx->cc_sec);
+       LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0);
+
+       /* special security flags according to opcode */
+       switch (opcode) {
+       case OST_READ:
+       case MDS_READPAGE:
+       case MGS_CONFIG_READ:
+       case OBD_IDX_READ:
+               req->rq_bulk_read = 1;
+               break;
+       case OST_WRITE:
+       case MDS_WRITEPAGE:
+               req->rq_bulk_write = 1;
+               break;
+       case SEC_CTX_INIT:
+               req->rq_ctx_init = 1;
+               break;
+       case SEC_CTX_FINI:
+               req->rq_ctx_fini = 1;
+               break;
+       case 0:
+               /* init/fini rpcs won't be resent, so they can't be here */
+               LASSERT(req->rq_ctx_init == 0);
+               LASSERT(req->rq_ctx_fini == 0);
+
+               /* cleanup flags, which should be recalculated */
+               req->rq_pack_udesc = 0;
+               req->rq_pack_bulk = 0;
+               break;
+       }
+
+       sec = req->rq_cli_ctx->cc_sec;
+
+       spin_lock(&sec->ps_lock);
+       req->rq_flvr = sec->ps_flvr;
+       spin_unlock(&sec->ps_lock);
+
+       /* force SVC_NULL for context initiation rpc, SVC_INTG for context
+        * destruction rpc */
+       if (unlikely(req->rq_ctx_init))
+               flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL);
+       else if (unlikely(req->rq_ctx_fini))
+               flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG);
+
+       /* user descriptor flag, null security can't do it anyway */
+       if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) &&
+           (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL))
+               req->rq_pack_udesc = 1;
+
+       /* bulk security flag */
+       if ((req->rq_bulk_read || req->rq_bulk_write) &&
+           sptlrpc_flavor_has_bulk(&req->rq_flvr))
+               req->rq_pack_bulk = 1;
+}
+
+void sptlrpc_request_out_callback(struct ptlrpc_request *req)
+{
+       if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV)
+               return;
+
+       LASSERT(req->rq_clrbuf);
+       if (req->rq_pool || !req->rq_reqbuf)
+               return;
+
+       OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+       req->rq_reqbuf = NULL;
+       req->rq_reqbuf_len = 0;
+}
+
+/**
+ * Given an import \a imp, check whether the current user has a valid context
+ * or not. We may create a new context and try to refresh it, retrying
+ * repeatedly in case of non-fatal errors. Returning 0 means success.
+ */
+int sptlrpc_import_check_ctx(struct obd_import *imp)
+{
+       struct ptlrpc_sec     *sec;
+       struct ptlrpc_cli_ctx *ctx;
+       struct ptlrpc_request *req = NULL;
+       int rc;
+       ENTRY;
+
+       might_sleep();
+
+       sec = sptlrpc_import_sec_ref(imp);
+       ctx = get_my_ctx(sec);
+       sptlrpc_sec_put(sec);
+
+       if (!ctx)
+               RETURN(-ENOMEM);
+
+       if (cli_ctx_is_eternal(ctx) ||
+           ctx->cc_ops->validate(ctx) == 0) {
+               sptlrpc_cli_ctx_put(ctx, 1);
+               RETURN(0);
+       }
+
+       if (cli_ctx_is_error(ctx)) {
+               sptlrpc_cli_ctx_put(ctx, 1);
+               RETURN(-EACCES);
+       }
+
+       OBD_ALLOC_PTR(req);
+       if (!req)
+               RETURN(-ENOMEM);
+
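+       /* Build a minimal dummy request solely to drive the context refresh
+        * path; the inflated refcount ensures nothing can free it early. */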
+       spin_lock_init(&req->rq_lock);
+       atomic_set(&req->rq_refcount, 10000);
+       INIT_LIST_HEAD(&req->rq_ctx_chain);
+       init_waitqueue_head(&req->rq_reply_waitq);
+       init_waitqueue_head(&req->rq_set_waitq);
+       req->rq_import = imp;
+       req->rq_flvr = sec->ps_flvr;
+       req->rq_cli_ctx = ctx;
+
+       rc = sptlrpc_req_refresh_ctx(req, 0);
+       LASSERT(list_empty(&req->rq_ctx_chain));
+       sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
+       OBD_FREE_PTR(req);
+
+       RETURN(rc);
+}
+
+/**
+ * Used by the ptlrpc client to perform the pre-defined security
+ * transformation upon the request message of \a req. After this function is
+ * called, req->rq_reqmsg is still accessible as clear text.
+ */
+int sptlrpc_cli_wrap_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+       /* we wrap bulk request here because now we can be sure
+        * the context is up to date.
+        */
+       if (req->rq_bulk) {
+               rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(ctx->cc_ops->sign);
+               rc = ctx->cc_ops->sign(ctx, req);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(ctx->cc_ops->seal);
+               rc = ctx->cc_ops->seal(ctx, req);
+               break;
+       default:
+               LBUG();
+       }
+
+       if (rc == 0) {
+               LASSERT(req->rq_reqdata_len);
+               LASSERT(req->rq_reqdata_len % 8 == 0);
+               LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+       }
+
+       RETURN(rc);
+}
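+
+/*
+ * For reference, the dispatch above keys on the service part of the wire
+ * flavor (a sketch; the authoritative layout lives in lustre_sec.h):
+ *
+ *     SPTLRPC_SVC_NULL/AUTH/INTG -> ctx->cc_ops->sign()
+ *             message stays clear text; AUTH/INTG append integrity data
+ *     SPTLRPC_SVC_PRIV           -> ctx->cc_ops->seal()
+ *             the whole message is encrypted into rq_reqbuf
+ */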
+
+static int do_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       int                 rc;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata);
+       LASSERT(req->rq_repmsg == NULL);
+
+       req->rq_rep_swab_mask = 0;
+
+       rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len);
+       switch (rc) {
+       case 1:
+               lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF);
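+               /* fall through */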
+       case 0:
+               break;
+       default:
+               CERROR("failed unpack reply: x"LPU64"\n", req->rq_xid);
+               RETURN(-EPROTO);
+       }
+
+       if (req->rq_repdata_len < sizeof(struct lustre_msg)) {
+               CERROR("replied data length %d too small\n",
+                      req->rq_repdata_len);
+               RETURN(-EPROTO);
+       }
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) !=
+           SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) {
+               CERROR("reply policy %u doesn't match request policy %u\n",
+                      SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr),
+                      SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc));
+               RETURN(-EPROTO);
+       }
+
+       switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) {
+       case SPTLRPC_SVC_NULL:
+       case SPTLRPC_SVC_AUTH:
+       case SPTLRPC_SVC_INTG:
+               LASSERT(ctx->cc_ops->verify);
+               rc = ctx->cc_ops->verify(ctx, req);
+               break;
+       case SPTLRPC_SVC_PRIV:
+               LASSERT(ctx->cc_ops->unseal);
+               rc = ctx->cc_ops->unseal(ctx, req);
+               break;
+       default:
+               LBUG();
+       }
+       LASSERT(rc || req->rq_repmsg || req->rq_resend);
+
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL &&
+           !req->rq_ctx_init)
+               req->rq_rep_swab_mask = 0;
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the reply
+ * message of \a req. After successful return, req->rq_repmsg points to
+ * the reply message in clear text.
+ *
+ * \pre the reply buffer should have been un-posted from LNet, so nothing is
+ * going to change.
+ */
+int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+       LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len);
+
+       if (req->rq_reply_off == 0 &&
+           (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) {
+               CERROR("real reply with offset 0\n");
+               return -EPROTO;
+       }
+
+       if (req->rq_reply_off % 8 != 0) {
+               CERROR("reply at odd offset %u\n", req->rq_reply_off);
+               return -EPROTO;
+       }
+
+       req->rq_repdata = (struct lustre_msg *)
+                               (req->rq_repbuf + req->rq_reply_off);
+       req->rq_repdata_len = req->rq_nob_received;
+
+       return do_cli_unwrap_reply(req);
+}
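+
+/*
+ * Reply buffer geometry implied above (a sketch): the server places the
+ * wrapped reply at a self-chosen, 8-byte aligned offset inside the posted
+ * buffer:
+ *
+ *     rq_repbuf                                            rq_repbuf_len
+ *     |<- rq_reply_off ->|<- rq_repdata, rq_nob_received bytes ->|......|
+ *
+ * rq_repmsg only becomes valid after do_cli_unwrap_reply() succeeds.
+ */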
+
+/**
+ * Used by ptlrpc client, to perform security transformation upon the early
+ * reply message of \a req. We expect rq_reply_off to be 0, and
+ * rq_nob_received is the early reply size.
+ *
+ * Because the receive buffer might still be posted, the reply data might be
+ * changed at any time, whether or not we're holding rq_lock. For this reason
+ * we allocate a separate ptlrpc_request and reply buffer for early reply
+ * processing.
+ *
+ * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request.
+ * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned
+ * \a *req_ret to release it.
+ * \retval -ev error number, and \a req_ret will not be set.
+ */
+int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req,
+                                  struct ptlrpc_request **req_ret)
+{
+       struct ptlrpc_request  *early_req;
+       char               *early_buf;
+       int                  early_bufsz, early_size;
+       int                  rc;
+       ENTRY;
+
+       OBD_ALLOC_PTR(early_req);
+       if (early_req == NULL)
+               RETURN(-ENOMEM);
+
+       early_size = req->rq_nob_received;
+       early_bufsz = size_roundup_power2(early_size);
+       OBD_ALLOC_LARGE(early_buf, early_bufsz);
+       if (early_buf == NULL)
+               GOTO(err_req, rc = -ENOMEM);
+
+       /* sanity checks and data copy-out are done inside the spinlock */
+       spin_lock(&req->rq_lock);
+
+       if (req->rq_replied) {
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EALREADY);
+       }
+
+       LASSERT(req->rq_repbuf);
+       LASSERT(req->rq_repdata == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+
+       if (req->rq_reply_off != 0) {
+               CERROR("early reply with offset %u\n", req->rq_reply_off);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EPROTO);
+       }
+
+       if (req->rq_nob_received != early_size) {
+               /* even if another early reply arrived, the size should
+                * be the same */
+               CERROR("data size has changed from %u to %u\n",
+                      early_size, req->rq_nob_received);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EINVAL);
+       }
+
+       if (req->rq_nob_received < sizeof(struct lustre_msg)) {
+               CERROR("early reply length %d too small\n",
+                      req->rq_nob_received);
+               spin_unlock(&req->rq_lock);
+               GOTO(err_buf, rc = -EALREADY);
+       }
+
+       memcpy(early_buf, req->rq_repbuf, early_size);
+       spin_unlock(&req->rq_lock);
+
+       spin_lock_init(&early_req->rq_lock);
+       early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx);
+       early_req->rq_flvr = req->rq_flvr;
+       early_req->rq_repbuf = early_buf;
+       early_req->rq_repbuf_len = early_bufsz;
+       early_req->rq_repdata = (struct lustre_msg *) early_buf;
+       early_req->rq_repdata_len = early_size;
+       early_req->rq_early = 1;
+       early_req->rq_reqmsg = req->rq_reqmsg;
+
+       rc = do_cli_unwrap_reply(early_req);
+       if (rc) {
+               DEBUG_REQ(D_ADAPTTO, early_req,
+                         "error %d unwrap early reply", rc);
+               GOTO(err_ctx, rc);
+       }
+
+       LASSERT(early_req->rq_repmsg);
+       *req_ret = early_req;
+       RETURN(0);
+
+err_ctx:
+       sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+err_buf:
+       OBD_FREE_LARGE(early_buf, early_bufsz);
+err_req:
+       OBD_FREE_PTR(early_req);
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc client, to release a processed early reply \a early_req.
+ *
+ * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply().
+ */
+void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req)
+{
+       LASSERT(early_req->rq_repbuf);
+       LASSERT(early_req->rq_repdata);
+       LASSERT(early_req->rq_repmsg);
+
+       sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1);
+       OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len);
+       OBD_FREE_PTR(early_req);
+}
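+
+/*
+ * The two functions above pair up; a usage sketch (hypothetical caller,
+ * error handling elided):
+ *
+ *     struct ptlrpc_request *early_req;
+ *     int rc;
+ *
+ *     rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+ *     if (rc == 0) {
+ *             ... inspect early_req->rq_repmsg (e.g. timeout hints) ...
+ *             sptlrpc_cli_finish_early_reply(early_req);
+ *     }
+ */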
+
+/**************************************************
+ * sec ID                                         *
+ **************************************************/
+
+/*
+ * "fixed" secs (e.g. null) use sec_id < 0
+ */
+static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1);
+
+int sptlrpc_get_next_secid(void)
+{
+       return atomic_inc_return(&sptlrpc_sec_id);
+}
+EXPORT_SYMBOL(sptlrpc_get_next_secid);
+
+/**************************************************
+ * client side high-level security APIs           *
+ **************************************************/
+
+static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid,
+                                  int grace, int force)
+{
+       struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+       LASSERT(policy->sp_cops);
+       LASSERT(policy->sp_cops->flush_ctx_cache);
+
+       return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force);
+}
+
+static void sec_cop_destroy_sec(struct ptlrpc_sec *sec)
+{
+       struct ptlrpc_sec_policy *policy = sec->ps_policy;
+
+       LASSERT_ATOMIC_ZERO(&sec->ps_refcount);
+       LASSERT_ATOMIC_ZERO(&sec->ps_nctx);
+       LASSERT(policy->sp_cops->destroy_sec);
+
+       CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec);
+
+       policy->sp_cops->destroy_sec(sec);
+       sptlrpc_policy_put(policy);
+}
+
+void sptlrpc_sec_destroy(struct ptlrpc_sec *sec)
+{
+       sec_cop_destroy_sec(sec);
+}
+EXPORT_SYMBOL(sptlrpc_sec_destroy);
+
+static void sptlrpc_sec_kill(struct ptlrpc_sec *sec)
+{
+       LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+       if (sec->ps_policy->sp_cops->kill_sec) {
+               sec->ps_policy->sp_cops->kill_sec(sec);
+
+               sec_cop_flush_ctx_cache(sec, -1, 1, 1);
+       }
+}
+
+struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec)
+{
+       if (sec)
+               atomic_inc(&sec->ps_refcount);
+
+       return sec;
+}
+EXPORT_SYMBOL(sptlrpc_sec_get);
+
+void sptlrpc_sec_put(struct ptlrpc_sec *sec)
+{
+       if (sec) {
+               LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+               if (atomic_dec_and_test(&sec->ps_refcount)) {
+                       sptlrpc_gc_del_sec(sec);
+                       sec_cop_destroy_sec(sec);
+               }
+       }
+}
+EXPORT_SYMBOL(sptlrpc_sec_put);
+
+/*
+ * the policy module is responsible for taking a reference on the import
+ */
+static
+struct ptlrpc_sec *sptlrpc_sec_create(struct obd_import *imp,
+                                     struct ptlrpc_svc_ctx *svc_ctx,
+                                     struct sptlrpc_flavor *sf,
+                                     enum lustre_sec_part sp)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct ptlrpc_sec       *sec;
+       char                  str[32];
+       ENTRY;
+
+       if (svc_ctx) {
+               LASSERT(imp->imp_dlm_fake == 1);
+
+               CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n",
+                      imp->imp_obd->obd_type->typ_name,
+                      imp->imp_obd->obd_name,
+                      sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+               policy = sptlrpc_policy_get(svc_ctx->sc_policy);
+               sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY;
+       } else {
+               LASSERT(imp->imp_dlm_fake == 0);
+
+               CDEBUG(D_SEC, "%s %s: select security flavor %s\n",
+                      imp->imp_obd->obd_type->typ_name,
+                      imp->imp_obd->obd_name,
+                      sptlrpc_flavor2name(sf, str, sizeof(str)));
+
+               policy = sptlrpc_wireflavor2policy(sf->sf_rpc);
+               if (!policy) {
+                       CERROR("invalid flavor 0x%x\n", sf->sf_rpc);
+                       RETURN(NULL);
+               }
+       }
+
+       sec = policy->sp_cops->create_sec(imp, svc_ctx, sf);
+       if (sec) {
+               atomic_inc(&sec->ps_refcount);
+
+               sec->ps_part = sp;
+
+               if (sec->ps_gc_interval && policy->sp_cops->gc_ctx)
+                       sptlrpc_gc_add_sec(sec);
+       } else {
+               sptlrpc_policy_put(policy);
+       }
+
+       RETURN(sec);
+}
+
+struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp)
+{
+       struct ptlrpc_sec *sec;
+
+       spin_lock(&imp->imp_lock);
+       sec = sptlrpc_sec_get(imp->imp_sec);
+       spin_unlock(&imp->imp_lock);
+
+       return sec;
+}
+EXPORT_SYMBOL(sptlrpc_import_sec_ref);
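+
+/*
+ * The canonical borrow pattern for the functions above (a sketch):
+ *
+ *     sec = sptlrpc_import_sec_ref(imp);      (takes a reference, or NULL)
+ *     if (sec) {
+ *             ... use sec ...
+ *             sptlrpc_sec_put(sec);           (last put destroys the sec)
+ *     }
+ */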
+
+static void sptlrpc_import_sec_install(struct obd_import *imp,
+                                      struct ptlrpc_sec *sec)
+{
+       struct ptlrpc_sec *old_sec;
+
+       LASSERT_ATOMIC_POS(&sec->ps_refcount);
+
+       spin_lock(&imp->imp_lock);
+       old_sec = imp->imp_sec;
+       imp->imp_sec = sec;
+       spin_unlock(&imp->imp_lock);
+
+       if (old_sec) {
+               sptlrpc_sec_kill(old_sec);
+
+               /* balance the ref taken by this import */
+               sptlrpc_sec_put(old_sec);
+       }
+}
+
+static inline
+int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2)
+{
+       return (memcmp(sf1, sf2, sizeof(*sf1)) == 0);
+}
+
+static inline
+void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src)
+{
+       *dst = *src;
+}
+
+static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp,
+                                            struct ptlrpc_sec *sec,
+                                            struct sptlrpc_flavor *sf)
+{
+       char    str1[32], str2[32];
+
+       if (sec->ps_flvr.sf_flags != sf->sf_flags)
+               CDEBUG(D_SEC, "changing sec flags: %s -> %s\n",
+                      sptlrpc_secflags2str(sec->ps_flvr.sf_flags,
+                                           str1, sizeof(str1)),
+                      sptlrpc_secflags2str(sf->sf_flags,
+                                           str2, sizeof(str2)));
+
+       spin_lock(&sec->ps_lock);
+       flavor_copy(&sec->ps_flvr, sf);
+       spin_unlock(&sec->ps_lock);
+}
+
+/**
+ * To get an appropriate ptlrpc_sec for the \a imp, according to the current
+ * configuration. When called, imp->imp_sec may or may not be NULL.
+ *
+ *  - regular import: \a svc_ctx should be NULL and \a flvr is ignored;
+ *  - reverse import: \a svc_ctx and \a flvr are obtained from incoming request.
+ */
+int sptlrpc_import_sec_adapt(struct obd_import *imp,
+                            struct ptlrpc_svc_ctx *svc_ctx,
+                            struct sptlrpc_flavor *flvr)
+{
+       struct ptlrpc_connection   *conn;
+       struct sptlrpc_flavor       sf;
+       struct ptlrpc_sec         *sec, *newsec;
+       enum lustre_sec_part    sp;
+       char                    str[24];
+       int                      rc = 0;
+       ENTRY;
+
+       might_sleep();
+
+       if (imp == NULL)
+               RETURN(0);
+
+       conn = imp->imp_connection;
+
+       if (svc_ctx == NULL) {
+               struct client_obd *cliobd = &imp->imp_obd->u.cli;
+               /*
+                * normal import: determine the flavor from the rule set,
+                * except for the mgc, whose flavor is predetermined.
+                */
+               if (cliobd->cl_sp_me == LUSTRE_SP_MGC)
+                       sf = cliobd->cl_flvr_mgc;
+               else
+                       sptlrpc_conf_choose_flavor(cliobd->cl_sp_me,
+                                                  cliobd->cl_sp_to,
+                                                  &cliobd->cl_target_uuid,
+                                                  conn->c_self, &sf);
+
+               sp = imp->imp_obd->u.cli.cl_sp_me;
+       } else {
+               /* reverse import, determine flavor from incoming request */
+               sf = *flvr;
+
+               if (sf.sf_rpc != SPTLRPC_FLVR_NULL)
+                       sf.sf_flags = PTLRPC_SEC_FL_REVERSE |
+                                     PTLRPC_SEC_FL_ROOTONLY;
+
+               sp = sptlrpc_target_sec_part(imp->imp_obd);
+       }
+
+       sec = sptlrpc_import_sec_ref(imp);
+       if (sec) {
+               char    str2[24];
+
+               if (flavor_equal(&sf, &sec->ps_flvr))
+                       GOTO(out, rc);
+
+               CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid),
+                      sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
+                      sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
+
+               if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) ==
+                   SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) &&
+                   SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
+                   SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
+                       sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
+                       GOTO(out, rc);
+               }
+       } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) !=
+                  SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) {
+               CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid),
+                      LNET_NIDNET(conn->c_self),
+                      sptlrpc_flavor2name(&sf, str, sizeof(str)));
+       }
+
+       mutex_lock(&imp->imp_sec_mutex);
+
+       newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
+       if (newsec) {
+               sptlrpc_import_sec_install(imp, newsec);
+       } else {
+               CERROR("import %s->%s: failed to create new sec\n",
+                      imp->imp_obd->obd_name,
+                      obd_uuid2str(&conn->c_remote_uuid));
+               rc = -EPERM;
+       }
+
+       mutex_unlock(&imp->imp_sec_mutex);
+out:
+       sptlrpc_sec_put(sec);
+       RETURN(rc);
+}
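+
+/*
+ * In short, sptlrpc_import_sec_adapt() picks one of three outcomes
+ * (a summary of the logic above):
+ *
+ *     flavor unchanged                     -> nothing to do
+ *     same policy and mech, other changes  -> adapt the sec in place
+ *     different policy or mech             -> create a new sec, install
+ *                                             it, kill and put the old one
+ */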
+
+void sptlrpc_import_sec_put(struct obd_import *imp)
+{
+       if (imp->imp_sec) {
+               sptlrpc_sec_kill(imp->imp_sec);
+
+               sptlrpc_sec_put(imp->imp_sec);
+               imp->imp_sec = NULL;
+       }
+}
+
+static void import_flush_ctx_common(struct obd_import *imp,
+                                   uid_t uid, int grace, int force)
+{
+       struct ptlrpc_sec *sec;
+
+       if (imp == NULL)
+               return;
+
+       sec = sptlrpc_import_sec_ref(imp);
+       if (sec == NULL)
+               return;
+
+       sec_cop_flush_ctx_cache(sec, uid, grace, force);
+       sptlrpc_sec_put(sec);
+}
+
+void sptlrpc_import_flush_root_ctx(struct obd_import *imp)
+{
+       /* it's important to use grace mode, see the explanation in
+        * sptlrpc_req_refresh_ctx() */
+       import_flush_ctx_common(imp, 0, 1, 1);
+}
+
+void sptlrpc_import_flush_my_ctx(struct obd_import *imp)
+{
+       import_flush_ctx_common(imp, current_uid(), 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx);
+
+void sptlrpc_import_flush_all_ctx(struct obd_import *imp)
+{
+       import_flush_ctx_common(imp, -1, 1, 1);
+}
+EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx);
+
+/**
+ * Used by ptlrpc client to allocate the request buffer of \a req. Upon
+ * successful return, req->rq_reqmsg points to a buffer of size \a msgsize.
+ */
+int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       int rc;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT(req->rq_reqmsg == NULL);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       policy = ctx->cc_sec->ps_policy;
+       rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize);
+       if (!rc) {
+               LASSERT(req->rq_reqmsg);
+               LASSERT(req->rq_reqbuf || req->rq_clrbuf);
+
+               /* zeroing preallocated buffer */
+               if (req->rq_pool)
+                       memset(req->rq_reqmsg, 0, msgsize);
+       }
+
+       return rc;
+}
+
+/**
+ * Used by ptlrpc client to free request buffer of \a req. After this
+ * req->rq_reqmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL)
+               return;
+
+       policy = ctx->cc_sec->ps_policy;
+       policy->sp_cops->free_reqbuf(ctx->cc_sec, req);
+       req->rq_reqmsg = NULL;
+}
+
+/*
+ * NOTE caller must guarantee the buffer size is enough for the enlargement
+ */
+void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg,
+                                 int segment, int newsize)
+{
+       void   *src, *dst;
+       int     oldsize, oldmsg_size, movesize;
+
+       LASSERT(segment < msg->lm_bufcount);
+       LASSERT(msg->lm_buflens[segment] <= newsize);
+
+       if (msg->lm_buflens[segment] == newsize)
+               return;
+
+       /* nothing to do if we are enlarging the last segment */
+       if (segment == msg->lm_bufcount - 1) {
+               msg->lm_buflens[segment] = newsize;
+               return;
+       }
+
+       oldsize = msg->lm_buflens[segment];
+
+       src = lustre_msg_buf(msg, segment + 1, 0);
+       msg->lm_buflens[segment] = newsize;
+       dst = lustre_msg_buf(msg, segment + 1, 0);
+       msg->lm_buflens[segment] = oldsize;
+
+       /* move from segment + 1 to end segment */
+       LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2);
+       oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+       movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg);
+       LASSERT(movesize >= 0);
+
+       if (movesize)
+               memmove(dst, src, movesize);
+
+       /* note we don't clear the area where the old data lived,
+        * it's not secret */
+
+       /* finally set new segment size */
+       msg->lm_buflens[segment] = newsize;
+}
+EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace);
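+
+/*
+ * A worked example of the in-place enlargement above: with buflens
+ * (96, 40, 64) and segment 1 enlarged to 72, src is the old start of
+ * segment 2 and dst its start after the 32-byte growth; the memmove()
+ * shifts segment 2 and everything behind it right by 32 bytes, and only
+ * then is lm_buflens[1] finally set to 72. This is why the caller must
+ * guarantee beforehand that the backing buffer is large enough.
+ */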
+
+/**
+ * Used by ptlrpc client to enlarge the \a segment of request message pointed
+ * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be
+ * preserved after the enlargement. This must be called after the original
+ * request buffer has been allocated.
+ *
+ * \note after this is called, rq_reqmsg and rq_reqlen might have been changed,
+ * so caller should refresh its local pointers if needed.
+ */
+int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req,
+                              int segment, int newsize)
+{
+       struct ptlrpc_cli_ctx    *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_cops   *cops;
+       struct lustre_msg       *msg = req->rq_reqmsg;
+
+       LASSERT(ctx);
+       LASSERT(msg);
+       LASSERT(msg->lm_bufcount > segment);
+       LASSERT(msg->lm_buflens[segment] <= newsize);
+
+       if (msg->lm_buflens[segment] == newsize)
+               return 0;
+
+       cops = ctx->cc_sec->ps_policy->sp_cops;
+       LASSERT(cops->enlarge_reqbuf);
+       return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize);
+}
+EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf);
+
+/**
+ * Used by ptlrpc client to allocate reply buffer of \a req.
+ *
+ * \note After this, req->rq_repmsg is still not accessible.
+ */
+int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+
+       if (req->rq_repbuf)
+               RETURN(0);
+
+       policy = ctx->cc_sec->ps_policy;
+       RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize));
+}
+
+/**
+ * Used by ptlrpc client to free reply buffer of \a req. After this
+ * req->rq_repmsg is set to NULL and should not be accessed anymore.
+ */
+void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req)
+{
+       struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx;
+       struct ptlrpc_sec_policy *policy;
+       ENTRY;
+
+       LASSERT(ctx);
+       LASSERT(ctx->cc_sec);
+       LASSERT(ctx->cc_sec->ps_policy);
+       LASSERT_ATOMIC_POS(&ctx->cc_refcount);
+
+       if (req->rq_repbuf == NULL)
+               return;
+       LASSERT(req->rq_repbuf_len);
+
+       policy = ctx->cc_sec->ps_policy;
+       policy->sp_cops->free_repbuf(ctx->cc_sec, req);
+       req->rq_repmsg = NULL;
+       EXIT;
+}
+
+int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_cli_ctx *ctx)
+{
+       struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy;
+
+       if (!policy->sp_cops->install_rctx)
+               return 0;
+       return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx);
+}
+
+int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp,
+                               struct ptlrpc_svc_ctx *ctx)
+{
+       struct ptlrpc_sec_policy *policy = ctx->sc_policy;
+
+       if (!policy->sp_sops->install_rctx)
+               return 0;
+       return policy->sp_sops->install_rctx(imp, ctx);
+}
+
+/****************************************
+ * server side security                 *
+ ****************************************/
+
+static int flavor_allowed(struct sptlrpc_flavor *exp,
+                         struct ptlrpc_request *req)
+{
+       struct sptlrpc_flavor *flvr = &req->rq_flvr;
+
+       if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc)
+               return 1;
+
+       if ((req->rq_ctx_init || req->rq_ctx_fini) &&
+           SPTLRPC_FLVR_POLICY(exp->sf_rpc) ==
+           SPTLRPC_FLVR_POLICY(flvr->sf_rpc) &&
+           SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc))
+               return 1;
+
+       return 0;
+}
+
+#define EXP_FLVR_UPDATE_EXPIRE      (OBD_TIMEOUT_DEFAULT + 10)
+
+/**
+ * Given an export \a exp, check whether the flavor of incoming \a req
+ * is allowed by the export \a exp. The main logic deals with changing
+ * configurations. Return 0 means success.
+ */
+int sptlrpc_target_export_check(struct obd_export *exp,
+                               struct ptlrpc_request *req)
+{
+       struct sptlrpc_flavor   flavor;
+
+       if (exp == NULL)
+               return 0;
+
+       /* client side export has no imp_reverse, skip
+        * FIXME maybe we should check the flavor as well??? */
+       if (exp->exp_imp_reverse == NULL)
+               return 0;
+
+       /* don't care about ctx fini rpc */
+       if (req->rq_ctx_fini)
+               return 0;
+
+       spin_lock(&exp->exp_lock);
+
+       /* if the flavor just changed (exp->exp_flvr_changed != 0), we wait
+        * for the first req with the new flavor, then treat it as the current
+        * flavor and adapt the reverse sec according to it.
+        * note the first rpc with the new flavor might not carry a root ctx,
+        * in which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */
+       if (unlikely(exp->exp_flvr_changed) &&
+           flavor_allowed(&exp->exp_flvr_old[1], req)) {
+               /* make the new flavor the "current" one, and the old ones
+                * about-to-expire */
+               CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp,
+                      exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc);
+               flavor = exp->exp_flvr_old[1];
+               exp->exp_flvr_old[1] = exp->exp_flvr_old[0];
+               exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0];
+               exp->exp_flvr_old[0] = exp->exp_flvr;
+               exp->exp_flvr_expire[0] = cfs_time_current_sec() +
+                                         EXP_FLVR_UPDATE_EXPIRE;
+               exp->exp_flvr = flavor;
+
+               /* flavor change finished */
+               exp->exp_flvr_changed = 0;
+               LASSERT(exp->exp_flvr_adapt == 1);
+
+               /* if it's gss, we're only interested in root ctx init */
+               if (req->rq_auth_gss &&
+                   !(req->rq_ctx_init &&
+                     (req->rq_auth_usr_root || req->rq_auth_usr_mdt ||
+                      req->rq_auth_usr_ost))) {
+                       spin_unlock(&exp->exp_lock);
+                       CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n",
+                              req->rq_auth_gss, req->rq_ctx_init,
+                              req->rq_auth_usr_root, req->rq_auth_usr_mdt,
+                              req->rq_auth_usr_ost);
+                       return 0;
+               }
+
+               exp->exp_flvr_adapt = 0;
+               spin_unlock(&exp->exp_lock);
+
+               return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+                                               req->rq_svc_ctx, &flavor);
+       }
+
+       /* if it equals the current flavor, we accept it, but still need to
+        * deal with the reverse sec/ctx */
+       if (likely(flavor_allowed(&exp->exp_flvr, req))) {
+               /* most cases should return here; we're only interested in
+                * gss root ctx init */
+               if (!req->rq_auth_gss || !req->rq_ctx_init ||
+                   (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+                    !req->rq_auth_usr_ost)) {
+                       spin_unlock(&exp->exp_lock);
+                       return 0;
+               }
+
+               /* if the flavor just changed, we should not proceed; just
+                * leave it, the current flavor will be discovered and
+                * replaced shortly, and let _this_ rpc pass through */
+               if (exp->exp_flvr_changed) {
+                       LASSERT(exp->exp_flvr_adapt);
+                       spin_unlock(&exp->exp_lock);
+                       return 0;
+               }
+
+               if (exp->exp_flvr_adapt) {
+                       exp->exp_flvr_adapt = 0;
+                       CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n",
+                              exp, exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[0].sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+                       flavor = exp->exp_flvr;
+                       spin_unlock(&exp->exp_lock);
+
+                       return sptlrpc_import_sec_adapt(exp->exp_imp_reverse,
+                                                       req->rq_svc_ctx,
+                                                       &flavor);
+               } else {
+                       CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, "
+                              "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[0].sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+                       spin_unlock(&exp->exp_lock);
+
+                       return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse,
+                                                          req->rq_svc_ctx);
+               }
+       }
+
+       if (exp->exp_flvr_expire[0]) {
+               if (exp->exp_flvr_expire[0] >= cfs_time_current_sec()) {
+                       if (flavor_allowed(&exp->exp_flvr_old[0], req)) {
+                               CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+                                      "middle one ("CFS_DURATION_T")\n", exp,
+                                      exp->exp_flvr.sf_rpc,
+                                      exp->exp_flvr_old[0].sf_rpc,
+                                      exp->exp_flvr_old[1].sf_rpc,
+                                      exp->exp_flvr_expire[0] -
+                                               cfs_time_current_sec());
+                               spin_unlock(&exp->exp_lock);
+                               return 0;
+                       }
+               } else {
+                       CDEBUG(D_SEC, "mark middle expired\n");
+                       exp->exp_flvr_expire[0] = 0;
+               }
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp,
+                      exp->exp_flvr.sf_rpc,
+                      exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+                      req->rq_flvr.sf_rpc);
+       }
+
+       /* now it doesn't match the current flavor; the only chance to
+        * accept it is to match an old flavor which has not expired. */
+       if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) {
+               if (exp->exp_flvr_expire[1] >= cfs_time_current_sec()) {
+                       if (flavor_allowed(&exp->exp_flvr_old[1], req)) {
+                               CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the "
+                                      "oldest one ("CFS_DURATION_T")\n", exp,
+                                      exp->exp_flvr.sf_rpc,
+                                      exp->exp_flvr_old[0].sf_rpc,
+                                      exp->exp_flvr_old[1].sf_rpc,
+                                      exp->exp_flvr_expire[1] -
+                                               cfs_time_current_sec());
+                               spin_unlock(&exp->exp_lock);
+                               return 0;
+                       }
+               } else {
+                       CDEBUG(D_SEC, "mark oldest expired\n");
+                       exp->exp_flvr_expire[1] = 0;
+               }
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n",
+                      exp, exp->exp_flvr.sf_rpc,
+                      exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc,
+                      req->rq_flvr.sf_rpc);
+       } else {
+               CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n",
+                      exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc,
+                      exp->exp_flvr_old[1].sf_rpc);
+       }
+
+       spin_unlock(&exp->exp_lock);
+
+       CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with "
+             "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n",
+             exp, exp->exp_obd->obd_name,
+             req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini,
+             req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost,
+             req->rq_flvr.sf_rpc,
+             exp->exp_flvr.sf_rpc,
+             exp->exp_flvr_old[0].sf_rpc,
+             exp->exp_flvr_expire[0] ?
+             (unsigned long) (exp->exp_flvr_expire[0] -
+                              cfs_time_current_sec()) : 0,
+             exp->exp_flvr_old[1].sf_rpc,
+             exp->exp_flvr_expire[1] ?
+             (unsigned long) (exp->exp_flvr_expire[1] -
+                              cfs_time_current_sec()) : 0);
+       return -EACCES;
+}
+EXPORT_SYMBOL(sptlrpc_target_export_check);
+
+void sptlrpc_target_update_exp_flavor(struct obd_device *obd,
+                                     struct sptlrpc_rule_set *rset)
+{
+       struct obd_export       *exp;
+       struct sptlrpc_flavor    new_flvr;
+
+       LASSERT(obd);
+
+       spin_lock(&obd->obd_dev_lock);
+
+       list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+               if (exp->exp_connection == NULL)
+                       continue;
+
+               /* note if this export's flavor was just updated
+                * (exp_flvr_changed == 1), this will override the
+                * previous update. */
+               spin_lock(&exp->exp_lock);
+               sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer,
+                                            exp->exp_connection->c_peer.nid,
+                                            &new_flvr);
+               if (exp->exp_flvr_changed ||
+                   !flavor_equal(&new_flvr, &exp->exp_flvr)) {
+                       exp->exp_flvr_old[1] = new_flvr;
+                       exp->exp_flvr_expire[1] = 0;
+                       exp->exp_flvr_changed = 1;
+                       exp->exp_flvr_adapt = 1;
+
+                       CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n",
+                              exp, sptlrpc_part2name(exp->exp_sp_peer),
+                              exp->exp_flvr.sf_rpc,
+                              exp->exp_flvr_old[1].sf_rpc);
+               }
+               spin_unlock(&exp->exp_lock);
+       }
+
+       spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor);
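+
+/*
+ * Summary of the export flavor bookkeeping used by the two functions above
+ * (a sketch):
+ *
+ *     exp_flvr        - current flavor
+ *     exp_flvr_old[0] - previous flavor, valid until exp_flvr_expire[0]
+ *     exp_flvr_old[1] - oldest flavor,   valid until exp_flvr_expire[1]
+ *
+ * A config change parks the new flavor in exp_flvr_old[1] and sets
+ * exp_flvr_changed; the first request arriving with that flavor rotates it
+ * into exp_flvr and pushes the older entries down, stamping the
+ * just-replaced flavor with an EXP_FLVR_UPDATE_EXPIRE grace period.
+ */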
+
+static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc)
+{
+       /* peer's claim is unreliable unless gss is being used */
+       if (!req->rq_auth_gss || svc_rc == SECSVC_DROP)
+               return svc_rc;
+
+       switch (req->rq_sp_from) {
+       case LUSTRE_SP_CLI:
+               if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source CLI");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_MDT:
+               if (!req->rq_auth_usr_mdt) {
+                       DEBUG_REQ(D_ERROR, req, "faked source MDT");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_OST:
+               if (!req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source OST");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_MGS:
+       case LUSTRE_SP_MGC:
+               if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt &&
+                   !req->rq_auth_usr_ost) {
+                       DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS");
+                       svc_rc = SECSVC_DROP;
+               }
+               break;
+       case LUSTRE_SP_ANY:
+       default:
+               DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from);
+               svc_rc = SECSVC_DROP;
+       }
+
+       return svc_rc;
+}
+
+/**
+ * Used by ptlrpc server, to perform transformation upon the request message
+ * of incoming \a req. This must be the first thing done with an incoming
+ * request in the ptlrpc layer.
+ *
+ * \retval SECSVC_OK success, and req->rq_reqmsg points to the request
+ * message in clear text, of size req->rq_reqlen; also req->rq_svc_ctx is set.
+ * \retval SECSVC_COMPLETE success, the request has been fully processed, and
+ * reply message has been prepared.
+ * \retval SECSVC_DROP failed, this request should be dropped.
+ */
+int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct lustre_msg       *msg = req->rq_reqbuf;
+       int                    rc;
+       ENTRY;
+
+       LASSERT(msg);
+       LASSERT(req->rq_reqmsg == NULL);
+       LASSERT(req->rq_repmsg == NULL);
+       LASSERT(req->rq_svc_ctx == NULL);
+
+       req->rq_req_swab_mask = 0;
+
+       rc = __lustre_unpack_msg(msg, req->rq_reqdata_len);
+       switch (rc) {
+       case 1:
+               lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF);
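+               /* fall through */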
+       case 0:
+               break;
+       default:
+               CERROR("error unpacking request from %s x"LPU64"\n",
+                      libcfs_id2str(req->rq_peer), req->rq_xid);
+               RETURN(SECSVC_DROP);
+       }
+
+       req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr);
+       req->rq_sp_from = LUSTRE_SP_ANY;
+       req->rq_auth_uid = INVALID_UID;
+       req->rq_auth_mapped_uid = INVALID_UID;
+
+       policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc);
+       if (!policy) {
+               CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               RETURN(SECSVC_DROP);
+       }
+
+       LASSERT(policy->sp_sops->accept);
+       rc = policy->sp_sops->accept(req);
+       sptlrpc_policy_put(policy);
+       LASSERT(req->rq_reqmsg || rc != SECSVC_OK);
+       LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP);
+
+       /*
+        * if it's not the null flavor (i.e. the real msg is embedded inside
+        * a wrapper), reset the swab mask for the coming inner msg unpacking.
+        */
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL)
+               req->rq_req_swab_mask = 0;
+
+       /* sanity check for the request source */
+       rc = sptlrpc_svc_check_from(req, rc);
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed,
+ * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to
+ * a buffer of \a msglen size.
+ */
+int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen)
+{
+       struct ptlrpc_sec_policy *policy;
+       struct ptlrpc_reply_state *rs;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_svc_ctx->sc_policy);
+
+       policy = req->rq_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->alloc_rs);
+
+       rc = policy->sp_sops->alloc_rs(req, msglen);
+       if (unlikely(rc == -ENOMEM)) {
+               /* failed alloc, try emergency pool */
+               rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_svcpt);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               req->rq_reply_state = rs;
+               rc = policy->sp_sops->alloc_rs(req, msglen);
+               if (rc) {
+                       lustre_put_emerg_rs(rs);
+                       req->rq_reply_state = NULL;
+               }
+       }
+
+       LASSERT(rc != 0 ||
+               (req->rq_reply_state && req->rq_reply_state->rs_msg));
+
+       RETURN(rc);
+}
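+
+/*
+ * Note the retry above: on -ENOMEM a preallocated reply state is borrowed
+ * from the per-service emergency pool and offered to the policy through
+ * req->rq_reply_state; rs_prealloc marks it so that sptlrpc_svc_free_rs()
+ * below returns it to the pool instead of freeing it.
+ */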
+
+/**
+ * Used by ptlrpc server, to perform transformation upon the reply message.
+ *
+ * \post req->rq_reply_off is set to the appropriate server-controlled reply
+ * offset.
+ * \post req->rq_repmsg and req->rq_reply_state->rs_msg become inaccessible.
+ */
+int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_sec_policy *policy;
+       int rc;
+       ENTRY;
+
+       LASSERT(req->rq_svc_ctx);
+       LASSERT(req->rq_svc_ctx->sc_policy);
+
+       policy = req->rq_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->authorize);
+
+       rc = policy->sp_sops->authorize(req);
+       LASSERT(rc || req->rq_reply_state->rs_repdata_len);
+
+       RETURN(rc);
+}
+
+/**
+ * Used by ptlrpc server, to free reply_state.
+ */
+void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_sec_policy *policy;
+       unsigned int prealloc;
+       ENTRY;
+
+       LASSERT(rs->rs_svc_ctx);
+       LASSERT(rs->rs_svc_ctx->sc_policy);
+
+       policy = rs->rs_svc_ctx->sc_policy;
+       LASSERT(policy->sp_sops->free_rs);
+
+       prealloc = rs->rs_prealloc;
+       policy->sp_sops->free_rs(rs);
+
+       if (prealloc)
+               lustre_put_emerg_rs(rs);
+       EXIT;
+}
+
+void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx != NULL)
+               atomic_inc(&ctx->sc_refcount);
+}
+
+void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx == NULL)
+               return;
+
+       LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+       if (atomic_dec_and_test(&ctx->sc_refcount)) {
+               if (ctx->sc_policy->sp_sops->free_ctx)
+                       ctx->sc_policy->sp_sops->free_ctx(ctx);
+       }
+       req->rq_svc_ctx = NULL;
+}
+
+void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req)
+{
+       struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx;
+
+       if (ctx == NULL)
+               return;
+
+       LASSERT_ATOMIC_POS(&ctx->sc_refcount);
+       if (ctx->sc_policy->sp_sops->invalidate_ctx)
+               ctx->sc_policy->sp_sops->invalidate_ctx(ctx);
+}
+EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate);
+
+/****************************************
+ * bulk security                        *
+ ****************************************/
+
+/**
+ * Perform transformation upon the bulk data pointed to by \a desc. This is
+ * called before transforming the request message.
+ */
+int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return 0;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->wrap_bulk)
+               return ctx->cc_ops->wrap_bulk(ctx, req, desc);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk);
+
+/**
+ * This is called after the reply message has been unwrapped. Returns the
+ * number of bytes (nob) of plain text actually received, or an error code.
+ */
+int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req,
+                                struct ptlrpc_bulk_desc *desc,
+                                int nob)
+{
+       struct ptlrpc_cli_ctx  *ctx;
+       int                  rc;
+
+       LASSERT(req->rq_bulk_read && !req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return desc->bd_nob_transferred;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->unwrap_bulk) {
+               rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+               if (rc < 0)
+                       return rc;
+       }
+       return desc->bd_nob_transferred;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read);
+
+/**
+ * This is called after the reply message has been unwrapped. Returns 0 on
+ * success, or an error code.
+ */
+int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req,
+                                 struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_cli_ctx  *ctx;
+       int                  rc;
+
+       LASSERT(!req->rq_bulk_read && req->rq_bulk_write);
+
+       if (!req->rq_pack_bulk)
+               return 0;
+
+       ctx = req->rq_cli_ctx;
+       if (ctx->cc_ops->unwrap_bulk) {
+               rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc);
+               if (rc < 0)
+                       return rc;
+       }
+
+       /*
+        * if everything went right, nob should equal nob_transferred.
+        * in privacy mode, nob_transferred needs to be adjusted.
+        */
+       if (desc->bd_nob != desc->bd_nob_transferred) {
+               CERROR("nob %d doesn't match transferred nob %d\n",
+                      desc->bd_nob, desc->bd_nob_transferred);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write);
+
+
+/****************************************
+ * user descriptor helpers              *
+ ****************************************/
+
+int sptlrpc_current_user_desc_size(void)
+{
+       int ngroups;
+
+       ngroups = current_ngroups;
+
+       if (ngroups > LUSTRE_MAX_GROUPS)
+               ngroups = LUSTRE_MAX_GROUPS;
+       return sptlrpc_user_desc_size(ngroups);
+}
+EXPORT_SYMBOL(sptlrpc_current_user_desc_size);
+
+int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset)
+{
+       struct ptlrpc_user_desc *pud;
+
+       pud = lustre_msg_buf(msg, offset, 0);
+
+       pud->pud_uid = current_uid();
+       pud->pud_gid = current_gid();
+       pud->pud_fsuid = current_fsuid();
+       pud->pud_fsgid = current_fsgid();
+       pud->pud_cap = cfs_curproc_cap_pack();
+       pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4;
+
+       task_lock(current);
+       if (pud->pud_ngroups > current_ngroups)
+               pud->pud_ngroups = current_ngroups;
+       memcpy(pud->pud_groups, current_cred()->group_info->blocks[0],
+              pud->pud_ngroups * sizeof(__u32));
+       task_unlock(current);
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_pack_user_desc);
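+
+/*
+ * The user descriptor wire format implied by sptlrpc_pack_user_desc() above
+ * and sptlrpc_unpack_user_desc() below (a sketch; per the swabbing code,
+ * every field is a __u32):
+ *
+ *     pud_uid | pud_gid | pud_fsuid | pud_fsgid | pud_cap | pud_ngroups
+ *     pud_groups[0] ... pud_groups[pud_ngroups - 1]
+ *
+ * hence the "(lm_buflens[offset] - sizeof(*pud)) / 4" group capacity above
+ * and the size/LUSTRE_MAX_GROUPS checks on the unpack side.
+ */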
+
+int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed)
+{
+       struct ptlrpc_user_desc *pud;
+       int                   i;
+
+       pud = lustre_msg_buf(msg, offset, sizeof(*pud));
+       if (!pud)
+               return -EINVAL;
+
+       if (swabbed) {
+               __swab32s(&pud->pud_uid);
+               __swab32s(&pud->pud_gid);
+               __swab32s(&pud->pud_fsuid);
+               __swab32s(&pud->pud_fsgid);
+               __swab32s(&pud->pud_cap);
+               __swab32s(&pud->pud_ngroups);
+       }
+
+       if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) {
+               CERROR("%u groups are too many\n", pud->pud_ngroups);
+               return -EINVAL;
+       }
+
+       if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) >
+           msg->lm_buflens[offset]) {
+               CERROR("%u groups claimed but bufsize is only %u\n",
+                      pud->pud_ngroups, msg->lm_buflens[offset]);
+               return -EINVAL;
+       }
+
+       if (swabbed) {
+               for (i = 0; i < pud->pud_ngroups; i++)
+                       __swab32s(&pud->pud_groups[i]);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_unpack_user_desc);
+
+/****************************************
+ * misc helpers                         *
+ ****************************************/
+
+const char *sec2target_str(struct ptlrpc_sec *sec)
+{
+       if (!sec || !sec->ps_import || !sec->ps_import->imp_obd)
+               return "*";
+       if (sec_is_reverse(sec))
+               return "c";
+       return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid);
+}
+EXPORT_SYMBOL(sec2target_str);
+
+/*
+ * return true if the bulk data is protected
+ */
+int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr)
+{
+       switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) {
+       case SPTLRPC_BULK_SVC_INTG:
+       case SPTLRPC_BULK_SVC_PRIV:
+               return 1;
+       default:
+               return 0;
+       }
+}
+EXPORT_SYMBOL(sptlrpc_flavor_has_bulk);
+
+/****************************************
+ * crypto API helper/alloc blkcipher    *
+ ****************************************/
+
+/****************************************
+ * initialize/finalize                  *
+ ****************************************/
+
+int sptlrpc_init(void)
+{
+       int rc;
+
+       rwlock_init(&policy_lock);
+
+       rc = sptlrpc_gc_init();
+       if (rc)
+               goto out;
+
+       rc = sptlrpc_conf_init();
+       if (rc)
+               goto out_gc;
+
+       rc = sptlrpc_enc_pool_init();
+       if (rc)
+               goto out_conf;
+
+       rc = sptlrpc_null_init();
+       if (rc)
+               goto out_pool;
+
+       rc = sptlrpc_plain_init();
+       if (rc)
+               goto out_null;
+
+       rc = sptlrpc_lproc_init();
+       if (rc)
+               goto out_plain;
+
+       return 0;
+
+out_plain:
+       sptlrpc_plain_fini();
+out_null:
+       sptlrpc_null_fini();
+out_pool:
+       sptlrpc_enc_pool_fini();
+out_conf:
+       sptlrpc_conf_fini();
+out_gc:
+       sptlrpc_gc_fini();
+out:
+       return rc;
+}
+
+void sptlrpc_fini(void)
+{
+       sptlrpc_lproc_fini();
+       sptlrpc_plain_fini();
+       sptlrpc_null_fini();
+       sptlrpc_enc_pool_fini();
+       sptlrpc_conf_fini();
+       sptlrpc_gc_fini();
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
new file mode 100644 (file)
index 0000000..bf53f1b
--- /dev/null
@@ -0,0 +1,880 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_bulk.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+/****************************************
+ * bulk encryption page pools           *
+ ****************************************/
+
+#define PTRS_PER_PAGE   (PAGE_CACHE_SIZE / sizeof(void *))
+#define PAGES_PER_POOL  (PTRS_PER_PAGE)
+
+#define IDLE_IDX_MAX       (100)
+#define IDLE_IDX_WEIGHT         (3)
+
+#define CACHE_QUIESCENT_PERIOD  (20)
+
+static struct ptlrpc_enc_page_pool {
+       /*
+        * constants
+        */
+       unsigned long    epp_max_pages;   /* maximum pages we can hold, const */
+       unsigned int     epp_max_pools;   /* number of pools, const */
+
+       /*
+        * wait queue in case of not enough free pages.
+        */
+       wait_queue_head_t      epp_waitq;       /* waiting threads */
+       unsigned int     epp_waitqlen;    /* wait queue length */
+       unsigned long    epp_pages_short; /* # of pages wanted by in-q users */
+       unsigned int     epp_growing:1;   /* during adding pages */
+
+       /*
+        * indicates how idle the pools are, from 0 to IDLE_IDX_MAX. it is
+        * advanced each time pages are taken from the pools, not based on
+        * wall time, which means the idle_idx might still be low after the
+        * system has been idle for a while, if no activity happened in the
+        * pools.
+        */
+       unsigned long    epp_idle_idx;
+
+       /* last shrink time due to memory pressure */
+       long         epp_last_shrink;
+       long         epp_last_access;
+
+       /*
+        * in-pool pages bookkeeping
+        */
+       spinlock_t       epp_lock;         /* protect following fields */
+       unsigned long    epp_total_pages; /* total pages in pools */
+       unsigned long    epp_free_pages;  /* current pages available */
+
+       /*
+        * statistics
+        */
+       unsigned long    epp_st_max_pages;      /* # of pages ever reached */
+       unsigned int     epp_st_grows;    /* # of grows */
+       unsigned int     epp_st_grow_fails;     /* # of add pages failures */
+       unsigned int     epp_st_shrinks;        /* # of shrinks */
+       unsigned long    epp_st_access;  /* # of accesses */
+       unsigned long    epp_st_missings;       /* # of cache misses */
+       unsigned long    epp_st_lowfree;        /* lowest free pages reached */
+       unsigned int     epp_st_max_wqlen;      /* highest waitqueue length */
+       cfs_time_t       epp_st_max_wait;       /* in jiffies */
+       /*
+        * pointers to pools
+        */
+       struct page    ***epp_pools;
+} page_pools;
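+
+/*
+ * The pools form a two-level array: epp_pools[p_idx][g_idx], each pool
+ * page holding PAGES_PER_POOL page pointers, so free page N lives at
+ * (a sketch):
+ *
+ *     p_idx = N / PAGES_PER_POOL;
+ *     g_idx = N % PAGES_PER_POOL;
+ *
+ * e.g. with 4K pages and 64-bit pointers (PAGES_PER_POOL == 512), page
+ * 1000 is epp_pools[1][488].
+ */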
+
+/*
+ * memory shrinker
+ */
+const int pools_shrinker_seeks = DEFAULT_SEEKS;
+static struct shrinker *pools_shrinker = NULL;
+
+
+/*
+ * /proc/fs/lustre/sptlrpc/encrypt_page_pools
+ */
+int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v)
+{
+       int     rc;
+
+       spin_lock(&page_pools.epp_lock);
+
+       rc = seq_printf(m,
+                     "physical pages:          %lu\n"
+                     "pages per pool:          %lu\n"
+                     "max pages:               %lu\n"
+                     "max pools:               %u\n"
+                     "total pages:             %lu\n"
+                     "total free:              %lu\n"
+                     "idle index:              %lu/100\n"
+                     "last shrink:             %lds\n"
+                     "last access:             %lds\n"
+                     "max pages reached:       %lu\n"
+                     "grows:                   %u\n"
+                     "grows failure:           %u\n"
+                     "shrinks:                 %u\n"
+                     "cache access:            %lu\n"
+                     "cache missing:           %lu\n"
+                     "low free mark:           %lu\n"
+                     "max waitqueue depth:     %u\n"
+                     "max wait time:           "CFS_TIME_T"/%u\n"
+                     ,
+                     num_physpages,
+                     PAGES_PER_POOL,
+                     page_pools.epp_max_pages,
+                     page_pools.epp_max_pools,
+                     page_pools.epp_total_pages,
+                     page_pools.epp_free_pages,
+                     page_pools.epp_idle_idx,
+                     cfs_time_current_sec() - page_pools.epp_last_shrink,
+                     cfs_time_current_sec() - page_pools.epp_last_access,
+                     page_pools.epp_st_max_pages,
+                     page_pools.epp_st_grows,
+                     page_pools.epp_st_grow_fails,
+                     page_pools.epp_st_shrinks,
+                     page_pools.epp_st_access,
+                     page_pools.epp_st_missings,
+                     page_pools.epp_st_lowfree,
+                     page_pools.epp_st_max_wqlen,
+                     page_pools.epp_st_max_wait, HZ
+                    );
+
+       spin_unlock(&page_pools.epp_lock);
+       return rc;
+}
+
+static void enc_pools_release_free_pages(long npages)
+{
+       int     p_idx, g_idx;
+       int     p_idx_max1, p_idx_max2;
+
+       LASSERT(npages > 0);
+       LASSERT(npages <= page_pools.epp_free_pages);
+       LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages);
+
+       /* max pool index before the release */
+       p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL;
+
+       page_pools.epp_free_pages -= npages;
+       page_pools.epp_total_pages -= npages;
+
+       /* max pool index after the release */
+       p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 :
+                    ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL);
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+       LASSERT(page_pools.epp_pools[p_idx]);
+
+       while (npages--) {
+               LASSERT(page_pools.epp_pools[p_idx]);
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+
+               __free_page(page_pools.epp_pools[p_idx][g_idx]);
+               page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       /* free unused pools */
+       while (p_idx_max1 < p_idx_max2) {
+               LASSERT(page_pools.epp_pools[p_idx_max2]);
+               OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE);
+               page_pools.epp_pools[p_idx_max2] = NULL;
+               p_idx_max2--;
+       }
+}
+
+/*
+ * could be called frequently for query (@nr_to_scan == 0).
+ * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool.
+ */
+static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+       if (unlikely(shrink_param(sc, nr_to_scan) != 0)) {
+               spin_lock(&page_pools.epp_lock);
+               shrink_param(sc, nr_to_scan) = min_t(unsigned long,
+                                                  shrink_param(sc, nr_to_scan),
+                                                  page_pools.epp_free_pages -
+                                                  PTLRPC_MAX_BRW_PAGES);
+               if (shrink_param(sc, nr_to_scan) > 0) {
+                       enc_pools_release_free_pages(shrink_param(sc,
+                                                                 nr_to_scan));
+                       CDEBUG(D_SEC, "released %ld pages, %ld left\n",
+                              (long)shrink_param(sc, nr_to_scan),
+                              page_pools.epp_free_pages);
+
+                       page_pools.epp_st_shrinks++;
+                       page_pools.epp_last_shrink = cfs_time_current_sec();
+               }
+               spin_unlock(&page_pools.epp_lock);
+       }
+
+       /*
+        * if there has been no pool access for a long time, we consider
+        * the pools fully idle. a little race here is fine.
+        */
+       if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access >
+                    CACHE_QUIESCENT_PERIOD)) {
+               spin_lock(&page_pools.epp_lock);
+               page_pools.epp_idle_idx = IDLE_IDX_MAX;
+               spin_unlock(&page_pools.epp_lock);
+       }
+
+       LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX);
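+       /*
+        * for illustration, with 1280 free pages, PTLRPC_MAX_BRW_PAGES
+        * = 256 (an assumed value) and idle_idx = 50, we would report
+        * (1280 - 256) * (100 - 50) / 100 = 512 reclaimable pages.
+        */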
+       return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) *
+               (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX;
+}
+
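+/*
+ * e.g. with PAGES_PER_POOL = 512, npages_to_npools(1) == 1 and
+ * npages_to_npools(513) == 2: a partially-filled pool still needs a
+ * whole pointer page.
+ */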
+static inline int npages_to_npools(unsigned long npages)
+{
+       return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL);
+}
+
+/*
+ * return how many pages were cleaned up.
+ */
+static unsigned long enc_pools_cleanup(struct page ***pools, int npools)
+{
+       unsigned long cleaned = 0;
+       int        i, j;
+
+       for (i = 0; i < npools; i++) {
+               if (pools[i]) {
+                       for (j = 0; j < PAGES_PER_POOL; j++) {
+                               if (pools[i][j]) {
+                                       __free_page(pools[i][j]);
+                                       cleaned++;
+                               }
+                       }
+                       OBD_FREE(pools[i], PAGE_CACHE_SIZE);
+                       pools[i] = NULL;
+               }
+       }
+
+       return cleaned;
+}
+
+/*
+ * merge the @npools pools pointed to by @pools, which contain @npages
+ * new pages, into the current pools.
+ *
+ * we could avoid most of the memory copying with some tricks, but we
+ * choose the simplest way to avoid complexity; this is not called often.
+ */
+static void enc_pools_insert(struct page ***pools, int npools, int npages)
+{
+       int     freeslot;
+       int     op_idx, np_idx, og_idx, ng_idx;
+       int     cur_npools, end_npools;
+
+       LASSERT(npages > 0);
+       LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages);
+       LASSERT(npages_to_npools(npages) == npools);
+       LASSERT(page_pools.epp_growing);
+
+       spin_lock(&page_pools.epp_lock);
+
+       /*
+        * (1) fill all the free slots of current pools.
+        */
+       /* free slots are those left by rented-out pages, plus the extra
+        * ones with index >= total_pages, located at the tail of the
+        * last pool. */
+       freeslot = page_pools.epp_total_pages % PAGES_PER_POOL;
+       if (freeslot != 0)
+               freeslot = PAGES_PER_POOL - freeslot;
+       freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages;
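+       /*
+        * for illustration, with PAGES_PER_POOL = 512, 1000 total pages
+        * and 900 free, the tail slack is 512 - (1000 % 512) = 24 slots
+        * and 1000 - 900 = 100 pages are rented out, so freeslot = 124.
+        */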
+
+       op_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       og_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+       np_idx = npools - 1;
+       ng_idx = (npages - 1) % PAGES_PER_POOL;
+
+       while (freeslot) {
+               LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL);
+               LASSERT(pools[np_idx][ng_idx] != NULL);
+
+               page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx];
+               pools[np_idx][ng_idx] = NULL;
+
+               freeslot--;
+
+               if (++og_idx == PAGES_PER_POOL) {
+                       op_idx++;
+                       og_idx = 0;
+               }
+               if (--ng_idx < 0) {
+                       if (np_idx == 0)
+                               break;
+                       np_idx--;
+                       ng_idx = PAGES_PER_POOL - 1;
+               }
+       }
+
+       /*
+        * (2) add pools if needed.
+        */
+       cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) /
+                    PAGES_PER_POOL;
+       end_npools = (page_pools.epp_total_pages + npages +
+                     PAGES_PER_POOL - 1) / PAGES_PER_POOL;
+       LASSERT(end_npools <= page_pools.epp_max_pools);
+
+       np_idx = 0;
+       while (cur_npools < end_npools) {
+               LASSERT(page_pools.epp_pools[cur_npools] == NULL);
+               LASSERT(np_idx < npools);
+               LASSERT(pools[np_idx] != NULL);
+
+               page_pools.epp_pools[cur_npools++] = pools[np_idx];
+               pools[np_idx++] = NULL;
+       }
+
+       page_pools.epp_total_pages += npages;
+       page_pools.epp_free_pages += npages;
+       page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+       if (page_pools.epp_total_pages > page_pools.epp_st_max_pages)
+               page_pools.epp_st_max_pages = page_pools.epp_total_pages;
+
+       CDEBUG(D_SEC, "add %d pages to total %lu\n", npages,
+              page_pools.epp_total_pages);
+
+       spin_unlock(&page_pools.epp_lock);
+}
+
+static int enc_pools_add_pages(int npages)
+{
+       static DEFINE_MUTEX(add_pages_mutex);
+       struct page   ***pools;
+       int          npools, alloced = 0;
+       int          i, j, rc = -ENOMEM;
+
+       if (npages < PTLRPC_MAX_BRW_PAGES)
+               npages = PTLRPC_MAX_BRW_PAGES;
+
+       mutex_lock(&add_pages_mutex);
+
+       if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages)
+               npages = page_pools.epp_max_pages - page_pools.epp_total_pages;
+       LASSERT(npages > 0);
+
+       page_pools.epp_st_grows++;
+
+       npools = npages_to_npools(npages);
+       OBD_ALLOC(pools, npools * sizeof(*pools));
+       if (pools == NULL)
+               goto out;
+
+       for (i = 0; i < npools; i++) {
+               OBD_ALLOC(pools[i], PAGE_CACHE_SIZE);
+               if (pools[i] == NULL)
+                       goto out_pools;
+
+               for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) {
+                       pools[i][j] = alloc_page(__GFP_IO | __GFP_HIGHMEM);
+                       if (pools[i][j] == NULL)
+                               goto out_pools;
+
+                       alloced++;
+               }
+       }
+       LASSERT(alloced == npages);
+
+       enc_pools_insert(pools, npools, npages);
+       CDEBUG(D_SEC, "added %d pages into pools\n", npages);
+       rc = 0;
+
+out_pools:
+       enc_pools_cleanup(pools, npools);
+       OBD_FREE(pools, npools * sizeof(*pools));
+out:
+       if (rc) {
+               page_pools.epp_st_grow_fails++;
+               CERROR("Failed to allocate %d enc pages\n", npages);
+       }
+
+       mutex_unlock(&add_pages_mutex);
+       return rc;
+}
+
+static inline void enc_pools_wakeup(void)
+{
+       LASSERT(spin_is_locked(&page_pools.epp_lock));
+       LASSERT(page_pools.epp_waitqlen >= 0);
+
+       if (unlikely(page_pools.epp_waitqlen)) {
+               LASSERT(waitqueue_active(&page_pools.epp_waitq));
+               wake_up_all(&page_pools.epp_waitq);
+       }
+}
+
+static int enc_pools_should_grow(int page_needed, long now)
+{
+       /* don't grow if someone else is growing the pools right now,
+        * or the pools have already reached full capacity
+        */
+       if (page_pools.epp_growing ||
+           page_pools.epp_total_pages == page_pools.epp_max_pages)
+               return 0;
+
+       /* if total pages is not enough, we need to grow */
+       if (page_pools.epp_total_pages < page_needed)
+               return 1;
+
+       /*
+        * we wanted to return 0 here if a shrink had happened just a
+        * moment ago, but that may cause a deadlock if both client and
+        * ost live on a single node.
+        */
+#if 0
+       if (now - page_pools.epp_last_shrink < 2)
+               return 0;
+#endif
+
+       /*
+        * perhaps we should also consider other factors here, like the
+        * wait queue length, the idle index, etc.?
+        */
+
+       /* grow the pools in any other cases */
+       return 1;
+}
+
+/*
+ * we allocate the requested pages atomically.
+ */
+int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc)
+{
+       wait_queue_t  waitlink;
+       unsigned long   this_idle = -1;
+       cfs_time_t      tick = 0;
+       long        now;
+       int          p_idx, g_idx;
+       int          i;
+
+       LASSERT(desc->bd_iov_count > 0);
+       LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages);
+
+       /* resent bulk, enc iov might have been allocated previously */
+       if (desc->bd_enc_iov != NULL)
+               return 0;
+
+       OBD_ALLOC(desc->bd_enc_iov,
+                 desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+       if (desc->bd_enc_iov == NULL)
+               return -ENOMEM;
+
+       spin_lock(&page_pools.epp_lock);
+
+       page_pools.epp_st_access++;
+again:
+       if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) {
+               if (tick == 0)
+                       tick = cfs_time_current();
+
+               now = cfs_time_current_sec();
+
+               page_pools.epp_st_missings++;
+               page_pools.epp_pages_short += desc->bd_iov_count;
+
+               if (enc_pools_should_grow(desc->bd_iov_count, now)) {
+                       page_pools.epp_growing = 1;
+
+                       spin_unlock(&page_pools.epp_lock);
+                       enc_pools_add_pages(page_pools.epp_pages_short / 2);
+                       spin_lock(&page_pools.epp_lock);
+
+                       page_pools.epp_growing = 0;
+
+                       enc_pools_wakeup();
+               } else {
+                       if (++page_pools.epp_waitqlen >
+                           page_pools.epp_st_max_wqlen)
+                               page_pools.epp_st_max_wqlen =
+                                               page_pools.epp_waitqlen;
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       init_waitqueue_entry_current(&waitlink);
+                       add_wait_queue(&page_pools.epp_waitq, &waitlink);
+
+                       spin_unlock(&page_pools.epp_lock);
+                       waitq_wait(&waitlink, TASK_UNINTERRUPTIBLE);
+                       remove_wait_queue(&page_pools.epp_waitq, &waitlink);
+                       LASSERT(page_pools.epp_waitqlen > 0);
+                       spin_lock(&page_pools.epp_lock);
+                       page_pools.epp_waitqlen--;
+               }
+
+               LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count);
+               page_pools.epp_pages_short -= desc->bd_iov_count;
+
+               this_idle = 0;
+               goto again;
+       }
+
+       /* record max wait time */
+       if (unlikely(tick != 0)) {
+               tick = cfs_time_current() - tick;
+               if (tick > page_pools.epp_st_max_wait)
+                       page_pools.epp_st_max_wait = tick;
+       }
+
+       /* proceed with rest of allocation */
+       page_pools.epp_free_pages -= desc->bd_iov_count;
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL);
+               desc->bd_enc_iov[i].kiov_page =
+                                       page_pools.epp_pools[p_idx][g_idx];
+               page_pools.epp_pools[p_idx][g_idx] = NULL;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       if (page_pools.epp_free_pages < page_pools.epp_st_lowfree)
+               page_pools.epp_st_lowfree = page_pools.epp_free_pages;
+
+       /*
+        * new idle index = (old * weight + new) / (weight + 1)
+        */
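+       /*
+        * e.g. with IDLE_IDX_WEIGHT = 3, an old index of 40 and a new
+        * sample of 100, the new index is (40 * 3 + 100) / 4 = 55.
+        */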
+       if (this_idle == -1) {
+               this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX /
+                           page_pools.epp_total_pages;
+       }
+       page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT +
+                                  this_idle) /
+                                 (IDLE_IDX_WEIGHT + 1);
+
+       page_pools.epp_last_access = cfs_time_current_sec();
+
+       spin_unlock(&page_pools.epp_lock);
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages);
+
+void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc)
+{
+       int     p_idx, g_idx;
+       int     i;
+
+       if (desc->bd_enc_iov == NULL)
+               return;
+
+       LASSERT(desc->bd_iov_count > 0);
+
+       spin_lock(&page_pools.epp_lock);
+
+       p_idx = page_pools.epp_free_pages / PAGES_PER_POOL;
+       g_idx = page_pools.epp_free_pages % PAGES_PER_POOL;
+
+       LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <=
+               page_pools.epp_total_pages);
+       LASSERT(page_pools.epp_pools[p_idx]);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               LASSERT(desc->bd_enc_iov[i].kiov_page != NULL);
+               LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]);
+               LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL);
+
+               page_pools.epp_pools[p_idx][g_idx] =
+                                       desc->bd_enc_iov[i].kiov_page;
+
+               if (++g_idx == PAGES_PER_POOL) {
+                       p_idx++;
+                       g_idx = 0;
+               }
+       }
+
+       page_pools.epp_free_pages += desc->bd_iov_count;
+
+       enc_pools_wakeup();
+
+       spin_unlock(&page_pools.epp_lock);
+
+       OBD_FREE(desc->bd_enc_iov,
+                desc->bd_iov_count * sizeof(*desc->bd_enc_iov));
+       desc->bd_enc_iov = NULL;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages);
+
+/*
+ * we don't do much for add_user/del_user anymore, except adding some
+ * initial pages in add_user() if the current pools are empty; the rest
+ * is handled by the pools' self-adaptation.
+ */
+int sptlrpc_enc_pool_add_user(void)
+{
+       int     need_grow = 0;
+
+       spin_lock(&page_pools.epp_lock);
+       if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) {
+               page_pools.epp_growing = 1;
+               need_grow = 1;
+       }
+       spin_unlock(&page_pools.epp_lock);
+
+       if (need_grow) {
+               enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES +
+                                   PTLRPC_MAX_BRW_PAGES);
+
+               spin_lock(&page_pools.epp_lock);
+               page_pools.epp_growing = 0;
+               enc_pools_wakeup();
+               spin_unlock(&page_pools.epp_lock);
+       }
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_add_user);
+
+int sptlrpc_enc_pool_del_user(void)
+{
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_enc_pool_del_user);
+
+static inline void enc_pools_alloc(void)
+{
+       LASSERT(page_pools.epp_max_pools);
+       OBD_ALLOC_LARGE(page_pools.epp_pools,
+                       page_pools.epp_max_pools *
+                       sizeof(*page_pools.epp_pools));
+}
+
+static inline void enc_pools_free(void)
+{
+       LASSERT(page_pools.epp_max_pools);
+       LASSERT(page_pools.epp_pools);
+
+       OBD_FREE_LARGE(page_pools.epp_pools,
+                      page_pools.epp_max_pools *
+                      sizeof(*page_pools.epp_pools));
+}
+
+int sptlrpc_enc_pool_init(void)
+{
+       /*
+        * maximum capacity is 1/8 of total physical memory.
+        * is 1/8 a good ratio?
+        */
+       page_pools.epp_max_pages = num_physpages / 8;
+       page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages);
+
+       init_waitqueue_head(&page_pools.epp_waitq);
+       page_pools.epp_waitqlen = 0;
+       page_pools.epp_pages_short = 0;
+
+       page_pools.epp_growing = 0;
+
+       page_pools.epp_idle_idx = 0;
+       page_pools.epp_last_shrink = cfs_time_current_sec();
+       page_pools.epp_last_access = cfs_time_current_sec();
+
+       spin_lock_init(&page_pools.epp_lock);
+       page_pools.epp_total_pages = 0;
+       page_pools.epp_free_pages = 0;
+
+       page_pools.epp_st_max_pages = 0;
+       page_pools.epp_st_grows = 0;
+       page_pools.epp_st_grow_fails = 0;
+       page_pools.epp_st_shrinks = 0;
+       page_pools.epp_st_access = 0;
+       page_pools.epp_st_missings = 0;
+       page_pools.epp_st_lowfree = 0;
+       page_pools.epp_st_max_wqlen = 0;
+       page_pools.epp_st_max_wait = 0;
+
+       enc_pools_alloc();
+       if (page_pools.epp_pools == NULL)
+               return -ENOMEM;
+
+       pools_shrinker = set_shrinker(pools_shrinker_seeks, enc_pools_shrink);
+       if (pools_shrinker == NULL) {
+               enc_pools_free();
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+void sptlrpc_enc_pool_fini(void)
+{
+       unsigned long cleaned, npools;
+
+       LASSERT(pools_shrinker);
+       LASSERT(page_pools.epp_pools);
+       LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages);
+
+       remove_shrinker(pools_shrinker);
+
+       npools = npages_to_npools(page_pools.epp_total_pages);
+       cleaned = enc_pools_cleanup(page_pools.epp_pools, npools);
+       LASSERT(cleaned == page_pools.epp_total_pages);
+
+       enc_pools_free();
+
+       if (page_pools.epp_st_access > 0) {
+               CDEBUG(D_SEC,
+                      "max pages %lu, grows %u, grow fails %u, shrinks %u, "
+                      "access %lu, missing %lu, max qlen %u, max wait "
+                      CFS_TIME_T"/%d\n",
+                      page_pools.epp_st_max_pages, page_pools.epp_st_grows,
+                      page_pools.epp_st_grow_fails,
+                      page_pools.epp_st_shrinks, page_pools.epp_st_access,
+                      page_pools.epp_st_missings, page_pools.epp_st_max_wqlen,
+                      page_pools.epp_st_max_wait, HZ);
+       }
+}
+
+
+static int cfs_hash_alg_id[] = {
+       [BULK_HASH_ALG_NULL]    = CFS_HASH_ALG_NULL,
+       [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32,
+       [BULK_HASH_ALG_CRC32]   = CFS_HASH_ALG_CRC32,
+       [BULK_HASH_ALG_MD5]     = CFS_HASH_ALG_MD5,
+       [BULK_HASH_ALG_SHA1]    = CFS_HASH_ALG_SHA1,
+       [BULK_HASH_ALG_SHA256]  = CFS_HASH_ALG_SHA256,
+       [BULK_HASH_ALG_SHA384]  = CFS_HASH_ALG_SHA384,
+       [BULK_HASH_ALG_SHA512]  = CFS_HASH_ALG_SHA512,
+};
+
+const char *sptlrpc_get_hash_name(__u8 hash_alg)
+{
+       return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_name);
+
+__u8 sptlrpc_get_hash_alg(const char *algname)
+{
+       return cfs_crypto_hash_alg(algname);
+}
+EXPORT_SYMBOL(sptlrpc_get_hash_alg);
+
+int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+       int                       size = msg->lm_buflens[offset];
+
+       bsd = lustre_msg_buf(msg, offset, sizeof(*bsd));
+       if (bsd == NULL) {
+               CERROR("Invalid bulk sec desc: size %d\n", size);
+               return -EINVAL;
+       }
+
+       if (swabbed)
+               __swab32s(&bsd->bsd_nob);
+
+       if (unlikely(bsd->bsd_version != 0)) {
+               CERROR("Unexpected version %u\n", bsd->bsd_version);
+               return -EPROTO;
+       }
+
+       if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) {
+               CERROR("Invalid type %u\n", bsd->bsd_type);
+               return -EPROTO;
+       }
+
+       /* FIXME more sanity check here */
+
+       if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+                    bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG &&
+                    bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) {
+               CERROR("Invalid svc %u\n", bsd->bsd_svc);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(bulk_sec_desc_unpack);
+
+int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg,
+                             void *buf, int buflen)
+{
+       struct cfs_crypto_hash_desc     *hdesc;
+       int                             hashsize;
+       char                            hashbuf[64];
+       unsigned int                    bufsize;
+       int                             i, err;
+
+       LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX);
+       LASSERT(buflen >= 4);
+
+       hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0);
+       if (IS_ERR(hdesc)) {
+               CERROR("Unable to initialize checksum hash %s\n",
+                      cfs_crypto_hash_name(cfs_hash_alg_id[alg]));
+               return PTR_ERR(hdesc);
+       }
+
+       hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]);
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
+                                 desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
+                                 desc->bd_iov[i].kiov_len);
+       }
+       if (hashsize > buflen) {
+               bufsize = sizeof(hashbuf);
+               err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf,
+                                           &bufsize);
+               memcpy(buf, hashbuf, buflen);
+       } else {
+               bufsize = buflen;
+               err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf,
+                                           &bufsize);
+       }
+
+       if (err)
+               cfs_crypto_hash_final(hdesc, NULL, NULL);
+       return err;
+}
+EXPORT_SYMBOL(sptlrpc_get_bulk_checksum);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c
new file mode 100644 (file)
index 0000000..a45a392
--- /dev/null
@@ -0,0 +1,1233 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_dlm.h>
+#include <lustre_param.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+const char *sptlrpc_part2name(enum lustre_sec_part part)
+{
+       switch (part) {
+       case LUSTRE_SP_CLI:
+               return "cli";
+       case LUSTRE_SP_MDT:
+               return "mdt";
+       case LUSTRE_SP_OST:
+               return "ost";
+       case LUSTRE_SP_MGC:
+               return "mgc";
+       case LUSTRE_SP_MGS:
+               return "mgs";
+       case LUSTRE_SP_ANY:
+               return "any";
+       default:
+               return "err";
+       }
+}
+EXPORT_SYMBOL(sptlrpc_part2name);
+
+enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd)
+{
+       const char *type = obd->obd_type->typ_name;
+
+       if (!strcmp(type, LUSTRE_MDT_NAME))
+               return LUSTRE_SP_MDT;
+       if (!strcmp(type, LUSTRE_OST_NAME))
+               return LUSTRE_SP_OST;
+       if (!strcmp(type, LUSTRE_MGS_NAME))
+               return LUSTRE_SP_MGS;
+
+       CERROR("unknown target %p(%s)\n", obd, type);
+       return LUSTRE_SP_ANY;
+}
+EXPORT_SYMBOL(sptlrpc_target_sec_part);
+
+/****************************************
+ * user supplied flavor string parsing  *
+ ****************************************/
+
+/*
+ * format: <base_flavor>[-<bulk_type:alg_spec>]
+ */
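+/*
+ * e.g. "null", "plain", or "plain-hash:sha1" (algorithm names as
+ * understood by sptlrpc_get_hash_alg()). per the parsing below, only
+ * the "plain" base flavor currently accepts a bulk specification, and
+ * the only recognized bulk_type is "hash".
+ */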
+int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr)
+{
+       char        buf[32];
+       char       *bulk, *alg;
+
+       memset(flvr, 0, sizeof(*flvr));
+
+       if (str == NULL || str[0] == '\0') {
+               flvr->sf_rpc = SPTLRPC_FLVR_INVALID;
+               return 0;
+       }
+
+       strncpy(buf, str, sizeof(buf));
+       buf[sizeof(buf) - 1] = '\0';
+
+       bulk = strchr(buf, '-');
+       if (bulk)
+               *bulk++ = '\0';
+
+       flvr->sf_rpc = sptlrpc_name2flavor_base(buf);
+       if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID)
+               goto err_out;
+
+       /*
+        * currently only base flavor "plain" can have bulk specification.
+        */
+       if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) {
+               flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32;
+               if (bulk) {
+                       /*
+                        * format: plain-hash:<hash_alg>
+                        */
+                       alg = strchr(bulk, ':');
+                       if (alg == NULL)
+                               goto err_out;
+                       *alg++ = '\0';
+
+                       if (strcmp(bulk, "hash"))
+                               goto err_out;
+
+                       flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg);
+                       if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX)
+                               goto err_out;
+               }
+
+               if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL)
+                       flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL);
+               else
+                       flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG);
+       } else {
+               if (bulk)
+                       goto err_out;
+       }
+
+       flvr->sf_flags = 0;
+       return 0;
+
+err_out:
+       CERROR("invalid flavor string: %s\n", str);
+       return -EINVAL;
+}
+EXPORT_SYMBOL(sptlrpc_parse_flavor);
+
+/****************************************
+ * configuration rules                  *
+ ****************************************/
+
+static void get_default_flavor(struct sptlrpc_flavor *sf)
+{
+       memset(sf, 0, sizeof(*sf));
+
+       sf->sf_rpc = SPTLRPC_FLVR_NULL;
+       sf->sf_flags = 0;
+}
+
+static void sptlrpc_rule_init(struct sptlrpc_rule *rule)
+{
+       rule->sr_netid = LNET_NIDNET(LNET_NID_ANY);
+       rule->sr_from = LUSTRE_SP_ANY;
+       rule->sr_to = LUSTRE_SP_ANY;
+       rule->sr_padding = 0;
+
+       get_default_flavor(&rule->sr_flvr);
+}
+
+/*
+ * format: network[.direction]=flavor
+ */
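+/*
+ * e.g. "default=plain" applies to all networks and directions, while
+ * something like "tcp0.cli2mdt=null" (network names as understood by
+ * libcfs_str2net()) restricts the rule to one network and direction.
+ */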
+int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule)
+{
+       char       *flavor, *dir;
+       int          rc;
+
+       sptlrpc_rule_init(rule);
+
+       flavor = strchr(param, '=');
+       if (flavor == NULL) {
+               CERROR("invalid param, no '='\n");
+               RETURN(-EINVAL);
+       }
+       *flavor++ = '\0';
+
+       dir = strchr(param, '.');
+       if (dir)
+               *dir++ = '\0';
+
+       /* 1.1 network */
+       if (strcmp(param, "default")) {
+               rule->sr_netid = libcfs_str2net(param);
+               if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) {
+                       CERROR("invalid network name: %s\n", param);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       /* 1.2 direction */
+       if (dir) {
+               if (!strcmp(dir, "mdt2ost")) {
+                       rule->sr_from = LUSTRE_SP_MDT;
+                       rule->sr_to = LUSTRE_SP_OST;
+               } else if (!strcmp(dir, "mdt2mdt")) {
+                       rule->sr_from = LUSTRE_SP_MDT;
+                       rule->sr_to = LUSTRE_SP_MDT;
+               } else if (!strcmp(dir, "cli2ost")) {
+                       rule->sr_from = LUSTRE_SP_CLI;
+                       rule->sr_to = LUSTRE_SP_OST;
+               } else if (!strcmp(dir, "cli2mdt")) {
+                       rule->sr_from = LUSTRE_SP_CLI;
+                       rule->sr_to = LUSTRE_SP_MDT;
+               } else {
+                       CERROR("invalid rule dir segment: %s\n", dir);
+                       RETURN(-EINVAL);
+               }
+       }
+
+       /* 2.1 flavor */
+       rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr);
+       if (rc)
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(sptlrpc_parse_rule);
+
+void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset)
+{
+       LASSERT(rset->srs_nslot ||
+               (rset->srs_nrule == 0 && rset->srs_rules == NULL));
+
+       if (rset->srs_nslot) {
+               OBD_FREE(rset->srs_rules,
+                        rset->srs_nslot * sizeof(*rset->srs_rules));
+               sptlrpc_rule_set_init(rset);
+       }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_free);
+
+/*
+ * return 0 if the rule set can accommodate one more rule.
+ */
+int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule *rules;
+       int nslot;
+
+       might_sleep();
+
+       if (rset->srs_nrule < rset->srs_nslot)
+               return 0;
+
+       nslot = rset->srs_nslot + 8;
+
+       /* better use realloc() if available */
+       OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules));
+       if (rules == NULL)
+               return -ENOMEM;
+
+       if (rset->srs_nrule) {
+               LASSERT(rset->srs_nslot && rset->srs_rules);
+               memcpy(rules, rset->srs_rules,
+                      rset->srs_nrule * sizeof(*rset->srs_rules));
+
+               OBD_FREE(rset->srs_rules,
+                        rset->srs_nslot * sizeof(*rset->srs_rules));
+       }
+
+       rset->srs_rules = rules;
+       rset->srs_nslot = nslot;
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_expand);
+
+static inline int rule_spec_dir(struct sptlrpc_rule *rule)
+{
+       return (rule->sr_from != LUSTRE_SP_ANY ||
+               rule->sr_to != LUSTRE_SP_ANY);
+}
+static inline int rule_spec_net(struct sptlrpc_rule *rule)
+{
+       return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY));
+}
+static inline int rule_match_dir(struct sptlrpc_rule *r1,
+                                struct sptlrpc_rule *r2)
+{
+       return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to);
+}
+static inline int rule_match_net(struct sptlrpc_rule *r1,
+                                struct sptlrpc_rule *r2)
+{
+       return (r1->sr_netid == r2->sr_netid);
+}
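+
+/*
+ * note: the merge logic below appears to keep the rule array ordered
+ * from most to least specific, e.g. <net,dir> rules ahead of <net,any>,
+ * and net-specific rules ahead of wildcard-net ones, so that the first
+ * match found by sptlrpc_rule_set_choose() is the most specific one.
+ */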
+
+/*
+ * merge @rule into @rset.
+ * the @rset slots might be expanded.
+ */
+int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset,
+                          struct sptlrpc_rule *rule)
+{
+       struct sptlrpc_rule      *p = rset->srs_rules;
+       int                    spec_dir, spec_net;
+       int                    rc, n, match = 0;
+
+       might_sleep();
+
+       spec_net = rule_spec_net(rule);
+       spec_dir = rule_spec_dir(rule);
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               p = &rset->srs_rules[n];
+
+               /* test network match; if it fails:
+                * - spec rule: skip other spec rules that also fail to
+                *   match, until we hit a wild rule, which means no more
+                *   chance of matching
+                * - wild rule: skip until we reach the one which is also
+                *   wild and matches
+                */
+               if (!rule_match_net(p, rule)) {
+                       if (spec_net) {
+                               if (rule_spec_net(p))
+                                       continue;
+                               else
+                                       break;
+                       } else {
+                               continue;
+                       }
+               }
+
+               /* test dir match, same logic as net matching */
+               if (!rule_match_dir(p, rule)) {
+                       if (spec_dir) {
+                               if (rule_spec_dir(p))
+                                       continue;
+                               else
+                                       break;
+                       } else {
+                               continue;
+                       }
+               }
+
+               /* found a match */
+               match = 1;
+               break;
+       }
+
+       if (match) {
+               LASSERT(n >= 0 && n < rset->srs_nrule);
+
+               if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+                       /* remove this rule */
+                       if (n < rset->srs_nrule - 1)
+                               memmove(&rset->srs_rules[n],
+                                       &rset->srs_rules[n + 1],
+                                       (rset->srs_nrule - n - 1) *
+                                       sizeof(*rule));
+                       rset->srs_nrule--;
+               } else {
+                       /* override the rule */
+                       memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+               }
+       } else {
+               LASSERT(n >= 0 && n <= rset->srs_nrule);
+
+               if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) {
+                       rc = sptlrpc_rule_set_expand(rset);
+                       if (rc)
+                               return rc;
+
+                       if (n < rset->srs_nrule)
+                               memmove(&rset->srs_rules[n + 1],
+                                       &rset->srs_rules[n],
+                                       (rset->srs_nrule - n) * sizeof(*rule));
+                       memcpy(&rset->srs_rules[n], rule, sizeof(*rule));
+                       rset->srs_nrule++;
+               } else {
+                       CDEBUG(D_CONFIG, "ignore the unmatched deletion\n");
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_merge);
+
+/**
+ * given from/to/nid, determine a matching flavor in the ruleset.
+ * return 1 if a match is found, otherwise return 0.
+ */
+int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset,
+                           enum lustre_sec_part from,
+                           enum lustre_sec_part to,
+                           lnet_nid_t nid,
+                           struct sptlrpc_flavor *sf)
+{
+       struct sptlrpc_rule    *r;
+       int                  n;
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               r = &rset->srs_rules[n];
+
+               if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) &&
+                   r->sr_netid != LNET_NIDNET(LNET_NID_ANY) &&
+                   LNET_NIDNET(nid) != r->sr_netid)
+                       continue;
+
+               if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY &&
+                   from != r->sr_from)
+                       continue;
+
+               if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY &&
+                   to != r->sr_to)
+                       continue;
+
+               *sf = r->sr_flvr;
+               return 1;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_choose);
+
+void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule *r;
+       int     n;
+
+       for (n = 0; n < rset->srs_nrule; n++) {
+               r = &rset->srs_rules[n];
+               CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n,
+                      r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc);
+       }
+}
+EXPORT_SYMBOL(sptlrpc_rule_set_dump);
+
+static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen,
+                                   struct sptlrpc_rule_set *tgt,
+                                   enum lustre_sec_part from,
+                                   enum lustre_sec_part to,
+                                   struct sptlrpc_rule_set *rset)
+{
+       struct sptlrpc_rule_set *src[2] = { gen, tgt };
+       struct sptlrpc_rule     *rule;
+       int                   i, n, rc;
+
+       might_sleep();
+
+       /* merge general rules first, then target-specific rules */
+       for (i = 0; i < 2; i++) {
+               if (src[i] == NULL)
+                       continue;
+
+               for (n = 0; n < src[i]->srs_nrule; n++) {
+                       rule = &src[i]->srs_rules[n];
+
+                       if (from != LUSTRE_SP_ANY &&
+                           rule->sr_from != LUSTRE_SP_ANY &&
+                           rule->sr_from != from)
+                               continue;
+                       if (to != LUSTRE_SP_ANY &&
+                           rule->sr_to != LUSTRE_SP_ANY &&
+                           rule->sr_to != to)
+                               continue;
+
+                       rc = sptlrpc_rule_set_merge(rset, rule);
+                       if (rc) {
+                               CERROR("can't merge: %d\n", rc);
+                               return rc;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/**********************************
+ * sptlrpc configuration support  *
+ **********************************/
+
+struct sptlrpc_conf_tgt {
+       struct list_head              sct_list;
+       char                sct_name[MAX_OBD_NAME];
+       struct sptlrpc_rule_set sct_rset;
+};
+
+struct sptlrpc_conf {
+       struct list_head              sc_list;
+       char                sc_fsname[MTI_NAME_MAXLEN];
+       unsigned int        sc_modified;  /* modified during updating */
+       unsigned int        sc_updated:1, /* updated copy from MGS */
+                               sc_local:1;   /* local copy from target */
+       struct sptlrpc_rule_set sc_rset;      /* fs general rules */
+       struct list_head              sc_tgts;      /* target-specific rules */
+};
+
+static struct mutex sptlrpc_conf_lock;
+static LIST_HEAD(sptlrpc_confs);
+
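+/* note: only lowercase hex digits are accepted here, on the assumption
+ * that target index suffixes are always generated as lowercase hex. */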
+static inline int is_hex(char c)
+{
+       return ((c >= '0' && c <= '9') ||
+               (c >= 'a' && c <= 'f'));
+}
+
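+/*
+ * e.g. "lustre-OST0001" yields fsname "lustre"; a name without a valid
+ * -MDTxxxx/-OSTxxxx suffix is treated wholly as the fsname.
+ */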
+static void target2fsname(const char *tgt, char *fsname, int buflen)
+{
+       const char     *ptr;
+       int          len;
+
+       ptr = strrchr(tgt, '-');
+       if (ptr) {
+               if ((strncmp(ptr, "-MDT", 4) != 0 &&
+                    strncmp(ptr, "-OST", 4) != 0) ||
+                   !is_hex(ptr[4]) || !is_hex(ptr[5]) ||
+                   !is_hex(ptr[6]) || !is_hex(ptr[7]))
+                       ptr = NULL;
+       }
+
+       /* if we didn't find the pattern, treat the whole string as fsname */
+       if (ptr == NULL)
+               len = strlen(tgt);
+       else
+               len = ptr - tgt;
+
+       len = min(len, buflen - 1);
+       memcpy(fsname, tgt, len);
+       fsname[len] = '\0';
+}
+
+static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf)
+{
+       struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next;
+
+       sptlrpc_rule_set_free(&conf->sc_rset);
+
+       list_for_each_entry_safe(conf_tgt, conf_tgt_next,
+                                    &conf->sc_tgts, sct_list) {
+               sptlrpc_rule_set_free(&conf_tgt->sct_rset);
+               list_del(&conf_tgt->sct_list);
+               OBD_FREE_PTR(conf_tgt);
+       }
+       LASSERT(list_empty(&conf->sc_tgts));
+
+       conf->sc_updated = 0;
+       conf->sc_local = 0;
+}
+
+static void sptlrpc_conf_free(struct sptlrpc_conf *conf)
+{
+       CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname);
+
+       sptlrpc_conf_free_rsets(conf);
+       list_del(&conf->sc_list);
+       OBD_FREE_PTR(conf);
+}
+
+static
+struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf,
+                                             const char *name,
+                                             int create)
+{
+       struct sptlrpc_conf_tgt *conf_tgt;
+
+       list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+               if (strcmp(conf_tgt->sct_name, name) == 0)
+                       return conf_tgt;
+       }
+
+       if (!create)
+               return NULL;
+
+       OBD_ALLOC_PTR(conf_tgt);
+       if (conf_tgt) {
+               strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name));
+               sptlrpc_rule_set_init(&conf_tgt->sct_rset);
+               list_add(&conf_tgt->sct_list, &conf->sc_tgts);
+       }
+
+       return conf_tgt;
+}
+
+static
+struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname,
+                                     int create)
+{
+       struct sptlrpc_conf *conf;
+
+       list_for_each_entry(conf, &sptlrpc_confs, sc_list) {
+               if (strcmp(conf->sc_fsname, fsname) == 0)
+                       return conf;
+       }
+
+       if (!create)
+               return NULL;
+
+       OBD_ALLOC_PTR(conf);
+       if (conf == NULL)
+               return NULL;
+
+       strcpy(conf->sc_fsname, fsname);
+       sptlrpc_rule_set_init(&conf->sc_rset);
+       INIT_LIST_HEAD(&conf->sc_tgts);
+       list_add(&conf->sc_list, &sptlrpc_confs);
+
+       CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname);
+       return conf;
+}
+
+/**
+ * caller must hold conf_lock already.
+ */
+static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf,
+                                  const char *target,
+                                  struct sptlrpc_rule *rule)
+{
+       struct sptlrpc_conf_tgt  *conf_tgt;
+       struct sptlrpc_rule_set  *rule_set;
+
+       /* fsname == target means general rules for the whole fs */
+       if (strcmp(conf->sc_fsname, target) == 0) {
+               rule_set = &conf->sc_rset;
+       } else {
+               conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1);
+               if (conf_tgt) {
+                       rule_set = &conf_tgt->sct_rset;
+               } else {
+                       CERROR("out of memory, can't merge rule!\n");
+                       return -ENOMEM;
+               }
+       }
+
+       return sptlrpc_rule_set_merge(rule_set, rule);
+}
+
+/**
+ * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we find
+ * one through the target name in the record and take conf_lock
+ * ourselves; otherwise the caller must already hold conf_lock.
+ */
+static int __sptlrpc_process_config(struct lustre_cfg *lcfg,
+                                   struct sptlrpc_conf *conf)
+{
+       char               *target, *param;
+       char                fsname[MTI_NAME_MAXLEN];
+       struct sptlrpc_rule     rule;
+       int                  rc;
+       ENTRY;
+
+       target = lustre_cfg_string(lcfg, 1);
+       if (target == NULL) {
+               CERROR("missing target name\n");
+               RETURN(-EINVAL);
+       }
+
+       param = lustre_cfg_string(lcfg, 2);
+       if (param == NULL) {
+               CERROR("missing parameter\n");
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param);
+
+       /* parse rule to make sure the format is correct */
+       if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) {
+               CERROR("Invalid sptlrpc parameter: %s\n", param);
+               RETURN(-EINVAL);
+       }
+       param += sizeof(PARAM_SRPC_FLVR) - 1;
+
+       rc = sptlrpc_parse_rule(param, &rule);
+       if (rc)
+               RETURN(-EINVAL);
+
+       if (conf == NULL) {
+               target2fsname(target, fsname, sizeof(fsname));
+
+               mutex_lock(&sptlrpc_conf_lock);
+               conf = sptlrpc_conf_get(fsname, 0);
+               if (conf == NULL) {
+                       CERROR("can't find conf\n");
+                       rc = -ENOMEM;
+               } else {
+                       rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+               }
+               mutex_unlock(&sptlrpc_conf_lock);
+       } else {
+               LASSERT(mutex_is_locked(&sptlrpc_conf_lock));
+               rc = sptlrpc_conf_merge_rule(conf, target, &rule);
+       }
+
+       if (rc == 0)
+               conf->sc_modified++;
+
+       RETURN(rc);
+}
+
+int sptlrpc_process_config(struct lustre_cfg *lcfg)
+{
+       return __sptlrpc_process_config(lcfg, NULL);
+}
+EXPORT_SYMBOL(sptlrpc_process_config);
+
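+/* e.g. the config log "lustre-sptlrpc" maps to fsname "lustre"; any
+ * log name without the "-sptlrpc" suffix is rejected. */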
+static int logname2fsname(const char *logname, char *buf, int buflen)
+{
+       char   *ptr;
+       int     len;
+
+       ptr = strrchr(logname, '-');
+       if (ptr == NULL || strcmp(ptr, "-sptlrpc")) {
+               CERROR("%s is not a sptlrpc config log\n", logname);
+               return -EINVAL;
+       }
+
+       len = min((int) (ptr - logname), buflen - 1);
+
+       memcpy(buf, logname, len);
+       buf[len] = '\0';
+       return 0;
+}
+
+void sptlrpc_conf_log_update_begin(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf) {
+               if (conf->sc_local) {
+                       LASSERT(conf->sc_updated == 0);
+                       sptlrpc_conf_free_rsets(conf);
+               }
+               conf->sc_modified = 0;
+       }
+
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_begin);
+
+/**
+ * mark a config log has been updated
+ */
+void sptlrpc_conf_log_update_end(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf) {
+               /*
+                * if original state is not updated, make sure the
+                * modified counter > 0 to enforce updating local copy.
+                */
+               if (conf->sc_updated == 0)
+                       conf->sc_modified++;
+
+               conf->sc_updated = 1;
+       }
+
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_update_end);
+
+void sptlrpc_conf_log_start(const char *logname)
+{
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       sptlrpc_conf_get(fsname, 1);
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_start);
+
+void sptlrpc_conf_log_stop(const char *logname)
+{
+       struct sptlrpc_conf *conf;
+       char             fsname[16];
+
+       if (logname2fsname(logname, fsname, sizeof(fsname)))
+               return;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf)
+               sptlrpc_conf_free(conf);
+       mutex_unlock(&sptlrpc_conf_lock);
+}
+EXPORT_SYMBOL(sptlrpc_conf_log_stop);
+
+static inline void flavor_set_flags(struct sptlrpc_flavor *sf,
+                                   enum lustre_sec_part from,
+                                   enum lustre_sec_part to,
+                                   unsigned int fl_udesc)
+{
+       /*
+        * the null flavor doesn't need any flags set, and in fact we'd
+        * better not set any because everybody shares a single sec.
+        */
+       if (sf->sf_rpc == SPTLRPC_FLVR_NULL)
+               return;
+
+       if (from == LUSTRE_SP_MDT) {
+               /* MDT->MDT; MDT->OST */
+               sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY;
+       } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) {
+               /* CLI->OST */
+               sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK;
+       } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) {
+               /* CLI->MDT */
+               if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL)
+                       sf->sf_flags |= PTLRPC_SEC_FL_UDESC;
+       }
+}
+
+void sptlrpc_conf_choose_flavor(enum lustre_sec_part from,
+                               enum lustre_sec_part to,
+                               struct obd_uuid *target,
+                               lnet_nid_t nid,
+                               struct sptlrpc_flavor *sf)
+{
+       struct sptlrpc_conf     *conf;
+       struct sptlrpc_conf_tgt *conf_tgt;
+       char                 name[MTI_NAME_MAXLEN];
+       int                   len, rc = 0;
+
+       target2fsname(target->uuid, name, sizeof(name));
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(name, 0);
+       if (conf == NULL)
+               goto out;
+
+       /* convert uuid name (supposed to end with _UUID) to target name */
+       len = strlen(target->uuid);
+       LASSERT(len > 5);
+       memcpy(name, target->uuid, len - 5);
+       name[len - 5] = '\0';
+
+       conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0);
+       if (conf_tgt) {
+               rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset,
+                                            from, to, nid, sf);
+               if (rc)
+                       goto out;
+       }
+
+       rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf);
+out:
+       mutex_unlock(&sptlrpc_conf_lock);
+
+       if (rc == 0)
+               get_default_flavor(sf);
+
+       flavor_set_flags(sf, from, to, 1);
+}
+
+/**
+ * called by target devices to determine the expected flavor from
+ * a certain peer (from, nid).
+ */
+void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset,
+                                 enum lustre_sec_part from,
+                                 lnet_nid_t nid,
+                                 struct sptlrpc_flavor *sf)
+{
+       if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0)
+               get_default_flavor(sf);
+}
+EXPORT_SYMBOL(sptlrpc_target_choose_flavor);
+
+#define SEC_ADAPT_DELAY         (10)
+
+/**
+ * called by client devices to note that the sptlrpc config has changed
+ * and to trigger import_sec_adapt later.
+ */
+void sptlrpc_conf_client_adapt(struct obd_device *obd)
+{
+       struct obd_import  *imp;
+       ENTRY;
+
+       LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0);
+       CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid);
+
+       /* serialize with connect/disconnect import */
+       down_read(&obd->u.cli.cl_sem);
+
+       imp = obd->u.cli.cl_import;
+       if (imp) {
+               spin_lock(&imp->imp_lock);
+               if (imp->imp_sec)
+                       imp->imp_sec_expire = cfs_time_current_sec() +
+                               SEC_ADAPT_DELAY;
+               spin_unlock(&imp->imp_lock);
+       }
+
+       up_read(&obd->u.cli.cl_sem);
+       EXIT;
+}
+EXPORT_SYMBOL(sptlrpc_conf_client_adapt);
+
+
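+/*
+ * e.g. a fully wild rule prints as "srpc.flavor.default=plain", and a
+ * qualified one as "srpc.flavor.<net>.cli2mdt=plain"; this matches the
+ * parameter format consumed by sptlrpc_parse_rule() above.
+ */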
+static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen)
+{
+       char    dirbuf[8];
+       char   *net;
+       char   *ptr = buf;
+
+       if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY))
+               net = "default";
+       else
+               net = libcfs_net2str(r->sr_netid);
+
+       if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY)
+               dirbuf[0] = '\0';
+       else
+               snprintf(dirbuf, sizeof(dirbuf), ".%s2%s",
+                        sptlrpc_part2name(r->sr_from),
+                        sptlrpc_part2name(r->sr_to));
+
+       ptr += snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf);
+
+       sptlrpc_flavor2name(&r->sr_flvr, ptr, buflen - (ptr - buf));
+       buf[buflen - 1] = '\0';
+}
+
+static int sptlrpc_record_rule_set(struct llog_handle *llh,
+                                  char *target,
+                                  struct sptlrpc_rule_set *rset)
+{
+       struct lustre_cfg_bufs  bufs;
+       struct lustre_cfg      *lcfg;
+       struct llog_rec_hdr     rec;
+       int                  buflen;
+       char                param[48];
+       int                  i, rc;
+
+       for (i = 0; i < rset->srs_nrule; i++) {
+               rule2string(&rset->srs_rules[i], param, sizeof(param));
+
+               lustre_cfg_bufs_reset(&bufs, NULL);
+               lustre_cfg_bufs_set_string(&bufs, 1, target);
+               lustre_cfg_bufs_set_string(&bufs, 2, param);
+               lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs);
+               LASSERT(lcfg);
+
+               buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                       lcfg->lcfg_buflens);
+               rec.lrh_len = llog_data_len(buflen);
+               rec.lrh_type = OBD_CFG_REC;
+               rc = llog_write(NULL, llh, &rec, NULL, 0, (void *)lcfg, -1);
+               if (rc)
+                       CERROR("failed to write a rec: rc = %d\n", rc);
+               lustre_cfg_free(lcfg);
+       }
+       return 0;
+}
+
+static int sptlrpc_record_rules(struct llog_handle *llh,
+                               struct sptlrpc_conf *conf)
+{
+       struct sptlrpc_conf_tgt *conf_tgt;
+
+       sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset);
+
+       list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) {
+               sptlrpc_record_rule_set(llh, conf_tgt->sct_name,
+                                       &conf_tgt->sct_rset);
+       }
+       return 0;
+}
+
+#define LOG_SPTLRPC_TMP "sptlrpc.tmp"
+#define LOG_SPTLRPC     "sptlrpc"
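+
+/*
+ * The local copy is first written to LOG_SPTLRPC_TMP and then renamed
+ * over LOG_SPTLRPC, so readers never observe a partially written log.
+ */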
+
+static
+int sptlrpc_target_local_copy_conf(struct obd_device *obd,
+                                  struct sptlrpc_conf *conf)
+{
+       struct llog_handle   *llh = NULL;
+       struct llog_ctxt     *ctxt;
+       struct lvfs_run_ctxt  saved;
+       struct dentry        *dentry;
+       int                   rc;
+       ENTRY;
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt == NULL)
+               RETURN(-EINVAL);
+
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+       dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs),
+                                  strlen(MOUNT_CONFIGS_DIR));
+       if (IS_ERR(dentry)) {
+               rc = PTR_ERR(dentry);
+               CERROR("cannot lookup %s directory: rc = %d\n",
+                      MOUNT_CONFIGS_DIR, rc);
+               GOTO(out_ctx, rc);
+       }
+
+       /* erase the old tmp log */
+       rc = llog_erase(NULL, ctxt, NULL, LOG_SPTLRPC_TMP);
+       if (rc < 0 && rc != -ENOENT) {
+               CERROR("%s: cannot erase temporary sptlrpc log: rc = %d\n",
+                      obd->obd_name, rc);
+               GOTO(out_dput, rc);
+       }
+
+       /* write temporary log */
+       rc = llog_open_create(NULL, ctxt, &llh, NULL, LOG_SPTLRPC_TMP);
+       if (rc)
+               GOTO(out_dput, rc);
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
+       rc = sptlrpc_record_rules(llh, conf);
+
+out_close:
+       llog_close(NULL, llh);
+       if (rc == 0)
+               rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt,
+                                  LOG_SPTLRPC_TMP, LOG_SPTLRPC);
+out_dput:
+       l_dput(dentry);
+out_ctx:
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n",
+              obd->obd_name, rc);
+       RETURN(rc);
+}
+
+static int local_read_handler(const struct lu_env *env,
+                             struct llog_handle *llh,
+                             struct llog_rec_hdr *rec, void *data)
+{
+       struct sptlrpc_conf *conf = (struct sptlrpc_conf *)data;
+       struct lustre_cfg   *lcfg = (struct lustre_cfg *)(rec + 1);
+       int                  cfg_len, rc;
+       ENTRY;
+
+       if (rec->lrh_type != OBD_CFG_REC) {
+               CERROR("unhandled lrh_type: %#x\n", rec->lrh_type);
+               RETURN(-EINVAL);
+       }
+
+       cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
+                 sizeof(struct llog_rec_tail);
+
+       rc = lustre_cfg_sanity_check(lcfg, cfg_len);
+       if (rc) {
+               CERROR("Insane cfg\n");
+               RETURN(rc);
+       }
+
+       if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) {
+               CERROR("invalid command (%x)\n", lcfg->lcfg_command);
+               RETURN(-EINVAL);
+       }
+
+       RETURN(__sptlrpc_process_config(lcfg, conf));
+}
+
+static
+int sptlrpc_target_local_read_conf(struct obd_device *obd,
+                                  struct sptlrpc_conf *conf)
+{
+       struct llog_handle   *llh = NULL;
+       struct llog_ctxt     *ctxt;
+       struct lvfs_run_ctxt  saved;
+       int                   rc;
+       ENTRY;
+
+       LASSERT(conf->sc_updated == 0 && conf->sc_local == 0);
+
+       ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
+       if (ctxt == NULL) {
+               CERROR("missing llog context\n");
+               RETURN(-EINVAL);
+       }
+
+       push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+       rc = llog_open(NULL, ctxt, &llh, NULL, LOG_SPTLRPC, LLOG_OPEN_EXISTS);
+       if (rc < 0) {
+               if (rc == -ENOENT)
+                       rc = 0;
+               GOTO(out_pop, rc);
+       }
+
+       rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL);
+       if (rc)
+               GOTO(out_close, rc);
+
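+       /* a size of 1 means the llog contains only its header record,
+        * i.e. no rules have been written yet */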
+       if (llog_get_size(llh) <= 1) {
+               CDEBUG(D_SEC, "no local sptlrpc copy found\n");
+               GOTO(out_close, rc = 0);
+       }
+
+       rc = llog_process(NULL, llh, local_read_handler, (void *)conf, NULL);
+
+       if (rc == 0) {
+               conf->sc_local = 1;
+       } else {
+               sptlrpc_conf_free_rsets(conf);
+       }
+
+out_close:
+       llog_close(NULL, llh);
+out_pop:
+       pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+       llog_ctxt_put(ctxt);
+       CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n",
+              obd->obd_name, rc);
+       RETURN(rc);
+}
+
+
+/**
+ * Called by target devices to extract the sptlrpc rules which apply to
+ * this target, to be used for future rpc flavor checking.
+ */
+int sptlrpc_conf_target_get_rules(struct obd_device *obd,
+                                 struct sptlrpc_rule_set *rset,
+                                 int initial)
+{
+       struct sptlrpc_conf     *conf;
+       struct sptlrpc_conf_tgt *conf_tgt;
+       enum lustre_sec_part     sp_dst;
+       char                     fsname[MTI_NAME_MAXLEN];
+       int                      rc = 0;
+       ENTRY;
+
+       if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) {
+               sp_dst = LUSTRE_SP_MDT;
+       } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) {
+               sp_dst = LUSTRE_SP_OST;
+       } else {
+               CERROR("unexpected obd type %s\n", obd->obd_type->typ_name);
+               RETURN(-EINVAL);
+       }
+       CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid);
+
+       target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname));
+
+       mutex_lock(&sptlrpc_conf_lock);
+
+       conf = sptlrpc_conf_get(fsname, 0);
+       if (conf == NULL) {
+               CERROR("missing sptlrpc config log\n");
+               GOTO(out, rc);
+       }
+
+       if (conf->sc_updated == 0) {
+               /*
+                * Always read from the local copy. Another option here would
+                * be to reuse a local copy already read in by a different
+                * target device hosted on the same node.
+                */
+               if (conf->sc_local)
+                       sptlrpc_conf_free_rsets(conf);
+
+               sptlrpc_target_local_read_conf(obd, conf);
+       } else {
+               LASSERT(conf->sc_local == 0);
+
+               /* write a local copy */
+               if (initial || conf->sc_modified)
+                       sptlrpc_target_local_copy_conf(obd, conf);
+               else
+                       CDEBUG(D_SEC, "unchanged, skip updating local copy\n");
+       }
+
+       /* extract rule set for this target */
+       conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0);
+
+       rc = sptlrpc_rule_set_extract(&conf->sc_rset,
+                                     conf_tgt ? &conf_tgt->sct_rset : NULL,
+                                     LUSTRE_SP_ANY, sp_dst, rset);
+out:
+       mutex_unlock(&sptlrpc_conf_lock);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(sptlrpc_conf_target_get_rules);
+
+int  sptlrpc_conf_init(void)
+{
+       mutex_init(&sptlrpc_conf_lock);
+       return 0;
+}
+
+void sptlrpc_conf_fini(void)
+{
+       struct sptlrpc_conf  *conf, *conf_next;
+
+       mutex_lock(&sptlrpc_conf_lock);
+       list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) {
+               sptlrpc_conf_free(conf);
+       }
+       LASSERT(list_empty(&sptlrpc_confs));
+       mutex_unlock(&sptlrpc_conf_lock);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c
new file mode 100644 (file)
index 0000000..4c96a14
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_gc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
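+/* seconds between two passes of the gc thread over the sec list */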
+#define SEC_GC_INTERVAL (30 * 60)
+
+
+static struct mutex sec_gc_mutex;
+static LIST_HEAD(sec_gc_list);
+static spinlock_t sec_gc_list_lock;
+
+static LIST_HEAD(sec_gc_ctx_list);
+static spinlock_t sec_gc_ctx_list_lock;
+
+static struct ptlrpc_thread sec_gc_thread;
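+/* nonzero while sptlrpc_gc_del_sec() is waiting; tells the gc loop to
+ * restart its scan so the pending deletion can proceed promptly */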
+static atomic_t sec_gc_wait_del = ATOMIC_INIT(0);
+
+
+void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+       LASSERT(sec->ps_gc_interval > 0);
+       LASSERT(list_empty(&sec->ps_gc_list));
+
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+
+       spin_lock(&sec_gc_list_lock);
+       list_add_tail(&sec->ps_gc_list, &sec_gc_list);
+       spin_unlock(&sec_gc_list_lock);
+
+       CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_sec);
+
+void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec)
+{
+       if (list_empty(&sec->ps_gc_list))
+               return;
+
+       might_sleep();
+
+       /* signal before list_del to make iteration in gc thread safe */
+       atomic_inc(&sec_gc_wait_del);
+
+       spin_lock(&sec_gc_list_lock);
+       list_del_init(&sec->ps_gc_list);
+       spin_unlock(&sec_gc_list_lock);
+
+       /* barrier: an empty lock/unlock cycle on sec_gc_mutex ensures any
+        * gc iteration already inside the mutex has finished first */
+       mutex_lock(&sec_gc_mutex);
+       mutex_unlock(&sec_gc_mutex);
+
+       atomic_dec(&sec_gc_wait_del);
+
+       CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+}
+EXPORT_SYMBOL(sptlrpc_gc_del_sec);
+
+void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx)
+{
+       LASSERT(list_empty(&ctx->cc_gc_chain));
+
+       CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n",
+              ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+       spin_lock(&sec_gc_ctx_list_lock);
+       list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list);
+       spin_unlock(&sec_gc_ctx_list_lock);
+
+       thread_add_flags(&sec_gc_thread, SVC_SIGNAL);
+       wake_up(&sec_gc_thread.t_ctl_waitq);
+}
+EXPORT_SYMBOL(sptlrpc_gc_add_ctx);
+
+static void sec_process_ctx_list(void)
+{
+       struct ptlrpc_cli_ctx *ctx;
+
+       spin_lock(&sec_gc_ctx_list_lock);
+
+       while (!list_empty(&sec_gc_ctx_list)) {
+               ctx = list_entry(sec_gc_ctx_list.next,
+                                    struct ptlrpc_cli_ctx, cc_gc_chain);
+               list_del_init(&ctx->cc_gc_chain);
+               spin_unlock(&sec_gc_ctx_list_lock);
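+               /* the list lock is dropped across the final put: releasing
+                * the last reference destroys the context and may block */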
+
+               LASSERT(ctx->cc_sec);
+               LASSERT(atomic_read(&ctx->cc_refcount) == 1);
+               CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n",
+                      ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec));
+               sptlrpc_cli_ctx_put(ctx, 1);
+
+               spin_lock(&sec_gc_ctx_list_lock);
+       }
+
+       spin_unlock(&sec_gc_ctx_list_lock);
+}
+
+static void sec_do_gc(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec->ps_policy->sp_cops->gc_ctx);
+
+       if (unlikely(sec->ps_gc_next == 0)) {
+               CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n",
+                     sec, sec->ps_policy->sp_name);
+               return;
+       }
+
+       CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name);
+
+       if (cfs_time_after(sec->ps_gc_next, cfs_time_current_sec()))
+               return;
+
+       sec->ps_policy->sp_cops->gc_ctx(sec);
+       sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval;
+}
+
+static int sec_gc_main(void *arg)
+{
+       struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg;
+       struct l_wait_info    lwi;
+
+       unshare_fs_struct();
+
+       /* Record that the thread is running */
+       thread_set_flags(thread, SVC_RUNNING);
+       wake_up(&thread->t_ctl_waitq);
+
+       while (1) {
+               struct ptlrpc_sec *sec;
+
+               thread_clear_flags(thread, SVC_SIGNAL);
+               sec_process_ctx_list();
+again:
+               /* go through the sec list and do gc.
+                * FIXME: we iterate over the whole list each time, which is
+                * not optimal; a balanced binary tree ordered by expiry time
+                * would serve better. another issue is that we wake up at a
+                * fixed interval instead of according to each sec's expiry
+                * time */
+               mutex_lock(&sec_gc_mutex);
+               list_for_each_entry(sec, &sec_gc_list, ps_gc_list) {
+                       /* if someone is waiting to be deleted, let it
+                        * proceed as soon as possible. */
+                       if (atomic_read(&sec_gc_wait_del)) {
+                               CDEBUG(D_SEC, "deletion pending, start over\n");
+                               mutex_unlock(&sec_gc_mutex);
+                               goto again;
+                       }
+
+                       sec_do_gc(sec);
+               }
+               mutex_unlock(&sec_gc_mutex);
+
+               /* check ctx list again before sleep */
+               sec_process_ctx_list();
+
+               lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopping(thread) ||
+                            thread_is_signal(thread),
+                            &lwi);
+
+               if (thread_test_and_clear_flags(thread, SVC_STOPPING))
+                       break;
+       }
+
+       thread_set_flags(thread, SVC_STOPPED);
+       wake_up(&thread->t_ctl_waitq);
+       return 0;
+}
+
+int sptlrpc_gc_init(void)
+{
+       struct l_wait_info lwi = { 0 };
+       struct task_struct *task;
+
+       mutex_init(&sec_gc_mutex);
+       spin_lock_init(&sec_gc_list_lock);
+       spin_lock_init(&sec_gc_ctx_list_lock);
+
+       /* initialize thread control */
+       memset(&sec_gc_thread, 0, sizeof(sec_gc_thread));
+       init_waitqueue_head(&sec_gc_thread.t_ctl_waitq);
+
+       task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc");
+       if (IS_ERR(task)) {
+               CERROR("can't start gc thread: %ld\n", PTR_ERR(task));
+               return PTR_ERR(task);
+       }
+
+       l_wait_event(sec_gc_thread.t_ctl_waitq,
+                    thread_is_running(&sec_gc_thread), &lwi);
+       return 0;
+}
+
+void sptlrpc_gc_fini(void)
+{
+       struct l_wait_info lwi = { 0 };
+
+       thread_set_flags(&sec_gc_thread, SVC_STOPPING);
+       wake_up(&sec_gc_thread.t_ctl_waitq);
+
+       l_wait_event(sec_gc_thread.t_ctl_waitq,
+                    thread_is_stopped(&sec_gc_thread), &lwi);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c
new file mode 100644 (file)
index 0000000..1213621
--- /dev/null
@@ -0,0 +1,199 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_lproc.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/crypto.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_import.h>
+#include <lustre_dlm.h>
+#include <lustre_sec.h>
+
+#include "ptlrpc_internal.h"
+
+
+struct proc_dir_entry *sptlrpc_proc_root = NULL;
+EXPORT_SYMBOL(sptlrpc_proc_root);
+
+char *sec_flags2str(unsigned long flags, char *buf, int bufsize)
+{
+       buf[0] = '\0';
+
+       if (flags & PTLRPC_SEC_FL_REVERSE)
+               strlcat(buf, "reverse,", bufsize);
+       if (flags & PTLRPC_SEC_FL_ROOTONLY)
+               strlcat(buf, "rootonly,", bufsize);
+       if (flags & PTLRPC_SEC_FL_UDESC)
+               strlcat(buf, "udesc,", bufsize);
+       if (flags & PTLRPC_SEC_FL_BULK)
+               strlcat(buf, "bulk,", bufsize);
+       if (buf[0] == '\0')
+               strlcat(buf, "-,", bufsize);
+
+       return buf;
+}
+
+static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_sec *sec = NULL;
+       char               str[32];
+
+       LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+       if (cli->cl_import)
+               sec = sptlrpc_import_sec_ref(cli->cl_import);
+       if (sec == NULL)
+               goto out;
+
+
+       seq_printf(seq, "rpc flavor:    %s\n",
+                  sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc));
+       seq_printf(seq, "bulk flavor:   %s\n",
+                  sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str)));
+       seq_printf(seq, "flags:  %s\n",
+                  sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)));
+       seq_printf(seq, "id:        %d\n", sec->ps_id);
+       seq_printf(seq, "refcount:      %d\n",
+                  atomic_read(&sec->ps_refcount));
+       seq_printf(seq, "nctx:    %d\n", atomic_read(&sec->ps_nctx));
+       seq_printf(seq, "gc interval    %ld\n", sec->ps_gc_interval);
+       seq_printf(seq, "gc next        %ld\n",
+                  sec->ps_gc_interval ?
+                  sec->ps_gc_next - cfs_time_current_sec() : 0);
+
+       sptlrpc_sec_put(sec);
+out:
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs);
+
+static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v)
+{
+       struct obd_device *dev = seq->private;
+       struct client_obd *cli = &dev->u.cli;
+       struct ptlrpc_sec *sec = NULL;
+
+       LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 ||
+               strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0);
+
+       if (cli->cl_import)
+               sec = sptlrpc_import_sec_ref(cli->cl_import);
+       if (sec == NULL)
+               goto out;
+
+       if (sec->ps_policy->sp_cops->display)
+               sec->ps_policy->sp_cops->display(sec, seq);
+
+       sptlrpc_sec_put(sec);
+out:
+       return 0;
+}
+LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs);
+
+int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev)
+{
+       int     rc;
+
+       if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 &&
+           strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 &&
+           strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) {
+               CERROR("can't register lproc for obd type %s\n",
+                      dev->obd_type->typ_name);
+               return -EINVAL;
+       }
+
+       rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444,
+                                   &sptlrpc_info_lprocfs_fops, dev);
+       if (rc) {
+               CERROR("create proc entry srpc_info for %s: %d\n",
+                      dev->obd_name, rc);
+               return rc;
+       }
+
+       rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444,
+                                   &sptlrpc_ctxs_lprocfs_fops, dev);
+       if (rc) {
+               CERROR("create proc entry srpc_contexts for %s: %d\n",
+                      dev->obd_name, rc);
+               return rc;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach);
+
+LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool);
+static struct lprocfs_vars sptlrpc_lprocfs_vars[] = {
+       { "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops },
+       { NULL }
+};
+
+int sptlrpc_lproc_init(void)
+{
+       int     rc;
+
+       LASSERT(sptlrpc_proc_root == NULL);
+
+       sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root,
+                                            sptlrpc_lprocfs_vars, NULL);
+       if (IS_ERR(sptlrpc_proc_root)) {
+               rc = PTR_ERR(sptlrpc_proc_root);
+               sptlrpc_proc_root = NULL;
+               return rc;
+       }
+       return 0;
+}
+
+void sptlrpc_lproc_fini(void)
+{
+       if (sptlrpc_proc_root) {
+               lprocfs_remove(&sptlrpc_proc_root);
+               sptlrpc_proc_root = NULL;
+       }
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c
new file mode 100644 (file)
index 0000000..ff1137f
--- /dev/null
@@ -0,0 +1,464 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_null.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+static struct ptlrpc_sec_policy null_policy;
+static struct ptlrpc_sec       null_sec;
+static struct ptlrpc_cli_ctx    null_cli_ctx;
+static struct ptlrpc_svc_ctx    null_svc_ctx;
+
+/*
+ * we can temporarily use the topmost 8 bits of lm_secflvr to identify
+ * the source sec part.
+ */
+static inline
+void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp)
+{
+       msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24;
+}
+
+static inline
+enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg)
+{
+       return (msg->lm_secflvr >> 24) & 0xFF;
+}
+
+static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       /* should never reach here */
+       LBUG();
+       return 0;
+}
+
+static
+int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+
+       if (!req->rq_import->imp_dlm_fake) {
+               struct obd_device *obd = req->rq_import->imp_obd;
+               null_encode_sec_part(req->rq_reqbuf,
+                                    obd->u.cli.cl_sp_me);
+       }
+       req->rq_reqdata_len = req->rq_reqlen;
+       return 0;
+}
+
+static
+int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       __u32   cksums, cksumc;
+
+       LASSERT(req->rq_repdata);
+
+       req->rq_repmsg = req->rq_repdata;
+       req->rq_replen = req->rq_repdata_len;
+
+       if (req->rq_early) {
+               cksums = lustre_msg_get_cksum(req->rq_repdata);
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_CKSUM_INCOMPAT18)
+                       cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 0);
+               else
+                       cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+               cksumc = lustre_msg_calc_cksum(req->rq_repmsg);
+#endif
+               if (cksumc != cksums) {
+                       CDEBUG(D_SEC,
+                              "early reply checksum mismatch: %08x != %08x\n",
+                              cksumc, cksums);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+static
+struct ptlrpc_sec *null_create_sec(struct obd_import *imp,
+                                  struct ptlrpc_svc_ctx *svc_ctx,
+                                  struct sptlrpc_flavor *sf)
+{
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL);
+
+       /* the generic layer has taken a module reference for us; because we
+        * never really destroy the sec, simply release that reference here.
+        */
+       sptlrpc_policy_put(&null_policy);
+       return &null_sec;
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+       LASSERT(sec == &null_sec);
+}
+
+static
+struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec,
+                                      struct vfs_cred *vcred,
+                                      int create, int remove_dead)
+{
+       atomic_inc(&null_cli_ctx.cc_refcount);
+       return &null_cli_ctx;
+}
+
+static
+int null_flush_ctx_cache(struct ptlrpc_sec *sec,
+                        uid_t uid,
+                        int grace, int force)
+{
+       return 0;
+}
+
+static
+int null_alloc_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize)
+{
+       if (!req->rq_reqbuf) {
+               int alloc_size = size_roundup_power2(msgsize);
+
+               LASSERT(!req->rq_pool);
+               OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size);
+               if (!req->rq_reqbuf)
+                       return -ENOMEM;
+
+               req->rq_reqbuf_len = alloc_size;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= msgsize);
+               memset(req->rq_reqbuf, 0, msgsize);
+       }
+
+       req->rq_reqmsg = req->rq_reqbuf;
+       return 0;
+}
+
+static
+void null_free_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+       if (!req->rq_pool) {
+               LASSERTF(req->rq_reqmsg == req->rq_reqbuf,
+                        "req %p: reqmsg %p is not reqbuf %p in null sec\n",
+                        req, req->rq_reqmsg, req->rq_reqbuf);
+               LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen,
+                        "req %p: reqlen %d should not exceed buflen %d\n",
+                        req, req->rq_reqlen, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+}
+
+static
+int null_alloc_repbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize)
+{
+       /* add space for early replies */
+       msgsize += lustre_msg_early_size();
+
+       msgsize = size_roundup_power2(msgsize);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, msgsize);
+       if (!req->rq_repbuf)
+               return -ENOMEM;
+
+       req->rq_repbuf_len = msgsize;
+       return 0;
+}
+
+static
+void null_free_repbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+       LASSERT(req->rq_repbuf);
+
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+}
+
+static
+int null_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                       struct ptlrpc_request *req,
+                       int segment, int newsize)
+{
+       struct lustre_msg *newbuf;
+       struct lustre_msg *oldbuf = req->rq_reqmsg;
+       int                oldsize, newmsg_size, alloc_size;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf == req->rq_reqmsg);
+       LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+       LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf));
+
+       /* compute new message size */
+       oldsize = req->rq_reqbuf->lm_buflens[segment];
+       req->rq_reqbuf->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_packed_msg_size(oldbuf);
+       req->rq_reqbuf->lm_buflens[segment] = oldsize;
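+       /* the old length is put back here; _sptlrpc_enlarge_msg_inplace()
+        * below performs the actual in-buffer resize */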
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size);
+
+       if (req->rq_reqbuf_len < newmsg_size) {
+               alloc_size = size_roundup_power2(newmsg_size);
+
+               OBD_ALLOC_LARGE(newbuf, alloc_size);
+               if (newbuf == NULL)
+                       return -ENOMEM;
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = req->rq_reqmsg = newbuf;
+               req->rq_reqbuf_len = alloc_size;
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+       req->rq_reqlen = newmsg_size;
+
+       return 0;
+}
+
+static struct ptlrpc_svc_ctx null_svc_ctx = {
+       .sc_refcount    = ATOMIC_INIT(1),
+       .sc_policy      = &null_policy,
+};
+
+static
+int null_accept(struct ptlrpc_request *req)
+{
+       LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+               SPTLRPC_POLICY_NULL);
+
+       if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) {
+               CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc);
+               return SECSVC_DROP;
+       }
+
+       req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf);
+
+       req->rq_reqmsg = req->rq_reqbuf;
+       req->rq_reqlen = req->rq_reqdata_len;
+
+       req->rq_svc_ctx = &null_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       return SECSVC_OK;
+}
+
+static
+int null_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_reply_state *rs;
+       int rs_size = sizeof(*rs) + msgsize;
+
+       LASSERT(msgsize % 8 == 0);
+
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       return -ENOMEM;
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = rs_size - sizeof(*rs);
+       rs->rs_msg = rs->rs_repbuf;
+
+       req->rq_reply_state = rs;
+       return 0;
+}
+
+static
+void null_free_rs(struct ptlrpc_reply_state *rs)
+{
+       LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1);
+       atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+}
+
+static
+int null_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+
+       LASSERT(rs);
+
+       rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL;
+       rs->rs_repdata_len = req->rq_replen;
+
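+       /* a final reply only needs its offset set (leaving early-reply
+        * headroom when the client supports adaptive timeouts); an early
+        * reply instead carries an inline checksum */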
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = lustre_msg_early_size();
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               __u32 cksum;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0)
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_CKSUM_INCOMPAT18)
+                       cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 0);
+               else
+                       cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 1);
+#else
+# warning "remove checksum compatibility support for b1_8"
+               cksum = lustre_msg_calc_cksum(rs->rs_repbuf);
+#endif
+               lustre_msg_set_cksum(rs->rs_repbuf, cksum);
+               req->rq_reply_off = 0;
+       }
+
+       return 0;
+}
+
+static struct ptlrpc_ctx_ops null_ctx_ops = {
+       .refresh                = null_ctx_refresh,
+       .sign                   = null_ctx_sign,
+       .verify                 = null_ctx_verify,
+};
+
+static struct ptlrpc_sec_cops null_sec_cops = {
+       .create_sec             = null_create_sec,
+       .destroy_sec            = null_destroy_sec,
+       .lookup_ctx             = null_lookup_ctx,
+       .flush_ctx_cache        = null_flush_ctx_cache,
+       .alloc_reqbuf           = null_alloc_reqbuf,
+       .alloc_repbuf           = null_alloc_repbuf,
+       .free_reqbuf            = null_free_reqbuf,
+       .free_repbuf            = null_free_repbuf,
+       .enlarge_reqbuf         = null_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops null_sec_sops = {
+       .accept                 = null_accept,
+       .alloc_rs               = null_alloc_rs,
+       .authorize              = null_authorize,
+       .free_rs                = null_free_rs,
+};
+
+static struct ptlrpc_sec_policy null_policy = {
+       .sp_owner               = THIS_MODULE,
+       .sp_name                = "sec.null",
+       .sp_policy              = SPTLRPC_POLICY_NULL,
+       .sp_cops                = &null_sec_cops,
+       .sp_sops                = &null_sec_sops,
+};
+
+static void null_init_internal(void)
+{
+       static HLIST_HEAD(__list);
+
+       null_sec.ps_policy = &null_policy;
+       atomic_set(&null_sec.ps_refcount, 1);     /* always busy */
+       null_sec.ps_id = -1;
+       null_sec.ps_import = NULL;
+       null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+       null_sec.ps_flvr.sf_flags = 0;
+       null_sec.ps_part = LUSTRE_SP_ANY;
+       null_sec.ps_dying = 0;
+       spin_lock_init(&null_sec.ps_lock);
+       atomic_set(&null_sec.ps_nctx, 1);        /* for "null_cli_ctx" */
+       INIT_LIST_HEAD(&null_sec.ps_gc_list);
+       null_sec.ps_gc_interval = 0;
+       null_sec.ps_gc_next = 0;
+
+       hlist_add_head(&null_cli_ctx.cc_cache, &__list);
+       atomic_set(&null_cli_ctx.cc_refcount, 1);    /* for hash */
+       null_cli_ctx.cc_sec = &null_sec;
+       null_cli_ctx.cc_ops = &null_ctx_ops;
+       null_cli_ctx.cc_expire = 0;
+       null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL |
+                               PTLRPC_CTX_UPTODATE;
+       null_cli_ctx.cc_vcred.vc_uid = 0;
+       spin_lock_init(&null_cli_ctx.cc_lock);
+       INIT_LIST_HEAD(&null_cli_ctx.cc_req_list);
+       INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain);
+}
+
+int sptlrpc_null_init(void)
+{
+       int rc;
+
+       null_init_internal();
+
+       rc = sptlrpc_register_policy(&null_policy);
+       if (rc)
+               CERROR("failed to register %s: %d\n", null_policy.sp_name, rc);
+
+       return rc;
+}
+
+void sptlrpc_null_fini(void)
+{
+       int rc;
+
+       rc = sptlrpc_unregister_policy(&null_policy);
+       if (rc)
+               CERROR("failed to unregister %s: %d\n",
+                      null_policy.sp_name, rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c
new file mode 100644 (file)
index 0000000..f552d2f
--- /dev/null
@@ -0,0 +1,1021 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/ptlrpc/sec_plain.c
+ *
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+
+#include <obd_support.h>
+#include <obd_cksum.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_sec.h>
+
+struct plain_sec {
+       struct ptlrpc_sec       pls_base;
+       rwlock_t                pls_lock;
+       struct ptlrpc_cli_ctx  *pls_ctx;
+};
+
+static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec)
+{
+       return container_of(sec, struct plain_sec, pls_base);
+}
+
+static struct ptlrpc_sec_policy plain_policy;
+static struct ptlrpc_ctx_ops    plain_ctx_ops;
+static struct ptlrpc_svc_ctx    plain_svc_ctx;
+
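+/* headroom reserved in reply buffers for early replies,
+ * see plain_alloc_repbuf() */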
+static unsigned int plain_at_offset;
+
+/*
+ * for simplicity, plain policy rpcs use a fixed layout.
+ */
+#define PLAIN_PACK_SEGMENTS    (4)
+
+#define PLAIN_PACK_HDR_OFF     (0)
+#define PLAIN_PACK_MSG_OFF     (1)
+#define PLAIN_PACK_USER_OFF    (2)
+#define PLAIN_PACK_BULK_OFF    (3)
+
+#define PLAIN_FL_USER          (0x01)
+#define PLAIN_FL_BULK          (0x02)
+
+struct plain_header {
+       __u8    ph_ver;            /* 0 */
+       __u8    ph_flags;
+       __u8    ph_sp;             /* source */
+       __u8    ph_bulk_hash_alg;  /* complete flavor desc */
+       __u8    ph_pad[4];
+};
+
+struct plain_bulk_token {
+       __u8    pbt_hash[8];
+};
+
+#define PLAIN_BSD_SIZE \
+       (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token))
+
+/****************************************
+ * bulk checksum helpers                *
+ ****************************************/
+
+static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+
+       if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed))
+               return -EPROTO;
+
+       bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE);
+       if (bsd == NULL) {
+               CERROR("bulk sec desc has short size %d\n",
+                      lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF));
+               return -EPROTO;
+       }
+
+       if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL &&
+           bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) {
+               CERROR("invalid bulk svc %u\n", bsd->bsd_svc);
+               return -EPROTO;
+       }
+
+       return 0;
+}
+
+static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                   __u8 hash_alg,
+                                   struct plain_bulk_token *token)
+{
+       if (hash_alg == BULK_HASH_ALG_NULL)
+               return 0;
+
+       memset(token->pbt_hash, 0, sizeof(token->pbt_hash));
+       return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash,
+                                        sizeof(token->pbt_hash));
+}
+
+static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc,
+                                 __u8 hash_alg,
+                                 struct plain_bulk_token *tokenr)
+{
+       struct plain_bulk_token tokenv;
+       int                     rc;
+
+       if (hash_alg == BULK_HASH_ALG_NULL)
+               return 0;
+
+       memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash));
+       rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash,
+                                      sizeof(tokenv.pbt_hash));
+       if (rc)
+               return rc;
+
+       if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash)))
+               return -EACCES;
+       return 0;
+}
+
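+/* flip a single bit in the first non-empty page of the bulk; intended
+ * for fault-injection testing of the checksum paths */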
+static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc)
+{
+       char         *ptr;
+       unsigned int  off, i;
+
+       for (i = 0; i < desc->bd_iov_count; i++) {
+               if (desc->bd_iov[i].kiov_len == 0)
+                       continue;
+
+               ptr = kmap(desc->bd_iov[i].kiov_page);
+               off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
+               ptr[off] ^= 0x1;
+               kunmap(desc->bd_iov[i].kiov_page);
+               return;
+       }
+}
+
+/****************************************
+ * cli_ctx apis                         *
+ ****************************************/
+
+static
+int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx)
+{
+       /* should never reach here */
+       LBUG();
+       return 0;
+}
+
+static
+int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx)
+{
+       return 0;
+}
+
+static
+int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_reqbuf;
+       struct plain_header *phdr;
+       ENTRY;
+
+       msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+       phdr->ph_ver = 0;
+       phdr->ph_flags = 0;
+       phdr->ph_sp = ctx->cc_sec->ps_part;
+       phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+       if (req->rq_pack_udesc)
+               phdr->ph_flags |= PLAIN_FL_USER;
+       if (req->rq_pack_bulk)
+               phdr->ph_flags |= PLAIN_FL_BULK;
+
+       req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount,
+                                                msg->lm_buflens);
+       RETURN(0);
+}
+
+static
+int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_repdata;
+       struct plain_header *phdr;
+       __u32                cksum;
+       int                  swabbed;
+       ENTRY;
+
+       if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) {
+               CERROR("unexpected reply buf count %u\n", msg->lm_bufcount);
+               RETURN(-EPROTO);
+       }
+
+       swabbed = ptlrpc_rep_need_swab(req);
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+       if (phdr == NULL) {
+               CERROR("missing plain header\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_ver != 0) {
+               CERROR("Invalid header version\n");
+               RETURN(-EPROTO);
+       }
+
+       /* expect no user desc in reply */
+       if (phdr->ph_flags & PLAIN_FL_USER) {
+               CERROR("Unexpected udesc flag in reply\n");
+               RETURN(-EPROTO);
+       }
+
+       if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) {
+               CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg,
+                      req->rq_flvr.u_bulk.hash.hash_alg);
+               RETURN(-EPROTO);
+       }
+
+       if (unlikely(req->rq_early)) {
+               unsigned int hsize = 4;
+
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+                               lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+                               lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+                               NULL, 0, (unsigned char *)&cksum, &hsize);
+               if (cksum != msg->lm_cksum) {
+                       CDEBUG(D_SEC,
+                              "early reply checksum mismatch: %08x != %08x\n",
+                              cpu_to_le32(cksum), msg->lm_cksum);
+                       RETURN(-EINVAL);
+               }
+       } else {
+               /* whether or not we sent bulk, we expect the same in the
+                * reply, except for early replies */
+               if (!req->rq_early &&
+                   !equi(req->rq_pack_bulk == 1,
+                         phdr->ph_flags & PLAIN_FL_BULK)) {
+                       CERROR("%s bulk checksum in reply\n",
+                              req->rq_pack_bulk ? "Missing" : "Unexpected");
+                       RETURN(-EPROTO);
+               }
+
+               if (phdr->ph_flags & PLAIN_FL_BULK) {
+                       if (plain_unpack_bsd(msg, swabbed))
+                               RETURN(-EPROTO);
+               }
+       }
+
+       req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+       req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF);
+       RETURN(0);
+}
+
+static
+int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                       struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_bulk_sec_desc *bsd;
+       struct plain_bulk_token     *token;
+       int                       rc;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+       bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       token = (struct plain_bulk_token *) bsd->bsd_data;
+
+       bsd->bsd_version = 0;
+       bsd->bsd_flags = 0;
+       bsd->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc);
+
+       if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               RETURN(0);
+
+       if (req->rq_bulk_read)
+               RETURN(0);
+
+       rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                     token);
+       if (rc) {
+               CERROR("bulk write: failed to compute checksum: %d\n", rc);
+       } else {
+               /*
+                * for sending we only compute a wrong checksum instead of
+                * corrupting the data, so it is still correct on a resend
+                */
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) &&
+                   req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL)
+                       token->pbt_hash[0] ^= 0x1;
+       }
+
+       return rc;
+}
+
+static
+int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx,
+                         struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_bulk_sec_desc *bsdv;
+       struct plain_bulk_token     *tokenv;
+       int                          rc;
+       int                          i, nob;
+
+       LASSERT(req->rq_pack_bulk);
+       LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS);
+       LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS);
+
+       bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0);
+       tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+       if (req->rq_bulk_write) {
+               if (bsdv->bsd_flags & BSD_FL_ERR)
+                       return -EIO;
+               return 0;
+       }
+
+       /* fix the actual data size: trim the iovs down to the bytes
+        * actually transferred, so the checksum below only covers
+        * valid data */
+       for (i = 0, nob = 0; i < desc->bd_iov_count; i++) {
+               if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) {
+                       desc->bd_iov[i].kiov_len =
+                               desc->bd_nob_transferred - nob;
+               }
+               nob += desc->bd_iov[i].kiov_len;
+       }
+
+       rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                   tokenv);
+       if (rc)
+               CERROR("bulk read: client verify failed: %d\n", rc);
+
+       return rc;
+}
+
+/****************************************
+ * sec apis                             *
+ ****************************************/
+
+static
+struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec)
+{
+       struct ptlrpc_cli_ctx  *ctx, *ctx_new;
+
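+       /* optimistically allocate outside the lock; if another thread
+        * installs a ctx first, the spare allocation is freed below */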
+       OBD_ALLOC_PTR(ctx_new);
+
+       write_lock(&plsec->pls_lock);
+
+       ctx = plsec->pls_ctx;
+       if (ctx) {
+               atomic_inc(&ctx->cc_refcount);
+
+               if (ctx_new)
+                       OBD_FREE_PTR(ctx_new);
+       } else if (ctx_new) {
+               ctx = ctx_new;
+
+               atomic_set(&ctx->cc_refcount, 1); /* for cache */
+               ctx->cc_sec = &plsec->pls_base;
+               ctx->cc_ops = &plain_ctx_ops;
+               ctx->cc_expire = 0;
+               ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE;
+               ctx->cc_vcred.vc_uid = 0;
+               spin_lock_init(&ctx->cc_lock);
+               INIT_LIST_HEAD(&ctx->cc_req_list);
+               INIT_LIST_HEAD(&ctx->cc_gc_chain);
+
+               plsec->pls_ctx = ctx;
+               atomic_inc(&plsec->pls_base.ps_nctx);
+               atomic_inc(&plsec->pls_base.ps_refcount);
+
+               atomic_inc(&ctx->cc_refcount); /* for caller */
+       }
+
+       write_unlock(&plsec->pls_lock);
+
+       return ctx;
+}
+
+static
+void plain_destroy_sec(struct ptlrpc_sec *sec)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       ENTRY;
+
+       LASSERT(sec->ps_policy == &plain_policy);
+       LASSERT(sec->ps_import);
+       LASSERT(atomic_read(&sec->ps_refcount) == 0);
+       LASSERT(atomic_read(&sec->ps_nctx) == 0);
+       LASSERT(plsec->pls_ctx == NULL);
+
+       class_import_put(sec->ps_import);
+
+       OBD_FREE_PTR(plsec);
+       EXIT;
+}
+
+static
+void plain_kill_sec(struct ptlrpc_sec *sec)
+{
+       sec->ps_dying = 1;
+}
+
+static
+struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
+                                   struct ptlrpc_svc_ctx *svc_ctx,
+                                   struct sptlrpc_flavor *sf)
+{
+       struct plain_sec       *plsec;
+       struct ptlrpc_sec      *sec;
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN);
+
+       OBD_ALLOC_PTR(plsec);
+       if (plsec == NULL)
+               RETURN(NULL);
+
+       /*
+        * initialize plain_sec
+        */
+       rwlock_init(&plsec->pls_lock);
+       plsec->pls_ctx = NULL;
+
+       sec = &plsec->pls_base;
+       sec->ps_policy = &plain_policy;
+       atomic_set(&sec->ps_refcount, 0);
+       atomic_set(&sec->ps_nctx, 0);
+       sec->ps_id = sptlrpc_get_next_secid();
+       sec->ps_import = class_import_get(imp);
+       sec->ps_flvr = *sf;
+       spin_lock_init(&sec->ps_lock);
+       INIT_LIST_HEAD(&sec->ps_gc_list);
+       sec->ps_gc_interval = 0;
+       sec->ps_gc_next = 0;
+
+       /* install ctx immediately if this is a reverse sec */
+       if (svc_ctx) {
+               ctx = plain_sec_install_ctx(plsec);
+               if (ctx == NULL) {
+                       plain_destroy_sec(sec);
+                       RETURN(NULL);
+               }
+               sptlrpc_cli_ctx_put(ctx, 1);
+       }
+
+       RETURN(sec);
+}
+
+static
+struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec,
+                                       struct vfs_cred *vcred,
+                                       int create, int remove_dead)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       read_lock(&plsec->pls_lock);
+       ctx = plsec->pls_ctx;
+       if (ctx)
+               atomic_inc(&ctx->cc_refcount);
+       read_unlock(&plsec->pls_lock);
+
+       if (unlikely(ctx == NULL))
+               ctx = plain_sec_install_ctx(plsec);
+
+       RETURN(ctx);
+}
+
+static
+void plain_release_ctx(struct ptlrpc_sec *sec,
+                      struct ptlrpc_cli_ctx *ctx, int sync)
+{
+       LASSERT(atomic_read(&sec->ps_refcount) > 0);
+       LASSERT(atomic_read(&sec->ps_nctx) > 0);
+       LASSERT(atomic_read(&ctx->cc_refcount) == 0);
+       LASSERT(ctx->cc_sec == sec);
+
+       OBD_FREE_PTR(ctx);
+
+       atomic_dec(&sec->ps_nctx);
+       sptlrpc_sec_put(sec);
+}
+
+static
+int plain_flush_ctx_cache(struct ptlrpc_sec *sec,
+                         uid_t uid, int grace, int force)
+{
+       struct plain_sec       *plsec = sec2plsec(sec);
+       struct ptlrpc_cli_ctx  *ctx;
+       ENTRY;
+
+       /* do nothing unless the caller wants to flush for 'all' */
+       if (uid != -1)
+               RETURN(0);
+
+       write_lock(&plsec->pls_lock);
+       ctx = plsec->pls_ctx;
+       plsec->pls_ctx = NULL;
+       write_unlock(&plsec->pls_lock);
+
+       if (ctx)
+               sptlrpc_cli_ctx_put(ctx, 1);
+       RETURN(0);
+}
+
+static
+int plain_alloc_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int msgsize)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int   alloc_len;
+       ENTRY;
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_udesc)
+               buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size();
+
+       if (req->rq_pack_bulk) {
+               LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+       }
+
+       alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       if (!req->rq_reqbuf) {
+               LASSERT(!req->rq_pool);
+
+               alloc_len = size_roundup_power2(alloc_len);
+               OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len);
+               if (!req->rq_reqbuf)
+                       RETURN(-ENOMEM);
+
+               req->rq_reqbuf_len = alloc_len;
+       } else {
+               LASSERT(req->rq_pool);
+               LASSERT(req->rq_reqbuf_len >= alloc_len);
+               memset(req->rq_reqbuf, 0, alloc_len);
+       }
+
+       lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+       req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0);
+
+       if (req->rq_pack_udesc)
+               sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF);
+
+       RETURN(0);
+}
+
+static
+void plain_free_reqbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req)
+{
+       ENTRY;
+       if (!req->rq_pool) {
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = NULL;
+               req->rq_reqbuf_len = 0;
+       }
+       EXIT;
+}
+
+static
+int plain_alloc_repbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req,
+                      int msgsize)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int alloc_len;
+       ENTRY;
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_bulk) {
+               LASSERT(req->rq_bulk_read || req->rq_bulk_write);
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+       }
+
+       alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       /* add space for early reply */
+       alloc_len += plain_at_offset;
+
+       alloc_len = size_roundup_power2(alloc_len);
+
+       OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len);
+       if (!req->rq_repbuf)
+               RETURN(-ENOMEM);
+
+       req->rq_repbuf_len = alloc_len;
+       RETURN(0);
+}
+
+static
+void plain_free_repbuf(struct ptlrpc_sec *sec,
+                      struct ptlrpc_request *req)
+{
+       ENTRY;
+       OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len);
+       req->rq_repbuf = NULL;
+       req->rq_repbuf_len = 0;
+       EXIT;
+}
+
+static
+int plain_enlarge_reqbuf(struct ptlrpc_sec *sec,
+                        struct ptlrpc_request *req,
+                        int segment, int newsize)
+{
+       struct lustre_msg *newbuf;
+       int                oldsize;
+       int                newmsg_size, newbuf_size;
+       ENTRY;
+
+       LASSERT(req->rq_reqbuf);
+       LASSERT(req->rq_reqbuf_len >= req->rq_reqlen);
+       LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) ==
+               req->rq_reqmsg);
+
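+       /*
+        * Sizing trick: temporarily write the new segment length into
+        * lm_buflens, let lustre_msg_size_v2() compute the would-be total,
+        * then restore the old length.  This is done once for the embedded
+        * message and once for the wrapper message below.
+        */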
+       /* compute new embedded msg size.  */
+       oldsize = req->rq_reqmsg->lm_buflens[segment];
+       req->rq_reqmsg->lm_buflens[segment] = newsize;
+       newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount,
+                                        req->rq_reqmsg->lm_buflens);
+       req->rq_reqmsg->lm_buflens[segment] = oldsize;
+
+       /* compute new wrapper msg size.  */
+       oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF];
+       req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size;
+       newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount,
+                                        req->rq_reqbuf->lm_buflens);
+       req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize;
+
+       /* request from pool should always have enough buffer */
+       LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size);
+
+       if (req->rq_reqbuf_len < newbuf_size) {
+               newbuf_size = size_roundup_power2(newbuf_size);
+
+               OBD_ALLOC_LARGE(newbuf, newbuf_size);
+               if (newbuf == NULL)
+                       RETURN(-ENOMEM);
+
+               memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len);
+
+               OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len);
+               req->rq_reqbuf = newbuf;
+               req->rq_reqbuf_len = newbuf_size;
+               req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf,
+                                               PLAIN_PACK_MSG_OFF, 0);
+       }
+
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF,
+                                    newmsg_size);
+       _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize);
+
+       req->rq_reqlen = newmsg_size;
+       RETURN(0);
+}
+
+/****************************************
+ * service apis                         *
+ ****************************************/
+
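+/*
+ * The plain policy keeps no per-client state on the server side, so a
+ * single statically allocated service context is shared by all requests;
+ * only its refcount changes.
+ */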
+static struct ptlrpc_svc_ctx plain_svc_ctx = {
+       .sc_refcount    = ATOMIC_INIT(1),
+       .sc_policy      = &plain_policy,
+};
+
+static
+int plain_accept(struct ptlrpc_request *req)
+{
+       struct lustre_msg   *msg = req->rq_reqbuf;
+       struct plain_header *phdr;
+       int               swabbed;
+       ENTRY;
+
+       LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) ==
+               SPTLRPC_POLICY_PLAIN);
+
+       if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) !=
+           SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) ||
+           SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) !=
+           SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) {
+               CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc);
+               RETURN(SECSVC_DROP);
+       }
+
+       if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) {
+               CERROR("unexpected request buf count %u\n", msg->lm_bufcount);
+               RETURN(SECSVC_DROP);
+       }
+
+       swabbed = ptlrpc_req_need_swab(req);
+
+       /* accept() must return a SECSVC_* code rather than -errno */
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr));
+       if (phdr == NULL) {
+               CERROR("missing plain header\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (phdr->ph_ver != 0) {
+               CERROR("Invalid header version\n");
+               RETURN(SECSVC_DROP);
+       }
+
+       if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) {
+               CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg);
+               RETURN(SECSVC_DROP);
+       }
+
+       req->rq_sp_from = phdr->ph_sp;
+       req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg;
+
+       if (phdr->ph_flags & PLAIN_FL_USER) {
+               if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF,
+                                            swabbed)) {
+                       CERROR("Mal-formed user descriptor\n");
+                       RETURN(SECSVC_DROP);
+               }
+
+               req->rq_pack_udesc = 1;
+               req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0);
+       }
+
+       if (phdr->ph_flags & PLAIN_FL_BULK) {
+               if (plain_unpack_bsd(msg, swabbed))
+                       RETURN(SECSVC_DROP);
+
+               req->rq_pack_bulk = 1;
+       }
+
+       req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0);
+       req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF];
+
+       req->rq_svc_ctx = &plain_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+
+       RETURN(SECSVC_OK);
+}
+
+static
+int plain_alloc_rs(struct ptlrpc_request *req, int msgsize)
+{
+       struct ptlrpc_reply_state *rs;
+       __u32                      buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int                        rs_size = sizeof(*rs);
+       ENTRY;
+
+       LASSERT(msgsize % 8 == 0);
+
+       buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header);
+       buflens[PLAIN_PACK_MSG_OFF] = msgsize;
+
+       if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write))
+               buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE;
+
+       rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       rs = req->rq_reply_state;
+
+       if (rs) {
+               /* pre-allocated */
+               LASSERT(rs->rs_size >= rs_size);
+       } else {
+               OBD_ALLOC_LARGE(rs, rs_size);
+               if (rs == NULL)
+                       RETURN(-ENOMEM);
+
+               rs->rs_size = rs_size;
+       }
+
+       rs->rs_svc_ctx = req->rq_svc_ctx;
+       atomic_inc(&req->rq_svc_ctx->sc_refcount);
+       rs->rs_repbuf = (struct lustre_msg *) (rs + 1);
+       rs->rs_repbuf_len = rs_size - sizeof(*rs);
+
+       lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL);
+       rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0);
+
+       req->rq_reply_state = rs;
+       RETURN(0);
+}
+
+static
+void plain_free_rs(struct ptlrpc_reply_state *rs)
+{
+       ENTRY;
+
+       LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1);
+       atomic_dec(&rs->rs_svc_ctx->sc_refcount);
+
+       if (!rs->rs_prealloc)
+               OBD_FREE_LARGE(rs, rs->rs_size);
+       EXIT;
+}
+
+static
+int plain_authorize(struct ptlrpc_request *req)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       struct lustre_msg_v2      *msg = rs->rs_repbuf;
+       struct plain_header       *phdr;
+       int                        len;
+       ENTRY;
+
+       LASSERT(rs);
+       LASSERT(msg);
+
+       if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF])
+               len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF,
+                                       req->rq_replen, 1);
+       else
+               len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens);
+
+       msg->lm_secflvr = req->rq_flvr.sf_rpc;
+
+       phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0);
+       phdr->ph_ver = 0;
+       phdr->ph_flags = 0;
+       phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg;
+
+       if (req->rq_pack_bulk)
+               phdr->ph_flags |= PLAIN_FL_BULK;
+
+       rs->rs_repdata_len = len;
+
+       if (likely(req->rq_packed_final)) {
+               if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)
+                       req->rq_reply_off = plain_at_offset;
+               else
+                       req->rq_reply_off = 0;
+       } else {
+               unsigned int hsize = 4;
+
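+               /*
+                * The reply is not in its final form yet, so checksum the
+                * message segment with CRC32 and stash it in lm_cksum; the
+                * client-side verify callback can then check this (early)
+                * reply against the checksum.
+                */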
+               cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32,
+                       lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0),
+                       lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF),
+                       NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize);
+               req->rq_reply_off = 0;
+       }
+
+       RETURN(0);
+}
+
+static
+int plain_svc_unwrap_bulk(struct ptlrpc_request *req,
+                         struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+       struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+       struct plain_bulk_token     *tokenr;
+       int                       rc;
+
+       LASSERT(req->rq_bulk_write);
+       LASSERT(req->rq_pack_bulk);
+
+       bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       tokenr = (struct plain_bulk_token *) bsdr->bsd_data;
+       bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               return 0;
+
+       rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                   tokenr);
+       if (rc) {
+               bsdv->bsd_flags |= BSD_FL_ERR;
+               CERROR("bulk write: server verify failed: %d\n", rc);
+       }
+
+       return rc;
+}
+
+static
+int plain_svc_wrap_bulk(struct ptlrpc_request *req,
+                       struct ptlrpc_bulk_desc *desc)
+{
+       struct ptlrpc_reply_state   *rs = req->rq_reply_state;
+       struct ptlrpc_bulk_sec_desc *bsdr, *bsdv;
+       struct plain_bulk_token     *tokenv;
+       int                       rc;
+
+       LASSERT(req->rq_bulk_read);
+       LASSERT(req->rq_pack_bulk);
+
+       bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0);
+       bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0);
+       tokenv = (struct plain_bulk_token *) bsdv->bsd_data;
+
+       bsdv->bsd_version = 0;
+       bsdv->bsd_type = SPTLRPC_BULK_DEFAULT;
+       bsdv->bsd_svc = bsdr->bsd_svc;
+       bsdv->bsd_flags = 0;
+
+       if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL)
+               return 0;
+
+       rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg,
+                                     tokenv);
+       if (rc) {
+               CERROR("bulk read: server failed to compute "
+                      "checksum: %d\n", rc);
+       } else {
+               if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE))
+                       corrupt_bulk_data(desc);
+       }
+
+       return rc;
+}
+
+static struct ptlrpc_ctx_ops plain_ctx_ops = {
+       .refresh                = plain_ctx_refresh,
+       .validate               = plain_ctx_validate,
+       .sign                   = plain_ctx_sign,
+       .verify                 = plain_ctx_verify,
+       .wrap_bulk              = plain_cli_wrap_bulk,
+       .unwrap_bulk            = plain_cli_unwrap_bulk,
+};
+
+static struct ptlrpc_sec_cops plain_sec_cops = {
+       .create_sec             = plain_create_sec,
+       .destroy_sec            = plain_destroy_sec,
+       .kill_sec               = plain_kill_sec,
+       .lookup_ctx             = plain_lookup_ctx,
+       .release_ctx            = plain_release_ctx,
+       .flush_ctx_cache        = plain_flush_ctx_cache,
+       .alloc_reqbuf           = plain_alloc_reqbuf,
+       .free_reqbuf            = plain_free_reqbuf,
+       .alloc_repbuf           = plain_alloc_repbuf,
+       .free_repbuf            = plain_free_repbuf,
+       .enlarge_reqbuf         = plain_enlarge_reqbuf,
+};
+
+static struct ptlrpc_sec_sops plain_sec_sops = {
+       .accept                 = plain_accept,
+       .alloc_rs               = plain_alloc_rs,
+       .authorize              = plain_authorize,
+       .free_rs                = plain_free_rs,
+       .unwrap_bulk            = plain_svc_unwrap_bulk,
+       .wrap_bulk              = plain_svc_wrap_bulk,
+};
+
+static struct ptlrpc_sec_policy plain_policy = {
+       .sp_owner               = THIS_MODULE,
+       .sp_name                = "plain",
+       .sp_policy              = SPTLRPC_POLICY_PLAIN,
+       .sp_cops                = &plain_sec_cops,
+       .sp_sops                = &plain_sec_sops,
+};
+
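+/*
+ * Module wiring: plain_sec_cops drives the client side and plain_sec_sops
+ * the server side of the policy registered below.  plain_at_offset, used
+ * above to reserve room for early replies, is computed once at init time
+ * from the early-reply message size.
+ */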
+int sptlrpc_plain_init(void)
+{
+       __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, };
+       int rc;
+
+       buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size();
+       plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens);
+
+       rc = sptlrpc_register_policy(&plain_policy);
+       if (rc)
+               CERROR("failed to register: %d\n", rc);
+
+       return rc;
+}
+
+void sptlrpc_plain_fini(void)
+{
+       int rc;
+
+       rc = sptlrpc_unregister_policy(&plain_policy);
+       if (rc)
+               CERROR("cannot unregister: %d\n", rc);
+}
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
new file mode 100644 (file)
index 0000000..1667b8e
--- /dev/null
@@ -0,0 +1,3129 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lu_object.h>
+#include <linux/lnet/types.h>
+#include "ptlrpc_internal.h"
+
+/* The following are visible and mutable through /sys/module/ptlrpc */
+int test_req_buffer_pressure = 0;
+CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
+               "set non-zero to put pressure on request buffer pools");
+CFS_MODULE_PARM(at_min, "i", int, 0644,
+               "Adaptive timeout minimum (sec)");
+CFS_MODULE_PARM(at_max, "i", int, 0644,
+               "Adaptive timeout maximum (sec)");
+CFS_MODULE_PARM(at_history, "i", int, 0644,
+               "Adaptive timeouts remember the slowest event that took place "
+               "within this period (sec)");
+CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
+               "How soon before an RPC deadline to send an early reply");
+CFS_MODULE_PARM(at_extra, "i", int, 0644,
+               "How much extra time to give with each early reply");
+
+
+/* forward ref */
+static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
+static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);
+
+/** Holds a list of all PTLRPC services */
+LIST_HEAD(ptlrpc_all_services);
+/** Used to protect the \e ptlrpc_all_services list */
+struct mutex ptlrpc_all_services_mutex;
+
+struct ptlrpc_request_buffer_desc *
+ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       struct ptlrpc_request_buffer_desc *rqbd;
+
+       OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
+       if (rqbd == NULL)
+               return NULL;
+
+       rqbd->rqbd_svcpt = svcpt;
+       rqbd->rqbd_refcount = 0;
+       rqbd->rqbd_cbid.cbid_fn = request_in_callback;
+       rqbd->rqbd_cbid.cbid_arg = rqbd;
+       INIT_LIST_HEAD(&rqbd->rqbd_reqs);
+       OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
+                           svcpt->scp_cpt, svc->srv_buf_size);
+       if (rqbd->rqbd_buffer == NULL) {
+               OBD_FREE_PTR(rqbd);
+               return NULL;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+       list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+       svcpt->scp_nrqbds_total++;
+       spin_unlock(&svcpt->scp_lock);
+
+       return rqbd;
+}
+
+void
+ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
+{
+       struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
+
+       LASSERT(rqbd->rqbd_refcount == 0);
+       LASSERT(list_empty(&rqbd->rqbd_reqs));
+
+       spin_lock(&svcpt->scp_lock);
+       list_del(&rqbd->rqbd_list);
+       svcpt->scp_nrqbds_total--;
+       spin_unlock(&svcpt->scp_lock);
+
+       OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
+       OBD_FREE_PTR(rqbd);
+}
+
+int
+ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
+{
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       struct ptlrpc_request_buffer_desc *rqbd;
+       int                             rc = 0;
+       int                             i;
+
+       if (svcpt->scp_rqbd_allocating)
+               goto try_post;
+
+       spin_lock(&svcpt->scp_lock);
+       /* check again with lock */
+       if (svcpt->scp_rqbd_allocating) {
+               /* NB: we might allow more than one thread in the future */
+               LASSERT(svcpt->scp_rqbd_allocating == 1);
+               spin_unlock(&svcpt->scp_lock);
+               goto try_post;
+       }
+
+       svcpt->scp_rqbd_allocating++;
+       spin_unlock(&svcpt->scp_lock);
+
+
+       for (i = 0; i < svc->srv_nbuf_per_group; i++) {
+               /* NB: another thread might have recycled enough rqbds, we
+                * need to make sure it wouldn't over-allocate, see LU-1212. */
+               if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
+                       break;
+
+               rqbd = ptlrpc_alloc_rqbd(svcpt);
+
+               if (rqbd == NULL) {
+                       CERROR("%s: Can't allocate request buffer\n",
+                              svc->srv_name);
+                       rc = -ENOMEM;
+                       break;
+               }
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       LASSERT(svcpt->scp_rqbd_allocating == 1);
+       svcpt->scp_rqbd_allocating--;
+
+       spin_unlock(&svcpt->scp_lock);
+
+       CDEBUG(D_RPCTRACE,
+              "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
+              svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
+              svcpt->scp_nrqbds_total, rc);
+
+ try_post:
+       if (post && rc == 0)
+               rc = ptlrpc_server_post_idle_rqbds(svcpt);
+
+       return rc;
+}
+
+/**
+ * Part of Rep-Ack logic.
+ * Puts a lock and its mode into the reply state associated with the
+ * request reply.
+ */
+void
+ptlrpc_save_lock(struct ptlrpc_request *req,
+                struct lustre_handle *lock, int mode, int no_ack)
+{
+       struct ptlrpc_reply_state *rs = req->rq_reply_state;
+       int                     idx;
+
+       LASSERT(rs != NULL);
+       LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
+
+       if (req->rq_export->exp_disconnected) {
+               ldlm_lock_decref(lock, mode);
+       } else {
+               idx = rs->rs_nlocks++;
+               rs->rs_locks[idx] = *lock;
+               rs->rs_modes[idx] = mode;
+               rs->rs_difficult = 1;
+               rs->rs_no_ack = !!no_ack;
+       }
+}
+EXPORT_SYMBOL(ptlrpc_save_lock);
+
+
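+/*
+ * Difficult replies (those needing client ACKs) are handed off to a pool
+ * of dedicated reply-handling (HR) threads.  Threads are grouped into
+ * per-CPT partitions; ptlrpc_hr_select() below picks a partition (CPT
+ * match or round-robin) and then round-robins among its threads.
+ */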
+struct ptlrpc_hr_partition;
+
+struct ptlrpc_hr_thread {
+       int                             hrt_id;         /* thread ID */
+       spinlock_t                      hrt_lock;
+       wait_queue_head_t               hrt_waitq;
+       struct list_head                hrt_queue;      /* RS queue */
+       struct ptlrpc_hr_partition      *hrt_partition;
+};
+
+struct ptlrpc_hr_partition {
+       /* # of started threads */
+       atomic_t                        hrp_nstarted;
+       /* # of stopped threads */
+       atomic_t                        hrp_nstopped;
+       /* cpu partition id */
+       int                             hrp_cpt;
+       /* round-robin rotor for choosing thread */
+       int                             hrp_rotor;
+       /* total number of threads on this partition */
+       int                             hrp_nthrs;
+       /* threads table */
+       struct ptlrpc_hr_thread         *hrp_thrs;
+};
+
+#define HRT_RUNNING 0
+#define HRT_STOPPING 1
+
+struct ptlrpc_hr_service {
+       /* CPU partition table, it's just cfs_cpt_table for now */
+       struct cfs_cpt_table            *hr_cpt_table;
+       /** controller sleep waitq */
+       wait_queue_head_t               hr_waitq;
+       unsigned int                    hr_stopping;
+       /** round-robin rotor for non-affinity service */
+       unsigned int                    hr_rotor;
+       /* partition data */
+       struct ptlrpc_hr_partition      **hr_partitions;
+};
+
+struct rs_batch {
+       struct list_head                rsb_replies;
+       unsigned int                    rsb_n_replies;
+       struct ptlrpc_service_part      *rsb_svcpt;
+};
+
+/** reply handling service. */
+static struct ptlrpc_hr_service ptlrpc_hr;
+
+/**
+ * maximum number of replies scheduled in one batch
+ */
+#define MAX_SCHEDULED 256
+
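+/*
+ * rs_batch accumulates scheduled replies belonging to one service
+ * partition and dispatches them to a single HR thread in one lock/wakeup,
+ * instead of paying that cost per reply.
+ */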
+/**
+ * Initialize a reply batch.
+ *
+ * \param b batch
+ */
+static void rs_batch_init(struct rs_batch *b)
+{
+       memset(b, 0, sizeof(*b));
+       INIT_LIST_HEAD(&b->rsb_replies);
+}
+
+/**
+ * Choose an hr thread to dispatch requests to.
+ */
+static struct ptlrpc_hr_thread *
+ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       unsigned int                    rotor;
+
+       if (svcpt->scp_cpt >= 0 &&
+           svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) {
+               /* directly match partition */
+               hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
+
+       } else {
+               rotor = ptlrpc_hr.hr_rotor++;
+               rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);
+
+               hrp = ptlrpc_hr.hr_partitions[rotor];
+       }
+
+       rotor = hrp->hrp_rotor++;
+       return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs];
+}
+
+/**
+ * Dispatch all replies accumulated in the batch to one of the
+ * dedicated reply handling threads.
+ *
+ * \param b batch
+ */
+static void rs_batch_dispatch(struct rs_batch *b)
+{
+       if (b->rsb_n_replies != 0) {
+               struct ptlrpc_hr_thread *hrt;
+
+               hrt = ptlrpc_hr_select(b->rsb_svcpt);
+
+               spin_lock(&hrt->hrt_lock);
+               list_splice_init(&b->rsb_replies, &hrt->hrt_queue);
+               spin_unlock(&hrt->hrt_lock);
+
+               wake_up(&hrt->hrt_waitq);
+               b->rsb_n_replies = 0;
+       }
+}
+
+/**
+ * Add a reply to a batch.
+ * Add one reply object to a batch; dispatch the batch if it is full.
+ *
+ * \param b batch
+ * \param rs reply
+ */
+static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+
+       if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
+               if (b->rsb_svcpt != NULL) {
+                       rs_batch_dispatch(b);
+                       spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+               }
+               spin_lock(&svcpt->scp_rep_lock);
+               b->rsb_svcpt = svcpt;
+       }
+       spin_lock(&rs->rs_lock);
+       rs->rs_scheduled_ever = 1;
+       if (rs->rs_scheduled == 0) {
+               list_move(&rs->rs_list, &b->rsb_replies);
+               rs->rs_scheduled = 1;
+               b->rsb_n_replies++;
+       }
+       rs->rs_committed = 1;
+       spin_unlock(&rs->rs_lock);
+}
+
+/**
+ * Reply batch finalization.
+ * Dispatch any remaining replies from the batch
+ * and release the per-partition spinlock if held.
+ *
+ * \param b batch
+ */
+static void rs_batch_fini(struct rs_batch *b)
+{
+       if (b->rsb_svcpt != NULL) {
+               rs_batch_dispatch(b);
+               spin_unlock(&b->rsb_svcpt->scp_rep_lock);
+       }
+}
+
+#define DECLARE_RS_BATCH(b)     struct rs_batch b
+
+
+/**
+ * Put reply state into a queue for processing because we received
+ * an ACK from the client.
+ */
+void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_hr_thread *hrt;
+       ENTRY;
+
+       LASSERT(list_empty(&rs->rs_list));
+
+       hrt = ptlrpc_hr_select(rs->rs_svcpt);
+
+       spin_lock(&hrt->hrt_lock);
+       list_add_tail(&rs->rs_list, &hrt->hrt_queue);
+       spin_unlock(&hrt->hrt_lock);
+
+       wake_up(&hrt->hrt_waitq);
+       EXIT;
+}
+
+void
+ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
+{
+       ENTRY;
+
+       LASSERT(spin_is_locked(&rs->rs_svcpt->scp_rep_lock));
+       LASSERT(spin_is_locked(&rs->rs_lock));
+       LASSERT(rs->rs_difficult);
+       rs->rs_scheduled_ever = 1;  /* flag any notification attempt */
+
+       if (rs->rs_scheduled) {     /* being set up or already notified */
+               EXIT;
+               return;
+       }
+
+       rs->rs_scheduled = 1;
+       list_del_init(&rs->rs_list);
+       ptlrpc_dispatch_difficult_reply(rs);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);
+
+void ptlrpc_commit_replies(struct obd_export *exp)
+{
+       struct ptlrpc_reply_state *rs, *nxt;
+       DECLARE_RS_BATCH(batch);
+       ENTRY;
+
+       rs_batch_init(&batch);
+       /* Find any replies that have been committed and hand them to
+        * their service to complete. */
+
+       /* CAVEAT EMPTOR: spinlock ordering!!! */
+       spin_lock(&exp->exp_uncommitted_replies_lock);
+       list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
+                                    rs_obd_list) {
+               LASSERT(rs->rs_difficult);
+               /* VBR: per-export last_committed */
+               LASSERT(rs->rs_export);
+               if (rs->rs_transno <= exp->exp_last_committed) {
+                       list_del_init(&rs->rs_obd_list);
+                       rs_batch_add(&batch, rs);
+               }
+       }
+       spin_unlock(&exp->exp_uncommitted_replies_lock);
+       rs_batch_fini(&batch);
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_commit_replies);
+
+static int
+ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_request_buffer_desc *rqbd;
+       int                               rc;
+       int                               posted = 0;
+
+       for (;;) {
+               spin_lock(&svcpt->scp_lock);
+
+               if (list_empty(&svcpt->scp_rqbd_idle)) {
+                       spin_unlock(&svcpt->scp_lock);
+                       return posted;
+               }
+
+               rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+                                     struct ptlrpc_request_buffer_desc,
+                                     rqbd_list);
+               list_del(&rqbd->rqbd_list);
+
+               /* assume we will post successfully */
+               svcpt->scp_nrqbds_posted++;
+               list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);
+
+               spin_unlock(&svcpt->scp_lock);
+
+               rc = ptlrpc_register_rqbd(rqbd);
+               if (rc != 0)
+                       break;
+
+               posted = 1;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       svcpt->scp_nrqbds_posted--;
+       list_del(&rqbd->rqbd_list);
+       list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
+
+       /* Don't complain if no request buffers are posted right now; LNET
+        * won't drop requests because we set the portal lazy! */
+
+       spin_unlock(&svcpt->scp_lock);
+
+       return -1;
+}
+
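+/* Timer callback for adaptive-timeout handling: it only flags the
+ * partition and wakes a service thread; the actual early-reply work is
+ * done from the service thread (see ptlrpc_at_check_timed() elsewhere in
+ * this file). */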
+static void ptlrpc_at_timer(unsigned long castmeharder)
+{
+       struct ptlrpc_service_part *svcpt;
+
+       svcpt = (struct ptlrpc_service_part *)castmeharder;
+
+       svcpt->scp_at_check = 1;
+       svcpt->scp_at_checktime = cfs_time_current();
+       wake_up(&svcpt->scp_waitq);
+}
+
+static void
+ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
+                            struct ptlrpc_service_conf *conf)
+{
+       struct ptlrpc_service_thr_conf  *tc = &conf->psc_thr;
+       unsigned                        init;
+       unsigned                        total;
+       unsigned                        nthrs;
+       int                             weight;
+
+       /*
+        * Common code for estimating & validating threads number.
+        * A CPT-affinity service can have a per-CPT thread pool instead
+        * of a global thread pool, which means users might not always
+        * get the number of threads they request via conf::tc_nthrs_user
+        * even if they set it. This is because the thread count must be
+        * validated for each CPT to guarantee that every pool has enough
+        * threads to keep the service healthy.
+        */
+       init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
+       init = max_t(int, init, tc->tc_nthrs_init);
+
+       /* NB: please see comments in lustre_lnet.h for definition
+        * details of these members */
+       LASSERT(tc->tc_nthrs_max != 0);
+
+       if (tc->tc_nthrs_user != 0) {
+               /* In case there is a reason to test a service with many
+                * threads, we give a less strict check here, it can
+                * be up to 8 * nthrs_max */
+               total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
+               nthrs = total / svc->srv_ncpts;
+               init  = max(init, nthrs);
+               goto out;
+       }
+
+       total = tc->tc_nthrs_max;
+       if (tc->tc_nthrs_base == 0) {
+               /* don't care about base threads number per partition,
+                * this is most for non-affinity service */
+               nthrs = total / svc->srv_ncpts;
+               goto out;
+       }
+
+       nthrs = tc->tc_nthrs_base;
+       if (svc->srv_ncpts == 1) {
+               int     i;
+
+               /* NB: Increase the base number if it's a single partition
+                * and the total number of cores/HTs is >= 4.
+                * The result will always be < 2 * nthrs_base. */
+               weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
+               for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
+                           (tc->tc_nthrs_base >> i) != 0; i++)
+                       nthrs += tc->tc_nthrs_base >> i;
+       }
+
+       if (tc->tc_thr_factor != 0) {
+               int       factor = tc->tc_thr_factor;
+               const int fade = 4;
+
+               /*
+                * The user wants to add threads for each CPU core/HT;
+                * the factor is most likely larger than one thread per
+                * core because service threads are expected to block on
+                * locks or wait for I/O.
+                */
+               /*
+                * Amdahl's law says that adding processors doesn't give
+                * a linear increase in parallelism, so it makes no sense
+                * to have too many threads no matter how many cores/HTs
+                * there are.
+                */
+               if (cfs_cpu_ht_nsiblings(0) > 1) { /* weight is # of HTs */
+                       /* depress thread factor for hyper-thread */
+                       factor = factor - (factor >> 1) + (factor >> 3);
+               }
+
+               weight = cfs_cpt_weight(svc->srv_cptable, 0);
+               LASSERT(weight > 0);
+
+               for (; factor > 0 && weight > 0; factor--, weight -= fade)
+                       nthrs += min(weight, fade) * factor;
+       }
+
+       if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+               nthrs = max(tc->tc_nthrs_base,
+                           tc->tc_nthrs_max / svc->srv_ncpts);
+       }
+ out:
+       nthrs = max(nthrs, tc->tc_nthrs_init);
+       svc->srv_nthrs_cpt_limit = nthrs;
+       svc->srv_nthrs_cpt_init = init;
+
+       if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
+               CDEBUG(D_OTHER, "%s: This service may have more threads (%d) "
+                      "than the given soft limit (%d)\n",
+                      svc->srv_name, nthrs * svc->srv_ncpts,
+                      tc->tc_nthrs_max);
+       }
+}
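+
+/*
+ * Worked example of the estimate above (illustrative numbers only):
+ * with a single CPT, tc_nthrs_base = 64 and 8 cores (weight 8, no HT),
+ * the base loop adds 64>>1 + 64>>2 = 48, giving nthrs = 112; with
+ * tc_thr_factor = 4 the factor loop adds min(8,4)*4 + min(4,4)*3 = 28
+ * more, and the result is finally clamped against tc_nthrs_max and
+ * floored by tc_nthrs_init.
+ */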
+
+/**
+ * Initialize percpt data for a service
+ */
+static int
+ptlrpc_service_part_init(struct ptlrpc_service *svc,
+                        struct ptlrpc_service_part *svcpt, int cpt)
+{
+       struct ptlrpc_at_array  *array;
+       int                     size;
+       int                     index;
+       int                     rc;
+
+       svcpt->scp_cpt = cpt;
+       INIT_LIST_HEAD(&svcpt->scp_threads);
+
+       /* rqbd and incoming request queue */
+       spin_lock_init(&svcpt->scp_lock);
+       INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
+       INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
+       INIT_LIST_HEAD(&svcpt->scp_req_incoming);
+       init_waitqueue_head(&svcpt->scp_waitq);
+       /* history request & rqbd list */
+       INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
+       INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);
+
+       /* active requests and hp requests */
+       spin_lock_init(&svcpt->scp_req_lock);
+
+       /* reply states */
+       spin_lock_init(&svcpt->scp_rep_lock);
+       INIT_LIST_HEAD(&svcpt->scp_rep_active);
+       INIT_LIST_HEAD(&svcpt->scp_rep_idle);
+       init_waitqueue_head(&svcpt->scp_rep_waitq);
+       atomic_set(&svcpt->scp_nreps_difficult, 0);
+
+       /* adaptive timeout */
+       spin_lock_init(&svcpt->scp_at_lock);
+       array = &svcpt->scp_at_array;
+
+       size = at_est2timeout(at_max);
+       array->paa_size     = size;
+       array->paa_count    = 0;
+       array->paa_deadline = -1;
+
+       /* allocate memory for scp_at_array (ptlrpc_at_array) */
+       OBD_CPT_ALLOC(array->paa_reqs_array,
+                     svc->srv_cptable, cpt, sizeof(struct list_head) * size);
+       if (array->paa_reqs_array == NULL)
+               return -ENOMEM;
+
+       for (index = 0; index < size; index++)
+               INIT_LIST_HEAD(&array->paa_reqs_array[index]);
+
+       OBD_CPT_ALLOC(array->paa_reqs_count,
+                     svc->srv_cptable, cpt, sizeof(__u32) * size);
+       if (array->paa_reqs_count == NULL)
+               goto failed;
+
+       cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
+       /* At SOW, service time should be quick; 10s seems generous. If client
+        * timeout is less than this, we'll be sending an early reply. */
+       at_init(&svcpt->scp_at_estimate, 10, 0);
+
+       /* assign this before calling ptlrpc_grow_req_bufs */
+       svcpt->scp_service = svc;
+       /* Now allocate the request buffers, but don't post them now */
+       rc = ptlrpc_grow_req_bufs(svcpt, 0);
+       /* We shouldn't be under memory pressure at startup, so
+        * fail if we can't allocate all our buffers at this time. */
+       if (rc != 0)
+               goto failed;
+
+       return 0;
+
+ failed:
+       if (array->paa_reqs_count != NULL) {
+               OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
+               array->paa_reqs_count = NULL;
+       }
+
+       if (array->paa_reqs_array != NULL) {
+               OBD_FREE(array->paa_reqs_array,
+                        sizeof(struct list_head) * array->paa_size);
+               array->paa_reqs_array = NULL;
+       }
+
+       return -ENOMEM;
+}
+
+/**
+ * Initialize service on a given portal.
+ * This includes starting service threads, allocating and posting rqbds,
+ * and so on.
+ */
+struct ptlrpc_service *
+ptlrpc_register_service(struct ptlrpc_service_conf *conf,
+                       proc_dir_entry_t *proc_entry)
+{
+       struct ptlrpc_service_cpt_conf  *cconf = &conf->psc_cpt;
+       struct ptlrpc_service           *service;
+       struct ptlrpc_service_part      *svcpt;
+       struct cfs_cpt_table            *cptable;
+       __u32                           *cpts = NULL;
+       int                             ncpts;
+       int                             cpt;
+       int                             rc;
+       int                             i;
+       ENTRY;
+
+       LASSERT(conf->psc_buf.bc_nbufs > 0);
+       LASSERT(conf->psc_buf.bc_buf_size >=
+               conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
+       LASSERT(conf->psc_thr.tc_ctx_tags != 0);
+
+       cptable = cconf->cc_cptable;
+       if (cptable == NULL)
+               cptable = cfs_cpt_table;
+
+       if (!conf->psc_thr.tc_cpu_affinity) {
+               ncpts = 1;
+       } else {
+               ncpts = cfs_cpt_number(cptable);
+               if (cconf->cc_pattern != NULL) {
+                       struct cfs_expr_list    *el;
+
+                       rc = cfs_expr_list_parse(cconf->cc_pattern,
+                                                strlen(cconf->cc_pattern),
+                                                0, ncpts - 1, &el);
+                       if (rc != 0) {
+                               CERROR("%s: invalid CPT pattern string: %s",
+                                      conf->psc_name, cconf->cc_pattern);
+                               RETURN(ERR_PTR(-EINVAL));
+                       }
+
+                       rc = cfs_expr_list_values(el, ncpts, &cpts);
+                       cfs_expr_list_free(el);
+                       if (rc <= 0) {
+                               CERROR("%s: failed to parse CPT array %s: %d\n",
+                                      conf->psc_name, cconf->cc_pattern, rc);
+                               if (cpts != NULL)
+                                       OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+                               RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL));
+                       }
+                       ncpts = rc;
+               }
+       }
+
+       OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
+       if (service == NULL) {
+               if (cpts != NULL)
+                       OBD_FREE(cpts, sizeof(*cpts) * ncpts);
+               RETURN(ERR_PTR(-ENOMEM));
+       }
+
+       service->srv_cptable            = cptable;
+       service->srv_cpts               = cpts;
+       service->srv_ncpts              = ncpts;
+
+       service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
+       while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
+               service->srv_cpt_bits++;
+
+       /* public members */
+       spin_lock_init(&service->srv_lock);
+       service->srv_name               = conf->psc_name;
+       service->srv_watchdog_factor    = conf->psc_watchdog_factor;
+       INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */
+
+       /* buffer configuration */
+       service->srv_nbuf_per_group     = test_req_buffer_pressure ?
+                                         1 : conf->psc_buf.bc_nbufs;
+       service->srv_max_req_size       = conf->psc_buf.bc_req_max_size +
+                                         SPTLRPC_MAX_PAYLOAD;
+       service->srv_buf_size           = conf->psc_buf.bc_buf_size;
+       service->srv_rep_portal         = conf->psc_buf.bc_rep_portal;
+       service->srv_req_portal         = conf->psc_buf.bc_req_portal;
+
+       /* Increase max reply size to next power of two */
+       service->srv_max_reply_size = 1;
+       while (service->srv_max_reply_size <
+              conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
+               service->srv_max_reply_size <<= 1;
+
+       service->srv_thread_name        = conf->psc_thr.tc_thr_name;
+       service->srv_ctx_tags           = conf->psc_thr.tc_ctx_tags;
+       service->srv_hpreq_ratio        = PTLRPC_SVC_HP_RATIO;
+       service->srv_ops                = conf->psc_ops;
+
+       for (i = 0; i < ncpts; i++) {
+               if (!conf->psc_thr.tc_cpu_affinity)
+                       cpt = CFS_CPT_ANY;
+               else
+                       cpt = cpts != NULL ? cpts[i] : i;
+
+               OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
+               if (svcpt == NULL)
+                       GOTO(failed, rc = -ENOMEM);
+
+               service->srv_parts[i] = svcpt;
+               rc = ptlrpc_service_part_init(service, svcpt, cpt);
+               if (rc != 0)
+                       GOTO(failed, rc);
+       }
+
+       ptlrpc_server_nthreads_check(service, conf);
+
+       rc = LNetSetLazyPortal(service->srv_req_portal);
+       LASSERT(rc == 0);
+
+       mutex_lock(&ptlrpc_all_services_mutex);
+       list_add(&service->srv_list, &ptlrpc_all_services);
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+       if (proc_entry != NULL)
+               ptlrpc_lprocfs_register_service(proc_entry, service);
+
+       rc = ptlrpc_service_nrs_setup(service);
+       if (rc != 0)
+               GOTO(failed, rc);
+
+       CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
+              service->srv_name, service->srv_req_portal);
+
+       rc = ptlrpc_start_threads(service);
+       if (rc != 0) {
+               CERROR("Failed to start threads for service %s: %d\n",
+                      service->srv_name, rc);
+               GOTO(failed, rc);
+       }
+
+       RETURN(service);
+failed:
+       ptlrpc_unregister_service(service);
+       RETURN(ERR_PTR(rc));
+}
+EXPORT_SYMBOL(ptlrpc_register_service);
+
+/**
+ * To actually free the request; must be called without holding svc_lock.
+ * Note it is the caller's responsibility to unlink req->rq_list.
+ */
+static void ptlrpc_server_free_request(struct ptlrpc_request *req)
+{
+       LASSERT(atomic_read(&req->rq_refcount) == 0);
+       LASSERT(list_empty(&req->rq_timed_list));
+
+       /* DEBUG_REQ() assumes the reply state of a request with a valid
+        * ref will not be destroyed until that reference is dropped. */
+       ptlrpc_req_drop_rs(req);
+
+       sptlrpc_svc_ctx_decref(req);
+
+       if (req != &req->rq_rqbd->rqbd_req) {
+               /* NB request buffers use an embedded
+                * req if the incoming req unlinked the
+                * MD; this isn't one of them! */
+               OBD_FREE(req, sizeof(*req));
+       }
+}
+
+/**
+ * Drop a reference of the request. If it reaches 0, the request is
+ * either put into the history list or freed immediately.
+ */
+void ptlrpc_server_drop_request(struct ptlrpc_request *req)
+{
+       struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
+       struct ptlrpc_service_part        *svcpt = rqbd->rqbd_svcpt;
+       struct ptlrpc_service             *svc = svcpt->scp_service;
+       int                                refcount;
+       struct list_head                  *tmp;
+       struct list_head                  *nxt;
+
+       if (!atomic_dec_and_test(&req->rq_refcount))
+               return;
+
+       if (req->rq_at_linked) {
+               spin_lock(&svcpt->scp_at_lock);
+               /* recheck with lock, in case it's unlinked by
+                * ptlrpc_at_check_timed() */
+               if (likely(req->rq_at_linked))
+                       ptlrpc_at_remove_timed(req);
+               spin_unlock(&svcpt->scp_at_lock);
+       }
+
+       LASSERT(list_empty(&req->rq_timed_list));
+
+       /* finalize request */
+       if (req->rq_export) {
+               class_export_put(req->rq_export);
+               req->rq_export = NULL;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       list_add(&req->rq_list, &rqbd->rqbd_reqs);
+
+       refcount = --(rqbd->rqbd_refcount);
+       if (refcount == 0) {
+               /* request buffer is now idle: add to history */
+               list_del(&rqbd->rqbd_list);
+
+               list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
+               svcpt->scp_hist_nrqbds++;
+
+               /* cull some history?
+                * I expect only about 1 or 2 rqbds need to be recycled here */
+               while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
+                       rqbd = list_entry(svcpt->scp_hist_rqbds.next,
+                                             struct ptlrpc_request_buffer_desc,
+                                             rqbd_list);
+
+                       list_del(&rqbd->rqbd_list);
+                       svcpt->scp_hist_nrqbds--;
+
+                       /* remove rqbd's reqs from svc's req history while
+                        * I've got the service lock */
+                       list_for_each(tmp, &rqbd->rqbd_reqs) {
+                               req = list_entry(tmp, struct ptlrpc_request,
+                                                    rq_list);
+                               /* Track the highest culled req seq */
+                               if (req->rq_history_seq >
+                                   svcpt->scp_hist_seq_culled) {
+                                       svcpt->scp_hist_seq_culled =
+                                               req->rq_history_seq;
+                               }
+                               list_del(&req->rq_history_list);
+                       }
+
+                       spin_unlock(&svcpt->scp_lock);
+
+                       list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
+                               req = list_entry(rqbd->rqbd_reqs.next,
+                                                    struct ptlrpc_request,
+                                                    rq_list);
+                               list_del(&req->rq_list);
+                               ptlrpc_server_free_request(req);
+                       }
+
+                       spin_lock(&svcpt->scp_lock);
+                       /*
+                        * now all reqs including the embedded req have been
+                        * disposed of; schedule the request buffer for reuse.
+                        */
+                       LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
+                               0);
+                       list_add_tail(&rqbd->rqbd_list,
+                                         &svcpt->scp_rqbd_idle);
+               }
+
+               spin_unlock(&svcpt->scp_lock);
+       } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
+               /* If we are low on memory, we are not interested in history */
+               list_del(&req->rq_list);
+               list_del_init(&req->rq_history_list);
+
+               /* Track the highest culled req seq */
+               if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
+                       svcpt->scp_hist_seq_culled = req->rq_history_seq;
+
+               spin_unlock(&svcpt->scp_lock);
+
+               ptlrpc_server_free_request(req);
+       } else {
+               spin_unlock(&svcpt->scp_lock);
+       }
+}
+
+/** Change request export and move hp request from old export to new */
+void ptlrpc_request_change_export(struct ptlrpc_request *req,
+                                 struct obd_export *export)
+{
+       if (req->rq_export != NULL) {
+               if (!list_empty(&req->rq_exp_list)) {
+                       /* remove rq_exp_list from last export */
+                       spin_lock_bh(&req->rq_export->exp_rpc_lock);
+                       list_del_init(&req->rq_exp_list);
+                       spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+
+                       /* export has one reference already, so it's safe to
+                        * add req to export queue here and get another
+                        * reference for request later */
+                       spin_lock_bh(&export->exp_rpc_lock);
+                       list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
+                       spin_unlock_bh(&export->exp_rpc_lock);
+               }
+               class_export_rpc_dec(req->rq_export);
+               class_export_put(req->rq_export);
+       }
+
+       /* request takes one export refcount */
+       req->rq_export = class_export_get(export);
+       class_export_rpc_inc(export);
+
+       return;
+}
+
+/**
+ * To finish a request: stop sending more early replies, and release
+ * the request.
+ */
+static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
+                                        struct ptlrpc_request *req)
+{
+       ptlrpc_server_hpreq_fini(req);
+
+       ptlrpc_server_drop_request(req);
+}
+
+/**
+ * To finish an active request: stop sending more early replies, and release
+ * the request. Should be called after we have finished handling the request.
+ */
+static void ptlrpc_server_finish_active_request(
+                                       struct ptlrpc_service_part *svcpt,
+                                       struct ptlrpc_request *req)
+{
+       spin_lock(&svcpt->scp_req_lock);
+       ptlrpc_nrs_req_stop_nolock(req);
+       svcpt->scp_nreqs_active--;
+       if (req->rq_hp)
+               svcpt->scp_nhreqs_active--;
+       spin_unlock(&svcpt->scp_req_lock);
+
+       ptlrpc_nrs_req_finalize(req);
+
+       if (req->rq_export != NULL)
+               class_export_rpc_dec(req->rq_export);
+
+       ptlrpc_server_finish_request(svcpt, req);
+}
+
+/**
+ * This function makes sure dead exports are evicted in a timely manner.
+ * This function is only called when some export receives a message (i.e.,
+ * the network is up.)
+ */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+       struct obd_export *oldest_exp;
+       time_t oldest_time, new_time;
+
+       ENTRY;
+
+       LASSERT(exp);
+
+       /* Compensate for slow machines, etc, by faking our request time
+          into the future.  Although this can break the strict time-ordering
+          of the list, we can be really lazy here - we don't have to evict
+          at the exact right moment.  Eventually, all silent exports
+          will make it to the top of the list. */
+
+       /* Do not pay attention to renewals of 1 sec or less. */
+       new_time = cfs_time_current_sec() + extra_delay;
+       if (exp->exp_last_request_time + 1 /* second */ >= new_time)
+               RETURN_EXIT;
+
+       exp->exp_last_request_time = new_time;
+       CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n",
+              exp->exp_client_uuid.uuid,
+              exp->exp_last_request_time, exp);
+
+       /* exports may get disconnected from the chain even though the
+          export has references, so we must keep the spin lock while
+          manipulating the lists */
+       spin_lock(&exp->exp_obd->obd_dev_lock);
+
+       if (list_empty(&exp->exp_obd_chain_timed)) {
+               /* this one is not timed */
+               spin_unlock(&exp->exp_obd->obd_dev_lock);
+               RETURN_EXIT;
+       }
+
+       list_move_tail(&exp->exp_obd_chain_timed,
+                          &exp->exp_obd->obd_exports_timed);
+
+       oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+                                   struct obd_export, exp_obd_chain_timed);
+       oldest_time = oldest_exp->exp_last_request_time;
+       spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+       if (exp->exp_obd->obd_recovering) {
+               /* be nice to everyone during recovery */
+               EXIT;
+               return;
+       }
+
+       /* Note - racing to start/reset the obd_eviction timer is safe */
+       if (exp->exp_obd->obd_eviction_timer == 0) {
+               /* Check if the oldest entry is expired. */
+               if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
+                                             extra_delay)) {
+                       /* We need a second timer, in case the net was down and
+                        * it just came back. Since the pinger may skip every
+                        * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+                        * we better wait for 3. */
+                       exp->exp_obd->obd_eviction_timer =
+                               cfs_time_current_sec() + 3 * PING_INTERVAL;
+                       CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
+                              exp->exp_obd->obd_name,
+                              obd_export_nid2str(oldest_exp), oldest_time);
+               }
+       } else {
+               if (cfs_time_current_sec() >
+                   (exp->exp_obd->obd_eviction_timer + extra_delay)) {
+                       /* The evictor won't evict anyone who we've heard from
+                        * recently, so we don't have to check before we start
+                        * it. */
+                       if (!ping_evictor_wake(exp))
+                               exp->exp_obd->obd_eviction_timer = 0;
+               }
+       }
+
+       EXIT;
+}
+
+/**
+ * Sanity check request \a req.
+ * Return 0 if all is ok, error code otherwise.
+ */
+static int ptlrpc_check_req(struct ptlrpc_request *req)
+{
+       int rc = 0;
+
+       if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
+                    req->rq_export->exp_conn_cnt)) {
+               DEBUG_REQ(D_RPCTRACE, req,
+                         "DROPPING req from old connection %d < %d",
+                         lustre_msg_get_conn_cnt(req->rq_reqmsg),
+                         req->rq_export->exp_conn_cnt);
+               return -EEXIST;
+       }
+       if (unlikely(req->rq_export->exp_obd &&
+                    req->rq_export->exp_obd->obd_fail)) {
+               /* Failing over, don't handle any more reqs, send
+                * error response instead. */
+               CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
+                      req, req->rq_export->exp_obd->obd_name);
+               rc = -ENODEV;
+       } else if (lustre_msg_get_flags(req->rq_reqmsg) &
+                  (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
+                  !(req->rq_export->exp_obd->obd_recovering)) {
+                       DEBUG_REQ(D_ERROR, req,
+                                 "Invalid replay without recovery");
+                       class_fail_export(req->rq_export);
+                       rc = -ENODEV;
+       } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
+                  !(req->rq_export->exp_obd->obd_recovering)) {
+                       DEBUG_REQ(D_ERROR, req, "Invalid req with transno "
+                                 LPU64" without recovery",
+                                 lustre_msg_get_transno(req->rq_reqmsg));
+                       class_fail_export(req->rq_export);
+                       rc = -ENODEV;
+       }
+
+       if (unlikely(rc < 0)) {
+               req->rq_status = rc;
+               ptlrpc_error(req);
+       }
+       return rc;
+}
+
+static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       __s32 next;
+
+       if (array->paa_count == 0) {
+               cfs_timer_disarm(&svcpt->scp_at_timer);
+               return;
+       }
+
+       /* Set timer for closest deadline */
+       next = (__s32)(array->paa_deadline - cfs_time_current_sec() -
+                      at_early_margin);
+       if (next <= 0) {
+               ptlrpc_at_timer((unsigned long)svcpt);
+       } else {
+               cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next));
+               CDEBUG(D_INFO, "armed %s at %+ds\n",
+                      svcpt->scp_service->srv_name, next);
+       }
+}
+
+/* Add rpc to early reply check list */
+static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       struct ptlrpc_request *rq = NULL;
+       __u32 index;
+
+       if (AT_OFF)
+               return 0;
+
+       if (req->rq_no_reply)
+               return 0;
+
+       if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
+               return -ENOSYS;
+
+       spin_lock(&svcpt->scp_at_lock);
+       LASSERT(list_empty(&req->rq_timed_list));
+
+       index = (unsigned long)req->rq_deadline % array->paa_size;
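+       /* paa_reqs_array is a time-indexed bucket array: deadlines that
+        * are congruent modulo paa_size share a bucket, and each bucket is
+        * kept sorted by deadline (hence the backward search below). */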
+       if (array->paa_reqs_count[index] > 0) {
+               /* latest rpcs will have the latest deadlines in the list,
+                * so search backward. */
+               list_for_each_entry_reverse(rq,
+                                           &array->paa_reqs_array[index],
+                                           rq_timed_list) {
+                       if (req->rq_deadline >= rq->rq_deadline) {
+                               list_add(&req->rq_timed_list,
+                                        &rq->rq_timed_list);
+                               break;
+                       }
+               }
+       }
+
+       /* Add the request at the head of the list */
+       if (list_empty(&req->rq_timed_list))
+               list_add(&req->rq_timed_list,
+                        &array->paa_reqs_array[index]);
+
+       spin_lock(&req->rq_lock);
+       req->rq_at_linked = 1;
+       spin_unlock(&req->rq_lock);
+       req->rq_at_index = index;
+       array->paa_reqs_count[index]++;
+       array->paa_count++;
+       if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
+               array->paa_deadline = req->rq_deadline;
+               ptlrpc_at_set_timer(svcpt);
+       }
+       spin_unlock(&svcpt->scp_at_lock);
+
+       return 0;
+}
+
+static void
+ptlrpc_at_remove_timed(struct ptlrpc_request *req)
+{
+       struct ptlrpc_at_array *array;
+
+       array = &req->rq_rqbd->rqbd_svcpt->scp_at_array;
+
+       /* NB: must be called while holding svcpt::scp_at_lock */
+       LASSERT(!list_empty(&req->rq_timed_list));
+       list_del_init(&req->rq_timed_list);
+
+       spin_lock(&req->rq_lock);
+       req->rq_at_linked = 0;
+       spin_unlock(&req->rq_lock);
+
+       array->paa_reqs_count[req->rq_at_index]--;
+       array->paa_count--;
+}
+
+static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
+{
+       struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+       struct ptlrpc_request *reqcopy;
+       struct lustre_msg *reqmsg;
+       cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
+       time_t newdl;
+       int rc;
+       ENTRY;
+
+       /* deadline is when the client expects us to reply, margin is the
+        * difference between clients' and servers' expectations */
+       DEBUG_REQ(D_ADAPTTO, req,
+                 "%ssending early reply (deadline %+lds, margin %+lds) for "
+                 "%d+%d", AT_OFF ? "AT off - not " : "",
+                 olddl, olddl - at_get(&svcpt->scp_at_estimate),
+                 at_get(&svcpt->scp_at_estimate), at_extra);
+
+       if (AT_OFF)
+               RETURN(0);
+
+       if (olddl < 0) {
+               DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
+                         "not sending early reply. Consider increasing "
+                         "at_early_margin (%d)?", olddl, at_early_margin);
+
+               /* Return an error so we're not re-added to the timed list. */
+               RETURN(-ETIMEDOUT);
+       }
+
+       if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) {
+               DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
+                         "but no AT support");
+               RETURN(-ENOSYS);
+       }
+
+       if (req->rq_export &&
+           lustre_msg_get_flags(req->rq_reqmsg) &
+           (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
+               /* During recovery, we don't want to send too many early
+                * replies, but on the other hand we want to make sure the
+                * client has enough time to resend if the rpc is lost. So
+                * during the recovery period send at least 4 early replies,
+                * spacing them every at_extra if we can. at_estimate should
+                * always equal this fixed value during recovery. */
+               at_measured(&svcpt->scp_at_estimate, min(at_extra,
+                           req->rq_export->exp_obd->obd_recovery_timeout / 4));
+       } else {
+               /* Fake our processing time into the future to ask the clients
+                * for some extra amount of time */
+               at_measured(&svcpt->scp_at_estimate, at_extra +
+                           cfs_time_current_sec() -
+                           req->rq_arrival_time.tv_sec);
+
+               /* Check to see if we've actually increased the deadline -
+                * we may be past adaptive_max */
+               if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
+                   at_get(&svcpt->scp_at_estimate)) {
+                       DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
+                                 "(%ld/%ld), not sending early reply\n",
+                                 olddl, req->rq_arrival_time.tv_sec +
+                                 at_get(&svcpt->scp_at_estimate) -
+                                 cfs_time_current_sec());
+                       RETURN(-ETIMEDOUT);
+               }
+       }
+       newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate);
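+       /* newdl is the deadline we are about to promise to the client;
+        * rq_deadline is only updated below once the early reply has
+        * actually been sent out. */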
+
+       OBD_ALLOC(reqcopy, sizeof(*reqcopy));
+       if (reqcopy == NULL)
+               RETURN(-ENOMEM);
+       OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
+       if (!reqmsg) {
+               OBD_FREE(reqcopy, sizeof(*reqcopy));
+               RETURN(-ENOMEM);
+       }
+
+       *reqcopy = *req;
+       reqcopy->rq_reply_state = NULL;
+       reqcopy->rq_rep_swab_mask = 0;
+       reqcopy->rq_pack_bulk = 0;
+       reqcopy->rq_pack_udesc = 0;
+       reqcopy->rq_packed_final = 0;
+       sptlrpc_svc_ctx_addref(reqcopy);
+       /* We only need the reqmsg for the magic */
+       reqcopy->rq_reqmsg = reqmsg;
+       memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+
+       LASSERT(atomic_read(&req->rq_refcount));
+       /* If this is the last refcount, the normal reply has already been
+        * sent out; an early reply isn't needed. */
+       if (atomic_read(&req->rq_refcount) == 1) {
+               DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
+                         "abort sending early reply\n");
+               GOTO(out, rc = -EINVAL);
+       }
+
+       /* Connection ref */
+       reqcopy->rq_export = class_conn2export(
+                                    lustre_msg_get_handle(reqcopy->rq_reqmsg));
+       if (reqcopy->rq_export == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       /* RPC ref */
+       class_export_rpc_inc(reqcopy->rq_export);
+       if (reqcopy->rq_export->exp_obd &&
+           reqcopy->rq_export->exp_obd->obd_fail)
+               GOTO(out_put, rc = -ENODEV);
+
+       rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
+       if (rc)
+               GOTO(out_put, rc);
+
+       rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);
+
+       if (!rc) {
+               /* Adjust our own deadline to what we told the client */
+               req->rq_deadline = newdl;
+               req->rq_early_count++; /* number sent, server side */
+       } else {
+               DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
+       }
+
+       /* Free the (early) reply state from lustre_pack_reply.
+        * (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
+       ptlrpc_req_drop_rs(reqcopy);
+
+out_put:
+       class_export_rpc_dec(reqcopy->rq_export);
+       class_export_put(reqcopy->rq_export);
+out:
+       sptlrpc_svc_ctx_decref(reqcopy);
+       OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
+       OBD_FREE(reqcopy, sizeof(*reqcopy));
+       RETURN(rc);
+}
+
+/* Send early replies to everybody expiring within at_early_margin,
+ * asking for at_extra time */
+static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_at_array *array = &svcpt->scp_at_array;
+       struct ptlrpc_request *rq, *n;
+       struct list_head work_list;
+       __u32  index, count;
+       time_t deadline;
+       time_t now = cfs_time_current_sec();
+       cfs_duration_t delay;
+       int first, counter = 0;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_at_lock);
+       if (svcpt->scp_at_check == 0) {
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+       delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
+       svcpt->scp_at_check = 0;
+
+       if (array->paa_count == 0) {
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+
+       /* The timer went off, but maybe the nearest rpc already completed. */
+       first = array->paa_deadline - now;
+       if (first > at_early_margin) {
+               /* We've still got plenty of time.  Reset the timer. */
+               ptlrpc_at_set_timer(svcpt);
+               spin_unlock(&svcpt->scp_at_lock);
+               RETURN(0);
+       }
+
+       /* We're close to a timeout, and we don't know how much longer the
+        * server will take. Send early replies to everyone expiring soon. */
+       INIT_LIST_HEAD(&work_list);
+       deadline = -1;
+       index = (unsigned long)array->paa_deadline % array->paa_size;
+       count = array->paa_count;
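+       /* Walk the buckets starting at the earliest deadline; each bucket
+        * is sorted, so we can stop scanning a bucket at the first entry
+        * past now + at_early_margin. */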
+       while (count > 0) {
+               count -= array->paa_reqs_count[index];
+               list_for_each_entry_safe(rq, n,
+                                        &array->paa_reqs_array[index],
+                                        rq_timed_list) {
+                       if (rq->rq_deadline > now + at_early_margin) {
+                               /* update the earliest deadline */
+                               if (deadline == -1 ||
+                                   rq->rq_deadline < deadline)
+                                       deadline = rq->rq_deadline;
+                               break;
+                       }
+
+                       ptlrpc_at_remove_timed(rq);
+                       /**
+                        * ptlrpc_server_drop_request() may drop
+                        * refcount to 0 already. Let's check this and
+                        * don't add entry to work_list
+                        */
+                       if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
+                               list_add(&rq->rq_timed_list, &work_list);
+                       counter++;
+               }
+
+               if (++index >= array->paa_size)
+                       index = 0;
+       }
+       array->paa_deadline = deadline;
+       /* we have a new earliest deadline, restart the timer */
+       ptlrpc_at_set_timer(svcpt);
+
+       spin_unlock(&svcpt->scp_at_lock);
+
+       CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
+              "replies\n", first, at_extra, counter);
+       if (first < 0) {
+               /* We're already past request deadlines before we even get a
+                * chance to send early replies */
+               LCONSOLE_WARN("%s: This server is not able to keep up with "
+                             "request traffic (cpu-bound).\n",
+                             svcpt->scp_service->srv_name);
+               CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
+                     "delay="CFS_DURATION_T"(jiff)\n",
+                     counter, svcpt->scp_nreqs_incoming,
+                     svcpt->scp_nreqs_active,
+                     at_get(&svcpt->scp_at_estimate), delay);
+       }
+
+       /* we took an additional refcount so entries can't be deleted from
+        * the list; no locking is needed */
+       while (!list_empty(&work_list)) {
+               rq = list_entry(work_list.next, struct ptlrpc_request,
+                               rq_timed_list);
+               list_del_init(&rq->rq_timed_list);
+
+               if (ptlrpc_at_send_early_reply(rq) == 0)
+                       ptlrpc_at_add_timed(rq);
+
+               ptlrpc_server_drop_request(rq);
+       }
+
+       RETURN(1); /* return "did_something" for liblustre */
+}
+
+/**
+ * Put the request on the export list if the request may become
+ * a high priority one.
+ */
+static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt,
+                                   struct ptlrpc_request *req)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (svcpt->scp_service->srv_ops.so_hpreq_handler) {
+               rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req);
+               if (rc < 0)
+                       RETURN(rc);
+               LASSERT(rc == 0);
+       }
+       if (req->rq_export && req->rq_ops) {
+               /* Perform request specific check. We should do this check
+                * before the request is added to the exp_hp_rpcs list;
+                * otherwise it may hit the swab race described in LU-1044. */
+               if (req->rq_ops->hpreq_check) {
+                       rc = req->rq_ops->hpreq_check(req);
+                       /**
+                        * XXX: Out of all current
+                        * ptlrpc_hpreq_ops::hpreq_check(), only
+                        * ldlm_cancel_hpreq_check() can return an error code;
+                        * other functions assert in similar places, which seems
+                        * odd. What also does not seem right is that handlers
+                        * for those RPCs do not assert on the same checks, but
+                        * rather handle the error cases. e.g. see
+                        * ost_rw_hpreq_check(), and ost_brw_read(),
+                        * ost_brw_write().
+                        */
+                       if (rc < 0)
+                               RETURN(rc);
+                       LASSERT(rc == 0 || rc == 1);
+               }
+
+               spin_lock_bh(&req->rq_export->exp_rpc_lock);
+               list_add(&req->rq_exp_list,
+                        &req->rq_export->exp_hp_rpcs);
+               spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+       }
+
+       ptlrpc_nrs_req_initialize(svcpt, req, rc);
+
+       RETURN(rc);
+}
+
+/** Remove the request from the export list. */
+static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req)
+{
+       ENTRY;
+       if (req->rq_export && req->rq_ops) {
+               /* refresh lock timeout again so that the client has more
+                * room to send its lock cancel RPC. */
+               if (req->rq_ops->hpreq_fini)
+                       req->rq_ops->hpreq_fini(req);
+
+               spin_lock_bh(&req->rq_export->exp_rpc_lock);
+               list_del_init(&req->rq_exp_list);
+               spin_unlock_bh(&req->rq_export->exp_rpc_lock);
+       }
+       EXIT;
+}
+
+static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
+{
+       return 1;
+}
+
+static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = {
+       .hpreq_check       = ptlrpc_hpreq_check,
+};
+
+/* Hi-Priority RPC check by RPC operation code. */
+int ptlrpc_hpreq_handler(struct ptlrpc_request *req)
+{
+       int opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+       /* Check the export so that only reconnects for a not yet evicted
+        * export can become HP rpcs. */
+       if ((req->rq_export != NULL) &&
+           (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT))
+               req->rq_ops = &ptlrpc_hpreq_common;
+
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_hpreq_handler);
+
+static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
+                                    struct ptlrpc_request *req)
+{
+       int     rc;
+       ENTRY;
+
+       rc = ptlrpc_server_hpreq_init(svcpt, req);
+       if (rc < 0)
+               RETURN(rc);
+
+       ptlrpc_nrs_req_add(svcpt, req, !!rc);
+
+       RETURN(0);
+}
+
+/**
+ * Whether handling a high-priority request is allowed.
+ * May be called without any lock, but the result is only reliable
+ * while holding ptlrpc_service_part::scp_req_lock.
+ */
+static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
+                                    bool force)
+{
+       int running = svcpt->scp_nthrs_running;
+
+       if (!nrs_svcpt_has_hp(svcpt))
+               return false;
+
+       if (force)
+               return true;
+
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
+       if (svcpt->scp_nreqs_active >= running - 1)
+               return false;
+
+       if (svcpt->scp_nhreqs_active == 0)
+               return true;
+
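+       /* Otherwise keep taking HP requests as long as fewer than
+        * srv_hpreq_ratio of them have been handled back-to-back, or
+        * whenever no normal requests are pending at all. */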
+       return !ptlrpc_nrs_req_pending_nolock(svcpt, false) ||
+              svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio;
+}
+
+static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
+                                      bool force)
+{
+       return ptlrpc_server_allow_high(svcpt, force) &&
+              ptlrpc_nrs_req_pending_nolock(svcpt, true);
+}
+
+/**
+ * Only allow normal priority requests on a service that has a high-priority
+ * queue if forced (i.e. cleanup), if there are other high priority requests
+ * already being processed (i.e. those threads can service more high-priority
+ * requests), or if there are enough idle threads that a later thread can do
+ * a high priority request.
+ * May be called without any lock, but the result is only reliable
+ * while holding ptlrpc_service_part::scp_req_lock.
+ */
+static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
+                                      bool force)
+{
+       int running = svcpt->scp_nthrs_running;
+
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
+       if (force || svcpt->scp_nreqs_active < running - 2)
+               return true;
+
+       if (svcpt->scp_nreqs_active >= running - 1)
+               return false;
+
+       return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt);
+}
+
+static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt,
+                                        bool force)
+{
+       return ptlrpc_server_allow_normal(svcpt, force) &&
+              ptlrpc_nrs_req_pending_nolock(svcpt, false);
+}
+
+/**
+ * Returns true if there are requests available in the incoming
+ * request queue for processing and it is allowed to fetch them.
+ * May be called without any lock, but the result is only reliable
+ * while holding ptlrpc_service_part::scp_req_lock.
+ * \see ptlrpc_server_allow_normal
+ * \see ptlrpc_server_allow_high
+ */
+static inline bool
+ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force)
+{
+       return ptlrpc_server_high_pending(svcpt, force) ||
+              ptlrpc_server_normal_pending(svcpt, force);
+}
+
+/**
+ * Fetch a request for processing from the queue of unprocessed requests.
+ * Favors high-priority requests.
+ * Returns a pointer to fetched request.
+ */
+static struct ptlrpc_request *
+ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force)
+{
+       struct ptlrpc_request *req = NULL;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_req_lock);
+
+       if (ptlrpc_server_high_pending(svcpt, force)) {
+               req = ptlrpc_nrs_req_get_nolock(svcpt, true, force);
+               if (req != NULL) {
+                       svcpt->scp_hreq_count++;
+                       goto got_request;
+               }
+       }
+
+       if (ptlrpc_server_normal_pending(svcpt, force)) {
+               req = ptlrpc_nrs_req_get_nolock(svcpt, false, force);
+               if (req != NULL) {
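+                       /* taking a normal request resets the consecutive-HP
+                        * counter used by ptlrpc_server_allow_high() */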
+                       svcpt->scp_hreq_count = 0;
+                       goto got_request;
+               }
+       }
+
+       spin_unlock(&svcpt->scp_req_lock);
+       RETURN(NULL);
+
+got_request:
+       svcpt->scp_nreqs_active++;
+       if (req->rq_hp)
+               svcpt->scp_nhreqs_active++;
+
+       spin_unlock(&svcpt->scp_req_lock);
+
+       if (likely(req->rq_export))
+               class_export_rpc_inc(req->rq_export);
+
+       RETURN(req);
+}
+
+/**
+ * Handle freshly incoming reqs, add to timed early reply list,
+ * pass on to regular request queue.
+ * All incoming requests pass through here before getting into
+ * ptlrpc_server_handle_request() later on.
+ */
+static int
+ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt,
+                           struct ptlrpc_thread *thread)
+{
+       struct ptlrpc_service   *svc = svcpt->scp_service;
+       struct ptlrpc_request   *req;
+       __u32                   deadline;
+       int                     rc;
+       ENTRY;
+
+       spin_lock(&svcpt->scp_lock);
+       if (list_empty(&svcpt->scp_req_incoming)) {
+               spin_unlock(&svcpt->scp_lock);
+               RETURN(0);
+       }
+
+       req = list_entry(svcpt->scp_req_incoming.next,
+                        struct ptlrpc_request, rq_list);
+       list_del_init(&req->rq_list);
+       svcpt->scp_nreqs_incoming--;
+       /* Consider this still a "queued" request as far as stats are
+        * concerned */
+       spin_unlock(&svcpt->scp_lock);
+
+       /* go through security check/transform */
+       rc = sptlrpc_svc_unwrap_request(req);
+       switch (rc) {
+       case SECSVC_OK:
+               break;
+       case SECSVC_COMPLETE:
+               target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+               goto err_req;
+       case SECSVC_DROP:
+               goto err_req;
+       default:
+               LBUG();
+       }
+
+       /*
+        * for a null-flavored rpc, the msg has already been unpacked by
+        * sptlrpc, although redoing it wouldn't be harmful.
+        */
+       if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
+               rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen);
+               if (rc != 0) {
+                       CERROR("error unpacking request: ptl %d from %s "
+                              "x"LPU64"\n", svc->srv_req_portal,
+                              libcfs_id2str(req->rq_peer), req->rq_xid);
+                       goto err_req;
+               }
+       }
+
+       rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
+       if (rc) {
+               CERROR ("error unpacking ptlrpc body: ptl %d from %s x"
+                       LPU64"\n", svc->srv_req_portal,
+                       libcfs_id2str(req->rq_peer), req->rq_xid);
+               goto err_req;
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) &&
+           lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) {
+               CERROR("drop incoming rpc opc %u, x"LPU64"\n",
+                      cfs_fail_val, req->rq_xid);
+               goto err_req;
+       }
+
+       rc = -EINVAL;
+       if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) {
+               CERROR("wrong packet type received (type=%u) from %s\n",
+                      lustre_msg_get_type(req->rq_reqmsg),
+                      libcfs_id2str(req->rq_peer));
+               goto err_req;
+       }
+
+       switch (lustre_msg_get_opc(req->rq_reqmsg)) {
+       case MDS_WRITEPAGE:
+       case OST_WRITE:
+               req->rq_bulk_write = 1;
+               break;
+       case MDS_READPAGE:
+       case OST_READ:
+       case MGS_CONFIG_READ:
+               req->rq_bulk_read = 1;
+               break;
+       }
+
+       CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid);
+
+       req->rq_export = class_conn2export(
+               lustre_msg_get_handle(req->rq_reqmsg));
+       if (req->rq_export) {
+               rc = ptlrpc_check_req(req);
+               if (rc == 0) {
+                       rc = sptlrpc_target_export_check(req->rq_export, req);
+                       if (rc)
+                               DEBUG_REQ(D_ERROR, req, "DROPPING req with "
+                                         "illegal security flavor,");
+               }
+
+               if (rc)
+                       goto err_req;
+               ptlrpc_update_export_timer(req->rq_export, 0);
+       }
+
+       /* req_in handling should/must be fast */
+       if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5)
+               DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s",
+                         cfs_time_sub(cfs_time_current_sec(),
+                                      req->rq_arrival_time.tv_sec));
+
+       /* Set rpc server deadline and add it to the timed list */
+       deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) &
+                   MSGHDR_AT_SUPPORT) ?
+                  /* The max time the client expects us to take */
+                  lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout;
+       req->rq_deadline = req->rq_arrival_time.tv_sec + deadline;
+       if (unlikely(deadline == 0)) {
+               DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout");
+               goto err_req;
+       }
+
+       req->rq_svc_thread = thread;
+
+       ptlrpc_at_add_timed(req);
+
+       /* Move it over to the request processing queue */
+       rc = ptlrpc_server_request_add(svcpt, req);
+       if (rc)
+               GOTO(err_req, rc);
+
+       wake_up(&svcpt->scp_waitq);
+       RETURN(1);
+
+err_req:
+       ptlrpc_server_finish_request(svcpt, req);
+
+       RETURN(1);
+}
+
+/**
+ * Main incoming request handling logic.
+ * Calls handler function from service to do actual processing.
+ */
+static int
+ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
+                            struct ptlrpc_thread *thread)
+{
+       struct ptlrpc_service *svc = svcpt->scp_service;
+       struct ptlrpc_request *request;
+       struct timeval   work_start;
+       struct timeval   work_end;
+       long               timediff;
+       int                 rc;
+       int                 fail_opc = 0;
+       ENTRY;
+
+       request = ptlrpc_server_request_get(svcpt, false);
+       if (request == NULL)
+               RETURN(0);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
+               fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
+       else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
+               fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;
+
+       if (unlikely(fail_opc)) {
+               if (request->rq_export && request->rq_ops)
+                       OBD_FAIL_TIMEOUT(fail_opc, 4);
+       }
+
+       ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
+               libcfs_debug_dumplog();
+
+       do_gettimeofday(&work_start);
+       timediff = cfs_timeval_sub(&work_start,
+                                  &request->rq_arrival_time, NULL);
+       if (likely(svc->srv_stats != NULL)) {
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
+                                   timediff);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
+                                   svcpt->scp_nreqs_incoming);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
+                                   svcpt->scp_nreqs_active);
+               lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
+                                   at_get(&svcpt->scp_at_estimate));
+       }
+
+       rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF);
+       if (rc) {
+               CERROR("Failure to initialize session: %d\n", rc);
+               goto out_req;
+       }
+       request->rq_session.lc_thread = thread;
+       request->rq_session.lc_cookie = 0x5;
+       lu_context_enter(&request->rq_session);
+
+       CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid);
+
+       request->rq_svc_thread = thread;
+       if (thread)
+               request->rq_svc_thread->t_env->le_ses = &request->rq_session;
+
+       if (likely(request->rq_export)) {
+               if (unlikely(ptlrpc_check_req(request)))
+                       goto put_conn;
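+               /* timediff is in microseconds; shifting by 19 divides by
+                * 2^19 (~0.52s), so the queue wait is handed to the
+                * eviction logic as extra_delay in roughly half-second
+                * units. */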
+               ptlrpc_update_export_timer(request->rq_export, timediff >> 19);
+       }
+
+       /* Discard requests queued for longer than the deadline.
+          The deadline is increased if we send an early reply. */
+       if (cfs_time_current_sec() > request->rq_deadline) {
+               DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+                         ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n",
+                         libcfs_id2str(request->rq_peer),
+                         cfs_time_sub(request->rq_deadline,
+                         request->rq_arrival_time.tv_sec),
+                         cfs_time_sub(cfs_time_current_sec(),
+                         request->rq_deadline));
+               goto put_conn;
+       }
+
+       CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(),
+              (request->rq_export ?
+               (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+              (request->rq_export ?
+               atomic_read(&request->rq_export->exp_refcount) : -99),
+              lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
+              libcfs_id2str(request->rq_peer),
+              lustre_msg_get_opc(request->rq_reqmsg));
+
+       if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+               CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
+
+       rc = svc->srv_ops.so_req_handler(request);
+
+       ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);
+
+put_conn:
+       lu_context_exit(&request->rq_session);
+       lu_context_fini(&request->rq_session);
+
+       if (unlikely(cfs_time_current_sec() > request->rq_deadline)) {
+               DEBUG_REQ(D_WARNING, request, "Request took longer "
+                         "than estimated ("CFS_DURATION_T":"CFS_DURATION_T
+                         "s); client may timeout.",
+                         cfs_time_sub(request->rq_deadline,
+                                      request->rq_arrival_time.tv_sec),
+                         cfs_time_sub(cfs_time_current_sec(),
+                                      request->rq_deadline));
+       }
+
+       do_gettimeofday(&work_end);
+       timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
+       CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc "
+              "%s:%s+%d:%d:x"LPU64":%s:%d Request procesed in "
+              "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+               current_comm(),
+               (request->rq_export ?
+                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+               (request->rq_export ?
+                atomic_read(&request->rq_export->exp_refcount) : -99),
+               lustre_msg_get_status(request->rq_reqmsg),
+               request->rq_xid,
+               libcfs_id2str(request->rq_peer),
+               lustre_msg_get_opc(request->rq_reqmsg),
+               timediff,
+               cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL),
+               (request->rq_repmsg ?
+                lustre_msg_get_transno(request->rq_repmsg) :
+                request->rq_transno),
+               request->rq_status,
+               (request->rq_repmsg ?
+                lustre_msg_get_status(request->rq_repmsg) : -999));
+       if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
+               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
+               int opc = opcode_offset(op);
+
+               if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
+                       LASSERT(opc < LUSTRE_MAX_OPCODES);
+                       lprocfs_counter_add(svc->srv_stats,
+                                           opc + EXTRA_MAX_OPCODES,
+                                           timediff);
+               }
+       }
+       if (unlikely(request->rq_early_count)) {
+               DEBUG_REQ(D_ADAPTTO, request,
+                         "sent %d early replies before finishing in "
+                         CFS_DURATION_T"s",
+                         request->rq_early_count,
+                         cfs_time_sub(work_end.tv_sec,
+                         request->rq_arrival_time.tv_sec));
+       }
+
+out_req:
+       ptlrpc_server_finish_active_request(svcpt, request);
+
+       RETURN(1);
+}
+
+/**
+ * An internal function to process a single reply state object.
+ */
+static int
+ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
+{
+       struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
+       struct ptlrpc_service     *svc = svcpt->scp_service;
+       struct obd_export        *exp;
+       int                     nlocks;
+       int                     been_handled;
+       ENTRY;
+
+       exp = rs->rs_export;
+
+       LASSERT(rs->rs_difficult);
+       LASSERT(rs->rs_scheduled);
+       LASSERT(list_empty(&rs->rs_list));
+
+       spin_lock(&exp->exp_lock);
+       /* Noop if removed already */
+       list_del_init(&rs->rs_exp_list);
+       spin_unlock(&exp->exp_lock);
+
+       /* The disk commit callback holds exp_uncommitted_replies_lock while it
+        * iterates over newly committed replies, removing them from
+        * exp_uncommitted_replies.  It then drops this lock and schedules the
+        * replies it found for handling here.
+        *
+        * We can avoid contention for exp_uncommitted_replies_lock between the
+        * HRT threads and further commit callbacks by checking rs_committed
+        * which is set in the commit callback while it holds both
+        * rs_lock and exp_uncommitted_replies_lock.
+        *
+        * If we see rs_committed clear, the commit callback _may_ not have
+        * handled this reply yet and we race with it to grab
+        * exp_uncommitted_replies_lock before removing the reply from
+        * exp_uncommitted_replies.  Note that if we lose the race and the
+        * reply has already been removed, list_del_init() is a noop.
+        *
+        * If we see rs_committed set, we know the commit callback is handling,
+        * or has handled this reply since store reordering might allow us to
+        * see rs_committed set out of sequence.  But since this is done
+        * holding rs_lock, we can be sure it has all completed once we hold
+        * rs_lock, which we do right next.
+        */
+       if (!rs->rs_committed) {
+               spin_lock(&exp->exp_uncommitted_replies_lock);
+               list_del_init(&rs->rs_obd_list);
+               spin_unlock(&exp->exp_uncommitted_replies_lock);
+       }
+
+       spin_lock(&rs->rs_lock);
+
+       been_handled = rs->rs_handled;
+       rs->rs_handled = 1;
+
+       nlocks = rs->rs_nlocks;         /* atomic "steal", but */
+       rs->rs_nlocks = 0;              /* locks still on rs_locks! */
+
+       if (nlocks == 0 && !been_handled) {
+               /* If we see this, we should already have seen the warning
+                * in mds_steal_ack_locks()  */
+               CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64
+                      " o%d NID %s\n",
+                      rs,
+                      rs->rs_xid, rs->rs_transno, rs->rs_opc,
+                      libcfs_nid2str(exp->exp_connection->c_peer.nid));
+       }
+
+       if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+               spin_unlock(&rs->rs_lock);
+
+               if (!been_handled && rs->rs_on_net) {
+                       LNetMDUnlink(rs->rs_md_h);
+                       /* Ignore return code; we're racing with completion */
+               }
+
+               while (nlocks-- > 0)
+                       ldlm_lock_decref(&rs->rs_locks[nlocks],
+                                        rs->rs_modes[nlocks]);
+
+               spin_lock(&rs->rs_lock);
+       }
+
+       rs->rs_scheduled = 0;
+
+       if (!rs->rs_on_net) {
+               /* Off the net */
+               spin_unlock(&rs->rs_lock);
+
+               class_export_put(exp);
+               rs->rs_export = NULL;
+               ptlrpc_rs_decref(rs);
+               if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
+                   svc->srv_is_stopping)
+                       wake_up_all(&svcpt->scp_waitq);
+               RETURN(1);
+       }
+
+       /* still on the net; callback will schedule */
+       spin_unlock(&rs->rs_lock);
+       RETURN(1);
+}
+
+
+static void
+ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt)
+{
+       int avail = svcpt->scp_nrqbds_posted;
+       int low_water = test_req_buffer_pressure ? 0 :
+                       svcpt->scp_service->srv_nbuf_per_group / 2;
+
+       /* NB I'm not locking; just looking. */
+
+       /* CAVEAT EMPTOR: We might be allocating buffers here because we've
+        * allowed the request history to grow out of control.  We could put a
+        * sanity check on that here and cull some history if we need the
+        * space. */
+
+       if (avail <= low_water)
+               ptlrpc_grow_req_bufs(svcpt, 1);
+
+       if (svcpt->scp_service->srv_stats) {
+               lprocfs_counter_add(svcpt->scp_service->srv_stats,
+                                   PTLRPC_REQBUF_AVAIL_CNTR, avail);
+       }
+}
+
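+/**
+ * Timeout callback for the request-buffer wait in ptlrpc_wait_event():
+ * clears the backoff so that reposting idle rqbds is retried right away.
+ */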
+static int
+ptlrpc_retry_rqbds(void *arg)
+{
+       struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg;
+
+       svcpt->scp_rqbd_timeout = 0;
+       return -ETIMEDOUT;
+}
+
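+/**
+ * Enough service threads are running for the current load: one thread
+ * is kept free for incoming-request processing, plus one more when a
+ * high-priority request handler is registered.
+ */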
+static inline int
+ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nreqs_active <
+              svcpt->scp_nthrs_running - 1 -
+              (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL);
+}
+
+/**
+ * Allowed to create more threads.
+ * May be called without any lock, but the result is only reliable
+ * while holding ptlrpc_service_part::scp_lock.
+ */
+static inline int
+ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_nthrs_running +
+              svcpt->scp_nthrs_starting <
+              svcpt->scp_service->srv_nthrs_cpt_limit;
+}
+
+/**
+ * too many requests and allowed to create more threads
+ */
+static inline int
+ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
+{
+       return !ptlrpc_threads_enough(svcpt) &&
+               ptlrpc_threads_increasable(svcpt);
+}
+
+static inline int
+ptlrpc_thread_stopping(struct ptlrpc_thread *thread)
+{
+       return thread_is_stopping(thread) ||
+              thread->t_svcpt->scp_service->srv_is_stopping;
+}
+
+static inline int
+ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt)
+{
+       return !list_empty(&svcpt->scp_rqbd_idle) &&
+              svcpt->scp_rqbd_timeout == 0;
+}
+
+static inline int
+ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
+{
+       return svcpt->scp_at_check;
+}
+
+/**
+ * Requests are waiting to be preprocessed.
+ * May be called without any lock, but the result is only reliable
+ * while holding ptlrpc_service_part::scp_lock.
+ */
+static inline int
+ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt)
+{
+       return !list_empty(&svcpt->scp_req_incoming);
+}
+
+static __attribute__((__noinline__)) int
+ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
+                 struct ptlrpc_thread *thread)
+{
+       /* Don't exit while there are replies to be handled */
+       struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
+                                            ptlrpc_retry_rqbds, svcpt);
+
+       lc_watchdog_disable(thread->t_watchdog);
+
+       cond_resched();
+
+       l_wait_event_exclusive_head(svcpt->scp_waitq,
+                               ptlrpc_thread_stopping(thread) ||
+                               ptlrpc_server_request_incoming(svcpt) ||
+                               ptlrpc_server_request_pending(svcpt, false) ||
+                               ptlrpc_rqbd_pending(svcpt) ||
+                               ptlrpc_at_check(svcpt), &lwi);
+
+       if (ptlrpc_thread_stopping(thread))
+               return -EINTR;
+
+       lc_watchdog_touch(thread->t_watchdog,
+                         ptlrpc_server_get_timeout(svcpt));
+       return 0;
+}
+
+/**
+ * Main thread body for service threads.
+ * Waits in a loop for new requests to process.
+ * Every time an incoming request is added to its queue, the waitq
+ * is woken up and one of the threads will handle it.
+ */
+static int ptlrpc_main(void *arg)
+{
+       struct ptlrpc_thread            *thread = (struct ptlrpc_thread *)arg;
+       struct ptlrpc_service_part      *svcpt = thread->t_svcpt;
+       struct ptlrpc_service           *svc = svcpt->scp_service;
+       struct ptlrpc_reply_state       *rs;
+#ifdef WITH_GROUP_INFO
+       group_info_t *ginfo = NULL;
+#endif
+       struct lu_env *env;
+       int counter = 0, rc = 0;
+       ENTRY;
+
+       thread->t_pid = current_pid();
+       unshare_fs_struct();
+
+       /* NB: we will call cfs_cpt_bind() for all threads, because we
+        * might want to run the lustre server only on a subset of system
+        * CPUs; in that case ->scp_cpt is CFS_CPT_ANY */
+       rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
+       if (rc != 0) {
+               CWARN("%s: failed to bind %s on CPT %d\n",
+                     svc->srv_name, thread->t_name, svcpt->scp_cpt);
+       }
+
+#ifdef WITH_GROUP_INFO
+       ginfo = groups_alloc(0);
+       if (!ginfo) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       set_current_groups(ginfo);
+       put_group_info(ginfo);
+#endif
+
+       if (svc->srv_ops.so_thr_init != NULL) {
+               rc = svc->srv_ops.so_thr_init(thread);
+               if (rc)
+                       goto out;
+       }
+
+       OBD_ALLOC_PTR(env);
+       if (env == NULL) {
+               rc = -ENOMEM;
+               goto out_srv_fini;
+       }
+
+       rc = lu_context_init(&env->le_ctx,
+                            svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
+       if (rc)
+               goto out_srv_fini;
+
+       thread->t_env = env;
+       env->le_ctx.lc_thread = thread;
+       env->le_ctx.lc_cookie = 0x6;
+
+       while (!list_empty(&svcpt->scp_rqbd_idle)) {
+               rc = ptlrpc_server_post_idle_rqbds(svcpt);
+               if (rc >= 0)
+                       continue;
+
+               CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
+                       svc->srv_name, svcpt->scp_cpt, rc);
+               goto out_srv_fini;
+       }
+
+       /* Alloc reply state structure for this one */
+       OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size);
+       if (!rs) {
+               rc = -ENOMEM;
+               goto out_srv_fini;
+       }
+
+       spin_lock(&svcpt->scp_lock);
+
+       LASSERT(thread_is_starting(thread));
+       thread_clear_flags(thread, SVC_STARTING);
+
+       LASSERT(svcpt->scp_nthrs_starting == 1);
+       svcpt->scp_nthrs_starting--;
+
+       /* SVC_STOPPING may already be set here if someone else is trying
+        * to stop the service while this new thread has been dynamically
+        * forked. We still set SVC_RUNNING to let our creator know that
+        * we are now running; however, we will exit as soon as possible */
+       thread_add_flags(thread, SVC_RUNNING);
+       svcpt->scp_nthrs_running++;
+       spin_unlock(&svcpt->scp_lock);
+
+       /* wake up our creator in case it is still waiting. */
+       wake_up(&thread->t_ctl_waitq);
+
+       thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
+                                            NULL, NULL);
+
+       spin_lock(&svcpt->scp_rep_lock);
+       list_add(&rs->rs_list, &svcpt->scp_rep_idle);
+       wake_up(&svcpt->scp_rep_waitq);
+       spin_unlock(&svcpt->scp_rep_lock);
+
+       CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
+              svcpt->scp_nthrs_running);
+
+       /* XXX maintain a list of all managed devices: insert here */
+       while (!ptlrpc_thread_stopping(thread)) {
+               if (ptlrpc_wait_event(svcpt, thread))
+                       break;
+
+               ptlrpc_check_rqbd_pool(svcpt);
+
+               if (ptlrpc_threads_need_create(svcpt)) {
+                       /* Ignore return code - we tried... */
+                       ptlrpc_start_thread(svcpt, 0);
+               }
+
+               /* Process all incoming reqs before handling any */
+               if (ptlrpc_server_request_incoming(svcpt)) {
+                       lu_context_enter(&env->le_ctx);
+                       env->le_ses = NULL;
+                       ptlrpc_server_handle_req_in(svcpt, thread);
+                       lu_context_exit(&env->le_ctx);
+
+                       /* but limit ourselves in case of flood */
+                       if (counter++ < 100)
+                               continue;
+                       counter = 0;
+               }
+
+               if (ptlrpc_at_check(svcpt))
+                       ptlrpc_at_check_timed(svcpt);
+
+               if (ptlrpc_server_request_pending(svcpt, false)) {
+                       lu_context_enter(&env->le_ctx);
+                       ptlrpc_server_handle_request(svcpt, thread);
+                       lu_context_exit(&env->le_ctx);
+               }
+
+               if (ptlrpc_rqbd_pending(svcpt) &&
+                   ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
+                       /* I just failed to repost request buffers.
+                        * Wait for a timeout (unless something else
+                        * happens) before I try again */
+                       svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
+                       CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
+                              svcpt->scp_nrqbds_posted);
+               }
+       }
+
+       lc_watchdog_delete(thread->t_watchdog);
+       thread->t_watchdog = NULL;
+
+out_srv_fini:
+       /*
+        * deconstruct service specific state created by ptlrpc_start_thread()
+        */
+       if (svc->srv_ops.so_thr_done != NULL)
+               svc->srv_ops.so_thr_done(thread);
+
+       if (env != NULL) {
+               lu_context_fini(&env->le_ctx);
+               OBD_FREE_PTR(env);
+       }
+out:
+       CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
+              thread, thread->t_pid, thread->t_id, rc);
+
+       spin_lock(&svcpt->scp_lock);
+       if (thread_test_and_clear_flags(thread, SVC_STARTING))
+               svcpt->scp_nthrs_starting--;
+
+       if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
+               /* must know immediately */
+               svcpt->scp_nthrs_running--;
+       }
+
+       thread->t_id = rc;
+       thread_add_flags(thread, SVC_STOPPED);
+
+       wake_up(&thread->t_ctl_waitq);
+       spin_unlock(&svcpt->scp_lock);
+
+       return rc;
+}
+
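+/**
+ * Splice all queued reply states onto \a replies; returns nonzero when
+ * the HR thread should wake up, i.e. there are replies to process or
+ * ptlrpc_hr is stopping.
+ */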
+static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt,
+                         struct list_head *replies)
+{
+       int result;
+
+       spin_lock(&hrt->hrt_lock);
+
+       list_splice_init(&hrt->hrt_queue, replies);
+       result = ptlrpc_hr.hr_stopping || !list_empty(replies);
+
+       spin_unlock(&hrt->hrt_lock);
+       return result;
+}
+
+/**
+ * Main body of "handle reply" function.
+ * It processes acked reply states
+ */
+static int ptlrpc_hr_main(void *arg)
+{
+       struct ptlrpc_hr_thread         *hrt = (struct ptlrpc_hr_thread *)arg;
+       struct ptlrpc_hr_partition      *hrp = hrt->hrt_partition;
+       LIST_HEAD(replies);
+       char                            threadname[20];
+       int                             rc;
+
+       snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
+                hrp->hrp_cpt, hrt->hrt_id);
+       unshare_fs_struct();
+
+       rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
+       if (rc != 0) {
+               CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
+                     threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
+       }
+
+       atomic_inc(&hrp->hrp_nstarted);
+       wake_up(&ptlrpc_hr.hr_waitq);
+
+       while (!ptlrpc_hr.hr_stopping) {
+               l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));
+
+               while (!list_empty(&replies)) {
+                       struct ptlrpc_reply_state *rs;
+
+                       rs = list_entry(replies.prev,
+                                       struct ptlrpc_reply_state,
+                                       rs_list);
+                       list_del_init(&rs->rs_list);
+                       ptlrpc_handle_rs(rs);
+               }
+       }
+
+       atomic_inc(&hrp->hrp_nstopped);
+       wake_up(&ptlrpc_hr.hr_waitq);
+
+       return 0;
+}
+
+static void ptlrpc_stop_hr_threads(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+       int                             j;
+
+       ptlrpc_hr.hr_stopping = 1;
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs == NULL)
+                       continue; /* uninitialized */
+               for (j = 0; j < hrp->hrp_nthrs; j++)
+                       wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
+       }
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs == NULL)
+                       continue; /* uninitialized */
+               wait_event(ptlrpc_hr.hr_waitq,
+                          atomic_read(&hrp->hrp_nstopped) ==
+                          atomic_read(&hrp->hrp_nstarted));
+       }
+}
+
+static int ptlrpc_start_hr_threads(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+       int                             j;
+       ENTRY;
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               int     rc = 0;
+
+               for (j = 0; j < hrp->hrp_nthrs; j++) {
+                       struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
+
+                       rc = PTR_ERR(kthread_run(ptlrpc_hr_main,
+                                                &hrp->hrp_thrs[j],
+                                                "ptlrpc_hr%02d_%03d",
+                                                hrp->hrp_cpt,
+                                                hrt->hrt_id));
+                       if (IS_ERR_VALUE(rc))
+                               break;
+               }
+               wait_event(ptlrpc_hr.hr_waitq,
+                          atomic_read(&hrp->hrp_nstarted) == j);
+               if (!IS_ERR_VALUE(rc))
+                       continue;
+
+               CERROR("Reply handling thread %d:%d Failed on starting: "
+                      "rc = %d\n", i, j, rc);
+               ptlrpc_stop_hr_threads();
+               RETURN(rc);
+       }
+       RETURN(0);
+}
+
+static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
+{
+       struct l_wait_info      lwi = { 0 };
+       struct ptlrpc_thread    *thread;
+       LIST_HEAD(zombie);
+
+       ENTRY;
+
+       CDEBUG(D_INFO, "Stopping threads for service %s\n",
+              svcpt->scp_service->srv_name);
+
+       spin_lock(&svcpt->scp_lock);
+       /* let the thread know that we would like it to stop asap */
+       list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
+               CDEBUG(D_INFO, "Stopping thread %s #%u\n",
+                      svcpt->scp_service->srv_thread_name, thread->t_id);
+               thread_add_flags(thread, SVC_STOPPING);
+       }
+
+       wake_up_all(&svcpt->scp_waitq);
+
+       while (!list_empty(&svcpt->scp_threads)) {
+               thread = list_entry(svcpt->scp_threads.next,
+                                   struct ptlrpc_thread, t_link);
+               if (thread_is_stopped(thread)) {
+                       list_move(&thread->t_link, &zombie);
+                       continue;
+               }
+               spin_unlock(&svcpt->scp_lock);
+
+               CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
+                      svcpt->scp_service->srv_thread_name, thread->t_id);
+               l_wait_event(thread->t_ctl_waitq,
+                            thread_is_stopped(thread), &lwi);
+
+               spin_lock(&svcpt->scp_lock);
+       }
+
+       spin_unlock(&svcpt->scp_lock);
+
+       while (!list_empty(&zombie)) {
+               thread = list_entry(zombie.next,
+                                   struct ptlrpc_thread, t_link);
+               list_del(&thread->t_link);
+               OBD_FREE_PTR(thread);
+       }
+       EXIT;
+}
+
+/**
+ * Stops all threads of a particular service \a svc
+ */
+void ptlrpc_stop_all_threads(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part *svcpt;
+       int                        i;
+       ENTRY;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service != NULL)
+                       ptlrpc_svcpt_stop_threads(svcpt);
+       }
+
+       EXIT;
+}
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+
+int ptlrpc_start_threads(struct ptlrpc_service *svc)
+{
+       int     rc = 0;
+       int     i;
+       int     j;
+       ENTRY;
+
+       /* We require 2 threads min, see note in ptlrpc_server_handle_request */
+       LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT);
+
+       for (i = 0; i < svc->srv_ncpts; i++) {
+               for (j = 0; j < svc->srv_nthrs_cpt_init; j++) {
+                       rc = ptlrpc_start_thread(svc->srv_parts[i], 1);
+                       if (rc == 0)
+                               continue;
+
+                       if (rc != -EMFILE)
+                               goto failed;
+                       /* We have enough threads, don't start more. b=15759 */
+                       break;
+               }
+       }
+
+       RETURN(0);
+ failed:
+       CERROR("cannot start %s thread #%d_%d: rc %d\n",
+              svc->srv_thread_name, i, j, rc);
+       ptlrpc_stop_all_threads(svc);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(ptlrpc_start_threads);
+
+int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
+{
+       struct l_wait_info      lwi = { 0 };
+       struct ptlrpc_thread    *thread;
+       struct ptlrpc_service   *svc;
+       int                     rc;
+       ENTRY;
+
+       LASSERT(svcpt != NULL);
+
+       svc = svcpt->scp_service;
+
+       CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
+              svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
+              svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);
+
+ again:
+       if (unlikely(svc->srv_is_stopping))
+               RETURN(-ESRCH);
+
+       if (!ptlrpc_threads_increasable(svcpt) ||
+           (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
+            svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
+               RETURN(-EMFILE);
+
+       OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt);
+       if (thread == NULL)
+               RETURN(-ENOMEM);
+       init_waitqueue_head(&thread->t_ctl_waitq);
+
+       spin_lock(&svcpt->scp_lock);
+       if (!ptlrpc_threads_increasable(svcpt)) {
+               spin_unlock(&svcpt->scp_lock);
+               OBD_FREE_PTR(thread);
+               RETURN(-EMFILE);
+       }
+
+       if (svcpt->scp_nthrs_starting != 0) {
+               /* serialize thread startup because some modules (e.g.
+                * obdfilter) require t_id values to be unique and
+                * contiguous */
+               LASSERT(svcpt->scp_nthrs_starting == 1);
+               spin_unlock(&svcpt->scp_lock);
+               OBD_FREE_PTR(thread);
+               if (wait) {
+                       CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
+                              svc->srv_thread_name, svcpt->scp_thr_nextid);
+                       schedule();
+                       goto again;
+               }
+
+               CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
+                      svc->srv_thread_name, svcpt->scp_thr_nextid);
+               RETURN(-EAGAIN);
+       }
+
+       svcpt->scp_nthrs_starting++;
+       thread->t_id = svcpt->scp_thr_nextid++;
+       thread_add_flags(thread, SVC_STARTING);
+       thread->t_svcpt = svcpt;
+
+       list_add(&thread->t_link, &svcpt->scp_threads);
+       spin_unlock(&svcpt->scp_lock);
+
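+       /* Thread names encode the CPU partition: "<svc>NN_MMM" when the
+        * partition is bound to a CPT, "<svc>_MMMM" otherwise. */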
+       if (svcpt->scp_cpt >= 0) {
+               snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d",
+                        svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
+       } else {
+               snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d",
+                        svc->srv_thread_name, thread->t_id);
+       }
+
+       CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
+       task = kthread_run(ptlrpc_main, thread, thread->t_name);
+       if (IS_ERR(task)) {
+               rc = PTR_ERR(task);
+               CERROR("cannot start thread '%s': rc %d\n",
+                      thread->t_name, rc);
+               spin_lock(&svcpt->scp_lock);
+               list_del(&thread->t_link);
+               --svcpt->scp_nthrs_starting;
+               spin_unlock(&svcpt->scp_lock);
+
+               OBD_FREE(thread, sizeof(*thread));
+               RETURN(rc);
+       }
+
+       if (!wait)
+               RETURN(0);
+
+       l_wait_event(thread->t_ctl_waitq,
+                    thread_is_running(thread) || thread_is_stopped(thread),
+                    &lwi);
+
+       rc = thread_is_stopped(thread) ? thread->t_id : 0;
+       RETURN(rc);
+}
+
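+/**
+ * Allocate the per-CPT reply handling (hr) partitions and start their
+ * threads; everything is torn down again if startup fails.
+ */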
+int ptlrpc_hr_init(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       struct ptlrpc_hr_thread         *hrt;
+       int                             rc;
+       int                             i;
+       int                             j;
+       ENTRY;
+
+       memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr));
+       ptlrpc_hr.hr_cpt_table = cfs_cpt_table;
+
+       ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table,
+                                                  sizeof(*hrp));
+       if (ptlrpc_hr.hr_partitions == NULL)
+               RETURN(-ENOMEM);
+
+       init_waitqueue_head(&ptlrpc_hr.hr_waitq);
+
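+       /* One reply-handling partition per CPT; size its thread count by
+        * the partition's CPU weight, discounting hyperthread siblings. */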
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               hrp->hrp_cpt = i;
+
+               atomic_set(&hrp->hrp_nstarted, 0);
+               atomic_set(&hrp->hrp_nstopped, 0);
+
+               hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i);
+               hrp->hrp_nthrs /= cfs_cpu_ht_nsiblings(0);
+
+               LASSERT(hrp->hrp_nthrs > 0);
+               OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i,
+                             hrp->hrp_nthrs * sizeof(*hrt));
+               if (hrp->hrp_thrs == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               for (j = 0; j < hrp->hrp_nthrs; j++) {
+                       hrt = &hrp->hrp_thrs[j];
+
+                       hrt->hrt_id = j;
+                       hrt->hrt_partition = hrp;
+                       init_waitqueue_head(&hrt->hrt_waitq);
+                       spin_lock_init(&hrt->hrt_lock);
+                       INIT_LIST_HEAD(&hrt->hrt_queue);
+               }
+       }
+
+       rc = ptlrpc_start_hr_threads();
+out:
+       if (rc != 0)
+               ptlrpc_hr_fini();
+       RETURN(rc);
+}
+
+void ptlrpc_hr_fini(void)
+{
+       struct ptlrpc_hr_partition      *hrp;
+       int                             i;
+
+       if (ptlrpc_hr.hr_partitions == NULL)
+               return;
+
+       ptlrpc_stop_hr_threads();
+
+       cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
+               if (hrp->hrp_thrs != NULL) {
+                       OBD_FREE(hrp->hrp_thrs,
+                                hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0]));
+               }
+       }
+
+       cfs_percpt_free(ptlrpc_hr.hr_partitions);
+       ptlrpc_hr.hr_partitions = NULL;
+}
+
+/**
+ * Wait until all already scheduled replies are processed.
+ */
+static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt)
+{
+       while (1) {
+               int rc;
+               struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10),
+                                                    NULL, NULL);
+
+               rc = l_wait_event(svcpt->scp_waitq,
+                    atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi);
+               if (rc == 0)
+                       break;
+               CWARN("Unexpectedly long timeout %s %p\n",
+                     svcpt->scp_service->srv_name, svcpt->scp_service);
+       }
+}
+
+static void
+ptlrpc_service_del_atimer(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       int                             i;
+
+       /* early disarm AT timer... */
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service != NULL)
+                       cfs_timer_disarm(&svcpt->scp_at_timer);
+       }
+}
+
+static void
+ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part        *svcpt;
+       struct ptlrpc_request_buffer_desc *rqbd;
+       struct l_wait_info                lwi;
+       int                               rc;
+       int                               i;
+
+       /* All history will be culled when the next request buffer is
+        * freed in ptlrpc_service_purge_all() */
+       svc->srv_hist_nrqbds_cpt_max = 0;
+
+       rc = LNetClearLazyPortal(svc->srv_req_portal);
+       LASSERT(rc == 0);
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* Unlink all the request buffers.  This forces a 'final'
+                * event with its 'unlink' flag set for each posted rqbd */
+               list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
+                                       rqbd_list) {
+                       rc = LNetMDUnlink(rqbd->rqbd_md_h);
+                       LASSERT(rc == 0 || rc == -ENOENT);
+               }
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* Wait for the network to release any buffers
+                * it's currently filling */
+               spin_lock(&svcpt->scp_lock);
+               while (svcpt->scp_nrqbds_posted != 0) {
+                       spin_unlock(&svcpt->scp_lock);
+                       /* Network access will complete in finite time but
+                        * the HUGE timeout lets us CWARN for visibility
+                        * of sluggish NALs */
+                       lwi = LWI_TIMEOUT_INTERVAL(
+                                       cfs_time_seconds(LONG_UNLINK),
+                                       cfs_time_seconds(1), NULL, NULL);
+                       rc = l_wait_event(svcpt->scp_waitq,
+                                         svcpt->scp_nrqbds_posted == 0, &lwi);
+                       if (rc == -ETIMEDOUT) {
+                               CWARN("Service %s waiting for "
+                                     "request buffers\n",
+                                     svcpt->scp_service->srv_name);
+                       }
+                       spin_lock(&svcpt->scp_lock);
+               }
+               spin_unlock(&svcpt->scp_lock);
+       }
+}
+
+static void
+ptlrpc_service_purge_all(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part              *svcpt;
+       struct ptlrpc_request_buffer_desc       *rqbd;
+       struct ptlrpc_request                   *req;
+       struct ptlrpc_reply_state               *rs;
+       int                                     i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
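+               /* Push any still-active difficult replies through their
+                * final processing before the buffers disappear. */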
+               spin_lock(&svcpt->scp_rep_lock);
+               while (!list_empty(&svcpt->scp_rep_active)) {
+                       rs = list_entry(svcpt->scp_rep_active.next,
+                                           struct ptlrpc_reply_state, rs_list);
+                       spin_lock(&rs->rs_lock);
+                       ptlrpc_schedule_difficult_reply(rs);
+                       spin_unlock(&rs->rs_lock);
+               }
+               spin_unlock(&svcpt->scp_rep_lock);
+
+               /* purge the request queue.  NB No new replies (rqbds
+                * all unlinked) and no service threads, so I'm the only
+                * thread noodling the request queue now */
+               while (!list_empty(&svcpt->scp_req_incoming)) {
+                       req = list_entry(svcpt->scp_req_incoming.next,
+                                            struct ptlrpc_request, rq_list);
+
+                       list_del(&req->rq_list);
+                       svcpt->scp_nreqs_incoming--;
+                       ptlrpc_server_finish_request(svcpt, req);
+               }
+
+               while (ptlrpc_server_request_pending(svcpt, true)) {
+                       req = ptlrpc_server_request_get(svcpt, true);
+                       ptlrpc_server_finish_active_request(svcpt, req);
+               }
+
+               LASSERT(list_empty(&svcpt->scp_rqbd_posted));
+               LASSERT(svcpt->scp_nreqs_incoming == 0);
+               LASSERT(svcpt->scp_nreqs_active == 0);
+               /* history should have been culled by
+                * ptlrpc_server_finish_request */
+               LASSERT(svcpt->scp_hist_nrqbds == 0);
+
+               /* Now free all the request buffers since nothing
+                * references them any more... */
+
+               while (!list_empty(&svcpt->scp_rqbd_idle)) {
+                       rqbd = list_entry(svcpt->scp_rqbd_idle.next,
+                                             struct ptlrpc_request_buffer_desc,
+                                             rqbd_list);
+                       ptlrpc_free_rqbd(rqbd);
+               }
+               ptlrpc_wait_replies(svcpt);
+
+               while (!list_empty(&svcpt->scp_rep_idle)) {
+                       rs = list_entry(svcpt->scp_rep_idle.next,
+                                           struct ptlrpc_reply_state,
+                                           rs_list);
+                       list_del(&rs->rs_list);
+                       OBD_FREE_LARGE(rs, svc->srv_max_reply_size);
+               }
+       }
+}
+
+static void
+ptlrpc_service_free(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       struct ptlrpc_at_array          *array;
+       int                             i;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               if (svcpt->scp_service == NULL)
+                       break;
+
+               /* In case somebody rearmed this in the meantime */
+               cfs_timer_disarm(&svcpt->scp_at_timer);
+               array = &svcpt->scp_at_array;
+
+               if (array->paa_reqs_array != NULL) {
+                       OBD_FREE(array->paa_reqs_array,
+                                sizeof(struct list_head) * array->paa_size);
+                       array->paa_reqs_array = NULL;
+               }
+
+               if (array->paa_reqs_count != NULL) {
+                       OBD_FREE(array->paa_reqs_count,
+                                sizeof(__u32) * array->paa_size);
+                       array->paa_reqs_count = NULL;
+               }
+       }
+
+       ptlrpc_service_for_each_part(svcpt, i, svc)
+               OBD_FREE_PTR(svcpt);
+
+       if (svc->srv_cpts != NULL)
+               cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts);
+
+       OBD_FREE(svc, offsetof(struct ptlrpc_service,
+                              srv_parts[svc->srv_ncpts]));
+}
+
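+/**
+ * Tear down service \a service: disarm AT timers, stop all threads,
+ * unlink request buffers and drain every remaining request and reply
+ * before freeing the service itself.
+ */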
+int ptlrpc_unregister_service(struct ptlrpc_service *service)
+{
+       ENTRY;
+
+       CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);
+
+       service->srv_is_stopping = 1;
+
+       mutex_lock(&ptlrpc_all_services_mutex);
+       list_del_init(&service->srv_list);
+       mutex_unlock(&ptlrpc_all_services_mutex);
+
+       ptlrpc_service_del_atimer(service);
+       ptlrpc_stop_all_threads(service);
+
+       ptlrpc_service_unlink_rqbd(service);
+       ptlrpc_service_purge_all(service);
+       ptlrpc_service_nrs_cleanup(service);
+
+       ptlrpc_lprocfs_unregister_service(service);
+
+       ptlrpc_service_free(service);
+
+       RETURN(0);
+}
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/**
+ * Returns 0 if the service is healthy.
+ *
+ * Right now, it just checks to make sure that requests aren't languishing
+ * in the queue.  We'll use this health check to govern whether a node needs
+ * to be shot, so it's intentionally non-aggressive.
+ */
+int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt)
+{
+       struct ptlrpc_request           *request = NULL;
+       struct timeval                  right_now;
+       long                            timediff;
+
+       do_gettimeofday(&right_now);
+
+       spin_lock(&svcpt->scp_req_lock);
+       /* How long has the next entry been waiting? */
+       if (ptlrpc_server_high_pending(svcpt, true))
+               request = ptlrpc_nrs_req_peek_nolock(svcpt, true);
+       else if (ptlrpc_server_normal_pending(svcpt, true))
+               request = ptlrpc_nrs_req_peek_nolock(svcpt, false);
+
+       if (request == NULL) {
+               spin_unlock(&svcpt->scp_req_lock);
+               return 0;
+       }
+
+       timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
+       spin_unlock(&svcpt->scp_req_lock);
+
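+       /* Unhealthy when the oldest queued request has waited longer
+        * than at_max, or 1.5 * obd_timeout with adaptive timeouts
+        * disabled. */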
+       if ((timediff / ONE_MILLION) >
+           (AT_OFF ? obd_timeout * 3 / 2 : at_max)) {
+               CERROR("%s: unhealthy - request has been waiting %lds\n",
+                      svcpt->scp_service->srv_name, timediff / ONE_MILLION);
+               return -1;
+       }
+
+       return 0;
+}
+
+int
+ptlrpc_service_health_check(struct ptlrpc_service *svc)
+{
+       struct ptlrpc_service_part      *svcpt;
+       int                             i;
+
+       if (svc == NULL)
+               return 0;
+
+       ptlrpc_service_for_each_part(svcpt, i, svc) {
+               int rc = ptlrpc_svcpt_health_check(svcpt);
+
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ptlrpc_service_health_check);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c
new file mode 100644
index 0000000..93bc40b
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c
new file mode 100644
index 0000000..9890bd9
--- /dev/null
@@ -0,0 +1,4474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+# ifdef CONFIG_FS_POSIX_ACL
+#  include <linux/fs.h>
+#  include <linux/posix_acl_xattr.h>
+# endif
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_net.h>
+#include <lustre_disk.h>
+
+void lustre_assert_wire_constants(void)
+{
+       /* Wire protocol assertions generated by 'wirecheck'
+        * (make -C lustre/utils newwiretest)
+        * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x
+        * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC)  */
+
+       /* Constants... */
+       LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
+                (long long)PTL_RPC_MSG_REQUEST);
+       LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
+                (long long)PTL_RPC_MSG_ERR);
+       LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
+                (long long)PTL_RPC_MSG_REPLY);
+       LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
+                MDS_DIR_END_OFF);
+       LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
+                DEAD_HANDLE_MAGIC);
+       CLASSERT(MTI_NAME_MAXLEN == 64);
+       LASSERTF(OST_REPLY == 0, "found %lld\n",
+                (long long)OST_REPLY);
+       LASSERTF(OST_GETATTR == 1, "found %lld\n",
+                (long long)OST_GETATTR);
+       LASSERTF(OST_SETATTR == 2, "found %lld\n",
+                (long long)OST_SETATTR);
+       LASSERTF(OST_READ == 3, "found %lld\n",
+                (long long)OST_READ);
+       LASSERTF(OST_WRITE == 4, "found %lld\n",
+                (long long)OST_WRITE);
+       LASSERTF(OST_CREATE == 5, "found %lld\n",
+                (long long)OST_CREATE);
+       LASSERTF(OST_DESTROY == 6, "found %lld\n",
+                (long long)OST_DESTROY);
+       LASSERTF(OST_GET_INFO == 7, "found %lld\n",
+                (long long)OST_GET_INFO);
+       LASSERTF(OST_CONNECT == 8, "found %lld\n",
+                (long long)OST_CONNECT);
+       LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
+                (long long)OST_DISCONNECT);
+       LASSERTF(OST_PUNCH == 10, "found %lld\n",
+                (long long)OST_PUNCH);
+       LASSERTF(OST_OPEN == 11, "found %lld\n",
+                (long long)OST_OPEN);
+       LASSERTF(OST_CLOSE == 12, "found %lld\n",
+                (long long)OST_CLOSE);
+       LASSERTF(OST_STATFS == 13, "found %lld\n",
+                (long long)OST_STATFS);
+       LASSERTF(OST_SYNC == 16, "found %lld\n",
+                (long long)OST_SYNC);
+       LASSERTF(OST_SET_INFO == 17, "found %lld\n",
+                (long long)OST_SET_INFO);
+       LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
+                (long long)OST_QUOTACHECK);
+       LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
+                (long long)OST_QUOTACTL);
+       LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
+                (long long)OST_QUOTA_ADJUST_QUNIT);
+       LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
+                (long long)OST_LAST_OPC);
+       LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+                OBD_OBJECT_EOF);
+       LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
+                (long long)OST_MIN_PRECREATE);
+       LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
+                (long long)OST_MAX_PRECREATE);
+       LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
+                OST_LVB_ERR_INIT);
+       LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
+                OST_LVB_ERR_MASK);
+       LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
+                (long long)MDS_FIRST_OPC);
+       LASSERTF(MDS_GETATTR == 33, "found %lld\n",
+                (long long)MDS_GETATTR);
+       LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
+                (long long)MDS_GETATTR_NAME);
+       LASSERTF(MDS_CLOSE == 35, "found %lld\n",
+                (long long)MDS_CLOSE);
+       LASSERTF(MDS_REINT == 36, "found %lld\n",
+                (long long)MDS_REINT);
+       LASSERTF(MDS_READPAGE == 37, "found %lld\n",
+                (long long)MDS_READPAGE);
+       LASSERTF(MDS_CONNECT == 38, "found %lld\n",
+                (long long)MDS_CONNECT);
+       LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
+                (long long)MDS_DISCONNECT);
+       LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
+                (long long)MDS_GETSTATUS);
+       LASSERTF(MDS_STATFS == 41, "found %lld\n",
+                (long long)MDS_STATFS);
+       LASSERTF(MDS_PIN == 42, "found %lld\n",
+                (long long)MDS_PIN);
+       LASSERTF(MDS_UNPIN == 43, "found %lld\n",
+                (long long)MDS_UNPIN);
+       LASSERTF(MDS_SYNC == 44, "found %lld\n",
+                (long long)MDS_SYNC);
+       LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n",
+                (long long)MDS_DONE_WRITING);
+       LASSERTF(MDS_SET_INFO == 46, "found %lld\n",
+                (long long)MDS_SET_INFO);
+       LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n",
+                (long long)MDS_QUOTACHECK);
+       LASSERTF(MDS_QUOTACTL == 48, "found %lld\n",
+                (long long)MDS_QUOTACTL);
+       LASSERTF(MDS_GETXATTR == 49, "found %lld\n",
+                (long long)MDS_GETXATTR);
+       LASSERTF(MDS_SETXATTR == 50, "found %lld\n",
+                (long long)MDS_SETXATTR);
+       LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n",
+                (long long)MDS_WRITEPAGE);
+       LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n",
+                (long long)MDS_IS_SUBDIR);
+       LASSERTF(MDS_GET_INFO == 53, "found %lld\n",
+                (long long)MDS_GET_INFO);
+       LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n",
+                (long long)MDS_HSM_STATE_GET);
+       LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n",
+                (long long)MDS_HSM_STATE_SET);
+       LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n",
+                (long long)MDS_HSM_ACTION);
+       LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n",
+                (long long)MDS_HSM_PROGRESS);
+       LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n",
+                (long long)MDS_HSM_REQUEST);
+       LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n",
+                (long long)MDS_HSM_CT_REGISTER);
+       LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n",
+                (long long)MDS_HSM_CT_UNREGISTER);
+       LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n",
+                (long long)MDS_SWAP_LAYOUTS);
+       LASSERTF(MDS_LAST_OPC == 62, "found %lld\n",
+                (long long)MDS_LAST_OPC);
+       LASSERTF(REINT_SETATTR == 1, "found %lld\n",
+                (long long)REINT_SETATTR);
+       LASSERTF(REINT_CREATE == 2, "found %lld\n",
+                (long long)REINT_CREATE);
+       LASSERTF(REINT_LINK == 3, "found %lld\n",
+                (long long)REINT_LINK);
+       LASSERTF(REINT_UNLINK == 4, "found %lld\n",
+                (long long)REINT_UNLINK);
+       LASSERTF(REINT_RENAME == 5, "found %lld\n",
+                (long long)REINT_RENAME);
+       LASSERTF(REINT_OPEN == 6, "found %lld\n",
+                (long long)REINT_OPEN);
+       LASSERTF(REINT_SETXATTR == 7, "found %lld\n",
+                (long long)REINT_SETXATTR);
+       LASSERTF(REINT_RMENTRY == 8, "found %lld\n",
+                (long long)REINT_RMENTRY);
+       LASSERTF(REINT_MAX == 9, "found %lld\n",
+                (long long)REINT_MAX);
+       LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_IT_EXECD);
+       LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_EXECD);
+       LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_NEG);
+       LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_LOOKUP_POS);
+       LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_CREATE);
+       LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_OPEN);
+       LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_COMPLETE);
+       LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_OPEN_REF);
+       LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_ENQ_CREATE_REF);
+       LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n",
+               (unsigned)DISP_OPEN_LOCK);
+       LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n",
+                (long long)MDS_STATUS_CONN);
+       LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n",
+                (long long)MDS_STATUS_LOV);
+       LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n",
+                (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES);
+       LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MF_SOM_CHANGE);
+       LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MF_EPOCH_OPEN);
+       LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MF_EPOCH_CLOSE);
+       LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID1);
+       LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID2);
+       LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID3);
+       LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MF_MDC_CANCEL_FID4);
+       LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MF_SOM_AU);
+       LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)MF_GETATTR_LOCK);
+       LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MODE);
+       LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_UID);
+       LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_GID);
+       LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_SIZE);
+       LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATIME);
+       LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MTIME);
+       LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_CTIME);
+       LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATIME_SET);
+       LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_MTIME_SET);
+       LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_FORCE);
+       LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_ATTR_FLAG);
+       LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_KILL_SUID);
+       LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_KILL_SGID);
+       LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_CTIME_SET);
+       LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_FROM_OPEN);
+       LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n",
+                       (long long)MDS_ATTR_BLOCKS);
+       LASSERTF(FLD_QUERY == 900, "found %lld\n",
+                (long long)FLD_QUERY);
+       LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n",
+                (long long)FLD_FIRST_OPC);
+       LASSERTF(FLD_LAST_OPC == 901, "found %lld\n",
+                (long long)FLD_LAST_OPC);
+       LASSERTF(SEQ_QUERY == 700, "found %lld\n",
+                (long long)SEQ_QUERY);
+       LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n",
+                (long long)SEQ_FIRST_OPC);
+       LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n",
+                (long long)SEQ_LAST_OPC);
+       LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n",
+                (long long)SEQ_ALLOC_SUPER);
+       LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n",
+                (long long)SEQ_ALLOC_META);
+       LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n",
+                (long long)LDLM_ENQUEUE);
+       LASSERTF(LDLM_CONVERT == 102, "found %lld\n",
+                (long long)LDLM_CONVERT);
+       LASSERTF(LDLM_CANCEL == 103, "found %lld\n",
+                (long long)LDLM_CANCEL);
+       LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n",
+                (long long)LDLM_BL_CALLBACK);
+       LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n",
+                (long long)LDLM_CP_CALLBACK);
+       LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n",
+                (long long)LDLM_GL_CALLBACK);
+       LASSERTF(LDLM_SET_INFO == 107, "found %lld\n",
+                (long long)LDLM_SET_INFO);
+       LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n",
+                (long long)LDLM_LAST_OPC);
+       LASSERTF(LCK_MINMODE == 0, "found %lld\n",
+                (long long)LCK_MINMODE);
+       LASSERTF(LCK_EX == 1, "found %lld\n",
+                (long long)LCK_EX);
+       LASSERTF(LCK_PW == 2, "found %lld\n",
+                (long long)LCK_PW);
+       LASSERTF(LCK_PR == 4, "found %lld\n",
+                (long long)LCK_PR);
+       LASSERTF(LCK_CW == 8, "found %lld\n",
+                (long long)LCK_CW);
+       LASSERTF(LCK_CR == 16, "found %lld\n",
+                (long long)LCK_CR);
+       LASSERTF(LCK_NL == 32, "found %lld\n",
+                (long long)LCK_NL);
+       LASSERTF(LCK_GROUP == 64, "found %lld\n",
+                (long long)LCK_GROUP);
+       LASSERTF(LCK_COS == 128, "found %lld\n",
+                (long long)LCK_COS);
+       LASSERTF(LCK_MAXMODE == 129, "found %lld\n",
+                (long long)LCK_MAXMODE);
+       LASSERTF(LCK_MODE_NUM == 8, "found %lld\n",
+                (long long)LCK_MODE_NUM);
+       CLASSERT(LDLM_PLAIN == 10);
+       CLASSERT(LDLM_EXTENT == 11);
+       CLASSERT(LDLM_FLOCK == 12);
+       CLASSERT(LDLM_IBITS == 13);
+       CLASSERT(LDLM_MAX_TYPE == 14);
+       CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0);
+       CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1);
+       LASSERTF(UPDATE_OBJ == 1000, "found %lld\n",
+                (long long)UPDATE_OBJ);
+       LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n",
+                (long long)UPDATE_LAST_OPC);
+       CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2);
+       CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3);
+       CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3);
+       CLASSERT(LQUOTA_TYPE_USR == 0);
+       CLASSERT(LQUOTA_TYPE_GRP == 1);
+       CLASSERT(LQUOTA_RES_MD == 1);
+       CLASSERT(LQUOTA_RES_DT == 2);
+       LASSERTF(OBD_PING == 400, "found %lld\n",
+                (long long)OBD_PING);
+       LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n",
+                (long long)OBD_LOG_CANCEL);
+       LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n",
+                (long long)OBD_QC_CALLBACK);
+       LASSERTF(OBD_IDX_READ == 403, "found %lld\n",
+                (long long)OBD_IDX_READ);
+       LASSERTF(OBD_LAST_OPC == 404, "found %lld\n",
+                (long long)OBD_LAST_OPC);
+       LASSERTF(QUOTA_DQACQ == 601, "found %lld\n",
+                (long long)QUOTA_DQACQ);
+       LASSERTF(QUOTA_DQREL == 602, "found %lld\n",
+                (long long)QUOTA_DQREL);
+       LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n",
+                (long long)QUOTA_LAST_OPC);
+       LASSERTF(MGS_CONNECT == 250, "found %lld\n",
+                (long long)MGS_CONNECT);
+       LASSERTF(MGS_DISCONNECT == 251, "found %lld\n",
+                (long long)MGS_DISCONNECT);
+       LASSERTF(MGS_EXCEPTION == 252, "found %lld\n",
+                (long long)MGS_EXCEPTION);
+       LASSERTF(MGS_TARGET_REG == 253, "found %lld\n",
+                (long long)MGS_TARGET_REG);
+       LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n",
+                (long long)MGS_TARGET_DEL);
+       LASSERTF(MGS_SET_INFO == 255, "found %lld\n",
+                (long long)MGS_SET_INFO);
+       LASSERTF(MGS_LAST_OPC == 257, "found %lld\n",
+                (long long)MGS_LAST_OPC);
+       LASSERTF(SEC_CTX_INIT == 801, "found %lld\n",
+                (long long)SEC_CTX_INIT);
+       LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n",
+                (long long)SEC_CTX_INIT_CONT);
+       LASSERTF(SEC_CTX_FINI == 803, "found %lld\n",
+                (long long)SEC_CTX_FINI);
+       LASSERTF(SEC_LAST_OPC == 804, "found %lld\n",
+                (long long)SEC_LAST_OPC);
+       /* Sizes and Offsets */
+
+       /* Checks for struct obd_uuid */
+       LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct obd_uuid));
+
+       /* Checks for struct lu_seq_range */
+       LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lu_seq_range));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_index));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index));
+       LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lu_seq_range, lsr_flags));
+       LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags));
+       LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n",
+                (long long)LU_SEQ_RANGE_MDT);
+       LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n",
+                (long long)LU_SEQ_RANGE_OST);
+
+       /* Checks for struct lustre_mdt_attrs */
+       LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_mdt_attrs));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat));
+       LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid));
+       LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid));
+       LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LMAI_RELEASED);
+       LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LMAC_HSM);
+       LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LMAC_SOM);
+       LASSERTF(OBJ_CREATE == 1, "found %lld\n",
+                (long long)OBJ_CREATE);
+       LASSERTF(OBJ_DESTROY == 2, "found %lld\n",
+                (long long)OBJ_DESTROY);
+       LASSERTF(OBJ_REF_ADD == 3, "found %lld\n",
+                (long long)OBJ_REF_ADD);
+       LASSERTF(OBJ_REF_DEL == 4, "found %lld\n",
+                (long long)OBJ_REF_DEL);
+       LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n",
+                (long long)OBJ_ATTR_SET);
+       LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n",
+                (long long)OBJ_ATTR_GET);
+       LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n",
+                (long long)OBJ_XATTR_SET);
+       LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n",
+                (long long)OBJ_XATTR_GET);
+       LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n",
+                (long long)OBJ_INDEX_LOOKUP);
+       LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n",
+                (long long)OBJ_INDEX_INSERT);
+       LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n",
+                (long long)OBJ_INDEX_DELETE);
+
+       /* Checks for struct som_attrs */
+       LASSERTF((int)sizeof(struct som_attrs) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct som_attrs));
+       LASSERTF((int)offsetof(struct som_attrs, som_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_compat));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_compat));
+       LASSERTF((int)offsetof(struct som_attrs, som_incompat) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_incompat));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_incompat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_incompat));
+       LASSERTF((int)offsetof(struct som_attrs, som_ioepoch) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_ioepoch));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_ioepoch));
+       LASSERTF((int)offsetof(struct som_attrs, som_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_size));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_size));
+       LASSERTF((int)offsetof(struct som_attrs, som_blocks) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_blocks));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_blocks));
+       LASSERTF((int)offsetof(struct som_attrs, som_mountid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct som_attrs, som_mountid));
+       LASSERTF((int)sizeof(((struct som_attrs *)0)->som_mountid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct som_attrs *)0)->som_mountid));
+
+       /* Checks for struct hsm_attrs */
+       LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_attrs));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_compat));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_flags));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id));
+       LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver));
+       LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver));
+
+       /* Checks for struct ost_id */
+       LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct ost_id));
+       LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_id, oi));
+       LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_id *)0)->oi));
+       LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n",
+                (long long)LUSTRE_FID_INIT_OID);
+       LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n",
+                (long long)FID_SEQ_OST_MDT0);
+       LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n",
+                (long long)FID_SEQ_LLOG);
+       LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n",
+                (long long)FID_SEQ_ECHO);
+       LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n",
+                (long long)FID_SEQ_OST_MDT1);
+       LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n",
+                (long long)FID_SEQ_OST_MAX);
+       LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n",
+                (long long)FID_SEQ_RSVD);
+       LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n",
+                (long long)FID_SEQ_IGIF);
+       LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IGIF_MAX);
+       LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IDIF);
+       LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_IDIF_MAX);
+       LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_START);
+       LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_LOCAL_FILE);
+       LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_DOT_LUSTRE);
+       LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_SPECIAL);
+       LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_QUOTA);
+       LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_QUOTA_GLB);
+       LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_ROOT);
+       LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_NORMAL);
+       LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
+                       (long long)FID_SEQ_LOV_DEFAULT);
+       LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_SPECIAL_BFL);
+       LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_DOT_LUSTRE);
+       LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)FID_OID_DOT_LUSTRE_OBF);
+
+       /* Checks for struct lu_dirent */
+       LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lu_dirent));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_fid));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_hash));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_reclen));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_namelen));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_attrs));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs));
+       LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirent, lde_name[0]));
+       LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0]));
+       LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_FID);
+       LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_TYPE);
+       LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)LUDA_64BITHASH);
+
+       /* Checks for struct luda_type */
+       LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n",
+                (long long)(int)sizeof(struct luda_type));
+       LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct luda_type, lt_type));
+       LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct luda_type *)0)->lt_type));
+
+       /* Checks for struct lu_dirpage */
+       LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lu_dirpage));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_flags));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_pad0));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0));
+       LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0]));
+       LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]));
+       LASSERTF(LDF_EMPTY == 1, "found %lld\n",
+                (long long)LDF_EMPTY);
+       LASSERTF(LDF_COLLIDE == 2, "found %lld\n",
+                (long long)LDF_COLLIDE);
+       LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n",
+                (long long)LU_PAGE_SIZE);
+       /* Checks for union lu_page */
+       LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n",
+                (long long)(int)sizeof(union lu_page));
+
+       /* Checks for struct lustre_handle */
+       LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_handle));
+       LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_handle, cookie));
+       LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_handle *)0)->cookie));
+
+       /* Checks for struct lustre_msg_v2 */
+       LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_msg_v2));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_magic));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_flags));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3));
+       LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0]));
+       LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]));
+       LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V1);
+       LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V2);
+       LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V1_SWABBED);
+       LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n",
+               LUSTRE_MSG_MAGIC_V2_SWABBED);
+
+       /* Checks for struct ptlrpc_body */
+       LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n",
+                (long long)(int)sizeof(struct ptlrpc_body_v3));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv));
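+       /*
+        * CLASSERT() is a compile-time assertion, so a drifted constant
+        * breaks the build outright; LASSERTF() is checked at runtime and
+        * logs the value actually found.
+        */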
+       CLASSERT(PTLRPC_NUM_VERSIONS == 4);
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding));
+       CLASSERT(JOBSTATS_JOBID_SIZE == 32);
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions));
+       LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n",
+                (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding));
+       LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n",
+                (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding));
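+       /*
+        * The pairwise checks above guarantee that ptlrpc_body_v2 is a
+        * layout-compatible prefix of ptlrpc_body_v3 (v3 only appends
+        * pb_jobid at offset 152), so either version can be decoded with
+        * the same field offsets.
+        */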
+       LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n",
+                (long long)MSG_PTLRPC_BODY_OFF);
+       LASSERTF(REQ_REC_OFF == 1, "found %lld\n",
+                (long long)REQ_REC_OFF);
+       LASSERTF(REPLY_REC_OFF == 1, "found %lld\n",
+                (long long)REPLY_REC_OFF);
+       LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n",
+                (long long)DLM_LOCKREQ_OFF);
+       LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n",
+                (long long)DLM_REQ_REC_OFF);
+       LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n",
+                (long long)DLM_INTENT_IT_OFF);
+       LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n",
+                (long long)DLM_INTENT_REC_OFF);
+       LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n",
+                (long long)DLM_LOCKREPLY_OFF);
+       LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n",
+                (long long)DLM_REPLY_REC_OFF);
+       LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n",
+                (long long)MSG_PTLRPC_HEADER_OFF);
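+       /*
+        * Most of the *_OFF values above are buffer indices within a
+        * lustre_msg (slots counted by lm_bufcount), not byte offsets;
+        * MSG_PTLRPC_BODY_OFF == 0 pins the ptlrpc_body to the first slot.
+        */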
+       LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n",
+               PTLRPC_MSG_VERSION);
+       LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n",
+               LUSTRE_VERSION_MASK);
+       LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n",
+               LUSTRE_OBD_VERSION);
+       LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n",
+               LUSTRE_MDS_VERSION);
+       LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n",
+               LUSTRE_OST_VERSION);
+       LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n",
+               LUSTRE_DLM_VERSION);
+       LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n",
+               LUSTRE_LOG_VERSION);
+       LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n",
+               LUSTRE_MGS_VERSION);
+       LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n",
+                (long long)MSGHDR_AT_SUPPORT);
+       LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n",
+                (long long)MSGHDR_CKSUM_INCOMPAT18);
+       LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_OP_FLAG_MASK);
+       LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n",
+                (long long)MSG_OP_FLAG_SHIFT);
+       LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n",
+               (unsigned)MSG_GEN_FLAG_MASK);
+       LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_LAST_REPLAY);
+       LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_RESENT);
+       LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_REPLAY);
+       LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_DELAY_REPLAY);
+       LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_VERSION_REPLAY);
+       LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_REQ_REPLAY_DONE);
+       LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_LOCK_REPLAY_DONE);
+       LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_RECOVERING);
+       LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_RECONNECT);
+       LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_REPLAYABLE);
+       LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_LIBCLIENT);
+       LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_INITIAL);
+       LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_ASYNC);
+       LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_NEXT_VER);
+       LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)MSG_CONNECT_TRANSNO);
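+       /*
+        * As the masks above encode, pb_flags packs generic MSG_* bits into
+        * the low 16 bits and per-operation bits (such as MSG_CONNECT_*)
+        * into the high 16, shifted by MSG_OP_FLAG_SHIFT.
+        */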
+
+       /* Checks for struct obd_connect_data */
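+       /*
+        * obd_connect_data is exchanged at connect time: ocd_connect_flags
+        * advertises supported features and gates which of the later fields
+        * the peer may interpret, while the paddingN slots reserve wire
+        * space for future extensions.
+        */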
+       LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n",
+                (long long)(int)sizeof(struct obd_connect_data));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_version));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_grant));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_index));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_unused));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_transno));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_group));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_instance));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance));
+       LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding1));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding2));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding3));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding4));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding5));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding6));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding7));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding8));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8));
+       LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, padding9));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingA));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingB));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingC));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingD));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingE));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE));
+       LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct obd_connect_data, paddingF));
+       LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF));
+       LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RDONLY);
+       LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_INDEX);
+       LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS);
+       LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT);
+       LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SRVLOCK);
+       LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_VERSION);
+       LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_REQPORTAL);
+       LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_ACL);
+       LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_XATTR);
+       LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CROW);
+       LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_TRUNCLOCK);
+       LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_TRANSNO);
+       LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_IBITS);
+       LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_JOIN);
+       LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_ATTRFID);
+       LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_NODEVOH);
+       LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RMT_CLIENT);
+       LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_RMT_CLIENT_FORCE);
+       LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_BRW_SIZE);
+       LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_QUOTA64);
+       LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS_CAPA);
+       LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_OSS_CAPA);
+       LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CANCELSET);
+       LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SOM);
+       LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_AT);
+       LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LRU_RESIZE);
+       LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MDS_MDS);
+       LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_REAL);
+       LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CHANGE_QS);
+       LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_CKSUM);
+       LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FID);
+       LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_VBR);
+       LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LOV_V3);
+       LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT_SHRINK);
+       LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SKIP_ORPHAN);
+       LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MAX_EASIZE);
+       LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FULL20);
+       LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LAYOUTLOCK);
+       LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_64BITHASH);
+       LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_MAXBYTES);
+       LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_IMP_RECOV);
+       LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_JOBSTATS);
+       LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_UMASK);
+       LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_EINPROGRESS);
+       LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_GRANT_PARAM);
+       LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_FLOCK_OWNER);
+       LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LVB_TYPE);
+       LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_NANOSEC_TIME);
+       LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_LIGHTWEIGHT);
+       LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_SHORTIO);
+       LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n",
+                OBD_CONNECT_PINGLESS);
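+       /*
+        * Every OBD_CONNECT_* flag is a distinct bit of the 64-bit
+        * ocd_connect_flags word checked above, so new flags must keep both
+        * the bit values and the 8-byte field width stable.
+        */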
+       LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_CRC32);
+       LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_ADLER);
+       LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)OBD_CKSUM_CRC32C);
+
+       /* Checks for struct obdo */
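+       /*
+        * obdo carries an object's attributes on the wire; the OBD_MD_*
+        * values checked further down are the o_valid mask bits that say
+        * which of these fields actually hold data.
+        */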
+       LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n",
+                (long long)(int)sizeof(struct obdo));
+       LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_valid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_valid));
+       LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_oi));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_oi));
+       LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_seq));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq));
+       LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_size));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_size));
+       LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_mtime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_mtime));
+       LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_atime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_atime));
+       LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_ctime));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_ctime));
+       LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_blocks));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_blocks));
+       LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_grant));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_grant));
+       LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_blksize));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_blksize));
+       LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_mode));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_mode));
+       LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_uid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_uid));
+       LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_gid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_gid));
+       LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_flags));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_flags));
+       LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_nlink));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_nlink));
+       LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_oid));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid));
+       LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_misc));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_misc));
+       LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_ioepoch));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch));
+       LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_stripe_idx));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx));
+       LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_parent_ver));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver));
+       LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_handle));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_handle));
+       LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_lcookie));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_lcookie));
+       LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_uid_h));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_uid_h));
+       LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_gid_h));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_gid_h));
+       LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_data_version));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_data_version));
+       LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_4));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_4));
+       LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_5));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_5));
+       LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_padding_6));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_padding_6));
+       LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLID);
+       LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLATIME);
+       LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMTIME);
+       LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCTIME);
+       LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLSIZE);
+       LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLBLOCKS);
+       LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLBLKSZ);
+       LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMODE);
+       LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLTYPE);
+       LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLUID);
+       LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGID);
+       LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLFLAGS);
+       LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLNLINK);
+       LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGENER);
+       LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRDEV);
+       LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLEASIZE);
+       LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_LINKNAME);
+       LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLHANDLE);
+       LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCKSUM);
+       LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLQOS);
+       LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCOOKIE);
+       LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGROUP);
+       LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLFID);
+       LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLEPOCH);
+       LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGRANT);
+       LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLDIREA);
+       LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLUSRQUOTA);
+       LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGRPQUOTA);
+       LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMODEASIZE);
+       LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_MDS);
+       LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_REINT);
+       LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_MEA);
+       LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTR);
+       LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTRLS);
+       LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLXATTRRM);
+       LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLACL);
+       LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTPERM);
+       LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLMDSCAPA);
+       LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLOSSCAPA);
+       LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCKSPLIT);
+       LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLCROSSREF);
+       LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLGETATTRLOCK);
+       LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTLSETFACL);
+       LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTLGETFACL);
+       LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTRSETFACL);
+       LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLRMTRGETFACL);
+       LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n",
+                OBD_MD_FLDATAVERSION);
+       CLASSERT(OBD_FL_INLINEDATA == 0x00000001);
+       CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002);
+       CLASSERT(OBD_FL_DELORPHAN == 0x00000004);
+       CLASSERT(OBD_FL_NORPC == 0x00000008);
+       CLASSERT(OBD_FL_IDONLY == 0x00000010);
+       CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020);
+       CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040);
+       CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100);
+       CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200);
+       CLASSERT(OBD_FL_CREATE_CROW == 0x00000400);
+       CLASSERT(OBD_FL_SRVLOCK == 0x00000800);
+       CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000);
+       CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000);
+       CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000);
+       CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000);
+       CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000);
+       CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000);
+       CLASSERT(OBD_FL_MMAP == 0x00040000);
+       CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000);
+       CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000);
+       CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000);
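+       /*
+        * The OBD_FL_* values are o_flags bits; OBD_FL_LOCAL_MASK reserves
+        * the top nibble (0xf0000000) of o_flags for local use.
+        */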
+
+       /* Checks for struct lov_ost_data_v1 */
+       LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct lov_ost_data_v1));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+       LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+       LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
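+       /*
+        * lov_ost_data_v1 describes one stripe: the object identifier plus
+        * the generation and index of the OST holding it, 24 bytes per
+        * stripe on the wire.
+        */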
+
+       /* Checks for struct lov_mds_md_v1 */
+       LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lov_mds_md_v1));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen));
+       LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]));
+       CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0);
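+       /*
+        * lmm_objects[] is a flexible array member: sizeof(lov_mds_md_v1)
+        * is 32, and each stripe appends one 24-byte lov_ost_data_v1, which
+        * is why lmm_objects[0] is pinned at offset 32 above.
+        */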
+
+       /* Checks for struct lov_mds_md_v3 */
+       LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct lov_mds_md_v3));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen));
+       CLASSERT(LOV_MAXPOOLNAME == 16);
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]));
+       LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0]));
+       LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]));
+       CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+       LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_RAID0);
+       LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_RAID1);
+       LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_FIRST);
+       LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n",
+               (unsigned)LOV_PATTERN_CMOBD);
+
+       /* Checks for struct obd_statfs */
+       LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n",
+                (long long)(int)sizeof(struct obd_statfs));
+       LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_type));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_type));
+       LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_blocks));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bfree));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bavail));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail));
+       LASSERTF((int)offsetof(struct obd_statfs, os_files) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_files));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_files) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_files));
+       LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_ffree));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree));
+       LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_fsid));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid));
+       LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_bsize));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize));
+       LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_namelen));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
+       LASSERTF((int)offsetof(struct obd_statfs, os_maxbytes) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_maxbytes));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_maxbytes) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_maxbytes));
+       LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_state));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
+       LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_fprecreated));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare2));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare3));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare4));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare5));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare6));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare7));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare8));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8));
+       LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n",
+                (long long)(int)offsetof(struct obd_statfs, os_spare9));
+       LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9));
+
+       /* Checks for struct obd_ioobj */
+       LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct obd_ioobj));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_oid));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw));
+       LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt));
+       LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt));
+
+       /* Checks for union lquota_id */
+       LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(union lquota_id));
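+       /* For a union only the overall size is pinned: every member starts
+        * at offset 0, so there are no per-field offsets to check. */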
+
+       LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n",
+                (long long)QUOTABLOCK_BITS);
+       LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n",
+                (long long)QUOTABLOCK_SIZE);
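+       /* QUOTABLOCK_SIZE is (1 << QUOTABLOCK_BITS), i.e. 1024, so the two
+        * checks above stand or fall together. */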
+
+       /* Checks for struct obd_quotactl */
+       LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct obd_quotactl));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_cmd));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_type));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_id));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_stat));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo));
+       LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_quotactl, qc_dqblk));
+       LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk));
+
+       /* Checks for struct obd_dqinfo */
+       LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct obd_dqinfo));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_flags));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags));
+       LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqinfo, dqi_valid));
+       LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid));
+
+       /* Checks for struct obd_dqblk */
+       LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct obd_dqblk));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_curspace));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_btime));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_itime));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_valid));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid));
+       LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct obd_dqblk, dqb_padding));
+       LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding));
+       LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n",
+               Q_QUOTACHECK);
+       LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n",
+               Q_INITQUOTA);
+       LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n",
+               Q_GETOINFO);
+       LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n",
+               Q_GETOQUOTA);
+       LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n",
+               Q_FINVALIDATE);
+
+       /* Checks for struct lquota_acct_rec */
+       LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_acct_rec));
+       LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_acct_rec, bspace));
+       LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace));
+       LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_acct_rec, ispace));
+       LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace));
+
+       /* Checks for struct lquota_glb_rec */
+       LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_glb_rec));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_time));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time));
+       LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted));
+       LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted));
+
+       /* Checks for struct lquota_slv_rec */
+       LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_slv_rec));
+       LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted));
+       LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted));
+
+       /* Checks for struct idx_info */
+       LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n",
+                (long long)(int)sizeof(struct idx_info));
+       LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_magic));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_magic));
+       LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_flags));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_flags));
+       LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_count));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_count));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad0));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0));
+       LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_attrs));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs));
+       LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_fid));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_fid));
+       LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_version));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_version));
+       LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_hash_start));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start));
+       LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_hash_end));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end));
+       LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_keysize));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize));
+       LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_recsize));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad1));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad2));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2));
+       LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct idx_info, ii_pad3));
+       LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3));
+       CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37);
+
+       /* Checks for struct lu_idxpage */
+       LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct lu_idxpage));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_magic));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_flags));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_nr));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr));
+       LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lu_idxpage, lip_pad0));
+       LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0));
+       CLASSERT(LIP_MAGIC == 0x8A6D6B6C);
+       LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n",
+                (long long)LIP_HDR_SIZE);
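+       /* LIP_HDR_SIZE should match sizeof(struct lu_idxpage) verified just
+        * above: each transferred index page begins with this header. */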
+       LASSERTF(II_FL_NOHASH == 1, "found %lld\n",
+                (long long)II_FL_NOHASH);
+       LASSERTF(II_FL_VARKEY == 2, "found %lld\n",
+                (long long)II_FL_VARKEY);
+       LASSERTF(II_FL_VARREC == 4, "found %lld\n",
+                (long long)II_FL_VARREC);
+       LASSERTF(II_FL_NONUNQ == 8, "found %lld\n",
+                (long long)II_FL_NONUNQ);
+
+       /* Checks for struct niobuf_remote */
+       LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct niobuf_remote));
+       LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, offset));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->offset));
+       LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, len));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->len));
+       LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct niobuf_remote, flags));
+       LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct niobuf_remote *)0)->flags));
+       LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n",
+               OBD_BRW_READ);
+       LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n",
+               OBD_BRW_WRITE);
+       LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n",
+               OBD_BRW_SYNC);
+       LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n",
+               OBD_BRW_CHECK);
+       LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n",
+               OBD_BRW_FROM_GRANT);
+       LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n",
+               OBD_BRW_GRANTED);
+       LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n",
+               OBD_BRW_NOCACHE);
+       LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n",
+               OBD_BRW_NOQUOTA);
+       LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n",
+               OBD_BRW_SRVLOCK);
+       LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n",
+               OBD_BRW_ASYNC);
+       LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n",
+               OBD_BRW_MEMALLOC);
+
+       /* Checks for struct ost_body */
+       LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
+                (long long)(int)sizeof(struct ost_body));
+       LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_body, oa));
+       LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_body *)0)->oa));
+
+       /* Checks for struct ll_fid */
+       LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fid));
+       LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, id));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->id));
+       LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, generation));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->generation));
+       LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fid, f_type));
+       LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fid *)0)->f_type));
+
+       /* Checks for struct mdt_body */
+       LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_body));
+       LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fid1));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fid1));
+       LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fid2));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fid2));
+       LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, handle));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->handle));
+       LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, valid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->valid));
+       LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, size));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->size));
+       LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, mtime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->mtime));
+       LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, atime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->atime));
+       LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, ctime));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->ctime));
+       LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, blocks));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->blocks));
+       LASSERTF((int)offsetof(struct mdt_body, ioepoch) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, ioepoch));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->ioepoch));
+       LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, unused1));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->unused1));
+       LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fsuid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fsuid));
+       LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, fsgid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->fsgid));
+       LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, capability));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->capability));
+       LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, mode));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->mode));
+       LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, uid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->uid));
+       LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, gid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->gid));
+       LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, flags));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->flags));
+       LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, rdev));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->rdev));
+       LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, nlink));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->nlink));
+       LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, unused2));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->unused2));
+       LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, suppgid));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->suppgid));
+       LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, eadatasize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize));
+       LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, aclsize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->aclsize));
+       LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, max_mdsize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize));
+       LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, max_cookiesize));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize));
+       LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, uid_h));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->uid_h));
+       LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, gid_h));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->gid_h));
+       LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_5));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_5));
+       LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_6));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_6));
+       LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_7));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_7));
+       LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_8));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_8));
+       LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_9));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_9));
+       LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_body, padding_10));
+       LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_body *)0)->padding_10));
+       LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_CLOSED);
+       LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n",
+               MDS_FMODE_EXEC);
+       LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_EPOCH);
+       LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_TRUNC);
+       LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n",
+               MDS_FMODE_SOM);
+       LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n",
+               MDS_OPEN_CREATED);
+       LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n",
+               MDS_OPEN_CROSS);
+       LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n",
+               MDS_OPEN_CREAT);
+       LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n",
+               MDS_OPEN_EXCL);
+       LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n",
+               MDS_OPEN_TRUNC);
+       LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n",
+               MDS_OPEN_APPEND);
+       LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n",
+               MDS_OPEN_SYNC);
+       LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n",
+               MDS_OPEN_DIRECTORY);
+       LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_BY_FID);
+       LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_DELAY_CREATE);
+       LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_OWNEROVERRIDE);
+       LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_JOIN_FILE);
+       LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_LOCK);
+       LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_HAS_EA);
+       LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n",
+               MDS_OPEN_HAS_OBJS);
+       LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_NORESTORE);
+       LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_NEWSTRIPE);
+       LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n",
+                       (long long)MDS_OPEN_VOLATILE);
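+       /*
+        * The low-order MDS_OPEN_* values mirror the classic octal O_* open
+        * flags (O_CREAT == 0100, O_EXCL == 0200, O_TRUNC == 01000, ...);
+        * the high bits are Lustre-specific extensions.
+        */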
+       LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n",
+               LUSTRE_SYNC_FL);
+       LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n",
+               LUSTRE_IMMUTABLE_FL);
+       LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n",
+               LUSTRE_APPEND_FL);
+       LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n",
+               LUSTRE_NOATIME_FL);
+       LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n",
+               LUSTRE_DIRSYNC_FL);
+       LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n",
+               MDS_INODELOCK_LOOKUP);
+       LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n",
+               MDS_INODELOCK_UPDATE);
+       LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n",
+               MDS_INODELOCK_OPEN);
+       LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n",
+               MDS_INODELOCK_LAYOUT);
+
+       /* Checks for struct mdt_ioepoch */
+       LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_ioepoch));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, handle));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, ioepoch));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, flags));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags));
+       LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_ioepoch, padding));
+       LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding));
+
+       /* Checks for struct mdt_remote_perm */
+       LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_remote_perm));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_uid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_gid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm));
+       LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_remote_perm, rp_padding));
+       LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding));
+       LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETUID_PERM);
+       LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETGID_PERM);
+       LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_SETGRP_PERM);
+       LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_RMTACL_PERM);
+       LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n",
+               (unsigned)CFS_RMTOWN_PERM);
+
+       /* Checks for struct mdt_rec_setattr */
+       LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_setattr));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5));
+
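+       /*
+        * Each field check below is an LASSERTF() pair pinning the
+        * field's offsetof() and sizeof() against hard-coded wire-format
+        * values; on a mismatch the assertion prints the value actually
+        * found.  The uniform shape of the checks suggests they are
+        * machine-generated, with LASSERTF() presumed to be Lustre's
+        * printf-style run-time assertion macro.
+        */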
+       /* Checks for struct mdt_rec_create */
+       LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_create));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_rdev));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_umask));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask));
+       LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4));
+
+       /* Checks for struct mdt_rec_link */
+       LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_link));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9));
+
+       /* Checks for struct mdt_rec_unlink */
+       LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_unlink));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9));
+
+       /* Checks for struct mdt_rec_rename */
+       LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_rename));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8));
+
+       /* Checks for struct mdt_rec_setxattr */
+       LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_setxattr));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10));
+       LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11));
+       LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+
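+       /*
+        * Every mdt_rec_* record checked above shares the same 136-byte
+        * footprint and an identical credential prefix (opcode, cap,
+        * fsuid/fsgid and suppgid pairs at offsets 0 through 36, then a
+        * FID at offset 40).  struct mdt_rec_reint, checked next, matches
+        * that layout field for field, presumably so that any of the
+        * variants can be overlaid as the generic reintegration record
+        * when a request is unpacked.
+        */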
+       /* Checks for struct mdt_rec_reint */
+       LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_reint));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
+       LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
+       LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
+
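+       /*
+        * The same layout invariants could in principle be pinned at
+        * compile time instead; a minimal sketch (not code from this
+        * driver, assuming a C11 compiler) would be:
+        *
+        *   _Static_assert(offsetof(struct mdt_rec_reint, rr_umask) == 128,
+        *                  "rr_umask moved; wire format would break");
+        *
+        * The run-time LASSERTF() form used here has the advantage of
+        * reporting the offset it actually found.
+        */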
+       /* Checks for struct lmv_desc */
+       LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct lmv_desc));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_pattern));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
+       LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_desc, ld_uuid));
+       LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
+
+       /* Checks for struct lmv_stripe_md */
+       LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct lmv_stripe_md));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_magic));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_count));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_master));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_padding));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding));
+       CLASSERT(LOV_MAXPOOLNAME == 16);
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16]));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]));
+       LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0]));
+       LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]));
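+       /*
+        * Note: one-past-the-end checks such as mea_pool_name[16] above
+        * (and name[4], mgs_param[1024] below) are intentional: both
+        * offsetof() and sizeof() are evaluated at compile time without
+        * dereferencing anything, so indexing an array by its declared
+        * bound asserts the array's end offset and per-element size.
+        */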
+
+       /* Checks for struct lov_desc */
+       LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct lov_desc));
+       LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
+       LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_pattern));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
+       LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_0));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
+       LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_1));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
+       LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_padding_2));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
+       LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lov_desc, ld_uuid));
+       LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
+       CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C);
+
+       /* Checks for struct ldlm_res_id */
+       LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_res_id));
+       CLASSERT(RES_NAME_SIZE == 4);
+       LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_res_id, name[4]));
+       LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
+
+       /* Checks for struct ldlm_extent */
+       LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_extent));
+       LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, start));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
+       LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, end));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
+       LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_extent, gid));
+       LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
+
+       /* Checks for struct ldlm_inodebits */
+       LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_inodebits));
+       LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_inodebits, bits));
+       LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
+
+       /* Checks for struct ldlm_flock_wire */
+       LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_flock_wire));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
+       LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
+       LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
+
+       /* Checks for struct ldlm_intent */
+       LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_intent));
+       LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_intent, opc));
+       LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
+
+       /* Checks for struct ldlm_resource_desc */
+       LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_resource_desc));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
+       LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
+       LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
+
+       /* Checks for struct ldlm_lock_desc */
+       LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
+       LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
+       LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
+
+       /* Checks for struct ldlm_request */
+       LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_request));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_flags));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_count));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_desc));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_request, lock_handle));
+       LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
+
+       /* Checks for struct ldlm_reply */
+       LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_reply));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_flags));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_padding));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_desc));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_handle));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
+       LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
+       LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
+
+       /* Checks for struct ost_lvb_v1 */
+       LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct ost_lvb_v1));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
+       LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
+       LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
+
+       /* Checks for struct ost_lvb */
+       LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct ost_lvb));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_size));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_atime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
+       LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct ost_lvb, lvb_padding));
+       LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
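+       /* Note: ost_lvb is ost_lvb_v1 (40 bytes) extended with the
+        * *_ns nanosecond timestamp fields plus trailing padding,
+        * which accounts for the 56-byte size asserted above. */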
+
+       /* Checks for struct lquota_lvb */
+       LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct lquota_lvb));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
+       LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
+       LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
+       LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
+                (long long)LQUOTA_FL_EDQUOT);
+
+       /* Checks for struct ldlm_gl_lquota_desc */
+       LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
+       LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
+       LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
+
+       /* Checks for struct mgs_send_param */
+       LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
+                (long long)(int)sizeof(struct mgs_send_param));
+       CLASSERT(MGS_PARAM_MAXLEN == 1024);
+       LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
+       LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
+
+       /* Checks for struct cfg_marker */
+       LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
+                (long long)(int)sizeof(struct cfg_marker));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_step));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_flags));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_vers));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_padding));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+       LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct cfg_marker, cm_comment));
+       LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
+       /* Checks for struct llog_logid */
+       LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(struct llog_logid));
+       LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid, lgl_oi));
+       LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
+       LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid, lgl_ogen));
+       LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
+       CLASSERT(OST_SZ_REC == 274730752);
+       CLASSERT(MDS_UNLINK_REC == 274801668);
+       CLASSERT(MDS_UNLINK64_REC == 275325956);
+       CLASSERT(MDS_SETATTR64_REC == 275325953);
+       CLASSERT(OBD_CFG_REC == 274857984);
+       CLASSERT(LLOG_GEN_REC == 274989056);
+       CLASSERT(CHANGELOG_REC == 275120128);
+       CLASSERT(CHANGELOG_USER_REC == 275185664);
+       CLASSERT(LLOG_HDR_MAGIC == 275010873);
+       CLASSERT(LLOG_LOGID_MAGIC == 275010875);
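+       /*
+        * Note: the decimal record-type values above all carry
+        * LLOG_OP_MAGIC (0x10600000) in their upper bits, e.g.
+        * LLOG_HDR_MAGIC == 275010873 == 0x10645539.
+        */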
+
+       /* Checks for struct llog_catid */
+       LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct llog_catid));
+       LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_logid));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding1));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding2));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
+       LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_catid, lci_padding3));
+       LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
+
+       /* Checks for struct llog_rec_hdr */
+       LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct llog_rec_hdr));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
+       LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
+       LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
+
+       /* Checks for struct llog_rec_tail */
+       LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct llog_rec_tail));
+       LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
+       LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
+       LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
+       LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
+
+       /* Checks for struct llog_logid_rec */
+       LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_logid_rec));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_id));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
+       LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
+       LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
+
+       /* Checks for struct llog_unlink_rec */
+       LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llog_unlink_rec));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
+       LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
+       LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
+
+       /* Checks for struct llog_unlink64_rec */
+       LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_unlink64_rec));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
+       LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
+       LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
+
+       /* Checks for struct llog_setattr64_rec */
+       LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_setattr64_rec));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding));
+       LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+       LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
+       /* Checks for struct llog_size_change_rec */
+       LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_size_change_rec));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
+       LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
+       LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
+
+       /* Checks for struct changelog_rec */
+       LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_rec));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_namelen));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_flags));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_type));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_index));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_prev));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_time));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_tfid));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
+       LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_rec, cr_pfid));
+       LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
+
+       /* Checks for struct changelog_ext_rec */
+       LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_ext_rec));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_flags));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_type));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_index));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_prev));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_time));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid));
+       LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid));
+       LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid));
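+       /* Note: changelog_ext_rec matches changelog_rec byte-for-byte
+        * over the first 64 bytes and appends cr_sfid/cr_spfid (the
+        * rename-source FID and its parent), giving 96 bytes total. */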
+
+       /* Checks for struct changelog_setinfo */
+       LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
+                (long long)(int)sizeof(struct changelog_setinfo));
+       LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
+       LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
+       LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct changelog_setinfo, cs_id));
+       LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
+
+       /* Checks for struct llog_changelog_rec */
+       LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
+                (long long)(int)sizeof(struct llog_changelog_rec));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
+       LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_rec, cr_tail));
+       LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail));
+
+       /* Checks for struct llog_changelog_user_rec */
+       LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llog_changelog_user_rec));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
+       LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
+       LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
+
+       /* Checks for struct llog_gen */
+       LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(struct llog_gen));
+       LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen, mnt_cnt));
+       LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
+       LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen, conn_cnt));
+       LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
+
+       /* Checks for struct llog_gen_rec */
+       LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct llog_gen_rec));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
+       LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
+       LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
+
+       /* Checks for struct llog_log_hdr */
+       LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
+                (long long)(int)sizeof(struct llog_log_hdr));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_count));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_size));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
+       LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
+                (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
+       LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
+
+       /* Checks for struct llog_cookie */
+       LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct llog_cookie));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_index));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
+       LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llog_cookie, lgc_padding));
+       LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
+
+       /* Checks for struct llogd_body */
+       LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct llogd_body));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_logid));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_index));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_len));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
+       LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
+       LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
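+       /* CLASSERT() is Lustre's compile-time assertion; the llog RPC
+        * opcodes below are wire-protocol constants, so renumbering
+        * any of them fails the build rather than interoperability. */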
+       CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501);
+       CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502);
+       CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503);
+       CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504);
+       CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505);
+       CLASSERT(LLOG_ORIGIN_CONNECT == 506);
+       CLASSERT(LLOG_CATINFO == 507);
+       CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508);
+       CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509);
+       CLASSERT(LLOG_FIRST_OPC == 501);
+       CLASSERT(LLOG_LAST_OPC == 510);
+
+       /* Checks for struct llogd_conn_body */
+       LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
+                (long long)(int)sizeof(struct llogd_conn_body));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
+       LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
+       LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
+
+       /* Checks for struct ll_fiemap_info_key */
+       LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fiemap_info_key));
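+       /* offsetof(..., name[8]) addresses one element past the end of
+        * the 8-byte name[] array; asserting it equals 8 pins both the
+        * array's start (offset 0) and its extent, which a plain
+        * offsetof(..., name) check would not. */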
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, name[8]));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]));
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, oa));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa));
+       LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap));
+       LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap));
+
+       /* Checks for struct quota_body */
+       LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n",
+                (long long)(int)sizeof(struct quota_body));
+       LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_fid));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_fid));
+       LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_id));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_id));
+       LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_flags));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_flags));
+       LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_padding));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_padding));
+       LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_count));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_count));
+       LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_usage));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_usage));
+       LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_slv_ver));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver));
+       LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_lockh));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh));
+       LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_glb_lockh));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh));
+       LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct quota_body, qb_padding1[4]));
+       LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4]));
+
+       /* Checks for struct mgs_target_info */
+       LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
+                (long long)(int)sizeof(struct mgs_target_info));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_flags));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_instance));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_svname));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_nids));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
+       LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
+                (long long)(int)offsetof(struct mgs_target_info, mti_params));
+       LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
+                (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
+
+       /* Checks for struct lustre_capa */
+       LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_capa));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_fid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_opc));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_uid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_gid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_flags));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_keyid));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_timeout));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
+       LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_expiry));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
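+       /* The lc_hmac[64] one-past-the-end offset equaling the struct
+        * size (120) shows that the HMAC buffer, bounded by
+        * CAPA_HMAC_MAX_LEN, runs from offset 56 to the end. */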
+       CLASSERT(CAPA_HMAC_MAX_LEN == 64);
+       LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
+       LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
+
+       /* Checks for struct lustre_capa_key */
+       LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct lustre_capa_key));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_padding));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding));
+       CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56);
+       LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct lustre_capa_key, lk_key[56]));
+       LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]));
+
+       /* Checks for struct getinfo_fid2path */
+       LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct getinfo_fid2path));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_fid));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_recno));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno));
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen));
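+       /* gf_path is a zero-length trailing array: its first element
+        * sits at the struct size (32), and the gf_pathlen bytes of
+        * path data follow the fixed header on the wire. */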
+       LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0]));
+       LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n",
+                (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]));
+
+       /* Checks for struct ll_user_fiemap */
+       LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct ll_user_fiemap));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_start));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_length));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_flags));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved));
+       LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct ll_user_fiemap, fm_extents));
+       LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents));
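+       /* FIEMAP_FLAG_SYNC and FIEMAP_FLAG_XATTR match the kernel
+        * fiemap ABI; FIEMAP_FLAG_DEVICE_ORDER appears to be a Lustre
+        * extension, parked in a high bit clear of upstream flags. */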
+       CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001);
+       CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002);
+       CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000);
+
+       /* Checks for struct ll_fiemap_extent */
+       LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct ll_fiemap_extent));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_length));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length));
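+       /* Nothing is asserted for offsets 24..39; that span is
+        * reserved space (presumably mirroring fe_reserved64[2] in the
+        * kernel's struct fiemap_extent) and carries no wire data. */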
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags));
+       LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct ll_fiemap_extent, fe_device));
+       LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device));
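+       /* The first ten FIEMAP_EXTENT_* values track the kernel fiemap
+        * ABI; FIEMAP_EXTENT_NO_DIRECT and FIEMAP_EXTENT_NET look to
+        * be Lustre additions placed in the top bits. */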
+       CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001);
+       CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002);
+       CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004);
+       CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008);
+       CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080);
+       CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100);
+       CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200);
+       CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400);
+       CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800);
+       CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000);
+       CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000);
+       CLASSERT(FIEMAP_EXTENT_NET == 0x80000000);
+
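+       /* The two posix_acl_xattr types below are the standard VFS ACL
+        * xattr layout; Lustre forwards ACLs in this format, so the
+        * checker pins it alongside the Lustre-private structures. */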
+       /* Checks for type posix_acl_xattr_entry */
+       LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n",
+                (long long)(int)sizeof(posix_acl_xattr_entry));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_tag));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_perm));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm));
+       LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_entry, e_id));
+       LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id));
+
+       /* Checks for type posix_acl_xattr_header */
+       LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n",
+                (long long)(int)sizeof(posix_acl_xattr_header));
+       LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_header, a_version));
+       LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version));
+       LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n",
+                (long long)(int)offsetof(posix_acl_xattr_header, a_entries));
+       LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n",
+                (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries));
+
+       /* Checks for struct link_ea_header */
+       LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct link_ea_header));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_magic));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_reccount));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount));
+       LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, leh_len));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len));
+       LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, padding1));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->padding1));
+       LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_header, padding2));
+       LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_header *)0)->padding2));
+       CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL);
+
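+       /* A 16-byte lee_parent_fid at offset 2 and an 18-byte total
+        * size only work if link_ea_entry is byte-packed; each entry
+        * records one hard link's parent FID plus its name, with
+        * lee_name below being another zero-length tail. */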
+       /* Checks for struct link_ea_entry */
+       LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n",
+                (long long)(int)sizeof(struct link_ea_entry));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_reclen));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid));
+       LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n",
+                (long long)(int)offsetof(struct link_ea_entry, lee_name));
+       LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name));
+
+       /* Checks for struct layout_intent */
+       LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct layout_intent));
+       LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_opc));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_opc));
+       LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_flags));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
+       LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_start));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
+       LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_end));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+       LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
+                (long long)LAYOUT_INTENT_ACCESS);
+       LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
+                (long long)LAYOUT_INTENT_READ);
+       LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n",
+                (long long)LAYOUT_INTENT_WRITE);
+       LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n",
+                (long long)LAYOUT_INTENT_GLIMPSE);
+       LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n",
+                (long long)LAYOUT_INTENT_TRUNC);
+       LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n",
+                (long long)LAYOUT_INTENT_RELEASE);
+       LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n",
+                (long long)LAYOUT_INTENT_RESTORE);
+
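+       /* The hsm_* structures below carry the HSM (hierarchical
+        * storage management) request and progress protocol between
+        * clients, the coordinator and copytools; variable payloads
+        * again ride in zero-length tails (hai_data, hal_fsname). */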
+       /* Checks for struct hsm_action_item */
+       LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_action_item));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_len));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_action));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_fid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_dfid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_extent));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_cookie));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_gid));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid));
+       LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_item, hai_data));
+       LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data));
+
+       /* Checks for struct hsm_action_list */
+       LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_action_list));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_version));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_count));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_compound_id));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_flags));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id));
+       LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, padding1));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1));
+       LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_action_list, hal_fsname));
+       LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname));
+
+       /* Checks for struct hsm_progress */
+       LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_progress));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_fid));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_cookie));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_extent));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_flags));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags));
+       LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, hp_errval));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval));
+       LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress, padding));
+       LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress *)0)->padding));
+       LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n",
+                HP_FLAG_COMPLETED);
+       LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n",
+                HP_FLAG_RETRY);
+
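+       /* Checks for struct hsm_copy (no total-size assertion is
+        * emitted here; the member checks below imply 88 bytes: a
+        * 16-byte fixed part followed by the 72-byte hc_hai) */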
+       LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_data_version));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_flags));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_errval));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval));
+       LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, padding));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->padding));
+       LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_copy, hc_hai));
+       LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai));
+
+       /* Checks for struct hsm_progress_kernel */
+       LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_progress_kernel));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version));
+       LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2));
+       LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2));
+
+       /* Checks for struct hsm_user_item */
+       LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_item));
+       LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_item, hui_fid));
+       LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid));
+       LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_item, hui_extent));
+       LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent));
+
+       /* Checks for struct hsm_user_state */
+       LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_state));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_states));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action));
+       LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location));
+       LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location));
+
+       /* Checks for struct hsm_state_set */
+       LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_state_set));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_valid));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_setmask));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask));
+       LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_state_set, hss_clearmask));
+       LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask));
+
+       /* Checks for struct hsm_current_action */
+       LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_current_action));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_state));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_action));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action));
+       LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_current_action, hca_location));
+       LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location));
+
+       /* Checks for struct hsm_request */
+       LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_request));
+       LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_action));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_action));
+       LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_archive_id));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id));
+       LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_flags));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags));
+       LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_itemcount));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount));
+       LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_request, hr_data_len));
+       LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len));
+       LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)HSM_FORCE_ACTION);
+       LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n",
+               (unsigned)HSM_GHOST_COPY);
+
+       /* Checks for struct hsm_user_request */
+       LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(struct hsm_user_request));
+       LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_request, hur_request));
+       LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request));
+       LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct hsm_user_request, hur_user_item));
+       LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item));
+
+       /* Checks for struct update_buf */
+       LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct update_buf));
+       LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_magic));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_magic));
+       LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_count));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_count));
+       LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update_buf, ub_bufs));
+       LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs));
+
+       /* Checks for struct update_reply */
+       LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n",
+                (long long)(int)sizeof(struct update_reply));
+       LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_version));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_version));
+       LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_count));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_count));
+       LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update_reply, ur_lens));
+       LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update_reply *)0)->ur_lens));
+
+       /* Checks for struct update */
+       LASSERTF((int)sizeof(struct update) == 56, "found %lld\n",
+                (long long)(int)sizeof(struct update));
+       LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_type));
+       LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_type));
+       LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_batchid));
+       LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_batchid));
+       LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_fid));
+       LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_fid));
+       LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_lens));
+       LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_lens));
+       LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct update, u_bufs));
+       LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n",
+                (long long)(int)sizeof(((struct update *)0)->u_bufs));
+}
index dd98cb1468a45d5aadb911bbcb3476302ec05fc5..b529d79d0661dc56f92a0de1cdb1b4019acac648 100644 (file)
@@ -1020,12 +1020,12 @@ static int xlr_net_probe(struct platform_device *pdev)
                goto err_gmac;
        }
 
-       ndev->base_addr = (unsigned long) devm_request_and_ioremap
+       ndev->base_addr = (unsigned long) devm_ioremap_resource
                (&pdev->dev, res);
-       if (!ndev->base_addr) {
+       if (IS_ERR_VALUE(ndev->base_addr)) {
                dev_err(&pdev->dev,
-                               "devm_request_and_ioremap failed\n");
-               return -EBUSY;
+                               "devm_ioremap_resource failed\n");
+               return ndev->base_addr;
        }
 
        res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
index a0ec52a4114f277938ba274b9e4cfe8632a749d7..c17a1c3eb3cad47899fca4f0e8c3db56bc3caab7 100644 (file)
@@ -126,7 +126,7 @@ static int nvec_kbd_probe(struct platform_device *pdev)
        for (i = 0; i < ARRAY_SIZE(extcode_tab_us102); ++i)
                keycodes[j++] = extcode_tab_us102[i];
 
-       idev = input_allocate_device();
+       idev = devm_input_allocate_device(&pdev->dev);
+       if (!idev)
+               return -ENOMEM;
        idev->name = "nvec keyboard";
        idev->phys = "nvec";
        idev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP) | BIT_MASK(EV_LED);
@@ -142,7 +142,7 @@ static int nvec_kbd_probe(struct platform_device *pdev)
        clear_bit(0, idev->keybit);
        err = input_register_device(idev);
        if (err)
-               goto fail;
+               return err;
 
        keys_dev.input = idev;
        keys_dev.notifier.notifier_call = nvec_keys_notifier;
@@ -161,10 +161,6 @@ static int nvec_kbd_probe(struct platform_device *pdev)
        nvec_write_async(nvec, clear_leds, sizeof(clear_leds));
 
        return 0;
-
-fail:
-       input_free_device(idev);
-       return err;
 }
 
 static int nvec_kbd_remove(struct platform_device *pdev)
@@ -177,8 +173,6 @@ static int nvec_kbd_remove(struct platform_device *pdev)
        nvec_write_async(nvec, disable_kbd, 2);
        nvec_unregister_notifier(nvec, &keys_dev.notifier);
 
-       input_unregister_device(keys_dev.input);
-
        return 0;
 }
 
diff --git a/drivers/staging/octeon-usb/Kconfig b/drivers/staging/octeon-usb/Kconfig
new file mode 100644 (file)
index 0000000..018af6d
--- /dev/null
@@ -0,0 +1,10 @@
+config OCTEON_USB
+       tristate "Cavium Networks Octeon USB support"
+       depends on CPU_CAVIUM_OCTEON && USB
+       help
+         This driver supports the USB host controller found on some
+         Cavium Networks products in the Octeon family.
+
+         To compile this driver as a module, choose M here. The module
+         will be called octeon-usb.
+
diff --git a/drivers/staging/octeon-usb/Makefile b/drivers/staging/octeon-usb/Makefile
new file mode 100644 (file)
index 0000000..89df1ad
--- /dev/null
@@ -0,0 +1,3 @@
+obj-$(CONFIG_OCTEON_USB) := octeon-usb.o
+octeon-usb-y := octeon-hcd.o
+octeon-usb-y += cvmx-usb.o
diff --git a/drivers/staging/octeon-usb/TODO b/drivers/staging/octeon-usb/TODO
new file mode 100644 (file)
index 0000000..cc58a7e
--- /dev/null
@@ -0,0 +1,11 @@
+This driver is functional and has been tested on EdgeRouter Lite with
+USB mass storage.
+
+TODO:
+       - kernel coding style
+       - checkpatch warnings
+       - dead code elimination
+       - device tree bindings
+       - possibly eliminate the extra "hardware abstraction layer"
+
+Contact: Aaro Koskinen <aaro.koskinen@iki.fi>
diff --git a/drivers/staging/octeon-usb/cvmx-usb.c b/drivers/staging/octeon-usb/cvmx-usb.c
new file mode 100644 (file)
index 0000000..bf36649
--- /dev/null
@@ -0,0 +1,3229 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export  control
+ * laws, including the U.S. Export Administration Act and its  associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM  NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION OR
+ * DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * @file
+ *
+ * "cvmx-usb.c" defines a set of low level USB functions to help
+ * developers create Octeon USB drivers for various operating
+ * systems. These functions provide a generic API to the Octeon
+ * USB blocks, hiding the internal hardware specific
+ * operations.
+ *
+ * <hr>$Revision: 32636 $<hr>
+ */
+#include <linux/delay.h>
+#include <asm/octeon/cvmx.h>
+#include <asm/octeon/octeon.h>
+#include <asm/octeon/cvmx-sysinfo.h>
+#include "cvmx-usbnx-defs.h"
+#include "cvmx-usbcx-defs.h"
+#include "cvmx-usb.h"
+#include <asm/octeon/cvmx-helper.h>
+#include <asm/octeon/cvmx-helper-board.h>
+
+#define CVMX_PREFETCH0(address) CVMX_PREFETCH(address, 0)
+#define CVMX_PREFETCH128(address) CVMX_PREFETCH(address, 128)
+/* a normal prefetch */
+#define CVMX_PREFETCH(address, offset) CVMX_PREFETCH_PREF0(address, offset)
+/* normal prefetches that use the pref instruction */
+#define CVMX_PREFETCH_PREFX(X, address, offset) asm volatile ("pref %[type], %[off](%[rbase])" : : [rbase] "d" (address), [off] "I" (offset), [type] "n" (X))
+#define CVMX_PREFETCH_PREF0(address, offset) CVMX_PREFETCH_PREFX(0, address, offset)
+#define CVMX_CLZ(result, input) asm ("clz %[rd],%[rs]" : [rd] "=d" (result) : [rs] "d" (input))
+
+#define cvmx_likely likely
+#define cvmx_wait_usec udelay
+#define cvmx_unlikely unlikely
+#define cvmx_le16_to_cpu le16_to_cpu
+
+#define MAX_RETRIES         3   /* Maximum number of times to retry failed transactions */
+#define MAX_PIPES           32  /* Maximum number of pipes that can be open at once */
+#define MAX_TRANSACTIONS    256 /* Maximum number of outstanding transactions across all pipes */
+#define MAX_CHANNELS        8   /* Maximum number of hardware channels supported by the USB block */
+#define MAX_USB_ADDRESS     127 /* The highest valid USB device address */
+#define MAX_USB_ENDPOINT    15  /* The highest valid USB endpoint number */
+#define MAX_USB_HUB_PORT    15  /* The highest valid port number on a hub */
+#define MAX_TRANSFER_BYTES  ((1<<19)-1) /* The low level hardware can transfer a maximum of this number of bytes in each transfer. The field is 19 bits wide */
+#define MAX_TRANSFER_PACKETS ((1<<10)-1) /* The low level hardware can transfer a maximum of this number of packets in each transfer. The field is 10 bits wide */
+
+/* These defines disable the normal read and write csr. This is so I can add
+    extra debug stuff to the usb specific version and I won't use the normal
+    version by mistake */
+#define cvmx_read_csr use_cvmx_usb_read_csr64_instead_of_cvmx_read_csr
+#define cvmx_write_csr use_cvmx_usb_write_csr64_instead_of_cvmx_write_csr
+
+typedef enum {
+    __CVMX_USB_TRANSACTION_FLAGS_IN_USE = 1<<16,
+} cvmx_usb_transaction_flags_t;
+
+enum {
+       USB_CLOCK_TYPE_REF_12,
+       USB_CLOCK_TYPE_REF_24,
+       USB_CLOCK_TYPE_REF_48,
+       USB_CLOCK_TYPE_CRYSTAL_12,
+};
+
+/**
+ * Logical transactions may take numerous low level
+ * transactions, especially when splits are concerned. This
+ * enum represents all of the possible stages a transaction can
+ * be in. Note that split completes are always odd. This is so
+ * the NAK handler can back up to the previous low level
+ * transaction with a simple clearing of bit 0.
+ */
+typedef enum {
+    CVMX_USB_STAGE_NON_CONTROL,
+    CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_SETUP,
+    CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_DATA,
+    CVMX_USB_STAGE_DATA_SPLIT_COMPLETE,
+    CVMX_USB_STAGE_STATUS,
+    CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE,
+} cvmx_usb_stage_t;
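+
+/*
+ * Editor's sketch (illustrative, not used by the driver): because every
+ * *_SPLIT_COMPLETE stage above is odd, the NAK handler's "back up one
+ * stage" operation is a single bit-clear.
+ */
+#if 0
+static inline cvmx_usb_stage_t __cvmx_usb_stage_before_split_complete(cvmx_usb_stage_t stage)
+{
+    /* e.g. CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE (3) -> CVMX_USB_STAGE_SETUP (2) */
+    return (cvmx_usb_stage_t)(stage & ~1);
+}
+#endif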
+
+/**
+ * This structure describes each pending USB transaction
+ * regardless of type. These are linked together to form a list
+ * of pending requests for a pipe.
+ */
+typedef struct cvmx_usb_transaction {
+    struct cvmx_usb_transaction *prev;  /**< Transaction before this one in the pipe */
+    struct cvmx_usb_transaction *next;  /**< Transaction after this one in the pipe */
+    cvmx_usb_transfer_t type;           /**< Type of transaction, duplicated from the pipe */
+    cvmx_usb_transaction_flags_t flags; /**< State flags for this transaction */
+    uint64_t buffer;                    /**< User's physical buffer address to read/write */
+    int buffer_length;                  /**< Size of the user's buffer in bytes */
+    uint64_t control_header;            /**< For control transactions, physical address of the 8 byte standard header */
+    int iso_start_frame;                /**< For ISO transactions, the starting frame number */
+    int iso_number_packets;             /**< For ISO transactions, the number of packets in the request */
+    cvmx_usb_iso_packet_t *iso_packets; /**< For ISO transactions, the sub packets in the request */
+    int xfersize;
+    int pktcnt;
+    int retries;
+    int actual_bytes;                   /**< Actual bytes transferred for this transaction */
+    cvmx_usb_stage_t stage;             /**< For control transactions, the current stage */
+    cvmx_usb_callback_func_t callback;  /**< User's callback function when complete */
+    void *callback_data;                /**< User's data */
+} cvmx_usb_transaction_t;
+
+/**
+ * A pipe represents a virtual connection between Octeon and some
+ * USB device. It contains a list of pending requests to the device.
+ */
+typedef struct cvmx_usb_pipe {
+    struct cvmx_usb_pipe *prev;         /**< Pipe before this one in the list */
+    struct cvmx_usb_pipe *next;         /**< Pipe after this one in the list */
+    cvmx_usb_transaction_t *head;       /**< The first pending transaction */
+    cvmx_usb_transaction_t *tail;       /**< The last pending transaction */
+    uint64_t interval;                  /**< For periodic pipes, the interval between packets in frames */
+    uint64_t next_tx_frame;             /**< The next frame this pipe is allowed to transmit on */
+    cvmx_usb_pipe_flags_t flags;        /**< State flags for this pipe */
+    cvmx_usb_speed_t device_speed;      /**< Speed of device connected to this pipe */
+    cvmx_usb_transfer_t transfer_type;  /**< Type of transaction supported by this pipe */
+    cvmx_usb_direction_t transfer_dir;  /**< IN or OUT. Ignored for Control */
+    int multi_count;                    /**< Max packets in a row for the device */
+    uint16_t max_packet;                /**< The device's maximum packet size in bytes */
+    uint8_t device_addr;                /**< USB device address at other end of pipe */
+    uint8_t endpoint_num;               /**< USB endpoint number at other end of pipe */
+    uint8_t hub_device_addr;            /**< Hub address this device is connected to */
+    uint8_t hub_port;                   /**< Hub port this device is connected to */
+    uint8_t pid_toggle;                 /**< This toggles between 0/1 on every packet sent to track the data PID needed */
+    uint8_t channel;                    /**< Hardware DMA channel for this pipe */
+    int8_t  split_sc_frame;             /**< The low order bits of the frame number the split complete should be sent on */
+} cvmx_usb_pipe_t;
+
+typedef struct {
+    cvmx_usb_pipe_t *head;              /**< Head of the list, or NULL if empty */
+    cvmx_usb_pipe_t *tail;              /**< Tail of the list, or NULL if empty */
+} cvmx_usb_pipe_list_t;
+
+typedef struct {
+    struct {
+        int channel;
+        int size;
+        uint64_t address;
+    } entry[MAX_CHANNELS+1];
+    int head;
+    int tail;
+} cvmx_usb_tx_fifo_t;
+
+/**
+ * The state of the USB block is stored in this structure
+ */
+typedef struct {
+    int init_flags;                     /**< Flags passed to initialize */
+    int index;                          /**< Which USB block this is for */
+    int idle_hardware_channels;         /**< Bit set for every idle hardware channel */
+    cvmx_usbcx_hprt_t usbcx_hprt;       /**< Stored port status so we don't need to read a CSR to determine splits */
+    cvmx_usb_pipe_t *pipe_for_channel[MAX_CHANNELS];    /**< Map channels to pipes */
+    cvmx_usb_transaction_t *free_transaction_head;      /**< List of free transactions head */
+    cvmx_usb_transaction_t *free_transaction_tail;      /**< List of free transactions tail */
+    cvmx_usb_pipe_t pipe[MAX_PIPES];                    /**< Storage for pipes */
+    cvmx_usb_transaction_t transaction[MAX_TRANSACTIONS];       /**< Storage for transactions */
+    cvmx_usb_callback_func_t callback[__CVMX_USB_CALLBACK_END]; /**< User global callbacks */
+    void *callback_data[__CVMX_USB_CALLBACK_END];               /**< User data for each callback */
+    int indent;                         /**< Used by debug output to indent functions */
+    cvmx_usb_port_status_t port_status; /**< Last port status used for change notification */
+    cvmx_usb_pipe_list_t free_pipes;    /**< List of all pipes that are currently closed */
+    cvmx_usb_pipe_list_t idle_pipes;    /**< List of open pipes that have no transactions */
+    cvmx_usb_pipe_list_t active_pipes[4]; /**< Active pipes indexed by transfer type */
+    uint64_t frame_number;              /**< Increments every SOF interrupt for time keeping */
+    cvmx_usb_transaction_t *active_split; /**< Points to the current active split, or NULL */
+    cvmx_usb_tx_fifo_t periodic;
+    cvmx_usb_tx_fifo_t nonperiodic;
+} cvmx_usb_internal_state_t;
+
+/* This macro logs out whenever a function is called if debugging is on */
+#define CVMX_USB_LOG_CALLED() \
+    do { \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS)) \
+            cvmx_dprintf("%*s%s: called\n", 2*usb->indent++, "", __FUNCTION__); \
+    } while (0)
+
+/* This macro logs out each function parameter if debugging is on */
+#define CVMX_USB_LOG_PARAM(format, param) \
+    do { \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS)) \
+            cvmx_dprintf("%*s%s: param %s = " format "\n", 2*usb->indent, "", __FUNCTION__, #param, param); \
+    } while (0)
+
+/* This macro logs out when a function returns a value */
+#define CVMX_USB_RETURN(v)                                              \
+    do {                                                                \
+        typeof(v) r = v;                                                \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))    \
+            cvmx_dprintf("%*s%s: returned %s(%d)\n", 2*--usb->indent, "", __FUNCTION__, #v, r); \
+        return r;                                                       \
+    } while (0)
+
+/* This macro logs out when a function doesn't return a value */
+#define CVMX_USB_RETURN_NOTHING()                                       \
+    do {                                                                \
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))    \
+            cvmx_dprintf("%*s%s: returned\n", 2*--usb->indent, "", __FUNCTION__); \
+        return;                                                         \
+    } while (0)
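+
+/*
+ * Editor's note (illustrative output, derived from the format strings in
+ * the macros above): with CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS set, a
+ * traced call prints an indented log along the lines of:
+ *
+ *   cvmx_usb_enable: called
+ *     cvmx_usb_enable: param state = 0x...
+ *   cvmx_usb_enable: returned CVMX_USB_SUCCESS(0)
+ */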
+
+/* This macro spins on a field waiting for it to reach a value */
+#define CVMX_WAIT_FOR_FIELD32(address, type, field, op, value, timeout_usec)\
+    ({int result;                                                       \
+    do {                                                                \
+        uint64_t done = cvmx_get_cycle() + (uint64_t)timeout_usec *     \
+                       octeon_get_clock_rate() / 1000000;              \
+        type c;                                                         \
+        while (1)                                                       \
+        {                                                               \
+            c.u32 = __cvmx_usb_read_csr32(usb, address);                \
+            if (c.s.field op (value)) {                                 \
+                result = 0;                                             \
+                break;                                                  \
+            } else if (cvmx_get_cycle() > done) {                       \
+                result = -1;                                            \
+                break;                                                  \
+            } else                                                      \
+                cvmx_wait(100);                                         \
+        }                                                               \
+    } while (0);                                                        \
+    result;})
+
+/* This macro logically sets a single field in a CSR. It does the sequence
+    read, modify, and write */
+#define USB_SET_FIELD32(address, type, field, value)\
+    do {                                            \
+        type c;                                     \
+        c.u32 = __cvmx_usb_read_csr32(usb, address);\
+        c.s.field = value;                          \
+        __cvmx_usb_write_csr32(usb, address, c.u32);\
+    } while (0)
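+
+/*
+ * Editor's note (illustrative expansion): a call such as
+ *   USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtpwr, 1);
+ * reads HPRT into a local cvmx_usbcx_hprt_t, sets .s.prtpwr = 1, and
+ * writes the full 32-bit register back.
+ */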
+
+/* Returns the IO address used to push/pop data to/from the FIFOs */
+#define USB_FIFO_ADDRESS(channel, usb_index) (CVMX_USBCX_GOTGCTL(usb_index) + ((channel)+1)*0x1000)
+
+static int octeon_usb_get_clock_type(void)
+{
+       switch (cvmx_sysinfo_get()->board_type) {
+       case CVMX_BOARD_TYPE_BBGW_REF:
+       case CVMX_BOARD_TYPE_LANAI2_A:
+       case CVMX_BOARD_TYPE_LANAI2_U:
+       case CVMX_BOARD_TYPE_LANAI2_G:
+               return USB_CLOCK_TYPE_CRYSTAL_12;
+       }
+
+       /* FIXME: This should use CVMX_BOARD_TYPE_UBNT_E100 */
+       if (OCTEON_IS_MODEL(OCTEON_CN50XX) &&
+           cvmx_sysinfo_get()->board_type == 20002)
+               return USB_CLOCK_TYPE_CRYSTAL_12;
+
+       return USB_CLOCK_TYPE_REF_48;
+}
+
+/**
+ * @INTERNAL
+ * Read a USB 32bit CSR. It performs the necessary address swizzle
+ * for 32bit CSRs.
+ *
+ * @param usb     USB block this access is for
+ * @param address 64bit address to read
+ *
+ * @return Result of the read
+ */
+static inline uint32_t __cvmx_usb_read_csr32(cvmx_usb_internal_state_t *usb,
+                                             uint64_t address)
+{
+    uint32_t result = cvmx_read64_uint32(address ^ 4);
+    return result;
+}
+
+
+/**
+ * @INTERNAL
+ * Write a USB 32bit CSR. It performs the necessary address
+ * swizzle for 32bit CSRs.
+ *
+ * @param usb     USB block this access is for
+ * @param address 64bit address to write
+ * @param value   Value to write
+ */
+static inline void __cvmx_usb_write_csr32(cvmx_usb_internal_state_t *usb,
+                                          uint64_t address, uint32_t value)
+{
+    cvmx_write64_uint32(address ^ 4, value);
+    cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+}
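+
+/*
+ * Editor's note (assumption, for illustration): the "address ^ 4" swizzle
+ * flips bit 2 so a 32-bit access targets the correct half of the 64-bit
+ * CSR window, and the cvmx_read64_uint64() read-back above appears to
+ * serve as a barrier that forces the posted write out before returning.
+ */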
+
+
+/**
+ * @INTERNAL
+ * Read a USB 64bit CSR.
+ *
+ * @param usb     USB block this access is for
+ * @param address 64bit address to read
+ *
+ * @return Result of the read
+ */
+static inline uint64_t __cvmx_usb_read_csr64(cvmx_usb_internal_state_t *usb,
+                                             uint64_t address)
+{
+    uint64_t result = cvmx_read64_uint64(address);
+    return result;
+}
+
+
+/**
+ * @INTERNAL
+ * Write a USB 64bit CSR.
+ *
+ * @param usb     USB block this access is for
+ * @param address 64bit address to write
+ * @param value   Value to write
+ */
+static inline void __cvmx_usb_write_csr64(cvmx_usb_internal_state_t *usb,
+                                          uint64_t address, uint64_t value)
+{
+    cvmx_write64_uint64(address, value);
+}
+
+
+/**
+ * @INTERNAL
+ * Utility function to convert complete codes into strings
+ *
+ * @param complete_code
+ *               Code to convert
+ *
+ * @return Human readable string
+ */
+static const char *__cvmx_usb_complete_to_string(cvmx_usb_complete_t complete_code)
+{
+    switch (complete_code)
+    {
+        case CVMX_USB_COMPLETE_SUCCESS: return "SUCCESS";
+        case CVMX_USB_COMPLETE_SHORT:   return "SHORT";
+        case CVMX_USB_COMPLETE_CANCEL:  return "CANCEL";
+        case CVMX_USB_COMPLETE_ERROR:   return "ERROR";
+        case CVMX_USB_COMPLETE_STALL:   return "STALL";
+        case CVMX_USB_COMPLETE_XACTERR: return "XACTERR";
+        case CVMX_USB_COMPLETE_DATATGLERR: return "DATATGLERR";
+        case CVMX_USB_COMPLETE_BABBLEERR: return "BABBLEERR";
+        case CVMX_USB_COMPLETE_FRAMEERR: return "FRAMEERR";
+    }
+    return "Update __cvmx_usb_complete_to_string";
+}
+
+
+/**
+ * @INTERNAL
+ * Return non zero if this pipe connects to a non HIGH speed
+ * device through a high speed hub.
+ *
+ * @param usb    USB block this access is for
+ * @param pipe   Pipe to check
+ *
+ * @return Non zero if we need to do split transactions
+ */
+static inline int __cvmx_usb_pipe_needs_split(cvmx_usb_internal_state_t *usb, cvmx_usb_pipe_t *pipe)
+{
+    return ((pipe->device_speed != CVMX_USB_SPEED_HIGH) && (usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_HIGH));
+}
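+
+/*
+ * Editor's note: this matches the USB 2.0 split-transaction rule; splits
+ * are only needed when a full- or low-speed device sits behind a
+ * high-speed hub, i.e. the root port runs at high speed but the pipe's
+ * device does not.
+ */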
+
+
+/**
+ * @INTERNAL
+ * Trivial utility function to return the correct PID for a pipe
+ *
+ * @param pipe   pipe to check
+ *
+ * @return PID for pipe
+ */
+static inline int __cvmx_usb_get_data_pid(cvmx_usb_pipe_t *pipe)
+{
+    if (pipe->pid_toggle)
+        return 2; /* Data1 */
+    else
+        return 0; /* Data0 */
+}
+
+
+/**
+ * Return the number of USB ports supported by this Octeon
+ * chip. If the chip doesn't support USB, or is not supported
+ * by this API, zero will be returned. Most Octeon chips
+ * support one USB port, but some support two ports.
+ * cvmx_usb_initialize() must be called on independent
+ * cvmx_usb_state_t structures.
+ *
+ * @return Number of ports, zero if USB isn't supported
+ */
+int cvmx_usb_get_num_ports(void)
+{
+    int arch_ports = 0;
+
+    if (OCTEON_IS_MODEL(OCTEON_CN56XX))
+        arch_ports = 1;
+    else if (OCTEON_IS_MODEL(OCTEON_CN52XX))
+        arch_ports = 2;
+    else if (OCTEON_IS_MODEL(OCTEON_CN50XX))
+        arch_ports = 1;
+    else if (OCTEON_IS_MODEL(OCTEON_CN31XX))
+        arch_ports = 1;
+    else if (OCTEON_IS_MODEL(OCTEON_CN30XX))
+        arch_ports = 1;
+    else
+        arch_ports = 0;
+
+    return arch_ports;
+}
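+
+/*
+ * Editor's sketch (hypothetical caller, not part of this file): bringing
+ * up every supported port. The states[] array is an assumption made for
+ * illustration; flags of 0 lets cvmx_usb_initialize() auto-detect the
+ * clock setup.
+ */
+#if 0
+static void example_bring_up_all_ports(cvmx_usb_state_t *states)
+{
+    int port;
+
+    for (port = 0; port < cvmx_usb_get_num_ports(); port++) {
+        if (cvmx_usb_initialize(&states[port], port, 0) == CVMX_USB_SUCCESS)
+            cvmx_usb_enable(&states[port]);
+    }
+}
+#endif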
+
+
+/**
+ * @INTERNAL
+ * Allocate a usb transaction for use
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Transaction or NULL
+ */
+static inline cvmx_usb_transaction_t *__cvmx_usb_alloc_transaction(cvmx_usb_internal_state_t *usb)
+{
+    cvmx_usb_transaction_t *t;
+    t = usb->free_transaction_head;
+    if (t) {
+        usb->free_transaction_head = t->next;
+        if (!usb->free_transaction_head)
+            usb->free_transaction_tail = NULL;
+    }
+    else if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+        cvmx_dprintf("%s: Failed to allocate a transaction\n", __FUNCTION__);
+    if (t) {
+        memset(t, 0, sizeof(*t));
+        t->flags = __CVMX_USB_TRANSACTION_FLAGS_IN_USE;
+    }
+    return t;
+}
+
+
+/**
+ * @INTERNAL
+ * Free a usb transaction
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param transaction
+ *               Transaction to free
+ */
+static inline void __cvmx_usb_free_transaction(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_transaction_t *transaction)
+{
+    transaction->flags = 0;
+    transaction->prev = NULL;
+    transaction->next = NULL;
+    if (usb->free_transaction_tail)
+        usb->free_transaction_tail->next = transaction;
+    else
+        usb->free_transaction_head = transaction;
+    usb->free_transaction_tail = transaction;
+}
+
+
+/**
+ * @INTERNAL
+ * Add a pipe to the tail of a list
+ * @param list   List to add pipe to
+ * @param pipe   Pipe to add
+ */
+static inline void __cvmx_usb_append_pipe(cvmx_usb_pipe_list_t *list, cvmx_usb_pipe_t *pipe)
+{
+    pipe->next = NULL;
+    pipe->prev = list->tail;
+    if (list->tail)
+        list->tail->next = pipe;
+    else
+        list->head = pipe;
+    list->tail = pipe;
+}
+
+
+/**
+ * @INTERNAL
+ * Remove a pipe from a list
+ * @param list   List to remove pipe from
+ * @param pipe   Pipe to remove
+ */
+static inline void __cvmx_usb_remove_pipe(cvmx_usb_pipe_list_t *list, cvmx_usb_pipe_t *pipe)
+{
+    if (list->head == pipe) {
+        list->head = pipe->next;
+        pipe->next = NULL;
+        if (list->head)
+            list->head->prev = NULL;
+        else
+            list->tail = NULL;
+    }
+    else if (list->tail == pipe) {
+        list->tail = pipe->prev;
+        list->tail->next = NULL;
+        pipe->prev = NULL;
+    }
+    else {
+        pipe->prev->next = pipe->next;
+        pipe->next->prev = pipe->prev;
+        pipe->prev = NULL;
+        pipe->next = NULL;
+    }
+}
+
+
+/**
+ * Initialize a USB port for use. This must be called before any
+ * other access to the Octeon USB port is made. The port starts
+ * off in the disabled state.
+ *
+ * @param state  Pointer to an empty cvmx_usb_state_t structure
+ *               that will be populated by the initialize call.
+ *               This structure is then passed to all other USB
+ *               functions.
+ * @param usb_port_number
+ *               Which Octeon USB port to initialize.
+ * @param flags  Flags to control hardware initialization. See
+ *               cvmx_usb_initialize_flags_t for the flag
+ *               definitions. Some flags are mandatory.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_initialize(cvmx_usb_state_t *state,
+                                      int usb_port_number,
+                                      cvmx_usb_initialize_flags_t flags)
+{
+    cvmx_usbnx_clk_ctl_t usbn_clk_ctl;
+    cvmx_usbnx_usbp_ctl_status_t usbn_usbp_ctl_status;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    usb->init_flags = flags;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", usb_port_number);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+
+    /* Make sure that state is large enough to store the internal state */
+    if (sizeof(*state) < sizeof(*usb))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* First, sanity check that the USB port number is 0 or 1 */
+    if ((usb_port_number < 0) || (usb_port_number > 1))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* For all chips except 52XX there is only one port */
+    if (!OCTEON_IS_MODEL(OCTEON_CN52XX) && (usb_port_number > 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Try to determine clock type automatically */
+    if ((flags & (CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI |
+                  CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND)) == 0) {
+        if (octeon_usb_get_clock_type() == USB_CLOCK_TYPE_CRYSTAL_12)
+            flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI;  /* Only 12 MHZ crystals are supported */
+        else
+            flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND;
+    }
+
+    if (flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND) {
+        /* Check for auto ref clock frequency */
+        if (!(flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK))
+            switch (octeon_usb_get_clock_type()) {
+                case USB_CLOCK_TYPE_REF_12:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ;
+                    break;
+                case USB_CLOCK_TYPE_REF_24:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ;
+                    break;
+                case USB_CLOCK_TYPE_REF_48:
+                    flags |= CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ;
+                    break;
+                default:
+                    CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+                    break;
+            }
+    }
+
+    memset(usb, 0, sizeof(*usb));
+    usb->init_flags = flags;
+
+    /* Initialize the USB state structure */
+    {
+        int i;
+        usb->index = usb_port_number;
+
+        /* Initialize the transaction double linked list */
+        usb->free_transaction_head = NULL;
+        usb->free_transaction_tail = NULL;
+        for (i=0; i<MAX_TRANSACTIONS; i++)
+            __cvmx_usb_free_transaction(usb, usb->transaction + i);
+        for (i=0; i<MAX_PIPES; i++)
+            __cvmx_usb_append_pipe(&usb->free_pipes, usb->pipe + i);
+    }
+
+    /* Power On Reset and PHY Initialization */
+
+    /* 1. Wait for DCOK to assert (nothing to do) */
+    /* 2a. Write USBN0/1_CLK_CTL[POR] = 1 and
+        USBN0/1_CLK_CTL[HRST,PRST,HCLK_RST] = 0 */
+    usbn_clk_ctl.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index));
+    usbn_clk_ctl.s.por = 1;
+    usbn_clk_ctl.s.hrst = 0;
+    usbn_clk_ctl.s.prst = 0;
+    usbn_clk_ctl.s.hclk_rst = 0;
+    usbn_clk_ctl.s.enable = 0;
+    /* 2b. Select the USB reference clock/crystal parameters by writing
+        appropriate values to USBN0/1_CLK_CTL[P_C_SEL, P_RTYPE, P_COM_ON] */
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND) {
+        /* The USB port uses 12/24/48MHz 2.5V board clock
+            source at USB_XO. USB_XI should be tied to GND.
+            Most Octeon evaluation boards require this setting */
+        if (OCTEON_IS_MODEL(OCTEON_CN3XXX)) {
+            usbn_clk_ctl.cn31xx.p_rclk  = 1; /* From CN31XX,CN30XX manual */
+            usbn_clk_ctl.cn31xx.p_xenbn = 0;
+        }
+        else if (OCTEON_IS_MODEL(OCTEON_CN56XX) || OCTEON_IS_MODEL(OCTEON_CN50XX))
+            usbn_clk_ctl.cn56xx.p_rtype = 2; /* From CN56XX,CN50XX manual */
+        else
+            usbn_clk_ctl.cn52xx.p_rtype = 1; /* From CN52XX manual */
+
+        switch (flags & CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK) {
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ:
+                usbn_clk_ctl.s.p_c_sel = 0;
+                break;
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ:
+                usbn_clk_ctl.s.p_c_sel = 1;
+                break;
+            case CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ:
+                usbn_clk_ctl.s.p_c_sel = 2;
+                break;
+        }
+    }
+    else {
+        /* The USB port uses a 12MHz crystal as clock source
+            at USB_XO and USB_XI */
+        if (OCTEON_IS_MODEL(OCTEON_CN3XXX)) {
+            usbn_clk_ctl.cn31xx.p_rclk  = 1; /* From CN31XX,CN30XX manual */
+            usbn_clk_ctl.cn31xx.p_xenbn = 1;
+        }
+        else if (OCTEON_IS_MODEL(OCTEON_CN56XX) || OCTEON_IS_MODEL(OCTEON_CN50XX))
+            usbn_clk_ctl.cn56xx.p_rtype = 0; /* From CN56XX,CN50XX manual */
+        else
+            usbn_clk_ctl.cn52xx.p_rtype = 0; /* From CN52XX manual */
+
+        usbn_clk_ctl.s.p_c_sel = 0;
+    }
+    /* 2c. Select the HCLK via writing USBN0/1_CLK_CTL[DIVIDE, DIVIDE2] and
+        setting USBN0/1_CLK_CTL[ENABLE] = 1.  Divide the core clock down such
+        that USB is as close as possible to 125Mhz */
+    {
+        int divisor = (octeon_get_clock_rate()+125000000-1)/125000000;
+        if (divisor < 4)  /* Lower than 4 doesn't seem to work properly */
+            divisor = 4;
+        usbn_clk_ctl.s.divide = divisor;
+        usbn_clk_ctl.s.divide2 = 0;
+    }
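+    /*
+     * Editor's note (worked example): with a 500 MHz core clock the
+     * divisor is (500000000 + 124999999) / 125000000 = 4, so HCLK runs
+     * at 500/4 = 125 MHz; a 600 MHz core gives 5, i.e. 120 MHz.
+     */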
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 2d. Write USBN0/1_CLK_CTL[HCLK_RST] = 1 */
+    usbn_clk_ctl.s.hclk_rst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 2e.  Wait 64 core-clock cycles for HCLK to stabilize */
+    cvmx_wait(64);
+    /* 3. Program the power-on reset field in the USBN clock-control register:
+        USBN_CLK_CTL[POR] = 0 */
+    usbn_clk_ctl.s.por = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 4. Wait 1 ms for PHY clock to start */
+    cvmx_wait_usec(1000);
+    /* 5. Program the Reset input from automatic test equipment field in the
+        USBP control and status register: USBN_USBP_CTL_STATUS[ATE_RESET] = 1 */
+    usbn_usbp_ctl_status.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index));
+    usbn_usbp_ctl_status.s.ate_reset = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 6. Wait 10 cycles */
+    cvmx_wait(10);
+    /* 7. Clear ATE_RESET field in the USBN clock-control register:
+        USBN_USBP_CTL_STATUS[ATE_RESET] = 0 */
+    usbn_usbp_ctl_status.s.ate_reset = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 8. Program the PHY reset field in the USBN clock-control register:
+        USBN_CLK_CTL[PRST] = 1 */
+    usbn_clk_ctl.s.prst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 9. Program the USBP control and status register to select host or
+        device mode. USBN_USBP_CTL_STATUS[HST_MODE] = 0 for host, = 1 for
+        device */
+    usbn_usbp_ctl_status.s.hst_mode = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_USBP_CTL_STATUS(usb->index),
+                           usbn_usbp_ctl_status.u64);
+    /* 10. Wait 1 us */
+    cvmx_wait_usec(1);
+    /* 11. Program the hreset_n field in the USBN clock-control register:
+        USBN_CLK_CTL[HRST] = 1 */
+    usbn_clk_ctl.s.hrst = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    /* 12. Proceed to USB core initialization */
+    usbn_clk_ctl.s.enable = 1;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    cvmx_wait_usec(1);
+
+    /* USB Core Initialization */
+
+    /* 1. Read USBC_GHWCFG1, USBC_GHWCFG2, USBC_GHWCFG3, USBC_GHWCFG4 to
+        determine USB core configuration parameters. */
+    /* Nothing needed */
+    /* 2. Program the following fields in the global AHB configuration
+        register (USBC_GAHBCFG)
+        DMA mode, USBC_GAHBCFG[DMAEn]: 1 = DMA mode, 0 = slave mode
+        Burst length, USBC_GAHBCFG[HBSTLEN] = 0
+        Nonperiodic TxFIFO empty level (slave mode only),
+        USBC_GAHBCFG[NPTXFEMPLVL]
+        Periodic TxFIFO empty level (slave mode only),
+        USBC_GAHBCFG[PTXFEMPLVL]
+        Global interrupt mask, USBC_GAHBCFG[GLBLINTRMSK] = 1 */
+    {
+        cvmx_usbcx_gahbcfg_t usbcx_gahbcfg;
+        /* Due to an errata, CN31XX doesn't support DMA */
+        if (OCTEON_IS_MODEL(OCTEON_CN31XX))
+            usb->init_flags |= CVMX_USB_INITIALIZE_FLAGS_NO_DMA;
+        usbcx_gahbcfg.u32 = 0;
+        usbcx_gahbcfg.s.dmaen = !(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA);
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            usb->idle_hardware_channels = 0x1;  /* Only use one channel with non DMA */
+        else if (OCTEON_IS_MODEL(OCTEON_CN5XXX))
+            usb->idle_hardware_channels = 0xf7; /* CN5XXX have an errata with channel 3 */
+        else
+            usb->idle_hardware_channels = 0xff;
+        usbcx_gahbcfg.s.hbstlen = 0;
+        usbcx_gahbcfg.s.nptxfemplvl = 1;
+        usbcx_gahbcfg.s.ptxfemplvl = 1;
+        usbcx_gahbcfg.s.glblintrmsk = 1;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GAHBCFG(usb->index),
+                               usbcx_gahbcfg.u32);
+    }
+    /* 3. Program the following fields in USBC_GUSBCFG register.
+        HS/FS timeout calibration, USBC_GUSBCFG[TOUTCAL] = 0
+        ULPI DDR select, USBC_GUSBCFG[DDRSEL] = 0
+        USB turnaround time, USBC_GUSBCFG[USBTRDTIM] = 0x5
+        PHY low-power clock select, USBC_GUSBCFG[PHYLPWRCLKSEL] = 0 */
+    {
+        cvmx_usbcx_gusbcfg_t usbcx_gusbcfg;
+        usbcx_gusbcfg.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GUSBCFG(usb->index));
+        usbcx_gusbcfg.s.toutcal = 0;
+        usbcx_gusbcfg.s.ddrsel = 0;
+        usbcx_gusbcfg.s.usbtrdtim = 0x5;
+        usbcx_gusbcfg.s.phylpwrclksel = 0;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GUSBCFG(usb->index),
+                               usbcx_gusbcfg.u32);
+    }
+    /* 4. The software must unmask the following bits in the USBC_GINTMSK
+        register.
+        OTG interrupt mask, USBC_GINTMSK[OTGINTMSK] = 1
+        Mode mismatch interrupt mask, USBC_GINTMSK[MODEMISMSK] = 1 */
+    {
+        cvmx_usbcx_gintmsk_t usbcx_gintmsk;
+        int channel;
+
+        usbcx_gintmsk.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GINTMSK(usb->index));
+        usbcx_gintmsk.s.otgintmsk = 1;
+        usbcx_gintmsk.s.modemismsk = 1;
+        usbcx_gintmsk.s.hchintmsk = 1;
+        usbcx_gintmsk.s.sofmsk = 0;
+        /* We need RX FIFO interrupts if we don't have DMA */
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            usbcx_gintmsk.s.rxflvlmsk = 1;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GINTMSK(usb->index),
+                               usbcx_gintmsk.u32);
+
+        /* Disable all channel interrupts. We'll enable them per channel later */
+        for (channel=0; channel<8; channel++)
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), 0);
+    }
+
+    {
+        /* Host Port Initialization */
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: USB%d is in host mode\n", __FUNCTION__, usb->index);
+
+        /* 1. Program the host-port interrupt-mask field to unmask,
+            USBC_GINTMSK[PRTINT] = 1 */
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t,
+                        prtintmsk, 1);
+        USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t,
+                        disconnintmsk, 1);
+        /* 2. Program the USBC_HCFG register to select full-speed host or
+            high-speed host. */
+        {
+            cvmx_usbcx_hcfg_t usbcx_hcfg;
+            usbcx_hcfg.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCFG(usb->index));
+            usbcx_hcfg.s.fslssupp = 0;
+            usbcx_hcfg.s.fslspclksel = 0;
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCFG(usb->index), usbcx_hcfg.u32);
+        }
+        /* 3. Program the port power bit to drive VBUS on the USB,
+            USBC_HPRT[PRTPWR] = 1 */
+        USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtpwr, 1);
+
+        /* Steps 4-15 from the manual are done later in the port enable */
+    }
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Shutdown a USB port after a call to cvmx_usb_initialize().
+ * The port should be disabled with all pipes closed when this
+ * function is called.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_shutdown(cvmx_usb_state_t *state)
+{
+    cvmx_usbnx_clk_ctl_t usbn_clk_ctl;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Make sure all pipes are closed */
+    if (usb->idle_pipes.head ||
+        usb->active_pipes[CVMX_USB_TRANSFER_ISOCHRONOUS].head ||
+        usb->active_pipes[CVMX_USB_TRANSFER_INTERRUPT].head ||
+        usb->active_pipes[CVMX_USB_TRANSFER_CONTROL].head ||
+        usb->active_pipes[CVMX_USB_TRANSFER_BULK].head)
+        CVMX_USB_RETURN(CVMX_USB_BUSY);
+
+    /* Disable the clocks and put them in power on reset */
+    usbn_clk_ctl.u64 = __cvmx_usb_read_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index));
+    usbn_clk_ctl.s.enable = 1;
+    usbn_clk_ctl.s.por = 1;
+    usbn_clk_ctl.s.hclk_rst = 1;
+    usbn_clk_ctl.s.prst = 0;
+    usbn_clk_ctl.s.hrst = 0;
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_CLK_CTL(usb->index),
+                           usbn_clk_ctl.u64);
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Enable a USB port. After this call succeeds, the USB port is
+ * online and servicing requests.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_enable(cvmx_usb_state_t *state)
+{
+    cvmx_usbcx_ghwcfg3_t usbcx_ghwcfg3;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    usb->usbcx_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+
+    /* If the port is already enabled then just return. We don't need to do
+        anything */
+    if (usb->usbcx_hprt.s.prtena)
+        CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+
+    /* If there is nothing plugged into the port then fail immediately */
+    if (!usb->usbcx_hprt.s.prtconnsts) {
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: USB%d Nothing plugged into the port\n", __FUNCTION__, usb->index);
+        CVMX_USB_RETURN(CVMX_USB_TIMEOUT);
+    }
+
+    /* Program the port reset bit to start the reset process */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtrst, 1);
+
+    /* Wait at least 50ms (high speed), or 10ms (full speed) for the reset
+        process to complete. */
+    cvmx_wait_usec(50000);
+
+    /* Program the port reset bit to 0, USBC_HPRT[PRTRST] = 0 */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtrst, 0);
+
+    /* Wait for the USBC_HPRT[PRTENA]. */
+    if (CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t,
+                              prtena, ==, 1, 100000)) {
+        if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+            cvmx_dprintf("%s: Timeout waiting for the port to finish reset\n",
+                         __FUNCTION__);
+        CVMX_USB_RETURN(CVMX_USB_TIMEOUT);
+    }
+
+    /* Read the port speed field to get the enumerated speed, USBC_HPRT[PRTSPD]. */
+    usb->usbcx_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+        cvmx_dprintf("%s: USB%d is in %s speed mode\n", __FUNCTION__, usb->index,
+                     (usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_HIGH) ? "high" :
+                     (usb->usbcx_hprt.s.prtspd == CVMX_USB_SPEED_FULL) ? "full" :
+                     "low");
+
+    usbcx_ghwcfg3.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GHWCFG3(usb->index));
+
+    /* 13. Program the USBC_GRXFSIZ register to select the size of the receive
+        FIFO (25%). */
+    USB_SET_FIELD32(CVMX_USBCX_GRXFSIZ(usb->index), cvmx_usbcx_grxfsiz_t,
+                    rxfdep, usbcx_ghwcfg3.s.dfifodepth / 4);
+    /* 14. Program the USBC_GNPTXFSIZ register to select the size and the
+        start address of the non- periodic transmit FIFO for nonperiodic
+        transactions (50%). */
+    {
+        cvmx_usbcx_gnptxfsiz_t siz;
+        siz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GNPTXFSIZ(usb->index));
+        siz.s.nptxfdep = usbcx_ghwcfg3.s.dfifodepth / 2;
+        siz.s.nptxfstaddr = usbcx_ghwcfg3.s.dfifodepth / 4;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_GNPTXFSIZ(usb->index), siz.u32);
+    }
+    /* 15. Program the USBC_HPTXFSIZ register to select the size and start
+        address of the periodic transmit FIFO for periodic transactions (25%). */
+    {
+        cvmx_usbcx_hptxfsiz_t siz;
+        siz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPTXFSIZ(usb->index));
+        siz.s.ptxfsize = usbcx_ghwcfg3.s.dfifodepth / 4;
+        siz.s.ptxfstaddr = 3 * usbcx_ghwcfg3.s.dfifodepth / 4;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HPTXFSIZ(usb->index), siz.u32);
+    }
+    /* Flush all FIFOs */
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, txfnum, 0x10);
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, txfflsh, 1);
+    CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t,
+                          txfflsh, ==, 0, 100);
+    USB_SET_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t, rxfflsh, 1);
+    CVMX_WAIT_FOR_FIELD32(CVMX_USBCX_GRSTCTL(usb->index), cvmx_usbcx_grstctl_t,
+                          rxfflsh, ==, 0, 100);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Disable a USB port. After this call the USB port will not
+ * generate data transfers and will not generate events.
+ * Transactions in process will fail and call their
+ * associated callbacks.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_disable(cvmx_usb_state_t *state)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Disable the port. HPRT[PRTENA] is write-1-to-clear, so writing 1 disables it */
+    USB_SET_FIELD32(CVMX_USBCX_HPRT(usb->index), cvmx_usbcx_hprt_t, prtena, 1);
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Get the current state of the USB port. Use this call to
+ * determine if the usb port has anything connected, is enabled,
+ * or has some sort of error condition. The return value of this
+ * call has "changed" bits to signal of the value of some fields
+ * have changed between calls. These "changed" fields are based
+ * on the last call to cvmx_usb_set_status(). In order to clear
+ * them, you must update the status through cvmx_usb_set_status().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Port status information
+ */
+cvmx_usb_port_status_t cvmx_usb_get_status(cvmx_usb_state_t *state)
+{
+    cvmx_usbcx_hprt_t usbc_hprt;
+    cvmx_usb_port_status_t result;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    memset(&result, 0, sizeof(result));
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    usbc_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+    result.port_enabled = usbc_hprt.s.prtena;
+    result.port_over_current = usbc_hprt.s.prtovrcurract;
+    result.port_powered = usbc_hprt.s.prtpwr;
+    result.port_speed = usbc_hprt.s.prtspd;
+    result.connected = usbc_hprt.s.prtconnsts;
+    result.connect_change = (result.connected != usb->port_status.connected);
+
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS))
+        cvmx_dprintf("%*s%s: returned port enabled=%d, over_current=%d, powered=%d, speed=%d, connected=%d, connect_change=%d\n",
+                     2*(--usb->indent), "", __FUNCTION__,
+                     result.port_enabled,
+                     result.port_over_current,
+                     result.port_powered,
+                     result.port_speed,
+                     result.connected,
+                     result.connect_change);
+    return result;
+}
+
+
+/**
+ * Set the current state of the USB port. The status is used as
+ * a reference for the "changed" bits returned by
+ * cvmx_usb_get_status(). Other than serving as a reference, the
+ * status passed to this function is not used. No fields can be
+ * changed through this call.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param port_status
+ *               Port status to set, most likely returned by cvmx_usb_get_status()
+ */
+void cvmx_usb_set_status(cvmx_usb_state_t *state, cvmx_usb_port_status_t port_status)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    usb->port_status = port_status;
+    CVMX_USB_RETURN_NOTHING();
+}
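+
+/*
+ * Usage sketch (illustrative, not part of the driver): poll the port and
+ * clear the "changed" bits by writing the status back. The handler below
+ * is hypothetical.
+ *
+ *    cvmx_usb_port_status_t status = cvmx_usb_get_status(&state);
+ *    if (status.connect_change) {
+ *        handle_connect_event(status.connected);  // hypothetical helper
+ *        cvmx_usb_set_status(&state, status);     // clears connect_change
+ *    }
+ */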
+
+
+/**
+ * @INTERNAL
+ * Convert a USB transaction into a handle
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param transaction
+ *               Transaction to get handle for
+ *
+ * @return Handle
+ */
+static inline int __cvmx_usb_get_submit_handle(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_transaction_t *transaction)
+{
+    return ((unsigned long)transaction - (unsigned long)usb->transaction) /
+            sizeof(*transaction);
+}
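+
+/*
+ * Since the handle is just the transaction's index in the usb->transaction
+ * array, the inverse mapping is plain pointer arithmetic (illustrative):
+ *
+ *    cvmx_usb_transaction_t *t = usb->transaction + submit_handle;
+ */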
+
+
+/**
+ * @INTERNAL
+ * Convert a USB pipe into a handle
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe to get handle for
+ *
+ * @return Handle
+ */
+static inline int __cvmx_usb_get_pipe_handle(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_pipe_t *pipe)
+{
+    return ((unsigned long)pipe - (unsigned long)usb->pipe) / sizeof(*pipe);
+}
+
+
+/**
+ * Open a virtual pipe between the host and a USB device. A pipe
+ * must be opened before data can be transferred between a device
+ * and Octeon.
+ *
+ * @param state      USB device state populated by
+ *                   cvmx_usb_initialize().
+ * @param flags      Optional pipe flags defined in
+ *                   cvmx_usb_pipe_flags_t.
+ * @param device_addr
+ *                   USB device address to open the pipe to
+ *                   (0-127).
+ * @param endpoint_num
+ *                   USB endpoint number to open the pipe to
+ *                   (0-15).
+ * @param device_speed
+ *                   The speed of the device the pipe is going
+ *                   to. This must match the device's speed,
+ *                   which may be different from the port speed.
+ * @param max_packet The maximum packet length the device can
+ *                   transmit/receive (low speed=0-8, full
+ *                   speed=0-1023, high speed=0-1024). This value
+ *                   comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <10:0>.
+ * @param transfer_type
+ *                   The type of transfer this pipe is for.
+ * @param transfer_dir
+ *                   The direction the pipe is in. This is not
+ *                   used for control pipes.
+ * @param interval   For ISOCHRONOUS and INTERRUPT transfers,
+ *                   this is how often the transfer is scheduled
+ *                   for. All other transfers should specify
+ *                   zero. The units are in frames (8000/sec at
+ *                   high speed, 1000/sec for full speed).
+ * @param multi_count
+ *                   For high speed devices, this is the maximum
+ *                   allowed number of packets per microframe.
+ *                   Specify zero for non high speed devices. This
+ *                   value comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <12:11>.
+ * @param hub_device_addr
+ *                   Hub device address this device is connected
+ *                   to. Devices connected directly to Octeon
+ *                   use zero. This is only used when the device
+ *                   is full/low speed behind a high speed hub.
+ *                   The address will be of the high speed hub,
+ *                   not any full speed hubs after it.
+ * @param hub_port   Which port on the hub the device is
+ *                   connected to. Use zero for devices connected
+ *                   directly to Octeon. Like hub_device_addr,
+ *                   this is only used for full/low speed
+ *                   devices behind a high speed hub.
+ *
+ * @return A non-negative value is a pipe handle. Negative
+ *         values are failure codes from cvmx_usb_status_t.
+ */
+int cvmx_usb_open_pipe(cvmx_usb_state_t *state, cvmx_usb_pipe_flags_t flags,
+                       int device_addr, int endpoint_num,
+                       cvmx_usb_speed_t device_speed, int max_packet,
+                       cvmx_usb_transfer_t transfer_type,
+                       cvmx_usb_direction_t transfer_dir, int interval,
+                       int multi_count, int hub_device_addr, int hub_port)
+{
+    cvmx_usb_pipe_t *pipe;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+    CVMX_USB_LOG_PARAM("%d", device_addr);
+    CVMX_USB_LOG_PARAM("%d", endpoint_num);
+    CVMX_USB_LOG_PARAM("%d", device_speed);
+    CVMX_USB_LOG_PARAM("%d", max_packet);
+    CVMX_USB_LOG_PARAM("%d", transfer_type);
+    CVMX_USB_LOG_PARAM("%d", transfer_dir);
+    CVMX_USB_LOG_PARAM("%d", interval);
+    CVMX_USB_LOG_PARAM("%d", multi_count);
+    CVMX_USB_LOG_PARAM("%d", hub_device_addr);
+    CVMX_USB_LOG_PARAM("%d", hub_port);
+
+    if (cvmx_unlikely((device_addr < 0) || (device_addr > MAX_USB_ADDRESS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((endpoint_num < 0) || (endpoint_num > MAX_USB_ENDPOINT)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(device_speed > CVMX_USB_SPEED_LOW))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((max_packet <= 0) || (max_packet > 1024)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(transfer_type > CVMX_USB_TRANSFER_INTERRUPT))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((transfer_dir != CVMX_USB_DIRECTION_OUT) &&
+        (transfer_dir != CVMX_USB_DIRECTION_IN)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(interval < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((transfer_type == CVMX_USB_TRANSFER_CONTROL) && interval))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(multi_count < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((device_speed != CVMX_USB_SPEED_HIGH) &&
+        (multi_count != 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((hub_device_addr < 0) || (hub_device_addr > MAX_USB_ADDRESS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((hub_port < 0) || (hub_port > MAX_USB_HUB_PORT)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Find a free pipe */
+    pipe = usb->free_pipes.head;
+    if (!pipe)
+        CVMX_USB_RETURN(CVMX_USB_NO_MEMORY);
+    __cvmx_usb_remove_pipe(&usb->free_pipes, pipe);
+    pipe->flags = flags | __CVMX_USB_PIPE_FLAGS_OPEN;
+    if ((device_speed == CVMX_USB_SPEED_HIGH) &&
+        (transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+        (transfer_type == CVMX_USB_TRANSFER_BULK))
+        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+    pipe->device_addr = device_addr;
+    pipe->endpoint_num = endpoint_num;
+    pipe->device_speed = device_speed;
+    pipe->max_packet = max_packet;
+    pipe->transfer_type = transfer_type;
+    pipe->transfer_dir = transfer_dir;
+    /* All pipes use interval to rate limit NAK processing. Force an interval
+        if one wasn't supplied */
+    if (!interval)
+        interval = 1;
+    if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+        pipe->interval = interval*8;
+        /* Force start splits to be scheduled on uFrame 0 */
+        pipe->next_tx_frame = ((usb->frame_number+7)&~7) + pipe->interval;
+    }
+    else {
+        pipe->interval = interval;
+        pipe->next_tx_frame = usb->frame_number + pipe->interval;
+    }
+    pipe->multi_count = multi_count;
+    pipe->hub_device_addr = hub_device_addr;
+    pipe->hub_port = hub_port;
+    pipe->pid_toggle = 0;
+    pipe->split_sc_frame = -1;
+    __cvmx_usb_append_pipe(&usb->idle_pipes, pipe);
+
+    /* We don't need to tell the hardware about this pipe yet since
+        it doesn't have any submitted requests */
+
+    CVMX_USB_RETURN(__cvmx_usb_get_pipe_handle(usb, pipe));
+}
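+
+/*
+ * Usage sketch (illustrative, not part of the driver): open a bulk OUT
+ * pipe to endpoint 2 of a high speed device at address 3 attached
+ * directly to Octeon. The 512 byte max_packet would come from the
+ * endpoint descriptor's wMaxPacketSize.
+ *
+ *    int pipe = cvmx_usb_open_pipe(&state, (cvmx_usb_pipe_flags_t)0,
+ *                                  3, 2, CVMX_USB_SPEED_HIGH, 512,
+ *                                  CVMX_USB_TRANSFER_BULK,
+ *                                  CVMX_USB_DIRECTION_OUT,
+ *                                  0,      // interval, unused for bulk
+ *                                  0,      // multi_count for bulk
+ *                                  0, 0);  // no hub in between
+ *    if (pipe < 0)
+ *        ;  // negative values are cvmx_usb_status_t error codes
+ */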
+
+
+/**
+ * @INTERNAL
+ * Poll the RX FIFOs and remove data as needed. This function is only used
+ * in non DMA mode. It is very important that this function be called quickly
+ * enough to prevent FIFO overflow.
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ */
+static void __cvmx_usb_poll_rx_fifo(cvmx_usb_internal_state_t *usb)
+{
+    cvmx_usbcx_grxstsph_t rx_status;
+    int channel;
+    int bytes;
+    uint64_t address;
+    uint32_t *ptr;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    rx_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GRXSTSPH(usb->index));
+    /* Only read data if IN data is there */
+    if (rx_status.s.pktsts != 2)
+        CVMX_USB_RETURN_NOTHING();
+    /* Check if no data is available */
+    if (!rx_status.s.bcnt)
+        CVMX_USB_RETURN_NOTHING();
+
+    channel = rx_status.s.chnum;
+    bytes = rx_status.s.bcnt;
+    if (!bytes)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* Get where the DMA engine would have written this data */
+    address = __cvmx_usb_read_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8);
+    ptr = cvmx_phys_to_ptr(address);
+    __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8, address + bytes);
+
+    /* Loop writing the FIFO data for this packet into memory */
+    while (bytes > 0) {
+        *ptr++ = __cvmx_usb_read_csr32(usb, USB_FIFO_ADDRESS(channel, usb->index));
+        bytes -= 4;
+    }
+    CVMX_SYNCW;
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Fill the TX hardware FIFO with data from the software FIFOs
+ *
+ * @param usb       USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param fifo      Software fifo to use
+ * @param available Amount of space in the hardware fifo
+ *
+ * @return Non zero if the hardware fifo was too small and needs
+ *         to be serviced again.
+ */
+static int __cvmx_usb_fill_tx_hw(cvmx_usb_internal_state_t *usb, cvmx_usb_tx_fifo_t *fifo, int available)
+{
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%p", fifo);
+    CVMX_USB_LOG_PARAM("%d", available);
+
+    /* We're done either when there isn't any more space or the software FIFO
+        is empty */
+    while (available && (fifo->head != fifo->tail)) {
+        int i = fifo->tail;
+        const uint32_t *ptr = cvmx_phys_to_ptr(fifo->entry[i].address);
+        uint64_t csr_address = USB_FIFO_ADDRESS(fifo->entry[i].channel, usb->index) ^ 4;
+        int words = available;
+
+        /* Limit the amount of data to what the SW fifo has */
+        if (fifo->entry[i].size <= available) {
+            words = fifo->entry[i].size;
+            fifo->tail++;
+            if (fifo->tail > MAX_CHANNELS)
+                fifo->tail = 0;
+        }
+
+        /* Update the next locations and counts */
+        available -= words;
+        fifo->entry[i].address += words * 4;
+        fifo->entry[i].size -= words;
+
+        /* Write the HW fifo data. The read every three writes is due
+            to an erratum on CN3XXX chips */
+        while (words > 3) {
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_write64_uint32(csr_address, *ptr++);
+            cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+            words -= 3;
+        }
+        cvmx_write64_uint32(csr_address, *ptr++);
+        if (--words) {
+            cvmx_write64_uint32(csr_address, *ptr++);
+            if (--words)
+                cvmx_write64_uint32(csr_address, *ptr++);
+        }
+        cvmx_read64_uint64(CVMX_USBNX_DMA0_INB_CHN0(usb->index));
+    }
+    CVMX_USB_RETURN(fifo->head != fifo->tail);
+}
+
+
+/**
+ * @INTERNAL
+ * Check the hardware FIFOs and fill them as needed
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ */
+static void __cvmx_usb_poll_tx_fifo(cvmx_usb_internal_state_t *usb)
+{
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    if (usb->periodic.head != usb->periodic.tail) {
+        cvmx_usbcx_hptxsts_t tx_status;
+        tx_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPTXSTS(usb->index));
+        if (__cvmx_usb_fill_tx_hw(usb, &usb->periodic, tx_status.s.ptxfspcavail))
+            USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, ptxfempmsk, 1);
+        else
+            USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, ptxfempmsk, 0);
+    }
+
+    if (usb->nonperiodic.head != usb->nonperiodic.tail) {
+        cvmx_usbcx_gnptxsts_t tx_status;
+        tx_status.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GNPTXSTS(usb->index));
+        if (__cvmx_usb_fill_tx_hw(usb, &usb->nonperiodic, tx_status.s.nptxfspcavail))
+            USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, nptxfempmsk, 1);
+        else
+            USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, nptxfempmsk, 0);
+    }
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Fill the TX FIFO with an outgoing packet
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel number to get packet from
+ */
+static void __cvmx_usb_fill_tx_fifo(cvmx_usb_internal_state_t *usb, int channel)
+{
+    cvmx_usbcx_hccharx_t hcchar;
+    cvmx_usbcx_hcspltx_t usbc_hcsplt;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+    cvmx_usb_tx_fifo_t *fifo;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+
+    /* We only need to fill data on outbound channels */
+    hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+    if (hcchar.s.epdir != CVMX_USB_DIRECTION_OUT)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* OUT Splits only have data on the start and not the complete */
+    usbc_hcsplt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCSPLTX(channel, usb->index));
+    if (usbc_hcsplt.s.spltena && usbc_hcsplt.s.compsplt)
+        CVMX_USB_RETURN_NOTHING();
+
+    /* Find out how many bytes we need to fill and convert it into 32bit words */
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+    if (!usbc_hctsiz.s.xfersize)
+        CVMX_USB_RETURN_NOTHING();
+
+    if ((hcchar.s.eptype == CVMX_USB_TRANSFER_INTERRUPT) ||
+        (hcchar.s.eptype == CVMX_USB_TRANSFER_ISOCHRONOUS))
+        fifo = &usb->periodic;
+    else
+        fifo = &usb->nonperiodic;
+
+    fifo->entry[fifo->head].channel = channel;
+    fifo->entry[fifo->head].address = __cvmx_usb_read_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8);
+    fifo->entry[fifo->head].size = (usbc_hctsiz.s.xfersize+3)>>2;
+    fifo->head++;
+    if (fifo->head > MAX_CHANNELS)
+        fifo->head = 0;
+
+    __cvmx_usb_poll_tx_fifo(usb);
+
+    CVMX_USB_RETURN_NOTHING();
+}
+
+/**
+ * @INTERNAL
+ * Perform channel specific setup for Control transactions. All
+ * the generic stuff will already have been done in
+ * __cvmx_usb_start_channel()
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel to setup
+ * @param pipe    Pipe for control transaction
+ */
+static void __cvmx_usb_start_channel_control(cvmx_usb_internal_state_t *usb,
+                                             int channel,
+                                             cvmx_usb_pipe_t *pipe)
+{
+    cvmx_usb_transaction_t *transaction = pipe->head;
+    cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+    int bytes_to_transfer = transaction->buffer_length - transaction->actual_bytes;
+    int packets_to_transfer;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+
+    switch (transaction->stage) {
+        case CVMX_USB_STAGE_NON_CONTROL:
+        case CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE:
+            cvmx_dprintf("%s: ERROR - Non control stage\n", __FUNCTION__);
+            break;
+        case CVMX_USB_STAGE_SETUP:
+            usbc_hctsiz.s.pid = 3; /* Setup */
+            bytes_to_transfer = sizeof(*header);
+            /* All Control operations start with a setup going OUT */
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir, CVMX_USB_DIRECTION_OUT);
+            /* Setup sends the control header instead of the buffer data. The
+                buffer data will be used in the next stage */
+            __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8, transaction->control_header);
+            break;
+        case CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = 3; /* Setup */
+            bytes_to_transfer = 0;
+            /* All Control operations start with a setup going OUT */
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir, CVMX_USB_DIRECTION_OUT);
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+        case CVMX_USB_STAGE_DATA:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                if (header->s.request_type & 0x80)
+                    bytes_to_transfer = 0;
+                else if (bytes_to_transfer > pipe->max_packet)
+                    bytes_to_transfer = pipe->max_packet;
+            }
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index),
+                            cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_IN :
+                             CVMX_USB_DIRECTION_OUT));
+            break;
+        case CVMX_USB_STAGE_DATA_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            if (!(header->s.request_type & 0x80))
+                bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index),
+                            cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_IN :
+                             CVMX_USB_DIRECTION_OUT));
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+        case CVMX_USB_STAGE_STATUS:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_OUT :
+                             CVMX_USB_DIRECTION_IN));
+            break;
+        case CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE:
+            usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+            bytes_to_transfer = 0;
+            USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, epdir,
+                            ((header->s.request_type & 0x80) ?
+                             CVMX_USB_DIRECTION_OUT :
+                             CVMX_USB_DIRECTION_IN));
+            USB_SET_FIELD32(CVMX_USBCX_HCSPLTX(channel, usb->index), cvmx_usbcx_hcspltx_t, compsplt, 1);
+            break;
+    }
+
+    /* Make sure the transfer never exceeds the byte limit of the hardware.
+        Further bytes will be sent as continued transactions */
+    if (bytes_to_transfer > MAX_TRANSFER_BYTES) {
+        /* Round MAX_TRANSFER_BYTES down to a multiple of the packet size */
+        bytes_to_transfer = MAX_TRANSFER_BYTES / pipe->max_packet;
+        bytes_to_transfer *= pipe->max_packet;
+    }
+
+    /* Calculate the number of packets to transfer. If the length is zero
+        we still need to transfer one packet */
+    packets_to_transfer = (bytes_to_transfer + pipe->max_packet - 1) / pipe->max_packet;
+    if (packets_to_transfer == 0)
+        packets_to_transfer = 1;
+    else if ((packets_to_transfer>1) && (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)) {
+        /* Limit to one packet when not using DMA. Channels must be restarted
+            between every packet for IN transactions, so there is no reason to
+            do multiple packets in a row */
+        packets_to_transfer = 1;
+        bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+    }
+    else if (packets_to_transfer > MAX_TRANSFER_PACKETS) {
+        /* Limit the number of packets and amount of data transferred to
+            what the hardware can handle */
+        packets_to_transfer = MAX_TRANSFER_PACKETS;
+        bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+    }
+
+    usbc_hctsiz.s.xfersize = bytes_to_transfer;
+    usbc_hctsiz.s.pktcnt = packets_to_transfer;
+
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index), usbc_hctsiz.u32);
+    CVMX_USB_RETURN_NOTHING();
+}
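+
+/*
+ * Worked example of the size math above (illustrative): with a 64 byte
+ * max_packet and a 200 byte data stage, packets_to_transfer =
+ * (200 + 63) / 64 = 4, so the hardware sends three full packets and one
+ * 8 byte short packet. A zero length status stage still programs
+ * pktcnt = 1.
+ */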
+
+
+/**
+ * @INTERNAL
+ * Start a channel to perform the pipe's head transaction
+ *
+ * @param usb     USB device state populated by
+ *                cvmx_usb_initialize().
+ * @param channel Channel to setup
+ * @param pipe    Pipe to start
+ */
+static void __cvmx_usb_start_channel(cvmx_usb_internal_state_t *usb,
+                                     int channel,
+                                     cvmx_usb_pipe_t *pipe)
+{
+    cvmx_usb_transaction_t *transaction = pipe->head;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+
+    if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+        (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS)))
+        cvmx_dprintf("%s: Channel %d started. Pipe %d transaction %d stage %d\n",
+                     __FUNCTION__, channel, __cvmx_usb_get_pipe_handle(usb, pipe),
+                     __cvmx_usb_get_submit_handle(usb, transaction),
+                     transaction->stage);
+
+    /* Make sure all writes to the DMA region get flushed */
+    CVMX_SYNCW;
+
+    /* Attach the channel to the pipe */
+    usb->pipe_for_channel[channel] = pipe;
+    pipe->channel = channel;
+    pipe->flags |= __CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+    /* Mark this channel as in use */
+    usb->idle_hardware_channels &= ~(1<<channel);
+
+    /* Enable the channel interrupt bits */
+    {
+        cvmx_usbcx_hcintx_t usbc_hcint;
+        cvmx_usbcx_hcintmskx_t usbc_hcintmsk;
+        cvmx_usbcx_haintmsk_t usbc_haintmsk;
+
+        /* Clear all channel status bits */
+        usbc_hcint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index));
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index), usbc_hcint.u32);
+
+        usbc_hcintmsk.u32 = 0;
+        usbc_hcintmsk.s.chhltdmsk = 1;
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+            /* Channels need these extra interrupts when we aren't in DMA mode */
+            usbc_hcintmsk.s.datatglerrmsk = 1;
+            usbc_hcintmsk.s.frmovrunmsk = 1;
+            usbc_hcintmsk.s.bblerrmsk = 1;
+            usbc_hcintmsk.s.xacterrmsk = 1;
+            if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                /* Splits don't generate xfercompl, so we need ACK and NYET */
+                usbc_hcintmsk.s.nyetmsk = 1;
+                usbc_hcintmsk.s.ackmsk = 1;
+            }
+            usbc_hcintmsk.s.nakmsk = 1;
+            usbc_hcintmsk.s.stallmsk = 1;
+            usbc_hcintmsk.s.xfercomplmsk = 1;
+        }
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), usbc_hcintmsk.u32);
+
+        /* Enable the channel interrupt to propagate */
+        usbc_haintmsk.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HAINTMSK(usb->index));
+        usbc_haintmsk.s.haintmsk |= 1<<channel;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HAINTMSK(usb->index), usbc_haintmsk.u32);
+    }
+
+    /* Setup the locations the DMA engines use  */
+    {
+        uint64_t dma_address = transaction->buffer + transaction->actual_bytes;
+        if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+            dma_address = transaction->buffer + transaction->iso_packets[0].offset + transaction->actual_bytes;
+        __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_OUTB_CHN0(usb->index) + channel*8, dma_address);
+        __cvmx_usb_write_csr64(usb, CVMX_USBNX_DMA0_INB_CHN0(usb->index) + channel*8, dma_address);
+    }
+
+    /* Setup both the size of the transfer and the SPLIT characteristics */
+    {
+        cvmx_usbcx_hcspltx_t usbc_hcsplt = {.u32 = 0};
+        cvmx_usbcx_hctsizx_t usbc_hctsiz = {.u32 = 0};
+        int packets_to_transfer;
+        int bytes_to_transfer = transaction->buffer_length - transaction->actual_bytes;
+
+        /* ISOCHRONOUS transactions store each individual transfer size in the
+            packet structure, not the global buffer_length */
+        if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+            bytes_to_transfer = transaction->iso_packets[0].length - transaction->actual_bytes;
+
+        /* We need to do split transactions when we are talking to non high
+            speed devices that are behind a high speed hub */
+        if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+            /* On the start split phase (stage is even) record the frame number we
+                will need to send the split complete. We only store the lower
+                seven bits since the scheduler compares frame numbers modulo 128 */
+            if ((transaction->stage&1) == 0) {
+                if (transaction->type == CVMX_USB_TRANSFER_BULK)
+                    pipe->split_sc_frame = (usb->frame_number + 1) & 0x7f;
+                else
+                    pipe->split_sc_frame = (usb->frame_number + 2) & 0x7f;
+            }
+            else
+                pipe->split_sc_frame = -1;
+
+            usbc_hcsplt.s.spltena = 1;
+            usbc_hcsplt.s.hubaddr = pipe->hub_device_addr;
+            usbc_hcsplt.s.prtaddr = pipe->hub_port;
+            usbc_hcsplt.s.compsplt = (transaction->stage == CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE);
+
+            /* SPLIT transactions can only ever transmit one data packet so
+                limit the transfer size to the max packet size */
+            if (bytes_to_transfer > pipe->max_packet)
+                bytes_to_transfer = pipe->max_packet;
+
+            /* ISOCHRONOUS OUT splits are unique in that they limit
+                data transfers to 188 byte chunks representing the
+                begin/middle/end of the data or all of it */
+            if (!usbc_hcsplt.s.compsplt &&
+                (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+                (pipe->transfer_type == CVMX_USB_TRANSFER_ISOCHRONOUS)) {
+                /* Clear the split complete frame number as there isn't going
+                    to be a split complete */
+                pipe->split_sc_frame = -1;
+                /* See if we've started this transfer and sent data */
+                if (transaction->actual_bytes == 0) {
+                    /* Nothing sent yet, this is either a begin or the
+                        entire payload */
+                    if (bytes_to_transfer <= 188)
+                        usbc_hcsplt.s.xactpos = 3; /* Entire payload in one go */
+                    else
+                        usbc_hcsplt.s.xactpos = 2; /* First part of payload */
+                }
+                else {
+                    /* Continuing the previous data, we must either be
+                        in the middle or at the end */
+                    if (bytes_to_transfer <= 188)
+                        usbc_hcsplt.s.xactpos = 1; /* End of payload */
+                    else
+                        usbc_hcsplt.s.xactpos = 0; /* Middle of payload */
+                }
+                /* Again, the transfer size is limited to 188 bytes */
+                if (bytes_to_transfer > 188)
+                    bytes_to_transfer = 188;
+            }
+        }
+
+        /* Make sure the transfer never exceeds the byte limit of the hardware.
+            Further bytes will be sent as continued transactions */
+        if (bytes_to_transfer > MAX_TRANSFER_BYTES) {
+            /* Round MAX_TRANSFER_BYTES down to a multiple of the packet size */
+            bytes_to_transfer = MAX_TRANSFER_BYTES / pipe->max_packet;
+            bytes_to_transfer *= pipe->max_packet;
+        }
+
+        /* Calculate the number of packets to transfer. If the length is zero
+            we still need to transfer one packet */
+        packets_to_transfer = (bytes_to_transfer + pipe->max_packet - 1) / pipe->max_packet;
+        if (packets_to_transfer == 0)
+            packets_to_transfer = 1;
+        else if ((packets_to_transfer>1) && (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)) {
+            /* Limit to one packet when not using DMA. Channels must be restarted
+                between every packet for IN transactions, so there is no reason to
+                do multiple packets in a row */
+            packets_to_transfer = 1;
+            bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+        }
+        else if (packets_to_transfer > MAX_TRANSFER_PACKETS) {
+            /* Limit the number of packets and amount of data transferred to
+                what the hardware can handle */
+            packets_to_transfer = MAX_TRANSFER_PACKETS;
+            bytes_to_transfer = packets_to_transfer * pipe->max_packet;
+        }
+
+        usbc_hctsiz.s.xfersize = bytes_to_transfer;
+        usbc_hctsiz.s.pktcnt = packets_to_transfer;
+
+        /* Update the DATA0/DATA1 toggle */
+        usbc_hctsiz.s.pid = __cvmx_usb_get_data_pid(pipe);
+        /* High speed pipes may need a hardware ping before they start */
+        if (pipe->flags & __CVMX_USB_PIPE_FLAGS_NEED_PING)
+            usbc_hctsiz.s.dopng = 1;
+
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCSPLTX(channel, usb->index), usbc_hcsplt.u32);
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index), usbc_hctsiz.u32);
+    }
+
+    /* Setup the Host Channel Characteristics Register */
+    {
+        cvmx_usbcx_hccharx_t usbc_hcchar = {.u32 = 0};
+
+        /* Set the startframe odd/even properly. This is only used for periodic transfers */
+        usbc_hcchar.s.oddfrm = usb->frame_number&1;
+
+        /* Set the number of back to back packets allowed by this endpoint.
+            Split transactions interpret "ec" as the number of immediate
+            retries on failure. These retries happen too quickly, so we
+            disable them entirely for splits */
+        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+            usbc_hcchar.s.ec = 1;
+        else if (pipe->multi_count < 1)
+            usbc_hcchar.s.ec = 1;
+        else if (pipe->multi_count > 3)
+            usbc_hcchar.s.ec = 3;
+        else
+            usbc_hcchar.s.ec = pipe->multi_count;
+
+        /* Set the rest of the endpoint specific settings */
+        usbc_hcchar.s.devaddr = pipe->device_addr;
+        usbc_hcchar.s.eptype = transaction->type;
+        usbc_hcchar.s.lspddev = (pipe->device_speed == CVMX_USB_SPEED_LOW);
+        usbc_hcchar.s.epdir = pipe->transfer_dir;
+        usbc_hcchar.s.epnum = pipe->endpoint_num;
+        usbc_hcchar.s.mps = pipe->max_packet;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+    }
+
+    /* Do transaction type specific fixups as needed */
+    switch (transaction->type) {
+        case CVMX_USB_TRANSFER_CONTROL:
+            __cvmx_usb_start_channel_control(usb, channel, pipe);
+            break;
+        case CVMX_USB_TRANSFER_BULK:
+        case CVMX_USB_TRANSFER_INTERRUPT:
+            break;
+        case CVMX_USB_TRANSFER_ISOCHRONOUS:
+            if (!__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                /* ISO transactions require different PIDs depending on direction
+                    and how many packets are needed */
+                if (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) {
+                    if (pipe->multi_count < 2) /* Need DATA0 */
+                        USB_SET_FIELD32(CVMX_USBCX_HCTSIZX(channel, usb->index), cvmx_usbcx_hctsizx_t, pid, 0);
+                    else /* Need MDATA */
+                        USB_SET_FIELD32(CVMX_USBCX_HCTSIZX(channel, usb->index), cvmx_usbcx_hctsizx_t, pid, 3);
+                }
+            }
+            break;
+    }
+    {
+        cvmx_usbcx_hctsizx_t usbc_hctsiz = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index))};
+        transaction->xfersize = usbc_hctsiz.s.xfersize;
+        transaction->pktcnt = usbc_hctsiz.s.pktcnt;
+    }
+    /* Remember when we start a split transaction */
+    if (__cvmx_usb_pipe_needs_split(usb, pipe))
+        usb->active_split = transaction;
+    USB_SET_FIELD32(CVMX_USBCX_HCCHARX(channel, usb->index), cvmx_usbcx_hccharx_t, chena, 1);
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+        __cvmx_usb_fill_tx_fifo(usb, channel);
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Find a pipe that is ready to be scheduled to hardware.
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param list   Pipe list to search
+ * @param current_frame
+ *               Frame counter to use as a time reference.
+ *
+ * @return Pipe or NULL if none are ready
+ */
+static cvmx_usb_pipe_t *__cvmx_usb_find_ready_pipe(cvmx_usb_internal_state_t *usb, cvmx_usb_pipe_list_t *list, uint64_t current_frame)
+{
+    cvmx_usb_pipe_t *pipe = list->head;
+    while (pipe) {
+        if (!(pipe->flags & __CVMX_USB_PIPE_FLAGS_SCHEDULED) && pipe->head &&
+            (pipe->next_tx_frame <= current_frame) &&
+            ((pipe->split_sc_frame == -1) || ((((int)current_frame - (int)pipe->split_sc_frame) & 0x7f) < 0x40)) &&
+            (!usb->active_split || (usb->active_split == pipe->head))) {
+            CVMX_PREFETCH(pipe, 128);
+            CVMX_PREFETCH(pipe->head, 0);
+            return pipe;
+        }
+        pipe = pipe->next;
+    }
+    return NULL;
+}
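+
+/*
+ * The split complete window test above works modulo 128 frames: the pipe
+ * is ready only while ((current_frame - split_sc_frame) & 0x7f) < 0x40,
+ * i.e. the recorded start split is at most 63 frames in the past. For
+ * example (illustrative), split_sc_frame = 0x7e and current_frame = 0x03
+ * gives (0x03 - 0x7e) & 0x7f = 0x05, so the pipe is still ready.
+ */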
+
+
+/**
+ * @INTERNAL
+ * Called whenever a pipe might need to be scheduled to the
+ * hardware.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param is_sof True if this schedule was called on a SOF interrupt.
+ */
+static void __cvmx_usb_schedule(cvmx_usb_internal_state_t *usb, int is_sof)
+{
+    int channel;
+    cvmx_usb_pipe_t *pipe;
+    int need_sof;
+    cvmx_usb_transfer_t ttype;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+        /* Without DMA we need to be careful to not schedule something at the end of a frame and cause an overrun */
+        cvmx_usbcx_hfnum_t hfnum = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index))};
+        cvmx_usbcx_hfir_t hfir = {.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFIR(usb->index))};
+        if (hfnum.s.frrem < hfir.s.frint/4)
+            goto done;
+    }
+
+    while (usb->idle_hardware_channels) {
+        /* Find an idle channel */
+        CVMX_CLZ(channel, usb->idle_hardware_channels);
+        channel = 31 - channel;
+        if (cvmx_unlikely(channel > 7)) {
+            if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO))
+                cvmx_dprintf("%s: Idle hardware channels has a channel higher than 7. This is wrong\n", __FUNCTION__);
+            break;
+        }
+
+        /* Find a pipe needing service */
+        pipe = NULL;
+        if (is_sof) {
+            /* Only process periodic pipes on SOF interrupts. This way we are
+                sure that the periodic data is sent at the beginning of the
+                frame */
+            pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_ISOCHRONOUS, usb->frame_number);
+            if (cvmx_likely(!pipe))
+                pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_INTERRUPT, usb->frame_number);
+        }
+        if (cvmx_likely(!pipe)) {
+            pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_CONTROL, usb->frame_number);
+            if (cvmx_likely(!pipe))
+                pipe = __cvmx_usb_find_ready_pipe(usb, usb->active_pipes + CVMX_USB_TRANSFER_BULK, usb->frame_number);
+        }
+        if (!pipe)
+            break;
+
+        CVMX_USB_LOG_PARAM("%d", channel);
+        CVMX_USB_LOG_PARAM("%p", pipe);
+
+        if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+            (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS))) {
+            cvmx_usb_transaction_t *transaction = pipe->head;
+            const cvmx_usb_control_header_t *header = (transaction->control_header) ? cvmx_phys_to_ptr(transaction->control_header) : NULL;
+            const char *dir = (pipe->transfer_dir == CVMX_USB_DIRECTION_IN) ? "IN" : "OUT";
+            const char *type;
+            switch (pipe->transfer_type) {
+                case CVMX_USB_TRANSFER_CONTROL:
+                    type = "SETUP";
+                    dir = (header->s.request_type & 0x80) ? "IN" : "OUT";
+                    break;
+                case CVMX_USB_TRANSFER_ISOCHRONOUS:
+                    type = "ISOCHRONOUS";
+                    break;
+                case CVMX_USB_TRANSFER_BULK:
+                    type = "BULK";
+                    break;
+                default: /* CVMX_USB_TRANSFER_INTERRUPT */
+                    type = "INTERRUPT";
+                    break;
+            }
+            cvmx_dprintf("%s: Starting pipe %d, transaction %d on channel %d. %s %s len=%d header=0x%llx\n",
+                         __FUNCTION__, __cvmx_usb_get_pipe_handle(usb, pipe),
+                         __cvmx_usb_get_submit_handle(usb, transaction),
+                         channel, type, dir,
+                         transaction->buffer_length,
+                         (header) ? (unsigned long long)header->u64 : 0ull);
+        }
+        __cvmx_usb_start_channel(usb, channel, pipe);
+    }
+
+done:
+    /* Only enable SOF interrupts when we have transactions pending in the
+        future that might need to be scheduled */
+    need_sof = 0;
+    for (ttype=CVMX_USB_TRANSFER_CONTROL; ttype<=CVMX_USB_TRANSFER_INTERRUPT; ttype++) {
+        pipe = usb->active_pipes[ttype].head;
+        while (pipe) {
+            if (pipe->next_tx_frame > usb->frame_number) {
+                need_sof = 1;
+                break;
+            }
+            pipe=pipe->next;
+        }
+    }
+    USB_SET_FIELD32(CVMX_USBCX_GINTMSK(usb->index), cvmx_usbcx_gintmsk_t, sofmsk, need_sof);
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Call a user's callback for a specific reason.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe the callback is for or NULL
+ * @param transaction
+ *               Transaction the callback is for or NULL
+ * @param reason Reason this callback is being called
+ * @param complete_code
+ *               Completion code for the transaction, if any
+ */
+static void __cvmx_usb_perform_callback(cvmx_usb_internal_state_t *usb,
+                                        cvmx_usb_pipe_t *pipe,
+                                        cvmx_usb_transaction_t *transaction,
+                                        cvmx_usb_callback_t reason,
+                                        cvmx_usb_complete_t complete_code)
+{
+    cvmx_usb_callback_func_t callback = usb->callback[reason];
+    void *user_data = usb->callback_data[reason];
+    int submit_handle = -1;
+    int pipe_handle = -1;
+    int bytes_transferred = 0;
+
+    if (pipe)
+        pipe_handle = __cvmx_usb_get_pipe_handle(usb, pipe);
+
+    if (transaction) {
+        submit_handle = __cvmx_usb_get_submit_handle(usb, transaction);
+        bytes_transferred = transaction->actual_bytes;
+        /* Transactions are allowed to override the default callback */
+        if ((reason == CVMX_USB_CALLBACK_TRANSFER_COMPLETE) && transaction->callback) {
+            callback = transaction->callback;
+            user_data = transaction->callback_data;
+        }
+    }
+
+    if (!callback)
+        return;
+
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS))
+        cvmx_dprintf("%*s%s: calling callback %p(usb=%p, complete_code=%s, "
+                     "pipe_handle=%d, submit_handle=%d, bytes_transferred=%d, user_data=%p);\n",
+                     2*usb->indent, "", __FUNCTION__, callback, usb,
+                     __cvmx_usb_complete_to_string(complete_code),
+                     pipe_handle, submit_handle, bytes_transferred, user_data);
+
+    callback((cvmx_usb_state_t *)usb, reason, complete_code, pipe_handle, submit_handle,
+             bytes_transferred, user_data);
+
+    if (cvmx_unlikely(usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS))
+        cvmx_dprintf("%*s%s: callback %p complete\n", 2*usb->indent, "",
+                      __FUNCTION__, callback);
+}
+
+
+/**
+ * @INTERNAL
+ * Signal the completion of a transaction and free it. The
+ * transaction will be removed from the pipe transaction list.
+ *
+ * @param usb    USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe   Pipe the transaction is on
+ * @param transaction
+ *               Transaction that completed
+ * @param complete_code
+ *               Completion code
+ */
+static void __cvmx_usb_perform_complete(cvmx_usb_internal_state_t * usb,
+                                        cvmx_usb_pipe_t *pipe,
+                                        cvmx_usb_transaction_t *transaction,
+                                        cvmx_usb_complete_t complete_code)
+{
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%p", pipe);
+    CVMX_USB_LOG_PARAM("%p", transaction);
+    CVMX_USB_LOG_PARAM("%d", complete_code);
+
+    /* If this was a split then clear our split in progress marker */
+    if (usb->active_split == transaction)
+        usb->active_split = NULL;
+
+    /* Isochronous transactions need extra processing as they might not be done
+        after a single data transfer */
+    if (cvmx_unlikely(transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)) {
+        /* Update the number of bytes transferred in this ISO packet */
+        transaction->iso_packets[0].length = transaction->actual_bytes;
+        transaction->iso_packets[0].status = complete_code;
+
+        /* If there are more ISOs pending and we succeeded, schedule the next
+            one */
+        if ((transaction->iso_number_packets > 1) && (complete_code == CVMX_USB_COMPLETE_SUCCESS)) {
+            transaction->actual_bytes = 0;      /* No bytes transferred for this packet as of yet */
+            transaction->iso_number_packets--;  /* One less ISO waiting to transfer */
+            transaction->iso_packets++;         /* Increment to the next location in our packet array */
+            transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+            goto done;
+        }
+    }
+
+    /* Remove the transaction from the pipe list */
+    if (transaction->next)
+        transaction->next->prev = transaction->prev;
+    else
+        pipe->tail = transaction->prev;
+    if (transaction->prev)
+        transaction->prev->next = transaction->next;
+    else
+        pipe->head = transaction->next;
+    if (!pipe->head) {
+        __cvmx_usb_remove_pipe(usb->active_pipes + pipe->transfer_type, pipe);
+        __cvmx_usb_append_pipe(&usb->idle_pipes, pipe);
+    }
+    __cvmx_usb_perform_callback(usb, pipe, transaction,
+                                CVMX_USB_CALLBACK_TRANSFER_COMPLETE,
+                                complete_code);
+    __cvmx_usb_free_transaction(usb, transaction);
+done:
+    CVMX_USB_RETURN_NOTHING();
+}
+
+
+/**
+ * @INTERNAL
+ * Submit a usb transaction to a pipe. Called for all types
+ * of transactions.
+ *
+ * @param usb
+ * @param pipe_handle
+ *                  Which pipe to submit to. Will be validated in this function.
+ * @param type      Transaction type
+ * @param flags     Flags for the transaction
+ * @param buffer    User buffer for the transaction
+ * @param buffer_length
+ *                  User buffer's length in bytes
+ * @param control_header
+ *                  For control transactions, the 8 byte standard header
+ * @param iso_start_frame
+ *                  For ISO transactions, the start frame
+ * @param iso_number_packets
+ *                  For ISO, the number of packets in the transaction.
+ * @param iso_packets
+ *                  A description of each ISO packet
+ * @param callback  User callback to call when the transaction completes
+ * @param user_data User's data for the callback
+ *
+ * @return Submit handle or negative on failure. Matches the result
+ *         in the external API.
+ */
+static int __cvmx_usb_submit_transaction(cvmx_usb_internal_state_t *usb,
+                                         int pipe_handle,
+                                         cvmx_usb_transfer_t type,
+                                         int flags,
+                                         uint64_t buffer,
+                                         int buffer_length,
+                                         uint64_t control_header,
+                                         int iso_start_frame,
+                                         int iso_number_packets,
+                                         cvmx_usb_iso_packet_t *iso_packets,
+                                         cvmx_usb_callback_func_t callback,
+                                         void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_transaction_t *transaction;
+    cvmx_usb_pipe_t *pipe = usb->pipe + pipe_handle;
+
+    CVMX_USB_LOG_CALLED();
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(pipe->transfer_type != type))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    transaction = __cvmx_usb_alloc_transaction(usb);
+    if (cvmx_unlikely(!transaction))
+        CVMX_USB_RETURN(CVMX_USB_NO_MEMORY);
+
+    transaction->type = type;
+    transaction->flags |= flags;
+    transaction->buffer = buffer;
+    transaction->buffer_length = buffer_length;
+    transaction->control_header = control_header;
+    transaction->iso_start_frame = iso_start_frame; /* FIXME: This is not used, implement it */
+    transaction->iso_number_packets = iso_number_packets;
+    transaction->iso_packets = iso_packets;
+    transaction->callback = callback;
+    transaction->callback_data = user_data;
+    if (transaction->type == CVMX_USB_TRANSFER_CONTROL)
+        transaction->stage = CVMX_USB_STAGE_SETUP;
+    else
+        transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+
+    transaction->next = NULL;
+    if (pipe->tail) {
+        transaction->prev = pipe->tail;
+        transaction->prev->next = transaction;
+    }
+    else {
+        if (pipe->next_tx_frame < usb->frame_number)
+            pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+        transaction->prev = NULL;
+        pipe->head = transaction;
+        __cvmx_usb_remove_pipe(&usb->idle_pipes, pipe);
+        __cvmx_usb_append_pipe(usb->active_pipes + pipe->transfer_type, pipe);
+    }
+    pipe->tail = transaction;
+
+    submit_handle = __cvmx_usb_get_submit_handle(usb, transaction);
+
+    /* We may need to schedule the pipe if this was the head of the pipe */
+    if (!transaction->prev)
+        __cvmx_usb_schedule(usb, 0);
+
+    CVMX_USB_RETURN(submit_handle);
+}
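+
+/*
+ * Worked example of the next_tx_frame catch-up above (illustrative):
+ * with interval = 8, next_tx_frame = 100 and frame_number = 130, the
+ * pipe is rescheduled to 130 + 8 - ((130 - 100) % 8) = 132, the next
+ * slot on the pipe's original 8 frame grid.
+ */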
+
+
+/**
+ * Call to submit a USB Bulk transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_bulk(cvmx_usb_state_t *state, int pipe_handle,
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Pipe handle checking is done later in a common place */
+    if (cvmx_unlikely(!buffer))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(buffer_length < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                         CVMX_USB_TRANSFER_BULK,
+                                         0, /* flags */
+                                         buffer,
+                                         buffer_length,
+                                         0, /* control_header */
+                                         0, /* iso_start_frame */
+                                         0, /* iso_number_packets */
+                                         NULL, /* iso_packets */
+                                         callback,
+                                         user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
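+
+/*
+ * Usage sketch (illustrative, not part of the driver): queue a bulk
+ * transfer with a per-transaction completion callback. The callback
+ * signature matches cvmx_usb_callback_func_t as invoked by
+ * __cvmx_usb_perform_callback(); cvmx_ptr_to_phys() is assumed as the
+ * inverse of the cvmx_phys_to_ptr() used in this file.
+ *
+ *    static void bulk_done(cvmx_usb_state_t *usb, cvmx_usb_callback_t reason,
+ *                          cvmx_usb_complete_t status, int pipe_handle,
+ *                          int submit_handle, int bytes_transferred,
+ *                          void *user_data)
+ *    {
+ *        // hypothetical completion handler
+ *    }
+ *
+ *    int handle = cvmx_usb_submit_bulk(&state, pipe,
+ *                                      cvmx_ptr_to_phys(buffer), length,
+ *                                      bulk_done, NULL);
+ */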
+
+
+/**
+ * Call to submit a USB Interrupt transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_interrupt(cvmx_usb_state_t *state, int pipe_handle,
+                              uint64_t buffer, int buffer_length,
+                              cvmx_usb_callback_func_t callback,
+                              void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Pipe handle checking is done later in a common place */
+    if (cvmx_unlikely(!buffer))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(buffer_length < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                         CVMX_USB_TRANSFER_INTERRUPT,
+                                         0, /* flags */
+                                         buffer,
+                                         buffer_length,
+                                         0, /* control_header */
+                                         0, /* iso_start_frame */
+                                         0, /* iso_number_packets */
+                                         NULL, /* iso_packets */
+                                         callback,
+                                         user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
+
+/**
+ * Call to submit a USB Control transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param control_header
+ *                  USB 8 byte control header physical address.
+ *                  Note that this is NOT A POINTER, but the
+ *                  full 64bit physical address of the buffer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_control(cvmx_usb_state_t *state, int pipe_handle,
+                            uint64_t control_header,
+                            uint64_t buffer, int buffer_length,
+                            cvmx_usb_callback_func_t callback,
+                            void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(control_header);
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)control_header);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Pipe handle checking is done later in a common place */
+    if (cvmx_unlikely(!control_header))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    /* Some drivers send a buffer with a zero length. God only knows why */
+    if (cvmx_unlikely(buffer && (buffer_length < 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(!buffer && (buffer_length != 0)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if ((header->s.request_type & 0x80) == 0)
+        buffer_length = cvmx_le16_to_cpu(header->s.length);
+
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                         CVMX_USB_TRANSFER_CONTROL,
+                                         0, /* flags */
+                                         buffer,
+                                         buffer_length,
+                                         control_header,
+                                         0, /* iso_start_frame */
+                                         0, /* iso_number_packets */
+                                         NULL, /* iso_packets */
+                                         callback,
+                                         user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
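+/*
+ * Illustrative usage sketch (not part of the driver). The 8 byte setup
+ * packet and the data buffer are both passed by physical address;
+ * setup and buf are hypothetical DMA-able objects owned by the caller,
+ * converted with cvmx_ptr_to_phys().
+ *
+ * @code
+ * int submit_handle = cvmx_usb_submit_control(state, pipe_handle,
+ *                                             cvmx_ptr_to_phys(setup),
+ *                                             cvmx_ptr_to_phys(buf),
+ *                                             buf_length,
+ *                                             my_complete, my_data);
+ * @endcode
+ */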
+
+/**
+ * Call to submit a USB Isochronous transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param start_frame
+ *                  Number of frames into the future to schedule
+ *                  this transaction.
+ * @param flags     Flags to control the transfer. See
+ *                  cvmx_usb_isochronous_flags_t for the flag
+ *                  definitions.
+ * @param number_packets
+ *                  Number of sequential packets to transfer.
+ *                  "packets" is a pointer to an array of this
+ *                  many packet structures.
+ * @param packets   Description of each transfer packet as
+ *                  defined by cvmx_usb_iso_packet_t. The array
+ *                  pointed to here must stay valid until the
+ *                  complete callback is called.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+int cvmx_usb_submit_isochronous(cvmx_usb_state_t *state, int pipe_handle,
+                                int start_frame, int flags,
+                                int number_packets,
+                                cvmx_usb_iso_packet_t packets[],
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data)
+{
+    int submit_handle;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("%d", start_frame);
+    CVMX_USB_LOG_PARAM("0x%x", flags);
+    CVMX_USB_LOG_PARAM("%d", number_packets);
+    CVMX_USB_LOG_PARAM("%p", packets);
+    CVMX_USB_LOG_PARAM("0x%llx", (unsigned long long)buffer);
+    CVMX_USB_LOG_PARAM("%d", buffer_length);
+
+    /* Pipe handle checking is done later in a common place */
+    if (cvmx_unlikely(start_frame < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(flags & ~(CVMX_USB_ISOCHRONOUS_FLAGS_ALLOW_SHORT | CVMX_USB_ISOCHRONOUS_FLAGS_ASAP)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(number_packets < 1))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(!packets))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(!buffer))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(buffer_length < 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    submit_handle = __cvmx_usb_submit_transaction(usb, pipe_handle,
+                                         CVMX_USB_TRANSFER_ISOCHRONOUS,
+                                         flags,
+                                         buffer,
+                                         buffer_length,
+                                         0, /* control_header */
+                                         start_frame,
+                                         number_packets,
+                                         packets,
+                                         callback,
+                                         user_data);
+    CVMX_USB_RETURN(submit_handle);
+}
+
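+/*
+ * Illustrative usage sketch (not part of the driver): a single packet
+ * isochronous OUT scheduled as soon as possible. iso_buf is a
+ * hypothetical DMA-able buffer; the cvmx_usb_iso_packet_t array must
+ * stay valid until the completion callback runs, and only its length
+ * field is shown here.
+ *
+ * @code
+ * static cvmx_usb_iso_packet_t packet;
+ * int submit_handle;
+ *
+ * packet.length = 188;
+ * submit_handle = cvmx_usb_submit_isochronous(state, pipe_handle,
+ *                     0, CVMX_USB_ISOCHRONOUS_FLAGS_ASAP,
+ *                     1, &packet,
+ *                     cvmx_ptr_to_phys(iso_buf), 188,
+ *                     my_complete, my_data);
+ * @endcode
+ */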
+
+/**
+ * Cancel one outstanding request in a pipe. Canceling a request
+ * can fail if the transaction has already completed before cancel
+ * is called. Even after a successful cancel call, it may take
+ * a frame or two for the cvmx_usb_poll() function to call the
+ * associated callback.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ * @param submit_handle
+ *               Handle to transaction to cancel, returned by the submit function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_cancel(cvmx_usb_state_t *state, int pipe_handle,
+                                  int submit_handle)
+{
+    cvmx_usb_transaction_t *transaction;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe = usb->pipe + pipe_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    CVMX_USB_LOG_PARAM("%d", submit_handle);
+
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely((submit_handle < 0) || (submit_handle >= MAX_TRANSACTIONS)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    transaction = usb->transaction + submit_handle;
+
+    /* Fail if this transaction already completed */
+    if (cvmx_unlikely((transaction->flags & __CVMX_USB_TRANSACTION_FLAGS_IN_USE) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* If the transaction is at the head of the queue and scheduled, we need
+        to treat it specially */
+    if ((pipe->head == transaction) &&
+        (pipe->flags & __CVMX_USB_PIPE_FLAGS_SCHEDULED)) {
+        cvmx_usbcx_hccharx_t usbc_hcchar;
+
+        usb->pipe_for_channel[pipe->channel] = NULL;
+        pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+        CVMX_SYNCW;
+
+        usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(pipe->channel, usb->index));
+        /* If the channel isn't enabled then the transaction already completed */
+        if (usbc_hcchar.s.chena) {
+            usbc_hcchar.s.chdis = 1;
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(pipe->channel, usb->index), usbc_hcchar.u32);
+        }
+    }
+    __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_CANCEL);
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
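+/*
+ * Illustrative usage sketch (not part of the driver). A failed cancel
+ * usually means the transaction completed before the cancel was issued;
+ * on success the completion callback still runs, reporting
+ * CVMX_USB_COMPLETE_CANCEL, possibly a frame or two later from
+ * cvmx_usb_poll().
+ *
+ * @code
+ * cvmx_usb_status_t result = cvmx_usb_cancel(state, pipe_handle,
+ *                                            submit_handle);
+ * if (result != CVMX_USB_SUCCESS) {
+ *     // The transaction most likely already completed; its normal
+ *     // completion callback has been called instead.
+ * }
+ * @endcode
+ */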
+
+/**
+ * Cancel all outstanding requests in a pipe. Logically all this
+ * does is call cvmx_usb_cancel() in a loop.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_cancel_all(cvmx_usb_state_t *state, int pipe_handle)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe = usb->pipe + pipe_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Simply loop through and attempt to cancel each transaction */
+    while (pipe->head) {
+        cvmx_usb_status_t result = cvmx_usb_cancel(state, pipe_handle,
+            __cvmx_usb_get_submit_handle(usb, pipe->head));
+        if (cvmx_unlikely(result != CVMX_USB_SUCCESS))
+            CVMX_USB_RETURN(result);
+    }
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
+
+/**
+ * Close a pipe created with cvmx_usb_open_pipe().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to close.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_BUSY is returned if the
+ *         pipe has outstanding transfers.
+ */
+cvmx_usb_status_t cvmx_usb_close_pipe(cvmx_usb_state_t *state, int pipe_handle)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usb_pipe_t *pipe = usb->pipe + pipe_handle;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", pipe_handle);
+    if (cvmx_unlikely((pipe_handle < 0) || (pipe_handle >= MAX_PIPES)))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the pipe isn't open */
+    if (cvmx_unlikely((pipe->flags & __CVMX_USB_PIPE_FLAGS_OPEN) == 0))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    /* Fail if the pipe has pending transactions */
+    if (cvmx_unlikely(pipe->head))
+        CVMX_USB_RETURN(CVMX_USB_BUSY);
+
+    pipe->flags = 0;
+    __cvmx_usb_remove_pipe(&usb->idle_pipes, pipe);
+    __cvmx_usb_append_pipe(&usb->free_pipes, pipe);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
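+/*
+ * Illustrative teardown sketch (not part of the driver): cancel any
+ * outstanding transfers, then poll until the pipe drains and the close
+ * stops returning CVMX_USB_BUSY.
+ *
+ * @code
+ * cvmx_usb_cancel_all(state, pipe_handle);
+ * while (cvmx_usb_close_pipe(state, pipe_handle) == CVMX_USB_BUSY)
+ *     cvmx_usb_poll(state);
+ * @endcode
+ */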
+
+/**
+ * Register a function to be called when various USB events occur.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param reason    Which event to register for.
+ * @param callback  Function to call when the event occurs.
+ * @param user_data User data parameter to the function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_register_callback(cvmx_usb_state_t *state,
+                                             cvmx_usb_callback_t reason,
+                                             cvmx_usb_callback_func_t callback,
+                                             void *user_data)
+{
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+    CVMX_USB_LOG_PARAM("%d", reason);
+    CVMX_USB_LOG_PARAM("%p", callback);
+    CVMX_USB_LOG_PARAM("%p", user_data);
+    if (cvmx_unlikely(reason >= __CVMX_USB_CALLBACK_END))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+    if (cvmx_unlikely(!callback))
+        CVMX_USB_RETURN(CVMX_USB_INVALID_PARAM);
+
+    usb->callback[reason] = callback;
+    usb->callback_data[reason] = user_data;
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
+
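+/*
+ * Illustrative usage sketch (not part of the driver); my_port_changed
+ * is a hypothetical handler matching the callback prototype documented
+ * in cvmx-usb.h.
+ *
+ * @code
+ * static void my_port_changed(cvmx_usb_state_t *usb,
+ *                             cvmx_usb_callback_t reason,
+ *                             cvmx_usb_complete_t status,
+ *                             int pipe_handle, int submit_handle,
+ *                             int bytes_transferred, void *user_data)
+ * {
+ *     // e.g. ask the operating system to poll the root hub
+ * }
+ *
+ * cvmx_usb_register_callback(state, CVMX_USB_CALLBACK_PORT_CHANGED,
+ *                            my_port_changed, my_data);
+ * @endcode
+ */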
+
+/**
+ * Get the current USB protocol level frame number. The frame
+ * number is always in the range of 0-0x7ff.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return USB frame number
+ */
+int cvmx_usb_get_frame_number(cvmx_usb_state_t *state)
+{
+    int frame_number;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+    cvmx_usbcx_hfnum_t usbc_hfnum;
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    usbc_hfnum.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index));
+    frame_number = usbc_hfnum.s.frnum;
+
+    CVMX_USB_RETURN(frame_number);
+}
+
+
+/**
+ * @INTERNAL
+ * Poll a channel for status
+ *
+ * @param usb     USB device
+ * @param channel Channel to poll
+ *
+ * @return Zero on success
+ */
+static int __cvmx_usb_poll_channel(cvmx_usb_internal_state_t *usb, int channel)
+{
+    cvmx_usbcx_hcintx_t usbc_hcint;
+    cvmx_usbcx_hctsizx_t usbc_hctsiz;
+    cvmx_usbcx_hccharx_t usbc_hcchar;
+    cvmx_usb_pipe_t *pipe;
+    cvmx_usb_transaction_t *transaction;
+    int bytes_this_transfer;
+    int bytes_in_last_packet;
+    int packets_processed;
+    int buffer_space_left;
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", usb);
+    CVMX_USB_LOG_PARAM("%d", channel);
+
+    /* Read the interrupt status bits for the channel */
+    usbc_hcint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCINTX(channel, usb->index));
+
+    if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA) {
+        usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+
+        if (usbc_hcchar.s.chena && usbc_hcchar.s.chdis) {
+            /* There seems to be a bug in CN31XX which can cause interrupt
+                IN transfers to get stuck until we do a write of HCCHARX
+                without changing things */
+            __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+            CVMX_USB_RETURN(0);
+        }
+
+        /* In non DMA mode the channels don't halt themselves. We need to
+            manually disable channels that are left running */
+        if (!usbc_hcint.s.chhltd) {
+            if (usbc_hcchar.s.chena) {
+                cvmx_usbcx_hcintmskx_t hcintmsk;
+                /* Disable all interrupts except CHHLTD */
+                hcintmsk.u32 = 0;
+                hcintmsk.s.chhltdmsk = 1;
+                __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), hcintmsk.u32);
+                usbc_hcchar.s.chdis = 1;
+                __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index), usbc_hcchar.u32);
+                CVMX_USB_RETURN(0);
+            }
+            else if (usbc_hcint.s.xfercompl) {
+                /* Successful IN/OUT with transfer complete. Channel halt isn't needed */
+            }
+            else {
+                cvmx_dprintf("USB%d: Channel %d interrupt without halt\n", usb->index, channel);
+                CVMX_USB_RETURN(0);
+            }
+        }
+    }
+    else {
+        /* There are no interrupts that we need to process when the channel is
+            still running */
+        if (!usbc_hcint.s.chhltd)
+            CVMX_USB_RETURN(0);
+    }
+
+    /* Disable the channel interrupts now that it is done */
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_HCINTMSKX(channel, usb->index), 0);
+    usb->idle_hardware_channels |= (1<<channel);
+
+    /* Make sure this channel is tied to a valid pipe */
+    pipe = usb->pipe_for_channel[channel];
+    CVMX_PREFETCH(pipe, 0);
+    CVMX_PREFETCH(pipe, 128);
+    if (!pipe)
+        CVMX_USB_RETURN(0);
+    transaction = pipe->head;
+    CVMX_PREFETCH0(transaction);
+
+    /* Disconnect this pipe from the HW channel. Later the schedule function will
+        figure out which pipe needs to go */
+    usb->pipe_for_channel[channel] = NULL;
+    pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_SCHEDULED;
+
+    /* Read the channel config info so we can figure out how much data
+        transfered */
+    usbc_hcchar.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCCHARX(channel, usb->index));
+    usbc_hctsiz.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HCTSIZX(channel, usb->index));
+
+    /* Calculating the number of bytes successfully transferred is dependent on
+        the transfer direction */
+    packets_processed = transaction->pktcnt - usbc_hctsiz.s.pktcnt;
+    if (usbc_hcchar.s.epdir) {
+        /* IN transactions are easy. For every byte received the hardware
+            decrements xfersize. All we need to do is subtract the current
+            value of xfersize from its starting value and we know how many
+            bytes were written to the buffer */
+        bytes_this_transfer = transaction->xfersize - usbc_hctsiz.s.xfersize;
+    }
+    else {
+        /* OUT transactions don't decrement xfersize. Instead pktcnt is
+            decremented on every successful packet send. The hardware does
+            this when it receives an ACK, or NYET. If it doesn't
+            receive one of these responses pktcnt doesn't change */
+        bytes_this_transfer = packets_processed * usbc_hcchar.s.mps;
+        /* The last packet may not be a full transfer if we didn't have
+            enough data */
+        if (bytes_this_transfer > transaction->xfersize)
+            bytes_this_transfer = transaction->xfersize;
+    }
+    /* Figure out how many bytes were in the last packet of the transfer */
+    if (packets_processed)
+        bytes_in_last_packet = bytes_this_transfer - (packets_processed-1) * usbc_hcchar.s.mps;
+    else
+        bytes_in_last_packet = bytes_this_transfer;
+
+    /* As a special case, setup transactions output the setup header, not
+        the user's data. For this reason we don't count setup data as bytes
+        transferred */
+    if ((transaction->stage == CVMX_USB_STAGE_SETUP) ||
+        (transaction->stage == CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE))
+        bytes_this_transfer = 0;
+
+    /* Optional debug output */
+    if (cvmx_unlikely((usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS) ||
+        (pipe->flags & CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS)))
+        cvmx_dprintf("%s: Channel %d halted. Pipe %d transaction %d stage %d bytes=%d\n",
+                     __FUNCTION__, channel,
+                     __cvmx_usb_get_pipe_handle(usb, pipe),
+                     __cvmx_usb_get_submit_handle(usb, transaction),
+                     transaction->stage, bytes_this_transfer);
+
+    /* Add the bytes transferred to the running total. It is important that
+        bytes_this_transfer doesn't count any data that needs to be
+        retransmitted */
+    transaction->actual_bytes += bytes_this_transfer;
+    if (transaction->type == CVMX_USB_TRANSFER_ISOCHRONOUS)
+        buffer_space_left = transaction->iso_packets[0].length - transaction->actual_bytes;
+    else
+        buffer_space_left = transaction->buffer_length - transaction->actual_bytes;
+
+    /* We need to remember the PID toggle state for the next transaction. The
+        hardware already updated it for the next transaction */
+    pipe->pid_toggle = !(usbc_hctsiz.s.pid == 0);
+
+    /* For high speed bulk out, assume the next transaction will need to do a
+        ping before proceeding. If this isn't true the ACK processing below
+        will clear this flag */
+    if ((pipe->device_speed == CVMX_USB_SPEED_HIGH) &&
+        (pipe->transfer_type == CVMX_USB_TRANSFER_BULK) &&
+        (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT))
+        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+
+    if (usbc_hcint.s.stall) {
+        /* STALL as a response means this transaction cannot be completed
+            because the device can't process transactions. Tell the user. Any
+            data that was transferred will be counted on the actual bytes
+            transferred */
+        pipe->pid_toggle = 0;
+        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_STALL);
+    }
+    else if (usbc_hcint.s.xacterr) {
+        /* We know at least one packet worked if we get an ACK or NAK. Reset the retry counter */
+        if (usbc_hcint.s.nak || usbc_hcint.s.ack)
+            transaction->retries = 0;
+        transaction->retries++;
+        if (transaction->retries > MAX_RETRIES) {
+            /* XactErr as a response means the device signaled something wrong with
+                the transfer. For example, PID toggle errors cause these */
+            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_XACTERR);
+        }
+        else {
+            /* If this was a split then clear our split in progress marker */
+            if (usb->active_split == transaction)
+                usb->active_split = NULL;
+            /* Rewind to the beginning of the transaction by anding off the
+                split complete bit */
+            transaction->stage &= ~1;
+            pipe->split_sc_frame = -1;
+            pipe->next_tx_frame += pipe->interval;
+            if (pipe->next_tx_frame < usb->frame_number)
+                pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                    (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+        }
+    }
+    else if (usbc_hcint.s.bblerr)
+    {
+        /* Babble Error (BblErr) */
+        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_BABBLEERR);
+    }
+    else if (usbc_hcint.s.datatglerr)
+    {
+        /* We'll retry the exact same transaction again */
+        transaction->retries++;
+    }
+    else if (usbc_hcint.s.nyet) {
+        /* NYET as a response is only allowed in three cases: as a response to
+            a ping, as a response to a split transaction, and as a response to
+            a bulk out. The ping case is handled by hardware, so we only have
+            splits and bulk out */
+        if (!__cvmx_usb_pipe_needs_split(usb, pipe)) {
+            transaction->retries = 0;
+            /* If there is more data to go then we need to try again. Otherwise
+                this transaction is complete */
+            if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet))
+                __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+        }
+        else {
+            /* Split transactions retry the split complete 4 times then rewind
+                to the start split and do the entire transactions again */
+            transaction->retries++;
+            if ((transaction->retries & 0x3) == 0) {
+                /* Rewind to the beginning of the transaction by anding off the
+                    split complete bit */
+                transaction->stage &= ~1;
+                pipe->split_sc_frame = -1;
+            }
+        }
+    }
+    else if (usbc_hcint.s.ack) {
+        transaction->retries = 0;
+        /* The ACK bit can only be checked after the other error bits. This is
+            because a multi packet transfer may succeed in a number of packets
+            and then get a different response on the last packet. In this case
+            both ACK and the last response bit will be set. If none of the
+            other response bits is set, then the last packet must have been an
+            ACK */
+
+        /* Since we got an ACK, we know we don't need to do a ping on this
+            pipe */
+        pipe->flags &= ~__CVMX_USB_PIPE_FLAGS_NEED_PING;
+
+        switch (transaction->type)
+        {
+            case CVMX_USB_TRANSFER_CONTROL:
+                switch (transaction->stage)
+                {
+                    case CVMX_USB_STAGE_NON_CONTROL:
+                    case CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE:
+                        /* This should be impossible */
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_ERROR);
+                        break;
+                    case CVMX_USB_STAGE_SETUP:
+                        pipe->pid_toggle = 1;
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+                            transaction->stage = CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE;
+                        else {
+                            cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+                            if (header->s.length)
+                                transaction->stage = CVMX_USB_STAGE_DATA;
+                            else
+                                transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_SETUP_SPLIT_COMPLETE:
+                        {
+                            cvmx_usb_control_header_t *header = cvmx_phys_to_ptr(transaction->control_header);
+                            if (header->s.length)
+                                transaction->stage = CVMX_USB_STAGE_DATA;
+                            else
+                                transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_DATA:
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                            transaction->stage = CVMX_USB_STAGE_DATA_SPLIT_COMPLETE;
+                            /* For setup OUT data that are splits, the hardware
+                                doesn't appear to count transferred data. Here
+                                we manually update the data transferred */
+                            if (!usbc_hcchar.s.epdir) {
+                                if (buffer_space_left < pipe->max_packet)
+                                    transaction->actual_bytes += buffer_space_left;
+                                else
+                                    transaction->actual_bytes += pipe->max_packet;
+                            }
+                        }
+                        else if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                            pipe->pid_toggle = 1;
+                            transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_DATA_SPLIT_COMPLETE:
+                        if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                            pipe->pid_toggle = 1;
+                            transaction->stage = CVMX_USB_STAGE_STATUS;
+                        }
+                        else {
+                            transaction->stage = CVMX_USB_STAGE_DATA;
+                        }
+                        break;
+                    case CVMX_USB_STAGE_STATUS:
+                        if (__cvmx_usb_pipe_needs_split(usb, pipe))
+                            transaction->stage = CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE;
+                        else
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        break;
+                    case CVMX_USB_STAGE_STATUS_SPLIT_COMPLETE:
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        break;
+                }
+                break;
+            case CVMX_USB_TRANSFER_BULK:
+            case CVMX_USB_TRANSFER_INTERRUPT:
+                /* The only time a bulk transfer isn't complete when
+                    it finishes with an ACK is during a split transaction. For
+                    splits we need to continue the transfer if more data is
+                    needed */
+                if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                    if (transaction->stage == CVMX_USB_STAGE_NON_CONTROL)
+                        transaction->stage = CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE;
+                    else {
+                        if (buffer_space_left && (bytes_in_last_packet == pipe->max_packet))
+                            transaction->stage = CVMX_USB_STAGE_NON_CONTROL;
+                        else {
+                            if (transaction->type == CVMX_USB_TRANSFER_INTERRUPT)
+                                pipe->next_tx_frame += pipe->interval;
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        }
+                    }
+                }
+                else {
+                    if ((pipe->device_speed == CVMX_USB_SPEED_HIGH) &&
+                        (pipe->transfer_type == CVMX_USB_TRANSFER_BULK) &&
+                        (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) &&
+                        (usbc_hcint.s.nak))
+                        pipe->flags |= __CVMX_USB_PIPE_FLAGS_NEED_PING;
+                    if (!buffer_space_left || (bytes_in_last_packet < pipe->max_packet)) {
+                        if (transaction->type == CVMX_USB_TRANSFER_INTERRUPT)
+                            pipe->next_tx_frame += pipe->interval;
+                        __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                    }
+                }
+                break;
+            case CVMX_USB_TRANSFER_ISOCHRONOUS:
+                if (__cvmx_usb_pipe_needs_split(usb, pipe)) {
+                    /* ISOCHRONOUS OUT splits don't require a complete split stage.
+                        Instead they use a sequence of begin OUT splits to transfer
+                        the data 188 bytes at a time. Once the transfer is complete,
+                        the pipe sleeps until the next schedule interval */
+                    if (pipe->transfer_dir == CVMX_USB_DIRECTION_OUT) {
+                        /* If no space left or this wasn't a max size packet then
+                            this transfer is complete. Otherwise start it again
+                            to send the next 188 bytes */
+                        if (!buffer_space_left || (bytes_this_transfer < 188)) {
+                            pipe->next_tx_frame += pipe->interval;
+                            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                        }
+                    }
+                    else {
+                        if (transaction->stage == CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE) {
+                            /* We are in the incoming data phase. Keep getting
+                                data until we run out of space or get a small
+                                packet */
+                            if ((buffer_space_left == 0) || (bytes_in_last_packet < pipe->max_packet)) {
+                                pipe->next_tx_frame += pipe->interval;
+                                __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                            }
+                        }
+                        else
+                            transaction->stage = CVMX_USB_STAGE_NON_CONTROL_SPLIT_COMPLETE;
+                    }
+                }
+                else {
+                    pipe->next_tx_frame += pipe->interval;
+                    __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_SUCCESS);
+                }
+                break;
+        }
+    }
+    else if (usbc_hcint.s.nak) {
+        /* If this was a split then clear our split in progress marker */
+        if (usb->active_split == transaction)
+            usb->active_split = NULL;
+        /* NAK as a response means the device couldn't accept the transaction,
+            but it should be retried in the future. Rewind to the beginning of
+            the transaction by anding off the split complete bit. Retry in the
+            next interval */
+        transaction->retries = 0;
+        transaction->stage &= ~1;
+        pipe->next_tx_frame += pipe->interval;
+        if (pipe->next_tx_frame < usb->frame_number)
+            pipe->next_tx_frame = usb->frame_number + pipe->interval -
+                (usb->frame_number - pipe->next_tx_frame) % pipe->interval;
+    }
+    else {
+        cvmx_usb_port_status_t port;
+        port = cvmx_usb_get_status((cvmx_usb_state_t *)usb);
+        if (port.port_enabled)
+        {
+            /* We'll retry the exact same transaction again */
+            transaction->retries++;
+        }
+        else
+        {
+            /* We get channel halted interrupts with no result bits set when the
+                cable is unplugged */
+            __cvmx_usb_perform_complete(usb, pipe, transaction, CVMX_USB_COMPLETE_ERROR);
+        }
+    }
+    CVMX_USB_RETURN(0);
+}
+
+
+/**
+ * Poll the USB block for status and call all needed callback
+ * handlers. This function is meant to be called in the interrupt
+ * handler for the USB controller. It can also be called
+ * periodically in a loop for non-interrupt based operation.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+cvmx_usb_status_t cvmx_usb_poll(cvmx_usb_state_t *state)
+{
+    cvmx_usbcx_hfnum_t usbc_hfnum;
+    cvmx_usbcx_gintsts_t usbc_gintsts;
+    cvmx_usb_internal_state_t *usb = (cvmx_usb_internal_state_t*)state;
+
+    CVMX_PREFETCH(usb, 0);
+    CVMX_PREFETCH(usb, 1*128);
+    CVMX_PREFETCH(usb, 2*128);
+    CVMX_PREFETCH(usb, 3*128);
+    CVMX_PREFETCH(usb, 4*128);
+
+    CVMX_USB_LOG_CALLED();
+    CVMX_USB_LOG_PARAM("%p", state);
+
+    /* Update the frame counter */
+    usbc_hfnum.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HFNUM(usb->index));
+    if ((usb->frame_number&0x3fff) > usbc_hfnum.s.frnum)
+        usb->frame_number += 0x4000;
+    usb->frame_number &= ~0x3fffull;
+    usb->frame_number |= usbc_hfnum.s.frnum;
+
+    /* Read the pending interrupts */
+    usbc_gintsts.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_GINTSTS(usb->index));
+
+    /* Clear the interrupts now that we know about them */
+    __cvmx_usb_write_csr32(usb, CVMX_USBCX_GINTSTS(usb->index), usbc_gintsts.u32);
+
+    if (usbc_gintsts.s.rxflvl) {
+        /* RxFIFO Non-Empty (RxFLvl)
+            Indicates that there is at least one packet pending to be read
+            from the RxFIFO. */
+        /* In DMA mode this is handled by hardware */
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            __cvmx_usb_poll_rx_fifo(usb);
+    }
+    if (usbc_gintsts.s.ptxfemp || usbc_gintsts.s.nptxfemp) {
+        /* Fill the Tx FIFOs when not in DMA mode */
+        if (usb->init_flags & CVMX_USB_INITIALIZE_FLAGS_NO_DMA)
+            __cvmx_usb_poll_tx_fifo(usb);
+    }
+    if (usbc_gintsts.s.disconnint || usbc_gintsts.s.prtint) {
+        cvmx_usbcx_hprt_t usbc_hprt;
+        /* Disconnect Detected Interrupt (DisconnInt)
+            Asserted when a device disconnect is detected. */
+
+        /* Host Port Interrupt (PrtInt)
+            The core sets this bit to indicate a change in port status of one
+            of the O2P USB core ports in Host mode. The application must
+            read the Host Port Control and Status (HPRT) register to
+            determine the exact event that caused this interrupt. The
+            application must clear the appropriate status bit in the Host Port
+            Control and Status register to clear this bit. */
+
+        /* Call the user's port callback */
+        __cvmx_usb_perform_callback(usb, NULL, NULL,
+                                    CVMX_USB_CALLBACK_PORT_CHANGED,
+                                    CVMX_USB_COMPLETE_SUCCESS);
+        /* Clear the port change bits */
+        usbc_hprt.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HPRT(usb->index));
+        usbc_hprt.s.prtena = 0;
+        __cvmx_usb_write_csr32(usb, CVMX_USBCX_HPRT(usb->index), usbc_hprt.u32);
+    }
+    if (usbc_gintsts.s.hchint) {
+        /* Host Channels Interrupt (HChInt)
+            The core sets this bit to indicate that an interrupt is pending on
+            one of the channels of the core (in Host mode). The application
+            must read the Host All Channels Interrupt (HAINT) register to
+            determine the exact number of the channel on which the
+            interrupt occurred, and then read the corresponding Host
+            Channel-n Interrupt (HCINTn) register to determine the exact
+            cause of the interrupt. The application must clear the
+            appropriate status bit in the HCINTn register to clear this bit. */
+        cvmx_usbcx_haint_t usbc_haint;
+        usbc_haint.u32 = __cvmx_usb_read_csr32(usb, CVMX_USBCX_HAINT(usb->index));
+        while (usbc_haint.u32) {
+            int channel;
+            CVMX_CLZ(channel, usbc_haint.u32);
+            channel = 31 - channel;
+            __cvmx_usb_poll_channel(usb, channel);
+            usbc_haint.u32 ^= 1<<channel;
+        }
+    }
+
+    __cvmx_usb_schedule(usb, usbc_gintsts.s.sof);
+
+    CVMX_USB_RETURN(CVMX_USB_SUCCESS);
+}
diff --git a/drivers/staging/octeon-usb/cvmx-usb.h b/drivers/staging/octeon-usb/cvmx-usb.h
new file mode 100644 (file)
index 0000000..db9cc05
--- /dev/null
@@ -0,0 +1,1085 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export  control
+ * laws, including the U.S. Export Administration Act and its  associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM  NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION OR
+ * DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * @file
+ *
+ * "cvmx-usb.h" defines a set of low level USB functions to help
+ * developers create Octeon USB drivers for various operating
+ * systems. These functions provide a generic API to the Octeon
+ * USB blocks, hiding the internal hardware specific
+ * operations.
+ *
+ * At a high level the device driver needs to:
+ *
+ * -# Call cvmx_usb_get_num_ports() to get the number of
+ *  supported ports.
+ * -# Call cvmx_usb_initialize() for each Octeon USB port.
+ * -# Enable the port using cvmx_usb_enable().
+ * -# Either periodically, or in an interrupt handler, call
+ *  cvmx_usb_poll() to service USB events.
+ * -# Manage pipes using cvmx_usb_open_pipe() and
+ *  cvmx_usb_close_pipe().
+ * -# Manage transfers using cvmx_usb_submit_*() and
+ *  cvmx_usb_cancel*().
+ * -# Shutdown USB on unload using cvmx_usb_shutdown().
+ *
+ * To monitor USB status changes, the device driver must use
+ * cvmx_usb_register_callback() to register for events that it
+ * is interested in. Below are a few hints on successfully
+ * implementing a driver on top of this API.
+ *
+ * <h2>Initialization</h2>
+ *
+ * When a driver is first loaded, it is normally not necessary
+ * to bring up the USB port completely. Most operating systems
+ * expect to initialize and enable the port in two independent
+ * steps. Normally an operating system will probe hardware,
+ * initialize anything found, and then enable the hardware.
+ *
+ * In the probe phase you should:
+ * -# Use cvmx_usb_get_num_ports() to determine the number of
+ *  USB ports to be supported.
+ * -# Allocate space for a cvmx_usb_state_t structure for each
+ *  port.
+ * -# Tell the operating system about each port.
+ *
+ * In the initialization phase you should:
+ * -# Use cvmx_usb_initialize() on each port.
+ * -# Do not call cvmx_usb_enable(). This leaves the USB port in
+ *  the disabled state until the operating system is ready.
+ *
+ * Finally, in the enable phase you should:
+ * -# Call cvmx_usb_enable() on the appropriate port.
+ * -# Note that some operating systems use a RESET instead of an
+ *  enable call. To implement RESET, you should call
+ *  cvmx_usb_disable() followed by cvmx_usb_enable().
+ *
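+ * As a minimal sketch of that RESET handling (usb_state is the port's
+ * cvmx_usb_state_t; DEBUG_ERROR is whatever error reporting the driver
+ * uses, as in the hub example below):
+ *
+ * @code
+ * cvmx_usb_disable(&usb_state);
+ * if (cvmx_usb_enable(&usb_state))
+ *     DEBUG_ERROR("Failed to enable the port\n");
+ * @endcode
+ *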
+ * <h2>Locking</h2>
+ *
+ * All of the functions in the cvmx-usb API assume exclusive
+ * access to the USB hardware and internal data structures. This
+ * means that the driver must provide locking as necessary.
+ *
+ * In the single CPU case it is normally enough to disable
+ * interrupts before every call to cvmx_usb*() and enable them
+ * again after the call is complete. Keep in mind that it is
+ * very common for the callback handlers to make additional
+ * calls into cvmx-usb, so the disable/enable must be protected
+ * against recursion. As an example, the Linux kernel
+ * local_irq_save() and local_irq_restore() are perfect for this
+ * in the non SMP case.
+ *
+ * In the SMP case, locking is more complicated. For SMP you not
+ * only need to disable interrupts on the local core, but also
+ * take a lock to make sure that another core cannot call
+ * cvmx-usb.
+ *
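+ * As a minimal sketch for the non SMP Linux case (usb_state is the
+ * caller's cvmx_usb_state_t):
+ *
+ * @code
+ * unsigned long flags;
+ * local_irq_save(flags);
+ * cvmx_usb_poll(&usb_state);
+ * local_irq_restore(flags);
+ * @endcode
+ *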
+ * <h2>Port callback</h2>
+ *
+ * The port callback prototype needs to look as follows:
+ *
+ * void port_callback(cvmx_usb_state_t *usb,
+ *                    cvmx_usb_callback_t reason,
+ *                    cvmx_usb_complete_t status,
+ *                    int pipe_handle,
+ *                    int submit_handle,
+ *                    int bytes_transferred,
+ *                    void *user_data);
+ * - @b usb is the cvmx_usb_state_t for the port.
+ * - @b reason will always be
+ *   CVMX_USB_CALLBACK_PORT_CHANGED.
+ * - @b status will always be CVMX_USB_COMPLETE_SUCCESS.
+ * - @b pipe_handle will always be -1.
+ * - @b submit_handle will always be -1.
+ * - @b bytes_transferred will always be 0.
+ * - @b user_data is the void pointer originally passed along
+ *   with the callback. Use this for any state information you
+ *   need.
+ *
+ * The port callback will be called whenever the user plugs /
+ * unplugs a device from the port. It will not be called when a
+ * device is plugged / unplugged from a hub connected to the
+ * root port. Normally all the callback needs to do is tell the
+ * operating system to poll the root hub for status. Under
+ * Linux, this is performed by calling usb_hcd_poll_rh_status().
+ * In the Linux driver we use @b user_data to pass around the
+ * Linux "hcd" structure. Once the port callback completes,
+ * Linux automatically calls octeon_usb_hub_status_data() which
+ * uses cvmx_usb_get_status() to determine the root port status.
+ *
+ * <h2>Complete callback</h2>
+ *
+ * The completion callback prototype needs to look as follows:
+ *
+ * void complete_callback(cvmx_usb_state_t *usb,
+ *                        cvmx_usb_callback_t reason,
+ *                        cvmx_usb_complete_t status,
+ *                        int pipe_handle,
+ *                        int submit_handle,
+ *                        int bytes_transferred,
+ *                        void *user_data);
+ * - @b usb is the cvmx_usb_state_t for the port.
+ * - @b reason will always be
+ *   CVMX_USB_CALLBACK_TRANSFER_COMPLETE.
+ * - @b status will be one of the cvmx_usb_complete_t
+ *   enumerations.
+ * - @b pipe_handle is the handle to the pipe the transaction
+ *   was originally submitted on.
+ * - @b submit_handle is the handle returned by the original
+ *   cvmx_usb_submit_* call.
+ * - @b bytes_transferred is the number of bytes successfully
+ *   transferred in the transaction. This will be zero on most
+ *   error conditions.
+ * - @b user_data is the void pointer originally passed along
+ *   with the callback. Use this for any state information you
+ *   need. For example, the Linux "urb" is stored here in the
+ *   Linux driver.
+ *
+ * In general your callback handler should use @b status and @b
+ * bytes_transferred to tell the operating system how the
+ * transaction completed. Normally the pipe is not changed in
+ * this callback.
+ *
+ * <h2>Canceling transactions</h2>
+ *
+ * When a transaction is cancelled using cvmx_usb_cancel*(), the
+ * actual length of time until the complete callback is called
+ * can vary greatly. It may be called before cvmx_usb_cancel*()
+ * returns, or it may be called a number of usb frames in the
+ * future once the hardware frees the transaction. In either of
+ * these cases, the complete handler will receive
+ * CVMX_USB_COMPLETE_CANCEL.
+ *
+ * <h2>Handling pipes</h2>
+ *
+ * USB "pipes" is a software construct created by this API to
+ * enable the ordering of usb transactions to a device endpoint.
+ * Octeon's underlying hardware doesn't have any concept
+ * equivalent to "pipes". The hardware instead has eight
+ * channels that can be used simultaneously to have up to eight
+ * transaction in process at the same time. In order to maintain
+ * ordering in a pipe, the transactions for a pipe will only be
+ * active in one hardware channel at a time. From an API user's
+ * perspective, this doesn't matter but it can be helpful to
+ * keep this in mind when you are probing hardware while
+ * debugging.
+ *
+ * Also keep in mind that usb transactions contain state
+ * information about the previous transaction to the same
+ * endpoint. Each transaction has a PID toggle that changes 0/1
+ * between each sub packet. This is maintained in the pipe data
+ * structures. For this reason, you generally cannot create and
+ * destroy a pipe for every transaction. A sequence of
+ * transactions to the same endpoint must use the same pipe.
+ *
+ * <h2>Root Hub</h2>
+ *
+ * Some operating systems view the usb root port as a normal usb
+ * hub. These systems attempt to control the root hub with
+ * messages similar to the usb 2.0 spec for hub control and
+ * status. For these systems it may be necessary to write
+ * a function to decode standard usb control messages into
+ * equivalent cvmx-usb API calls. As an example, the following
+ * code is used under Linux for some of the basic hub control
+ * messages.
+ *
+ * @code
+ * static int octeon_usb_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength)
+ * {
+ *     cvmx_usb_state_t *usb = (cvmx_usb_state_t *)hcd->hcd_priv;
+ *     cvmx_usb_port_status_t usb_port_status;
+ *     int port_status;
+ *     struct usb_hub_descriptor *desc;
+ *     unsigned long flags;
+ *
+ *     switch (typeReq)
+ *     {
+ *         case ClearHubFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: ClearHubFeature\n");
+ *             switch (wValue)
+ *             {
+ *                 case C_HUB_LOCAL_POWER:
+ *                 case C_HUB_OVER_CURRENT:
+ *                     // Nothing required here
+ *                     break;
+ *                 default:
+ *                     return -EINVAL;
+ *             }
+ *             break;
+ *         case ClearPortFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: ClearPortFeature");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             switch (wValue)
+ *             {
+ *                 case USB_PORT_FEAT_ENABLE:
+ *                     DEBUG_ROOT_HUB(" ENABLE");
+ *                     local_irq_save(flags);
+ *                     cvmx_usb_disable(usb);
+ *                     local_irq_restore(flags);
+ *                     break;
+ *                 case USB_PORT_FEAT_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" SUSPEND");
+ *                     // Not supported on Octeon
+ *                     break;
+ *                 case USB_PORT_FEAT_POWER:
+ *                     DEBUG_ROOT_HUB(" POWER");
+ *                     // Not supported on Octeon
+ *                     break;
+ *                 case USB_PORT_FEAT_INDICATOR:
+ *                     DEBUG_ROOT_HUB(" INDICATOR");
+ *                     // Port indicator not supported
+ *                     break;
+ *                 case USB_PORT_FEAT_C_CONNECTION:
+ *                     DEBUG_ROOT_HUB(" C_CONNECTION");
+ *                     // Clears the driver's internal connect status change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_RESET:
+ *                     DEBUG_ROOT_HUB(" C_RESET");
+ *                     // Clears the driver's internal Port Reset Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_ENABLE:
+ *                     DEBUG_ROOT_HUB(" C_ENABLE");
+ *                     // Clears the driver's internal Port Enable/Disable Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 case USB_PORT_FEAT_C_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" C_SUSPEND");
+ *                     // Clears the driver's internal Port Suspend Change flag,
+ *                     // which is set when resume signaling on the host port is
+ *                     // complete
+ *                     break;
+ *                 case USB_PORT_FEAT_C_OVER_CURRENT:
+ *                     DEBUG_ROOT_HUB(" C_OVER_CURRENT");
+ *                     // Clears the driver's overcurrent Change flag
+ *                     cvmx_usb_set_status(usb, cvmx_usb_get_status(usb));
+ *                     break;
+ *                 default:
+ *                     DEBUG_ROOT_HUB(" UNKNOWN\n");
+ *                     return -EINVAL;
+ *             }
+ *             DEBUG_ROOT_HUB("\n");
+ *             break;
+ *         case GetHubDescriptor:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetHubDescriptor\n");
+ *             desc = (struct usb_hub_descriptor *)buf;
+ *             desc->bDescLength = 9;
+ *             desc->bDescriptorType = 0x29;
+ *             desc->bNbrPorts = 1;
+ *             desc->wHubCharacteristics = 0x08;
+ *             desc->bPwrOn2PwrGood = 1;
+ *             desc->bHubContrCurrent = 0;
+ *             desc->bitmap[0] = 0;
+ *             desc->bitmap[1] = 0xff;
+ *             break;
+ *         case GetHubStatus:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetHubStatus\n");
+ *             *(__le32 *)buf = 0;
+ *             break;
+ *         case GetPortStatus:
+ *             DEBUG_ROOT_HUB("OcteonUSB: GetPortStatus");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             usb_port_status = cvmx_usb_get_status(usb);
+ *             port_status = 0;
+ *
+ *             if (usb_port_status.connect_change)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_C_CONNECTION);
+ *                 DEBUG_ROOT_HUB(" C_CONNECTION");
+ *             }
+ *
+ *             if (usb_port_status.port_enabled)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_C_ENABLE);
+ *                 DEBUG_ROOT_HUB(" C_ENABLE");
+ *             }
+ *
+ *             if (usb_port_status.connected)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_CONNECTION);
+ *                 DEBUG_ROOT_HUB(" CONNECTION");
+ *             }
+ *
+ *             if (usb_port_status.port_enabled)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_ENABLE);
+ *                 DEBUG_ROOT_HUB(" ENABLE");
+ *             }
+ *
+ *             if (usb_port_status.port_over_current)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_OVER_CURRENT);
+ *                 DEBUG_ROOT_HUB(" OVER_CURRENT");
+ *             }
+ *
+ *             if (usb_port_status.port_powered)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_POWER);
+ *                 DEBUG_ROOT_HUB(" POWER");
+ *             }
+ *
+ *             if (usb_port_status.port_speed == CVMX_USB_SPEED_HIGH)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_HIGHSPEED);
+ *                 DEBUG_ROOT_HUB(" HIGHSPEED");
+ *             }
+ *             else if (usb_port_status.port_speed == CVMX_USB_SPEED_LOW)
+ *             {
+ *                 port_status |= (1 << USB_PORT_FEAT_LOWSPEED);
+ *                 DEBUG_ROOT_HUB(" LOWSPEED");
+ *             }
+ *
+ *             *((__le32 *)buf) = cpu_to_le32(port_status);
+ *             DEBUG_ROOT_HUB("\n");
+ *             break;
+ *         case SetHubFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: SetHubFeature\n");
+ *             // No HUB features supported
+ *             break;
+ *         case SetPortFeature:
+ *             DEBUG_ROOT_HUB("OcteonUSB: SetPortFeature");
+ *             if (wIndex != 1)
+ *             {
+ *                 DEBUG_ROOT_HUB(" INVALID\n");
+ *                 return -EINVAL;
+ *             }
+ *
+ *             switch (wValue)
+ *             {
+ *                 case USB_PORT_FEAT_SUSPEND:
+ *                     DEBUG_ROOT_HUB(" SUSPEND\n");
+ *                     return -EINVAL;
+ *                 case USB_PORT_FEAT_POWER:
+ *                     DEBUG_ROOT_HUB(" POWER\n");
+ *                     return -EINVAL;
+ *                 case USB_PORT_FEAT_RESET:
+ *                     DEBUG_ROOT_HUB(" RESET\n");
+ *                     local_irq_save(flags);
+ *                     cvmx_usb_disable(usb);
+ *                     if (cvmx_usb_enable(usb))
+ *                         DEBUG_ERROR("Failed to enable the port\n");
+ *                     local_irq_restore(flags);
+ *                     return 0;
+ *                 case USB_PORT_FEAT_INDICATOR:
+ *                     DEBUG_ROOT_HUB(" INDICATOR\n");
+ *                     // Not supported
+ *                     break;
+ *                 default:
+ *                     DEBUG_ROOT_HUB(" UNKNOWN\n");
+ *                     return -EINVAL;
+ *             }
+ *             break;
+ *         default:
+ *             DEBUG_ROOT_HUB("OcteonUSB: Unknown root hub request\n");
+ *             return -EINVAL;
+ *     }
+ *     return 0;
+ * }
+ * @endcode
+ *
+ * <h2>Interrupts</h2>
+ *
+ * If you plan on using usb interrupts, cvmx_usb_poll() must be
+ * called on every usb interrupt. It will read the usb state,
+ * call any needed callbacks, and schedule transactions as
+ * needed. Your device driver needs only to hookup an interrupt
+ * handler and call cvmx_usb_poll(). Octeon's usb port 0 causes
+ * CIU bit CIU_INT*_SUM0[USB] to be set (bit 56). For port 1,
+ * CIU bit CIU_INT_SUM1[USB1] is set (bit 17). How these bits
+ * are turned into interrupt numbers is operating system
+ * specific. For Linux, there are the convenient defines
+ * OCTEON_IRQ_USB0 and OCTEON_IRQ_USB1 for the IRQ numbers.
+ *
+ * If you aren't using interrupts, simply call cvmx_usb_poll()
+ * in your main processing loop.
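+ *
+ * As a minimal sketch (assuming Linux, port 0, and an
+ * illustrative handler name; dev_id is used here to carry the
+ * cvmx_usb_state_t pointer):
+ *
+ * @code
+ * static irqreturn_t octeon_usb_irq(int irq, void *dev_id)
+ * {
+ *     cvmx_usb_state_t *usb = dev_id;
+ *
+ *     // Read the USB state, run callbacks, schedule transactions
+ *     cvmx_usb_poll(usb);
+ *     return IRQ_HANDLED;
+ * }
+ *
+ * // At setup time:
+ * // request_irq(OCTEON_IRQ_USB0, octeon_usb_irq, 0, "OcteonUSB", &usb_state);
+ * @endcode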
+ *
+ * <hr>$Revision: 32636 $<hr>
+ */
+
+#ifndef __CVMX_USB_H__
+#define __CVMX_USB_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Enumerations representing the status of function calls.
+ */
+typedef enum
+{
+    CVMX_USB_SUCCESS = 0,           /**< There were no errors */
+    CVMX_USB_INVALID_PARAM = -1,    /**< A parameter to the function was invalid */
+    CVMX_USB_NO_MEMORY = -2,        /**< Insufficient resources were available for the request */
+    CVMX_USB_BUSY = -3,             /**< The resource is busy and cannot service the request */
+    CVMX_USB_TIMEOUT = -4,          /**< Waiting for an action timed out */
+    CVMX_USB_INCORRECT_MODE = -5,   /**< The function call doesn't work in the current USB
+                                         mode. This happens when host only functions are
+                                         called in device mode or vice versa */
+} cvmx_usb_status_t;
+
+/**
+ * Enumerations representing the possible USB device speeds
+ */
+typedef enum
+{
+    CVMX_USB_SPEED_HIGH = 0,        /**< Device is operating at 480Mbps */
+    CVMX_USB_SPEED_FULL = 1,        /**< Device is operating at 12Mbps */
+    CVMX_USB_SPEED_LOW = 2,         /**< Device is operating at 1.5Mbps */
+} cvmx_usb_speed_t;
+
+/**
+ * Enumeration representing the possible USB transfer types.
+ */
+typedef enum
+{
+    CVMX_USB_TRANSFER_CONTROL = 0,      /**< USB transfer type control for hub and status transfers */
+    CVMX_USB_TRANSFER_ISOCHRONOUS = 1,  /**< USB transfer type isochronous for low priority periodic transfers */
+    CVMX_USB_TRANSFER_BULK = 2,         /**< USB transfer type bulk for large low priority transfers */
+    CVMX_USB_TRANSFER_INTERRUPT = 3,    /**< USB transfer type interrupt for high priority periodic transfers */
+} cvmx_usb_transfer_t;
+
+/**
+ * Enumeration of the transfer directions
+ */
+typedef enum
+{
+    CVMX_USB_DIRECTION_OUT,         /**< Data is transferring from Octeon to the device/host */
+    CVMX_USB_DIRECTION_IN,          /**< Data is transferring from the device/host to Octeon */
+} cvmx_usb_direction_t;
+
+/**
+ * Enumeration of all possible status codes passed to callback
+ * functions.
+ */
+typedef enum
+{
+    CVMX_USB_COMPLETE_SUCCESS,      /**< The transaction / operation finished without any errors */
+    CVMX_USB_COMPLETE_SHORT,        /**< FIXME: This is currently not implemented */
+    CVMX_USB_COMPLETE_CANCEL,       /**< The transaction was canceled while in flight by a user call to cvmx_usb_cancel* */
+    CVMX_USB_COMPLETE_ERROR,        /**< The transaction aborted with an unexpected error status */
+    CVMX_USB_COMPLETE_STALL,        /**< The transaction received a USB STALL response from the device */
+    CVMX_USB_COMPLETE_XACTERR,      /**< The transaction failed with an error from the device even after a number of retries */
+    CVMX_USB_COMPLETE_DATATGLERR,   /**< The transaction failed with a data toggle error even after a number of retries */
+    CVMX_USB_COMPLETE_BABBLEERR,    /**< The transaction failed with a babble error */
+    CVMX_USB_COMPLETE_FRAMEERR,     /**< The transaction failed with a frame error even after a number of retries */
+} cvmx_usb_complete_t;
+
+/**
+ * Structure returned containing the USB port status information.
+ */
+typedef struct
+{
+    uint32_t reserved           : 25;
+    uint32_t port_enabled       : 1; /**< 1 = Usb port is enabled, 0 = disabled */
+    uint32_t port_over_current  : 1; /**< 1 = Over current detected, 0 = Over current not detected. Octeon doesn't support over current detection */
+    uint32_t port_powered       : 1; /**< 1 = Port power is being supplied to the device, 0 = power is off. Octeon doesn't support turning port power off */
+    cvmx_usb_speed_t port_speed : 2; /**< Current port speed */
+    uint32_t connected          : 1; /**< 1 = A device is connected to the port, 0 = No device is connected */
+    uint32_t connect_change     : 1; /**< 1 = Device connected state changed since the last set status call */
+} cvmx_usb_port_status_t;
+
+/**
+ * This is the structure of a Control packet header
+ */
+typedef union
+{
+    uint64_t u64;
+    struct
+    {
+        uint64_t request_type   : 8;  /**< Bit 7 tells the direction: 1=IN, 0=OUT */
+        uint64_t request        : 8;  /**< The standard usb request to make */
+        uint64_t value          : 16; /**< Value parameter for the request in little endian format */
+        uint64_t index          : 16; /**< Index for the request in little endian format */
+        uint64_t length         : 16; /**< Length of the data associated with this request in little endian format */
+    } s;
+} cvmx_usb_control_header_t;
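+
+/*
+ * Illustrative sketch (not part of the API): filling the header
+ * for a standard GET_DESCRIPTOR (device descriptor) request. The
+ * request constants come from the USB 2.0 specification; since
+ * value/index/length are stored little endian, a byte swap such
+ * as Linux's cpu_to_le16() is assumed on Octeon's big-endian
+ * cores.
+ *
+ * @code
+ * cvmx_usb_control_header_t header;
+ *
+ * header.u64 = 0;
+ * header.s.request_type = 0x80;         // IN, standard, to device
+ * header.s.request = 6;                 // GET_DESCRIPTOR
+ * header.s.value = cpu_to_le16(0x0100); // type=device, index=0
+ * header.s.index = cpu_to_le16(0);
+ * header.s.length = cpu_to_le16(18);    // device descriptor size
+ * @endcode
+ */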
+
+/**
+ * Descriptor for Isochronous packets
+ */
+typedef struct
+{
+    int offset;                     /**< This is the offset in bytes into the main buffer where this data is stored */
+    int length;                     /**< This is the length in bytes of the data */
+    cvmx_usb_complete_t status;     /**< This is the status of this individual packet transfer */
+} cvmx_usb_iso_packet_t;
+
+/**
+ * Possible callback reasons for the USB API.
+ */
+typedef enum
+{
+    CVMX_USB_CALLBACK_TRANSFER_COMPLETE,
+                                    /**< A callback of this type is called when a submitted transfer
+                                        completes. The completion callback will be called even if the
+                                        transfer fails or is canceled. The status parameter will
+ *                                        contain details of why the callback was called. */
+    CVMX_USB_CALLBACK_PORT_CHANGED, /**< The status of the port changed. For example, someone may have
+                                        plugged a device in. The status parameter contains
+                                        CVMX_USB_COMPLETE_SUCCESS. Use cvmx_usb_get_status() to get
+                                        the new port status. */
+    __CVMX_USB_CALLBACK_END         /**< Do not use. Used internally for array bounds */
+} cvmx_usb_callback_t;
+
+/**
+ * USB state internal data. The contents of this structure
+ * may change in future SDKs. No data in it should be referenced
+ * by users of this API.
+ */
+typedef struct
+{
+    char data[65536];
+} cvmx_usb_state_t;
+
+/**
+ * USB callback functions are always of the following type.
+ * The parameters are as follows:
+ *      - state = USB device state populated by
+ *        cvmx_usb_initialize().
+ *      - reason = The cvmx_usb_callback_t used to register
+ *        the callback.
+ *      - status = The cvmx_usb_complete_t representing the
+ *        status code of a transaction.
+ *      - pipe_handle = The Pipe that caused this callback, or
+ *        -1 if this callback wasn't associated with a pipe.
+ *      - submit_handle = Transfer submit handle causing this
+ *        callback, or -1 if this callback wasn't associated
+ *        with a transfer.
+ *      - bytes_transferred = Actual number of bytes transferred.
+ *      - user_data = The user pointer supplied to the
+ *        function cvmx_usb_submit() or
+ *        cvmx_usb_register_callback() */
+typedef void (*cvmx_usb_callback_func_t)(cvmx_usb_state_t *state,
+                                         cvmx_usb_callback_t reason,
+                                         cvmx_usb_complete_t status,
+                                         int pipe_handle, int submit_handle,
+                                         int bytes_transferred, void *user_data);
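+
+/*
+ * A minimal callback sketch (function name illustrative): report
+ * failures and wake up a waiter passed through user_data. Any
+ * synchronization primitive works; a Linux struct completion is
+ * assumed here.
+ *
+ * @code
+ * static void my_transfer_done(cvmx_usb_state_t *state,
+ *                              cvmx_usb_callback_t reason,
+ *                              cvmx_usb_complete_t status,
+ *                              int pipe_handle, int submit_handle,
+ *                              int bytes_transferred, void *user_data)
+ * {
+ *     if (status != CVMX_USB_COMPLETE_SUCCESS)
+ *         printk("transfer %d failed, status %d\n",
+ *                submit_handle, status);
+ *     complete((struct completion *)user_data);
+ * }
+ * @endcode
+ */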
+
+/**
+ * Flags to pass the initialization function.
+ */
+typedef enum
+{
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_XI = 1<<0,       /**< The USB port uses a 12MHz crystal as clock source
+                                                            at USB_XO and USB_XI. */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_XO_GND = 1<<1,      /**< The USB port uses 12/24/48MHz 2.5V board clock
+                                                            source at USB_XO. USB_XI should be tied to GND.*/
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_AUTO = 0,           /**< Automatically determine the clock type using the
+                                                             helper function in cvmx-helper-board.c. */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_MHZ_MASK  = 3<<3,       /**< Mask for clock speed field */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_12MHZ = 1<<3,       /**< Speed of reference clock or crystal */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_24MHZ = 2<<3,       /**< Speed of reference clock */
+    CVMX_USB_INITIALIZE_FLAGS_CLOCK_48MHZ = 3<<3,       /**< Speed of reference clock */
+    /* Bits 3-4 used to encode the clock frequency */
+    CVMX_USB_INITIALIZE_FLAGS_NO_DMA = 1<<5,            /**< Disable DMA and use polled I/O for USB data transfers */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS = 1<<16,  /**< Enable extra console output for debugging USB transfers */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLBACKS = 1<<17,  /**< Enable extra console output for debugging USB callbacks */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO = 1<<18,       /**< Enable extra console output for USB informational data */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CALLS = 1<<19,      /**< Enable extra console output for every function call */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_CSRS = 1<<20,       /**< Enable extra console output for every CSR access */
+    CVMX_USB_INITIALIZE_FLAGS_DEBUG_ALL = ((CVMX_USB_INITIALIZE_FLAGS_DEBUG_CSRS<<1)-1) - (CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS-1),
+} cvmx_usb_initialize_flags_t;
+
+/**
+ * Flags to pass when a pipe is created. Currently no flags are
+ * required.
+ */
+typedef enum
+{
+    CVMX_USB_PIPE_FLAGS_DEBUG_TRANSFERS = 1<<15,/**< Used to display CVMX_USB_INITIALIZE_FLAGS_DEBUG_TRANSFERS for a specific pipe only */
+    __CVMX_USB_PIPE_FLAGS_OPEN = 1<<16,         /**< Used internally to determine if a pipe is open. Do not use */
+    __CVMX_USB_PIPE_FLAGS_SCHEDULED = 1<<17,    /**< Used internally to determine if a pipe is actively using hardware. Do not use */
+    __CVMX_USB_PIPE_FLAGS_NEED_PING = 1<<18,    /**< Used internally to determine if a high speed pipe is in the ping state. Do not use */
+} cvmx_usb_pipe_flags_t;
+
+/**
+ * Return the number of USB ports supported by this Octeon
+ * chip. If the chip doesn't support USB, or is not supported
+ * by this API, a zero will be returned. Most Octeon chips
+ * support one usb port, but some support two. Each port must be
+ * initialized with cvmx_usb_initialize() using its own
+ * independent cvmx_usb_state_t structure.
+ *
+ * @return Number of ports; zero if USB isn't supported
+ */
+extern int cvmx_usb_get_num_ports(void);
+
+/**
+ * Initialize a USB port for use. This must be called before any
+ * other access to the Octeon USB port is made. The port starts
+ * off in the disabled state.
+ *
+ * @param state  Pointer to an empty cvmx_usb_state_t structure
+ *               that will be populated by the initialize call.
+ *               This structure is then passed to all other USB
+ *               functions.
+ * @param usb_port_number
+ *               Which Octeon USB port to initialize.
+ * @param flags  Flags to control hardware initialization. See
+ *               cvmx_usb_initialize_flags_t for the flag
+ *               definitions. Some flags are mandatory.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_initialize(cvmx_usb_state_t *state,
+                                             int usb_port_number,
+                                             cvmx_usb_initialize_flags_t flags);
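+
+/*
+ * Initialization sketch, assuming port 0 and automatic clock
+ * detection; pick the explicit CLOCK_* flags instead if your
+ * board's clock source is known:
+ *
+ * @code
+ * static cvmx_usb_state_t usb_state;
+ *
+ * if (cvmx_usb_initialize(&usb_state, 0,
+ *                         CVMX_USB_INITIALIZE_FLAGS_CLOCK_AUTO) !=
+ *     CVMX_USB_SUCCESS)
+ *     return -1;
+ * if (cvmx_usb_enable(&usb_state) != CVMX_USB_SUCCESS)
+ *     return -1;
+ * @endcode
+ */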
+
+/**
+ * Shutdown a USB port after a call to cvmx_usb_initialize().
+ * The port should be disabled with all pipes closed when this
+ * function is called.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_shutdown(cvmx_usb_state_t *state);
+
+/**
+ * Enable a USB port. After this call succeeds, the USB port is
+ * online and servicing requests.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_enable(cvmx_usb_state_t *state);
+
+/**
+ * Disable a USB port. After this call the USB port will not
+ * generate data transfers and will not generate events.
+ * Transactions in progress will fail and call their
+ * associated callbacks.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_disable(cvmx_usb_state_t *state);
+
+/**
+ * Get the current state of the USB port. Use this call to
+ * determine if the usb port has anything connected, is enabled,
+ * or has some sort of error condition. The return value of this
+ * call has "changed" bits to signal of the value of some fields
+ * have changed between calls. These "changed" fields are based
+ * on the last call to cvmx_usb_set_status(). In order to clear
+ * them, you must update the status through cvmx_usb_set_status().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return Port status information
+ */
+extern cvmx_usb_port_status_t cvmx_usb_get_status(cvmx_usb_state_t *state);
+
+/**
+ * Set the current state of the USB port. The status is used as
+ * a reference for the "changed" bits returned by
+ * cvmx_usb_get_status(). Other than serving as a reference, the
+ * status passed to this function is not used. No fields can be
+ * changed through this call.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param port_status
+ *               Port status to set, most likely returned by cvmx_usb_get_status()
+ */
+extern void cvmx_usb_set_status(cvmx_usb_state_t *state, cvmx_usb_port_status_t port_status);
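+
+/*
+ * Sketch of the get/set pairing described above: writing the
+ * status straight back clears the "changed" bits for the next
+ * read.
+ *
+ * @code
+ * cvmx_usb_port_status_t status = cvmx_usb_get_status(&usb_state);
+ *
+ * if (status.connect_change) {
+ *     // handle connect/disconnect here, then acknowledge it
+ *     cvmx_usb_set_status(&usb_state, status);
+ * }
+ * @endcode
+ */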
+
+/**
+ * Open a virtual pipe between the host and a USB device. A pipe
+ * must be opened before data can be transferred between a device
+ * and Octeon.
+ *
+ * @param state      USB device state populated by
+ *                   cvmx_usb_initialize().
+ * @param flags      Optional pipe flags defined in
+ *                   cvmx_usb_pipe_flags_t.
+ * @param device_addr
+ *                   USB device address to open the pipe to
+ *                   (0-127).
+ * @param endpoint_num
+ *                   USB endpoint number to open the pipe to
+ *                   (0-15).
+ * @param device_speed
+ *                   The speed of the device the pipe is going
+ *                   to. This must match the device's speed,
+ *                   which may be different than the port speed.
+ * @param max_packet The maximum packet length the device can
+ *                   transmit/receive (low speed=0-8, full
+ *                   speed=0-1023, high speed=0-1024). This value
+ *                   comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <10:0>.
+ * @param transfer_type
+ *                   The type of transfer this pipe is for.
+ * @param transfer_dir
+ *                   The direction the pipe is in. This is not
+ *                   used for control pipes.
+ * @param interval   For ISOCHRONOUS and INTERRUPT transfers,
+ *                   this is how often the transfer is scheduled
+ *                   for. All other transfers should specify
+ *                   zero. The units are in frames (8000/sec at
+ *                   high speed, 1000/sec for full speed).
+ * @param multi_count
+ *                   For high speed devices, this is the maximum
+ *                   allowed number of packets per microframe.
+ *                   Specify zero for non high speed devices. This
+ *                   value comes from the standard endpoint descriptor
+ *                   field wMaxPacketSize bits <12:11>.
+ * @param hub_device_addr
+ *                   Hub device address this device is connected
+ *                   to. Devices connected directly to Octeon
+ *                   use zero. This is only used when the device
+ *                   is full/low speed behind a high speed hub.
+ *                   The address will be of the high speed hub,
+ *                   not any full speed hubs after it.
+ * @param hub_port   Which port on the hub the device is
+ *                   connected to. Use zero for devices connected
+ *                   directly to Octeon. Like hub_device_addr,
+ *                   this is only used for full/low speed
+ *                   devices behind a high speed hub.
+ *
+ * @return A non-negative value is a pipe handle. Negative
+ *         values are failure codes from cvmx_usb_status_t.
+ */
+extern int cvmx_usb_open_pipe(cvmx_usb_state_t *state,
+                              cvmx_usb_pipe_flags_t flags,
+                              int device_addr, int endpoint_num,
+                              cvmx_usb_speed_t device_speed, int max_packet,
+                              cvmx_usb_transfer_t transfer_type,
+                              cvmx_usb_direction_t transfer_dir, int interval,
+                              int multi_count, int hub_device_addr,
+                              int hub_port);
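+
+/*
+ * Illustrative sketch: opening a bulk IN pipe to a high speed
+ * device at address 2, endpoint 1, with a 512 byte max packet;
+ * the address/endpoint values are examples only:
+ *
+ * @code
+ * int pipe = cvmx_usb_open_pipe(&usb_state,
+ *                               (cvmx_usb_pipe_flags_t)0,
+ *                               2, 1,
+ *                               CVMX_USB_SPEED_HIGH, 512,
+ *                               CVMX_USB_TRANSFER_BULK,
+ *                               CVMX_USB_DIRECTION_IN,
+ *                               0,     // interval: unused for bulk
+ *                               0,     // multi_count: wMaxPacketSize <12:11>
+ *                               0, 0); // attached directly to Octeon
+ * if (pipe < 0)
+ *     return pipe; // a cvmx_usb_status_t failure code
+ * @endcode
+ */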
+
+/**
+ * Call to submit a USB Bulk transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_bulk(cvmx_usb_state_t *state, int pipe_handle,
+                                uint64_t buffer, int buffer_length,
+                                cvmx_usb_callback_func_t callback,
+                                void *user_data);
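+
+/*
+ * Submission sketch using the callback outlined earlier, with a
+ * struct completion (done) as user_data. The buffer argument is
+ * a physical address, so convert from a virtual pointer first
+ * (cvmx_ptr_to_phys() in the executive, or an equivalent the OS
+ * provides):
+ *
+ * @code
+ * int submit = cvmx_usb_submit_bulk(&usb_state, pipe,
+ *                                   cvmx_ptr_to_phys(data_buf),
+ *                                   sizeof(data_buf),
+ *                                   my_transfer_done, &done);
+ * if (submit < 0)
+ *     return submit;
+ * @endcode
+ */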
+
+/**
+ * Call to submit a USB Interrupt transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_interrupt(cvmx_usb_state_t *state, int pipe_handle,
+                                     uint64_t buffer, int buffer_length,
+                                     cvmx_usb_callback_func_t callback,
+                                     void *user_data);
+
+/**
+ * Call to submit a USB Control transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param control_header
+ *                  USB 8 byte control header physical address.
+ *                  Note that this is NOT A POINTER, but the
+ *                  full 64bit physical address of the buffer.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_control(cvmx_usb_state_t *state, int pipe_handle,
+                                   uint64_t control_header,
+                                   uint64_t buffer, int buffer_length,
+                                   cvmx_usb_callback_func_t callback,
+                                   void *user_data);
+
+/**
+ * Flags to pass the cvmx_usb_submit_isochronous() function.
+ */
+typedef enum
+{
+    CVMX_USB_ISOCHRONOUS_FLAGS_ALLOW_SHORT = 1<<0,  /**< Do not return an error if a transfer is less than the maximum packet size of the device */
+    CVMX_USB_ISOCHRONOUS_FLAGS_ASAP = 1<<1,         /**< Schedule the transaction as soon as possible */
+} cvmx_usb_isochronous_flags_t;
+
+/**
+ * Call to submit a USB Isochronous transfer to a pipe.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param pipe_handle
+ *                  Handle to the pipe for the transfer.
+ * @param start_frame
+ *                  Number of frames into the future to schedule
+ *                  this transaction.
+ * @param flags     Flags to control the transfer. See
+ *                  cvmx_usb_isochronous_flags_t for the flag
+ *                  definitions.
+ * @param number_packets
+ *                  Number of sequential packets to transfer.
+ *                  "packets" is a pointer to an array of this
+ *                  many packet structures.
+ * @param packets   Description of each transfer packet as
+ *                  defined by cvmx_usb_iso_packet_t. The array
+ *                  pointed to here must stay valid until the
+ *                  complete callback is called.
+ * @param buffer    Physical address of the data buffer in
+ *                  memory. Note that this is NOT A POINTER, but
+ *                  the full 64bit physical address of the
+ *                  buffer. This may be zero if buffer_length is
+ *                  zero.
+ * @param buffer_length
+ *                  Length of buffer in bytes.
+ * @param callback  Function to call when this transaction
+ *                  completes. If the return value of this
+ *                  function isn't an error, then this function
+ *                  is guaranteed to be called when the
+ *                  transaction completes. If this parameter is
+ *                  NULL, then the generic callback registered
+ *                  through cvmx_usb_register_callback is
+ *                  called. If both are NULL, then there is no
+ *                  way to know when a transaction completes.
+ * @param user_data User supplied data returned when the
+ *                  callback is called. This is only used if
+ *                  callback is not NULL.
+ *
+ * @return A submitted transaction handle or negative on
+ *         failure. Negative values are failure codes from
+ *         cvmx_usb_status_t.
+ */
+extern int cvmx_usb_submit_isochronous(cvmx_usb_state_t *state, int pipe_handle,
+                                       int start_frame, int flags,
+                                       int number_packets,
+                                       cvmx_usb_iso_packet_t packets[],
+                                       uint64_t buffer, int buffer_length,
+                                       cvmx_usb_callback_func_t callback,
+                                       void *user_data);
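+
+/*
+ * Sketch of a two-packet isochronous submission. The packets
+ * array describes where each packet lives inside the single
+ * physical buffer and must remain valid until the completion
+ * callback runs; iso_pipe and iso_buf are illustrative names:
+ *
+ * @code
+ * static cvmx_usb_iso_packet_t packets[2] = {
+ *     { .offset = 0,    .length = 1024 },
+ *     { .offset = 1024, .length = 1024 },
+ * };
+ *
+ * int submit = cvmx_usb_submit_isochronous(&usb_state, iso_pipe,
+ *                  0, CVMX_USB_ISOCHRONOUS_FLAGS_ASAP,
+ *                  2, packets,
+ *                  cvmx_ptr_to_phys(iso_buf), 2048,
+ *                  my_transfer_done, NULL);
+ * @endcode
+ */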
+
+/**
+ * Cancel one outstanding request in a pipe. Canceling a request
+ * can fail if the transaction has already completed before cancel
+ * is called. Even after a successful cancel call, it may take
+ * a frame or two for the cvmx_usb_poll() function to call the
+ * associated callback.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ * @param submit_handle
+ *               Handle to transaction to cancel, returned by the submit function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_cancel(cvmx_usb_state_t *state,
+                                         int pipe_handle, int submit_handle);
+
+
+/**
+ * Cancel all outstanding requests in a pipe. Logically all this
+ * does is call cvmx_usb_cancel() in a loop.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to cancel requests in.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_cancel_all(cvmx_usb_state_t *state,
+                                             int pipe_handle);
+
+/**
+ * Close a pipe created with cvmx_usb_open_pipe().
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ * @param pipe_handle
+ *               Pipe handle to close.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t. CVMX_USB_BUSY is returned if the
+ *         pipe has outstanding transfers.
+ */
+extern cvmx_usb_status_t cvmx_usb_close_pipe(cvmx_usb_state_t *state,
+                                             int pipe_handle);
+
+/**
+ * Register a function to be called when various USB events occur.
+ *
+ * @param state     USB device state populated by
+ *                  cvmx_usb_initialize().
+ * @param reason    Which event to register for.
+ * @param callback  Function to call when the event occurs.
+ * @param user_data User data parameter to the function.
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_register_callback(cvmx_usb_state_t *state,
+                                                    cvmx_usb_callback_t reason,
+                                                    cvmx_usb_callback_func_t callback,
+                                                    void *user_data);
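+
+/*
+ * Sketch: registering a port-change handler so hotplug events
+ * surface through cvmx_usb_poll(); my_port_changed is an
+ * illustrative cvmx_usb_callback_func_t:
+ *
+ * @code
+ * cvmx_usb_register_callback(&usb_state,
+ *                            CVMX_USB_CALLBACK_PORT_CHANGED,
+ *                            my_port_changed, NULL);
+ * @endcode
+ */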
+
+/**
+ * Get the current USB protocol level frame number. The frame
+ * number is always in the range of 0-0x7ff.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return USB frame number
+ */
+extern int cvmx_usb_get_frame_number(cvmx_usb_state_t *state);
+
+/**
+ * Poll the USB block for status and call all needed callback
+ * handlers. This function is meant to be called in the interrupt
+ * handler for the USB controller. It can also be called
+ * periodically in a loop for non-interrupt based operation.
+ *
+ * @param state  USB device state populated by
+ *               cvmx_usb_initialize().
+ *
+ * @return CVMX_USB_SUCCESS or a negative error code defined in
+ *         cvmx_usb_status_t.
+ */
+extern cvmx_usb_status_t cvmx_usb_poll(cvmx_usb_state_t *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __CVMX_USB_H__ */
diff --git a/drivers/staging/octeon-usb/cvmx-usbcx-defs.h b/drivers/staging/octeon-usb/cvmx-usbcx-defs.h
new file mode 100644 (file)
index 0000000..394e846
--- /dev/null
@@ -0,0 +1,1551 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export
+ * control laws, including the U.S. Export Administration Act and its associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION
+ * OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * cvmx-usbcx-defs.h
+ *
+ * Configuration and status register (CSR) type definitions for
+ * Octeon usbcx.
+ *
+ */
+#ifndef __CVMX_USBCX_TYPEDEFS_H__
+#define __CVMX_USBCX_TYPEDEFS_H__
+
+#define CVMX_USBCXBASE 0x00016F0010000000ull
+#define CVMX_USBCXREG1(reg, bid) \
+       (CVMX_ADD_IO_SEG(CVMX_USBCXBASE | reg) + \
+        ((bid) & 1) * 0x100000000000ull)
+#define CVMX_USBCXREG2(reg, bid, off) \
+       (CVMX_ADD_IO_SEG(CVMX_USBCXBASE | reg) + \
+        (((off) & 7) + ((bid) & 1) * 0x8000000000ull) * 32)
+
+#define CVMX_USBCX_GAHBCFG(bid)                CVMX_USBCXREG1(0x008, bid)
+#define CVMX_USBCX_GHWCFG3(bid)                CVMX_USBCXREG1(0x04c, bid)
+#define CVMX_USBCX_GINTMSK(bid)                CVMX_USBCXREG1(0x018, bid)
+#define CVMX_USBCX_GINTSTS(bid)                CVMX_USBCXREG1(0x014, bid)
+#define CVMX_USBCX_GNPTXFSIZ(bid)      CVMX_USBCXREG1(0x028, bid)
+#define CVMX_USBCX_GNPTXSTS(bid)       CVMX_USBCXREG1(0x02c, bid)
+#define CVMX_USBCX_GOTGCTL(bid)                CVMX_USBCXREG1(0x000, bid)
+#define CVMX_USBCX_GRSTCTL(bid)                CVMX_USBCXREG1(0x010, bid)
+#define CVMX_USBCX_GRXFSIZ(bid)                CVMX_USBCXREG1(0x024, bid)
+#define CVMX_USBCX_GRXSTSPH(bid)       CVMX_USBCXREG1(0x020, bid)
+#define CVMX_USBCX_GUSBCFG(bid)                CVMX_USBCXREG1(0x00c, bid)
+#define CVMX_USBCX_HAINT(bid)          CVMX_USBCXREG1(0x414, bid)
+#define CVMX_USBCX_HAINTMSK(bid)       CVMX_USBCXREG1(0x418, bid)
+#define CVMX_USBCX_HCCHARX(off, bid)   CVMX_USBCXREG2(0x500, bid, off)
+#define CVMX_USBCX_HCFG(bid)           CVMX_USBCXREG1(0x400, bid)
+#define CVMX_USBCX_HCINTMSKX(off, bid) CVMX_USBCXREG2(0x50c, bid, off)
+#define CVMX_USBCX_HCINTX(off, bid)    CVMX_USBCXREG2(0x508, bid, off)
+#define CVMX_USBCX_HCSPLTX(off, bid)   CVMX_USBCXREG2(0x504, bid, off)
+#define CVMX_USBCX_HCTSIZX(off, bid)   CVMX_USBCXREG2(0x510, bid, off)
+#define CVMX_USBCX_HFIR(bid)           CVMX_USBCXREG1(0x404, bid)
+#define CVMX_USBCX_HFNUM(bid)          CVMX_USBCXREG1(0x408, bid)
+#define CVMX_USBCX_HPRT(bid)           CVMX_USBCXREG1(0x440, bid)
+#define CVMX_USBCX_HPTXFSIZ(bid)       CVMX_USBCXREG1(0x100, bid)
+#define CVMX_USBCX_HPTXSTS(bid)                CVMX_USBCXREG1(0x410, bid)
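+
+/*
+ * These macros only compute register addresses. A sketch of an
+ * access pairs them with whatever 32-bit CSR accessor the
+ * environment provides; usbc_read32() below is a hypothetical
+ * stand-in for that accessor, and the gintsts union is defined
+ * later in this file:
+ *
+ * @code
+ * union cvmx_usbcx_gintsts gintsts;
+ *
+ * gintsts.u32 = usbc_read32(CVMX_USBCX_GINTSTS(0));
+ * if (gintsts.s.prtint)
+ *     ; // a host port event is pending, read HPRT
+ * @endcode
+ */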
+
+/**
+ * cvmx_usbc#_gahbcfg
+ *
+ * Core AHB Configuration Register (GAHBCFG)
+ *
+ * This register can be used to configure the core after power-on or a change in
+ * mode of operation. This register mainly contains AHB system-related
+ * configuration parameters. The AHB is the processor interface to the O2P USB
+ * core. In general, software need not know about this interface except to
+ * program the values as specified.
+ *
+ * The application must program this register as part of the O2P USB core
+ * initialization. Do not change this register after the initial programming.
+ */
+union cvmx_usbcx_gahbcfg {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gahbcfg_s
+        * @ptxfemplvl: Periodic TxFIFO Empty Level (PTxFEmpLvl)
+        *      Software should set this bit to 0x1.
+        *      Indicates when the Periodic TxFIFO Empty Interrupt bit in the
+        *      Core Interrupt register (GINTSTS.PTxFEmp) is triggered. This
+        *      bit is used only in Slave mode.
+        *      * 1'b0: GINTSTS.PTxFEmp interrupt indicates that the Periodic
+        *      TxFIFO is half empty
+        *      * 1'b1: GINTSTS.PTxFEmp interrupt indicates that the Periodic
+        *      TxFIFO is completely empty
+        * @nptxfemplvl: Non-Periodic TxFIFO Empty Level (NPTxFEmpLvl)
+        *      Software should set this bit to 0x1.
+        *      Indicates when the Non-Periodic TxFIFO Empty Interrupt bit in
+        *      the Core Interrupt register (GINTSTS.NPTxFEmp) is triggered.
+        *      This bit is used only in Slave mode.
+        *      * 1'b0: GINTSTS.NPTxFEmp interrupt indicates that the Non-
+        *      Periodic TxFIFO is half empty
+        *      * 1'b1: GINTSTS.NPTxFEmp interrupt indicates that the Non-
+        *      Periodic TxFIFO is completely empty
+        * @dmaen: DMA Enable (DMAEn)
+        *      * 1'b0: Core operates in Slave mode
+        *      * 1'b1: Core operates in a DMA mode
+        * @hbstlen: Burst Length/Type (HBstLen)
+        *      This field has no effect and should be left as 0x0.
+        * @glblintrmsk: Global Interrupt Mask (GlblIntrMsk)
+        *      Software should set this field to 0x1.
+        *      The application uses this bit to mask or unmask the interrupt
+        *      line assertion to itself. Irrespective of this bit's setting,
+        *      the interrupt status registers are updated by the core.
+        *      * 1'b0: Mask the interrupt assertion to the application.
+        *      * 1'b1: Unmask the interrupt assertion to the application.
+        */
+       struct cvmx_usbcx_gahbcfg_s {
+               uint32_t reserved_9_31  : 23;
+               uint32_t ptxfemplvl     : 1;
+               uint32_t nptxfemplvl    : 1;
+               uint32_t reserved_6_6   : 1;
+               uint32_t dmaen          : 1;
+               uint32_t hbstlen        : 4;
+               uint32_t glblintrmsk    : 1;
+       } s;
+};
+typedef union cvmx_usbcx_gahbcfg cvmx_usbcx_gahbcfg_t;
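+
+/*
+ * Sketch of programming GAHBCFG as the field descriptions above
+ * recommend (DMA mode, both TxFIFO empty levels set to 0x1,
+ * interrupt line unmasked); usbc_write32() is again a
+ * hypothetical 32-bit CSR write helper:
+ *
+ * @code
+ * cvmx_usbcx_gahbcfg_t gahbcfg;
+ *
+ * gahbcfg.u32 = 0;
+ * gahbcfg.s.dmaen = 1;        // core operates in DMA mode
+ * gahbcfg.s.ptxfemplvl = 1;   // per the guidance above
+ * gahbcfg.s.nptxfemplvl = 1;
+ * gahbcfg.s.glblintrmsk = 1;  // unmask interrupt assertion
+ * usbc_write32(CVMX_USBCX_GAHBCFG(0), gahbcfg.u32);
+ * @endcode
+ */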
+
+/**
+ * cvmx_usbc#_ghwcfg3
+ *
+ * User HW Config3 Register (GHWCFG3)
+ *
+ * This register contains the configuration options of the O2P USB core.
+ */
+union cvmx_usbcx_ghwcfg3 {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_ghwcfg3_s
+        * @dfifodepth: DFIFO Depth (DfifoDepth)
+        *      This value is in terms of 32-bit words.
+        *      * Minimum value is 32
+        *      * Maximum value is 32768
+        * @ahbphysync: AHB and PHY Synchronous (AhbPhySync)
+        *      Indicates whether AHB and PHY clocks are synchronous to
+        *      each other.
+        *      * 1'b0: No
+        *      * 1'b1: Yes
+        *      This bit is tied to 1.
+        * @rsttype: Reset Style for Clocked always Blocks in RTL (RstType)
+        *      * 1'b0: Asynchronous reset is used in the core
+        *      * 1'b1: Synchronous reset is used in the core
+        * @optfeature: Optional Features Removed (OptFeature)
+        *      Indicates whether the User ID register, GPIO interface ports,
+        *      and SOF toggle and counter ports were removed for gate count
+        *      optimization.
+        * @vendor_control_interface_support: Vendor Control Interface Support
+        *      * 1'b0: Vendor Control Interface is not available on the core.
+        *      * 1'b1: Vendor Control Interface is available.
+        * @i2c_selection: I2C Selection
+        *      * 1'b0: I2C Interface is not available on the core.
+        *      * 1'b1: I2C Interface is available on the core.
+        * @otgen: OTG Function Enabled (OtgEn)
+        *      The application uses this bit to indicate the O2P USB core's
+        *      OTG capabilities.
+        *      * 1'b0: Not OTG capable
+        *      * 1'b1: OTG Capable
+        * @pktsizewidth: Width of Packet Size Counters (PktSizeWidth)
+        *      * 3'b000: 4 bits
+        *      * 3'b001: 5 bits
+        *      * 3'b010: 6 bits
+        *      * 3'b011: 7 bits
+        *      * 3'b100: 8 bits
+        *      * 3'b101: 9 bits
+        *      * 3'b110: 10 bits
+        *      * Others: Reserved
+        * @xfersizewidth: Width of Transfer Size Counters (XferSizeWidth)
+        *      * 4'b0000: 11 bits
+        *      * 4'b0001: 12 bits
+        *      - ...
+        *      * 4'b1000: 19 bits
+        *      * Others: Reserved
+        */
+       struct cvmx_usbcx_ghwcfg3_s {
+               uint32_t dfifodepth                             : 16;
+               uint32_t reserved_13_15                         : 3;
+               uint32_t ahbphysync                             : 1;
+               uint32_t rsttype                                : 1;
+               uint32_t optfeature                             : 1;
+               uint32_t vendor_control_interface_support       : 1;
+               uint32_t i2c_selection                          : 1;
+               uint32_t otgen                                  : 1;
+               uint32_t pktsizewidth                           : 3;
+               uint32_t xfersizewidth                          : 4;
+       } s;
+};
+typedef union cvmx_usbcx_ghwcfg3 cvmx_usbcx_ghwcfg3_t;
+
+/**
+ * cvmx_usbc#_gintmsk
+ *
+ * Core Interrupt Mask Register (GINTMSK)
+ *
+ * This register works with the Core Interrupt register to interrupt the
+ * application. When an interrupt bit is masked, the interrupt associated with
+ * that bit will not be generated. However, the Core Interrupt (GINTSTS)
+ * register bit corresponding to that interrupt will still be set.
+ * Mask interrupt: 1'b0, Unmask interrupt: 1'b1
+ */
+union cvmx_usbcx_gintmsk {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gintmsk_s
+        * @wkupintmsk: Resume/Remote Wakeup Detected Interrupt Mask
+        *      (WkUpIntMsk)
+        * @sessreqintmsk: Session Request/New Session Detected Interrupt Mask
+        *      (SessReqIntMsk)
+        * @disconnintmsk: Disconnect Detected Interrupt Mask (DisconnIntMsk)
+        * @conidstschngmsk: Connector ID Status Change Mask (ConIDStsChngMsk)
+        * @ptxfempmsk: Periodic TxFIFO Empty Mask (PTxFEmpMsk)
+        * @hchintmsk: Host Channels Interrupt Mask (HChIntMsk)
+        * @prtintmsk: Host Port Interrupt Mask (PrtIntMsk)
+        * @fetsuspmsk: Data Fetch Suspended Mask (FetSuspMsk)
+        * @incomplpmsk: Incomplete Periodic Transfer Mask (incomplPMsk)
+        *      Incomplete Isochronous OUT Transfer Mask
+        *      (incompISOOUTMsk)
+        * @incompisoinmsk: Incomplete Isochronous IN Transfer Mask
+        *                  (incompISOINMsk)
+        * @oepintmsk: OUT Endpoints Interrupt Mask (OEPIntMsk)
+        * @inepintmsk: IN Endpoints Interrupt Mask (INEPIntMsk)
+        * @epmismsk: Endpoint Mismatch Interrupt Mask (EPMisMsk)
+        * @eopfmsk: End of Periodic Frame Interrupt Mask (EOPFMsk)
+        * @isooutdropmsk: Isochronous OUT Packet Dropped Interrupt Mask
+        *      (ISOOutDropMsk)
+        * @enumdonemsk: Enumeration Done Mask (EnumDoneMsk)
+        * @usbrstmsk: USB Reset Mask (USBRstMsk)
+        * @usbsuspmsk: USB Suspend Mask (USBSuspMsk)
+        * @erlysuspmsk: Early Suspend Mask (ErlySuspMsk)
+        * @i2cint: I2C Interrupt Mask (I2CINT)
+        * @ulpickintmsk: ULPI Carkit Interrupt Mask (ULPICKINTMsk)
+        *      I2C Carkit Interrupt Mask (I2CCKINTMsk)
+        * @goutnakeffmsk: Global OUT NAK Effective Mask (GOUTNakEffMsk)
+        * @ginnakeffmsk: Global Non-Periodic IN NAK Effective Mask
+        *                (GINNakEffMsk)
+        * @nptxfempmsk: Non-Periodic TxFIFO Empty Mask (NPTxFEmpMsk)
+        * @rxflvlmsk: Receive FIFO Non-Empty Mask (RxFLvlMsk)
+        * @sofmsk: Start of (micro)Frame Mask (SofMsk)
+        * @otgintmsk: OTG Interrupt Mask (OTGIntMsk)
+        * @modemismsk: Mode Mismatch Interrupt Mask (ModeMisMsk)
+        */
+       struct cvmx_usbcx_gintmsk_s {
+               uint32_t wkupintmsk             : 1;
+               uint32_t sessreqintmsk          : 1;
+               uint32_t disconnintmsk          : 1;
+               uint32_t conidstschngmsk        : 1;
+               uint32_t reserved_27_27         : 1;
+               uint32_t ptxfempmsk             : 1;
+               uint32_t hchintmsk              : 1;
+               uint32_t prtintmsk              : 1;
+               uint32_t reserved_23_23         : 1;
+               uint32_t fetsuspmsk             : 1;
+               uint32_t incomplpmsk            : 1;
+               uint32_t incompisoinmsk         : 1;
+               uint32_t oepintmsk              : 1;
+               uint32_t inepintmsk             : 1;
+               uint32_t epmismsk               : 1;
+               uint32_t reserved_16_16         : 1;
+               uint32_t eopfmsk                : 1;
+               uint32_t isooutdropmsk          : 1;
+               uint32_t enumdonemsk            : 1;
+               uint32_t usbrstmsk              : 1;
+               uint32_t usbsuspmsk             : 1;
+               uint32_t erlysuspmsk            : 1;
+               uint32_t i2cint                 : 1;
+               uint32_t ulpickintmsk           : 1;
+               uint32_t goutnakeffmsk          : 1;
+               uint32_t ginnakeffmsk           : 1;
+               uint32_t nptxfempmsk            : 1;
+               uint32_t rxflvlmsk              : 1;
+               uint32_t sofmsk                 : 1;
+               uint32_t otgintmsk              : 1;
+               uint32_t modemismsk             : 1;
+               uint32_t reserved_0_0           : 1;
+       } s;
+};
+typedef union cvmx_usbcx_gintmsk cvmx_usbcx_gintmsk_t;
+
+/**
+ * cvmx_usbc#_gintsts
+ *
+ * Core Interrupt Register (GINTSTS)
+ *
+ * This register interrupts the application for system-level events in the
+ * current mode of operation (Device mode or Host mode). It is shown in
+ * Interrupt. Some of the bits in this register are valid only in Host mode,
+ * while others are valid in Device mode only. This register also indicates the
+ * current mode of operation. In order to clear the interrupt status bits of
+ * type R_SS_WC, the application must write 1'b1 into the bit. The FIFO status
+ * interrupts are read only; once software reads from or writes to the FIFO
+ * while servicing these interrupts, FIFO interrupt conditions are cleared
+ * automatically.
+ */
+union cvmx_usbcx_gintsts {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gintsts_s
+        * @wkupint: Resume/Remote Wakeup Detected Interrupt (WkUpInt)
+        *      In Device mode, this interrupt is asserted when a resume is
+        *      detected on the USB. In Host mode, this interrupt is asserted
+        *      when a remote wakeup is detected on the USB.
+        *      For more information on how to use this interrupt, see "Partial
+        *      Power-Down and Clock Gating Programming Model" on
+        *      page 353.
+        * @sessreqint: Session Request/New Session Detected Interrupt
+        *              (SessReqInt)
+        *      In Host mode, this interrupt is asserted when a session request
+        *      is detected from the device. In Device mode, this interrupt is
+        *      asserted when the utmiotg_bvalid signal goes high.
+        *      For more information on how to use this interrupt, see "Partial
+        *      Power-Down and Clock Gating Programming Model" on
+        *      page 353.
+        * @disconnint: Disconnect Detected Interrupt (DisconnInt)
+        *      Asserted when a device disconnect is detected.
+        * @conidstschng: Connector ID Status Change (ConIDStsChng)
+        *      The core sets this bit when there is a change in connector ID
+        *      status.
+        * @ptxfemp: Periodic TxFIFO Empty (PTxFEmp)
+        *      Asserted when the Periodic Transmit FIFO is either half or
+        *      completely empty and there is space for at least one entry to be
+        *      written in the Periodic Request Queue. The half or completely
+        *      empty status is determined by the Periodic TxFIFO Empty Level
+        *      bit in the Core AHB Configuration register
+        *      (GAHBCFG.PTxFEmpLvl).
+        * @hchint: Host Channels Interrupt (HChInt)
+        *      The core sets this bit to indicate that an interrupt is pending
+        *      on one of the channels of the core (in Host mode). The
+        *      application must read the Host All Channels Interrupt (HAINT)
+        *      register to determine the exact number of the channel on which
+        *      the interrupt occurred, and then read the corresponding Host
+        *      Channel-n Interrupt (HCINTn) register to determine the exact
+        *      cause of the interrupt. The application must clear the
+        *      appropriate status bit in the HCINTn register to clear this bit.
+        * @prtint: Host Port Interrupt (PrtInt)
+        *      The core sets this bit to indicate a change in port status of
+        *      one of the O2P USB core ports in Host mode. The application must
+        *      read the Host Port Control and Status (HPRT) register to
+        *      determine the exact event that caused this interrupt. The
+        *      application must clear the appropriate status bit in the Host
+        *      Port Control and Status register to clear this bit.
+        * @fetsusp: Data Fetch Suspended (FetSusp)
+        *      This interrupt is valid only in DMA mode. This interrupt
+        *      indicates that the core has stopped fetching data for IN
+        *      endpoints due to the unavailability of TxFIFO space or Request
+        *      Queue space. This interrupt is used by the application for an
+        *      endpoint mismatch algorithm.
+        * @incomplp: Incomplete Periodic Transfer (incomplP)
+        *      In Host mode, the core sets this interrupt bit when there are
+        *      incomplete periodic transactions still pending which are
+        *      scheduled for the current microframe.
+        *      Incomplete Isochronous OUT Transfer (incompISOOUT)
+        *      In Device mode, the core sets this interrupt to indicate that
+        *      there is at least one isochronous OUT endpoint on which the
+        *      transfer is not completed in the current microframe. This
+        *      interrupt is asserted along with the End of Periodic Frame
+        *      Interrupt (EOPF) bit in this register.
+        * @incompisoin: Incomplete Isochronous IN Transfer (incompISOIN)
+        *      The core sets this interrupt to indicate that there is at least
+        *      one isochronous IN endpoint on which the transfer is not
+        *      completed in the current microframe. This interrupt is asserted
+        *      along with the End of Periodic Frame Interrupt (EOPF) bit in
+        *      this register.
+        * @oepint: OUT Endpoints Interrupt (OEPInt)
+        *      The core sets this bit to indicate that an interrupt is pending
+        *      on one of the OUT endpoints of the core (in Device mode). The
+        *      application must read the Device All Endpoints Interrupt
+        *      (DAINT) register to determine the exact number of the OUT
+        *      endpoint on which the interrupt occurred, and then read the
+        *      corresponding Device OUT Endpoint-n Interrupt (DOEPINTn)
+        *      register to determine the exact cause of the interrupt. The
+        *      application must clear the appropriate status bit in the
+        *      corresponding DOEPINTn register to clear this bit.
+        * @iepint: IN Endpoints Interrupt (IEPInt)
+        *      The core sets this bit to indicate that an interrupt is pending
+        *      on one of the IN endpoints of the core (in Device mode). The
+        *      application must read the Device All Endpoints Interrupt
+        *      (DAINT) register to determine the exact number of the IN
+        *      endpoint on which the interrupt occurred, and then read the
+        *      corresponding Device IN Endpoint-n Interrupt (DIEPINTn)
+        *      register to determine the exact cause of the interrupt. The
+        *      application must clear the appropriate status bit in the
+        *      corresponding DIEPINTn register to clear this bit.
+        * @epmis: Endpoint Mismatch Interrupt (EPMis)
+        *      Indicates that an IN token has been received for a non-periodic
+        *      endpoint, but the data for another endpoint is present in the
+        *      top of the Non-Periodic Transmit FIFO and the IN endpoint
+        *      mismatch count programmed by the application has expired.
+        * @eopf: End of Periodic Frame Interrupt (EOPF)
+        *      Indicates that the period specified in the Periodic Frame
+        *      Interval field of the Device Configuration register
+        *      (DCFG.PerFrInt) has been reached in the current microframe.
+        * @isooutdrop: Isochronous OUT Packet Dropped Interrupt (ISOOutDrop)
+        *      The core sets this bit when it fails to write an isochronous OUT
+        *      packet into the RxFIFO because the RxFIFO doesn't have
+        *      enough space to accommodate a maximum packet size packet
+        *      for the isochronous OUT endpoint.
+        * @enumdone: Enumeration Done (EnumDone)
+        *      The core sets this bit to indicate that speed enumeration is
+        *      complete. The application must read the Device Status (DSTS)
+        *      register to obtain the enumerated speed.
+        * @usbrst: USB Reset (USBRst)
+        *      The core sets this bit to indicate that a reset is detected on
+        *      the USB.
+        * @usbsusp: USB Suspend (USBSusp)
+        *      The core sets this bit to indicate that a suspend was detected
+        *      on the USB. The core enters the Suspended state when there
+        *      is no activity on the phy_line_state_i signal for an extended
+        *      period of time.
+        * @erlysusp: Early Suspend (ErlySusp)
+        *      The core sets this bit to indicate that an Idle state has been
+        *      detected on the USB for 3 ms.
+        * @i2cint: I2C Interrupt (I2CINT)
+        *      This bit is always 0x0.
+        * @ulpickint: ULPI Carkit Interrupt (ULPICKINT)
+        *      This bit is always 0x0.
+        * @goutnakeff: Global OUT NAK Effective (GOUTNakEff)
+        *      Indicates that the Set Global OUT NAK bit in the Device Control
+        *      register (DCTL.SGOUTNak), set by the application, has taken
+        *      effect in the core. This bit can be cleared by writing the Clear
+        *      Global OUT NAK bit in the Device Control register
+        *      (DCTL.CGOUTNak).
+        * @ginnakeff: Global IN Non-Periodic NAK Effective (GINNakEff)
+        *      Indicates that the Set Global Non-Periodic IN NAK bit in the
+        *      Device Control register (DCTL.SGNPInNak), set by the
+        *      application, has taken effect in the core. That is, the core has
+        *      sampled the Global IN NAK bit set by the application. This bit
+        *      can be cleared by clearing the Clear Global Non-Periodic IN
+        *      NAK bit in the Device Control register (DCTL.CGNPInNak).
+        *      This interrupt does not necessarily mean that a NAK handshake
+        *      is sent out on the USB. The STALL bit takes precedence over
+        *      the NAK bit.
+        * @nptxfemp: Non-Periodic TxFIFO Empty (NPTxFEmp)
+        *      This interrupt is asserted when the Non-Periodic TxFIFO is
+        *      either half or completely empty, and there is space for at least
+        *      one entry to be written to the Non-Periodic Transmit Request
+        *      Queue. The half or completely empty status is determined by
+        *      the Non-Periodic TxFIFO Empty Level bit in the Core AHB
+        *      Configuration register (GAHBCFG.NPTxFEmpLvl).
+        * @rxflvl: RxFIFO Non-Empty (RxFLvl)
+        *      Indicates that there is at least one packet pending to be read
+        *      from the RxFIFO.
+        * @sof: Start of (micro)Frame (Sof)
+        *      In Host mode, the core sets this bit to indicate that an SOF
+        *      (FS), micro-SOF (HS), or Keep-Alive (LS) is transmitted on the
+        *      USB. The application must write a 1 to this bit to clear the
+        *      interrupt.
+        *      In Device mode, the core sets this bit to indicate that an
+        *      SOF token has been received on the USB. The application can read
+        *      the Device Status register to get the current (micro)frame
+        *      number. This interrupt is seen only when the core is operating
+        *      at either HS or FS.
+        * @otgint: OTG Interrupt (OTGInt)
+        *      The core sets this bit to indicate an OTG protocol event. The
+        *      application must read the OTG Interrupt Status (GOTGINT)
+        *      register to determine the exact event that caused this
+        *      interrupt. The application must clear the appropriate status bit
+        *      in the GOTGINT register to clear this bit.
+        * @modemis: Mode Mismatch Interrupt (ModeMis)
+        *      The core sets this bit when the application is trying to access:
+        *      * A Host mode register, when the core is operating in Device
+        *      mode
+        *      * A Device mode register, when the core is operating in Host
+        *      mode
+        *      The register access is completed on the AHB with an OKAY
+        *      response, but is ignored by the core internally and doesn't
+        *      affect the operation of the core.
+        * @curmod: Current Mode of Operation (CurMod)
+        *      Indicates the current mode of operation.
+        *      * 1'b0: Device mode
+        *      * 1'b1: Host mode
+        */
+       struct cvmx_usbcx_gintsts_s {
+               uint32_t wkupint        : 1;
+               uint32_t sessreqint     : 1;
+               uint32_t disconnint     : 1;
+               uint32_t conidstschng   : 1;
+               uint32_t reserved_27_27 : 1;
+               uint32_t ptxfemp        : 1;
+               uint32_t hchint         : 1;
+               uint32_t prtint         : 1;
+               uint32_t reserved_23_23 : 1;
+               uint32_t fetsusp        : 1;
+               uint32_t incomplp       : 1;
+               uint32_t incompisoin    : 1;
+               uint32_t oepint         : 1;
+               uint32_t iepint         : 1;
+               uint32_t epmis          : 1;
+               uint32_t reserved_16_16 : 1;
+               uint32_t eopf           : 1;
+               uint32_t isooutdrop     : 1;
+               uint32_t enumdone       : 1;
+               uint32_t usbrst         : 1;
+               uint32_t usbsusp        : 1;
+               uint32_t erlysusp       : 1;
+               uint32_t i2cint         : 1;
+               uint32_t ulpickint      : 1;
+               uint32_t goutnakeff     : 1;
+               uint32_t ginnakeff      : 1;
+               uint32_t nptxfemp       : 1;
+               uint32_t rxflvl         : 1;
+               uint32_t sof            : 1;
+               uint32_t otgint         : 1;
+               uint32_t modemis        : 1;
+               uint32_t curmod         : 1;
+       } s;
+};
+typedef union cvmx_usbcx_gintsts cvmx_usbcx_gintsts_t;
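+
+/*
+ * Illustrative sketch (not part of the original header): decoding a raw
+ * GINTSTS value through the union above. The caller is assumed to have
+ * read the register already; only the documented CurMod and PrtInt bits
+ * are consulted here.
+ */
+static inline int cvmx_usbcx_gintsts_is_host_port_irq(uint32_t gintsts)
+{
+       union cvmx_usbcx_gintsts sts;
+
+       sts.u32 = gintsts;
+       /* CurMod = 1'b1 is Host mode; PrtInt flags a host port event. */
+       return sts.s.curmod && sts.s.prtint;
+}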
+
+/**
+ * cvmx_usbc#_gnptxfsiz
+ *
+ * Non-Periodic Transmit FIFO Size Register (GNPTXFSIZ)
+ *
+ * The application can program the RAM size and the memory start address for the
+ * Non-Periodic TxFIFO.
+ */
+union cvmx_usbcx_gnptxfsiz {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gnptxfsiz_s
+        * @nptxfdep: Non-Periodic TxFIFO Depth (NPTxFDep)
+        *      This value is in terms of 32-bit words.
+        *      * Minimum value is 16
+        *      * Maximum value is 32768
+        * @nptxfstaddr: Non-Periodic Transmit RAM Start Address (NPTxFStAddr)
+        *      This field contains the memory start address for Non-Periodic
+        *      Transmit FIFO RAM.
+        */
+       struct cvmx_usbcx_gnptxfsiz_s {
+               uint32_t nptxfdep       : 16;
+               uint32_t nptxfstaddr    : 16;
+       } s;
+};
+typedef union cvmx_usbcx_gnptxfsiz cvmx_usbcx_gnptxfsiz_t;
+
+/**
+ * cvmx_usbc#_gnptxsts
+ *
+ * Non-Periodic Transmit FIFO/Queue Status Register (GNPTXSTS)
+ *
+ * This read-only register contains the free space information for the
+ * Non-Periodic TxFIFO and the Non-Periodic Transmit Request Queue.
+ */
+union cvmx_usbcx_gnptxsts {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gnptxsts_s
+        * @nptxqtop: Top of the Non-Periodic Transmit Request Queue (NPTxQTop)
+        *      Entry in the Non-Periodic Tx Request Queue that is currently
+        *      being processed by the MAC.
+        *      * Bits [30:27]: Channel/endpoint number
+        *      * Bits [26:25]:
+        *      - 2'b00: IN/OUT token
+        *      - 2'b01: Zero-length transmit packet (device IN/host OUT)
+        *      - 2'b10: PING/CSPLIT token
+        *      - 2'b11: Channel halt command
+        *      * Bit [24]: Terminate (last entry for selected channel/endpoint)
+        * @nptxqspcavail: Non-Periodic Transmit Request Queue Space Available
+        *      (NPTxQSpcAvail)
+        *      Indicates the amount of free space available in the Non-
+        *      Periodic Transmit Request Queue. This queue holds both IN
+        *      and OUT requests in Host mode. Device mode has only IN
+        *      requests.
+        *      * 8'h0: Non-Periodic Transmit Request Queue is full
+        *      * 8'h1: 1 location available
+        *      * 8'h2: 2 locations available
+        *      * n: n locations available (0..8)
+        *      * Others: Reserved
+        * @nptxfspcavail: Non-Periodic TxFIFO Space Avail (NPTxFSpcAvail)
+        *      Indicates the amount of free space available in the Non-
+        *      Periodic TxFIFO.
+        *      Values are in terms of 32-bit words.
+        *      * 16'h0: Non-Periodic TxFIFO is full
+        *      * 16'h1: 1 word available
+        *      * 16'h2: 2 words available
+        *      * 16'hn: n words available (where 0..32768)
+        *      * 16'h8000: 32768 words available
+        *      * Others: Reserved
+        */
+       struct cvmx_usbcx_gnptxsts_s {
+               uint32_t reserved_31_31 : 1;
+               uint32_t nptxqtop       : 7;
+               uint32_t nptxqspcavail  : 8;
+               uint32_t nptxfspcavail  : 16;
+       } s;
+};
+typedef union cvmx_usbcx_gnptxsts cvmx_usbcx_gnptxsts_t;
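+
+/*
+ * Illustrative sketch (not part of the original header): checking, from a
+ * raw GNPTXSTS value, whether a request for 'words' 32-bit words can be
+ * queued. Per the field notes above, both the request queue and the FIFO
+ * must have free space.
+ */
+static inline int cvmx_usbcx_gnptxsts_has_room(uint32_t gnptxsts,
+                                              uint32_t words)
+{
+       union cvmx_usbcx_gnptxsts sts;
+
+       sts.u32 = gnptxsts;
+       return sts.s.nptxqspcavail > 0 && sts.s.nptxfspcavail >= words;
+}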
+
+/**
+ * cvmx_usbc#_grstctl
+ *
+ * Core Reset Register (GRSTCTL)
+ *
+ * The application uses this register to reset various hardware features inside
+ * the core.
+ */
+union cvmx_usbcx_grstctl {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_grstctl_s
+        * @ahbidle: AHB Master Idle (AHBIdle)
+        *      Indicates that the AHB Master State Machine is in the IDLE
+        *      condition.
+        * @dmareq: DMA Request Signal (DMAReq)
+        *      Indicates that the DMA request is in progress. Used for debug.
+        * @txfnum: TxFIFO Number (TxFNum)
+        *      This is the FIFO number that must be flushed using the TxFIFO
+        *      Flush bit. This field must not be changed until the core clears
+        *      the TxFIFO Flush bit.
+        *      * 5'h0: Non-Periodic TxFIFO flush
+        *      * 5'h1: Periodic TxFIFO 1 flush in Device mode or Periodic
+        *      TxFIFO flush in Host mode
+        *      * 5'h2: Periodic TxFIFO 2 flush in Device mode
+        *      - ...
+        *      * 5'hF: Periodic TxFIFO 15 flush in Device mode
+        *      * 5'h10: Flush all the Periodic and Non-Periodic TxFIFOs in the
+        *      core
+        * @txfflsh: TxFIFO Flush (TxFFlsh)
+        *      This bit selectively flushes a single or all transmit FIFOs, but
+        *      cannot do so if the core is in the midst of a transaction.
+        *      The application must only write this bit after checking that the
+        *      core is neither writing to the TxFIFO nor reading from the
+        *      TxFIFO.
+        *      The application must wait until the core clears this bit before
+        *      performing any operations. This bit takes 8 clocks (of phy_clk
+        *      or hclk, whichever is slower) to clear.
+        * @rxfflsh: RxFIFO Flush (RxFFlsh)
+        *      The application can flush the entire RxFIFO using this bit, but
+        *      must first ensure that the core is not in the middle of a
+        *      transaction.
+        *      The application must only write to this bit after checking that
+        *      the core is neither reading from the RxFIFO nor writing to the
+        *      RxFIFO.
+        *      The application must wait until the bit is cleared before
+        *      performing any other operations. This bit will take 8 clocks
+        *      (slowest of PHY or AHB clock) to clear.
+        * @intknqflsh: IN Token Sequence Learning Queue Flush (INTknQFlsh)
+        *      The application writes this bit to flush the IN Token Sequence
+        *      Learning Queue.
+        * @frmcntrrst: Host Frame Counter Reset (FrmCntrRst)
+        *      The application writes this bit to reset the (micro)frame number
+        *      counter inside the core. When the (micro)frame counter is reset,
+        *      the subsequent SOF sent out by the core will have a
+        *      (micro)frame number of 0.
+        * @hsftrst: HClk Soft Reset (HSftRst)
+        *      The application uses this bit to flush the control logic in the
+        *      AHB Clock domain. Only AHB Clock Domain pipelines are reset.
+        *      * FIFOs are not flushed with this bit.
+        *      * All state machines in the AHB clock domain are reset to the
+        *      Idle state after terminating the transactions on the AHB,
+        *      following the protocol.
+        *      * CSR control bits used by the AHB clock domain state
+        *      machines are cleared.
+        *      * To clear this interrupt, status mask bits that control the
+        *      interrupt status and are generated by the AHB clock domain
+        *      state machine are cleared.
+        *      * Because interrupt status bits are not cleared, the application
+        *      can get the status of any core events that occurred after it set
+        *      this bit.
+        *      This is a self-clearing bit that the core clears after all
+        *      necessary logic is reset in the core. This may take several
+        *      clocks, depending on the core's current state.
+        * @csftrst: Core Soft Reset (CSftRst)
+        *      Resets the hclk and phy_clock domains as follows:
+        *      * Clears the interrupts and all the CSR registers except the
+        *      following register bits:
+        *      - PCGCCTL.RstPdwnModule
+        *      - PCGCCTL.GateHclk
+        *      - PCGCCTL.PwrClmp
+        *      - PCGCCTL.StopPclk
+        *      - GUSBCFG.PhyLPwrClkSel
+        *      - GUSBCFG.DDRSel
+        *      - GUSBCFG.PHYSel
+        *      - GUSBCFG.FSIntf
+        *      - GUSBCFG.ULPI_UTMI_Sel
+        *      - GUSBCFG.PHYIf
+        *      - HCFG.FSLSPclkSel
+        *      - DCFG.DevSpd
+        *      * All module state machines (except the AHB Slave Unit) are
+        *      reset to the IDLE state, and all the transmit FIFOs and the
+        *      receive FIFO are flushed.
+        *      * Any transactions on the AHB Master are terminated as soon
+        *      as possible, after gracefully completing the last data phase of
+        *      an AHB transfer. Any transactions on the USB are terminated
+        *      immediately.
+        *      The application can write to this bit any time it wants to reset
+        *      the core. This is a self-clearing bit and the core clears this
+        *      bit after all the necessary logic is reset in the core, which
+        *      may take several clocks, depending on the current state of the
+        *      core. Once this bit is cleared, software should wait at least 3
+        *      PHY clocks before doing any access to the PHY domain
+        *      (synchronization delay). Software should also check that
+        *      bit 31 of this register is 1 (AHB Master is IDLE) before
+        *      starting any operation.
+        *      Typically software reset is used during software development
+        *      and also when you dynamically change the PHY selection bits
+        *      in the USB configuration registers listed above. When you
+        *      change the PHY, the corresponding clock for the PHY is
+        *      selected and used in the PHY domain. Once a new clock is
+        *      selected, the PHY domain has to be reset for proper operation.
+        */
+       struct cvmx_usbcx_grstctl_s {
+               uint32_t ahbidle        : 1;
+               uint32_t dmareq         : 1;
+               uint32_t reserved_11_29 : 19;
+               uint32_t txfnum         : 5;
+               uint32_t txfflsh        : 1;
+               uint32_t rxfflsh        : 1;
+               uint32_t intknqflsh     : 1;
+               uint32_t frmcntrrst     : 1;
+               uint32_t hsftrst        : 1;
+               uint32_t csftrst        : 1;
+       } s;
+};
+typedef union cvmx_usbcx_grstctl cvmx_usbcx_grstctl_t;
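+
+/*
+ * Illustrative sketch (not part of the original header): the CSftRst
+ * sequence described above. usbc_read32()/usbc_write32() are hypothetical
+ * stand-ins for the driver's CSR accessors; the 3-PHY-clock
+ * synchronization delay after the bit clears is left to the caller.
+ */
+extern uint32_t usbc_read32(uint64_t addr);            /* hypothetical */
+extern void usbc_write32(uint64_t addr, uint32_t val); /* hypothetical */
+
+static inline void cvmx_usbcx_core_soft_reset(uint64_t grstctl_addr)
+{
+       union cvmx_usbcx_grstctl ctl;
+
+       ctl.u32 = usbc_read32(grstctl_addr);
+       ctl.s.csftrst = 1;
+       usbc_write32(grstctl_addr, ctl.u32);
+       do {                            /* self-clearing bit */
+               ctl.u32 = usbc_read32(grstctl_addr);
+       } while (ctl.s.csftrst);
+       do {                            /* bit 31: AHB Master is IDLE */
+               ctl.u32 = usbc_read32(grstctl_addr);
+       } while (!ctl.s.ahbidle);
+}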
+
+/**
+ * cvmx_usbc#_grxfsiz
+ *
+ * Receive FIFO Size Register (GRXFSIZ)
+ *
+ * The application can program the RAM size that must be allocated to the
+ * RxFIFO.
+ */
+union cvmx_usbcx_grxfsiz {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_grxfsiz_s
+        * @rxfdep: RxFIFO Depth (RxFDep)
+        *      This value is in terms of 32-bit words.
+        *      * Minimum value is 16
+        *      * Maximum value is 32768
+        */
+       struct cvmx_usbcx_grxfsiz_s {
+               uint32_t reserved_16_31 : 16;
+               uint32_t rxfdep         : 16;
+       } s;
+};
+typedef union cvmx_usbcx_grxfsiz cvmx_usbcx_grxfsiz_t;
+
+/**
+ * cvmx_usbc#_grxstsph
+ *
+ * Receive Status Read and Pop Register, Host Mode (GRXSTSPH)
+ *
+ * A read to the Receive Status Read and Pop register returns and additionally
+ * pops the top data entry out of the RxFIFO.
+ * This description is only valid when the core is in Host mode. For Device
+ * mode, use USBC_GRXSTSPD instead.
+ * NOTE: GRXSTSPH and GRXSTSPD are physically the same register and share the
+ *      same offset in the O2P USB core. The offset difference shown in this
+ *      document is for software clarity and is actually ignored by the
+ *       hardware.
+ */
+union cvmx_usbcx_grxstsph {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_grxstsph_s
+        * @pktsts: Packet Status (PktSts)
+        *      Indicates the status of the received packet
+        *      * 4'b0010: IN data packet received
+        *      * 4'b0011: IN transfer completed (triggers an interrupt)
+        *      * 4'b0101: Data toggle error (triggers an interrupt)
+        *      * 4'b0111: Channel halted (triggers an interrupt)
+        *      * Others: Reserved
+        * @dpid: Data PID (DPID)
+        *      * 2'b00: DATA0
+        *      * 2'b10: DATA1
+        *      * 2'b01: DATA2
+        *      * 2'b11: MDATA
+        * @bcnt: Byte Count (BCnt)
+        *      Indicates the byte count of the received IN data packet
+        * @chnum: Channel Number (ChNum)
+        *      Indicates the channel number to which the current received
+        *      packet belongs.
+        */
+       struct cvmx_usbcx_grxstsph_s {
+               uint32_t reserved_21_31 : 11;
+               uint32_t pktsts         : 4;
+               uint32_t dpid           : 2;
+               uint32_t bcnt           : 11;
+               uint32_t chnum          : 4;
+       } s;
+};
+typedef union cvmx_usbcx_grxstsph cvmx_usbcx_grxstsph_t;
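+
+/*
+ * Illustrative sketch (not part of the original header): a single read of
+ * GRXSTSPH both returns and pops the top RxFIFO status entry, so one read
+ * yields packet status, channel number, and byte count together. It reuses
+ * the hypothetical usbc_read32() accessor declared above.
+ */
+static inline uint32_t cvmx_usbcx_pop_rx_status(uint64_t grxstsph_addr,
+                                               uint32_t *chnum)
+{
+       union cvmx_usbcx_grxstsph rx;
+
+       rx.u32 = usbc_read32(grxstsph_addr);    /* read-and-pop */
+       *chnum = rx.s.chnum;
+       return rx.s.bcnt;       /* bytes in the received IN data packet */
+}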
+
+/**
+ * cvmx_usbc#_gusbcfg
+ *
+ * Core USB Configuration Register (GUSBCFG)
+ *
+ * This register can be used to configure the core after power-on or after a
+ * change to Host mode or Device mode. It contains USB and USB-PHY related
+ * configuration parameters. The application must program this register before
+ * starting any transactions on either the AHB or the USB. Do not make changes
+ * to this register after the initial programming.
+ */
+union cvmx_usbcx_gusbcfg {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_gusbcfg_s
+        * @otgi2csel: UTMIFS or I2C Interface Select (OtgI2CSel)
+        *      This bit is always 0x0.
+        * @phylpwrclksel: PHY Low-Power Clock Select (PhyLPwrClkSel)
+        *      Software should set this bit to 0x0.
+        *      Selects either 480-MHz or 48-MHz (low-power) PHY mode. In
+        *      FS and LS modes, the PHY can usually operate on a 48-MHz
+        *      clock to save power.
+        *      * 1'b0: 480-MHz Internal PLL clock
+        *      * 1'b1: 48-MHz External Clock
+        *      In 480 MHz mode, the UTMI interface operates at either 60 or
+        *      30 MHz, depending upon whether 8- or 16-bit data width is
+        *      selected. In 48-MHz mode, the UTMI interface operates at 48
+        *      MHz in FS mode and at either 48 or 6 MHz in LS mode
+        *      (depending on the PHY vendor).
+        *      This bit drives the utmi_fsls_low_power core output signal, and
+        *      is valid only for UTMI+ PHYs.
+        * @usbtrdtim: USB Turnaround Time (USBTrdTim)
+        *      Sets the turnaround time in PHY clocks.
+        *      Specifies the response time for a MAC request to the Packet
+        *      FIFO Controller (PFC) to fetch data from the DFIFO (SPRAM).
+        *      This must be programmed to 0x5.
+        * @hnpcap: HNP-Capable (HNPCap)
+        *      This bit is always 0x0.
+        * @srpcap: SRP-Capable (SRPCap)
+        *      This bit is always 0x0.
+        * @ddrsel: ULPI DDR Select (DDRSel)
+        *      Software should set this bit to 0x0.
+        * @physel: USB 2.0 High-Speed PHY or USB 1.1 Full-Speed Serial
+        *      Transceiver Select (PHYSel)
+        *      Software should set this bit to 0x0.
+        * @fsintf: Full-Speed Serial Interface Select (FSIntf)
+        *      Software should set this bit to 0x0.
+        * @ulpi_utmi_sel: ULPI or UTMI+ Select (ULPI_UTMI_Sel)
+        *      This bit is always 0x0.
+        * @phyif: PHY Interface (PHYIf)
+        *      This bit is always 0x1.
+        * @toutcal: HS/FS Timeout Calibration (TOutCal)
+        *      The number of PHY clocks that the application programs in this
+        *      field is added to the high-speed/full-speed interpacket timeout
+        *      duration in the core to account for any additional delays
+        *      introduced by the PHY. This may be required, since the delay
+        *      introduced by the PHY in generating the linestate condition may
+        *      vary from one PHY to another.
+        *      The USB standard timeout value for high-speed operation is
+        *      736 to 816 (inclusive) bit times. The USB standard timeout
+        *      value for full-speed operation is 16 to 18 (inclusive) bit
+        *      times. The application must program this field based on the
+        *      speed of enumeration. The number of bit times added per PHY
+        *      clock are:
+        *      High-speed operation:
+        *      * One 30-MHz PHY clock = 16 bit times
+        *      * One 60-MHz PHY clock = 8 bit times
+        *      Full-speed operation:
+        *      * One 30-MHz PHY clock = 0.4 bit times
+        *      * One 60-MHz PHY clock = 0.2 bit times
+        *      * One 48-MHz PHY clock = 0.25 bit times
+        */
+       struct cvmx_usbcx_gusbcfg_s {
+               uint32_t reserved_17_31 : 15;
+               uint32_t otgi2csel      : 1;
+               uint32_t phylpwrclksel  : 1;
+               uint32_t reserved_14_14 : 1;
+               uint32_t usbtrdtim      : 4;
+               uint32_t hnpcap         : 1;
+               uint32_t srpcap         : 1;
+               uint32_t ddrsel         : 1;
+               uint32_t physel         : 1;
+               uint32_t fsintf         : 1;
+               uint32_t ulpi_utmi_sel  : 1;
+               uint32_t phyif          : 1;
+               uint32_t toutcal        : 3;
+       } s;
+};
+typedef union cvmx_usbcx_gusbcfg cvmx_usbcx_gusbcfg_t;
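+
+/*
+ * Illustrative sketch (not part of the original header): composing the
+ * one-time GUSBCFG value per the field notes above — USBTrdTim must be
+ * 0x5 and the "should be 0x0" fields are left clear. TOutCal is left at
+ * zero here; a real driver would program it for the enumerated speed.
+ */
+static inline uint32_t cvmx_usbcx_gusbcfg_initial(void)
+{
+       union cvmx_usbcx_gusbcfg cfg;
+
+       cfg.u32 = 0;            /* phylpwrclksel, ddrsel, physel, ... = 0 */
+       cfg.s.usbtrdtim = 0x5;  /* required turnaround time */
+       return cfg.u32;
+}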
+
+/**
+ * cvmx_usbc#_haint
+ *
+ * Host All Channels Interrupt Register (HAINT)
+ *
+ * When a significant event occurs on a channel, the Host All Channels Interrupt
+ * register interrupts the application using the Host Channels Interrupt bit of
+ * the Core Interrupt register (GINTSTS.HChInt).
+ * There is one interrupt bit per channel, up to a maximum of 16 bits. Bits in
+ * this register are set and cleared when the application sets and clears bits
+ * in the corresponding Host Channel-n Interrupt register.
+ */
+union cvmx_usbcx_haint {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_haint_s
+        * @haint: Channel Interrupts (HAINT)
+        *      One bit per channel: Bit 0 for Channel 0, bit 15 for Channel 15
+        */
+       struct cvmx_usbcx_haint_s {
+               uint32_t reserved_16_31 : 16;
+               uint32_t haint          : 16;
+       } s;
+};
+typedef union cvmx_usbcx_haint cvmx_usbcx_haint_t;
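+
+/*
+ * Illustrative sketch (not part of the original header): scanning the
+ * per-channel bits of a raw HAINT value for the lowest channel whose
+ * HCINTn register needs servicing.
+ */
+static inline int cvmx_usbcx_haint_next_channel(uint32_t haint_val)
+{
+       union cvmx_usbcx_haint haint;
+       int ch;
+
+       haint.u32 = haint_val;
+       for (ch = 0; ch < 16; ch++)
+               if (haint.s.haint & (1u << ch))
+                       return ch;      /* bit n maps to channel n */
+       return -1;                      /* nothing pending */
+}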
+
+/**
+ * cvmx_usbc#_haintmsk
+ *
+ * Host All Channels Interrupt Mask Register (HAINTMSK)
+ *
+ * The Host All Channel Interrupt Mask register works with the Host All Channel
+ * Interrupt register to interrupt the application when an event occurs on a
+ * channel. There is one interrupt mask bit per channel, up to a maximum of 16
+ * bits.
+ * Mask interrupt: 1'b0 Unmask interrupt: 1'b1
+ */
+union cvmx_usbcx_haintmsk {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_haintmsk_s
+        * @haintmsk: Channel Interrupt Mask (HAINTMsk)
+        *      One bit per channel: Bit 0 for channel 0, bit 15 for channel 15
+        */
+       struct cvmx_usbcx_haintmsk_s {
+               uint32_t reserved_16_31 : 16;
+               uint32_t haintmsk       : 16;
+       } s;
+};
+typedef union cvmx_usbcx_haintmsk cvmx_usbcx_haintmsk_t;
+
+/**
+ * cvmx_usbc#_hcchar#
+ *
+ * Host Channel-n Characteristics Register (HCCHAR)
+ *
+ */
+union cvmx_usbcx_hccharx {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hccharx_s
+        * @chena: Channel Enable (ChEna)
+        *      This field is set by the application and cleared by the OTG
+        *      host.
+        *      * 1'b0: Channel disabled
+        *      * 1'b1: Channel enabled
+        * @chdis: Channel Disable (ChDis)
+        *      The application sets this bit to stop transmitting/receiving
+        *      data on a channel, even before the transfer for that channel is
+        *      complete. The application must wait for the Channel Disabled
+        *      interrupt before treating the channel as disabled.
+        * @oddfrm: Odd Frame (OddFrm)
+        *      This field is set (reset) by the application to indicate that
+        *      the OTG host must perform a transfer in an odd (micro)frame.
+        *      This field is applicable for only periodic (isochronous and
+        *      interrupt) transactions.
+        *      * 1'b0: Even (micro)frame
+        *      * 1'b1: Odd (micro)frame
+        * @devaddr: Device Address (DevAddr)
+        *      This field selects the specific device serving as the data
+        *      source or sink.
+        * @ec: Multi Count (MC) / Error Count (EC)
+        *      When the Split Enable bit of the Host Channel-n Split Control
+        *      register (HCSPLTn.SpltEna) is reset (1'b0), this field indicates
+        *      to the host the number of transactions that should be executed
+        *      per microframe for this endpoint.
+        *      * 2'b00: Reserved. This field yields undefined results.
+        *      * 2'b01: 1 transaction
+        *      * 2'b10: 2 transactions to be issued for this endpoint per
+        *      microframe
+        *      * 2'b11: 3 transactions to be issued for this endpoint per
+        *      microframe
+        *      When HCSPLTn.SpltEna is set (1'b1), this field indicates the
+        *      number of immediate retries to be performed for periodic split
+        *      transactions on transaction errors. This field must be set to at
+        *      least 2'b01.
+        * @eptype: Endpoint Type (EPType)
+        *      Indicates the transfer type selected.
+        *      * 2'b00: Control
+        *      * 2'b01: Isochronous
+        *      * 2'b10: Bulk
+        *      * 2'b11: Interrupt
+        * @lspddev: Low-Speed Device (LSpdDev)
+        *      This field is set by the application to indicate that this
+        *      channel is communicating to a low-speed device.
+        * @epdir: Endpoint Direction (EPDir)
+        *      Indicates whether the transaction is IN or OUT.
+        *      * 1'b0: OUT
+        *      * 1'b1: IN
+        * @epnum: Endpoint Number (EPNum)
+        *      Indicates the endpoint number on the device serving as the
+        *      data source or sink.
+        * @mps: Maximum Packet Size (MPS)
+        *      Indicates the maximum packet size of the associated endpoint.
+        */
+       struct cvmx_usbcx_hccharx_s {
+               uint32_t chena          : 1;
+               uint32_t chdis          : 1;
+               uint32_t oddfrm         : 1;
+               uint32_t devaddr        : 7;
+               uint32_t ec             : 2;
+               uint32_t eptype         : 2;
+               uint32_t lspddev        : 1;
+               uint32_t reserved_16_16 : 1;
+               uint32_t epdir          : 1;
+               uint32_t epnum          : 4;
+               uint32_t mps            : 11;
+       } s;
+};
+typedef union cvmx_usbcx_hccharx cvmx_usbcx_hccharx_t;
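+
+/*
+ * Illustrative sketch (not part of the original header): composing HCCHAR
+ * for a bulk IN transaction without split transactions. The parameter
+ * values are caller-supplied examples, not hardware requirements.
+ */
+static inline uint32_t cvmx_usbcx_hcchar_bulk_in(uint32_t dev_addr,
+                                                uint32_t ep_num,
+                                                uint32_t max_packet)
+{
+       union cvmx_usbcx_hccharx chr;
+
+       chr.u32 = 0;
+       chr.s.devaddr = dev_addr;
+       chr.s.epnum = ep_num;
+       chr.s.epdir = 1;        /* 1'b1: IN */
+       chr.s.eptype = 2;       /* 2'b10: Bulk */
+       chr.s.ec = 1;           /* 2'b01: 1 transaction */
+       chr.s.mps = max_packet;
+       chr.s.chena = 1;        /* hand the channel to the OTG host */
+       return chr.u32;
+}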
+
+/**
+ * cvmx_usbc#_hcfg
+ *
+ * Host Configuration Register (HCFG)
+ *
+ * This register configures the core after power-on. Do not make changes to this
+ * register after initializing the host.
+ */
+union cvmx_usbcx_hcfg {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hcfg_s
+        * @fslssupp: FS- and LS-Only Support (FSLSSupp)
+        *      The application uses this bit to control the core's enumeration
+        *      speed. Using this bit, the application can make the core
+        *      enumerate as a FS host, even if the connected device supports
+        *      HS traffic. Do not make changes to this field after initial
+        *      programming.
+        *      * 1'b0: HS/FS/LS, based on the maximum speed supported by
+        *      the connected device
+        *      * 1'b1: FS/LS-only, even if the connected device can support HS
+        * @fslspclksel: FS/LS PHY Clock Select (FSLSPclkSel)
+        *      When the core is in FS Host mode
+        *      * 2'b00: PHY clock is running at 30/60 MHz
+        *      * 2'b01: PHY clock is running at 48 MHz
+        *      * Others: Reserved
+        *      When the core is in LS Host mode
+        *      * 2'b00: PHY clock is running at 30/60 MHz. When the
+        *      UTMI+/ULPI PHY Low Power mode is not selected, use
+        *      30/60 MHz.
+        *      * 2'b01: PHY clock is running at 48 MHz. When the UTMI+
+        *      PHY Low Power mode is selected, use 48 MHz if the PHY
+        *      supplies a 48 MHz clock during LS mode.
+        *      * 2'b10: PHY clock is running at 6 MHz. In USB 1.1 FS mode,
+        *      use 6 MHz when the UTMI+ PHY Low Power mode is
+        *      selected and the PHY supplies a 6 MHz clock during LS
+        *      mode. If you select a 6 MHz clock during LS mode, you must
+        *      do a soft reset.
+        *      * 2'b11: Reserved
+        */
+       struct cvmx_usbcx_hcfg_s {
+               uint32_t reserved_3_31  : 29;
+               uint32_t fslssupp       : 1;
+               uint32_t fslspclksel    : 2;
+       } s;
+};
+typedef union cvmx_usbcx_hcfg cvmx_usbcx_hcfg_t;
+
+/**
+ * cvmx_usbc#_hcint#
+ *
+ * Host Channel-n Interrupt Register (HCINT)
+ *
+ * This register indicates the status of a channel with respect to USB- and
+ * AHB-related events. The application must read this register when the Host
+ * Channels Interrupt bit of the Core Interrupt register (GINTSTS.HChInt) is
+ * set. Before the application can read this register, it must first read
+ * the Host All Channels Interrupt (HAINT) register to get the exact channel
+ * number for the Host Channel-n Interrupt register. The application must clear
+ * the appropriate bit in this register to clear the corresponding bits in the
+ * HAINT and GINTSTS registers.
+ */
+union cvmx_usbcx_hcintx {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hcintx_s
+        * @datatglerr: Data Toggle Error (DataTglErr)
+        * @frmovrun: Frame Overrun (FrmOvrun)
+        * @bblerr: Babble Error (BblErr)
+        * @xacterr: Transaction Error (XactErr)
+        * @nyet: NYET Response Received Interrupt (NYET)
+        * @ack: ACK Response Received Interrupt (ACK)
+        * @nak: NAK Response Received Interrupt (NAK)
+        * @stall: STALL Response Received Interrupt (STALL)
+        * @ahberr: This bit is always 0x0.
+        * @chhltd: Channel Halted (ChHltd)
+        *      Indicates the transfer completed abnormally either because of
+        *      any USB transaction error or in response to a disable request by
+        *      the application.
+        * @xfercompl: Transfer Completed (XferCompl)
+        *      Transfer completed normally without any errors.
+        */
+       struct cvmx_usbcx_hcintx_s {
+               uint32_t reserved_11_31 : 21;
+               uint32_t datatglerr     : 1;
+               uint32_t frmovrun       : 1;
+               uint32_t bblerr         : 1;
+               uint32_t xacterr        : 1;
+               uint32_t nyet           : 1;
+               uint32_t ack            : 1;
+               uint32_t nak            : 1;
+               uint32_t stall          : 1;
+               uint32_t ahberr         : 1;
+               uint32_t chhltd         : 1;
+               uint32_t xfercompl      : 1;
+       } s;
+};
+typedef union cvmx_usbcx_hcintx cvmx_usbcx_hcintx_t;
+
+/**
+ * cvmx_usbc#_hcintmsk#
+ *
+ * Host Channel-n Interrupt Mask Register (HCINTMSKn)
+ *
+ * This register reflects the mask for each channel status described in the
+ * previous section.
+ * Mask interrupt: 1'b0 Unmask interrupt: 1'b1
+ */
+union cvmx_usbcx_hcintmskx {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hcintmskx_s
+        * @datatglerrmsk: Data Toggle Error Mask (DataTglErrMsk)
+        * @frmovrunmsk: Frame Overrun Mask (FrmOvrunMsk)
+        * @bblerrmsk: Babble Error Mask (BblErrMsk)
+        * @xacterrmsk: Transaction Error Mask (XactErrMsk)
+        * @nyetmsk: NYET Response Received Interrupt Mask (NyetMsk)
+        * @ackmsk: ACK Response Received Interrupt Mask (AckMsk)
+        * @nakmsk: NAK Response Received Interrupt Mask (NakMsk)
+        * @stallmsk: STALL Response Received Interrupt Mask (StallMsk)
+        * @ahberrmsk: AHB Error Mask (AHBErrMsk)
+        * @chhltdmsk: Channel Halted Mask (ChHltdMsk)
+        * @xfercomplmsk: Transfer Completed Mask (XferComplMsk)
+        */
+       struct cvmx_usbcx_hcintmskx_s {
+               uint32_t reserved_11_31 : 21;
+               uint32_t datatglerrmsk  : 1;
+               uint32_t frmovrunmsk    : 1;
+               uint32_t bblerrmsk      : 1;
+               uint32_t xacterrmsk     : 1;
+               uint32_t nyetmsk        : 1;
+               uint32_t ackmsk         : 1;
+               uint32_t nakmsk         : 1;
+               uint32_t stallmsk       : 1;
+               uint32_t ahberrmsk      : 1;
+               uint32_t chhltdmsk      : 1;
+               uint32_t xfercomplmsk   : 1;
+       } s;
+};
+typedef union cvmx_usbcx_hcintmskx cvmx_usbcx_hcintmskx_t;
+
+/**
+ * cvmx_usbc#_hcsplt#
+ *
+ * Host Channel-n Split Control Register (HCSPLT)
+ *
+ */
+union cvmx_usbcx_hcspltx {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hcspltx_s
+        * @spltena: Split Enable (SpltEna)
+        *      The application sets this field to indicate that this channel is
+        *      enabled to perform split transactions.
+        * @compsplt: Do Complete Split (CompSplt)
+        *      The application sets this field to request the OTG host to
+        *      perform a complete split transaction.
+        * @xactpos: Transaction Position (XactPos)
+        *      This field is used to determine whether to send all, first,
+        *      middle, or last payloads with each OUT transaction.
+        *      * 2'b11: All. This is the entire data payload of this
+        *      transaction (which is less than or equal to 188 bytes).
+        *      * 2'b10: Begin. This is the first data payload of this
+        *      transaction (which is larger than 188 bytes).
+        *      * 2'b00: Mid. This is the middle payload of this transaction
+        *      (which is larger than 188 bytes).
+        *      * 2'b01: End. This is the last payload of this transaction
+        *      (which is larger than 188 bytes).
+        * @hubaddr: Hub Address (HubAddr)
+        *      This field holds the device address of the transaction
+        *      translator's hub.
+        * @prtaddr: Port Address (PrtAddr)
+        *      This field is the port number of the recipient transaction
+        *      translator.
+        */
+       struct cvmx_usbcx_hcspltx_s {
+               uint32_t spltena        : 1;
+               uint32_t reserved_17_30 : 14;
+               uint32_t compsplt       : 1;
+               uint32_t xactpos        : 2;
+               uint32_t hubaddr        : 7;
+               uint32_t prtaddr        : 7;
+       } s;
+};
+typedef union cvmx_usbcx_hcspltx cvmx_usbcx_hcspltx_t;
+
+/**
+ * cvmx_usbc#_hctsiz#
+ *
+ * Host Channel-n Transfer Size Register (HCTSIZ)
+ *
+ */
+union cvmx_usbcx_hctsizx {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hctsizx_s
+        * @dopng: Do Ping (DoPng)
+        *      Setting this field to 1 directs the host to do PING protocol.
+        * @pid: PID (Pid)
+        *      The application programs this field with the type of PID to use
+        *      for the initial transaction. The host will maintain this field
+        *      for the rest of the transfer.
+        *      * 2'b00: DATA0
+        *      * 2'b01: DATA2
+        *      * 2'b10: DATA1
+        *      * 2'b11: MDATA (non-control)/SETUP (control)
+        * @pktcnt: Packet Count (PktCnt)
+        *      This field is programmed by the application with the expected
+        *      number of packets to be transmitted (OUT) or received (IN).
+        *      The host decrements this count on every successful
+        *      transmission or reception of an OUT/IN packet. Once this count
+        *      reaches zero, the application is interrupted to indicate normal
+        *      completion.
+        * @xfersize: Transfer Size (XferSize)
+        *      For an OUT, this field is the number of data bytes the host will
+        *      send during the transfer.
+        *      For an IN, this field is the buffer size that the application
+        *      has reserved for the transfer. The application is expected to
+        *      program this field as an integer multiple of the maximum packet
+        *      size for IN transactions (periodic and non-periodic).
+        */
+       struct cvmx_usbcx_hctsizx_s {
+               uint32_t dopng          : 1;
+               uint32_t pid            : 2;
+               uint32_t pktcnt         : 10;
+               uint32_t xfersize       : 19;
+       } s;
+};
+typedef union cvmx_usbcx_hctsizx cvmx_usbcx_hctsizx_t;
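+
+/*
+ * Illustrative sketch (not part of the original header): deriving PktCnt
+ * from the OUT transfer length and the maximum packet size. Even a
+ * zero-length transfer needs one packet; mps is assumed nonzero.
+ */
+static inline uint32_t cvmx_usbcx_hctsiz_for_out(uint32_t bytes,
+                                                uint32_t mps)
+{
+       union cvmx_usbcx_hctsizx siz;
+
+       siz.u32 = 0;
+       siz.s.xfersize = bytes;
+       siz.s.pktcnt = bytes ? (bytes + mps - 1) / mps : 1;
+       siz.s.pid = 0;          /* 2'b00: start with DATA0 */
+       return siz.u32;
+}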
+
+/**
+ * cvmx_usbc#_hfir
+ *
+ * Host Frame Interval Register (HFIR)
+ *
+ * This register stores the frame interval information for the current speed to
+ * which the O2P USB core has enumerated.
+ */
+union cvmx_usbcx_hfir {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hfir_s
+        * @frint: Frame Interval (FrInt)
+        *      The value that the application programs to this field specifies
+        *      the interval between two consecutive SOFs (FS) or micro-
+        *      SOFs (HS) or Keep-Alive tokens (HS). This field contains the
+        *      number of PHY clocks that constitute the required frame
+        *      interval. The default value set in this field is for FS operation
+        *      when the PHY clock frequency is 60 MHz. The application can
+        *      write a value to this register only after the Port Enable bit of
+        *      the Host Port Control and Status register (HPRT.PrtEnaPort)
+        *      has been set. If no value is programmed, the core calculates
+        *      the value based on the PHY clock specified in the FS/LS PHY
+        *      Clock Select field of the Host Configuration register
+        *      (HCFG.FSLSPclkSel). Do not change the value of this field
+        *      after the initial configuration.
+        *      * 125 us (PHY clock frequency for HS)
+        *      * 1 ms (PHY clock frequency for FS/LS)
+        */
+       struct cvmx_usbcx_hfir_s {
+               uint32_t reserved_16_31 : 16;
+               uint32_t frint          : 16;
+       } s;
+};
+typedef union cvmx_usbcx_hfir cvmx_usbcx_hfir_t;
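+
+/*
+ * Illustrative sketch (not part of the original header): FrInt counts PHY
+ * clocks per (micro)frame, so at 60 MHz a 1 ms FS frame needs 60000
+ * clocks and a 125 us HS microframe needs 7500.
+ */
+static inline uint32_t cvmx_usbcx_hfir_frint(uint32_t phy_clk_hz,
+                                            int high_speed)
+{
+       /* 8000 microframes/s at HS, 1000 frames/s at FS/LS */
+       return phy_clk_hz / (high_speed ? 8000 : 1000);
+}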
+
+/**
+ * cvmx_usbc#_hfnum
+ *
+ * Host Frame Number/Frame Time Remaining Register (HFNUM)
+ *
+ * This register indicates the current frame number.
+ * It also indicates the time remaining (in terms of the number of PHY clocks)
+ * in the current (micro)frame.
+ */
+union cvmx_usbcx_hfnum {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hfnum_s
+        * @frrem: Frame Time Remaining (FrRem)
+        *      Indicates the amount of time remaining in the current
+        *      microframe (HS) or frame (FS/LS), in terms of PHY clocks.
+        *      This field decrements on each PHY clock. When it reaches
+        *      zero, this field is reloaded with the value in the Frame
+        *      Interval register and a new SOF is transmitted on the USB.
+        * @frnum: Frame Number (FrNum)
+        *      This field increments when a new SOF is transmitted on the
+        *      USB, and is reset to 0 when it reaches 16'h3FFF.
+        */
+       struct cvmx_usbcx_hfnum_s {
+               uint32_t frrem  : 16;
+               uint32_t frnum  : 16;
+       } s;
+};
+typedef union cvmx_usbcx_hfnum cvmx_usbcx_hfnum_t;
+
+/**
+ * cvmx_usbc#_hprt
+ *
+ * Host Port Control and Status Register (HPRT)
+ *
+ * This register is available in both Host and Device modes.
+ * Currently, the OTG Host supports only one port.
+ * A single register holds USB port-related information such as USB reset,
+ * enable, suspend, resume, connect status, and test mode for each port. The
+ * R_SS_WC bits in this register can trigger an interrupt to the application
+ * through the Host Port Interrupt bit of the Core Interrupt register
+ * (GINTSTS.PrtInt). On a Port Interrupt, the application must read this
+ * register and clear the bit that caused the interrupt. For the R_SS_WC bits,
+ * the application must write a 1 to the bit to clear the interrupt.
+ */
+union cvmx_usbcx_hprt {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hprt_s
+        * @prtspd: Port Speed (PrtSpd)
+        *      Indicates the speed of the device attached to this port.
+        *      * 2'b00: High speed
+        *      * 2'b01: Full speed
+        *      * 2'b10: Low speed
+        *      * 2'b11: Reserved
+        * @prttstctl: Port Test Control (PrtTstCtl)
+        *      The application writes a nonzero value to this field to put
+        *      the port into a Test mode, and the corresponding pattern is
+        *      signaled on the port.
+        *      * 4'b0000: Test mode disabled
+        *      * 4'b0001: Test_J mode
+        *      * 4'b0010: Test_K mode
+        *      * 4'b0011: Test_SE0_NAK mode
+        *      * 4'b0100: Test_Packet mode
+        *      * 4'b0101: Test_Force_Enable
+        *      * Others: Reserved
+        *      PrtSpd must be zero (i.e. the interface must be in high-speed
+        *      mode) to use the PrtTstCtl test modes.
+        * @prtpwr: Port Power (PrtPwr)
+        *      The application uses this field to control power to this port,
+        *      and the core clears this bit on an overcurrent condition.
+        *      * 1'b0: Power off
+        *      * 1'b1: Power on
+        * @prtlnsts: Port Line Status (PrtLnSts)
+        *      Indicates the current logic level of the USB data lines.
+        *      * Bit [10]: Logic level of D-
+        *      * Bit [11]: Logic level of D+
+        * @prtrst: Port Reset (PrtRst)
+        *      When the application sets this bit, a reset sequence is
+        *      started on this port. The application must time the reset
+        *      period and clear this bit after the reset sequence is
+        *      complete.
+        *      * 1'b0: Port not in reset
+        *      * 1'b1: Port in reset
+        *      The application must leave this bit set for at least the
+        *      minimum duration mentioned below to start a reset on the
+        *      port. The application can leave it set for another 10 ms in
+        *      addition to the required minimum duration, before clearing
+        *      the bit, even though there is no maximum limit set by the
+        *      USB standard.
+        *      * High speed: 50 ms
+        *      * Full speed/Low speed: 10 ms
+        * @prtsusp: Port Suspend (PrtSusp)
+        *      The application sets this bit to put this port in Suspend
+        *      mode. The core only stops sending SOFs when this is set.
+        *      To stop the PHY clock, the application must set the Port
+        *      Clock Stop bit, which will assert the suspend input pin of
+        *      the PHY.
+        *      The read value of this bit reflects the current suspend
+        *      status of the port. This bit is cleared by the core after a
+        *      remote wakeup signal is detected or the application sets
+        *      the Port Reset bit or Port Resume bit in this register or the
+        *      Resume/Remote Wakeup Detected Interrupt bit or
+        *      Disconnect Detected Interrupt bit in the Core Interrupt
+        *      register (GINTSTS.WkUpInt or GINTSTS.DisconnInt,
+        *      respectively).
+        *      * 1'b0: Port not in Suspend mode
+        *      * 1'b1: Port in Suspend mode
+        * @prtres: Port Resume (PrtRes)
+        *      The application sets this bit to drive resume signaling on
+        *      the port. The core continues to drive the resume signal
+        *      until the application clears this bit.
+        *      If the core detects a USB remote wakeup sequence, as
+        *      indicated by the Port Resume/Remote Wakeup Detected
+        *      Interrupt bit of the Core Interrupt register
+        *      (GINTSTS.WkUpInt), the core starts driving resume
+        *      signaling without application intervention and clears this bit
+        *      when it detects a disconnect condition. The read value of
+        *      this bit indicates whether the core is currently driving
+        *      resume signaling.
+        *      * 1'b0: No resume driven
+        *      * 1'b1: Resume driven
+        * @prtovrcurrchng: Port Overcurrent Change (PrtOvrCurrChng)
+        *      The core sets this bit when the status of the Port
+        *      Overcurrent Active bit (bit 4) in this register changes.
+        * @prtovrcurract: Port Overcurrent Active (PrtOvrCurrAct)
+        *      Indicates the overcurrent condition of the port.
+        *      * 1'b0: No overcurrent condition
+        *      * 1'b1: Overcurrent condition
+        * @prtenchng: Port Enable/Disable Change (PrtEnChng)
+        *      The core sets this bit when the status of the Port Enable bit
+        *      [2] of this register changes.
+        * @prtena: Port Enable (PrtEna)
+        *      A port is enabled only by the core after a reset sequence,
+        *      and is disabled by an overcurrent condition, a disconnect
+        *      condition, or by the application clearing this bit. The
+        *      application cannot set this bit by a register write. It can only
+        *      clear it to disable the port. This bit does not trigger any
+        *      interrupt to the application.
+        *      * 1'b0: Port disabled
+        *      * 1'b1: Port enabled
+        * @prtconndet: Port Connect Detected (PrtConnDet)
+        *      The core sets this bit when a device connection is detected
+        *      to trigger an interrupt to the application using the Host Port
+        *      Interrupt bit of the Core Interrupt register (GINTSTS.PrtInt).
+        *      The application must write a 1 to this bit to clear the
+        *      interrupt.
+        * @prtconnsts: Port Connect Status (PrtConnSts)
+        *      * 0: No device is attached to the port.
+        *      * 1: A device is attached to the port.
+        */
+       struct cvmx_usbcx_hprt_s {
+               uint32_t reserved_19_31 : 13;
+               uint32_t prtspd         : 2;
+               uint32_t prttstctl      : 4;
+               uint32_t prtpwr         : 1;
+               uint32_t prtlnsts       : 2;
+               uint32_t reserved_9_9   : 1;
+               uint32_t prtrst         : 1;
+               uint32_t prtsusp        : 1;
+               uint32_t prtres         : 1;
+               uint32_t prtovrcurrchng : 1;
+               uint32_t prtovrcurract  : 1;
+               uint32_t prtenchng      : 1;
+               uint32_t prtena         : 1;
+               uint32_t prtconndet     : 1;
+               uint32_t prtconnsts     : 1;
+       } s;
+};
+typedef union cvmx_usbcx_hprt cvmx_usbcx_hprt_t;
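+
+/*
+ * Illustrative sketch (not part of the original header): the PrtRst
+ * sequence described above — hold reset for at least the minimum duration
+ * (50 ms shown, the high-speed case), then clear it. The write-1-to-clear
+ * change bits and PrtEna are zeroed first so the write-back does not
+ * clear pending interrupts or disable the port. delay_ms() is a
+ * hypothetical stand-in for the platform delay, and the CSR accessors are
+ * the hypothetical ones declared near GRSTCTL above.
+ */
+extern void delay_ms(unsigned int ms);                 /* hypothetical */
+
+static inline void cvmx_usbcx_port_reset(uint64_t hprt_addr)
+{
+       union cvmx_usbcx_hprt prt;
+
+       prt.u32 = usbc_read32(hprt_addr);
+       prt.s.prtenchng = 0;
+       prt.s.prtconndet = 0;
+       prt.s.prtovrcurrchng = 0;
+       prt.s.prtena = 0;       /* writing 1 here would disable the port */
+       prt.s.prtrst = 1;
+       usbc_write32(hprt_addr, prt.u32);
+       delay_ms(50);           /* high-speed minimum reset duration */
+       prt.s.prtrst = 0;
+       usbc_write32(hprt_addr, prt.u32);
+}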
+
+/**
+ * cvmx_usbc#_hptxfsiz
+ *
+ * Host Periodic Transmit FIFO Size Register (HPTXFSIZ)
+ *
+ * This register holds the size and the memory start address of the Periodic
+ * TxFIFO.
+ */
+union cvmx_usbcx_hptxfsiz {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hptxfsiz_s
+        * @ptxfsize: Host Periodic TxFIFO Depth (PTxFSize)
+        *      This value is in terms of 32-bit words.
+        *      * Minimum value is 16
+        *      * Maximum value is 32768
+        * @ptxfstaddr: Host Periodic TxFIFO Start Address (PTxFStAddr)
+        */
+       struct cvmx_usbcx_hptxfsiz_s {
+               uint32_t ptxfsize       : 16;
+               uint32_t ptxfstaddr     : 16;
+       } s;
+};
+typedef union cvmx_usbcx_hptxfsiz cvmx_usbcx_hptxfsiz_t;
+
+/**
+ * cvmx_usbc#_hptxsts
+ *
+ * Host Periodic Transmit FIFO/Queue Status Register (HPTXSTS)
+ *
+ * This read-only register contains the free space information for the Periodic
+ * TxFIFO and the Periodic Transmit Request Queue.
+ */
+union cvmx_usbcx_hptxsts {
+       uint32_t u32;
+       /**
+        * struct cvmx_usbcx_hptxsts_s
+        * @ptxqtop: Top of the Periodic Transmit Request Queue (PTxQTop)
+        *      This indicates the entry in the Periodic Tx Request Queue that
+        *      is currently being processed by the MAC.
+        *      This register is used for debugging.
+        *      * Bit [31]: Odd/Even (micro)frame
+        *      - 1'b0: send in even (micro)frame
+        *      - 1'b1: send in odd (micro)frame
+        *      * Bits [30:27]: Channel/endpoint number
+        *      * Bits [26:25]: Type
+        *      - 2'b00: IN/OUT
+        *      - 2'b01: Zero-length packet
+        *      - 2'b10: CSPLIT
+        *      - 2'b11: Disable channel command
+        *      * Bit [24]: Terminate (last entry for the selected
+        *      channel/endpoint)
+        * @ptxqspcavail: Periodic Transmit Request Queue Space Available
+        *      (PTxQSpcAvail)
+        *      Indicates the number of free locations available to be written
+        *      in the Periodic Transmit Request Queue. This queue holds both
+        *      IN and OUT requests.
+        *      * 8'h0: Periodic Transmit Request Queue is full
+        *      * 8'h1: 1 location available
+        *      * 8'h2: 2 locations available
+        *      * n: n locations available (0..8)
+        *      * Others: Reserved
+        * @ptxfspcavail: Periodic Transmit Data FIFO Space Available
+        *                (PTxFSpcAvail)
+        *      Indicates the number of free locations available to be written
+        *      to in the Periodic TxFIFO.
+        *      Values are in terms of 32-bit words
+        *      * 16'h0: Periodic TxFIFO is full
+        *      * 16'h1: 1 word available
+        *      * 16'h2: 2 words available
+        *      * 16'hn: n words available (where 0..32768)
+        *      * 16'h8000: 32768 words available
+        *      * Others: Reserved
+        */
+       struct cvmx_usbcx_hptxsts_s {
+               uint32_t ptxqtop        : 8;
+               uint32_t ptxqspcavail   : 8;
+               uint32_t ptxfspcavail   : 16;
+       } s;
+};
+typedef union cvmx_usbcx_hptxsts cvmx_usbcx_hptxsts_t;
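+
+/*
+ * Illustrative sketch (not part of the original header): splitting the
+ * PTxQTop debug field of a raw HPTXSTS value into the parts documented
+ * above (register bit 31 is field bit 7, and so on).
+ */
+static inline void cvmx_usbcx_hptxsts_decode_top(uint32_t hptxsts,
+                                                int *odd, int *chan)
+{
+       union cvmx_usbcx_hptxsts sts;
+
+       sts.u32 = hptxsts;
+       *odd = (sts.s.ptxqtop >> 7) & 1;        /* odd/even (micro)frame */
+       *chan = (sts.s.ptxqtop >> 3) & 0xf;     /* channel/endpoint number */
+}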
+
+#endif
diff --git a/drivers/staging/octeon-usb/cvmx-usbnx-defs.h b/drivers/staging/octeon-usb/cvmx-usbnx-defs.h
new file mode 100644 (file)
index 0000000..96d7067
--- /dev/null
@@ -0,0 +1,887 @@
+/***********************license start***************
+ * Copyright (c) 2003-2010  Cavium Networks (support@cavium.com). All rights
+ * reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+
+ *   * Neither the name of Cavium Networks nor the names of
+ *     its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written
+ *     permission.
+
+ * This Software, including technical data, may be subject to U.S. export
+ * control laws, including the U.S. Export Administration Act and its associated
+ * regulations, and may be subject to export or import  regulations in other
+ * countries.
+
+ * TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+ * AND WITH ALL FAULTS AND CAVIUM NETWORKS MAKES NO PROMISES, REPRESENTATIONS OR
+ * WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT TO
+ * THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY REPRESENTATION
+ * OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT DEFECTS, AND CAVIUM
+ * SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES OF TITLE,
+ * MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR PURPOSE, LACK OF
+ * VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT, QUIET POSSESSION OR
+ * CORRESPONDENCE TO DESCRIPTION. THE ENTIRE  RISK ARISING OUT OF USE OR
+ * PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+ ***********************license end**************************************/
+
+
+/**
+ * cvmx-usbnx-defs.h
+ *
+ * Configuration and status register (CSR) type definitions for
+ * Octeon usbnx.
+ *
+ */
+#ifndef __CVMX_USBNX_TYPEDEFS_H__
+#define __CVMX_USBNX_TYPEDEFS_H__
+
+#define CVMX_USBNXBID1(bid) (((bid) & 1) * 0x10000000ull)
+#define CVMX_USBNXBID2(bid) (((bid) & 1) * 0x100000000000ull)
+
+#define CVMX_USBNXREG1(reg, bid) \
+       (CVMX_ADD_IO_SEG(0x0001180068000000ull | reg) + CVMX_USBNXBID1(bid))
+#define CVMX_USBNXREG2(reg, bid) \
+       (CVMX_ADD_IO_SEG(0x00016F0000000000ull | reg) + CVMX_USBNXBID2(bid))
+
+#define CVMX_USBNX_CLK_CTL(bid)                CVMX_USBNXREG1(0x10, bid)
+#define CVMX_USBNX_DMA0_INB_CHN0(bid)  CVMX_USBNXREG2(0x818, bid)
+#define CVMX_USBNX_DMA0_OUTB_CHN0(bid) CVMX_USBNXREG2(0x858, bid)
+#define CVMX_USBNX_USBP_CTL_STATUS(bid)        CVMX_USBNXREG1(0x18, bid)
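+
+/*
+ * Illustrative sketch (not part of the original header): the macros above
+ * only compute CSR addresses — bit 0 of 'bid' selects the USB block and
+ * adds the per-block stride. csr_read64() is a hypothetical stand-in for
+ * the platform's 64-bit CSR accessor, and CVMX_ADD_IO_SEG is assumed to
+ * come from the platform headers.
+ */
+extern uint64_t csr_read64(uint64_t addr);             /* hypothetical */
+
+static inline uint64_t cvmx_usbnx_read_clk_ctl(int bid)
+{
+       return csr_read64(CVMX_USBNX_CLK_CTL(bid));
+}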
+
+/**
+ * cvmx_usbn#_clk_ctl
+ *
+ * USBN_CLK_CTL = USBN's Clock Control
+ *
+ * This register is used to control the frequency of the hclk and the
+ * hreset and phy_rst signals.
+ */
+union cvmx_usbnx_clk_ctl {
+       uint64_t u64;
+       /**
+        * struct cvmx_usbnx_clk_ctl_s
+        * @divide2: The 'hclk' used by the USB subsystem is derived
+        *      from the eclk.
+        *      Also see the field DIVIDE. DIVIDE2<1> must currently
+        *      be zero because it is not implemented, so the maximum
+        *      ratio of eclk/hclk is currently 16.
+        *      The actual divide number for hclk is:
+        *      (DIVIDE2 + 1) * (DIVIDE + 1)
+        * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+        *      generate the hclk in the USB Subsystem is held
+        *      in reset. This bit must be set to '0' before
+        *      changing the value of DIVIDE in this register.
+        *      The reset to the HCLK_DIVIDER is also asserted
+        *      when core reset is asserted.
+        * @p_x_on: Force USB-PHY on during suspend.
+        *      '1' USB-PHY XO block is powered-down during
+        *      suspend.
+        *      '0' USB-PHY XO block is powered-up during
+        *      suspend.
+        *      The value of this field must be set while POR is
+        *      active.
+        * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+        *      remain powered in Suspend Mode.
+        *      '1' The USB-PHY XO Bias, Bandgap and PLL are
+        *      powered down in suspend mode.
+        *      The value of this field must be set while POR is
+        *      active.
+        * @p_c_sel: Phy clock speed select.
+        *      Selects the reference clock / crystal frequency.
+        *      '11': Reserved
+        *      '10': 48 MHz (reserved when a crystal is used)
+        *      '01': 24 MHz (reserved when a crystal is used)
+        *      '00': 12 MHz
+        *      The value of this field must be set while POR is
+        *      active.
+        *      NOTE: if a crystal is used as a reference clock,
+        *      this field must be set to 12 MHz.
+        * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+        * @sd_mode: Scaledown mode for the USBC. Control timing events
+        *      in the USBC, for normal operation this must be '0'.
+        * @s_bist: Starts bist on the hclk memories, during the '0'
+        *      to '1' transition.
+        * @por: Power On Reset for the PHY.
+        *      Resets all the PHY's registers and state machines.
+        * @enable: When '1' allows the generation of the hclk. When
+        *      '0' the hclk will not be generated. SEE DIVIDE
+        *      field of this register.
+        * @prst: When this field is '0' the reset associated with
+        *      the phy_clk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until the time it takes 6 clocks (hclk or phy_clk,
+        *      whichever is slower) has passed. Under normal
+        *      operation once this bit is set to '1' it should not
+        *      be set to '0'.
+        * @hrst: When this field is '0' the reset associated with
+        *      the hclk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until 12 ms after phy_clk is stable. Under normal
+        *      operation, once this bit is set to '1' it should
+        *      not be set to '0'.
+        * @divide: The frequency of 'hclk' used by the USB subsystem
+        *      is the eclk frequency divided by the value of
+        *      (DIVIDE2 + 1) * (DIVIDE + 1), also see the field
+        *      DIVIDE2 of this register.
+        *      The hclk frequency should be less than 125 MHz.
+        *      After writing a value to this field the SW should
+        *      read the field for the value written.
+        *      The ENABLE field of this register should not be set
+        *      until AFTER this field is set and then read.
+        */
+       struct cvmx_usbnx_clk_ctl_s {
+               uint64_t reserved_20_63 : 44;
+               uint64_t divide2        : 2;
+               uint64_t hclk_rst       : 1;
+               uint64_t p_x_on         : 1;
+               uint64_t reserved_14_15 : 2;
+               uint64_t p_com_on       : 1;
+               uint64_t p_c_sel        : 2;
+               uint64_t cdiv_byp       : 1;
+               uint64_t sd_mode        : 2;
+               uint64_t s_bist         : 1;
+               uint64_t por            : 1;
+               uint64_t enable         : 1;
+               uint64_t prst           : 1;
+               uint64_t hrst           : 1;
+               uint64_t divide         : 3;
+       } s;
+       /**
+        * struct cvmx_usbnx_clk_ctl_cn30xx
+        * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+        *      generate the hclk in the USB Subsystem is held
+        *      in reset. This bit must be set to '0' before
+        *      changing the value of DIVIDE in this register.
+        *      The reset to the HCLK_DIVIDER is also asserted
+        *      when core reset is asserted.
+        * @p_x_on: Force USB-PHY on during suspend.
+        *      '1' USB-PHY XO block is powered-down during
+        *      suspend.
+        *      '0' USB-PHY XO block is powered-up during
+        *      suspend.
+        *      The value of this field must be set while POR is
+        *      active.
+        * @p_rclk: PHY reference clock enable.
+        *      '1' The PHY PLL uses the XO block output as a
+        *      reference.
+        *      '0' Reserved.
+        * @p_xenbn: Phy external clock enable.
+        *      '1' The XO block uses the clock from a crystal.
+        *      '0' The XO block uses an external clock supplied
+        *      on the XO pin. USB_XI should be tied to
+        *      ground for this usage.
+        * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+        *      remain powered in Suspend Mode.
+        *      '1' The USB-PHY XO Bias, Bandgap and PLL are
+        *      powered down in suspend mode.
+        *      The value of this field must be set while POR is
+        *      active.
+        * @p_c_sel: Phy clock speed select.
+        *      Selects the reference clock / crystal frequency.
+        *      '11': Reserved
+        *      '10': 48 MHz
+        *      '01': 24 MHz
+        *      '00': 12 MHz
+        *      The value of this field must be set while POR is
+        *      active.
+        * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+        * @sd_mode: Scaledown mode for the USBC. Controls timing events
+        *      in the USBC; for normal operation this must be '0'.
+        * @s_bist: Starts bist on the hclk memories, during the '0'
+        *      to '1' transition.
+        * @por: Power On Reset for the PHY.
+        *      Resets all the PHYS registers and state machines.
+        * @enable: When '1' allows the generation of the hclk. When
+        *      '0' the hclk will not be generated.
+        * @prst: When this field is '0' the reset associated with
+        *      the phy_clk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until the time it takes 6 clocks (hclk or phy_clk,
+        *      whichever is slower) has passed. Under normal
+        *      operation once this bit is set to '1' it should not
+        *      be set to '0'.
+        * @hrst: When this field is '0' the reset associated with
+        *      the hclk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until 12ms after phy_clk is stable. Under normal
+        *      operation, once this bit is set to '1' it should
+        *      not be set to '0'.
+        * @divide: The 'hclk' used by the USB subsystem is derived
+        *      from the eclk. The eclk will be divided by the
+        *      value of this field +1 to determine the hclk
+        *      frequency. (Also see HRST of this register).
+        *      The hclk frequency must be less than 125 MHz.
+        */
+       struct cvmx_usbnx_clk_ctl_cn30xx {
+               uint64_t reserved_18_63 : 46;
+               uint64_t hclk_rst       : 1;
+               uint64_t p_x_on         : 1;
+               uint64_t p_rclk         : 1;
+               uint64_t p_xenbn        : 1;
+               uint64_t p_com_on       : 1;
+               uint64_t p_c_sel        : 2;
+               uint64_t cdiv_byp       : 1;
+               uint64_t sd_mode        : 2;
+               uint64_t s_bist         : 1;
+               uint64_t por            : 1;
+               uint64_t enable         : 1;
+               uint64_t prst           : 1;
+               uint64_t hrst           : 1;
+               uint64_t divide         : 3;
+       } cn30xx;
+       struct cvmx_usbnx_clk_ctl_cn30xx cn31xx;
+       /**
+        * struct cvmx_usbnx_clk_ctl_cn50xx
+        * @divide2: The 'hclk' used by the USB subsystem is derived
+        *      from the eclk.
+        *      Also see the field DIVIDE. DIVIDE2<1> must currently
+        *      be zero because it is not implemented, so the maximum
+        *      ratio of eclk/hclk is currently 16.
+        *      The actual divide number for hclk is:
+        *      (DIVIDE2 + 1) * (DIVIDE + 1)
+        * @hclk_rst: When this field is '0' the HCLK-DIVIDER used to
+        *      generate the hclk in the USB Subsystem is held
+        *      in reset. This bit must be set to '0' before
+        *      changing the value of DIVIDE in this register.
+        *      The reset to the HCLK_DIVIDER is also asserted
+        *      when core reset is asserted.
+        * @p_rtype: PHY reference clock type.
+        *      '0' The USB-PHY uses a 12 MHz crystal as a clock
+        *      source at the USB_XO and USB_XI pins.
+        *      '1' Reserved.
+        *      '2' The USB_PHY uses a 12/24/48 MHz 2.5V board clock
+        *      at the USB_XO pin. USB_XI should be tied to
+        *      ground in this case.
+        *      '3' Reserved.
+        *      (bit 14 was P_XENBN on 3xxx)
+        *      (bit 15 was P_RCLK on 3xxx)
+        * @p_com_on: '0' Force USB-PHY XO Bias, Bandgap and PLL to
+        *      remain powered in Suspend Mode.
+        *      '1' The USB-PHY XO Bias, Bandgap and PLL are
+        *      powered down in suspend mode.
+        *      The value of this field must be set while POR is
+        *      active.
+        * @p_c_sel: Phy clock speed select.
+        *      Selects the reference clock / crystal frequency.
+        *      '11': Reserved
+        *      '10': 48 MHz (reserved when a crystal is used)
+        *      '01': 24 MHz (reserved when a crystal is used)
+        *      '00': 12 MHz
+        *      The value of this field must be set while POR is
+        *      active.
+        *      NOTE: if a crystal is used as the reference clock,
+        *      this field must be set to '00' (12 MHz).
+        * @cdiv_byp: Used to enable the bypass input to the USB_CLK_DIV.
+        * @sd_mode: Scaledown mode for the USBC. Controls timing events
+        *      in the USBC; for normal operation this must be '0'.
+        * @s_bist: Starts bist on the hclk memories, during the '0'
+        *      to '1' transition.
+        * @por: Power On Reset for the PHY.
+        *      Resets all the PHYS registers and state machines.
+        * @enable: When '1' allows the generation of the hclk. When
+        *      '0' the hclk will not be generated. See the DIVIDE
+        *      field of this register.
+        * @prst: When this field is '0' the reset associated with
+        *      the phy_clk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until the time it takes 6 clocks (hclk or phy_clk,
+        *      whichever is slower) has passed. Under normal
+        *      operation once this bit is set to '1' it should not
+        *      be set to '0'.
+        * @hrst: When this field is '0' the reset associated with
+        *      the hclk functionality in the USB Subsystem is
+        *      held in reset. This bit should not be set to '1'
+        *      until 12ms after phy_clk is stable. Under normal
+        *      operation, once this bit is set to '1' it should
+        *      not be set to '0'.
+        * @divide: The frequency of 'hclk' used by the USB subsystem
+        *      is the eclk frequency divided by the value of
+        *      (DIVIDE2 + 1) * (DIVIDE + 1); also see the field
+        *      DIVIDE2 of this register.
+        *      The hclk frequency should be less than 125 MHz.
+        *      After writing a value to this field, software should
+        *      read the field back to confirm the value written.
+        *      The ENABLE field of this register should not be set
+        *      until AFTER this field is set and then read.
+        */
+       struct cvmx_usbnx_clk_ctl_cn50xx {
+               uint64_t reserved_20_63 : 44;
+               uint64_t divide2        : 2;
+               uint64_t hclk_rst       : 1;
+               uint64_t reserved_16_16 : 1;
+               uint64_t p_rtype        : 2;
+               uint64_t p_com_on       : 1;
+               uint64_t p_c_sel        : 2;
+               uint64_t cdiv_byp       : 1;
+               uint64_t sd_mode        : 2;
+               uint64_t s_bist         : 1;
+               uint64_t por            : 1;
+               uint64_t enable         : 1;
+               uint64_t prst           : 1;
+               uint64_t hrst           : 1;
+               uint64_t divide         : 3;
+       } cn50xx;
+       struct cvmx_usbnx_clk_ctl_cn50xx cn52xx;
+       struct cvmx_usbnx_clk_ctl_cn50xx cn56xx;
+};
+typedef union cvmx_usbnx_clk_ctl cvmx_usbnx_clk_ctl_t;
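The DIVIDE notes above encode an ordering requirement that is easy to miss: write the divider, read it back, and only then set ENABLE. A minimal sketch of that sequence follows, assuming the cvmx_read_csr()/cvmx_write_csr() accessors and a CVMX_USBNX_CLK_CTL(block) address macro from the Octeon SDK headers (none of which appear in this hunk); the helper name is illustrative, not driver code.

static void example_program_hclk(int block, int divide, int divide2)
{
        union cvmx_usbnx_clk_ctl clk;

        /* DIVIDE2<1> is unimplemented, so the eclk/hclk ratio tops out at 16. */
        clk.u64 = cvmx_read_csr(CVMX_USBNX_CLK_CTL(block));
        clk.s.enable = 0;        /* do not enable hclk yet */
        clk.s.divide = divide;   /* hclk = eclk / ((DIVIDE2+1) * (DIVIDE+1)) */
        clk.s.divide2 = divide2; /* keep the resulting hclk below 125 MHz */
        cvmx_write_csr(CVMX_USBNX_CLK_CTL(block), clk.u64);

        /* Per the field notes, read DIVIDE back before setting ENABLE. */
        clk.u64 = cvmx_read_csr(CVMX_USBNX_CLK_CTL(block));
        clk.s.enable = 1;
        cvmx_write_csr(CVMX_USBNX_CLK_CTL(block), clk.u64);
}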
+
+/**
+ * cvmx_usbn#_usbp_ctl_status
+ *
+ * USBN_USBP_CTL_STATUS = USBP Control And Status Register
+ *
+ * Contains general control and status information for the USBN block.
+ */
+union cvmx_usbnx_usbp_ctl_status {
+       uint64_t u64;
+       /**
+        * struct cvmx_usbnx_usbp_ctl_status_s
+        * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+        * @txvreftune: HS DC Voltage Level Adjustment
+        * @txfslstune: FS/LS Source Impedance Adjustment
+        * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+        * @sqrxtune: Squelch Threshold Adjustment
+        * @compdistune: Disconnect Threshold Adjustment
+        * @otgtune: VBUS Valid Threshold Adjustment
+        * @otgdisable: OTG Block Disable
+        * @portreset: Per_Port Reset
+        * @drvvbus: Drive VBUS
+        * @lsbist: Low-Speed BIST Enable.
+        * @fsbist: Full-Speed BIST Enable.
+        * @hsbist: High-Speed BIST Enable.
+        * @bist_done: PHY Bist Done.
+        *      Asserted at the end of the PHY BIST sequence.
+        * @bist_err: PHY Bist Error.
+        *      Indicates an internal error was detected during
+        *      the BIST sequence.
+        * @tdata_out: PHY Test Data Out.
+        *      Presents either internally generated signals or
+        *      test register contents, based upon the value of
+        *      test_data_out_sel.
+        * @siddq: Drives the USBP (USB-PHY) SIDDQ input.
+        *      Normally should be set to zero.
+        *      When customers have no intent to use USB PHY
+        *      interface, they should:
+        *      - still provide 3.3V to USB_VDD33, and
+        *      - tie USB_REXT to 3.3V supply, and
+        *      - set USBN*_USBP_CTL_STATUS[SIDDQ]=1
+        * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+        * @dma_bmode: When set to 1 the L2C DMA address will be updated
+        *      with byte-counts between packets. When set to 0
+        *      the L2C DMA address is incremented to the next
+        *      4-byte aligned address after adding byte-count.
+        * @usbc_end: Bigendian input to the USB Core. This should be
+        *      set to '0' for operation.
+        * @usbp_bist: PHY BIST control. Cleared to '0' to run BIST
+        *      on the USBP.
+        * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+        * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D+ line. '1': the pull-down resistance is
+        *      connected to D+. '0': the pull-down resistance is
+        *      not connected to D+. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D- line. '1': the pull-down resistance is
+        *      connected to D-. '0': the pull-down resistance is
+        *      not connected to D-. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @hst_mode: When '0' the USB is acting as HOST, when '1'
+        *      USB is acting as device. This field needs to be
+        *      set while the USB is in reset.
+        * @tuning: Transmitter Tuning for High-Speed Operation.
+        *      Tunes the current supply and rise/fall output
+        *      times for high-speed operation.
+        *      [20:19] == 11: Current supply increased
+        *      approximately 9%
+        *      [20:19] == 10: Current supply increased
+        *      approximately 4.5%
+        *      [20:19] == 01: Design default.
+        *      [20:19] == 00: Current supply decreased
+        *      approximately 4.5%
+        *      [22:21] == 11: Rise and fall times are increased.
+        *      [22:21] == 10: Design default.
+        *      [22:21] == 01: Rise and fall times are decreased.
+        *      [22:21] == 00: Rise and fall times are decreased
+        *      further as compared to the 01 setting.
+        * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+        *      Enables or disables bit stuffing on data[15:8]
+        *      when bit-stuffing is enabled.
+        * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+        *      Enables or disables bit stuffing on data[7:0]
+        *      when bit-stuffing is enabled.
+        * @loop_enb: PHY Loopback Test Enable.
+        *      '1': During data transmission the receive is
+        *      enabled.
+        *      '0': During data transmission the receive is
+        *      disabled.
+        *      Must be '0' for normal operation.
+        * @vtest_enb: Analog Test Pin Enable.
+        *      '1' The PHY's analog_test pin is enabled for the
+        *      input and output of applicable analog test signals.
+        *      '0' The analog_test pin is disabled.
+        * @bist_enb: Built-In Self Test Enable.
+        *      Used to activate BIST in the PHY.
+        * @tdata_sel: Test Data Out Select.
+        *      '1' test_data_out[3:0] (PHY) register contents
+        *      are output. '0' internally generated signals are
+        *      output.
+        * @taddr_in: Mode Address for Test Interface.
+        *      Specifies the register address for writing to or
+        *      reading from the PHY test interface register.
+        * @tdata_in: Internal Testing Register Input Data and Select
+        *      This is a test bus. Data is present on [3:0],
+        *      and its corresponding select (enable) is present
+        *      on bits [7:4].
+        * @ate_reset: Reset input from automatic test equipment.
+        *      This is a test signal. When the USB Core is
+        *      powered up (not in Suspend Mode), an automatic
+        *      tester can use this to disable phy_clock and
+        *      free_clk, then re-enable them with an aligned
+        *      phase.
+        *      '1': The phy_clk and free_clk outputs are
+        *      disabled. '0': The phy_clock and free_clk outputs
+        *      are available within a specific period after the
+        *      de-assertion.
+        */
+       struct cvmx_usbnx_usbp_ctl_status_s {
+               uint64_t txrisetune             : 1;
+               uint64_t txvreftune             : 4;
+               uint64_t txfslstune             : 4;
+               uint64_t txhsxvtune             : 2;
+               uint64_t sqrxtune               : 3;
+               uint64_t compdistune            : 3;
+               uint64_t otgtune                : 3;
+               uint64_t otgdisable             : 1;
+               uint64_t portreset              : 1;
+               uint64_t drvvbus                : 1;
+               uint64_t lsbist                 : 1;
+               uint64_t fsbist                 : 1;
+               uint64_t hsbist                 : 1;
+               uint64_t bist_done              : 1;
+               uint64_t bist_err               : 1;
+               uint64_t tdata_out              : 4;
+               uint64_t siddq                  : 1;
+               uint64_t txpreemphasistune      : 1;
+               uint64_t dma_bmode              : 1;
+               uint64_t usbc_end               : 1;
+               uint64_t usbp_bist              : 1;
+               uint64_t tclk                   : 1;
+               uint64_t dp_pulld               : 1;
+               uint64_t dm_pulld               : 1;
+               uint64_t hst_mode               : 1;
+               uint64_t tuning                 : 4;
+               uint64_t tx_bs_enh              : 1;
+               uint64_t tx_bs_en               : 1;
+               uint64_t loop_enb               : 1;
+               uint64_t vtest_enb              : 1;
+               uint64_t bist_enb               : 1;
+               uint64_t tdata_sel              : 1;
+               uint64_t taddr_in               : 4;
+               uint64_t tdata_in               : 8;
+               uint64_t ate_reset              : 1;
+       } s;
+       /**
+        * struct cvmx_usbnx_usbp_ctl_status_cn30xx
+        * @bist_done: PHY Bist Done.
+        *      Asserted at the end of the PHY BIST sequence.
+        * @bist_err: PHY Bist Error.
+        *      Indicates an internal error was detected during
+        *      the BIST sequence.
+        * @tdata_out: PHY Test Data Out.
+        *      Presents either internally generated signals or
+        *      test register contents, based upon the value of
+        *      test_data_out_sel.
+        * @dma_bmode: When set to 1 the L2C DMA address will be updated
+        *      with byte-counts between packets. When set to 0
+        *      the L2C DMA address is incremented to the next
+        *      4-byte aligned address after adding byte-count.
+        * @usbc_end: Bigendian input to the USB Core. This should be
+        *      set to '0' for operation.
+        * @usbp_bist: PHY BIST control. Cleared to '0' to run BIST
+        *      on the USBP.
+        * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+        * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D+ line. '1': the pull-down resistance is
+        *      connected to D+. '0': the pull-down resistance is
+        *      not connected to D+. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D- line. '1': the pull-down resistance is
+        *      connected to D-. '0': the pull-down resistance is
+        *      not connected to D-. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @hst_mode: When '0' the USB is acting as HOST, when '1'
+        *      USB is acting as device. This field needs to be
+        *      set while the USB is in reset.
+        * @tuning: Transmitter Tuning for High-Speed Operation.
+        *      Tunes the current supply and rise/fall output
+        *      times for high-speed operation.
+        *      [20:19] == 11: Current supply increased
+        *      approximately 9%
+        *      [20:19] == 10: Current supply increased
+        *      approximately 4.5%
+        *      [20:19] == 01: Design default.
+        *      [20:19] == 00: Current supply decreased
+        *      approximately 4.5%
+        *      [22:21] == 11: Rise and fall times are increased.
+        *      [22:21] == 10: Design default.
+        *      [22:21] == 01: Rise and fall times are decreased.
+        *      [22:21] == 00: Rise and fall times are decreased
+        *      further as compared to the 01 setting.
+        * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+        *      Enables or disables bit stuffing on data[15:8]
+        *      when bit-stuffing is enabled.
+        * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+        *      Enables or disables bit stuffing on data[7:0]
+        *      when bit-stuffing is enabled.
+        * @loop_enb: PHY Loopback Test Enable.
+        *      '1': During data transmission the receive is
+        *      enabled.
+        *      '0': During data transmission the receive is
+        *      disabled.
+        *      Must be '0' for normal operation.
+        * @vtest_enb: Analog Test Pin Enable.
+        *      '1' The PHY's analog_test pin is enabled for the
+        *      input and output of applicable analog test signals.
+        *      '0' The analog_test pin is disabled.
+        * @bist_enb: Built-In Self Test Enable.
+        *      Used to activate BIST in the PHY.
+        * @tdata_sel: Test Data Out Select.
+        *      '1' test_data_out[3:0] (PHY) register contents
+        *      are output. '0' internally generated signals are
+        *      output.
+        * @taddr_in: Mode Address for Test Interface.
+        *      Specifies the register address for writing to or
+        *      reading from the PHY test interface register.
+        * @tdata_in: Internal Testing Register Input Data and Select
+        *      This is a test bus. Data is present on [3:0],
+        *      and its corresponding select (enable) is present
+        *      on bits [7:4].
+        * @ate_reset: Reset input from automatic test equipment.
+        *      This is a test signal. When the USB Core is
+        *      powered up (not in Suspend Mode), an automatic
+        *      tester can use this to disable phy_clock and
+        *      free_clk, then re-enable them with an aligned
+        *      phase.
+        *      '1': The phy_clk and free_clk outputs are
+        *      disabled. '0': The phy_clock and free_clk outputs
+        *      are available within a specific period after the
+        *      de-assertion.
+        */
+       struct cvmx_usbnx_usbp_ctl_status_cn30xx {
+               uint64_t reserved_38_63 : 26;
+               uint64_t bist_done      : 1;
+               uint64_t bist_err       : 1;
+               uint64_t tdata_out      : 4;
+               uint64_t reserved_30_31 : 2;
+               uint64_t dma_bmode      : 1;
+               uint64_t usbc_end       : 1;
+               uint64_t usbp_bist      : 1;
+               uint64_t tclk           : 1;
+               uint64_t dp_pulld       : 1;
+               uint64_t dm_pulld       : 1;
+               uint64_t hst_mode       : 1;
+               uint64_t tuning         : 4;
+               uint64_t tx_bs_enh      : 1;
+               uint64_t tx_bs_en       : 1;
+               uint64_t loop_enb       : 1;
+               uint64_t vtest_enb      : 1;
+               uint64_t bist_enb       : 1;
+               uint64_t tdata_sel      : 1;
+               uint64_t taddr_in       : 4;
+               uint64_t tdata_in       : 8;
+               uint64_t ate_reset      : 1;
+       } cn30xx;
+       /**
+        * struct cvmx_usbnx_usbp_ctl_status_cn50xx
+        * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+        * @txvreftune: HS DC Voltage Level Adjustment
+        * @txfslstune: FS/LS Source Impedance Adjustment
+        * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+        * @sqrxtune: Squelch Threshold Adjustment
+        * @compdistune: Disconnect Threshold Adjustment
+        * @otgtune: VBUS Valid Threshold Adjustment
+        * @otgdisable: OTG Block Disable
+        * @portreset: Per_Port Reset
+        * @drvvbus: Drive VBUS
+        * @lsbist: Low-Speed BIST Enable.
+        * @fsbist: Full-Speed BIST Enable.
+        * @hsbist: High-Speed BIST Enable.
+        * @bist_done: PHY Bist Done.
+        *      Asserted at the end of the PHY BIST sequence.
+        * @bist_err: PHY Bist Error.
+        *      Indicates an internal error was detected during
+        *      the BIST sequence.
+        * @tdata_out: PHY Test Data Out.
+        *      Presents either internally generated signals or
+        *      test register contents, based upon the value of
+        *      test_data_out_sel.
+        * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+        * @dma_bmode: When set to 1 the L2C DMA address will be updated
+        *      with byte-counts between packets. When set to 0
+        *      the L2C DMA address is incremented to the next
+        *      4-byte aligned address after adding byte-count.
+        * @usbc_end: Bigendian input to the USB Core. This should be
+        *      set to '0' for operation.
+        * @usbp_bist: PHY BIST control. Cleared to '0' to run BIST
+        *      on the USBP.
+        * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+        * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D+ line. '1': the pull-down resistance is
+        *      connected to D+. '0': the pull-down resistance is
+        *      not connected to D+. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D- line. '1': the pull-down resistance is
+        *      connected to D-. '0': the pull-down resistance is
+        *      not connected to D-. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @hst_mode: When '0' the USB is acting as HOST, when '1'
+        *      USB is acting as device. This field needs to be
+        *      set while the USB is in reset.
+        * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+        *      Enables or disables bit stuffing on data[15:8]
+        *      when bit-stuffing is enabled.
+        * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+        *      Enables or disables bit stuffing on data[7:0]
+        *      when bit-stuffing is enabled.
+        * @loop_enb: PHY Loopback Test Enable.
+        *      '1': During data transmission the receive is
+        *      enabled.
+        *      '0': During data transmission the receive is
+        *      disabled.
+        *      Must be '0' for normal operation.
+        * @vtest_enb: Analog Test Pin Enable.
+        *      '1' The PHY's analog_test pin is enabled for the
+        *      input and output of applicable analog test signals.
+        *      '0' The analog_test pin is disabled.
+        * @bist_enb: Built-In Self Test Enable.
+        *      Used to activate BIST in the PHY.
+        * @tdata_sel: Test Data Out Select.
+        *      '1' test_data_out[3:0] (PHY) register contents
+        *      are output. '0' internally generated signals are
+        *      output.
+        * @taddr_in: Mode Address for Test Interface.
+        *      Specifies the register address for writing to or
+        *      reading from the PHY test interface register.
+        * @tdata_in: Internal Testing Register Input Data and Select
+        *      This is a test bus. Data is present on [3:0],
+        *      and its corresponding select (enable) is present
+        *      on bits [7:4].
+        * @ate_reset: Reset input from automatic test equipment.
+        *      This is a test signal. When the USB Core is
+        *      powered up (not in Suspend Mode), an automatic
+        *      tester can use this to disable phy_clock and
+        *      free_clk, then re-enable them with an aligned
+        *      phase.
+        *      '1': The phy_clk and free_clk outputs are
+        *      disabled. '0': The phy_clock and free_clk outputs
+        *      are available within a specific period after the
+        *      de-assertion.
+        */
+       struct cvmx_usbnx_usbp_ctl_status_cn50xx {
+               uint64_t txrisetune             : 1;
+               uint64_t txvreftune             : 4;
+               uint64_t txfslstune             : 4;
+               uint64_t txhsxvtune             : 2;
+               uint64_t sqrxtune               : 3;
+               uint64_t compdistune            : 3;
+               uint64_t otgtune                : 3;
+               uint64_t otgdisable             : 1;
+               uint64_t portreset              : 1;
+               uint64_t drvvbus                : 1;
+               uint64_t lsbist                 : 1;
+               uint64_t fsbist                 : 1;
+               uint64_t hsbist                 : 1;
+               uint64_t bist_done              : 1;
+               uint64_t bist_err               : 1;
+               uint64_t tdata_out              : 4;
+               uint64_t reserved_31_31         : 1;
+               uint64_t txpreemphasistune      : 1;
+               uint64_t dma_bmode              : 1;
+               uint64_t usbc_end               : 1;
+               uint64_t usbp_bist              : 1;
+               uint64_t tclk                   : 1;
+               uint64_t dp_pulld               : 1;
+               uint64_t dm_pulld               : 1;
+               uint64_t hst_mode               : 1;
+               uint64_t reserved_19_22         : 4;
+               uint64_t tx_bs_enh              : 1;
+               uint64_t tx_bs_en               : 1;
+               uint64_t loop_enb               : 1;
+               uint64_t vtest_enb              : 1;
+               uint64_t bist_enb               : 1;
+               uint64_t tdata_sel              : 1;
+               uint64_t taddr_in               : 4;
+               uint64_t tdata_in               : 8;
+               uint64_t ate_reset              : 1;
+       } cn50xx;
+       /**
+        * struct cvmx_usbnx_usbp_ctl_status_cn52xx
+        * @txrisetune: HS Transmitter Rise/Fall Time Adjustment
+        * @txvreftune: HS DC Voltage Level Adjustment
+        * @txfslstune: FS/LS Source Impedance Adjustment
+        * @txhsxvtune: Transmitter High-Speed Crossover Adjustment
+        * @sqrxtune: Squelch Threshold Adjustment
+        * @compdistune: Disconnect Threshold Adjustment
+        * @otgtune: VBUS Valid Threshold Adjustment
+        * @otgdisable: OTG Block Disable
+        * @portreset: Per_Port Reset
+        * @drvvbus: Drive VBUS
+        * @lsbist: Low-Speed BIST Enable.
+        * @fsbist: Full-Speed BIST Enable.
+        * @hsbist: High-Speed BIST Enable.
+        * @bist_done: PHY Bist Done.
+        *      Asserted at the end of the PHY BIST sequence.
+        * @bist_err: PHY Bist Error.
+        *      Indicates an internal error was detected during
+        *      the BIST sequence.
+        * @tdata_out: PHY Test Data Out.
+        *      Presents either internally generated signals or
+        *      test register contents, based upon the value of
+        *      test_data_out_sel.
+        * @siddq: Drives the USBP (USB-PHY) SIDDQ input.
+        *      Normally should be set to zero.
+        *      When customers have no intent to use USB PHY
+        *      interface, they should:
+        *      - still provide 3.3V to USB_VDD33, and
+        *      - tie USB_REXT to 3.3V supply, and
+        *      - set USBN*_USBP_CTL_STATUS[SIDDQ]=1
+        * @txpreemphasistune: HS Transmitter Pre-Emphasis Enable
+        * @dma_bmode: When set to 1 the L2C DMA address will be updated
+        *      with byte-counts between packets. When set to 0
+        *      the L2C DMA address is incremented to the next
+        *      4-byte aligned address after adding byte-count.
+        * @usbc_end: Bigendian input to the USB Core. This should be
+        *      set to '0' for operation.
+        * @usbp_bist: PHY BIST control. Cleared to '0' to run BIST
+        *      on the USBP.
+        * @tclk: PHY Test Clock, used to load TDATA_IN to the USBP.
+        * @dp_pulld: PHY DP_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D+ line. '1': the pull-down resistance is
+        *      connected to D+. '0': the pull-down resistance is
+        *      not connected to D+. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @dm_pulld: PHY DM_PULLDOWN input to the USB-PHY.
+        *      This signal enables the pull-down resistance on
+        *      the D- line. '1': the pull-down resistance is
+        *      connected to D-. '0': the pull-down resistance is
+        *      not connected to D-. When an A/B device is acting
+        *      as a host (downstream-facing port), dp_pulldown and
+        *      dm_pulldown are enabled. This must not toggle
+        *      during normal operation.
+        * @hst_mode: When '0' the USB is acting as HOST, when '1'
+        *      USB is acting as device. This field needs to be
+        *      set while the USB is in reset.
+        * @tx_bs_enh: Transmit Bit Stuffing on [15:8].
+        *      Enables or disables bit stuffing on data[15:8]
+        *      when bit-stuffing is enabled.
+        * @tx_bs_en: Transmit Bit Stuffing on [7:0].
+        *      Enables or disables bit stuffing on data[7:0]
+        *      when bit-stuffing is enabled.
+        * @loop_enb: PHY Loopback Test Enable.
+        *      '1': During data transmission the receive is
+        *      enabled.
+        *      '0': During data transmission the receive is
+        *      disabled.
+        *      Must be '0' for normal operation.
+        * @vtest_enb: Analog Test Pin Enable.
+        *      '1' The PHY's analog_test pin is enabled for the
+        *      input and output of applicable analog test signals.
+        *      '0' The analog_test pin is disabled.
+        * @bist_enb: Built-In Self Test Enable.
+        *      Used to activate BIST in the PHY.
+        * @tdata_sel: Test Data Out Select.
+        *      '1' test_data_out[3:0] (PHY) register contents
+        *      are output. '0' internally generated signals are
+        *      output.
+        * @taddr_in: Mode Address for Test Interface.
+        *      Specifies the register address for writing to or
+        *      reading from the PHY test interface register.
+        * @tdata_in: Internal Testing Register Input Data and Select
+        *      This is a test bus. Data is present on [3:0],
+        *      and its corresponding select (enable) is present
+        *      on bits [7:4].
+        * @ate_reset: Reset input from automatic test equipment.
+        *      This is a test signal. When the USB Core is
+        *      powered up (not in Suspend Mode), an automatic
+        *      tester can use this to disable phy_clock and
+        *      free_clk, then re-enable them with an aligned
+        *      phase.
+        *      '1': The phy_clk and free_clk outputs are
+        *      disabled. '0': The phy_clock and free_clk outputs
+        *      are available within a specific period after the
+        *      de-assertion.
+        */
+       struct cvmx_usbnx_usbp_ctl_status_cn52xx {
+               uint64_t txrisetune             : 1;
+               uint64_t txvreftune             : 4;
+               uint64_t txfslstune             : 4;
+               uint64_t txhsxvtune             : 2;
+               uint64_t sqrxtune               : 3;
+               uint64_t compdistune            : 3;
+               uint64_t otgtune                : 3;
+               uint64_t otgdisable             : 1;
+               uint64_t portreset              : 1;
+               uint64_t drvvbus                : 1;
+               uint64_t lsbist                 : 1;
+               uint64_t fsbist                 : 1;
+               uint64_t hsbist                 : 1;
+               uint64_t bist_done              : 1;
+               uint64_t bist_err               : 1;
+               uint64_t tdata_out              : 4;
+               uint64_t siddq                  : 1;
+               uint64_t txpreemphasistune      : 1;
+               uint64_t dma_bmode              : 1;
+               uint64_t usbc_end               : 1;
+               uint64_t usbp_bist              : 1;
+               uint64_t tclk                   : 1;
+               uint64_t dp_pulld               : 1;
+               uint64_t dm_pulld               : 1;
+               uint64_t hst_mode               : 1;
+               uint64_t reserved_19_22         : 4;
+               uint64_t tx_bs_enh              : 1;
+               uint64_t tx_bs_en               : 1;
+               uint64_t loop_enb               : 1;
+               uint64_t vtest_enb              : 1;
+               uint64_t bist_enb               : 1;
+               uint64_t tdata_sel              : 1;
+               uint64_t taddr_in               : 4;
+               uint64_t tdata_in               : 8;
+               uint64_t ate_reset              : 1;
+       } cn52xx;
+};
+typedef union cvmx_usbnx_usbp_ctl_status cvmx_usbnx_usbp_ctl_status_t;
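As a usage note for the SIDDQ field documented above: a board with no USB requirement can park the PHY while still honoring the power-rail constraints. A hedged sketch, reusing the same assumed cvmx_read_csr()/cvmx_write_csr() accessors plus an assumed CVMX_USBNX_USBP_CTL_STATUS(block) address macro:

static void example_park_unused_phy(int block)
{
        union cvmx_usbnx_usbp_ctl_status ctl;

        /* Per the SIDDQ notes: the board must still supply 3.3V on
           USB_VDD33 and tie USB_REXT to the 3.3V rail, then set
           USBN*_USBP_CTL_STATUS[SIDDQ] = 1. */
        ctl.u64 = cvmx_read_csr(CVMX_USBNX_USBP_CTL_STATUS(block));
        ctl.s.siddq = 1;
        cvmx_write_csr(CVMX_USBNX_USBP_CTL_STATUS(block), ctl.u64);
}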
+
+#endif
diff --git a/drivers/staging/octeon-usb/octeon-hcd.c b/drivers/staging/octeon-usb/octeon-hcd.c
new file mode 100644 (file)
index 0000000..08de25f
--- /dev/null
@@ -0,0 +1,815 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2008 Cavium Networks
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/usb.h>
+
+#include <asm/time.h>
+#include <asm/delay.h>
+
+#include <asm/octeon/cvmx.h>
+#include "cvmx-usb.h"
+#include <asm/octeon/cvmx-iob-defs.h>
+
+#include <linux/usb/hcd.h>
+
+#include <linux/err.h>
+
+//#define DEBUG_CALL(format, ...)         printk(format, ##__VA_ARGS__)
+#define DEBUG_CALL(format, ...)         do {} while (0)
+//#define DEBUG_SUBMIT(format, ...)       printk(format, ##__VA_ARGS__)
+#define DEBUG_SUBMIT(format, ...)       do {} while (0)
+//#define DEBUG_ROOT_HUB(format, ...)     printk(format, ##__VA_ARGS__)
+#define DEBUG_ROOT_HUB(format, ...)     do {} while (0)
+//#define DEBUG_ERROR(format, ...)        printk(format, ##__VA_ARGS__)
+#define DEBUG_ERROR(format, ...)        do {} while (0)
+#define DEBUG_FATAL(format, ...)        printk(format, ##__VA_ARGS__)
+
+struct octeon_hcd {
+    spinlock_t lock;
+    cvmx_usb_state_t usb;
+    struct tasklet_struct dequeue_tasklet;
+    struct list_head dequeue_list;
+};
+
+/* convert between an HCD pointer and the corresponding struct octeon_hcd */
+static inline struct octeon_hcd *hcd_to_octeon(struct usb_hcd *hcd)
+{
+       return (struct octeon_hcd *)(hcd->hcd_priv);
+}
+
+static inline struct usb_hcd *octeon_to_hcd(struct octeon_hcd *p)
+{
+       return container_of((void *)p, struct usb_hcd, hcd_priv);
+}
+
+static inline struct octeon_hcd *cvmx_usb_to_octeon(cvmx_usb_state_t *p)
+{
+       return container_of(p, struct octeon_hcd, usb);
+}
+
+static irqreturn_t octeon_usb_irq(struct usb_hcd *hcd)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    unsigned long flags;
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    spin_lock_irqsave(&priv->lock, flags);
+    cvmx_usb_poll(&priv->usb);
+    spin_unlock_irqrestore(&priv->lock, flags);
+    return IRQ_HANDLED;
+}
+
+static void octeon_usb_port_callback(cvmx_usb_state_t *usb,
+                                     cvmx_usb_callback_t reason,
+                                     cvmx_usb_complete_t status,
+                                     int pipe_handle,
+                                     int submit_handle,
+                                     int bytes_transferred,
+                                     void *user_data)
+{
+    struct octeon_hcd *priv = cvmx_usb_to_octeon(usb);
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
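+    /*
+     * usb_hcd_poll_rh_status() calls back into our hub_status_data(),
+     * which takes priv->lock itself, so drop the lock around the call
+     * to avoid recursive locking.
+     */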
+    spin_unlock(&priv->lock);
+    usb_hcd_poll_rh_status(octeon_to_hcd(priv));
+    spin_lock(&priv->lock);
+}
+
+static int octeon_usb_start(struct usb_hcd *hcd)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    unsigned long flags;
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    hcd->state = HC_STATE_RUNNING;
+    spin_lock_irqsave(&priv->lock, flags);
+    cvmx_usb_register_callback(&priv->usb, CVMX_USB_CALLBACK_PORT_CHANGED,
+                               octeon_usb_port_callback, NULL);
+    spin_unlock_irqrestore(&priv->lock, flags);
+    return 0;
+}
+
+static void octeon_usb_stop(struct usb_hcd *hcd)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    unsigned long flags;
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    spin_lock_irqsave(&priv->lock, flags);
+    cvmx_usb_register_callback(&priv->usb, CVMX_USB_CALLBACK_PORT_CHANGED,
+                               NULL, NULL);
+    spin_unlock_irqrestore(&priv->lock, flags);
+    hcd->state = HC_STATE_HALT;
+}
+
+static int octeon_usb_get_frame_number(struct usb_hcd *hcd)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    return cvmx_usb_get_frame_number(&priv->usb);
+}
+
+static void octeon_usb_urb_complete_callback(cvmx_usb_state_t *usb,
+                                             cvmx_usb_callback_t reason,
+                                             cvmx_usb_complete_t status,
+                                             int pipe_handle,
+                                             int submit_handle,
+                                             int bytes_transferred,
+                                             void *user_data)
+{
+    struct octeon_hcd *priv = cvmx_usb_to_octeon(usb);
+    struct urb *urb = user_data;
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    urb->actual_length = bytes_transferred;
+    urb->hcpriv = NULL;
+
+       if (!list_empty(&urb->urb_list)) {
+               /*
+                * It is on the dequeue_list, but we are going to call
+                * usb_hcd_giveback_urb(), so we must clear it from
+                * the list.  We got to it before the
+                * octeon_usb_urb_dequeue_work() tasklet did.
+                */
+               list_del(&urb->urb_list);
+               /* No longer on the dequeue_list. */
+               INIT_LIST_HEAD(&urb->urb_list);
+       }
+
+    /* For Isochronous transactions we need to update the URB packet status
+        list from data in our private copy */
+    if (usb_pipetype(urb->pipe) == PIPE_ISOCHRONOUS) {
+        int i;
+        /* The pointer to the private list is stored in the setup_packet field */
+        cvmx_usb_iso_packet_t *iso_packet = (cvmx_usb_iso_packet_t *)urb->setup_packet;
+        /* Recalculate the transfer size by adding up each packet */
+        urb->actual_length = 0;
+        for (i=0; i<urb->number_of_packets; i++) {
+            if (iso_packet[i].status == CVMX_USB_COMPLETE_SUCCESS) {
+                urb->iso_frame_desc[i].status = 0;
+                urb->iso_frame_desc[i].actual_length = iso_packet[i].length;
+                urb->actual_length += urb->iso_frame_desc[i].actual_length;
+            } else {
+                DEBUG_ERROR("%s: ISOCHRONOUS packet=%d of %d status=%d pipe=%d submit=%d size=%d\n",
+                            __FUNCTION__, i, urb->number_of_packets,
+                            iso_packet[i].status, pipe_handle,
+                            submit_handle, iso_packet[i].length);
+                urb->iso_frame_desc[i].status = -EREMOTEIO;
+            }
+        }
+        /* Free the private list now that we don't need it anymore */
+        kfree(iso_packet);
+        urb->setup_packet = NULL;
+    }
+
+    switch (status) {
+        case CVMX_USB_COMPLETE_SUCCESS:
+            urb->status = 0;
+            break;
+        case CVMX_USB_COMPLETE_CANCEL:
+            if (urb->status == 0)
+                urb->status = -ENOENT;
+            break;
+        case CVMX_USB_COMPLETE_STALL:
+            DEBUG_ERROR("%s: status=stall pipe=%d submit=%d size=%d\n", __FUNCTION__, pipe_handle, submit_handle, bytes_transferred);
+            urb->status = -EPIPE;
+            break;
+        case CVMX_USB_COMPLETE_BABBLEERR:
+            DEBUG_ERROR("%s: status=babble pipe=%d submit=%d size=%d\n", __FUNCTION__, pipe_handle, submit_handle, bytes_transferred);
+            urb->status = -EPIPE;
+            break;
+        case CVMX_USB_COMPLETE_SHORT:
+            DEBUG_ERROR("%s: status=short pipe=%d submit=%d size=%d\n", __FUNCTION__, pipe_handle, submit_handle, bytes_transferred);
+            urb->status = -EREMOTEIO;
+            break;
+        case CVMX_USB_COMPLETE_ERROR:
+        case CVMX_USB_COMPLETE_XACTERR:
+        case CVMX_USB_COMPLETE_DATATGLERR:
+        case CVMX_USB_COMPLETE_FRAMEERR:
+            DEBUG_ERROR("%s: status=%d pipe=%d submit=%d size=%d\n", __FUNCTION__, status, pipe_handle, submit_handle, bytes_transferred);
+            urb->status = -EPROTO;
+            break;
+    }
+    spin_unlock(&priv->lock);
+    usb_hcd_giveback_urb(octeon_to_hcd(priv), urb, urb->status);
+    spin_lock(&priv->lock);
+}
+
+static int octeon_usb_urb_enqueue(struct usb_hcd *hcd,
+                                  struct urb *urb,
+                                  gfp_t mem_flags)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    int submit_handle = -1;
+    int pipe_handle;
+    unsigned long flags;
+    cvmx_usb_iso_packet_t *iso_packet;
+    struct usb_host_endpoint *ep = urb->ep;
+
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+
+    urb->status = 0;
+    INIT_LIST_HEAD(&urb->urb_list); /* not enqueued on dequeue_list */
+    spin_lock_irqsave(&priv->lock, flags);
+
+    if (!ep->hcpriv) {
+        cvmx_usb_transfer_t transfer_type;
+        cvmx_usb_speed_t speed;
+        int split_device = 0;
+        int split_port = 0;
+        switch (usb_pipetype(urb->pipe)) {
+            case PIPE_ISOCHRONOUS:
+                transfer_type = CVMX_USB_TRANSFER_ISOCHRONOUS;
+                break;
+            case PIPE_INTERRUPT:
+                transfer_type = CVMX_USB_TRANSFER_INTERRUPT;
+                break;
+            case PIPE_CONTROL:
+                transfer_type = CVMX_USB_TRANSFER_CONTROL;
+                break;
+            default:
+                transfer_type = CVMX_USB_TRANSFER_BULK;
+                break;
+        }
+        switch (urb->dev->speed) {
+            case USB_SPEED_LOW:
+                speed = CVMX_USB_SPEED_LOW;
+                break;
+            case USB_SPEED_FULL:
+                speed = CVMX_USB_SPEED_FULL;
+                break;
+            default:
+                speed = CVMX_USB_SPEED_HIGH;
+                break;
+        }
+        /* For low- and full-speed devices on high-speed ports we need to
+            find the hub that does the speed translation so we know where
+            to send the split transactions */
+        if (speed != CVMX_USB_SPEED_HIGH) {
+            /* Start at this device and work our way up the usb tree */
+            struct usb_device *dev = urb->dev;
+            while (dev->parent) {
+                /* If our parent is high speed then it will receive the splits */
+                if (dev->parent->speed == USB_SPEED_HIGH) {
+                    split_device = dev->parent->devnum;
+                    split_port = dev->portnum;
+                    break;
+                }
+                /* Move up the tree one level. If we make it all the way up the
+                    tree, then the port must not be in high speed mode and we
+                    don't need a split */
+                dev = dev->parent;
+            }
+        }
+        pipe_handle = cvmx_usb_open_pipe(&priv->usb,
+                                         0,
+                                         usb_pipedevice(urb->pipe),
+                                         usb_pipeendpoint(urb->pipe),
+                                         speed,
+                                         le16_to_cpu(ep->desc.wMaxPacketSize) & 0x7ff,
+                                         transfer_type,
+                                         usb_pipein(urb->pipe) ? CVMX_USB_DIRECTION_IN : CVMX_USB_DIRECTION_OUT,
+                                         urb->interval,
+                                         (le16_to_cpu(ep->desc.wMaxPacketSize)>>11) & 0x3,
+                                         split_device,
+                                         split_port);
+        if (pipe_handle < 0) {
+            spin_unlock_irqrestore(&priv->lock, flags);
+            DEBUG_ERROR("OcteonUSB: %s failed to create pipe\n", __FUNCTION__);
+            return -ENOMEM;
+        }
+        ep->hcpriv = (void*)(0x10000L + pipe_handle);
+    }
+    else
+        pipe_handle = 0xffff & (long)ep->hcpriv;
+
+    switch (usb_pipetype(urb->pipe)) {
+        case PIPE_ISOCHRONOUS:
+            DEBUG_SUBMIT("OcteonUSB: %s submit isochronous to %d.%d\n", __FUNCTION__, usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+            /* Allocate a structure to use for our private list of isochronous
+                packets */
+            iso_packet = kmalloc(urb->number_of_packets * sizeof(cvmx_usb_iso_packet_t), GFP_ATOMIC);
+            if (iso_packet) {
+                int i;
+                /* Fill the list with the data from the URB */
+                for (i=0; i<urb->number_of_packets; i++) {
+                    iso_packet[i].offset = urb->iso_frame_desc[i].offset;
+                    iso_packet[i].length = urb->iso_frame_desc[i].length;
+                    iso_packet[i].status = CVMX_USB_COMPLETE_ERROR;
+                }
+                /* Store a pointer to the list in the URB setup_packet field.
+                    We know it isn't currently being used, which saves us
+                    a bunch of logic */
+                urb->setup_packet = (char*)iso_packet;
+                submit_handle = cvmx_usb_submit_isochronous(&priv->usb, pipe_handle,
+                                                            urb->start_frame,
+                                                            0 /* flags */,
+                                                            urb->number_of_packets,
+                                                            iso_packet,
+                                                            urb->transfer_dma,
+                                                            urb->transfer_buffer_length,
+                                                            octeon_usb_urb_complete_callback,
+                                                            urb);
+                /* If submit failed we need to free our private packet list */
+                if (submit_handle < 0) {
+                    urb->setup_packet = NULL;
+                    kfree(iso_packet);
+                }
+            }
+            break;
+        case PIPE_INTERRUPT:
+            DEBUG_SUBMIT("OcteonUSB: %s submit interrupt to %d.%d\n", __FUNCTION__, usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+            submit_handle = cvmx_usb_submit_interrupt(&priv->usb, pipe_handle,
+                                                      urb->transfer_dma,
+                                                      urb->transfer_buffer_length,
+                                                      octeon_usb_urb_complete_callback,
+                                                      urb);
+            break;
+        case PIPE_CONTROL:
+            DEBUG_SUBMIT("OcteonUSB: %s submit control to %d.%d\n", __FUNCTION__, usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+            submit_handle = cvmx_usb_submit_control(&priv->usb, pipe_handle,
+                                                    urb->setup_dma,
+                                                    urb->transfer_dma,
+                                                    urb->transfer_buffer_length,
+                                                    octeon_usb_urb_complete_callback,
+                                                    urb);
+            break;
+        case PIPE_BULK:
+            DEBUG_SUBMIT("OcteonUSB: %s submit bulk to %d.%d\n", __FUNCTION__, usb_pipedevice(urb->pipe), usb_pipeendpoint(urb->pipe));
+            submit_handle = cvmx_usb_submit_bulk(&priv->usb, pipe_handle,
+                                                 urb->transfer_dma,
+                                                 urb->transfer_buffer_length,
+                                                 octeon_usb_urb_complete_callback,
+                                                 urb);
+            break;
+    }
+    if (submit_handle < 0) {
+        spin_unlock_irqrestore(&priv->lock, flags);
+        DEBUG_ERROR("OcteonUSB: %s failed to submit\n", __FUNCTION__);
+        return -ENOMEM;
+    }
+    urb->hcpriv = (void*)(long)(((submit_handle & 0xffff) << 16) | pipe_handle);
+    spin_unlock_irqrestore(&priv->lock, flags);
+    return 0;
+}
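The handle encoding above is worth spelling out: ep->hcpriv stores 0x10000 + pipe_handle, so the pointer stays non-NULL even for pipe handle 0, while urb->hcpriv packs both handles as (submit_handle << 16) | pipe_handle. A sketch of the encode/decode pair; these inline helpers are hypothetical and do not exist in the driver:

/* Hypothetical helpers mirroring the hcpriv encoding used above. */
static inline void *pipe_to_ep_hcpriv(int pipe_handle)
{
        return (void *)(0x10000L + pipe_handle); /* non-NULL even for pipe 0 */
}

static inline int ep_hcpriv_to_pipe(void *hcpriv)
{
        return 0xffff & (long)hcpriv;
}

static inline void *handles_to_urb_hcpriv(int submit_handle, int pipe_handle)
{
        return (void *)(long)(((submit_handle & 0xffff) << 16) | pipe_handle);
}

static inline int urb_hcpriv_to_submit(void *hcpriv)
{
        return ((long)hcpriv) >> 16;
}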
+
+static void octeon_usb_urb_dequeue_work(unsigned long arg)
+{
+    unsigned long flags;
+    struct octeon_hcd *priv = (struct octeon_hcd *)arg;
+
+    spin_lock_irqsave(&priv->lock, flags);
+
+    while (!list_empty(&priv->dequeue_list)) {
+        int pipe_handle;
+        int submit_handle;
+        struct urb *urb = container_of(priv->dequeue_list.next, struct urb, urb_list);
+        list_del(&urb->urb_list);
+        /* not enqueued on dequeue_list */
+        INIT_LIST_HEAD(&urb->urb_list);
+        pipe_handle = 0xffff & (long)urb->hcpriv;
+        submit_handle = ((long)urb->hcpriv) >> 16;
+        cvmx_usb_cancel(&priv->usb, pipe_handle, submit_handle);
+    }
+
+    spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int octeon_usb_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    unsigned long flags;
+
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+
+    if (!urb->dev)
+        return -EINVAL;
+
+    spin_lock_irqsave(&priv->lock, flags);
+
+    urb->status = status;
+    list_add_tail(&urb->urb_list, &priv->dequeue_list);
+
+    spin_unlock_irqrestore(&priv->lock, flags);
+
+    tasklet_schedule(&priv->dequeue_tasklet);
+
+    return 0;
+}
+
+static void octeon_usb_endpoint_disable(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
+{
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+    if (ep->hcpriv) {
+        struct octeon_hcd *priv = hcd_to_octeon(hcd);
+        int pipe_handle = 0xffff & (long)ep->hcpriv;
+        unsigned long flags;
+        spin_lock_irqsave(&priv->lock, flags);
+        cvmx_usb_cancel_all(&priv->usb, pipe_handle);
+        if (cvmx_usb_close_pipe(&priv->usb, pipe_handle))
+            DEBUG_ERROR("OcteonUSB: Closing pipe %d failed\n", pipe_handle);
+        spin_unlock_irqrestore(&priv->lock, flags);
+        ep->hcpriv = NULL;
+    }
+}
+
+static int octeon_usb_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    cvmx_usb_port_status_t port_status;
+    unsigned long flags;
+
+    DEBUG_CALL("OcteonUSB: %s called\n", __FUNCTION__);
+
+    spin_lock_irqsave(&priv->lock, flags);
+    port_status = cvmx_usb_get_status(&priv->usb);
+    spin_unlock_irqrestore(&priv->lock, flags);
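+    /* Root-hub status bitmap: bit 0 is the hub itself, bit N is port N. */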
+    buf[0] = port_status.connect_change << 1;
+
+    return buf[0] != 0;
+}
+
+static int octeon_usb_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength)
+{
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    cvmx_usb_port_status_t usb_port_status;
+    int port_status;
+    struct usb_hub_descriptor *desc;
+    unsigned long flags;
+
+    switch (typeReq) {
+        case ClearHubFeature:
+            DEBUG_ROOT_HUB("OcteonUSB: ClearHubFeature\n");
+            switch (wValue) {
+                case C_HUB_LOCAL_POWER:
+                case C_HUB_OVER_CURRENT:
+                    /* Nothing required here */
+                    break;
+                default:
+                    return -EINVAL;
+            }
+            break;
+        case ClearPortFeature:
+            DEBUG_ROOT_HUB("OcteonUSB: ClearPortFeature");
+            if (wIndex != 1) {
+                DEBUG_ROOT_HUB(" INVALID\n");
+                return -EINVAL;
+            }
+
+            switch (wValue) {
+                case USB_PORT_FEAT_ENABLE:
+                    DEBUG_ROOT_HUB(" ENABLE");
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_disable(&priv->usb);
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    break;
+                case USB_PORT_FEAT_SUSPEND:
+                    DEBUG_ROOT_HUB(" SUSPEND");
+                    /* Not supported on Octeon */
+                    break;
+                case USB_PORT_FEAT_POWER:
+                    DEBUG_ROOT_HUB(" POWER");
+                    /* Not supported on Octeon */
+                    break;
+                case USB_PORT_FEAT_INDICATOR:
+                    DEBUG_ROOT_HUB(" INDICATOR");
+                    /* Port indicator not supported */
+                    break;
+                case USB_PORT_FEAT_C_CONNECTION:
+                    DEBUG_ROOT_HUB(" C_CONNECTION");
+                    /* Clears the driver's internal connect status change flag */
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    break;
+                case USB_PORT_FEAT_C_RESET:
+                    DEBUG_ROOT_HUB(" C_RESET");
+                    /* Clears the driver's internal Port Reset Change flag */
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    break;
+                case USB_PORT_FEAT_C_ENABLE:
+                    DEBUG_ROOT_HUB(" C_ENABLE");
+                    /* Clears the driver's internal Port Enable/Disable Change flag */
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    break;
+                case USB_PORT_FEAT_C_SUSPEND:
+                    DEBUG_ROOT_HUB(" C_SUSPEND");
+                    /* Clears the driver's internal Port Suspend Change flag,
+                        which is set when resume signaling on the host port is
+                        complete */
+                    break;
+                case USB_PORT_FEAT_C_OVER_CURRENT:
+                    DEBUG_ROOT_HUB(" C_OVER_CURRENT");
+                    /* Clears the driver's internal Over Current Change flag */
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_set_status(&priv->usb, cvmx_usb_get_status(&priv->usb));
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    break;
+                default:
+                    DEBUG_ROOT_HUB(" UNKNOWN\n");
+                    return -EINVAL;
+            }
+            DEBUG_ROOT_HUB("\n");
+            break;
+        case GetHubDescriptor:
+            DEBUG_ROOT_HUB("OcteonUSB: GetHubDescriptor\n");
+            desc = (struct usb_hub_descriptor *)buf;
+            desc->bDescLength = 9;
+            desc->bDescriptorType = 0x29;
+            desc->bNbrPorts = 1;
+            desc->wHubCharacteristics = cpu_to_le16(0x08);
+            desc->bPwrOn2PwrGood = 1;
+            desc->bHubContrCurrent = 0;
+            desc->u.hs.DeviceRemovable[0] = 0;
+            desc->u.hs.DeviceRemovable[1] = 0xff;
+            break;
+        case GetHubStatus:
+            DEBUG_ROOT_HUB("OcteonUSB: GetHubStatus\n");
+            *(__le32 *)buf = 0;
+            break;
+        case GetPortStatus:
+            DEBUG_ROOT_HUB("OcteonUSB: GetPortStatus");
+            if (wIndex != 1) {
+                DEBUG_ROOT_HUB(" INVALID\n");
+                return -EINVAL;
+            }
+
+            spin_lock_irqsave(&priv->lock, flags);
+            usb_port_status = cvmx_usb_get_status(&priv->usb);
+            spin_unlock_irqrestore(&priv->lock, flags);
+            port_status = 0;
+
+            if (usb_port_status.connect_change) {
+                port_status |= (1 << USB_PORT_FEAT_C_CONNECTION);
+                DEBUG_ROOT_HUB(" C_CONNECTION");
+            }
+
+            if (usb_port_status.port_enabled) {
+                port_status |= (1 << USB_PORT_FEAT_C_ENABLE);
+                DEBUG_ROOT_HUB(" C_ENABLE");
+            }
+
+            if (usb_port_status.connected) {
+                port_status |= (1 << USB_PORT_FEAT_CONNECTION);
+                DEBUG_ROOT_HUB(" CONNECTION");
+            }
+
+            if (usb_port_status.port_enabled) {
+                port_status |= (1 << USB_PORT_FEAT_ENABLE);
+                DEBUG_ROOT_HUB(" ENABLE");
+            }
+
+            if (usb_port_status.port_over_current) {
+                port_status |= (1 << USB_PORT_FEAT_OVER_CURRENT);
+                DEBUG_ROOT_HUB(" OVER_CURRENT");
+            }
+
+            if (usb_port_status.port_powered) {
+                port_status |= (1 << USB_PORT_FEAT_POWER);
+                DEBUG_ROOT_HUB(" POWER");
+            }
+
+            if (usb_port_status.port_speed == CVMX_USB_SPEED_HIGH) {
+                port_status |= USB_PORT_STAT_HIGH_SPEED;
+                DEBUG_ROOT_HUB(" HIGHSPEED");
+            } else if (usb_port_status.port_speed == CVMX_USB_SPEED_LOW) {
+                port_status |= (1 << USB_PORT_FEAT_LOWSPEED);
+                DEBUG_ROOT_HUB(" LOWSPEED");
+            }
+
+            *((__le32 *)buf) = cpu_to_le32(port_status);
+            DEBUG_ROOT_HUB("\n");
+            break;
+        case SetHubFeature:
+            DEBUG_ROOT_HUB("OcteonUSB: SetHubFeature\n");
+            /* No HUB features supported */
+            break;
+        case SetPortFeature:
+            DEBUG_ROOT_HUB("OcteonUSB: SetPortFeature");
+            if (wIndex != 1) {
+                DEBUG_ROOT_HUB(" INVALID\n");
+                return -EINVAL;
+            }
+
+            switch (wValue) {
+                case USB_PORT_FEAT_SUSPEND:
+                    DEBUG_ROOT_HUB(" SUSPEND\n");
+                    return -EINVAL;
+                case USB_PORT_FEAT_POWER:
+                    DEBUG_ROOT_HUB(" POWER\n");
+                    return -EINVAL;
+                case USB_PORT_FEAT_RESET:
+                    DEBUG_ROOT_HUB(" RESET\n");
+                    spin_lock_irqsave(&priv->lock, flags);
+                    cvmx_usb_disable(&priv->usb);
+                    if (cvmx_usb_enable(&priv->usb))
+                        DEBUG_ERROR("Failed to enable the port\n");
+                    spin_unlock_irqrestore(&priv->lock, flags);
+                    return 0;
+                case USB_PORT_FEAT_INDICATOR:
+                    DEBUG_ROOT_HUB(" INDICATOR\n");
+                    /* Not supported */
+                    break;
+                default:
+                    DEBUG_ROOT_HUB(" UNKNOWN\n");
+                    return -EINVAL;
+            }
+            break;
+        default:
+            DEBUG_ROOT_HUB("OcteonUSB: Unknown root hub request\n");
+            return -EINVAL;
+    }
+    return 0;
+}
+
+static const struct hc_driver octeon_hc_driver = {
+    .description =      "Octeon USB",
+    .product_desc =     "Octeon Host Controller",
+    .hcd_priv_size =    sizeof(struct octeon_hcd),
+    .irq =              octeon_usb_irq,
+    .flags =            HCD_MEMORY | HCD_USB2,
+    .start =            octeon_usb_start,
+    .stop =             octeon_usb_stop,
+    .urb_enqueue =      octeon_usb_urb_enqueue,
+    .urb_dequeue =      octeon_usb_urb_dequeue,
+    .endpoint_disable = octeon_usb_endpoint_disable,
+    .get_frame_number = octeon_usb_get_frame_number,
+    .hub_status_data =  octeon_usb_hub_status_data,
+    .hub_control =      octeon_usb_hub_control,
+};
+
+static int octeon_usb_driver_probe(struct device *dev)
+{
+    int status;
+    int usb_num = to_platform_device(dev)->id;
+    int irq = platform_get_irq(to_platform_device(dev), 0);
+    struct octeon_hcd *priv;
+    struct usb_hcd *hcd;
+    unsigned long flags;
+
+    DEBUG_CALL("OcteonUSB: %s called\n", __func__);
+
+    /* Set the DMA mask to 64 bits so we get buffers already translated
+        for DMA */
+    dev->coherent_dma_mask = ~0;
+    dev->dma_mask = &dev->coherent_dma_mask;
+
+    hcd = usb_create_hcd(&octeon_hc_driver, dev, dev_name(dev));
+    if (!hcd) {
+        DEBUG_FATAL("OcteonUSB: Failed to allocate memory for HCD\n");
+        return -ENOMEM;
+    }
+    hcd->uses_new_polling = 1;
+    priv = (struct octeon_hcd *)hcd->hcd_priv;
+
+    spin_lock_init(&priv->lock);
+
+    tasklet_init(&priv->dequeue_tasklet, octeon_usb_urb_dequeue_work, (unsigned long)priv);
+    INIT_LIST_HEAD(&priv->dequeue_list);
+
+    /* CVMX_USB_INITIALIZE_FLAGS_DEBUG_INFO/_TRANSFERS/_CALLBACKS can be
+        OR'ed into the flags below for verbose debug tracing. */
+    status = cvmx_usb_initialize(&priv->usb, usb_num, CVMX_USB_INITIALIZE_FLAGS_CLOCK_AUTO);
+    if (status) {
+        DEBUG_FATAL("OcteonUSB: USB initialization failed with %d\n", status);
+        usb_put_hcd(hcd);
+        return -EIO;
+    }
+
+    /* This delay is needed for CN3010, but I don't know why... */
+    mdelay(10);
+
+    spin_lock_irqsave(&priv->lock, flags);
+    cvmx_usb_poll(&priv->usb);
+    spin_unlock_irqrestore(&priv->lock, flags);
+
+    status = usb_add_hcd(hcd, irq, IRQF_SHARED);
+    if (status) {
+        DEBUG_FATAL("OcteonUSB: USB add HCD failed with %d\n", status);
+        usb_put_hcd(hcd);
+        return status;
+    }
+
+    printk("OcteonUSB: Registered HCD for port %d on irq %d\n", usb_num, irq);
+
+    return 0;
+}
+
+static int octeon_usb_driver_remove(struct device *dev)
+{
+    int status;
+    struct usb_hcd *hcd = dev_get_drvdata(dev);
+    struct octeon_hcd *priv = hcd_to_octeon(hcd);
+    unsigned long flags;
+
+    DEBUG_CALL("OcteonUSB: %s called\n", __func__);
+
+    usb_remove_hcd(hcd);
+    tasklet_kill(&priv->dequeue_tasklet);
+    spin_lock_irqsave(&priv->lock, flags);
+    status = cvmx_usb_shutdown(&priv->usb);
+    spin_unlock_irqrestore(&priv->lock, flags);
+    if (status)
+        DEBUG_FATAL("OcteonUSB: USB shutdown failed with %d\n", status);
+
+    usb_put_hcd(hcd);
+
+    return 0;
+}
+
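+/*
+ * Registered as a bare device_driver on the platform bus rather than
+ * through the platform_driver wrappers; functionally equivalent
+ * here, just the older idiom.
+ */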
+static struct device_driver octeon_usb_driver = {
+    .name       = "OcteonUSB",
+    .bus        = &platform_bus_type,
+    .probe      = octeon_usb_driver_probe,
+    .remove     = octeon_usb_driver_remove,
+};
+
+#define MAX_USB_PORTS   10
+static struct platform_device *pdev_glob[MAX_USB_PORTS];
+static int octeon_usb_registered;
+static int __init octeon_usb_module_init(void)
+{
+    int num_devices = cvmx_usb_get_num_ports();
+    int device;
+
+    if (usb_disabled() || num_devices == 0)
+        return -ENODEV;
+
+    if (driver_register(&octeon_usb_driver)) {
+        DEBUG_FATAL("OcteonUSB: Failed to register driver\n");
+        return -ENOMEM;
+    }
+    octeon_usb_registered = 1;
+    printk("OcteonUSB: Detected %d ports\n", num_devices);
+
+    /*
+     * Only cn52XX and cn56XX have DWC_OTG USB hardware and the
+     * IOB priority registers.  Under heavy network load USB
+     * hardware can be starved by the IOB causing a crash.  Give
+     * it a priority boost if it has been waiting more than 400
+     * cycles to avoid this situation.
+     *
+     * Testing indicates that a cnt_val of 8192 is not sufficient,
+     * but no failures are seen with 4096.  We choose a value of
+     * 400 to give a safety factor of 10.
+     */
+    if (OCTEON_IS_MODEL(OCTEON_CN52XX) || OCTEON_IS_MODEL(OCTEON_CN56XX)) {
+        union cvmx_iob_n2c_l2c_pri_cnt pri_cnt;
+
+        pri_cnt.u64 = 0;
+        pri_cnt.s.cnt_enb = 1;
+        pri_cnt.s.cnt_val = 400;
+        cvmx_write_csr(CVMX_IOB_N2C_L2C_PRI_CNT, pri_cnt.u64);
+    }
+
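+    /*
+     * Each physical port becomes its own platform device bound to
+     * this driver.  Only the IRQ is passed in as a resource; the
+     * cvmx layer presumably derives the per-port CSR addresses from
+     * the platform device id.  Ports past the second fall back to
+     * OCTEON_IRQ_USB1, which only matters if a part ever exposes
+     * more than two ports.
+     */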
+    for (device = 0; device < num_devices; device++) {
+        struct resource irq_resource;
+        struct platform_device *pdev;
+        memset(&irq_resource, 0, sizeof(irq_resource));
+        irq_resource.start = (device == 0) ? OCTEON_IRQ_USB0 : OCTEON_IRQ_USB1;
+        irq_resource.end = irq_resource.start;
+        irq_resource.flags = IORESOURCE_IRQ;
+        pdev = platform_device_register_simple(octeon_usb_driver.name, device, &irq_resource, 1);
+        if (IS_ERR(pdev)) {
+            DEBUG_FATAL("OcteonUSB: Failed to allocate platform device for USB%d\n", device);
+            while (--device >= 0) {
+                if (pdev_glob[device]) {
+                    platform_device_unregister(pdev_glob[device]);
+                    pdev_glob[device] = NULL;
+                }
+            }
+            driver_unregister(&octeon_usb_driver);
+            octeon_usb_registered = 0;
+            return PTR_ERR(pdev);
+        }
+        if (device < MAX_USB_PORTS)
+            pdev_glob[device] = pdev;
+    }
+    return 0;
+}
+
+static void __exit octeon_usb_module_cleanup(void)
+{
+    int i;
+    DEBUG_CALL("OcteonUSB: %s called\n", __func__);
+    for (i = 0; i < MAX_USB_PORTS; i++)
+        if (pdev_glob[i]) {
+            platform_device_unregister(pdev_glob[i]);
+            pdev_glob[i] = NULL;
+        }
+    if (octeon_usb_registered)
+        driver_unregister(&octeon_usb_driver);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium Networks <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium Networks Octeon USB Host driver.");
+module_init(octeon_usb_module_init);
+module_exit(octeon_usb_module_cleanup);
index 27d06666c81ab4ce12aab8b15ddf79884ee72e3e..8dc24ea26bf5d34d389992cb48b3d62b4ebfd908 100644 (file)
@@ -355,11 +355,13 @@ int oz_cdev_register(void)
        g_oz_class = class_create(THIS_MODULE, "ozmo_wpan");
        if (IS_ERR(g_oz_class)) {
                oz_trace("Failed to register ozmo_wpan class\n");
+               err = PTR_ERR(g_oz_class);
                goto out1;
        }
        dev = device_create(g_oz_class, NULL, g_cdev.devnum, NULL, "ozwpan");
        if (IS_ERR(dev)) {
                oz_trace("Failed to create sysfs entry for cdev\n");
+               err = PTR_ERR(dev);
                goto out1;
        }
        return 0;
index 8ac26f584fd49cc40f9359c52146f161bb978354..5487a3a8d577f656d5d7523bbcbd7957c1dcf410 100644 (file)
@@ -1243,7 +1243,7 @@ static int oz_build_endpoints_for_interface(struct usb_hcd *hcd,
                if ((ep->attrib & USB_ENDPOINT_XFERTYPE_MASK)
                        == USB_ENDPOINT_XFER_ISOC) {
                        oz_trace("wMaxPacketSize = %d\n",
-                               hep->desc.wMaxPacketSize);
+                               usb_endpoint_maxp(&hep->desc));
                        ep->credit_ceiling = 200;
                        if (ep_addr & USB_ENDPOINT_DIR_MASK) {
                                ep->flags |= OZ_F_EP_BUFFERING;
index c54df3948e20206d4b0cc8f9ed8b68253daf4acc..cbc15c120981c62afd70723a994216e6af4ae784 100644 (file)
@@ -1756,17 +1756,18 @@ static inline int input_state_high(struct logical_input *input)
 
                        if (input->high_timer == 0) {
                                char *press_str = input->u.kbd.press_str;
-                               if (press_str[0])
-                                       keypad_send_key(press_str,
-                                                       sizeof(input->u.kbd.press_str));
+                               if (press_str[0]) {
+                                       int s = sizeof(input->u.kbd.press_str);
+                                       keypad_send_key(press_str, s);
+                               }
                        }
 
                        if (input->u.kbd.repeat_str[0]) {
                                char *repeat_str = input->u.kbd.repeat_str;
                                if (input->high_timer >= KEYPAD_REP_START) {
+                                       int s = sizeof(input->u.kbd.repeat_str);
                                        input->high_timer -= KEYPAD_REP_DELAY;
-                                       keypad_send_key(repeat_str,
-                                                       sizeof(input->u.kbd.repeat_str));
+                                       keypad_send_key(repeat_str, s);
                                }
                                /* we will need to come back here soon */
                                inputs_stable = 0;
@@ -1802,10 +1803,11 @@ static inline void input_state_falling(struct logical_input *input)
 
                        if (input->u.kbd.repeat_str[0]) {
                                char *repeat_str = input->u.kbd.repeat_str;
-                               if (input->high_timer >= KEYPAD_REP_START)
+                               if (input->high_timer >= KEYPAD_REP_START) {
+                                       int s = sizeof(input->u.kbd.repeat_str);
                                        input->high_timer -= KEYPAD_REP_DELAY;
-                                       keypad_send_key(repeat_str,
-                                                       sizeof(input->u.kbd.repeat_str));
+                                       keypad_send_key(repeat_str, s);
+                               }
                                /* we will need to come back here soon */
                                inputs_stable = 0;
                        }
@@ -1822,9 +1824,10 @@ static inline void input_state_falling(struct logical_input *input)
                                release_fct(input->u.std.release_data);
                } else if (input->type == INPUT_TYPE_KBD) {
                        char *release_str = input->u.kbd.release_str;
-                       if (release_str[0])
-                               keypad_send_key(release_str,
-                                               sizeof(input->u.kbd.release_str));
+                       if (release_str[0]) {
+                               int s = sizeof(input->u.kbd.release_str);
+                               keypad_send_key(release_str, s);
+                       }
                }
 
                input->state = INPUT_ST_LOW;
index ea91744f7ccfdb5d23204c1ba66b71bc0f363188..5f10e4075d39b6cdb8fc25c39d051f974fd54b88 100644 (file)
 #include "rtl_core.h"
 #include "r8192E_hw.h"
 #include "r8192E_cmdpkt.h"
-/*---------------------------Define Local Constant---------------------------*/
-/* Debug constant*/
-#define                CMPK_DEBOUNCE_CNT                       1
-#define                CMPK_PRINT(Address)\
-{\
-       unsigned char   i;\
-       u32     temp[10];\
-       \
-       memcpy(temp, Address, 40);\
-       for (i = 0; i < 40; i += 4)\
-               printk(KERN_INFO "\r\n %08x", temp[i]);\
-}
 
-/*---------------------------Define functions---------------------------------*/
 bool cmpk_message_handle_tx(
        struct net_device *dev,
        u8      *code_virtual_address,
@@ -100,7 +87,7 @@ bool cmpk_message_handle_tx(
        write_nic_byte(dev, TPPoll, TPPoll_CQ);
 Failed:
        return rt_status;
-}      /* CMPK_Message_Handle_Tx */
+}
 
 static void
 cmpk_count_txstatistic(
@@ -149,23 +136,19 @@ cmpk_count_txstatistic(
 
        priv->stats.txretrycount += pstx_fb->retry_cnt;
        priv->stats.txfeedbackretry += pstx_fb->retry_cnt;
-
-}      /* cmpk_CountTxStatistic */
-
-
+}
 
 static void cmpk_handle_tx_feedback(struct net_device *dev, u8 *pmsg)
 {
        struct r8192_priv *priv = rtllib_priv(dev);
-       struct cmpk_txfb rx_tx_fb;      /* */
+       struct cmpk_txfb rx_tx_fb;
 
        priv->stats.txfeedback++;
 
 
        memcpy((u8 *)&rx_tx_fb, pmsg, sizeof(struct cmpk_txfb));
        cmpk_count_txstatistic(dev, &rx_tx_fb);
-
-}      /* cmpk_Handle_Tx_Feedback */
+}
 
 static void cmdpkt_beacontimerinterrupt_819xusb(struct net_device *dev)
 {
@@ -182,7 +165,6 @@ static void cmdpkt_beacontimerinterrupt_819xusb(struct net_device *dev)
                tx_rate = 10;
                DMESG("send beacon frame  tx rate is 1Mbpm\n");
        }
-
 }
 
 static void cmpk_handle_interrupt_status(struct net_device *dev, u8 *pmsg)
@@ -192,14 +174,12 @@ static void cmpk_handle_interrupt_status(struct net_device *dev, u8 *pmsg)
 
        DMESG("---> cmpk_Handle_Interrupt_Status()\n");
 
-
        rx_intr_status.length = pmsg[1];
        if (rx_intr_status.length != (sizeof(struct cmpk_intr_sta) - 2)) {
                DMESG("cmpk_Handle_Interrupt_Status: wrong length!\n");
                return;
        }
 
-
        if (priv->rtllib->iw_mode == IW_MODE_ADHOC) {
                rx_intr_status.interrupt_status = *((u32 *)(pmsg + 4));
 
@@ -220,12 +200,11 @@ static void cmpk_handle_interrupt_status(struct net_device *dev, u8 *pmsg)
 
        DMESG("<---- cmpk_handle_interrupt_status()\n");
 
-}      /* cmpk_handle_interrupt_status */
-
+}
 
 static void cmpk_handle_query_config_rx(struct net_device *dev, u8 *pmsg)
 {
-       cmpk_query_cfg_t        rx_query_cfg;   /* */
+       cmpk_query_cfg_t        rx_query_cfg;
 
 
        rx_query_cfg.cfg_action = (pmsg[4] & 0x80000000)>>31;
@@ -238,8 +217,7 @@ static      void cmpk_handle_query_config_rx(struct net_device *dev, u8 *pmsg)
        rx_query_cfg.mask = (pmsg[12] << 24) | (pmsg[13] << 16) |
                            (pmsg[14] << 8) | (pmsg[15] << 0);
 
-}      /* cmpk_Handle_Query_Config_Rx */
-
+}
 
 static void cmpk_count_tx_status(struct net_device *dev,
                                 struct cmpk_tx_status *pstx_status)
@@ -280,13 +258,11 @@ static void cmpk_count_tx_status(struct net_device *dev,
        priv->stats.txbytesunicast              += pstx_status->txuclength;
 
        priv->stats.last_packet_rate            = pstx_status->rate;
-}      /* cmpk_CountTxStatus */
-
-
+}
 
 static void cmpk_handle_tx_status(struct net_device *dev, u8 *pmsg)
 {
-       struct cmpk_tx_status rx_tx_sts;        /* */
+       struct cmpk_tx_status rx_tx_sts;
 
        memcpy((void *)&rx_tx_sts, (void *)pmsg, sizeof(struct cmpk_tx_status));
        cmpk_count_tx_status(dev, &rx_tx_sts);
@@ -300,7 +276,6 @@ static      void cmpk_handle_tx_rate_history(struct net_device *dev, u8 *pmsg)
        u32 *ptemp;
        struct r8192_priv *priv = rtllib_priv(dev);
 
-
 #ifdef ENABLE_PS
        pAdapter->HalFunc.GetHwRegHandler(pAdapter, HW_VAR_RF_STATE,
                                         (pu1Byte)(&rtState));
@@ -335,10 +310,8 @@ static     void cmpk_handle_tx_rate_history(struct net_device *dev, u8 *pmsg)
                        priv->stats.txrate.ht_mcs[j][i] +=
                                                         ptxrate->ht_mcs[j][i];
        }
-
 }
 
-
 u32 cmpk_message_handle_rx(struct net_device *dev,
                           struct rtllib_rx_stats *pstats)
 {
@@ -349,12 +322,8 @@ u32 cmpk_message_handle_rx(struct net_device *dev,
 
        RT_TRACE(COMP_CMDPKT, "---->cmpk_message_handle_rx()\n");
 
-       if (pstats == NULL) {
-               /* Print error message. */
-               /*RT_TRACE(COMP_SEND, DebugLevel,
-                               ("\n\r[CMPK]-->Err queue id or pointer"));*/
+       if (pstats == NULL)
                return 0;
-       }
 
        total_length = pstats->Length;
 
index a9d78e9651c6006a981c7e26c920f71cd57692d2..50c7bb773984e0f0ee0c5e70180df412acc01e8d 100644 (file)
@@ -2128,10 +2128,11 @@ void rtl8192_update_ratr_table(struct net_device *dev)
        struct rtllib_device *ieee = priv->rtllib;
        u8 *pMcsRate = ieee->dot11HTOperationalRateSet;
        u32 ratr_value = 0;
+       u16 rate_config = 0;
        u8 rate_index = 0;
 
-       rtl8192_config_rate(dev, (u16 *)(&ratr_value));
-       ratr_value |= (*(u16 *)(pMcsRate)) << 12;
+       rtl8192_config_rate(dev, &rate_config);
+       ratr_value = rate_config | *pMcsRate << 12;
        switch (ieee->mode) {
        case IEEE_A:
                ratr_value &= 0x00000FF0;
index 8b8a5c661a268feed60866afa4ed93cf146df491..e75364e3eb434b7f8fae42cab33c7ce172991b5e 100644 (file)
@@ -1822,7 +1822,7 @@ int rtllib_parse_info_param(struct rtllib_device *ieee,
                                network->rates_ex[i] = info_element->data[i];
                                p += snprintf(p, sizeof(rates_str) -
                                              (p - rates_str), "%02X ",
-                                             network->rates[i]);
+                                             network->rates_ex[i]);
                                if (rtllib_is_ofdm_rate
                                    (info_element->data[i])) {
                                        network->flags |= NETWORK_HAS_OFDM;
index 4feecec8609c696e29e745a0b2b7833def2f917b..aefffac556a66c7c273b3cc32fc8b2d33a4a0a7e 100644 (file)
@@ -1801,8 +1801,9 @@ static inline u16 auth_parse(struct sk_buff *skb, u8** challenge, int *chlen)
 
                if (*(t++) == MFIE_TYPE_CHALLENGE) {
                        *chlen = *(t++);
-                       *challenge = kmalloc(*chlen, GFP_ATOMIC);
-                       memcpy(*challenge, t, *chlen);  /*TODO - check here*/
+                       *challenge = kmemdup(t, *chlen, GFP_ATOMIC);
+                       if (!*challenge)
+                               return -ENOMEM;
                }
        }
        return cpu_to_le16(a->status);
index f10fd5a93c386d3c74723563795e5485aadf308a..34edcfab96be0c5ae46171e25d935c340a14e52e 100644 (file)
@@ -67,9 +67,9 @@ Dot11d_Reset(struct ieee80211_device *ieee)
 void
 Dot11d_UpdateCountryIe(
        struct ieee80211_device *dev,
-       u8 *            pTaddr,
+       u8 *pTaddr,
        u16     CoutryIeLen,
-       u8 * pCoutryIe
+       u8 *pCoutryIe
        )
 {
        PRT_DOT11D_INFO pDot11dInfo = GET_DOT11D_INFO(dev);
@@ -101,7 +101,7 @@ Dot11d_UpdateCountryIe(
                        MaxChnlNum = pTriple->FirstChnl + j;
                }
 
-               pTriple = (PCHNL_TXPOWER_TRIPLE)((u8*)pTriple + 3);
+               pTriple = (PCHNL_TXPOWER_TRIPLE)((u8 *)pTriple + 3);
        }
        //printk("Dot11d_UpdateCountryIe(): Channel List:\n");
        printk("Channel List:");
@@ -143,12 +143,12 @@ DOT11D_GetMaxTxPwrInDbm(
 
 void
 DOT11D_ScanComplete(
-       struct ieee80211_device * dev
+       struct ieee80211_device *dev
        )
 {
        PRT_DOT11D_INFO pDot11dInfo = GET_DOT11D_INFO(dev);
 
-       switch(pDot11dInfo->State)
+       switch (pDot11dInfo->State)
        {
        case DOT11D_STATE_LEARNED:
                pDot11dInfo->State = DOT11D_STATE_DONE;
@@ -166,7 +166,7 @@ DOT11D_ScanComplete(
 }
 
 int IsLegalChannel(
-       struct ieee80211_device * dev,
+       struct ieee80211_device *dev,
        u8 channel
 )
 {
@@ -183,7 +183,7 @@ int IsLegalChannel(
 }
 
 int ToLegalChannel(
-       struct ieee80211_device * dev,
+       struct ieee80211_device *dev,
        u8 channel
 )
 {
index 54f2b4c434ffa6800972705f0cb1ae11c28fdb11..6aa8c15eba39a09d98e4c963af2dcbfd05dc4b73 100644 (file)
@@ -71,9 +71,9 @@ Dot11d_Reset(
 void
 Dot11d_UpdateCountryIe(
        struct ieee80211_device *dev,
-       u8 *            pTaddr,
+       u8 *pTaddr,
        u16     CoutryIeLen,
-       u8 * pCoutryIe
+       u8 *pCoutryIe
        );
 
 u8
@@ -84,16 +84,16 @@ DOT11D_GetMaxTxPwrInDbm(
 
 void
 DOT11D_ScanComplete(
-       struct ieee80211_device * dev
+       struct ieee80211_device *dev
        );
 
 int IsLegalChannel(
-       struct ieee80211_device * dev,
+       struct ieee80211_device *dev,
        u8 channel
 );
 
 int ToLegalChannel(
-       struct ieee80211_device * dev,
+       struct ieee80211_device *dev,
        u8 channel
 );
 #endif // #ifndef __INC_DOT11D_H
index 210898c8e66c149f5cbb661026dff5dae103821a..c9f3bb363be42f16eff5ba752ab3a406c4edc810 100644 (file)
@@ -493,8 +493,8 @@ typedef struct ieee_param {
 #define IsDataFrame(pdu)                       ( ((pdu[0] & 0x0C)==0x08) ? true : false )
 #define        IsLegacyDataFrame(pdu)  (IsDataFrame(pdu) && (!(pdu[0]&FC_QOS_BIT)) )
 //added by wb. Is this right?
-#define IsQoSDataFrame(pframe)  ((*(u16*)pframe&(IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA)) == (IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA))
-#define Frame_Order(pframe)     (*(u16*)pframe&IEEE80211_FCTL_ORDER)
+#define IsQoSDataFrame(pframe)  ((*(u16 *)pframe&(IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA)) == (IEEE80211_STYPE_QOS_DATA|IEEE80211_FTYPE_DATA))
+#define Frame_Order(pframe)     (*(u16 *)pframe&IEEE80211_FCTL_ORDER)
 #define SN_LESS(a, b)          (((a-b)&0x800)!=0)
 #define SN_EQUAL(a, b) (a == b)
 #define MAX_DEV_ADDR_SIZE 8
@@ -538,7 +538,7 @@ do { if (ieee80211_debug_level & (level)) \
        do{ if ((ieee80211_debug_level & (level)) == (level))   \
                {       \
                        int i;                                  \
-                       u8* pdata = (u8*) data;                 \
+                       u8 *pdata = (u8 *) data;                        \
                        printk(KERN_DEBUG "ieee80211: %s()\n", __FUNCTION__);   \
                        for(i=0; i<(int)(datalen); i++)                 \
                        {                                               \
@@ -914,7 +914,7 @@ struct ieee80211_rx_stats {
        bool      bIsCCK;
        bool      bPacketToSelf;
        //added by amy
-       u8*       virtual_address;
+       u8        *virtual_address;
        u16          packetlength;              // Total packet length: Must equal to sum of all FragLength
        u16          fraglength;                        // FragLength should equal to PacketLength in non-fragment case
        u16          fragoffset;                        // Data offset for this fragment
@@ -1366,13 +1366,13 @@ static inline const char *eap_get_type(int type)
        return ((u32)type >= ARRAY_SIZE(eap_types)) ? "Unknown" : eap_types[type];
 }
 //added by amy for reorder
-static inline u8 Frame_QoSTID(u8* buf)
+static inline u8 Frame_QoSTID(u8 *buf)
 {
        struct ieee80211_hdr_3addr *hdr;
        u16 fc;
        hdr = (struct ieee80211_hdr_3addr *)buf;
        fc = le16_to_cpu(hdr->frame_ctl);
-       return (u8)((frameqos*)(buf + (((fc & IEEE80211_FCTL_TODS)&&(fc & IEEE80211_FCTL_FROMDS))? 30 : 24)))->field.tid;
+       return (u8)((frameqos *)(buf + (((fc & IEEE80211_FCTL_TODS)&&(fc & IEEE80211_FCTL_FROMDS))? 30 : 24)))->field.tid;
 }
 
 //added by amy for reorder
@@ -1670,7 +1670,7 @@ typedef struct _bandwidth_autoswitch {
 typedef struct _RX_REORDER_ENTRY {
        struct list_head        List;
        u16                     SeqNum;
-       struct ieee80211_rxb* prxb;
+       struct ieee80211_rxb *prxb;
 } RX_REORDER_ENTRY, *PRX_REORDER_ENTRY;
 //added by amy for order
 typedef enum _Fsync_State{
@@ -1965,7 +1965,7 @@ struct ieee80211_device {
 
        /* map of allowed channels. 0 is dummy */
        // FIXME: remember to default to a basic channel plan depending of the PHY type
-       void* pDot11dInfo;
+       void *pDot11dInfo;
        bool bGlobalDomain;
        int rate;       /* current rate */
        int basic_rate;
@@ -2107,10 +2107,10 @@ struct ieee80211_device {
                               struct net_device *dev);
 
        int (*reset_port)(struct net_device *dev);
-       int (*is_queue_full) (struct net_device * dev, int pri);
+       int (*is_queue_full) (struct net_device *dev, int pri);
 
-       int (*handle_management) (struct net_device * dev,
-                                 struct ieee80211_network * network, u16 type);
+       int (*handle_management) (struct net_device *dev,
+                                 struct ieee80211_network *network, u16 type);
        int (*is_qos_active) (struct net_device *dev, struct sk_buff *skb);
 
        /* Softmac-generated frames (management) are TXed via this
@@ -2187,8 +2187,8 @@ struct ieee80211_device {
        void (*ps_request_tx_ack) (struct net_device *dev);
        void (*enter_sleep_state) (struct net_device *dev, u32 th, u32 tl);
        short (*ps_is_queue_empty) (struct net_device *dev);
-       int (*handle_beacon) (struct net_device * dev, struct ieee80211_beacon * beacon, struct ieee80211_network * network);
-       int (*handle_assoc_response) (struct net_device * dev, struct ieee80211_assoc_response_frame * resp, struct ieee80211_network * network);
+       int (*handle_beacon) (struct net_device *dev, struct ieee80211_beacon *beacon, struct ieee80211_network *network);
+       int (*handle_assoc_response) (struct net_device *dev, struct ieee80211_assoc_response_frame *resp, struct ieee80211_network *network);
 
 
        /* check whether Tx hw resource available */
@@ -2197,9 +2197,9 @@ struct ieee80211_device {
 //     void (*SwChnlByTimerHandler)(struct net_device *dev, int channel);
        void (*SetBWModeHandler)(struct net_device *dev, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET Offset);
 //     void (*UpdateHalRATRTableHandler)(struct net_device* dev, u8* pMcsRate);
-       bool (*GetNmodeSupportBySecCfg)(struct net_device* dev);
-       void (*SetWirelessMode)(struct net_device* dev, u8 wireless_mode);
-       bool (*GetHalfNmodeSupportByAPsHandler)(struct net_device* dev);
+       bool (*GetNmodeSupportBySecCfg)(struct net_device *dev);
+       void (*SetWirelessMode)(struct net_device *dev, u8 wireless_mode);
+       bool (*GetHalfNmodeSupportByAPsHandler)(struct net_device *dev);
        void (*InitialGainHandler)(struct net_device *dev, u8 Operation);
 
        /* This must be the last item so that it points to the data
@@ -2401,10 +2401,10 @@ extern int ieee80211_wx_get_encode(struct ieee80211_device *ieee,
 #if WIRELESS_EXT >= 18
 extern int ieee80211_wx_get_encode_ext(struct ieee80211_device *ieee,
                            struct iw_request_info *info,
-                           union iwreq_data* wrqu, char *extra);
+                           union iwreq_data *wrqu, char *extra);
 extern int ieee80211_wx_set_encode_ext(struct ieee80211_device *ieee,
                            struct iw_request_info *info,
-                           union iwreq_data* wrqu, char *extra);
+                           union iwreq_data *wrqu, char *extra);
 extern int ieee80211_wx_set_auth(struct ieee80211_device *ieee,
                               struct iw_request_info *info,
                               struct iw_param *data, char *extra);
@@ -2422,7 +2422,7 @@ extern int ieee80211_rx_frame_softmac(struct ieee80211_device *ieee, struct sk_b
                        u16 stype);
 extern void ieee80211_softmac_new_net(struct ieee80211_device *ieee, struct ieee80211_network *net);
 
-void SendDisassociation(struct ieee80211_device *ieee, u8* asSta, u8 asRsn);
+void SendDisassociation(struct ieee80211_device *ieee, u8 *asSta, u8 asRsn);
 extern void ieee80211_softmac_xmit(struct ieee80211_txb *txb, struct ieee80211_device *ieee);
 
 extern void ieee80211_stop_send_beacons(struct ieee80211_device *ieee);
@@ -2528,52 +2528,52 @@ extern int ieee80211_wx_get_rts(struct ieee80211_device *ieee,
                             union iwreq_data *wrqu, char *extra);
 //HT
 #define MAX_RECEIVE_BUFFER_SIZE 9100  //
-extern void HTDebugHTCapability(u8* CapIE, u8* TitleString );
-extern void HTDebugHTInfo(u8*  InfoIE, u8* TitleString);
-
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET    Offset);
-extern void HTUpdateDefaultSetting(struct ieee80211_device* ieee);
-extern void HTConstructCapabilityElement(struct ieee80211_device* ieee, u8* posHTCap, u8* len, u8 isEncrypt);
-extern void HTConstructInfoElement(struct ieee80211_device* ieee, u8* posHTInfo, u8* len, u8 isEncrypt);
-extern void HTConstructRT2RTAggElement(struct ieee80211_device* ieee, u8* posRT2RTAgg, u8* len);
+extern void HTDebugHTCapability(u8 *CapIE, u8 *TitleString );
+extern void HTDebugHTInfo(u8 *InfoIE, u8 *TitleString);
+
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET    Offset);
+extern void HTUpdateDefaultSetting(struct ieee80211_device *ieee);
+extern void HTConstructCapabilityElement(struct ieee80211_device *ieee, u8 *posHTCap, u8 *len, u8 isEncrypt);
+extern void HTConstructInfoElement(struct ieee80211_device *ieee, u8 *posHTInfo, u8 *len, u8 isEncrypt);
+extern void HTConstructRT2RTAggElement(struct ieee80211_device *ieee, u8 *posRT2RTAgg, u8 *len);
 extern void HTOnAssocRsp(struct ieee80211_device *ieee);
-extern void HTInitializeHTInfo(struct ieee80211_device* ieee);
+extern void HTInitializeHTInfo(struct ieee80211_device *ieee);
 extern void HTInitializeBssDesc(PBSS_HT pBssHT);
-extern void HTResetSelfAndSavePeerSetting(struct ieee80211_device* ieee, struct ieee80211_network * pNetwork);
-extern void HTUpdateSelfAndPeerSetting(struct ieee80211_device* ieee,   struct ieee80211_network * pNetwork);
-extern u8 HTGetHighestMCSRate(struct ieee80211_device* ieee, u8* pMCSRateSet, u8* pMCSFilter);
+extern void HTResetSelfAndSavePeerSetting(struct ieee80211_device *ieee, struct ieee80211_network *pNetwork);
+extern void HTUpdateSelfAndPeerSetting(struct ieee80211_device *ieee,   struct ieee80211_network *pNetwork);
+extern u8 HTGetHighestMCSRate(struct ieee80211_device *ieee, u8 *pMCSRateSet, u8 *pMCSFilter);
 extern u8 MCS_FILTER_ALL[];
 extern u16 MCS_DATA_RATE[2][2][77] ;
-extern u8 HTCCheck(struct ieee80211_device* ieee, u8*   pFrame);
+extern u8 HTCCheck(struct ieee80211_device *ieee, u8 *pFrame);
 //extern void HTSetConnectBwModeCallback(unsigned long data);
 extern void HTResetIOTSetting(PRT_HIGH_THROUGHPUT  pHTInfo);
-extern bool IsHTHalfNmodeAPs(struct ieee80211_device* ieee);
-extern u16 HTHalfMcsToDataRate(struct ieee80211_device* ieee,  u8      nMcsRate);
-extern u16 HTMcsToDataRate( struct ieee80211_device* ieee, u8 nMcsRate);
-extern u16  TxCountToDataRate( struct ieee80211_device* ieee, u8 nDataRate);
+extern bool IsHTHalfNmodeAPs(struct ieee80211_device *ieee);
+extern u16 HTHalfMcsToDataRate(struct ieee80211_device *ieee,  u8      nMcsRate);
+extern u16 HTMcsToDataRate( struct ieee80211_device *ieee, u8 nMcsRate);
+extern u16  TxCountToDataRate( struct ieee80211_device *ieee, u8 nDataRate);
 //function in BAPROC.c
-extern int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb);
-extern int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb);
-extern int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb);
-extern void TsInitAddBA( struct ieee80211_device* ieee, PTX_TS_RECORD   pTS, u8 Policy, u8 bOverwritePending);
-extern void TsInitDelBA( struct ieee80211_device* ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect);
+extern int ieee80211_rx_ADDBAReq( struct ieee80211_device *ieee, struct sk_buff *skb);
+extern int ieee80211_rx_ADDBARsp( struct ieee80211_device *ieee, struct sk_buff *skb);
+extern int ieee80211_rx_DELBA(struct ieee80211_device *ieee,struct sk_buff *skb);
+extern void TsInitAddBA( struct ieee80211_device *ieee, PTX_TS_RECORD   pTS, u8 Policy, u8 bOverwritePending);
+extern void TsInitDelBA( struct ieee80211_device *ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect);
 extern void BaSetupTimeOut(unsigned long data);
 extern void TxBaInactTimeout(unsigned long data);
 extern void RxBaInactTimeout(unsigned long data);
 extern void ResetBaEntry( PBA_RECORD pBA);
 //function in TS.c
 extern bool GetTs(
-       struct ieee80211_device*        ieee,
+       struct ieee80211_device         *ieee,
        PTS_COMMON_INFO                 *ppTS,
-       u8*                             Addr,
+       u8                              *Addr,
        u8                              TID,
        TR_SELECT                       TxRxSelect,  //Rx:1, Tx:0
        bool                            bAddNewTs
        );
 extern void TSInitialize(struct ieee80211_device *ieee);
-extern  void TsStartAddBaProcess(struct ieee80211_device* ieee, PTX_TS_RECORD   pTxTS);
-extern void RemovePeerTS(struct ieee80211_device* ieee, u8* Addr);
-extern void RemoveAllTS(struct ieee80211_device* ieee);
+extern  void TsStartAddBaProcess(struct ieee80211_device *ieee, PTX_TS_RECORD   pTxTS);
+extern void RemovePeerTS(struct ieee80211_device *ieee, u8 *Addr);
+extern void RemoveAllTS(struct ieee80211_device *ieee);
 void ieee80211_softmac_scan_syncro(struct ieee80211_device *ieee);
 
 extern const long ieee80211_wlan_frequencies[];
@@ -2623,6 +2623,6 @@ extern int ieee80211_parse_info_param(struct ieee80211_device *ieee,
                struct ieee80211_network *network,
                struct ieee80211_rx_stats *stats);
 
-void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb** prxbIndicateArray,u8  index);
+void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb **prxbIndicateArray,u8  index);
 #define RT_ASOC_RETRY_LIMIT    5
 #endif /* IEEE80211_H */
index a464d111d7385c80b290468cbb622edae1df46be..55332217c29fb211fceca01a7c08a4a8be453bca 100644 (file)
@@ -155,7 +155,7 @@ int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
 }
 
 
-struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name)
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
 {
        unsigned long flags;
        struct list_head *ptr;
@@ -182,7 +182,7 @@ struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name)
 }
 
 
-static void * ieee80211_crypt_null_init(int keyidx) { return (void *) 1; }
+static void *ieee80211_crypt_null_init(int keyidx) { return (void *) 1; }
 static void ieee80211_crypt_null_deinit(void *priv) {}
 
 static struct ieee80211_crypto_ops ieee80211_crypt_null = {
index b58a3bcc0dc0972c882f5992ad02264510769bf5..0b4ea431982d98f7511c45fbac9eae19abe7a0c9 100644 (file)
@@ -77,7 +77,7 @@ struct ieee80211_crypt_data {
 
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
-struct ieee80211_crypto_ops * ieee80211_get_crypto_ops(const char *name);
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
 void ieee80211_crypt_deinit_entries(struct ieee80211_device *, int);
 void ieee80211_crypt_deinit_handler(unsigned long);
 void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
index fec0176888e271fc0e6db5d106d397db8ecc9ed6..f2b16775a638ce27468e3915f2569842f6f5b4c5 100644 (file)
@@ -60,10 +60,10 @@ struct ieee80211_ccmp_data {
 void ieee80211_ccmp_aes_encrypt(struct crypto_tfm *tfm,
                             const u8 pt[16], u8 ct[16])
 {
-       crypto_cipher_encrypt_one((void*)tfm, ct, pt);
+       crypto_cipher_encrypt_one((void *)tfm, ct, pt);
 }
 
-static void * ieee80211_ccmp_init(int key_idx)
+static void *ieee80211_ccmp_init(int key_idx)
 {
        struct ieee80211_ccmp_data *priv;
 
@@ -72,7 +72,7 @@ static void * ieee80211_ccmp_init(int key_idx)
                goto fail;
        priv->key_idx = key_idx;
 
-       priv->tfm = (void*)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+       priv->tfm = (void *)crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
        if (IS_ERR(priv->tfm)) {
                printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
                       "crypto API aes\n");
@@ -85,7 +85,7 @@ static void * ieee80211_ccmp_init(int key_idx)
 fail:
        if (priv) {
                if (priv->tfm)
-                       crypto_free_cipher((void*)priv->tfm);
+                       crypto_free_cipher((void *)priv->tfm);
                kfree(priv);
        }
 
@@ -98,7 +98,7 @@ static void ieee80211_ccmp_deinit(void *priv)
        struct ieee80211_ccmp_data *_priv = priv;
 
        if (_priv && _priv->tfm)
-               crypto_free_cipher((void*)_priv->tfm);
+               crypto_free_cipher((void *)_priv->tfm);
        kfree(priv);
 }
 
@@ -393,7 +393,7 @@ static int ieee80211_ccmp_set_key(void *key, int len, u8 *seq, void *priv)
                        data->rx_pn[4] = seq[1];
                        data->rx_pn[5] = seq[0];
                }
-               crypto_cipher_setkey((void*)data->tfm, data->key, CCMP_TK_LEN);
+               crypto_cipher_setkey((void *)data->tfm, data->key, CCMP_TK_LEN);
        } else if (len == 0)
                data->key_set = 0;
        else
@@ -427,7 +427,7 @@ static int ieee80211_ccmp_get_key(void *key, int len, u8 *seq, void *priv)
 }
 
 
-static char * ieee80211_ccmp_print_stats(char *p, void *priv)
+static char *ieee80211_ccmp_print_stats(char *p, void *priv)
 {
        struct ieee80211_ccmp_data *ccmp = priv;
        p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
index 555eb8038e95f4335a80549d0ed1cf98b351ca59..93121b42f16b7b4e43319721562def16609fb37a 100644 (file)
@@ -62,7 +62,7 @@ struct ieee80211_tkip_data {
        u8 rx_hdr[16], tx_hdr[16];
 };
 
-static void * ieee80211_tkip_init(int key_idx)
+static void *ieee80211_tkip_init(int key_idx)
 {
        struct ieee80211_tkip_data *priv;
 
@@ -499,8 +499,8 @@ static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
        return keyidx;
 }
 
-static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
-                      u8 * data, size_t data_len, u8 * mic)
+static int michael_mic(struct crypto_hash *tfm_michael, u8 *key, u8 *hdr,
+                      u8 *data, size_t data_len, u8 *mic)
 {
        struct hash_desc desc;
        struct scatterlist sg[2];
@@ -718,7 +718,7 @@ static int ieee80211_tkip_get_key(void *key, int len, u8 *seq, void *priv)
 }
 
 
-static char * ieee80211_tkip_print_stats(char *p, void *priv)
+static char *ieee80211_tkip_print_stats(char *p, void *priv)
 {
        struct ieee80211_tkip_data *tkip = priv;
        p += sprintf(p, "key[%d] alg=TKIP key_set=%d "
index 3801f125f8f2c188f071583202d294b2cb80743d..f202236958975b5d53a1482f9fd5b5fd85303257 100644 (file)
@@ -38,7 +38,7 @@ struct prism2_wep_data {
 };
 
 
-static void * prism2_wep_init(int keyidx)
+static void *prism2_wep_init(int keyidx)
 {
        struct prism2_wep_data *priv;
 
@@ -253,7 +253,7 @@ static int prism2_wep_get_key(void *key, int len, u8 *seq, void *priv)
 }
 
 
-static char * prism2_wep_print_stats(char *p, void *priv)
+static char *prism2_wep_print_stats(char *p, void *priv)
 {
        struct prism2_wep_data *wep = priv;
        p += sprintf(p, "key[%d] alg=WEP len=%d\n",
index ee7ce5fca4628e35d6921625dfaecc95de676d4a..52936a6b9db37727a4cb5585ce91ecb33797c8c0 100644 (file)
@@ -218,7 +218,7 @@ ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb,
         * this is not mandatory.... but seems that the probe
         * response parser uses it
         */
-       struct ieee80211_hdr_3addr * hdr = (struct ieee80211_hdr_3addr *)skb->data;
+       struct ieee80211_hdr_3addr *hdr = (struct ieee80211_hdr_3addr *)skb->data;
 
        rx_stats->len = skb->len;
        ieee80211_rx_mgt(ieee,(struct ieee80211_hdr_4addr *)skb->data,rx_stats);
@@ -336,7 +336,7 @@ static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee,
 
 /* Called only as a tasklet (software IRQ), by ieee80211_rx */
 static inline int
-ieee80211_rx_frame_decrypt(struct ieee80211_device* ieee, struct sk_buff *skb,
+ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
                           struct ieee80211_crypt_data *crypt)
 {
        struct ieee80211_hdr_4addr *hdr;
@@ -385,7 +385,7 @@ ieee80211_rx_frame_decrypt(struct ieee80211_device* ieee, struct sk_buff *skb,
 
 /* Called only as a tasklet (software IRQ), by ieee80211_rx */
 static inline int
-ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device* ieee, struct sk_buff *skb,
+ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee, struct sk_buff *skb,
                             int keyidx, struct ieee80211_crypt_data *crypt)
 {
        struct ieee80211_hdr_4addr *hdr;
@@ -439,7 +439,7 @@ static int is_duplicate_packet(struct ieee80211_device *ieee,
          tid = UP2AC(tid);
          tid ++;
        } else if(IEEE80211_QOS_HAS_SEQ(fc)) { //QoS
-         hdr_3addrqos = (struct ieee80211_hdr_3addrqos*)header;
+         hdr_3addrqos = (struct ieee80211_hdr_3addrqos *)header;
          tid = le16_to_cpu(hdr_3addrqos->qos_ctl) & IEEE80211_QCTL_TID;
          tid = UP2AC(tid);
          tid ++;
@@ -548,7 +548,7 @@ AddReorderEntry(
        return true;
 }
 
-void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb** prxbIndicateArray,u8  index)
+void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_rxb **prxbIndicateArray,u8  index)
 {
        u8 i = 0 , j=0;
        u16 ethertype;
@@ -557,7 +557,7 @@ void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_
        for(j = 0; j<index; j++)
        {
 //added by amy for reorder
-               struct ieee80211_rxb* prxb = prxbIndicateArray[j];
+               struct ieee80211_rxb *prxb = prxbIndicateArray[j];
                for(i = 0; i<prxb->nr_subframes; i++) {
                        struct sk_buff *sub_skb = prxb->subframes[i];
 
@@ -603,13 +603,13 @@ void ieee80211_indicate_packets(struct ieee80211_device *ieee, struct ieee80211_
 
 
 void RxReorderIndicatePacket( struct ieee80211_device *ieee,
-               struct ieee80211_rxb* prxb,
+               struct ieee80211_rxb *prxb,
                PRX_TS_RECORD           pTS,
                u16                     SeqNum)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
        PRX_REORDER_ENTRY       pReorderEntry = NULL;
-       struct ieee80211_rxb* prxbIndicateArray[REORDER_WIN_SIZE];
+       struct ieee80211_rxb *prxbIndicateArray[REORDER_WIN_SIZE];
        u8                      WinSize = pHTInfo->RxReorderWinSize;
        u16                     WinEnd = (pTS->RxIndicateSeq + WinSize -1)%4096;
        u8                      index = 0;
@@ -774,9 +774,9 @@ void RxReorderIndicatePacket( struct ieee80211_device *ieee,
 
 u8 parse_subframe(struct sk_buff *skb,
                  struct ieee80211_rx_stats *rx_stats,
-                 struct ieee80211_rxb *rxb,u8* src,u8* dst)
+                 struct ieee80211_rxb *rxb,u8 *src,u8 *dst)
 {
-       struct ieee80211_hdr_3addr  *hdr = (struct ieee80211_hdr_3addr*)skb->data;
+       struct ieee80211_hdr_3addr  *hdr = (struct ieee80211_hdr_3addr *)skb->data;
        u16             fc = le16_to_cpu(hdr->frame_ctl);
 
        u16             LLCOffset= sizeof(struct ieee80211_hdr_3addr);
@@ -831,7 +831,7 @@ u8 parse_subframe(struct sk_buff *skb,
                memcpy(rxb->dst,dst,ETH_ALEN);
                while(skb->len > ETHERNET_HEADER_SIZE) {
                        /* Offset 12 denote 2 mac address */
-                       nSubframe_Length = *((u16*)(skb->data + 12));
+                       nSubframe_Length = *((u16 *)(skb->data + 12));
                        //==m==>change the length order
                        nSubframe_Length = (nSubframe_Length>>8) + (nSubframe_Length<<8);
 
@@ -926,7 +926,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
        int keyidx = 0;
 
        int i;
-       struct ieee80211_rxb* rxb = NULL;
+       struct ieee80211_rxb *rxb = NULL;
        // cheat the the hdr type
        hdr = (struct ieee80211_hdr_4addr *)skb->data;
        stats = &ieee->stats;
@@ -1035,9 +1035,9 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
                        //IEEE80211_DEBUG(IEEE80211_DL_REORDER,"%s(): QOS ENABLE AND RECEIVE QOS DATA , we will get Ts, tid:%d\n",__FUNCTION__, tid);
                if(GetTs(
                                ieee,
-                               (PTS_COMMON_INFO*) &pRxTS,
+                               (PTS_COMMON_INFO *) &pRxTS,
                                hdr->addr2,
-                               (u8)Frame_QoSTID((u8*)(skb->data)),
+                               (u8)Frame_QoSTID((u8 *)(skb->data)),
                                RX_DIR,
                                true))
                {
@@ -1289,7 +1289,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
        {
                TID = Frame_QoSTID(skb->data);
                SeqNum = WLAN_GET_SEQ_SEQ(sc);
-               GetTs(ieee,(PTS_COMMON_INFO*) &pTS,hdr->addr2,TID,RX_DIR,true);
+               GetTs(ieee,(PTS_COMMON_INFO *) &pTS,hdr->addr2,TID,RX_DIR,true);
                if(TID !=0 && TID !=3)
                {
                        ieee->bis_any_nonbepkts = true;
@@ -1597,7 +1597,7 @@ static inline void ieee80211_extract_country_ie(
        struct ieee80211_device *ieee,
        struct ieee80211_info_element *info_element,
        struct ieee80211_network *network,
-       u8 * addr2
+       u8 *addr2
 )
 {
        if(IS_DOT11D_ENABLE(ieee))
@@ -2275,7 +2275,7 @@ static inline int ieee80211_network_init(
 }
 
 static inline int is_same_network(struct ieee80211_network *src,
-                                 struct ieee80211_network *dst, struct ieee80211_device* ieee)
+                                 struct ieee80211_network *dst, struct ieee80211_device *ieee)
 {
        /* A network is only a duplicate if the channel, BSSID, ESSID
         * and the capability field (in particular IBSS and BSS) all match.
index 454f8895d211927efdda4334725f6047677f67c9..8a0075db92531c7af181f262ab6cf9ce9b5fe675 100644 (file)
@@ -688,7 +688,7 @@ inline struct sk_buff *ieee80211_authentication_req(struct ieee80211_network *be
 }
 
 
-static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *dest)
+static struct sk_buff *ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *dest)
 {
        u8 *tag;
        int beacon_size;
@@ -696,7 +696,7 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
        struct sk_buff *skb = NULL;
        int encrypt;
        int atim_len,erp_len;
-       struct ieee80211_crypt_data* crypt;
+       struct ieee80211_crypt_data *crypt;
 
        char *ssid = ieee->current_network.ssid;
        int ssid_len = ieee->current_network.ssid_len;
@@ -705,12 +705,12 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
        int wpa_ie_len = ieee->wpa_ie_len;
        u8 erpinfo_content = 0;
 
-       u8* tmp_ht_cap_buf;
+       u8 *tmp_ht_cap_buf;
        u8 tmp_ht_cap_len=0;
-       u8* tmp_ht_info_buf;
+       u8 *tmp_ht_info_buf;
        u8 tmp_ht_info_len=0;
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
-       u8* tmp_generic_ie_buf=NULL;
+       u8 *tmp_generic_ie_buf=NULL;
        u8 tmp_generic_ie_len=0;
 
        if(rate_ex_len > 0) rate_ex_len+=2;
@@ -732,9 +732,9 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
        encrypt = ieee->host_encrypt && crypt && crypt->ops &&
                ((0 == strcmp(crypt->ops->name, "WEP") || wpa_ie_len));
        //HT ralated element
-       tmp_ht_cap_buf =(u8*) &(ieee->pHTInfo->SelfHTCap);
+       tmp_ht_cap_buf =(u8 *) &(ieee->pHTInfo->SelfHTCap);
        tmp_ht_cap_len = sizeof(ieee->pHTInfo->SelfHTCap);
-       tmp_ht_info_buf =(u8*) &(ieee->pHTInfo->SelfHTInfo);
+       tmp_ht_info_buf =(u8 *) &(ieee->pHTInfo->SelfHTInfo);
        tmp_ht_info_len = sizeof(ieee->pHTInfo->SelfHTInfo);
        HTConstructCapabilityElement(ieee, tmp_ht_cap_buf, &tmp_ht_cap_len,encrypt);
        HTConstructInfoElement(ieee,tmp_ht_info_buf,&tmp_ht_info_len, encrypt);
@@ -764,7 +764,7 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
        if (!skb)
                return NULL;
        skb_reserve(skb, ieee->tx_headroom);
-       beacon_buf = (struct ieee80211_probe_response*) skb_put(skb, (beacon_size - ieee->tx_headroom));
+       beacon_buf = (struct ieee80211_probe_response *) skb_put(skb, (beacon_size - ieee->tx_headroom));
        memcpy (beacon_buf->header.addr1, dest,ETH_ALEN);
        memcpy (beacon_buf->header.addr2, ieee->dev->dev_addr, ETH_ALEN);
        memcpy (beacon_buf->header.addr3, ieee->current_network.bssid, ETH_ALEN);
@@ -789,7 +789,7 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
        beacon_buf->info_element[0].id = MFIE_TYPE_SSID;
        beacon_buf->info_element[0].len = ssid_len;
 
-       tag = (u8*) beacon_buf->info_element[0].data;
+       tag = (u8 *) beacon_buf->info_element[0].data;
 
        memcpy(tag, ssid, ssid_len);
 
@@ -841,12 +841,12 @@ static struct sk_buff* ieee80211_probe_resp(struct ieee80211_device *ieee, u8 *d
 }
 
 
-struct sk_buff* ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
+struct sk_buff *ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
 {
        struct sk_buff *skb;
-       u8* tag;
+       u8 *tag;
 
-       struct ieee80211_crypt_data* crypt;
+       struct ieee80211_crypt_data *crypt;
        struct ieee80211_assoc_response_frame *assoc;
        short encrypt;
 
@@ -888,7 +888,7 @@ struct sk_buff* ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
        if (ieee->assoc_id == 0x2007) ieee->assoc_id=0;
        else ieee->assoc_id++;
 
-       tag = (u8*) skb_put(skb, rate_len);
+       tag = (u8 *) skb_put(skb, rate_len);
 
        ieee80211_MFIE_Brate(ieee, &tag);
        ieee80211_MFIE_Grate(ieee, &tag);
@@ -896,7 +896,7 @@ struct sk_buff* ieee80211_assoc_resp(struct ieee80211_device *ieee, u8 *dest)
        return skb;
 }
 
-struct sk_buff* ieee80211_auth_resp(struct ieee80211_device *ieee,int status, u8 *dest)
+struct sk_buff *ieee80211_auth_resp(struct ieee80211_device *ieee,int status, u8 *dest)
 {
        struct sk_buff *skb;
        struct ieee80211_authentication *auth;
@@ -924,17 +924,17 @@ struct sk_buff* ieee80211_auth_resp(struct ieee80211_device *ieee,int status, u8
 
 }
 
-struct sk_buff* ieee80211_null_func(struct ieee80211_device *ieee,short pwr)
+struct sk_buff *ieee80211_null_func(struct ieee80211_device *ieee,short pwr)
 {
        struct sk_buff *skb;
-       struct ieee80211_hdr_3addr* hdr;
+       struct ieee80211_hdr_3addr *hdr;
 
        skb = dev_alloc_skb(sizeof(struct ieee80211_hdr_3addr));
 
        if (!skb)
                return NULL;
 
-       hdr = (struct ieee80211_hdr_3addr*)skb_put(skb,sizeof(struct ieee80211_hdr_3addr));
+       hdr = (struct ieee80211_hdr_3addr *)skb_put(skb,sizeof(struct ieee80211_hdr_3addr));
 
        memcpy(hdr->addr1, ieee->current_network.bssid, ETH_ALEN);
        memcpy(hdr->addr2, ieee->dev->dev_addr, ETH_ALEN);
@@ -950,7 +950,7 @@ struct sk_buff* ieee80211_null_func(struct ieee80211_device *ieee,short pwr)
 }
 
 
-void ieee80211_resp_to_assoc_rq(struct ieee80211_device *ieee, u8* dest)
+void ieee80211_resp_to_assoc_rq(struct ieee80211_device *ieee, u8 *dest)
 {
        struct sk_buff *buf = ieee80211_assoc_resp(ieee, dest);
 
@@ -959,7 +959,7 @@ void ieee80211_resp_to_assoc_rq(struct ieee80211_device *ieee, u8* dest)
 }
 
 
-void ieee80211_resp_to_auth(struct ieee80211_device *ieee, int s, u8* dest)
+void ieee80211_resp_to_auth(struct ieee80211_device *ieee, int s, u8 *dest)
 {
        struct sk_buff *buf = ieee80211_auth_resp(ieee, s, dest);
 
@@ -991,15 +991,15 @@ inline struct sk_buff *ieee80211_association_req(struct ieee80211_network *beaco
        //u8 suit_select = 0;
        //unsigned int wpa_len = beacon->wpa_ie_len;
        //for HT
-       u8* ht_cap_buf = NULL;
+       u8 *ht_cap_buf = NULL;
        u8 ht_cap_len=0;
-       u8* realtek_ie_buf=NULL;
+       u8 *realtek_ie_buf=NULL;
        u8 realtek_ie_len=0;
        int wpa_ie_len= ieee->wpa_ie_len;
        unsigned int ckip_ie_len=0;
        unsigned int ccxrm_ie_len=0;
        unsigned int cxvernum_ie_len=0;
-       struct ieee80211_crypt_data* crypt;
+       struct ieee80211_crypt_data *crypt;
        int encrypt;
 
        unsigned int rate_len = ieee80211_MFIE_rate_len(ieee);
@@ -1016,7 +1016,7 @@ inline struct sk_buff *ieee80211_association_req(struct ieee80211_network *beaco
        //Include High Throuput capability && Realtek proprietary
        if(ieee->pHTInfo->bCurrentHTSupport&&ieee->pHTInfo->bEnableHT)
        {
-               ht_cap_buf = (u8*)&(ieee->pHTInfo->SelfHTCap);
+               ht_cap_buf = (u8 *)&(ieee->pHTInfo->SelfHTCap);
                ht_cap_len = sizeof(ieee->pHTInfo->SelfHTCap);
                HTConstructCapabilityElement(ieee, ht_cap_buf, &ht_cap_len, encrypt);
                if(ieee->pHTInfo->bCurrentRT2RTAggregation)
@@ -1314,7 +1314,7 @@ void ieee80211_auth_challenge(struct ieee80211_device *ieee, u8 *challenge, int
 
 void ieee80211_associate_step2(struct ieee80211_device *ieee)
 {
-       struct sk_buff* skb;
+       struct sk_buff *skb;
        struct ieee80211_network *beacon = &ieee->current_network;
 
        del_timer_sync(&ieee->associate_timer);
@@ -1536,7 +1536,7 @@ void ieee80211_softmac_check_all_nets(struct ieee80211_device *ieee)
 }
 
 
-static inline u16 auth_parse(struct sk_buff *skb, u8** challenge, int *chlen)
+static inline u16 auth_parse(struct sk_buff *skb, u8 **challenge, int *chlen)
 {
        struct ieee80211_authentication *a;
        u8 *t;
@@ -1545,7 +1545,7 @@ static inline u16 auth_parse(struct sk_buff *skb, u8** challenge, int *chlen)
                return 0xcafe;
        }
        *challenge = NULL;
-       a = (struct ieee80211_authentication*) skb->data;
+       a = (struct ieee80211_authentication *) skb->data;
        if(skb->len > (sizeof(struct ieee80211_authentication) +3)){
                t = skb->data + sizeof(struct ieee80211_authentication);
 
@@ -1562,7 +1562,7 @@ static inline u16 auth_parse(struct sk_buff *skb, u8** challenge, int *chlen)
 }
 
 
-int auth_rq_parse(struct sk_buff *skb,u8* dest)
+int auth_rq_parse(struct sk_buff *skb,u8 *dest)
 {
        struct ieee80211_authentication *a;
 
@@ -1570,7 +1570,7 @@ int auth_rq_parse(struct sk_buff *skb,u8* dest)
                IEEE80211_DEBUG_MGMT("invalid len in auth request: %d\n",skb->len);
                return -1;
        }
-       a = (struct ieee80211_authentication*) skb->data;
+       a = (struct ieee80211_authentication *) skb->data;
 
        memcpy(dest,a->header.addr2, ETH_ALEN);
 
@@ -1595,7 +1595,7 @@ static short probe_rq_parse(struct ieee80211_device *ieee, struct sk_buff *skb,
 
        memcpy(src,header->addr2, ETH_ALEN);
 
-       skbend = (u8*)skb->data + skb->len;
+       skbend = (u8 *)skb->data + skb->len;
 
        tag = skb->data + sizeof (struct ieee80211_hdr_3addr  );
 
@@ -1618,7 +1618,7 @@ static short probe_rq_parse(struct ieee80211_device *ieee, struct sk_buff *skb,
 
 }
 
-int assoc_rq_parse(struct sk_buff *skb,u8* dest)
+int assoc_rq_parse(struct sk_buff *skb,u8 *dest)
 {
        struct ieee80211_assoc_request_frame *a;
 
@@ -1629,7 +1629,7 @@ int assoc_rq_parse(struct sk_buff *skb,u8* dest)
                return -1;
        }
 
-       a = (struct ieee80211_assoc_request_frame*) skb->data;
+       a = (struct ieee80211_assoc_request_frame *) skb->data;
 
        memcpy(dest,a->header.addr2,ETH_ALEN);
 
@@ -1646,7 +1646,7 @@ static inline u16 assoc_parse(struct ieee80211_device *ieee, struct sk_buff *skb
                return 0xcafe;
        }
 
-       response_head = (struct ieee80211_assoc_response_frame*) skb->data;
+       response_head = (struct ieee80211_assoc_response_frame *) skb->data;
        *aid = le16_to_cpu(response_head->aid) & 0x3fff;
 
        status_code = le16_to_cpu(response_head->status);
@@ -1888,10 +1888,10 @@ void ieee80211_ps_tx_ack(struct ieee80211_device *ieee, short success)
        }
        spin_unlock_irqrestore(&ieee->lock, flags);
 }
-void ieee80211_process_action(struct ieee80211_device* ieee, struct sk_buff* skb)
+void ieee80211_process_action(struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-       struct ieee80211_hdr* header = (struct ieee80211_hdr*)skb->data;
-       u8* act = ieee80211_get_payload(header);
+       struct ieee80211_hdr *header = (struct ieee80211_hdr *)skb->data;
+       u8 *act = ieee80211_get_payload(header);
        u8 tmp = 0;
 //     IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
        if (act == NULL)
@@ -1926,7 +1926,7 @@ ieee80211_rx_frame_softmac(struct ieee80211_device *ieee, struct sk_buff *skb,
 {
        struct ieee80211_hdr_3addr *header = (struct ieee80211_hdr_3addr *) skb->data;
        u16 errcode;
-       u8* challenge;
+       u8 *challenge;
        int chlen=0;
        int aid;
        struct ieee80211_assoc_response_frame *assoc_resp;
@@ -1966,7 +1966,7 @@ ieee80211_rx_frame_softmac(struct ieee80211_device *ieee, struct sk_buff *skb,
                                /* station support qos */
                                /* Let the register setting defaultly with Legacy station */
                                if(ieee->qos_support) {
-                                       assoc_resp = (struct ieee80211_assoc_response_frame*)skb->data;
+                                       assoc_resp = (struct ieee80211_assoc_response_frame *)skb->data;
                                        memset(network, 0, sizeof(*network));
                                        if (ieee80211_parse_info_param(ieee,assoc_resp->info_element,\
                                                                rx_stats->len - sizeof(*assoc_resp),\
@@ -1979,7 +1979,7 @@ ieee80211_rx_frame_softmac(struct ieee80211_device *ieee, struct sk_buff *skb,
                                                memcpy(ieee->pHTInfo->PeerHTInfoBuf, network->bssht.bdHTInfoBuf, network->bssht.bdHTInfoLen);
                                        }
                                        if (ieee->handle_assoc_response != NULL)
-                                               ieee->handle_assoc_response(ieee->dev, (struct ieee80211_assoc_response_frame*)header, network);
+                                               ieee->handle_assoc_response(ieee->dev, (struct ieee80211_assoc_response_frame *)header, network);
                                }
                                ieee80211_associate_complete(ieee);
                        } else {
@@ -3124,7 +3124,7 @@ inline struct sk_buff *ieee80211_disassociate_skb(
 void
 SendDisassociation(
                struct ieee80211_device *ieee,
-               u8*                                     asSta,
+               u8                                      *asSta,
                u8                                              asRsn
 )
 {
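
Everything above this point is mechanical checkpatch pointer-style cleanup: `type* name` becomes `type *name`, binding the asterisk to the identifier. A two-line illustration of why kernel style insists on this placement (hypothetical declarations, not from the driver):

	/* "u8* a, b;" looks like two pointers but declares only one: */
	u8* a, b;	/* a is u8 *, b is plain u8 - easy to misread   */
	u8 *a, *b;	/* kernel style: each * visibly binds to its name */
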
index c39e680bb0ac2a2ec9df403f25280c9ca4d3488f..995504207fc6615c67cf092a47bacfce6c4c75c5 100644 (file)
@@ -183,7 +183,7 @@ int ieee80211_encrypt_fragment(
        struct sk_buff *frag,
        int hdr_len)
 {
-       struct ieee80211_crypt_data* crypt = ieee->crypt[ieee->tx_keyidx];
+       struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx];
        int res;
 
        if (!(crypt && crypt->ops))
@@ -243,7 +243,7 @@ struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
        struct ieee80211_txb *txb;
        int i;
        txb = kmalloc(
-               sizeof(struct ieee80211_txb) + (sizeof(u8*) * nr_frags),
+               sizeof(struct ieee80211_txb) + (sizeof(u8 *) * nr_frags),
                gfp_mask);
        if (!txb)
                return NULL;
@@ -303,11 +303,11 @@ ieee80211_classify(struct sk_buff *skb, struct ieee80211_network *network)
 }
 
 #define SN_LESS(a, b)          (((a-b)&0x800)!=0)
-void ieee80211_tx_query_agg_cap(struct ieee80211_device* ieee, struct sk_buff* skb, cb_desc* tcb_desc)
+void ieee80211_tx_query_agg_cap(struct ieee80211_device *ieee, struct sk_buff *skb, cb_desc *tcb_desc)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
        PTX_TS_RECORD                   pTxTs = NULL;
-       struct ieee80211_hdr_1addr* hdr = (struct ieee80211_hdr_1addr*)skb->data;
+       struct ieee80211_hdr_1addr *hdr = (struct ieee80211_hdr_1addr *)skb->data;
 
        if (!pHTInfo->bCurrentHTSupport||!pHTInfo->bEnableHT)
                return;
@@ -330,7 +330,7 @@ void ieee80211_tx_query_agg_cap(struct ieee80211_device* ieee, struct sk_buff* s
        }
        if(pHTInfo->bCurrentAMPDUEnable)
        {
-               if (!GetTs(ieee, (PTS_COMMON_INFO*)(&pTxTs), hdr->addr1, skb->priority, TX_DIR, true))
+               if (!GetTs(ieee, (PTS_COMMON_INFO *)(&pTxTs), hdr->addr1, skb->priority, TX_DIR, true))
                {
                        printk("===>can't get TS\n");
                        return;
@@ -356,7 +356,7 @@ void ieee80211_tx_query_agg_cap(struct ieee80211_device* ieee, struct sk_buff* s
                }
        }
 FORCED_AGG_SETTING:
-       switch(pHTInfo->ForcedAMPDUMode )
+       switch (pHTInfo->ForcedAMPDUMode )
        {
                case HT_AGG_AUTO:
                        break;
@@ -377,7 +377,7 @@ FORCED_AGG_SETTING:
                return;
 }
 
-extern void ieee80211_qurey_ShortPreambleMode(struct ieee80211_device* ieee, cb_desc* tcb_desc)
+extern void ieee80211_qurey_ShortPreambleMode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
        tcb_desc->bUseShortPreamble = false;
        if (tcb_desc->data_rate == 2)
@@ -412,7 +412,7 @@ ieee80211_query_HTCapShortGI(struct ieee80211_device *ieee, cb_desc *tcb_desc)
                tcb_desc->bUseShortGI = true;
 }
 
-void ieee80211_query_BandwidthMode(struct ieee80211_device* ieee, cb_desc *tcb_desc)
+void ieee80211_query_BandwidthMode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
 
@@ -432,7 +432,7 @@ void ieee80211_query_BandwidthMode(struct ieee80211_device* ieee, cb_desc *tcb_d
        return;
 }
 
-void ieee80211_query_protectionmode(struct ieee80211_device* ieee, cb_desc* tcb_desc, struct sk_buff* skb)
+void ieee80211_query_protectionmode(struct ieee80211_device *ieee, cb_desc *tcb_desc, struct sk_buff *skb)
 {
        // Common Settings
        tcb_desc->bRTSSTBC                      = false;
@@ -543,7 +543,7 @@ NO_PROTECTION:
 }
 
 
-void ieee80211_txrate_selectmode(struct ieee80211_device* ieee, cb_desc* tcb_desc)
+void ieee80211_txrate_selectmode(struct ieee80211_device *ieee, cb_desc *tcb_desc)
 {
 #ifdef TO_DO_LIST
        if(!IsDataFrame(pFrame))
@@ -573,14 +573,14 @@ void ieee80211_txrate_selectmode(struct ieee80211_device* ieee, cb_desc* tcb_des
        }
 }
 
-void ieee80211_query_seqnum(struct ieee80211_device*ieee, struct sk_buff* skb, u8* dst)
+void ieee80211_query_seqnum(struct ieee80211_device *ieee, struct sk_buff *skb, u8 *dst)
 {
        if (is_multicast_ether_addr(dst))
                return;
        if (IsQoSDataFrame(skb->data)) //we deal qos data only
        {
                PTX_TS_RECORD pTS = NULL;
-               if (!GetTs(ieee, (PTS_COMMON_INFO*)(&pTS), dst, skb->priority, TX_DIR, true))
+               if (!GetTs(ieee, (PTS_COMMON_INFO *)(&pTS), dst, skb->priority, TX_DIR, true))
                {
                        return;
                }
@@ -607,7 +607,7 @@ int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev)
        u8 dest[ETH_ALEN], src[ETH_ALEN];
        int qos_actived = ieee->current_network.qos_data.active;
 
-       struct ieee80211_crypt_data* crypt;
+       struct ieee80211_crypt_data *crypt;
 
        cb_desc *tcb_desc;
 
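The `SN_LESS()` macro visible in the aggregation hunk above compares 12-bit 802.11 sequence numbers with wraparound: it takes the difference and tests bit 11, so `a` counts as less whenever `b` leads it by 1 to 2048 positions modulo 4096. A standalone sketch of the same serial-number test (hypothetical helper, not the driver's):

	#include <stdbool.h>
	#include <stdint.h>

	/* 12-bit serial comparison: true iff b leads a by 1..2048 mod 4096,
	 * i.e. bit 11 of the wrapped difference is set. */
	static bool sn_less(uint16_t a, uint16_t b)
	{
		return ((a - b) & 0x800) != 0;
	}
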
index 69735d3203158f2ea1bbc537b24e4d3db9c990ca..db0db9347487db1b1ca16e26f46d2bf305eaf728 100644 (file)
@@ -13,7 +13,7 @@
  *          u16                        Time //indicate time delay.
  *  output:  none
 ********************************************************************************************************************/
-void ActivateBAEntry(struct ieee80211_device* ieee, PBA_RECORD pBA, u16 Time)
+void ActivateBAEntry(struct ieee80211_device *ieee, PBA_RECORD pBA, u16 Time)
 {
        pBA->bValid = true;
        if(Time != 0)
@@ -25,7 +25,7 @@ void ActivateBAEntry(struct ieee80211_device* ieee, PBA_RECORD pBA, u16 Time)
  *   input:  PBA_RECORD                        pBA  //BA entry to be disabled
  *  output:  none
 ********************************************************************************************************************/
-void DeActivateBAEntry( struct ieee80211_device* ieee, PBA_RECORD pBA)
+void DeActivateBAEntry( struct ieee80211_device *ieee, PBA_RECORD pBA)
 {
        pBA->bValid = false;
        del_timer_sync(&pBA->Timer);
@@ -37,7 +37,7 @@ void DeActivateBAEntry( struct ieee80211_device* ieee, PBA_RECORD pBA)
  *  output:  none
  *  notice:  As PTX_TS_RECORD structure will be defined in QOS, so wait to be merged. //FIXME
 ********************************************************************************************************************/
-u8 TxTsDeleteBA( struct ieee80211_device* ieee, PTX_TS_RECORD  pTxTs)
+u8 TxTsDeleteBA( struct ieee80211_device *ieee, PTX_TS_RECORD  pTxTs)
 {
        PBA_RECORD              pAdmittedBa = &pTxTs->TxAdmittedBARecord;  //These two BA entries must exist in TS structure
        PBA_RECORD              pPendingBa = &pTxTs->TxPendingBARecord;
@@ -67,7 +67,7 @@ u8 TxTsDeleteBA( struct ieee80211_device* ieee, PTX_TS_RECORD pTxTs)
  *  output:  none
  *  notice:  As PRX_TS_RECORD structure will be defined in QOS, so wait to be merged. //FIXME, same with above
 ********************************************************************************************************************/
-u8 RxTsDeleteBA( struct ieee80211_device* ieee, PRX_TS_RECORD  pRxTs)
+u8 RxTsDeleteBA( struct ieee80211_device *ieee, PRX_TS_RECORD  pRxTs)
 {
        PBA_RECORD              pBa = &pRxTs->RxAdmittedBARecord;
        u8                      bSendDELBA = false;
@@ -105,11 +105,11 @@ void ResetBaEntry( PBA_RECORD pBA)
  *  output:  none
  *  return:  sk_buff*          skb     //return constructed skb to xmit
 *******************************************************************************************************************************/
-static struct sk_buff* ieee80211_ADDBA(struct ieee80211_device* ieee, u8* Dst, PBA_RECORD pBA, u16 StatusCode, u8 type)
+static struct sk_buff *ieee80211_ADDBA(struct ieee80211_device *ieee, u8 *Dst, PBA_RECORD pBA, u16 StatusCode, u8 type)
 {
        struct sk_buff *skb = NULL;
-        struct ieee80211_hdr_3addr* BAReq = NULL;
-       u8* tag = NULL;
+        struct ieee80211_hdr_3addr *BAReq = NULL;
+       u8 *tag = NULL;
        u16 tmp = 0;
        u16 len = ieee->tx_headroom + 9;
        //category(1) + action field(1) + Dialog Token(1) + BA Parameter Set(2) +  BA Timeout Value(2) +  BA Start SeqCtrl(2)(or StatusCode(2))
@@ -139,7 +139,7 @@ static struct sk_buff* ieee80211_ADDBA(struct ieee80211_device* ieee, u8* Dst, P
        BAReq->frame_ctl = cpu_to_le16(IEEE80211_STYPE_MANAGE_ACT); //action frame
 
        //tag += sizeof( struct ieee80211_hdr_3addr); //move to action field
-       tag = (u8*)skb_put(skb, 9);
+       tag = (u8 *)skb_put(skb, 9);
        *tag ++= ACT_CAT_BA;
        *tag ++= type;
        // Dialog Token
@@ -150,22 +150,22 @@ static struct sk_buff* ieee80211_ADDBA(struct ieee80211_device* ieee, u8* Dst, P
                // Status Code
                printk("=====>to send ADDBARSP\n");
                tmp = cpu_to_le16(StatusCode);
-               memcpy(tag, (u8*)&tmp, 2);
+               memcpy(tag, (u8 *)&tmp, 2);
                tag += 2;
        }
        // BA Parameter Set
        tmp = cpu_to_le16(pBA->BaParamSet.shortData);
-       memcpy(tag, (u8*)&tmp, 2);
+       memcpy(tag, (u8 *)&tmp, 2);
        tag += 2;
        // BA Timeout Value
        tmp = cpu_to_le16(pBA->BaTimeoutValue);
-       memcpy(tag, (u8*)&tmp, 2);
+       memcpy(tag, (u8 *)&tmp, 2);
        tag += 2;
 
        if (ACT_ADDBAREQ == type)
        {
        // BA Start SeqCtrl
-               memcpy(tag,(u8*)&(pBA->BaStartSeqCtrl), 2);
+               memcpy(tag,(u8 *)&(pBA->BaStartSeqCtrl), 2);
                tag += 2;
        }
 
@@ -184,9 +184,9 @@ static struct sk_buff* ieee80211_ADDBA(struct ieee80211_device* ieee, u8* Dst, P
  *  output:  none
  *  return:  sk_buff*          skb     //return constructed skb to xmit
 ********************************************************************************************************************/
-static struct sk_buff* ieee80211_DELBA(
-       struct ieee80211_device* ieee,
-       u8*                      dst,
+static struct sk_buff *ieee80211_DELBA(
+       struct ieee80211_device  *ieee,
+       u8                       *dst,
        PBA_RECORD               pBA,
        TR_SELECT                TxRxSelect,
        u16                      ReasonCode
@@ -194,8 +194,8 @@ static struct sk_buff* ieee80211_DELBA(
 {
        DELBA_PARAM_SET DelbaParamSet;
        struct sk_buff *skb = NULL;
-        struct ieee80211_hdr_3addr* Delba = NULL;
-       u8* tag = NULL;
+        struct ieee80211_hdr_3addr *Delba = NULL;
+       u8 *tag = NULL;
        u16 tmp = 0;
        //len = head len + DELBA Parameter Set(2) + Reason Code(2)
        u16 len = 6 + ieee->tx_headroom;
@@ -224,18 +224,18 @@ static struct sk_buff* ieee80211_DELBA(
        memcpy(Delba->addr3, ieee->current_network.bssid, ETH_ALEN);
        Delba->frame_ctl = cpu_to_le16(IEEE80211_STYPE_MANAGE_ACT); //action frame
 
-       tag = (u8*)skb_put(skb, 6);
+       tag = (u8 *)skb_put(skb, 6);
 
        *tag ++= ACT_CAT_BA;
        *tag ++= ACT_DELBA;
 
        // DELBA Parameter Set
        tmp = cpu_to_le16(DelbaParamSet.shortData);
-       memcpy(tag, (u8*)&tmp, 2);
+       memcpy(tag, (u8 *)&tmp, 2);
        tag += 2;
        // Reason Code
        tmp = cpu_to_le16(ReasonCode);
-       memcpy(tag, (u8*)&tmp, 2);
+       memcpy(tag, (u8 *)&tmp, 2);
        tag += 2;
 
        IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
@@ -251,7 +251,7 @@ static struct sk_buff* ieee80211_DELBA(
  *  output:  none
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
-void ieee80211_send_ADDBAReq(struct ieee80211_device* ieee, u8*        dst, PBA_RECORD pBA)
+void ieee80211_send_ADDBAReq(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD        pBA)
 {
        struct sk_buff *skb = NULL;
        skb = ieee80211_ADDBA(ieee, dst, pBA, 0, ACT_ADDBAREQ); //construct ACT_ADDBAREQ frames so set statuscode zero.
@@ -278,7 +278,7 @@ void ieee80211_send_ADDBAReq(struct ieee80211_device* ieee, u8*     dst, PBA_RECORD
  *  output:  none
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
-void ieee80211_send_ADDBARsp(struct ieee80211_device* ieee, u8* dst, PBA_RECORD pBA, u16 StatusCode)
+void ieee80211_send_ADDBARsp(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD pBA, u16 StatusCode)
 {
        struct sk_buff *skb = NULL;
        skb = ieee80211_ADDBA(ieee, dst, pBA, StatusCode, ACT_ADDBARSP); //construct ACT_ADDBARSP frames
@@ -305,7 +305,7 @@ void ieee80211_send_ADDBARsp(struct ieee80211_device* ieee, u8* dst, PBA_RECORD
  *  notice: If any possible, please hide pBA in ieee. And temporarily use Manage Queue as softmac_mgmt_xmit() usually does
 ********************************************************************************************************************/
 
-void ieee80211_send_DELBA(struct ieee80211_device* ieee, u8* dst, PBA_RECORD pBA, TR_SELECT TxRxSelect, u16 ReasonCode)
+void ieee80211_send_DELBA(struct ieee80211_device *ieee, u8 *dst, PBA_RECORD pBA, TR_SELECT TxRxSelect, u16 ReasonCode)
 {
        struct sk_buff *skb = NULL;
        skb = ieee80211_DELBA(ieee, dst, pBA, TxRxSelect, ReasonCode); //construct ACT_ADDBARSP frames
@@ -327,14 +327,14 @@ void ieee80211_send_DELBA(struct ieee80211_device* ieee, u8* dst, PBA_RECORD pBA
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb)
+int ieee80211_rx_ADDBAReq( struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-        struct ieee80211_hdr_3addr* req = NULL;
+        struct ieee80211_hdr_3addr *req = NULL;
        u16 rc = 0;
-       u8 * dst = NULL, *pDialogToken = NULL, *tag = NULL;
+       u8 *dst = NULL, *pDialogToken = NULL, *tag = NULL;
        PBA_RECORD pBA = NULL;
        PBA_PARAM_SET   pBaParamSet = NULL;
-       u16* pBaTimeoutVal = NULL;
+       u16 *pBaTimeoutVal = NULL;
        PSEQUENCE_CONTROL pBaStartSeqCtrl = NULL;
        PRX_TS_RECORD   pTS = NULL;
 
@@ -346,13 +346,13 @@ int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb)
 
        IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
 
-       req = ( struct ieee80211_hdr_3addr*) skb->data;
-       tag = (u8*)req;
-       dst = (u8*)(&req->addr2[0]);
+       req = ( struct ieee80211_hdr_3addr *) skb->data;
+       tag = (u8 *)req;
+       dst = (u8 *)(&req->addr2[0]);
        tag += sizeof( struct ieee80211_hdr_3addr);
        pDialogToken = tag + 2;  //category+action
        pBaParamSet = (PBA_PARAM_SET)(tag + 3);   //+DialogToken
-       pBaTimeoutVal = (u16*)(tag + 5);
+       pBaTimeoutVal = (u16 *)(tag + 5);
        pBaStartSeqCtrl = (PSEQUENCE_CONTROL)(req + 7);
 
        printk("====================>rx ADDBAREQ from :%pM\n", dst);
@@ -369,7 +369,7 @@ int ieee80211_rx_ADDBAReq( struct ieee80211_device* ieee, struct sk_buff *skb)
        // If there is no matched TS, reject the ADDBA request.
        if(     !GetTs(
                        ieee,
-                       (PTS_COMMON_INFO*)(&pTS),
+                       (PTS_COMMON_INFO *)(&pTS),
                        dst,
                        (u8)(pBaParamSet->field.TID),
                        RX_DIR,
@@ -427,13 +427,13 @@ OnADDBAReq_Fail:
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb)
+int ieee80211_rx_ADDBARsp( struct ieee80211_device *ieee, struct sk_buff *skb)
 {
-        struct ieee80211_hdr_3addr* rsp = NULL;
+        struct ieee80211_hdr_3addr *rsp = NULL;
        PBA_RECORD              pPendingBA, pAdmittedBA;
        PTX_TS_RECORD           pTS = NULL;
-       u8* dst = NULL, *pDialogToken = NULL, *tag = NULL;
-       u16* pStatusCode = NULL, *pBaTimeoutVal = NULL;
+       u8 *dst = NULL, *pDialogToken = NULL, *tag = NULL;
+       u16 *pStatusCode = NULL, *pBaTimeoutVal = NULL;
        PBA_PARAM_SET           pBaParamSet = NULL;
        u16                     ReasonCode;
 
@@ -442,14 +442,14 @@ int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb)
                IEEE80211_DEBUG(IEEE80211_DL_ERR, " Invalid skb len in BARSP(%d / %zu)\n", skb->len,    (sizeof( struct ieee80211_hdr_3addr) + 9));
                return -1;
        }
-       rsp = ( struct ieee80211_hdr_3addr*)skb->data;
-       tag = (u8*)rsp;
-       dst = (u8*)(&rsp->addr2[0]);
+       rsp = ( struct ieee80211_hdr_3addr *)skb->data;
+       tag = (u8 *)rsp;
+       dst = (u8 *)(&rsp->addr2[0]);
        tag += sizeof( struct ieee80211_hdr_3addr);
        pDialogToken = tag + 2;
-       pStatusCode = (u16*)(tag + 3);
+       pStatusCode = (u16 *)(tag + 3);
        pBaParamSet = (PBA_PARAM_SET)(tag + 5);
-       pBaTimeoutVal = (u16*)(tag + 7);
+       pBaTimeoutVal = (u16 *)(tag + 7);
 
        // Check the capability
        // Since we can always receive A-MPDU, we just check if it is under HT mode.
@@ -469,7 +469,7 @@ int ieee80211_rx_ADDBARsp( struct ieee80211_device* ieee, struct sk_buff *skb)
        //
        if (!GetTs(
                        ieee,
-                       (PTS_COMMON_INFO*)(&pTS),
+                       (PTS_COMMON_INFO *)(&pTS),
                        dst,
                        (u8)(pBaParamSet->field.TID),
                        TX_DIR,
@@ -560,12 +560,12 @@ OnADDBARsp_Reject:
  *  return:  0(pass), other(fail)
  *  notice:  As this function need support of QOS, I comment some code out. And when qos is ready, this code need to be support.
 ********************************************************************************************************************/
-int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
+int ieee80211_rx_DELBA(struct ieee80211_device *ieee,struct sk_buff *skb)
 {
-        struct ieee80211_hdr_3addr* delba = NULL;
+        struct ieee80211_hdr_3addr *delba = NULL;
        PDELBA_PARAM_SET        pDelBaParamSet = NULL;
-       u16*                    pReasonCode = NULL;
-       u8*                     dst = NULL;
+       u16                     *pReasonCode = NULL;
+       u8                      *dst = NULL;
 
        if (skb->len < sizeof( struct ieee80211_hdr_3addr) + 6)
        {
@@ -581,11 +581,11 @@ int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
        }
 
        IEEE80211_DEBUG_DATA(IEEE80211_DL_DATA|IEEE80211_DL_BA, skb->data, skb->len);
-       delba = ( struct ieee80211_hdr_3addr*)skb->data;
-       dst = (u8*)(&delba->addr2[0]);
+       delba = ( struct ieee80211_hdr_3addr *)skb->data;
+       dst = (u8 *)(&delba->addr2[0]);
        delba += sizeof( struct ieee80211_hdr_3addr);
        pDelBaParamSet = (PDELBA_PARAM_SET)(delba+2);
-       pReasonCode = (u16*)(delba+4);
+       pReasonCode = (u16 *)(delba+4);
 
        if(pDelBaParamSet->field.Initiator == 1)
        {
@@ -593,7 +593,7 @@ int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
 
                if( !GetTs(
                                ieee,
-                               (PTS_COMMON_INFO*)&pRxTs,
+                               (PTS_COMMON_INFO *)&pRxTs,
                                dst,
                                (u8)pDelBaParamSet->field.TID,
                                RX_DIR,
@@ -611,7 +611,7 @@ int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
 
                if(!GetTs(
                        ieee,
-                       (PTS_COMMON_INFO*)&pTxTs,
+                       (PTS_COMMON_INFO *)&pTxTs,
                        dst,
                        (u8)pDelBaParamSet->field.TID,
                        TX_DIR,
@@ -636,7 +636,7 @@ int ieee80211_rx_DELBA(struct ieee80211_device* ieee,struct sk_buff *skb)
 //
 void
 TsInitAddBA(
-       struct ieee80211_device* ieee,
+       struct ieee80211_device *ieee,
        PTX_TS_RECORD   pTS,
        u8              Policy,
        u8              bOverwritePending
@@ -665,7 +665,7 @@ TsInitAddBA(
 }
 
 void
-TsInitDelBA( struct ieee80211_device* ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect)
+TsInitDelBA( struct ieee80211_device *ieee, PTS_COMMON_INFO pTsCommonInfo, TR_SELECT TxRxSelect)
 {
 
        if(TxRxSelect == TX_DIR)
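
The frame builders in the hunks above advance a `tag` cursor through space reserved with `skb_put()`, emitting each 16-bit field as `tmp = cpu_to_le16(x); memcpy(tag, &tmp, 2); tag += 2;`. A condensed, self-contained sketch of that cursor idiom (a plain buffer standing in for the skb, helper name hypothetical):

	#include <stdint.h>
	#include <string.h>

	/* Append a little-endian u16 at the cursor and return the advanced
	 * cursor, mirroring the memcpy/tag += 2 sequence in ieee80211_ADDBA(). */
	static uint8_t *put_le16(uint8_t *tag, uint16_t val)
	{
		uint8_t le[2] = { (uint8_t)(val & 0xff), (uint8_t)(val >> 8) };

		memcpy(tag, le, 2);	/* cpu_to_le16 done by hand above */
		return tag + 2;
	}
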
index 268b270e94956af704494073dbd792d8f812f5ce..e956da5a2d76ebb13c5d56185eb65a2df33a6cac 100644 (file)
@@ -51,7 +51,7 @@ static u8 CISCO_BROADCOM[3] = {0x00, 0x17, 0x94};
  *  return:  none
  *  notice:  These value need be modified if any changes.
  * *****************************************************************************************************************/
-void HTUpdateDefaultSetting(struct ieee80211_device* ieee)
+void HTUpdateDefaultSetting(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
        //const typeof( ((struct ieee80211_device *)0)->pHTInfo ) *__mptr = &pHTInfo;
@@ -121,7 +121,7 @@ void HTUpdateDefaultSetting(struct ieee80211_device* ieee)
  *  return:  none
  *  notice:  Driver should not print out this message by default.
  * *****************************************************************************************************************/
-void HTDebugHTCapability(u8* CapIE, u8* TitleString )
+void HTDebugHTCapability(u8 *CapIE, u8 *TitleString )
 {
 
        static u8       EWC11NHTCap[] = {0x00, 0x90, 0x4c, 0x33};       // For 11n EWC definition, 2007.07.17, by Emily
@@ -158,7 +158,7 @@ void HTDebugHTCapability(u8* CapIE, u8* TitleString )
  *  return:  none
  *  notice:  Driver should not print out this message by default.
  * *****************************************************************************************************************/
-void HTDebugHTInfo(u8* InfoIE, u8* TitleString)
+void HTDebugHTInfo(u8 *InfoIE, u8 *TitleString)
 {
 
        static u8       EWC11NHTInfo[] = {0x00, 0x90, 0x4c, 0x34};      // For 11n EWC definition, 2007.07.17, by Emily
@@ -177,7 +177,7 @@ void HTDebugHTInfo(u8*      InfoIE, u8* TitleString)
 
        IEEE80211_DEBUG(IEEE80211_DL_HT, "\tPrimary channel = %d\n", pHTInfoEle->ControlChl);
        IEEE80211_DEBUG(IEEE80211_DL_HT, "\tSenondary channel =");
-       switch(pHTInfoEle->ExtChlOffset)
+       switch (pHTInfoEle->ExtChlOffset)
        {
                case 0:
                        IEEE80211_DEBUG(IEEE80211_DL_HT, "Not Present\n");
@@ -195,7 +195,7 @@ void HTDebugHTInfo(u8*      InfoIE, u8* TitleString)
        IEEE80211_DEBUG(IEEE80211_DL_HT, "\tRecommended channel width = %s\n", (pHTInfoEle->RecommemdedTxWidth)?"20Mhz": "40Mhz");
 
        IEEE80211_DEBUG(IEEE80211_DL_HT, "\tOperation mode for protection = ");
-       switch(pHTInfoEle->OptMode)
+       switch (pHTInfoEle->OptMode)
        {
                case 0:
                        IEEE80211_DEBUG(IEEE80211_DL_HT, "No Protection\n");
@@ -219,7 +219,7 @@ void HTDebugHTInfo(u8*      InfoIE, u8* TitleString)
 /*
 *      Return:         true if station in half n mode and AP supports 40 bw
 */
-bool IsHTHalfNmode40Bandwidth(struct ieee80211_device* ieee)
+bool IsHTHalfNmode40Bandwidth(struct ieee80211_device *ieee)
 {
        bool                    retValue = false;
        PRT_HIGH_THROUGHPUT      pHTInfo = ieee->pHTInfo;
@@ -238,7 +238,7 @@ bool IsHTHalfNmode40Bandwidth(struct ieee80211_device* ieee)
        return retValue;
 }
 
-bool IsHTHalfNmodeSGI(struct ieee80211_device* ieee, bool is40MHz)
+bool IsHTHalfNmodeSGI(struct ieee80211_device *ieee, bool is40MHz)
 {
        bool                    retValue = false;
        PRT_HIGH_THROUGHPUT      pHTInfo = ieee->pHTInfo;
@@ -265,7 +265,7 @@ bool IsHTHalfNmodeSGI(struct ieee80211_device* ieee, bool is40MHz)
        return retValue;
 }
 
-u16 HTHalfMcsToDataRate(struct ieee80211_device* ieee, u8      nMcsRate)
+u16 HTHalfMcsToDataRate(struct ieee80211_device *ieee, u8      nMcsRate)
 {
 
        u8      is40MHz;
@@ -278,7 +278,7 @@ u16 HTHalfMcsToDataRate(struct ieee80211_device* ieee,      u8      nMcsRate)
 }
 
 
-u16 HTMcsToDataRate( struct ieee80211_device* ieee, u8 nMcsRate)
+u16 HTMcsToDataRate( struct ieee80211_device *ieee, u8 nMcsRate)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
 
@@ -297,7 +297,7 @@ u16 HTMcsToDataRate( struct ieee80211_device* ieee, u8 nMcsRate)
  *  return:  tx rate
  *  notice:  quite unsure about how to use this function //wb
  * *****************************************************************************************************************/
-u16  TxCountToDataRate( struct ieee80211_device* ieee, u8 nDataRate)
+u16  TxCountToDataRate( struct ieee80211_device *ieee, u8 nDataRate)
 {
        //PRT_HIGH_THROUGHPUT   pHTInfo = ieee->pHTInfo;
        u16             CCKOFDMRate[12] = {0x02 , 0x04 , 0x0b , 0x16 , 0x0c , 0x12 , 0x18 , 0x24 , 0x30 , 0x48 , 0x60 , 0x6c};
@@ -344,10 +344,10 @@ u16  TxCountToDataRate( struct ieee80211_device* ieee, u8 nDataRate)
 
 
 
-bool IsHTHalfNmodeAPs(struct ieee80211_device* ieee)
+bool IsHTHalfNmodeAPs(struct ieee80211_device *ieee)
 {
        bool                    retValue = false;
-       struct ieee80211_network* net = &ieee->current_network;
+       struct ieee80211_network *net = &ieee->current_network;
        if((memcmp(net->bssid, BELKINF5D8233V1_RALINK, 3)==0) ||
                     (memcmp(net->bssid, BELKINF5D82334V3_RALINK, 3)==0) ||
                     (memcmp(net->bssid, PCI_RALINK, 3)==0) ||
@@ -376,10 +376,10 @@ bool IsHTHalfNmodeAPs(struct ieee80211_device* ieee)
  *  return:
  *  notice:
  * *****************************************************************************************************************/
-void HTIOTPeerDetermine(struct ieee80211_device* ieee)
+void HTIOTPeerDetermine(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
-       struct ieee80211_network* net = &ieee->current_network;
+       struct ieee80211_network *net = &ieee->current_network;
        if(net->bssht.bdRT2RTAggregation)
                pHTInfo->IOTPeer = HT_IOT_PEER_REALTEK;
        else if(net->broadcom_cap_exist)
@@ -413,7 +413,7 @@ void HTIOTPeerDetermine(struct ieee80211_device* ieee)
  *  output:  none
  *  return:  return 1 if driver should declare MCS13 only(otherwise return 0)
   * *****************************************************************************************************************/
-u8 HTIOTActIsDisableMCS14(struct ieee80211_device* ieee, u8* PeerMacAddr)
+u8 HTIOTActIsDisableMCS14(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
        u8 ret = 0;
        return ret;
@@ -432,7 +432,7 @@ u8 HTIOTActIsDisableMCS14(struct ieee80211_device* ieee, u8* PeerMacAddr)
 * Return:      true if driver should disable MCS15
 * 2008.04.15   Emily
 */
-bool HTIOTActIsDisableMCS15(struct ieee80211_device* ieee)
+bool HTIOTActIsDisableMCS15(struct ieee80211_device *ieee)
 {
        bool retValue = false;
 
@@ -469,7 +469,7 @@ bool HTIOTActIsDisableMCS15(struct ieee80211_device* ieee)
 * Return:      true if driver should disable all two spatial stream packet
 * 2008.04.21   Emily
 */
-bool HTIOTActIsDisableMCSTwoSpatialStream(struct ieee80211_device* ieee, u8 *PeerMacAddr)
+bool HTIOTActIsDisableMCSTwoSpatialStream(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
        bool retValue = false;
 
@@ -486,7 +486,7 @@ bool HTIOTActIsDisableMCSTwoSpatialStream(struct ieee80211_device* ieee, u8 *Pee
  *  output:  none
  *  return:  return 1 if driver should disable EDCA turbo mode(otherwise return 0)
   * *****************************************************************************************************************/
-u8 HTIOTActIsDisableEDCATurbo(struct ieee80211_device* ieee, u8* PeerMacAddr)
+u8 HTIOTActIsDisableEDCATurbo(struct ieee80211_device *ieee, u8 *PeerMacAddr)
 {
        u8      retValue = false;       // default enable EDCA Turbo mode.
        // Set specific EDCA parameter for different AP in DM handler.
@@ -515,7 +515,7 @@ u8 HTIOTActIsMgntUseCCK6M(struct ieee80211_network *network)
        return retValue;
 }
 
-u8 HTIOTActIsCCDFsync(u8* PeerMacAddr)
+u8 HTIOTActIsCCDFsync(u8 *PeerMacAddr)
 {
        u8      retValue = 0;
        if(     (memcmp(PeerMacAddr, UNKNOWN_BORADCOM, 3)==0) ||
@@ -547,7 +547,7 @@ void HTResetIOTSetting(
  *  return:  none
  *  notice:  posHTCap can't be null and should be initialized before.
   * *****************************************************************************************************************/
-void HTConstructCapabilityElement(struct ieee80211_device* ieee, u8* posHTCap, u8* len, u8 IsEncrypt)
+void HTConstructCapabilityElement(struct ieee80211_device *ieee, u8 *posHTCap, u8 *len, u8 IsEncrypt)
 {
        PRT_HIGH_THROUGHPUT     pHT = ieee->pHTInfo;
        PHT_CAPABILITY_ELE      pCapELE = NULL;
@@ -666,7 +666,7 @@ void HTConstructCapabilityElement(struct ieee80211_device* ieee, u8* posHTCap, u
  *  return:  none
  *  notice:  posHTCap can't be null and be initialized before. only AP and IBSS sta should do this
   * *****************************************************************************************************************/
-void HTConstructInfoElement(struct ieee80211_device* ieee, u8* posHTInfo, u8* len, u8 IsEncrypt)
+void HTConstructInfoElement(struct ieee80211_device *ieee, u8 *posHTInfo, u8 *len, u8 IsEncrypt)
 {
        PRT_HIGH_THROUGHPUT     pHT = ieee->pHTInfo;
        PHT_INFORMATION_ELE             pHTInfoEle = (PHT_INFORMATION_ELE)posHTInfo;
@@ -738,7 +738,7 @@ void HTConstructInfoElement(struct ieee80211_device* ieee, u8* posHTInfo, u8* le
  *  return:  none
  *  notice:
   * *****************************************************************************************************************/
-void HTConstructRT2RTAggElement(struct ieee80211_device* ieee, u8* posRT2RTAgg, u8* len)
+void HTConstructRT2RTAggElement(struct ieee80211_device *ieee, u8 *posRT2RTAgg, u8 *len)
 {
        if (posRT2RTAgg == NULL) {
                IEEE80211_DEBUG(IEEE80211_DL_ERR, "posRT2RTAgg can't be null in HTConstructRT2RTAggElement()\n");
@@ -792,7 +792,7 @@ void HTConstructRT2RTAggElement(struct ieee80211_device* ieee, u8* posRT2RTAgg,
  *  return:  always we return true
  *  notice:
   * *****************************************************************************************************************/
-u8 HT_PickMCSRate(struct ieee80211_device* ieee, u8* pOperateMCS)
+u8 HT_PickMCSRate(struct ieee80211_device *ieee, u8 *pOperateMCS)
 {
        u8                                      i;
        if (pOperateMCS == NULL)
@@ -801,7 +801,7 @@ u8 HT_PickMCSRate(struct ieee80211_device* ieee, u8* pOperateMCS)
                return false;
        }
 
-       switch(ieee->mode)
+       switch (ieee->mode)
        {
        case IEEE_A:
        case IEEE_B:
@@ -855,7 +855,7 @@ u8 HT_PickMCSRate(struct ieee80211_device* ieee, u8* pOperateMCS)
  *  return:  Highest MCS rate included in pMCSRateSet and filtered by pMCSFilter
  *  notice:
   * *****************************************************************************************************************/
-u8 HTGetHighestMCSRate(struct ieee80211_device* ieee, u8* pMCSRateSet, u8* pMCSFilter)
+u8 HTGetHighestMCSRate(struct ieee80211_device *ieee, u8 *pMCSRateSet, u8 *pMCSFilter)
 {
        u8              i, j;
        u8              bitMap;
@@ -907,7 +907,7 @@ u8 HTGetHighestMCSRate(struct ieee80211_device* ieee, u8* pMCSRateSet, u8* pMCSF
 **
 ** \pHTSupportedCap: the connected STA's supported rate Capability element
 */
-u8 HTFilterMCSRate( struct ieee80211_device* ieee, u8* pSupportMCS, u8* pOperateMCS)
+u8 HTFilterMCSRate( struct ieee80211_device *ieee, u8 *pSupportMCS, u8 *pOperateMCS)
 {
 
        u8 i=0;
@@ -937,14 +937,14 @@ u8 HTFilterMCSRate( struct ieee80211_device* ieee, u8* pSupportMCS, u8* pOperate
 
        return true;
 }
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH        Bandwidth, HT_EXTCHNL_OFFSET    Offset);
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH        Bandwidth, HT_EXTCHNL_OFFSET    Offset);
 void HTOnAssocRsp(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
        PHT_CAPABILITY_ELE              pPeerHTCap = NULL;
        PHT_INFORMATION_ELE             pPeerHTInfo = NULL;
        u16     nMaxAMSDUSize = 0;
-       u8*     pMcsFilter = NULL;
+       u8      *pMcsFilter = NULL;
 
        static u8                               EWC11NHTCap[] = {0x00, 0x90, 0x4c, 0x33};               // For 11n EWC definition, 2007.07.17, by Emily
        static u8                               EWC11NHTInfo[] = {0x00, 0x90, 0x4c, 0x34};      // For 11n EWC definition, 2007.07.17, by Emily
@@ -1115,7 +1115,7 @@ void HTOnAssocRsp(struct ieee80211_device *ieee)
 
 }
 
-void HTSetConnectBwModeCallback(struct ieee80211_device* ieee);
+void HTSetConnectBwModeCallback(struct ieee80211_device *ieee);
 /********************************************************************************************************************
  *function:  initialize HT info(struct PRT_HIGH_THROUGHPUT)
  *   input:  struct ieee80211_device*  ieee
@@ -1124,7 +1124,7 @@ void HTSetConnectBwModeCallback(struct ieee80211_device* ieee);
  *  notice: This function is called when *  (1) MPInitialization Phase *  (2) Receiving of Deauthentication from AP
 ********************************************************************************************************************/
 // TODO: Should this funciton be called when receiving of Disassociation?
-void HTInitializeHTInfo(struct ieee80211_device* ieee)
+void HTInitializeHTInfo(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 
@@ -1160,10 +1160,10 @@ void HTInitializeHTInfo(struct ieee80211_device* ieee)
 
 
        // Initialize all of the parameters related to 11n
-       memset((void*)(&(pHTInfo->SelfHTCap)), 0, sizeof(pHTInfo->SelfHTCap));
-       memset((void*)(&(pHTInfo->SelfHTInfo)), 0, sizeof(pHTInfo->SelfHTInfo));
-       memset((void*)(&(pHTInfo->PeerHTCapBuf)), 0, sizeof(pHTInfo->PeerHTCapBuf));
-       memset((void*)(&(pHTInfo->PeerHTInfoBuf)), 0, sizeof(pHTInfo->PeerHTInfoBuf));
+       memset((void *)(&(pHTInfo->SelfHTCap)), 0, sizeof(pHTInfo->SelfHTCap));
+       memset((void *)(&(pHTInfo->SelfHTInfo)), 0, sizeof(pHTInfo->SelfHTInfo));
+       memset((void *)(&(pHTInfo->PeerHTCapBuf)), 0, sizeof(pHTInfo->PeerHTCapBuf));
+       memset((void *)(&(pHTInfo->PeerHTInfoBuf)), 0, sizeof(pHTInfo->PeerHTInfoBuf));
 
        pHTInfo->bSwBwInProgress = false;
        pHTInfo->ChnlOp = CHNLOP_NONE;
@@ -1179,7 +1179,7 @@ void HTInitializeHTInfo(struct ieee80211_device* ieee)
 
        //MCS rate initialized here
        {
-               u8* RegHTSuppRateSets = &(ieee->RegHTSuppRateSet[0]);
+               u8 *RegHTSuppRateSets = &(ieee->RegHTSuppRateSet[0]);
                RegHTSuppRateSets[0] = 0xFF;    //support MCS 0~7
                RegHTSuppRateSets[1] = 0xFF;    //support MCS 8~15
                RegHTSuppRateSets[4] = 0x01;    //support MCS 32
@@ -1214,7 +1214,7 @@ void HTInitializeBssDesc(PBSS_HT pBssHT)
  *  return:  none
  *  notice: This function should ONLY be called before association
 ********************************************************************************************************************/
-void HTResetSelfAndSavePeerSetting(struct ieee80211_device* ieee,      struct ieee80211_network * pNetwork)
+void HTResetSelfAndSavePeerSetting(struct ieee80211_device *ieee,      struct ieee80211_network *pNetwork)
 {
        PRT_HIGH_THROUGHPUT             pHTInfo = ieee->pHTInfo;
 //     u16                                             nMaxAMSDUSize;
@@ -1297,7 +1297,7 @@ void HTResetSelfAndSavePeerSetting(struct ieee80211_device* ieee, struct ieee802
 
 }
 
-void HTUpdateSelfAndPeerSetting(struct ieee80211_device* ieee, struct ieee80211_network * pNetwork)
+void HTUpdateSelfAndPeerSetting(struct ieee80211_device *ieee, struct ieee80211_network *pNetwork)
 {
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
 //     PHT_CAPABILITY_ELE              pPeerHTCap = (PHT_CAPABILITY_ELE)pNetwork->bssht.bdHTCapBuf;
@@ -1317,7 +1317,7 @@ void HTUpdateSelfAndPeerSetting(struct ieee80211_device* ieee,    struct ieee80211_
        }
 }
 
-void HTUseDefaultSetting(struct ieee80211_device* ieee)
+void HTUseDefaultSetting(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 //     u8      regBwOpMode;
@@ -1370,7 +1370,7 @@ void HTUseDefaultSetting(struct ieee80211_device* ieee)
  *  return:  return true if HT control field exists(false otherwise)
  *  notice:
 ********************************************************************************************************************/
-u8 HTCCheck(struct ieee80211_device* ieee, u8* pFrame)
+u8 HTCCheck(struct ieee80211_device *ieee, u8 *pFrame)
 {
        if(ieee->pHTInfo->bCurrentHTSupport)
        {
@@ -1386,7 +1386,7 @@ u8 HTCCheck(struct ieee80211_device* ieee, u8*    pFrame)
 //
 // This function set bandwidth mode in protocol layer.
 //
-void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH        Bandwidth, HT_EXTCHNL_OFFSET    Offset)
+void HTSetConnectBwMode(struct ieee80211_device *ieee, HT_CHANNEL_WIDTH        Bandwidth, HT_EXTCHNL_OFFSET    Offset)
 {
        PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 //     u32 flags = 0;
@@ -1435,7 +1435,7 @@ void HTSetConnectBwMode(struct ieee80211_device* ieee, HT_CHANNEL_WIDTH   Bandwidt
 //     spin_unlock_irqrestore(&(ieee->bw_spinlock), flags);
 }
 
-void HTSetConnectBwModeCallback(struct ieee80211_device* ieee)
+void HTSetConnectBwModeCallback(struct ieee80211_device *ieee)
 {
        PRT_HIGH_THROUGHPUT pHTInfo = ieee->pHTInfo;
 
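`HTGetHighestMCSRate()`, reflowed above, scans a 16-byte MCS bitmap (bit i of byte j encodes MCS 8*j+i) AND-ed against a filter and keeps the highest index that survives. A minimal standalone sketch of that scan (hypothetical signature, not the driver's):

	#include <stdint.h>

	/* Highest MCS index set in both bitmaps; -1 if none survive.
	 * Bit i of byte j encodes MCS 8*j + i. */
	static int highest_mcs(const uint8_t set[16], const uint8_t filter[16])
	{
		int best = -1;

		for (int j = 0; j < 16; j++) {
			uint8_t avail = set[j] & filter[j];

			for (int i = 0; i < 8; i++)
				if (avail & (1u << i))
					best = 8 * j + i;
		}
		return best;
	}
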
index 2348ccd70be05ee862e22774b69cb4278c7f3cba..f2d52ca08cd034577bb7c1d73dd4ac3338ca0ee7 100644 (file)
@@ -483,7 +483,7 @@ typedef struct _OCTET_STRING{
 typedef struct _STA_QOS{
        //DECLARE_RT_OBJECT(STA_QOS);
        u8                              WMMIEBuf[MAX_WMMELE_LENGTH];
-       u8*                             WMMIE;
+       u8                              *WMMIE;
 
        // Part 1. Self QoS Mode.
        QOS_MODE                        QosCapability; //QoS Capability, 2006-06-14 Isaiah
@@ -498,7 +498,7 @@ typedef struct _STA_QOS{
        int                             NumBcnBeforeTrigger;
 
        // Part 2. EDCA Parameter (perAC)
-       u8 *                            pWMMInfoEle;
+       u8                              *pWMMInfoEle;
        u8                              WMMParamEle[WMM_PARAM_ELEMENT_SIZE];
        u8                              WMMPELength;
 
@@ -537,12 +537,12 @@ typedef struct _BSS_QOS{
        QOS_MODE                bdQoSMode;
 
        u8                      bdWMMIEBuf[MAX_WMMELE_LENGTH];
-       u8*             bdWMMIE;
+       u8              *bdWMMIE;
 
        QOS_ELE_SUBTYPE         EleSubType;
 
-       u8 *                    pWMMInfoEle;
-       u8 *                    pWMMParamEle;
+       u8                      *pWMMInfoEle;
+       u8                      *pWMMParamEle;
 
        QOS_INFO_FIELD          QosInfoField;
        AC_PARAM                AcParameter[4];
index 0310d07287a198e2e834a396fad47421b777dc92..3058120a32439dfbc1de30aee6b4bcc4d90bbea0 100644 (file)
@@ -234,12 +234,12 @@ void AdmitTS(struct ieee80211_device *ieee, PTS_COMMON_INFO pTsCommonInfo, u32 I
 }
 
 
-PTS_COMMON_INFO SearchAdmitTRStream(struct ieee80211_device *ieee, u8*      Addr, u8 TID, TR_SELECT TxRxSelect)
+PTS_COMMON_INFO SearchAdmitTRStream(struct ieee80211_device *ieee, u8 *Addr, u8 TID, TR_SELECT TxRxSelect)
 {
        //DIRECTION_VALUE       dir;
        u8      dir;
        bool                            search_dir[4] = {0, 0, 0, 0};
-       struct list_head*               psearch_list; //FIXME
+       struct list_head                *psearch_list; //FIXME
        PTS_COMMON_INFO pRet = NULL;
        if(ieee->iw_mode == IW_MODE_MASTER) //ap mode
        {
@@ -311,7 +311,7 @@ PTS_COMMON_INFO SearchAdmitTRStream(struct ieee80211_device *ieee, u8*      Addr, u8
 
 void MakeTSEntry(
                PTS_COMMON_INFO pTsCommonInfo,
-               u8*             Addr,
+               u8              *Addr,
                PTSPEC_BODY     pTSPEC,
                PQOS_TCLAS      pTCLAS,
                u8              TCLAS_Num,
@@ -326,10 +326,10 @@ void MakeTSEntry(
        memcpy(pTsCommonInfo->Addr, Addr, 6);
 
        if(pTSPEC != NULL)
-               memcpy((u8*)(&(pTsCommonInfo->TSpec)), (u8*)pTSPEC, sizeof(TSPEC_BODY));
+               memcpy((u8 *)(&(pTsCommonInfo->TSpec)), (u8 *)pTSPEC, sizeof(TSPEC_BODY));
 
        for(count = 0; count < TCLAS_Num; count++)
-               memcpy((u8*)(&(pTsCommonInfo->TClass[count])), (u8*)pTCLAS, sizeof(QOS_TCLAS));
+               memcpy((u8 *)(&(pTsCommonInfo->TClass[count])), (u8 *)pTCLAS, sizeof(QOS_TCLAS));
 
        pTsCommonInfo->TClasProc = TCLAS_Proc;
        pTsCommonInfo->TClasNum = TCLAS_Num;
@@ -337,9 +337,9 @@ void MakeTSEntry(
 
 
 bool GetTs(
-       struct ieee80211_device*        ieee,
+       struct ieee80211_device         *ieee,
        PTS_COMMON_INFO                 *ppTS,
-       u8*                             Addr,
+       u8                              *Addr,
        u8                              TID,
        TR_SELECT                       TxRxSelect,  //Rx:1, Tx:0
        bool                            bAddNewTs
@@ -367,7 +367,7 @@ bool GetTs(
                        return false;
                }
 
-               switch(TID)
+               switch (TID)
                {
                case 0:
                case 3:
@@ -416,12 +416,12 @@ bool GetTs(
                        //
                        TSPEC_BODY      TSpec;
                        PQOS_TSINFO             pTSInfo = &TSpec.f.TSInfo;
-                       struct list_head*       pUnusedList =
+                       struct list_head        *pUnusedList =
                                                                (TxRxSelect == TX_DIR)?
                                                                (&ieee->Tx_TS_Unused_List):
                                                                (&ieee->Rx_TS_Unused_List);
 
-                       struct list_head*       pAddmitList =
+                       struct list_head        *pAddmitList =
                                                                (TxRxSelect == TX_DIR)?
                                                                (&ieee->Tx_TS_Admit_List):
                                                                (&ieee->Rx_TS_Admit_List);
@@ -473,7 +473,7 @@ bool GetTs(
 }
 
 void RemoveTsEntry(
-       struct ieee80211_device*        ieee,
+       struct ieee80211_device         *ieee,
        PTS_COMMON_INFO                 pTs,
        TR_SELECT                       TxRxSelect
        )
@@ -501,7 +501,7 @@ void RemoveTsEntry(
                        list_del_init(&pRxReorderEntry->List);
                        {
                                int i = 0;
-                               struct ieee80211_rxb * prxb = pRxReorderEntry->prxb;
+                               struct ieee80211_rxb *prxb = pRxReorderEntry->prxb;
                                if (unlikely(!prxb))
                                {
                                        spin_unlock_irqrestore(&(ieee->reorder_spinlock), flags);
@@ -527,7 +527,7 @@ void RemoveTsEntry(
        }
 }
 
-void RemovePeerTS(struct ieee80211_device* ieee, u8* Addr)
+void RemovePeerTS(struct ieee80211_device *ieee, u8 *Addr)
 {
        PTS_COMMON_INFO pTS, pTmpTS;
 
@@ -574,7 +574,7 @@ void RemovePeerTS(struct ieee80211_device* ieee, u8* Addr)
        }
 }
 
-void RemoveAllTS(struct ieee80211_device* ieee)
+void RemoveAllTS(struct ieee80211_device *ieee)
 {
        PTS_COMMON_INFO pTS, pTmpTS;
 
@@ -607,7 +607,7 @@ void RemoveAllTS(struct ieee80211_device* ieee)
        }
 }
 
-void TsStartAddBaProcess(struct ieee80211_device* ieee, PTX_TS_RECORD  pTxTS)
+void TsStartAddBaProcess(struct ieee80211_device *ieee, PTX_TS_RECORD  pTxTS)
 {
        if(pTxTS->bAddBaReqInProgress == false)
        {
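
`GetTs()`, whose callers are touched throughout these hunks, is a lookup-or-allocate routine: it takes peer address, TID and direction, hands the traffic-stream record back through a pointer-to-pointer out parameter, and only creates a missing entry when the final flag is true. The calling idiom, reduced to a hedged sketch with simplified types (not the driver's structures):

	#include <stdbool.h>

	struct ts_record { int tid; /* ... */ };

	/* Assumed simplified shape of the lookup-or-allocate call. */
	bool get_ts(struct ts_record **out, const unsigned char *addr,
		    int tid, bool add_new);

	/* Typical caller, patterned on the ADDBA handlers above. */
	static int handle_addba(const unsigned char *dst, int tid)
	{
		struct ts_record *ts = NULL;

		if (!get_ts(&ts, dst, tid, true))
			return -1;	/* no matching TS: reject the request */
		/* ... operate on *ts ... */
		return 0;
	}
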
index 7e49ad8f48f6aabfcee755c20caeb14ff48a734d..d2199986d13265c524440769c1bafde2a77785b7 100644 (file)
 
 void eprom_cs(struct net_device *dev, short bit)
 {
-       if(bit)
-               write_nic_byte_E(dev, EPROM_CMD,
-                              (1<<EPROM_CS_SHIFT) | \
-                              read_nic_byte_E(dev, EPROM_CMD)); //enable EPROM
+       u8 cmdreg;
+
+       read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+       if (bit)
+               /* enable EPROM */
+               write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_CS_BIT);
        else
-               write_nic_byte_E(dev, EPROM_CMD, read_nic_byte_E(dev, EPROM_CMD)\
-                              &~(1<<EPROM_CS_SHIFT)); //disable EPROM
+               /* disable EPROM */
+               write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_CS_BIT);
 
        force_pci_posting(dev);
        udelay(EPROM_DELAY);
@@ -37,12 +39,15 @@ void eprom_cs(struct net_device *dev, short bit)
 
 void eprom_ck_cycle(struct net_device *dev)
 {
-       write_nic_byte_E(dev, EPROM_CMD,
-                      (1<<EPROM_CK_SHIFT) | read_nic_byte_E(dev,EPROM_CMD));
+       u8 cmdreg;
+
+       read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+       write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_CK_BIT);
        force_pci_posting(dev);
        udelay(EPROM_DELAY);
-       write_nic_byte_E(dev, EPROM_CMD,
-                      read_nic_byte_E(dev, EPROM_CMD) &~ (1<<EPROM_CK_SHIFT));
+
+       read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+       write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_CK_BIT);
        force_pci_posting(dev);
        udelay(EPROM_DELAY);
 }
@@ -50,12 +55,13 @@ void eprom_ck_cycle(struct net_device *dev)
 
 void eprom_w(struct net_device *dev,short bit)
 {
-       if(bit)
-               write_nic_byte_E(dev, EPROM_CMD, (1<<EPROM_W_SHIFT) | \
-                              read_nic_byte_E(dev,EPROM_CMD));
+       u8 cmdreg;
+
+       read_nic_byte_E(dev, EPROM_CMD, &cmdreg);
+       if (bit)
+               write_nic_byte_E(dev, EPROM_CMD, cmdreg | EPROM_W_BIT);
        else
-               write_nic_byte_E(dev, EPROM_CMD, read_nic_byte_E(dev,EPROM_CMD)\
-                              &~(1<<EPROM_W_SHIFT));
+               write_nic_byte_E(dev, EPROM_CMD, cmdreg & ~EPROM_W_BIT);
 
        force_pci_posting(dev);
        udelay(EPROM_DELAY);
@@ -64,12 +70,14 @@ void eprom_w(struct net_device *dev,short bit)
 
 short eprom_r(struct net_device *dev)
 {
-       short bit;
+       u8 bit;
 
-       bit=(read_nic_byte_E(dev, EPROM_CMD) & (1<<EPROM_R_SHIFT) );
+       read_nic_byte_E(dev, EPROM_CMD, &bit);
        udelay(EPROM_DELAY);
 
-       if(bit) return 1;
+       if (bit & EPROM_R_BIT)
+               return 1;
+
        return 0;
 }
 
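The eeprom bit-banging hunks above track an API change: `read_nic_byte_E()` no longer returns the byte but fills an out parameter (leaving room to report the underlying USB read's status separately), and the open-coded `1<<EPROM_*_SHIFT` masks give way to named `EPROM_*_BIT` constants. The resulting read-modify-write shape, sketched against a fake register file (accessor names and bit value are illustrative only):

	#include <stdint.h>

	#define EPROM_CS_BIT 0x08	/* illustrative value, not the driver's */

	static uint8_t regs[256];	/* fake register file for the sketch */
	static void reg_read(uint8_t a, uint8_t *v) { *v = regs[a]; }
	static void reg_write(uint8_t a, uint8_t v) { regs[a] = v; }

	static void eeprom_chip_select(uint8_t addr, int assert)
	{
		uint8_t cmd;

		reg_read(addr, &cmd);	/* fetch current value, then flip one bit */
		if (assert)
			reg_write(addr, cmd | EPROM_CS_BIT);
		else
			reg_write(addr, cmd & ~EPROM_CS_BIT);
	}
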
index cf9713fa8b9d4a1ec20631884ed55efbfe7fed94..40b14a2d1cdb29bb44b7bc1eb36e3d1910f95f50 100644 (file)
@@ -23,7 +23,7 @@
  * Return:      NONE
  * Note:       8226 support both 20M  and 40 MHz
  *---------------------------------------------------------------------------*/
-void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth)       //20M or 40M
+void PHY_SetRF8256Bandwidth(struct net_device *dev , HT_CHANNEL_WIDTH Bandwidth)       //20M or 40M
 {
        u8      eRFPath;
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -34,7 +34,7 @@ void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth)
                if (!rtl8192_phy_CheckIsLegalRFPath(dev, eRFPath))
                                continue;
 
-               switch(Bandwidth)
+               switch (Bandwidth)
                {
                        case HT_CHANNEL_WIDTH_20:
                                if(priv->card_8192_version == VERSION_819xU_A || priv->card_8192_version == VERSION_819xU_B)// 8256 D-cut, E-cut, xiong: consider it later!
@@ -73,7 +73,7 @@ void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth)
 
                                break;
                        default:
-                               RT_TRACE(COMP_ERR, "PHY_SetRF8256Bandwidth(): unknown Bandwidth: %#X\n",Bandwidth );
+                               RT_TRACE(COMP_ERR, "PHY_SetRF8256Bandwidth(): unknown Bandwidth: %#X\n",Bandwidth);
                                break;
 
                }
@@ -86,7 +86,7 @@ void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth)
  * Output:      NONE
  * Return:      NONE
  *---------------------------------------------------------------------------*/
-void PHY_RF8256_Config(struct net_device* dev)
+void PHY_RF8256_Config(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        // Initialize general global value
@@ -104,7 +104,7 @@ void PHY_RF8256_Config(struct net_device* dev)
  * Output:      NONE
  * Return:      NONE
  *---------------------------------------------------------------------------*/
-void phy_RF8256_Config_ParaFile(struct net_device* dev)
+void phy_RF8256_Config_ParaFile(struct net_device *dev)
 {
        u32     u4RegValue = 0;
        //static s1Byte                         szRadioAFile[] = RTL819X_PHY_RADIO_A;
@@ -133,7 +133,7 @@ void phy_RF8256_Config_ParaFile(struct net_device* dev)
        //      pHalData->RfReg0Value[eRFPath] =  rtl8192_phy_QueryRFReg(dev, (RF90_RADIO_PATH_E)eRFPath, rGlobalCtrl, bMaskDWord);
 
                /*----Store original RFENV control type----*/
-               switch(eRFPath)
+               switch (eRFPath)
                {
                case RF90_PATH_A:
                case RF90_PATH_C:
@@ -168,7 +168,7 @@ void phy_RF8256_Config_ParaFile(struct net_device* dev)
                RetryTimes = ConstRetryTimes;
                RF3_Final_Value = 0;
                /*----Initialize RF fom connfiguration file----*/
-               switch(eRFPath)
+               switch (eRFPath)
                {
                case RF90_PATH_A:
                        while(RF3_Final_Value!=RegValueToBeCheck && RetryTimes!=0)
@@ -209,7 +209,7 @@ void phy_RF8256_Config_ParaFile(struct net_device* dev)
                }
 
                /*----Restore RFENV control type----*/;
-               switch(eRFPath)
+               switch (eRFPath)
                {
                case RF90_PATH_A:
                case RF90_PATH_C:
@@ -237,14 +237,14 @@ phy_RF8256_Config_ParaFile_Fail:
 }
 
 
-void PHY_SetRF8256CCKTxPower(struct net_device*        dev, u8 powerlevel)
+void PHY_SetRF8256CCKTxPower(struct net_device *dev, u8 powerlevel)
 {
        u32     TxAGC=0;
        struct r8192_priv *priv = ieee80211_priv(dev);
        //modified by vivi, 20080109
        TxAGC = powerlevel;
 
-       if(priv->bDynamicTxLowPower == TRUE ) //cosa 05/22/2008 for scan
+       if(priv->bDynamicTxLowPower == TRUE) //cosa 05/22/2008 for scan
        {
                if(priv->CustomerID == RT_CID_819x_Netcore)
                        TxAGC = 0x22;
@@ -258,7 +258,7 @@ void PHY_SetRF8256CCKTxPower(struct net_device*     dev, u8 powerlevel)
 }
 
 
-void PHY_SetRF8256OFDMTxPower(struct net_device* dev, u8 powerlevel)
+void PHY_SetRF8256OFDMTxPower(struct net_device *dev, u8 powerlevel)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        //Joseph TxPower for 8192 testing
index 5c1f650fe8248a969a464e0b14300ef9c4c44bba..b64dd662761af7d0b52e5e7be32124d1e52e5678 100644 (file)
 #else
 #define RTL819X_TOTAL_RF_PATH 2 //for 8192U
 #endif
-extern void PHY_SetRF8256Bandwidth(struct net_device* dev , HT_CHANNEL_WIDTH Bandwidth);
-extern void PHY_RF8256_Config(struct net_device* dev);
-extern void phy_RF8256_Config_ParaFile(struct net_device* dev);
-extern void PHY_SetRF8256CCKTxPower(struct net_device* dev, u8 powerlevel);
-extern void PHY_SetRF8256OFDMTxPower(struct net_device* dev, u8 powerlevel);
+extern void PHY_SetRF8256Bandwidth(struct net_device *dev , HT_CHANNEL_WIDTH Bandwidth);
+extern void PHY_RF8256_Config(struct net_device *dev);
+extern void phy_RF8256_Config_ParaFile(struct net_device *dev);
+extern void PHY_SetRF8256CCKTxPower(struct net_device *dev, u8 powerlevel);
+extern void PHY_SetRF8256OFDMTxPower(struct net_device *dev, u8 powerlevel);
 
 #endif
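These prototypes keep the extern keyword, which is redundant on function declarations because they have external linkage by default; the two forms below are equivalent, and later cleanups typically drop it:

	extern void PHY_RF8256_Config(struct net_device *dev);
	void PHY_RF8256_Config(struct net_device *dev);	/* same meaning */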
index bedeb330ad4f82c7eaa743b35db960a465c008f1..338e7bc237c3d86d2f4c58978b936cf33df84fa3 100644 (file)
@@ -1,40 +1,38 @@
 /*
- This is part of rtl8187 OpenSource driver.
- Copyright (C) Andrea Merello 2004-2005  <andreamrl@tiscali.it>
- Released under the terms of GPL (General Public Licence)
-
- Parts of this driver are based on the GPL part of the
- official realtek driver
-
- Parts of this driver are based on the rtl8192 driver skeleton
- from Patric Schenke & Andres Salomon
-
- Parts of this driver are based on the Intel Pro Wireless 2100 GPL driver
-
- We want to thank the Authors of those projects and the Ndiswrapper
- project Authors.
-*/
+ * This is part of rtl8187 OpenSource driver.
+ * Copyright (C) Andrea Merello 2004-2005  <andreamrl@tiscali.it>
+ * Released under the terms of GPL (General Public Licence)
+ *
+ * Parts of this driver are based on the GPL part of the
+ * official realtek driver
+ *
+ * Parts of this driver are based on the rtl8192 driver skeleton
+ * from Patric Schenke & Andres Salomon
+ *
+ * Parts of this driver are based on the Intel Pro Wireless 2100 GPL driver
+ *
+ * We want to thank the Authors of those projects and the Ndiswrapper
+ * project Authors.
+ */
 
 #ifndef R819xU_H
 #define R819xU_H
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-//#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
-//#include <linux/pci.h>
 #include <linux/usb.h>
 #include <linux/etherdevice.h>
 #include <linux/delay.h>
-#include <linux/rtnetlink.h>   //for rtnl_lock()
+#include <linux/rtnetlink.h>
 #include <linux/wireless.h>
 #include <linux/timer.h>
-#include <linux/proc_fs.h>     // Necessary because we use the proc fs
+#include <linux/proc_fs.h>
 #include <linux/if_arp.h>
 #include <linux/random.h>
 #include <asm/io.h>
@@ -42,7 +40,7 @@
 
 #define RTL8192U
 #define RTL819xU_MODULE_NAME "rtl819xU"
-//added for HW security, john.0629
+/* HW security */
 #define FALSE 0
 #define TRUE 1
 #define MAX_KEY_LEN     61
 #define BIT30           0x40000000
 #define BIT31           0x80000000
 
-// Rx smooth factor
 #define        Rx_Smooth_Factor                20
-#define DMESG(x,a...)
-#define DMESGW(x,a...)
-#define DMESGE(x,a...)
+#define DMESG(x, a...)
+#define DMESGW(x, a...)
+#define DMESGE(x, a...)
 extern u32 rt_global_debug_component;
 #define RT_TRACE(component, x, args...) \
-do { if(rt_global_debug_component & component) \
-       printk(KERN_DEBUG RTL819xU_MODULE_NAME ":" x "\n" , \
-              ##args);\
-}while(0);
-
-#define COMP_TRACE                             BIT0            // For function call tracing.
-#define COMP_DBG                               BIT1            // Only for temporary debug message.
-#define COMP_INIT                              BIT2            // during driver initialization / halt / reset.
-
-
-#define COMP_RECV                              BIT3            // Receive data path.
-#define COMP_SEND                              BIT4            // Send part path.
-#define COMP_IO                                        BIT5            // I/O Related. Added by Annie, 2006-03-02.
-#define COMP_POWER                             BIT6            // 802.11 Power Save mode or System/Device Power state related.
-#define COMP_EPROM                             BIT7            // 802.11 link related: join/start BSS, leave BSS.
-#define COMP_SWBW                              BIT8    // For bandwidth switch.
-#define COMP_POWER_TRACKING                    BIT9    //FOR 8190 TX POWER TRACKING
-#define COMP_TURBO                             BIT10   // For Turbo Mode related. By Annie, 2005-10-21.
-#define COMP_QOS                               BIT11   // For QoS.
-#define COMP_RATE                              BIT12   // For Rate Adaptive mechanism, 2006.07.02, by rcnjko.
-#define COMP_RM                                        BIT13   // For Radio Measurement.
-#define COMP_DIG                               BIT14   // For DIG, 2006.09.25, by rcnjko.
-#define COMP_PHY                               BIT15
-#define COMP_CH                                        BIT16   //channel setting debug
-#define COMP_TXAGC                             BIT17   // For Tx power, 060928, by rcnjko.
-#define COMP_HIPWR                             BIT18   // For High Power Mechanism, 060928, by rcnjko.
-#define COMP_HALDM                             BIT19   // For HW Dynamic Mechanism, 061010, by rcnjko.
-#define COMP_SEC                               BIT20   // Event handling
-#define COMP_LED                               BIT21   // For LED.
-#define COMP_RF                                        BIT22   // For RF.
-//1!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#define COMP_RXDESC                            BIT23   // Show Rx desc information for SD3 debug. Added by Annie, 2006-07-15.
-//1//1Attention Please!!!<11n or 8190 specific code should be put below this line>
-//1!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-#define COMP_FIRMWARE                          BIT24   //for firmware downloading
-#define COMP_HT                                        BIT25   // For 802.11n HT related information. by Emily 2006-8-11
-#define COMP_AMSDU                             BIT26   // For A-MSDU Debugging
-
-#define COMP_SCAN                              BIT27
-//#define COMP_RESET                           BIT28
-#define COMP_DOWN                              BIT29  //for rm driver module
-#define COMP_RESET                             BIT30  //for silent reset
-#define COMP_ERR                               BIT31 //for error out, always on
+       do {                                                    \
+               if (rt_global_debug_component & component)      \
+                       pr_debug("RTL8192U: " x "\n", ##args);  \
+       } while (0)
+
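Besides moving to pr_debug(), the rework fixes a macro-hygiene bug: the old body ended in }while(0); so every expansion carried its own trailing semicolon. In a hypothetical unbraced if/else caller, that extra empty statement terminates the if and orphans the else:

	if (err)
		RT_TRACE(COMP_ERR, "tx failed");	/* old form expanded to do {...} while (0);; */
	else
		retry();	/* error: 'else' without a previous 'if' */

The new do { ... } while (0) form, with no trailing semicolon, expands to exactly one statement, so such callers compile as intended.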
+#define COMP_TRACE              BIT0  /* Function call tracing. */
+#define COMP_DBG                BIT1
+#define COMP_INIT               BIT2  /* Driver initialization/halt/reset. */
+
+
+#define COMP_RECV               BIT3  /* Receive data path. */
+#define COMP_SEND               BIT4  /* Send data path. */
+#define COMP_IO                 BIT5
+/* 802.11 Power Save mode or System/Device Power state. */
+#define COMP_POWER              BIT6
+/* 802.11 link related: join/start BSS, leave BSS. */
+#define COMP_EPROM              BIT7
+#define COMP_SWBW               BIT8  /* Bandwidth switch. */
+#define COMP_POWER_TRACKING     BIT9  /* 8190 TX Power Tracking */
+#define COMP_TURBO              BIT10 /* Turbo Mode */
+#define COMP_QOS                BIT11
+#define COMP_RATE               BIT12 /* Rate Adaptive mechanism */
+#define COMP_RM                 BIT13 /* Radio Measurement */
+#define COMP_DIG                BIT14
+#define COMP_PHY                BIT15
+#define COMP_CH                 BIT16 /* Channel setting debug */
+#define COMP_TXAGC              BIT17 /* Tx power */
+#define COMP_HIPWR              BIT18 /* High Power Mechanism */
+#define COMP_HALDM              BIT19 /* HW Dynamic Mechanism */
+#define COMP_SEC                BIT20 /* Event handling */
+#define COMP_LED                BIT21
+#define COMP_RF                 BIT22
+#define COMP_RXDESC             BIT23 /* Rx desc information for SD3 debug */
+
+/* 11n or 8190 specific code */
+
+#define COMP_FIRMWARE           BIT24 /* Firmware downloading */
+#define COMP_HT                 BIT25 /* 802.11n HT related information */
+#define COMP_AMSDU              BIT26 /* A-MSDU Debugging */
+#define COMP_SCAN               BIT27
+#define COMP_DOWN               BIT29 /* rm driver module */
+#define COMP_RESET              BIT30 /* Silent reset */
+#define COMP_ERR                BIT31 /* Error out, always on */
 
 #define RTL819x_DEBUG
 #ifdef RTL819x_DEBUG
-#define assert(expr) \
-       if (!(expr)) {                                  \
-               printk( "Assertion failed! %s,%s,%s,line=%d\n", \
-               #expr,__FILE__,__FUNCTION__,__LINE__);          \
-       }
-//wb added to debug out data buf
-//if you want print DATA buffer related BA, please set ieee80211_debug_level to DATA|BA
-#define RT_DEBUG_DATA(level, data, datalen)      \
-       do{ if ((rt_global_debug_component & (level)) == (level))   \
-               {       \
-                       int i;                                  \
-                       u8* pdata = (u8*) data;                 \
-                       printk(KERN_DEBUG RTL819xU_MODULE_NAME ": %s()\n", __FUNCTION__);   \
-                       for(i=0; i<(int)(datalen); i++)                 \
-                       {                                               \
+#define RTL8192U_ASSERT(expr) \
+       do {                                                            \
+               if (!(expr)) {                                          \
+                       pr_debug("Assertion failed! %s, %s, %s, line = %d\n", \
+                                #expr, __FILE__, __func__, __LINE__);  \
+               }                                                       \
+       } while (0)
+/*
+ * Debug out data buf.
+ * If you want to print DATA buffer related BA,
+ * please set ieee80211_debug_level to DATA|BA
+ */
+#define RT_DEBUG_DATA(level, data, datalen) \
+       do {                                                            \
+               if ((rt_global_debug_component & (level)) == (level)) { \
+                       int i;                                          \
+                       u8 *pdata = (u8 *) data;                        \
+                       pr_debug("RTL8192U: %s()\n", __func__);         \
+                       for (i = 0; i < (int)(datalen); i++) {          \
                                printk("%2x ", pdata[i]);               \
-                               if ((i+1)%16 == 0) printk("\n");        \
-                       }                               \
-                       printk("\n");                   \
-               }                                       \
+                               if ((i+1)%16 == 0)                      \
+                                       printk("\n");                   \
+                       }                                               \
+                       printk("\n");                                   \
+               }                                                       \
        } while (0)
 #else
-#define assert(expr) do {} while (0)
-#define RT_DEBUG_DATA(level, data, datalen) do {} while(0)
+#define RTL8192U_ASSERT(expr) do {} while (0)
+#define RT_DEBUG_DATA(level, data, datalen) do {} while (0)
 #endif /* RTL8169_DEBUG */
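A hypothetical call site for the rewritten dump macro, printing a received frame as rows of 16 hex bytes when every bit of the requested level is set in rt_global_debug_component:

	RT_DEBUG_DATA(COMP_RECV, skb->data, skb->len);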
 
 
-//
-// Queue Select Value in TxDesc
-//
+/* Queue Select Value in TxDesc */
 #define QSLT_BK                                 0x1
 #define QSLT_BE                                 0x0
 #define QSLT_VI                                 0x4
@@ -208,13 +207,13 @@ do { if(rt_global_debug_component & component) \
 
 #define IEEE80211_WATCH_DOG_TIME    2000
 #define                PHY_Beacon_RSSI_SLID_WIN_MAX            10
-//for txpowertracking by amy
+/* For Tx Power Tracking */
 #define                OFDM_Table_Length       19
 #define        CCK_Table_length        12
 
-/* for rtl819x */
+/* For rtl819x */
 typedef struct _tx_desc_819x_usb {
-       //DWORD 0
+       /* DWORD 0 */
        u16     PktSize;
        u8      Offset;
        u8      Reserved0:3;
@@ -224,7 +223,7 @@ typedef struct _tx_desc_819x_usb {
        u8      LINIP:1;
        u8      OWN:1;
 
-       //DWORD 1
+       /* DWORD 1 */
        u8      TxFWInfoSize;
        u8      RATid:3;
        u8      DISFB:1;
@@ -239,27 +238,26 @@ typedef struct _tx_desc_819x_usb {
        u8      SecDescAssign:1;
        u8      SecType:2;
 
-       //DWORD 2
+       /* DWORD 2 */
        u16     TxBufferSize;
-       //u16 Reserved2;
        u8      ResvForPaddingLen:7;
        u8      Reserved3:1;
        u8      Reserved4;
 
-       //DWORD 3, 4, 5
+       /* DWORD 3, 4, 5 */
        u32     Reserved5;
        u32     Reserved6;
        u32     Reserved7;
-}tx_desc_819x_usb, *ptx_desc_819x_usb;
+} tx_desc_819x_usb, *ptx_desc_819x_usb;
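The DWORD 0..5 comments describe the layout the hardware expects, but the bitfields still have to add up. A sketch of a compile-time check (not in the original source, and assuming the fields pack exactly as commented) that the descriptor spans six 32-bit words:

	/* BUILD_BUG_ON() must sit inside a function, e.g. in the probe path. */
	BUILD_BUG_ON(sizeof(tx_desc_819x_usb) != 24);	/* 6 DWORDs */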
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
 typedef struct _tx_desc_819x_usb_aggr_subframe {
-       //DWORD 0
+       /* DWORD 0 */
        u16     PktSize;
        u8      Offset;
        u8      TxFWInfoSize;
 
-       //DWORD 1
+       /* DWORD 1 */
        u8      RATid:3;
        u8      DISFB:1;
        u8      USERATE:1;
@@ -274,13 +272,13 @@ typedef struct _tx_desc_819x_usb_aggr_subframe {
        u8      SecType:2;
        u8      PacketID:7;
        u8      OWN:1;
-}tx_desc_819x_usb_aggr_subframe, *ptx_desc_819x_usb_aggr_subframe;
+} tx_desc_819x_usb_aggr_subframe, *ptx_desc_819x_usb_aggr_subframe;
 #endif
 
 
 
 typedef struct _tx_desc_cmd_819x_usb {
-       //DWORD 0
+       /* DWORD 0 */
        u16     Reserved0;
        u8      Reserved1;
        u8      Reserved2:3;
@@ -290,65 +288,64 @@ typedef struct _tx_desc_cmd_819x_usb {
        u8      LINIP:1;
        u8      OWN:1;
 
-       //DOWRD 1
-       //u32   Reserved3;
+       /* DWORD 1 */
        u8      TxFWInfoSize;
        u8      Reserved3;
        u8      QueueSelect;
        u8      Reserved4;
 
-       //DOWRD 2
+       /* DWORD 2 */
        u16     TxBufferSize;
        u16     Reserved5;
 
-       //DWORD 3,4,5
-       //u32   TxBufferAddr;
-       //u32   NextDescAddress;
+       /* DWORD 3, 4, 5 */
        u32     Reserved6;
        u32     Reserved7;
        u32     Reserved8;
-}tx_desc_cmd_819x_usb, *ptx_desc_cmd_819x_usb;
+} tx_desc_cmd_819x_usb, *ptx_desc_cmd_819x_usb;
 
 
 typedef struct _tx_fwinfo_819x_usb {
-       //DOWRD 0
-       u8              TxRate:7;
-       u8              CtsEnable:1;
-       u8              RtsRate:7;
-       u8              RtsEnable:1;
-       u8              TxHT:1;
-       u8              Short:1;                //Short PLCP for CCK, or short GI for 11n MCS
-       u8              TxBandwidth:1;          // This is used for HT MCS rate only.
-       u8              TxSubCarrier:2;         // This is used for legacy OFDM rate only.
-       u8              STBC:2;
-       u8              AllowAggregation:1;
-       u8              RtsHT:1;                //Interpret RtsRate field as high throughput data rate
-       u8              RtsShort:1;             //Short PLCP for CCK, or short GI for 11n MCS
-       u8              RtsBandwidth:1;         // This is used for HT MCS rate only.
-       u8              RtsSubcarrier:2;        // This is used for legacy OFDM rate only.
-       u8              RtsSTBC:2;
-       u8              EnableCPUDur:1;         //Enable firmware to recalculate and assign packet duration
-
-       //DWORD 1
-       u32             RxMF:2;
-       u32             RxAMD:3;
-       u32             TxPerPktInfoFeedback:1;//1 indicate Tx info gathtered by firmware and returned by Rx Cmd
-       u32             Reserved1:2;
-       u32             TxAGCOffSet:4;
-       u32             TxAGCSign:1;
-       u32             Tx_INFO_RSVD:6;
-       u32             PacketID:13;
-       //u32                Reserved;
-}tx_fwinfo_819x_usb, *ptx_fwinfo_819x_usb;
+       /* DWORD 0 */
+       u8      TxRate:7;
+       u8      CtsEnable:1;
+       u8      RtsRate:7;
+       u8      RtsEnable:1;
+       u8      TxHT:1;
+       u8      Short:1;        /* Short PLCP for CCK or short GI for 11n MCS */
+       u8      TxBandwidth:1;  /* Used for HT MCS rate only */
+       u8      TxSubCarrier:2; /* Used for legacy OFDM rate only */
+       u8      STBC:2;
+       u8      AllowAggregation:1;
+       /* Interpret RtsRate field as high throughput data rate */
+       u8      RtsHT:1;
+       u8      RtsShort:1;     /* Short PLCP for CCK or short GI for 11n MCS */
+       u8      RtsBandwidth:1; /* Used for HT MCS rate only */
+       u8      RtsSubcarrier:2;/* Used for legacy OFDM rate only */
+       u8      RtsSTBC:2;
+       /* Enable firmware to recalculate and assign packet duration */
+       u8      EnableCPUDur:1;
+
+       /* DWORD 1 */
+       u32     RxMF:2;
+       u32     RxAMD:3;
+       /* 1 indicates Tx info gathered by firmware and returned by Rx Cmd */
+       u32     TxPerPktInfoFeedback:1;
+       u32     Reserved1:2;
+       u32     TxAGCOffSet:4;
+       u32     TxAGCSign:1;
+       u32     Tx_INFO_RSVD:6;
+       u32     PacketID:13;
+} tx_fwinfo_819x_usb, *ptx_fwinfo_819x_usb;
 
 typedef struct rtl8192_rx_info {
        struct urb *urb;
        struct net_device *dev;
        u8 out_pipe;
-}rtl8192_rx_info ;
+} rtl8192_rx_info;
 
-typedef struct rx_desc_819x_usb{
-       //DOWRD 0
+typedef struct rx_desc_819x_usb {
+       /* DWORD 0 */
        u16                 Length:14;
        u16                 CRC32:1;
        u16                 ICV:1;
@@ -356,47 +353,32 @@ typedef struct rx_desc_819x_usb{
        u8                  Shift:2;
        u8                  PHYStatus:1;
        u8                  SWDec:1;
-       //u8                LastSeg:1;
-       //u8                FirstSeg:1;
-       //u8                EOR:1;
-       //u8                OWN:1;
        u8                  Reserved1:4;
 
-       //DWORD 1
+       /* DWORD 1 */
        u32                 Reserved2;
-
-       //DWORD 2
-       //u32               Reserved3;
-
-       //DWORD 3
-       //u32                BufferAddress;
-
-}rx_desc_819x_usb, *prx_desc_819x_usb;
+} rx_desc_819x_usb, *prx_desc_819x_usb;
 
 #ifdef USB_RX_AGGREGATION_SUPPORT
-typedef struct _rx_desc_819x_usb_aggr_subframe{
-       //DOWRD 0
+typedef struct _rx_desc_819x_usb_aggr_subframe {
+       /* DWORD 0 */
        u16                     Length:14;
        u16                     CRC32:1;
        u16                     ICV:1;
        u8                      Offset;
        u8                      RxDrvInfoSize;
-       //DOWRD 1
+       /* DWORD 1 */
        u8                      Shift:2;
        u8                      PHYStatus:1;
        u8                      SWDec:1;
        u8                      Reserved1:4;
        u8                      Reserved2;
        u16                     Reserved3;
-       //DWORD 2
-       //u4Byte                Reserved3;
-       //DWORD 3
-       //u4Byte                BufferAddress;
-}rx_desc_819x_usb_aggr_subframe, *prx_desc_819x_usb_aggr_subframe;
+} rx_desc_819x_usb_aggr_subframe, *prx_desc_819x_usb_aggr_subframe;
 #endif
 
-typedef struct rx_drvinfo_819x_usb{
-       //DWORD 0
+typedef struct rx_drvinfo_819x_usb {
+       /* DWORD 0 */
        u16                 Reserved1:12;
        u16                 PartAggr:1;
        u16                 FirstAGGR:1;
@@ -413,14 +395,15 @@ typedef struct rx_drvinfo_819x_usb{
        u8                  Bcast:1;
        u8                  Reserved4:1;
 
-       //DWORD 1
+       /* DWORD 1 */
        u32                  TSFL;
 
-}rx_drvinfo_819x_usb, *prx_drvinfo_819x_usb;
+} rx_drvinfo_819x_usb, *prx_drvinfo_819x_usb;
 
-
-#define MAX_DEV_ADDR_SIZE              8  /* support till 64 bit bus width OS */
-#define MAX_FIRMWARE_INFORMATION_SIZE   32 /*2006/04/30 by Emily forRTL8190*/
+/* Support till 64 bit bus width OS */
+#define MAX_DEV_ADDR_SIZE              8
+/* For RTL8190 */
+#define MAX_FIRMWARE_INFORMATION_SIZE   32
 #define MAX_802_11_HEADER_LENGTH        (40 + MAX_FIRMWARE_INFORMATION_SIZE)
 #define ENCRYPTION_MAX_OVERHEAD                128
 #define        USB_HWDESC_HEADER_LEN           sizeof(tx_desc_819x_usb)
@@ -438,55 +421,55 @@ typedef struct rx_drvinfo_819x_usb{
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
 #define TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES (sizeof(tx_desc_819x_usb_aggr_subframe) + sizeof(tx_fwinfo_819x_usb))
 #endif
-#define scrclng                                        4               // octets for crc32 (FCS, ICV)
+/* Octets for crc32 (FCS, ICV) */
+#define scrclng                                        4
 
-typedef enum rf_optype
-{
+typedef enum rf_optype {
        RF_OP_By_SW_3wire = 0,
        RF_OP_By_FW,
        RF_OP_MAX
-}rf_op_type;
+} rf_op_type;
 /* 8190 Loopback Mode definition */
-typedef enum _rtl819xUsb_loopback{
+typedef enum _rtl819xUsb_loopback {
        RTL819xU_NO_LOOPBACK = 0,
        RTL819xU_MAC_LOOPBACK = 1,
        RTL819xU_DMA_LOOPBACK = 2,
        RTL819xU_CCK_LOOPBACK = 3,
-}rtl819xUsb_loopback_e;
+} rtl819xUsb_loopback_e;
 
 /* due to rtl8192 firmware */
-typedef enum _desc_packet_type_e{
+typedef enum _desc_packet_type_e {
        DESC_PACKET_TYPE_INIT = 0,
        DESC_PACKET_TYPE_NORMAL = 1,
-}desc_packet_type_e;
+} desc_packet_type_e;
 
-typedef enum _firmware_status{
+typedef enum _firmware_status {
        FW_STATUS_0_INIT = 0,
        FW_STATUS_1_MOVE_BOOT_CODE = 1,
        FW_STATUS_2_MOVE_MAIN_CODE = 2,
        FW_STATUS_3_TURNON_CPU = 3,
        FW_STATUS_4_MOVE_DATA_CODE = 4,
        FW_STATUS_5_READY = 5,
-}firmware_status_e;
+} firmware_status_e;
 
 typedef struct _rt_firmare_seg_container {
        u16     seg_size;
        u8      *seg_ptr;
-}fw_seg_container, *pfw_seg_container;
-typedef struct _rt_firmware{
+} fw_seg_container, *pfw_seg_container;
+typedef struct _rt_firmware {
        firmware_status_e firmware_status;
        u16               cmdpacket_frag_thresold;
-#define RTL8190_MAX_FIRMWARE_CODE_SIZE  64000   //64k
+#define RTL8190_MAX_FIRMWARE_CODE_SIZE  64000
        u8                firmware_buf[RTL8190_MAX_FIRMWARE_CODE_SIZE];
        u16               firmware_buf_size;
-}rt_firmware, *prt_firmware;
+} rt_firmware, *prt_firmware;
 
-//+by amy 080507
-#define MAX_RECEIVE_BUFFER_SIZE        9100    // Add this to 9100 bytes to receive A-MSDU from RT-AP
+/* Increased to 9100 bytes to receive A-MSDU from RT-AP */
+#define MAX_RECEIVE_BUFFER_SIZE        9100
 
-typedef struct _rt_firmware_info_819xUsb{
+typedef struct _rt_firmware_info_819xUsb {
        u8              sz_info[16];
-}rt_firmware_info_819xUsb, *prt_firmware_info_819xUsb;
+} rt_firmware_info_819xUsb, *prt_firmware_info_819xUsb;
 
 /* Firmware Queue Layout */
 #define NUM_OF_FIRMWARE_QUEUE          10
@@ -527,8 +510,11 @@ typedef struct _rt_firmware_info_819xUsb{
 #define RSVD_FW_QUEUE_PAGE_CMD_SHIFT   0x08
 #define RSVD_FW_QUEUE_PAGE_BCN_SHIFT   0x00
 #define RSVD_FW_QUEUE_PAGE_PUB_SHIFT   0x08
-//=================================================================
-//=================================================================
+
+/*
+ * =================================================================
+ * =================================================================
+ */
 
 #define EPROM_93c46 0
 #define EPROM_93c56 1
@@ -557,7 +543,7 @@ typedef enum _WIRELESS_MODE {
 } WIRELESS_MODE;
 
 
-#define RTL_IOCTL_WPA_SUPPLICANT               SIOCIWFIRSTPRIV+30
+#define RTL_IOCTL_WPA_SUPPLICANT               (SIOCIWFIRSTPRIV + 30)
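The added parentheses are the standard guard against precedence surprises when a macro expands inside a larger expression; a hypothetical illustration:

	#define UNSAFE	SIOCIWFIRSTPRIV+30	/* old, unparenthesized form */
	/* "UNSAFE * 2" expands to "SIOCIWFIRSTPRIV + 30 * 2", i.e.
	 * SIOCIWFIRSTPRIV + 60 -- not (SIOCIWFIRSTPRIV + 30) * 2. */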
 
 typedef struct buffer {
        struct buffer *next;
@@ -565,7 +551,7 @@ typedef struct buffer {
 
 } buffer;
 
-typedef struct rtl_reg_debug{
+typedef struct rtl_reg_debug {
        unsigned int  cmd;
        struct {
                unsigned char type;
@@ -574,7 +560,7 @@ typedef struct rtl_reg_debug{
                unsigned char length;
        } head;
        unsigned char buf[0xff];
-}rtl_reg_debug;
+} rtl_reg_debug;
 
 
 
@@ -584,58 +570,45 @@ typedef struct rtl_reg_debug{
 typedef struct _rt_9x_tx_rate_history {
        u32             cck[4];
        u32             ofdm[8];
-       // HT_MCS[0][]: BW=0 SG=0
-       // HT_MCS[1][]: BW=1 SG=0
-       // HT_MCS[2][]: BW=0 SG=1
-       // HT_MCS[3][]: BW=1 SG=1
        u32             ht_mcs[4][16];
-}rt_tx_rahis_t, *prt_tx_rahis_t;
+} rt_tx_rahis_t, *prt_tx_rahis_t;
 typedef struct _RT_SMOOTH_DATA_4RF {
-       char    elements[4][100];//array to store values
-       u32     index;                  //index to current array to store
-       u32     TotalNum;               //num of valid elements
-       u32     TotalVal[4];            //sum of valid elements
-}RT_SMOOTH_DATA_4RF, *PRT_SMOOTH_DATA_4RF;
-
-#define MAX_8192U_RX_SIZE                      8192    // This maybe changed for D-cut larger aggregation size
-//stats seems messed up, clean it ASAP
+       char    elements[4][100]; /* array to store values */
+       u32     index;            /* index to current array to store */
+       u32     TotalNum;         /* num of valid elements */
+       u32     TotalVal[4];      /* sum of valid elements */
+} RT_SMOOTH_DATA_4RF, *PRT_SMOOTH_DATA_4RF;
+
+/* This may be changed for D-cut's larger aggregation size */
+#define MAX_8192U_RX_SIZE                      8192
+/* Stats seems messed up, clean it ASAP */
 typedef struct Stats {
        unsigned long txrdu;
-//     unsigned long rxrdu;
-       //unsigned long rxnolast;
-       //unsigned long rxnodata;
-//     unsigned long rxreset;
-//     unsigned long rxnopointer;
        unsigned long rxok;
        unsigned long rxframgment;
        unsigned long rxurberr;
        unsigned long rxstaterr;
-       unsigned long received_rate_histogram[4][32];   //0: Total, 1:OK, 2:CRC, 3:ICV, 2007 07 03 cosa
-       unsigned long received_preamble_GI[2][32];              //0: Long preamble/GI, 1:Short preamble/GI
-       unsigned long rx_AMPDUsize_histogram[5]; // level: (<4K), (4K~8K), (8K~16K), (16K~32K), (32K~64K)
-       unsigned long rx_AMPDUnum_histogram[5]; // level: (<5), (5~10), (10~20), (20~40), (>40)
-       unsigned long numpacket_matchbssid;     // debug use only.
-       unsigned long numpacket_toself;         // debug use only.
-       unsigned long num_process_phyinfo;              // debug use only.
+       /* 0: Total, 1: OK, 2: CRC, 3: ICV */
+       unsigned long received_rate_histogram[4][32];
+       /* 0: Long preamble/GI, 1: Short preamble/GI */
+       unsigned long received_preamble_GI[2][32];
+       /* level: (<4K), (4K~8K), (8K~16K), (16K~32K), (32K~64K) */
+       unsigned long rx_AMPDUsize_histogram[5];
+       /* level: (<5), (5~10), (10~20), (20~40), (>40) */
+       unsigned long rx_AMPDUnum_histogram[5];
+       unsigned long numpacket_matchbssid;
+       unsigned long numpacket_toself;
+       unsigned long num_process_phyinfo;
        unsigned long numqry_phystatus;
        unsigned long numqry_phystatusCCK;
        unsigned long numqry_phystatusHT;
-       unsigned long received_bwtype[5];              //0: 20M, 1: funn40M, 2: upper20M, 3: lower20M, 4: duplicate
+       /* 0: 20M, 1: full 40M, 2: upper 20M, 3: lower 20M, 4: duplicate */
+       unsigned long received_bwtype[5];
        unsigned long txnperr;
        unsigned long txnpdrop;
        unsigned long txresumed;
-//     unsigned long rxerr;
-//     unsigned long rxoverflow;
-//     unsigned long rxint;
        unsigned long txnpokint;
-//     unsigned long txhpokint;
-//     unsigned long txhperr;
-//     unsigned long ints;
-//     unsigned long shints;
        unsigned long txoverflow;
-//     unsigned long rxdmafail;
-//     unsigned long txbeacon;
-//     unsigned long txbeaconerr;
        unsigned long txlpokint;
        unsigned long txlpdrop;
        unsigned long txlperr;
@@ -684,30 +657,35 @@ typedef struct Stats {
        u8            last_packet_rate;
        unsigned long slide_signal_strength[100];
        unsigned long slide_evm[100];
-       unsigned long slide_rssi_total; // For recording sliding window's RSSI value
-       unsigned long slide_evm_total;  // For recording sliding window's EVM value
-       long signal_strength; // Transformed, in dbm. Beautified signal strength for UI, not correct.
+       /* For recording sliding window's RSSI value */
+       unsigned long slide_rssi_total;
+       /* For recording sliding window's EVM value */
+       unsigned long slide_evm_total;
+       /* Transformed in dbm. Beautified signal strength for UI, not correct */
+       long signal_strength;
        long signal_quality;
        long last_signal_strength_inpercent;
-       long recv_signal_power; // Correct smoothed ss in Dbm, only used in driver to report real power now.
+       /* Correct smoothed ss in dbm, only used in driver
+        * to report real power now */
+       long recv_signal_power;
        u8 rx_rssi_percentage[4];
        u8 rx_evm_percentage[2];
        long rxSNRdB[4];
        rt_tx_rahis_t txrate;
-       u32 Slide_Beacon_pwdb[100];     //cosa add for beacon rssi
-       u32 Slide_Beacon_Total;         //cosa add for beacon rssi
+       /* For beacon RSSI */
+       u32 Slide_Beacon_pwdb[100];
+       u32 Slide_Beacon_Total;
        RT_SMOOTH_DATA_4RF              cck_adc_pwdb;
 
        u32     CurrentShowTxate;
 } Stats;
 
 
-// Bandwidth Offset
+/* Bandwidth Offset */
 #define HAL_PRIME_CHNL_OFFSET_DONT_CARE                0
 #define HAL_PRIME_CHNL_OFFSET_LOWER                    1
 #define HAL_PRIME_CHNL_OFFSET_UPPER                    2
 
-//+by amy 080507
 
 typedef struct ChnlAccessSetting {
        u16 SIFS_Timer;
@@ -716,35 +694,62 @@ typedef struct    ChnlAccessSetting {
        u16 EIFS_Timer;
        u16 CWminIndex;
        u16 CWmaxIndex;
-}*PCHANNEL_ACCESS_SETTING,CHANNEL_ACCESS_SETTING;
-
-typedef struct _BB_REGISTER_DEFINITION{
-       u32 rfintfs;                    // set software control: //             0x870~0x877[8 bytes]
-       u32 rfintfi;                    // readback data: //            0x8e0~0x8e7[8 bytes]
-       u32 rfintfo;                    // output data: //              0x860~0x86f [16 bytes]
-       u32 rfintfe;                    // output enable: //            0x860~0x86f [16 bytes]
-       u32 rf3wireOffset;              // LSSI data: //                0x840~0x84f [16 bytes]
-       u32 rfLSSI_Select;              // BB Band Select: //           0x878~0x87f [8 bytes]
-       u32 rfTxGainStage;              // Tx gain stage: //            0x80c~0x80f [4 bytes]
-       u32 rfHSSIPara1;                // wire parameter control1 : //         0x820~0x823,0x828~0x82b, 0x830~0x833, 0x838~0x83b [16 bytes]
-       u32 rfHSSIPara2;                // wire parameter control2 : //         0x824~0x827,0x82c~0x82f, 0x834~0x837, 0x83c~0x83f [16 bytes]
-       u32 rfSwitchControl;    //Tx Rx antenna control : //            0x858~0x85f [16 bytes]
-       u32 rfAGCControl1;      //AGC parameter control1 : //           0xc50~0xc53,0xc58~0xc5b, 0xc60~0xc63, 0xc68~0xc6b [16 bytes]
-       u32 rfAGCControl2;      //AGC parameter control2 : //           0xc54~0xc57,0xc5c~0xc5f, 0xc64~0xc67, 0xc6c~0xc6f [16 bytes]
-       u32 rfRxIQImbalance;    //OFDM Rx IQ imbalance matrix : //              0xc14~0xc17,0xc1c~0xc1f, 0xc24~0xc27, 0xc2c~0xc2f [16 bytes]
-       u32 rfRxAFE;                    //Rx IQ DC ofset and Rx digital filter, Rx DC notch filter : //         0xc10~0xc13,0xc18~0xc1b, 0xc20~0xc23, 0xc28~0xc2b [16 bytes]
-       u32 rfTxIQImbalance;    //OFDM Tx IQ imbalance matrix //                0xc80~0xc83,0xc88~0xc8b, 0xc90~0xc93, 0xc98~0xc9b [16 bytes]
-       u32 rfTxAFE;                    //Tx IQ DC Offset and Tx DFIR type //           0xc84~0xc87,0xc8c~0xc8f, 0xc94~0xc97, 0xc9c~0xc9f [16 bytes]
-       u32 rfLSSIReadBack;     //LSSI RF readback data //              0x8a0~0x8af [16 bytes]
-}BB_REGISTER_DEFINITION_T, *PBB_REGISTER_DEFINITION_T;
-
-typedef enum _RT_RF_TYPE_819xU{
+} *PCHANNEL_ACCESS_SETTING, CHANNEL_ACCESS_SETTING;
+
+typedef struct _BB_REGISTER_DEFINITION {
+       /* set software control:        0x870~0x877 [8 bytes]  */
+       u32 rfintfs;
+       /* readback data:               0x8e0~0x8e7 [8 bytes]  */
+       u32 rfintfi;
+       /* output data:                 0x860~0x86f [16 bytes] */
+       u32 rfintfo;
+       /* output enable:               0x860~0x86f [16 bytes] */
+       u32 rfintfe;
+       /* LSSI data:                   0x840~0x84f [16 bytes] */
+       u32 rf3wireOffset;
+       /* BB Band Select:              0x878~0x87f [8 bytes]  */
+       u32 rfLSSI_Select;
+       /* Tx gain stage:               0x80c~0x80f [4 bytes]  */
+       u32 rfTxGainStage;
+       /* wire parameter control1:     0x820~0x823, 0x828~0x82b,
+        *                              0x830~0x833, 0x838~0x83b [16 bytes] */
+       u32 rfHSSIPara1;
+       /* wire parameter control2:     0x824~0x827, 0x82c~0x82f,
+        *                              0x834~0x837, 0x83c~0x83f [16 bytes] */
+       u32 rfHSSIPara2;
+       /* Tx Rx antenna control:       0x858~0x85f [16 bytes] */
+       u32 rfSwitchControl;
+       /* AGC parameter control1:      0xc50~0xc53, 0xc58~0xc5b,
+        *                              0xc60~0xc63, 0xc68~0xc6b [16 bytes] */
+       u32 rfAGCControl1;
+       /* AGC parameter control2:      0xc54~0xc57, 0xc5c~0xc5f,
+        *                              0xc64~0xc67, 0xc6c~0xc6f [16 bytes] */
+       u32 rfAGCControl2;
+       /* OFDM Rx IQ imbalance matrix: 0xc14~0xc17, 0xc1c~0xc1f,
+        *                              0xc24~0xc27, 0xc2c~0xc2f [16 bytes] */
+       u32 rfRxIQImbalance;
+       /* Rx IQ DC offset and Rx digital filter, Rx DC notch filter:
+        *                              0xc10~0xc13, 0xc18~0xc1b,
+        *                              0xc20~0xc23, 0xc28~0xc2b [16 bytes] */
+       u32 rfRxAFE;
+       /* OFDM Tx IQ imbalance matrix: 0xc80~0xc83, 0xc88~0xc8b,
+        *                              0xc90~0xc93, 0xc98~0xc9b [16 bytes] */
+       u32 rfTxIQImbalance;
+       /* Tx IQ DC Offset and Tx DFIR type:
+        *                              0xc84~0xc87, 0xc8c~0xc8f,
+        *                              0xc94~0xc97, 0xc9c~0xc9f [16 bytes] */
+       u32 rfTxAFE;
+       /* LSSI RF readback data:       0x8a0~0x8af [16 bytes] */
+       u32 rfLSSIReadBack;
+} BB_REGISTER_DEFINITION_T, *PBB_REGISTER_DEFINITION_T;
+
+typedef enum _RT_RF_TYPE_819xU {
        RF_TYPE_MIN = 0,
        RF_8225,
        RF_8256,
        RF_8258,
        RF_PSEUDO_11N = 4,
-}RT_RF_TYPE_819xU, *PRT_RF_TYPE_819xU;
+} RT_RF_TYPE_819xU, *PRT_RF_TYPE_819xU;
 
 typedef struct _rate_adaptive {
        u8                              rate_adaptive_disabled;
@@ -762,9 +767,9 @@ typedef struct _rate_adaptive {
        u32                             low_rssi_threshold_ratr;
        u32                             low_rssi_threshold_ratr_40M;
        u32                             low_rssi_threshold_ratr_20M;
-       u8                              ping_rssi_enable;       //cosa add for test
-       u32                             ping_rssi_ratr; //cosa add for test
-       u32                             ping_rssi_thresh_for_ra;//cosa add for test
+       u8                              ping_rssi_enable;
+       u32                             ping_rssi_ratr;
+       u32                             ping_rssi_thresh_for_ra;
        u32                             last_ratr;
 
 } rate_adaptive, *prate_adaptive;
@@ -778,9 +783,9 @@ typedef struct _txbbgain_struct {
 } txbbgain_struct, *ptxbbgain_struct;
 
 typedef struct _ccktxbbgain_struct {
-       //The Value is from a22 to a29 one Byte one time is much Safer
+       /* The value is from a22 to a29, one byte one time is much safer */
        u8      ccktxbb_valuearray[8];
-} ccktxbbgain_struct,*pccktxbbgain_struct;
+} ccktxbbgain_struct, *pccktxbbgain_struct;
 
 
 typedef struct _init_gain {
@@ -791,7 +796,6 @@ typedef struct _init_gain {
        u8                              cca;
 
 } init_gain, *pinit_gain;
-//by amy 0606
 
 typedef struct _phy_ofdm_rx_status_report_819xusb {
        u8      trsw_gain_X[4];
@@ -807,26 +811,26 @@ typedef struct _phy_ofdm_rx_status_report_819xusb {
        u8      max_ex_pwr;
        u8      sgi_en;
        u8  rxsc_sgien_exflg;
-}phy_sts_ofdm_819xusb_t;
+} phy_sts_ofdm_819xusb_t;
 
 typedef struct _phy_cck_rx_status_report_819xusb {
-       /* For CCK rate descriptor. This is a unsigned 8:1 variable. LSB bit presend
-          0.5. And MSB 7 bts presend a signed value. Range from -64~+63.5. */
+       /* For CCK rate descriptor. This is an unsigned 8:1 fixed-point
+        * value: the LSB represents 0.5 and the 7 MSBs hold a signed
+        * value. Range: -64 to +63.5. */
        u8      adc_pwdb_X[4];
        u8      sq_rpt;
        u8      cck_agc_rpt;
-}phy_sts_cck_819xusb_t;
+} phy_sts_cck_819xusb_t;
 
 
-typedef struct _phy_ofdm_rx_status_rxsc_sgien_exintfflag{
+typedef struct _phy_ofdm_rx_status_rxsc_sgien_exintfflag {
        u8                      reserved:4;
        u8                      rxsc:2;
        u8                      sgi_en:1;
        u8                      ex_intf_flag:1;
-}phy_ofdm_rx_status_rxsc_sgien_exintfflag;
+} phy_ofdm_rx_status_rxsc_sgien_exintfflag;
 
-typedef enum _RT_CUSTOMER_ID
-{
+typedef enum _RT_CUSTOMER_ID {
        RT_CID_DEFAULT = 0,
        RT_CID_8187_ALPHA0 = 1,
        RT_CID_8187_SERCOMM_PS = 2,
@@ -836,25 +840,28 @@ typedef enum _RT_CUSTOMER_ID
        RT_CID_819x_CAMEO  = 6,
        RT_CID_819x_RUNTOP = 7,
        RT_CID_819x_Senao = 8,
-       RT_CID_TOSHIBA = 9,     // Merge by Jacken, 2008/01/31.
+       RT_CID_TOSHIBA = 9,
        RT_CID_819x_Netcore = 10,
        RT_CID_Nettronix = 11,
        RT_CID_DLINK = 12,
        RT_CID_PRONET = 13,
-}RT_CUSTOMER_ID, *PRT_CUSTOMER_ID;
+} RT_CUSTOMER_ID, *PRT_CUSTOMER_ID;
 
-//================================================================================
-// LED customization.
-//================================================================================
-
-typedef        enum _LED_STRATEGY_8190{
-       SW_LED_MODE0, // SW control 1 LED via GPIO0. It is default option.
-       SW_LED_MODE1, // SW control for PCI Express
-       SW_LED_MODE2, // SW control for Cameo.
-       SW_LED_MODE3, // SW contorl for RunTop.
-       SW_LED_MODE4, // SW control for Netcore
-       HW_LED, // HW control 2 LEDs, LED0 and LED1 (there are 4 different control modes)
-}LED_STRATEGY_8190, *PLED_STRATEGY_8190;
+/*
+ * ==========================================================================
+ * LED customization.
+ * ==========================================================================
+ */
+
+typedef        enum _LED_STRATEGY_8190 {
+       SW_LED_MODE0, /* SW control 1 LED via GPIO0. It is default option. */
+       SW_LED_MODE1, /* SW control for PCI Express */
+       SW_LED_MODE2, /* SW control for Cameo. */
+       SW_LED_MODE3, /* SW control for RunTop. */
+       SW_LED_MODE4, /* SW control for Netcore. */
+       /* HW control 2 LEDs, LED0 and LED1 (4 different control modes) */
+       HW_LED,
+} LED_STRATEGY_8190, *PLED_STRATEGY_8190;
 
 typedef enum _RESET_TYPE {
        RESET_TYPE_NORESET = 0x00,
@@ -863,7 +870,7 @@ typedef enum _RESET_TYPE {
 } RESET_TYPE;
 
 /* The simple tx command OP code. */
-typedef enum _tag_TxCmd_Config_Index{
+typedef enum _tag_TxCmd_Config_Index {
        TXCMD_TXRA_HISTORY_CTRL                         = 0xFF900000,
        TXCMD_RESET_TX_PKT_BUFF                         = 0xFF900001,
        TXCMD_RESET_RX_PKT_BUFF                         = 0xFF900002,
@@ -871,11 +878,11 @@ typedef enum _tag_TxCmd_Config_Index{
        TXCMD_SET_RX_RSSI                                               = 0xFF900004,
        TXCMD_SET_TX_PWR_TRACKING                       = 0xFF900005,
        TXCMD_XXXX_CTRL,
-}DCMD_TXCMD_OP;
+} DCMD_TXCMD_OP;
 
 typedef struct r8192_priv {
        struct usb_device *udev;
-       //added for maintain info from eeprom
+       /* For maintain info from eeprom */
        short epromtype;
        u16 eeprom_vid;
        u16 eeprom_pid;
@@ -887,105 +894,81 @@ typedef struct r8192_priv {
        int irq;
        struct ieee80211_device *ieee80211;
 
-       short card_8192; /* O: rtl8192, 1:rtl8185 V B/C, 2:rtl8185 V D */
-       u8 card_8192_version; /* if TCR reports card V B/C this discriminates */
-//     short phy_ver; /* meaningful for rtl8225 1:A 2:B 3:C */
+       /* 0: rtl8192, 1: rtl8185 V B/C, 2: rtl8185 V D */
+       short card_8192;
+       /* If TCR reports card V B/C, this discriminates */
+       u8 card_8192_version;
        short enable_gpio0;
-       enum card_type {PCI,MINIPCI,CARDBUS,USB}card_type;
+       enum card_type {
+               PCI, MINIPCI, CARDBUS, USB
+       } card_type;
        short hw_plcp_len;
        short plcp_preamble_mode;
 
        spinlock_t irq_lock;
-//     spinlock_t irq_th_lock;
        spinlock_t tx_lock;
        struct mutex mutex;
-       //spinlock_t rf_lock; //used to lock rf write operation added by wb
 
        u16 irq_mask;
-//     short irq_enabled;
-//     struct net_device *dev; //comment this out.
        short chan;
        short sens;
        short max_sens;
 
 
-       //      u8 chtxpwr[15]; //channels from 1 to 14, 0 not used
-//     u8 chtxpwr_ofdm[15]; //channels from 1 to 14, 0 not used
-//     u8 cck_txpwr_base;
-//     u8 ofdm_txpwr_base;
-//     u8 challow[15]; //channels from 1 to 14, 0 not used
        short up;
-       short crcmon; //if 1 allow bad crc frame reception in monitor mode
-//     short prism_hdr;
-
-//     struct timer_list scan_timer;
-       /*short scanpending;
-       short stopscan;*/
-//     spinlock_t scan_lock;
-//     u8 active_probe;
-       //u8 active_scan_num;
+       /* If 1, allow bad CRC frame reception in monitor mode */
+       short crcmon;
+
        struct semaphore wx_sem;
-       struct semaphore rf_sem; //used to lock rf write operation added by wb, modified by david
-//     short hw_wep;
-
-//     short digphy;
-//     short antb;
-//     short diversity;
-//     u8 cs_treshold;
-//     short rcr_csense;
-       u8 rf_type; //0 means 1T2R, 1 means 2T4R
+       struct semaphore rf_sem;        /* Used to lock rf write operation */
+
+       u8 rf_type;                     /* 0: 1T2R, 1: 2T4R */
        RT_RF_TYPE_819xU rf_chip;
 
-//     u32 key0[4];
-       short (*rf_set_sens)(struct net_device *dev,short sens);
-       u8 (*rf_set_chan)(struct net_device *dev,u8 ch);
+       short (*rf_set_sens)(struct net_device *dev, short sens);
+       u8 (*rf_set_chan)(struct net_device *dev, u8 ch);
        void (*rf_close)(struct net_device *dev);
        void (*rf_init)(struct net_device *dev);
-       //short rate;
        short promisc;
-       /*stats*/
+       /* Stats */
        struct Stats stats;
        struct iw_statistics wstats;
 
-       /*RX stuff*/
-//     u32 *rxring;
-//     u32 *rxringtail;
-//     dma_addr_t rxringdma;
+       /* RX stuff */
        struct urb **rx_urb;
        struct urb **rx_cmd_urb;
 #ifdef THOMAS_BEACON
        u32 *oldaddr;
 #endif
 #ifdef THOMAS_TASKLET
-       atomic_t irt_counter;//count for irq_rx_tasklet
+       atomic_t irt_counter; /* count for irq_rx_tasklet */
 #endif
 #ifdef JACKSON_NEW_RX
        struct sk_buff **pp_rxskb;
        int     rx_inx;
 #endif
 
-/* modified by davad for Rx process */
        struct sk_buff_head rx_queue;
        struct sk_buff_head skb_queue;
        struct work_struct qos_activate;
        short  tx_urb_index;
-       atomic_t tx_pending[0x10];//UART_PRIORITY+1
+       atomic_t tx_pending[0x10]; /* UART_PRIORITY + 1 */
 
 
        struct tasklet_struct irq_rx_tasklet;
        struct urb *rxurb_task;
 
-       //2 Tx Related variables
+       /* Tx Related variables */
        u16     ShortRetryLimit;
        u16     LongRetryLimit;
        u32     TransmitConfig;
-       u8      RegCWinMin;             // For turbo mode CW adaptive. Added by Annie, 2005-10-27.
+       u8      RegCWinMin;     /* For turbo mode CW adaptive */
 
        u32     LastRxDescTSFHigh;
        u32     LastRxDescTSFLow;
 
 
-       //2 Rx Related variables
+       /* Rx Related variables */
        u16     EarlyRxThreshold;
        u32     ReceiveConfig;
        u8      AcmControl;
@@ -1000,13 +983,13 @@ typedef struct r8192_priv {
        struct work_struct reset_wq;
 
 /**********************************************************/
-       //for rtl819xUsb
+       /* For rtl819xUsb */
        u16     basic_rate;
        u8      short_preamble;
        u8      slot_time;
        bool    bDcut;
        bool bCurrentRxAggrEnable;
-       u8 Rf_Mode; //add for Firmware RF -R/W switch
+       u8 Rf_Mode;     /* For Firmware RF R/W switch */
        prt_firmware            pFirmware;
        rtl819xUsb_loopback_e   LoopbackMode;
        u16 EEPROMTxPowerDiff;
@@ -1014,71 +997,70 @@ typedef struct r8192_priv {
        u8 EEPROMPwDiff;
        u8 EEPROMCrystalCap;
        u8 EEPROM_Def_Ver;
-       u8 EEPROMTxPowerLevelCCK;// CCK channel 1~14
+       u8 EEPROMTxPowerLevelCCK;               /* CCK channel 1~14 */
        u8 EEPROMTxPowerLevelCCK_V1[3];
-       u8 EEPROMTxPowerLevelOFDM24G[3]; // OFDM 2.4G channel 1~14
-       u8 EEPROMTxPowerLevelOFDM5G[24];        // OFDM 5G
+       u8 EEPROMTxPowerLevelOFDM24G[3];        /* OFDM 2.4G channel 1~14 */
+       u8 EEPROMTxPowerLevelOFDM5G[24];        /* OFDM 5G */
 
-/*PHY related*/
-       BB_REGISTER_DEFINITION_T        PHYRegDef[4];   //Radio A/B/C/D
-       // Read/write are allow for following hardware information variables
+       /* PHY related */
+       BB_REGISTER_DEFINITION_T PHYRegDef[4];  /* Radio A/B/C/D */
+       /* Read/write allowed for the following hardware information variables */
        u32     MCSTxPowerLevelOriginalOffset[6];
        u32     CCKTxPowerLevelOriginalOffset;
-       u8      TxPowerLevelCCK[14];                    // CCK channel 1~14
-       u8      TxPowerLevelOFDM24G[14];                // OFDM 2.4G channel 1~14
-       u8      TxPowerLevelOFDM5G[14];                 // OFDM 5G
+       u8      TxPowerLevelCCK[14];            /* CCK channel 1~14 */
+       u8      TxPowerLevelOFDM24G[14];        /* OFDM 2.4G channel 1~14 */
+       u8      TxPowerLevelOFDM5G[14];         /* OFDM 5G */
        u32     Pwr_Track;
        u8      TxPowerDiff;
-       u8      AntennaTxPwDiff[2];                             // Antenna gain offset, index 0 for B, 1 for C, and 2 for D
-       u8      CrystalCap;                                             // CrystalCap.
-       u8      ThermalMeter[2];                                // ThermalMeter, index 0 for RFIC0, and 1 for RFIC1
+       u8      AntennaTxPwDiff[2]; /* Antenna gain offset, 0: B, 1: C, 2: D */
+       u8      CrystalCap;
+       u8      ThermalMeter[2];    /* index 0: RFIC0, index 1: RFIC1 */
 
        u8      CckPwEnl;
-       // Use to calculate PWBD.
+       /* Used to calculate PWDB */
        u8      bCckHighPower;
        long    undecorated_smoothed_pwdb;
 
-       //for set channel
+       /* For set channel */
        u8      SwChnlInProgress;
        u8      SwChnlStage;
        u8      SwChnlStep;
        u8      SetBWModeInProgress;
        HT_CHANNEL_WIDTH                CurrentChannelBW;
        u8      ChannelPlan;
-       // 8190 40MHz mode
-       //
-       u8      nCur40MhzPrimeSC;       // Control channel sub-carrier
-       // Joseph test for shorten RF configuration time.
-       // We save RF reg0 in this variable to reduce RF reading.
-       //
+       /* 8190 40MHz mode */
+       /* Control channel sub-carrier */
+       u8      nCur40MhzPrimeSC;
+       /* To shorten RF configuration time,
+        * we save RF reg0 in this variable to reduce RF reading. */
        u32                                     RfReg0Value[4];
        u8                                      NumTotalRFPath;
        bool                            brfpath_rxenable[4];
-       //RF set related
+       /* RF set related */
        bool                            SetRFPowerStateInProgress;
-//+by amy 080507
        struct timer_list watch_dog_timer;
 
-//+by amy 080515 for dynamic mechenism
-       //Add by amy Tx Power Control for Near/Far Range 2008/05/15
-       bool    bdynamic_txpower;  //bDynamicTxPower
-       bool    bDynamicTxHighPower;  // Tx high power state
-       bool    bDynamicTxLowPower;  // Tx low power state
+       /* For dynamic mechanism */
+       /* Tx Power Control for Near/Far Range */
+       bool    bdynamic_txpower;
+       bool    bDynamicTxHighPower;
+       bool    bDynamicTxLowPower;
        bool    bLastDTPFlag_High;
        bool    bLastDTPFlag_Low;
 
        bool    bstore_last_dtpflag;
-       bool    bstart_txctrl_bydtp;   //Define to discriminate on High power State or on sitesuvey to change Tx gain index
-       //Add by amy for Rate Adaptive
+       /* Discriminates between high power state and site survey
+        * when changing the Tx gain index */
+       bool    bstart_txctrl_bydtp;
        rate_adaptive rate_adaptive;
-       //Add by amy for TX power tracking
-       //2008/05/15  Mars OPEN/CLOSE TX POWER TRACKING
-       txbbgain_struct txbbgain_table[TxBBGainTableLength];
-       u8                         txpower_count;//For 6 sec do tracking again
-       bool                       btxpower_trackingInit;
-       u8                         OFDM_index;
-       u8                         CCK_index;
-       //2007/09/10 Mars Add CCK TX Power Tracking
+       /* TX power tracking
+        * OPEN/CLOSE TX POWER TRACKING */
+       txbbgain_struct txbbgain_table[TxBBGainTableLength];
+       u8              txpower_count; /* For 6 sec do tracking again */
+       bool            btxpower_trackingInit;
+       u8              OFDM_index;
+       u8              CCK_index;
+       /* CCK TX Power Tracking */
        ccktxbbgain_struct      cck_txbbgain_table[CCKTxBBGainTableLength];
        ccktxbbgain_struct      cck_txbbgain_ch14_table[CCKTxBBGainTableLength];
        u8 rfa_txpowertrackingindex;
@@ -1095,15 +1077,14 @@ typedef struct r8192_priv {
        bool bcck_in_ch14;
        bool btxpowerdata_readfromEEPORM;
        u16     TSSI_13dBm;
-       //For Backup Initial Gain
        init_gain initgain_backup;
        u8 DefaultInitialGain[4];
-       // For EDCA Turbo mode, Added by amy 080515.
+       /* For EDCA Turbo mode */
        bool            bis_any_nonbepkts;
        bool            bcurrent_turbo_EDCA;
        bool            bis_cur_rdlstate;
        struct timer_list fsync_timer;
-       bool bfsync_processing; // 500ms Fsync timer is active or not
+       bool bfsync_processing; /* 500ms Fsync timer is active or not */
        u32     rate_record;
        u32     rateCountDiffRecord;
        u32     ContinueDiffCount;
@@ -1112,17 +1093,14 @@ typedef struct r8192_priv {
        u8      framesync;
        u32     framesyncC34;
        u8      framesyncMonitor;
-               //Added by amy 080516  for RX related
        u16     nrxAMPDU_size;
        u8      nrxAMPDU_aggr_num;
 
-       //by amy for gpio
+       /* For gpio */
         bool bHwRadioOff;
 
-       //by amy for reset_count
        u32 reset_count;
        bool bpbc_pressed;
-       //by amy for debug
        u32 txpower_checkcnt;
        u32 txpower_tracking_callback_cnt;
        u8 thermal_read_val[40];
@@ -1131,7 +1109,7 @@ typedef struct r8192_priv {
        u32 ccktxpower_adjustcnt_ch14;
        u8 tx_fwinfo_force_subcarriermode;
        u8 tx_fwinfo_force_subcarrierval;
-       //by amy for silent reset
+       /* For silent reset */
        RESET_TYPE      ResetProgress;
        bool            bForcedSilentReset;
        bool            bDisableNormalResetCheck;
@@ -1144,7 +1122,7 @@ typedef struct r8192_priv {
 
        u16             SifsTime;
 
-       //define work item by amy 080526
+       /* Define work item */
 
        struct delayed_work update_beacon_wq;
        struct delayed_work watch_dog_wq;
@@ -1153,42 +1131,32 @@ typedef struct r8192_priv {
        struct delayed_work gpio_change_rf_wq;
        struct delayed_work initialgain_operate_wq;
        struct workqueue_struct *priv_wq;
-}r8192_priv;
+} r8192_priv;
 
-// for rtl8187
-// now mirging to rtl8187B
-/*
-typedef enum{
-       LOW_PRIORITY = 0x02,
-       NORM_PRIORITY
-       } priority_t;
-*/
-//for rtl8187B
+/* For rtl8187B */
 typedef enum{
        BULK_PRIORITY = 0x01,
-       //RSVD0,
-       //RSVD1,
        LOW_PRIORITY,
        NORM_PRIORITY,
        VO_PRIORITY,
-       VI_PRIORITY, //0x05
+       VI_PRIORITY,
        BE_PRIORITY,
        BK_PRIORITY,
        RSVD2,
        RSVD3,
-       BEACON_PRIORITY, //0x0A
+       BEACON_PRIORITY,
        HIGH_PRIORITY,
        MANAGE_PRIORITY,
        RSVD4,
        RSVD5,
-       UART_PRIORITY //0x0F
+       UART_PRIORITY
 } priority_t;
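
The //0x05, //0x0A and //0x0F annotations dropped above are redundant rather
than lost: BULK_PRIORITY = 0x01 anchors the implicit numbering, so every later
enumerator's value is fixed by its position. A compile-time pin-down, as a
sketch that could sit in any init function (not part of this patch):

        /* Values implied by BULK_PRIORITY = 0x01 and declaration order. */
        BUILD_BUG_ON(VI_PRIORITY != 0x05);
        BUILD_BUG_ON(BEACON_PRIORITY != 0x0A);
        BUILD_BUG_ON(UART_PRIORITY != 0x0F);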
 
-typedef enum{
+typedef enum {
        NIC_8192U = 1,
        NIC_8190P = 2,
        NIC_8192E = 3,
-       } nic_t;
+} nic_t;
 
 
 #ifdef JOHN_HWSEC
@@ -1200,19 +1168,19 @@ struct ssid_thread {
 
 bool init_firmware(struct net_device *dev);
 short rtl819xU_tx_cmd(struct net_device *dev, struct sk_buff *skb);
-short rtl8192_tx(struct net_device *dev, struct sk_buff* skb);
+short rtl8192_tx(struct net_device *dev, struct sk_buff *skb);
 
 u32 read_cam(struct net_device *dev, u8 addr);
 void write_cam(struct net_device *dev, u8 addr, u32 data);
 
-u8 read_nic_byte(struct net_device *dev, int x);
-u8 read_nic_byte_E(struct net_device *dev, int x);
-u32 read_nic_dword(struct net_device *dev, int x);
-u16 read_nic_word(struct net_device *dev, int x) ;
-void write_nic_byte(struct net_device *dev, int x,u8 y);
-void write_nic_byte_E(struct net_device *dev, int x,u8 y);
-void write_nic_word(struct net_device *dev, int x,u16 y);
-void write_nic_dword(struct net_device *dev, int x,u32 y);
+int read_nic_byte(struct net_device *dev, int x, u8 *data);
+int read_nic_byte_E(struct net_device *dev, int x, u8 *data);
+int read_nic_dword(struct net_device *dev, int x, u32 *data);
+int read_nic_word(struct net_device *dev, int x, u16 *data);
+void write_nic_byte(struct net_device *dev, int x, u8 y);
+void write_nic_byte_E(struct net_device *dev, int x, u8 y);
+void write_nic_word(struct net_device *dev, int x, u16 y);
+void write_nic_dword(struct net_device *dev, int x, u32 y);
 void force_pci_posting(struct net_device *dev);
 
 void rtl8192_rtx_disable(struct net_device *);
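
The read_nic_* helpers change shape in this hunk: instead of returning the
register value (and returning garbage when the underlying USB control transfer
fails), they now return 0 or the negative usb_control_msg() status and hand
the value back through an out-parameter. A minimal caller sketch under the new
convention (EPROM_CMD is just an example register):

        u8 cmd;
        int ret;

        ret = read_nic_byte(dev, EPROM_CMD, &cmd);
        if (ret < 0)
                return ret;     /* transfer failed; cmd is not valid */
        write_nic_byte(dev, EPROM_CMD, cmd);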
@@ -1220,26 +1188,24 @@ void rtl8192_rx_enable(struct net_device *);
 void rtl8192_tx_enable(struct net_device *);
 
 void rtl8192_disassociate(struct net_device *dev);
-//void fix_rx_fifo(struct net_device *dev);
-void rtl8185_set_rf_pins_enable(struct net_device *dev,u32 a);
+void rtl8185_set_rf_pins_enable(struct net_device *dev, u32 a);
 
-void rtl8192_set_anaparam(struct net_device *dev,u32 a);
-void rtl8185_set_anaparam2(struct net_device *dev,u32 a);
+void rtl8192_set_anaparam(struct net_device *dev, u32 a);
+void rtl8185_set_anaparam2(struct net_device *dev, u32 a);
 void rtl8192_update_msr(struct net_device *dev);
 int rtl8192_down(struct net_device *dev);
 int rtl8192_up(struct net_device *dev);
 void rtl8192_commit(struct net_device *dev);
-void rtl8192_set_chan(struct net_device *dev,short ch);
+void rtl8192_set_chan(struct net_device *dev, short ch);
 void write_phy(struct net_device *dev, u8 adr, u8 data);
 void write_phy_cck(struct net_device *dev, u8 adr, u32 data);
 void write_phy_ofdm(struct net_device *dev, u8 adr, u32 data);
 void rtl8185_tx_antenna(struct net_device *dev, u8 ant);
 void rtl8192_set_rxconf(struct net_device *dev);
-//short check_nic_enough_desc(struct net_device *dev, priority_t priority);
-extern void rtl819xusb_beacon_tx(struct net_device *dev,u16  tx_rate);
+extern void rtl819xusb_beacon_tx(struct net_device *dev, u16 tx_rate);
 
 void EnableHWSecurityConfig8192(struct net_device *dev);
-void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType, u8 *MacAddr, u8 DefaultKey, u32 *KeyContent );
+void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType, u8 *MacAddr, u8 DefaultKey, u32 *KeyContent);
 
 
 #endif
index 71f5cde9ed1c1ae775f993917662f81bdb1b1c19..c880adcaf0fdf722c01a68a8763796841a6b545e 100644 (file)
  */
 
 #ifndef CONFIG_FORCE_HARD_FLOAT
-double __floatsidf (int i) { return i; }
-unsigned int __fixunsdfsi (double d) { return d; }
-double __adddf3(double a, double b) { return a+b; }
-double __addsf3(float a, float b) { return a+b; }
-double __subdf3(double a, double b) { return a-b; }
-double __extendsfdf2(float a) {return a;}
+double __floatsidf(int i)
+{
+       return i;
+}
+
+unsigned int __fixunsdfsi(double d)
+{
+       return d;
+}
+
+double __adddf3(double a, double b)
+{
+       return a+b;
+}
+
+double __addsf3(float a, float b)
+{
+       return a+b;
+}
+
+double __subdf3(double a, double b)
+{
+       return a-b;
+}
+
+double __extendsfdf2(float a)
+{
+       return a;
+}
 #endif
 
 #undef LOOP_TEST
@@ -68,7 +91,6 @@ double __extendsfdf2(float a) {return a;}
 #include "r819xU_phyreg.h"
 #include "r819xU_cmdpkt.h"
 #include "r8192U_dm.h"
-//#include "r8192xU_phyreg.h"
 #include <linux/usb.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -81,26 +103,9 @@ double __extendsfdf2(float a) {return a;}
 
 #include "dot11d.h"
 //set here to open your trace code. //WB
-u32 rt_global_debug_component = \
-                       //      COMP_INIT       |
-//                             COMP_DBG        |
-                       //      COMP_EPROM      |
-//                             COMP_PHY        |
-                       //      COMP_RF         |
-//                             COMP_FIRMWARE   |
-//                             COMP_CH         |
-                       //      COMP_POWER_TRACKING |
-//                             COMP_RATE       |
-                       //      COMP_TXAGC      |
-               //              COMP_TRACE      |
-                               COMP_DOWN       |
-               //              COMP_RECV       |
-               //              COMP_SWBW       |
+u32 rt_global_debug_component = COMP_DOWN      |
                                COMP_SEC        |
-       //                      COMP_RESET      |
-               //              COMP_SEND       |
-                       //      COMP_EVENTS     |
-                               COMP_ERR ; //always open err flags on
+                               COMP_ERR; //always open err flags on
 
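The initializer now names only the three components that were actually
enabled; the commented-out alternatives are gone. Re-enabling a trace class
during debugging is still a one-line local edit, e.g. (a debugging sketch, not
part of the patch):

        u32 rt_global_debug_component = COMP_DOWN |
                                        COMP_SEC  |
                                        COMP_PHY  |     /* extra PHY tracing */
                                        COMP_ERR;
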
 #define TOTAL_CAM_ENTRY 32
 #define CAM_CONTENT_COUNT 8
@@ -130,24 +135,22 @@ MODULE_VERSION("V 1.1");
 MODULE_DEVICE_TABLE(usb, rtl8192_usb_id_tbl);
 MODULE_DESCRIPTION("Linux driver for Realtek RTL8192 USB WiFi cards");
 
-static charifname = "wlan%d";
+static char *ifname = "wlan%d";
 static int hwwep = 1;  //default use hw. set 0 to use software security
 static int channels = 0x3fff;
 
 
 
-module_param(ifname, charp, S_IRUGO|S_IWUSR );
-//module_param(hwseqnum,int, S_IRUGO|S_IWUSR);
-module_param(hwwep,int, S_IRUGO|S_IWUSR);
-module_param(channels,int, S_IRUGO|S_IWUSR);
+module_param(ifname, charp, S_IRUGO|S_IWUSR);
+module_param(hwwep, int, S_IRUGO|S_IWUSR);
+module_param(channels, int, S_IRUGO|S_IWUSR);
 
-MODULE_PARM_DESC(ifname," Net interface name, wlan%d=default");
-//MODULE_PARM_DESC(hwseqnum," Try to use hardware 802.11 header sequence numbers. Zero=default");
-MODULE_PARM_DESC(hwwep," Try to use hardware security support. ");
-MODULE_PARM_DESC(channels," Channel bitmask for specific locales. NYI");
+MODULE_PARM_DESC(ifname, " Net interface name, wlan%d=default");
+MODULE_PARM_DESC(hwwep, " Try to use hardware security support. ");
+MODULE_PARM_DESC(channels, " Channel bitmask for specific locales. NYI");
 
 static int rtl8192_usb_probe(struct usb_interface *intf,
-                        const struct usb_device_id *id);
+                            const struct usb_device_id *id);
 static void rtl8192_usb_disconnect(struct usb_interface *intf);
 
 
@@ -169,7 +172,7 @@ static struct usb_driver rtl8192_usb_driver = {
 typedef struct _CHANNEL_LIST {
        u8      Channel[32];
        u8      Len;
-}CHANNEL_LIST, *PCHANNEL_LIST;
+} CHANNEL_LIST, *PCHANNEL_LIST;
 
 static CHANNEL_LIST ChannelPlan[] = {
        {{1,2,3,4,5,6,7,8,9,10,11,36,40,44,48,52,56,60,64,149,153,157,161,165},24},             //FCC
@@ -185,12 +188,11 @@ static CHANNEL_LIST ChannelPlan[] = {
        {{1,2,3,4,5,6,7,8,9,10,11,12,13,14},14}                                 //For Global Domain. 1-11:active scan, 12-14 passive scan. //+YJ, 080626
 };
 
-static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv* priv)
+static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv *priv)
 {
-       int i, max_chan=-1, min_chan=-1;
-       struct ieee80211_device* ieee = priv->ieee80211;
-       switch (channel_plan)
-       {
+       int i, max_chan = -1, min_chan = -1;
+       struct ieee80211_device *ieee = priv->ieee80211;
+       switch (channel_plan) {
        case COUNTRY_CODE_FCC:
        case COUNTRY_CODE_IC:
        case COUNTRY_CODE_ETSI:
@@ -200,22 +202,21 @@ static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv* priv)
        case COUNTRY_CODE_MKK1:
        case COUNTRY_CODE_ISRAEL:
        case COUNTRY_CODE_TELEC:
-       case COUNTRY_CODE_MIC:  
+       case COUNTRY_CODE_MIC:
                Dot11d_Init(ieee);
                ieee->bGlobalDomain = false;
                //actually 8225 & 8256 rf chips only support B,G,24N mode
                if ((priv->rf_chip == RF_8225) || (priv->rf_chip == RF_8256)) {
                        min_chan = 1;
                        max_chan = 14;
-               }
-               else {
-                       RT_TRACE(COMP_ERR, "unknown rf chip, can't set channel map in function:%s()\n", __FUNCTION__);
+               } else {
+                       RT_TRACE(COMP_ERR, "unknown rf chip, can't set channel map in function:%s()\n", __func__);
                }
                if (ChannelPlan[channel_plan].Len != 0) {
                        // Clear old channel map
                        memset(GET_DOT11D_INFO(ieee)->channel_map, 0, sizeof(GET_DOT11D_INFO(ieee)->channel_map));
                        // Set new channel map
-                       for (i=0;i<ChannelPlan[channel_plan].Len;i++) {
+                       for (i = 0; i < ChannelPlan[channel_plan].Len; i++) {
                                if (ChannelPlan[channel_plan].Channel[i] < min_chan || ChannelPlan[channel_plan].Channel[i] > max_chan)
                                        break;
                                GET_DOT11D_INFO(ieee)->channel_map[ChannelPlan[channel_plan].Channel[i]] = 1;
@@ -228,19 +229,13 @@ static void rtl819x_set_channel_map(u8 channel_plan, struct r8192_priv* priv)
                Dot11d_Reset(ieee);
                ieee->bGlobalDomain = true;
                break;
-       
+
        default:
                break;
        }
 }
 
 
-#define                rx_hal_is_cck_rate(_pdrvinfo)\
-                       (_pdrvinfo->RxRate == DESC90_RATE1M ||\
-                       _pdrvinfo->RxRate == DESC90_RATE2M ||\
-                       _pdrvinfo->RxRate == DESC90_RATE5_5M ||\
-                       _pdrvinfo->RxRate == DESC90_RATE11M) &&\
-                       !_pdrvinfo->RxHT\
 
 
 void CamResetAllEntry(struct net_device *dev)
@@ -249,12 +244,6 @@ void CamResetAllEntry(struct net_device *dev)
        //2004/02/11  In static WEP, OID_ADD_KEY or OID_ADD_WEP are set before STA associate to AP.
        // However, ResetKey is called on OID_802_11_INFRASTRUCTURE_MODE and MlmeAssociateRequest
        // In this condition, Cam can not be reset because upper layer will not set this static key again.
-       //if(Adapter->EncAlgorithm == WEP_Encryption)
-       //      return;
-//debug
-       //DbgPrint("========================================\n");
-       //DbgPrint("                            Call ResetAllEntry                                              \n");
-       //DbgPrint("========================================\n\n");
        ulcommand |= BIT31|BIT30;
        write_nic_dword(dev, RWCAM, ulcommand);
 
@@ -264,13 +253,16 @@ void CamResetAllEntry(struct net_device *dev)
 void write_cam(struct net_device *dev, u8 addr, u32 data)
 {
        write_nic_dword(dev, WCAMI, data);
-       write_nic_dword(dev, RWCAM, BIT31|BIT16|(addr&0xff) );
+       write_nic_dword(dev, RWCAM, BIT31|BIT16|(addr&0xff));
 }
 
 u32 read_cam(struct net_device *dev, u8 addr)
 {
-       write_nic_dword(dev, RWCAM, 0x80000000|(addr&0xff) );
-       return read_nic_dword(dev, 0xa8);
+       u32 data;
+
+       write_nic_dword(dev, RWCAM, 0x80000000|(addr&0xff));
+       read_nic_dword(dev, 0xa8, &data);
+       return data;
 }
 
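Note that read_cam() keeps its u32 return type, so a failed read_nic_dword()
leaves data uninitialized and the error is swallowed. A more defensive variant
(a sketch, assuming callers may treat a failed read as an empty CAM entry):

        u32 read_cam(struct net_device *dev, u8 addr)
        {
                u32 data = 0;

                write_nic_dword(dev, RWCAM, 0x80000000|(addr&0xff));
                if (read_nic_dword(dev, 0xa8, &data) < 0)
                        return 0;       /* read failed: report empty entry */
                return data;
        }
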
 void write_nic_byte_E(struct net_device *dev, int indx, u8 data)
@@ -280,32 +272,29 @@ void write_nic_byte_E(struct net_device *dev, int indx, u8 data)
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-                              RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-                              indx|0xfe00, 0, &data, 1, HZ / 2);
+                                RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+                                indx|0xfe00, 0, &data, 1, HZ / 2);
 
        if (status < 0)
-       {
-               printk("write_nic_byte_E TimeOut! status:%d\n", status);
-       }
+               netdev_err(dev, "write_nic_byte_E TimeOut! status: %d\n", status);
 }
 
-u8 read_nic_byte_E(struct net_device *dev, int indx)
+int read_nic_byte_E(struct net_device *dev, int indx, u8 *data)
 {
        int status;
-       u8 data;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-                              RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-                              indx|0xfe00, 0, &data, 1, HZ / 2);
+                                RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+                                indx|0xfe00, 0, data, 1, HZ / 2);
 
-       if (status < 0)
-       {
-               printk("read_nic_byte_E TimeOut! status:%d\n", status);
+       if (status < 0) {
+               netdev_err(dev, "%s failure status: %d\n", __func__, status);
+               return status;
        }
 
-       return data;
+       return 0;
 }
 //as 92U has extend page from 4 to 16, so modify functions below.
 void write_nic_byte(struct net_device *dev, int indx, u8 data)
@@ -316,13 +305,11 @@ void write_nic_byte(struct net_device *dev, int indx, u8 data)
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-                              RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-                              (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
+                                RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
 
        if (status < 0)
-       {
-               printk("write_nic_byte TimeOut! status:%d\n", status);
-       }
+               netdev_err(dev, "write_nic_byte TimeOut! status: %d\n", status);
 
 
 }
@@ -337,13 +324,11 @@ void write_nic_word(struct net_device *dev, int indx, u16 data)
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-                              RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-                              (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 2, HZ / 2);
+                                RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 2, HZ / 2);
 
        if (status < 0)
-       {
-               printk("write_nic_word TimeOut! status:%d\n", status);
-       }
+               netdev_err(dev, "write_nic_word TimeOut! status: %d\n", status);
 
 }
 
@@ -357,98 +342,92 @@ void write_nic_dword(struct net_device *dev, int indx, u32 data)
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
-                              RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
-                              (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 4, HZ / 2);
+                                RTL8187_REQ_SET_REGS, RTL8187_REQT_WRITE,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 4, HZ / 2);
 
 
        if (status < 0)
-       {
-               printk("write_nic_dword TimeOut! status:%d\n", status);
-       }
+               netdev_err(dev, "write_nic_dword TimeOut! status: %d\n", status);
 
 }
 
 
 
-u8 read_nic_byte(struct net_device *dev, int indx)
+int read_nic_byte(struct net_device *dev, int indx, u8 *data)
 {
-       u8 data;
        int status;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-                              RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-                              (indx&0xff)|0xff00, (indx>>8)&0x0f, &data, 1, HZ / 2);
+                                RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f, data, 1, HZ / 2);
 
-       if (status < 0)
-       {
-               printk("read_nic_byte TimeOut! status:%d\n", status);
+       if (status < 0) {
+               netdev_err(dev, "%s failure status: %d\n", __func__, status);
+               return status;
        }
 
-       return data;
+       return 0;
 }
 
 
 
-u16 read_nic_word(struct net_device *dev, int indx)
+int read_nic_word(struct net_device *dev, int indx, u16 *data)
 {
-       u16 data;
        int status;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-                                      RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-                                      (indx&0xff)|0xff00, (indx>>8)&0x0f,
-                                                       &data, 2, HZ / 2);
+                                RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f,
+                                data, 2, HZ / 2);
 
-       if (status < 0)
-               printk("read_nic_word TimeOut! status:%d\n", status);
+       if (status < 0) {
+               netdev_err(dev, "%s failure status: %d\n", __func__, status);
+               return status;
+       }
 
-       return data;
+       return 0;
 }
 
-u16 read_nic_word_E(struct net_device *dev, int indx)
+int read_nic_word_E(struct net_device *dev, int indx, u16 *data)
 {
-       u16 data;
        int status;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-                              RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-                                      indx|0xfe00, 0, &data, 2, HZ / 2);
+                                RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+                                indx|0xfe00, 0, data, 2, HZ / 2);
 
-       if (status < 0)
-               printk("read_nic_word TimeOut! status:%d\n", status);
+       if (status < 0) {
+               netdev_err(dev, "%s failure status: %d\n", __func__, status);
+               return status;
+       }
 
-       return data;
+       return 0;
 }
 
-u32 read_nic_dword(struct net_device *dev, int indx)
+int read_nic_dword(struct net_device *dev, int indx, u32 *data)
 {
-       u32 data;
        int status;
-       /* int result; */
 
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct usb_device *udev = priv->udev;
 
        status = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
-                                      RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
-                                       (indx&0xff)|0xff00, (indx>>8)&0x0f,
-                                                       &data, 4, HZ / 2);
-       /* if(0 != result) {
-        *      printk(KERN_WARNING "read size of data = %d\, date = %d\n",
-        *                                                       result, data);
-        * }
-        */
+                                RTL8187_REQ_GET_REGS, RTL8187_REQT_READ,
+                                (indx&0xff)|0xff00, (indx>>8)&0x0f,
+                                data, 4, HZ / 2);
 
-       if (status < 0)
-               printk("read_nic_dword TimeOut! status:%d\n", status);
+       if (status < 0) {
+               netdev_err(dev, "%s failure status: %d\n", __func__, status);
+               return status;
+       }
 
-       return data;
+       return 0;
 }
 
 /* u8 read_phy_cck(struct net_device *dev, u8 adr); */
@@ -462,9 +441,7 @@ inline void force_pci_posting(struct net_device *dev)
 
 static struct net_device_stats *rtl8192_stats(struct net_device *dev);
 void rtl8192_commit(struct net_device *dev);
-/* void rtl8192_restart(struct net_device *dev); */
 void rtl8192_restart(struct work_struct *work);
-/* void rtl8192_rq_tx_ack(struct work_struct *work); */
 void watch_dog_timer_callback(unsigned long data);
 
 /****************************************************************************
@@ -495,40 +472,38 @@ static int proc_get_stats_ap(struct seq_file *m, void *v)
 static int proc_get_registers(struct seq_file *m, void *v)
 {
        struct net_device *dev = m->private;
-       int i,n, max = 0xff;
+       int i, n, max = 0xff;
+       u8 byte_rd;
 
        seq_puts(m, "\n####################page 0##################\n ");
 
-       for (n=0;n<=max;) {
-               //printk( "\nD: %2x> ", n);
-               seq_printf(m, "\nD:  %2x > ",n);
-
-               for (i=0;i<16 && n<=max;i++,n++)
-                       seq_printf(m, "%2x ",read_nic_byte(dev,0x000|n));
+       for (n = 0; n <= max;) {
+               seq_printf(m, "\nD:  %2x > ", n);
 
-               //      printk("%2x ",read_nic_byte(dev,n));
+               for (i = 0; i < 16 && n <= max; i++, n++) {
+                       read_nic_byte(dev, 0x000|n, &byte_rd);
+                       seq_printf(m, "%2x ", byte_rd);
+               }
        }
 
        seq_puts(m, "\n####################page 1##################\n ");
-       for (n=0;n<=max;) {
-               //printk( "\nD: %2x> ", n);
-               seq_printf(m, "\nD:  %2x > ",n);
-
-               for (i=0;i<16 && n<=max;i++,n++)
-                       seq_printf(m, "%2x ",read_nic_byte(dev,0x100|n));
+       for (n = 0; n <= max;) {
+               seq_printf(m, "\nD:  %2x > ", n);
 
-               //      printk("%2x ",read_nic_byte(dev,n));
+               for (i = 0; i < 16 && n <= max; i++, n++) {
+                       read_nic_byte(dev, 0x100|n, &byte_rd);
+                       seq_printf(m, "%2x ", byte_rd);
+               }
        }
 
        seq_puts(m, "\n####################page 3##################\n ");
-       for (n=0;n<=max;) {
-               //printk( "\nD: %2x> ", n);
-               seq_printf(m, "\nD:  %2x > ",n);
-
-               for(i=0;i<16 && n<=max;i++,n++)
-                       seq_printf(m, "%2x ",read_nic_byte(dev,0x300|n));
+       for (n = 0; n <= max;) {
+               seq_printf(m, "\nD:  %2x > ", n);
 
-               //      printk("%2x ",read_nic_byte(dev,n));
+               for (i = 0; i < 16 && n <= max; i++, n++) {
+                       read_nic_byte(dev, 0x300|n, &byte_rd);
+                       seq_printf(m, "%2x ", byte_rd);
+               }
        }
 
        seq_putc(m, '\n');
@@ -541,64 +516,54 @@ static int proc_get_stats_tx(struct seq_file *m, void *v)
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
        seq_printf(m,
-               "TX VI priority ok int: %lu\n"
-               "TX VI priority error int: %lu\n"
-               "TX VO priority ok int: %lu\n"
-               "TX VO priority error int: %lu\n"
-               "TX BE priority ok int: %lu\n"
-               "TX BE priority error int: %lu\n"
-               "TX BK priority ok int: %lu\n"
-               "TX BK priority error int: %lu\n"
-               "TX MANAGE priority ok int: %lu\n"
-               "TX MANAGE priority error int: %lu\n"
-               "TX BEACON priority ok int: %lu\n"
-               "TX BEACON priority error int: %lu\n"
-//             "TX high priority ok int: %lu\n"
-//             "TX high priority failed error int: %lu\n"
-               "TX queue resume: %lu\n"
-               "TX queue stopped?: %d\n"
-               "TX fifo overflow: %lu\n"
-//             "TX beacon: %lu\n"
-               "TX VI queue: %d\n"
-               "TX VO queue: %d\n"
-               "TX BE queue: %d\n"
-               "TX BK queue: %d\n"
-//             "TX HW queue: %d\n"
-               "TX VI dropped: %lu\n"
-               "TX VO dropped: %lu\n"
-               "TX BE dropped: %lu\n"
-               "TX BK dropped: %lu\n"
-               "TX total data packets %lu\n",
-//             "TX beacon aborted: %lu\n",
-               priv->stats.txviokint,
-               priv->stats.txvierr,
-               priv->stats.txvookint,
-               priv->stats.txvoerr,
-               priv->stats.txbeokint,
-               priv->stats.txbeerr,
-               priv->stats.txbkokint,
-               priv->stats.txbkerr,
-               priv->stats.txmanageokint,
-               priv->stats.txmanageerr,
-               priv->stats.txbeaconokint,
-               priv->stats.txbeaconerr,
-//             priv->stats.txhpokint,
-//             priv->stats.txhperr,
-               priv->stats.txresumed,
-               netif_queue_stopped(dev),
-               priv->stats.txoverflow,
-//             priv->stats.txbeacon,
-               atomic_read(&(priv->tx_pending[VI_PRIORITY])),
-               atomic_read(&(priv->tx_pending[VO_PRIORITY])),
-               atomic_read(&(priv->tx_pending[BE_PRIORITY])),
-               atomic_read(&(priv->tx_pending[BK_PRIORITY])),
-//             read_nic_byte(dev, TXFIFOCOUNT),
-               priv->stats.txvidrop,
-               priv->stats.txvodrop,
-               priv->stats.txbedrop,
-               priv->stats.txbkdrop,
-               priv->stats.txdatapkt
-//             priv->stats.txbeaconerr
+                  "TX VI priority ok int: %lu\n"
+                  "TX VI priority error int: %lu\n"
+                  "TX VO priority ok int: %lu\n"
+                  "TX VO priority error int: %lu\n"
+                  "TX BE priority ok int: %lu\n"
+                  "TX BE priority error int: %lu\n"
+                  "TX BK priority ok int: %lu\n"
+                  "TX BK priority error int: %lu\n"
+                  "TX MANAGE priority ok int: %lu\n"
+                  "TX MANAGE priority error int: %lu\n"
+                  "TX BEACON priority ok int: %lu\n"
+                  "TX BEACON priority error int: %lu\n"
+                  "TX queue resume: %lu\n"
+                  "TX queue stopped?: %d\n"
+                  "TX fifo overflow: %lu\n"
+                  "TX VI queue: %d\n"
+                  "TX VO queue: %d\n"
+                  "TX BE queue: %d\n"
+                  "TX BK queue: %d\n"
+                  "TX VI dropped: %lu\n"
+                  "TX VO dropped: %lu\n"
+                  "TX BE dropped: %lu\n"
+                  "TX BK dropped: %lu\n"
+                  "TX total data packets %lu\n",
+                  priv->stats.txviokint,
+                  priv->stats.txvierr,
+                  priv->stats.txvookint,
+                  priv->stats.txvoerr,
+                  priv->stats.txbeokint,
+                  priv->stats.txbeerr,
+                  priv->stats.txbkokint,
+                  priv->stats.txbkerr,
+                  priv->stats.txmanageokint,
+                  priv->stats.txmanageerr,
+                  priv->stats.txbeaconokint,
+                  priv->stats.txbeaconerr,
+                  priv->stats.txresumed,
+                  netif_queue_stopped(dev),
+                  priv->stats.txoverflow,
+                  atomic_read(&(priv->tx_pending[VI_PRIORITY])),
+                  atomic_read(&(priv->tx_pending[VO_PRIORITY])),
+                  atomic_read(&(priv->tx_pending[BE_PRIORITY])),
+                  atomic_read(&(priv->tx_pending[BK_PRIORITY])),
+                  priv->stats.txvidrop,
+                  priv->stats.txvodrop,
+                  priv->stats.txbedrop,
+                  priv->stats.txbkdrop,
+                  priv->stats.txdatapkt
                );
 
        return 0;
@@ -610,12 +575,12 @@ static int proc_get_stats_rx(struct seq_file *m, void *v)
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
        seq_printf(m,
-               "RX packets: %lu\n"
-               "RX urb status error: %lu\n"
-               "RX invalid urb error: %lu\n",
-               priv->stats.rxoktotal,
-               priv->stats.rxstaterr,
-               priv->stats.rxurberr);
+                  "RX packets: %lu\n"
+                  "RX urb status error: %lu\n"
+                  "RX invalid urb error: %lu\n",
+                  priv->stats.rxoktotal,
+                  priv->stats.rxstaterr,
+                  priv->stats.rxurberr);
 
        return 0;
 }
@@ -700,27 +665,7 @@ void rtl8192_proc_remove_one(struct net_device *dev)
    -----------------------------MISC STUFF-------------------------
 *****************************************************************************/
 
-/* this is only for debugging */
-void print_buffer(u32 *buffer, int len)
-{
-       int i;
-       u8 *buf =(u8*)buffer;
-
-       printk("ASCII BUFFER DUMP (len: %x):\n",len);
-
-       for(i=0;i<len;i++)
-               printk("%c",buf[i]);
-
-       printk("\nBINARY BUFFER DUMP (len: %x):\n",len);
-
-       for(i=0;i<len;i++)
-               printk("%x",buf[i]);
-
-       printk("\n");
-}
-
-//short check_nic_enough_desc(struct net_device *dev, priority_t priority)
-short check_nic_enough_desc(struct net_device *dev,int queue_index)
+short check_nic_enough_desc(struct net_device *dev, int queue_index)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        int used = atomic_read(&priv->tx_pending[queue_index]);
@@ -731,10 +676,8 @@ short check_nic_enough_desc(struct net_device *dev,int queue_index)
 void tx_timeout(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       //rtl8192_commit(dev);
 
        schedule_work(&priv->reset_wq);
-       //DMESG("TXTIMEOUT");
 }
 
 
@@ -742,41 +685,24 @@ void tx_timeout(struct net_device *dev)
 void dump_eprom(struct net_device *dev)
 {
        int i;
-       for(i=0; i<63; i++)
-               RT_TRACE(COMP_EPROM, "EEPROM addr %x : %x", i, eprom_read(dev,i));
+       for (i = 0; i < 63; i++)
+               RT_TRACE(COMP_EPROM, "EEPROM addr %x : %x", i, eprom_read(dev, i));
 }
 
-/* this is only for debug */
-void rtl8192_dump_reg(struct net_device *dev)
-{
-       int i;
-       int n;
-       int max=0x1ff;
-
-       RT_TRACE(COMP_PHY, "Dumping NIC register map");
-
-       for(n=0;n<=max;)
-       {
-               printk( "\nD: %2x> ", n);
-               for(i=0;i<16 && n<=max;i++,n++)
-                       printk("%2x ",read_nic_byte(dev,n));
-       }
-       printk("\n");
-}
 
 /****************************************************************************
       ------------------------------HW STUFF---------------------------
 *****************************************************************************/
 
 
-void rtl8192_set_mode(struct net_device *dev,int mode)
+void rtl8192_set_mode(struct net_device *dev, int mode)
 {
        u8 ecmd;
-       ecmd=read_nic_byte(dev, EPROM_CMD);
-       ecmd=ecmd &~ EPROM_CMD_OPERATING_MODE_MASK;
-       ecmd=ecmd | (mode<<EPROM_CMD_OPERATING_MODE_SHIFT);
-       ecmd=ecmd &~ (1<<EPROM_CS_SHIFT);
-       ecmd=ecmd &~ (1<<EPROM_CK_SHIFT);
+       read_nic_byte(dev, EPROM_CMD, &ecmd);
+       ecmd = ecmd & ~EPROM_CMD_OPERATING_MODE_MASK;
+       ecmd = ecmd | (mode<<EPROM_CMD_OPERATING_MODE_SHIFT);
+       ecmd = ecmd & ~EPROM_CS_BIT;
+       ecmd = ecmd & ~EPROM_CK_BIT;
        write_nic_byte(dev, EPROM_CMD, ecmd);
 }
 
@@ -786,15 +712,15 @@ void rtl8192_update_msr(struct net_device *dev)
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8 msr;
 
-       msr  = read_nic_byte(dev, MSR);
-       msr &= ~ MSR_LINK_MASK;
+       read_nic_byte(dev, MSR, &msr);
+       msr &= ~MSR_LINK_MASK;
 
        /* do not change in link_state != WLAN_LINK_ASSOCIATED.
         * msr must be updated if the state is ASSOCIATING.
         * this is intentional and make sense for ad-hoc and
         * master (see the create BSS/IBSS func)
         */
-       if (priv->ieee80211->state == IEEE80211_LINKED){
+       if (priv->ieee80211->state == IEEE80211_LINKED) {
 
                if (priv->ieee80211->iw_mode == IW_MODE_INFRA)
                        msr |= (MSR_LINK_MANAGED<<MSR_LINK_SHIFT);
@@ -803,39 +729,31 @@ void rtl8192_update_msr(struct net_device *dev)
                else if (priv->ieee80211->iw_mode == IW_MODE_MASTER)
                        msr |= (MSR_LINK_MASTER<<MSR_LINK_SHIFT);
 
-       }else
+       } else {
                msr |= (MSR_LINK_NONE<<MSR_LINK_SHIFT);
+       }
 
        write_nic_byte(dev, MSR, msr);
 }
 
-void rtl8192_set_chan(struct net_device *dev,short ch)
+void rtl8192_set_chan(struct net_device *dev, short ch)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-//     u32 tx;
-       RT_TRACE(COMP_CH, "=====>%s()====ch:%d\n", __FUNCTION__, ch);
-       priv->chan=ch;
+       RT_TRACE(COMP_CH, "=====>%s()====ch:%d\n", __func__, ch);
+       priv->chan = ch;
 
        /* this hack should avoid frame TX during channel setting*/
 
-
-//     tx = read_nic_dword(dev,TX_CONF);
-//     tx &= ~TX_LOOPBACK_MASK;
-
 #ifndef LOOP_TEST
-//     write_nic_dword(dev,TX_CONF, tx |( TX_LOOPBACK_MAC<<TX_LOOPBACK_SHIFT));
-
        //need to implement rf set channel here WB
 
        if (priv->rf_set_chan)
-       priv->rf_set_chan(dev,priv->chan);
+               priv->rf_set_chan(dev, priv->chan);
        mdelay(10);
-//     write_nic_dword(dev,TX_CONF,tx | (TX_LOOPBACK_NONE<<TX_LOOPBACK_SHIFT));
 #endif
 }
 
 static void rtl8192_rx_isr(struct urb *urb);
-//static void rtl8192_rx_isr(struct urb *rx_urb);
 
 u32 get_rxpacket_shiftbytes_819xusb(struct ieee80211_rx_stats *pstats)
 {
@@ -847,10 +765,10 @@ u32 get_rxpacket_shiftbytes_819xusb(struct ieee80211_rx_stats *pstats)
        else
 #endif
                return (sizeof(rx_desc_819x_usb) + pstats->RxDrvInfoSize
-                               + pstats->RxBufShift);
+                       + pstats->RxBufShift);
 
 }
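The shift-bytes value is simply the offset of the 802.11 frame inside the URB
buffer: hardware descriptor, then per-packet driver info, then an alignment
shift. A worked example with assumed numbers:

        /* e.g. sizeof(rx_desc_819x_usb) == 32 (assumed), RxDrvInfoSize == 32
         * and RxBufShift == 2 put the frame data at byte 66 of the buffer. */
        offset = sizeof(rx_desc_819x_usb) + pstats->RxDrvInfoSize
                 + pstats->RxBufShift;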
-static int rtl8192_rx_initiate(struct net_device*dev)
+static int rtl8192_rx_initiate(struct net_device *dev)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct urb *entry;
@@ -867,7 +785,6 @@ static int rtl8192_rx_initiate(struct net_device*dev)
                        kfree_skb(skb);
                        break;
                }
-//             printk("nomal packet IN request!\n");
                usb_fill_bulk_urb(entry, priv->udev,
                                  usb_rcvbulkpipe(priv->udev, 3), skb_tail_pointer(skb),
                                  RX_URB_SIZE, rtl8192_rx_isr, skb);
@@ -881,8 +798,7 @@ static int rtl8192_rx_initiate(struct net_device*dev)
 
        /* command packet rx procedure */
        while (skb_queue_len(&priv->rx_queue) < MAX_RX_URB + 3) {
-//             printk("command packet IN request!\n");
-               skb = __dev_alloc_skb(RX_URB_SIZE ,GFP_KERNEL);
+               skb = __dev_alloc_skb(RX_URB_SIZE, GFP_KERNEL);
                if (!skb)
                        break;
                entry = usb_alloc_urb(0, GFP_KERNEL);
@@ -896,7 +812,7 @@ static int rtl8192_rx_initiate(struct net_device*dev)
                info = (struct rtl8192_rx_info *) skb->cb;
                info->urb = entry;
                info->dev = dev;
-                  info->out_pipe = 9; //denote rx cmd packet queue
+               info->out_pipe = 9; //denote rx cmd packet queue
                skb_queue_tail(&priv->rx_queue, skb);
                usb_submit_urb(entry, GFP_KERNEL);
        }
@@ -909,64 +825,47 @@ void rtl8192_set_rxconf(struct net_device *dev)
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        u32 rxconf;
 
-       rxconf=read_nic_dword(dev,RCR);
-       rxconf = rxconf &MAC_FILTER_MASK;
+       read_nic_dword(dev, RCR, &rxconf);
+       rxconf = rxconf & ~MAC_FILTER_MASK;
        rxconf = rxconf | RCR_AMF;
        rxconf = rxconf | RCR_ADF;
        rxconf = rxconf | RCR_AB;
        rxconf = rxconf | RCR_AM;
-       //rxconf = rxconf | RCR_ACF;
 
-       if (dev->flags & IFF_PROMISC) {DMESG ("NIC in promisc mode");}
+       if (dev->flags & IFF_PROMISC)
+               DMESG("NIC in promisc mode");
 
-       if(priv->ieee80211->iw_mode == IW_MODE_MONITOR || \
-          dev->flags & IFF_PROMISC){
+       if (priv->ieee80211->iw_mode == IW_MODE_MONITOR ||
+           dev->flags & IFF_PROMISC) {
                rxconf = rxconf | RCR_AAP;
-       } /*else if(priv->ieee80211->iw_mode == IW_MODE_MASTER){
-               rxconf = rxconf | (1<<ACCEPT_ALLMAC_FRAME_SHIFT);
-               rxconf = rxconf | (1<<RX_CHECK_BSSID_SHIFT);
-       }*/else{
+       } else {
                rxconf = rxconf | RCR_APM;
                rxconf = rxconf | RCR_CBSSID;
        }
 
 
-       if(priv->ieee80211->iw_mode == IW_MODE_MONITOR){
+       if (priv->ieee80211->iw_mode == IW_MODE_MONITOR) {
                rxconf = rxconf | RCR_AICV;
                rxconf = rxconf | RCR_APWRMGT;
        }
 
-       ifpriv->crcmon == 1 && priv->ieee80211->iw_mode == IW_MODE_MONITOR)
+       if (priv->crcmon == 1 && priv->ieee80211->iw_mode == IW_MODE_MONITOR)
                rxconf = rxconf | RCR_ACRC32;
 
 
-       rxconf = rxconf &RX_FIFO_THRESHOLD_MASK;
+       rxconf = rxconf & ~RX_FIFO_THRESHOLD_MASK;
        rxconf = rxconf | (RX_FIFO_THRESHOLD_NONE<<RX_FIFO_THRESHOLD_SHIFT);
-       rxconf = rxconf &MAX_RX_DMA_MASK;
+       rxconf = rxconf & ~MAX_RX_DMA_MASK;
        rxconf = rxconf | ((u32)7<<RCR_MXDMA_OFFSET);
 
-//     rxconf = rxconf | (1<<RX_AUTORESETPHY_SHIFT);
        rxconf = rxconf | RCR_ONLYERLPKT;
 
-//     rxconf = rxconf &~ RCR_CS_MASK;
-//     rxconf = rxconf | (1<<RCR_CS_SHIFT);
-
        write_nic_dword(dev, RCR, rxconf);
-
-       #ifdef DEBUG_RX
-       DMESG("rxconf: %x %x",rxconf ,read_nic_dword(dev,RCR));
-       #endif
 }
 //wait to be removed
 void rtl8192_rx_enable(struct net_device *dev)
 {
-       //u8 cmd;
-
-       //struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-
        rtl8192_rx_initiate(dev);
-
-//     rtl8192_set_rxconf(dev);
 }
 
 
@@ -983,9 +882,8 @@ void rtl8192_rtx_disable(struct net_device *dev)
        struct sk_buff *skb;
        struct rtl8192_rx_info *info;
 
-       cmd=read_nic_byte(dev,CMDR);
-       write_nic_byte(dev, CMDR, cmd &~ \
-               (CR_TE|CR_RE));
+       read_nic_byte(dev, CMDR, &cmd);
+       write_nic_byte(dev, CMDR, cmd & ~(CR_TE|CR_RE));
        force_pci_posting(dev);
        mdelay(10);
 
@@ -998,9 +896,8 @@ void rtl8192_rtx_disable(struct net_device *dev)
                kfree_skb(skb);
        }
 
-       if (skb_queue_len(&priv->skb_queue)) {
-               printk(KERN_WARNING "skb_queue not empty\n");
-       }
+       if (skb_queue_len(&priv->skb_queue))
+               netdev_warn(dev, "skb_queue not empty\n");
 
        skb_queue_purge(&priv->skb_queue);
        return;
@@ -1014,40 +911,40 @@ int alloc_tx_beacon_desc_ring(struct net_device *dev, int count)
 
 inline u16 ieeerate2rtlrate(int rate)
 {
-       switch(rate){
+       switch (rate) {
        case 10:
-       return 0;
+               return 0;
        case 20:
-       return 1;
+               return 1;
        case 55:
-       return 2;
+               return 2;
        case 110:
-       return 3;
+               return 3;
        case 60:
-       return 4;
+               return 4;
        case 90:
-       return 5;
+               return 5;
        case 120:
-       return 6;
+               return 6;
        case 180:
-       return 7;
+               return 7;
        case 240:
-       return 8;
+               return 8;
        case 360:
-       return 9;
+               return 9;
        case 480:
-       return 10;
+               return 10;
        case 540:
-       return 11;
+               return 11;
        default:
-       return 3;
+               return 3;
 
        }
 }
-static u16 rtl_rate[] = {10,20,55,110,60,90,120,180,240,360,480,540};
+static u16 rtl_rate[] = {10, 20, 55, 110, 60, 90, 120, 180, 240, 360, 480, 540};
 inline u16 rtl8192_rate2rate(short rate)
 {
-       if (rate >11) return 0;
+       if (rate > 11) return 0;
        return rtl_rate[rate];
 }
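
Both helpers express rates in units of 100 kbit/s (10 = 1 Mbps up to 540 =
54 Mbps), so they are inverses over the twelve legal indices; unknown rates
collapse to index 3 (11 Mbps) and out-of-range indices to 0. A quick
round-trip check (sketch):

        /* 540 <-> index 11 */
        if (rtl8192_rate2rate(ieeerate2rtlrate(540)) != 540)
                RT_TRACE(COMP_ERR, "rate table round-trip mismatch\n");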
 
@@ -1061,14 +958,13 @@ static void rtl8192_rx_isr(struct urb *urb)
        struct r8192_priv *priv = ieee80211_priv(dev);
        int out_pipe = info->out_pipe;
        int err;
-       if(!priv->up)
+       if (!priv->up)
                return;
        if (unlikely(urb->status)) {
                info->urb = NULL;
                priv->stats.rxstaterr++;
                priv->ieee80211->stats.rx_errors++;
                usb_free_urb(urb);
-       //      printk("%s():rx status err\n",__FUNCTION__);
                return;
        }
        skb_unlink(skb, &priv->rx_queue);
@@ -1080,14 +976,14 @@ static void rtl8192_rx_isr(struct urb *urb)
        skb = dev_alloc_skb(RX_URB_SIZE);
        if (unlikely(!skb)) {
                usb_free_urb(urb);
-               printk("%s():can,t alloc skb\n",__FUNCTION__);
+               netdev_err(dev, "%s(): can't alloc skb\n", __func__);
                /* TODO check rx queue length and refill *somewhere* */
                return;
        }
 
        usb_fill_bulk_urb(urb, priv->udev,
-                       usb_rcvbulkpipe(priv->udev, out_pipe), skb_tail_pointer(skb),
-                       RX_URB_SIZE, rtl8192_rx_isr, skb);
+                         usb_rcvbulkpipe(priv->udev, out_pipe), skb_tail_pointer(skb),
+                         RX_URB_SIZE, rtl8192_rx_isr, skb);
 
        info = (struct rtl8192_rx_info *) skb->cb;
        info->urb = urb;
@@ -1098,31 +994,19 @@ static void rtl8192_rx_isr(struct urb *urb)
        urb->context = skb;
        skb_queue_tail(&priv->rx_queue, skb);
        err = usb_submit_urb(urb, GFP_ATOMIC);
-       if(err && err != EPERM)
-               printk("can not submit rxurb, err is %x,URB status is %x\n",err,urb->status);
+       if (err && err != EPERM)
+               netdev_err(dev, "can not submit rxurb, err is %x, URB status is %x\n", err, urb->status);
 }
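
One subtlety survives the cleanup: usb_submit_urb() reports errors as
negative errno values, so the err != EPERM comparison above can never match
the shutdown case it presumably wants to ignore. The intended filter is
arguably (a sketch):

        if (err && err != -EPERM)
                netdev_err(dev, "cannot submit rx urb, err %d, URB status %d\n",
                           err, urb->status);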
 
-u32
-rtl819xusb_rx_command_packet(
-       struct net_device *dev,
-       struct ieee80211_rx_stats *pstats
-       )
+u32 rtl819xusb_rx_command_packet(struct net_device *dev,
+                                struct ieee80211_rx_stats *pstats)
 {
        u32     status;
 
-       //RT_TRACE(COMP_RECV, DBG_TRACE, ("---> RxCommandPacketHandle819xUsb()\n"));
-
        status = cmpk_message_handle_rx(dev, pstats);
        if (status)
-       {
                DMESG("rxcommandpackethandle819xusb: It is a command packet\n");
-       }
-       else
-       {
-               //RT_TRACE(COMP_RECV, DBG_TRACE, ("RxCommandPacketHandle819xUsb: It is not a command packet\n"));
-       }
 
-       //RT_TRACE(COMP_RECV, DBG_TRACE, ("<--- RxCommandPacketHandle819xUsb()\n"));
        return status;
 }
 
@@ -1150,24 +1034,17 @@ void rtl8192_hard_data_xmit(struct sk_buff *skb, struct net_device *dev, int rat
        u8 queue_index = tcb_desc->queue_index;
 
        /* shall not be referred by command packet */
-       assert(queue_index != TXCMD_QUEUE);
+       RTL8192U_ASSERT(queue_index != TXCMD_QUEUE);
 
-       spin_lock_irqsave(&priv->tx_lock,flags);
+       spin_lock_irqsave(&priv->tx_lock, flags);
 
-       memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-//     tcb_desc->RATRIndex = 7;
-//     tcb_desc->bTxDisableRateFallBack = 1;
-//     tcb_desc->bTxUseDriverAssingedRate = 1;
+       memcpy((unsigned char *)(skb->cb), &dev, sizeof(dev));
        tcb_desc->bTxEnableFwCalcDur = 1;
        skb_push(skb, priv->ieee80211->tx_headroom);
        ret = rtl8192_tx(dev, skb);
 
-       //priv->ieee80211->stats.tx_bytes+=(skb->len - priv->ieee80211->tx_headroom);
-       //priv->ieee80211->stats.tx_packets++;
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
 
-       spin_unlock_irqrestore(&priv->tx_lock,flags);
-
-//     return ret;
        return;
 }
 
@@ -1176,7 +1053,7 @@ void rtl8192_hard_data_xmit(struct sk_buff *skb, struct net_device *dev, int rat
  * If the ring is full packet are dropped (for data frame the queue
  * is stopped before this can happen).
  */
-int rtl8192_hard_start_xmit(struct sk_buff *skb,struct net_device *dev)
+int rtl8192_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        int ret;
@@ -1185,21 +1062,21 @@ int rtl8192_hard_start_xmit(struct sk_buff *skb,struct net_device *dev)
        u8 queue_index = tcb_desc->queue_index;
 
 
-       spin_lock_irqsave(&priv->tx_lock,flags);
+       spin_lock_irqsave(&priv->tx_lock, flags);
 
-       memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-       if(queue_index == TXCMD_QUEUE) {
+       memcpy((unsigned char *)(skb->cb), &dev, sizeof(dev));
+       if (queue_index == TXCMD_QUEUE) {
                skb_push(skb, USB_HWDESC_HEADER_LEN);
                rtl819xU_tx_cmd(dev, skb);
                ret = 1;
-               spin_unlock_irqrestore(&priv->tx_lock,flags);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
                return ret;
        } else {
                skb_push(skb, priv->ieee80211->tx_headroom);
                ret = rtl8192_tx(dev, skb);
        }
 
-       spin_unlock_irqrestore(&priv->tx_lock,flags);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
 
        return ret;
 }
@@ -1211,7 +1088,7 @@ void rtl8192_try_wake_queue(struct net_device *dev, int pri);
 u16 DrvAggr_PaddingAdd(struct net_device *dev, struct sk_buff *skb)
 {
        u16     PaddingNum =  256 - ((skb->len + TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES) % 256);
-       return  (PaddingNum&0xff);
+       return  PaddingNum & 0xff;
 }
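
Each aggregated subframe (including the descriptor shift bytes) is rounded up
to the next 256-byte boundary, and the final & 0xff maps the already-aligned
case (PaddingNum == 256) to zero padding. A worked example, assuming
TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES == 8 (a hypothetical figure):

        /* skb->len == 500: 256 - ((500 + 8) % 256) == 4 bytes of padding,
         * so the next subframe starts on a 256-byte boundary;
         * skb->len == 504: PaddingNum == 256, masked to 0 by & 0xff. */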
 
 u8 MRateToHwRate8190Pci(u8 rate);
@@ -1239,7 +1116,7 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
        /* Get the total aggregation length including the padding space and
         * sub frame header.
         */
-       for(i = 1; i < pSendList->nr_drv_agg_frames; i++) {
+       for (i = 1; i < pSendList->nr_drv_agg_frames; i++) {
                TotalLength += DrvAggr_PaddingAdd(dev, skb);
                skb = pSendList->tx_agg_frames[i];
                TotalLength += (skb->len + TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
@@ -1250,23 +1127,19 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
        memset(agg_skb->data, 0, agg_skb->len);
        skb_reserve(agg_skb, ieee->tx_headroom);
 
-//     RT_DEBUG_DATA(COMP_SEND, skb->cb, sizeof(skb->cb));
        /* reserve info for first subframe Tx descriptor to be set in the tx function */
        skb = pSendList->tx_agg_frames[0];
        tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
        tcb_desc->drv_agg_enable = 1;
        tcb_desc->pkt_size = skb->len;
        tcb_desc->DrvAggrNum = pSendList->nr_drv_agg_frames;
-       printk("DrvAggNum = %d\n", tcb_desc->DrvAggrNum);
-//     RT_DEBUG_DATA(COMP_SEND, skb->cb, sizeof(skb->cb));
-//     printk("========>skb->data ======> \n");
-//     RT_DEBUG_DATA(COMP_SEND, skb->data, skb->len);
+       netdev_dbg(dev, "DrvAggNum = %d\n", tcb_desc->DrvAggrNum);
        memcpy(agg_skb->cb, skb->cb, sizeof(skb->cb));
-       memcpy(skb_put(agg_skb,skb->len),skb->data,skb->len);
+       memcpy(skb_put(agg_skb, skb->len), skb->data, skb->len);
 
-       for(i = 1; i < pSendList->nr_drv_agg_frames; i++) {
+       for (i = 1; i < pSendList->nr_drv_agg_frames; i++) {
                /* push the next sub frame to be 256 byte aline */
-               skb_put(agg_skb,DrvAggr_PaddingAdd(dev,skb));
+               skb_put(agg_skb, DrvAggr_PaddingAdd(dev, skb));
 
                /* Subframe drv Tx descriptor and firmware info setting */
                skb = pSendList->tx_agg_frames[i];
@@ -1274,13 +1147,13 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
                tx_agg_desc = (tx_desc_819x_usb_aggr_subframe *)agg_skb->tail;
                tx_fwinfo = (tx_fwinfo_819x_usb *)(agg_skb->tail + sizeof(tx_desc_819x_usb_aggr_subframe));
 
-               memset(tx_fwinfo,0,sizeof(tx_fwinfo_819x_usb));
+               memset(tx_fwinfo, 0, sizeof(tx_fwinfo_819x_usb));
                /* DWORD 0 */
-               tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80)?1:0;
+               tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80) ? 1 : 0;
                tx_fwinfo->TxRate = MRateToHwRate8190Pci(tcb_desc->data_rate);
                tx_fwinfo->EnableCPUDur = tcb_desc->bTxEnableFwCalcDur;
                tx_fwinfo->Short = QueryIsShort(tx_fwinfo->TxHT, tx_fwinfo->TxRate, tcb_desc);
-               if(tcb_desc->bAMPDUEnable) {//AMPDU enabled
+               if (tcb_desc->bAMPDUEnable) {//AMPDU enabled
                        tx_fwinfo->AllowAggregation = 1;
                        /* DWORD 1 */
                        tx_fwinfo->RxMF = tcb_desc->ampdu_factor;
@@ -1293,20 +1166,19 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
                }
 
                /* Protection mode related */
-               tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable)?1:0;
-               tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable)?1:0;
-               tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC)?1:0;
-               tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80)?1:0;
+               tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable) ? 1 : 0;
+               tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable) ? 1 : 0;
+               tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC) ? 1 : 0;
+               tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80) ? 1 : 0;
                tx_fwinfo->RtsRate =  MRateToHwRate8190Pci((u8)tcb_desc->rts_rate);
-               tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT==0)?(tcb_desc->RTSSC):0;
-               tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT==1)?((tcb_desc->bRTSBW)?1:0):0;
-               tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT==0)?(tcb_desc->bRTSUseShortPreamble?1:0):\
-                                     (tcb_desc->bRTSUseShortGI?1:0);
+               tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->RTSSC) : 0;
+               tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT == 1) ? ((tcb_desc->bRTSBW) ? 1 : 0) : 0;
+               tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->bRTSUseShortPreamble ? 1 : 0) :
+                                     (tcb_desc->bRTSUseShortGI ? 1 : 0);
 
                /* Set Bandwidth and sub-channel settings. */
-               if(priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40)
-               {
-                       if(tcb_desc->bPacketBW) {
+               if (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40) {
+                       if (tcb_desc->bPacketBW) {
                                tx_fwinfo->TxBandwidth = 1;
                                tx_fwinfo->TxSubCarrier = 0;    //By SD3's Jerry suggestion, use duplicated mode
                        } else {
@@ -1321,41 +1193,35 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
                /* Fill Tx descriptor */
                memset(tx_agg_desc, 0, sizeof(tx_desc_819x_usb_aggr_subframe));
                /* DWORD 0 */
-               //tx_agg_desc->LINIP = 0;
-               //tx_agg_desc->CmdInit = 1;
                tx_agg_desc->Offset =  sizeof(tx_fwinfo_819x_usb) + 8;
                /* already raw data, need not to subtract header length */
                tx_agg_desc->PktSize = skb->len & 0xffff;
 
                /*DWORD 1*/
-               tx_agg_desc->SecCAMID= 0;
+               tx_agg_desc->SecCAMID = 0;
                tx_agg_desc->RATid = tcb_desc->RATRIndex;
-               {
-                       //MPDUOverhead = 0;
-                       tx_agg_desc->NoEnc = 1;
-               }
+               tx_agg_desc->NoEnc = 1;
                tx_agg_desc->SecType = 0x0;
 
                if (tcb_desc->bHwSec) {
-                       switch (priv->ieee80211->pairwise_key_type)
-                       {
-                               case KEY_TYPE_WEP40:
-                               case KEY_TYPE_WEP104:
-                                       tx_agg_desc->SecType = 0x1;
-                                       tx_agg_desc->NoEnc = 0;
-                                       break;
-                               case KEY_TYPE_TKIP:
-                                       tx_agg_desc->SecType = 0x2;
-                                       tx_agg_desc->NoEnc = 0;
-                                       break;
-                               case KEY_TYPE_CCMP:
-                                       tx_agg_desc->SecType = 0x3;
-                                       tx_agg_desc->NoEnc = 0;
-                                       break;
-                               case KEY_TYPE_NA:
-                                       tx_agg_desc->SecType = 0x0;
-                                       tx_agg_desc->NoEnc = 1;
-                                       break;
+                       switch (priv->ieee80211->pairwise_key_type) {
+                       case KEY_TYPE_WEP40:
+                       case KEY_TYPE_WEP104:
+                               tx_agg_desc->SecType = 0x1;
+                               tx_agg_desc->NoEnc = 0;
+                               break;
+                       case KEY_TYPE_TKIP:
+                               tx_agg_desc->SecType = 0x2;
+                               tx_agg_desc->NoEnc = 0;
+                               break;
+                       case KEY_TYPE_CCMP:
+                               tx_agg_desc->SecType = 0x3;
+                               tx_agg_desc->NoEnc = 0;
+                               break;
+                       case KEY_TYPE_NA:
+                               tx_agg_desc->SecType = 0x0;
+                               tx_agg_desc->NoEnc = 1;
+                               break;
                        }
                }
 
@@ -1369,16 +1235,14 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
 
                //DWORD 2
                /* According windows driver, it seems that there no need to fill this field */
-               //tx_agg_desc->TxBufferSize= (u32)(skb->len - USB_HWDESC_HEADER_LEN);
 
                /* to fill next packet */
-               skb_put(agg_skb,TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
-               memcpy(skb_put(agg_skb,skb->len),skb->data,skb->len);
+               skb_put(agg_skb, TX_PACKET_DRVAGGR_SUBFRAME_SHIFT_BYTES);
+               memcpy(skb_put(agg_skb, skb->len), skb->data, skb->len);
        }
 
-       for(i = 0; i < pSendList->nr_drv_agg_frames; i++) {
+       for (i = 0; i < pSendList->nr_drv_agg_frames; i++)
                dev_kfree_skb_any(pSendList->tx_agg_frames[i]);
-       }
 
        return agg_skb;
 }
@@ -1388,7 +1252,7 @@ struct sk_buff *DrvAggr_Aggregation(struct net_device *dev, struct ieee80211_drv
        If no proper TCB is found to do aggregation, SendList will only contain the input TCB.
 */
 u8 DrvAggr_GetAggregatibleList(struct net_device *dev, struct sk_buff *skb,
-               struct ieee80211_drv_agg_txb *pSendList)
+                              struct ieee80211_drv_agg_txb *pSendList)
 {
        struct ieee80211_device *ieee = netdev_priv(dev);
        PRT_HIGH_THROUGHPUT     pHTInfo = ieee->pHTInfo;
@@ -1398,11 +1262,10 @@ u8 DrvAggr_GetAggregatibleList(struct net_device *dev, struct sk_buff *skb,
 
        do {
                pSendList->tx_agg_frames[pSendList->nr_drv_agg_frames++] = skb;
-               if(pSendList->nr_drv_agg_frames >= nMaxAggrNum) {
+               if (pSendList->nr_drv_agg_frames >= nMaxAggrNum)
                        break;
-               }
 
-       } while((skb = skb_dequeue(&ieee->skb_drv_aggQ[QueueID])));
+       } while ((skb = skb_dequeue(&ieee->skb_drv_aggQ[QueueID])));
 
        RT_TRACE(COMP_AMSDU, "DrvAggr_GetAggregatibleList, nAggrTcbNum = %d \n", pSendList->nr_drv_agg_frames);
        return pSendList->nr_drv_agg_frames;
@@ -1411,105 +1274,86 @@ u8 DrvAggr_GetAggregatibleList(struct net_device *dev, struct sk_buff *skb,
 
 static void rtl8192_tx_isr(struct urb *tx_urb)
 {
-       struct sk_buff *skb = (struct sk_buff*)tx_urb->context;
+       struct sk_buff *skb = (struct sk_buff *)tx_urb->context;
        struct net_device *dev = NULL;
        struct r8192_priv *priv = NULL;
        cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
        u8  queue_index = tcb_desc->queue_index;
-//     bool bToSend0Byte;
-//     u16 BufLen = skb->len;
 
-       memcpy(&dev,(struct net_device*)(skb->cb),sizeof(struct net_device*));
+       memcpy(&dev, (struct net_device *)(skb->cb), sizeof(struct net_device *));
        priv = ieee80211_priv(dev);
 
-       if(tcb_desc->queue_index != TXCMD_QUEUE) {
-               if(tx_urb->status == 0) {
+       if (tcb_desc->queue_index != TXCMD_QUEUE) {
+               if (tx_urb->status == 0) {
                        dev->trans_start = jiffies;
-                       // Act as station mode, destination shall be unicast address.
-                       //priv->ieee80211->stats.tx_bytes+=(skb->len - priv->ieee80211->tx_headroom);
-                       //priv->ieee80211->stats.tx_packets++;
                        priv->stats.txoktotal++;
                        priv->ieee80211->LinkDetectInfo.NumTxOkInPeriod++;
                        priv->stats.txbytesunicast += (skb->len - priv->ieee80211->tx_headroom);
                } else {
                        priv->ieee80211->stats.tx_errors++;
-                       //priv->stats.txmanageerr++;
                        /* TODO */
                }
        }
 
        /* free skb and tx_urb */
-       if(skb != NULL) {
+       if (skb != NULL) {
                dev_kfree_skb_any(skb);
                usb_free_urb(tx_urb);
                atomic_dec(&priv->tx_pending[queue_index]);
        }
 
-       {
-               //
-               // Handle HW Beacon:
-               // We had transfer our beacon frame to host controller at this moment.
-               //
-               //
-               // Caution:
-               // Handling the wait queue of command packets.
-               // For Tx command packets, we must not do TCB fragment because it is not handled right now.
-               // We must cut the packets to match the size of TX_CMD_PKT before we send it.
-               //
+       //
+       // Handle HW Beacon:
+       // We had transfer our beacon frame to host controller at this moment.
+       //
+       //
+       // Caution:
+       // Handling the wait queue of command packets.
+       // For Tx command packets, we must not do TCB fragment because it is not handled right now.
+       // We must cut the packets to match the size of TX_CMD_PKT before we send it.
+       //
 
-               /* Handle MPDU in wait queue. */
-               if(queue_index != BEACON_QUEUE) {
-                       /* Don't send data frame during scanning.*/
-                       if((skb_queue_len(&priv->ieee80211->skb_waitQ[queue_index]) != 0)&&\
-                                       (!(priv->ieee80211->queue_stop))) {
-                               if(NULL != (skb = skb_dequeue(&(priv->ieee80211->skb_waitQ[queue_index]))))
-                                       priv->ieee80211->softmac_hard_start_xmit(skb, dev);
+       /* Handle MPDU in wait queue. */
+       if (queue_index != BEACON_QUEUE) {
+               /* Don't send data frame during scanning.*/
+               if ((skb_queue_len(&priv->ieee80211->skb_waitQ[queue_index]) != 0) &&
+                   (!(priv->ieee80211->queue_stop))) {
+                       if (NULL != (skb = skb_dequeue(&(priv->ieee80211->skb_waitQ[queue_index]))))
+                               priv->ieee80211->softmac_hard_start_xmit(skb, dev);
 
-                               return; //modified by david to avoid further processing AMSDU
-                       }
+                       return; //modified by david to avoid further processing AMSDU
+               }
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-                       else if ((skb_queue_len(&priv->ieee80211->skb_drv_aggQ[queue_index])!= 0)&&\
-                               (!(priv->ieee80211->queue_stop))) {
-                               // Tx Driver Aggregation process
-                               /* The driver will aggregation the packets according to the following stats
-                                * 1. check whether there's tx irq available, for it's a completion return
-                                *    function, it should contain enough tx irq;
-                                * 2. check packet type;
-                                * 3. initialize sendlist, check whether the to-be send packet no greater than 1
-                                * 4. aggregates the packets, and fill firmware info and tx desc into it, etc.
-                                * 5. check whether the packet could be sent, otherwise just insert into wait head
-                                * */
-                               skb = skb_dequeue(&priv->ieee80211->skb_drv_aggQ[queue_index]);
-                               if(!check_nic_enough_desc(dev, queue_index)) {
-                                       skb_queue_head(&(priv->ieee80211->skb_drv_aggQ[queue_index]), skb);
-                                       return;
-                               }
+               else if ((skb_queue_len(&priv->ieee80211->skb_drv_aggQ[queue_index]) != 0) &&
+                        (!(priv->ieee80211->queue_stop))) {
+                       // Tx Driver Aggregation process
+                       /* The driver will aggregation the packets according to the following stats
+                        * 1. check whether there's tx irq available, for it's a completion return
+                        *    function, it should contain enough tx irq;
+                        * 2. check packet type;
+                        * 3. initialize sendlist, check whether the to-be send packet no greater than 1
+                        * 4. aggregates the packets, and fill firmware info and tx desc into it, etc.
+                        * 5. check whether the packet could be sent, otherwise just insert into wait head
+                        * */
+                       skb = skb_dequeue(&priv->ieee80211->skb_drv_aggQ[queue_index]);
+                       if (!check_nic_enough_desc(dev, queue_index)) {
+                               skb_queue_head(&(priv->ieee80211->skb_drv_aggQ[queue_index]), skb);
+                               return;
+                       }
+
+                       /*TODO*/
+                       {
+                               struct ieee80211_drv_agg_txb SendList;
+
+                               memset(&SendList, 0, sizeof(struct ieee80211_drv_agg_txb));
+                               if (DrvAggr_GetAggregatibleList(dev, skb, &SendList) > 1) {
+                                       skb = DrvAggr_Aggregation(dev, &SendList);
 
-                               {
-                                       /*TODO*/
-                                       /*
-                                       u8* pHeader = skb->data;
-
-                                       if(IsMgntQosData(pHeader) ||
-                                           IsMgntQData_Ack(pHeader) ||
-                                           IsMgntQData_Poll(pHeader) ||
-                                           IsMgntQData_Poll_Ack(pHeader)
-                                         )
-                                       */
-                                       {
-                                               struct ieee80211_drv_agg_txb SendList;
-
-                                               memset(&SendList, 0, sizeof(struct ieee80211_drv_agg_txb));
-                                               if(DrvAggr_GetAggregatibleList(dev, skb, &SendList) > 1) {
-                                                       skb = DrvAggr_Aggregation(dev, &SendList);
-
-                                               }
-                                       }
-                                       priv->ieee80211->softmac_hard_start_xmit(skb, dev);
                                }
                        }
-#endif
+                       priv->ieee80211->softmac_hard_start_xmit(skb, dev);
                }
+#endif
        }
 
 }
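
The five-step comment above boils down to a small control flow: dequeue the next frame, verify descriptor space (re-queuing at the head when the NIC is full), collect an aggregatible send list, aggregate only when more than one frame qualifies, then hand the result to softmac_hard_start_xmit(). A toy userspace model of that flow; the queue and descriptor helpers here are hypothetical stand-ins for the driver's own:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy queue of frame IDs standing in for skb_drv_aggQ. */
	static int q[8] = { 11, 22, 33 };
	static int qlen = 3;

	static int dequeue(void)
	{
		int f = q[0];
		for (int i = 1; i < qlen; i++)
			q[i - 1] = q[i];
		qlen--;
		return f;
	}

	static void requeue_head(int f)
	{
		for (int i = qlen; i > 0; i--)
			q[i] = q[i - 1];
		q[0] = f;
		qlen++;
	}

	/* models check_nic_enough_desc() */
	static bool nic_has_tx_desc(void) { return true; }

	static void tx_isr_drv_agg(void)
	{
		int list[8], n = 0;
		int f = dequeue();                 /* steps 1-2: take the next frame  */

		if (!nic_has_tx_desc()) {          /* no descriptor: undo the dequeue */
			requeue_head(f);
			return;
		}
		list[n++] = f;                     /* step 3: build the send list     */
		while (qlen > 0 && n < 8)
			list[n++] = dequeue();
		if (n > 1)                         /* step 4: aggregate only if >1    */
			printf("aggregating %d frames into one burst\n", n);
		printf("xmit frame/burst starting with id %d\n", list[0]); /* step 5 */
	}

	int main(void) { tx_isr_drv_agg(); return 0; }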
@@ -1519,72 +1363,67 @@ void rtl8192_beacon_stop(struct net_device *dev)
        u8 msr, msrm, msr2;
        struct r8192_priv *priv = ieee80211_priv(dev);
 
-       msr  = read_nic_byte(dev, MSR);
+       read_nic_byte(dev, MSR, &msr);
        msrm = msr & MSR_LINK_MASK;
        msr2 = msr & ~MSR_LINK_MASK;
 
-       if(NIC_8192U == priv->card_8192) {
+       if (NIC_8192U == priv->card_8192)
                usb_kill_urb(priv->rx_urb[MAX_RX_URB]);
-       }
        if ((msrm == (MSR_LINK_ADHOC<<MSR_LINK_SHIFT) ||
-               (msrm == (MSR_LINK_MASTER<<MSR_LINK_SHIFT)))){
+           (msrm == (MSR_LINK_MASTER<<MSR_LINK_SHIFT)))) {
                write_nic_byte(dev, MSR, msr2 | MSR_LINK_NONE);
                write_nic_byte(dev, MSR, msr);
        }
 }
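
rtl8192_beacon_stop() toggles only the link-type field of MSR: msr2 keeps every bit except the link type, the first write forces the type to "none", and the second restores the original value, which restarts beaconing for ad-hoc/master links. A standalone illustration of the masking, assuming a two-bit link-type field at the bottom of the register (the real MSR_LINK_* values live in the driver headers):

	#include <stdio.h>

	#define MSR_LINK_MASK  0x03   /* assumed: bits 0-1 */
	#define MSR_LINK_NONE  0x00

	int main(void)
	{
		unsigned char msr  = 0xA1;                 /* example register snapshot  */
		unsigned char msrm = msr & MSR_LINK_MASK;  /* isolate the link-type bits */
		unsigned char msr2 = msr & ~MSR_LINK_MASK; /* everything but link type   */

		printf("link=%#x rest=%#x\n", msrm, msr2);
		printf("write1=%#x (link forced to NONE)\n", msr2 | MSR_LINK_NONE);
		printf("write2=%#x (original restored)\n", msr);
		return 0;
	}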
 
-void rtl8192_config_rate(struct net_device* dev, u16* rate_config)
-{
-        struct r8192_priv *priv = ieee80211_priv(dev);
-        struct ieee80211_network *net;
-        u8 i=0, basic_rate = 0;
-        net = & priv->ieee80211->current_network;
-
-        for (i=0; i<net->rates_len; i++)
-        {
-                basic_rate = net->rates[i]&0x7f;
-                switch(basic_rate)
-                {
-                        case MGN_1M:   *rate_config |= RRSR_1M;        break;
-                        case MGN_2M:   *rate_config |= RRSR_2M;        break;
-                        case MGN_5_5M: *rate_config |= RRSR_5_5M;      break;
-                        case MGN_11M:  *rate_config |= RRSR_11M;       break;
-                        case MGN_6M:   *rate_config |= RRSR_6M;        break;
-                        case MGN_9M:   *rate_config |= RRSR_9M;        break;
-                        case MGN_12M:  *rate_config |= RRSR_12M;       break;
-                        case MGN_18M:  *rate_config |= RRSR_18M;       break;
-                        case MGN_24M:  *rate_config |= RRSR_24M;       break;
-                        case MGN_36M:  *rate_config |= RRSR_36M;       break;
-                        case MGN_48M:  *rate_config |= RRSR_48M;       break;
-                        case MGN_54M:  *rate_config |= RRSR_54M;       break;
-                }
-        }
-        for (i=0; i<net->rates_ex_len; i++)
-        {
-                basic_rate = net->rates_ex[i]&0x7f;
-                switch(basic_rate)
-                {
-                        case MGN_1M:   *rate_config |= RRSR_1M;        break;
-                        case MGN_2M:   *rate_config |= RRSR_2M;        break;
-                        case MGN_5_5M: *rate_config |= RRSR_5_5M;      break;
-                        case MGN_11M:  *rate_config |= RRSR_11M;       break;
-                        case MGN_6M:   *rate_config |= RRSR_6M;        break;
-                        case MGN_9M:   *rate_config |= RRSR_9M;        break;
-                        case MGN_12M:  *rate_config |= RRSR_12M;       break;
-                        case MGN_18M:  *rate_config |= RRSR_18M;       break;
-                        case MGN_24M:  *rate_config |= RRSR_24M;       break;
-                        case MGN_36M:  *rate_config |= RRSR_36M;       break;
-                        case MGN_48M:  *rate_config |= RRSR_48M;       break;
-                        case MGN_54M:  *rate_config |= RRSR_54M;       break;
-                }
-        }
+void rtl8192_config_rate(struct net_device *dev, u16 *rate_config)
+{
+       struct r8192_priv *priv = ieee80211_priv(dev);
+       struct ieee80211_network *net;
+       u8 i = 0, basic_rate = 0;
+       net = &priv->ieee80211->current_network;
+
+       for (i = 0; i < net->rates_len; i++) {
+               basic_rate = net->rates[i]&0x7f;
+               switch (basic_rate) {
+               case MGN_1M:    *rate_config |= RRSR_1M;        break;
+               case MGN_2M:    *rate_config |= RRSR_2M;        break;
+               case MGN_5_5M:  *rate_config |= RRSR_5_5M;      break;
+               case MGN_11M:   *rate_config |= RRSR_11M;       break;
+               case MGN_6M:    *rate_config |= RRSR_6M;        break;
+               case MGN_9M:    *rate_config |= RRSR_9M;        break;
+               case MGN_12M:   *rate_config |= RRSR_12M;       break;
+               case MGN_18M:   *rate_config |= RRSR_18M;       break;
+               case MGN_24M:   *rate_config |= RRSR_24M;       break;
+               case MGN_36M:   *rate_config |= RRSR_36M;       break;
+               case MGN_48M:   *rate_config |= RRSR_48M;       break;
+               case MGN_54M:   *rate_config |= RRSR_54M;       break;
+               }
+       }
+       for (i = 0; i < net->rates_ex_len; i++) {
+               basic_rate = net->rates_ex[i]&0x7f;
+               switch (basic_rate) {
+               case MGN_1M:    *rate_config |= RRSR_1M;        break;
+               case MGN_2M:    *rate_config |= RRSR_2M;        break;
+               case MGN_5_5M:  *rate_config |= RRSR_5_5M;      break;
+               case MGN_11M:   *rate_config |= RRSR_11M;       break;
+               case MGN_6M:    *rate_config |= RRSR_6M;        break;
+               case MGN_9M:    *rate_config |= RRSR_9M;        break;
+               case MGN_12M:   *rate_config |= RRSR_12M;       break;
+               case MGN_18M:   *rate_config |= RRSR_18M;       break;
+               case MGN_24M:   *rate_config |= RRSR_24M;       break;
+               case MGN_36M:   *rate_config |= RRSR_36M;       break;
+               case MGN_48M:   *rate_config |= RRSR_48M;       break;
+               case MGN_54M:   *rate_config |= RRSR_54M;       break;
+               }
+       }
 }
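
Each byte of rates[]/rates_ex[] conventionally encodes the rate in 500 kb/s units with bit 7 marking it as a basic rate, which is why the loops mask with 0x7f before the switch. Assuming that encoding, 0x96 decodes to "11 Mb/s, basic" (0x96 & 0x7f = 0x16 = 22 half-Mb/s units, i.e. MGN_11M), which sets RRSR_11M. A quick decode:

	#include <stdio.h>

	int main(void)
	{
		/* bit 7 = basic-rate flag, low 7 bits = rate in 500 kb/s units */
		unsigned char raw  = 0x96;
		unsigned char rate = raw & 0x7f;   /* 0x16 = 22 -> 11 Mb/s */

		printf("basic=%d, rate=%u (%u.%u Mb/s)\n",
		       !!(raw & 0x80), rate, rate / 2, (rate % 2) * 5);
		return 0;
	}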
 
 
 #define SHORT_SLOT_TIME 9
 #define NON_SHORT_SLOT_TIME 20
 
-void rtl8192_update_cap(struct net_devicedev, u16 cap)
+void rtl8192_update_cap(struct net_device *dev, u16 cap)
 {
        u32 tmp = 0;
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -1595,13 +1434,10 @@ void rtl8192_update_cap(struct net_device* dev, u16 cap)
                tmp |= BRSR_AckShortPmb;
        write_nic_dword(dev, RRSR, tmp);
 
-       if (net->mode & (IEEE_G|IEEE_N_24G))
-       {
+       if (net->mode & (IEEE_G|IEEE_N_24G)) {
                u8 slot_time = 0;
-               if ((cap & WLAN_CAPABILITY_SHORT_SLOT)&&(!priv->ieee80211->pHTInfo->bCurrentRT2RTLongSlotTime))
-               {//short slot time
+               if ((cap & WLAN_CAPABILITY_SHORT_SLOT) && (!priv->ieee80211->pHTInfo->bCurrentRT2RTLongSlotTime)) //short slot time
                        slot_time = SHORT_SLOT_TIME;
-               }
                else //long slot time
                        slot_time = NON_SHORT_SLOT_TIME;
                priv->slot_time = slot_time;
@@ -1616,31 +1452,26 @@ void rtl8192_net_update(struct net_device *dev)
        struct ieee80211_network *net;
        u16 BcnTimeCfg = 0, BcnCW = 6, BcnIFS = 0xf;
        u16 rate_config = 0;
-       net = & priv->ieee80211->current_network;
+       net = &priv->ieee80211->current_network;
 
        rtl8192_config_rate(dev, &rate_config);
        priv->basic_rate = rate_config &= 0x15f;
 
-       write_nic_dword(dev,BSSIDR,((u32*)net->bssid)[0]);
-       write_nic_word(dev,BSSIDR+4,((u16*)net->bssid)[2]);
-       //for(i=0;i<ETH_ALEN;i++)
-       //      write_nic_byte(dev,BSSID+i,net->bssid[i]);
+       write_nic_dword(dev, BSSIDR, ((u32 *)net->bssid)[0]);
+       write_nic_word(dev, BSSIDR+4, ((u16 *)net->bssid)[2]);
 
        rtl8192_update_msr(dev);
-//     rtl8192_update_cap(dev, net->capability);
-       if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-       {
-       write_nic_word(dev, ATIMWND, 2);
-       write_nic_word(dev, BCN_DMATIME, 1023);
-       write_nic_word(dev, BCN_INTERVAL, net->beacon_interval);
-//     write_nic_word(dev, BcnIntTime, 100);
-       write_nic_word(dev, BCN_DRV_EARLY_INT, 1);
-       write_nic_byte(dev, BCN_ERR_THRESH, 100);
+       if (priv->ieee80211->iw_mode == IW_MODE_ADHOC) {
+               write_nic_word(dev, ATIMWND, 2);
+               write_nic_word(dev, BCN_DMATIME, 1023);
+               write_nic_word(dev, BCN_INTERVAL, net->beacon_interval);
+               write_nic_word(dev, BCN_DRV_EARLY_INT, 1);
+               write_nic_byte(dev, BCN_ERR_THRESH, 100);
                BcnTimeCfg |= (BcnCW<<BCN_TCFG_CW_SHIFT);
-       // TODO: BcnIFS may required to be changed on ASIC
+               // TODO: BcnIFS may required to be changed on ASIC
                BcnTimeCfg |= BcnIFS<<BCN_TCFG_IFS;
 
-       write_nic_word(dev, BCN_TCFG, BcnTimeCfg);
+               write_nic_word(dev, BCN_TCFG, BcnTimeCfg);
        }
 
 
@@ -1649,46 +1480,37 @@ void rtl8192_net_update(struct net_device *dev)
 
 //temporary hw beacon is not used any more.
 //open it when necessary
-void rtl819xusb_beacon_tx(struct net_device *dev,u16  tx_rate)
+void rtl819xusb_beacon_tx(struct net_device *dev, u16  tx_rate)
 {
 
 }
 inline u8 rtl8192_IsWirelessBMode(u16 rate)
 {
-       if( ((rate <= 110) && (rate != 60) && (rate != 90)) || (rate == 220) )
+       if (((rate <= 110) && (rate != 60) && (rate != 90)) || (rate == 220))
                return 1;
        else return 0;
 }
 
 u16 N_DBPSOfRate(u16 DataRate);
 
-u16 ComputeTxTime(
-       u16             FrameLength,
-       u16             DataRate,
-       u8              bManagementFrame,
-       u8              bShortPreamble
-)
+u16 ComputeTxTime(u16 FrameLength, u16 DataRate, u8 bManagementFrame,
+                 u8 bShortPreamble)
 {
        u16     FrameTime;
        u16     N_DBPS;
        u16     Ceiling;
 
-       if( rtl8192_IsWirelessBMode(DataRate) )
-       {
-               if( bManagementFrame || !bShortPreamble || DataRate == 10 )
-               {       // long preamble
+       if (rtl8192_IsWirelessBMode(DataRate)) {
+               if (bManagementFrame || !bShortPreamble || DataRate == 10) // long preamble
                        FrameTime = (u16)(144+48+(FrameLength*8/(DataRate/10)));
-               }
-               else
-               {       // Short preamble
+               else // Short preamble
                        FrameTime = (u16)(72+24+(FrameLength*8/(DataRate/10)));
-               }
-               if( ( FrameLength*8 % (DataRate/10) ) != 0 ) //Get the Ceilling
-                               FrameTime ++;
+               if ((FrameLength*8 % (DataRate/10)) != 0) //Get the Ceilling
+                       FrameTime++;
        } else {        //802.11g DSSS-OFDM PLCP length field calculation.
                N_DBPS = N_DBPSOfRate(DataRate);
                Ceiling = (16 + 8*FrameLength + 6) / N_DBPS
-                               + (((16 + 8*FrameLength + 6) % N_DBPS) ? 1 : 0);
+                       + (((16 + 8*FrameLength + 6) % N_DBPS) ? 1 : 0);
                FrameTime = (u16)(16 + 4 + 4*Ceiling + 6);
        }
        return FrameTime;
@@ -1696,47 +1518,46 @@ u16 ComputeTxTime(
 
 u16 N_DBPSOfRate(u16 DataRate)
 {
-        u16 N_DBPS = 24;
+       u16 N_DBPS = 24;
 
-        switch(DataRate)
-        {
-        case 60:
-         N_DBPS = 24;
-         break;
+       switch (DataRate) {
+       case 60:
+               N_DBPS = 24;
+               break;
 
-        case 90:
-         N_DBPS = 36;
-         break;
+       case 90:
+               N_DBPS = 36;
+               break;
 
-        case 120:
-         N_DBPS = 48;
-         break;
+       case 120:
+               N_DBPS = 48;
+               break;
 
-        case 180:
-         N_DBPS = 72;
-         break;
+       case 180:
+               N_DBPS = 72;
+               break;
 
-        case 240:
-         N_DBPS = 96;
-         break;
+       case 240:
+               N_DBPS = 96;
+               break;
 
-        case 360:
-         N_DBPS = 144;
-         break;
+       case 360:
+               N_DBPS = 144;
+               break;
 
-        case 480:
-         N_DBPS = 192;
-         break;
+       case 480:
+               N_DBPS = 192;
+               break;
 
-        case 540:
-         N_DBPS = 216;
-         break;
+       case 540:
+               N_DBPS = 216;
+               break;
 
-        default:
-         break;
-        }
+       default:
+               break;
+       }
 
-        return N_DBPS;
+       return N_DBPS;
 }
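
DataRate throughout these helpers is in units of 100 kb/s (hence the <= 110 test for 11b rates and the DataRate/10 divisions), and N_DBPS is the number of data bits carried per 4 µs OFDM symbol, so the table above is self-consistent: 216 bits per 4 µs is exactly 54 Mb/s. A userspace replica of both ComputeTxTime() branches, with one worked example each:

	#include <stdio.h>

	/* Units follow the driver: FrameLength in bytes, DataRate in 100 kb/s. */
	static unsigned tx_time_11b_long(unsigned len, unsigned rate)
	{
		unsigned t = 144 + 48 + (len * 8) / (rate / 10);
		if ((len * 8) % (rate / 10))
			t++;                              /* ceiling correction */
		return t;
	}

	static unsigned tx_time_ofdm(unsigned len, unsigned n_dbps)
	{
		unsigned bits = 16 + 8 * len + 6;         /* SERVICE + data + tail  */
		unsigned sym  = bits / n_dbps + (bits % n_dbps ? 1 : 0);
		return 16 + 4 + 4 * sym + 6;
	}

	int main(void)
	{
		printf("1024 B @ 11 Mb/s, long preamble: %u us\n",
		       tx_time_11b_long(1024, 110));      /* 192 + 744 + 1 = 937 */
		printf("1500 B @ 54 Mb/s (N_DBPS=216):   %u us\n",
		       tx_time_ofdm(1500, 216));          /* 16+4+4*56+6 = 250   */
		return 0;
	}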
 
 void rtl819xU_cmd_isr(struct urb *tx_cmd_urb, struct pt_regs *regs)
@@ -1744,11 +1565,10 @@ void rtl819xU_cmd_isr(struct urb *tx_cmd_urb, struct pt_regs *regs)
        usb_free_urb(tx_cmd_urb);
 }
 
-unsigned int txqueue2outpipe(struct r8192_priv* priv,unsigned int tx_queue) {
-
-       if(tx_queue >= 9)
-       {
-               RT_TRACE(COMP_ERR,"%s():Unknown queue ID!!!\n",__FUNCTION__);
+unsigned int txqueue2outpipe(struct r8192_priv *priv, unsigned int tx_queue)
+{
+       if (tx_queue >= 9) {
+               RT_TRACE(COMP_ERR, "%s():Unknown queue ID!!!\n", __func__);
                return 0x04;
        }
        return priv->txqueue_to_outpipemap[tx_queue];
@@ -1757,19 +1577,16 @@ unsigned int txqueue2outpipe(struct r8192_priv* priv,unsigned int tx_queue) {
 short rtl819xU_tx_cmd(struct net_device *dev, struct sk_buff *skb)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       //u8                    *tx;
        int                     status;
        struct urb              *tx_urb;
-       //int                   urb_buf_len;
        unsigned int            idx_pipe;
        tx_desc_cmd_819x_usb *pdesc = (tx_desc_cmd_819x_usb *)skb->data;
        cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
        u8 queue_index = tcb_desc->queue_index;
 
-       //printk("\n %s::queue_index = %d\n",__FUNCTION__, queue_index);
        atomic_inc(&priv->tx_pending[queue_index]);
-       tx_urb = usb_alloc_urb(0,GFP_ATOMIC);
-       if(!tx_urb){
+       tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+       if (!tx_urb) {
                dev_kfree_skb(skb);
                return -ENOMEM;
        }
@@ -1788,27 +1605,26 @@ short rtl819xU_tx_cmd(struct net_device *dev, struct sk_buff *skb)
        //----------------------------------------------------------------------------
        // Get index to out pipe from specified QueueID.
 #ifndef USE_ONE_PIPE
-       idx_pipe = txqueue2outpipe(priv,queue_index);
+       idx_pipe = txqueue2outpipe(priv, queue_index);
 #else
        idx_pipe = 0x04;
 #endif
 #ifdef JOHN_DUMP_TXDESC
        int i;
-       printk("<Tx descriptor>--rate %x---",rate);
+       printk("<Tx descriptor>--rate %x---", rate);
        for (i = 0; i < 8; i++)
                printk("%8x ", tx[i]);
        printk("\n");
 #endif
-       usb_fill_bulk_urb(tx_urb,priv->udev, usb_sndbulkpipe(priv->udev,idx_pipe), \
-                       skb->data, skb->len, rtl8192_tx_isr, skb);
+       usb_fill_bulk_urb(tx_urb, priv->udev, usb_sndbulkpipe(priv->udev, idx_pipe),
+                         skb->data, skb->len, rtl8192_tx_isr, skb);
 
        status = usb_submit_urb(tx_urb, GFP_ATOMIC);
 
-       if (!status){
+       if (!status) {
                return 0;
-       }else{
-               DMESGE("Error TX CMD URB, error %d",
-                               status);
+       } else {
+               DMESGE("Error TX CMD URB, error %d", status);
                return -1;
        }
 }
@@ -1824,21 +1640,21 @@ u8 MapHwQueueToFirmwareQueue(u8 QueueID)
 {
        u8 QueueSelect = 0x0;       //defualt set to
 
-       switch(QueueID) {
+       switch (QueueID) {
        case BE_QUEUE:
-               QueueSelect = QSLT_BE;  //or QSelect = pTcb->priority;
+               QueueSelect = QSLT_BE;
                break;
 
        case BK_QUEUE:
-               QueueSelect = QSLT_BK;  //or QSelect = pTcb->priority;
+               QueueSelect = QSLT_BK;
                break;
 
        case VO_QUEUE:
-               QueueSelect = QSLT_VO;  //or QSelect = pTcb->priority;
+               QueueSelect = QSLT_VO;
                break;
 
        case VI_QUEUE:
-               QueueSelect = QSLT_VI;  //or QSelect = pTcb->priority;
+               QueueSelect = QSLT_VI;
                break;
        case MGNT_QUEUE:
                QueueSelect = QSLT_MGNT;
@@ -1850,11 +1666,9 @@ u8 MapHwQueueToFirmwareQueue(u8 QueueID)
 
                // TODO: 2006.10.30 mark other queue selection until we verify it is OK
                // TODO: Remove Assertions
-//#if (RTL819X_FPGA_VER & RTL819X_FPGA_GUANGAN_070502)
        case TXCMD_QUEUE:
                QueueSelect = QSLT_CMD;
                break;
-//#endif
        case HIGH_QUEUE:
                QueueSelect = QSLT_HIGH;
                break;
@@ -1870,7 +1684,7 @@ u8 MRateToHwRate8190Pci(u8 rate)
 {
        u8  ret = DESC90_RATE1M;
 
-       switch(rate) {
+       switch (rate) {
        case MGN_1M:    ret = DESC90_RATE1M;    break;
        case MGN_2M:    ret = DESC90_RATE2M;    break;
        case MGN_5_5M:  ret = DESC90_RATE5_5M;  break;
@@ -1913,9 +1727,9 @@ u8 QueryIsShort(u8 TxHT, u8 TxRate, cb_desc *tcb_desc)
 {
        u8   tmp_Short;
 
-       tmp_Short = (TxHT==1)?((tcb_desc->bUseShortGI)?1:0):((tcb_desc->bUseShortPreamble)?1:0);
+       tmp_Short = (TxHT == 1) ? ((tcb_desc->bUseShortGI) ? 1 : 0) : ((tcb_desc->bUseShortPreamble) ? 1 : 0);
 
-       if(TxHT==1 && TxRate != DESC90_RATEMCS15)
+       if (TxHT == 1 && TxRate != DESC90_RATEMCS15)
                tmp_Short = 0;
 
        return tmp_Short;
@@ -1931,7 +1745,7 @@ static void tx_zero_isr(struct urb *tx_urb)
  * skb->cb will contain all the following information,
  * priority, morefrag, rate, &dev.
  * */
-short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
+short rtl8192_tx(struct net_device *dev, struct sk_buff *skb)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        cb_desc *tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
@@ -1941,35 +1755,32 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
        int pend;
        int status;
        struct urb *tx_urb = NULL, *tx_urb_zero = NULL;
-       //int urb_len;
        unsigned int idx_pipe;
-//     RT_DEBUG_DATA(COMP_SEND, tcb_desc, sizeof(cb_desc));
-//     printk("=============> %s\n", __FUNCTION__);
        pend = atomic_read(&priv->tx_pending[tcb_desc->queue_index]);
        /* we are locked here so the two atomic_read and inc are executed
         * without interleaves
         * !!! For debug purpose
         */
-       if( pend > MAX_TX_URB){
-               printk("To discard skb packet!\n");
+       if (pend > MAX_TX_URB) {
+               netdev_dbg(dev, "To discard skb packet!\n");
                dev_kfree_skb_any(skb);
                return -1;
        }
 
-       tx_urb = usb_alloc_urb(0,GFP_ATOMIC);
-       if(!tx_urb){
+       tx_urb = usb_alloc_urb(0, GFP_ATOMIC);
+       if (!tx_urb) {
                dev_kfree_skb_any(skb);
                return -ENOMEM;
        }
 
        /* Fill Tx firmware info */
-       memset(tx_fwinfo,0,sizeof(tx_fwinfo_819x_usb));
+       memset(tx_fwinfo, 0, sizeof(tx_fwinfo_819x_usb));
        /* DWORD 0 */
-       tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80)?1:0;
+       tx_fwinfo->TxHT = (tcb_desc->data_rate&0x80) ? 1 : 0;
        tx_fwinfo->TxRate = MRateToHwRate8190Pci(tcb_desc->data_rate);
        tx_fwinfo->EnableCPUDur = tcb_desc->bTxEnableFwCalcDur;
        tx_fwinfo->Short = QueryIsShort(tx_fwinfo->TxHT, tx_fwinfo->TxRate, tcb_desc);
-       if(tcb_desc->bAMPDUEnable) {//AMPDU enabled
+       if (tcb_desc->bAMPDUEnable) {//AMPDU enabled
                tx_fwinfo->AllowAggregation = 1;
                /* DWORD 1 */
                tx_fwinfo->RxMF = tcb_desc->ampdu_factor;
@@ -1982,20 +1793,19 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
        }
 
        /* Protection mode related */
-       tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable)?1:0;
-       tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable)?1:0;
-       tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC)?1:0;
-       tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80)?1:0;
+       tx_fwinfo->RtsEnable = (tcb_desc->bRTSEnable) ? 1 : 0;
+       tx_fwinfo->CtsEnable = (tcb_desc->bCTSEnable) ? 1 : 0;
+       tx_fwinfo->RtsSTBC = (tcb_desc->bRTSSTBC) ? 1 : 0;
+       tx_fwinfo->RtsHT = (tcb_desc->rts_rate&0x80) ? 1 : 0;
        tx_fwinfo->RtsRate =  MRateToHwRate8190Pci((u8)tcb_desc->rts_rate);
-       tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT==0)?(tcb_desc->RTSSC):0;
-       tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT==1)?((tcb_desc->bRTSBW)?1:0):0;
-       tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT==0)?(tcb_desc->bRTSUseShortPreamble?1:0):\
-                               (tcb_desc->bRTSUseShortGI?1:0);
+       tx_fwinfo->RtsSubcarrier = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->RTSSC) : 0;
+       tx_fwinfo->RtsBandwidth = (tx_fwinfo->RtsHT == 1) ? ((tcb_desc->bRTSBW) ? 1 : 0) : 0;
+       tx_fwinfo->RtsShort = (tx_fwinfo->RtsHT == 0) ? (tcb_desc->bRTSUseShortPreamble ? 1 : 0) :
+                             (tcb_desc->bRTSUseShortGI ? 1 : 0);
 
        /* Set Bandwidth and sub-channel settings. */
-       if(priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40)
-       {
-               if(tcb_desc->bPacketBW) {
+       if (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20_40) {
+               if (tcb_desc->bPacketBW) {
                        tx_fwinfo->TxBandwidth = 1;
                        tx_fwinfo->TxSubCarrier = 0;    //By SD3's Jerry suggestion, use duplicated mode
                } else {
@@ -2009,9 +1819,7 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
        if (tcb_desc->drv_agg_enable)
-       {
                tx_fwinfo->Tx_INFO_RSVD = (tcb_desc->DrvAggrNum & 0x1f) << 1;
-       }
 #endif
        /* Fill Tx descriptor */
        memset(tx_desc, 0, sizeof(tx_desc_819x_usb));
@@ -2021,45 +1829,40 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
        tx_desc->Offset =  sizeof(tx_fwinfo_819x_usb) + 8;
 
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-       if (tcb_desc->drv_agg_enable) {
+       if (tcb_desc->drv_agg_enable)
                tx_desc->PktSize = tcb_desc->pkt_size;
-       else
+       else
 #endif
        {
                tx_desc->PktSize = (skb->len - TX_PACKET_SHIFT_BYTES) & 0xffff;
        }
 
        /*DWORD 1*/
-       tx_desc->SecCAMID= 0;
+       tx_desc->SecCAMID = 0;
        tx_desc->RATid = tcb_desc->RATRIndex;
-       {
-               //MPDUOverhead = 0;
-               tx_desc->NoEnc = 1;
-       }
+       tx_desc->NoEnc = 1;
        tx_desc->SecType = 0x0;
-               if (tcb_desc->bHwSec)
-                       {
-                               switch (priv->ieee80211->pairwise_key_type)
-                               {
-                                       case KEY_TYPE_WEP40:
-                                       case KEY_TYPE_WEP104:
-                                                tx_desc->SecType = 0x1;
-                                                tx_desc->NoEnc = 0;
-                                                break;
-                                       case KEY_TYPE_TKIP:
-                                                tx_desc->SecType = 0x2;
-                                                tx_desc->NoEnc = 0;
-                                                break;
-                                       case KEY_TYPE_CCMP:
-                                                tx_desc->SecType = 0x3;
-                                                tx_desc->NoEnc = 0;
-                                                break;
-                                       case KEY_TYPE_NA:
-                                                tx_desc->SecType = 0x0;
-                                                tx_desc->NoEnc = 1;
-                                                break;
-                               }
-                       }
+       if (tcb_desc->bHwSec) {
+               switch (priv->ieee80211->pairwise_key_type) {
+               case KEY_TYPE_WEP40:
+               case KEY_TYPE_WEP104:
+                       tx_desc->SecType = 0x1;
+                       tx_desc->NoEnc = 0;
+                       break;
+               case KEY_TYPE_TKIP:
+                       tx_desc->SecType = 0x2;
+                       tx_desc->NoEnc = 0;
+                       break;
+               case KEY_TYPE_CCMP:
+                       tx_desc->SecType = 0x3;
+                       tx_desc->NoEnc = 0;
+                       break;
+               case KEY_TYPE_NA:
+                       tx_desc->SecType = 0x0;
+                       tx_desc->NoEnc = 1;
+                       break;
+               }
+       }
 
        tx_desc->QueueSelect = MapHwQueueToFirmwareQueue(tcb_desc->queue_index);
        tx_desc->TxFWInfoSize =  sizeof(tx_fwinfo_819x_usb);
@@ -2084,48 +1887,41 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
        }
        /* Get index to out pipe from specified QueueID */
 #ifndef USE_ONE_PIPE
-       idx_pipe = txqueue2outpipe(priv,tcb_desc->queue_index);
+       idx_pipe = txqueue2outpipe(priv, tcb_desc->queue_index);
 #else
        idx_pipe = 0x5;
 #endif
 
-       //RT_DEBUG_DATA(COMP_SEND,tx_fwinfo,sizeof(tx_fwinfo_819x_usb));
-       //RT_DEBUG_DATA(COMP_SEND,tx_desc,sizeof(tx_desc_819x_usb));
-
        /* To submit bulk urb */
-       usb_fill_bulk_urb(tx_urb,udev,
-                       usb_sndbulkpipe(udev,idx_pipe), skb->data,
-                       skb->len, rtl8192_tx_isr, skb);
+       usb_fill_bulk_urb(tx_urb, udev,
+                         usb_sndbulkpipe(udev, idx_pipe), skb->data,
+                         skb->len, rtl8192_tx_isr, skb);
 
        status = usb_submit_urb(tx_urb, GFP_ATOMIC);
-       if (!status){
-//we need to send 0 byte packet whenever 512N bytes/64N(HIGN SPEED/NORMAL SPEED) bytes packet has been transmitted. Otherwise, it will be halt to wait for another packet. WB. 2008.08.27
+       if (!status) {
+               //we need to send 0 byte packet whenever 512N bytes/64N(HIGN SPEED/NORMAL SPEED) bytes packet has been transmitted. Otherwise, it will be halt to wait for another packet. WB. 2008.08.27
                bool bSend0Byte = false;
                u8 zero = 0;
-               if(udev->speed == USB_SPEED_HIGH)
-               {
+               if (udev->speed == USB_SPEED_HIGH) {
                        if (skb->len > 0 && skb->len % 512 == 0)
                                bSend0Byte = true;
-               }
-               else
-               {
+               } else {
                        if (skb->len > 0 && skb->len % 64 == 0)
                                bSend0Byte = true;
                }
-               if (bSend0Byte)
-               {
-                       tx_urb_zero = usb_alloc_urb(0,GFP_ATOMIC);
-                       if(!tx_urb_zero){
+               if (bSend0Byte) {
+                       tx_urb_zero = usb_alloc_urb(0, GFP_ATOMIC);
+                       if (!tx_urb_zero) {
                                RT_TRACE(COMP_ERR, "can't alloc urb for zero byte\n");
                                return -ENOMEM;
                        }
-                       usb_fill_bulk_urb(tx_urb_zero,udev,
-                                       usb_sndbulkpipe(udev,idx_pipe), &zero,
-                                       0, tx_zero_isr, dev);
+                       usb_fill_bulk_urb(tx_urb_zero, udev,
+                                         usb_sndbulkpipe(udev, idx_pipe), &zero,
+                                         0, tx_zero_isr, dev);
                        status = usb_submit_urb(tx_urb_zero, GFP_ATOMIC);
-                       if (status){
-                       RT_TRACE(COMP_ERR, "Error TX URB for zero byte %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]), status);
-                       return -1;
+                       if (status) {
+                               RT_TRACE(COMP_ERR, "Error TX URB for zero byte %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]), status);
+                               return -1;
                        }
                }
                dev->trans_start = jiffies;
@@ -2133,7 +1929,7 @@ short rtl8192_tx(struct net_device *dev, struct sk_buff* skb)
                return 0;
        } else {
                RT_TRACE(COMP_ERR, "Error TX URB %d, error %d", atomic_read(&priv->tx_pending[tcb_desc->queue_index]),
-                               status);
+                        status);
                return -1;
        }
 }
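
The zero-byte follow-up URB above exists because a USB bulk transfer is only terminated by a short packet: when the payload is an exact multiple of the endpoint's maximum packet size (512 bytes at high speed, 64 at full speed), the host must queue a zero-length packet or the device side keeps waiting for more data. The predicate reduces to:

	#include <stdbool.h>
	#include <stdio.h>

	/* True when a bulk-out transfer needs a trailing zero-length packet:
	 * non-empty and an exact multiple of the endpoint max packet size. */
	static bool needs_zlp(unsigned len, unsigned max_packet)
	{
		return len > 0 && len % max_packet == 0;
	}

	int main(void)
	{
		printf("%d\n", needs_zlp(1024, 512)); /* 1: high speed, 2 full packets */
		printf("%d\n", needs_zlp(1000, 512)); /* 0: last packet is short      */
		printf("%d\n", needs_zlp(128, 64));   /* 1: full speed case           */
		return 0;
	}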
@@ -2143,14 +1939,14 @@ short rtl8192_usb_initendpoints(struct net_device *dev)
        struct r8192_priv *priv = ieee80211_priv(dev);
 
        priv->rx_urb = kmalloc(sizeof(struct urb *) * (MAX_RX_URB+1),
-                               GFP_KERNEL);
+                              GFP_KERNEL);
        if (priv->rx_urb == NULL)
                return -ENOMEM;
 
 #ifndef JACKSON_NEW_RX
-       for(i=0;i<(MAX_RX_URB+1);i++){
+       for (i = 0; i < (MAX_RX_URB+1); i++) {
 
-               priv->rx_urb[i] = usb_alloc_urb(0,GFP_KERNEL);
+               priv->rx_urb[i] = usb_alloc_urb(0, GFP_KERNEL);
 
                priv->rx_urb[i]->transfer_buffer = kmalloc(RX_URB_SIZE, GFP_KERNEL);
 
@@ -2159,26 +1955,26 @@ short rtl8192_usb_initendpoints(struct net_device *dev)
 #endif
 
 #ifdef THOMAS_BEACON
-{
-       long align = 0;
-       void *oldaddr, *newaddr;
-
-       priv->rx_urb[16] = usb_alloc_urb(0, GFP_KERNEL);
-       priv->oldaddr = kmalloc(16, GFP_KERNEL);
-       oldaddr = priv->oldaddr;
-       align = ((long)oldaddr) & 3;
-       if (align) {
-               newaddr = oldaddr + 4 - align;
-               priv->rx_urb[16]->transfer_buffer_length = 16 - 4 + align;
-       } else {
-               newaddr = oldaddr;
-               priv->rx_urb[16]->transfer_buffer_length = 16;
+       {
+               long align = 0;
+               void *oldaddr, *newaddr;
+
+               priv->rx_urb[16] = usb_alloc_urb(0, GFP_KERNEL);
+               priv->oldaddr = kmalloc(16, GFP_KERNEL);
+               oldaddr = priv->oldaddr;
+               align = ((long)oldaddr) & 3;
+               if (align) {
+                       newaddr = oldaddr + 4 - align;
+                       priv->rx_urb[16]->transfer_buffer_length = 16 - 4 + align;
+               } else {
+                       newaddr = oldaddr;
+                       priv->rx_urb[16]->transfer_buffer_length = 16;
+               }
+               priv->rx_urb[16]->transfer_buffer = newaddr;
        }
-       priv->rx_urb[16]->transfer_buffer = newaddr;
-}
 #endif
 
-       memset(priv->rx_urb, 0, sizeof(struct urb*) * MAX_RX_URB);
+       memset(priv->rx_urb, 0, sizeof(struct urb *) * MAX_RX_URB);
        priv->pp_rxskb = kcalloc(MAX_RX_URB, sizeof(struct sk_buff *),
                                 GFP_KERNEL);
        if (!priv->pp_rxskb) {
@@ -2191,7 +1987,7 @@ short rtl8192_usb_initendpoints(struct net_device *dev)
                return -ENOMEM;
        }
 
-       printk("End of initendpoints\n");
+       netdev_dbg(dev, "End of initendpoints\n");
        return 0;
 
 }
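
The THOMAS_BEACON branch aligns the 16-byte beacon buffer by hand: addr & 3 measures the misalignment, addr + 4 - align is the next 4-byte boundary, and the usable length shrinks by the bytes skipped. The same arithmetic in isolation:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Pretend kmalloc() returned this address (hypothetical). */
		uintptr_t oldaddr = 0x1000 + 2;          /* misaligned by 2 */
		long align = oldaddr & 3;
		uintptr_t newaddr;
		unsigned buflen;

		if (align) {
			newaddr = oldaddr + 4 - align;   /* next 4-byte boundary */
			buflen  = 16 - 4 + align;        /* 14 usable bytes      */
		} else {
			newaddr = oldaddr;
			buflen  = 16;
		}
		printf("old=%#lx new=%#lx len=%u\n",
		       (unsigned long)oldaddr, (unsigned long)newaddr, buflen);
		return 0;
	}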
@@ -2201,8 +1997,8 @@ void rtl8192_usb_deleteendpoints(struct net_device *dev)
        int i;
        struct r8192_priv *priv = ieee80211_priv(dev);
 
-       if(priv->rx_urb){
-               for(i=0;i<(MAX_RX_URB+1);i++){
+       if (priv->rx_urb) {
+               for (i = 0; i < (MAX_RX_URB+1); i++) {
                        usb_kill_urb(priv->rx_urb[i]);
                        usb_free_urb(priv->rx_urb[i]);
                }
@@ -2224,8 +2020,8 @@ void rtl8192_usb_deleteendpoints(struct net_device *dev)
 
 #ifndef JACKSON_NEW_RX
 
-       if(priv->rx_urb){
-               for(i=0;i<(MAX_RX_URB+1);i++){
+       if (priv->rx_urb) {
+               for (i = 0; i < (MAX_RX_URB+1); i++) {
                        usb_kill_urb(priv->rx_urb[i]);
                        kfree(priv->rx_urb[i]->transfer_buffer);
                        usb_free_urb(priv->rx_urb[i]);
@@ -2249,54 +2045,45 @@ void rtl8192_usb_deleteendpoints(struct net_device *dev)
 }
 #endif
 
-extern void rtl8192_update_ratr_table(struct net_device* dev);
+extern void rtl8192_update_ratr_table(struct net_device *dev);
 void rtl8192_link_change(struct net_device *dev)
 {
-//     int i;
-
        struct r8192_priv *priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
-       //write_nic_word(dev, BCN_INTR_ITV, net->beacon_interval);
-       if (ieee->state == IEEE80211_LINKED)
-       {
+       struct ieee80211_device *ieee = priv->ieee80211;
+       if (ieee->state == IEEE80211_LINKED) {
                rtl8192_net_update(dev);
                rtl8192_update_ratr_table(dev);
                //add this as in pure N mode, wep encryption will use software way, but there is no chance to set this as wep will not set group key in wext. WB.2008.07.08
                if ((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type))
-               EnableHWSecurityConfig8192(dev);
+                       EnableHWSecurityConfig8192(dev);
        }
        /*update timing params*/
-//     RT_TRACE(COMP_CH, "========>%s(), chan:%d\n", __FUNCTION__, priv->chan);
-//     rtl8192_set_chan(dev, priv->chan);
-        if (ieee->iw_mode == IW_MODE_INFRA || ieee->iw_mode == IW_MODE_ADHOC)
-       {
+       if (ieee->iw_mode == IW_MODE_INFRA || ieee->iw_mode == IW_MODE_ADHOC) {
                u32 reg = 0;
-               reg = read_nic_dword(dev, RCR);
+               read_nic_dword(dev, RCR, &reg);
                if (priv->ieee80211->state == IEEE80211_LINKED)
                        priv->ReceiveConfig = reg |= RCR_CBSSID;
                else
                        priv->ReceiveConfig = reg &= ~RCR_CBSSID;
                write_nic_dword(dev, RCR, reg);
        }
-
-//     rtl8192_set_rxconf(dev);
 }
 
 static struct ieee80211_qos_parameters def_qos_parameters = {
-       {3,3,3,3},/* cw_min */
-       {7,7,7,7},/* cw_max */
-       {2,2,2,2},/* aifs */
-       {0,0,0,0},/* flags */
-       {0,0,0,0} /* tx_op_limit */
+       {3, 3, 3, 3},/* cw_min */
+       {7, 7, 7, 7},/* cw_max */
+       {2, 2, 2, 2},/* aifs */
+       {0, 0, 0, 0},/* flags */
+       {0, 0, 0, 0} /* tx_op_limit */
 };
 
 
-void rtl8192_update_beacon(struct work_struct * work)
+void rtl8192_update_beacon(struct work_struct *work)
 {
        struct r8192_priv *priv = container_of(work, struct r8192_priv, update_beacon_wq.work);
        struct net_device *dev = priv->ieee80211->dev;
-       struct ieee80211_device* ieee = priv->ieee80211;
-       struct ieee80211_network* net = &ieee->current_network;
+       struct ieee80211_device *ieee = priv->ieee80211;
+       struct ieee80211_network *net = &ieee->current_network;
 
        if (ieee->pHTInfo->bCurrentHTSupport)
                HTUpdateSelfAndPeerSetting(ieee, net);
@@ -2306,14 +2093,13 @@ void rtl8192_update_beacon(struct work_struct * work)
 /*
 * background support to run QoS activate functionality
 */
-int WDCAPARA_ADD[] = {EDCAPARA_BE,EDCAPARA_BK,EDCAPARA_VI,EDCAPARA_VO};
-void rtl8192_qos_activate(struct work_struct * work)
+int WDCAPARA_ADD[] = {EDCAPARA_BE, EDCAPARA_BK, EDCAPARA_VI, EDCAPARA_VO};
+void rtl8192_qos_activate(struct work_struct *work)
 {
        struct r8192_priv *priv = container_of(work, struct r8192_priv, qos_activate);
        struct net_device *dev = priv->ieee80211->dev;
        struct ieee80211_qos_parameters *qos_parameters = &priv->ieee80211->current_network.qos_data.parameters;
        u8 mode = priv->ieee80211->current_network.mode;
-       //u32 size = sizeof(struct ieee80211_qos_parameters);
        u8  u1bAIFS;
        u32 u4bAcParam;
        int i;
@@ -2321,37 +2107,36 @@ void rtl8192_qos_activate(struct work_struct * work)
        if (priv == NULL)
                return;
 
-       mutex_lock(&priv->mutex);
-       if(priv->ieee80211->state != IEEE80211_LINKED)
+       mutex_lock(&priv->mutex);
+       if (priv->ieee80211->state != IEEE80211_LINKED)
                goto success;
-       RT_TRACE(COMP_QOS,"qos active process with associate response received\n");
+       RT_TRACE(COMP_QOS, "qos active process with associate response received\n");
        /* It better set slot time at first */
        /* For we just support b/g mode at present, let the slot time at 9/20 selection */
        /* update the ac parameter to related registers */
-       for(i = 0; i <  QOS_QUEUE_NUM; i++) {
+       for (i = 0; i <  QOS_QUEUE_NUM; i++) {
                //Mode G/A: slotTimeTimer = 9; Mode B: 20
-               u1bAIFS = qos_parameters->aifs[i] * ((mode&(IEEE_G|IEEE_N_24G)) ?9:20) + aSifsTime;
+               u1bAIFS = qos_parameters->aifs[i] * ((mode&(IEEE_G|IEEE_N_24G)) ? 9 : 20) + aSifsTime;
                u4bAcParam = ((((u32)(qos_parameters->tx_op_limit[i]))<< AC_PARAM_TXOP_LIMIT_OFFSET)|
-                               (((u32)(qos_parameters->cw_max[i]))<< AC_PARAM_ECW_MAX_OFFSET)|
-                               (((u32)(qos_parameters->cw_min[i]))<< AC_PARAM_ECW_MIN_OFFSET)|
-                               ((u32)u1bAIFS << AC_PARAM_AIFS_OFFSET));
+                             (((u32)(qos_parameters->cw_max[i]))<< AC_PARAM_ECW_MAX_OFFSET)|
+                             (((u32)(qos_parameters->cw_min[i]))<< AC_PARAM_ECW_MIN_OFFSET)|
+                             ((u32)u1bAIFS << AC_PARAM_AIFS_OFFSET));
 
                write_nic_dword(dev, WDCAPARA_ADD[i], u4bAcParam);
-               //write_nic_dword(dev, WDCAPARA_ADD[i], 0x005e4332);
        }
 
 success:
-       mutex_unlock(&priv->mutex);
+       mutex_unlock(&priv->mutex);
 }
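
Each pass of the loop above packs the TXOP limit, ECWmax, ECWmin and the computed AIFS (in µs) into one per-AC register word. A standalone reconstruction of the packing for the default BE parameters from def_qos_parameters (cw_min=3, cw_max=7, aifs=2, txop=0); the field offsets and the aSifsTime value used here are assumptions, the real definitions live in the driver headers:

	#include <stdio.h>

	/* Assumed field offsets; may differ from the driver's. */
	#define AC_PARAM_TXOP_LIMIT_OFFSET 16
	#define AC_PARAM_ECW_MAX_OFFSET    12
	#define AC_PARAM_ECW_MIN_OFFSET     8
	#define AC_PARAM_AIFS_OFFSET        0

	int main(void)
	{
		/* G mode slot = 9 us; aSifsTime assumed 10 us here. */
		unsigned aifs_us = 2 * 9 + 10;                      /* 28 us */
		unsigned param = (0u << AC_PARAM_TXOP_LIMIT_OFFSET) |
				 (7u << AC_PARAM_ECW_MAX_OFFSET)    |
				 (3u << AC_PARAM_ECW_MIN_OFFSET)    |
				 (aifs_us << AC_PARAM_AIFS_OFFSET);

		printf("u4bAcParam = %#x\n", param);                /* 0x731c */
		return 0;
	}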
 
 static int rtl8192_qos_handle_probe_response(struct r8192_priv *priv,
-               int active_network,
-               struct ieee80211_network *network)
+                                            int active_network,
+                                            struct ieee80211_network *network)
 {
        int ret = 0;
        u32 size = sizeof(struct ieee80211_qos_parameters);
 
-       if(priv->ieee80211->state !=IEEE80211_LINKED)
+       if (priv->ieee80211->state != IEEE80211_LINKED)
                return ret;
 
        if ((priv->ieee80211->iw_mode != IW_MODE_INFRA))
@@ -2359,21 +2144,21 @@ static int rtl8192_qos_handle_probe_response(struct r8192_priv *priv,
 
        if (network->flags & NETWORK_HAS_QOS_MASK) {
                if (active_network &&
-                               (network->flags & NETWORK_HAS_QOS_PARAMETERS))
+                   (network->flags & NETWORK_HAS_QOS_PARAMETERS))
                        network->qos_data.active = network->qos_data.supported;
 
                if ((network->qos_data.active == 1) && (active_network == 1) &&
-                               (network->flags & NETWORK_HAS_QOS_PARAMETERS) &&
-                               (network->qos_data.old_param_count !=
-                                network->qos_data.param_count)) {
+                   (network->flags & NETWORK_HAS_QOS_PARAMETERS) &&
+                   (network->qos_data.old_param_count !=
+                    network->qos_data.param_count)) {
                        network->qos_data.old_param_count =
                                network->qos_data.param_count;
                        queue_work(priv->priv_wq, &priv->qos_activate);
-                       RT_TRACE (COMP_QOS, "QoS parameters change call "
-                                       "qos_activate\n");
+                       RT_TRACE(COMP_QOS, "QoS parameters change call "
+                                "qos_activate\n");
                }
        } else {
-               memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
+               memcpy(&priv->ieee80211->current_network.qos_data.parameters,
                       &def_qos_parameters, size);
 
                if ((network->qos_data.active == 1) && (active_network == 1)) {
@@ -2388,13 +2173,13 @@ static int rtl8192_qos_handle_probe_response(struct r8192_priv *priv,
 }
 
 /* handle and manage frame from beacon and probe response */
-static int rtl8192_handle_beacon(struct net_device * dev,
-                             struct ieee80211_beacon * beacon,
-                             struct ieee80211_network * network)
+static int rtl8192_handle_beacon(struct net_device *dev,
+                                struct ieee80211_beacon *beacon,
+                                struct ieee80211_network *network)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
-       rtl8192_qos_handle_probe_response(priv,1,network);
+       rtl8192_qos_handle_probe_response(priv, 1, network);
        queue_delayed_work(priv->priv_wq, &priv->update_beacon_wq, 0);
        return 0;
 
@@ -2406,7 +2191,7 @@ static int rtl8192_handle_beacon(struct net_device * dev,
 * setting
 */
 static int rtl8192_qos_association_resp(struct r8192_priv *priv,
-                                   struct ieee80211_network *network)
+                                       struct ieee80211_network *network)
 {
        int ret = 0;
        unsigned long flags;
@@ -2416,28 +2201,26 @@ static int rtl8192_qos_association_resp(struct r8192_priv *priv,
        if ((priv == NULL) || (network == NULL))
                return ret;
 
-       if(priv->ieee80211->state !=IEEE80211_LINKED)
+       if (priv->ieee80211->state != IEEE80211_LINKED)
                return ret;
 
        if ((priv->ieee80211->iw_mode != IW_MODE_INFRA))
                return ret;
 
        spin_lock_irqsave(&priv->ieee80211->lock, flags);
-       if(network->flags & NETWORK_HAS_QOS_PARAMETERS) {
-               memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
-                        &network->qos_data.parameters,\
-                       sizeof(struct ieee80211_qos_parameters));
+       if (network->flags & NETWORK_HAS_QOS_PARAMETERS) {
+               memcpy(&priv->ieee80211->current_network.qos_data.parameters,
+                      &network->qos_data.parameters,
+                      sizeof(struct ieee80211_qos_parameters));
                priv->ieee80211->current_network.qos_data.active = 1;
-                {
-                       set_qos_param = 1;
-                       /* update qos parameter for current network */
-                       priv->ieee80211->current_network.qos_data.old_param_count = \
-                                priv->ieee80211->current_network.qos_data.param_count;
-                       priv->ieee80211->current_network.qos_data.param_count = \
-                                network->qos_data.param_count;
-               }
+               set_qos_param = 1;
+               /* update qos parameter for current network */
+               priv->ieee80211->current_network.qos_data.old_param_count =
+                       priv->ieee80211->current_network.qos_data.param_count;
+               priv->ieee80211->current_network.qos_data.param_count =
+                       network->qos_data.param_count;
        } else {
-               memcpy(&priv->ieee80211->current_network.qos_data.parameters,\
+               memcpy(&priv->ieee80211->current_network.qos_data.parameters,
                       &def_qos_parameters, size);
                priv->ieee80211->current_network.qos_data.active = 0;
                priv->ieee80211->current_network.qos_data.supported = 0;
@@ -2446,7 +2229,7 @@ static int rtl8192_qos_association_resp(struct r8192_priv *priv,
 
        spin_unlock_irqrestore(&priv->ieee80211->lock, flags);
 
-       RT_TRACE(COMP_QOS, "%s: network->flags = %d,%d\n",__FUNCTION__,network->flags ,priv->ieee80211->current_network.qos_data.active);
+       RT_TRACE(COMP_QOS, "%s: network->flags = %d,%d\n", __func__, network->flags, priv->ieee80211->current_network.qos_data.active);
        if (set_qos_param == 1)
                queue_work(priv->priv_wq, &priv->qos_activate);
 
@@ -2456,8 +2239,8 @@ static int rtl8192_qos_association_resp(struct r8192_priv *priv,
 
 
 static int rtl8192_handle_assoc_response(struct net_device *dev,
-                                    struct ieee80211_assoc_response_frame *resp,
-                                    struct ieee80211_network *network)
+                                        struct ieee80211_assoc_response_frame *resp,
+                                        struct ieee80211_network *network)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        rtl8192_qos_association_resp(priv, network);
@@ -2465,79 +2248,70 @@ static int rtl8192_handle_assoc_response(struct net_device *dev,
 }
 
 
-void rtl8192_update_ratr_table(struct net_device* dev)
-       //      POCTET_STRING   posLegacyRate,
-       //      u8*                     pMcsRate)
-       //      PRT_WLAN_STA    pEntry)
+void rtl8192_update_ratr_table(struct net_device *dev)
 {
-       struct r8192_priv* priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
-       u8* pMcsRate = ieee->dot11HTOperationalRateSet;
-       //struct ieee80211_network *net = &ieee->current_network;
+       struct r8192_priv *priv = ieee80211_priv(dev);
+       struct ieee80211_device *ieee = priv->ieee80211;
+       u8 *pMcsRate = ieee->dot11HTOperationalRateSet;
        u32 ratr_value = 0;
        u8 rate_index = 0;
-       rtl8192_config_rate(dev, (u16*)(&ratr_value));
-       ratr_value |= (*(u16*)(pMcsRate)) << 12;
-//     switch (net->mode)
-       switch (ieee->mode)
-       {
-               case IEEE_A:
-                       ratr_value &= 0x00000FF0;
-                       break;
-               case IEEE_B:
-                       ratr_value &= 0x0000000F;
-                       break;
-               case IEEE_G:
-                       ratr_value &= 0x00000FF7;
-                       break;
-               case IEEE_N_24G:
-               case IEEE_N_5G:
-                       if (ieee->pHTInfo->PeerMimoPs == 0) //MIMO_PS_STATIC
-                               ratr_value &= 0x0007F007;
-                       else{
-                               if (priv->rf_type == RF_1T2R)
-                                       ratr_value &= 0x000FF007;
-                               else
-                                       ratr_value &= 0x0F81F007;
-                       }
-                       break;
-               default:
-                       break;
+       rtl8192_config_rate(dev, (u16 *)(&ratr_value));
+       ratr_value |= (*(u16 *)(pMcsRate)) << 12;
+       switch (ieee->mode) {
+       case IEEE_A:
+               ratr_value &= 0x00000FF0;
+               break;
+       case IEEE_B:
+               ratr_value &= 0x0000000F;
+               break;
+       case IEEE_G:
+               ratr_value &= 0x00000FF7;
+               break;
+       case IEEE_N_24G:
+       case IEEE_N_5G:
+               if (ieee->pHTInfo->PeerMimoPs == 0) {//MIMO_PS_STATIC
+                       ratr_value &= 0x0007F007;
+               } else {
+                       if (priv->rf_type == RF_1T2R)
+                               ratr_value &= 0x000FF007;
+                       else
+                               ratr_value &= 0x0F81F007;
+               }
+               break;
+       default:
+               break;
        }
        ratr_value &= 0x0FFFFFFF;
-       if(ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI40MHz){
+       if (ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI40MHz)
                ratr_value |= 0x80000000;
-       }else if(!ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI20MHz){
+       else if (!ieee->pHTInfo->bCurTxBW40MHz && ieee->pHTInfo->bCurShortGI20MHz)
                ratr_value |= 0x80000000;
-       }
        write_nic_dword(dev, RATR0+rate_index*4, ratr_value);
        write_nic_byte(dev, UFWP, 1);
 }
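
The RATR word is assembled in three steps visible above: legacy CCK/OFDM bits in the low 12 bits (from rtl8192_config_rate), MCS0-15 shifted up by 12, then a per-mode AND mask and, when short GI applies, bit 31. A worked example for the IEEE_G branch, using sample bitmaps only:

	#include <stdio.h>

	int main(void)
	{
		unsigned short legacy = 0x0ff5;      /* example legacy-rate bitmap */
		unsigned short mcs    = 0xffff;      /* example MCS0-15 set        */
		unsigned ratr = (unsigned)legacy | ((unsigned)mcs << 12);

		ratr &= 0x00000FF7;                  /* IEEE_G mask from the switch */
		ratr &= 0x0FFFFFFF;
		ratr |= 0x80000000;                  /* short GI flag, top bit      */
		printf("RATR = %#010x\n", ratr);     /* 0x80000ff5 */
		return 0;
	}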
 
-static u8 ccmp_ie[4] = {0x00,0x50,0xf2,0x04};
+static u8 ccmp_ie[4] = {0x00, 0x50, 0xf2, 0x04};
 static u8 ccmp_rsn_ie[4] = {0x00, 0x0f, 0xac, 0x04};
-bool GetNmodeSupportBySecCfg8192(struct net_device*dev)
+bool GetNmodeSupportBySecCfg8192(struct net_device *dev)
 {
-       struct r8192_priv* priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
-       struct ieee80211_network * network = &ieee->current_network;
-       int wpa_ie_len= ieee->wpa_ie_len;
-       struct ieee80211_crypt_data* crypt;
+       struct r8192_priv *priv = ieee80211_priv(dev);
+       struct ieee80211_device *ieee = priv->ieee80211;
+       struct ieee80211_network *network = &ieee->current_network;
+       int wpa_ie_len = ieee->wpa_ie_len;
+       struct ieee80211_crypt_data *crypt;
        int encrypt;
 
        crypt = ieee->crypt[ieee->tx_keyidx];
        //we use connecting AP's capability instead of only security config on our driver to distinguish whether it should use N mode or G mode
-       encrypt = (network->capability & WLAN_CAPABILITY_PRIVACY) || (ieee->host_encrypt && crypt && crypt->ops && (0 == strcmp(crypt->ops->name,"WEP")));
+       encrypt = (network->capability & WLAN_CAPABILITY_PRIVACY) || (ieee->host_encrypt && crypt && crypt->ops && (0 == strcmp(crypt->ops->name, "WEP")));
 
        /* simply judge  */
-       if(encrypt && (wpa_ie_len == 0)) {
+       if (encrypt && (wpa_ie_len == 0)) {
                /* wep encryption, no N mode setting */
                return false;
-//     } else if((wpa_ie_len != 0)&&(memcmp(&(ieee->wpa_ie[14]),ccmp_ie,4))) {
-       } else if((wpa_ie_len != 0)) {
+       } else if ((wpa_ie_len != 0)) {
                /* parse pairwise key type */
-               //if((pairwisekey = WEP40)||(pairwisekey = WEP104)||(pairwisekey = TKIP))
-               if (((ieee->wpa_ie[0] == 0xdd) && (!memcmp(&(ieee->wpa_ie[14]),ccmp_ie,4))) || ((ieee->wpa_ie[0] == 0x30) && (!memcmp(&ieee->wpa_ie[10],ccmp_rsn_ie, 4))))
+               if (((ieee->wpa_ie[0] == 0xdd) && (!memcmp(&(ieee->wpa_ie[14]), ccmp_ie, 4))) || ((ieee->wpa_ie[0] == 0x30) && (!memcmp(&ieee->wpa_ie[10], ccmp_rsn_ie, 4))))
                        return true;
                else
                        return false;
@@ -2548,13 +2322,13 @@ bool GetNmodeSupportBySecCfg8192(struct net_device*dev)
        return true;
 }
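
The pairwise-cipher test above keys off fixed offsets into the stored WPA/RSN IE: a vendor WPA IE (id 0xdd) carries the pairwise suite OUI at byte 14, an RSN IE (id 0x30) at byte 10. A hedged sketch of just that comparison, with the OUIs copied from the hunk and bounds checks added for safety (the helper name is illustrative):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    static const uint8_t ccmp_wpa[4] = {0x00, 0x50, 0xf2, 0x04};
    static const uint8_t ccmp_rsn[4] = {0x00, 0x0f, 0xac, 0x04};

    static bool pairwise_is_ccmp(const uint8_t *ie, size_t len)
    {
            if (len >= 18 && ie[0] == 0xdd)         /* vendor WPA IE */
                    return memcmp(&ie[14], ccmp_wpa, 4) == 0;
            if (len >= 14 && ie[0] == 0x30)         /* RSN IE        */
                    return memcmp(&ie[10], ccmp_rsn, 4) == 0;
            return false;
    }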
 
-bool GetHalfNmodeSupportByAPs819xUsb(struct net_device* dev)
+bool GetHalfNmodeSupportByAPs819xUsb(struct net_device *dev)
 {
        bool                    Reval;
-       struct r8192_priv* priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct r8192_priv *priv = ieee80211_priv(dev);
+       struct ieee80211_device *ieee = priv->ieee80211;
 
-       if(ieee->bHalfWirelessN24GMode == true)
+       if (ieee->bHalfWirelessN24GMode == true)
                Reval = true;
        else
                Reval =  false;
@@ -2562,75 +2336,59 @@ bool GetHalfNmodeSupportByAPs819xUsb(struct net_device* dev)
        return Reval;
 }
 
-void rtl8192_refresh_supportrate(struct r8192_priv* priv)
+void rtl8192_refresh_supportrate(struct r8192_priv *priv)
 {
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct ieee80211_device *ieee = priv->ieee80211;
        //we do not consider set support rate for ABG mode, only HT MCS rate is set here.
        if (ieee->mode == WIRELESS_MODE_N_24G || ieee->mode == WIRELESS_MODE_N_5G)
-       {
                memcpy(ieee->Regdot11HTOperationalRateSet, ieee->RegHTSuppRateSet, 16);
-               //RT_DEBUG_DATA(COMP_INIT, ieee->RegHTSuppRateSet, 16);
-               //RT_DEBUG_DATA(COMP_INIT, ieee->Regdot11HTOperationalRateSet, 16);
-       }
        else
                memset(ieee->Regdot11HTOperationalRateSet, 0, 16);
        return;
 }
 
-u8 rtl8192_getSupportedWireleeMode(struct net_device*dev)
+u8 rtl8192_getSupportedWireleeMode(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8 ret = 0;
-       switch(priv->rf_chip)
-       {
-               case RF_8225:
-               case RF_8256:
-               case RF_PSEUDO_11N:
-                       ret = (WIRELESS_MODE_N_24G|WIRELESS_MODE_G|WIRELESS_MODE_B);
-                       break;
-               case RF_8258:
-                       ret = (WIRELESS_MODE_A|WIRELESS_MODE_N_5G);
-                       break;
-               default:
-                       ret = WIRELESS_MODE_B;
-                       break;
+       switch (priv->rf_chip) {
+       case RF_8225:
+       case RF_8256:
+       case RF_PSEUDO_11N:
+               ret = (WIRELESS_MODE_N_24G|WIRELESS_MODE_G|WIRELESS_MODE_B);
+               break;
+       case RF_8258:
+               ret = (WIRELESS_MODE_A|WIRELESS_MODE_N_5G);
+               break;
+       default:
+               ret = WIRELESS_MODE_B;
+               break;
        }
        return ret;
 }
-void rtl8192_SetWirelessMode(struct net_device* dev, u8 wireless_mode)
+void rtl8192_SetWirelessMode(struct net_device *dev, u8 wireless_mode)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8 bSupportMode = rtl8192_getSupportedWireleeMode(dev);
 
-       if ((wireless_mode == WIRELESS_MODE_AUTO) || ((wireless_mode&bSupportMode)==0))
-       {
-               if(bSupportMode & WIRELESS_MODE_N_24G)
-               {
+       if ((wireless_mode == WIRELESS_MODE_AUTO) || ((wireless_mode&bSupportMode) == 0)) {
+               if (bSupportMode & WIRELESS_MODE_N_24G) {
                        wireless_mode = WIRELESS_MODE_N_24G;
-               }
-               else if(bSupportMode & WIRELESS_MODE_N_5G)
-               {
+               } else if (bSupportMode & WIRELESS_MODE_N_5G) {
                        wireless_mode = WIRELESS_MODE_N_5G;
-               }
-               else if((bSupportMode & WIRELESS_MODE_A))
-               {
+               } else if ((bSupportMode & WIRELESS_MODE_A)) {
                        wireless_mode = WIRELESS_MODE_A;
-               }
-               else if((bSupportMode & WIRELESS_MODE_G))
-               {
+               } else if ((bSupportMode & WIRELESS_MODE_G)) {
                        wireless_mode = WIRELESS_MODE_G;
-               }
-               else if((bSupportMode & WIRELESS_MODE_B))
-               {
+               } else if ((bSupportMode & WIRELESS_MODE_B)) {
                        wireless_mode = WIRELESS_MODE_B;
-               }
-               else{
-                       RT_TRACE(COMP_ERR, "%s(), No valid wireless mode supported, SupportedWirelessMode(%x)!!!\n", __FUNCTION__,bSupportMode);
+               } else {
+                       RT_TRACE(COMP_ERR, "%s(), No valid wireless mode supported, SupportedWirelessMode(%x)!!!\n", __func__, bSupportMode);
                        wireless_mode = WIRELESS_MODE_B;
                }
        }
 #ifdef TO_DO_LIST //// TODO: this function doesn't work well at this time, we should wait for FPGA
-       ActUpdateChannelAccessSetting( pAdapter, pHalData->CurrentWirelessMode, &pAdapter->MgntInfo.Info8185.ChannelAccessSetting );
+       ActUpdateChannelAccessSetting(pAdapter, pHalData->CurrentWirelessMode, &pAdapter->MgntInfo.Info8185.ChannelAccessSetting);
 #endif
        priv->ieee80211->mode = wireless_mode;
 
@@ -2643,7 +2401,7 @@ void rtl8192_SetWirelessMode(struct net_device* dev, u8 wireless_mode)
 
 }
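
When the requested mode is AUTO or outside the supported set, the chain of else-ifs above falls back in a fixed preference order: N 2.4G, then N 5G, A, G, and finally B. A table-driven sketch of the same selection, using illustrative bit values (the driver's actual WIRELESS_MODE_* constants differ):

    #include <stddef.h>
    #include <stdint.h>

    #define M_B    0x01
    #define M_G    0x02
    #define M_A    0x04
    #define M_N24G 0x08
    #define M_N5G  0x10
    #define M_AUTO 0x00     /* placeholder for "pick for me" */

    static uint8_t pick_mode(uint8_t wanted, uint8_t supported)
    {
            static const uint8_t pref[] = { M_N24G, M_N5G, M_A, M_G, M_B };
            size_t i;

            if (wanted != M_AUTO && (wanted & supported))
                    return wanted;
            for (i = 0; i < sizeof(pref); i++)
                    if (supported & pref[i])
                            return pref[i];
            return M_B;     /* last resort, as in the hunk */
    }
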
 //init priv variables here. only non_zero value should be initialized here.
-static void rtl8192_init_priv_variable(struct net_device* dev)
+static void rtl8192_init_priv_variable(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8 i;
@@ -2651,13 +2409,13 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
        priv->chan = 1; //set to channel 1
        priv->ieee80211->mode = WIRELESS_MODE_AUTO; //SET AUTO
        priv->ieee80211->iw_mode = IW_MODE_INFRA;
-       priv->ieee80211->ieee_up=0;
+       priv->ieee80211->ieee_up = 0;
        priv->retry_rts = DEFAULT_RETRY_RTS;
        priv->retry_data = DEFAULT_RETRY_DATA;
        priv->ieee80211->rts = DEFAULT_RTS_THRESHOLD;
        priv->ieee80211->rate = 110; //11 mbps
        priv->ieee80211->short_slot = 1;
-       priv->promisc = (dev->flags & IFF_PROMISC) ? 1:0;
+       priv->promisc = (dev->flags & IFF_PROMISC) ? 1 : 0;
        priv->CckPwEnl = 6;
        //for silent reset
        priv->IrpPendingCount = 1;
@@ -2672,14 +2430,14 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
        priv->ieee80211->softmac_features  = IEEE_SOFTMAC_SCAN |
                IEEE_SOFTMAC_ASSOCIATE | IEEE_SOFTMAC_PROBERQ |
                IEEE_SOFTMAC_PROBERS | IEEE_SOFTMAC_TX_QUEUE |
-               IEEE_SOFTMAC_BEACONS;//added by amy 080604 //|  //IEEE_SOFTMAC_SINGLE_QUEUE;
+               IEEE_SOFTMAC_BEACONS;//added by amy 080604
 
        priv->ieee80211->active_scan = 1;
        priv->ieee80211->modulation = IEEE80211_CCK_MODULATION | IEEE80211_OFDM_MODULATION;
        priv->ieee80211->host_encrypt = 1;
        priv->ieee80211->host_decrypt = 1;
-       priv->ieee80211->start_send_beacons = NULL;//rtl819xusb_beacon_tx;//-by amy 080604
-       priv->ieee80211->stop_send_beacons = NULL;//rtl8192_beacon_stop;//-by amy 080604
+       priv->ieee80211->start_send_beacons = NULL; //-by amy 080604
+       priv->ieee80211->stop_send_beacons = NULL;  //-by amy 080604
        priv->ieee80211->softmac_hard_start_xmit = rtl8192_hard_start_xmit;
        priv->ieee80211->set_chan = rtl8192_set_chan;
        priv->ieee80211->link_change = rtl8192_link_change;
@@ -2693,7 +2451,6 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
        priv->ieee80211->qos_support = 1;
 
        //added by WB
-//     priv->ieee80211->SwChnlByTimerHandler = rtl8192_phy_SwChnl;
        priv->ieee80211->SetBWModeHandler = rtl8192_SetBWMode;
        priv->ieee80211->handle_assoc_response = rtl8192_handle_assoc_response;
        priv->ieee80211->handle_beacon = rtl8192_handle_beacon;
@@ -2705,36 +2462,31 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
        priv->ieee80211->InitialGainHandler = InitialGain819xUsb;
        priv->card_type = USB;
 #ifdef TO_DO_LIST
-       if(Adapter->bInHctTest)
-       {
+       if (Adapter->bInHctTest) {
                pHalData->ShortRetryLimit = 7;
                pHalData->LongRetryLimit = 7;
        }
 #endif
-       {
-               priv->ShortRetryLimit = 0x30;
-               priv->LongRetryLimit = 0x30;
-       }
+       priv->ShortRetryLimit = 0x30;
+       priv->LongRetryLimit = 0x30;
        priv->EarlyRxThreshold = 7;
        priv->enable_gpio0 = 0;
        priv->TransmitConfig =
-       //      TCR_DurProcMode |       //for RTL8185B, duration setting by HW
-       //?     TCR_DISReqQsize |
                (TCR_MXDMA_2048<<TCR_MXDMA_OFFSET)|  // Max DMA Burst Size per Tx DMA Burst, 7: reserved.
                (priv->ShortRetryLimit<<TCR_SRL_OFFSET)|        // Short retry limit
                (priv->LongRetryLimit<<TCR_LRL_OFFSET) |        // Long retry limit
-               (false ? TCR_SAT: 0);   // FALSE: HW provides PLCP length and LENGEXT, TRUE: SW provides them
+               (false ? TCR_SAT : 0);  // FALSE: HW provides PLCP length and LENGEXT, TRUE: SW provides them
 #ifdef TO_DO_LIST
-       if(Adapter->bInHctTest)
+       if (Adapter->bInHctTest)
                pHalData->ReceiveConfig =       pHalData->CSMethod |
-                                               RCR_AMF | RCR_ADF |     //RCR_AAP |     //accept management/data
+                                               RCR_AMF | RCR_ADF |     //accept management/data
                                                //guangan200710
                                                RCR_ACF |       //accept control frame for SW AP needs PS-poll, 2005.07.07, by rcnjko.
                                                RCR_AB | RCR_AM | RCR_APM |             //accept BC/MC/UC
                                                RCR_AICV | RCR_ACRC32 |                 //accept ICV/CRC error packet
                                                ((u32)7<<RCR_MXDMA_OFFSET) | // Max DMA Burst Size per Rx DMA Burst, 7: unlimited.
                                                (pHalData->EarlyRxThreshold<<RCR_FIFO_OFFSET) | // Rx FIFO Threshold, 7: No Rx threshold.
-                                               (pHalData->EarlyRxThreshold == 7 ? RCR_OnlyErlPkt:0);
+                                               (pHalData->EarlyRxThreshold == 7 ? RCR_OnlyErlPkt : 0);
        else
 
 #endif
@@ -2742,10 +2494,9 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
                RCR_AMF | RCR_ADF |             //accept management/data
                RCR_ACF |                       //accept control frame for SW AP needs PS-poll, 2005.07.07, by rcnjko.
                RCR_AB | RCR_AM | RCR_APM |     //accept BC/MC/UC
-               //RCR_AICV | RCR_ACRC32 |       //accept ICV/CRC error packet
                ((u32)7<<RCR_MXDMA_OFFSET)| // Max DMA Burst Size per Rx DMA Burst, 7: unlimited.
                (priv->EarlyRxThreshold<<RX_FIFO_THRESHOLD_SHIFT) | // Rx FIFO Threshold, 7: No Rx threshold.
-               (priv->EarlyRxThreshold == 7 ? RCR_ONLYERLPKT:0);
+               (priv->EarlyRxThreshold == 7 ? RCR_ONLYERLPKT : 0);
 
        priv->AcmControl = 0;
        priv->pFirmware = kzalloc(sizeof(rt_firmware), GFP_KERNEL);
@@ -2755,26 +2506,22 @@ static void rtl8192_init_priv_variable(struct net_device* dev)
        skb_queue_head_init(&priv->skb_queue);
 
        /* Tx related queue */
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_head_init(&priv->ieee80211->skb_waitQ [i]);
-       }
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_head_init(&priv->ieee80211->skb_aggQ [i]);
-       }
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_head_init(&priv->ieee80211->skb_drv_aggQ [i]);
-       }
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_head_init(&priv->ieee80211->skb_waitQ[i]);
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_head_init(&priv->ieee80211->skb_aggQ[i]);
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_head_init(&priv->ieee80211->skb_drv_aggQ[i]);
        priv->rf_set_chan = rtl8192_phy_SwChnl;
 }
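
TransmitConfig and ReceiveConfig above are assembled the same way: independent fields shifted to their register offsets and OR-ed together. A self-contained sketch with made-up offsets (the real TCR_*_OFFSET constants live in the driver headers):

    #include <stdint.h>

    #define MXDMA_OFFSET 21     /* illustrative, not the real layout */
    #define SRL_OFFSET    8
    #define LRL_OFFSET    0

    static uint32_t pack_tcr(uint32_t mxdma, uint32_t srl, uint32_t lrl)
    {
            return (mxdma << MXDMA_OFFSET) |
                   (srl   << SRL_OFFSET)   |
                   (lrl   << LRL_OFFSET);
    }

For instance, pack_tcr(7, 0x30, 0x30) would mirror the 2048-byte burst size plus the 0x30 short/long retry limits set a few lines up.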
 
 //init lock here
-static void rtl8192_init_priv_lock(struct r8192_priv* priv)
+static void rtl8192_init_priv_lock(struct r8192_priv *priv)
 {
        spin_lock_init(&priv->tx_lock);
        spin_lock_init(&priv->irq_lock);//added by thomas
-       //spin_lock_init(&priv->rf_lock);
-       sema_init(&priv->wx_sem,1);
-       sema_init(&priv->rf_sem,1);
+       sema_init(&priv->wx_sem, 1);
+       sema_init(&priv->rf_sem, 1);
        mutex_init(&priv->mutex);
 }
 
@@ -2783,7 +2530,7 @@ extern  void    rtl819x_watchdog_wqcallback(struct work_struct *work);
 void rtl8192_irq_rx_tasklet(struct r8192_priv *priv);
 //init tasklet and wait_queue here. only 2.6 above kernel is considered
 #define DRV_NAME "wlan0"
-static void rtl8192_init_priv_task(struct net_device* dev)
+static void rtl8192_init_priv_task(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2791,71 +2538,64 @@ static void rtl8192_init_priv_task(struct net_device* dev)
 
        INIT_WORK(&priv->reset_wq, rtl8192_restart);
 
-       //INIT_DELAYED_WORK(&priv->watch_dog_wq, hal_dm_watchdog);
        INIT_DELAYED_WORK(&priv->watch_dog_wq, rtl819x_watchdog_wqcallback);
        INIT_DELAYED_WORK(&priv->txpower_tracking_wq,  dm_txpower_trackingcallback);
-//     INIT_DELAYED_WORK(&priv->gpio_change_rf_wq,  dm_gpio_change_rf_callback);
        INIT_DELAYED_WORK(&priv->rfpath_check_wq,  dm_rf_pathcheck_workitemcallback);
        INIT_DELAYED_WORK(&priv->update_beacon_wq, rtl8192_update_beacon);
        INIT_DELAYED_WORK(&priv->initialgain_operate_wq, InitialGainOperateWorkItemCallBack);
-       //INIT_WORK(&priv->SwChnlWorkItem,  rtl8192_SwChnl_WorkItem);
-       //INIT_WORK(&priv->SetBWModeWorkItem,  rtl8192_SetBWModeWorkItem);
        INIT_WORK(&priv->qos_activate, rtl8192_qos_activate);
 
        tasklet_init(&priv->irq_rx_tasklet,
-            (void(*)(unsigned long))rtl8192_irq_rx_tasklet,
-            (unsigned long)priv);
+                    (void(*)(unsigned long))rtl8192_irq_rx_tasklet,
+                    (unsigned long)priv);
 }
 
-static void rtl8192_get_eeprom_size(struct net_device* dev)
+static void rtl8192_get_eeprom_size(struct net_device *dev)
 {
        u16 curCR = 0;
        struct r8192_priv *priv = ieee80211_priv(dev);
-       RT_TRACE(COMP_EPROM, "===========>%s()\n", __FUNCTION__);
-       curCR = read_nic_word_E(dev,EPROM_CMD);
+       RT_TRACE(COMP_EPROM, "===========>%s()\n", __func__);
+       read_nic_word_E(dev, EPROM_CMD, &curCR);
        RT_TRACE(COMP_EPROM, "read from Reg EPROM_CMD(%x):%x\n", EPROM_CMD, curCR);
        //whether need I consider BIT5?
        priv->epromtype = (curCR & Cmd9346CR_9356SEL) ? EPROM_93c56 : EPROM_93c46;
-       RT_TRACE(COMP_EPROM, "<===========%s(), epromtype:%d\n", __FUNCTION__, priv->epromtype);
+       RT_TRACE(COMP_EPROM, "<===========%s(), epromtype:%d\n", __func__, priv->epromtype);
 }
 
 //used to swap endian. as ntohl & htonl are not necessary to swap endian, so use this instead.
-static inline u16 endian_swap(u16* data)
+static inline u16 endian_swap(u16 *data)
 {
        u16 tmp = *data;
        *data = (tmp >> 8) | (tmp << 8);
        return *data;
 }
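
endian_swap() above both rewrites *data in place and returns the swapped value, which is why the callers below can write priv->eeprom_vid = endian_swap(&tmpValue) directly. A standalone sketch using stdint names in place of the kernel's u16:

    #include <stdint.h>
    #include <stdio.h>

    static inline uint16_t endian_swap16(uint16_t *data)
    {
            uint16_t tmp = *data;

            *data = (uint16_t)((tmp >> 8) | (tmp << 8));
            return *data;
    }

    int main(void)
    {
            uint16_t v = 0x1234;

            printf("0x%04x\n", endian_swap16(&v));  /* prints 0x3412 */
            return 0;
    }
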
-static void rtl8192_read_eeprom_info(struct net_device* dev)
+static void rtl8192_read_eeprom_info(struct net_device *dev)
 {
        u16 wEPROM_ID = 0;
        u8 bMac_Tmp_Addr[6] = {0x00, 0xe0, 0x4c, 0x00, 0x00, 0x02};
        u8 bLoad_From_EEPOM = false;
        struct r8192_priv *priv = ieee80211_priv(dev);
        u16 tmpValue = 0;
-       RT_TRACE(COMP_EPROM, "===========>%s()\n", __FUNCTION__);
+       int i;
+       RT_TRACE(COMP_EPROM, "===========>%s()\n", __func__);
        wEPROM_ID = eprom_read(dev, 0); //first read EEPROM ID out;
        RT_TRACE(COMP_EPROM, "EEPROM ID is 0x%x\n", wEPROM_ID);
 
-       if (wEPROM_ID != RTL8190_EEPROM_ID)
-       {
+       if (wEPROM_ID != RTL8190_EEPROM_ID) {
                RT_TRACE(COMP_ERR, "EEPROM ID is invalid(is 0x%x(should be 0x%x)\n", wEPROM_ID, RTL8190_EEPROM_ID);
-       }
-       else
+       } else {
                bLoad_From_EEPOM = true;
+       }
 
-       if (bLoad_From_EEPOM)
-       {
+       if (bLoad_From_EEPOM) {
                tmpValue = eprom_read(dev, (EEPROM_VID>>1));
                priv->eeprom_vid = endian_swap(&tmpValue);
                priv->eeprom_pid = eprom_read(dev, (EEPROM_PID>>1));
                tmpValue = eprom_read(dev, (EEPROM_ChannelPlan>>1));
-               priv->eeprom_ChannelPlan =((tmpValue&0xff00)>>8);
+               priv->eeprom_ChannelPlan = ((tmpValue&0xff00)>>8);
                priv->btxpowerdata_readfromEEPORM = true;
                priv->eeprom_CustomerID = eprom_read(dev, (EEPROM_Customer_ID>>1)) >>8;
-       }
-       else
-       {
+       } else {
                priv->eeprom_vid = 0;
                priv->eeprom_pid = 0;
                priv->card_8192_version = VERSION_819xU_B;
@@ -2865,18 +2605,14 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
        RT_TRACE(COMP_EPROM, "vid:0x%4x, pid:0x%4x, CustomID:0x%2x, ChanPlan:0x%x\n", priv->eeprom_vid, priv->eeprom_pid, priv->eeprom_CustomerID, priv->eeprom_ChannelPlan);
        //set channelplan from eeprom
        priv->ChannelPlan = priv->eeprom_ChannelPlan;
-       if (bLoad_From_EEPOM)
-       {
+       if (bLoad_From_EEPOM) {
                int i;
-               for (i=0; i<6; i+=2)
-               {
+               for (i = 0; i < 6; i += 2) {
                        u16 tmp = 0;
                        tmp = eprom_read(dev, (u16)((EEPROM_NODE_ADDRESS_BYTE_0 + i)>>1));
-                       *(u16*)(&dev->dev_addr[i]) = tmp;
+                       *(u16 *)(&dev->dev_addr[i]) = tmp;
                }
-       }
-       else
-       {
+       } else {
                memcpy(dev->dev_addr, bMac_Tmp_Addr, 6);
                //should I set IDR0 here?
        }
@@ -2884,8 +2620,7 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
        priv->rf_type = RTL819X_DEFAULT_RF_TYPE; //default 1T2R
        priv->rf_chip = RF_8256;
 
-       if (priv->card_8192_version == (u8)VERSION_819xU_A)
-       {
+       if (priv->card_8192_version == (u8)VERSION_819xU_A) {
                //read Tx power gain offset of legacy OFDM to HT rate
                if (bLoad_From_EEPOM)
                        priv->EEPROMTxPowerDiff = (eprom_read(dev, (EEPROM_TxPowerDiff>>1))&0xff00) >> 8;
@@ -2918,51 +2653,45 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
                else
                        priv->EEPROM_Def_Ver = 1;
                RT_TRACE(COMP_EPROM, "EEPROM_DEF_VER:%d\n", priv->EEPROM_Def_Ver);
-               if (priv->EEPROM_Def_Ver == 0) //old eeprom definition
-               {
+               if (priv->EEPROM_Def_Ver == 0) { //old eeprom definition
                        int i;
                        if (bLoad_From_EEPOM)
                                priv->EEPROMTxPowerLevelCCK = (eprom_read(dev, (EEPROM_TxPwIndex_CCK>>1))&0xff) >> 8;
                        else
                                priv->EEPROMTxPowerLevelCCK = 0x10;
                        RT_TRACE(COMP_EPROM, "CCK Tx Power Levl: 0x%02x\n", priv->EEPROMTxPowerLevelCCK);
-                       for (i=0; i<3; i++)
-                       {
-                               if (bLoad_From_EEPOM)
-                               {
+                       for (i = 0; i < 3; i++) {
+                               if (bLoad_From_EEPOM) {
                                        tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G+i)>>1);
                                        if (((EEPROM_TxPwIndex_OFDM_24G+i) % 2) == 0)
                                                tmpValue = tmpValue & 0x00ff;
                                        else
                                                tmpValue = (tmpValue & 0xff00) >> 8;
-                               }
-                               else
+                               } else {
                                        tmpValue = 0x10;
+                               }
                                priv->EEPROMTxPowerLevelOFDM24G[i] = (u8) tmpValue;
                                RT_TRACE(COMP_EPROM, "OFDM 2.4G Tx Power Level, Index %d = 0x%02x\n", i, priv->EEPROMTxPowerLevelCCK);
                        }
-               }//end if EEPROM_DEF_VER == 0
-               else if (priv->EEPROM_Def_Ver == 1)
-               {
-                       if (bLoad_From_EEPOM)
-                       {
+               } else if (priv->EEPROM_Def_Ver == 1) {
+                       if (bLoad_From_EEPOM) {
                                tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_CCK_V1>>1));
                                tmpValue = (tmpValue & 0xff00) >> 8;
-                       }
-                       else
+                       } else {
                                tmpValue = 0x10;
+                       }
                        priv->EEPROMTxPowerLevelCCK_V1[0] = (u8)tmpValue;
 
                        if (bLoad_From_EEPOM)
                                tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_CCK_V1 + 2)>>1);
                        else
                                tmpValue = 0x1010;
-                       *((u16*)(&priv->EEPROMTxPowerLevelCCK_V1[1])) = tmpValue;
+                       *((u16 *)(&priv->EEPROMTxPowerLevelCCK_V1[1])) = tmpValue;
                        if (bLoad_From_EEPOM)
                                tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G_V1>>1));
                        else
                                tmpValue = 0x1010;
-                       *((u16*)(&priv->EEPROMTxPowerLevelOFDM24G[0])) = tmpValue;
+                       *((u16 *)(&priv->EEPROMTxPowerLevelOFDM24G[0])) = tmpValue;
                        if (bLoad_From_EEPOM)
                                tmpValue = eprom_read(dev, (EEPROM_TxPwIndex_OFDM_24G_V1+2)>>1);
                        else
@@ -2972,42 +2701,34 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
 
                //update HAL variables
                //
-               {
-                       int i;
-                       for (i=0; i<14; i++)
-                       {
-                               if (i<=3)
-                                       priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[0];
-                               else if (i>=4 && i<=9)
-                                       priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[1];
-                               else
-                                       priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[2];
-                       }
+               for (i = 0; i < 14; i++) {
+                       if (i <= 3)
+                               priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[0];
+                       else if (i >= 4 && i <= 9)
+                               priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[1];
+                       else
+                               priv->TxPowerLevelOFDM24G[i] = priv->EEPROMTxPowerLevelOFDM24G[2];
+               }
 
-                       for (i=0; i<14; i++)
-                       {
-                               if (priv->EEPROM_Def_Ver == 0)
-                               {
-                                       if (i<=3)
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[0] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
-                                       else if (i>=4 && i<=9)
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK;
-                                       else
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[2] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
-                               }
-                               else if (priv->EEPROM_Def_Ver == 1)
-                               {
-                                       if (i<=3)
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[0];
-                                       else if (i>=4 && i<=9)
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[1];
-                                       else
-                                               priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[2];
-                               }
+               for (i = 0; i < 14; i++) {
+                       if (priv->EEPROM_Def_Ver == 0) {
+                               if (i <= 3)
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[0] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
+                               else if (i >= 4 && i <= 9)
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK;
+                               else
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelOFDM24G[2] + (priv->EEPROMTxPowerLevelCCK - priv->EEPROMTxPowerLevelOFDM24G[1]);
+                       } else if (priv->EEPROM_Def_Ver == 1) {
+                               if (i <= 3)
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[0];
+                               else if (i >= 4 && i <= 9)
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[1];
+                               else
+                                       priv->TxPowerLevelCCK[i] = priv->EEPROMTxPowerLevelCCK_V1[2];
                        }
-               }//end update HAL variables
+               }
                priv->TxPowerDiff = priv->EEPROMPwDiff;
-// Antenna B gain offset to antenna A, bit0~3
+               // Antenna B gain offset to antenna A, bit0~3
                priv->AntennaTxPwDiff[0] = (priv->EEPROMTxPowerDiff & 0xf);
                // Antenna C gain offset to antenna A, bit4~7
                priv->AntennaTxPwDiff[1] = ((priv->EEPROMTxPowerDiff & 0xf0)>>4);
@@ -3018,46 +2739,41 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
                priv->ThermalMeter[0] = priv->EEPROMThermalMeter;
        }//end if VersionID == VERSION_819xU_A
 
-//added by vivi, for dlink led, 20080416
-       switch(priv->eeprom_CustomerID)
-       {
-               case EEPROM_CID_RUNTOP:
-                       priv->CustomerID = RT_CID_819x_RUNTOP;
-                       break;
+       //added by vivi, for dlink led, 20080416
+       switch (priv->eeprom_CustomerID) {
+       case EEPROM_CID_RUNTOP:
+               priv->CustomerID = RT_CID_819x_RUNTOP;
+               break;
 
-               case EEPROM_CID_DLINK:
-                       priv->CustomerID = RT_CID_DLINK;
-                       break;
+       case EEPROM_CID_DLINK:
+               priv->CustomerID = RT_CID_DLINK;
+               break;
 
-               default:
-                       priv->CustomerID = RT_CID_DEFAULT;
-                       break;
+       default:
+               priv->CustomerID = RT_CID_DEFAULT;
+               break;
 
        }
 
-       switch(priv->CustomerID)
-       {
-               case RT_CID_819x_RUNTOP:
-                       priv->LedStrategy = SW_LED_MODE2;
-                       break;
+       switch (priv->CustomerID) {
+       case RT_CID_819x_RUNTOP:
+               priv->LedStrategy = SW_LED_MODE2;
+               break;
 
-               case RT_CID_DLINK:
-                       priv->LedStrategy = SW_LED_MODE4;
-                       break;
+       case RT_CID_DLINK:
+               priv->LedStrategy = SW_LED_MODE4;
+               break;
 
-               default:
-                       priv->LedStrategy = SW_LED_MODE0;
-                       break;
+       default:
+               priv->LedStrategy = SW_LED_MODE0;
+               break;
 
        }
 
 
-       if(priv->rf_type == RF_1T2R)
-       {
+       if (priv->rf_type == RF_1T2R) {
                RT_TRACE(COMP_EPROM, "\n1T2R config\n");
-       }
-       else
-       {
+       } else {
                RT_TRACE(COMP_EPROM, "\n2T4R config\n");
        }
 
@@ -3066,18 +2782,18 @@ static void rtl8192_read_eeprom_info(struct net_device* dev)
        init_rate_adaptive(dev);
        //we need init DIG RATR table here again.
 
-       RT_TRACE(COMP_EPROM, "<===========%s()\n", __FUNCTION__);
+       RT_TRACE(COMP_EPROM, "<===========%s()\n", __func__);
        return;
 }
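
The HAL-variable update inside the function above folds 14 channels into three EEPROM power groups by index: entries 0-3 take group 0, 4-9 group 1, and 10-13 group 2. The same fill as a small sketch (it mirrors the OFDM branch; the CCK branch adds a per-EEPROM-version offset on top):

    #include <stdint.h>

    /* grp[] holds the three per-band-group levels read from the EEPROM;
     * out[] is the per-channel table the driver keeps (14 entries). */
    static void fill_tx_power(uint8_t out[14], const uint8_t grp[3])
    {
            int i;

            for (i = 0; i < 14; i++)
                    out[i] = grp[i <= 3 ? 0 : (i <= 9 ? 1 : 2)];
    }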
 
-short rtl8192_get_channel_map(struct net_device * dev)
+short rtl8192_get_channel_map(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       if(priv->ChannelPlan > COUNTRY_CODE_GLOBAL_DOMAIN){
-               printk("rtl8180_init:Error channel plan! Set to default.\n");
-               priv->ChannelPlan= 0;
+       if (priv->ChannelPlan > COUNTRY_CODE_GLOBAL_DOMAIN) {
+               netdev_err(dev, "rtl8180_init: Error channel plan! Set to default.\n");
+               priv->ChannelPlan = 0;
        }
-       RT_TRACE(COMP_INIT, "Channel plan is %d\n",priv->ChannelPlan);
+       RT_TRACE(COMP_INIT, "Channel plan is %d\n", priv->ChannelPlan);
 
        rtl819x_set_channel_map(priv->ChannelPlan, priv);
        return 0;
@@ -3088,24 +2804,18 @@ short rtl8192_init(struct net_device *dev)
 
        struct r8192_priv *priv = ieee80211_priv(dev);
 
-       memset(&(priv->stats),0,sizeof(struct Stats));
-       memset(priv->txqueue_to_outpipemap,0,9);
+       memset(&(priv->stats), 0, sizeof(struct Stats));
+       memset(priv->txqueue_to_outpipemap, 0, 9);
 #ifdef PIPE12
        {
-               int i=0;
-               u8 queuetopipe[]={3,2,1,0,4,8,7,6,5};
-               memcpy(priv->txqueue_to_outpipemap,queuetopipe,9);
-/*             for(i=0;i<9;i++)
-                       printk("%d ",priv->txqueue_to_outpipemap[i]);
-               printk("\n");*/
+               int i = 0;
+               u8 queuetopipe[] = {3, 2, 1, 0, 4, 8, 7, 6, 5};
+               memcpy(priv->txqueue_to_outpipemap, queuetopipe, 9);
        }
 #else
        {
-               u8 queuetopipe[]={3,2,1,0,4,4,0,4,4};
-               memcpy(priv->txqueue_to_outpipemap,queuetopipe,9);
-/*             for(i=0;i<9;i++)
-                       printk("%d ",priv->txqueue_to_outpipemap[i]);
-               printk("\n");*/
+               u8 queuetopipe[] = {3, 2, 1, 0, 4, 4, 0, 4, 4};
+               memcpy(priv->txqueue_to_outpipemap, queuetopipe, 9);
        }
 #endif
        rtl8192_init_priv_variable(dev);
@@ -3118,12 +2828,11 @@ short rtl8192_init(struct net_device *dev)
        init_timer(&priv->watch_dog_timer);
        priv->watch_dog_timer.data = (unsigned long)dev;
        priv->watch_dog_timer.function = watch_dog_timer_callback;
-       if(rtl8192_usb_initendpoints(dev)!=0){
+       if (rtl8192_usb_initendpoints(dev) != 0) {
                DMESG("Endopoints initialization failed");
                return -ENOMEM;
        }
 
-       //rtl8192_adapter_start(dev);
 #ifdef DEBUG_EPROM
        dump_eprom(dev);
 #endif
@@ -3138,16 +2847,16 @@ short rtl8192_init(struct net_device *dev)
  *  return:  none
  *  notice:  This part need to modified according to the rate set we filtered
  * ****************************************************************************/
-void rtl8192_hwconfig(struct net_device* dev)
+void rtl8192_hwconfig(struct net_device *dev)
 {
        u32 regRATR = 0, regRRSR = 0;
        u8 regBwOpMode = 0, regTmp = 0;
        struct r8192_priv *priv = ieee80211_priv(dev);
+       u32 ratr_value = 0;
 
-// Set RRSR, RATR, and BW_OPMODE registers
+       // Set RRSR, RATR, and BW_OPMODE registers
        //
-       switch(priv->ieee80211->mode)
-       {
+       switch (priv->ieee80211->mode) {
        case WIRELESS_MODE_B:
                regBwOpMode = BW_OPMODE_20MHZ;
                regRATR = RATE_ALL_CCK;
@@ -3165,26 +2874,25 @@ void rtl8192_hwconfig(struct net_device* dev)
                break;
        case WIRELESS_MODE_AUTO:
 #ifdef TO_DO_LIST
-               if (Adapter->bInHctTest)
-               {
-                   regBwOpMode = BW_OPMODE_20MHZ;
-                   regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
-                   regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+               if (Adapter->bInHctTest) {
+                       regBwOpMode = BW_OPMODE_20MHZ;
+                       regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+                       regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
                }
                else
 #endif
                {
-                   regBwOpMode = BW_OPMODE_20MHZ;
-                   regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
-                   regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+                       regBwOpMode = BW_OPMODE_20MHZ;
+                       regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
+                       regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
                }
                break;
        case WIRELESS_MODE_N_24G:
                // It support CCK rate by default.
                // CCK rate will be filtered out only when associated AP does not support it.
                regBwOpMode = BW_OPMODE_20MHZ;
-                       regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
-                       regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
+               regRATR = RATE_ALL_CCK | RATE_ALL_OFDM_AG | RATE_ALL_OFDM_1SS | RATE_ALL_OFDM_2SS;
+               regRRSR = RATE_ALL_CCK | RATE_ALL_OFDM_AG;
                break;
        case WIRELESS_MODE_N_5G:
                regBwOpMode = BW_OPMODE_5G;
@@ -3194,17 +2902,12 @@ void rtl8192_hwconfig(struct net_device* dev)
        }
 
        write_nic_byte(dev, BW_OPMODE, regBwOpMode);
-       {
-               u32 ratr_value = 0;
-               ratr_value = regRATR;
-               if (priv->rf_type == RF_1T2R)
-               {
-                       ratr_value &= ~(RATE_ALL_OFDM_2SS);
-               }
-               write_nic_dword(dev, RATR0, ratr_value);
-               write_nic_byte(dev, UFWP, 1);
-       }
-       regTmp = read_nic_byte(dev, 0x313);
+       ratr_value = regRATR;
+       if (priv->rf_type == RF_1T2R)
+               ratr_value &= ~(RATE_ALL_OFDM_2SS);
+       write_nic_dword(dev, RATR0, ratr_value);
+       write_nic_byte(dev, UFWP, 1);
+       read_nic_byte(dev, 0x313, &regTmp);
        regRRSR = ((regTmp) << 24) | (regRRSR & 0x00ffffff);
        write_nic_dword(dev, RRSR, regRRSR);
 
@@ -3212,8 +2915,8 @@ void rtl8192_hwconfig(struct net_device* dev)
        // Set Retry Limit here
        //
        write_nic_word(dev, RETRY_LIMIT,
-                       priv->ShortRetryLimit << RETRY_LIMIT_SHORT_SHIFT | \
-                       priv->LongRetryLimit << RETRY_LIMIT_LONG_SHIFT);
+                      priv->ShortRetryLimit << RETRY_LIMIT_SHORT_SHIFT |
+                      priv->LongRetryLimit << RETRY_LIMIT_LONG_SHIFT);
        // Set Contention Window here
 
        // Set Tx AGC
@@ -3232,7 +2935,9 @@ bool rtl8192_adapter_start(struct net_device *dev)
        struct r8192_priv *priv = ieee80211_priv(dev);
        u32 dwRegRead = 0;
        bool init_status = true;
-       RT_TRACE(COMP_INIT, "====>%s()\n", __FUNCTION__);
+       u8 SECR_value = 0x0;
+       u8 tmp;
+       RT_TRACE(COMP_INIT, "====>%s()\n", __func__);
        priv->Rf_Mode = RF_OP_By_SW_3wire;
        //for ASIC power on sequence
        write_nic_byte_E(dev, 0x5f, 0x80);
@@ -3242,34 +2947,31 @@ bool rtl8192_adapter_start(struct net_device *dev)
        write_nic_byte_E(dev, 0x5e, 0x80);
        write_nic_byte(dev, 0x17, 0x37);
        mdelay(10);
-//#ifdef TO_DO_LIST
        priv->pFirmware->firmware_status = FW_STATUS_0_INIT;
        //config CPUReset Register
        //Firmware Reset or not?
-       dwRegRead = read_nic_dword(dev, CPU_GEN);
+       read_nic_dword(dev, CPU_GEN, &dwRegRead);
        if (priv->pFirmware->firmware_status == FW_STATUS_0_INIT)
                dwRegRead |= CPU_GEN_SYSTEM_RESET; //do nothing here?
        else if (priv->pFirmware->firmware_status == FW_STATUS_5_READY)
                dwRegRead |= CPU_GEN_FIRMWARE_RESET;
        else
-               RT_TRACE(COMP_ERR, "ERROR in %s(): undefined firmware state(%d)\n", __FUNCTION__,   priv->pFirmware->firmware_status);
+               RT_TRACE(COMP_ERR, "ERROR in %s(): undefined firmware state(%d)\n", __func__,   priv->pFirmware->firmware_status);
 
        write_nic_dword(dev, CPU_GEN, dwRegRead);
-       //mdelay(30);
        //config BB.
        rtl8192_BBConfig(dev);
 
        //Loopback mode or not
        priv->LoopbackMode = RTL819xU_NO_LOOPBACK;
-//     priv->LoopbackMode = RTL819xU_MAC_LOOPBACK;
 
-       dwRegRead = read_nic_dword(dev, CPU_GEN);
+       read_nic_dword(dev, CPU_GEN, &dwRegRead);
        if (priv->LoopbackMode == RTL819xU_NO_LOOPBACK)
                dwRegRead = ((dwRegRead & CPU_GEN_NO_LOOPBACK_MSK) | CPU_GEN_NO_LOOPBACK_SET);
        else if (priv->LoopbackMode == RTL819xU_MAC_LOOPBACK)
                dwRegRead |= CPU_CCK_LOOPBACK;
        else
-               RT_TRACE(COMP_ERR, "Serious error in %s(): wrong loopback mode setting(%d)\n", __FUNCTION__,  priv->LoopbackMode);
+               RT_TRACE(COMP_ERR, "Serious error in %s(): wrong loopback mode setting(%d)\n", __func__,  priv->LoopbackMode);
 
        write_nic_dword(dev, CPU_GEN, dwRegRead);
 
@@ -3277,7 +2979,8 @@ bool rtl8192_adapter_start(struct net_device *dev)
        udelay(500);
 
        //xiong add for new bitfile:usb suspend reset pin set to 1. //do we need?
-       write_nic_byte_E(dev, 0x5f, (read_nic_byte_E(dev, 0x5f)|0x20));
+       read_nic_byte_E(dev, 0x5f, &tmp);
+       write_nic_byte_E(dev, 0x5f, tmp|0x20);
 
        //Set Hardware
        rtl8192_hwconfig(dev);
@@ -3286,61 +2989,54 @@ bool rtl8192_adapter_start(struct net_device *dev)
        write_nic_byte(dev, CMDR, CR_RE|CR_TE);
 
        //set IDR0 here
-       write_nic_dword(dev, MAC0, ((u32*)dev->dev_addr)[0]);
-       write_nic_word(dev, MAC4, ((u16*)(dev->dev_addr + 4))[0]);
+       write_nic_dword(dev, MAC0, ((u32 *)dev->dev_addr)[0]);
+       write_nic_word(dev, MAC4, ((u16 *)(dev->dev_addr + 4))[0]);
 
        //set RCR
        write_nic_dword(dev, RCR, priv->ReceiveConfig);
 
        //Initialize Number of Reserved Pages in Firmware Queue
-       write_nic_dword(dev, RQPN1,  NUM_OF_PAGE_IN_FW_QUEUE_BK << RSVD_FW_QUEUE_PAGE_BK_SHIFT |\
-                                               NUM_OF_PAGE_IN_FW_QUEUE_BE << RSVD_FW_QUEUE_PAGE_BE_SHIFT | \
-                                               NUM_OF_PAGE_IN_FW_QUEUE_VI << RSVD_FW_QUEUE_PAGE_VI_SHIFT | \
-                                               NUM_OF_PAGE_IN_FW_QUEUE_VO <<RSVD_FW_QUEUE_PAGE_VO_SHIFT);
-       write_nic_dword(dev, RQPN2, NUM_OF_PAGE_IN_FW_QUEUE_MGNT << RSVD_FW_QUEUE_PAGE_MGNT_SHIFT |\
-                                               NUM_OF_PAGE_IN_FW_QUEUE_CMD << RSVD_FW_QUEUE_PAGE_CMD_SHIFT);
-       write_nic_dword(dev, RQPN3, APPLIED_RESERVED_QUEUE_IN_FW| \
-                                               NUM_OF_PAGE_IN_FW_QUEUE_BCN<<RSVD_FW_QUEUE_PAGE_BCN_SHIFT
-//                                             | NUM_OF_PAGE_IN_FW_QUEUE_PUB<<RSVD_FW_QUEUE_PAGE_PUB_SHIFT
-                                               );
+       write_nic_dword(dev, RQPN1,  NUM_OF_PAGE_IN_FW_QUEUE_BK << RSVD_FW_QUEUE_PAGE_BK_SHIFT |
+                       NUM_OF_PAGE_IN_FW_QUEUE_BE << RSVD_FW_QUEUE_PAGE_BE_SHIFT |
+                       NUM_OF_PAGE_IN_FW_QUEUE_VI << RSVD_FW_QUEUE_PAGE_VI_SHIFT |
+                       NUM_OF_PAGE_IN_FW_QUEUE_VO <<RSVD_FW_QUEUE_PAGE_VO_SHIFT);
+       write_nic_dword(dev, RQPN2, NUM_OF_PAGE_IN_FW_QUEUE_MGNT << RSVD_FW_QUEUE_PAGE_MGNT_SHIFT |
+                       NUM_OF_PAGE_IN_FW_QUEUE_CMD << RSVD_FW_QUEUE_PAGE_CMD_SHIFT);
+       write_nic_dword(dev, RQPN3, APPLIED_RESERVED_QUEUE_IN_FW|
+                       NUM_OF_PAGE_IN_FW_QUEUE_BCN<<RSVD_FW_QUEUE_PAGE_BCN_SHIFT);
        write_nic_dword(dev, RATR0+4*7, (RATE_ALL_OFDM_AG | RATE_ALL_CCK));
 
        //Set AckTimeout
        // TODO: (it value is only for FPGA version). need to be changed!!2006.12.18, by Emily
        write_nic_byte(dev, ACK_TIMEOUT, 0x30);
 
-//     RT_TRACE(COMP_INIT, "%s():priv->ResetProgress is %d\n", __FUNCTION__,priv->ResetProgress);
-       if(priv->ResetProgress == RESET_TYPE_NORESET)
-       rtl8192_SetWirelessMode(dev, priv->ieee80211->mode);
-       if(priv->ResetProgress == RESET_TYPE_NORESET){
-       CamResetAllEntry(dev);
-       {
-               u8 SECR_value = 0x0;
+       if (priv->ResetProgress == RESET_TYPE_NORESET)
+               rtl8192_SetWirelessMode(dev, priv->ieee80211->mode);
+       if (priv->ResetProgress == RESET_TYPE_NORESET) {
+               CamResetAllEntry(dev);
                SECR_value |= SCR_TxEncEnable;
                SECR_value |= SCR_RxDecEnable;
                SECR_value |= SCR_NoSKMC;
                write_nic_byte(dev, SECR, SECR_value);
        }
-       }
 
        //Beacon related
        write_nic_word(dev, ATIMWND, 2);
        write_nic_word(dev, BCN_INTERVAL, 100);
 
-       {
 #define DEFAULT_EDCA 0x005e4332
+       {
                int i;
-               for (i=0; i<QOS_QUEUE_NUM; i++)
-               write_nic_dword(dev, WDCAPARA_ADD[i], DEFAULT_EDCA);
+               for (i = 0; i < QOS_QUEUE_NUM; i++)
+                       write_nic_dword(dev, WDCAPARA_ADD[i], DEFAULT_EDCA);
        }
 #ifdef USB_RX_AGGREGATION_SUPPORT
        //3 For usb rx firmware aggregation control
-       if(priv->ResetProgress == RESET_TYPE_NORESET)
-       {
+       if (priv->ResetProgress == RESET_TYPE_NORESET) {
                u32 ulValue;
                PRT_HIGH_THROUGHPUT     pHTInfo = priv->ieee80211->pHTInfo;
                ulValue = (pHTInfo->UsbRxFwAggrEn<<24) | (pHTInfo->UsbRxFwAggrPageNum<<16) |
-                                       (pHTInfo->UsbRxFwAggrPacketNum<<8) | (pHTInfo->UsbRxFwAggrTimeout);
+                         (pHTInfo->UsbRxFwAggrPacketNum<<8) | (pHTInfo->UsbRxFwAggrTimeout);
                /*
                 * If usb rx firmware aggregation is enabled,
                 * when anyone of three threshold conditions above is reached,
@@ -3353,63 +3049,52 @@ bool rtl8192_adapter_start(struct net_device *dev)
 
        rtl8192_phy_configmac(dev);
 
-       if (priv->card_8192_version == (u8) VERSION_819xU_A)
-       {
+       if (priv->card_8192_version == (u8) VERSION_819xU_A) {
                rtl8192_phy_getTxPower(dev);
                rtl8192_phy_setTxPower(dev, priv->chan);
        }
 
        //Firmware download
        init_status = init_firmware(dev);
-       if(!init_status)
-       {
-               RT_TRACE(COMP_ERR,"ERR!!! %s(): Firmware download is failed\n", __FUNCTION__);
+       if (!init_status) {
+               RT_TRACE(COMP_ERR, "ERR!!! %s(): Firmware download is failed\n", __func__);
                return init_status;
        }
-       RT_TRACE(COMP_INIT, "%s():after firmware download\n", __FUNCTION__);
+       RT_TRACE(COMP_INIT, "%s():after firmware download\n", __func__);
        //
 #ifdef TO_DO_LIST
-if(Adapter->ResetProgress == RESET_TYPE_NORESET)
-       {
-               if(pMgntInfo->RegRfOff == TRUE)
-               { // User disable RF via registry.
+       if (Adapter->ResetProgress == RESET_TYPE_NORESET) {
+               if (pMgntInfo->RegRfOff == TRUE) { // User disable RF via registry.
                        RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): Turn off RF for RegRfOff ----------\n"));
                        MgntActSet_RF_State(Adapter, eRfOff, RF_CHANGE_BY_SW);
                        // Those actions will be discard in MgntActSet_RF_State because of the same state
-                       for(eRFPath = 0; eRFPath <pHalData->NumTotalRFPath; eRFPath++)
+                       for (eRFPath = 0; eRFPath < pHalData->NumTotalRFPath; eRFPath++)
                                PHY_SetRFReg(Adapter, (RF90_RADIO_PATH_E)eRFPath, 0x4, 0xC00, 0x0);
-               }
-               else if(pMgntInfo->RfOffReason > RF_CHANGE_BY_PS)
-               { // H/W or S/W RF OFF before sleep.
+               } else if (pMgntInfo->RfOffReason > RF_CHANGE_BY_PS) { // H/W or S/W RF OFF before sleep.
                        RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): Turn off RF for RfOffReason(%d) ----------\n", pMgntInfo->RfOffReason));
                        MgntActSet_RF_State(Adapter, eRfOff, pMgntInfo->RfOffReason);
-               }
-               else
-               {
+               } else {
                        pHalData->eRFPowerState = eRfOn;
                        pMgntInfo->RfOffReason = 0;
                        RT_TRACE((COMP_INIT|COMP_RF), DBG_LOUD, ("InitializeAdapter819xUsb(): RF is on ----------\n"));
                }
-       }
-       else
-       {
-               if(pHalData->eRFPowerState == eRfOff)
-               {
+       } else {
+               if (pHalData->eRFPowerState == eRfOff) {
                        MgntActSet_RF_State(Adapter, eRfOff, pMgntInfo->RfOffReason);
                        // Those actions will be discard in MgntActSet_RF_State because of the same state
-                       for(eRFPath = 0; eRFPath <pHalData->NumTotalRFPath; eRFPath++)
+                       for (eRFPath = 0; eRFPath < pHalData->NumTotalRFPath; eRFPath++)
                                PHY_SetRFReg(Adapter, (RF90_RADIO_PATH_E)eRFPath, 0x4, 0xC00, 0x0);
                }
        }
 #endif
        //config RF.
-       if(priv->ResetProgress == RESET_TYPE_NORESET){
-       rtl8192_phy_RFConfig(dev);
-       RT_TRACE(COMP_INIT, "%s():after phy RF config\n", __FUNCTION__);
+       if (priv->ResetProgress == RESET_TYPE_NORESET) {
+               rtl8192_phy_RFConfig(dev);
+               RT_TRACE(COMP_INIT, "%s():after phy RF config\n", __func__);
        }
 
 
-       if(priv->ieee80211->FwRWRF)
+       if (priv->ieee80211->FwRWRF)
                // We can force firmware to do RF-R/W
                priv->Rf_Mode = RF_OP_By_FW;
        else
@@ -3421,54 +3106,44 @@ if(Adapter->ResetProgress == RESET_TYPE_NORESET)
        rtl8192_setBBreg(dev, rFPGA0_RFMOD, bCCKEn, 0x1);
        rtl8192_setBBreg(dev, rFPGA0_RFMOD, bOFDMEn, 0x1);
 
-       if(priv->ResetProgress == RESET_TYPE_NORESET)
-       {
+       if (priv->ResetProgress == RESET_TYPE_NORESET) {
                //if D or C cut
-               u8 tmpvalue = read_nic_byte(dev, 0x301);
-               if(tmpvalue ==0x03)
-               {
+               u8 tmpvalue;
+               read_nic_byte(dev, 0x301, &tmpvalue);
+               if (tmpvalue == 0x03) {
                        priv->bDcut = TRUE;
                        RT_TRACE(COMP_POWER_TRACKING, "D-cut\n");
-               }
-               else
-               {
+               } else {
                        priv->bDcut = FALSE;
                        RT_TRACE(COMP_POWER_TRACKING, "C-cut\n");
                }
                dm_initialize_txpower_tracking(dev);
 
-               if(priv->bDcut == TRUE)
-               {
+               if (priv->bDcut == TRUE) {
                        u32 i, TempCCk;
-                       u32 tmpRegA= rtl8192_QueryBBReg(dev,rOFDM0_XATxIQImbalance,bMaskDWord);
-               //      u32 tmpRegC= rtl8192_QueryBBReg(dev,rOFDM0_XCTxIQImbalance,bMaskDWord);
-                       for(i = 0; i<TxBBGainTableLength; i++)
-                       {
-                               if(tmpRegA == priv->txbbgain_table[i].txbbgain_value)
-                               {
-                                       priv->rfa_txpowertrackingindex= (u8)i;
-                                       priv->rfa_txpowertrackingindex_real= (u8)i;
-                                       priv->rfa_txpowertracking_default= priv->rfa_txpowertrackingindex;
+                       u32 tmpRegA = rtl8192_QueryBBReg(dev, rOFDM0_XATxIQImbalance, bMaskDWord);
+                       for (i = 0; i < TxBBGainTableLength; i++) {
+                               if (tmpRegA == priv->txbbgain_table[i].txbbgain_value) {
+                                       priv->rfa_txpowertrackingindex = (u8)i;
+                                       priv->rfa_txpowertrackingindex_real = (u8)i;
+                                       priv->rfa_txpowertracking_default = priv->rfa_txpowertrackingindex;
                                        break;
                                }
                        }
 
                        TempCCk = rtl8192_QueryBBReg(dev, rCCK0_TxFilter1, bMaskByte2);
 
-                       for(i=0 ; i<CCKTxBBGainTableLength ; i++)
-                       {
+                       for (i = 0; i < CCKTxBBGainTableLength; i++) {
 
-                               if(TempCCk == priv->cck_txbbgain_table[i].ccktxbb_valuearray[0])
-                               {
-                                       priv->cck_present_attentuation_20Mdefault=(u8) i;
+                               if (TempCCk == priv->cck_txbbgain_table[i].ccktxbb_valuearray[0]) {
+                                       priv->cck_present_attentuation_20Mdefault = (u8) i;
                                        break;
                                }
                        }
-                       priv->cck_present_attentuation_40Mdefault= 0;
-                       priv->cck_present_attentuation_difference= 0;
+                       priv->cck_present_attentuation_40Mdefault = 0;
+                       priv->cck_present_attentuation_difference = 0;
                        priv->cck_present_attentuation = priv->cck_present_attentuation_20Mdefault;
 
-       //              pMgntInfo->bTXPowerTracking = FALSE;//TEMPLY DISABLE
                }
        }
        write_nic_byte(dev, 0x87, 0x0);
@@ -3492,16 +3167,14 @@ static struct net_device_stats *rtl8192_stats(struct net_device *dev)
        return &priv->ieee80211->stats;
 }
 
-bool
-HalTxCheckStuck819xUsb(
-       struct net_device *dev
-       )
+bool HalTxCheckStuck819xUsb(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       u16             RegTxCounter = read_nic_word(dev, 0x128);
+       u16             RegTxCounter;
        bool            bStuck = FALSE;
-       RT_TRACE(COMP_RESET,"%s():RegTxCounter is %d,TxCounter is %d\n",__FUNCTION__,RegTxCounter,priv->TxCounter);
-       if(priv->TxCounter==RegTxCounter)
+       read_nic_word(dev, 0x128, &RegTxCounter);
+       RT_TRACE(COMP_RESET, "%s():RegTxCounter is %d,TxCounter is %d\n", __func__, RegTxCounter, priv->TxCounter);
+       if (priv->TxCounter == RegTxCounter)
                bStuck = TRUE;
 
        priv->TxCounter = RegTxCounter;
@@ -3513,43 +3186,30 @@ HalTxCheckStuck819xUsb(
 *      <Assumption: RT_TX_SPINLOCK is acquired.>
 *      First added: 2006.11.19 by emily
 */
-RESET_TYPE
-TxCheckStuck(struct net_device *dev)
+RESET_TYPE TxCheckStuck(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8                      QueueID;
-//     PRT_TCB                 pTcb;
-//     u8                      ResetThreshold;
        bool                    bCheckFwTxCnt = false;
-       //unsigned long flags;
 
        //
        // Decide such threshold according to current power save mode
        //
 
-//     RT_TRACE(COMP_RESET, " ==> TxCheckStuck()\n");
-//          PlatformAcquireSpinLock(Adapter, RT_TX_SPINLOCK);
-//          spin_lock_irqsave(&priv->ieee80211->lock,flags);
-            for (QueueID = 0; QueueID<=BEACON_QUEUE;QueueID ++)
-            {
-                       if(QueueID == TXCMD_QUEUE)
-                        continue;
+       for (QueueID = 0; QueueID <= BEACON_QUEUE; QueueID++) {
+               if (QueueID == TXCMD_QUEUE)
+                       continue;
 #ifdef USB_TX_DRIVER_AGGREGATION_ENABLE
-                       if((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_drv_aggQ[QueueID]) == 0))
+               if ((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0) && (skb_queue_len(&priv->ieee80211->skb_drv_aggQ[QueueID]) == 0))
 #else
-                       if((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0)  && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0))
+               if ((skb_queue_len(&priv->ieee80211->skb_waitQ[QueueID]) == 0)  && (skb_queue_len(&priv->ieee80211->skb_aggQ[QueueID]) == 0))
 #endif
                                continue;
 
-                    bCheckFwTxCnt = true;
-            }
-//          PlatformReleaseSpinLock(Adapter, RT_TX_SPINLOCK);
-//     spin_unlock_irqrestore(&priv->ieee80211->lock,flags);
-//     RT_TRACE(COMP_RESET,"bCheckFwTxCnt is %d\n",bCheckFwTxCnt);
-       if(bCheckFwTxCnt)
-       {
-               if(HalTxCheckStuck819xUsb(dev))
-               {
+               bCheckFwTxCnt = true;
+       }
+       if (bCheckFwTxCnt) {
+               if (HalTxCheckStuck819xUsb(dev)) {
                        RT_TRACE(COMP_RESET, "TxCheckStuck(): Fw indicates no Tx condition! \n");
                        return RESET_TYPE_SILENT;
                }
@@ -3557,64 +3217,41 @@ TxCheckStuck(struct net_device *dev)
        return RESET_TYPE_NORESET;
 }
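
HalTxCheckStuck819xUsb() boils down to one idea: read a hardware Tx-frame counter, compare it against the value cached on the previous watchdog pass, and report a stall if it has not advanced. A hedged sketch of that pattern (structure and names are illustrative, not the driver's):

    #include <stdbool.h>
    #include <stdint.h>

    struct stall_check {
            uint16_t last_count;    /* counter value from the previous poll */
    };

    static bool counter_stalled(struct stall_check *sc, uint16_t hw_count)
    {
            bool stuck = (hw_count == sc->last_count);

            sc->last_count = hw_count;      /* remember for the next poll */
            return stuck;
    }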
 
-bool
-HalRxCheckStuck819xUsb(struct net_device *dev)
+bool HalRxCheckStuck819xUsb(struct net_device *dev)
 {
-       u16     RegRxCounter = read_nic_word(dev, 0x130);
+       u16     RegRxCounter;
        struct r8192_priv *priv = ieee80211_priv(dev);
        bool bStuck = FALSE;
        static u8       rx_chk_cnt;
-       RT_TRACE(COMP_RESET,"%s(): RegRxCounter is %d,RxCounter is %d\n",__FUNCTION__,RegRxCounter,priv->RxCounter);
+       read_nic_word(dev, 0x130, &RegRxCounter);
+       RT_TRACE(COMP_RESET, "%s(): RegRxCounter is %d,RxCounter is %d\n", __func__, RegRxCounter, priv->RxCounter);
        // If rssi is small, we should check rx for long time because of bad rx.
        // or maybe it will continuous silent reset every 2 seconds.
        rx_chk_cnt++;
-       if(priv->undecorated_smoothed_pwdb >= (RateAdaptiveTH_High+5))
-       {
+       if (priv->undecorated_smoothed_pwdb >= (RateAdaptiveTH_High+5)) {
                rx_chk_cnt = 0; //high rssi, check rx stuck right now.
-       }
-       else if(priv->undecorated_smoothed_pwdb < (RateAdaptiveTH_High+5) &&
-               ((priv->CurrentChannelBW!=HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb>=RateAdaptiveTH_Low_40M) ||
-               (priv->CurrentChannelBW==HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb>=RateAdaptiveTH_Low_20M)) )
-       {
-               if(rx_chk_cnt < 2)
-               {
+       } else if (priv->undecorated_smoothed_pwdb < (RateAdaptiveTH_High+5) &&
+                  ((priv->CurrentChannelBW != HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb >= RateAdaptiveTH_Low_40M) ||
+                   (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb >= RateAdaptiveTH_Low_20M))) {
+               if (rx_chk_cnt < 2)
                        return bStuck;
-               }
                else
-               {
                        rx_chk_cnt = 0;
-               }
-       }
-       else if(((priv->CurrentChannelBW!=HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb<RateAdaptiveTH_Low_40M) ||
-               (priv->CurrentChannelBW==HT_CHANNEL_WIDTH_20&&priv->undecorated_smoothed_pwdb<RateAdaptiveTH_Low_20M)) &&
-               priv->undecorated_smoothed_pwdb >= VeryLowRSSI)
-       {
-               if(rx_chk_cnt < 4)
-               {
-                       //DbgPrint("RSSI < %d && RSSI >= %d, no check this time \n", RateAdaptiveTH_Low, VeryLowRSSI);
+       } else if (((priv->CurrentChannelBW != HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb < RateAdaptiveTH_Low_40M) ||
+                   (priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20 && priv->undecorated_smoothed_pwdb < RateAdaptiveTH_Low_20M)) &&
+                    priv->undecorated_smoothed_pwdb >= VeryLowRSSI) {
+               if (rx_chk_cnt < 4)
                        return bStuck;
-               }
                else
-               {
                        rx_chk_cnt = 0;
-                       //DbgPrint("RSSI < %d && RSSI >= %d, check this time \n", RateAdaptiveTH_Low, VeryLowRSSI);
-               }
-       }
-       else
-       {
-               if(rx_chk_cnt < 8)
-               {
-                       //DbgPrint("RSSI <= %d, no check this time \n", VeryLowRSSI);
+       } else {
+               if (rx_chk_cnt < 8)
                        return bStuck;
-               }
                else
-               {
                        rx_chk_cnt = 0;
-                       //DbgPrint("RSSI <= %d, check this time \n", VeryLowRSSI);
-               }
        }
 
-       if(priv->RxCounter==RegRxCounter)
+       if (priv->RxCounter == RegRxCounter)
                bStuck = TRUE;
 
        priv->RxCounter = RegRxCounter;
@@ -3622,25 +3259,16 @@ HalRxCheckStuck819xUsb(struct net_device *dev)
        return bStuck;
 }
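 
 /*
  * Note: HalRxCheckStuck819xUsb() throttles itself by RSSI band so that a
  * quiet link is not mistaken for a stuck one: with strong signal
  * (pwdb >= RateAdaptiveTH_High+5) the check runs on every watchdog pass,
  * the two middle bands run it only every 2nd or 4th pass, and very weak
  * signal only every 8th pass. The check itself boils down to a
  * read/compare/cache of the firmware Rx counter at register 0x130; in
  * sketch form, using the driver's own helper:
  *
  *	u16 cnt;
  *	read_nic_word(dev, 0x130, &cnt);	// FW-maintained Rx counter
  *	stuck = (priv->RxCounter == cnt);	// no progress since last pass
  *	priv->RxCounter = cnt;			// remember for the next pass
  */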
 
-RESET_TYPE
-RxCheckStuck(struct net_device *dev)
+RESET_TYPE RxCheckStuck(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       //int                     i;
        bool        bRxCheck = FALSE;
 
-//       RT_TRACE(COMP_RESET," ==> RxCheckStuck()\n");
-       //PlatformAcquireSpinLock(Adapter, RT_RX_SPINLOCK);
-
-        if(priv->IrpPendingCount > 1)
+       if (priv->IrpPendingCount > 1)
                bRxCheck = TRUE;
-       //PlatformReleaseSpinLock(Adapter, RT_RX_SPINLOCK);
 
-//       RT_TRACE(COMP_RESET,"bRxCheck is %d \n",bRxCheck);
-       if(bRxCheck)
-       {
-               if(HalRxCheckStuck819xUsb(dev))
-               {
+       if (bRxCheck) {
+               if (HalRxCheckStuck819xUsb(dev)) {
                        RT_TRACE(COMP_RESET, "RxStuck Condition\n");
                        return RESET_TYPE_SILENT;
                }
@@ -3661,8 +3289,7 @@ RxCheckStuck(struct net_device *dev)
 *
 *      8185 and 8185b does not implement this function. This is added by Emily at 2006.11.24
 */
-RESET_TYPE
-rtl819x_ifcheck_resetornot(struct net_device *dev)
+RESET_TYPE rtl819x_ifcheck_resetornot(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        RESET_TYPE      TxResetType = RESET_TYPE_NORESET;
@@ -3672,10 +3299,8 @@ rtl819x_ifcheck_resetornot(struct net_device *dev)
        rfState = priv->ieee80211->eRFPowerState;
 
        TxResetType = TxCheckStuck(dev);
-       if( rfState != eRfOff ||
-               /*ADAPTER_TEST_STATUS_FLAG(Adapter, ADAPTER_STATUS_FW_DOWNLOAD_FAILURE)) &&*/
-               (priv->ieee80211->iw_mode != IW_MODE_ADHOC))
-       {
+       if (rfState != eRfOff ||
+           (priv->ieee80211->iw_mode != IW_MODE_ADHOC)) {
                // If driver is in the status of firmware download failure , driver skips RF initialization and RF is
                // in turned off state. Driver should check whether Rx stuck and do silent reset. And
                // if driver is in firmware download failure status, driver should initialize RF in the following
@@ -3686,155 +3311,91 @@ rtl819x_ifcheck_resetornot(struct net_device *dev)
                // set, STA cannot hear any packet at all. Emily, 2008.04.12
                RxResetType = RxCheckStuck(dev);
        }
-       if(TxResetType==RESET_TYPE_NORMAL || RxResetType==RESET_TYPE_NORMAL)
+       if (TxResetType == RESET_TYPE_NORMAL || RxResetType == RESET_TYPE_NORMAL) {
                return RESET_TYPE_NORMAL;
-       else if(TxResetType==RESET_TYPE_SILENT || RxResetType==RESET_TYPE_SILENT){
-               RT_TRACE(COMP_RESET,"%s():silent reset\n",__FUNCTION__);
+       } else if (TxResetType == RESET_TYPE_SILENT || RxResetType == RESET_TYPE_SILENT) {
+               RT_TRACE(COMP_RESET, "%s():silent reset\n", __func__);
                return RESET_TYPE_SILENT;
-       }
-       else
+       } else {
                return RESET_TYPE_NORESET;
+       }
 
 }
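 
 /*
  * Note: the merge above is a severity ranking over the two sub-checks:
  * RESET_TYPE_NORMAL from either path wins over RESET_TYPE_SILENT, which
  * in turn wins over RESET_TYPE_NORESET. The Rx check only runs when the
  * RF is not off or the interface is not in ad-hoc mode, per the comment
  * in the function body.
  */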
 
-void rtl8192_cancel_deferred_work(struct r8192_priv* priv);
+void rtl8192_cancel_deferred_work(struct r8192_priv *priv);
 int _rtl8192_up(struct net_device *dev);
 int rtl8192_close(struct net_device *dev);
 
 
 
-void
-CamRestoreAllEntry(    struct net_device *dev)
+void CamRestoreAllEntry(struct net_device *dev)
 {
        u8 EntryId = 0;
        struct r8192_priv *priv = ieee80211_priv(dev);
-       u8*     MacAddr = priv->ieee80211->current_network.bssid;
+       u8      *MacAddr = priv->ieee80211->current_network.bssid;
 
        static u8       CAM_CONST_ADDR[4][6] = {
                {0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
                {0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
                {0x00, 0x00, 0x00, 0x00, 0x00, 0x02},
-               {0x00, 0x00, 0x00, 0x00, 0x00, 0x03}};
-       static u8       CAM_CONST_BROAD[] =
-               {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+               {0x00, 0x00, 0x00, 0x00, 0x00, 0x03} };
+       static u8       CAM_CONST_BROAD[] = {
+               0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 
        RT_TRACE(COMP_SEC, "CamRestoreAllEntry: \n");
 
 
-       if ((priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP40)||
-           (priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP104))
-       {
+       if ((priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP40) ||
+           (priv->ieee80211->pairwise_key_type == KEY_TYPE_WEP104)) {
 
-               for(EntryId=0; EntryId<4; EntryId++)
-               {
-                       {
-                               MacAddr = CAM_CONST_ADDR[EntryId];
-                               setKey(dev,
-                                               EntryId ,
-                                               EntryId,
-                                               priv->ieee80211->pairwise_key_type,
-                                               MacAddr,
-                                               0,
-                                               NULL);
-                       }
+               for (EntryId = 0; EntryId < 4; EntryId++) {
+                       MacAddr = CAM_CONST_ADDR[EntryId];
+                       setKey(dev, EntryId, EntryId,
+                              priv->ieee80211->pairwise_key_type,
+                              MacAddr, 0, NULL);
                }
 
-       }
-       else if(priv->ieee80211->pairwise_key_type == KEY_TYPE_TKIP)
-       {
+       } else if (priv->ieee80211->pairwise_key_type == KEY_TYPE_TKIP) {
 
-               {
-                       if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-                               setKey(dev,
-                                               4,
-                                               0,
-                                               priv->ieee80211->pairwise_key_type,
-                                               (u8*)dev->dev_addr,
-                                               0,
-                                               NULL);
-                       else
-                               setKey(dev,
-                                               4,
-                                               0,
-                                               priv->ieee80211->pairwise_key_type,
-                                               MacAddr,
-                                               0,
-                                               NULL);
-               }
-       }
-       else if(priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP)
-       {
+               if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+                       setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+                              (u8 *)dev->dev_addr, 0, NULL);
+               else
+                       setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+                              MacAddr, 0, NULL);
+       } else if (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP) {
 
-               {
-                       if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-                               setKey(dev,
-                                               4,
-                                               0,
-                                               priv->ieee80211->pairwise_key_type,
-                                               (u8*)dev->dev_addr,
-                                               0,
-                                               NULL);
-                       else
-                               setKey(dev,
-                                               4,
-                                               0,
-                                               priv->ieee80211->pairwise_key_type,
-                                               MacAddr,
-                                               0,
-                                               NULL);
-               }
+               if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+                       setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+                              (u8 *)dev->dev_addr, 0, NULL);
+               else
+                       setKey(dev, 4, 0, priv->ieee80211->pairwise_key_type,
+                              MacAddr, 0, NULL);
        }
 
 
 
-       if(priv->ieee80211->group_key_type == KEY_TYPE_TKIP)
-       {
+       if (priv->ieee80211->group_key_type == KEY_TYPE_TKIP) {
                MacAddr = CAM_CONST_BROAD;
-               for(EntryId=1 ; EntryId<4 ; EntryId++)
-               {
-                       {
-                               setKey(dev,
-                                               EntryId,
-                                               EntryId,
-                                               priv->ieee80211->group_key_type,
-                                               MacAddr,
-                                               0,
-                                               NULL);
-                       }
+               for (EntryId = 1; EntryId < 4; EntryId++) {
+                       setKey(dev, EntryId, EntryId,
+                              priv->ieee80211->group_key_type,
+                              MacAddr, 0, NULL);
                }
-               if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-                               setKey(dev,
-                                               0,
-                                               0,
-                                               priv->ieee80211->group_key_type,
-                                               CAM_CONST_ADDR[0],
-                                               0,
-                                               NULL);
-       }
-       else if(priv->ieee80211->group_key_type == KEY_TYPE_CCMP)
-       {
+               if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+                       setKey(dev, 0, 0, priv->ieee80211->group_key_type,
+                              CAM_CONST_ADDR[0], 0, NULL);
+       } else if (priv->ieee80211->group_key_type == KEY_TYPE_CCMP) {
                MacAddr = CAM_CONST_BROAD;
-               for(EntryId=1; EntryId<4 ; EntryId++)
-               {
-                       {
-                               setKey(dev,
-                                               EntryId ,
-                                               EntryId,
-                                               priv->ieee80211->group_key_type,
-                                               MacAddr,
-                                               0,
-                                               NULL);
-                       }
+               for (EntryId = 1; EntryId < 4; EntryId++) {
+                       setKey(dev, EntryId, EntryId,
+                              priv->ieee80211->group_key_type,
+                              MacAddr, 0, NULL);
                }
 
-               if(priv->ieee80211->iw_mode == IW_MODE_ADHOC)
-                               setKey(dev,
-                                               0 ,
-                                               0,
-                                               priv->ieee80211->group_key_type,
-                                               CAM_CONST_ADDR[0],
-                                               0,
-                                               NULL);
+               if (priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+                       setKey(dev, 0, 0, priv->ieee80211->group_key_type,
+                              CAM_CONST_ADDR[0], 0, NULL);
        }
 }
 //////////////////////////////////////////////////////////////
@@ -3843,10 +3404,8 @@ CamRestoreAllEntry(      struct net_device *dev)
 // The method checking Tx/Rx stuck of this function is supported by FW,
 // which reports Tx and Rx counter to register 0x128 and 0x130.
 //////////////////////////////////////////////////////////////
-void
-rtl819x_ifsilentreset(struct net_device *dev)
+void rtl819x_ifsilentreset(struct net_device *dev)
 {
-       //OCTET_STRING asocpdu;
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8      reset_times = 0;
        int reset_status = 0;
@@ -3856,26 +3415,21 @@ rtl819x_ifsilentreset(struct net_device *dev)
        // 2007.07.20. If we need to check CCK stop, please uncomment this line.
        //bStuck = Adapter->HalFunc.CheckHWStopHandler(Adapter);
 
-       if(priv->ResetProgress==RESET_TYPE_NORESET)
-       {
+       if (priv->ResetProgress == RESET_TYPE_NORESET) {
 RESET_START:
 
-               RT_TRACE(COMP_RESET,"=========>Reset progress!! \n");
+               RT_TRACE(COMP_RESET, "=========>Reset progress!! \n");
 
                // Set the variable for reset.
                priv->ResetProgress = RESET_TYPE_SILENT;
-//             rtl8192_close(dev);
                down(&priv->wx_sem);
-               if(priv->up == 0)
-               {
-                       RT_TRACE(COMP_ERR,"%s():the driver is not up! return\n",__FUNCTION__);
+               if (priv->up == 0) {
+                       RT_TRACE(COMP_ERR, "%s():the driver is not up! return\n", __func__);
                        up(&priv->wx_sem);
-                       return ;
+                       return;
                }
                priv->up = 0;
-               RT_TRACE(COMP_RESET,"%s():======>start to down the driver\n",__FUNCTION__);
-//             if(!netif_queue_stopped(dev))
-//                     netif_stop_queue(dev);
+               RT_TRACE(COMP_RESET, "%s():======>start to down the driver\n", __func__);
 
                rtl8192_rtx_disable(dev);
                rtl8192_cancel_deferred_work(priv);
@@ -3883,55 +3437,44 @@ RESET_START:
                del_timer_sync(&priv->watch_dog_timer);
 
                ieee->sync_scan_hurryup = 1;
-               if(ieee->state == IEEE80211_LINKED)
-               {
+               if (ieee->state == IEEE80211_LINKED) {
                        down(&ieee->wx_sem);
-                       printk("ieee->state is IEEE80211_LINKED\n");
+                       netdev_dbg(dev, "ieee->state is IEEE80211_LINKED\n");
                        ieee80211_stop_send_beacons(priv->ieee80211);
                        del_timer_sync(&ieee->associate_timer);
                        cancel_delayed_work(&ieee->associate_retry_wq);
                        ieee80211_stop_scan(ieee);
                        netif_carrier_off(dev);
                        up(&ieee->wx_sem);
+               } else {
+                       netdev_dbg(dev, "ieee->state is NOT LINKED\n");
+                       ieee80211_softmac_stop_protocol(priv->ieee80211);
                }
-               else{
-                       printk("ieee->state is NOT LINKED\n");
-                       ieee80211_softmac_stop_protocol(priv->ieee80211);                       }
                up(&priv->wx_sem);
-               RT_TRACE(COMP_RESET,"%s():<==========down process is finished\n",__FUNCTION__);
-       //rtl8192_irq_disable(dev);
-               RT_TRACE(COMP_RESET,"%s():===========>start up the driver\n",__FUNCTION__);
+               RT_TRACE(COMP_RESET, "%s():<==========down process is finished\n", __func__);
+               RT_TRACE(COMP_RESET, "%s():===========>start up the driver\n", __func__);
                reset_status = _rtl8192_up(dev);
 
-               RT_TRACE(COMP_RESET,"%s():<===========up process is finished\n",__FUNCTION__);
-               if(reset_status == -EAGAIN)
-               {
-                       if(reset_times < 3)
-                       {
+               RT_TRACE(COMP_RESET, "%s():<===========up process is finished\n", __func__);
+               if (reset_status == -EAGAIN) {
+                       if (reset_times < 3) {
                                reset_times++;
                                goto RESET_START;
-                       }
-                       else
-                       {
-                               RT_TRACE(COMP_ERR," ERR!!! %s():  Reset Failed!!\n", __FUNCTION__);
+                       } else {
+                               RT_TRACE(COMP_ERR, " ERR!!! %s():  Reset Failed!!\n", __func__);
                        }
                }
                ieee->is_silent_reset = 1;
                EnableHWSecurityConfig8192(dev);
-               if(ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_INFRA)
-               {
+               if (ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_INFRA) {
                        ieee->set_chan(ieee->dev, ieee->current_network.channel);
 
                        queue_work(ieee->wq, &ieee->associate_complete_wq);
 
-               }
-               else if(ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_ADHOC)
-               {
+               } else if (ieee->state == IEEE80211_LINKED && ieee->iw_mode == IW_MODE_ADHOC) {
                        ieee->set_chan(ieee->dev, ieee->current_network.channel);
                        ieee->link_change(ieee->dev);
 
-               //      notify_wx_assoc_event(ieee);
-
                        ieee80211_start_send_beacons(ieee);
 
                        if (ieee->data_hard_resume)
@@ -3944,7 +3487,7 @@ RESET_START:
                priv->ResetProgress = RESET_TYPE_NORESET;
                priv->reset_count++;
 
-               priv->bForcedSilentReset =false;
+               priv->bForcedSilentReset = false;
                priv->bResetInProgress = false;
 
                // For test --> force write UFWP.
@@ -3953,50 +3496,36 @@ RESET_START:
        }
 }
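 
 /*
  * Note: the silent reset above is a full down/up cycle: disable Rx/Tx,
  * cancel deferred work, stop the softmac, then run _rtl8192_up() again.
  * An -EAGAIN from the up path is retried via the RESET_START label up to
  * three times before the reset is declared failed; afterwards the
  * hardware security config is re-enabled and, if still associated, the
  * channel and (in ad-hoc mode) beaconing are restored.
  */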
 
-void CAM_read_entry(
-       struct net_device *dev,
-       u32                     iIndex
-)
+void CAM_read_entry(struct net_device *dev, u32 iIndex)
 {
-       u32 target_command=0;
-        u32 target_content=0;
-        u8 entry_i=0;
-        u32 ulStatus;
-       s32 i=100;
-//     printk("=======>start read CAM\n");
-       for(entry_i=0;entry_i<CAM_CONTENT_COUNT;entry_i++)
-       {
-       // polling bit, and No Write enable, and address
-               target_command= entry_i+CAM_CONTENT_COUNT*iIndex;
-               target_command= target_command | BIT31;
+       u32 target_command = 0;
+       u32 target_content = 0;
+       u8 entry_i = 0;
+       u32 ulStatus;
+       s32 i = 100;
+       for (entry_i = 0; entry_i < CAM_CONTENT_COUNT; entry_i++) {
+               // polling bit, and No Write enable, and address
+               target_command = entry_i+CAM_CONTENT_COUNT*iIndex;
+               target_command = target_command | BIT31;
 
-       //Check polling bit is clear
-//     mdelay(1);
-               while((i--)>=0)
-               {
-                       ulStatus = read_nic_dword(dev, RWCAM);
-                       if(ulStatus & BIT31){
+               //Check polling bit is clear
+               while ((i--) >= 0) {
+                       read_nic_dword(dev, RWCAM, &ulStatus);
+                       if (ulStatus & BIT31)
                                continue;
-                       }
-                       else{
+                       else
                                break;
-                       }
                }
                write_nic_dword(dev, RWCAM, target_command);
-               RT_TRACE(COMP_SEC,"CAM_read_entry(): WRITE A0: %x \n",target_command);
-        //     printk("CAM_read_entry(): WRITE A0: %lx \n",target_command);
-               target_content = read_nic_dword(dev, RCAMO);
-               RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A8: %x \n",target_content);
-        //     printk("CAM_read_entry(): WRITE A8: %lx \n",target_content);
+               RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A0: %x \n", target_command);
+               read_nic_dword(dev, RCAMO, &target_content);
+               RT_TRACE(COMP_SEC, "CAM_read_entry(): WRITE A8: %x \n", target_content);
        }
        printk("\n");
 }
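 
 /*
  * Note: CAM_read_entry() walks the CAM_CONTENT_COUNT dwords of one key
  * entry. For each dword it first polls until the RWCAM busy bit (BIT31)
  * clears, then writes the dword address with BIT31 set to trigger the
  * read, and finally fetches the result from RCAMO.
  */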
 
-void rtl819x_update_rxcounts(
-       struct r8192_priv *priv,
-       u32* TotalRxBcnNum,
-       u32* TotalRxDataNum
-)
+void rtl819x_update_rxcounts(struct r8192_priv *priv, u32 *TotalRxBcnNum,
+                            u32 *TotalRxDataNum)
 {
        u16                     SlotIndex;
        u8                      i;
@@ -4007,80 +3536,68 @@ void rtl819x_update_rxcounts(
        SlotIndex = (priv->ieee80211->LinkDetectInfo.SlotIndex++)%(priv->ieee80211->LinkDetectInfo.SlotNum);
        priv->ieee80211->LinkDetectInfo.RxBcnNum[SlotIndex] = priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod;
        priv->ieee80211->LinkDetectInfo.RxDataNum[SlotIndex] = priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod;
-       for( i=0; i<priv->ieee80211->LinkDetectInfo.SlotNum; i++ ){
+       for (i = 0; i < priv->ieee80211->LinkDetectInfo.SlotNum; i++) {
                *TotalRxBcnNum += priv->ieee80211->LinkDetectInfo.RxBcnNum[i];
                *TotalRxDataNum += priv->ieee80211->LinkDetectInfo.RxDataNum[i];
        }
 }
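 
 /*
  * Note: rtl819x_update_rxcounts() keeps a circular window of per-slot
  * beacon/data counts: SlotIndex advances modulo SlotNum, the current
  * period's numbers overwrite the oldest slot, and the totals returned to
  * the caller are the sum over all slots, i.e. the receive activity of
  * roughly the last SlotNum watchdog periods.
  */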
 
 
-extern void    rtl819x_watchdog_wqcallback(struct work_struct *work)
+extern void rtl819x_watchdog_wqcallback(struct work_struct *work)
 {
-       struct delayed_work *dwork = container_of(work,struct delayed_work,work);
-       struct r8192_priv *priv = container_of(dwork,struct r8192_priv,watch_dog_wq);
-       struct net_device *dev = priv->ieee80211->dev;
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct delayed_work *dwork = container_of(work, struct delayed_work, work);
+       struct r8192_priv *priv = container_of(dwork, struct r8192_priv, watch_dog_wq);
+       struct net_device *dev = priv->ieee80211->dev;
+       struct ieee80211_device *ieee = priv->ieee80211;
        RESET_TYPE      ResetType = RESET_TYPE_NORESET;
        static u8       check_reset_cnt;
        bool bBusyTraffic = false;
+       u32     TotalRxBcnNum = 0;
+       u32     TotalRxDataNum = 0;
 
-       if(!priv->up)
+       if (!priv->up)
                return;
        hal_dm_watchdog(dev);
 
-       {//to get busy traffic condition
-               if(ieee->state == IEEE80211_LINKED)
-               {
-                       if(     ieee->LinkDetectInfo.NumRxOkInPeriod> 666 ||
-                               ieee->LinkDetectInfo.NumTxOkInPeriod> 666 ) {
-                               bBusyTraffic = true;
-                       }
-                       ieee->LinkDetectInfo.NumRxOkInPeriod = 0;
-                       ieee->LinkDetectInfo.NumTxOkInPeriod = 0;
-                       ieee->LinkDetectInfo.bBusyTraffic = bBusyTraffic;
+       //to get busy traffic condition
+       if (ieee->state == IEEE80211_LINKED) {
+               if (ieee->LinkDetectInfo.NumRxOkInPeriod > 666 ||
+                   ieee->LinkDetectInfo.NumTxOkInPeriod > 666) {
+                       bBusyTraffic = true;
                }
+               ieee->LinkDetectInfo.NumRxOkInPeriod = 0;
+               ieee->LinkDetectInfo.NumTxOkInPeriod = 0;
+               ieee->LinkDetectInfo.bBusyTraffic = bBusyTraffic;
        }
        //added by amy for AP roaming
-       {
-               if(priv->ieee80211->state == IEEE80211_LINKED && priv->ieee80211->iw_mode == IW_MODE_INFRA)
-               {
-                       u32     TotalRxBcnNum = 0;
-                       u32     TotalRxDataNum = 0;
+       if (priv->ieee80211->state == IEEE80211_LINKED && priv->ieee80211->iw_mode == IW_MODE_INFRA) {
 
-                       rtl819x_update_rxcounts(priv, &TotalRxBcnNum, &TotalRxDataNum);
-                       if((TotalRxBcnNum+TotalRxDataNum) == 0)
-                       {
-                               #ifdef TODO
-                               if(rfState == eRfOff)
-                                       RT_TRACE(COMP_ERR,"========>%s()\n",__FUNCTION__);
-                               #endif
-                               printk("===>%s(): AP is power off,connect another one\n",__FUNCTION__);
-                       //      Dot11d_Reset(dev);
-                               priv->ieee80211->state = IEEE80211_ASSOCIATING;
-                               notify_wx_assoc_event(priv->ieee80211);
-                               RemovePeerTS(priv->ieee80211,priv->ieee80211->current_network.bssid);
-                               priv->ieee80211->link_change(dev);
-                               queue_work(priv->ieee80211->wq, &priv->ieee80211->associate_procedure_wq);
+               rtl819x_update_rxcounts(priv, &TotalRxBcnNum, &TotalRxDataNum);
+               if ((TotalRxBcnNum+TotalRxDataNum) == 0) {
+#ifdef TODO
+                       if (rfState == eRfOff)
+                               RT_TRACE(COMP_ERR, "========>%s()\n", __func__);
+#endif
+                       netdev_dbg(dev, "===>%s(): AP is powered off, connect another one\n", __func__);
+                       priv->ieee80211->state = IEEE80211_ASSOCIATING;
+                       notify_wx_assoc_event(priv->ieee80211);
+                       RemovePeerTS(priv->ieee80211, priv->ieee80211->current_network.bssid);
+                       priv->ieee80211->link_change(dev);
+                       queue_work(priv->ieee80211->wq, &priv->ieee80211->associate_procedure_wq);
 
-                       }
                }
-               priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod=0;
-               priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod=0;
        }
-//     CAM_read_entry(dev,4);
+       priv->ieee80211->LinkDetectInfo.NumRecvBcnInPeriod = 0;
+       priv->ieee80211->LinkDetectInfo.NumRecvDataInPeriod = 0;
        //check if reset the driver
-       if(check_reset_cnt++ >= 3)
-       {
+       if (check_reset_cnt++ >= 3) {
                ResetType = rtl819x_ifcheck_resetornot(dev);
                check_reset_cnt = 3;
-               //DbgPrint("Start to check silent reset\n");
        }
-       //      RT_TRACE(COMP_RESET,"%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n",__FUNCTION__,priv->force_reset,priv->ResetProgress,priv->bForcedSilentReset,priv->bDisableNormalResetCheck,ResetType);
-       if( (priv->force_reset) || (priv->ResetProgress==RESET_TYPE_NORESET &&
-               (priv->bForcedSilentReset ||
-               (!priv->bDisableNormalResetCheck && ResetType==RESET_TYPE_SILENT)))) // This is control by OID set in Pomelo
-       {
-               RT_TRACE(COMP_RESET,"%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n",__FUNCTION__,priv->force_reset,priv->ResetProgress,priv->bForcedSilentReset,priv->bDisableNormalResetCheck,ResetType);
+       if ((priv->force_reset) || (priv->ResetProgress == RESET_TYPE_NORESET &&
+           (priv->bForcedSilentReset ||
+           (!priv->bDisableNormalResetCheck && ResetType == RESET_TYPE_SILENT)))) { // This is controlled by an OID set in Pomelo
+               RT_TRACE(COMP_RESET, "%s():priv->force_reset is %d,priv->ResetProgress is %d, priv->bForcedSilentReset is %d,priv->bDisableNormalResetCheck is %d,ResetType is %d\n", __func__, priv->force_reset, priv->ResetProgress, priv->bForcedSilentReset, priv->bDisableNormalResetCheck, ResetType);
                rtl819x_ifsilentreset(dev);
        }
        priv->force_reset = false;
@@ -4093,33 +3610,29 @@ extern  void    rtl819x_watchdog_wqcallback(struct work_struct *work)
 void watch_dog_timer_callback(unsigned long data)
 {
        struct r8192_priv *priv = ieee80211_priv((struct net_device *) data);
-       //printk("===============>watch_dog timer\n");
-       queue_delayed_work(priv->priv_wq,&priv->watch_dog_wq, 0);
+       queue_delayed_work(priv->priv_wq, &priv->watch_dog_wq, 0);
        mod_timer(&priv->watch_dog_timer, jiffies + MSECS(IEEE80211_WATCH_DOG_TIME));
 }
 int _rtl8192_up(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       //int i;
        int init_status = 0;
-       priv->up=1;
-       priv->ieee80211->ieee_up=1;
+       priv->up = 1;
+       priv->ieee80211->ieee_up = 1;
        RT_TRACE(COMP_INIT, "Bringing up iface");
        init_status = rtl8192_adapter_start(dev);
-       if(!init_status)
-       {
-               RT_TRACE(COMP_ERR,"ERR!!! %s(): initialization failed!\n", __FUNCTION__);
-               priv->up=priv->ieee80211->ieee_up = 0;
+       if (!init_status) {
+               RT_TRACE(COMP_ERR, "ERR!!! %s(): initialization failed!\n", __func__);
+               priv->up = priv->ieee80211->ieee_up = 0;
                return -EAGAIN;
        }
        RT_TRACE(COMP_INIT, "start adapter finished\n");
        rtl8192_rx_enable(dev);
-//     rtl8192_tx_enable(dev);
-       if(priv->ieee80211->state != IEEE80211_LINKED)
-       ieee80211_softmac_start_protocol(priv->ieee80211);
+       if (priv->ieee80211->state != IEEE80211_LINKED)
+               ieee80211_softmac_start_protocol(priv->ieee80211);
        ieee80211_reset_queue(priv->ieee80211);
        watch_dog_timer_callback((unsigned long) dev);
-       if(!netif_queue_stopped(dev))
+       if (!netif_queue_stopped(dev))
                netif_start_queue(dev);
        else
                netif_wake_queue(dev);
@@ -4172,40 +3685,35 @@ int rtl8192_down(struct net_device *dev)
 
        if (priv->up == 0) return -1;
 
-       priv->up=0;
+       priv->up = 0;
        priv->ieee80211->ieee_up = 0;
-       RT_TRACE(COMP_DOWN, "==========>%s()\n", __FUNCTION__);
-/* FIXME */
+       RT_TRACE(COMP_DOWN, "==========>%s()\n", __func__);
+       /* FIXME */
        if (!netif_queue_stopped(dev))
                netif_stop_queue(dev);
 
        rtl8192_rtx_disable(dev);
-       //rtl8192_irq_disable(dev);
 
- /* Tx related queue release */
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_purge(&priv->ieee80211->skb_waitQ [i]);
-       }
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_purge(&priv->ieee80211->skb_aggQ [i]);
-       }
+       /* Tx related queue release */
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_purge(&priv->ieee80211->skb_waitQ[i]);
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_purge(&priv->ieee80211->skb_aggQ[i]);
 
-       for(i = 0; i < MAX_QUEUE_SIZE; i++) {
-               skb_queue_purge(&priv->ieee80211->skb_drv_aggQ [i]);
-       }
+       for (i = 0; i < MAX_QUEUE_SIZE; i++)
+               skb_queue_purge(&priv->ieee80211->skb_drv_aggQ[i]);
 
        //as cancel_delayed_work will del work->timer, so if work is not defined as struct delayed_work, it will corrupt
-//     flush_scheduled_work();
        rtl8192_cancel_deferred_work(priv);
        deinit_hal_dm(dev);
        del_timer_sync(&priv->watch_dog_timer);
 
 
        ieee80211_softmac_stop_protocol(priv->ieee80211);
-       memset(&priv->ieee80211->current_network, 0 , offsetof(struct ieee80211_network, list));
-       RT_TRACE(COMP_DOWN, "<==========%s()\n", __FUNCTION__);
+       memset(&priv->ieee80211->current_network, 0, offsetof(struct ieee80211_network, list));
+       RT_TRACE(COMP_DOWN, "<==========%s()\n", __func__);
 
-               return 0;
+       return 0;
 }
 
 
@@ -4213,27 +3721,19 @@ void rtl8192_commit(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        int reset_status = 0;
-       //u8 reset_times = 0;
-       if (priv->up == 0) return ;
+       if (priv->up == 0) return;
        priv->up = 0;
 
        rtl8192_cancel_deferred_work(priv);
        del_timer_sync(&priv->watch_dog_timer);
-       //cancel_delayed_work(&priv->SwChnlWorkItem);
 
        ieee80211_softmac_stop_protocol(priv->ieee80211);
 
-       //rtl8192_irq_disable(dev);
        rtl8192_rtx_disable(dev);
        reset_status = _rtl8192_up(dev);
 
 }
 
-/*
-void rtl8192_restart(struct net_device *dev)
-{
-       struct r8192_priv *priv = ieee80211_priv(dev);
-*/
 void rtl8192_restart(struct work_struct *work)
 {
        struct r8192_priv *priv = container_of(work, struct r8192_priv, reset_wq);
@@ -4251,19 +3751,13 @@ static void r8192_set_multicast(struct net_device *dev)
        struct r8192_priv *priv = ieee80211_priv(dev);
        short promisc;
 
-       //down(&priv->wx_sem);
-
        /* FIXME FIXME */
 
-       promisc = (dev->flags & IFF_PROMISC) ? 1:0;
+       promisc = (dev->flags & IFF_PROMISC) ? 1 : 0;
 
        if (promisc != priv->promisc)
-       //      rtl8192_commit(dev);
 
-       priv->promisc = promisc;
-
-       //schedule_work(&priv->reset_wq);
-       //up(&priv->wx_sem);
+               priv->promisc = promisc;
 }
 
 
@@ -4287,99 +3781,90 @@ int rtl8192_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct iwreq *wrq = (struct iwreq *)rq;
-       int ret=-1;
+       int ret = -1;
        struct ieee80211_device *ieee = priv->ieee80211;
        u32 key[4];
-       u8 broadcast_addr[6] = {0xff,0xff,0xff,0xff,0xff,0xff};
+       u8 broadcast_addr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
        struct iw_point *p = &wrq->u.data;
-       struct ieee_param *ipw = NULL;//(struct ieee_param *)wrq->u.data.pointer;
+       struct ieee_param *ipw = NULL;
 
        down(&priv->wx_sem);
 
 
-     if (p->length < sizeof(struct ieee_param) || !p->pointer){
-            ret = -EINVAL;
-            goto out;
+       if (p->length < sizeof(struct ieee_param) || !p->pointer) {
+               ret = -EINVAL;
+               goto out;
        }
 
-     ipw = kmalloc(p->length, GFP_KERNEL);
-     if (ipw == NULL){
-            ret = -ENOMEM;
-            goto out;
-     }
-     if (copy_from_user(ipw, p->pointer, p->length)) {
+       ipw = kmalloc(p->length, GFP_KERNEL);
+       if (ipw == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       if (copy_from_user(ipw, p->pointer, p->length)) {
                kfree(ipw);
-           ret = -EFAULT;
-           goto out;
+               ret = -EFAULT;
+               goto out;
        }
 
        switch (cmd) {
        case RTL_IOCTL_WPA_SUPPLICANT:
-       //parse here for HW security
-               if (ipw->cmd == IEEE_CMD_SET_ENCRYPTION)
-               {
-                       if (ipw->u.crypt.set_tx)
-                       {
-                               if (strcmp(ipw->u.crypt.alg, "CCMP") == 0)
+               //parse here for HW security
+               if (ipw->cmd == IEEE_CMD_SET_ENCRYPTION) {
+                       if (ipw->u.crypt.set_tx) {
+                               if (strcmp(ipw->u.crypt.alg, "CCMP") == 0) {
                                        ieee->pairwise_key_type = KEY_TYPE_CCMP;
-                               else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0)
+                               } else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0) {
                                        ieee->pairwise_key_type = KEY_TYPE_TKIP;
-                               else if (strcmp(ipw->u.crypt.alg, "WEP") == 0)
-                               {
+                               } else if (strcmp(ipw->u.crypt.alg, "WEP") == 0) {
                                        if (ipw->u.crypt.key_len == 13)
                                                ieee->pairwise_key_type = KEY_TYPE_WEP104;
                                        else if (ipw->u.crypt.key_len == 5)
                                                ieee->pairwise_key_type = KEY_TYPE_WEP40;
-                               }
-                               else
+                               } else {
                                        ieee->pairwise_key_type = KEY_TYPE_NA;
+                               }
 
-                               if (ieee->pairwise_key_type)
-                               {
-                                       memcpy((u8*)key, ipw->u.crypt.key, 16);
+                               if (ieee->pairwise_key_type) {
+                                       memcpy((u8 *)key, ipw->u.crypt.key, 16);
                                        EnableHWSecurityConfig8192(dev);
-                               //we fill both index entry and 4th entry for pairwise key as in IPW interface, adhoc will only get here, so we need index entry for its default key serching!
-                               //added by WB.
-                                       setKey(dev, 4, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8*)ieee->ap_mac_addr, 0, key);
+                                       //we fill both index entry and 4th entry for pairwise key as in IPW interface, adhoc will only get here, so we need index entry for its default key searching!
+                                       //added by WB.
+                                       setKey(dev, 4, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8 *)ieee->ap_mac_addr, 0, key);
                                        if (ieee->auth_mode != 2)
-                                       setKey(dev, ipw->u.crypt.idx, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8*)ieee->ap_mac_addr, 0, key);
+                                               setKey(dev, ipw->u.crypt.idx, ipw->u.crypt.idx, ieee->pairwise_key_type, (u8 *)ieee->ap_mac_addr, 0, key);
                                }
-                       }
-                       else //if (ipw->u.crypt.idx) //group key use idx > 0
-                       {
-                               memcpy((u8*)key, ipw->u.crypt.key, 16);
-                               if (strcmp(ipw->u.crypt.alg, "CCMP") == 0)
-                                       ieee->group_key_type= KEY_TYPE_CCMP;
-                               else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0)
+                       } else {
+                               memcpy((u8 *)key, ipw->u.crypt.key, 16);
+                               if (strcmp(ipw->u.crypt.alg, "CCMP") == 0) {
+                                       ieee->group_key_type = KEY_TYPE_CCMP;
+                               } else if (strcmp(ipw->u.crypt.alg, "TKIP") == 0) {
                                        ieee->group_key_type = KEY_TYPE_TKIP;
-                               else if (strcmp(ipw->u.crypt.alg, "WEP") == 0)
-                               {
+                               } else if (strcmp(ipw->u.crypt.alg, "WEP") == 0) {
                                        if (ipw->u.crypt.key_len == 13)
                                                ieee->group_key_type = KEY_TYPE_WEP104;
                                        else if (ipw->u.crypt.key_len == 5)
                                                ieee->group_key_type = KEY_TYPE_WEP40;
-                               }
-                               else
+                               } else {
                                        ieee->group_key_type = KEY_TYPE_NA;
+                               }
 
-                               if (ieee->group_key_type)
-                               {
-                                               setKey( dev,
-                                                       ipw->u.crypt.idx,
-                                                       ipw->u.crypt.idx,               //KeyIndex
-                                                       ieee->group_key_type,   //KeyType
-                                                       broadcast_addr, //MacAddr
-                                                       0,              //DefaultKey
-                                                       key);           //KeyContent
+                               if (ieee->group_key_type) {
+                                       setKey(dev, ipw->u.crypt.idx,
+                                              ipw->u.crypt.idx,                //KeyIndex
+                                              ieee->group_key_type,    //KeyType
+                                              broadcast_addr,  //MacAddr
+                                              0,               //DefaultKey
+                                              key);            //KeyContent
                                }
                        }
                }
 #ifdef JOHN_HWSEC_DEBUG
                //john's test 0711
                printk("@@ wrq->u pointer = ");
-               for(i=0;i<wrq->u.data.length;i++){
-                       if(i%10==0) printk("\n");
-                       printk( "%8x|", ((u32*)wrq->u.data.pointer)[i] );
+               for (i = 0; i < wrq->u.data.length; i++) {
+                       if (i%10 == 0) printk("\n");
+                       printk("%8x|", ((u32 *)wrq->u.data.pointer)[i]);
                }
                printk("\n");
 #endif /*JOHN_HWSEC_DEBUG*/
@@ -4401,8 +3886,8 @@ u8 HwRateToMRate90(bool bIsHT, u8 rate)
 {
        u8  ret_rate = 0xff;
 
-       if(!bIsHT) {
-               switch(rate) {
+       if (!bIsHT) {
+               switch (rate) {
                case DESC90_RATE1M:   ret_rate = MGN_1M;         break;
                case DESC90_RATE2M:   ret_rate = MGN_2M;         break;
                case DESC90_RATE5_5M: ret_rate = MGN_5_5M;       break;
@@ -4423,7 +3908,7 @@ u8 HwRateToMRate90(bool bIsHT, u8 rate)
                }
 
        } else {
-               switch(rate) {
+               switch (rate) {
                case DESC90_RATEMCS0:   ret_rate = MGN_MCS0;    break;
                case DESC90_RATEMCS1:   ret_rate = MGN_MCS1;    break;
                case DESC90_RATEMCS2:   ret_rate = MGN_MCS2;    break;
@@ -4444,7 +3929,7 @@ u8 HwRateToMRate90(bool bIsHT, u8 rate)
 
                default:
                        ret_rate = 0xff;
-                       RT_TRACE(COMP_RECV, "HwRateToMRate90(): Non supported Rate [%x], bIsHT = %d!!!\n",rate, bIsHT);
+                       RT_TRACE(COMP_RECV, "HwRateToMRate90(): Non supported Rate [%x], bIsHT = %d!!!\n", rate, bIsHT);
                        break;
                }
        }
@@ -4467,11 +3952,11 @@ u8 HwRateToMRate90(bool bIsHT, u8 rate)
  * Return:
  *               None
  */
-void UpdateRxPktTimeStamp8190 (struct net_device *dev, struct ieee80211_rx_stats *stats)
+void UpdateRxPktTimeStamp8190(struct net_device *dev, struct ieee80211_rx_stats *stats)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
-       if(stats->bIsAMPDU && !stats->bFirstMPDU) {
+       if (stats->bIsAMPDU && !stats->bFirstMPDU) {
                stats->mac_time[0] = priv->LastRxDescTSFLow;
                stats->mac_time[1] = priv->LastRxDescTSFHigh;
        } else {
@@ -4482,7 +3967,7 @@ void UpdateRxPktTimeStamp8190 (struct net_device *dev, struct ieee80211_rx_stats
 
 //by amy 080606
 
-long rtl819x_translate_todbm(u8 signal_strength_index  )// 0-100 index.
+long rtl819x_translate_todbm(u8 signal_strength_index)// 0-100 index.
 {
        long    signal_power; // in dBm.
 
@@ -4498,12 +3983,11 @@ long rtl819x_translate_todbm(u8 signal_strength_index   )// 0-100 index.
     be a local static. Otherwise, it may increase when we return from S3/S4. The
     value will be kept in memory or disk. Declare the value in the adaptor
     and it will be reinitialized when returned from S3/S4. */
-void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee80211_rx_stats * pprevious_stats, struct ieee80211_rx_stats * pcurrent_stats)
+void rtl8192_process_phyinfo(struct r8192_priv *priv, u8 *buffer, struct ieee80211_rx_stats *pprevious_stats, struct ieee80211_rx_stats *pcurrent_stats)
 {
        bool bcheck = false;
        u8      rfpath;
        u32     nspatial_stream, tmp_val;
-       //u8    i;
        static u32 slide_rssi_index, slide_rssi_statistics;
        static u32 slide_evm_index, slide_evm_statistics;
        static u32 last_rssi, last_evm;
@@ -4512,8 +3996,8 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        static u32 last_beacon_adc_pwdb;
 
        struct ieee80211_hdr_3addr *hdr;
-       u16 sc ;
-       unsigned int frag,seq;
+       u16 sc;
+       unsigned int frag, seq;
        hdr = (struct ieee80211_hdr_3addr *)buffer;
        sc = le16_to_cpu(hdr->seq_ctl);
        frag = WLAN_GET_SEQ_FRAG(sc);
@@ -4523,14 +4007,12 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        //
        // Check whether we should take the previous packet into accounting
        //
-       if(!pprevious_stats->bIsAMPDU)
-       {
+       if (!pprevious_stats->bIsAMPDU) {
                // if previous packet is not aggregated packet
                bcheck = true;
        }
 
-       if(slide_rssi_statistics++ >= PHY_RSSI_SLID_WIN_MAX)
-       {
+       if (slide_rssi_statistics++ >= PHY_RSSI_SLID_WIN_MAX) {
                slide_rssi_statistics = PHY_RSSI_SLID_WIN_MAX;
                last_rssi = priv->stats.slide_signal_strength[slide_rssi_index];
                priv->stats.slide_rssi_total -= last_rssi;
@@ -4538,7 +4020,7 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        priv->stats.slide_rssi_total += pprevious_stats->SignalStrength;
 
        priv->stats.slide_signal_strength[slide_rssi_index++] = pprevious_stats->SignalStrength;
-       if(slide_rssi_index >= PHY_RSSI_SLID_WIN_MAX)
+       if (slide_rssi_index >= PHY_RSSI_SLID_WIN_MAX)
                slide_rssi_index = 0;
 
        // <1> Showed on UI for user, in dbm
@@ -4548,13 +4030,12 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        //
        // If the previous packet does not match the criteria, neglect it
        //
-       if(!pprevious_stats->bPacketMatchBSSID)
-       {
-               if(!pprevious_stats->bToSelfBA)
+       if (!pprevious_stats->bPacketMatchBSSID) {
+               if (!pprevious_stats->bToSelfBA)
                        return;
        }
 
-       if(!bcheck)
+       if (!bcheck)
                return;
 
 
@@ -4570,33 +4051,25 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
 
        // <2> Showed on UI for engineering
        // hardware does not provide rssi information for each rf path in CCK
-       if(!pprevious_stats->bIsCCK && (pprevious_stats->bPacketToSelf || pprevious_stats->bToSelfBA))
-       {
-               for (rfpath = RF90_PATH_A; rfpath < priv->NumTotalRFPath; rfpath++)
-               {
-                    if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, rfpath))
-                                continue;
+       if (!pprevious_stats->bIsCCK && (pprevious_stats->bPacketToSelf || pprevious_stats->bToSelfBA)) {
+               for (rfpath = RF90_PATH_A; rfpath < priv->NumTotalRFPath; rfpath++) {
+                       if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, rfpath))
+                               continue;
 
                        //Fixed by Jacken 2008-03-20
-                       if(priv->stats.rx_rssi_percentage[rfpath] == 0)
-                       {
+                       if (priv->stats.rx_rssi_percentage[rfpath] == 0)
                                priv->stats.rx_rssi_percentage[rfpath] = pprevious_stats->RxMIMOSignalStrength[rfpath];
-                               //DbgPrint("MIMO RSSI initialize \n");
-                       }
-                       if(pprevious_stats->RxMIMOSignalStrength[rfpath]  > priv->stats.rx_rssi_percentage[rfpath])
-                       {
+                       if (pprevious_stats->RxMIMOSignalStrength[rfpath]  > priv->stats.rx_rssi_percentage[rfpath]) {
                                priv->stats.rx_rssi_percentage[rfpath] =
-                                       ( (priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
-                                       (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
+                                       ((priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
+                                        (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
                                priv->stats.rx_rssi_percentage[rfpath] = priv->stats.rx_rssi_percentage[rfpath]  + 1;
-                       }
-                       else
-                       {
+                       } else {
                                priv->stats.rx_rssi_percentage[rfpath] =
-                                       ( (priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
-                                       (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
+                                       ((priv->stats.rx_rssi_percentage[rfpath]*(Rx_Smooth_Factor-1)) +
+                                        (pprevious_stats->RxMIMOSignalStrength[rfpath])) /(Rx_Smooth_Factor);
                        }
-                       RT_TRACE(COMP_DBG,"priv->stats.rx_rssi_percentage[rfPath]  = %d \n" ,priv->stats.rx_rssi_percentage[rfpath] );
+                       RT_TRACE(COMP_DBG, "priv->stats.rx_rssi_percentage[rfPath]  = %d \n", priv->stats.rx_rssi_percentage[rfpath]);
                }
        }
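 
 /*
  * Note: the smoothing here (and for PWDB and EVM below) is an integer
  * exponential moving average:
  *
  *	new = (old * (Rx_Smooth_Factor - 1) + sample) / Rx_Smooth_Factor
  *
  * so the latest sample carries weight 1/Rx_Smooth_Factor. The extra "+ 1"
  * applied when the sample exceeds the running value compensates for the
  * truncation of integer division, letting a rising signal actually climb.
  */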
 
@@ -4605,55 +4078,43 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        // Check PWDB.
        //
        RT_TRACE(COMP_RXDESC, "Smooth %s PWDB = %d\n",
-                               pprevious_stats->bIsCCK? "CCK": "OFDM",
-                               pprevious_stats->RxPWDBAll);
+                pprevious_stats->bIsCCK ? "CCK" : "OFDM",
+                pprevious_stats->RxPWDBAll);
 
-       if(pprevious_stats->bPacketBeacon)
-       {
-/* record the beacon pwdb to the sliding window. */
-               if(slide_beacon_adc_pwdb_statistics++ >= PHY_Beacon_RSSI_SLID_WIN_MAX)
-               {
+       if (pprevious_stats->bPacketBeacon) {
+               /* record the beacon pwdb to the sliding window. */
+               if (slide_beacon_adc_pwdb_statistics++ >= PHY_Beacon_RSSI_SLID_WIN_MAX) {
                        slide_beacon_adc_pwdb_statistics = PHY_Beacon_RSSI_SLID_WIN_MAX;
                        last_beacon_adc_pwdb = priv->stats.Slide_Beacon_pwdb[slide_beacon_adc_pwdb_index];
                        priv->stats.Slide_Beacon_Total -= last_beacon_adc_pwdb;
-                       //DbgPrint("slide_beacon_adc_pwdb_index = %d, last_beacon_adc_pwdb = %d, Adapter->RxStats.Slide_Beacon_Total = %d\n",
-                       //      slide_beacon_adc_pwdb_index, last_beacon_adc_pwdb, Adapter->RxStats.Slide_Beacon_Total);
                }
                priv->stats.Slide_Beacon_Total += pprevious_stats->RxPWDBAll;
                priv->stats.Slide_Beacon_pwdb[slide_beacon_adc_pwdb_index] = pprevious_stats->RxPWDBAll;
-               //DbgPrint("slide_beacon_adc_pwdb_index = %d, pPreviousRfd->Status.RxPWDBAll = %d\n", slide_beacon_adc_pwdb_index, pPreviousRfd->Status.RxPWDBAll);
                slide_beacon_adc_pwdb_index++;
-               if(slide_beacon_adc_pwdb_index >= PHY_Beacon_RSSI_SLID_WIN_MAX)
+               if (slide_beacon_adc_pwdb_index >= PHY_Beacon_RSSI_SLID_WIN_MAX)
                        slide_beacon_adc_pwdb_index = 0;
                pprevious_stats->RxPWDBAll = priv->stats.Slide_Beacon_Total/slide_beacon_adc_pwdb_statistics;
-               if(pprevious_stats->RxPWDBAll >= 3)
+               if (pprevious_stats->RxPWDBAll >= 3)
                        pprevious_stats->RxPWDBAll -= 3;
        }
 
        RT_TRACE(COMP_RXDESC, "Smooth %s PWDB = %d\n",
-                               pprevious_stats->bIsCCK? "CCK": "OFDM",
-                               pprevious_stats->RxPWDBAll);
+                pprevious_stats->bIsCCK ? "CCK" : "OFDM",
+                pprevious_stats->RxPWDBAll);
 
 
-       if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA)
-       {
-               if(priv->undecorated_smoothed_pwdb < 0) // initialize
-               {
+       if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+               if (priv->undecorated_smoothed_pwdb < 0)        // initialize
                        priv->undecorated_smoothed_pwdb = pprevious_stats->RxPWDBAll;
-                       //DbgPrint("First pwdb initialize \n");
-               }
-               if(pprevious_stats->RxPWDBAll > (u32)priv->undecorated_smoothed_pwdb)
-               {
+               if (pprevious_stats->RxPWDBAll > (u32)priv->undecorated_smoothed_pwdb) {
                        priv->undecorated_smoothed_pwdb =
-                                       ( ((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
-                                       (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
+                               (((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
+                                (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
                        priv->undecorated_smoothed_pwdb = priv->undecorated_smoothed_pwdb + 1;
-               }
-               else
-               {
+               } else {
                        priv->undecorated_smoothed_pwdb =
-                                       ( ((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
-                                       (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
+                               (((priv->undecorated_smoothed_pwdb)*(Rx_Smooth_Factor-1)) +
+                                (pprevious_stats->RxPWDBAll)) /(Rx_Smooth_Factor);
                }
 
        }
@@ -4662,13 +4123,9 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
        // Check EVM
        //
        /* record the general EVM to the sliding window. */
-       if(pprevious_stats->SignalQuality == 0)
-       {
-       }
-       else
-       {
-               if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA){
-                       if(slide_evm_statistics++ >= PHY_RSSI_SLID_WIN_MAX){
+       if (pprevious_stats->SignalQuality) {
+               if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+                       if (slide_evm_statistics++ >= PHY_RSSI_SLID_WIN_MAX) {
                                slide_evm_statistics = PHY_RSSI_SLID_WIN_MAX;
                                last_evm = priv->stats.slide_evm[slide_evm_index];
                                priv->stats.slide_evm_total -= last_evm;
@@ -4677,7 +4134,7 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
                        priv->stats.slide_evm_total += pprevious_stats->SignalQuality;
 
                        priv->stats.slide_evm[slide_evm_index++] = pprevious_stats->SignalQuality;
-                       if(slide_evm_index >= PHY_RSSI_SLID_WIN_MAX)
+                       if (slide_evm_index >= PHY_RSSI_SLID_WIN_MAX)
                                slide_evm_index = 0;
 
                        // <1> Showed on UI for user, in percentage.
@@ -4688,19 +4145,14 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
                }
 
                // <2> Showed on UI for engineering
-               if(pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA)
-               {
-                       for(nspatial_stream = 0; nspatial_stream<2 ; nspatial_stream++) // 2 spatial stream
-                       {
-                               if(pprevious_stats->RxMIMOSignalQuality[nspatial_stream] != -1)
-                               {
-                                       if(priv->stats.rx_evm_percentage[nspatial_stream] == 0) // initialize
-                                       {
+               if (pprevious_stats->bPacketToSelf || pprevious_stats->bPacketBeacon || pprevious_stats->bToSelfBA) {
+                       for (nspatial_stream = 0; nspatial_stream < 2; nspatial_stream++) { // 2 spatial stream
+                               if (pprevious_stats->RxMIMOSignalQuality[nspatial_stream] != -1) {
+                                       if (priv->stats.rx_evm_percentage[nspatial_stream] == 0) // initialize
                                                priv->stats.rx_evm_percentage[nspatial_stream] = pprevious_stats->RxMIMOSignalQuality[nspatial_stream];
-                                       }
                                        priv->stats.rx_evm_percentage[nspatial_stream] =
-                                               ( (priv->stats.rx_evm_percentage[nspatial_stream]* (Rx_Smooth_Factor-1)) +
-                                               (pprevious_stats->RxMIMOSignalQuality[nspatial_stream]* 1)) / (Rx_Smooth_Factor);
+                                               ((priv->stats.rx_evm_percentage[nspatial_stream]* (Rx_Smooth_Factor-1)) +
+                                                (pprevious_stats->RxMIMOSignalQuality[nspatial_stream]* 1)) / (Rx_Smooth_Factor);
                                }
                        }
                }
@@ -4725,126 +4177,104 @@ void rtl8192_process_phyinfo(struct r8192_priv * priv,u8* buffer, struct ieee802
  *     05/26/2008      amy             Create Version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static u8 rtl819x_query_rxpwrpercentage(
-       char            antpower
-       )
+static u8 rtl819x_query_rxpwrpercentage(char antpower)
 {
        if ((antpower <= -100) || (antpower >= 20))
-       {
                return  0;
-       }
        else if (antpower >= 0)
-       {
                return  100;
-       }
        else
-       {
-               return  (100+antpower);
-       }
+               return  100 + antpower;
 
 }      /* QueryRxPwrPercentage */
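
The reflowed helper is a plain linear clamp: at or below -100 dBm it reports 0, at or above 0 dBm it saturates at 100, and everything between maps to 100 + antpower. A usage sketch:

        u8 pct = rtl819x_query_rxpwrpercentage(-60);    /* 100 + (-60) == 40 */
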
 
-static u8
-rtl819x_evm_dbtopercentage(
-    char value
-    )
+static u8 rtl819x_evm_dbtopercentage(char value)
 {
-    char ret_val;
+       char ret_val;
 
-    ret_val = value;
+       ret_val = value;
 
-    if(ret_val >= 0)
-       ret_val = 0;
-    if(ret_val <= -33)
-       ret_val = -33;
-    ret_val = 0 - ret_val;
-    ret_val*=3;
-       if(ret_val == 99)
+       if (ret_val >= 0)
+               ret_val = 0;
+       if (ret_val <= -33)
+               ret_val = -33;
+       ret_val = 0 - ret_val;
+       ret_val *= 3;
+       if (ret_val == 99)
                ret_val = 100;
-    return(ret_val);
+       return ret_val;
 }
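
Worked through, the conversion clamps the EVM to the [-33, 0] dB range, negates it, and scales by 3: -25 dB becomes 75%, and the -33 dB floor yields 99, which the final test rounds up to an even 100.
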
 //
 //     Description:
 //     We want good-looking for signal strength/quality
 //     2007/7/19 01:09, by cosa.
 //
-long
-rtl819x_signal_scale_mapping(
-       long currsig
-       )
+long rtl819x_signal_scale_mapping(long currsig)
 {
        long retsig;
 
        // Step 1. Scale mapping.
-       if(currsig >= 61 && currsig <= 100)
-       {
+       if (currsig >= 61 && currsig <= 100)
                retsig = 90 + ((currsig - 60) / 4);
-       }
-       else if(currsig >= 41 && currsig <= 60)
-       {
+       else if (currsig >= 41 && currsig <= 60)
                retsig = 78 + ((currsig - 40) / 2);
-       }
-       else if(currsig >= 31 && currsig <= 40)
-       {
+       else if (currsig >= 31 && currsig <= 40)
                retsig = 66 + (currsig - 30);
-       }
-       else if(currsig >= 21 && currsig <= 30)
-       {
+       else if (currsig >= 21 && currsig <= 30)
                retsig = 54 + (currsig - 20);
-       }
-       else if(currsig >= 5 && currsig <= 20)
-       {
+       else if (currsig >= 5 && currsig <= 20)
                retsig = 42 + (((currsig - 5) * 2) / 3);
-       }
-       else if(currsig == 4)
-       {
+       else if (currsig == 4)
                retsig = 36;
-       }
-       else if(currsig == 3)
-       {
+       else if (currsig == 3)
                retsig = 27;
-       }
-       else if(currsig == 2)
-       {
+       else if (currsig == 2)
                retsig = 18;
-       }
-       else if(currsig == 1)
-       {
+       else if (currsig == 1)
                retsig = 9;
-       }
        else
-       {
                retsig = currsig;
-       }
 
        return retsig;
 }
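
For example, a raw signal of 50 falls in the 41-60 band and maps to 78 + (50 - 40) / 2 = 83, while values outside every band (0, negative, or above 100) pass through unchanged.
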
 
-static void rtl8192_query_rxphystatus(
-       struct r8192_priv * priv,
-       struct ieee80211_rx_stats * pstats,
-       rx_drvinfo_819x_usb  * pdrvinfo,
-       struct ieee80211_rx_stats * precord_stats,
-       bool bpacket_match_bssid,
-       bool bpacket_toself,
-       bool bPacketBeacon,
-       bool bToSelfBA
-       )
-{
-       //PRT_RFD_STATUS                pRtRfdStatus = &(pRfd->Status);
-       phy_sts_ofdm_819xusb_t* pofdm_buf;
-       phy_sts_cck_819xusb_t   *       pcck_buf;
-       phy_ofdm_rx_status_rxsc_sgien_exintfflag* prxsc;
+static inline bool rx_hal_is_cck_rate(struct rx_drvinfo_819x_usb *pdrvinfo)
+{
+       if (pdrvinfo->RxHT)
+               return false;
+
+       switch (pdrvinfo->RxRate) {
+       case DESC90_RATE1M:
+       case DESC90_RATE2M:
+       case DESC90_RATE5_5M:
+       case DESC90_RATE11M:
+               return true;
+       default:
+               return false;
+       }
+}
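
This new helper stands in for the RX_HAL_IS_CCK_RATE macro that the old code referenced in comments: a frame counts as CCK only when it is not HT and its descriptor rate is one of the four legacy CCK rates. Presumably it feeds the is_cck_rate flag declared below, along the lines of:

        is_cck_rate = rx_hal_is_cck_rate(pdrvinfo);
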
+
+static void rtl8192_query_rxphystatus(struct r8192_priv *priv,
+                                     struct ieee80211_rx_stats *pstats,
+                                     rx_drvinfo_819x_usb  *pdrvinfo,
+                                     struct ieee80211_rx_stats *precord_stats,
+                                     bool bpacket_match_bssid,
+                                     bool bpacket_toself,
+                                     bool bPacketBeacon,
+                                     bool bToSelfBA)
+{
+       phy_sts_ofdm_819xusb_t *pofdm_buf;
+       phy_sts_cck_819xusb_t   *pcck_buf;
+       phy_ofdm_rx_status_rxsc_sgien_exintfflag *prxsc;
        u8                              *prxpkt;
        u8                              i, max_spatial_stream, tmp_rxsnr, tmp_rxevm, rxsc_sgien_exflg;
-       char                            rx_pwr[4], rx_pwr_all=0;
-       //long                          rx_avg_pwr = 0;
+       char                            rx_pwr[4], rx_pwr_all = 0;
        char                            rx_snrX, rx_evmX;
        u8                              evm, pwdb_all;
-       u32                             RSSI, total_rssi=0;//, total_evm=0;
-//     long                            signal_strength_index = 0;
-       u8                              is_cck_rate=0;
+       u32                             RSSI, total_rssi = 0;
+       u8                              is_cck_rate = 0;
        u8                              rf_rx_num = 0;
+       u8                              sq;
 
 
        priv->stats.numqry_phystatus++;
@@ -4855,11 +4285,11 @@ static void rtl8192_query_rxphystatus(
        memset(precord_stats, 0, sizeof(struct ieee80211_rx_stats));
        pstats->bPacketMatchBSSID = precord_stats->bPacketMatchBSSID = bpacket_match_bssid;
        pstats->bPacketToSelf = precord_stats->bPacketToSelf = bpacket_toself;
-       pstats->bIsCCK = precord_stats->bIsCCK = is_cck_rate;//RX_HAL_IS_CCK_RATE(pDrvInfo);
+       pstats->bIsCCK = precord_stats->bIsCCK = is_cck_rate;
        pstats->bPacketBeacon = precord_stats->bPacketBeacon = bPacketBeacon;
        pstats->bToSelfBA = precord_stats->bToSelfBA = bToSelfBA;
 
-       prxpkt = (u8*)pdrvinfo;
+       prxpkt = (u8 *)pdrvinfo;
 
        /* Move pointer to the 16th bytes. Phy status start address. */
        prxpkt += sizeof(rx_drvinfo_819x_usb);
@@ -4873,8 +4303,7 @@ static void rtl8192_query_rxphystatus(
        precord_stats->RxMIMOSignalQuality[0] = -1;
        precord_stats->RxMIMOSignalQuality[1] = -1;
 
-       if(is_cck_rate)
-       {
+       if (is_cck_rate) {
                //
                // (1)Hardware does not provide RSSI for CCK
                //
@@ -4882,51 +4311,46 @@ static void rtl8192_query_rxphystatus(
                //
                // (2)PWDB, Average PWDB calculated by hardware (for rate adaptive)
                //
-               u8 report;//, cck_agc_rpt;
+               u8 report;
 
                priv->stats.numqry_phystatusCCK++;
 
-               if(!priv->bCckHighPower)
-               {
+               if (!priv->bCckHighPower) {
                        report = pcck_buf->cck_agc_rpt & 0xc0;
                        report = report>>6;
-                       switch(report)
-                       {
+                       switch (report) {
                                //Fixed by Jacken from Bryant 2008-03-20
                                //Original value is -38 , -26 , -14 , -2
                                //Fixed value is -35 , -23 , -11 , 6
-                               case 0x3:
-                                       rx_pwr_all = -35 - (pcck_buf->cck_agc_rpt & 0x3e);
-                                       break;
-                               case 0x2:
-                                       rx_pwr_all = -23 - (pcck_buf->cck_agc_rpt & 0x3e);
-                                       break;
-                               case 0x1:
-                                       rx_pwr_all = -11 - (pcck_buf->cck_agc_rpt & 0x3e);
-                                       break;
-                               case 0x0:
-                                       rx_pwr_all = 6 - (pcck_buf->cck_agc_rpt & 0x3e);
-                                       break;
+                       case 0x3:
+                               rx_pwr_all = -35 - (pcck_buf->cck_agc_rpt & 0x3e);
+                               break;
+                       case 0x2:
+                               rx_pwr_all = -23 - (pcck_buf->cck_agc_rpt & 0x3e);
+                               break;
+                       case 0x1:
+                               rx_pwr_all = -11 - (pcck_buf->cck_agc_rpt & 0x3e);
+                               break;
+                       case 0x0:
+                               rx_pwr_all = 6 - (pcck_buf->cck_agc_rpt & 0x3e);
+                               break;
                        }
-               }
-               else
-               {
+               } else {
                        report = pcck_buf->cck_agc_rpt & 0x60;
                        report = report>>5;
-                       switch(report)
-                       {
-                               case 0x3:
-                                       rx_pwr_all = -35 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-                                       break;
-                               case 0x2:
-                                       rx_pwr_all = -23 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
-                                       break;
-                               case 0x1:
-                                       rx_pwr_all = -11 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-                                       break;
-                               case 0x0:
-                                       rx_pwr_all = 6 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1) ;
-                                       break;
+                       switch (report) {
+                       case 0x3:
+                               rx_pwr_all = -35 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+                               break;
+                       case 0x2:
+                               rx_pwr_all = -23 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+                               break;
+                       case 0x1:
+                               rx_pwr_all = -11 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+                               break;
+                       case 0x0:
+                               rx_pwr_all = 6 - ((pcck_buf->cck_agc_rpt & 0x1f)<<1);
+                               break;
                        }
                }
 
@@ -4937,44 +4361,36 @@ static void rtl8192_query_rxphystatus(
                //
                // (3) Get Signal Quality (EVM)
                //
-               //if(bpacket_match_bssid)
-               {
-                       u8      sq;
 
-                       if(pstats->RxPWDBAll > 40)
-                       {
-                               sq = 100;
-                       }else
-                       {
-                               sq = pcck_buf->sq_rpt;
+               if (pstats->RxPWDBAll > 40) {
+                       sq = 100;
+               } else {
+                       sq = pcck_buf->sq_rpt;
 
-                               if(pcck_buf->sq_rpt > 64)
-                                       sq = 0;
-                               else if (pcck_buf->sq_rpt < 20)
-                                       sq = 100;
-                               else
-                                       sq = ((64-sq) * 100) / 44;
-                       }
-                       pstats->SignalQuality = precord_stats->SignalQuality = sq;
-                       pstats->RxMIMOSignalQuality[0] = precord_stats->RxMIMOSignalQuality[0] = sq;
-                       pstats->RxMIMOSignalQuality[1] = precord_stats->RxMIMOSignalQuality[1] = -1;
+                       if (pcck_buf->sq_rpt > 64)
+                               sq = 0;
+                       else if (pcck_buf->sq_rpt < 20)
+                               sq = 100;
+                       else
+                               sq = ((64-sq) * 100) / 44;
                }
-       }
-       else
-       {
+               pstats->SignalQuality = precord_stats->SignalQuality = sq;
+               pstats->RxMIMOSignalQuality[0] = precord_stats->RxMIMOSignalQuality[0] = sq;
+               pstats->RxMIMOSignalQuality[1] = precord_stats->RxMIMOSignalQuality[1] = -1;
+
+       } else {
                priv->stats.numqry_phystatusHT++;
                //
                // (1)Get RSSI for HT rate
                //
-               for(i=RF90_PATH_A; i<priv->NumTotalRFPath; i++)
-               {
+               for (i = RF90_PATH_A; i < priv->NumTotalRFPath; i++) {
                        // 2008/01/30 MH we will judge RF RX path now.
                        if (priv->brfpath_rxenable[i])
                                rf_rx_num++;
                        else
                                continue;
 
-               if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, i))
+                       if (!rtl8192_phy_CheckIsLegalRFPath(priv->ieee80211->dev, i))
                                continue;
 
                        //Fixed by Jacken from Bryant 2008-03-20
@@ -4984,7 +4400,6 @@ static void rtl8192_query_rxphystatus(
                        //Get Rx snr value in DB
                        tmp_rxsnr =     pofdm_buf->rxsnr_X[i];
                        rx_snrX = (char)(tmp_rxsnr);
-                       //rx_snrX >>= 1;
                        rx_snrX /= 2;
                        priv->stats.rxSNRdB[i] = (long)rx_snrX;
 
@@ -4993,11 +4408,8 @@ static void rtl8192_query_rxphystatus(
                        total_rssi += RSSI;
 
                        /* Record Signal Strength for next packet */
-                       //if(bpacket_match_bssid)
-                       {
-                               pstats->RxMIMOSignalStrength[i] =(u8) RSSI;
-                               precord_stats->RxMIMOSignalStrength[i] =(u8) RSSI;
-                       }
+                       pstats->RxMIMOSignalStrength[i] = (u8) RSSI;
+                       precord_stats->RxMIMOSignalStrength[i] = (u8) RSSI;
                }
 
 
@@ -5006,7 +4418,7 @@ static void rtl8192_query_rxphystatus(
                //
                //Fixed by Jacken from Bryant 2008-03-20
                //Original value is 106
-               rx_pwr_all = (((pofdm_buf->pwdb_all ) >> 1 )& 0x7f) -106;
+               rx_pwr_all = (((pofdm_buf->pwdb_all) >> 1) & 0x7f) - 106;
                pwdb_all = rtl819x_query_rxpwrpercentage(rx_pwr_all);
 
                pstats->RxPWDBAll = precord_stats->RxPWDBAll = pwdb_all;
@@ -5015,14 +4427,13 @@ static void rtl8192_query_rxphystatus(
                //
                // (3)EVM of HT rate
                //
-               if(pdrvinfo->RxHT && pdrvinfo->RxRate>=DESC90_RATEMCS8 &&
-                       pdrvinfo->RxRate<=DESC90_RATEMCS15)
+               if (pdrvinfo->RxHT && pdrvinfo->RxRate >= DESC90_RATEMCS8 &&
+                   pdrvinfo->RxRate <= DESC90_RATEMCS15)
                        max_spatial_stream = 2; //both spatial stream make sense
                else
                        max_spatial_stream = 1; //only spatial stream 1 makes sense
 
-               for(i=0; i<max_spatial_stream; i++)
-               {
+               for (i = 0; i < max_spatial_stream; i++) {
                        tmp_rxevm =     pofdm_buf->rxevm_X[i];
                        rx_evmX = (char)(tmp_rxevm);
 
@@ -5032,19 +4443,16 @@ static void rtl8192_query_rxphystatus(
                        rx_evmX /= 2;   //dbm
 
                        evm = rtl819x_evm_dbtopercentage(rx_evmX);
-                       //if(bpacket_match_bssid)
-                       {
-                               if(i==0) // Fill value in RFD, Get the first spatial stream only
-                                       pstats->SignalQuality = precord_stats->SignalQuality = (u8)(evm & 0xff);
-                               pstats->RxMIMOSignalQuality[i] = precord_stats->RxMIMOSignalQuality[i] = (u8)(evm & 0xff);
-                       }
+                       if (i == 0) // Fill value in RFD, Get the first spatial stream only
+                               pstats->SignalQuality = precord_stats->SignalQuality = (u8)(evm & 0xff);
+                       pstats->RxMIMOSignalQuality[i] = precord_stats->RxMIMOSignalQuality[i] = (u8)(evm & 0xff);
                }
 
 
                /* record rx statistics for debug */
                rxsc_sgien_exflg = pofdm_buf->rxsc_sgien_exflg;
                prxsc = (phy_ofdm_rx_status_rxsc_sgien_exintfflag *)&rxsc_sgien_exflg;
-               if(pdrvinfo->BW)        //40M channel
+               if (pdrvinfo->BW)       //40M channel
                        priv->stats.received_bwtype[1+prxsc->rxsc]++;
                else                            //20M channel
                        priv->stats.received_bwtype[0]++;
@@ -5052,25 +4460,17 @@ static void rtl8192_query_rxphystatus(
 
        //UI BSS List signal strength(in percentage), make it good looking, from 0~100.
        //It is assigned to the BSS List in GetValueFromBeaconOrProbeRsp().
-       if(is_cck_rate)
-       {
-               pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)pwdb_all));//PWDB_ALL;
-
-       }
-       else
-       {
-               //pRfd->Status.SignalStrength = pRecordRfd->Status.SignalStrength = (u8)(SignalScaleMapping(total_rssi/=RF90_PATH_MAX));//(u8)(total_rssi/=RF90_PATH_MAX);
+       if (is_cck_rate) {
+               pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)pwdb_all));
+       } else {
                // We can judge RX path number now.
                if (rf_rx_num != 0)
-                       pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)(total_rssi/=rf_rx_num)));
+                       pstats->SignalStrength = precord_stats->SignalStrength = (u8)(rtl819x_signal_scale_mapping((long)(total_rssi /= rf_rx_num)));
        }
 }      /* QueryRxPhyStatus8190Pci */
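
In the CCK branch above, signal quality saturates at 100 once RxPWDBAll exceeds 40; otherwise the raw sq_rpt is inverted and rescaled as ((64 - sq) * 100) / 44, so a report of 64 reads as 0, 20 (or less) reads as 100, and a mid-range 42 comes out as (22 * 100) / 44 = 50.
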
 
-void
-rtl8192_record_rxdesc_forlateruse(
-       struct ieee80211_rx_stats *     psrc_stats,
-       struct ieee80211_rx_stats *     ptarget_stats
-)
+void rtl8192_record_rxdesc_forlateruse(struct ieee80211_rx_stats *psrc_stats,
+                                      struct ieee80211_rx_stats *ptarget_stats)
 {
        ptarget_stats->bIsAMPDU = psrc_stats->bIsAMPDU;
        ptarget_stats->bFirstMPDU = psrc_stats->bFirstMPDU;
@@ -5079,27 +4479,26 @@ rtl8192_record_rxdesc_forlateruse(
 
 
 void TranslateRxSignalStuff819xUsb(struct sk_buff *skb,
-                                  struct ieee80211_rx_stats * pstats,
+                                  struct ieee80211_rx_stats *pstats,
                                   rx_drvinfo_819x_usb  *pdrvinfo)
 {
        // TODO: We must only check packet for current MAC address. Not finish
        rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-       struct net_device *dev=info->dev;
+       struct net_device *dev = info->dev;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        bool bpacket_match_bssid, bpacket_toself;
-       bool bPacketBeacon=FALSE, bToSelfBA=FALSE;
+       bool bPacketBeacon = FALSE, bToSelfBA = FALSE;
        static struct ieee80211_rx_stats  previous_stats;
        struct ieee80211_hdr_3addr *hdr;//by amy
-       u16 fc,type;
+       u16 fc, type;
 
        // Get Signal Quality for only RX data queue (but not command queue)
 
-       u8* tmp_buf;
-       //u16 tmp_buf_len = 0;
+       u8 *tmp_buf;
        u8  *praddr;
 
        /* Get MAC frame start address. */
-       tmp_buf = (u8*)skb->data;// + get_rxpacket_shiftbytes_819xusb(pstats);
+       tmp_buf = (u8 *)skb->data;
 
        hdr = (struct ieee80211_hdr_3addr *)tmp_buf;
        fc = le16_to_cpu(hdr->frame_ctl);
@@ -5108,38 +4507,30 @@ void TranslateRxSignalStuff819xUsb(struct sk_buff *skb,
 
        /* Check if the received packet is acceptable. */
        bpacket_match_bssid = ((IEEE80211_FTYPE_CTL != type) &&
-                                                       (eqMacAddr(priv->ieee80211->current_network.bssid,  (fc & IEEE80211_FCTL_TODS)? hdr->addr1 : (fc & IEEE80211_FCTL_FROMDS )? hdr->addr2 : hdr->addr3))
-                                                                && (!pstats->bHwError) && (!pstats->bCRC)&& (!pstats->bICV));
+                              (eqMacAddr(priv->ieee80211->current_network.bssid,  (fc & IEEE80211_FCTL_TODS) ? hdr->addr1 : (fc & IEEE80211_FCTL_FROMDS) ? hdr->addr2 : hdr->addr3))
+                              && (!pstats->bHwError) && (!pstats->bCRC) && (!pstats->bICV));
        bpacket_toself =  bpacket_match_bssid & (eqMacAddr(praddr, priv->ieee80211->dev->dev_addr));
 
-               if(WLAN_FC_GET_FRAMETYPE(fc)== IEEE80211_STYPE_BEACON)
-               {
-                       bPacketBeacon = true;
-                       //DbgPrint("Beacon 2, MatchBSSID = %d, ToSelf = %d \n", bPacketMatchBSSID, bPacketToSelf);
-               }
-               if(WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BLOCKACK)
-               {
-                       if((eqMacAddr(praddr,dev->dev_addr)))
-                               bToSelfBA = true;
-                               //DbgPrint("BlockAck, MatchBSSID = %d, ToSelf = %d \n", bPacketMatchBSSID, bPacketToSelf);
-               }
+       if (WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BEACON)
+               bPacketBeacon = true;
+       if (WLAN_FC_GET_FRAMETYPE(fc) == IEEE80211_STYPE_BLOCKACK) {
+               if ((eqMacAddr(praddr, dev->dev_addr)))
+                       bToSelfBA = true;
+       }
 
 
 
-       if(bpacket_match_bssid)
-       {
+       if (bpacket_match_bssid)
                priv->stats.numpacket_matchbssid++;
-       }
-       if(bpacket_toself){
+       if (bpacket_toself)
                priv->stats.numpacket_toself++;
-       }
        //
        // Process PHY information for previous packet (RSSI/PWDB/EVM)
        //
        // Because phy information is contained in the last packet of AMPDU only, so driver
        // should process phy information of previous packet
        rtl8192_process_phyinfo(priv, tmp_buf, &previous_stats, pstats);
-       rtl8192_query_rxphystatus(priv, pstats, pdrvinfo, &previous_stats, bpacket_match_bssid,bpacket_toself,bPacketBeacon,bToSelfBA);
+       rtl8192_query_rxphystatus(priv, pstats, pdrvinfo, &previous_stats, bpacket_match_bssid, bpacket_toself, bPacketBeacon, bToSelfBA);
        rtl8192_record_rxdesc_forlateruse(pstats, &previous_stats);
 
 }
@@ -5158,91 +4549,85 @@ void TranslateRxSignalStuff819xUsb(struct sk_buff *skb,
 * Return:
 *              None
 */
-void
-UpdateReceivedRateHistogramStatistics8190(
-       struct net_device *dev,
-       struct ieee80211_rx_stats *stats
-       )
+void UpdateReceivedRateHistogramStatistics8190(struct net_device *dev,
+                                              struct ieee80211_rx_stats *stats)
 {
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-       u32 rcvType=1;   //0: Total, 1:OK, 2:CRC, 3:ICV
+       u32 rcvType = 1;   //0: Total, 1:OK, 2:CRC, 3:ICV
        u32 rateIndex;
        u32 preamble_guardinterval;  //1: short preamble/GI, 0: long preamble/GI
 
 
-       if(stats->bCRC)
-       rcvType = 2;
-       else if(stats->bICV)
-       rcvType = 3;
+       if (stats->bCRC)
+               rcvType = 2;
+       else if (stats->bICV)
+               rcvType = 3;
 
-       if(stats->bShortPreamble)
-       preamble_guardinterval = 1;// short
+       if (stats->bShortPreamble)
+               preamble_guardinterval = 1; // short
        else
-       preamble_guardinterval = 0;// long
+               preamble_guardinterval = 0; // long
 
-       switch(stats->rate)
-       {
+       switch (stats->rate) {
                //
                // CCK rate
                //
-               case MGN_1M:    rateIndex = 0;  break;
-               case MGN_2M:    rateIndex = 1;  break;
-               case MGN_5_5M:  rateIndex = 2;  break;
-               case MGN_11M:   rateIndex = 3;  break;
+       case MGN_1M:    rateIndex = 0;  break;
+       case MGN_2M:    rateIndex = 1;  break;
+       case MGN_5_5M:  rateIndex = 2;  break;
+       case MGN_11M:   rateIndex = 3;  break;
                //
                // Legacy OFDM rate
                //
-               case MGN_6M:    rateIndex = 4;  break;
-               case MGN_9M:    rateIndex = 5;  break;
-               case MGN_12M:   rateIndex = 6;  break;
-               case MGN_18M:   rateIndex = 7;  break;
-               case MGN_24M:   rateIndex = 8;  break;
-               case MGN_36M:   rateIndex = 9;  break;
-               case MGN_48M:   rateIndex = 10; break;
-               case MGN_54M:   rateIndex = 11; break;
+       case MGN_6M:    rateIndex = 4;  break;
+       case MGN_9M:    rateIndex = 5;  break;
+       case MGN_12M:   rateIndex = 6;  break;
+       case MGN_18M:   rateIndex = 7;  break;
+       case MGN_24M:   rateIndex = 8;  break;
+       case MGN_36M:   rateIndex = 9;  break;
+       case MGN_48M:   rateIndex = 10; break;
+       case MGN_54M:   rateIndex = 11; break;
                //
                // 11n High throughput rate
                //
-               case MGN_MCS0:  rateIndex = 12; break;
-               case MGN_MCS1:  rateIndex = 13; break;
-               case MGN_MCS2:  rateIndex = 14; break;
-               case MGN_MCS3:  rateIndex = 15; break;
-               case MGN_MCS4:  rateIndex = 16; break;
-               case MGN_MCS5:  rateIndex = 17; break;
-               case MGN_MCS6:  rateIndex = 18; break;
-               case MGN_MCS7:  rateIndex = 19; break;
-               case MGN_MCS8:  rateIndex = 20; break;
-               case MGN_MCS9:  rateIndex = 21; break;
-               case MGN_MCS10: rateIndex = 22; break;
-               case MGN_MCS11: rateIndex = 23; break;
-               case MGN_MCS12: rateIndex = 24; break;
-               case MGN_MCS13: rateIndex = 25; break;
-               case MGN_MCS14: rateIndex = 26; break;
-               case MGN_MCS15: rateIndex = 27; break;
-               default:        rateIndex = 28; break;
-       }
-    priv->stats.received_preamble_GI[preamble_guardinterval][rateIndex]++;
-    priv->stats.received_rate_histogram[0][rateIndex]++; //total
-    priv->stats.received_rate_histogram[rcvType][rateIndex]++;
+       case MGN_MCS0:  rateIndex = 12; break;
+       case MGN_MCS1:  rateIndex = 13; break;
+       case MGN_MCS2:  rateIndex = 14; break;
+       case MGN_MCS3:  rateIndex = 15; break;
+       case MGN_MCS4:  rateIndex = 16; break;
+       case MGN_MCS5:  rateIndex = 17; break;
+       case MGN_MCS6:  rateIndex = 18; break;
+       case MGN_MCS7:  rateIndex = 19; break;
+       case MGN_MCS8:  rateIndex = 20; break;
+       case MGN_MCS9:  rateIndex = 21; break;
+       case MGN_MCS10: rateIndex = 22; break;
+       case MGN_MCS11: rateIndex = 23; break;
+       case MGN_MCS12: rateIndex = 24; break;
+       case MGN_MCS13: rateIndex = 25; break;
+       case MGN_MCS14: rateIndex = 26; break;
+       case MGN_MCS15: rateIndex = 27; break;
+       default:        rateIndex = 28; break;
+       }
+       priv->stats.received_preamble_GI[preamble_guardinterval][rateIndex]++;
+       priv->stats.received_rate_histogram[0][rateIndex]++; //total
+       priv->stats.received_rate_histogram[rcvType][rateIndex]++;
 }
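
Every frame therefore lands in three tables keyed by the same rate index: the preamble/GI table, row 0 of the histogram (the running total), and the row for its receive outcome (1 = OK, 2 = CRC error, 3 = ICV error, per the rcvType comment above). A hedged reading of one cell, assuming the stats layout shown:

        /* frames received OK at MCS7 (rateIndex 19 in the switch above) */
        u32 ok_mcs7 = priv->stats.received_rate_histogram[1][19];
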
 
 
 void query_rxdesc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats, bool bIsRxAggrSubframe)
 {
        rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-       struct net_device *dev=info->dev;
+       struct net_device *dev = info->dev;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-       //rx_desc_819x_usb *desc = (rx_desc_819x_usb *)skb->data;
        rx_drvinfo_819x_usb  *driver_info = NULL;
 
        //
        //Get Rx Descriptor Information
        //
 #ifdef USB_RX_AGGREGATION_SUPPORT
-       if (bIsRxAggrSubframe)
-       {
+       if (bIsRxAggrSubframe) {
                rx_desc_819x_usb_aggr_subframe *desc = (rx_desc_819x_usb_aggr_subframe *)skb->data;
-               stats->Length = desc->Length ;
+               stats->Length = desc->Length;
                stats->RxDrvInfoSize = desc->RxDrvInfoSize;
                stats->RxBufShift = 0; //RxBufShift = 2 in RxDesc, but usb didn't shift bytes in fact.
                stats->bICV = desc->ICV;
@@ -5256,7 +4641,7 @@ void query_rxdesc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats,
 
                stats->Length = desc->Length;
                stats->RxDrvInfoSize = desc->RxDrvInfoSize;
-               stats->RxBufShift = 0;//desc->Shift&0x03;
+               stats->RxBufShift = 0;
                stats->bICV = desc->ICV;
                stats->bCRC = desc->CRC32;
                stats->bHwError = stats->bCRC|stats->bICV;
@@ -5264,16 +4649,12 @@ void query_rxdesc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats,
                stats->Decrypted = !desc->SWDec;
        }
 
-       if((priv->ieee80211->pHTInfo->bCurrentHTSupport == true) && (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP))
-       {
+       if ((priv->ieee80211->pHTInfo->bCurrentHTSupport == true) && (priv->ieee80211->pairwise_key_type == KEY_TYPE_CCMP))
                stats->bHwError = false;
-       }
        else
-       {
                stats->bHwError = stats->bCRC|stats->bICV;
-       }
 
-       if(stats->Length < 24 || stats->Length > MAX_8192U_RX_SIZE)
+       if (stats->Length < 24 || stats->Length > MAX_8192U_RX_SIZE)
                stats->bHwError |= 1;
        //
        //Get Driver Info
@@ -5281,71 +4662,66 @@ void query_rxdesc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats,
        // TODO: Need to verify it on FGPA platform
        //Driver info are written to the RxBuffer following rx desc
        if (stats->RxDrvInfoSize != 0) {
-               driver_info = (rx_drvinfo_819x_usb *)(skb->data + sizeof(rx_desc_819x_usb) + \
-                               stats->RxBufShift);
+               driver_info = (rx_drvinfo_819x_usb *)(skb->data + sizeof(rx_desc_819x_usb) +
+                                                     stats->RxBufShift);
                /* unit: 0.5M */
                /* TODO */
-               if(!stats->bHwError){
+               if (!stats->bHwError) {
                        u8      ret_rate;
                        ret_rate = HwRateToMRate90(driver_info->RxHT, driver_info->RxRate);
-                       if(ret_rate == 0xff)
-                       {
+                       if (ret_rate == 0xff) {
                                // Abnormal Case: Receive CRC OK packet with Rx descriptor indicating non supported rate.
                                // Special Error Handling here, 2008.05.16, by Emily
 
                                stats->bHwError = 1;
                                stats->rate = MGN_1M;   //Set 1M rate by default
-                       }else
-                       {
+                       } else {
                                stats->rate = ret_rate;
                        }
-               }
-               else
+               } else {
                        stats->rate = 0x02;
+               }
 
                stats->bShortPreamble = driver_info->SPLCP;
 
 
                UpdateReceivedRateHistogramStatistics8190(dev, stats);
 
-               stats->bIsAMPDU = (driver_info->PartAggr==1);
-               stats->bFirstMPDU = (driver_info->PartAggr==1) && (driver_info->FirstAGGR==1);
+               stats->bIsAMPDU = (driver_info->PartAggr == 1);
+               stats->bFirstMPDU = (driver_info->PartAggr == 1) && (driver_info->FirstAGGR == 1);
                stats->TimeStampLow = driver_info->TSFL;
                // xiong mask it, 070514
-               //pRfd->Status.TimeStampHigh = PlatformEFIORead4Byte(Adapter, TSFR+4);
-               // stats->TimeStampHigh = read_nic_dword(dev,  TSFR+4);
 
                UpdateRxPktTimeStamp8190(dev, stats);
 
                //
                // Rx A-MPDU
                //
-               if(driver_info->FirstAGGR==1 || driver_info->PartAggr == 1)
+               if (driver_info->FirstAGGR == 1 || driver_info->PartAggr == 1)
                        RT_TRACE(COMP_RXDESC, "driver_info->FirstAGGR = %d, driver_info->PartAggr = %d\n",
-                                       driver_info->FirstAGGR, driver_info->PartAggr);
+                                driver_info->FirstAGGR, driver_info->PartAggr);
 
        }
 
-       skb_pull(skb,sizeof(rx_desc_819x_usb));
+       skb_pull(skb, sizeof(rx_desc_819x_usb));
        //
        // Get Total offset of MPDU Frame Body
        //
-       if((stats->RxBufShift + stats->RxDrvInfoSize) > 0) {
+       if ((stats->RxBufShift + stats->RxDrvInfoSize) > 0) {
                stats->bShift = 1;
-               skb_pull(skb,stats->RxBufShift + stats->RxDrvInfoSize);
+               skb_pull(skb, stats->RxBufShift + stats->RxDrvInfoSize);
        }
 
 #ifdef USB_RX_AGGREGATION_SUPPORT
        /* for the rx aggregated sub frame, the redundant space truly contained in the packet */
-       if(bIsRxAggrSubframe) {
+       if (bIsRxAggrSubframe)
                skb_pull(skb, 8);
-       }
 #endif
        /* for debug 2008.5.29 */
 
        //added by vivi, for MP, 20080108
        stats->RxIs40MHzPacket = driver_info->BW;
-       if(stats->RxDrvInfoSize != 0)
+       if (stats->RxDrvInfoSize != 0)
                TranslateRxSignalStuff819xUsb(skb, stats, driver_info);
 
 }
@@ -5359,19 +4735,18 @@ u32 GetRxPacketShiftBytes819xUsb(struct ieee80211_rx_stats  *Status, bool bIsRxA
        else
 #endif
                return (sizeof(rx_desc_819x_usb) + Status->RxDrvInfoSize
-                               + Status->RxBufShift);
+                       + Status->RxBufShift);
 }
 
-void rtl8192_rx_nomal(struct sk_buff* skb)
+void rtl8192_rx_nomal(struct sk_buff *skb)
 {
        rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-       struct net_device *dev=info->dev;
+       struct net_device *dev = info->dev;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        struct ieee80211_rx_stats stats = {
                .signal = 0,
                .noise = -98,
                .rate = 0,
-               //      .mac_time = jiffies,
                .freq = IEEE80211_24GHZ_BAND,
        };
        u32 rx_pkt_len = 0;
@@ -5393,7 +4768,7 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
 #endif
 
        /* 20 is for ps-poll */
-       if((skb->len >=(20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
+       if ((skb->len >= (20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
 #ifdef USB_RX_AGGREGATION_SUPPORT
                TempByte = *(skb->data + sizeof(rx_desc_819x_usb));
 #endif
@@ -5404,14 +4779,12 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
 #ifdef USB_RX_AGGREGATION_SUPPORT
                if (TempByte & BIT0) {
                        agg_skb = skb;
-                       //TotalLength = agg_skb->len - 4; /*sCrcLng*/
                        TotalLength = stats.Length - 4; /*sCrcLng*/
-                       //RT_TRACE(COMP_RECV, "%s:first aggregated packet!Length=%d\n",__FUNCTION__,TotalLength);
                        /* though the head pointer has passed this position  */
                        TempDWord = *(u32 *)(agg_skb->data - 4);
                        PacketLength = (u16)(TempDWord & 0x3FFF); /*sCrcLng*/
                        skb = dev_alloc_skb(PacketLength);
-                       memcpy(skb_put(skb,PacketLength),agg_skb->data,PacketLength);
+                       memcpy(skb_put(skb, PacketLength), agg_skb->data, PacketLength);
                        PacketShiftBytes = GetRxPacketShiftBytes819xUsb(&stats, false);
                }
 #endif
@@ -5421,26 +4794,24 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
                rx_pkt_len = skb->len;
                ieee80211_hdr = (struct ieee80211_hdr_1addr *)skb->data;
                unicast_packet = false;
-               if(is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
+               if (is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
                        //TODO
-               }else if(is_multicast_ether_addr(ieee80211_hdr->addr1)){
+               } else if (is_multicast_ether_addr(ieee80211_hdr->addr1)) {
                        //TODO
-               }else {
+               } else {
                        /* unicast packet */
                        unicast_packet = true;
                }
 
-               if(!ieee80211_rx(priv->ieee80211,skb, &stats)) {
+               if (!ieee80211_rx(priv->ieee80211, skb, &stats)) {
                        dev_kfree_skb_any(skb);
                } else {
                        priv->stats.rxoktotal++;
-                       if(unicast_packet) {
+                       if (unicast_packet)
                                priv->stats.rxbytesunicast += rx_pkt_len;
-                       }
                }
 #ifdef USB_RX_AGGREGATION_SUPPORT
                testing = 1;
-               // (PipeIndex == 0) && (TempByte & BIT0) => TotalLength > 0.
                if (TotalLength > 0) {
                        PacketOccupiedLendth = PacketLength + (PacketShiftBytes + 8);
                        if ((PacketOccupiedLendth & 0xFF) != 0)
@@ -5452,9 +4823,8 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
                        else
                                agg_skb->len = 0;
 
-                       while (agg_skb->len>=GetRxPacketShiftBytes819xUsb(&stats, true)) {
+                       while (agg_skb->len >= GetRxPacketShiftBytes819xUsb(&stats, true)) {
                                u8 tmpCRC = 0, tmpICV = 0;
-                               //RT_TRACE(COMP_RECV,"%s:aggred pkt,total_len = %d\n",__FUNCTION__,agg_skb->len);
                                RxDescr = (rx_desc_819x_usb_aggr_subframe *)(agg_skb->data);
                                tmpCRC = RxDescr->CRC32;
                                tmpICV = RxDescr->ICV;
@@ -5470,32 +4840,30 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
                                query_rxdesc_status(agg_skb, &stats, true);
                                PacketLength = stats.Length;
 
-                               if(PacketLength > agg_skb->len) {
+                               if (PacketLength > agg_skb->len)
                                        break;
-                               }
                                /* Process the MPDU received */
                                skb = dev_alloc_skb(PacketLength);
-                               memcpy(skb_put(skb,PacketLength),agg_skb->data, PacketLength);
+                               memcpy(skb_put(skb, PacketLength), agg_skb->data, PacketLength);
                                skb_trim(skb, skb->len - 4/*sCrcLng*/);
 
                                rx_pkt_len = skb->len;
                                ieee80211_hdr = (struct ieee80211_hdr_1addr *)skb->data;
                                unicast_packet = false;
-                               if(is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
+                               if (is_broadcast_ether_addr(ieee80211_hdr->addr1)) {
                                        //TODO
-                               }else if(is_multicast_ether_addr(ieee80211_hdr->addr1)){
+                               } else if (is_multicast_ether_addr(ieee80211_hdr->addr1)) {
                                        //TODO
-                               }else {
+                               } else {
                                        /* unicast packet */
                                        unicast_packet = true;
                                }
-                               if(!ieee80211_rx(priv->ieee80211,skb, &stats)) {
+                               if (!ieee80211_rx(priv->ieee80211, skb, &stats)) {
                                        dev_kfree_skb_any(skb);
                                } else {
                                        priv->stats.rxoktotal++;
-                                       if(unicast_packet) {
+                                       if (unicast_packet)
                                                priv->stats.rxbytesunicast += rx_pkt_len;
-                                       }
                                }
                                /* should trim the packet which has been copied to target skb */
                                skb_pull(agg_skb, PacketLength);
@@ -5514,26 +4882,18 @@ void rtl8192_rx_nomal(struct sk_buff* skb)
 #endif
        } else {
                priv->stats.rxurberr++;
-               printk("actual_length:%d\n", skb->len);
+               netdev_dbg(dev, "actual_length: %d\n", skb->len);
                dev_kfree_skb_any(skb);
        }
 
 }
 
-void
-rtl819xusb_process_received_packet(
-       struct net_device *dev,
-       struct ieee80211_rx_stats *pstats
-       )
+void rtl819xusb_process_received_packet(struct net_device *dev,
+                                       struct ieee80211_rx_stats *pstats)
 {
-//     bool bfreerfd=false, bqueued=false;
-       u8*     frame;
-       u16     frame_len=0;
+       u8      *frame;
+       u16     frame_len = 0;
        struct r8192_priv *priv = ieee80211_priv(dev);
-//     u8                      index = 0;
-//     u8                      TID = 0;
-       //u16                   seqnum = 0;
-       //PRX_TS_RECORD pts = NULL;
 
        // Get shifted bytes of Starting address of 802.11 header. 2006.09.28, by Emily
        //porting by amy 080508
@@ -5541,33 +4901,27 @@ rtl819xusb_process_received_packet(
        frame = pstats->virtual_address;
        frame_len = pstats->packetlength;
 #ifdef TODO    // by amy about HCT
-       if(!Adapter->bInHctTest)
+       if (!Adapter->bInHctTest)
                CountRxErrStatistics(Adapter, pRfd);
 #endif
-       {
-       #ifdef ENABLE_PS  //by amy for adding ps function in future
-               RT_RF_POWER_STATE rtState;
-               // When RF is off, we should not count the packet for hw/sw synchronize
-               // reason, ie. there may be a duration while sw switch is changed and hw
-               // switch is being changed. 2006.12.04, by shien chang.
-               Adapter->HalFunc.GetHwRegHandler(Adapter, HW_VAR_RF_STATE, (u8* )(&rtState));
-               if (rtState == eRfOff)
-               {
-                       return;
-               }
-       #endif
+#ifdef ENABLE_PS  //by amy for adding ps function in future
+       RT_RF_POWER_STATE rtState;
+       // When RF is off, we should not count the packet for hw/sw synchronize
+       // reason, ie. there may be a duration while sw switch is changed and hw
+       // switch is being changed. 2006.12.04, by shien chang.
+       Adapter->HalFunc.GetHwRegHandler(Adapter, HW_VAR_RF_STATE, (u8 *)(&rtState));
+       if (rtState == eRfOff)
+               return;
+#endif
        priv->stats.rxframgment++;
 
-       }
 #ifdef TODO
        RmMonitorSignalStrength(Adapter, pRfd);
 #endif
        /* 2007/01/16 MH Add RX command packet handle here. */
        /* 2007/03/01 MH We have to release RFD and return if rx pkt is cmd pkt. */
        if (rtl819xusb_rx_command_packet(dev, pstats))
-       {
                return;
-       }
 
 #ifdef SW_CRC_CHECK
        SwCrcCheck();
@@ -5578,16 +4932,12 @@ rtl819xusb_process_received_packet(
 
 void query_rx_cmdpkt_desc_status(struct sk_buff *skb, struct ieee80211_rx_stats *stats)
 {
-//     rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
-//     struct net_device *dev=info->dev;
-//     struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
        rx_desc_819x_usb *desc = (rx_desc_819x_usb *)skb->data;
-//     rx_drvinfo_819x_usb  *driver_info;
 
        //
        //Get Rx Descriptor Information
        //
-       stats->virtual_address = (u8*)skb->data;
+       stats->virtual_address = (u8 *)skb->data;
        stats->Length = desc->Length;
        stats->RxDrvInfoSize = 0;
        stats->RxBufShift = 0;
@@ -5602,21 +4952,17 @@ void rtl8192_rx_cmd(struct sk_buff *skb)
 {
        struct rtl8192_rx_info *info = (struct rtl8192_rx_info *)skb->cb;
        struct net_device *dev = info->dev;
-       //int ret;
-//     struct urb *rx_urb = info->urb;
        /* TODO */
        struct ieee80211_rx_stats stats = {
                .signal = 0,
                .noise = -98,
                .rate = 0,
-               //      .mac_time = jiffies,
                .freq = IEEE80211_24GHZ_BAND,
        };
 
-       if((skb->len >=(20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE))
-       {
+       if ((skb->len >= (20 + sizeof(rx_desc_819x_usb))) && (skb->len < RX_URB_SIZE)) {
 
-               query_rx_cmdpkt_desc_status(skb,&stats);
+               query_rx_cmdpkt_desc_status(skb, &stats);
                // this is to be done by amy 080508     prfd->queue_id = 1;
 
 
@@ -5624,7 +4970,7 @@ void rtl8192_rx_cmd(struct sk_buff *skb)
                //  Process the command packet received.
                //
 
-               rtl819xusb_process_received_packet(dev,&stats);
+               rtl819xusb_process_received_packet(dev, &stats);
 
                dev_kfree_skb_any(skb);
        }
@@ -5640,22 +4986,21 @@ void rtl8192_irq_rx_tasklet(struct r8192_priv *priv)
                switch (info->out_pipe) {
                /* Nomal packet pipe */
                case 3:
-                       //RT_TRACE(COMP_RECV, "normal in-pipe index(%d)\n",info->out_pipe);
                        priv->IrpPendingCount--;
                        rtl8192_rx_nomal(skb);
                        break;
 
-                       /* Command packet pipe */
+               /* Command packet pipe */
                case 9:
-                       RT_TRACE(COMP_RECV, "command in-pipe index(%d)\n",\
-                                       info->out_pipe);
+                       RT_TRACE(COMP_RECV, "command in-pipe index(%d)\n",
+                                info->out_pipe);
 
                        rtl8192_rx_cmd(skb);
                        break;
 
                default: /* should never get here! */
-                       RT_TRACE(COMP_ERR, "Unknown in-pipe index(%d)\n",\
-                                       info->out_pipe);
+                       RT_TRACE(COMP_ERR, "Unknown in-pipe index(%d)\n",
+                                info->out_pipe);
                        dev_kfree_skb(skb);
                        break;
 
@@ -5682,11 +5027,10 @@ static const struct net_device_ops rtl8192_netdev_ops = {
 *****************************************************************************/
 
 static int rtl8192_usb_probe(struct usb_interface *intf,
-                        const struct usb_device_id *id)
+                            const struct usb_device_id *id)
 {
-//     unsigned long ioaddr = 0;
        struct net_device *dev = NULL;
-       struct r8192_priv *priv= NULL;
+       struct r8192_priv *priv = NULL;
        struct usb_device *udev = interface_to_usbdev(intf);
        int ret;
        RT_TRACE(COMP_INIT, "Oops: i'm coming\n");
@@ -5699,29 +5043,28 @@ static int rtl8192_usb_probe(struct usb_interface *intf,
        SET_NETDEV_DEV(dev, &intf->dev);
        priv = ieee80211_priv(dev);
        priv->ieee80211 = netdev_priv(dev);
-       priv->udev=udev;
+       priv->udev = udev;
 
        dev->netdev_ops = &rtl8192_netdev_ops;
 
-        //DMESG("Oops: i'm coming\n");
 #if WIRELESS_EXT >= 12
 #if WIRELESS_EXT < 17
        dev->get_wireless_stats = r8192_get_wireless_stats;
 #endif
        dev->wireless_handlers = (struct iw_handler_def *) &r8192_wx_handlers_def;
 #endif
-       dev->type=ARPHRD_ETHER;
+       dev->type = ARPHRD_ETHER;
 
        dev->watchdog_timeo = HZ*3;     //modified by john, 0805
 
-       if (dev_alloc_name(dev, ifname) < 0){
+       if (dev_alloc_name(dev, ifname) < 0) {
                RT_TRACE(COMP_INIT, "Oops: devname already taken! Trying wlan%%d...\n");
                ifname = "wlan%d";
                dev_alloc_name(dev, ifname);
        }
 
        RT_TRACE(COMP_INIT, "Driver probe completed1\n");
-       if(rtl8192_init(dev)!=0){
+       if (rtl8192_init(dev) != 0) {
                RT_TRACE(COMP_ERR, "Initialization failed");
                ret = -ENODEV;
                goto fail;
@@ -5733,7 +5076,7 @@ static int rtl8192_usb_probe(struct usb_interface *intf,
        if (ret)
                goto fail2;
 
-       RT_TRACE(COMP_INIT, "dev name=======> %s\n",dev->name);
+       RT_TRACE(COMP_INIT, "dev name=======> %s\n", dev->name);
        rtl8192_proc_init_one(dev);
 
 
@@ -5755,16 +5098,13 @@ fail:
 }
 
 //detach all the work and timer structures declared or initialized in the r8192U_init function.
-void rtl8192_cancel_deferred_work(struct r8192_priv* priv)
+void rtl8192_cancel_deferred_work(struct r8192_priv *priv)
 {
 
        cancel_work_sync(&priv->reset_wq);
        cancel_delayed_work(&priv->watch_dog_wq);
        cancel_delayed_work(&priv->update_beacon_wq);
        cancel_work_sync(&priv->qos_activate);
-       //cancel_work_sync(&priv->SetBWModeWorkItem);
-       //cancel_work_sync(&priv->SwChnlWorkItem);
-
 }
 
 
@@ -5773,22 +5113,18 @@ static void rtl8192_usb_disconnect(struct usb_interface *intf)
        struct net_device *dev = usb_get_intfdata(intf);
 
        struct r8192_priv *priv = ieee80211_priv(dev);
-       if(dev){
+       if (dev) {
 
                unregister_netdev(dev);
 
                RT_TRACE(COMP_DOWN, "=============>wlan driver to be removed\n");
                rtl8192_proc_remove_one(dev);
 
-                       rtl8192_down(dev);
+               rtl8192_down(dev);
                kfree(priv->pFirmware);
                priv->pFirmware = NULL;
-       //      priv->rf_close(dev);
-//             rtl8192_SetRFPowerState(dev, eRfOff);
                rtl8192_usb_deleteendpoints(dev);
                destroy_workqueue(priv->priv_wq);
-               //rtl8192_irq_disable(dev);
-               //rtl8192_reset(dev);
                mdelay(10);
 
        }
@@ -5815,38 +5151,36 @@ static int __init rtl8192_usb_module_init(void)
 #ifdef CONFIG_IEEE80211_DEBUG
        ret = ieee80211_debug_init();
        if (ret) {
-               printk(KERN_ERR "ieee80211_debug_init() failed %d\n", ret);
+               pr_err("ieee80211_debug_init() failed %d\n", ret);
                return ret;
        }
 #endif
        ret = ieee80211_crypto_init();
        if (ret) {
-               printk(KERN_ERR "ieee80211_crypto_init() failed %d\n", ret);
+               pr_err("ieee80211_crypto_init() failed %d\n", ret);
                return ret;
        }
 
        ret = ieee80211_crypto_tkip_init();
        if (ret) {
-               printk(KERN_ERR "ieee80211_crypto_tkip_init() failed %d\n",
-                       ret);
+               pr_err("ieee80211_crypto_tkip_init() failed %d\n", ret);
                return ret;
        }
 
        ret = ieee80211_crypto_ccmp_init();
        if (ret) {
-               printk(KERN_ERR "ieee80211_crypto_ccmp_init() failed %d\n",
-                       ret);
+               pr_err("ieee80211_crypto_ccmp_init() failed %d\n", ret);
                return ret;
        }
 
        ret = ieee80211_crypto_wep_init();
        if (ret) {
-               printk(KERN_ERR "ieee80211_crypto_wep_init() failed %d\n", ret);
+               pr_err("ieee80211_crypto_wep_init() failed %d\n", ret);
                return ret;
        }
 
-       printk(KERN_INFO "\nLinux kernel driver for RTL8192 based WLAN cards\n");
-       printk(KERN_INFO "Copyright (c) 2007-2008, Realsil Wlan\n");
+       pr_info("\nLinux kernel driver for RTL8192 based WLAN cards\n");
+       pr_info("Copyright (c) 2007-2008, Realsil Wlan\n");
        RT_TRACE(COMP_INIT, "Initializing module");
        RT_TRACE(COMP_INIT, "Wireless extensions version %d", WIRELESS_EXT);
        rtl8192_proc_module_init();
@@ -5859,7 +5193,6 @@ static void __exit rtl8192_usb_module_exit(void)
        usb_deregister(&rtl8192_usb_driver);
 
        RT_TRACE(COMP_DOWN, "Exiting");
-//     rtl8192_proc_module_remove();
 }
 
 
@@ -5869,11 +5202,11 @@ void rtl8192_try_wake_queue(struct net_device *dev, int pri)
        short enough_desc;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
 
-       spin_lock_irqsave(&priv->tx_lock,flags);
-       enough_desc = check_nic_enough_desc(dev,pri);
-       spin_unlock_irqrestore(&priv->tx_lock,flags);
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       enough_desc = check_nic_enough_desc(dev, pri);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
 
-       if(enough_desc)
+       if (enough_desc)
                ieee80211_wake_queue(priv->ieee80211);
 }
 
@@ -5881,43 +5214,32 @@ void EnableHWSecurityConfig8192(struct net_device *dev)
 {
        u8 SECR_value = 0x0;
        struct r8192_priv *priv = (struct r8192_priv *)ieee80211_priv(dev);
-        struct ieee80211_device* ieee = priv->ieee80211;
+       struct ieee80211_device *ieee = priv->ieee80211;
        SECR_value = SCR_TxEncEnable | SCR_RxDecEnable;
-       if (((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type)) && (priv->ieee80211->auth_mode != 2))
-       {
+       if (((KEY_TYPE_WEP40 == ieee->pairwise_key_type) || (KEY_TYPE_WEP104 == ieee->pairwise_key_type)) && (priv->ieee80211->auth_mode != 2)) {
                SECR_value |= SCR_RxUseDK;
                SECR_value |= SCR_TxUseDK;
-       }
-       else if ((ieee->iw_mode == IW_MODE_ADHOC) && (ieee->pairwise_key_type & (KEY_TYPE_CCMP | KEY_TYPE_TKIP)))
-       {
+       } else if ((ieee->iw_mode == IW_MODE_ADHOC) && (ieee->pairwise_key_type & (KEY_TYPE_CCMP | KEY_TYPE_TKIP))) {
                SECR_value |= SCR_RxUseDK;
                SECR_value |= SCR_TxUseDK;
        }
        //add HWSec active enable here.
-//default using hwsec. when peer AP is in N mode only and pairwise_key_type is none_aes(which HT_IOT_ACT_PURE_N_MODE indicates it), use software security. when peer AP is in b,g,n mode mixed and pairwise_key_type is none_aes, use g mode hw security. WB on 2008.7.4
+       //Default to hw security. When the peer AP is N-mode only and pairwise_key_type is not AES (HT_IOT_ACT_PURE_N_MODE indicates this), use software security; when the peer AP is in mixed b/g/n mode and pairwise_key_type is not AES, use g-mode hw security. WB on 2008.7.4
 
        ieee->hwsec_active = 1;
 
-       if ((ieee->pHTInfo->IOTAction&HT_IOT_ACT_PURE_N_MODE) || !hwwep)//!ieee->hwsec_support) //add hwsec_support flag to totol control hw_sec on/off
-       {
+       if ((ieee->pHTInfo->IOTAction & HT_IOT_ACT_PURE_N_MODE) || !hwwep) { //add hwsec_support flag for total control of hw_sec on/off
                ieee->hwsec_active = 0;
                SECR_value &= ~SCR_RxDecEnable;
        }
-       RT_TRACE(COMP_SEC,"%s:, hwsec:%d, pairwise_key:%d, SECR_value:%x\n", __FUNCTION__, \
-                       ieee->hwsec_active, ieee->pairwise_key_type, SECR_value);
-       {
-               write_nic_byte(dev, SECR,  SECR_value);//SECR_value |  SCR_UseDK );
-       }
+       RT_TRACE(COMP_SEC, "%s:, hwsec:%d, pairwise_key:%d, SECR_value:%x\n", __func__,
+                ieee->hwsec_active, ieee->pairwise_key_type, SECR_value);
+       write_nic_byte(dev, SECR,  SECR_value);
 }
 
 
-void setKey(   struct net_device *dev,
-               u8 EntryNo,
-               u8 KeyIndex,
-               u16 KeyType,
-               u8 *MacAddr,
-               u8 DefaultKey,
-               u32 *KeyContent )
+void setKey(struct net_device *dev, u8 EntryNo, u8 KeyIndex, u16 KeyType,
+           u8 *MacAddr, u8 DefaultKey, u32 *KeyContent)
 {
        u32 TargetCommand = 0;
        u32 TargetContent = 0;
@@ -5926,44 +5248,40 @@ void setKey(    struct net_device *dev,
        if (EntryNo >= TOTAL_CAM_ENTRY)
                RT_TRACE(COMP_ERR, "cam entry exceeds in setKey()\n");
 
-       RT_TRACE(COMP_SEC, "====>to setKey(), dev:%p, EntryNo:%d, KeyIndex:%d, KeyType:%d, MacAddr%pM\n", dev,EntryNo, KeyIndex, KeyType, MacAddr);
+       RT_TRACE(COMP_SEC, "====>to setKey(), dev:%p, EntryNo:%d, KeyIndex:%d, KeyType:%d, MacAddr%pM\n", dev, EntryNo, KeyIndex, KeyType, MacAddr);
 
        if (DefaultKey)
                usConfig |= BIT15 | (KeyType<<2);
        else
                usConfig |= BIT15 | (KeyType<<2) | KeyIndex;
-//     usConfig |= BIT15 | (KeyType<<2) | (DefaultKey<<5) | KeyIndex;
 
 
-       for(i=0 ; i<CAM_CONTENT_COUNT; i++){
+       for (i = 0; i < CAM_CONTENT_COUNT; i++) {
                TargetCommand  = i+CAM_CONTENT_COUNT*EntryNo;
                TargetCommand |= BIT31|BIT16;
 
-               if(i==0){//MAC|Config
+               if (i == 0) { //MAC|Config
                        TargetContent = (u32)(*(MacAddr+0)) << 16|
                                        (u32)(*(MacAddr+1)) << 24|
                                        (u32)usConfig;
 
                        write_nic_dword(dev, WCAMI, TargetContent);
                        write_nic_dword(dev, RWCAM, TargetCommand);
-       //              printk("setkey cam =%8x\n", read_cam(dev, i+6*EntryNo));
-               }
-               else if(i==1){//MAC
+               } else if (i == 1) { //MAC
                        TargetContent = (u32)(*(MacAddr+2))      |
                                        (u32)(*(MacAddr+3)) <<  8|
                                        (u32)(*(MacAddr+4)) << 16|
                                        (u32)(*(MacAddr+5)) << 24;
                        write_nic_dword(dev, WCAMI, TargetContent);
                        write_nic_dword(dev, RWCAM, TargetCommand);
-               }
-               else {
+               } else {
                        //Key Material
-                       if(KeyContent !=NULL){
-                       write_nic_dword(dev, WCAMI, (u32)(*(KeyContent+i-2)) );
-                       write_nic_dword(dev, RWCAM, TargetCommand);
+                       if (KeyContent != NULL) {
+                               write_nic_dword(dev, WCAMI, (u32)(*(KeyContent+i-2)));
+                               write_nic_dword(dev, RWCAM, TargetCommand);
+                       }
                }
        }
-       }
 
 }
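A note on the CAM loop in setKey() above: each hardware CAM entry spans CAM_CONTENT_COUNT dwords, written one at a time through the WCAMI (data) and RWCAM (command) registers, with BIT31|BIT16 in the command word triggering the write. Dword 0 packs MAC bytes 0-1 together with the 16-bit usConfig word, dword 1 packs MAC bytes 2-5, and the remaining dwords carry key material. A minimal userspace sketch of just the packing arithmetic (the function names here are hypothetical and the register writes are omitted):

	#include <stdint.h>
	#include <stdio.h>

	/* Mirrors the i == 0 case: MAC bytes 0-1 share a dword with usConfig. */
	static uint32_t cam_dword0(const uint8_t *mac, uint16_t config)
	{
		return ((uint32_t)mac[0] << 16) | ((uint32_t)mac[1] << 24) | config;
	}

	/* Mirrors the i == 1 case: MAC bytes 2-5 fill the second dword. */
	static uint32_t cam_dword1(const uint8_t *mac)
	{
		return (uint32_t)mac[2] | ((uint32_t)mac[3] << 8) |
		       ((uint32_t)mac[4] << 16) | ((uint32_t)mac[5] << 24);
	}

	int main(void)
	{
		const uint8_t mac[6] = { 0x00, 0x1c, 0xbf, 0x01, 0x02, 0x03 };

		printf("dword0=0x%08x dword1=0x%08x\n",
		       (unsigned)cam_dword0(mac, 0x8000), (unsigned)cam_dword1(mac));
		return 0;
	}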
 
index ea46717f1fadfa2ce2801d325c6534fa22e27203..a6e4c37d9c78e2d2fc305e1fd0e7862ca9d07c78 100644
@@ -88,7 +88,7 @@ static        void    dm_check_rate_adaptive(struct net_device *dev);
 
 // DM --> Bandwidth switch
 static void    dm_init_bandwidth_autoswitch(struct net_device *dev);
-static void    dm_bandwidth_autoswitch(        struct net_device *dev);
+static void    dm_bandwidth_autoswitch(struct net_device *dev);
 
 // DM --> TX power control
 //static       void    dm_initialize_txpower_tracking(struct net_device *dev);
@@ -112,7 +112,7 @@ static      void    dm_bb_initialgain_backup(struct net_device *dev);
 static void    dm_dig_init(struct net_device *dev);
 static void    dm_ctrl_initgain_byrssi(struct net_device *dev);
 static void    dm_ctrl_initgain_byrssi_highpwr(struct net_device *dev);
-static void    dm_ctrl_initgain_byrssi_by_driverrssi(  struct net_device *dev);
+static void    dm_ctrl_initgain_byrssi_by_driverrssi(struct net_device *dev);
 static void    dm_ctrl_initgain_byrssi_by_fwfalse_alarm(struct net_device *dev);
 static void    dm_initial_gain(struct net_device *dev);
 static void    dm_pd_th(struct net_device *dev);
@@ -289,7 +289,7 @@ extern  void    hal_dm_watchdog(struct net_device *dev)
   *    01/16/2008      MHC             RF_Type is assigned in ReadAdapterInfo(). We must call
   *                                            the function after making sure RF_Type.
   */
-extern void init_rate_adaptive(struct net_device * dev)
+extern void init_rate_adaptive(struct net_device *dev)
 {
 
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -351,7 +351,7 @@ extern void init_rate_adaptive(struct net_device * dev)
  *     05/26/08        amy     Create version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static void dm_check_rate_adaptive(struct net_device * dev)
+static void dm_check_rate_adaptive(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        PRT_HIGH_THROUGHPUT     pHTInfo = priv->ieee80211->pHTInfo;
@@ -372,11 +372,11 @@ static void dm_check_rate_adaptive(struct net_device * dev)
                return;
 
        // TODO: Only 11n mode is implemented currently,
-       if( !(priv->ieee80211->mode == WIRELESS_MODE_N_24G ||
+       if(!(priv->ieee80211->mode == WIRELESS_MODE_N_24G ||
                 priv->ieee80211->mode == WIRELESS_MODE_N_5G))
                 return;
 
-       if( priv->ieee80211->state == IEEE80211_LINKED )
+       if(priv->ieee80211->state == IEEE80211_LINKED)
        {
        //      RT_TRACE(COMP_RATE, "dm_CheckRateAdaptive(): \t");
 
@@ -454,8 +454,8 @@ static void dm_check_rate_adaptive(struct net_device * dev)
                        //pHalData->UndecoratedSmoothedPWDB = 19;
                        if(priv->undecorated_smoothed_pwdb < (long)(pra->ping_rssi_thresh_for_ra+5))
                        {
-                               if( (priv->undecorated_smoothed_pwdb < (long)pra->ping_rssi_thresh_for_ra) ||
-                                       ping_rssi_state )
+                               if((priv->undecorated_smoothed_pwdb < (long)pra->ping_rssi_thresh_for_ra) ||
+                                       ping_rssi_state)
                                {
                                        //DbgPrint("TestRSSI = %d, set RATR to 0x%x \n", pHalData->UndecoratedSmoothedPWDB, pRA->TestRSSIRATR);
                                        pra->ratr_state = DM_RATR_STA_LOW;
@@ -480,8 +480,8 @@ static void dm_check_rate_adaptive(struct net_device * dev)
                //
                // Check whether updating of RATR0 is required
                //
-               currentRATR = read_nic_dword(dev, RATR0);
-               if( targetRATR !=  currentRATR )
+               read_nic_dword(dev, RATR0, &currentRATR);
+               if(targetRATR !=  currentRATR)
                {
                        u32 ratr_value;
                        ratr_value = targetRATR;
@@ -505,7 +505,7 @@ static void dm_check_rate_adaptive(struct net_device * dev)
 }      // dm_CheckRateAdaptive
 
 
-static void dm_init_bandwidth_autoswitch(struct net_device * dev)
+static void dm_init_bandwidth_autoswitch(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -517,7 +517,7 @@ static void dm_init_bandwidth_autoswitch(struct net_device * dev)
 }      // dm_init_bandwidth_autoswitch
 
 
-static void dm_bandwidth_autoswitch(struct net_device * dev)
+static void dm_bandwidth_autoswitch(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -588,7 +588,7 @@ static u8   CCKSwingTable_Ch14[CCK_Table_length][8] = {
        {0x0f, 0x0f, 0x0d, 0x08, 0x00, 0x00, 0x00, 0x00}        // 11, -11db
 };
 
-static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
+static void dm_TXPowerTrackingCallback_TSSI(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        bool                                            bHighpowerstate, viviflag = FALSE;
@@ -627,14 +627,14 @@ static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
                RT_TRACE(COMP_POWER_TRACKING, "Set configuration with tx cmd queue fail!\n");
        }
 #else
-       cmpk_message_handle_tx(dev, (u8*)&tx_cmd,
+       cmpk_message_handle_tx(dev, (u8 *)&tx_cmd,
                                                                DESC_PACKET_TYPE_INIT, sizeof(DCMD_TXCMD_T));
 #endif
        mdelay(1);
        //DbgPrint("hi, vivi, strange\n");
        for(i = 0;i <= 30; i++)
        {
-               Pwr_Flag = read_nic_byte(dev, 0x1ba);
+               read_nic_byte(dev, 0x1ba, &Pwr_Flag);
 
                if (Pwr_Flag == 0)
                {
@@ -642,9 +642,9 @@ static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
                        continue;
                }
 #ifdef RTL8190P
-               Avg_TSSI_Meas = read_nic_word(dev, 0x1bc);
+               read_nic_word(dev, 0x1bc, &Avg_TSSI_Meas);
 #else
-               Avg_TSSI_Meas = read_nic_word(dev, 0x13c);
+               read_nic_word(dev, 0x13c, &Avg_TSSI_Meas);
 #endif
                if(Avg_TSSI_Meas == 0)
                {
@@ -655,12 +655,12 @@ static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
                for(k = 0;k < 5; k++)
                {
 #ifdef RTL8190P
-                       tmp_report[k] = read_nic_byte(dev, 0x1d8+k);
+                       read_nic_byte(dev, 0x1d8+k, &tmp_report[k]);
 #else
                        if(k !=4)
-                               tmp_report[k] = read_nic_byte(dev, 0x134+k);
+                               read_nic_byte(dev, 0x134+k, &tmp_report[k]);
                        else
-                               tmp_report[k] = read_nic_byte(dev, 0x13e);
+                               read_nic_byte(dev, 0x13e, &tmp_report[k]);
 #endif
                        RT_TRACE(COMP_POWER_TRACKING, "TSSI_report_value = %d\n", tmp_report[k]);
                }
@@ -816,7 +816,7 @@ static void dm_TXPowerTrackingCallback_TSSI(struct net_device * dev)
                write_nic_byte(dev, 0x1ba, 0);
 }
 
-static void dm_TXPowerTrackingCallback_ThermalMeter(struct net_device * dev)
+static void dm_TXPowerTrackingCallback_ThermalMeter(struct net_device *dev)
 {
 #define ThermalMeterVal        9
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -1572,7 +1572,7 @@ static void dm_CCKTxPowerAdjust_TSSI(struct net_device *dev, bool  bInCH14)
                TempVal = 0;
                TempVal =       priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[2] +
                                        (priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[3]<<8) +
-                                       (priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16 )+
+                                       (priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16)+
                                        (priv->cck_txbbgain_table[priv->cck_present_attentuation].ccktxbb_valuearray[5]<<24);
                rtl8192_setBBreg(dev, rCCK0_TxFilter2,bMaskDWord, TempVal);
                //Write 0xa28  0xa29
@@ -1592,7 +1592,7 @@ static void dm_CCKTxPowerAdjust_TSSI(struct net_device *dev, bool  bInCH14)
                TempVal = 0;
                TempVal =       priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[2] +
                                        (priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[3]<<8) +
-                                       (priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16 )+
+                                       (priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[4]<<16)+
                                        (priv->cck_txbbgain_ch14_table[priv->cck_present_attentuation].ccktxbb_valuearray[5]<<24);
                rtl8192_setBBreg(dev, rCCK0_TxFilter2,bMaskDWord, TempVal);
                //Write 0xa28  0xa29
@@ -1624,7 +1624,7 @@ static void dm_CCKTxPowerAdjust_ThermalMeter(struct net_device *dev,      bool  bInCH
                TempVal = 0;
                TempVal =       CCKSwingTable_Ch1_Ch13[priv->CCK_index][2] +
                                        (CCKSwingTable_Ch1_Ch13[priv->CCK_index][3]<<8) +
-                                       (CCKSwingTable_Ch1_Ch13[priv->CCK_index][4]<<16 )+
+                                       (CCKSwingTable_Ch1_Ch13[priv->CCK_index][4]<<16)+
                                        (CCKSwingTable_Ch1_Ch13[priv->CCK_index][5]<<24);
                rtl8192_setBBreg(dev, rCCK0_TxFilter2, bMaskDWord, TempVal);
                RT_TRACE(COMP_POWER_TRACKING, "CCK not chnl 14, reg 0x%x = 0x%x\n",
@@ -1652,7 +1652,7 @@ static void dm_CCKTxPowerAdjust_ThermalMeter(struct net_device *dev,      bool  bInCH
                TempVal = 0;
                TempVal =       CCKSwingTable_Ch14[priv->CCK_index][2] +
                                        (CCKSwingTable_Ch14[priv->CCK_index][3]<<8) +
-                                       (CCKSwingTable_Ch14[priv->CCK_index][4]<<16 )+
+                                       (CCKSwingTable_Ch14[priv->CCK_index][4]<<16)+
                                        (CCKSwingTable_Ch14[priv->CCK_index][5]<<24);
                rtl8192_setBBreg(dev, rCCK0_TxFilter2, bMaskDWord, TempVal);
                RT_TRACE(COMP_POWER_TRACKING, "CCK chnl 14, reg 0x%x = 0x%x\n",
@@ -1727,7 +1727,7 @@ extern void dm_restore_dynamic_mechanism_state(struct net_device *dev)
        if(priv->rate_adaptive.rate_adaptive_disabled)
                return;
        // TODO: Only 11n mode is implemented currently,
-       if( !(priv->ieee80211->mode==WIRELESS_MODE_N_24G ||
+       if(!(priv->ieee80211->mode==WIRELESS_MODE_N_24G ||
                 priv->ieee80211->mode==WIRELESS_MODE_N_5G))
                 return;
        {
@@ -1736,7 +1736,7 @@ extern void dm_restore_dynamic_mechanism_state(struct net_device *dev)
                        ratr_value = reg_ratr;
                        if(priv->rf_type == RF_1T2R)    // 1T2R, Spatial Stream 2 should be disabled
                        {
-                               ratr_value &=(RATE_ALL_OFDM_2SS);
+                               ratr_value &= ~(RATE_ALL_OFDM_2SS);
                                //DbgPrint("HW_VAR_TATR_0 from 0x%x ==> 0x%x\n", ((pu4Byte)(val))[0], ratr_value);
                        }
                        //DbgPrint("set HW_VAR_TATR_0 = 0x%x\n", ratr_value);
@@ -2222,7 +2222,7 @@ static void dm_ctrl_initgain_byrssi_by_fwfalse_alarm(
 
        /* 2. When RSSI increase, We have to judge if it is larger than a threshold
                  and then execute the step below.  */
-       if ((priv->undecorated_smoothed_pwdb >= dm_digtable.rssi_high_thresh) )
+       if ((priv->undecorated_smoothed_pwdb >= dm_digtable.rssi_high_thresh))
        {
                u8 reset_flag = 0;
 
@@ -2316,7 +2316,7 @@ static void dm_ctrl_initgain_byrssi_by_fwfalse_alarm(
  *
  *---------------------------------------------------------------------------*/
 static void dm_ctrl_initgain_byrssi_highpwr(
-       struct net_device * dev)
+       struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        static u32 reset_cnt_highpwr;
@@ -2391,12 +2391,13 @@ static void dm_ctrl_initgain_byrssi_highpwr(
 
 
 static void dm_initial_gain(
-       struct net_device * dev)
+       struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8                                      initial_gain=0;
        static u8                               initialized, force_write;
        static u32                      reset_cnt;
+       u8                              tmp;
 
        if(dm_digtable.dig_algorithm_switch)
        {
@@ -2437,7 +2438,8 @@ static void dm_initial_gain(
                reset_cnt = priv->reset_count;
        }
 
-       if(dm_digtable.pre_ig_value != read_nic_byte(dev, rOFDM0_XAAGCCore1))
+       read_nic_byte(dev, rOFDM0_XAAGCCore1, &tmp);
+       if (dm_digtable.pre_ig_value != tmp)
                force_write = 1;
 
        {
@@ -2459,7 +2461,7 @@ static void dm_initial_gain(
 }
 
 static void dm_pd_th(
-       struct net_device * dev)
+       struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        static u8                               initialized, force_write;
@@ -2571,7 +2573,7 @@ static void dm_pd_th(
 }
 
 static void dm_cs_ratio(
-       struct net_device * dev)
+       struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        static u8                               initialized,force_write;
@@ -2589,7 +2591,7 @@ static    void dm_cs_ratio(
                {
                        if ((dm_digtable.rssi_val <= dm_digtable.rssi_low_thresh))
                                dm_digtable.curcs_ratio_state = DIG_CS_RATIO_LOWER;
-                       else if ((dm_digtable.rssi_val >= dm_digtable.rssi_high_thresh) )
+                       else if ((dm_digtable.rssi_val >= dm_digtable.rssi_high_thresh))
                                dm_digtable.curcs_ratio_state = DIG_CS_RATIO_HIGHER;
                        else
                                dm_digtable.curcs_ratio_state = dm_digtable.precs_ratio_state;
@@ -2634,7 +2636,7 @@ static    void dm_cs_ratio(
        }
 }
 
-extern void dm_init_edca_turbo(struct net_device * dev)
+extern void dm_init_edca_turbo(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2644,7 +2646,7 @@ extern void dm_init_edca_turbo(struct net_device * dev)
 }      // dm_init_edca_turbo
 
 static void dm_check_edca_turbo(
-       struct net_device * dev)
+       struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        PRT_HIGH_THROUGHPUT     pHTInfo = priv->ieee80211->pHTInfo;
@@ -2727,8 +2729,9 @@ static void dm_check_edca_turbo(
                        // TODO:  Modified this part and try to set acm control in only 1 IO processing!!
 
                                        PACI_AIFSN      pAciAifsn = (PACI_AIFSN)&(qos_parameters->aifs[0]);
-                                       u8              AcmCtrl = read_nic_byte( dev, AcmHwCtrl );
-                                       if( pAciAifsn->f.ACM )
+                                       u8              AcmCtrl;
+                                       read_nic_byte(dev, AcmHwCtrl, &AcmCtrl);
+                                       if(pAciAifsn->f.ACM)
                                        { // ACM bit is 1.
                                                AcmCtrl |= AcmHw_BeqEn;
                                        }
@@ -2737,8 +2740,8 @@ static void dm_check_edca_turbo(
                                                AcmCtrl &= (~AcmHw_BeqEn);
                                        }
 
-                                       RT_TRACE( COMP_QOS,"SetHwReg8190pci(): [HW_VAR_ACM_CTRL] Write 0x%X\n", AcmCtrl ) ;
-                                       write_nic_byte(dev, AcmHwCtrl, AcmCtrl );
+                                       RT_TRACE(COMP_QOS,"SetHwReg8190pci(): [HW_VAR_ACM_CTRL] Write 0x%X\n", AcmCtrl) ;
+                                       write_nic_byte(dev, AcmHwCtrl, AcmCtrl);
                                }
                        }
                        priv->bcurrent_turbo_EDCA = false;
@@ -2753,7 +2756,7 @@ dm_CheckEdcaTurbo_EXIT:
        lastRxOkCnt = priv->stats.rxbytesunicast;
 }      // dm_CheckEdcaTurbo
 
-extern void DM_CTSToSelfSetting(struct net_device * dev,u32 DM_Type, u32 DM_Value)
+extern void DM_CTSToSelfSetting(struct net_device *dev,u32 DM_Type, u32 DM_Value)
 {
        struct r8192_priv *priv = ieee80211_priv((struct net_device *)dev);
 
@@ -2773,7 +2776,7 @@ extern void DM_CTSToSelfSetting(struct net_device * dev,u32 DM_Type, u32 DM_Valu
        }
 }
 
-static void dm_init_ctstoself(struct net_device * dev)
+static void dm_init_ctstoself(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv((struct net_device *)dev);
 
@@ -2837,7 +2840,7 @@ static void dm_ctstoself(struct net_device *dev)
  *     05/28/2008      amy             Create Version 0 porting from windows code.
  *
  *---------------------------------------------------------------------------*/
-static void dm_check_rfctrl_gpio(struct net_device * dev)
+static void dm_check_rfctrl_gpio(struct net_device *dev)
 {
        //struct r8192_priv *priv = ieee80211_priv(dev);
 
@@ -2881,7 +2884,7 @@ static    void    dm_check_pbc_gpio(struct net_device *dev)
        u8 tmp1byte;
 
 
-       tmp1byte = read_nic_byte(dev,GPI);
+       read_nic_byte(dev, GPI, &tmp1byte);
        if(tmp1byte == 0xff)
                return;
 
@@ -2933,18 +2936,18 @@ extern  void    dm_gpio_change_rf_callback(struct work_struct *work)
                {
                        // 0x108 GPIO input register is read only
                        //set 0x108 B1= 1: RF-ON; 0: RF-OFF.
-                       tmp1byte = read_nic_byte(dev,GPI);
+                       read_nic_byte(dev, GPI, &tmp1byte);
 
                        eRfPowerStateToSet = (tmp1byte&BIT1) ?  eRfOn : eRfOff;
 
-                       if( (priv->bHwRadioOff == true) && (eRfPowerStateToSet == eRfOn))
+                       if((priv->bHwRadioOff == true) && (eRfPowerStateToSet == eRfOn))
                        {
                                RT_TRACE(COMP_RF, "gpiochangeRF  - HW Radio ON\n");
 
                                priv->bHwRadioOff = false;
                                bActuallySet = true;
                        }
-                       else if ( (priv->bHwRadioOff == false) && (eRfPowerStateToSet == eRfOff))
+                       else if ((priv->bHwRadioOff == false) && (eRfPowerStateToSet == eRfOff))
                        {
                                RT_TRACE(COMP_RF, "gpiochangeRF  - HW Radio OFF\n");
                                priv->bHwRadioOff = true;
@@ -2996,7 +2999,7 @@ extern    void    dm_rf_pathcheck_workitemcallback(struct work_struct *work)
 
        /* 2008/01/30 MH After discussing with SD3 Jerry, 0xc04/0xd04 register will
           always be the same. We only read 0xc04 now. */
-       rfpath = read_nic_byte(dev, 0xc04);
+       read_nic_byte(dev, 0xc04, &rfpath);
 
        // Check Bit 0-3, it means if RF A-D is enabled.
        for (i = 0; i < RF90_PATH_MAX; i++)
@@ -3012,7 +3015,7 @@ extern    void    dm_rf_pathcheck_workitemcallback(struct work_struct *work)
        dm_rxpath_sel_byrssi(dev);
 }      /* DM_RFPathCheckWorkItemCallBack */
 
-static void dm_init_rxpath_selection(struct net_device * dev)
+static void dm_init_rxpath_selection(struct net_device *dev)
 {
        u8 i;
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -3033,7 +3036,7 @@ static void dm_init_rxpath_selection(struct net_device * dev)
        }
 }
 
-static void dm_rxpath_sel_byrssi(struct net_device * dev)
+static void dm_rxpath_sel_byrssi(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8                              i, max_rssi_index=0, min_rssi_index=0, sec_rssi_index=0, rf_num=0;
@@ -3052,12 +3055,13 @@ static void dm_rxpath_sel_byrssi(struct net_device * dev)
 
        if(!cck_Rx_Path_initialized)
        {
-               DM_RxPathSelTable.cck_Rx_path = (read_nic_byte(dev, 0xa07)&0xf);
+               read_nic_byte(dev, 0xa07, &DM_RxPathSelTable.cck_Rx_path);
+               DM_RxPathSelTable.cck_Rx_path &= 0xf;
                cck_Rx_Path_initialized = 1;
        }
 
-       DM_RxPathSelTable.disabledRF = 0xf;
-       DM_RxPathSelTable.disabledRF &=~ (read_nic_byte(dev, 0xc04));
+       read_nic_byte(dev, 0xc04, &DM_RxPathSelTable.disabledRF);
+       DM_RxPathSelTable.disabledRF = ~DM_RxPathSelTable.disabledRF & 0xf;
 
        if(priv->ieee80211->mode == WIRELESS_MODE_B)
        {
@@ -3356,7 +3360,7 @@ extern void dm_fsync_timer_callback(unsigned long data)
        bool            bSwitchFromCountDiff = false;
        bool            bDoubleTimeInterval = false;
 
-       if(     priv->ieee80211->state == IEEE80211_LINKED &&
+       if(priv->ieee80211->state == IEEE80211_LINKED &&
                priv->ieee80211->bfsync_enable &&
                (priv->ieee80211->pHTInfo->IOTAction & HT_IOT_ACT_CDD_FSYNC))
        {
@@ -3576,12 +3580,12 @@ void dm_check_fsync(struct net_device *dev)
        RT_TRACE(COMP_HALDM, "RSSI %d TimeInterval %d MultipleTimeInterval %d\n", priv->ieee80211->fsync_rssi_threshold, priv->ieee80211->fsync_time_interval, priv->ieee80211->fsync_multiple_timeinterval);
        RT_TRACE(COMP_HALDM, "RateBitmap 0x%x FirstDiffRateThreshold %d SecondDiffRateThreshold %d\n", priv->ieee80211->fsync_rate_bitmap, priv->ieee80211->fsync_firstdiff_ratethreshold, priv->ieee80211->fsync_seconddiff_ratethreshold);
 
-       if(     priv->ieee80211->state == IEEE80211_LINKED &&
+       if(priv->ieee80211->state == IEEE80211_LINKED &&
                (priv->ieee80211->pHTInfo->IOTAction & HT_IOT_ACT_CDD_FSYNC))
        {
                if(priv->ieee80211->bfsync_enable == 0)
                {
-                       switch(priv->ieee80211->fsync_state)
+                       switch (priv->ieee80211->fsync_state)
                        {
                                case Default_Fsync:
                                        dm_StartHWFsync(dev);
@@ -3599,7 +3603,7 @@ void dm_check_fsync(struct net_device *dev)
                }
                else
                {
-                       switch(priv->ieee80211->fsync_state)
+                       switch (priv->ieee80211->fsync_state)
                        {
                                case Default_Fsync:
                                        dm_StartSWFsync(dev);
@@ -3632,7 +3636,7 @@ void dm_check_fsync(struct net_device *dev)
        }
        else
        {
-               switch(priv->ieee80211->fsync_state)
+               switch (priv->ieee80211->fsync_state)
                {
                        case HW_Fsync:
                                dm_EndHWFsync(dev);
@@ -3731,17 +3735,17 @@ extern void dm_shadow_init(struct net_device *dev)
        for (page = 0; page < 5; page++)
                for (offset = 0; offset < 256; offset++)
                {
-                       dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+                       read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
                        //DbgPrint("P-%d/O-%02x=%02x\r\n", page, offset, DM_Shadow[page][offset]);
                }
 
        for (page = 8; page < 11; page++)
                for (offset = 0; offset < 256; offset++)
-                       dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+                       read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
 
        for (page = 12; page < 15; page++)
                for (offset = 0; offset < 256; offset++)
-                       dm_shadow[page][offset] = read_nic_byte(dev, offset+page*256);
+                       read_nic_byte(dev, offset+page*256, &dm_shadow[page][offset]);
 
 }   /* dm_shadow_init */
 
@@ -3787,7 +3791,7 @@ static void dm_dynamic_txpower(struct net_device *dev)
                return;
        }
        //printk("priv->ieee80211->current_network.unknown_cap_exist is %d ,priv->ieee80211->current_network.broadcom_cap_exist is %d\n",priv->ieee80211->current_network.unknown_cap_exist,priv->ieee80211->current_network.broadcom_cap_exist);
-       if((priv->ieee80211->current_network.atheros_cap_exist ) && (priv->ieee80211->mode == IEEE_G)){
+       if((priv->ieee80211->current_network.atheros_cap_exist) && (priv->ieee80211->mode == IEEE_G)){
                txhipower_threshhold = TX_POWER_ATHEROAP_THRESH_HIGH;
                txlowpower_threshold = TX_POWER_ATHEROAP_THRESH_LOW;
        }
@@ -3832,8 +3836,8 @@ static void dm_dynamic_txpower(struct net_device *dev)
                priv->bDynamicTxLowPower = false;
        }
 
-       if( (priv->bDynamicTxHighPower != priv->bLastDTPFlag_High ) ||
-               (priv->bDynamicTxLowPower != priv->bLastDTPFlag_Low ) )
+       if((priv->bDynamicTxHighPower != priv->bLastDTPFlag_High) ||
+               (priv->bDynamicTxLowPower != priv->bLastDTPFlag_Low))
        {
                RT_TRACE(COMP_TXAGC,"SetTxPowerLevel8190()  channel = %d \n" , priv->ieee80211->current_network.channel);
 
@@ -3852,20 +3856,20 @@ static void dm_dynamic_txpower(struct net_device *dev)
 }      /* dm_dynamic_txpower */
 
 //added by vivi, for read tx rate and retrycount
-static void dm_check_txrateandretrycount(struct net_device * dev)
+static void dm_check_txrateandretrycount(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct ieee80211_device *ieee = priv->ieee80211;
        //for 11n tx rate
 //     priv->stats.CurrentShowTxate = read_nic_byte(dev, Current_Tx_Rate_Reg);
-       ieee->softmac_stats.CurrentShowTxate = read_nic_byte(dev, Current_Tx_Rate_Reg);
+       read_nic_byte(dev, Current_Tx_Rate_Reg, &ieee->softmac_stats.CurrentShowTxate);
        //printk("=============>tx_rate_reg:%x\n", ieee->softmac_stats.CurrentShowTxate);
        //for initial tx rate
 //     priv->stats.last_packet_rate = read_nic_byte(dev, Initial_Tx_Rate_Reg);
-       ieee->softmac_stats.last_packet_rate = read_nic_byte(dev ,Initial_Tx_Rate_Reg);
+       read_nic_byte(dev, Initial_Tx_Rate_Reg, &ieee->softmac_stats.last_packet_rate);
        //for tx tx retry count
 //     priv->stats.txretrycount = read_nic_dword(dev, Tx_Retry_Count_Reg);
-       ieee->softmac_stats.txretrycount = read_nic_dword(dev, Tx_Retry_Count_Reg);
+       read_nic_dword(dev, Tx_Retry_Count_Reg, &ieee->softmac_stats.txretrycount);
 }
 
 static void dm_send_rssi_tofw(struct net_device *dev)
@@ -3882,7 +3886,7 @@ static void dm_send_rssi_tofw(struct net_device *dev)
        tx_cmd.Length   = 4;
        tx_cmd.Value            = priv->undecorated_smoothed_pwdb;
 
-       cmpk_message_handle_tx(dev, (u8*)&tx_cmd,
+       cmpk_message_handle_tx(dev, (u8 *)&tx_cmd,
                                                                DESC_PACKET_TYPE_INIT, sizeof(DCMD_TXCMD_T));
 }
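Most of the churn in this file is one mechanical conversion: read_nic_byte()/read_nic_word()/read_nic_dword() now return a status and deliver the register value through an out-parameter, so a failed USB control transfer can be detected instead of masquerading as register data. One hunk is a genuine bug fix rather than style: in dm_restore_dynamic_mechanism_state(), `ratr_value &= (RATE_ALL_OFDM_2SS)` kept only the spatial-stream-2 rates, while the comment says 1T2R hardware should have them disabled; the added `~` inverts the mask as intended. A minimal userspace sketch of the out-parameter convention, with hypothetical names standing in for the USB plumbing:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for the USB control transfer; it can fail. */
	static int usb_read_reg(uint16_t addr, uint8_t *val)
	{
		if (addr > 0x0fff)
			return -EINVAL;	/* simulate an I/O error */
		*val = 0x42;		/* fake register contents */
		return 0;
	}

	/* New-style read: status is the return value, data arrives via *data. */
	static int read_nic_byte_sketch(uint16_t addr, uint8_t *data)
	{
		return usb_read_reg(addr, data);
	}

	int main(void)
	{
		uint8_t v;

		if (read_nic_byte_sketch(0x1ba, &v))
			fprintf(stderr, "register read failed\n");
		else
			printf("reg 0x1ba = 0x%02x\n", v);
		return 0;
	}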
 
index 15b0423356f8705597204c2cb2470cdb14b5999f..7e612aa56fa4c806d376eb1c830c39a885e6e245 100644
@@ -388,10 +388,11 @@ enum _RTL8192Usb_HW {
 #define EPROM_CMD_NORMAL 0
 #define EPROM_CMD_LOAD 1
 #define EPROM_CMD_PROGRAM 2
-#define EPROM_CS_SHIFT 3
-#define EPROM_CK_SHIFT 2
-#define EPROM_W_SHIFT 1
-#define EPROM_R_SHIFT 0
+#define EPROM_CS_BIT BIT(3)
+#define EPROM_CK_BIT BIT(2)
+#define EPROM_W_BIT  BIT(1)
+#define EPROM_R_BIT  BIT(0)
+
        MAC0                    = 0x000,
        MAC1                    = 0x001,
        MAC2                    = 0x002,
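The EPROM_*_SHIFT constants replaced above were bare bit positions that every caller had to expand into a mask by hand; the new EPROM_*_BIT macros are ready-made masks built with the kernel's BIT() helper, which expands to (1UL << (n)). A small standalone sketch of the difference, with BIT() redefined locally so it compiles outside the kernel tree:

	#include <stdio.h>

	#define BIT(n)		(1UL << (n))	/* local copy of the kernel helper */

	#define EPROM_CS_SHIFT	3		/* old style: a bit position */
	#define EPROM_CS_BIT	BIT(3)		/* new style: a ready-made mask */

	int main(void)
	{
		unsigned long reg = 0;

		reg |= 1UL << EPROM_CS_SHIFT;	/* old callers shift at every use */
		reg |= EPROM_CS_BIT;		/* new callers just OR in the mask */
		printf("reg = %#lx\n", reg);
		return 0;
	}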
index c904aa8cc0a605028695c86b2c09ca9dbccd6b58..3e2576347d29ee73df1d22a7d42e0e5cd36b8624 100644
@@ -144,7 +144,7 @@ static int r8192_wx_read_regs(struct net_device *dev,
        down(&priv->wx_sem);
 
 
-       get_user(addr,(u8*)wrqu->data.pointer);
+       get_user(addr,(u8 *)wrqu->data.pointer);
        data1 = read_rtl8225(dev, addr);
        wrqu->data.length = data1;
 
@@ -162,7 +162,7 @@ static int r8192_wx_write_regs(struct net_device *dev,
 
        down(&priv->wx_sem);
 
-       get_user(addr, (u8*)wrqu->data.pointer);
+       get_user(addr, (u8 *)wrqu->data.pointer);
        write_rtl8225(dev, addr, wrqu->data.length);
 
        up(&priv->wx_sem);
@@ -199,7 +199,7 @@ static int r8192_wx_write_bb(struct net_device *dev,
 
        down(&priv->wx_sem);
 
-       get_user(databb, (u8*)wrqu->data.pointer);
+       get_user(databb, (u8 *)wrqu->data.pointer);
        rtl8187_write_phy(dev, wrqu->data.length, databb);
 
        up(&priv->wx_sem);
@@ -217,7 +217,7 @@ static int r8192_wx_write_nicb(struct net_device *dev,
 
        down(&priv->wx_sem);
 
-       get_user(addr, (u32*)wrqu->data.pointer);
+       get_user(addr, (u32 *)wrqu->data.pointer);
        write_nic_byte(dev, addr, wrqu->data.length);
 
        up(&priv->wx_sem);
@@ -234,8 +234,8 @@ static int r8192_wx_read_nicb(struct net_device *dev,
 
        down(&priv->wx_sem);
 
-       get_user(addr,(u32*)wrqu->data.pointer);
-       data1 = read_nic_byte(dev, addr);
+       get_user(addr,(u32 *)wrqu->data.pointer);
+       read_nic_byte(dev, addr, &data1);
        wrqu->data.length = data1;
 
        up(&priv->wx_sem);
@@ -254,12 +254,12 @@ static int r8192_wx_get_ap_status(struct net_device *dev,
        down(&priv->wx_sem);
 
        //count the length of input ssid
-       for(name_len=0 ; ((char*)wrqu->data.pointer)[name_len]!='\0' ; name_len++);
+       for(name_len=0 ; ((char *)wrqu->data.pointer)[name_len]!='\0' ; name_len++);
 
        //search for the corresponding info which is received
        list_for_each_entry(target, &ieee->network_list, list) {
                if ( (target->ssid_len == name_len) &&
-                    (strncmp(target->ssid, (char*)wrqu->data.pointer, name_len)==0)){
+                    (strncmp(target->ssid, (char *)wrqu->data.pointer, name_len)==0)){
                        if(target->wpa_ie_len>0 || target->rsn_ie_len>0 )
                                //set flags=1 to indicate this ap is WPA
                                wrqu->data.flags = 1;
@@ -380,7 +380,7 @@ static int rtl8180_wx_get_range(struct net_device *dev,
                                union iwreq_data *wrqu, char *extra)
 {
        struct iw_range *range = (struct iw_range *)extra;
-       struct iw_range_with_scan_capa* tmp = (struct iw_range_with_scan_capa*)range;
+       struct iw_range_with_scan_capa *tmp = (struct iw_range_with_scan_capa *)range;
        struct r8192_priv *priv = ieee80211_priv(dev);
        u16 val;
        int i;
@@ -483,7 +483,7 @@ static int r8192_wx_set_scan(struct net_device *dev, struct iw_request_info *a,
                             union iwreq_data *wrqu, char *b)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct ieee80211_device *ieee = priv->ieee80211;
        int ret = 0;
 
        if(!priv->up) return -ENETDOWN;
@@ -492,7 +492,7 @@ static int r8192_wx_set_scan(struct net_device *dev, struct iw_request_info *a,
                return -EAGAIN;
        if (wrqu->data.flags & IW_SCAN_THIS_ESSID)
        {
-               struct iw_scan_req* req = (struct iw_scan_req*)b;
+               struct iw_scan_req *req = (struct iw_scan_req *)b;
                if (req->essid_len)
                {
                        //printk("==**&*&*&**===>scan set ssid:%s\n", req->essid);
@@ -709,7 +709,7 @@ static int r8192_wx_set_enc(struct net_device *dev,
                #define CONF_WEP40  0x4
                #define CONF_WEP104 0x14
 
-               switch(wrqu->encoding.flags & IW_ENCODE_INDEX){
+               switch (wrqu->encoding.flags & IW_ENCODE_INDEX){
                case 0: key_idx = ieee->tx_keyidx; break;
                case 1: key_idx = 0; break;
                case 2: key_idx = 1; break;
@@ -757,7 +757,7 @@ static int r8192_wx_set_scan_type(struct net_device *dev, struct iw_request_info
  iwreq_data *wrqu, char *p){
 
        struct r8192_priv *priv = ieee80211_priv(dev);
-       int *parms=(int*)p;
+       int *parms=(int *)p;
        int mode=parms[0];
 
        priv->ieee80211->active_scan = mode;
@@ -891,7 +891,7 @@ static int r8192_wx_set_enc_ext(struct net_device *dev,
 {
        int ret=0;
        struct r8192_priv *priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
+       struct ieee80211_device *ieee = priv->ieee80211;
        //printk("===>%s()\n", __FUNCTION__);
 
 
@@ -922,7 +922,7 @@ static int r8192_wx_set_enc_ext(struct net_device *dev,
                        ieee->pairwise_key_type = alg;
                        EnableHWSecurityConfig8192(dev);
                }
-               memcpy((u8*)key, ext->key, 16); //we only get 16 bytes key.why? WB 2008.7.1
+               memcpy((u8 *)key, ext->key, 16); //we only get 16 bytes key.why? WB 2008.7.1
 
                if ((alg & KEY_TYPE_WEP40) && (ieee->auth_mode !=2) )
                {
@@ -952,7 +952,7 @@ static int r8192_wx_set_enc_ext(struct net_device *dev,
                                        4,//EntryNo
                                        idx, //KeyIndex
                                        alg,  //KeyType
-                                       (u8*)ieee->ap_mac_addr, //MacAddr
+                                       (u8 *)ieee->ap_mac_addr, //MacAddr
                                        0,              //DefaultKey
                                        key);           //KeyContent
                }
@@ -1180,8 +1180,8 @@ static iw_handler r8192_private_handler[] = {
 struct iw_statistics *r8192_get_wireless_stats(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       struct ieee80211_device* ieee = priv->ieee80211;
-       struct iw_statistics* wstats = &priv->wstats;
+       struct ieee80211_device *ieee = priv->ieee80211;
+       struct iw_statistics *wstats = &priv->wstats;
        int tmp_level = 0;
        int tmp_qual = 0;
        int tmp_noise = 0;
index b755eb46341fd519205d42eb96df3dd702e7bb8b..6810766edfcf19fa9f8d5578c1ae229536f9fdae 100644
@@ -41,7 +41,7 @@
 rt_status
 SendTxCommandPacket(
        struct net_device *dev,
-       void*                   pData,
+       void                    *pData,
        u32                             DataLen
        )
 {
@@ -57,7 +57,7 @@ SendTxCommandPacket(
        //Get TCB and local buffer from common pool. (It is shared by CmdQ, MgntQ, and USB coalesce DataQ)
        skb  = dev_alloc_skb(USB_HWDESC_HEADER_LEN + DataLen + 4);
        memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-       tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+       tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
        tcb_desc->queue_index = TXCMD_QUEUE;
        tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_NORMAL;
        tcb_desc->bLastIniPkt = 0;
@@ -66,7 +66,7 @@ SendTxCommandPacket(
        memcpy(ptr_buf,pData,DataLen);
        tcb_desc->txbuf_size= (u16)DataLen;
 
-       if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+       if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
                        (!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
                        (priv->ieee80211->queue_stop) ) {
                        RT_TRACE(COMP_FIRMWARE,"===================NULL packet==================================> tx full!\n");
@@ -101,7 +101,7 @@ SendTxCommandPacket(
  *---------------------------------------------------------------------------*/
  extern        rt_status       cmpk_message_handle_tx(
        struct net_device *dev,
-       u8*     codevirtualaddress,
+       u8      *codevirtualaddress,
        u32     packettype,
        u32     buffer_len)
 {
@@ -126,7 +126,7 @@ SendTxCommandPacket(
        //Fragmentation might be required
        frag_threshold = pfirmware->cmdpacket_frag_thresold;
        do {
-               if((buffer_len - frag_offset) > frag_threshold) {
+               if ((buffer_len - frag_offset) > frag_threshold) {
                        frag_length = frag_threshold ;
                        bLastIniPkt = 0;
 
@@ -145,7 +145,7 @@ SendTxCommandPacket(
                skb  = dev_alloc_skb(frag_length + 4);
                #endif
                memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-               tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+               tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
                tcb_desc->queue_index = TXCMD_QUEUE;
                tcb_desc->bCmdOrInit = packettype;
                tcb_desc->bLastIniPkt = bLastIniPkt;
@@ -163,7 +163,7 @@ SendTxCommandPacket(
                tcb_desc->txbuf_size= (u16)buffer_len;
 
 
-               if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+               if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
                        (!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
                        (priv->ieee80211->queue_stop) ) {
                        RT_TRACE(COMP_FIRMWARE,"=====================================================> tx full!\n");
@@ -221,7 +221,7 @@ cmpk_count_txstatistic(
 #endif
 
 #ifdef TODO
-       if(pAdapter->bInHctTest)
+       if (pAdapter->bInHctTest)
                return;
 #endif
        /* We can not know the packet length and transmit type: broadcast or uni
@@ -303,7 +303,7 @@ cmpk_count_txstatistic(
 static void
 cmpk_handle_tx_feedback(
        struct net_device *dev,
-       u8      *       pmsg)
+       u8      *pmsg)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        cmpk_txfb_t             rx_tx_fb;       /* */
@@ -319,7 +319,7 @@ cmpk_handle_tx_feedback(
           endian type before copy the message copy. */
        /* 2007/07/05 MH Use pointer to transfer structure memory. */
        //memcpy((UINT8 *)&rx_tx_fb, pMsg, sizeof(CMPK_TXFB_T));
-       memcpy((u8*)&rx_tx_fb, pmsg, sizeof(cmpk_txfb_t));
+       memcpy((u8 *)&rx_tx_fb, pmsg, sizeof(cmpk_txfb_t));
        /* 2. Use tx feedback info to count TX statistics. */
        cmpk_count_txstatistic(dev, &rx_tx_fb);
        /* 2007/01/17 MH Comment previous method for TX statistic function. */
@@ -341,7 +341,7 @@ cmdpkt_beacontimerinterrupt_819xusb(
                //
                // 070117, rcnjko: 87B have to S/W beacon for DTM encryption_cmn.
                //
-               if(priv->ieee80211->current_network.mode == IEEE_A  ||
+               if (priv->ieee80211->current_network.mode == IEEE_A  ||
                        priv->ieee80211->current_network.mode == IEEE_N_5G ||
                        (priv->ieee80211->current_network.mode == IEEE_N_24G  && (!priv->ieee80211->pHTInfo->bCurSuppCCK)))
                {
@@ -386,7 +386,7 @@ cmdpkt_beacontimerinterrupt_819xusb(
 static void
 cmpk_handle_interrupt_status(
        struct net_device *dev,
-       u8*     pmsg)
+       u8      *pmsg)
 {
        cmpk_intr_sta_t         rx_intr_status; /* */
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -411,7 +411,7 @@ cmpk_handle_interrupt_status(
 
 
        // Statistics of beacon for ad-hoc mode.
-       if(    priv->ieee80211->iw_mode == IW_MODE_ADHOC)
+       if (    priv->ieee80211->iw_mode == IW_MODE_ADHOC)
        {
                //2 maybe need endian transform?
                rx_intr_status.interrupt_status = *((u32 *)(pmsg + 4));
@@ -467,7 +467,7 @@ cmpk_handle_interrupt_status(
 static void
 cmpk_handle_query_config_rx(
        struct net_device *dev,
-       u8*        pmsg)
+       u8         *pmsg)
 {
        cmpk_query_cfg_t        rx_query_cfg;   /* */
 
@@ -580,11 +580,11 @@ static    void    cmpk_count_tx_status(   struct net_device *dev,
 static void
 cmpk_handle_tx_status(
        struct net_device *dev,
-       u8*        pmsg)
+       u8         *pmsg)
 {
        cmpk_tx_status_t        rx_tx_sts;      /* */
 
-       memcpy((void*)&rx_tx_sts, (void*)pmsg, sizeof(cmpk_tx_status_t));
+       memcpy((void *)&rx_tx_sts, (void *)pmsg, sizeof(cmpk_tx_status_t));
        /* 2. Use tx feedback info to count TX statistics. */
        cmpk_count_tx_status(dev, &rx_tx_sts);
 
@@ -610,7 +610,7 @@ cmpk_handle_tx_status(
 static void
 cmpk_handle_tx_rate_history(
        struct net_device *dev,
-       u8*        pmsg)
+       u8         *pmsg)
 {
        cmpk_tx_rahis_t *ptxrate;
 //     RT_RF_POWER_STATE       rtState;
@@ -727,12 +727,12 @@ cmpk_message_handle_rx(
              element type. Because FW may aggregate RX command packet to minimize
              transmit time between DRV and FW.*/
        // Add a counter to prevent the lock in the loop from being held too long
-       while (total_length > 0 || exe_cnt++ >100)
+       while (total_length > 0 && exe_cnt++ < 100)
        {
                /* 2007/01/17 MH We support aggregation of different cmd in the same packet. */
                element_id = pcmd_buff[0];
 
-               switch(element_id)
+               switch (element_id)
                {
                        case RX_TX_FEEDBACK:
                                cmpk_handle_tx_feedback (dev, pcmd_buff);
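The loop-guard change in cmpk_message_handle_rx() deserves a closer look: in the old `while (total_length > 0 || exe_cnt++ > 100)`, the `||` short-circuits whenever data remains, so exe_cnt never advanced and the bound promised by the comment above the loop was dead code. Rewriting it as `&&` with `< 100` turns the counter into a real cap on how long the lock is held. A tiny standalone sketch of the corrected guard:

	#include <stdio.h>

	int main(void)
	{
		int remaining = 5;	/* aggregated command data left to parse */
		int passes = 0;

		/* With || the bound never engaged; with && it caps the loop. */
		while (remaining > 0 && passes++ < 100)
			remaining--;	/* consume one command element */

		printf("finished after %d passes\n", passes);
		return 0;
	}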
index 59caa4e05323c949e0c32b73b49f1506c1ca9203..ebe403270a5bb5ee7b7efc8c04e50d9a07995ea4 100644
@@ -192,10 +192,10 @@ typedef enum _rt_status{
        RT_STATUS_RESOURCE
 }rt_status,*prt_status;
 
-extern rt_status cmpk_message_handle_tx(struct net_device *dev, u8* codevirtualaddress, u32 packettype, u32 buffer_len);
+extern rt_status cmpk_message_handle_tx(struct net_device *dev, u8 *codevirtualaddress, u32 packettype, u32 buffer_len);
 
-extern  u32 cmpk_message_handle_rx(struct net_device *dev, struct ieee80211_rx_stats * pstats);
-extern rt_status SendTxCommandPacket( struct net_device *dev, void* pData, u32 DataLen);
+extern  u32 cmpk_message_handle_rx(struct net_device *dev, struct ieee80211_rx_stats *pstats);
+extern rt_status SendTxCommandPacket( struct net_device *dev, void *pData, u32 DataLen);
 
 
 #endif
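The prototype fixes above (u8 *codevirtualaddress, void *pData, and the rest) follow the kernel coding-style rule that the * binds to the declarator rather than the type: `u8* a, b;` reads as if both variables were pointers when only `a` is. A short illustration:

	#include <stdio.h>

	int main(void)
	{
		int n = 7;
		/* The * belongs to the name: a is a pointer, b is a plain int,
		 * which the misleading spelling "int* a, b;" would obscure. */
		int *a = &n, b = n;

		printf("%d %d\n", *a, b);
		return 0;
	}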
index 573e9cd68509dc16b7bf0ae274a7a58ef54b1138..bb924ac97e471d9e5613666a18b6f8acb6c0a151 100644
@@ -48,7 +48,7 @@ bool fw_download_code(struct net_device *dev, u8 *code_virtual_address, u32 buff
        //Fragmentation might be required
        frag_threshold = pfirmware->cmdpacket_frag_thresold;
        do {
-               if((buffer_len - frag_offset) > frag_threshold) {
+               if ((buffer_len - frag_offset) > frag_threshold) {
                        frag_length = frag_threshold ;
                        bLastIniPkt = 0;
 
@@ -67,7 +67,7 @@ bool fw_download_code(struct net_device *dev, u8 *code_virtual_address, u32 buff
                skb  = dev_alloc_skb(frag_length + 4);
                #endif
                memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-               tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+               tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
                tcb_desc->queue_index = TXCMD_QUEUE;
                tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_INIT;
                tcb_desc->bLastIniPkt = bLastIniPkt;
@@ -89,7 +89,7 @@ bool fw_download_code(struct net_device *dev, u8 *code_virtual_address, u32 buff
                tcb_desc->txbuf_size= (u16)i;
                skb_put(skb, i);
 
-               if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+               if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
                        (!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
                        (priv->ieee80211->queue_stop) ) {
                        RT_TRACE(COMP_FIRMWARE,"=====================================================> tx full!\n");
@@ -125,7 +125,7 @@ fwSendNullPacket(
        //Get TCB and local buffer from common pool. (It is shared by CmdQ, MgntQ, and USB coalesce DataQ)
        skb  = dev_alloc_skb(Length+ 4);
        memcpy((unsigned char *)(skb->cb),&dev,sizeof(dev));
-       tcb_desc = (cb_desc*)(skb->cb + MAX_DEV_ADDR_SIZE);
+       tcb_desc = (cb_desc *)(skb->cb + MAX_DEV_ADDR_SIZE);
        tcb_desc->queue_index = TXCMD_QUEUE;
        tcb_desc->bCmdOrInit = DESC_PACKET_TYPE_INIT;
        tcb_desc->bLastIniPkt = bLastInitPacket;
@@ -133,7 +133,7 @@ fwSendNullPacket(
        memset(ptr_buf,0,Length);
        tcb_desc->txbuf_size= (u16)Length;
 
-       if(!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
+       if (!priv->ieee80211->check_nic_enough_desc(dev,tcb_desc->queue_index)||
                        (!skb_queue_empty(&priv->ieee80211->skb_waitQ[tcb_desc->queue_index]))||\
                        (priv->ieee80211->queue_stop) ) {
                        RT_TRACE(COMP_FIRMWARE,"===================NULL packet==================================> tx full!\n");
@@ -168,14 +168,14 @@ bool CPUcheck_maincodeok_turnonCPU(struct net_device *dev)
 
        /* Check whether put code OK */
        do {
-               CPU_status = read_nic_dword(dev, CPU_GEN);
+               read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-               if(CPU_status&CPU_GEN_PUT_CODE_OK)
+               if (CPU_status&CPU_GEN_PUT_CODE_OK)
                        break;
 
        }while(check_putcodeOK_time--);
 
-       if(!(CPU_status&CPU_GEN_PUT_CODE_OK)) {
+       if (!(CPU_status&CPU_GEN_PUT_CODE_OK)) {
                RT_TRACE(COMP_ERR, "Download Firmware: Put code fail!\n");
                goto CPUCheckMainCodeOKAndTurnOnCPU_Fail;
        } else {
@@ -183,19 +183,19 @@ bool CPUcheck_maincodeok_turnonCPU(struct net_device *dev)
        }
 
        /* Turn On CPU */
-       CPU_status = read_nic_dword(dev, CPU_GEN);
+       read_nic_dword(dev, CPU_GEN, &CPU_status);
        write_nic_byte(dev, CPU_GEN, (u8)((CPU_status|CPU_GEN_PWR_STB_CPU)&0xff));
        mdelay(1000);
 
        /* Check whether CPU boot OK */
        do {
-               CPU_status = read_nic_dword(dev, CPU_GEN);
+               read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-               if(CPU_status&CPU_GEN_BOOT_RDY)
+               if (CPU_status&CPU_GEN_BOOT_RDY)
                        break;
        }while(check_bootOk_time--);
 
-       if(!(CPU_status&CPU_GEN_BOOT_RDY)) {
+       if (!(CPU_status&CPU_GEN_BOOT_RDY)) {
                goto CPUCheckMainCodeOKAndTurnOnCPU_Fail;
        } else {
                RT_TRACE(COMP_FIRMWARE, "Download Firmware: Boot ready!\n");
@@ -218,14 +218,14 @@ bool CPUcheck_firmware_ready(struct net_device *dev)
 
        /* Check Firmware Ready */
        do {
-               CPU_status = read_nic_dword(dev, CPU_GEN);
+               read_nic_dword(dev, CPU_GEN, &CPU_status);
 
-               if(CPU_status&CPU_GEN_FIRM_RDY)
+               if (CPU_status&CPU_GEN_FIRM_RDY)
                        break;
 
        }while(check_time--);
 
-       if(!(CPU_status&CPU_GEN_FIRM_RDY))
+       if (!(CPU_status&CPU_GEN_FIRM_RDY))
                goto CPUCheckFirmwareReady_Fail;
        else
                RT_TRACE(COMP_FIRMWARE, "Download Firmware: Firmware ready!\n");
@@ -265,7 +265,7 @@ bool init_firmware(struct net_device *dev)
                starting_state = FW_INIT_STEP0_BOOT;
                // TODO: system reset
 
-       }else if(pfirmware->firmware_status == FW_STATUS_5_READY) {
+       }else if (pfirmware->firmware_status == FW_STATUS_5_READY) {
                /* it is called by Initialize */
                rst_opt = OPT_FIRMWARE_RESET;
                starting_state = FW_INIT_STEP2_DATA;
@@ -282,19 +282,19 @@ bool init_firmware(struct net_device *dev)
                 * Open image file, and map file to continuous memory if open file success.
                 * or read image file from array. Default load from IMG file
                 */
-               if(rst_opt == OPT_SYSTEM_RESET) {
+               if (rst_opt == OPT_SYSTEM_RESET) {
                        rc = request_firmware(&fw_entry, fw_name[init_step],&priv->udev->dev);
-                       if(rc < 0 ) {
+                       if (rc < 0 ) {
                                RT_TRACE(COMP_ERR, "request firmware fail!\n");
                                goto download_firmware_fail;
                        }
 
-                       if(fw_entry->size > sizeof(pfirmware->firmware_buf)) {
+                       if (fw_entry->size > sizeof(pfirmware->firmware_buf)) {
                                RT_TRACE(COMP_ERR, "img file size exceed the container buffer fail!\n");
                                goto download_firmware_fail;
                        }
 
-                       if(init_step != FW_INIT_STEP1_MAIN) {
+                       if (init_step != FW_INIT_STEP1_MAIN) {
                                memcpy(pfirmware->firmware_buf,fw_entry->data,fw_entry->size);
                                mapped_file = pfirmware->firmware_buf;
                                file_length = fw_entry->size;
@@ -311,7 +311,7 @@ bool init_firmware(struct net_device *dev)
 #endif
                        }
                        pfirmware->firmware_buf_size = file_length;
-               }else if(rst_opt == OPT_FIRMWARE_RESET ) {
+               }else if (rst_opt == OPT_FIRMWARE_RESET ) {
                        /* we only need to download data.img here */
                        mapped_file = pfirmware->firmware_buf;
                        file_length = pfirmware->firmware_buf_size;
@@ -325,15 +325,15 @@ bool init_firmware(struct net_device *dev)
                 *   and Tx descriptor info
                 * */
                rt_status = fw_download_code(dev,mapped_file,file_length);
-               if(rst_opt == OPT_SYSTEM_RESET) {
+               if (rst_opt == OPT_SYSTEM_RESET) {
                        release_firmware(fw_entry);
                }
 
-               if(rt_status != TRUE) {
+               if (rt_status != TRUE) {
                        goto download_firmware_fail;
                }
 
-               switch(init_step) {
+               switch (init_step) {
                case FW_INIT_STEP0_BOOT:
                        /* Download boot
                         * initialize command descriptor.
@@ -343,7 +343,7 @@ bool init_firmware(struct net_device *dev)
 #ifdef RTL8190P
                        // To initialize IMEM, CPU move code  from 0x80000080, hence, we send 0x80 byte packet
                        rt_status = fwSendNullPacket(dev, RTL8190_CPU_START_OFFSET);
-                       if(rt_status != true)
+                       if (rt_status != true)
                        {
                                RT_TRACE(COMP_INIT, "fwSendNullPacket() fail ! \n");
                                goto  download_firmware_fail;
@@ -362,7 +362,7 @@ bool init_firmware(struct net_device *dev)
 
                        /* Check Put Code OK and Turn On CPU */
                        rt_status = CPUcheck_maincodeok_turnonCPU(dev);
-                       if(rt_status != TRUE) {
+                       if (rt_status != TRUE) {
                                RT_TRACE(COMP_ERR, "CPUcheck_maincodeok_turnonCPU fail!\n");
                                goto download_firmware_fail;
                        }
@@ -376,7 +376,7 @@ bool init_firmware(struct net_device *dev)
                        mdelay(1);
 
                        rt_status = CPUcheck_firmware_ready(dev);
-                       if(rt_status != TRUE) {
+                       if (rt_status != TRUE) {
                                RT_TRACE(COMP_ERR, "CPUcheck_firmware_ready fail(%d)!\n",rt_status);
                                goto download_firmware_fail;
                        }
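The firmware bring-up in this file polls CPU_GEN status bits (put-code OK, boot ready, firmware ready) in bounded do/while loops, now through the status-returning read_nic_dword(). A minimal userspace sketch of the same bounded-poll shape, with a hypothetical read_reg() standing in for the register access:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical register read; the ready bit comes up on the 3rd call. */
	static int read_reg(uint32_t *val)
	{
		static int calls;

		*val = (++calls >= 3) ? 0x1 : 0x0;
		return 0;	/* 0 = success, like the new read_nic_dword() */
	}

	/* Poll until `mask` is set or the retry budget runs out. */
	static bool poll_bit(uint32_t mask, int tries)
	{
		uint32_t status;

		do {
			if (read_reg(&status))
				return false;	/* I/O error: bail out */
			if (status & mask)
				return true;
		} while (tries-- > 0);
		return false;
	}

	int main(void)
	{
		printf("ready: %s\n", poll_bit(0x1, 10) ? "yes" : "no");
		return 0;
	}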
index 17fac41c12d9a81eb7b9513406742b54b664d981..d66d88cc189eb20726314915b152e1a6deabbac5 100644
@@ -58,7 +58,7 @@ u32 rtl8192_CalculateBitShift(u32 dwBitMask)
  *  output:  none
  *  return:  0(illegal, false), 1(legal,true)
  * ***************************************************************************/
-u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device* dev, u32 eRFPath)
+u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device *dev, u32 eRFPath)
 {
        u8 ret = 1;
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -83,14 +83,14 @@ u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device* dev, u32 eRFPath)
  *  return:  none
  *  notice:
  * ****************************************************************************/
-void rtl8192_setBBreg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData)
+void rtl8192_setBBreg(struct net_device *dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData)
 {
 
        u32 OriginalValue, BitShift, NewValue;
 
        if(dwBitMask!= bMaskDWord)
        {//if not "double word" write
-               OriginalValue = read_nic_dword(dev, dwRegAddr);
+               read_nic_dword(dev, dwRegAddr, &OriginalValue);
                BitShift = rtl8192_CalculateBitShift(dwBitMask);
                NewValue = (((OriginalValue) & (~dwBitMask)) | (dwData << BitShift));
                write_nic_dword(dev, dwRegAddr, NewValue);
@@ -107,19 +107,19 @@ void rtl8192_setBBreg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask, u32
  *  return:  u32       Data    //the readback register value
  *  notice:
  * ****************************************************************************/
-u32 rtl8192_QueryBBReg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask)
+u32 rtl8192_QueryBBReg(struct net_device *dev, u32 dwRegAddr, u32 dwBitMask)
 {
        u32 Ret = 0, OriginalValue, BitShift;
 
-       OriginalValue = read_nic_dword(dev, dwRegAddr);
+       read_nic_dword(dev, dwRegAddr, &OriginalValue);
        BitShift = rtl8192_CalculateBitShift(dwBitMask);
        Ret =(OriginalValue & dwBitMask) >> BitShift;
 
        return (Ret);
 }
-static  u32 phy_FwRFSerialRead( struct net_device* dev, RF90_RADIO_PATH_E       eRFPath, u32 Offset  );
+static  u32 phy_FwRFSerialRead( struct net_device *dev, RF90_RADIO_PATH_E       eRFPath, u32 Offset  );
 
-static void phy_FwRFSerialWrite( struct net_device* dev, RF90_RADIO_PATH_E       eRFPath, u32  Offset, u32  Data);
+static void phy_FwRFSerialWrite( struct net_device *dev, RF90_RADIO_PATH_E       eRFPath, u32  Offset, u32  Data);
 
 /******************************************************************************
  *function:  This function read register from RF chip
@@ -130,12 +130,12 @@ static void phy_FwRFSerialWrite( struct net_device* dev, RF90_RADIO_PATH_E
  *  return:  u32       readback value
  *  notice:  There are three types of serial operations:(1) Software serial write.(2)Hardware LSSI-Low Speed Serial Interface.(3)Hardware HSSI-High speed serial write. Driver here need to implement (1) and (2)---need more spec for this information.
  * ****************************************************************************/
-u32 rtl8192_phy_RFSerialRead(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset)
+u32 rtl8192_phy_RFSerialRead(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 Offset)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u32 ret = 0;
        u32 NewOffset = 0;
-       BB_REGISTER_DEFINITION_T* pPhyReg = &priv->PHYRegDef[eRFPath];
+       BB_REGISTER_DEFINITION_T *pPhyReg = &priv->PHYRegDef[eRFPath];
        rtl8192_setBBreg(dev, pPhyReg->rfLSSIReadBack, bLSSIReadBackData, 0);
        //make sure RF register offset is correct
        Offset &= 0x3f;
@@ -218,7 +218,7 @@ u32 rtl8192_phy_RFSerialRead(struct net_device* dev, RF90_RADIO_PATH_E eRFPath,
  * Reg_Mode2   1               1                       Reg 31 ~ 45(0x1 ~ 0xf)
  *------------------------------------------------------------------
  * ****************************************************************************/
-void rtl8192_phy_RFSerialWrite(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset, u32 Data)
+void rtl8192_phy_RFSerialWrite(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 Offset, u32 Data)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u32 DataAndAddr = 0, NewOffset = 0;
@@ -291,7 +291,7 @@ void rtl8192_phy_RFSerialWrite(struct net_device* dev, RF90_RADIO_PATH_E eRFPath
  *  return:  none
  *  notice:
  * ****************************************************************************/
-void rtl8192_phy_SetRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data)
+void rtl8192_phy_SetRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u32 Original_Value, BitShift, New_Value;
@@ -339,7 +339,7 @@ void rtl8192_phy_SetRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32
  *  return:  u32       Data    //the readback register value
  *  notice:
  * ****************************************************************************/
-u32 rtl8192_phy_QueryRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask)
+u32 rtl8192_phy_QueryRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask)
 {
        u32 Original_Value, Readback_Value, BitShift;
        struct r8192_priv *priv = ieee80211_priv(dev);
@@ -372,13 +372,14 @@ u32 rtl8192_phy_QueryRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u3
  * ***************************************************************************/
 static u32
 phy_FwRFSerialRead(
-       struct net_device* dev,
+       struct net_device *dev,
        RF90_RADIO_PATH_E       eRFPath,
        u32                             Offset  )
 {
        u32             retValue = 0;
        u32             Data = 0;
        u8              time = 0;
+       u32             tmp;
        //DbgPrint("FW RF CTRL\n\r");
        /* 2007/11/02 MH Firmware RF Write control. By Francis' suggestion, we can
           not execute the scheme in the initial step. Otherwise, RF-R/W will waste
@@ -394,13 +395,15 @@ phy_FwRFSerialRead(
        // 5. Trigger Fw to operate the command. bit 31
        Data |= 0x80000000;
        // 6. We can not execute read operation if bit 31 is 1.
-       while (read_nic_dword(dev, QPNR)&0x80000000)
+       read_nic_dword(dev, QPNR, &tmp);
+       while (tmp & 0x80000000)
        {
                // If FW can not finish RF-R/W for more than ?? times. We must reset FW.
                if (time++ < 100)
                {
                        //DbgPrint("FW not finish RF-R Time=%d\n\r", time);
                        udelay(10);
+                       read_nic_dword(dev, QPNR, &tmp);
                }
                else
                        break;
@@ -408,18 +411,20 @@ phy_FwRFSerialRead(
        // 7. Execute read operation.
        write_nic_dword(dev, QPNR, Data);
        // 8. Check if firmawre send back RF content.
-       while (read_nic_dword(dev, QPNR)&0x80000000)
+       read_nic_dword(dev, QPNR, &tmp);
+       while (tmp & 0x80000000)
        {
                // If FW can not finish RF-R/W for more than ?? times. We must reset FW.
                if (time++ < 100)
                {
                        //DbgPrint("FW not finish RF-W Time=%d\n\r", time);
                        udelay(10);
+                       read_nic_dword(dev, QPNR, &tmp);
                }
                else
                        return  (0);
        }
-       retValue = read_nic_dword(dev, RF_DATA);
+       read_nic_dword(dev, RF_DATA, &retValue);
 
        return  (retValue);
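The conversion above is mechanical but repetitive: read_nic_dword() now reports status and returns the value through a pointer, so each former while (read(...) & bit) loop gains a priming read plus a re-read inside the body. The same QPNR busy-wait appears three times in this file; a helper that would fold the pattern into one place (a sketch, not part of the patch):

	/* Wait for QPNR bit 31 to clear; gives up after ~100 polls of 10 us. */
	static bool qpnr_wait_idle(struct net_device *dev)
	{
		u32 tmp;
		u8 time = 0;

		read_nic_dword(dev, QPNR, &tmp);
		while (tmp & 0x80000000) {
			if (time++ >= 100)
				return false;
			udelay(10);
			read_nic_dword(dev, QPNR, &tmp);
		}
		return true;
	}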
 
@@ -434,12 +439,13 @@ phy_FwRFSerialRead(
  * ***************************************************************************/
 static void
 phy_FwRFSerialWrite(
-               struct net_device* dev,
+               struct net_device *dev,
                RF90_RADIO_PATH_E       eRFPath,
                u32                             Offset,
                u32                             Data    )
 {
        u8      time = 0;
+       u32     tmp;
 
        //DbgPrint("N FW RF CTRL RF-%d OF%02x DATA=%03x\n\r", eRFPath, Offset, Data);
        /* 2007/11/02 MH Firmware RF Write control. By Francis' suggestion, we can
@@ -458,13 +464,15 @@ phy_FwRFSerialWrite(
        Data |= 0x80000000;
 
        // 6. Write operation. We can not write if bit 31 is 1.
-       while (read_nic_dword(dev, QPNR)&0x80000000)
+       read_nic_dword(dev, QPNR, &tmp);
+       while (tmp & 0x80000000)
        {
                // If FW can not finish RF-R/W for more than ?? times. We must reset FW.
                if (time++ < 100)
                {
                        //DbgPrint("FW not finish RF-W Time=%d\n\r", time);
                        udelay(10);
+                       read_nic_dword(dev, QPNR, &tmp);
                }
                else
                        break;
@@ -489,10 +497,10 @@ phy_FwRFSerialWrite(
  *  notice:  BB parameters may change all the time, so please make
  *           sure it has been synced with the newest.
  * ***************************************************************************/
-void rtl8192_phy_configmac(struct net_device* dev)
+void rtl8192_phy_configmac(struct net_device *dev)
 {
        u32 dwArrayLen = 0, i;
-       u32* pdwArray = NULL;
+       u32 *pdwArray = NULL;
        struct r8192_priv *priv = ieee80211_priv(dev);
 
        if(priv->btxpowerdata_readfromEEPORM)
@@ -533,7 +541,7 @@ void rtl8192_phy_configmac(struct net_device* dev)
  *           sure it has been synced with the newest.
  * ***************************************************************************/
 
-void rtl8192_phyConfigBB(struct net_device* dev, u8 ConfigType)
+void rtl8192_phyConfigBB(struct net_device *dev, u8 ConfigType)
 {
        u32 i;
 
@@ -575,7 +583,7 @@ void rtl8192_phyConfigBB(struct net_device* dev, u8 ConfigType)
  *  return:  none
  *  notice:  Initialization value here is constant and it should never be changed
  * ***************************************************************************/
-void rtl8192_InitBBRFRegDef(struct net_device* dev)
+void rtl8192_InitBBRFRegDef(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 // RF Interface Software Control
@@ -690,7 +698,7 @@ void rtl8192_InitBBRFRegDef(struct net_device* dev)
  *  return:  return whether BB and RF is ok(0:OK; 1:Fail)
  *  notice:  This function may be removed in the ASIC
  * ***************************************************************************/
-u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath)
+u8 rtl8192_phy_checkBBAndRF(struct net_device *dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath)
 {
 //     struct r8192_priv *priv = ieee80211_priv(dev);
 //     BB_REGISTER_DEFINITION_T *pPhyReg = &priv->PHYRegDef[eRFPath];
@@ -710,7 +718,7 @@ u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF9
                //
                // Write Data to register and readback
                //
-               switch(CheckBlock)
+               switch (CheckBlock)
                {
                case HW90_BLOCK_MAC:
                        RT_TRACE(COMP_ERR, "PHY_CheckBBRFOK(): Never Write 0x100 here!");
@@ -719,7 +727,7 @@ u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF9
                case HW90_BLOCK_PHY0:
                case HW90_BLOCK_PHY1:
                        write_nic_dword(dev, WriteAddr[CheckBlock], WriteData[i]);
-                       dwRegRead = read_nic_dword(dev, WriteAddr[CheckBlock]);
+                       read_nic_dword(dev, WriteAddr[CheckBlock], &dwRegRead);
                        break;
 
                case HW90_BLOCK_RF:
@@ -760,7 +768,7 @@ u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF9
  *  notice:  Initialization value may change all the time, so please make
  *           sure it has been synced with the newest.
  * ***************************************************************************/
-void rtl8192_BB_Config_ParaFile(struct net_device* dev)
+void rtl8192_BB_Config_ParaFile(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8 bRegValue = 0, eCheckItem = 0, rtStatus = 0;
@@ -770,11 +778,11 @@ void rtl8192_BB_Config_ParaFile(struct net_device* dev)
        **************************************/
 
        /*--set BB Global Reset--*/
-       bRegValue = read_nic_byte(dev, BB_GLOBAL_RESET);
+       read_nic_byte(dev, BB_GLOBAL_RESET, &bRegValue);
        write_nic_byte(dev, BB_GLOBAL_RESET,(bRegValue|BB_GLOBAL_RESET_BIT));
        mdelay(50);
        /*---set BB reset Active---*/
-       dwRegValue = read_nic_dword(dev, CPU_GEN);
+       read_nic_dword(dev, CPU_GEN, &dwRegValue);
        write_nic_dword(dev, CPU_GEN, (dwRegValue&(~CPU_GEN_BB_RST)));
 
        /*----Ckeck FPGAPHY0 and PHY1 board is OK----*/
@@ -795,7 +803,7 @@ void rtl8192_BB_Config_ParaFile(struct net_device* dev)
        rtl8192_phyConfigBB(dev, BaseBand_Config_PHY_REG);
 
        /*----Set BB reset de-Active----*/
-       dwRegValue = read_nic_dword(dev, CPU_GEN);
+       read_nic_dword(dev, CPU_GEN, &dwRegValue);
        write_nic_dword(dev, CPU_GEN, (dwRegValue|CPU_GEN_BB_RST));
 
        /*----BB AGC table Initialization----*/
@@ -828,7 +836,7 @@ void rtl8192_BB_Config_ParaFile(struct net_device* dev)
  *  notice:  Initialization value may change all the time, so please make
  *           sure it has been synced with the newest.
  * ***************************************************************************/
-void rtl8192_BBConfig(struct net_device* dev)
+void rtl8192_BBConfig(struct net_device *dev)
 {
        rtl8192_InitBBRFRegDef(dev);
        //config BB&RF. As hardCode based initialization has not been well
@@ -843,39 +851,35 @@ void rtl8192_BBConfig(struct net_device* dev)
  *  output:  none
  *  return:  none
  * ***************************************************************************/
-void rtl8192_phy_getTxPower(struct net_device* dev)
+void rtl8192_phy_getTxPower(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
-       priv->MCSTxPowerLevelOriginalOffset[0] =
-               read_nic_dword(dev, rTxAGC_Rate18_06);
-       priv->MCSTxPowerLevelOriginalOffset[1] =
-               read_nic_dword(dev, rTxAGC_Rate54_24);
-       priv->MCSTxPowerLevelOriginalOffset[2] =
-               read_nic_dword(dev, rTxAGC_Mcs03_Mcs00);
-       priv->MCSTxPowerLevelOriginalOffset[3] =
-               read_nic_dword(dev, rTxAGC_Mcs07_Mcs04);
-       priv->MCSTxPowerLevelOriginalOffset[4] =
-               read_nic_dword(dev, rTxAGC_Mcs11_Mcs08);
-       priv->MCSTxPowerLevelOriginalOffset[5] =
-               read_nic_dword(dev, rTxAGC_Mcs15_Mcs12);
+       u8 tmp;
+       read_nic_dword(dev, rTxAGC_Rate18_06, &priv->MCSTxPowerLevelOriginalOffset[0]);
+       read_nic_dword(dev, rTxAGC_Rate54_24, &priv->MCSTxPowerLevelOriginalOffset[1]);
+       read_nic_dword(dev, rTxAGC_Mcs03_Mcs00, &priv->MCSTxPowerLevelOriginalOffset[2]);
+       read_nic_dword(dev, rTxAGC_Mcs07_Mcs04, &priv->MCSTxPowerLevelOriginalOffset[3]);
+       read_nic_dword(dev, rTxAGC_Mcs11_Mcs08, &priv->MCSTxPowerLevelOriginalOffset[4]);
+       read_nic_dword(dev, rTxAGC_Mcs15_Mcs12, &priv->MCSTxPowerLevelOriginalOffset[5]);
 
        // read rx initial gain
-       priv->DefaultInitialGain[0] = read_nic_byte(dev, rOFDM0_XAAGCCore1);
-       priv->DefaultInitialGain[1] = read_nic_byte(dev, rOFDM0_XBAGCCore1);
-       priv->DefaultInitialGain[2] = read_nic_byte(dev, rOFDM0_XCAGCCore1);
-       priv->DefaultInitialGain[3] = read_nic_byte(dev, rOFDM0_XDAGCCore1);
+       read_nic_byte(dev, rOFDM0_XAAGCCore1, &priv->DefaultInitialGain[0]);
+       read_nic_byte(dev, rOFDM0_XBAGCCore1, &priv->DefaultInitialGain[1]);
+       read_nic_byte(dev, rOFDM0_XCAGCCore1, &priv->DefaultInitialGain[2]);
+       read_nic_byte(dev, rOFDM0_XDAGCCore1, &priv->DefaultInitialGain[3]);
        RT_TRACE(COMP_INIT, "Default initial gain (c50=0x%x, c58=0x%x, c60=0x%x, c68=0x%x) \n",
                priv->DefaultInitialGain[0], priv->DefaultInitialGain[1],
                priv->DefaultInitialGain[2], priv->DefaultInitialGain[3]);
 
        // read framesync
-       priv->framesync = read_nic_byte(dev, rOFDM0_RxDetector3);
-       priv->framesyncC34 = read_nic_byte(dev, rOFDM0_RxDetector2);
+       read_nic_byte(dev, rOFDM0_RxDetector3, &priv->framesync);
+       read_nic_byte(dev, rOFDM0_RxDetector2, &tmp);
+       priv->framesyncC34 = tmp;
        RT_TRACE(COMP_INIT, "Default framesync (0x%x) = 0x%x \n",
                rOFDM0_RxDetector3, priv->framesync);
 
        // read SIFS (save the value read fome MACPHY_REG.txt)
-       priv->SifsTime = read_nic_word(dev, SIFS);
+       read_nic_word(dev, SIFS, &priv->SifsTime);
 
        return;
 }
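One non-mechanical detail above: the byte read for framesyncC34 bounces through a local u8 tmp instead of passing the field's address straight to read_nic_byte(). That keeps the types honest, on the assumption that framesyncC34 is declared wider than u8: the out-parameter is a u8 *, so aiming it at a wider field would need a cast and would update only one byte of it. In isolation:

	u8 tmp;

	read_nic_byte(dev, rOFDM0_RxDetector2, &tmp);
	priv->framesyncC34 = tmp;    /* implicit zero-extension on assignment */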
@@ -886,13 +890,13 @@ void rtl8192_phy_getTxPower(struct net_device* dev)
  *  output:  none
  *  return:  none
  * ***************************************************************************/
-void rtl8192_phy_setTxPower(struct net_device* dev, u8 channel)
+void rtl8192_phy_setTxPower(struct net_device *dev, u8 channel)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        u8      powerlevel = priv->TxPowerLevelCCK[channel-1];
        u8      powerlevelOFDM24G = priv->TxPowerLevelOFDM24G[channel-1];
 
-       switch(priv->rf_chip)
+       switch (priv->rf_chip)
        {
        case RF_8256:
                PHY_SetRF8256CCKTxPower(dev, powerlevel); //need further implement
@@ -913,11 +917,11 @@ void rtl8192_phy_setTxPower(struct net_device* dev, u8 channel)
  *  output:  none
  *  return:  only 8256 is supported
  * ***************************************************************************/
-void rtl8192_phy_RFConfig(struct net_device* dev)
+void rtl8192_phy_RFConfig(struct net_device *dev)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 
-       switch(priv->rf_chip)
+       switch (priv->rf_chip)
        {
                case RF_8256:
                        PHY_RF8256_Config(dev);
@@ -937,7 +941,7 @@ void rtl8192_phy_RFConfig(struct net_device* dev)
  *  output:  none
  *  return:  As Windows has not implemented this, wait for complement
  * ***************************************************************************/
-void rtl8192_phy_updateInitGain(struct net_device* dev)
+void rtl8192_phy_updateInitGain(struct net_device *dev)
 {
        return;
 }
@@ -949,14 +953,14 @@ void rtl8192_phy_updateInitGain(struct net_device* dev)
  *  return:  return code show if RF configuration is successful(0:pass, 1:fail)
  *    Note:  Delay may be required for RF configuration
  * ***************************************************************************/
-u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device* dev, RF90_RADIO_PATH_E        eRFPath)
+u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device *dev, RF90_RADIO_PATH_E        eRFPath)
 {
 
        int i;
        //u32* pRFArray;
        u8 ret = 0;
 
-       switch(eRFPath){
+       switch (eRFPath){
        case RF90_PATH_A:
                for(i = 0;i<RadioA_ArrayLength; i=i+2){
 
@@ -1026,7 +1030,7 @@ void rtl8192_SetTxPowerLevel(struct net_device *dev, u8 channel)
        u8      powerlevel = priv->TxPowerLevelCCK[channel-1];
        u8      powerlevelOFDM24G = priv->TxPowerLevelOFDM24G[channel-1];
 
-       switch(priv->rf_chip)
+       switch (priv->rf_chip)
        {
        case RF_8225:
 #ifdef TO_DO_LIST
@@ -1071,10 +1075,10 @@ bool rtl8192_SetRFPowerState(struct net_device *dev, RT_RF_POWER_STATE eRFPowerS
 
        priv->SetRFPowerStateInProgress = true;
 
-       switch(priv->rf_chip)
+       switch (priv->rf_chip)
        {
                case RF_8256:
-               switch( eRFPowerState )
+               switch ( eRFPowerState )
                {
                        case eRfOn:
        //RF-A, RF-B
@@ -1129,10 +1133,10 @@ bool rtl8192_SetRFPowerState(struct net_device *dev, RT_RF_POWER_STATE eRFPowerS
        {
                // Update current RF state variable.
                pHalData->eRFPowerState = eRFPowerState;
-               switch(pHalData->RFChipID )
+               switch (pHalData->RFChipID )
                {
                        case RF_8256:
-               switch(pHalData->eRFPowerState)
+               switch (pHalData->eRFPowerState)
                                {
                                case eRfOff:
                                        //
@@ -1195,7 +1199,7 @@ bool rtl8192_SetRFPowerState(struct net_device *dev, RT_RF_POWER_STATE eRFPowerS
  *    Note:
  * ************************************************************************************/
 u8 rtl8192_phy_SetSwChnlCmdArray(
-       SwChnlCmd*              CmdTable,
+       SwChnlCmd               *CmdTable,
        u32                     CmdTableIdx,
        u32                     CmdTableSz,
        SwChnlCmdID             CmdID,
@@ -1204,7 +1208,7 @@ u8 rtl8192_phy_SetSwChnlCmdArray(
        u32                     msDelay
        )
 {
-       SwChnlCmd* pCmd;
+       SwChnlCmd *pCmd;
 
        if(CmdTable == NULL)
        {
@@ -1237,7 +1241,7 @@ u8 rtl8192_phy_SetSwChnlCmdArray(
  *  return:  true if finished, false otherwise
  *    Note:  Wait for simpler function to replace it //wb
  * ***************************************************************************/
-u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8* stage, u8* step, u32* delay)
+u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8 *stage, u8 *step, u32 *delay)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
 //     PCHANNEL_ACCESS_SETTING pChnlAccessSetting;
@@ -1283,7 +1287,7 @@ u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8* stage, u
 
                // <3> Fill up RF dependent command.
                RfDependCmdCnt = 0;
-               switch( priv->rf_chip )
+               switch ( priv->rf_chip )
                {
                case RF_8225:
                        if (!(channel >= 1 && channel <= 14))
@@ -1321,7 +1325,7 @@ u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8* stage, u
 
 
                do{
-                       switch(*stage)
+                       switch (*stage)
                        {
                        case 0:
                                CurrentCmd=&PreCommonCmd[*step];
@@ -1349,7 +1353,7 @@ u8 rtl8192_phy_SwChnlStepByStep(struct net_device *dev, u8 channel, u8* stage, u
                                }
                        }
 
-                       switch(CurrentCmd->CmdID)
+                       switch (CurrentCmd->CmdID)
                        {
                        case CmdID_SetTxPowerLevel:
                                if(priv->card_8192_version == (u8)VERSION_819xU_A) //xiong: consider it later!
@@ -1432,7 +1436,7 @@ void rtl8192_SwChnl_WorkItem(struct net_device *dev)
  *  return:  return code show if workitem is scheduled(1:pass, 0:fail)
  *    Note:  Delay may be required for RF configuration
  * ***************************************************************************/
-u8 rtl8192_phy_SwChnl(struct net_device* dev, u8 channel)
+u8 rtl8192_phy_SwChnl(struct net_device *dev, u8 channel)
 {
        struct r8192_priv *priv = ieee80211_priv(dev);
        RT_TRACE(COMP_CH, "=====>%s(), SwChnlInProgress:%d\n", __FUNCTION__, priv->SwChnlInProgress);
@@ -1452,7 +1456,7 @@ if (0) //to test current channel from RF reg 0x7.
        }
 }
        //--------------------------------------------
-       switch(priv->ieee80211->mode)
+       switch (priv->ieee80211->mode)
        {
        case WIRELESS_MODE_A:
        case WIRELESS_MODE_N_5G:
@@ -1515,7 +1519,7 @@ void rtl8192_SetBWModeWorkItem(struct net_device *dev)
        u8 regBwOpMode;
 
        RT_TRACE(COMP_SWBW, "==>rtl8192_SetBWModeWorkItem()  Switch to %s bandwidth\n", \
-                                       priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20?"20MHz":"40MHz")
+                priv->CurrentChannelBW == HT_CHANNEL_WIDTH_20?"20MHz":"40MHz");
 
 
        if(priv->rf_chip == RF_PSEUDO_11N)
@@ -1525,9 +1529,9 @@ void rtl8192_SetBWModeWorkItem(struct net_device *dev)
        }
 
        //<1>Set MAC register
-       regBwOpMode = read_nic_byte(dev, BW_OPMODE);
+       read_nic_byte(dev, BW_OPMODE, &regBwOpMode);
 
-       switch(priv->CurrentChannelBW)
+       switch (priv->CurrentChannelBW)
        {
                case HT_CHANNEL_WIDTH_20:
                        regBwOpMode |= BW_OPMODE_20MHZ;
@@ -1547,7 +1551,7 @@ void rtl8192_SetBWModeWorkItem(struct net_device *dev)
        }
 
        //<2>Set PHY related register
-       switch(priv->CurrentChannelBW)
+       switch (priv->CurrentChannelBW)
        {
                case HT_CHANNEL_WIDTH_20:
                        // Add by Vivi 20071119
@@ -1617,7 +1621,7 @@ void rtl8192_SetBWModeWorkItem(struct net_device *dev)
        //Skip over setting of J-mode in BB register here. Default value is "None J mode". Emily 20070315
 
        //<3>Set RF related register
-       switch( priv->rf_chip )
+       switch ( priv->rf_chip )
        {
                case RF_8225:
 #ifdef TO_DO_LIST
@@ -1704,7 +1708,7 @@ extern void InitialGainOperateWorkItemCallBack(struct work_struct *work)
 
        Operation = priv->InitialGainOperateType;
 
-       switch(Operation)
+       switch (Operation)
        {
                case IG_Backup:
                        RT_TRACE(COMP_SCAN, "IG_Backup, backup the initial gain.\n");
index 3e3bc577e6c3dae233f9a0f2691bf05d38ea5d0e..bf76d5993473be7a611aee3ea64f8bcba7100642 100644 (file)
@@ -58,26 +58,26 @@ typedef enum _RF90_RADIO_PATH{
 #define bMaskDWord                0xffffffff
 
 //extern u32 rtl8192_CalculateBitShift(u32 dwBitMask);
-extern u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device* dev, u32 eRFPath);
-extern void rtl8192_setBBreg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData);
-extern u32 rtl8192_QueryBBReg(struct net_device* dev, u32 dwRegAddr, u32 dwBitMask);
+extern u8 rtl8192_phy_CheckIsLegalRFPath(struct net_device *dev, u32 eRFPath);
+extern void rtl8192_setBBreg(struct net_device *dev, u32 dwRegAddr, u32 dwBitMask, u32 dwData);
+extern u32 rtl8192_QueryBBReg(struct net_device *dev, u32 dwRegAddr, u32 dwBitMask);
 //extern u32 rtl8192_phy_RFSerialRead(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset);
 //extern void rtl8192_phy_RFSerialWrite(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 Offset, u32 Data);
-extern void rtl8192_phy_SetRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data);
-extern u32 rtl8192_phy_QueryRFReg(struct net_device* dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask);
-extern void rtl8192_phy_configmac(struct net_device* dev);
-extern void rtl8192_phyConfigBB(struct net_device* dev, u8 ConfigType);
+extern void rtl8192_phy_SetRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask, u32 Data);
+extern u32 rtl8192_phy_QueryRFReg(struct net_device *dev, RF90_RADIO_PATH_E eRFPath, u32 RegAddr, u32 BitMask);
+extern void rtl8192_phy_configmac(struct net_device *dev);
+extern void rtl8192_phyConfigBB(struct net_device *dev, u8 ConfigType);
 //extern void rtl8192_InitBBRFRegDef(struct net_device* dev);
-extern u8 rtl8192_phy_checkBBAndRF(struct net_device* dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath);
+extern u8 rtl8192_phy_checkBBAndRF(struct net_device *dev, HW90_BLOCK_E CheckBlock, RF90_RADIO_PATH_E eRFPath);
 //extern void rtl8192_BB_Config_ParaFile(struct net_device* dev);
-extern void rtl8192_BBConfig(struct net_device* dev);
-extern void rtl8192_phy_getTxPower(struct net_device* dev);
-extern void rtl8192_phy_setTxPower(struct net_device* dev, u8 channel);
-extern void rtl8192_phy_RFConfig(struct net_device* dev);
-extern void rtl8192_phy_updateInitGain(struct net_device* dev);
-extern u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device* dev, RF90_RADIO_PATH_E eRFPath);
+extern void rtl8192_BBConfig(struct net_device *dev);
+extern void rtl8192_phy_getTxPower(struct net_device *dev);
+extern void rtl8192_phy_setTxPower(struct net_device *dev, u8 channel);
+extern void rtl8192_phy_RFConfig(struct net_device *dev);
+extern void rtl8192_phy_updateInitGain(struct net_device *dev);
+extern u8 rtl8192_phy_ConfigRFWithHeaderFile(struct net_device *dev, RF90_RADIO_PATH_E eRFPath);
 
-extern u8 rtl8192_phy_SwChnl(struct net_device* dev, u8 channel);
+extern u8 rtl8192_phy_SwChnl(struct net_device *dev, u8 channel);
 extern void rtl8192_SetBWMode(struct net_device *dev, HT_CHANNEL_WIDTH Bandwidth, HT_EXTCHNL_OFFSET Offset);
 extern void rtl8192_SwChnl_WorkItem(struct net_device *dev);
 void rtl8192_SetBWModeWorkItem(struct net_device *dev);
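The pointer-declaration churn through these rtl8192 files is the kernel's checkpatch style rule: the asterisk binds to the identifier, not the type, so it is written against the name. A two-line illustration of why:

	int *p, q;     /* preferred: visibly, p is a pointer and q is an int */
	int* p, q;     /* misleading: the spacing suggests both are pointers */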
index 89e4d805a345ce0c4849b0dcfdfab20cc70d0b03..c172f4ae7c2387c719f0c17146d279cc055efaf2 100644 (file)
@@ -635,12 +635,12 @@ int rts51x_get_epc_status(struct rts51x_chip *chip, u16 *status)
        ep = chip->usb->pusb_dev->ep_in[usb_pipeendpoint(pipe)];
 
        /* fill and submit the URB */
-       /* We set interval to 1 here, so the polling interval is controlled
-        * by our polling thread */
+       /* Set interval to 10 here to match the endpoint descriptor,
+        * the polling interval is controlled by the polling thread */
        usb_fill_int_urb(chip->usb->intr_urb, chip->usb->pusb_dev, pipe,
-                        status, 2, urb_done_completion, &urb_done, 1);
+                        status, 2, urb_done_completion, &urb_done, 10);
 
-       result = rts51x_msg_common(chip, chip->usb->intr_urb, 50);
+       result = rts51x_msg_common(chip, chip->usb->intr_urb, 100);
 
        return interpret_urb_result(chip, pipe, 2, result,
                                    chip->usb->intr_urb->actual_length);
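For context on the rts51x change: the final argument of usb_fill_int_urb() is the endpoint polling interval (units depend on device speed), and the updated comment says it now matches the endpoint descriptor's value rather than a hard-coded 1; the message timeout doubles to 100 ms to leave headroom for the slower poll. The call's shape, annotated (all names are the driver's own):

	usb_fill_int_urb(chip->usb->intr_urb,   /* URB being initialized    */
			 chip->usb->pusb_dev,   /* target USB device        */
			 pipe,                  /* interrupt-IN pipe        */
			 status, 2,             /* 2-byte status buffer     */
			 urb_done_completion,   /* completion handler       */
			 &urb_done,             /* completion context       */
			 10);                   /* endpoint polling interval */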
index cd94f6c2772319bc289ead11aebb1e34c26955ef..23db32f07fd533ac4803b9a0a56815a5a7606eea 100644 (file)
@@ -18,11 +18,11 @@ static struct irq_info irq_lists[NR_IRQS];
 static _INLINE_ unsigned int serial_in(struct mp_port *mtpt, int offset);
 static _INLINE_ void serial_out(struct mp_port *mtpt, int offset, int value);
 static _INLINE_ unsigned int read_option_register(struct mp_port *mtpt, int offset);
-static int sb1054_get_register(struct sb_uart_port * port, int page, int reg);
-static int sb1054_set_register(struct sb_uart_port * port, int page, int reg, int value);
-static void SendATCommand(struct mp_port * mtpt);
-static int set_deep_fifo(struct sb_uart_port * port, int status);
-static int get_deep_fifo(struct sb_uart_port * port);
+static int sb1054_get_register(struct sb_uart_port *port, int page, int reg);
+static int sb1054_set_register(struct sb_uart_port *port, int page, int reg, int value);
+static void SendATCommand(struct mp_port *mtpt);
+static int set_deep_fifo(struct sb_uart_port *port, int status);
+static int get_deep_fifo(struct sb_uart_port *port);
 static int get_device_type(int arg);
 static int set_auto_rts(struct sb_uart_port *port, int status);
 static void mp_stop(struct tty_struct *tty);
@@ -38,7 +38,7 @@ static inline int __mp_put_char(struct sb_uart_port *port, struct circ_buf *circ
 static int mp_put_char(struct tty_struct *tty, unsigned char ch);
 
 static void mp_put_chars(struct tty_struct *tty);
-static int mp_write(struct tty_struct *tty, const unsigned char * buf, int count);
+static int mp_write(struct tty_struct *tty, const unsigned char *buf, int count);
 static int mp_write_room(struct tty_struct *tty);
 static int mp_chars_in_buffer(struct tty_struct *tty);
 static void mp_flush_buffer(struct tty_struct *tty);
@@ -102,7 +102,7 @@ static void multi_release_port(struct sb_uart_port *port);
 static int multi_request_port(struct sb_uart_port *port);
 static void multi_config_port(struct sb_uart_port *port, int flags);
 static int multi_verify_port(struct sb_uart_port *port, struct serial_struct *ser);
-static const char * multi_type(struct sb_uart_port *port);
+static const char *multi_type(struct sb_uart_port *port);
 static void __init multi_init_ports(void);
 static void __init multi_register_ports(struct uart_driver *drv);
 static int init_mp_dev(struct pci_dev *pcidev, mppcibrd_t brd);
@@ -173,7 +173,7 @@ static int sb1053a_get_interface(struct mp_port *mtpt, int port_num)
        return (interface);
 }
                
-static int sb1054_get_register(struct sb_uart_port * port, int page, int reg)
+static int sb1054_get_register(struct sb_uart_port *port, int page, int reg)
 {
        int ret = 0;
        unsigned int lcr = 0;
@@ -235,7 +235,7 @@ static int sb1054_get_register(struct sb_uart_port * port, int page, int reg)
        return ret;
 }
 
-static int sb1054_set_register(struct sb_uart_port * port, int page, int reg, int value)
+static int sb1054_set_register(struct sb_uart_port *port, int page, int reg, int value)
 {  
        int lcr = 0;
        int mcr = 0;
@@ -332,7 +332,7 @@ static int set_multidrop_addr(struct sb_uart_port *port, unsigned int addr)
        return 0;
 }
 
-static void SendATCommand(struct mp_port * mtpt)
+static void SendATCommand(struct mp_port *mtpt)
 {
        //                    a    t    cr   lf
        unsigned char ch[] = {0x61,0x74,0x0d,0x0a,0x0};
@@ -360,7 +360,7 @@ static void SendATCommand(struct mp_port * mtpt)
 
 }// end of SendATCommand()
 
-static int set_deep_fifo(struct sb_uart_port * port, int status)
+static int set_deep_fifo(struct sb_uart_port *port, int status)
 {
        int afr_status = 0;
        afr_status = sb1054_get_register(port, PAGE_4, SB105X_AFR);
@@ -416,7 +416,7 @@ static int get_device_type(int arg)
         }
 
 }
-static int get_deep_fifo(struct sb_uart_port * port)
+static int get_deep_fifo(struct sb_uart_port *port)
 {
        int afr_status = 0;
        afr_status = sb1054_get_register(port, PAGE_4, SB105X_AFR);
@@ -638,7 +638,7 @@ static void mp_put_chars(struct tty_struct *tty)
        mp_start(tty);
 }
 
-static int mp_write(struct tty_struct *tty, const unsigned char * buf, int count)
+static int mp_write(struct tty_struct *tty, const unsigned char *buf, int count)
 {
        struct sb_uart_state *state = tty->driver_data;
        struct sb_uart_port *port;
@@ -2754,7 +2754,7 @@ static int multi_verify_port(struct sb_uart_port *port, struct serial_struct *se
        return 0;
 }
 
-static const char * multi_type(struct sb_uart_port *port)
+static const char *multi_type(struct sb_uart_port *port)
 {
        int type = port->type;
 
@@ -2800,7 +2800,7 @@ static void __init multi_init_ports(void)
        int i,j,k;
        unsigned char osc;
        unsigned char b_ret = 0;
-       static struct mp_device_t * sbdev; 
+       static struct mp_device_t *sbdev; 
 
        if (!first)
                return;
@@ -2918,10 +2918,10 @@ static int pci_remap_base(struct pci_dev *pcidev, unsigned int offset,
 
 static int init_mp_dev(struct pci_dev *pcidev, mppcibrd_t brd)
 {
-       static struct mp_device_t * sbdev = mp_devs;
+       static struct mp_device_t *sbdev = mp_devs;
        unsigned long addr = 0;
        int j;
-       struct resource * ret = NULL;
+       struct resource *ret = NULL;
 
        sbdev->device_id = brd.device_id;
        pci_read_config_byte(pcidev, PCI_CLASS_REVISION, &(sbdev->revision));
index a15f470a172807b3fcbd04ffd49462e6772496b2..11d92992e9251e0ebaefa7275dfa34384d6c7b95 100644 (file)
@@ -19,7 +19,6 @@
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/segment.h>
 #include <asm/serial.h>
 #include <linux/interrupt.h>
 
index b7e570ccb759dd4514a17206238b87dbe0299282..3082ba95c038d168b067b9bfd6fb1168c3e2fafa 100644 (file)
@@ -35,7 +35,7 @@
 #define BP_MOD_DESCR "Silicom Bypass-SD Control driver"
 #define BP_SYNC_FLAG 1
 
-static int major_num = 0;
+static int major_num;
 
 MODULE_AUTHOR("Anna Lukin, annal@silicom.co.il");
 MODULE_LICENSE("GPL");
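The dropped initializer here (and for device_num, the pbpctl_dev statics, and procfs_dir below) is the usual checkpatch cleanup: objects with static storage duration are zero-initialized by definition, so an explicit = 0 or = NULL is redundant.

	static int major_num = 0;      /* redundant; checkpatch warns */
	static int major_num;          /* same effect, preferred form */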
@@ -43,12 +43,12 @@ MODULE_DESCRIPTION(BP_MOD_DESCR);
 MODULE_VERSION(BP_MOD_VER);
 spinlock_t bpvm_lock;
 
-#define lock_bpctl()                                   \
+#define lock_bpctl()                                   \
 if (down_interruptible(&bpctl_sema)) {                 \
        return -ERESTARTSYS;                            \
 }                                                      \
 
-#define unlock_bpctl()                                         \
+#define unlock_bpctl()                                 \
        up(&bpctl_sema);
 
 /* Media Types */
@@ -112,7 +112,7 @@ typedef struct _bpctl_dev {
 static bpctl_dev_t *bpctl_dev_arr;
 
 static struct semaphore bpctl_sema;
-static int device_num = 0;
+static int device_num;
 
 static int get_dev_idx(int ifindex);
 static bpctl_dev_t *get_master_port_fn(bpctl_dev_t *pbpctl_dev);
@@ -134,7 +134,7 @@ static int bp_device_event(struct notifier_block *unused,
                           unsigned long event, void *ptr)
 {
        struct net_device *dev = ptr;
-       static bpctl_dev_t *pbpctl_dev = NULL, *pbpctl_dev_m = NULL;
+       static bpctl_dev_t *pbpctl_dev, *pbpctl_dev_m;
        int dev_num = 0, ret = 0, ret_d = 0, time_left = 0;
        /* printk("BP_PROC_SUPPORT event =%d %s %d\n", event,dev->name, dev->ifindex ); */
        /* return NOTIFY_DONE; */
@@ -165,7 +165,7 @@ static int bp_device_event(struct notifier_block *unused,
                        memcpy(&cbuf, drvinfo.bus_info, 32);
                        buf = &cbuf[0];
 
-                       while (*buf++ != ':') ;
+                       while (*buf++ != ':');
                        for (i = 0; i < 10; i++, buf++) {
                                if (*buf == ':')
                                        break;
@@ -1415,7 +1415,7 @@ static int wdt_pulse(bpctl_dev_t *pbpctl_dev)
                                 ~(BP10G_MCLK_DATA_OUT | BP10G_MDIO_DATA_OUT)));
        }
        if ((pbpctl_dev->wdt_status == WDT_STATUS_EN)   /*&&
-                                                          (pbpctl_dev->bp_ext_ver<PXG4BPFI_VER) */ )
+                                                          (pbpctl_dev->bp_ext_ver<PXG4BPFI_VER) */)
                pbpctl_dev->bypass_wdt_on_time = jiffies;
 #ifdef BP_SYNC_FLAG
        spin_unlock_irqrestore(&pbpctl_dev->bypass_wr_lock, flags);
@@ -2154,7 +2154,7 @@ static void bp75_release_phy(bpctl_dev_t *pbpctl_dev)
        if ((pbpctl_dev->func == 1) || (pbpctl_dev->func == 3))
                mask = BPCTLI_SWFW_PHY1_SM;
 
-       while (bp75_get_hw_semaphore_generic(pbpctl_dev) != 0) ;
+       while (bp75_get_hw_semaphore_generic(pbpctl_dev) != 0);
        /* Empty */
 
        swfw_sync = BPCTL_READ_REG(pbpctl_dev, SW_FW_SYNC);
@@ -4334,7 +4334,7 @@ int get_bypass_wd_auto(bpctl_dev_t *pbpctl_dev)
        return BP_NOT_CAP;
 }
 
-#ifdef  BP_SELF_TEST
+#ifdef BP_SELF_TEST
 
 int set_bp_self_test(bpctl_dev_t *pbpctl_dev, unsigned int param)
 {
@@ -5345,7 +5345,7 @@ static void if_scan_init(void)
                memcpy(&cbuf, drvinfo.bus_info, 32);
                buf = &cbuf[0];
 
-               while (*buf++ != ':') ;
+               while (*buf++ != ':');
                for (i = 0; i < 10; i++, buf++) {
                        if (*buf == ':')
                                break;
@@ -5438,9 +5438,9 @@ static long device_ioctl(struct file *file,       /* see include/linux/fs.h */
                return -1;
        }
 
-/*     preempt_disable();
+/*     preempt_disable();
        rcu_read_lock();
-       spin_lock_irqsave(&bpvm_lock, flags);
+       spin_lock_irqsave(&bpvm_lock, flags);
 */
        if ((bpctl_cmd.in_param[5]) ||
            (bpctl_cmd.in_param[6]) || (bpctl_cmd.in_param[7]))
@@ -5787,7 +5787,7 @@ static const struct file_operations Fops = {
 };
 
 #ifndef PCI_DEVICE
-#define PCI_DEVICE(vend,dev) \
+#define PCI_DEVICE(vend, dev) \
        .vendor = (vend), .device = (dev), \
        .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID
 #endif
@@ -7263,7 +7263,7 @@ static int show_bypass_slave(struct seq_file *m, void *v)
        if (!slave)
                slave = dev;
        if (!slave)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (slave->ndev)
                seq_printf(m, "%s\n", slave->ndev->name);
        return 0;
@@ -7275,7 +7275,7 @@ static int show_bypass_caps(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bypass_caps_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "-1\n");
+               seq_puts(m, "-1\n");
        else
                seq_printf(m, "0x%x\n", ret);
        return 0;
@@ -7287,7 +7287,7 @@ static int show_wd_set_caps(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_wd_set_caps_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "-1\n");
+               seq_puts(m, "-1\n");
        else
                seq_printf(m, "0x%x\n", ret);
        return 0;
@@ -7333,11 +7333,11 @@ static int show_bypass(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bypass_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(bypass)
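The seq_printf() -> seq_puts() conversions running through these procfs show handlers are one idiom applied everywhere: when the format string is a constant with no conversion specifiers, seq_puts() emits it directly and skips format parsing. The rule of thumb:

	seq_puts(m, "fail\n");           /* constant string: no formatting needed */
	seq_printf(m, "%d\n", timeout);  /* real conversion: seq_printf() stays   */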
@@ -7357,11 +7357,11 @@ static int show_tap(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_tap_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(tap)
@@ -7381,11 +7381,11 @@ static int show_disc(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_disc_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(disc)
@@ -7395,11 +7395,11 @@ static int show_bypass_change(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bypass_change_fn(dev);
        if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        return 0;
 }
 RO_FOPS(bypass_change)
@@ -7409,11 +7409,11 @@ static int show_tap_change(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_tap_change_fn(dev);
        if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        return 0;
 }
 RO_FOPS(tap_change)
@@ -7423,11 +7423,11 @@ static int show_disc_change(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_disc_change_fn(dev);
        if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        return 0;
 }
 RO_FOPS(disc_change)
@@ -7450,11 +7450,11 @@ static int show_bypass_wd(struct seq_file *m, void *v)
 
        ret = get_bypass_wd_fn(dev, &timeout);
        if (ret == BP_NOT_CAP)
-               seq_printf(m,  "fail\n");
+               seq_puts(m,  "fail\n");
        else if (timeout == -1)
-               seq_printf(m,  "unknown\n");
+               seq_puts(m,  "unknown\n");
        else if (timeout == 0)
-               seq_printf(m,  "disable\n");
+               seq_puts(m,  "disable\n");
        else
                seq_printf(m, "%d\n", timeout);
        return 0;
@@ -7467,11 +7467,11 @@ static int show_wd_expire_time(struct seq_file *m, void *v)
        int ret = 0, timeout = 0;
        ret = get_wd_expire_time_fn(dev, &timeout);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (timeout == -1)
-               seq_printf(m, "expire\n");
+               seq_puts(m, "expire\n");
        else if (timeout == 0)
-               seq_printf(m, "disable\n");
+               seq_puts(m, "disable\n");
        else
                seq_printf(m, "%d\n", timeout);
        return 0;
@@ -7494,11 +7494,11 @@ static int show_tpl(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_tpl_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(tpl)
@@ -7520,11 +7520,11 @@ static int show_wait_at_pwup(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bp_wait_at_pwup_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(wait_at_pwup)
@@ -7545,11 +7545,11 @@ static int show_hw_reset(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bp_hw_reset_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 1)
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        return 0;
 }
 RW_FOPS(hw_reset)
@@ -7561,11 +7561,11 @@ static int show_reset_bypass_wd(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = reset_bypass_wd_timer_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "disable\n");
+               seq_puts(m, "disable\n");
        else if (ret == 1)
-               seq_printf(m, "success\n");
+               seq_puts(m, "success\n");
        return 0;
 }
 RO_FOPS(reset_bypass_wd)
@@ -7585,11 +7585,11 @@ static int show_dis_bypass(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_dis_bypass_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(dis_bypass)
@@ -7609,11 +7609,11 @@ static int show_dis_tap(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_dis_tap_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(dis_tap)
@@ -7633,11 +7633,11 @@ static int show_dis_disc(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_dis_disc_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(dis_disc)
@@ -7657,11 +7657,11 @@ static int show_bypass_pwup(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bypass_pwup_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(bypass_pwup)
@@ -7681,11 +7681,11 @@ static int show_bypass_pwoff(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_bypass_pwoff_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(bypass_pwoff)
@@ -7705,11 +7705,11 @@ static int show_tap_pwup(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_tap_pwup_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(tap_pwup)
@@ -7729,11 +7729,11 @@ static int show_disc_pwup(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_disc_pwup_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(disc_pwup)
@@ -7753,11 +7753,11 @@ static int show_std_nic(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_std_nic_fn(dev);
        if (ret == BP_NOT_CAP)
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        else if (ret == 0)
-               seq_printf(m, "off\n");
+               seq_puts(m, "off\n");
        else
-               seq_printf(m, "on\n");
+               seq_puts(m, "on\n");
        return 0;
 }
 RW_FOPS(std_nic)
@@ -7795,13 +7795,13 @@ static int show_wd_exp_mode(struct seq_file *m, void *v)
        bpctl_dev_t *dev = m->private;
        int ret = get_wd_exp_mode_fn(dev);
        if (ret == 1)
-               seq_printf(m, "tap\n");
+               seq_puts(m, "tap\n");
        else if (ret == 0)
-               seq_printf(m, "bypass\n");
+               seq_puts(m, "bypass\n");
        else if (ret == 2)
-               seq_printf(m, "disc\n");
+               seq_puts(m, "disc\n");
        else
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        return 0;
 }
 RW_FOPS(wd_exp_mode)
@@ -7823,7 +7823,7 @@ static int show_wd_autoreset(struct seq_file *m, void *v)
        if (ret >= 0)
                seq_printf(m, "%d\n", ret);
        else
-               seq_printf(m, "fail\n");
+               seq_puts(m, "fail\n");
        return 0;
 }
 RW_FOPS(wd_autoreset)
@@ -7831,7 +7831,7 @@ RW_FOPS(wd_autoreset)
 int bypass_proc_create_dev_sd(bpctl_dev_t *pbp_device_block)
 {
        struct bypass_pfs_sd *current_pfs = &(pbp_device_block->bypass_pfs_set);
-       static struct proc_dir_entry *procfs_dir = NULL;
+       static struct proc_dir_entry *procfs_dir;
        int ret = 0;
 
        if (!pbp_device_block->ndev)
index 040c6fa8d5ada50f70bfe05e73e551cdacc5b340..2d1ef5384436db490775c4482ca526bb3213716c 100644 (file)
 #ifndef BP_IOCTL_H
 #define BP_IOCTL_H
 
-#define BP_CAP                   0x01  //BIT_0
-#define BP_STATUS_CAP            0x02  //BIT_1
-#define BP_STATUS_CHANGE_CAP     0x04  //BIT_2
-#define SW_CTL_CAP               0x08  //BIT_3
-#define BP_DIS_CAP               0x10  //BIT_4
-#define BP_DIS_STATUS_CAP        0x20  //BIT_5
-#define STD_NIC_CAP              0x40  //BIT_6
-#define BP_PWOFF_ON_CAP          0x80  //BIT_7
-#define BP_PWOFF_OFF_CAP         0x0100        //BIT_8
-#define BP_PWOFF_CTL_CAP         0x0200        //BIT_9
-#define BP_PWUP_ON_CAP           0x0400        //BIT_10
-#define BP_PWUP_OFF_CAP          0x0800        //BIT_11
-#define BP_PWUP_CTL_CAP          0x1000        //BIT_12
-#define WD_CTL_CAP               0x2000        //BIT_13
-#define WD_STATUS_CAP            0x4000        //BIT_14
-#define WD_TIMEOUT_CAP           0x8000        //BIT_15
-#define TX_CTL_CAP               0x10000       //BIT_16
-#define TX_STATUS_CAP            0x20000       //BIT_17
-#define TAP_CAP                  0x40000       //BIT_18
-#define TAP_STATUS_CAP           0x80000       //BIT_19
-#define TAP_STATUS_CHANGE_CAP    0x100000      //BIT_20
-#define TAP_DIS_CAP              0x200000      //BIT_21
-#define TAP_DIS_STATUS_CAP       0x400000      //BIT_22
-#define TAP_PWUP_ON_CAP          0x800000      //BIT_23
-#define TAP_PWUP_OFF_CAP         0x1000000     //BIT 24
-#define TAP_PWUP_CTL_CAP         0x2000000     //BIT 25
-#define NIC_CAP_NEG              0x4000000     //BIT 26
-#define TPL_CAP                  0x8000000     //BIT 27
-#define DISC_CAP                 0x10000000    //BIT 28
-#define DISC_DIS_CAP             0x20000000    //BIT 29
-#define DISC_PWUP_CTL_CAP        0x40000000    //BIT 30
+#define BP_CAP                   0x01  /* BIT_0 */
+#define BP_STATUS_CAP            0x02  /* BIT_1 */
+#define BP_STATUS_CHANGE_CAP     0x04  /* BIT_2 */
+#define SW_CTL_CAP               0x08  /* BIT_3 */
+#define BP_DIS_CAP               0x10  /* BIT_4 */
+#define BP_DIS_STATUS_CAP        0x20  /* BIT_5 */
+#define STD_NIC_CAP              0x40  /* BIT_6 */
+#define BP_PWOFF_ON_CAP          0x80  /* BIT_7 */
+#define BP_PWOFF_OFF_CAP         0x0100        /* BIT_8 */
+#define BP_PWOFF_CTL_CAP         0x0200        /* BIT_9 */
+#define BP_PWUP_ON_CAP           0x0400        /* BIT_10 */
+#define BP_PWUP_OFF_CAP          0x0800        /* BIT_11 */
+#define BP_PWUP_CTL_CAP          0x1000        /* BIT_12 */
+#define WD_CTL_CAP               0x2000        /* BIT_13 */
+#define WD_STATUS_CAP            0x4000        /* BIT_14 */
+#define WD_TIMEOUT_CAP           0x8000        /* BIT_15 */
+#define TX_CTL_CAP               0x10000       /* BIT_16 */
+#define TX_STATUS_CAP            0x20000       /* BIT_17 */
+#define TAP_CAP                  0x40000       /* BIT_18 */
+#define TAP_STATUS_CAP           0x80000       /* BIT_19 */
+#define TAP_STATUS_CHANGE_CAP    0x100000      /* BIT_20 */
+#define TAP_DIS_CAP              0x200000      /* BIT_21 */
+#define TAP_DIS_STATUS_CAP       0x400000      /* BIT_22 */
+#define TAP_PWUP_ON_CAP          0x800000      /* BIT_23 */
+#define TAP_PWUP_OFF_CAP         0x1000000     /* BIT 24 */
+#define TAP_PWUP_CTL_CAP         0x2000000     /* BIT 25 */
+#define NIC_CAP_NEG              0x4000000     /* BIT 26 */
+#define TPL_CAP                  0x8000000     /* BIT 27 */
+#define DISC_CAP                 0x10000000    /* BIT 28 */
+#define DISC_DIS_CAP             0x20000000    /* BIT 29 */
+#define DISC_PWUP_CTL_CAP        0x40000000    /* BIT 30 */
 
 #define WD_MIN_TIME_MASK(val)      (val & 0xf)
 #define WD_STEP_COUNT_MASK(val)    ((val & 0xf) << 5)
-#define WDT_STEP_TIME              0x10        //BIT_4
+#define WDT_STEP_TIME              0x10        /* BIT_4 */
 
 #define WD_MIN_TIME_GET(desc)   (desc & 0xf)
 #define WD_STEP_COUNT_GET(desc) (desc>>5) & 0xf
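These capability flags form a single 32-bit mask, so consumers test them with bitwise AND, and the WD_* macros pack two small fields into one descriptor word (minimum time in bits [3:0], step count in bits [8:5]). A usage sketch, where get_bypass_caps_fn() is the driver's own accessor and the values are illustrative:

	int caps = get_bypass_caps_fn(dev);

	if (caps != BP_NOT_CAP && (caps & WD_CTL_CAP)) {
		/* board exposes watchdog control */
	}

	u32 desc  = WD_MIN_TIME_MASK(3) | WD_STEP_COUNT_MASK(7);  /* 0xe3 */
	u32 min_t = WD_MIN_TIME_GET(desc);                        /* -> 3 */
	u32 steps = (desc >> 5) & 0xf;                            /* -> 7 */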
index d8c1d27650b4d437db779f1edbbe8ef7af03d31b..b31694fad53893f9bac4d32470e60316c313373e 100644 (file)
 #define INTEL_PEG4BPFII_PID 0x10a1
 
 #define PEGII_IF_SERIES(vid, pid) \
-        ((vid==0x8086)&& \
-        ((pid==INTEL_PEG4BPII_PID)||   \
-          (pid==INTEL_PEG4BPFII_PID)))
+       ((vid == 0x8086) && \
+        ((pid == INTEL_PEG4BPII_PID) ||   \
+         (pid == INTEL_PEG4BPFII_PID)))
 
 #define EXPORT_SYMBOL_NOVERS EXPORT_SYMBOL
 
 #ifdef BP_VENDOR_SUPPORT
-char *bp_desc_array[] =
-    { "e1000bp", "e1000bpe", "slcm5700", "bnx2xbp", "ixgbp", "ixgbpe", NULL };
+char *bp_desc_array[] = { "e1000bp", "e1000bpe", "slcm5700",
+                       "bnx2xbp", "ixgbp", "ixgbpe", NULL };
 #endif
 
 #endif
index e4b82770ed39f68d747d1afe144c2273ed60a4bc..869dcd3b385accb8606d8f53c73b298986c4712c 100644 (file)
@@ -3651,17 +3651,20 @@ static int slic_entry_probe(struct pci_dev *pcidev,
 
        if (!pci_set_dma_mask(pcidev, DMA_BIT_MASK(64))) {
                pci_using_dac = 1;
-               if (pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64))) {
+               err = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
+               if (err) {
                        dev_err(&pcidev->dev, "unable to obtain 64-bit DMA for "
                                        "consistent allocations\n");
                        goto err_out_disable_pci;
                }
-       } else if (pci_set_dma_mask(pcidev, DMA_BIT_MASK(32))) {
+       } else {
+               err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+               if (err) {
+                       dev_err(&pcidev->dev, "no usable DMA configuration\n");
+                       goto err_out_disable_pci;
+               }
                pci_using_dac = 0;
                pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
-       } else {
-               dev_err(&pcidev->dev, "no usable DMA configuration\n");
-               goto err_out_disable_pci;
        }
 
        err = pci_request_regions(pcidev, DRV_NAME);
@@ -3696,6 +3699,7 @@ static int slic_entry_probe(struct pci_dev *pcidev,
        if (!memmapped_ioaddr) {
                dev_err(&pcidev->dev, "cannot remap MMIO region %lx @ %lx\n",
                        mmio_len, mmio_start);
+               err = -ENOMEM;
                goto err_out_free_netdev;
        }
 
@@ -3706,8 +3710,8 @@ static int slic_entry_probe(struct pci_dev *pcidev,
        slic_init_adapter(netdev,
                          pcidev, pci_tbl_entry, memmapped_ioaddr, cards_found);
 
-       status = slic_card_locate(adapter);
-       if (status) {
+       err = slic_card_locate(adapter);
+       if (err) {
                dev_err(&pcidev->dev, "cannot locate card\n");
                goto err_out_free_mmio_region;
        }
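
For reference, the 64-bit-then-32-bit DMA mask fallback that the
restructured probe above converges on, condensed into one sketch (same PCI
DMA API the driver already uses; labels as in the function):

	err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
	if (!err) {
		pci_using_dac = 1;
		err = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
	} else {
		err = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
		if (!err) {
			pci_using_dac = 0;
			err = pci_set_consistent_dma_mask(pcidev,
							  DMA_BIT_MASK(32));
		}
	}
	if (err)
		goto err_out_disable_pci;	/* no usable DMA configuration */
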
index b416aceb13f2759c64399bec491a0c49e3c9ab23..8c3e7a60a9be380888cc6450353cbf0c9889880c 100644 (file)
@@ -11,7 +11,7 @@ config SPEAKUP
                point your browser at <http://www.linux-speakup.org/>.
                There is also a mailing list at the above URL that you
                can subscribe to.
-               
+
                Supported synthesizers are accent sa, accent pc,
                Apollo II, Audapter, Braille 'n Speak, Dectalk
                external (old), Dectalk PC (full length ISA board),
@@ -19,24 +19,24 @@ config SPEAKUP
                Litetalk, Keynote Gold internal PC, software
                synthesizers, Speakout, transport, and a dummy module
                that can be used with a plain text terminal.
-               
+
                Speakup can either be built in or compiled as a module
                by answering y or m.  If you answer y here, then you
                must answer either y or m to at least one of the
                synthesizer drivers below.  If you answer m here, then
                the synthesizer drivers below can only be built as
                modules.
-               
+
                These drivers are not standalone drivers, but must be
                used in conjunction with Speakup.  Think of them as
                video cards for blind people.
-               
-               
+
+
                The Dectalk pc driver can only be built as a module, and
                requires software to be pre-loaded on to the card before
                the module can be loaded.  See the decpc choice below
                for more details.
-               
+
                If you are not a blind person, or don't have access to
                one of the listed synthesizers, you should say n.
 
@@ -84,7 +84,7 @@ config SPEAKUP_SYNTH_BNS
 config SPEAKUP_SYNTH_DECTLK
        tristate "DECtalk Express synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the DecTalk Express
                synthesizer.  You can say y to build it into the kernel,
                or m to build it as a module.  See the configuration
@@ -93,7 +93,7 @@ config SPEAKUP_SYNTH_DECTLK
 config SPEAKUP_SYNTH_DECEXT
        tristate "DECtalk External (old) synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the DecTalk External
                (old) synthesizer.  You can say y to build it into the
                kernel, or m to build it as a module.  See the
@@ -104,12 +104,12 @@ config SPEAKUP_SYNTH_DECPC
        depends on m
        tristate "DECtalk PC (big ISA card) synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the DecTalk PC (full
                length ISA) synthesizer.  You can say m to build it as
                a module.  See the configuration help on the Speakup
                choice above for more info.
-               
+
                In order to use the DecTalk PC driver, you must download
                the dec_pc.tgz file from linux-speakup.org.  It is in
                the pub/linux/goodies directory.  The dec_pc.tgz file
@@ -118,14 +118,14 @@ config SPEAKUP_SYNTH_DECPC
                This driver must be built as a module, and cannot be
                loaded until the file system is mounted and the DecTalk
                PC software has been pre-loaded on to the board.
-               
+
                See the README file in the dec_pc.tgz file for more
                details.
 
 config SPEAKUP_SYNTH_DTLK
        tristate "DoubleTalk PC synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the internal DoubleTalk
                PC synthesizer.  You can say y to build it into the
                kernel, or m to build it as a module.  See the
@@ -135,7 +135,7 @@ config SPEAKUP_SYNTH_DTLK
 config SPEAKUP_SYNTH_KEYPC
        tristate "Keynote Gold PC synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the Keynote Gold
                PC synthesizer.  You can say y to build it into the
                kernel, or m to build it as a module.  See the
@@ -166,7 +166,7 @@ config SPEAKUP_SYNTH_SOFT
 config SPEAKUP_SYNTH_SPKOUT
        tristate "Speak Out synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the Speakout synthesizer.
                You can say y to build it into the kernel, or m to
                build it as a module.  See the configuration help on the
@@ -175,7 +175,7 @@ config SPEAKUP_SYNTH_SPKOUT
 config SPEAKUP_SYNTH_TXPRT
        tristate "Transport synthesizer support"
        ---help---
-               
+
                This is the Speakup driver for the Transport
                synthesizer.  You can say y to build it into the kernel,
                or m to build it as a module.  See the configuration
@@ -184,7 +184,7 @@ config SPEAKUP_SYNTH_TXPRT
 config SPEAKUP_SYNTH_DUMMY
        tristate "Dummy synthesizer driver (for testing)"
        ---help---
-               
+
                This is a dummy Speakup driver for use with a plain
                serial terminal.  It is handy if you want to test Speakup
                but don't have the hardware.  You can say y to build it into
index 940769ef883fcb7d6cad69931bc66fb5490328f9..71c728acf4ca1c28892fa1beebbb5cdea2095f6a 100644 (file)
 static int misc_registered;
 static int dev_opened;
 
-static ssize_t speakup_file_write(struct file *fp, const char *buffer,
-                  size_t nbytes, loff_t *ppos)
+static ssize_t speakup_file_write(struct file *fp, const char __user *buffer,
+                                 size_t nbytes, loff_t *ppos)
 {
        size_t count = nbytes;
-       const char *ptr = buffer;
+       const char __user *ptr = buffer;
        size_t bytes;
        unsigned long flags;
        u_char buf[256];
@@ -30,15 +30,15 @@ static ssize_t speakup_file_write(struct file *fp, const char *buffer,
                        return -EFAULT;
                count -= bytes;
                ptr += bytes;
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_write(buf, bytes);
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        }
        return (ssize_t) nbytes;
 }
 
-static ssize_t speakup_file_read(struct file *fp, char *buf, size_t nbytes,
-       loff_t *ppos)
+static ssize_t speakup_file_read(struct file *fp, char __user *buf,
+                                size_t nbytes, loff_t *ppos)
 {
        return 0;
 }
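
The __user annotations added above let sparse flag any direct dereference
of a userspace pointer; data has to cross the boundary with
copy_from_user()/copy_to_user(). A minimal sketch of the write-side
pattern (names and buffer size illustrative):

	static ssize_t example_write(struct file *fp, const char __user *ubuf,
				     size_t nbytes, loff_t *ppos)
	{
		u_char kbuf[256];
		size_t n = min(nbytes, sizeof(kbuf));

		if (copy_from_user(kbuf, ubuf, n))
			return -EFAULT;	/* never dereference ubuf directly */
		/* ... hand kbuf to the synth under the speakup spinlock ... */
		return n;
	}
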
index 2add1fcfd122a2ad23440a63fe1c9ef5f4b80bd1..9ea16c5b4d6c6d008b0e8923423438377f43583c 100644 (file)
@@ -558,11 +558,11 @@ ssize_t spk_msg_set(enum msg_index_t index, char *text, size_t length)
                                kfree(newstr);
                                return -EINVAL;
                        }
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        if (speakup_msgs[index] != speakup_default_msgs[index])
                                kfree(speakup_msgs[index]);
                        speakup_msgs[index] = newstr;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                } else {
                        rc = -ENOMEM;
                }
@@ -595,14 +595,14 @@ void spk_reset_msg_group(struct msg_group_t *group)
        unsigned long flags;
        enum msg_index_t i;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
 
        for (i = group->start; i <= group->end; i++) {
                if (speakup_msgs[i] != speakup_default_msgs[i])
                        kfree(speakup_msgs[i]);
                speakup_msgs[i] = speakup_default_msgs[i];
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* Called at initialization time, to establish default messages. */
@@ -618,12 +618,12 @@ void spk_free_user_msgs(void)
        enum msg_index_t index;
        unsigned long flags;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        for (index = MSG_FIRST_INDEX; index < MSG_LAST_INDEX; index++) {
                if (speakup_msgs[index] != speakup_default_msgs[index]) {
                        kfree(speakup_msgs[index]);
                        speakup_msgs[index] = speakup_default_msgs[index];
                }
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
index 943b6c134a229d99e189f2376ddcf8a74d79eb06..51bdea3a5beada9a6de419abc9dad9b658355ad3 100644 (file)
@@ -35,7 +35,7 @@ static ssize_t chars_chartab_show(struct kobject *kobj,
        size_t bufsize = PAGE_SIZE;
        unsigned long flags;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        *buf_pointer = '\0';
        for (i = 0; i < 256; i++) {
                if (bufsize <= 1)
@@ -70,7 +70,7 @@ static ssize_t chars_chartab_show(struct kobject *kobj,
                bufsize -= len;
                buf_pointer += len;
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return buf_pointer - buf;
 }
 
@@ -127,7 +127,7 @@ static ssize_t chars_chartab_store(struct kobject *kobj,
        size_t desc_length = 0;
        int i;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        while (cp < end) {
 
                while ((cp < end) && (*cp == ' ' || *cp == '\t'))
@@ -212,7 +212,7 @@ static ssize_t chars_chartab_store(struct kobject *kobj,
                        spk_reset_default_chartab();
        }
 
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        report_char_chartab_status(reset, received, used, rejected,
                do_characters);
        return retval;
@@ -232,7 +232,7 @@ static ssize_t keymap_show(struct kobject *kobj, struct kobj_attribute *attr,
        u_char *cp1;
        u_char ch;
        unsigned long flags;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        cp1 = spk_key_buf + SHIFT_TBL_SIZE;
        num_keys = (int)(*cp1);
        nstates = (int)cp1[1];
@@ -248,7 +248,7 @@ static ssize_t keymap_show(struct kobject *kobj, struct kobj_attribute *attr,
                }
        }
        cp += sprintf(cp, "0, %d\n", KEY_MAP_VER);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return (int)(cp-buf);
 }
 
@@ -265,17 +265,17 @@ static ssize_t keymap_store(struct kobject *kobj, struct kobj_attribute *attr,
        u_char *cp1;
        unsigned long flags;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        in_buff = kmemdup(buf, count + 1, GFP_ATOMIC);
        if (!in_buff) {
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return -ENOMEM;
        }
        if (strchr("dDrR", *in_buff)) {
                spk_set_key_info(spk_key_defaults, spk_key_buf);
                pr_info("keymap set to default values\n");
                kfree(in_buff);
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return count;
        }
        if (in_buff[count - 1] == '\n')
@@ -294,7 +294,7 @@ static ssize_t keymap_store(struct kobject *kobj, struct kobj_attribute *attr,
                pr_warn("i %d %d %d %d\n", i,
                                (int)cp1[-3], (int)cp1[-2], (int)cp1[-1]);
                kfree(in_buff);
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return -EINVAL;
        }
        while (--i >= 0) {
@@ -315,7 +315,7 @@ static ssize_t keymap_store(struct kobject *kobj, struct kobj_attribute *attr,
                }
        }
        kfree(in_buff);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return ret;
 }
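
One detail in keymap_store() above: kmemdup() runs with the speakup
spinlock held and IRQs off, which is why it must use GFP_ATOMIC. An
alternative sketch (not what this patch does) that permits GFP_KERNEL by
allocating before taking the lock:

	in_buff = kmemdup(buf, count + 1, GFP_KERNEL);
	if (!in_buff)
		return -ENOMEM;
	spin_lock_irqsave(&speakup_info.spinlock, flags);
	/* ... parse in_buff and update the keymap ... */
	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
	kfree(in_buff);
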
 
@@ -341,7 +341,7 @@ static ssize_t silent_store(struct kobject *kobj, struct kobj_attribute *attr,
                pr_warn("silent value '%c' not in range (0,7)\n", ch);
                return -EINVAL;
        }
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (ch&2) {
                shut = 1;
                spk_do_flush();
@@ -354,7 +354,7 @@ static ssize_t silent_store(struct kobject *kobj, struct kobj_attribute *attr,
                spk_shut_up |= shut;
        else
                spk_shut_up &= ~shut;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return count;
 }
 
@@ -470,7 +470,7 @@ static ssize_t punc_show(struct kobject *kobj, struct kobj_attribute *attr,
                return -EINVAL;
        }
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        pb = (struct st_bits_data *) &spk_punc_info[var->value];
        mask = pb->mask;
        for (i = 33; i < 128; i++) {
@@ -478,7 +478,7 @@ static ssize_t punc_show(struct kobject *kobj, struct kobj_attribute *attr,
                        continue;
                *cp++ = (char)i;
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return cp-buf;
 }
 
@@ -518,14 +518,14 @@ static ssize_t punc_store(struct kobject *kobj, struct kobj_attribute *attr,
                x--;
        punc_buf[x] = '\0';
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
 
        if (*punc_buf == 'd' || *punc_buf == 'r')
-               x = spk_set_mask_bits(0, var->value, 3);
+               x = spk_set_mask_bits(NULL, var->value, 3);
        else
                x = spk_set_mask_bits(punc_buf, var->value, 3);
 
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return count;
 }
 
@@ -547,7 +547,7 @@ ssize_t spk_var_show(struct kobject *kobj, struct kobj_attribute *attr,
        if (param == NULL)
                return -EINVAL;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        var = (struct var_t *) param->data;
        switch (param->var_type) {
        case VAR_NUM:
@@ -580,7 +580,7 @@ ssize_t spk_var_show(struct kobject *kobj, struct kobj_attribute *attr,
                        param->name, param->var_type);
                break;
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return rv;
 }
 EXPORT_SYMBOL_GPL(spk_var_show);
@@ -609,7 +609,7 @@ ssize_t spk_var_store(struct kobject *kobj, struct kobj_attribute *attr,
        cp = (char *)buf;
        string_unescape_any_inplace(cp);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        switch (param->var_type) {
        case VAR_NUM:
        case VAR_TIME:
@@ -670,7 +670,7 @@ ssize_t spk_var_store(struct kobject *kobj, struct kobj_attribute *attr,
                        }
                }
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
        if (ret == -ERESTART)
                pr_info("%s reset to default value\n", attr->attr.name);
@@ -818,9 +818,9 @@ static ssize_t message_show(struct kobject *kobj,
        unsigned long flags;
 
        BUG_ON(!group);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        retval = message_show_helper(buf, group->start, group->end);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return retval;
 }
 
index 6c7b55c2947d0f0f872b6d5dca65af7fd518ba65..14079c4949a8038e801474a2842f09e38e5bd403 100644 (file)
@@ -95,7 +95,8 @@ const struct st_bits_data spk_punc_info[] = {
 
 static char mark_cut_flag;
 #define MAX_KEY 160
-u_char *spk_our_keys[MAX_KEY], *spk_shift_table;
+static u_char *spk_shift_table;
+u_char *spk_our_keys[MAX_KEY];
 u_char spk_key_buf[600];
 const u_char spk_key_defaults[] = {
 #include "speakupmap.h"
@@ -457,7 +458,7 @@ static void speak_char(u_char ch)
        synth_buffer_add(SPACE);
 }
 
-static u16 get_char(struct vc_data *vc, u16 * pos, u_char * attribs)
+static u16 get_char(struct vc_data *vc, u16 *pos, u_char *attribs)
 {
        u16 ch = ' ';
        if (vc && pos) {
@@ -1129,7 +1130,7 @@ static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag)
        unsigned long flags;
        if (synth == NULL || up_flag || spk_killed)
                return;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (cursor_track == read_all_mode) {
                switch (value) {
                case KVAL(K_SHIFT):
@@ -1151,20 +1152,20 @@ static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag)
        }
        if (spk_say_ctrl && value < NUM_CTL_LABELS)
                synth_printf("%s", spk_msg_get(MSG_CTL_START + value));
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void do_handle_latin(struct vc_data *vc, u_char value, char up_flag)
 {
        unsigned long flags;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (up_flag) {
                spk_lastkey = spk_keydown = 0;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        if (synth == NULL || spk_killed) {
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        spk_shut_up &= 0xfe;
@@ -1173,7 +1174,7 @@ static void do_handle_latin(struct vc_data *vc, u_char value, char up_flag)
        spk_parked &= 0xfe;
        if (spk_key_echo == 2 && value >= MINECHOCHAR)
                speak_char(value);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 int spk_set_key_info(const u_char *key_info, u_char *k_buffer)
@@ -1282,7 +1283,7 @@ static int edit_bits(struct vc_data *vc, u_char type, u_char ch, u_short key)
 }
 
 /* Allocation concurrency is protected by the console semaphore */
-int speakup_allocate(struct vc_data *vc)
+static int speakup_allocate(struct vc_data *vc)
 {
        int vc_num;
 
@@ -1299,7 +1300,7 @@ int speakup_allocate(struct vc_data *vc)
        return 0;
 }
 
-void speakup_deallocate(struct vc_data *vc)
+static void speakup_deallocate(struct vc_data *vc)
 {
        int vc_num;
 
@@ -1449,21 +1450,21 @@ static void handle_cursor_read_all(struct vc_data *vc, int command)
 static int pre_handle_cursor(struct vc_data *vc, u_char value, char up_flag)
 {
        unsigned long flags;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (cursor_track == read_all_mode) {
                spk_parked &= 0xfe;
                if (synth == NULL || up_flag || spk_shut_up) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        return NOTIFY_STOP;
                }
                del_timer(&cursor_timer);
                spk_shut_up &= 0xfe;
                spk_do_flush();
                start_read_all_timer(vc, value + 1);
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return NOTIFY_STOP;
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return NOTIFY_OK;
 }
 
@@ -1472,10 +1473,10 @@ static void do_handle_cursor(struct vc_data *vc, u_char value, char up_flag)
        unsigned long flags;
        struct var_t *cursor_timeout;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        spk_parked &= 0xfe;
        if (synth == NULL || up_flag || spk_shut_up || cursor_track == CT_Off) {
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        spk_shut_up &= 0xfe;
@@ -1494,7 +1495,7 @@ static void do_handle_cursor(struct vc_data *vc, u_char value, char up_flag)
        cursor_timeout = spk_get_var(CURSOR_TIME);
        mod_timer(&cursor_timer,
                  jiffies + msecs_to_jiffies(cursor_timeout->u.n.value));
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void update_color_buffer(struct vc_data *vc, const char *ic, int len)
@@ -1619,7 +1620,7 @@ static void cursor_done(u_long data)
        struct vc_data *vc = vc_cons[cursor_con].d;
        unsigned long flags;
        del_timer(&cursor_timer);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (cursor_con != fg_console) {
                is_cursor = 0;
                goto out;
@@ -1650,7 +1651,7 @@ static void cursor_done(u_long data)
                say_char(vc);
        spk_keydown = is_cursor = 0;
 out:
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* called by: vt_notifier_call() */
@@ -1659,13 +1660,13 @@ static void speakup_bs(struct vc_data *vc)
        unsigned long flags;
        if (!speakup_console[vc->vc_num])
                return;
-       if (!spk_trylock(flags))
+       if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
                /* Speakup output, discard */
                return;
        if (!spk_parked)
                speakup_date(vc);
        if (spk_shut_up || synth == NULL) {
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        if (vc->vc_num == fg_console && spk_keydown) {
@@ -1673,7 +1674,7 @@ static void speakup_bs(struct vc_data *vc)
                if (!is_cursor)
                        say_char(vc);
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 /* called by: vt_notifier_call() */
@@ -1682,7 +1683,7 @@ static void speakup_con_write(struct vc_data *vc, const char *str, int len)
        unsigned long flags;
        if ((vc->vc_num != fg_console) || spk_shut_up || synth == NULL)
                return;
-       if (!spk_trylock(flags))
+       if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
                /* Speakup output, discard */
                return;
        if (spk_bell_pos && spk_keydown && (vc->vc_x == spk_bell_pos - 1))
@@ -1690,31 +1691,31 @@ static void speakup_con_write(struct vc_data *vc, const char *str, int len)
        if ((is_cursor) || (cursor_track == read_all_mode)) {
                if (cursor_track == CT_Highlight)
                        update_color_buffer(vc, str, len);
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        if (win_enabled) {
                if (vc->vc_x >= win_left && vc->vc_x <= win_right &&
                    vc->vc_y >= win_top && vc->vc_y <= win_bottom) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        return;
                }
        }
 
        spkup_write(str, len);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
-void speakup_con_update(struct vc_data *vc)
+static void speakup_con_update(struct vc_data *vc)
 {
        unsigned long flags;
        if (speakup_console[vc->vc_num] == NULL || spk_parked)
                return;
-       if (!spk_trylock(flags))
+       if (!spin_trylock_irqsave(&speakup_info.spinlock, flags))
                /* Speakup output, discard */
                return;
        speakup_date(vc);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static void do_handle_spec(struct vc_data *vc, u_char value, char up_flag)
@@ -1724,7 +1725,7 @@ static void do_handle_spec(struct vc_data *vc, u_char value, char up_flag)
        char *label;
        if (synth == NULL || up_flag || spk_killed)
                return;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        spk_shut_up &= 0xfe;
        if (spk_no_intr)
                spk_do_flush();
@@ -1745,13 +1746,13 @@ static void do_handle_spec(struct vc_data *vc, u_char value, char up_flag)
                break;
        default:
                spk_parked &= 0xfe;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return;
        }
        if (on_off < 2)
                synth_printf("%s %s\n",
                             label, spk_msg_get(MSG_STATUS_START + on_off));
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 }
 
 static int inc_dec_var(u_char value)
@@ -1892,7 +1893,7 @@ oops:
                spk_special_handler = NULL;
                return 1;
        }
-       go_pos = simple_strtol(goto_buf, &cp, 10);
+       go_pos = kstrtol(goto_buf, 10, (long *)&cp);
        goto_pos = (u_long) go_pos;
        if (*cp == 'x') {
                if (*goto_buf < '0')
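
Note that kstrtol() returns an error code and, unlike simple_strtol(), has
no end-pointer argument, so the code above still leans on cp for the
trailing 'x' check. A hedged sketch of an endptr-free parse that splits
the suffix first (go_pos, goto_pos, and the oops label come from the
surrounding function):

	char *sep = strchr(goto_buf, 'x');

	if (sep)
		*sep = '\0';
	if (kstrtol(goto_buf, 10, &go_pos))
		goto oops;		/* reject non-numeric input */
	goto_pos = (u_long)go_pos;
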
@@ -1964,7 +1965,7 @@ static void speakup_lock(struct vc_data *vc)
 }
 
 typedef void (*spkup_hand) (struct vc_data *);
-spkup_hand spkup_handler[] = {
+static spkup_hand spkup_handler[] = {
        /* must be ordered same as defines in speakup.h */
        do_nothing, speakup_goto, speech_kill, speakup_shut_up,
        speakup_cut, speakup_paste, say_first_char, say_last_char,
@@ -2002,7 +2003,7 @@ static void do_spkup(struct vc_data *vc, u_char value)
 
 static const char *pad_chars = "0123456789+-*/\015,.?()";
 
-int
+static int
 speakup_key(struct vc_data *vc, int shift_state, int keycode, u_short keysym,
            int up_flag)
 {
@@ -2015,7 +2016,7 @@ speakup_key(struct vc_data *vc, int shift_state, int keycode, u_short keysym,
        if (synth == NULL)
                return 0;
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        tty = vc->port.tty;
        if (type >= 0xf0)
                type -= 0xf0;
@@ -2033,7 +2034,7 @@ speakup_key(struct vc_data *vc, int shift_state, int keycode, u_short keysym,
        if (keycode >= MAX_KEY)
                goto no_map;
        key_info = spk_our_keys[keycode];
-       if (key_info == 0)
+       if (!key_info)
                goto no_map;
        /* Check valid read all mode keys */
        if ((cursor_track == read_all_mode) && (!up_flag)) {
@@ -2114,7 +2115,7 @@ no_map:
        }
        last_keycode = 0;
 out:
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return ret;
 }
 
@@ -2265,7 +2266,7 @@ static int __init speakup_init(void)
             (var->var_id >= 0) && (var->var_id < MAXVARS); var++)
                speakup_register_var(var);
        for (i = 1; spk_punc_info[i].mask != 0; i++)
-               spk_set_mask_bits(0, i, 2);
+               spk_set_mask_bits(NULL, i, 2);
 
        spk_set_key_info(spk_key_defaults, spk_key_buf);
 
index e4d27aa2898f451fa7f5c2e7e8a7b808e3d0cc07..135428856d47a88487ddb679a5db7ee7341f122c 100644 (file)
@@ -79,7 +79,7 @@ static irqreturn_t synth_readbuf_handler(int irq, void *dev_id)
 /*printk(KERN_ERR "in irq\n"); */
 /*pr_warn("in IRQ\n"); */
        int c;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        while (inb_p(speakup_info.port_tts + UART_LSR) & UART_LSR_DR) {
 
                c = inb_p(speakup_info.port_tts+UART_RX);
@@ -87,7 +87,7 @@ static irqreturn_t synth_readbuf_handler(int irq, void *dev_id)
 /*printk(KERN_ERR "c = %d\n", c); */
 /*pr_warn("C = %d\n", c); */
        }
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return IRQ_HANDLED;
 }
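
synth_readbuf_handler() runs in hard-IRQ context; the _irqsave/_irqrestore
variants are still the right calls because they save and restore the
current IRQ state, so the same locking works unchanged from thread, timer,
and interrupt paths alike. Condensed shape of the handler:

	spin_lock_irqsave(&speakup_info.spinlock, flags);
	while (inb_p(speakup_info.port_tts + UART_LSR) & UART_LSR_DR)
		c = inb_p(speakup_info.port_tts + UART_RX);	/* drain RX */
	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
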
 
index 1c1f0d560449365af18ea46bd910e16afdafd0ea..80141aca712f99039a7584d7b15f0201ce5c953c 100644 (file)
@@ -166,7 +166,7 @@ static const char *synth_immediate(struct spk_synth *synth, const char *buf)
                outb_p(ch, speakup_info.port_tts);
                buf++;
        }
-       return 0;
+       return NULL;
 }
 
 static void do_catch_up(struct spk_synth *synth)
@@ -186,26 +186,26 @@ static void do_catch_up(struct spk_synth *synth)
        delay_time = spk_get_var(DELAY);
        full_time = spk_get_var(FULL);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
        jiff_max = jiffies + jiffy_delta_val;
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                set_current_state(TASK_INTERRUPTIBLE);
                full_time_val = full_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (synth_full()) {
                        schedule_timeout(msecs_to_jiffies(full_time_val));
                        continue;
@@ -217,9 +217,9 @@ static void do_catch_up(struct spk_synth *synth)
                                break;
                        udelay(1);
                }
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                ch = synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = PROCSPEECH;
                outb_p(ch, speakup_info.port_tts);
@@ -231,10 +231,10 @@ static void do_catch_up(struct spk_synth *synth)
                                udelay(1);
                        }
                        outb_p(PROCSPEECH, speakup_info.port_tts);
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        jiffy_delta_val = jiffy_delta->u.n.value;
                        delay_time_val = delay_time->u.n.value;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        schedule_timeout(msecs_to_jiffies(delay_time_val));
                        jiff_max = jiffies+jiffy_delta_val;
                }
index 3e450ccbda6634a0ec7fc0bb09bdcbb59a0e7188..95d3132f0a35717a2451070f140f2e237260f1bf 100644 (file)
@@ -148,30 +148,30 @@ static void do_catch_up(struct spk_synth *synth)
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
        full_time = spk_get_var(FULL);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        jiff_max = jiffies + jiffy_delta_val;
 
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                jiffy_delta_val = jiffy_delta->u.n.value;
                full_time_val = full_time->u.n.value;
                delay_time_val = delay_time->u.n.value;
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                ch = synth_buffer_peek();
                set_current_state(TASK_INTERRUPTIBLE);
                full_time_val = full_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (!spk_serial_out(ch)) {
                        outb(UART_MCR_DTR, speakup_info.port_tts + UART_MCR);
                        outb(UART_MCR_DTR | UART_MCR_RTS,
@@ -180,11 +180,11 @@ static void do_catch_up(struct spk_synth *synth)
                        continue;
                }
                if ((jiffies >= jiff_max) && (ch == SPACE)) {
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        jiffy_delta_val = jiffy_delta->u.n.value;
                        full_time_val = full_time->u.n.value;
                        delay_time_val = delay_time->u.n.value;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        if (spk_serial_out(synth->procspeech))
                                schedule_timeout(msecs_to_jiffies
                                                 (delay_time_val));
@@ -194,9 +194,9 @@ static void do_catch_up(struct spk_synth *synth)
                        jiff_max = jiffies + jiffy_delta_val;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        }
        spk_serial_out(PROCSPEECH);
 }
index d39a0de286fb205fc54f488b9951d3b9d3d1116a..d306e010d3ea9799f2e60420918041d2a9339cb3 100644 (file)
@@ -165,27 +165,27 @@ static void do_catch_up(struct spk_synth *synth)
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        jiff_max = jiffies + jiffy_delta_val;
 
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                ch = synth_buffer_peek();
                set_current_state(TASK_INTERRUPTIBLE);
                delay_time_val = delay_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = 0x0D;
                if (synth_full() || !spk_serial_out(ch)) {
@@ -193,9 +193,9 @@ static void do_catch_up(struct spk_synth *synth)
                        continue;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '[')
                        in_escape = 1;
                else if (ch == ']')
@@ -206,10 +206,10 @@ static void do_catch_up(struct spk_synth *synth)
                        if (jiffies >= jiff_max) {
                                if (!in_escape)
                                        spk_serial_out(PROCSPEECH);
-                               spk_lock(flags);
+                               spin_lock_irqsave(&speakup_info.spinlock, flags);
                                jiffy_delta_val = jiffy_delta->u.n.value;
                                delay_time_val = delay_time->u.n.value;
-                               spk_unlock(flags);
+                               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                                schedule_timeout(msecs_to_jiffies
                                                 (delay_time_val));
                                jiff_max = jiffies + jiffy_delta_val;
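
Every synth driver's do_catch_up() repeats the same shape, which the lock
conversion preserves: sample tunables and check flush/empty with the lock
held, then transmit with the lock dropped. Condensed sketch:

	while (!kthread_should_stop()) {
		spin_lock_irqsave(&speakup_info.spinlock, flags);
		if (speakup_info.flushing) {
			speakup_info.flushing = 0;
			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
			synth->flush(synth);	/* flush outside the lock */
			continue;
		}
		if (synth_buffer_empty()) {
			spin_unlock_irqrestore(&speakup_info.spinlock, flags);
			break;
		}
		ch = synth_buffer_peek();
		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
		/* ... send ch to the hardware, throttle on jiff_max ... */
	}
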
index 6c88b55bdac8172190176776ae1bc52ec9c0e20a..ea6b72d40b317f9eb342b66c9a3c762b8f6f96cb 100644 (file)
@@ -377,27 +377,27 @@ static void do_catch_up(struct spk_synth *synth)
 
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        jiff_max = jiffies + jiffy_delta_val;
 
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                ch = synth_buffer_peek();
                set_current_state(TASK_INTERRUPTIBLE);
                delay_time_val = delay_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = 0x0D;
                if (dt_sendchar(ch)) {
@@ -405,9 +405,9 @@ static void do_catch_up(struct spk_synth *synth)
                        continue;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '[')
                        in_escape = 1;
                else if (ch == ']')
@@ -418,10 +418,10 @@ static void do_catch_up(struct spk_synth *synth)
                        if (jiffies >= jiff_max) {
                                if (!in_escape)
                                        dt_sendchar(PROCSPEECH);
-                               spk_lock(flags);
+                               spin_lock_irqsave(&speakup_info.spinlock, flags);
                                jiffy_delta_val = jiffy_delta->u.n.value;
                                delay_time_val = delay_time->u.n.value;
-                               spk_unlock(flags);
+                               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                                schedule_timeout(msecs_to_jiffies
                                                 (delay_time_val));
                                jiff_max = jiffies + jiffy_delta_val;
@@ -444,7 +444,7 @@ static const char *synth_immediate(struct spk_synth *synth, const char *buf)
                        return buf;
                buf++;
        }
-       return 0;
+       return NULL;
 }
 
 static int synth_probe(struct spk_synth *synth)
index 0dd2eb96cb28680dd042ee7301a46e530320b732..15fdec323a70f28b666125847c07ca5ad6683f03 100644 (file)
@@ -216,9 +216,9 @@ static void do_catch_up(struct spk_synth *synth)
 
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        jiff_max = jiffies + jiffy_delta_val;
 
        while (!kthread_should_stop()) {
@@ -234,22 +234,22 @@ static void do_catch_up(struct spk_synth *synth)
                is_flushing = 0;
                spin_unlock_irqrestore(&flush_lock, flags);
 
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                ch = synth_buffer_peek();
                set_current_state(TASK_INTERRUPTIBLE);
                delay_time_val = delay_time->u.n.value;
                synth_full_val = synth_full();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = 0x0D;
                if (synth_full_val || !spk_serial_out(ch)) {
@@ -257,9 +257,9 @@ static void do_catch_up(struct spk_synth *synth)
                        continue;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '[')
                        in_escape = 1;
                else if (ch == ']')
@@ -270,10 +270,10 @@ static void do_catch_up(struct spk_synth *synth)
                        if (jiffies >= jiff_max) {
                                if (!in_escape)
                                        spk_serial_out(PROCSPEECH);
-                               spk_lock(flags);
+                               spin_lock_irqsave(&speakup_info.spinlock, flags);
                                jiffy_delta_val = jiffy_delta->u.n.value;
                                delay_time_val = delay_time->u.n.value;
-                               spk_unlock(flags);
+                               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                                schedule_timeout(msecs_to_jiffies
                                                 (delay_time_val));
                                jiff_max = jiffies + jiffy_delta_val;
index a9cefbd3ea93f58ac8224e3fa3ed98650aed5a96..1feb0fba1b436bbe13ee619c296f434895c7dda6 100644 (file)
@@ -200,42 +200,42 @@ static void do_catch_up(struct spk_synth *synth)
 
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        jiff_max = jiffies + jiffy_delta_val;
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                set_current_state(TASK_INTERRUPTIBLE);
                delay_time_val = delay_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (synth_full()) {
                        schedule_timeout(msecs_to_jiffies(delay_time_val));
                        continue;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                ch = synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = PROCSPEECH;
                spk_out(ch);
                if ((jiffies >= jiff_max) && (ch == SPACE)) {
                        spk_out(PROCSPEECH);
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        delay_time_val = delay_time->u.n.value;
                        jiffy_delta_val = jiffy_delta->u.n.value;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        schedule_timeout(msecs_to_jiffies(delay_time_val));
                        jiff_max = jiffies + jiffy_delta_val;
                }
@@ -254,7 +254,7 @@ static const char *synth_immediate(struct spk_synth *synth, const char *buf)
                spk_out(ch);
                buf++;
        }
-       return 0;
+       return NULL;
 }
 
 static void synth_flush(struct spk_synth *synth)
index feb5f22cc169ab1e5a8477f8cf4db243abfcd950..2f2fe5eeff63e185f7ccf7e4db22f1308b52a65e 100644 (file)
@@ -168,7 +168,7 @@ static const char *synth_immediate(struct spk_synth *synth, const char *buf)
                udelay(70);
                buf++;
        }
-       return 0;
+       return NULL;
 }
 
 static void do_catch_up(struct spk_synth *synth)
@@ -187,26 +187,26 @@ static void do_catch_up(struct spk_synth *synth)
        jiffy_delta = spk_get_var(JIFFY);
        delay_time = spk_get_var(DELAY);
        full_time = spk_get_var(FULL);
-spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
        jiff_max = jiffies + jiffy_delta_val;
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                set_current_state(TASK_INTERRUPTIBLE);
                full_time_val = full_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (synth_full()) {
                        schedule_timeout(msecs_to_jiffies(full_time_val));
                        continue;
@@ -220,9 +220,9 @@ spk_lock(flags);
                        oops();
                        break;
                }
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                ch = synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = PROCSPEECH;
                outb_p(ch, synth_port);
@@ -237,10 +237,10 @@ spk_lock(flags);
                                break;
                        }
                        outb_p(PROCSPEECH, synth_port);
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        jiffy_delta_val = jiffy_delta->u.n.value;
                        delay_time_val = delay_time->u.n.value;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        schedule_timeout(msecs_to_jiffies(delay_time_val));
                        jiff_max = jiffies+jiffy_delta_val;
                }
index e2f5c81e75483e82fb0ae84b04cf15238492ae4f..243c3d52fe5e616f75e829df544e3182cbc3ff7e 100644 (file)
@@ -179,45 +179,45 @@ static int softsynth_open(struct inode *inode, struct file *fp)
        unsigned long flags;
        /*if ((fp->f_flags & O_ACCMODE) != O_RDONLY) */
        /*      return -EPERM; */
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (synth_soft.alive) {
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                return -EBUSY;
        }
        synth_soft.alive = 1;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return 0;
 }
 
 static int softsynth_close(struct inode *inode, struct file *fp)
 {
        unsigned long flags;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        synth_soft.alive = 0;
        init_pos = 0;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        /* Make sure we let applications go before leaving */
        speakup_start_ttys();
        return 0;
 }
 
-static ssize_t softsynth_read(struct file *fp, char *buf, size_t count,
+static ssize_t softsynth_read(struct file *fp, char __user *buf, size_t count,
                              loff_t *pos)
 {
        int chars_sent = 0;
-       char *cp;
+       char __user *cp;
        char *init;
        char ch;
        int empty;
        unsigned long flags;
        DEFINE_WAIT(wait);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        while (1) {
                prepare_to_wait(&speakup_event, &wait, TASK_INTERRUPTIBLE);
                if (!synth_buffer_empty() || speakup_info.flushing)
                        break;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (fp->f_flags & O_NONBLOCK) {
                        finish_wait(&speakup_event, &wait);
                        return -EAGAIN;
@@ -227,7 +227,7 @@ static ssize_t softsynth_read(struct file *fp, char *buf, size_t count,
                        return -ERESTARTSYS;
                }
                schedule();
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
        }
        finish_wait(&speakup_event, &wait);
 
@@ -244,16 +244,16 @@ static ssize_t softsynth_read(struct file *fp, char *buf, size_t count,
                } else {
                        ch = synth_buffer_getc();
                }
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (copy_to_user(cp, &ch, 1))
                        return -EFAULT;
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                chars_sent++;
                cp++;
        }
        *pos += chars_sent;
        empty = synth_buffer_empty();
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        if (empty) {
                speakup_start_ttys();
                *pos = 0;
@@ -263,8 +263,8 @@ static ssize_t softsynth_read(struct file *fp, char *buf, size_t count,
 
 static int last_index;
 
-static ssize_t softsynth_write(struct file *fp, const char *buf, size_t count,
-                              loff_t *pos)
+static ssize_t softsynth_write(struct file *fp, const char __user *buf,
+                              size_t count, loff_t *pos)
 {
        unsigned long supplied_index = 0;
        int converted;
@@ -285,10 +285,10 @@ static unsigned int softsynth_poll(struct file *fp,
        int ret = 0;
        poll_wait(fp, &speakup_event, wait);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        if (!synth_buffer_empty() || speakup_info.flushing)
                ret = POLLIN | POLLRDNORM;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        return ret;
 }
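
softsynth_read() drops the spinlock around copy_to_user() because copying
to userspace can fault and sleep, which is forbidden while holding a
spinlock with IRQs disabled; the character is fetched under the lock
first, then shipped out unlocked:

	ch = synth_buffer_getc();
	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
	if (copy_to_user(cp, &ch, 1))
		return -EFAULT;		/* may fault: lock must be dropped */
	spin_lock_irqsave(&speakup_info.spinlock, flags);
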
 
index 303105b46013a3c2b6c55f4f635df98b675168e2..637ba6760ec060fb75a32fe661fe0a8880511970 100644 (file)
@@ -77,17 +77,4 @@ extern struct speakup_info_t speakup_info;
 
 extern struct var_t synth_time_vars[];
 
-/* Protect the whole speakup machinery, must be taken at each kernel->speakup
- * transition and released at all corresponding speakup->kernel transitions
- * (flags must be the same variable between lock/trylock and unlock).
- *
- * The progression thread only interferes with the speakup machinery through
- * the synth buffer, and so only needs to take the lock while tinkering with
- * it.
- */
-/* Speakup needs to disable the keyboard IRQ, hence _irqsave/restore */
-#define spk_lock(flags) spin_lock_irqsave(&speakup_info.spinlock, flags)
-#define spk_trylock(flags) spin_trylock_irqsave(&speakup_info.spinlock, flags)
-#define spk_unlock(flags) spin_unlock_irqrestore(&speakup_info.spinlock, flags)
-
 #endif
index d867dd9109ed614fb2f0c183796aed84a8f0b2e2..0b3549bd909d0619a594155f91a3f18372eeb7be 100644 (file)
@@ -25,6 +25,18 @@ static int module_status;
 bool spk_quiet_boot;
 
 struct speakup_info_t speakup_info = {
+       /*
+        * This spinlock is used to protect the entire speakup machinery, and
+        * must be taken at each kernel->speakup transition and released at
+        * each corresponding speakup->kernel transition.
+        *
+        * The progression thread only interferes with the speakup
+        * machinery through the synth buffer, and so only needs to take
+        * the lock while tinkering with the buffer.
+        *
+        * We use spin_lock/trylock_irqsave and spin_unlock_irqrestore with this
+        * spinlock because speakup needs to disable the keyboard IRQ.
+        */
        .spinlock = __SPIN_LOCK_UNLOCKED(speakup_info.spinlock),
        .flushing = 0,
 };
@@ -83,27 +95,27 @@ void spk_do_catch_up(struct spk_synth *synth)
        full_time = spk_get_var(FULL);
        delay_time = spk_get_var(DELAY);
 
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        jiffy_delta_val = jiffy_delta->u.n.value;
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 
        jiff_max = jiffies + jiffy_delta_val;
        while (!kthread_should_stop()) {
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                if (speakup_info.flushing) {
                        speakup_info.flushing = 0;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        synth->flush(synth);
                        continue;
                }
                if (synth_buffer_empty()) {
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        break;
                }
                ch = synth_buffer_peek();
                set_current_state(TASK_INTERRUPTIBLE);
                full_time_val = full_time->u.n.value;
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                if (ch == '\n')
                        ch = synth->procspeech;
                if (!spk_serial_out(ch)) {
@@ -111,11 +123,11 @@ void spk_do_catch_up(struct spk_synth *synth)
                        continue;
                }
                if ((jiffies >= jiff_max) && (ch == SPACE)) {
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        jiffy_delta_val = jiffy_delta->u.n.value;
                        delay_time_val = delay_time->u.n.value;
                        full_time_val = full_time->u.n.value;
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        if (spk_serial_out(synth->procspeech))
                                schedule_timeout(
                                        msecs_to_jiffies(delay_time_val));
@@ -125,9 +137,9 @@ void spk_do_catch_up(struct spk_synth *synth)
                        jiff_max = jiffies + jiffy_delta_val;
                }
                set_current_state(TASK_RUNNING);
-               spk_lock(flags);
+               spin_lock_irqsave(&speakup_info.spinlock, flags);
                synth_buffer_getc();
-               spk_unlock(flags);
+               spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        }
        spk_serial_out(synth->procspeech);
 }
@@ -145,7 +157,7 @@ const char *spk_synth_immediate(struct spk_synth *synth, const char *buff)
                        return buff;
                buff++;
        }
-       return 0;
+       return NULL;
 }
 EXPORT_SYMBOL_GPL(spk_synth_immediate);
 
@@ -403,11 +415,11 @@ void synth_release(void)
 
        if (synth == NULL)
                return;
-       spk_lock(flags);
+       spin_lock_irqsave(&speakup_info.spinlock, flags);
        pr_info("releasing synth %s\n", synth->name);
        synth->alive = 0;
        del_timer(&thread_timer);
-       spk_unlock(flags);
+       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
        if (synth->attributes.name)
                sysfs_remove_group(speakup_kobj, &(synth->attributes));
        for (var = synth->vars; var->var_id != MAXVARS; var++)
index 42fa660a7e0db5fc65e21cdfad506e53dfad0185..4397c8e898c723d0beaed285b491ab316ce34095 100644 (file)
@@ -22,7 +22,7 @@ int speakup_thread(void *data)
        while (1) {
                DEFINE_WAIT(wait);
                while (1) {
-                       spk_lock(flags);
+                       spin_lock_irqsave(&speakup_info.spinlock, flags);
                        our_sound = spk_unprocessed_sound;
                        spk_unprocessed_sound.active = 0;
                        prepare_to_wait(&speakup_event, &wait,
@@ -32,7 +32,7 @@ int speakup_thread(void *data)
                                (synth && synth->catch_up && synth->alive &&
                                        (speakup_info.flushing ||
                                        !synth_buffer_empty()));
-                       spk_unlock(flags);
+                       spin_unlock_irqrestore(&speakup_info.spinlock, flags);
                        if (should_break)
                                break;
                        mutex_unlock(&spk_mutex);
index 7f6288fc2299b91dbecc653d763ca7006ec6b8ca..9aa2a78cd71cd87dfad5785a9fab9d2f26e497e2 100644 (file)
@@ -137,18 +137,15 @@ struct st_var_header *spk_get_var_header(enum var_id_t var_id)
 struct st_var_header *spk_var_header_by_name(const char *name)
 {
        int i;
-       struct st_var_header *where = NULL;
 
-       if (name != NULL) {
-               i = 0;
-               while ((i < MAXVARS) && (where == NULL)) {
-                       if (strcmp(name, var_ptrs[i]->name) == 0)
-                               where = var_ptrs[i];
-                       else
-                               i++;
-               }
+       if (!name)
+               return NULL;
+
+       for (i = 0; i < MAXVARS; i++) {
+               if (strcmp(name, var_ptrs[i]->name) == 0)
+                       return var_ptrs[i];
        }
-       return where;
+       return NULL;
 }
 
 struct var_t *spk_get_var(enum var_id_t var_id)
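
The rewrite above trades the flag-and-while search for a guard clause plus an early return from a bounded for loop, which is the usual kernel shape for a table lookup. The same pattern in miniature (names illustrative):

#include <linux/string.h>

/* Return the first table entry whose name matches, or NULL if none. */
static struct st_var_header *example_find(struct st_var_header **tbl,
					  int n, const char *name)
{
	int i;

	if (!name)
		return NULL;

	for (i = 0; i < n; i++) {
		if (strcmp(name, tbl[i]->name) == 0)
			return tbl[i];
	}
	return NULL;
}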
@@ -280,7 +277,7 @@ int spk_set_mask_bits(const char *input, const int which, const int how)
                        spk_chartab[*cp] &= ~mask;
        }
        cp = (u_char *)input;
-       if (cp == 0)
+       if (!cp)
                cp = spk_punc_info[which].value;
        else {
                for ( ; *cp; cp++) {
index e3c5e677eaa5c7a13007354b9fa3d2f99bc4b125..8e67ebf98404bcc69dad79f146133ba17d52d17e 100644 (file)
@@ -38,6 +38,7 @@
 /* common data structures */
 struct ti_thermal_data {
        struct thermal_zone_device *ti_thermal;
+       struct thermal_zone_device *pcb_tz;
        struct thermal_cooling_device *cool_dev;
        struct ti_bandgap *bgp;
        enum thermal_device_mode mode;
@@ -77,10 +78,12 @@ static inline int ti_thermal_hotspot_temperature(int t, int s, int c)
 static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
                                      unsigned long *temp)
 {
+       struct thermal_zone_device *pcb_tz = NULL;
        struct ti_thermal_data *data = thermal->devdata;
        struct ti_bandgap *bgp;
        const struct ti_temp_sensor *s;
-       int ret, tmp, pcb_temp, slope, constant;
+       int ret, tmp, slope, constant;
+       unsigned long pcb_temp;
 
        if (!data)
                return 0;
@@ -92,16 +95,22 @@ static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
        if (ret)
                return ret;
 
-       pcb_temp = 0;
-       /* TODO: Introduce pcb temperature lookup */
+       /* Default constants */
+       slope = s->slope;
+       constant = s->constant;
+
+       pcb_tz = data->pcb_tz;
        /* In case pcb zone is available, use the extrapolation rule with it */
-       if (pcb_temp) {
-               tmp -= pcb_temp;
-               slope = s->slope_pcb;
-               constant = s->constant_pcb;
-       } else {
-               slope = s->slope;
-               constant = s->constant;
+       if (!IS_ERR_OR_NULL(pcb_tz)) {
+               ret = thermal_zone_get_temp(pcb_tz, &pcb_temp);
+               if (!ret) {
+                       tmp -= pcb_temp; /* got a valid PCB temp */
+                       slope = s->slope_pcb;
+                       constant = s->constant_pcb;
+               } else {
+                       dev_err(bgp->dev,
+                               "Failed to read PCB state. Using defaults\n");
+               }
        }
        *temp = ti_thermal_hotspot_temperature(tmp, slope, constant);
 
@@ -273,6 +282,7 @@ static struct ti_thermal_data
        data->sensor_id = id;
        data->bgp = bgp;
        data->mode = THERMAL_DEVICE_ENABLED;
+       data->pcb_tz = thermal_zone_get_zone_by_name("pcb");
        INIT_WORK(&data->thermal_wq, ti_thermal_work);
 
        return data;
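
The probe path now resolves the "pcb" thermal zone once with thermal_zone_get_zone_by_name(), and the read path samples it through thermal_zone_get_temp(), guarding with IS_ERR_OR_NULL() because the lookup returns an ERR_PTR when no such zone is registered. A condensed sketch of that flow (illustrative function name; at this point in the tree thermal_zone_get_temp() takes an unsigned long *):

#include <linux/err.h>
#include <linux/thermal.h>

static int example_read_pcb_temp(unsigned long *pcb_temp)
{
	struct thermal_zone_device *tz;

	/* Returns an ERR_PTR when no zone named "pcb" exists. */
	tz = thermal_zone_get_zone_by_name("pcb");
	if (IS_ERR_OR_NULL(tz))
		return -ENODEV;

	return thermal_zone_get_temp(tz, pcb_temp);
}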
index a4a33d1a07464a75b0b00e31adcb1b5c159144ce..1629652372b61d5c6e27849ebec820332193e07c 100644 (file)
@@ -57,4 +57,5 @@ bandgap {
                0x4a002380 0x2c
                0x4a0023C0 0x3c>;
        compatible = "ti,omap5430-bandgap";
+       interrupts = <0 126 4>; /* talert */
 };
index b783bfa59b1cea42f943596f8ddce5c620a08835..65971b784b78a011f854534bec7d1c3eea7496d0 100644 (file)
@@ -145,8 +145,8 @@ struct map_l4_peripheral {
 #define L4_PERIPHERAL_MBOX        0x48094000
 #define DSPVA_PERIPHERAL_MBOX     0x11808000
 
-#define PM_GRPSEL_BASE                         0x48307000
-#define DSPVA_GRPSEL_BASE              0x11821000
+#define PM_GRPSEL_BASE         0x48307000
+#define DSPVA_GRPSEL_BASE      0x11821000
 
 #define L4_PERIPHERAL_SIDETONE_MCBSP2        0x49028000
 #define DSPVA_PERIPHERAL_SIDETONE_MCBSP2 0x11824000
@@ -311,7 +311,7 @@ static const struct bpwr_clk_t bpwr_clks[] = {
 
 #define SET_GROUP_BITS16(reg, position, width, value) \
        do {\
-               reg &= ~((0xFFFF >> (16 - (width))) << (position)) ; \
+               reg &= ~((0xFFFF >> (16 - (width))) << (position)); \
                reg |= ((value & (0xFFFF >> (16 - (width)))) << (position)); \
        } while (0);
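
SET_GROUP_BITS16() clears a width-bit field at the given position and ORs in the new value; the 0xFFFF >> (16 - width) term builds the field mask. A worked example, assuming the macro above is in scope (hypothetical caller):

#include <linux/types.h>

static u16 example_set_bits(void)
{
	u16 reg = 0xABCD;

	/* width 4, position 8: mask = (0xFFFF >> 12) << 8 = 0x0F00 */
	SET_GROUP_BITS16(reg, 8, 4, 0x7);
	return reg;	/* 0xA7CD: only bits 8..11 were rewritten */
}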
 
index bd0354d9ad03c600e3f3a4ec07b6397d252fc545..7bbd3802c15f77e4c46b4c943b5f68034d083681 100644 (file)
@@ -40,7 +40,7 @@ extern int sleep_dsp(struct bridge_dev_context *dev_context,
                            u32 dw_cmd, void *pargs);
 /*
  *  ========interrupt_dsp========
- *       Sends an interrupt to DSP unconditionally.
+ *     Sends an interrupt to DSP unconditionally.
  */
 extern void interrupt_dsp(struct bridge_dev_context *dev_context,
                                                        u16 mb_val);
@@ -53,24 +53,24 @@ extern int dsp_peripheral_clk_ctrl(struct bridge_dev_context
                                        *dev_context, void *pargs);
 /*
  *  ======== handle_hibernation_from_dsp ========
- *     Handle Hibernation requested from DSP
+ *     Handle Hibernation requested from DSP
  */
 int handle_hibernation_from_dsp(struct bridge_dev_context *dev_context);
 /*
  *  ======== post_scale_dsp ========
- *     Handle Post Scale notification to DSP
+ *     Handle Post Scale notification to DSP
  */
 int post_scale_dsp(struct bridge_dev_context *dev_context,
                                                        void *pargs);
 /*
  *  ======== pre_scale_dsp ========
- *     Handle Pre Scale notification to DSP
+ *     Handle Pre Scale notification to DSP
  */
 int pre_scale_dsp(struct bridge_dev_context *dev_context,
                                                        void *pargs);
 /*
  *  ======== handle_constraints_set ========
- *     Handle constraints request from DSP
+ *     Handle constraints request from DSP
  */
 int handle_constraints_set(struct bridge_dev_context *dev_context,
                                  void *pargs);
index dafa6d9b294848fe6408c56a4ecdbbd6684d46a1..1862afd80dc1b0987b45e5d3ecf43724c3f2e050 100644 (file)
@@ -51,7 +51,7 @@
 
 /*
  *  ======== handle_constraints_set ========
- *     Sets new DSP constraint
+ *     Sets new DSP constraint
  */
 int handle_constraints_set(struct bridge_dev_context *dev_context,
                                  void *pargs)
@@ -75,7 +75,7 @@ int handle_constraints_set(struct bridge_dev_context *dev_context,
 
 /*
  *  ======== handle_hibernation_from_dsp ========
- *     Handle Hibernation requested from DSP
+ *     Handle Hibernation requested from DSP
  */
 int handle_hibernation_from_dsp(struct bridge_dev_context *dev_context)
 {
@@ -144,7 +144,7 @@ int handle_hibernation_from_dsp(struct bridge_dev_context *dev_context)
 
 /*
  *  ======== sleep_dsp ========
- *     Put DSP in low power consuming state.
+ *     Put DSP in low power consuming state.
  */
 int sleep_dsp(struct bridge_dev_context *dev_context, u32 dw_cmd,
                     void *pargs)
@@ -250,7 +250,7 @@ int sleep_dsp(struct bridge_dev_context *dev_context, u32 dw_cmd,
 
 /*
  *  ======== wake_dsp ========
- *     Wake up DSP from sleep.
+ *     Wake up DSP from sleep.
  */
 int wake_dsp(struct bridge_dev_context *dev_context, void *pargs)
 {
@@ -276,7 +276,7 @@ int wake_dsp(struct bridge_dev_context *dev_context, void *pargs)
 
 /*
  *  ======== dsp_peripheral_clk_ctrl ========
- *     Enable/Disable the DSP peripheral clocks as needed..
+ *     Enable/Disable the DSP peripheral clocks as needed.
  */
 int dsp_peripheral_clk_ctrl(struct bridge_dev_context *dev_context,
                                   void *pargs)
index 6aea6f1b4982e4e2a3a830f0149c6ea200d2911b..e68f0ba8e12bfbe2f8b3e47bc66f07b6ba81a288 100644 (file)
@@ -177,7 +177,7 @@ static void mmu_fault_print_stack(struct bridge_dev_context *dev_context)
        void *dummy_va_addr;
 
        resources = dev_context->resources;
-       dummy_va_addr = (void*)__get_free_page(GFP_ATOMIC);
+       dummy_va_addr = (void *)__get_free_page(GFP_ATOMIC);
 
        /*
         * Before acking the MMU fault, let's make sure MMU can only
index 7ff0e6c980395c80c9c83fe4355a6eef854a6a79..c7ee467f0f12aaf88d63b3931cd9b04413ba53e7 100644 (file)
@@ -25,8 +25,8 @@
 #include <dspbridge/host_os.h>
 
 
-#define OMAP34XX_WDT3_BASE             (0x49000000 + 0x30000)
-#define INT_34XX_WDT3_IRQ              (36 + NR_IRQS)
+#define OMAP34XX_WDT3_BASE     (0x49000000 + 0x30000)
+#define INT_34XX_WDT3_IRQ      (36 + NR_IRQS)
 
 static struct dsp_wdt_setting dsp_wdt;
 
index df0f37ea1ee50bf765680edba66ac1292ccc4c6b..9c020562c84637ca85d8d9e5ffe768c8b7809440 100644 (file)
@@ -508,6 +508,7 @@ static int omap34_xx_bridge_probe(struct platform_device *pdev)
        bridge_class = class_create(THIS_MODULE, "ti_bridge");
        if (IS_ERR(bridge_class)) {
                pr_err("%s: Error creating bridge class\n", __func__);
+               err = PTR_ERR(bridge_class);
                goto err3;
        }
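
class_create() reports failure through an ERR_PTR rather than NULL, so the error path must recover the errno with PTR_ERR() before jumping to cleanup; before this hunk, err could be left holding a stale success value. The idiom in isolation (illustrative names):

#include <linux/device.h>
#include <linux/err.h>
#include <linux/module.h>

static struct class *example_class;

static int example_setup(void)
{
	int err = 0;

	example_class = class_create(THIS_MODULE, "example");
	if (IS_ERR(example_class)) {
		/* Decode the errno packed into the ERR_PTR value. */
		err = PTR_ERR(example_class);
		goto out;
	}
	/* ... remaining setup ... */
out:
	return err;
}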
 
index 82123be8732d49861db86d11c8f6de895978b62d..64933b993d7a5f0a8ff64a82979ed20bb9890534 100644 (file)
@@ -85,7 +85,7 @@ int usbip_start_eh(struct usbip_device *ud)
 
        ud->eh = kthread_run(event_handler_loop, ud, "usbip_eh");
        if (IS_ERR(ud->eh)) {
-               pr_warning("Unable to start control thread\n");
+               pr_warn("Unable to start control thread\n");
                return PTR_ERR(ud->eh);
        }
 
index da7f759849792060678671cb3a5f70b17a8c4373..daec15565a43098c37e6d1ac8d50f56ae99acd88 100644 (file)
@@ -109,7 +109,7 @@ struct driver_stats {
        unsigned long ioctls;
        unsigned long irqs;
        unsigned long berrs;
-       unsigned long dmaErrors;
+       unsigned long dmaerrors;
        unsigned long timeouts;
        unsigned long external;
 };
@@ -160,7 +160,7 @@ static void reset_counters(void)
        statistics.ioctls = 0;
        statistics.irqs = 0;
        statistics.berrs = 0;
-       statistics.dmaErrors = 0;
+       statistics.dmaerrors = 0;
        statistics.timeouts = 0;
 }
 
@@ -734,6 +734,7 @@ static int vme_user_probe(struct vme_dev *vdev)
                if (image[i].resource == NULL) {
                        dev_warn(&vdev->dev,
                                 "Unable to allocate slave resource\n");
+                       err = -ENOMEM;
                        goto err_slave;
                }
                image[i].size_buf = PCI_BUF_SIZE;
@@ -760,6 +761,7 @@ static int vme_user_probe(struct vme_dev *vdev)
                if (image[i].resource == NULL) {
                        dev_warn(&vdev->dev,
                                 "Unable to allocate master resource\n");
+                       err = -ENOMEM;
                        goto err_master;
                }
                image[i].size_buf = PCI_BUF_SIZE;
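
Both hunks close the same hole: when a resource request returns NULL the code jumped to its error label without assigning err, so the probe could return a stale zero and report success after a failed allocation. The fix is to set the errno before every goto, as in this illustrative fragment:

#include <linux/device.h>
#include <linux/errno.h>

/* Illustrative probe fragment: no goto without an errno assignment. */
static int example_probe(struct device *dev, void *resource)
{
	int err = 0;

	if (resource == NULL) {
		dev_warn(dev, "Unable to allocate resource\n");
		err = -ENOMEM;	/* without this, probe would return 0 */
		goto err_out;
	}
	return 0;

err_out:
	/* ... unwind partially acquired resources ... */
	return err;
}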
index 7d24cd6343e4ca164708289317336a0fe3768fdb..280ccc7f26bb1eaff688219403703c31055ed7f6 100644 (file)
@@ -14,9 +14,9 @@ struct vme_master {
        u32 cycle;              /* Cycle properties */
        u32 dwidth;             /* Maximum Data Width */
 #if 0
-       char prefetchEnable;            /* Prefetch Read Enable State */
-       int prefetchSize;               /* Prefetch Read Size (Cache Lines) */
-       char wrPostEnable;              /* Write Post State */
+       char prefetchenable;            /* Prefetch Read Enable State */
+       int prefetchsize;               /* Prefetch Read Size (Cache Lines) */
+       char wrpostenable;              /* Write Post State */
 #endif
 };
 
@@ -37,9 +37,9 @@ struct vme_slave {
        u32 aspace;                     /* Address Space */
        u32 cycle;              /* Cycle properties */
 #if 0
-       char wrPostEnable;              /* Write Post State */
-       char rmwLock;                   /* Lock PCI during RMW Cycles */
-       char data64BitCapable;          /* non-VMEbus capable of 64-bit Data */
+       char wrpostenable;              /* Write Post State */
+       char rmwlock;                   /* Lock PCI during RMW Cycles */
+       char data64bitcapable;          /* non-VMEbus capable of 64-bit Data */
 #endif
 };
 
index 28078a114d4fe1cbcc2e4304164b911259befea0..ba533402a9aff74730a5a3abc7d5e289d2c70a7c 100644 (file)
@@ -68,7 +68,7 @@
 #define BIT30  0x40000000
 #define BIT31  0x80000000
 
-// 802.11 frame related, defined as 802.11 spec
+/* 802.11 frame related, defined as 802.11 spec */
 #define WLAN_ADDR_LEN               6
 #define WLAN_CRC_LEN                4
 #define WLAN_CRC32_LEN              4
index 4cb26f3faf26e5aeb1597d480cc898bd8a0b0303..76c8490b0734d212297e8fb00583c73bbaa5716b 100644 (file)
@@ -66,7 +66,7 @@
 /*---------------------  Static Variables  --------------------------*/
 
 static int msglevel = MSG_LEVEL_INFO;
-//static int          msglevel                =MSG_LEVEL_DEBUG;
+/* static int          msglevel                =MSG_LEVEL_DEBUG; */
 /*---------------------  Static Functions  --------------------------*/
 
 /*---------------------  Export Variables  --------------------------*/
@@ -90,7 +90,7 @@ vMgrEncodeBeacon(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                        + WLAN_BEACON_OFF_TS);
        pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -123,7 +123,7 @@ vMgrDecodeBeacon(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                        + WLAN_BEACON_OFF_TS);
        pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -131,7 +131,7 @@ vMgrDecodeBeacon(
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_BEACON_OFF_CAPINFO);
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)((unsigned char *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3)))
                           + WLAN_BEACON_OFF_SSID);
        while (((unsigned char *)pItem) < (pFrame->pBuf + pFrame->len)) {
@@ -145,7 +145,7 @@ vMgrDecodeBeacon(
                                pFrame->pSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
                        break;
                case WLAN_EID_FH_PARMS:
-                       //pFrame->pFHParms = (PWLAN_IE_FH_PARMS)pItem;
+                       /* pFrame->pFHParms = (PWLAN_IE_FH_PARMS)pItem; */
                        break;
                case WLAN_EID_DS_PARMS:
                        if (pFrame->pDSParms == NULL)
@@ -185,22 +185,22 @@ vMgrDecodeBeacon(
                                pFrame->pExtSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
                        break;
 
-               case WLAN_EID_COUNTRY:      //7
+               case WLAN_EID_COUNTRY:      /* 7 */
                        if (pFrame->pIE_Country == NULL)
                                pFrame->pIE_Country = (PWLAN_IE_COUNTRY)pItem;
                        break;
 
-               case WLAN_EID_PWR_CONSTRAINT:   //32
+               case WLAN_EID_PWR_CONSTRAINT:   /* 32 */
                        if (pFrame->pIE_PowerConstraint == NULL)
                                pFrame->pIE_PowerConstraint = (PWLAN_IE_PW_CONST)pItem;
                        break;
 
-               case WLAN_EID_CH_SWITCH:    //37
+               case WLAN_EID_CH_SWITCH:    /* 37 */
                        if (pFrame->pIE_CHSW == NULL)
                                pFrame->pIE_CHSW = (PWLAN_IE_CH_SW)pItem;
                        break;
 
-               case WLAN_EID_QUIET:        //40
+               case WLAN_EID_QUIET:        /* 40 */
                        if (pFrame->pIE_Quiet == NULL)
                                pFrame->pIE_Quiet = (PWLAN_IE_QUIET)pItem;
                        break;
@@ -282,7 +282,7 @@ vMgrEncodeDisassociation(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                              + WLAN_DISASSOC_OFF_REASON);
        pFrame->len = WLAN_HDR_ADDR3_LEN + WLAN_DISASSOC_OFF_REASON + sizeof(*(pFrame->pwReason));
@@ -308,7 +308,7 @@ vMgrDecodeDisassociation(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                              + WLAN_DISASSOC_OFF_REASON);
 
@@ -332,7 +332,7 @@ vMgrEncodeAssocRequest(
 )
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_ASSOCREQ_OFF_CAP_INFO);
        pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -360,13 +360,13 @@ vMgrDecodeAssocRequest(
        PWLAN_IE   pItem;
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_ASSOCREQ_OFF_CAP_INFO);
        pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                                      + WLAN_ASSOCREQ_OFF_LISTEN_INT);
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                           + WLAN_ASSOCREQ_OFF_SSID);
 
@@ -425,7 +425,7 @@ vMgrEncodeAssocResponse(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_ASSOCRESP_OFF_CAP_INFO);
        pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -458,7 +458,7 @@ vMgrDecodeAssocResponse(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_ASSOCRESP_OFF_CAP_INFO);
        pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -466,7 +466,7 @@ vMgrDecodeAssocResponse(
        pFrame->pwAid = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                           + WLAN_ASSOCRESP_OFF_AID);
 
-       // Information elements
+       /* Information elements */
        pFrame->pSuppRates  = (PWLAN_IE_SUPP_RATES)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                                    + WLAN_ASSOCRESP_OFF_SUPP_RATES);
 
@@ -501,7 +501,7 @@ vMgrEncodeReassocRequest(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_REASSOCREQ_OFF_CAP_INFO);
        pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -532,7 +532,7 @@ vMgrDecodeReassocRequest(
        PWLAN_IE   pItem;
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_REASSOCREQ_OFF_CAP_INFO);
        pFrame->pwListenInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -540,7 +540,7 @@ vMgrDecodeReassocRequest(
        pFrame->pAddrCurrAP = (PIEEE_ADDR)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                           + WLAN_REASSOCREQ_OFF_CURR_AP);
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                           + WLAN_REASSOCREQ_OFF_SSID);
 
@@ -622,7 +622,7 @@ vMgrDecodeProbeRequest(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3)));
 
        while (((unsigned char *)pItem) < (pFrame->pBuf + pFrame->len)) {
@@ -670,7 +670,7 @@ vMgrEncodeProbeResponse(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                        + WLAN_PROBERESP_OFF_TS);
        pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -704,7 +704,7 @@ vMgrDecodeProbeResponse(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pqwTimestamp = (PQWORD)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                        + WLAN_PROBERESP_OFF_TS);
        pFrame->pwBeaconInterval = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -712,7 +712,7 @@ vMgrDecodeProbeResponse(
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_PROBERESP_OFF_CAP_INFO);
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                           + WLAN_PROBERESP_OFF_SSID);
 
@@ -761,22 +761,22 @@ vMgrDecodeProbeResponse(
                                pFrame->pExtSuppRates = (PWLAN_IE_SUPP_RATES)pItem;
                        break;
 
-               case WLAN_EID_COUNTRY:      //7
+               case WLAN_EID_COUNTRY:      /* 7 */
                        if (pFrame->pIE_Country == NULL)
                                pFrame->pIE_Country = (PWLAN_IE_COUNTRY)pItem;
                        break;
 
-               case WLAN_EID_PWR_CONSTRAINT:   //32
+               case WLAN_EID_PWR_CONSTRAINT:   /* 32 */
                        if (pFrame->pIE_PowerConstraint == NULL)
                                pFrame->pIE_PowerConstraint = (PWLAN_IE_PW_CONST)pItem;
                        break;
 
-               case WLAN_EID_CH_SWITCH:    //37
+               case WLAN_EID_CH_SWITCH:    /* 37 */
                        if (pFrame->pIE_CHSW == NULL)
                                pFrame->pIE_CHSW = (PWLAN_IE_CH_SW)pItem;
                        break;
 
-               case WLAN_EID_QUIET:        //40
+               case WLAN_EID_QUIET:        /* 40 */
                        if (pFrame->pIE_Quiet == NULL)
                                pFrame->pIE_Quiet = (PWLAN_IE_QUIET)pItem;
                        break;
@@ -814,7 +814,7 @@ vMgrEncodeAuthen(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwAuthAlgorithm = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                                     + WLAN_AUTHEN_OFF_AUTH_ALG);
        pFrame->pwAuthSequence = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -846,7 +846,7 @@ vMgrDecodeAuthen(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwAuthAlgorithm = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                                     + WLAN_AUTHEN_OFF_AUTH_ALG);
        pFrame->pwAuthSequence = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -854,7 +854,7 @@ vMgrDecodeAuthen(
        pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                              + WLAN_AUTHEN_OFF_STATUS);
 
-       // Information elements
+       /* Information elements */
        pItem = (PWLAN_IE)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                           + WLAN_AUTHEN_OFF_CHALLENGE);
 
@@ -883,7 +883,7 @@ vMgrEncodeDeauthen(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                              + WLAN_DEAUTHEN_OFF_REASON);
        pFrame->len = WLAN_HDR_ADDR3_LEN + WLAN_DEAUTHEN_OFF_REASON + sizeof(*(pFrame->pwReason));
@@ -909,7 +909,7 @@ vMgrDecodeDeauthen(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwReason = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                              + WLAN_DEAUTHEN_OFF_REASON);
 
@@ -934,7 +934,7 @@ vMgrEncodeReassocResponse(
 {
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_REASSOCRESP_OFF_CAP_INFO);
        pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -967,7 +967,7 @@ vMgrDecodeReassocResponse(
 
        pFrame->pHdr = (PUWLAN_80211HDR)pFrame->pBuf;
 
-       // Fixed Fields
+       /* Fixed Fields */
        pFrame->pwCapInfo = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                               + WLAN_REASSOCRESP_OFF_CAP_INFO);
        pFrame->pwStatus = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
@@ -975,7 +975,7 @@ vMgrDecodeReassocResponse(
        pFrame->pwAid = (unsigned short *)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                           + WLAN_REASSOCRESP_OFF_AID);
 
-       //Information elements
+       /* Information elements */
        pFrame->pSuppRates = (PWLAN_IE_SUPP_RATES)(WLAN_HDR_A3_DATA_PTR(&(pFrame->pHdr->sA3))
                                                   + WLAN_REASSOCRESP_OFF_SUPP_RATES);
 
index 16402cf5d25efdc6b54ed3fc969844c63fd738be..065238beb4f4cfda45baa183f7c043c5f6961adb 100644 (file)
@@ -38,7 +38,7 @@
 
 #define WLAN_MIN_ARRAY          1
 
-// Information Element ID value
+/* Information Element ID value */
 #define WLAN_EID_SSID           0
 #define WLAN_EID_SUPP_RATES     1
 #define WLAN_EID_FH_PARMS       2
 #define WLAN_EID_QUIET          40
 #define WLAN_EID_IBSS_DFS       41
 #define WLAN_EID_ERP            42
-// reference 802.11i 7.3.2 table 20
+/* reference 802.11i 7.3.2 table 20 */
 #define WLAN_EID_RSN            48
 #define WLAN_EID_EXTSUPP_RATES  50
-// reference WiFi WPA spec.
+/* reference WiFi WPA spec. */
 #define WLAN_EID_RSN_WPA        221
 
 #define WLAN_EID_ERP_NONERP_PRESENT             0x01
 #define WLAN_EID_ERP_USE_PROTECTION             0x02
 #define WLAN_EID_ERP_BARKER_MODE                0x04
 
-// Reason Codes
+/* Reason Codes */
 #define WLAN_MGMT_REASON_RSVD                       0
 #define WLAN_MGMT_REASON_UNSPEC                     1
 #define WLAN_MGMT_REASON_PRIOR_AUTH_INVALID         2
@@ -94,7 +94,7 @@
 #define WLAN_MGMT_REASON_RSNE_CAP_INVALID           22
 #define WLAN_MGMT_REASON_80211X_AUTH_FAILED         23
 
-// Status Codes
+/* Status Codes */
 #define WLAN_MGMT_STATUS_SUCCESS                        0
 #define WLAN_MGMT_STATUS_UNSPEC_FAILURE                 1
 #define WLAN_MGMT_STATUS_CAPS_UNSUPPORTED               10
 #define WLAN_MGMT_STATUS_ASSOC_DENIED_PBCC              20
 #define WLAN_MGMT_STATUS_ASSOC_DENIED_AGILITY           21
 
-// reference 802.11h 7.3.1.9
-//
+/* reference 802.11h 7.3.1.9 */
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_SPECTRUM_MNG  22
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_PWR_CAP       23
 #define WLAN_MGMT_STATUS_ASSOC_REJECT_BCS_SUPP_CH       24
-//
-// reference 802.11g 7.3.1.9
-//
+/* reference 802.11g 7.3.1.9 */
 #define WLAN_MGMT_STATUS_SHORTSLOTTIME_UNSUPPORTED      25
 #define WLAN_MGMT_STATUS_DSSSOFDM_UNSUPPORTED           26
-//
-// reference 802.11i 7.3.1.9 table 19
-//
+/* reference 802.11i 7.3.1.9 table 19 */
 #define WLAN_MGMT_STATUS_INVALID_IE                     40
 #define WLAN_MGMT_STATUS_GROUP_CIPHER_INVALID           41
 #define WLAN_MGMT_STATUS_PAIRWISE_CIPHER_INVALID        42
 #define WLAN_MGMT_STATUS_INVALID_RSN_IE_CAP             45
 #define WLAN_MGMT_STATUS_CIPHER_REJECT                  46
 
-// Auth Algorithm
+/* Auth Algorithm */
 #define WLAN_AUTH_ALG_OPENSYSTEM                0
 #define WLAN_AUTH_ALG_SHAREDKEY                 1
 
-// Management Frame Field Offsets
-// Note: Not all fields are listed because of variable lengths.
-// Note: These offsets are from the start of the frame data
+/* Management Frame Field Offsets */
+/* Note: Not all fields are listed because of variable lengths. */
+/* Note: These offsets are from the start of the frame data */
 
 #define WLAN_BEACON_OFF_TS                  0
 #define WLAN_BEACON_OFF_BCN_INT             8
 
 #define WLAN_DEAUTHEN_OFF_REASON            0
 
-//
-// Cipher Suite Selectors defined in 802.11i
-//
+/* Cipher Suite Selectors defined in 802.11i */
 #define WLAN_11i_CSS_USE_GROUP              0
 #define WLAN_11i_CSS_WEP40                  1
 #define WLAN_11i_CSS_TKIP                   2
 #define WLAN_11i_CSS_WEP104                 5
 #define WLAN_11i_CSS_UNKNOWN                255
 
-//
-// Authentication and Key Management Suite Selectors defined in 802.11i
-//
+/* Authentication and Key Management Suite Selectors defined in 802.11i */
 #define WLAN_11i_AKMSS_802_1X               1
 #define WLAN_11i_AKMSS_PSK                  2
 #define WLAN_11i_AKMSS_UNKNOWN              255
 
-// Measurement type definitions reference ieee 802.11h Table 20b
+/* Measurement type definitions reference ieee 802.11h Table 20b */
 #define MEASURE_TYPE_BASIC      0
 #define MEASURE_TYPE_CCA        1
 #define MEASURE_TYPE_RPI        2
 
-// Measurement request mode definitions reference ieee 802.11h Figure 46h
+/* Measurement request mode definitions reference ieee 802.11h Figure 46h */
 #define MEASURE_MODE_ENABLE     0x02
 #define MEASURE_MODE_REQ        0x04
 #define MEASURE_MODE_REP        0x08
 
-// Measurement report mode definitions reference ieee 802.11h Figure 46m
+/* Measurement report mode definitions reference ieee 802.11h Figure 46m */
 #define MEASURE_MODE_LATE       0x01
 #define MEASURE_MODE_INCAPABLE  0x02
 #define MEASURE_MODE_REFUSED    0x04
 
 /*---------------------  Export Types  ------------------------------*/
 
-// Information Element Types
+/* Information Element Types */
 
 #pragma pack(1)
 typedef struct tagWLAN_IE {
@@ -226,7 +217,7 @@ typedef struct tagWLAN_IE {
 } __attribute__ ((__packed__))
 WLAN_IE, *PWLAN_IE;
 
-// Service Set Identity (SSID)
+/* Service Set Identity (SSID) */
 #pragma pack(1)
 typedef struct tagWLAN_IE_SSID {
        unsigned char byElementID;
@@ -235,7 +226,7 @@ typedef struct tagWLAN_IE_SSID {
 } __attribute__ ((__packed__))
 WLAN_IE_SSID, *PWLAN_IE_SSID;
 
-// Supported Rates
+/* Supported Rates */
 #pragma pack(1)
 typedef struct tagWLAN_IE_SUPP_RATES {
        unsigned char byElementID;
@@ -244,7 +235,7 @@ typedef struct tagWLAN_IE_SUPP_RATES {
 } __attribute__ ((__packed__))
 WLAN_IE_SUPP_RATES,  *PWLAN_IE_SUPP_RATES;
 
-// FH Parameter Set
+/* FH Parameter Set */
 #pragma pack(1)
 typedef struct _WLAN_IE_FH_PARMS {
        unsigned char byElementID;
@@ -255,7 +246,7 @@ typedef struct _WLAN_IE_FH_PARMS {
        unsigned char byHopIndex;
 } WLAN_IE_FH_PARMS,  *PWLAN_IE_FH_PARMS;
 
-// DS Parameter Set
+/* DS Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_DS_PARMS {
        unsigned char byElementID;
@@ -264,7 +255,7 @@ typedef struct tagWLAN_IE_DS_PARMS {
 } __attribute__ ((__packed__))
 WLAN_IE_DS_PARMS,  *PWLAN_IE_DS_PARMS;
 
-// CF Parameter Set
+/* CF Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_CF_PARMS {
        unsigned char byElementID;
@@ -276,7 +267,7 @@ typedef struct tagWLAN_IE_CF_PARMS {
 } __attribute__ ((__packed__))
 WLAN_IE_CF_PARMS,  *PWLAN_IE_CF_PARMS;
 
-// TIM
+/* TIM */
 #pragma pack(1)
 typedef struct tagWLAN_IE_TIM {
        unsigned char byElementID;
@@ -288,7 +279,7 @@ typedef struct tagWLAN_IE_TIM {
 } __attribute__ ((__packed__))
 WLAN_IE_TIM,  *PWLAN_IE_TIM;
 
-// IBSS Parameter Set
+/* IBSS Parameter Set */
 #pragma pack(1)
 typedef struct tagWLAN_IE_IBSS_PARMS {
        unsigned char byElementID;
@@ -297,7 +288,7 @@ typedef struct tagWLAN_IE_IBSS_PARMS {
 } __attribute__ ((__packed__))
 WLAN_IE_IBSS_PARMS, *PWLAN_IE_IBSS_PARMS;
 
-// Challenge Text
+/* Challenge Text */
 #pragma pack(1)
 typedef struct tagWLAN_IE_CHALLENGE {
        unsigned char byElementID;
@@ -316,8 +307,8 @@ typedef struct tagWLAN_IE_RSN_EXT {
        unsigned short wPKCount;
        struct {
                unsigned char abyOUI[4];
-       } PKSList[1]; // the rest is variable so need to
-       // overlay ieauth structure
+       } PKSList[1]; /* the rest is variable so need to */
+       /* overlay ieauth structure */
 } WLAN_IE_RSN_EXT, *PWLAN_IE_RSN_EXT;
 
 #pragma pack(1)
@@ -328,7 +319,7 @@ typedef struct tagWLAN_IE_RSN_AUTH {
        } AuthKSList[1];
 } WLAN_IE_RSN_AUTH, *PWLAN_IE_RSN_AUTH;
 
-// RSN Identity
+/* RSN Identity */
 #pragma pack(1)
 typedef struct tagWLAN_IE_RSN {
        unsigned char byElementID;
@@ -337,7 +328,7 @@ typedef struct tagWLAN_IE_RSN {
        unsigned char abyRSN[WLAN_MIN_ARRAY];
 } WLAN_IE_RSN, *PWLAN_IE_RSN;
 
-// ERP
+/* ERP */
 #pragma pack(1)
 typedef struct tagWLAN_IE_ERP {
        unsigned char byElementID;
@@ -466,8 +457,8 @@ typedef struct _WLAN_IE_IBSS_DFS {
 
 #pragma pack()
 
-// Frame Types
-// prototype structure, all mgmt frame types will start with these members
+/* Frame Types */
+/* prototype structure, all mgmt frame types will start with these members */
 typedef struct tagWLAN_FR_MGMT {
        unsigned int    uType;
        unsigned int    len;
@@ -475,20 +466,20 @@ typedef struct tagWLAN_FR_MGMT {
        PUWLAN_80211HDR       pHdr;
 } WLAN_FR_MGMT,  *PWLAN_FR_MGMT;
 
-// Beacon frame
+/* Beacon frame */
 typedef struct tagWLAN_FR_BEACON {
        unsigned int    uType;
        unsigned int    len;
        unsigned char *pBuf;
        PUWLAN_80211HDR         pHdr;
-       // fixed fields
+       /* fixed fields */
        PQWORD                  pqwTimestamp;
        unsigned short *pwBeaconInterval;
        unsigned short *pwCapInfo;
        /*-- info elements ----------*/
        PWLAN_IE_SSID           pSSID;
        PWLAN_IE_SUPP_RATES     pSuppRates;
-//  PWLAN_IE_FH_PARMS       pFHParms;
+/*  PWLAN_IE_FH_PARMS       pFHParms; */
        PWLAN_IE_DS_PARMS       pDSParms;
        PWLAN_IE_CF_PARMS       pCFParms;
        PWLAN_IE_TIM            pTIM;
@@ -504,19 +495,19 @@ typedef struct tagWLAN_FR_BEACON {
        PWLAN_IE_QUIET          pIE_Quiet;
 } WLAN_FR_BEACON, *PWLAN_FR_BEACON;
 
-// IBSS ATIM frame
+/* IBSS ATIM frame */
 typedef struct tagWLAN_FR_IBSSATIM {
        unsigned int    uType;
        unsigned int    len;
        unsigned char *pBuf;
        PUWLAN_80211HDR         pHdr;
 
-       // fixed fields
-       // info elements
-       // this frame type has a null body
+       /* fixed fields */
+       /* info elements */
+       /* this frame type has a null body */
 } WLAN_FR_IBSSATIM, *PWLAN_FR_IBSSATIM;
 
-// Disassociation
+/* Disassociation */
 typedef struct tagWLAN_FR_DISASSOC {
        unsigned int    uType;
        unsigned int    len;
@@ -527,7 +518,7 @@ typedef struct tagWLAN_FR_DISASSOC {
        /*-- info elements ----------*/
 } WLAN_FR_DISASSOC, *PWLAN_FR_DISASSOC;
 
-// Association Request
+/* Association Request */
 typedef struct tagWLAN_FR_ASSOCREQ {
        unsigned int    uType;
        unsigned int    len;
@@ -546,7 +537,7 @@ typedef struct tagWLAN_FR_ASSOCREQ {
        PWLAN_IE_SUPP_CH        pCurrSuppCh;
 } WLAN_FR_ASSOCREQ, *PWLAN_FR_ASSOCREQ;
 
-// Association Response
+/* Association Response */
 typedef struct tagWLAN_FR_ASSOCRESP {
        unsigned int    uType;
        unsigned int    len;
@@ -561,7 +552,7 @@ typedef struct tagWLAN_FR_ASSOCRESP {
        PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_ASSOCRESP, *PWLAN_FR_ASSOCRESP;
 
-// Reassociation Request
+/* Reassociation Request */
 typedef struct tagWLAN_FR_REASSOCREQ {
        unsigned int    uType;
        unsigned int    len;
@@ -581,7 +572,7 @@ typedef struct tagWLAN_FR_REASSOCREQ {
        PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_REASSOCREQ, *PWLAN_FR_REASSOCREQ;
 
-// Reassociation Response
+/* Reassociation Response */
 typedef struct tagWLAN_FR_REASSOCRESP {
        unsigned int    uType;
        unsigned int    len;
@@ -596,7 +587,7 @@ typedef struct tagWLAN_FR_REASSOCRESP {
        PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_REASSOCRESP, *PWLAN_FR_REASSOCRESP;
 
-// Probe Request
+/* Probe Request */
 typedef struct tagWLAN_FR_PROBEREQ {
        unsigned int    uType;
        unsigned int    len;
@@ -609,7 +600,7 @@ typedef struct tagWLAN_FR_PROBEREQ {
        PWLAN_IE_SUPP_RATES     pExtSuppRates;
 } WLAN_FR_PROBEREQ, *PWLAN_FR_PROBEREQ;
 
-// Probe Response
+/* Probe Response */
 typedef struct tagWLAN_FR_PROBERESP {
        unsigned int    uType;
        unsigned int    len;
@@ -636,7 +627,7 @@ typedef struct tagWLAN_FR_PROBERESP {
        PWLAN_IE_QUIET          pIE_Quiet;
 } WLAN_FR_PROBERESP, *PWLAN_FR_PROBERESP;
 
-// Authentication
+/* Authentication */
 typedef struct tagWLAN_FR_AUTHEN {
        unsigned int    uType;
        unsigned int    len;
@@ -650,7 +641,7 @@ typedef struct tagWLAN_FR_AUTHEN {
        PWLAN_IE_CHALLENGE      pChallenge;
 } WLAN_FR_AUTHEN, *PWLAN_FR_AUTHEN;
 
-// Deauthenication
+/* Deauthentication */
 typedef struct tagWLAN_FR_DEAUTHEN {
        unsigned int    uType;
        unsigned int    len;
@@ -774,4 +765,4 @@ vMgrDecodeReassocResponse(
        PWLAN_FR_REASSOCRESP  pFrame
 );
 
-#endif// __80211MGR_H__
+#endif /* __80211MGR_H__ */
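
The information-element typedefs converted above are byte-packed (both #pragma pack(1) and __attribute__((__packed__))) so they can be overlaid directly onto raw 802.11 frame bytes, which is what the vMgrDecode* routines do. A minimal sketch of that overlay pattern (struct and function names illustrative):

#pragma pack(1)
struct example_ie {
	unsigned char id;	/* element ID */
	unsigned char len;	/* payload length in bytes */
	unsigned char data[1];	/* variable-length payload follows */
} __attribute__ ((__packed__));
#pragma pack()

/* Overlay the packed header on raw frame bytes; packing guarantees
 * the fields line up with the wire format, with no padding. */
static unsigned char example_ie_id(unsigned char *frame)
{
	struct example_ie *ie = (struct example_ie *)frame;

	return ie->id;
}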
index 36081481c6d17f13ff88b25a915ffa49f0dbfaa9..93a2638a7ad55dbbd7b821acc4db45d4454b2834 100644 (file)
@@ -205,7 +205,7 @@ void AESv128(unsigned char *key, unsigned char *data, unsigned char *ciphertext)
                        SubBytes(ciphertext, TmpdataA);
                        ShiftRows(TmpdataA, TmpdataB);
                        xor_128(TmpdataB, abyRoundKey, ciphertext);
-               } else // round 1 ~ 9
+               } else /* round 1 ~ 9 */
                {
                        SubBytes(ciphertext, TmpdataA);
                        ShiftRows(TmpdataA, TmpdataB);
@@ -249,7 +249,7 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        unsigned char *pbyIV;
        unsigned char *pbyPayload;
        unsigned short wHLen = 22;
-       unsigned short wPayloadSize = wFrameSize - 8 - 8 - 4 - WLAN_HDR_ADDR3_LEN;//8 is IV, 8 is MIC, 4 is CRC
+       unsigned short wPayloadSize = wFrameSize - 8 - 8 - 4 - WLAN_HDR_ADDR3_LEN; /* 8 is IV, 8 is MIC, 4 is CRC */
        bool bA4 = false;
        unsigned char byTmp;
        unsigned short wCnt;
@@ -259,13 +259,13 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        if (WLAN_GET_FC_TODS(*(unsigned short *)pbyFrame) &&
            WLAN_GET_FC_FROMDS(*(unsigned short *)pbyFrame)) {
                bA4 = true;
-               pbyIV += 6;             // 6 is 802.11 address4
+               pbyIV += 6;             /* 6 is 802.11 address4 */
                wHLen += 6;
                wPayloadSize -= 6;
        }
-       pbyPayload = pbyIV + 8; //IV-length
+       pbyPayload = pbyIV + 8; /* IV-length */
 
-       abyNonce[0]  = 0x00; //now is 0, if Qos here will be priority
+       abyNonce[0]  = 0x00; /* now is 0, if Qos here will be priority */
        memcpy(&(abyNonce[1]), pMACHeader->abyAddr2, ETH_ALEN);
        abyNonce[7]  = pbyIV[7];
        abyNonce[8]  = pbyIV[6];
@@ -274,13 +274,13 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        abyNonce[11] = pbyIV[1];
        abyNonce[12] = pbyIV[0];
 
-       //MIC_IV
+       /* MIC_IV */
        MIC_IV[0] = 0x59;
        memcpy(&(MIC_IV[1]), &(abyNonce[0]), 13);
        MIC_IV[14] = (unsigned char)(wPayloadSize >> 8);
        MIC_IV[15] = (unsigned char)(wPayloadSize & 0xff);
 
-       //MIC_HDR1
+       /* MIC_HDR1 */
        MIC_HDR1[0] = (unsigned char)(wHLen >> 8);
        MIC_HDR1[1] = (unsigned char)(wHLen & 0xff);
        byTmp = (unsigned char)(pMACHeader->wFrameCtl & 0xff);
@@ -291,7 +291,7 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        memcpy(&(MIC_HDR1[4]), pMACHeader->abyAddr1, ETH_ALEN);
        memcpy(&(MIC_HDR1[10]), pMACHeader->abyAddr2, ETH_ALEN);
 
-       //MIC_HDR2
+       /* MIC_HDR2 */
        memcpy(&(MIC_HDR2[0]), pMACHeader->abyAddr3, ETH_ALEN);
        byTmp = (unsigned char)(pMACHeader->wSeqCtl & 0xff);
        MIC_HDR2[6] = byTmp & 0x0f;
@@ -309,7 +309,7 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        MIC_HDR2[14] = 0x00;
        MIC_HDR2[15] = 0x00;
 
-       //CCMP
+       /* CCMP */
        AESv128(pbyRxKey, MIC_IV, abyMIC);
        for (kk = 0; kk < 16; kk++) {
                abyTmp[kk] = MIC_HDR1[kk] ^ abyMIC[kk];
@@ -341,9 +341,9 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
                memcpy(pbyPayload, abyPlainText, 16);
                wCnt++;
                pbyPayload += 16;
-       } //for wPayloadSize
+       } /* for wPayloadSize */
 
-       //last payload
+       /* last payload */
        memcpy(&(abyLastCipher[0]), pbyPayload, jj);
        for (ii = jj; ii < 16; ii++) {
                abyLastCipher[ii] = 0x00;
@@ -359,7 +359,7 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        memcpy(pbyPayload, abyPlainText, jj);
        pbyPayload += jj;
 
-       //for MIC calculation
+       /* for MIC calculation */
        for (ii = jj; ii < 16; ii++) {
                abyPlainText[ii] = 0x00;
        }
@@ -368,8 +368,8 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        }
        AESv128(pbyRxKey, abyTmp, abyMIC);
 
-       //=>above is the calculate MIC
-       //--------------------------------------------
+       /* =>above is the calculate MIC */
+       /* -------------------------------------------- */
 
        wCnt = 0;
        abyCTRPLD[14] = (unsigned char)(wCnt >> 8);
@@ -378,8 +378,8 @@ bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned shor
        for (kk = 0; kk < 8; kk++) {
                abyTmp[kk] = abyTmp[kk] ^ pbyPayload[kk];
        }
-       //=>above is the dec-MIC from packet
-       //--------------------------------------------
+       /* =>above is the dec-MIC from packet */
+       /* -------------------------------------------- */
 
        if (!memcmp(abyMIC, abyTmp, 8)) {
                return true;
index c8b28b0e9bdcc3740f2ce241eb97bee21876d8f3..cc02e645aa5679151c9eadabdec2ac6e8a276dc3 100644 (file)
@@ -43,4 +43,4 @@
 /*---------------------  Export Functions  --------------------------*/
 bool AESbGenCCMP(unsigned char *pbyRxKey, unsigned char *pbyFrame, unsigned short wFrameSize);
 
-#endif //__AES_H__
+#endif /* __AES_H__ */
index 8417c2f2c6cfdbf2208a039b626b60ce56524134..57a08c5771f29fd93568fe6fdc0a4d841dbed218 100644 (file)
@@ -80,7 +80,7 @@ static int hostap_enable_hostapd(PSDevice pDevice, int rtnl_locked)
 
        DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: Enabling hostapd mode\n", dev->name);
 
-       pDevice->apdev = kzalloc(sizeof(struct net_device), GFP_KERNEL);
+       pDevice->apdev = alloc_etherdev(sizeof(*apdev_priv));
        if (pDevice->apdev == NULL)
                return -ENOMEM;
 
@@ -104,6 +104,8 @@ static int hostap_enable_hostapd(PSDevice pDevice, int rtnl_locked)
        if (ret) {
                DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: register_netdevice(AP) failed!\n",
                        dev->name);
+               free_netdev(pDevice->apdev);
+               pDevice->apdev = NULL;
                return -1;
        }
 
@@ -141,7 +143,7 @@ static int hostap_disable_hostapd(PSDevice pDevice, int rtnl_locked)
                DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO "%s: Netdevice %s unregistered\n",
                        pDevice->dev->name, pDevice->apdev->name);
        }
-       kfree(pDevice->apdev);
+       free_netdev(pDevice->apdev);
        pDevice->apdev = NULL;
        pDevice->bEnable8021x = false;
        pDevice->bEnableHostWEP = false;
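
A struct net_device must come from alloc_netdev()/alloc_etherdev() and be released with free_netdev(), never kzalloc()/kfree(), since the allocator sizes and initializes the private area behind the device; the hunks above also free the device when registration fails. A condensed sketch of the lifecycle (illustrative priv type, using the rtnl-taking register_netdev()):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct example_priv {
	int mode;
};

static int example_create(struct net_device **out)
{
	struct net_device *dev;
	int ret;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	ret = register_netdev(dev);
	if (ret) {
		free_netdev(dev);	/* matching allocator for net_device */
		return ret;
	}
	*out = dev;
	return 0;
}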
index 44cfe0b141800d5cbb3a0fa8f46314f7e9279941..d27fa434550da50181f34a519aa3e2545a291d68 100644 (file)
@@ -29,6 +29,9 @@
  *      IFRFbWriteEmbedded      - Embedded write RF register via MAC
  *
  * Revision History:
+ *     RF_VT3226: RobertYu:20051111, VT3226C0 and before
+ *     RF_VT3226D0: RobertYu:20051228
+ *     RF_VT3342A0: RobertYu:20060609
  *
  */
 
@@ -61,7 +64,7 @@ static int          msglevel                =MSG_LEVEL_INFO;
 #define VT3342_PWR_IDX_LEN    64
 //}}
 
-u8 abyAL2230InitTable[CB_AL2230_INIT_SEQ][3] = {
+static u8 al2230_init_table[CB_AL2230_INIT_SEQ][3] = {
     {0x03, 0xF7, 0x90},
     {0x03, 0x33, 0x31},
     {0x01, 0xB8, 0x02},
@@ -79,7 +82,7 @@ u8 abyAL2230InitTable[CB_AL2230_INIT_SEQ][3] = {
     {0x00, 0x58, 0x0F}
     };
 
-u8 abyAL2230ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
+static u8 al2230_channel_table0[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0xF7, 0x90}, // channel = 1, Tf = 2412MHz
     {0x03, 0xF7, 0x90}, // channel = 2, Tf = 2417MHz
     {0x03, 0xE7, 0x90}, // channel = 3, Tf = 2422MHz
@@ -96,7 +99,7 @@ u8 abyAL2230ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0xE7, 0xC0}  // channel = 14, Tf = 2412M
     };
 
-u8 abyAL2230ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
+static u8 al2230_channel_table1[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0x33, 0x31}, // channel = 1, Tf = 2412MHz
     {0x0B, 0x33, 0x31}, // channel = 2, Tf = 2417MHz
     {0x03, 0x33, 0x31}, // channel = 3, Tf = 2422MHz
@@ -115,7 +118,7 @@ u8 abyAL2230ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
 
 // 40MHz reference frequency
 // Need to Pull PLLON(PE3) low when writing channel registers through 3-wire.
-u8 abyAL7230InitTable[CB_AL7230_INIT_SEQ][3] = {
+static u8 al7230_init_table[CB_AL7230_INIT_SEQ][3] = {
     {0x20, 0x37, 0x90}, // Channel1 // Need modify for 11a
     {0x13, 0x33, 0x31}, // Channel1 // Need modify for 11a
     {0x84, 0x1F, 0xF2}, // Need modify for 11a: 451FE2
@@ -138,7 +141,7 @@ u8 abyAL7230InitTable[CB_AL7230_INIT_SEQ][3] = {
     {0x1A, 0xBA, 0x8F} // Need modify for 11a: 12BACF
     };
 
-u8 abyAL7230InitTableAMode[CB_AL7230_INIT_SEQ][3] = {
+static u8 al7230_init_table_amode[CB_AL7230_INIT_SEQ][3] = {
     {0x2F, 0xF5, 0x20}, // Channel184 // Need modify for 11b/g
     {0x00, 0x00, 0x01}, // Channel184 // Need modify for 11b/g
     {0x45, 0x1F, 0xE2}, // Need modify for 11b/g
@@ -157,7 +160,7 @@ u8 abyAL7230InitTableAMode[CB_AL7230_INIT_SEQ][3] = {
     {0x12, 0xBA, 0xCF} // Need modify for 11b/g
     };
 
-u8 abyAL7230ChannelTable0[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table0[CB_MAX_CHANNEL][3] = {
     {0x20, 0x37, 0x90}, // channel =  1, Tf = 2412MHz
     {0x20, 0x37, 0x90}, // channel =  2, Tf = 2417MHz
     {0x20, 0x37, 0x90}, // channel =  3, Tf = 2422MHz
@@ -223,7 +226,7 @@ u8 abyAL7230ChannelTable0[CB_MAX_CHANNEL][3] = {
     {0x2F, 0xF6, 0x10} // channel = 165, Tf = 5825MHz (56)
     };
 
-u8 abyAL7230ChannelTable1[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table1[CB_MAX_CHANNEL][3] = {
     {0x13, 0x33, 0x31}, // channel =  1, Tf = 2412MHz
     {0x1B, 0x33, 0x31}, // channel =  2, Tf = 2417MHz
     {0x03, 0x33, 0x31}, // channel =  3, Tf = 2422MHz
@@ -287,7 +290,7 @@ u8 abyAL7230ChannelTable1[CB_MAX_CHANNEL][3] = {
     {0x02, 0xAA, 0xB1}  // channel = 165, Tf = 5825MHz (56)
     };
 
-u8 abyAL7230ChannelTable2[CB_MAX_CHANNEL][3] = {
+static u8 al7230_channel_table2[CB_MAX_CHANNEL][3] = {
     {0x7F, 0xD7, 0x84}, // channel =  1, Tf = 2412MHz
     {0x7F, 0xD7, 0x84}, // channel =  2, Tf = 2417MHz
     {0x7F, 0xD7, 0x84}, // channel =  3, Tf = 2422MHz
@@ -352,7 +355,7 @@ u8 abyAL7230ChannelTable2[CB_MAX_CHANNEL][3] = {
     };
 
 ///{{RobertYu:20051111
-u8 abyVT3226_InitTable[CB_VT3226_INIT_SEQ][3] = {
+static u8 vt3226_init_table[CB_VT3226_INIT_SEQ][3] = {
     {0x03, 0xFF, 0x80},
     {0x02, 0x82, 0xA1},
     {0x03, 0xC6, 0xA2},
@@ -366,7 +369,7 @@ u8 abyVT3226_InitTable[CB_VT3226_INIT_SEQ][3] = {
     {0x02, 0x00, 0x2A}
     };
 
-u8 abyVT3226D0_InitTable[CB_VT3226_INIT_SEQ][3] = {
+static u8 vt3226d0_init_table[CB_VT3226_INIT_SEQ][3] = {
     {0x03, 0xFF, 0x80},
     {0x03, 0x02, 0x21}, //RobertYu:20060327
     {0x03, 0xC6, 0xA2},
@@ -380,7 +383,7 @@ u8 abyVT3226D0_InitTable[CB_VT3226_INIT_SEQ][3] = {
     {0x02, 0x01, 0xAA}  //RobertYu:20060523
     };
 
-u8 abyVT3226_ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
+static u8 vt3226_channel_table0[CB_MAX_CHANNEL_24G][3] = {
     {0x01, 0x97, 0x83}, // channel = 1, Tf = 2412MHz
     {0x01, 0x97, 0x83}, // channel = 2, Tf = 2417MHz
     {0x01, 0x97, 0x93}, // channel = 3, Tf = 2422MHz
@@ -397,7 +400,7 @@ u8 abyVT3226_ChannelTable0[CB_MAX_CHANNEL_24G][3] = {
     {0x03, 0x37, 0xC3}  // channel = 14, Tf = 2484MHz
     };
 
-u8 abyVT3226_ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
+static u8 vt3226_channel_table1[CB_MAX_CHANNEL_24G][3] = {
     {0x02, 0x66, 0x64}, // channel = 1, Tf = 2412MHz
     {0x03, 0x66, 0x64}, // channel = 2, Tf = 2417MHz
     {0x00, 0x66, 0x64}, // channel = 3, Tf = 2422MHz
@@ -416,7 +419,7 @@ u8 abyVT3226_ChannelTable1[CB_MAX_CHANNEL_24G][3] = {
 ///}}RobertYu
 
 //{{RobertYu:20060502, TWIF 1.14, LO Current for 11b mode
-u32 dwVT3226D0LoCurrentTable[CB_MAX_CHANNEL_24G] = {
+const u32 vt3226d0_lo_current_table[CB_MAX_CHANNEL_24G] = {
     0x0135C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 1, Tf = 2412MHz
     0x0135C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 2, Tf = 2417MHz
     0x0235C600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW, // channel = 3, Tf = 2422MHz
@@ -435,7 +438,7 @@ u32 dwVT3226D0LoCurrentTable[CB_MAX_CHANNEL_24G] = {
 //}}
 
 //{{RobertYu:20060609
-u8 abyVT3342A0_InitTable[CB_VT3342_INIT_SEQ][3] = { /* 11b/g mode */
+static u8 vt3342a0_init_table[CB_VT3342_INIT_SEQ][3] = { /* 11b/g mode */
     {0x03, 0xFF, 0x80}, //update for mode//
     {0x02, 0x08, 0x81},
     {0x00, 0xC6, 0x02},
@@ -458,7 +461,7 @@ u8 abyVT3342A0_InitTable[CB_VT3342_INIT_SEQ][3] = { /* 11b/g mode */
  // channel56, 5280MHz  0x00C402 for disable Frac
  // other channels 0x00C602
 
-u8 abyVT3342_ChannelTable0[CB_MAX_CHANNEL][3] = {
+static u8 vt3342_channel_table0[CB_MAX_CHANNEL][3] = {
     {0x02, 0x05, 0x03}, // channel = 1, Tf = 2412MHz
     {0x01, 0x15, 0x03}, // channel = 2, Tf = 2417MHz
     {0x03, 0xC5, 0x03}, // channel = 3, Tf = 2422MHz
@@ -524,7 +527,7 @@ u8 abyVT3342_ChannelTable0[CB_MAX_CHANNEL][3] = {
     {0x00, 0x06, 0x03}  // channel = 165, Tf = 5825MHz (56), TBD
     };
 
-u8 abyVT3342_ChannelTable1[CB_MAX_CHANNEL][3] = {
+static u8 vt3342_channel_table1[CB_MAX_CHANNEL][3] = {
     {0x01, 0x99, 0x94}, // channel = 1, Tf = 2412MHz
     {0x02, 0x44, 0x44}, // channel = 2, Tf = 2417MHz
     {0x02, 0xEE, 0xE4}, // channel = 3, Tf = 2422MHz
@@ -594,7 +597,7 @@ u8 abyVT3342_ChannelTable1[CB_MAX_CHANNEL][3] = {
  *
 -*/
 
-const u32 dwAL2230PowerTable[AL2230_PWR_IDX_LEN] = {
+const u32 al2230_power_table[AL2230_PWR_IDX_LEN] = {
     0x04040900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
     0x04041900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
     0x04042900+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW,
@@ -732,42 +735,41 @@ int IFRFbWriteEmbedded(struct vnt_private *pDevice, u32 dwData)
  * Return Value: true if succeeded; false if failed.
  *
  */
-int RFbSetPower(struct vnt_private *pDevice, u32 uRATE, u32 uCH)
+int RFbSetPower(struct vnt_private *priv, u32 rate, u32 channel)
 {
-       int bResult = true;
-       u8 byPwr = pDevice->byCCKPwr;
+       int ret = true;
+       u8 power = priv->byCCKPwr;
 
-       if (pDevice->dwDiagRefCount)
+       if (priv->dwDiagRefCount)
                return true;
 
-       if (uCH == 0)
+       if (channel == 0)
                return -EINVAL;
 
-    switch (uRATE) {
-    case RATE_1M:
-    case RATE_2M:
-    case RATE_5M:
-    case RATE_11M:
-        byPwr = pDevice->abyCCKPwrTbl[uCH-1];
-        break;
-    case RATE_6M:
-    case RATE_9M:
-    case RATE_18M:
-    case RATE_24M:
-    case RATE_36M:
-    case RATE_48M:
-    case RATE_54M:
-        if (uCH > CB_MAX_CHANNEL_24G) {
-            byPwr = pDevice->abyOFDMAPwrTbl[uCH-15];
-        } else {
-            byPwr = pDevice->abyOFDMPwrTbl[uCH-1];
-        }
-        break;
-    }
-
-    bResult = RFbRawSetPower(pDevice, byPwr, uRATE);
-
-    return bResult;
+       switch (rate) {
+       case RATE_1M:
+       case RATE_2M:
+       case RATE_5M:
+       case RATE_11M:
+               power = priv->abyCCKPwrTbl[channel - 1];
+               break;
+       case RATE_6M:
+       case RATE_9M:
+       case RATE_18M:
+       case RATE_24M:
+       case RATE_36M:
+       case RATE_48M:
+       case RATE_54M:
+               if (channel > CB_MAX_CHANNEL_24G)
+                       power = priv->abyOFDMAPwrTbl[channel - 15];
+               else
+                       power = priv->abyOFDMPwrTbl[channel - 1];
+               break;
+       }
+
+       ret = RFbRawSetPower(priv, power, rate);
+
+       return ret;
 }
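
Note on the table selection in the rewritten RFbSetPower() above: the rate
class picks the power table (CCK rates use abyCCKPwrTbl, OFDM rates use
abyOFDMPwrTbl below channel 15 and abyOFDMAPwrTbl above it), and the channel
picks the slot, with the 5 GHz table offset by 15. A minimal user-space model
of that index arithmetic; power_index() is a hypothetical name, not a driver
symbol:

	#include <stdio.h>

	#define CB_MAX_CHANNEL_24G 14

	/* Model: map a rate class and channel to a power-table slot. */
	static int power_index(int rate_is_cck, unsigned int channel)
	{
		if (rate_is_cck)
			return channel - 1;	/* CCK table, 2.4 GHz only */
		if (channel > CB_MAX_CHANNEL_24G)
			return channel - 15;	/* OFDM 5 GHz table */
		return channel - 1;		/* OFDM 2.4 GHz table */
	}

	int main(void)
	{
		printf("ch 3, CCK   -> idx %d\n", power_index(1, 3));	/* 2 */
		printf("ch 15, OFDM -> idx %d\n", power_index(0, 15));	/* 0 */
		return 0;
	}

Compiled standalone this prints index 2 for channel 3 CCK and index 0 for the
first slot above the 2.4 GHz band, matching the table accesses above.
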
 
 /*
@@ -784,136 +786,146 @@ int RFbSetPower(struct vnt_private *pDevice, u32 uRATE, u32 uCH)
  *
  */
 
-int RFbRawSetPower(struct vnt_private *pDevice, u8 byPwr, u32 uRATE)
+int RFbRawSetPower(struct vnt_private *priv, u8 power, u32 rate)
 {
-       int bResult = true;
-
-    if (pDevice->byCurPwr == byPwr)
-        return true;
-
-    pDevice->byCurPwr = byPwr;
-
-    switch (pDevice->byRFType) {
-
-        case RF_AL2230 :
-            if (pDevice->byCurPwr >= AL2230_PWR_IDX_LEN)
-                return false;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwAL2230PowerTable[pDevice->byCurPwr]);
-            if (uRATE <= RATE_11M)
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0001B400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            else
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0005A400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            break;
-
-        case RF_AL2230S :
-            if (pDevice->byCurPwr >= AL2230_PWR_IDX_LEN)
-                return false;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwAL2230PowerTable[pDevice->byCurPwr]);
-            if (uRATE <= RATE_11M) {
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x040C1400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00299B00+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            }else {
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x0005A400+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00099B00+(BY_AL2230_REG_LEN<<3)+IFREGCTL_REGW);
-            }
-            break;
-
-        case RF_AIROHA7230:
-            {
-                u32       dwMax7230Pwr;
-
-                if (uRATE <= RATE_11M) { //RobertYu:20060426, for better 11b mask
-                    bResult &= IFRFbWriteEmbedded(pDevice, 0x111BB900+(BY_AL7230_REG_LEN<<3)+IFREGCTL_REGW);
-                }
-                else {
-                    bResult &= IFRFbWriteEmbedded(pDevice, 0x221BB900+(BY_AL7230_REG_LEN<<3)+IFREGCTL_REGW);
-                }
-
-                if (pDevice->byCurPwr > AL7230_PWR_IDX_LEN) return false;
-
-                //  0x080F1B00 for 3 wire control TxGain(D10) and 0x31 as TX Gain value
-                dwMax7230Pwr = 0x080C0B00 | ( (pDevice->byCurPwr) << 12 ) |
-                                 (BY_AL7230_REG_LEN << 3 )  | IFREGCTL_REGW;
-
-                bResult &= IFRFbWriteEmbedded(pDevice, dwMax7230Pwr);
-                break;
-            }
-            break;
-
-        case RF_VT3226: //RobertYu:20051111, VT3226C0 and before
-        {
-            u32       dwVT3226Pwr;
-
-            if (pDevice->byCurPwr >= VT3226_PWR_IDX_LEN)
-                return false;
-            dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x17 << 8 ) /* Reg7 */ |
-                           (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
-            break;
-        }
-
-        case RF_VT3226D0: //RobertYu:20051228
-        {
-            u32       dwVT3226Pwr;
-
-            if (pDevice->byCurPwr >= VT3226_PWR_IDX_LEN)
-                return false;
-
-            if (uRATE <= RATE_11M) {
-
-                dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0xE07 << 8 ) /* Reg7 */ |   //RobertYu:20060420, TWIF 1.10
-                               (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-                bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
-
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x03C6A200+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW);
-               if (pDevice->vnt_mgmt.eScanState != WMAC_NO_SCANNING) {
-                       /* scanning, channel number is pDevice->uScanChannel */
-                       DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
+       u32 power_setting = 0;
+       int ret = true;
+
+       if (priv->byCurPwr == power)
+               return true;
+
+       priv->byCurPwr = power;
+
+       switch (priv->byRFType) {
+       case RF_AL2230:
+               if (priv->byCurPwr >= AL2230_PWR_IDX_LEN)
+                       return false;
+
+               ret &= IFRFbWriteEmbedded(priv,
+                       al2230_power_table[priv->byCurPwr]);
+
+               if (rate <= RATE_11M)
+                       ret &= IFRFbWriteEmbedded(priv, 0x0001b400 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+               else
+                       ret &= IFRFbWriteEmbedded(priv, 0x0005a400 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+               break;
+       case RF_AL2230S:
+               if (priv->byCurPwr >= AL2230_PWR_IDX_LEN)
+                       return false;
+
+               ret &= IFRFbWriteEmbedded(priv,
+                       al2230_power_table[priv->byCurPwr]);
+
+               if (rate <= RATE_11M) {
+                       ret &= IFRFbWriteEmbedded(priv, 0x040c1400 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+                       ret &= IFRFbWriteEmbedded(priv, 0x00299b00 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+               } else {
+                       ret &= IFRFbWriteEmbedded(priv, 0x0005a400 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+                       ret &= IFRFbWriteEmbedded(priv, 0x00099b00 +
+                               (BY_AL2230_REG_LEN << 3) + IFREGCTL_REGW);
+               }
+               break;
+
+       case RF_AIROHA7230:
+               if (rate <= RATE_11M)
+                       ret &= IFRFbWriteEmbedded(priv, 0x111bb900 +
+                               (BY_AL7230_REG_LEN << 3) + IFREGCTL_REGW);
+               else
+                       ret &= IFRFbWriteEmbedded(priv, 0x221bb900 +
+                               (BY_AL7230_REG_LEN << 3) + IFREGCTL_REGW);
+
+               if (priv->byCurPwr > AL7230_PWR_IDX_LEN)
+                       return false;
+
+               /*
+                * 0x080F1B00 for 3 wire control TxGain(D10)
+                * and 0x31 as TX Gain value
+                */
+               power_setting = 0x080c0b00 | ((priv->byCurPwr) << 12) |
+                               (BY_AL7230_REG_LEN << 3) | IFREGCTL_REGW;
+
+               ret &= IFRFbWriteEmbedded(priv, power_setting);
+
+               break;
+
+       case RF_VT3226:
+               if (priv->byCurPwr >= VT3226_PWR_IDX_LEN)
+                       return false;
+               power_setting = ((0x3f - priv->byCurPwr) << 20) | (0x17 << 8) |
+                               (BY_VT3226_REG_LEN << 3) | IFREGCTL_REGW;
+
+               ret &= IFRFbWriteEmbedded(priv, power_setting);
+
+               break;
+       case RF_VT3226D0:
+               if (priv->byCurPwr >= VT3226_PWR_IDX_LEN)
+                       return false;
+
+               if (rate <= RATE_11M) {
+                       power_setting = ((0x3f - priv->byCurPwr) << 20) |
+                               (0xe07 << 8) | (BY_VT3226_REG_LEN << 3) |
+                               IFREGCTL_REGW;
+
+                       ret &= IFRFbWriteEmbedded(priv, power_setting);
+                       ret &= IFRFbWriteEmbedded(priv, 0x03c6a200 +
+                               (BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
+
+                       if (priv->vnt_mgmt.eScanState != WMAC_NO_SCANNING) {
+                               DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
+                               "RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
+                                               priv->vnt_mgmt.uScanChannel);
+                               ret &= IFRFbWriteEmbedded(priv,
+                                       vt3226d0_lo_current_table[priv->
+                                               vnt_mgmt.uScanChannel - 1]);
+                       } else {
+                               DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
                                "RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
-                               pDevice->vnt_mgmt.uScanChannel);
-                       bResult &= IFRFbWriteEmbedded(pDevice,
-                               dwVT3226D0LoCurrentTable[pDevice->
-                                       vnt_mgmt.uScanChannel - 1]);
+                                               priv->vnt_mgmt.uCurrChannel);
+                               ret &= IFRFbWriteEmbedded(priv,
+                                       vt3226d0_lo_current_table[priv->
+                                               vnt_mgmt.uCurrChannel - 1]);
+                       }
+
+                       ret &= IFRFbWriteEmbedded(priv, 0x015c0800 +
+                               (BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
                } else {
                        DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO
-                               "RFbRawSetPower> 11B mode uCurrChannel[%d]\n",
-                               pDevice->vnt_mgmt.uCurrChannel);
-                       bResult &= IFRFbWriteEmbedded(pDevice,
-                               dwVT3226D0LoCurrentTable[pDevice->
-                                       vnt_mgmt.uCurrChannel - 1]);
+                                       "@@@@ RFbRawSetPower> 11G mode\n");
+
+                       power_setting = ((0x3f - priv->byCurPwr) << 20) |
+                               (0x7 << 8) | (BY_VT3226_REG_LEN << 3) |
+                               IFREGCTL_REGW;
+
+                       ret &= IFRFbWriteEmbedded(priv, power_setting);
+                       ret &= IFRFbWriteEmbedded(priv, 0x00c6a200 +
+                               (BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
+                       ret &= IFRFbWriteEmbedded(priv, 0x016bc600 +
+                               (BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
+                       ret &= IFRFbWriteEmbedded(priv, 0x00900800 +
+                               (BY_VT3226_REG_LEN << 3) + IFREGCTL_REGW);
                }
+               break;
+
+       case RF_VT3342A0:
+               if (priv->byCurPwr >= VT3342_PWR_IDX_LEN)
+                       return false;
+
+               power_setting = ((0x3f - priv->byCurPwr) << 20) |
+                       (0x27 << 8) | (BY_VT3342_REG_LEN << 3) |
+                       IFREGCTL_REGW;
 
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x015C0800+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060420, ok now, new switching power (mini-pci can have bigger power consumption)
-            } else {
-                DBG_PRT(MSG_LEVEL_DEBUG, KERN_INFO"@@@@ RFbRawSetPower> 11G mode\n");
-                dwVT3226Pwr = ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x7 << 8 ) /* Reg7 */ |   //RobertYu:20060420, TWIF 1.10
-                               (BY_VT3226_REG_LEN << 3 )  | IFREGCTL_REGW;
-                bResult &= IFRFbWriteEmbedded(pDevice, dwVT3226Pwr);
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00C6A200+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060327
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x016BC600+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060111
-                bResult &= IFRFbWriteEmbedded(pDevice, 0x00900800+(BY_VT3226_REG_LEN<<3)+IFREGCTL_REGW); //RobertYu:20060111
-            }
-            break;
-        }
-
-        //{{RobertYu:20060609
-        case RF_VT3342A0:
-        {
-            u32       dwVT3342Pwr;
-
-            if (pDevice->byCurPwr >= VT3342_PWR_IDX_LEN)
-                return false;
-
-            dwVT3342Pwr =  ((0x3F-pDevice->byCurPwr) << 20 ) | ( 0x27 << 8 ) /* Reg7 */ |
-                            (BY_VT3342_REG_LEN << 3 )  | IFREGCTL_REGW;
-            bResult &= IFRFbWriteEmbedded(pDevice, dwVT3342Pwr);
-            break;
-        }
-
-        default :
-            break;
-    }
-    return bResult;
+               ret &= IFRFbWriteEmbedded(priv, power_setting);
+
+               break;
+       default:
+               break;
+       }
+       return ret;
 }
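
RFbRawSetPower() keeps the driver's convention of folding the success of a
sequence of register writes into one flag with `ret &= IFRFbWriteEmbedded(...)`:
every write is still attempted, and the function reports false if any of them
failed. A standalone sketch of the pattern; write_reg() is a stand-in, not a
driver symbol:

	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for IFRFbWriteEmbedded(): pretend odd values fail. */
	static bool write_reg(unsigned int val)
	{
		return (val & 1) == 0;
	}

	int main(void)
	{
		bool ok = true;

		/* All writes run; ok ends up false if any of them failed. */
		ok &= write_reg(0x10);
		ok &= write_reg(0x11);	/* fails */
		ok &= write_reg(0x12);

		printf("sequence %s\n", ok ? "succeeded" : "failed");
		return 0;
	}
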
 
 /*+
@@ -931,169 +943,150 @@ int RFbRawSetPower(struct vnt_private *pDevice, u8 byPwr, u32 uRATE)
  * Return Value: none
  *
 -*/
-void RFvRSSITodBm(struct vnt_private *pDevice, u8 byCurrRSSI, long *pldBm)
+void RFvRSSITodBm(struct vnt_private *priv, u8 rssi, long *dbm)
 {
-       u8 byIdx = (((byCurrRSSI & 0xC0) >> 6) & 0x03);
-       signed long b = (byCurrRSSI & 0x3F);
-       signed long a = 0;
-       u8 abyAIROHARF[4] = {0, 18, 0, 40};
-
-    switch (pDevice->byRFType) {
-        case RF_AL2230:
-        case RF_AL2230S:
-        case RF_AIROHA7230:
-        case RF_VT3226: //RobertYu:20051111
-        case RF_VT3226D0:
-        case RF_VT3342A0:   //RobertYu:20060609
-            a = abyAIROHARF[byIdx];
-            break;
-        default:
-            break;
-    }
-
-    *pldBm = -1 * (a + b * 2);
+       u8 idx = (((rssi & 0xc0) >> 6) & 0x03);
+       long b = (rssi & 0x3f);
+       long a = 0;
+       u8 airoharf[4] = {0, 18, 0, 40};
+
+       switch (priv->byRFType) {
+       case RF_AL2230:
+       case RF_AL2230S:
+       case RF_AIROHA7230:
+       case RF_VT3226:
+       case RF_VT3226D0:
+       case RF_VT3342A0:
+               a = airoharf[idx];
+               break;
+       default:
+               break;
+       }
+
+       *dbm = -1 * (a + b * 2);
 }
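
As a worked example of the conversion above: the top two RSSI bits select an
offset from airoharf[] and the low six bits are doubled, so a raw byte of
0x5a gives idx = 1, a = 18, b = 0x1a = 26, hence -70 dBm. The same arithmetic
as a self-contained check:

	#include <stdio.h>

	static long rssi_to_dbm(unsigned char rssi)
	{
		static const unsigned char airoharf[4] = {0, 18, 0, 40};
		unsigned char idx = (rssi & 0xc0) >> 6;
		long b = rssi & 0x3f;

		return -1 * (airoharf[idx] + b * 2);
	}

	int main(void)
	{
		printf("0x5a -> %ld dBm\n", rssi_to_dbm(0x5a));	/* -70 */
		return 0;
	}
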
 
-void RFbRFTableDownload(struct vnt_private *pDevice)
+void RFbRFTableDownload(struct vnt_private *priv)
 {
-       u16 wLength1 = 0, wLength2 = 0, wLength3 = 0;
-       u8 *pbyAddr1 = NULL, *pbyAddr2 = NULL, *pbyAddr3 = NULL;
-       u16 wLength, wValue;
-       u8 abyArray[256];
-
-    switch ( pDevice->byRFType ) {
-        case RF_AL2230:
-        case RF_AL2230S:
-            wLength1 = CB_AL2230_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyAL2230InitTable[0][0]);
-            pbyAddr2 = &(abyAL2230ChannelTable0[0][0]);
-            pbyAddr3 = &(abyAL2230ChannelTable1[0][0]);
-            break;
-        case RF_AIROHA7230:
-            wLength1 = CB_AL7230_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL * 3;
-            wLength3 = CB_MAX_CHANNEL * 3;
-            pbyAddr1 = &(abyAL7230InitTable[0][0]);
-            pbyAddr2 = &(abyAL7230ChannelTable0[0][0]);
-            pbyAddr3 = &(abyAL7230ChannelTable1[0][0]);
-            break;
-        case RF_VT3226: //RobertYu:20051111
-            wLength1 = CB_VT3226_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyVT3226_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3226_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3226_ChannelTable1[0][0]);
-            break;
-        case RF_VT3226D0: //RobertYu:20051114
-            wLength1 = CB_VT3226_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL_24G * 3;
-            wLength3 = CB_MAX_CHANNEL_24G * 3;
-            pbyAddr1 = &(abyVT3226D0_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3226_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3226_ChannelTable1[0][0]);
-            break;
-        case RF_VT3342A0: //RobertYu:20060609
-            wLength1 = CB_VT3342_INIT_SEQ * 3;
-            wLength2 = CB_MAX_CHANNEL * 3;
-            wLength3 = CB_MAX_CHANNEL * 3;
-            pbyAddr1 = &(abyVT3342A0_InitTable[0][0]);
-            pbyAddr2 = &(abyVT3342_ChannelTable0[0][0]);
-            pbyAddr3 = &(abyVT3342_ChannelTable1[0][0]);
-            break;
-
-    }
-    //Init Table
-
-    memcpy(abyArray, pbyAddr1, wLength1);
-    CONTROLnsRequestOut(pDevice,
-                    MESSAGE_TYPE_WRITE,
-                    0,
-                    MESSAGE_REQUEST_RF_INIT,
-                    wLength1,
-                    abyArray
-                    );
-    //Channel Table 0
-    wValue = 0;
-    while ( wLength2 > 0 ) {
-
-        if ( wLength2 >= 64 ) {
-            wLength = 64;
-        } else {
-            wLength = wLength2;
-        }
-        memcpy(abyArray, pbyAddr2, wLength);
-        CONTROLnsRequestOut(pDevice,
-                        MESSAGE_TYPE_WRITE,
-                        wValue,
-                        MESSAGE_REQUEST_RF_CH0,
-                        wLength,
-                        abyArray);
-
-        wLength2 -= wLength;
-        wValue += wLength;
-        pbyAddr2 += wLength;
-    }
-    //Channel table 1
-    wValue = 0;
-    while ( wLength3 > 0 ) {
-
-        if ( wLength3 >= 64 ) {
-            wLength = 64;
-        } else {
-            wLength = wLength3;
-        }
-        memcpy(abyArray, pbyAddr3, wLength);
-        CONTROLnsRequestOut(pDevice,
-                        MESSAGE_TYPE_WRITE,
-                        wValue,
-                        MESSAGE_REQUEST_RF_CH1,
-                        wLength,
-                        abyArray);
-
-        wLength3 -= wLength;
-        wValue += wLength;
-        pbyAddr3 += wLength;
-    }
-
-    //7230 needs 2 InitTable and 3 Channel Table
-    if ( pDevice->byRFType == RF_AIROHA7230 ) {
-        wLength1 = CB_AL7230_INIT_SEQ * 3;
-        wLength2 = CB_MAX_CHANNEL * 3;
-        pbyAddr1 = &(abyAL7230InitTableAMode[0][0]);
-        pbyAddr2 = &(abyAL7230ChannelTable2[0][0]);
-        memcpy(abyArray, pbyAddr1, wLength1);
-        //Init Table 2
-        CONTROLnsRequestOut(pDevice,
-                    MESSAGE_TYPE_WRITE,
-                    0,
-                    MESSAGE_REQUEST_RF_INIT2,
-                    wLength1,
-                    abyArray);
-
-        //Channel Table 0
-        wValue = 0;
-        while ( wLength2 > 0 ) {
-
-            if ( wLength2 >= 64 ) {
-                wLength = 64;
-            } else {
-                wLength = wLength2;
-            }
-            memcpy(abyArray, pbyAddr2, wLength);
-            CONTROLnsRequestOut(pDevice,
-                            MESSAGE_TYPE_WRITE,
-                            wValue,
-                            MESSAGE_REQUEST_RF_CH2,
-                            wLength,
-                            abyArray);
-
-            wLength2 -= wLength;
-            wValue += wLength;
-            pbyAddr2 += wLength;
-        }
-    }
-
+       u16 length1 = 0, length2 = 0, length3 = 0;
+       u8 *addr1 = NULL, *addr2 = NULL, *addr3 = NULL;
+       u16 length, value;
+       u8 array[256];
+
+       switch (priv->byRFType) {
+       case RF_AL2230:
+       case RF_AL2230S:
+               length1 = CB_AL2230_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL_24G * 3;
+               length3 = CB_MAX_CHANNEL_24G * 3;
+               addr1 = &al2230_init_table[0][0];
+               addr2 = &al2230_channel_table0[0][0];
+               addr3 = &al2230_channel_table1[0][0];
+               break;
+       case RF_AIROHA7230:
+               length1 = CB_AL7230_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL * 3;
+               length3 = CB_MAX_CHANNEL * 3;
+               addr1 = &al7230_init_table[0][0];
+               addr2 = &al7230_channel_table0[0][0];
+               addr3 = &al7230_channel_table1[0][0];
+               break;
+       case RF_VT3226:
+               length1 = CB_VT3226_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL_24G * 3;
+               length3 = CB_MAX_CHANNEL_24G * 3;
+               addr1 = &vt3226_init_table[0][0];
+               addr2 = &vt3226_channel_table0[0][0];
+               addr3 = &vt3226_channel_table1[0][0];
+               break;
+       case RF_VT3226D0:
+               length1 = CB_VT3226_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL_24G * 3;
+               length3 = CB_MAX_CHANNEL_24G * 3;
+               addr1 = &vt3226d0_init_table[0][0];
+               addr2 = &vt3226_channel_table0[0][0];
+               addr3 = &vt3226_channel_table1[0][0];
+               break;
+       case RF_VT3342A0:
+               length1 = CB_VT3342_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL * 3;
+               length3 = CB_MAX_CHANNEL * 3;
+               addr1 = &vt3342a0_init_table[0][0];
+               addr2 = &vt3342_channel_table0[0][0];
+               addr3 = &vt3342_channel_table1[0][0];
+               break;
+       }
+
+       /* Init Table */
+       memcpy(array, addr1, length1);
+
+       CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE, 0,
+               MESSAGE_REQUEST_RF_INIT, length1, array);
+
+       /* Channel Table 0 */
+       value = 0;
+       while (length2 > 0) {
+               if (length2 >= 64)
+                       length = 64;
+               else
+                       length = length2;
+
+               memcpy(array, addr2, length);
+
+               CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+                       value, MESSAGE_REQUEST_RF_CH0, length, array);
+
+               length2 -= length;
+               value += length;
+               addr2 += length;
+       }
+
+       /* Channel table 1 */
+       value = 0;
+       while (length3 > 0) {
+               if (length3 >= 64)
+                       length = 64;
+               else
+                       length = length3;
+
+               memcpy(array, addr3, length);
+
+               CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+                       value, MESSAGE_REQUEST_RF_CH1, length, array);
+
+               length3 -= length;
+               value += length;
+               addr3 += length;
+       }
+
+       if (priv->byRFType == RF_AIROHA7230) {
+               length1 = CB_AL7230_INIT_SEQ * 3;
+               length2 = CB_MAX_CHANNEL * 3;
+               addr1 = &al7230_init_table_amode[0][0];
+               addr2 = &al7230_channel_table2[0][0];
+
+               memcpy(array, addr1, length1);
+
+               /* Init Table 2 */
+               CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+                       0, MESSAGE_REQUEST_RF_INIT2, length1, array);
+
+               /* Channel Table 0 */
+               value = 0;
+               while (length2 > 0) {
+                       if (length2 >= 64)
+                               length = 64;
+                       else
+                               length = length2;
+
+                       memcpy(array, addr2, length);
+
+                       CONTROLnsRequestOut(priv, MESSAGE_TYPE_WRITE,
+                               value, MESSAGE_REQUEST_RF_CH2, length, array);
+
+                       length2 -= length;
+                       value += length;
+                       addr2 += length;
+               }
+       }
 }
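
RFbRFTableDownload() pushes each table to the firmware in control transfers
of at most 64 bytes, carrying the running byte offset in the request's value
field so the device can reassemble the table. The chunking loop in isolation,
with send_chunk() standing in for CONTROLnsRequestOut() and an assumed
168-byte table:

	#include <stdio.h>
	#include <string.h>

	static void send_chunk(unsigned int offset, const unsigned char *buf,
			       unsigned int len)
	{
		(void)buf;
		printf("write %u bytes at offset %u\n", len, offset);
	}

	int main(void)
	{
		unsigned char table[168];	/* assumed: CB_MAX_CHANNEL * 3 */
		unsigned char array[64];
		const unsigned char *src = table;
		unsigned int remaining = sizeof(table), offset = 0;

		memset(table, 0, sizeof(table));
		while (remaining > 0) {
			unsigned int len = remaining >= 64 ? 64 : remaining;

			memcpy(array, src, len);
			send_chunk(offset, array, len);
			remaining -= len;
			offset += len;
			src += len;
		}
		return 0;
	}

Run standalone this emits chunks of 64, 64 and 40 bytes, mirroring the three
loops in the function above.
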
index 24465cfe3e6dd02330799dbbdff85de8d985eb7c..aec6b568a4a926b109441ef474296b4acb7f556c 100644 (file)
 
 #define WEP_IV_MASK         0x00FFFFFF
 
-//
-// 802_3 packet
-//
-typedef struct tagS802_3Header {
-    u8    abyDstAddr[ETH_ALEN];
-    u8    abySrcAddr[ETH_ALEN];
-    u16    wLen;
-} __attribute__ ((__packed__))
-S802_3Header, *PS802_3Header;
-
 //u8 ETHbyGetHashIndexByCrc(u8 * pbyMultiAddr);
 bool ETHbIsBufferCrc32Ok(u8 * pbyBuffer, unsigned int cbFrameLength);
 
index 15cd5abb8004b6be1b893a8fded64fe7c5aaa942..15e724e4d4ba0e7f1d39572d611ea4a3937f98d9 100644 (file)
 #define HIWORD(d)           ((u16)((((u32)(d)) >> 16) & 0xFFFF))
 #endif
 
-#define LODWORD(q)          ((q).u.dwLowDword)
-#define HIDWORD(q)          ((q).u.dwHighDword)
-
 #if !defined(MAKEWORD)
 #define MAKEWORD(lb, hb)    ((u16)(((u8)(lb)) | (((u16)((u8)(hb))) << 8)))
 #endif
-#if !defined(MAKEDWORD)
-#define MAKEDWORD(lw, hw)   ((u32)(((u16)(lw)) | (((u32)((u16)(hw))) << 16)))
-#endif
 
 #endif /* __TMACRO_H__ */
index cabae346670438485689f734557447656b9bf198..cfbfbbb53866cbfd08f31592fc1c78d3c2db6a6a 100644 (file)
@@ -296,7 +296,7 @@ void _sin_cos(s32 angle, s32 *sin, s32 *cos)
        }
 }
 
-static unsigned char hal_get_dxx_reg(struct hw_data *pHwData, u16 number, u32 * pValue)
+static unsigned char hal_get_dxx_reg(struct hw_data *pHwData, u16 number, u32 *pValue)
 {
        if (number < 0x1000)
                number += 0x1000;
index 5ecf9a121e78b8f0520a55dc8540aa36c4ad0029..75b775252af1ee4364e32939856216a4dcfd6175 100644 (file)
@@ -920,20 +920,20 @@ void Uxx_power_on_procedure(struct hw_data *pHwData)
        Wb35Reg_WriteSync(pHwData, 0x03f8, 0x7ff);
 }
 
-void Set_ChanIndep_RfData_al7230_24(struct hw_data *pHwData, u32 *pltmp , char number)
+static void Set_ChanIndep_RfData_al7230_24(struct hw_data *pHwData, u32 *pltmp,
+                                       char number)
 {
        u8      i;
-
        for (i = 0; i < number; i++) {
                pHwData->phy_para[i] = al7230_rf_data_24[i];
                pltmp[i] = (1 << 31) | (0 << 30) | (24 << 24) | (al7230_rf_data_24[i] & 0xffffff);
        }
 }
 
-void Set_ChanIndep_RfData_al7230_50(struct hw_data *pHwData, u32 *pltmp, char number)
+static void Set_ChanIndep_RfData_al7230_50(struct hw_data *pHwData, u32 *pltmp,
+                                       char number)
 {
        u8      i;
-
        for (i = 0; i < number; i++) {
                pHwData->phy_para[i] = al7230_rf_data_50[i];
                pltmp[i] = (1 << 31) | (0 << 30) | (24 << 24) | (al7230_rf_data_50[i] & 0xffffff);
@@ -1263,7 +1263,7 @@ void RFSynthesizer_initial(struct hw_data *pHwData)
        }
 }
 
-void BBProcessor_AL7230_2400(struct hw_data *pHwData)
+static void BBProcessor_AL7230_2400(struct hw_data *pHwData)
 {
        struct wb35_reg *reg = &pHwData->reg;
        u32     pltmp[12];
@@ -1304,7 +1304,7 @@ void BBProcessor_AL7230_2400(struct hw_data *pHwData)
        Wb35Reg_BurstWrite(pHwData, 0x1030, pltmp, 12, AUTO_INCREMENT);
 }
 
-void BBProcessor_AL7230_5000(struct hw_data *pHwData)
+static void BBProcessor_AL7230_5000(struct hw_data *pHwData)
 {
        struct wb35_reg *reg = &pHwData->reg;
        u32     pltmp[12];
@@ -1620,22 +1620,24 @@ void BBProcessor_initial(struct hw_data *pHwData)
                reg->SQ3_filter[i] = 0x2f; /* half of Bit 0 ~ 6 */
 }
 
-void set_tx_power_per_channel_max2829(struct hw_data *pHwData,  struct chan_info Channel)
+static inline void set_tx_power_per_channel_max2829(struct hw_data *pHwData,
+                                               struct chan_info Channel)
 {
        RFSynthesizer_SetPowerIndex(pHwData, 100);
 }
 
-void set_tx_power_per_channel_al2230(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_al2230(struct hw_data *pHwData,
+                                       struct chan_info Channel)
 {
        u8      index = 100;
-
        if (pHwData->TxVgaFor24[Channel.ChanNo - 1] != 0xff)
                index = pHwData->TxVgaFor24[Channel.ChanNo - 1];
 
        RFSynthesizer_SetPowerIndex(pHwData, index);
 }
 
-void set_tx_power_per_channel_al7230(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_al7230(struct hw_data *pHwData,
+                                       struct chan_info Channel)
 {
        u8      i, index = 100;
 
@@ -1658,7 +1660,8 @@ void set_tx_power_per_channel_al7230(struct hw_data *pHwData,  struct chan_info
        RFSynthesizer_SetPowerIndex(pHwData, index);
 }
 
-void set_tx_power_per_channel_wb242(struct hw_data *pHwData,  struct chan_info Channel)
+static void set_tx_power_per_channel_wb242(struct hw_data *pHwData,
+                                       struct chan_info Channel)
 {
        u8      index = 100;
 
index 1bff7d1c9a772aea2fa786e70222d345436de22c..9be1b3b004b0bad90bd09c30843121d775ba45e3 100644 (file)
@@ -30,46 +30,46 @@ unsigned char Wb35Reg_BurstWrite(struct hw_data *pHwData, u16 RegisterNo, u32 *p
        /* Trying to use burst write function if use new hardware */
        UrbSize = sizeof(struct wb35_reg_queue) + DataSize + sizeof(struct usb_ctrlrequest);
        reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+       if (reg_queue == NULL)
+               return false;
+
        urb = usb_alloc_urb(0, GFP_ATOMIC);
-       if (urb && reg_queue) {
-               reg_queue->DIRECT = 2; /* burst write register */
-               reg_queue->INDEX = RegisterNo;
-               reg_queue->pBuffer = (u32 *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-               memcpy(reg_queue->pBuffer, pRegisterData, DataSize);
-               /* the function for reversing register data from little endian to big endian */
-               for (i = 0; i < NumberOfData ; i++)
-                       reg_queue->pBuffer[i] = cpu_to_le32(reg_queue->pBuffer[i]);
-
-               dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue) + DataSize);
-               dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-               dr->bRequest = 0x04; /* USB or vendor-defined request code, burst mode */
-               dr->wValue = cpu_to_le16(Flag); /* 0: Register number auto-increment, 1: No auto increment */
-               dr->wIndex = cpu_to_le16(RegisterNo);
-               dr->wLength = cpu_to_le16(DataSize);
-               reg_queue->Next = NULL;
-               reg_queue->pUsbReq = dr;
-               reg_queue->urb = urb;
+       if (urb == NULL) {
+               kfree(reg_queue);
+               return false;
+       }
 
-               spin_lock_irq(&reg->EP0VM_spin_lock);
-               if (reg->reg_first == NULL)
-                       reg->reg_first = reg_queue;
-               else
-                       reg->reg_last->Next = reg_queue;
-               reg->reg_last = reg_queue;
+       reg_queue->DIRECT = 2; /* burst write register */
+       reg_queue->INDEX = RegisterNo;
+       reg_queue->pBuffer = (u32 *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+       memcpy(reg_queue->pBuffer, pRegisterData, DataSize);
+       /* convert the register data to little endian for the device */
+       for (i = 0; i < NumberOfData ; i++)
+               reg_queue->pBuffer[i] = cpu_to_le32(reg_queue->pBuffer[i]);
+
+       dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue) + DataSize);
+       dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+       dr->bRequest = 0x04; /* USB or vendor-defined request code, burst mode */
+       dr->wValue = cpu_to_le16(Flag); /* 0: Register number auto-increment, 1: No auto increment */
+       dr->wIndex = cpu_to_le16(RegisterNo);
+       dr->wLength = cpu_to_le16(DataSize);
+       reg_queue->Next = NULL;
+       reg_queue->pUsbReq = dr;
+       reg_queue->urb = urb;
 
-               spin_unlock_irq(&reg->EP0VM_spin_lock);
+       spin_lock_irq(&reg->EP0VM_spin_lock);
+       if (reg->reg_first == NULL)
+               reg->reg_first = reg_queue;
+       else
+               reg->reg_last->Next = reg_queue;
+       reg->reg_last = reg_queue;
 
-               /* Start EP0VM */
-               Wb35Reg_EP0VM_start(pHwData);
+       spin_unlock_irq(&reg->EP0VM_spin_lock);
 
-               return true;
-       } else {
-               if (urb)
-                       usb_free_urb(urb);
-               kfree(reg_queue);
-               return false;
-       }
-   return false;
+       /* Start EP0VM */
+       Wb35Reg_EP0VM_start(pHwData);
+
+       return true;
 }
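
The restructured Wb35Reg_BurstWrite() (and the two sibling functions below)
replaces the nested `if (urb && reg_queue)` with check-and-unwind early
returns: free the queue entry if the URB allocation fails, then continue on
an unindented happy path. A minimal user-space sketch of that shape, with
malloc()/calloc() standing in for kzalloc() and usb_alloc_urb():

	#include <stdbool.h>
	#include <stdlib.h>

	struct queue { int dummy; };
	struct urb { int dummy; };

	static bool submit(void)
	{
		struct queue *q = calloc(1, sizeof(*q));
		struct urb *u;

		if (q == NULL)
			return false;

		u = calloc(1, sizeof(*u));
		if (u == NULL) {
			free(q);	/* unwind the earlier allocation */
			return false;
		}

		/*
		 * In the driver the pair is queued here and ownership moves
		 * to the EP0 state machine; the model just releases it again.
		 */
		free(u);
		free(q);
		return true;
	}

	int main(void)
	{
		return submit() ? 0 : 1;
	}

This also removes the dead `return false;` that sat after the old if/else.
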
 
 void Wb35Reg_Update(struct hw_data *pHwData,  u16 RegisterNo,  u32 RegisterValue)
@@ -174,43 +174,44 @@ unsigned char Wb35Reg_Write(struct hw_data *pHwData, u16 RegisterNo, u32 Registe
        /* update the register by send urb request */
        UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
        reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
+       if (reg_queue == NULL)
+               return false;
+
        urb = usb_alloc_urb(0, GFP_ATOMIC);
-       if (urb && reg_queue) {
-               reg_queue->DIRECT = 1; /* burst write register */
-               reg_queue->INDEX = RegisterNo;
-               reg_queue->VALUE = cpu_to_le32(RegisterValue);
-               reg_queue->RESERVED_VALID = false;
-               dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-               dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-               dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
-               dr->wValue = cpu_to_le16(0x0);
-               dr->wIndex = cpu_to_le16(RegisterNo);
-               dr->wLength = cpu_to_le16(4);
-
-               /* Enter the sending queue */
-               reg_queue->Next = NULL;
-               reg_queue->pUsbReq = dr;
-               reg_queue->urb = urb;
+       if (urb == NULL) {
+               kfree(reg_queue);
+               return false;
+       }
 
-               spin_lock_irq(&reg->EP0VM_spin_lock);
-               if (reg->reg_first == NULL)
-                       reg->reg_first = reg_queue;
-               else
-                       reg->reg_last->Next = reg_queue;
-               reg->reg_last = reg_queue;
+       reg_queue->DIRECT = 1; /* burst write register */
+       reg_queue->INDEX = RegisterNo;
+       reg_queue->VALUE = cpu_to_le32(RegisterValue);
+       reg_queue->RESERVED_VALID = false;
+       dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+       dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+       dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
+       dr->wValue = cpu_to_le16(0x0);
+       dr->wIndex = cpu_to_le16(RegisterNo);
+       dr->wLength = cpu_to_le16(4);
+
+       /* Enter the sending queue */
+       reg_queue->Next = NULL;
+       reg_queue->pUsbReq = dr;
+       reg_queue->urb = urb;
 
-               spin_unlock_irq(&reg->EP0VM_spin_lock);
+       spin_lock_irq(&reg->EP0VM_spin_lock);
+       if (reg->reg_first == NULL)
+               reg->reg_first = reg_queue;
+       else
+               reg->reg_last->Next = reg_queue;
+       reg->reg_last = reg_queue;
 
-               /* Start EP0VM */
-               Wb35Reg_EP0VM_start(pHwData);
+       spin_unlock_irq(&reg->EP0VM_spin_lock);
 
-               return true;
-       } else {
-               if (urb)
-                       usb_free_urb(urb);
-               kfree(reg_queue);
-               return false;
-       }
+       /* Start EP0VM */
+       Wb35Reg_EP0VM_start(pHwData);
+
+       return true;
 }
 
 /*
@@ -238,43 +239,45 @@ unsigned char Wb35Reg_WriteWithCallbackValue(struct hw_data *pHwData,
        /* update the register by send urb request */
        UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
        reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
-       urb = usb_alloc_urb(0, GFP_ATOMIC);
-       if (urb && reg_queue) {
-               reg_queue->DIRECT = 1; /* burst write register */
-               reg_queue->INDEX = RegisterNo;
-               reg_queue->VALUE = cpu_to_le32(RegisterValue);
-               /* NOTE : Users must guarantee the size of value will not exceed the buffer size. */
-               memcpy(reg_queue->RESERVED, pValue, Len);
-               reg_queue->RESERVED_VALID = true;
-               dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-               dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
-               dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
-               dr->wValue = cpu_to_le16(0x0);
-               dr->wIndex = cpu_to_le16(RegisterNo);
-               dr->wLength = cpu_to_le16(4);
-
-               /* Enter the sending queue */
-               reg_queue->Next = NULL;
-               reg_queue->pUsbReq = dr;
-               reg_queue->urb = urb;
-               spin_lock_irq(&reg->EP0VM_spin_lock);
-               if (reg->reg_first == NULL)
-                       reg->reg_first = reg_queue;
-               else
-                       reg->reg_last->Next = reg_queue;
-               reg->reg_last = reg_queue;
-
-               spin_unlock_irq(&reg->EP0VM_spin_lock);
+       if (reg_queue == NULL)
+               return false;
 
-               /* Start EP0VM */
-               Wb35Reg_EP0VM_start(pHwData);
-               return true;
-       } else {
-               if (urb)
-                       usb_free_urb(urb);
+       urb = usb_alloc_urb(0, GFP_ATOMIC);
+       if (urb == NULL) {
                kfree(reg_queue);
                return false;
        }
+
+       reg_queue->DIRECT = 1; /* burst write register */
+       reg_queue->INDEX = RegisterNo;
+       reg_queue->VALUE = cpu_to_le32(RegisterValue);
+       /* NOTE : Users must guarantee the size of value will not exceed the buffer size. */
+       memcpy(reg_queue->RESERVED, pValue, Len);
+       reg_queue->RESERVED_VALID = true;
+       dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+       dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT | USB_RECIP_DEVICE;
+       dr->bRequest = 0x03; /* USB or vendor-defined request code, burst mode */
+       dr->wValue = cpu_to_le16(0x0);
+       dr->wIndex = cpu_to_le16(RegisterNo);
+       dr->wLength = cpu_to_le16(4);
+
+       /* Enter the sending queue */
+       reg_queue->Next = NULL;
+       reg_queue->pUsbReq = dr;
+       reg_queue->urb = urb;
+       spin_lock_irq(&reg->EP0VM_spin_lock);
+       if (reg->reg_first == NULL)
+               reg->reg_first = reg_queue;
+       else
+               reg->reg_last->Next = reg_queue;
+       reg->reg_last = reg_queue;
+
+       spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+       /* Start EP0VM */
+       Wb35Reg_EP0VM_start(pHwData);
+
+       return true;
 }
 
 /*
@@ -344,41 +347,41 @@ unsigned char Wb35Reg_Read(struct hw_data *pHwData, u16 RegisterNo, u32 *pRegist
        /* update the variable by send Urb to read register */
        UrbSize = sizeof(struct wb35_reg_queue) + sizeof(struct usb_ctrlrequest);
        reg_queue = kzalloc(UrbSize, GFP_ATOMIC);
-       urb = usb_alloc_urb(0, GFP_ATOMIC);
-       if (urb && reg_queue) {
-               reg_queue->DIRECT = 0; /* read register */
-               reg_queue->INDEX = RegisterNo;
-               reg_queue->pBuffer = pRegisterValue;
-               dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
-               dr->bRequestType = USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN;
-               dr->bRequest = 0x01; /* USB or vendor-defined request code, burst mode */
-               dr->wValue = cpu_to_le16(0x0);
-               dr->wIndex = cpu_to_le16(RegisterNo);
-               dr->wLength = cpu_to_le16(4);
-
-               /* Enter the sending queue */
-               reg_queue->Next = NULL;
-               reg_queue->pUsbReq = dr;
-               reg_queue->urb = urb;
-               spin_lock_irq(&reg->EP0VM_spin_lock);
-               if (reg->reg_first == NULL)
-                       reg->reg_first = reg_queue;
-               else
-                       reg->reg_last->Next = reg_queue;
-               reg->reg_last = reg_queue;
-
-               spin_unlock_irq(&reg->EP0VM_spin_lock);
-
-               /* Start EP0VM */
-               Wb35Reg_EP0VM_start(pHwData);
+       if (reg_queue == NULL)
+               return false;
 
-               return true;
-       } else {
-               if (urb)
-                       usb_free_urb(urb);
+       urb = usb_alloc_urb(0, GFP_ATOMIC);
+       if (urb == NULL) {
                kfree(reg_queue);
                return false;
        }
+       reg_queue->DIRECT = 0; /* read register */
+       reg_queue->INDEX = RegisterNo;
+       reg_queue->pBuffer = pRegisterValue;
+       dr = (struct usb_ctrlrequest *)((u8 *)reg_queue + sizeof(struct wb35_reg_queue));
+       dr->bRequestType = USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN;
+       dr->bRequest = 0x01; /* USB or vendor-defined request code, burst mode */
+       dr->wValue = cpu_to_le16(0x0);
+       dr->wIndex = cpu_to_le16(RegisterNo);
+       dr->wLength = cpu_to_le16(4);
+
+       /* Enter the sending queue */
+       reg_queue->Next = NULL;
+       reg_queue->pUsbReq = dr;
+       reg_queue->urb = urb;
+       spin_lock_irq(&reg->EP0VM_spin_lock);
+       if (reg->reg_first == NULL)
+               reg->reg_first = reg_queue;
+       else
+               reg->reg_last->Next = reg_queue;
+       reg->reg_last = reg_queue;
+
+       spin_unlock_irq(&reg->EP0VM_spin_lock);
+
+       /* Start EP0VM */
+       Wb35Reg_EP0VM_start(pHwData);
+
+       return true;
 }
 
 
index f118eeba396ae195bb77da4a9d8a518570accbe4..8d71bc2f5940b6f2037a225e00528fd8ee71bc43 100644 (file)
@@ -343,8 +343,7 @@ void Wb35Rx_destroy(struct hw_data *pHwData)
        } while (pWb35Rx->EP3vm_state != VM_STOP);
        msleep(10); /* Delay for waiting function exit */
 
-       if (pWb35Rx->RxUrb)
-               usb_free_urb(pWb35Rx->RxUrb);
+       usb_free_urb(pWb35Rx->RxUrb);
        pr_debug("Wb35Rx_destroy OK\n");
 }
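
The dropped guard is safe because usb_free_urb(), like kfree(), simply
returns when handed a NULL pointer, so calling it unconditionally does not
change behavior.
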
 
index 7c7c77f9c86266e87213c4907b0074dcdf56b2f2..b55dc43a1d11ff40b197c7f8f2dc9fa6db4d334a 100644 (file)
@@ -133,6 +133,7 @@ static int wl_adapter_attach(struct pcmcia_device *link)
 {
        struct net_device   *dev;
        struct wl_private   *lp;
+       int ret;
        /*--------------------------------------------------------------------*/
 
        DBG_FUNC("wl_adapter_attach");
@@ -154,10 +155,12 @@ static int wl_adapter_attach(struct pcmcia_device *link)
        lp = wl_priv(dev);
        lp->link = link;
 
-       wl_adapter_insert(link);
+       ret = wl_adapter_insert(link);
+       if (ret != 0)
+               wl_device_dealloc(dev);
 
        DBG_LEAVE(DbgInfo);
-       return 0;
+       return ret;
 } /* wl_adapter_attach */
 /*============================================================================*/
 
@@ -224,7 +227,7 @@ static int wl_adapter_resume(struct pcmcia_device *link)
        return 0;
 } /* wl_adapter_resume */
 
-void wl_adapter_insert(struct pcmcia_device *link)
+int wl_adapter_insert(struct pcmcia_device *link)
 {
        struct net_device *dev;
        int ret;
@@ -256,7 +259,8 @@ void wl_adapter_insert(struct pcmcia_device *link)
        dev->base_addr  = link->resource[0]->start;
 
        SET_NETDEV_DEV(dev, &link->dev);
-       if (register_netdev(dev) != 0) {
+       ret = register_netdev(dev);
+       if (ret != 0) {
                printk("%s: register_netdev() failed\n", MODULE_NAME);
                goto failed;
        }
@@ -267,13 +271,13 @@ void wl_adapter_insert(struct pcmcia_device *link)
                " %pM\n", dev->name, dev->base_addr, dev->irq, dev->dev_addr);
 
        DBG_LEAVE(DbgInfo);
-       return;
+       return 0;
 
 failed:
        wl_adapter_release(link);
 
        DBG_LEAVE(DbgInfo);
-       return;
+       return ret;
 } /* wl_adapter_insert */
 /*============================================================================*/
 
index a7ab579759ded7f4a41db9b4ea91585b54cc850b..081cc6f28d1f630832c5d79e18f18ac5d6a6d996 100644 (file)
 
 
 /*******************************************************************************
- *  function protoypes
+ *  function prototypes
  ******************************************************************************/
 
-void wl_adapter_insert(struct pcmcia_device *link);
+int wl_adapter_insert(struct pcmcia_device *link);
 
 void wl_adapter_release(struct pcmcia_device *link);
 
index f28f15baea96460b11631004a57f22a46e94c5c5..43535610acc46d49cd06f0b50f4a948747ff951f 100644 (file)
@@ -3171,7 +3171,9 @@ void wl_process_mailbox( struct wl_private *lp )
 
                                        memset( ssid, 0, sizeof( ssid ));
                                        strncpy( ssid, &probe_rsp->rawData[2],
-                                                        probe_rsp->rawData[1] );
+                                                min_t(u8,
+                                                       probe_rsp->rawData[1],
+                                                       HCF_MAX_NAME_LEN - 1));
 
                                        DBG_TRACE( DbgInfo, "(%s) SSID        : %s\n",
                                                           lp->dev->name, ssid );
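
The min_t() clamp above matters because rawData[1] is the SSID element length
taken straight from the received frame; without the bound, a crafted probe
response could make strncpy() overrun the on-stack ssid buffer. A
self-contained illustration of the clamp (the HCF_MAX_NAME_LEN value here is
assumed for the demo):

	#include <stdio.h>
	#include <string.h>

	#define HCF_MAX_NAME_LEN 33	/* assumed buffer size, demo only */

	#define min_t(type, a, b) \
		((type)(a) < (type)(b) ? (type)(a) : (type)(b))

	int main(void)
	{
		char ssid[HCF_MAX_NAME_LEN];
		unsigned char raw_len = 200;	/* bogus length off the air */
		const char *payload = "example-ssid";

		memset(ssid, 0, sizeof(ssid));
		strncpy(ssid, payload,
			min_t(unsigned char, raw_len, HCF_MAX_NAME_LEN - 1));
		printf("ssid: %s\n", ssid);	/* copy stays in bounds */
		return 0;
	}
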
index 428a9be2501069800ce960c882374206e3ff25d4..76374b2202285b201981fd6c618c7fc744e72ff0 100644 (file)
@@ -1122,8 +1122,7 @@ static void prism2sta_inf_hostscanresults(wlandevice_t *wlandev,
 
        kfree(hw->scanresults);
 
-       hw->scanresults = kmalloc(sizeof(hfa384x_InfFrame_t), GFP_ATOMIC);
-       memcpy(hw->scanresults, inf, sizeof(hfa384x_InfFrame_t));
+       hw->scanresults = kmemdup(inf, sizeof(hfa384x_InfFrame_t), GFP_ATOMIC);
 
        if (nbss == 0)
                nbss = -1;
index 80c972305885babd09f61a0c1db00ddae2b0475d..5c739bebd8a5183f73ee1d548024c945fab22673 100644 (file)
 #define SetCRT2ToDualEdge   0x8000
 
 #define ReserveTVOption     0x0008
-#define GatingCRT           0x0800
-#define DisableChB          0x1000
-#define EnableChB           0x2000
-#define DisableChA          0x4000
-#define EnableChA           0x8000
 
 #define SetTVLowResolution   0x0400
 #define TVSimuMode           0x0800
index 19ce5a978cae732817745befba348b26cb99c490..5f1c41ed778b54f2f00c2e968809c645c2d0d8aa 100644 (file)
@@ -54,14 +54,12 @@ XGINew_GetXG20DRAMType(struct xgi_hw_device_info *HwDeviceExtension,
                udelay(800);
                xgifb_reg_or(pVBInfo->P3d4, 0x4A, 0x80); /* Enable GPIOH read */
                /* GPIOF 0:DVI 1:DVO */
-               temp = xgifb_reg_get(pVBInfo->P3d4, 0x48);
+               data = xgifb_reg_get(pVBInfo->P3d4, 0x48);
                /* HOTPLUG_SUPPORT */
                /* for current XG20 & XG21, GPIOH is floating, driver will
                 * fix DDR temporarily */
-               if (temp & 0x01) /* DVI read GPIOH */
-                       data = 1; /* DDRII */
-               else
-                       data = 0; /* DDR */
+               /* DVI read GPIOH */
+               data &= 0x01; /* 1=DDRII, 0=DDR */
                /* ~HOTPLUG_SUPPORT */
                xgifb_reg_or(pVBInfo->P3d4, 0xB4, 0x02);
                return data;
@@ -1079,44 +1077,23 @@ static unsigned short XGINew_SenseLCD(struct xgi_hw_device_info
                                                        *HwDeviceExtension,
                                      struct vb_device_info *pVBInfo)
 {
-       unsigned short temp;
-
-       /* add lcd sense */
-       if (HwDeviceExtension->ulCRT2LCDType == LCD_UNKNOWN) {
+       unsigned short temp = HwDeviceExtension->ulCRT2LCDType;
+
+       switch (HwDeviceExtension->ulCRT2LCDType) {
+       case LCD_640x480:
+       case LCD_1024x600:
+       case LCD_1152x864:
+       case LCD_1280x960:
+       case LCD_1152x768:
+       case LCD_1920x1440:
+       case LCD_2048x1536:
+               temp = 0; /* override the stored ulCRT2LCDType */
+               break;
+       case LCD_UNKNOWN: /* unknown lcd, do nothing */
                return 0;
-       } else {
-               temp = (unsigned short) HwDeviceExtension->ulCRT2LCDType;
-               switch (HwDeviceExtension->ulCRT2LCDType) {
-               case LCD_INVALID:
-               case LCD_800x600:
-               case LCD_1024x768:
-               case LCD_1280x1024:
-                       break;
-
-               case LCD_640x480:
-               case LCD_1024x600:
-               case LCD_1152x864:
-               case LCD_1280x960:
-               case LCD_1152x768:
-                       temp = 0;
-                       break;
-
-               case LCD_1400x1050:
-               case LCD_1280x768:
-               case LCD_1600x1200:
-                       break;
-
-               case LCD_1920x1440:
-               case LCD_2048x1536:
-                       temp = 0;
-                       break;
-
-               default:
-                       break;
-               }
-               xgifb_reg_and_or(pVBInfo->P3d4, 0x36, 0xF0, temp);
-               return 1;
        }
+       xgifb_reg_and_or(pVBInfo->P3d4, 0x36, 0xF0, temp);
+       return 1;
 }
 
 static void XGINew_GetXG21Sense(struct pci_dev *pdev,
@@ -1138,17 +1115,11 @@ static void XGINew_GetXG21Sense(struct pci_dev *pdev,
                        xgifb_reg_or(pVBInfo->P3d4, 0x32, LCDSense);
                        /* Enable read GPIOF */
                        xgifb_reg_and_or(pVBInfo->P3d4, 0x4A, ~0x20, 0x20);
-                       Temp = xgifb_reg_get(pVBInfo->P3d4, 0x48) & 0x04;
-                       if (!Temp)
-                               xgifb_reg_and_or(pVBInfo->P3d4,
-                                                0x38,
-                                                ~0xE0,
-                                                0x80); /* TMDS on chip */
+                       if (xgifb_reg_get(pVBInfo->P3d4, 0x48) & 0x04)
+                               Temp = 0xA0; /* Only DVO on chip */
                        else
-                               xgifb_reg_and_or(pVBInfo->P3d4,
-                                                0x38,
-                                                ~0xE0,
-                                                0xA0); /* Only DVO on chip */
+                               Temp = 0x80; /* TMDS on chip */
+                       xgifb_reg_and_or(pVBInfo->P3d4, 0x38, ~0xE0, Temp);
                        /* Disable read GPIOF */
                        xgifb_reg_and(pVBInfo->P3d4, 0x4A, ~0x20);
                }
@@ -1206,9 +1177,7 @@ static unsigned char GetXG27FPBits(struct vb_device_info *pVBInfo)
        /* enable GPIOA/B/C read */
        xgifb_reg_and_or(pVBInfo->P3d4, 0x4A, ~0x03, 0x03);
        temp = xgifb_reg_get(pVBInfo->P3d4, 0x48);
-       if (temp <= 2)
-               temp &= 0x03;
-       else
+       if (temp > 2)
                temp = ((temp & 0x04) >> 1) | ((~temp) & 0x01);
 
        xgifb_reg_set(pVBInfo->P3d4, 0x4A, CR4A);
@@ -1216,6 +1185,14 @@ static unsigned char GetXG27FPBits(struct vb_device_info *pVBInfo)
        return temp;
 }
 
+static bool xgifb_bridge_is_on(struct vb_device_info *vb_info)
+{
+       u8 flag;
+
+       flag = xgifb_reg_get(vb_info->Part4Port, 0x00);
+       return flag == 1 || flag == 2;
+}
+
 unsigned char XGIInitNew(struct pci_dev *pdev)
 {
        struct xgifb_video_info *xgifb_info = pci_get_drvdata(pdev);
@@ -1235,10 +1212,6 @@ unsigned char XGIInitNew(struct pci_dev *pdev)
 
        outb(0x67, pVBInfo->P3c2);
 
-       if (HwDeviceExtension->jChipType < XG20)
-               /* Run XGI_GetVBType before InitTo330Pointer */
-               XGI_GetVBType(pVBInfo);
-
        InitTo330Pointer(HwDeviceExtension->jChipType, pVBInfo);
 
        /* Openkey */
@@ -1327,7 +1300,6 @@ unsigned char XGIInitNew(struct pci_dev *pdev)
                xgifb_reg_set(pVBInfo->Part1Port, 0x00, 0x00);
                /* chk if BCLK>=100MHz */
                temp1 = xgifb_reg_get(pVBInfo->P3d4, 0x7B);
-               temp = (unsigned char) ((temp1 >> 4) & 0x0F);
 
                xgifb_reg_set(pVBInfo->Part1Port,
                              0x02, XGI330_CRT2Data_1_2);
@@ -1353,7 +1325,7 @@ unsigned char XGIInitNew(struct pci_dev *pdev)
        xgifb_reg_set(pVBInfo->P3c4, 0x33, XGI330_SR33);
 
        if (HwDeviceExtension->jChipType < XG20) {
-               if (XGI_BridgeIsOn(pVBInfo) == 1) {
+               if (xgifb_bridge_is_on(pVBInfo)) {
                        xgifb_reg_set(pVBInfo->Part2Port, 0x00, 0x1C);
                        xgifb_reg_set(pVBInfo->Part4Port,
                                      0x0D, XGI330_CRT2Data_4_D);
index 3adec3f184621153fa0cae6fce7ea08a7f087175..fcefe5b36cddb99c01eaae5d85772016db7e6f6b 100644 (file)
@@ -35,6 +35,9 @@ void InitTo330Pointer(unsigned char ChipType, struct vb_device_info *pVBInfo)
        pVBInfo->SR18 = XGI340_SR18;
        pVBInfo->CR40 = XGI340_cr41;
 
+       if (ChipType < XG20)
+               XGI_GetVBType(pVBInfo);
+
        /* 310 customization related */
        if ((pVBInfo->VBType & VB_SIS301LV) || (pVBInfo->VBType & VB_SIS302LV))
                pVBInfo->LCDCapList = XGI_LCDDLCapList;
@@ -180,66 +183,45 @@ static unsigned char XGI_AjustCRT2Rate(unsigned short ModeNo,
        tempbx = XGI330_RefIndex[RefreshRateTableIndex + (*i)].ModeID;
        tempax = 0;
 
-       if (pVBInfo->IF_DEF_LVDS == 0) {
-               if (pVBInfo->VBInfo & SetCRT2ToRAMDAC) {
-                       tempax |= SupportRAMDAC2;
-
-                       if (pVBInfo->VBType & VB_XGI301C)
-                               tempax |= SupportCRT2in301C;
-               }
-
-               /* 301b */
-               if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
-                       tempax |= SupportLCD;
+       if (pVBInfo->VBInfo & SetCRT2ToRAMDAC) {
+               tempax |= SupportRAMDAC2;
 
-                       if (pVBInfo->LCDResInfo != Panel_1280x1024 &&
-                           pVBInfo->LCDResInfo != Panel_1280x960 &&
-                           (pVBInfo->LCDInfo & LCDNonExpanding) &&
-                           resinfo >= 9)
-                               return 0;
-               }
+               if (pVBInfo->VBType & VB_XGI301C)
+                       tempax |= SupportCRT2in301C;
+       }
 
-               if (pVBInfo->VBInfo & SetCRT2ToHiVision) { /* for HiTV */
-                       tempax |= SupportHiVision;
-                       if ((pVBInfo->VBInfo & SetInSlaveMode) &&
-                           ((resinfo == 4) ||
-                            (resinfo == 3 &&
-                             (pVBInfo->SetFlag & TVSimuMode)) ||
-                            (resinfo > 7)))
-                                       return 0;
-               } else if (pVBInfo->VBInfo & (SetCRT2ToAVIDEO |
-                                              SetCRT2ToSVIDEO |
-                                              SetCRT2ToSCART |
-                                              SetCRT2ToYPbPr525750 |
-                                              SetCRT2ToHiVision)) {
-                       tempax |= SupportTV;
-
-                       if (pVBInfo->VBType & (VB_SIS301B |
-                                              VB_SIS302B |
-                                              VB_SIS301LV |
-                                              VB_SIS302LV |
-                                              VB_XGI301C))
-                               tempax |= SupportTV1024;
-
-                       if (!(pVBInfo->VBInfo & TVSetPAL) &&
-                           (modeflag & NoSupportSimuTV) &&
-                           (pVBInfo->VBInfo & SetInSlaveMode) &&
-                           (!(pVBInfo->VBInfo & SetNotSimuMode)))
-                               return 0;
-               }
-       } else if (pVBInfo->VBInfo & SetCRT2ToLCD) { /* for LVDS */
+       /* 301b */
+       if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
                tempax |= SupportLCD;
 
-               if (resinfo > 0x08)
-                       return 0; /* 1024x768 */
-
-               if (pVBInfo->LCDResInfo < Panel_1024x768) {
-                       if (resinfo > 0x07)
-                               return 0; /* 800x600 */
+               if (pVBInfo->LCDResInfo != Panel_1280x1024 &&
+                   pVBInfo->LCDResInfo != Panel_1280x960 &&
+                   (pVBInfo->LCDInfo & LCDNonExpanding) &&
+                   resinfo >= 9)
+                       return 0;
+       }
 
-                       if (resinfo == 0x04)
-                               return 0; /* 512x384 */
-               }
+       if (pVBInfo->VBInfo & SetCRT2ToHiVision) { /* for HiTV */
+               tempax |= SupportHiVision;
+               if ((pVBInfo->VBInfo & SetInSlaveMode) &&
+                   ((resinfo == 4) ||
+                    (resinfo == 3 && (pVBInfo->SetFlag & TVSimuMode)) ||
+                    (resinfo > 7)))
+                       return 0;
+       } else if (pVBInfo->VBInfo & (SetCRT2ToAVIDEO | SetCRT2ToSVIDEO |
+                                     SetCRT2ToSCART | SetCRT2ToYPbPr525750 |
+                                     SetCRT2ToHiVision)) {
+               tempax |= SupportTV;
+
+               if (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV |
+                                      VB_SIS302LV | VB_XGI301C))
+                       tempax |= SupportTV1024;
+
+               if (!(pVBInfo->VBInfo & TVSetPAL) &&
+                   (modeflag & NoSupportSimuTV) &&
+                   (pVBInfo->VBInfo & SetInSlaveMode) &&
+                   (!(pVBInfo->VBInfo & SetNotSimuMode)))
+                       return 0;
        }
 
        for (; XGI330_RefIndex[RefreshRateTableIndex + (*i)].ModeID ==
@@ -759,7 +741,6 @@ static void XGI_SetCRT1DE(struct xgi_hw_device_info *HwDeviceExtension,
 
        xgifb_reg_and_or(pVBInfo->P3d4, 0x07, ~0x42, tempax);
        data = xgifb_reg_get(pVBInfo->P3d4, 0x07);
-       data &= 0xFF;
        tempax = 0;
 
        if (tempbx & 0x04)
@@ -914,16 +895,10 @@ static void XGI_SetCRT1VCLK(unsigned short ModeNo,
        unsigned char index, data;
        unsigned short vclkindex;
 
-       if (pVBInfo->IF_DEF_LVDS == 1) {
-               index = XGI330_RefIndex[RefreshRateTableIndex].Ext_CRTVCLK;
-               data = xgifb_reg_get(pVBInfo->P3c4, 0x31) & 0xCF;
-               xgifb_reg_set(pVBInfo->P3c4, 0x31, data);
-               xgifb_reg_set(pVBInfo->P3c4, 0x2B, XGI_VCLKData[index].SR2B);
-               xgifb_reg_set(pVBInfo->P3c4, 0x2C, XGI_VCLKData[index].SR2C);
-               xgifb_reg_set(pVBInfo->P3c4, 0x2D, 0x01);
-       } else if ((pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV
-                       | VB_SIS302LV | VB_XGI301C)) && (pVBInfo->VBInfo
-                       & XGI_SetCRT2ToLCDA)) {
+       if ((pVBInfo->IF_DEF_LVDS == 0) &&
+           (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV |
+                               VB_SIS302LV | VB_XGI301C)) &&
+           (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) {
                vclkindex = XGI_GetVCLK2Ptr(ModeNo, ModeIdIndex,
                                RefreshRateTableIndex, HwDeviceExtension,
                                pVBInfo);
@@ -1448,8 +1423,6 @@ static void XGI_GetLCDSync(unsigned short *HSyncWidth,
        Index = XGI_GetLCDCapPtr(pVBInfo);
        *HSyncWidth = pVBInfo->LCDCapList[Index].LCD_HSyncWidth;
        *VSyncWidth = pVBInfo->LCDCapList[Index].LCD_VSyncWidth;
-
-       return;
 }
 
 static void XGI_SetLVDSRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -1589,10 +1562,8 @@ static void XGI_SetLVDSRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
        xgifb_reg_and_or(pVBInfo->Part1Port, 0x1a, 0x07,
                                tempax);
 
-       tempcx = pVBInfo->VGAVT;
        tempbx = pVBInfo->VDE;
        tempax = pVBInfo->VGAVDE;
-       tempcx -= tempax;
 
        temp = tempax; /* 0430 ylshieh */
        temp1 = (temp << 18) / tempbx;
@@ -1712,7 +1683,6 @@ static void XGI_GetLCDVCLKPtr(unsigned char *di_0, unsigned char *di_1,
                        *di_1 = pVBInfo->LCDCapList[index].LCDA_VCLKData2;
                }
        }
-       return;
 }
 
 static unsigned char XGI_GetVCLKPtr(unsigned short RefreshRateTableIndex,
@@ -1907,8 +1877,6 @@ static void XGI_UpdateModeInfo(struct xgi_hw_device_info *HwDeviceExtension,
 
                if (!(pVBInfo->SetFlag & ReserveTVOption))
                        xgifb_reg_set(pVBInfo->P3d4, 0x3e, tempch);
-       } else {
-               return;
        }
 }
 
@@ -1916,9 +1884,6 @@ void XGI_GetVBType(struct vb_device_info *pVBInfo)
 {
        unsigned short flag, tempbx, tempah;
 
-       if (pVBInfo->IF_DEF_LVDS != 0)
-               return;
-
        tempbx = VB_SIS302B;
        flag = xgifb_reg_get(pVBInfo->Part4Port, 0x00);
        if (flag == 0x02)
@@ -1995,37 +1960,23 @@ static void XGI_GetVBInfo(unsigned short ModeNo, unsigned short ModeIdIndex,
                }
        }
 
-       if (pVBInfo->IF_DEF_YPbPr == 1) {
-               if (pVBInfo->VBType & (VB_SIS301LV|VB_SIS302LV|VB_XGI301C)) {
-                       if (temp & SetYPbPr) {
-                               if (pVBInfo->IF_DEF_HiVision == 1) {
-                                       /* shampoo add for new scratch */
-                                       temp = xgifb_reg_get(pVBInfo->P3d4,
-                                                            0x35);
-                                       temp &= YPbPrMode;
-                                       tempbx |= SetCRT2ToHiVision;
+       if (pVBInfo->VBType & (VB_SIS301LV|VB_SIS302LV|VB_XGI301C)) {
+               if (temp & SetYPbPr) {
+                       /* shampoo add for new scratch */
+                       temp = xgifb_reg_get(pVBInfo->P3d4, 0x35);
+                       temp &= YPbPrMode;
+                       tempbx |= SetCRT2ToHiVision;
 
-                                       if (temp != YPbPrMode1080i) {
-                                               tempbx &= (~SetCRT2ToHiVision);
-                                               tempbx |= SetCRT2ToYPbPr525750;
-                                       }
-                               }
+                       if (temp != YPbPrMode1080i) {
+                               tempbx &= (~SetCRT2ToHiVision);
+                               tempbx |= SetCRT2ToYPbPr525750;
                        }
                }
        }
 
        tempax = push; /* restore CR31 */
 
-       if (pVBInfo->IF_DEF_YPbPr == 1) {
-               if (pVBInfo->IF_DEF_HiVision == 1)
-                       temp = 0x09FC;
-               else
-                       temp = 0x097C;
-       } else if (pVBInfo->IF_DEF_HiVision == 1) {
-               temp = 0x01FC;
-       } else {
-               temp = 0x017C;
-       }
+       temp = 0x09FC;
 
        if (!(tempbx & temp)) {
                tempax |= DisableCRT2Display;
@@ -2046,15 +1997,10 @@ static void XGI_GetVBInfo(unsigned short ModeNo, unsigned short ModeIdIndex,
        /* shampoo add */
        /* for driver abnormal */
        if (!(tempbx & (SwitchCRT2 | SetSimuScanMode))) {
-               if (pVBInfo->IF_DEF_CRT2Monitor == 1) {
-                       if (tempbx & SetCRT2ToRAMDAC) {
-                               tempbx &= (0xFF00 | SetCRT2ToRAMDAC |
-                                          SwitchCRT2 | SetSimuScanMode);
-                               tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
-                       }
-               } else {
-                       tempbx &= (~(SetCRT2ToRAMDAC | SetCRT2ToLCD |
-                                    SetCRT2ToTV));
+               if (tempbx & SetCRT2ToRAMDAC) {
+                       tempbx &= (0xFF00 | SetCRT2ToRAMDAC |
+                                  SwitchCRT2 | SetSimuScanMode);
+                       tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
                }
        }
 
@@ -2072,16 +2018,12 @@ static void XGI_GetVBInfo(unsigned short ModeNo, unsigned short ModeIdIndex,
                tempbx &= (0x00FF | (~SetCRT2ToYPbPr525750));
        }
 
-       if (pVBInfo->IF_DEF_YPbPr == 1) {
-               if (tempbx & SetCRT2ToYPbPr525750)
-                       tempbx &= (0xFF00 | SwitchCRT2 | SetSimuScanMode);
-       }
+       if (tempbx & SetCRT2ToYPbPr525750)
+               tempbx &= (0xFF00 | SwitchCRT2 | SetSimuScanMode);
 
-       if (pVBInfo->IF_DEF_HiVision == 1) {
-               if (tempbx & SetCRT2ToHiVision)
-                       tempbx &= (0xFF00 | SetCRT2ToHiVision | SwitchCRT2 |
-                                  SetSimuScanMode);
-       }
+       if (tempbx & SetCRT2ToHiVision)
+               tempbx &= (0xFF00 | SetCRT2ToHiVision | SwitchCRT2 |
+                          SetSimuScanMode);
 
        if (tempax & DisableCRT2Display) { /* Set Display Device Info */
                if (!(tempbx & (SwitchCRT2 | SetSimuScanMode)))
@@ -2132,25 +2074,21 @@ static void XGI_GetTVInfo(unsigned short ModeNo, unsigned short ModeIdIndex,
                if (pVBInfo->VBInfo & SetCRT2ToSCART)
                        tempbx |= TVSetPAL;
 
-               if (pVBInfo->IF_DEF_YPbPr == 1) {
-                       if (pVBInfo->VBInfo & SetCRT2ToYPbPr525750) {
-                               index1 = xgifb_reg_get(pVBInfo->P3d4, 0x35);
-                               index1 &= YPbPrMode;
+               if (pVBInfo->VBInfo & SetCRT2ToYPbPr525750) {
+                       index1 = xgifb_reg_get(pVBInfo->P3d4, 0x35);
+                       index1 &= YPbPrMode;
 
-                               if (index1 == YPbPrMode525i)
-                                       tempbx |= TVSetYPbPr525i;
+                       if (index1 == YPbPrMode525i)
+                               tempbx |= TVSetYPbPr525i;
 
-                               if (index1 == YPbPrMode525p)
-                                       tempbx = tempbx | TVSetYPbPr525p;
-                               if (index1 == YPbPrMode750p)
-                                       tempbx = tempbx | TVSetYPbPr750p;
-                       }
+                       if (index1 == YPbPrMode525p)
+                               tempbx = tempbx | TVSetYPbPr525p;
+                       if (index1 == YPbPrMode750p)
+                               tempbx = tempbx | TVSetYPbPr750p;
                }
 
-               if (pVBInfo->IF_DEF_HiVision == 1) {
-                       if (pVBInfo->VBInfo & SetCRT2ToHiVision)
-                               tempbx = tempbx | TVSetHiVision | TVSetPAL;
-               }
+               if (pVBInfo->VBInfo & SetCRT2ToHiVision)
+                       tempbx = tempbx | TVSetHiVision | TVSetPAL;
 
                if ((pVBInfo->VBInfo & SetInSlaveMode) &&
                    (!(pVBInfo->VBInfo & SetNotSimuMode)))
@@ -2657,10 +2595,7 @@ static void XGI_GetCRT2Data(unsigned short ModeNo, unsigned short ModeIdIndex,
                                        tempbx = 775;
                                else if (pVBInfo->VGAVDE == 600)
                                        tempbx = 775;
-                               else
-                                       tempbx = 768;
-                       } else
-                               tempbx = 768;
+                       }
                } else if (pVBInfo->LCDResInfo == Panel_1024x768x75) {
                        tempax = 1024;
                        tempbx = 768;
@@ -2784,7 +2719,6 @@ static void XGI_GetCRT2Data(unsigned short ModeNo, unsigned short ModeIdIndex,
 
                pVBInfo->HT = tempax;
                pVBInfo->VT = tempbx;
-               return;
        }
 }
 
@@ -3015,9 +2949,6 @@ static void XGI_SetGroup1(unsigned short ModeNo, unsigned short ModeIdIndex,
        temp |= ((tempcx & 0xFF00) >> 8);
        xgifb_reg_set(pVBInfo->Part1Port, 0x12, temp);
 
-       tempax = pVBInfo->VGAVDE;
-       tempbx = pVBInfo->VGAVDE;
-       tempcx = pVBInfo->VGAVT;
        /* BTVGA2VRS 0x10,0x11 */
        tempbx = (pVBInfo->VGAVT + pVBInfo->VGAVDE) >> 1;
        /* BTVGA2VRE 0x11 */
@@ -3178,7 +3109,7 @@ static void XGI_SetLockRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
        if (pVBInfo->VBInfo & SetCRT2ToTV) {
                if (pVBInfo->TVInfo & TVSimuMode) {
                        if (ModeNo == 0x50) {
-                               if (pVBInfo->TVInfo & SetNTSCTV) {
+                               if (pVBInfo->TVInfo == SetNTSCTV) {
                                        xgifb_reg_set(pVBInfo->Part1Port,
                                                        0x07, 0x30);
                                        xgifb_reg_set(pVBInfo->Part1Port,
@@ -3226,7 +3157,6 @@ static void XGI_SetLockRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
                }
        }
        tempbx--;
-       temp = tempbx & 0x00FF;
        tempbx--;
        temp = tempbx & 0x00FF;
        /* 0x10 vertical Blank Start */
@@ -3361,8 +3291,6 @@ static void XGI_SetLockRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
                temp = 0x00;
 
        xgifb_reg_set(pVBInfo->Part1Port, 0x1A, temp); /* 0x1A SR0E */
-
-       return;
 }
 
 static void XGI_SetGroup2(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -3445,9 +3373,6 @@ static void XGI_SetGroup2(unsigned short ModeNo, unsigned short ModeIdIndex,
        temp &= 0x80;
        xgifb_reg_and_or(pVBInfo->Part2Port, 0x0A, 0xFF, temp);
 
-       if (pVBInfo->VBInfo & SetCRT2ToHiVision)
-               tempax = 950;
-
        if (pVBInfo->TVInfo & TVSetPAL)
                tempax = 520;
        else
@@ -3797,9 +3722,6 @@ static void XGI_SetGroup2(unsigned short ModeNo, unsigned short ModeIdIndex,
                if (!(pVBInfo->VBInfo & SetInSlaveMode))
                        xgifb_reg_set(pVBInfo->Part2Port, 0x0B, 0x00);
        }
-
-       if (pVBInfo->VBInfo & SetCRT2ToTV)
-               return;
 }
 
 static void XGI_SetLCDRegs(unsigned short ModeNo, unsigned short ModeIdIndex,
@@ -4135,8 +4057,7 @@ static void XGI_SetGroup3(unsigned short ModeNo, unsigned short ModeIdIndex,
                                xgifb_reg_set(pVBInfo->Part3Port, 0x28, 0x3f);
                }
        }
-       return;
-} /* {end of XGI_SetGroup3} */
+}
 
 static void XGI_SetGroup4(unsigned short ModeNo, unsigned short ModeIdIndex,
                unsigned short RefreshRateTableIndex,
@@ -4211,11 +4132,6 @@ static void XGI_SetGroup4(unsigned short ModeNo, unsigned short ModeIdIndex,
 
        tempebx = pVBInfo->VDE;
 
-       if (tempcx & SetCRT2ToHiVision) {
-               if (!(temp & 0xE000))
-                       tempbx = tempbx >> 1;
-       }
-
        tempcx = pVBInfo->RVBHRS;
        temp = tempcx & 0x00FF;
        xgifb_reg_set(pVBInfo->Part4Port, 0x18, temp);
@@ -4325,13 +4241,6 @@ static void XGI_SetGroup5(unsigned short ModeNo, unsigned short ModeIdIndex,
                        XGINew_EnableCRT2(pVBInfo);
                }
        }
-       return;
-}
-
-static void XGI_EnableGatingCRT(struct xgi_hw_device_info *HwDeviceExtension,
-               struct vb_device_info *pVBInfo)
-{
-       xgifb_reg_and_or(pVBInfo->P3d4, 0x63, 0xBF, 0x40);
 }
 
 static void XGI_DisableGatingCRT(struct xgi_hw_device_info *HwDeviceExtension,
@@ -4592,38 +4501,6 @@ static unsigned char XGI_IsLCDON(struct vb_device_info *pVBInfo)
        return 0;
 }
 
-/* --------------------------------------------------------------------- */
-/* Function : XGI_EnableChISLCD */
-/* Input : */
-/* Output : 0 -> Not LCD mode */
-/* Description : if bool enable = true -> enable, else disable  */
-/* --------------------------------------------------------------------- */
-static unsigned char XGI_EnableChISLCD(struct vb_device_info *pVBInfo,
-       bool enable)
-{
-       unsigned short tempbx, tempah;
-
-       if (enable)
-               tempbx = pVBInfo->SetFlag & (EnableChA | EnableChB);
-       else
-               tempbx = pVBInfo->SetFlag & (DisableChA | DisableChB);
-
-       tempah = ~((unsigned short) xgifb_reg_get(pVBInfo->Part1Port, 0x2E));
-
-       if (tempbx & (EnableChA | DisableChA)) {
-               if (!(tempah & 0x08)) /* Chk LCDA Mode */
-                       return 0;
-       }
-
-       if (!(tempbx & (EnableChB | DisableChB)))
-               return 0;
-
-       if (tempah & 0x01) /* Chk LCDB Mode */
-               return 1;
-
-       return 0;
-}
-
 static void XGI_DisableBridge(struct xgifb_video_info *xgifb_info,
                struct xgi_hw_device_info *HwDeviceExtension,
                struct vb_device_info *pVBInfo)
@@ -4636,21 +4513,8 @@ static void XGI_DisableBridge(struct xgifb_video_info *xgifb_info,
                if (!(pVBInfo->VBInfo &
                    (DisableCRT2Display | SetSimuScanMode))) {
                        if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA) {
-                               if (pVBInfo->VBInfo & SetCRT2ToDualEdge) {
+                               if (pVBInfo->VBInfo & SetCRT2ToDualEdge)
                                        tempah = 0x7F; /* Disable Channel A */
-                                       if (!(pVBInfo->VBInfo &
-                                             XGI_SetCRT2ToLCDA))
-                                               /* Disable Channel B */
-                                               tempah = 0xBF;
-
-                                       if (pVBInfo->SetFlag & DisableChB)
-                                               /* force to disable Cahnnel */
-                                               tempah &= 0xBF;
-
-                                       if (pVBInfo->SetFlag & DisableChA)
-                                               /* Force to disable Channel B */
-                                               tempah &= 0x7F;
-                               }
                        }
                }
 
@@ -4660,26 +4524,18 @@ static void XGI_DisableBridge(struct xgifb_video_info *xgifb_info,
                if (pVBInfo->VBType & (VB_SIS302LV | VB_XGI301C)) {
                        if (((pVBInfo->VBInfo &
                              (SetCRT2ToLCD | XGI_SetCRT2ToLCDA))) ||
-                               (XGI_EnableChISLCD(pVBInfo, false)) ||
                                (XGI_IsLCDON(pVBInfo)))
                                /* LVDS Driver power down */
                                xgifb_reg_or(pVBInfo->Part4Port, 0x30, 0x80);
                }
 
-               if ((pVBInfo->SetFlag & DisableChA) || (pVBInfo->VBInfo
-                               & (DisableCRT2Display | XGI_SetCRT2ToLCDA
-                                               | SetSimuScanMode))) {
-                       if (pVBInfo->SetFlag & GatingCRT)
-                               XGI_EnableGatingCRT(HwDeviceExtension, pVBInfo);
+               if (pVBInfo->VBInfo & (DisableCRT2Display | XGI_SetCRT2ToLCDA |
+                                      SetSimuScanMode))
                        XGI_DisplayOff(xgifb_info, HwDeviceExtension, pVBInfo);
-               }
 
-               if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA) {
-                       if ((pVBInfo->SetFlag & DisableChA) || (pVBInfo->VBInfo
-                                       & XGI_SetCRT2ToLCDA))
-                               /* Power down */
-                               xgifb_reg_and(pVBInfo->Part1Port, 0x1e, 0xdf);
-               }
+               if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)
+                       /* Power down */
+                       xgifb_reg_and(pVBInfo->Part1Port, 0x1e, 0xdf);
 
                /* disable TV as primary VGA swap */
                xgifb_reg_and(pVBInfo->P3c4, 0x32, 0xdf);
@@ -4687,16 +4543,14 @@ static void XGI_DisableBridge(struct xgifb_video_info *xgifb_info,
                if ((pVBInfo->VBInfo & (SetSimuScanMode | SetCRT2ToDualEdge)))
                        xgifb_reg_and(pVBInfo->Part2Port, 0x00, 0xdf);
 
-               if ((pVBInfo->SetFlag & DisableChB) ||
-                   (pVBInfo->VBInfo &
+               if ((pVBInfo->VBInfo &
                        (DisableCRT2Display | SetSimuScanMode)) ||
                    ((!(pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) &&
                    (pVBInfo->VBInfo &
                        (SetCRT2ToRAMDAC | SetCRT2ToLCD | SetCRT2ToTV))))
                        xgifb_reg_or(pVBInfo->Part1Port, 0x00, 0x80);
 
-               if ((pVBInfo->SetFlag & DisableChB) ||
-                   (pVBInfo->VBInfo &
+               if ((pVBInfo->VBInfo &
                        (DisableCRT2Display | SetSimuScanMode)) ||
                    (!(pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)) ||
                    (pVBInfo->VBInfo &
@@ -5308,21 +5162,6 @@ void XGI_LockCRT2(struct xgi_hw_device_info *HwDeviceExtension,
 
 }
 
-unsigned char XGI_BridgeIsOn(struct vb_device_info *pVBInfo)
-{
-       unsigned short flag;
-
-       if (pVBInfo->IF_DEF_LVDS == 1) {
-               return 1;
-       } else {
-               flag = xgifb_reg_get(pVBInfo->Part4Port, 0x00);
-               if ((flag == 1) || (flag == 2))
-                       return 1; /* 301b */
-               else
-                       return 0;
-       }
-}
-
 unsigned short XGI_GetRatePtrCRT2(struct xgi_hw_device_info *pXGIHWDE,
                unsigned short ModeNo, unsigned short ModeIdIndex,
                struct vb_device_info *pVBInfo)
@@ -5344,15 +5183,10 @@ unsigned short XGI_GetRatePtrCRT2(struct xgi_hw_device_info *pXGIHWDE,
 
        if (pVBInfo->SetFlag & ProgrammingCRT2) {
                if (pVBInfo->VBInfo & (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)) {
-                       if (pVBInfo->IF_DEF_LVDS == 0) {
-                               temp = LCDARefreshIndex[
-                                       pVBInfo->LCDResInfo & 0x07];
+                       temp = LCDARefreshIndex[pVBInfo->LCDResInfo & 0x07];
 
-                               if (index > temp)
-                                       index = temp;
-                       } else {
-                               index = 0;
-                       }
+                       if (index > temp)
+                               index = temp;
                }
        }
 
@@ -5555,53 +5389,37 @@ static void XGI_EnableBridge(struct xgifb_video_info *xgifb_info,
 
        if (pVBInfo->VBType & (VB_SIS301B | VB_SIS302B | VB_SIS301LV
                        | VB_SIS302LV | VB_XGI301C)) {
-               if (!(pVBInfo->SetFlag & DisableChA)) {
-                       if ((pVBInfo->SetFlag & EnableChA) ||
-                           (pVBInfo->VBInfo & SetCRT2ToDualEdge)) {
-                               /* Power on */
-                               xgifb_reg_set(pVBInfo->Part1Port, 0x1E, 0x20);
+               if (pVBInfo->VBInfo & SetCRT2ToDualEdge)
+                       /* Power on */
+                       xgifb_reg_set(pVBInfo->Part1Port, 0x1E, 0x20);
+
+               if (pVBInfo->VBInfo & (SetCRT2ToLCD | SetCRT2ToTV |
+                                      SetCRT2ToRAMDAC)) {
+                       tempah = xgifb_reg_get(pVBInfo->P3c4, 0x32);
+                       tempah &= 0xDF;
+                       if (pVBInfo->VBInfo & SetInSlaveMode) {
+                               if (!(pVBInfo->VBInfo & SetCRT2ToRAMDAC))
+                                       tempah |= 0x20;
                        }
-               }
+                       xgifb_reg_set(pVBInfo->P3c4, 0x32, tempah);
+                       xgifb_reg_or(pVBInfo->P3c4, 0x1E, 0x20);
 
-               if (!(pVBInfo->SetFlag & DisableChB)) {
-                       if ((pVBInfo->SetFlag & EnableChB) || (pVBInfo->VBInfo
-                                       & (SetCRT2ToLCD | SetCRT2ToTV
-                                                       | SetCRT2ToRAMDAC))) {
-                               tempah = xgifb_reg_get(pVBInfo->P3c4, 0x32);
-                               tempah &= 0xDF;
-                               if (pVBInfo->VBInfo & SetInSlaveMode) {
-                                       if (!(pVBInfo->VBInfo &
-                                             SetCRT2ToRAMDAC))
-                                               tempah |= 0x20;
-                               }
-                               xgifb_reg_set(pVBInfo->P3c4, 0x32, tempah);
-                               xgifb_reg_or(pVBInfo->P3c4, 0x1E, 0x20);
+                       tempah = xgifb_reg_get(pVBInfo->Part1Port, 0x2E);
 
-                               tempah = xgifb_reg_get(pVBInfo->Part1Port,
-                                                      0x2E);
-
-                               if (!(tempah & 0x80))
-                                       xgifb_reg_or(pVBInfo->Part1Port,
-                                                       0x2E, 0x80);
-                               xgifb_reg_and(pVBInfo->Part1Port, 0x00, 0x7F);
-                       }
+                       if (!(tempah & 0x80))
+                               xgifb_reg_or(pVBInfo->Part1Port, 0x2E, 0x80);
+                       xgifb_reg_and(pVBInfo->Part1Port, 0x00, 0x7F);
                }
 
-               if ((pVBInfo->SetFlag & (EnableChA | EnableChB))
-                               || (!(pVBInfo->VBInfo & DisableCRT2Display))) {
+               if (!(pVBInfo->VBInfo & DisableCRT2Display)) {
                        xgifb_reg_and_or(pVBInfo->Part2Port, 0x00, ~0xE0,
                                        0x20); /* shampoo 0129 */
                        if (pVBInfo->VBType & (VB_SIS302LV | VB_XGI301C)) {
-                               if (!XGI_EnableChISLCD(pVBInfo, false)) {
-                                       if (XGI_EnableChISLCD(pVBInfo, true) ||
-                                           (pVBInfo->VBInfo &
-                                           (SetCRT2ToLCD | XGI_SetCRT2ToLCDA)))
-                                               /* LVDS PLL power on */
-                                               xgifb_reg_and(
-                                                       pVBInfo->Part4Port,
-                                                       0x2A,
-                                                       0x7F);
-                               }
+                               if (pVBInfo->VBInfo &
+                                       (SetCRT2ToLCD | XGI_SetCRT2ToLCDA))
+                                       /* LVDS PLL power on */
+                                       xgifb_reg_and(pVBInfo->Part4Port, 0x2A,
+                                                     0x7F);
                                /* LVDS Driver power on */
                                xgifb_reg_and(pVBInfo->Part4Port, 0x30, 0x7F);
                        }
@@ -5618,32 +5436,14 @@ static void XGI_EnableBridge(struct xgifb_video_info *xgifb_info,
                                tempah = tempah & 0x40;
                                if (pVBInfo->VBInfo & XGI_SetCRT2ToLCDA)
                                        tempah = tempah ^ 0xC0;
-
-                               if (pVBInfo->SetFlag & DisableChB)
-                                       tempah &= 0xBF;
-
-                               if (pVBInfo->SetFlag &  DisableChA)
-                                       tempah &= 0x7F;
-
-                               if (pVBInfo->SetFlag &  EnableChB)
-                                       tempah |= 0x40;
-
-                               if (pVBInfo->SetFlag &  EnableChA)
-                                       tempah |= 0x80;
                        }
                }
 
                /* EnablePart4_1F */
                xgifb_reg_or(pVBInfo->Part4Port, 0x1F, tempah);
 
-               if (!(pVBInfo->SetFlag & DisableChA)) {
-                       if (!(pVBInfo->SetFlag & GatingCRT)) {
-                               XGI_DisableGatingCRT(HwDeviceExtension,
-                                                    pVBInfo);
-                               XGI_DisplayOn(xgifb_info, HwDeviceExtension,
-                                               pVBInfo);
-                       }
-               }
+               XGI_DisableGatingCRT(HwDeviceExtension, pVBInfo);
+               XGI_DisplayOn(xgifb_info, HwDeviceExtension, pVBInfo);
        } /* 301 */
        else { /* LVDS */
                if (pVBInfo->VBInfo & (SetCRT2ToTV | SetCRT2ToLCD
@@ -5745,16 +5545,8 @@ unsigned char XGISetModeNew(struct xgifb_video_info *xgifb_info,
        struct vb_device_info *pVBInfo = &VBINF;
        pVBInfo->IF_DEF_LVDS = 0;
 
-       if (HwDeviceExtension->jChipType >= XG20) {
-               pVBInfo->IF_DEF_YPbPr = 0;
-               pVBInfo->IF_DEF_HiVision = 0;
-               pVBInfo->IF_DEF_CRT2Monitor = 0;
+       if (HwDeviceExtension->jChipType >= XG20)
                pVBInfo->VBType = 0; /*set VBType default 0*/
-       } else {
-               pVBInfo->IF_DEF_YPbPr = 1;
-               pVBInfo->IF_DEF_HiVision = 1;
-               pVBInfo->IF_DEF_CRT2Monitor = 1;
-       }
 
        XGIRegInit(pVBInfo, xgifb_info->vga_base);
 
@@ -5770,9 +5562,6 @@ unsigned char XGISetModeNew(struct xgifb_video_info *xgifb_info,
                }
        }
 
-       if (HwDeviceExtension->jChipType < XG20)
-               XGI_GetVBType(pVBInfo);
-
        InitTo330Pointer(HwDeviceExtension->jChipType, pVBInfo);
        if (ModeNo & 0x80)
                ModeNo = ModeNo & 0x7F;
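
The vb_setmode.c churn above follows a single pattern: XGISetModeNew() pins IF_DEF_LVDS to 0, and the IF_DEF_YPbPr, IF_DEF_HiVision and IF_DEF_CRT2Monitor flags are removed outright with their guarded code kept unconditionally, so every branch testing those flags is statically dead and collapses to its live arm. A minimal sketch of the transformation, with hypothetical names:

    /* Before: a flag assigned once at init still guards branches. */
    info->if_def_lvds = 0;

    if (info->if_def_lvds == 1)
            program_lvds_clock(info);       /* statically unreachable */
    else
            program_bridge_clock(info);

    /* After: drop the test and the dead arm. */
    program_bridge_clock(info);
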
index 552482858c1ccc62532025a203daf9268a89d198..2c0a31c8dfd5083a321ff0a74238fb7e0e16b5a5 100644 (file)
@@ -18,7 +18,6 @@ extern unsigned char XGISetModeNew(struct xgifb_video_info *xgifb_info,
 extern unsigned char XGI_SearchModeID(unsigned short ModeNo,
                                      unsigned short *ModeIdIndex,
                                      struct vb_device_info *);
-extern unsigned char XGI_BridgeIsOn(struct vb_device_info *);
 extern unsigned short XGI_GetRatePtrCRT2(struct xgi_hw_device_info *pXGIHWDE,
                                         unsigned short ModeNo,
                                         unsigned short ModeIdIndex,
index e34e3fe0ae2e61f12f7c167ba74588d663dee09a..ec2b2b5a122e87831fa0661cedd24b86f2cc2ce1 100644 (file)
 
 /* Globals */
 static int zram_major;
-struct zram *zram_devices;
+static struct zram *zram_devices;
 
 /* Module params (documentation at end) */
 static unsigned int num_devices = 1;
 
-static void zram_stat64_add(struct zram *zram, u64 *v, u64 inc)
-{
-       spin_lock(&zram->stat64_lock);
-       *v = *v + inc;
-       spin_unlock(&zram->stat64_lock);
-}
-
-static void zram_stat64_sub(struct zram *zram, u64 *v, u64 dec)
-{
-       spin_lock(&zram->stat64_lock);
-       *v = *v - dec;
-       spin_unlock(&zram->stat64_lock);
-}
-
-static void zram_stat64_inc(struct zram *zram, u64 *v)
-{
-       zram_stat64_add(zram, v, 1);
-}
-
 static int zram_test_flag(struct zram_meta *meta, u32 index,
                        enum zram_pageflags flag)
 {
@@ -120,31 +101,33 @@ static void zram_free_page(struct zram *zram, size_t index)
        if (size <= PAGE_SIZE / 2)
                zram->stats.good_compress--;
 
-       zram_stat64_sub(zram, &zram->stats.compr_size,
-                       meta->table[index].size);
+       atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
        zram->stats.pages_stored--;
 
        meta->table[index].handle = 0;
        meta->table[index].size = 0;
 }
 
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+       return bvec->bv_len != PAGE_SIZE;
+}
+
 static void handle_zero_page(struct bio_vec *bvec)
 {
        struct page *page = bvec->bv_page;
        void *user_mem;
 
        user_mem = kmap_atomic(page);
-       memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+       if (is_partial_io(bvec))
+               memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+       else
+               clear_page(user_mem);
        kunmap_atomic(user_mem);
 
        flush_dcache_page(page);
 }
 
-static inline int is_partial_io(struct bio_vec *bvec)
-{
-       return bvec->bv_len != PAGE_SIZE;
-}
-
 static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
 {
        int ret = LZO_E_OK;
@@ -154,13 +137,13 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
        unsigned long handle = meta->table[index].handle;
 
        if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
-               memset(mem, 0, PAGE_SIZE);
+               clear_page(mem);
                return 0;
        }
 
        cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
        if (meta->table[index].size == PAGE_SIZE)
-               memcpy(mem, cmem, PAGE_SIZE);
+               copy_page(mem, cmem);
        else
                ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
                                                mem, &clen);
@@ -169,7 +152,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
        /* Should NEVER happen. Return bio error if it does. */
        if (unlikely(ret != LZO_E_OK)) {
                pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-               zram_stat64_inc(zram, &zram->stats.failed_reads);
+               atomic64_inc(&zram->stats.failed_reads);
                return ret;
        }
 
@@ -272,8 +255,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 
        if (page_zero_filled(uncmem)) {
                kunmap_atomic(user_mem);
-               if (is_partial_io(bvec))
-                       kfree(uncmem);
                zram->stats.pages_zero++;
                zram_set_flag(meta, index, ZRAM_ZERO);
                ret = 0;
@@ -304,18 +285,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 
        handle = zs_malloc(meta->mem_pool, clen);
        if (!handle) {
-               pr_info("Error allocating memory for compressed "
-                       "page: %u, size=%zu\n", index, clen);
+               pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+                       index, clen);
                ret = -ENOMEM;
                goto out;
        }
        cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
 
-       if ((clen == PAGE_SIZE) && !is_partial_io(bvec))
+       if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
                src = kmap_atomic(page);
-       memcpy(cmem, src, clen);
-       if ((clen == PAGE_SIZE) && !is_partial_io(bvec))
+               copy_page(cmem, src);
                kunmap_atomic(src);
+       } else {
+               memcpy(cmem, src, clen);
+       }
 
        zs_unmap_object(meta->mem_pool, handle);
 
@@ -323,7 +306,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
        meta->table[index].size = clen;
 
        /* Update stats */
-       zram_stat64_add(zram, &zram->stats.compr_size, clen);
+       atomic64_add(clen, &zram->stats.compr_size);
        zram->stats.pages_stored++;
        if (clen <= PAGE_SIZE / 2)
                zram->stats.good_compress++;
@@ -333,7 +316,7 @@ out:
                kfree(uncmem);
 
        if (ret)
-               zram_stat64_inc(zram, &zram->stats.failed_writes);
+               atomic64_inc(&zram->stats.failed_writes);
        return ret;
 }
 
@@ -370,10 +353,10 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
 
        switch (rw) {
        case READ:
-               zram_stat64_inc(zram, &zram->stats.num_reads);
+               atomic64_inc(&zram->stats.num_reads);
                break;
        case WRITE:
-               zram_stat64_inc(zram, &zram->stats.num_writes);
+               atomic64_inc(&zram->stats.num_writes);
                break;
        }
 
@@ -422,13 +405,20 @@ out:
  */
 static inline int valid_io_request(struct zram *zram, struct bio *bio)
 {
-       if (unlikely(
-               (bio->bi_sector >= (zram->disksize >> SECTOR_SHIFT)) ||
-               (bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)) ||
-               (bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))) {
+       u64 start, end, bound;
+
+       /* unaligned request */
+       if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+               return 0;
+       if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+               return 0;
 
+       start = bio->bi_sector;
+       end = start + (bio->bi_size >> SECTOR_SHIFT);
+       bound = zram->disksize >> SECTOR_SHIFT;
+       /* out of range */
+       if (unlikely(start >= bound || end >= bound || start > end))
                return 0;
-       }
 
        /* I/O request is valid */
        return 1;
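
The rewritten valid_io_request() splits the old compound test and adds an explicit bound on the end of the request. A self-contained sketch of the same arithmetic outside the kernel (SECTOR_SHIFT == 9 assumed, names hypothetical):

    #include <stdint.h>

    /* start/end are in 512-byte sectors, end is exclusive;
     * start > end additionally catches wrap-around of the addition. */
    static int io_in_range(uint64_t start_sector, uint64_t size_bytes,
                           uint64_t disksize_bytes)
    {
            uint64_t start = start_sector;
            uint64_t end   = start + (size_bytes >> 9);
            uint64_t bound = disksize_bytes >> 9;

            return !(start >= bound || end >= bound || start > end);
    }
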
@@ -446,7 +436,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio)
                goto error;
 
        if (!valid_io_request(zram, bio)) {
-               zram_stat64_inc(zram, &zram->stats.invalid_io);
+               atomic64_inc(&zram->stats.invalid_io);
                goto error;
        }
 
@@ -582,8 +572,10 @@ static void zram_slot_free_notify(struct block_device *bdev,
        struct zram *zram;
 
        zram = bdev->bd_disk->private_data;
+       down_write(&zram->lock);
        zram_free_page(zram, index);
-       zram_stat64_inc(zram, &zram->stats.notify_free);
+       up_write(&zram->lock);
+       atomic64_inc(&zram->stats.notify_free);
 }
 
 static const struct block_device_operations zram_devops = {
@@ -593,17 +585,15 @@ static const struct block_device_operations zram_devops = {
 
 static int create_device(struct zram *zram, int device_id)
 {
-       int ret = 0;
+       int ret = -ENOMEM;
 
        init_rwsem(&zram->lock);
        init_rwsem(&zram->init_lock);
-       spin_lock_init(&zram->stat64_lock);
 
        zram->queue = blk_alloc_queue(GFP_KERNEL);
        if (!zram->queue) {
                pr_err("Error allocating disk queue for device %d\n",
                        device_id);
-               ret = -ENOMEM;
                goto out;
        }
 
@@ -613,11 +603,9 @@ static int create_device(struct zram *zram, int device_id)
         /* gendisk structure */
        zram->disk = alloc_disk(1);
        if (!zram->disk) {
-               blk_cleanup_queue(zram->queue);
                pr_warn("Error allocating disk structure for device %d\n",
                        device_id);
-               ret = -ENOMEM;
-               goto out;
+               goto out_free_queue;
        }
 
        zram->disk->major = zram_major;
@@ -646,11 +634,17 @@ static int create_device(struct zram *zram, int device_id)
                                &zram_disk_attr_group);
        if (ret < 0) {
                pr_warn("Error creating sysfs group");
-               goto out;
+               goto out_free_disk;
        }
 
        zram->init_done = 0;
+       return 0;
 
+out_free_disk:
+       del_gendisk(zram->disk);
+       put_disk(zram->disk);
+out_free_queue:
+       blk_cleanup_queue(zram->queue);
 out:
        return ret;
 }
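
create_device() is reshaped into the usual kernel goto-unwind ladder: each acquisition gets a label that releases everything taken before it, so no failure path repeats cleanup inline. A generic, self-contained sketch of the idiom (plain C, resources hypothetical):

    #include <stdlib.h>

    int example_create(void **queue_out, void **disk_out)
    {
            int ret = -1;
            void *queue, *disk;

            queue = malloc(64);             /* stands in for blk_alloc_queue() */
            if (!queue)
                    goto out;

            disk = malloc(64);              /* stands in for alloc_disk() */
            if (!disk)
                    goto out_free_queue;

            *queue_out = queue;             /* success: caller owns both */
            *disk_out = disk;
            return 0;

    out_free_queue:
            free(queue);                    /* undo in reverse order */
    out:
            return ret;
    }
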
@@ -669,11 +663,6 @@ static void destroy_device(struct zram *zram)
                blk_cleanup_queue(zram->queue);
 }
 
-unsigned int zram_get_num_devices(void)
-{
-       return num_devices;
-}
-
 static int __init zram_init(void)
 {
        int ret, dev_id;
@@ -727,8 +716,10 @@ static void __exit zram_exit(void)
        for (i = 0; i < num_devices; i++) {
                zram = &zram_devices[i];
 
+               get_disk(zram->disk);
                destroy_device(zram);
                zram_reset_device(zram);
+               put_disk(zram->disk);
        }
 
        unregister_blkdev(zram_major, "zram");
index 2d1a3f1e8edbdff9dc049d50339ffebcb6056d0d..11b09fc2595334c847ee5764fcd488e8f1289f49 100644 (file)
@@ -69,14 +69,18 @@ struct table {
        u8 flags;
 } __aligned(4);
 
+/*
+ * All 64-bit fields should only be manipulated by 64-bit atomic accessors.
+ * All modifications to 32-bit counters should be protected by zram->lock.
+ */
 struct zram_stats {
-       u64 compr_size;         /* compressed size of pages stored */
-       u64 num_reads;          /* failed + successful */
-       u64 num_writes;         /* --do-- */
-       u64 failed_reads;       /* should NEVER! happen */
-       u64 failed_writes;      /* can happen when memory is too low */
-       u64 invalid_io;         /* non-page-aligned I/O requests */
-       u64 notify_free;        /* no. of swap slot free notifications */
+       atomic64_t compr_size;  /* compressed size of pages stored */
+       atomic64_t num_reads;   /* failed + successful */
+       atomic64_t num_writes;  /* --do-- */
+       atomic64_t failed_reads;        /* should NEVER! happen */
+       atomic64_t failed_writes;       /* can happen when memory is too low */
+       atomic64_t invalid_io;  /* non-page-aligned I/O requests */
+       atomic64_t notify_free; /* no. of swap slot free notifications */
        u32 pages_zero;         /* no. of zero filled pages */
        u32 pages_stored;       /* no. of pages currently stored */
        u32 good_compress;      /* % of pages with compression ratio<=50% */
@@ -92,9 +96,9 @@ struct zram_meta {
 
 struct zram {
        struct zram_meta *meta;
-       spinlock_t stat64_lock; /* protect 64-bit stats */
-       struct rw_semaphore lock; /* protect compression buffers and table
-                                  * against concurrent read and writes */
+       struct rw_semaphore lock; /* protect compression buffers, table,
+                                  * 32bit stat counters against concurrent
+                                  * notifications, reads and writes */
        struct request_queue *queue;
        struct gendisk *disk;
        int init_done;
@@ -109,8 +113,6 @@ struct zram {
        struct zram_stats stats;
 };
 
-extern struct zram *zram_devices;
-unsigned int zram_get_num_devices(void);
 #ifdef CONFIG_SYSFS
 extern struct attribute_group zram_disk_attr_group;
 #endif
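
The stats conversion is mechanical: every u64 counter that used to be serialized by stat64_lock becomes an atomic64_t, and the zram_stat64_* helpers disappear. A minimal sketch of the pattern, assuming only the kernel atomic64 API:

    #include <linux/atomic.h>

    /* was: u64 counter updated under spin_lock(&zram->stat64_lock) */
    static atomic64_t example_counter = ATOMIC64_INIT(0);

    static void example_account(u64 bytes)
    {
            atomic64_add(bytes, &example_counter);  /* was zram_stat64_add() */
    }

    static u64 example_read(void)
    {
            return (u64)atomic64_read(&example_counter);  /* was zram_stat64_read() */
    }
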
index e6a929d452f72cda5b9953e1b9b0a1b3c15cb13d..93a2f9cafd7cb3d516546c1116677dda44b8d0b9 100644 (file)
 
 #include "zram_drv.h"
 
-static u64 zram_stat64_read(struct zram *zram, u64 *v)
+static inline struct zram *dev_to_zram(struct device *dev)
 {
-       u64 val;
-
-       spin_lock(&zram->stat64_lock);
-       val = *v;
-       spin_unlock(&zram->stat64_lock);
-
-       return val;
-}
-
-static struct zram *dev_to_zram(struct device *dev)
-{
-       int i;
-       struct zram *zram = NULL;
-
-       for (i = 0; i < zram_get_num_devices(); i++) {
-               zram = &zram_devices[i];
-               if (disk_to_dev(zram->disk) == dev)
-                       break;
-       }
-
-       return zram;
+       return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
 static ssize_t disksize_show(struct device *dev,
@@ -125,7 +105,7 @@ static ssize_t num_reads_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
 
        return sprintf(buf, "%llu\n",
-               zram_stat64_read(zram, &zram->stats.num_reads));
+                       (u64)atomic64_read(&zram->stats.num_reads));
 }
 
 static ssize_t num_writes_show(struct device *dev,
@@ -134,7 +114,7 @@ static ssize_t num_writes_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
 
        return sprintf(buf, "%llu\n",
-               zram_stat64_read(zram, &zram->stats.num_writes));
+                       (u64)atomic64_read(&zram->stats.num_writes));
 }
 
 static ssize_t invalid_io_show(struct device *dev,
@@ -143,7 +123,7 @@ static ssize_t invalid_io_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
 
        return sprintf(buf, "%llu\n",
-               zram_stat64_read(zram, &zram->stats.invalid_io));
+                       (u64)atomic64_read(&zram->stats.invalid_io));
 }
 
 static ssize_t notify_free_show(struct device *dev,
@@ -152,7 +132,7 @@ static ssize_t notify_free_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
 
        return sprintf(buf, "%llu\n",
-               zram_stat64_read(zram, &zram->stats.notify_free));
+                       (u64)atomic64_read(&zram->stats.notify_free));
 }
 
 static ssize_t zero_pages_show(struct device *dev,
@@ -178,7 +158,7 @@ static ssize_t compr_data_size_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
 
        return sprintf(buf, "%llu\n",
-               zram_stat64_read(zram, &zram->stats.compr_size));
+                       (u64)atomic64_read(&zram->stats.compr_size));
 }
 
 static ssize_t mem_used_total_show(struct device *dev,
@@ -188,8 +168,10 @@ static ssize_t mem_used_total_show(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
        struct zram_meta *meta = zram->meta;
 
+       down_read(&zram->init_lock);
        if (zram->init_done)
                val = zs_get_total_size_bytes(meta->mem_pool);
+       up_read(&zram->init_lock);
 
        return sprintf(buf, "%llu\n", val);
 }
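
Two sysfs-side changes above deserve a note: dev_to_zram() stops scanning the global zram_devices array, and mem_used_total_show() now reads init_done under init_lock so it cannot race a concurrent reset. A hedged sketch of the back-pointer pattern the new dev_to_zram() relies on (the private_data assignment happens at device creation and is not part of this hunk):

    #include <linux/genhd.h>

    /* create_device() stores the per-device object in the gendisk ... */
    static void example_bind(struct zram *zram)
    {
            zram->disk->private_data = zram;
    }

    /* ... so any sysfs handler recovers it in O(1) instead of scanning. */
    static inline struct zram *dev_to_zram(struct device *dev)
    {
            return (struct zram *)dev_to_disk(dev)->private_data;
    }
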
index f82f7e69c8a5082f9ecc99a8d4dee8ddb300015d..4bb275b2d98f8fff8cb19c87101349312848059c 100644 (file)
@@ -224,7 +224,7 @@ struct zs_pool {
  * performs VM mapping faster than copying, then it should be added here
  * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use
  * page table mapping rather than copying for object mapping.
-*/
+ */
 #if defined(CONFIG_ARM) && !defined(MODULE)
 #define USE_PGTABLE_MAPPING
 #endif
@@ -844,8 +844,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 
                for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
                        if (class->fullness_list[fg]) {
-                               pr_info("Freeing non-empty class with size "
-                                       "%db, fullness group %d\n",
+                               pr_info("Freeing non-empty class with size %db, fullness group %d\n",
                                        class->size, fg);
                        }
                }
@@ -968,7 +967,7 @@ EXPORT_SYMBOL_GPL(zs_free);
  * against nested mappings.
  *
  * This function returns with preemption and page faults disabled.
-*/
+ */
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
                        enum zs_mapmode mm)
 {
index 46dbd0558d86cccfb5a62d40915d764783525a92..fbe6bec421aa93edc0aef1ccf5221e0bc498fb79 100644 (file)
@@ -19,7 +19,7 @@
  * zsmalloc mapping modes
  *
  * NOTE: These only make a difference when a mapped object spans pages
-*/
+ */
 enum zs_mapmode {
        ZS_MM_RW, /* normal read-write mapping */
        ZS_MM_RO, /* read-only (no copy-out at unmap time) */
index 172c5b23cb840d5270fde51d9809e321f012211a..72b26940730d715e9ea0c3802b2b65fcd1ea922a 100644 (file)
 #define ST_SENSORS_FULLSCALE_AVL_MAX           10
 
 #define ST_SENSORS_NUMBER_ALL_CHANNELS         4
-#define ST_SENSORS_NUMBER_DATA_CHANNELS                3
 #define ST_SENSORS_ENABLE_ALL_AXIS             0x07
-#define ST_SENSORS_BYTE_FOR_CHANNEL            2
 #define ST_SENSORS_SCAN_X                      0
 #define ST_SENSORS_SCAN_Y                      1
 #define ST_SENSORS_SCAN_Z                      2
-#define ST_SENSORS_DEFAULT_12_REALBITS         12
-#define ST_SENSORS_DEFAULT_16_REALBITS         16
 #define ST_SENSORS_DEFAULT_POWER_ON_VALUE      0x01
 #define ST_SENSORS_DEFAULT_POWER_OFF_VALUE     0x00
 #define ST_SENSORS_DEFAULT_WAI_ADDRESS         0x0f
 #define ST_SENSORS_MAX_NAME                    17
 #define ST_SENSORS_MAX_4WAI                    7
 
-#define ST_SENSORS_LSM_CHANNELS(device_type, index, mod, endian, bits, addr) \
+#define ST_SENSORS_LSM_CHANNELS(device_type, mask, index, mod, \
+                                       ch2, s, endian, rbits, sbits, addr) \
 { \
        .type = device_type, \
-       .modified = 1, \
-       .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \
-                       BIT(IIO_CHAN_INFO_SCALE), \
+       .modified = mod, \
+       .info_mask_separate = mask, \
        .scan_index = index, \
-       .channel2 = mod, \
+       .channel2 = ch2, \
        .address = addr, \
        .scan_type = { \
-               .sign = 's', \
-               .realbits = bits, \
-               .shift = 16 - bits, \
-               .storagebits = 16, \
+               .sign = s, \
+               .realbits = rbits, \
+               .shift = sbits - rbits, \
+               .storagebits = sbits, \
                .endianness = endian, \
        }, \
 }
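
The widened ST_SENSORS_LSM_CHANNELS() macro now takes the info mask, modified flag, channel2 value, sign and both bit widths from the caller instead of hard-coding them. A hypothetical invocation (every parameter value here is illustrative, not taken from a real driver):

    static const struct iio_chan_spec example_channels[] = {
            ST_SENSORS_LSM_CHANNELS(IIO_ACCEL,
                            BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE),
                            ST_SENSORS_SCAN_X, 1, IIO_MOD_X,
                            's', IIO_LE, 16, 16, 0x28),
    };
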
@@ -204,6 +200,7 @@ struct st_sensors {
  * @multiread_bit: Use or not particular bit for [I2C/SPI] multiread.
  * @buffer_data: Data used by buffer part.
  * @odr: Output data rate of the sensor [Hz].
+ * num_data_channels: Number of data channels used in buffer.
  * @get_irq_data_ready: Function to get the IRQ used for data ready signal.
  * @tf: Transfer function structure used by I/O operations.
  * @tb: Transfer buffers and mutex used by I/O operations.
@@ -220,6 +217,7 @@ struct st_sensor_data {
        char *buffer_data;
 
        unsigned int odr;
+       unsigned int num_data_channels;
 
        unsigned int (*get_irq_data_ready) (struct iio_dev *indio_dev);
 
index be91f344d5fce1b3b8a41ff149ece8b917715542..ffd8c8f90928cfa7af3a7e64c31ae6822297e3aa 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * ADF4350/ADF4351 SPI PLL driver
  *
- * Copyright 2012 Analog Devices Inc.
+ * Copyright 2012-2013 Analog Devices Inc.
  *
  * Licensed under the GPL-2.
  */
@@ -41,7 +41,7 @@
 #define ADF4350_REG2_RDIV2_EN                  (1 << 24)
 #define ADF4350_REG2_RMULT2_EN                 (1 << 25)
 #define ADF4350_REG2_MUXOUT(x)                 ((x) << 26)
-#define ADF4350_REG2_NOISE_MODE(x)             ((x) << 29)
+#define ADF4350_REG2_NOISE_MODE(x)             (((unsigned)(x)) << 29)
 #define ADF4350_MUXOUT_THREESTATE              0
 #define ADF4350_MUXOUT_DVDD                    1
 #define ADF4350_MUXOUT_GND                     2
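
The ADF4350_REG2_NOISE_MODE() change is a defined-behaviour fix rather than a functional one: left-shifting a signed int so that the result does not fit (for this field, any operand of 4 or more shifted into bit 31 and above) is undefined in C, while the same shift on an unsigned operand is well defined. A compact illustration (macro names hypothetical):

    #define FIELD_SIGNED(x)    ((x) << 29)              /* UB once (x) >= 4 */
    #define FIELD_UNSIGNED(x)  (((unsigned)(x)) << 29)  /* always defined */
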
diff --git a/include/linux/platform_data/ad7303.h b/include/linux/platform_data/ad7303.h
new file mode 100644 (file)
index 0000000..de6a7a6
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Analog Devices AD7303 DAC driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ */
+
+#ifndef __IIO_ADC_AD7303_H__
+#define __IIO_ADC_AD7303_H__
+
+/**
+ * struct ad7303_platform_data - AD7303 platform data
+ * @use_external_ref: If set to true, use an external voltage reference connected
+ * to the REF pin, otherwise use the internal reference derived from Vdd.
+ */
+struct ad7303_platform_data {
+       bool use_external_ref;
+};
+
+#endif